#!/usr/bin/env python

__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
import re

class HackerNews(BasicNewsRecipe):
    title                 = 'Hacker News'
    __author__            = 'Tom Scholl'
    description           = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher             = 'Y Combinator'
    category              = 'news, programming, it, technology'
    masthead_url          = 'http://img585.imageshack.us/img585/5011/hnle.png'
    cover_url             = 'http://img585.imageshack.us/img585/5011/hnle.png'
    delay                 = 1
    max_articles_per_feed = 30
    use_embedded_content  = False
    no_stylesheets        = True
    encoding              = 'utf-8'
    language              = 'en'
    requires_version      = (0, 8, 16)

    feeds = [
                (u'Hacker News', 'http://news.ycombinator.com/rss')
            ]

    # Every article URL is routed through get_obfuscated_article() below,
    # which writes the page to a temporary file tracked in temp_files.
    temp_files = []
    articles_are_obfuscated = True

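    # Non-HN links are fetched directly and reduced to their main article
    # body with calibre's built-in readability extractor.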
    def get_readable_content(self, url):
        self.log('get_readable_content(' + url + ')')
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()

        return self.extract_readable_article(html, url)

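    # HN story pages are nested <table> rows: the second top-level <tr>
    # holds the story (td.title, td.subtext), any self-post text a couple of
    # rows further down, and one td.default cell per comment.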
    def get_hn_content(self, url):
        self.log('get_hn_content(' + url + ')')
        soup = self.index_to_soup(url)
        # The story sits in the second top-level row of the page table
        main = soup.find('tr').findNextSiblings('tr', limit=2)[1].td

        title = self.tag_to_string(main.find('td', 'title'))
        link = main.find('td', 'title').find('a')['href']
        if link.startswith('item?'):
            # Self-posts link back to the item page with a relative URL
            link = 'http://news.ycombinator.com/' + link
        # Strip the scheme so the displayed link stays short
        readable_link = link.rpartition('http://')[2].rpartition('https://')[2]
        subtext = self.tag_to_string(main.find('td', 'subtext'))

        # Self-post text sits three rows below the title; a cell holding only
        # the comment form means there is no text to keep
        title_content_td = main.find('td', 'title').findParent('tr').findNextSiblings('tr', limit=3)[2].findAll('td', limit=2)[1]
        title_content = u''
        if not title_content_td.find('form'):
            title_content_td.name = 'div'
            title_content = title_content_td.prettify()

        comments = u''
        for td in main.findAll('td', 'default'):
            comhead = td.find('span', 'comhead')
            if comhead:
                com_title = u'<h4>' + self.tag_to_string(comhead).replace(' | link', '') + u'</h4>'
                comhead.parent.extract()
                br = td.find('br')
                if br:
                    br.extract()
                # Escape the '?' so it matches the literal 'reply?...' href
                reply = td.find('a', attrs = {'href' : re.compile(r'^reply\?')})
                if reply:
                    reply.parent.extract()
                td.name = 'div'
                # HN encodes comment depth in the width of a spacer image;
                # reuse two thirds of it as left padding
                indent_width = (int(td.parent.find('td').img['width']) * 2) / 3
                td['style'] = 'padding-left: ' + str(indent_width) + 'px'
                comments = comments + com_title + td.prettify()

        body = u'<h3>' + title + u'</h3><p><a href="' + link + u'">' + readable_link + u'</a><br/><strong>' + subtext + u'</strong></p>' + title_content + u'<br/>'
        body = body + comments
        return u'<html><title>' + title + u'</title><body>' + body + u'</body></html>'

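    # The hook behind articles_are_obfuscated: calibre calls this for every
    # article and uses the returned local file in place of the original URL,
    # so each branch below renders its HTML into a temporary file.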
    def get_obfuscated_article(self, url):
        if url.startswith('http://news.ycombinator.com'):
            content = self.get_hn_content(url)
        else:
            # TODO: use content-type header instead of url
            is_image = False
            for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp',]:
                if url.endswith(ext):
                    is_image = True
                    break

            if is_image:
                self.log('using image_content (' + url + ')')
                content = u'<html><body><img src="' + url + u'"></body></html>'
            else:
                content = self.get_readable_content(url)

        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        # Encode to bytes before writing: content is a unicode string and
        # may contain non-ASCII characters
        self.temp_files[-1].write(content.encode('utf-8'))
        self.temp_files[-1].close()
        return self.temp_files[-1].name

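    # calibre consults is_link_wanted() while following links inside an
    # article; PDF links are skipped since they cannot be rendered as HTML.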
    def is_link_wanted(self, url, tag):
        if url.endswith('.pdf'):
            return False
        return True

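    # The feed entries carry no useful summaries, so synthesise one from the
    # article's hostname for the description shown in the table of contents.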
    def prettify_url(self, url):
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        article.text_summary = self.prettify_url(article.url)
        article.summary = article.text_summary

#    Debugging stub: uncomment to run the recipe against a single story
#    def parse_index(self):
#        feeds = []
#        feeds.append((u'Hacker News',[{'title': 'Testing', 'url': 'http://news.ycombinator.com/item?id=2935944'}]))
#        return feeds

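# To exercise the recipe from the command line (assuming calibre is on the
# PATH), something like the following keeps the download small:
#
#   ebook-convert hackernews.recipe hn.epub --test -vv
#
# --test restricts the fetch to a couple of articles per feed, which makes
# iterating on the scraping code above much quicker.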