
#!/usr/bin/env python

'''
Lenta.ru
'''

from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe
import re

class LentaRURecipe(BasicNewsRecipe):
    title = u'Lenta.ru: \u041d\u043e\u0432\u043e\u0441\u0442\u0438'
    __author__ = 'Nikolai Kotchetkov'
    publisher = 'lenta.ru'
    category = 'news, Russia'
    #Reads "Daily Internet newspaper. News from around the world in Russian."
    description = u'''\u0415\u0436\u0435\u0434\u043d\u0435\u0432\u043d\u0430\u044f
        \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u0433\u0430\u0437\u0435\u0442\u0430.
        \u041d\u043e\u0432\u043e\u0441\u0442\u0438 \u0441\u043e
        \u0432\u0441\u0435\u0433\u043e \u043c\u0438\u0440\u0430 \u043d\u0430
        \u0440\u0443\u0441\u0441\u043a\u043e\u043c
        \u044f\u0437\u044b\u043a\u0435'''
    oldest_article = 3
    max_articles_per_feed = 100

    masthead_url = u'http://img.lenta.ru/i/logowrambler.gif'
    cover_url = u'http://img.lenta.ru/i/logowrambler.gif'

    #Add feed names here if you want them sorted (feeds in this list appear first)
    sortOrder = [u'_default', u'\u0412 \u0420\u043e\u0441\u0441\u0438\u0438', u'\u0431.\u0421\u0421\u0421\u0420', u'\u0412 \u043c\u0438\u0440\u0435']

    encoding = 'cp1251'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = True
    recursions = 0

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }
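
    #The conversion_options above are intended to end up in the generated
    #book's metadata (description/comments, tags, publisher and language)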

    keep_only_tags = [dict(name='td', attrs={'class':['statya','content']})]

    remove_tags_after = [dict(name='p', attrs={'class':'links'}),
                         dict(name='div', attrs={'id':'readers-block'})]

    remove_tags = [dict(name='table', attrs={'class':['vrezka','content']}),
                   dict(name='div', attrs={'class':'b240'}),
                   dict(name='div', attrs={'id':'readers-block'}),
                   dict(name='p', attrs={'class':'links'})]

    feeds = [u'http://lenta.ru/rss/']
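
    #Although only one RSS URL is listed above, parse_index below splits
    #its entries into per-category "virtual" feeds using each entry's tags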

    extra_css = 'h1 {font-size: 1.2em; margin: 0;} h2 {font-size: 1.0em; margin: 0;} h3 {font-size: 0.8em; margin: 0;}'
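
    #A quick way to test this recipe from the command line (a sketch,
    #assuming a calibre install that provides the ebook-convert tool):
    #
    #    ebook-convert lenta_ru.recipe output.epub --test
    #
    #--test fetches only a couple of articles per feed, which keeps the
    #edit/debug cycle short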

    def parse_index(self):
        try:
            feedData = parse(self.feeds[0])
            if not feedData:
                raise NotImplementedError
            self.log("parse_index: Feed loaded successfully.")
            if feedData.feed.has_key('title'):
                self.title = feedData.feed.title
                self.log("parse_index: Title updated to: ", self.title)
            if feedData.feed.has_key('image'):
                self.log("parse_index: Feed has an image.")

            #Return the article list for the given virtual feed, creating
            #the feed on first use. Relies on the feeds dict being
            #assigned below, before the first call.
            def get_virtual_feed_articles(feed):
                if feeds.has_key(feed):
                    return feeds[feed][1]
                self.log("Adding new feed: ", feed)
                articles = []
                feeds[feed] = (feed, articles)
                return articles

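            #feeds maps feed title -> (title, [article dicts]); parse_index
            #must return a list of such (title, articles) tuples, one per
            #section of the generated book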
            feeds = {}

            #Iterate feed items and distribute articles using tags
            for item in feedData.entries:
                link = item.get('link', '')
                title = item.get('title', '')
                if '' == link or '' == title:
                    continue
                article = {'title':title, 'url':link, 'description':item.get('description', ''), 'date':item.get('date', ''), 'content':''}
                if not item.has_key('tags'):
                    get_virtual_feed_articles('_default').append(article)
                    continue
                #Add the article to every tagged feed; entries with empty
                #terms go to the default feed at most once
                addedToDefault = False
                for tag in item.tags:
                    term = tag.get('term', '')
                    if '' == term:
                        if not addedToDefault:
                            get_virtual_feed_articles('_default').append(article)
                            addedToDefault = True
                        continue
                    get_virtual_feed_articles(term).append(article)

            #Build the feed list: feeds named in sortOrder come first,
            #the rest follow in arbitrary order
            result = []
            for feedName in self.sortOrder:
                if not feeds.has_key(feedName): continue
                result.append(feeds[feedName])
                del feeds[feedName]
            result = result + feeds.values()

            return result

        except Exception, err:
            self.log(err)
            raise NotImplementedError

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
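
    #postprocess_html below rebuilds each article as a single <div>:
    #headline first, then the date line, then the lead picture with its
    #caption, then the remaining article text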

    def postprocess_html(self, soup, first_fetch):
        #self.log('Original: ', soup.prettify())

        contents = Tag(soup, 'div')

        #Elements to drop before the article is rebuilt, by tag name and
        #attributes
        extractElements = {'div' : [{'id' : 'readers-block'}]}

        #Drop every matching element
        for tag, attrs in extractElements.iteritems():
            for attr in attrs:
                garbage = soup.findAll(tag, attr)
                if garbage:
                    for pieceOfGarbage in garbage:
                        pieceOfGarbage.extract()

        #Find the article text using its header and move it, sibling by
        #sibling, into contents
        element = soup.find({'h1' : True, 'h2' : True})
        if element:
            element.name = 'h1'
        while element:
            nextElement = element.nextSibling
            element.extract()
            contents.insert(len(contents.contents), element)
            element = nextElement

        #Place the first article date found right after the header
        dates = soup.findAll(text=re.compile(r'\d{2}\.\d{2}\.\d{4}, \d{2}:\d{2}:\d{2}'))
        for date in dates:
            parent = date.parent
            if parent and isinstance(parent, Tag) and 'div' == parent.name and 'dt' == parent['class']:
                #Date div found
                parent.extract()
                parent['style'] = 'font-size: 0.5em; color: gray; font-family: monospace;'
                contents.insert(1, parent)
                break

        #Place the article picture (with its caption, if any) after the date
        pic = soup.find('img')
        if pic:
            picDiv = Tag(soup, 'div')
            picDiv['style'] = 'width: 100%; text-align: center;'
            pic.extract()
            picDiv.insert(0, pic)
            title = pic.get('title', None)
            if title:
                titleDiv = Tag(soup, 'div')
                titleDiv['style'] = 'font-size: 0.5em;'
                titleDiv.insert(0, title)
                picDiv.insert(1, titleDiv)
            contents.insert(2, picDiv)

        #Swap the original article cell for the rebuilt contents
        body = soup.find('td', {'class':['statya','content']})
        if body:
            body.replaceWith(contents)

        #self.log('Result: ', soup.prettify())
        return soup
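
    #After postprocess_html, an article body looks roughly like this
    #(a sketch of the intended structure, not verbatim output):
    #
    #    <div>
    #      <h1>Headline</h1>
    #      <div class="dt" style="...">09.09.2011, 12:00:00</div>
    #      <div style="..."><img src="..."/><div>Caption</div></div>
    #      ...remaining article elements...
    #    </div>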