#!/usr/bin/env python

u'''
Ведомости
'''

from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe

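# A quick way to try this recipe locally (a sketch, assuming a working
# calibre install on the PATH; the output file name is arbitrary):
#
#   ebook-convert vedomosti.recipe Vedomosti.epub --test
#
# --test limits the download to a couple of articles per feed.
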
class VedomostiRecipe(BasicNewsRecipe):
    title = u'Ведомости'
    __author__ = 'Nikolai Kotchetkov'
    publisher = 'vedomosti.ru'
    category = 'press, Russia'
    description = u'Ежедневная деловая газета'  # 'Daily business newspaper'
    oldest_article = 3
    max_articles_per_feed = 100

    masthead_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
    cover_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'

    # Add feed names here if you want them sorted (feeds in this list appear
    # first): u'Первая полоса' is the front page, u'Власть и деньги' is
    # 'Power and money'
    sortOrder = [u'_default', u'Первая полоса', u'Власть и деньги']

    encoding = 'cp1251'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = True
    recursions = 0

    conversion_options = {
        'comment':   description,
        'tags':      category,
        'publisher': publisher,
        'language':  language,
    }

    keep_only_tags = [dict(name='td', attrs={'class': ['second_content']})]

    remove_tags_after = [dict(name='div', attrs={'class': 'article_text'})]

    remove_tags = [dict(name='div', attrs={'class': ['sep', 'choice', 'articleRightTbl']})]

    feeds = [u'http://www.vedomosti.ru/newspaper/out/rss.xml']

    # Base URL for resolving relative links
    base_url = u'http://www.vedomosti.ru'
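    # e.g. href='/newspaper/article.shtml' becomes
    # 'http://www.vedomosti.ru/newspaper/article.shtml' in postprocess_html()
    # (the path shown is an invented example)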

    extra_css = 'h1 {font-size: 1.5em; margin: 0; text-align: center;}'\
                'h2 {font-size: 1.0em; margin: 0;}'\
                'h3 {font-size: 0.8em; margin: 0;}'\
                '.article_date {font-size: 0.5em; color: gray; font-family: monospace; text-align: right;}'\
                '.article_authors {font-size: 0.5em; color: gray; font-family: monospace; text-align: right;}'\
                '.article_img {width: 100%; text-align: center; padding: 3px;}'\
                '.article_img_desc {width: 100%; text-align: center; font-size: 0.5em; color: gray; font-family: monospace;}'\
                '.article_desc {font-size: 1em; font-style: italic;}'

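    # parse_index() below must return what calibre expects from a custom
    # index: a list of (feed title, list of article dicts) pairs. A minimal
    # sketch of the shape (all values here are invented examples):
    #
    #   [(u'Первая полоса', [{'title': u'Заголовок',
    #                         'url': 'http://www.vedomosti.ru/...',
    #                         'description': u'', 'date': u'', 'content': ''}])]
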
    def parse_index(self):
        try:
            feedData = parse(self.feeds[0])
            if not feedData:
                raise NotImplementedError
            self.log("parse_index: Feed loaded successfully.")
            if 'title' in feedData.feed:
                self.title = feedData.feed.title
                self.log("parse_index: Title updated to: ", self.title)
            if 'description' in feedData.feed:
                self.description = feedData.feed.description
                self.log("parse_index: Description updated to: ", self.description)

            feeds = {}

            def get_virtual_feed_articles(feed):
                if feed in feeds:
                    return feeds[feed][1]
                self.log("Adding new feed: ", feed)
                articles = []
                feeds[feed] = (feed, articles)
                return articles

            # Iterate over feed items and distribute articles using tags
            for item in feedData.entries:
                link = item.get('link', '')
                title = item.get('title', '')
                if not link or not title:
                    continue
                article = {'title': title, 'url': link,
                           'description': item.get('description', ''),
                           'date': item.get('date', ''), 'content': ''}
                if 'tags' not in item:
                    get_virtual_feed_articles('_default').append(article)
                    continue
                # Track the default feed separately so an article with
                # several empty tag terms is only added to it once
                addedToDefault = False
                for tag in item.tags:
                    term = tag.get('term', '')
                    if not term:
                        if not addedToDefault:
                            get_virtual_feed_articles('_default').append(article)
                            addedToDefault = True
                        continue
                    get_virtual_feed_articles(term).append(article)

            # Build the feed list: explicitly sorted feeds come first,
            # the rest follow in whatever order the dict yields
            result = []
            for feedName in self.sortOrder:
                if feedName not in feeds:
                    continue
                result.append(feeds[feedName])
                del feeds[feedName]
            result = result + feeds.values()

            return result

        except Exception, err:
            self.log(err)
            raise NotImplementedError

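    # Note: the grouping above relies on feedparser normalizing RSS
    # <category> elements into each entry's 'tags' list, whose items carry
    # the category name in their 'term' attribute.
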
    def preprocess_html(self, soup):
        # adeify_images() is a stock BasicNewsRecipe helper that fixes up
        # <img> tags so they display properly in Adobe Digital Editions
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        #self.log('Original: ', soup.prettify())

        # Find article
        contents = soup.find('div', {'class': ['article_text']})
        if not contents:
            self.log('postprocess_html: article div not found!')
            return soup
        contents.extract()

        # Find title
        title = soup.find('h1')
        if title:
            contents.insert(0, title)

        # Find article image
        newstop = soup.find('div', {'class': ['newstop']})
        if newstop:
            img = newstop.find('img')
            if img:
                imgDiv = Tag(soup, 'div')
                imgDiv['class'] = 'article_img'

                # Drop hard-coded dimensions; the CSS above sizes the image
                for attr in ('width', 'height'):
                    if img.has_key(attr):
                        del img[attr]

                # The description paragraphs follow the image's parent
                element = img.parent.nextSibling

                img.extract()
                imgDiv.insert(0, img)

                # Collect description paragraphs, saving the next sibling
                # before touching the current node so that skipped text
                # nodes and extracted tags do not stall the walk
                while element:
                    nextElement = element.nextSibling
                    if isinstance(element, Tag) and element.name == 'p':
                        element.extract()
                        element['class'] = 'article_img_desc'
                        imgDiv.insert(len(imgDiv.contents), element)
                    element = nextElement

                contents.insert(1, imgDiv)

        # Find article abstract
        abstract = soup.find('p', {'class': ['subhead']})
        if abstract:
            abstract['class'] = 'article_desc'
            contents.insert(2, abstract)

        # Find article authors ('autors' is apparently the site's own
        # class name, so it is matched as-is)
        authorsDiv = soup.find('div', {'class': ['autors']})
        if authorsDiv:
            authorsP = authorsDiv.find('p')
            if authorsP:
                authorsP['class'] = 'article_authors'
                contents.insert(len(contents.contents), authorsP)

        # Fix URLs that use site-relative paths
        urls = contents.findAll('a')
        if urls:
            for url in urls:
                if not url.has_key('href'):
                    continue
                if url['href'].startswith('/'):
                    url['href'] = self.base_url + url['href']

        body = soup.find('td', {'class': ['second_content']})
        if body:
            body.replaceWith(contents)

        #self.log('Result: ', soup.prettify())
        return soup