/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / time_magazine.recipe
Text File  |  2011-09-09  |  4KB  |  105 lines

#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
'''
time.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html

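# Builds the magazine from the issue index at time.com/time/magazine: one feed
# per magazine section, with multi-page stories fetched in full via the link
# recursion settings below. Written for the Python 2 based calibre 0.8 series,
# hence the use of the `unicode` builtin.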
class Time(BasicNewsRecipe):
    #recipe_disabled = ('This recipe has been disabled as TIME no longer'
    #        ' publishes complete articles on the web.')
    title                 = u'Time'
    __author__            = 'Kovid Goyal'
    description           = 'Weekly magazine'
    encoding              = 'utf-8'
    no_stylesheets        = True
    language              = 'en'
    remove_javascript     = True

    # Keep only the headline, article body and entry metadata containers
    keep_only_tags = [
            {
                'class':['artHd', 'articleContent',
                            'entry-title','entry-meta', 'entry-content', 'thumbnail']
            },
        ]
    # Drop sharing toolbars, ad slots, navigation and enlarge/lightbox links
    remove_tags = [
            {'class':['content-tools', 'quigo', 'see',
                'first-tier-social-tools', 'navigation', 'enlarge lightbox']},
            {'id':['share-tools']},
            {'rel':'lightbox'},
            ]

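    # TIME splits long stories across several pages. calibre follows links on
    # each article page up to `recursions` levels deep, but only links whose
    # URLs match one of `match_regexps` (the numbered continuation pages and
    # the specials/packages article pages), so complete stories are fetched.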
    recursions = 10
    match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}\.html',
            r'http://www.time.com/time/specials/packages/article/.*']

    # Strip <meta> tags from the raw HTML before it is parsed
    preprocess_regexps = [(re.compile(
        r'<meta .+/>'), lambda m:'')]

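    # The issue is assembled manually instead of from RSS: parse_index() must
    # return a list of (section title, list of article dicts) tuples.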
    def parse_index(self):
        raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
        root = html.fromstring(raw)
        # Follow the 'View Large Cover' link and pull the full size cover
        # image from the archive covers page it points to
        img = root.xpath('//a[.="View Large Cover" and @href]')
        if img:
            cover_url = 'http://www.time.com' + img[0].get('href')
            try:
                nsoup = self.index_to_soup(cover_url)
                img = nsoup.find('img', src=re.compile('archive/covers'))
                if img is not None:
                    self.cover_url = img['src']
            except Exception:
                self.log.exception('Failed to fetch cover')

        feeds = []
        parent = root.xpath('//div[@class="content-main-aside"]')[0]
        for sec in parent.xpath(
                'descendant::section[contains(@class, "sec-mag-section")]'):
            h3 = sec.xpath('./h3')
            if h3:
                # The <h3> heading of each magazine section becomes a feed title
                section = html.tostring(h3[0], encoding=unicode,
                        method='text').strip().capitalize()
                self.log('Found section', section)
                articles = list(self.find_articles(sec))
                if articles:
                    feeds.append((section, articles))

        return feeds

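    # Yield one dict per article in the section. calibre requires the 'title'
    # and 'url' keys; 'date' and 'description' are optional extras.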
    def find_articles(self, sec):
        for article in sec.xpath('./article'):
            h2 = article.xpath('./*[@class="entry-title"]')
            if not h2: continue
            a = h2[0].xpath('./a[@href]')
            if not a: continue
            title = html.tostring(a[0], encoding=unicode,
                        method='text').strip()
            if not title: continue
            url = a[0].get('href')
            if url.startswith('/'):
                url = 'http://www.time.com' + url
            desc = ''
            p = article.xpath('./*[@class="entry-content"]')
            if p:
                desc = html.tostring(p[0], encoding=unicode,
                        method='text')
            self.log('\t', title, ':\n\t\t', desc)
            yield {
                    'title' : title,
                    'url'   : url,
                    'date'  : '',
                    'description' : desc
                    }

    def postprocess_html(self, soup, first):
        # Remove the in-article pagination controls; the continuation pages
        # themselves are already fetched via match_regexps above
        for tag in soup.findAll(attrs={'class':['artPag', 'pagination']}):
            tag.extract()
        return soup
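
# To preview the recipe locally, calibre's usual recipe development workflow
# can be used (the output filename is only an example):
#
#   ebook-convert time_magazine.recipe time.epub --test -vv
#
# --test limits the download to a couple of articles per feed so changes can
# be checked quickly.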