atlantic.recipe  |  calibre 0.8.18  |  2011-09-09  |  3.6 KB

#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
theatlantic.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString

class TheAtlantic(BasicNewsRecipe):

    title      = 'The Atlantic'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Current affairs and politics focused on the US'
    INDEX = 'http://www.theatlantic.com/magazine/toc/0/'
    language = 'en'

    # Trim each article down to its body: drop everything before the article
    # head, everything after the copyright notice, and the navigation and ad
    # containers in between.
    remove_tags_before = dict(name='div', id='articleHead')
    remove_tags_after  = dict(id='copyright')
    remove_tags        = [dict(id=['header', 'printAds', 'pageControls'])]
    no_stylesheets = True

    # Strip HTML comments from the downloaded pages before parsing.
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]

    def print_version(self, url):
        # Fetch the print-friendly version of each article.
        return url.replace('/archive/', '/print/')

    def parse_index(self):
        # Build the feed list from the magazine's table-of-contents page.
        soup = self.index_to_soup(self.INDEX)

        # Use the issue date from the top-stories heading as the timestamp
        # appended to the ebook title.
        ts = soup.find(id='magazineTopStories')
        ds = self.tag_to_string(ts.find('h1')).split(':')[-1]
        self.timefmt = ' [%s]'%ds

        cover = soup.find('img', src=True, attrs={'class':'cover'})
        if cover is not None:
            self.cover_url = cover['src']

        feeds = []
        seen_titles = set()
        for section in soup.findAll('div', attrs={'class':'magazineSection'}):
            section_title = self.tag_to_string(section.find('h2'))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('div', attrs={'class':lambda x: x and
                    'post' in x}):
                h = post.find(['h3', 'h4'])
                title = self.tag_to_string(h)
                # Skip articles already listed in an earlier section.
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                a = post.find('a', href=True)
                url = a['href']
                if url.startswith('/'):
                    url = 'http://www.theatlantic.com' + url
                # The 'dek' paragraph, when present, serves as the description.
                p = post.find('p', attrs={'class':'dek'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc,
                    'date':''})
            if articles:
                feeds.append((section_title, articles))

        # Poems are listed separately under the 'Poetry' heading.
        poems = []
        self.log('Found section: Poems')
        pd = soup.find('h2', text='Poetry').parent.parent
        for poem in pd.findAll('h4'):
            title = self.tag_to_string(poem)
            url   = poem.find('a')['href']
            if url.startswith('/'):
                url = 'http://www.theatlantic.com' + url
            self.log('\tFound article:', title, 'at', url)
            poems.append({'title':title, 'url':url, 'description':'',
                'date':''})
        if poems:
            feeds.append(('Poems', poems))

        return feeds

    def postprocess_html(self, soup, first):
        # Replace right-aligned image tables with centered divs so the image
        # and its caption flow with the article text.
        for table in soup.findAll('table', align='right'):
            img = table.find('img')
            if img is not None:
                img.extract()
                caption = self.tag_to_string(table).strip()
                div = Tag(soup, 'div')
                div['style'] = 'text-align:center'
                div.insert(0, img)
                div.insert(1, Tag(soup, 'br'))
                if caption:
                    div.insert(2, NavigableString(caption))
                table.replaceWith(div)

        return soup
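
# A minimal way to try this recipe outside the calibre GUI, assuming calibre's
# command-line tools are installed, is to pass the .recipe file straight to
# ebook-convert; --test limits the download to a couple of articles per feed:
#
#   ebook-convert atlantic.recipe atlantic.epub --test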