#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from collections import OrderedDict

import re
class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    INDEX = 'http://www.economist.com/printedition'
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT)')
    extra_css = '.headline {font-size: x-large;} \n h2 { font-size: small; } \n h1 { font-size: medium; }'
    oldest_article = 7.0
    cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
    #cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
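    # The static cover_url above is only a fallback; economist_parse_index()
    # below replaces it with the cover image scraped from the print-edition
    # index page whenever one is found there.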
    remove_tags = [
            dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class': ['dblClkTrk', 'ec-article-info',
                'share_inline_header', 'related-items']}),
            {'class': lambda x: x and 'share-links-header' in x},
    ]
    keep_only_tags = [dict(id='ec-article-body')]
    no_stylesheets = True
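    # keep_only_tags restricts each downloaded page to the article body div;
    # remove_tags then strips scripts, sharing widgets and related-item boxes
    # from what remains. The regexp below truncates anything the server
    # appends after the closing </html> tag before the page is parsed.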
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x: '</html>')]

    # economist.com has started throttling: after about 60% of an issue has
    # downloaded, requests start failing with connection reset by peer (104)
    # errors, so wait a second between them.
    delay = 1

    needs_subscription = False
    # The login code below is kept for reference but disabled, since the
    # print edition is currently fetched without a subscription.
    '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open('http://www.economist.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            res = br.submit()
            raw = res.read()
            if '>Log out<' not in raw:
                raise ValueError('Failed to login to economist.com. '
                        'Check your username and password.')
        return br
    '''

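    # calibre expects parse_index() to return a list of (section_title,
    # list_of_articles) tuples, where each article is a dict with 'title',
    # 'url', 'description' and 'date' keys.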
    def parse_index(self):
        return self.economist_parse_index()

    def economist_parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        div = soup.find('div', attrs={'class': 'issue-image'})
        if div is not None:
            img = div.find('img', src=True)
            if img is not None:
                self.cover_url = img['src']
        feeds = OrderedDict()
        for section in soup.findAll(attrs={'class': lambda x: x and 'section' in x}):
            h4 = section.find('h4')
            if h4 is None:
                continue
            section_title = self.tag_to_string(h4).strip()
            if not section_title:
                continue
            self.log('Found section: %s' % section_title)
            articles = []
            subsection = ''
            for node in section.findAll(attrs={'class': 'article'}):
                # Articles inherit the most recent h5 subsection heading
                # preceding them, used as a 'Subsection: Title' prefix.
                subsec = node.findPreviousSibling('h5')
                if subsec is not None:
                    subsection = self.tag_to_string(subsec)
                prefix = (subsection + ': ') if subsection else ''
                a = node.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'):
                        url = 'http://www.economist.com' + url
                    url += '/print'
                    title = self.tag_to_string(a)
                    if title:
                        title = prefix + title
                        self.log('\tFound article:', title)
                        articles.append({'title': title, 'url': url,
                            'description': '', 'date': ''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        if not ans:
            raise Exception('Could not find any articles: either the '
                    'economist.com server is having trouble (try again '
                    'later) or the website format has changed and the '
                    'recipe needs to be updated.')
        return ans

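    # Heuristic for spotting image tables on the print pages: a table aligned
    # right or center holding exactly one <img> plus one or two <font> caption
    # tags is treated as an image-with-caption block.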
    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        body = soup.find('body')
        # Strip all attributes from <body>; iterate over a copy so the
        # deletions cannot interfere with the loop.
        for name, val in list(body.attrs):
            del body[name]

        # Replace each image table with a plain <div>: the caption text in
        # smaller type, a line break, then the image itself with its
        # hard-coded width/height removed.
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup
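
# A minimal way to test this recipe from the command line (assuming calibre's
# CLI tools are installed) is:
#
#   ebook-convert economist.recipe .epub --test
#
# With --test, calibre downloads only a couple of articles from the first
# couple of sections, which makes iterating on the parsing code much faster.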