#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from collections import OrderedDict

import time, re

class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    INDEX = 'http://www.economist.com/printedition'
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT)')
    extra_css      = '.headline {font-size: x-large;} \n h2 { font-size: small;  } \n h1 { font-size: medium;  }'
    oldest_article = 7.0
    cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
    #cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
    remove_tags = [
            dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class':['dblClkTrk', 'ec-article-info',
                'share_inline_header', 'related-items']}),
            {'class': lambda x: x and 'share-links-header' in x},
    ]
    keep_only_tags = [dict(id='ec-article-body')]
    needs_subscription = False
    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

    # economist.com has started throttling: after roughly 60% of the issue has
    # been downloaded, requests begin failing with connection reset by peer
    # (104) errors, so pause between fetches.
    delay = 1


    def parse_index(self):
        try:
            return self.economist_parse_index()
        except:
            # If the first attempt fails, wait briefly and retry once before
            # giving up.
            self.log.warn(
                'Initial attempt to parse index failed, retrying in 30 seconds')
            time.sleep(30)
            return self.economist_parse_index()

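    # Scrape the print-edition index page: pick up the cover image, then walk
    # each section block collecting article titles and URLs (rewritten to the
    # printer-friendly /print pages).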
    def economist_parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        div = soup.find('div', attrs={'class':'issue-image'})
        if div is not None:
            img = div.find('img', src=True)
            if img is not None:
                self.cover_url = img['src']
        feeds = OrderedDict()
        for section in soup.findAll(attrs={'class':lambda x: x and 'section' in
            x}):
            h4 = section.find('h4')
            if h4 is None:
                continue
            section_title = self.tag_to_string(h4).strip()
            if not section_title:
                continue
            self.log('Found section: %s'%section_title)
            articles = []
            subsection = ''
            for node in section.findAll(attrs={'class':'article'}):
                subsec = node.findPreviousSibling('h5')
                if subsec is not None:
                    subsection = self.tag_to_string(subsec)
                prefix = (subsection+': ') if subsection else ''
                a = node.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'): url = 'http://www.economist.com'+url
                    url += '/print'
                    title = self.tag_to_string(a)
                    if title:
                        title = prefix + title
                        self.log('\tFound article:', title)
                        articles.append({'title':title, 'url':url,
                            'description':'', 'date':''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        if not ans:
            raise Exception('Could not find any articles, either the '
                    'economist.com server is having trouble and you should '
                    'try later or the website format has changed and the '
                    'recipe needs to be updated.')
        return ans

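    # Yield tables that the site uses purely as image-plus-caption containers
    # (exactly one <img> and one or two <font> caption tags).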
    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1:
                yield x

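    # Strip stray attributes from <body> and convert each image table into a
    # plain <div> with a small-font caption above the image.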
    def postprocess_html(self, soup, first):
        body = soup.find('body')
        # Iterate over a copy, since deleting an attribute mutates body.attrs
        for name, val in list(body.attrs):
            del body[name]

        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup

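# An older, RSS-based variant of this recipe is kept below for reference; it is
# disabled by being wrapped in a module-level string.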
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.threadpool import ThreadPool, makeRequests
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
import time, string, re
from datetime import datetime
from lxml import html

class Economist(BasicNewsRecipe):

    title = 'The Economist (RSS)'
    language = 'en'

    __author__ = "Kovid Goyal"
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT).'
            ' Much slower than the print edition based version.')
    extra_css      = '.headline {font-size: x-large;} \n h2 { font-size: small;  } \n h1 { font-size: medium;  }'
    oldest_article = 7.0
    cover_url = 'http://media.economist.com/sites/default/files/imagecache/print-cover-thumbnail/print-covers/currentcoverus_large.jpg'
    #cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
    remove_tags = [
            dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class':['dblClkTrk', 'ec-article-info',
                'share_inline_header', 'related-items']}),
            {'class': lambda x: x and 'share-links-header' in x},
    ]
    keep_only_tags = [dict(id='ec-article-body')]
    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

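    # Build the article list from the full print-edition RSS feed: skip
    # entries older than oldest_article, then resolve each remaining entry to
    # its printer-friendly URL in parallel via a thread pool.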
    def parse_index(self):
        from calibre.web.feeds.feedparser import parse
        if self.test:
            self.oldest_article = 14.0
        raw = self.index_to_soup(
                'http://feeds.feedburner.com/economist/full_print_edition',
                raw=True)
        entries = parse(raw).entries
        pool = ThreadPool(10)
        self.feed_dict = {}
        requests = []
        for i, item in enumerate(entries):
            title       = item.get('title', _('Untitled article'))
            published = item.date_parsed
            if not published:
                published = time.gmtime()
            utctime = datetime(*published[:6])
            delta = datetime.utcnow() - utctime
            if delta.days*24*3600 + delta.seconds > 24*3600*self.oldest_article:
                self.log.debug('Skipping article %s as it is too old.'%title)
                continue
            link        = item.get('link', None)
            description = item.get('description', '')
            author      = item.get('author', '')

            requests.append([i, link, title, description, author, published])
        if self.test:
            requests = requests[:4]
        requests = makeRequests(self.process_eco_feed_article, requests, self.eco_article_found,
                self.eco_article_failed)
        for r in requests: pool.putRequest(r)
        pool.wait()

        return self.eco_sort_sections([(t, a) for t, a in
            self.feed_dict.items()])

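    # Sort sections into roughly the print edition's running order; unknown
    # section names sort to the end.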
    def eco_sort_sections(self, feeds):
        if not feeds:
            raise ValueError('No new articles found')
        order = {
            'The World This Week': 1,
            'Leaders': 2,
            'Letters': 3,
            'Briefing': 4,
            'Business': 5,
            'Finance And Economics': 6,
            'Science & Technology': 7,
            'Books & Arts': 8,
            'International': 9,
            'United States': 10,
            'Asia': 11,
            'Europe': 12,
            'The Americas': 13,
            'Middle East & Africa': 14,
            'Britain': 15,
            'Obituary': 16,
        }
        return sorted(feeds, cmp=lambda x,y:cmp(order.get(x[0], 100),
            order.get(y[0], 100)))

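    # Thread-pool worker: fetch the article, read its section name from the
    # ec-article-info block, and rewrite the URL to the /print version.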
    def process_eco_feed_article(self, args):
        from calibre import browser
        i, url, title, description, author, published = args
        br = browser()
        ret = br.open(url)
        raw = ret.read()
        url = br.geturl().split('?')[0]+'/print'
        root = html.fromstring(raw)
        matches = root.xpath('//*[@class = "ec-article-info"]')
        feedtitle = 'Miscellaneous'
        if matches:
            feedtitle = string.capwords(html.tostring(matches[-1], method='text',
                    encoding=unicode).split('|')[-1].strip())
        return (i, feedtitle, url, title, description, author, published)

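    # Success callback: wrap the result in an Article and file it under its
    # section in feed_dict.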
    def eco_article_found(self, req, result):
        from calibre.web.feeds import Article
        i, feedtitle, link, title, description, author, published = result
        self.log('Found print version for article:', title, 'in', feedtitle,
                'at', link)

        a = Article(i, title, link, author, description, published, '')

        article = dict(title=a.title, description=a.text_summary,
            date=time.strftime(self.timefmt, a.date), author=a.author, url=a.url)
        if feedtitle not in self.feed_dict:
            self.feed_dict[feedtitle] = []
        self.feed_dict[feedtitle].append(article)

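    # Failure callback: log the article that could not be downloaded along
    # with the traceback.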
    def eco_article_failed(self, req, tb):
        self.log.error('Failed to download %s with error:'%req.args[0][2])
        self.log.debug(tb)

    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        body = soup.find('body')
        # Iterate over a copy, since deleting an attribute mutates body.attrs
        for name, val in list(body.attrs):
            del body[name]
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            img.extract()
            del img['width']
            del img['height']
            div.insert(2, img)
            table.replaceWith(div)
        return soup
'''