/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / observer_gb.recipe
Text File  |  2011-09-09  |  7KB  |  149 lines

#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
http://www.guardian.co.uk/theobserver
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

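# Calibre news recipe for The Observer, the Guardian group's Sunday paper.
# It first tries to build an index by scraping the paper's section page;
# if that fails, it falls back to the RSS feeds listed further down.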
class Guardian(BasicNewsRecipe):

    title = u'The Observer'
    __author__ = 'jbambridge'
    language = 'en_GB'
    simultaneous_downloads = 5

    oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True

    timefmt = ' [%a, %d %b %Y]'

    filter_regexps = [r'r\.kelkoo\.com']

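    # keep_only_tags restricts each downloaded article to its main content
    # containers; remove_tags then strips toolbars, video promos, pagination
    # and other page furniture from what remains.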
    keep_only_tags = [
        dict(name='div', attrs={'id':['content', 'article_header', 'main-article-info']}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class':['video-content', 'videos-third-column']}),
        dict(name='div', attrs={'id':['article-toolbox', 'subscribe-feeds']}),
        dict(name='div', attrs={'class':['promo-component bookshop-books-promo bookshop-books']}),
        dict(name='ul', attrs={'class':['pagination']}),
        dict(name='ul', attrs={'id':['content-actions']}),
        dict(name='li', attrs={'id':['product-image']}),
    ]
    use_embedded_content = False

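    # The site's own stylesheets are discarded and replaced with the minimal
    # CSS below, so articles render consistently in the generated e-book.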
    no_stylesheets = True
    extra_css = '''
                    .article-attributes{font-size: x-small; font-family:Arial,Helvetica,sans-serif;}
                    .h1{font-size: large; font-family:georgia,serif; font-weight:bold;}
                    .stand-first-alone{color:#666666; font-size:small; font-family:Arial,Helvetica,sans-serif;}
                    .caption{color:#666666; font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
                    #article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
                    .main-article-info{font-family:Arial,Helvetica,sans-serif;}
                    #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
                    #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
                '''

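    # (section title, RSS URL) pairs. These feeds are only used when
    # parse_index() below cannot build an index from the section page.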
    feeds = [
        (u'Main Section', u'feed://www.guardian.co.uk/theobserver/news/uknews/rss'),
        (u'News', u'feed://www.guardian.co.uk/theobserver/news/rss'),
        (u'World News', u'feed://www.guardian.co.uk/theobserver/news/worldnews/rss'),
        (u'In Focus', u'feed://www.guardian.co.uk/theobserver/news/focus/rss'),
        (u'7 Days', u'feed://www.guardian.co.uk/theobserver/news/7days/rss'),
        (u'Seven Days', u'feed://www.guardian.co.uk/theobserver/news/seven-days/rss'),
        (u'Media', u'feed://www.guardian.co.uk/theobserver/news/media/rss'),
        (u'Business', u'feed://www.guardian.co.uk/theobserver/businessandmedia/rss'),
        (u'Cash', u'feed://www.guardian.co.uk/theobserver/news/cash/rss'),
        (u'Money', u'feed://feeds.guardian.co.uk/theguardian/money/rss'),
        (u'Comment', u'feed://www.guardian.co.uk/theobserver/news/comment/rss'),
        (u'Travel', u'feed://www.guardian.co.uk/theobserver/escape/rss'),
        (u'Culture', u'feed://www.guardian.co.uk/theobserver/review/rss'),
        (u'TV & Radio', u'feed://www.guardian.co.uk/tv-and-radio/rss'),
        (u'New Review', u'feed://www.guardian.co.uk/theobserver/new-review/rss'),
        (u'Agenda', u'feed://www.guardian.co.uk/theobserver/new-review/agenda/rss'),
        (u'Critics', u'feed://www.guardian.co.uk/theobserver/new-review/critics/rss'),
        (u'Features', u'feed://www.guardian.co.uk/theobserver/new-review/features/rss'),
        (u'Discover', u'feed://www.guardian.co.uk/theobserver/new-review/discover/rss'),
        (u'Books', u'feed://www.guardian.co.uk/theobserver/new-review/books/rss'),
        (u'Magazine', u'feed://www.guardian.co.uk/theobserver/magazine/rss'),
        (u'Regulars', u'feed://www.guardian.co.uk/theobserver/magazine/regulars/rss'),
        (u'Life & Style', u'feed://www.guardian.co.uk/theobserver/magazine/life-and-style/rss'),
        (u'Mag Features', u'feed://www.guardian.co.uk/theobserver/magazine/features2/rss'),
        (u'Sport', u'feed://www.guardian.co.uk/theobserver/sport/rss'),
    ]

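    # Use the feed item's guid as the article URL, skipping video, gallery,
    # quiz and other interactive items that do not convert well to an e-book.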
    def get_article_url(self, article):
        url = article.get('guid', None)
        if url is None:
            return None
        if '/video/' in url or '/flyer/' in url or '/quiz/' in url or \
                '/gallery/' in url or 'ivebeenthere' in url or \
                'pickthescore' in url or 'audioslideshow' in url:
            url = None
        return url

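    # Pre-conversion clean-up: drop inline style/face attributes and turn
    # <ul>/<li> elements into plain <div>s so the site's list markup does not
    # leak into the output.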
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(face=True):
            del item['face']
        for tag in soup.findAll(name=['ul', 'li']):
            tag.name = 'div'
        return soup

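    # Scrape the Observer front page: use the digital-edition image as the
    # cover, then yield (section title, section URL) pairs from the
    # 'book-index' block.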
    def find_sections(self):
        soup = self.index_to_soup('http://www.guardian.co.uk/theobserver')
        # find cover pic
        img = soup.find('img', attrs={'alt':'Guardian digital edition'})
        if img is not None:
            self.cover_url = img['src']
        # end find cover pic

        idx = soup.find('div', id='book-index')
        for s in idx.findAll('strong', attrs={'class':'book'}):
            a = s.find('a', href=True)
            yield (self.tag_to_string(a), a['href'])

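    # Walk a single section page and yield one article dictionary (title, url,
    # description, date) per entry in its 'trailblock' lists.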
    def find_articles(self, url):
        soup = self.index_to_soup(url)
        div = soup.find('div', attrs={'class':'book-index'})
        for ul in div.findAll('ul', attrs={'class':'trailblock'}):
            for li in ul.findAll('li'):
                a = li.find(href=True)
                if not a:
                    continue
                title = self.tag_to_string(a)
                url = a['href']
                if not title or not url:
                    continue
                desc = ''
                tt = li.find('div', attrs={'class':'trailtext'})
                if tt is not None:
                    for da in tt.findAll('a'):
                        da.extract()
                    desc = self.tag_to_string(tt).strip()
                yield {
                    'title': title, 'url': url, 'description': desc,
                    'date': strftime('%a, %d %b'),
                }

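    # Build the issue index from the scraped sections. Any scraping failure is
    # turned into NotImplementedError, which makes BasicNewsRecipe fall back
    # to the RSS feeds defined above.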
    def parse_index(self):
        try:
            feeds = []
            for title, href in self.find_sections():
                feeds.append((title, list(self.find_articles(href))))
            return feeds
        except Exception:
            raise NotImplementedError

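    # Keep only the <html> element of each processed page; anything the parser
    # left outside it is dropped from the final book.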
    def postprocess_html(self, soup, first):
        return soup.findAll('html')[0]

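# A quick way to try changes to this recipe is calibre's command-line
# converter, e.g.:
#
#   ebook-convert observer_gb.recipe output.epub --test
#
# (output.epub is just an example name; --test downloads only a couple of
# articles from the first couple of feeds, enough to check the tag filters
# and CSS above.)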