home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / dawn.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  93 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import Tag
  3.  
  4. class DawnRecipe(BasicNewsRecipe):
  5.     __license__  = 'GPL v3'
  6.     __author__ = 'kwetal'
  7.     language = 'en_PK'
  8.     version = 1
  9.  
  10.     title = u'Dawn'
  11.     publisher = u'Dawn Media Group'
  12.     category = u'News, Pakistan'
  13.     description = u'Leading English Newspaper of Pakistan covering national & international news'
  14.  
  15.     use_embedded_content = False
  16.     remove_empty_feeds = True
  17.     oldest_article = 2
  18.     max_articles_per_feed = 100
  19.  
  20.     no_stylesheets = True
  21.     remove_javascript = True
  22.     encoding = 'utf-8'
  23.  
  24.     # Feeds from http://www.dawn.com/wps/wcm/connect/dawn-content-library/dawn/services/rss
  25.     feeds = []
  26.     feeds.append((u'Latest News', u'http://feedproxy.google.com/Dawn-All-News'))
  27.     feeds.append((u'Pakistan News', u'http://feeds2.feedburner.com/dawn/news/pakistan'))
  28.     feeds.append((u'World News', u'http://feeds2.feedburner.com/dawn/news/world'))
  29.     feeds.append((u'Business News', u'http://feeds2.feedburner.com/dawn/news/business'))
  30.     feeds.append((u'Sport News', u'http://feeds2.feedburner.com/dawn/news/sport'))
  31.     feeds.append((u'Cricket News', u'http://feeds2.feedburner.com/dawn/news/cricket'))
  32.     feeds.append((u'Sci-tech News', u'http://feeds2.feedburner.com/dawn/news/technology'))
  33.     feeds.append((u'Entertainment News', u'http://feeds2.feedburner.com/dawn/news/entertainment'))
  34.     feeds.append((u'Columnists', u'http://feeds2.feedburner.com/dawn/news/columnists'))
  35.     #feeds.append((u'', u''))
  36.  
  37.     conversion_options = {'comments': description, 'tags': category, 'language': 'en',
  38.                           'publisher': publisher}
  39.  
  40.     extra_css = '''
  41.                 body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
  42.                 center {font-size: xx-small; color: #666666;}
  43.                 strong {font-size: small; font-weight: bold;}
  44.                 span.news_headline {font-size: xx-large; font-weight: bold; margin: 0em; padding: 0em}
  45.                 span.news_byline {font-size: x-small; color: #696969; margin-top: 1em;}
  46.                 '''
  47.  
  48.     def print_version(self, url):
  49.         return url + '?pagedesign=Dawn_PrintlyFriendlyPage'
  50.  
  51.     def preprocess_html(self, soup):
  52.         newBody = Tag(soup, 'body')
  53.  
  54.         for cl in ['page_title', 'news_headline', 'news_byline']:
  55.             tag = soup.find('span', attrs = {'class': cl})
  56.             if tag:
  57.                 # They like their <br> tags; I don't: does not work well on small screens.
  58.                 if tag['class'] == 'news_byline':
  59.                     for br in tag.findAll('br'):
  60.                         br.extract()
  61.  
  62.                 newBody.append(tag)
  63.  
  64.         table = soup.find('table', attrs = {'id': 'body table'})
  65.         if table:
  66.             for td in table.findAll('td', attrs = {'class': 'news_story'}):
  67.                 for tag in td.findAll(True):
  68.                     if tag.has_key('id') and tag['id'] == 'banner-img_slide':
  69.                         tag.extract()
  70.                     elif tag.has_key('style'):
  71.                         del tag['style']
  72.                     elif tag.name == 'script':
  73.                         tag.extract()
  74.  
  75.                 # They like their <br> tags; I don't: does not work well on small screens.
  76.                 center = td.find('center')
  77.                 if center:
  78.                     for br in center.findNextSiblings('br'):
  79.                         br.extract()
  80.                     for br in center.findPreviousSiblings('br'):
  81.                         br.extract()
  82.  
  83.                 for attr in ['align', 'valign']:
  84.                     if td.has_key(attr):
  85.                         del td[attr]
  86.  
  87.                 td.name = 'div'
  88.                 newBody.append(td)
  89.  
  90.             soup.body.replaceWith(newBody)
  91.  
  92.             return soup
  93.