home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / frontlineonnet.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  82 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. frontlineonnet.com
  5. '''
  6.  
  7. import re
  8. from calibre import strftime
  9. from calibre.web.feeds.news import BasicNewsRecipe
  10.  
  11. class Frontlineonnet(BasicNewsRecipe):
  12.     title                = 'Frontline'
  13.     __author__           = 'Darko Miletic'
  14.     description          = "India's national magazine"
  15.     publisher            = 'Frontline'
  16.     category             = 'news, politics, India'
  17.     no_stylesheets       = True
  18.     delay                = 1
  19.     INDEX                = 'http://frontlineonnet.com/'
  20.     use_embedded_content = False
  21.     encoding             = 'cp1252'
  22.     language             = 'en_IN'
  23.     publication_type     = 'magazine'
  24.     masthead_url         = 'http://frontlineonnet.com/images/newfline.jpg'
  25.     extra_css            = """
  26.                               body{font-family: Verdana,Arial,Helvetica,sans-serif}
  27.                               img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
  28.                            """
  29.  
  30.     conversion_options = {
  31.                           'comment'          : description
  32.                         , 'tags'             : category
  33.                         , 'publisher'        : publisher
  34.                         , 'language'         : language
  35.                         , 'linearize_tables' : True
  36.                         }
  37.  
  38.     preprocess_regexps = [
  39.                            (re.compile(r'.*?<base', re.DOTALL|re.IGNORECASE),lambda match: '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><html dir="ltr" xml:lang="en-IN"><head><title>title</title><base')
  40.                           ,(re.compile(r'<base .*?>', re.DOTALL|re.IGNORECASE),lambda match: '</head><body>')
  41.                           ,(re.compile(r'<byline>', re.DOTALL|re.IGNORECASE),lambda match: '<div class="byline">')
  42.                           ,(re.compile(r'</byline>', re.DOTALL|re.IGNORECASE),lambda match: '</div>')
  43.                           ,(re.compile(r'<center>', re.DOTALL|re.IGNORECASE),lambda match: '<div class="ctr">')
  44.                           ,(re.compile(r'</center>', re.DOTALL|re.IGNORECASE),lambda match: '</div>')
  45.                          ]
  46.  
  47.     keep_only_tags= [
  48.                       dict(name='font', attrs={'class':'storyhead'})
  49.                      ,dict(attrs={'class':'byline'})
  50.                     ]
  51.     remove_attributes=['size','noshade','border']
  52.  
  53.     def preprocess_html(self, soup):
  54.         for item in soup.findAll(style=True):
  55.             del item['style']
  56.         for item in soup.findAll('img'):
  57.             if not item.has_key('alt'):
  58.                item['alt'] = 'image'
  59.         return soup
  60.  
  61.     def parse_index(self):
  62.         articles = []
  63.         soup = self.index_to_soup(self.INDEX)
  64.         for feed_link in soup.findAll('a',href=True):
  65.             if feed_link['href'].startswith('stories/'):
  66.                 url   = self.INDEX + feed_link['href']
  67.                 title = self.tag_to_string(feed_link)
  68.                 date  = strftime(self.timefmt)
  69.                 articles.append({
  70.                                   'title'      :title
  71.                                  ,'date'       :date
  72.                                  ,'url'        :url
  73.                                  ,'description':''
  74.                                 })
  75.         return [('Frontline', articles)]
  76.  
  77.     def print_version(self, url):
  78.         return "http://www.hinduonnet.com/thehindu/thscrip/print.pl?prd=fline&file=" + url.rpartition('/')[2]
  79.  
  80.     def image_url_processor(self, baseurl, url):
  81.         return url.replace('../images/', self.INDEX + 'images/').strip()
  82.