home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / financial_times.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  107 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. www.ft.com
  5. '''
  6.  
  7. import datetime
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9.  
  10. class FinancialTimes_rss(BasicNewsRecipe):
  11.     title                 = 'Financial Times'
  12.     __author__            = 'Darko Miletic'
  13.     description           = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."
  14.     publisher             = 'The Financial Times Ltd.'
  15.     category              = 'news, finances, politics, World'
  16.     oldest_article        = 2
  17.     language              = 'en'
  18.     max_articles_per_feed = 250
  19.     no_stylesheets        = True
  20.     use_embedded_content  = False
  21.     needs_subscription    = True
  22.     encoding              = 'utf8'
  23.     publication_type      = 'newspaper'
  24.     masthead_url          = 'http://im.media.ft.com/m/img/masthead_main.jpg'
  25.     LOGIN                 = 'https://registration.ft.com/registration/barrier/login'
  26.     INDEX                 = 'http://www.ft.com'
  27.  
  28.     conversion_options = {
  29.                           'comment'          : description
  30.                         , 'tags'             : category
  31.                         , 'publisher'        : publisher
  32.                         , 'language'         : language
  33.                         , 'linearize_tables' : True
  34.                         }
  35.  
  36.     def get_browser(self):
  37.         br = BasicNewsRecipe.get_browser()
  38.         br.open(self.INDEX)
  39.         if self.username is not None and self.password is not None:
  40.             br.open(self.LOGIN)
  41.             br.select_form(name='loginForm')
  42.             br['username'] = self.username
  43.             br['password'] = self.password
  44.             br.submit()
  45.         return br
  46.  
  47.     keep_only_tags = [dict(name='div', attrs={'class':['fullstory fullstoryHeader','fullstory fullstoryBody','ft-story-header','ft-story-body','index-detail']})]
  48.     remove_tags = [
  49.                       dict(name='div', attrs={'id':'floating-con'})
  50.                      ,dict(name=['meta','iframe','base','object','embed','link'])
  51.                      ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image']})
  52.                   ]
  53.     remove_attributes = ['width','height','lang']
  54.  
  55.     extra_css = """
  56.                 body{font-family: Georgia,Times,"Times New Roman",serif}
  57.                 h2{font-size:large}
  58.                 .ft-story-header{font-size: x-small}
  59.                 .container{font-size:x-small;}
  60.                 h3{font-size:x-small;color:#003399;}
  61.                 .copyright{font-size: x-small}
  62.                 img{margin-top: 0.8em; display: block}
  63.                 .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}
  64.                 .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
  65.                 """
  66.  
  67.     feeds = [
  68.                (u'UK'         , u'http://www.ft.com/rss/home/uk'        )
  69.               ,(u'US'         , u'http://www.ft.com/rss/home/us'        )
  70.               ,(u'Asia'       , u'http://www.ft.com/rss/home/asia'      )
  71.               ,(u'Middle East', u'http://www.ft.com/rss/home/middleeast')
  72.             ]
  73.  
  74.     def preprocess_html(self, soup):
  75.         items = ['promo-box','promo-title',
  76.                  'promo-headline','promo-image',
  77.                  'promo-intro','promo-link','subhead']
  78.         for item in items:
  79.             for it in soup.findAll(item):
  80.                 it.name = 'div'
  81.                 it.attrs = []
  82.         for item in soup.findAll(style=True):
  83.             del item['style']
  84.         for item in soup.findAll('a'):
  85.             limg = item.find('img')
  86.             if item.string is not None:
  87.                str = item.string
  88.                item.replaceWith(str)
  89.             else:
  90.                if limg:
  91.                   item.name = 'div'
  92.                   item.attrs = []
  93.                else:
  94.                    str = self.tag_to_string(item)
  95.                    item.replaceWith(str)
  96.         for item in soup.findAll('img'):
  97.             if not item.has_key('alt'):
  98.                item['alt'] = 'image'
  99.         return soup
  100.  
  101.     def get_cover_url(self):
  102.         cdate = datetime.date.today()
  103.         if cdate.isoweekday() == 7:
  104.            cdate -= datetime.timedelta(days=1)
  105.         return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_USA.pdf')
  106.  
  107.