home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / financial_times_uk.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  6.7 KB  |  171 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. www.ft.com/uk-edition
  5. '''
  6.  
  7. import datetime
  8. from calibre.ptempfile import PersistentTemporaryFile
  9. from calibre import strftime
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11.  
  12. class FinancialTimes(BasicNewsRecipe):
  13.     title                 = 'Financial Times - UK printed edition'
  14.     __author__            = 'Darko Miletic'
  15.     description           = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."
  16.     publisher             = 'The Financial Times Ltd.'
  17.     category              = 'news, finances, politics, UK, World'
  18.     oldest_article        = 2
  19.     language              = 'en_GB'
  20.     max_articles_per_feed = 250
  21.     no_stylesheets        = True
  22.     use_embedded_content  = False
  23.     needs_subscription    = True
  24.     encoding              = 'utf8'
  25.     publication_type      = 'newspaper'
  26.     articles_are_obfuscated = True
  27.     temp_files              = []
  28.     masthead_url          = 'http://im.media.ft.com/m/img/masthead_main.jpg'
  29.     LOGIN                 = 'https://registration.ft.com/registration/barrier/login'
  30.     LOGIN2                = 'http://media.ft.com/h/subs3.html'
  31.     INDEX                 = 'http://www.ft.com/uk-edition'
  32.     PREFIX                = 'http://www.ft.com'
  33.  
  34.     conversion_options = {
  35.                           'comment'          : description
  36.                         , 'tags'             : category
  37.                         , 'publisher'        : publisher
  38.                         , 'language'         : language
  39.                         , 'linearize_tables' : True
  40.                         }
  41.  
  42.     def get_browser(self):
  43.         br = BasicNewsRecipe.get_browser()
  44.         br.open(self.INDEX)
  45.         if self.username is not None and self.password is not None:
  46.             br.open(self.LOGIN2)
  47.             br.select_form(name='loginForm')
  48.             br['username'] = self.username
  49.             br['password'] = self.password
  50.             br.submit()
  51.         return br
  52.  
  53.     keep_only_tags = [
  54.                         dict(name='div', attrs={'class':['fullstory fullstoryHeader', 'ft-story-header']})
  55.                        ,dict(name='div', attrs={'class':'standfirst'})
  56.                        ,dict(name='div', attrs={'id'   :'storyContent'})
  57.                        ,dict(name='div', attrs={'class':['ft-story-body','index-detail']})
  58.                      ]
  59.     remove_tags = [
  60.                       dict(name='div', attrs={'id':'floating-con'})
  61.                      ,dict(name=['meta','iframe','base','object','embed','link'])
  62.                      ,dict(attrs={'class':['storyTools','story-package','screen-copy','story-package separator','expandable-image']})
  63.                   ]
  64.     remove_attributes = ['width','height','lang']
  65.  
  66.     extra_css = """
  67.                 body{font-family: Georgia,Times,"Times New Roman",serif}
  68.                 h2{font-size:large}
  69.                 .ft-story-header{font-size: x-small}
  70.                 .container{font-size:x-small;}
  71.                 h3{font-size:x-small;color:#003399;}
  72.                 .copyright{font-size: x-small}
  73.                 img{margin-top: 0.8em; display: block}
  74.                 .lastUpdated{font-family: Arial,Helvetica,sans-serif; font-size: x-small}
  75.                 .byline,.ft-story-body,.ft-story-header{font-family: Arial,Helvetica,sans-serif}
  76.                 """
  77.  
  78.     def get_artlinks(self, elem):
  79.         articles = []
  80.         count = 0
  81.         for item in elem.findAll('a',href=True):
  82.             count = count + 1
  83.             if self.test and count > 2:
  84.                return articles
  85.             rawlink = item['href']
  86.             if rawlink.startswith('http://'):
  87.                url = rawlink
  88.             else:
  89.                url   = self.PREFIX + rawlink
  90.             urlverified = self.browser.open_novisit(url).geturl() # resolve redirect.
  91.             title = self.tag_to_string(item)
  92.             date = strftime(self.timefmt)
  93.             articles.append({
  94.                               'title'      :title
  95.                              ,'date'       :date
  96.                              ,'url'        :urlverified
  97.                              ,'description':''
  98.                             })
  99.         return articles
  100.  
  101.     def parse_index(self):
  102.         feeds = []
  103.         soup = self.index_to_soup(self.INDEX)
  104.         wide = soup.find('div',attrs={'class':'wide'})
  105.         if not wide:
  106.            return feeds
  107.         strest = wide.findAll('h3', attrs={'class':'section'})
  108.         if not strest:
  109.            return feeds
  110.         st = wide.find('h4',attrs={'class':'section-no-arrow'})
  111.         if st:
  112.            strest.insert(0,st)
  113.         count = 0
  114.         for item in strest:
  115.             count = count + 1
  116.             if self.test and count > 2:
  117.                return feeds
  118.             ftitle   = self.tag_to_string(item)
  119.             self.report_progress(0, _('Fetching feed')+' %s...'%(ftitle))
  120.             feedarts = self.get_artlinks(item.parent.ul)
  121.             feeds.append((ftitle,feedarts))
  122.         return feeds
  123.  
  124.     def preprocess_html(self, soup):
  125.         items = ['promo-box','promo-title',
  126.                  'promo-headline','promo-image',
  127.                  'promo-intro','promo-link','subhead']
  128.         for item in items:
  129.             for it in soup.findAll(item):
  130.                 it.name = 'div'
  131.                 it.attrs = []
  132.         for item in soup.findAll(style=True):
  133.             del item['style']
  134.         for item in soup.findAll('a'):
  135.             limg = item.find('img')
  136.             if item.string is not None:
  137.                str = item.string
  138.                item.replaceWith(str)
  139.             else:
  140.                if limg:
  141.                   item.name = 'div'
  142.                   item.attrs = []
  143.                else:
  144.                    str = self.tag_to_string(item)
  145.                    item.replaceWith(str)
  146.         for item in soup.findAll('img'):
  147.             if not item.has_key('alt'):
  148.                item['alt'] = 'image'
  149.         return soup
  150.  
  151.     def get_cover_url(self):
  152.         cdate = datetime.date.today()
  153.         if cdate.isoweekday() == 7:
  154.            cdate -= datetime.timedelta(days=1)
  155.         return cdate.strftime('http://specials.ft.com/vtf_pdf/%d%m%y_FRONT1_LON.pdf')
  156.  
  157.     def get_obfuscated_article(self, url):
  158.         count = 0
  159.         while (count < 10):
  160.             try:
  161.                 response = self.browser.open(url)
  162.                 html = response.read()
  163.                 count = 10
  164.             except:
  165.                 print "Retrying download..."
  166.             count += 1        
  167.         self.temp_files.append(PersistentTemporaryFile('_fa.html'))
  168.         self.temp_files[-1].write(html)
  169.         self.temp_files[-1].close()
  170.         return self.temp_files[-1].name
  171.