home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / hbr_blogs.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  82 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. import re
  3.  
  4. class HBR(BasicNewsRecipe):
  5.  
  6.     title = 'Harvard Business Review Blogs'
  7.     description = 'To subscribe go to http://hbr.harvardbusiness.org'
  8.     __author__ = 'Kovid Goyal'
  9.     language = 'en'
  10.     no_stylesheets = True
  11.     #recipe_disabled = ('hbr.org has started requiring the use of javascript'
  12.     #        ' to log into their website. This is unsupported in calibre, so'
  13.     #        ' this recipe has been disabled. If you would like to see '
  14.     #        ' HBR supported in calibre, contact hbr.org and ask them'
  15.     #        ' to provide a javascript free login method.')
  16.     needs_subscription = False
  17.  
  18.     LOGIN_URL = 'http://hbr.org/login?request_url=/'
  19.     LOGOUT_URL = 'http://hbr.org/logout?request_url=/'
  20.  
  21.     INDEX = 'http://hbr.org/current'
  22.  
  23.     remove_tags_after = dict(id='articleBody')
  24.     remove_tags_before = dict(id='pageFeature')
  25.     feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')]
  26.     oldest_article = 30
  27.     max_articles_per_feed = 100
  28.     use_embedded_content = False
  29.  
  30.     keep_only_tags = [    dict(name='div', id='pageContainer')
  31.                 ]
  32.  
  33.     remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
  34.         'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
  35.         'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
  36.         'articleToolbarTop','articleToolbarBottom', 'articleToolbarRD',
  37.         'mailingListTout', 'partnerCenter', 'pageFooter', 'shareWidgetTop']),
  38.         dict(name=['iframe', 'style'])]
  39.  
  40.  
  41.     def get_browser(self):
  42.         br = BasicNewsRecipe.get_browser(self)
  43.         self.logout_url = None
  44.         return br
  45.  
  46.         #'''
  47.         br.open(self.LOGIN_URL)
  48.         br.select_form(name='signin-form')
  49.         br['signin-form:username'] = self.username
  50.         br['signin-form:password'] = self.password
  51.         raw = br.submit().read()
  52.         if 'My Account' not in raw:
  53.             raise Exception('Failed to login, are you sure your username and password are correct?')
  54.         try:
  55.             link = br.find_link(text='Sign out')
  56.             if link:
  57.                 self.logout_url = link.absolute_url
  58.         except:
  59.             self.logout_url = self.LOGOUT_URL
  60.         #'''
  61.         return br
  62.  
  63. #-------------------------------------------------------------------------------------------------
  64.     def cleanup(self):
  65.         if self.logout_url is not None:
  66.             self.browser.open(self.logout_url)
  67. #-------------------------------------------------------------------------------------------------
  68.     def map_url(self, url):
  69.         if url.endswith('/ar/1'):
  70.             return url[:-1]+'pr'
  71.  
  72.     def get_cover_url(self):
  73.         cover_url = None
  74.         index = 'http://hbr.org/current'
  75.         soup = self.index_to_soup(index)
  76.         link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
  77.  
  78.         if link_item:
  79.            cover_url = 'http://hbr.org' + link_item['src']
  80.  
  81.         return cover_url
  82.