home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / hbr.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  136 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. import re
  3. from datetime import date, timedelta
  4.  
  5. class HBR(BasicNewsRecipe):
  6.  
  7.     title = 'Harvard Business Review'
  8.     description = 'To subscribe go to http://hbr.harvardbusiness.org'
  9.     needs_subscription = True
  10.     __author__ = 'Kovid Goyal and Sujata Raman'
  11.     timefmt                = ' [%B %Y]'
  12.     language = 'en'
  13.     no_stylesheets = True
  14.     recipe_disabled = ('hbr.org has started requiring the use of javascript'
  15.             ' to log into their website. This is unsupported in calibre, so'
  16.             ' this recipe has been disabled. If you would like to see '
  17.             ' HBR supported in calibre, contact hbr.org and ask them'
  18.             ' to provide a javascript free login method.')
  19.  
  20.     LOGIN_URL = 'https://hbr.org/login?request_url=/'
  21.     LOGOUT_URL = 'https://hbr.org/logout?request_url=/'
  22.  
  23.     INDEX = 'http://hbr.org/archive-toc/BR'
  24.  
  25.     keep_only_tags = [dict(name='div', id='pageContainer')]
  26.     remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
  27.         'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
  28.         'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
  29.         'mailingListTout', 'partnerCenter', 'pageFooter',
  30.         'superNavHeadContainer', 'hbrDisqus',
  31.         'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
  32.         dict(name='iframe')]
  33.     extra_css = '''
  34.                 a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
  35.                 .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
  36.                 h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
  37.                 h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;  }
  38.                 #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
  39.                 #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
  40.                 '''
  41.  
  42.     def get_browser(self):
  43.         br = BasicNewsRecipe.get_browser(self)
  44.         self.logout_url = None
  45.  
  46.         #'''
  47.         br.open(self.LOGIN_URL)
  48.         br.select_form(name='signin-form')
  49.         br['signin-form:username'] = self.username
  50.         br['signin-form:password'] = self.password
  51.         raw = br.submit().read()
  52.         if '>Sign out<' not in raw:
  53.             raise Exception('Failed to login, are you sure your username and password are correct?')
  54.         try:
  55.             link = br.find_link(text='Sign out')
  56.             if link:
  57.                 self.logout_url = link.absolute_url
  58.         except:
  59.             self.logout_url = self.LOGOUT_URL
  60.         #'''
  61.         return br
  62.  
  63.     def cleanup(self):
  64.         if self.logout_url is not None:
  65.             self.browser.open(self.logout_url)
  66.  
  67.     def map_url(self, url):
  68.         if url.endswith('/ar/1'):
  69.             return url[:-1]+'pr'
  70.  
  71.  
  72.     def hbr_get_toc(self):
  73.         #return self.index_to_soup(open('/t/hbr.html').read())
  74.  
  75.         today = date.today()
  76.         future = today + timedelta(days=30)
  77.         for x in [x.strftime('%y%m') for x in (future, today)]:
  78.             url = self.INDEX + x
  79.             soup = self.index_to_soup(url)
  80.             if not soup.find(text='Issue Not Found'):
  81.                 return soup
  82.         raise Exception('Could not find current issue')
  83.  
  84.     def hbr_parse_toc(self, soup):
  85.         feeds = []
  86.         current_section = None
  87.         articles = []
  88.         for x in soup.find(id='archiveToc').findAll(['h3', 'h4']):
  89.             if x.name == 'h3':
  90.                 if current_section is not None and articles:
  91.                     feeds.append((current_section, articles))
  92.                 current_section = self.tag_to_string(x).capitalize()
  93.                 articles = []
  94.                 self.log('\tFound section:', current_section)
  95.             else:
  96.                 a = x.find('a', href=True)
  97.                 if a is None: continue
  98.                 title = self.tag_to_string(a)
  99.                 url = a['href']
  100.                 if '/ar/' not in url:
  101.                     continue
  102.                 if url.startswith('/'):
  103.                     url = 'http://hbr.org' + url
  104.                 url = self.map_url(url)
  105.                 p = x.parent.find('p')
  106.                 desc = ''
  107.                 if p is not None:
  108.                     desc = self.tag_to_string(p)
  109.                 self.log('\t\tFound article:', title)
  110.                 self.log('\t\t\t', url)
  111.                 self.log('\t\t\t', desc)
  112.  
  113.                 articles.append({'title':title, 'url':url, 'description':desc,
  114.                     'date':''})
  115.         return feeds
  116.  
  117.  
  118.     def parse_index(self):
  119.         soup = self.hbr_get_toc()
  120.         #open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8'))
  121.         feeds = self.hbr_parse_toc(soup)
  122.         return feeds
  123.  
  124.     def get_cover_url(self):
  125.         cover_url = None
  126.         index = 'http://hbr.org/current'
  127.         soup = self.index_to_soup(index)
  128.         link_item = soup.find('img', alt=re.compile("Current Issue"), src=True)
  129.  
  130.         if link_item:
  131.            cover_url = 'http://hbr.org' + link_item['src']
  132.  
  133.         return cover_url
  134.  
  135.  
  136.