home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / kidney.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  129 lines

  1. # -*- coding: utf-8 -*-
  2.  
  3. import time
  4.  
  5. from calibre.web.feeds.recipes import BasicNewsRecipe
  6. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  7.  
  class JASN(BasicNewsRecipe):
      """Calibre recipe for the Journal of the American Society of Nephrology.

      The recipe signs in with the user's credentials, builds its feeds from
      the table of contents at ``INDEX`` and, while cleaning each article,
      pulls in images that the site links to through '[in this window]'
      links.
      """

      title          = u'Journal of the American Society of Nephrology'
      language       = 'en'
      __author__     = 'Krittika Goyal'
      oldest_article = 31 #days
      max_articles_per_feed = 25
      delay = 5
      needs_subscription = True

      # Prefix used to turn site-relative links ('/cgi/...') into full URLs.
      BASE_URL = 'http://jasn.asnjournals.org'
      INDEX = 'http://jasn.asnjournals.org/current.shtml'
      no_stylesheets = True
      remove_tags_before = dict(name='h2')
      #remove_tags_after  = dict(name='th', attrs={'align':'left'})
      remove_tags = [
          dict(name='iframe'),
          #dict(name='div', attrs={'class':'related-articles'}),
          dict(name='td', attrs={'id':['jasnFooter']}),
          dict(name='table', attrs={'id':"jasnNavBar"}),
          dict(name='table', attrs={'class':'content_box_outer_table'}),
          dict(name='th', attrs={'align':'left'}),
      ]

      def _absolute_url(self, url):
          """Return ``url`` with ``BASE_URL`` prepended when it starts with '/'."""
          if url.startswith('/'):
              return self.BASE_URL + url
          return url

      # TO LOGIN
      def get_browser(self):
          """Return a browser that has submitted the site's sign-in form.

          The table of contents at ``INDEX`` is fetched and kept on
          ``self.kidney_toc_soup``; the first '[Full Text]' link found in the
          element with id 'tocTable' is opened, and the 'UserSignIn' form on
          the resulting page is filled in with ``self.username`` and
          ``self.password`` and submitted.

          Raises:
              ValueError: if the response to the sign-in form does not contain
                  the text 'Sign Out'.
          """
          # NOTE(review): get_browser is called without ``self`` exactly as the
          # recipe has always done; confirm this matches the calibre version
          # the recipe is run with.
          br = BasicNewsRecipe.get_browser()
          self.kidney_toc_soup = BeautifulSoup(br.open(self.INDEX).read())
          toc = self.kidney_toc_soup.find(id='tocTable')
          full_text = toc.find(text=lambda s: s and '[Full Text]' in s)
          link = full_text.findParent('a', href=True)
          # Presumably a full-text page is what presents the sign-in form to
          # visitors who are not logged in yet -- TODO confirm.
          br.open(self._absolute_url(link.get('href')))
          br.select_form(name='UserSignIn')
          br['username'] = self.username
          br['code'] = self.password
          raw = br.submit().read()
          if 'Sign Out' not in raw:
              raise ValueError('Failed to log in, is your account expired?')
          return br

      #feeds          = [
          #('JASN',
          #'http://jasn.asnjournals.org/rss/current.xml'),
      #]

      # TO GET ARTICLE TOC
      def jasn_get_index(self):
          """Return the parsed table-of-contents page found at ``INDEX``."""
          return self.index_to_soup(self.INDEX)

      # To parse article toc
      def parse_index(self):
          """Build the feed list from the table-of-contents page.

          Every ``<h2>`` inside the element with id 'tocBody' starts a new
          section.  Every ``<strong>`` that follows supplies an article title;
          its URL is taken from the first link containing '/full/' found under
          the tag's grandparent.  Titles without such a link are skipped.

          Returns:
              A list of ``(section_title, articles)`` tuples, where each
              article is a dict with the keys 'title', 'url', 'description'
              and 'date'.  Sections without articles are left out.
          """
          toc_body = self.jasn_get_index().find(id='tocBody')

          feeds = []
          current_section = None
          current_articles = []
          for tag in toc_body.findAll(True):
              if tag.name == 'h2':
                  # Section heading found: store the section collected so far.
                  if current_articles and current_section:
                      feeds.append((current_section, current_articles))
                  current_section = self.tag_to_string(tag)
                  current_articles = []
                  self.log('\tFound section:', current_section)
              elif current_section is not None and tag.name == 'strong':
                  title = self.tag_to_string(tag)
                  link = tag.parent.parent.find(
                      'a', href=lambda href: href and '/full/' in href)
                  if link is None:
                      continue
                  url = link.get('href', False)
                  if not url or not title:
                      continue
                  url = self._absolute_url(url)
                  self.log('\t\tFound article:', title)
                  self.log('\t\t\t', url)
                  current_articles.append({'title': title, 'url': url,
                                           'description': '', 'date': ''})

          # The last section is not followed by another heading.
          if current_articles and current_section:
              feeds.append((current_section, current_articles))

          return feeds

      def preprocess_html(self, soup):
          """Replace '[in this window]' link tables with the linked image.

          For every text node containing '[in this window]', the enclosing
          link is followed, the first ``<img>`` whose ``src`` starts with
          '/content/' is taken from the linked page, and the ``<table>``
          enclosing the link is replaced by that image.  A link is left
          untouched when it has no href, when its page cannot be downloaded
          after one retry, or when no matching image or enclosing table is
          found.

          Returns:
              The same ``soup`` object, modified in place.
          """
          markers = soup.findAll(text=lambda s: s and '[in this window]' in s)
          for marker in markers:
              link = marker.findParent('a')
              if link is None:
                  continue
              url = link.get('href', None)
              if not url:
                  continue
              url = self._absolute_url(url)

              # The page is fetched for every link, not only site-relative
              # ones: previously an absolute href left ``img`` undefined (or
              # still pointing at the previous link's image).
              try:
                  image_page = self.index_to_soup(url)
              except Exception:
                  # Wait five seconds and retry once before giving up on
                  # this image.
                  time.sleep(5)
                  try:
                      image_page = self.index_to_soup(url)
                  except Exception:
                      continue

              img = image_page.find(
                  'img', src=lambda src: src and src.startswith('/content/'))
              if img is None:
                  continue
              table = link.findParent('table')
              if table is None:
                  continue
              # Detach the image from the fetched page before moving it into
              # the article.
              img.extract()
              table.replaceWith(img)
          return soup
  125.  
  126.  
  127.  
  128.  
  129.