home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / kellog_faculty.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  2.7 KB  |  71 lines

  1. #!/usr/bin/env python
  2. # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
  3. from __future__ import with_statement
  4.  
  5. __license__   = 'GPL v3'
  6. __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
  7. __docformat__ = 'restructuredtext en'
  8.  
  9. from calibre.web.feeds.news import BasicNewsRecipe
  10. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  11.  
  12. class KellogFaculty(BasicNewsRecipe):
  13.  
  14.     title          = 'Kellog Faculty Blogs'
  15.     __author__     = 'Kovid Goyal'
  16.     description    = 'Blogs of the Kellog School of Management Faculty'
  17.     no_stylesheets = True
  18.     encoding       = 'utf-8'
  19.     language = 'en'
  20.  
  21.     remove_tags_before = {'name':'h2'}
  22.     remove_tags_after = {'class':'col-two-text'}
  23.  
  24.     def parse_index(self):
  25.         soup = self.index_to_soup('http://www.kellogg.northwestern.edu/Faculty/Blogroll.aspx')
  26.         feeds, articles = [], []
  27.         feed_title = None
  28.         main = soup.find(id='bodyCopy')
  29.         for tag in main.findAll(['h3', 'div']):
  30.             if tag.name == 'h3':
  31.                 title = self.tag_to_string(tag).capitalize()
  32.                 a = tag.find('a', href=True)
  33.                 if articles and feed_title:
  34.                     feeds.append((feed_title, articles))
  35.                 articles = []
  36.                 # Keep only blogs hosted on the Kellog servers
  37.                 feed_title = title if a and 'insight.kellog' in a['href'] else None
  38.             elif tag.name == 'div' and tag.get('class', '') == 'rssfeed':
  39.                 script = tag.find('script', src=True)
  40.                 text = \
  41.                 self.browser.open(script['src']).read().replace('document.write(',
  42.                         '')[:-2]
  43.                 text = eval(text)
  44.                 asoup = BeautifulSoup(text)
  45.                 for tag in asoup.findAll('div',
  46.                         attrs={'class':'rssincl-entry'}):
  47.                     title = self.tag_to_string(tag.find(attrs={'class':'rssincl-itemtitle'}))
  48.                     try:
  49.                         desc = self.tag_to_string(tag.find(attrs={'class':'rssincl-itemdesc'}))
  50.                     except:
  51.                         desc = ''
  52.                     url = tag.find('a', href=True)['href']
  53.  
  54.                     articles.append({
  55.                         'title':title.strip(), 'url':url, 'description':desc.strip(), 'date':''
  56.                         })
  57.  
  58.         return feeds
  59.  
  60.     def postprocess_html(self, soup, first_fetch):
  61.         for tag in soup.findAll(style=True):
  62.             del tag['style']
  63.         head = soup.find('head')
  64.         if head is not None:
  65.             for p in head.findAll('p'): p.extract()
  66.         for meta in soup.findAll('meta', attrs={'name':'description'}): meta.extract()
  67.         for t in head.findAll(text=True): t.extract()
  68.         return soup
  69.  
  70.  
  71.