home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / smith.recipe < prev    next >
Text File  |  2011-09-09  |  2KB  |  54 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  3.  
  4. class SmithsonianMagazine(BasicNewsRecipe):
  5.     title          = u'Smithsonian Magazine'
  6.     language       = 'en'
  7.     __author__     = 'Krittika Goyal'
  8.     oldest_article = 31#days
  9.     max_articles_per_feed = 50
  10.     use_embedded_content = False
  11.     #encoding = 'latin1'
  12.     recursions = 1
  13.     match_regexps = ['&page=[2-9]$']
  14.  
  15.     remove_stylesheets = True
  16.     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
  17.     remove_tags_after  = dict(name='p', attrs={'id':'articlePaginationWrapper'})
  18.     remove_tags = [
  19.        dict(name='iframe'),
  20.        dict(name='div', attrs={'class':'article_sidebar_border'}),
  21.        dict(name='div', attrs={'id':['article_sidebar_border', 'most-popular_large', 'most-popular-body_large']}),
  22.        ##dict(name='ul', attrs={'class':'article-tools'}),
  23.        dict(name='ul', attrs={'class':'cat-breadcrumb col three last'}),
  24.     ]
  25.  
  26.  
  27.     feeds          = [
  28. ('History and Archeology',
  29.  'http://feeds.feedburner.com/smithsonianmag/history-archaeology'),
  30. ('People and Places',
  31.  'http://feeds.feedburner.com/smithsonianmag/people-places'),
  32. ('Science and Nature',
  33.  'http://feeds.feedburner.com/smithsonianmag/science-nature'),
  34. ('Arts and Culture',
  35.  'http://feeds.feedburner.com/smithsonianmag/arts-culture'),
  36. ('Travel',
  37.  'http://feeds.feedburner.com/smithsonianmag/travel'),
  38. ]
  39.  
  40.     def preprocess_html(self, soup):
  41.         story = soup.find(name='div', attrs={'id':'article-body'})
  42.         ##td = heading.findParent(name='td')
  43.         ##td.extract()
  44.         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  45.         body = soup.find(name='body')
  46.         body.insert(0, story)
  47.         return soup
  48.  
  49.     #def postprocess_html(self, soup, first):
  50.         #for p in soup.findAll(id='articlePaginationWrapper'): p.extract()
  51.         #if not first:
  52.              #for div in soup.findAll(id='article-head'): div.extract()
  53.         #return soup
  54.