# nytimesbook.recipe -- New York Times Book Review news recipe bundled with calibre 0.8.18

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class NewYorkTimesBookReview(BasicNewsRecipe):
    title          = u'New York Times Book Review'
    language       = 'en'
    __author__     = 'Krittika Goyal'
    oldest_article = 8 # days
    max_articles_per_feed = 1000
    recursions = 2
    #encoding = 'latin1'

    remove_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after  = dict(name='div', attrs={'id':'authorId'})
    remove_tags = [
        dict(name='iframe'),
        dict(name=['div', 'a'], attrs={'class':['enlargeThis', 'jumpLink']}),
        dict(name='div', attrs={'id':['sidebarArticles', 'toolsRight']}),
        #dict(name='ul', attrs={'class':'article-tools'}),
        #dict(name='ul', attrs={'class':'articleTools'}),
    ]
    match_regexps = [
        r'http://www.nytimes.com/.+pagewanted=[2-9]+',
    ]

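    # Note (added commentary, not in the original recipe): in BasicNewsRecipe,
    # ``recursions`` sets how many levels of links calibre follows from each
    # article page, and ``match_regexps`` restricts following to URLs that
    # match. Together with recursions = 2 above, the pattern pulls in the
    # pagewanted=2..9 continuation pages of multi-page reviews.
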
    feeds          = [
        ('New York Times Sunday Book Review',
         'http://feeds.nytimes.com/nyt/rss/SundayBookReview'),
    ]

    def preprocess_html(self, soup):
        # Keep only the main article div: graft it into a bare HTML skeleton
        # so everything outside it (navigation, ads, chrome) is discarded.
        story = soup.find(name='div', attrs={'id':'article'})
        if story is None: # page layout changed or article div missing
            return soup
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup

    def postprocess_html(self, soup, first):
        # Strip pagination links; on continuation pages also drop the repeated
        # headline and timestamp so each article reads as a single piece.
        for div in soup.findAll(id='pageLinks'):
            div.extract()
        if not first:
            h1 = soup.find('h1')
            if h1 is not None:
                h1.extract()
            t = soup.find(attrs={'class':'timestamp'})
            if t is not None:
                t.extract()
        return soup
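
# Usage sketch (added commentary, not part of the original recipe): a recipe
# file like this can be test-built with calibre's bundled ebook-convert tool.
# The --test flag fetches only a couple of articles per feed, which is
# convenient when tweaking remove_tags or the preprocess/postprocess hooks
# above:
#
#   ebook-convert nytimesbook.recipe output.epub --test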