
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class PajamasMedia(BasicNewsRecipe):
    title          = u'Pajamas Media'
    description    = u'Provides exclusive news and opinion for forty countries.'
    language       = 'en'
    __author__     = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    # Articles are paginated: follow one level of links, but only those
    # matching the second-page URL pattern below.
    recursions = 1
    match_regexps = [r'http://pajamasmedia.com/blog/.*/2/$']
    #encoding = 'latin1'

    no_stylesheets = True
    #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
    remove_tags_after  = dict(name='div', attrs={'class':'paged-nav'})
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class':['pages']}),
        #dict(name='div', attrs={'id':['bookmark']}),
        #dict(name='span', attrs={'class':['related_link', 'slideshowcontrols']}),
        #dict(name='ul', attrs={'class':'articleTools'}),
    ]

    feeds          = [
        ('Pajamas Media',
         'http://feeds.feedburner.com/PajamasMedia'),
    ]

    def preprocess_html(self, soup):
        # Keep only the main story container, discarding the rest of the
        # page by rebuilding a minimal document around it.
        story = soup.find(name='div', attrs={'id':'innerpage-content'})
        if story is None:
            return soup
        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
        body = soup.find(name='body')
        body.insert(0, story)
        return soup

    def postprocess_html(self, soup, first):
        # On pages fetched via recursion (the second page of an article),
        # drop the repeated header and author byline so the text reads
        # continuously.
        if not first:
            h = soup.find(attrs={'class':'innerpage-header'})
            if h: h.extract()
            auth = soup.find(attrs={'class':'author'})
            if auth: auth.extract()
        return soup
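
# A minimal way to try this recipe locally, using calibre's standard
# command-line tooling (not part of the recipe itself). The --test flag
# limits the fetch to a couple of feeds/articles, which is enough to
# verify the preprocess/postprocess hooks above:
#
#   ebook-convert pajama.recipe output.epub --test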