home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / cracked_com.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  64 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2.  
  3. class Cracked(BasicNewsRecipe):
  4.     title                 = u'Cracked.com'
  5.     __author__            = 'UnWeave'
  6.     language              = 'en'
  7.     description           = "America's Only HumorSite since 1958"
  8.     publisher             = 'Cracked'
  9.     category              = 'comedy, lists'
  10.     oldest_article        = 3 #days
  11.     max_articles_per_feed = 100
  12.     no_stylesheets        = True
  13.     encoding              = 'ascii'
  14.     remove_javascript     = True
  15.     use_embedded_content  = False
  16.  
  17.     feeds = [ (u'Articles', u'http://feeds.feedburner.com/CrackedRSS/') ]
  18.  
  19.     conversion_options = {
  20.                           'comment'   : description
  21.                         , 'tags'      : category
  22.                         , 'publisher' : publisher
  23.                         , 'language'  : language
  24.                         }
  25.  
  26.     remove_tags_before = dict(id='PrimaryContent')
  27.  
  28.     remove_tags_after = dict(name='div', attrs={'class':'shareBar'})
  29.  
  30.     remove_tags = [ dict(name='div', attrs={'class':['social',
  31.                                                      'FacebookLike',
  32.                                                      'shareBar'
  33.                                                      ]}),
  34.  
  35.                     dict(name='div', attrs={'id':['inline-share-buttons',
  36.                                                   ]}),
  37.  
  38.                     dict(name='span', attrs={'class':['views',
  39.                                                       'KonaFilter'
  40.                                                       ]}),
  41.                     #dict(name='img'),
  42.                     ]
  43.  
  44.     def appendPage(self, soup, appendTag, position):
  45.         # Check if article has multiple pages
  46.         pageNav = soup.find('nav', attrs={'class':'PaginationContent'})
  47.         if pageNav:
  48.             # Check not at last page
  49.             nextPage = pageNav.find('a', attrs={'class':'next'})
  50.             if nextPage:
  51.                 nextPageURL = nextPage['href']
  52.                 nextPageSoup = self.index_to_soup(nextPageURL)
  53.                 # 8th <section> tag contains article content
  54.                 nextPageContent = nextPageSoup.findAll('section')[7]
  55.                 newPosition = len(nextPageContent.contents)
  56.                 self.appendPage(nextPageSoup,nextPageContent,newPosition)
  57.                 nextPageContent.extract()
  58.                 pageNav.extract()
  59.                 appendTag.insert(position,nextPageContent)
  60.  
  61.     def preprocess_html(self, soup):
  62.         self.appendPage(soup, soup.body, 3)
  63.         return soup
  64.