#!/usr/bin/env python
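'''
Calibre news recipe for pclab.pl: fetches the articles listed in the PC Lab
RSS feed and stitches multi-page articles into a single document.
'''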
from calibre.web.feeds.recipes import BasicNewsRecipe


class PCLab(BasicNewsRecipe):
    cover_url             = 'http://pclab.pl/img/logo.png'
    title                 = u"PC Lab"
    __author__            = 'ravcio - rlelusz[at]gmail.com'
    description           = u"Articles from the PC Lab website"
    language              = 'pl'
    oldest_article        = 30.0
    max_articles_per_feed = 100
    recursions            = 0
    encoding              = 'iso-8859-2'
    no_stylesheets        = True
    remove_javascript     = True
    use_embedded_content  = False

    keep_only_tags = [
        dict(name='div', attrs={'class':['substance']})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['chapters']}),
        dict(name='div', attrs={'id':['script_bxad_slot_display_list_bxad_slot']})
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class':['navigation']})
    ]

    # links to the RSS feeds; each entry is a (title, URL) tuple
    feeds = [('PCLab', u'http://pclab.pl/xml/artykuly.xml')]

    # Load the content of the second and subsequent pages of a multi-page
    # article:
    #   soup      - parsed page that contains the 'next' pager
    #   appendtag - tag to which the next page's content is appended
    def append_page(self, soup, appendtag):
        # find the pager that links to the next page
        pager = soup.find('div', attrs={'class':'next'})
        if pager:
            # the 'a' element holds the link to the next page (stop if absent)
            a = pager.find('a')
            if a:
                nexturl = a['href']
                soup2 = self.index_to_soup('http://pclab.pl/' + nexturl)

                # pull the article body out of the next page
                pagetext_substance = soup2.find('div', attrs={'class':'substance'})
                pagetext = pagetext_substance.find('div', attrs={'class':'data'})
                pagetext.extract()

                # append it at the end of the first page's body
                pos = len(appendtag.contents)
                appendtag.insert(pos, pagetext)

                # recurse until no 'next' link is left
                self.append_page(soup2, appendtag)

    def preprocess_html(self, soup):
        # soup.body holds only the article text; the title and the pager sit
        # elsewhere in soup, so append the remaining pages to the body
        self.append_page(soup, soup.body)

        # finally, remove the leftover index, ad and navigation blocks
        tags = soup.findAll('div', attrs={'class':['tags', 'index', 'script_bxad_slot_display_list_bxad_slot', 'index first', 'zumi', 'navigation']})
        for tag in tags:
            tag.extract()

        return soup
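
# A minimal sketch of how this recipe can be test-driven from the command
# line with calibre's ebook-convert tool; the --test switch restricts the
# download to a few articles per feed:
#
#   ebook-convert pc_lab.recipe output.epub --test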