home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
-
- from calibre.web.feeds.recipes import BasicNewsRecipe
-
class PCLab(BasicNewsRecipe):
    """Calibre news recipe that downloads articles from the PC Lab website.

    Articles are taken from the site's RSS feed. An article may be split
    over several pages; ``preprocess_html`` follows each page's 'next' link
    and merges the text of the following pages into the first page before
    removing leftover navigation and advertising containers.
    """

    cover_url = 'http://pclab.pl/img/logo.png'
    title = u"PC Lab"
    __author__ = 'ravcio - rlelusz[at]gmail.com'
    description = u"Articles from PC Lab website"
    language = 'pl'
    oldest_article = 30.0
    max_articles_per_feed = 100
    recursions = 0
    encoding = 'iso-8859-2'
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    keep_only_tags = [
        dict(name='div', attrs={'class': ['substance']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['chapters']}),
        dict(name='div', attrs={'id': ['script_bxad_slot_display_list_bxad_slot']}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': ['navigation']}),
    ]

    # links to RSS feeds
    feeds = [('PCLab', u'http://pclab.pl/xml/artykuly.xml')]

    def append_page(self, soup, appendtag):
        """Append the text of every follow-up page of an article.

        Starting from ``soup``, the link inside ``<div class="next">`` is
        followed repeatedly. From each fetched page the ``data`` div nested
        in the ``substance`` div is detached and inserted at the end of
        ``appendtag``.

        :param soup: parsed page that may contain a 'next' button.
        :param appendtag: tag to which the content of the following pages
            is added.
        """
        # Links already followed; stops a pager that points back to an
        # earlier page from being fetched forever.
        visited = set()
        page = soup

        while True:
            # find the 'Next' button
            pager = page.find('div', attrs={'class': 'next'})
            if pager is None:
                return

            # the 'a' element holds the link to the next page
            link = pager.find('a')
            if link is None:
                return

            href = link.get('href')
            if not href or href in visited:
                return
            visited.add(href)

            page = self.index_to_soup('http://pclab.pl/' + href)

            substance = page.find('div', attrs={'class': 'substance'})
            pagetext = None
            if substance is not None:
                pagetext = substance.find('div', attrs={'class': 'data'})
            if pagetext is None:
                # Unexpected page layout: keep what was collected so far
                # instead of failing the whole article.
                self.log.warn('PC Lab: no article text found on page ' + href)
                return

            pagetext.extract()
            appendtag.insert(len(appendtag.contents), pagetext)

    def preprocess_html(self, soup):
        """Merge multi-page articles and strip leftover containers.

        :param soup: parsed first page of the article.
        :return: the same ``soup`` object, modified in place.
        """
        # soup.body contains no title and no navigator, they are in soup
        self.append_page(soup, soup.body)

        # finally remove some tags
        unwanted = soup.findAll('div', attrs={'class': [
            'tags', 'index', 'script_bxad_slot_display_list_bxad_slot',
            'index first', 'zumi', 'navigation']})
        for tag in unwanted:
            tag.extract()

        return soup
-