__license__   = 'GPL v3'
__copyright__ = '2011, Attis <attis@attis.one.pl>'
__version__ = 'v. 0.1'

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

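# Calibre news recipe for kopalniawiedzy.pl, a Polish popular-science news site.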
class KopalniaWiedzy(BasicNewsRecipe):
    title          = u'Kopalnia Wiedzy'
    publisher      = u'Kopalnia Wiedzy'
    description    = u'Ciekawostki ze świata nauki i techniki'
    encoding       = 'utf-8'
    __author__     = 'Attis'
    language       = 'pl'
    oldest_article = 7
    max_articles_per_feed = 100
    INDEX          = u'http://kopalniawiedzy.pl/'
    remove_javascript     = True
    no_stylesheets        = True

    remove_tags       = [{'name': 'p', 'attrs': {'class': 'keywords'}},
                         {'name': 'div', 'attrs': {'class': 'sexy-bookmarks sexy-bookmarks-bg-caring'}}]
    remove_tags_after = dict(attrs={'class': 'ad-square'})
    keep_only_tags    = [dict(name='div', attrs={'id': 'articleContent'})]
    extra_css         = '.topimage {margin-top: 30px}'

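    # Clean the raw HTML before parsing: unwrap lightbox image links into a plain
    # <img class="topimage"> (styled via extra_css above) and collapse the site's
    # doubled line breaks into a single <br/>.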
    preprocess_regexps = [
        (re.compile(u'<a .* rel="lightboxText" .*><img (.*)></a>'),
         lambda match: '<img class="topimage" ' + match.group(1) + '>'),
        (re.compile(u'<br  /><br  />'),
         lambda match: '<br/>')
    ]

    feeds = [
        (u'Biologia', u'http://kopalniawiedzy.pl/wiadomosci_biologia.rss'),
        (u'Medycyna', u'http://kopalniawiedzy.pl/wiadomosci_medycyna.rss'),
        (u'Psychologia', u'http://kopalniawiedzy.pl/wiadomosci_psychologia.rss'),
        (u'Technologie', u'http://kopalniawiedzy.pl/wiadomosci_technologie.rss'),
        (u'Ciekawostki', u'http://kopalniawiedzy.pl/wiadomosci_ciekawostki.rss'),
        (u'Artykuły', u'http://kopalniawiedzy.pl/artykuly.rss')
    ]

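    # When calibre follows links inside an article, accept only the pagination
    # link marked class="next".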
    def is_link_wanted(self, url, tag):
        # .get() avoids a KeyError on links that carry no class attribute.
        return tag.get('class') == 'next'

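    # Remove every sibling that follows `tag`, walking up the tree until <body>.
    # Used on continuation pages to drop everything after the pager block.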
    def remove_beyond(self, tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(after, next)
                after.extract()
                after = ns
            tag = tag.parent

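    # Multi-page articles: follow the "next" link, extract the article body from
    # each continuation page, trim everything after its pager, and splice the
    # result into the current page (recursing until there is no "next" link).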
    def append_page(self, soup, appendtag, position):
        pager = soup.find('a', attrs={'class': 'next'})
        if pager:
            nexturl = self.INDEX + pager['href']
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', attrs={'id': 'articleContent'})

            tag = texttag.find(attrs={'class': 'pages'})
            self.remove_beyond(tag, 'nextSibling')

            newpos = len(texttag.contents)
            self.append_page(soup2, texttag, newpos)

            appendtag.insert(position, texttag)

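    # Stitch continuation pages onto the article, then drop the leftover pager
    # blocks and the 'wykop' social-bookmarking paragraphs.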
    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)

        for item in soup.findAll('div', attrs={'class': 'pages'}):
            item.extract()

        for item in soup.findAll('p', attrs={'class': 'wykop'}):
            item.extract()

        return soup
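
# A quick way to try a recipe from the command line, per the calibre recipe
# development docs (the output spec here is just an example):
#   ebook-convert kopalniawiedzy.recipe .epub --test -vv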