
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe


class TheForce(BasicNewsRecipe):
    title          = u'The Force'
    language       = 'en'
    __author__     = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'
    # Conversion/cleanup options
    no_stylesheets = True
    #remove_javascript = True
    conversion_options = {'linearize_tables': True}

    # Keep only the story cell and trim everything after the article body
    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
    keep_only_tags = dict(name='td', attrs={'background': '/images/span/tile_story_bgtile.gif'})
    #keep_only_tags = dict(name='div', attrs={'class': 'KonaBody'})
    remove_tags = [
        dict(name='iframe'),
        #dict(name='div', attrs={'class': ['pt-box-title', 'pt-box-content', 'blog-entry-footer', 'item-list', 'article-sub-meta']}),
        #dict(name='div', attrs={'id': ['block-td_search_160', 'block-cam_search_160']}),
        #dict(name='table', attrs={'cellspacing': '0'}),
        #dict(name='ul', attrs={'class': 'articleTools'}),
    ]
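    # News is pulled from TheForce.net's single RDF feed.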
    feeds          = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]
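    # Post-download cleanup: strip the Facebook promo footer, drop the
    # article-options table and everything after it, and swap linked
    # thumbnails for the image URLs carried in their links' query strings.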
    def preprocess_html(self, soup):
        # Drop the "join the Star Wars Insider Facebook" promo and everything after it.
        for tag in soup.findAll(name='i'):
            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(tag):
                for x in tag.findAllNext():
                    x.extract()
                tag.extract()
                break
        # Remove the article-options table and everything that follows it.
        tag = soup.find(attrs={'class': 'articleoption'})
        if tag is not None:
            tag = tag.findParent('table')
            if tag is not None:
                for x in tag.findAllNext():
                    x.extract()
                tag.extract()
        # Replace linked thumbnails with the image URL carried in the link's query string.
        for img in soup.findAll('img', src=True):
            a = img.findParent('a', href=True)
            if a is None:
                continue
            url = a.get('href').split('?')[-1].partition('=')[-1]
            if url:
                img.extract()
                a.name = 'img'
                a['src'] = url
                del a['href']
        return soup
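# To test this recipe locally, calibre's ebook-convert can be pointed at the
# file, e.g.:  ebook-convert starwars.recipe .epub --test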