home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Chip 2011 November
/
CHIP_2011_11.iso
/
Programy
/
Narzedzia
/
Calibre
/
calibre-0.8.18.msi
/
file_280
/
starwars.recipe
< prev
next >
Wrap
Text File
|
2011-09-09
|
2KB
|
57 lines
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class TheForce(BasicNewsRecipe):
    """Calibre news recipe for the TheForce.net RDF news feed."""

    title = u'The Force'
    language = 'en'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 25
    encoding = 'cp1252'

    remove_stylesheets = True
    conversion_options = {'linearize_tables': True}
    remove_tags_after = dict(name='div', attrs={'class': 'KonaBody'})
    keep_only_tags = dict(
        name='td',
        attrs={'background': '/images/span/tile_story_bgtile.gif'},
    )
    remove_tags = [
        dict(name='iframe'),
    ]

    feeds = [
        ('The Force', 'http://www.theforce.net/outnews/tfnrdf.xml'),
    ]

    @staticmethod
    def _drop_tag_and_rest(tag):
        """Detach ``tag`` and everything ``findAllNext()`` reports after it."""
        for following in tag.findAllNext():
            following.extract()
        tag.extract()

    def preprocess_html(self, soup):
        """Strip trailing page furniture and rewrite linked images.

        Three passes are made over ``soup``, which is modified in place:

        1. The first ``<i>`` whose text contains the Star Wars Insider
           Facebook reminder is removed together with everything after it.
        2. The ``<table>`` enclosing the first element with class
           ``articleoption`` is removed together with everything after it.
        3. Every ``<img src=...>`` wrapped in an ``<a href=...>`` whose href
           carries a non-empty ``key=value`` query is replaced: the inner
           image is dropped and the anchor itself is turned into an
           ``<img>`` whose ``src`` is that value (presumably the URL of the
           full-size image -- confirm against the site's markup).

        Returns the same ``soup`` object.
        """
        for tag in soup.findAll(name='i'):
            if 'Remember to join the Star Wars Insider Facebook' in self.tag_to_string(tag):
                self._drop_tag_and_rest(tag)
                break

        marker = soup.find(attrs={'class': 'articleoption'})
        if marker is not None:
            table = marker.findParent('table')
            if table is not None:
                self._drop_tag_and_rest(table)

        for img in soup.findAll('img', src=True):
            link = img.findParent('a', href=True)
            if link is None:
                continue
            # Take whatever follows the first '=' in the part of the href
            # after the last '?'; empty when the href has no such value.
            url = link.get('href').split('?')[-1].partition('=')[-1]
            if not url:
                continue
            # The anchor is renamed in place rather than swapped for the
            # original image, so none of the old image's attributes survive.
            img.extract()
            link.name = 'img'
            link['src'] = url
            del link['href']
        return soup