home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Chip 2011 November
/
CHIP_2011_11.iso
/
Programy
/
Narzedzia
/
Calibre
/
calibre-0.8.18.msi
/
file_280
/
singtao_daily.recipe
< prev
next >
Wrap
Text File
|
2011-09-09
|
3KB
|
80 lines
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Calibre recipe that scrapes the Toronto section pages of Singtao Daily.

    ``parse_index`` walks a fixed list of section pages on news.singtao.ca
    and ``parse_section`` collects the article links found in each page's
    news list. ``preprocess_html`` then strips inline ``style`` and ``width``
    attributes from the downloaded article markup.
    """

    title = u'Singtao Daily - Canada'
    # Feed-style limits; parse_index() below does not consult them itself.
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'rty'
    description = 'Toronto Canada Chinese Newspaper'
    publisher = 'news.singtao.ca'
    category = 'Chinese, News, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.singtao.ca/i/site_2009/logo.jpg'
    # NOTE: the text inside this literal is kept flush-left exactly as found.
    # The trailing backslashes join the CSS rules into a single line.
    extra_css = '''
@font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\
body {text-align: justify; margin-right: 8pt; font-family: 'DroidFont', serif;}\
h1 {font-family: 'DroidFont', serif;}\
.articledescription {font-family: 'DroidFont', serif;}
'''
    # Containers retained from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['title', 'storybody']}),
        dict(name='div', attrs={'class': 'content'}),
    ]

    def parse_index(self):
        """Build the list of feeds, one per section page.

        :return: List of ``(section_title, articles)`` tuples. Sections for
            which ``parse_section`` yields no articles are left out.
        """
        # Titles with Chinese text use unicode escapes so the literals do not
        # depend on a byte-string ``.decode`` call.
        sections = [
            ('Editorial',
             'http://news.singtao.ca/toronto/editorial.html'),
            (u'Toronto \u57ce\u5e02/\u793e\u5340',
             'http://news.singtao.ca/toronto/city.html'),
            (u'Canada \u52a0\u570b',
             'http://news.singtao.ca/toronto/canada.html'),
            ('Entertainment',
             'http://news.singtao.ca/toronto/entertainment.html'),
            ('World',
             'http://news.singtao.ca/toronto/world.html'),
            (u'Finance \u570b\u969b\u8ca1\u7d93',
             'http://news.singtao.ca/toronto/finance.html'),
            ('Sports',
             'http://news.singtao.ca/toronto/sports.html'),
        ]
        feeds = []
        for section_title, section_url in sections:
            articles = self.parse_section(section_url)
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def parse_section(self, url):
        """Collect the article links listed on one section page.

        :param url: URL of the section index page to fetch.
        :return: List of dicts with ``title``, ``url`` and ``description``
            keys (``description`` is always the empty string). The list is
            empty when the page has no recognised news-list container.
        """
        soup = self.index_to_soup(url)
        container = soup.find(attrs={
            'class': ['newslist paddingL10T10', 'newslist3 paddingL10T10'],
        })
        if container is None:
            # A missing container must not abort the whole download;
            # parse_index() already skips sections without articles.
            self.log('No article list found on', url)
            return []

        articles = []
        for item in container.findAll('li'):
            link = item.find('a', href=True)
            if link is None:
                continue
            title = self.tag_to_string(link)
            href = link.get('href', False)
            if not href or not title:
                continue
            # Site-relative links are made absolute against the site root.
            if href.startswith('/'):
                href = 'http://news.singtao.ca' + href
            articles.append({'title': title, 'url': href, 'description': ''})
        return articles

    def preprocess_html(self, soup):
        """Remove inline ``style`` and ``width`` attributes from every tag.

        :param soup: Parsed article page; modified in place.
        :return: The same ``soup`` object.
        """
        # All ``style`` attributes are removed first, then all ``width`` ones.
        for attr_name in ('style', 'width'):
            for tag in soup.findAll(**{attr_name: True}):
                del tag[attr_name]
        return soup