home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Chip 2011 November
/
CHIP_2011_11.iso
/
Programy
/
Narzedzia
/
Calibre
/
calibre-0.8.18.msi
/
file_280
/
office_space.recipe
< prev
next >
Wrap
Text File
|
2011-09-09
|
4KB
|
110 lines
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5
class OfficeSpaceBlogHu(BasicNewsRecipe):
    """Download articles from officespace.blog.hu as an e-book.

    Articles fetched in a previous run are remembered (one hash per article,
    stored under calibre's config directory) and skipped on later runs —
    see parse_feeds().
    """
    __author__ = 'Zsolt Botykai'
    title = u'Irodai patkényok'
    description = u"officespace.blog.hu"
    # Effectively unlimited: pull the entire feed history, oldest first.
    oldest_article = 10000
    max_articles_per_feed = 10000
    reverse_article_order = True
    language = 'hu'
    remove_javascript = True
    remove_empty_feeds = True
    no_stylesheets = True
    use_embedded_content = False
    feeds = [(u'Office Space', u'http://officespace.blog.hu/rss')]
    masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

    # Keep only the main content wrapper of each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['mainWrapper']})
    ]

    # 1.: I like justified lines more
    # 2.: remove empty paragraphs
    # 3.: drop header and sidebar
    # 4.: drop comments counter
    # 5.: drop everything after article-tags
    # 6-8.: drop audit images
    preprocess_regexps = [
        (re.compile(r'<p align="left"'), lambda m: '<p'),
        (re.compile(r'<p>( | )*?</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<body[^>]+>.*?<div id="mainIn"', re.DOTALL|re.IGNORECASE), lambda match: '<body><div id="mainIn"'),
        (re.compile(r'<h3 class="comments">.*?</h3>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<div class="related">.*?</body>', re.DOTALL|re.IGNORECASE), lambda match: '<body>'),
        (re.compile(r'<img style="position: absolute;" src="[^"]+pixel\?uc.*?>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<noscript.+?noscript>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<img style="position: absolute;top:-10px.+?>', re.DOTALL|re.IGNORECASE), lambda m: ''),
    ]

    extra_css = '''
                    body { background-color: white; color: black }
                '''

    def get_cover_url(self):
        """Use the blog's masthead image as the book cover."""
        return 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

    def preprocess_html(self, soup):
        """Flatten links: turn tag links into plain 'name,' text and strip
        the anchor from every other link, keeping only its text."""
        for tagz in soup.findAll('h3', attrs={'class': 'tags'}):
            for taglink in tagz.findAll('a'):
                if taglink.string is not None:
                    tstr = taglink.string + ','
                    taglink.replaceWith(tstr)
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

    # As seen here: http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
    def parse_feeds(self):
        """Parse feeds as usual, then drop every article whose content hash
        was recorded by a previous run, so each article is delivered once.

        Hashes are kept in one file per feed under
        <config_dir>/recipes/recipe_storage/<title>/.
        """
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir,
                                self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            # NOTE: urllib.quote is Python 2 only, matching calibre of this era.
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            cur_items = set()
            # Iterate over a copy: we remove from feed.articles while looping.
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    # Prefix with the URL so identical bodies at different
                    # addresses are still treated as distinct articles.
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            # Record only the current run's hashes (old entries age out).
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

        # After de-duplication some feeds may be empty; honour the option.
        if self.remove_empty_feeds:
            feeds = [f for f in feeds if len(f)]
        return feeds