import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5

class OfficeSpaceBlogHu(BasicNewsRecipe):
    __author__              = 'Zsolt Botykai'
    title                   = u'Irodai patkányok'
    description             = u"officespace.blog.hu"
    oldest_article          = 10000
    max_articles_per_feed   = 10000
    reverse_article_order   = True
    language                = 'hu'
    remove_javascript       = True
    remove_empty_feeds      = True
    no_stylesheets          = True
    use_embedded_content    = False
    feeds                   = [(u'Office Space', u'http://officespace.blog.hu/rss')]

    masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

    keep_only_tags = [
        dict(name='div', attrs={'id': ['mainWrapper']})
    ]

    # Notes on the preprocess_regexps below:
    #   1.: I like justified lines more
    #   2.: remove empty paragraphs
    #   3.: drop header and sidebar
    #   4.: drop comments counter
    #   5.: drop everything after article-tags
    # 6-8.: drop audit images

    preprocess_regexps = [
        (re.compile(r'<p align="left"'), lambda m: '<p'),
        (re.compile(r'<p>(&nbsp;|\s)*?</p>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<body[^>]+>.*?<div id="mainIn"', re.DOTALL|re.IGNORECASE), lambda m: '<body><div id="mainIn"'),
        (re.compile(r'<h3 class="comments">.*?</h3>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<div class="related">.*?</body>', re.DOTALL|re.IGNORECASE), lambda m: '<body>'),
        (re.compile(r'<img style="position: absolute;" src="[^"]+pixel\?uc.*?>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<noscript.+?noscript>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<img style="position: absolute;top:-10px.+?>', re.DOTALL|re.IGNORECASE), lambda m: ''),
    ]
    extra_css = '''
                    body { background-color: white; color: black }
                '''

    def get_cover_url(self):
        return 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

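    # Flatten the links in the tags header into plain comma-separated text,
    # then unwrap every remaining <a> so that only its link text survives.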
    def preprocess_html(self, soup):
        for tagz in soup.findAll('h3', attrs={'class': 'tags'}):
            for taglink in tagz.findAll('a'):
                if taglink.string is not None:
                    tstr = taglink.string + ','
                    taglink.replaceWith(tstr)

        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)

        return soup

    # As seen here: http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
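    # Overridden to skip articles that were delivered in an earlier run: each
    # feed stores a file of per-article hashes under calibre's config dir, and
    # any article whose hash is already recorded is dropped from the feed.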
    def parse_feeds(self):
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

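            # Hash each article's content and summary (prefixed with its URL
            # when available) and drop any article already seen in a past run.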
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content: item_hash.update(article.content.encode('utf-8'))
                if article.summary: item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

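        # Deduplication may have emptied some feeds; if remove_empty_feeds is
        # set, drop those before returning.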
        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)

        return feeds