import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5

class OfficeSpaceBlogHu(BasicNewsRecipe):
    __author__              = 'Zsolt Botykai'
    title                   = u'Irodai patkányok'
    description             = u"officespace.blog.hu"
    oldest_article          = 10000
    max_articles_per_feed   = 10000
    reverse_article_order   = True
    language                = 'hu'
    remove_javascript       = True
    remove_empty_feeds      = True
    no_stylesheets          = True
    use_embedded_content    = False
    feeds                   = [(u'Office Space', u'http://officespace.blog.hu/rss')]

    masthead_url = 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

    keep_only_tags = [
        dict(name='div', attrs={'id': ['mainWrapper']})
    ]

    # Notes on the preprocess_regexps below:
    #   1.: I like justified lines more
    #   2.: remove empty paragraphs
    #   3.: drop header and sidebar
    #   4.: drop comments counter
    #   5.: drop everything after article-tags
    # 6-8.: drop audit images

    preprocess_regexps = [
        (re.compile(r'<p align="left"'), lambda m: '<p'),
        (re.compile(r'<p>(&nbsp;|\s)*?</p>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<body[^>]+>.*?<div id="mainIn"', re.DOTALL|re.IGNORECASE), lambda m: '<body><div id="mainIn"'),
        (re.compile(r'<h3 class="comments">.*?</h3>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<div class="related">.*?</body>', re.DOTALL|re.IGNORECASE), lambda m: '<body>'),
        (re.compile(r'<img style="position: absolute;" src="[^"]+pixel\?uc.*?>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<noscript.+?noscript>', re.DOTALL|re.IGNORECASE), lambda m: ''),
        (re.compile(r'<img style="position: absolute;top:-10px.+?>', re.DOTALL|re.IGNORECASE), lambda m: ''),
    ]
    extra_css = '''
                    body { background-color: white; color: black }
                '''

    def get_cover_url(self):
        return 'http://m.blog.hu/of/officespace/ipfejlec7.jpg'

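    # Flatten the links in the tags header into plain comma-separated text,
    # then unwrap every remaining <a> so that only its link text survives.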
    def preprocess_html(self, soup):
        for tagz in soup.findAll('h3', attrs={'class': 'tags'}):
            for taglink in tagz.findAll('a'):
                if taglink.string is not None:
                    tstr = taglink.string + ','
                    taglink.replaceWith(tstr)

        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)

        return soup

    # As seen here: http://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
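    # Overridden to skip articles that were delivered in an earlier run: each
    # feed stores a file of per-article hashes under calibre's config dir, and
    # any article whose hash is already recorded is dropped from the feed.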
    def parse_feeds(self):
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

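            # Hash each article's content and summary (prefixed with its URL
            # when available) and drop any article already seen in a past run.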
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content: item_hash.update(article.content.encode('utf-8'))
                if article.summary: item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

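        # Deduplication may have emptied some feeds; if remove_empty_feeds is
        # set, drop those before returning.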
        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)

        return feeds