home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / cnetjapan.recipe < prev    next >
Text File  |  2011-09-09  |  2KB  |  52 lines

  1. import re
  2. from calibre.web.feeds.news import BasicNewsRecipe
  3.  
  4. class CNetJapan(BasicNewsRecipe):
  5.     title          = u'CNET Japan'
  6.     oldest_article = 3
  7.     max_articles_per_feed = 30
  8.     __author__  = 'Hiroshi Miura'
  9.  
  10.     feeds          = [(u'CNet News', u'http://feed.japan.cnet.com/rss/index.rdf'),
  11.                       (u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
  12.                         ]
  13.     language       = 'ja'
  14.     encoding       = 'utf-8'
  15.     remove_javascript = True
  16.  
  17.     preprocess_regexps = [
  18.        (re.compile(ur'<!--\u25B2contents_left END\u25B2-->.*</body>', re.DOTALL|re.IGNORECASE|re.UNICODE),
  19.         lambda match: '</body>'),
  20.        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>', re.DOTALL|re.IGNORECASE),
  21.         lambda match: '</body>'),
  22.        (re.compile(ur'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->', re.UNICODE),
  23.         lambda match: '<!-- removed -->'),
  24.         ]
  25.  
  26.     remove_tags_before = dict(id="contents_l")
  27.     remove_tags = [
  28.                    {'class':"social_bkm_share"},
  29.                    {'class':"social_bkm_print"},
  30.                    {'class':"block20 clearfix"},
  31.                    dict(name="div",attrs={'id':'bookreview'}),
  32.                    {'class':"tag_left_ttl"},
  33.                    {'class':"tag_right"}
  34.                     ]
  35.     remove_tags_after = {'class':"block20"}
  36.  
  37.     def parse_feeds(self):
  38.  
  39.         feeds = BasicNewsRecipe.parse_feeds(self)
  40.  
  41.         for curfeed in feeds:
  42.             delList = []
  43.             for a,curarticle in enumerate(curfeed.articles):
  44.                 if re.search(r'pheedo.jp', curarticle.url):
  45.                     delList.append(curarticle)
  46.             if len(delList)>0:
  47.                 for d in delList:
  48.                     index = curfeed.articles.index(d)
  49.                     curfeed.articles[index:index+1] = []
  50.  
  51.         return feeds
  52.