import re

from calibre.web.feeds.news import BasicNewsRecipe


class CNetJapanRelease(BasicNewsRecipe):
    title = u'CNET Japan release'
    oldest_article = 3
    max_articles_per_feed = 30
    __author__ = 'Hiroshi Miura'

    feeds = [(u'CNet Release', u'http://feed.japan.cnet.com/rss/release/index.rdf')]
    language = 'ja'
    encoding = 'Shift_JIS'
    remove_javascript = True

    # Cut everything after the end of the article body, drop the ad header,
    # and remove the related-tag block.  \u25B2 is the '▲' marker CNET Japan
    # uses in its HTML comments; \u95A2\u9023\u30BF\u30B0 is 'related tags'.
    # Python 3's re module expands \uXXXX escapes inside raw pattern strings.
    preprocess_regexps = [
        (re.compile(r'<!--\u25B2contents_left END\u25B2-->.*</body>',
                    re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
        (re.compile(r'<!--AD_ELU_HEADER-->.*</body>',
                    re.DOTALL | re.IGNORECASE),
         lambda match: '</body>'),
        (re.compile(r'<!-- \u25B2\u95A2\u9023\u30BF\u30B0\u25B2 -->.*<!-- \u25B2ZDNet\u25B2 -->'),
         lambda match: '<!-- removed -->'),
    ]

    remove_tags_before = dict(id='contents_l')
    remove_tags = [
        {'class': 'social_bkm_share'},
        {'class': 'social_bkm_print'},
        {'class': 'block20 clearfix'},
        dict(name='div', attrs={'id': 'bookreview'}),
        {'class': 'tag_left_ttl'},
    ]
    remove_tags_after = {'class': 'block20'}

    def parse_feeds(self):
        # Drop feed entries that point at the pheedo.jp ad/tracking redirector.
        feeds = BasicNewsRecipe.parse_feeds(self)
        for curfeed in feeds:
            curfeed.articles[:] = [
                article for article in curfeed.articles
                if not re.search(r'pheedo\.jp', article.url)
            ]
        return feeds
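# To preview the output of this recipe locally, one option (assuming a
# standard calibre install) is calibre's recipe test mode, e.g.:
#
#   ebook-convert cnetjapan_release.recipe .epub --test -vv
#
# The file name above is only an example; any file ending in .recipe works.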