Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / national_post.recipe < prev next >

Wrap

Text File | 2011-09-09 | 3KB | 81 lines

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class NYTimes(BasicNewsRecipe):
    """Recipe for the National Post "today's paper" edition.

    The table of contents is scraped from the today's-paper index page:
    every ``h4`` inside the ``npContentMain`` element starts a section and
    every following ``h5`` that links to a story becomes an article of
    that section.
    """

    title = 'National Post'
    __author__ = 'Krittika Goyal'
    description = 'Canadian national newspaper'
    timefmt = ' [%d %b, %Y]'
    language = 'en_CA'
    needs_subscription = False

    no_stylesheets = True
    remove_tags_after = dict(
        name='div',
        attrs={'class': 'npStoryTools npWidth1-6 npRight npTxtStrong'},
    )
    remove_tags = [
        dict(name='iframe'),
        dict(name='div', attrs={'class': [
            'story-tools',
            'npStoryTools npWidth1-6 npRight npTxtStrong',
        ]}),
        dict(name='ul', attrs={
            'class': 'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim',
        }),
    ]

    # Base address of the today's-paper area; the index page lives under it
    # and relative story links are resolved against it.
    TODAYS_PAPER_URL = 'http://www.nationalpost.com/todays-paper/'

    def nejm_get_index(self):
        """Fetch the today's-paper index page and return it as a soup."""
        return self.index_to_soup(self.TODAYS_PAPER_URL + 'index.html')

    def parse_index(self):
        """Build the list of feeds from the today's-paper index page.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url``, ``description``
        and ``date`` (the last two are always empty strings). Sections
        without any article are omitted. An empty list is returned when the
        index page has no ``npContentMain`` element.
        """
        soup = self.nejm_get_index()
        feeds = []

        container = soup.find(id='npContentMain')
        if container is None:
            # Page layout changed or the download was incomplete: report it
            # instead of failing with an AttributeError on None.
            self.log('\tCould not find npContentMain in the index page')
            return feeds

        current_section = None
        current_articles = []
        for tag in container.findAll(True):
            if tag.name == 'h4':
                # Section found: flush the previous one first.
                if current_articles and current_section:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(tag)
                current_articles = []
                self.log('\tFound section:', current_section)
            elif tag.name == 'h5' and current_section is not None:
                # Article found; headings before the first section are
                # ignored.
                title = self.tag_to_string(tag)
                link = tag.find('a', href=lambda href: href and 'story' in href)
                if link is None:
                    continue
                url = link.get('href', False)
                if not url or not title:
                    continue
                # Story links on the index are relative to the today's-paper
                # directory; leave links that are already absolute alone so
                # they are not corrupted by the prefix.
                if not url.startswith(('http://', 'https://')):
                    url = self.TODAYS_PAPER_URL + url
                self.log('\t\tFound article:', title)
                self.log('\t\t\t', url)
                current_articles.append({
                    'title': title,
                    'url': url,
                    'description': '',
                    'date': '',
                })

        # Flush the last section, which has no following h4 to trigger it.
        if current_articles and current_section:
            feeds.append((current_section, current_articles))

        return feeds

    def preprocess_html(self, soup):
        """Reduce an article page to its ``npContentMain`` div.

        The div is moved into a fresh minimal document, which is returned.
        If the page has no such div, the original soup is returned
        unchanged rather than failing on the missing element.
        """
        story = soup.find(name='div', attrs={'id': 'npContentMain'})
        if story is None:
            return soup
        stripped = BeautifulSoup(
            '<html><head><title>t</title></head><body></body></html>')
        body = stripped.find(name='body')
        body.insert(0, story)
        return stripped