Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / montreal_gazette.recipe < prev next >

Wrap

Text File | 2011-09-09 | 4KB | 97 lines

#!/usr/bin/env python
'''
www.canada.com

calibre news recipe for the Montreal Gazette "today's paper" index.
'''

__license__ = 'GPL v3'

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):
    '''Recipe that builds sections and articles from the today's-paper index.

    The index page is read from ``url_prefix + '/news/todays-paper/index.html'``.
    Section headings and article teasers are taken from ``div`` elements with
    class ``section_title02`` and ``featurecontent`` respectively.
    '''

    # un-comment the following three lines for the Montreal Gazette
    title = u'Montreal Gazette'
    url_prefix = 'http://www.montrealgazette.com'
    description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp { font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large; font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    # Only the story header and story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    # Page furniture stripped from whatever keep_only_tags retained.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        '''Strip empty ``id`` attributes from every ``div`` and return *soup*.

        Empty id attributes screw up the generated TOC for unknown reasons,
        so they are deleted before further processing.
        '''
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def _is_section_title(self, divtag):
        '''Return True when *divtag* carries a ``section_title*`` class.'''
        classes = divtag.get('class') or ''
        # Depending on the BeautifulSoup version the class attribute is
        # either a single string or a list of class names; handle both.
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        return any(name.startswith('section_title') for name in classes)

    def parse_index(self):
        '''Build the list of sections for today's paper.

        Returns a list of ``(section_name, articles)`` tuples in the order
        the sections first appear on the index page. Each article is a dict
        with the keys ``title``, ``url``, ``date``, ``description``,
        ``author`` and ``content``. Sections without articles are omitted.
        Articles found before any section heading are filed under 'News'.
        '''
        soup = self.index_to_soup(
            self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        section_order = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll(
                'div', attrs={'class': ["section_title02", "featurecontent"]}):
            if self._is_section_title(divtag):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A heading may repeat (or be called 'News'); listing it
                # twice would emit the same section twice in the result.
                if key not in section_order:
                    section_order.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)
            pubdate = ''

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            # dict.has_key() no longer exists on Python 3; setdefault works
            # on both Python 2 and 3.
            articles.setdefault(key, []).append(dict(
                title=title, url=url, date=pubdate,
                description=description, author=author, content=''))

        return [(name, articles[name])
                for name in section_order if name in articles]