home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
- __license__ = 'GPL v3'
- '''
- www.canada.com
- '''
- from calibre.web.feeds.recipes import BasicNewsRecipe
class CanWestPaper(BasicNewsRecipe):
    """calibre recipe for a CanWest newspaper's "today's paper" index.

    The index page at ``url_prefix + '/news/todays-paper/index.html'`` is
    scanned for section-title and article blocks (see ``parse_index``).
    Exactly one title/url_prefix/description triple below should be active;
    the others are kept commented out so the recipe can be retargeted.
    """

    # un-comment the following three lines for the Edmonton Journal
    title = u'Edmonton Journal'
    url_prefix = 'http://www.edmontonjournal.com'
    description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    # Recipe options consumed by BasicNewsRecipe (see the calibre recipe API).
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'

    # Styling for the story elements that survive keep_only_tags/remove_tags.
    extra_css = '''
        .timestamp { font-size:xx-small; display: block; }
        #storyheader { font-size: medium; }
        #storyheader h1 { font-size: x-large; }
        #storyheader h2 { font-size: large; font-style: italic; }
        .byline { font-size:xx-small; }
        #photocaption { font-size: small; font-style: italic }
        #photocredit { font-size: xx-small; }'''

    # Only the story header and story body divs are kept from article pages.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'storyheader'}),
        dict(name='div', attrs={'id': 'storycontent'}),
    ]

    # Page furniture (comments, navigation, share/print tools, ...) to drop.
    remove_tags = [
        {'class': 'comments'},
        dict(name='div', attrs={'class': 'navbar'}),
        dict(name='div', attrs={'class': 'morelinks'}),
        dict(name='div', attrs={'class': 'viewmore'}),
        dict(name='li', attrs={'class': 'email'}),
        dict(name='div', attrs={'class': 'story_tool_hr'}),
        dict(name='div', attrs={'class': 'clear'}),
        dict(name='div', attrs={'class': 'story_tool'}),
        dict(name='div', attrs={'class': 'copyright'}),
        dict(name='div', attrs={'class': 'rule_grey_solid'}),
        dict(name='li', attrs={'class': 'print'}),
        dict(name='li', attrs={'class': 'share'}),
        dict(name='ul', attrs={'class': 'bullet'}),
    ]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from every ``div`` in *soup*.

        Empty ids corrupt the generated table of contents (reason unknown).

        :param soup: parsed article page
        :return: the same soup object, modified in place
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    @staticmethod
    def _class_names(tag):
        """Return the CSS class names of *tag* as a list of strings.

        Depending on the Beautiful Soup version the ``class`` attribute is
        either a single string or a list of names; both are normalised here.
        """
        value = tag.get('class') or ''
        if isinstance(value, (list, tuple)):
            return list(value)
        return value.split()

    def parse_index(self):
        """Build the section/article index from the "today's paper" page.

        Walks the ``section_title02`` and ``featurecontent`` divs in document
        order. A section-title div switches the current section; every
        following article div is filed under it. Articles seen before any
        section title go into the default ``'News'`` section.

        :return: list of ``(section_title, articles)`` tuples in page order,
                 where each article is a dict with the keys ``title``,
                 ``url``, ``date``, ``description``, ``author`` and
                 ``content``. Sections without articles are omitted.
        """
        base_url = self.url_prefix + '/news/todays-paper/'
        soup = self.index_to_soup(base_url + 'index.html')

        articles = {}
        key = 'News'
        sections = ['News']  # section titles in first-seen order

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll(
                'div', attrs={'class': ["section_title02", "featurecontent"]}):
            is_section = any(name.startswith('section_title')
                             for name in self._class_names(divtag))
            if is_section:
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A repeated title (or a section literally called "News")
                # must not be listed twice, or its articles are duplicated.
                if key not in sections:
                    sections.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue

            href = atag['href']
            if href.startswith(('http://', 'https://')):
                # Already absolute: prefixing it would yield an invalid URL.
                url = href
            else:
                url = base_url + href
            title = self.tag_to_string(atag, False)

            pubdate = ''  # the index page carries no per-article date

            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)

            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(key, []).append(dict(
                title=title, url=url, date=pubdate,
                description=description, author=author, content=''))

        return [(section, articles[section])
                for section in sections if section in articles]