
#!/usr/bin/env python

__license__ = 'GPL v3'

'''
www.canada.com
'''

from calibre.web.feeds.recipes import BasicNewsRecipe


class CanWestPaper(BasicNewsRecipe):

    # un-comment the following three lines for the Victoria Times Colonist
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'

    # un-comment the following three lines for the Vancouver Province
    #title = u'Vancouver Province'
    #url_prefix = 'http://www.theprovince.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Vancouver Sun
    #title = u'Vancouver Sun'
    #url_prefix = 'http://www.vancouversun.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Edmonton Journal
    #title = u'Edmonton Journal'
    #url_prefix = 'http://www.edmontonjournal.com'
    #description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''

    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),
                      dict(name='div', attrs={'id':'storycontent'})]

    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),
                   dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),
                   dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),
                   dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),
                   dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),
                   dict(name='li', attrs={'class':'share'}),
                   dict(name='ul', attrs={'class':'bullet'})]
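
    # keep_only_tags restricts extraction to the story header and story body
    # divs; remove_tags then drops the comment, sharing, print/email and
    # navigation widgets that appear inside those divs on article pages.
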
    def preprocess_html(self, soup):
        # delete empty id attributes -- they screw up the TOC for unknown reasons
        divtags = soup.findAll('div', attrs={'id': ''})
        if divtags:
            for div in divtags:
                del(div['id'])
        return soup
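
    # For illustration only (hypothetical markup, not copied from the site):
    # a fragment such as <div id=""><p>story text</p></div> would have its
    # empty id attribute removed by preprocess_html before calibre builds
    # the table of contents.
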

    def parse_index(self):
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02" or class="featurecontent"
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            #self.log(" div class = %s" % divtag['class'])
            if divtag['class'].startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            #self.log("Section %s" % key)
            #self.log("url %s" % url)
            title = self.tag_to_string(atag, False)
            #self.log("title %s" % title)
            pubdate = ''
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
                #self.log("description %s" % description)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)
                #self.log("author %s" % author)
            if key not in articles:
                articles[key] = []
            articles[key].append(dict(title=title, url=url, date=pubdate,
                                      description=description, author=author, content=''))

        # parse_index must return a list of (section title, list of article dicts)
        # pairs, in the order the sections should appear in the book
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
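
# A quick way to try the recipe locally (assuming the calibre command line
# tools are installed and this file is saved as vic_times.recipe):
#
#   ebook-convert vic_times.recipe vic_times.epub --test
#
# The --test flag tells calibre to fetch only a few articles, which keeps the
# run short while you check the generated output.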