home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / national_post.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  81 lines

  1. from calibre.web.feeds.recipes import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  3.  
  4. class NYTimes(BasicNewsRecipe):
  5.  
  6.     title       = 'National Post'
  7.     __author__  = 'Krittika Goyal'
  8.     description = 'Canadian national newspaper'
  9.     timefmt = ' [%d %b, %Y]'
  10.     language = 'en_CA'
  11.     needs_subscription = False
  12.  
  13.     no_stylesheets = True
  14.     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
  15.     remove_tags_after  = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
  16.     remove_tags = [
  17.        dict(name='iframe'),
  18.        dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
  19.        #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
  20.        #dict(name='form', attrs={'onsubmit':''}),
  21.        dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
  22.     ]
  23.  
  24.    # def preprocess_html(self, soup):
  25.         # table = soup.find('table')
  26.         # if table is not None:
  27.             # table.extract()
  28.         # return soup
  29.  
  30.  
  31.  
  32.     #TO GET ARTICLE TOC
  33.     def nejm_get_index(self):
  34.             return self.index_to_soup('http://www.nationalpost.com/todays-paper/index.html')
  35.  
  36.     # To parse artice toc
  37.     def parse_index(self):
  38.             soup = self.nejm_get_index()
  39.  
  40.             div = soup.find(id='npContentMain')
  41.  
  42.             current_section = None
  43.             current_articles = []
  44.             feeds = []
  45.             for x in div.findAll(True):
  46.                 if x.name == 'h4':
  47.                     # Section found
  48.                     if current_articles and current_section:
  49.                         feeds.append((current_section, current_articles))
  50.                     current_section = self.tag_to_string(x)
  51.                     current_articles = []
  52.                     self.log('\tFound section:', current_section)
  53.                 if current_section is not None and x.name == 'h5':
  54.                     # Article found
  55.                     title = self.tag_to_string(x)
  56.                     a = x.find('a', href=lambda x: x and 'story' in x)
  57.                     if a is None:
  58.                         continue
  59.                     url = a.get('href', False)
  60.                     if not url or not title:
  61.                         continue
  62.                     #if url.startswith('story'):
  63.                     url = 'http://www.nationalpost.com/todays-paper/'+url
  64.                     self.log('\t\tFound article:', title)
  65.                     self.log('\t\t\t', url)
  66.                     current_articles.append({'title': title, 'url':url,
  67.                         'description':'', 'date':''})
  68.  
  69.             if current_articles and current_section:
  70.                 feeds.append((current_section, current_articles))
  71.  
  72.             return feeds
  73.     def preprocess_html(self, soup):
  74.         story = soup.find(name='div', attrs={'id':'npContentMain'})
  75.         ##td = heading.findParent(name='td')
  76.         ##td.extract()
  77.         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  78.         body = soup.find(name='body')
  79.         body.insert(0, story)
  80.         return soup
  81.