home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / newsweek.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  95 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2.  
  3. class Newsweek(BasicNewsRecipe):
  4.  
  5.     title          = 'Newsweek'
  6.     __author__     = 'Kovid Goyal'
  7.     description    = 'Weekly news and current affairs in the US'
  8.     language       = 'en'
  9.     encoding       = 'utf-8'
  10.     no_stylesheets = True
  11.     recipe_disabled = ('Newsweek was taken over by The Daily Beast,'
  12.             ' newsweek.com no longer exists, so this recipe '
  13.             ' has been disabled.')
  14.  
  15.     BASE_URL = 'http://www.newsweek.com'
  16.  
  17.     topics = {
  18.         'Culture' : '/tag/culture.html',
  19.         'Business' : '/tag/business.html',
  20.         'Society' : '/tag/society.html',
  21.         'Science' : '/tag/science.html',
  22.         'Education' : '/tag/education.html',
  23.         'Politics' : '/tag/politics.html',
  24.         'Health' : '/tag/health.html',
  25.         'World' : '/tag/world.html',
  26.         'Nation' : '/tag/nation.html',
  27.         'Technology' : '/tag/technology.html',
  28.         'Game Changers' : '/tag/game-changers.html',
  29.     }
  30.  
  31.     keep_only_tags = dict(name='article', attrs={'class':'article-text'})
  32.     remove_tags = [dict(attrs={'data-dartad':True})]
  33.     remove_attributes = ['property']
  34.  
  35.     def postprocess_html(self, soup, first):
  36.         for tag in soup.findAll(name=['article', 'header']):
  37.             tag.name = 'div'
  38.         return soup
  39.  
  40.     def newsweek_sections(self):
  41.         for topic_name, topic_url in self.topics.iteritems():
  42.             yield (topic_name,
  43.                     self.BASE_URL+topic_url)
  44.  
  45.  
  46.     def newsweek_parse_section_page(self, soup):
  47.         for article in soup.findAll('article', about=True,
  48.                 attrs={'class':'stream-item'}):
  49.             title = article.find(attrs={'property': 'dc:title'})
  50.             if title is None: continue
  51.             title = self.tag_to_string(title)
  52.             url = self.BASE_URL + article['about']
  53.             desc = ''
  54.             author = article.find({'property':'dc:creator'})
  55.             if author:
  56.                 desc = u'by %s. '%self.tag_to_string(author)
  57.             p = article.find(attrs={'property':'dc:abstract'})
  58.             if p is not None:
  59.                 for a in p.find('a'): a.extract()
  60.                 desc += self.tag_to_string(p)
  61.             t = article.find('time', attrs={'property':'dc:created'})
  62.             date = ''
  63.             if t is not None:
  64.                 date = u' [%s]'%self.tag_to_string(t)
  65.             self.log('\tFound article:', title, 'at', url)
  66.             self.log('\t\t', desc)
  67.             yield {'title':title, 'url':url, 'description':desc, 'date':date}
  68.  
  69.  
  70.     def parse_index(self):
  71.         sections = []
  72.         for section, shref in self.newsweek_sections():
  73.             self.log('Processing section', section, shref)
  74.             articles = []
  75.             try:
  76.                 soups = [self.index_to_soup(shref)]
  77.             except:
  78.                 self.log.warn('Section %s not found, skipping'%section)
  79.                 continue
  80.             na = soups[0].find('a', rel='next')
  81.             if na:
  82.                 soups.append(self.index_to_soup(self.BASE_URL+na['href']))
  83.             for soup in soups:
  84.                 articles.extend(self.newsweek_parse_section_page(soup))
  85.                 if self.test and len(articles) > 1:
  86.                     break
  87.             if articles:
  88.                 sections.append((section, articles))
  89.             if self.test and len(sections) > 1:
  90.                 break
  91.         return sections
  92.  
  93.  
  94.  
  95.