Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / newsweek.recipe < prev next >

Wrap

Text File | 2011-09-09 | 3KB | 95 lines

from calibre.web.feeds.news import BasicNewsRecipe


class Newsweek(BasicNewsRecipe):
    """Recipe that builds a Newsweek issue from newsweek.com topic pages.

    Each entry in ``topics`` becomes one section; the articles of a section
    are scraped from its tag page (and the following page, when one is
    linked with ``rel="next"``).
    """

    title = 'Newsweek'
    __author__ = 'Kovid Goyal'
    description = 'Weekly news and current affairs in the US'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    recipe_disabled = ('Newsweek was taken over by The Daily Beast,'
            ' newsweek.com no longer exists, so this recipe '
            ' has been disabled.')

    BASE_URL = 'http://www.newsweek.com'

    # Section title -> path of the topic page, relative to BASE_URL.
    topics = {
        'Culture': '/tag/culture.html',
        'Business': '/tag/business.html',
        'Society': '/tag/society.html',
        'Science': '/tag/science.html',
        'Education': '/tag/education.html',
        'Politics': '/tag/politics.html',
        'Health': '/tag/health.html',
        'World': '/tag/world.html',
        'Nation': '/tag/nation.html',
        'Technology': '/tag/technology.html',
        'Game Changers': '/tag/game-changers.html',
    }

    keep_only_tags = dict(name='article', attrs={'class': 'article-text'})
    remove_tags = [dict(attrs={'data-dartad': True})]
    remove_attributes = ['property']

    def postprocess_html(self, soup, first):
        """Rename every ``<article>`` and ``<header>`` tag to ``<div>``.

        Returns the same ``soup`` object, modified in place. ``first`` is
        accepted for interface compatibility and is not used here.
        """
        for tag in soup.findAll(name=['article', 'header']):
            tag.name = 'div'
        return soup

    def newsweek_sections(self):
        """Yield ``(topic_name, absolute_url)`` for every entry in ``topics``."""
        # .items() behaves the same as iteritems() here and also works on
        # Python 3, where iteritems() does not exist.
        for topic_name, topic_url in self.topics.items():
            yield (topic_name, self.BASE_URL + topic_url)

    def newsweek_parse_section_page(self, soup):
        """Yield one article dict per ``stream-item`` article found in ``soup``.

        Each dict has the keys ``title``, ``url``, ``description`` and
        ``date``. Articles without a ``dc:title`` element are skipped.
        """
        for article in soup.findAll('article', about=True,
                attrs={'class': 'stream-item'}):
            title = article.find(attrs={'property': 'dc:title'})
            if title is None:
                continue
            title = self.tag_to_string(title)
            url = self.BASE_URL + article['about']

            desc = ''
            # The attribute filter must be passed as ``attrs``; passing the
            # dict positionally would make it the tag-name matcher instead.
            author = article.find(attrs={'property': 'dc:creator'})
            if author:
                desc = u'by %s. \n' % self.tag_to_string(author)

            p = article.find(attrs={'property': 'dc:abstract'})
            if p is not None:
                # Strip links out of the abstract before taking its text.
                # findAll returns a (possibly empty) list, so this is safe
                # when the abstract contains no links at all.
                for a in p.findAll('a'):
                    a.extract()
                desc += self.tag_to_string(p)

            t = article.find('time', attrs={'property': 'dc:created'})
            date = ''
            if t is not None:
                date = u' [%s]' % self.tag_to_string(t)

            self.log('\tFound article:', title, 'at', url)
            self.log('\t\t', desc)
            yield {'title': title, 'url': url, 'description': desc,
                    'date': date}

    def parse_index(self):
        """Return a list of ``(section_title, [article_dict, ...])`` tuples.

        Sections whose topic page cannot be fetched are logged and skipped,
        and sections that yield no articles are omitted. When ``self.test``
        is truthy, scraping stops early once more than one article per
        section and more than one section have been collected.
        """
        sections = []
        for section, shref in self.newsweek_sections():
            self.log('Processing section', section, shref)
            articles = []
            try:
                soups = [self.index_to_soup(shref)]
            except Exception:
                # Best effort: a missing or broken topic page must not abort
                # the whole download, but KeyboardInterrupt/SystemExit
                # should still propagate.
                self.log.warn('Section %s not found, skipping' % section)
                continue

            # Follow at most one "next" link from the first page.
            na = soups[0].find('a', rel='next')
            if na:
                soups.append(self.index_to_soup(self.BASE_URL + na['href']))

            for soup in soups:
                articles.extend(self.newsweek_parse_section_page(soup))
                if self.test and len(articles) > 1:
                    break

            if articles:
                sections.append((section, articles))
            if self.test and len(sections) > 1:
                break
        return sections