#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__   = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
import datetime

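# Newsweek Polska recipe. The site's articles are split across several pages
# and some issues are apparently only partially available ('locked' entries),
# so the recipe looks for the newest fully available issue in the archive and
# stitches each multi-page article into a single temporary HTML file.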
class Newsweek(BasicNewsRecipe):
    EDITION = '0'
    DATE = None
    YEAR = datetime.datetime.now().year

    title = u'Newsweek Polska'
    __author__ = 'matek09'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    language = 'pl'
    remove_javascript = True

    temp_files = []
    articles_are_obfuscated = True

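    # Each article is paginated behind 'next' links: fetch every page, drop
    # the 'articleAside' block from continuation pages, and return the path
    # of a temporary HTML file holding the stitched-together article.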
    def get_obfuscated_article(self, url):
        br = self.get_browser()
        br.open(url)
        source = br.response().read()
        page = self.index_to_soup(source)

        main_section = page.find(id='mainSection')

        title = main_section.find('h1')
        info = main_section.find('ul', attrs={'class' : 'articleInfo'})
        authors = info.find('li').find('h4')
        article = main_section.find('div', attrs={'id' : 'article'})
        html = unicode(title) + unicode(authors) + unicode(article)
        next = main_section.find('li', attrs={'class' : 'next'})

        # Follow the 'next' links and append each continuation page.
        while next:
            url = next.find('a')['href']
            br.open(url)
            source = br.response().read()
            page = self.index_to_soup(source)
            main_section = page.find(id='mainSection')
            article = main_section.find('div', attrs={'id' : 'article'})
            aside = article.find(id='articleAside')
            if aside is not None:
                aside.extract()
            html = html + unicode(article)
            next = main_section.find('li', attrs={'class' : 'next'})

        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
        # Encode explicitly so Polish characters survive the binary write.
        self.temp_files[-1].write(html.encode('utf-8'))
        self.temp_files[-1].close()
        return self.temp_files[-1].name

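    # An issue counts as fully available when, across all of its
    # table-of-contents pages, at most one entry is marked 'locked'.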
    def is_full(self, issue_soup):
        while True:
            main_section = issue_soup.find(id='mainSection')
            next = main_section.find('li', attrs={'class' : 'next'})
            if len(main_section.findAll(attrs={'class' : 'locked'})) > 1:
                return False
            elif next is None:
                return True
            else:
                issue_soup = self.index_to_soup(next.find('a')['href'])

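    # Walk the issue drop-down on the archive page, newest first, and store
    # the first fully available issue in self.EDITION; if none qualifies,
    # retry with the previous year's archive listing.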
    def find_last_full_issue(self, archive_url):
        archive_soup = self.index_to_soup(archive_url)
        select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
        for option in select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')):
            self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/', '')
            issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
            if self.is_full(issue_soup):
                return

        self.YEAR = self.YEAR - 1
        self.find_last_full_issue(archive_url + ',' + str(self.YEAR))

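    # Build the index: pick the newest complete issue, record its cover and
    # date, then group the articles by the section shown in the 'kategorie'
    # div of each table-of-contents page.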
    def parse_index(self):
        archive_url = 'http://www.newsweek.pl/wydania/archiwum'
        self.find_last_full_issue(archive_url)
        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
        self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
        main_section = soup.find(id='mainSection')
        img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
        self.cover_url = img['src']
        feeds = []
        articles = {}
        sections = []
        while True:
            news_list = main_section.find('ul', attrs={'class' : 'newsList'})
            for h2 in news_list.findAll('h2'):
                article = self.create_article(h2)
                category_div = h2.findNext('div', attrs={'class' : 'kategorie'})
                section = self.tag_to_string(category_div)
                if articles.has_key(section):
                    articles[section].append(article)
                else:
                    articles[section] = [article]
                    sections.append(section)

            next = main_section.find('li', attrs={'class' : 'next'})
            if next is None:
                break
            soup = self.index_to_soup(next.find('a')['href'])
            main_section = soup.find(id='mainSection')

        for section in sections:
            feeds.append((section, articles[section]))
        return feeds

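    # Turn a table-of-contents <h2> entry into the article dictionary that
    # parse_index() returns to calibre (title, url, date, description).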
    def create_article(self, h2):
        article = {}
        a = h2.find('a')
        article['title'] = self.tag_to_string(a)
        article['url'] = a['href']
        article['date'] = self.DATE
        desc = h2.findNext('p')

        if desc is not None:
            article['description'] = self.tag_to_string(desc)
        else:
            article['description'] = ''
        return article