#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__   = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
import datetime

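# Newsweek Polska recipe. The site's articles are split across several pages
# and some issues are apparently only partially available ('locked' entries),
# so the recipe looks for the newest fully available issue in the archive and
# stitches each multi-page article into a single temporary HTML file.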
class Newsweek(BasicNewsRecipe):
    EDITION = '0'
    DATE = None
    YEAR = datetime.datetime.now().year

    title = u'Newsweek Polska'
    __author__ = 'matek09'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    language = 'pl'
    remove_javascript = True

    temp_files = []
    articles_are_obfuscated = True

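    # Each article is paginated behind 'next' links: fetch every page, drop
    # the 'articleAside' block from continuation pages, and return the path
    # of a temporary HTML file holding the stitched-together article.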
    def get_obfuscated_article(self, url):
        br = self.get_browser()
        br.open(url)
        source = br.response().read()
        page = self.index_to_soup(source)

        main_section = page.find(id='mainSection')

        title = main_section.find('h1')
        info = main_section.find('ul', attrs={'class' : 'articleInfo'})
        authors = info.find('li').find('h4')
        article = main_section.find('div', attrs={'id' : 'article'})
        html = unicode(title) + unicode(authors) + unicode(article)
        next = main_section.find('li', attrs={'class' : 'next'})

        # Follow the 'next' links and append each continuation page.
        while next:
            url = next.find('a')['href']
            br.open(url)
            source = br.response().read()
            page = self.index_to_soup(source)
            main_section = page.find(id='mainSection')
            article = main_section.find('div', attrs={'id' : 'article'})
            aside = article.find(id='articleAside')
            if aside is not None:
                aside.extract()
            html = html + unicode(article)
            next = main_section.find('li', attrs={'class' : 'next'})

        self.temp_files.append(PersistentTemporaryFile('_temparse.html'))
        # Encode explicitly so Polish characters survive the binary write.
        self.temp_files[-1].write(html.encode('utf-8'))
        self.temp_files[-1].close()
        return self.temp_files[-1].name

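    # An issue counts as fully available when, across all of its
    # table-of-contents pages, at most one entry is marked 'locked'.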
    def is_full(self, issue_soup):
        while True:
            main_section = issue_soup.find(id='mainSection')
            next = main_section.find('li', attrs={'class' : 'next'})
            if len(main_section.findAll(attrs={'class' : 'locked'})) > 1:
                return False
            elif next is None:
                return True
            else:
                issue_soup = self.index_to_soup(next.find('a')['href'])

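    # Walk the issue drop-down on the archive page, newest first, and store
    # the first fully available issue in self.EDITION; if none qualifies,
    # retry with the previous year's archive listing.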
    def find_last_full_issue(self, archive_url):
        archive_soup = self.index_to_soup(archive_url)
        select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
        for option in select.findAll(lambda tag: tag.name == 'option' and tag.has_key('value')):
            self.EDITION = option['value'].replace('http://www.newsweek.pl/wydania/', '')
            issue_soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
            if self.is_full(issue_soup):
                return

        self.YEAR = self.YEAR - 1
        self.find_last_full_issue(archive_url + ',' + str(self.YEAR))

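    # Build the index: pick the newest complete issue, record its cover and
    # date, then group the articles by the section shown in the 'kategorie'
    # div of each table-of-contents page.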
    def parse_index(self):
        archive_url = 'http://www.newsweek.pl/wydania/archiwum'
        self.find_last_full_issue(archive_url)
        soup = self.index_to_soup('http://www.newsweek.pl/wydania/' + self.EDITION)
        self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
        main_section = soup.find(id='mainSection')
        img = main_section.find(lambda tag: tag.name == 'img' and tag.has_key('alt') and tag.has_key('title'))
        self.cover_url = img['src']
        feeds = []
        articles = {}
        sections = []
        while True:
            news_list = main_section.find('ul', attrs={'class' : 'newsList'})
            for h2 in news_list.findAll('h2'):
                article = self.create_article(h2)
                category_div = h2.findNext('div', attrs={'class' : 'kategorie'})
                section = self.tag_to_string(category_div)
                if articles.has_key(section):
                    articles[section].append(article)
                else:
                    articles[section] = [article]
                    sections.append(section)

            next = main_section.find('li', attrs={'class' : 'next'})
            if next is None:
                break
            soup = self.index_to_soup(next.find('a')['href'])
            main_section = soup.find(id='mainSection')

        for section in sections:
            feeds.append((section, articles[section]))
        return feeds

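    # Turn a table-of-contents <h2> entry into the article dictionary that
    # parse_index() returns to calibre (title, url, date, description).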
    def create_article(self, h2):
        article = {}
        a = h2.find('a')
        article['title'] = self.tag_to_string(a)
        article['url'] = a['href']
        article['date'] = self.DATE
        desc = h2.findNext('p')

        if desc is not None:
            article['description'] = self.tag_to_string(desc)
        else:
            article['description'] = ''
        return article