home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / der_spiegel.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  84 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2011, Nikolas Mangold <nmangold at gmail.com>'
  5. '''
  6. spiegel.de
  7. '''
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9. from calibre import strftime
  10. from calibre import re
  11.  
  12. class DerSpiegel(BasicNewsRecipe):
  13.     title                  = 'Der Spiegel'
  14.     __author__             = 'Nikolas Mangold'
  15.     description            = 'Der Spiegel, Printed Edition. Access to paid content.'
  16.     publisher              = 'SPIEGEL-VERLAG RUDOLF AUGSTEIN GMBH & CO. KG'
  17.     category               = 'news, politics, Germany'
  18.     no_stylesheets         = True
  19.     encoding               = 'cp1252'
  20.     needs_subscription     = True
  21.     remove_empty_feeds     = True
  22.     delay                  = 1
  23.     PREFIX                 = 'http://m.spiegel.de'
  24.     INDEX                  = PREFIX + '/spiegel/print/epaper/index-heftaktuell.html'
  25.     use_embedded_content   = False
  26.     masthead_url = 'http://upload.wikimedia.org/wikipedia/en/thumb/1/17/Der_Spiegel_logo.svg/200px-Der_Spiegel_logo.svg.png'
  27.     language               = 'de'
  28.     publication_type       = 'magazine'
  29.     extra_css              = ' body{font-family: Arial,Helvetica,sans-serif} '
  30.     timefmt = '[%W/%Y]'
  31.     empty_articles = ['Titelbild']
  32.     preprocess_regexps = [
  33.         (re.compile(r'<p>◆</p>', re.DOTALL|re.IGNORECASE), lambda match: '<hr>'),
  34.         ]
  35.  
  36.     def get_browser(self):
  37.         def has_login_name(form):
  38.             try:
  39.                 form.find_control(name="f.loginName")
  40.             except:
  41.                 return False
  42.             else:
  43.                 return True
  44.  
  45.         br = BasicNewsRecipe.get_browser()
  46.         if self.username is not None and self.password is not None:
  47.             br.open(self.PREFIX + '/meinspiegel/login.html')
  48.             br.select_form(predicate=has_login_name)
  49.             br['f.loginName'    ] = self.username
  50.             br['f.password'] = self.password
  51.             br.submit()
  52.         return br
  53.  
  54.     remove_tags_before =  dict(attrs={'class':'spArticleContent'})
  55.     remove_tags_after  =  dict(attrs={'class':'spArticleCredit'})
  56.  
  57.     def parse_index(self):
  58.         soup = self.index_to_soup(self.INDEX)
  59.  
  60.         cover = soup.find('img', width=248)
  61.         if cover is not None:
  62.             self.cover_url = cover['src']
  63.  
  64.         index = soup.find('dl')
  65.  
  66.         feeds = []
  67.         for section in index.findAll('dt'):
  68.             section_title = self.tag_to_string(section).strip()
  69.             self.log('Found section ', section_title)
  70.  
  71.             articles = []
  72.             for article in section.findNextSiblings(['dd','dt']):
  73.                 if article.name == 'dt':
  74.                     break
  75.                 link = article.find('a')
  76.                 title = self.tag_to_string(link).strip()
  77.                 if title in self.empty_articles:
  78.                     continue
  79.                 self.log('Found article ', title)
  80.                 url = self.PREFIX + link['href']
  81.                 articles.append({'title' : title, 'date' : strftime(self.timefmt), 'url' : url})
  82.             feeds.append((section_title,articles))
  83.         return feeds;
  84.