home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / der_standard.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  82 lines

  1. #!/usr/bin/env  python
  2. # -*- coding: utf-8 -*-
  3.  
  4. __license__   = 'GPL v3'
  5. __copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'
  6.  
  7. ''' http://www.derstandard.at - Austrian Newspaper '''
  8. import re
  9. from calibre.web.feeds.news import BasicNewsRecipe
  10.  
  11. class DerStandardRecipe(BasicNewsRecipe):
  12.     title = u'derStandard'
  13.     __author__ = 'Gerhard Aigner and Sujata Raman'
  14.     description = u'Nachrichten aus ??sterreich'
  15.     publisher ='derStandard.at'
  16.     category = 'news, politics, nachrichten, Austria'
  17.     use_embedded_content = False
  18.     remove_empty_feeds = True
  19.     lang = 'de-AT'
  20.     no_stylesheets = True
  21.     encoding = 'utf-8'
  22.     language = 'de'
  23.  
  24.     oldest_article = 1
  25.     max_articles_per_feed = 100
  26.  
  27.     extra_css = '''
  28.                 .artikelBody{font-family:Arial,Helvetica,sans-serif;}
  29.                 .artikelLeft{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
  30.                 h4{color:#404450;font-size:x-small;}
  31.                 h6{color:#404450; font-size:x-small;}
  32.                 '''
  33.     feeds          = [(u'International', u'http://derstandard.at/?page=rss&ressort=internationalpolitik'),
  34.         (u'Inland', u'http://derstandard.at/?page=rss&ressort=innenpolitik'),
  35.         (u'Wirtschaft', u'http://derstandard.at/?page=rss&ressort=investor'),
  36.         (u'Web', u'http://derstandard.at/?page=rss&ressort=webstandard'),
  37.         (u'Sport', u'http://derstandard.at/?page=rss&ressort=sport'),
  38.         (u'Panorama', u'http://derstandard.at/?page=rss&ressort=panorama'),
  39.         (u'Etat', u'http://derstandard.at/?page=rss&ressort=etat'),
  40.         (u'Kultur', u'http://derstandard.at/?page=rss&ressort=kultur'),
  41.         (u'Wissenschaft', u'http://derstandard.at/?page=rss&ressort=wissenschaft'),
  42.         (u'Gesundheit', u'http://derstandard.at/?page=rss&ressort=gesundheit'),
  43.         (u'Bildung', u'http://derstandard.at/?page=rss&ressort=subildung')
  44.                       ]
  45.  
  46.     keep_only_tags = [
  47.                         dict(name='div', attrs={'class':["artikel","artikelLeft","artikelBody"]}) ,
  48.                          ]
  49.  
  50.     remove_tags = [
  51.                     dict(name='link'), dict(name='meta'),dict(name='iframe'),dict(name='style'),
  52.                     dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr'),
  53.                     dict(name='div', attrs={'class':["diashow"]})]
  54.     preprocess_regexps = [
  55.         (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
  56.         (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
  57.     ]
  58.  
  59.     filter_regexps = [r'/r[1-9]*']
  60.  
  61.     def get_article_url(self, article):
  62.         '''if the article links to a index page (ressort) or a picture gallery
  63.            (ansichtssache), don't add it'''
  64.         if ( article.link.count('ressort') > 0 or article.title.lower().count('ansichtssache') > 0 ):
  65.             return None
  66.         matchObj = re.search( re.compile(r'/r'+'[1-9]*',flags=0), article.link,flags=0)
  67.  
  68.         if matchObj:
  69.             return None
  70.  
  71.         return article.link
  72.  
  73.     def preprocess_html(self, soup):
  74.         soup.html['xml:lang'] = self.lang
  75.         soup.html['lang']     = self.lang
  76.         mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
  77.         soup.head.insert(0,mtag)
  78.  
  79.         for t in soup.findAll(['ul', 'li']):
  80.             t.name = 'div'
  81.         return soup
  82.