home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / theonion.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  83 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2009-2011, Darko Miletic <darko.miletic at gmail.com>'
  3.  
  4. '''
  5. theonion.com
  6. '''
  7.  
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9.  
  10. class TheOnion(BasicNewsRecipe):
  11.     title                 = 'The Onion'
  12.     __author__            = 'Darko Miletic'
  13.     description           = "America's finest news source"
  14.     oldest_article        = 2
  15.     max_articles_per_feed = 100
  16.     publisher             = 'Onion, Inc.'
  17.     category              = 'humor, news, USA'
  18.     language              = 'en'
  19.     no_stylesheets        = True
  20.     use_embedded_content  = False
  21.     encoding              = 'utf-8'
  22.     publication_type      = 'newsportal'
  23.     masthead_url          = 'http://o.onionstatic.com/img/headers/onion_190.png'
  24.     extra_css             = """
  25.                                 body{font-family: Helvetica,Arial,sans-serif}
  26.                                 .section_title{color: gray; text-transform: uppercase}
  27.                                 .title{font-family: Georgia,serif}
  28.                                 .meta{color: gray; display: inline}
  29.                                 .has_caption{display: block}
  30.                                 .caption{font-size: x-small; color: gray; margin-bottom: 0.8em}
  31.                             """
  32.  
  33.     conversion_options = {
  34.                           'comment'  : description
  35.                         , 'tags'     : category
  36.                         , 'publisher': publisher
  37.                         , 'language' : language
  38.                         }
  39.  
  40.     keep_only_tags = [
  41.                          dict(name='h2', attrs={'class':['section_title','title']})
  42.                         ,dict(attrs={'class':['main_image','meta','article_photo_lead','article_body']})
  43.                         ,dict(attrs={'id':['entries']})
  44.                      ]
  45.     remove_attributes=['lang','rel']
  46.     remove_tags_after = dict(attrs={'class':['article_body','feature_content']})
  47.     remove_tags = [
  48.                      dict(name=['object','link','iframe','base','meta'])
  49.                     ,dict(name='div', attrs={'class':['toolbar_side','graphical_feature','toolbar_bottom']})
  50.                     ,dict(name='div', attrs={'id':['recent_slider','sidebar','pagination','related_media']})
  51.                   ]
  52.  
  53.  
  54.     feeds = [
  55.               (u'Daily'  , u'http://feeds.theonion.com/theonion/daily' )
  56.              ,(u'Sports' , u'http://feeds.theonion.com/theonion/sports' )
  57.             ]
  58.  
  59.     def get_article_url(self, article):
  60.         artl = BasicNewsRecipe.get_article_url(self, article)
  61.         if artl.startswith('http://www.theonion.com/audio/'):
  62.            artl = None
  63.         return artl
  64.  
  65.     def preprocess_html(self, soup):
  66.         for item in soup.findAll(style=True):
  67.             del item['style']
  68.         for item in soup.findAll('a'):
  69.             limg = item.find('img')
  70.             if item.string is not None:
  71.                str = item.string
  72.                item.replaceWith(str)
  73.             else:
  74.                if limg:
  75.                   item.name  = 'div'
  76.                   item.attrs = []
  77.                   if not limg.has_key('alt'):
  78.                      limg['alt'] = 'image'
  79.                else:
  80.                    str = self.tag_to_string(item)
  81.                    item.replaceWith(str)
  82.         return soup
  83.