home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / msnbc.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  4.4 KB  |  88 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2010-2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. msnbc.msn.com
  5. '''
  6.  
  7. from calibre.web.feeds.recipes import BasicNewsRecipe
  8.  
  9. class MsNBC(BasicNewsRecipe):
  10.     title                  = 'msnbc.com'
  11.     __author__             = 'Darko Miletic'
  12.     description            = 'A Fuller Spectrum of News'
  13.     oldest_article         = 2
  14.     max_articles_per_feed  = 100
  15.     no_stylesheets         = True
  16.     use_embedded_content   = False
  17.     encoding               = 'utf8'
  18.     publisher              = 'msnbc.com'
  19.     category               = 'news, USA, world'
  20.     language               = 'en'
  21.     extra_css              = """
  22.                                 body{ font-family: Georgia,Times,serif }
  23.                                 .hide{display: none}
  24.                                 .caption{font-family: Arial,sans-serif; font-size: x-small}
  25.                                 .entry-summary{font-family: Arial,sans-serif}
  26.                                 .copyright{font-size: 0.95em; font-style: italic}
  27.                                 .source-org{font-size: small; font-family: Arial,sans-serif}
  28.                                 img{display: block; margin-bottom: 0.5em}
  29.                                 span.byline{display: none}
  30.                             """
  31.  
  32.     conversion_options = {
  33.                              'comments' : description
  34.                             ,'tags'     : category
  35.                             ,'language' : language
  36.                             ,'publisher': publisher
  37.                          }
  38.  
  39.     remove_tags_before = dict(name='h1', attrs={'id':'headline'})
  40.     remove_tags_after = dict(name='span', attrs={'class':['copyright','Linear copyright']})
  41.     keep_only_tags=[
  42.                       dict(attrs={'id':['headline','deck','byline','source','intelliTXT']})
  43.                      ,dict(attrs={'class':['gl_headline','articleText','drawer-content Linear','v-center3','byline','textBodyBlack']})
  44.                    ]
  45.     remove_attributes=['property','lang','rel','xmlns:fb','xmlns:v','xmlns:dc','xmlns:dcmitype','xmlns:og','xmlns:media','xmlns:vcard','typeof','itemscope','itemtype','itemprop','about','type','size','width','height','onreadystatechange','data','border','hspace','vspace']
  46.  
  47.     remove_tags      = [
  48.                           dict(name=['iframe','object','link','embed','meta','table'])
  49.                          ,dict(name='span', attrs={'class':['copyright','Linear copyright']})
  50.                          ,dict(name='div', attrs={'class':'social'})
  51.                        ]
  52.  
  53.  
  54.     feeds = [
  55.                (u'US News'       , u'http://rss.msnbc.msn.com/id/3032524/device/rss/rss.xml'      )
  56.               ,(u'World News'    , u'http://rss.msnbc.msn.com/id/3032506/device/rss/rss.xml'      )
  57.               ,(u'Politics'      , u'http://rss.msnbc.msn.com/id/3032552/device/rss/rss.xml'      )
  58.               ,(u'Business'      , u'http://rss.msnbc.msn.com/id/3032071/device/rss/rss.xml'      )
  59.               ,(u'Sports'        , u'http://rss.nbcsports.msnbc.com/id/3032112/device/rss/rss.xml')
  60.               ,(u'Entertainment' , u'http://rss.msnbc.msn.com/id/3032083/device/rss/rss.xml'      )
  61.               ,(u'Health'        , u'http://rss.msnbc.msn.com/id/3088327/device/rss/rss.xml'      )
  62.               ,(u'Tech & Science', u'http://rss.msnbc.msn.com/id/3032117/device/rss/rss.xml'      )
  63.             ]
  64.  
  65.     def preprocess_html(self, soup):
  66.         for item in soup.body.findAll('html'):
  67.             item.name='div'
  68.         for item in soup.body.findAll('div'):
  69.             if item.has_key('id') and item['id'].startswith('vine-'):
  70.                item.extract()
  71.             if item.has_key('class') and ( item['class'].startswith('ad') or item['class'].startswith('vine')):
  72.                item.extract()
  73.         for item in soup.body.findAll('img'):
  74.             if not item.has_key('alt'):
  75.                item['alt'] = 'image'
  76.         for item in soup.body.findAll('ol'):
  77.             if item.has_key('class') and item['class'].startswith('grid'):
  78.                item.extract()
  79.         for item in soup.body.findAll('span'):
  80.             if ( item.has_key('id') and item['id'].startswith('byLine') and item.string is None) or ( item.has_key('class') and item['class'].startswith('inline') ):
  81.                item.extract()
  82.         for alink in soup.findAll('a'):
  83.             if alink.string is not None:
  84.                tstr = alink.string
  85.                alink.replaceWith(tstr)
  86.         return soup
  87.  
  88.