home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / bbc_fast.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  88 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2010 - 2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. news.bbc.co.uk
  5. '''
  6. from calibre.web.feeds.recipes import BasicNewsRecipe
  7.  
  8. class BBC(BasicNewsRecipe):
  9.     title                  = 'BBC News (fast)'
  10.     __author__             = 'Darko Miletic, Starson17'
  11.     description            = 'Visit BBC News for up-to-the-minute news, breaking news, video, audio and feature stories. BBC News provides trusted World and UK news as well as local and regional perspectives. Also entertainment, business, science, technology and health news.'
  12.     oldest_article         = 2
  13.     max_articles_per_feed  = 100
  14.     no_stylesheets         = True
  15.     use_embedded_content   = False
  16.     encoding               = 'utf8'
  17.     publisher              = 'BBC'
  18.     category               = 'news, UK, world'
  19.     language               = 'en_GB'
  20.     publication_type       = 'newsportal'
  21.     masthead_url           = 'http://news.bbcimg.co.uk/img/1_0_1/cream/hi/news/news-blocks.gif'
  22.     extra_css              = """
  23.                                  body{ font-family: Verdana,Helvetica,Arial,sans-serif }
  24.                                  .introduction{font-weight: bold}
  25.                                  .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small}
  26.                                  .story-feature h2{text-align: center; text-transform: uppercase}
  27.                              """
  28.     conversion_options = {
  29.                              'comments'        : description
  30.                             ,'tags'            : category
  31.                             ,'language'        : language
  32.                             ,'publisher'       : publisher
  33.                             ,'linearize_tables': True
  34.                          }
  35.  
  36.     keep_only_tags    = [
  37.                           dict(name='div', attrs={'class':['layout-block-a layout-block']})
  38.                          ,dict(attrs={'class':['story-body','storybody']})
  39.                          ,dict(attrs={'id':['meta-information','story-body']})
  40.                         ]
  41.  
  42.     remove_tags = [
  43.                        dict(name='div', attrs={'class':['story-feature related narrow', \
  44.                                                         'share-help', 'embedded-hyper', \
  45.                                                         'story-feature wide ', \
  46.                                                         'story-feature narrow', \
  47.                                                         'hidden','story-actions', \
  48.                                                         'embedded-hyper']})
  49.                        ,dict(name=['img','meta','link','object','embed','iframe','base'])
  50.                        ,dict(attrs={'class':['hidden','videoInStoryC']})
  51.                        ,dict(attrs={'id':['bbccom_sponsor_section','toggle-controls', \
  52.                                           'toggle-images','toggle-title']})
  53.                   ]
  54.  
  55.     remove_attributes = ['width','height','xmlns:og','lang','clear']
  56.  
  57.     feeds          = [
  58.                       ('Top Stories'        , 'http://feeds.bbci.co.uk/news/rss.xml'                        ),
  59.                       ('Science/Environment', 'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml'),
  60.                       ('Technology'         , 'http://feeds.bbci.co.uk/news/technology/rss.xml'             ),
  61.                       ('Entertainment/Arts' , 'http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml' ),
  62.                       ('Magazine'           , 'http://feeds.bbci.co.uk/news/magazine/rss.xml'               ),
  63.                       ('Business'           , 'http://feeds.bbci.co.uk/news/business/rss.xml'               ),
  64.                       ('Politics'           , 'http://feeds.bbci.co.uk/news/politics/rss.xml'               ),
  65.                       ('Health'             , 'http://feeds.bbci.co.uk/news/health/rss.xml'                 ),
  66.                       ('US&Canada'          , 'http://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml'    ),
  67.                       ('Latin America'      , 'http://feeds.bbci.co.uk/news/world/latin_america/rss.xml'    ),
  68.                       ('Europe'             , 'http://feeds.bbci.co.uk/news/world/europe/rss.xml'           ),
  69.                       ('South Asia'         , 'http://feeds.bbci.co.uk/news/world/south_asia/rss.xml'       ),
  70.                       ('England'            , 'http://feeds.bbci.co.uk/news/england/rss.xml'                ),
  71.                       ('Asia-Pacific'       , 'http://feeds.bbci.co.uk/news/world/asia_pacific/rss.xml'     ),
  72.                       ('Africa'             , 'http://feeds.bbci.co.uk/news/world/africa/rss.xml'           )
  73.                     ]
  74.  
  75.     def preprocess_html(self, soup):
  76.         for item in soup.findAll(style=True):
  77.             del item['style']
  78.         for item in soup.findAll('left'):
  79.             item.name='span'
  80.         for item in soup.findAll('a'):
  81.             if item.string is not None:
  82.                str = item.string
  83.                item.replaceWith(str)
  84.             else:
  85.                str = self.tag_to_string(item)
  86.                item.replaceWith(str)
  87.         return soup
  88.