home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / independent.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  87 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. www.independent.co.uk
  5. '''
  6.  
  7. from calibre.web.feeds.news import BasicNewsRecipe
  8. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  9.  
  10. class TheIndependent(BasicNewsRecipe):
  11.     title                 = 'The Independent'
  12.     __author__            = 'Darko Miletic'
  13.     description           = 'Independent News - Breaking news, comment and features from The Independent newspaper'
  14.     publisher             = 'The Independent'
  15.     category              = 'news, politics, UK'
  16.     oldest_article        = 2
  17.     max_articles_per_feed = 200
  18.     no_stylesheets        = True
  19.     encoding              = 'cp1252'
  20.     use_embedded_content  = False
  21.     language              = 'en_GB'
  22.     remove_empty_feeds    = True
  23.     publication_type      = 'newspaper'
  24.     masthead_url          = 'http://www.independent.co.uk/independent.co.uk/images/logo-london.png'
  25.     extra_css             = """
  26.                                h1{font-family: Georgia,serif }
  27.                                body{font-family: Verdana,Arial,Helvetica,sans-serif}
  28.                                img{margin-bottom: 0.4em; display:block}
  29.                                .info,.caption,.credits{font-size: x-small}
  30.                             """
  31.  
  32.     conversion_options = {
  33.                           'comment'   : description
  34.                         , 'tags'      : category
  35.                         , 'publisher' : publisher
  36.                         , 'language'  : language
  37.                         }
  38.  
  39.     remove_tags      =[
  40.                         dict(name=['meta','link','object','embed','iframe','base','style'])
  41.                         ,dict(attrs={'class':['related-articles','share','googleCols','article-tools','paging','googleArt']})
  42.                         ,dict(attrs={'id':['newsVideoPlayer','yahoobook','google-intext']})
  43.                       ]
  44.     keep_only_tags   =[dict(attrs={'id':'article'})]
  45.     remove_attributes=['lang','onclick','width','xmlns:fb']
  46.  
  47.  
  48.     feeds = [
  49.               (u'UK'                 , u'http://www.independent.co.uk/news/uk/rss'                 )
  50.              ,(u'World'              , u'http://www.independent.co.uk/news/world/rss'              )
  51.              ,(u'Business'           , u'http://www.independent.co.uk/news/business/rss'           )
  52.              ,(u'People'             , u'http://www.independent.co.uk/news/people/rss'             )
  53.              ,(u'Science'            , u'http://www.independent.co.uk/news/science/rss'            )
  54.              ,(u'Media'              , u'http://www.independent.co.uk/news/media/rss'              )
  55.              ,(u'Education'          , u'http://www.independent.co.uk/news/education/rss'          )
  56.              ,(u'Leading Articles'   , u'http://www.independent.co.uk/opinion/leading-articles/rss')
  57.              ,(u'Comentators'        , u'http://www.independent.co.uk/opinion/commentators/rss'    )
  58.              ,(u'Columnists'         , u'http://www.independent.co.uk/opinion/columnists/rss'      )
  59.              ,(u'Letters'            , u'http://www.independent.co.uk/opinion/letters/rss'         )
  60.              ,(u'Big Question'       , u'http://www.independent.co.uk/extras/big-question/rss'     )
  61.              ,(u'Sport'              , u'http://www.independent.co.uk/sport/rss'                   )
  62.              ,(u'Life&Style'         , u'http://www.independent.co.uk/life-style/rss'              )
  63.              ,(u'Arts&Entertainment' , u'http://www.independent.co.uk/arts-entertainment/rss'      )
  64.              ,(u'Travel'             , u'http://www.independent.co.uk/travel/rss'                  )
  65.              ,(u'Money'              , u'http://www.independent.co.uk/money/rss'                   )
  66.             ]
  67.  
  68.     def get_article_url(self, article):
  69.         return article.get('guid',  None)
  70.  
  71.     def preprocess_html(self, soup):
  72.         for item in soup.body.findAll(style=True):
  73.             del item['style']
  74.         for item in soup.body.findAll(['author','preform']):
  75.             item.name='span'
  76.         for item in soup.body.findAll('img'):
  77.             if not item.has_key('alt'):
  78.                item['alt'] = 'image'
  79.         for item in soup.body.findAll('div', attrs={'class':['clear-o','body','photoCaption']}):
  80.             item.name = 'p'
  81.         for item in soup.body.findAll('div'):
  82.             if not item.attrs and not item.contents:
  83.                item.extract()
  84.         soup2 = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  85.         soup2.body.replaceWith(soup.body)
  86.         return soup2
  87.