home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / la_tercera.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  2.8 KB  |  65 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. latercera.com
  5. '''
  6.  
  7. from calibre.web.feeds.news import BasicNewsRecipe
  8.  
  9. class LaTercera(BasicNewsRecipe):
  10.     news                  = True
  11.     title                 = 'La Tercera'
  12.     __author__            = 'Darko Miletic and Alex Mitrani'
  13.     description           = 'El sitio de noticias online de Chile'
  14.     publisher             = 'La Tercera'
  15.     category              = 'news, politics, Chile'
  16.     oldest_article        = 2
  17.     max_articles_per_feed = 100
  18.     no_stylesheets        = True
  19.     encoding              = 'cp1252'
  20.     use_embedded_content  = False
  21.     remove_empty_feeds    = True
  22.     language              = 'es_CL'
  23.  
  24.     conversion_options = {
  25.                           'comment'          : description
  26.                         , 'tags'             : category
  27.                         , 'publisher'        : publisher
  28.                         , 'language'         : language
  29.                         , 'linearize_tables' : True
  30.                         }
  31.  
  32.     keep_only_tags = [
  33.                     dict(name='h1', attrs={'class':['titularArticulo']})
  34.     ,dict(name='h4', attrs={'class':['bajadaArt']})
  35.     ,dict(name='h5', attrs={'class':['autorArt']})
  36.                     ,dict(name='div', attrs={'class':['articleContent']})
  37.                   ]
  38.  
  39.     remove_tags = [
  40.                     dict(name='div', attrs={'class':['boxCompartir','keywords']})
  41.                   ]
  42.  
  43.     remove_tags_after = [
  44.                     dict(name='div', attrs={'class':['keywords']})
  45.                   ]
  46.  
  47.  
  48.     feeds = [(u'La Tercera', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&ul=1')
  49.               ,(u'Politica', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=674')
  50.               ,(u'Nacional', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=680')
  51.               ,(u'Mundo', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=678')
  52.               ,(u'Negocios', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=655')
  53.               ,(u'Santiago', u'http://www.latercera.com/feed/manager?type=rss&sc=TEFURVJDRVJB&citId=9&categoryId=1731')
  54.               ,(u'Tendencias', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=659')
  55.               ,(u'Educacion', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=657')
  56.               ,(u'Cultura', u'http://www.latercera.com/feed/manager?type=rss&sc=TEFURVJDRVJB&citId=9&categoryId=1453')
  57.               ,(u'Entretenci├│n', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=661')
  58.               ,(u'Deportes', u'http://www.latercera.com/app/rss?sc=TEFURVJDRVJB&category=656')
  59.             ]
  60.  
  61.     def preprocess_html(self, soup):
  62.         for item in soup.findAll(style=True):
  63.             del item['style']
  64.         return soup
  65.