home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / laprensa.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  103 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
  5. '''
  6. laprensa.com.ar
  7. '''
  8.  
  9. from calibre.web.feeds.news import BasicNewsRecipe
  10.  
  11. class LaPrensa(BasicNewsRecipe):
  12.     title                 = 'La Prensa'
  13.     __author__            = 'Darko Miletic and Sujata Raman'
  14.     description           = 'Informacion Libre las 24 horas'
  15.     publisher             = 'La Prensa'
  16.     category              = 'news, politics, Argentina'
  17.     oldest_article        = 7
  18.     max_articles_per_feed = 100
  19.     no_stylesheets        = True
  20.     use_embedded_content  = False
  21.     encoding              = 'cp1252'
  22.    # cover_url             = 'http://www.laprensa.com.ar/imgs/logo.gif'
  23.     remove_javascript     = True
  24.     language = 'es_AR'
  25.     lang = 'es'
  26.  
  27.     html2lrf_options = [
  28.                           '--comment', description
  29.                         , '--category', category
  30.                         , '--publisher', publisher
  31.                         ]
  32.  
  33.     html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'
  34.     filter_regexps = [r'.*archive.aspx.*']
  35.  
  36.     remove_tags  = [
  37.                     dict(name='td', attrs={'class':["link-registro","link-buscador"]}),
  38.                     dict(name='td', attrs={'id':["TDTabItem1","TDTabItem2","TDTabItem3","TDTabItem4"]}),
  39.                     dict(name='table', attrs={'class':["marco-botonera"]}),
  40.                     dict(name='tr', attrs={'class':["messages","IUTabItemSelected"]}),
  41.                     dict(name='input', attrs={'id':"txt_allfields"}),
  42.                     dict(name='div', attrs={'id':["TabItem1","TabItem2","TabItem3","TabItem4","RCPanel"]}),
  43.                     dict(name='span', attrs={'id':["GWCNavigatorControl","_ctl15"]}),
  44.                     dict(name='span', attrs={'class':["ranking-titulo","IUTab"]}),
  45.                     dict(name='a', attrs={'class':["link-registro",]}),
  46.                     dict(name='img', src = "/versions/1/imgs/icono-comentario.gif"),
  47.                     dict(name='img', src = "/versions/1/imgs/logo.gif"),
  48.                     dict(name='img', src = "/versions/1/imgs/boton-ingresar-roll.gif"),
  49.                     dict(name='img', src = "/versions/1/imgs/icono-recomendar.gif"),
  50.                     dict(name='button'),
  51.                     dict(name='img', src = "/versions/1/imgs/boton-votar-roll.gif"),
  52.                     dict(name='img', src = "/versions/1/imgs/boton-ingresar.gif"),
  53.                     dict(name='img', src = "/versions/1/imgs/icono-imprimir.gif"),
  54.                     dict(name='img', src = "/versions/1/imgs/icono-ampliar-letra.gif"),
  55.                     dict(name='img', src = "/versions/1/imgs/icono-reducir-letra.gif"),
  56.                     dict(name='img', src = "/versions/1/imgs/pix-trans.gif"),
  57.                     dict(name='img', src = "/versions/1/imgs/icono-buscador.gif"),
  58.                     dict(name='img', src = "/versions/1/imgs/separador-linea-azul.gif"),
  59.                     dict(name='img', src = " /versions/1/imgs/separador-linea.gif"),
  60.                     dict(name='a',text ="Powered by Civinext Groupware - V. 2.0.3567.23706"),
  61.                     dict(name='img', height ="0")
  62.                     ]
  63.  
  64.     extra_css = '''
  65.                     .seccion{font-size:xx-small;}
  66.                     body{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
  67.                     .titulo-noticia-principal{font-size:large; color:#00427B; font-weight:bold;}
  68.                     .texto-subtitulos{font-weight:bold;font-size:x-small;}
  69.                     .fecha{font-size:xx-small;}
  70.                     .volanta{font-size:xx-small;}
  71.                 '''
  72.  
  73.     feeds = [
  74.               (u'Politica'    , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=4' )
  75.              ,(u'Economia'    , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx&Rss=5' )
  76.              ,(u'Opinion'     , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=6' )
  77.              ,(u'El Mundo'    , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=7' )
  78.              ,(u'Actualidad'  , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=8' )
  79.              ,(u'Deportes'    , u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=9' )
  80.              ,(u'Espectaculos', u'http://www.laprensa.com.ar/ResourcesManager.aspx?Resource=Rss.aspx?Rss=10')
  81.             ]
  82.  
  83.  
  84.     def preprocess_html(self, soup):
  85.  
  86.         for t in soup.findAll(['table','td','tr','span','tbody']):
  87.             t.name = 'div'
  88.         for t in soup.findAll(['hr']):
  89.             t.extract()
  90.  
  91.         mtag = '<meta http-equiv="Content-Language" content="es-AR"/>'
  92.         soup.head.insert(0,mtag)
  93.         for item in soup.findAll(style=True):
  94.             del item['style']
  95.         for item in soup.findAll(align = "center"):
  96.                 del item['align']
  97.         for item in soup.findAll(bgcolor="ffffff"):
  98.             del item['bgcolor']
  99.         return soup
  100.  
  101.  
  102.  
  103.