home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / el_periodico.recipe < prev    next >
Text File  |  2011-09-09  |  6KB  |  127 lines

  1. #!/usr/bin/env  python
  2. # -*- coding: utf-8 -*-
  3.  
  4. __license__     = 'GPL v3'
  5. __copyright__   = '04 December 2010, desUBIKado'
  6. __author__      = 'desUBIKado'
  7. __description__ = 'Daily newspaper from Aragon'
  8. __version__     = 'v0.07'
  9. __date__        = '06, February 2011'
  10. '''
  11. elperiodicodearagon.com
  12. '''
  13. import re
  14. from calibre.web.feeds.news import BasicNewsRecipe
  15.  
  16.  
  17. class elperiodicodearagon(BasicNewsRecipe):
  18.     title                 = u'El Periodico de Aragon'
  19.     __author__            = u'desUBIKado'
  20.     description           = u'Noticias desde Aragon'
  21.     publisher             = u'elperiodicodearagon.com'
  22.     category              = u'news, politics, Spain, Aragon'
  23.     oldest_article        = 2
  24.     delay                 = 0
  25.     max_articles_per_feed = 100
  26.     no_stylesheets        = True
  27.     use_embedded_content  = False
  28.     language              = 'es'
  29.     encoding              = 'utf8'
  30.     remove_empty_feeds    = True
  31.     remove_javascript     = True
  32.  
  33.  
  34.     conversion_options = {
  35.                              'comments'  : description
  36.                             ,'tags'      : category
  37.                             ,'language'  : language
  38.                             ,'publisher' : publisher
  39.                          }
  40.  
  41.     feeds              = [
  42.                            (u'Arag\xf3n', u'http://elperiodicodearagon.com/RSS/2.xml'),
  43.                            (u'Internacional', u'http://elperiodicodearagon.com/RSS/4.xml'),
  44.                            (u'Espa\xf1a', u'http://elperiodicodearagon.com/RSS/3.xml'),
  45.                            (u'Econom\xeda', u'http://elperiodicodearagon.com/RSS/5.xml'),
  46.                            (u'Deportes', u'http://elperiodicodearagon.com/RSS/7.xml'),
  47.                            (u'Real Zaragoza', u'http://elperiodicodearagon.com/RSS/10.xml'),
  48.                            (u'Opini\xf3n', u'http://elperiodicodearagon.com/RSS/103.xml'),
  49.                            (u'Escenarios', u'http://elperiodicodearagon.com/RSS/105.xml'),
  50.                            (u'Sociedad', u'http://elperiodicodearagon.com/RSS/104.xml'),
  51.                            (u'Gente', u'http://elperiodicodearagon.com/RSS/330.xml')
  52.                          ]
  53.  
  54.  
  55.     extra_css = '''
  56.                     h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
  57.                     h2 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
  58.                     h4 {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:20px;}
  59.                     .columnaDeRecursosRelacionados {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
  60.                     img{margin-bottom: 0.4em}
  61.         '''
  62.  
  63.     remove_attributes = ['height','width']
  64.  
  65.     keep_only_tags     = [dict(name='div', attrs={'id':'contenidos'})]
  66.  
  67.  
  68.     # Quitar toda la morralla
  69.  
  70.     remove_tags        = [dict(name='ul', attrs={'class':'herramientasDeNoticia'}),
  71.                           dict(name='span', attrs={'class':'MasInformacion '}),
  72.                           dict(name='span', attrs={'class':'MasInformacion'}),
  73.                           dict(name='div', attrs={'class':'Middle'}),
  74.                           dict(name='div', attrs={'class':'MenuCabeceraRZaragoza'}),
  75.                           dict(name='div', attrs={'id':'MenuCabeceraRZaragoza'}),
  76.                           dict(name='div', attrs={'class':'MenuEquipo'}),
  77.                           dict(name='div', attrs={'class':'TemasRelacionados'}),
  78.                           dict(name='div', attrs={'class':'GaleriaEnNoticia'}),
  79.                           dict(name='div', attrs={'class':'Recorte'}),
  80.                           dict(name='div', attrs={'id':'NoticiasenRecursos'}),
  81.                           dict(name='div', attrs={'id':'NoticiaEnPapel'}),
  82.                           dict(name='p', attrs={'class':'RecorteEnNoticias'}),
  83.                           dict(name='div', attrs={'id':'Comparte'}),
  84.                           dict(name='div', attrs={'id':'CajaComparte'}),
  85.                           dict(name='a', attrs={'class':'EscribirComentario'}),
  86.                           dict(name='a', attrs={'class':'AvisoComentario'}),
  87.                           dict(name='div', attrs={'class':'CajaAvisoComentario'}),
  88.                           dict(name='div', attrs={'class':'navegaNoticias'}),
  89.                           dict(name='div', attrs={'class':'Mensaje'}),
  90.                           dict(name='div', attrs={'id':'PaginadorDiCom'}),
  91.                           dict(name='div', attrs={'id':'CajaAccesoCuentaUsuario'}),
  92.                           dict(name='div', attrs={'id':'CintilloComentario'}),
  93.                           dict(name='div', attrs={'id':'EscribeComentario'}),
  94.                           dict(name='div', attrs={'id':'FormularioComentario'}),
  95.                           dict(name='div', attrs={'id':'FormularioNormas'})]
  96.  
  97.     # Recuperamos la portada de papel (la imagen format=1 tiene mayor resolucion)
  98.  
  99.     def get_cover_url(self):
  100.         index = 'http://pdf.elperiodicodearagon.com/'
  101.         soup = self.index_to_soup(index)
  102.         for image in soup.findAll('img',src=True):
  103.            if image['src'].startswith('http://pdf.elperiodicodearagon.com/funciones/portada-preview.php?eid='):
  104.               return image['src'].rstrip('format=2') + 'format=1'
  105.         return None
  106.  
  107.     # Para quitar espacios entre la noticia y los comentarios (lineas 1 y 2)
  108.     # El indice no apuntaba correctamente al empiece de la noticia (linea 3)
  109.  
  110.     preprocess_regexps = [
  111.         (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
  112.         (re.compile(r'<p> </p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
  113.         (re.compile(r'<p id="">', re.DOTALL|re.IGNORECASE), lambda match: '<p>')
  114.         ]
  115.  
  116.     # Para sustituir el video incrustado de YouTube por una imagen
  117.  
  118.     def preprocess_html(self, soup):
  119.         for video_yt in soup.findAll('iframe',{'title':'YouTube video player'}):
  120.             if video_yt:
  121.                video_yt.name = 'img'
  122.                fuente = video_yt['src']
  123.                fuente2 = fuente.replace('http://www.youtube.com/embed/','http://img.youtube.com/vi/')
  124.                video_yt['src'] = fuente2 + '/0.jpg'
  125.  
  126.         return soup
  127.