home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / el_cultural.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  3.6 KB  |  87 lines

  1. from calibre.web.feeds.recipes import BasicNewsRecipe
  2.  
  class RevistaElCultural(BasicNewsRecipe):
      """Calibre recipe for the print edition of the magazine El Cultural.

      The summary page of the print edition is scanned for links whose path
      starts with ``/version_papel/<SECTION>/``; each such link becomes one
      article of the feed named after that section.
      """

      title = 'Revista El Cultural'
      __author__ = 'Jefferson Frantz'
      description = 'Revista de cultura'
      timefmt = ' [%d %b, %Y]'
      language = 'es'

      no_stylesheets = True
      remove_javascript = True

      # Site root, prepended to the relative article links found on the page.
      ec_site = 'http://www.elcultural.es'
      # Summary page of the print edition; every section is read from it.
      ec_index_url = ec_site + '/pdf_sumario/cultural/Sumario_El_Cultural_en_PDF'
      # Sections to download, in output order.  'OPINION' was commented out
      # in the original section list and stays disabled.
      ec_sections = ('LETRAS', 'ARTE', 'CINE', 'CIENCIA', 'ESCENARIOS')

      # NOTE: the article container carries the three classes "column",
      # "span-13" and "last"; a compound class selector is needed to match it
      # (".column span-13 last" would look for <span-13>/<last> elements).
      extra_css = (
          'h1{ font-family: sans-serif; font-size: large; font-weight: bolder; text-align: justify } '
          'h2{ font-family: sans-serif; font-size: small; font-weight: 500; text-align: justify } '
          'h3{ font-family: sans-serif; font-size: small; font-weight: 500; text-align: justify } '
          'h4{ font-family: sans-serif; font-weight: lighter; font-size: medium; font-style: italic; text-align: justify } '
          '.rtsArticuloFirma{ font-family: sans-serif; font-size: small; text-align: justify } '
          '.column.span-13.last{ font-family: sans-serif; font-size: medium; text-align: justify } '
          '.rtsImgArticulo{font-family: serif; font-size: small; color: #000000; text-align: justify}'
      )

      keep_only_tags = [
          dict(name='div', attrs={'class': ['column span-13 last']}),
          dict(name='div', attrs={'class': ['rtsImgArticulo']}),
      ]

      remove_tags = [
          dict(name=['object', 'link', 'script', 'ul']),
          dict(name='div', attrs={'class': ['rtsRating']}),
      ]

      def preprocess_html(self, soup):
          """Remove every inline ``style`` attribute and return the soup.

          Inline styles are dropped so that only ``extra_css`` styles the
          downloaded articles.
          """
          for item in soup.findAll(style=True):
              del item['style']
          return soup

      @staticmethod
      def _ec_clean_title(title):
          """Return the part of ``title`` before the first ``|``, stripped.

          A title without ``|`` is returned whole (only stripped).  If nothing
          is left before the ``|``, the stripped full title is returned so the
          article never ends up with an empty name.
          """
          head = title.partition('|')[0].strip()
          return head or title.strip()

      # TO GET ARTICLES IN SECTION
      def ec_parse_section(self, url, titleSection, soup=None):
          """Collect the articles of one section from the summary page.

          :param url: address of the summary page; it is downloaded only when
              ``soup`` is not supplied.
          :param titleSection: section name as used in the article paths
              (``/version_papel/<titleSection>/``).
          :param soup: optional, already parsed summary page, so that several
              sections can share a single download.
          :return: list of dicts with the keys ``title``, ``url``,
              ``description`` and ``date``; empty when nothing was found.
          """
          self.log('Section: ' + titleSection)
          if soup is None:
              soup = self.index_to_soup(url)

          start = soup.find(attrs={'id': 'gallery'})
          if start is None:
              # Page layout changed or the download returned something else:
              # report it instead of failing with an AttributeError.
              self.log('\tNo element with id "gallery" found in', url)
              return []

          prefix = '/version_papel/' + titleSection + '/'
          current_articles = []

          for a in start.findAllNext('a', href=True):
              title = self.tag_to_string(a)
              href = a.get('href', False)
              if not href or not title:
                  continue

              if not href.startswith(prefix):
                  # After at least one article has been collected, the first
                  # foreign link that is not under '/secciones/' ends the scan.
                  if current_articles and not href.startswith('/secciones/'):
                      break
                  continue

              article_url = self.ec_site + href
              article_title = self._ec_clean_title(title)

              self.log('\t\tFound article:', article_title)
              self.log('\t\t\t', article_url)
              current_articles.append({
                  'title': article_title,
                  'url': article_url,
                  'description': '',
                  'date': '',
              })

          return current_articles

      # To GET SECTIONS
      def parse_index(self):
          """Build the feed list: one ``(section, articles)`` pair per section.

          The summary page is downloaded once and shared by all sections.
          Sections without any article are left out.
          """
          soup = self.index_to_soup(self.ec_index_url)

          feeds = []
          for section in self.ec_sections:
              articles = self.ec_parse_section(self.ec_index_url, section, soup=soup)
              if articles:
                  feeds.append((section, articles))

          return feeds
  87.