
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import Tag, BeautifulSoup
from calibre.utils.magick import Image, PixelWand
from urllib2 import Request, urlopen, URLError

class Estadao(BasicNewsRecipe):
    THUMBALIZR_API        = '' # Get your API key at http://www.thumbalizr.com/ and put it here
    LANGUAGE              = 'pt_br'
    language              = 'pt'
    LANGHTM               = 'pt-br'
    ENCODING              = 'utf'
    ENCHTM                = 'utf-8'
    directionhtm          = 'ltr'
    requires_version      = (0,7,47)
    news                  = True

    title                 = u'Estad\xe3o'
    __author__            = 'Euler Alves'
    description           = u'Brazilian news from Estad\xe3o'
    publisher             = u'Estad\xe3o'
    category              = 'news, rss'

    oldest_article        = 4
    max_articles_per_feed = 100
    summary_length        = 1000

    remove_javascript     = True
    no_stylesheets        = True
    use_embedded_content  = False
    remove_empty_feeds    = True
    timefmt               = ' [%d %b %Y (%a)]'

    # Date used to build the URL of the day's front-page image (CAPA):
    # the recipe looks two days back, and one day further before 10am.
    hoje                  = datetime.now()-timedelta(days=2)
    pubdate               = hoje.strftime('%a, %d %b')
    if hoje.hour<10:
        hoje = hoje-timedelta(days=1)
    CAPA                  = 'http://www.estadao.com.br/estadaodehoje/'+hoje.strftime('%Y%m%d')+'/img/capadodia.jpg'
    SCREENSHOT            = 'http://estadao.com.br/'
    cover_margins         = (0,0,'white')
    masthead_url          = 'http://www.estadao.com.br/estadao/novo/img/logo.png'

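    # Keep only the article body and strip navigation tabs, tag lists,
    # comment widgets, ads, inline links, iframes and scripts.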
    keep_only_tags = [dict(name='div', attrs={'class':['bb-md-noticia','corpo']})]
    remove_tags = [
        dict(name='div', attrs={'id':['bb-md-noticia-tabs']}),
        dict(name='div', attrs={'class':['tags','discussion','bb-gg adsense_container']}),
        dict(name='a'),
        dict(name='iframe'),
        dict(name='link'),
        dict(name='script'),
    ]

    feeds = [
        (u'\xDAltimas Not\xEDcias', u'http://www.estadao.com.br/rss/ultimas.xml'),
        (u'Manchetes', u'http://www.estadao.com.br/rss/manchetes.xml'),
        (u'Brasil', u'http://www.estadao.com.br/rss/brasil.xml'),
        (u'Internacional', u'http://www.estadao.com.br/rss/internacional.xml'),
        (u'Cinema', u'http://blogs.estadao.com.br/cinema/feed/'),
        (u'Planeta', u'http://www.estadao.com.br/rss/planeta.xml'),
        (u'Ci\xEAncia', u'http://www.estadao.com.br/rss/ciencia.xml'),
        (u'Sa\xFAde', u'http://www.estadao.com.br/rss/saude.xml'),
        (u'Pol\xEDtica', u'http://www.estadao.com.br/rss/politica.xml'),
    ]

    conversion_options = {
        'title'            : title,
        'comments'         : description,
        'publisher'        : publisher,
        'tags'             : category,
        'language'         : LANGUAGE,
        'linearize_tables' : True,
    }

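    # Drop inline styles and make sure each article page declares the
    # expected language and charset before conversion.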
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        if not soup.find(attrs={'http-equiv':'Content-Language'}):
            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
            soup.head.insert(0,meta0)
        if not soup.find(attrs={'http-equiv':'Content-Type'}):
            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
            soup.head.insert(0,meta1)
        return soup

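    # Rotate wide landscape images (wider than 590px) a quarter turn so they
    # fit the page; each image is opened and saved in place.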
    def postprocess_html(self, soup, first):
        # Process all the images; assumes the downloaded HTML points at the
        # locally saved copies.
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            width, height = img.size
            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
            pw = PixelWand()
            if width > height and width > 590:
                print 'Rotate image'
                img.rotate(pw, -90)
                img.save(iurl)
        return soup

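    # Cover: try the front-page scan of the current print edition (CAPA); if
    # the request fails or returns an HTML page instead of the image, fall
    # back to a Thumbalizr screenshot of the site (needs THUMBALIZR_API).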
    def get_cover_url(self):
        if self.THUMBALIZR_API:
            cover_url = self.CAPA
            pedido    = Request(self.CAPA)
            pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
            pedido.add_header('Accept-Charset',self.ENCHTM)
            pedido.add_header('Referer',self.SCREENSHOT)
            try:
                resposta   = urlopen(pedido)
                soup       = BeautifulSoup(resposta)
                cover_item = soup.find('body')
                if cover_item:
                    cover_url = 'http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
                return cover_url
            except URLError:
                cover_url = 'http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
                return cover_url
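
# Usage sketch (an illustration, not part of the original recipe): a recipe
# file like this one can be built directly from the command line with
# calibre's ebook-convert tool; the output name and format below are only
# examples.
#
#   ebook-convert estadao.recipe estadao.epub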