home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / nin.recipe < prev    next >
Text File  |  2011-09-09  |  6KB  |  152 lines

  1.  
  2. __license__   = 'GPL v3'
  3. __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
  4. '''
  5. www.nin.co.rs
  6. '''
  7.  
  8. import re
  9. from calibre import strftime
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11. from contextlib import closing
  12. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  13. from calibre import entity_to_unicode
  14.  
  15. class Nin(BasicNewsRecipe):
  16.     title                  = 'NIN online'
  17.     __author__             = 'Darko Miletic'
  18.     description            = 'Nedeljne Informativne Novine'
  19.     publisher              = 'NIN d.o.o. - Ringier d.o.o.'
  20.     category               = 'news, politics, Serbia'
  21.     no_stylesheets         = True
  22.     delay                  = 1
  23.     oldest_article         = 15
  24.     encoding               = 'utf-8'
  25.     needs_subscription     = True
  26.     remove_empty_feeds     = True
  27.     PREFIX                 = 'http://www.nin.co.rs'
  28.     INDEX                  = PREFIX + '/?change_lang=ls'
  29.     use_embedded_content   = False
  30.     language               = 'sr'
  31.     publication_type       = 'magazine'
  32.     extra_css              = """
  33.                                  @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
  34.                                  body{font-family: Verdana, Lucida, sans1, sans-serif}
  35.                                  .article_description{font-family: Verdana, Lucida, sans1, sans-serif}
  36.                                  .artTitle{font-size: x-large; font-weight: bold; color: #900}
  37.                                  .izjava{font-size: x-large; font-weight: bold}
  38.                                  .columnhead{font-size: small; font-weight: bold;}
  39.                                  img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
  40.                                  b{margin-top: 1em}
  41.                              """
  42.  
  43.     conversion_options = {
  44.                           'comment'   : description
  45.                         , 'tags'      : category
  46.                         , 'publisher' : publisher
  47.                         , 'language'  : language
  48.                         }
  49.  
  50.     preprocess_regexps = [
  51.                            (re.compile(r'</body>.*?<html>', re.DOTALL|re.IGNORECASE),lambda match: '</body>')
  52.                           ,(re.compile(r'</html>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</html>')
  53.                           ,(re.compile(u'\u0110'), lambda match: u'\u00D0')
  54.                          ]
  55.  
  56.     def get_browser(self):
  57.         br = BasicNewsRecipe.get_browser()
  58.         if self.username is not None and self.password is not None:
  59.             br.open(self.INDEX)
  60.             br.select_form(name='form1')
  61.             br['login_name'    ] = self.username
  62.             br['login_password'] = self.password
  63.             br.submit()
  64.         return br
  65.  
  66.     keep_only_tags    =[dict(name='td', attrs={'width':'520'})]
  67.     remove_tags_before =dict(name='span', attrs={'class':'izjava'})
  68.     remove_tags_after =dict(name='html')
  69.     remove_tags = [dict(name=['object','link','iframe','meta','base'])]
  70.     remove_attributes=['border','background','height','width','align','valign']
  71.  
  72.     def get_cover_url(self):
  73.         cover_url = None
  74.         soup = self.index_to_soup(self.INDEX)
  75.         link_item = soup.find('img',attrs={'width':'100','border':'0'})
  76.         if link_item:
  77.            cover_url = self.PREFIX + link_item['src']
  78.         return cover_url
  79.  
  80.     def parse_index(self):
  81.         articles = []
  82.         count = 0
  83.         soup = self.index_to_soup(self.INDEX)
  84.         for item in soup.findAll('a',attrs={'class':'lmeninavFont'}):
  85.             count = count +1
  86.             if self.test and count > 2:
  87.                return articles
  88.             section  = self.tag_to_string(item)
  89.             feedlink = self.PREFIX + item['href']
  90.             feedpage = self.index_to_soup(feedlink)
  91.             self.report_progress(0, _('Fetching feed')+' %s...'%(section))
  92.             inarts   = []
  93.             for art in feedpage.findAll('span',attrs={'class':'artTitle'}):
  94.                 alink = art.parent
  95.                 url   = self.PREFIX + alink['href']
  96.                 title = self.tag_to_string(art)
  97.                 sparent = alink.parent
  98.                 alink.extract()
  99.                 description = self.tag_to_string(sparent)
  100.                 date = strftime(self.timefmt)
  101.                 inarts.append({
  102.                                   'title'      :title
  103.                                  ,'date'       :date
  104.                                  ,'url'        :url
  105.                                  ,'description':description
  106.                                 })
  107.             articles.append((section,inarts))
  108.         return articles
  109.  
  110.     def index_to_soup(self, url_or_raw, raw=False):
  111.         if re.match(r'\w+://', url_or_raw):
  112.             open_func = getattr(self.browser, 'open_novisit', self.browser.open)
  113.             with closing(open_func(url_or_raw)) as f:
  114.                 _raw = f.read()
  115.             if not _raw:
  116.                 raise RuntimeError('Could not fetch index from %s'%url_or_raw)
  117.         else:
  118.             _raw = url_or_raw
  119.         if raw:
  120.             return _raw
  121.         if not isinstance(_raw, unicode) and self.encoding:
  122.             if callable(self.encoding):
  123.                 _raw = self.encoding(_raw)
  124.             else:
  125.                 _raw = _raw.decode(self.encoding, 'replace')
  126.         massage = list(BeautifulSoup.MARKUP_MASSAGE)
  127.         enc = 'cp1252' if callable(self.encoding) or self.encoding is None else self.encoding
  128.         massage.append((re.compile(r'&(\S+?);'), lambda match:
  129.             entity_to_unicode(match, encoding=enc)))
  130.         massage.append((re.compile(r'[\x00-\x08]+'), lambda match:
  131.             ''))
  132.         return BeautifulSoup(_raw, markupMassage=massage)
  133.  
  134.     def preprocess_html(self, soup):
  135.         for item in soup.findAll(style=True):
  136.             del item['style']
  137.         for item in soup.findAll('div'):
  138.             if len(item.contents) == 0:
  139.                item.extract()
  140.         for item in soup.findAll(['td','tr']):
  141.             item.name='div'
  142.         for item in soup.findAll('img'):
  143.             if not item.has_key('alt'):
  144.                item['alt'] = 'image'
  145.         for tbl in soup.findAll('table'):
  146.             img = tbl.find('img')
  147.             if img:
  148.                img.extract()
  149.                tbl.replaceWith(img)
  150.         return soup
  151.  
  152.