home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / nspm.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  117 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. nspm.rs
  5. '''
  6.  
  7. import re
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9. from calibre.ebooks.BeautifulSoup import NavigableString, Tag
  10.  
  11. class Nspm(BasicNewsRecipe):
  12.     title                 = 'Nova srpska politicka misao'
  13.     __author__            = 'Darko Miletic'
  14.     description           = 'Casopis za politicku teoriju i drustvena istrazivanja'
  15.     publisher             = 'NSPM'
  16.     category              = 'news, politics, Serbia'
  17.     oldest_article        = 7
  18.     max_articles_per_feed = 100
  19.     no_stylesheets        = True
  20.     use_embedded_content  = False
  21.     INDEX                 = 'http://www.nspm.rs/?alphabet=l'
  22.     encoding              = 'utf-8'
  23.     language              = 'sr'
  24.     remove_empty_feeds    = True
  25.     publication_type      = 'magazine'
  26.     masthead_url          = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
  27.     extra_css             = """ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
  28.                                 @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
  29.                                 body{font-family: "Times New Roman", serif1, serif}
  30.                                 .article_description{font-family: Arial, sans1, sans-serif}
  31.                                 img{margin-top:0.5em; margin-bottom: 0.7em; display: block}
  32.                                 .author{color: #990000; font-weight: bold}
  33.                                 .author,.createdate{font-size: 0.9em} """
  34.  
  35.     conversion_options = {
  36.                           'comment'      : description
  37.                         , 'tags'         : category
  38.                         , 'publisher'    : publisher
  39.                         , 'language'     : language
  40.                         , 'pretty_print' : True
  41.                         }
  42.  
  43.     preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
  44.     remove_tags        = [dict(name=['link','script','meta','base','img'])]
  45.     remove_attributes  = ['width','height','lang','xmlns:fb','xmlns:og','vspace','hspace','type','start','size']
  46.  
  47.     def get_browser(self):
  48.         br = BasicNewsRecipe.get_browser()
  49.         br.open(self.INDEX)
  50.         return br
  51.  
  52.     feeds = [
  53.                  (u'Rubrike'                 , u'http://www.nspm.rs/rubrike/feed/rss.html'                 )
  54.                 ,(u'Debate'                  , u'http://www.nspm.rs/debate/feed/rss.html'                  )
  55.                 ,(u'Reci i misli'            , u'http://www.nspm.rs/reci-i-misli/feed/rss.html'            )
  56.                 ,(u'Samo smeh srbina spasava', u'http://www.nspm.rs/samo-smeh-srbina-spasava/feed/rss.html')
  57.                 ,(u'Polemike'                , u'http://www.nspm.rs/polemike/feed/rss.html'                )
  58.                 ,(u'Prikazi'                 , u'http://www.nspm.rs/prikazi/feed/rss.html'                 )
  59.                 ,(u'Prenosimo'               , u'http://www.nspm.rs/prenosimo/feed/rss.html'               )
  60.                 ,(u'Hronika'                 , u'http://www.nspm.rs/tabela/hronika/feed/rss.html'          )
  61.             ]
  62.  
  63.     def preprocess_html(self, soup):
  64.         atitle = soup.body.find('a',attrs={'class':'contentpagetitle'})
  65.         if atitle:
  66.            cleanTitle = Tag(soup,'h1',[('class','contentpagetitle')])
  67.            cnt        = NavigableString(self.tag_to_string(atitle))
  68.            cleanTitle.append(cnt)
  69.            
  70.         author = soup.body.find('span',attrs={'class':'author'})
  71.         if author:
  72.            author.extract()
  73.            author.name = 'div'
  74.            
  75.         crdate = soup.body.find('td',attrs={'class':'createdate'})
  76.         if crdate:
  77.            cleanCrdate = Tag(soup,'div',[('class','createdate')])
  78.            cnt         = NavigableString(self.tag_to_string(crdate))
  79.            cleanCrdate.append(cnt)
  80.  
  81.            #get the dependant element
  82.            artText = Tag(soup,'div',[('class','text')])
  83.            textHolderp = crdate.parent
  84.            textHolder = textHolderp.nextSibling
  85.            while textHolder and (not isinstance(textHolder,Tag) or (textHolder.name <> textHolderp.name)):
  86.                  textHolder = textHolder.nextSibling
  87.            if textHolder.td:
  88.               artText          = textHolder.td
  89.               artText.name     = 'div'
  90.               artText.attrs    = []
  91.               artText['class'] = 'text'
  92.               artText.extract()
  93.            
  94.            soup.body.contents=[]
  95.  
  96.            soup.body.append(cleanTitle)
  97.            soup.body.append(author)
  98.            soup.body.append(cleanCrdate)
  99.            soup.body.append(artText)
  100.  
  101.         for item in soup.findAll('a'):
  102.             limg = item.find('img')
  103.             if item.string is not None:
  104.                str = item.string
  105.                item.replaceWith(str)
  106.             else:
  107.                if limg:
  108.                   item.name = 'div'
  109.                   item.attrs = []
  110.                else:
  111.                    str = self.tag_to_string(item)
  112.                    item.replaceWith(str)
  113.         for item in soup.findAll('img'):
  114.             if not item.has_key('alt'):
  115.                item['alt'] = 'image'
  116.         return soup
  117.