home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / borba.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  4.1 KB  |  96 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
  5.  
  6. '''
  7. borba.rs
  8. '''
  9.  
  10. import re
  11. from calibre.web.feeds.news import BasicNewsRecipe
  12.  
  13. class Borba(BasicNewsRecipe):
  14.     title                 = 'Borba Online'
  15.     __author__            = 'Darko Miletic'
  16.     description           = 'Dnevne novine Borba Online'
  17.     publisher             = 'IP Novine Borba'
  18.     category              = 'news, politics, Serbia'    
  19.     language = 'sr'
  20.  
  21.     lang                  = _('sr-Latn-RS')
  22.     oldest_article        = 2
  23.     max_articles_per_feed = 100
  24.     no_stylesheets        = True
  25.     encoding              = 'utf-8'
  26.     use_embedded_content  = False
  27.     cover_url             = 'http://www.borba.rs/images/stories/novine/naslovna_v.jpg'
  28.     INDEX                 = u'http://www.borba.rs/'
  29.     extra_css = ' @font-face {font-family: "serif1"; src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} .contentheading{font-size: x-large; font-weight: bold} .createdate{font-size: small; font-weight: bold} '
  30.     
  31.     conversion_options = {
  32.                           'comment'          : description
  33.                         , 'tags'             : category
  34.                         , 'publisher'        : publisher
  35.                         , 'language'         : lang
  36.                         , 'pretty_print'     : True
  37.                         }
  38.      
  39.     preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
  40.  
  41.     keep_only_tags = [dict(name='div', attrs={'class':'main'})]
  42.  
  43.     remove_tags_after = dict(name='div',attrs={'id':'written_comments_title'})
  44.  
  45.     remove_tags = [
  46.                      dict(name=['object','link','iframe','base','img'])
  47.                     ,dict(name='div',attrs={'id':'written_comments_title'})
  48.                   ]
  49.  
  50.     feeds = [
  51.                (u'Najnovije vesti', u'http://www.borba.rs/content/blogsection/28/105/')
  52.               ,(u'Prvi plan'      , u'http://www.borba.rs/content/blogsection/4/92/'  )
  53.               ,(u'Dogadjaji'      , u'http://www.borba.rs/content/blogsection/21/83/' )
  54.               ,(u'Ekonomija'      , u'http://www.borba.rs/content/blogsection/5/35/'  )
  55.               ,(u'Komentari'      , u'http://www.borba.rs/content/blogsection/23/94/' )
  56.               ,(u'Svet'           , u'http://www.borba.rs/content/blogsection/7/36/'  )
  57.               ,(u'Sport'          , u'http://www.borba.rs/content/blogsection/6/37/'  )
  58.               ,(u'Fama'           , u'http://www.borba.rs/content/blogsection/25/89/' )
  59.               ,(u'B2 Dodatak'     , u'http://www.borba.rs/content/blogsection/30/116/')
  60.             ]
  61.  
  62.     def preprocess_html(self, soup):
  63.         attribs = [  'style','font','valign'
  64.                     ,'colspan','width','height'
  65.                     ,'rowspan','summary','align'
  66.                     ,'cellspacing','cellpadding'
  67.                     ,'frames','rules','border'
  68.                   ]
  69.         for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
  70.             item.name = 'div'
  71.             for attrib in attribs:
  72.                 if item.has_key(attrib):
  73.                    del item[attrib]            
  74.         return soup
  75.  
  76.     def parse_index(self):
  77.         totalfeeds = []
  78.         lfeeds = self.get_feeds()
  79.         for feedobj in lfeeds:
  80.             feedtitle, feedurl = feedobj
  81.             self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
  82.             articles = []
  83.             soup = self.index_to_soup(feedurl)
  84.             for item in soup.findAll('a', attrs={'class':'contentpagetitle'}):
  85.                 url         = item['href']
  86.                 title       = self.tag_to_string(item)
  87.                 articles.append({
  88.                                       'title'      :title
  89.                                      ,'date'       :''
  90.                                      ,'url'        :url
  91.                                      ,'description':''
  92.                                     })
  93.             totalfeeds.append((feedtitle, articles))
  94.         return totalfeeds
  95.         
  96.