home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / haaretz_en.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  97 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. www.haaretz.com
  5. '''
  6.  
  7. import re
  8. from calibre import strftime
  9. from time import gmtime
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11.  
  12. class HaaretzPrint_en(BasicNewsRecipe):
  13.     title                 = 'Haaretz - print edition'
  14.     __author__            = 'Darko Miletic'
  15.     description           = "Haaretz.com is the world's leading English-language Website for real-time news and analysis of Israel and the Middle East."
  16.     publisher             = 'Haaretz'
  17.     category              = "news, Haaretz, Israel news, Israel newspapers, Israel business news, Israel financial news, Israeli news,Israeli newspaper, Israeli newspapers, news from Israel, news in Israel, news Israel, news on Israel, newspaper Israel, Israel sports news, Israel diplomacy news"
  18.     oldest_article        = 2
  19.     max_articles_per_feed = 200
  20.     no_stylesheets        = True
  21.     encoding              = 'utf8'
  22.     use_embedded_content  = False
  23.     language              = 'en_IL'
  24.     publication_type      = 'newspaper'
  25.     PREFIX                = 'http://www.haaretz.com'
  26.     masthead_url          = PREFIX + '/images/logos/logoGrey.gif'
  27.     extra_css             = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } '
  28.  
  29.     preprocess_regexps = [(re.compile(r'</body>.*?</html>', re.DOTALL|re.IGNORECASE),lambda match: '</body></html>')]
  30.  
  31.     conversion_options = {
  32.                           'comment'  : description
  33.                         , 'tags'     : category
  34.                         , 'publisher': publisher
  35.                         , 'language' : language
  36.                         }
  37.  
  38.     keep_only_tags    = [dict(attrs={'id':'threecolumns'})]
  39.     remove_attributes = ['width','height']
  40.     remove_tags       = [
  41.                            dict(name=['iframe','link','object','embed'])
  42.                           ,dict(name='div',attrs={'class':'rightcol'})
  43.                         ]
  44.  
  45.  
  46.     feeds = [
  47.               (u'News'          , PREFIX + u'/print-edition/news'         )
  48.              ,(u'Opinion'       , PREFIX + u'/print-edition/opinion'      )
  49.              ,(u'Business'      , PREFIX + u'/print-edition/business'     )
  50.              ,(u'Real estate'   , PREFIX + u'/print-edition/real-estate'  )
  51.              ,(u'Sports'        , PREFIX + u'/print-edition/sports'       )
  52.              ,(u'Travel'        , PREFIX + u'/print-edition/travel'       )
  53.              ,(u'Books'         , PREFIX + u'/print-edition/books'        )
  54.              ,(u'Food & Wine'   , PREFIX + u'/print-edition/food-wine'    )
  55.              ,(u'Arts & Leisure', PREFIX + u'/print-edition/arts-leisure' )
  56.              ,(u'Features'      , PREFIX + u'/print-edition/features'     )
  57.             ]
  58.  
  59.  
  60.     def print_version(self, url):
  61.         article = url.rpartition('/')[2]
  62.         return 'http://www.haaretz.com/misc/article-print-page/' + article
  63.  
  64.     def parse_index(self):
  65.         totalfeeds = []
  66.         lfeeds = self.get_feeds()
  67.         for feedobj in lfeeds:
  68.             feedtitle, feedurl = feedobj
  69.             self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
  70.             articles = []
  71.             soup = self.index_to_soup(feedurl)
  72.             for item in soup.findAll(attrs={'class':'text'}):
  73.                 sp = item.find('span',attrs={'class':'h3 font-weight-normal'})
  74.                 desc = item.find('p')
  75.                 description = ''
  76.                 if sp:
  77.                     if desc:
  78.                        description = self.tag_to_string(desc)
  79.                     link        = sp.a
  80.                     url         = self.PREFIX + link['href']
  81.                     title       = self.tag_to_string(link)
  82.                     times        = strftime('%a, %d %b %Y %H:%M:%S +0000',gmtime())
  83.                     articles.append({
  84.                                           'title'      :title
  85.                                          ,'date'       :times
  86.                                          ,'url'        :url
  87.                                          ,'description':description
  88.                                         })
  89.             totalfeeds.append((feedtitle, articles))
  90.         return totalfeeds
  91.  
  92.  
  93.     def preprocess_html(self, soup):
  94.         for item in soup.findAll(style=True):
  95.             del item['style']
  96.         return soup
  97.