home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / harpers_full.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  2.9 KB  |  73 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. harpers.org - paid subscription/ printed issue articles
  5. This recipe only gets articles published in text format;
  6. images and PDFs are ignored
  7. '''
  8.  
  9. from calibre import strftime
  10. from calibre.web.feeds.news import BasicNewsRecipe
  11.  
  12. class Harpers_full(BasicNewsRecipe):
  13.     title                 = "Harper's Magazine - articles from printed edition"
  14.     __author__            = 'Darko Miletic'
  15.     description           = "Harper's Magazine: Founded June 1850."
  16.     publisher             = "Harpers's"
  17.     category              = 'news, politics, USA'
  18.     oldest_article        = 30
  19.     max_articles_per_feed = 100
  20.     no_stylesheets        = True
  21.     use_embedded_content  = False
  22.     delay                 = 1
  23.     language              = 'en'
  24.     needs_subscription    = True
  25.     masthead_url          = 'http://www.harpers.org/media/image/Harpers_305x100.gif'
  26.     publication_type      = 'magazine'    
  27.     INDEX                 = strftime('http://www.harpers.org/archive/%Y/%m')
  28.     LOGIN                 = 'http://www.harpers.org'
  29.     cover_url             = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
  30.     extra_css             = ' body{font-family: "Georgia",serif} '
  31.  
  32.     conversion_options = {
  33.                           'comment'          : description
  34.                         , 'tags'             : category
  35.                         , 'publisher'        : publisher
  36.                         , 'language'         : language
  37.                         }
  38.  
  39.     keep_only_tags = [ dict(name='div', attrs={'id':'cached'}) ]
  40.     remove_tags = [
  41.                      dict(name='table', attrs={'class':['rcnt','rcnt topline']})
  42.                     ,dict(name='link')
  43.                   ]
  44.     remove_attributes=['xmlns']              
  45.  
  46.     def get_browser(self):
  47.         br = BasicNewsRecipe.get_browser()
  48.         if self.username is not None and self.password is not None:
  49.             br.open(self.LOGIN)
  50.             br.select_form(nr=1)
  51.             br['handle'  ] = self.username
  52.             br['password'] = self.password
  53.             br.submit()
  54.         return br
  55.  
  56.     def parse_index(self):
  57.         articles = []
  58.         print 'Processing ' + self.INDEX
  59.         soup = self.index_to_soup(self.INDEX)
  60.         for item in soup.findAll('div', attrs={'class':'title'}):
  61.             text_link = item.parent.find('img',attrs={'alt':'Text'})
  62.             if text_link:
  63.                 url   = self.LOGIN + item.a['href']
  64.                 title = item.a.contents[0]
  65.                 date  = strftime(' %B %Y')
  66.                 articles.append({
  67.                                   'title'      :title
  68.                                  ,'date'       :date
  69.                                  ,'url'        :url
  70.                                  ,'description':''
  71.                                 })
  72.         return [(soup.head.title.string, articles)]
  73.