home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / nymag.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  2.6 KB  |  75 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  5. '''
  6. nymag.com
  7. '''
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9.  
  10. class NewYorkMagazine(BasicNewsRecipe):
  11.  
  12.     title       = 'New York Magazine'
  13.     __author__  = 'Kovid Goyal'
  14.     description = 'Food, culture, arts and entertainment in New York'
  15.     language    = 'en'
  16.     no_stylesheets = True
  17.     remove_javascript = True
  18.     encoding = 'iso-8859-1'
  19.     recursions = 1
  20.     match_regexps = [r'http://nymag.com/.+/index[0-9]{1,2}.html$']
  21.     keep_only_tags = [dict(id='main')]
  22.     remove_tags = [
  23.             dict(attrs={'class':['start-discussion']}),
  24.             dict(id=['minibrowserbox', 'article-related', 'article-tools'])
  25.             ]
  26.  
  27.     PREFIX = 'http://nymag.com'
  28.  
  29.     def nymag_get_index(self):
  30.         return self.index_to_soup('http://nymag.com/includes/tableofcontents.htm')
  31.  
  32.     def parse_index(self):
  33.         soup = self.nymag_get_index()
  34.         self.cover_url = soup.find(attrs={'class':'cover'}).find('img',
  35.                 src=True).get('src')
  36.         feeds = []
  37.         current_section = 'Cover Story'
  38.         current_articles = []
  39.         for h in soup.findAll(['h4', 'h5']):
  40.             if h.name == 'h4':
  41.                 if current_section and current_articles:
  42.                     feeds.append((current_section, current_articles))
  43.                 current_section = self.tag_to_string(h)
  44.                 self.log('\tFound section:', current_section)
  45.                 current_articles = []
  46.             elif h.name == 'h5':
  47.                 title = self.tag_to_string(h)
  48.                 a = h.find('a', href=True)
  49.                 if a is not None:
  50.                     url = a.get('href')
  51.                     if url.startswith('/'):
  52.                         url = self.PREFIX + url
  53.                     if title and url:
  54.                         self.log('\t\tFound article:', title)
  55.                         self.log('\t\t\t', url)
  56.                         desc = ''
  57.                         p = h.findNextSibling('p')
  58.                         if p is not None:
  59.                             desc = self.tag_to_string(p)
  60.                             self.log('\t\t\t', desc)
  61.                         current_articles.append({'title':title, 'url':url,
  62.                             'date':'', 'description':desc})
  63.         return feeds
  64.  
  65.     def postprocess_html(self, soup, first):
  66.         for x in soup.findAll(attrs={'class':'page-navigation'}):
  67.             x.extract()
  68.         if not first:
  69.             for x in soup.findAll(attrs={'class':'header-spacing'}):
  70.                 x.extract()
  71.         return soup
  72.  
  73.  
  74.  
  75.