home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / ieeespectrum.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  64 lines

  1.  
  2. __license__   = 'GPL v3'
  3. __copyright__ = '2010, Franco Venturi <fventuri at comcast.net>'
  4. '''
  5. spectrum.ieee.org
  6. '''
  7.  
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9. from string import capwords
  10. from urlparse import urljoin
  11.  
  12. class IEEESpectrum(BasicNewsRecipe):
  13.     title                 = 'IEEE Spectrum'
  14.     __author__            = 'Franco Venturi'
  15.     description           = 'Electronics News from IEEE'
  16.     publisher             = 'IEEE'
  17.     category              = 'news, electronics, IT, computer science'
  18.     oldest_article        = 32
  19.     max_articles_per_feed = 100
  20.     no_stylesheets        = True
  21.     use_embedded_content  = False
  22.     language              = 'en'
  23.     index                 = 'http://spectrum.ieee.org/magazine/'
  24.     masthead_url          = 'http://spectrum.ieee.org/images/logo_hdr.png'
  25.  
  26.     remove_javascript     = True
  27.     remove_tags           = [dict(name={'script':True, 'object':True})]
  28.     remove_attributes     = ['height','width','alt']
  29.     keep_only_tags        = [dict(attrs={'class': {'artSctn':True, 'artTitle':True, 'dekTitle': True}}), dict(attrs={'id':'artBody'})]
  30.  
  31.  
  32.     def parse_index(self):
  33.         soup = self.index_to_soup(self.index)
  34.         img = soup.find('img', image='cover.gif', src=True)
  35.         if img is not None:
  36.             self.cover_url = 'http://spectrum.ieee.org'+img['src']
  37.  
  38.         content = soup.find(id='gnrlContent')
  39.         title = content.find(attrs={'class':'style4'}).string.strip()
  40.         date = ' '.join(title.split()[0:2])
  41.         self.timefmt = ' [' + date + ']'
  42.         contents = []
  43.         for tag in content.findAll(attrs={'class': {'style2':True, 'lstngTitle':True, 'lstngBody': True}}):
  44.             if tag['class'] == 'style2':
  45.                 contents.append((capwords(tag.renderContents().strip()), []))
  46.             elif tag['class'] == 'lstngTitle':
  47.                 url = urljoin(self.index, tag.findPrevious('a')['href']) + '/0'
  48.                 contents[-1][1].append({'title': tag.renderContents().strip(),
  49.                                         'url': url,
  50.                                         'date': date,
  51.                                         'description': '',
  52.                                         'content': ''
  53.                                        })
  54.             elif tag['class'] == 'lstngBody':
  55.                 contents[-1][1][-1]['description'] = tag.renderContents().strip()
  56.  
  57.         return contents
  58.  
  59.     def preprocess_html(self, soup):
  60.         for a in soup.findAll('a'):
  61.             if not a['href'].lower().startswith('http'):
  62.                a['href'] = urljoin(self.index, a['href'])
  63.         return soup
  64.