home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / discover_magazine.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  87 lines

  1. #!/usr/bin/env  python
  2. __license__   = 'GPL v3'
  3. __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
  4. __docformat__ = 'restructuredtext en'
  5.  
  6. '''
  7. discovermagazine.com
  8. '''
  9.  
  10. import re
  11. from calibre.web.feeds.news import BasicNewsRecipe
  12.  
  13. class DiscoverMagazine(BasicNewsRecipe):
  14.  
  15.     title = u'Discover Magazine'
  16.     description = u'Science, Technology and the Future'
  17.     __author__ = 'Starson17'
  18.     language = 'en'
  19.  
  20.     oldest_article = 33
  21.     max_articles_per_feed = 20
  22.     no_stylesheets = True
  23.     remove_javascript = True
  24.     use_embedded_content  = False
  25.     encoding = 'utf-8'
  26.     extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'
  27.  
  28.     remove_tags = [
  29.                    dict(name='div', attrs={'id':['searchModule', 'mainMenu', 'tool-box']}),
  30.                    dict(name='div', attrs={'id':['footer','teaser','already-subscriber','teaser-suite','related-articles']}),
  31.                    dict(name='div', attrs={'class':['column']}),
  32.                    dict(name='img', attrs={'src':'http://discovermagazine.com/onebyone.gif'})]
  33.  
  34.     remove_tags_after = [dict(name='div', attrs={'class':'listingBar'})]
  35.  
  36.     def append_page(self, soup, appendtag, position):
  37.         pager = soup.find('span',attrs={'class':'next'})
  38.         if pager:
  39.            nexturl = pager.a['href']
  40.            soup2 = self.index_to_soup(nexturl)
  41.            texttag = soup2.find('div', attrs={'class':'articlebody'})
  42.            newpos = len(texttag.contents)
  43.            self.append_page(soup2,texttag,newpos)
  44.            texttag.extract()
  45.            appendtag.insert(position,texttag)
  46.  
  47.     def preprocess_html(self, soup):
  48.         mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
  49.         soup.head.insert(0,mtag)
  50.         self.append_page(soup, soup.body, 3)
  51.         pager = soup.find('div',attrs={'class':'listingBar'})
  52.         if pager:
  53.            pager.extract()
  54.         return soup
  55.  
  56.     def postprocess_html(self, soup, first_fetch):
  57.         for tag in soup.findAll(text=re.compile('^This article is a sample')):
  58.             tag.parent.extract()
  59.         for tag in soup.findAll(['table', 'tr', 'td']):
  60.             tag.name = 'div'
  61.         for tag in soup.findAll('div', attrs={'class':'discreet advert'}):
  62.             tag.extract()
  63.         for tag in soup.findAll('hr', attrs={'size':'1'}):
  64.             tag.extract()
  65.         for tag in soup.findAll('br'):
  66.             tag.extract()
  67.         return soup
  68.  
  69.     feeds = [
  70.              (u'Technology', u'http://discovermagazine.com/topics/technology/rss.xml'),
  71.              (u'Health - Medicine', u'http://discovermagazine.com/topics/health-medicine/rss.xml'),
  72.              (u'Mind Brain', u'http://discovermagazine.com/topics/mind-brain/rss.xml'),
  73.              (u'Space', u'http://discovermagazine.com/topics/space/rss.xml'),
  74.              (u'Human Origins', u'http://discovermagazine.com/topics/human-origins/rss.xml'),
  75.              (u'Living World', u'http://discovermagazine.com/topics/living-world/rss.xml'),
  76.              (u'Environment', u'http://discovermagazine.com/topics/environment/rss.xml'),
  77.              (u'Physics & Math', u'http://discovermagazine.com/topics/physics-math/rss.xml'),
  78.              (u"20 Things you didn't know about...", u'http://discovermagazine.com/columns/20-things-you-didnt-know/rss.xml'),
  79.              (u'Fuzzy Math', u'http://discovermagazine.com/columns/fuzzy-math/rss.xml'),
  80.              (u'The Brain', u'http://discovermagazine.com/columns/the-brain/rss.xml'),
  81.              (u'What is This', u'http://discovermagazine.com/columns/what-is-this/rss.xml'),
  82.              (u'Vital Signs', u'http://discovermagazine.com/columns/vital-signs/rss.xml'),
  83.              (u'Think Tech', u'http://discovermagazine.com/columns/think-tech/rss.xml'),
  84.              (u'Future Tech', u'http://discovermagazine.com/columns/future-tech/rss.xml'),
  85.              (u'Discover Interview', u'http://discovermagazine.com/columns/discover-interview/rss.xml'),
  86.             ]
  87.