
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
from calibre.ptempfile import PersistentTemporaryFile

class MotherJonesRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 1

    title = u'Mother Jones'
    publisher = u'Mother Jones'
    category = u'News, Investigative journalism'
    description = u'Independent investigative, political, and social justice reporting. Takes no prisoners, cleaves to no dogma, and tells it like it is.'

    oldest_article = 14
    max_articles_per_feed = 100
    use_embedded_content = False

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True
    simultaneous_downloads = 3

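    # Keep only the headline, the dek, the byline and the print-friendly article body.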
    keep_only_tags = []
    keep_only_tags.append(dict(name='h1'))
    keep_only_tags.append(dict(name='div', attrs={'class': 'dek'}))
    keep_only_tags.append(dict(name='p', attrs={'class': 'submitted'}))
    keep_only_tags.append(dict(name='div', attrs={'class': 'print-content'}))
    #keep_only_tags.append(dict(name='', attrs={'': ''}))

    remove_tags = []
    remove_tags.append(dict(name='base'))
    #remove_tags.append(dict(name='', attrs={'': ''}))

    remove_attributes = ['style']

    # Feeds from http://motherjones.com/about/rss
    feeds = []
    feeds.append((u'Latest News', u'http://feeds.feedburner.com/motherjones/main?format=xml'))
    feeds.append((u'Politics & Current Affairs', u'http://motherjones.com/rss/sections/Politics/feed&format=xml'))
    feeds.append((u'Environment & Health', u'http://motherjones.com/rss/sections/Environment/feed'))
    feeds.append((u'Media & Culture', u'http://motherjones.com/rss/sections/Media/feed'))
    feeds.append((u'Blog: Kevin Drum', u'http://motherjones.com/rss/blogs/Kevin+Drum/feed'))
    feeds.append((u'Blog: MoJo Blog', u'http://motherjones.com/rss/blogs/mojo/feed'))
    feeds.append((u'Blog: Blue Marble', u'http://motherjones.com/rss/blogs/Blue+Marble/feed'))
    feeds.append((u'Blog: The Riff', u'http://motherjones.com/rss/blogs/Riff/feed'))
    ##feeds.append((u'', u''))

    extra_css = '''
                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
                img {float: left; margin-right: 0.5em;}
                div.dek {font-style: italic;}
                p.submitted {font-size: x-small; color: #696969;}
                div.mj_support {font-size: x-small; color: #666666; border: 1px solid black; padding: 0.5em;}
                a, a[href] {text-decoration: none; color: blue;}
                '''

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

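    # With articles_are_obfuscated set, calibre fetches each article through
    # get_obfuscated_article() below and reads the local file that it returns.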
    temp_files = []
    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        '''
        The print version is somewhat hard to get: the site appears to check the Referer
        header and serves the regular page when it is not right. This method works around that.
        '''
        br = self.get_browser()
        br.open(url)

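        # Follow the first link on the page whose URL matches the print-version pattern.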
        response = br.follow_link(url_regex=r'/print/[0-9]+', nr=0)
        html = response.read()

        self.temp_files.append(PersistentTemporaryFile('_motherjones.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()

        return self.temp_files[-1].name

    def get_article_url(self, article):
        '''
        Some of the feeds are served by FeedBurner, and the print-version workaround above
        does not work on FeedBurner links. Return the original article URL instead.
        '''
        if hasattr(article, 'feedburner_origlink'):
            return article.feedburner_origlink
        else:
            return article.link

    def preprocess_html(self, soup):
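        # Make relative image URLs absolute so that the images get downloaded.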
        for img in soup.findAll('img', attrs={'src': True}):
            if not img['src'].startswith('http://'):
                img['src'] = 'http://motherjones.com' + img['src']

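        # Append a short support notice to the article body; div.mj_support is styled in extra_css above.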
        div = Tag(soup, 'div', [('class', 'mj_support')])
        div.append('''Your tax-deductible gifts help keep Mother Jones independent and uncompromised.
                      To make a contribution, visit MotherJones.com or call 877-GIV-MOJO.
                   ''')
        soup.body.append(div)

        return soup