home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / watchingamerica.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  97 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  3.  
  4. class WatchingAmericaRecipe(BasicNewsRecipe):
  5.     __license__  = 'GPL v3'
  6.     __author__ = 'kwetal'
  7.     language = 'en'
  8.     version = 1
  9.  
  10.     title = u'Watching America'
  11.     publisher = u'watchingamerica.com'
  12.     category = u'News'
  13.     description = u'Global opinion about the United States'
  14.  
  15.     oldest_article = 7
  16.     max_articles_per_feed = 100
  17.     use_embedded_content = False
  18.  
  19.     no_stylesheets = True
  20.     remove_javascript = True
  21.     remove_attributes = ['style']
  22.  
  23.     extra_css = '''
  24.                     body{font-family:verdana,arial,helvetica,geneva,sans-serif ;}
  25.                     .main_content em {font-size: x-small; font-style: italic; color: #696969;}
  26.                     .main_content span strong {font-size: x-large; font-weight: bold;}
  27.                     .insideitro {font-size: xx-small; font-style: italic; color: #666666;}
  28.                     span {padding: 0em; margin 0em;}
  29.                 '''
  30.  
  31.     INDEX = u'http://watchingamerica.com/News/'
  32.  
  33.     def parse_index(self):
  34.         answer = []
  35.  
  36.         soup = self.index_to_soup(self.INDEX)
  37.  
  38.         articles = []
  39.         feature = soup.find('div', attrs = {'id': 'headzone'})
  40.         if feature:
  41.             link = feature.find('a', attrs = {'class': 'feature'})
  42.             url = link.get('href', None)
  43.             title = self.tag_to_string(link)
  44.             description = self.tag_to_string(feature.find('h1', attrs = {'class': 'pull'}))
  45.             article = {'title': title, 'date': u'', 'url': url, 'description': description}
  46.             articles.append(article)
  47.             answer.append(('Feature', articles))
  48.  
  49.         feed_titles = ['Translations from the West', 'Translations from the East']
  50.         for i in range(1, 3):
  51.             articles = []
  52.             div = soup.find('div', attrs = {'class': 'newscol' + str(i)})
  53.             if div:
  54.                 for link in div.findAll('a', attrs = {'class': 'headline'}):
  55.                     url = link.get('href', None)
  56.                     title = self.tag_to_string(link)
  57.  
  58.                     description = None
  59.                     h3 = link.findNextSibling('h3')
  60.                     if h3:
  61.                         description = self.tag_to_string(h3)
  62.  
  63.                     article = {'title': title, 'date': u'', 'url': url, 'description': description}
  64.                     articles.append(article)
  65.             answer.append((feed_titles[i - 1], articles))
  66.  
  67.         return answer
  68.  
  69.     def preprocess_html(self, soup):
  70.         freshSoup = self.get_fresh_soup(soup)
  71.         article = soup.find('p', attrs = {'class': 'MsoNormal'}).parent
  72.         if article:
  73.             article.name = 'div'
  74.             del article['width']
  75.             article['class'] = 'main_content'
  76.             org = article.find('a', attrs = {'href': '?SHOW_ORIGINAL_TEXT'})
  77.             if org:
  78.                 org.parent.extract()
  79.  
  80.             intro = article.find('span', attrs = {'class': 'insideitro'})
  81.             if intro:
  82.                 for el in intro.findAll(['strong', 'em', 'br']):
  83.                     if el.name == 'br':
  84.                         el.extract()
  85.                     else:
  86.                         el.name = 'div'
  87.  
  88.             freshSoup.body.append(article)
  89.  
  90.         return freshSoup
  91.  
  92.     def get_fresh_soup(self, oldSoup):
  93.         freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
  94.         if oldSoup.head.title:
  95.             freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
  96.         return freshSoup
  97.