home *** CD-ROM | disk | FTP | other *** search
- from calibre.web.feeds.news import BasicNewsRecipe
- from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class NrcNextRecipe(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the nrcnext.nl news blog.

    The section pages of the website are scraped for article links (see
    ``parse_index``) and every downloaded article page is reduced to its
    post body (see ``preprocess_html``).
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 2

    title = u'nrcnext'
    publisher = u'NRC Media'
    category = u'News, Opinion, the Netherlands'
    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'

    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}

    no_stylesheets = True
    remove_javascript = True

    keep_only_tags = [dict(name='div', attrs={'id': 'main'})]

    # Page furniture that is stripped from every article: metadata, sharing
    # widgets, navigation, reader comments and category lists.
    # The trailing spaces in some class values are intentional; they are
    # part of the attribute values being matched.
    remove_tags = [
        dict(name='div', attrs={'class': 'meta'}),
        dict(name='p', attrs={'class': 'meta'}),
        dict(name='div', attrs={'class': 'datumlabel'}),
        dict(name='div', attrs={'class': 'sharing-is-caring'}),
        dict(name='div', attrs={'class': 'navigation'}),
        dict(name='div', attrs={'class': 'reageer'}),
        dict(name='div', attrs={'class': 'comment odd alt thread-odd thread-alt depth-1 reactie '}),
        dict(name='div', attrs={'class': 'comment even thread-even depth-1 reactie '}),
        dict(name='ul', attrs={'class': 'cats single'}),
        dict(name='ul', attrs={'class': 'cats onderwerpen'}),
        dict(name='ul', attrs={'class': 'cats rubrieken'}),
        dict(name='h3', attrs={'class': 'reacties'}),
    ]

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
        p.wp-caption-text {font-size: x-small; color: #666666;}
        h2.sub_title {font-size: medium; color: #696969;}
        h2.vlag {font-size: small; font-weight: bold;}
        '''

    def parse_index(self):
        """Scrape the section pages and return the feeds in website order.

        Returns a list of ``(section title, articles)`` tuples. Each article
        is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``. Sections for which no article was found are left
        out of the result.
        """
        # Use the website as an index. Their RSS feeds can be out of date.
        # The 'city life in afrika' section
        # (http://www.nrcnext.nl/city-life-in-afrika/, sort weight 4) is
        # currently disabled.
        feeds = {
            u'columnisten': u'http://www.nrcnext.nl/columnisten/',
            u'koken': u'http://www.nrcnext.nl/koken/',
            u'geld & werk': u'http://www.nrcnext.nl/geld-en-werk/',
            u'vandaag': u'http://www.nrcnext.nl',
        }
        # Lower weight sorts first; mirrors the section order on the website.
        weights = {u'columnisten': 1, u'koken': 3, u'geld & werk': 2, u'vandaag': 0}

        articles = {}
        indices = []
        for index, feed in feeds.items():
            soup = self.index_to_soup(feed)
            found = []
            for post in soup.findAll(True, attrs={'class': 'post '}):
                # Find the link to the actual article and remember the
                # location it points to and its title.
                a = post.find('a', attrs={'rel': 'bookmark'})
                href = a.get('href') if a is not None else None
                if not href:
                    # Not an article teaser we can follow; skip it rather
                    # than abort the whole download.
                    continue
                found.append({
                    'title': self._article_title(index, post, self.tag_to_string(a)),
                    'date': u'',
                    'url': href,
                    'description': '<p> </p>',
                })
            if found:
                articles[index] = found
            indices.append(index)

        # Sort the section titles in the order they appear on the website,
        # then apply that order to the collected articles.
        indices = self.sort_index_by(indices, weights)
        return [(key, articles[key]) for key in indices if key in articles]

    def _article_title(self, index, post, title):
        """Return the title to show for ``post`` in section ``index``.

        On the 'columnisten' page articles are written by different authors,
        so the author name from the post's ``h2.vlag`` heading is prefixed
        to the title. Falls back to the plain title when no author is found.
        """
        if index != 'columnisten':
            return title
        flag = post.find('h2', attrs={'class': 'vlag'})
        if flag is None or not flag.contents:
            return title
        # tag_to_string yields text; renderContents() would yield encoded
        # markup that cannot be joined reliably with the unicode title.
        author = self.tag_to_string(flag.contents[0]).strip()
        if not author:
            return title
        return u''.join([author, u': ', title])

    def preprocess_html(self, soup):
        """Reduce a single-article page to a fresh document holding the post.

        Headings inside the post are replaced by plain-text versions, and
        embedded videos and iframes are removed. Pages that are not a
        single-article page, or that have no post ``div``, are returned
        unchanged, so the method always returns a soup.
        """
        if not soup.find('div', attrs={'id': 'main', 'class': 'single'}):
            # This should never happen and other famous last words...
            return soup
        tag = soup.find('div', attrs={'class': 'post'})
        if tag is None:
            return soup

        # Prefer the 'vlag' heading; otherwise treat the first h2 as a
        # subtitle.
        h2 = tag.find('h2', 'vlag')
        if h2:
            self._replace_with_plain(soup, h2, 'h2', 'vlag')
        else:
            h2 = tag.find('h2')
            if h2:
                self._replace_with_plain(soup, h2, 'h2', 'sub_title')

        h1 = tag.find('h1')
        if h1:
            self._replace_with_plain(soup, h1, 'h1')

        # Embedded media slows down the reader.
        for css_class in ('vvqbox vvqvimeo', 'vvqbox vvqyoutube'):
            for movie in tag.findAll('span', attrs={'class': css_class}):
                movie.extract()
        for iframe in tag.findAll('iframe'):
            iframe.extract()

        fresh_soup = self.getFreshSoup(soup)
        fresh_soup.body.append(tag)
        return fresh_soup

    def _replace_with_plain(self, soup, old, name, css_class=None):
        """Replace ``old`` by a new ``name`` tag holding only its text."""
        if css_class is None:
            plain = Tag(soup, name)
        else:
            plain = Tag(soup, name, attrs=[('class', css_class)])
        plain.append(self.tag_to_string(old))
        old.replaceWith(plain)

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document carrying over ``oldSoup``'s title.

        The title is copied only when ``oldSoup`` has both a ``head`` and a
        ``title`` element; otherwise the new title is left empty.
        """
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        head = oldSoup.head
        if head is not None and head.title:
            freshSoup.head.title.append(self.tag_to_string(head.title))
        return freshSoup