Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / ncrnext.recipe < prev next >

Wrap

Text File | 2011-09-09 | 6.0 KB | 135 lines

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


class NrcNextRecipe(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping the nrcnext.nl section pages.

    The section pages of the website are used as the index (see
    ``parse_index``) and every downloaded article is reduced to its
    ``div.post`` element with simplified headings (see ``preprocess_html``).
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 2

    title = u'nrcnext'
    publisher = u'NRC Media'
    category = u'News, Opinion, the Netherlands'
    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'

    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}

    no_stylesheets = True
    remove_javascript = True

    keep_only_tags = [dict(name='div', attrs={'id': 'main'})]

    # Metadata, sharing widgets, navigation, reader comments and category
    # lists. The trailing spaces in some class values are part of the match
    # string and must be kept.
    remove_tags = [
        dict(name='div', attrs={'class': 'meta'}),
        dict(name='p', attrs={'class': 'meta'}),
        dict(name='div', attrs={'class': 'datumlabel'}),
        dict(name='div', attrs={'class': 'sharing-is-caring'}),
        dict(name='div', attrs={'class': 'navigation'}),
        dict(name='div', attrs={'class': 'reageer'}),
        dict(name='div', attrs={'class': 'comment odd alt thread-odd thread-alt depth-1 reactie '}),
        dict(name='div', attrs={'class': 'comment even thread-even depth-1 reactie '}),
        dict(name='ul', attrs={'class': 'cats single'}),
        dict(name='ul', attrs={'class': 'cats onderwerpen'}),
        dict(name='ul', attrs={'class': 'cats rubrieken'}),
        dict(name='h3', attrs={'class': 'reacties'}),
    ]

    extra_css = '''
                body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
                p.wp-caption-text {font-size: x-small; color: #666666;}
                h2.sub_title {font-size: medium; color: #696969;}
                h2.vlag {font-size: small; font-weight: bold;}
                '''

    def parse_index(self):
        """Build the feed list by scraping the section pages of the website.

        Returns a list of ``(section title, articles)`` tuples, sorted with
        ``sort_index_by``. Each article is a dict with the keys ``title``,
        ``date``, ``url`` and ``description``. Sections for which no article
        was found are left out.
        """
        # Use the website as an index. Their RSS feeds can be out of date.
        feeds = {}
        feeds[u'columnisten'] = u'http://www.nrcnext.nl/columnisten/'
        feeds[u'koken'] = u'http://www.nrcnext.nl/koken/'
        feeds[u'geld & werk'] = u'http://www.nrcnext.nl/geld-en-werk/'
        feeds[u'vandaag'] = u'http://www.nrcnext.nl'
        # The section u'city life in afrika'
        # (http://www.nrcnext.nl/city-life-in-afrika/, sort weight 4) is
        # currently disabled.

        articles = {}
        indices = []

        for index, feed in feeds.items():
            soup = self.index_to_soup(feed)

            for post in soup.findAll(True, attrs={'class': 'post '}):
                # Find the link to the actual article and remember the
                # location it points to and its title.
                a = post.find('a', attrs={'rel': 'bookmark'})
                href = a.get('href') if a is not None else None
                if not href:
                    # A post without a usable permalink cannot be downloaded;
                    # skip it instead of aborting the whole index.
                    continue
                title = self.tag_to_string(a)

                completeTitle = title
                if index == u'columnisten':
                    # In this feed/page articles can be written by more than
                    # one author. It is nice to see their names in the titles.
                    author = self._column_author(post)
                    if author:
                        completeTitle = u''.join([author, u': ', title])

                # Add the article to a temporary list
                article = {'title': completeTitle, 'date': u'', 'url': href, 'description': '<p> </p>'}
                articles.setdefault(index, []).append(article)

            # Add the index title to a temporary list
            indices.append(index)

        # Now, sort the temporary list of feeds in the order they appear on the website
        indices = self.sort_index_by(indices, {u'columnisten': 1, u'koken': 3, u'geld & werk': 2, u'vandaag': 0})
        # Apply this sort order to the actual list of feeds and articles
        return [(key, articles[key]) for key in indices if key in articles]

    def _column_author(self, post):
        """Return the author shown in the post's ``h2.vlag`` heading, or None."""
        flag = post.find('h2', attrs={'class': 'vlag'})
        if flag is None or not flag.contents:
            return None

        first = flag.contents[0]
        if isinstance(first, Tag):
            author = first.renderContents()
        else:
            # A bare text node has no renderContents(); use the text itself.
            author = first

        # renderContents() yields an encoded byte string by default (UTF-8 in
        # BeautifulSoup 3). Decode it explicitly so that joining it with
        # unicode titles does not fail on non-ASCII author names.
        if isinstance(author, bytes):
            author = author.decode('utf-8', 'replace')
        return author or None

    def preprocess_html(self, soup):
        """Reduce a single-article page to its post with plain-text headings.

        For pages that contain ``div#main.single`` and a ``div.post``, the
        headings are flattened to text, embedded movies and iframes are
        removed, and the post is moved into a fresh, minimal document which
        is returned. Any other page is returned unchanged.
        """
        if not soup.find('div', attrs={'id': 'main', 'class': 'single'}):
            # This should never happen and other famous last words...
            return soup

        tag = soup.find('div', attrs={'class': 'post'})
        if tag is None:
            # Always hand back a soup; falling through here would return None.
            return soup

        h2 = tag.find('h2', 'vlag')
        if h2:
            self._flatten_heading(soup, h2, 'h2', 'vlag')
        else:
            h2 = tag.find('h2')
            if h2:
                self._flatten_heading(soup, h2, 'h2', 'sub_title')

        h1 = tag.find('h1')
        if h1:
            self._flatten_heading(soup, h1, 'h1')

        # Slows down my reader.
        for movie_class in ('vvqbox vvqvimeo', 'vvqbox vvqyoutube'):
            for movie in tag.findAll('span', attrs={'class': movie_class}):
                movie.extract()
        for iframe in tag.findAll('iframe'):
            iframe.extract()

        fresh_soup = self.getFreshSoup(soup)
        fresh_soup.body.append(tag)

        return fresh_soup

    def _flatten_heading(self, soup, heading, name, css_class=None):
        """Replace ``heading`` with a new ``name`` tag holding only its text."""
        if css_class is None:
            new_heading = Tag(soup, name)
        else:
            new_heading = Tag(soup, name, attrs=[('class', css_class)])
        new_heading.append(self.tag_to_string(heading))
        heading.replaceWith(new_heading)

    def getFreshSoup(self, oldSoup):
        """Return a minimal empty document carrying over the title of ``oldSoup``.

        The title is copied as plain text; it is left empty when ``oldSoup``
        has no ``<head>`` or no ``<title>``.
        """
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        oldHead = oldSoup.head
        # Guard against documents without a <head>, where .head is None.
        if oldHead is not None and oldHead.title:
            freshSoup.head.title.append(self.tag_to_string(oldHead.title))
        return freshSoup