Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / glennbeck.recipe < prev next >

Wrap

Text File | 2011-09-09 | 4KB | 98 lines

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, Comment


class GlennBeckRecipe(BasicNewsRecipe):
    """Recipe that fetches Glenn Beck articles from the FeedBurner feed.

    The article pages have illegally nested tags, so the parser cannot
    return the content div intact. ``preprocess_html`` therefore rebuilds
    every article from scratch inside an empty document.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 1

    title = u'Glenn Beck'
    publisher = u'Premiere Radio Networks'
    category = u'News, Opinion'
    description = u'The fusion of entertainment and enlightenment'

    oldest_article = 7
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    feeds = [(u'Glenn Beck', u'http://feeds.feedburner.com/GlennBeckArticles')]

    def preprocess_html(self, soup):
        """Rebuild the article in a fresh document and return that document.

        Their html is horribly broken; if we search for the div that has the
        content, BeautifulSoup returns the div with only the headline and no
        content. This is due to illegal nesting of tags, so the pieces are
        collected one by one instead:

        1. the headline (``h1`` inside ``div.news-detail``),
        2. every ``<p>`` except those with class ``smalltextwhite``,
        3. if that yields two paragraphs or fewer, every non-empty string
           that sits directly under ``<body>``,
        4. if there is still next to nothing, the whole ``news-detail`` div.

        ``soup`` is modified in place: unwanted nodes are extracted and the
        collected nodes are moved into the returned document.
        """
        # We can find this one, and we don't want it.
        extra_info = soup.find('div', attrs={'id': 'extraInfo'})
        if extra_info:
            extra_info.extract()

        # Don't want these either.
        for iframe in soup.findAll('iframe'):
            iframe.extract()

        # Get empty document.
        freshSoup = self.getFreshSoup()

        # This is the broken div; but we can find the headline.
        newsDiv = soup.find('div', attrs={'class': 'news-detail'})
        if newsDiv is not None and newsDiv.h1:
            freshSoup.body.append(newsDiv.h1)

        # The content is wrapped in <p></p> tags, most of the time anyway.
        counter = 0
        for p in soup.findAll('p'):
            if p.get('class') == 'smalltextwhite':
                # But we don't want this one.
                continue
            freshSoup.body.append(p)
            counter += 1

        # In some articles the content is not wrapped in <p></p> tags. In
        # that case the counter is low. 2 is the magic number that seems to
        # work.
        if counter <= 2:
            # So they are playing hard-to-get: first throw out all comments.
            for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()

            # Find all unwrapped strings.
            for txt in soup.findAll(text=True):
                raw = txt.strip()
                # Only non-empty text directly under <body> is content. A
                # lone non-breaking-space entity is layout filler, not text.
                if txt.parent.name == 'body' and raw and raw != '&nbsp;':
                    para = Tag(freshSoup, 'p')
                    para.append(raw)
                    freshSoup.body.append(para)
                    counter += 1

            # Now if the counter is still 0 or 1 they did something
            # completely different and we still have an empty article. In a
            # last attempt, add the whole content div, just in case. Skip
            # this when the div was never found: there is nothing to append.
            if counter < 2 and newsDiv is not None:
                freshSoup.body.append(newsDiv)

        return freshSoup

    def getFreshSoup(self, title = None):
        """Return an empty html document, optionally with a ``<title>``.

        ``title`` is converted with ``str()`` and inserted verbatim; a falsy
        title produces an empty ``<title>`` element.
        """
        title_text = str(title) if title else ''
        return BeautifulSoup('<html><head><title>' + title_text + '</title></head><body></body></html>')