Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / the_age.recipe < prev next >

Wrap

Text File | 2011-09-09 | 4KB | 113 lines

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
__docformat__ = 'restructuredtext en'

'''
theage.com.au
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re

# Link to the scanned front page PDF on the "today's paper" page.
_FRONTPAGE_PDF_RE = re.compile(
    r'http://www\.theage\.com\.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage\.pdf')

# A paragraph consisting only of the text-mode navigation leftovers, e.g.
# "[ | | ]", optionally padded with whitespace / non-breaking spaces.
# NOTE(review): the archived copy of this recipe showed "(\ \;)" where the
# padding alternative is, which looks like a "&nbsp;" entity that was decoded
# by the HTML archive. The pattern below matches the literal entity as well as
# the decoded U+00A0 character -- confirm against the upstream recipe.
_NAV_RESIDUE_RE = re.compile(
    r'(?:\s|&nbsp;|\xa0)*\[[|\s*]*\](?:\s|&nbsp;|\xa0)*$')

_COPYRIGHT_NOTICE = ('This material is subject to copyright and any '
                     'unauthorised use, copying or mirroring is prohibited.')


class TheAge(BasicNewsRecipe):
    '''
    Recipe that builds an e-book from the text edition index of
    theage.com.au, grouping articles by the <h3> section headings found on
    the index page.
    '''

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__ = 'Matthew Briggs'
    language = 'en_AU'

    max_articles_per_feed = 1000
    recursions = 0
    remove_tags = [dict(name=['table', 'script', 'noscript', 'style']),
                   dict(name='a', attrs={'href': '/'}),
                   dict(name='a', attrs={'href': '/text/'})]

    def get_browser(self):
        '''Return the default recipe browser with refresh handling disabled.'''
        # NOTE(review): called on the class without an instance, exactly as in
        # the original; this relies on the base get_browser being callable
        # that way -- confirm against the calibre version in use.
        br = BasicNewsRecipe.get_browser()
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        '''
        Scrape the text-edition index and return a list of
        ``(section title, [article dict, ...])`` tuples.

        Sections listed in the preferred order come first (when present);
        any remaining sections follow in dictionary iteration order.
        '''
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())

        section = None
        sections = {}

        # Headings and links are walked in document order: every <h3> opens a
        # new section and every following link is filed under it.
        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                section = self.tag_to_string(tag)
                sections[section] = []
            # Make sure to skip: <a href="/">TheAge</a>
            elif section and tag.has_key('href') and len(tag['href'].strip()) > 1:
                url = tag['href'].strip()
                if url.startswith('/'):
                    url = 'http://www.theage.com.au' + url
                title = self.tag_to_string(tag)
                sections[section].append({
                    'title': title,
                    'url': url,
                    'date': strftime('%a, %d %b'),
                    'description': '',
                    'content': '',
                })

        feeds = []

        # Insert feeds in specified order, if available
        feed_order = ['National', 'World', 'Opinion', 'Columns', 'Business',
                      'Sport', 'Entertainment']
        for name in feed_order:
            # pop() with a default both emits and removes the section, and
            # does not raise KeyError when a preferred section is missing
            # from the index (the previous unconditional ``del`` did).
            articles = sections.pop(name, None)
            if articles is not None:
                feeds.append((name, articles))

        # Append what is left over...
        for name in sections:
            feeds.append((name, sections[name]))

        return feeds

    def get_cover_url(self):
        '''
        Return the URL of today's front page PDF, or None when no matching
        link is found on the "today's paper" page.
        '''
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())

        for anchor in soup.findAll('a'):
            # .get() instead of ['href']: anchors without an href attribute
            # must be skipped rather than raise KeyError.
            href = anchor.get('href')
            if href and _FRONTPAGE_PDF_RE.match(href):
                return href

        return None

    def preprocess_html(self, soup):
        '''
        Clean up an article page: drop paragraphs that only hold leftover
        text-mode navigation and shrink the copyright fine print.

        Returns the (modified in place) soup.
        '''
        for p in soup.findAll('p'):
            # Collapse the paragraph by joining the non-tag contents
            text_parts = [i for i in p.contents if isinstance(i, unicode)]
            if not text_parts:
                continue
            text = ''.join(text_parts)

            # Filter out what's left of the text-mode navigation stuff
            if _NAV_RESIDUE_RE.match(text):
                p.extract()
                continue

            # Shrink the fine print font
            if text == _COPYRIGHT_NOTICE:
                p['style'] = 'font-size:small'

        return soup