Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / wapo_cartoons.recipe < prev next >

Wrap

Text File | 2011-09-09 | 5.6 KB | 146 lines

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from datetime import date, timedelta


class WaPoCartoonsRecipe(BasicNewsRecipe):
    """Recipe that collects recent Washington Post editorial cartoons.

    Each entry in ``feeds`` is a ``(cartoonist, archive page URL)`` pair.
    The archive pages come in two layouts, told apart by the ``name`` of
    the date ``<select>`` element they contain: ``url`` is handled by
    ``cartoonCandidatesWaPo`` and ``dest`` by
    ``cartoonCandidatesCreatorsCom``.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 2

    title = u'Washington Post Cartoons'
    publisher = u'Washington Post'
    category = u'News, Cartoons'
    description = u'Cartoons from the Washington Post'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    feeds = [
        (u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/'),
        (u'Auth', u'http://www.uclick.com/client/wpc/ta/'),
        (u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb'),
        (u'Carlson', u'http://www.uclick.com/client/wpc/sc/'),
        (u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk'),
        (u'McCoy', u'http://www.uclick.com/client/wpc/gm/'),
        (u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/'),
        (u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/'),
        (u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/'),
    ]

    extra_css = '''
                body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
                h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
                #name {margin-bottom: 0.2em}
                #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
                '''

    # Month name -> two-digit month number, used to build sortable
    # YYYYMMDD strings from the option labels of the 'dest' layout.
    _MONTH_NUMBERS = {
        'January': '01', 'February': '02', 'March': '03', 'April': '04',
        'May': '05', 'June': '06', 'July': '07', 'August': '08',
        'September': '09', 'October': '10', 'November': '11',
        'December': '12',
    }

    def parse_index(self):
        """Build the index as a list of ``[feed title, articles]`` pairs.

        Every feed gets a 'Current' article pointing at the feed page
        itself, followed by the dated cartoons from the page's date
        selector that are not older than ``oldest_article`` days.
        """
        index = []
        oldest_date = date.today() - timedelta(days=self.oldest_article)
        # Dates are compared as YYYYMMDD strings, which sort chronologically.
        oldest = oldest_date.strftime('%Y%m%d')

        for feed_title, feed_url in self.feeds:
            soup = self.index_to_soup(feed_url)
            cartoons = [{'title': 'Current', 'date': None,
                         'url': feed_url, 'description': ''}]

            select = soup.find('select', attrs={'name': ['url', 'dest']})
            if select:
                if select['name'] == 'url':
                    candidates = self.cartoonCandidatesWaPo(select, oldest)
                else:
                    candidates = self.cartoonCandidatesCreatorsCom(select, oldest)
                cartoons.extend(candidates)

            index.append([feed_title, cartoons])

        return index

    def preprocess_html(self, soup):
        """Return a minimal document holding only name, cartoon and copyright.

        Two page layouts are handled: one that marks its parts with
        ``div`` ids (``name``, ``comic_full``, ``copyright``), and one
        that uses ``span.title``, ``img.pic_big`` and ``td.copy``; the
        latter is renamed to the former's ids so ``extra_css`` applies
        to both. Parts missing from the page are simply left out.
        """
        fresh_soup = self.getFreshSoup(soup)

        name_div = soup.find('div', attrs={'id': 'name'})
        if name_div:
            fresh_soup.body.append(name_div)

            comic = soup.find('div', attrs={'id': 'comic_full'})
            if comic is not None:
                img = comic.find('img')
                if img is not None and '&' in img.get('src', ''):
                    # Keep only what precedes the last '&' in the image URL.
                    img['src'] = img['src'].rpartition('&')[0]
                fresh_soup.body.append(comic)

            copyright_div = soup.find('div', attrs={'id': 'copyright'})
            if copyright_div is not None:
                fresh_soup.body.append(copyright_div)
        else:
            span = soup.find('span', attrs={'class': 'title'})
            if span:
                del span['class']
                span['id'] = 'name'
                span.name = 'div'
                fresh_soup.body.append(span)

            img = soup.find('img', attrs={'class': 'pic_big'})
            if img:
                # The cell that holds the picture becomes the comic container.
                cell = img.parent
                if cell.has_key('style'):
                    del cell['style']
                cell.name = 'div'
                cell['id'] = 'comic_full'
                fresh_soup.body.append(cell)

            cell = soup.find('td', attrs={'class': 'copy'})
            if cell:
                # findAll (not find): find() returns a single tag or None,
                # so looping over its result never removed the links.
                for anchor in cell.findAll('a'):
                    anchor.extract()
                del cell['class']
                cell['id'] = 'copyright'
                cell.name = 'div'
                fresh_soup.body.append(cell)

        return fresh_soup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML skeleton that carries over *oldSoup*'s title.

        The title is copied only when *oldSoup* has both a ``head`` and a
        ``title`` element; otherwise the new title stays empty.
        """
        fresh_soup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        old_head = oldSoup.head
        if old_head is not None and old_head.title:
            fresh_soup.head.title.append(self.tag_to_string(old_head.title))
        return fresh_soup

    def cartoonCandidatesWaPo(self, select, oldest):
        """Yield article dicts from a ``url``-style date selector.

        The date is taken from the last three path segments of each
        option's ``value``, concatenated and compared with *oldest* as
        strings. The first option is skipped, options whose value has
        fewer than three segments are ignored, and iteration stops at
        the first option older than *oldest*.
        """
        for option in select.findAll('option')[1:]:
            url = option['value'].rstrip('/')
            date_parts = url.split('/')[-3:]
            if len(date_parts) < 3:
                # Not enough path segments to contain a date; skip it.
                continue
            datenum = ''.join(str(part) for part in date_parts)
            if datenum < oldest:
                return
            yield {'title': self.tag_to_string(option), 'date': None,
                   'url': url, 'description': ''}

    def cartoonCandidatesCreatorsCom(self, select, oldest):
        """Yield article dicts from a ``dest``-style date selector.

        Option labels are parsed as ``<word> <month name> <day>, <year>``.
        The first option and any option carrying a ``selected`` attribute
        are skipped, labels that do not match that shape are ignored, and
        iteration stops at the first option older than *oldest*.
        """
        for option in select.findAll('option')[1:]:
            if option.has_key('selected'):
                continue

            date_string = self.tag_to_string(option)
            rest, _sep, year = date_string.rpartition(', ')
            parts = rest.split(' ')
            if len(parts) < 3 or parts[1] not in self._MONTH_NUMBERS:
                # Label is not in the expected date format; skip it.
                continue

            day = parts[2].rjust(2, '0')
            month = self._MONTH_NUMBERS[parts[1]]
            datenum = str(year) + month + str(day)
            if datenum < oldest:
                return
            yield {'title': date_string, 'date': None,
                   'url': option['value'], 'description': ''}