Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / harpers_full.recipe < prev next >

Wrap

Text File | 2011-09-09 | 2.9 KB | 73 lines

__license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
harpers.org - paid subscription / printed issue articles
This recipe only gets articles published in text format;
images and PDFs are ignored.
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe


class Harpers_full(BasicNewsRecipe):
    """Recipe for the printed-edition articles of Harper's Magazine.

    The article list is read from the monthly archive page addressed by
    ``INDEX``; only entries that carry a "Text" icon are downloaded.
    ``needs_subscription`` is set, and ``get_browser`` logs in with the
    configured username/password before any page is fetched.
    """

    title = "Harper's Magazine - articles from printed edition"
    __author__ = 'Darko Miletic'
    description = "Harper's Magazine: Founded June 1850."
    publisher = "Harper's"
    category = 'news, politics, USA'
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    delay = 1
    language = 'en'
    needs_subscription = True
    masthead_url = 'http://www.harpers.org/media/image/Harpers_305x100.gif'
    publication_type = 'magazine'
    # The strftime patterns below are expanded once, when the class body is
    # executed, so INDEX and cover_url are fixed for the lifetime of the
    # loaded recipe (presumably to the current year/month -- confirm against
    # calibre's strftime default).
    INDEX = strftime('http://www.harpers.org/archive/%Y/%m')
    LOGIN = 'http://www.harpers.org'
    cover_url = strftime('http://www.harpers.org/media/pages/%Y/%m/gif/0001.gif')
    extra_css = ' body{font-family: "Georgia",serif} '

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Keep only the element with id "cached" from each article page.
    keep_only_tags = [dict(name='div', attrs={'id': 'cached'})]
    remove_tags = [
        dict(name='table', attrs={'class': ['rcnt', 'rcnt topline']}),
        dict(name='link'),
    ]
    remove_attributes = ['xmlns']

    def get_browser(self):
        """Return a browser, logged in to harpers.org when credentials exist.

        If both ``self.username`` and ``self.password`` are set, the LOGIN
        page is opened and the form at index 1 is filled in and submitted.
        Otherwise the browser is returned without logging in.
        """
        # NOTE(review): called without an instance, exactly as in the
        # original; this relies on get_browser being callable on the class
        # in the targeted calibre version -- confirm before upgrading.
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=1)
            br['handle'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the article list from the archive index page.

        Returns a list holding a single ``(feed_title, articles)`` tuple,
        where ``feed_title`` is the string of the index page's ``<title>``
        and ``articles`` is a list of dicts with the keys ``title``,
        ``date``, ``url`` and ``description``.
        """
        articles = []
        # Parenthesised single-argument form prints identically under
        # Python 2 and is valid syntax under Python 3.
        print('Processing ' + self.INDEX)
        soup = self.index_to_soup(self.INDEX)
        # Same value for every article, so compute it once.
        date = strftime(' %B %Y')
        for item in soup.findAll('div', attrs={'class': 'title'}):
            # Only entries marked with the "Text" icon are fetched.
            text_link = item.parent.find('img', attrs={'alt': 'Text'})
            if not text_link:
                continue
            link = item.a
            if link is None:
                # A title div without an anchor has no URL to fetch; skip it
                # instead of aborting the whole index.
                continue
            articles.append({
                'title': link.contents[0],
                'date': date,
                'url': self.LOGIN + link['href'],
                'description': '',
            })
        return [(soup.head.title.string, articles)]