Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / le_monde.recipe < prev next >

Wrap

Text File | 2011-09-09 | 7KB | 140 lines

__license__ = 'GPL v3'
__copyright__ = '2011'
'''
lemonde.fr
'''
import re

from calibre.web.feeds.recipes import BasicNewsRecipe


class LeMonde(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the lemonde.fr RSS feeds.

    Articles are taken from the feeds listed in ``feeds``; only the
    ``div.contenu`` part of each page is kept, links are flattened to plain
    text and a table of regular expressions applies French typographic
    conventions (non-breaking spaces, guillemets, dashes) to the raw HTML.
    """

    title = 'Le Monde'
    __author__ = 'veezh'
    description = 'Actualités'
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'cp1252'
    publisher = 'lemonde.fr'
    category = 'news, France, world'
    language = 'fr'
    #publication_type = 'newsportal'
    extra_css = '''
        h1{font-size:130%;}
        .ariane{font-size:xx-small;}
        .source{font-size:xx-small;}
        #.href{font-size:xx-small;}
        .LM_caption{color:#666666; font-size:x-small;}
        #.main-article-info{font-family:Arial,Helvetica,sans-serif;}
        #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
        #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
    '''
    #preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')]
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    remove_empty_feeds = True

    filterDuplicates = True

    # URL fragments identifying content this recipe does not download.
    _SKIPPED_URL_PARTS = (
        '/chat/', '.blog', '/video/', '/sport/', '/portfolio/', '/visuel/',
    )

    def preprocess_html(self, soup):
        """Replace text-only links by their text, then normalise images.

        Only ``<a>`` elements whose ``.string`` is not ``None`` are replaced;
        other links are left as they are. The soup is then handed to
        ``self.adeify_images`` and that result is returned.
        """
        for alink in soup.findAll('a'):
            if alink.string is not None:
                alink.replaceWith(alink.string)
        return self.adeify_images(soup)

    # Typographic clean-up, applied in order to the raw HTML.
    #
    # The replacements are written with explicit character entities. In the
    # copy this was restored from, entities had been decoded to plain
    # characters, which turned rules such as ' :' -> '&nbsp;:' into no-ops.
    #
    # NOTE(review): the four rules tagged "tag restored" had lost their
    # tag text entirely (one pattern was the empty string, which matches at
    # every position). The <span>/<em> tags follow the published calibre
    # recipe - confirm against upstream before release.
    preprocess_regexps = [
        # Numbers: keep "10 %" and digit groups ("123 456", "1 234") together.
        (re.compile(r'([0-9])%'), lambda m: m.group(1) + '&nbsp;%'),
        (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'),
         lambda m: m.group(1) + m.group(2) + m.group(3) + '&nbsp;' + m.group(4) + m.group(5) + m.group(6)),
        (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'),
         lambda m: m.group(1) + '&nbsp;' + m.group(2) + m.group(3) + m.group(4)),
        (re.compile(r'<span>'), lambda match: ' <span>'),  # tag restored
        # Straight and curly double quotes -> French guillemets.
        (re.compile(r'\("'), lambda match: '(&laquo;&nbsp;'),
        (re.compile(r'"\)'), lambda match: '&nbsp;&raquo;)'),
        # NOTE(review): the parentheses in the next two replacements look
        # copied from the two rules above; kept as found.
        (re.compile(r'&ldquo;'), lambda match: '(&laquo;&nbsp;'),
        (re.compile(r'&rdquo;'), lambda match: '&nbsp;&raquo;)'),
        # Apostrophes: opening quote after '>' or a space, closing otherwise.
        (re.compile(r'>\''), lambda match: '>&lsquo;'),
        (re.compile(r' \''), lambda match: ' &lsquo;'),
        (re.compile(r'\''), lambda match: '&rsquo;'),
        # Double quotes next to markup.
        (re.compile(r'"<em>'), lambda match: '<em>&laquo;&nbsp;'),  # tag restored
        (re.compile(r'"<em>"</em><em>'), lambda match: '<em>&laquo;&nbsp;'),  # tag restored
        (re.compile(r'"<a href='), lambda match: '&laquo;&nbsp;<a href='),
        (re.compile(r'</em>"'), lambda match: '&nbsp;&raquo;</em>'),  # tag restored
        (re.compile(r'</a>"'), lambda match: '&nbsp;&raquo;</a>'),
        (re.compile(r'"</'), lambda match: '&nbsp;&raquo;</'),
        (re.compile(r'>"'), lambda match: '>&laquo;&nbsp;'),
        (re.compile(r'"<'), lambda match: '&nbsp;&raquo;<'),
        (re.compile(r'&rsquo;"'), lambda match: '&rsquo;«&nbsp;'),
        # Double quotes next to spaces and punctuation.
        (re.compile(r' "'), lambda match: ' &laquo;&nbsp;'),
        (re.compile(r'" '), lambda match: '&nbsp;&raquo; '),
        (re.compile(r'"\.'), lambda match: '&nbsp;&raquo;.'),
        (re.compile(r'",'), lambda match: '&nbsp;&raquo;,'),
        (re.compile(r'"\?'), lambda match: '&nbsp;&raquo;?'),
        (re.compile(r'":'), lambda match: '&nbsp;&raquo;:'),
        (re.compile(r'";'), lambda match: '&nbsp;&raquo;;'),
        (re.compile(r'"\!'), lambda match: '&nbsp;&raquo;!'),
        # Non-breaking space before double punctuation and inside guillemets.
        (re.compile(r' :'), lambda match: '&nbsp;:'),
        (re.compile(r' ;'), lambda match: '&nbsp;;'),
        (re.compile(r' \?'), lambda match: '&nbsp;?'),
        (re.compile(r' \!'), lambda match: '&nbsp;!'),
        (re.compile(r'\s»'), lambda match: '&nbsp;»'),
        (re.compile(r'«\s'), lambda match: '«&nbsp;'),
        (re.compile(r' %'), lambda match: '&nbsp;%'),
        # Undo the damage the '" ' rule above does to image tags.
        (re.compile(r'\.jpg&nbsp;&raquo; border='), lambda match: '.jpg'),
        (re.compile(r'\.png&nbsp;&raquo; border='), lambda match: '.png'),
        # Dashes used as parenthetical markers.
        (re.compile(r' &ndash; '), lambda match: '&nbsp;&ndash; '),
        (re.compile(r' – '), lambda match: '&nbsp;&ndash; '),
        (re.compile(r' - '), lambda match: '&nbsp;&ndash; '),
        (re.compile(r' -,'), lambda match: '&nbsp;&ndash;,'),
        (re.compile(r'&raquo;:'), lambda match: '&raquo;&nbsp;:'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': ['contenu']}),
    ]

    remove_tags = [dict(name='div', attrs={'class': ['LM_atome']})]

    remove_tags_after = [dict(id='appel_temoignage')]

    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or ``None`` to skip it.

        The URL is the entry's ``guid``. Entries without a guid, and entries
        whose guid contains one of ``_SKIPPED_URL_PARTS`` (chats, blogs,
        videos, sport, portfolios, visuals), yield ``None``.
        """
        url = article.get('guid', None)
        # A missing guid used to reach the substring tests below and raise
        # TypeError ("argument of type 'NoneType' is not iterable").
        if not url:
            return None
        if any(part in url for part in self._SKIPPED_URL_PARTS):
            return None
        return url

    # def get_article_url(self, article):
    #     link = article.get('link')
    #     if 'blog' not in link and ('chat' not in link):
    #         return link

    feeds = [
        ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
        ('International', 'http://www.lemonde.fr/rss/tag/international.xml'),
        ('Europe', 'http://www.lemonde.fr/rss/tag/europe.xml'),
        (u'Société', 'http://www.lemonde.fr/rss/tag/societe.xml'),
        ('Economie', 'http://www.lemonde.fr/rss/tag/economie.xml'),
        (u'Médias', 'http://www.lemonde.fr/rss/tag/actualite-medias.xml'),
        (u'Planète', 'http://www.lemonde.fr/rss/tag/planete.xml'),
        ('Culture', 'http://www.lemonde.fr/rss/tag/culture.xml'),
        ('Technologies', 'http://www.lemonde.fr/rss/tag/technologies.xml'),
        ('Livres', 'http://www.lemonde.fr/rss/tag/livres.xml'),
    ]

    def get_cover_url(self):
        """Return the cover image URL, or ``None`` if the page has none.

        Fetches the PDF edition page and reads the ``src`` of the first
        ``<img>`` inside the ``div`` with class ``pg-gch``.
        """
        cover_url = None
        soup = self.index_to_soup('http://www.lemonde.fr/web/monde_pdf/0,33-0,1-0,0.html')
        link_item = soup.find('div', attrs={'class': 'pg-gch'})
        if link_item and link_item.img:
            cover_url = link_item.img['src']
        return cover_url