home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / hartford_courant.recipe < prev    next >
Text File  |  2011-09-09  |  6KB  |  91 lines

  1. from __future__ import with_statement
  2. __license__ = 'GPL 3'
  3. __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
  4. __docformat__ = 'restructuredtext en'
  5.  
  6. from calibre.web.feeds.news import BasicNewsRecipe
  7.  
  8. class ChicagoTribune(BasicNewsRecipe):
  9.  
  10.     title       = 'The Hartford Courant'
  11.     __author__  = 'Being and Sujata Raman'
  12.     description = 'Politics, local and business news from Hartford'
  13.     language = 'en'
  14.  
  15.     use_embedded_content    = False
  16.     no_stylesheets        = True
  17.     remove_javascript = True
  18.  
  19.     keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}),
  20.                       dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}),
  21.                            ]
  22.     remove_tags_after = [    {'class':['photo_article',]} ]
  23.  
  24.     remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
  25.                    {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
  26.                    dict(name='font',attrs={'id':["cr-other-headlines"]})]
  27.     extra_css = '''
  28.                     h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
  29.                     h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
  30.                     .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  31.                     .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  32.                     p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  33.                     .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
  34.                     .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  35.                     .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  36.                     .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  37.                     .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  38.                     .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  39.                     body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
  40.         '''
  41.     feeds = [
  42.              ('Breaking News', 'http://feeds.feedburner.com/courant-breaking-news/'),
  43.              ('Nation/World News', 'http://feeds.feedburner.com/courant-nation-world/'),
  44.              ('Connecticut News', 'http://feeds.feedburner.com/courant-connecticut-news/'),
  45.              ('Hartford News', 'http://feeds.feedburner.com/courant-hartford/'),
  46.              ('West Hartford News', 'http://feeds.feedburner.com/courant-west-hartford/'),
  47.              ('Bristol', 'http://feeds.feedburner.com/courant-bristol/'),
  48.              ('Politics', 'http://feeds.feedburner.com/courant-politics/'),
  49.              ('Opinion', 'http://feeds.feedburner.com/courant-opinion/'),
  50.              ('Editorials', 'http://feeds.feedburner.com/courant-editorials/'),
  51.              ('Letters', 'http://feeds.feedburner.com/courant-letters/'),
  52.              ('Bob Englehart', 'http://feeds2.feedburner.com/BobEnglehartEnglehartsView'),
  53.              ('Business', 'http://feeds.feedburner.com/courant-business/'),
  54.              ('Sports', 'http://feeds.feedburner.com/courant-sports/'),
  55.              ('Features', 'http://feeds.feedburner.com/courant-features/'),
  56.              ('Consumer', 'http://feeds.feedburner.com/courant-consumer/'),
  57.              ('Shopping', 'http://feeds.feedburner.com/courant-shopping/'),
  58.              ('Arts & Theater', 'http://feeds.feedburner.com/courant-entertainment/'),
  59.              ('Entertainment', 'http://feeds.feedburner.com/courant-stage/'),
  60.              ('Music', 'http://feeds.feedburner.com/courant-music/'),
  61.              ('TV', 'http://feeds.feedburner.com/courant-tv/'),
  62.              ('Movies', 'http://feeds.feedburner.com/courant-movies/'),
  63.              #('Metromix headlines', 'http://feeds.feedburner.com/metromix/topheadlines/'),
  64.              #('Metromix events', 'http://feeds.feedburner.com/metromix/events/'),
  65.              #('Metromix restaurants', 'http://feeds.feedburner.com/metromix/restaurants/'),
  66.              ('Outdoors', 'http://feeds.feedburner.com/courant-outdoors/'),
  67.              ('Peter Marteka', 'http://feeds.feedburner.com/courant-marteka-column/'),
  68.              ('Susan Campbell', 'http://feeds.feedburner.com/courant-campbell-column/'),
  69.              ('Helen Ubinas', 'http://feeds.feedburner.com/courant-helen-ubinas-column/'),
  70.              ('Jim Shea', 'http://feeds.feedburner.com/courant-jim-shea-column/'),
  71.              ('Tom Condon', 'http://feeds.feedburner.com/courant-tom-condon-column/'),
  72.              ('Colin McEnroe', 'http://feeds.feedburner.com/courant-colin-mcenroe-column/'),
  73.              ]
  74.  
  75.  
  76.     def get_article_url(self, article):
  77.         print article.get('feedburner_origlink', article.get('guid', article.get('link')))
  78.         return article.get('feedburner_origlink', article.get('guid', article.get('link')))
  79.  
  80.  
  81.     def postprocess_html(self, soup, first_fetch):
  82.         for t in soup.findAll(['table', 'tr', 'td']):
  83.             t.name = 'div'
  84.  
  85.         for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
  86.             tag.extract()
  87.         for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
  88.             tag.extract()
  89.  
  90.         return soup
  91.