home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / new_london_day.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  75 lines

  1. __license__ = 'GPL 3'
  2. __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
  3. __docformat__ = 'restructuredtext en'
  4.  
  5. from calibre.web.feeds.news import BasicNewsRecipe
  6.  
  7. class AdvancedUserRecipe1294342201(BasicNewsRecipe):
  8.     title          = u'New London Day'
  9.     __author__  = 'Being'
  10.     description = 'State, local and business news from New London, CT'
  11.     language = 'en_GB'
  12.     oldest_article = 1
  13.     max_articles_per_feed = 200
  14.  
  15.     use_embedded_content    = False
  16.     no_stylesheets        = True
  17.     remove_javascript = True
  18.     remove_tags_before = dict(id='article')
  19.     remove_tags_after  = dict(id='article')
  20.     remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
  21.                 dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
  22.                 dict(name=['script', 'noscript', 'style'])]
  23.     remove_tags_after = [    {'class':['photo_article',]} ]
  24.     remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]},
  25.                    {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]},
  26.                    dict(name='font',attrs={'id':["cr-other-headlines"]})]
  27.     extra_css = '''
  28.                     h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
  29.                     h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
  30.                     .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  31.                     .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  32.                     p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  33.                     .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
  34.                     .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  35.                     .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  36.                     .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  37.                     .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  38.                     .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  39.                     body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
  40.         '''
  41.  
  42.     feeds = [
  43.             (u'All News', u'http://www.theday.com/section/rss'),
  44.             (u'Breaking News', u'http://www.theday.com/section/rss01'),
  45.             (u'Police and Courts', u'http://www.theday.com/section/rss02'),
  46.             (u'State News', u'http://www.theday.com/section/rss03'),
  47.             (u'Local Business', u'http://www.theday.com/section/rss04'),
  48.             (u'Entertainment', u'http://www.theday.com/section/rss05'),
  49.             (u'Opinion', u'http://www.theday.com/section/rss06'),
  50.             (u'Casinos', u'http://www.theday.com/section/rss12'),
  51.             (u'Defense and Military', u'http://www.theday.com/section/rss14'),
  52.             (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
  53.             (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
  54.             (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
  55.             (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),]
  56.  
  57.     def print_version(self, url):
  58.         return url.replace('/index.html', '/print.html')
  59.  
  60.     def get_article_url(self, article):
  61.         return article.get('feedburner_origlink', article.get('guid', article.get('link')))
  62.  
  63.  
  64.     def postprocess_html(self, soup, first_fetch):
  65.         for t in soup.findAll(['table', 'tr', 'td']):
  66.             t.name = 'div'
  67.  
  68.         for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
  69.             tag.extract()
  70.         for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
  71.             tag.extract()
  72.  
  73.         return soup
  74.  
  75.