home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / baltimore_sun.recipe < prev    next >
Text File  |  2011-09-09  |  13KB  |  187 lines

  1. from __future__ import with_statement
  2. __license__ = 'GPL 3'
  3. __copyright__ = 'Original 2009, Kovid Goyal <kovid@kovidgoyal.net>'
  4. __copyright__= 'Modified 2011,  Josh Hall <jwtheiv@gmail.com>'
  5. __docformat__ = 'restructuredtext en'
  6.  
  7. '''
  8. www.baltimoresun.com
  9. '''
  10.  
  11. from calibre.web.feeds.news import BasicNewsRecipe
  12.  
  13. class BaltimoreSun(BasicNewsRecipe):
  14.  
  15.     title       = 'The Baltimore Sun'
  16.     __author__ = 'Josh Hall'
  17.     description = 'Politics, local and business news from Baltimore'
  18.     language = 'en'
  19.     oldest_article = 1
  20.     max_articles_per_feed = 100
  21.     remove_empty_feeds    = True
  22.     use_embedded_content    = False
  23.     no_stylesheets        = True
  24.     remove_javascript = True
  25.     #masthead_url = 'http://www.baltimoresun.com/images/thirdpartylogo.gif'
  26.  
  27.     remove_tags_before = dict(name='div', attrs={'class':['story', 'entry']})
  28.     remove_tags_after = [
  29.                                       {'class':['photo_article',]},
  30.                                       dict(name='div', attrs={'class':'shirttail-promo right clearfix'}),
  31.                                     ]
  32.  
  33.     keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}),
  34.                       dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}),
  35.                            ]
  36.  
  37.  
  38.     remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer","article-promo"]},
  39.                    {'class':["entry-footer-left","entry-footer-right","shirttail-promo right clearfix","clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent","toppaginate","module","module-header","module-content"]},
  40.                    dict(name='font',attrs={'id':["cr-other-headlines"]}),
  41.                    dict(name=['iframe']),
  42.                           ]
  43.     extra_css = '''
  44.                     h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
  45.                     h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
  46.                     .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  47.                     .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  48.                     p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  49.                     .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
  50.                     .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  51.                     .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  52.                     .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  53.                     .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  54.                     .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  55.                     body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
  56.         '''
  57.     feeds = [
  58.          (u'Top Headlines', u'http://www.baltimoresun.com/rss2.0.xml'),
  59.          (u'Breaking News', u'http://www.baltimoresun.com/news/breaking/rss2.0.xml'),
  60.          (u'Top Maryland', u'http://www.baltimoresun.com/news/maryland/rss2.0.xml'),
  61.          #(u'Anne Arundel County', u'http://www.baltimoresun.com/news/maryland/anne-arundel/rss2.0.xml'),
  62.          (u'Baltimore City', u'http://www.baltimoresun.com/news/maryland/baltimore-city/rss2.0.xml'),
  63.          #(u'Baltimore County', u'http://www.baltimoresun.com/news/maryland/baltimore-county/rss2.0.xml'),
  64.          #(u'Carroll County', u'http://www.baltimoresun.com/news/maryland/carroll/rss2.0.xml'),
  65.          #(u'Harford County', u'http://www.baltimoresun.com/news/maryland/harford/rss2.0.xml'),
  66.          #(u'Howard County', u'http://www.baltimoresun.com/news/maryland/howard/rss2.0.xml'),
  67.          (u'Education', u'http://www.baltimoresun.com/news/education/rss2.0.xml'),
  68.          #(u'Obituaries', u'http://www.baltimoresun.com/news/obituaries/rss2.0.xml'),
  69.          (u'Local Politics', u'http://www.baltimoresun.com/news/maryland/politics/rss2.0.xml'),
  70.          (u'Weather', u'http://www.baltimoresun.com/news/weather/rss2.0.xml'),
  71.          #(u'Traffic', u'http://www.baltimoresun.com/features/commuting/rss2.0.xml'),
  72.          (u'Nation/world', u'http://feeds.chicagotribune.com/chicagotribune/news/nationworld/'),
  73.          (u'Weird News', u'http://www.baltimoresun.com/news/offbeat/rss2.0.xml'),
  74.  
  75.  
  76.          (u'Top Sports', u'http://www.baltimoresun.com/sports/rss2.0.xml'),
  77.          (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'),
  78.          (u'Ravens/Football', u'http://www.baltimoresun.com/sports/ravens/rss2.0.xml'),
  79.          #(u'Terps', u'http://www.baltimoresun.com/sports/terps/rss2.0.xml'),
  80.          #(u'College Football', u'http://www.baltimoresun.com/sports/college/football/rss2.0.xml'),
  81.          #(u'Lacrosse', u'http://www.baltimoresun.com/sports/college/lacrosse/rss2.0.xml'),
  82.          #(u'Horse Racing', u'http://www.baltimoresun.com/sports/horse-racing/rss2.0.xml'),
  83.          #(u'Golf', u'http://www.baltimoresun.com/sports/golf/rss2.0.xml'),
  84.          #(u'NBA', u'http://www.baltimoresun.com/sports/nba/rss2.0.xml'),
  85.          #(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'),
  86.          #(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'),
  87.  
  88.          (u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'),
  89.          (u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'),
  90.          (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'),
  91.          (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'),
  92.          (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'),
  93.          (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'),
  94.  
  95.          (u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'),
  96.          (u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'),
  97.          (u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'),
  98.          (u'Parenting', u'http://www.baltimoresun.com/features/parenting/rss2.0.xml'),
  99.          (u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'),
  100.          (u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'),
  101.         (u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'),
  102.  
  103.          (u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'),
  104.          (u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'),
  105.          (u'Personal finance', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
  106.          (u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'),
  107.          (u'Jobs', u'http://www.baltimoresun.com/classified/jobs/rss2.0.xml'),
  108.          (u'DIY', u'http://www.baltimoresun.com/features/do-it-yourself/rss2.0.xml'),
  109.          (u'Consumer Safety', u'http://www.baltimoresun.com/business/consumer-safety/rss2.0.xml'),
  110.          (u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'),
  111.  
  112.          (u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'),
  113.          (u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'),
  114.          (u'Readers Respond', u'http://www.baltimoresun.com/news/opinion/readersrespond/'),
  115.  
  116.          (u'Kevin Cowherd', 'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'),
  117.          (u'Jay Hancock', u'http://www.baltimoresun.com/business/money/bal-columnist-hancock,0,6673611.columnist-rss2.0.xml'),
  118.          (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'),
  119.          (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'),
  120.          (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'),
  121.          (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'),
  122.          (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'),
  123.          (u'Thomas F. Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'),
  124.          (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'),
  125.          (u'Ron Smith', u'http://www.baltimoresun.com/news/opinion/bal-columnist-ronsmith,0,3964803.columnist-rss2.0.xml'),
  126.  
  127.          (u'Baltimore Crime Beat', u'http://weblogs.baltimoresun.com/news/crime/blog/index.xml'),
  128.          (u'Getting There', u'http://weblogs.baltimoresun.com/news/traffic/index.xml'),
  129.          (u'InsideEd', u'http://weblogs.baltimoresun.com/news/education/blog/index.xml'),
  130.          (u'Maryland Politics', u'http://weblogs.baltimoresun.com/news/local/politics/index.xml'),
  131.          (u'Maryland Weather', u'http://weblogs.marylandweather.com/index.xml'),
  132.          (u'Second Opinion', u'http://weblogs.baltimoresun.com/news/opinion/index.xml'),
  133.          (u'You Dont Say', u'http://weblogs.baltimoresun.com/news/mcintyre/blog/index.xml'),
  134.  
  135.          (u'BaltTech', u'http://weblogs.baltimoresun.com/news/technology/index.xml'),
  136.          (u'Consuming Interests', u'http://weblogs.baltimoresun.com/business/consuminginterests/blog/index.xml'),
  137.          (u'Jay Hancocks Blog', u'http://weblogs.baltimoresun.com/business/hancock/blog/index.xml'),
  138.          (u'The Real Estate Wonk', u'http://weblogs.baltimoresun.com/business/realestate/blog/index.xml'),
  139.  
  140.          (u'Clef Notes', 'http://weblogs.baltimoresun.com/entertainment/classicalmusic/index.xml'),
  141.          (u'Dining at Large', u'http://weblogs.baltimoresun.com/entertainment/dining/reviews/blog/index.xml'),
  142.          (u'Midnight Sun', u'http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/index.xml'),
  143.          (u'Mike Sragow Gets Reel', u'http://weblogs.baltimoresun.com/entertainment/movies/blog/index.xml'),
  144.          (u'Read Street', u'http://weblogs.baltimoresun.com/entertainment/books/blog/index.xml'),
  145.          (u'Reality Check', u'http://weblogs.baltimoresun.com/entertainment/realitycheck/blog/index.xml'),
  146.          (u'Z on TV', u'http://weblogs.baltimoresun.com/entertainment/zontv/index.xml'),
  147.  
  148.          (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'),
  149.          (u'Charm City Moms', u'http://weblogs.baltimoresun.com/features/baltimoremomblog/index.xml'),
  150.          (u'Exercists', u'http://weblogs.baltimoresun.com/health/fitness/index.xml'),
  151.          (u'Garden Variety', 'http://weblogs.baltimoresun.com/features/gardening/index.xml'),
  152.          #(u'In Good Faith', u'http://weblogs.baltimoresun.com/news/faith/index.xml'),
  153.          (u'Picture of Health', u'http://weblogs.baltimoresun.com/health/index.xml'),
  154.          (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'),
  155.  
  156.          #(u'Faceoff', u'http://weblogs.baltimoresun.com/sports/lacrosse/blog/index.xml'),
  157.          #(u'MMA Stomping Grounds', u'http://weblogs.baltimoresun.com/sports/mma/blog/index.xml'),
  158.          (u'Orioles Insider', u'http://weblogs.baltimoresun.com/sports/orioles/blog/index.xml'),
  159.          #(u'Outdoors Girl', u'http://weblogs.baltimoresun.com/sports/outdoors/blog/index.xml'),
  160.          (u'Ravens Insider', u'http://weblogs.baltimoresun.com/sports/ravens/blog/index.xml'),
  161.          #(u'Recruiting Report', u'http://weblogs.baltimoresun.com/sports/college/recruiting/index.xml'),
  162.          #(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'),
  163.          (u'The Schmuck Stops Here', u'http://weblogs.baltimoresun.com/sports/schmuck/index.xml'),
  164.          (u'Toy Department', u'http://weblogs.baltimoresun.com/sports/thetoydepartment/index.xml'),
  165.          #(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'),
  166.          #(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'),
  167.          (u'Virtual Vensanity', u'http://weblogs.baltimoresun.com/entertainment/bthesite/vensel/index.xml'),
  168.  
  169.              ]
  170.  
  171.  
  172.     def get_article_url(self, article):
  173.         print article.get('feedburner_origlink', article.get('guid', article.get('link')))
  174.         return article.get('feedburner_origlink', article.get('guid', article.get('link')))
  175.  
  176.  
  177.     def postprocess_html(self, soup, first_fetch):
  178.         for t in soup.findAll(['table', 'tr', 'td']):
  179.             t.name = 'div'
  180.  
  181.         for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})):
  182.             tag.extract()
  183.         for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})):
  184.             tag.extract()
  185.  
  186.         return soup
  187.