home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / tulsaworld.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  3.2 KB  |  47 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. tulsaworld.com
  5. '''
  6.  
  7. from calibre.web.feeds.news import BasicNewsRecipe
  8.  
  9. class TulsaWorld(BasicNewsRecipe):
  10.     title                 = 'Tulsa World'
  11.     __author__            = 'Darko Miletic'
  12.     description           = 'Find breaking news, local news, Oklahoma weather, sports, business, entertainment, lifestyle, opinion, government, movies, books, jobs, education, blogs, video & multimedia.'
  13.     publisher             = 'World Publishing Co.'
  14.     category              = 'Tulsa World, tulsa world, daily newspaper, breaking news, stories, articles, news, local, weather, coverage, editorial, government, education, community, sports, business, entertainment, lifestyle, opinion, multimedia, media, blogs, consumer, OU, OSU, TU, ORU, football, basketball, school, schools, sudoku, movie reviews, stocks, classified ads, classifieds, books, job, jobs, careers, real estate, home, homes, Oklahoma, northeastern, reviews, auto, autos, archives, forecasts, Sooners, Cowboys, Hurricane, Golden Eagles, NFL, NBA, MLB, pro football, scores, college basketball, college football, college baseball, sports columns, fashion and style, associated press, regional news coverage, health, obituaries, politics, political news, Jenks, Union, Owasso, Tulsa, Booker T. Washington, Trojans, Rams, Hornets, video, photography, photos, images, games, search, the picker, predictions, satellite, family, food, teens, polls, births, celebrations, death notices, divorces, marriages, obituaries, audio, podcasts.'
  15.     oldest_article        = 2
  16.     max_articles_per_feed = 200
  17.     no_stylesheets        = True
  18.     encoding              = 'utf8'
  19.     use_embedded_content  = False
  20.     language              = 'en'
  21.     country               = 'US'
  22.     remove_empty_feeds    = True
  23.     masthead_url          = 'http://www.tulsaworld.com/images/TW_logo-blue-footer.jpg'
  24.     extra_css             = ' body{font-family: Arial,Verdana,sans-serif } img{margin-bottom: 0.4em} .articleHeadline{font-size: xx-large; font-weight: bold} .articleKicker{font-size: x-large; font-weight: bold} .articleByline,.articleDate{font-size: small} .leadp{font-size: 1.1em} '
  25.  
  26.     conversion_options = {
  27.                           'comment'          : description
  28.                         , 'tags'             : category
  29.                         , 'publisher'        : publisher
  30.                         , 'language'         : language
  31.                         , 'linearize_tables' : True
  32.                         }
  33.     keep_only_tags = [dict(name='div',attrs={'id':['ctl00_body1_ArticleControl_divArticleText','ctl00_BodyContent_ArticleControl_divArticleText']})]
  34.  
  35.     feeds = [
  36.               (u'News'    , u'http://www.tulsaworld.com/site/rss.aspx?group=1')
  37.              ,(u'Business', u'http://www.tulsaworld.com/site/rss.aspx?group=5')
  38.              ,(u'Opinion' , u'http://www.tulsaworld.com/site/rss.aspx?group=7')
  39.             ]
  40.  
  41.     def get_article_url(self, article):        
  42.         return article.get('link',  None).rpartition('&rss')[0]
  43.             
  44.     def preprocess_html(self, soup):
  45.         for item in soup.findAll(style=True):
  46.             del item['style']
  47.         return self.adeify_images(soup)