home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / latimes.recipe < prev    next >
Text File  |  2011-09-09  |  7KB  |  124 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
  3. '''
  4. www.latimes.com
  5. '''
  6.  
  7. from calibre.web.feeds.news import BasicNewsRecipe
  8.  
  9. class LATimes(BasicNewsRecipe):
  10.     title                 = 'Los Angeles Times'
  11.     __author__            = 'Darko Miletic'
  12.     description           = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California'
  13.     publisher             = 'Tribune Company'
  14.     category              = 'news, politics, USA, Los Angeles, world'
  15.     oldest_article        = 2
  16.     max_articles_per_feed = 200
  17.     no_stylesheets        = True
  18.     encoding              = 'utf8'
  19.     use_embedded_content  = False
  20.     language              = 'en'
  21.     remove_empty_feeds    = True
  22.     publication_type      = 'newspaper'
  23.     masthead_url          = 'http://www.latimes.com/images/logo.png'
  24.     cover_url             = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'
  25.     extra_css             = """
  26.                                body{font-family: Georgia,"Times New Roman",Times,serif }
  27.                                img{margin-bottom: 0.4em; margin-top: 0.8em; display:block}
  28.                                h2{font-size: 1.1em}
  29.                                .deckhead{font-size: small; text-transform: uppercase}
  30.                                .small{color: gray; font-size: small}
  31.                                .date,.time,.copyright{font-size: x-small; color:gray; font-style:italic;}
  32.                             """
  33.  
  34.     conversion_options = {
  35.                           'comment'          : description
  36.                         , 'tags'             : category
  37.                         , 'publisher'        : publisher
  38.                         , 'language'         : language
  39.                         , 'linearize_tables' : 'Yes'
  40.                         }
  41.  
  42.     keep_only_tags = [
  43.                         dict(name='div', attrs={'class':'story'})
  44.                        ,dict(attrs={'class':['entry-header','time','entry-content']})
  45.                      ]
  46.     remove_tags_after=dict(name='p', attrs={'class':'copyright'})
  47.     remove_tags = [
  48.                      dict(name=['meta','link','iframe','object','embed'])
  49.                     ,dict(attrs={'class':['toolSet','articlerail','googleAd','entry-footer-left','entry-footer-right','entry-footer-social','google-ad-story-bottom','sphereTools']})
  50.                     ,dict(attrs={'id':['article-promo','googleads','moduleArticleToolsContainer','gallery-subcontent']})
  51.                   ]
  52.     remove_attributes=['lang','xmlns:fb','xmlns:og','border','xtags','i','article_body']
  53.  
  54.  
  55.     feeds = [
  56.               (u'Top News'             , u'http://feeds.latimes.com/latimes/news'                           )
  57.              ,(u'Local News'           , u'http://feeds.latimes.com/latimes/news/local'                     )
  58.              ,(u'National'             , u'http://feeds.latimes.com/latimes/news/nationworld/nation'        )
  59.              ,(u'National Politics'    , u'http://feeds.latimes.com/latimes/news/politics/'                 )
  60.              ,(u'Business'             , u'http://feeds.latimes.com/latimes/business'                       )
  61.              ,(u'Education'            , u'http://feeds.latimes.com/latimes/news/education'                 )
  62.              ,(u'Environment'          , u'http://feeds.latimes.com/latimes/news/science/environment'       )
  63.              ,(u'Religion'             , u'http://feeds.latimes.com/latimes/features/religion'              )
  64.              ,(u'Science'              , u'http://feeds.latimes.com/latimes/news/science'                   )
  65.              ,(u'Technology'           , u'http://feeds.latimes.com/latimes/technology'                     )
  66.              ,(u'Africa'               , u'http://feeds.latimes.com/latimes/africa'                         )
  67.              ,(u'Asia'                 , u'http://feeds.latimes.com/latimes/asia'                           )
  68.              ,(u'Europe'               , u'http://feeds.latimes.com/latimes/europe'                         )
  69.              ,(u'Latin America'        , u'http://feeds.latimes.com/latimes/latinamerica'                   )
  70.              ,(u'Middle East'          , u'http://feeds.latimes.com/latimes/middleeast'                     )
  71.              ,(u'Arts&Culture'         , u'http://feeds.feedburner.com/latimes/entertainment/news/arts'     )
  72.              ,(u'Entertainment News'   , u'http://feeds.feedburner.com/latimes/entertainment/news/'         )
  73.              ,(u'Movie News'           , u'http://feeds.feedburner.com/latimes/entertainment/news/movies/'  )
  74.              ,(u'Movie Reviews'        , u'http://feeds.feedburner.com/movies/reviews/'                     )
  75.              ,(u'Music News'           , u'http://feeds.feedburner.com/latimes/entertainment/news/music/'   )
  76.              ,(u'Pop Album Reviews'    , u'http://feeds.feedburner.com/latimes/pop-album-reviews'           )
  77.              ,(u'Restaurant Reviews'   , u'http://feeds.feedburner.com/latimes/restaurant/reviews'          )
  78.              ,(u'Theatar and Dance'    , u'http://feeds.feedburner.com/latimes/theaterdance'                )
  79.              ,(u'Autos'                , u'http://feeds.latimes.com/latimes/classified/automotive/highway1/')
  80.              ,(u'Books'                , u'http://feeds.latimes.com/features/books'                         )
  81.              ,(u'Food'                 , u'http://feeds.latimes.com/latimes/features/food/'                 )
  82.              ,(u'Health'               , u'http://feeds.latimes.com/latimes/features/health/'               )
  83.              ,(u'Real Estate'          , u'http://feeds.latimes.com/latimes/classified/realestate/'         )
  84.              ,(u'Commentary'           , u'http://feeds2.feedburner.com/latimes/news/opinion/commentary/'   )
  85.              ,(u'Sports'               , u'http://feeds.latimes.com/latimes/sports/'                        )
  86.             ]
  87.  
  88.     def get_article_url(self, article):
  89.         ans = BasicNewsRecipe.get_article_url(self, article).rpartition('?')[0]
  90.  
  91.         try:
  92.             self.log('Looking for full story link in', ans)
  93.             soup = self.index_to_soup(ans)
  94.             x = soup.find(text="single page")
  95.  
  96.             if x is not None:
  97.                 a = x.parent
  98.                 if a and a.has_key('href'):
  99.                     ans = 'http://www.latimes.com'+a['href']
  100.                     self.log('Found full story link', ans)
  101.         except:
  102.             pass
  103.         return ans
  104.  
  105.     def preprocess_html(self, soup):
  106.         for item in soup.findAll(style=True):
  107.             del item['style']
  108.         for item in soup.findAll('img'):
  109.             if not item.has_key('alt'):
  110.                item['alt'] = 'image'
  111.         for item in soup.findAll('a'):
  112.             limg = item.find('img')
  113.             if item.string is not None:
  114.                str = item.string
  115.                item.replaceWith(str)
  116.             else:
  117.                if limg:
  118.                   item.name  ='div'
  119.                   item.attrs =[]
  120.                else:
  121.                    str = self.tag_to_string(item)
  122.                    item.replaceWith(str)
  123.         return soup
  124.