home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / wsj_free.recipe < prev    next >
Text File  |  2011-09-09  |  8KB  |  170 lines

  1. #!/usr/bin/env  python
  2. __license__   = 'GPL v3'
  3. __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
  4. __docformat__ = 'restructuredtext en'
  5.  
  6. from calibre.web.feeds.news import BasicNewsRecipe
  7. import copy
  8.  
  9. class WallStreetJournal(BasicNewsRecipe):
  10.  
  11.     title = 'Wall Street Journal (free)'
  12.     __author__ = 'Kovid Goyal, Sujata Raman, Joshua Oster-Morris, Starson17'
  13.     description = '''News and current affairs. This recipe only fetches complete
  14.     versions of the articles that are available free on the wsj.com website.
  15.     To get the rest of the articles, subscribe to the WSJ and use the other WSJ
  16.     recipe.'''
  17.     language = 'en'
  18.     cover_url           = 'http://dealbreaker.com/images/thumbs/Wall%20Street%20Journal%20A1.JPG'
  19.     max_articles_per_feed = 1000
  20.     timefmt  = ' [%a, %b %d, %Y]'
  21.     no_stylesheets = True
  22.  
  23.     extra_css      = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
  24.                     h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
  25.                     .subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
  26.                     .insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
  27.                     .targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
  28.                     .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
  29.                     .tagline {color:#333333; font-size:xx-small}
  30.                     .dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
  31.                         h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
  32.                         .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
  33.                         h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
  34.                     .paperLocation{color:#666666; font-size:xx-small}'''
  35.  
  36.     remove_tags_before = dict(name='h1')
  37.     remove_tags = [
  38.                     dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow"]),
  39.                     {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
  40.                     dict(name='div', attrs={'data-flash-settings':True}),
  41.                     {'class':['insetContent embedType-interactive insetCol3wide','insetCol6wide','insettipUnit']},
  42.                     dict(rel='shortcut icon'),
  43.                     {'class':lambda x: x and 'sTools' in x},
  44.                     ]
  45.     remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
  46.  
  47.     def postprocess_html(self, soup, first):
  48.         for tag in soup.findAll(name=['table', 'tr', 'td']):
  49.             tag.name = 'div'
  50.  
  51.         for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
  52.             tag.extract()
  53.  
  54.         return soup
  55.  
  56.     def abs_wsj_url(self, href):
  57.         if not href.startswith('http'):
  58.             href = 'http://online.wsj.com' + href
  59.         return href
  60.  
  61.  
  62.     def wsj_get_index(self):
  63.         return self.index_to_soup('http://online.wsj.com/itp')
  64.  
  65.     def wsj_add_feed(self,feeds,title,url):
  66.         self.log('Found section:', title)
  67.         try:
  68.             if url.endswith('whatsnews'):
  69.                 articles = self.wsj_find_wn_articles(url)
  70.             else:
  71.                 articles = self.wsj_find_articles(url)
  72.         except:
  73.             articles = []
  74.         if articles:
  75.            feeds.append((title, articles))
  76.         return feeds
  77.  
  78.     def parse_index(self):
  79.         soup = self.wsj_get_index()
  80.  
  81.         date = soup.find('span', attrs={'class':'date-date'})
  82.         if date is not None:
  83.             self.timefmt = ' [%s]'%self.tag_to_string(date)
  84.  
  85.         feeds = []
  86.         div = soup.find('div', attrs={'class':'itpHeader'})
  87.         div = div.find('ul', attrs={'class':'tab'})
  88.         for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
  89.             pageone = a['href'].endswith('pageone')
  90.             if pageone:
  91.                title = 'Front Section'
  92.                url = self.abs_wsj_url(a['href'])
  93.                feeds = self.wsj_add_feed(feeds,title,url)
  94.                title = 'What''s News'
  95.                url = url.replace('pageone','whatsnews')
  96.                feeds = self.wsj_add_feed(feeds,title,url)
  97.             else:
  98.                title = self.tag_to_string(a)
  99.                url = self.abs_wsj_url(a['href'])
  100.                feeds = self.wsj_add_feed(feeds,title,url)
  101.         return feeds
  102.  
  103.     def wsj_find_wn_articles(self, url):
  104.         soup = self.index_to_soup(url)
  105.         articles = []
  106.  
  107.         whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
  108.         if whats_news is not None:
  109.           for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
  110.             container = a.findParent(['p'])
  111.             meta = a.find(attrs={'class':'meta_sectionName'})
  112.             if meta is not None:
  113.                 meta.extract()
  114.             title = self.tag_to_string(a).strip()
  115.             url = a['href']
  116.             desc = ''
  117.             if container is not None:
  118.                 desc = self.tag_to_string(container)
  119.  
  120.             articles.append({'title':title, 'url':url,
  121.                 'description':desc, 'date':''})
  122.  
  123.             self.log('\tFound WN article:', title)
  124.  
  125.         return articles
  126.  
  127.     def wsj_find_articles(self, url):
  128.         soup = self.index_to_soup(url)
  129.  
  130.         whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
  131.         if whats_news is not None:
  132.            whats_news.extract()
  133.  
  134.         articles = []
  135.  
  136.         flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
  137.         if flavorarea is not None:
  138.            flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
  139.            if flavorstory is not None:
  140.               flavorstory['class'] = 'mjLinkItem'
  141.               metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
  142.               if metapage is not None:
  143.                  flavorstory.append( copy.copy(metapage) ) #metapage should always be A1 because that should be first on the page
  144.  
  145.         for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
  146.             container = a.findParent(['li', 'div'])
  147.             meta = a.find(attrs={'class':'meta_sectionName'})
  148.             if meta is not None:
  149.                 meta.extract()
  150.                 meta = self.tag_to_string(meta).strip()
  151.             if meta:
  152.                 title = self.tag_to_string(a).strip() + ' [%s]'%meta
  153.             else:
  154.                 title = self.tag_to_string(a).strip()
  155.             url = self.abs_wsj_url(a['href'])
  156.             desc = ''
  157.             for p in container.findAll('p'):
  158.                 desc = self.tag_to_string(p)
  159.                 if not 'Subscriber Content' in desc:
  160.                     break
  161.  
  162.             articles.append({'title':title, 'url':url,
  163.                 'description':desc, 'date':''})
  164.  
  165.             self.log('\tFound article:', title)
  166.  
  167.         return articles
  168.  
  169.  
  170.