home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / barrons.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  132 lines

  1. ##
  2. ##    web2lrf profile to download articles from Barrons.com
  3. ##    can download subscriber-only content if username and
  4. ##    password are supplied.
  5. ##
  6. '''
  7. '''
  8.  
  9. import re
  10.  
  11. from calibre.web.feeds.news import BasicNewsRecipe
  12.  
  13. class Barrons(BasicNewsRecipe):
  14.  
  15.         title = 'Barron\'s'
  16.         max_articles_per_feed = 50
  17.         needs_subscription    = True
  18.         language = 'en'
  19.  
  20.         __author__ = 'Kovid Goyal and Sujata Raman'
  21.         description = 'Weekly publication for investors from the publisher of the Wall Street Journal'
  22.         timefmt  = ' [%a, %b %d, %Y]'
  23.         use_embedded_content   = False
  24.         no_stylesheets = True
  25.         match_regexps = ['http://online.barrons.com/.*?html\?mod=.*?|file:.*']
  26.         conversion_options = {'linearize_tables': True}
  27.         ##delay = 1
  28.  
  29.         ## Don't grab articles more than 7 days old
  30.         oldest_article = 7
  31.  
  32.         extra_css = '''
  33.                     .datestamp{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
  34.                     h3{font-family:Georgia,"Times New Roman",Times,serif; }
  35.                     h2{font-family:Georgia,"Times New Roman",Times,serif; }
  36.                     h1{ font-family:Georgia,"Times New Roman",Times,serif; }
  37.                     .byline{font-family:Verdana,Geneva,Kalimati,sans-serif; font-size:x-small;}
  38.                     .subhead{font-family:Georgia,"Times New Roman",Times,serif; font-size: small;}
  39.                     .articlePage{ font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif;}
  40.                     .insettipUnit{font-size: x-small;}
  41.                     '''
  42.         remove_tags = [
  43.                            dict(name ='div', attrs={'class':['tabContainer artTabbedNav','rssToolBox hidden','articleToolbox']}),
  44.                            dict(name = 'a', attrs ={'class':'insetClose'})
  45.                         ]
  46.  
  47.         preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
  48.                 [
  49.                 ## Remove anything before the body of the article.
  50.                 (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
  51.  
  52.                 ## Remove any insets from the body of the article.
  53.                 (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'),
  54.  
  55.                 ## Remove any reprint info from the body of the article.
  56.                 (r'<hr size.*?<p', lambda match : '<p'),
  57.  
  58.                 ## Remove anything after the end of the article.
  59.                 (r'<!-- article end.*?</body>', lambda match : '</body>'),
  60.                 ]
  61.         ]
  62.  
  63.         def get_browser(self):
  64.             br = BasicNewsRecipe.get_browser()
  65.             if self.username is not None and self.password is not None:
  66.                 br.open('http://commerce.barrons.com/auth/login')
  67.                 br.select_form(name='login_form')
  68.                 br['user']   = self.username
  69.                 br['password'] = self.password
  70.                 br.submit()
  71.             return br
  72.  
  73.         ## Use the print version of a page when available.
  74.  
  75.         def print_version(self, url):
  76.                main, sep, rest = url.rpartition('?')
  77.                return main + '#printmode'
  78.  
  79.         def postprocess_html(self, soup, first):
  80.  
  81.                for tag in soup.findAll(name=['ul', 'li']):
  82.                     tag.name = 'div'
  83.                for tag in soup.findAll(name ='div', attrs={'id': "articleThumbnail_1"}):
  84.                   tag.extract()
  85.  
  86.                return soup
  87.  
  88. ## Comment out the feeds you don't want retrieved.
  89. ## Because these feeds are sorted alphabetically when converted to LRF, you may want to number them to put them in the order you desire
  90.  
  91.         def get_feeds(self):
  92.                 return  [
  93.                 ('This Week\'s Magazine', 'http://online.barrons.com/xml/rss/3_7510.xml'),
  94.                 ('Online Exclusives', 'http://online.barrons.com/xml/rss/3_7515.xml'),
  95.                 ('Companies', 'http://online.barrons.com/xml/rss/3_7516.xml'),
  96.                 ('Markets', 'http://online.barrons.com/xml/rss/3_7517.xml'),
  97.                 ('Technology', 'http://online.barrons.com/xml/rss/3_7518.xml'),
  98.                 ('Funds/Q&A', 'http://online.barrons.com/xml/rss/3_7519.xml'),
  99.                 ]
  100.  
  101.         def get_article_url(self, article):
  102.             return article.get('link', None)
  103.  
  104.  
  105.         def get_cover_url(self):
  106.             cover_url = None
  107.             index = 'http://online.barrons.com/home-page'
  108.             soup = self.index_to_soup(index)
  109.             link_item = soup.find('ul',attrs={'class':'newsItem barronsMag'})
  110.             if link_item:
  111.                cover_url = link_item.img['src']
  112.             return cover_url
  113.  
  114.  
  115.         ## Logout of website
  116.         ## NOT CURRENTLY WORKING
  117.         # def cleanup(self):
  118.             # try:
  119.                 # self.browser.set_debug_responses(True)
  120.                 # import sys, logging
  121.                 # logger = logging.getLogger("mechanize")
  122.                 # logger.addHandler(logging.StreamHandler(sys.stdout))
  123.                 # logger.setLevel(logging.INFO)
  124.  
  125.                 # res = self.browser.open('http://online.barrons.com/logout')
  126.             # except:
  127.                 # import traceback
  128.                 # traceback.print_exc()
  129.  
  130.  
  131.  
  132.