#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

'''
calibre recipe for slate.com
'''

import re

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag

class Slate(BasicNewsRecipe):
    # Method variables for customizing downloads
    description             = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__              = 'GRiker, Sujata Raman and Nick Redding'
    max_articles_per_feed   = 100
    oldest_article          = 14
    recursions              = 0
    delay                   = 0
    simultaneous_downloads  = 5
    timeout                 = 120.0
    timefmt                 = ''
    feeds                   = None
    no_stylesheets          = True
    encoding                = None
    language                = 'en'

    # True builds the complete current edition, False builds the weekly (dated) edition
    slate_complete = True
    if slate_complete:
        title = 'Slate (complete)'
    else:
        title = 'Slate (weekly)'

    # Method variables for customizing feed parsing
    summary_length          = 250
    use_embedded_content    = None

    # Method variables for pre/post processing of HTML
    preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
                                       re.DOTALL|re.IGNORECASE),
                            lambda match: ''),
                           (re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
                                       re.DOTALL|re.IGNORECASE),
                            lambda match: '') ]

    match_regexps           = []

    # The second entry is for 'Big Money', which comes from a different site and uses different markup
    keep_only_tags          = [dict(attrs={'id':['article_top', 'article_body']}),
                               dict(attrs={'id':['content']}) ]

    # The second entry is for 'Big Money', which comes from a different site and uses different markup
    remove_tags             = [dict(attrs={'id':['toolbox','recommend_tab','insider_ad_wrapper',
                                                 'article_bottom_tools_cntr','fray_article_discussion',
                                                 'fray_article_links','bottom_sponsored_links','author_bio',
                                                 'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
                                                 'comments_button','add_comments_button','comments-to-fray','marriott_ad',
                                                 'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
                               dict(attrs={'id':['content-top','service-links-bottom','hed']}) ]

    excludedDescriptionKeywords =   ['Slate V','Twitter feed','podcast']
    excludedTitleKeywords =         ['Gabfest','Slate V','on Twitter']
    excludedAuthorKeywords =        []
    excludedContentKeywords =       ['http://twitter.com/Slate']

    extra_css = '''
                 .h1_subhead    {font-family:Arial; font-size:small;}
                 h1             {font-family:Verdana; font-size:large;}
                 .byline        {font-family:Georgia; margin-bottom: 0px;}
                 .dateline      {font-family:Arial; font-size: smaller; height: 0pt;}
                 .imagewrapper  {font-family:Verdana; font-size:x-small;}
                 .source        {font-family:Verdana; font-size:x-small;}
                 .credit        {font-family:Verdana; font-size: smaller;}
                 #article_body  {font-family:Verdana;}
                 #content       {font-family:Arial;}
                 .caption       {font-family:Verdana; font-style:italic; font-size:x-small;}
                 h3             {font-family:Arial; font-size:small;}
                 '''

    # Local variables to extend class
    baseURL = 'http://slate.com'
    section_dates = []

    # Class extension methods
    def tag_to_strings(self, tag):
        # Like tag_to_string(), but returns a list of strings, one per direct child of the tag
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = self.tag_to_string(item, use_alt=False)
                if res:
                    strings.append(res)
        return strings

    def extract_named_sections(self):
        soup = self.index_to_soup(self.baseURL)
        soup_nav_bar = soup.find(True, attrs={'id':'nav'})
        briefing_nav = soup.find('li')
        briefing_url = briefing_nav.a['href']
        for section_nav in soup_nav_bar.findAll('li'):
            section_name = self.tag_to_string(section_nav, use_alt=False)
            self.section_dates.append(section_name)

        soup = self.index_to_soup(briefing_url)

        self.log("Briefing url = %s " % briefing_url)
        section_lists = soup.findAll('ul', 'view_links_list')

        sections = []
        for section in section_lists:
            sections.append(section)
        return sections

    def extract_dated_sections(self):
        soup = self.index_to_soup(self.baseURL)
        soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
        if soup_top_stories:
            self.section_dates.append("Top Stories")
            self.log("SELECTION TOP STORIES %s" % "Top Stories")

        soup = soup.find(True, attrs={'id':'toc_links_container'})

        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        self.section_dates.append(self.tag_to_string(todays_section, use_alt=False))
        self.log("SELECTION DATE %s" % self.tag_to_string(todays_section, use_alt=False))

        older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
        for older_section in older_section_dates:
            self.section_dates.append(self.tag_to_string(older_section, use_alt=False))
            self.log("SELECTION DATE %s" % self.tag_to_string(older_section, use_alt=False))

        if soup_top_stories:
            headline_stories = soup_top_stories
            self.log("HAVE top_stories")
        else:
            headline_stories = None
            self.log("NO top_stories")
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        if headline_stories:
            section_lists.insert(0, headline_stories)

        sections = []
        for section in section_lists:
            sections.append(section)
        return sections

    def extract_section_articles(self, sections_html):
        # Find the containers with section content
        sections = sections_html

        articles = {}
        key = None
        ans = []

        for (i, section) in enumerate(sections):

            # Get the section name
            if section.has_key('id'):
                self.log("PROCESSING SECTION id = %s" % section['id'])
                key = self.section_dates[i]
                if key.startswith("Pod"):
                    continue
                if key.startswith("Blog"):
                    continue
                articles[key] = []
                ans.append(key)
            elif self.slate_complete:
                key = self.section_dates[i]
                if key.startswith("Pod"):
                    continue
                if key.startswith("Blog"):
                    continue
                self.log("PROCESSING SECTION name = %s" % key)
                articles[key] = []
                ans.append(key)
            else:
                self.log("SECTION %d HAS NO id" % i)
                continue

            # Get the section article_list
            article_list = section.findAll('li')

            # Extract the article attributes
            for article in article_list:
                bylines = self.tag_to_strings(article)
                url = article.a['href']
                title = bylines[0]
                full_title = self.tag_to_string(article, use_alt=False)
                #self.log("ARTICLE TITLE %s" % title)
                #self.log("ARTICLE FULL_TITLE %s" % full_title)
                #self.log("URL %s" % url)
                author = None
                description = None
                pubdate = None

                if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0:
                    description = "A summary of what's in the major U.S. newspapers."

                if len(bylines) == 3:
                    author = bylines[2].strip()
                    author = author.replace('\r\n\t\t', '')
                    author = author.replace(',', '')
                    if bylines[1] is not None:
                        description = bylines[1]
                        full_byline = self.tag_to_string(article)
                        if full_byline.find('major U.S. newspapers') > 0:
                            description = "A summary of what's in the major U.S. newspapers."

                if len(bylines) > 3 and author is not None:
                    author += " | "
                    for (j, substring) in enumerate(bylines[3:]):
                        #print "substring: %s" % substring.encode('cp1252')
                        author += substring.strip()
                        if j < len(bylines[3:]) - 1:
                            author += " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None and len(self.excludedDescriptionKeywords):
                    excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                    found_excluded = excluded.search(description)
                    if found_excluded:
                        self.log("  >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose titles contain excluded keywords
                if full_title is not None and len(self.excludedTitleKeywords):
                    excluded = re.compile('|'.join(self.excludedTitleKeywords))
                    #self.log("evaluating full_title: %s" % full_title)
                    found_excluded = excluded.search(full_title)
                    if found_excluded:
                        self.log("  >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose authors contain excluded keywords
                if author is not None and len(self.excludedAuthorKeywords):
                    excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                    found_excluded = excluded.search(author)
                    if found_excluded:
                        self.log("  >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Check to make sure we're not adding a duplicate
                skip_this_article = False
                for existing in articles[key]:
                    if existing['url'] == url:
                        skip_this_article = True
                        self.log("SKIPPING DUP %s" % url)
                        break

                if skip_this_article:
                    continue

                # Build the dictionary entry for this article
                feed = key
                if not articles.has_key(feed):
                    articles[feed] = []
                articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
                                           author=author, content=''))
                #self.log("KEY %s" % feed)
                #self.log("APPENDED %s" % url)

            # Promote 'newspapers' to top
            for (j, article) in enumerate(articles[key]):
                if article['description'] is not None:
                    if article['description'].find('newspapers') > 0:
                        articles[key].insert(0, articles[key].pop(j))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans

    def print_version(self, url):
        return url + 'pagenum/all/'

    # Class methods
    def parse_index(self):
        if self.slate_complete:
            sections = self.extract_named_sections()
        else:
            sections = self.extract_dated_sections()
        section_list = self.extract_section_articles(sections)
        return section_list

    def get_masthead_url(self):
        masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
        br = self.get_browser()
        try:
            br.open(masthead)
        except:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def stripAnchors(self, soup):
        body = soup.find('div', attrs={'id':['article_body','content']})
        if body is not None:
            paras = body.findAll('p')
            if paras is not None:
                for para in paras:
                    aTags = para.findAll('a')
                    if aTags is not None:
                        for a in aTags:
                            if a.img is None:
                                #print repr(a.renderContents())
                                a.replaceWith(a.renderContents().decode('utf-8','replace'))
        return soup

    def preprocess_html(self, soup):

        # Remove 'grayPlus4.png' images
        imgs = soup.findAll('img')
        if imgs is not None:
            for img in imgs:
                if re.search("grayPlus4.png", str(img)):
                    img.extract()

        # Delete article based upon content keywords
        if len(self.excludedContentKeywords):
            excluded = re.compile('|'.join(self.excludedContentKeywords))
            found_excluded = excluded.search(str(soup))
            if found_excluded:
                self.log("No allowed content found, removing article")
                raise Exception('Rejected article')

        # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
        head = soup.find('head')
        if head.link is not None and re.search(r'www\.thebigmoney\.com', str(head)):
            byline = soup.find('div', attrs={'id':'byline'})
            if byline is not None:
                byline['class'] = byline['id']

            dateline = soup.find('div', attrs={'id':'dateline'})
            if dateline is not None:
                dateline['class'] = dateline['id']

            body = soup.find('div', attrs={'id':'content'})
            if body is not None:
                body['class'] = 'article_body'

            # Synthesize a department kicker
            h3Tag = Tag(soup, 'h3')
            emTag = Tag(soup, 'em')
            emTag.insert(0, NavigableString("the big money: Today's business press"))
            h3Tag.insert(0, emTag)
            soup.body.insert(0, h3Tag)

        # Strip anchors from HTML
        return self.stripAnchors(soup)

    def postprocess_html(self, soup, first_fetch):

        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
        if dept_kicker is not None:
            kicker_strings = self.tag_to_strings(dept_kicker)
            kicker = ''.join(kicker_strings[2:])
            kicker = kicker.replace('.', '')
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            emTag.insert(0, NavigableString(kicker))
            h3Tag.insert(0, emTag)
            dept_kicker.replaceWith(h3Tag)
        else:
            self.log("No kicker--return null")
            return None

        # Fix up the concatenated byline and dateline
        byline = soup.find(True, attrs={'class':'byline'})
        if byline is not None:
            bylineTag = Tag(soup, 'div')
            bylineTag['class'] = 'byline'
            #bylineTag['height'] = '0em'
            bylineTag.insert(0, self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None:
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            #datelineTag['margin-top'] = '0em'
            datelineTag.insert(0, self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}):
            if caption is not None:
                emTag = Tag(soup, "em")
                emTag.insert(0, '<br />' + self.tag_to_string(caption))
                hrTag = Tag(soup, 'hr')
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        # Fix photos
        for photo in soup.findAll('span', attrs={'class':'imagewrapper'}):
            if photo.a is not None and photo.a.img is not None:
                divTag = Tag(soup, 'div')
                divTag['class'] = 'imagewrapper'
                divTag.insert(0, photo.a.img)
                photo.replaceWith(divTag)

        return soup

    def postprocess_book(self, oeb, opts, log):

        def extract_byline(href):
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find(True, attrs={'class':'byline'})
            if byline is not None:
                return self.tag_to_string(byline, use_alt=False)
            else:
                return None

        def extract_description(href):
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            paragraphs = soup.findAll('p')
            for p in paragraphs:
                if self.tag_to_string(p, use_alt=False).startswith('By ') or \
                   self.tag_to_string(p, use_alt=False).startswith('Posted '):
                    continue
                comment = p.find(text=lambda text: isinstance(text, Comment))
                if comment is not None:
                    continue
                else:
                    return self.tag_to_string(p, use_alt=False)[:self.summary_length] + '...'

            return None

        # Method entry point here
        # Single section toc looks different than multi-section tocs
        if oeb.toc.depth() == 2:
            for article in oeb.toc:
                if article.author is None:
                    article.author = extract_byline(article.href)
                if article.description is None:
                    article.description = extract_description(article.href)
        elif oeb.toc.depth() == 3:
            for section in oeb.toc:
                for article in section:
                    if article.author is None:
                        article.author = extract_byline(article.href)
                    if article.description is None:
                        article.description = extract_description(article.href)
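
# A quick way to exercise this recipe outside the calibre GUI is calibre's own
# ebook-convert tool. The command below is only a sketch: it assumes this file has
# been saved locally as 'slate.recipe' and that calibre's command line tools are on
# the PATH. The --test switch limits the download to a couple of articles per feed,
# and -vv prints verbose logs (including the self.log() messages above):
#
#   ebook-convert slate.recipe slate.epub --test -vv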
  452.