#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import re, string, time
from calibre import entity_to_unicode, strftime
from datetime import timedelta, date
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


class NYTimes(BasicNewsRecipe):

    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
    headlinesOnly = True

    # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
    # number of days old an article can be for inclusion. If oldest_article = 0 all articles
    # will be included. Note: oldest_article is ignored if webEdition = False
    webEdition = False
    oldest_article = 7

    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
    #    includeSections = ['Politics','Sports']
    #
    # would cause only the Politics and Sports sections to be included.

    includeSections = []  # by default, all sections included

    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
    # Otherwise, the sections named will be excluded. For example,
    #
    #    excludeSections = ['Politics','Sports']
    #
    # would cause the Politics and Sports sections to be excluded. This parameter can be used
    # in conjunction with includeSections, although in most cases using one or the other,
    # but not both, is sufficient.

    excludeSections = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists).  If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = False

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 100

    # Whether to omit duplicates of articles (typically arising when articles are indexed in
    # more than one section). If True, only the first occurrence will be downloaded.
    filterDuplicates = True

    # Sections to collect for the Web edition.
    # Delete any you don't want, or use includeSections or excludeSections
    web_sections = [(u'World',u'world'),
                    (u'U.S.',u'national'),
                    (u'Politics',u'politics'),
                    (u'New York',u'nyregion'),
                    (u'Business',u'business'),
                    (u'Technology',u'technology'),
                    (u'Sports',u'sports'),
                    (u'Science',u'science'),
                    (u'Health',u'health'),
                    (u'Opinion',u'opinion'),
                    (u'Arts',u'arts'),
                    (u'Books',u'books'),
                    (u'Movies',u'movies'),
                    (u'Music',u'arts/music'),
                    (u'Television',u'arts/television'),
                    (u'Style',u'style'),
                    (u'Dining & Wine',u'dining'),
                    (u'Fashion & Style',u'fashion'),
                    (u'Home & Garden',u'garden'),
                    (u'Travel',u'travel'),
                    (u'Education',u'education'),
                    (u'Multimedia',u'multimedia'),
                    (u'Obituaries',u'obituaries'),
                    (u'Sunday Magazine',u'magazine'),
                    (u'Week in Review',u'weekinreview')]


    if headlinesOnly:
        title='New York Times Headlines'
        description = 'Headlines from the New York Times. Needs a subscription from http://www.nytimes.com'
        needs_subscription = 'optional'
    elif webEdition:
        title='New York Times (Web)'
        description = 'New York Times on the Web'
        needs_subscription = True
    else:
        title='New York Times'
        description = 'Today\'s New York Times'
        needs_subscription = True


    month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']

    def decode_us_date(self,datestr):
        udate = datestr.strip().lower().split()
        try:
            m = self.month_list.index(udate[0])+1
        except:
            return date.today()
        d = int(udate[1])
        y = int(udate[2])
        try:
            d = date(y,m,d)
        except:
            d = date.today()
        return d
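    # Example (illustrative input): a Web-edition dateline such as
    # "Published: November 3, 2011" is reduced by the caller to "November 3 2011",
    # and decode_us_date('November 3 2011') returns datetime.date(2011, 11, 3);
    # anything unparseable falls back to today's date.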

    earliest_date = date.today() - timedelta(days=oldest_article)

    __author__  = 'GRiker/Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 7, 5)


    timefmt = ''
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    cover_margins = (18,18,'grey99')

    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':[
                            'articleFooter',
                            'articleTools',
                            'columnGroup doubleRule',
                            'columnGroup singleRule',
                            'columnGroup last',
                            'columnGroup  last',
                            'doubleRule',
                            'dottedLine',
                            'entry-meta',
                            'entry-response module',
                            'icon enlargeThis',
                            'leftNavTabs',
                            'metaFootnote',
                            'module box nav',
                            'nextArticleLink',
                            'nextArticleLink clearfix',
                            'post-tools',
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
                            re.compile('^subNavigation'),
                            re.compile('^leaderboard'),
                            re.compile('^module'),
                            ]}),
                   dict(id=[
                            'adxLeaderboard',
                            'adxSponLink',
                            'archive',
                            'articleExtras',
                            'articleInline',
                            'blog_sidebar',
                            'businessSearchBar',
                            'cCol',
                            'entertainmentSearchBar',
                            'footer',
                            'header',
                            'header_search',
                            'inlineBox',
                            'login',
                            'masthead',
                            'masthead-nav',
                            'memberTools',
                            'navigation',
                            'portfolioInline',
                            'readerReviews',
                            'readerReviewsCount',
                            'relatedArticles',
                            'relatedTopics',
                            'respond',
                            'side_search',
                            'side_index',
                            'side_tool',
                            'toolsRight',
                            ]),
                   dict(name=['script', 'noscript', 'style','form','hr'])]
    no_stylesheets = True
    extra_css = '''
                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { text-align: left; font-size: small; }
                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }
                .articleBody { }
                .authorId {text-align: left; }
                .image {text-align: center;}
                .source {text-align: left; }'''


    articles = {}
    key = None
    ans = []
    url_list = []

    def filter_ans(self, ans) :
        total_article_count = 0
        idx = 0
        idx_max = len(ans)-1
        while idx <= idx_max:
            if self.includeSections != []:
                if ans[idx][0] not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",ans[idx][0]
                    del ans[idx]
                    idx_max = idx_max-1
                    continue
            if ans[idx][0] in self.excludeSections:
                print "SECTION EXCLUDED: ",ans[idx][0]
                del ans[idx]
                idx_max = idx_max-1
                continue
            if self.verbose:
                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
            for article in ans[idx][1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                              article['url'].encode('cp1252','replace')))
            idx = idx+1

        self.log( "Queued %d articles" % total_article_count )
        return ans

    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
        if not url.endswith(".html"):
            return True
        if 'nytimes.com' not in url:
            return True
        if 'podcast' in url:
            return True
        if '/video/' in url:
            return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
            return True
        if '/interactive/' in url:
            return True
        if '/reference/' in url:
            return True
        if '/premium/' in url:
            return True
        return False
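    # Example (hypothetical URLs): a link such as
    # http://www.nytimes.com/2011/11/09/world/europe/sample.html passes every test
    # above and is kept, while podcast, video, slideshow, interactive, reference
    # and off-site links all return True and are skipped by the index parsers.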

    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91",u'\u2018',string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92",u'\u2019',fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93",u'\u201c',fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94",u'\u201d',fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96",u'\u2013',fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97",u'\u2014',fixed)

        return fixed
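    # Worked example (illustrative): fixChars(u'Jobs\x92 legacy') returns
    # u'Jobs\u2019 legacy', mapping the stray Windows-1252 smart-quote codepoint
    # to its Unicode equivalent.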

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.form = br.forms().next()
            br['userid']   = self.username
            br['password'] = self.password
            raw = br.submit().read()
            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
        return br

    def skip_ad_pages(self, soup):
        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            return self.index_to_soup(url, raw=True)

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/nytfrontpage/scan.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nCover unavailable")
            cover = None
        return cover
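    # Example (illustrative date): on 9 November 2011 the constructed URL is
    # http://graphics8.nytimes.com/images/2011/11/09/nytfrontpage/scan.jpg;
    # if that day's front-page scan cannot be opened, no cover is used.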

    def short_title(self):
        return self.title

    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
        deals with various page encodings between index and articles
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
                f = self.browser.open(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
            else:
                _raw = url_or_raw
            if raw:
                return _raw

            if not isinstance(_raw, unicode) and self.encoding:
                _raw = _raw.decode(docEncoding, 'replace')
            massage = list(BeautifulSoup.MARKUP_MASSAGE)
            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
        soup = get_the_soup( self.encoding, url_or_raw )
        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
        if docEncoding == '' :
            docEncoding = self.encoding

        if self.verbose > 2:
            self.log( "  document encoding: '%s'" % docEncoding)
        if docEncoding != self.encoding :
            soup = get_the_soup(docEncoding, url_or_raw)

        return soup

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&","&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description

    def feed_title(self,div):
        return ''.join(div.findAll(text=True, recursive=True)).strip()

    def handle_article(self,div):
        thumbnail = div.find('div','thumbnail')
        if thumbnail:
            thumbnail.extract()
        a = div.find('a', href=True)
        if not a:
            return
        url = re.sub(r'\?.*', '', a['href'])
        if self.exclude_url(url):
            return
        url += '?pagewanted=all'
        if self.filterDuplicates:
            if url in self.url_list:
                return
        self.url_list.append(url)
        title = self.tag_to_string(a, use_alt=True).strip()
        description = ''
        pubdate = strftime('%a, %d %b')
        summary = div.find(True, attrs={'class':'summary'})
        if summary:
            description = self.tag_to_string(summary, use_alt=False)
        author = ''
        authorAttribution = div.find(True, attrs={'class':'byline'})
        if authorAttribution:
            author = self.tag_to_string(authorAttribution, use_alt=False)
        else:
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
        feed = self.key if self.key is not None else 'Uncategorized'
        if not self.articles.has_key(feed):
            self.ans.append(feed)
            self.articles[feed] = []
        self.articles[feed].append(
                        dict(title=title, url=url, date=pubdate,
                            description=description, author=author,
                            content=''))


    def parse_web_edition(self):

        for (sec_title,index_url) in self.web_sections:
            if self.includeSections != []:
                if sec_title not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",sec_title
                    continue
            if sec_title in self.excludeSections:
                print "SECTION EXCLUDED: ",sec_title
                continue
            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
            self.key = sec_title
            # Find each article
            for div in soup.findAll(True,
                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
                if div['class'] in ['story', 'story headline'] :
                    self.handle_article(div)
                elif div['class'] == 'headlinesOnly multiline flush':
                    for lidiv in div.findAll('li'):
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)


    def parse_todays_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

        skipping = False
        # Find each article
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                self.key = string.capwords(self.feed_title(div))
                self.key = self.key.replace('Op-ed','Op-Ed')
                self.key = self.key.replace('U.s.','U.S.')
                self.key = self.key.replace('N.y.','N.Y.')
                skipping = False
                if self.includeSections != []:
                    if self.key not in self.includeSections:
                        print "SECTION NOT INCLUDED: ",self.key
                        skipping = True
                if self.key in self.excludeSections:
                    print "SECTION EXCLUDED: ",self.key
                    skipping = True

            elif div['class'] in ['story', 'story headline'] :
                if not skipping:
                    self.handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    if not skipping:
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)

    def parse_headline_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the content table
        content_table = soup.find('table',{'id':'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None

        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections

        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):

                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)

                    if section_name == '':
                        continue
                    if self.includeSections != []:
                        if section_name not in self.includeSections:
                            print "SECTION NOT INCLUDED: ",section_name
                            continue
                    if section_name in self.excludeSections:
                        print "SECTION EXCLUDED: ",section_name
                        continue

                    section_name=string.capwords(section_name)
                    section_name = section_name.replace('Op-ed','Op-Ed')
                    section_name = section_name.replace('U.s.','U.S.')
                    section_name = section_name.replace('N.y.','N.Y.')
                    pubdate = strftime('%a, %d %b')

                    search_div = div_sec
                    for next_tag in h6_sec_name.findNextSiblings(True):
                        if next_tag.__class__.__name__ == 'Tag':
                            if next_tag.name == 'div':
                                search_div = next_tag
                            break

                    # Get the articles
                    for h3_item in search_div.findAll('h3'):
                        byline = h3_item.h6
                        if byline is not None:
                            author = self.tag_to_string(byline,use_alt=False)
                        else:
                            author = ''
                        a = h3_item.find('a', href=True)
                        if not a:
                            continue
                        url = re.sub(r'\?.*', '', a['href'])
                        if self.exclude_url(url):
                            continue
                        url += '?pagewanted=all'
                        if self.filterDuplicates:
                            if url in self.url_list:
                                continue
                        self.url_list.append(url)
                        title = self.tag_to_string(a, use_alt=True).strip()
                        desc = h3_item.find('p')
                        if desc is not None:
                            description = self.tag_to_string(desc,use_alt=False)
                        else:
                            description = ''
                        if not self.articles.has_key(section_name):
                            self.ans.append(section_name)
                            self.articles[section_name] = []
                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
            return self.parse_headline_index()
        elif self.webEdition:
            return self.parse_web_edition()
        else:
            return self.parse_todays_index()

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup


    def preprocess_html(self, soup):

        if self.webEdition and (self.oldest_article > 0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
                date_str = self.tag_to_string(date_tag,use_alt=False)
                date_str = date_str.replace('Published:','')
                date_items = date_str.split(',')
                try:
                    datestring = date_items[0]+' '+date_items[1]
                    article_date = self.decode_us_date(datestring)
                except:
                    article_date = date.today()
                if article_date < self.earliest_date:
                    self.log("Skipping article dated %s" % date_str)
                    return None

        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag: # remove Op-Ed author head shots
            tagline = self.tag_to_string(kicker_tag)
            if tagline=='Op-Ed Columnist':
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
        return self.strip_anchors(soup)

    def postprocess_html(self, soup, first_fetch):
        try:
            if self.one_picture_per_article:
                # Remove all images after first
                largeImg = soup.find(True, {'class':'articleSpanImage'})
                inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
                if largeImg:
                    for inlineImg in inlineImgs:
                        inlineImg.extract()
                else:
                    if inlineImgs:
                        firstImg = inlineImgs[0]
                        for inlineImg in inlineImgs[1:]:
                            inlineImg.extract()
                        # Move firstImg before article body
                        cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                        if cgFirst:
                            # Strip all sibling NavigableStrings: noise
                            navstrings = cgFirst.findAll(text=True, recursive=False)
                            [ns.extract() for ns in navstrings]
                            headline_found = False
                            tag = cgFirst.find(True)
                            insertLoc = 0
                            while True:
                                insertLoc += 1
                                if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                                tag = tag.nextSibling
                                if not tag:
                                    headline_found = False
                                    break
                            if headline_found:
                                cgFirst.insert(insertLoc,firstImg)
                        else:
                            self.log(">>> No class:'columnGroup first' found <<<")
        except:
            self.log("ERROR: One picture per article in postprocess_html")

        try:
            # Change captions to italic
            for caption in soup.findAll(True, {'class':'caption'}) :
                if caption and len(caption) > 0:
                    cTag = Tag(soup, "p", [("class", "caption")])
                    c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                    mp_off = c.find("More Photos")
                    if mp_off >= 0:
                        c = c[:mp_off]
                    cTag.insert(0, c)
                    caption.replaceWith(cTag)
        except:
            self.log("ERROR:  Problem in change captions to italic")

        try:
            # Change <nyt_headline> to <h2>
            h1 = soup.find('h1')
            if h1:
                headline = h1.find("nyt_headline")
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(headline.contents[0]))
                    h1.replaceWith(tag)
            else:
                # Blog entry - replace headline, remove <hr> tags
                headline = soup.find('title')
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(headline.contents[0]))
                    soup.insert(0, tag)
                    hrs = soup.findAll('hr')
                    for hr in hrs:
                        hr.extract()
        except:
            self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")

        try:
            # Change <h1> to <h3> - used in editorial blogs
            masthead = soup.find("h1")
            if masthead:
                # Nuke the href
                if masthead.a:
                    del(masthead.a['href'])
                tag = Tag(soup, "h3")
                tag.insert(0, self.fixChars(masthead.contents[0]))
                masthead.replaceWith(tag)
        except:
            self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")

        try:
            # Change <span class="bold"> to <b>
            for subhead in soup.findAll(True, {'class':'bold'}) :
                if subhead.contents:
                    bTag = Tag(soup, "b")
                    bTag.insert(0, subhead.contents[0])
                    subhead.replaceWith(bTag)
        except:
            self.log("ERROR:  Problem in Change <span class=bold> to <b>")

        try:
            divTag = soup.find('div',attrs={'id':'articleBody'})
            if divTag:
                divTag['class'] = divTag['id']
        except:
            self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")

        try:
            # Add class="authorId" to <div> so we can format with CSS
            divTag = soup.find('div',attrs={'id':'authorId'})
            if divTag and divTag.contents[0]:
                tag = Tag(soup, "p")
                tag['class'] = "authorId"
                tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                                 use_alt=False)))
                divTag.replaceWith(tag)
        except:
            self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")

        return soup

    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            paras = articlebody.findAll('p')
                            for p in paras:
                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                                # account for blank paragraphs and short paragraphs by appending them to longer ones
                                if len(refparagraph) > 0:
                                    if len(refparagraph) > 70: # approximately one line of text
                                        article.summary = article.text_summary = shortparagraph + refparagraph
                                        return
                                    else:
                                        shortparagraph = refparagraph + " "
                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                            shortparagraph = shortparagraph + "- "
        except:
            self.log("Error creating article descriptions")
            return