
#!/usr/bin/env  python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import re, string, time
from calibre import entity_to_unicode, strftime
from datetime import timedelta, date
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup


class NYTimes(BasicNewsRecipe):

    # set headlinesOnly to True for the headlines-only version. If True, webEdition is ignored.
    headlinesOnly = False

    # set webEdition to True for the Web edition of the newspaper. Set oldest_article to the
    # number of days old an article can be for inclusion. If oldest_article = 0 all articles
    # will be included. Note: oldest_article is ignored if webEdition = False
    webEdition = False
    oldest_article = 7

    # replaceKindleVersion: if True, the title is changed to "The New York Times" so that
    # previous paid Kindle versions of the New York Times are sent to the back issues folder on the Kindle
    replaceKindleVersion = False

    # download higher resolution images than the small thumbnails typically included in the article
    # the downside of having large, beautiful images is that the file size is much larger, on the order of 7MB per paper
    useHighResImages = True

    # includeSections: List of sections to include. If empty, all sections found will be included.
    # Otherwise, only the sections named will be included. For example,
    #
    #    includeSections = ['Politics','Sports']
    #
    # would cause only the Politics and Sports sections to be included.

    includeSections = []  # by default, all sections included

    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
    # Otherwise, the sections named will be excluded. For example,
    #
    #    excludeSections = ['Politics','Sports']
    #
    # would cause the Politics and Sports sections to be excluded. This parameter can be used
    # in conjunction with includeSections, although in most cases using one or the other, but
    # not both, is sufficient.

    excludeSections = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists).  If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = False

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 100

    # Whether to omit duplicates of articles (typically arising when articles are indexed in
    # more than one section). If True, only the first occurrence will be downloaded.
    filterDuplicates = True

    # Sections to collect for the Web edition.
    # Delete any you don't want, or use includeSections or excludeSections
    web_sections = [(u'World',u'world'),
                    (u'U.S.',u'national'),
                    (u'Politics',u'politics'),
                    (u'New York',u'nyregion'),
                    (u'Business',u'business'),
                    (u'Technology',u'technology'),
                    (u'Sports',u'sports'),
                    (u'Science',u'science'),
                    (u'Health',u'health'),
                    (u'Opinion',u'opinion'),
                    (u'Arts',u'arts'),
                    (u'Books',u'books'),
                    (u'Movies',u'movies'),
                    (u'Music',u'arts/music'),
                    (u'Television',u'arts/television'),
                    (u'Style',u'style'),
                    (u'Dining & Wine',u'dining'),
                    (u'Fashion & Style',u'fashion'),
                    (u'Home & Garden',u'garden'),
                    (u'Travel',u'travel'),
                    (u'Education',u'education'),
                    (u'Multimedia',u'multimedia'),
                    (u'Obituaries',u'obituaries'),
                    (u'Sunday Magazine',u'magazine'),
                    (u'Week in Review',u'weekinreview')]

    if headlinesOnly:
        title='New York Times Headlines'
        description = 'Headlines from the New York Times'
        needs_subscription = True
    elif webEdition:
        title='New York Times (Web)'
        description = 'New York Times on the Web'
        needs_subscription = True
    elif replaceKindleVersion:
        title='The New York Times'
        description = 'Today\'s New York Times'
        needs_subscription = True
    else:
        title='New York Times'
        description = 'Today\'s New York Times. Needs subscription from http://www.nytimes.com'
        needs_subscription = True


    month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']

    def decode_us_date(self,datestr):
        udate = datestr.strip().lower().split()
        try:
            m = self.month_list.index(udate[0])+1
        except:
            return date.today()
        d = int(udate[1])
        y = int(udate[2])
        try:
            d = date(y,m,d)
        except:
            d = date.today()
        return d

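    # Rough usage sketch (the date strings assembled in preprocess_html look like
    # "September 9 2011"):
    #   decode_us_date('September 9 2011')  ->  datetime.date(2011, 9, 9)
    # Anything unparseable falls back to today's date.
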
    earliest_date = date.today() - timedelta(days=oldest_article)

    __author__  = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier'
    language = 'en'
    requires_version = (0, 7, 5)


    timefmt = ''
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
    cover_margins = (18,18,'grey99')

    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':[
                            'articleFooter',
                            'articleTools',
                            'columnGroup doubleRule',
                            'columnGroup singleRule',
                            'columnGroup last',
                            'columnGroup  last',
                            'doubleRule',
                            'dottedLine',
                            'entry-meta',
                            'entry-response module',
                            #'icon enlargeThis', #removed to provide option for high res images
                            'leftNavTabs',
                            'metaFootnote',
                            'module box nav',
                            'nextArticleLink',
                            'nextArticleLink clearfix',
                            'post-tools',
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
                            'entry entry-utility', #added for DealBook
                            'entry-tags', #added for DealBook
                            'footer promos clearfix', #added for DealBook
                            'footer links clearfix', #added for DealBook
                            'tabsContainer', #added for other blog downloads
                            'column lastColumn', #added for other blog downloads
                            'pageHeaderWithLabel', #added for other gadgetwise downloads
                            'column two', #added for other blog downloads
                            'column two last', #added for other blog downloads
                            'column three', #added for other blog downloads
                            'column three last', #added for other blog downloads
                            'column four',#added for other blog downloads
                            'column four last',#added for other blog downloads
                            'column last', #added for other blog downloads
                            'timestamp published', #added for other blog downloads
                            'entry entry-related',
                            'subNavigation tabContent active', #caucus blog navigation
                            'columnGroup doubleRule',
                            'mediaOverlay slideshow',
                            'headlinesOnly multiline flush',
                            'wideThumb',
                            'video', #added 02-11-2011
                            'videoHeader',#added 02-11-2011
                            'articleInlineVideoHolder', #added 02-11-2011
                            'assetCompanionAd',
                            re.compile('^subNavigation'),
                            re.compile('^leaderboard'),
                            re.compile('^module'),
                            ]}),
                   dict(id=[
                            'adxLeaderboard',
                            'adxSponLink',
                            'archive',
                            'articleExtras',
                            'articleInline',
                            'blog_sidebar',
                            'businessSearchBar',
                            'cCol',
                            'entertainmentSearchBar',
                            'footer',
                            'header',
                            'header_search',
                            'inlineBox',
                            'login',
                            'masthead',
                            'masthead-nav',
                            'memberTools',
                            'navigation',
                            'portfolioInline',
                            'readerReviews',
                            'readerReviewsCount',
                            'relatedArticles',
                            'relatedTopics',
                            'respond',
                            'side_search',
                            'side_index',
                            'side_tool',
                            'toolsRight',
                            'skybox', #added for DealBook
                            'TopAd', #added for DealBook
                            'related-content', #added for DealBook
                            ]),
                   dict(name=['script', 'noscript', 'style','form','hr'])]
    no_stylesheets = True
    extra_css = '''
                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { text-align: left; font-size: small; }
                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }
                .articleBody { }
                .authorId {text-align: left; }
                .image {text-align: center;}
                .source {text-align: left; }'''


    articles = {}
    key = None
    ans = []
    url_list = []

    def filter_ans(self, ans) :
        total_article_count = 0
        idx = 0
        idx_max = len(ans)-1
        while idx <= idx_max:
            if self.includeSections != []:
                if ans[idx][0] not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",ans[idx][0]
                    del ans[idx]
                    idx_max = idx_max-1
                    continue
            if ans[idx][0] in self.excludeSections:
                print "SECTION EXCLUDED: ",ans[idx][0]
                del ans[idx]
                idx_max = idx_max-1
                continue
            if self.verbose:
                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
            for article in ans[idx][1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                              article['url'].encode('cp1252','replace')))
            idx = idx+1

        self.log( "Queued %d articles" % total_article_count )
        return ans

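    # Sketch of the expected data (illustrative): filter_ans() receives the list of
    # (section_name, article_list) tuples built by the parse_* methods, e.g.
    #   [('World', [...]), ('Sports', [...])]
    # drops sections according to includeSections/excludeSections, logs the counts,
    # and returns the filtered list.
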
    def exclude_url(self,url):
        if not url.startswith("http"):
            return True
        if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook
            return True
        if 'nytimes.com' not in url:
            return True
        if 'podcast' in url:
            return True
        if '/video/' in url:
            return True
        if '/slideshow/' in url:
            return True
        if '/magazine/index' in url:
            return True
        if '/interactive/' in url:
            return True
        if '/reference/' in url:
            return True
        if '/premium/' in url:
            return True
        return False

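    # Illustrative behaviour (hypothetical URLs):
    #   exclude_url('http://www.nytimes.com/2011/09/09/world/europe/example.html')   -> False
    #   exclude_url('http://www.nytimes.com/slideshow/2011/09/09/arts/example.html') -> True
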
    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)

        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)

        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)

        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)

        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)

        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)

        return fixed

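    # Sketch: fixChars('\x93Hello\x94') maps the stray cp1252 bytes to typographic
    # (curly) double quotes around 'Hello'.
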
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.form = br.forms().next()
            br['userid']   = self.username
            br['password'] = self.password
            raw = br.submit().read()
            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
        return br

    def skip_ad_pages(self, soup):
        # Skip ad pages served before actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url += '?pagewanted=all'
            self.log.warn("Skipping ad to article at '%s'" % url)
            return self.index_to_soup(url, raw=True)

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        cover = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/nytfrontpage/scan.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nCover unavailable")
            cover = None
        return cover

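    # The cover is the scanned front page for the current date, e.g. (illustrative date):
    #   http://graphics8.nytimes.com/images/2011/09/09/nytfrontpage/scan.jpg
    # If that image cannot be opened, the recipe falls back to no cover.
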
    def short_title(self):
        return self.title

    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
        deals with various page encodings between index and articles
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False) :
            if re.match(r'\w+://', url_or_raw):
                f = self.browser.open(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
            else:
                _raw = url_or_raw
            if raw:
                return _raw

            if not isinstance(_raw, unicode) and self.encoding:
                _raw = _raw.decode(docEncoding, 'replace')
            massage = list(BeautifulSoup.MARKUP_MASSAGE)
            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point
        soup = get_the_soup( self.encoding, url_or_raw )
        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
        docEncoding =  str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
        if docEncoding == '' :
            docEncoding = self.encoding

        if self.verbose > 2:
            self.log( "  document encoding: '%s'" % docEncoding)
        if docEncoding != self.encoding :
            soup = get_the_soup(docEncoding, url_or_raw)

        return soup

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&amp;' with '&'
            massaged = re.sub("&amp;","&", massaged)
            return self.fixChars(massaged)
        else:
            return description

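    # Sketch of the intent: massageNCXText('Arts &amp; Leisure') should come back as
    # u'Arts & Leisure', with any cp1252 punctuation then mapped by fixChars().
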
    def feed_title(self,div):
        return ''.join(div.findAll(text=True, recursive=True)).strip()

    def handle_article(self,div):
        thumbnail = div.find('div','thumbnail')
        if thumbnail:
            thumbnail.extract()
        a = div.find('a', href=True)
        if not a:
            return
        url = re.sub(r'\?.*', '', a['href'])
        if self.exclude_url(url):
            return
        url += '?pagewanted=all'
        if self.filterDuplicates:
            if url in self.url_list:
                return
        self.url_list.append(url)
        title = self.tag_to_string(a, use_alt=True).strip()
        description = ''
        pubdate = strftime('%a, %d %b')
        summary = div.find(True, attrs={'class':'summary'})
        if summary:
            description = self.tag_to_string(summary, use_alt=False)
        author = ''
        authorAttribution = div.find(True, attrs={'class':'byline'})
        if authorAttribution:
            author = self.tag_to_string(authorAttribution, use_alt=False)
        else:
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
        feed = self.key if self.key is not None else 'Uncategorized'
        if not self.articles.has_key(feed):
            self.ans.append(feed)
            self.articles[feed] = []
        self.articles[feed].append(
                        dict(title=title, url=url, date=pubdate,
                            description=description, author=author,
                            content=''))


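    # Illustrative shape of the data handle_article() accumulates (field values are
    # whatever the index pages supplied):
    #   self.ans      -> ['World', 'Sports', ...]   (section names, in order first seen)
    #   self.articles -> {'World': [{'title': ..., 'url': ..., 'date': ...,
    #                                'description': ..., 'author': ..., 'content': ''}, ...]}
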
    def parse_web_edition(self):

        for (sec_title,index_url) in self.web_sections:
            if self.includeSections != []:
                if sec_title not in self.includeSections:
                    print "SECTION NOT INCLUDED: ",sec_title
                    continue
            if sec_title in self.excludeSections:
                print "SECTION EXCLUDED: ",sec_title
                continue
            print 'Index URL: '+'http://www.nytimes.com/pages/'+index_url+'/index.html'
            soup = self.index_to_soup('http://www.nytimes.com/pages/'+index_url+'/index.html')
            self.key = sec_title
            # Find each article
            for div in soup.findAll(True,
                attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
                if div['class'] in ['story', 'story headline'] :
                    self.handle_article(div)
                elif div['class'] == 'headlinesOnly multiline flush':
                    for lidiv in div.findAll('li'):
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)


    def parse_todays_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')

        skipping = False
        # Find each article
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                self.key = string.capwords(self.feed_title(div))
                self.key = self.key.replace('Op-ed','Op-Ed')
                self.key = self.key.replace('U.s.','U.S.')
                self.key = self.key.replace('N.y.','N.Y.')
                skipping = False
                if self.includeSections != []:
                    if self.key not in self.includeSections:
                        print "SECTION NOT INCLUDED: ",self.key
                        skipping = True
                if self.key in self.excludeSections:
                    print "SECTION EXCLUDED: ",self.key
                    skipping = True

            elif div['class'] in ['story', 'story headline'] :
                if not skipping:
                    self.handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    if not skipping:
                        self.handle_article(lidiv)

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)

    def parse_headline_index(self):

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the content table
        content_table = soup.find('table',{'id':'content'})
        if content_table is None:
            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
            return None

        # Within this table are <td id=".*Column.*"> entries, each containing one or more h6 tags which represent sections

        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
            for div_sec in td_col.findAll('div',recursive=False):
                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):

                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
                    section_name = re.sub(r'^ *$','',section_name)

                    if section_name == '':
                        continue
                    if self.includeSections != []:
                        if section_name not in self.includeSections:
                            print "SECTION NOT INCLUDED: ",section_name
                            continue
                    if section_name in self.excludeSections:
                        print "SECTION EXCLUDED: ",section_name
                        continue

                    section_name=string.capwords(section_name)
                    section_name = section_name.replace('Op-ed','Op-Ed')
                    section_name = section_name.replace('U.s.','U.S.')
                    section_name = section_name.replace('N.y.','N.Y.')
                    pubdate = strftime('%a, %d %b')

                    search_div = div_sec
                    for next_tag in h6_sec_name.findNextSiblings(True):
                        if next_tag.__class__.__name__ == 'Tag':
                            if next_tag.name == 'div':
                                search_div = next_tag
                            break

                    # Get the articles
                    for h3_item in search_div.findAll('h3'):
                        byline = h3_item.h6
                        if byline is not None:
                            author = self.tag_to_string(byline,use_alt=False)
                        else:
                            author = ''
                        a = h3_item.find('a', href=True)
                        if not a:
                            continue
                        url = re.sub(r'\?.*', '', a['href'])
                        if self.exclude_url(url):
                            continue
                        url += '?pagewanted=all'
                        if self.filterDuplicates:
                            if url in self.url_list:
                                continue
                        self.url_list.append(url)
                        title = self.tag_to_string(a, use_alt=True).strip()
                        desc = h3_item.find('p')
                        if desc is not None:
                            description = self.tag_to_string(desc,use_alt=False)
                        else:
                            description = ''
                        if not self.articles.has_key(section_name):
                            self.ans.append(section_name)
                            self.articles[section_name] = []
                        self.articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))

        self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
        return self.filter_ans(self.ans)

    def parse_index(self):
        if self.headlinesOnly:
            return self.parse_headline_index()
        elif self.webEdition:
            return self.parse_web_edition()
        else:
            return self.parse_todays_index()

    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

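    # e.g. strip_anchors() turns <a href="...">Related coverage</a> into the plain
    # text 'Related coverage'; anchors that wrap an <img> are left in place.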

    def preprocess_html(self, soup):
        if self.webEdition and (self.oldest_article>0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
                date_str = self.tag_to_string(date_tag,use_alt=False)
                date_str = date_str.replace('Published:','')
                date_items = date_str.split(',')
                try:
                    datestring = date_items[0]+' '+date_items[1]
                    article_date = self.decode_us_date(datestring)
                except:
                    article_date = date.today()
                if article_date < self.earliest_date:
                    self.log("Skipping article dated %s" % date_str)
                    return None

        #all articles are from today, no need to print the date on every page
        try:
            if not self.webEdition:
                date_tag = soup.find(True,attrs={'class': ['dateline','date']})
                if date_tag:
                    date_tag.extract()
        except:
            self.log("Error removing the published date")

        if self.useHighResImages:
            try:
                #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                if enlargeThisList:
                    for popupref in enlargeThisList:
                        popupreflink = popupref.find('a')
                        if popupreflink:
                            reflinkstring = str(popupreflink['href'])
                            refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
                            refend = reflinkstring.find(".html", refstart) + len(".html")
                            reflinkstring = reflinkstring[refstart:refend]

                            popuppage = self.browser.open(reflinkstring)
                            popuphtml = popuppage.read()
                            popuppage.close()
                            if popuphtml:
                                st = time.localtime()
                                year = str(st.tm_year)
                                month = "%.2d" % st.tm_mon
                                day = "%.2d" % st.tm_mday
                                imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/')
                                highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                                popupSoup = BeautifulSoup(popuphtml)
                                highResTag = popupSoup.find('img', {'src':highResImageLink})
                                if highResTag:
                                    imageTag = None
                                    try:
                                        newWidth = highResTag['width']
                                        newHeight = highResTag['height']
                                        imageTag = popupref.parent.find("img")
                                    except:
                                        self.log("Error: finding width and height of img")
                                    popupref.extract()
                                    if imageTag:
                                        try:
                                            imageTag['src'] = highResImageLink
                                            imageTag['width'] = newWidth
                                            imageTag['height'] = newHeight
                                        except:
                                            self.log("Error setting the src width and height parameters")
            except Exception:
                self.log("Error pulling high resolution images")

            try:
                #remove "Related content" bar
                runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline','articleInline runaroundLeft  ','articleInline runaroundLeft  lastArticleInline']})
                if runAroundsFound:
                    for runAround in runAroundsFound:
                        #find all section headers
                        hlines = runAround.findAll(True,{'class':['sectionHeader','sectionHeader flushBottom']})
                        if hlines:
                            for hline in hlines:
                                hline.extract()

                        #find all h6 section headers
                        hlines = runAround.findAll('h6')
                        if hlines:
                            for hline in hlines:
                                hline.extract()
            except:
                self.log("Error removing related content bar")


            try:
                #in case pulling images failed, delete the "Enlarge this" text
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                if enlargeThisList:
                    for popupref in enlargeThisList:
                        popupref.extract()
            except:
                self.log("Error removing Enlarge this text")

        return self.strip_anchors(soup)

    def postprocess_html(self, soup, first_fetch):
        try:
            if self.one_picture_per_article:
                # Remove all images after first
                largeImg = soup.find(True, {'class':'articleSpanImage'})
                inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
                if largeImg:
                    for inlineImg in inlineImgs:
                        inlineImg.extract()
                else:
                    if inlineImgs:
                        firstImg = inlineImgs[0]
                        for inlineImg in inlineImgs[1:]:
                            inlineImg.extract()
                        # Move firstImg before article body
                        cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                        if cgFirst:
                            # Strip all sibling NavigableStrings: noise
                            navstrings = cgFirst.findAll(text=True, recursive=False)
                            [ns.extract() for ns in navstrings]
                            headline_found = False
                            tag = cgFirst.find(True)
                            insertLoc = 0
                            while True:
                                insertLoc += 1
                                if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                                tag = tag.nextSibling
                                if not tag:
                                    headline_found = False
                                    break
                            if headline_found:
                                cgFirst.insert(insertLoc,firstImg)
                        else:
                            self.log(">>> No class:'columnGroup first' found <<<")
        except:
            self.log("ERROR: One picture per article in postprocess_html")

        try:
            # Change captions to italic
            for caption in soup.findAll(True, {'class':'caption'}) :
                if caption and len(caption) > 0:
                    cTag = Tag(soup, "p", [("class", "caption")])
                    c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                    mp_off = c.find("More Photos")
                    if mp_off >= 0:
                        c = c[:mp_off]
                    cTag.insert(0, c)
                    caption.replaceWith(cTag)
        except:
            self.log("ERROR:  Problem in change captions to italic")

        try:
            # Change <nyt_headline> to <h2>
            h1 = soup.find('h1')
            blogheadline = str(h1) #added for dealbook
            if h1:
                headline = h1.find("nyt_headline")
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(headline.contents[0]))
                    h1.replaceWith(tag)
                elif blogheadline.find('entry-title') >= 0: #added for dealbook
                    tag = Tag(soup, "h2") #added for dealbook
                    tag['class'] = "headline" #added for dealbook
                    tag.insert(0, self.fixChars(h1.contents[0])) #added for dealbook
                    h1.replaceWith(tag) #added for dealbook

            else:
                # Blog entry - replace headline, remove <hr> tags  - BCC I think this is no longer functional 1-18-2011
                headline = soup.find('title')
                if headline:
                    tag = Tag(soup, "h2")
                    tag['class'] = "headline"
                    tag.insert(0, self.fixChars(headline.renderContents()))
                    soup.insert(0, tag)
                    hrs = soup.findAll('hr')
                    for hr in hrs:
                        hr.extract()
        except:
            self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")

        try:
            #if this is from a blog (dealbook), fix the byline format
            bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
            if bylineauthor:
                tag = Tag(soup, "h6")
                tag['class'] = "byline"
                tag.insert(0, self.fixChars(bylineauthor.renderContents()))
                bylineauthor.replaceWith(tag)
        except:
            self.log("ERROR:  fixing byline author format")

        try:
            #if this is a blog (dealbook) fix the credit style for the pictures
            blogcredit = soup.find('div',attrs={'class':'credit'})
            if blogcredit:
                tag = Tag(soup, "h6")
                tag['class'] = "credit"
                tag.insert(0, self.fixChars(blogcredit.renderContents()))
                blogcredit.replaceWith(tag)
        except:
            self.log("ERROR:  fixing credit format")


        try:
            # Change <h1> to <h3> - used in editorial blogs
            masthead = soup.find("h1")
            if masthead:
                # Nuke the href
                if masthead.a:
                    del(masthead.a['href'])
                tag = Tag(soup, "h3")
                tag.insert(0, self.fixChars(masthead.contents[0]))
                masthead.replaceWith(tag)
        except:
            self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")

        try:
            # Change <span class="bold"> to <b>
            for subhead in soup.findAll(True, {'class':'bold'}) :
                if subhead.contents:
                    bTag = Tag(soup, "b")
                    bTag.insert(0, subhead.contents[0])
                    subhead.replaceWith(bTag)
        except:
            self.log("ERROR:  Problem in Change <span class=bold> to <b>")
        try:
            #remove the <strong> update tag
            blogupdated = soup.find('span', {'class':'update'})
            if blogupdated:
                blogupdated.replaceWith("")
        except:
            self.log("ERROR:  Removing strong tag")

        try:
            divTag = soup.find('div',attrs={'id':'articleBody'})
            if divTag:
                divTag['class'] = divTag['id']
        except:
            self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")

        try:
            # Add class="authorId" to <div> so we can format with CSS
            divTag = soup.find('div',attrs={'id':'authorId'})
            if divTag and divTag.contents[0]:
                tag = Tag(soup, "p")
                tag['class'] = "authorId"
                tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                                                 use_alt=False)))
                divTag.replaceWith(tag)
        except:
            self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")

        return soup

    def populate_article_metadata(self, article, soup, first):
        shortparagraph = ""
        try:
            if len(article.text_summary.strip()) == 0:
                articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
                if not articlebodies: #added to account for blog formats
                    articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats
                if articlebodies:
                    for articlebody in articlebodies:
                        if articlebody:
                            paras = articlebody.findAll('p')
                            for p in paras:
                                refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                                #account for blank paragraphs and short paragraphs by appending them to longer ones
                                if len(refparagraph) > 0:
                                    if len(refparagraph) > 140: #approximately two lines of text
                                        article.summary = article.text_summary = shortparagraph + refparagraph
                                        return
                                    else:
                                        shortparagraph = refparagraph + " "
                                        if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                            shortparagraph = shortparagraph + "- "

        except:
            self.log("Error creating article descriptions")
            return