home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / wapo_cartoons.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  5.6 KB  |  146 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  3. from datetime import date, timedelta
  4.  
  5. class WaPoCartoonsRecipe(BasicNewsRecipe):
  6.     __license__   = 'GPL v3'
  7.     __author__ = 'kwetal'
  8.     language = 'en'
  9.     version = 2
  10.  
  11.     title = u'Washington Post Cartoons'
  12.     publisher = u'Washington Post'
  13.     category = u'News, Cartoons'
  14.     description = u'Cartoons from the Washington Post'
  15.  
  16.     oldest_article = 7
  17.     max_articles_per_feed = 100
  18.     use_embedded_content = False
  19.     no_stylesheets = True
  20.  
  21.     feeds = []
  22.     feeds.append((u'Anderson', u'http://www.uclick.com/client/wpc/wpnan/'))
  23.     feeds.append((u'Auth', u'http://www.uclick.com/client/wpc/ta/'))
  24.     feeds.append((u'Bok', u'http://www.creators.com/featurepages/11_editorialcartoons_chip-bok.html?name=cb'))
  25.     feeds.append((u'Carlson', u'http://www.uclick.com/client/wpc/sc/'))
  26.     feeds.append((u'Luckovich', u'http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk'))
  27.     feeds.append((u'McCoy', u'http://www.uclick.com/client/wpc/gm/'))
  28.     feeds.append((u'Pat Oliphant', u'http://www.uclick.com/client/wpc/po/'))
  29.     feeds.append((u'Sargent', u'http://wpcomics.washingtonpost.com/client/wpc/bs/'))
  30.     feeds.append((u'Wilkinson', u'http://www.uclick.com/client/wpc/wpswi/'))
  31.  
  32.     extra_css = '''
  33.                 body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
  34.                 h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
  35.                 #name {margin-bottom: 0.2em}
  36.                 #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
  37.                 '''
  38.  
  39.     def parse_index(self):
  40.         index = []
  41.         oldestDate = date.today() - timedelta(days = self.oldest_article)
  42.         oldest = oldestDate.strftime('%Y%m%d')
  43.         for feed in self.feeds:
  44.             cartoons = []
  45.             soup = self.index_to_soup(feed[1])
  46.  
  47.             cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description' : ''}
  48.             cartoons.append(cartoon)
  49.  
  50.             select = soup.find('select', attrs = {'name': ['url', 'dest']})
  51.             if select:
  52.                 cartoonCandidates = []
  53.                 if select['name'] == 'url':
  54.                     cartoonCandidates = self.cartoonCandidatesWaPo(select, oldest)
  55.                 else:
  56.                     cartoonCandidates = self.cartoonCandidatesCreatorsCom(select, oldest)
  57.  
  58.                 for cartoon in cartoonCandidates:
  59.                     cartoons.append(cartoon)
  60.  
  61.             index.append([feed[0], cartoons])
  62.  
  63.         return index
  64.  
  65.     def preprocess_html(self, soup):
  66.         freshSoup = self.getFreshSoup(soup)
  67.  
  68.         div = soup.find('div', attrs = {'id': 'name'})
  69.         if div:
  70.             freshSoup.body.append(div)
  71.             comic = soup.find('div', attrs = {'id': 'comic_full'})
  72.  
  73.             img = comic.find('img')
  74.             if '&' in img['src']:
  75.                 img['src'], sep, bad = img['src'].rpartition('&')
  76.  
  77.             freshSoup.body.append(comic)
  78.             freshSoup.body.append(soup.find('div', attrs = {'id': 'copyright'}))
  79.         else:
  80.             span = soup.find('span', attrs = {'class': 'title'})
  81.             if span:
  82.                 del span['class']
  83.                 span['id'] = 'name'
  84.                 span.name = 'div'
  85.                 freshSoup.body.append(span)
  86.  
  87.             img = soup.find('img', attrs = {'class': 'pic_big'})
  88.             if img:
  89.                 td = img.parent
  90.                 if td.has_key('style'):
  91.                     del td['style']
  92.                 td.name = 'div'
  93.                 td['id'] = 'comic_full'
  94.                 freshSoup.body.append(td)
  95.  
  96.             td = soup.find('td', attrs = {'class': 'copy'})
  97.             if td:
  98.                 for a in td.find('a'):
  99.                     a.extract()
  100.                 del td['class']
  101.                 td['id'] = 'copyright'
  102.                 td.name = 'div'
  103.                 freshSoup.body.append(td)
  104.  
  105.         return freshSoup
  106.  
  107.     def getFreshSoup(self, oldSoup):
  108.         freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
  109.         if oldSoup.head.title:
  110.             freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
  111.         return freshSoup
  112.  
  113.     def cartoonCandidatesWaPo(self, select, oldest):
  114.         opts = select.findAll('option')
  115.         for i in range(1, len(opts)):
  116.             url = opts[i]['value'].rstrip('/')
  117.             dateparts = url.split('/')[-3:]
  118.             datenum = str(dateparts[0]) + str(dateparts[1]) + str(dateparts[2])
  119.             if datenum >= oldest:
  120.                 yield {'title': self.tag_to_string(opts[i]), 'date': None, 'url': url, 'description': ''}
  121.             else:
  122.                 return
  123.  
  124.     def cartoonCandidatesCreatorsCom(self, select, oldest):
  125.         monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05',
  126.                       'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10',
  127.                       'November': '11', 'December': '12'}
  128.  
  129.         opts = select.findAll('option')
  130.         for i in range(1, len(opts)):
  131.             if opts[i].has_key('selected'):
  132.                 continue
  133.  
  134.             dateString = self.tag_to_string(opts[i])
  135.             rest, sep, year = dateString.rpartition(', ')
  136.             parts = rest.split(' ')
  137.             day = parts[2].rjust(2, '0')
  138.             month = monthNames[parts[1]]
  139.             datenum = str(year) + month + str(day)
  140.             if datenum >= oldest:
  141.                 yield {'title': dateString, 'date': None, 'url': opts[i]['value'], 'description': ''}
  142.             else:
  143.                 return
  144.  
  145.  
  146.