home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / roger_ebert.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  121 lines

  1. import re
  2. import urllib2
  3. from calibre.web.feeds.news import BasicNewsRecipe
  4. from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
  5.  
  6. class Ebert(BasicNewsRecipe):
  7.     title                 = 'Roger Ebert'
  8.     __author__            = 'Shane Erstad'
  9.     description           = 'Roger Ebert Movie Reviews'
  10.     publisher             = 'Chicago Sun Times'
  11.     category              = 'movies'
  12.     oldest_article        = 8
  13.     max_articles_per_feed = 100
  14.     no_stylesheets        = True
  15.     use_embedded_content  = False
  16.     encoding              = 'utf-8'
  17.     masthead_url          = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
  18.     language              = 'en'
  19.     remove_empty_feeds    = False
  20.     PREFIX                  = 'http://rogerebert.suntimes.com'
  21.     patternReviews                = r'<span class="*?movietitle"*?>(.*?)</span>.*?<div class="*?headline"*?>(.*?)</div>(.*?)</div>'
  22.     patternCommentary       = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?COMMENTARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
  23.     patternPeople           = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?PEOPLE.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
  24.     patternGlossary           = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?GLOSSARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
  25.  
  26.  
  27.  
  28.     conversion_options = {
  29.                           'comment'          : description
  30.                         , 'tags'             : category
  31.                         , 'publisher'        : publisher
  32.                         , 'language'         : language
  33.                         , 'linearize_tables' : True
  34.                         }
  35.  
  36.  
  37.     feeds          = [
  38.                         (u'Reviews'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews' )
  39.                         ,(u'Commentary'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY')
  40.                         ,(u'Great Movies'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08')
  41.                         ,(u'People'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE')
  42.                         ,(u'Glossary'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY')
  43.  
  44.                      ]
  45.  
  46.     preprocess_regexps = [
  47.         (re.compile(r'<font.*?>.*?This is a printer friendly.*?</font>.*?<hr>', re.DOTALL|re.IGNORECASE),
  48.             lambda m: '')
  49.     ]
  50.  
  51.  
  52.  
  53.     def print_version(self, url):
  54.         return url + '&template=printart'
  55.  
  56.     def parse_index(self):
  57.         totalfeeds = []
  58.         lfeeds = self.get_feeds()
  59.         for feedobj in lfeeds:
  60.             feedtitle, feedurl = feedobj
  61.             self.log('\tFeedurl: ', feedurl)
  62.             self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
  63.             articles = []
  64.             page = urllib2.urlopen(feedurl).read()
  65.  
  66.             if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
  67.                     pattern = self.patternReviews
  68.             elif feedtitle == 'Commentary':
  69.                     pattern = self.patternCommentary
  70.             elif feedtitle == 'People':
  71.                     pattern = self.patternPeople
  72.             elif feedtitle == 'Glossary':
  73.                     pattern = self.patternGlossary
  74.  
  75.  
  76.             regex = re.compile(pattern, re.IGNORECASE|re.DOTALL)
  77.  
  78.             for match in regex.finditer(page):
  79.                 if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
  80.                     movietitle = match.group(1)
  81.                     thislink = match.group(2)
  82.                     description = match.group(3)
  83.                 elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary':
  84.                     thislink = match.group(1)
  85.                     description = match.group(2)
  86.  
  87.                 self.log(thislink)
  88.  
  89.                 for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
  90.                     thisurl = self.PREFIX + link['href']
  91.                     thislinktext = self.tag_to_string(link)
  92.  
  93.                     if feedtitle == 'Reviews' or feedtitle == 'Great Movies':
  94.                         thistitle = movietitle
  95.                     elif feedtitle == 'Commentary' or feedtitle == 'People' or feedtitle == 'Glossary':
  96.                         thistitle = thislinktext
  97.  
  98.                     if thistitle == '':
  99.                         thistitle = 'Ebert Journal Post'
  100.  
  101.                     """
  102.                     pattern2 = r'AID=\/(.*?)\/'
  103.                     reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL)
  104.                     match2 = reg2.search(thisurl)
  105.                     date = match2.group(1)
  106.                     c = time.strptime(match2.group(1),"%Y%m%d")
  107.                     date=time.strftime("%a, %b %d, %Y", c)
  108.                     self.log(date)
  109.                     """
  110.  
  111.                     articles.append({
  112.                                       'title'      :thistitle
  113.                                      ,'date'       :''
  114.                                      ,'url'        :thisurl
  115.                                      ,'description':description
  116.                                     })
  117.             totalfeeds.append((feedtitle, articles))
  118.  
  119.         return totalfeeds
  120.  
  121.