
__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.wired.com
'''

import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class Wired(BasicNewsRecipe):
    title                 = 'Wired Magazine'
    __author__            = 'Darko Miletic'
    description           = 'Technology news'
    publisher             = 'Conde Nast Digital'
    category              = 'news, games, IT, gadgets'
    oldest_article        = 32
    delay                 = 1
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    masthead_url          = 'http://www.wired.com/images/home/wired_logo.gif'
    language              = 'en'
    publication_type      = 'magazine'
    extra_css             = ' body{font-family: Arial,Verdana,sans-serif} .entryDescription li {display: inline; list-style-type: none} '
    index                 = 'http://www.wired.com/magazine/'

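    # Collapse everything from the "Title" meta tag up to the real <title>
    # element into a single <title> tag before parsing.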
    preprocess_regexps = [(re.compile(r'<meta name="Title".*<title>', re.DOTALL|re.IGNORECASE), lambda match: '<title>')]
    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }

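    # Keep only the article body (div.post) and strip embeds, navigation
    # and social-media widgets from it.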
    keep_only_tags = [dict(name='div', attrs={'class':'post'})]
    remove_tags_after = dict(name='div', attrs={'class':'tweetmeme_button'})
    remove_tags = [
                     dict(name=['object','embed','iframe','link','meta','base'])
                    ,dict(name='div', attrs={'class':['podcast_storyboard','tweetmeme_button']})
                    ,dict(attrs={'id':'ff_bottom_nav'})
                    ,dict(name='a', attrs={'href':'http://www.wired.com/app'})
                  ]
    remove_attributes = ['height','width','lang','border','clear']

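    # Build the feed list by scraping the magazine index page rather than
    # RSS: cover stories, featured articles and the department sections
    # each become a separate feed.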
    def parse_index(self):
        totalfeeds = []

        soup   = self.index_to_soup(self.index)
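        # cover stories listed in the issue index block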
        majorf = soup.find('div', attrs={'class':'index'})
        if majorf:
            pfarticles = []
            firsta = majorf.find(attrs={'class':'spread-header'})
            if firsta:
                pfarticles.append({
                                    'title'      : self.tag_to_string(firsta.a)
                                   ,'date'       : strftime(self.timefmt)
                                   ,'url'        : 'http://www.wired.com' + firsta.a['href']
                                   ,'description': ''
                                  })
            for itt in majorf.findAll('li'):
                itema = itt.find('a', href=True)
                if itema:
                    pfarticles.append({
                                        'title'      : self.tag_to_string(itema)
                                       ,'date'       : strftime(self.timefmt)
                                       ,'url'        : 'http://www.wired.com' + itema['href']
                                       ,'description': ''
                                      })
            totalfeeds.append(('Cover', pfarticles))
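        # featured articles from the 'my-glider' carousel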
        features = soup.find('div', attrs={'id':'my-glider'})
        if features:
            farticles = []
            for item in features.findAll('div', attrs={'class':'section'}):
                divurl = item.find('div', attrs={'class':'feature-header'})
                if divurl:
                    divdesc = item.find('div', attrs={'class':'feature-text'})
                    url = divurl.a['href']
                    if not url.startswith('http://www.wired.com'):
                        url = 'http://www.wired.com' + url
                    title = self.tag_to_string(divurl.a)
                    # the teaser div is not always present
                    description = self.tag_to_string(divdesc) if divdesc else ''
                    date  = strftime(self.timefmt)
                    farticles.append({
                                       'title'      : title
                                      ,'date'       : date
                                      ,'url'        : url
                                      ,'description': description
                                     })
            totalfeeds.append(('Featured Articles', farticles))
        # department feeds (rants, start, test, play, found)
        departments = ['rants','start','test','play','found']
        dept = soup.find('div', attrs={'id':'magazine-departments'})
        if dept:
            for ditem in departments:
                darticles = []
                department = dept.find('div', attrs={'id':'department-' + ditem})
                if department:
                    for item in department.findAll('div'):
                        description = ''
                        feed_link = item.find('a')
                        if feed_link and feed_link.has_key('href'):
                            url   = feed_link['href']
                            title = self.tag_to_string(feed_link)
                            date  = strftime(self.timefmt)
                            darticles.append({
                                               'title'      : title
                                              ,'date'       : date
                                              ,'url'        : url
                                              ,'description': description
                                             })
                    totalfeeds.append((ditem.capitalize(), darticles))
        return totalfeeds

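    # The cover image comes from the same magazine index page.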
    def get_cover_url(self):
        cover_url = None
        soup = self.index_to_soup(self.index)
        cover_item = soup.find('div', attrs={'class':'spread-image'})
        if cover_item:
            cover_url = 'http://www.wired.com' + cover_item.a.img['src']
        return cover_url

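    # Rewrite article URLs to their single-page ('all on one page') versions.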
    def print_version(self, url):
        return url.rstrip('/') + '/all/1'

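    # Clean the downloaded HTML: drop inline styles, replace plain-text
    # links with their text (other links become attribute-free <span>s)
    # and give every image an alt attribute.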
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll('a'):
            if item.string is not None:
                tstr = item.string
                item.replaceWith(tstr)
            else:
                item.name = 'span'
                for atrs in ['href','target','alt','title','name','id']:
                    if item.has_key(atrs):
                        del item[atrs]
        for item in soup.findAll('img'):
            if not item.has_key('alt'):
                item['alt'] = 'image'
        return soup
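
# To try this recipe from the command line, calibre's standard recipe test
# invocation can be used:
#
#   ebook-convert wired.recipe .epub --test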