__license__   = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class ArsTechnica(BasicNewsRecipe):
    title                 = u'Ars Technica'
    language              = 'en'
    __author__            = 'Darko Miletic, Sujata Raman, Alexis Rohou'
    description           = 'The art of technology'
    publisher             = 'Ars Technica'
    category              = 'news, IT, technology'
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    extra_css             = '''
                body {font-family: Arial,Helvetica,sans-serif}
                .title{text-align: left}
                .byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
                .news-item-figure-caption-text{font-size:small; font-style:italic}
                .news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
                '''
    ignoreEtcArticles     = True    # Etc feed items can be ignored, as they're not real stories

    conversion_options = {
                             'comments'  : description
                            ,'tags'      : category
                            ,'language'  : language
                            ,'publisher' : publisher
                         }
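
    # These options are passed through to the ebook conversion step, so the
    # generated book carries the recipe's description, tags, language and
    # publisher as metadata.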

    #preprocess_regexps = [
    #            (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
    #           ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
    #                     ]

    keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]

    remove_tags = [
                     dict(name=['object','link','embed'])
                    ,dict(name='div', attrs={'class':'read-more-link'})
                  ]
    #remove_attributes=['width','height']
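
    # keep_only_tags isolates the story container; remove_tags then strips
    # embedded media and the 'read more' footer from within it.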

    feeds = [
              (u'Infinite Loop (Apple content)'        , u'http://feeds.arstechnica.com/arstechnica/apple/'      )
             ,(u'Opposable Thumbs (Gaming content)'    , u'http://feeds.arstechnica.com/arstechnica/gaming/'     )
             ,(u'Gear and Gadgets'                     , u'http://feeds.arstechnica.com/arstechnica/gadgets/'    )
             ,(u'Chipster (Hardware content)'          , u'http://feeds.arstechnica.com/arstechnica/hardware/'   )
             ,(u'Uptime (IT content)'                  , u'http://feeds.arstechnica.com/arstechnica/business/'   )
             ,(u'Open Ended (Open Source content)'     , u'http://feeds.arstechnica.com/arstechnica/open-source/')
             ,(u'One Microsoft Way'                    , u'http://feeds.arstechnica.com/arstechnica/microsoft/'  )
             ,(u'Nobel Intent (Science content)'       , u'http://feeds.arstechnica.com/arstechnica/science/'    )
             ,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
            ]
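
    # Each entry is a (section title, feed URL) pair; BasicNewsRecipe fetches
    # every feed and applies oldest_article and max_articles_per_feed when
    # selecting items.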

    # Stories split over several pages carry a 'pager' div whose 'Next' link
    # points at the continuation; fetch it and splice its body into the story.
    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'class':'pager'})
        if pager:
            for atag in pager.findAll('a', href=True):
                linktext = self.tag_to_string(atag)   # renamed so it doesn't shadow the builtin str
                if linktext.startswith('Next'):
                    nurl = 'http://arstechnica.com' + atag['href']
                    rawc = self.index_to_soup(nurl, True)
                    soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)

                    readmoretag = soup2.find('div', attrs={'class':'read-more-link'})
                    if readmoretag:
                        readmoretag.extract()
                    texttag = soup2.find('div', attrs={'class':'body'})
                    for it in texttag.findAll(style=True):
                        del it['style']

                    # Recurse in case the story continues past this page too
                    newpos = len(texttag.contents)
                    self.append_page(soup2, texttag, newpos)
                    texttag.extract()
                    pager.extract()
                    appendtag.insert(position, texttag)
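
    # The selectors above imply continuation pages shaped roughly like this
    # (a sketch inferred from the code, not captured markup):
    #
    #   <div class="body"> ...next chunk of the story... </div>
    #   <div class="pager"> ... <a href="/...">Next</a> ... </div>
    #
    # Each continuation body is inserted into the first page at `position`,
    # recursing until no further 'Next' link is found.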

    def preprocess_html(self, soup):
        # Add line breaks near the byline (not sure why this is needed)
        ftag = soup.find('div', attrs={'class':'byline'})
        if ftag:
            brtag = Tag(soup, 'br')
            brtag2 = Tag(soup, 'br')
            ftag.insert(4, brtag)
            ftag.insert(5, brtag2)

        # Remove inline style attributes
        for item in soup.findAll(style=True):
            del item['style']

        # Remove id attributes
        for item in soup.findAll(id=True):
            del item['id']

        # For some reason, links to authors lack the domain name; make them absolute
        a_author = soup.find('a', {'href':re.compile("^/author")})
        if a_author:
            a_author['href'] = 'http://arstechnica.com' + a_author['href']

        # TODO: images within div.news-item-figure still need to be grabbed

        # Deal with multi-page stories
        self.append_page(soup, soup.body, 3)

        return soup
    def get_article_url(self, article):
        # If the article title starts with 'Etc:', skip it; those items are not real stories
        if self.ignoreEtcArticles:
            article_title = article.get('title', None)
            if article_title is not None and re.match('Etc: ', article_title):
                return None

        # The actual article URL is in the guid tag; drop any query string
        url = article.get('guid', None)
        if url is None:
            return None
        return url.rpartition('?')[0]
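
    # A hypothetical example of the guid handling above (URL invented purely
    # for illustration):
    #   'http://arstechnica.com/apple/news/2011/09/example.ars?utm_source=rss'
    # becomes
    #   'http://arstechnica.com/apple/news/2011/09/example.ars'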