
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class AppleDaily(BasicNewsRecipe):

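    # Recipe for the Hong Kong newspaper Apple Daily (蘋果日報). Article
    # listings are scraped from http://news.hotpot.hk/fruit (see
    # parse_index below); image URLs that go through an img.php proxy
    # are rewritten into direct links in preprocess_regexps and
    # preprocess_html.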
    title       = u'蘋果日報'
    __author__  = u'蘋果日報'
    __publisher__  = u'蘋果日報'
    description = u'蘋果日報'
    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
    language = 'zh_TW'
    encoding = 'UTF-8'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_javascript = True
    remove_tags_before = dict(name=['ul', 'h1'])
    remove_tags_after  = dict(name='form')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
                dict(name=['script', 'noscript', 'style', 'form'])]
    no_stylesheets = True
    extra_css = '''
        @font-face {font-family: "uming", serif, sans-serif;  src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }
        body {margin-right: 8pt; font-family: 'uming', serif;}
        h1 {font-family: 'uming', serif, sans-serif}
            '''
    #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

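    # Rewrite proxied image URLs of the form img.php?server=...&path=...
    # into direct http:// links while the raw HTML is being fetched.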
    preprocess_regexps = [
       (re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
        lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
    ]

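    # Reuse the masthead logo as the cover image.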
    def get_cover_url(self):
        return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'


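    # Leftover login scaffolding, apparently from the recipe this one was
    # based on (note the nytimes.com URL); Apple Daily needs no
    # subscription, so it stays commented out.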
    #def get_browser(self):
        #br = BasicNewsRecipe.get_browser()
        #if self.username is not None and self.password is not None:
        #    br.open('http://www.nytimes.com/auth/login')
        #    br.select_form(name='login')
        #    br['USERID']   = self.username
        #    br['PASSWORD'] = self.password
        #    br.submit()
        #return br

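    # Point <img src> and <a href> attributes that go through the img.php
    # proxy at the real image files so they survive conversion.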
    def preprocess_html(self, soup):
        # process all the images and image links in one pass
        p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)

        for name, attr in (('img', 'src'), ('a', 'href')):
            for tag in soup.findAll(lambda tag, name=name, attr=attr:
                                    tag.name.lower() == name and tag.has_key(attr)):
                m = p.search(tag[attr])
                if m is not None:
                    tag[attr] = 'http://' + m.group('server') + '/' + m.group('path')

        return soup


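    # Build the section/article index: every <li> on the front page names
    # a section and links to a sub-page whose <li> entries are the
    # individual articles.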
    def parse_index(self):
        base = 'http://news.hotpot.hk/fruit'
        soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')

        #def feed_title(div):
        #    return ''.join(div.findAll(text=True, recursive=False)).strip()

        articles = {}
        key = None
        ans = []
        for div in soup.findAll('li'):
            key = div.find(text=True, recursive=True)
            if key is None:
                continue
            #if key == u'豪情':
            #    continue

            print 'section=' + key

            articles[key] = []
            ans.append(key)

            a = div.find('a', href=True)
            if not a:
                continue

            url = base + '/' + a['href']
            print 'url=' + url

            # fetch the section sub-page that lists the articles
            subSoup = self.index_to_soup(url)

            for subDiv in subSoup.findAll('li'):
                subA = subDiv.find('a', href=True)
                if not subA:
                    continue
                subTitle = subDiv.find(text=True, recursive=True)
                subUrl = base + '/' + subA['href']

                print 'subUrl=' + subUrl

                articles[key].append(
                    dict(title=subTitle,
                         url=subUrl,
                         date='',
                         description='',
                         content=''))

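# The commented-out block below is leftover article-parsing logic, apparently
# from the New York Times recipe this one was adapted from; kept for reference.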
#             elif div['class'] in ['story', 'story headline']:
#                 a = div.find('a', href=True)
#                 if not a:
#                     continue
#                 url = re.sub(r'\?.*', '', a['href'])
#                 url += '?pagewanted=all'
#                 title = self.tag_to_string(a, use_alt=True).strip()
#                 description = ''
#                 pubdate = strftime('%a, %d %b')
#                 summary = div.find(True, attrs={'class':'summary'})
#                 if summary:
#                     description = self.tag_to_string(summary, use_alt=False)
#
#                 feed = key if key is not None else 'Uncategorized'
#                 if not articles.has_key(feed):
#                     articles[feed] = []
#                 if not 'podcasts' in url:
#                     articles[feed].append(
#                               dict(title=title, url=url, date=pubdate,
#                                    description=description,
#                                    content=''))
#        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
        ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
        return ans