
from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    title                 = u'Singtao Daily - Canada'
    oldest_article        = 7
    max_articles_per_feed = 100
    __author__            = 'rty'
    description           = 'Toronto Canada Chinese Newspaper'
    publisher             = 'news.singtao.ca'
    category              = 'Chinese, News, Canada'
    remove_javascript     = True
    use_embedded_content  = False
    no_stylesheets        = True
    language              = 'zh'
    conversion_options    = {'linearize_tables': True}
    masthead_url          = 'http://news.singtao.ca/i/site_2009/logo.jpg'

    extra_css = '''
        @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf);}
        body {text-align: justify; margin-right: 8pt; font-family: 'DroidFont', serif;}
        h1 {font-family: 'DroidFont', serif;}
        .articledescription {font-family: 'DroidFont', serif;}
    '''

    keep_only_tags = [
        dict(name='div', attrs={'id': ['title', 'storybody']}),
        dict(name='div', attrs={'class': 'content'})
    ]

    def parse_index(self):
        # Build the feed list by scraping each section's index page.
        feeds = []
        for title, url in [
            ('Editorial',
                'http://news.singtao.ca/toronto/editorial.html'),
            ('Toronto \xe5\x9f\x8e\xe5\xb8\x82/\xe7\xa4\xbe\xe5\x8d\x80'.decode('utf-8'),
                'http://news.singtao.ca/toronto/city.html'),
            ('Canada \xe5\x8a\xa0\xe5\x9c\x8b'.decode('utf-8'),
                'http://news.singtao.ca/toronto/canada.html'),
            ('Entertainment',
                'http://news.singtao.ca/toronto/entertainment.html'),
            ('World',
                'http://news.singtao.ca/toronto/world.html'),
            ('Finance \xe5\x9c\x8b\xe9\x9a\x9b\xe8\xb2\xa1\xe7\xb6\x93'.decode('utf-8'),
                'http://news.singtao.ca/toronto/finance.html'),
            ('Sports',
                'http://news.singtao.ca/toronto/sports.html'),
        ]:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        # Scrape one section index page and return its articles.
        soup = self.index_to_soup(url)
        div = soup.find(attrs={'class': ['newslist paddingL10T10', 'newslist3 paddingL10T10']})
        # date = div.find(attrs={'class': 'underlineBLK'})
        current_articles = []
        for li in div.findAll('li'):
            a = li.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            url = a.get('href', False)
            if not url or not title:
                continue
            if url.startswith('/'):
                # Section pages use relative links; make them absolute.
                url = 'http://news.singtao.ca' + url
            # self.log('Found article:', title)
            # self.log('    ', url)
            current_articles.append({'title': title, 'url': url, 'description': ''})
        return current_articles

    def preprocess_html(self, soup):
        # Strip inline styles and hard-coded widths so extra_css controls layout.
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        return soup
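
# Usage sketch: a recipe file such as this can be previewed with calibre's
# ebook-convert command-line tool; the --test flag fetches only a few articles
# per feed for a quick check. The file and output names here are illustrative.
#
#   ebook-convert singtao_daily.recipe output.epub --test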