home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / jsonline.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  70 lines

  1. from calibre.web.feeds.news import BasicNewsRecipe
  2. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  3.  
  4. class JSOnline(BasicNewsRecipe):
  5.     title          = u'Milwaukee Journal Sentinel'
  6.     language       = 'en'
  7.     __author__     = 'Krittika Goyal'
  8.     oldest_article = 2 #days
  9.     max_articles_per_feed = 25
  10.  
  11.  
  12.     no_stylesheets = True
  13.     remove_tags_before = dict(name='div', attrs={'id':'wrapper'})
  14.     #remove_tags_after  = dict(name='td', attrs={'class':'asset-bar'})
  15.     remove_tags = [
  16.        dict(name='iframe'),
  17.        dict(name='div', attrs={'class':['right_float', 'headlines', 'side_section_container poll', 'side_section_container html']}),
  18.        #dict(name='div', attrs={'id':['rightColumn']}),
  19.        #dict(name='span', attrs={'class':'comment_forbidden'}),
  20.        #dict(name='ul', attrs={'class':'links inline'}),
  21.        #dict(name='p', attrs={'id':'commentadvisory'}),
  22.        #dict(name='div', attrs={'style':['width: 300px; margin-right: 2em; float: left;']}),
  23.        #dict(name='div', style="float:right; width: 300px;"),
  24.        #dict(name='p', style="clear:both;"),
  25.        #dict(name='p', attrs={'name':'&lpos=footer_textlinks'}),
  26.        #dict(name='span', text=':'),
  27.     ]
  28.  
  29.     feeds          = [
  30. ('Main Headlines',
  31.  'http://www.jsonline.com/rss?c=y&path=%2F'),
  32. ('Business',
  33.  'http://www.jsonline.com/rss?c=y&path=%2Fbusiness'),
  34. ('Milwaukee marketplace',
  35.  'http://www.jsonline.com/rss?c=y&path=%2Fmarketplace'),
  36. ('Top Entertainment Stories',
  37.  'http://www.jsonline.com/rss?c=y&path=%2Fentertainment%2Ftopstories'),
  38. ('Arts and Books',
  39.  'http://www.jsonline.com/rss?c=y&path=%2Fentertainment%2Farts'),
  40. ('Movies',
  41.  'http://www.jsonline.com/rss?c=y&path=%2Fentertainment%2Fmovies'),
  42. ('Music and Nightlife',
  43.  'http://www.jsonline.com/rss?c=y&path=%2Fentertainment%2Fmusicandnightlife'),
  44. ('Dining',
  45.  'http://www.jsonline.com/rss?c=y&path=%2Ffeatures%2Fdining'),
  46. ('Fashion',
  47.  'http://www.jsonline.com/rss?c=y&path=%2Ffeatures%2Ffashion'),
  48. ('Health and Fitness',
  49.  'http://www.jsonline.com/rss?c=y&path=%2Ffeatures%2Fhealth'),
  50. ('Top Metro Stories',
  51.  'http://www.jsonline.com/rss?c=y&path=%2Fnews%2Ftopstories'),
  52. ('Crime',
  53.  'http://www.jsonline.com/rss?c=y&path=%2Fnews%2Fcrime'),
  54. ('Sports',
  55.  'http://www.jsonline.com/rss?c=y&path=%2Fsports'),
  56. ]
  57.  
  58.     #def print_version(self, url):
  59.         #return url+'/0'
  60.  
  61.     def preprocess_html(self, soup):
  62.         story = soup.find(name='div', attrs={'id':'mainContent'})
  63.         #td = heading.findParent(name='td')
  64.         #td.extract()
  65.         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
  66.         body = soup.find(name='body')
  67.         body.insert(0, story)
  68.         #td.name = 'div'
  69.         return soup
  70.