home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / linuxdevices.recipe < prev    next >
Text File  |  2011-09-09  |  4KB  |  98 lines

  1. __license__   = 'GPL v3'
  2. __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
  3.  
  4. '''
  5. Fetch Linuxdevices.
  6. '''
  7. import re
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9.  
  10.  
  11. class LinuxDevices(BasicNewsRecipe):
  12.  
  13.     title = u'Linuxdevices'
  14.     description = 'News about Linux driven Hardware'
  15.     __author__ = 'Oliver Niesner'
  16.     use_embedded_content   = False
  17.     timefmt = ' [%a %d %b %Y]'
  18.     max_articles_per_feed = 50
  19.     no_stylesheets = True
  20.     language = 'en'
  21.  
  22.     remove_javascript = True
  23.     conversion_options = { 'linearize_tables' : True}
  24.     encoding = 'latin1'
  25.  
  26.  
  27.     remove_tags_after = [dict(id='intelliTxt')]
  28.     filter_regexps = [r'ad\.doubleclick\.net']
  29.  
  30.     remove_tags = [dict(name='div', attrs={'class':'bannerSuperBanner'}),
  31.                    dict(name='div', attrs={'class':'bannerSky'}),
  32.                    dict(name='div', attrs={'border':'0'}),
  33.                    dict(name='div', attrs={'class':'footerLinks'}),
  34.                    dict(name='div', attrs={'class':'seitenanfang'}),
  35.                    dict(name='td', attrs={'class':'mar5'}),
  36.                    dict(name='table', attrs={'class':'pageAktiv'}),
  37.                    dict(name='table', attrs={'class':'xartable'}),
  38.                    dict(name='table', attrs={'class':'wpnavi'}),
  39.                    dict(name='table', attrs={'class':'bgcontent absatz'}),
  40.                    dict(name='table', attrs={'class':'footer'}),
  41.                    dict(name='table', attrs={'class':'artikelBox'}),
  42.                    dict(name='table', attrs={'class':'kommentare'}),
  43.                    dict(name='table', attrs={'class':'pageBoxBot'}),
  44.                    dict(name='table', attrs={'td':'height="3"'}),
  45.                    dict(name='table', attrs={'class':'contentpaneopen'}),
  46.                    dict(name='td', attrs={'nowrap':'nowrap'}),
  47.                    dict(name='td', attrs={'align':'left'}),
  48.                    dict(name='td', attrs={'height':'5'}),
  49.                    dict(name='td', attrs={'class':'ArticleWidgetsHeadline'}),
  50.                    dict(name='div', attrs={'class':'artikelBox navigatorBox'}),
  51.                    dict(name='div', attrs={'class':'similar-article-box'}),
  52.                    dict(name='div', attrs={'class':'videoBigHack'}),
  53.                    dict(name='td', attrs={'class':'artikelDruckenRight'}),
  54.                    dict(name='td', attrs={'class':'width="200"'}),
  55.                    dict(name='span', attrs={'class':'content_rating'}),
  56.                    dict(name='a', attrs={'href':'http://www.addthis.com/bookmark.php'}),
  57.                    dict(name='a', attrs={'href':'/news'}),
  58.                    dict(name='a', attrs={'href':'/cgi-bin/survey/survey.cgi'}),
  59.                    dict(name='a', attrs={'href':'/cgi-bin/board/UltraBoard.pl'}),
  60.                    dict(name='iframe'),
  61.                    dict(name='form'),
  62.                    dict(name='span', attrs={'class':'hidePrint'}),
  63.                    dict(id='ArticleWidgets'),
  64.                    dict(id='headerLBox'),
  65.                    dict(id='nointelliTXT'),
  66.                    dict(id='rechteSpalte'),
  67.                    dict(id='newsticker-list-small'),
  68.                    dict(id='ntop5'),
  69.                    dict(id='ntop5send'),
  70.                    dict(id='ntop5commented'),
  71.                    dict(id='nnav-bgheader'),
  72.                    dict(id='nnav-headerteaser'),
  73.                    dict(id='nnav-head'),
  74.                    dict(id='nnav-top'),
  75.                    dict(id='readcomment')]
  76.  
  77.  
  78.  
  79.     feeds =  [ (u'Linuxdevices', u'http://www.linuxfordevices.com/rss.xml') ]
  80.  
  81.     def preprocess_html(self, soup):
  82.         match = re.compile(r"^Related")
  83.         for item in soup.findAll('b', text=match):
  84.             item.extract()
  85.         for item in soup.findAll(re.compile('^ul')):
  86.             item.extract()
  87.         for item in soup.findAll('br', limit=10):
  88.             item.extract()
  89.         return soup
  90.  
  91.  
  92.     def postprocess_html(self, soup, first):
  93.         for tag in soup.findAll(name=['table', 'tr', 'td']):
  94.             tag.name = 'div'
  95.         return soup
  96.  
  97.  
  98.