from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class YemenTimesRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en_YE'
    country = 'YE'
    version = 1

    title = u'Yemen Times'
    publisher = u'yementimes.com'
    category = u'News, Opinion, Yemen'
    description = u'Award-winning weekly from Yemen, promoting press freedom, professional journalism and the defense of human rights.'

    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    encoding = 'utf-8'

    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    # Keep only the main article container (an ASP.NET content placeholder).
    keep_only_tags = []
    keep_only_tags.append(dict(name='div', attrs={'id': 'ctl00_ContentPlaceHolder1_MAINNEWS0_Panel1',
                                                  'class': 'DMAIN2'}))
    remove_attributes = ['style']

    INDEX = 'http://www.yementimes.com/'
    feeds = []
    feeds.append((u'Our Viewpoint', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=6&pnm=OUR%20VIEWPOINT'))
    feeds.append((u'Local News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=3&pnm=Local%20news'))
    feeds.append((u'Their News', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=80&pnm=Their%20News'))
    feeds.append((u'Report', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=8&pnm=report'))
    feeds.append((u'Health', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=51&pnm=health'))
    feeds.append((u'Interview', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=77&pnm=interview'))
    feeds.append((u'Opinion', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=7&pnm=opinion'))
    feeds.append((u'Business', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=5&pnm=business'))
    feeds.append((u'Op-Ed', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=81&pnm=Op-Ed'))
    feeds.append((u'Culture', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=75&pnm=Culture'))
    feeds.append((u'Readers View', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=4&pnm=Readers%20View'))
    feeds.append((u'Variety', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=9&pnm=Variety'))
    feeds.append((u'Education', u'http://www.yementimes.com/DEFAULTSUB.ASPX?pnc=57&pnm=Education'))

    extra_css = '''
                body {font-family:verdana, arial, helvetica, geneva, sans-serif;}
                div.yemen_byline {font-size: medium; font-weight: bold;}
                div.yemen_date {font-size: small; color: #666666; margin-bottom: 0.6em;}
                .yemen_caption {font-size: x-small; font-style: italic; color: #696969;}
                '''

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher, 'linearize_tables': True}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        # Handle gzip-compressed responses transparently.
        br.set_handle_gzip(True)

        return br

    def parse_index(self):
        answer = []
        for feed_title, feed in self.feeds:
            soup = self.index_to_soup(feed)

            # The article links for a section sit in the first <table> sibling
            # that follows the div with class 'newsbox'. Skip sections whose
            # page does not have the expected structure.
            newsbox = soup.find('div', 'newsbox')
            if newsbox is None:
                continue
            main = newsbox.findNextSibling('table')
            if main is None:
                continue

            articles = []
            for li in main.findAll('li'):
                title = self.tag_to_string(li.a)
                url = self.INDEX + li.a['href']
                articles.append({'title': title, 'date': None, 'url': url, 'description': '<br/> '})

            answer.append((feed_title, articles))

        return answer

    def preprocess_html(self, soup):
        # Rebuild each article in a fresh, minimal document: headline, lead
        # image, date, byline and story body, in that order.
        freshSoup = self.getFreshSoup(soup)

        headline = soup.find('div', attrs={'id': 'DVMTIT'})
        if headline:
            div = headline.findNext('div', attrs={'id': 'DVTOP'})
            img = None
            if div:
                img = div.find('img')

            headline.name = 'h1'
            freshSoup.body.append(headline)
            if img is not None:
                freshSoup.body.append(img)

        byline = soup.find('div', attrs={'id': 'DVTIT'})
        if byline:
            # The first <span> in the byline holds the publication date; move
            # it into its own div so the yemen_date rule in extra_css applies.
            date = None
            date_el = byline.find('span')
            if date_el:
                pub_date = self.tag_to_string(date_el)
                date = Tag(soup, 'div', attrs=[('class', 'yemen_date')])
                date.append(pub_date)
                date_el.extract()

            raw = '<br/>'.join(['%s' % (part) for part in byline.findAll(text=True)])
            author = BeautifulSoup('<div class="yemen_byline">' + raw + '</div>')

            if date is not None:
                freshSoup.body.append(date)
            freshSoup.body.append(author)

        story = soup.find('div', attrs={'id': 'DVDET'})
        if story:
            # Tables that wrap an image are photo captions; tag them so the
            # yemen_caption rule in extra_css can style them.
            for table in story.findAll('table'):
                if table.find('img'):
                    table['class'] = 'yemen_caption'

            freshSoup.body.append(story)

        return freshSoup

    def getFreshSoup(self, oldSoup):
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        if oldSoup.head.title:
            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))
        return freshSoup
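
# Development note (a hedged usage sketch, not part of the recipe's logic):
# calibre recipes like this one are typically exercised from the command line
# with ebook-convert, which accepts a .recipe file directly, e.g.:
#
#   ebook-convert yementimes.recipe yementimes.epub --test -vv
#
# The --test switch limits the download to a couple of articles per feed and
# -vv prints verbose progress, which keeps iteration on parse_index() and
# preprocess_html() fast. The output file name here is only an example.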