home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / outlook_india.recipe < prev    next >
Text File  |  2011-09-09  |  5KB  |  131 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = '2009, Kovid Goyal <kovid at kovidgoyal.net>'
  5. import re
  6. from calibre.web.feeds.news import BasicNewsRecipe
  7.  
  8. class OutlookIndia(BasicNewsRecipe):
  9.  
  10.     title          = 'Outlook India'
  11.     __author__     = 'Kovid Goyal and Sujata Raman'
  12.     description    = 'Weekly news and current affairs in India'
  13.     no_stylesheets = True
  14.     encoding       = 'utf-8'
  15.     language = 'en_IN'
  16.  
  17.     extra_css = '''
  18.                  body{font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  19.                 .fspheading{color:#AF0E25 ; font-family:"Times New Roman",Times,serif; font-weight:bold ; font-size:large; }
  20.                 .fspauthor{color:#AF0E25; font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  21.                 .fspintro{color:#666666; font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  22.                 .fspchannelhome{font-family:Arial,Helvetica,sans-serif; font-size:x-small;}
  23.                 .fspphotocredit{color:##999999; font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
  24.                 '''
  25.     keep_only_tags = [
  26.                       dict(name='div', attrs={'id':["ctl00_cphpagemiddle_reparticle_ctl00_divfullstorytext","ctl00_cphpagemiddle_reparticle_ctl00_divartpic","ctl00_cphpagemiddle_reparticle_ctl00_divfspheading", "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccaption",  "ctl00_cphpagemiddle_reparticle_ctl00_divartpiccredit","ctl00_cphpagemiddle_reparticle_ctl00_divfspintro", "ctl00_cphpagemiddle_reparticle_ctl00_divartbyline", "ctl00_cphpagemiddle_divglitteratiregulars","ctl00_cphpagemiddle_divcartoon","feedbackslatestfirst","ctl00_cphpagemiddle_divregulars","ctl00_cphpagemiddle_divquotes"]}),
  27.                            ]
  28.     remove_tags = [dict(name=['script','object','hr']),]
  29.  
  30.     def get_browser(self):
  31.         br = BasicNewsRecipe.get_browser(self)
  32.         # This site sends article titles in the cookie which occasionally
  33.         # contain non ascii characters causing httplib to fail. Instead just
  34.         # disable cookies as they're not needed for download. Proper solution
  35.         # would be to implement a unicode aware cookie jar
  36.         br.set_cookiejar(None)
  37.         return br
  38.  
  39.     def parse_index(self):
  40.  
  41.  
  42.         soup = self.index_to_soup('http://www.outlookindia.com/issues.aspx')
  43.         # find cover pic
  44.         div = soup.find('div', attrs={'class':re.compile('cententcellpadding')})
  45.  
  46.         if div is None: return None
  47.         a = div.find('a')
  48.  
  49.         if a is not None:
  50.             href =  'http://www.outlookindia.com/' + a['href']
  51.  
  52.         soup = self.index_to_soup(href)
  53.         cover = soup.find('img', attrs={'id':"ctl00_cphpagemiddle_dlissues_ctl00_imgcoverpic"}, src=True)
  54.         if cover is not None:
  55.  
  56.             self.cover_url = cover['src']
  57.  
  58.          # end find cover pic
  59.          #find current issue
  60.         div = soup.find('table', attrs={'id':re.compile('ctl00_cphpagemiddle_dlissues')})
  61.  
  62.         if div is None: return None
  63.         a = div.find('a')
  64.  
  65.         if a is not None:
  66.             href =  'http://www.outlookindia.com/' + a['href']
  67.  
  68.         soup = self.index_to_soup(href)
  69.         #find current issue
  70.  
  71.         #find the articles in the current issue
  72.         articles = []
  73.  
  74.         for a in soup.findAll('a', attrs={'class':['contentpgsubheadinglink',"contentpgtext6",]}):
  75.  
  76.             if a and a.has_key('href'):
  77.  
  78.                 url = 'http://www.outlookindia.com/' + a['href']
  79.             else:
  80.                 url =''
  81.  
  82.             title = self.tag_to_string(a)
  83.  
  84.             desc = ''
  85.             date = ''
  86.             articles.append({
  87.                                  'title':title,
  88.                                  'date':date,
  89.                                  'url':url,
  90.                                  'description':desc,
  91.                                 })
  92.         for a in soup.findAll('a', attrs={'id':["ctl00_cphpageleft_hlglitterati","ctl00_cphpageleft_hlposcape",]}):
  93.  
  94.             if a and a.has_key('href'):
  95.  
  96.                 url = 'http://www.outlookindia.com/' + a['href']
  97.             else:
  98.                 url =''
  99.  
  100.             title = self.tag_to_string(a)
  101.  
  102.             desc = ''
  103.             date = ''
  104.             articles.append({
  105.                                  'title':title,
  106.                                  'date':date,
  107.                                  'url':url,
  108.                                  'description':desc,
  109.                                 })
  110.  
  111.  
  112.         return [('Current Issue', articles)]
  113.  
  114.     def preprocess_html(self, soup):
  115.         for item in soup.findAll(style=True):
  116.             del item['style']
  117.         return self.adeify_images(soup)
  118.  
  119.  
  120.  
  121.     def postrocess_html(self, soup, first):
  122.  
  123.             for item in soup.findAll(align = "left"):
  124.                 del item['align']
  125.  
  126.             for tag in soup.findAll(name=['table', 'tr','td','tbody','ul','li','font','span']):
  127.                 tag.name = 'div'
  128.  
  129.             return soup
  130.  
  131.