home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / observatorul_cultural.recipe < prev    next >
Text File  |  2011-09-09  |  3KB  |  65 lines

  1. import re
  2. from calibre.web.feeds.news import BasicNewsRecipe
  3. coverpage = None
  4.  
  5. class ObservatorulCultural(BasicNewsRecipe):
  6.     title                 = u'Observatorul cultural'
  7.     __author__            = 'song2' #prelucrat dupa un script de http://www.thenowhereman.com
  8.     encoding = 'utf-8'
  9.     language = 'ro'
  10.     publication_type = 'magazine'
  11.     description = 'Spiritul critic in ac┼úiune\n'
  12.     no_stylesheets        = True
  13.     remove_javascript     = True
  14.     masthead_url='http://www.observatorcultural.ro/userfiles/article/sigla%20Observator%20cultural_02231058.JPG'
  15.     keep_only_tags = [
  16.         dict(name='div', attrs={'class':'detaliuArticol'})]
  17.     remove_tags = [dict(name='div', attrs={'class':'comentariiArticol'}),
  18.          dict(name='div', attrs={'class':'postComment'}),
  19.          dict(name='div', attrs={'class':'utileArticol'}),
  20.          dict(name='p', attrs={'class':'butonComenteaza'}),
  21.          dict(name='h5'),
  22.          dict(name='div', attrs={'style':'margin-top: 0px; padding-top: 0px;'})
  23.          ]
  24.     def parse_index(self):
  25.         soup = self.index_to_soup('http://www.observatorcultural.ro/Arhiva*-archive.html')
  26.         issueTag = soup.find('a', href=re.compile("observatorcultural.ro\/Numarul"))
  27.         issueURL = issueTag['href']
  28.         print issueURL;
  29.         issueSoup = self.index_to_soup(issueURL)
  30.         feeds = []
  31.         stories = []
  32.         for categorie in issueSoup.findAll('dl',attrs={'class':'continutArhive'}):
  33.             categ=self.tag_to_string(categorie.find('dt'))
  34.             for story in categorie.findAll('dd'):
  35.                 title=[]
  36.                 for bucatele in story.findAll('a'):
  37.                     title.append(bucatele)
  38.                 if len(title)==1: #daca articolul nu are autor
  39.                     stories.append({
  40.                         'title' : self.tag_to_string(title[0]),
  41.                         'url'   : title[0]['href'],
  42.                         'date'  : '',
  43.                         'author' : ''})
  44.                 else: # daca articolul are autor len(title)=2
  45.                     stories.append({
  46.                         'title' : self.tag_to_string(title[1]),
  47.                         'url'   :title[1]['href'],
  48.                         'date'  : '',
  49.                         'author' : self.tag_to_string(title[0])})
  50.                     print(self.tag_to_string(title[0]))
  51.                 if 'Editorial' in categ:
  52.                     global coverpage
  53.                     coverpage=title[1]['href']  # am luat link-ul spre editorial
  54.             feeds.append((categ,stories))
  55.             stories = []
  56.         print feeds
  57.         return feeds
  58. #procedura de luat coperta
  59.     def get_cover_url(self):
  60.         soup = self.index_to_soup(coverpage)
  61.         link_item = soup.find('a',attrs={'rel':'lightbox'}) # caut imaginea textului
  62.         a=''
  63.         cover_url = a.join(link_item.img['src'].split('_details_'))
  64.         return cover_url
  65.