home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
-
- u'''
- Ведомости
- '''
-
- from calibre.web.feeds.feedparser import parse
- from calibre.ebooks.BeautifulSoup import Tag
- from calibre.web.feeds.news import BasicNewsRecipe
-
class VedomostiRecipe(BasicNewsRecipe):
    """Calibre recipe for the Russian daily business newspaper
    Vedomosti (vedomosti.ru).

    Articles are pulled from a single RSS feed and distributed into
    virtual feeds by their RSS category tags; article pages are then
    rebuilt in :meth:`postprocess_html` to keep only the text, title,
    image (with captions), abstract and authors.
    """

    title = u'Ведомости'
    __author__ = 'Nikolai Kotchetkov'
    publisher = 'vedomosti.ru'
    category = 'press, Russia'
    description = u'Ежедневная деловая газета'
    oldest_article = 3
    max_articles_per_feed = 100

    masthead_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
    cover_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'

    # Add feed names if you want them to be sorted (feeds of this list appear first)
    sortOrder = [u'_default', u'Первая полоса', u'Власть и деньги']

    encoding = 'cp1251'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = True
    recursions = 0

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [dict(name='td', attrs={'class': ['second_content']})]

    remove_tags_after = [dict(name='div', attrs={'class': 'article_text'})]

    remove_tags = [dict(name='div', attrs={'class': ['sep', 'choice', 'articleRightTbl']})]

    feeds = [u'http://www.vedomosti.ru/newspaper/out/rss.xml']

    # Base URL used to absolutize relative links found in article bodies.
    base_url = u'http://www.vedomosti.ru'

    extra_css = 'h1 {font-size: 1.5em; margin: 0em 0em 0em 0em; text-align: center;}'\
        'h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;}'\
        'h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}'\
        '.article_date {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
        '.article_authors {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
        '.article_img {width:100%; text-align: center; padding: 3px 3px 3px 3px;}'\
        '.article_img_desc {width:100%; text-align: center; font-size: 0.5em; color: gray; font-family: monospace;}'\
        '.article_desc {font-size: 1em; font-style:italic;}'

    def parse_index(self):
        """Fetch the RSS feed and group articles into virtual feeds by tag.

        Returns a list of ``(feed_title, articles)`` tuples.  Feeds named
        in :attr:`sortOrder` appear first, in that order; remaining feeds
        follow in discovery order.  Raises ``NotImplementedError`` on any
        failure — calibre's convention for "no articles could be found".
        """
        try:
            feedData = parse(self.feeds[0])
            if not feedData:
                raise NotImplementedError
            self.log("parse_index: Feed loaded successfully.")
            # Prefer the feed's own title/description when present.
            if 'title' in feedData.feed:
                self.title = feedData.feed.title
                self.log("parse_index: Title updated to: ", self.title)
            if 'description' in feedData.feed:
                self.description = feedData.feed.description
                self.log("parse_index: Description updated to: ", self.description)

            # Local map: feed name -> (feed name, list of article dicts).
            feeds = {}

            def get_virtual_feed_articles(feed):
                # Return the article list for `feed`, creating it on first use.
                if feed in feeds:
                    return feeds[feed][1]
                self.log("Adding new feed: ", feed)
                articles = []
                feeds[feed] = (feed, articles)
                return articles

            # Iterate feed items and distribute articles using tags.
            for item in feedData.entries:
                link = item.get('link', '')
                title = item.get('title', '')
                if '' == link or '' == title:
                    continue
                article = {
                    'title': title,
                    'url': link,
                    'description': item.get('description', ''),
                    'date': item.get('date', ''),
                    'content': '',
                }
                if 'tags' not in item:
                    get_virtual_feed_articles('_default').append(article)
                    continue
                # BUGFIX: this flag must persist across the whole tag loop;
                # the original reset it per tag, so an article with several
                # empty terms was appended to '_default' multiple times.
                addedToDefault = False
                for tag in item.tags:
                    term = tag.get('term', '')
                    if '' == term:
                        if not addedToDefault:
                            get_virtual_feed_articles('_default').append(article)
                            addedToDefault = True
                        continue
                    get_virtual_feed_articles(term).append(article)

            # Feeds listed in sortOrder come first; the rest follow.
            result = []
            for feedName in self.sortOrder:
                if feedName not in feeds:
                    continue
                result.append(feeds[feedName])
                del feeds[feedName]
            result.extend(feeds.values())

            return result

        except Exception as err:
            self.log(err)
            raise NotImplementedError

    def preprocess_html(self, soup):
        """Make <img> tags Adobe-viewer friendly before further processing."""
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        """Rebuild the article page around the 'article_text' div.

        Re-attaches the title, lead image (plus its caption paragraphs),
        abstract and author line, strips fixed image dimensions, and
        absolutizes relative links using :attr:`base_url`.
        """
        # Find the article body; bail out unchanged if the page layout differs.
        contents = soup.find('div', {'class': ['article_text']})
        if not contents:
            self.log('postprocess_html: article div not found!')
            return soup
        contents.extract()

        # Find title.
        title = soup.find('h1')
        if title:
            contents.insert(0, title)

        # Find article image and its caption paragraphs.
        newstop = soup.find('div', {'class': ['newstop']})
        if newstop:
            img = newstop.find('img')
            if img:
                imgDiv = Tag(soup, 'div')
                imgDiv['class'] = 'article_img'

                # Drop fixed dimensions so the image scales on the device.
                if img.get('width') is not None:
                    del img['width']
                if img.get('height') is not None:
                    del img['height']

                # Caption paragraphs are siblings of the image's parent.
                element = img.parent.nextSibling

                img.extract()
                imgDiv.insert(0, img)

                # BUGFIX: advance to the next sibling unconditionally each
                # iteration; the original `continue`d before updating
                # `element`, looping forever on non-Tag (text) nodes.
                while element:
                    nextElement = element.nextSibling
                    if isinstance(element, Tag) and 'p' == element.name:
                        element.extract()
                        element['class'] = 'article_img_desc'
                        imgDiv.insert(len(imgDiv.contents), element)
                    element = nextElement

                contents.insert(1, imgDiv)

        # Find article abstract.
        abstract = soup.find('p', {'class': ['subhead']})
        if abstract:
            abstract['class'] = 'article_desc'
            contents.insert(2, abstract)

        # Find article authors (site markup uses the misspelled class 'autors').
        authorsDiv = soup.find('div', {'class': ['autors']})
        if authorsDiv:
            authorsP = authorsDiv.find('p')
            if authorsP:
                authorsP['class'] = 'article_authors'
                contents.insert(len(contents.contents), authorsP)

        # Fix urls that use relative path (skip empty hrefs — the original
        # indexed href[0] and crashed on href="").
        for url in contents.findAll('a'):
            href = url.get('href')
            if href and href.startswith('/'):
                url['href'] = self.base_url + href

        body = soup.find('td', {'class': ['second_content']})
        if body:
            body.replaceWith(contents)

        self.log('Result: ', soup.prettify())
        return soup