home *** CD-ROM | disk | FTP | other *** search
- #!/usr/bin/env python
- __author__ = 'Darko Spasovski'
- __license__ = 'GPL v3'
- __copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
- '''
- dnevnik.com.mk
- '''
- import re
- import datetime
- from calibre.web.feeds.news import BasicNewsRecipe
- from calibre import browser
- from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Dnevnik(BasicNewsRecipe):
    """Calibre recipe for the Macedonian daily newspaper Dnevnik (dnevnik.com.mk).

    The recipe reads the site's archive page for the current date, builds one
    feed per section found there (skipping sections whose name starts with
    "online"), and sorts the feeds by the order the sections have in the
    printed edition.
    """

    INDEX = 'http://www.dnevnik.com.mk'

    # Order of the known sections in the printed edition, keyed by the
    # lower-cased section name. Used by get_weight().
    NATURAL_ORDER = {
        u'во фокусот': 1, u'актуелно': 2, u'економија': 3,
        u'отворена': 4, u'свет': 5, u'интервју': 6, u'џубокс': 7,
        u'репортажа': 8, u'наш туризам': 9, u'живот': 10,
        u'автомобилизам': 11, u'спорт': 12, u'омнибус': 13,
    }
    # Weight given to sections that are not listed in NATURAL_ORDER, so that
    # they sort after every known section.
    UNKNOWN_SECTION_WEIGHT = 999

    __author__ = 'Darko Spasovski'
    title = 'Dnevnik - mk'
    description = 'Daily Macedonian newspaper'
    masthead_url = 'http://www.dnevnik.com.mk/images/re-logo.gif'
    language = 'mk'
    publication_type = 'newspaper'
    category = 'news, Macedonia'
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False

    # Each downloaded page is reduced to the markup between the site's
    # "Article start" / "Article end" comment markers.
    preprocess_regexps = [
        # Remove anything before the start of the article.
        (re.compile(r'<body.*?<\?xml version=\"1.0\"\?><!--Article start-->',
                    re.IGNORECASE | re.DOTALL),
         lambda match: '<body>'),
        # Remove anything after the end of the article.
        (re.compile(r'<!--Article end.*?</body>',
                    re.IGNORECASE | re.DOTALL),
         lambda match: '</body>'),
    ]

    extra_css = """
    body{font-family: Arial,Helvetica,sans-serif}
    .WB_DNEVNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
    """

    conversion_options = {
        'comment': description,
        'tags': category,
        'language': language,
        'linearize_tables': True,
    }

    def _archive_url(self):
        """Return the URL of the archive page for today's date (dd.mm.yyyy)."""
        today = datetime.datetime.today().strftime('%d.%m.%Y')
        return self.INDEX + '/default.asp?section=arhiva&arhDatum=' + today

    def _section_title(self, section):
        """Return the string of the first child of a section header cell.

        Returns None when the cell is empty or its first child does not hold
        a single string.
        """
        if not section.contents:
            return None
        return section.contents[0].string

    def _articles_table(self, section):
        """Return the table listing a section's articles, or None.

        The article table is looked up as the next sibling table of the table
        that precedes the section header cell.
        """
        header_table = section.findPrevious(name='table')
        if header_table is None:
            return None
        return header_table.findNextSibling(name='table')

    def parse_index(self):
        """Build the feeds from today's archive page.

        Returns:
            A list of ``(section_title, articles)`` tuples sorted with
            get_weight(). Every article is a dict with the keys ``title``,
            ``url``, ``description`` and ``date`` (the last two are empty).
            Sections without any article are left out.
        """
        soup = self.index_to_soup(self._archive_url())
        feeds = []
        for section in soup.findAll('td', attrs={'class': 'WB_DNEVNIK_ArhivaFormTitle'}):
            section_title = self._section_title(section)
            if section_title is None:
                # Without a title the feed cannot be named or ordered.
                self.log.warn('Section header without a title - page layout may have been changed.')
                continue
            if section_title.strip().lower().startswith('online'):
                # Skip online articles
                continue
            container_table = self._articles_table(section)
            if container_table is None:
                self.log.warn('No container table found - page layout may have been changed.')
                continue
            articles = []
            for anchor in container_table.findAll('a', attrs={'class': 'WB_DNEVNIK_ArhivaFormText'}):
                href = anchor.get('href')
                if not href:
                    # An anchor without a target cannot be downloaded.
                    continue
                articles.append({
                    'title': self.tag_to_string(anchor, use_alt=True).strip(),
                    'url': self.INDEX + '/' + href,
                    'description': '',
                    'date': '',
                })
            if articles:
                feeds.append((section_title, articles))
        return sorted(feeds, key=self.get_weight)

    def get_weight(self, section):
        """
        Returns 'weight' of a section.
        Used for sorting the sections based on their 'natural' order in the printed edition.

        ``section`` is a ``(title, articles)`` tuple as built by parse_index().
        The title may be a BeautifulSoup string/tag (anything exposing
        ``.string``) or a plain string. Section names not on the list get
        UNKNOWN_SECTION_WEIGHT and therefore go to the bottom.
        """
        title = section[0]
        # Plain strings have no `.string` attribute; fall back to the value itself.
        name = getattr(title, 'string', title)
        if name is None:
            return self.UNKNOWN_SECTION_WEIGHT
        return self.NATURAL_ORDER.get(name.strip().lower(), self.UNKNOWN_SECTION_WEIGHT)

    def get_cover_url(self):
        """Return the URL of today's cover image.

        The archive page links (anchor class ``WB_DNEVNIK_MoreLink``) to a
        page on which the cover is the first image following the
        ``WB_DNEVNIK_Datum2`` div.

        Returns:
            The absolute image URL, or an empty string when the link or the
            image cannot be found.
        """
        soup = self.index_to_soup(self._archive_url())
        anchor = soup.find('a', attrs={'class': 'WB_DNEVNIK_MoreLink'})
        if anchor is None or not anchor.get('href'):
            return ''
        cover_soup = self.index_to_soup(self.INDEX + '/' + anchor['href'])
        date_div = cover_soup.find('div', attrs={'class': 'WB_DNEVNIK_Datum2'})
        image = date_div.findNext('img') if date_div is not None else None
        if image is None or not image.get('src'):
            self.log.warn('No cover image found - page layout may have been changed.')
            return ''
        return self.INDEX + '/' + image['src']