Chip 2011 November

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / dnevnik_mk.recipe < prev next >

Wrap

Text File | 2011-09-09 | 4.4 KB | 99 lines

#!/usr/bin/env python

__author__ = 'Darko Spasovski'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
'''
dnevnik.com.mk
'''

import re
import datetime
from contextlib import closing

from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class Dnevnik(BasicNewsRecipe):
    """Calibre recipe that builds an issue from the dnevnik.com.mk archive page.

    The archive page for today's date is fetched, every section header on it
    is turned into a feed, and the feeds are ordered by ``SECTION_ORDER``.
    """

    INDEX = 'http://www.dnevnik.com.mk'
    __author__ = 'Darko Spasovski'
    title = 'Dnevnik - mk'
    description = 'Daily Macedonian newspaper'
    masthead_url = 'http://www.dnevnik.com.mk/images/re-logo.gif'
    language = 'mk'
    publication_type = 'newspaper'
    category = 'news, Macedonia'
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False

    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the start of the article.
            (r'<body.*?<\?xml version=\"1.0\"\?>', lambda match: '<body>'),

            ## Remove anything after the end of the article.
            (r'<!--Article end.*?</body>', lambda match : '</body>'),
        ]
    ]

    extra_css = """
        body{font-family: Arial,Helvetica,sans-serif}
        .WB_DNEVNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
        """

    conversion_options = {
        'comment': description,
        'tags': category,
        'language': language,
        'linearize_tables': True
    }

    # Weight of each section, keyed by the lower-cased section title as it
    # appears on the archive page. Lower weights sort first.
    SECTION_ORDER = {
        u'во фокусот': 1,
        u'актуелно': 2,
        u'економија': 3,
        u'отворена': 4,
        u'свет': 5,
        u'интервју': 6,
        u'џубокс': 7,
        u'репортажа': 8,
        u'наш туризам': 9,
        u'живот': 10,
        u'автомобилизам': 11,
        u'спорт': 12,
        u'омнибус': 13,
    }

    # Weight given to sections that are not listed in SECTION_ORDER, so that
    # they end up after all the listed ones.
    UNLISTED_WEIGHT = 999

    def _archive_url(self):
        """Return the URL of the archive page for today's date (DD.MM.YYYY)."""
        datum = datetime.datetime.today().strftime('%d.%m.%Y')
        return self.INDEX + '/default.asp?section=arhiva&arhDatum=' + datum

    def parse_index(self):
        """Build the list of feeds from today's archive page.

        Returns a list of ``(section_title, articles)`` tuples sorted by
        ``get_weight``. Each article is a dict with the keys ``title``,
        ``url``, ``description`` and ``date``. Sections whose title starts
        with 'online', sections without a title, and sections without any
        article link are left out.
        """
        soup = self.index_to_soup(self._archive_url())
        feeds = []
        for section in soup.findAll('td', attrs={'class': 'WB_DNEVNIK_ArhivaFormTitle'}):
            # The title is the text of the first child of the header cell;
            # an empty cell or a non-text first child yields no title.
            section_title = section.contents[0].string if section.contents else None
            if not section_title:
                continue
            if section_title.lower().startswith('online'):
                # Skip online articles
                continue

            # The articles live in the table that follows the table holding
            # the section header. Either lookup can come back empty.
            header_table = section.findPrevious(name='table')
            container_table = None
            if header_table is not None:
                container_table = header_table.findNextSibling(name='table')
            if container_table is None:
                self.log.warn('No container table found - page layout may have been changed.')
                continue

            articles = []
            for link in container_table.findAll('a', attrs={'class': 'WB_DNEVNIK_ArhivaFormText'}):
                href = link.get('href')
                if not href:
                    # An anchor without a target cannot be downloaded.
                    continue
                articles.append({
                    'title': self.tag_to_string(link, use_alt=True).strip(),
                    'url': self.INDEX + '/' + href,
                    'description': '',
                    'date': '',
                })
            if articles:
                feeds.append((section_title, articles))
        return sorted(feeds, key=self.get_weight)

    def get_weight(self, section):
        """
        Returns 'weight' of a section.
        Used for sorting the sections based on their 'natural' order in the printed edition.

        ``section`` is a ``(title, articles)`` tuple as built by
        ``parse_index``. Titles are compared lower-cased and stripped of
        surrounding whitespace; titles that are missing or not listed in
        ``SECTION_ORDER`` get ``UNLISTED_WEIGHT``.
        """
        title = section[0]
        # Parsed titles expose their text through ``.string``; fall back to
        # the object itself so plain strings are accepted as well.
        text = getattr(title, 'string', title)
        if not text:
            return self.UNLISTED_WEIGHT
        # section names not on the list go to the bottom
        return self.SECTION_ORDER.get(text.lower().strip(), self.UNLISTED_WEIGHT)

    def get_cover_url(self):
        """Return the URL of today's cover image, or '' when none is found.

        The archive page is searched for the 'WB_DNEVNIK_MoreLink' anchor;
        the page it points to is fetched and the first ``img`` following the
        'WB_DNEVNIK_Datum2' div is used as the cover.
        """
        soup = self.index_to_soup(self._archive_url())
        anchor = soup.find('a', attrs={'class': 'WB_DNEVNIK_MoreLink'})
        if anchor is None or not anchor.get('href'):
            return ''

        # Close the response explicitly instead of leaving it to the
        # garbage collector.
        with closing(browser().open_novisit(self.INDEX + '/' + anchor['href'])) as response:
            raw = response.read()

        cover_soup = BeautifulSoup(raw)
        date_div = cover_soup.find('div', attrs={'class': 'WB_DNEVNIK_Datum2'})
        image = date_div.findNext('img') if date_div is not None else None
        if image is None or not image.get('src'):
            self.log.warn('No cover image found - page layout may have been changed.')
            return ''
        return self.INDEX + '/' + image['src']