home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / dnevnik_mk.recipe < prev    next >
Encoding:
Text File  |  2011-09-09  |  4.4 KB  |  99 lines

  1. #!/usr/bin/env  python
  2.  
  3. __author__    = 'Darko Spasovski'
  4. __license__   = 'GPL v3'
  5. __copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
  6. '''
  7. dnevnik.com.mk
  8. '''
  9.  
  10. import re
  11. import datetime
  12. from calibre.web.feeds.news import BasicNewsRecipe
  13. from calibre import browser
  14. from calibre.ebooks.BeautifulSoup import BeautifulSoup
  15.  
  16. class Dnevnik(BasicNewsRecipe):
  17.  
  18.     INDEX                 = 'http://www.dnevnik.com.mk'
  19.     __author__ = 'Darko Spasovski'
  20.     title                 = 'Dnevnik - mk'
  21.     description           = 'Daily Macedonian newspaper'
  22.     masthead_url          = 'http://www.dnevnik.com.mk/images/re-logo.gif'
  23.     language              = 'mk'
  24.     publication_type      = 'newspaper'
  25.     category              = 'news, Macedonia'
  26.     max_articles_per_feed = 100
  27.     remove_javascript     = True
  28.     no_stylesheets        = True
  29.     use_embedded_content  = False
  30.  
  31.     preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
  32.     [
  33.         ## Remove anything before the start of the article.
  34.         (r'<body.*?<\?xml version=\"1.0\"\?><!--Article start-->', lambda match: '<body>'),
  35.  
  36.         ## Remove anything after the end of the article.
  37.         (r'<!--Article end.*?</body>', lambda match : '</body>'),
  38.         ]
  39.     ]
  40.  
  41.     extra_css = """
  42.                     body{font-family: Arial,Helvetica,sans-serif}
  43.                     .WB_DNEVNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
  44.                 """
  45.  
  46.     conversion_options = {
  47.                           'comment'  : description,
  48.                           'tags'     : category,
  49.                           'language' : language,
  50.                           'linearize_tables' : True
  51.                         }
  52.  
  53.     def parse_index(self):
  54.         datum = datetime.datetime.today().strftime('%d.%m.%Y')
  55.         soup = self.index_to_soup(self.INDEX + '/default.asp?section=arhiva&arhDatum=' + datum)
  56.         feeds = []
  57.         for section in soup.findAll('td', attrs={'class':'WB_DNEVNIK_ArhivaFormTitle'}):
  58.             sectionTitle = section.contents[0].string
  59.             if sectionTitle.lower().startswith('online'):
  60.                 # Skip online articles
  61.                 continue
  62.             containerTable = section.findPrevious(name='table').findNextSibling(name='table')
  63.             if containerTable==None:
  64.                 print 'No container table found - page layout may have been changed.'
  65.                 continue
  66.             articles = []
  67.             for article in containerTable.findAll('a', attrs={'class': 'WB_DNEVNIK_ArhivaFormText'}):
  68.                 title = self.tag_to_string(article, use_alt=True).strip()
  69.                 articles.append({'title': title, 'url':'http://www.dnevnik.com.mk/' + article['href'], 'description':'', 'date':''})
  70.             if articles:
  71.                 feeds.append((sectionTitle, articles))
  72.         return sorted(feeds, key=lambda section: self.get_weight(section))
  73.  
  74.     def get_weight(self, section):
  75.         """
  76.         Returns 'weight' of a section.
  77.         Used for sorting the sections based on their 'natural' order in the printed edition.
  78.         """
  79.         natural_order = { u'╨▓╨╛ ╤ä╨╛╨║╤â╤ü╨╛╤é': 1, u'╨░╨║╤é╤â╨╡╨╗╨╜╨╛': 2, u'╨╡╨║╨╛╨╜╨╛╨╝╨╕╤ÿ╨░': 3,
  80.                           u'╨╛╤é╨▓╨╛╤Ç╨╡╨╜╨░': 4, u'╤ü╨▓╨╡╤é': 5, u'╨╕╨╜╤é╨╡╤Ç╨▓╤ÿ╤â': 6, u'╤ƒ╤â╨▒╨╛╨║╤ü': 7,
  81.                           u'╤Ç╨╡╨┐╨╛╤Ç╤é╨░╨╢╨░': 8, u'╨╜╨░╤ê ╤é╤â╤Ç╨╕╨╖╨░╨╝': 9, u'╨╢╨╕╨▓╨╛╤é': 10,
  82.                           u'╨░╨▓╤é╨╛╨╝╨╛╨▒╨╕╨╗╨╕╨╖╨░╨╝': 11, u'╤ü╨┐╨╛╤Ç╤é': 12, u'╨╛╨╝╨╜╨╕╨▒╤â╤ü': 13 }
  83.         if section[0].string.lower() in natural_order:
  84.             return natural_order[section[0].string.lower()]
  85.         else:
  86.             return 999  # section names not on the list go to the bottom
  87.  
  88.     def get_cover_url(self):
  89.         datum = datetime.datetime.today().strftime('%d.%m.%Y')
  90.         soup = self.index_to_soup(self.INDEX + '/default.asp?section=arhiva&arhDatum=' + datum)
  91.         anchor = soup.find('a', attrs={'class': 'WB_DNEVNIK_MoreLink'})
  92.         if anchor != None:
  93.             raw = browser().open_novisit(self.INDEX + '/' + anchor['href']).read()
  94.             cover_soup = BeautifulSoup(raw)
  95.             url = cover_soup.find('div', attrs={'class':'WB_DNEVNIK_Datum2'}).findNext('img')['src']
  96.             return self.INDEX + '/' + url
  97.         return ''
  98.  
  99.