
__license__   = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'

# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Vancouver'
# Users of Kindle 3 with limited system-level CJK support,
# please replace the following "True" with "False".
__MakePeriodical__ = True
# Set the following to True if your device supports display of CJK titles
__UseChineseTitle__ = False
# Set it to False if you want to skip images
__KeepImages__ = True
# (HK only) Set the following to True if you wish to use life.mingpao.com as the main article source
__UseLife__ = True


'''
Change Log:
2011/06/26: add fetching Vancouver and Toronto versions of the paper, also provide captions for images using life.mingpao fetch source
            provide options to remove all images in the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
2011/03/06: add new articles for finance section, also a new section "Columns"
2011/02/28: rearrange the sections
            [Disabled until Kindle has better CJK support and can remember last (section,article) read in Sections & Articles
            View] make it the same title if generating a periodical, so past issues will be automatically put into the "Past Issues"
            folder in Kindle 3
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
            clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
            ordering of articles
2010/11/12: add news image and eco-news section
2010/11/08: add parsing of finance section
2010/11/06: temporary work-around for Kindle device having no capability to display unicode
            in section/article list
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
    if __Region__ == 'Hong Kong':
        title       = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
        category    = 'Chinese, News, Hong Kong'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
        keep_only_tags = [dict(name='h1'),
                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
                          dict(attrs={'id':['newscontent01','newscontent02']}),
                          dict(attrs={'class':['photo']}),
                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}),  # content in printed version of life.mingpao.com
                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='style'),
                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
                           #dict(name='table')  # for content fetched from life.mingpao.com
                          ]
        else:
            remove_tags = [dict(name='style'),
                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
                           dict(name='img'),
                           #dict(name='table')  # for content fetched from life.mingpao.com
                          ]
        remove_attributes = ['width']
        preprocess_regexps = [
                              (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
                              lambda match: '<h1>'),
                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
                              lambda match: '</h1>'),
                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
                              lambda match: ''),
                              # skip <br> after title in life.mingpao.com fetched article
                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
                              lambda match: "<div id='newscontent'>"),
                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
                              lambda match: "</b>")
                             ]
    elif __Region__ == 'Vancouver':
        title       = 'Ming Pao - Vancouver'
        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
        category    = 'Chinese, News, Vancouver'
        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),  # strip non-breaking space entities
                              lambda match: ''),
                             ]
    elif __Region__ == 'Toronto':
        title       = 'Ming Pao - Toronto'
        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
        category    = 'Chinese, News, Toronto'
        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),  # strip non-breaking space entities
                              lambda match: ''),
                             ]

    oldest_article = 1
    max_articles_per_feed = 100
    __author__            = 'Eddie Lau'
    publisher             = 'MingPao'
    remove_javascript = True
    use_embedded_content   = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
    timefmt = ''

    def image_url_processor(cls, baseurl, url):
        # trick: break the url at the first occurrence of a digit and add an
        # additional '_' at the front
        # not working, may need to move this to the preprocess_html() method
        # (the commented-out search below only located the index of the first digit in the url)
#        minIdx = 10000
#        for digit in '0123456789':
#            idx = url.find(digit)
#            if idx >= 0 and idx < minIdx:
#                minIdx = idx
        return url
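    # A minimal sketch (not part of the original recipe) of what the intended
    # transformation might look like, assuming the goal is to insert an extra
    # '_' in front of the first digit in the url:
    #
    #     m = re.search(r'\d', url)
    #     if m is not None:
    #         url = url[:m.start()] + '_' + url[m.start():]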

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        if __Region__ == 'Hong Kong':
            # convert UTC to local hk time - at HKT 5.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
        elif __Region__ == 'Vancouver':
            # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
            #dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24)
        elif __Region__ == 'Toronto':
            # convert UTC to local Toronto time - at EST time 8.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(8.5/24)
            #dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24)
        return dt_local
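    # A hedged alternative sketch (assuming pytz can be imported in this
    # environment, which the recipe does not verify): using a named time zone
    # instead of the fixed offsets above would also respect daylight-saving
    # time, e.g. for Vancouver:
    #
    #     import pytz
    #     dt_local = datetime.datetime.now(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24)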

    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchday(self):
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        if __Region__ == 'Hong Kong':
            cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
        elif __Region__ == 'Vancouver':
            cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
        elif __Region__ == 'Toronto':
            cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            cover = None
        return cover

    def parse_index(self):
        feeds = []
        dateStr = self.get_fetchdate()

        if __Region__ == 'Hong Kong':
            if __UseLife__:
                for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
                                           (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
                                           (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
                                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr', 'nal'),
                                           (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalfa', 'nal'),
                                           (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalca', 'nal'),
                                           (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalta', 'nal'),
                                           (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                                           (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
                                           (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal'),
                                           (u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn', 'ncl')]:
                    articles = self.parse_section2(url, keystr)
                    if articles:
                        feeds.append((title, articles))

                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))
            else:
                for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                                   (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                                   (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))

                # special - editorial
                ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=nalmr')
                if ed_articles:
                    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

                for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                                   (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                                   (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))

                # special - finance
                #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
                fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
                if fin_articles:
                    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

                for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))

                # special - entertainment
                ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
                if ent_articles:
                    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))


                # special - columns
                col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr +'&Category=ncolumn')
                if col_articles:
                    feeds.append((u'\u5c08\u6b04 Columns', col_articles))
        elif __Region__ == 'Vancouver':
            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                               (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
                               (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
                               (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
                               (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
                               (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]:
                articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
                if articles:
                    feeds.append((title, articles))
        elif __Region__ == 'Toronto':
            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
                               (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
                               (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
                               (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
                               (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
                               (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]:
                articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
                if articles:
                    feeds.append((title, articles))
        return feeds

    # parse from news.mingpao.com
    def parse_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
        current_articles = []
        included_urls = []
        divs.reverse()
        for i in divs:
            a = i.find('a', href=True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' + url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                url = url.replace('dailynews3.cfm', 'dailynews3a.cfm')  # use printed version of the article
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    # parse from www.mingpaovan.com
    def parse_section3(self, url, baseUrl):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
        current_articles = []
        included_urls = []
        divs.reverse()
        for i in divs:
            title = self.tag_to_string(i)
            urlstr = i.get('href', False)
            urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
            if urlstr not in included_urls:
                current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
                included_urls.append(urlstr)
        current_articles.reverse()
        return current_articles

    def parse_ed_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_fin_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        current_articles = []
        included_urls = []
        for i in a:
            #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            #if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
            if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
                title = self.tag_to_string(i)
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_col_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        # strip inline styles, fixed widths and align="absmiddle" attributes
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        for item in soup.findAll(align='absmiddle'):
            del item['align']
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        if __UseChineseTitle__ == True:
            if __Region__ == 'Hong Kong':
                title = u'\u660e\u5831 (\u9999\u6e2f)'
            elif __Region__ == 'Vancouver':
                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
            elif __Region__ == 'Toronto':
                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
        else:
            title = self.short_title()
        # if not generating a periodical, force date to apply in title
        if __MakePeriodical__ == False:
            title = title + ' ' + self.get_fetchformatteddate()
        if True:
            mi = MetaInformation(title, [self.publisher])
            mi.publisher = self.publisher
            mi.author_sort = self.publisher
            if __MakePeriodical__ == True:
                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
            else:
                mi.publication_type = self.publication_type+':'+self.short_title()
            #mi.timestamp = nowf()
            mi.timestamp = self.get_dtlocal()
            mi.comments = self.description
            if not isinstance(mi.comments, unicode):
                mi.comments = mi.comments.decode('utf-8', 'replace')
            #mi.pubdate = nowf()
            mi.pubdate = self.get_dtlocal()
            opf_path = os.path.join(dir, 'index.opf')
            ncx_path = os.path.join(dir, 'index.ncx')
            opf = OPFCreator(dir, mi)
            # Add mastheadImage entry to <guide> section
            mp = getattr(self, 'masthead_path', None)
            if mp is not None and os.access(mp, os.R_OK):
                from calibre.ebooks.metadata.opf2 import Guide
                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
                ref.type = 'masthead'
                ref.title = 'Masthead Image'
                opf.guide.append(ref)

            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
            manifest.append(os.path.join(dir, 'index.html'))
            manifest.append(os.path.join(dir, 'index.ncx'))

            # Get cover
            cpath = getattr(self, 'cover_path', None)
            if cpath is None:
                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
                if self.default_cover(pf):
                    cpath = pf.name
            if cpath is not None and os.access(cpath, os.R_OK):
                opf.cover = cpath
                manifest.append(cpath)

            # Get masthead
            mpath = getattr(self, 'masthead_path', None)
            if mpath is not None and os.access(mpath, os.R_OK):
                manifest.append(mpath)

            opf.create_manifest_from_files_in(manifest)
            for mani in opf.manifest:
                if mani.path.endswith('.ncx'):
                    mani.id = 'ncx'
                if mani.path.endswith('mastheadImage.jpg'):
                    mani.id = 'masthead-image'
            entries = ['index.html']
            toc = TOC(base_path=dir)
            self.play_order_counter = 0
            self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth, description=desc)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, self.publisher, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                           f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)

  595.