
__license__   = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'

# Region - Hong Kong, Vancouver, Toronto
__Region__ = 'Hong Kong'
# Users of a Kindle 3 with limited system-level CJK support
# should replace the following "True" with "False".
__MakePeriodical__ = True
# Set to True if your device supports display of CJK titles
__UseChineseTitle__ = False
# Set to False if you want to skip images
__KeepImages__ = True
# (HK only) Set to True if you wish to use life.mingpao.com as the main article source
__UseLife__ = True

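# As an illustration only (not part of the original settings), a reader of the
# Vancouver edition with a CJK-capable device might combine the flags above as:
#   __Region__ = 'Vancouver'
#   __UseChineseTitle__ = True
#   __KeepImages__ = True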

'''
Change Log:
2011/09/07: disable the "column" section as it is no longer offered free.
2011/06/26: add fetching of the Vancouver and Toronto versions of the paper; also provide captions for images when using the life.mingpao fetch source,
            and provide an option to remove all images from the file
2011/05/12: switch the main parse source to life.mingpao.com, which has more photos on the article pages
2011/03/06: add new articles for the finance section, plus a new section "Columns"
2011/02/28: rearrange the sections
            [Disabled until Kindle has better CJK support and can remember the last (section, article) read in Sections & Articles
            View] use the same title when generating a periodical, so past issues are automatically put into the "Past Issues"
            folder on the Kindle 3
2011/02/20: skip duplicated links in the finance section, move photos which may extend over a whole page to the back of the articles,
            clean up the indentation
2010/12/07: add entertainment section, use the newspaper front page as the ebook cover, suppress date display in the section list
            (to avoid a wrong date display in case the user generates the ebook in a time zone different from HKT)
2010/11/22: add English section, remove the eco-news section which is not updated daily, correct
            the ordering of articles
2010/11/12: add news image and eco-news section
2010/11/08: add parsing of the finance section
2010/11/06: temporary work-around for Kindle devices that cannot display unicode
            in the section/article list
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from contextlib import nested
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation

# MAIN CLASS
class MPRecipe(BasicNewsRecipe):
    if __Region__ == 'Hong Kong':
        title       = 'Ming Pao - Hong Kong'
        description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
        category    = 'Chinese, News, Hong Kong'
        extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
        keep_only_tags = [dict(name='h1'),
                          dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                          dict(name='font', attrs={'color':['AA0000']}), # for column articles title
                          dict(attrs={'id':['newscontent']}), # entertainment and column page content
                          dict(attrs={'id':['newscontent01','newscontent02']}),
                          dict(attrs={'class':['photo']}),
                          dict(name='table', attrs={'width':['100%'], 'border':['0'], 'cellspacing':['5'], 'cellpadding':['0']}),  # content in printed version of life.mingpao.com
                          dict(name='img', attrs={'width':['180'], 'alt':['按圖放大']}) # images for source from life.mingpao.com
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='style'),
                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
                           #dict(name='table')  # for content fetched from life.mingpao.com
                          ]
        else:
            remove_tags = [dict(name='style'),
                           dict(attrs={'id':['newscontent135']}),  # for the finance page from mpfinance.com
                           dict(name='font', attrs={'size':['2'], 'color':['666666']}), # article date in life.mingpao.com article
                           dict(name='img'),
                           #dict(name='table')  # for content fetched from life.mingpao.com
                          ]
        remove_attributes = ['width']
        preprocess_regexps = [
                              (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
                              lambda match: '<h1>'),
                              (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
                              lambda match: '</h1>'),
                              (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
                              lambda match: ''),
                              # skip <br> after title in life.mingpao.com fetched article
                              (re.compile(r"<div id='newscontent'><br>", re.DOTALL|re.IGNORECASE),
                              lambda match: "<div id='newscontent'>"),
                              (re.compile(r"<br><br></b>", re.DOTALL|re.IGNORECASE),
                              lambda match: "</b>")
                             ]
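        # Illustration of the first two substitutions above (not in the original
        # source): an article heading fetched as '<h5>...</h5>' is rewritten to
        # '<h1>...</h1>' so it renders as the article title.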
    elif __Region__ == 'Vancouver':
        title       = 'Ming Pao - Vancouver'
        description = 'Vancouver Chinese Newspaper (http://www.mingpaovan.com)'
        category    = 'Chinese, News, Vancouver'
        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://www.mingpaovan.com/image/mainlogo2_VAN2.gif'
        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),  # strip non-breaking space entities
                              lambda match: ''),
                             ]
    elif __Region__ == 'Toronto':
        title       = 'Ming Pao - Toronto'
        description = 'Toronto Chinese Newspaper (http://www.mingpaotor.com)'
        category    = 'Chinese, News, Toronto'
        extra_css   = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} b>font {font-size:200%; font-weight:bold;}'
        masthead_url = 'http://www.mingpaotor.com/image/mainlogo2_TOR2.gif'
        keep_only_tags = [dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['1']}),
                          dict(name='table', attrs={'width':['450'], 'border':['0'], 'cellspacing':['3'], 'cellpadding':['3'], 'id':['tblContent3']}),
                          dict(name='table', attrs={'width':['180'], 'border':['0'], 'cellspacing':['0'], 'cellpadding':['0'], 'bgcolor':['F0F0F0']}),
                          ]
        if __KeepImages__:
            remove_tags = [dict(name='img', attrs={'src':['../../../image/magnifier.gif']})]  # the magnifier icon
        else:
            remove_tags = [dict(name='img')]
        remove_attributes = ['width']
        preprocess_regexps = [(re.compile(r'&nbsp;', re.DOTALL|re.IGNORECASE),  # strip non-breaking space entities
                              lambda match: ''),
                             ]

    oldest_article = 1
    max_articles_per_feed = 100
    __author__            = 'Eddie Lau'
    publisher             = 'MingPao'
    remove_javascript = True
    use_embedded_content   = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
    timefmt = ''

    def image_url_processor(cls, baseurl, url):
        # trick: break the url at the first occurrence of a digit, add an additional
        # '_' at the front
        # not working, may need to move this to preprocess_html() method
#        minIdx = 10000
#        i0 = url.find('0')
#        if i0 >= 0 and i0 < minIdx:
#           minIdx = i0
#        i1 = url.find('1')
#        if i1 >= 0 and i1 < minIdx:
#           minIdx = i1
#        i2 = url.find('2')
#        if i2 >= 0 and i2 < minIdx:
#           minIdx = i2
#        i3 = url.find('3')
#        if i3 >= 0 and i3 < minIdx:
#           minIdx = i3
#        i4 = url.find('4')
#        if i4 >= 0 and i4 < minIdx:
#           minIdx = i4
#        i5 = url.find('5')
#        if i5 >= 0 and i5 < minIdx:
#           minIdx = i5
#        i6 = url.find('6')
#        if i6 >= 0 and i6 < minIdx:
#           minIdx = i6
#        i7 = url.find('7')
#        if i7 >= 0 and i7 < minIdx:
#           minIdx = i7
#        i8 = url.find('8')
#        if i8 >= 0 and i8 < minIdx:
#           minIdx = i8
#        i9 = url.find('9')
#        if i9 >= 0 and i9 < minIdx:
#           minIdx = i9
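        # A more compact sketch of the same digit-splitting idea (untested,
        # assuming the intent is to insert '_' before the first digit in url):
        #     m = re.search(r'\d', url)
        #     if m is not None:
        #         url = url[:m.start()] + '_' + url[m.start():]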
        return url

    def get_dtlocal(self):
        dt_utc = datetime.datetime.utcnow()
        if __Region__ == 'Hong Kong':
            # convert UTC to local hk time - at HKT 5.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(8.0/24) - datetime.timedelta(5.5/24)
            # dt_local = dt_utc.astimezone(pytz.timezone('Asia/Hong_Kong')) - datetime.timedelta(5.5/24)
        elif __Region__ == 'Vancouver':
            # convert UTC to local Vancouver time - at PST time 5.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(-8.0/24) - datetime.timedelta(5.5/24)
            #dt_local = dt_utc.astimezone(pytz.timezone('America/Vancouver')) - datetime.timedelta(5.5/24)
        elif __Region__ == 'Toronto':
            # convert UTC to local Toronto time - at EST time 8.30am, all news are available
            dt_local = dt_utc + datetime.timedelta(-5.0/24) - datetime.timedelta(8.5/24)
            #dt_local = dt_utc.astimezone(pytz.timezone('America/Toronto')) - datetime.timedelta(8.5/24)
        return dt_local

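    # Worked example of the offset logic above (illustrative, not from the
    # original source): for Hong Kong at 04:00 HKT (20:00 UTC the previous day),
    # dt_local works out to 22:30 of the previous calendar day, so the recipe
    # still fetches yesterday's issue; from 05:30 HKT onwards it switches to
    # today's issue.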
    def get_fetchdate(self):
        return self.get_dtlocal().strftime("%Y%m%d")

    def get_fetchformatteddate(self):
        return self.get_dtlocal().strftime("%Y-%m-%d")

    def get_fetchday(self):
        return self.get_dtlocal().strftime("%d")

    def get_cover_url(self):
        if __Region__ == 'Hong Kong':
            cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg'
        elif __Region__ == 'Vancouver':
            cover = 'http://www.mingpaovan.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgva1s.jpg'
        elif __Region__ == 'Toronto':
            cover = 'http://www.mingpaotor.com/ftp/News/' + self.get_fetchdate() + '/' + self.get_fetchday() + 'pgtas.jpg'
        br = self.get_browser()
        try:
            br.open(cover)
        except:
            cover = None
        return cover

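    # For reference, the Hong Kong cover URL assembled in get_cover_url() above
    # has this shape (illustrative date 2011-09-09):
    #   http://news.mingpao.com/20110909/20110909_09gacov.jpg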
    def parse_index(self):
        feeds = []
        dateStr = self.get_fetchdate()

        if __Region__ == 'Hong Kong':
            if __UseLife__:
                for title, url, keystr in [(u'\u8981\u805e Headline', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalga', 'nal'),
                                           (u'\u6e2f\u805e Local', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgb', 'nal'),
                                           (u'\u6559\u80b2 Education', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalgf', 'nal'),
                                           (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr', 'nal'),
                                           (u'\u8ad6\u58c7 Forum', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalfa', 'nal'),
                                           (u'\u4e2d\u570b China', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalca', 'nal'),
                                           (u'\u570b\u969b World', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalta', 'nal'),
                                           (u'\u7d93\u6fdf Finance', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea', 'nal'),
                                           (u'\u9ad4\u80b2 Sport', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalsp', 'nal'),
                                           (u'\u5f71\u8996 Film/TV', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalma', 'nal')
                                           #(u'\u5c08\u6b04 Columns', 'http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn', 'ncl')
                                          ]:
                    articles = self.parse_section2(url, keystr)
                    if articles:
                        feeds.append((title, articles))

                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))
            else:
                for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                                   (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                                   (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))

                # special - editorial
                ed_articles = self.parse_ed_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalmr')
                if ed_articles:
                    feeds.append((u'\u793e\u8a55/\u7b46\u9663 Editorial', ed_articles))

                for title, url in [(u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                                   (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                                   (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))

                # special - finance
                #fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
                fin_articles = self.parse_fin_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=nalea')
                if fin_articles:
                    feeds.append((u'\u7d93\u6fdf Finance', fin_articles))

                for title, url in [('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                                   (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))

                # special - entertainment
                ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
                if ent_articles:
                    feeds.append((u'\u5f71\u8996 Film/TV', ent_articles))

                for title, url in [(u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                                   (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
                    articles = self.parse_section(url)
                    if articles:
                        feeds.append((title, articles))

                # special - columns
                col_articles = self.parse_col_section('http://life.mingpao.com/cfm/dailynews2.cfm?Issue=' + dateStr + '&Category=ncolumn')
                if col_articles:
                    feeds.append((u'\u5c08\u6b04 Columns', col_articles))
        elif __Region__ == 'Vancouver':
            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VAindex.htm'),
                               (u'\u52a0\u570b Canada', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VBindex.htm'),
                               (u'\u793e\u5340 Local', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VDindex.htm'),
                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-VGindex.htm'),
                               (u'\u570b\u969b World', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VTindex.htm'),
                               (u'\u4e2d\u570b China', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VCindex.htm'),
                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VEindex.htm'),
                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/VSindex.htm'),
                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
                               (u'\u526f\u520a Supplements', 'http://www.mingpaovan.com/htm/News/' + dateStr + '/WWindex.htm'),]:
                articles = self.parse_section3(url, 'http://www.mingpaovan.com/')
                if articles:
                    feeds.append((title, articles))
        elif __Region__ == 'Toronto':
            for title, url in [(u'\u8981\u805e Headline', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TAindex.htm'),
                               (u'\u52a0\u570b Canada', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TDindex.htm'),
                               (u'\u793e\u5340 Local', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TFindex.htm'),
                               (u'\u4e2d\u570b China', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TCAindex.htm'),
                               (u'\u570b\u969b World', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TTAindex.htm'),
                               (u'\u6e2f\u805e Hong Kong', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-GAindex.htm'),
                               (u'\u7d93\u6fdf Economics', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/THindex.htm'),
                               (u'\u9ad4\u80b2 Sports', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/TSindex.htm'),
                               (u'\u5f71\u8996 Film/TV', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/HK-MAindex.htm'),
                               (u'\u526f\u520a Supplements', 'http://www.mingpaotor.com/htm/News/' + dateStr + '/WWindex.htm'),]:
                articles = self.parse_section3(url, 'http://www.mingpaotor.com/')
                if articles:
                    feeds.append((title, articles))
        return feeds

    # parse from news.mingpao.com
    def parse_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
        current_articles = []
        included_urls = []
        divs.reverse()
        for i in divs:
            a = i.find('a', href=True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' + url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    # parse from life.mingpao.com
    def parse_section2(self, url, keystr):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind(keystr) == -1):
                url = url.replace('dailynews3.cfm', 'dailynews3a.cfm')  # use printed version of the article
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    # parse from www.mingpaovan.com
    def parse_section3(self, url, baseUrl):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['ListContentLargeLink']})
        current_articles = []
        included_urls = []
        divs.reverse()
        for i in divs:
            title = self.tag_to_string(i)
            urlstr = i.get('href', False)
            urlstr = baseUrl + '/' + urlstr.replace('../../../', '')
            if urlstr not in included_urls:
                current_articles.append({'title': title, 'url': urlstr, 'description': '', 'date': ''})
                included_urls.append(urlstr)
        current_articles.reverse()
        return current_articles

    def parse_ed_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('nal') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_fin_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        current_articles = []
        included_urls = []
        for i in a:
            #url = 'http://www.mpfinance.com/cfm/' + i.get('href', False)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            #if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1:
            if url not in included_urls and (not url.rfind('txt') == -1) and (not url.rfind('nal') == -1):
                title = self.tag_to_string(i)
                current_articles.append({'title': title, 'url': url, 'description':''})
                included_urls.append(url)
        return current_articles

    def parse_ent_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def parse_col_section(self, url):
        self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        a.reverse()
        current_articles = []
        included_urls = []
        for i in a:
            title = self.tag_to_string(i)
            url = 'http://life.mingpao.com/cfm/' + i.get('href', False)
            if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('ncl') == -1):
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        for item in soup.findAll(align=True):  # e.g. align="absmiddle"
            del item['align']
        return soup

    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        if __UseChineseTitle__ == True:
            if __Region__ == 'Hong Kong':
                title = u'\u660e\u5831 (\u9999\u6e2f)'
            elif __Region__ == 'Vancouver':
                title = u'\u660e\u5831 (\u6eab\u54e5\u83ef)'
            elif __Region__ == 'Toronto':
                title = u'\u660e\u5831 (\u591a\u502b\u591a)'
        else:
            title = self.short_title()
        # if not generating a periodical, force the date into the title
        if __MakePeriodical__ == False:
            title = title + ' ' + self.get_fetchformatteddate()
        if True:
            mi = MetaInformation(title, [self.publisher])
            mi.publisher = self.publisher
            mi.author_sort = self.publisher
            if __MakePeriodical__ == True:
                mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
            else:
                mi.publication_type = self.publication_type+':'+self.short_title()
            #mi.timestamp = nowf()
            mi.timestamp = self.get_dtlocal()
            mi.comments = self.description
            if not isinstance(mi.comments, unicode):
                mi.comments = mi.comments.decode('utf-8', 'replace')
            #mi.pubdate = nowf()
            mi.pubdate = self.get_dtlocal()
            opf_path = os.path.join(dir, 'index.opf')
            ncx_path = os.path.join(dir, 'index.ncx')
            opf = OPFCreator(dir, mi)
            # Add mastheadImage entry to <guide> section
            mp = getattr(self, 'masthead_path', None)
            if mp is not None and os.access(mp, os.R_OK):
                from calibre.ebooks.metadata.opf2 import Guide
                ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
                ref.type = 'masthead'
                ref.title = 'Masthead Image'
                opf.guide.append(ref)

            manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
            manifest.append(os.path.join(dir, 'index.html'))
            manifest.append(os.path.join(dir, 'index.ncx'))

            # Get cover
            cpath = getattr(self, 'cover_path', None)
            if cpath is None:
                pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
                if self.default_cover(pf):
                    cpath = pf.name
            if cpath is not None and os.access(cpath, os.R_OK):
                opf.cover = cpath
                manifest.append(cpath)

            # Get masthead
            mpath = getattr(self, 'masthead_path', None)
            if mpath is not None and os.access(mpath, os.R_OK):
                manifest.append(mpath)

            opf.create_manifest_from_files_in(manifest)
            for mani in opf.manifest:
                if mani.path.endswith('.ncx'):
                    mani.id = 'ncx'
                if mani.path.endswith('mastheadImage.jpg'):
                    mani.id = 'masthead-image'
            entries = ['index.html']
            toc = TOC(base_path=dir)
            self.play_order_counter = 0
            self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth, description=desc)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, self.publisher, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                           f.title, play_order=po, description=desc, author=auth))
        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)