11-22-2010, 07:29 PM   #1
tylau0
[Updated recipe] Ming Pao (明報) - Hong Kong

Code:
__license__   = 'GPL v3'
__copyright__ = '2010, Eddie Lau'
'''
modified from the Singtao Toronto calibre recipe by rty
Change Log:
2010/11/22: added the English section, removed the eco-news section (it is
            not updated daily), corrected the ordering of articles
2010/11/12: added news images and the eco-news section
2010/11/08: added parsing of the finance section
2010/11/06: temporary work-around for the Kindle's inability to display
            unicode in the section/article list
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, re
from contextlib import nested

from calibre import __appname__, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.utils.date import now as nowf
                    
class MPHKRecipe(BasicNewsRecipe):
    title          = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__            = 'Eddie Lau'
    description           = 'Hong Kong Chinese Newspaper'
    publisher             = 'news.mingpao.com'
    category              = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content   = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables':True}
    extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;}'
    #extra_css = 'img {float:right; margin:4px;}'
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    keep_only_tags = [dict(name='h1'),
                      #dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page
                      dict(attrs={'class':['photo']}),
                      dict(attrs={'id':['newscontent']}),
                      dict(attrs={'id':['newscontent01','newscontent02']})]
    remove_tags = [dict(name='style'),
                   dict(attrs={'id':['newscontent135']})]  # for the finance page
    remove_attributes = ['width']
    preprocess_regexps = [
        (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
         lambda match: '<h1>'),
        (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
         lambda match: '</h1>'),
    ]

    def image_url_processor(cls, baseurl, url):
        # Intended trick: break the url at the first occurrence of a digit
        # and substitute a '_' for it.  Not working yet; this logic may need
        # to move into the preprocess_html() method instead.
        #m = re.search(r'\d', url)
        #if m is not None:
        #    return url[:m.start()] + '_' + url[m.start()+1:]
        return url

    def get_fetchdate(self):
        dt_utc = datetime.datetime.utcnow()
        # Shift UTC by +2 hours so that the fetch date only rolls over at
        # around 6:00am HKT (UTC+8), by which time all news is available.
        dt_local = dt_utc + datetime.timedelta(hours=2)
        return dt_local.strftime("%Y%m%d")

    def parse_index(self):
        feeds = []
        dateStr = self.get_fetchdate()
        for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'),
                           (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'),
                           (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'),
                           (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'),
                           (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'),
                           (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'),
                           (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'),
                           ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'),
                           (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'),
                           (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'),
                           (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        # special - finance
        fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm')
        if fin_articles:
            feeds.append((u'\u7d93\u6fdf Finance', fin_articles))
        # special - eco-friendly
        #eco_articles = self.parse_eco_section('http://tssl.mingpao.com/htm/marketing/eco/cfm/Eco1.cfm')
        #if eco_articles:
        #    feeds.append((u'\u74b0\u4fdd Eco News', eco_articles))
        # special - entertainment
        #ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm')
        #if ent_articles:
        #    feeds.append(('Entertainment', ent_articles))
        return feeds
    
    def parse_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet', 'bullet_grey']})
        current_articles = []
        included_urls = []
        # walk the list in reverse so that, when an article is listed twice,
        # the later entry is the one kept; reading order is restored below
        divs.reverse()
        for i in divs:
            a = i.find('a', href=True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://news.mingpao.com/' + dateStr + '/' + url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
                included_urls.append(url)
        current_articles.reverse()
        return current_articles
    
    def parse_fin_section(self, url):
        dateStr = self.get_fetchdate()
        soup = self.index_to_soup(url)
        a = soup.findAll('a', href=True)
        current_articles = []
        included_urls = []
        for i in a:
            url = i.get('href', False)
            if url.rfind(dateStr) > -1 and url.rfind('index') == -1:
                title = self.tag_to_string(i)
                url = 'http://www.mpfinance.com/cfm/' + url
                if url not in included_urls:  # skip repeated articles
                    current_articles.append({'title': title, 'url': url, 'description': ''})
                    included_urls.append(url)
        return current_articles
    			
    def parse_eco_section(self, url):
        soup = self.index_to_soup(url)
        divs = soup.findAll(attrs={'class': ['bullet']})
        current_articles = []
        included_urls = []
        for i in divs:
            a = i.find('a', href=True)
            title = self.tag_to_string(a)
            url = a.get('href', False)
            url = 'http://tssl.mingpao.com/htm/marketing/eco/cfm/' + url
            if url not in included_urls and url.rfind('Redirect') == -1:
                current_articles.append({'title': title, 'url': url, 'description': ''})
                included_urls.append(url)
        return current_articles
    
    #def parse_ent_section(self, url):
    #    dateStr = self.get_fetchdate()
    #    soup = self.index_to_soup(url)
    #    a = soup.findAll('a', href=True)
    #    current_articles = []
    #    included_urls = []
    #    for i in a:
    #        title = self.tag_to_string(i)
    #        url = 'http://ol.mingpao.com/cfm/' + i.get('href', False)
    #        if url not in included_urls and url.rfind('.txt') > -1 and url.rfind(dateStr) > -1 and not title == '':
    #            current_articles.append({'title': title, 'url': url, 'description': ''})
    #    return current_articles
    
    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        # likely intended clean-up: some inline images carry align="absmiddle"
        for item in soup.findAll(align='absmiddle'):
            del item['align']
        return soup
    		
    def create_opf(self, feeds, dir=None):
        #super(MPHKRecipe, self).create_opf(feeds, dir)
        if dir is None:
            dir = self.output_dir
        title = self.short_title()
        if self.output_profile.periodical_date_in_title:
            title += strftime(self.timefmt)
        mi = MetaInformation(title, [__appname__])
        mi.publisher = __appname__
        mi.author_sort = __appname__
        mi.publication_type = self.publication_type+':'+self.short_title()
        mi.timestamp = nowf()
        mi.comments = self.description
        if not isinstance(mi.comments, unicode):
            mi.comments = mi.comments.decode('utf-8', 'replace')
        mi.pubdate = nowf()
        opf_path = os.path.join(dir, 'index.opf')
        ncx_path = os.path.join(dir, 'index.ncx')
        opf = OPFCreator(dir, mi)
        # Add mastheadImage entry to <guide> section
        mp = getattr(self, 'masthead_path', None)
        if mp is not None and os.access(mp, os.R_OK):
            from calibre.ebooks.metadata.opf2 import Guide
            ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
            ref.type = 'masthead'
            ref.title = 'Masthead Image'
            opf.guide.append(ref)

        manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
        manifest.append(os.path.join(dir, 'index.html'))
        manifest.append(os.path.join(dir, 'index.ncx'))
        
        # Get cover
        cpath = getattr(self, 'cover_path', None)
        if cpath is None:
            pf = open(os.path.join(dir, 'cover.jpg'), 'wb')
            if self.default_cover(pf):
                cpath =  pf.name
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath
            manifest.append(cpath)

        # Get masthead
        mpath = getattr(self, 'masthead_path', None)
        if mpath is not None and os.access(mpath, os.R_OK):
            manifest.append(mpath)

        opf.create_manifest_from_files_in(manifest)
        for mani in opf.manifest:
            if mani.path.endswith('.ncx'):
                mani.id = 'ncx'
            if mani.path.endswith('mastheadImage.jpg'):
                mani.id = 'masthead-image'
        entries = ['index.html']
        toc = TOC(base_path=dir)
        self.play_order_counter = 0
        self.play_order_map = {}

        def feed_index(num, parent):
            f = feeds[num]
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(num, j)
                    auth = a.author
                    if not auth:
                        auth = None
                    desc = a.text_summary
                    if not desc:
                        desc = None
                    else:
                        desc = self.description_limiter(desc)
                    entries.append('%sindex.html'%adir)
                    po = self.play_order_map.get(entries[-1], None)
                    if po is None:
                        self.play_order_counter += 1
                        po = self.play_order_counter
                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
                                    play_order=po, author=auth, description=desc)
                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))
                        last = sp

                    if os.path.exists(last):
                        with open(last, 'rb') as fi:
                            src = fi.read().decode('utf-8')
                        soup = BeautifulSoup(src)
                        body = soup.find('body')
                        if body is not None:
                            prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
                            templ = self.navbar.generate(True, num, j, len(f),
                                            not self.has_single_feed,
                                            a.orig_url, __appname__, prefix=prefix,
                                            center=self.center_navbar)
                            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
                            body.insert(len(body.contents), elem)
                            with open(last, 'wb') as fi:
                                fi.write(unicode(soup).encode('utf-8'))
        if len(feeds) == 0:
            raise Exception('All feeds are empty, aborting.')

        if len(feeds) > 1:
            for i, f in enumerate(feeds):
                entries.append('feed_%d/index.html'%i)
                po = self.play_order_map.get(entries[-1], None)
                if po is None:
                    self.play_order_counter += 1
                    po = self.play_order_counter
                auth = getattr(f, 'author', None)
                if not auth:
                    auth = None
                desc = getattr(f, 'description', None)
                if not desc:
                    desc = None
                feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
                    f.title, play_order=po, description=desc, author=auth))

        else:
            entries.append('feed_%d/index.html'%0)
            feed_index(0, toc)

        for i, p in enumerate(entries):
            entries[i] = os.path.join(dir, p.replace('/', os.sep))
        opf.create_spine(entries)
        opf.set_toc(toc)

        with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file):
            opf.render(opf_file, ncx_file)
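
If you want to tinker with the recipe, a quick way to test it from the command line (just a sketch: it assumes calibre's command-line tools are installed and that the code above is saved as ming_pao.recipe, where the file name is only an example) is:

Code:
ebook-convert ming_pao.recipe ming_pao.epub --test
The --test switch limits the download to two articles from each of two feeds, so a trial run finishes quickly.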

11-24-2010, 04:48 AM   #2
icaria
Hi,

First of all, thank you for all your efforts in writing this recipe.

I was able to fetch the news but can't open it in the Calibre Viewer.

Getting the following errors:

Code:
ERROR: Could not open ebook: unichr() arg not in range(0x10000) (narrow Python build)

Traceback (most recent call last):
  File "site-packages\calibre\gui2\viewer\main.py", line 61, in run
  File "threading.py", line 477, in run
  File "site-packages\calibre\ebooks\oeb\iterator.py", line 192, in __enter__
  File "site-packages\calibre\customize\conversion.py", line 216, in __call__
  File "site-packages\calibre\ebooks\lit\input.py", line 25, in convert
  File "site-packages\calibre\ebooks\conversion\plumber.py", line 973, in create_oebbook
  File "site-packages\calibre\ebooks\oeb\reader.py", line 72, in __call__
  File "site-packages\calibre\ebooks\oeb\reader.py", line 598, in _all_from_opf
  File "site-packages\calibre\ebooks\oeb\reader.py", line 248, in _manifest_from_opf
  File "site-packages\calibre\ebooks\oeb\reader.py", line 181, in _manifest_add_missing
  File "site-packages\calibre\ebooks\oeb\base.py", line 1058, in fget
  File "site-packages\calibre\ebooks\lit\reader.py", line 867, in read
  File "site-packages\calibre\ebooks\lit\reader.py", line 133, in __init__
  File "site-packages\calibre\ebooks\lit\reader.py", line 238, in binary_to_text
  File "site-packages\calibre\ebooks\lit\reader.py", line 238, in binary_to_text
  File "site-packages\calibre\ebooks\lit\reader.py", line 238, in binary_to_text
  File "site-packages\calibre\ebooks\lit\reader.py", line 176, in binary_to_text
  File "site-packages\calibre\ebooks\lit\reader.py", line 103, in read_utf8_char
ValueError: unichr() arg not in range(0x10000) (narrow Python build)

ERROR: Unhandled exception: AttributeError: 'EbookViewer' object has no attribute 'current_index'

Code:
Traceback (most recent call last):
  File "site-packages\calibre\gui2\viewer\documentview.py", line 827, in wheelEvent
  File "site-packages\calibre\gui2\viewer\main.py", line 667, in next_document
AttributeError: 'EbookViewer' object has no attribute 'current_index'

11-24-2010, 12:36 PM   #3
kovidgoyal (creator of calibre)
Change your output format to EPUB from LIT. LIT doesn't have support for East Asian character sets.
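
For reference, the command-line equivalent (a sketch, assuming the recipe from the first post is saved as ming_pao.recipe) fetches straight to EPUB and sidesteps LIT entirely:

Code:
ebook-convert ming_pao.recipe ming_pao.epub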

11-25-2010, 03:38 AM   #4
icaria
Thank You!!!!

It took me a while to find the default output format setting, but once I changed it from LIT to EPUB, it works perfectly!