MobileRead Forums - View Single Post - Updated Recipe: Ming Pao

tylau0 · 02-20-2011, 11:24 AM

Could someone help get the following updated recipe into Calibre? Thanks.

Also, I have a question. If I want to place the following notice for potential users:

If your device supports CJK in section and article views well (e.g. Kindle 3.1+), you may enable them by customizing this recipe: change line 41 from "IsCJKWellSupported = False" to "IsCJKWellSupported = True"

Where should I put it in the recipe so that users are alerted when they pick the recipe? The "description" field does not seem like a good place, because it is copied into every downloaded ebook/periodical.

Thanks.

-Eddie

Code:

__license__   = 'GPL v3'
__copyright__ = '2010-2011, Eddie Lau'
'''
Change Log:
2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles
            clean up the indentation
2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list
            (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) 
2010/11/22: add English section, remove eco-news section which is not updated daily, correct
            ordering of articles
2010/11/12: add news image and eco-news section
2010/11/08: add parsing of finance section
2010/11/06: temporary work-around for Kindle device having no capability to display unicode
            in section/article list.
2010/10/31: skip repeated articles in section pages
'''

import os, datetime, time, re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import defaultdict
from functools import partial
from contextlib import nested, closing


from calibre import browser, __appname__, iswindows, strftime, preferred_encoding
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre import entity_to_unicode
from calibre.web import Recipe
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata import MetaInformation
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.simple import RecursiveFetcher
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image
                    
class MPHKRecipe(BasicNewsRecipe):
	"""Recipe for the Ming Pao (Hong Kong) daily newspaper.

	The attributes below are declarative settings read by the
	``BasicNewsRecipe`` base class (defined in calibre, not in this file);
	their precise effect is determined there.  The methods of this class
	build the section/article index from the news.mingpao.com,
	www.mpfinance.com and ol.mingpao.com pages.
	"""
	# Switch for devices that cannot render CJK text in section/article
	# views: create_opf() picks an English title and a different
	# publication_type prefix while this is False.
	IsCJKWellSupported = False  # to avoid generating periodical in which CJK characters can't be displayed in section/article view
	title          = 'Ming Pao - Hong Kong'
	oldest_article = 1
	max_articles_per_feed = 100
	__author__            = 'Eddie Lau'
	# Also copied into the ebook metadata comments by create_opf().
	description           = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)'
	# Used as the metadata author, author_sort and publisher in create_opf().
	publisher             = 'MingPao'
	category              = 'Chinese, News, Hong Kong'
	remove_javascript = True
	use_embedded_content   = False
	no_stylesheets = True
	language = 'zh'
	# NOTE(review): presumably the character set of the fetched pages - confirm.
	encoding = 'Big5-HKSCS'
	recursions = 0
	conversion_options = {'linearize_tables':True}
	# Left empty; the change log above says the date display is suppressed
	# in the section list to avoid wrong dates outside the HKT time zone.
	timefmt = ''
	# Centers images and enlarges bold text nested directly inside <font>.
	extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}'
	masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
	# Element selectors for the parts of an article page that are kept.
	keep_only_tags = [dict(name='h1'),
                      dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title
                      dict(attrs={'id':['newscontent']}), # entertainment page content
                      dict(attrs={'id':['newscontent01','newscontent02']}),
                      dict(attrs={'class':['photo']})
                      ]
	# Element selectors for the parts of an article page that are dropped.
	remove_tags = [dict(name='style'),
    			   dict(attrs={'id':['newscontent135']})]  # for the finance page
	remove_attributes = ['width']
	# (pattern, replacement) pairs: <h5>/</h5> become <h1>/</h1>, and any
	# "<p><a href=...</a></p>" paragraph is replaced by an empty string.
	preprocess_regexps = [
                          (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE),
                          lambda match: '<h1>'),
                          (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE),
                          lambda match: '</h1>'),
                          (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page
                          lambda match: '')
                         ]
                         
	def image_url_processor(cls, baseurl, url):
		"""Return the image URL to download for ``url`` (currently unchanged).

		:param baseurl: URL of the page referencing the image (unused).
		:param url: image URL found in the page.
		:return: ``url`` exactly as received.

		TODO: the original idea was to break the URL at its first digit and
		insert an extra ``'_'`` in front of it.  That attempt did not work
		from this hook and may need to move to ``preprocess_html()``; the
		disabled experiment was removed from here to keep the method readable.
		"""
		return url
    
	def get_dtlocal(self):
		"""Return the naive datetime used to decide which edition to fetch.

		The value is the current UTC time moved forward by two hours, so its
		calendar date rolls over at 22:00 UTC, i.e. 06:00 Hong Kong time
		(UTC+8) - around when all of the day's news is available.
		"""
		edition_shift = datetime.timedelta(hours=2)
		return datetime.datetime.utcnow() + edition_shift
    	
	def get_fetchdate(self):
		return self.get_dtlocal().strftime("%Y%m%d")
    	
	def get_fetchformatteddate(self):
		return self.get_dtlocal().strftime("%Y-%m-%d")
    	
	def get_fetchday(self):
		dt_utc = datetime.datetime.utcnow()
		# convert UTC to local hk time - at around HKT 6.00am, all news are available
		dt_local = dt_utc - datetime.timedelta(-2.0/24)
		return self.get_dtlocal().strftime("%d")
    
	def get_cover_url(self):
		"""Return the URL of the day's front-page image, or ``None``.

		The URL has the form
		``http://news.mingpao.com/<YYYYMMDD>/<YYYYMMDD>_<DD>gacov.jpg``.
		It is probed with a browser first; if it cannot be opened, ``None``
		is returned so that no cover URL is reported.
		"""
		# Take one clock reading so the date and the day-of-month parts of
		# the URL cannot disagree when the call straddles the date rollover.
		edition_dt = self.get_dtlocal()
		date_part = edition_dt.strftime("%Y%m%d")
		day_part = edition_dt.strftime("%d")
		cover = 'http://news.mingpao.com/' + date_part + '/' + date_part + '_' + day_part + 'gacov.jpg'
		# Calling through the instance works whether the base class declares
		# get_browser() as a classmethod or as an ordinary method.
		br = self.get_browser()
		try:
			br.open(cover)
		except Exception:
			# Best effort: any fetch failure just means "no cover", but
			# KeyboardInterrupt/SystemExit are no longer swallowed.
			cover = None
		return cover
        	
	def parse_index(self):
		"""Build the list of ``(section title, articles)`` pairs for today.

		The regular news sections are read with ``parse_section()``; the
		finance and entertainment sections live on other hosts and have
		their own parsers.  Sections that yield no articles are left out.
		"""
		day_stamp = self.get_fetchdate()
		news_root = 'http://news.mingpao.com/' + day_stamp
		daily_sections = [
			(u'\u8981\u805e Headline', news_root + '/gaindex.htm'),
			(u'\u6e2f\u805e Local', news_root + '/gbindex.htm'),
			(u'\u793e\u8a55/\u7b46\u9663 Editorial', news_root + '/mrindex.htm'),
			(u'\u8ad6\u58c7 Forum', news_root + '/faindex.htm'),
			(u'\u4e2d\u570b China', news_root + '/caindex.htm'),
			(u'\u570b\u969b World', news_root + '/taindex.htm'),
			('Tech News', news_root + '/naindex.htm'),
			(u'\u6559\u80b2 Education', news_root + '/gfindex.htm'),
			(u'\u9ad4\u80b2 Sport', news_root + '/spindex.htm'),
			(u'\u526f\u520a Supplement', news_root + '/jaindex.htm'),
			(u'\u82f1\u6587 English', news_root + '/emindex.htm'),
		]
		collected = []

		def keep(section_title, section_articles):
			# Only sections that actually produced articles are listed.
			if section_articles:
				collected.append((section_title, section_articles))

		for section_title, index_page in daily_sections:
			keep(section_title, self.parse_section(index_page))
		# special - finance
		keep(u'\u7d93\u6fdf Finance',
			self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + day_stamp + '/News/ea,eb,ecindex.htm'))
		# special - entertainment
		keep(u'\u5f71\u8996 Film/TV',
			self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm'))
		return collected
    
	def parse_section(self, url):
		"""Return the article dicts listed on one news section index page.

		:param url: address of the section index page.
		:return: list of dicts with the keys ``title``, ``url``,
			``description`` and ``date`` (the last two are empty strings),
			in page order.

		Entries are the elements whose class is ``bullet`` or
		``bullet_grey``.  When a link occurs more than once only its last
		occurrence on the page is kept, and links containing ``Redirect``
		are skipped.
		"""
		dateStr = self.get_fetchdate()
		soup = self.index_to_soup(url)
		divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']})
		current_articles = []
		included_urls = set()
		# Walk the page backwards so that, for repeated links, the last
		# occurrence wins; the result is flipped back to page order below.
		for div in reversed(divs):
			a = div.find('a', href=True)
			if a is None:
				# A bullet without a link carries no article; skipping it
				# avoids an AttributeError on None.
				continue
			article_url = 'http://news.mingpao.com/' + dateStr + '/' + a['href']
			if article_url in included_urls or 'Redirect' in article_url:
				continue
			included_urls.add(article_url)
			current_articles.append({'title': self.tag_to_string(a), 'url': article_url, 'description':'', 'date':''})
		current_articles.reverse()
		return current_articles
    
	def parse_fin_section(self, url):
		"""Return the article dicts linked from the finance index page.

		Every link on the page is resolved against
		``http://www.mpfinance.com/cfm/``.  A link is kept the first time it
		is seen, provided it contains today's date string and does not
		contain ``index``.  Each dict has the keys ``title``, ``url`` and
		``description`` (empty).
		"""
		day_stamp = self.get_fetchdate()
		page = self.index_to_soup(url)
		articles = []
		seen = set()
		for anchor in page.findAll('a', href= True):
			link = 'http://www.mpfinance.com/cfm/' + anchor.get('href', False)
			if link in seen:
				continue
			if day_stamp not in link or 'index' in link:
				continue
			seen.add(link)
			articles.append({'title': self.tag_to_string(anchor), 'url': link, 'description':''})
		return articles
    			
	def parse_ent_section(self, url):
		"""Return the article dicts linked from the entertainment index page.

		:param url: address of the entertainment index page.
		:return: list of dicts with the keys ``title``, ``url`` and
			``description`` (empty), in page order.

		Every link on the page is resolved against
		``http://ol.mingpao.com/cfm/``; only links containing both ``.txt``
		and ``star`` are kept.  When a link occurs more than once only its
		last occurrence on the page is kept.
		"""
		soup = self.index_to_soup(url)
		current_articles = []
		included_urls = set()
		# Walk the page backwards so that, for repeated links, the last
		# occurrence wins; the result is flipped back to page order below.
		for anchor in reversed(soup.findAll('a', href=True)):
			article_url = 'http://ol.mingpao.com/cfm/' + anchor['href']
			if article_url in included_urls:
				continue
			if '.txt' not in article_url or 'star' not in article_url:
				continue
			included_urls.add(article_url)
			current_articles.append({'title': self.tag_to_string(anchor), 'url': article_url, 'description': ''})
		current_articles.reverse()
		return current_articles
    
	def preprocess_html(self, soup):
		"""Strip presentational attributes from a downloaded article page.

		:param soup: parsed page (a BeautifulSoup tree).
		:return: the same ``soup`` object, modified in place.

		Removes every ``style`` and ``width`` attribute, and every ``align``
		attribute whose value is ``absmiddle``.
		"""
		for item in soup.findAll(style=True):
			del item['style']
		# The previous version queried style=True a second time, which can
		# never match after the loop above, so width was never removed here.
		for item in soup.findAll(width=True):
			del item['width']
		# The previous version looked for a non-existent "stype" attribute and
		# deleted a non-existent "absmiddle" attribute (a no-op).
		# NOTE(review): assumed intent is to drop align="absmiddle" - confirm.
		for item in soup.findAll(align=re.compile(r'^absmiddle$', re.IGNORECASE)):
			del item['align']
		return soup
    		
	def create_opf(self, feeds, dir=None):
		"""Write ``index.opf`` and ``index.ncx`` describing the downloaded feeds.

		:param feeds: sequence of feed objects.  Each feed is iterated for its
			articles; the ``downloaded``, ``author``, ``text_summary``,
			``title``, ``sub_pages`` and ``orig_url`` attributes of every
			article are read.
		:param dir: directory holding the downloaded content; defaults to
			``self.output_dir``.
		:raises Exception: if ``feeds`` is empty.

		The title is Chinese when ``IsCJKWellSupported`` is set and English
		otherwise, always followed by the edition date.  Timestamp and
		publication date come from ``get_dtlocal()`` rather than the machine's
		local clock.  As a side effect a navigation bar is appended to the
		last HTML page of every downloaded article.
		"""
		if dir is None:
			dir = self.output_dir
		if self.IsCJKWellSupported:
			# use Chinese title
			title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate()
		else:
			# use English title
			title = self.short_title() + ' ' + self.get_fetchformatteddate()

		mi = MetaInformation(title, [self.publisher])
		mi.publisher = self.publisher
		mi.author_sort = self.publisher
		if self.IsCJKWellSupported:
			mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title()
		else:
			mi.publication_type = self.publication_type+':'+self.short_title()
		# One clock reading keeps timestamp and pubdate identical.
		edition_dt = self.get_dtlocal()
		mi.timestamp = edition_dt
		mi.comments = self.description
		if not isinstance(mi.comments, unicode):
			mi.comments = mi.comments.decode('utf-8', 'replace')
		mi.pubdate = edition_dt
		opf_path = os.path.join(dir, 'index.opf')
		ncx_path = os.path.join(dir, 'index.ncx')
		opf = OPFCreator(dir, mi)
		# Add mastheadImage entry to <guide> section
		mp = getattr(self, 'masthead_path', None)
		if mp is not None and os.access(mp, os.R_OK):
			from calibre.ebooks.metadata.opf2 import Guide
			ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu())
			ref.type = 'masthead'
			ref.title = 'Masthead Image'
			opf.guide.append(ref)

		manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))]
		manifest.append(os.path.join(dir, 'index.html'))
		manifest.append(os.path.join(dir, 'index.ncx'))

		# Get cover
		cpath = getattr(self, 'cover_path', None)
		if cpath is None:
			# The with-block closes the generated cover file before it is
			# checked and added to the manifest (it used to stay open).
			with open(os.path.join(dir, 'cover.jpg'), 'wb') as pf:
				if self.default_cover(pf):
					cpath = pf.name
		if cpath is not None and os.access(cpath, os.R_OK):
			opf.cover = cpath
			manifest.append(cpath)

		# Get masthead
		mpath = getattr(self, 'masthead_path', None)
		if mpath is not None and os.access(mpath, os.R_OK):
			manifest.append(mpath)

		opf.create_manifest_from_files_in(manifest)
		for mani in opf.manifest:
			if mani.path.endswith('.ncx'):
				mani.id = 'ncx'
			if mani.path.endswith('mastheadImage.jpg'):
				mani.id = 'masthead-image'
		entries = ['index.html']
		toc = TOC(base_path=dir)
		self.play_order_counter = 0
		self.play_order_map = {}

		def feed_index(num, parent):
			# Add every downloaded article of feed ``num`` below ``parent`` in
			# the TOC, record its pages in ``entries`` (spine order) and
			# append a navigation bar to the article's last page.
			f = feeds[num]
			for j, a in enumerate(f):
				if getattr(a, 'downloaded', False):
					adir = 'feed_%d/article_%d/'%(num, j)
					auth = a.author
					if not auth:
						auth = None
					desc = a.text_summary
					if not desc:
						desc = None
					else:
						desc = self.description_limiter(desc)
					entries.append('%sindex.html'%adir)
					po = self.play_order_map.get(entries[-1], None)
					if po is None:
						self.play_order_counter += 1
						po = self.play_order_counter
					parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'),
						play_order=po, author=auth, description=desc)
					last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
					for sp in a.sub_pages:
						prefix = os.path.commonprefix([opf_path, sp])
						relp = sp[len(prefix):]
						entries.append(relp.replace(os.sep, '/'))
						last = sp

					if os.path.exists(last):
						with open(last, 'rb') as fi:
							src = fi.read().decode('utf-8')
						soup = BeautifulSoup(src)
						body = soup.find('body')
						if body is not None:
							# Two '..' segments per 'link<N>' component found
							# in the path of the last page.
							prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
							templ = self.navbar.generate(True, num, j, len(f),
								not self.has_single_feed,
								a.orig_url, self.publisher, prefix=prefix,
								center=self.center_navbar)
							elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
							body.insert(len(body.contents), elem)
							with open(last, 'wb') as fi:
								fi.write(unicode(soup).encode('utf-8'))

		if len(feeds) == 0:
			raise Exception('All feeds are empty, aborting.')

		if len(feeds) > 1:
			for i, f in enumerate(feeds):
				entries.append('feed_%d/index.html'%i)
				po = self.play_order_map.get(entries[-1], None)
				if po is None:
					self.play_order_counter += 1
					po = self.play_order_counter
				auth = getattr(f, 'author', None)
				if not auth:
					auth = None
				desc = getattr(f, 'description', None)
				if not desc:
					desc = None
				feed_index(i, toc.add_item('feed_%d/index.html'%i, None,
					f.title, play_order=po, description=desc, author=auth))

		else:
			entries.append('feed_%d/index.html'%0)
			feed_index(0, toc)

		for i, p in enumerate(entries):
			entries[i] = os.path.join(dir, p.replace('/', os.sep))
		opf.create_spine(entries)
		opf.set_toc(toc)

		# Plain nested with-blocks: if opening the NCX file fails, the OPF
		# file is still closed, which contextlib.nested() did not guarantee.
		with open(opf_path, 'wb') as opf_file:
			with open(ncx_path, 'wb') as ncx_file:
				opf.render(opf_file, ncx_file)