![]() |
#1 |
Connoisseur
![]() Posts: 82
Karma: 10
Join Date: Oct 2010
Device: Kindle
|
Updated Recipe: Ming Pao - Hong Kong
Would anyone help putting the following updated recipe to Calibre? Thanks.
Also, I have a question. If I want to place the following notice for potential users: If your device supports CJK in section and article views well (e.g. Kindle 3.1+), you may enable them by customizing this recipe: change line 41 from \"IsCJKWellSupported = False\" to \"IsCJKWellSupported = True\" where should I put it in the recipe so the users are alerted when they pick the recipe? The "description" seems not a good place because it will be taken by all the downloaded ebooks/periodicals. Thanks. -Eddie Code:
__license__ = 'GPL v3' __copyright__ = '2010-2011, Eddie Lau' ''' Change Log: 2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles clean up the indentation 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) 2010/11/22: add English section, remove eco-news section which is not updated daily, correct ordering of articles 2010/11/12: add news image and eco-news section 2010/11/08: add parsing of finance section 2010/11/06: temporary work-around for Kindle device having no capability to display unicode in section/article list. 2010/10/31: skip repeated articles in section pages ''' import os, datetime, time, re from calibre.web.feeds.recipes import BasicNewsRecipe from collections import defaultdict from functools import partial from contextlib import nested, closing from calibre import browser, __appname__, iswindows, strftime, preferred_encoding from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag from calibre.ebooks.metadata.opf2 import OPFCreator from calibre import entity_to_unicode from calibre.web import Recipe from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed from calibre.web.fetch.simple import option_parser as web2disk_option_parser from calibre.web.fetch.simple import RecursiveFetcher from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import now as nowf from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image class MPHKRecipe(BasicNewsRecipe): IsCJKWellSupported = False # to avoid generating periodical in which CJK characters can't be displayed in section/article view title = 'Ming Pao - Hong Kong' oldest_article = 1 max_articles_per_feed = 100 __author__ = 'Eddie Lau' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com)' publisher = 'MingPao' category = 'Chinese, News, Hong Kong' remove_javascript = True use_embedded_content = False no_stylesheets = True language = 'zh' encoding = 'Big5-HKSCS' recursions = 0 conversion_options = {'linearize_tables':True} timefmt = '' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(attrs={'id':['newscontent']}), # entertainment page content dict(attrs={'id':['newscontent01','newscontent02']}), dict(attrs={'class':['photo']}) ] remove_tags = [dict(name='style'), dict(attrs={'id':['newscontent135']})] # for the finance page remove_attributes = ['width'] preprocess_regexps = [ (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE), lambda match: '<h1>'), (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'), (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page lambda match: '') ] def image_url_processor(cls, baseurl, url): # trick: break the url at the first occurance of digit, add an additional # '_' at the front # not working, may need to move this to preprocess_html() method # minIdx = 10000 # i0 = url.find('0') # if i0 >= 0 and i0 < minIdx: # minIdx = i0 # i1 = url.find('1') # if i1 >= 0 and i1 < minIdx: # minIdx = i1 # i2 = url.find('2') # if i2 >= 0 and i2 < minIdx: # minIdx = i2 # i3 = url.find('3') # if i3 >= 0 and i0 < minIdx: # minIdx = i3 # i4 = url.find('4') # if i4 >= 0 and i4 < minIdx: # minIdx = i4 # i5 = url.find('5') # if i5 >= 0 and i5 < minIdx: # minIdx = i5 # i6 = url.find('6') # if i6 >= 0 and i6 < minIdx: # minIdx = i6 # i7 = url.find('7') # if i7 >= 0 and i7 < minIdx: # minIdx = i7 # i8 = url.find('8') # if i8 >= 0 and i8 < minIdx: # minIdx = i8 # i9 = url.find('9') # if i9 >= 0 and i9 < minIdx: # minIdx = i9 return url def get_dtlocal(self): dt_utc = datetime.datetime.utcnow() # convert UTC to local hk time - at around HKT 6.00am, all news are available dt_local = dt_utc - datetime.timedelta(-2.0/24) return dt_local def get_fetchdate(self): return self.get_dtlocal().strftime("%Y%m%d") def get_fetchformatteddate(self): return self.get_dtlocal().strftime("%Y-%m-%d") def get_fetchday(self): dt_utc = datetime.datetime.utcnow() # convert UTC to local hk time - at around HKT 6.00am, all news are available dt_local = dt_utc - datetime.timedelta(-2.0/24) return self.get_dtlocal().strftime("%d") def get_cover_url(self): cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg' br = BasicNewsRecipe.get_browser() try: br.open(cover) except: cover = None return cover def parse_index(self): feeds = [] dateStr = self.get_fetchdate() for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) if articles: feeds.append((title, articles)) # special - finance fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') if fin_articles: feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) # special - entertainment ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') if ent_articles: feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) return feeds def parse_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) current_articles = [] included_urls = [] divs.reverse() for i in divs: a = i.find('a', href = True) title = self.tag_to_string(a) url = a.get('href', False) url = 'http://news.mingpao.com/' + dateStr + '/' +url if url not in included_urls and url.rfind('Redirect') == -1: current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) included_urls.append(url) current_articles.reverse() return current_articles def parse_fin_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href= True) current_articles = [] included_urls = [] for i in a: url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1: title = self.tag_to_string(i) current_articles.append({'title': title, 'url': url, 'description':''}) included_urls.append(url) return current_articles def parse_ent_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) a.reverse() current_articles = [] included_urls = [] for i in a: title = self.tag_to_string(i) url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): current_articles.append({'title': title, 'url': url, 'description': ''}) included_urls.append(url) current_articles.reverse() return current_articles def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] for item in soup.findAll(style=True): del item['width'] for item in soup.findAll(stype=True): del item['absmiddle'] return soup def create_opf(self, feeds, dir=None): if dir is None: dir = self.output_dir if self.IsCJKWellSupported == True: # use Chinese title title = u'\u660e\u5831 (\u9999\u6e2f) ' + self.get_fetchformatteddate() else: # use English title title = self.short_title() + ' ' + self.get_fetchformatteddate() if True: # force date in title # title += strftime(self.timefmt) mi = MetaInformation(title, [self.publisher]) mi.publisher = self.publisher mi.author_sort = self.publisher if self.IsCJKWellSupported == True: mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() else: mi.publication_type = self.publication_type+':'+self.short_title() #mi.timestamp = nowf() mi.timestamp = self.get_dtlocal() mi.comments = self.description if not isinstance(mi.comments, unicode): mi.comments = mi.comments.decode('utf-8', 'replace') #mi.pubdate = nowf() mi.pubdate = self.get_dtlocal() opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') opf = OPFCreator(dir, mi) # Add mastheadImage entry to <guide> section mp = getattr(self, 'masthead_path', None) if mp is not None and os.access(mp, os.R_OK): from calibre.ebooks.metadata.opf2 import Guide ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) ref.type = 'masthead' ref.title = 'Masthead Image' opf.guide.append(ref) manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] manifest.append(os.path.join(dir, 'index.html')) manifest.append(os.path.join(dir, 'index.ncx')) # Get cover cpath = getattr(self, 'cover_path', None) if cpath is None: pf = open(os.path.join(dir, 'cover.jpg'), 'wb') if self.default_cover(pf): cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath manifest.append(cpath) # Get masthead mpath = getattr(self, 'masthead_path', None) if mpath is not None and os.access(mpath, os.R_OK): manifest.append(mpath) opf.create_manifest_from_files_in(manifest) for mani in opf.manifest: if mani.path.endswith('.ncx'): mani.id = 'ncx' if mani.path.endswith('mastheadImage.jpg'): mani.id = 'masthead-image' entries = ['index.html'] toc = TOC(base_path=dir) self.play_order_counter = 0 self.play_order_map = {} def feed_index(num, parent): f = feeds[num] for j, a in enumerate(f): if getattr(a, 'downloaded', False): adir = 'feed_%d/article_%d/'%(num, j) auth = a.author if not auth: auth = None desc = a.text_summary if not desc: desc = None else: desc = self.description_limiter(desc) entries.append('%sindex.html'%adir) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), play_order=po, author=auth, description=desc) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) relp = sp[len(prefix):] entries.append(relp.replace(os.sep, '/')) last = sp if os.path.exists(last): with open(last, 'rb') as fi: src = fi.read().decode('utf-8') soup = BeautifulSoup(src) body = soup.find('body') if body is not None: prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) templ = self.navbar.generate(True, num, j, len(f), not self.has_single_feed, a.orig_url, self.publisher, prefix=prefix, center=self.center_navbar) elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') body.insert(len(body.contents), elem) with open(last, 'wb') as fi: fi.write(unicode(soup).encode('utf-8')) if len(feeds) == 0: raise Exception('All feeds are empty, aborting.') if len(feeds) > 1: for i, f in enumerate(feeds): entries.append('feed_%d/index.html'%i) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter auth = getattr(f, 'author', None) if not auth: auth = None desc = getattr(f, 'description', None) if not desc: desc = None feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po, description=desc, author=auth)) else: entries.append('feed_%d/index.html'%0) feed_index(0, toc) for i, p in enumerate(entries): entries[i] = os.path.join(dir, p.replace('/', os.sep)) opf.create_spine(entries) opf.set_toc(toc) with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) Last edited by tylau0; 02-20-2011 at 10:40 AM. |
![]() |
![]() |
![]() |
#2 |
creator of calibre
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 44,020
Karma: 22669822
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
The description is the only place. But since the Kindle is the only device that has this bug as far as I know, I suggest just defaulting to True. Most people's Kindles will update automatically once 3.1 is released anyway.
|
![]() |
![]() |
Advert | |
|
![]() |
#3 |
Connoisseur
![]() Posts: 82
Karma: 10
Join Date: Oct 2010
Device: Kindle
|
Thanks for your advice. I've decided to set the default to True. Below please find the updated recipe.
Code:
__license__ = 'GPL v3' __copyright__ = '2010-2011, Eddie Lau' # Kindle 3 user with firmware < 3.1 # please replace the following "True" with "False". __MakePeriodical__ = True ''' Change Log: 2011/02/20: skip duplicated links in finance section, put photos which may extend a whole page to the back of the articles clean up the indentation 2010/12/07: add entertainment section, use newspaper front page as ebook cover, suppress date display in section list (to avoid wrong date display in case the user generates the ebook in a time zone different from HKT) 2010/11/22: add English section, remove eco-news section which is not updated daily, correct ordering of articles 2010/11/12: add news image and eco-news section 2010/11/08: add parsing of finance section 2010/11/06: temporary work-around for Kindle device having no capability to display unicode in section/article list. 2010/10/31: skip repeated articles in section pages ''' import os, datetime, time, re from calibre.web.feeds.recipes import BasicNewsRecipe from collections import defaultdict from functools import partial from contextlib import nested, closing from calibre import browser, __appname__, iswindows, strftime, preferred_encoding from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag from calibre.ebooks.metadata.opf2 import OPFCreator from calibre import entity_to_unicode from calibre.web import Recipe from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed from calibre.web.fetch.simple import option_parser as web2disk_option_parser from calibre.web.fetch.simple import RecursiveFetcher from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import now as nowf from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image class MPHKRecipe(BasicNewsRecipe): IsCJKWellSupported = __MakePeriodical__ # to avoid generating periodical in which CJK characters can't be displayed in section/article view title = 'Ming Pao - Hong Kong' oldest_article = 1 max_articles_per_feed = 100 __author__ = 'Eddie Lau' description = 'Hong Kong Chinese Newspaper (http://news.mingpao.com). For Kindle firmware 3.1+. Customize the recipe if not.' publisher = 'MingPao' category = 'Chinese, News, Hong Kong' remove_javascript = True use_embedded_content = False no_stylesheets = True language = 'zh' encoding = 'Big5-HKSCS' recursions = 0 conversion_options = {'linearize_tables':True} timefmt = '' extra_css = 'img {display: block; margin-left: auto; margin-right: auto; margin-top: 10px; margin-bottom: 10px;} font>b {font-size:200%; font-weight:bold;}' masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif' keep_only_tags = [dict(name='h1'), dict(name='font', attrs={'style':['font-size:14pt; line-height:160%;']}), # for entertainment page title dict(attrs={'id':['newscontent']}), # entertainment page content dict(attrs={'id':['newscontent01','newscontent02']}), dict(attrs={'class':['photo']}) ] remove_tags = [dict(name='style'), dict(attrs={'id':['newscontent135']})] # for the finance page remove_attributes = ['width'] preprocess_regexps = [ (re.compile(r'<h5>', re.DOTALL|re.IGNORECASE), lambda match: '<h1>'), (re.compile(r'</h5>', re.DOTALL|re.IGNORECASE), lambda match: '</h1>'), (re.compile(r'<p><a href=.+?</a></p>', re.DOTALL|re.IGNORECASE), # for entertainment page lambda match: '') ] def image_url_processor(cls, baseurl, url): # trick: break the url at the first occurance of digit, add an additional # '_' at the front # not working, may need to move this to preprocess_html() method # minIdx = 10000 # i0 = url.find('0') # if i0 >= 0 and i0 < minIdx: # minIdx = i0 # i1 = url.find('1') # if i1 >= 0 and i1 < minIdx: # minIdx = i1 # i2 = url.find('2') # if i2 >= 0 and i2 < minIdx: # minIdx = i2 # i3 = url.find('3') # if i3 >= 0 and i0 < minIdx: # minIdx = i3 # i4 = url.find('4') # if i4 >= 0 and i4 < minIdx: # minIdx = i4 # i5 = url.find('5') # if i5 >= 0 and i5 < minIdx: # minIdx = i5 # i6 = url.find('6') # if i6 >= 0 and i6 < minIdx: # minIdx = i6 # i7 = url.find('7') # if i7 >= 0 and i7 < minIdx: # minIdx = i7 # i8 = url.find('8') # if i8 >= 0 and i8 < minIdx: # minIdx = i8 # i9 = url.find('9') # if i9 >= 0 and i9 < minIdx: # minIdx = i9 return url def get_dtlocal(self): dt_utc = datetime.datetime.utcnow() # convert UTC to local hk time - at around HKT 6.00am, all news are available dt_local = dt_utc - datetime.timedelta(-2.0/24) return dt_local def get_fetchdate(self): return self.get_dtlocal().strftime("%Y%m%d") def get_fetchformatteddate(self): return self.get_dtlocal().strftime("%Y-%m-%d") def get_fetchday(self): dt_utc = datetime.datetime.utcnow() # convert UTC to local hk time - at around HKT 6.00am, all news are available dt_local = dt_utc - datetime.timedelta(-2.0/24) return self.get_dtlocal().strftime("%d") def get_cover_url(self): cover = 'http://news.mingpao.com/' + self.get_fetchdate() + '/' + self.get_fetchdate() + '_' + self.get_fetchday() + 'gacov.jpg' br = BasicNewsRecipe.get_browser() try: br.open(cover) except: cover = None return cover def parse_index(self): feeds = [] dateStr = self.get_fetchdate() for title, url in [(u'\u8981\u805e Headline', 'http://news.mingpao.com/' + dateStr + '/gaindex.htm'), (u'\u6e2f\u805e Local', 'http://news.mingpao.com/' + dateStr + '/gbindex.htm'), (u'\u793e\u8a55/\u7b46\u9663 Editorial', 'http://news.mingpao.com/' + dateStr + '/mrindex.htm'), (u'\u8ad6\u58c7 Forum', 'http://news.mingpao.com/' + dateStr + '/faindex.htm'), (u'\u4e2d\u570b China', 'http://news.mingpao.com/' + dateStr + '/caindex.htm'), (u'\u570b\u969b World', 'http://news.mingpao.com/' + dateStr + '/taindex.htm'), ('Tech News', 'http://news.mingpao.com/' + dateStr + '/naindex.htm'), (u'\u6559\u80b2 Education', 'http://news.mingpao.com/' + dateStr + '/gfindex.htm'), (u'\u9ad4\u80b2 Sport', 'http://news.mingpao.com/' + dateStr + '/spindex.htm'), (u'\u526f\u520a Supplement', 'http://news.mingpao.com/' + dateStr + '/jaindex.htm'), (u'\u82f1\u6587 English', 'http://news.mingpao.com/' + dateStr + '/emindex.htm')]: articles = self.parse_section(url) if articles: feeds.append((title, articles)) # special - finance fin_articles = self.parse_fin_section('http://www.mpfinance.com/htm/Finance/' + dateStr + '/News/ea,eb,ecindex.htm') if fin_articles: feeds.append((u'\u7d93\u6fdf Finance', fin_articles)) # special - entertainment ent_articles = self.parse_ent_section('http://ol.mingpao.com/cfm/star1.cfm') if ent_articles: feeds.append((u'\u5f71\u8996 Film/TV', ent_articles)) return feeds def parse_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) divs = soup.findAll(attrs={'class': ['bullet','bullet_grey']}) current_articles = [] included_urls = [] divs.reverse() for i in divs: a = i.find('a', href = True) title = self.tag_to_string(a) url = a.get('href', False) url = 'http://news.mingpao.com/' + dateStr + '/' +url if url not in included_urls and url.rfind('Redirect') == -1: current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) included_urls.append(url) current_articles.reverse() return current_articles def parse_fin_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href= True) current_articles = [] included_urls = [] for i in a: url = 'http://www.mpfinance.com/cfm/' + i.get('href', False) if url not in included_urls and not url.rfind(dateStr) == -1 and url.rfind('index') == -1: title = self.tag_to_string(i) current_articles.append({'title': title, 'url': url, 'description':''}) included_urls.append(url) return current_articles def parse_ent_section(self, url): dateStr = self.get_fetchdate() soup = self.index_to_soup(url) a = soup.findAll('a', href=True) a.reverse() current_articles = [] included_urls = [] for i in a: title = self.tag_to_string(i) url = 'http://ol.mingpao.com/cfm/' + i.get('href', False) if (url not in included_urls) and (not url.rfind('.txt') == -1) and (not url.rfind('star') == -1): current_articles.append({'title': title, 'url': url, 'description': ''}) included_urls.append(url) current_articles.reverse() return current_articles def preprocess_html(self, soup): for item in soup.findAll(style=True): del item['style'] for item in soup.findAll(style=True): del item['width'] for item in soup.findAll(stype=True): del item['absmiddle'] return soup def create_opf(self, feeds, dir=None): if dir is None: dir = self.output_dir if self.IsCJKWellSupported == True: # use Chinese title title = u'\u660e\u5831 (\u9999\u6e2f)' #+ ' ' + self.get_fetchformatteddate() else: # use English title title = self.short_title() + ' ' + self.get_fetchformatteddate() if True: # force date in title # title += strftime(self.timefmt) mi = MetaInformation(title, [self.publisher]) mi.publisher = self.publisher mi.author_sort = self.publisher if self.IsCJKWellSupported == True: mi.publication_type = 'periodical:'+self.publication_type+':'+self.short_title() else: mi.publication_type = self.publication_type+':'+self.short_title() #mi.timestamp = nowf() mi.timestamp = self.get_dtlocal() mi.comments = self.description if not isinstance(mi.comments, unicode): mi.comments = mi.comments.decode('utf-8', 'replace') #mi.pubdate = nowf() mi.pubdate = self.get_dtlocal() opf_path = os.path.join(dir, 'index.opf') ncx_path = os.path.join(dir, 'index.ncx') opf = OPFCreator(dir, mi) # Add mastheadImage entry to <guide> section mp = getattr(self, 'masthead_path', None) if mp is not None and os.access(mp, os.R_OK): from calibre.ebooks.metadata.opf2 import Guide ref = Guide.Reference(os.path.basename(self.masthead_path), os.getcwdu()) ref.type = 'masthead' ref.title = 'Masthead Image' opf.guide.append(ref) manifest = [os.path.join(dir, 'feed_%d'%i) for i in range(len(feeds))] manifest.append(os.path.join(dir, 'index.html')) manifest.append(os.path.join(dir, 'index.ncx')) # Get cover cpath = getattr(self, 'cover_path', None) if cpath is None: pf = open(os.path.join(dir, 'cover.jpg'), 'wb') if self.default_cover(pf): cpath = pf.name if cpath is not None and os.access(cpath, os.R_OK): opf.cover = cpath manifest.append(cpath) # Get masthead mpath = getattr(self, 'masthead_path', None) if mpath is not None and os.access(mpath, os.R_OK): manifest.append(mpath) opf.create_manifest_from_files_in(manifest) for mani in opf.manifest: if mani.path.endswith('.ncx'): mani.id = 'ncx' if mani.path.endswith('mastheadImage.jpg'): mani.id = 'masthead-image' entries = ['index.html'] toc = TOC(base_path=dir) self.play_order_counter = 0 self.play_order_map = {} def feed_index(num, parent): f = feeds[num] for j, a in enumerate(f): if getattr(a, 'downloaded', False): adir = 'feed_%d/article_%d/'%(num, j) auth = a.author if not auth: auth = None desc = a.text_summary if not desc: desc = None else: desc = self.description_limiter(desc) entries.append('%sindex.html'%adir) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'), play_order=po, author=auth, description=desc) last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep)) for sp in a.sub_pages: prefix = os.path.commonprefix([opf_path, sp]) relp = sp[len(prefix):] entries.append(relp.replace(os.sep, '/')) last = sp if os.path.exists(last): with open(last, 'rb') as fi: src = fi.read().decode('utf-8') soup = BeautifulSoup(src) body = soup.find('body') if body is not None: prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last)))) templ = self.navbar.generate(True, num, j, len(f), not self.has_single_feed, a.orig_url, self.publisher, prefix=prefix, center=self.center_navbar) elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div') body.insert(len(body.contents), elem) with open(last, 'wb') as fi: fi.write(unicode(soup).encode('utf-8')) if len(feeds) == 0: raise Exception('All feeds are empty, aborting.') if len(feeds) > 1: for i, f in enumerate(feeds): entries.append('feed_%d/index.html'%i) po = self.play_order_map.get(entries[-1], None) if po is None: self.play_order_counter += 1 po = self.play_order_counter auth = getattr(f, 'author', None) if not auth: auth = None desc = getattr(f, 'description', None) if not desc: desc = None feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title, play_order=po, description=desc, author=auth)) else: entries.append('feed_%d/index.html'%0) feed_index(0, toc) for i, p in enumerate(entries): entries[i] = os.path.join(dir, p.replace('/', os.sep)) opf.create_spine(entries) opf.set_toc(toc) with nested(open(opf_path, 'wb'), open(ncx_path, 'wb')) as (opf_file, ncx_file): opf.render(opf_file, ncx_file) |
![]() |
![]() |
![]() |
|
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
[Updated recipe] Ming Pao (明報) - Hong Kong (2010/12/07) | tylau0 | Recipes | 1 | 12-08-2010 11:39 AM |
[Updated recipe] Ming Pao (明報) - Hong Kong | tylau0 | Recipes | 3 | 11-25-2010 03:38 AM |
[Updated recipe] Ming Pao (明報) - Hong Kong | tylau0 | Recipes | 0 | 11-12-2010 06:24 PM |
[Updated recipe] Ming Pao (明報) - Hong Kong | tylau0 | Recipes | 0 | 11-06-2010 06:46 PM |
Ming Pao (明報) - Hong Kong | tylau0 | Recipes | 1 | 10-31-2010 06:08 PM |