Hello there,
A China Daily recipe already exists in the calibre builtins, but it is an English-only version. This one fetches articles with Chinese interleaved with English throughout the text. I hope this helps.
China Daily (Chinese-English):
Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
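
# Safety cap on how many continuation pages to fetch for a single article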
PAGE_LIMIT = 50
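

# Make scheme-relative ('//...') and site-relative ('/...') links absolute.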
def absurl(url):
    if url.startswith('//'):
        return 'https:' + url
    elif url.startswith('/'):
        return 'https://language.chinadaily.com.cn' + url
    return url

class ChinaDailyCN_EN(BasicNewsRecipe):
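    # Feed title shown in calibre: 权威发布 means 'authoritative release';
    # 'CD' presumably stands for China Daily.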
    title = u'权威发布CD'
    __author__ = 'Jose Ortiz'
    description = 'From China Daily'
    encoding = 'utf-8'
    language = 'zh'
    no_stylesheets = True
    remove_javascript = True
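
    # Keep only the headline, the bilingual body text and the pager used to
    # find continuation pages. Note: 'mian_txt' is not a typo in the recipe;
    # it matches the class name used in the site's own markup.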
    keep_only_tags = [
        dict(name='div', attrs={'class': 'main_title'}),
        dict(name='div', attrs={'class': 'mian_txt'}),
        dict(name='span', attrs={'class': 'next'})
    ]
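
    # Build the feed from the section index page: each article teaser is a
    # <p class="gy_box_txt2"> whose first link points at the article.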
    def parse_index(self):
        site = 'https://language.chinadaily.com.cn/5af95d44a3103f6866ee845c/'
        soup = self.index_to_soup(site)
        plist = soup.findAll('p', {'class': 'gy_box_txt2'})
        articles = []
        for a in [p.a for p in plist if p.a]:
            title = self.tag_to_string(a)
            url = absurl(a['href'])
            articles.append({'title': title, 'url': url})
        return [('Articles', articles)]
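
    # Some articles are split across several pages; follow the 'next' links,
    # fetch each continuation page and splice its text into the first page.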
    def preprocess_html(self, soup):
        try:
            span_next = soup.find('span', {'class': 'next'})
            nexturl = absurl(span_next.find('a', {'class': 'pagestyle'})['href'])
        except Exception:
            self.log('No extra pages for this one.')
            return self.adeify_images(soup)
        span_next.extract()
        self.log('Found extra page(2) at', nexturl)
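        # Follow up to PAGE_LIMIT 'next' links. Each page's text block is
        # prepended to the cache, so the cache ends up in reverse page order.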
        cache = []
        for i in range(PAGE_LIMIT):
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', {'class': 'mian_txt'})
            texttag.extract()
            cache.insert(0, texttag)
            try:
                span_next = soup2.find('span', {'class': 'next'})
                nexturl = absurl(span_next.find('a', {'class': 'pagestyle'})['href'])
                self.log('Found extra page(' + unicode(i + 3) + ') at', nexturl)
            except Exception:
                break
        else:
            self.log.debug('Exhausted page limit of', PAGE_LIMIT)
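        # Insert the cached page bodies right after the first page's text
        # block; because the cache is in reverse order, inserting each one at
        # the same fixed index restores the original reading order.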
        div = soup.body.find('div', {'class': 'mian_txt'})
        index = 1 + div.parent.contents.index(div)
        for tag in cache:
            div.parent.insert(index, tag)
        return self.adeify_images(soup)