Hello there,
A China Daily recipe already exists in the calibre builtins, but it is an English-only version. This one fetches articles with Chinese interleaved with English throughout the text. I hope this helps.
China Daily (Chinese-English):
Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
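
# Safety cap on how many continuation pages to fetch for a single article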
PAGE_LIMIT = 50
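

# Make scheme-relative ('//...') and site-relative ('/...') links absolute.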
def absurl(url):
    if url.startswith('//'):
        return 'https:' + url
    elif url.startswith('/'):
        return 'https://language.chinadaily.com.cn' + url
    return url

class ChinaDailyCN_EN(BasicNewsRecipe):
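    # Feed title shown in calibre: 权威发布 means 'authoritative release';
    # 'CD' presumably stands for China Daily.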
    title = u'权威发布CD'
    __author__ = 'Jose Ortiz'
    description = 'From China Daily'
    encoding = 'utf-8'
    language = 'zh'
    no_stylesheets = True
    remove_javascript = True
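
    # Keep only the headline, the bilingual body text and the pager used to
    # find continuation pages. Note: 'mian_txt' is not a typo in the recipe;
    # it matches the class name used in the site's own markup.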
    keep_only_tags = [
        dict(name='div', attrs={'class': 'main_title'}),
        dict(name='div', attrs={'class': 'mian_txt'}),
        dict(name='span', attrs={'class': 'next'})
    ]
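
    # Build the feed from the section index page: each article teaser is a
    # <p class="gy_box_txt2"> whose first link points at the article.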
    def parse_index(self):
        site = 'https://language.chinadaily.com.cn/5af95d44a3103f6866ee845c/'
        soup = self.index_to_soup(site)
        plist = soup.findAll('p', {'class': 'gy_box_txt2'})
        articles = []
        for a in [p.a for p in plist if p.a]:
            title = self.tag_to_string(a)
            url = absurl(a['href'])
            articles.append({'title': title, 'url': url})
        return [('Articles', articles)]
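
    # Some articles are split across several pages; follow the 'next' links,
    # fetch each continuation page and splice its text into the first page.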
    def preprocess_html(self, soup):
        try:
            span_next = soup.find('span', {'class': 'next'})
            nexturl = absurl(span_next.find('a', {'class': 'pagestyle'})['href'])
        except Exception:
            self.log('No extra pages for this one.')
            return self.adeify_images(soup)
        span_next.extract()
        self.log('Found extra page(2) at', nexturl)
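        # Follow up to PAGE_LIMIT 'next' links. Each page's text block is
        # prepended to the cache, so the cache ends up in reverse page order.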
        cache = []
        for i in range(PAGE_LIMIT):
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', {'class': 'mian_txt'})
            texttag.extract()
            cache.insert(0, texttag)
            try:
                span_next = soup2.find('span', {'class': 'next'})
                nexturl = absurl(span_next.find('a', {'class': 'pagestyle'})['href'])
                self.log('Found extra page(' + unicode(i + 3) + ') at', nexturl)
            except Exception:
                break
        else:
            self.log.debug('Exhausted page limit of', PAGE_LIMIT)
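        # Insert the cached page bodies right after the first page's text
        # block; because the cache is in reverse order, inserting each one at
        # the same fixed index restores the original reading order.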
        div = soup.body.find('div', {'class': 'mian_txt'})
        index = 1 + div.parent.contents.index(div)
        for tag in cache:
            div.parent.insert(index, tag)
        return self.adeify_images(soup)