
Hello, I have tried to fetch the articles from
https://language.chinadaily.com.cn/5af95d44a3103f6866ee845c/
but I only get the first page of each article. The append_page method doesn't seem to work. I wonder if anyone can help me with the recipe.
Spoiler:
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
class shuang1(BasicNewsRecipe):
    """Calibre recipe that builds one feed from a China Daily index page.

    The index page is scanned for ``<p class="gy_box_txt2">`` entries; each
    entry's link becomes one article.  Articles that are split over several
    pages are stitched together in ``preprocess_html`` by following the
    ``<a class="pagestyle">`` pager link and appending each further page's
    ``<div class="mian_txt">`` to the first page.
    """

    title = u'权威发布CD'
    description = 'From China Daily'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    # Only the headline and the text container are kept from each page.
    # ('mian_txt' is the class name this recipe matches on; do not "correct" it.)
    keep_only_tags = [
        dict(name='div', attrs={'class': 'main_title'}),
        dict(name='div', attrs={'class': 'mian_txt'}),
    ]

    # Index page listing the articles; also the base for resolving hrefs.
    INDEX = 'https://language.chinadaily.com.cn/5af95d44a3103f6866ee845c/'

    @staticmethod
    def _absolute_url(href, base):
        """Resolve *href* (protocol-relative, relative or absolute) against *base*."""
        try:
            from urllib.parse import urljoin
        except ImportError:  # Python 2 fallback
            from urlparse import urljoin
        return urljoin(base, href.strip())

    def get_title(self, link):
        """Return the first child of *link*, stripped of surrounding whitespace.

        Not called by the other methods of this class; kept for callers
        that may rely on it.
        """
        return link.contents[0].strip()

    def parse_index(self):
        """Build the feed list from the index page.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where each
            article is a dict with ``'title'`` and ``'url'`` keys.
        """
        soup = self.index_to_soup(self.INDEX)
        articles = []
        for entry in soup.findAll('p', {'class': 'gy_box_txt2'}):
            anchor = entry.a
            # Skip entries without a usable link instead of crashing on them.
            if anchor is None or not anchor.get('href'):
                continue
            articles.append({
                'title': self.tag_to_string(anchor).strip(),
                'url': self._absolute_url(anchor['href'], self.INDEX),
            })
        if not articles:
            self.log.warn('No articles found on index page %s' % self.INDEX)
        # Name the feed after the recipe, not after whichever article
        # happened to be processed last.
        return [(self.title, articles)]

    def append_page(self, soup, appendtag, position, seen=None):
        """Follow the pager link in *soup* and splice the next page's text in.

        Args:
            soup: Parsed page whose pager link is followed.
            appendtag: Tag that receives the next page's text container.
            position: Child index in *appendtag* at which to insert it.
            seen: Set of page URLs already fetched for this article; used
                to stop if the pager ever links back to a visited page.
                Callers normally omit it.
        """
        if seen is None:
            seen = set()

        # NOTE(review): this takes the FIRST <a class="pagestyle"> on the
        # page and treats it as the "next page" link.  Confirm against the
        # site's markup that later pages do not list a "first"/"previous"
        # link before the "next" one.  Also confirm the pager is still
        # present when preprocess_html runs: if it sits outside
        # div.mian_txt, keep_only_tags may already have removed it.
        pager = soup.find('a', attrs={'class': 'pagestyle'})
        if pager is None or not pager.get('href'):
            return

        nexturl = self._absolute_url(pager['href'], self.INDEX)
        if nexturl in seen:
            return  # pager loops back: stop instead of recursing forever
        seen.add(nexturl)

        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'mian_txt'})
        if texttag is None:
            return  # next page has no text container; nothing to append

        # Pull in any further pages first, nesting them at the end of this
        # page's text, then move the whole container into the caller's tree.
        self.append_page(soup2, texttag, len(texttag.contents), seen)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Stitch continuation pages onto the article and drop pager links."""
        body = soup.body
        if body is not None:
            # Append after whatever the body already holds rather than at a
            # hard-coded child index.
            self.append_page(soup, body, len(body.contents))
        # Remove every pager link, including those carried in by the
        # appended pages.
        for pager in soup.findAll('a', attrs={'class': 'pagestyle'}):
            pager.extract()
        return self.adeify_images(soup)