View Single Post
Old 03-22-2019, 07:06 AM   #1
Susa
Junior Member
Susa began at the beginning.
 
Posts: 2
Karma: 10
Join Date: Jan 2019
Device: Kindle Paperwhite 3
Failed to fetch multipage articles


Hello, I have tried to fetch the articles on https://language.chinadaily.com.cn/5...03f6866ee845c/
but I only got the first pages. The append_page method didn't seem to work. I wonder if anyone can help me with the recipe.


Spoiler:

# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe

class shuang1(BasicNewsRecipe):
    """Calibre recipe that downloads articles listed on one China Daily
    language-section index page and stitches multi-page articles together.

    The index page is scanned for ``<p class="gy_box_txt2">`` entries; each
    article keeps only its ``main_title`` and ``mian_txt`` divs.
    """

    title = u'权威发布CD'
    description = 'From China Daily'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    # 'mian_txt' is spelled exactly as in the original recipe; it is the class
    # name looked up in the article pages, not a typo to be corrected here.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'main_title'}),
        dict(name='div', attrs={'class': 'mian_txt'}),
    ]

    def _absolute_url(self, href):
        """Return ``href`` with an ``https:`` scheme added when it is
        protocol-relative (``//host/path``); other hrefs are returned as is.

        Prefixing unconditionally would corrupt hrefs that already carry a
        scheme (``https:https://...``).
        """
        if href.startswith('//'):
            return 'https:' + href
        return href

    def get_title(self, link):
        """Return the stripped first child of ``link`` as the article title."""
        return link.contents[0].strip()

    def parse_index(self):
        """Build the feed list from the section index page.

        Returns a one-element list ``[(feed_title, articles)]`` where each
        article is a dict with ``title`` and ``url`` keys.
        """
        site = 'https://language.chinadaily.com.cn/5af95d44a3103f6866ee845c/'
        soup = self.index_to_soup(site)

        articles = []
        for entry in soup.findAll('p', {'class': 'gy_box_txt2'}):
            anchor = entry.a
            # Skip entries without a usable link instead of crashing the
            # whole index on one malformed paragraph.
            if anchor is None or not anchor.get('href'):
                continue
            articles.append({
                'title': self.get_title(anchor),
                'url': self._absolute_url(anchor.get('href')),
            })

        # Use the recipe title for the feed: the loop variable previously used
        # here was just the last article's title, and was undefined (NameError)
        # when the index contained no matching entries.
        return [(self.title, articles)]

    def append_page(self, soup, appendtag, position, _seen=None):
        """Fetch the page linked by the first ``a.pagestyle`` in ``soup`` and
        insert its ``div.mian_txt`` into ``appendtag`` at ``position``,
        recursing so that later pages are appended to the fetched text first.

        ``_seen`` is internal: the set of page URLs already fetched in this
        chain, used to stop if a pager link points back to a visited page.
        """
        if _seen is None:
            _seen = set()

        pager = soup.find('a', attrs={'class': 'pagestyle'})
        if pager is None or not pager.get('href'):
            return

        nexturl = self._absolute_url(pager['href'])
        if nexturl in _seen:
            # A pager that cycles (e.g. links back to an earlier page) would
            # otherwise recurse without end.
            return
        _seen.add(nexturl)

        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'mian_txt'})
        if texttag is None:
            # Continuation page has no article body; nothing to append.
            return

        # Append any further pages to the end of this page's text before
        # moving the text into the caller's tree.
        self.append_page(soup2, texttag, len(texttag.contents), _seen)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Append continuation pages to the article body, drop pager links,
        and return the soup passed through ``adeify_images``.
        """
        # NOTE(review): calibre documents preprocess_html as running after the
        # keep_only_tags/remove_tags cleanup. If the a.pagestyle link sits
        # outside the kept main_title/mian_txt divs it is already gone here
        # and no pages will be appended -- confirm against the page markup.
        if soup.body is not None:
            self.append_page(soup, soup.body, 3)

        # Remove every pager link, including those carried in with the
        # appended pages, not only the first one.
        for pager in soup.findAll('a', attrs={'class': 'pagestyle'}):
            pager.extract()
        return self.adeify_images(soup)

Susa is offline   Reply With Quote