Hi, Starson17:
Thanks, but I still need help.
this is my code
class peoplenetrecipe(BasicNewsRecipe):
title = '人民网'
__author__ = 'me'
oldest_article = 3
max_articles_per_feed = 25
feeds = [
('china', 'http://www.people.com.cn/rss/politics.xml'),
('world', 'http://www.people.com.cn/rss/world.xml'),
('finance', 'http://www.people.com.cn/rss/finance.xml'),
('sport', 'http://www.people.com.cn/rss/sports.xml'),
]
no_stylesheets = True
# remove_javascript = True
# encoding = 'UTF-8'
keep_only_tags = [
dict(name='div', attrs={'class':'c_l fl'}),
]
remove_tags = [
dict(name='div', attrs={'class':'tools'}),
dict(name='div', attrs={'class':'box'}),
]
remove_tags_after = [
dict(name='div', attrs={'class':'show_text'}),
]
def append_page(self, soup, appendtag, position):
pager = soup.find('a')
if pager.img['src'] == "/img/next_b.gif":
nexturl = self.INDEX + pager.a['href']
# pager = soup.find('a',attrs={'class':'nextPage greyButton'}) # here is pager
# if pager:
# nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'c_l fl'}) # here is text
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)
it seems not work, the page
http://politics.people.com.cn/GB/1024/15556053.html
is in Chinese, at bottom there is a link to next page, code is
<a href="/GB/1024/15556054.html">
<img src="/img/next_b.gif" border="0"/>
I don't know how to debug the recipe. so, would you pls help to check it?
Thanks
BR