MobileRead Forums - View Single Post

zhixiangpan · 08-31-2011, 10:46 PM

Hi, Starson17:

Thanks, but I still need help.

this is my code

class peoplenetrecipe(BasicNewsRecipe):
title = '人民网'
__author__ = 'me'
oldest_article = 3
max_articles_per_feed = 25

feeds = [
('china', 'http://www.people.com.cn/rss/politics.xml'),
('world', 'http://www.people.com.cn/rss/world.xml'),
('finance', 'http://www.people.com.cn/rss/finance.xml'),
('sport', 'http://www.people.com.cn/rss/sports.xml'),
]

no_stylesheets = True
# remove_javascript = True
# encoding = 'UTF-8'

keep_only_tags = [
dict(name='div', attrs={'class':'c_l fl'}),
]
remove_tags = [
dict(name='div', attrs={'class':'tools'}),
dict(name='div', attrs={'class':'box'}),
]
remove_tags_after = [
dict(name='div', attrs={'class':'show_text'}),
]

def append_page(self, soup, appendtag, position):

pager = soup.find('a')
if pager.img['src'] == "/img/next_b.gif":
nexturl = self.INDEX + pager.a['href']

# pager = soup.find('a',attrs={'class':'nextPage greyButton'}) # here is pager
# if pager:
# nexturl = self.INDEX + pager.a['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'c_l fl'}) # here is text
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
appendtag.insert(position,texttag)

it seems not work, the page

http://politics.people.com.cn/GB/1024/15556053.html

is in Chinese, at bottom there is a link to next page, code is

<a href="/GB/1024/15556054.html">
<img src="/img/next_b.gif" border="0"/>

I don't know how to debug the recipe. so, would you pls help to check it?

Thanks

BR

08-31-2011, 10:46 PM	#5
zhixiangpan Junior Member Posts: 3 Karma: 10 Join Date: Aug 2011 Device: kindle	Hi, Starson17: Thanks, but I still need help. this is my code class peoplenetrecipe(BasicNewsRecipe): title = '人民网' __author__ = 'me' oldest_article = 3 max_articles_per_feed = 25 feeds = [ ('china', 'http://www.people.com.cn/rss/politics.xml'), ('world', 'http://www.people.com.cn/rss/world.xml'), ('finance', 'http://www.people.com.cn/rss/finance.xml'), ('sport', 'http://www.people.com.cn/rss/sports.xml'), ] no_stylesheets = True # remove_javascript = True # encoding = 'UTF-8' keep_only_tags = [ dict(name='div', attrs={'class':'c_l fl'}), ] remove_tags = [ dict(name='div', attrs={'class':'tools'}), dict(name='div', attrs={'class':'box'}), ] remove_tags_after = [ dict(name='div', attrs={'class':'show_text'}), ] def append_page(self, soup, appendtag, position): pager = soup.find('a') if pager.img['src'] == "/img/next_b.gif": nexturl = self.INDEX + pager.a['href'] # pager = soup.find('a',attrs={'class':'nextPage greyButton'}) # here is pager # if pager: # nexturl = self.INDEX + pager.a['href'] soup2 = self.index_to_soup(nexturl) texttag = soup2.find('div', attrs={'class':'c_l fl'}) # here is text for it in texttag.findAll(style=True): del it['style'] newpos = len(texttag.contents) self.append_page(soup2,texttag,newpos) texttag.extract() appendtag.insert(position,texttag) it seems not work, the page http://politics.people.com.cn/GB/1024/15556053.html is in Chinese, at bottom there is a link to next page, code is <a href="/GB/1024/15556054.html"> <img src="/img/next_b.gif" border="0"/> I don't know how to debug the recipe. so, would you pls help to check it? Thanks BR