MobileRead Forums - View Single Post - Help Please: remove_tags doesn't work in WSJ Chinese

Jmot · 02-18-2011, 10:17 PM

Well, I did use "keep_only_tags" as shown below, and I confirmed that there is a "<div id="bodypart">" in the HTML. Unfortunately, it still does not work, so I'm wondering if I'm missing something. Any suggestions? Thanks.

===
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1277443666(BasicNewsRecipe):
    """Recipe settings for the Chinese (Big5) Wall Street Journal feeds.

    The class only assigns class-level options that ``BasicNewsRecipe``
    reads; it overrides no methods.
    """

    title = u'x WSJ 華爾街日報'

    # Article-selection limits applied to every feed listed below.
    max_articles_per_feed = 2
    oldest_article = 32

    remove_javascript = True

    # Intended to reduce each article page to its <div id="bodypart">
    # element (calibre's ``keep_only_tags`` option).
    keep_only_tags = [
        {'name': 'div', 'attrs': {'id': 'bodypart'}},
    ]

    # (section title, RSS URL) pairs. Only the first two feeds are active;
    # the rest are kept as comments so they can be re-enabled later.
    feeds = [
        (u'\u8981\u805E', u'http://chinese.wsj.com/big5/rss01.xml'),
        (u'Report', u'http://chinese.wsj.com/big5/rss02.xml'),
        # (u'\u4E2D\u6E2F\u53F0', u'http://chinese.wsj.com/big5/rssbch.xml'),
        # (u'\u570B\u969B\u8CA1\u7D93', u'http://chinese.wsj.com/big5/rssglobal.xml'),
        # (u'\u4E2D\u570B\u80A1\u5E02', u'http://chinese.wsj.com/big5/rsschinastock.xml'),
        # (u'\u9999\u6E2F\u80A1\u5E02', u'http://chinese.wsj.com/big5/rssHKstock.xml'),
        # (u'\u5916\u532F\u5E02\u5834', u'http://chinese.wsj.com/big5/rssforex.xml'),
        # (u'\u5168\u7403\u91D1\u878D\u5E02\u5834', u'http://chinese.wsj.com/big5/rssmarkets.xml'),
        # (u'\u79D1\u6280', u'http://chinese.wsj.com/big5/rsstech.xml'),
        # (u'\u80FD\u6E90\u8207\u6C7D\u8ECA', u'http://chinese.wsj.com/big5/rssautoene.xml'),
    ]

    # Alternative clean-up options that were tried and left disabled:
    # remove_tags = [dict(name='div', attrs={'class': ['homepage']})]
    # remove_tags_after = dict(id='bodypart')

02-18-2011, 10:17 PM	#3
Jmot Junior Member Posts: 9 Karma: 10 Join Date: Feb 2011 Device: Kindle	Well, I did use the "keep_only_tags" as the following and confirm that there are "<div id="bodypart">" in the HMTL. Unfortunately, it still does not work. So I'm wondering if I'm missing something. Any suggestion? Thanks. === from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1277443666(BasicNewsRecipe): title = u'x WSJ 華爾街日報' oldest_article = 32 max_articles_per_feed = 2 feeds = [ (u'\u8981\u805E', u'http://chinese.wsj.com/big5/rss01.xml'), (u'Report', u'http://chinese.wsj.com/big5/rss02.xml'), #(u'\u4E2D\u6E2F\u53F0', u'http://chinese.wsj.com/big5/rssbch.xml'), #(u'\u570B\u969B\u8CA1\u7D93', u'http://chinese.wsj.com/big5/rssglobal.xml'), #(u'\u4E2D\u570B\u80A1\u5E02', u'http://chinese.wsj.com/big5/rsschinastock.xml'), #(u'\u9999\u6E2F\u80A1\u5E02', u'http://chinese.wsj.com/big5/rssHKstock.xml'), #(u'\u5916\u532F\u5E02\u5834', u'http://chinese.wsj.com/big5/rssforex.xml') #(u'\u5168\u7403\u91D1\u878D\u5E02\u5834', u'http://chinese.wsj.com/big5/rssmarkets.xml') #(u'\u79D1\u6280', u'http://chinese.wsj.com/big5/rsstech.xml') #(u'\u80FD\u6E90\u8207\u6C7D\u8ECA', u'http://chinese.wsj.com/big5/rssautoene.xml') ] remove_javascript = True keep_only_tags = [ dict(name='div', attrs={'id':'bodypart'}) ] # remove_tags = [dict(name='div', attrs={'class':['homepage']})] #remove_tags_after = dict(id='bodypart') #remove_javascript = True