View Single Post
Old 02-18-2011, 10:17 PM   #3
Jmot
Junior Member
Jmot began at the beginning.
 
Posts: 9
Karma: 10
Join Date: Feb 2011
Device: Kindle
Well, I did use "keep_only_tags" as shown below, and I confirmed that there is a "<div id="bodypart">" in the HTML. Unfortunately, it still does not work, so I'm wondering if I'm missing something. Any suggestions? Thanks.

===
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1277443666(BasicNewsRecipe):
    """Calibre news recipe for the WSJ feeds under chinese.wsj.com/big5/.

    The recipe is purely declarative: it only sets class attributes that
    ``BasicNewsRecipe`` reads when it downloads the feeds. Two feeds are
    enabled; the remaining ones are kept as commented-out entries that can
    be switched on individually.
    """

    title = u'x WSJ 華爾街日報'
    # Article age limit (in days, per the calibre recipe documentation).
    oldest_article = 32
    # Cap on the number of articles fetched from each feed.
    max_articles_per_feed = 2

    # Each entry is a (section title, feed URL) pair.
    feeds = [
        (u'\u8981\u805E', u'http://chinese.wsj.com/big5/rss01.xml'),
        (u'Report', u'http://chinese.wsj.com/big5/rss02.xml'),
        # Optional feeds. Every entry ends with a comma so that any
        # combination of them can be uncommented without breaking the list.
        # (u'\u4E2D\u6E2F\u53F0', u'http://chinese.wsj.com/big5/rssbch.xml'),
        # (u'\u570B\u969B\u8CA1\u7D93', u'http://chinese.wsj.com/big5/rssglobal.xml'),
        # (u'\u4E2D\u570B\u80A1\u5E02', u'http://chinese.wsj.com/big5/rsschinastock.xml'),
        # (u'\u9999\u6E2F\u80A1\u5E02', u'http://chinese.wsj.com/big5/rssHKstock.xml'),
        # (u'\u5916\u532F\u5E02\u5834', u'http://chinese.wsj.com/big5/rssforex.xml'),
        # (u'\u5168\u7403\u91D1\u878D\u5E02\u5834', u'http://chinese.wsj.com/big5/rssmarkets.xml'),
        # (u'\u79D1\u6280', u'http://chinese.wsj.com/big5/rsstech.xml'),
        # (u'\u80FD\u6E90\u8207\u6C7D\u8ECA', u'http://chinese.wsj.com/big5/rssautoene.xml'),
    ]

    remove_javascript = True

    # Keep only the <div id="bodypart"> element of each downloaded page.
    # NOTE(review): if this filter appears to have no effect, check that the
    # page actually fetched for each article (after any redirect) contains
    # this div -- TODO confirm against the downloaded HTML.
    keep_only_tags = [
        dict(name='div', attrs={'id': 'bodypart'}),
    ]

    # Alternative clean-up rules tried during debugging; currently disabled.
    # remove_tags = [dict(name='div', attrs={'class': ['homepage']})]
    # remove_tags_after = dict(id='bodypart')
Jmot is offline   Reply With Quote