Here's a first attempt at a recipe for Taiwan's Apple Daily News. rty and others, feel free to add comments, clean it up, etc.
Spoiler:
class AdvancedUserRecipe1277443634(BasicNewsRecipe):
    """Calibre recipe for Apple Daily (Taiwan), built from the site's RSS feeds.

    Each article page is reduced to the ``article_left`` container, and the
    toolbars, pager, intro photo, feedback link and zoom control inside it
    are removed.
    """

    # --- Metadata -----------------------------------------------------------
    title = u'蘋果日報'
    __author__ = 'einstuerzende'
    __version__ = '1.0'
    description = 'Apple Daily (Taiwan)'
    # Spelled `publisher`; the original `pubisher` is not an attribute that
    # BasicNewsRecipe documents, so the value was never picked up.
    publisher = 'Next Media'
    category = 'News, Chinese'
    # NOTE(review): calibre's bundled recipes appear to use codes such as
    # 'zh_TW'; confirm that 'zh-HANT' is recognized.
    language = 'zh-HANT'
    masthead_url = 'http://tw.img.nextmedia.com/www/images/atnextheader_logo_appledaily.gif'

    # --- Fetch options ------------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    encoding = 'UTF-8'

    feeds = [
        (u'\u982D\u689D', u'http://tw.nextmedia.com/rss/create/type/1077'),  # Headlines
        (u'\u8981\u805E', u'http://tw.nextmedia.com/rss/create/type/11'),  # Top news
        (u'\u653F\u6CBB', u'http://tw.nextmedia.com/rss/create/type/151'),  # Politics
        (u'\u793E\u6703', u'http://tw.nextmedia.com/rss/create/type/1066'),  # Society
        (u'\u751F\u6D3B', u'http://tw.nextmedia.com/rss/create/type/2724'),  # Life
        (u'\u5730\u65B9\u7D9C\u5408', u'http://tw.nextmedia.com/rss/create/type/1076'),  # Local roundup
        (u'\u6696\u6D41', u'http://tw.nextmedia.com/rss/create/type/9499'),  # "Warm current"
        (u'\u6295\u8A34', u'http://tw.nextmedia.com/rss/create/type/16287'),  # Complaints
        (u'\u8AD6\u58C7', u'http://tw.nextmedia.com/rss/create/type/824711'),  # Forum
    ]

    # --- Clean-up and styling ----------------------------------------------
    remove_javascript = True
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}

    extra_css = '''
        @font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n
        body {margin-right: 8pt; font-family: 'DroidFont', serif;}\n
        h1 {font-family: 'DroidFont', serif;}\n
        .articledescription {font-family: 'DroidFont', serif;}
    '''

    # Keep only the main article container ...
    keep_only_tags = [dict(name='div', attrs={'id': ['article_left']})]
    # ... and drop these elements from it.
    remove_tags = [
        # 'articleIntroPhoto' had a stray space in the pasted source
        # ('articleI ntroPhoto'), which could never match an element id.
        dict(name='div', attrs={'id': ['articleTools', 'articleTools2', 'pagebar', 'articleIntroPhoto']}),
        dict(name='div', attrs={'class': 'gotoFeedback'}),
        dict(name='span', attrs={'class': 'zoom'}),
    ]
Quote:
Originally Posted by Starson17
No. It shouldn't be a problem. Are you sure that whatever you think is "after" is really "after" that tag? Try printing the soup in your preprocess_html and actually look. Sometimes the order of the tags as seen in Firefox (or whatever you use to check the tag order) is not the same as what your recipe actually sees when it runs. I've had a lot of trouble with remove_tags_before and remove_tags_after.
|
Thanks for the tip; I'll see tonight if I can get something more out of playing with the command line. If anybody can help with the Chinese WSJ recipe, I'm dying to know what I'm missing. I'll keep working on it, but even with a recipe this simple, it seems to be pulling the whole page:
Spoiler:
class AdvancedUserRecipe1278740771(BasicNewsRecipe):
    """Calibre recipe for the Chinese edition of the Wall Street Journal.

    Only the feed labelled 特寫 (features) is enabled; the remaining feeds
    are kept as commented-out entries. Article pages are reduced to the
    ``div`` elements whose id is ``headline`` or ``A``, and inline ``style``
    and ``width`` attributes are stripped in ``preprocess_html``.
    """

    # --- Metadata -----------------------------------------------------------
    title = u'WSJ 華爾街日報'
    __author__ = 'x'
    # Fixed typo: was 'Wall Stree Journal'.
    description = 'Wall Street Journal - Chinese edition'
    # Spelled `publisher`; the original `pubisher` is not an attribute that
    # BasicNewsRecipe documents, so the value was never picked up.
    publisher = 'Dow Jones & Company, Inc.'
    category = 'News, Business'
    # NOTE(review): the feeds are the Big5 edition while the language is
    # declared as 'zh-cn'; confirm which language code is intended.
    language = 'zh-cn'
    timefmt = ' [%Y %b %d]'

    # --- Fetch options ------------------------------------------------------
    oldest_article = 14
    # NOTE(review): very low cap, presumably for quick test runs; confirm.
    max_articles_per_feed = 2
    use_embedded_content = False
    encoding = 'big5'

    feeds = [
        # (u'\u8981\u805E', u'http://chinese.wsj.com/big5/rss01.xml'),  # Top news
        (u'\u7279\u5BEB', u'http://chinese.wsj.com/big5/rss02.xml'),  # Features
        # (u'\u4E2D\u6E2F\u53F0', u'http://chinese.wsj.com/big5/rssbch.xml'),  # China / HK / Taiwan
        # (u'\u570B\u969B\u8CA1\u7D93', u'http://chinese.wsj.com/big5/rssglobal.xml'),  # International finance
        # (u'\u4E2D\u570B\u80A1\u5E02', u'http://chinese.wsj.com/big5/rsschinastock.xml'),  # China stocks
        # (u'\u9999\u6E2F\u80A1\u5E02', u'http://chinese.wsj.com/big5/rssHKstock.xml'),  # Hong Kong stocks
        # Trailing commas added to the entries below so that any of them
        # can be uncommented without producing a syntax error.
        # (u'\u5916\u532F\u5E02\u5834', u'http://chinese.wsj.com/big5/rssforex.xml'),  # Forex market
        # (u'\u5168\u7403\u91D1\u878D\u5E02\u5834', u'http://chinese.wsj.com/big5/rssmarkets.xml'),  # Global financial markets
        # (u'\u79D1\u6280', u'http://chinese.wsj.com/big5/rsstech.xml'),  # Technology
        # (u'\u80FD\u6E90\u8207\u6C7D\u8ECA', u'http://chinese.wsj.com/big5/rssautoene.xml'),  # Energy and autos
    ]

    # --- Clean-up and styling ----------------------------------------------
    remove_javascript = True
    no_stylesheets = True
    # conversion_options = {'linearize_tables':True}

    extra_css = '''
        @font-face { font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n
        body {
            margin-right: 8pt;
            font-family: 'DroidFont', serif;}
        .left_content {font-family: 'DroidFont', serif, sans-serif}
    '''

    # Keep only the divs whose id is 'headline' or 'A'.
    keep_only_tags = [dict(name='div', attrs={'id': ['headline', 'A']})]

    def preprocess_html(self, soup):
        """Strip inline ``style`` and ``width`` attributes from every tag.

        :param soup: parsed article page; must provide ``findAll``.
        :return: the same ``soup`` object, modified in place.
        """
        for attr in ('style', 'width'):
            # findAll(<attr>=True) selects every tag carrying that attribute.
            for tag in soup.findAll(**{attr: True}):
                del tag[attr]
        return soup