The plot thickens...back to the two articles, hereafter 'HP_Article' and 'Opinion_Article'...I tried html5lib in preprocess, HP_Article downloaded, and Opinion_Article did not (there was an error in the ihatexml.py file in html5lib...not sure that was related).
So I tried parsing the raw data with lxml, isolating the <article> tag, reconstituting the HTML, and passing it out — same result. I'm not sure whether further cleaning is required here or something else is going on. It seems to me that if the HTML comes directly from lxml (as in this case), it ought to work, but clearly that assumption is wrong. Recipe below; the attached zipfile has logs, the raw HTML, the reprocessed HTML, and the epub file.
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
import html5lib
from lxml import etree, html
from lxml.html import builder as E
import copy
# http://online.wsj.com/page/us_in_todays_paper.html
class WallStreetJournal(BasicNewsRecipe):
    """Debugging recipe for two WSJ articles.

    Downloads two hard-coded articles (see :meth:`parse_index`), extracts the
    article body in :meth:`preprocess_raw_html`, and optionally dumps the raw
    and preprocessed HTML to disk for inspection.
    """

    title = 'The Wall Street Journal'
    __author__ = 'Kovid Goyal and Joshua Oster-Morris'
    description = 'News and current affairs'
    needs_subscription = True
    language = 'en'
    compress_news_images = True
    compress_news_images_auto_size = 5
    max_articles_per_feed = 1000
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    # Maps the trailing numeric suffix of each test URL to a human-readable
    # name used for the debug dump files below.
    suffix_dict = {'1412643100': 'HP_ARTICLE', '1412636585': 'Opinion_Article'}
    # When True, write the raw and preprocessed HTML of every article into
    # print_file_loc for offline inspection.
    print_files = True
    print_file_loc = 'E:\\Temp\\wsjTest\\'

    keep_only_tags = [
        dict(name='h1'), dict(name='h2', attrs={'class': ['subhead', 'subHed deck']}),
        dict(name='span', itemprop='author', rel='author'),
        dict(name='article', id=['article-contents', 'articleBody']),
        dict(name='div', id='article_story_body'),
        dict(name='div', attrs={'class': 'snippet-ad-login'}),
        dict(name='div', attrs={'data-module-name': 'resp.module.article.articleBody'}),
    ]

    remove_tags = [
        dict(attrs={'class': ['insetButton', 'insettipBox']}),
        dict(name='span', attrs={'data-country-code': True, 'data-ticker-code': True}),
    ]

    use_javascript_to_login = True

    def preprocess_raw_html(self, raw_html, url):
        """Isolate the <article> body and rebuild a minimal HTML document.

        Falls back to returning *raw_html* unchanged when no article body is
        found, so downstream processing (keep_only_tags) still gets a chance.
        """
        html_parser = etree.HTMLParser()
        html_parsed = etree.fromstring(raw_html, parser=html_parser)
        # BUG FIX: the original predicate was
        #   @id=('article-contents' or 'articleBody')
        # In XPath 1.0, ('x' or 'y') is the boolean true(), so the predicate
        # collapsed to @id = true() and matched ANY <article> that merely had
        # an id attribute. Spell out the two comparisons explicitly.
        selected = html_parsed.xpath(
            "//article[@id='article-contents' or @id='articleBody']")
        if not selected:
            # Nothing matched (e.g. a different page layout): pass the page
            # through unchanged instead of raising IndexError on selected[0].
            self.log('No matching <article> body found for', url)
            return raw_html
        html_out = E.HTML(E.BODY(selected[0]))
        self.log('Preprocessing URL:', url)
        # The URL ends in '-<numeric id>'; unknown ids get a generic name
        # instead of raising KeyError.
        name = self.suffix_dict.get(url.split('-')[-1], 'Unknown_Article')
        output = etree.tostring(html_out)
        if self.print_files:
            # raw_html may arrive as str (decoded by calibre); the dump files
            # are opened in binary mode, so encode defensively.
            raw_bytes = raw_html if isinstance(raw_html, bytes) else raw_html.encode('utf-8')
            with open(self.print_file_loc + name + '-raw.html', 'wb') as f:
                f.write(raw_bytes)
            with open(self.print_file_loc + name + '-preprocessed.html', 'wb') as f:
                f.write(output)
        return output

    def javascript_login(self, br, username, password):
        """Log in to the WSJ standalone login page via the JS browser."""
        br.visit('https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)
        f = br.select_form(nr=0)
        f['username'] = username
        f['password'] = password
        br.submit(timeout=120)

    def populate_article_metadata(self, article, soup, first):
        """Use the first image of the first article as the TOC thumbnail."""
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def preprocess_html(self, soup):
        """Remove thumbnails inside zoomable-image containers."""
        for div in soup.findAll('div', attrs={'class': lambda x: x and 'insetZoomTargetBox' in x.split()}):
            img = div.find('img')
            if img is not None:
                img.extract()
        return soup

    def parse_index(self):
        """Return a single hand-built section containing the two test articles."""
        feeds = []
        articles = []
        # will parse
        title1 = 'HP_Article'
        desc1 = 'A News Article about Hewlett Packard'
        url1 = 'http://online.wsj.com/articles/hewlett-packard-split-comes-as-more-investors-say-big-isnt-better-1412643100'
        articles.append({'title': title1, 'url': url1, 'description': desc1, 'date': ''})
        # won't parse
        title = "Opinion_Article"
        desc = 'An Opinion Article about China Bubble'
        url = 'http://online.wsj.com/articles/bret-stephens-hong-kong-pops-the-china-bubble-1412636585'
        articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
        # bundle and return
        section = "This Sample Section"
        feeds.append((section, articles))
        return feeds

    def cleanup(self):
        """Log out so the subscription session is not left open."""
        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')