View Single Post
Old 10-08-2014, 11:10 PM   #11
dkfurrow
Member
dkfurrow began at the beginning.
 
Posts: 13
Karma: 10
Join Date: Jun 2013
Device: LG G-Pad 8.3
The plot thickens... back to the two articles, hereafter 'HP_Article' and 'Opinion_Article'. I tried html5lib in preprocess_raw_html: HP_Article downloaded, but Opinion_Article did not (there was an error in html5lib's ihatexml.py file — I'm not sure whether that was related).

So I tried parsing the raw data with lxml, isolating the <article> tag, reconstituting the HTML, and passing it out... same result. I'm not sure whether further cleaning is required here or something else is going on. It seems to me that if the HTML comes directly from lxml (as in this case), it ought to work — clearly that assumption is wrong. The recipe is below; the attached zipfile contains the logs, the raw HTML, the reprocessed HTML, and the epub file.

Code:
#!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe
import html5lib
from lxml import etree, html
from lxml.html import builder as E
import copy

# http://online.wsj.com/page/us_in_todays_paper.html

class WallStreetJournal(BasicNewsRecipe):
    """Calibre recipe for The Wall Street Journal.

    Includes debug hooks: when ``print_files`` is True, the raw and
    preprocessed HTML of every article is dumped to ``print_file_loc``
    so failed conversions can be inspected offline.
    """

    title = 'The Wall Street Journal'
    __author__ = 'Kovid Goyal and Joshua Oster-Morris'
    description = 'News and current affairs'
    needs_subscription = True
    language = 'en'

    compress_news_images = True
    compress_news_images_auto_size = 5
    max_articles_per_feed = 1000
    timefmt  = ' [%a, %b %d, %Y]'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}
    # Maps the trailing numeric id of each test URL to a human-readable
    # prefix used for the debug dump file names.
    suffix_dict = {'1412643100': 'HP_ARTICLE', '1412636585': 'Opinion_Article'}
    print_files = True        # dump raw/preprocessed HTML to disk for debugging
    print_file_loc = 'E:\\Temp\\wsjTest\\'

    keep_only_tags = [
        dict(name='h1'), dict(name='h2', attrs={'class':['subhead', 'subHed deck']}),
        dict(name='span', itemprop='author', rel='author'),
        dict(name='article', id=['article-contents', 'articleBody']),
        dict(name='div', id='article_story_body'),
        dict(name='div', attrs={'class':'snippet-ad-login'}),
        dict(name='div', attrs={'data-module-name':'resp.module.article.articleBody'}),
    ]

    def preprocess_raw_html(self, raw_html, url):
        """Isolate the <article> body element and return it as a fresh,
        minimal HTML document.

        Falls back to the unmodified page when no recognised article
        container is found, and optionally dumps both the raw and the
        preprocessed HTML to ``print_file_loc`` for debugging.
        """
        html_parser = etree.HTMLParser()
        html_parsed = etree.fromstring(raw_html, parser=html_parser)
        # BUG FIX: the original predicate @id=('article-contents' or 'articleBody')
        # compares @id against the boolean true() (XPath 1.0 evaluates
        # 'a' or 'b' as a boolean), so it matched ANY <article> carrying an
        # id attribute. Spell out the two accepted ids explicitly.
        selected = html_parsed.xpath(
            "//article[@id='article-contents' or @id='articleBody']")
        if not selected:
            # Best effort: pass the page through unchanged instead of
            # crashing with IndexError when the container is absent.
            self.log('No recognised article container found for', url)
            return raw_html
        html_out = E.HTML(E.BODY(selected[0]))
        self.log( "Preprocessing URL:",  url)
        # Debug file-name prefix; fall back to the raw trailing id so an
        # unlisted URL does not raise KeyError.
        article_id = url.split('-')[-1]
        name = self.suffix_dict.get(article_id, article_id)
        output = etree.tostring(html_out)
        if self.print_files:
            # etree.tostring returns bytes, but raw_html may be str:
            # encode before writing in binary mode. Context managers
            # guarantee the handles are closed even if a write fails.
            raw_bytes = raw_html.encode('utf-8') if isinstance(raw_html, str) else raw_html
            with open(self.print_file_loc + name + '-raw.html', 'wb') as f:
                f.write(raw_bytes)
            with open(self.print_file_loc + name + '-preprocessed.html', 'wb') as f:
                f.write(output)
        return output

    remove_tags = [
        dict(attrs={'class':['insetButton', 'insettipBox']}),
        dict(name='span', attrs={'data-country-code':True, 'data-ticker-code':True}),
    ]

    use_javascript_to_login = True

    def javascript_login(self, br, username, password):
        """Log in via the standalone WSJ login page using the JS browser."""
        br.visit('https://id.wsj.com/access/pages/wsj/us/login_standalone.html?mg=com-wsj', timeout=120)
        f = br.select_form(nr=0)
        f['username'] = username
        f['password'] = password
        br.submit(timeout=120)

    def populate_article_metadata(self, article, soup, first):
        """Use the first image of the first article as the TOC thumbnail."""
        if first and hasattr(self, 'add_toc_thumbnail'):
            picdiv = soup.find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article, picdiv['src'])

    def preprocess_html(self, soup):
        """Remove thumbnails that only exist to trigger zoomable images."""
        for div in soup.findAll('div', attrs={'class':lambda x: x and 'insetZoomTargetBox' in x.split()}):
            img = div.find('img')
            if img is not None:
                img.extract()
        return soup

    def parse_index(self):
        """Return a fixed two-article index used to reproduce the parsing bug."""
        articles = [
            {  # known to parse
                'title': 'HP_Article',
                'url': 'http://online.wsj.com/articles/hewlett-packard-split-comes-as-more-investors-say-big-isnt-better-1412643100',
                'description': 'A News Article about Hewlett Packard',
                'date': '',
            },
            {  # known NOT to parse with the buggy xpath
                'title': "Opinion_Article",
                'url': 'http://online.wsj.com/articles/bret-stephens-hong-kong-pops-the-china-bubble-1412636585',
                'description': 'An Opinion Article about China Bubble',
                'date': '',
            },
        ]
        return [("This Sample Section", articles)]

    def cleanup(self):
        """Log out so the subscription session is not left open."""
        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')
Attached Files
File Type: zip wsjTest.zip (1.84 MB, 192 views)
dkfurrow is offline   Reply With Quote