And just for posterity, this is how far I got with reversing the JS. I can extract the encrypted key, the IV, and the encrypted data; the problem is in get_decryption_key(): for some reason the WSJ server isn't returning the decrypted key. The same request in a browser works, so I am guessing there is some cookie missing or the server does some TLS sniffing.
Code:
from html5_parser import parse
import json
from calibre import browser
from mechanize import Request
from urllib.parse import urlparse
def extract_json_data(raw_html):
    """Extract the encrypted article fields from a page's ``__NEXT_DATA__`` JSON.

    The HTML is parsed, the JSON text of the ``<script id="__NEXT_DATA__">``
    element is decoded, and the fields under ``props.pageProps`` are read.

    Args:
        raw_html: The page HTML, as accepted by ``html5_parser.parse``.

    Returns:
        A dict with the keys ``url`` (href of the canonical ``<link>``),
        ``encrypted_data`` and ``iv`` (from ``encryptedDataHash``) and
        ``encrypted_key`` (from ``encryptedDocumentKey``).

    Raises:
        IndexError: If the ``__NEXT_DATA__`` script or the canonical link
            is not present in the document.
        KeyError: If the decoded JSON lacks one of the expected fields.
    """
    root = parse(raw_html)

    scripts = root.xpath('//script[@id="__NEXT_DATA__"]')
    if not scripts:
        # Same exception type that indexing the empty result would raise,
        # but with a message that says what was actually missing.
        raise IndexError('no <script id="__NEXT_DATA__"> element found in page')
    d = json.loads(scripts[0].text)

    page_props = d['props']['pageProps']
    ed = page_props['encryptedDataHash']

    canonical = root.xpath('//link[@rel="canonical"]')
    if not canonical:
        raise IndexError('no <link rel="canonical"> element found in page')

    return {
        'url': canonical[0].get('href'),
        'encrypted_data': ed['content'],
        'iv': ed['iv'],
        'encrypted_key': page_props['encryptedDocumentKey'],
    }
def get_browser_for_wsj(*a, **kw):
    """Build a calibre browser pre-configured for requests to wsj.com.

    Region/privacy cookies are set for the ``.wsj.com`` domain, gzip
    handling is switched on, and ``Accept`` / ``Accept-Language`` headers
    are appended to the browser's default headers.

    Positional and keyword arguments are accepted but not used.

    Returns:
        The configured browser object.
    """
    cookie_domain = '.wsj.com'
    # (name, value) pairs, applied in this order.
    wsj_cookies = (
        ('wsjregion', 'na,us'),
        ('gdprApplies', 'false'),
        ('ccpaApplies', 'false'),
        ('vcdpaApplies', 'false'),
        ('regulationApplies', 'gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse'),
    )
    extra_headers = [
        ('Accept', '*/*'),
        ('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8'),
    ]

    opener = browser()
    for cookie_name, cookie_value in wsj_cookies:
        opener.set_cookie(cookie_name, cookie_value, cookie_domain)
    opener.set_handle_gzip(True)
    opener.addheaders += extra_headers
    return opener
def get_decryption_key(br, data, referer):
    """Request the document key for an encrypted article and return it.

    A request is sent to ``https://www.wsj.com/client`` carrying the
    encrypted key in the ``X-Encrypted-Document-Key`` header and the path
    component of *referer* in ``X-Original-Url``. The JSON response is
    expected to contain a ``documentKey`` field.

    Note that HTTP debugging is enabled on *br* via ``set_debug_http(True)``
    and is not switched off again afterwards.

    Args:
        br: Browser object used to perform the request.
        data: Mapping with an ``encrypted_key`` entry (as produced by
            ``extract_json_data``).
        referer: URL of the article page; sent as the ``Referer`` header.

    Returns:
        The value of ``documentKey`` from the JSON response.

    Raises:
        Exception: If opening the request fails with an error that has a
            readable body; the body is included in the message and the
            original error is chained as the cause.
        ValueError: If the response code is not 200, or the response has
            no (or an empty) ``documentKey``.
    """
    from pprint import pprint

    purl = urlparse(referer)
    rq = Request('https://www.wsj.com/client', headers={
        'Cache-Control': 'max-age=0',
        'Referer': referer,
        'X-Encrypted-Document-Key': data['encrypted_key'],
        'X-Original-Host': 'www.wsj.com',
        'X-Original-Referrer': '',
        'X-Original-Url': purl.path,
    })
    br.set_debug_http(True)
    try:
        res = br.open(rq)
    except Exception as err:
        if hasattr(err, 'read'):
            # HTTP errors carry a response body; surface it for debugging
            # while keeping the original error as the cause.
            raise Exception('decryption key request failed with error: {} and body: {}'.format(
                err, err.read().decode('utf-8', 'replace'))) from err
        raise
    if res.code != 200:
        raise ValueError(f'decryption key request returned non OK HTTP result code: {res.code}')
    r = json.loads(res.read())
    # .get() so that a response lacking the field reaches the diagnostic
    # dump below instead of failing early with a bare KeyError.
    key = r.get('documentKey')
    if not key:
        pprint(r)
        raise ValueError('No document key returned')
    return key
def get_wsj_article(url='https://www.wsj.com/world/middle-east/u-n-world-leaders-push-to-get-gaza-aid-flowing-after-biden-pledge-3b59283b'):
    """Fetch an article page and request its decryption key.

    The page at *url* is downloaded with a WSJ-configured browser, its
    encrypted fields are extracted, and the decryption-key request is
    issued with the same browser. Nothing is returned.

    Args:
        url: Address of the article page to fetch.
    """
    wsj_browser = get_browser_for_wsj()
    response = wsj_browser.open(url)
    article_fields = extract_json_data(response.read())
    # The URL reported by the response (not the one requested) is passed
    # on as the referer.
    get_decryption_key(wsj_browser, article_fields, response.geturl())
# Script entry point: when run directly (not imported), fetch the default
# article and attempt the decryption-key request.
if __name__ == '__main__':
    get_wsj_article()