View Single Post
Old 11-20-2023, 05:27 AM   #33
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 45,451
Karma: 27757438
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
And just for posterity, this is how far I got with reversing the JS. I can extract the encrypted key, the IV and the encrypted data; the problem is in get_decryption_key(): for some reason the WSJ server isn't returning the decrypted key. The same request in a browser works, so I am guessing there is some cookie missing or the server does some TLS sniffing.

Code:
from html5_parser import parse
import json
from calibre import browser
from mechanize import Request
from urllib.parse import urlparse


def extract_json_data(raw_html):
    """Extract the encrypted article payload from a WSJ article page.

    The page is parsed and the JSON stored in the
    ``<script id="__NEXT_DATA__">`` element is decoded; the encrypted fields
    are read from its ``props.pageProps`` object.

    :param raw_html: The HTML of the article page, passed as-is to ``parse()``.
    :return: A dict with the keys ``'url'`` (the ``href`` of the page's
        ``<link rel="canonical">``), ``'encrypted_data'``, ``'iv'`` and
        ``'encrypted_key'``.
    :raises IndexError: If the ``__NEXT_DATA__`` script or the canonical link
        is not present in the page.
    :raises KeyError: If the embedded JSON lacks one of the expected fields.
    """
    root = parse(raw_html)
    next_data = json.loads(root.xpath('//script[@id="__NEXT_DATA__"]')[0].text)
    page_props = next_data['props']['pageProps']
    encrypted_hash = page_props['encryptedDataHash']
    encrypted_data = encrypted_hash['content']
    iv = encrypted_hash['iv']
    encrypted_key = page_props['encryptedDocumentKey']
    # Looked up last, as before, so that the first failure reported for a
    # malformed page is unchanged.
    url = root.xpath('//link[@rel="canonical"]')[0].get('href')
    return {
        'url': url,
        'encrypted_data': encrypted_data,
        'iv': iv,
        'encrypted_key': encrypted_key,
    }


def get_browser_for_wsj(*a, **kw):
    """Build a calibre browser configured for talking to wsj.com.

    Any positional or keyword arguments are accepted and ignored.

    The browser gets a fixed set of region/consent cookies for the
    ``.wsj.com`` domain, gzip handling enabled, and ``Accept`` /
    ``Accept-Language`` headers appended to its default headers.

    :return: The configured browser object.
    """
    cookie_domain = '.wsj.com'
    wsj_cookies = (
        ('wsjregion', 'na,us'),
        ('gdprApplies', 'false'),
        ('ccpaApplies', 'false'),
        ('vcdpaApplies', 'false'),
        ('regulationApplies', 'gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse'),
    )
    extra_headers = [
        ('Accept', '*/*'),
        ('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8'),
    ]

    br = browser()
    for cookie_name, cookie_value in wsj_cookies:
        br.set_cookie(cookie_name, cookie_value, cookie_domain)
    br.set_handle_gzip(True)
    br.addheaders += extra_headers
    return br


def get_decryption_key(br, data, referer):
    """Ask the WSJ server for the decrypted document key of an article.

    A request is sent to ``https://www.wsj.com/client`` with the encrypted
    document key in the ``X-Encrypted-Document-Key`` header, and the
    ``documentKey`` field of the JSON response is returned.

    :param br: Browser object used to send the request; it must provide
        ``set_debug_http()`` and ``open()``.
    :param data: Mapping containing an ``'encrypted_key'`` entry, such as the
        dict returned by ``extract_json_data()``.
    :param referer: URL of the article page. It is sent as the ``Referer``
        header and its path component as ``X-Original-Url``.
    :return: The ``documentKey`` value from the server's JSON response.
    :raises ValueError: If the response code is not 200, or the response
        contains no (or an empty) ``documentKey``.
    :raises Exception: If opening the request fails with an error that has a
        readable body; the body is included in the message and the original
        error is chained as the cause. Other errors propagate unchanged.
    """
    from pprint import pprint
    purl = urlparse(referer)
    rq = Request('https://www.wsj.com/client', headers={
        'Cache-Control': 'max-age=0',
        'Referer': referer,
        'X-Encrypted-Document-Key': data['encrypted_key'],
        'X-Original-Host': 'www.wsj.com',
        'X-Original-Referrer': '',
        'X-Original-Url': purl.path,
    })
    # Debugging aid: turn on the browser's HTTP debug output for this request.
    br.set_debug_http(True)
    try:
        res = br.open(rq)
    except Exception as err:
        if hasattr(err, 'read'):
            # The error carries a response body; surface it in the message
            # and keep the original error as the cause.
            raise Exception('decryption key request failed with error: {} and body: {}'.format(
                err, err.read().decode('utf-8', 'replace'))) from err
        raise
    if res.code != 200:
        raise ValueError(f'decryption key request returned non OK HTTP result code: {res.code}')
    r = json.loads(res.read())
    # Use .get() so a response without the field is reported the same way as
    # one with an empty value, instead of failing with a bare KeyError.
    key = r.get('documentKey')
    if not key:
        pprint(r)
        raise ValueError('No document key returned')
    return key


def get_wsj_article(url='https://www.wsj.com/world/middle-east/u-n-world-leaders-push-to-get-gaza-aid-flowing-after-biden-pledge-3b59283b'):
    """Fetch a WSJ article page and request the decryption key for it.

    The page at *url* is downloaded with the browser from
    ``get_browser_for_wsj()``, its encrypted payload is extracted with
    ``extract_json_data()``, and ``get_decryption_key()`` is then called with
    the URL reported by the response as the referer.

    :param url: Address of the article to fetch.
    """
    wsj_browser = get_browser_for_wsj()
    response = wsj_browser.open(url)
    article_data = extract_json_data(response.read())
    get_decryption_key(wsj_browser, article_data, response.geturl())



# Script entry point: when run directly, fetch the default article URL.
if __name__ == '__main__':
    get_wsj_article()
kovidgoyal is offline   Reply With Quote