View Single Post
Old 11-21-2023, 12:24 AM   #35
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 45,450
Karma: 27757438
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
Yeah, that works, but now the issue is how to decrypt using the key and IV. The obvious candidate, AES-CTR, doesn't seem to work.

Code:
import base64
import json
from html5_parser import parse
from mechanize import Request
from urllib.parse import urlparse

from calibre import browser


def extract_json_data(raw_html):
    """Pull the encrypted article payload out of an article page.

    Parses ``raw_html`` with ``html5_parser.parse``, loads the JSON held in
    the ``<script id="__NEXT_DATA__">`` element and collects the fields
    needed for decryption from ``props.pageProps``.

    Args:
        raw_html: The page markup, in whatever form ``parse`` accepts.

    Returns:
        A dict with the keys ``url`` (``href`` of the canonical ``<link>``),
        ``encrypted_data`` and ``iv`` (both base64-decoded to bytes) and
        ``encrypted_key`` (the ``encryptedDocumentKey`` value, unchanged).

    Raises:
        ValueError: If the page has no ``__NEXT_DATA__`` script or no
            canonical link element.
        KeyError: If the embedded JSON lacks one of the expected fields.
    """
    root = parse(raw_html)

    scripts = root.xpath('//script[@id="__NEXT_DATA__"]')
    if not scripts:
        # Fail with a message that names the problem instead of a bare
        # IndexError from indexing an empty xpath result.
        raise ValueError('No __NEXT_DATA__ script element found in page')
    d = json.loads(scripts[0].text)

    page_props = d['props']['pageProps']
    ed = page_props['encryptedDataHash']

    canonical = root.xpath('//link[@rel="canonical"]')
    if not canonical:
        raise ValueError('No canonical link element found in page')

    return {
        'url': canonical[0].get('href'),
        'encrypted_data': base64.standard_b64decode(ed['content']),
        'iv': base64.standard_b64decode(ed['iv']),
        'encrypted_key': page_props['encryptedDocumentKey'],
    }


def get_browser_for_wsj(*a, **kw):
    """Create a calibre browser preconfigured for wsj.com requests.

    Sets the region/privacy-regulation cookies on the ``.wsj.com`` domain,
    enables gzip handling and appends ``Accept`` / ``Accept-Language``
    headers. Positional and keyword arguments are accepted but not used.

    Returns:
        The configured browser object.
    """
    cookie_domain = '.wsj.com'
    cookies = (
        ('wsjregion', 'na,us'),
        ('gdprApplies', 'false'),
        ('ccpaApplies', 'false'),
        ('vcdpaApplies', 'false'),
        ('regulationApplies', 'gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse'),
    )
    extra_headers = [
        ('Accept', '*/*'),
        ('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8'),
    ]

    br = browser()
    for cookie_name, cookie_value in cookies:
        br.set_cookie(cookie_name, cookie_value, cookie_domain)
    br.set_handle_gzip(True)
    br.addheaders += extra_headers
    return br


def get_decryption_key(br, data, referer='https://www.drudgereport.com/'):
    """Request the document key for an encrypted article.

    Sends a request to ``https://www.wsj.com/client`` carrying the
    encrypted document key from ``data`` in a header, and decodes the
    ``documentKey`` field of the JSON response.

    Args:
        br: Browser object used to perform the request. HTTP debugging is
            switched on for it and is left on afterwards.
        data: Mapping with an ``encrypted_key`` entry (as produced by
            ``extract_json_data``).
        referer: URL sent as the ``Referer`` header; its path component is
            also sent as ``X-Original-Url``.

    Returns:
        The base64-decoded document key as bytes.

    Raises:
        Exception: If the request fails with an error object that exposes
            ``read()``; the message includes the error body. Other request
            errors propagate unchanged.
        ValueError: If the response code is not 200, or the response
            carries no (non-empty) ``documentKey``.
    """
    from pprint import pprint

    # NOTE(review): this is the path of the *referer*, not of the article
    # URL (data['url']) -- confirm which one the endpoint expects.
    purl = urlparse(referer)
    rq = Request('https://www.wsj.com/client', headers={
        'Cache-Control': 'max-age=0',
        'Referer': referer,
        'X-Encrypted-Document-Key': data['encrypted_key'],
        'X-Original-Host': 'www.wsj.com',
        'X-Original-Referrer': '',
        'X-Original-Url': purl.path,
    })
    # Debugging aid kept from the original: enables the browser's HTTP
    # debug output for this and all later requests made through br.
    br.set_debug_http(True)
    try:
        res = br.open(rq)
    except Exception as err:
        if hasattr(err, 'read'):
            # HTTP errors carry a response body; surface it in the message
            # and keep the original error as the explicit cause.
            body = err.read().decode('utf-8', 'replace')
            raise Exception(
                'decryption key request failed with error: {} and body: {}'.format(err, body)
            ) from err
        raise
    if res.code != 200:
        raise ValueError(f'decryption key request returned non OK HTTP result code: {res.code}')
    r = json.loads(res.read())
    # Use a guarded lookup so a response without 'documentKey' reaches the
    # diagnostic branch below instead of dying early with a KeyError.
    key = r.get('documentKey') if isinstance(r, dict) else None
    if not key:
        pprint(r)
        raise ValueError('No document key returned')
    return base64.standard_b64decode(key)


def decrypt_article(data):
    """Decrypt the article payload with AES in CTR mode.

    The IV is interpreted as a big-endian integer and used as the initial
    value of a 128-bit counter.

    Args:
        data: Mapping with ``encrypted_data`` (ciphertext bytes), ``iv``
            (bytes) and ``key`` (AES key bytes).

    Returns:
        The output of the CTR decryption as bytes.
    """
    from Crypto.Cipher import AES
    from Crypto.Util import Counter

    # Pass the byte order explicitly: int.from_bytes() only gained a
    # default for it in Python 3.11 and raises TypeError on older versions.
    initial_value = int.from_bytes(data['iv'], 'big')
    # NOTE(review): this assumes plain AES-CTR with the IV as the full
    # initial counter block. If the payload was produced with a different
    # mode (e.g. AES-GCM, which typically uses a 12-byte IV), the output
    # will be garbage -- confirm the cipher mode used by the site.
    counter = Counter.new(nbits=128, initial_value=initial_value)
    cipher = AES.new(data['key'], AES.MODE_CTR, counter=counter)
    return cipher.decrypt(data['encrypted_data'])


def get_wsj_article(url='https://www.wsj.com/world/middle-east/u-n-world-leaders-push-to-get-gaza-aid-flowing-after-biden-pledge-3b59283b'):
    """Download an article page and return its decrypted payload.

    Fetches ``url`` with a browser from ``get_browser_for_wsj``, extracts
    the encrypted payload from the page, obtains the document key with the
    same browser and returns the result of ``decrypt_article``.

    Args:
        url: Address of the article page to fetch.

    Returns:
        Whatever ``decrypt_article`` returns for the extracted payload.
    """
    wsj_browser = get_browser_for_wsj()
    page_markup = wsj_browser.open(url).read()
    payload = extract_json_data(page_markup)
    # The key request reuses the same browser as the page fetch.
    payload['key'] = get_decryption_key(wsj_browser, payload)
    return decrypt_article(payload)



if __name__ == '__main__':
    # Run the whole pipeline on the default article and dump the result.
    data = get_wsj_article()
    print(data)
    # Report whether the byte string b'content' occurs in the output.
    has_content_marker = b'content' in data
    print(has_content_marker)
kovidgoyal is offline   Reply With Quote