Yeah, that works, but now the issue is how to decrypt using the key and IV; the obvious candidate, AES-CTR, doesn't seem to work.
Code:
import base64
import json
from html5_parser import parse
from mechanize import Request
from urllib.parse import urlparse
from calibre import browser
def extract_json_data(raw_html):
    """Pull the encrypted article payload out of an article page.

    The page is parsed with ``html5_parser`` and the JSON text of the
    ``<script id="__NEXT_DATA__">`` element is loaded; everything else is
    read from its ``props.pageProps`` object.

    Args:
        raw_html: The article page markup, as accepted by ``parse``.

    Returns:
        A dict with four entries:
          * ``'url'`` -- ``href`` of the first ``<link rel="canonical">``.
          * ``'encrypted_data'`` -- base64-decoded bytes of
            ``encryptedDataHash.content``.
          * ``'iv'`` -- base64-decoded bytes of ``encryptedDataHash.iv``.
          * ``'encrypted_key'`` -- the ``encryptedDocumentKey`` value exactly
            as found in the JSON (it is not decoded here).

    Raises:
        IndexError: If the ``__NEXT_DATA__`` script or the canonical link is
            not present in the page.
        KeyError: If any of the expected JSON keys is missing.
    """
    root = parse(raw_html)
    next_data = json.loads(root.xpath('//script[@id="__NEXT_DATA__"]')[0].text)
    page_props = next_data['props']['pageProps']
    payload = page_props['encryptedDataHash']
    return {
        'url': root.xpath('//link[@rel="canonical"]')[0].get('href'),
        'encrypted_data': base64.standard_b64decode(payload['content']),
        'iv': base64.standard_b64decode(payload['iv']),
        'encrypted_key': page_props['encryptedDocumentKey'],
    }
def get_browser_for_wsj(*a, **kw):
    """Build a calibre browser preconfigured for requests to wsj.com.

    Any positional or keyword arguments are accepted and ignored.

    The browser gets a fixed set of cookies on the ``.wsj.com`` domain
    (presumably region / privacy-regulation flags -- their effect on the
    server side is not verified here), gzip handling switched on, and two
    extra request headers appended.

    Returns:
        The configured browser object.
    """
    cookie_domain = '.wsj.com'
    cookies = (
        ('wsjregion', 'na,us'),
        ('gdprApplies', 'false'),
        ('ccpaApplies', 'false'),
        ('vcdpaApplies', 'false'),
        ('regulationApplies', 'gdpr%3Afalse%2Ccpra%3Afalse%2Cvcdpa%3Afalse'),
    )

    br = browser()
    for cookie_name, cookie_value in cookies:
        br.set_cookie(cookie_name, cookie_value, cookie_domain)
    br.set_handle_gzip(True)
    br.addheaders += [
        ('Accept', '*/*'),
        ('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8'),
    ]
    return br
def get_decryption_key(br, data, referer='https://www.drudgereport.com/'):
    """Request the document key from ``https://www.wsj.com/client``.

    Args:
        br: Browser object used to send the request; must provide
            ``set_debug_http`` and ``open``.
        data: Mapping that contains ``'encrypted_key'``, which is sent in the
            ``X-Encrypted-Document-Key`` header.
        referer: Value sent as the ``Referer`` header; its URL path is also
            sent as ``X-Original-Url``.

    Returns:
        The base64-decoded ``documentKey`` from the JSON response, as bytes.

    Raises:
        Exception: If opening the request fails with an error that has a
            ``read`` method; the message includes the error body. Other
            errors from ``br.open`` propagate unchanged.
        ValueError: If the response code is not 200, or the response has no
            (or an empty) ``documentKey``.
    """
    from pprint import pprint

    purl = urlparse(referer)
    rq = Request('https://www.wsj.com/client', headers={
        'Cache-Control': 'max-age=0',
        'Referer': referer,
        'X-Encrypted-Document-Key': data['encrypted_key'],
        'X-Original-Host': 'www.wsj.com',
        'X-Original-Referrer': '',
        # NOTE(review): this is the path of the *referer*, not of the article
        # URL -- confirm that is what the endpoint expects.
        'X-Original-Url': purl.path,
    })
    # Debugging aid: switches on the browser's HTTP debug output. It is not
    # switched back off afterwards.
    br.set_debug_http(True)
    try:
        res = br.open(rq)
    except Exception as err:
        if hasattr(err, 'read'):
            # Surface the server's error body, which normally explains the
            # rejection better than the status line alone.
            body = err.read().decode('utf-8', 'replace')
            raise Exception(
                'decryption key request failed with error: {} and body: {}'.format(err, body)
            ) from err
        raise
    if res.code != 200:
        raise ValueError(f'decryption key request returned non OK HTTP result code: {res.code}')
    r = json.loads(res.read())
    # .get() so that a response lacking the key reaches the diagnostic dump
    # below instead of failing early with a bare KeyError.
    key = r.get('documentKey')
    if not key:
        pprint(r)
        raise ValueError('No document key returned')
    return base64.standard_b64decode(key)
def decrypt_article(data):
    """Decrypt the article payload with AES in CTR mode.

    The IV bytes are interpreted as a big-endian integer and used as the
    initial value of a 128-bit counter.

    NOTE(review): whether AES-CTR with this counter layout is the scheme the
    site actually uses is unconfirmed -- verify the output before relying on
    it.

    Args:
        data: Mapping with ``'encrypted_data'`` (ciphertext bytes), ``'iv'``
            (bytes) and ``'key'`` (AES key bytes).

    Returns:
        The bytes produced by ``cipher.decrypt`` for the ciphertext.
    """
    from Crypto.Cipher import AES
    from Crypto.Util import Counter

    ciphertext = data['encrypted_data']
    iv = data['iv']
    # Byte order is passed explicitly: int.from_bytes() only defaults to
    # 'big' from Python 3.11 on and raises TypeError without it before that.
    initial_value = int.from_bytes(iv, 'big')
    # Debug trace: ciphertext length, IV length and the counter start value.
    print(11111111, len(ciphertext), len(iv), initial_value)
    counter = Counter.new(nbits=128, initial_value=initial_value)
    cipher = AES.new(data['key'], AES.MODE_CTR, counter=counter)
    return cipher.decrypt(ciphertext)
def get_wsj_article(url='https://www.wsj.com/world/middle-east/u-n-world-leaders-push-to-get-gaza-aid-flowing-after-biden-pledge-3b59283b'):
    """Fetch an article page and run it through the decryption pipeline.

    Steps: create the WSJ browser, download ``url``, extract the encrypted
    payload from the page, fetch the document key with the same browser,
    then decrypt.

    Args:
        url: Address of the article page to download.

    Returns:
        Whatever ``decrypt_article`` returns for the extracted payload.
    """
    br = get_browser_for_wsj()
    data = extract_json_data(br.open(url).read())
    # The key lookup needs the encrypted key already stored in ``data``; the
    # result is stored next to it for decrypt_article().
    data['key'] = get_decryption_key(br, data)
    return decrypt_article(data)
if __name__ == '__main__':
    # Manual smoke test: decrypt the default article, dump the raw result,
    # then report whether the bytes b'content' occur anywhere in it.
    decrypted = get_wsj_article()
    print(decrypted)
    print(b'content' in decrypted)