The mutool version was the issue: 1.23 does not work, but 1.21 does. Thank you very much!
I have added a script that converts all files in a given location. Warning: I made this to be run on my e-reader, so it doesn't keep the originals.
Spoiler:
Code:
import os
import shutil
import subprocess
# Delete the scanned JPX (JPEG 2000) image layers from archive.org PDFs.
# Warning: this script does not keep the original files.
# Tested with mupdf 1.21; it does not work with 1.23.
# Path to the mutool executable (mutool.exe) that the functions below invoke.
mutool = r"C:\mupdf-1.21.0-windows\mutool.exe"
def checkForJPX(filename):
    """Check if the first page of the file contains a JPX layer.

    Runs ``mutool info <filename> 1`` and looks for the ``[ JPX ]`` marker
    in what mutool writes to standard output.

    Args:
        filename: Path of the PDF file to inspect.

    Returns:
        True if the marker appears in mutool's output, False otherwise
        (whatever else mutool printed, including nothing at all).

    Raises:
        OSError: If the mutool executable itself cannot be started.
    """
    info_output = subprocess.run([mutool, 'info', filename, '1'], capture_output=True)
    # stdout is a bytes object; search it directly instead of going through
    # str(), which would match against the repr of the bytes.
    return b'[ JPX ]' in info_output.stdout
def convertFile(filename):
    """Remove scanned image layers from an archive.org pdf, in place.

    If the first page of ``filename`` has a JPX layer, ``mutool run
    dejazap.js`` is asked to write a cleaned copy to ``filename + '_tmp.pdf'``.
    The copy replaces the original only when mutool reported success, the
    copy exists, and the copy no longer shows a JPX layer. In every other
    case the original file is left untouched.

    NOTE: ``dejazap.js`` is passed as a relative path, so it is presumably
    resolved against the current working directory of this script.

    Args:
        filename: Path of the PDF file to convert. The original is
            overwritten on success; no backup is kept.
    """
    print('Working on: ', filename)
    if not checkForJPX(filename):
        print('Does not contain JPX')
        return

    print('Trying to convert')
    tmpfile = filename + '_tmp.pdf'
    result = subprocess.run([mutool, 'run', 'dejazap.js', filename, tmpfile], capture_output=True)

    # A failed run can leave no temp file or a truncated one. Such a file
    # shows no JPX marker either, so without this guard it would be moved
    # over the only copy of the original (or shutil.move would raise and
    # abort the whole batch).
    if result.returncode != 0 or not os.path.isfile(tmpfile):
        print('ERROR, conversion failed, keeping original file')
        details = result.stderr.decode(errors='replace').strip()
        if details:
            print(details)
        return

    if checkForJPX(tmpfile):
        print('ERROR, file still has JPX layer, keeping temp file')
        return

    shutil.move(tmpfile, filename)
    print(' ... file converted')
# adapted from https://gist.github.com/TheMatt2/faf5ca760c61a267412c46bb977718fa
def walklevel(path, depth = 1):
    """It works just like os.walk, but you can pass it a level parameter
    that indicates how deep the recursion will go.
    If depth is 1, only the given directory is listed.
    If depth is 2, the given directory and its immediate subdirectories
    are listed, and so on.
    If depth is 0, nothing is returned.
    If depth is -1 (or less than 0), the full depth is walked.

    Yields (root, dirs, files) tuples like os.walk. ``dirs`` is always a
    copy, so modifying it does not influence the walk, and it still names
    the subdirectories of a root that is not descended into.
    """
    if depth == 0:
        return
    # If depth is negative, just walk; dirs is copied to keep consistent
    # behavior between depth = -1 and depth = inf.
    if depth < 0:
        for root, dirs, files in os.walk(path):
            yield root, dirs[:], files
        return
    top = os.fspath(path)
    for root, dirs, files in os.walk(top):
        yield root, dirs[:], files
        # os.walk builds every root as top + joined names, so the part
        # after top tells how far below top this root is. Measuring it
        # this way gives the same answer whether or not path was passed
        # with a trailing separator.
        below = root[len(top):].strip(os.path.sep)
        level = below.count(os.path.sep) + 1 if below else 0
        if level + 1 >= depth:
            # Emptying the live list stops os.walk from descending further.
            del dirs[:]
if __name__=='__main__':
    # Set inputFolder to your e-reader's location.
    # Every pdf document found under this folder (down to max_depth levels)
    # is converted unless it contains no JPX layer. Originals are NOT kept.
    # A raw string cannot end in a single backslash, hence the doubled one.
    inputFolder = r'F:\\'
    max_depth = 3
    print('Converting files in folder:', inputFolder)
    for root, _, files in walklevel(inputFolder, max_depth):
        for f in files:
            # Match the extension case-insensitively so that files named
            # e.g. "BOOK.PDF" are not silently skipped.
            if not f.lower().endswith('.pdf'):
                continue
            convertFile(os.path.join(root, f))