MobileRead Forums - View Single Post - Koreader is poor in handling Internet Archive books

DanCa · 11-22-2024, 08:36 PM

The mutool version was the issue. 1.23 does not work, but 1.21 did. Thank you very much!

I have added a script that converts all files in a given location. Warning: I made this to be run on my e-reader, so it doesn't keep the originals.

Spoiler:

Code:

import os
import shutil
import subprocess
# Delete scanned JPX (JPEG 2000) image layers in archive.org pdfs
# Warning: This script does not keep the original files


# Tested with mupdf 1.21. Does not work with 1.23.

# Path to the mutool executable (mutool.exe on Windows); every subprocess
# call in this script invokes this binary. Adjust it to your installation.
mutool = r"C:\mupdf-1.21.0-windows\mutool.exe"

def checkForJPX(filename):
    '''Report whether page 1 of the file is listed with a JPX image.

    Runs ``mutool info <filename> 1`` and returns True when the marker
    '[ JPX ]' appears in the captured standard output, False otherwise.
    '''
    page_info = subprocess.run(
        [mutool, 'info', filename, '1'],
        capture_output=True,
    )
    # stdout is captured as bytes; its str() form keeps the ASCII marker
    # verbatim, so a plain substring test is enough.
    report = str(page_info.stdout)
    return '[ JPX ]' in report


def convertFile(filename):
    '''Remove scanned image layers from archive.org pdfs.

    If the first page of ``filename`` contains a JPX layer, run the
    ``dejazap.js`` script through ``mutool run`` to write a converted copy
    to ``<filename>_tmp.pdf``. The original is replaced by that copy only
    when the conversion succeeded and the copy no longer reports a JPX
    layer. WARNING: on success the original file is overwritten.

    NOTE(review): 'dejazap.js' is passed as a relative path, so presumably
    it must be present in the current working directory -- confirm.
    '''
    print('Working on: ', filename)
    if not checkForJPX(filename):
        print('Does not contain JPX')
        return

    print('Trying to convert')
    tmpfile = filename + '_tmp.pdf'
    result = subprocess.run([mutool, 'run', 'dejazap.js', filename, tmpfile], capture_output=True)

    # A failed run can leave no temp file or a truncated one. checkForJPX
    # would report "no JPX" for either (mutool info prints no marker), so
    # the original must not be replaced unless the run itself succeeded.
    if result.returncode != 0 or not os.path.isfile(tmpfile):
        print('ERROR, conversion failed, keeping original file')
        if result.stderr:
            print(result.stderr.decode(errors='replace'))
        return

    if checkForJPX(tmpfile):
        print('ERROR, file still has JPX layer, keeping temp file')
    else:
        # Overwrite the original with the converted copy.
        shutil.move(tmpfile, filename)
        print('                  ... file converted')




# from https://gist.github.com/TheMatt2/faf5ca760c61a267412c46bb977718fa
def walklevel(path, depth = 1):
    """It works just like os.walk, but you can pass it a depth parameter
       that indicates how deep the recursion will go.
       If depth is 1, only the given directory is listed.
       If depth is 2, the given directory and its direct subdirectories
       are listed, and so on.
       If depth is 0, nothing is returned.
       If depth is -1 (or less than 0), the full depth is walked.

       Yields (root, dirs, files) tuples like os.walk. ``dirs`` is a copy,
       so modifying it does not influence which directories are visited.
       The result does not depend on trailing or doubled separators in
       ``path``.
    """
    # If depth is negative, just walk
    # Not using yield from for python2 compat
    # and copy dirs to keep consistent behavior for depth = -1 and depth = inf
    if depth < 0:
        for root, dirs, files in os.walk(path):
            yield root, dirs[:], files
        return
    elif depth == 0:
        return

    for root, dirs, files in os.walk(path):
        yield root, dirs[:], files
        # Measure the level from the normalized relative path instead of
        # counting separators in the raw strings: raw counts change with
        # trailing or doubled separators in `path`, which shifted the
        # effective depth limit.
        rel = os.path.relpath(root, path)
        if rel == os.curdir:
            level = 1
        else:
            level = rel.count(os.path.sep) + 2
        if level >= depth:
            # Emptying the list os.walk owns stops it from descending further.
            del dirs[:]

if __name__=='__main__':
    # Set inputFolder to your e-reader's location.
    # All pdf documents in this folder (down to max_depth directory levels)
    # are converted unless they don't contain JPX layers.
    # Originals are NOT kept.
    inputFolder = r'F:\\'
    max_depth = 3
    print('Converting files in folder:', inputFolder)
    for root, _, files in walklevel(inputFolder, max_depth):
        for f in files:
            # Compare the extension case-insensitively so files named
            # "BOOK.PDF" are not silently skipped.
            if not f.lower().endswith('.pdf'):
                continue
            convertFile(os.path.join(root, f))