MobileRead Forums - View Single Post - Joshua Tallent on the Single Epub Challenge

mattmc · 08-17-2015, 11:33 PM

Quote:

Originally Posted by dgatwood

I think so, now that you mention it. At least I can't think of any cases where that wouldn't work.

For fun, I took a crack at #2 and #3 last week. I ran into a bit of a wall where I couldn't do anything useful with the results from within a web browser, and I also ran into some data loss because Safari's CSS objects only include the bits that Safari uses, and leaves out properties that are specific to other browsers, but it might be useful as a starting point.

Nice! That's some nice JS

I do understand the wall there, though--I think it's best to stay away from the browser on this one, due to quirks and portability issues. I ended up writing #4 in a script that just processes the files on-disk in an unzipped ePub, basically. With the Cheerio module this is fairly easy:

Iterate the CSS rules
Find any selectors for those rules that are "too complex" for Kindlegen
Swap the complex selectors for simplified class selectors
In the HTML, use Cheerio to select elements that the complex selectors match
Add the simple classes to those elements

So something like p > span becomes .p-span, voila.

I put it here as a Gist with syntax highlighting, or you can see it here:

Spoiler:

Code:

#!/usr/bin/env node
// The "shebang" line above tells the operating system to run this file with Node.js when it is executed directly.

// Import NodeJS packages
var argv = require('yargs').argv;   // for processing command-line arguments
var fs = require("fs");             // filesystem
var path = require('path');         // for resolving filesystem paths
var wrench = require('wrench');     // for deep recursive copying
var css = require('css');           // for parsing CSS into an AST
var walk = require('rework-walk'); // for walking ASTs generated by the css module
var cheerio = require('cheerio');   // jquery-like access to an HTML document

// Fallback for runtimes that lack a native String.prototype.endsWith.
// The native implementation is left untouched whenever it exists.
if (typeof String.prototype.endsWith !== 'function') {
    String.prototype.endsWith = function(suffix) {
        // Start searching at the only offset where a trailing match could begin.
        var searchFrom = this.length - suffix.length;
        var foundAt = this.indexOf(suffix, searchFrom);
        return foundAt !== -1;
    };
}

/**
 * Fetches all files in a folder, recursively.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths of every non-directory entry found beneath `dir`,
 *     each built as `dir + '/' + name`.
 */
function filesRecursive(dir) {
    var results = [];
    fs.readdirSync(dir).forEach(function(name) {
        var fullPath = dir + '/' + name;
        if (fs.statSync(fullPath).isDirectory()) {
            // Recurse with this function. The name `walk` in this file refers to
            // the rework-walk AST walker, not a directory walker.
            results = results.concat(filesRecursive(fullPath));
        } else {
            results.push(fullPath);
        }
    });
    return results;
}

/**
 * Same as filesRecursive, but keeps only the files whose path ends with `ext`.
 *
 * The match is a plain suffix test, so "html" also matches ".xhtml" files;
 * pass ".css" rather than "css" to require the dot.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} ext - Suffix a path must end with to be kept.
 * @returns {string[]} The matching file paths.
 */
function filesRecursiveWithExt(dir, ext){
    // List files with filesRecursive. The name `walk` in this file refers to
    // the rework-walk AST walker and cannot list a directory.
    return filesRecursive(dir).filter(function(filename){
        return filename.endsWith(ext);
    });
}

// Read-modify-write helper.
// Without a callback: returns the file's text ('' when filePath is a directory).
// With a callback: invokes callback(text, stats) and, when filePath is a regular
// entry and the callback returned something truthy, writes that value back to
// filePath. Nothing is returned in that case.
function readWrite(filePath, callback){
    var stats = fs.statSync(filePath);
    var isDir = stats.isDirectory();
    var original = isDir ? '' : fs.readFileSync(filePath).toString();

    if (!callback) {
        return original;
    }

    var updated = callback(original, stats);
    if (!isDir && updated) {
        fs.writeFileSync(filePath, updated);
    }
}

/////////////////////
// MAIN SCRIPT STARTS
/////////////////////

// Get the path to the epub directory, as an argument
var targetDirectory = argv._[0];
if (!targetDirectory) {
    console.error("Please specify the directory of the epub you want to work with.");
    // Without a directory there is nothing to process, so stop here with a
    // failing status instead of falling through to the path handling below.
    process.exit(1);
}
var scriptDir = path.dirname(require.main.filename); // current directory of the main script
var resolvedTargetDir = path.resolve(targetDirectory);

// Clone the directory so we don't taint the original.
// All later steps operate on the "<target>_kf7" copy only.
var newDirectory = resolvedTargetDir + '_kf7';
wrench.copyDirSyncRecursive(resolvedTargetDir, newDirectory, {
    forceDelete: true // overwrites any "_kf7" directory that's already there
});

// Simplify our CSS rules

// A selector counts as "too complex" for Kindlegen when it contains whitespace
// or an explicit combinator; this also catches combinators written without
// surrounding spaces, such as "p>span".
var COMPLEX_SELECTOR_PATTERN = /[\s>+~]/;

/**
 * Flattens a complex selector into a single class selector.
 *
 * Every occurrence of each special character is translated (not just the
 * first), so distinct combinators yield distinct class names, e.g.
 * "p > span" -> ".p-chd-span" and "p span" -> ".p-span".
 *
 * @param {string} sel - The original selector text.
 * @returns {string} A class selector, including its leading ".".
 */
function simplifySelector(sel) {
    var flattened = sel.trim().split(/\s+/).join('-')
        .replace(/#/g, '-id-')
        .replace(/\./g, '-clz-')
        .replace(/>/g, '-chd-')
        .replace(/\+/g, '-adj-')
        .replace(/~/g, '-pre-')
        .replace(/\[/g, '-lbr-')
        .replace(/\]/g, '-rbr-')
        // Anything else that is not safe in a plain class name (quotes, "=",
        // ":", "*", ...) becomes a dash so the result stays a valid selector.
        .replace(/[^A-Za-z0-9_-]/g, '-')
        .replace(/-{2,}/g, '-');
    return '.' + flattened;
}

// Maps each complex selector to its simplified class selector
// (for later adjustments we do in the markup).
var complexSelectorMap = {};
var cssFiles = filesRecursiveWithExt(newDirectory, "css");
cssFiles.forEach(function(filename){
    readWrite(filename, function(content, stats){
        if (!content) return;

        // Parse the CSS into an AST that can be walked
        var ast = css.parse(content);

        // Walk the AST
        walk(ast.stylesheet, function(rule, node){

            // Only nodes that carry a selector list are of interest.
            if (!rule.selectors) return;

            // Selectors that are already simple stay where they are.
            var kept = rule.selectors.filter(function(sel){
                return !COMPLEX_SELECTOR_PATTERN.test(sel);
            });

            // Complex selectors are replaced by their simplified class,
            // appended after the selectors that were kept.
            var added = rule.selectors.filter(function(sel){
                return COMPLEX_SELECTOR_PATTERN.test(sel);
            }).map(function(sel){
                var newSel = simplifySelector(sel);
                complexSelectorMap[sel] = newSel;
                return newSel;
            });

            rule.selectors = kept.concat(added);
        });

        // Stringify the modified AST and return it, so it gets written back to the file
        return css.stringify(ast);
    });
});

// Now convert everything in the markup to the simpler selectors, as it were
var htmlFiles = filesRecursiveWithExt(newDirectory, "html"); // will include "xhtml"
htmlFiles.forEach(function(filename){
    readWrite(filename, function(content, stats){
        if (!content) return;

        // Load up everything into Cheerio!
        var $ = cheerio.load(content, {
            xmlMode: true
        });

        // Find everything that the complex selectors applied to, and stick the simpler class on them
        Object.keys(complexSelectorMap).forEach(function(key) {
            // The map stores class *selectors* (".foo"), but addClass expects a
            // bare class *name* ("foo"); with the dot left on, the element would
            // get a class literally named ".foo" that the CSS rule never matches.
            var className = complexSelectorMap[key].replace(/^\./, '');
            try {
                $(key).addClass(className);
            } catch (err) {
                // NOTE(review): presumably cheerio throws for selectors it cannot
                // evaluate statically (e.g. pseudo-elements) - confirm. Report it
                // and carry on rather than aborting the whole conversion.
                console.warn('Skipping selector "' + key + '" in ' + filename + ': ' + err.message);
            }
        });

        // Serialize the modified document so it gets written back to the file
        return $.xml();
    });
});

I do want to evolve it and probably make it into a proper NPM package with tests and all that, but I figured I'd post my immediate results.

-----

Okay, now there's the question of #2 and #3. You basically mention elements that have multiple classes, but I think for a truly universal solution a more complex approach is required. Correct me if I'm wrong, but it's not so much elements with multiple classes as it is elements that multiple selectors apply to, right?

Like, what if you have span.blah and you have a <p class="blah"> for whatever reason? The selector wouldn't actually apply in that scenario, but if you were just looking at classes, you would think it did.

Or if you have selectors #super and .duper, and element <p class="duper" id="super"> then both rules would apply to that element.

It's really all dependent on what kind of CSS is being used by the book creator; if you're just using classes then obviously that's fine, I'm just thinking it all the way through to the conclusion.

I suppose if you already got rid of all of the complex selectors, basically anything matching /[>+~\[\] ]/ (the child combinator > counts too, as in the p > span example above), then all you have to worry about is IDs and classes? So you could walk the DOM with that in mind, I suppose.