MobileRead Forums - View Single Post - [Old Thread] Auto Extract ISBN-Feature request

tnt85 · 05-03-2010, 12:10 PM

Hello

You can extract the ISBN from a PDF with Acrobat JavaScript and then simply import the files into calibre.

I used the following Acrobat JavaScript; though it's not perfect, it can be very useful:

/* Extract ISBN: entry point of the script.
   Runs ExtractFromDocument (defined below) on the current document,
   asking it to scan page indices 0 through 24 (the end index is exclusive). */

ExtractFromDocument(0,25);

/**
 * Looks for an ISBN in the text of the current document (`this`) and saves a
 * copy of the document under "c:\data\" named after that ISBN.
 *
 * Pages [start, end) are scanned first, clamped to the pages that actually
 * exist, and then the last five pages of the document (pages already covered
 * by the first range are not read twice). When several distinct ISBNs are
 * found, the last one (in first-seen order) is used as the file name. When no
 * ISBN is found, the copy keeps the document's own base name.
 *
 * The ISBN is normalised for the file name: separators (spaces, hyphens,
 * en dashes) are removed and a trailing check character "x" is upper-cased.
 *
 * On any error the document's base name is printed to the console so that
 * failing files can be listed. The document is closed afterwards if
 * `this.disclosed` is set.
 *
 * @param {number} start  Zero-based index of the first page to scan.
 * @param {number} end    Zero-based index one past the last page to scan.
 */
function ExtractFromDocument(start,end)
{
    // "ISBN" (optionally followed by -10 / -13) or the spelled-out label,
    // an optional edition qualifier, then the number itself in group 1:
    // 13 digits, or 9 digits plus a digit/X check character, each digit
    // optionally followed by one separator. The trailing (?!\d) stops a
    // 10-digit ISBN followed by another number from being read as 13 digits.
    var reMatch = /(?:ISBN[ \-\u2013]*(?:10|13)?|International Standard Book Number)[:\s]*(?:, PDF ed\.|, print ed\.|\(pbk\)|\(electronic\))?[:\s]*((?:\d[-\u2013 ]?){12}\d|(?:\d[-\u2013 ]?){9}[\dxX])(?!\d)/gi;

    var doc = this;
    var found = [];   // distinct normalised ISBNs, in first-seen order
    var seen = {};    // lookup table: "#" + isbn -> true
    var fname = "";   // base name of the document; also printed on error

    // Reads one page's text and appends every new ISBN on it to `found`.
    function scanPage(page) {
        var numWords = doc.getPageNumWords(page);
        var pageText = "";
        for (var j = 0; j < numWords; j++) {
            // bStrip = false keeps punctuation and whitespace, so the
            // concatenation reproduces the running text of the page.
            pageText += doc.getPageNthWord(page, j, false);
        }

        reMatch.lastIndex = 0; // the /g regex is reused for every page
        var m;
        while ((m = reMatch.exec(pageText)) !== null) {
            var isbn = m[1].replace(/[^\dxX]/g, "").toUpperCase();
            if (!seen["#" + isbn]) {
                seen["#" + isbn] = true;
                found.push(isbn);
            }
        }
    }

    try {
        // Base name: last path segment without its ".pdf" extension
        // (matched case-insensitively so ".PDF" works too).
        fname = doc.path.replace(/^.*[:\/]/, "").replace(/\.pdf$/i, "");
        var filename = fname;

        var pageCount = doc.numPages;
        var firstStart = Math.max(0, start);
        var firstEnd = Math.min(end, pageCount); // never ask for pages past the end
        var page;

        for (page = firstStart; page < firstEnd; page++) {
            scanPage(page);
        }

        // Always scan the last five pages as well, whatever the first range
        // found, skipping pages the first range has already read.
        for (page = Math.max(pageCount - 5, 0); page < pageCount; page++) {
            if (page < firstStart || page >= firstEnd) {
                scanPage(page);
            }
        }

        if (found.length > 0) {
            filename = found[found.length - 1];
        }

        // NOTE(review): the Acrobat API describes saveAs' cPath as a
        // device-independent path (e.g. "/c/data/..."); the literal below is
        // kept unchanged - confirm it works in the target Acrobat version.
        if (filename.length >= 1) doc.saveAs("c:\\data\\" + filename + ".pdf");
    }
    catch(e)
    {
        // Print the name of the file that could not be processed.
        console.println(fname);
    }
    finally
    {
        if (doc.disclosed) doc.closeDoc();
    }

} // end of the function

05-03-2010, 12:10 PM	#15
tnt85 Junior Member Posts: 1 Karma: 10 Join Date: May 2010 Device: kindle	Hello You can extract isbn from pdf with acrobat javascript, that simply import files in calibre. I used following acrobat javascript, though its not perfect it can be very useful: /* Extract isbn / ExtractFromDocument(0,25); function ExtractFromDocument(start,end) { var chWord, numWords; var Out = new Object(); var reMatch = /(?:ISBN[ -–](?:\|10\|13)\|International Standard Book Number)[:\s]?(?:\|, PDF ed.\|, print ed.\|\(pbk\)\|\(electronic\))[:\s]?[\d][-– ]?[\dxX]/gi; // construct filename for output document var i = this.path.search(/[^:/]+\.pdf$/); var fname = this.path.slice(i, this.path.length - 4); var filename = fname; var lastPages = 0; try { for (var i = start; i < end; i++) { numWords = this.getPageNumWords(i); var PageText = ""; for (var j = 0; j < numWords; j++) { var word = this.getPageNthWord(i,j,false); PageText += word; } var strMatches = PageText.match(reMatch); if (strMatches == null) continue; for (j = 0; j < strMatches.length; j++) { Out[strMatches[j]] = true; } if (i == end -1 && lastPages == 0){ //scan last 5 pages lastPages = 1; i = this.numPages-5; end = this.numPages+1; } } var nTotal = 0; for (var prop in Out) { var temp = 0; prop = prop.replace(/(isbn)([- ](10\|13))?/gi,""); prop = prop.replace(/[\r\n:a-wyz/(/)]/gi,""); if (nTotal == 0) filename = prop; //if (nTotal >= 1) continue; //if (nTotal >= 1) filename += ","+prop; //console.println("*"+prop+""); //nTotal++; } if (filename.length >= 1) this.saveAs("c:\\data\\" + filename + ".pdf"); if (this.disclosed) this.closeDoc(); } catch(e) { //console.println("Processing error: "+e.message+" "+filename); //print files with some error. console.println(fname); if (this.disclosed) this.closeDoc(); } } // end of the function Last edited by tnt85; 05-03-2010 at 12:17 PM.*