View Single Post
Old 05-03-2010, 12:10 PM   #15
tnt85
Junior Member
tnt85 began at the beginning.
 
Posts: 1
Karma: 10
Join Date: May 2010
Device: kindle
Hello

You can extract the ISBN from a PDF with Acrobat JavaScript and then simply import the renamed files into calibre.

I used the following Acrobat JavaScript; though it's not perfect, it can be very useful:

/*
 * Extract isbn
 *
 * Entry point of the script: runs ExtractFromDocument against the current
 * document (the function reads the document through `this`). The arguments
 * are a zero-based, end-exclusive page range, so (0,25) asks for the first
 * 25 pages; the function may additionally look at pages near the end of the
 * document. It then saves the document under a name derived from what it
 * found.
 */


ExtractFromDocument(0,25);

/**
 * Look for an ISBN in the document bound to `this` and save a copy of the
 * document named after it.
 *
 * Pages [start, end) are searched first; if nothing usable is found there,
 * the last five pages of the document are searched as well. The first
 * well-formed ISBN in page order wins. When no ISBN is found, the copy keeps
 * the document's original base name. If anything throws while scanning or
 * saving, the original base name is printed to the console so failed files
 * can be listed, and no exception escapes from that part.
 *
 * Written in pre-ES2015 syntax (var, no arrow functions) on purpose, because
 * the script is meant to run inside Acrobat's JavaScript engine.
 *
 * @param {number} start      First page to scan (zero-based, inclusive).
 * @param {number} end        Page to stop at (zero-based, exclusive).
 * @param {string} [outputDir] Folder prefix for the saved copy; must end with
 *                             a path separator. Defaults to "c:\\data\\".
 */
function ExtractFromDocument(start, end, outputDir)
{
var doc = this;
var TAIL_PAGES = 5;
if (outputDir === undefined) outputDir = "c:\\data\\";

// Base name of the document without directory or extension. The extension
// test is case-insensitive so "BOOK.PDF" is handled like "book.pdf".
var nameMatch = /([^:\/]+)\.pdf$/i.exec(doc.path);
var fname = nameMatch ? nameMatch[1] : "";
var filename = fname;

try {
    // Clamp both ranges to the document so short documents are never asked
    // for pages they do not have, and so the tail never re-reads head pages.
    var pageCount = doc.numPages;
    var headStart = Math.max(start, 0);
    var headEnd = Math.min(end, pageCount);
    var tailStart = Math.max(pageCount - TAIL_PAGES, headEnd, 0);

    var isbn = findIsbnInPages(doc, headStart, headEnd);
    if (isbn === "") isbn = findIsbnInPages(doc, tailStart, pageCount);
    if (isbn !== "") filename = isbn;

    if (filename.length >= 1) doc.saveAs(outputDir + filename + ".pdf");
}
catch (e)
{
    // Deliberately best-effort: only list the file that could not be
    // processed, so a batch run keeps going.
    console.println(fname);
}
finally
{
    if (doc.disclosed) doc.closeDoc();
}

} // end of the function

/**
 * Return the first well-formed ISBN found on pages [fromPage, toPage) of
 * `doc`, or "" when none of those pages contains one.
 */
function findIsbnInPages(doc, fromPage, toPage)
{
for (var page = fromPage; page < toPage; page++) {
    var wordCount = doc.getPageNumWords(page);
    var words = [];
    for (var w = 0; w < wordCount; w++) {
        // Third argument false: keep each word's punctuation and spacing,
        // which the ISBN pattern relies on.
        words.push(doc.getPageNthWord(page, w, false));
    }
    var isbn = firstIsbnInText(words.join(""));
    if (isbn !== "") return isbn;
}
return "";
}

/**
 * Return the first ISBN in `text` as a compact string (digits plus an
 * optional trailing "X", no separators), or "" when there is none.
 *
 * A candidate must follow an "ISBN" / "International Standard Book Number"
 * label and is accepted only if it is 10 characters long (last one may be X)
 * or 13 digits starting with 978 or 979.
 */
function firstIsbnInText(text)
{
// Group 1 is the number itself: 10 to 13 digits with at most one separator
// (space, hyphen or en dash, \u2013) after each digit. The trailing (?!\d)
// stops the match from ending in the middle of a longer digit run.
var pattern = /(?:ISBN[ \-\u2013]*(?:10|13)?|International Standard Book Number)[:\s]?(?:, PDF ed\.|, print ed\.|\(pbk\)|\(electronic\))?[:\s]?((?:\d[ \-\u2013]?){9,12}[\dxX])(?!\d)/gi;
// Older engines reuse one RegExp object per literal, so reset the cursor.
pattern.lastIndex = 0;

var hit;
while ((hit = pattern.exec(text)) !== null) {
    var compact = hit[1].replace(/[^\dxX]/g, "").toUpperCase();
    if (/^(?:\d{9}[\dX]|97[89]\d{10})$/.test(compact)) return compact;
}
return "";
}

Last edited by tnt85; 05-03-2010 at 12:17 PM.
tnt85 is offline   Reply With Quote