# based on PDFrasterFarian by Aleksandr Dubinsky

# TO DO:
# - improve code re blank page handling
# - eliminate need for pdftk

# NOTE: must have PIL installed
# see http://www.pythonware.com/library/pil

# other necessary software
# - pdftk  http://www.accesspdf.com/pdftk/
# - ImageMagick (convert)  http://www.imagemagick.org/
# - ghostscript  http://sourceforge.net/projects/ghostscript/
# - pdftops from xpdf  http://www.foolabs.com/xpdf/
# (though there are much better ways to install these than going
#  to the above websites...)

# on Windows, these have all been kindly assembled by
# Aleksandr Dubinsky - look around in
# http://www.mobileread.com/forums/forumdisplay.php?f=100

import os
import subprocess
import sys
import time

import Image
import ImageFilter

# output page size in pixels; its width/height ratio is the aspect ratio
# every page is padded to
hres = 565
vres = 754
scrRatio = hres / float(vres)

# external tools (paths are relative to the working directory)
im = "software\\ImageMagick-6.3.1-Q8\\convert.exe"
gs_exec = "software\\gs\\gs8.54\\bin\\gswin32c.exe"
gs_includes = '-I.\\software\\gs\\gs8.54\\lib -I.\\software\\gs\\gs8.54\\Resource -I.\\software\\gs\\fonts'
gs_params = "-q -dBATCH -dSAFER -dNOPAUSE -dMaxBitmap=200000000"
# common Ghostscript command prefix: executable, search paths, switches
gs_str = " ".join((gs_exec, gs_includes, gs_params))

# this works great for Google Books images
# may be a little aggressive if page just has a single line (not text) on it (rare)
def trimNoise(im, mag):
    """Crop `im` to the region that holds page content, padded to scrRatio.

    The image is scanned in one-pixel-wide columns and one-pixel-high rows.
    For each strip, hist[0] (the number of pixels falling in histogram
    bin 0 - presumably black pixels of a grayscale page; confirm against
    the image mode actually passed in) decides whether the strip counts
    as content, noise, or a solid line.

    im  -- image object providing .size, .crop(box) and .histogram().
           Note that this parameter shadows the module-level `im`
           (the ImageMagick path) inside this function.
    mag -- scale factor used to derive the thresholds:
           noiseFactor = 2 * mag and threshT = 6 * mag.

    Returns `im` unchanged when no content column is found or when the
    detected region is narrower or shorter than noiseFactor; otherwise
    returns the cropped image.
    """
    left = -1
    right = 0
    top = 0
    bottom = 0
    width = im.size[0]
    height = im.size[1]
    # will have to scale threshold according to use of dilate
    noiseFactor = mag + mag   # strips with at most this many bin-0 pixels count as empty
    threshT = mag * 6         # strips need more than this many bin-0 pixels to count as content
    thresh = height - noiseFactor  # strips at or above this are treated as a solid line
    # LEFT BOUND
    # proceed until enough density
    for col in range(0, width):
        foo = im.crop((col, 0, col+1, height))
        hist = foo.histogram()
        if ((hist[0] > threshT) and
            (hist[0] < thresh)):  # skip mostly black lines
            left = col
            break
    if (left == -1):  # didn't find anything
        return im
    # rewind until drops off
    # (walk back towards the edge until a column is nearly empty)
    for col in range(left-1, -1, -1):
        foo = im.crop((col, 0, col+1, height))
        hist = foo.histogram()
        # handle all black and all white rows
        if (hist[0] <= noiseFactor):
            left = col
            break

    # RIGHT BOUND
    # scan inward from the right edge; `col` is the exclusive right
    # coordinate, so the strip examined is [col-1, col)
    for col in range(width, left-1, -1):
        foo = im.crop((col-1, 0, col, height))
        hist = foo.histogram()
        if ((hist[0] > threshT) and (hist[0] < thresh)):
            right = col
            break
    # then extend back outward until a nearly empty column is reached
    for col in range(right, width+1):
        foo = im.crop((col-1, 0, col, height))
        hist = foo.histogram()
        if (hist[0] <= noiseFactor):
            right = col
            break

    width = right - left
    if (width < noiseFactor):
        return im
    # rows are now measured against the trimmed width
    thresh = width - noiseFactor

    # TOP BOUND
    # same two-step search as for the columns, restricted to [left, right)
    for row in range(0, height):
        foo = im.crop((left, row, right, row + 1))
        hist = foo.histogram()
        if ((hist[0] > threshT) and (hist[0] < thresh)):
            top = row
            break
    for row in range(top-1, -1, -1):
        foo = im.crop((left, row, right, row + 1))
        hist = foo.histogram()
        # handle all black and all white rows
        if (hist[0] <= noiseFactor):
            top = row
            break

    # BOTTOM BOUND
    for row in range(height, top-1, -1):
        foo = im.crop((left, row-1, right, row))
        hist = foo.histogram()
        if ((hist[0] > threshT) and (hist[0] < thresh)):
            bottom = row
            break
    for row in range(bottom, height+1):
        foo = im.crop((left, row-1, right, row))
        hist = foo.histogram()
        if (hist[0] <= noiseFactor):
            bottom = row
            break

    height = bottom - top
    if (height < noiseFactor):
        return im

    # re-run left & right (yes, this does make sense)
    # (the columns are now only [top, bottom) tall, so the per-column
    # counts and the solid-line threshold differ from the first pass)
    # LEFT BOUND
    thresh = height - noiseFactor
    oldleft = left
    left = -1
    for col in range(oldleft, right):
        foo = im.crop((col, top, col+1, bottom))
        hist = foo.histogram()
        if ((hist[0] > threshT) and (hist[0] < thresh)):
            left = col
            break
    if (left == -1):  # didn't find anything
        return im
    # rewind until drops off (never further out than the first-pass bound)
    for col in range(left-1, oldleft-1, -1):
        foo = im.crop((col, top, col+1, bottom))
        hist = foo.histogram()
        # handle all black and all white rows
        if (hist[0] <= noiseFactor):
            left = col
            break

    # RIGHT BOUND
    oldright = right
    for col in range(right, left+1, -1):
        foo = im.crop((col-1, top, col, bottom))
        hist = foo.histogram()
        if ((hist[0] > threshT) and (hist[0] < thresh)):
            right = col
            break
    for col in range(right, oldright+1):
        foo = im.crop((col-1, top, col, bottom))
        hist = foo.histogram()
        if (hist[0] <= noiseFactor):
            right = col
            break

    width = right - left
    if (width < noiseFactor):
        return im

    #im.crop((left, top, right, bottom)).save("trim.png")  # look at trimmed image

    # grow the box along one axis so its aspect ratio matches scrRatio,
    # centred on the content and shifted back inside the image if it
    # would overrun an edge
    actRatio = float(width)/float(height)
    if (actRatio < scrRatio):
        # too narrow: widen
        newW = int(float(height)*scrRatio)
        shift = (newW - width) / 2  # integer division (operands are ints under Python 2)
        left = left - shift
        if (left < 0):
            left = 0
        right = left + newW
        if (right > im.size[0]):
            right = im.size[0]
            # NOTE(review): left becomes negative here when newW exceeds
            # the image width - confirm crop() handles that as intended
            left = right - newW
    elif (actRatio > scrRatio):
        # too wide: heighten
        newH = int(float(width)/scrRatio)
        shift = (newH - height) / 2  # integer division (operands are ints under Python 2)
        top = top - shift
        if (top < 0):
            top = 0
        bottom = top + newH
        if (bottom > im.size[1]):
            bottom = im.size[1]
            # NOTE(review): top becomes negative here when newH exceeds
            # the image height - confirm crop() handles that as intended
            top = bottom - newH

    # ideally, we might go back & re-run LEFT & RIGHT w/ trimming

    return im.crop((left, top, right, bottom))

def getNumPages(fil):
    """Return the number of pages of the PDF `fil`, as reported by pdftk.

    Runs ``software\\pdftk.exe "<fil>" dump_data`` and reads the
    ``NumberOfPages:`` line from its output.

    Raises ValueError when the output holds no parsable ``NumberOfPages:``
    line (earlier versions raised a bare BaseException; ValueError is a
    subclass, so handlers for BaseException still apply).  Raises OSError
    when pdftk cannot be started.
    """
    # subprocess replaces the deprecated os.popen2; communicate() closes
    # the child's stdin, drains its stdout and waits for it to exit
    proc = subprocess.Popen('software\\pdftk.exe "%s" dump_data' % (fil),
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            universal_newlines=True)
    t = proc.communicate()[0]
    # parse line by line: a fixed-offset slice up to the next "\n" loses
    # the last digit when the output does not end with a newline
    for line in t.splitlines():
        if line.startswith("NumberOfPages:"):
            return int(line[len("NumberOfPages:"):])
    raise ValueError("problem with PDF - couldn't get number of pages!")

def _runTool(cmd):
    """Run the command line `cmd` to completion, discarding its stdout."""
    # communicate() closes stdin, drains stdout and waits for the child,
    # so the files the tool writes are complete when this returns
    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    proc.communicate()

def processPage(fil, pageNum, dilateLevel=6, doTrimNoise=False, doMono=False):
    """Render one page of the PDF `fil` to ``page-<pageNum>.png``.

    Steps: pdftops extracts the page as EPS, Ghostscript's bbox device
    measures it, the bounding box is padded to the scrRatio aspect ratio,
    Ghostscript rasterizes the page at (hres*mag) x (vres*mag), and the
    result is optionally trimmed and dilated, downsampled to hres x vres,
    sharpened and colour-reduced with ImageMagick.  Intermediate files
    prv_1.eps, prv_4.eps, prv_5.png, prv_6.png and prv_7.png are written
    to the working directory.

    fil         -- path of the PDF
    pageNum     -- page number passed to pdftops as both -f and -l
    dilateLevel -- supersampling factor `mag`; values above 20 are clamped
                   to 20, values below 2 disable dilation and use mag = 6
    doTrimNoise -- run trimNoise() on the rasterized page
    doMono      -- reduce to monochrome instead of 4 colours

    Raises ValueError when Ghostscript reports no bounding box and
    OSError when an external tool cannot be started.
    """
    print("processing page %d:" % (pageNum))

    doDilate = True
    if (dilateLevel > 20):
        mag = 20
    elif (dilateLevel < 2):
        doDilate = False
        mag = 6
    else:
        mag = dilateLevel

    # convert to EPS and get bounding box
    _runTool('software\\pdftops.exe -f %d -l %d -eps -pagecrop "%s" prv_1.eps' % (pageNum, pageNum, fil))
    # the bbox device reports on stderr
    proc = subprocess.Popen('%s -r100 -sDEVICE=bbox prv_1.eps' % (gs_str),
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, universal_newlines=True)
    t = proc.communicate()[1]
    # look the line up by name instead of assuming it is the second one,
    # so extra warning lines cannot shift the result
    box = None
    for line in t.splitlines():
        if line.startswith("%%HiResBoundingBox:"):
            box = [float(v) for v in line.split()[1:]]
            break
    if (box is None) or (len(box) != 4):
        raise ValueError("couldn't get bounding box of page %d" % (pageNum))

    w = box[2]-box[0]
    h = box[3]-box[1]
    print('- dimensions: %.2f" x %.2f"' % (w/72.0, h/72.0))
    if ((w < 18.0) or (h < 18.0)):
        # less than a quarter inch (18pt) of content: use the stock blank page
        print("- BLANK PAGE")
        f = open("modules\\blank_page.png", "rb")
        try:
            of = open("prv_5.png", "wb")
            try:
                of.write(f.read())
            finally:
                of.close()
        finally:
            f.close()
    else:
        # try to make sure centered l/r
        # there is an assumption that negative numbers are OK
        actRatio = w/h
        if (actRatio < scrRatio):
            newW = h*scrRatio
            shift = (newW - w) / 2.0
            box[0] -= shift
            box[2] += shift
        elif (actRatio > scrRatio):
            # here we're centering top/bottom - might just want at top
            newH = w/scrRatio
            shift = (newH - h) / 2.0
            box[1] -= shift
            box[3] += shift

        # convert to PNG
        # copy the EPS, replacing each line that starts with "%%Bounding"
        # by the padded box
        f = open("prv_1.eps", "rb")
        try:
            o = open("prv_4.eps", "wb")
            try:
                for l in f:
                    if l.startswith("%%Bounding"):
                        o.write("%%%%HiResBoundingBox: %.6f %.6f %.6f %.6f\n" % (box[0], box[1], box[2], box[3]))
                    else:
                        o.write(l)
            finally:
                o.close()
        finally:
            f.close()

        hrasres = hres * mag
        vrasres = vres * mag
        gs_alpha = "-dTextAlphaBits=4 -dGraphicsAlphaBits=4"
        gs_align = "-dAlignToPixels=0"
        _runTool("%s -sDEVICE=pnggray -g%dx%d %s %s -dEPSFitPage -sOutputFile=prv_5.png -fprv_4.eps" %
                 (gs_str, hrasres, vrasres, gs_alpha, gs_align))

    img = Image.open("prv_5.png")
    if (doTrimNoise):
        # trim
        print("- trimming noise")
        img = trimNoise(img, mag)
    if (doDilate):
        # dilate
        print("- dilation filter")
        img = img.filter(ImageFilter.MinFilter(3))
    #img.save("prv_5NEW.png)
    # downsample
    print("- downsampling")
    img = img.resize((hres, vres), Image.ANTIALIAS)
    img.save("prv_6.png")
    # sharpen
    print("- sharpening")
    _runTool("%s prv_6.png -unsharp 2 prv_7.png" % (im))
    # reduce colors
    print("- reducing colors")
    if (doMono):
        colorStr = "-monochrome"
    else:
        colorStr = "-colors 4"
    _runTool("%s prv_7.png -colorspace GRAY %s page-%d.png" % (im, colorStr, pageNum))
    print("done")

def genLRS(fil, numPages):
    """Write the LRS description of the book next to the PDF `fil`.

    The output path is `fil` with its last four characters replaced by
    ".lrs".  The file holds fixed book metadata, a table of contents built
    from the bookmarks pdftk reports for `fil`, and one full-page image
    block per page referring to ``page-<n>.png``.  Object ids are
    100000+n for pages, 200000+n for image blocks and 300000+n for image
    streams.

    fil      -- path of the source PDF
    numPages -- number of pages (and page images) to reference

    Raises OSError when pdftk cannot be started.
    """
    xsize = 637
    ysize = 849

    f = open(fil[:-4] + ".lrs", "wb")
    try:
        #f.write('<?xml version="1.0" encoding="UTF-16"?>\n')
        f.write('<BBeBXylog version="1.0"><Property/><BookInformation><Info version="1.0" ><BookInfo>\n')
        f.write('<Title reading="">%s</Title>\n' % ("MY TITLE"))
        f.write('<Author reading="">%s</Author>\n' % ("SOME AUTHOR"))
        f.write('<BookID>FB_0000000000001</BookID>\n')
        f.write('<Publisher reading="">Aleksandr Dubinsky\'s Imprint of Corporeal Pressings</Publisher>\n')
        f.write('<Label reading=""></Label>\n')
        f.write('<Category></Category>\n')
        f.write('<Classification></Classification>\n')
        f.write('<FreeText></FreeText>\n')
        f.write('</BookInfo><DocInfo>\n')
        f.write('<CThumbnail file="modules\\book_thumb.gif"/>\n')
        f.write('<Language>en</Language>\n')
        f.write('<Creator></Creator>\n')
        f.write('<CreationDate> %s </CreationDate>\n' %
                time.strftime("%Y-%m-%d", time.localtime()))
        f.write('<Producer>pyprf</Producer>\n')
        f.write('<SumPage>%d</SumPage>\n' % (numPages))
        f.write('</DocInfo></Info>\n')
        # do TOC
        f.write('<TOC>\n')
        # the filename is quoted (as in getNumPages) so paths with spaces work
        proc = subprocess.Popen('software\\pdftk.exe "%s" dump_data' % (fil),
                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                universal_newlines=True)
        t = proc.communicate()[0]
        # a page-number line closes the bookmark whose title was read last;
        # start with an empty title so a stray page-number line cannot
        # hit an unbound name
        title = ""
        for l in t.splitlines():
            if (l.startswith("BookmarkTitle:")):
                title = l[15:]
            elif (l.startswith("BookmarkPageNumber:")):
                pnum = int(l[20:])
                f.write('<TocLabel refobj="%d" refpage="%d">%s</TocLabel>\n' % (200000+pnum, 100000+pnum, title))
        f.write('</TOC>\n')
        f.write('</BookInformation><Main>\n')
        for n in range(1, numPages+1):
            f.write('<Page pagestyle="77" objid="%d" objlabel="Page.%d"><BlockSpace xspace="0" yspace="0"/><ImageBlock x0="0" y0="0" x1="600" y1="800" xsize="%d" ysize="%d" refstream="%d" blockwidth="600" blockheight="768" topskip="0" footskip="0" sidemargin="0" blockstyle="86" objid="%d" objlabel="Block.%d"></ImageBlock></Page>\n' %
                    (100000+n, 100000+n, xsize, ysize, 300000+n, 200000+n, 200000+n))
        f.write('</Main><Template version="1.0"></Template><Style>\n')
        f.write('<BookStyle stylelabel="58" objid="58"><SetDefault rubyalign="center" rubyadjust="none" rubyoverhang="none" empdotsposition="before" emplineposition="before" emplinetype="solid" setwaitprop="noreplay"/>\n')
        f.write('<BookSetting bindingdirection="Lr" dpi="1600" screenwidth="600" screenheight="800" colordepth="24"/></BookStyle>\n')
        f.write('<PageStyle stylelabel="77" topmargin="0" headheight="0" headsep="0" oddsidemargin="0" textheight="800" textwidth="600" footspace="0" evensidemargin="0" footheight="0" layout="LrTb" bgimagemode="fix" pageposition="any" setwaitprop="noreplay" setemptyview="empty" objid="77"/>\n')
        f.write('<BlockStyle stylelabel="86" bgimagemode="fix" framemode="square" blockwidth="600" blockheight="800" blockrule="horz.adjustable" layout="LrTb" framewidth="0" framecolor="0x00000000" topskip="0" sidemargin="0" footskip="0" objid="86"/>\n')
        f.write('</Style><Objects>\n')
        for n in range(1, numPages+1):
            f.write('<ImageStream encoding="PNG" file="page-%d.png" objid="%d" imagestreamlabel="ImageStream.%d"/>\n' %
                    (n, 300000+n, 300000+n))
        f.write('</Objects></BBeBXylog>\n')
    finally:
        # close the .lrs file even if pdftk or a write fails
        f.close()

def doUnlink(fil):
    """Delete the file `fil` if it exists; do nothing when it is absent."""
    if not os.path.exists(fil):
        return
    os.unlink(fil)

def processPDF(fil):
    """Convert the PDF `fil` into an .lrf file with the same base name.

    Renders every page with processPage(), writes the .lrs description
    with genLRS(), runs lrs2lrf on it and finally removes the .lrs file,
    the per-page PNGs and the prv_* intermediates from the working
    directory.

    Raises ValueError when `fil` does not end in ".pdf" (any case).
    Earlier versions raised a bare BaseException; ValueError is a
    subclass, so handlers for BaseException still apply.
    """
    if (fil[-4:].lower() != ".pdf"):
        raise ValueError("please specify a .PDF file")
    base = fil[:-4]
    numPages = getNumPages(fil)
    print("processing %s (%d pages)..." % (fil, numPages))
    for x in range(1, numPages + 1):
        processPage(fil, x, 6, True, False)

    genLRS(fil, numPages)
    # wait for lrs2lrf to finish: the cleanup below deletes its inputs
    proc = subprocess.Popen('software\\lrs2lrf\\lrs2lrf.exe "%s" "%s"' %
                            (base + ".lrs", base + ".lrf"),
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    proc.communicate()

    # cleanup
    doUnlink(base + ".lrs")
    doUnlink("lrs2lrf.log")
    for x in range(1, numPages + 1):
        doUnlink("page-%d.png" % x)
    doUnlink("prv_1.eps")
    doUnlink("prv_4.eps")
    doUnlink("prv_5.png")
    #doUnlink("prv_5NEW.png")
    doUnlink("prv_6.png")
    doUnlink("prv_7.png")

# script entry: convert the PDF named by the first command-line argument
# (runs at module level, so it also executes when this file is imported)
processPDF(sys.argv[1])

# can instead run this line to verify a single page
#processPage(sys.argv[1], int(sys.argv[2]), 6, True, False)
