MobileRead Forums - View Single Post

sjvr767 · 09-23-2008, 11:07 AM

Quote:

Originally Posted by haridasi

I have now tried to crop a pdf, but it doesn't crop the left side of the document. Furthermore, it takes some time guessing the correct percentage.

Hi there, I had a few minutes to spare and changed the way the new coordinates are determined. It should solve the "left-side" issue. This is more of a hack than a significant change, but I hope it helps. Code at the end of the document.

Before I give the code, I'd like to say that when I get time I will do a proper update of this. There are a few features I want to implement, such as splitting pages in half and then scaling those to A4. That should enlarge the doc quite a bit.

Here is the code:

Code:

#! /usr/bin/python

import subprocess
import getopt, sys
import find_lines
from pyPdf import PdfFileWriter, PdfFileReader

def usage():
    """Print the command-line help text, then exit the process with status 0."""
    # A single-argument print(...) behaves the same as the print statement
    # on Python 2 and also parses on Python 3.
    print("""sjvr767's PDF Cropping Script.
Example:
my_pdf_crop.py -s -p 0.5 -i input.pdf -o output.pdf
my_pdf_crop.py --skip --percent 0.5 --input input.pdf --output output.pdf
\n
REQUIRED OPTIONS:
-p\t--percent
The factor by which to crop. Must be positive and less than or equal to 1.

-i\t--input
The path to the file to be cropped.
\n
OPTIONAL:
-s\t--skip
Skip the first page. Output file will not contain the first page of the input file.

-o\t--output
Specify the name and path of the output file. If none specified, the script appends 'cropped' to the file name.
""")
    sys.exit(0)

def cut_length(dictionary, key, factor):
	"""Return one quarter of (1 - factor) times dictionary[key].

	The script uses the result as the margin trimmed from each edge of the
	page box.
	"""
	removed = dictionary[key] * (1 - factor)
	return removed / 4
	
def new_coords(dictionary, key, cut):
	"""Return the absolute difference between dictionary[key] and cut."""
	shifted = dictionary[key] - cut
	return abs(shifted)
	
def new_coords2(ty, lx, rx, by, cut):
	"""Shrink a box by cut on every side and return its four corners.

	ty/by are the top and bottom y values, lx/rx the left and right x values.
	Returns a dict keyed 'tl', 'tr', 'bl', 'br'; each value is a dict with
	'x' and 'y' entries for that corner of the shrunken box.
	"""
	left, right = lx + cut, rx - cut
	top, bottom = ty - cut, by + cut
	corners = {}
	for tag, x, y in (('tr', right, top), ('tl', left, top),
			('bl', left, bottom), ('br', right, bottom)):
		corners[tag] = {'x': x, 'y': y}
	return corners

# ---- Command-line parsing ----
# Produces the settings read by the rest of the script:
#   skipone, col, half  -- 0/1 flags
#   factor              -- float crop factor
#   input_file          -- path of the PDF to read
#   output_file         -- path of the PDF to write
try:
	# "p:", "i:" and "o:" take a value; "s", "c" and "h" are plain flags.
	opts, args = getopt.getopt(sys.argv[1:], "sp:i:o:ch", ["skip", "percent=", "input=", "output=", "column", "half"])
except getopt.GetoptError as err:
	# print help information and exit:
	print(str(err)) # will print something like "option -a not recognized"
	try:
		usage()
	except SystemExit:
		# usage() exits with status 0 on its own; a bad command line
		# must still be reported as a failure.
		pass
	sys.exit(2)

skipone = 0
col = 0
half = 0
factor = 0.8 # default scaling factor
input_file = None # no default input file
output_file = None # derived from input_file below unless a valid -o is given

# getopt always hands back string values and the full long-option name,
# so one pass over the parsed options is enough.
for name, value in opts:
	if name in ('-s', '--skip'):
		skipone = 1
	elif name in ('-c', '--column'):
		col = 1
	elif name in ('-h', '--half'):
		half = 1
	elif name in ('-p', '--percent'):
		try:
			factor = float(value)
		except ValueError:
			# float() raises ValueError for a non-numeric string.
			print("Factor must be a number.")
			sys.exit(2)
	elif name in ('-i', '--input'):
		if value[-4:] != '.pdf':
			print("Input file must be a PDF.")
			sys.exit(2) #exit if no appropriate input file
		input_file = value
	elif name in ('-o', '--output'):
		if value[-4:] == '.pdf':
			output_file = value
		else:
			# Not fatal: fall back to the default output name.
			print("Output file must be a PDF.")

# Enforce the range promised by the help text.
if not 0 < factor <= 1:
	print("Factor must be positive and less than or equal to 1.")
	sys.exit(2)

if input_file is None:
	print("Please specify an input file.")
	sys.exit(2) #exit if no appropriate input file

if output_file is None:
	output_file = "%s_cropped.pdf" % input_file[:-4] #default output


# ---- Crop the PDF ----
# Reads input_file, shrinks every page's media box by the same margin and
# writes the result to output_file.

def _set_media_box(page, tl, tr, bl, br):
	"""Assign the four corners of page.mediaBox from (x, y) tuples."""
	page.mediaBox.upperLeft = tl
	page.mediaBox.upperRight = tr
	page.mediaBox.lowerLeft = bl
	page.mediaBox.lowerRight = br

# Only three modes are implemented: plain crop, crop skipping the first page,
# and crop with half-page splitting. Anything else used to fall through and
# silently write an empty PDF.
if col != 0 or (skipone == 1 and half == 1):
	print("This combination of options is not supported.")
	sys.exit(2)

inputstream = open(input_file, "rb")
try:
	input1 = PdfFileReader(inputstream)
	output = PdfFileWriter()

	pages = input1.getNumPages()
	if pages < 1:
		print("Input file contains no pages.")
		sys.exit(2)

	# The crop box is measured on the second page when there is one,
	# otherwise on the only page, so one-page documents no longer fail.
	ref_box = input1.getPage(min(1, pages - 1)).mediaBox

	top_right = {'x': ref_box.getUpperRight_x(), 'y': ref_box.getUpperRight_y()}

	ty = ref_box.getUpperLeft_y()
	lx = ref_box.getUpperLeft_x()
	rx = ref_box.getLowerRight_x()
	by = ref_box.getLowerRight_y()
	print("%s %s %s %s" % (ty, lx, rx, by))

	# The margin is derived from the x coordinate of the upper-right corner.
	cut = cut_length(top_right, 'x', factor)

	newCoords = new_coords2(ty, lx, rx, by, cut)
	new_tr = (newCoords['tr']['x'], newCoords['tr']['y'])
	new_tl = (newCoords['tl']['x'], newCoords['tl']['y'])
	new_br = (newCoords['br']['x'], newCoords['br']['y'])
	new_bl = (newCoords['bl']['x'], newCoords['bl']['y'])

	print("%s %s %s %s" % (new_tl[1], new_tl[0], new_bl[1], new_bl[0]))

	if half == 0:
		# skipone is 0 or 1, so it doubles as the index of the first page kept.
		for i in range(skipone, pages):
			page = input1.getPage(i)
			_set_media_box(page, new_tl, new_tr, new_bl, new_br)
			output.addPage(page)
	else:
		# Half mode: crop each page, render it to an image, look for a
		# horizontal line in it and emit the page twice, once per half.
		# NOTE(review): the range stops two pages before the end, so the
		# last two pages are never written -- confirm this is intended.
		for i in range(0, pages-2):
			page = input1.getPage(i)
			_set_media_box(page, new_tl, new_tr, new_bl, new_br)
			temp_output = PdfFileWriter()
			temp_output.addPage(page)
			tos = open("temp.pdf", "wb")
			try:
				temp_output.write(tos)
			finally:
				tos.close()
			cmd = 'convert temp.pdf -density 8400 -colorspace Gray -contrast -contrast -contrast -colors 16 temp.gif'
			subprocess.call(cmd, shell=True)
			# NOTE(review): height is subtracted from PDF coordinates below;
			# presumably find_hline returns compatible units -- verify.
			height = find_lines.find_hline('temp.gif', 5, 80)
			# NOTE(review): page1 and page2 come from the same getPage(i)
			# call as page; if that returns one shared object, the box set
			# for page2 also replaces the one set for page1 -- confirm.
			page1 = input1.getPage(i)
			_set_media_box(page1, new_tl, new_tr,
				(new_tl[0], new_tl[1]-height),
				(new_tr[0], new_tr[1]-height))
			output.addPage(page1)
			page2 = input1.getPage(i)
			_set_media_box(page2,
				(new_tl[0], new_tl[1]-height),
				(new_tr[0], new_tr[1]-height),
				new_bl, new_br)
			output.addPage(page2)

	# Open the output only once there is something to write, and always
	# close it, even if writing fails.
	outputstream = open(output_file, "wb")
	try:
		output.write(outputstream)
	finally:
		outputstream.close()
finally:
	# The input stream stays open until after the write above, which is when
	# the original script was also still holding it open.
	inputstream.close()