#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
#
# This is a Python script. You need a Python interpreter to run it.
# For example, ActiveState Python, which is available for Windows.
#
# This script strips the penultimate record from a Mobipocket file.
# This is useful because the current KindleGen adds a compressed copy
# of the source files in this record, making the ebook produced
# about twice as big as it needs to be.
#
#
# This is free and unencumbered software released into the public domain.
# 
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
# 
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# 
# For more information, please refer to <http://unlicense.org/>
#
# Written by Paul Durrant, 2010-2011, paul@durrant.co.uk
#
# Changelog
#  1.00 - Initial version
#  1.10 - Added an option to output the stripped data
#  1.20 - Added check for source files section (thanks Piquan)
#  1.30 - Added prelim Support for K8 style mobis
#  1.31 - removed the SRCS section but kept a 0 size entry for it
#  1.32 - removes the SRCS section and its entry, now updates metadata 121 if needed

__version__ = '1.32'

import sys
import struct
import binascii

class Unbuffered:
	"""Proxy around a stream that flushes after every write.

	Everything other than write() is forwarded untouched to the wrapped
	stream through __getattr__.
	"""

	def __init__(self, stream):
		# The wrapped stream stays reachable as the public attribute `stream`.
		self.stream = stream

	def write(self, data):
		"""Write data to the wrapped stream, then flush it straight away."""
		target = self.stream
		target.write(data)
		target.flush()

	def __getattr__(self, attr):
		# Only consulted for names not found on the proxy itself, so any
		# other attribute or method comes from the wrapped stream.
		return getattr(self.stream, attr)


class StripException(Exception):
	"""Error raised when an input file cannot be stripped."""


class SectionStripper:
	"""Remove the 'SRCS' section from a BOOKMOBI file held in memory.

	The constructor does all the work: it locates the first section whose
	data starts with 'SRCS', rebuilds the section table without it, drops
	the section's bytes, and patches EXTH record 121 in the first section
	when that record refers to a section at or after the removed one.

	Attributes set by the constructor:
	  data_file            -- the stripped file contents
	  stripped_data_header -- first 16 bytes of the removed section
	  stripped_data        -- remainder of the removed section
	  num_sections         -- number of sections in data_file
	  sections             -- list of (offset, flgval) table entries of
	                          data_file, as used by loadSection/patchSection

	All byte-string literals are written b'...' so the code behaves the
	same on Python 2.6+ and on interpreters where str is not bytes.
	"""

	def _sectionBounds(self, section):
		"""Return the (start, end) byte offsets of section in data_file."""
		if section + 1 == self.num_sections:
			# the last section runs to the end of the file
			endoff = len(self.data_file)
		else:
			endoff = self.sections[section + 1][0]
		return self.sections[section][0], endoff

	def loadSection(self, section):
		"""Return the raw bytes of the given section of the stripped file."""
		off, endoff = self._sectionBounds(section)
		return self.data_file[off:endoff]

	def patch(self, off, new):
		"""Overwrite len(new) bytes of data_file starting at offset off."""
		self.data_file = self.data_file[:off] + new + self.data_file[off+len(new):]

	def strip(self, off, len):
		"""Delete len bytes of data_file starting at offset off.

		The section table (in the file and in self.sections) is not adjusted.
		"""
		self.data_file = self.data_file[:off] + self.data_file[off+len:]

	def patchSection(self, section, new, in_off = 0):
		"""Overwrite bytes inside a section, starting in_off bytes into it.

		Asserts that the patch does not run past the end of the section.
		"""
		off, endoff = self._sectionBounds(section)
		assert off + in_off + len(new) <= endoff
		self.patch(off + in_off, new)

	def updateEXTH121(self, srcs_secnum, mobiheader):
		"""Return mobiheader with EXTH record 121 adjusted for the removal.

		srcs_secnum is the index of the section that was removed and
		mobiheader is the raw first section.  If the header has an EXTH
		block containing a record of type 121 whose 32-bit value is
		>= srcs_secnum, that value is decremented by one.  This is best
		effort: a truncated or malformed EXTH block ends the scan and
		whatever has been built so far is returned.

		Raises struct.error if mobiheader is too short to hold the MOBI
		header length (0x14) and EXTH flag (0x80) fields.
		"""
		mobi_length, = struct.unpack('>L', mobiheader[0x14:0x18])
		exth_flag, = struct.unpack('>L', mobiheader[0x80:0x84])
		if not exth_flag & 0x40:
			return mobiheader
		exth_start = 16 + mobi_length
		exth = mobiheader[exth_start:]
		if exth[:4] != b'EXTH':
			return mobiheader
		try:
			nitems, = struct.unpack('>I', exth[8:12])
			pos = 12
			remaining = nitems
			# a counted while loop: nitems comes from the file and may be huge
			while remaining > 0:
				remaining -= 1
				rectype, size = struct.unpack('>II', exth[pos:pos + 8])
				if rectype == 121:
					# NOTE(review): 0xffffffff may be a "not present" marker
					# that should be left alone -- confirm against the format
					boundaryptr, = struct.unpack('>L', exth[pos + 8:pos + size])
					if srcs_secnum <= boundaryptr:
						valpos = exth_start + pos + 8
						mobiheader = (mobiheader[:valpos]
							+ struct.pack('>L', boundaryptr - 1)
							+ mobiheader[valpos + 4:])
				if size == 0:
					# pos would never advance; further passes change nothing
					break
				pos += size
		except struct.error:
			# truncated / malformed EXTH data: keep what we have
			pass
		return mobiheader

	def __init__(self, datain):
		"""Strip the SRCS section out of datain (the whole file as bytes).

		Raises StripException if datain does not carry the BOOKMOBI
		signature or no section other than the last starts with 'SRCS'.
		Prints "done" on success.
		"""
		if datain[0x3C:0x3C+8] != b'BOOKMOBI':
			raise StripException("invalid file format")
		self.num_sections, = struct.unpack('>H', datain[76:78])
		old_count = self.num_sections

		def entry(index):
			# (offset, flgval) of one 8-byte section table entry of datain
			return struct.unpack_from('>2L', datain, 78 + index * 8)

		# search for the SRCS record and note which section it is in;
		# the last section is not examined because the length is taken
		# from the offset of the following section
		srcs_secnum = -1
		for i in range(old_count - 1):
			offset, flgval = entry(i)
			if datain[offset:offset + 4] == b'SRCS':
				srcs_secnum = i
				srcs_offset = offset
				srcs_length = entry(i + 1)[0] - srcs_offset
				break

		if srcs_secnum == -1:
			raise StripException("File doesn't contain the sources section.")

		# it appears bytes 70 and 71 (0x46 - 0x47) always contain (2*num_sections) + 1
		# this is not documented anyplace at all but it appears to be either the size
		# of palmdb section info in words or more likely the offset in words from byte
		# 76 to the start of the very first record
		# so we need to update this as well, followed by the reduced section count
		parts = [
			datain[:70],
			struct.pack('>H', (old_count - 1) * 2 + 1),
			datain[72:76],
			struct.pack('>H', old_count - 1),
		]

		# the table becomes 8 bytes shorter, so every section before SRCS
		# starts 8 bytes earlier
		for i in range(srcs_secnum):
			offset, flgval = entry(i)
			parts.append(struct.pack('>2L', offset - 8, flgval))

		# every section after SRCS additionally moves up by the length of the
		# SRCS section; the second word of its entry is reduced by 2
		# NOTE(review): presumably that word carries a per-record id that
		# advances by 2 per section -- confirm against the PalmDB format
		for i in range(srcs_secnum + 1, old_count):
			offset, flgval = entry(i)
			parts.append(struct.pack('>2L', offset - 8 - srcs_length, flgval - 2))

		header = b''.join(parts)

		# now pad it out to begin right at the first offset
		# typically this is 2 bytes of nulls
		first_offset, flgval = struct.unpack_from('>2L', header, 78)
		header += b'\0' * (first_offset - len(header))

		# header, then everything up to the original SRCS offset, then
		# everything after the SRCS section
		self.data_file = (header
			+ datain[first_offset + 8:srcs_offset]
			+ datain[srcs_offset + srcs_length:])

		# store away the SRCS section in case the user wants it output
		self.stripped_data_header = datain[srcs_offset:srcs_offset+16]
		self.stripped_data = datain[srcs_offset+16:srcs_offset+srcs_length]

		# update the number of sections count
		self.num_sections = old_count - 1
		# earlier versions stored the new count under this misspelt name
		self.num_section = self.num_sections

		# if K8 mobi, handle metadata 121 in old mobiheader
		offset0, flgval0 = struct.unpack_from('>2L', self.data_file, 78)
		if self.num_sections > 1:
			offset1, flgval1 = struct.unpack_from('>2L', self.data_file, 86)
		else:
			# only one section left: it runs to the end of the file
			offset1 = len(self.data_file)
		mobiheader = self.data_file[offset0:offset1]
		mobiheader = self.updateEXTH121(srcs_secnum, mobiheader)
		self.data_file = self.data_file[0:offset0] + mobiheader + self.data_file[offset1:]

		# section table of the stripped file, for loadSection/patchSection
		self.sections = [struct.unpack_from('>2L', self.data_file, 78 + i * 8)
			for i in range(self.num_sections)]
		print("done")

	def getResult(self):
		"""Return the stripped file contents."""
		return self.data_file

	def getStrippedData(self):
		"""Return the removed SRCS section without its first 16 bytes."""
		return self.stripped_data

	def getHeader(self):
		"""Return the first 16 bytes of the removed SRCS section."""
		return self.stripped_data_header

if __name__ == "__main__":
	# Command-line entry point:
	#     <script> <infile> <outfile> [<strippeddatafile>]
	# Reads <infile>, writes the stripped book to <outfile> and, when a
	# third argument is given, writes the removed data to that file.
	# Exit status is 0 on success and 1 on a usage error or StripException.
	#
	# Every print call below takes a single argument, so the output is the
	# same whether print is a statement or a function.
	sys.stdout=Unbuffered(sys.stdout)
	print('KindleStrip v%(__version__)s. '
	   'Written 2010-2011 by Paul Durrant.' % globals())
	if len(sys.argv)<3 or len(sys.argv)>4:
		print("Strips the penultimate record from Mobipocket ebooks")
		print("For ebooks generated using KindleGen 1.1 that adds the source")
		print("Usage:")
		print("    %s <infile> <outfile> <strippeddatafile>" % sys.argv[0])
		print("<strippeddatafile> is optional.")
		sys.exit(1)

	infile = sys.argv[1]
	outfile = sys.argv[2]
	# 'with' closes each file as soon as it has been read or written
	# instead of leaving that to garbage collection
	with open(infile, 'rb') as source:
		data_file = source.read()
	try:
		strippedFile = SectionStripper(data_file)
		# the output file is only created once stripping has succeeded
		with open(outfile, 'wb') as target:
			target.write(strippedFile.getResult())
		# b2a_hex yields only ASCII hex digits, so decoding cannot fail
		print("Header Bytes: " + binascii.b2a_hex(strippedFile.getHeader()).decode('ascii'))
		if len(sys.argv)==4:
			with open(sys.argv[3], 'wb') as extra:
				extra.write(strippedFile.getStrippedData())
	except StripException as e:
		print("Error: %s" % e)
		sys.exit(1)
	sys.exit(0)
