#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai


import sys
import os, getopt, struct, re
import codecs


# Because Windows (and Mac OS X) allow full unicode filenames and paths,
# any paths in pure bytestring python 2.X code must be utf-8 encoded, as they will need to
# be converted on the fly to unicode on Windows platforms.  Any other 8-bit str
# encoding would lose characters that cannot be represented in that encoding.

# These are simple support routines that let the main program use utf-8 encoded bytestrings
# as paths.  The paths are converted on the fly to full unicode as temporary un-named values,
# which prevents the potential mixing of unicode and bytestring values in the main program.


# True when sys.platform begins with 'win', i.e. when running on Windows.
_iswindows = sys.platform[:3] == 'win'

# convert utf-8 encoded path string to proper type
# on windows that is full unicode
# on macosx and linux this is utf-8

def pathof(s):
    """Convert a utf-8 encoded path bytestring to the platform's path type.

    s -- path as a utf-8 encoded bytestring.

    Returns a unicode string on Windows and the utf-8 bytestring unchanged
    on other platforms.  A unicode argument is tolerated: a warning is
    printed and the value is returned as-is on Windows or utf-8 encoded
    elsewhere.
    """
    if isinstance(s, unicode):
        # Callers are expected to pass bytestrings; warn, then still return
        # the right type for the platform.
        print("Warning: pathof expects utf-8 encoded bytestring:  %s" % (s,))
        if _iswindows:
            return s
        return s.encode('utf-8')
    if _iswindows:
        return s.decode('utf-8')
    return s


# properly get sys.argv arguments and encode them into utf-8
def utf8_argv():
    """Return the command-line arguments as a list of utf-8 encoded bytestrings.

    On Windows the arguments are re-read through the wide-character Windows
    API; None is returned if that API reports no arguments.  On other
    platforms each entry of sys.argv is decoded with the encoding of
    sys.stdin (falling back to the filesystem encoding, then to utf-8) and
    re-encoded as utf-8.
    """
    if _iswindows:
        # Versions 2.x of Python don't support Unicode in sys.argv on
        # Windows, with the underlying Windows API instead replacing multi-byte
        # characters with '?'.  So use shell32.CommandLineToArgvW to get sys.argv
        # as a list of Unicode strings and encode them as utf-8

        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            start = argc.value - len(sys.argv)
            return [argv[i].encode('utf-8') for i in
                    xrange(start, argc.value)]
        # this should never happen
        return None

    # sys.stdin may be missing or replaced by an object without an
    # 'encoding' attribute, so look the attribute up defensively.
    argvencoding = (getattr(sys.stdin, 'encoding', None)
                    or sys.getfilesystemencoding()
                    or 'utf-8')
    argv = []
    for arg in sys.argv:
        if isinstance(arg, unicode):
            argv.append(arg.encode('utf-8'))
        else:
            argv.append(arg.decode(argvencoding).encode('utf-8'))
    return argv


# Lowercase roman numeral symbols paired with their integer values,
# ordered from the largest value to the smallest.
_TABLE = [
    ('m', 1000),
    ('cm', 900),
    ('d', 500),
    ('cd', 400),
    ('c', 100),
    ('xc', 90),
    ('l', 50),
    ('xl', 40),
    ('x', 10),
    ('ix', 9),
    ('v', 5),
    ('iv', 4),
    ('i', 1),
]

def _int_to_roman(i):
    """Return the lowercase roman numeral string for the number i.

    Walks _TABLE in order, emitting each symbol as many times as its value
    still fits into the remainder.  Values below 1 produce an empty string.
    """
    remaining = i
    pieces = []
    for numeral, magnitude in _TABLE:
        repeat = 0
        while remaining >= magnitude:
            remaining -= magnitude
            repeat += 1
        pieces.append(numeral * repeat)
    return ''.join(pieces)

def _roman_to_int(s):
    """Return the integer value of the lowercase roman numeral string s.

    Walks _TABLE in order, repeatedly stripping each symbol from the front of
    the string and adding its value.  Text that matches no symbol is ignored.
    """
    total = 0
    remaining = s
    for numeral, magnitude in _TABLE:
        width = len(numeral)
        while remaining.startswith(numeral):
            remaining = remaining[width:]
            total += magnitude
    return total

# Matches one parenthesised group "(...)" and captures the text between
# the parentheses (any run of characters other than ')').
_pattern = r'''\(([^\)]*)\)'''
_tup_pattern = re.compile(_pattern, flags=re.IGNORECASE)


class dumpAPNXException(Exception):
    """Error raised when APNX page-map data cannot be interpreted."""



class PageMapProcessor(object):
    """Decode the page map of APNX data into page names and page offsets.

    All parsing happens in the constructor; afterwards getNames() returns one
    entry per page and getOffsets() returns the matching list of offsets.
    """

    def __init__(self, apnxdata):
        """Parse the page map contained in apnxdata.

        apnxdata -- bytestring holding the page data.  The big-endian 32-bit
                    length of the revision string is read at offset 0x10 and
                    the page-map header follows that string.

        Raises struct.error when the data is too short, ValueError for a
        malformed page-map tuple and dumpAPNXException for an unknown page
        numbering type.
        """
        self.data = apnxdata
        self.pagenames = []
        self.pageoffsets = []

        # get length of revision string
        rev_len, = struct.unpack_from('>L', self.data, 0x10)

        # Use it to skip over the header, revision string length data, and revision string itself
        ptr = 0x14 + rev_len

        # get length of pagemap description string, number of pages, and offset size in bits
        pm_1, pm_len, pm_nn, pm_bits = struct.unpack_from('>4H', self.data, ptr)
        print("%s %s %s %s" % (pm_1, pm_len, pm_nn, pm_bits))

        # extract pagemap string, and offset table
        table_start = ptr + 8 + pm_len
        pmstr = self.data[ptr + 8:table_start]
        pmoff = self.data[table_start:]

        # generate the names
        self._parseNames(pm_nn, pmstr)

        # create corresponding table of offsets into assembled_text.dat:
        # pm_nn big-endian values, 16 bits wide when pm_bits is 16 and
        # 32 bits wide otherwise, read with a single counted unpack.
        code = 'H' if pm_bits == 16 else 'L'
        self.pageoffsets.extend(
            struct.unpack_from('>%d%s' % (pm_nn, code), pmoff, 0))

    def _parseNames(self, numpages, data):
        """Fill self.pagenames from the page-map description string.

        numpages -- number of pages; that many None placeholders are appended
                    to self.pagenames before the description is applied.
        data     -- description string made of "(position,type,value)" groups.

        Each group names the pages from its 1-based position through the last
        page, so a later group overrides an earlier one from its position on.
        Type 'a' counts up in decimal from value, 'r' counts up in lowercase
        roman numerals from value, and 'c' takes labels from value separated
        by '|', repeating the last label once the list runs out.

        Raises dumpAPNXException for any other type and ValueError for a group
        that does not have the expected fields.
        """
        self.pagenames.extend([None] * numpages)
        for m in _tup_pattern.finditer(data):
            # Split on the first two commas only, so that a custom label
            # containing a comma stays inside the value field.
            spos, nametype, svalue = m.group(1).split(",", 2)
            print("%s %s %s" % (spos, nametype, svalue))
            if nametype in ('a', 'r'):
                svalue = int(svalue)
            spos = int(spos)
            for i in range(spos - 1, numpages):
                if nametype == 'r':
                    pname = _int_to_roman(svalue)
                    svalue += 1
                elif nametype == 'a':
                    pname = "%s" % svalue
                    svalue += 1
                elif nametype == 'c':
                    sp = svalue.find('|')
                    if sp == -1:
                        # no separator left: reuse the remaining label
                        pname = svalue
                    else:
                        pname = svalue[0:sp]
                        svalue = svalue[sp+1:]
                else:
                    raise dumpAPNXException("Error: unknown page numbering type %s" % nametype)
                self.pagenames[i] = pname

    def getNames(self):
        """Return the list of page names (None where no name was assigned)."""
        return self.pagenames

    def getOffsets(self):
        """Return the list of page offsets read from the offset table."""
        return self.pageoffsets



def usage(progname):
    """Print the command-line help text for this tool.

    progname -- program name substituted into the usage line.
    """
    help_lines = (
        "",
        "Description:",
        "   Decode APNX file",
        "  ",
        "Usage:",
        "  %s -h infile.apnx" % progname,
        "  ",
        "Options:",
        "    -h           print this help message",
    )
    for text in help_lines:
        print(text)


def main(argv=None):
    """Command-line entry point: decode one APNX file and print its page map.

    argv -- list of utf-8 encoded argument bytestrings with the program name
            first.  When omitted, utf8_argv() is called at call time rather
            than once at import time.

    Returns 0 on success and 1 when the file extension is wrong or the file
    cannot be processed.  Exits through sys.exit with status 2 on a usage
    error and status 0 after printing help for -h.
    """
    if argv is None:
        argv = utf8_argv()

    print("Decode APNX")
    progname = os.path.basename(argv[0])
    try:
        opts, args = getopt.getopt(argv[1:], "h")
    except getopt.GetoptError as err:
        print(str(err))
        usage(progname)
        sys.exit(2)

    # Handle -h before checking the argument count, so that asking for help
    # without naming a file is not reported as a usage error.
    for o, a in opts:
        if o == "-h":
            usage(progname)
            sys.exit(0)

    if len(args) != 1:
        usage(progname)
        sys.exit(2)

    infile = args[0]
    infileext = os.path.splitext(infile)[1].upper()
    print("%s %s" % (infile, infileext))
    if infileext != '.APNX':
        print("Error: first parameter must be an APNX file.")
        return 1

    try:
        # read the whole file, closing the handle as soon as the read is done
        with open(pathof(infile), 'rb') as f:
            apnxdata = f.read()
        # front pad with 8 nonsense bytes to match structure of PAGE section built by kindlegen
        apnxdata = "00000000" + apnxdata

        pp = PageMapProcessor(apnxdata)
        print(pp.getNames())
        print(pp.getOffsets())

    except Exception as e:
        # top-level boundary: report any failure and signal it via the return code
        print("Error: %s" % e)
        return 1

    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
