#! /usr/bin/python
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

import sys
import os
import re
import struct
import getopt
import codecs

# Because Windows (and Mac OS X) allow full unicode filenames and paths,
# any paths in pure bytestring Python 2.x code must be utf-8 encoded, as
# they will need to be converted on the fly to unicode on Windows
# platforms.  Any other 8-bit str encoding would lose characters that
# cannot be represented in that encoding.

# These simple support routines allow utf-8 encoded bytestrings used as
# paths in the main program to be converted on the fly to full unicode as
# temporary unnamed values, preventing the potential mixing of unicode
# and bytestring string values in the main program.

# True when the interpreter reports a platform name beginning with 'win'
_iswindows = sys.platform[:3] == 'win'

# convert a utf-8 encoded path string to the proper type for the platform:
# full unicode on Windows, utf-8 encoded bytestring on Mac OS X and Linux

def pathof(s):
    global _iswindows
    if s is None:
        return None
    if isinstance(s, unicode):
        print "Warning: pathof expects utf-8 encoded byestring: ", s
        if _iswindows:
            return s
        return s.encode('utf-8')
    if _iswindows:
        return s.decode('utf-8')
    return s


# properly get sys.argv arguments and encode them into utf-8
def utf8_argv():
    """Return the command-line arguments as utf-8 encoded bytestrings.

    On Windows the command line is re-read through the wide-character
    Win32 API and each argument is encoded as utf-8.  On other platforms
    every sys.argv entry is transcoded to utf-8 from the encoding reported
    by stdin, falling back to the filesystem encoding and then to utf-8.

    Returns a list of bytestrings, or None if Windows reports no
    arguments at all.
    """
    if _iswindows:
        # Versions 2.x of Python don't support Unicode in
        # sys.argv on Windows, with the underlying Windows API
        # instead replacing multi-byte characters with '?'.
        # So use shell32.GetCommandLineArgvW to get sys.argv
        # as a list of Unicode strings and encode them as utf-8

        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable and commands if present
            start = argc.value - len(sys.argv)
            return [argv[i].encode('utf-8') for i in
                    xrange(start, argc.value)]
        # this should never happen
        return None

    # sys.stdin may have been replaced by an object that has no `encoding`
    # attribute, so look it up defensively instead of assuming it exists
    argvencoding = getattr(sys.stdin, 'encoding', None)
    if argvencoding is None:
        argvencoding = sys.getfilesystemencoding()
    if argvencoding is None:
        argvencoding = 'utf-8'

    argv = []
    for arg in sys.argv:
        if isinstance(arg, unicode):
            argv.append(arg.encode('utf-8'))
            continue
        try:
            argv.append(arg.decode(argvencoding).encode('utf-8'))
        except UnicodeDecodeError:
            # The bytes are not valid in the guessed encoding.  Keep them
            # untouched rather than aborting: pathof() hands bytestrings
            # through unchanged on non-Windows platforms.
            argv.append(arg)
    return argv


# Tag names that resc_tag_iter reports twice: once when the tag opens and
# once (with '-end' appended to the name) when it closes.
_OPF_PARENT_TAGS = ['xml', 'package', 'metadata', 'dc-metadata', 'x-metadata', 'manifest', 'spine', 'tours', 'guide']


class RESCProcessor(object):
    """Minimal tag scanner that pulls spine and metadata info out of RESC data.

    After parseData() has run:
      cover_name      - 'content' of the tag whose 'name' attribute is "cover"
      extrameta       - list of [tname, tattr, tcontent] for every other
                        'meta' / 'dc:*' tag (tattr is None when the tag
                        carried no attributes)
      spine_ppd       - page progression direction read from the spine tag
      spine_order     - skelid of every itemref, in document order
      spine_idrefs    - skelid -> 'idref' attribute
      spine_pageprops - skelid -> 'properties' attribute
    """

    def __init__(self, filename):
        """Read the whole file named by the utf-8 bytestring path *filename*."""
        self.filename = filename
        # close the handle deterministically instead of leaving it to the GC
        with open(pathof(self.filename), 'rb') as f:
            self.resc = f.read()
        self.codec = 'utf-8'
        self.opos = 0
        self.extrameta = []
        self.cover_name = None
        self.spine_idrefs = {}
        self.spine_order = []
        self.spine_pageprops = {}
        self.spine_ppd = None

    # iterate through the tags in the RESC
    def resc_tag_iter(self):
        """Yield (prefix, tname, tattr, tcontent) for the tags in the data.

        prefix   - 'name.' of every begin tag still open, concatenated; the
                   event for an opening parent tag already includes itself
        tname    - lower-cased tag name; the closing tag of a member of
                   _OPF_PARENT_TAGS is reported as name + '-end'
        tattr    - attribute dict or None; for a closing tag this is the
                   attribute dict remembered from the most recent opening
                   tag that is not in _OPF_PARENT_TAGS
        tcontent - last text chunk seen before the tag with trailing spaces
                   and line breaks removed, or None

        Opening tags that are not in _OPF_PARENT_TAGS produce no event of
        their own; they are reported when their closing tag is reached.
        """
        tcontent = last_tattr = None
        prefix = ['']
        while True:
            text, tag = self.parseresc()
            if text is None and tag is None:
                break
            if text is not None:
                tcontent = text.rstrip(" \r\n")
                continue
            # we have a tag
            ttype, tname, tattr = self.parsetag(tag)
            if ttype == "begin":
                tcontent = None
                prefix.append(tname + '.')
                if tname in _OPF_PARENT_TAGS:
                    yield "".join(prefix), tname, tattr, tcontent
                else:
                    last_tattr = tattr
                continue
            # single or end
            if ttype == "end":
                prefix.pop()
                tattr = last_tattr
                last_tattr = None
                if tname in _OPF_PARENT_TAGS:
                    tname += '-end'
            yield "".join(prefix), tname, tattr, tcontent
            tcontent = None

    # now parse the RESC to extract spine and extra metadata info
    def parseData(self):
        """Walk all tags and fill in the spine and metadata attributes."""
        for prefix, tname, tattr, tcontent in self.resc_tag_iter():
            # parsetag reports "no attributes" as None; use an empty dict for
            # the lookups so attribute-less tags such as
            # <dc:language>en</dc:language> do not raise AttributeError
            attrs = tattr if tattr is not None else {}
            if tname == "spine":
                # Prefer the correctly spelled attribute name.  The
                # misspelled key is what this code has always looked up, so
                # it is kept as a fallback (presumably it occurs in real
                # RESC data - TODO confirm).
                ppd = attrs.get("page-progression-direction", None)
                if ppd is None:
                    ppd = attrs.get("page-progession-direction", None)
                self.spine_ppd = ppd
            if tname == "itemref":
                skelid = attrs.get("skelid", None)
                if skelid is None and len(self.spine_order) == 0:
                    # assume it was removed initial coverpage
                    skelid = "coverpage"
                self.spine_order.append(skelid)
                self.spine_idrefs[skelid] = attrs.get("idref", None)
                self.spine_pageprops[skelid] = attrs.get("properties", None)
            if tname == "meta" or tname.startswith("dc:"):
                if attrs.get("name", "") == "cover":
                    self.cover_name = attrs.get("content", None)
                else:
                    # store tattr as the iterator delivered it (may be None)
                    self.extrameta.append([tname, tattr, tcontent])

    # parse and return either leading text or the next tag
    def parseresc(self):
        """Return the next chunk of the data as (text, tag).

        Exactly one of the two is not None, except at end of data where
        (None, None) is returned.  self.opos is advanced past the chunk.
        """
        p = self.opos
        if p >= len(self.resc):
            return None, None
        if self.resc[p] != '<':
            res = self.resc.find('<', p)
            if res == -1:
                res = len(self.resc)
            self.opos = res
            return self.resc[p:res], None
        # handle comment as a special case
        if self.resc[p:p+4] == '<!--':
            te = self.resc.find('-->', p+1)
            if te != -1:
                te = te + 2
        else:
            te = self.resc.find('>', p+1)
            ntb = self.resc.find('<', p+1)
            if ntb != -1 and ntb < te:
                # a new '<' shows up before this one is closed: treat the
                # stray '<...' run as plain text
                self.opos = ntb
                return self.resc[p:ntb], None
        if te == -1:
            # Unterminated tag or comment: hand back the remainder as text.
            # Setting opos to te + 1 here would rewind it to 0 and the tag
            # iterator would never finish.
            self.opos = len(self.resc)
            return self.resc[p:], None
        self.opos = te + 1
        return None, self.resc[p:te+1]

    # parses tag to identify:  tname: name;
    # ttype: type 'begin', 'end' or 'single';
    # tattr: dictionary of its attributes
    def parsetag(self, s):
        """Split the tag string *s* ('<...>') into (ttype, tname, tattr).

        ttype is 'begin', 'end' or 'single'; tname is the lower-cased tag
        name ('?xml' is reported as 'xml', comments as '!--'); tattr is a
        dict of attributes or None when there are none.  A comment is a
        'single' tag whose text is stored under the 'comment' key.
        """
        n = len(s)
        p = 1
        tname = None
        ttype = None
        tattr = {}
        while s[p:p+1] == ' ':
            p += 1
        if s[p:p+1] == '/':
            ttype = 'end'
            p += 1
            while s[p:p+1] == ' ':
                p += 1
        b = p
        if s[p:p+3] == '!--':
            # comment opener: stop right behind it so '<!--text-->' (no
            # space after the opener) is still recognised as a comment
            p += 3
        else:
            # the p < n guard ends the scan on input lacking a terminator
            while p < n and s[p:p+1] not in ('>', '/', ' ', '"', "'", "\r", "\n"):
                p += 1
        tname = s[b:p].lower()
        # some special cases
        if tname == "?xml":
            tname = "xml"
        if tname == "!--":
            ttype = 'single'
            comment = s[p:-3].strip()
            tattr['comment'] = comment
        if ttype is None:
            # parse any attributes of begin or single tags
            while s.find('=', p) != -1:
                while s[p:p+1] == ' ':
                    p += 1
                b = p
                while s[p:p+1] != '=':
                    p += 1
                # strip all surrounding whitespace so attributes that start
                # on a new line are not stored under names like '\n href'
                aname = s[b:p].strip().lower()
                p += 1
                while s[p:p+1] == ' ':
                    p += 1
                quote = s[p:p+1]
                if quote in ('"', "'"):
                    p += 1
                    b = p
                    # only the quote character that opened the value closes
                    # it, so "it's" stays intact; p < n stops the scan when
                    # the closing quote is missing
                    while p < n and s[p:p+1] != quote:
                        p += 1
                    val = s[b:p]
                    p += 1
                else:
                    b = p
                    while p < n and s[p:p+1] not in ('>', '/', ' '):
                        p += 1
                    val = s[b:p]
                tattr[aname] = val
        if not tattr:
            tattr = None
        if ttype is None:
            ttype = 'begin'
            # a '/' anywhere after the last attribute marks a self-closing tag
            if s.find('/', p) >= 0:
                ttype = 'single'
        return ttype, tname, tattr


def usage(progname):
    print ""
    print "Description:"
    print "   Decode RESCXXXXX.dat"
    print "  "
    print "Usage:"
    print "  %s -h PATH_TO_UNPACKED_RESCXXXXX.dat" % progname
    print "  "
    print "Options:"
    print "    -h           print this help message"


def main(argv=utf8_argv()):
    progname = os.path.basename(argv[0])
    try:
        opts, args = getopt.getopt(argv[1:], "h")
    except getopt.GetoptError, err:
        print str(err)
        usage(progname)
        sys.exit(2)

    if len(args) != 1:
        usage(progname)
        sys.exit(2)

    for o, a in opts:
        if o == "-h":
            usage(progname)
            sys.exit(0)

    infile = args[0]
    try:
        print 'Parsing RESC'
        rp = RESCProcessor(infile)
        rp.parseData()
        print "cover name: ", rp.cover_name
        print "extra metadata"
        for tname, tattr, tcontent in rp.extrameta:
            print "    ", tname, tattr, tcontent
        print "spine ppd: ",rp.spine_ppd
        for key in rp.spine_order:
            print key, rp.spine_idrefs[key], rp.spine_pageprops[key]
        print 'Completed'
    except ValueError, e:
        print "Error: %s" % e
        return 1
    return 0

if __name__ == "__main__":
    # hand main()'s return value to the interpreter as the exit status
    raise SystemExit(main())

