#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

import sys
import os
import unicodedata

from collections import OrderedDict

# Separator character; not referenced by the parser below.
_SEP_ = "|"

# Maps a special tag name to (tag type, backstep).  ``backstep`` is the
# negative slice end used to trim the closing delimiter from the raw tag
# text when its content is stored under the 'special' attribute.
SPECIAL_HANDLING_TAGS = OrderedDict([
    ('?xml', ('xmlheader', -1)),
    ('!--', ('comment', -3)),
    ('!DOCTYPE', ('doctype', -1)),
    ('?', ('pi', -1))
])

SPECIAL_HANDLING_TYPES = ['xmlheader', 'comment', 'doctype', 'pi']

# Elements that never have content or an end tag; they are reported as
# 'single' even when written without a trailing '/'.
VOID_TAGS = ("area","base","basefont","bgsound","br","col","command",
    "embed","event-source","frame","hr","img","input","keygen",
    "link","menuitem","meta","param","source","spacer","track","wbr",
    "mbp:pagebreak")


class HTMLLangTextParser(object):
    """Tokenize (X)HTML into text runs and tags while tracking the language.

    The language in effect for each element is taken from its ``lang``
    attribute (falling back to ``xml:lang``), otherwise it is inherited from
    the enclosing element; ``default_lang`` applies outside any element.
    """

    # characters treated as whitespace inside a tag
    _WHITESPACE = (' ', '\f', '\t', '\r', '\n')
    # characters that terminate a tag name
    _NAME_END = ('>', '/') + _WHITESPACE

    def __init__(self, default_lang, data, codec = 'utf-8'):
        """Store the markup to parse.

        default_lang -- language reported for text outside any element
        data         -- markup as str or bytes; None is treated as ''
        codec        -- encoding used to decode ``data`` when it is bytes
        """
        if data is None:
            data = ''
        if isinstance(data, bytes):
            data = data.decode(codec)
        self.content = data
        self.clen = len(self.content)

        # parser position info
        self.pos = 0            # offset of the next unparsed character
        self.pp = 0             # offset where the last parseml() item began
        self.start_offset = 0   # offset of the item currently being yielded

        # to track tag nesting; taglang is always one longer than tagpath
        self.tagpath = []
        self.taglang = [default_lang]

    @staticmethod
    def _skip_ws(s, p):
        """Return the index of the first non-whitespace character at or after p."""
        end = len(s)
        while p < end and s[p] in HTMLLangTextParser._WHITESPACE:
            p += 1
        return p

    def _parse_attributes(self, s, p, tattr):
        """Collect name=value pairs of tag ``s`` from index ``p`` into ``tattr``.

        Returns the index just past the last value parsed.  Valueless
        attributes are ignored.
        """
        taglen = len(s)
        eq = s.find('=', p)
        while eq != -1:
            # Use the last whitespace-delimited token before '=' so that
            # newlines/tabs and preceding valueless attributes do not end
            # up inside the attribute name.
            tokens = s[p:eq].lower().split()
            aname = tokens[-1] if tokens else ''
            p = self._skip_ws(s, eq + 1)
            if s[p:p+1] in ('"', "'"):
                qt = s[p]
                b = p + 1
                e = s.find(qt, b)
                if e == -1:
                    # unterminated quote: take everything up to the closing '>'
                    e = taglen - 1 if s.endswith('>') else taglen
                val = s[b:e]
                p = min(e + 1, taglen)
            else:
                b = p
                while p < taglen and s[p] != '>' and s[p] not in self._WHITESPACE:
                    p += 1
                # a '/' directly before the closing '>' is the self-closing
                # marker, not part of the value; leave it for the caller
                if s[b:p].endswith('/') and s[p:p+1] in ('>', ''):
                    p -= 1
                val = s[b:p]
            tattr[aname] = val
            eq = s.find('=', p)
        return p

    def parsetag(self, s):
        """Parse the raw text of one tag, e.g. '<p lang="en">'.

        Returns (tname, ttype, tattr):
          tname -- lower-cased tag name ('!--', '?', '?xml' and '!DOCTYPE'
                   for the special tags)
          ttype -- 'begin', 'end', 'single' or one of SPECIAL_HANDLING_TYPES
          tattr -- OrderedDict of attributes; special tags carry their raw
                   content under the key 'special'
        """
        taglen = len(s)
        tname = None
        ttype = None
        tattr = OrderedDict()

        p = self._skip_ws(s, 1)
        if s[p:p+1] == '/':
            ttype = 'end'
            p = self._skip_ws(s, p + 1)
        b = p

        # handle special case of comment as there may be no spaces to
        # delimit name begin or end
        if s[b:b+3] == "!--":
            tname = "!--"
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[b+3:backstep]
            return tname, ttype, tattr

        # get the tag name (bounded so a truncated tag cannot loop forever)
        while p < taglen and s[p] not in self._NAME_END:
            p += 1
        tname = s[b:p].lower()

        # generic xml processing instruction (pi); the xml declaration
        # itself is left to the SPECIAL_HANDLING_TAGS lookup below
        if s[b:b+1] == "?" and tname != "?xml":
            tname = "?"
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[b+1:backstep]
            return tname, ttype, tattr

        if tname == '!doctype':
            tname = '!DOCTYPE'
        # other special cases
        if tname in SPECIAL_HANDLING_TAGS:
            ttype, backstep = SPECIAL_HANDLING_TAGS[tname]
            tattr['special'] = s[p:backstep]

        if ttype is None:
            p = self._parse_attributes(s, p, tattr)
            # label beginning and single tags
            ttype = 'begin'
            if s.find('/', p) >= 0 or tname in VOID_TAGS:
                ttype = 'single'
        return tname, ttype, tattr

    def parseml(self):
        """Return the next item of the content as (leading text, tag).

        Exactly one of the two is not None; (None, None) signals the end of
        the content.  A '<' that does not start a complete tag (another '<'
        comes before the next '>', or the tag/comment is never closed) is
        returned as text.
        """
        self.pp = self.pos
        p = self.pos
        if p >= self.clen:
            return None, None
        if self.content[p] != '<':
            res = self.content.find('<', p)
            if res == -1:
                res = self.clen
            self.pos = res
            return self.content[p:res], None
        # handle comment as a special case to deal with multi-line comments
        if self.content[p:p+4] == '<!--':
            te = self.content.find('-->', p+1)
            if te != -1:
                te += 2
        else:
            te = self.content.find('>', p+1)
            ntb = self.content.find('<', p+1)
            if ntb != -1 and ntb < te:
                self.pos = ntb
                return self.content[p:ntb], None
        if te == -1:
            # never closed: hand back the remainder as text instead of
            # rewinding the position and parsing forever
            self.pos = self.clen
            return self.content[p:], None
        self.pos = te + 1
        return None, self.content[p:te+1]

    def _close_tag(self, tname):
        """Pop open elements up to and including the innermost ``tname``.

        An end tag with no matching open element is ignored.
        """
        if tname not in self.tagpath:
            return
        while self.tagpath:
            closed = self.tagpath.pop()
            self.taglang.pop()
            if closed == tname:
                break

    def parse_iter(self):
        """Yield one tuple per text run or tag in the content.

        Each tuple is (text, tagpath prefix, language, start offset,
        tag name, tag type, tag attributes).  For a text run the last three
        are None; for a tag, text is None.  The tagpath prefix is the dotted
        list of open parent ("begin") tags before the item was processed;
        the language is the one in effect after it was processed.
        """
        while True:
            text, tag = self.parseml()
            if text is None and tag is None:
                break

            self.start_offset = self.pp
            tp = ".".join(self.tagpath)
            tname = ttype = tattr = None

            if tag is not None:
                tname, ttype, tattr = self.parsetag(tag)

                if ttype == "begin":
                    self.tagpath.append(tname)
                    lang = tattr.get("xml:lang", None)
                    lang = tattr.get("lang", lang)
                    # elements without their own language inherit the parent's
                    self.taglang.append(lang if lang else self.taglang[-1])
                elif ttype == "end":
                    self._close_tag(tname)

            yield text, tp, self.taglang[-1], self.start_offset, tname, ttype, tattr

    def langtext_iter(self):
        """Yield (start offset, language, text) for each text run.

        Text whose innermost enclosing element is 'style' or 'script' is
        skipped.
        """
        for text, tp, lang, soffset, tname, ttype, tattr in self.parse_iter():
            if text and tp.rsplit('.', 1)[-1] not in ('style', 'script'):
                yield soffset, lang, text

    
def main():
    """Run the parser over a built-in XHTML sample and print each text run.

    Whitespace-only runs are skipped.  Returns 0, which the script entry
    point uses as the process exit status.
    """
    sample = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" lang="es" xml:lang="pl">
<head>
<title>This is a title!</title>
</head>
<body lang="de">
<p>&nbsp;</p>
<p lang="en">This is a line-of-text.</p>
<p lang="en">Hello!  This is a question?  This "quoted"</p>
<p lang="en">In english: <span lang="es">spanish</span> and english again.</p>
</body>
</html>
'''

    parser = HTMLLangTextParser("en", sample)
    for soffset, lang, text in parser.langtext_iter():
        # nothing worth showing for runs that are only whitespace
        if text.strip() == "":
            continue
        print("offset: ", soffset, "code: "+ lang," " + text)
    return 0

# Run the demo only when executed as a script; main()'s return value is
# passed to sys.exit() as the process exit status.
if __name__ == '__main__':
    sys.exit(main())

