#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2011, BurbleBurble <mobireads_forum> <NAMLEHMIARFE>'

import os
import os.path
import shutil
import zipfile
import codecs
import time
import re
from lxml import etree
import lxml.html

class Htmlz:
    """Handle for an HTMLZ book that has been unpacked to disk.

    basepath -- directory that holds the unpacked content
    opf      -- parsed metadata document; None until one is attached
    """

    def __init__(self, basepath):
        #no metadata yet; it is attached after construction
        self.opf = None
        self.basepath = basepath

class Reader():
    """Unpacks an HTMLZ archive into a working directory."""

    def __init__(self):
        pass

    def read(self, path_to_htmlz, path_to_tempdir):
        """Unpack path_to_htmlz and return the resulting Htmlz object.

        The content is extracted into an 'HTMLZ' directory below
        path_to_tempdir, and the parsed 'metadata.opf' is stored on the
        returned object's opf attribute.
        """
        basepath = os.path.join(path_to_tempdir, 'HTMLZ')
        htmlz = Htmlz(basepath)
        with zipfile.ZipFile(path_to_htmlz, 'r') as htmlz_file:
            #metadata first, then the actual content
            self.__readOpf(htmlz_file, htmlz)
            self.__readFiles(htmlz_file, htmlz)
        return htmlz

    def __readFiles(self, htmlz_file, htmlz):
        """Extract the cover, the html and every member below 'images/'.

        'cover.jpg' and 'index.html' are extracted unconditionally, so
        ZipFile.extract fails (KeyError) for an archive lacking either.
        Any other member of the archive is left alone.
        """
        destination = htmlz.basepath
        #cover, then html
        for required in ('cover.jpg', 'index.html'):
            htmlz_file.extract(required, destination)
        #images
        image_members = []
        for member in htmlz_file.namelist():
            if member.startswith('images/'):
                image_members.append(member)
        htmlz_file.extractall(destination, image_members)

    def __readOpf(self, htmlz_file, htmlz):
        """Parse 'metadata.opf' straight from the archive into htmlz.opf."""
        opf_member = htmlz_file.open('metadata.opf', mode='r')
        with opf_member as opf_file:
            htmlz.opf = etree.parse(opf_file)


class Writer():
    """Packs an unpacked HTMLZ working directory back into an archive."""

    def __init__(self):
        pass

    def write(self, htmlz, path_to_destination):
        """Create the archive path_to_destination from htmlz.

        htmlz               -- object with a basepath directory holding
                               'cover.jpg', 'index.html' and optionally
                               'images', plus the parsed metadata in opf
        path_to_destination -- file name of the archive to create; the
                               containing directory is created if needed

        The archive is written deflated.  Afterwards the destination
        directory is handed to os.startfile where that function exists.
        """
        #a bare file name has an empty dirname, which os.makedirs rejects;
        #resolving the path first makes "book.htmlz" mean the current directory
        destination_dir = os.path.dirname(os.path.abspath(path_to_destination))
        if not os.path.exists(destination_dir):
            os.makedirs(destination_dir)
        with zipfile.ZipFile(path_to_destination, 'w', zipfile.ZIP_DEFLATED) as htmlz_file:
            self.__writeOpf(htmlz, htmlz_file)
            self.__writeFiles(htmlz, htmlz_file)
        #os.startfile is only available on Windows; elsewhere the archive is
        #already complete, so there is nothing left to do
        if hasattr(os, 'startfile'):
            os.startfile(destination_dir)

    def __writeFiles(self, htmlz, htmlz_file):
        """Add the cover, the html and all files below 'images' to the archive."""
        #cover
        htmlz_file.write(os.path.join(htmlz.basepath, 'cover.jpg'), 'cover.jpg')
        #html
        htmlz_file.write(os.path.join(htmlz.basepath, 'index.html'), 'index.html')
        #images: a book without pictures has no images directory at all
        images_dir = os.path.join(htmlz.basepath, 'images')
        if not os.path.isdir(images_dir):
            return
        #walk instead of listdir so files in nested folders are kept too
        for folder, subfolders, filenames in os.walk(images_dir):
            #sorted traversal gives a reproducible member order
            subfolders.sort()
            for filename in sorted(filenames):
                fullpath = os.path.join(folder, filename)
                relative = os.path.relpath(fullpath, images_dir)
                #archive member names always use forward slashes
                htmlz_file.write(fullpath, 'images/' + relative.replace(os.sep, '/'))

    def __writeOpf(self, htmlz, htmlz_file):
        """Serialize htmlz.opf as pretty-printed UTF-8 XML into 'metadata.opf'."""
        htmlz_file.writestr('metadata.opf', etree.tostring(htmlz.opf, encoding='utf-8', method='xml', xml_declaration=True, pretty_print=True))

class Tools():
    def __init__(self, informationView=None):
        """Set up empty conversion state.

        informationView -- object whose append(str) receives the
                           '[WARNING]' and '[LOG]' messages; when omitted,
                           the messages are collected in a plain list
        """
        #start() and pattern() call informationView.append unconditionally,
        #so a missing view is replaced by a list instead of staying None
        if informationView is None:
            informationView = []
        self.informationView = informationView
        #document blocks; text seen before any block tag lands in this temp block
        self.blocks = [{'type':'temp', 'children':[]}]
        #one style dict per open element, seeded with an empty root frame
        self.style_stack = [{}]
        #maps a tuple of style names to its pattern number (as a string)
        self.pattern_map = {}
        #output fragments collected by write()
        self.html = []
        #distinct '&#NNN;' references met in character data
        self.unicode_chars = []

    def start(self, tag, attrs):
        """Handle an opening tag.

        tag   -- element name, optionally prefixed with a '{namespace}'
        attrs -- mapping of attribute name to value

        A new style frame (a copy of the parent's) is pushed for the
        element and updated from its 'style' attribute.  Block tags open a
        new block, 'img' adds an image child to the current block and
        'sup'/'sub' set a 'tag-…' marker in the frame.  Attributes and tags
        that are not handled are reported to informationView.
        """
        #every element gets its own frame so end() can simply drop it again
        frame = self.style_stack[-1].copy()
        self.style_stack.append(frame)
        for name in attrs:
            if name == 'style':
                frame.update(self.style(attrs['style']))
            elif name not in ('content', 'http-equiv', 'src', 'id'):
                #neither styling nor one of the attributes that carry no formatting
                self.informationView.append('[WARNING] Unsupported attribute: %s="%s"' %(name, attrs[name]))
        #remove namespace from tag
        tag = re.sub('{[^}]*}', '', tag)
        #block tags
        if tag in ('blockquote', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            self.blocks.append({'type':'block', 'children':[]})
            return
        #image tag: stored as ready-made markup inside the current block
        if tag == 'img':
            markup = '<img src="%s"/>' %attrs['src']
            self.blocks[-1]['children'].append({'text':markup, 'style':frame.copy()})
            return
        #formatting tags
        if tag in ('sup', 'sub'):
            frame['tag-%s' %tag] = 'True'
            return
        #everything that is not a known structural tag is reported
        if tag not in ('a', 'body', 'div', 'head', 'html', 'meta', 'span', 'title'):
            self.informationView.append('[WARNING] Unsupported tag: %s' %tag)

    def end(self, tag):
        """Handle a closing tag.

        Drops the element's style frame.  When a block tag closes, a fresh
        temp block is opened so that text following it has a place to go.
        """
        #update style
        self.style_stack.pop()
        #remove namespace from tag
        local_name = re.sub('{[^}]*}', '', tag)
        if local_name not in ('blockquote', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'):
            return
        #block tags: start temp block if previous block ended
        self.blocks.append({'type':'temp', 'children':[], 'style':{}})

    def data(self, data):
        """Record a chunk of character data in the current block.

        Runs of newline, carriage return, tab and space are collapsed to
        one space.  The chunk is then stored as up to three children
        (leading space, stripped text, trailing space), each with its own
        copy of the current style frame.  Every distinct '&#NNN;'
        reference found in the ASCII-encoded chunk is remembered in
        self.unicode_chars.  An empty chunk is ignored.
        """
        #remove basic whitespace
        data = re.sub('[\n\r\t ]+', ' ', data)
        #an empty chunk carries nothing, and data[0] below would fail on it
        if not data:
            return
        #convert unicode -> entity reference
        encoded = data.encode('ascii', 'xmlcharrefreplace').decode()
        for match in re.finditer('&#[0-9]+;', encoded):
            reference = match.group()
            if reference not in self.unicode_chars:
                self.unicode_chars.append(reference)
        #break data into leading\trailing whitespace and text
        children = self.blocks[-1]['children']
        current_style = self.style_stack[-1]
        text = data.strip()
        if data[0] == ' ':
            children.append({'text':' ', 'style':current_style.copy()})
        if text:
            children.append({'text':text, 'style':current_style.copy()})
        #len check: a chunk that is just ' ' was already added as leading space
        if len(data) > 1 and data[-1] == ' ':
            children.append({'text':' ', 'style':current_style.copy()})

    def close(self):
        """End-of-input hook; nothing is flushed here and None is returned."""
        return None

    def style(self, style):
        """Parse an inline CSS string into a {name: value} dict.

        All whitespace is removed and the text is lower-cased before it is
        split at ';'.  Each declaration is split at its first ':' only, so
        values that contain a colon themselves (e.g. 'url(http://…)') stay
        intact.  Fragments without any ':' are skipped.  When a name occurs
        more than once, the last value wins.
        """
        #NOTE(review): whitespace is stripped inside values as well
        #('0 1px' becomes '01px'); kept as is - confirm this is intended
        compact = re.sub('[\n\r\t ]+', '', style).lower()
        styles_dict = {}
        for declaration in compact.split(';'):
            name, colon, value = declaration.partition(':')
            #no colon: empty fragment (e.g. after a trailing ';') or not a declaration
            if not colon:
                continue
            styles_dict[name] = value
        return styles_dict

    def whitespace(self):
        """Normalize whitespace children and drop empty temp blocks.

        Within every block a run of ' ' children is reduced to its last
        member, then one leading and one trailing ' ' child are removed.
        Temp blocks left without children are removed from self.blocks.
        Both the children lists and self.blocks are changed in place.
        """
        surviving_blocks = []
        for blk in self.blocks:
            kids = blk['children']
            #convert multiple whitespace -> single: a space directly
            #followed by another space is the one that goes
            collapsed = [
                kid for position, kid in enumerate(kids)
                if not (kid['text'] == ' '
                        and position + 1 < len(kids)
                        and kids[position + 1]['text'] == ' ')
            ]
            #remove leading whitespace
            if collapsed and collapsed[0]['text'] == ' ':
                del collapsed[0]
            #remove trailing whitespace
            if collapsed and collapsed[-1]['text'] == ' ':
                del collapsed[-1]
            kids[:] = collapsed
            #remove empty temp blocks; every other block stays
            if blk['type'] != 'temp' or kids:
                surviving_blocks.append(blk)
        self.blocks[:] = surviving_blocks

    def merge(self):
        """Fuse neighbouring children of every block where possible.

        A ' ' child is always appended to the child before it, whatever
        its style.  Any other child is appended to its predecessor only if
        both have equal styles.  The fused child keeps the predecessor's
        style; the children lists are changed in place.
        """
        for blk in self.blocks:
            kids = blk['children']
            fused = []
            for kid in kids:
                #whitespace style doesn't matter, text needs the same style
                if fused and (kid['text'] == ' ' or fused[-1]['style'] == kid['style']):
                    fused[-1]['text'] += kid['text']
                else:
                    fused.append(kid)
            kids[:] = fused

    def upgrade(self):
        """Move styles shared by all children of a block onto the block.

        A style entry counts as shared when every child has the same name
        with the same value.  Shared entries are removed from the children
        and become the block's 'style'; a block's previous 'style' (if
        any) is replaced, and blocks without children get an empty one.
        """
        for blk in self.blocks:
            kids = blk['children']
            shared = {}
            if kids:
                #only entries of the first child can possibly be shared by all
                shared = {
                    name: value
                    for name, value in kids[0]['style'].items()
                    if all(name in kid['style'] and kid['style'][name] == value for kid in kids)
                }
                for kid in kids:
                    for name in shared:
                        kid['style'].pop(name)
            blk['style'] = shared

    def pattern(self):
        """Assign pattern numbers to styled blocks and spans.

        Heuristics used:
        1. A block or child with a non-empty style gets a 'pattern'; equal
           sets of style names share one number, new sets get the next
           free number (stored as a string in self.pattern_map).
        2. The number of empty blocks directly preceding a non-empty block
           is added to that block's style as 'emptyblock-count'.
        Finally the number of known patterns is reported to informationView.
        """
        #NOTE(review): the key is built from the style names only, so two
        #styles differing just in their values share a pattern, and no style
        #(italic, bold, ...) is excluded - confirm both are intended
        def assign(node):
            key = tuple(sorted(node['style']))
            if key not in self.pattern_map:
                self.pattern_map[key] = str(len(self.pattern_map) + 1)
            node['pattern'] = self.pattern_map[key]

        pending_empty = 0
        for blk in self.blocks:
            kids = blk['children']
            if not kids:
                pending_empty += 1
                continue
            #discover block level patterns
            if pending_empty:
                blk['style']['emptyblock-count'] = str(pending_empty)
                pending_empty = 0
            if blk['style']:
                assign(blk)
            #discover span level patterns
            for kid in kids:
                if kid['style']:
                    assign(kid)
        self.informationView.append('[LOG] Discovered %s patterns.' %str(len(self.pattern_map)))

    def write(self):
        """Render self.blocks as an html document and return it as a string.

        1. Blocks without children are skipped; they were already counted
           during pattern discovery.
        2. Every other block becomes a <p>, carrying its pattern and style
           (when present) as 'pattern' and 'style' attributes.
        3. A child with a pattern or a style is wrapped in a <span> with
           the same kind of attributes; other children are written bare.
        4. Blocks are separated by a newline; there is none after the last.

        The fragments are also left in self.html.  Each call starts from
        an empty buffer, so calling write() again returns the same
        document instead of appending a second copy.
        """
        #NOTE(review): child text and style values are written verbatim,
        #without html escaping - confirm the input is already safe markup
        def attributes(node):
            #attribute text for a <p>/<span>: pattern first, then style
            found = []
            if 'pattern' in node:
                found.append(' pattern="%s"' % node['pattern'])
            #blocks that never went through upgrade() have no 'style' key
            node_style = node.get('style')
            if node_style:
                css = ''.join([name + ':' + value + ';' for name, value in node_style.items()])
                found.append(' style="%s"' % css)
            return ''.join(found)

        #reuse the existing list object, but drop output of earlier calls
        del self.html[:]
        #html start tags
        self.html.append('<html>')
        self.html.append('<head>')
        self.html.append('<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>')
        self.html.append('</head>')
        self.html.append('<body>')
        #body of the html
        for block in self.blocks:
            if not block['children']:
                continue
            #add block start tag
            self.html.append('<p%s>' % attributes(block))
            #add children
            for child in block['children']:
                span_attributes = attributes(child)
                #add span start tag, if has pattern or style
                if span_attributes:
                    self.html.append('<span%s>' % span_attributes)
                #add text
                self.html.append(child['text'])
                #add span end tag, if has pattern or style
                if span_attributes:
                    self.html.append('</span>')
            #add block end tag
            self.html.append('</p>\n')
        #remove last newline - only if a paragraph was written at all,
        #otherwise the last fragment is '<body>' and must stay untouched
        if self.html[-1] == '</p>\n':
            self.html[-1] = '</p>'
        #html end tags
        self.html.append('</body>')
        self.html.append('</html>')
        #return joined html!
        return ''.join(self.html)

def main():
    """Entry point when the file is run as a script; currently does nothing."""
    return None

#only run when executed directly, not when imported
if __name__ == '__main__':
    main()
