#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2011, BurbleBurble <mobireads_forum> <NAMLEHMIARFE>'

import os
import os.path
import shutil
import zipfile
import codecs
import time
import re
from lxml import etree
import lxml.html


class Epub():
    '''
    Description of an epub whose files have been extracted to a directory.

    basepath is the directory holding the extracted files. The ocf_* and opf
    attributes start as None; in this file the reader assigns parsed lxml
    trees to ocf_container and opf, the other ocf_* attributes are never set.
    '''
    def __init__(self, basepath):
        #basepath; directory of extracted epub files
        self.basepath = basepath
        #ocf
        self.ocf_container = None
        self.ocf_encryption = None
        self.ocf_manifest = None
        self.ocf_metadata = None
        self.ocf_mimetype = None
        self.ocf_rights = None
        self.ocf_signatures = None
        #opf
        self.opf = None

    def addManifestItem(self, href, media_type, id_):
        '''
        Append an item element (href, media-type, id) to the first manifest
        element found in the opf.

        Raises IndexError if the opf has no manifest element.
        '''
        #item
        item_element = etree.Element('item')
        item_element.set('href', href)
        item_element.set('media-type', media_type)
        item_element.set('id', id_)
        #manifest
        manifest = self.opf.xpath('//*[local-name()="manifest"]')[0]
        manifest.append(item_element)

    def addSpineItemref(self, idref, index=None):
        '''
        Add an itemref element pointing at idref to the first spine element
        found in the opf.

        index; position among the spine's children at which the itemref is
        inserted. None (the default) appends it after the existing children.

        Raises IndexError if the opf has no spine element.
        '''
        #item
        itemref_element = etree.Element('itemref')
        itemref_element.set('idref', idref)
        #spine
        spine = self.opf.xpath('//*[local-name()="spine"]')[0]
        if index is None:
            spine.append(itemref_element)
        else:
            #previously index was accepted but silently ignored
            spine.insert(index, itemref_element)

    def getXhtmlXml(self):
        '''
        Return an ordered list of paths to xhtml/xml files in epub spine.

        Each spine itemref is resolved to the manifest item with the matching
        id; only items whose media-type is 'application/xhtml+xml' are kept
        (items without a media-type are skipped). Paths are built from
        basepath, the directory of the opf and the item's href, then
        normalized.

        Raises IndexError if the container has no opf rootfile or an itemref
        has no matching manifest item.
        '''
        #the manifest hrefs are resolved against the directory of the opf file
        rootfile = self.ocf_container.xpath('//*[local-name()="rootfile"][@media-type="application/oebps-package+xml"]')[0]
        opf_directory = os.path.dirname(rootfile.get('full-path'))
        #xhtml/xml
        xhtml_xml = []
        for itemref in self.opf.xpath('//*[local-name()="spine"]/*[local-name()="itemref"]'):
            item = self.opf.xpath('//*[local-name()="manifest"]/*[local-name()="item"][@id="%s"]' %itemref.attrib['idref'])[0]
            if item.get('media-type') == 'application/xhtml+xml':
                xhtml_xml.append(os.path.normpath(os.path.join(self.basepath, opf_directory, item.get('href'))))
        return xhtml_xml


class EpubReader():
    '''
    Builds an Epub object from an epub archive.

    The ocf container and the opf are parsed into lxml trees and every file
    listed in the opf manifest is extracted below the Epub's basepath. The
    remaining ocf parts are only logged as not supported.
    '''
    def __init__(self):
        self.epub_object = None

    def read(self, path_to_epub, path_to_tempdir):
        '''
        EPUB 2.0.1
        http://idpf.org/epub/201

        Read the archive at path_to_epub and return the resulting Epub
        object. Files are extracted to the 'RAW' directory inside
        path_to_tempdir.
        '''
        print('[LOG]Reading epub...')
        raw_directory = os.path.join(path_to_tempdir, 'RAW')
        self.epub_object = Epub(basepath=raw_directory)
        with zipfile.ZipFile(path_to_epub, 'r') as epub_file:
            #the opf location comes from the ocf container, and the manifest from the opf
            for step in (self.__readOcf, self.__readOpf, self.__readFiles):
                step(epub_file)
        return self.epub_object

    def __pathToOpf(self):
        #as stated in the ocf documentation, the first rootfile element (in the ocf_container) of the media_type 'application/oebps-package+xml' is the package's opf file
        rootfiles = self.epub_object.ocf_container.xpath('//*[local-name()="rootfile"][@media-type="application/oebps-package+xml"]')
        return rootfiles[0].get('full-path')

    def __readFiles(self, epub_file):
        print('[LOG]Reading files...')
        for step in (self.__readFilesFromOcfContainer, self.__readFilesFromOpf):
            step(epub_file)

    def __readFilesFromOcfContainer(self, epub_file):
        #as stated in the ocf documentation, there may be multiple versions of the publication (pdf printing for ex) listed in the ocf container
        print('[LOG]Reading files from the ocf container... not supported.')

    def __readFilesFromOpf(self, epub_file):
        '''
        Extract every manifest item of the opf into the Epub's basepath.
        '''
        print('[LOG]Reading files from the opf...')
        opf_directory = os.path.dirname(self.__pathToOpf())
        destination = self.epub_object
        for item_element in self.epub_object.opf.xpath('//*[local-name()="manifest"]/*[local-name()="item"]'):
            member = item_element.get('href')
            if opf_directory: #only prefix when there is a directory; avoid starting with '/'
                member = opf_directory + '/' + member
            epub_file.extract(member, destination.basepath)

    def __readOcf(self, epub_file):
        '''
        OCF 2.0.1
        http://idpf.org/epub/20/spec/OCF_2.0.1_draft.doc
        '''
        print('[LOG]Reading ocf...')
        ocf_steps = (self.__readOcfMimetype,
                     self.__readOcfContainer,
                     self.__readOcfEncryption,
                     self.__readOcfManifest,
                     self.__readOcfMetadata,
                     self.__readOcfRights,
                     self.__readOcfSignatures)
        for step in ocf_steps:
            step(epub_file)

    def __readOcfContainer(self, epub_file):
        '''
        Parse META-INF/container.xml into epub_object.ocf_container.
        '''
        print('[LOG]Reading ocf container')
        with epub_file.open('META-INF/container.xml', mode='r') as container_stream:
            parsed_container = etree.parse(container_stream)
            self.epub_object.ocf_container = parsed_container

    def __readOcfEncryption(self, epub_file):
        print('[LOG]Reading ocf encryption... not supported.')

    def __readOcfManifest(self, epub_file):
        print('[LOG]Reading ocf manifest... not supported.')

    def __readOcfMetadata(self, epub_file):
        print('[LOG]Reading ocf metadata... not supported.')

    def __readOcfMimetype(self, epub_file):
        print('[LOG]Reading ocf mimetype... not supported.')

    def __readOcfRights(self, epub_file):
        print('[LOG]Reading ocf rights... not supported.')

    def __readOcfSignatures(self, epub_file):
        print('[LOG]Reading ocf signatures... not supported.')

    def __readOpf(self, epub_file):
        '''
        OPF 2.0.1
        http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm

        Parse the opf named by the ocf container into epub_object.opf.
        '''
        print('[LOG]Reading opf...')
        with epub_file.open(self.__pathToOpf(), mode='r') as opf_stream:
            parsed_opf = etree.parse(opf_stream)
            self.epub_object.opf = parsed_opf


class EpubWriter():
    '''
    Writes an Epub object out as an epub archive.

    The mimetype entry is written first and stored uncompressed, followed by
    the ocf container, the opf and every file listed in the opf manifest. The
    remaining ocf parts are only logged as not supported.
    '''
    def __init__(self):
        self.epub_object = None

    def write(self, epub_object, path_to_epub):
        '''
        EPUB 2.0.1
        http://idpf.org/epub/201

        Write epub_object to a new archive at path_to_epub.
        '''
        print('[LOG]Writing epub...')
        self.epub_object = epub_object
        with zipfile.ZipFile(path_to_epub, 'w', zipfile.ZIP_DEFLATED) as epub_file:
            for step in (self.__writeOcf, self.__writeOpf, self.__writeFiles):
                step(epub_file)

    def __pathToOpf(self):
        #as stated in the ocf documentation, the first rootfile element (in the ocf_container) of the media_type 'application/oebps-package+xml' is the package's opf file
        rootfiles = self.epub_object.ocf_container.xpath('//*[local-name()="rootfile"][@media-type="application/oebps-package+xml"]')
        return rootfiles[0].get('full-path')

    def __newZipInfo(self, archive_name, compress_type):
        #archive entry stamped with the current gmt time; every generated entry gets the same external attributes
        info = zipfile.ZipInfo(archive_name, time.gmtime())
        info.compress_type = compress_type
        info.external_attr = 0x81a40000
        return info

    def __serialize(self, tree):
        #utf-8 bytes with xml declaration, pretty printed
        return etree.tostring(tree, encoding='utf-8', method='xml', xml_declaration=True, pretty_print=True)

    def __writeFiles(self, epub_file):
        print('[LOG]Writing files...')
        for step in (self.__writeFilesFromOcfContainer, self.__writeFilesFromOpf):
            step(epub_file)

    def __writeFilesFromOcfContainer(self, epub_file):
        #as stated in the ocf documentation, there may be multiple versions of the publication (pdf printing for ex) listed in the ocf container
        print('[LOG]Writing files from the ocf container... not supported.')

    def __writeFilesFromOpf(self, epub_file):
        '''
        Add every manifest item of the opf to the archive, read from the
        Epub's basepath and stored relative to the directory of the opf.
        '''
        print('[LOG]Writing files from the opf...')
        target_basepath = os.path.dirname(self.__pathToOpf())
        source_root = self.epub_object
        for item in self.epub_object.opf.xpath('//*[local-name()="manifest"]/*[local-name()="item"]'):
            href = item.get('href')
            if target_basepath: #only prefix when there is a directory; avoid starting with '/' when writing...
                source_path = os.path.normpath(os.path.join(source_root.basepath, target_basepath, href))
                archive_name = target_basepath + '/' + href
            else:
                source_path = os.path.normpath(os.path.join(source_root.basepath, href))
                archive_name = href
            epub_file.write(source_path, archive_name)

    def __writeOcf(self, epub_file):
        '''
        OCF 2.0.1
        http://idpf.org/epub/20/spec/OCF_2.0.1_draft.doc
        '''
        print('[LOG]Writing ocf...')
        ocf_steps = (self.__writeOcfMimetype, #always first
                     self.__writeOcfContainer,
                     self.__writeOcfEncryption,
                     self.__writeOcfManifest,
                     self.__writeOcfMetadata,
                     self.__writeOcfRights,
                     self.__writeOcfSignatures)
        for step in ocf_steps:
            step(epub_file)

    def __writeOcfContainer(self, epub_file):
        '''
        Write the serialized ocf container as META-INF/container.xml.
        '''
        print('[LOG]Writing ocf container')
        container_entry = self.__newZipInfo('META-INF/container.xml', zipfile.ZIP_DEFLATED)
        container_bytes = self.__serialize(self.epub_object.ocf_container)
        epub_file.writestr(container_entry, container_bytes)

    def __writeOcfEncryption(self, epub_file):
        print('[LOG]Writing ocf encryption... not supported.')

    def __writeOcfManifest(self, epub_file):
        print('[LOG]Writing ocf manifest... not supported.')

    def __writeOcfMetadata(self, epub_file):
        print('[LOG]Writing ocf metadata... not supported.')

    def __writeOcfMimetype(self, epub_file):
        '''
        Write the 'mimetype' entry, stored without compression.
        '''
        print('[LOG]Writing ocf mimetype...')
        mimetype_entry = self.__newZipInfo('mimetype', zipfile.ZIP_STORED)
        epub_file.writestr(mimetype_entry, 'application/epub+zip')

    def __writeOcfRights(self, epub_file):
        print('[LOG]Writing ocf rights... not supported.')

    def __writeOcfSignatures(self, epub_file):
        print('[LOG]Writing ocf signatures... not supported.')

    def __writeOpf(self, epub_file):
        '''
        OPF 2.0.1
        http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm

        Write the serialized opf at the path named by the ocf container.
        '''
        print('[LOG]Writing opf...')
        opf_entry = self.__newZipInfo(self.__pathToOpf(), zipfile.ZIP_DEFLATED)
        opf_bytes = self.__serialize(self.epub_object.opf)
        epub_file.writestr(opf_entry, opf_bytes)


class EpubXhtmlXmlCleaner():
    '''
    1.Combine multiple xhtml/xml
    2.Reduce xhtml/xml to basic blocks/styles

    clean() passes the instance to etree.XMLParser as its target, so start(),
    end(), data(), comment() and close() are the parser callbacks. They build
    a tree of dicts ('block', 'bodyblock', 'text', 'whitespace', 'image')
    which write() turns back into markup.

    NOTE(review): clean() leaves self.xhtml_xml holding the parsed tree and
    does not reset the image/style maps, so reusing one instance for a second
    clean() call looks unsupported - confirm before relying on it.
    '''
    #media type recorded in the manifest for a moved image, by lowercased file extension
    _image_media_types = {'.gif': 'image/gif',
                          '.jpeg': 'image/jpeg',
                          '.jpg': 'image/jpeg',
                          '.png': 'image/png',
                          '.svg': 'image/svg+xml'}

    def __init__(self):
        self.tags = None
        self.style_names = None
        self.style_values = None

        self.selector_to_style_map = {} #for old css
        self.style_to_selector_map = {} #for new css
        self.tag_stack = []
        self.style_stack = [{}]
        self.block_stack = [{'type':'block', 'tag':'body', 'children':[], 'style':{}}]

        self.epub_object = None
        self.path_to_xhtml_xml = None #xhtml/xml currently being parsed; set by clean()
        self.xhtml_xml = []
        self.newbasepath_to_epub = ''
        self.newpaths_to_images = []
        self.oldimagepath_to_newimagepath_map = {} #in case same image referenced twice
        self.image_count= 0 #used for moving/renaming images

    @staticmethod
    def _escape(text, quote=False):
        #text arrives from the parser unescaped; it must be escaped again before being re-parsed as markup
        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
        if quote:
            text = text.replace('"', '&quot;')
        return text

    def _finishBlock(self, block):
        #passes run on a block once all of its children are known
        self.whitespace(block)
        self.default(block)
        self.merge(block) #could be extra if removed due to all whitespace
        self.upgrade(block) #could be extra if removed due to all whitespace
        self.css(block) #could be extra if removed due to all whitespace

    def clean(self, epub_object, tags, style_names, style_values):
        '''
        tags, style_names, style_values are configurations used in the cleaning process

        Parses every xhtml/xml of the spine, merges the results into a single
        'text.html' in a 'CLEANED' directory next to the epub's basepath, and
        rewrites the epub object to match: basepath, opf path ('content.opf'),
        manifest, spine and guide. Images are moved by start() while parsing.
        '''
        self.tags = tags
        self.style_names = style_names
        self.style_values = style_values
        print('[LOG]Cleaning epub...')
        self.epub_object = epub_object
        #create new epub basepath
        self.newbasepath_to_epub = os.path.join(os.path.dirname(epub_object.basepath), 'CLEANED')
        #clean each xhtml/xml
        root_block = {'type':'block', 'tag':'body', 'children':[], 'style':{}}
        for path_to_xhtml_xml in epub_object.getXhtmlXml():
            self.path_to_xhtml_xml = path_to_xhtml_xml
            print('[LOG]Cleaning xhtml/xml (%s)...' %self.path_to_xhtml_xml)
            #with a parser target, parse() returns whatever close() returns: the root block of the file
            child_block = etree.parse(self.path_to_xhtml_xml, etree.XMLParser(target=self))
            root_block['children'].extend(child_block['children'])
        #reset basepath
        epub_object.basepath = self.newbasepath_to_epub
        #reset path to opf; use content.opf for cleaner organization
        #as stated in the opf documentation, the paths listed in the opf manifest are relative to the opf file
        #as stated in the ocf documentation, the first rootfile element (in the ocf_container) of the media_type 'application/oebps-package+xml' is the package's opf file
        opf_rootfile_element = self.epub_object.ocf_container.xpath('//*[local-name()="rootfile"][@media-type="application/oebps-package+xml"]')[0]
        opf_rootfile_element.set('full-path', 'content.opf')
        #reset manifest
        manifest = epub_object.opf.xpath('//*[local-name()="manifest"]')[0]
        manifest.clear()
        #reset spine
        spine = epub_object.opf.xpath('//*[local-name()="spine"]')[0]
        spine.clear()
        #reset guide; only a missing guide is tolerated, other errors are no longer swallowed
        guides = epub_object.opf.xpath('//*[local-name()="guide"]')
        if guides:
            guides[0].clear()
        else:
            print('[WARNING]No guide to reset...')
        #update manifest, spine
        id_ = 'item' + format(1, '02d')
        epub_object.addManifestItem('text.html', 'application/xhtml+xml', id_)
        epub_object.addSpineItemref(id_)

        for id_count, newpath in enumerate(self.newpaths_to_images, 2):
            id_ = 'item' + format(id_count, '02d')
            #media type follows the image extension; unknown extensions keep the former 'image/jpeg'
            extension = os.path.splitext(newpath)[1].lower()
            media_type = self._image_media_types.get(extension, 'image/jpeg')
            epub_object.addManifestItem(newpath, media_type, id_)

        #prepare xhtml/xml
        stylesheet = ['.' + selector + '{' + ';'.join(name + ':' + value for name, value in style) + '}' for style, selector in self.style_to_selector_map.items()]
        stylesheet.sort() #aesthetics, easy viewing of stylesheet
        stylesheet = '\n' + '\n'.join(stylesheet) + '\n' #aesthetics, easy viewing of stylesheet
        head = [#'<?xml version=\'1.0\' encoding=\'utf-8\'?>', #if using etree, cant do fromstring() with type declaration
                '<html>',
                '<head>',
                '<meta content="http://www.w3.org/1999/xhtml; charset=utf-8" http-equiv="Content-Type"/>',
                '<style>',
                self._escape(stylesheet),
                '</style>',
                '</head>',
                '<body>'
                ]
        self.xhtml_xml.extend(head)
        self.write(root_block)
        self.xhtml_xml.append('</body></html>')

        #save
        path = os.path.join(self.newbasepath_to_epub, 'text.html')
        if not os.path.exists(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        #pretty print!
        print('[LOG]Pretty printing and saving...')
        #self.xhtml_xml = etree.fromstringlist(self.xhtml_xml) #calibre reallllly doesn't like this for some reason.
        #parse before opening the file so a parse error does not leave an empty text.html behind
        self.xhtml_xml = etree.fromstring(''.join(self.xhtml_xml))
        contents = etree.tostring(self.xhtml_xml, encoding='utf-8', method='xml', xml_declaration=True, pretty_print=True)
        with open(path, 'w+b') as xhtml_xml_file: #etree.tostring, when not using encoding=unicode, returns a byte object, thus w+b
            xhtml_xml_file.write(contents)

    def start(self, tag, attrs):
        '''
        Parser callback for an opening tag: track the tag and its inherited
        style, load linked stylesheets, move images and open/split blocks.
        '''
        #tag
        tag = re.sub('{[^}]*}', '', tag) #namespace
        self.tag_stack.append(tag)
        if tag not in self.tags:
            print('[WARNING]Unsupported tag: %s' %tag)

        #style; starts as a copy of the parent's so styles are inherited
        self.style_stack.append(self.style_stack[-1].copy())
        if tag in self.tags and self.tags[tag]['isStyle']:
            self.style_stack[-1][self.tags[tag]['styleName']] = self.tags[tag]['styleValue']
        if 'class' in attrs:
            for class_ in attrs['class'].split():
                #stylesheet() stores selectors lowercased; a class without a rule simply adds no style
                self.style_stack[-1].update(self.selector_to_style_map.get(class_.lower(), {}))
        if 'style' in attrs:
            self.style_stack[-1].update(self.style(attrs['style']))

        #stylesheet
        if tag == 'link' and 'type' in attrs and attrs['type'] == 'text/css' and 'rel' in attrs and attrs['rel'] == 'stylesheet':
            with codecs.open(os.path.normpath(os.path.join(os.path.dirname(self.path_to_xhtml_xml), attrs['href'])), 'r', 'utf-8') as stylesheet_file:
                self.stylesheet(stylesheet_file.read())

        #image
        if tag == 'img':
            if not attrs['src'] in self.oldimagepath_to_newimagepath_map:
                #move; for cleaner organization
                self.image_count += 1
                old_path_to_image = attrs['src']
                new_path_to_image = 'images/image' + format(self.image_count, '02d') + os.path.splitext(old_path_to_image)[1]
                full_path_to_destination = os.path.normpath(os.path.join(self.newbasepath_to_epub, new_path_to_image))
                if not os.path.exists(os.path.dirname(full_path_to_destination)):
                    os.makedirs(os.path.dirname(full_path_to_destination))
                #src is resolved against the directory of the xhtml/xml being parsed
                shutil.move(os.path.normpath(os.path.join(os.path.dirname(self.path_to_xhtml_xml), old_path_to_image)), full_path_to_destination)
                self.newpaths_to_images.append(new_path_to_image)
                self.oldimagepath_to_newimagepath_map[old_path_to_image] = new_path_to_image
            else:
                new_path_to_image = self.oldimagepath_to_newimagepath_map[attrs['src']]
            self.block_stack[-1]['children'].append({'type':'image',
                                                     'src':new_path_to_image,
                                                     'style':self.style_stack[-1].copy()
                                                     })

        #block
        if tag in self.tags and self.tags[tag]['transform']:
            #if in block created without tags, just because in body (see data() )
            if self.block_stack[-1]['type'] == 'bodyblock':
                self._finishBlock(self.block_stack.pop())
            #either way:
            new_block = {'type':'block',
                         'tag':self.tags[tag]['transform'],
                         'children':[],
                         'style':{}}
            self.block_stack[-1]['children'].append(new_block)
            self.block_stack.append(new_block)
        elif tag == 'br':
            #end last block and create new one, if not in tag==body or tag==bodyblock
            if self.block_stack[-1]['type'] == 'bodyblock':
                self._finishBlock(self.block_stack.pop())
            if self.block_stack[-1]['tag'] == 'body':
                #if last block is body block; then br was already 'included' in its 'p' tag!
                if self.block_stack[-1]['children'] and self.block_stack[-1]['children'][-1]['type'] == 'bodyblock':
                    #change last bodyblock to 'block'; so it won't repeatedly ignore br's
                    self.block_stack[-1]['children'][-1]['type'] = 'block'
                else:
                    new_block = {'type':'block',
                         'tag':'br',
                         'children':[],
                         'style':{}}
                    self.block_stack[-1]['children'].append(new_block)
            else:
                block = self.block_stack.pop()
                #copy type/tag before finishing; whitespace() may turn the finished block into a 'br'
                new_block = {'type':block['type'],
                         'tag':block['tag'],
                         'children':[],
                         'style':{}}

                self._finishBlock(block)

                self.block_stack[-1]['children'].append(new_block)
                self.block_stack.append(new_block)

    def end(self, tag):
        '''
        Parser callback for a closing tag: drop the tag and its style and
        finish the block opened for it, if any.
        '''
        #tag
        tag = re.sub('{[^}]*}', '', tag)
        self.tag_stack.pop()

        #style
        self.style_stack.pop()

        #block
        if tag in self.tags and self.tags[tag]['transform']:
            self._finishBlock(self.block_stack.pop())

    def data(self, data):
        '''
        Parser callback for character data: stylesheet text inside a style
        tag, otherwise body text split into text and whitespace children.
        '''
        #stylesheet
        if self.tag_stack[-1] == 'style':
            self.stylesheet(data)
        #text
        elif 'body' in self.tag_stack:
            #avoid un'block'ed data in the body
            if self.block_stack[-1]['tag'] == 'body':
                new_block = {'type':'bodyblock',
                         'tag':'p',
                         'children':[],
                         'style':{}}
                self.block_stack[-1]['children'].append(new_block)
                self.block_stack.append(new_block)
            #add text; runs of whitespace collapse to a single space
            data = re.sub('[\n\r\t ]+', ' ', data)
            children = self.block_stack[-1]['children']
            if data.startswith(' '):
                children.append({'type':'whitespace',
                                 'text':' ',
                                 'style':self.style_stack[-1].copy()})
            if data.strip():
                children.append({'type':'text',
                                 'text':data.strip(),
                                 'style':self.style_stack[-1].copy()})
            #len check; a lone space was already added as leading whitespace
            if len(data) > 1 and data.endswith(' '):
                children.append({'type':'whitespace',
                                 'text':' ',
                                 'style':self.style_stack[-1].copy()})

    def comment(self, comment):
        print('[WARNING]Found comment: %s' %comment)

    def close(self):
        '''
        Parser callback at end of document: finish the open blocks, reset the
        per-document state and return the root block.
        '''
        #in case self.block_stack[-1] is a body block (for uncontained text...)
        if self.block_stack[-1]['type'] == 'bodyblock':
            #same passes as every other finished bodyblock; default() used to be skipped here
            self._finishBlock(self.block_stack.pop())
        #root block
        root_block = self.block_stack.pop()
        self.whitespace(root_block)
        self.default(root_block)
        self.merge(root_block)
        #self.upgrade(root_block) currently not available for root block
        self.css(root_block)
        #reset
        self.selector_to_style_map = {}
        self.tag_stack = []
        self.style_stack = [{}]
        self.block_stack = [{'type':'block', 'tag':'body', 'children':[], 'style':{}}]
        #return root block
        return root_block

    def stylesheet(self, stylesheet):
        '''
        Record the rules of a stylesheet in selector_to_style_map. Whitespace
        is removed and the text lowercased before matching.
        '''
        #stylesheet; ONLY SUPPORTS CLASS SELECTORS
        rules = re.finditer(r'\.([^.{]+)\{([^}]*)\}', re.sub('"', '\'', re.sub('[\n\r\t ]+', '', stylesheet)).lower()) #example regex match: .calibre{text-align:right;}
        for rule in rules:
            self.selector_to_style_map[rule.group(1)] = self.style(rule.group(2))

    def style(self, style):
        '''
        Parse a string of css declarations into a {name: value} dict.

        Whitespace is removed, double quotes become single quotes and the
        text is lowercased. Only names present in self.style_names are kept;
        other names and declarations without a ':' are reported and skipped.
        '''
        styles = re.sub('"', '\'', re.sub('[\n\r\t ]+', '', style)).lower().split(';')
        styles_dict = {}
        for individual_style in styles:
            if not individual_style:
                continue
            #split on the first ':' only; values such as url(http://...) contain ':' themselves
            name, separator, value = individual_style.partition(':')
            if not separator:
                print('[WARNING]Malformed style: ' + individual_style)
                continue
            if name in self.style_names: # and re.match(Configuration.style_name[name]['regex_values'], value, re.IGNORECASE):
                #convert to float for easy matching of each other and default values
                number_match = re.match(r'[-]{0,1}(?:[0-9]+\.[0-9]+|\.[0-9]+|[0-9]+)', value)
                if number_match:
                    value = str(float(number_match.group())) + value[number_match.end():]
                styles_dict[name] = value
            else:
                print('[WARNING]Unsupported style: ' + name)
        return styles_dict

    def whitespace(self, block):
        '''
        Collapse doubled whitespace children and drop leading/trailing ones;
        remove an empty bodyblock from its parent, turn an empty p into a br.
        '''
        #normal whitespace
        children = block['children']
        if children:
            #double
            index = 0
            while index < len(children) - 1:
                if children[index]['type'] == 'whitespace' and children[index + 1]['type'] == 'whitespace':
                    children.pop(index)
                else:
                    index += 1
            #leading
            if children[0]['type'] == 'whitespace':
                children.pop(0)
            #trailing
            if children and children[-1]['type'] == 'whitespace':
                children.pop()
        #bodyblock whitespace (only for uncontained text; otherwise whitespace is meaningless)
        if block['type'] == 'bodyblock':
            if all(child['type'] == 'whitespace' for child in children) or not children:
                #the block was already popped, so the stack top is its parent
                self.block_stack[-1]['children'].remove(block)
        #p whitespace (becomes br)
        elif block['tag'] == 'p':
            if all(child['type'] == 'whitespace' for child in block['children']):
                block['tag'] = 'br'
                block['children'] = []

    def default(self, block):
        '''
        Remove from the block's children every style whose value matches the
        'regex_default' configured for that style name.
        '''
        #remove styles that are default when not negating an inherited style;
        #since working bottom up, there is nothing inherited...  :)
        children = block['children']
        for child in children:
            for name, value in list(child['style'].items()):
                if self.style_names[name]['regex_default'] and re.match(self.style_names[name]['regex_default'], value):
                    child['style'].pop(name)

    def merge(self, block):
        '''
        Merge adjacent text children (when every style of the first is also
        set on the second) and fold whitespace children into adjacent text.
        '''
        #merge text/whitespace
        children = block['children']
        index = 0
        while index < len(children) - 1:
            child1 = children[index]
            child2 = children[index + 1]
            if child1['type'] == 'text':
                if child2['type'] == 'text':
                    if all(name in child2['style'] and child2['style'][name] == value for name, value in child1['style'].items()):
                        child1['text'] += child2['text']
                        children.pop(index + 1)
                    else:
                        index += 1
                elif child2['type'] == 'whitespace':
                    child1['text'] += ' '
                    children.pop(index + 1)
                else:
                    index += 1
            elif child1['type'] == 'whitespace':
                if child2['type'] == 'text':
                    child2['text'] = ' ' + child2['text']
                    children.pop(index)
                else:
                    index += 1
            else:
                index += 1

    def upgrade(self, block):
        '''
        Move styles shared (same name and value) by all children of the block
        from the children to the block itself.
        '''
        #upgrade blockwide styles
        children = block['children']
        block_style = {}
        if children:
            for name, value in list(children[0]['style'].items()):
                if all(name in child['style'] and child['style'][name] == value for child in children):
                    block_style[name] = value
                    for child in children:
                        child['style'].pop(name)
        block['style'] = block_style

    def css(self, block):
        '''
        Replace each child's 'style' dict by a 'class': the selector assigned
        to that exact style in style_to_selector_map, or None when unstyled.
        '''
        for child in block['children']:
            #sorted tuple; hashable and independent of declaration order
            child_style = tuple(sorted(child.pop('style').items()))
            if child_style:
                if child_style in self.style_to_selector_map:
                    child['class'] = self.style_to_selector_map[child_style]
                else:
                    selector = 'class' + format(len(self.style_to_selector_map), '02d')
                    self.style_to_selector_map[child_style] = selector
                    child['class'] = selector
            else:
                child['class'] = None

    def write(self, block):
        '''
        Writes the block format to self.xhtml_xml in xhtml/xml format

        Text and image sources are escaped, since the collected fragments are
        parsed as xml again by clean().
        '''
        for child in block['children']:
            if child['type'] == 'block' or child['type'] == 'bodyblock':
                #single tag
                if child['tag'] == 'br':
                    self.xhtml_xml.append('<' + child['tag'] + '/>')
                #start and end tag
                else:
                    #start tag
                    if child['class']:
                        self.xhtml_xml.append('<' + child['tag'] + ' class="' + child['class'] + '">')
                    else:
                        self.xhtml_xml.append('<' + child['tag'] + '>')
                    #children
                    self.write(child)
                    #end tag
                    self.xhtml_xml.append('</' + child['tag'] + '>')
            elif child['type'] == 'text' or child['type'] == 'whitespace':
                text = self._escape(child['text'])
                #span
                if child['class']:
                    self.xhtml_xml.append('<span class="' + child['class'] + '">')
                    #text
                    self.xhtml_xml.append(text)
                    #end tag
                    self.xhtml_xml.append('</span>')
                #plain
                else:
                    self.xhtml_xml.append(text)
            elif child['type'] == 'image':
                src = self._escape(child['src'], quote=True)
                if child['class']:
                    self.xhtml_xml.append('<img src="' + src + '" class="' + child['class'] + '"/>')
                else:
                    self.xhtml_xml.append('<img src="' + src + '"/>')



def main():
    '''
    Script entry point; currently does nothing.
    '''
    return None

if __name__ == '__main__':
    main()
