#!/Python3/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals, division, absolute_import, print_function

import os, os.path, sys, codecs, shutil, inspect, chardet, re, time
from decimal import *
from tempfile import mkdtemp                  
from PIL import Image
import options
from doc_tidy import *

import locale 
import lxml.html.clean as clean
import tkinter as tk
import tkinter.messagebox as mbox

__all__=["processAllTasks", "checkStyles", "copyTextFiles2Dir", "copyCSSFiles2Dir", "addFiles2CSS", "writeFiles2CSS", "writeFiles2Epub", "prettifyXHTMLFile", "moveHTMLStyles2CSS", "createLinkRel1", "moveInlineStyles2CSS", "createLinkRel2", "convertFile2UTF8", "checkFileEncoding", "fileNotLoadedError", "insertAbiWordMetadata", "show_msgbox", "show_yesnobox", "addDOCTYPEHeader", "getHTMLDocType", "fixEncodingErrors", "reformatHTMLStyles", "reformatTidyStyles", "reformatAbiWordStyles", "reformatWordStyles", "reformatOpenDocStyles", "reformatGoogleStyles", "prettifyCSS", "sddHTMLHeaders", "removeHTMLAttributes", "removeHTMLTop", "addHTMLTop", "addHTMLTail", "removeHTMLStyles", "removeAttributes", "removeInlineStyling", "removeAllTagAttributes"]

try:
    from sigil_bs4 import BeautifulSoup, Comment
except:
    from bs4 import BeautifulSoup, Comment  
    
    
def _registerStylesheet(bk, wdir, css_name, css_id, title, msg):
    """Add one generated stylesheet to the epub, or report that it is empty.

    A stylesheet smaller than 5 bytes is treated as "nothing was extracted":
    the user gets an info box (title/msg) and nothing is added to the epub.
    """
    if os.path.getsize(os.path.join(wdir, css_name)) < 5:
        show_msgbox(title, msg, msgtype="info")
    else:
        addFiles2CSS(bk, wdir, [css_id], [css_name])


def processAllTasks(bk, wdir, s_ids, t_ids, t_fnames, s_fnames):
    """Run every enabled clean-up task over the text files in the work dir.

    bk       -- book container object; only addfile/writefile/deletefile and
                the iterators used by the helper functions are relied upon
    wdir     -- working directory holding copies of the epub files
    s_ids    -- manifest ids of the stylesheets
    t_ids    -- manifest ids of the text files (parallel to t_fnames)
    t_fnames -- base names of the text files inside wdir
    s_fnames -- base names of the stylesheets inside wdir

    Which tasks run is decided by the flags in the ``options`` module.
    Always returns 0.
    """
    print('\n -- Processing automatic tasks...')
    print(' -- Clean and reformat the html')

    # get the doctype for imported html processing; guard against an empty
    # file list so that indexing the first name cannot raise IndexError
    if options.FILE_TYPE == 'HTML' and t_fnames:
        getHTMLDocType(wdir, t_fnames[0])

    print('\n >>> File type is: ' + options.FILE_TYPE)
    print(' >>> Doctype is: ' + options.DOCTYPE)

    no_html_styles = 'No html <style> section was found in the html so "html_styles.css" was not created.'

    for fname in t_fnames:
        file = os.path.join(wdir, fname)
        docTidyNoWrap(wdir, file)
        prettifyXHTMLFile(wdir, file)
        addDOCTYPEHeader(wdir, file)

        # move all html & inline styling to new stylesheets
        if options.MOVE_ALL_STYLES == True:
            css_file = moveHTMLStyles2CSS(bk, wdir, file)
            print('\n >>> html CSS path...' + os.path.join(wdir, css_file))
            _registerStylesheet(bk, wdir, css_file, 'css_html',
                                'No HTML Styles Found', no_html_styles)

            css_fname = moveInlineStyles2CSS(bk, wdir, file)
            print(' >>> Inline CSS path...' + (os.path.join(wdir, css_fname)))
            _registerStylesheet(
                bk, wdir, css_fname, 'css_inline', 'No Inline Styles Found',
                'No inline styles were found in the html so "inline_styles.css" was not created.')

        # move only html <styles> to a new stylesheet
        if options.MOVE_HTML_STYLES == True:
            css_filename = moveHTMLStyles2CSS(bk, wdir, file)
            print('\n >>> In move html styles...')
            print('\n >>> html CSS path...' + os.path.join(wdir, css_filename))
            _registerStylesheet(bk, wdir, css_filename, 'css_html',
                                'No HTML Styles Found', no_html_styles)

        # move only inline styling to a new stylesheet
        if options.MOVE_INLINE_STYLING == True:
            css_fstyles = moveInlineStyles2CSS(bk, wdir, file)
            print('\n >>> In move inline styles...')
            print('\n >>> Inline CSS path...' + os.path.join(wdir, css_fstyles))
            _registerStylesheet(
                bk, wdir, css_fstyles, 'css_inline', 'No Inline Styling Found',
                'No inline styling was found in the html so "inline_styles.css" was not created.')

        # removes all instances of the "class" attribute from the html
        if options.REMOVE_HTML_STYLES == True:
            removeAllClasses(wdir, file)

        # removes all instances of the "style" attribute from the html
        if options.REMOVE_INLINE_STYLING == True:
            removeInlineStyling(wdir, file)

        # remove all tag attributes from the html
        if options.REMOVE_ALL_ATTRS == True:
            removeHTMLAttributes(wdir, file)

        prettifyXHTMLFile(wdir, file)

    # write files back to epub and refresh work dir files
    writeFiles2CSS(bk, wdir, s_ids, s_fnames)
    writeFiles2Epub(bk, wdir, t_ids, t_fnames)

    # remove the inline stylesheet if present; deletion stays best-effort
    # (the sheet may never have been added) but no longer swallows
    # SystemExit/KeyboardInterrupt
    if options.REMOVE_INLINE_STYLING == True:
        try:
            bk.deletefile('css_inline')
        except Exception:
            pass

    # remove all stylesheets
    if options.REMOVE_ALL_ATTRS == True or options.REMOVE_HTML_STYLES == True:
        for s in s_ids:
            try:
                bk.deletefile(s)
            except Exception:
                pass
    return 0
    
def checkStyles(wdir, fnames):
    """Flag options.HTML_STYLES when the first file carries a <style> tag.

    Only the first entry of fnames is inspected, and only the two exact
    opening-tag spellings listed below are recognised.  Always returns 0.
    """
    style_openers = ('<style>', '<style type="text/css">')
    first_path = os.path.join(wdir, fnames[0])
    with open(first_path, 'rt', encoding='utf-8') as handle:
        # any() stops reading at the first matching line
        if any(opener in text for text in handle for opener in style_openers):
            options.HTML_STYLES = True
    return 0
    
def copyTextFiles2Dir(bk, wdir):
    """Copy the book's text files into wdir, skipping cover/title pages.

    Every (id, href) pair from bk.text_iter() is considered.  A file is
    skipped when its work-dir path contains 'cover.xhtml', 'cover.html' or
    'titlepage.xhtml'.  The remaining files are parsed with BeautifulSoup
    and the serialised result is written to wdir as utf-8.

    Returns a tuple (ids, file names) of the files that were copied.
    """
    skip_markers = ('cover.xhtml', 'cover.html', 'titlepage.xhtml')
    manifest = [(uid, os.path.basename(href)) for (uid, href) in bk.text_iter()]

    kept_ids = []
    kept_names = []
    for uid, name in manifest:
        target = os.path.join(wdir, name)
        if any(marker in target for marker in skip_markers):
            continue
        print(' -- Copy to work dir...' + name)
        with open(target, 'wt', encoding='utf-8') as outfp:
            soup = BeautifulSoup(bk.readfile(uid), 'html.parser')
            kept_names.append(name)
            kept_ids.append(uid)
            outfp.write(str(soup))

    return (kept_ids, kept_names)
                
def copyCSSFiles2Dir(bk, wdir):
    """Copy every stylesheet of the book into wdir as utf-8 text.

    The stylesheet text is written verbatim.  It used to be passed through
    BeautifulSoup's HTML parser, whose serialiser entity-escapes '<', '>'
    and '&' in text, so CSS such as ``ul > li`` came out as ``ul &gt; li``.

    Returns a tuple (ids, file names) in bk.css_iter() order.
    """
    s_ids = []
    s_fnames = []
    for css_id, href in bk.css_iter():
        s_ids.append(css_id)
        s_fnames.append(os.path.basename(href))

    for css_id, fname in zip(s_ids, s_fnames):
        path = os.path.join(wdir, fname)
        print(' -- Write to work dir...' + path)
        data = bk.readfile(css_id)
        # presumably readfile returns str for css; tolerate raw bytes as well
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        with open(path, 'wt', encoding='utf-8') as outfp:
            outfp.write(data)

    return (s_ids, s_fnames)

def addFiles2CSS(bk, wdir, s_ids, s_fnames):
    """Add new stylesheets from wdir to the epub and refresh the work dir.

    s_ids and s_fnames are parallel lists.  Files smaller than 5 bytes are
    not added.  Afterwards every listed file is deleted from wdir, all of
    the book's stylesheets are copied back into wdir and written to the
    epub again.  Returns 0.
    """
    # no css files with imported html docs
    if len(s_fnames) == 0:
        return 0

    print(' ')
    # Pair ids with names directly: the former hand-maintained index was not
    # advanced when a small file was skipped, so every later file was added
    # under the wrong id.
    for css_id, fname in zip(s_ids, s_fnames):
        path = os.path.join(wdir, fname)
        if os.path.getsize(path) < 5:
            continue
        print(' -- Add file to epub CSS...' + str(fname))
        with open(path, 'rt', encoding='utf-8') as fp:
            data = fp.read()
        print(' >>> addfile uid:' + css_id)
        bk.addfile(css_id, fname, data)

    for fname in s_fnames:
        os.remove(os.path.join(wdir, fname))

    ids, fnames = copyCSSFiles2Dir(bk, wdir)
    writeFiles2CSS(bk, wdir, ids, fnames)

    return 0
    
def writeFiles2CSS(bk, wdir, ids, s_fnames):
    """Write the stylesheets in wdir back into the epub.

    ids and s_fnames are parallel lists.  Files smaller than 5 bytes are
    skipped.  Returns 0.
    """
    # no css files with imported html docs
    if len(s_fnames) == 0:
        return 0

    print(' ')
    # Pair ids with names directly: the former hand-maintained index was not
    # advanced when a small file was skipped, so the next file's content was
    # written over the skipped file's id.
    for css_id, fname in zip(ids, s_fnames):
        path = os.path.join(wdir, fname)
        if os.path.getsize(path) < 5:
            continue
        print(' -- Write to epub CSS...' + str(fname))
        with open(path, 'rt', encoding='utf-8') as fp:
            data = fp.read()
        bk.writefile(css_id, data)

    return 0
    
def writeFiles2Epub(bk, wdir, ids, fnames):
    """Write the text files in wdir back into the epub.

    ids and fnames are parallel lists; file number k is stored under
    ids[k] via bk.writefile.  Returns 0.
    """
    # nothing to do for an empty file list
    if not len(fnames):
        return 0

    print(' ')
    for position, name in enumerate(fnames):
        print(' -- Write files to epub...' + name)
        with open(os.path.join(wdir, name), 'rt', encoding='utf-8') as handle:
            content = handle.read()
            bk.writefile(ids[position], content)

    return 0
 
def prettifyXHTMLFile(wdir, file):
    """Reformat an XHTML file in place, line by line.

    wdir -- directory used for the temporary output file
    file -- full path of the file to rewrite

    Pass 1 works on each input line:
      * a line starting with '<body' is replaced by a fixed styled body tag
      * lines containing an empty paragraph, and lines starting with '<a'
        or '<span', are dropped
      * '<font>'/'</font>' are removed when the line contains '<font>'
      * a handful of space/apostrophe entities are replaced
      * header lines (xml, doctype, html, head, meta, title, link, body)
        are written one per line; every other non-empty line is written
        with a blank line before it, paragraphs indented by two spaces

    Pass 2 removes blank lines ahead of the body and adds a blank line
    before '<html' and after '</head>'.

    Both passes use context managers so no handle is left open if reading
    or writing fails.  Returns 0.
    """
    body_tag = '<body style="font-family: Times New Roman, serif;margin: 3% 3% 3% 3%;">\n'
    # applied in this order
    substitutions = (
        ('&nbsp;', ' '),
        ('&#160;', ' '),
        ('&amp;#160;', ' '),
        ('&amp;#nbsp;', ''),
        ('&#146;', '’'),
        ('&amp;#146;', '’'),
        ('&amp;#9;', ''),
        ("<!--?xml version='1.0' encoding='utf-8'?-->", ''),
    )
    header_tags = ('<?xml', '<!DOCTYPE', '<html', '<head>', '<meta',
                   '<title>', '<link', '</head>', '<body')
    indented_header_tags = ('<meta', '<title>', '<link')
    outfile = os.path.join(wdir, 'final_one.css')

    # pass 1: clean each line and normalise the layout
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(outfile, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip().startswith('<body'):
                line = body_tag

            stripped = line.strip()
            if '<p></p>' in stripped or '<p> </p>' in stripped:
                continue
            if stripped.startswith(('<a', '<span')):
                continue

            if '<font>' in line:
                line = line.replace('<font>', '').replace('</font>', '')
            for old, new in substitutions:
                line = line.replace(old, new)

            line = line.strip()
            if not line:
                continue

            if line.startswith(header_tags):
                if line.startswith(indented_header_tags):
                    line = '  ' + line
                elif line.startswith('<body'):
                    line = '\n' + line
                outfp.write(line + '\n')
            else:
                if line.startswith('<p'):
                    line = '  ' + line
                outfp.write('\n' + line + '\n')
    os.replace(outfile, file)

    # pass 2: tidy the blank lines of everything ahead of the body
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(outfile, 'wt', encoding='utf-8') as outfp:
        in_body = False
        for line in infp:
            if not in_body and '<body' not in line:
                if line.strip() == '':
                    continue
                if '<html' in line:
                    line = '\n' + line
                if '</head>' in line:
                    line = line + '\n'
            else:
                in_body = True
            outfp.write(line)
    os.replace(outfile, file)
    return 0
    
def moveHTMLStyles2CSS(bk, wdir, file):
    """Move the contents of the html <style> section(s) into a stylesheet.

    bk   -- book container (not used by this function itself)
    wdir -- working directory
    file -- full path of the html file to process

    The stylesheet is written to wdir under options.HTML_CSS_FILE_NAME and
    that name is returned.  If 'html_styles.css' already exists in wdir an
    error box is shown and the process exits via sys.exit(0).

    All three files are handled by one ``with`` statement so they are
    closed even if reading or writing raises.
    """
    print(' -- Move html <styles> to "html_styles.css"')
    css = options.HTML_CSS_FILE_NAME

    if os.path.isfile(os.path.join(wdir, 'html_styles.css')):
        msg = 'The html <style> section has already been moved to an epub stylesheet("html_styles.css")!!'
        show_msgbox('Error', msg, msgtype='error')
        sys.exit(0)

    # reformat the <styles> section to port to css
    reformatHTMLStyles(wdir, file)

    new_file = os.path.join(wdir, 'new_file.htm')
    css_path = os.path.join(wdir, css)
    style_openers = ('<style>', '<style type="text/css">')
    wrapper_markers = ('<![CDATA', ']]>', '<!--', '-->')

    with open(file, 'rt', encoding='utf-8') as infp, \
            open(css_path, 'wt', encoding='utf-8') as css_fp, \
            open(new_file, 'wt', encoding='utf-8') as html_fp:
        for line in infp:
            if line == '' or line == '\n':
                continue
            if any(opener in line for opener in style_openers):
                # Consume the style section from the same iterator.  The
                # opening tag, the closing tag and any CDATA/comment wrapper
                # lines go to neither file.  The wrapper test runs first, so
                # a closing tag on a wrapper line does not end the section.
                for line in infp:
                    if any(marker in line for marker in wrapper_markers):
                        continue
                    if '</style>' in line:
                        break
                    if line.strip() == '':
                        continue
                    css_fp.write(line.strip() + '\n')
            else:
                html_fp.write(line.strip() + '\n')

    os.replace(new_file, file)

    # link the stylesheet to the html file
    createLinkRel1(wdir, file)
    prettifyCSS(wdir, css_path)
    prettifyXHTMLFile(wdir, file)
    return css
    
def createLinkRel1(wdir,file):
    """Insert a <link> to ../Styles/html_styles.css ahead of </head>.

    wdir -- directory used for the temporary output file
    file -- full path of the html file, rewritten in place

    Only a line that is exactly '</head>' once stripped is replaced by the
    link element followed by '</head>'; every other line is copied
    unchanged.  Both handles are closed even if an error occurs.
    Returns 0.
    """
    print(' -- Create html link to new CSS')
    output = os.path.join(wdir, 'link_rel.html')
    link_and_close = '<link rel="stylesheet" href="../Styles/html_styles.css"' + \
                     ' type="text/css"/>\n</head>\n'
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip() == '</head>':
                line = link_and_close
            outfp.write(line)

    os.replace(output, file)
    return 0

def moveInlineStyles2CSS(bk, wdir, file):
    """Move inline styling out of an html file into a separate stylesheet.

    bk   -- book container (not used by this function itself)
    wdir -- working directory
    file -- full path of the html file, rewritten in place several times

    The stylesheet is written to wdir under options.INLINE_CSS_FILE_NAME
    and that name is returned.  If that file already exists an error box
    is shown and the process exits via sys.exit(0).

    NOTE(review): the html body is only copied to 'remove_styling.html'
    inside the ``options.HTML_STYLES == True`` branch; when that flag is
    not True the temporary file stays empty and then replaces ``file``.
    Confirm the flag is always set before this function is called.
    """

    css = options.INLINE_CSS_FILE_NAME
    
    # refuse to run twice: an existing stylesheet in wdir means the inline
    # styles were already moved
    if os.path.isfile(os.path.join(wdir, css)):
        msg = 'The html inline styling has already been moved to an epub stylesheet("inline_styles.css")!!'
        show_msgbox('Error', msg, msgtype='error')
        sys.exit(0)
    
    # move inline styles to the <style> section in html 
    docTidy(wdir, file)
    addDOCTYPEHeader(wdir, file)
    reformatTidyStyles(wdir, file)
    
    # reformat the style section from line to stacked format
    finish = False
    output = os.path.join(wdir, 'styler.html')
    outfp = open(output, 'wt', encoding='utf-8')
    with open(file, 'rt', encoding='utf-8') as infp:    
        for line in infp:
            if line == '' or line == '\n':
                continue
                           
            # CDATA and comment wrapper lines are dropped everywhere in the
            # file, not only inside the style section
            if '<![CDATA' in line or ']]>'in line or \
                '<!--' in line or '-->' in line:
                continue      
            # every line ahead of the first '</style>' gets its braces and
            # semicolons split onto separate lines
            if '</style>' not in line and finish == False:         
                line = line.replace('{', ' {\n')
                line = line.replace(';', ';\n')
                line = line.replace('}', '\n}\n')
                line= line.lstrip()
                outfp.write(line.strip() + '\n')
            else:
                # from the first '</style>' on, lines are only stripped
                finish = True 
                          
                outfp.write(line.strip() + '\n')
                
    outfp.close()
    os.remove(file)
    os.rename(output, file)
    
    # move the styles section to a new stylesheet
    finish = False
    output1 = os.path.join(wdir, css)                      # the new stylesheet
    output2 = os.path.join(wdir, 'remove_styling.html')    # html without the styles
    outfp1 = open(output1, 'wt', encoding='utf-8')
    outfp2 = open(output2, 'wt', encoding='utf-8')
    with open(file, 'rt', encoding='utf-8') as infp:    
        # for imported html with a <style> section
        if options.HTML_STYLES == True:
            for line in infp:            
                #if '<style>' in line or '<style type="text/css">' in line:               
                # the first line mentioning 'sgc-' starts the block that is
                # moved to the stylesheet
                if 'sgc-' in line and finish == False:
                    line = line.replace('{', '{\n')
                    line = line.replace(';', ';\n')
                    line = line.replace('}', '\n}\n')
                    outfp1.write(line.strip() + '\n')                     
                    # the inner loop consumes the same iterator, so these
                    # lines are not seen again by the outer loop
                    for line in infp:
                        if '</style>' not in line:
                            if '/*<![CDATA[*/' in line or '/*]]>*/' in line:
                                continue
                            if line.strip() == '':
                                continue 
                            line = line.replace('{', '{\n')
                            line = line.replace(';', ';\n')
                            line = line.replace('}', '\n}\n')
                            outfp1.write(line.strip() + '\n')                    
                        else:
                            finish = True                        
                            break
                # after the inner loop 'line' holds the last line it read
                # (the '</style>' line when one was found); that line is
                # what gets written to the html here
                outfp2.write(line)                            
                
    outfp1.close()
    outfp2.close()
    os.remove(file)
    os.rename(output2, file)
    
    # drop the <style> element if it is now empty
    outfile= os.path.join(wdir, 'final_one2.css')
    outfp = open(outfile, 'wt', encoding='utf-8')
    with open(file, 'rt', encoding='utf-8') as fp:
        data = fp.read()
        if '<style type="text/css">\n</style>\n' in data:
            data = data.replace('<style type="text/css">\n</style>\n', '')
        
    outfp.writelines(data)        
    outfp.close()
    os.remove(file)
    os.rename(outfile, file)     
    
    print(' -- Move html inline styling to "inline_styles.css"')
    css_path = os.path.join(wdir, css)
    # link the new stylesheet, tidy it, then clean up the html
    createLinkRel2(wdir, file)
    prettifyCSS2(wdir, css_path)
    removeAttributes(wdir, file)
    prettifyXHTMLFile(wdir, file)
    #options.ADD_CSS_FILE = True
    return(css)
    
def createLinkRel2(wdir,file):
    """Insert a <link> to ../Styles/inline_styles.css ahead of </head>.

    wdir -- directory used for the temporary output file
    file -- full path of the html file, rewritten in place

    Only a line that is exactly '</head>' once stripped is replaced by the
    link element followed by '</head>'; every other line is copied
    unchanged.  Both handles are closed even if an error occurs.
    Returns 0.
    """
    print(' -- Create html link to new CSS')
    output = os.path.join(wdir, 'link_rel.html')
    link_and_close = '<link rel="stylesheet" href="../Styles/inline_styles.css"' + \
                     ' type="text/css"/>\n</head>\n'
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip() == '</head>':
                line = link_and_close
            outfp.write(line)

    os.replace(output, file)
    return 0
    
def convertFile2UTF8(wdir, file, encoder):
    """Re-save a file from wdir as utf-8.

    wdir    -- working directory
    file    -- name of the file inside wdir
    encoder -- name of the codec the file is currently stored in

    The text is decoded with ``encoder``, passed through BeautifulSoup's
    html parser and the serialised result is written back as utf-8.  The
    input is read and closed before the output is created, so a decoding
    failure leaves no open handle and no stray temporary file.

    Returns the full path of the converted file.
    """
    print(' -- Convert input file to utf-8 if required\n')

    file = os.path.join(wdir, file)
    output = wdir + os.sep + 'fix_encoding.htm'

    with open(file, 'rt', encoding=encoder) as infp:
        html = infp.read()

    # safely convert to unicode utf-8 using bs4
    soup = BeautifulSoup(html, 'html.parser')
    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.replace(output, file)
    return file
    
def checkFileEncoding(wdir, file):
    """Work out which codec a file in wdir is stored in.

    Two guesses are combined:
      * the charset declared in the first 2048 characters of the file,
        falling back to the locale's preferred encoding when none of the
        known charset markers is present
      * chardet's detection on the first 2048 bytes

    chardet wins when it agrees with the declaration or reports a 'utf'
    encoding; otherwise the declared/locale encoding is used.  chardet may
    report no encoding at all (None); the declared/locale encoding is used
    in that case instead of crashing.

    As a final check the file is read and rewritten with the chosen codec.
    If it cannot be read, an error box is shown, wdir is deleted and the
    process exits via sys.exit(0).

    Returns the chosen codec name.
    """
    # checked in this order; the first marker found decides
    declared_charsets = (
        ('charset=windows-1252', 'cp1252'),
        ('charset=windows-1250', 'cp1250'),
        ('charset=windows-1253', 'cp1253'),
        ('charset=windows-1254', 'cp1254'),
        ('charset=windows-1251', 'cp1251'),
        ('charset=windows-1255', 'cp1255'),
        ('charset=windows-1256', 'cp1256'),
        ('charset=windows-1257', 'cp1257'),
        ('charset=us-ascii', 'us-ascii'),
        ('charset=ibm437', 'cp437'),
        ('charset=ibm850', 'cp850'),
        ('charset=ibm852', 'cp852'),
        ('charset=ibm855', 'cp855'),
        ('charset=iso-8859-1', 'iso-8859-1'),
        ('charset=iso-8859-2', 'iso-8859-2'),
        ('charset=iso-8859-4', 'iso-8859-4'),
        ('charset=utf-8', 'utf-8'),
    )

    file = os.path.join(wdir, file)

    # get the encoding info from the html meta headers
    with open(file, 'rt', encoding='iso-8859-1', errors='surrogateescape') as fp:
        head = fp.read(2048).lower()

    html_encoding = None
    for marker, codec in declared_charsets:
        if marker in head:
            html_encoding = codec
            break

    # get the locale encoding, if needed
    if html_encoding is None:
        html_encoding = locale.getpreferredencoding()

    # now get the file encoding using chardet
    with open(file, 'rb') as fp:
        rawdata = fp.read(2048)
    chardet_encoding = chardet.detect(rawdata)['encoding']

    # compare the html and chardet encodings and if chardet contains
    # any 'utf' encodings then go with that as a preference
    if chardet_encoding is None:
        final_encoding = html_encoding
    elif chardet_encoding.upper() != html_encoding.upper() and \
            'utf' not in chardet_encoding.lower():
        final_encoding = html_encoding
    else:
        final_encoding = chardet_encoding

    # Final test -- open the file normally and read & write it back. If
    # there is an exception let the user know and stop the plugin app.
    output = wdir + os.sep + 'encoding_test.htm'
    html = None
    try:
        with open(file, 'rt', encoding=final_encoding) as infp:
            html = infp.read()
        with open(output, 'wt', encoding=final_encoding) as outfp:
            outfp.write(html)
        os.replace(output, file)
    except (LookupError, UnicodeError, OSError):
        # an unknown codec, undecodable bytes or an I/O failure: handled
        # below when nothing could be read; a failure after a successful
        # read is tolerated
        pass

    if html is None:
        print('\n >>> Critical Error: The html file could not be \n' + \
              ' >>> read because of file encoding problems.')
        show_msgbox('File Encoding Error', 'The file could not be read because of file encoding ' + \
                       'problems.\n\n', msgtype='error')
        shutil.rmtree(wdir, ignore_errors=True)
        sys.exit(0)

    print(' -- Input file encoding is: ' + final_encoding.upper())
    return final_encoding

def fileNotLoadedError(title, msgtype):
    """Tell the user, on the console and in a message box, that no file is loaded.

    title and msgtype are passed straight on to show_msgbox.  Returns 0.
    """
    headline = 'You have not loaded an epub or html file into Sigil.'
    advice = 'You must first load an epub or html file into Sigil and then run this plugin. Please try again.'

    print('\n >>> Warning!! ' + headline)
    print(' >>> ' + advice)

    show_msgbox(title, headline + '\n\n' + advice, msgtype)
    return 0

def insertAbiWordMetadata(wdir, file):
    """Normalise the AbiWord generator metadata of an html file in place.

    wdir -- directory used for the temporary output file
    file -- full path of the html file

    Every line mentioning 'AbiWord' is dropped; a line containing
    '<title>' is replaced by a generator <meta> element followed by an
    empty <title>.  Both handles are closed even if an error occurs.
    Returns 0.
    """
    output = os.path.join(wdir, 'Abi_meta2.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if 'AbiWord' in line:
                continue
            if '<title>' in line:
                line = '  <meta content="AbiWord" name="generator" />\n<title></title>\n'
            outfp.write(line)

    os.replace(output, file)
    return 0
    
def show_msgbox(title, msg, msgtype='info'):
    """Show a modal message box for general information, warnings and errors.

    msgtype selects the dialog: 'info', 'warning' or 'error'.  Any other
    value shows nothing and returns None.  Otherwise the value returned by
    the tkinter dialog function is returned.

    A hidden Tk root is created for the dialog and destroyed afterwards so
    repeated calls do not pile up interpreter instances.
    """
    dialogs = {
        'info': mbox.showinfo,
        'warning': mbox.showwarning,
        'error': mbox.showerror,
    }
    show = dialogs.get(msgtype)
    if show is None:
        return None

    localRoot = tk.Tk()
    localRoot.withdraw()
    localRoot.option_add('*font', 'Helvetica -12')
    try:
        return show(title, msg, parent=localRoot)
    finally:
        localRoot.destroy()

def show_yesnobox(title, msg, msgtype='info'):
    """Ask the user a yes/no question in a modal dialog.

    msgtype may be 'info' or 'warning' ('warning' uses the warning icon);
    any other value shows nothing and returns None.  Otherwise returns
    True for "yes" and False for "no".

    tkinter.messagebox has no ``showyesno`` function, so the previous code
    raised AttributeError on every call; ``askyesno`` is the yes/no dialog.
    """
    if msgtype not in ('info', 'warning'):
        return None

    localRoot = tk.Tk()
    localRoot.withdraw()
    localRoot.option_add('*font', 'Helvetica -12')
    try:
        if msgtype == 'warning':
            return mbox.askyesno(title, msg, icon=mbox.WARNING, parent=localRoot)
        return mbox.askyesno(title, msg, parent=localRoot)
    finally:
        # do not leave the hidden root window behind
        localRoot.destroy()
        
def addDOCTYPEHeader(wdir, file):
    """Give an html file the XHTML 1.1 prolog, rewriting it in place.

    wdir -- directory used for the temporary output file
    file -- full path of the html file

    Lines containing '<![CDATA[', ']]>' or '/*' are dropped.  A line
    containing '<?xml' is replaced by a utf-8 xml declaration followed by
    the XHTML 1.1 DOCTYPE.  All other lines are copied unchanged.  Both
    handles are closed even if an error occurs.  Returns 0.
    """
    prolog = ('<?xml version="1.0" encoding="utf-8"?>\n'
              '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n'
              '  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n\n')
    dropped_markers = ('<![CDATA[', ']]>', '/*')
    output = os.path.join(wdir, 'ostyles1.html')

    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if any(marker in line for marker in dropped_markers):
                continue
            outfp.write(prolog if '<?xml' in line else line)

    os.replace(output, file)
    return 0

def getHTMLDocType(wdir, file):
    """Record in options.DOCTYPE which application produced an html file.

    The file is scanned line by line and the scan stops at the first line
    that identifies a producer: a <meta> line naming Microsoft Word,
    LibreOffice or OpenOffice, or any line mentioning AbiWord.  If
    options.DOCTYPE is still '' afterwards it is set to 'Google'.
    Returns 0.
    """
    meta_generators = (
        ('Microsoft Word', 'Word'),
        ('LibreOffice', 'LibreOffice'),
        ('OpenOffice', 'OpenOffice'),
    )
    source = os.path.join(wdir, file)

    with open(source, 'rt', encoding='utf-8') as infp:
        for text in infp:
            producer = None
            if '<meta' in text:
                for needle, label in meta_generators:
                    if needle in text:
                        producer = label
                        break
            if producer is None and 'AbiWord' in text:
                producer = 'AbiWord'
            if producer is not None:
                options.DOCTYPE = producer
                break

    if options.DOCTYPE == '':
        options.DOCTYPE = 'Google'

    return 0
    
def fixEncodingErrors(line):
    """ Fixes encoding problems caused by 
        en dash, em dash, curly qutes, elipses etc
    """    
    # repair mixed encoding
    # cp 1252 to utf-8
    line = line.replace('â€™','’')       # apostrohe   
    line = line.replace('â€œ','“')       # left double quote 
    line = line.replace('â€','”')     # right double quote    
    line = line.replace('Â©','©')        # copyright
    line = line.replace('Â®','®')        # registered
    line = line.replace('â€”', '—')      # em dash
    line = line.replace('â€“', '–')      # en dash
    line = line.replace('â„¢', '™')
    line = line.replace('â”', '–')
    
    # latin-1 encoded in win cp1252 in utf-8 code
    line = line.replace('Ã¢â‚¬Å“', '“')
    line = line.replace('Ã¢â‚¬Â', '”')
    line = line.replace('Ã¢â‚¬â„¢', '’')
    
    # other encodings to utf-8
    line = line.replace('Ã¢â‚¬Å“','“')   # left double quote
    line = line.replace('¢â‚¬Â','”')     # right double quote    
    line = line.replace('Ã¢â‚¬â„¢','’')  # apostrohe, right single quote
    line = line.replace('Ã¢â‚¬Ëœ', '‘')  # left single quote 
    line = line.replace('Ã¢â‚¬“','–')    # en dash
    line = line.replace('Ã¢â‚¬”', '—')
    line = line.replace('Ì¶', '–')
    line = line.replace('Ã”', '”')
    line = line.replace('Ã”Ã', '”')
    line = line.replace('Ã', '')
    line = line.replace('Ãƒâ€šÃ‚Â', '')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â€žÂ¢', '’')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã…â€œ', '“')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã‚Â', '”')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â‚¬Å“','–')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã‹Å“', '‘')
    
    
    line = line.replace('Â', '')
    line = line.replace('Â', '')
    line = line.replace('Â', '’')
    
    line = line.replace('', '’') 
    line = line.replace('', '—')   
    line = line.replace('', '“')    
    line = line.replace('', '”')  
    line = line.replace('', '‘')    
    line = line.replace('', '…')
    line = line.replace('', '–')
    line = line.replace('© ', '©')
    
    line = line.replace(r'&lt;', '<')
    line = line.replace(r'&gt;', '>')
    
    line = line.replace(r'&ldquo;', '“')
    line = line.replace(r'&rdquo;', '”')
    return(line)     
    
def reformatHTMLStyles(wdir, file):
    """Run the style reformatter that matches options.DOCTYPE.

    'AbiWord', 'Word', 'LibreOffice'/'OpenOffice' and 'Google' each have
    their own reformatter; any other doctype leaves the file untouched.
    Returns 0.
    """
    if options.DOCTYPE == 'AbiWord':
        reformatAbiWordStyles(wdir, file)
    elif options.DOCTYPE == 'Word':
        reformatWordStyles(wdir, file)
    elif options.DOCTYPE in ('LibreOffice', 'OpenOffice'):
        reformatOpenDocStyles(wdir, file)
    elif options.DOCTYPE == 'Google':
        reformatGoogleStyles(wdir, file)

    return 0

def reformatTidyStyles(wdir, file):
    """Put the rules of the <style> section on separate lines, in place.

    wdir -- directory used for the temporary output file
    file -- full path of the html file

    Lines mentioning '@font-face', '{font-family:', 'panose-' or
    'transform:' are dropped, as are blank lines.  Every line ahead of the
    first '</style>' has its braces and '; ' separators split onto
    separate lines; from that line on, lines are only stripped.  Both
    handles are closed even if an error occurs.  Returns 0.
    """
    dropped_markers = ('@font-face', '{font-family:', 'panose-', 'transform:')
    output = os.path.join(wdir, 'ostyles1.html')
    past_styles = False

    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if any(marker in line for marker in dropped_markers):
                continue
            if line.strip() == '':
                continue

            if '</style>' in line:
                past_styles = True
            if not past_styles:
                line = line.replace('{', '{\n')
                line = line.replace('; ', ';\n')
                line = line.replace('}', '\n}\n')
            outfp.write(line.strip() + '\n')

    os.replace(output, file)
    return 0
    
def reformatAbiWordStyles(wdir, file):
    """Put '@media' on its own line where it directly follows a '}'.

    Only lines whose stripped text starts with '}@media' are changed; all
    other lines are copied verbatim.  The result is built in
    ``wdir``/abi_headers.html and then moved over ``file``.  Returns 0.
    """
    output = os.path.join(wdir, 'abi_headers.html')

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip().startswith('}@media'):
                line = line.replace('}@media', '}\n@media')
            outfp.write(line)

    os.replace(output, file)
    return 0
    
    
def reformatWordStyles(wdir, file):
    """Normalise the layout of the CSS rules in a Word-exported HTML file.

    For every line of ``file``:
      * every 'Mso' substring is removed;
      * lines containing font-face/panose/page-size markers are dropped;
      * lines containing CDATA or comment delimiters are dropped;
      * a line starting with a known selector prefix gets '  {' appended
        (unless it already ends that way);
      * on other lines a leading-'{' line loses all its '{' characters and
        a trailing ';}' is split into ';' and '}' on separate lines.

    The result is built in ``wdir``/wstyles2.html and then moved over
    ``file``.  Returns 0.
    """
    dropped = ('@font-face', '{font-family:', 'panose-', 'size:8.5in 11.0in;')
    delimiters = ('<![CDATA[', ']]>', '/*', '<!--', '-->')
    # 'h1' also covers 'h1.' and 'div.' also covers 'div.Section1', so the
    # longer prefixes do not need to be listed separately.
    selectors = ('p.', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div.', 'span.',
                 'a:link', 'a:visited', 'a: link', 'a: visited',
                 '.PapDefault', '.ChpDefault', '@page')
    output = os.path.join(wdir, 'wstyles2.html')

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            line = line.replace('Mso', '')

            if any(marker in line for marker in dropped):
                continue
            if any(marker in line for marker in delimiters):
                continue

            if line.startswith(selectors):
                if '  {\n' not in line:
                    # Selector on its own line: open the rule block here;
                    # the following '{...' line has its brace stripped.
                    outfp.write(line.strip() + '  {\n')
                    continue
            else:
                if line.startswith('{'):
                    line = line.replace('{', '')
                line = line.replace(';}\n', ';\n}\n')

            outfp.write(line)

    os.replace(output, file)
    return 0
    
def reformatOpenDocStyles(wdir, file):
    """Rewrite ``file`` with one CSS declaration per line in its style section.

    Every line before the first one containing '</style>' gets '  {' plus a
    newline in place of '{', a newline after '; ' and newlines around '}'.
    From the closing tag onwards lines are only stripped.  Every line,
    including blank ones, is written back stripped with a single newline.

    The result is built in ``wdir``/ostyles1.html and then moved over
    ``file``.  Returns 0.
    """
    output = os.path.join(wdir, 'ostyles1.html')
    in_styles = True

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            # Reformatting stops for good at the first closing style tag.
            if in_styles and '</style>' in line:
                in_styles = False

            if in_styles:
                line = line.replace('{', '  {\n')
                line = line.replace('; ', ';\n')
                line = line.replace('}', '\n}\n')

            outfp.write(line.strip() + '\n')

    os.replace(output, file)
    return 0
    
def reformatGoogleStyles(wdir, file):
    """Rewrite ``file`` with one CSS declaration per line in its style section.

    Every line before the first one containing '</style>' has 'transform:'
    removed, '{' replaced by '  {' plus a newline, a newline added after
    every ';' and newlines added around '}'.  From the closing tag onwards
    lines are only stripped.  Every line is written back stripped with a
    single newline.

    The result is built in ``wdir``/gstyles1.html and then moved over
    ``file``.  Returns 0.
    """
    output = os.path.join(wdir, 'gstyles1.html')
    in_styles = True

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            # Reformatting stops for good at the first closing style tag.
            if in_styles and '</style>' in line:
                in_styles = False

            if in_styles:
                line = line.replace('transform:', '')
                line = line.replace('{', '  {\n')
                line = line.replace(';', ';\n')
                line = line.replace('}', '\n}\n')

            outfp.write(line.strip() + '\n')

    os.replace(output, file)
    return 0
    
def prettifyCSS(wdir, css):
    """Tidy the stylesheet ``wdir``/``css`` in place.

    'Mso' substrings are removed; blank lines, lines that are just ';' and
    lines containing 'font-size: pt;' are dropped; upper-case element names
    (P, H1-H6, A:link, DIV, SPAN) are lower-cased; a doubled ' {  {' is
    collapsed to '  {'; every kept line is stripped.

    When the stylesheet's base name is 'html_styles.css', default rules for
    'p' and 'h1'-'h6' are appended at the end.

    The result is built in ``wdir``/link_rel.html and then moved over the
    stylesheet.  Returns 0.
    """
    css = os.path.join(wdir, css)
    output = os.path.join(wdir, 'link_rel.html')

    # Applied in order.  'H1' -> 'h1' already covers 'H1.', so no separate
    # dotted heading entries are needed.
    lowercase = (
        ('P   {', 'p   {'),
        ('H1', 'h1'), ('H2', 'h2'), ('H3', 'h3'),
        ('H4', 'h4'), ('H5', 'h5'), ('H6', 'h6'),
        ('P.', 'p.'),
        ('A:link', 'a:link'),
        ('DIV', 'div'),
        ('SPAN', 'span'),
    )
    defaults = (
        'p  {\n'
        'font-family: "Times New Roman", serif;\n'
        'text-align: justify;\n'
        'text-indent: 0em;\n'
        'line-height: 1.2em;\n'
        '}\n'
        'h1, h2, h3, h4, h5, h6  {\n'
        'font-family: "Times New Roman", serif;\n'
        'text-indent: 0em;\n'
        '}\n'
    )

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(css, 'rt', encoding='utf8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            line = line.replace('Mso', '')

            stripped = line.strip()
            if stripped == '' or stripped == ';':
                continue
            if 'font-size: pt;' in line:
                continue

            for old, new in lowercase:
                line = line.replace(old, new)
            line = line.replace(' {  {', '  {')

            outfp.write(line.strip() + '\n')

        if os.path.basename(css) == 'html_styles.css':
            outfp.write(defaults)

    os.replace(output, css)
    return 0
    
def prettifyCSS2(wdir, css):
    """Tidy the stylesheet ``wdir``/``css`` in place, adding no default rules.

    'Mso' substrings are removed; blank lines, lines that are just ';' and
    lines containing 'font-size: pt;' are dropped; upper-case element names
    (P, H1-H6, A:link, DIV, SPAN) are lower-cased; a doubled ' {  {' is
    collapsed to '  {'; every kept line is stripped.

    The result is built in ``wdir``/link_rel.html and then moved over the
    stylesheet.  Returns 0.
    """
    css = os.path.join(wdir, css)
    output = os.path.join(wdir, 'link_rel.html')

    # Applied in order.  'H1' -> 'h1' already covers 'H1.', so no separate
    # dotted heading entries are needed.
    lowercase = (
        ('P   {', 'p   {'),
        ('H1', 'h1'), ('H2', 'h2'), ('H3', 'h3'),
        ('H4', 'h4'), ('H5', 'h5'), ('H6', 'h6'),
        ('P.', 'p.'),
        ('A:link', 'a:link'),
        ('DIV', 'div'),
        ('SPAN', 'span'),
    )

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(css, 'rt', encoding='utf8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            line = line.replace('Mso', '')

            stripped = line.strip()
            if stripped == '' or stripped == ';':
                continue
            if 'font-size: pt;' in line:
                continue

            for old, new in lowercase:
                line = line.replace(old, new)
            line = line.replace(' {  {', '  {')

            outfp.write(line.strip() + '\n')

    os.replace(output, css)
    return 0
    
def sddHTMLHeaders(wdir, file):
    """Wrap the content of ``file`` in a complete XHTML 1.1 document.

    Writes an XML declaration, DOCTYPE, <html>, a <head> with a UTF-8 meta
    tag and empty <title>, and an opening <body>; then the original content
    unchanged; then the closing </body> and </html> tags.

    The result is built in ``wdir``/link_rel.html and then moved over
    ``file``.  Returns 0.
    """
    header = (
        '<?xml version="1.0" encoding="utf-8"?>\n'
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
        '<html xmlns="http://www.w3.org/1999/xhtml">\n'
        '<head>\n'
        ' <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n'
        '<title></title>\n'
        '</head>\n\n'
        '<body>\n'
    )
    output = os.path.join(wdir, 'link_rel.html')

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(header)
        for line in infp:
            outfp.write(line)
        outfp.write('</body>\n</html>\n')

    os.replace(output, file)
    return 0
    
def removeHTMLAttributes(wdir, file):
    """Strip every attribute from the body markup of ``file``.

    Steps: removeHTMLTop() discards everything outside the body; all
    attributes are cleared from every tag; the now-bare <span>, <a> and
    <img> tags are unwrapped; <p> tags whose text is empty or a single
    space are deleted; the markup is written back, the document head and
    tail are restored by addHTMLTop(), and the file is re-prettified.

    The result is built in ``wdir``/link_rel.html and then moved over
    ``file``.  Returns 0.
    """
    removeHTMLTop(wdir, file)
    output = os.path.join(wdir, 'link_rel.html')

    # Read through a context manager so the input handle is always closed.
    with open(file, 'rt', encoding='utf8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # Clearing attrs wholesale covers every individual attribute, so no
    # per-attribute deletion passes are needed.
    for tag in soup.find_all(True):
        tag.attrs = {}

    # With no attributes left, every span/a/img is bare.  unwrap() keeps the
    # children of span/a; an img has none, so it simply disappears.
    for bare in soup.find_all(['span', 'a', 'img']):
        bare.unwrap()

    # remove empty 'p' tags
    for p in soup.find_all(['p']):
        if p.text in ('', ' '):
            p.decompose()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.replace(output, file)
    addHTMLTop(wdir, file)
    prettifyXHTMLFile(wdir, file)

    return 0
        
    
def removeHTMLTop(wdir, file):
    """Reduce ``file`` to the lines found inside its <body>.

    Every line up to and including the first one containing '<body' is
    dropped, as is any later line containing '<body', '</body>', '</html>'
    or '<div></div'.  All remaining lines are copied verbatim.  If no line
    contains '<body', the resulting file is empty.

    The result is built in ``wdir``/remove_top.html and then moved over
    ``file``.  Returns 0.
    """
    dropped = ('</body>', '</html>', '<div></div')
    output = os.path.join(wdir, 'remove_top.html')
    in_body = False

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if '<body' in line:
                in_body = True
                continue
            if not in_body:
                continue
            if any(marker in line for marker in dropped):
                continue
            outfp.write(line)

    os.replace(output, file)
    return 0
    
def addHTMLTop(wdir, file):
    """Prepend an XHTML 1.1 head to ``file`` and append the closing tags.

    Writes an XML declaration, DOCTYPE, <html xml:lang="en">, a <head> with
    a UTF-8 meta tag and empty <title>, and an opening <body>, followed by
    the original content unchanged.  addHTMLTail() is then called to append
    the closing </body> and </html>.

    The result is built in ``wdir``/add_top.html and then moved over
    ``file``.  Returns 0.
    """
    header = (
        '<?xml version="1.0" encoding="utf-8"?>\n'
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n'
        '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">\n'
        '<head>\n'
        '  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n'
        '<title></title>\n'
        '</head>\n'
        '\n<body>\n\n'
    )
    output = os.path.join(wdir, 'add_top.html')

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(header)
        for line in infp:
            outfp.write(line)

    os.replace(output, file)
    addHTMLTail(wdir, file)
    return 0
    
def addHTMLTail(wdir, file):
    """Append the closing </body> and </html> tags to ``file``.

    The original content is copied unchanged, followed by a blank line,
    '</body>', '</html>' and a trailing blank line.

    The result is built in ``wdir``/tails.html and then moved over
    ``file``.  Returns 0.
    """
    output = os.path.join(wdir, 'tails.html')

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            outfp.write(line)
        outfp.write('\n</body>\n</html>\n\n')

    os.replace(output, file)
    return 0

def removeHTMLStyles(wdir, file):
    """Remove every <style> section from ``file``.

    A section starts on a line containing '<style>' or
    '<style type="text/css">' and runs through the next line containing
    '</style>'; all of those lines are dropped.  When the closing tag
    follows the opening tag on the same line, only that line is dropped.
    Every other line is written back stripped with a single newline.

    The result is built in ``wdir``/remove_styles.html and then moved over
    ``file``.  Returns 0.
    """
    print(' -- Remove html <styles> from the file')
    openers = ('<style>', '<style type="text/css">')
    output = os.path.join(wdir, 'remove_styles.html')
    in_style = False

    # Both handles are context-managed so neither leaks if a read/write fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if in_style:
                if '</style>' in line:
                    in_style = False
                continue

            opener = next((tag for tag in openers if tag in line), None)
            if opener is not None:
                # Only keep skipping when the section is not already closed
                # on this line; otherwise unrelated lines after a one-line
                # <style>...</style> would be swallowed.
                in_style = '</style>' not in line.partition(opener)[2]
                continue

            outfp.write(line.strip() + '\n')

    os.replace(output, file)
    return 0
        
def removeAttributes(wdir, file):
    """Drop selected attributes from ``file`` and turn 'name' into 'id'.

    * p, img, span, body, a, h1 lose: dir, border, title, link, vlink,
      text, lang, clear, hspace, vspace.
    * h2-h6 and br lose: dir, border, title, link, vlink, text, lang, clear.
    * On p, a and h1-h6, a 'name' attribute is moved to 'id' (replacing
      any existing 'id').

    The result is built in ``wdir``/body.html and then moved over ``file``.
    Returns 0.
    """
    print(' -- Remove or change non-compliant attributes')
    output = os.path.join(wdir, 'body.html')

    # Read through a context manager so the input handle is always closed.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # remove all unwanted proprietary attributes from the html doc
    shared = ['dir', 'border', 'title', 'link', 'vlink', 'text', 'lang', 'clear']
    cleanup = (
        (['p', 'img', 'span', 'body', 'a', 'h1'], shared + ['hspace', 'vspace']),
        (['h2', 'h3', 'h4', 'h5', 'h6', 'br'], shared),
    )
    for tag_names, attributes in cleanup:
        for tag in soup.find_all(tag_names):
            for attribute in attributes:
                del tag[attribute]      # no-op when the attribute is absent

    # swap 'id' for 'name'
    # The tag names must be passed as ONE list: extra positional arguments
    # to find_all() are attrs/recursive/string/limit, not more tag names.
    for tag in soup.find_all(['p', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        if tag.has_attr('name'):
            tag['id'] = tag['name']
            del tag['name']

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.replace(output, file)
    return 0
            
def removeInlineStyling(wdir, file):
    """Strip inline styling from ``file``.

    * Every tag loses its 'style' attribute.
    * A tag whose 'class' attribute has any class name containing 'sgc-'
      loses the whole 'class' attribute.
    * <span> and <font> tags left with no attributes are unwrapped.
    * <link> tags that mention 'inline_styles.css' are deleted.

    The result is built in ``wdir``/check_file.html, moved over ``file``,
    and the file is then re-prettified.  Returns 0.
    """
    print(' >> In removeInlineStyling()...\n')
    output = os.path.join(wdir, 'check_file.html')

    # Read through a context manager so the input handle is always closed.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for tag in soup.find_all(True):
        del tag['style']

        # 'class' is normally a list of names, so each name has to be
        # checked; a plain `'sgc-' in list` only matches a class that is
        # exactly 'sgc-'.
        classes = tag.get('class')
        if isinstance(classes, str):
            classes = [classes]
        if classes and any('sgc-' in name for name in classes):
            del tag['class']

    # remove spans and font tags that contain no styling
    # Test the tag's own attributes: searching its markup for '<span>' would
    # also hit a styled span that merely contains a bare one.
    for wrapper in soup.find_all(['span', 'font']):
        if not wrapper.attrs:
            wrapper.unwrap()

    for link in soup.find_all('link'):
        if 'inline_styles.css' in str(link):
            link.decompose()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.replace(output, file)
    prettifyXHTMLFile(wdir, file)
    return 0

def removeAllTagAttributes(wdir, file):
    """Strip every attribute from the body markup of ``file``.

    Steps: removeHTMLTop() discards everything outside the body; <link>
    tags that mention 'inline_styles.css' or 'html_styles.css' are deleted;
    all attributes are cleared from every remaining tag; the markup is
    written back; addHTMLTop() restores the document head and tail; and
    removeHTMLStyles() removes any <style> sections.

    The result is built in ``wdir``/remove_attrs.html and then moved over
    ``file``.  Returns 0.

    NOTE(review): the previous ``return(s_fnames)`` and the
    ``'html_styles.css' in line`` test both referenced names that are not
    defined in this function; 0 is returned to match the sibling helpers -
    confirm no caller expects a list back.
    """
    removeHTMLTop(wdir, file)
    output = os.path.join(wdir, 'remove_attrs.html')

    # Read through a context manager so the input handle is always closed.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # Stylesheet links must be examined BEFORE attributes are cleared;
    # afterwards the href is gone and nothing could match.
    for link in soup.find_all('link'):
        markup = str(link)
        if 'inline_styles.css' in markup or 'html_styles.css' in markup:
            link.decompose()

    for tag in soup.find_all(True):
        tag.attrs = {}

    print('\n>>> In removeAllTagAttributes()...')

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.replace(output, file)
    addHTMLTop(wdir, file)

    # remove the html styles section
    removeHTMLStyles(wdir, file)

    return 0

def removeAllClasses(wdir, file):
    """Remove every 'class' attribute from the body markup of ``file``.

    Steps: removeHTMLTop() discards everything outside the body; every tag
    loses its 'class' attribute; <span> and <font> tags left with no
    attributes are unwrapped; <link> tags that mention 'html_styles.css'
    are deleted; the markup is written back; addHTMLTop() restores the
    document head and tail; removeHTMLStyles() removes any <style>
    sections; and the file is re-prettified.

    The result is built in ``wdir``/remove_attrs.html and then moved over
    ``file``.  Returns 0.
    """
    removeHTMLTop(wdir, file)
    output = os.path.join(wdir, 'remove_attrs.html')

    # Read through a context manager so the input handle is always closed.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for tag in soup.find_all(True):
        del tag['class']

    # remove spans and font tags that contain no styling
    # Test the tag's own attributes: searching its markup for '<span>' would
    # also hit a styled span that merely contains a bare one.
    for wrapper in soup.find_all(['span', 'font']):
        if not wrapper.attrs:
            wrapper.unwrap()

    for link in soup.find_all('link'):
        if 'html_styles.css' in str(link):
            link.decompose()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.replace(output, file)
    addHTMLTop(wdir, file)

    # remove the html styles section
    removeHTMLStyles(wdir, file)
    prettifyXHTMLFile(wdir, file)

    return 0
    

    
    
    
