#!/Python3/python
# -*- coding: utf-8 -*-


from __future__ import unicode_literals, division, absolute_import, print_function


__all__=["checkUTF8", "adjustCSSBody", "changeStylingTags", "getPointSize", "removeFontTags", "removeImageClass", "removeAlignAttr", "insertAbiWordMetadata", "convertName2IDAttr", "reformatHTMLBodyTag", "fixHTMLAttrValues", "formatHTMLBodyTag", "removeLangAttrs", "reformatGoogleStyles", "show_msgbox", "removeAttributes", "removeFontStyles", "removeStyles", "addDOCTYPEHeader", "reformatOpenDocStyles", "reformatAbiWordStyles", "reformatWordStyles", "removeFonts", "changeBodyTag", "convertTags", "getHTMLDocType", "removeEmptyTags", "removeRedundantCSS", "removeRedundantHTML", "fixEncodingErrors", "removeHardBreaks", "getImageSize", "formatImages", "reformatBookImages", "removePageBreaks", "addHTMLTail", "prettifyXHTMLFile", "convertITags", "convertBTags", "convertEMTags", "convertStrongTags", "convertSTags", "convertUTags", "insertGoogleMetadata"]



import os, os.path, sys, codecs, inspect, chardet, re, time, shutil
from decimal import *
from PIL import Image
import options
from doc_tidy import *
from cutils import show_msgbox

import locale
import tkinter as tk
import tkinter.messagebox as mbox

try:
    from sigil_bs4 import BeautifulSoup, Comment
except ImportError:
    # sigil_bs4 is not importable here; use the standalone bs4 package.
    # Only ImportError is caught so unrelated failures are not hidden.
    from bs4 import BeautifulSoup, Comment

def checkUTF8(wdir, file):
    """Round-trip an HTML file through BeautifulSoup as UTF-8.

    The file ``wdir/file`` is read as UTF-8 (a decoding error propagates),
    parsed with ``html.parser`` and re-serialised over the original via the
    temporary file ``check_file.html`` in ``wdir``.

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'check_file.html')

    # Read first, so a decoding failure leaves no stray temp file behind.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Single replace step: the original is never deleted without a successor.
    os.replace(output, file)
    return(0)
    
def adjustCSSBody(wdir, file):
    """Add ``font-family: serif;`` to the ``body  {`` rule of a stylesheet.

    Lines containing ``body  {`` are written unstripped, with the serif
    declaration inserted after the opening line; every other line is
    stripped of surrounding whitespace and written with one newline.
    The result replaces ``wdir/file`` (temp file: ``body_repaired.css``).

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = wdir + os.sep + 'body_repaired.css'

    with open(file, 'r', encoding='utf-8') as infp, \
         open(output, 'w', encoding='utf-8') as outfp:
        for line in infp:
            if 'body  {' in line:
                outfp.write(line.replace('body  {\n',
                                         'body  {\nfont-family: serif;\n'))
            else:
                outfp.write(line.strip() + '\n')

    # Swap the repaired file in; the temp file is gone afterwards.
    os.replace(output, file)
    return(0)
  
def changeStylingTags(wdir, file):
    """Turn ``<u>`` and ``<s>`` inside paragraphs into styled ``<span>`` tags.

    Every ``<u>`` within a ``<p>`` becomes
    ``<span style="text-decoration: underline;">`` and every ``<s>`` becomes
    ``<span style="text-decoration: line-through;">``. Any existing ``style``
    on the tag is overwritten. The file ``wdir/file`` is rewritten in place
    (temp file: ``remove_fontags.html``).

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'remove_fontags.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    replacements = (
        ('u', 'text-decoration: underline;'),
        ('s', 'text-decoration: line-through;'),
    )
    for ptag in soup.find_all('p'):
        for tag_name, css in replacements:
            # find_all() rather than ptag.u / ptag.s: the attribute shortcut
            # only yields the first match, leaving later ones unconverted.
            for styled in ptag.find_all(tag_name):
                styled['style'] = css
                styled.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
  
def getPointSize(size):
    """Map an HTML ``<font size>`` value (1-7) to a point size string.

    ``size`` may be a string or a number; surrounding whitespace is ignored
    and any ``.0`` is removed before the lookup (so ``'3.0'`` behaves like
    ``'3'``).

    Returns the point size as a string, or ``''`` (after printing an error
    message) when the value is not one of 1-7.
    """
    sizes = {'1': '10',
             '2': '12',
             '3': '14',
             '4': '16',
             '5': '18',
             '6': '26',
             '7': '30'
            }

    # str() lets callers pass ints/floats without an AttributeError.
    key = str(size).strip().replace('.0', '')
    points = sizes.get(key)
    if points is None:
        print('\n >>> Error: Unable to convert the font SIZE attribute value to points in the HTML.\n')
        return('')
    return(points)
            
def removeFontTags(wdir, file):
    """Strip ``<font>`` tags, keeping only those that still carry a style.

    Steps, applied to ``wdir/file`` (temp file: ``remove_fontags.html``):

    1. For each ``<p>``, the ``style`` of its first ``<font>`` descendant is
       appended to the paragraph's own ``style`` and removed from the font.
    2. Every ``<font>`` that has a ``face`` attribute, has ``size="3"``, or
       has no ``style`` attribute is unwrapped (its children stay in place).
    3. Surviving ``<font>`` tags lose their ``color`` attribute.

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'remove_fontags.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # Move the first font's inline style up onto its paragraph.
    for p in soup.find_all('p'):
        font = p.font
        if font is None or not font.has_attr('style'):
            continue
        if p.has_attr('style'):
            separator = '' if p['style'].endswith(';') else ';'
            p['style'] = p['style'] + separator + font['style']
        else:
            p['style'] = font['style']
        del font['style']

    # One pass is enough: each font's fate depends only on its own attributes.
    for font in soup.find_all('font'):
        if (font.has_attr('face')
                or font.get('size') == '3'
                or not font.has_attr('style')):
            font.unwrap()
        else:
            del font['color']

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
                    
def removeImageClass(wdir, file):
    """Delete the ``class`` attribute from every ``<img>`` in ``wdir/file``.

    Does nothing when ``options.MOVE_ALL_STYLES`` compares equal to False.
    The file is rewritten in place (temp file: ``remove_class.html``).

    Returns 0.
    """
    if options.MOVE_ALL_STYLES == False:
        return(0)

    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'remove_class.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for itag in soup.find_all('img'):
        if itag.has_attr('class'):
            del itag['class']

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
def removeAlignAttr(wdir, file):
    """Remove the ``align`` attribute from every tag of a Word document.

    Only runs when ``options.DOCTYPE`` is ``'Word'``. The file ``wdir/file``
    is rewritten in place (temp file: ``remove_center.html``).

    Returns 0.
    """
    if options.DOCTYPE != 'Word':
        return(0)

    print(' -- Remove align:"center" attribute')

    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'remove_center.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # align=True already restricts the result to tags carrying the attribute.
    for tag in soup.find_all(align=True):
        del tag['align']

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
def insertAbiWordMetadata(wdir, file):
    """Insert an AbiWord ``Generator`` meta tag after ``<head>``.

    Every line containing ``<head>`` is replaced as a whole by ``<head>``
    followed by the meta tag line; anything else on that line is dropped.
    ``file`` is used as given (it is not joined with ``wdir``); the temp
    file ``Abi_meta2.html`` is created in ``wdir``.

    Returns 0.
    """
    print(' >> In Abi insert metadata...!!')
    output = os.path.join(wdir, 'Abi_meta2.html')
    head_with_meta = '<head>\n  <meta name="Generator" content="AbiWord HTML" />\n'

    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            outfp.write(head_with_meta if '<head>' in line else line)

    os.replace(output, file)
    return(0)

def convertName2IDAttr(wdir, file):
    """Replace non-empty ``name`` attributes with ``id`` attributes.

    Applies to every tag in ``wdir/file`` that has a non-empty ``name``
    attribute; the value is moved to ``id`` (overwriting an existing id).
    The file is rewritten in place (temp file: ``name2id.html``).

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'name2id.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # ``name`` is find_all()'s tag-name parameter, so the attribute has to be
    # requested through ``attrs``; find_all(name=True) matches every tag.
    # NOTE(review): this also converts <meta name="..."> - confirm intended.
    for tag in soup.find_all(attrs={'name': True}):
        idref = tag['name']
        if idref is not None and idref != '':
            del tag['name']
            tag['id'] = idref

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
def reformatHTMLBodyTag(wdir, file):
    """Reset the ``<body>`` tag to carry only ``class="globals"``.

    All existing body attributes are discarded. A document without a
    ``<body>`` tag is re-serialised unchanged instead of raising.
    The file ``wdir/file`` is rewritten in place (temp file:
    ``reformat_body.html``).

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'reformat_body.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    body = soup.body
    if body is not None:
        body.attrs = {'class': 'globals'}

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
def fixHTMLAttrValues(wdir, file):
    """Remove spaces from the ``id`` values of img, a, p and div tags.

    ``file`` is used as given (it is not joined with ``wdir``); the temp
    file ``fix_ids1.htm`` is created in ``wdir``.

    Returns 0.
    """
    output = os.path.join(wdir, 'fix_ids1.htm')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # The four tag types get identical treatment, so one query covers them.
    for tag in soup.find_all(['img', 'a', 'p', 'div']):
        if tag.has_attr('id'):
            tag['id'] = str(tag['id']).replace(' ', '')

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
def formatHTMLBodyTag(wdir, file):
    """Replace the opening ``<body ...>`` line with a serif-styled body tag.

    Any line whose stripped text starts with ``<body`` is replaced as a
    whole by a fixed ``<body style=...>`` line followed by a blank line.
    The file ``wdir/file`` is rewritten in place (temp file:
    ``add_body.html``).

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'add_body.html')
    # 'margin' was previously misspelled 'magin', which browsers ignore.
    body_line = '<body style="font-family: serif;margin: 3% 3% 3% 3%;">\n\n'

    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip().startswith('<body'):
                line = body_line
            outfp.write(line)

    os.replace(output, file)
    return(0)
    
def removeLangAttrs(wdir, file):
    """Remove all ``lang`` and ``xml:lang`` attributes from ``wdir/file``.

    The file is rewritten in place (temp file: ``remove_lang.html``).

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'remove_lang.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # Single walk over every tag handles both attribute spellings.
    for tag in soup.find_all(True):
        for attr in ('lang', 'xml:lang'):
            if tag.has_attr(attr):
                del tag[attr]

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
def reformatGoogleStyles(wdir, file):
    """Reformat and prune the CSS of a Google-exported HTML file.

    Only runs when ``options.DOCTYPE`` is ``'Google'``. ``file`` is used as
    given (not joined with ``wdir``); temp files are created in ``wdir``.

    Pass 1 (``gstyles1.html``): puts a blank line between a closing
    h1/h2/p tag and a following ``<a``, drops hidden ``<hr>`` lines, splits
    CSS rules onto separate lines up to the first ``</style>`` line, and
    strips every line.

    Pass 2 (``reformat_G.html``): drops lines mentioning imports, orphans,
    widows, ``color:`` and a few no-op declarations, removes transform
    fragments and empty ``title`` attributes, turns padding into margin and
    rewrites superscript declarations.

    Finally calls ``insertGoogleMetadata(wdir, file)``. Returns 0.
    """
    if options.DOCTYPE != 'Google':
        return(0)

    print('\n >>> In reformat Google !!')

    # ---- pass 1: one CSS declaration per line --------------------------
    styles_done = False
    output = os.path.join(wdir, 'gstyles1.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            for closing in ('</h1>', '</h2>', '</p>'):
                line = line.replace(closing + '<a', closing + '\n\n<a')

            if '<hr style="display:none;"/>' in line:
                continue

            # The </style> line itself and everything after it is left alone.
            if '</style>' in line:
                styles_done = True
            if not styles_done:
                line = line.replace('{', '  {\n')
                line = line.replace(';', ';\n')
                line = line.replace('}', '\n}\n')

            outfp.write(line.strip() + '\n')
    os.replace(output, file)

    # ---- pass 2: prune / rewrite declarations --------------------------
    # NOTE(review): these filters run over the whole file, not only the
    # <style> section, so a body line containing e.g. 'color:' or 'widows'
    # is dropped too - confirm this is acceptable.
    dropped_markers = ('@import url(', 'orphans', 'widows', 'color:',
                       'vertical-align:baseline;', 'text-decoration:none;')
    removed_fragments = (' transform:', ' rotate(0.00rad)',
                         ' translateZ(0px);', ' -webkit-transform:',
                         'title=""')
    # 'padding:' must come last so the specific sides are handled first.
    padding_to_margin = (('padding-top:', 'margin-top:'),
                         ('padding-bottom:', 'margin-bottom:'),
                         ('padding-left:', 'margin-left:'),
                         ('padding-right:', 'margin-right:'),
                         ('padding:', 'margin:'))

    output = os.path.join(wdir, 'reformat_G.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if any(marker in line for marker in dropped_markers):
                continue

            for fragment in removed_fragments:
                line = line.replace(fragment, '')

            if '@media url{' in line:
                line = ''

            for old, new in padding_to_margin:
                line = line.replace(old, new)

            if 'vertical-align:super' in line:
                line = 'font-size:0.75em;\nvertical-align:top;\nline-height:normal;\n'

            outfp.write(line)
    os.replace(output, file)

    insertGoogleMetadata(wdir, file)

    return(0)
    
def show_msgbox(title, msg, msgtype='info'):
    """ For general information, warnings and errors

    Shows a modal tkinter message box. ``msgtype`` is one of ``'info'``,
    ``'warning'`` or ``'error'``; any other value shows nothing.

    Returns the value returned by the tkinter message box call, or None
    for an unknown ``msgtype``.
    """
    dialogs = {'info': mbox.showinfo,
               'warning': mbox.showwarning,
               'error': mbox.showerror}

    localRoot = tk.Tk()
    try:
        localRoot.withdraw()
        localRoot.option_add('*font', 'Helvetica -12')
        show = dialogs.get(msgtype)
        if show is None:
            return(None)
        return(show(title, msg))
    finally:
        # Each call creates its own hidden root; destroy it so repeated
        # calls do not accumulate Tk interpreters.
        localRoot.destroy()
        
def removeAttributes(wdir, file):
    """Delete presentational/proprietary attributes from common tags.

    From p, img, span, body, a and h1: dir, border, title, link, vlink,
    text, lang, clear, hspace, vspace.
    From h2-h6 and br: dir, border, title, link, vlink, text, lang, clear.

    ``file`` is used as given (not joined with ``wdir``); the temp file
    ``body.html`` is created in ``wdir``.

    Returns 0.
    """
    print(' -- Remove or change non-compliant attributes')
    output = os.path.join(wdir, 'body.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    common_attribs = ['dir', 'border', 'title', 'link', 'vlink', 'text',
                      'lang', 'clear']
    cleanup_table = (
        (['p', 'img', 'span', 'body', 'a', 'h1'],
         common_attribs + ['hspace', 'vspace']),
        (['h2', 'h3', 'h4', 'h5', 'h6', 'br'],
         common_attribs),
    )
    for search_tags, search_attribs in cleanup_table:
        for tag in soup.find_all(search_tags):
            for attribute in search_attribs:
                # Same unconditional delete as before; relies on bs4
                # accepting deletion of an attribute the tag does not have.
                del tag[attribute]

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
    
def removeFontStyles(wdir, file):
    """Drop ``font-family:`` lines that appear before the first ``</style>``.

    Lines from the first ``</style>`` line onwards are copied unchanged.
    ``file`` is used as given (not joined with ``wdir``); the temp file
    ``font_styles.html`` is created in ``wdir``.

    Returns 0.
    """
    output = os.path.join(wdir, 'font_styles.html')
    in_styles = True

    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if '</style>' in line:
                in_styles = False
            elif in_styles and 'font-family:' in line:
                continue
            outfp.write(line)

    os.replace(output, file)
    return(0)
    
def removeStyles(wdir, file):
    """Prune unwanted CSS declarations and a few HTML leftovers.

    For every line of ``file`` (used as given, not joined with ``wdir``):

    * lines whose stripped text starts with ``<a`` are dropped;
    * a line containing ``kix_`` starts a rule that is skipped up to and
      including the next line containing ``}``;
    * `` dir="ltr"`` is removed;
    * before the first ``</style>`` line, numeric font weights 700/400
      become bold/normal, non-normal ``line-height`` lines are dropped, and
      lines with a set of no-op or unsupported declarations are dropped.

    The temp file ``remove_styles.html`` is created in ``wdir``.

    Returns 0.
    """
    # Matched after removing all spaces from the line.
    compact_markers = ('position:absolute', 'font-variant:normal',
                       'text-decoration:none', 'letter-spacing:normal',
                       'vertical-align:normal')
    # Matched against the line as-is.
    plain_markers = ('transform:', 'so-language:', '-webkit-transform',
                     'direction:', 'widows:', 'orphans:')

    finish = False
    output = os.path.join(wdir, 'remove_styles.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:

            if line.strip().startswith('<a'):
                continue

            # remove Abiword kix styles
            if 'kix_' in line:
                # Deliberately advances the SAME file iterator: the rule
                # body is consumed here and never seen by the outer loop.
                # If the file ends first, ``line`` keeps its last value.
                for line in infp:
                    if '}' in line:
                        break
                line = line.replace('}\n', '')

            line = line.replace(' dir="ltr"', '')

            if '</style>' in line:
                outfp.write(line)
                finish = True
                continue

            if finish:
                outfp.write(line)
                continue

            if 'font-weight: 700' in line:
                line = line.replace('700', 'bold')

            if 'font-weight: 400' in line:
                line = line.replace('400', 'normal')

            if 'line-height:' in line \
                    and 'line-height: normal' not in line \
                    and 'line-height:normal' not in line:
                continue

            compact = line.replace(' ', '')
            if any(marker in compact for marker in compact_markers) \
                    or any(marker in line for marker in plain_markers) \
                    or line == ';\n':
                continue

            outfp.write(line)

    os.replace(output, file)
    return(0)
    
def addDOCTYPEHeader(wdir, file):
    """Replace the ``<?xml`` line with an XML declaration plus XHTML 1.1 DOCTYPE.

    Lines containing a CDATA marker (``<![CDATA[`` or ``]]>``) or a CSS
    comment marker (``/*`` or ``*/``) are dropped. If no line contains
    ``<?xml`` no header is added. ``file`` is used as given (not joined
    with ``wdir``); the temp file ``ostyles1.html`` is created in ``wdir``.

    Returns 0.
    """
    header = ('<?xml version="1.0" encoding="utf-8"?>\n'
              '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n'
              '  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n\n')
    dropped_markers = ('<![CDATA[', ']]>', '/*', '*/')

    output = os.path.join(wdir, 'ostyles1.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if any(marker in line for marker in dropped_markers):
                continue
            outfp.write(header if '<?xml' in line else line)

    os.replace(output, file)
    return(0)

def reformatOpenDocStyles(wdir, file):
    """Clean up styles and font tags of LibreOffice/OpenOffice HTML.

    Only runs when ``options.DOCTYPE`` is ``'LibreOffice'`` or
    ``'OpenOffice'``. ``file`` is used as given (not joined with ``wdir``);
    temp files are created in ``wdir``.

    Pass 1 (``ostyles1.html``): drops ``@page`` lines and lines mentioning
    ctl/cjk variants, removes ``-western`` and ``w2e_`` and splits CSS rules
    onto separate lines up to the first ``</style>`` line; every line is
    stripped.

    Pass 2 (``o_format.htm``): removes ``size``/``face`` from ``<font>``
    tags, renames fonts that have a ``style`` to ``<span>``, and converts
    ``<u>`` inside paragraphs to underlined spans.

    Returns 0.
    """
    if options.DOCTYPE not in ('LibreOffice', 'OpenOffice'):
        return(0)

    # ---- pass 1: line-based CSS clean-up -------------------------------
    variant_markers = ('-ctl', '-cjk', '.ctl', '.cjk')
    styles_done = False
    output = os.path.join(wdir, 'ostyles1.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if '@page' in line:
                continue
            if any(marker in line for marker in variant_markers):
                continue

            # The </style> line itself and everything after it is left alone.
            if '</style>' in line:
                styles_done = True
            if not styles_done:
                line = line.replace('-western', '')
                line = line.replace('w2e_', '')
                line = line.replace('{', '  {\n')
                line = line.replace('; ', ';\n')
                line = line.replace('}', '\n}\n')
            outfp.write(line.strip() + '\n')
    os.replace(output, file)

    # ---- pass 2: font / underline tags ---------------------------------
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for tag in soup.find_all('font'):
        if tag.has_attr('size'):
            del tag['size']
        if tag.has_attr('face'):
            del tag['face']
        # Covers style-only fonts as well, so no separate pre-pass is needed.
        if tag.has_attr('style'):
            tag.name = 'span'

    for ptag in soup.find_all('p'):
        # find_all() rather than ptag.u, which only yields the first <u>.
        for utag in ptag.find_all('u'):
            utag['style'] = 'text-decoration: underline;'
            utag.name = 'span'

    output = os.path.join(wdir, 'o_format.htm')
    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)

def reformatAbiWordStyles(wdir, file):
    """Put ``@media`` on its own line when it directly follows a ``}``.

    Only lines whose stripped text starts with ``}@media`` are changed.
    ``file`` is used as given (not joined with ``wdir``); the temp file
    ``abi_headers.html`` is created in ``wdir``.

    Returns 0.
    """
    output = os.path.join(wdir, 'abi_headers.html')

    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip().startswith('}@media'):
                line = line.replace('}@media', '}\n@media')
            outfp.write(line)

    os.replace(output, file)
    return(0)

def _wordStylesRewrite(path, tmp_path, transform):
    """Stream *path* through *transform* line by line, then replace *path*.

    ``transform(line)`` returns the text to write, or None to drop the line.
    """
    with open(path, 'rt', encoding='utf-8') as infp, \
         open(tmp_path, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            text = transform(line)
            if text is not None:
                outfp.write(text)
    os.replace(tmp_path, path)


def _wordStylesPass1(line):
    """Strip 'Mso', drop font-face lines, collapse doubled text-align."""
    line = line.replace('Mso', '')

    if '@font-face' in line or '{font-family' in line or 'panose-' in line:
        return None

    for align in ('center', 'justify', 'left'):
        doubled = 'text-align:' + align + ';text-align:' + align
        if doubled not in line.replace(' ', ''):
            continue
        fixed = 'text-align: ' + align + ';'
        head = 'text-align:' + align + ';text-align'
        # Same four spellings, in the same order, as the original code.
        for tail in (':' + align + ';', ':' + align,
                     ': ' + align + ';', ': ' + align):
            line = line.replace(head + tail, fixed)
    return line


# Line prefixes treated as CSS selectors that need an opening brace.
_WORD_SELECTOR_PREFIXES = (
    'p ', 'p.', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div.', 'span.',
    'a:link', 'a:visited', '.PapDefault', '.ChpDefault',
    'a: link', 'a: visited', '@page',
)


def _wordStylesPass2(line):
    """Drop CDATA/comment/mso lines and normalise rule braces."""
    if '<![CDATA[' in line or ']]>' in line or '/*' in line or '*/' in line:
        return None

    line = line.replace('Mso', '')

    if '{mso-' in line:
        if '}' not in line:
            return None
        line = '}\n'

    # A bare 'p' selector: emit it once with its brace. (Previously this
    # produced 'p {  {' because the brace was added twice.)
    if line.strip() == 'p':
        return 'p  {\n'

    if line.startswith(_WORD_SELECTOR_PREFIXES):
        if '  {\n' not in line:
            return line.strip() + '  {\n'
        return line

    if line.startswith('{'):
        # The brace was already emitted with the selector line.
        return line.replace('{', '')

    # Put a closing brace that follows a declaration on its own line.
    return line.replace(';}\n', ';\n}\n')


def _wordStylesPass3(line):
    """Drop lines with layout declarations that are not wanted."""
    if 'max-width:' in line or \
            'orphans:' in line or \
            'widows:' in line or \
            'page-break' in line or \
            'vertical-align: baseline' in line:
        return None
    return line


def reformatWordStyles(wdir, file):
    """Clean up the CSS of a Word-exported HTML file in three passes.

    Only runs when ``options.DOCTYPE`` is ``'Word'``. ``file`` is used as
    given (not joined with ``wdir``); the temp files ``wstyles1.html``,
    ``wstyles2.html`` and ``wstyles3.html`` are created in ``wdir``.

    The passes are kept separate on purpose: pass 2 can split one input
    line into two output lines, and pass 3 filters those lines
    individually.

    NOTE(review): all filters run over the whole file, not only the
    <style> section - confirm body text can never match them.

    Returns 0.
    """
    if options.DOCTYPE != 'Word':
        return(0)

    passes = (
        ('wstyles1.html', _wordStylesPass1),
        ('wstyles2.html', _wordStylesPass2),
        ('wstyles3.html', _wordStylesPass3),
    )
    for tmp_name, transform in passes:
        _wordStylesRewrite(file, os.path.join(wdir, tmp_name), transform)

    return(0)
    
def removeFonts(wdir, fname):
    """Remove font-family declarations from a stylesheet.

    Does nothing when ``options.REMOVE_FONTS`` compares equal to False.
    Otherwise drops every line of ``wdir/fname`` whose stripped text starts
    with ``font-family:`` and every line containing
    ``size:8.5in 11.0in;`` (temp file: ``remove_fonts.css``), then calls
    ``adjustCSSBody`` on the same stylesheet.

    Returns 0.
    """
    if options.REMOVE_FONTS == False:
        return(0)

    css_path = os.path.join(wdir, fname)
    output = os.path.join(wdir, 'remove_fonts.css')

    with open(css_path, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip().startswith('font-family:'):
                continue
            if 'size:8.5in 11.0in;' in line:
                continue
            outfp.write(line)

    os.replace(output, css_path)

    # adjustCSSBody joins wdir itself, so it must get the name as passed in;
    # handing it the joined path breaks when wdir is relative.
    adjustCSSBody(wdir, fname)
    return(0)
        
       
def changeBodyTag(wdir, file):
    """Drop the body's inline style and give it ``class="globals"``.

    Other body attributes are kept. A document without a ``<body>`` tag is
    re-serialised unchanged instead of raising. ``file`` is used as given
    (not joined with ``wdir``); the temp file ``body.html`` is created in
    ``wdir``.

    Returns 0.
    """
    print(' -- Change ebook text to default serif throughout')

    output = os.path.join(wdir, 'body.html')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    body = soup.body
    if body is not None:
        del body['style']
        body['class'] = 'globals'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)

def convertTags(wdir, file):
    """Run the tag converters on ``file``, one after another.

    Order: i, b, em, strong, u, s. Each converter is called as
    ``converter(wdir, file)``. Returns None.
    """
    converters = (
        convertITags,
        convertBTags,
        convertEMTags,
        convertStrongTags,
        convertUTags,
        convertSTags,
    )
    for convert in converters:
        convert(wdir, file)
    
def getHTMLDocType(file_path):
    """Detect which application produced the HTML and store it in options.

    Scans for the first ``<meta`` line naming Microsoft Word, LibreOffice
    or OpenOffice and sets ``options.DOCTYPE`` to ``'Word'``,
    ``'LibreOffice'`` or ``'OpenOffice'`` accordingly. If
    ``options.DOCTYPE`` is still the empty string afterwards it is set to
    ``'Google'``.

    Returns 0.
    """
    # (text looked for in the meta line, value stored in options.DOCTYPE)
    generators = (
        ('Microsoft Word', 'Word'),
        ('LibreOffice', 'LibreOffice'),
        ('OpenOffice', 'OpenOffice'),
    )

    with open(file_path, 'rt', encoding='utf-8') as infp:
        for line in infp:
            if not line.strip().startswith('<meta'):
                continue
            detected = next(
                (doctype for marker, doctype in generators if marker in line),
                None)
            if detected is not None:
                options.DOCTYPE = detected
                break

    if options.DOCTYPE == '':
        options.DOCTYPE = 'Google'

    return(0)
    
def removeEmptyTags(wdir, file):
    """Remove blank lines and empty paragraph/heading tags.

    Pass 1 (``remove_spaces.htm``): drops every whitespace-only line.
    Pass 2 (``remove_blank_lines.htm``): removes each p/h1-h6 tag whose
    text is empty or a single space, unless its markup contains ``<img``
    or ``<br/>``.

    ``file`` is used as given (not joined with ``wdir``); temp files are
    created in ``wdir``.

    Returns 0.
    """
    print(' -- Remove empty tags')
    print(' -- Remove tabs and spaces')

    # ---- pass 1: blank lines -------------------------------------------
    output = os.path.join(wdir, 'remove_spaces.htm')
    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        # A whitespace-only line can contain neither '<br/>' nor 'href=',
        # so the original guard on those never changed the outcome.
        outfp.writelines(line for line in infp if line.strip() != '')
    os.replace(output, file)

    # ---- pass 2: empty block tags --------------------------------------
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # 'h4' used to be ' h4' (leading space), so empty <h4> tags were kept.
    search_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    for tag in soup.find_all(search_tags):
        if tag.text == ' ' or tag.text == '':
            markup = str(tag)
            if '<img' not in markup and '<br/>' not in markup:
                tag.extract()

    output = wdir + os.sep + 'remove_blank_lines.htm'
    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return(0)
    
def removeRedundantCSS(wdir, file):
    """Drop CDATA marker lines and turn ``align`` attributes into styles.

    Lines containing ``<![CDATA[`` or ``]]`` are dropped. Every
    `` align="center|justify|left"`` (any letter case) becomes
    `` style="text-align: <value>;text-indent: 0em;"``. The file
    ``wdir/file`` is rewritten in place (temp file: ``adhoc_cleanup.css``).

    Returns 0.
    """
    file = os.path.join(wdir, file)
    output = wdir + os.sep + 'adhoc_cleanup.css'

    # Case-insensitive, so mixed-case spellings such as ALIGN="center" are
    # converted too (previously only three exact spellings were replaced).
    align_attr = re.compile(r' align="(center|justify|left)"', re.IGNORECASE)

    def to_style(match):
        # ';' between the declarations: the earlier ':' made the CSS invalid.
        return ' style="text-align: %s;text-indent: 0em;"' % match.group(1).lower()

    with open(file, 'r', encoding='utf-8') as infp, \
         open(output, 'w', encoding='utf-8') as outfp:
        for line in infp:
            if '<![CDATA[' in line or ']]' in line:
                continue
            outfp.write(align_attr.sub(to_style, line))

    os.replace(output, file)
    return(0)
    
def removeRedundantHTML(wdir, file):
    """Strip redundant or non-compliant markup from an HTML file, in place.

    Each line is passed through ``fixEncodingErrors`` and
    ``removePageBreaks``, has ``<br/>`` / ``<br />`` removed, and is then
    either dropped or cleaned:

    * dropped: ``<meta`` lines that mention neither ``generator`` nor
      ``http-equiv``; lines containing ``<!--`` or ``-->``; ``<hr`` lines
      that mention ``page-break``; lines containing ``size:8.5in 11.0in;``;
    * cleaned: `` border="0"``, the default-valued declarations
      ``letter-spacing: normal``, ``text-decoration: none`` and
      ``font-variant: normal``, and tab characters are removed.

    Blank lines are written through unchanged (the original blank-line
    test compared ``str.strip()`` with ``None`` and so never fired).

    Args:
        wdir: directory in which the temporary file ``adhoc_cleaning.html``
            is created.
        file: path of the HTML file; it is opened as given, not joined
            with ``wdir``.

    Returns:
        The ``file`` path that was passed in.
    """
    print(' -- Remove or change redundant or non-compliant html code')
    output = os.path.join(wdir, 'adhoc_cleaning.html')

    # declarations that only restate a default and can be removed
    default_decls = (
        'letter-spacing: normal',
        'text-decoration: none',
        'font-variant: normal',
    )

    # context managers close both handles even if a helper raises
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:

        # remove or change adhoc html
        for line in infp:

            line = fixEncodingErrors(line)
            line = removePageBreaks(line)
            line = line.replace('<br/>', '').replace('<br />', '')

            if '<meta' in line:
                lowered = line.lower()
                if 'generator' not in lowered and 'http-equiv' not in lowered:
                    continue

            if '<!--' in line or '-->' in line:
                continue

            if '<hr' in line and 'page-break' in line:
                continue

            line = line.replace(' border="0"', '')

            if 'size:8.5in 11.0in;' in line:
                continue

            for decl in default_decls:
                # prefer the ';'-terminated form; only when it is absent
                # is the bare declaration removed
                terminated = decl + ';'
                if terminated in line:
                    line = line.replace(terminated, '')
                elif decl in line:
                    line = line.replace(decl, '')

            line = line.replace('\t', '')

            outfp.write(line)

    # swap the cleaned copy in for the original
    os.remove(file)
    os.rename(output, file)
    return(file)
    
def fixEncodingErrors(line):
    """Repair common mojibake and entity artefacts in a line of text.

    Fixes encoding problems caused by en dashes, em dashes, curly quotes,
    ellipses etc. The replacements are applied strictly in table order,
    because later patterns rely on earlier ones having already run.

    Args:
        line: the text to repair.

    Returns:
        The repaired text.
    """
    fixes = (
        # repair mixed encoding: cp1252 to utf-8
        ('â€™', '’'),        # apostrophe
        ('â€œ', '“'),        # left double quote
        ('â€', '”'),         # right double quote
        ('Â©', '©'),         # copyright
        ('Â®', '®'),         # registered
        ('â€”', '—'),        # em dash
        ('â€“', '–'),        # en dash
        ('â„¢', '™'),        # trade mark
        ('â”', '–'),

        # latin-1 encoded in win cp1252 in utf-8 code
        ('Ã¢â‚¬Å“', '“'),
        ('Ã¢â‚¬Â', '”'),
        ('Ã¢â‚¬â„¢', '’'),

        # other encodings to utf-8
        ('Ã¢â‚¬Å“', '“'),     # left double quote
        ('¢â‚¬Â', '”'),       # right double quote
        ('Ã¢â‚¬â„¢', '’'),    # apostrophe, right single quote
        ('Ã¢â‚¬Ëœ', '‘'),     # left single quote
        ('Ã¢â‚¬“', '–'),      # en dash
        ('Ã¢â‚¬”', '—'),
        ('Ì¶', '–'),
        ('Ã”', '”'),
        ('Ã”Ã', '”'),
        # NOTE(review): this entry and the 'Â' entries further down look
        # truncated (a trailing non-printing character may have been lost);
        # as written they strip every 'Ã' / 'Â' and make the longer
        # patterns after them unreachable -- confirm against the
        # original bytes.
        ('Ã', ''),
        ('Ãƒâ€šÃ‚Â', ''),
        ('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â€žÂ¢', '’'),
        ('ÃƒÂ¢Ã¢â€šÂ¬Ã…â€œ', '“'),
        ('ÃƒÂ¢Ã¢â€šÂ¬Ã‚Â', '”'),
        ('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â‚¬Å“', '–'),
        ('ÃƒÂ¢Ã¢â€šÂ¬Ã‹Å“', '‘'),

        ('Â', ''),
        ('Â', ''),
        ('Â', '’'),

        # Raw C1 control characters left behind when cp1252 text is read
        # as latin-1. They are spelled as escapes so the patterns stay
        # visible: an empty search string would make str.replace insert
        # the replacement between every character of the line.
        ('\x92', '’'),       # right single quote / apostrophe
        ('\x97', '—'),       # em dash
        ('\x93', '“'),       # left double quote
        ('\x94', '”'),       # right double quote
        ('\x91', '‘'),       # left single quote
        ('\x85', '…'),       # ellipsis
        ('\x96', '–'),       # en dash
        ('© ', '©'),

        # entities converted to literal characters
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&ldquo;', '“'),
        ('&rdquo;', '”'),
    )

    for bad, good in fixes:
        line = line.replace(bad, good)

    return(line)

    
def removeHardBreaks(wdir, file):
    """Delete every ``<br>`` element from an HTML file, in place.

    The document is parsed with BeautifulSoup's ``html.parser``, all
    ``br`` tags are decomposed, and the result is written to the temporary
    file ``new_html.htm`` in ``wdir`` before replacing ``file``.

    Args:
        wdir: directory used for the temporary output file.
        file: path of the HTML file; opened as given.

    Returns:
        The ``file`` path that was passed in.
    """
    print(' -- Remove blank lines')
    output = wdir + os.sep + 'new_html.htm'

    # read and close the input before anything else touches the file
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # extract all <br/> tags
    for br_tag in soup.find_all('br'):
        br_tag.decompose()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(output, file)

    return(file)
    
def getImageSize(image):
    """ Uses PIL to get image dimensions

    Args:
        image: path of the image file; newline characters in the path
            are removed before it is opened.

    Returns:
        A ``(width, height)`` tuple taken from the image's ``size``.
    """
    image = image.replace('\n', '')
    # the context manager releases the underlying file handle, which a
    # bare Image.open() would otherwise keep open
    with Image.open(image) as im:
        wd, ht = im.size
    return(wd, ht)
            
    
def formatImages(wdir, line):
    """Give the first ``<img>`` in a line a percentage-width inline style.

    The image file is looked up in ``wdir`` by the base name of the tag's
    ``src`` and measured with ``getImageSize``. The tag's ``class`` is
    removed and its ``style`` is set to ``width: N%;height: auto;`` where
    N is the image width relative to a 650-pixel page, capped at 100.
    ``width``/``height`` attributes are removed only when the tag had no
    ``style`` attribute beforehand.

    Args:
        wdir: directory containing the image files.
        line: one line of markup expected to contain an ``<img>`` tag.

    Returns:
        The re-serialised line, or ``line`` unchanged when no ``<img>``
        tag with a ``src`` attribute can be parsed from it.

    Raises:
        Whatever ``getImageSize`` raises when the image cannot be opened.
    """
    soup = BeautifulSoup(line, 'html.parser')
    img = soup.find('img')

    # '<img' may appear in text the parser does not turn into a tag
    # (e.g. malformed markup), in which case find() returns None
    if img is None or not img.has_attr('src'):
        return(line)

    # get the img file name from the img path
    file_path = os.path.join(wdir, os.path.basename(img['src']))

    # width as a percentage of a 650px-wide screen, never above 100
    width, _height = getImageSize(file_path)
    perc_width = min(round(width / 650 * 100), 100)

    if img.has_attr('class'):
        del img['class']

    # explicit dimensions are only discarded when no style was present;
    # an existing style is overwritten but width/height are left alone
    if not img.has_attr('style'):
        for attr in ('width', 'height'):
            if img.has_attr(attr):
                del img[attr]

    img['style'] = 'width: ' + str(perc_width) + '%;height: auto;'

    return(str(soup))
    
def reformatBookImages(wdir, file):
    """Run ``formatImages`` over every line of a file that contains ``<img``.

    Lines are copied to the temporary file ``images.html`` in ``wdir``,
    with image lines reformatted on the way, and the temporary file then
    replaces ``file``.

    Args:
        wdir: working directory; passed on to ``formatImages`` and used
            for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    print(' -- Reformat smaller images')
    # inserts and reformats all ebook images
    outfile = wdir + os.sep + 'images.html'

    # formatImages can raise (e.g. unreadable image); the context
    # managers make sure both handles are closed in that case too
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(outfile, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if '<img' in line:
                line = formatImages(wdir, line)
            outfp.write(line)

    os.remove(file)
    os.rename(outfile, file)
    return(0)

def removePageBreaks(line):
    """Remove selected CSS page-break declarations from a line.

    Handles ``page-break-before`` (always, auto, avoid),
    ``page-break-after`` (avoid, auto) and ``page-break-inside``
    (auto, avoid), each written with or without a space after the colon.
    When a ``;``-terminated form of a declaration is present only that
    form is removed; otherwise the bare declaration is removed.

    Args:
        line: text to clean.

    Returns:
        The text with the matching declarations removed.
    """
    # (property, values) listed in the order the removals are applied
    targets = (
        ('page-break-before', ('always', 'auto', 'avoid')),
        ('page-break-after', ('avoid', 'auto')),
        ('page-break-inside', ('auto', 'avoid')),
    )

    for prop, values in targets:
        for value in values:
            # spaced variant first, then the compact one
            for colon in (': ', ':'):
                decl = prop + colon + value
                terminated = decl + ';'
                if terminated in line:
                    line = line.replace(terminated, '')
                elif decl in line:
                    line = line.replace(decl, '')

    return(line)
    
def addHTMLTail(wdir, file):
    """Append closing ``</body>`` and ``</html>`` tags to a file, in place.

    The content of ``file`` is copied to the temporary file ``tails.html``
    in ``wdir``, followed by ``\\n</body>\\n</html>\\n\\n``; the temporary
    file then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    output = wdir + os.sep + "tails.html"

    # both handles are closed before the swap, even if the copy fails
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            outfp.write(line)
        outfp.write('\n</body>\n</html>\n\n')

    os.remove(file)
    os.rename(output, file)
    return(0)
    
def prettifyXHTMLFile(wdir, file):
    """Unwrap wrapper divs and re-layout an XHTML file, in place.

    The file is rewritten three times, each time via a temporary file in
    ``wdir`` that then replaces ``file``:

    1. parse with BeautifulSoup and unwrap ``div.Section1`` elements and a
       ``div`` that is the first element after ``<body>``;
    2. line-by-line clean-up and re-spacing (entities, ``<body>`` and
       ``<style>`` tags, image ``name`` attributes, blank lines);
    3. lower-casing of selected selectors inside the style block.

    Args:
        wdir: directory used for the temporary files.
        file: path of the XHTML file; opened as given.

    Returns:
        0 on completion.
    """

    output = os.path.join(wdir, 'remove_divs.html') 
    outfp = open(output, 'wt', encoding=('utf-8'))
    html = open(file, 'rt', encoding='utf-8').read()
    
    soup = BeautifulSoup(html, 'html.parser')
    
    # remove Word's top div section
    for dtag in soup.find_all('div', class_='Section1'):
        dtag.attrs = {}
        dtag.unwrap()
            
    # remove top div sections from other html doctypes        
    # NOTE(review): soup.find('body') returns None when there is no <body>,
    # which would raise AttributeError here -- confirm callers guarantee one.
    tag = soup.find('body').next_element
    if tag.name == 'div':
        tag.attrs = {}
        tag.unwrap()          

    outfp.writelines(str(soup))
    outfp.close()
    os.remove(file)
    os.rename(output, file)    
    
    # reformat and prettify the XHTML file
    # (the temporary file is named 'final_one.css' but receives the XHTML lines)
    outfile= os.path.join(wdir, 'final_one.css')
    infp = open(file, 'rt', encoding='utf-8')
    outfp = open(outfile, 'wt', encoding='utf-8') 
    for line in infp:
        
        
        # put images on their own line
        if '<img' in line:
            line = '\n' + line
        
        if '</p><img' in line:
            line=line.replace('</p><img', '</p>\n<img')
            
        # an emptied line falls through and is skipped by the blank-line
        # check in the final else branch below
        if '<div></div>' in line:
            line = ''          

        # drop the incomplete declaration and write the line as-is,
        # bypassing the rest of the per-line processing
        if 'font-size: pt;' in line:
            line = line.replace('font-size: pt;', '')
            outfp.write(line)
            continue        
        
        # a line holding only a break tag is discarded (the write of '' is a no-op)
        if line.strip() == '<br />' or line.strip() == '<br/>':
            line = ''
            outfp.write(line)
            continue               
         
        # any line containing '<body' is replaced wholesale by a fixed body tag
        if '<body' in line:
            line = '<body style="font-family: serif; margin: 3% 3% 3% 3%;">\n\n'   
         
        # move an image's 'name' attribute to 'id'
        html = BeautifulSoup(line, 'html.parser')
        if html.img:
            if html.img.has_attr('name'):
                html.img['id'] = html.img['name'] 
                del html.img['name']
                line = str(html)                  
        
        # replace entity forms (including doubly-escaped ones) with the
        # literal replacement strings given here
        line = line.replace(r'&nbsp;', ' ')
        line = line.replace(r'&#160;', ' ')        
        line = line.replace(r'&amp;#160;', ' ')
        line = line.replace(r'&amp;#nbsp;',r'&#nbsp;') 
        line = line.replace(r"&#146;", "’")         
        line = line.replace(r"&amp;#146;", "’") 
        line = line.replace("<!--?xml version='1.0' encoding='utf-8'?-->", "")
        
        # svg/image lines are written without the stripping applied below
        if line.strip().startswith('<svg') or line.strip().startswith('<image'):
                outfp.write(line)
                continue     
    
        if '<style>' in line:
            line = '<style type="text/css">\n'                   
                
        # the first two replacements repeat the ones already applied above
        line = line.replace(r'&nbsp;', ' ')
        line = line.replace(r'&#160;', ' ')        
        line = line.replace(r'&amp;#9;', '')
        
        # document-header lines: written stripped, one per line
        if line.strip().startswith('<?xml') or \
            line.strip().startswith('<!DOCTYPE') or \
            line.strip().startswith('<html') or \
            line.strip().startswith('<head>') or \
            line.strip().startswith('<meta')or \
            line.strip().startswith('<title>') or \
            line.strip().startswith('<link') or \
            line.strip().startswith('</head>') or \
            line.strip().startswith('<body') or \
            line.strip().startswith('<body>'):
            line = line.strip()
            if not line:
                continue
            # indent the elements that live inside <head>
            if line.startswith('<meta') or \
                line.startswith('<title>') or \
                line.startswith('<link'):
                line = '  ' + line      
            if line.startswith('<body'):
                line = '\n' + line
            # NOTE(review): no prefix accepted by the enclosing condition
            # starts with '</body>', so this branch looks unreachable -- confirm.
            if line.startswith('</body>'):
                outfp.write('\n' + line.rstrip() + '\n')
            else:
                outfp.write(line.rstrip() + '\n')    
        else:
            # all other lines: drop blank and lone-';' lines, indent
            # paragraphs, and surround each line with newlines
            line = line.strip() 
            if not line or line == ';':
                continue
            if line.startswith('<p'):
                line = '  ' + line            
            outfp.write('\n' + line + '\n')    
            
    infp.close()
    outfp.close()
    os.remove(file)
    os.rename(outfile, file)
    
    # third pass: normalise selector case inside the style block
    outfile= os.path.join(wdir, 'styles.css')
    infp = open(file, 'rt', encoding='utf-8')
    outfp = open(outfile, 'wt', encoding='utf-8') 
    for line in infp: 
        if '<style>' in line or '<style type="text/css">' in line:
            outfp.write(line) 
        
            # the inner loop advances the same file iterator, consuming
            # the style block up to and including the '</style>' line
            for line in infp:
                if line == '\n':
                    continue                 
                
                if '</style>' in line:
                    outfp.write(line.strip() + '\n')
                    break
                else:
                    line = line.replace('P {', 'p {')
                    line = line.replace('H1', 'h1')
                    line = line.replace('H2', 'h2')
                    line = line.replace('H3', 'h3')
                    line = line.replace('H4', 'h4')
                    line = line.replace('H5', 'h5')
                    line = line.replace('H6', 'h6')
                    line = line.replace('P.', 'p.')
                    # the 'Hn.' forms below were already lower-cased by
                    # the 'Hn' replacements above, so they match nothing
                    line = line.replace('H1.', 'h1.')
                    line = line.replace('H2.', 'h2.')
                    line = line.replace('H3.', 'h3.')
                    line = line.replace('H4.', 'h4.')
                    line = line.replace('H5.', 'h5.')
                    line = line.replace('H6.', 'h6.')
                    line = line.replace('A:link', 'a:link')
                    line = line.replace('DIV', 'div')
                    line = line.replace('SPAN', 'span')
                    outfp.write(line.strip() + '\n')
                    continue                    
        else: 
            outfp.write(line)
        
    infp.close()
    outfp.close()
    
    # NOTE(review): addHTMLTail rewrites `file`, but `file` is removed and
    # replaced by `outfile` immediately afterwards, so the appended tail does
    # not appear to reach the final output -- confirm the intended order.
    if options.FILE_TYPE == 'EPUB':
        addHTMLTail(wdir, file)
    
    os.remove(file)
    os.rename(outfile, file)
    return(0)         
    
def convertITags(wdir, file):
    """Turn ``<i>`` elements inside paragraphs into italic ``<span>`` elements.

    Every ``<i>`` found within a ``<p>`` is renamed to ``span`` and given
    ``style="font-style: italic;"`` (replacing any style it had). The
    result is written to the temporary file ``new_html.htm`` in ``wdir``,
    which then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    output = os.path.join(wdir, 'new_html.htm')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for ptag in soup.find_all('p'):
        # Visit every <i> in the paragraph, not just the first one; the
        # list from find_all is built up front, so renaming tags while
        # looping over it is safe.
        for inline in ptag.find_all('i'):
            inline['style'] = 'font-style: italic;'
            inline.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(output, file)
    return(0)
    
def convertBTags(wdir, file):
    """Turn ``<b>`` elements inside paragraphs into bold ``<span>`` elements.

    Every ``<b>`` found within a ``<p>`` is renamed to ``span`` and given
    ``style="font-weight: bold;"`` (replacing any style it had). The
    result is written to the temporary file ``new_html.htm`` in ``wdir``,
    which then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    output = os.path.join(wdir, 'new_html.htm')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for ptag in soup.find_all('p'):
        # Visit every <b> in the paragraph, not just the first one; the
        # list from find_all is built up front, so renaming tags while
        # looping over it is safe.
        for inline in ptag.find_all('b'):
            inline['style'] = 'font-weight: bold;'
            inline.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(output, file)
    return(0)
                
def convertEMTags(wdir, file):
    """Turn ``<em>`` elements inside paragraphs into italic ``<span>`` elements.

    Every ``<em>`` found within a ``<p>`` is renamed to ``span`` and given
    ``style="font-style: italic;"`` (replacing any style it had). The
    result is written to the temporary file ``new_html.htm`` in ``wdir``,
    which then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    output = os.path.join(wdir, 'new_html.htm')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for ptag in soup.find_all('p'):
        # Visit every <em> in the paragraph, not just the first one; the
        # list from find_all is built up front, so renaming tags while
        # looping over it is safe.
        for inline in ptag.find_all('em'):
            inline['style'] = 'font-style: italic;'
            inline.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(output, file)
    return(0)
    
def convertStrongTags(wdir, file):
    """Turn ``<strong>`` elements inside paragraphs into bold ``<span>`` elements.

    Every ``<strong>`` found within a ``<p>`` is renamed to ``span`` and
    given ``style="font-weight: bold;"`` (replacing any style it had). The
    result is written to the temporary file ``new_html.htm`` in ``wdir``,
    which then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    output = os.path.join(wdir, 'new_html.htm')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for ptag in soup.find_all('p'):
        # Visit every <strong> in the paragraph, not just the first one;
        # the list from find_all is built up front, so renaming tags while
        # looping over it is safe.
        for inline in ptag.find_all('strong'):
            inline['style'] = 'font-weight: bold;'
            inline.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(output, file)
    return(0)
    
def convertSTags(wdir, file):
    """Turn ``<s>`` elements inside paragraphs into struck-through ``<span>`` elements.

    Every ``<s>`` found within a ``<p>`` is renamed to ``span`` and given
    ``style="text-decoration: line-through;"`` (replacing any style it
    had). The result is written to the temporary file ``new_html.htm`` in
    ``wdir``, which then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    output = os.path.join(wdir, 'new_html.htm')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for ptag in soup.find_all('p'):
        # Visit every <s> in the paragraph, not just the first one; the
        # list from find_all is built up front, so renaming tags while
        # looping over it is safe.
        for inline in ptag.find_all('s'):
            inline['style'] = 'text-decoration: line-through;'
            inline.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(output, file)
    return(0)
    
def convertUTags(wdir, file):
    """Turn ``<u>`` elements inside paragraphs into underlined ``<span>`` elements.

    Every ``<u>`` found within a ``<p>`` is renamed to ``span`` and given
    ``style="text-decoration: underline;"`` (replacing any style it had).
    The result is written to the temporary file ``new_html.htm`` in
    ``wdir``, which then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    output = os.path.join(wdir, 'new_html.htm')

    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for ptag in soup.find_all('p'):
        # Visit every <u> in the paragraph, not just the first one; the
        # list from find_all is built up front, so renaming tags while
        # looping over it is safe.
        for inline in ptag.find_all('u'):
            inline['style'] = 'text-decoration: underline;'
            inline.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(output, file)
    return(0)

def insertGoogleMetadata(wdir, file):
    """Insert a Google Doc generator ``<meta>`` tag after ``<head>``, in place.

    Any line containing ``<head>`` is replaced in full by ``<head>``
    followed by the generator meta line; all other lines are copied
    unchanged. The copy goes through the temporary file ``Abi_meta.html``
    in ``wdir``, which then replaces ``file``.

    Args:
        wdir: directory used for the temporary file.
        file: path of the HTML file; opened as given.

    Returns:
        0 on completion.
    """
    print(' >> In Google insert metadata...!!\n')
    tmp_path = os.path.join(wdir, 'Abi_meta.html')
    head_with_meta = '<head>\n  <meta name="Generator" content="Google Doc HTML" />\n'

    with open(tmp_path, 'wt', encoding='utf-8') as dst, \
            open(file, 'rt', encoding='utf-8') as src:
        dst.writelines(
            head_with_meta if '<head>' in text.strip() else text
            for text in src
        )

    # swap the annotated copy in for the original
    os.remove(file)
    os.rename(tmp_path, file)
    return(0)
    

    