#!/Python3/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals, division, absolute_import, print_function

import os, os.path, sys, codecs, shutil, inspect, chardet, re, time
from decimal import *
from cleanup_utils import *
from PIL import Image
import options
from doc_tidy import * 
import tkinter as tk
import tkinter.messagebox as mbox

__all__=["cleanHTML", "removeHTMLEntities", "checkStyles", "checkHTMLFileName", "copyTextFiles2Dir", "copyCSSFiles2Dir", "copyImageFiles2Dir", "writeFiles2CSS", "writeFiles2Epub", "removeLineHeight", "extraHTMLCleanup", "sanitizeHTML", "removeBadAttributesfromCSS", "setGlobalCSSValues", "cssFinalFormat", "convertFile2UTF8", "checkFileEncoding", "fileNotLoadedError", "addHTMLPresets", "prettifyCSS", "cleanExit", "repairHTMLIDs", "CheckandRepairID", "fixEncodingErrors"]

try:

    from sigil_bs4 import BeautifulSoup, Comment
except:
    from bs4 import BeautifulSoup, Comment  
    
    
def cleanHTML(wdir, fname):
    """ Runs the automatic clean-up steps on the 
        imported html file, one after the other, 
        in a fixed order. Returns 0, also when 
        options.SYS_EXIT is set after the doctype 
        lookup (no clean-up step is run then).
    """

    print('\n -- Processing automatic tasks...')
    print(' -- Clean and reformat the html')

    # check/change html file encoding as necessary
    convertFile2UTF8(wdir, fname, checkFileEncoding(wdir, fname))

    # get the imported html doctype
    getHTMLDocType(fname)
    if options.SYS_EXIT == True:
        return 0

    print(' >>> Doctype is: ' + options.DOCTYPE)
    print('\n -- In cleanHTML processing file...' + fname)
    file = os.path.join(wdir, os.path.basename(fname))

    # first pass: markup clean-up
    markup_steps = (
        removeHardBreaks,
        removeEmptyTags,
        docTidyNoWrap,
        prettifyXHTMLFile,
        removeHTMLEntities,
        removeAlignAttr,
        convertName2IDAttr,
        sanitizeHTML,
        removeFontTags,
        removeRedundantHTML,
        extraHTMLCleanup,
        removeAttributes,
        removeLangAttrs,
        convertTags,
        fixHTMLAttrValues,
        addDOCTYPEHeader,
        prettifyXHTMLFile,
    )

    # second pass: style clean-up followed by a final tidy
    style_steps = (
        reformatWordStyles,
        reformatOpenDocStyles,
        reformatGoogleStyles,
        removeStyles,
        removeFontStyles,
        docTidyNoWrap,
        addDOCTYPEHeader,
        removeHardBreaks,
        removeEmptyTags,
        removeLangAttrs,
        prettifyXHTMLFile,
    )

    # every step takes the work dir and the file path
    for step in markup_steps + style_steps:
        step(wdir, file)

    return 0
    
def removeHTMLEntities(wdir, file):
    """ Strips non-breaking space entities from the 
        html file and converts the &#146; entity to 
        a right single quote. The file is rewritten 
        in place via a temporary file in wdir.
        
        Lines that are blank, or that hold nothing 
        but a single non-breaking space entity, are 
        dropped. Returns 0.
    """

    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'remove_entities.html')

    # a line consisting only of one of these is dropped completely
    lone_entities = ('&nbsp;', '&amp;nbsp;', '&#160;', '&amp;#160;')

    # removed wherever they occur inside a line; '&amp;nbsp;' was 
    # previously only handled as a whole line, '&amp;#nbsp;' is kept 
    # for backward compatibility
    inline_entities = ('&nbsp;', '&amp;nbsp;', '&#160;', '&amp;#160;', '&amp;#nbsp;')

    # 'with' guarantees that both handles are closed on errors too
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            stripped = line.strip()
            if stripped == '' or stripped in lone_entities:
                continue

            for entity in inline_entities:
                line = line.replace(entity, '')

            line = line.replace('&#146;', '’')
            line = line.replace('&amp;#146;', '’')

            outfp.write(line)

    # os.replace overwrites the target on every platform
    os.replace(output, file)
    return 0
    
def checkStyles(wdir, fnames):
    """ Scans the first file in fnames for an opening 
        style tag ('<style>' or '<style type="text/css">') 
        and sets options.HTML_STYLES to True when one 
        is found. Returns 0.
    """

    style_markers = ('<style>', '<style type="text/css">')
    path = os.path.join(wdir, fnames[0])

    with open(path, 'rt', encoding='utf-8') as fp:
        # any() stops reading at the first matching line
        if any(marker in line for line in fp for marker in style_markers):
            options.HTML_STYLES = True

    return 0

def checkHTMLFileName(bk, wdir, files):
    """ Replaces the spaces in the name of the first 
        file in the list with underscores. The file is 
        renamed inside wdir and files[0] is updated to 
        the new name, so the returned list always refers 
        to a file that exists. 
        
        bk is not used. Returns the (mutated) files list.
    """

    if not files:
        return files

    fname = files[0]
    if ' ' in fname:
        renamed = fname.replace(' ', '_')
        os.rename(os.path.join(wdir, fname), os.path.join(wdir, renamed))
        # hand the new name back; the old one no longer exists on disk
        files[0] = renamed

    return files
    
def copyTextFiles2Dir(bk, wdir):
    """ Writes every text file yielded by bk.text_iter() 
        into wdir under its base name, after passing the 
        content through BeautifulSoup's html.parser. 
        
        Files whose target path contains 'cover.xhtml', 
        'cover.html' or 'cover.htm' are skipped. Returns 
        a tuple (ids, file names) of the files written.
    """

    # read the whole manifest first, then copy
    entries = [(item_id, os.path.basename(href)) for (item_id, href) in bk.text_iter()]

    copied_ids = []
    copied_names = []
    cover_markers = ('cover.xhtml', 'cover.html', 'cover.htm')

    # copy all xhtml files to the working dir
    for item_id, name in entries:
        target = os.path.join(wdir, name)

        # the check is made on the full target path
        if any(marker in target for marker in cover_markers):
            continue

        print(' -- Copy to work dir...' + name)
        with open(target, 'wt', encoding='utf-8') as outfp:
            markup = BeautifulSoup(bk.readfile(item_id), 'html.parser')
            copied_names.append(name)
            copied_ids.append(item_id)
            outfp.write(str(markup))

    return (copied_ids, copied_names)
                
def copyCSSFiles2Dir(bk, wdir):
    """ Writes every stylesheet yielded by bk.css_iter() 
        into wdir under its base name. Returns a tuple 
        (ids, file names) covering all stylesheets.
    """

    sheet_ids = []
    sheet_names = []
    for (sheet_id, href) in bk.css_iter():
        sheet_ids.append(sheet_id)
        sheet_names.append(os.path.basename(href))

    for sheet_id, name in zip(sheet_ids, sheet_names):
        target = os.path.join(wdir, name)
        print(' -- Write to work dir...' + target)
        with open(target, 'wt', encoding='utf-8') as outfp:
            # NOTE(review): the css text is round-tripped through the 
            # html parser before it is written - presumably only to 
            # normalise it to str; confirm that it cannot alter the css
            parsed = BeautifulSoup(bk.readfile(sheet_id), 'html.parser')
            outfp.write(str(parsed))

    return (sheet_ids, sheet_names)

def copyImageFiles2Dir(bk, wdir):
    """ Writes every image yielded by bk.image_iter() 
        into wdir under its base name, as raw bytes. 
        Returns a tuple (ids, file names).
    """

    image_ids = []
    image_names = []
    for (image_id, href, _mime) in bk.image_iter():
        image_ids.append(image_id)
        image_names.append(os.path.basename(href))

    for image_id, name in zip(image_ids, image_names):
        target = os.path.join(wdir, name)
        print(' -- Copy images to work dir...' + target)
        with open(target, 'wb') as outfp:
            outfp.write(bk.readfile(image_id))

    return (image_ids, image_names)
    
def removeLineHeight(line):
    """ Removes the common fixed line-height 
        declarations (100% to 120%, 12pt, 14pt 
        and 1.2em) from a line of html or css. 
        
        A line that contains 'line-height: normal' 
        or 'line-height:normal' is returned untouched. 
        Returns the cleaned line.
    """

    if 'line-height:' not in line:
        return line
    if 'line-height: normal' in line or 'line-height:normal' in line:
        return line

    values = [str(percent) + '%' for percent in range(100, 121)]
    values.extend(['12pt', '14pt', '1.2em'])

    for value in values:
        for prefix in ('line-height: ', 'line-height:'):
            # remove the form with the trailing ';' first, otherwise 
            # the bare form matches and leaves a stray ';' behind
            line = line.replace(prefix + value + ';', '')
            line = line.replace(prefix + value, '')

    return line
    
def extraHTMLCleanup(wdir, file):
    """ Extra and more extensive cleanup is 
        required to remove or change all the 
        unneeded proprietary data from the 
        imported html doc.       
        
        The file is processed line by line and 
        rewritten in place via a temporary file 
        in wdir. file is used as given (it is not 
        joined with wdir). Returns 0.
    """

    output = os.path.join(wdir, 'adhoc_cleaning.html')

    # inline declarations that are dropped; when the declaration is 
    # present with a trailing ';' only that form is removed, so that 
    # no stray ';' is left behind
    unwanted_declarations = (
        'direction: inherit',
        'position: absolute',
        'letter-spacing: normal',
        'text-decoration: none',
        'font-variant: normal',
    )

    # 'with' guarantees that both handles are closed on errors too
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:

        # remove or change adhoc html
        for line in infp:

            stripped = line.strip()
            if stripped.startswith('/*'):
                # single line comment
                if stripped.endswith('*/'):
                    continue

                # multi-line comment: consume the shared iterator up 
                # to the closing line, which is blanked
                for line in infp:
                    if line.strip().endswith('*/'):
                        line = ''
                        break

            if '<img align="left"' in line.lower():
                line = line.replace('<img align="left"', '<img ')
                line = line.replace('<img align="LEFT"', '<img ')
                line = line.replace('<img ALIGN="LEFT"', '<img ')

            # fixes malformed, standalone image tag lines in the html
            if line.strip().startswith('<img '):
                line = line.strip().replace('<img ', '<p style="text-align: center;"><img ')
                line = '\n' + line + '</p>' + '\n'

            # only the generator and content-type meta tags survive
            if '<meta' in line:
                lowered = line.lower()
                if 'name="generator"' in lowered or 'http-equiv="content-type"' in lowered:
                    outfp.write(line)
                continue

            line = line.replace(' transform:', '')
            line = line.replace(' rotate(0.00rad)', '')
            line = line.replace(' translateZ(0px);', '')
            line = line.replace('-webkit-transform:', '')
            line = removeLineHeight(line)

            if 'font-family: "Times New Roman", serif' in line:
                if 'font-family: "Times New Roman", serif;"' in line:
                    line = line.replace('font-family:"Times New Roman", serif;', '')
                    line = line.replace('font-family: "Times New Roman", serif;', '')
                else:
                    line = line.replace('font-family:"Times New Roman", serif', '')
                    line = line.replace('font-family: "Times New Roman", serif', '')

            for declaration in unwanted_declarations:
                if declaration in line:
                    if declaration + ';' in line:
                        line = line.replace(declaration + ';', '')
                    else:
                        line = line.replace(declaration, '')

            # give superscripts a fixed inline style; the line is 
            # only re-serialised when it holds such a tag
            soup = BeautifulSoup(line, 'html.parser')
            sup_tags = soup.find_all('sup')
            for sup in sup_tags:
                if sup.has_attr('class'):
                    del sup['class']
                sup['style'] = 'font-size: 0.8em;line-height: normal;vertical-align: top;'
            if sup_tags:
                line = str(soup)

            # the same for subscripts
            soup = BeautifulSoup(line, 'html.parser')
            sub_tags = soup.find_all('sub')
            for sub in sub_tags:
                if sub.has_attr('class'):
                    del sub['class']
                sub['style'] = 'font-size: 0.65em;line-height: normal;vertical-align: bottom;'
            if sub_tags:
                line = str(soup)

            # fix <span> subscript problems
            soup = BeautifulSoup(line, 'html.parser')
            for span in soup.find_all('span'):
                if 'vertical-align:sub' in str(span).replace(' ', ''):
                    span['style'] = 'font-size: 0.65em;line-height: normal;vertical-align: bottom;'
                    line = str(soup)

            # left over from a removed declaration
            if line.strip() == ';':
                continue

            line = line.replace('\t', '')
            outfp.write(line)

    # os.replace overwrites the target on every platform
    os.replace(output, file)
    return 0
    
def _appendInlineStyle(tag, declaration):
    """ Appends a css declaration to the inline 
        style of the tag. A ';' separator is added 
        when the existing style does not end with 
        one; the style attribute is created when 
        the tag has none.
    """
    if tag.has_attr('style'):
        current = str(tag['style'])
        separator = '' if current.endswith(';') else ';'
        tag['style'] = current + separator + declaration
    else:
        tag['style'] = declaration


def sanitizeHTML(wdir, file):
    """ Removes all unnecessary proprietary 
        tags or attributes from the html. 
        
        The file is parsed with BeautifulSoup, 
        cleaned and rewritten in place via a 
        temporary file in wdir. file is used as 
        given (it is not joined with wdir). 
        Returns the file path.
    """

    output = os.path.join(wdir, 'new_html.htm')

    # read the input before any output file is created
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    block_tags = ['p', 'h1', 'h2', 'h3', 'h4']

    # convert an own align="center" attribute to an inline style
    for tag in soup.find_all(block_tags):
        if tag.has_attr('align') and 'align="center"' in str(tag).lower():
            del tag['align']
            _appendInlineStyle(tag, 'text-align: center;')

    # move the size of the first <font> in a paragraph to the style of 
    # the paragraph; a font tag without a size is left alone, so that 
    # no empty or stale 'font-size: pt;' value is written
    if options.DOCTYPE == 'OpenOffice' or options.DOCTYPE == 'LibreOffice':
        for ptag in soup.find_all('p'):
            font = ptag.font
            if font is not None and font.has_attr('size'):
                pt_size = getPointSize(str(font['size']))
                font.attrs = {}
                font.unwrap()
                _appendInlineStyle(ptag, 'font-size: ' + pt_size + 'pt;')

    # remove the typeface from the remaining font tags
    for font_tag in soup.find_all('font'):
        del font_tag['face']

    # ensures epub image compliance; deleting an attribute that is 
    # not present is a no-op
    for img in soup.find_all('img'):
        del img['border']
        del img['clear']
        del img['align']
        del img['title']
        if img.has_attr('name'):
            img['id'] = img['name']
            del img['name']

    # NOTE(review): the second positional argument of find_all() is the 
    # attrs filter, so this selects <p class="span"> and not all p and 
    # span tags. Left as it is: a wider search would strip align="left" 
    # before it can be converted to a style further down - confirm the 
    # intent before changing it.
    for c in soup.find_all('p', 'span'):
        if 'align="left"' in str(c).lower():
            del c['align']

    # remove unnecessary text decoration
    # NOTE(review): same find_all('p', 'span') selector as above
    for s in soup.find_all('p', 'span'):
        if s.has_attr('style'):
            if 'text-decoration: none' in str(s):
                s['style'] = str(s['style']).replace('text-decoration: none', '')

    # remove unnecessary page-breaks, with or without the trailing ';'
    for s in soup.find_all('p'):
        if s.has_attr('style'):
            style = str(s['style'])
            style = style.replace('page-break-before: always;', '')
            style = style.replace('page-break-before: always', '')
            s['style'] = style

    # remove 'background: transparent'
    # NOTE(review): extract() removes the span together with its 
    # content - confirm that unwrap() was not intended here
    for j in soup.find_all('span'):
        if 'background: transparent' in str(j):
            j.attrs = {}
            j.extract()

    # add "alt" to img tags
    for f in soup.find_all('img'):
        if not f.has_attr('alt'):
            f['alt'] = ""

    # convert align=center/justify/left to text-align inline 'style' 
    # attributes, one full pass per value; only tags that already 
    # carry a class or a style attribute are converted
    for value in ('center', 'justify', 'left'):
        marker = 'align="' + value + '"'
        for t in soup.find_all(block_tags):
            if marker not in str(t).lower():
                continue
            if t.has_attr('class') or t.has_attr('style'):
                del t['align']
                _appendInlineStyle(t, 'text-align: ' + value + ';')

    # remove empty h1 tags
    # NOTE(review): find_all('p', 'h1') selects <p class="h1">, see above
    for h in soup.find_all('p', 'h1'):
        if h.get_text() == '' or h.get_text() == ' ' or h.get_text() == None:
            if '<img' not in str(h):
                h.attrs = {}
                h.extract()

    # remove any p tags with just space
    for x in soup.find_all('p'):
        if x.string == ' ':
            if '<img' not in str(x):
                x.decompose()

    # remove all empty heading tags
    search_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    for h in soup.find_all(search_tags):
        if h.string == '':
            if '<img' not in str(h):
                h.attrs = {}
                h.extract()

    # unwrap spans that are left with an empty style and have no class
    for s in soup.find_all('span'):
        if s.has_attr('style'):
            if not s.has_attr('class'):
                if s['style'] == '':
                    del s['style']
                    s.unwrap()

    for p in soup.find_all('a'):
        if p.a and '<span>' in str(p):
            p.span.unwrap()

    # NOTE(review): in the next three loops the extra positional 
    # arguments are taken by find_all() as attrs, recursive, string 
    # and limit - they do not search for several tag names, so the 
    # loops match next to nothing. They are kept unchanged on purpose: 
    # with a list of names the unwrap() calls below would strip the 
    # block tags themselves. Confirm the intent before changing them.
    for tag in soup.find_all('p', 'h1', 'h2', 'h3', 'h4'):
        if tag.span and tag.span.has_attr('lang'):
            del tag.span['lang']
            tag.span.unwrap()
        if tag.span and tag.span.has_attr('xml:lang'):
            del tag.span['xml:lang']
            tag.span.unwrap()

    for g in soup.find_all('p', 'h1', 'h2', 'h3', 'h4'):
        if 'xml:lang' in str(g):
            del g['xml:lang']
            g.unwrap()

    for t in soup.find_all('p', 'h1', 'h2', 'h3', 'h4'):
        if 'lang' in str(t):
            del t['lang']
            t.unwrap()

    for t in soup.find_all('br'):
        if t.has_attr('style'):
            if len(t['style']) == 0:
                del t['style']

    # a bare <span> somewhere in the paragraph: unwrap the first span
    for s in soup.find_all(['p']):
        if '<span>' in str(s):
            s.span.unwrap()

    for d in soup.find_all(['span']):
        if '<span>' in str(d):
            d.unwrap()

    for useless in soup.find_all("span", text=lambda text: not text):
        useless.unwrap()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # os.replace overwrites the target on every platform
    os.replace(output, file)
    return file
    
def removeBadAttributesfromCSS(wdir, file):
    """ Filters unwanted style properties out of 
        the css file, line by line, and rewrites the 
        file in place via a temporary file in wdir.
        
        The 'body  {' block is copied unchanged up to 
        and including its closing '}'. Everywhere else 
        CDATA markers, mso-style, line-height (other 
        than 'normal'), positioning, transform, 
        direction, widows and orphans lines are dropped, 
        'pc;' units become 'em;', the numeric font 
        weights 700/400 become bold/normal and 
        'name:value' declarations get a space after 
        the colon. Returns 0.
    """
    print(' -- Remove unwanted style properties from the CSS')

    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'removed_fonts.css')

    # matched with all spaces removed from the line
    unwanted_compact = (
        'position:absolute',
        'font-variant:normal',
        'text-decoration:none',
        'letter-spacing:normal',
        'vertical-align:normal',
    )
    # matched against the line as it is
    unwanted_plain = (
        'transform',
        'so-language:',
        '-webkit-transform',
        'direction:',
        'widows:',
        'orphans:',
    )

    # 'with' guarantees that both handles are closed on errors too
    with open(file, 'r', encoding='utf-8') as infp, \
            open(output, 'w', encoding='utf-8') as outfp:
        for line in infp:
            line = line.replace('-western', '')

            if '/*<![CDATA[*/' in line or '/*]]>*/' in line:
                continue

            if 'body  {' in line:
                # copy the body block verbatim; it ends at the first 
                # line holding a '}' (a css block is never closed by 
                # a ']'). The closing line is written once only.
                outfp.write(line)
                if '}' not in line:
                    for line in infp:
                        outfp.write(line)
                        if '}' in line:
                            break
                continue

            if 'mso-style-link:' in line or 'mso-style-name' in line:
                continue

            if 'line-height:' in line and 'line-height: normal' not in line:
                continue

            line = line.replace('pc;', 'em;')

            if 'font-weight:700' in line.replace(' ', ''):
                line = line.replace('700', 'bold')

            if 'font-weight:400' in line.replace(' ', ''):
                line = line.replace('400', 'normal')

            compact = line.replace(' ', '')
            if any(item in compact for item in unwanted_compact) or \
                    any(item in line for item in unwanted_plain) or \
                    line == ';\n':
                continue

            # put a space after the colon of a declaration; selector 
            # lines (ending in '{') are skipped so that pseudo-classes 
            # like 'a:hover' stay valid
            is_selector = line.rstrip().endswith('{')
            if not is_selector and 'a:link' not in line and 'a:visited' not in line:
                if ':' in line and ': ' not in line:
                    line = line.replace(':', ': ')

            outfp.write(line)

    # os.replace overwrites the target on every platform
    os.replace(output, file)
    return 0
   
   
def setGlobalCSSValues(wdir, file):
    """ The p tag, rather like the "Normal" style 
        in Word, is inherited by all styles. So by adding 
        attributes to this style I am deliberately setting 
        default attribute values for all paragraph styles in 
        the CSS. This is especially useful for avoiding LITB 
        problems (due to the Kindle overrides) after KDP upload.        
        
        Existing 'body', 'p' and 'h1, h2, h3, h4, h5, h6' 
        blocks are erased first; new 'p' and heading presets 
        are then appended to the end of the file. The file 
        is rewritten in place via a temporary file in wdir. 
        Returns 0.
    """

    # join once only: a second join doubles a relative work dir
    file = os.path.join(wdir, file)
    outfile = os.path.join(wdir, 'presets.css')

    # exact header lines of the blocks that are erased
    erased_headers = (
        'body  {\n',
        'p  {\n',
        'h1, h2, h3, h4, h5, h6  {\n',
    )

    presets = (
        'p  {\n'
        'font-family: serif;\n'
        'text-align: justify;\n'
        'font-size: 100%;\n'
        'text-indent: 0em;\n'
        'font-weight: normal;\n'
        'font-style: normal;\n'
        '}\n'
        'h1, h2, h3, h4, h5, h6  {\n'
        'text-indent: 0em;\n'
        '}\n'
    )

    # 'with' guarantees that both handles are closed on errors too
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(outfile, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line in erased_headers:
                # skip the block up to and including its closing line
                for line in infp:
                    if '}\n' in line:
                        break
                continue
            outfp.write(line)

        # add presets to the CSS
        outfp.write(presets)

    # os.replace overwrites the target on every platform
    os.replace(outfile, file)
    return 0
        
def _skipCSSBlock(lines, current):
    """ Advances the shared line iterator up to and 
        including the first line that holds a '}' and 
        returns that line with its closing brace and 
        line end removed. When the iterator is already 
        exhausted, current itself is used.
    """
    for current in lines:
        if '}' in current:
            break
    return current.replace('}\n', '')


def cssFinalFormat(wdir, file):
    """ Gives the css file its final line format and 
        hands it to prettifyCSS(). 
        
        The Section1 / .PapDefault / @mediaprint blocks, 
        'page: Section1;' lines, blank lines and comments 
        are dropped, upper case tag names in selectors are 
        converted to lower case, lines without a terminator 
        get a ';' and every line is written stripped. The 
        file is rewritten in place via a temporary file in 
        wdir. Returns 0.
    """

    infile = os.path.join(wdir, file)
    outfile = os.path.join(wdir, 'last.css')

    dropped_blocks = ('@pageSection1', 'div.Section1', '.PapDefault')

    # convert any styling to lower case
    lower_case = (
        ('P.', 'p.'),
        ('H1.', 'h1.'), ('H2.', 'h2.'), ('H3.', 'h3.'),
        ('H4.', 'h4.'), ('H5.', 'h5.'), ('H6.', 'h6.'),
        ('H1', 'h1'), ('H2', 'h2'), ('H3', 'h3'),
        ('H4', 'h4'), ('H5', 'h5'), ('H6', 'h6'),
        ('DIV.', 'div.'),
        ('SPAN.', 'span.'),
    )

    # 'with' guarantees that both handles are closed on errors too
    with open(infile, 'r', encoding='utf-8') as infp, \
            open(outfile, 'w', encoding='utf-8') as outfp:
        for line in infp:

            if '{\n' in line:
                line = line.replace('{', '  {')
                line = line.replace(',', ', ')

            if any(name in line for name in dropped_blocks):
                line = _skipCSSBlock(infp, line)

            if 'page: Section1;' in line:
                continue

            # NOTE(review): the skip stops at the first '}', which for 
            # a media block with nested rules is the end of the first 
            # inner rule - confirm that this is sufficient
            if '@mediaprint,' in line:
                line = _skipCSSBlock(infp, line)

            for upper, lower in lower_case:
                line = line.replace(upper, lower)

            stripped = line.strip()
            if stripped == '':
                continue

            if stripped.startswith('/*'):
                # a multi-line comment is consumed up to its closing 
                # line; nothing is written for a comment, so no stray 
                # ';' line is produced for it
                if not stripped.endswith('*/'):
                    for line in infp:
                        if line.strip().endswith('*/'):
                            break
                continue

            # terminate a declaration that has no ';'
            if not stripped.endswith(';') and \
                    '{' not in line and \
                    '}' not in line and \
                    ',' not in line:
                line = stripped + ';\n'

            line = line.lstrip()
            if line.endswith(';}\n'):
                line = line.replace(';}\n', ';\n}\n')

            outfp.write(line.strip() + '\n')

    # os.replace overwrites the target on every platform
    os.replace(outfile, infile)
    prettifyCSS(wdir, file)
    return 0

    
def convertFile2UTF8(wdir, file, encoder):
    """ Converts input file to utf-8 format
    
        The file is read with the given encoder, passed 
        through BeautifulSoup's html.parser and rewritten 
        in place as utf-8 via a temporary file in wdir. 
        Returns the full path of the converted file.
    """
    print(' -- Convert input file to utf-8 if required\n')

    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'fix_encoding.htm')

    # read first: when decoding fails no output file is left behind 
    # and the input handle is closed in any case
    with open(file, 'rt', encoding=encoder) as infp:
        html = infp.read()

    # safely convert to unicode utf-8 using bs4
    soup = BeautifulSoup(html, 'html.parser')
    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # os.replace overwrites the target on every platform
    os.replace(output, file)

    return file
    
def checkFileEncoding(wdir, file):
    """ Works out which encoding should be used to read an html file.

        Two guesses are combined:
          * the charset declared in the first 2048 characters of the file
            (falling back to sys.getdefaultencoding() when none is found)
          * the encoding chardet detects from the first 2048 bytes

        chardet wins when it reports a 'utf' encoding or agrees with the
        declared charset; otherwise the declared charset wins. The file is
        then read and re-written with the chosen encoding as a final test.

        wdir -- working directory holding the file
        file -- file name, joined onto wdir

        Returns the chosen encoding name. If the file cannot be read with
        it, an error box is shown, wdir is deleted and the process exits
        via sys.exit(0).
    """
    # Declared charsets and the matching Python codec names. Matching is by
    # substring and the first hit wins, so the order must be preserved.
    charset_markers = (
        ('charset=windows-1252', 'cp1252'),
        ('charset=windows-1250', 'cp1250'),
        ('charset=windows-1253', 'cp1253'),
        ('charset=windows-1254', 'cp1254'),
        ('charset=windows-1251', 'cp1251'),
        ('charset=windows-1255', 'cp1255'),
        ('charset=windows-1256', 'cp1256'),
        ('charset=windows-1257', 'cp1257'),
        ('charset=us-ascii', 'us-ascii'),
        ('charset=ibm437', 'cp437'),
        ('charset=ibm850', 'cp850'),
        ('charset=ibm852', 'cp852'),
        ('charset=ibm855', 'cp855'),
        ('charset=iso-8859-1', 'iso-8859-1'),
        ('charset=iso-8859-2', 'iso-8859-2'),
        ('charset=iso-8859-4', 'iso-8859-4'),
        ('charset=utf-8', 'utf-8'),
    )

    file = os.path.join(wdir, file)

    # get the encoding info from the html meta headers; iso-8859-1 maps
    # every byte to a character so this read cannot fail on decoding
    with open(file, 'rt', encoding='iso-8859-1', errors='surrogateescape') as infp:
        header = infp.read(2048).lower()

    html_encoding = None
    for marker, codec in charset_markers:
        if marker in header:
            html_encoding = codec
            break

    # fall back to the interpreter default when nothing was declared
    if html_encoding is None:
        html_encoding = sys.getdefaultencoding()

    # now get the file encoding using chardet
    with open(file, 'rb') as rawfp:
        rawdata = rawfp.read(2048)
    chardet_encoding = chardet.detect(rawdata)['encoding']

    # compare the html and chardet encodings and if chardet contains
    # any 'utf' encodings then go with that as a preference
    if chardet_encoding is None:
        # chardet could not make a guess; only the declared charset is left
        final_encoding = html_encoding
    elif 'utf' in chardet_encoding.lower() or \
         chardet_encoding.upper() == html_encoding.upper():
        final_encoding = chardet_encoding
    else:
        final_encoding = html_encoding

    # Final test -- open the file normally and read & write it back. If
    # the read fails let the user know and stop the plugin app.
    output = wdir + os.sep + 'encoding_test.htm'
    html = None
    try:
        with open(file, 'rt', encoding=final_encoding) as infp:
            html = infp.read()
        with open(output, 'wt', encoding=final_encoding) as outfp:
            outfp.write(html)
        os.remove(file)
        os.rename(output, file)
    except (UnicodeError, LookupError, OSError) as err:
        # html is still None when the read itself failed (handled below);
        # a failure after a good read leaves the original file in place
        print(' >>> Encoding test failed: ' + str(err))

    if html is None:
        print('\n >>> Critical Error: The html file could not be \n' + \
              ' >>> read because of file encoding problems.')
        show_msgbox('File Encoding Error', 'The file could not be read because of file encoding ' + \
                       'problems.\n\n', msgtype='error')
        shutil.rmtree(wdir, ignore_errors=True)
        sys.exit(0)

    print(' -- Input file encoding is: ' + final_encoding.upper())
    return(final_encoding)

def fileNotLoadedError(title, msgtype):
    """ Warns, on the console and in a message box, that no epub or
        html file has been loaded into Sigil.

        title   -- caption passed to show_msgbox
        msgtype -- message type passed to show_msgbox

        Always returns 0.
    """
    headline = 'You have not loaded an epub or html file into Sigil.'
    advice = 'You must first load an epub or html file into Sigil and then run this plugin. Please try again.'

    # console copy of the warning
    print('\n >>> Warning!! ' + headline)
    print(' >>> ' + advice)

    # dialog copy: the same two sentences separated by a blank line
    show_msgbox(title, headline + '\n\n' + advice, msgtype)

    return 0

def addHTMLPresets(wdir, file):
    """ Forces a 'line-height: 1.2em;' preset into the file's styles.

        Pass 1: inside every rule whose opening line starts with 'p  ',
        contains '{' and does not contain 'sup', existing 'line-height:'
        lines are dropped and 'line-height: 1.2em;' is inserted in front
        of the closing brace.
        Pass 2: a 'p  { line-height: 1.2em; }' rule is written in place of
        each '</style>' line that closes a style block, followed by a fresh
        '</style>' line.

        wdir -- working directory holding the file
        file -- file name, joined onto wdir

        Always returns 0.
    """
    file = os.path.join(wdir, file)

    # ---- pass 1: rewrite line-height inside existing 'p  ' rules
    scratch = os.path.join(wdir, 'presets1.html')
    with open(scratch, 'wt', encoding='utf-8') as dst, \
         open(file, 'rt', encoding='utf8') as src:
        in_p_rule = False
        for text in src:
            if in_p_rule:
                if text.strip().startswith('line-height:'):
                    # old value is discarded, the preset replaces it
                    continue
                if '}\n' in text:
                    text = text.replace('}\n', 'line-height: 1.2em;\n}\n')
                    in_p_rule = False
            elif text.strip().startswith('p  ') and '{' in text and 'sup' not in text:
                in_p_rule = True
            dst.write(text)

    os.remove(file)
    os.rename(scratch, file)

    # ---- pass 2: append a generic 'p' rule just before each </style>
    scratch = os.path.join(wdir, 'presets2.html')
    with open(scratch, 'wt', encoding='utf-8') as dst, \
         open(file, 'rt', encoding='utf8') as src:
        in_style = False
        for text in src:
            if in_style and '</style>' in text:
                # the closing line itself is replaced, not copied
                dst.write('p  {\nline-height: 1.2em;\n}\n</style>\n')
                in_style = False
                continue
            if not in_style and ('<style>' in text or '<style type="text/css"' in text):
                in_style = True
            dst.write(text)

    os.remove(file)
    os.rename(scratch, file)
    return 0

def prettifyCSS(wdir, css):
    """ Tidies a css file in place, in two passes.

        Pass 1 (line by line): collapses a doubled '{  {' at the end of a
        line to a single '{', drops blank lines and lines holding only ';',
        removes ';' from lines that contain no ':', and strips surrounding
        whitespace from every remaining line.
        Pass 2 (whole file): collapses a doubled closing brace
        ('}' on two consecutive lines) into one.

        wdir -- working directory holding the file
        css  -- css file name, joined onto wdir

        Always returns 0.
    """
    css = os.path.join(wdir, css)

    # ---- pass 1: per-line clean-up
    scratch = os.path.join(wdir, 'link_rel.html')
    with open(scratch, 'wt', encoding='utf-8') as dst, \
         open(css, 'rt', encoding='utf8') as src:
        for raw in src:
            text = raw.replace('{  {\n', '{\n')
            bare = text.strip()

            # ensure no blank lines and no stray ';' lines in the css
            if bare in ('', ';'):
                continue

            # a ';' without any ':' cannot terminate a declaration
            if ';' in text and ':' not in text:
                bare = text.replace(';', '').strip()

            dst.write(bare + '\n')

    os.remove(css)
    os.rename(scratch, css)

    # ---- pass 2: remove doubled closing braces
    scratch = os.path.join(wdir, 'remove_curly.html')
    with open(scratch, 'wt', encoding='utf-8') as dst:
        with open(css, 'rt', encoding='utf8') as src:
            data = src.read().replace('\n}\n}\n', '\n}\n')
        dst.write(data)

    os.remove(css)
    os.rename(scratch, css)
    return 0

def cleanExit(wdir):
    """ Deletes the working directory tree, ignoring any errors.

        wdir -- directory to remove

        Always returns 0.
    """
    shutil.rmtree(path=wdir, ignore_errors=True)
    return 0
    
def repairHTMLIDs(wdir, file):
    """ Normalises ids and internal links in an html file, in place.

        Steps, in order:
          * 'name' attributes on <a> and <img> tags are converted to 'id'
          * spaces and '%20' are removed from <a> ids and from <a> hrefs
            that contain a '#'
          * ids on a/p/body/div/h1-h6 tags are passed through
            CheckandRepairID
          * the fragment (text after the first '#') of every <a> href is
            passed through CheckandRepairID as well, so links keep
            pointing at the repaired ids

        wdir -- working directory, used only for the temporary
                'html_repair.html' scratch file
        file -- path of the html file; it is opened exactly as given

        Always returns 0.
    """
    outfile = os.path.join(wdir, 'html_repair.html')

    # parse before creating the scratch file so a read failure leaves
    # nothing behind
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # convert name attributes to id attributes
    for tag in soup.find_all(['a', 'img']):
        if tag.has_attr('name'):
            id_ref = tag['name']
            del tag['name']
            tag['id'] = id_ref

    # remove spaces in id and href values
    for anchor in soup.find_all('a'):
        if anchor.has_attr('id'):
            anchor['id'] = anchor['id'].replace(' ', '').replace('%20', '')
        if anchor.has_attr('href') and '#' in anchor['href']:
            anchor['href'] = anchor['href'].replace(' ', '').replace('%20', '')

    print('\n >>> Check xhtml file ids and hrefs...')
    ### check and repair the xhtml text file ids
    id_tags = ['a', 'p', 'body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    for tag in soup.find_all(id_tags):
        # blank ids are left alone: there is no first character to repair
        if tag.has_attr('id') and tag['id'].strip():
            tag['id'] = CheckandRepairID(tag['id'])

    ### check and repair internal links
    for anchor in soup.find_all('a'):
        if anchor.has_attr('href') and '#' in anchor['href']:
            # partition() splits on the first '#' only, so an href that
            # contains several '#' characters no longer raises ValueError
            link, _, fragment = anchor['href'].partition('#')
            print(' >>> href id...' + fragment)
            # a bare '#' link has an empty fragment with nothing to repair
            if fragment.strip():
                fragment = CheckandRepairID(fragment)
            anchor['href'] = link + '#' + fragment

    with open(outfile, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    os.remove(file)
    os.rename(outfile, file)
    return(0)
    
def CheckandRepairID(id):
    """ Returns a copy of an id that does not start with a digit.

        Surrounding whitespace is stripped first. If the first remaining
        character is a digit it is replaced (not prefixed) by 'x', e.g.
        '1abc' becomes 'xabc'. Any other id, including an empty one, is
        returned unchanged apart from the stripping.

        id -- the id string to check

        Progress is reported on stdout.
    """
    id = id.strip()
    print('\n>>> Check ID...' + id)

    # An empty id has no first character to inspect; indexing it would
    # raise IndexError, so hand it back untouched.
    if not id or not id[0].isdigit():
        return(id)

    print(' >>> Repair id...' + id)
    new_id = 'x' + id[1:]
    print(' >>> New ID...' + new_id + '\n')
    return(new_id)
    
def fixEncodingErrors(line):
    """ Fixes encoding problems caused by
        en dash, em dash, curly quotes, ellipses etc.

        Applies a fixed list of str.replace() calls to ``line``, one after
        the other in the order written, and returns the resulting string.
        Because every call works on the output of the previous one, the
        order of the statements is part of the behaviour.

        NOTE(review): several of the search strings below render as an
        empty string or as a lone 'â' / 'Â' / 'Ã'. They presumably contain
        non-printable characters that most editors and viewers do not
        show -- TODO confirm with a hex dump before editing this function.
        If any of them really is empty, str.replace('', x) inserts x
        before every character and at the end of the string.
    """    
    # repair mixed encoding
    # cp 1252 to utf-8
    line = line.replace('â€™','’')       # apostrophe
    line = line.replace('â€œ','“')       # left double quote 
    # NOTE(review): as displayed, this search string is a prefix of the
    # em dash / en dash patterns handled a few lines further down; if it has
    # no hidden trailing character those two replacements can never match.
    line = line.replace('â€','”')        # right double quote    
    line = line.replace('Â©','©')        # copyright
    line = line.replace('Â®','®')        # registered
    line = line.replace('â€”', '—')      # em dash
    line = line.replace('â€“', '–')      # en dash
    line = line.replace('â„¢', '™')      # trade mark
    line = line.replace('â”', '–')       # replaced with an en dash
    
    # latin-1 encoded in win cp1252 in utf-8 code
    line = line.replace('Ã¢â‚¬Å“', '“')
    line = line.replace('Ã¢â‚¬Â', '”')
    line = line.replace('Ã¢â‚¬â„¢', '’')
    
    # NOTE(review): the next five search strings all display as a bare 'â'
    # but map to five different replacements, so they presumably differ in
    # hidden characters -- verify before touching.
    line = line.replace('â', '“')
    line = line.replace('â', '”')
    line = line.replace('â', '’')
    line = line.replace('â', '‘')
    line = line.replace('â', '–')
    line = line.replace('â¦', '…')  
    
    # other encodings to utf-8
    line = line.replace('Ã¢â‚¬Å“','“')   # left double quote
    line = line.replace('¢â‚¬Â','”')     # right double quote    
    line = line.replace('Ã¢â‚¬â„¢','’')  # apostrophe, right single quote
    line = line.replace('Ã¢â‚¬Ëœ', '‘')  # left single quote 
    line = line.replace('Ã¢â‚¬“','–')    # en dash
    line = line.replace('Ã¢â‚¬”', '—')   # em dash
    line = line.replace('Ì¶', '–')
    line = line.replace('Ã”', '”')
    line = line.replace('Ã”', '”')    
    line = line.replace('Ã”Ã', '”')
    # NOTE(review): as displayed this deletes every 'Ã'; presumably the
    # literal carries a hidden second character -- verify.
    line = line.replace('Ã', '')
    line = line.replace('Ãƒâ€šÃ‚Â', '')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â€žÂ¢', '’')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã…â€œ', '“')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã‚Â', '”')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â‚¬Å“','–')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã‹Å“', '‘')
    
    
    # NOTE(review): three search strings that all display as 'Â' with
    # different replacements; presumably they differ in hidden characters.
    line = line.replace('Â', '')
    line = line.replace('Â', '')
    line = line.replace('Â', '’')
    
    # NOTE(review): the next seven search strings display as empty. They
    # presumably each hold a single non-printable character -- TODO confirm;
    # see the warning in the docstring about replacing an empty string.
    line = line.replace('', '’') 
    line = line.replace('', '—')   
    line = line.replace('', '“')    
    line = line.replace('', '”')  
    line = line.replace('', '‘')    
    line = line.replace('', '…')
    line = line.replace('', '–')
    # drop the space that follows a copyright sign
    line = line.replace('© ', '©')
    return(line)
    
    
    
        