#!/Python3/python
# -*- coding: utf-8 -*-



from __future__ import unicode_literals, division, absolute_import, print_function


__all__=["getPointSize", "removeFontTags", "removeAlignAttr", "convertName2IDAttr", "fixHTMLAttrValues", "removeLangAttrs", "show_msgbox", "removeInternalLinks", "removeAttributes", "removeStyles", "removeAllIDS", "removeAllIDsLinks","removeInternetLinks", "convertTags", "removeRedundantCSS", "removeDivTags", "removeRedundantHTML", "fixEncodingErrors", "removeHardBreaks", "getImageSize", "formatImages", "reformatSmallImages", "removePageBreaks", "prettifyXHTMLFile", "convertITags", "convertBTags", "convertEMTags", "convertStrongTags", "convertSTags", "convertUTags", "addDOCTYPEHeader","removeHTMLStylespaces","fixScraggyHTML","removeCSSSpacing","prettifyCode","svgAttributes2CamelCase","removeExcessPTags"]


import os, os.path, sys, codecs, inspect, re, time, shutil
from decimal import *
from PIL import Image
import options
import tkinter as tk
import tkinter.messagebox as mbox
import locale

try:
    from sigil_bs4 import BeautifulSoup, Comment
except:
    from bs4 import BeautifulSoup, Comment    

    
def adjustCSSBody(wdir, file):
    """Rewrite a stylesheet in place, adding a serif font to ``body.globals``.

    A line containing ``body.globals  {`` is written back unstripped after
    ``font-family: serif;`` has been inserted directly behind the opening
    brace.  Every other line is stripped of surrounding whitespace and
    written back with a single trailing newline.

    Args:
        wdir: Directory holding the stylesheet and the temporary output file.
        file: Stylesheet file name, resolved relative to ``wdir``.

    Returns:
        Always ``0``.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'body_repaired.css')

    # Both handles are managed by ``with`` so they are closed even when a
    # read or write fails part-way through.
    with open(file, 'r', encoding='utf8') as infp, \
            open(output, 'w', encoding='utf-8') as outfp:
        for line in infp:
            if 'body.globals  {' in line:
                outfp.write(line.replace('body.globals  {\n',
                                         'body.globals  {\nfont-family: serif;\n'))
            else:
                outfp.write(line.strip() + '\n')

    # One rename swaps the repaired copy in; no remove/copy/remove sequence.
    os.replace(output, file)
    return 0
    
  
def getPointSize(size):
    """Translate an HTML font ``size`` attribute value into a point size.

    Any ``.0`` in the value is removed before the lookup, so ``'3.0'`` is
    treated like ``'3'``.

    Args:
        size: The attribute value as a string (``'1'`` .. ``'7'``).

    Returns:
        The point size as a string, or ``''`` (after printing an error
        message) when the value is not one of the known sizes.
    """
    point_sizes = {
        '1': '10',
        '2': '12',
        '3': '14',
        '4': '16',
        '5': '18',
        '6': '26',
        '7': '30',
    }

    points = point_sizes.get(size.replace('.0', ''))
    if points is not None:
        return(points)

    print('\n >>> Error: Unable to convert the font SIZE attribute value to points in the HTML.\n')
    return('')
            
def removeFontTags(wdir, file):
    """Strip ``<font>`` markup from an HTML file, rewriting it in place.

    Steps, in order:
      1. For every ``<p>`` that contains a ``<font>``, the ``style`` of the
         first such ``<font>`` is appended to (or becomes) the paragraph's
         own ``style`` and is removed from the ``<font>``.
      2. Every ``<font>`` is then unwrapped (tag removed, children kept)
         unless it has no ``face`` attribute, its ``size`` is not 3 and it
         still carries a ``style``; surviving tags lose their ``color``.

    Args:
        wdir: Working directory; holds the file and the temporary output.
        file: HTML file name, resolved relative to ``wdir``.

    Returns:
        Always ``0``.
    """
    output = os.path.join(wdir, 'remove_fontags.html')
    file = os.path.join(wdir, file)

    # Read through a context manager so the input handle is always closed.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # add 'font' attributes to 'style' attributes
    for p in soup.find_all('p'):
        font = p.find('font')
        if font is None or not font.has_attr('style'):
            continue
        if p.has_attr('style'):
            separator = '' if p['style'].endswith(';') else ';'
            p['style'] = p['style'] + separator + font['style']
        else:
            p['style'] = font['style']
        del font['style']

    # Each tag's fate depends only on its own attributes, so the original
    # four consecutive passes collapse into this single one.
    for font in soup.find_all('font'):
        if font.has_attr('face') or font.get('size') in ('3', 3):
            font.unwrap()
            continue
        del font['color']
        if not font.has_attr('style'):
            font.unwrap()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def removeAlignAttr(wdir, file):
    """Drop ``align`` attributes from paragraphs of a Word-derived HTML file.

    Nothing is done unless ``options.DOCTYPE`` equals ``'Word'``.  Only the
    values ``center``, ``justify`` and ``left`` are removed; any other
    value is kept.  The file is rewritten in place.

    Args:
        wdir: Working directory; holds the file and the temporary output.
        file: HTML file name, resolved relative to ``wdir``.

    Returns:
        Always ``0``.
    """
    if options.DOCTYPE != 'Word':
        return 0

    output = os.path.join(wdir, 'remove_center.html')
    file = os.path.join(wdir, file)
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for p_tag in soup.find_all('p'):
        # Test the paragraph's own attribute value.  Searching the serialised
        # tag would also match an ``align`` carried by a child element.
        if p_tag.has_attr('align') and p_tag['align'] in ('center', 'justify', 'left'):
            del p_tag['align']

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def convertName2IDAttr(wdir, file):
    """Replace ``name`` attributes with ``id`` attributes, in place.

    Applies to ``a``, ``p``, ``div`` and ``h1``-``h6`` elements.  The value
    of ``name`` becomes the element's ``id`` (overwriting any existing id)
    and the ``name`` attribute is removed.

    Args:
        wdir: Working directory; holds the file and the temporary output.
        file: HTML file name, resolved relative to ``wdir``.

    Returns:
        Always ``0``.
    """
    output = os.path.join(wdir, 'name2id.html')
    file = os.path.join(wdir, file)
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    search = ['a', 'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    for tag in soup.find_all(search):
        if tag.has_attr('name'):
            name = str(tag['name'])
            del tag['name']
            tag['id'] = name

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def fixHTMLAttrValues(wdir, file):
    """Remove spaces from the ``id`` values of img, a, p and div elements.

    The file is rewritten in place.

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    output = os.path.join(wdir, 'fix_ids1.htm')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for tag in soup.find_all(['img', 'a', 'p', 'div']):
        if tag.has_attr('id'):
            tag['id'] = str(tag['id']).replace(' ', '')

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def removeLangAttrs(wdir, file):
    """Remove English ``lang`` / ``xml:lang`` attributes from an HTML file.

    The file is rewritten in place, twice:

    Pass 1, line by line:
      * every ``<span>`` carrying ``lang`` or ``xml:lang`` loses both
        attributes and is unwrapped (tag removed, children kept); any line
        containing a ``<span>`` is re-serialised by BeautifulSoup;
      * on lines without ``<html``, the literal attributes
        `` xml:lang="..."`` and `` lang="..."`` are deleted for the
        English variants listed in ``english_variants``.

    Pass 2, line by line: a line that contains ``<body`` together with one
    of the ``xml:lang`` values in ``body_variants`` is replaced by a bare
    ``<body>`` line.

    Args:
        wdir: Working directory; holds the file and the temporary outputs.
        file: HTML file name, resolved relative to ``wdir``.

    Returns:
        Always ``0``.
    """
    english_variants = ('en', 'en-PH', 'en-GB', 'en-US',
                        'EN', 'EN-PH', 'EN-GB', 'EN-US')
    body_variants = ('en-PH', 'EN-US', 'EN-GB', 'EN-us', 'EN-gb',
                     'en-US', 'en-GB', 'en-us', 'en-gb', 'EN', 'en')

    # Resolve the path once; joining it again for the second pass would
    # yield a wrong path whenever ``wdir`` is relative.
    file = os.path.join(wdir, file)

    # ---- pass 1: spans and literal attribute text ----
    output = os.path.join(wdir, 'remove_lang.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            soup = BeautifulSoup(line, 'html.parser')
            spans = soup.find_all('span')
            for tag in spans:
                if tag.has_attr('lang') or tag.has_attr('xml:lang'):
                    del tag['lang']
                    del tag['xml:lang']
                    # Unwrap exactly once: unwrapping a tag that has already
                    # left the tree raises.
                    # NOTE(review): this also discards any other attributes
                    # the span carried (class, style) -- confirm intended.
                    tag.unwrap()
            if spans:
                line = str(soup)

            # A former ``find_all('body', 'p', 'h1', 'h2', 'h3')`` loop stood
            # here.  Its positional arguments were interpreted as
            # name/attrs/recursive/string/limit, so it never stripped
            # anything; the literal replacements below cover those tags.
            if '<html' not in line:
                if 'xml:lang' in line:
                    for variant in english_variants:
                        line = line.replace(' xml:lang="%s"' % variant, '')
                if 'lang="' in line:
                    for variant in english_variants:
                        line = line.replace(' lang="%s"' % variant, '')

            outfp.write(line)
    os.replace(output, file)

    # ---- pass 2: normalise the <body> line ----
    output = os.path.join(wdir, 'lang.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            # Require '<body' for every variant so that a line that merely
            # mentions "body" is never replaced.
            if '<body' in line and any('xml:lang="%s"' % variant in line
                                       for variant in body_variants):
                line = '<body>\n'
            outfp.write(line)
    os.replace(output, file)
    return 0
      
    
def show_msgbox(title, msg, msgtype='info'):
    """ Show an information, warning or error message box.

        A withdrawn Tk root is created first and given the default font
        'Helvetica -12'.  Returns whatever the tkinter messagebox call
        returns, or None when msgtype is not 'info', 'warning' or 'error'.
    """
    hidden_root = tk.Tk()
    hidden_root.withdraw()
    hidden_root.option_add('*font', 'Helvetica -12')
    hidden_root.quit()

    dialogs = {
        'info': mbox.showinfo,
        'warning': mbox.showwarning,
        'error': mbox.showerror,
    }
    dialog = dialogs.get(msgtype)
    if dialog is None:
        return None
    return(dialog(title, msg))
    
def removeInternalLinks(wdir, file):
    """Remove every non-http(s) link from an HTML file, in place.

    Nothing is done when ``options.REMOVE_INT_LINKS`` equals ``False``.
    For each ``<a>`` whose ``href`` contains neither ``http:`` nor
    ``https:``, the ``href`` and ``class`` attributes are deleted; if no
    attribute remains the anchor is unwrapped (tag removed, text kept).

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    # Equality test kept on purpose: only a value equal to False disables
    # the step (``None`` does not).
    if options.REMOVE_INT_LINKS == False:
        return 0

    print(' -- Remove all internal links')
    output = os.path.join(wdir, 'remove_int_links.htm')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for tag in soup.find_all('a'):
        if not tag.has_attr('href'):
            continue
        href = str(tag['href'])
        if 'http:' in href or 'https:' in href:
            continue
        del tag['href']
        del tag['class']
        if not tag.attrs:
            tag.unwrap()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def removeAttributes(wdir, file):
    """Delete presentational/proprietary attributes from an HTML file.

    The file is rewritten in place.  Which attributes are removed depends
    on the element; see the ``rules`` table in the body.

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    print(' -- Remove or change non-compliant attributes')
    output = os.path.join(wdir, 'body.html')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    common = ['dir', 'border', 'link', 'vlink', 'text', 'lang', 'clear']
    # (element names, attributes to delete from those elements)
    rules = [
        (['p', 'img', 'span', 'body', 'a', 'h1'], common + ['hspace', 'vspace']),
        (['h2', 'h3', 'h4', 'h5', 'h6'], common),
        (['br'], common + ['class']),
    ]
    for tag_names, attributes in rules:
        for tag in soup.find_all(tag_names):
            for attribute in attributes:
                # Deleting an attribute the tag does not have is a no-op.
                del tag[attribute]

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def removeStyles(wdir, file):
    """Filter unwanted CSS out of the style section of an HTML file.

    The file is rewritten in place, line by line:

    * anywhere in the file, lines whose stripped text starts with ``<a``
      are dropped and `` dir="ltr"`` is removed;
    * up to and excluding the first line containing ``</style>``:
        - rules whose first line contains ``kix_`` are dropped, through the
          next line containing ``}``;
        - ``font-weight: 700`` / ``font-weight: 400`` become ``bold`` /
          ``normal``;
        - lines containing one of the declarations listed in
          ``compact_needles`` (matched with spaces removed) or
          ``raw_needles``, and lines that are just ``;``, are dropped;
    * everything from that ``</style>`` line on is copied through.

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    output = os.path.join(wdir, 'remove_styles.html')
    compact_needles = ('position:absolute', 'font-variant:normal',
                       'text-decoration:none', 'letter-spacing:normal',
                       'vertical-align:normal')
    raw_needles = ('transform:', 'so-language:', '-webkit-transform',
                   'direction:', 'widows:', 'orphans:')

    in_styles = True
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if line.strip().startswith('<a'):
                continue

            line = line.replace(' dir="ltr"', '')

            if '</style>' in line:
                outfp.write(line)
                in_styles = False
                continue
            if not in_styles:
                outfp.write(line)
                continue

            # remove Google html kix styles
            if 'kix_' in line:
                if '}' in line:
                    # The rule opens and closes on this line; reading on
                    # would swallow the following rule as well.
                    continue
                # Deliberately advances the same iterator: skip the rule
                # body up to its closing brace.
                for line in infp:
                    if '}' in line:
                        break
                line = line.replace('}\n', '')

            # Replace the whole declaration so that an unrelated "700" or
            # "400" on the same line (e.g. a width) is left alone.
            line = line.replace('font-weight: 700', 'font-weight: bold')
            line = line.replace('font-weight: 400', 'font-weight: normal')

            squeezed = line.replace(' ', '')
            if (line == ';\n'
                    or any(needle in squeezed for needle in compact_needles)
                    or any(needle in line for needle in raw_needles)):
                continue
            outfp.write(line)

    os.replace(output, file)
    return 0
 
    
def removeAllIDS(bk, wdir, file):
    """ Removes all ids/bookmarks from html text.

    Nothing is done when ``options.REMOVE_IDS`` equals ``False``.
    Otherwise:
      * every ``id`` attribute is deleted; an ``<a>`` left without any
        attribute is unwrapped;
      * the ``#fragment`` part is cut from every non-http(s) ``href``;
      * the ``#fragment`` part is cut from every guide entry obtained from
        ``bk.getguide()`` and the list is stored with ``bk.setguide()``;
      * the file is rewritten in place.

    Args:
        bk: Object providing ``getguide()`` (yielding ``(type, title,
            href)`` tuples) and ``setguide(list)``.
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    if options.REMOVE_IDS == False:
        return 0

    print(' -- In Remove all ids')
    output = os.path.join(wdir, 'remove_links.htm')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # remove all ids/bookmarks from the xhtml
    for tag in soup.find_all(id=True):
        del tag['id']
        if tag.name == 'a' and not tag.attrs:
            tag.unwrap()

    # remove all internal id fragments from internal links
    for htag in soup.find_all(href=True):
        href = htag['href']
        if 'http:' not in href and 'https:' not in href and '#' in href:
            htag['href'] = href.split('#')[0]

    # remove all the id fragments from the guide links in the opf;
    # split('#')[0] returns the href unchanged when it has no fragment.
    bk.setguide([(kind, title, href.split('#')[0])
                 for kind, title, href in bk.getguide()])

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def removeAllIDsLinks(wdir, file):
    """ Removes all ids/bookmarks and fragment links from epub/html text.

    Nothing is done when ``options.REMOVE_IDS_AND_LINKS`` equals ``False``.
    Otherwise:
      * every ``id`` attribute inside ``<body>`` is deleted; an ``<a>``
        left without any attribute is unwrapped;
      * every non-http(s) ``href`` containing ``#`` is deleted together
        with the element's ``class``; an ``<a>`` left without any
        attribute is unwrapped;
      * the file is rewritten in place.

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    if options.REMOVE_IDS_AND_LINKS == False:
        return 0

    print(' -- In Remove all ids')
    output = os.path.join(wdir, 'remove_links.htm')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # A fragment without <body> is searched as a whole instead of failing
    # on ``None.find_all``.
    root = soup.body if soup.body is not None else soup

    # remove all ids/bookmarks from the xhtml
    for tag in root.find_all(id=True):
        del tag['id']
        if tag.name == 'a' and not tag.attrs:
            tag.unwrap()

    # remove all internal links and remove all associated bookmarks/ids
    for htag in soup.find_all(href=True):
        href = htag['href']
        if 'http:' in href or 'https:' in href or '#' not in href:
            continue
        del htag['href']
        del htag['class']
        if htag.name == 'a' and not htag.attrs:
            htag.unwrap()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def removeInternetLinks(wdir, file):
    """Remove every http/https link from an HTML file, in place.

    Nothing is done when ``options.REMOVE_INET_LINKS`` equals ``False``.
    For each ``<a>`` whose ``href`` contains ``http:`` or ``https:``, the
    ``href`` and ``class`` attributes are deleted; if no attribute remains
    the anchor is unwrapped (tag removed, text kept).

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    if options.REMOVE_INET_LINKS == False:
        return 0

    print(' -- Remove all internet links')
    output = os.path.join(wdir, 'remove_links.htm')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for tag in soup.find_all('a'):
        if not tag.has_attr('href'):
            continue
        href = tag['href']
        if 'http:' in href or 'https:' in href:
            del tag['href']
            del tag['class']
            if not tag.attrs:
                tag.unwrap()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0

def convertTags(wdir, file):
    """Run the inline-tag converters over ``file`` when conversion is enabled.

    Nothing is done when ``options.CONVERT_TAGS`` equals ``False``.
    Otherwise the i, b, em, strong, u and s converters are called, in that
    order, each with ``(wdir, file)``.

    Returns:
        Always ``0``.
    """
    if options.CONVERT_TAGS == False:
        return(0)

    converters = (
        convertITags,
        convertBTags,
        convertEMTags,
        convertStrongTags,
        convertUTags,
        convertSTags,
    )
    for convert in converters:
        convert(wdir, file)

    return(0)
    
    
def removeRedundantCSS(wdir, file):
    """Drop unwanted declarations from a stylesheet, rewriting it in place.

    Line by line:
      * lines containing ``page-break`` are dropped;
      * lines starting with ``color:windowtext`` (spaces ignored) are
        dropped;
      * lines containing ``<![CDATA[`` or ``]]`` are dropped;
      * ``line-height:`` lines are dropped when ``options.REMOVE_LH`` is
        true;
      * ``border:`` / ``border-`` lines are dropped when
        ``options.EPUB_VERSION`` is ``'EPUB2'``;
      * `` align="center|justify|left"`` (any letter case) is turned into
        an equivalent inline ``style`` attribute.

    Args:
        wdir: Working directory; holds the file and the temporary output.
        file: Stylesheet file name, resolved relative to ``wdir``.

    Returns:
        Always ``0``.
    """
    file = os.path.join(wdir, file)
    output = os.path.join(wdir, 'adhoc_cleanup.css')

    # Case-insensitive so that mixed-case spellings are converted as well.
    align_attr = re.compile(r' align="(center|justify|left)"', re.IGNORECASE)

    def align_to_style(match):
        # Declarations are separated by ';' -- the former ':' separator
        # produced an invalid style value.
        return ' style="text-align: %s;text-indent: 0em;"' % match.group(1).lower()

    with open(file, 'r', encoding='utf8') as infp, \
            open(output, 'w', encoding='utf-8') as outfp:
        for line in infp:
            if 'page-break' in line:
                continue

            stripped = line.strip()
            if stripped.replace(' ', '').startswith('color:windowtext'):
                continue

            if '<![CDATA[' in line or ']]' in line:
                continue

            if options.REMOVE_LH == True and stripped.startswith('line-height:'):
                continue

            if options.EPUB_VERSION == 'EPUB2' and ('border:' in line or 'border-' in line):
                continue

            outfp.write(align_attr.sub(align_to_style, line))

    os.replace(output, file)
    return 0
    
def removeDivTags(wdir, file):
    """Unwrap every ``<div>`` and ``<section>`` in an HTML file, in place.

    Nothing is done when ``options.REMOVE_DIV_TAGS`` equals ``False``.
    The tags (and their attributes) are removed; their children stay.

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    if options.REMOVE_DIV_TAGS == False:
        return 0

    print(' -- Remove all div tags')
    output = os.path.join(wdir, 'new_html.htm')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for tag in soup.find_all(['div', 'section']):
        tag.unwrap()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)
    return 0
    
def removeRedundantHTML(wdir, file):
    """Apply ad-hoc, line-based clean-ups to an HTML file, in place.

    Each line is first passed through ``fixEncodingErrors`` and
    ``removePageBreaks``.  Then:
      * lines containing ``<!--`` or ``-->`` are dropped;
      * ``<hr`` lines that mention ``page-break`` are dropped;
      * lines containing ``size:8.5in 11.0in;`` are dropped;
      * `` border="0"`` is removed;
      * ``letter-spacing: normal``, ``text-decoration: none`` and
        ``font-variant: normal`` are removed, with their trailing ``;``
        when present;
      * tab characters are removed.

    NOTE(review): only the first and last line of a multi-line comment
    contain the comment markers, so its inner lines are kept.

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        ``file``, unchanged.
    """
    print(' -- Remove or change redundant or non-compliant html code')
    output = os.path.join(wdir, 'adhoc_cleaning.html')
    redundant_declarations = ('letter-spacing: normal',
                              'text-decoration: none',
                              'font-variant: normal')

    with open(file, 'rt', encoding='utf-8') as infp, \
            open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            line = fixEncodingErrors(line)
            line = removePageBreaks(line)

            if '<!--' in line or '-->' in line:
                continue
            if '<hr' in line and 'page-break' in line:
                continue
            if 'size:8.5in 11.0in;' in line:
                continue

            line = line.replace(' border="0"', '')

            for declaration in redundant_declarations:
                terminated = declaration + ';'
                if terminated in line:
                    line = line.replace(terminated, '')
                else:
                    line = line.replace(declaration, '')

            # The former ``line.strip() == None`` test could never be true
            # and has been removed; blank lines are still written.
            outfp.write(line.replace('\t', ''))

    os.replace(output, file)
    return(file)
    
def fixEncodingErrors(line):
    """ Repair common encoding damage in a line of text and return it.

        Applies a fixed sequence of str.replace() calls that map
        mis-decoded sequences (en dash, em dash, curly quotes, ellipses
        etc.) back to the intended characters, and converts the entities
        &lt;, &gt;, &ldquo; and &rdquo; to their literal characters.

        NOTE(review): several of the search literals below appear to
        contain non-printing characters -- edit them only with a tool that
        displays control characters.
    """    
    # repair mixed encoding
    # cp 1252 to utf-8
    line = line.replace('â€™','’')       # apostrohe   
    line = line.replace('â€œ','“')       # left double quote 
    line = line.replace('â€','”')     # right double quote    
    line = line.replace('Â©','©')        # copyright
    line = line.replace('Â®','®')        # registered
    line = line.replace('â€”', '—')      # em dash
    line = line.replace('â€“', '–')      # en dash
    line = line.replace('â„¢', '™')
    line = line.replace('â”', '–')
    
    # latin-1 encoded in win cp1252 in utf-8 code
    line = line.replace('Ã¢â‚¬Å“', '“')
    line = line.replace('Ã¢â‚¬Â', '”')
    line = line.replace('Ã¢â‚¬â„¢', '’')
    
    # other encodings to utf-8
    line = line.replace('Ã¢â‚¬Å“','“')   # left double quote
    line = line.replace('¢â‚¬Â','”')     # right double quote    
    line = line.replace('Ã¢â‚¬â„¢','’')  # apostrohe, right single quote
    line = line.replace('Ã¢â‚¬Ëœ', '‘')  # left single quote 
    line = line.replace('Ã¢â‚¬“','–')    # en dash
    line = line.replace('Ã¢â‚¬”', '—')
    line = line.replace('Ì¶', '–')
    line = line.replace('Ã”', '”')
    line = line.replace('Ã”Ã', '”')
    line = line.replace('Ã', '')
    line = line.replace('Ãƒâ€šÃ‚Â', '')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â€žÂ¢', '’')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã…â€œ', '“')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã‚Â', '”')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã¢â‚¬Å“','–')
    line = line.replace('ÃƒÂ¢Ã¢â€šÂ¬Ã‹Å“', '‘')
    
    
    line = line.replace('Â', '')
    line = line.replace('Â', '')
    line = line.replace('Â', '’')
    
    line = line.replace('', '’') 
    line = line.replace('', '—')   
    line = line.replace('', '“')    
    line = line.replace('', '”')  
    line = line.replace('', '‘')    
    line = line.replace('', '…')
    line = line.replace('', '–')
    line = line.replace('© ', '©')
    
    line = line.replace(r'&lt;', '<')
    line = line.replace(r'&gt;', '>')
    
    line = line.replace(r'&ldquo;', '“')
    line = line.replace(r'&rdquo;', '”')
    
    
    return(line)     
	   
def removeHardBreaks(wdir, file):
    """Remove every ``<br>`` from the body of an HTML file, in place.

    Nothing is done when ``options.REMOVE_HARD_BREAKS`` equals ``False``.
    After the rewrite the file is passed to ``prettifyXHTMLFile``.

    Args:
        wdir: Working directory used for the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        ``0`` when the step is disabled, otherwise ``file``.
    """
    if options.REMOVE_HARD_BREAKS == False:
        return 0

    print(' -- Remove blank lines')
    output = os.path.join(wdir, 'new_html2.htm')
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # A fragment without <body> is searched as a whole instead of failing
    # on ``None.find_all``.
    root = soup.body if soup.body is not None else soup
    for br_tag in root.find_all('br'):
        br_tag.extract()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))
    os.replace(output, file)

    prettifyXHTMLFile(wdir, file)
    return(file)
    
def getImageSize(image):
    """ Uses PIL to get image dimensions.

    Newline characters are removed from the path before it is opened.

    Args:
        image: Path of the image file.

    Returns:
        A ``(width, height)`` tuple in pixels.
    """
    image = image.replace('\n', '')
    # The context manager closes the underlying file as soon as the size
    # has been read.
    with Image.open(image) as im:
        wd, ht = im.size
    return(wd, ht)
            
    
def formatImages(wdir, line):
    """Give the first ``<img>`` on a line a percentage width and an ``alt``.

    The image named by the tag's ``src`` is looked up by base name inside
    ``wdir`` and measured.  Its width is expressed as a percentage of a
    650-pixel page, capped at 100, and written as
    ``style="width: N%;height: auto;"``.  The ``class`` attribute is
    removed.  ``width`` / ``height`` attributes are removed when the tag
    had no ``style`` of its own.  An ``alt`` attribute holding the image
    file name is added when the tag has none.

    Args:
        wdir: Directory containing the image files.
        line: One line of HTML expected to contain an ``<img>`` tag.

    Returns:
        The re-serialised line, or ``line`` unchanged when it holds no
        ``<img>`` tag or the tag has no ``src``.
    """
    soup = BeautifulSoup(line, 'html.parser')
    img = soup.find('img')
    if img is None or not img.has_attr('src'):
        return(line)

    # get the img file name from the img path
    image_name = os.path.basename(img['src'])

    # calculate width as a percentage of screen width
    width, _height = getImageSize(os.path.join(wdir, image_name))
    perc_width = min(round(width / 650 * 100), 100)

    # Set 'alt' on the soup that is actually serialised (it used to be set
    # on a copy that was thrown away), and keep an existing alt text.
    if not img.has_attr('alt'):
        img['alt'] = image_name

    del img['class']
    if not img.has_attr('style'):
        del img['width']
        del img['height']
    img['style'] = 'width: ' + str(perc_width) + '%;height: auto;'

    return(str(soup))
    
def reformatSmallImages(wdir, file):
    """Run ``formatImages`` over every line of a file that contains ``<img``.

    Nothing is done when ``options.PRESERVE_IMG_SIZE`` equals ``False``.
    All other lines are copied unchanged; the file is rewritten in place.

    Args:
        wdir: Working directory; passed to ``formatImages`` and used for
            the temporary output file.
        file: Path of the HTML file; used as given, not joined with ``wdir``.

    Returns:
        Always ``0``.
    """
    if options.PRESERVE_IMG_SIZE == False:
        return 0

    print(' -- Reformat smaller images')
    outfile = os.path.join(wdir, 'images.html')
    with open(file, 'rt', encoding='utf-8') as infp, \
            open(outfile, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if '<img' in line:
                # The separate parse that stripped 'class'/'style' here was
                # never passed on, so it has been removed.
                # NOTE(review): formatImages only deletes width/height when
                # the tag has no style -- confirm whether the stripped tag
                # was meant to be passed instead.
                line = formatImages(wdir, line)
            outfp.write(line)

    os.replace(outfile, file)
    return 0

def removePageBreaks(line):
    """Remove known ``page-break-*`` declarations from a line of text.

    The declarations handled are ``page-break-before`` with ``always``,
    ``auto`` or ``avoid``; ``page-break-after`` with ``avoid`` or ``auto``;
    and ``page-break-inside`` with ``auto`` or ``avoid``.  Each is matched
    with and without a single space after the colon, and with or without a
    trailing semicolon.  Anything else is left untouched.

    Args:
        line: the text to clean.

    Returns:
        ``line`` with every listed declaration removed.
    """
    # Same declarations, in the same order, as the original if-chain.
    # NOTE(review): 'page-break-after: always' was never part of the list and
    # is therefore still kept -- confirm that this is intentional.
    declarations = (
        'page-break-before: always',
        'page-break-before:always',
        'page-break-before: auto',
        'page-break-before:auto',
        'page-break-before: avoid',
        'page-break-before:avoid',
        'page-break-after: avoid',
        'page-break-after:avoid',
        'page-break-after: auto',
        'page-break-after:auto',
        'page-break-inside: auto',
        'page-break-inside:auto',
        'page-break-inside: avoid',
        'page-break-inside:avoid',
    )

    for declaration in declarations:
        if declaration in line:
            # Remove the ';'-terminated form first, then any bare form.  The
            # old either/or test left the bare form behind whenever both
            # variants occurred on the same line.
            line = line.replace(declaration + ';', '').replace(declaration, '')

    return(line)
    

def prettifyXHTMLFile(wdir, file):
    """Reformat and prettify the imported HTML file, line by line.

    The cleaned text is written to 'final_one.css' inside ``wdir`` and then
    renamed over ``file``.  Afterwards removeHTMLStylespaces(),
    fixScraggyHTML(), prettifyCode() and svgAttributes2CamelCase() are run
    on the same file, in that order.

    Which lines are dropped depends on options.REMOVE_EMPTY_PARAS,
    options.REMOVE_NBSP_ENTITIES and options.REMOVE_HR_TAGS.

    Returns 0.
    """
    # reformat and prettify the imported HTML file
    outfile= os.path.join(wdir, 'final_one.css')
    infp = open(file, 'rt', encoding='utf-8')
    outfp = open(outfile, 'wt', encoding='utf-8')
    for line in infp:

        # whitespace-only lines are dropped (writing '' emits nothing)
        if line.strip() == '':
            line = ''
            outfp.write(line)
            continue

        # collapse doubled semicolons
        if ';;' in line:
            line = line.replace(';;',';')

        # closing tags are skipped here; one pair is appended after the loop
        if '</body>' in line or '</html>' in line:
            continue

        # strip the literal entity '&#11134;' and tidy a style attribute
        # that begins with a stray semicolon
        # NOTE(review): the original comment described this as removing
        # tabs -- confirm which character this entity stands for.
        line = line.replace(r'&#11134;','')
        line = line.replace('style=";',' style="')

        search = ['p','div','h1','h2']
        # blank out lines whose first p/div/h1/h2 tag contains no text,
        # unless the line holds an <img>, <svg> or <br>
        if options.REMOVE_EMPTY_PARAS == True:
            soup  = BeautifulSoup(line, 'html.parser')
            for ptag in soup.find_all(search, limit=1):
                if not ptag.get_text() and \
                    '<img' not in str(soup) and \
                    '<svg' not in str(soup) and \
                    '<br' not in str(soup):
                    line = ''
                    # NOTE(review): this 'continue' belongs to the inner
                    # 'for ptag' loop, not the outer line loop, so the
                    # emptied line still falls through to the final 'else'
                    # below and is written as '\n\n'.
                    continue

        # remove all instances of non-breaking space(ie &nbsp)
        if options.REMOVE_NBSP_ENTITIES == True:
            if r'&#160;' in line or r'&nbsp;' in line:
                # remove the non-breaking space entity (this also strips
                # the line's leading and trailing whitespace)
                line = line.strip().replace(r'&#160;','')
                line = line.strip().replace(r'&nbsp;','')
                soup = BeautifulSoup(line, 'html.parser')
                # if the line also contains no text then just delete the line
                # NOTE(review): str.strip() never returns None, so the
                # '== None' comparison cannot be true.
                if (soup.get_text().strip() == '' or soup.get_text().strip() == None) and '<img' not in str(soup) and '<svg' not in str(soup):
                    continue


        # drop lines that start with an <hr> tag
        if options.REMOVE_HR_TAGS == True:
            if line.strip().startswith('<hr'):
                continue

        # Header handling: once the XML declaration is seen, the following
        # lines are consumed from the same iterator up to and including the
        # first line that contains '</head>'.
        if '<?xml' in line:
            outfp.write(line)
            for line in infp:
                if '</head>' not in line:
                    if line.strip() == '' or '<![CDATA[' in line or ']]>' in line:
                        # written below as a bare newline, not removed
                        line = ''
                    elif line.strip().startswith('"http:'):
                        line = '  ' + line.strip() + '\n'
                    elif line.strip().startswith('<html'):
                        line = '\n' + line
                    elif '<style>' in line:
                        line = '<style type="text/css">\n'
                    # NOTE(review): the strip() here removes the two-space
                    # indent and the leading newline added just above.
                    outfp.write(line.strip() + '\n')
                else:
                    outfp.write(line)
                    break
            continue

        # Structural lines (used when the file had no XML declaration):
        # written without a preceding blank line.
        # NOTE(review): the '<?xml' test cannot match here because such
        # lines were already handled and skipped above.
        if line.strip().startswith('<?xml') or \
            line.strip().startswith('<!DOCTYPE') or \
            line.strip().startswith('<html') or \
            line.strip().startswith('<meta')or \
            line.strip().startswith('<title>') or \
            line.strip().startswith('<link') or \
            line.strip().startswith('</head>') or \
            line.strip().startswith('<body') or \
            line.strip().startswith('<body>'):
            line = line.strip()
            if not line:
                continue
            # indent head entries by two spaces
            if line.startswith('<meta') or \
                line.startswith('<title>') or \
                line.startswith('<link'):
                line = '  ' + line
            # put a blank line in front of the opening <body>
            if line.startswith('<body'):
                line = '\n' + line
            # NOTE(review): lines containing '</body>' are skipped near the
            # top of the loop, so this first branch looks unreachable.
            if line.startswith('</body>'):
                outfp.write('\n' + line.rstrip() + '\n')
            else:
                outfp.write(line.rstrip() + '\n')
        else:
            # every other line: stripped, indented by two spaces when it
            # starts with '<p' or '<li', and preceded by an empty line
            line = line.strip()
            if line.startswith('<p') or line.startswith('<li'):
                line = '  ' + line
            outfp.write('\n' + line + '\n')

    # re-add the closing tags that were skipped inside the loop
    outfp.write('\n</body>\n</html>\n')
    infp.close()
    outfp.close()
    os.remove(file)
    os.rename(outfile, file)

    # follow-up clean-up passes over the rewritten file
    removeHTMLStylespaces(wdir, file)  ###
    fixScraggyHTML(wdir, file)  ###
    prettifyCode(wdir, file)
    svgAttributes2CamelCase(wdir, file)
    return(0)
  
    
def convertITags(wdir, file):
    """Rewrite every ``<i>`` element of ``file`` as an italic ``<span>``.

    The file is parsed with BeautifulSoup's ``html.parser``.  Each ``<i>``
    tag is renamed to ``span`` and its ``style`` attribute is set to
    ``font-style: italic;`` (replacing any style the tag already carried).
    The document is written to ``new_html.htm`` in ``wdir`` and then moved
    over ``file``.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the HTML file to convert in place.

    Returns:
        0
    """
    output = os.path.join(wdir, 'new_html.htm')

    # Read inside a context manager so the input handle is closed before
    # the file is replaced below.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # A single document-wide search replaces the former
    # find_all(True) / find_all('i') nesting, which re-scanned the subtree
    # of every tag; it also reaches an <i> that has no parent tag.
    for itag in soup.find_all('i'):
        itag['style'] = 'font-style: italic;'
        itag.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)
    
def convertBTags(wdir, file):
    """Rewrite every ``<b>`` element of ``file`` as a bold ``<span>``.

    The file is parsed with BeautifulSoup's ``html.parser``.  Each ``<b>``
    tag is renamed to ``span`` and its ``style`` attribute is set to
    ``font-weight: bold;`` (replacing any style the tag already carried).
    The document is written to ``new_html.htm`` in ``wdir`` and then moved
    over ``file``.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the HTML file to convert in place.

    Returns:
        0
    """
    output = os.path.join(wdir, 'new_html.htm')

    # The context manager guarantees the input handle is closed before the
    # file is replaced below.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # One search over the whole document instead of searching below every
    # single tag; this also covers a <b> that has no parent tag.
    for btag in soup.find_all('b'):
        btag['style'] = 'font-weight: bold;'
        btag.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)
                
def convertEMTags(wdir, file):
    """Rewrite ``<em>`` elements inside paragraphs as italic ``<span>`` tags.

    The file is parsed with BeautifulSoup's ``html.parser``.  Every ``<em>``
    found below a ``<p>`` is renamed to ``span`` and its ``style`` attribute
    is set to ``font-style: italic;`` (replacing any existing style).  The
    document is written to ``new_html.htm`` in ``wdir`` and then moved over
    ``file``.

    NOTE(review): unlike the sibling convert*Tags helpers, only ``<em>``
    tags nested in a ``<p>`` are converted -- confirm this is intended.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the HTML file to convert in place.

    Returns:
        0
    """
    output = os.path.join(wdir, 'new_html.htm')

    # The context manager guarantees the input handle is closed before the
    # file is replaced below.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    for ptag in soup.find_all('p'):
        for emtag in ptag.find_all('em'):
            emtag['style'] = 'font-style: italic;'
            emtag.name = 'span'

    # write() takes the whole string at once; writelines() on a str would
    # iterate over it character by character.
    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)
    
def convertStrongTags(wdir, file):
    """Rewrite every ``<strong>`` element of ``file`` as a bold ``<span>``.

    The file is parsed with BeautifulSoup's ``html.parser``.  Each
    ``<strong>`` tag is renamed to ``span`` and its ``style`` attribute is
    set to ``font-weight: bold;`` (replacing any existing style).  The
    document is written to ``new_html.htm`` in ``wdir`` and then moved over
    ``file``.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the HTML file to convert in place.

    Returns:
        0
    """
    output = os.path.join(wdir, 'new_html.htm')

    # The context manager guarantees the input handle is closed before the
    # file is replaced below.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # One search over the whole document instead of searching below every
    # single tag; this also covers a <strong> that has no parent tag.
    for strong_tag in soup.find_all('strong'):
        strong_tag['style'] = 'font-weight: bold;'
        strong_tag.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)
    
def convertSTags(wdir, file):
    """Rewrite every ``<s>`` element of ``file`` as a struck-through ``<span>``.

    The file is parsed with BeautifulSoup's ``html.parser``.  Each ``<s>``
    tag is renamed to ``span`` and its ``style`` attribute is set to
    ``text-decoration: line-through;`` (replacing any existing style).  The
    document is written to ``new_html.htm`` in ``wdir`` and then moved over
    ``file``.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the HTML file to convert in place.

    Returns:
        0
    """
    output = os.path.join(wdir, 'new_html.htm')

    # The context manager guarantees the input handle is closed before the
    # file is replaced below.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # One search over the whole document instead of searching below every
    # single tag; this also covers an <s> that has no parent tag.
    for stag in soup.find_all('s'):
        stag['style'] = 'text-decoration: line-through;'
        stag.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)
    
def convertUTags(wdir, file):
    """Rewrite every ``<u>`` element of ``file`` as an underlined ``<span>``.

    The file is parsed with BeautifulSoup's ``html.parser``.  Each ``<u>``
    tag is renamed to ``span`` and its ``style`` attribute is set to
    ``text-decoration: underline;`` (replacing any existing style).  The
    document is written to ``new_html.htm`` in ``wdir`` and then moved over
    ``file``.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the HTML file to convert in place.

    Returns:
        0
    """
    output = os.path.join(wdir, 'new_html.htm')

    # The context manager guarantees the input handle is closed before the
    # file is replaced below.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # One search over the whole document instead of searching below every
    # single tag; this also covers a <u> that has no parent tag.
    for utag in soup.find_all('u'):
        utag['style'] = 'text-decoration: underline;'
        utag.name = 'span'

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)

def addDOCTYPEHeader(wdir, file):
    """Replace the XML declaration line with a declaration plus DOCTYPE.

    ``file`` is copied line by line to ``ostyles1.html`` in ``wdir`` and the
    copy is then moved over ``file``.  While copying:

    * any line containing ``<![CDATA[``, ``]]>``, ``/*`` or ``*/`` is dropped;
    * a line containing ``<?xml`` is replaced by an XML declaration followed
      by the XHTML 1.1 DOCTYPE when ``options.EPUB_VERSION`` is ``'EPUB2'``,
      or by ``<!DOCTYPE html>`` when it is ``'EPUB3'``;
    * for any other ``options.EPUB_VERSION`` value the ``<?xml`` line is
      copied unchanged, like every remaining line.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the XHTML file to rewrite in place.

    Returns:
        0
    """
    xml_declaration = '<?xml version="1.0" encoding="utf-8"?>\n'
    headers = {
        'EPUB2': xml_declaration +
                 '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"\n' +
                 '  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">\n\n',
        'EPUB3': xml_declaration + '<!DOCTYPE html>\n\n',
    }
    skip_markers = ('<![CDATA[', ']]>', '/*', '*/')

    output = os.path.join(wdir, 'ostyles1.html')

    # Both handles live in one 'with' so neither leaks if copying fails.
    with open(file, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if any(marker in line for marker in skip_markers):
                continue

            if '<?xml' in line:
                header = headers.get(options.EPUB_VERSION)
                if header is not None:
                    outfp.write(header)
                    continue

            outfp.write(line)

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)
    
def removeHTMLStylespaces(wdir, file):
    """Drop blank lines and stray ``;`` lines inside ``<style>`` blocks.

    ``file`` (joined onto ``wdir``) is copied to ``remove_style_spaces.html``
    in ``wdir`` and the copy is then moved over the original.  Between a
    line containing ``<style`` and the next line containing ``</style>``,
    lines that are empty or that start with ``;`` (ignoring surrounding
    whitespace) are removed.  Everything else is copied unchanged.

    Args:
        wdir: working directory holding the file and the temporary copy.
        file: name of the HTML file, resolved against ``wdir``.

    Returns:
        0
    """
    output = os.path.join(wdir, 'remove_style_spaces.html')
    path = os.path.join(wdir, file)

    in_style = False
    with open(path, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            if in_style:
                if '</style>' in line:
                    in_style = False
                elif line.strip() == '' or line.strip().startswith(';'):
                    continue
            elif '<style' in line and '</style>' not in line:
                # A block that opens and closes on the same line must not
                # switch on block mode, otherwise blank lines in the rest
                # of the document would be swallowed.
                in_style = True
            outfp.write(line)

    # The former trailing 'outfp.write(line)' wrote the last line a second
    # time (and raised NameError on an empty file); it is gone on purpose.
    os.replace(output, path)
    return(0)
    
def fixScraggyHTML(wdir, file):
    """Leave exactly one ``</html>`` at the end of the file.

    ``file`` (joined onto ``wdir``) is copied to ``fix_html.html`` in
    ``wdir`` without any line that contains ``</html>``; a single
    ``</html>`` (no trailing newline) is appended and the copy is moved
    over the original.

    Note that a line is dropped as a whole, so other markup sharing a line
    with ``</html>`` is removed as well.

    Args:
        wdir: working directory holding the file and the temporary copy.
        file: name of the HTML file, resolved against ``wdir``.

    Returns:
        0
    """
    output = os.path.join(wdir, 'fix_html.html')
    path = os.path.join(wdir, file)

    # Both handles live in one 'with' so neither leaks if copying fails.
    with open(path, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        outfp.writelines(line for line in infp if '</html>' not in line)
        outfp.write('</html>')

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, path)
    return(0)
        
 
def removeCSSSpacing(wdir, css):
    """Strip blank lines, stray ``;`` lines and indentation from a CSS file.

    Returns 0 immediately when ``options.MOVE_CSS`` is False.  Otherwise
    ``css`` (joined onto ``wdir``) is copied to ``fix_spaces.css`` in
    ``wdir``: lines that are empty or start with ``;`` after stripping are
    dropped, every other line is written stripped of surrounding whitespace
    and terminated with a newline.  The copy is then moved over the
    original.

    Args:
        wdir: working directory holding the stylesheet and the temporary copy.
        css: name of the stylesheet, resolved against ``wdir``.

    Returns:
        0 in every case.
    """
    if options.MOVE_CSS == False:
        return(0)

    output = os.path.join(wdir, 'fix_spaces.css')
    css_path = os.path.join(wdir, css)

    # Both handles live in one 'with' so neither leaks if copying fails.
    with open(css_path, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:
            stripped = line.strip()
            if stripped == '' or stripped.startswith(';'):
                continue
            outfp.write(stripped + '\n')

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, css_path)
    return(0)
    
def prettifyCode(wdir, file):
    """Re-indent an XHTML file line by line.

    ``file`` (joined onto ``wdir``) is copied to
    ``remove_style_spaces.html`` in ``wdir`` with the layout below and the
    copy is then moved over the original:

    * lines containing ``</body>`` or ``</html>`` are dropped; one
      ``</body>`` / ``</html>`` pair is appended at the very end;
    * a line containing ``<html`` is stripped and preceded by an empty line;
    * the first line containing ``<body`` is stripped and surrounded by
      empty lines; everything after it counts as body content;
    * before the body, ``<title``, ``<meta`` and ``<link`` lines get a
      two-space indent, all other lines are just stripped;
    * inside the body, block-level lines (div, ol, ul, section, p, h1-h6,
      blockquote, pre) get a two-space indent and a following empty line,
      ``<svg`` lines get a four-space indent and a following empty line,
      and every other line is copied unchanged.

    Args:
        wdir: working directory holding the file and the temporary copy.
        file: name of the XHTML file, resolved against ``wdir``.

    Returns:
        0
    """
    head_tags = ('<title', '<meta', '<link')
    # The paragraph test used to be the bare letter 'p', which matched
    # almost every line (and kept '<svg' lines from ever reaching their own
    # branch); it now matches the tag itself.
    block_tags = ('<div', '</div>', '<ol', '</ol>', '<ul', '</ul>',
                  '<section', '</section>',
                  '<p', '</p>',
                  '<h1', '<h2', '<h3', '<h4', '<h5', '<h6',
                  '<blockquote', '</blockquote>', '<pre', '</pre>')

    body_flag = False
    path = os.path.join(wdir, file)
    output = os.path.join(wdir, 'remove_style_spaces.html')

    # Both handles live in one 'with' so neither leaks if copying fails.
    with open(path, 'rt', encoding='utf-8') as infp, \
         open(output, 'wt', encoding='utf-8') as outfp:
        for line in infp:

            # closing tags are re-added once, after the loop
            if '</body>' in line or '</html>' in line:
                continue

            if '<html' in line:
                outfp.write('\n' + line.strip() + '\n')

            elif '<body' in line and not body_flag:
                body_flag = True
                outfp.write('\n' + line.strip() + '\n\n')

            elif not body_flag:
                # head section: two-space indent for title/meta/link
                if any(tag in line for tag in head_tags):
                    outfp.write('  ' + line.strip() + '\n')
                else:
                    outfp.write(line.strip() + '\n')

            # second level indent(2 spaces)
            elif any(tag in line for tag in block_tags):
                outfp.write('  ' + line.strip() + '\n\n')

            # fourth level indent(4 spaces)
            elif '<svg' in line:
                outfp.write('    ' + line.strip() + '\n\n')

            else:
                outfp.write(line)

        outfp.write('\n</body>\n</html>\n')

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, path)
    return(0)
    
def svgAttributes2CamelCase(wdir, file):
    """Restore camel-case SVG attribute names and re-run ``prettifyCode``.

    Only the base name of ``file`` is used; it is resolved against ``wdir``.
    For every ``<svg>`` tag in the document, a ``preserveaspectratio``
    attribute is renamed to ``preserveAspectRatio`` and a ``viewbox``
    attribute to ``viewBox``; an ``xmlns:xlink`` attribute is added when
    missing.  The document is written to ``reformat.html`` in ``wdir``,
    moved over the original and finally passed through ``prettifyCode``.

    Args:
        wdir: working directory holding the file and the temporary copy.
        file: name (or path) of the XHTML file; only its base name is used.

    Returns:
        0
    """
    # The original joined wdir and file and then took the base name again;
    # taking the base name directly yields the same path.
    name = os.path.basename(file)
    path = os.path.join(wdir, name)
    output = os.path.join(wdir, 'reformat.html')

    # Context managers close both handles even if parsing fails.
    with open(path, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    renames = (('preserveaspectratio', 'preserveAspectRatio'),
               ('viewbox', 'viewBox'))

    for svg in soup.find_all('svg'):
        for lowered, camel in renames:
            if svg.has_attr(lowered):
                value = svg[lowered]
                del svg[lowered]
                svg[camel] = value

        if not svg.has_attr('xmlns:xlink'):
            svg['xmlns:xlink'] = "http://www.w3.org/1999/xlink"

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, path)

    # prettifyCode() joins its second argument onto wdir itself, so hand it
    # the bare name; passing the already-joined path duplicated a relative
    # wdir in the resulting path.
    prettifyCode(wdir, name)
    return(0)
                
def removeExcessPTags(wdir, file):
    """Unwrap ``<p>`` tags that merely wrap other ``<p>`` tags.

    The file is parsed with BeautifulSoup's ``html.parser``.  Every ``<p>``
    that contains another ``<p>`` and whose ``.string`` is empty or None is
    replaced by its own children.  The document is written to
    ``reformat.html`` in ``wdir`` and then moved over ``file``.

    Args:
        wdir: working directory used for the temporary output file.
        file: path of the HTML file to rewrite in place.

    Returns:
        0
    """
    output = os.path.join(wdir, 'reformat.html')

    # The context manager guarantees the input handle is closed before the
    # file is replaced below.
    with open(file, 'rt', encoding='utf-8') as infp:
        soup = BeautifulSoup(infp.read(), 'html.parser')

    # get rid of <p> tags wrapped
    # around <p> tags with strings
    for tag in soup.find_all('p'):
        if tag.find('p') and not tag.string:
            # unwrap() removes the tag itself from the tree, so clearing
            # its class/style attributes first had no effect on the output.
            tag.unwrap()

    with open(output, 'wt', encoding='utf-8') as outfp:
        outfp.write(str(soup))

    # Overwrite the original in one step instead of remove + rename.
    os.replace(output, file)
    return(0)
    
def removeGuideLinkIDs(bk):
    """Strip ``#fragment`` identifiers from every guide entry's href.

    Reads the guide via ``bk.getguide()``, which is iterated as
    ``(type, title, href)`` triples, cuts each href at the first ``#`` and
    stores the cleaned entries with a single ``bk.setguide()`` call.  An
    empty guide is left untouched.

    NOTE(review): assumes ``bk`` is Sigil's BookContainer, whose
    ``setguide()`` replaces the whole guide and expects a list of
    ``(type, title, href)`` tuples.  The previous code passed one bare
    tuple per entry -- confirm against the plugin API.

    Args:
        bk: book container object providing ``getguide()`` / ``setguide()``.

    Returns:
        0
    """
    cleaned = [(kind, title, href.split('#')[0])
               for kind, title, href in bk.getguide()]

    # Keep the previous no-op behaviour when there is nothing to rewrite.
    if cleaned:
        bk.setguide(cleaned)

    return(0)
   