#----------------------------------------------------------------------------
# hTranslateOneStep.py
#
# works on html file extracted from docx file
#----------------------------------------------------------------------------
import os
import sys
import re
#import shutil
from pathlib import Path
from PyQt5.QtCore import pyqtSignal,  QObject
#from subprocess import Popen
#from PyQt5.QtWidgets import (QApplication, QMainWindow, QPushButton,  QHBoxLayout, QMessageBox, 
#                            QVBoxLayout, QFormLayout,  QFrame,  QLabel,  QLineEdit,  QFileDialog, 
#                            QFontDialog)
#from PyQt5.QtGui import QFont, QFontInfo
from html.parser import HTMLParser
import win32com.client as win32

import hFileClassE as hFc
import hConstant as hC
#import hSigilBkUtil as hBk
import hMiscClass as hMc

"""
class ImagesInDocx():
    def __init__(self, bk, workingDir, htmFileFullPath):
        '''
            When class HtmFile extracts htm from a docx file, if the docx contains images, a directory
            is created in the working directory to save all of them.
            If the docx file is named Document.docx the images directory is named Document_files.
        ''' 
        self.bk= bk
        self.bkUtil= hBk.BkUtil()
        self.workingDir= workingDir
        self.htmFileFullPath= htmFileFullPath
        self.baseName= os.path.basename(htmFileFullPath)
        self.onlyName= Path(htmFileFullPath).stem
        self.imgDir= self.workingDir+ self.onlyName+ '_files/'
    
    def listExistingImgFiles(self):
        '''
            This function returns the names of the image files found in the images directory
            (result of a previous extraction), or an empty list if the directory does not exist.
        '''         
        imgFileLst= []
        if os.path.exists(self.imgDir):
            imgFileLst= os.listdir(self.imgDir)
        return imgFileLst
    
    def deleteExistingImgFiles(self):
        imgFileLst= self.listExistingImgFiles()
        if imgFileLst:
             shutil.rmtree(self.imgDir)             
#            for file in imgFileLst:
#                os.remove(self.imgDir+ '/'+ file)
"""

"""
    def uploadExistingImgFiles(self):
            # upload images, if any
            imgFileLst= self.listExistingImgFiles()
            if imgFileLst:
                if not hC.DEBUG:
                    for n, file in enumerate(imgFileLst):
                        imgFilePath= self.imgDir+ file
#                        print('imgFilePath ', imgFilePath)
                        self.loadImageInWordFileToSigil(imgFilePath)
                    # n. immagini caricate
#                    nImg= str(n+ 1)
#                    imgMsg= ' and '+ nImg+ ' images'                        
                        
                        

    def loadImageInWordFileToSigil(self, imgFilePath):
        oImageFile= hFc.File()
        oImageFile.setFileProperties(imgFilePath)
        manifestId= self.onlyName+ '_'+ oImageFile.baseName
        fileContent= oImageFile.read()
        mimeType= oImageFile.mime
        loadIntoSpine= False
        askIfFileAlreadyIn= False
                
#        print(f"manifestId {manifestId}")        
        #
        self.bkUtil.bkLoadFileIntoSigil(self.bk, manifestId, fileContent, mimeType, loadIntoSpine, askIfFileAlreadyIn)
"""

class MyHTMLParser(HTMLParser):
    '''
        Processes the htm file extracted from a docx file in order to have htm code suited
        to be loaded into Sigil.

        The rewritten document is accumulated in self.parsedContent. self.newChunk holds the
        markup/text collected since the last flush; it is flushed into self.parsedContent when
        the next start tag or end tag is met.
    '''
    def __init__(self, translateFontFamily, translateFontSize):
        '''
            translateFontFamily, translateFontSize: stored on the instance; they are not used
            by the parsing methods of this class.
        '''
        super().__init__()
        self.translateFontFamily= translateFontFamily
        self.translateFontSize= translateFontSize        

        self.endTable= False
        self.eq= '='
        self.sp= ' '
        self.newChunk= ''               # text collected since the last flush
        self.parsedContent= ''          # rewritten document
        self.specialLst=['html',  'head',  'meta']      # tags flushed as soon as they are opened
        self.styleTagFlag= False        # True while the content of a <style> tag is expected
        self.pTagStdFontTag= False      # True when the last <p class=...> met was MsoNormal
        
    def handle_starttag(self, tag, attrs):
        '''
            Flushes the pending chunk and starts a new one with the rewritten start tag:
            - <meta> tags are dropped
            - every line-height in a style attribute is set to "normal"
            - <p class="MsoNormal"> becomes <p class="StdFont">
            - double quotes inside attribute values become single quotes
            - <img> and <br> are written as self-closing tags
            - <html> is replaced by hC.xhtmlHeader
        '''
        self.parsedContent= self.parsedContent+ self.newChunk
        # the pending chunk has been flushed: clear it right away, otherwise the early return
        # for <meta> would leave it in place and it would be written a second time
        self.newChunk= ''
        if tag== 'meta':
            return
        self.newChunk= '<' + tag
        for name, value in attrs:
            if value is None:
                value= ''
            # adjust line-height to "normal"
            # (every style item gets a trailing ';', also the last one)
            if name== 'style':
                styleItemLst= []
                for item in value.split(';'):
                    if 'line-height:' in item:
                        item= item.split(':')[0]+ ':'+ "normal"
                    styleItemLst.append(item+ ';')
                value= ''.join(styleItemLst)
                
            # adjust p tag to avoid blank line at the end of a paragraph
            if tag== 'p' and name== 'class':
                if value== 'MsoNormal':
                    value= 'StdFont'        # p.StdFont defined in hConstant.py
                    self.pTagStdFontTag= True
                else:
                    self.pTagStdFontTag= False

            # the attribute value is written between double quotes
            if '"' in value:
                value= value.replace('"',  "'")

            self.newChunk= self.newChunk+ self.sp+ name+ self.eq+ '"'+ value+ '"'
            
        # img and br have no end tag: add / at the end
        if tag in ('img', 'br'):
            self.newChunk= self.newChunk+ '/'
            
        # set flag for tag <style> used in handle_data
        if tag== 'style':
            self.styleTagFlag= True
            
        self.newChunk= self.newChunk+ '>'
        
        if tag in self.specialLst:
            if tag== 'html':
                self.newChunk= hC.xhtmlHeader
            self.parsedContent= self.parsedContent+ self.newChunk
            self.newChunk= ''
            
            
    def handle_endtag(self, tag):
        '''
            Appends the end tag to the pending chunk and flushes it. </br> is ignored
            because <br> has already been written as a self-closing tag.
        '''
        if tag== 'style':
            # an empty <style></style> produces no data: drop the flag here so that the
            # next text chunk is not processed as style content
            self.styleTagFlag= False
        endTag= '</' +tag + '>'
        if endTag != '</br>':
            self.newChunk= self.newChunk+ endTag
            self.parsedContent= self.parsedContent+ self.newChunk
            self.newChunk= ''
        
        
    def handle_data(self, data):
        '''
            Appends text to the pending chunk after:
            - replacing a special character as described by hC.htmlCharDict
            - replacing non-breaking spaces with <br/> when the last <p class=...> was MsoNormal
            - for the content of <style>: keeping what is between <!-- and -->, appending
              hC.styleData and removing the comment markers
        '''
        hC.htmlCharLst= list(hC.htmlCharDict.keys())
        # NOTE(review): the loop stops at the first character found, so only one kind of
        # special character is replaced in each text chunk - confirm this is intended
        for char in hC.htmlCharLst:
            if char in data:
                data= data.replace(char, hC.htmlCharDict[char])
                break
       
        nbsp= '\u00a0'          # 'spaces' at the beginning of the first line of a dotted list
        if self.pTagStdFontTag and nbsp in data:
            data= data.replace(nbsp, "<br/>")      # blank line if tag p class=MsoNormal
            
        if self.styleTagFlag:
            searchStart= '<!--'
            searchEnd= '-->'
            start= data.find(searchStart)
            end= data.find(searchEnd)
            # a missing marker must not shift the slice: keep the content from its
            # beginning / up to its end
            if start== -1:
                start= 0
            if end== -1:
                sliceEnd= len(data)
            else:
                sliceEnd= end+ len(searchEnd)
            data= data[start:sliceEnd]+ hC.styleData
            #
            data= data.replace(searchStart, '')
            data= data.replace(searchEnd, '')
            self.styleTagFlag= False
        #
        self.newChunk= self.newChunk+ data
        
        
class HtmFile(hFc.File):
    '''
        htm file (filtered web page) extracted from a range of pages of a docx file by
        driving Word through COM automation.
        The result of the extraction is sent with createdHtmFileSignal.
    '''
    createdHtmFileSignal= pyqtSignal(str)
    def __init__(self, docxFile,  htmFile,  startPage,  endPage,  workingDirPath):
        '''
            docxFile: docx file the pages are taken from
            htmFile: htm file to be written by Word
            startPage, endPage: first and last page of the range to extract
            workingDirPath: directory searched for the images directory of a previous extraction
        '''
        super().__init__(htmFile)
        super().setFileProperties(htmFile)
        #
        self.docxFile= docxFile
        self.htmFile= htmFile
        self.startPage= startPage
        self.endPage= endPage
        self.workingDirPath= workingDirPath
        #   https://learn.microsoft.com/en-us/office/vba/api/word.wdsaveformat
        self.wdFormatFilteredHTML= 10
        self.wdGoToPage= 1
        self.wdGoToAbsolute= 1
        self.wdPrintView= 3
        self.wdPasteDefault= 0
        #   https://learn.microsoft.com/en-us/office/vba/api/word.wdsaveoptions
        self.wdDoNotSaveChanges= 0

    def extract(self): 
        '''
        extracts htm file (filtered web page) from docx

        Emits createdHtmFileSignal with the content of the htm file or, if something goes
        wrong, with a string starting with '*' followed by the exception type, value and
        traceback object (one per line).
        '''
        wordApp= None
        try:
            #   create Word application
            wordApp = win32.Dispatch("Word.Application")
    #        wordApp.visible= True  
            # delete existing img files if any
            self.deleteExistingImgFiles(self.htmFile,  self.workingDirPath)
            #   open .docx file
            wordApp.Documents.Add(self.docxFile)
            #   print view to have page number
            wordApp.ActiveWindow.View.Type= self.wdPrintView   

            #   goto start & end page. Define range of pages
            wordApp.Selection.GoTo (What=self.wdGoToPage, Which=self.wdGoToAbsolute, Count=self.startPage)
            rgePages = wordApp.Selection.Range
            wordApp.Selection.GoTo (What=self.wdGoToPage, Which=self.wdGoToAbsolute, Count=self.endPage)
            rgePages.End = wordApp.Selection.Bookmarks("\\Page").Range.End
            #   select range
            rgePages.Select()
            #   selection copy
            wordApp.Selection.Copy()
            #   open empty docx to paste selection
            wordApp.Documents.Add()        
            #   paste selection in empty document
            wordApp.Selection.PasteAndFormat (self.wdPasteDefault)
            #   save empty document as htm
            wordApp.ActiveDocument.SaveAs2(self.htmFile,  FileFormat= self.wdFormatFilteredHTML)
            #   close all .docx files 
            for doc in wordApp.Documents:
                doc.Close()
            # quit from Word.Application    
            wordApp.Quit()
            wordApp= None       # Word has been shut down: nothing left to clean up
            # reads content
            fileContent= self.read()
        except Exception:
            excType, excValue, excTraceback= sys.exc_info()
            fileContent= '*'+ str(excType)+ '\n'+ str(excValue)+ '\n'+ str(excTraceback)
            if wordApp is not None:
                # do not leave a hidden Word process running after a failure.
                # Best effort: the error to report is already in fileContent
                try:
                    wordApp.Quit(SaveChanges= self.wdDoNotSaveChanges)
                except Exception:
                    pass
        self.createdHtmFileSignal.emit(fileContent)        

    def deleteExistingImgFiles(self,  htmFile,  workingDir):
        '''
            When class HtmFile extracts htm from a docx file, if in the docx there are images, a
            directory is created in the working directory to save all images.
            If the htm file is named Document.htm the images directory is named Document_file or
            Document_files (both names are handled, as in Translate.preProcessing).
            This function deletes the existing images (result of a previous extraction) from that
            directory before the class HtmFile extracts htm from docx. The directory itself and
            any sub-directory are left in place.
        '''
        stem= Path(htmFile).stem
        for suffix in ('_file',  '_files'):
            # os.path.join works with or without a trailing separator in workingDir
            imgDirPath= os.path.join(workingDir,  stem+ suffix)
            if not os.path.isdir(imgDirPath):
                continue
            for fileName in os.listdir(imgDirPath):
                filePath= os.path.join(imgDirPath,  fileName)
                if os.path.isfile(filePath):
                    os.remove(filePath)

        
class Translate(QObject): 
    '''
        Extracts a range of pages of a docx file as htm (class HtmFile), rewrites the htm to
        adapt it to Sigil (preProcessing + MyHTMLParser), forces font size and font family,
        writes the result to the htm file and sends it with htmFileReadySignal.
    '''
    htmFileReadySignal= pyqtSignal(str)

    def __init__(self, paramLst,  bk, translateFontFamily, translateFontSize, searchChapterTitle):
        '''
            paramLst: [docx file, htm file path, start page, end page, working dir path]
            bk: stored on the instance, not used by the methods of this class
            translateFontFamily, translateFontSize: font family and size forced in the htm
            searchChapterTitle: text used to locate the heading tag of the chapter title
        '''
        super().__init__()
        self.bk= bk
        self.paramLst= paramLst
        self.error= False
        # all parameters are string
        self.startDocxFile= paramLst[0]
        self.htmFilePath= paramLst[1]
        self.startPage= paramLst[2]
        self.endPage= paramLst[3]
        self.workingDirPath= paramLst[4]
        self.translateFontFamily= translateFontFamily
        self.translateFontSize= translateFontSize
        self.searchChapterTitle= searchChapterTitle 

    def extractHtmFile(self):
        '''
            Creates the HtmFile, connects its signal to htmFileReady and starts the extraction.
        '''
        self.htmFile= HtmFile(self.startDocxFile, self.htmFilePath, self.startPage, self.endPage, self.workingDirPath)
        self.htmFile.createdHtmFileSignal.connect(self.htmFileReady)        
        self.htmFile.extract()

    def htmFileReady(self, fileContent):      
        '''
            Receives the result of HtmFile.extract.
            A content starting with '*' is an error message: self.error is set and the message
            is forwarded as it is. Otherwise the content is processed, written to the htm file
            and sent with htmFileReadySignal.
        '''
        if fileContent[0:1]== '*':
            self.error= True
            # nothing has been processed: forward the error text to pluginMenu
            self.htmFileReadySignal.emit(fileContent)
            return
        # file content processing
        fileContent= self.preProcessing(fileContent)
        #
        parser = MyHTMLParser(self.translateFontFamily, self.translateFontSize)
        parser.feed(fileContent)
        # 
        fileContentStr= parser.parsedContent
        #
        # change font-size
        # the value cannot contain ; so every declaration is matched on its own and the
        # text between two declarations is never swallowed
        fontSizeNew= 'font-size:'+ str(self.translateFontSize)+ '.0pt;'
        fileContentStr= re.sub(r'font-size:[^;{}<>"\']*pt;', lambda match: fontSizeNew, fileContentStr)
        #
        # change font-family. There are different occurrences of font-family
        # font-family:"Edwardian Script ITC";   double quotes
        # font-family:Symbol;                   no quotes
        # font-family:'Calibri',sans-serif;     single quotes
        # the new family is written with the same kind of quotes of the replaced one
        fontFamilyNew= str(self.translateFontFamily)
        def replaceFontFamily(match):
            declaration= match.group(0)
            quotes= ''
            if '"' in declaration:
                quotes= '"'
            elif "'" in declaration:
                quotes= "'" 
            return 'font-family:'+ quotes+ fontFamilyNew+ quotes+ ';'
        # a callable is used so that the font names are never interpreted as a regular expression
        fileContentStr= re.sub(r'font-family:[^;{}<>]*;', replaceFontFamily, fileContentStr)
        #
        fileContentByte= fileContentStr.encode(encoding='UTF-8')
        # writes htm file
        htmEndFile= hFc.File()
        htmEndFile.setFileProperties(self.htmFilePath)
        htmEndFile.writeByte(fileContentByte)
        # send signal to pluginMenu
        self.htmFileReadySignal.emit(fileContentStr)
    
    def preProcessing(self, fileContent):
        '''
            pre-elaboration of the htm file extracted from docx file:
            - the heading tag of the chapter title becomes <h3>
            - the first <h3> is cleaned (inner tags and attributes removed) and rewritten as
              <h3>number<br/>title</h3><br/>
            - the src of the images is adapted to the Sigil structure (../Images/)
            Returns the modified content.
        '''
        (hTagToChangeStart, hTagToChangeStartEnd)= self.searchHTabChapterTitle(fileContent)
        # change <h1> (or the heading tag found) to <h3>
        startSearch= '<h3'
        endSearch= '</h3>'
        fileContent= fileContent.replace(hTagToChangeStart, startSearch)
        fileContent= fileContent.replace(hTagToChangeStartEnd, endSearch)        
        # chapter title and chapter number
        start= fileContent.find(startSearch)       
        end= -1
        if start > -1:
            end= fileContent.find(endSearch, start)
        # without the closing </h3> there is no heading to work on
        if end > -1:
            chapterTitleAndNumber= fileContent[start+len(startSearch):end]
            if chapterTitleAndNumber != '':
                # removes tags between <h3> and </h3> if any
                #  <h3><span lang=EN-US>5 – chapter 5</span></h3>
                #  <h3><span lang=EN-US>VI — Chapter 6<br clear=all style='page-break-before:always'></span></h3>
                betweenH3= fileContent[start:end+len(endSearch)]
                # DOTALL: a tag can be split over more than one line
                tagLst= re.findall('<(.*?)>',  betweenH3,  re.DOTALL)
                for tag in tagLst:
                    if 'h3' not in tag:
                        chapterTitleAndNumber= chapterTitleAndNumber.replace('<'+ tag+ '>', '')
                #
                fileContent= fileContent.replace(betweenH3, startSearch+ chapterTitleAndNumber+ endSearch)
                #
                # case:
                # <h3 align=left style='text-align:left'>4.8 — Mini editor </h3>
                # NOTE(review): presumably search() returns the text found between the two
                # filters (here the attributes of <h3) - verify against hMiscClass
                innnerH3= hMc.BetweenFilters(fileContent, '<h3',  '>' )
                toDelete= innnerH3.search()
                if toDelete:
                    fileContent= fileContent.replace(toDelete, '')
                searchTitleAndNumber= hMc.BetweenFilters(fileContent, '<h3>',  '</h3>' )    
                # an empty string is used when nothing is found
                chapterTitleAndNumber= searchTitleAndNumber.search() or ''
                #    
                chapterNumber= ''
                chapterTitle = chapterTitleAndNumber        # works for PROLOGUE and EPILOGUE (no chapter number)
                # split string with different length dashes
                # to have dashes values for Python use ord()
                # ord("–") ord('-') ord('—')
                splitCharLst= [chr(8211), chr(45), chr(8212)]
                for char in splitCharLst:
                    if char in chapterTitleAndNumber:
                        # split on the first dash only: other dashes belong to the title
                        (number, _, title)= chapterTitleAndNumber.partition(char)
                        chapterNumber= number.replace(' ',  '')
                        chapterTitle= title.strip()  # remove leading and trailing spaces
                        break
                toReplace= '<h3>'+ chapterTitleAndNumber+ '</h3>'
                replacedBy=  '<h3>'+ chapterNumber+ '<br/>'+ chapterTitle+ '</h3><br/>'
                fileContent= fileContent.replace(toReplace, replacedBy)
        
        # adjusts image tags (if any) to adapt them to Sigil structure   
        srcLst= re.findall('<img[^>]*src="([^"]+)"[^>]*>',  fileContent)
        for src in srcLst:
            splitSrc= src.split('/')        # [0]= dir  [1]= img file name
            if len(splitSrc) < 2:
                continue                    # no directory in src: nothing to adapt
            imgDir= splitSrc[0]
            # dir name ends with _files or with _file 
            if imgDir.endswith('_files'):
                imagePrefix= imgDir[:-len('_files')]
            elif imgDir.endswith('_file'):
                imagePrefix= imgDir[:-len('_file')]
            else:
                continue                    # not an images directory created by the extraction
            fileContent= fileContent.replace(src, '../Images/'+ imagePrefix+ '_'+ splitSrc[1])
        
        return fileContent

    def searchHTabChapterTitle(self, fileContent):
        '''
            Searches the heading tag that closes after the chapter title (self.searchChapterTitle).
            Returns the tuple (start of the opening tag, end tag), for example ('<h2', '</h2>').
            ('<h1', '</h1>') is returned when the title is empty or not found, or when the
            first closing tag starting with </h after the title is not a heading h1-h6
            (for example </html>).
        '''
        defaultStart= '<h1'
        defaultEnd= '</h1>'
        if not self.searchChapterTitle:
            return (defaultStart, defaultEnd)
        start= fileContent.find(self.searchChapterTitle)    # search chapter title
        if start== -1:
            return (defaultStart, defaultEnd)
        start= start+ len(self.searchChapterTitle) 
        locSearch= '</h'
        hTagStart= fileContent.find(locSearch, start)       # search end tag
        if hTagStart== -1:
            return (defaultStart, defaultEnd)
        # the letter h and the character that follows it
        tag= fileContent[hTagStart+len(locSearch)-1:hTagStart+len(locSearch)+1]    
        if not re.fullmatch('h[1-6]', tag):
            # </html>, </head>, ...: replacing '<ht' or '<he' would corrupt the document
            return (defaultStart, defaultEnd)
        hTagToChangeStart= '<'+ tag
        hTagToChangeStartEnd= '</'+ tag+ '>'
        return (hTagToChangeStart, hTagToChangeStartEnd) 
