#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import re
import ctypes

import os

import tkinter
import tkinter.ttk as ttk
import tkinter.constants as tkinter_constants
import tkinter.filedialog as tkinter_filedialog
from tkinter import messagebox
from tkinter import *

from Quotations import cDetermineQuotes
from Chapter import Chapters
from ProcessHeaderTags  import cProcessHeaderTags

# Shared module state: both names are assigned in processHTML() and read by spell().
# NOTE: a `global` statement at module scope has no effect in Python; these two
# lines only document that the names exist.
global useHunspellDict
global bk2

#==================== FUNCTION FOR LOADING LIST OF HYPHENATED WORDS ADDED OCT 2018 =======================
def OpenHyphenExceptionFile(prefs):
	"""
	Load the user's list of permitted hyphenated words into the global keepHyphens.

	The file named by prefs['HyphenatedWordFileName'] is read as UTF-8, one
	hyphenated word per line. Each line (without its trailing newline) becomes
	a key of keepHyphens with the value True. keepHyphens is always reset to an
	empty dictionary first, so it is left empty when the file cannot be opened.

	Called by processHTML(); the dictionary is consulted by IsHyphenated().

	Parameters:
		prefs: mapping that contains the key 'HyphenatedWordFileName'.

	Returns:
		An empty tuple (kept so that existing callers see the same value).
	"""
	global keepHyphens #Dictionary that will contain allowed hyphenated words

	keepHyphens = {}	#Start from an empty list of allowed hyphenated words
	try:
		f = open(prefs['HyphenatedWordFileName'], 'r', encoding='utf-8')
	except IOError:
		#A missing list is not fatal: every hyphenated word is then checked against the spelling dictionary
		print("Cannot read file with allowed hyphenated words")
		return ()

	#The with-block closes the file even if reading fails part way through
	with f:
		for line in f:
			keepHyphens[line.rstrip('\n')] = True
	return ()

#==================== FUNCTIONS FOR REMOVING EXCESS NEWLINES ADDED JUL 2018 =======================
#These functions remove newlines that occur between paragraph <p..>...</p> and
# heading <h..>...</h> tags
#text between <pre...>...</pre> tags is not affected.
def RemoveNewLines(match):
	"""
	Callback for re.sub: collapse every line break inside the matched text.

	Each run of newline / carriage-return characters, together with any
	whitespace on either side of it, is replaced by one space.
	Returns the matched text with those runs collapsed.
	"""
	lineBreakRun = re.compile(r'[\s]*[\n\r]+[\s]*')
	return lineBreakRun.sub(' ', match.group())

def FindNewLines(match):
	"""
	Callback for re.sub: strip line breaks inside paragraph and heading elements.

	Within the matched text, every span that starts with "<p " or "<h" plus a
	digit and runs lazily up to the following "</p" or "</h" is passed to
	RemoveNewLines(). (The closing class is written [p|h], so "</|" would
	also end a span.)
	Returns the matched text with those spans rewritten.
	"""
	blockElement = r'(?s)((<p[ ])|(<h\d))(.*?)</[p|h]'
	return re.sub(blockElement, RemoveNewLines, match.group())
	
#==================== END FUNCTIONS FOR REMOVING EXCESS NEWLINES =======================
def CorrectColGroup(m):
	"""
	Callback for re.sub: wrap a run of <col ...> tags in <colgroup>...</colgroup>.

	m.group(0) is the whole matched run (including leading whitespace) and
	m.group(2) is the run of "<col" lines on its own. A match that already
	contains "<colgroup" is returned untouched; otherwise the run, with its
	trailing whitespace removed, is returned between a <colgroup> line and a
	</colgroup> line.
	"""
	wholeMatch = m.group(0)
	if '<colgroup' in wholeMatch:
		return wholeMatch

	columnTags = m.group(2).rstrip()	#Drop the new line at the end of the run
	return ''.join(('\n<colgroup>\n', columnTags, '\n</colgroup>\n'))

def FormatHTML(m):
	"""
	Callback for re.sub: start a tag on a fresh line, indented by tag kind.

	m.group(2) holds the tag text. The tag is returned preceded by two
	newlines and then:
	  - no indent for </div> and for tags starting with <div
	  - one space for </table> and for tags starting with <table
	  - three spaces for every other tag
	"""
	tag = m.group(2)
	if tag == "</div>" or tag.startswith("<div"):
		indent = ""
	elif tag == "</table>" or tag.startswith("<table"):
		indent = " "
	else:
		indent = "   "
	return "\n\n" + indent + tag

def AutoCorrectSpelling(prefs):
	"""
	Apply the user's list of automatic spelling corrections to the global html.

	The file named by prefs['WordListDirectory'] is read as UTF-8. Each line
	has the form  misspelt|correct . For every line the misspelt form, wrapped
	in regex word-boundary anchors, is replaced by the correct form through
	CorrectText(). Straight apostrophes in the correct form are first changed
	to cDetermineQuotes.Apostrophe so they agree with the rest of the text.

	The misspelt form is used as a regular expression, exactly as written in
	the file.

	Lines that do not consist of exactly two "|"-separated fields, or whose
	misspelt field is empty, are skipped (blank lines silently, others with a
	printed notice) instead of aborting the whole run.

	Parameters:
		prefs: mapping that contains the key 'WordListDirectory'.
	"""
	try:
		f = open(prefs['WordListDirectory'], 'r', encoding="utf-8")
	except IOError:
		print("Error opening the file containing the WordListDirectory")
		return

	#The with-block closes the file even if a correction raises
	with f:
		for line in f:
			fields = line.split("|")
			if len(fields) != 2 or fields[0] == "":
				#An empty misspelt field would match at every word boundary
				if line.strip():
					print("Ignored malformed line in word list: " + line.rstrip("\n"))
				continue
			mispelt, correctSpell = fields	#the word in its mispelt form and corrected form
			correctSpell = correctSpell.strip("\n")	#strip newline from correct version of word
			correctSpell = correctSpell.replace("'", cDetermineQuotes.Apostrophe)	#use the apostrophe style of the text
			mispeltWord = r"\b" + mispelt + r"\b"
			CorrectText("Changed " + mispelt + " to " + correctSpell, mispeltWord, correctSpell)

def prepDictionary(bk, prefs):
	"""
	Prepare the spelling dictionary selected in the preferences.

	When prefs['useHunspellDict'] is "Yes", the directories returned by
	bk.get_dictionary_dirs() are searched for the .aff/.dic pair that belongs
	to prefs['HunspellLanguage'] and the pair is loaded with
	bk.hspell.loadDictionary(). Otherwise the user dictionary file named by
	prefs['DictFile'] is loaded through ReadDictionary() (its path is stored
	in the global dictPath).

	Parameters:
		bk: object providing get_dictionary_dirs() and an hspell attribute.
		prefs: mapping with the keys 'useHunspellDict' and either
			'HunspellLanguage' or 'DictFile'.

	Returns:
		True when a dictionary has been loaded, False in every other case
		(unknown language, dictionary files not found, hspell unavailable,
		loading failed, or user dictionary file missing).
	"""
	global dictPath

	if prefs['useHunspellDict'] != "Yes":
		# Code for setting up a non Hunspell dictionary
		dictPath = prefs['DictFile']
		if os.path.isfile(dictPath):
			ReadDictionary()
			return True
		print("Cannot find the user dictionary file: " + str(dictPath))
		return False

	#Code for setting up a Hunspell dictionary
	#File-name stem of the dictionary for each language offered in the preferences
	hunspellStems = {
		"English (UK)": "en_GB",
		"English (USA)": "en_US",
		"French": "fr",
		"German": "de_DE",
		"Spanish": "es",
		"Greek*": "el_GR",
	}
	notFoundMessage = "===========================================\nCannot find the dictionary for your language.\nDictionaries for English (UK), English (USA), French, German and Spanish are installed with Sigil.\nIf you need a dictionary for a different language please install an appropriate Hunspell dictionary\nin the folder hunspell_dictionaries in the Sigil program folder.\n==========================================="

	# get a list of all of the locations that Sigil knows about
	# where Hunspell dictionaries are installed
	dic_dirs = bk.get_dictionary_dirs()

	dictStem = hunspellStems.get(prefs['HunspellLanguage'])
	if dictStem is None:
		#Unknown language: report it explicitly rather than through a swallowed NameError
		print(notFoundMessage)
		return False

	try:
		# check each location to find a pre-installed dictionary in the desired language
		for adir in dic_dirs:
			afffile = os.path.join(adir, dictStem + ".aff")
			dicfile = os.path.join(adir, dictStem + ".dic")
			if os.path.exists(afffile) and os.path.exists(dicfile):
				if bk.hspell is not None:
					bk.hspell.loadDictionary(afffile, dicfile)
					return True
				break
	except Exception:
		print(notFoundMessage)
		return False

	#Nothing was loaded: say so and return the documented False instead of None
	print(notFoundMessage)
	return False

def SplitQuotes(m):
	"""
	Callback for re.sub: put two adjacent quotations into separate paragraphs.

	Called by FixCommonErrors() with the pattern
	<p(.*?)>(.*?)(closing quote)(space)?(opening quote)(.*?)</p>
	so group 1 is the paragraph's attributes, group 2 the text up to the
	first quotation's closing mark (group 3), group 5 the opening mark of the
	second quotation and group 6 the rest of the paragraph.

	The match is returned unchanged when the quote marks lie inside an
	unfinished tag (for example an attribute value) or when one quotation is
	nested in the other (double mark next to single mark). Otherwise the
	paragraph is split after group 3; inline tags still open at that point
	are closed in the first paragraph and re-opened in the second.
	"""
	wholeMatch = m.group()
	attrs, lead = m.group(1), m.group(2)
	closeQuote, openQuote = m.group(3), m.group(5)

	#A "<" later than the last ">" means the quote marks sit inside a tag
	if lead.rfind('>') < lead.rfind('<'):
		return wholeMatch

	#Need to ensure a quote within speechmarks is not split into separate lines
	doubleThenSingle = closeQuote in ('"', '“') and openQuote in ("'", "‘")
	singleThenDouble = closeQuote in ("'", "’") and openQuote in ('"', '”')
	if doubleThenSingle or singleThenDouble:
		print("Quote is within speech marks so exit")
		return wholeMatch

	#Tags without the leading < and ending >; matched open/close pairs are removed below
	pendingTags = re.findall(r"<(.*?)>", lead)
	pos = 0
	while pos < len(pendingTags):
		candidate = pendingTags[pos]
		if "/" not in candidate:
			#Any tag starting with "span" is closed by a plain /span
			wanted = "/span" if candidate.startswith("span") else "/" + candidate
			try:
				partner = pendingTags.index(wanted, pos + 1)
			except ValueError:
				partner = None
			if partner is not None:
				del pendingTags[partner]
				del pendingTags[pos]
				continue	#the next tag has moved into this position
		pos += 1

	reopenTags = "".join('<' + tag + '>' for tag in pendingTags)
	closeTags = "".join('</' + ('span' if 'span' in tag else tag) + '>' for tag in reversed(pendingTags))

	firstParagraph = "<p" + attrs + ">" + lead + closeQuote + closeTags + "</p>"
	secondParagraph = "<p>" + reopenTags + openQuote + m.group(6) + "</p>"
	return firstParagraph + "\n\n	" + secondParagraph

def checkQuotes(m):
	"""
	Callback for re.sub: turn two adjacent single curly quotes into one double quote.

	A match containing an opening single curly quote becomes an opening
	double curly quote; one containing a closing single curly quote becomes a
	closing double curly quote. Anything else (e.g. two straight quotes) is
	returned as matched. Called from FixCommonErrors().
	"""
	matched = m.group()
	#Opening quote is tested first, as in the original order of checks
	for single, double in (("‘", "“"), ("’", "”")):
		if single in matched:
			return double
	return matched

def CheckApostrophe(m):
	"""
	Callback for re.sub: repair contractions of "I" whose apostrophe was mis-read.

	Some words are misformed when the apostrophe is taken to be part of the
	word, e.g. "Tve" for "I've". m.group(2) is the misformed word; when it is
	one of the known forms the function returns
	cDetermineQuotes.OpenQ + "I" + cDetermineQuotes.Apostrophe + the ending.
	Any other match is returned unchanged.
	Called by a regular expression function (re.sub) in FixCommonErrors().
	"""
	#Misformed word -> letters that follow the apostrophe in the corrected word
	contractionEndings = {
		"IVe": "ve",
		"Tve": "ve",
		"Fve": "ve",
		"Fd": "d",
		"Td": "d",
		"Til": "ll",
		"Fm": "m",
		"Vm": "m",
		"Tm": "m",
	}
	ending = contractionEndings.get(m.group(2))
	if ending is None:
		return m.group()
	# NOTE(review): the opening quote is prepended whether or not group 1 matched
	# a quote mark - confirm this is intended.
	return cDetermineQuotes.OpenQ + "I" + cDetermineQuotes.Apostrophe + ending

def ReadDictionary():
	"""
	Load the user's word list into the global dictionary spellDict.

	The file whose path is held in the global dictPath is read as UTF-8, one
	word per line; each line (without its trailing newline) becomes a key of
	spellDict with the value True. spellDict is reset to an empty dictionary
	first, so it is left empty when the file cannot be opened; in that case a
	warning dialog is shown.

	Called by prepDictionary(). This dictionary may be used instead of a
	Hunspell dictionary.

	Returns:
		An empty tuple (kept so that existing callers see the same value).
	"""
	global spellDict
	global dictPath

	spellDict = {}	#Initialise dictionary
	try:
		f = open(dictPath, 'r', encoding='utf-8')
	except IOError:
		#showerror lives in tkinter.messagebox; Tk only accepts the bare icon name
		messagebox.showerror(title='WARNING', message='Cannot read dictionary file', detail='Please ensure you have a dictionary installed', icon='warning')
		return ()

	#The with-block closes the file even if reading fails part way through
	with f:
		for line in f:
			spellDict[line.rstrip('\n')] = True
	return ()

def spell(aword):
	"""
	Report whether aword is a known word.

	The word "II" and any word containing a digit are always reported as
	unknown, so that callers such as IsHyphenated() keep the hyphen in "I-I"
	and in numeric expressions. Otherwise the word is looked up with
	bk2.hspell.check() when the global useHunspellDict is "Yes", or in the
	global dictionary spellDict when it is not.

	Parameters:
		aword: the word to check.

	Returns:
		True if the word is found, otherwise False.
	"""
	global useHunspellDict
	global spellDict

	#First avoid some expressions from being processed
	if aword == "II" or re.search(r"\d", aword):
		return False

	if useHunspellDict == "Yes":
		#check() answers 1 for a word that is in the Hunspell dictionary
		return bk2.hspell.check(aword) == 1
	return aword in spellDict

def ProcessItalics():
	"""
	Tidy the <i>...</i> markup in the global html string.

	The substitutions below are applied to html in the order listed.
	Returns 0.
	"""
	global html

	italicFixes = (
		#Remove classes from <i> tags
		(r'<i class=.*?>', r'<i>'),
		#1. Insert space after </i> if this tag is followed by an alphanumeric
		#   (the class was written A-z, which also matched [ \ ] ^ _ and `)
		(r'</i>([a-zA-Z0-9])', r'</i> \1'),
		#2. Move punctuation outside </i> and remove space in front of it
		(r'[ ]?(,|\.|\!|,”|!”|\?”|\.”|”|"|,"|\!"|\?"|\."|’|\?)[ ]?</i>', r'</i>\1'),
		#3. Remove space (if present) following <i> and insert before <i>
		(r'(\.|!|,|;|:|([a-zA-Z0-9]))<i>[ ]?', r'\1 <i>'),
		#4. Put quotes before <i>
		(r'<i>(“|")', r'\1<i>'),
		#5. Replace </i> <i> with a space
		(r'</i>(,?[ ]?)<i>', r' '),
	)
	for pattern, replacement in italicFixes:
		html = re.sub(pattern, replacement, html)
	return (0)

def ReplaceHTMLCode():
	"""
	Replace HTML entities and stray Windows-1252 / special characters in the
	global html with the intended typographic characters.

	Every rule is applied through CorrectText(), in the order listed, so each
	replacement is counted under its label. Returns 0.
	"""
	entityFixes = (
		("Replaced HTML code (&mdash;)", '&mdash;', r'—'),
		("Replaced HTML code (&#8212;)", '&#8212;', r'—'),
		("Replaced HTML code (&ndash;)", '&ndash;', r"–"),		#nDash
		("Replaced HTML code (&#8211;)", '&#8211;', r"–"),		#nDash
		("Replaced HTML code (&nbsp;)", '&nbsp;', r' '),
		("Replaced HTML code (&#8216;)", '&#8216;', r'‘'),
		("Replaced HTML code (&#8217;)", '&#8217;', r'’'),
		("Replaced HTML code (&#8220;)", '&#8220;', r'“'),
		("Replaced HTML code (&#8221;)", '&#8221;', r'”'),
		("Replaced HTML code (&lsquo;)", '&lsquo;', r'‘'),
		("Replaced HTML code (&rsquo;)", '&rsquo;', r'’'),
		("Replaced HTML code (&ldquo;)", '&ldquo;', r'“'),
		("Replaced HTML code (&rdquo;)", '&rdquo;', r'”'),
		("Replaced HTML code (&#39;)", '&#39;', r"'"),
		("Replaced HTML code (&apos;)", '&apos;', r"'"),
		#Old ellipses replacement was "..." - retain the idea in case some eReaders cannot show the single character
		("Replaced HTML code (&#8230;)", r'[ ]?&#8230;', r"…"),	#Ellipsis
		("Replaced HTML code (\x85)", r'[ ]?\x85', r"…"),
		("Replaced HTML code (\x91)", r'[ ]?\x91', r"‘"),
		#0x92 is the RIGHT single quote in Windows-1252 (0x91 is the left one)
		("Replaced HTML code (\x92)", r'[ ]?\x92', r"’"),
		("Replaced HTML code (\x93)", r'[ ]?\x93', r'"'),
		("Replaced HTML code (\x95)", r'[ ]?\x95', r' '),
		("Replaced HTML code (\x97)", r'[ ]?\x97', r'—'),
		("Replaced HTML code (\xAD)", r'[ ]?\xAD', r''),
		("Replaced HTML code (\uFFFD)", r'[ ]?\uFFFD', r'’'),	#\x{FFFD} in SIGIL
		#Own label (spelled out, the character itself is invisible) so the count is not merged with U+FFFD
		("Replaced HTML code (\\u200B)", r'[ ]?\u200B', r''),	#\x{200B} in SIGIL
	)
	for label, pattern, replacement in entityFixes:
		CorrectText(label, pattern, replacement)

	return (0)

def FixFalseLineBreaks(allBreaks, prefs):
	"""
	Join paragraphs where a sentence has been split across a paragraph break.

	All changes are made to the global html through CorrectText().
	After running this, do a manual search/replace to find paragraphs that do
	not end with a punctuation mark using:
		FIND:    ([a-z])</p>\\s+<p >
		REPLACE: \\1 followed by a space

	Parameters:
		allBreaks: when 'Yes', breaks between a lower-case letter and an
			upper-case letter are also removed (and, if prefs['GreekLetters']
			is 'Yes', the equivalent breaks in Greek text).
		prefs: mapping; 'GreekLetters' is read only when allBreaks is 'Yes'.

	Returns 0.
	"""
	alwaysApplied = (
		#remove paragraph break if line ends with hyphen
		("Removed line break ending with hyphen:", r'-</p>\s+<p[ ][^>]*>', r'-'),
		#Remove line break when next line starts with lower case ADDED OCT2018
		("Removed line break when next line starts with lower case:", r'</p>\s+<p[^>]*>((<span[^>]*>)?((<[ubi]>)+)?[ ]?[a-z])', r' \1'),
		("Fixed false line breaks:", r'(|</(i|b|span)>)</p>\s+<p[ ][^>]*>[ ]?(((|<\2[^>]*>))[a-z])', r'\1 \3'),
		#Remove paragraph break if line ends with a word that normally ends with full stop
		("Fixed false line breaks:", r'(((Mr|Dr|Mrs|St)\.)|,)</p>\s+<p[ ][^>]*>[ ]?', r'\1 '),
	)
	for label, pattern, replacement in alwaysApplied:
		CorrectText(label, pattern, replacement)

	if allBreaks != 'Yes':
		return 0

	CorrectText("Fixed false line breaks:", r'([a-z])</p>\s+<p[ ][^>]*>([A-Z])', r'\1 \2')
	if prefs['GreekLetters'] == 'Yes':
		CorrectText("Fixed false line breaks:", r'([\u0370-\u03FF,\u1F00-\u1FFF,\'–’“”][</ib>]*)</p>\s+<p[ ][^>]*>([<ib>]*[\u0370-\u03FF,\u1F00-\u1FFF,\'–’“”])', r'\1 \2')
	return 0

def IsFixP(m):
	"""
	FIXES Π
	Callback for re.sub: decide whether a mis-read character group should be Π.

	Groups 1 and 3 are the text either side of the suspect characters
	(group 2). The three groups joined together are returned when that text
	is already in the dictionary, or when replacing group 2 by Π does not give
	a dictionary word either. Only when the joined text is unknown and the Π
	version is known is the Π version returned (and the change printed).
	"""
	before, suspect, after = m.group(1), m.group(2), m.group(3)
	asMatched = before + suspect + after
	withPi = before + "Π" + after

	if not spell(asMatched) and spell(withPi):
		print("FixP: ",asMatched, " changed to ", withPi)
		return withPi
	return asMatched

def IsFixE(m):
	"""
	FIXES έ
	Callback for re.sub: decide whether a ύ between two groups should be έ.

	Groups 1 and 2 are the text either side of the ύ. The text with ύ is
	returned when it is already in the dictionary, or when the έ version is
	not a dictionary word either. Only when the ύ version is unknown and the
	έ version is known is the έ version returned (and the change printed).
	"""
	before, after = m.group(1), m.group(2)
	withUpsilon = before + "ύ" + after
	withEpsilon = before + "έ" + after

	if not spell(withUpsilon) and spell(withEpsilon):
		print("FixE: ",withUpsilon, " changed to ", withEpsilon)
		return withEpsilon
	return withUpsilon

def IsFixO(m):
	"""
	FIXES ώ
	Callback for re.sub: decide whether a mis-read character group should be ώ.

	Groups 1 and 3 are the text either side of the suspect characters
	(group 2). The three groups joined together are returned when that text
	is already in the dictionary, or when replacing group 2 by ώ does not give
	a dictionary word either. Only when the joined text is unknown and the ώ
	version is known is the ώ version returned (and the change printed).
	"""
	before, suspect, after = m.group(1), m.group(2), m.group(3)
	asMatched = before + suspect + after
	withOmegaTonos = before + "ώ" + after

	if not spell(asMatched) and spell(withOmegaTonos):
		print("FixΏ: ",asMatched, " changed to ", withOmegaTonos)
		return withOmegaTonos
	return asMatched
	
def IsFixW(m):
	"""
	FIXES ω
	Callback for re.sub: decide whether a mis-read character group should be ω.

	Groups 1 and 3 are the text either side of the suspect characters
	(group 2). The three groups joined together are returned when that text
	is already in the dictionary, or when replacing group 2 by ω does not give
	a dictionary word either. Only when the joined text is unknown and the ω
	version is known is the ω version returned (and the change printed).
	"""
	before, suspect, after = m.group(1), m.group(2), m.group(3)
	asMatched = before + suspect + after
	withOmega = before + "ω" + after

	if not spell(asMatched) and spell(withOmega):
		print("FixΩ: ",asMatched, " changed to ", withOmega)
		return withOmega
	return asMatched
		
def IsFixF(m):
	"""
	FIXES φ
	Callback for re.sub: decide whether a mis-read character group should be φ.

	Groups 1 and 3 are the text either side of the suspect characters
	(group 2). The three groups joined together are returned when that text
	is already in the dictionary, or when replacing group 2 by φ does not give
	a dictionary word either. Only when the joined text is unknown and the φ
	version is known is the φ version returned (and the change printed).
	"""
	before, suspect, after = m.group(1), m.group(2), m.group(3)
	asMatched = before + suspect + after
	withPhi = before + "φ" + after

	if not spell(asMatched) and spell(withPhi):
		print("FixΦ: ",asMatched, " changed to ", withPhi)
		return withPhi
	return asMatched

def IsHyphenated(m):
	"""
	Callback for re.sub: decide whether the hyphen in a hyphenated word stays.

	m.group(0) is the hyphenated expression, groups 1 and 2 the words on
	either side of the hyphen. The expression is returned unchanged when
	  - it is listed in the global keepHyphens (words the user wants kept), or
	  - either side of the hyphen is a single character, or
	  - the two sides joined together are not a word known to spell().
	Otherwise the joined word is returned, the removal is printed and the
	global countHyphenReplacement is increased by one.
	Called by a regular expression function (re.sub) through CorrectText().
	"""
	global countHyphenReplacement
	global keepHyphens

	asWritten = m.group(0)
	#User-supplied exceptions win over everything else (ADDED OCT2018)
	if asWritten in keepHyphens:
		return asWritten

	left, right = m.group(1), m.group(2)
	#A one-character word on either side keeps its hyphen (ADDED OCT2018)
	if len(left) == 1 or len(right) == 1:
		return asWritten

	joined = left + right
	if not spell(joined):
		return asWritten

	print("Hyphen removed from: ", joined)
	countHyphenReplacement += 1
	return joined

def CorrectText(KeyText, pattern, replacement):
	"""
	Apply one regular-expression substitution to the global html and count it.

	Parameters:
		KeyText: description of the change; used as the key under which the
			number of replacements is accumulated in the global changedDict.
			An empty string means "do not record this change".
		pattern: regular expression to search for.
		replacement: replacement string or callback, as accepted by re.subn().

	The count is recorded only when the substitution actually altered html.
	For the hyphen-removal pattern the count is taken from the global
	countHyphenReplacement (incremented by the IsHyphenated callback) instead
	of the number of matches, because most matches are returned unchanged.
	countHyphenReplacement is reset to 0 on every call.

	Returns 0.
	"""
	global changedDict
	global html
	global countHyphenReplacement

	#Raw string: the non-raw form relied on the invalid escape "\w", which
	#newer Python versions warn about; the value is identical.
	hyphenPattern = r"(?s)(\w+)[ ]?-[ ]?(\w+)(?![^<>]*>)(?!.*<body[^>]*>)"

	countHyphenReplacement = 0
	newHtml, replacements = re.subn(pattern, replacement, html)
	if pattern == hyphenPattern:
		replacements = countHyphenReplacement
	if newHtml != html:
		if KeyText != "":
			changedDict[KeyText] = changedDict.get(KeyText, 0) + replacements
		html = newHtml

	# Commented lines using <!-- this is an html comment --> are erroneously corrected to
	#					   <!—this is an html comment—>
	# The following three changes are counted when this happens:
	# 		"Replaced a series of short/long hyphens with one long hyphen"
	# 		"Removed spaces after a long hyphen"
	#		"Removed spaces in front of a long hyphen"
	# Code has been written to revert these changes back to normal. This requires a count of the
	# commented lines that use <!-- this is an html comment --> so that the count of
	# these three changes can be adjusted.
	# NOTE(review): FixCommonErrors() passes "<!-- " without a backslash, so this
	# comparison does not match that call - confirm whether it is still needed.
	if replacement == r"<\!-- ": #OCT 2020
		changedDict["COMMENT"] = changedDict.get("COMMENT", 0) + 1 #OCT 2020

	return (0)

def FixCommonErrors(prefs):
	"""
	Apply the fixed catalogue of OCR / typography corrections to the global html.

	The first two corrections are made directly with re.sub(); all others go
	through CorrectText() so that they are counted in changedDict. The rules
	are order dependent and are applied exactly in the order written.

	Parameters:
		prefs: mapping; only prefs['GreekLetters'] is read here. 'Yes' enables
			the Greek-specific rules, any other value enables the rules for
			superscript digits and " / ".

	Quote and apostrophe characters are taken from cDetermineQuotes
	(OpenQ, CloseQ, Apostrophe).

	Returns 0.
	"""
	global html

	html = re.sub(r"([‘|'])?\b(Td|Tve|Fd|Til|Fve|Fm|Vm|Tm|IVe)\b", CheckApostrophe, html)

	#Replace two adjacent curly quote marks with the appropriate double quote mark
	html = re.sub(r"('[ ]?')|(‘[ ]?‘)|(’[ ]?’)", checkQuotes, html)

	#Remove newlines between the tags <p[ ]>...</p>  #UPDATED JUL 2020
	#(an earlier version also affected text between <pre>...</pre> tags and skipped <h...> tags)
	CorrectText("Remove newlines between the tags <p...</p>", r'(([ ]?[\n\r]+[\s]*)(?![\s]*<)(?![^<]*>))' , r' ')

	#Remove blank lines before </p> tag  #NEW JUL 2020
	CorrectText("Remove blank lines immediately before </p> tag", r'[\s]+</p>', r'</p>')

	#Removes space after the tag <p> #NEW JUL 2020
	CorrectText("Remove space after a <p> tag", r'(<p[^>]*>)[ ]', r'\1')

	#Fixes missing <colgroup> in tables
	CorrectText("Corrected missing <colgroup> in tables", r'(\s*((<col(.*)\s*)+))', CorrectColGroup)

	#Fixes 'th' when PDFd as 'lli'
	CorrectText("Changed lli to th", r'\blli(ink|e)\b', r'th\1')

	#Fixes 'thing(s)' when PDFd as 'tiling(s)'
	CorrectText("Changed tiling(s) to thing(s)", r'\b([t|T])iling([s]?)\b', r'\1hing\2')

	#Fixes 'their' when PDFd as 'tlieir'
	CorrectText("Changed tlieir to their", r'\b([t|T])lieir\b', r'\1heir')

	#Fixes 'she' when PDFd as 'slie'
	CorrectText("Changed slie to she", r'\b([s|S])lie\b', r'\1he')

	#Fixes 'when' when PDFd as 'wlien'
	CorrectText("Changed wlien to when", r'\b([w|W])lien\b', r'\1hen')

	#Removes <SPACE> in front of ? OR ! OR : OR ; OR ” [OR ’] OR "</p OR , OR ’</p OR _'_
	CorrectText("Removed erroneous spaces", r'[ ](\?|!|:|;|”|"</p|,|’</p|’ )', r'\1')

	#Fixes I followed by curly quotes by replacing I with !
	CorrectText("Replaced I at end of line with exclamation mark", r'[ ]I(”|’)([ ]|</p)', r'!\1\2')

	#Replaces opening apostrophe with closing apostrophe in shortened words eg ‘re and ‘ve
	CorrectText("Corrected apostrophes for shortened words", r'[ ]?‘(re|ve)[ ]', r'’\1 ') #Corrected JUL2020

	#Fixes direction of double quotes at start of paragraph and removes unnecessary spaces
	CorrectText("Corrected direction of double quotes and spaces at start of paragraphs", r'((<p[ ][^>]*>)(|(<(i|b|span)[^>]*>)))[ ]?”[ ]?', r'\1“')

	#Fixes direction of single quote at start of paragraph and removes unnecessary spaces
	CorrectText("Corrected direction of single quotes and spaces at start of paragraphs", '((<p[ ][^>]*>)(|(<(i|b|span)[^>]*>)))[ ]?’[ ]?', r'\1‘')

	#Remove space after opening speech mark at start of paragraph
	CorrectText("Removed space after opening speech mark at start of paragraph", r'((<p[ ][^>]*>(|(<(i|b|span)[^>]*>)))(\'|‘|"|“|))[ ]', r'\1')

	#Fixes errors where ‘11 or ’11 appears instead of 'll
	CorrectText("Replaced ‘11 or ’11 with 'll", r"[ ]?[‘’']11[ ]?", cDetermineQuotes.Apostrophe+r"ll ")

	#Correct direction of apostrophe for expressions such as won‘t
	CorrectText("Corrected direction of apostrophe for expressions such as won‘t", r'‘(t|m|d|s|ll) ', r'’\1 ')

	#Remove all spaces in front of a closing </p...> tag
	CorrectText("Spaces removed in front of a closing </p...> tag", r'[ ]+</p', r'</p')

	#Replace space in front of Ve with an apostrophe and lower case Ve
	CorrectText("Corrected errors of the type:‘Ve", r'([\'"‘“ ])(I|[Yy]ou|[Ww]e)[ ]Ve[ ]', r'\1\2’ve ')

	#Remove space in front of apostrophe'd specified expressions eg   're,  ’ve,  'd
	CorrectText("Removed space in front of apostrophe'd specified expressions eg 're", r" (’|')(re|ve|t|m|d|s|ll) ", r"\1\2 ")

	#Remove space after opening curly quote(s)
	CorrectText("Removed space after opening curly quote(s)", r'(‘|“) ', r'\1')

	#Fixes o clock problems
	CorrectText("Corrected o ‘clock errors", r" o[ ]?['‘]clock", " o"+cDetermineQuotes.Apostrophe+"clock")

	#Fixes apostrophes in wrong direction
	if cDetermineQuotes.OpenQ == "‘":
		CorrectText("Corrected apostrophes in wrong direction", r'[ ]?‘(ad|ave|bout|eard|em|er|cause|cept|im|is|old|ome|ow|specially|Taint|til|tis|twas)(\W?)', r' ’\1\2')
	else:
		#The (?i) flag must be the first thing in the pattern: newer Python versions
		#reject a global flag placed in the middle. It always applied to the whole
		#pattern, so moving it does not change what is matched.
		CorrectText("Corrected apostrophes in wrong direction", r'(?i)[ ]?‘(\d\d|ad[n]{0,1}|app[yines]{0,5}|appen[eds]{0,2}|ard[er]{0,2}|arf|alf|ang|as|at|av[ein]{0,3}|bout|bye|cause|cept[ing]{0,3}|copter[s]{0,1}|cos|cross|cuz|couse|e[emr]{0,1}|ell|elp[edling]{0,5}|ere[abouts]{0,5}|eard|f|fraid|fore|id|igh[er]{0,2}|ighness|im|is|isself|gainst|kay|less|mongst| n|nd|neath|nough|nother|nuff|o[o]{0,1}|ood|ome|ow|op[eding]{0,3}|oney|orse[flesh]{0,5}|ouse[ds]{0,1}|pon|puter[edrs]{0,2}|round|scuse[ds]{0,1}|spect[sed]{0,2}|scaped|sides|tween|special[ly]{0,2}|stead|t|taint|til|tis|twas|twere|twould|twil l|ud|un|urt|vise)(\W?)', r' ’\1\2')

	#Replace <space><hyphen><space><close quotes> with <long hyphen><close quotes>
	CorrectText("Fixed short hyphen and or space in front of closing quotes", r'[ ]?(-|—)[ ]?(”|’)', r'—\2')

	#Replace a series of short/long hyphens by one long hyphen - produces error if ,!-- or Fragment-- appears in file
	CorrectText("Replaced a series of short/long hyphens with one long hyphen", r'(?s)[-—]{2,}(?!.*<body[^>]*>)', r'—')

	CorrectText("Removed spaces in front of a long hyphen", r'(?s)[ ]+—(?!.*<body[^>]*>)', r'—') #OCT2020
	CorrectText("Removed spaces after a long hyphen", r'(?s)—[ ]+(?!.*<body[^>]*>)', r'—') #OCT2020

	#A plain quote-pair replacement was tried here (AUG 2018) but gave errors with alt="",
	#hence the SplitQuotes callback, which leaves quotes inside tags alone
	CorrectText("Quotes for different people split into separate lines", r"""<p(.*?)>(.*?)("|'|’|”)([ ]|&nbsp;)?("|'|‘|“)(.*?)</p>""", SplitQuotes)

	#Fixes quotes that are the wrong way round at end of paragraph and removes extraneous white space
	CorrectText("Fixes direction of quotes at end of paragraph", r'(\s+)?(‘|“)(\s+)?</p', cDetermineQuotes.CloseQ+'</p')

#---------------------- Greek checks ----------------
	if prefs['GreekLetters'] == 'Yes':
		#Fixes '…' when PDFd as ...
		CorrectText("Changed ... to …", r'\.\.\.', r'…')

		#Fixes 'η' when PDFd as ΐ]
		CorrectText("Changed ΐ] to η", r'ΐ]', r'η')

		#Fixes 'στη' when PDFd as σιη
		CorrectText("Changed σιη to στη", r'σιη', r'στη')

		#Fixes 'στ(η|ο|ον|α|ις|ην)' when PDFd as  '"οτ(η|ο|ον|α|ις|ην)'
		CorrectText("Changed οτ(η|ο|ον|α|ις|ην) to στ(η|ο|ον|α|ις|ην)", r' οτ(η|ο|ον|α|ις|ην) ', r' στ\1 ')

		#Fixes 'των' when PDFd as  'τ(οι|οι)ν'
		CorrectText("Changed τ(οι|ιο)ν to των", r' τ(οι|ιο)ν ', r' των ')

		#Fixes 'ού' when PDFd as  'οιί'
		CorrectText("Changed οιί to ού", r'οιί', r'ού')

		#Fixes 'στις' when PDFd as σιις
		CorrectText("Changed σιις to στις", r'σιις', r'στις')

		#Fixes 'στ(η|ο|ον|ην)' when PDFd as οτ(η|ο|ον|ην)
		#The replacement keeps the spaces that the pattern consumes, otherwise
		#the corrected word is glued to both of its neighbours
		CorrectText("Changed οτ(η|ο|ον|ην) to στ(η|ο|ον|ην)", r' οτ(η|ο|ον|ην) ', r' στ\1 ')

		#Fixes 'στ(ο|ου|α)' when PDFd as  σι(ο|ου|α)
		#The replacement keeps the leading space that the pattern consumes
		CorrectText("Changed σι(ο|ου|α) to στ(ο|ου|α)", r' σι(ο|ου|α)', r' στ\1')

		#Fixes 'ώ' when PDFd as ο'ι
		CorrectText("Changed ο'ι to ώ", r'(ο\'ι|\(ί\))', r'ώ')

		#Fixes 'Άκουσ' when PDFd as Ακόυσ
		CorrectText("Changed Ακόυσ to Άκουσ", r'Ακόυσ', r'Άκουσ')

		#Fixes 'γι’' when PDFd as γΓ,γΡ
		CorrectText("Changed γΓ γΡ to γι’", r'(γΓ|γΡ)', r'γι’')

		#Fixes 'ντι' when PDFd as νπ
		CorrectText("Changed νπ to ντι", r'νπ', r'ντι')

		#Fixes 'Γι’' when PDFd as ΓΓ
		CorrectText("Changed ΓΓ to Γι’", r'ΓΓ ', r'Γι’ ')

		#Fixes 'σχεδίαζ' when PDFd as σχέδιαζ
		CorrectText("Changed σχέδιαζ to σχεδίαζ", r'σχέδιαζ', r'σχεδίαζ')

		#Fixes '\u0388' when PDFd as 'E "E
		CorrectText("Changed 'E,\"E to \u0388", r'(\'|\")(\u0395)', r'Έ')

		#Fixes \u038E when PDFd as 'Y or "Y
		CorrectText("Changed 'Y,\"Y to \u038E", r'(\'|\")(\u03A5)', r'Ύ')

		#Fixes \u038A when PDFd as 'I or "I
		CorrectText("Changed 'I,\"I to \u038A", r'(\'|\")(\u0399)', r'Ί')

		#Fixes \u038C when PDFd as 'O or "O
		CorrectText("Changed 'O,\"O to \u038C", r'(\'|\")(\u039F)', r'Ό')

		#Fixes \u0386 when PDFd as 'A or "A
		CorrectText("Changed 'A,\"A to \u0386", r'(\'|\")(\u0391)', r'Ά')

		#Fixes \u0389 when PDFd as 'H or "H
		CorrectText("Changed 'H,\"H to \u0389", r'(\'|")(\u0397)', r'Ή')

		#Fixes \u038F when PDFd as '\u03C9 or "\u03C9
		CorrectText("Changed '\u03C9,\"\u03C9 to \u038F", r'(\'|\")(\u03C9)', r'Ώ')

		#Fixes \u03CD when PDFd as \u03B0
		CorrectText("Changed \u03CD to \u03B0", r'ΰ', r'ύ')

		#Fixes έ when PDFd as ε'
		CorrectText("Changed ε' to έ", r'ε\'', r'έ')

		#Fixes ς Character when PDFd as ςCharacter
		CorrectText("Changed ςCharacter to ς Character", r'ς([\u0370-\u03CE])', r'ς \1')

	if prefs['GreekLetters'] != 'Yes':
		CorrectText("Corrected w<sup>..</sup>", r"""w<sup>[^>]+?>""", r'w')
		CorrectText("Corrected <sup>5 and <sup>9", r"""<sup>[59]</sup>""", r'’')
		CorrectText("Corrected <sup>6</sup>", r"""<sup>6</sup>""", r'‘')
		CorrectText("Corrected / with quote 'I'", r""" / """, r' I ')	#NB Could be 1 on more rare occasions

	CorrectText("Changed 2 or more consecutive full stops to ellipsis", r"(?s)([^<|>])([ ]?\.[ ]?){2,}(?![^<>]*>)(?!.*<body[^>]*>)", r'\1…')

	# Commented lines using <!-- this is an html comment --> are erroneously corrected to
	#					   <!—this is an html comment—>
	#The following two lines revert <!—this is an html comment—> back to its original format
	CorrectText("COMMENT", r"<!—", "<!-- ") #OCT 2020
	CorrectText("", r"—>", " -->") #OCT 2020 Blank first args are not entered in dictionary

	return(0)

def processHTML(bk, prefs, IDlist):
	"""
	Apply the enabled clean-up passes to every section named in IDlist and
	print a summary of the changes that were made.

	bk      -- book object; this function uses bk.readfile, bk.writefile and
	           (when the Hunspell dictionary is enabled) bk.hspell.cleanUp
	prefs   -- preferences mapping; the keys read here are 'useHunspellDict',
	           'ReplaceHTMLCode', 'WordListDirectory', 'ProcessItalics',
	           'JoinParagraphs', 'GreekLetters' and 'FormatHTML'
	IDlist  -- iterable of section ids handed to bk.readfile / bk.writefile

	The section text lives in the module-level variable html while it is being
	processed; the helper functions called below are never passed the text, so
	they presumably read and update that global (confirm against CorrectText).
	Always returns 0.
	"""
	global html, Apostrophe
	global changedDict, dictPath
	global bk2, useHunspellDict, keepHyphens

	# Module-level state shared with the helper functions
	useHunspellDict = prefs['useHunspellDict']
	bk2 = bk
	changedDict = {}	#Description of each change -> number of occurrences

	dictExists = prepDictionary(bk, prefs)

	for section_id in IDlist:
		html = bk.readfile(section_id)
		if not isinstance(html, str):
			#Section was not returned as str, so convert it using utf-8
			html = text_type(html, 'utf-8')
		original_html = html	#Kept so we only write back sections that changed

		if prefs['ReplaceHTMLCode'] == 'Yes':
			ReplaceHTMLCode()
		FixCommonErrors(prefs)

		if prefs['WordListDirectory'] != "":
			AutoCorrectSpelling(prefs)

		if prefs['ProcessItalics'] == 'Yes':
			ProcessItalics()

		if Chapters.cs["ProcessChapter"] == True:
			chapterDialog = Chapters()
			html = chapterDialog.processChapterTitle(html)

		FixFalseLineBreaks(prefs['JoinParagraphs'], prefs)

		if dictExists == True:
			#Reload the user's list of hyphenated words that must be kept
			OpenHyphenExceptionFile(prefs)

			#IsHyphenated decides, per match, whether the hyphen is removed
			CorrectText("Hyphens removed",r"(?s)(\w+)[ ]?-[ ]?(\w+)(?![^<>]*>)(?!.*<body[^>]*>)", IsHyphenated)

			if prefs['GreekLetters'] == 'Yes' and useHunspellDict == "Yes":
				#Each entry: (label for the change log, pattern, replacement callback)
				#applied in this order: Π, έ, ώ, ω, φ
				#NOTE(review): the "ώ fixes" pattern ends in (\w*\s) whereas the otherwise
				#identical "ω fixes" pattern ends in (\w*|\s) - confirm this is intended
				greekFixes = (
					("Π fixes", r"(\w*|\s)(Ιΐ|1\ Ι|1\ Ι|1Ι|1I|ΓΙ|Γΐ|ΙΙ|II|Ι\ Ι|ΓΤ|ΙΊ|Ιί)[ ]?(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixP),
					("έ fixes", r"(\w+|\s)ύ(\w+|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixE),
					("ώ fixes", r"(\w*|\s)(οί\)|νο'\)|α\)|οδ|οό|ιυ|άί|ο5|ο'\)|ιίι|\(ό|ο\)|ίό|ο&gt;|ο'ι|ιό|οί|ιο|οι|&lt;ο|οϊ)(\w*\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixO),
					("ω fixes", r"(\w*|\s)(οί\)|νο'\)|α\)|οδ|οό|ιυ|άί|ο5|ο'\)|ιίι|\(ό|ο\)|ίό|ο&gt;|ο'ι|ιό|οί|ιο|οι|&lt;ο|οϊ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixW),
					("φ fixes", r"(\w*|\s)(\(ρ|χρ|η&gt;|«ρ|ηι|&lt;ρ|4&gt;|ιρ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixF),
				)
				for label, pattern, fixer in greekFixes:
					CorrectText(label, pattern, fixer)

		if cProcessHeaderTags.processHeadersFlag:
			headerTagsDialog = cProcessHeaderTags()
			html = headerTagsDialog.simplifyParaTags(html)

		#Sony ePub reader does not show <p><br /></p> as a blank line, so add &nbsp;
		CorrectText("Inserted &nbsp; in (<p><br /></p>) with", r'<p><br[ ]?/></p>', r"<p>&nbsp;<br/></p>")

		if prefs['FormatHTML'] == 'Yes':
			#Three passes: re-indent tags via FormatHTML, then fix up the indentation
			#of the head-level tags
			html = re.sub(r"(?s)\n(\s+)?(?!</html|</body|<body>)(<(.*?)>)(?!.*<body[^>]*>)", FormatHTML, html)
			html = re.sub(r"([ ]*(<(title)|(<style)|(</style)|(<meta)|<(link)))", r'  \2', html)
			html = re.sub(r"([ ]+((<head>)|(</head>)|(<body)|(<!DOCTYPE)))", r'\2', html)

		#Only write the section back to the book when something actually changed
		if html != original_html:
			bk.writefile(section_id, html)

	print("\nChanges made\n===============\n")

	#Release the Hunspell dictionary so bk.hspell can be reused with another one
	if useHunspellDict == "Yes":
		bk.hspell.cleanUp()

	#HTML comments (<!-- ... -->) get counted by the long-hyphen corrections and are
	#then reverted under the "COMMENT" entry; subtract them from those tallies and
	#drop the "COMMENT" entry itself so that it is not reported.   #OCT 2020
	if "COMMENT" in changedDict:
		commentCount = changedDict.pop("COMMENT")
		hyphenTallies = (
			("Replaced a series of short/long hyphens with one long hyphen", 2),
			("Removed spaces after a long hyphen", 1),
			("Removed spaces in front of a long hyphen", 1),
		)
		for description, weight in hyphenTallies:
			changedDict[description] = changedDict.get(description, 0) - weight * commentCount

	#Report every amendment that occurred at least once, with its count
	for description, count in changedDict.items():
		if count > 0:
			print(description, count)
	print("\n\n")
	return 0
