View Single Post
Old 11-19-2015, 01:27 AM   #139
gipsy
Connoisseur
gipsy began at the beginning.
 
Posts: 81
Karma: 10
Join Date: Nov 2013
Device: Kobo Aura HD
@CalibUser
I changed some things in the CorrectTexts...
It works better, but I still get some unnecessary fixes (4 out of a total of 200); I can use the customised word list to deal with them (for example ω and φυλάξοι, which are spelled correctly but are not the words that the text actually has; I attach the custom text file).
All the Fix functions now look for the misspelled character in the whole word.
I also added a fix for φ characters that come out as "η>|«ρ|ηι|<ρ|4>|ιρ" after the OCR.
Code:
def IsFixP(m):
	"""
	FIXES Π
	Decide whether the characters matched as a possible misread Π should be replaced by Π.

	It is called by a regular expression function (re.sub) in FixCommonErrors().

	m is the match object: group(1) is the text before the suspect characters,
	group(2) is the suspect characters and group(3) is the text after them.

	Returns:
	- the three groups joined, unchanged, when spell() already accepts that word;
	- the word with group(2) replaced by Π when spell() accepts only that variant
	  (the change is also reported with print);
	- otherwise the complete matched text, m.group(0), exactly as it was found.
	"""
	before = m.group(1)
	after = m.group(3)
	unchanged = before+m.group(2)+after
	fixed = before+"Π"+after

	if spell(unchanged):
		return unchanged
	if spell(fixed):
		print("FixP: ",unchanged, " changed to ", fixed)
		return fixed
	# Neither spelling is known, so the text must be left alone. The calling
	# pattern can also match an optional space that belongs to no group;
	# returning group(0) rather than the re-joined groups keeps that space,
	# so two separate words are not glued together when no fix is made.
	return m.group(0)

def IsFixE(m):
	"""
	FIXES έ
	Decide whether a ύ inside the matched word should really be an έ.

	It is called by a regular expression function (re.sub) in FixCommonErrors().

	m is the match object: group(1) is the text before the ύ and group(2)
	is the text after it.

	The word is returned with ύ when spell() accepts it that way, or when
	spell() accepts neither variant. It is returned with έ (and the change is
	reported with print) only when the ύ form is rejected and the έ form is
	accepted.
	"""
	prefix, suffix = m.group(1), m.group(2)
	asFound = prefix+"ύ"+suffix
	candidate = prefix+"έ"+suffix
	# spell(candidate) is only consulted when the word as found is rejected.
	if not spell(asFound) and spell(candidate):
		print("FixE: ",asFound, " changed to ", candidate)
		return candidate
	return asFound

def IsFixO(m):
	"""
	FIXES ώ
	Decide whether the characters matched as a possible misread ώ should be replaced by ώ.

	It is called by a regular expression function (re.sub) in FixCommonErrors().

	m is the match object: group(1) is the text before the suspect characters,
	group(2) is the suspect characters and group(3) is the text after them.

	Returns the three groups joined unchanged when spell() accepts that word
	or accepts neither variant; returns the word with group(2) replaced by ώ
	(and reports the change with print) when only that variant is accepted.
	"""
	head = m.group(1)
	tail = m.group(3)
	withOmega = head+"ώ"+tail
	asFound = head+m.group(2)+tail

	# A word that is already correct is never touched.
	if spell(asFound):
		return asFound
	# Only substitute when the spell checker recognises the result.
	if spell(withOmega):
		print("FixΏ: ",asFound, " changed to ", withOmega)
		return withOmega
	return asFound
	
def IsFixW(m):
	"""
	FIXES ω
	Decide whether the characters matched as a possible misread ω should be replaced by ω.

	It is called by a regular expression function (re.sub) in FixCommonErrors().

	m is the match object: group(1) is the text before the suspect characters,
	group(2) is the suspect characters and group(3) is the text after them.

	Returns the three groups joined unchanged when spell() accepts that word
	or accepts neither variant; returns the word with group(2) replaced by ω
	(and reports the change with print) when only that variant is accepted.
	"""
	pre = m.group(1)
	post = m.group(3)
	current = pre+m.group(2)+post
	if spell(current):
		# Already a known word: keep it.
		return current
	proposal = pre+"ω"+post
	if not spell(proposal):
		# The substitution does not give a known word either: keep the text.
		return current
	print("FixΩ: ",current, " changed to ", proposal)
	return proposal

def IsFixF(m):
	"""
	FIXES φ
	Decide whether the characters matched as a possible misread φ should be replaced by φ.

	It is called by a regular expression function (re.sub) in FixCommonErrors().

	m is the match object: group(1) is the text before the suspect characters,
	group(2) is the suspect characters and group(3) is the text after them.

	Returns the three groups joined unchanged when spell() accepts that word
	or accepts neither variant; returns the word with group(2) replaced by φ
	(and reports the change with print) when only that variant is accepted.
	"""
	lead, trail = m.group(1), m.group(3)
	swapped = lead+"φ"+trail
	kept = lead+m.group(2)+trail
	# The φ variant is only checked when the word as found is not a known word.
	useSwap = not spell(kept) and spell(swapped)
	if useSwap:
		print("FixΦ: ",kept, " changed to ", swapped)
	return swapped if useSwap else kept
Code:
				# Dictionary-backed Greek OCR fixes; they only run when useHunspellDict is "Yes".
				# Each pattern captures the text before the suspect characters, the suspect
				# characters themselves (except for έ, where the suspect ύ is a literal) and
				# the text after them. The trailing (?![^<>]*>) lookahead rejects matches that
				# are followed by a ">" with no "<" or ">" in between (presumably to skip text
				# inside an HTML tag), and (?!.*<body[^>]*>) rejects matches that are still
				# followed by a <body ...> tag.
				if useHunspellDict=="Yes":
					# Π: candidates such as ΙΙ, II, ΓΙ, 1Ι (optionally followed by a space)
					# are handed to IsFixP, which swaps in Π only if spell() accepts the result.
					CorrectText("Π fixes",r"(\w*|\s)(Ιΐ|1\ Ι|1\ Ι|1Ι|1I|ΓΙ|Γΐ|ΙΙ|II|Ι\ Ι|ΓΤ|ΙΊ|Ιί)[ ]?(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixP)
					# έ: every ύ between two captured parts is handed to IsFixE, which
					# swaps in έ only if the ύ form is rejected and the έ form is accepted.
					CorrectText("έ fixes",r"(\w+|\s)ύ(\w+|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixE)
					# ώ: character sequences listed as possible misreads are handed to
					# IsFixO, which swaps in ώ only if spell() accepts the result.
					CorrectText("ώ fixes",r"(\w*|\s)(οί\)|νο'\)|α\)|οδ|οό|ιυ|άί|ο5|ο'\)|ιίι|\(ό|ο\)|ίό|ο&gt;|ο'ι|ιό|οί|ιο|οι|&lt;ο|οϊ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixO)
					# ω: the same candidate list as for ώ, but IsFixW tries the unaccented ω.
					CorrectText("ω fixes",r"(\w*|\s)(οί\)|νο'\)|α\)|οδ|οό|ιυ|άί|ο5|ο'\)|ιίι|\(ό|ο\)|ίό|ο&gt;|ο'ι|ιό|οί|ιο|οι|&lt;ο|οϊ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixW)
					# φ: sequences such as (ρ, χρ, «ρ, ιρ and the entity-escaped forms
					# η&gt;, &lt;ρ, 4&gt; are handed to IsFixF, which swaps in φ only if
					# spell() accepts the result.
					CorrectText("φ fixes",r"(\w*|\s)(\(ρ|χρ|η&gt;|«ρ|ηι|&lt;ρ|4&gt;|ιρ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixF)
EDIT: I also attach an IncorrectWords file for Greek.
Calib, would it be possible sometime in the future to put the Greek fixes in a separate .py file, so that I don't have to mess with your HTMLProcceror all the time?
Attached Files
File Type: txt custom.txt (115 Bytes, 414 views)
File Type: txt IncorrectWords.txt (2.6 KB, 430 views)

Last edited by gipsy; 11-19-2015 at 02:12 AM. Reason: add some more searches
gipsy is offline   Reply With Quote