@CalibUser
I changed some things in the CorrectTexts...
It works better, but I still get some unnecessary fixes (4 out of a total of 200); I can use the customised word list to fix them (for example ω and φυλάξοι, which are spelled correctly but are not the words the text actually has; I attach the custom text file).
All the Fix functions now find the misspelled character anywhere in the whole word.
I also added a fix for φ characters that come out as "η>|«ρ|ηι|<ρ|4>|ιρ" after the OCR.
Code:
def IsFixP(m):
    """
    FIXES Π
    Decide whether a character sequence that may be a misread Π should be
    replaced inside the matched word.

    Intended as the replacement callback of a regular-expression
    substitution (it is passed to CorrectText together with the
    "Π fixes" pattern).

    m -- match object with three groups: group(1) is the text before the
         suspect characters, group(2) the suspect characters themselves and
         group(3) the text after them.

    Returns:
      * the three groups joined unchanged when spell() accepts that word;
      * otherwise the word with group(2) replaced by "Π" when spell()
        accepts that candidate (the change is reported with print);
      * otherwise the complete matched text, exactly as it was found.
    """
    before, suspect, after = m.group(1), m.group(2), m.group(3)
    unchanged = before + suspect + after
    corrected = before + "Π" + after

    if spell(unchanged):
        return unchanged
    if spell(corrected):
        print("FixP: ", unchanged, " changed to ", corrected)
        return corrected

    # Nothing could be confirmed by the dictionary, so hand back the whole
    # match rather than the re-joined groups. The "Π fixes" pattern allows
    # an optional space between group(2) and group(3) that belongs to no
    # group; re-joining the groups would silently delete that space and
    # glue two words together even though no fix was made.
    return m.group(0)
def IsFixE(m):
    """
    FIXES έ
    Decide whether a "ύ" inside the matched word is really a misread "έ".

    Intended as the replacement callback of a regular-expression
    substitution (it is passed to CorrectText together with the
    "έ fixes" pattern).

    m -- match object with two groups: group(1) is the text before the "ύ"
         and group(2) the text after it.

    Returns the word with "ύ" kept when spell() accepts it that way, or
    when spell() accepts neither spelling; returns the word with "έ" in its
    place only when the "ύ" spelling is rejected and the "έ" spelling is
    accepted (that change is reported with print).
    """
    head, tail = m.group(1), m.group(2)
    with_epsilon = head + "έ" + tail
    with_upsilon = head + "ύ" + tail

    # spell() is consulted for the "έ" candidate only after the word as
    # written has been rejected.
    if not spell(with_upsilon) and spell(with_epsilon):
        print("FixE: ", with_upsilon, " changed to ", with_epsilon)
        return with_epsilon
    return with_upsilon
def IsFixO(m):
    """
    FIXES ώ
    Decide whether a suspect character sequence inside the matched word
    should be replaced by "ώ".

    Intended as the replacement callback of a regular-expression
    substitution (it is passed to CorrectText together with the
    "ώ fixes" pattern).

    m -- match object with three groups: group(1) is the text before the
         suspect sequence, group(2) the suspect sequence and group(3) the
         text after it.

    Returns the three groups joined unchanged when spell() accepts that
    word or accepts neither form; returns the word with group(2) replaced
    by "ώ" only when the unchanged form is rejected and the "ώ" form is
    accepted (that change is reported with print).
    """
    head, suspect, tail = m.group(1), m.group(2), m.group(3)
    as_written = head + suspect + tail
    with_omega_tonos = head + "ώ" + tail

    # The candidate is spell-checked only after the word as written has
    # been rejected.
    if not spell(as_written) and spell(with_omega_tonos):
        print("FixΏ: ", as_written, " changed to ", with_omega_tonos)
        return with_omega_tonos
    return as_written
def IsFixW(m):
    """
    FIXES ω
    Decide whether a suspect character sequence inside the matched word
    should be replaced by "ω".

    Intended as the replacement callback of a regular-expression
    substitution (it is passed to CorrectText together with the
    "ω fixes" pattern).

    m -- match object with three groups: group(1) is the text before the
         suspect sequence, group(2) the suspect sequence and group(3) the
         text after it.

    Returns the three groups joined unchanged when spell() accepts that
    word or accepts neither form; returns the word with group(2) replaced
    by "ω" only when the unchanged form is rejected and the "ω" form is
    accepted (that change is reported with print).
    """
    head, suspect, tail = m.group(1), m.group(2), m.group(3)
    as_written = head + suspect + tail
    with_omega = head + "ω" + tail

    # The candidate is spell-checked only after the word as written has
    # been rejected.
    if not spell(as_written) and spell(with_omega):
        print("FixΩ: ", as_written, " changed to ", with_omega)
        return with_omega
    return as_written
def IsFixF(m):
    """
    FIXES φ
    Decide whether a suspect character sequence inside the matched word
    should be replaced by "φ".

    Intended as the replacement callback of a regular-expression
    substitution (it is passed to CorrectText together with the
    "φ fixes" pattern).

    m -- match object with three groups: group(1) is the text before the
         suspect sequence, group(2) the suspect sequence and group(3) the
         text after it.

    Returns the three groups joined unchanged when spell() accepts that
    word or accepts neither form; returns the word with group(2) replaced
    by "φ" only when the unchanged form is rejected and the "φ" form is
    accepted (that change is reported with print).
    """
    head, suspect, tail = m.group(1), m.group(2), m.group(3)
    as_written = head + suspect + tail
    with_phi = head + "φ" + tail

    # The candidate is spell-checked only after the word as written has
    # been rejected.
    if not spell(as_written) and spell(with_phi):
        print("FixΦ: ", as_written, " changed to ", with_phi)
        return with_phi
    return as_written
Code:
if useHunspellDict == "Yes":
    # Dictionary-assisted Greek OCR fixes, applied in the order listed.
    # Each entry is (label, pattern, callback): the pattern captures the
    # text before the suspect characters, the suspect characters (except
    # for the έ fix, where the suspect "ύ" is literal) and the text after
    # them; the callback decides whether to replace them.
    # Every pattern ends with two negative lookaheads, presumably to skip
    # text inside a tag and text that precedes the <body> tag.
    # NOTE(review): the ώ and ω entries use the same pattern, so both
    # callbacks see the same candidate sequences.
    _greek_fix_table = (
        # Fixes Π in words that are misspelled
        ("Π fixes", r"(\w*|\s)(Ιΐ|1\ Ι|1\ Ι|1Ι|1I|ΓΙ|Γΐ|ΙΙ|II|Ι\ Ι|ΓΤ|ΙΊ|Ιί)[ ]?(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixP),
        # Fixes έ in words that are misspelled
        ("έ fixes", r"(\w+|\s)ύ(\w+|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixE),
        # Fixes ώ in words that are misspelled
        ("ώ fixes", r"(\w*|\s)(οί\)|νο'\)|α\)|οδ|οό|ιυ|άί|ο5|ο'\)|ιίι|\(ό|ο\)|ίό|ο>|ο'ι|ιό|οί|ιο|οι|<ο|οϊ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixO),
        # Fixes ω in words that are misspelled
        ("ω fixes", r"(\w*|\s)(οί\)|νο'\)|α\)|οδ|οό|ιυ|άί|ο5|ο'\)|ιίι|\(ό|ο\)|ίό|ο>|ο'ι|ιό|οί|ιο|οι|<ο|οϊ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixW),
        # Fixes φ in words that are misspelled
        ("φ fixes", r"(\w*|\s)(\(ρ|χρ|η>|«ρ|ηι|<ρ|4>|ιρ)(\w*|\s)(?![^<>]*>)(?!.*<body[^>]*>)", IsFixF),
    )
    for _fix_label, _fix_regex, _fix_callback in _greek_fix_table:
        CorrectText(_fix_label, _fix_regex, _fix_callback)
EDIT: I also attach an IncorrectWords file for Greek.
Calib, would it be possible sometime in the future to move the Greek fixes into a separate .py file, so I don't have to mess with your HTMLProcceror all the time?