View Single Post
Old 03-03-2016, 12:22 PM   #10
Arjayem
Casual Member
Arjayem began at the beginning.
 
Arjayem's Avatar
 
Posts: 5
Karma: 10
Join Date: Mar 2016
Location: UK
Device: Kindle paperwhite
Lightbulb Scanning OCR Errors

Errors produced by scanning text seem to follow a predictable pattern, such as seU for sell, iUness for illness, or bom for born, but nevertheless aren't corrected by the automatic scanning software. So I created a function for the calibre editor to fix those I most commonly found. You'll also find I've corrected some American spellings; depending upon your dictionary, these won't actually be wrong.

The code is based on the Calibre example that tidies up hyphens.

You'll need to enter the following find : >.*?<

Here's the function. Because Python uses significant indentation (intelligent or not so), you may need to adjust the indentation a little to get Python to accept the code:

Code:
import regex
from calibre import replace_entities
from calibre import prepare_string_for_xml

# Exact-match repairs for contractions that OCR commonly mangles
# (the leading "I'" read as "F", "'v" read as "V", a dropped letter).
_FIXED_REPLACEMENTS = {
    'Fd': "I'd",
    'Fve': "I've",
    'Fm': "I'm",
    'Fll': "I'll",
    'youVe': "you've",
    'YouVe': "You've",
    "wren't": "weren't",
}


def _swap_all(old, new):
    """Build a rule that replaces every occurrence of *old* with *new*."""
    def rule(word):
        return word.replace(old, new)
    return rule


def _swap_first(old, new):
    """Build a rule that replaces only the first occurrence of *old*."""
    def rule(word):
        return word.replace(old, new, 1)
    return rule


def _swap_second(old, new):
    """Build a rule that replaces only the second non-overlapping *old*."""
    def rule(word):
        first = word.find(old)
        if first < 0:
            return None
        second = word.find(old, first + len(old))
        if second < 0:
            return None
        return word[:second] + new + word[second + len(old):]
    return rule


def _swap_suffix(old, new):
    """Build a rule that replaces *old* only when it ends the word."""
    def rule(word):
        if not word.endswith(old):
            return None
        return word[:len(word) - len(old)] + new
    return rule


def _er_to_re_collapsed(word):
    """Turn "er" into "re", then collapse any resulting "ree" to "re"."""
    return word.replace('er', 're').replace('ree', 're')


# Candidate-generating rules, tried strictly in this order; the first
# candidate the dictionary recognises wins. A rule returns None when it does
# not apply to the word.
_CORRECTION_RULES = (
    # American -> British spelling: -or / -our
    _swap_all('or', 'our'),
    _swap_suffix('or', 'our'),
    _swap_suffix('ors', 'ours'),  # keep the plural "s"
    # American -> British spelling: -er / -re, -nse / -nce
    _swap_suffix('er', 're'),
    _swap_all('er', 're'),
    _er_to_re_collapsed,
    _swap_suffix('ers', 'res'),
    _swap_suffix('nse', 'nce'),
    # American -> British spelling: single vs. doubled "l"
    _swap_all('l', 'll'),
    _swap_first('l', 'll'),
    _swap_second('l', 'll'),
    _swap_all('ll', 'l'),
    _swap_first('ll', 'l'),
    _swap_second('ll', 'l'),
    # OCR confusions: "li"/"ll" read as "U", "li" read as "h"/"H"
    _swap_all('U', 'li'),
    _swap_all('U', 'll'),
    _swap_all('h', 'li'),
    _swap_all('H', 'li'),
    _swap_first('h', 'li'),
    _swap_first('H', 'li'),
    _swap_second('h', 'li'),
    _swap_second('H', 'li'),
    # OCR confusions among m / n / r / u / i shapes
    _swap_all('im', 'un'),
    _swap_all('imi', 'um'),
    _swap_all('m', 'rn'),
    _swap_all('m', 'in'),
    _swap_all('m', 'hi'),
    _swap_all('mn', 'um'),
    _swap_all('nm', 'run'),
    _swap_all('nmi', 'rum'),
    _swap_all('bn', 'lm'),
    _swap_all('ii', 'h'),
    _swap_all('ii', 'u'),
    # OCR confusions involving ">" and "&" (present after entity decoding)
    _swap_all('>', 'y'),
    _swap_all('j&', 'fi'),
    _swap_all('i&', 'fi'),
    _swap_all('l&', 'fi'),
)


def _candidate_spellings(word):
    """Yield the distinct alternative spellings of *word*, in rule order.

    Candidates equal to *word* itself, or already yielded, are skipped so the
    caller never repeats a dictionary lookup it has already made.
    """
    tried = set([word])
    for rule in _CORRECTION_RULES:
        candidate = rule(word)
        if candidate is None or candidate in tried:
            continue
        tried.add(candidate)
        yield candidate


def replace(match, number, file_name, metadata, dictionaries, data, functions, *args, **kwargs):
    """Correct common OCR misreadings and American spellings in one text run.

    Intended for use with the search expression ``>.*?<``: the first and last
    characters of the matched text are stripped before processing and a
    literal ``>`` / ``<`` pair is put back around the result.

    Each word that the dictionary does not recognise is first looked up in a
    small table of exact contraction repairs and then run through an ordered
    list of substitution rules; the first alternative spelling that the
    dictionary recognises replaces the word. Words with no recognised
    alternative are left untouched. Any whitespace in front of a word is
    always preserved as it was matched.

    A word is only examined when it is followed by one of the delimiter
    characters in the pattern's second group (whitespace, ``*``, ``.``,
    ``?``, ``,``, ``"`` or ``;``).

    :param match: Match object for the search expression; only
        ``match.group()`` is used.
    :param dictionaries: Object whose ``recognized(word)`` method reports
        whether a spelling is known.
    :param number, file_name, metadata, data, functions: Unused here; they
        are accepted because they are part of the signature.
    :return: The corrected text, wrapped again as ``>...<``.
    """

    def replace_word(wmatch):
        word = wmatch.group(1)
        # The pattern allows an empty word (a bare delimiter); nothing to fix.
        if not word or dictionaries.recognized(word):
            return wmatch.group()

        # Whitespace swallowed by the pattern's leading \s* must be put back,
        # otherwise a corrected word is glued onto preceding punctuation.
        lead = wmatch.string[wmatch.start():wmatch.start(1)]
        tail = wmatch.group(2)

        # Exact repairs go first so a heuristic rule (e.g. "m" -> "in")
        # cannot pre-empt them with a different dictionary word.
        fixed = _FIXED_REPLACEMENTS.get(word)
        if fixed is not None:
            return lead + fixed + tail

        for candidate in _candidate_spellings(word):
            if dictionaries.recognized(candidate):
                return lead + candidate + tail

        # No recognised alternative: leave the text exactly as it was.
        return wmatch.group()

    # Search for words
    text = replace_entities(match.group()[1:-1])  # Handle HTML entities like &amp;
    corrected = regex.sub(r'\s*([\w\>\&[[a-z]\'[a-z]]]*)([\s*\.\?\,\"\;])', replace_word, text, flags=regex.VERSION1 | regex.UNICODE)
    return '>%s<' % prepare_string_for_xml(corrected)  # Put back required entities
GOOD LUCK & HOPE IT'S OF SOME USE

Last edited by Arjayem; 03-04-2016 at 04:53 AM.
Arjayem is offline   Reply With Quote