Hi,
here is a new version of this function that takes elided forms into account (at least for French), thanks to Olivier, the author of the open-source
Grammalecte.
Spoiler:
Code:
import regex
from calibre import replace_entities, prepare_string_for_xml
def replace(match, number, file_name, metadata, dictionaries, data, functions, *args, **kwargs):
    """Re-insert missing spaces between run-together words in the matched text.

    Each word-like token of ``match.group(1)`` that ``dictionaries`` does not
    recognize is split in two when either:

    * both halves of some split point are recognized by ``dictionaries``, or
    * the token ends with an elided form (``l'``, ``d'``, ``qu'``,
      ``jusqu'``, ...) glued to the preceding word, e.g. a token shaped
      like ``<word>l'<word>``.

    Only ``match`` and ``dictionaries`` are used; the remaining parameters
    are accepted but ignored.

    Returns the processed text, XML-escaped and wrapped as ``'>' + text + '<'``.
    NOTE(review): this presumably expects a search expression that captures
    the text between two tags (something like ``>([^<>]+)<``) -- confirm
    against the expression actually used with this function.
    """
    def fix_word(word_match):
        word = word_match.group()
        if dictionaries.recognized(word):
            return word
        # range() instead of the Python 2-only xrange(): same indices, and it
        # does not raise NameError under Python 3.
        # The bounds mean the second half always has at least two characters.
        for i in range(1, len(word) - 1):
            head, tail = word[:i], word[i:]
            if dictionaries.recognized(head) and dictionaries.recognized(tail):
                return head + ' ' + tail
        # No dictionary split worked: look for an elided form (letter or
        # conjunction + apostrophe) stuck to the end of the previous word.
        elided = regex.match(r"(\w+)((?:[dlnmts]|qu(?:oi|el)qu|puisqu|lorsqu|jusqu|qu)[’'`]\w+)", word)
        if elided:
            return elided.group(1) + " " + elided.group(2)
        return word

    # Decode entities first so words are matched on real characters, then
    # escape again before handing the text back.
    text = replace_entities(match.group(1))
    text = regex.sub(r"\b\w(?:[\w’'`-]*\w|\w+)\b", fix_word, text, flags=regex.VERSION1)
    text = prepare_string_for_xml(text)
    return '>' + text + '<'
or here:
http://pastebin.com/quGQQzcN