Since I'm on a roll, here's one for HTML; it might need a little adjustment.
Code:
import re
from hyphenate import hyphenate_word as hyphenate
def process_text(match, hyphenator=None):
    """Insert soft hyphens into the words of one matched HTML text run.

    Meant to be used as the replacement callback of ``re.sub``. Only the
    text captured by group 1 is rewritten; anything else the pattern
    consumed (for example the surrounding ``>`` and ``<``) is put back
    unchanged, so the markup around the text survives the substitution.

    Args:
        match: Regex match object whose group 1 holds the text to hyphenate.
        hyphenator: Optional callable that takes one word and returns a
            sequence of its pieces. Defaults to the module-level
            ``hyphenate``.

    Returns:
        The full matched text, with U+00AD (soft hyphen) joined between the
        pieces of every whitespace-delimited token of group 1.
    """
    if hyphenator is None:
        hyphenator = hyphenate

    def soften(word):
        # Join the pieces of a single token with the soft-hyphen character.
        return u'\u00ad'.join(hyphenator(word.group()))

    # Rebuild the whole match around group 1 instead of returning group 1
    # alone: re.sub replaces the *entire* match, so returning only the
    # captured text would delete the delimiters the pattern consumed.
    whole = match.group(0)
    offset = match.start(0)
    head = whole[:match.start(1) - offset]
    tail = whole[match.end(1) - offset:]

    # NOTE(review): tokens are split on whitespace only, so character
    # entities such as "&nbsp;" are passed to the hyphenator like ordinary
    # words — confirm the hyphenator leaves them intact.
    return head + re.sub(r'\S+', soften, match.group(1)) + tail
# Read the document as bytes and decode it once, so that all further
# processing happens on text rather than on a mix of bytes and unicode.
# The ``with`` block closes the file handle as soon as the read is done.
# NOTE(review): UTF-8 is assumed here — confirm the real encoding of 'file'.
with open('file', 'rb') as html_file:
    src = html_file.read().decode('utf-8')

# Run process_text on every run of text found between a '>' and the next
# '<'; tag names and attributes never match because they sit inside <...>.
result = re.sub(r'>([^><]+)<', process_text, src)