#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

import sys
import os
import unicodedata


# field separator placed between the offset, the language code and the word
# in every entry produced by parse_text_into_words
_SEP_ = '|'

# run-on guard used by parse_text_into_words: once the distance from the start
# of the current run of word characters exceeds this value the run is flagged
# as an invalid word and is not emitted
_MAX_WORD_LEN = 90


def parse_text_into_words(wc, use_nums, lang, text, soffset):
    """Split text into words, tagging each with its offset and language.

    wc       -- string of extra characters that may join two word characters
                (the soft hyphen U+00AD is always added to it)
    use_nums -- passed through to _isBoundary; when true, numeric characters
                count as word characters as well as letters
    lang     -- language code copied verbatim into every entry
    text     -- the text to scan
    soffset  -- offset of text[0]; added to each word's position in text

    Returns a list of strings of the form "<offset><sep><lang><sep><word>"
    where <sep> is _SEP_.  Text between '&' and the matching ';' (an entity)
    is not emitted, and neither are runs that trip the _MAX_WORD_LEN guard.
    """
    # the soft hyphen is always allowed as an in-word joiner
    joiners = wc + chr(0x00ad)
    # one blank of padding on each side guarantees that the first and last
    # words are closed by a boundary; offsets below subtract 1 to undo it
    padded = ' ' + text + ' '

    found = []
    inside_entity = False
    too_long = False
    start = 0

    # (previous, current, next) character triples; both ends see a blank
    triples = zip(' ' + padded, padded, padded[1:] + ' ')
    for pos, (before, ch, after) in enumerate(triples):
        if _isBoundary(before, ch, after, joiners, use_nums):
            # an open entity survives this boundary only if it is its ';'
            inside_entity = inside_entity and ch == ';'
            if pos > start and not (too_long or inside_entity):
                # the word runs up to but not including position pos
                entry = "%d%c%s%c%s" % (soffset + start - 1, _SEP_, lang, _SEP_, padded[start:pos])
                found.append(entry)
            start = pos + 1
            too_long = False
        elif not too_long and (pos - start) > _MAX_WORD_LEN:
            # some crazy run-on text: ignore it until the next boundary
            too_long = True

        # entity tracking happens after the boundary handling so that the
        # closing ';' is still seen as part of the entity above
        if ch == '&':
            inside_entity = True
        elif ch == ';':
            inside_entity = False

    return found


def _isBoundary(prev_c, c, next_c, wc, use_nums):
    """Return True when character c ends (or lies outside of) a word.

    A valid word character (see _IsValidChar) is never a boundary.  A joiner
    character -- '-', U+2012, "'", U+2019 or anything contained in wc -- is
    not a boundary only when both prev_c and next_c are valid word
    characters.  Every other character is a boundary.
    """
    if _IsValidChar(c, use_nums):
        return False

    joiner = c in ('-', chr(0x2012), "'", chr(0x2019)) or (c in wc)
    left_valid = _IsValidChar(prev_c, use_nums)
    right_valid = _IsValidChar(next_c, use_nums)

    # a joiner glues a word together only when flanked by word characters
    # on both sides; in every other situation c separates words
    return not (joiner and left_valid and right_valid)


def _IsValidChar(c, use_nums):
    """Return True if the single character c can be part of a word.

    Characters in a Unicode letter category ('L*') always qualify; those in
    a number category ('N*') qualify only when use_nums is true.
    """
    accepted = ('L', 'N') if use_nums else ('L',)
    return unicodedata.category(c).startswith(accepted)
        

def getSpellingSafeText(word):
    """Return word with soft hyphens removed and smart apostrophes normalised.

    Every U+00AD (soft hyphen) is dropped and every U+2019 (right single
    quotation mark) is replaced by the plain apostrophe U+0027.
    """
    substitutions = {
        0x00ad: None,    # strip out soft hyphens
        0x2019: 0x27,    # replace smart apostrophe with normal one
    }
    return word.translate(substitutions)

