#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
from six.moves import range

__license__   = 'GPL v3'
__copyright__ = '2011, Rodrigo Coin Curvo. 2019, Thiago Oliveira'
__docformat__ = 'restructuredtext en'

from calibre.utils.icu import lower

# Common Portuguese function words (articles, prepositions, pronouns,
# conjunctions). clean_words() drops any word whose lower-cased form appears
# here, so they do not take part in the similarity score.
STOP_WORDS = ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na',
              'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou']


def clean_words(words):
    '''
    Return the entries of words that are not stop words, in their
    original order.

    Each word is lower-cased only for the lookup in STOP_WORDS; the words
    that are kept are returned unchanged.
    '''
    return [word for word in words if lower(word) not in STOP_WORDS]


def similarity(a, b):
    '''
    Return how alike a and b are, as a percentage between 0 and 100.

    The score is derived from the Levenshtein distance relative to the
    length of the longer input: 100 means the distance is zero (the inputs
    are equal) and 0 means the distance equals the longer length.

    Two empty inputs are reported as 100.0 (nothing differs) instead of
    raising ZeroDivisionError.
    '''
    longest = max(len(a), len(b))
    if not longest:
        # Both inputs are empty: they are identical, and the division by
        # the longer length below would otherwise fail.
        return 100.0
    return (longest - distance(a, b)) * 100 / longest

# Levenshtein distance by Magnus Lie Hetland (http://hetland.org/)
# I chose to use a Python implementation to avoid forcing the user
# to install an additional lib just for this plugin. Since the data
# size is relatively small, there should be no performance problem,
# even for a somewhat large number of books.


def distance(a, b):
    '''
    Return the Levenshtein (edit) distance between the sequences a and b.

    The result is the minimum number of single-item insertions, deletions
    and substitutions needed to turn one sequence into the other.
    '''
    # Keep the shorter sequence in ``a`` so that each row of the table
    # only needs min(len(a), len(b)) + 1 cells.
    if len(a) > len(b):
        a, b = b, a
    width = len(a)

    # Row 0: turning an empty prefix of ``b`` into each prefix of ``a``.
    row = list(range(width + 1))
    for i, item_b in enumerate(b, 1):
        above = row
        # Column 0 is the cost of deleting the first i items of ``b``.
        row = [i]
        for j, item_a in enumerate(a, 1):
            substitute = above[j - 1]
            if item_a != item_b:
                substitute += 1
            row.append(min(above[j] + 1, row[j - 1] + 1, substitute))

    return row[width]


def _best_match(word, candidates):
    '''
    Return the highest score of word against candidates: 100 as soon as an
    equal candidate is found, otherwise the largest similarity() seen
    (0 when there are no candidates).
    '''
    best = 0
    for candidate in candidates:
        if word == candidate:
            return 100
        best = max(similarity(word, candidate), best)
    return best


def words_similarity(list1, list2):
    '''
    Score how similar two lists of words are, from 0 to 100.

    Stop words are removed from both lists first; if either list ends up
    empty the score is 0. Otherwise every remaining word of list1 is scored
    against its best match in list2, the scores are averaged over the
    length of the longer cleaned list, and the average is squared (rescaled
    back to 0-100) and rounded to one decimal place.
    '''
    kept1, kept2 = clean_words(list1), clean_words(list2)
    if not (kept1 and kept2):
        return 0

    # TODO Improve algorithm, something formal and consider the word order

    total = sum(_best_match(word, kept2) for word in kept1)
    longest = max(len(kept1), len(kept2))
    w_sim = total * 100 / (longest * 100)

    # Squaring penalises partial matches more heavily than near-full ones.
    return round(w_sim * w_sim / 100, 1)

