﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import, print_function)

import re
from collections import defaultdict

from calibre_plugins.overdrive_link.equivalent import (equivalents, key_phrase_lens)


__license__ = 'GPL v3'
__copyright__ = '2012-2022, John Howell <jhowell@acm.org>'


'''
Routines used for matching books by author and title
'''


# Words that may precede or follow a person's name (compared against lower-cased, punctuation-free words)
HONORIFIC_PREFIXES = {
    'mr', 'ms', 'mrs', 'dr', 'doctor', 'gen', 'general', 'lord', 'rep', 'sen', 'st', 'sir',
}
HONORIFIC_SUFFIXES = {
    'phd', 'md', 'ba', 'ma', 'dds', 'msts',
}

PERSON_PREFIXES = HONORIFIC_PREFIXES    # same set object, not a copy
PERSON_SUFFIXES = {'sr', 'senior', 'jr', 'junior', 'ii', 'iii', 'iv'} | HONORIFIC_SUFFIXES

MAX_MIDDLE_NAMES = 3


# ACCENTS and NONACCS are parallel strings: the character at index i of ACCENTS is replaced by the
# character at index i of NONACCS. Each line below holds the variants of one base letter.
ACCENTS = (
    "àáâãäăåāăą"
    "èéêëēĕėęě"
    "ìíîïĩīĭį"
    "òóôõöōŏőơ"
    "ùúûüũūŭůűų"
    "ñńņň"
    "ýÿƴ"
    "çćĉċč"
)
NONACCS = (
    "aaaaaaaaaa"
    "eeeeeeeee"
    "iiiiiiii"
    "ooooooooo"
    "uuuuuuuuuu"
    "nnnn"
    "yyy"
    "ccccc"
)

# matches any single character listed in ACCENTS
ACCENT_PATTERN = re.compile("[" + ACCENTS + "]")

UNKNOWN = 'Unknown'     # used by calibre for empty author or title

# author pattern options: the role assigned to each word of a parsed author name
A_FIRST, A_MIDDLE, A_LAST, A_OPTIONAL = range(4)


class Object(object):
    """Empty class; instances can be given arbitrary attributes."""


class SimplePattern(object):
    """Expand a restricted regular expression into the list of strings it can match.

    Only literal text, parenthesized groups, '|' alternation inside a group and a '?' directly
    after a closing parenthesis are interpreted. A leading '^' and a trailing '$' are dropped.
    """

    def __init__(self, p):
        # startswith/endswith (rather than p[0]/p[-1]) so that an empty pattern does not raise IndexError
        start = 1 if p.startswith('^') else 0
        end = -1 if p.endswith('$') else None

        # The capturing group makes re.split keep the delimiters as separate list items;
        # adjacent delimiters produce empty strings between them.
        self.sl = re.split(r'([()|?])', p[start:end])
        #print("***SL= %s" % str(self.sl))
        self.ln = len(self.sl)

    def generate_strings(self):
        """Return every string the pattern can produce.

        Raises Exception if the pattern cannot be fully parsed (for example an unmatched ')').
        """
        self.i = 0      # parse position within self.sl, shared with the recursive gen_alts calls
        alts = self.gen_alts()
        if self.i < self.ln:
            raise Exception('Incomplete parse')

        return alts

    def gen_alts(self):
        """Parse from self.i up to the next unmatched ')' or '|' (or the end) and return the expansions."""
        myalts = ['']

        while self.i < self.ln:
            s = self.sl[self.i]
            if s == ')' or s == '|':
                break       # end of this alternative; the caller deals with the delimiter

            if s != '(':
                if s:       # skip the empty strings left between adjacent delimiters
                    myalts = [alt + s for alt in myalts]

                self.i += 1
                continue

            # Parenthesized group: collect the expansions of each '|'-separated alternative
            newalts = []
            while s == '|' or s == '(':
                self.i += 1
                newalts.extend(self.gen_alts())

                if self.i >= self.ln:
                    raise Exception('Unexpected end of pattern')

                s = self.sl[self.i]

            if s != ')':
                raise Exception('Missing close paren')

            self.i += 1

            # step over the empty string that re.split leaves between ')' and a following delimiter
            if self.i < self.ln and self.sl[self.i] == '':
                self.i += 1

            if self.i < self.ln and self.sl[self.i] == '?':
                newalts.append('')      # optional group: also allow it to be absent
                self.i += 1

            # every prefix built so far combined with every expansion of the group
            myalts = [alt1 + alt2 for alt1 in myalts for alt2 in newalts]

        return myalts


def primary_author(authors):
    """Return the first author in the list, or UNKNOWN when the list is empty."""
    # calibre always supplies at least one author (possibly "Unknown"); discovered books might not.
    if len(authors) > 0:
        return authors[0]

    return UNKNOWN


UNKNOWN_L = UNKNOWN.lower()     # lower-case form of UNKNOWN, for values that were already converted to lower case


def is_unknown(s):
    """Return True if s is empty or is calibre's placeholder for a missing title or author."""
    # The calibre db stores a missing title as "Unknown" and a missing author as a single author "Unknown".
    if not s:
        return True

    # the value may already have been converted to lower case
    return s == UNKNOWN or s == UNKNOWN_L


def clean_list(s, remove_accents=True, remove_quotes=False):
    """Strip from s anything that could cause a false mismatch and return its individual words.

    remove_accents: replace each character found in ACCENTS with its NONACCS counterpart.
    remove_quotes: delete straight and smart single quotes; otherwise smart quotes become "'".
    The characters '(', ')' and '|' are kept and returned as separate items.
    """
    if remove_accents:
        def plain(match):
            return NONACCS[ACCENTS.index(match.group())]

        s = ACCENT_PATTERN.sub(plain, s)

    simple_rewrites = (
        (r'([0-9]),([0-9])', r'\1\2'),      # remove comma between digits
        (r'\bno\.', '#'),
        (r'\s&\s', ' and '),                # change '&' to 'and'
        (r'\sph\.d\.?$', ' phd'),           # remove punctuation in Ph.D.
    )
    for pattern, replacement in simple_rewrites:
        s = re.sub(pattern, replacement, s)

    if remove_quotes:
        s = re.sub(r"['‘’‛′]", "", s)
    else:
        s = re.sub(r"[‘’‛′]", "'", s)   # smart quotes

    # any other miscellaneous punctuation becomes white space
    s = re.sub(r"[^\w ()|']+", ' ', s, flags=re.UNICODE)

    # underscores count as alphanumeric for \w, so they survived the step above
    s = s.replace('_', ' ')

    # white space around pattern chars (used to delimit optional text) makes them separate words
    s = re.sub('([|()])', r' \1 ', s)

    return s.split()


def clean_list_title(s):
    """Return the normalized word list for a title.

    The title is lower-cased, colon-delimited parts are turned into parenthesized groups, the text
    is split by clean_list (quotes removed), key phrases found in `equivalents` are replaced by
    their equivalent words, and every resulting word is passed through singular().
    """
    s = s.lower()

    # convert 'series:: title' to '(series) title'
    while '::' in s:
        s = re.sub(r'(^|:)([^:]*)::', r'\1(\2)', s)

    # convert 'title: subtitle: subsub' to 'title (subtitle) (subsub)'
    while ':' in s:
        s = re.sub(r':([^:]*)(:|$)', r'(\1)\2', s)

    words = clean_list(s, remove_quotes=True)
    total = len(words)

    result = []
    pos = 0
    while pos < total:
        word = words[pos]
        replaced = False

        # key_phrase_lens gives the candidate phrase lengths for phrases starting with this word
        for phrase_len in key_phrase_lens.get(word, []):
            end = pos + phrase_len
            if end > total:
                continue

            replacement = equivalents.get(' '.join(words[pos:end]))
            if replacement is None:
                continue

            result.extend([singular(piece) for piece in replacement.split()])
            pos = end       # skip the whole phrase that was replaced
            replaced = True
            break

        if not replaced:
            result.append(singular(word))   # Simple un-pluralize
            pos += 1

    return result


def singular(s):
    """Simple un-pluralize: a trailing 'ies' becomes 'y', then a trailing 's' is dropped.

    The 's' is only dropped when preceded by two lower-case letters, the second not being 's'.
    """
    without_ies = re.sub(r"([a-z])ies$", r"\1y", s)
    return re.sub(r"([a-z][a-rt-z])s$", r"\1", without_ies)


def title_pattern(s):
    """Build an anchored regular expression string from the cleaned words of title s.

    Each word is followed by a single space. A parenthesized group with no '|' inside it is made
    optional (')?'). A ')' with no open group is ignored, and groups still open at the end are closed.
    """
    pieces = []
    alt_counts = []     # one entry per open group: the number of '|' seen directly inside it

    def close_group():
        # a group without alternatives is treated as optional text
        return ')' if alt_counts.pop() else ')?'

    for w in clean_list_title(s):
        if w == '(':
            alt_counts.append(0)
            pieces.append(w)

        elif w == ')':
            if alt_counts:
                pieces.append(close_group())

        elif w == '|':                  # not currently allowed
            if not alt_counts:
                alt_counts.append(0)
                pieces.insert(0, '(')   # implied start at beginning of string

            alt_counts[-1] += 1
            pieces.append(w)

        else:
            pieces.append(w + ' ')

    while alt_counts:
        pieces.append(close_group())

    return '^' + ''.join(pieces) + '$'


def gen_all_titles(title, series):
    """Return every cleaned title string that the title (with its series) can be matched as.

    title may hold several alternates separated by '|'; each is combined with the series,
    converted to a pattern and expanded into all of the strings that pattern produces.
    """
    expansions = []

    for alternate in title.split('|'):
        pattern = title_pattern(series_title(alternate, series))
        expansions += SimplePattern(pattern).generate_strings()

    # debugging aid:
    #if title.lower().startswith("george r.r. martin's wild cards"):
    #    print('***GEN LIST FOR: "%s"' % title)
    #    for t in expansions:
    #        print('    "%s"' % t)

    return expansions


DELIMS = ['(', ')', '|']    # pattern characters that clean_list returns as separate words


def title_clean_str(s):
    """Return the cleaned words of title s joined by single spaces, leaving out the pattern characters."""
    kept = []
    for word in clean_list_title(s):
        if word not in DELIMS:
            kept.append(word)

    return ' '.join(kept)


def is_same_title(title1, title2):
    """Return True if the two titles are identical once both have been cleaned."""
    first = title_clean_str(title1)
    second = title_clean_str(title2)
    return first == second


def series_title(title, series):
    """Return 'series:: title', or title unchanged if there is no series or it already contains '::'."""
    if not series or '::' in title:
        return title

    # colons are removed from the series name so that it cannot add further delimiters
    return '%s:: %s' % (series.replace(':', ''), title)


def is_same_calibre_title(calibre_title, calibre_series, library_title, library_series):
    """Return True if the calibre title and the library title match, allowing for series and optional text.

    The titles match if they are equal as given, or if the pattern built from either book's
    series-qualified title matches the cleaned title (with or without series) of the other book.
    Always returns a bool.
    """
    if calibre_title == library_title:
        return True     # no need to build any patterns

    calibre_series_title = series_title(calibre_title, calibre_series)
    library_series_title = series_title(library_title, library_series)

    calibre_pattern = title_pattern(calibre_series_title)
    library_pattern = title_pattern(library_series_title)

    comparisons = (
        (calibre_pattern, library_title),
        (calibre_pattern, library_series_title),
        (library_pattern, calibre_title),
        (library_pattern, calibre_series_title),
    )

    # Every word in a pattern is followed by a space, so a trailing space is added to the cleaned text.
    # The generator keeps the evaluation lazy: it stops at the first match.
    return any(re.match(pattern, title_clean_str(text) + ' ') for pattern, text in comparisons)


def is_any_same_calibre_title(calibre_title, calibre_series, library_title, library_series):
    """Return True if any '|'-separated alternate of the calibre title matches the library title."""
    return any(
        is_same_calibre_title(alternate, calibre_series, library_title, library_series)
        for alternate in calibre_title.split('|'))


def clean_author_list(author, remove_accents=True):
    """Return the cleaned, lower-case words of an author name in first-to-last order.

    An unknown author gives an empty list. A name in "Last, First" form is reordered, but a comma
    that only introduces a suffix ("First Last, Jr") is removed first so it is not mistaken for that form.
    """
    if is_unknown(author):
        return []

    # the trailing space lets a suffix at the very end of the name be matched as ', jr '
    name = author.lower() + ' '

    for suffix in PERSON_SUFFIXES:
        for dot in ('', '.'):           # ', Jr' then ', Jr.'
            tail = ' ' + suffix + dot + ' '
            name = name.replace(',' + tail, tail)

    name = re.sub(r'[()|]', '', name).strip()

    if ',' in name:
        last, _, first = name.partition(',')
        name = first + ' ' + last

    return clean_list(name, remove_accents)


def author_clean_str(s, remove_accents=True):
    """Return the cleaned words of author name s joined by single spaces."""
    words = clean_author_list(s, remove_accents)
    return ' '.join(words)


def author_pattern(a_clean_list):
    """Return an anchored regular expression string that matches variations of a cleaned author name.

    a_clean_list is the word list of the name. When the name contains a run of two or more
    consecutive first/middle initials, an alternate form with those initials combined into a
    single required word is also accepted.
    """
    parsed_list = author_parse(a_clean_list)

    # see if can produce an alternate pattern with initials combined
    have_alt = False
    alt_list = []
    pending = []    # current run of consecutive first/middle initials not yet added to alt_list

    for w, a in parsed_list:
        if (a == A_FIRST or a == A_MIDDLE) and len(w) == 1:
            pending.append((w, a))
            continue

        if len(pending) > 1:
            combined = ''.join(initial for initial, _ in pending)
            alt_list.append((combined, A_LAST))     # set combined initials as required
            have_alt = True
        else:
            # A lone initial cannot be combined: keep it as it was instead of dropping it
            alt_list.extend(pending)

        pending = []
        alt_list.append((w, a))

    if have_alt:
        #print("have alt: ^(%s)|(%s)$" % (author_pattern2(parsed_list), author_pattern2(alt_list)))
        # '|' binds more loosely than '^' and '$', so both alternatives are wrapped in one group;
        # otherwise each alternative would be anchored on one side only. The outer group is
        # non-capturing so the numbering of the existing groups is unchanged.
        return "^(?:(%s)|(%s))$" % (author_pattern2(parsed_list), author_pattern2(alt_list))

    return "^%s$" % author_pattern2(parsed_list)


def author_pattern2(parsed_list):
    """Return the (unanchored) regular expression text for a list of (word, option) pairs.

    Raises Exception if an option is not one of the A_* values.
    """
    pieces = []

    for word, option in parsed_list:
        if option == A_LAST:
            piece = "%s " % word            # required
        elif option == A_OPTIONAL:
            piece = "(%s )?" % word         # optional
        elif option == A_FIRST or option == A_MIDDLE:
            # full name or just its initial; a single letter has no separate initial form
            if len(word) == 1:
                piece = "(%s )" % word
            else:
                piece = "(%s |%s )" % (word, word[0])

            if option == A_MIDDLE:
                piece += "?"                # middle names are optional, first names are required
        else:
            raise Exception("Unexpected author pattern list option: %d" % option)

        pieces.append(piece)

    return ''.join(pieces)


def author_parse(cl):
    """Classify the words of a cleaned author name.

    cl is the list of words in first-to-last order; it is not modified.
    Returns a list of (word, option) pairs where option is one of A_FIRST, A_MIDDLE, A_LAST or
    A_OPTIONAL. Returns an empty list if the name has more than MAX_MIDDLE_NAMES middle names.
    """
    words = list(cl)    # work on a copy so that the caller's list is left intact

    if len(words) == 1:
        return [(words[0], A_LAST)]             # single name

    pl = []

    # leading prefixes and trailing suffixes are optional, but at least one word is always left as the name
    while len(words) > 1 and words[0] in PERSON_PREFIXES:
        pl.append((words.pop(0), A_OPTIONAL))

    suffixes = []
    while len(words) > 1 and words[-1] in PERSON_SUFFIXES:
        suffixes.insert(0, (words.pop(), A_OPTIONAL))

    if len(words) > 1:
        pl.append((words.pop(0), A_FIRST))      # first name or initial

    if words:
        middles = words[:-1]
        if len(middles) > MAX_MIDDLE_NAMES:
            return []       # failure

        pl.extend((w, A_MIDDLE) for w in middles)   # middle names or initials
        pl.append((words[-1], A_LAST))              # last name

    pl.extend(suffixes)

    return pl


author_cache = {}   # author string as given --> (cleaned author string, compiled author pattern)


def author_info(a):
    """Return the (cleaned string, compiled pattern) pair for author a, computing it once per distinct value."""
    cached = author_cache.get(a)
    if cached is not None:
        return cached

    words = clean_author_list(a)
    cleaned = ' '.join(words)       # built before the word list is handed to author_pattern()
    compiled = re.compile(author_pattern(words))

    info = (cleaned, compiled)
    author_cache[a] = info
    return info


def is_same_author(a1, a2, author_match_equivalents):
    """Return True if the two author names are considered to be the same person.

    author_match_equivalents may be None, or a mapping from a cleaned author name to a container of
    the cleaned names it is equivalent to. When either cleaned name is a key of that mapping, the
    mapping alone decides the result. An unknown author never matches. Always returns a bool.
    """
    if is_unknown(a1) or is_unknown(a2):
        return False

    a1_clean, a1_pattern = author_info(a1)
    a2_clean, a2_pattern = author_info(a2)

    if a1_clean == a2_clean:
        return True

    # relies on author_clean_str being equivalent to match_prep
    if author_match_equivalents is not None:
        if a1_clean in author_match_equivalents:
            return a2_clean in author_match_equivalents[a1_clean]

        if a2_clean in author_match_equivalents:
            return a1_clean in author_match_equivalents[a2_clean]

    # Every word in an author pattern is followed by a space, so a trailing space is added to the name.
    # bool() keeps the match object from leaking out as the result of a predicate.
    return bool(a1_pattern.match(a2_clean + ' ') or a2_pattern.match(a1_clean + ' '))


def is_any_same_author(calibre_authors, library_authors, config, allow_unknown=False):
    """Return True if any calibre author matches any library author.

    All author names are assumed to be in first/last name order with no comma.
    With allow_unknown, a calibre book having several authors is taken to match a library book
    whose primary author is unknown.
    """
    if allow_unknown and len(calibre_authors) > 1:
        if is_unknown(primary_author(library_authors)):
            # Freading lists no author for multi-author books. Assume a match.
            return True

    # is_same_author caches per-author work, since the same comparisons tend to be done multiple times
    return any(
        is_same_author(calibre_author, library_author, config.author_match_equivalents)
        for calibre_author in calibre_authors
        for library_author in library_authors)


def is_same_book(book1, book2, config):
    """Return whether the two books have the same cleaned title and at least one author in common."""
    if not is_same_title(book1.title, book2.title):
        return False

    return is_any_same_author(book1.authors, book2.authors, config, allow_unknown=True)


def is_same_calibre_book(calibre_book, library_book, config):
    """Return whether a calibre book and a library book match by title (allowing for series) and by author."""
    titles_match = is_any_same_calibre_title(
        calibre_book.title, calibre_book.series, library_book.title, library_book.series)

    if not titles_match:
        return False

    return is_any_same_author(calibre_book.authors, library_book.authors, config, allow_unknown=True)


def author_match_prep(author):
    """Return the author name in the cleaned form used when comparing authors."""
    prepared = author_clean_str(author)
    return prepared


def exact_book_key(book):
    """Return a hashable key for exact matching: the lower-cased title followed by the authors as given.

    book.authors may be a list, a tuple or any other iterable of author names.
    """
    # tuple() accepts any iterable; concatenating a list with a non-list would raise TypeError
    return (book.title.lower(),) + tuple(book.authors)


def match_book_lists(l1, l2, config, progress=None, do_exact_match=False, first_is_calibre_books=False):
    """Return a list of (l1 book, l2 book) tuples of matching books from two lists of books.

    do_exact_match: title and authors must match exactly (see exact_book_key).
    Otherwise having a link in common is a priority match, and in addition:
      first_is_calibre_books: fuzzy match taking into account series, alternates, etc.
      otherwise the cleaned titles must match (and an author must match).

    progress, if given, must provide setMaximum, setValue and wasCanceled; an empty list is
    returned as soon as wasCanceled() is true. Priority matches are listed first, each group
    sorted, with no duplicates. A book is never matched with itself.
    """
    count = 0
    if progress:
        progress.setMaximum(len(l1) + len(l2))

    # Index the first list so that each book of the second list only has to be compared with likely candidates
    l1_titles = defaultdict(set)
    l1_link_keys = defaultdict(set)
    l1_book_keys = defaultdict(set)

    for l1book in l1:
        if do_exact_match:
            l1_book_keys[exact_book_key(l1book)].add(l1book)
        else:
            for key in l1book.book_link_keys(config):
                l1_link_keys[key].add(l1book)

            if first_is_calibre_books:
                for title in gen_all_titles(l1book.title, l1book.series):
                    l1_titles[title].add(l1book)
            else:
                l1_titles[title_clean_str(l1book.title)].add(l1book)

        if progress:
            count += 1
            progress.setValue(count)
            if progress.wasCanceled():
                return []

    regular_matches = set()
    priority_matches = set()

    # The indexes are only read from here on. They are queried with .get() because indexing a
    # defaultdict would insert a new empty set for every key that is not found.
    for l2book in l2:
        if do_exact_match:
            for l1book in l1_book_keys.get(exact_book_key(l2book), ()):
                if l1book is not l2book:
                    priority_matches.add((l1book, l2book))
        else:
            for key in l2book.book_link_keys(config):
                for l1book in l1_link_keys.get(key, ()):
                    if l1book is not l2book:
                        priority_matches.add((l1book, l2book))

            if first_is_calibre_books:
                for title in gen_all_titles(l2book.title, l2book.series):
                    for l1book in l1_titles.get(title, ()):
                        if (l1book is not l2book) and is_same_calibre_book(l1book, l2book, config):
                            regular_matches.add((l1book, l2book))
            else:
                for l1book in l1_titles.get(title_clean_str(l2book.title), ()):
                    if (l1book is not l2book) and is_same_book(l1book, l2book, config):
                        regular_matches.add((l1book, l2book))

        if progress:
            count += 1
            progress.setValue(count)
            if progress.wasCanceled():
                return []

    #if not do_exact_match:
    #    for missing in sorted(priority_matches - regular_matches):
    #        print('*****missing match: %s' % str(missing[0]))
    #        print('              with: %s' % str(missing[1]))

    # priority matches first, no duplicates
    matches = sorted(priority_matches)
    matches.extend(sorted(regular_matches - priority_matches))

    return matches


def gen_all_title_words(title, series=''):
    """Return the cleaned words of the title, preceded by those of the series if one is given."""
    combined = series + ' ' + title if series else title

    words = []
    for word in clean_list_title(combined):
        if word not in DELIMS:      # leave out the pattern characters
            words.append(word)

    return words


def match_single_book_to_list(book1, l2, config, progress=None, max_matches=40):
    """Return up to max_matches books from l2 that match book1, in decreasing order of strength of match.

    The strength is the total length of the title words the books have in common. A shared link
    overrides that with a high value, and an identical (non-None) id with the highest value.
    Books with no strength at all are left out; ties are ordered by lower-cased title.
    An empty list is returned if progress reports that the operation was canceled.
    """
    if progress:
        count = 0
        progress.setMaximum(len(l2))

    book_words = set(gen_all_title_words(book1.title))
    book1_link_keys = book1.book_link_keys(config)

    ranking = {}    # candidate book --> sort key (negated strength, lower-case title)

    for candidate in l2:
        shared_words = book_words & set(gen_all_title_words(candidate.title, candidate.series))
        strength = sum(len(word) for word in shared_words)

        if not book1_link_keys.isdisjoint(candidate.book_link_keys(config)):
            strength = 9999     # matching links are high priority

        if (book1.id is not None) and (book1.id == candidate.id):
            strength = 99999    # if already matched then highest priority

        if strength > 0:
            ranking[candidate] = (-strength, candidate.title.lower())

        if progress:
            count += 1
            progress.setValue(count)
            if progress.wasCanceled():
                return []

    ordered = sorted(ranking.items(), key=lambda m: m[1])

    return [book for book, _ in ordered[:max_matches]]


'''
def slow_match_book_lists(l1, l2, config, progress=None, do_exact_match=False, first_is_calibre_books=False):

    if progress:
        count = 0
        progress.setMaximum(len(l1) + len(l2))

    l1_titles = defaultdict(set)
    l1_link_keys = defaultdict(set)
    l1_book_keys = defaultdict(set)

    for l1book in l1:
        if do_exact_match:
            l1_book_keys[exact_book_key(l1book)].add(l1book)
        else:
            for key in l1book.book_link_keys(config):
                l1_link_keys[key].add(l1book)

            for title in gen_all_title_words(l1book.title, l1book.series):
                l1_titles[title].add(l1book)

        if progress:
            count += 1
            progress.setValue(count)
            if progress.wasCanceled():
                return []

    #maxes = sorted([(len(l1_titles[w]), w) for w in l1_titles], reverse=True)
    #print('Most common title words in first list:')
    #for i in range(min(len(maxes), 10)):
    #    print('   "%s" (%d)' % (maxes[i][1], maxes[i][0]))


    regular_matches = set()
    priority_matches = set()

    for l2book in l2:
        if do_exact_match:
            for l1book in l1_book_keys.get(exact_book_key(l2book), []):
                if l1book is not l2book:
                    priority_matches.add((l1book, l2book))
        else:
            for key in l2book.book_link_keys(config):
                for l1book in l1_link_keys[key]:
                    if l1book is not l2book:
                        priority_matches.add((l1book, l2book))

            checked = set()
            for title in gen_all_title_words(l2book.title, l2book.series):
                for l1book in l1_titles[title]:
                    if (l1book is not l2book) and (l1book not in checked):
                        checked.add(l1book)
                        if first_is_calibre_book:
                            if is_same_calibre_book(l1book, l2book, config):
                                regular_matches.add((l1book, l2book))
                        else:
                            if is_same_book(l1book, l2book, config):
                                regular_matches.add((l1book, l2book))


        if progress:
            count += 1
            progress.setValue(count)
            if progress.wasCanceled():
                return []

    # priority matches first, no duplicates
    matches = sorted(list(priority_matches))
    matches.extend(sorted(list(regular_matches - priority_matches)))

    return matches


def very_slow_match_book_lists(l1, l2, config, progress=None, do_exact_match=False, first_is_calibre_book=False):
    # full book-book matching for testing
    if progress:
        count = 0
        progress.setMaximum(len(l1) + len(l2))

    regular_matches = set()
    priority_matches = set()

    for l1book in l1:
        for l2book in l2:
            if l1book is not l2book:
                if do_exact_match:
                    if exact_book_key(l1book) == exact_book_key(l2book):
                        priority_matches.add((l1book, l2book))
                else:
                    if not l1book.book_link_keys(config).isdisjoint(l2book.book_link_keys(config)):
                        priority_matches.add((l1book, l2book))

                    if first_is_calibre_book:
                        if is_same_calibre_book(l1book, l2book, config):
                            regular_matches.add((l1book, l2book))
                    else:
                        if is_same_book(l1book, l2book, config):
                            regular_matches.add((l1book, l2book))


        if progress:
            count += 2
            progress.setValue(count)
            if progress.wasCanceled():
                return []


    # priority matches first, no duplicates
    matches = sorted(list(priority_matches))
    matches.extend(sorted(list(regular_matches - priority_matches)))

    return matches


TEST_LIST_SIZE_LIMIT = 2000

def match_book_lists(l1, l2, config, progress=None, do_exact_match=False, first_is_calibre_book=False):
    m1 = match_book_lists_(l1, l2, config, progress, do_exact_match, first_is_calibre_book)

    if len(l1) <= TEST_LIST_SIZE_LIMIT or len(l2) <= TEST_LIST_SIZE_LIMIT:
        print('*****checking results against slow match')
        m2 = slow_match_book_lists(l1, l2, config, progress, do_exact_match, first_is_calibre_book)

        if m1 != m2:
            print('*****match_book_lists failure:')

            for m in sorted(list(m2 - m1)):
                print('   Missing: %s == %s' % (str(m[0]), str(m[1])))

            for m in sorted(list(m1 - m2)):
                print('   Extra: %s == %s' % (str(m[0]), str(m[1])))

    return m1
'''
