﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2016, John Howell <jhowell@acm.org>'
__docformat__ = 'restructuredtext en'

import re
from collections import defaultdict

from calibre_plugins.overdrive_link.equivalent import (equivalent_words, equivalent_multi_words)
from calibre_plugins.overdrive_link.titlecase import titlecase

'''
Routines used for comparing books between calibre and lending libraries
'''

# Feature switches for the matching heuristics
USE_ISBN_FOR_MATCH = True   # same_book() accepts an equal, non-empty ISBN as a match on its own
USE_SUBTILE_WHEN_TITLE_IS_SHORT = True  # compare_titles() may fold a short subtitle into a short main title


# Lower case name parts that are treated as titles rather than names when parsing authors
HONORIFIC_PREFIXES = {'mr', 'ms', 'mrs', 'dr', 'doctor', 'gen', 'general', 'lord', 'rep', 'sen', 'st', 'sir'}
HONORIFIC_SUFFIXES = {'phd', 'md', 'ba', 'ma', 'dds', 'msts'}

PERSON_PREFIXES = HONORIFIC_PREFIXES
PERSON_SUFFIXES = {'sr', 'senior', 'jr', 'junior', 'ii', 'iii', 'iv'} | HONORIFIC_SUFFIXES

# union of the above; author_list_of_clean() checks the first word of a name against this set
PERSON_TITLES = PERSON_PREFIXES | PERSON_SUFFIXES

INITIALS = set('abcdefghijklmnopqrstuvwxyz')   # set of individual letters

NOISE_WORDS = {'a', 'an', 'the', 'of', 'tm'}   # ignored in matching titles (dropped by title_list)

UNKNOWN = 'Unknown' # used by calibre for empty author or title

# author list options: second element of each (word, option) pair built by author_list_of_clean()
A_REQUIRED = 0
A_OPTIONAL = 1
A_NAME_OR_INITIAL = 2   # evaluates to true as an optional word for fast matching


# titles that should not be allowed on their own
# (prepared for matching as INCOMPLETE_TITLELISTS and checked by is_incomplete_title)
INCOMPLETE_TITLES = frozenset((
    "Apex Magazine",
    "Fantasy The Best of the Year",
    "Galaxy's Edge Magazine",
    "Insight Guide",
    "Insight Guides",
    "Science Fiction The Best of the Year",
    "The Best From Fantasy And Science Fiction",
    "The Best Horror of the Year",
    "The Best Science Fiction of the Year",
    "The Best Science Fiction And Fantasy of the Year",
    "The Collected Stories Of Arthur C. Clarke",
    "The Years Best Fantasy And Horror",
    "The Years Best Science Fiction",
    "Years Best Fantasy",
    "Years Best Weird Fiction",
    "Terry Carr'S Best Science Fiction And Fantasy of the Year",
    "The Year's Best Dark Fantasy & Horror",
    "The Year's Best Science Fiction & Fantasy",
    "Years Best SF",
    ))

# phrases that clean_list_title() wraps in parentheses so that they become optional title text
NOISE_TITLES = [
    'a novel',
    'a prequel',
    'a short story',
    'with bonus material',
    'with linked table of contents'
    ]
  
  
class Object(object):
    '''Bare attribute container; callers attach whatever fields they need.'''
    
    
def clean_list(s, remove_accents=True):
    '''
    Normalize a string for comparison and split it into individual words.

    Anything that could cause a false mismatch is removed: accented lower case
    letters are optionally folded to plain letters, smart quotes become plain
    apostrophes and punctuation other than colon, parentheses and apostrophe
    is replaced by white space. Colons and parentheses come back as separate
    words because callers use them to delimit optional text.

    Returns a list of words, or an empty string (not a list) when s is the
    "Unknown" marker.
    '''
    if is_unknown(s):
        return ''

    if remove_accents:
        # the patterns only cover lower case letters
        for accented, plain in (
                ('[ìíîïĩīĭį]', 'i'),
                ('[èéêëēĕėęě]', 'e'),
                ('[àáâãäăåāăą]', 'a'),
                ('[ùúûüũūŭůűų]', 'u'),
                ('[òóôõöōŏőơ]', 'o'),
                ('[çćĉċč]', 'c'),
                ('[ýÿƴ]', 'y'),
                ('[ñńņň]', 'n'),
                ):
            s = re.sub(accented, plain, s)

    for pattern, replacement in (
            (r"[‘’‛′]", "'"),                   # remove smart quotes
            (r'([0-9]),([0-9])', r'\1\2'),      # remove comma between digits
            (r'\s&\s', ' and '),                # change '&' to 'and'
            (r'\sph\.d\.?$', ' phd'),           # remove punctuation in Ph.D.
            ):
        s = re.sub(pattern, replacement, s)

    # remove any other miscellaneous punctuation
    s = re.sub(r"[^\w :()']+", ' ', s, flags=re.UNICODE)

    # underscores count as alphanumeric for \w so they need their own pass
    s = re.sub(r"_", ' ', s)

    # convert 'series:: title' to '(series) title'
    s = re.sub(r'^(.*)::', r'(\1)', s)

    # white space around parens and colon makes them words of their own
    s = re.sub('([:()])', r' \1 ', s)

    return s.split()
    
    
    
def clean_list_title(s):
    '''
    Prepare a title for matching and return it as a list of words.

    The title is lower cased, apostrophes and smart quotes are dropped and
    each NOISE_TITLES phrase is wrapped in parentheses to mark it as optional
    before the text is handed to clean_list.
    '''
    lowered = re.sub(r"['‘’‛′]", "", s.lower())

    for phrase in NOISE_TITLES:
        # parenthesized text is treated as optional by title_list
        lowered = re.sub(r'\b%s\b' % phrase, '(%s)' % phrase, lowered)

    return clean_list(lowered)
    


def title_list(s):
    '''
    Convert a title into a list of (word, level) tuples for matching.

    Level zero marks a required word and any other level an optional one.
    Text inside parentheses adds one to the level and everything following a
    colon (the subtitle) adds ten. Words in NOISE_WORDS are dropped. If every
    remaining word would be optional, the levels are shifted down so that the
    lowest level words become required.
    '''
    words = []
    depth = 0   # zero is required, non-zero is optional

    for token in clean_list_title(s):
        if token in NOISE_WORDS:
            continue

        if token == ':':
            depth += 10     # rest of string is optional subtitle
        elif token == '(':
            depth += 1      # anything in parenthesis is optional
        elif token == ')':
            depth = max(depth - 1, 0)   # ignore an unbalanced close
        else:
            words.append((token, depth))

    if words:
        lowest = min(level for _, level in words)
        if lowest > 0:
            # all words are optional - fix so lowest level words are required
            words = [(word, level - lowest) for word, level in words]

    return words

        
def clean_list_author(author, remove_accents=True):
    '''
    Prepare an author name for matching and return it as a list of words.

    The name is lower cased. A comma in front of a personal suffix
    ("First Last, Jr") is removed first so that any remaining comma can be
    taken to mean "Last, First" order, which is then swapped.
    '''
    name = author.lower() + ' '     # trailing space lets a suffix at the end match

    for suffix in PERSON_SUFFIXES:
        for tail in (' ', '. '):    # ", Jr" and ", Jr."
            name = name.replace(', ' + suffix + tail, ' ' + suffix + tail)

    name = name.strip()

    last, comma, first = name.partition(',')
    if comma:
        name = first + ' ' + last

    return clean_list(name, remove_accents)
        

def author_list(a):
    '''Return the (word, option) list for an author name given as a string.'''
    cleaned = clean_list_author(a)
    return author_list_of_clean(cleaned)
    

def author_list_of_clean(cl):
    '''
    Tag each word of an already cleaned author name with a matching option.

    Returns a list of (word, option) tuples. The last name (the word before a
    trailing suffix, if there is one) is A_REQUIRED, a trailing suffix and a
    leading title are A_OPTIONAL and every other word is A_NAME_OR_INITIAL.
    A single word name is always required. Only the last name is required so
    that fast name matching will work properly.
    '''
    count = len(cl)

    if count == 0:
        return []   # not expected

    if count == 1:
        return [(cl[0], A_REQUIRED)]

    options = [A_NAME_OR_INITIAL] * count

    if cl[-1] in PERSON_SUFFIXES:
        options[-1] = A_OPTIONAL
        options[-2] = A_REQUIRED
    else:
        options[-1] = A_REQUIRED

    # a leading title is optional unless that word was already made required above
    if options[0] == A_NAME_OR_INITIAL and cl[0] in PERSON_TITLES:
        options[0] = A_OPTIONAL

    return list(zip(cl, options))
 
        
def name_initial_match(w1, w2):
    '''Return True if either word is a single letter equal to the first letter of the other.'''
    for initial, name in ((w1, w2), (w2, w1)):
        if len(initial) == 1 and len(name) >= 1 and initial == name[0]:
            return True

    return False
    
    
def opt_list(words, optional):
    '''Pair every word with the same optional flag, returning a list of (word, optional) tuples.'''
    pairs = []
    for word in words:
        pairs.append((word, optional))

    return pairs
    
    
'''    
def _fuzzy_compare_title_lists(l1, l2):
    result = fuzzy_compare_title_lists(l1, l2)
    
    print('compare title: "%s" %s "%s"'%(
        word_list_string(l1),
        '==' if result else '!=',
        word_list_string(l2)))
    
    return result
'''


def fuzzy_compare_author_lists(l1, l2, allow_l1_initial=True, allow_l2_initial=True,
                                allow_l1_missing=False, allow_l2_missing=False):
    '''
    Recursively compare two author names given as lists of (word, option) tuples
    (as produced by author_list_of_clean) and return True if they match.

    Prefixes and suffixes are optional
    The last name (before suffix if present) is required and all other name parts optional
    First name must match initial/name in order
    Middle names are optional, but if present must match initial/name in order
    If any initial is matched with a name they all must match with a name (or same initial)

    allow_l1_initial/allow_l2_initial: whether a single letter word in that list may
    still be matched as the initial of a name in the other list. Once an initial from
    one list has been matched this way the recursion turns the flag off for the other list.

    allow_l1_missing/allow_l2_missing: whether a name-or-initial word in that list may
    be skipped with no counterpart in the other list. Both are turned on after an exact
    match of two non-optional words; once a word has been skipped from one list only
    that list may have further words skipped.

    name generation for searching needs to change to have its own rules???
    '''

    # One list exhausted: a match only if everything left in the other is strictly optional
    if len(l1) == 0:
        for (w2,w2_optional) in l2:
            if w2_optional != A_OPTIONAL:
                return False

        return True

    if len(l2) == 0:
        for (w1,w1_optional) in l1:
            if w1_optional != A_OPTIONAL:
                return False

        return True


    # Compare the leading word of each list, trying each allowed alternative in turn
    w1, w1_optional = l1[0]
    w2, w2_optional = l2[0]

    if w1 == w2:
        # exact match, skip these words

        if (w1_optional != A_OPTIONAL) and (w2_optional != A_OPTIONAL) and (not allow_l1_missing) and (not allow_l2_missing):
            # allow missing words only after an exact match
            if fuzzy_compare_author_lists(l1[1:], l2[1:], allow_l1_initial, allow_l2_initial, True, True):
                return True
        else:
            if fuzzy_compare_author_lists(l1[1:], l2[1:], allow_l1_initial, allow_l2_initial, allow_l1_missing, allow_l2_missing):
                return True

    # match names with initials (except for last name). Can't match "John J." with "J. Jim", but both match "J. J."
    elif w1_optional == A_NAME_OR_INITIAL and w2_optional == A_NAME_OR_INITIAL:
        if allow_l1_initial and len(w1) == 1 and w1 == w2[0]:
            # l1 supplied the initial, so from here on only l1 may do so
            if fuzzy_compare_author_lists(l1[1:], l2[1:], True, False, allow_l1_missing, allow_l2_missing):
                return True

        if allow_l2_initial and len(w2) == 1 and w2 == w1[0]:
            # l2 supplied the initial, so from here on only l2 may do so
            if fuzzy_compare_author_lists(l1[1:], l2[1:], False, True, allow_l1_missing, allow_l2_missing):
                return True

    # See if there is a match without optional words. Two different optional words don't match: "Lord" != "Mr"

    if w1_optional == A_OPTIONAL and w2_optional != A_OPTIONAL:
        if fuzzy_compare_author_lists(l1[1:], l2, allow_l1_initial, allow_l2_initial, allow_l1_missing, allow_l2_missing):
            return True

    if w2_optional == A_OPTIONAL and w1_optional != A_OPTIONAL:
        if fuzzy_compare_author_lists(l1, l2[1:], allow_l1_initial, allow_l2_initial, allow_l1_missing, allow_l2_missing):
            return True

    # See if part of a name can be missing from one or the other: "John Smith" = "John James Smith"

    if allow_l1_missing and (w1_optional == A_NAME_OR_INITIAL):
        # skip a word of l1; afterwards only l1 may have further words skipped
        if fuzzy_compare_author_lists(l1[1:], l2, allow_l1_initial, allow_l2_initial, True, False):
            return True


    if allow_l2_missing and (w2_optional == A_NAME_OR_INITIAL):
        # skip a word of l2; afterwards only l2 may have further words skipped
        if fuzzy_compare_author_lists(l1, l2[1:], allow_l1_initial, allow_l2_initial, False, True):
            return True

    return False
    
    

def fuzzy_compare_title_lists(l1, l2, allow_partial):
    '''
    Recursively compare two titles given as lists of (word, optional) tuples.

    Returns False when the titles do not match. On a match returns a tuple
    (remaining_l1, remaining_l2) holding the number of words left unmatched at
    the end of each list; the tuple is never empty so it always tests true.

    With allow_partial set, l2 may have required words left over once l1 has
    been fully matched. Words match if equal, if they differ by a trailing
    "s", if listed in equivalent_words/equivalent_multi_words, or if one is
    the concatenation of two adjacent words of the other. Optional words may
    be skipped.
    '''
    if len(l1) == 0:
        if not allow_partial and any(not optional for _, optional in l2):
            return False

        return (0, len(l2))  # True

    if len(l2) == 0:
        if any(not optional for _, optional in l1):
            return False

        return (len(l1), 0)  # True

    # both lists are known to be non-empty from here on
    w1, w1_optional = l1[0]
    w2, w2_optional = l2[0]

    def starts_with(words, phrase):
        # True if the leading words of the list spell out the phrase
        return len(words) >= len(phrase) and all(words[i][0] == w for i, w in enumerate(phrase))

    def alternatives():
        # Lazily produce, in priority order, each pair of remaining lists worth trying
        if w1 == w2:
            yield l1[1:], l2[1:]    # exact match, skip these words
        else:
            if (w1+'s' == w2) or (w1 == w2+'s') or ((w1 in equivalent_words) and (w2 in equivalent_words[w1])):
                yield l1[1:], l2[1:]

            # a single word in one is equivalent to multiple words in the other
            if w1 in equivalent_multi_words:
                for phrase in equivalent_multi_words[w1]:
                    if starts_with(l2, phrase):
                        yield l1[1:], l2[len(phrase):]

            if w2 in equivalent_multi_words:
                for phrase in equivalent_multi_words[w2]:
                    if starts_with(l1, phrase):
                        yield l1[len(phrase):], l2[1:]

            # compound words split in one case and not the other
            if len(l2) > 1 and w1 == w2 + l2[1][0]:
                yield l1[1:], l2[2:]

            if len(l1) > 1 and w2 == w1 + l1[1][0]:
                yield l1[2:], l2[1:]

        # see if there is a match without optional words
        if w1_optional:
            yield l1[1:], l2

        if w2_optional:
            yield l1, l2[1:]

    for rest1, rest2 in alternatives():
        m = fuzzy_compare_title_lists(rest1, rest2, allow_partial)
        if m:
            return m

    return False
    

def drop_series_prefix(series, title):
    '''
    Strip a leading series name from a title word list.

    If the title is prefixed with the series then return the part following
    the series name with its lowest level words made non-optional, which
    handles cases like "series: real title". Text nested deeper than that
    (additional parens) stays optional. Returns an empty list if there is no
    series, the title does not start with it or nothing follows it.
    '''
    if not series:
        return []

    match = fuzzy_compare_title_lists(series, title, allow_partial=True)
    if not match:
        return []

    title_opt = match[1]    # number of title words left unmatched at end
    if not title_opt:
        return []

    remainder = title[-title_opt:]
    first_level = remainder[0][1]

    stripped = []
    for word, level in remainder:
        # make mandatory anything at (or below) the level of the first remaining word
        stripped.append((word, False if level <= first_level else level - first_level))

    return stripped
    


def word_list_string(word_list):
    '''Render a word list as a string with the required words in upper case.'''
    shown = []
    for entry in word_list:
        text = entry[0]
        shown.append(text if entry[1] else text.upper())

    return ' '.join(shown)
            
    
def fuzzy_compare_titles(series1, title1, series2, title2):
    '''
    Return True if two titles match, either as given or once a series name
    (series1 if set, else series2) has been removed from the front of one.
    '''
    words1 = title_list(title1)
    words2 = title_list(title2)

    if fuzzy_compare_title_lists(words1, words2, allow_partial=False):
        return True

    series = series1 or series2
    if not series:
        return False

    # try removing series name from front of either title, if present
    series_words = title_list(series)

    stripped1 = drop_series_prefix(series_words, words1)
    if stripped1 and fuzzy_compare_title_lists(stripped1, words2, allow_partial=False):
        return True

    stripped2 = drop_series_prefix(series_words, words2)
    if stripped2 and fuzzy_compare_title_lists(words1, stripped2, allow_partial=False):
        return True

    return False

    
def compare_titles(series1, title1, series2, title2):
    '''
    Compare two titles, first deciding whether their subtitles must take part.

    When both titles consist of a short main title followed by a short
    subtitle, the subtitle may be needed to tell different books apart, so
    the colon is replaced by a space to make the subtitle part of the main
    title. A double colon (series separator) is left untouched.
    '''
    LIM_T = 4   # max words for possibly incomplete main title
    LIM_S = 8   # max words for a subtitle that is needed to make the book title unique

    def short_with_subtitle(masked):
        main, _, subtitle = masked.partition(':')
        return bool(subtitle) and len(main.split()) <= LIM_T and len(subtitle.split()) <= LIM_S

    if USE_SUBTILE_WHEN_TITLE_IS_SHORT and (':' in title1 or ':' in title2):
        # hide '::' so that only single colons are seen as subtitle separators
        masked1 = title1.replace('::', '~~')
        masked2 = title2.replace('::', '~~')

        if short_with_subtitle(masked1) and short_with_subtitle(masked2):
            # assume subtitle may be required so make part of main title
            title1 = masked1.replace(':', ' ').replace('~~', '::')
            title2 = masked2.replace(':', ' ').replace('~~', '::')

    return fuzzy_compare_titles(series1, title1, series2, title2)

    
def fuzzy_compare_authors(a1, a2, author_match_equivalents):
    '''
    Return True if two author name strings refer to the same author.

    Identical cleaned names match. Otherwise, if author_match_equivalents is
    given and lists either cleaned name as a key, the outcome is decided
    solely by whether the other name is among that key's equivalents. Names
    without an entry fall back to fuzzy_compare_author_lists.
    '''
    words1 = clean_list_author(a1)
    words2 = clean_list_author(a2)

    key1 = ' '.join(words1)      # equivalent to match_prep(a1)
    key2 = ' '.join(words2)      # equivalent to match_prep(a2)

    if key1 == key2:
        return True

    if author_match_equivalents is not None:
        for key, other in ((key1, key2), (key2, key1)):
            if key in author_match_equivalents:
                return other in author_match_equivalents[key]

    return fuzzy_compare_author_lists(
        author_list_of_clean(words1), author_list_of_clean(words2))
        
        
def primary_author(authors):
    '''
    Return the first author of a list, or UNKNOWN for an empty list.

    There should always be at least one author (possibly "Unknown") for
    calibre, but discovered books are handled just in case.
    '''
    return authors[0] if len(authors) > 0 else UNKNOWN
    
   
def same_author(calibre_authors, library_authors, config, use_equivalents=True):
    '''
    Return True if any calibre author matches any library author.

    Assumes that all author names are in first/last name order with no comma.
    Results of individual comparisons are kept in a cache on config (a
    separate one depending on use_equivalents) since the same author
    comparisons tend to be done multiple times.
    '''
    if len(calibre_authors) > 1 and is_unknown(primary_author(library_authors)):
        # Freading lists no author for multi-author books. Assume a match.
        return True

    if use_equivalents:
        cache = config.author_comparison_cache_with_equivalents
        equivalents = config.author_match_equivalents
    else:
        cache = config.author_comparison_cache_without_equivalents
        equivalents = None

    for calibre_author in calibre_authors:
        for library_author in library_authors:
            pair = (calibre_author, library_author)
            outcome = cache.get(pair)

            if outcome is None:
                outcome = fuzzy_compare_authors(calibre_author, library_author, equivalents)
                cache[pair] = outcome

            if outcome:
                return True

    return False

    
def same_title(calibre_book, library_book):
    '''
    Return True if the titles of two books match.

    A title may hold several alternates separated by '|'; a match between any
    alternate of one book and any alternate of the other is sufficient.
    '''
    calibre_series = series_title(calibre_book)
    library_series = series_title(library_book)
    library_titles = library_book.title.split('|')

    return any(
        compare_titles(calibre_series, calibre_title, library_series, library_title)
        for calibre_title in calibre_book.title.split('|')
        for library_title in library_titles)
    
    
def series_title(book):
    '''
    Return the series of a book in a form usable as a title prefix.

    Gives an empty string if the book has no series or the series name
    contains a colon, open paren or vertical bar (only simple series names
    are used). A non-zero whole number series index is appended in parens.
    '''
    series = book.series
    if (not series) or re.search(r'[:(|]', series):
        return ''

    index = book.series_index
    if index and index == int(index):
        return '%s (%d)' % (series, int(index))     # series + index

    return series   # just series
    
    
def combine_series_title(book):
    '''Return the book title with its series (if usable) in front as an optional parenthesized prefix.'''
    prefix = series_title(book)

    if not prefix:
        return book.title

    return '(%s) %s' % (prefix, book.title)

    
    
def same_book(calibre_book, library_book, config, compare_authors=True, compare_titles=True, compare_isbn=True):
    '''
    Return True if two books are considered to be the same.

    An equal, non-empty ISBN is a match regardless of author and title (when
    enabled by USE_ISBN_FOR_MATCH and compare_isbn). Otherwise every
    comparison that has not been switched off must succeed.
    '''
    if USE_ISBN_FOR_MATCH and compare_isbn and calibre_book.isbn and (calibre_book.isbn == library_book.isbn):
        return True     # isbn match - ignore author and title

    if compare_authors and not same_author(calibre_book.authors, library_book.authors, config):
        return False

    if compare_titles and not same_title(calibre_book, library_book):
        return False

    return True
        

            
    
def tokenize_book(book, config, compare_authors=True, compare_titles=True):
    '''
    Build the word sets used to quickly rule out books that cannot match.

    Returns a tuple (required, all_words). all_words holds every word,
    including variations, that a matching book could contain: author words
    and their initials, title words, their plurals, equivalents and adjacent
    words joined as compounds. required holds the words that must be found
    among another book's all_words for a match to be possible: the
    non-optional author words when there is exactly one author (counting
    equivalents) and the title words that are non-optional in every alternate
    title.
    '''
    all_words = set()
    required = set()

    if compare_authors:
        # careful with this since multiple author books can match on any author (or none for Freading)
        equivalents = config.author_match_equivalents

        if equivalents is not None:
            # add equivalents (if any)
            names = set()
            for author in book.authors:
                prepared = match_prep(author)
                names.add(prepared)
                if prepared in equivalents:
                    names.update(equivalents[prepared])
        else:
            names = book.authors

        sole_author = (len(names) == 1)

        for name in names:
            for word, optional in author_list(name):
                if sole_author and not optional:
                    required.add(word)

                all_words.add(word)

                if len(word) > 1:
                    all_words.add(word[0])      # initial

    if compare_titles:
        common_required = None

        for title in combine_series_title(book).split('|'):
            title_required = set()
            previous = None

            for word, optional in title_list(title):
                if not optional:
                    title_required.add(word)

                all_words.update((word, word + 's'))    # word and its plural
                all_words.update(equivalent_words.get(word, []))

                for phrase in equivalent_multi_words.get(word, []):
                    all_words.update(phrase)

                if previous:
                    all_words.add(previous + word)      # compound words

                previous = word

            if common_required is None:
                common_required = title_required
            else:
                common_required &= title_required   # required words must be in all alternates

        required.update(common_required)

    return required, all_words

    

def combine_initials(author):
    '''
    Join runs of two or three single lower case letter initials in an author
    name into one word, either with their periods ("j. r. r." to "j.r.r.") or
    without ("j r r" to "jrr").
    '''
    padded = ' %s ' % author    # surrounding spaces let the patterns match at either end

    for pattern, merged in (
            (r' ([a-z])\. ([a-z])\. ([a-z])\. ', r' \1.\2.\3. '),   # three initials with periods
            (r' ([a-z])\. ([a-z])\. ', r' \1.\2. '),                # two initials with periods
            (r' ([a-z]) ([a-z]) ([a-z]) ', r' \1\2\3 '),            # three initials without periods
            (r' ([a-z]) ([a-z]) ', r' \1\2 '),                      # two initials without periods
            ):
        padded = re.sub(pattern, merged, padded)

    return padded.strip()
     
    
def simplify_name(author, remove_optional=False, remove_middle=False, replace_middle_with_initial=False):
    '''
    Return a reduced form of an author name as a string of cleaned words.

    remove_optional drops words tagged A_OPTIONAL. Words tagged
    A_NAME_OR_INITIAL that follow the first kept word are dropped when
    remove_middle is set, else cut down to their first letter when
    replace_middle_with_initial is set. Neither is done for a name that
    starts with an initial.
    '''
    kept = []
    position = 0    # counts words that survive the optional filter

    for name, opt in author_list(author):
        if remove_optional and opt == A_OPTIONAL:
            continue

        position += 1

        if opt == A_NAME_OR_INITIAL:
            if position == 1:
                if len(name) == 1:
                    # Name starts with an initial: Don't simplify middle names
                    remove_middle = replace_middle_with_initial = False
            elif remove_middle:
                continue
            elif replace_middle_with_initial:
                name = name[0]

        kept.append(name)

    return ' '.join(kept)
    
    
def remove_optional_names(author):
    '''Simplify an author name by dropping optional words and middle names.'''
    options = dict(remove_optional=True, remove_middle=True)
    return simplify_name(author, **options)
    

def replace_middle_with_initial(author):
    '''Simplify an author name by dropping optional words and cutting middle names down to initials.'''
    options = dict(remove_optional=True, replace_middle_with_initial=True)
    return simplify_name(author, **options)
    
    
def longest_word(s):
    '''
    Return the longest word of a string, or an empty string if the string is
    empty, holds only white space or has no word of at least four letters.
    The first word wins when several share the greatest length.
    '''
    words = s.split() if s else []

    if not words:
        return ''   # also covers a string of nothing but white space

    w = max(words, key=len)     # max returns the first of equally long words

    if len(w) < 4:
        return ''   # too short to be a useful search word

    return w
    
    
def add_punctuation(author):
    '''
    Put back the punctuation that Classic OverDrive Search needs in author
    names: a period after each single letter initial and a comma before (and
    period after) a trailing "jr".

    Exceptions do occur: "Steve Gordon Jr" works but "Steve Gordon, Jr" does not
    '''
    while True:
        # adjacent initials share a space so a single pass can miss some; repeat until no more found
        author, found = re.subn(r'(^| )([a-z])( |$)', r'\1\2.\3', author)
        if not found:
            break

    return re.sub(r'(?<!,) jr$', r', jr.', author)
    
    

def clean_author_for_search(author, remove_accents):
    '''Return an author name as a string of cleaned words, optionally with accents removed.'''
    words = clean_list_author(author, remove_accents)
    return ' '.join(words)
    
        

def clean_title(title):
    '''Reduce a title to the minimum for searching: only its required words.'''
    required = []
    for word, optional in title_list(title):
        if not optional:
            required.append(word)

    return ' '.join(required)
    

def optional_title(title):
    '''Return only the optional parts of a title as a string of words.'''
    extras = []
    for word, optional in title_list(title):
        if optional:
            extras.append(word)

    return ' '.join(extras)
    

def is_unknown(s):
    '''
    Return True if s is the marker for a missing value.

    In the calibre db a missing title is "Unknown" and a missing author is a
    single author of "Unknown". The lower case form is accepted as well since
    the value may already have been converted to lower case.
    '''
    return s in (UNKNOWN, UNKNOWN.lower())
    

def safe_word_from_title(title):
    '''
    Pick a single search word from a title, but only if the title is simple
    (just ASCII letters, digits and spaces) since search results are not good
    otherwise. Returns an empty string for any other title.
    '''
    is_simple = re.match('^[A-Za-z0-9 ]+$', title)
    return word_from_title(title) if is_simple else ''
    
        
def word_from_title(title):
    '''Strip a title of anything extraneous and pick its best single word for searching.'''
    essential = clean_title(title)
    return longest_word(essential)
    
    

def reverse_author_name(a):
    '''
    Convert an author name from "first last" into "last, first" format.

    Commas and periods are removed before the name is split into words. A
    trailing personal suffix stays with the last name:
    "John Smith Jr" -> "Smith Jr, John". A single word name is returned as
    that word and an empty name unchanged. A name with nothing but
    punctuation and white space gives an empty string.
    '''
    if len(a) == 0:
        return a

    parts = a.replace(',', '').replace('.', '').split()

    if not parts:
        return ''   # nothing left once punctuation is removed

    if len(parts) == 1:
        return parts[0]     # nothing to do for single name

    if len(parts) > 2 and parts[-1].lower() in PERSON_SUFFIXES:
        # keep a space between the last name and its suffix
        return '%s %s, %s' % (parts[-2], parts[-1], ' '.join(parts[:-2]))

    return parts[-1] + ', ' + ' '.join(parts[:-1])
    
    
def search_prep(author, remove_accents=False):
    '''Prepare an author name for use in a search: clean it and put back the punctuation.'''
    cleaned = clean_author_for_search(author, remove_accents)
    return add_punctuation(cleaned)
    
    
    
def match_prep(author):
    '''Prepare an author name for matching: its cleaned words joined by single spaces.'''
    words = clean_list_author(author)
    return ' '.join(words)
            
        

def max_match_count(l1, l2):
    '''Return the combined length of the two lists.'''
    return sum(len(books) for books in (l1, l2))
    
    
    
def cache_book_list(l, config, progress=None, compare_authors=True, compare_titles=True, compare_isbn=True):
    '''
    Index a list of books for matching by match_book_list_to_cache.

    Returns an object with:
        all_books   set of entries, each having book, required and all (see tokenize_book)
        all_tokens  mapping of each word to the entries having it in their all set
        isbns       mapping of each ISBN to the entries having it (only if compare_isbn)

    If progress is given it is advanced by one for every two books, since
    this list is scanned a second time during matching, and the scan stops
    early once canceled.
    '''
    cache = Object()
    cache.all_tokens = defaultdict(set)
    cache.all_books = set()
    cache.isbns = defaultdict(set)

    scanned = 0

    for book in l:
        entry = Object()
        entry.book = book
        entry.required, entry.all = tokenize_book(
            book, config, compare_authors=compare_authors, compare_titles=compare_titles)
        cache.all_books.add(entry)

        for token in entry.all:
            cache.all_tokens[token].add(entry)

        if compare_isbn and book.isbn:
            cache.isbns[book.isbn].add(entry)

        if progress:
            scanned += 1
            if scanned % 2 == 0:    # count by half since will scan this list twice
                progress.setValue(scanned // 2)
                if progress.wasCanceled():
                    break

    return cache
    
    
def match_book_list_to_cache(c, l, config, progress=None, compare_authors=True, compare_titles=True, compare_isbn=True):
    '''
    Match a list of books against a cache built by cache_book_list.

    Returns a sorted list of (cached book, book from l) tuples for each pair
    found to be the same book, or an empty list if progress reports that the
    operation was canceled.

    A full comparison (same_book) is only done for pairs where all of the
    required words of one book are found among the words of the other. This is
    checked in both directions: first the required words of each book in l
    against the cache, then the required words of each cached book against
    the words collected from l.
    '''
    # return a list of tuples of matching books from two lists of books

    all_tokens = defaultdict(set)   # word -> books of l having that word
    all_books = set()

    count = len(c.all_books) // 2   # floor division - caching did half of work on its list
    even = True

    compared = set()    # pairs dealt with in the first pass, so the second pass can skip them
    matches = set()

    # First pass: required words of each book in l against the cached books
    for book in l:
        required,all = tokenize_book(book, config, compare_authors=compare_authors, compare_titles=compare_titles)
        all_books.add(book)
        for token in all:
            all_tokens[token].add(book)

        if USE_ISBN_FOR_MATCH and compare_isbn and book.isbn and (book.isbn in c.isbns):
            for b in c.isbns[book.isbn]:
                # same isbn is a match
                compared.add((b.book, book))
                matches.add((b.book, book))


        # narrow the candidates down to cached books that have every required word
        possible_matches = c.all_books

        for token in required:
            if possible_matches is c.all_books:
                possible_matches = c.all_tokens[token].copy()   # copy so the cached set is not altered below
            else:
                possible_matches &= c.all_tokens[token]

            if len(possible_matches) == 0:
                break

        else:
            # loop was not cut short: some candidates remain (all cached books if nothing was required)
            for b in possible_matches:
                compared.add((b.book, book))
                if same_book(book, b.book, config,
                        compare_authors=compare_authors, compare_titles=compare_titles, compare_isbn=compare_isbn):
                    matches.add((b.book, book))



        if progress:
            count += 1
            progress.setValue(count)
            if progress.wasCanceled():
                return []

    # Second pass: required words of each cached book against the books of l
    for b in c.all_books:
        possible_matches = all_books

        for token in b.required:
            if possible_matches is all_books:
                possible_matches = all_tokens[token].copy()
            else:
                possible_matches &= all_tokens[token]

            if len(possible_matches) == 0:
                break

        else:
            for book in possible_matches:
                if (b.book, book) not in compared:
                    if same_book(book, b.book, config,
                            compare_authors=compare_authors, compare_titles=compare_titles, compare_isbn=compare_isbn):
                        matches.add((b.book, book))


        if progress:
            even = not even # count by half since will scan this list twice
            if even:
                count += 1
                progress.setValue(count)
                if progress.wasCanceled():
                    return []


    return sorted(list(matches))
    
    
    
def match_book_lists(l1, l2, config, progress=None, compare_authors=True, compare_titles=True, compare_isbn=True):
    '''
    Return a list of tuples of matching books from two lists of books.

    The first list is indexed with cache_book_list and the second is then
    matched against that index.
    '''
    options = dict(progress=progress, compare_authors=compare_authors,
                   compare_titles=compare_titles, compare_isbn=compare_isbn)

    cache = cache_book_list(l1, config, **options)
    return match_book_list_to_cache(cache, l2, config, **options)


# INCOMPLETE_TITLES prepared for matching against a title list: one (word, level) list per title.
# Built once at import time for use by is_incomplete_title.
INCOMPLETE_TITLELISTS = [title_list(t) for t in INCOMPLETE_TITLES]


def is_incomplete_title(title):
    '''Return True if the title matches one of the INCOMPLETE_TITLES, which are not allowed on their own.'''
    words = title_list(title)

    return any(
        fuzzy_compare_title_lists(words, known, allow_partial=False)
        for known in INCOMPLETE_TITLELISTS)
    
    
def force_unique_title(log, book):
    """
    Rewrite book.title in place when it is one of the known incomplete titles.

    Such a title is known to be incomplete, causing multiple books to be
    matched incorrectly.  Nothing happens (and nothing is logged) when the
    title is not reported as incomplete by is_incomplete_title().

    Otherwise the following are tried in order:
      1. replace the first ':' with ',' so that a subtitle is non-optional
      2. replace the first '(' and the first ')' with ','
      3. flatten all ':' and parentheses and append an identifier: the
         four-character year of book.pubdate, else book.isbn, else
         book.book_id

    The change is reported through log.info().
    """
    if not is_incomplete_title(book.title):
        return

    orig_title = book.title

    # Rewrites that try to make a subtitle part of the required title text.
    promoted_subtitles = []

    if ':' in orig_title:
        promoted_subtitles.append(orig_title.replace(':', ',', 1))

    if '(' in orig_title:
        promoted_subtitles.append(orig_title.replace('(', ',', 1).replace(')', ',', 1))

    for test_title in promoted_subtitles:
        if not is_incomplete_title(test_title):
            book.title = test_title     # fixed
            break
    else:
        # No rewrite was good enough: tack a distinguishing value onto the end.
        if book.pubdate:
            unique_id = book.pubdate.isoformat()[0:4]   # published year
        elif book.isbn:
            unique_id = book.isbn
        else:
            unique_id = book.book_id

        flattened = orig_title.replace(':', ',').replace(' (', ', ').replace(')', '')
        book.title = '%s, %s'%(flattened, unique_id)

    log.info('Forced unique title: "%s" to "%s"' % (orig_title, book.title))
    

def normalize_author(a, unreverse=True, fix_case=False):
    """
    Clean up a single author name string and return the result.

    a          the author name as received
    unreverse  when true and the name contains a comma, "Last, First" is
               rearranged into "First Last"
    fix_case   when true, a name longer than 5 characters that is entirely
               upper case is lower-cased and passed through titlecase()

    An empty string is returned for values that are not real author names
    (copyright notices and placeholders such as "Various").  The individual
    steps depend on each other and must stay in this order.
    """
    def dotted(letters):
        # regex text matching the letters with an optional period after each one
        return r'\.?'.join(letters) + r'\.?'

    if re.match(r'^©[0-9]{4} by', a):
        return ''   # drop extraneous copyright notice

    # "Not Yet Available" is used by Scribd, "Various" by Freading for multiple authors
    if a.lower() in ('not yet available', 'smashwords', 'various', 'various authors'):
        return ''

    # Detach person suffixes from a preceding comma before commas are interpreted
    for suffix in PERSON_SUFFIXES:
        suffix_re = dotted(suffix)
        a = re.sub(', (%s)$'%suffix_re, r' \1', a, flags=re.IGNORECASE)    # ", Jr" -> " Jr"  etc.
        a = re.sub(', (%s),'%suffix_re, r' \1,', a, flags=re.IGNORECASE)    # ", Jr.," -> " Jr.,"  etc.

    if unreverse and (',' in a):
        trailing = ''

        if re.search(r' (sr|jr|sr\.|jr\.)$', a, flags=re.IGNORECASE):
            # handle cases from PG like "Deer, Jim G., Sr." -> "Jim G. Deer Sr."
            a, gap, generation = a.rpartition(' ')
            trailing = gap + generation

        surname, separator, given = a.partition(',')
        a = given + ' ' + surname + trailing

    a = re.sub(r' et al\.?$', '', a, flags=re.IGNORECASE) # remove "et al" used by Project Gutenberg

    a = a.replace(',','').replace(';','').replace(' & ',' and ')

    if re.match('^by ',a, flags=re.IGNORECASE):
        a = a[3:]

    # Role markers, removed in this order
    role_markers = (
        r' author$',
        r' \(ed\.\)',
        r' \(editor\)',
        r' \(edt\)',    # Editor - Axis 360
        r' \(crt\)',    # Creator - Axis 360
        r' \(con\)',    # Contributor - Axis 360
        r' \(frw\)',    # Foreword - Axis 360
        r' \(ilt\)',    # Illustrator - Axis 360
        )

    for role_marker in role_markers:
        a = re.sub(role_marker, '', a, flags=re.IGNORECASE)

    if re.match('^More creators.', a) or a == 'Various':
        a = ''

    a = re.sub(r'\s&\s', ' and ', a) # change '&' to 'and' (Better Homes & Gardens)

    if len(a.split()) > 2:
        # drop honorific, unless it is used with a single name
        for prefix in HONORIFIC_PREFIXES:
            a = re.sub(r'^%s\.? '%prefix, '', a, flags=re.IGNORECASE)    # Drop honorific prefixes

        for suffix in HONORIFIC_SUFFIXES:
            a = re.sub(' %s$'%dotted(suffix), '', a, flags=re.IGNORECASE)    # Drop honorific suffixes

    a = re.sub(r'([A-Za-z]{4})\.( |$)',  r'\1\2', a)  # remove extraneous periods (after 4 or more letter word)

    # Fix missing periods after initials.  Adjacent initials share the space
    # between them, so one pass can skip every other one: repeat until no
    # more are found.
    while True:
        a, fixed = re.subn(r'(^| )([A-Za-z])( |$)', r'\1\2.\3', a)
        if not fixed:
            break

    a = re.sub(r'\.([a-zA-Z0-9])', r'. \1', a) # Split initials with no space

    # remove smart quotes
    # NOTE(review): the class also turns a literal "?" into an apostrophe.  That
    # may be deliberate (mis-encoded quotes) or a damaged quote character - confirm.
    a = re.sub(r"[‘’?']","'", a)

    a = re.sub(r"[^ .'0-9\w-]", '', a, flags=re.IGNORECASE | re.UNICODE) # remove non-alphanumeric + name chars

    if fix_case and len(a) > 5 and a == a.upper():
        a = titlecase(a.lower())    # Correct all upper case name

    return ' '.join(a.strip().split())
    

    
def normalize_title(t):
    """
    Clean up a book title string and return the result.

    Returns '' for an empty or missing title.  Otherwise, in order:
      - drops "[Kindle Edition]" and decodes a left over "&amp;"
      - moves a trailing ", A" / ", An" / ", The" to the front
      - pulls markers of an incomplete book (excerpt, first N words, chapter
        range, free sampler, SparkNotes guide) out of parentheses or
        subtitle position
      - puts "The Original Classic Edition" into parentheses
      - converts square brackets to parentheses
      - removes a subtitle of 80 or more characters and any single
        parenthesized group holding 80 or more characters
      - converts smart quotes to "'" and backslash to "/"
      - removes every character outside the permitted set and collapses
        white space

    Only a parenthesized group with no parentheses inside it is considered
    for removal, so the text between two short groups is never deleted and
    a group with nested parentheses is left as it is.
    """
    if not t:
        return ''

    t = t.replace('[Kindle Edition]', '').strip()

    t = t.replace('&amp;', '&')     # html entity left over from Amazon

    t = re.sub(r'^(.+), (A|An|The)$', r'\2 \1', t, flags=re.IGNORECASE)  # Reverse titles with articles at end for sorting

    # indications of incomplete book - make sure they are not ignored
    t = re.sub(r'\(Excerpt\)', r' Excerpt', t, flags=re.IGNORECASE)
    t = re.sub(r'\((First [0-9,]+ words)\)', r' \1', t, flags=re.IGNORECASE)
    t = re.sub(r': (Chapters[- ][0-9]+-[0-9]+)', r' \1', t, flags=re.IGNORECASE)
    t = re.sub(r': (free sampler)', r' \1', t, flags=re.IGNORECASE)
    t = t.replace('(SparkNotes Literature Guide)', 'SparkNotes Literature Guide')

    # Extraneous suffix
    t = t.replace('- The Original Classic Edition', '(The Original Classic Edition)')

    t = t.replace('[', '(').replace(']', ')')     # brackets -> parens

    t = re.sub(r':.{80,}$', '', t)  # remove any extremely long subtitle

    # remove any extremely long optional text; the match may not cross a
    # parenthesis, otherwise "(a) long title text (b)" would lose the title
    # text lying between the two short groups
    t = re.sub(r'\([^()\n]{80,}\)', '', t)

    t = re.sub(r"[‘’‛′]", "'", t)     # remove smart quotes
    t = t.replace('\\', '/')          # remove backslash

    t = re.sub(r"[^ .,!:;+@&'0-9\w()/-]", '', t, flags=re.IGNORECASE | re.UNICODE) # remove non-alphanumeric + name chars

    return ' '.join(t.split())  # remove extra white space
    
    
    
def alternate_author_names(author, config, log=None):
    """
    Produce a set of variants of the author name.

    When search_prep(author) is a key of config.author_search_equivalents,
    the variants are built only from the configured equivalent names (a
    message is written to log, if one was given).  Otherwise they are built
    from the name as cleaned by clean_author_for_search() with accents
    removed and, if that gives a different string, also with accents kept.

    The empty string is never part of the returned set.
    """
    def spelling_variants(name):
        # the four forms generated for each cleaned name
        return (
            add_punctuation(name),
            add_punctuation(remove_optional_names(name)),
            combine_initials(add_punctuation(name)),
            add_punctuation(replace_middle_with_initial(name)),
            )

    variants = set()
    lookup_key = search_prep(author)
    configured = config.author_search_equivalents

    if lookup_key in configured:
        if log:
            log.info('Using configured author name equivalents')

        for equivalent_author in configured[lookup_key]:
            variants.update((
                equivalent_author,
                add_punctuation(equivalent_author),
                combine_initials(add_punctuation(equivalent_author)),
                ))

    else:
        plain_author = clean_author_for_search(author, remove_accents=True)
        variants.update(spelling_variants(plain_author))

        accented_author = clean_author_for_search(author, remove_accents=False)
        if accented_author != plain_author:
            variants.update(spelling_variants(accented_author))

    variants.discard('')
    return variants

    
    
#   Debug using: calibre-debug -e filename.py

def main():
    """Entry point used when the file is run directly; currently does nothing."""
    return None


# Call main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
