﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import, print_function)

from calibre_plugins.overdrive_link.numbers import (word_number, word_rank, numeric_rank, roman_number, word_year)


__license__ = 'GPL v3'
__copyright__ = '2012-2025, John Howell <jhowell@acm.org>'


# Equivalent words for title matching.
#
# Maps a word, or a space-separated multi-word phrase, to its desired equivalent.
# Noise words and phrases map to the empty string. Entries for numbers, ranks,
# roman numerals and years are not listed here; organize_equivalents() adds them.
#
# Each key must appear only once: a repeated key in a dict literal silently
# replaces the earlier entry.
equivalents = {
    # Noise words
    'a': '',
    'an': '',
    'the': '',
    'at': '',
    'of': '',
    'and': '',
    'or': '',
    'tm': '',
    # 'a history': '',   # Breaks "A History of Western Philosophy", need to only accept at end
    'a novel': '',
    'a prequel': '',
    'a short story': '',
    'a thriller': '',
    'with bonus content': '',
    'with bonus material': '',
    'with linked table of contents': '',
    'issue': '',
    'issues': '',

    # Misspelled titles at libraries
    'nevron': 'neveryon',   # Nevèrÿon at freading, ebscohost
    'nevaereyon': 'neveryon',
    'neveryna': 'neveryona',    # Nevèrÿona at freading, ebscohost
    'nevaereyona': 'neveryona',
    'neveryaona': 'neveryona',
    'hitchhikerss': 'hitchhikers',
    'thom': 'tom',

    # compound words
    'under ground': 'underground',
    'hitch hiker': 'hitchhiker',
    'sign post': 'signpost',
    'slaughter house': 'slaughterhouse',
    'search light': 'searchlight',

    # Common misspellings (http://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines)
    'altho': 'although',
    'approproximate': 'approximate',
    'assasined': 'assassinated',
    'assisnate': 'assassinate',
    'assosication': 'assassination',
    'attaindre': 'attained',
    'azn': 'asian',
    'beaurocracy': 'bureaucracy',
    'beaurocratic': 'bureaucratic',
    'broacasted': 'broadcast',
    'cant': 'cannot',
    'catapiller': 'caterpillar',
    'catapillers': 'caterpillars',
    'cervial': 'servile',
    'charistics': 'characteristics',
    'colonizators': 'colonizers',
    'consequentually': 'consequently',
    'copywrite': 'copyright',
    'criticists': 'critics',
    'critising': 'criticizing',
    'deriviated': 'derived',
    'deteoriated': 'deteriorated',
    'differentiatiations': 'differentiations',
    'emmisarries': 'emissaries',
    'emmisarry': 'emissary',
    'eraticly': 'erratically',
    'fiel': 'phial',
    'fiels': 'phials',
    'flourishment': 'flourishing',
    'funguses': 'fungi',
    'futhroc': 'futhark',
    'gogin': 'gauguin',
    'gouvener': 'governor',
    'hvea': 'heave',
    'idaeidae': 'idea',
    'intepretator': 'interpretor',
    'libitarianisn': 'libertarianism',
    'maintainence': 'maintenance',
    'manuever': 'manoeuvre',
    'manuevers': 'manoeuvres',
    'mediterainnean': 'mediterranean',
    'monolite': 'monolithic',
    'nessasarily': 'necessarily',
    'nessecary': 'necessary',
    'ocassionaly': 'occasionally',
    'palistian': 'palestinian',
    'peculure': 'peculiar',
    'playwrite': 'playwright',
    'playwrites': 'playwrights',
    'premonasterians': 'premonstratensians',
    'prominately': 'prominently',
    'pususading': 'persuading',
    'resssurecting': 'resurrecting',
    'restaraunteur': 'restaurateur',
    'restaraunteurs': 'restaurateurs',
    'resteraunt': 'restaurant',
    'restraunt': 'restaurant',
    'resteraunts': 'restaurants',
    'sepina': 'subpoena',
    'shoudln': 'shouldnt',
    'sophicated': 'sophisticated',
    'strikely': 'strikingly',
    'suburburban': 'suburban',
    'sucesfuly': 'successfully',
    'supposingly': 'supposedly',
    'thast': 'thats',
    'thru': 'through',
    'transcendentational': 'transcendental',
    'uneccesary': 'unnecessary',
    'unsucesfuly': 'unsuccessfully',
    'warantee': 'warranty',
    'wendsay': 'wednesday',
    'wensday': 'wednesday',

    # Other misspellings
    'releive': 'relieve',
    'releif': 'relief',
    'cheiftain': 'chieftain',
    'greivous': 'grievous',
    'feild': 'field',
    'beleif': 'belief',
    'yeild': 'yield',
    'theif': 'thief',
    'repreive': 'reprieve',
    'beseige': 'besiege',
    'peice': 'piece',
    'peirce': 'pierce',
    'breif': 'brief',
    'seive': 'sieve',
    'seige': 'siege',
    'beleive': 'believe',
    'conciet': 'conceit',
    'cieling': 'ceiling',
    'decieve': 'deceive',
    'percieve': 'perceive',
    'reciept': 'receipt',
    'deciet': 'deceit',
    'concieve': 'conceive',
    'recieve': 'receive',
    'rien': 'rein',
    'viel': 'veil',
    'hier': 'heir',
    'wieght': 'weight',
    'thier': 'their',
    'sliegh': 'sleigh',
    'vien': 'vein',
    'niegh': 'neigh',
    'skien': 'skein',
    'nieghbor': 'neighbor',
    'riegn': 'reign',
    'frieght': 'freight',
    'hiefer': 'heifer',
    'wier': 'weir',
    'wiegh': 'weigh',
    'foriegn': 'foreign',
    'forfiet': 'forfeit',
    'freind': 'friend',
    'mischeif': 'mischief',
    'sieze': 'seize',
    'shiek': 'sheik',
    'wierd': 'weird',
    'niether': 'neither',
    'financeir': 'financier',
    'liesure': 'leisure',
    'speceis': 'species',
    'anceint': 'ancient',
    'feirce': 'fierce',
    'leiutenant': 'lieutenant',

    # Abbreviations ('thru' is already listed under common misspellings)
    'dr': 'doctor',
    'ed': 'edition',
    'illus': 'illustrated',
    'intro': 'introduction',
    'lite': 'light',
    'ltd': 'limited',
    'orig': 'original',
    'rd': 'road',
    'sf': 'science fiction',
    'st': 'street',
    'thruout': 'throughout',
    'vs': 'versus',
    'vol': 'book',
    'vols': 'book',
    'part': 'book',
    'parts': 'book',
    'volume': 'book',
    'volumes': 'book',

    # British/American English
    'eyrie': 'aerie',
    'aluminium': 'aluminum',
    'annexe': 'annex',
    'arse': 'ass',
    'analogue': 'analog',
    'aeroplane': 'airplane',
    'artefact': 'artifact',
    'behaviour': 'behavior',
    'behaviourism': 'behaviorism',
    'behove': 'behoove',
    'bogeyman': 'boogeyman',
    'brasier': 'brazier',
    'calibre': 'caliber',
    'cancelled': 'canceled',
    'carburettor': 'carburetor',
    'catalogue': 'catalog',
    'centre': 'center',
    'camomile': 'chamomile',
    'cheque': 'check',
    'chequer': 'checker',
    'chilli': 'chili',
    'cypher': 'cipher',
    'colour': 'color',
    'coloured': 'colored',
    'cosy': 'cozy',
    'connexion': 'connection',
    'kerb': 'curb',
    'defence': 'defense',
    'dialogue': 'dialog',
    'dyke': 'dike',
    'doughnut': 'donut',
    'draught': 'draft',
    'endeavour': 'endeavor',
    'phantasm': 'fantasm',
    'favour': 'favor',
    'favourite': 'favorite',
    'fibre': 'fiber',
    'fillet': 'filet',
    'flavour': 'flavor',
    'furore': 'furor',
    'gauge': 'gage',
    'glamour': 'glamor',
    'glamourous': 'glamorous',
    'grey': 'gray',
    'grille': 'grill',
    'harbour': 'harbor',
    'hearken': 'harken',
    'honour': 'honor',
    'honourable': 'honorable',
    'humour': 'humor',
    'humourless': 'humorless',
    'idyll': 'idyl',
    'enquiry': 'inquiry',
    'ensure': 'insure',
    'gaol': 'jail',
    'labour': 'labor',
    'levelled': 'leveled',
    'libelled': 'libeled',
    'liquorice': 'licorice',
    'litre': 'liter',
    'lustre': 'luster',
    'manoeuvre': 'maneuver',
    'metre': 'meter',
    'mollusc': 'mollusk',
    'mould': 'mold',
    'moult': 'molt',
    'monologue': 'monolog',
    'moustache': 'mustache',
    'mum': 'mom',
    'mummy': 'mommy',
    'naivety': 'naivete',
    'neighbour': 'neighbor',
    'neighbourhood': 'neighborhood',
    'neurone': 'neuron',
    'offence': 'offense',
    'omelette': 'omelet',
    'organise': 'organize',
    'orientated': 'oriented',
    'pyjamas': 'pajamas',
    'parlour': 'parlor',
    'pernickety': 'persnickety',
    'phoney': 'phony',
    'plough': 'plow',
    'pretence': 'pretense',
    'primaeval': 'primeval',
    'programme': 'program',
    'quarrelled': 'quarreled',
    'realise': 'realize',
    'recognise': 'recognize',
    'redoubt': 'redout',
    'rhyme': 'rime',
    'rumour': 'rumor',
    'savoury': 'savory',
    'syrup': 'sirup',
    'sceptic': 'skeptic',
    'sceptical': 'skeptical',
    'scepticism': 'skepticism',
    'sledge': 'sled',
    'smoulder': 'smolder',
    'speciality': 'specialty',
    'spectre': 'specter',
    'steadfast': 'stedfast',
    'storey': 'story',
    'sulphate': 'sulfate',
    'sulphur': 'sulfur',
    'theatre': 'theater',
    'titbit': 'tidbit',
    'tyre': 'tire',
    'tonne': 'ton',
    'travelled': 'traveled',
    'traveller': 'traveler',
    'valour': 'valor',
    'villain': 'villan',
    'vineyard': 'vinyard',
    'vice': 'vise',
    'visor': 'vizor',
    'whilst': 'while',
    'woeful': 'woful',
    'rack': 'wrack',
    'yoghurt': 'yogurt',

    # Numbers (others added automatically)
    'zero': '0',
    'hundred': '100',
    'thousand': '1000',
    'million': '1000000',

    # Rankings (others added automatically)
    'senior': '1',
    'sr': '1',
    'junior': '2',
    'jr': '2',
}


# Index over the keys of `equivalents`, filled in by organize_equivalents():
# maps the first word of a key to the list of distinct word counts of the keys
# that start with that word, sorted from longest to shortest.
key_phrase_lens = {}   # maps first word to list of lengths of key phrases (long to short)


def organize_equivalents():
    """Add generated entries to ``equivalents`` and index its keys by first word.

    For every integer from 1 through 300, the keys returned by ``word_number``,
    ``word_rank``, ``numeric_rank`` and ``roman_number`` are all mapped to that
    integer's decimal string. For every year from 1400 through 2100, the key
    returned by ``word_year`` is mapped to the year's decimal string.

    Afterwards ``key_phrase_lens`` is filled: each key of ``equivalents`` is
    split on whitespace, and the number of words it contains is recorded under
    its first word. Each recorded list holds distinct counts in descending
    order.
    """
    # Numbers, ranks and roman numerals share one numeric equivalent.
    spellers = (word_number, word_rank, numeric_rank, roman_number)
    for number in range(1, 301):
        digits = "%d" % number
        for spell in spellers:
            equivalents[spell(number)] = digits

    # Years.
    for year in range(1400, 2101):
        digits = "%d" % year
        equivalents[word_year(year)] = digits

    # Index every key (single word or phrase) by its first word for multi-word matching.
    for phrase in equivalents:
        words = phrase.split()
        first_word, word_count = words[0], len(words)

        known_counts = key_phrase_lens.get(first_word, [])
        if word_count in known_counts:
            continue    # this phrase length is already recorded for the word

        known_counts.append(word_count)
        key_phrase_lens[first_word] = sorted(known_counts, reverse=True)


# Runs once at import time so that the generated entries in `equivalents` and the
# `key_phrase_lens` index are populated as soon as this module has been loaded.
organize_equivalents()
