# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2015,2016,2017,2018,2019,2020,2021,2022,2023 DaltonST'
__my_version__ = "1.0.16"     # qt.core

import os, sys
import apsw
import datetime
import codecs
import time
from time import sleep
import unicodedata
from copy import deepcopy
import re, sre_constants
import collections
from calibre.utils.zipfile import ZipFile

from calibre import isbytestring, force_unicode, prints
from calibre.constants import filesystem_encoding, preferred_encoding, DEBUG
from calibre.utils.logging import Log as log

#---------------------------------------------------------------------------------------------------
#  PDFs ONLY
#---------------------------------------------------------------------------------------------------
import errno, subprocess, shutil
from functools import partial
from calibre.ebooks import ConversionError, DRMError
from calibre.ptempfile import PersistentTemporaryFile
from calibre.constants import isosx, iswindows, islinux, isbsd
from calibre import CurrentDir
#---------------------------------------------------------------------------------------------------
from polyglot.builtins import as_unicode, as_bytes, iteritems, map, only_unicode_recursive, range, unicode_type

from queue import Queue, Empty

from calibre_plugins.english_noun_frequency.enf_polyglot import enf_as_bytes, enf_as_unicode

#---------------------------------------------------------------------------------------------------
from calibre_plugins.english_noun_frequency.heading import log_heading_common
from calibre_plugins.english_noun_frequency.text_filtering_utils import ENFHTMLStripper
from calibre_plugins.english_noun_frequency.text_extraction_utils import ENFHTMLGetContent

from calibre_plugins.english_noun_frequency.english_words_to_delete  import get_set_of_words_to_delete
from calibre_plugins.english_noun_frequency.english_nouns_to_keep import get_set_of_nouns_to_keep
from calibre_plugins.english_noun_frequency.english_plurals import return_english_plurals_dict
from calibre_plugins.english_noun_frequency.english_obscenities import return_english_obscenities_set
from calibre_plugins.english_noun_frequency.english_standard_change_pairs import return_english_standard_change_pairs_dict
from calibre_plugins.english_noun_frequency.english_names_to_delete import return_global_first_names_set

from calibre_plugins.english_noun_frequency.enf_pdf_to_html import enf_pdftohtml

from calibre_plugins.english_noun_frequency.english_to_spanish_translation_pairs import return_english_to_spanish_dict
from calibre_plugins.english_noun_frequency.english_to_other_language_translation_pairs import return_english_to_other_language_dict

#--------------------------------------------------------------------------------------
import calibre_plugins.english_noun_frequency.inflect_py3 as my_inflect

# Single shared inflect engine for the whole module.
# NOTE(review): classical(all=False) presumably switches off every "classical" plural mode -- confirm against inflect_py3.
my_inflect_grammar_engine = my_inflect.engine()
my_inflect_grammar_engine.classical(all=False)
#--------------------------------------------------------------------------------------
# Translation settings; ENF_Control() overwrites both from the job's parameter dict.
translate_english_to_other_is_active = False
translate_english_to_other_language = unicode_type('none')
#--------------------------------------------------------------------------------------



#----------------------------------------------
# Module-level job state.
# Most of the names below are cleared or re-initialized by clear_or_initialize_globals(),
# which main_english_noun_frequency() calls at the beginning and at the end of every job.
#----------------------------------------------
my_terminate_early = False
#----------------------------------------------
# NOTE: main_english_noun_frequency() and ENF_Control() take their own 'notifications' parameter, which shadows this module-level queue inside those functions.
notifications = Queue()
#----------------------------------------------
my_current_book_id = "000000"
#----------------------------------------------
my_book_ids = []                                   # sorted copy of the job's book ids
my_param_dict = {}                                 # copy of the job's parameter dict
od_params_dict = collections.OrderedDict([])       # my_param_dict sorted by key (built in ENF_Control)
english_words_to_delete_set = set()                # NOTE(review): initialized a second time further down in this section
#----------------------------------------------
my_guidb = ""
my_plugin_path = ""
#----------------------------------------------
SUPPORTED_BOOK_FORMATS = ['TXT','EPUB','PDF']
#----------------------------------------------
# logged by ENF_Control() as the priority sequence in which book formats are searched
SUPPORTED_BOOK_FORMATS_STRING =  "(1st) TXT    (2nd) EPUB    (3rd) PDF"
#----------------------------------------------
ACCUMULATED_MOST_FREQUENT_NOUNS_CSV_FILENAME = "accumulated_most_frequent_nouns.csv"
#----------------------------------------------
ACCUMULATED_MOST_FREQUENT_NOUNS_TUPLES_FILENAME = "accumulated_most_frequent_nouns.tuples"
accumulated_most_frequent_nouns_tuples_file_full_path = "unknown"
protected_data_directory = "unknown"
#----------------------------------------------
accumulation_of_most_frequent_nouns_is_active = False
accumulation_of_most_frequent_nouns_is_paused = False
accumulated_most_frequent_nouns_dict = {}
#----------------------------------------------
# heading lines passed to log_heading_common(); None means "not set"
header_s1 = None
header_s2 = None
header_s3 = None
header_s4 = None
header_s5 = None
#----------------------------------------------
# ENF_Control() sets this to the largest of COMMENTS_MAX, TAGS_MAX and CUSTOM_COLUMN_MAX
highest_number_of_nouns_to_keep  = 0
#----------------------------------------------
# ENF_Control() seeds this with zero counts for 'TXT', 'EPUB', 'PDF' and 'UNSUPPORTED'
format_stats_dict = {}
#--------------------------------------------------------------
#--------------------------------------------------------------
#--------------------------------------------------------------
# plurals of singulars
#--------------------------------------------------------------
english_plurals_dict = {}       # key is singular yielding plural
english_singulars_dict = {}   # key is plural yielding singular
#--------------------------------------------------------------
# words to always keep
#--------------------------------------------------------------
english_nouns_to_keep_set = set()
#--------------------------------------------------------------
# words to always delete
# NOTE(review): redundant -- this name was already initialized to an empty set earlier in this section
#--------------------------------------------------------------
english_words_to_delete_set = set()
#--------------------------------------------------------------
# standard English contractions (can't, we'll, they've, ...); filled by ENF_Control()
#--------------------------------------------------------------
english_contractions_to_transform_list = []
#--------------------------------------------------------------
# the same contractions with the apostrophe removed (cant, well, theyve, ...); filled by ENF_Control()
#--------------------------------------------------------------
english_failed_contractions_list = []
#--------------------------------------------------------------
# words to gut for politeness' sake
#--------------------------------------------------------------
english_obscenities_set = set()
#--------------------------------------------------------------
# standard acronyms that should be capitalized
#--------------------------------------------------------------
acronyms_to_capitalize_set = set()
#--------------------------------------------------------------
# standard change pairs
#--------------------------------------------------------------
english_standard_change_pairs_dict = {}
#--------------------------------------------------------------
# merge of the standard and the custom change pairs dicts
# (custom pairs are applied last, so they override standard pairs with the same key)
#--------------------------------------------------------------
merged_change_pairs_dict = {}
#--------------------------------------------------------------
# standard list of frequent names to always delete
#--------------------------------------------------------------
global_first_names_set = set()
#--------------------------------------------------------------
# English to Spanish Translation Dictionary
#--------------------------------------------------------------
english_to_spanish_dict = {}
#--------------------------------------------------------------
# User Custom English to Other Language Translation Dictionary
#--------------------------------------------------------------
english_to_other_language_dict = {}
#--------------------------------------------------------------
#--------------------------------------------------------------
# User Specific Custom Word Rules
#--------------------------------------------------------------
USER_CUSTOM_WORD_RULES_SINGULAR_PLURAL_PAIRS_FILENAME = "user_singular_plural_pairs.string"
user_custom_word_rules_plurals_pairs_full_path = "unknown"
user_custom_plurals_dict  = {}
user_custom_singulars_dict = {}
save_user_custom_plurals_dicts_to_config_directory = False
#--------------------------------------------------------------
USER_CUSTOM_WORD_RULES_CHANGE_PAIRS_FILENAME = "user_change_pairs.string"
user_custom_word_rules_change_pairs_full_path  = "unknown"
user_custom_word_rules_change_pairs_dict = {}
user_custom_word_rules_uppercase_set = set()
user_custom_word_rules_titlecase_set = set()
user_custom_word_rules_lowercase_set = set()
#--------------------------------------------------------------
USER_CUSTOM_WORD_RULES_GOOD_WORDS_FILENAME = "user_good_words.string"
user_custom_word_rules_good_words_full_path  = "unknown"
user_custom_word_rules_good_set = set()
#--------------------------------------------------------------
USER_CUSTOM_WORD_RULES_BAD_WORDS_FILENAME = "user_bad_words.string"
user_custom_word_rules_bad_words_full_path  = "unknown"
user_custom_word_rules_bad_set = set()
#--------------------------------------------------------------
#--------------------------------------------------------------


#--------------------------------------------------------------
# Accumulators used to calculate a "job" weighted average % of remaining words
n_accumulated_job_total_original_words = 0
n_accumulated_job_total_net_words = 0
#--------------------------------------------------------------


#------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def main_english_noun_frequency(self,guidb, plugin_path, book_ids, param_dict, log=None, abort=None, notifications=True):
    """Entry point of an 'English Noun Frequency' job.

    Resets the module-level job state, stores copies of the job inputs in the
    module globals, opens the library's ``metadata.db`` with APSW, hands
    control to ``ENF_Control()`` and finally closes the connection.

    Parameters:
        self:           not used by this function.
        guidb:          object exposing ``library_path`` (str or bytes); kept in ``my_guidb``.
        plugin_path:    stored as-is in ``my_plugin_path``.
        book_ids:       list of book ids; a sorted deep copy is kept in ``my_book_ids``.
        param_dict:     job options; a shallow copy is kept in ``my_param_dict``.
        log:            callable taking one message string.  Despite the default of
                        ``None`` it is called unconditionally, so a real logger is required.
        abort:          not used by this function.
        notifications:  object with a ``put()`` method (progress queue).  Despite the
                        default of ``True`` a real queue is required.

    Raises:
        Exception: whatever ``apsw.Connection()`` raises when the database cannot be
        opened (logged first), and anything raised by the processing itself.
    """

    clear_or_initialize_globals()

    # Only the globals that are re-bound here need to be declared.
    global header_s1
    global header_s2
    global header_s3

    global my_book_ids
    global my_param_dict

    global my_guidb

    global my_plugin_path

    log("Starting 'English Noun Frequency'")
    notifications.put((0.002, 'Beginning English Noun Frequency'))
    #----------------------------------------------------------------------------------------------------------------
    # Work on private copies so the caller's objects are never modified by the job.
    my_book_ids = deepcopy(book_ids)
    my_book_ids.sort()

    my_param_dict = param_dict.copy()

    my_plugin_path = plugin_path

    #----------------------------------------------------------------------------------------------------------------
    my_guidb = guidb
    path = my_guidb.library_path
    if isbytestring(path):
        path = path.decode(filesystem_encoding)
    # Normalize to forward slashes both before and after joining.
    path = path.replace(os.sep, '/')
    path = os.path.join(path, 'metadata.db')
    path = path.replace(os.sep, '/')

    log("Library DB: " + path)
    try:
        my_db = apsw.Connection(path)
    except Exception as e:
        log(enf_as_unicode(e))
        raise    # bare raise keeps the original traceback

    # From here on the connection is always closed, even if the job fails part-way.
    try:
        my_cursor = my_db.cursor()

        header_s1 = "SQLite Version: " + enf_as_unicode(apsw.SQLITE_VERSION_NUMBER) + "    [APSW]"
        mysql = "PRAGMA main.busy_timeout = 15000"         #milliseconds
        my_cursor.execute(mysql)
        header_s2 = mysql

        header_s3 = "Beginning 'English Noun Frequency' Processing"

        log_heading_common(log,header_s1,header_s2,header_s3,header_s4,header_s5)

        #----------------------------------------------------------------------------------------------------------------
        log(" ")

        ENF_Control(my_cursor,my_db,log,notifications)

        log(" ")
        #----------------------------------------------------------------------------------------------------------------
    finally:
        my_db.close()

    log(" ")
    log(" ")
    log("Job complete.")

    clear_or_initialize_globals()

    return
#----------------------------------------------------------------------------------------------------------------
def clear_or_initialize_globals():
    """Reset the module-level job state.

    Called both at the beginning and at the end of a job.  Module globals persist
    from one job to the next job(s) submitted (unless calibre is exited first), so
    every container is emptied in place and every scalar is put back to its
    initial value.

    Takes no parameters and returns None.
    """

    if DEBUG: print("Clearing or initializing globals")

    # clear all sets, dicts, and ordered dicts from the previous job.
    # They are mutated in place, so no 'global' declaration is needed for them.
    for container in (
        accumulated_most_frequent_nouns_dict,
        english_nouns_to_keep_set,
        english_obscenities_set,
        english_plurals_dict,
        english_singulars_dict,
        english_standard_change_pairs_dict,
        english_words_to_delete_set,
        global_first_names_set,
        merged_change_pairs_dict,
        my_param_dict,
        od_params_dict,
        user_custom_plurals_dict,
        user_custom_singulars_dict,   # previously missed: the partner of user_custom_plurals_dict
        user_custom_word_rules_bad_set,
        user_custom_word_rules_change_pairs_dict,
        user_custom_word_rules_good_set,
        user_custom_word_rules_lowercase_set,
        user_custom_word_rules_titlecase_set,
        user_custom_word_rules_uppercase_set,
        english_to_spanish_dict,
        english_to_other_language_dict,
    ):
        container.clear()

    my_book_ids[:] = []   #clear it

    # initialize all other globals (these are re-bound, so they must be declared)

    global  my_terminate_early
    global  my_current_book_id
    global  my_guidb
    global  my_plugin_path
    global  SUPPORTED_BOOK_FORMATS
    global  SUPPORTED_BOOK_FORMATS_STRING
    global  ACCUMULATED_MOST_FREQUENT_NOUNS_CSV_FILENAME
    global  ACCUMULATED_MOST_FREQUENT_NOUNS_TUPLES_FILENAME
    global  accumulated_most_frequent_nouns_tuples_file_full_path
    global  protected_data_directory
    global  accumulation_of_most_frequent_nouns_is_active
    global  accumulation_of_most_frequent_nouns_is_paused
    global  header_s1
    global  header_s2
    global  header_s3
    global  header_s4
    global  header_s5
    global  highest_number_of_nouns_to_keep
    global  n_accumulated_job_total_original_words
    global  n_accumulated_job_total_net_words

    my_terminate_early = False
    my_current_book_id = "000000"
    my_guidb = ""
    my_plugin_path = ""
    SUPPORTED_BOOK_FORMATS = ['TXT','EPUB','PDF']
    SUPPORTED_BOOK_FORMATS_STRING =  "(1st) TXT    (2nd) EPUB    (3rd) PDF"
    ACCUMULATED_MOST_FREQUENT_NOUNS_CSV_FILENAME = "accumulated_most_frequent_nouns.csv"
    ACCUMULATED_MOST_FREQUENT_NOUNS_TUPLES_FILENAME = "accumulated_most_frequent_nouns.tuples"
    accumulated_most_frequent_nouns_tuples_file_full_path = "unknown"
    protected_data_directory = "unknown"
    accumulation_of_most_frequent_nouns_is_active = False
    accumulation_of_most_frequent_nouns_is_paused = False
    header_s1 = None
    header_s2 = None
    header_s3 = None
    header_s4 = None
    header_s5 = None
    highest_number_of_nouns_to_keep  = 0

    # job-wide word counters
    n_accumulated_job_total_original_words = 0
    n_accumulated_job_total_net_words = 0
#----------------------------------------------------------------------------------------------------------------
def ENF_Control(my_cursor,my_db,log,notifications):

    if DEBUG: print("ENF Control")

    global my_book_ids
    global my_param_dict
    global my_guidb
    global highest_number_of_nouns_to_keep

    #-----------------------------------------------------------------
    global od_params_dict
    od_params_dict = collections.OrderedDict(sorted(my_param_dict.items()))

    log("Chosen Options: ")
    log("")
    log("-------------------------------------------")
    log("")

    for key,v in iteritems(od_params_dict):
        nk = key
        add_group_separator = False
        add_blank_line = False
        if key == 'COMMENTS_CHECKBOX':
            nk = "Update Comments?"
        if key == 'COMMENTS_MAX':
            nk = "Maximum Words to Add to Comments: "
        if key == 'COMMENTS_PREPEND_APPEND_REPLACE':
            nk = "Comments Location:"
        if key ==  'COMMENTS_REMOVE_PREVIOUS_ENF_COMMENTS_CHECKBOX':
            nk = "Remove Previous ENF Comments Prior to Update?"
            add_group_separator = True

        if key == 'TAGS_CHECKBOX':
            nk = "Add New Tags?"
        if key == 'TAGS_MAX':
            nk = "Maximum New Tags:"
        if key == 'TAGS_REPLACE_ADD':
            nk = "Only Add New Tags, or Replace All Existing Tags?"
            add_group_separator = True

        if key == 'CUSTOM_COLUMN_CHECKBOX':
            nk = "Update Custom Column?"
        if key == 'CUSTOM_COLUMN_MAX':
            nk = "Maximum Words in Custom Column:"
        if key == 'CUSTOM_COLUMN_NAME':
            nk = "Custom Column Specified:"
        if key == 'CUSTOM_COLUMN_SORT_ALPHA':
            nk = "Sort Custom Column Words Alphabetically (not by Frequency)?"
            add_group_separator = True

        if key == 'OTHER_CHECKBOX_ONLY_LOG_COMMENTS':
            nk = "Update Nothing.  Just Log the List of Words?"
        if key == 'OTHER_REMOVE_PREVIOUS_ENF_COMMENTS_CHECKBOX':
            nk = "Update Nothing.  Just Remove Previous ENF Comments?"
            add_group_separator = True

        if key == 'OTHER_SAVE_ALL_MOST_COMMON_TO_CSV_FILE_FULL_PATH':
            nk = 'Accumulate the Most Frequent Nouns in this .csv File: '
        if key == 'OTHER_SAVE_ALL_MOST_COMMON_TO_FILE':
            nk = 'Accumulate the Most Frequent Nouns for all books for all jobs?'
        if key ==  'OTHER_SAVE_ALL_MOST_COMMON_TO_FILE_PAUSE':
            nk = 'Pause the Accumulation of Most Frequent Nouns?'
            add_group_separator = True

        if key == 'OBFUSCATE_OBSCENITIES':
            nk = "Obfuscate Obscenities?"
        if key == 'REMOVE_GLOBAL_FIRST_NAMES':
            nk = "Delete Global First Names?"
        if key == 'REMOVE_TOP_100_NOUNS':
            nk = "Delete the Top 100 Most Common Nouns?"
            add_group_separator = True

        if key == 'TRANSLATE_ENGLISH_TO_OTHER_IS_ACTIVE':
            nk = "Is Translation of English Nouns Active?"
        if key == 'TRANSLATE_ENGLISH_TO_OTHER_LANGUAGE':
            nk = "English will be Translated to this Language: "
        if key == 'TRANSLATE_ENGLISH_TO_OTHER_LANGUAGE_USER_DICT_FILE':
            nk = "Custom Translation Mapping File to Use: "
            add_group_separator = True

#--------------------------------------------------------------

        if not v.startswith('#'):   # do not change the custom column name that is logged.
            if not ACCUMULATED_MOST_FREQUENT_NOUNS_CSV_FILENAME  in v:   #do not titlecase a file path
                try:
                    v = v.title()
                except:
                    pass

        log(nk + "  " + enf_as_unicode(v))

        if add_group_separator:
            log(" ")
            log("-------------------------------------------")
            log(" ")

        if add_blank_line:
            log(" ")
    #END FOR
    #-----------------------------------------------------------------

    library_path = my_guidb.library_path

    #-----------------------------------------------------------------
    global acronyms_to_capitalize_set       # as of 2015-08-04, there are:  54 words
    acronyms_to_capitalize_set = set(['ncis', 'itp', 'gmo', 'ttp', 'dna', 'nasa', 'esa', 'cia', 'rda', 'nato', 'faq', 'cod', 'mia', 'un', 'csf', 'eeg', 'mri', 'turp', 'icu', 'wwi', 'lcd', 'scuba', 'rna', 'aids', 'opec', 'xml', 'cbc', 'tmj', 'emg', 'vp', 'led', 'ywca', 'rem', 'hiv', 'unesco', 'dpt', 'va', 'ekg', 'ndiu', 'vat', 'sos', 'nsa', 'wwii', 'pku', 'fbi', 'eu', 'adhd', 'cpr', 'ppd', 'ecg', 'nbl', 'copd', 'pow', 'rip'])
    #-----------------------------------------------------------------
    global english_contractions_to_transform_list
    english_contractions_to_transform_list[:] = []   # clear it
    tmp_list = list(["would've","let'em","aren't","can't","couldn't","didn't","doesn't","don't","hadn't","hasn't","haven't","he'd","he'll","he's","i'd","i'll","i'm","i've","isn't","it's","let's","mightn't","mustn't","shan't","she'd","she'll","she's","shouldn't","that's","there's","they'd","they'll","they're","they've","we'd","we're","we've","weren't","what'll","what're","what's","what've","where's","who'd","who'll","who's","who've","won't","wouldn't","you'd","you'll","you're","you've"])
    for row in tmp_list:          # a list of all English standard contractions     can't, we'll, they've, you're, etc.
        contraction = row
        english_contractions_to_transform_list.append(contraction)   # now is entirely ASCII, so do not have to encode it repeatedly upon use
    #END FOR
    del tmp_list
    global english_failed_contractions_list
    english_failed_contractions_list[:] = []   # clear it
    for row in english_contractions_to_transform_list:
        s = row
        s = s.replace("'","")         # remove the single quote.   you'll becomes youll  and can't becomes cant.   backup to replacing fancy unicode quotes with ASCII quotes.
        english_failed_contractions_list.append(s)
    #END FOR

    #-----------------------------------------------------------------
    global format_stats_dict
    format_stats_dict.clear()
    format_stats_dict['TXT'] = 0
    format_stats_dict['EPUB'] = 0
    format_stats_dict['PDF'] = 0
    format_stats_dict['UNSUPPORTED'] = 0
    format_stats_dict = collections.OrderedDict(sorted(format_stats_dict.items()))
    #-----------------------------------------------------------------
    global english_plurals_dict
    global english_singulars_dict
    #-----------------------------------------------------------------
    english_plurals_dict, english_singulars_dict  = return_english_plurals_dict()
    n = len(english_plurals_dict)
    s = '{:,}'.format(n)
    log("Number of English word pairs in the standard 'singular:plural pair' list:  " + s)
    log(" ")
    #-----------------------------------------------------------------
    global english_words_to_delete_set
    english_words_to_delete_set = get_english_common_words_to_delete(log,notifications)
    n = len(english_words_to_delete_set)
    s = '{:,}'.format(n)
    log("Number of English words in the standard 'always discard' list:  " + s)
    log(" ")
    #-----------------------------------------------------------------
    global global_first_names_set
    global_first_names_set = return_global_first_names_set(log,notifications)
    n = len(global_first_names_set)
    s = '{:,}'.format(n)
    log("Number of global first names in the standard  'first names to discard' list:  " + s)
    log(" ")
    #-----------------------------------------------------------------
    global english_nouns_to_keep_set
    english_nouns_to_keep_set = get_set_of_nouns_to_keep()

    for k,v in iteritems(english_plurals_dict):    # add singular and plural
        if not k in english_nouns_to_keep_set:
            english_nouns_to_keep_set.add(k)
        if not v in english_nouns_to_keep_set:
            english_nouns_to_keep_set.add(v)
    #END FOR
    n = len(english_nouns_to_keep_set)
    s = '{:,}'.format(n)
    log("Number of English words in the standard 'always keep' list:  " + s)
    log(" ")
    #-----------------------------------------------------------------
    global english_obscenities_set
    if od_params_dict['OBFUSCATE_OBSCENITIES'] == unicode_type("True"):
        english_obscenities_set = return_english_obscenities_set()
        n = len(english_obscenities_set)
        s = '{:,}'.format(n)
        log("Number of English words in the standard 'obscenities' list:  " + s)
    else:
        english_obscenities_set = set()
        log("Standard 'obscenities' will not be obfuscated (as per settings).")
    log(" ")
    #-----------------------------------------------------------------
    global english_standard_change_pairs_dict
    english_standard_change_pairs_dict = return_english_standard_change_pairs_dict()
    n = len(english_standard_change_pairs_dict)
    s = '{:,}'.format(n)
    log("Number of English word pairs in the standard 'change pairs' list:  " + s)
    log(" ")
    #-----------------------------------------------------------------
    n = len(acronyms_to_capitalize_set)
    s = '{:,}'.format(n)
    log("Number of English words in the standard 'acronyms to capitalize' list:  " + s)
    log(" ")
    log(" ")
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    global user_custom_plurals_dict
    global user_custom_singulars_dict
    global save_user_custom_plurals_dicts_to_config_directory

    #-----------------------------------------------------------------
    load_user_custom_word_rules_for_use(log)
    #-----------------------------------------------------------------
    load_user_custom_plurals_file(log)
    #-----------------------------------------------------------------
    # add k and v from the exploded custom plural pairs plus change pairs, and add those to the Custom Good Words set before continuing.
    # any word in any pair is, by definition, a good word until changed into something else.

    for k,v in iteritems(user_custom_plurals_dict):
        user_custom_word_rules_good_set.add(k)
        user_custom_word_rules_good_set.add(v)

    global user_custom_word_rules_change_pairs_dict

    for k,v in iteritems(user_custom_word_rules_change_pairs_dict):
        user_custom_word_rules_good_set.add(k)
        user_custom_word_rules_good_set.add(v)

    #-----------------------------------------------------------------
    # add k and v from the exploded standard plural pairs plus change pairs, and add those to the Standard Good Words set before continuing.
    # any word in any pair is, by definition, a good word until changed into something else.

    for k,v in iteritems(english_plurals_dict):
        english_nouns_to_keep_set.add(k)
        english_nouns_to_keep_set.add(v)

    for k,v in iteritems(english_standard_change_pairs_dict):
        english_nouns_to_keep_set.add(k)
        english_nouns_to_keep_set.add(v)

    #-----------------------------------------------------------------
    global merged_change_pairs_dict

    merged_change_pairs_dict_tmp = {}
    merged_change_pairs_dict_tmp.update(english_standard_change_pairs_dict)
    merged_change_pairs_dict_tmp.update(user_custom_word_rules_change_pairs_dict)

    for k,v in iteritems(merged_change_pairs_dict_tmp):
        k = unicode_type(k)              # job rule: all dicts use only unicode or int .  never bytestrings.
        v = unicode_type(v)
        merged_change_pairs_dict[k] = v
    #END FOR
    del merged_change_pairs_dict_tmp
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    # check all of the sets for overlapping words, and adjust according to each set's priority
    synchronize_all_user_and_standard_word_rule_sets(log)
    log(" ")
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    global translate_english_to_other_is_active
    global translate_english_to_other_language

    global english_to_other_language_dict_file_path

    translate_english_to_other_is_active = my_param_dict['TRANSLATE_ENGLISH_TO_OTHER_IS_ACTIVE']
    translate_english_to_other_language = my_param_dict['TRANSLATE_ENGLISH_TO_OTHER_LANGUAGE']
    english_to_other_language_dict_file_path = my_param_dict['TRANSLATE_ENGLISH_TO_OTHER_LANGUAGE_USER_DICT_FILE']

    global english_to_spanish_dict
    english_to_spanish_dict.clear()

    global english_to_other_language_dict
    english_to_other_language_dict.clear()

    if translate_english_to_other_is_active:
        if translate_english_to_other_language == unicode_type("Español"):
            tmp_dict = return_english_to_spanish_dict()
            english_to_spanish_dict = {}
            for k,v in iteritems(tmp_dict):
                if isbytestring(k):
                    k = as_unicode(k)
                if isbytestring(v):
                    v = as_unicode(v)
                english_to_spanish_dict[k] = v
            del tmp_dict
            english_to_other_language_dict = return_english_to_other_language_dict(english_to_other_language_dict_file_path,translate_english_to_other_language,log)
            if len(english_to_other_language_dict) > 0:
                for k,v in iteritems(english_to_other_language_dict):
                    if isbytestring(k):
                        k = as_unicode(k)
                    if isbytestring(v):
                        v = as_unicode(v)
                    english_to_spanish_dict[k] = v
                #END FOR
                english_to_other_language_dict.clear()  # no longer need the supplemental user custom dict for Spanish...
            else:
                pass
        else:
            if translate_english_to_other_language == unicode_type("Other Language"):
                english_to_other_language_dict = return_english_to_other_language_dict(english_to_other_language_dict_file_path,translate_english_to_other_language,log)
            else:
                translate_english_to_other_is_active = False
                translate_english_to_other_language = "none"
    else:
        translate_english_to_other_language = "none"

    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    cc_list = build_custom_column_list(my_cursor,my_db,log,notifications)

    q1 = my_param_dict['COMMENTS_MAX']
    q2 = my_param_dict['TAGS_MAX']
    q3 = my_param_dict['CUSTOM_COLUMN_MAX']

    q1 = int(q1)
    q2 = int(q2)
    q3 = int(q3)

    highest_number_of_nouns_to_keep = q1
    if q2 > highest_number_of_nouns_to_keep:
        highest_number_of_nouns_to_keep = q2
    if q3 > highest_number_of_nouns_to_keep:
        highest_number_of_nouns_to_keep = q3

    n_total = len(my_book_ids)
    if n_total == 0:
        log(" ")
        log("No Books Found by Job.  Terminating.")
        return
    log(" ")
    log("-------------------------------------------")
    log(" ")
    log(" ")
    n = n_total
    s = '{:,}'.format(n)
    log("Number of selected books for which to determine 'English Noun Frequency':     " + s)
    log(" ")
    log(" ")
    log("Priority sequence in which book formats will be searched until one is found to use:     " + SUPPORTED_BOOK_FORMATS_STRING)
    log(" ")
    log(" ")
    log(" ")
    log("═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════")

    #--------------------------------------------------------------------------------------

    #-------------------------------------------------------------------------------------------
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    #  ACCUMULATED_MOST_FREQUENT_NOUNS
    #-----------------------------------------------------------------
    global accumulated_most_frequent_nouns_dict
    accumulated_most_frequent_nouns_dict.clear()
    global accumulation_of_most_frequent_nouns_is_active
    global accumulation_of_most_frequent_nouns_is_paused
    if my_param_dict['OTHER_SAVE_ALL_MOST_COMMON_TO_FILE'] == unicode_type("True"):
        accumulation_of_most_frequent_nouns_is_active = True
    if my_param_dict['OTHER_REMOVE_PREVIOUS_ENF_COMMENTS_CHECKBOX'] == unicode_type("True"):
        accumulation_of_most_frequent_nouns_is_active = False     # Not a real update execution, so deactivate this
    if my_param_dict['OTHER_SAVE_ALL_MOST_COMMON_TO_FILE_PAUSE'] == unicode_type("True"):
        accumulation_of_most_frequent_nouns_is_paused = True
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    n_done = 0
    for book in my_book_ids:
        current_book = enf_as_unicode(book)
        if DEBUG: print("Current book id is: ", current_book, " ______________________________________________________________________________________")
        t_started = time.time()
        if my_param_dict['OTHER_REMOVE_PREVIOUS_ENF_COMMENTS_CHECKBOX'] == unicode_type("True"):    # comments will not be updated at all in this job
            remove_previous_enf_comments(my_cursor,my_db,log,notifications,current_book)
            if my_param_dict['OTHER_CHECKBOX_ONLY_LOG_COMMENTS'] == unicode_type("False"):
                continue
        my_html_stripper = ENFHTMLStripper()
        my_html_stripper.convert_charrefs = True
        if my_param_dict['COMMENTS_REMOVE_PREVIOUS_ENF_COMMENTS_CHECKBOX'] == unicode_type("True"):     # comments are updated normally in this job after this has been done first
            remove_previous_enf_comments(my_cursor,my_db,log,notifications,current_book)
        new_comments,most_common_list,errors,full_book_path = determine_enf_single_book(my_cursor,my_db,log,notifications,current_book,library_path,my_html_stripper)
        if DEBUG: print("after determine_enf_single_book:  full_book_path for current book: ", full_book_path)
        del my_html_stripper
        if errors == "":
            if len(most_common_list) == 0:
                log("No Nouns Were Found in this Book with the Format Shown in the Path.")
            else:
                sleep(1.0)  # cede some cpu
                skip_updates = False
                if my_param_dict['OTHER_CHECKBOX_ONLY_LOG_COMMENTS'] == unicode_type("True"):
                    skip_updates = True
                    log_only(new_comments,log)
                if not skip_updates:
                    if my_param_dict['COMMENTS_CHECKBOX'] == unicode_type("True"):
                        update_comments(my_cursor,my_db,log,notifications,current_book,new_comments)
                    if my_param_dict['TAGS_CHECKBOX'] == unicode_type("True"):
                        create_tags(my_cursor,my_db,log,notifications,current_book,most_common_list)
                    if my_param_dict['CUSTOM_COLUMN_CHECKBOX'] == unicode_type("True"):
                        update_custom_column(my_cursor,my_db,log,notifications,current_book,most_common_list,cc_list)
        else:
            log("ERRORS for current book: " + enf_as_unicode(current_book) + "  >>>" + enf_as_unicode(errors))

        del new_comments
        del most_common_list
        del errors
        del current_book

        n_done = n_done + 1

        n_percent = float(n_done/n_total)

        i = full_book_path.rfind("/")
        full_book_path = full_book_path[i+1: ]
        if len(full_book_path) < 6:  #no path to show due to format issue
            full_book_path = "[A Book That Has No Supported Format]"
        full_book_path = unicode_type(full_book_path)
        notifications.put((n_percent, ("Finished: " + full_book_path) ) )
        del full_book_path
        t_finished = time.time()
        elapsed = int(t_finished - t_started)
        log(" ")
        log(" ")
        log("Elapsed time to process this book was: " + enf_as_unicode(elapsed) + " seconds")
        log(" ")
        if n_done != n_total:
            log("_______________________________________________________________________________________________")
        log(" ")
        sleep(0.50)   # cede some cpu
    #-------------------------------------
    #END FOR
    #-------------------------------------

    #--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


    log("═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════")

    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------

    finalize_accumulated_most_frequent_nouns(log)

    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
    # calculate a "job" weighted average % of remaining words

    global n_accumulated_job_total_original_words
    global n_accumulated_job_total_net_words

    try:
        log("_______________________________________________________________________________________________")
        log(" ")
        s1 = '{:,}'.format(n_accumulated_job_total_original_words)
        if n_accumulated_job_total_original_words == 0:
            s2 = '{:.2%}'.format((0))
        else:
            s2 = '{:.2%}'.format((n_accumulated_job_total_net_words/n_accumulated_job_total_original_words))
        s3 = '{:,}'.format(n_accumulated_job_total_net_words)
        msg = "Percentage of the  " + enf_as_unicode(s1) + " total words from this entire Job remaining after discarding all undesired English words: " + enf_as_unicode(s2) + ", or: " + enf_as_unicode(s3) + " net words"
        log(msg)
    except:
        pass
        log("Weighted Average Percentage of Remaining Words: ", wap)
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------

    log("_______________________________________________________________________________________________")
    log(" ")
    for k,v in iteritems(format_stats_dict):
        k = k + "                           "
        k = k[0:16]
        s = '{:,}'.format(v)
        log("Format: " + k + " Books: " + s)
    log(" ")
    log("_______________________________________________________________________________________________")
    log(" ")
    log("'English Noun Frequency' has completed.")
    log(" ")

    format_stats_dict.clear()
    #-----------------------------------------------------------------
    #-----------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------
def determine_enf_single_book(my_cursor,my_db,log,notifications,current_book,library_path,my_html_stripper):
    """Run the whole ENF pipeline (locate, load, filter, condense, analyze) for one book.

    Returns the tuple (new_comments, most_common_list, errors, full_book_path).
    new_comments and most_common_list stay "" unless analyze_text() was reached;
    errors is "" on success.  The per-format counters in the global
    format_stats_dict are incremented as a side effect.
    """

    if DEBUG: print("Determining ENF for a single book")

    global format_stats_dict

    new_comments = ""
    most_common_list = ""

    full_book_path,format_to_use,errors = build_book_path(my_cursor,my_db,log,notifications,current_book,library_path)

    if errors != "" or format_to_use == "":
        # Any error text returned by build_book_path() is replaced by this generic message.
        errors = ("No Supported Format Found for Current Book: " + enf_as_unicode(current_book))
        format_stats_dict['UNSUPPORTED'] = format_stats_dict['UNSUPPORTED'] + 1
    else:
        log("Book: " + full_book_path)
        format_stats_dict[format_to_use] = format_stats_dict[format_to_use] + 1

        file_data,errors = load_book_file(log,notifications,full_book_path,format_to_use)

        if errors == "":
            if not isinstance(file_data,list):
                # should never happen: wrap the single payload so the filter always receives a list
                file_data = [enf_as_unicode(file_data)]
            file_data = only_unicode_recursive(file_data)
            sleep(0.1)  # cede some cpu

            text_data = filter_text(file_data,log,notifications,my_html_stripper)
            if not isinstance(text_data,unicode_type):
                text_data = enf_as_unicode(text_data)

            condensed_text = condense_text(text_data,log)
            sleep(0.1)  # cede some cpu

            new_comments,most_common_list,errors = analyze_text(condensed_text,errors,log)
            sleep(0.1)  # cede some cpu

    if DEBUG: print("========== Finished:  Determining ENF for a single book")

    return new_comments,most_common_list,errors,full_book_path
#----------------------------------------------------------------------------------------------------------------
# One "(word, count)" entry of the single-line .tuples file.  Accepts the current
# Python 3 form ('word', 1) as well as the legacy Python 2 forms (u'word', 1) and (u'word', 1L).
# The optional "u" is only recognized directly in front of the opening quote, so a
# word that merely ends in "u" (menu, guru, you ...) is left intact.
_ACCUMULATED_NOUN_TUPLE_RE = re.compile(r"""\(\s*u?(?:'([^']*)'|"([^"]*)")\s*,\s*(-?\d+)[^()]*\)""")

def _parse_accumulated_noun_tuples(tuples_line):
    """Return {word: count} parsed from the text of the .tuples file; malformed entries are skipped."""
    parsed = {}
    for single_quoted, double_quoted, count in _ACCUMULATED_NOUN_TUPLE_RE.findall(tuples_line):
        word = unicode_type(single_quoted or double_quoted).strip()
        if word:
            parsed[word] = int(count)        # job rule: all dicts must stay totally unicode or int to avoid chaos.
    return parsed

def finalize_accumulated_most_frequent_nouns(log):
    """Merge this job's noun frequencies into the persistent cumulative totals and export them.

    Does nothing when accumulation is paused or not active.  Otherwise:
      [1] reads the cumulative totals from the .tuples file (if present) and rotates
          that file to .backup1 (and the previous .backup1 to .backup2);
      [2] adds the current job's global accumulated_most_frequent_nouns_dict to the
          totals and clears that global;
      [3] writes the merged totals back to the .tuples file as one single line;
      [4] writes the merged totals, sorted by descending frequency then word, to the
          user's .csv file.
    Any exception is logged (not raised); if the .tuples file was rotated away and
    has not been rewritten yet, it is restored from its backup first.

    log -- callable used for all job log output.
    """

    if DEBUG: print("Finalizing accumulated most frequent nouns")

    #-----------------------------------
    global accumulation_of_most_frequent_nouns_is_paused
    global accumulation_of_most_frequent_nouns_is_active
    if accumulation_of_most_frequent_nouns_is_paused or not accumulation_of_most_frequent_nouns_is_active:
        return
    #-----------------------------------
    # the csv file may be somewhere totally different than any other ENF file, even on another drive.
    s_filename_csv = unicode_type(my_param_dict['OTHER_SAVE_ALL_MOST_COMMON_TO_CSV_FILE_FULL_PATH']).strip()
    s_filename_csv = s_filename_csv.replace(os.sep,"/")
    #-----------------------------------
    # the .tuples and .backup1,2 files are ALWAYS in the same place, which is the protected directory where all the user_good_words etc. etc. are also
    global accumulated_most_frequent_nouns_tuples_file_full_path
    if accumulated_most_frequent_nouns_tuples_file_full_path == "unknown":
        build_protected_data_files_full_paths(log)
    s_filename_dict = accumulated_most_frequent_nouns_tuples_file_full_path
    # Bound before the try block so the error handler below can always refer to them.
    s_filename_dict_backup_1 = s_filename_dict.replace("tuples","backup1")
    s_filename_dict_backup_2 = s_filename_dict.replace("tuples","backup2")
    tuples_file_was_rotated = False
    #-----------------------------------
    global accumulated_most_frequent_nouns_dict            #has already been populated by the job just ending
    try:
        #[1] read in the cumulative totals, expecting the identical format written out in step [3], below.
        accumulated_most_frequent_nouns_dict_new = {}
        if os.path.exists(s_filename_dict):
            with open(s_filename_dict, 'r') as dictfile:
                input_dict_tuples = dictfile.readline()        # the whole list of tuples is stored on one single line
            accumulated_most_frequent_nouns_dict_new = _parse_accumulated_noun_tuples(enf_as_unicode(input_dict_tuples))

            #now rename the original to the first backup name
            if os.path.exists(s_filename_dict_backup_1):
                if os.path.exists(s_filename_dict_backup_2):
                    os.remove(s_filename_dict_backup_2)
                os.rename(s_filename_dict_backup_1, s_filename_dict_backup_2)
            if os.path.exists(s_filename_dict):
                os.rename(s_filename_dict, s_filename_dict_backup_1)
                tuples_file_was_rotated = True
        #-----------------------------------
        #[2] Update the cumulative totals with the current job's dict (entries with an empty word or a zero/unusable count are ignored)
        for k,v in iteritems(accumulated_most_frequent_nouns_dict):
            if not k or not v:
                continue
            if not isinstance(v,int):
                try:
                    v = int(enf_as_unicode(v).strip())
                except (TypeError, ValueError):
                    continue
            k = enf_as_unicode(k.strip())   # job rule: all dicts must stay totally unicode or int to avoid chaos.
            accumulated_most_frequent_nouns_dict_new[k] = accumulated_most_frequent_nouns_dict_new.get(k, 0) + int(v)
        #END FOR
        accumulated_most_frequent_nouns_dict.clear() #do not delete this global; just clear it.
        #-----------------------------------
        #[3] write out the new totals as a list of tuples to the .tuples file for future reading back in, updating, then writing back out again...
        if len(accumulated_most_frequent_nouns_dict_new) > 0:
            output_dict_tuples = enf_as_unicode([(k,v) for k,v in iteritems(accumulated_most_frequent_nouns_dict_new)])
            with open(s_filename_dict, 'w') as tuples_file:
                tuples_file.write(output_dict_tuples)    # one single line only, so that step [1] can use .readline()
        #-----------------------------------
        #[4] write out the totals in .csv format to the .csv file for the user to mess with.  It will never be read into this program. Ever.
        mytuples_sorted = sorted(iteritems(accumulated_most_frequent_nouns_dict_new), key=lambda kv: (-kv[1], kv[0].lower()))   # frequency descending, then word ascending

        import csv
        with open(s_filename_csv, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['word', 'accumulated_frequency'], delimiter=',', lineterminator='\n')
            writer.writeheader()
            for k,v in mytuples_sorted:
                k = unicode_type(k).strip()
                v = unicode_type(v).strip()
                writer.writerow({'word': k, 'accumulated_frequency': v})
                if DEBUG:
                    s_line = 'word  ' + enf_as_unicode(k) + '     accumulated_frequency  ' + enf_as_unicode(v)
                    print(s_line)
            #END FOR

        log("The accumulated most frequent nouns inception-to-date frequencies were written to your personal .csv file.")
        log(" ")
        s = '{:,}'.format(len(accumulated_most_frequent_nouns_dict_new))
        log("The number of words with their corresponding frequencies saved to your personal .csv file:        " + s)
        log(" ")
        log(" ")
        #-----------------------------------
    except Exception as e:
        # restore file from its backup if it does not exist because it was renamed just prior to this failure.
        if tuples_file_was_rotated and not os.path.exists(s_filename_dict):
            if os.path.exists(s_filename_dict_backup_1):
                os.rename(s_filename_dict_backup_1,s_filename_dict)
            elif os.path.exists(s_filename_dict_backup_2):
                os.rename(s_filename_dict_backup_2,s_filename_dict)
        log("=================================ERROR============================================")
        log("Updating of accumulated most frequent nouns was aborted due to this fatal ERROR: " + enf_as_unicode(e) )
        log("=================================ERROR============================================")
#----------------------------------------------------------------------------------------------------------------
def log_only(new_comments,log):
    """Write the generated comments to the job log (used when nothing is saved to the database)."""
    log(new_comments)
#----------------------------------------------------------------------------------------------------------------
def update_comments(my_cursor,my_db,log,notifications,current_book,new_comments_uc):
    """Store the new ENF comments for one book in table 'comments'.

    Depending on my_param_dict['COMMENTS_PREPEND_APPEND_REPLACE'] the new text is
    put in front of ("prepend"), behind ("append"), or in place of (any other
    value) the book's existing comments.  Nothing is written when the resulting
    text is empty or blank.
    """

    if DEBUG: print("Updating comments")

    # Fetch the current comments text; if several rows/columns come back the last one wins.
    my_cursor.execute('SELECT text FROM comments WHERE book = ? ',([current_book]))
    old_comments = ""
    rows = my_cursor.fetchall()
    if rows:
        for row in rows:
            for col in row:
                old_comments = col
    old_comments = old_comments.strip()

    mode = my_param_dict['COMMENTS_PREPEND_APPEND_REPLACE']
    if mode == "prepend":
        text = new_comments_uc + old_comments
    elif mode == "append":
        text = old_comments + new_comments_uc
    else:
        text = new_comments_uc   # replace all previous comments, if any.

    if text and text.strip() != "":
        my_cursor.execute("begin")
        my_cursor.execute("INSERT OR REPLACE INTO comments (id,book,text) VALUES (null,?,?) ",(current_book,text))
        my_cursor.execute("commit")
        log(" ")
        log("\t\t\t- - Comments were updated.")
#----------------------------------------------------------------------------------------------------------------
def remove_previous_enf_comments(my_cursor,my_db,log,notifications,current_book):
    """Strip a previously written ENF block out of one book's comments.

    Everything from "Most Frequent Words" up to the last run of 23 underscores
    is replaced by a single blank.  If other comment text remains, the row in
    table 'comments' is rewritten with it; if nothing but the ENF block was
    there, the row is deleted so that the old ENF comments really disappear.
    Books without an ENF block are left untouched.
    """
    #warning: the users need to know that if they 'prepend' in one job, then 'append' in the next job, and then 'remove previous enf comments' in the third job,
    #               all of the original real comments that were in the middle of the prepended and appended enf comments will be removed too.
    #               This can be easily avoided by *always* setting the options checkbox for 'remove_previous_enf_comments' to 'checked'.  So, can never have more than one (1) at a time.

    old_comments = ""
    text = ""

    my_cursor.execute('SELECT text FROM comments WHERE book = ? ',([current_book]))
    for row in my_cursor.fetchall() or []:
        for col in row:
            old_comments = col      # if several rows/columns come back the last one wins

    try:
        if old_comments and old_comments > " ":
            text = re.sub("Most Frequent Words.+_______________________"," ",old_comments, count=0,flags=re.IGNORECASE)
    except Exception:
        log("Could not remove the previous ENF Comments from this book.  Please do it manually.")
        return

    if not text or text == old_comments:
        return      # no comments at all, or no ENF block found in them

    my_cursor.execute("begin")
    if text.strip() != "":
        my_cursor.execute("INSERT OR REPLACE INTO comments (id,book,text) VALUES (null,?,?) ",(current_book,text))
    else:
        # The ENF block was the entire comment: writing back a blank would be pointless and
        # skipping the write would leave the old ENF comments in place, so drop the row.
        my_cursor.execute("DELETE FROM comments WHERE book = ? ",([current_book]))
    my_cursor.execute("commit")
    log("For Book: " + enf_as_unicode(current_book) + "  - - Previous ENF Comments were removed.")
    log(" ")
#----------------------------------------------------------------------------------------------------------------
def create_tags(my_cursor,my_db,log,notifications,current_book,most_common_list_tags):
    """Turn the book's most frequent nouns into Calibre Tags and link them to the book.

    At most my_param_dict['TAGS_MAX'] leading entries of most_common_list_tags
    are used.  Tags not yet known are inserted into table 'tags'; then, if
    my_param_dict['TAGS_REPLACE_ADD'] is "replace", the book's existing tag
    links are deleted; finally one link per new tag is inserted.
    """

    global my_param_dict
    global acronyms_to_capitalize_set

    n_max_tags_to_add = int(my_param_dict['TAGS_MAX'])
    if n_max_tags_to_add == 0:
        log("New Tags were requested, but the specified Maximum was set to 0.  No Tags have been created")
        return

    log(" ")

    # Take the leading entries; stop at the end of the list or at the first unusable entry.
    new_tags_list = []
    for position in range(n_max_tags_to_add):
        try:
            new_tags_list.append(most_common_list_tags[position].strip())
        except:
            break

    tags_dict = build_tags_dict(my_cursor,my_db,log,notifications)
    #--- tags_dict[name] = id

    # Tags are compared in lower case so that no duplicate Tags arise from case differences.
    # An id of 0 marks a tag that does not exist yet.
    for candidate in new_tags_list:
        candidate = candidate.strip().lower()
        if candidate > " " and candidate not in tags_dict:
            tags_dict[candidate] = 0

    insert_tag_sql = "INSERT OR IGNORE into tags (id,name) VALUES(null,?)"
    my_cursor.execute("begin")
    for candidate in new_tags_list:
        candidate = candidate.strip().lower()
        if not candidate > " ":
            continue
        try:
            is_new_tag = tags_dict[candidate] == 0
            if candidate in acronyms_to_capitalize_set:
                display_name = candidate.upper()
            elif is_new_tag and "*" in candidate:
                # only the very first character is capitalized for words carrying a "*"
                display_name = candidate[0].upper() + candidate[1: ].lower()
            else:
                display_name = candidate.title()   # *now* can titlecase it...
            if is_new_tag:
                my_cursor.execute(insert_tag_sql,([display_name]))
                log("\t\t\t- - " + display_name + " was created as an entirely new Calibre Tag.")
            else:
                log("\t\t\t- - " + display_name + " was already a Calibre Tag.")
        except:
            continue
    #END FOR
    my_cursor.execute("commit")

    log(" ")

    if my_param_dict['TAGS_REPLACE_ADD'] == unicode_type("replace"):
        my_cursor.execute("begin")
        my_cursor.execute("DELETE FROM books_tags_link WHERE book = ? ",([current_book]))
        my_cursor.execute("commit")
        log(" ")
        log("\t\t\t       - - All existing Tags for this book were deleted per user options.")
        log(" ")

    link_tag_sql = "INSERT into books_tags_link (id,book,tag) VALUES (null, ?, (SELECT id FROM tags WHERE name = ?) )"
    my_cursor.execute("begin")
    for candidate in new_tags_list:
        link_name = candidate.title()
        try:
            my_cursor.execute(link_tag_sql,(current_book,link_name))
            log("\t\t\t       - - " + link_name + " was added as a Tag for this book.")
        except:
            # any failure of the insert is reported as an already existing link
            log("\t\t\t       - - " + link_name + " already exists as a Tag for this book.")
    #END FOR
    my_cursor.execute("commit")

#----------------------------------------------------------------------------------------------------------------
def update_custom_column(my_cursor,my_db,log,notifications,current_book,most_common_list_cc,cc_list):
    """Write the book's most frequent nouns, comma separated, into the user's custom column.

    my_param_dict supplies the column ('CUSTOM_COLUMN_NAME', must start with "#"),
    the maximum number of nouns ('CUSTOM_COLUMN_MAX') and whether they are sorted
    alphabetically first ('CUSTOM_COLUMN_SORT_ALPHA').  cc_list holds rows of
    (id, label, datatype, is_multiple, normalized) describing the existing custom
    columns.  Acronyms and the user's "uppercase" words are written in upper
    case, words containing "*" get only their first character capitalized, and
    all other words are title-cased.  Every problem is logged and ends the call
    without touching the database.
    """

    if DEBUG: print("Updating the custom column")

    global my_param_dict
    global acronyms_to_capitalize_set
    global user_custom_word_rules_uppercase_set

    custom_column_name = my_param_dict['CUSTOM_COLUMN_NAME']
    if not custom_column_name.startswith("#"):
        log("Invalid Custom Column Name Specified.  It must begin with a '#' ")
        return

    n_max_nouns_to_add = int(my_param_dict['CUSTOM_COLUMN_MAX'])
    if n_max_nouns_to_add == 0:
        log("Update of a Custom Column was requested, but the specified Maximum was set to 0.  Nothing done.")
        return

    new_nouns_list = list(most_common_list_cc[:max(n_max_nouns_to_add, 0)])
    if len(new_nouns_list) == 0:
        log("Update of a Custom Column was requested, but this book has no results.  Nothing done.")
        return

    # Look the requested column up among the existing custom columns.
    cc_id = None
    cc_normalized = None
    for column_id,label,datatype,is_multiple,normalized in cc_list:
        if "#" + label == custom_column_name:
            cc_id = enf_as_unicode(column_id).strip()
            cc_normalized = normalized
            break

    if cc_id is None:
        log("Update of a Custom Column was requested, but specified #custom_column is invalid. Nothing done.")
        return

    s_custom_column_table = ("custom_column_" + cc_id).strip()
    s_books_custom_column_nn_link = ("books_custom_column_" + cc_id + "_link").strip()

    if enf_as_unicode(my_param_dict['CUSTOM_COLUMN_SORT_ALPHA']) == enf_as_unicode("True"):
        new_nouns_list.sort()

    formatted_words = []
    for word in new_nouns_list:
        word = enf_as_unicode(word).strip().lower()
        if not word:
            continue
        if (word in acronyms_to_capitalize_set) or (word in user_custom_word_rules_uppercase_set):
            word = word.upper()
        elif "*" in word:
            word = word[0].upper() + word[1: ].lower()
        else:
            word = word.title()   # *now* can titlecase it...
        # Every word is kept, whichever capitalization rule applied to it (acronyms included).
        formatted_words.append(word)
    value = ", ".join(formatted_words)

    if not value:
        log("Update of a Custom Column was requested, but this book has no results.  Nothing done.")
        return

    if cc_normalized == 0:

        # The value is stored directly per book.
        mysql = "INSERT OR REPLACE INTO " + s_custom_column_table + " (id,book,value) VALUES(null,?,?) "

        my_cursor.execute("begin")
        my_cursor.execute(mysql,(current_book,value))
        my_cursor.execute("commit")

    else:

        # The value lives in its own table and is connected to the book through the link table.
        mysql1 = "DELETE FROM " + s_books_custom_column_nn_link + " WHERE book = ?"

        my_cursor.execute("begin")
        my_cursor.execute(mysql1,([current_book]))
        my_cursor.execute("commit")

        # OR IGNORE keeps an already existing identical value (and its id) untouched.  OR REPLACE would
        # swap that row for a new one with a new id whenever the value conflicts with a uniqueness
        # constraint, orphaning the link rows of every other book that shares the value.
        # NOTE(review): assumes 'value' is declared UNIQUE in calibre's custom column tables - confirm.
        mysql2 = "INSERT OR IGNORE INTO " + s_custom_column_table + " (id,value) VALUES(null,?) "

        my_cursor.execute("begin")
        my_cursor.execute(mysql2,([value]))
        my_cursor.execute("commit")

        mysql3 = "INSERT OR REPLACE into " + s_books_custom_column_nn_link + " (id,book,value) VALUES (null, ?, (SELECT id FROM " + s_custom_column_table + " WHERE value = ?) )"

        my_cursor.execute("begin")
        my_cursor.execute(mysql3,(current_book,value))
        my_cursor.execute("commit")

        #delete any unused ids, if any
        mysql4 = "DELETE FROM " + s_custom_column_table + " WHERE id NOT IN(SELECT value FROM " + s_books_custom_column_nn_link + ") "

        my_cursor.execute("begin")
        my_cursor.execute(mysql4)
        my_cursor.execute("commit")
    #END IF

    log(" ")
    log("\t\t\t" + "- - Custom Column was updated.")
    log("\t\t\t" + "- - Value: " + value)
#----------------------------------------------------------------------------------------------------------------
def analyze_text(condensed_text_word_list,errors,log):
    # Applies Plural Pairs, Change Pairs, Discards Words, Calculates Word Frequencies,
    # Accumulates Frequencies by Word for Entire Job, Formats New Comments by Book

    if DEBUG: print("Analyzing text")

    global my_param_dict
    global highest_number_of_nouns_to_keep

    n_max_comments = int(my_param_dict['COMMENTS_MAX'])
    #---------------------------------------------------------
    #---------------------------------------------------------
    text_list = []

    for row in condensed_text_word_list:
        if not row:
            continue
        word = row.strip()
        if not word > " ":
            continue
        if not word.isalpha():
            continue
        word = unicode_type(word)
        text_list.append(word)          # text_list is pure unicode.  a proxy that is ASCII will be created as needed.  all dicts are also pure unicode.
    #END FOR

    del condensed_text_word_list

    #---------------------------------------------------------
    #  Apply Change Pair Rules:  #2 of 4
    #---------------------------------------------------------

    if DEBUG: print("Applying Change Pair Rules:  #2 of 4")

    global merged_change_pairs_dict  # merged in such a way that conflicts are 'won' by the user custom word rules change pairs dict, not the standard dict...
    # This was first done at the beginning of condense_text(), but other word changes have been made since then (such as gutting obscenities), so all words are checked again here.
    # Change only the 'word_in' to the 'word_out' here.  Later on, in #4 of 4, capitalization rules will be applied by matching words to the 'word_out' (the 'word' IS the 'word_out' after this point).
    if len(merged_change_pairs_dict) > 0:
        search_set = set(text_list)
        words_to_change_set = set()
        for k,v in iteritems(merged_change_pairs_dict):
            if k in search_set or enf_as_unicode(k) in search_set:
                k = unicode_type(k)
                words_to_change_set.add(k)
            else:
                continue
        #END FOR
        del search_set
        for index, word in enumerate(text_list):
            word = unicode_type(word)
            if word in words_to_change_set:
                new_word = merged_change_pairs_dict[word]
                text_list[index] = unicode_type(new_word)      # only changes the word itself, not any capitalization.  everything is still lower case.
            else:
                continue
        #END FOR
        del words_to_change_set
    else:
        pass
    #---------------------------------------------------------
    #---------------------------------------------------------
    global english_words_to_delete_set               #for filtering theoretical singulars of plurals
    global user_custom_word_rules_good_set     #overrides whatever the "standard" list says, for good or bad.
    global user_custom_word_rules_bad_set       #overrides whatever the "standard" list says, for good or bad.

    #change plurals to singulars prior to counting by Collections

    global english_obscenities_set  # do not change these words to their theoretical singulars or plurals.

    global english_plurals_dict           # manually created from English irregular plurals                                keys are singular and yield the plural
    global english_singulars_dict       # the inverse of english_plurals_dict, from which it was created.          keys are plural and yield the singular

    global user_custom_plurals_dict
    global user_custom_singulars_dict

    global acronyms_to_capitalize_set

    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------
    #  PLURAL PAIRS PASS #2 OF 2 AFTER CHANGE PAIRS ARE APPLIED [STEPS 1 & 2 ONLY]
    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------

    if DEBUG: print("Applying Plural Pairs pass #2 of 2")
    #----------------------------
    final_text_list = []
    for row in text_list:
        word = row
        word = word.strip()
        word = enf_as_unicode(word)

        if word in acronyms_to_capitalize_set:   #acronyms are acronyms, neither singular nor plural.
            final_text_list.append(word)
            continue

        if word in english_obscenities_set:
            final_text_list.append(word)
            continue

        if word[-4: ] == "itis":    # ends in s but is not a plural of a singular.  example: keratoconjunctivitis  teenageritis  meningitis
            final_text_list.append(word)
            continue

        if word[-4: ] == "esis":    # ends in s but is not a plural of a singular.  example: immunoelectrophoresis
            final_text_list.append(word)
            continue


        #---------------------------------------------------------
        # Change Plural to Singular - Pass #2 of 2
        #---------------------------------------------------------
        # second custom dict pass at changing plurals to their singulars

        if word in user_custom_plurals_dict:  # the k is the singular, so the current word already is a singular
            final_text_list.append(word)
            continue

        if word in english_plurals_dict:  # the k is the singular, so the current word already is a singular
            final_text_list.append(word)
            continue

        if word in user_custom_singulars_dict:
            singular = user_custom_singulars_dict.get(word, "unknown")
            if not (singular == "unknown"):  # just to be safe, as we know for certain already that the current word IS in the dict (per just above...)
                word = unicode_type(singular)
                final_text_list.append(word)
                continue

        # second standard dict pass at changing plurals to their singulars
        if word in english_singulars_dict:
            singular = english_singulars_dict.get(word, "unknown")
            if not (singular == "unknown"):  # just to be safe, as we know for certain already that the current word IS in the dict (per just above...)
                word = unicode_type(singular)
                final_text_list.append(word)
                continue

        if word[-2: ] == 'us':    # campus, octopus, genius, bonus, virus, discus, surplus, status, prospectus, apparatus, asparagus, thesaurus, hippopotamus, and other singulars derived from Latin...
            word = unicode_type(word)
            final_text_list.append(word)  # keep it the way it is...
            continue

        # first and only inflect pass to confirm that the current word is singular because it just may have been missing in both the user custom singulars dict *and* the standard singulars dict...
        if word[-1: ] == 's':  # probably a plural, but not necessarily.  however, known irregulars were taken care of previously, so it is likely.
            s_word = word
            inflect_singular = my_inflect_grammar_engine.singular_noun(s_word, count=None)
            if inflect_singular:
                # that means that s_word is highly likely a valid plural...
                # but, just to be sure, let's double-check...
                s_inflect_singular = inflect_singular
                inflect_plural = my_inflect_grammar_engine.plural_noun(s_inflect_singular, count=None)
                if inflect_plural:
                    inflect_plural = unicode_type(inflect_plural)
                    word = unicode_type(word)
                    inflect_singular = unicode_type(inflect_singular)
                    if inflect_plural == word:
                        if inflect_singular != word:
                            word = unicode_type(inflect_singular)     # proof that the current word was plural, but now is singular
                        else:
                            pass  # if the plural == word == singular because the plural pair is irregular (e.g. sheep == sheep), we are already finished with the current word
                else:  # inflect returned None because a valid singular was *not* passed to plural_noun(), which answers our question...
                    pass
            else:  # inflect returned None because a valid pural was *not* passed to singular_noun(), which answers our question...
                pass
        else:
            pass

        word = unicode_type(word)
        final_text_list.append(word)         #  final_text_list is pure unicode...
        continue
        #---------------------------------------------------------
        #---------------------------------------------------------
    #END FOR
    if DEBUG: print("finished with the second (and more comprehensive) pass for plurals")
    del text_list
    num_start_actual = len(final_text_list)
    if DEBUG: print("The length of final_text_list is: ", enf_as_unicode(num_start_actual))

    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------
    #  PLURAL PAIRS [BOTH STEPS]  & CHANGE PAIRS [STEPS 1 & 2 ONLY] ARE COMPLETE.
    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------

    if DEBUG: print("Counting the frequency of the entire current list of filtered words")

    #==========================================================================================================
    # COUNT THE FREQUENCY OF THE CURRENT LIST OF FILTERED WORDS
    #==========================================================================================================

    counter = collections.Counter(final_text_list)

    common_list = counter.most_common(2500) #prior to weeding out fluff.    the absolute max user can choose to keep per the options is 100, but the extra is for upcoming deletes of fluff.
    #                                                            100 max / 0.04 retention average for 'words' = 2500 that we must start with as an average minimum (i.e., ~ 96% of a book's words will be *deleted*).
    #~ ('heroes', 296)   296 = total count of that specific word in the top 2500 words that were output by collections.Counter()
    #~ ('prisoners', 69)
    #~ (......., nn)

    del counter
    del final_text_list

    if DEBUG: print("The length of common_list is: ", unicode_type(len(common_list)))

    if DEBUG: print("Finished counting")

    if DEBUG: print("Trimming the initial frequency list, and Accumulating the frequencies for the final list of 'good' words")

    #==========================================================================================================
    # FROM THIS POINT DOWN, ANY CHANGES TO THE REMAINING WORDS MUST ALSO BE REFLECTED IN THE DICT THAT HOLDS THEM PLUS THEIR COUNTS: frequency_dict
    #==========================================================================================================


    global english_nouns_to_keep_set
    #~ already declared:  global user_custom_word_rules_good_set     #overrides whatever the "standard" list says, for good or bad.
    #~ already declared:  global user_custom_word_rules_bad_set        #overrides whatever the "standard" list says, for good or bad.

    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    #  ACCUMULATED_MOST_FREQUENT_NOUNS
    #-------------------------------------------------------------------------
    global accumulation_of_most_frequent_nouns_is_active
    global accumulation_of_most_frequent_nouns_is_paused


    #==================================================================================================
    frequency_dict = {}    #~ ('heroes', 296)        296 = total count of that specific word in the top 2500 words that were output by collections.Counter()
    #==================================================================================================

    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    #===============================================================================
    last_noun_kept = ""
    most_common_list = []
    words_that_have_been_changed_dict = {}  #   k = new word  v = old word
    words_that_have_been_changed_counts_dict = {}  # k = new word  v = count of the old word taken over by the new word
    words_that_have_been_changed_occurences_dict = {}  # k = new word  v = +1 every time a change event creates the same k
    #==================
    #==================

    for row in common_list:  #already sorted from most common to least common by collections.counter

        word,word_count = row

        if not word:
            continue   # in this for-loop, 'continue' means 'discard this word' that is in common_list.
        if not word_count:
            continue
        if word_count == 0:
            continue

        word = word.strip()

        word_count = enf_as_unicode(word_count)
        word_count = word_count.strip()
        word_count = int(word_count)  #used by frequency_dict as v; word is k.

        word_original = word
        word = word.replace("fuck","f**k")  #always gut these words, even if they are in the "always keep, good word" list.
        word = word.replace("shit","s**t")
        word = word.replace("cunt","c**t")
        word = word.replace("bitch","bi*ch")
        word = word.replace("slut","sl*t")

        if word_original != word:
            words_that_have_been_changed_dict[word] = unicode_type(word_original)
            words_that_have_been_changed_counts_dict[word] = int(word_count)
            words_that_have_been_changed_occurences_dict[word] = int(1)      # this is the very first time this could have been added, so summing is not necessary here

        if word in english_obscenities_set:  # none of these words are in the standard good words list...
            n = len(word)
            if n < 5:  #will be discarded if not in the user custom good words list to always keep...
                if word in user_custom_word_rules_good_set :  # which of course at this point includes all of the exploded k,v values from the user custom plural pairs and change pairs...
                    pass
                else:
                    continue
            if n > 19:  #will be discarded if not in the user custom good words list to always keep...
                if word in user_custom_word_rules_good_set :        # which of course at this point includes all of the exploded k,v values from the user custom plural pairs and change pairs...
                    pass
                else:
                    continue
            word_original = word
            filler = "***************************"
            filler = filler[0:n-3]      # len(pussy) = 5 so filler = [0:3]
            word = word[0:1] + filler + word[-1: ]   # p***y
            word = word.strip()

            if word_original != word:
                try:
                    current_occurences = words_that_have_been_changed_occurences_dict[word]
                except:
                    current_occurences = 0
                if current_occurences > 0:  # previously added another occurence of this identical word
                    current_count = words_that_have_been_changed_counts_dict[word]
                    words_that_have_been_changed_counts_dict[word] = int(current_count + word_count)   # summarize
                else:
                    words_that_have_been_changed_counts_dict[word] = 0
                words_that_have_been_changed_occurences_dict[word] = int(current_occurences + 1)
                words_that_have_been_changed_dict[word] = word_original
        else:
            pass

        #---------------------------------------------------------
        #---------------------------------------------------------
        #  CHANGE PAIR RULES:  STEP #3 OF 4
        #---------------------------------------------------------
        #---------------------------------------------------------
        #~ global merged_change_pairs_dict  # merged in such a way that conflicts are 'won' by the user custom word rules change pairs dict, not the standard dict...
        was_changed_by_pair_rules = False
        word = unicode_type(word)             # job rule: all dicts use only unicode or int .  never bytestrings.
        word_original = word
        try:
            if word in merged_change_pairs_dict:
                word = merged_change_pairs_dict[word]  # e.g. datum:data               # job rule: all dicts use only unicode or int .  never bytestrings.
                was_changed_by_pair_rules = True
            if was_changed_by_pair_rules:  # because word was in merged_change_pairs_dict above
                if word in words_that_have_been_changed_occurences_dict:
                    current_occurences = words_that_have_been_changed_occurences_dict[word]
                else:
                    current_occurences = 0
                if current_occurences > 0:  # previously added another occurence of this identical word
                    current_count = words_that_have_been_changed_counts_dict[word]
                    words_that_have_been_changed_counts_dict[word] = int(current_count + word_count)   # summarize
                else:
                    words_that_have_been_changed_counts_dict[word] = 0
                words_that_have_been_changed_occurences_dict[word] = int(current_occurences + 1)
                words_that_have_been_changed_dict[word] = word_original
            else:
                pass
        except:
            pass

        if word in user_custom_word_rules_bad_set:       # e.g  f**king             # job rule: all dicts use only unicode or int .  never bytestrings.
            if word in user_custom_word_rules_good_set:
                pass
            else:
                continue

        if not was_changed_by_pair_rules:
            #-------------------------------------------------------------------------
            #-------------------------------------------------------------------------
            if len(word) < 5:   # Average word length for the English language is  ~4.8 letters, so *most* things less than 5 are boring, highly common and not worth having as a tag and so forth.
                if word > " ":
                    if word in english_nouns_to_keep_set:   # sex, dog, cat, bird, army, salt, data, etc.
                        if ( (not word in user_custom_word_rules_bad_set) or (word in user_custom_word_rules_good_set) ):             #the custom 'good' set wins over the custom 'bad' set if there is a conflict.
                            pass  # most_common_list.append(word)
                        else:
                            continue
                    else:
                        if word in user_custom_word_rules_good_set:         # the user's custom word sets override the standard sets, and the custom 'good' set wins over the custom 'bad' set if there is a conflict.
                            pass # most_common_list.append(word)
                        else:
                            continue
                else:
                    continue
            else:
                pass

            if len(word) > 19:
                if word in user_custom_word_rules_good_set:         # the user's custom word sets override the standard sets, and the custom 'good' set wins over the custom 'bad' set if there is a conflict.
                    pass
                else:
                    if word in english_nouns_to_keep_set:   #found per http://norvig.com/mayzner.html  and also found by searching the web for "english longest words"
                        if (not word in user_custom_word_rules_bad_set) :      #the custom 'bad' set wins over the standard 'good' set
                            #~ most_common_list.append(word)    see below
                            pass
                        else:
                            continue
                    else:
                        continue   #highly likely a phrase of concatenated words with no spaces arising as an artifact from the conversion to ASCII text from a .......wait for it....... pdf.
            #-------------------------------------------------------------------------
            #-------------------------------------------------------------------------
        else:
            pass

        most_common_list.append(word)  #after weeding out fluff    NOT a bytestring here

        last_noun_kept = word

        #-------------------------------------------------------------------------
        #-------------------------------------------------------------------------
        #-------------------------------------------------------------------------
        #-------------------------------------------------------------------------
        #***********************************************************************************************************
        # 'frequency_dict' is the final source for Comments, Tags, and Custom Column values for each book
        #***********************************************************************************************************
        #-------------------------------------------------------------------------
        if word_count > 0:
            word = word.strip()
            #---------------------
            # if word is already in frequency_dict, then they must be summed, but the sum must be of the 2 different sources of the same new_word.  otherwise, the count would double.
            #~ words_that_have_been_changed_dict[word] = word_original
            #~ words_that_have_been_changed_counts_dict[word] = word_count
            count_to_use = word_count
            for k,v in iteritems(words_that_have_been_changed_dict):  # [word] = word_original
                k = unicode_type(k)
                if word == k:  # that means that either (a) the source of the current word was a change event, or (b) a preexisting word that was never changed, but identical to the new_word from the change event
                    original_count = words_that_have_been_changed_counts_dict[word]  # this is the count belonging to the original word before it was changed by the change event
                    if word_count == original_count:  # this is identical to the output new_word from the change event
                        pass  # no reason to change count_to_use from the word_count of the current word in process.  it could be that the source of this word is not a change event, but the count happens to be identical
                    else:
                        # the current word is the same as a new_word created by changing an old word to the new_word, but the counts are different so this has an entirely separate source compared to the new_word
                        count_to_use = count_to_use + original_count  #summarize the counts of the 2 different sources of the identical word
                else:
                    continue
            #END FOR
            #---------------------
            word = unicode_type(word)
            frequency_dict[word] = count_to_use          #~ ('heroes', 296)        296 = total count of that specific word in the top 2500 words that were output by collections.Counter()
        #-------------------------------------------------------------------------
        #-------------------------------------------------------------------------
        #-------------------------------------------------------------------------
    #----------
    #END FOR
    if DEBUG: print("Finished Trimming and Accumulating Frequency Counts")
    if DEBUG: print("Finalizing the List of Most Frequent Words")
    #===============================================================================
    del common_list
    del words_that_have_been_changed_counts_dict
    del words_that_have_been_changed_occurences_dict
    #==================

    #-----------------------------------
    global n_accumulated_job_total_original_words
    global n_accumulated_job_total_net_words
    #----------
    n_accumulated_job_total_original_words = n_accumulated_job_total_original_words + num_start_actual
    #----------
    num_left = len(most_common_list)
    n_accumulated_job_total_net_words = n_accumulated_job_total_net_words + num_left
    #----------
    try:
        s1 = '{:,}'.format(num_start_actual)
        s2 = '{:.2%}'.format((num_left/num_start_actual))
        s3 = '{:,}'.format(num_left)
        msg = "Percentage of the  " + s1 + " words from the original text remaining after discarding all undesired English words: " + s2 + ", or: " + s3 + " net words"
        log(" ")
        log(msg)
        del msg
    except:
        pass

    del num_start_actual

    #-----------------------------------

    #***********************************************************************************************************
    # 'frequency_dict' is the final source for Comments, Tags, and Custom Column values for each book
    #***********************************************************************************************************
    #-------------------------------------------------------------------------
    # CHANGE frequency_dict TO REFLECT CHANGED WORDS FROM ABOVE
    #-------------------------------------------------------------------------
    #  words_that_have_been_changed_dict[new_word] = word_original
    #-------------------------------------------------------------------------
    if len(words_that_have_been_changed_dict) > 0:
        for k,v in iteritems(words_that_have_been_changed_dict):
            new_word = unicode_type(k)
            word_original = unicode_type(v)
            try:
                count = frequency_dict[word_original]
                frequency_dict[new_word] = int(count)
                frequency_dict.pop(word_original, None)
            except:
                continue
        #END FOR

    del words_that_have_been_changed_dict

    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    #  ACCUMULATED_MOST_FREQUENT_NOUNS: Job (not Book) Level
    #-------------------------------------------------------------------------
    #-------------------------------------------------------------------------
    global accumulated_most_frequent_nouns_dict
    # counts are accumulated for writing out to .csv and .tuples files

    n_total_nouns_kept_so_far = 0
    n_length_of_longest_word = 0

    for word in most_common_list:
        n_total_nouns_kept_so_far = n_total_nouns_kept_so_far + 1
        if n_total_nouns_kept_so_far > highest_number_of_nouns_to_keep:           # only accumulate as many as specified by the user; the maximum is 100 regardless.
            break
        word = unicode_type(word)
        n_word = frequency_dict.get(word)  # leave utf8, otherwise keys won't match...
        if n_word:
            current_word_frequency = n_word
        else:  # .get(word) returned None
            current_word_frequency = 0
        n_word = accumulated_most_frequent_nouns_dict.get(word)
        if n_word:
            current_job_word_accumulated_frequency  = n_word
        else:  # .get(word) returned None
            current_job_word_accumulated_frequency  = 0
        #----------
        new_total = current_word_frequency + current_job_word_accumulated_frequency
        #----------
        word = unicode_type(word)
        accumulated_most_frequent_nouns_dict[word] = new_total
        #----------
        n = len(word)
        if n > n_length_of_longest_word:
            n_length_of_longest_word = n
    #END FOR

    #------------------------------------------------------------------------------------------------------------------------------------------
    #------------------------------------------------------------------------------------------------------------------------------------------
    #  THE WORDS WITH COUNTS NOW IN frequency_dict COMPRISE THE 'FINAL' "most_common_list".
    #------------------------------------------------------------------------------------------------------------------------------------------
    #------------------------------------------------------------------------------------------------------------------------------------------

    del most_common_list
    most_common_list = []

    #***********************************************************************************************************
    # 'frequency_dict' is the final source for Comments, Tags, and Custom Column values for each book
    #***********************************************************************************************************
    mytuples = list([(k,v) for k, v in iteritems(frequency_dict)])

    del frequency_dict  # its work is now complete.

    mytuples_sorted = sorted(mytuples, key=lambda k: (-k[1], k[0].lower()))   # 0 = k; 1 = v

    log(" ")
    log("\t\t\t\t" + "-----------------------------------------------")
    log(" ")
    n_total_nouns_kept_so_far = 0
    for row in mytuples_sorted:   # sorted first by count descending, then by word ascending
        n_total_nouns_kept_so_far = n_total_nouns_kept_so_far + 1
        if n_total_nouns_kept_so_far > highest_number_of_nouns_to_keep:
            break
        else:
            k,v = row
            k = enf_as_unicode(k)
            lk = len(k)
            ls = n_length_of_longest_word - lk + 4
            if ls < 0:
                ls = 4
            s = "                                                                                                     "
            s = s[0:ls]
            log("\t\t\t\t\t" + k + s + " ---> " + enf_as_unicode(v) )
            most_common_list.append(enf_as_unicode(k))   # just the word
    #END FOR
    log(" ")
    log("\t\t\t\t" + "-----------------------------------------------")
    log(" ")
    del mytuples
    del mytuples_sorted

    #---------------------------------------------------------
    #---------------------------------------------------------
    #  Apply Change Pair Rules:  #4 of 4  CAPITALIZATION
    #---------------------------------------------------------
    #---------------------------------------------------------

    # the following sets were built from the merged change pair dicts processed in #1 thru #3 of 4
    global user_custom_word_rules_uppercase_set
    global user_custom_word_rules_titlecase_set
    global user_custom_word_rules_lowercase_set

    try:
        final_most_common_list[:] = []
    except:
        final_most_common_list = []

    if len(most_common_list) > 0:
        for item in most_common_list:
            word = unicode_type(item)
            if word in user_custom_word_rules_uppercase_set:
                word = word.upper()
                final_most_common_list.append(word)         #                   [ this is the list passed on for new Tags and Custom Column updates ]
                continue
            else:
                if word in user_custom_word_rules_titlecase_set:
                    word = word.title()
                    final_most_common_list.append(word)
                    continue
                else:
                    if word in user_custom_word_rules_lowercase_set:         # this is really a mechanism to nicely tell the user that their rules are conflicting and/or otherwise goofy
                        word = word.title()
                        final_most_common_list.append(word)
                        continue
                    else:
                        if word in acronyms_to_capitalize_set:
                            word = word.upper()
                            final_most_common_list.append(word)
                            continue
                        else:
                            if "*" in word:
                                s = word[0]
                                s = s.upper()
                                if len(word) > 1:
                                    x = word[1: ]
                                    x = x.lower()
                                else:
                                    x = ""
                                word = s + x
                                final_most_common_list.append(word)
                                del x
                                del s
                                continue
                            else:
                                word = word.title()
                                final_most_common_list.append(word)       #                 [ this is the list passed on for new Tags and Custom Column updates ]
                                continue
        #END FOR
        if DEBUG: print("Length of the final list of most common words: ", unicode_type(len(final_most_common_list)))
    else:
        pass

    del most_common_list

    #---------------------------------------------------------------------------------------------------------------------
    # Build the English-to-[OtherLanguage] Dictionary from the raw Final List of Most Common Nouns
    #---------------------------------------------------------------------------------------------------------------------
    global translate_english_to_other_is_active
    global translate_english_to_other_language
    global english_to_spanish_dict                            #previously supplemented by the user custom english to spanish dict
    global english_to_other_language_dict               #only used right here if language is not spanish (see above)

    spanish_dict_is_available = False
    other_dict_is_available = False

    if len(english_to_spanish_dict) > 0:
        spanish_dict_is_available = True        #previously supplemented by the user custom english to spanish dict if spanish were chosen instead of 'other language'
        other_dict_is_available = False
    else:
        if len(english_to_other_language_dict) > 0:
            spanish_dict_is_available = False
            other_dict_is_available = True

    if spanish_dict_is_available or other_dict_is_available:
        pass
    else:
        translate_english_to_other_is_active = False    # turn it off for this job since there are no target words available to translate English to...

    translation_dict = {}
    if translate_english_to_other_is_active:
        if not translate_english_to_other_language == "none":
            n_longest_word = 0
            for row in final_most_common_list:
                word = row
                word = word.lower()   # english_to_spanish_dict has all lower case words in both languages, and in unicode
                if len(word) > n_longest_word:
                    n_longest_word = len(word)
                try:
                    if spanish_dict_is_available:
                        if word in english_to_spanish_dict:     #both english and spanish words are in unicode
                            translation = english_to_spanish_dict[word]
                            translation_dict[word] = unicode_type(translation)
                            continue
                        else:
                            continue
                    else:
                        if other_dict_is_available:
                            if word in english_to_other_language_dict:
                                translation = english_to_other_language_dict[word]
                                translation_dict[word] = unicode_type(translation)
                                continue
                            else:
                                continue
                        else:
                            continue
                except:
                    continue
            #END FOR
        else:
            translate_english_to_other_is_active = False
    else:
        pass

    missing_translations = []

    #---------------------------------------------------------------------------------------------------------------------
    # Build the Comments HTML from the raw Final List of Most Common Nouns
    #---------------------------------------------------------------------------------------------------------------------
    if not translate_english_to_other_is_active:
        #no translation from English to another language
        new_comments = "<br><br><center><b>Most Frequent Words</b><br>"
        for word in final_most_common_list:
            if word in user_custom_word_rules_uppercase_set:
                word = word.upper()
            line = "                    " + word
            new_comments = new_comments +  line + "<br>"
        #END FOR
        new_comments = new_comments + "<b>_______________________</b></center><br><br>"

    else:
        #translate from English to the specified language
        new_comments = "<br><br><center><b>Most Frequent Words</b><br>"
        for word in final_most_common_list:
            translation = None
            if word in user_custom_word_rules_uppercase_set:
                word = word.upper()
            l_word = word.lower()  #  english_to_spanish_dict has all lower case words in both languages and all in unicode
            if l_word in translation_dict:
                translation  = translation_dict[l_word]
            else:
                translation = None
            if translation:
                s_padding = " ---> "
            else:
                s_padding = " "
                translation = " "
                missing_translations.append(word)
            line = "                    " + word + s_padding + translation
            if DEBUG: print("Metadata Comments: ", line)
            new_comments = new_comments +  line + "<br>"
        #END FOR
        new_comments = new_comments + "<b>_______________________</b></center><br><br>"
    #---------------------------------------------------------------------------------------------------------------------
    #---------------------------------------------------------------------------------------------------------------------
    del translation_dict

    if DEBUG: print("Finished the Finalizing of the List of Most Frequent Words")

    if len(missing_translations) > 0:
        if DEBUG: print("There are missing translations for the following English nouns: ")
        for row in missing_translations:
            if DEBUG: print(row)
        #END FOR
        del missing_translations

    return new_comments,final_most_common_list,errors
#-------------------------------------------------------------------------------------------------------------------------------------
def condense_text(text_data,log):
    """Condense filtered book text into the list of words kept for frequency counting.

    This is where 'the rubber meets the road'.  Input words are changed based on the
    Change Pair Rules, bad words are deleted, known plurals are replaced by their
    singulars, and the remaining words are returned for counting.

    text_data:  the text to condense; converted with enf_as_unicode() when it is not unicode.
                NOTE(review): the ' word ' matching below presumes lower-case words
                separated by spaces - confirm against the caller.
    log:        callable that receives the summary lines written by this function.

    Returns a list of words in their original order (duplicates are kept).
    """

    global english_words_to_delete_set
    global global_first_names_set                   # always deleted unless the user chose not to delete them for this specific job
    global english_nouns_to_keep_set                # which at this point also contains the exploded k, v values from the standard plurals dict.  ditto for the standard change pairs dict.
    global user_custom_word_rules_good_set          # which at this point also contains the exploded k, v values from user_custom_plurals_dict.  ditto for the user change pairs dict.
    global user_custom_word_rules_bad_set
    global user_custom_singulars_dict
    global english_singulars_dict
    global merged_change_pairs_dict                 # both custom and standard were merged such that the custom rules superseded any standard rule for the same "word_in"

    def _is_protected(candidate):
        # user's custom good words beat everything else; the standard nouns to keep beat the standard bad words
        return candidate in user_custom_word_rules_good_set or candidate in english_nouns_to_keep_set

    if DEBUG: print("Condensing text")

    if not isinstance(text_data,unicode_type):
        text_data = enf_as_unicode(text_data)

    #---------------------------------------------------------
    #  Apply Change Pair Rules:  #1 of 4
    #---------------------------------------------------------
    if DEBUG: print("Applying Change Pair Rules:  #1 of 4")

    # performed first thing here so any 'bad' word that has a change rule to a 'good' word is applied before 'bad' words are discarded.
    # The words are matched literally (str.replace) rather than being compiled as regular expressions, so a rule
    # containing a regex metacharacter or a backslash can neither raise nor match something unintended.
    for k,v in iteritems(merged_change_pairs_dict):
        whole_word_in = " " + enf_as_unicode(k) + " "              #match whole words only, not substrings (e.g.  ' have ' or ' having ' but not 'hav' )
        whole_word_out = "     " + enf_as_unicode(v) + "     "     #keep spaces between words for now
        text_data = text_data.replace(whole_word_in,whole_word_out)
    #END FOR

    #-------------------------------------------------------------------------------------------------------------------------------------
    # Pass #1 of 2...   bad words are blanked out directly in the text
    #-------------------------------------------------------------------------------------------------------------------------------------
    if DEBUG: print("Pass #1 of 2:  Changing Bad Words to Spaces")

    blanked_word = "        "

    for word in user_custom_word_rules_bad_set:   # these words beat the standard good words...
        if word in user_custom_word_rules_good_set:      # user's custom good words beat the user's custom bad words...
            continue
        text_data = text_data.replace(" " + word.strip() + " ",blanked_word)
    #END FOR

    for word in english_words_to_delete_set:
        if _is_protected(word):
            continue
        text_data = text_data.replace(" " + word.strip() + " ",blanked_word)
    #END FOR

    if DEBUG: print("re.sub  -  miscellany plus |  ")

    text_data = re.sub("[ ][m][r][ ][a-z]+[ ]|[ ][m][r][s][ ][a-z]+[ ]|[ ][m][s][ ][a-z]+[ ]|[ ][m][i][s][s][ ][a-z]+[ ]","      ",text_data)    #  Mr Corde   Mrs Smith      Ms Jones  Miss Jones

    # NOTE(review): unlike the patterns that follow, this one is not bounded by spaces, so it also
    # removes the tail of longer words that merely contain 'un...able' - confirm that this is intended.
    text_data = re.sub("[u][n][a-z]+[a][b][l][e]","      ",text_data)               #  unknowable             and so forth are deleted here

    text_data = re.sub("[ ][u][n][a-z]+[e][d][ ]","      ",text_data)                #   unrecorded undesired

    text_data = re.sub("[ ][u][n][a-z]+[i][n][g][ ]","      ",text_data)             #   unassuming

    text_data = re.sub("[ ][n][o][n][a-z]+[i][n][g][ ]","      ",text_data)         #   nonthreatening

    # turn the text into a '|' delimited string of words
    text_data = re.sub("[ ][ ]+", " ",text_data)   # compress multiple spaces to a single space
    text_data = text_data.replace(" ","|")
    text_data = "|" + text_data + "|"
    text_data = re.sub('[|][|]+', '|',text_data)

    #-------------------------------------------------------------------------------------------------------------------------------------
    # Pass #2 of 2...   bad words are identified in, and then removed from, the list of words
    #-------------------------------------------------------------------------------------------------------------------------------------
    if DEBUG: print("Pass #2 of 2:  Changing Bad Words to Spaces - prep ASCII list of current words")

    text_word_list_1 = []
    for token in text_data.split("|"):
        token = token.strip()
        if token.isalpha():
            text_word_list_1.append(token.lower())
    #END FOR
    del text_data

    search_set = set(text_word_list_1)   # the distinct words; every pass below only needs to look at each word once
    words_to_delete_set = set()

    #----------
    if DEBUG: print("Pass #1 of 3:  Changing Bad Words to Spaces - words < 5 letters and not in any good words set")
    words_to_delete_set.update(word for word in search_set if len(word) < 5 and not _is_protected(word))

    #----------
    if DEBUG: print("Pass #2 of 3:  Changing Bad Words to Spaces - identifying words per custom bad words set")
    for word in user_custom_word_rules_bad_set:
        if word in search_set and word not in user_custom_word_rules_good_set:      # user's custom good words beat the user custom bad words...
            words_to_delete_set.add(word)
    #END FOR

    #----------
    if DEBUG: print("Pass #3 of 3:  Changing Bad Words to Spaces - identifying words per standard bad words set")
    for word in english_words_to_delete_set:
        if word in search_set and not _is_protected(word):
            words_to_delete_set.add(word)
    #END FOR

    #----------
    if DEBUG: print("Pass #1 of 2:  Identifying English Names to Change to Spaces")
    for word in global_first_names_set:
        word = word.strip()
        if word in search_set and not _is_protected(word):
            words_to_delete_set.add(word)
    #END FOR

    #----------
    if DEBUG: print("Pass #1 of 1:  Identifying Specific Suffixes of Adjectives & Adverbs")
    adjective_adverb_suffixes = ('less','ally','adelic','adic','ish','ly','ous')
    words_to_delete_set.update(word for word in search_set if word.endswith(adjective_adverb_suffixes) and not _is_protected(word))

    #----------
    if DEBUG: print("Pass #1 of 1:  Identifying All '......ed' verb forms plus all  '......ing' forms that are NOT deverbal nouns (and are already in the standard good list) to Change to Spaces")
    # gerunds (bad) and deverbal nouns (good); the deverbal nouns are protected by the standard nouns to keep       # looked gazed vomited
    words_to_delete_set.update(word for word in search_set if word.endswith(('ing','ed')) and not _is_protected(word))

    del search_set

    #----------
    if DEBUG: print("Changing Previously Identified Words to Spaces")

    log(" ")
    log(" ")
    s = '{:,}'.format(len(words_to_delete_set))
    log("Number of verb forms, adjectives and adverbs (not nouns or deverbal nouns) that were deleted based upon their English suffixes: " + enf_as_unicode(s) )
    log(" ")

    text_word_list_2 = [word for word in text_word_list_1 if word not in words_to_delete_set]
    del text_word_list_1
    del words_to_delete_set

    #---------------------------------------------------------
    # Change Plural to Singular - Pass #1 of 2  [#1: before user custom change rules could possibly create more plurals.     #2: afterwards, in Analyze.  Also, #2 is much more comprehensive. ]
    #---------------------------------------------------------
    # In both dicts the key is the plural and the value is its singular; a word that is not a key is
    # already singular (or its singular equals its plural) and is kept unchanged.

    if DEBUG: print("# first custom dict pass at changing plurals to their singulars")
    text_word_list_3 = [user_custom_singulars_dict.get(word,word) for word in text_word_list_2]
    del text_word_list_2

    if DEBUG: print("# first standard dict pass at changing plurals to their singulars")
    condensed_text_word_list = [english_singulars_dict.get(word,word) for word in text_word_list_3]
    del text_word_list_3

    if DEBUG: print("finished with the first pass for plurals")
    if DEBUG: print("Finished condense_text")

    #--------------
    return condensed_text_word_list
#-------------------------------------------------------------------------------------------------------------------------------------
def filter_text(file_data,log,notifications,my_html_stripper):
    """Reduce raw book content to lower-case a-z words, each run of spaces normalized to three spaces.

    file_data:         iterable of text fragments; they are joined into a single string.
    log:               callable used to report a parsing failure.
    notifications:     not used by this function.
    my_html_stripper:  object handed to strip_html_tags() to remove the html markup.

    Returns the filtered text.  If one of the guarded replacement stages raises, the fixed
    "Parsing Errors..." message is logged and returned in place of the text.
    """

    global english_contractions_to_transform_list     # a list of all English standard contractions     can't, we'll, they've, you're, etc.
    global english_failed_contractions_list

    parse_error_text = "Parsing Errors; cannot proceed with this book...Run in DEBUG for more information"
    six_spaces = "      "

    def _abort(stage,exc):
        # common failure path of every guarded stage: debug detail, log entry, and the message as the result
        if DEBUG: print(stage + ": exception: ", enf_as_unicode(exc))
        log(parse_error_text)
        return parse_error_text

    if DEBUG: print("Filtering text")

    if DEBUG: print("Length of the input, file_data: ", unicode_type(len(file_data)))

    text_data = "".join(file_data)

    del file_data

    if DEBUG: print("Length of text_data prior to re.sub's : ", unicode_type(len(text_data)))

    # carriage returns, line feeds, tabs, vertical tabs
    try:
        text_data = re.sub(r'[\r\n\t\v]',six_spaces,text_data)
    except Exception as e:
        return _abort("# carriage returns, line feeds, tabs, vertical tabs",e)

    # delimiters that confuse patterns:   [ ] { } ( )
    # (raw string: '\[' and '\]' are invalid escape sequences in an ordinary string literal)
    try:
        text_data = re.sub(r'[\[\]{}()]',six_spaces,text_data)
    except Exception as e:
        return _abort("# delimiters that confuse patterns",e)

    if DEBUG: print("Length of text_data after re.sub's : ", unicode_type(len(text_data)))

    # double apostrophes
    try:
        text_data = text_data.replace('"',"'")                              #change all double apostrophes to a single apostrophe
    except Exception as e:
        return _abort("# double apostrophes",e)

    # fancy apostrophes & quotation marks - change to simple single apostrophe; every other character is lower-cased
    change_chars = unicode_type('"') + unicode_type("‚") + unicode_type("„")
    text_data = "".join("'" if char in change_chars else char.lower() for char in text_data)

    if DEBUG: print("Length of text_data after fancy apostrophes & quotation marks replacement to a simple single quote: ", unicode_type(len(text_data)))

    #contractions
    for row in english_contractions_to_transform_list:
        # contractions delimited by a space, as is standard in English grammar.
        text_data = re.sub(" " + row + " ",six_spaces,text_data)
        # contractions delimited by double apostrophes (changed to single just above), as in a Python list of contractions:  ["shan't","must'nt","mightn't","aren't"]
        text_data = re.sub("'" + row + "'",six_spaces,text_data)
    #END FOR

    if DEBUG: print("Length of text_data after contraction filtering : ", unicode_type(len(text_data)))

    # text_data was built by "".join() and re.sub() on unicode, so no bytestring conversion is needed here.

    #apostrophes due to file_data original text, not contractions.   example of scenario that must be supported:  original text =   ['a', 'aaron', 'abandon', 'abandoned', 'abbey']    lists and dicts make great test data.      ditto for random webpage source code saved as utf8 .txt files.
    try:
        text_data = text_data.replace("'","     '      ")                 #change all single apostrophes to a spaced single apostrophe to avoid    arah ally  instead of  sarah sally  as in:  'sarah','sally'    [ & possessive   's   case ]
        text_data = text_data.replace(",","     ,      ")                 #change all commas to a spaced comma
        text_data = text_data.replace(" '","        ")                    #change "space plus apostrophe"  to spaces         leading apostrophe
        text_data = text_data.replace("' ","        ")                    #change "apostrophe plus space" to spaces          trailing apostrophe
        text_data = text_data.replace('"',"          ")                   #change " to spaces (remaining   "   )
        text_data = text_data.replace("'","          ")                   #change ' to spaces (remaining   '  )         possessive   's   -  the letter   s   and all other single letters are 'bad' words and will be deleted
    except Exception as e:
        return _abort("#apostrophes due to file_data original text, not contractions.",e)

    if DEBUG: print("Length of text_data after apostrophes and quotes are replaced : ", unicode_type(len(text_data)))

    #html
    text_data = strip_html_tags(my_html_stripper,text_data)
    if DEBUG: print("Length of text_data after html was stripped : ", unicode_type(len(text_data)))

    #--------------------------------------------------------------
    # Remove anything that is not a lower case letter (a-z) or a space.  the text was already set to lower case just above.
    #--------------------------------------------------------------
    if not isinstance(text_data,unicode_type):
        text_data = enf_as_unicode(text_data)
    text_data = re.sub("[^ a-z]","   ",text_data)     # every other character (including newlines) becomes three spaces

    if DEBUG: print("Length of text_data after symbols were stripped: ", unicode_type(len(text_data)))

    # failed contractions because their fancy apostrophes were Unicode and could not be converted , yielding words such as:   theyve  youll  cant   etc.
    for row in english_failed_contractions_list:
        # replace the word ' youll ' or ' cant ' or ' well' or ' theyve '  or ' ill '    etc.    deleting a very few valid short nouns is unavoidable at this point due to the stated reason.
        text_data = re.sub(" " + row + " ",six_spaces,text_data)
    #END FOR

    if DEBUG: print("Length of text_data after failed contractions are stripped: ", unicode_type(len(text_data)))

    #last
    text_data = re.sub("[ ]+","   ",text_data)   # every run of spaces becomes exactly 3 spaces to avoid possible issues in the condense method

    if DEBUG: print("Length of text_data after contiguous spaces have been reduced to three (3) before and after each remaining word: ", unicode_type(len(text_data)))

    #--------------------------------------------------------------
    return text_data
#-------------------------------------------------------------------------------------------------------------------------------------
def load_book_file(log,notifications,path,format_to_use):
    """Load a book by handing it to the loader that matches format_to_use.

    "TXT", "EPUB" and "PDF" are dispatched to load_text_file(), load_epub_file() and
    load_pdf_file() respectively.  Returns the (file_data, errors) pair of that loader;
    for any other format both values are empty strings.
    """

    if DEBUG: print("Loading book file: ", path)

    if format_to_use == "TXT":
        loader = load_text_file
    elif format_to_use == "EPUB":
        loader = load_epub_file
    elif format_to_use == "PDF":
        loader = load_pdf_file
    else:
        # no loader exists for this format
        return "", ""

    file_data,errors = loader(log,notifications,path)
    return file_data,errors
#-------------------------------------------------------------------------------------------------------------------------------------
def load_text_file(log,notifications,path):
    """Read a TXT book and return (file_data, errors).

    path:  location of the text file; os.sep is replaced by "/" before it is opened.

    file_data is the list of lines of the file, or the string "NOTHING" when the file
    could not be read; in that case errors holds the text of the exception, otherwise
    it is an empty string.  log and notifications are not used here.
    """

    if DEBUG: print("Loading text file: ", path)

    file_data = "NOTHING"
    errors = ""

    path = path.replace(os.sep,"/")
    path = enf_as_unicode(path)

    try:
        # Decode explicitly as UTF-8 instead of relying on the locale's default encoding, and replace
        # undecodable bytes rather than failing, so a single odd byte does not cost the entire book.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            file_data = f.readlines()
    except Exception as e:
        errors = enf_as_unicode(e)

    return file_data,errors
#-------------------------------------------------------------------------------------------------------------------------------------
def load_epub_file(log,notifications,path):
    """Return the (file_data, errors) pair produced by extract_epub_text() for the epub at path.

    os.sep in path is replaced by "/" and the result is passed through enf_as_unicode()
    before the extraction.  notifications is not used here.
    """

    if DEBUG: print("Loading epub file: ", path)

    epub_path = enf_as_unicode(path.replace(os.sep,"/"))

    file_data,errors = extract_epub_text(epub_path,log)

    return file_data,errors
#-------------------------------------------------------------------------------------------------------------------------------------
def load_pdf_file(log,notifications,path):
    """Return the (file_data, errors) pair produced by extract_pdf_text() for the pdf at path.

    os.sep in path is replaced by "/" and the result is passed through enf_as_unicode()
    before the extraction.  notifications is not used here.
    """

    if DEBUG: print("Loading pdf file: ", path)

    pdf_path = enf_as_unicode(path.replace(os.sep,"/"))

    file_data,errors = extract_pdf_text(pdf_path,log)

    return file_data,errors
#-------------------------------------------------------------------------------------------------------------------------------------
def build_book_path(my_cursor,my_db,log,notifications,current_book,library_path):
    """Build the full path of the book file to read for current_book.

    The directory comes from books.path, the file name and format from table data; the first
    row (ordered by format, descending) whose format is in SUPPORTED_BOOK_FORMATS is used.

    Returns (full_book_path, format_to_use, errors).  When no books row exists the path and the
    format are empty strings and errors is "NO BOOKS.PATH FOUND".  my_db, log and notifications
    are not used here.
    """
    #   "S:\Calibre\QS\QuarantineAndScrub_Test6\Bella Andre\I Love How You Love Me_ The Sulliva (3295)\I Love How You Love Me_ The Sul - Bella Andre.txt"
    #   path from table books:   Bella Andre/I Love How You Love Me_ The Sulliva (3295)
    #   format and book file name from data:   "id","3295","TXT","5452","I Love How You Love Me_ The Sul - Bella Andre"

    if DEBUG: print("Building book path")

    book_directory = ""
    format_to_use = ""
    name = ""
    errors = ""

    my_cursor.execute('SELECT path FROM books WHERE id = ?',[current_book])
    book_rows = my_cursor.fetchall()
    if not book_rows:
        return "","", "NO BOOKS.PATH FOUND"
    for book_row in book_rows:
        for col in book_row:
            book_directory = col      # the last value read wins

    #so TXT is first if it exists for this book
    my_cursor.execute('SELECT format,name FROM data WHERE book = ? ORDER BY format DESC',[current_book])
    format_rows = my_cursor.fetchall()
    if not format_rows:
        errors = "NO DATA.FORMAT FOUND"
    else:
        for book_format,name in format_rows:
            if book_format in SUPPORTED_BOOK_FORMATS:
                format_to_use = book_format
                break

    # NOTE(review): when no supported format was found, format_to_use is still "" and a path
    # ending in "." is built from the last name read (if any), with errors left unchanged - confirm the caller copes with that.
    file_name = name + "." + format_to_use.lower()

    full_book_path = os.path.join(library_path,os.path.join(book_directory,file_name))

    #   "S:/Calibre/QS/QuarantineAndScrub_Test8/Bella Andre/I Love How You Love Me_ The Sulliva (3295)/I Love How You Love Me_ The Sul - Bella Andre.txt"
    full_book_path = full_book_path.replace(os.sep,"/")

    return full_book_path,format_to_use, errors
#-------------------------------------------------------------------------------------------------------------------------------------
def strip_html_tags(my_html_stripper_,html_):
    """Feed markup through the supplied stripper object and return what it collected.

    my_html_stripper_: object exposing ``feed(text)`` and ``get_data()``.
    html_: markup to strip; it is passed through ``enf_as_unicode`` before feeding.
    Returns whatever ``my_html_stripper_.get_data()`` returns after the feed.
    """
    markup_text = enf_as_unicode(html_)
    my_html_stripper_.feed(markup_text)
    return my_html_stripper_.get_data()
#-------------------------------------------------------------------------------------------------------------------------------------
def extract_epub_text(epub_path,log):
    """Collect the text content of every "*html" member of an epub (zip) file.

    epub_path: path of the epub file to open.
    log: accepted for signature compatibility; not used here.

    Returns ``(my_list, errors)``:
      my_list -- ``list(ENFHTMLGetContent().content)`` after all html members were fed
                 to it in natural (embedded-digit) file-name order; empty when the
                 archive has no html members or could not be opened.
      errors  -- "" on success, otherwise the text of the exception that stopped processing.

    A failure while reading or feeding a single member is only printed in DEBUG
    mode; the remaining members are still processed.
    """

    if DEBUG: print("Extracting epub text: ", epub_path)

    errors = ""
    my_list = []
    file_handle = None

    try:
        file_handle = ZipFile(epub_path)
        # "html" also matches ".xhtml" members.
        html_list = [file_handle.getinfo(name) for name in file_handle.namelist() if name.endswith("html")]
        if DEBUG: print("number of html files within epub zip: ", as_unicode(len(html_list)))
        if html_list:
            my_html_content = ENFHTMLGetContent()
            for html_ in sort_with_embedded_digits(html_list):
                try:
                    data = enf_as_unicode(file_handle.read(html_))
                    my_html_content.feed(data)
                except Exception as e:
                    # best effort: one unreadable member must not lose the rest of the book
                    if DEBUG: print("epub html extraction error: ", as_unicode(e))
            #END FOR
            my_list = list(my_html_content.content)
    except Exception as e:
        errors = enf_as_unicode(e)
    finally:
        # Close the archive explicitly instead of relying on garbage collection.
        if file_handle is not None:
            try:
                file_handle.close()
            except Exception as e:
                if DEBUG: print("epub close error: ", as_unicode(e))

    return my_list, errors
#-------------------------------------------------------------------------------------------------------------------------------------
def extract_pdf_text(pdf_path,log):
    """Convert a PDF to HTML and return its short, lower-cased lines.

    pdf_path: path of the PDF file; handed to ``enf_pdftohtml``, whose result is
              used as the directory that contains ``index.html``.
    log: accepted for signature compatibility; not used here.

    Returns ``(pdf_html, errors)``:
      pdf_html -- list of text lines from index.html, lower-cased, with HTML
                  character entities replaced by a space; only lines shorter than
                  51 characters are kept.
      errors   -- "" on success, otherwise "Extraction of PDF HTML Failed: <reason>"
                  (pdf_html is then empty).
    """

    if DEBUG: print("Extracting pdf text: ", pdf_path)

    errors = ""
    pdf_html = []

    try:
        html_dir = enf_pdftohtml(pdf_path)
        html_path = os.path.join(html_dir, 'index.html')
        with open(html_path, 'rb') as f:
            pdf_html_ugly = f.readlines()
    except Exception as e:
        errors = "Extraction of PDF HTML Failed: " + enf_as_unicode(e)
        if DEBUG: print(errors)
        return pdf_html,errors

    # Best-effort removal of the intermediate file.  (os.path.isfile, not the
    # non-existent os.isfile, so the file really is deleted.)
    try:
        if os.path.isfile(html_path):
            os.remove(html_path)
    except OSError:
        pass

    re_numeric_entity = re.compile("[&][#][0-9]+[;]")
    re_named_entity = re.compile("[&][a-z][a-z]*[a-z][;]")

    for row in pdf_html_ugly:
        # The file was read in binary mode: decode the bytes rather than taking
        # their repr (str(b'x') gives "b'x'" on Python 3).
        s = enf_as_unicode(row).lower()
        s = re_numeric_entity.sub(" ",s)
        s = re_named_entity.sub(" ",s)
        s = s.replace("  "," ")
        if len(s) < 51:  #we want ASCII words, not phrases with no spaces due to pdf trash
            pdf_html.append(s)
    #END FOR

    return pdf_html,errors
#-------------------------------------------------------------------------------------------------------------------------------------
re_digits = re.compile(r'(\d+)')    # raw string: '\d' is not a valid str escape
def embedded_digits(html_list):
    """Split a file name into alternating text / integer parts (a natural-sort key).

    html_list: despite its name, a single string such as 'OEBPS/Text/part0.html'.
    Returns a list whose even positions are text and whose odd positions are the
    digit runs converted to int, e.g. ['OEBPS/Text/part', 0, '.html'].  A name
    without digits comes back as a one-element list.
    """
    html_parts = re_digits.split(html_list)
    # the capturing group puts every digit run at an odd index
    html_parts[1::2] = [int(digits) for digits in html_parts[1::2]]
    return html_parts
#-------------------------------------------------------------------------------------------------------------------------------------
def sort_with_embedded_digits(zipinfo_list):
    """Return the entries ordered naturally by file name (part2 before part10).

    zipinfo_list: iterable of objects with a ``filename`` attribute.
    Returns a new list; the input is left untouched.

    Sorting by key (rather than sorting (key, entry) tuples) keeps entries with
    identical names in their original order instead of falling through to a
    comparison of the entries themselves, which raises TypeError on Python 3.
    """
    return sorted(zipinfo_list, key=lambda zipinfo: embedded_digits(zipinfo.filename))
#-------------------------------------------------------------------------------------------------------------------------------------
def build_tags_dict(my_cursor,my_db,log,notifications):
    """Map every tag name in table ``tags`` to its id.

    my_cursor: database cursor whose ``execute(sql)`` result can be iterated for rows.
    my_db, log, notifications: accepted for signature compatibility; not used here.

    Returns a dict {name: id}.  Names are lower-cased when they support
    ``.lower()``; any other value (for example NULL) is used as the key unchanged.
    When two names collide after lower-casing, the row read last wins.
    """
    tags_dict = {}

    for tag_id, name in my_cursor.execute("select id,name from tags"):
        try:
            name = name.lower()
        except AttributeError:
            pass    # not a string: keep the raw value as the key
        tags_dict[name] = tag_id

    return tags_dict
#-------------------------------------------------------------------------------------------------------------------------------------
def build_custom_column_list(my_cursor,my_db,log,notifications):
    """Return every row of table ``custom_columns`` as a list.

    Each element is one row of (id, label, datatype, is_multiple, normalized),
    exactly as produced by the cursor.  Only ``my_cursor`` is used; the other
    parameters are accepted but ignored.
    """
    if DEBUG: print("Building custom column list")
    query = "select id,label,datatype,is_multiple,normalized from custom_columns"
    return [column_row for column_row in my_cursor.execute(query)]
#-------------------------------------------------------------------------------------------------------------------------------------
def get_english_common_words_to_delete(log,notifications):
    """Fetch the standard words-to-delete set, store it module-wide and return it.

    The result of ``get_set_of_words_to_delete()`` is bound to the module global
    ``english_words_to_delete_set``; that same object is returned.
    Neither parameter is used.
    """
    global english_words_to_delete_set

    english_words_to_delete_set = loaded_words = get_set_of_words_to_delete()

    return loaded_words
#-------------------------------------------------------------------------------------------------------------------------------------
def synchronize_all_user_and_standard_word_rule_sets(log):
    """Make the module-level word sets mutually consistent, highest priority first.

    Priority (descending):
        user good words > user bad words > standard nouns to keep >
        standard first names > standard words to delete.
    A word present in a higher-priority set is removed from the lower-priority
    sets it conflicts with.  All sets are modified in place.

    Two runtime parameters from ``my_param_dict`` (string values) are honoured:
      'REMOVE_TOP_100_NOUNS'      == "True"  -> the top-100 nouns (singular and
                                    plural) are moved to the delete set; otherwise
                                    they are moved to the keep set.
      'REMOVE_GLOBAL_FIRST_NAMES' == "False" -> the first-names set is emptied
                                    after synchronization.

    log: callable taking one message string.
    Returns None.
    """
    # the various sets need to be synchronized based upon each priority regarding the rest of them, else chaos.
    # as of this point, almost all of the 'indirectly good words' (i.e., the plural pairs and the change pairs, both user and standard) have been added dynamically to their appropriate 'good' set.

    #  the priority of use of these sets is shown below in descending priority order:
    global user_custom_word_rules_good_set
    global user_custom_word_rules_bad_set
    global english_nouns_to_keep_set
    global global_first_names_set
    global english_words_to_delete_set

    global my_param_dict
    global acronyms_to_capitalize_set

    #------------------------------------------------------
    # Finalize sets based on job runtime parameter
    #------------------------------------------------------
    # NOTE: every entry needs its own comma and no stray whitespace; adjacent
    # string literals are silently concatenated into a single non-word.
    top_100_nouns_set = {
        # singular
        'time', 'issue', 'year', 'side', 'person', 'kind', 'way', 'head', 'day', 'house',
        'man', 'service', 'thing', 'friend', 'woman', 'father', 'life', 'power', 'child', 'hour',
        'world', 'game', 'school', 'line', 'state', 'end', 'family', 'member', 'student', 'law',
        'group', 'car', 'country', 'city', 'problem', 'community', 'hand', 'name', 'part', 'president',
        'place', 'team', 'case', 'minute', 'week', 'idea', 'company', 'kid', 'system', 'body',
        'program', 'information', 'question', 'back', 'work', 'parent', 'government', 'face', 'number', 'other',
        'night', 'level', 'mr', 'office', 'point', 'door', 'home', 'health', 'water', 'room',
        'art', 'mother', 'war', 'area', 'history', 'money', 'party', 'storey', 'story', 'result',
        'fact', 'change', 'month', 'morning', 'lot', 'reason', 'right', 'research', 'study', 'girl',
        'book', 'guy', 'eye', 'food', 'job', 'moment', 'word', 'air', 'business', 'teacher',
        # plural
        'times', 'issues', 'years', 'sides', 'people', 'persons', 'kinds', 'ways', 'heads', 'days',
        'houses', 'men', 'services', 'things', 'friends', 'women', 'fathers', 'lives', 'powers', 'children',
        'hours', 'worlds', 'games', 'schools', 'lines', 'states', 'ends', 'families', 'members', 'students',
        'laws', 'groups', 'cars', 'countries', 'cities', 'problems', 'communities', 'hands', 'names', 'parts',
        'presidents', 'places', 'teams', 'cases', 'minutes', 'weeks', 'ideas', 'companies', 'kids', 'systems',
        'bodies', 'programs', 'questions', 'backs', 'works', 'parents', 'governments', 'faces', 'numbers', 'others',
        'nights', 'levels', 'misters', 'offices', 'points', 'doors', 'homes', 'healths', 'waters', 'rooms',
        'arts', 'mothers', 'wars', 'areas', 'histories', 'monies', 'parties', 'stories', 'storeys', 'results',
        'facts', 'changes', 'months', 'mornings', 'lots', 'reasons', 'rights', 'researches', 'studies', 'girls',
        'books', 'guys', 'eyes', 'foods', 'jobs', 'moments', 'words', 'airs', 'businesses', 'teachers',
    }

    if my_param_dict['REMOVE_TOP_100_NOUNS'] == "True":
        english_nouns_to_keep_set.difference_update(top_100_nouns_set)
        english_words_to_delete_set.update(top_100_nouns_set)
    else:
        english_nouns_to_keep_set.update(top_100_nouns_set)
        english_words_to_delete_set.difference_update(top_100_nouns_set)

    #-----------------------------------------------
    # final 'indirectly good words' to add:
    english_nouns_to_keep_set.update(word.strip().lower() for word in acronyms_to_capitalize_set)
    #-----------------------------------------------

    # Priority [1]: user good words beat everything else.
    user_custom_word_rules_bad_set.difference_update(user_custom_word_rules_good_set)
    global_first_names_set.difference_update(user_custom_word_rules_good_set)
    english_words_to_delete_set.difference_update(user_custom_word_rules_good_set)

    # Priority [2]: user bad words beat the standard nouns to keep.
    english_nouns_to_keep_set.difference_update(user_custom_word_rules_bad_set)

    # Priority [3]: standard nouns to keep beat first names and words to delete.
    global_first_names_set.difference_update(english_nouns_to_keep_set)
    english_words_to_delete_set.difference_update(english_nouns_to_keep_set)

    # Priority [4]: first names beat words to delete.
    english_words_to_delete_set.difference_update(global_first_names_set)

    #------------------------------------------------------
    # Finalize set based on job runtime parameter
    #------------------------------------------------------
    if my_param_dict['REMOVE_GLOBAL_FIRST_NAMES'] == "False":
        global_first_names_set.clear()  # this is done AFTER the Priority [4] logic just above, because the user said to NOT delete first names, implicitly including any in the 'bad' list...

    #------------------------------------------------------------------------------------------------

    log(" ")
    log(" ")
    log("Lists have been synchronized by 'Priority':  Custom User Good Words > Custom User Bad Words > Standard Good Words > Standard First (Bad) Names > Standard Bad Words.")
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
def load_user_custom_word_rules_for_use(log):
    """Run the user custom word rule loaders in their required order.

    First the protected data file paths are built, then the good words, the bad
    words and the change word pairs are loaded.  ``log`` is handed to every step.
    """

    if DEBUG: print("Loading user custom word rules for use")

    # the input file formats must match the user dialog input and output used to manually customize them.
    loading_steps = (
        build_protected_data_files_full_paths,
        load_good_words_file,
        load_bad_words_file,
        load_change_word_pairs,
    )
    for run_step in loading_steps:
        run_step(log)
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
def load_user_custom_plurals_file(log):
    """Reload the user's custom singular:plural pairs into the module-level dicts.

    File format (pipe separated pairs, any number of lines):
        daltonst:daltonsts|mouse:mice|one:many|sheep:sheep|

    ``user_custom_plurals_dict`` receives singular -> plural and
    ``user_custom_singulars_dict`` the inverse mapping, so the user only has to
    maintain one direction.  Both halves of a pair are stripped and lower-cased;
    entries that are not exactly ``text:text`` (no colon, more than one colon,
    or an empty half) are ignored.

    Both dicts are emptied before anything else, so they are empty when the file
    is missing or cannot be read.

    log: callable taking one message string.
    Returns None.
    """

    global user_custom_word_rules_plurals_pairs_full_path
    global user_custom_plurals_dict
    global user_custom_singulars_dict

    # Empty the existing dicts in place when both are usable; otherwise (not yet
    # defined, or not dicts) start over with fresh ones.
    try:
        both_usable = isinstance(user_custom_plurals_dict, dict) and isinstance(user_custom_singulars_dict, dict)
    except NameError:
        both_usable = False
    if both_usable:
        user_custom_plurals_dict.clear()
        user_custom_singulars_dict.clear()
    else:
        user_custom_plurals_dict = {}
        user_custom_singulars_dict = {}

    if user_custom_word_rules_plurals_pairs_full_path == "unknown":
        build_protected_data_files_full_paths(log)

    if not os.path.exists(user_custom_word_rules_plurals_pairs_full_path):
        log("The 'user custom plurals file' does not currently exist.")
        log(" ")
        return

    try:
        with open(user_custom_word_rules_plurals_pairs_full_path, 'r') as f:      # data format is:    daltonst:daltonsts|mouse:mice|one:many|sheep:sheep|
            lines = f.readlines()
    except Exception as e:
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        log("ERROR[0]: User singular:plural dictionary could not be loaded to use due to this reason:  " + enf_as_unicode(e))
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        return

    for line in lines:
        # Splitting on "|" already copes with a missing trailing pipe and with
        # doubled pipes: the empty pieces simply fail the checks below.
        for pair in line.split("|"):
            halves = pair.split(":")
            if len(halves) != 2:
                continue
            singular = halves[0].strip().lower()
            plural = halves[1].strip().lower()
            if singular and plural:
                user_custom_plurals_dict[singular] = plural
        #END FOR
    #END FOR

    s = '{:,}'.format(len(user_custom_plurals_dict))
    log("Number of 'User custom singular:plural pairs' loaded from the Calibre Plugin Directory:    " + s)
    log(" ")

    #create the other user dict by inverting the one just loaded. the user only has to maintain one direction....if that.
    for singular, plural in user_custom_plurals_dict.items():
        user_custom_singulars_dict[plural] = singular
    #END FOR
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
def load_good_words_file(log):
    """Load the user's custom 'good' words into ``user_custom_word_rules_good_set``.

    File format (pipe separated words, any number of lines):
        keepme|keepyou|keephim|

    Every word has its spaces removed, is stripped and lower-cased; empty pieces
    are dropped.  On success the module global is rebound to a new set holding
    the words.  When the file is missing or cannot be read, a message is logged
    and the global is left as it was.

    log: callable taking one message string.
    Returns None.
    """

    global user_custom_word_rules_good_words_full_path
    global user_custom_word_rules_good_set

    if not os.path.exists(user_custom_word_rules_good_words_full_path):
        log("The 'user custom word rules for good words' file does not exist.")
        return

    try:
        with open(user_custom_word_rules_good_words_full_path, 'r') as f:              # file format is:      keepme|keepyou|keephim|
            lines = f.readlines()
    except Exception as e:
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        log("ERROR[0]: User good words could not be loaded to use for this reason:  " + enf_as_unicode(e))
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        return

    good_words = set()
    for line in lines:
        # Splitting on "|" already copes with a missing trailing pipe and with
        # doubled pipes: the empty pieces are rejected below.
        for piece in line.split("|"):
            word = piece.replace(" ","").strip().lower()
            if word > " ":      # rejects "" (and anything sorting at or before a space)
                good_words.add(word)
        #END FOR
    #END FOR

    user_custom_word_rules_good_set = good_words

    s = '{:,}'.format(len(user_custom_word_rules_good_set))
    log("Number of 'User custom good words' loaded from the Calibre Plugin Directory:   " + s)
    log(" ")
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
def load_bad_words_file(log):
    """Load the user's custom 'bad' words into ``user_custom_word_rules_bad_set``.

    File format (pipe separated words, any number of lines):
        deleteme|deleteyou|deletehim|

    Every word has its spaces removed, is stripped and lower-cased; empty pieces
    are dropped.  On success the module global is rebound to a new set holding
    the words.  When the file is missing or cannot be read, a message is logged
    and the global is left as it was.

    log: callable taking one message string.
    Returns None.
    """

    global user_custom_word_rules_bad_words_full_path
    global user_custom_word_rules_bad_set

    if not os.path.exists(user_custom_word_rules_bad_words_full_path):
        log("The 'user custom word rules for bad words' file does not exist.")
        return

    try:
        with open(user_custom_word_rules_bad_words_full_path, 'r') as f:       # data format is:  deleteme|deleteyou|deletehim|
            lines = f.readlines()
    except Exception as e:
        # The file exists at this point (checked above), so report the real
        # reason instead of claiming that it does not exist.
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        log("ERROR[0]: User bad words could not be loaded to use for this reason:  " + enf_as_unicode(e))
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        return

    bad_words = set()
    for line in lines:
        # Splitting on "|" already copes with a missing trailing pipe and with
        # doubled pipes: the empty pieces are rejected below.
        for piece in line.split("|"):
            word = piece.replace(" ","").strip().lower()
            if word > " ":      # rejects "" (and anything sorting at or before a space)
                bad_words.add(word)
        #END FOR
    #END FOR

    user_custom_word_rules_bad_set = bad_words

    s = '{:,}'.format(len(user_custom_word_rules_bad_set))
    log("Number of 'User custom bad words' loaded from the Calibre Plugin Directory:    " + s)
    log(" ")
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
def load_change_word_pairs(log):

    global user_custom_word_rules_change_pairs_full_path
    global user_custom_word_rules_change_pairs_dict

    user_custom_word_rules_change_pairs_dict.clear()


    global user_custom_word_rules_titlecase_set
    global user_custom_word_rules_lowercase_set

    if user_custom_word_rules_change_pairs_full_path == "unknown":
        build_protected_data_files_full_paths(log)

    if not os.path.exists(user_custom_word_rules_change_pairs_full_path):
        log("The 'user custom word change pairs file' does not currently exist, so none of those rules can be used at this time.")
        log(" ")
        return

    saved_user_change_pairs_text = []
    try:
        with open(user_custom_word_rules_change_pairs_full_path, 'r') as f:      # data format is:    datum:data|evolution:darwinslaw|hogan:HOGANSHEROES|slut:tramp|
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                line = line.replace("\r"," ")
                line = line.replace("\n"," ")
                line = line.replace("\t"," ")
                if line[0] == enf_as_unicode(":") :
                    line = line[1: ]
                saved_user_change_pairs_text.append(line)
            f.close()
    except Exception as e:
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        log("ERROR[1]: User custom word change pairs dictionary could not be loaded to use due to this reason:  " + enf_as_unicode(e))
        log(" -----------------------------------------------------------------------------------------------------------------------------")
        return

    try:
        del f
        del lines
    except:
        pass

    try:
        tmp_list = []
        for line in saved_user_change_pairs_text:
            if line:
                pass
            else:
                continue
            if line[0] == enf_as_unicode(":") :
                line = line[1: ]
            if not "|" in line:
                line = line + "|"
            line.replace("|||","|")
            line.replace("||","|")
            s_split = line.split("|")   # example:
            for word in s_split:
                if word:
                    word = word.replace("|", "")
                    word = word.strip()
                    if word.count(":") > 1:
                        word = word.replace(":", "",1)
                        word = word.strip()
                    if word.count(":") > 0:         # example:      datum:data
                        word = word.strip()
                        if len(word) > 2:
                            word = enf_as_unicode(word)
                            tmp_list.append(word)
                        else:
                            continue
                    else:
                        continue
                else:
                    continue
            #END FOR
        #END FOR

        del saved_user_change_pairs_text

        if tmp_list:
            n_total_change_pairs_loaded = len(tmp_list)
        else:
            n_total_change_pairs_loaded = 0
            tmp_list = []

        try:
            for row in tmp_list:
                row = enf_as_unicode(row)
                s_split = row.split(":")
                if s_split:
                    if not len(s_split) == 2:
                        continue
                else:
                    continue
                #-----------------
                s0 = s_split[0]
                s1 = s_split[1]
                #-----------------
                if s0:
                    pass
                else:
                    continue
                if s1:
                    pass
                else:
                    continue
                #-----------------
                s0 = s_split[0]
                s1 = s_split[1]
                #-----------------
                s0 = enf_as_unicode(s0.strip())
                s1 = enf_as_unicode(s1.strip())
                if s0.isalpha() and s1.isalpha():
                    pass
                else:
                    continue
                #-----------------
                k0 = enf_as_unicode(s0)
                v0 = enf_as_unicode(s1)
                k0 = enf_as_unicode(k0)
                v0 = enf_as_unicode(v0)
                k0 = enf_as_unicode(k0.lower())
                v0 = enf_as_unicode(v0.lower())

                if enf_as_unicode(k0) != enf_as_unicode(v0):  #for changing word1 into word2 only, since both k0 and v0 were set to all lower case for this comparison of equality.
                    user_custom_word_rules_change_pairs_dict[enf_as_unicode(k0)] = enf_as_unicode(v0)      #    word1 >>>> word2, both being lower case until later logic using the sets built below changes word2

                #-----------------
                k1 = enf_as_unicode(s0)      # aids   | nato   | NATO |  FYI  |            word1 |   word1     |    word1      |   word1  |  word1   |  WORD1  |  WORD1  |  Word1  |  Word1  |
                v1 = enf_as_unicode(s1)      # AIDS  | Nato  | Nato   |  fyi   |            word2 |   Word2    |    WORD2   |    Word1 |  WORD1 |  word1   |   Word1    |  word1  |   WORD1  |     only v1 matters below. user cannot say "if k1 == word1, then v1 will be upper, else title". a single word must have the identical case regardless of how it got there.
                if enf_as_unicode(k1) == enf_as_unicode(v1):  # taking into account lower vs. upper vs. title case in this instance
                    pass   # why did the user even add this pair?
                else:
                    # so:  k1 != v1, which may be (a) simply a change in case, or (b) may be a word change and a case change simultaneously.  k0 above took care of word1 changing into word2.  k1 is all about v1's case afterwards.
                    if v1.isupper():      # WORD
                        user_custom_word_rules_uppercase_set.add(enf_as_unicode(v1.lower()))   # add the lowercase version to this set...
                    else:
                        if  v1.islower():  #  word
                            user_custom_word_rules_lowercase_set.add(enf_as_unicode(v1.lower()))
                        else:                   # Word  (or wOrd or worD, which will be changed to Word regardless of what the user specified. no camelbacks. )
                            user_custom_word_rules_titlecase_set.add(enf_as_unicode(v1.lower()))
                #-----------------
                del s_split
            #END FOR

            del tmp_list

            n = len(user_custom_word_rules_change_pairs_dict)
            if n:
                try:
                    if n >= 0:
                        pass
                    else:
                        user_custom_word_rules_change_pairs_dict = dict([])  #make it a valid dict, even if empty...
                except:
                    user_custom_word_rules_change_pairs_dict = dict([])  #make it a valid dict, even if empty...
            else:
                user_custom_word_rules_change_pairs_dict = dict([])  #make it a valid dict, even if empty...

            if not user_custom_word_rules_change_pairs_dict:
                user_custom_word_rules_change_pairs_dict = dict([])  #make it a valid dict, even if empty...
                s = enf_as_unicode("sheep")
                user_custom_word_rules_change_pairs_dict[s] = s
                log("The 'user custom word change pairs' that were loaded, if any, have been lost.")
        except Exception as e:
            log(" -----------------------------------------------------------------------------------------------------------------------------")
            log("ERROR[2A]: User custom change words dictionary could not be loaded to use due to this reason:  " + enf_as_unicode(e))
            log(" ")
            return
    except Exception as e:
        log(" -----------------------------------------------------------------------------------------------------------------------------------------")
        log("ERROR[2]: User custom change words dictionary could not be loaded to use due to this reason:  " + enf_as_unicode(e))
        log(" -----------------------------------------------------------------------------------------------------------------------------------------")
        return



    try:
        log(" ")

        f = enf_as_unicode("{:>0,g}")

        n1 = n_total_change_pairs_loaded
        s = f.format(n1)
        log("Number of 'User custom word change pairs' loaded from the Calibre Plugin Directory:                                " + enf_as_unicode(s) )
        log(" ")
        n2 = len(user_custom_word_rules_uppercase_set)
        s = f.format(n2)
        log("Number of 'User custom word change pairs' that force a word to all upper case after counting is complete:          " + enf_as_unicode(s))
        log(" ")
        n3 = len(user_custom_word_rules_titlecase_set)
        s = f.format(n3)
        log("Number of 'User custom word change pairs' that force a word to title case after counting is complete:              " + enf_as_unicode(s) )
        log(" ")
        n4 = len(user_custom_word_rules_lowercase_set)
        s = f.format(n4)
        log("Number of 'User custom word change pairs' that will be Defaulted:                                                  " + enf_as_unicode(s) )
        log(" ")
        n5 = n1 - n2 - n3 - n4
        s = f.format(n5)
        if n5 > 0:
            log("Number of 'User custom word change pairs' that simply change one word into another word:                           " + enf_as_unicode(s) )
        log(" ")
        log("Default:  Any 'Most Frequent Noun' that does not have a specific rule to force it to all upper case will be titlecased.")
        log(" ")
        log(" ")
        log(" ")
    except Exception as e:
        log(" -----------------------------------------------------------------------------------------------------------------------------------------")
        log("ERROR[3]: Logging the counts for User Custom Word Change Pairs was interrupted for this reason: " + enf_as_unicode(e))
        log(" -----------------------------------------------------------------------------------------------------------------------------------------")
        return
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
def build_protected_data_files_full_paths(log):
    """Derive the protected data directory and the full paths of the files kept in it.

    The directory is computed from the module-level ``my_plugin_path`` by replacing
    the plugin zip file name component ("English Noun Frequency.zip", preceded by
    either a backslash or a forward slash) with "/enf_files".  The directory is
    created if it does not exist yet.

    Nothing is returned; the results are published through these module-level
    globals, all using "/" as the path separator:

        protected_data_directory
        accumulated_most_frequent_nouns_tuples_file_full_path
        user_custom_word_rules_plurals_pairs_full_path
        user_custom_word_rules_change_pairs_full_path
        user_custom_word_rules_good_words_full_path
        user_custom_word_rules_bad_words_full_path

    ``log`` is accepted for interface compatibility; this function does not use it.
    Any OSError raised while creating the directory propagates to the caller.
    """

    global accumulated_most_frequent_nouns_tuples_file_full_path         # "C:\Users\DaltonST\AppData\Roaming\calibre\plugins\enf_files\accumulated_most_frequent_nouns.tuples

    global user_custom_word_rules_plurals_pairs_full_path           # "C:\Users\DaltonST\AppData\Roaming\calibre\plugins\enf_files\user_singular_plural_pairs.string"
    global user_custom_word_rules_change_pairs_full_path          # "C:\Users\DaltonST\AppData\Roaming\calibre\plugins\enf_files\user_change_pairs.string"

    global user_custom_word_rules_good_words_full_path           # "C:\Users\DaltonST\AppData\Roaming\calibre\plugins\enf_files\user_good_words.string"
    global user_custom_word_rules_bad_words_full_path             # "C:\Users\DaltonST\AppData\Roaming\calibre\plugins\enf_files\user_bad_words.string"

    global my_plugin_path

    global protected_data_directory

    def _data_file_full_path(directory, file_name):
        # Full path of one protected data file, with separators normalized to "/".
        full_path = os.path.join(directory, enf_as_unicode(file_name))
        return full_path.replace(os.sep, "/")

    # protected directory to use
    # The backslash is escaped explicitly: "\E" is not a valid escape sequence and
    # newer Python versions warn about it.  The runtime value of the literal is unchanged.
    protected_data_path = my_plugin_path.replace("\\English Noun Frequency.zip", "/enf_files")
    protected_data_path = protected_data_path.replace("/English Noun Frequency.zip", "/enf_files")
    protected_data_path = protected_data_path.replace(os.sep, '/')
    protected_data_directory = enf_as_unicode(protected_data_path)

    if not os.path.exists(protected_data_directory):   # failsafe only, since enf_dialog does this first in order to extract .html files and to save the user custom word files (see below) that this program reads in
        # exist_ok=True: do not fail if the directory gets created between the check above and this call.
        os.makedirs(protected_data_directory, exist_ok=True)

    # accumulated_most_frequent_nouns.tuples
    accumulated_most_frequent_nouns_tuples_file_full_path = _data_file_full_path(
        protected_data_directory, ACCUMULATED_MOST_FREQUENT_NOUNS_TUPLES_FILENAME)

    # singular:plural pairs file
    user_custom_word_rules_plurals_pairs_full_path = _data_file_full_path(
        protected_data_directory, USER_CUSTOM_WORD_RULES_SINGULAR_PLURAL_PAIRS_FILENAME)

    # change pairs file
    user_custom_word_rules_change_pairs_full_path = _data_file_full_path(
        protected_data_directory, USER_CUSTOM_WORD_RULES_CHANGE_PAIRS_FILENAME)

    # good words file
    user_custom_word_rules_good_words_full_path = _data_file_full_path(
        protected_data_directory, USER_CUSTOM_WORD_RULES_GOOD_WORDS_FILENAME)

    # bad words file
    user_custom_word_rules_bad_words_full_path = _data_file_full_path(
        protected_data_directory, USER_CUSTOM_WORD_RULES_BAD_WORDS_FILENAME)
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
#END of enf_main.py


