# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__   = 'GPL v3'
__copyright__ = '2015,2016,2017,2018,2019,2020  DaltonST <DaltonShiTzu@outlook.com>'
__my_version__ = "1.0.82"   #Python 3 regression errors fixed

import os, sys, apsw, csv, datetime, re

from calibre import isbytestring
from calibre.constants import filesystem_encoding
from calibre.utils.logging import Log

from polyglot.builtins import as_unicode, unicode_type
from polyglot.queue import Queue

#---------------------------------------------------------------------------------------------------
from calibre_plugins.multi_column_search.heading import log_heading_common
from calibre_plugins.multi_column_search.config import prefs
#---------------------------------------------------------------------------------------------------

#~ Module-level placeholders; neither is referenced again in the visible part of this module.
mynothing = ""
custom_column_id = as_unicode(0)

#~ Read by mcs_trim_word_book_index() to decide whether to post the 'Almost Finished' notification.
#~ Nothing in the visible part of this module ever sets it to True.
my_terminate_early = False

#~ Module-level defaults.  The functions in this module receive their own `log` and
#~ `notifications` arguments, which shadow these names inside those functions.
log = Log()
notifications = Queue()

#~ Heading lines handed to log_heading_common(); s1-s3 are filled in by mcs_trim_word_book_index().
header_s1 = header_s2 = header_s3 = header_s4 = header_s5 = None

#---------------------------------------------------------------------------------------------------------------------------------------------
def mcs_trim_word_book_index(self,my_guidb,trim_type,selected_books_list,log=None, abort=None, notifications=True):
    """Job entry point: trim the MCS word-by-book index stored in the library's metadata.db.

    Opens metadata.db with APSW, logs a heading, delegates the actual trimming to
    mcs_trim_index_word_by_book_control(), refreshes the stored record count and
    always closes the connection again, even when a trimming step raises.

    Parameters:
        self                 -- unused here.
        my_guidb             -- object exposing `library_path` (text or bytes).
        trim_type            -- passed through to mcs_trim_index_word_by_book_control().
        selected_books_list  -- passed through to mcs_trim_index_word_by_book_control().
        log                  -- callable taking one message string.  The default of None is
                                not usable: the caller must supply a real logger.
        abort                -- unused here.
        notifications        -- queue-like object with put((fraction, message)).  The default
                                of True is not usable: the caller must supply a real queue.

    Returns None.  If the database cannot be opened the error is logged and the function
    returns without doing anything else.
    """
    #----------------------------------------------------------------------------------------------------------------
    global header_s1, header_s2, header_s3, header_s4, header_s5
    global my_terminate_early
    #----------------------------------------------------------------------------------------------------------------

    notifications.put((0.01, 'MCS Trimming Index Words-by-Book'))
    log(' ')

    path = my_guidb.library_path
    if isbytestring(path):
        path = path.decode(filesystem_encoding)
    #~ Normalise to forward slashes both before and after joining the file name.
    path = path.replace(os.sep, '/')
    path = os.path.join(path, 'metadata.db')
    path = path.replace(os.sep, '/')

    log(path)

    try:
        my_db = apsw.Connection(path)
    except Exception as e:
        log(as_unicode(e))
        return

    try:
        my_cursor = my_db.cursor()

        mysql = "PRAGMA main.busy_timeout = 10000;"      #PRAGMA busy_timeout = milliseconds;
        my_cursor.execute(mysql)

        #----------------------------------------------------------------------------------------------------------------
        header_s1 =  as_unicode("SQLite Version: " + as_unicode(apsw.SQLITE_VERSION_NUMBER) + "    [APSW]")
        header_s2 = mysql
        header_s3 = "Beginning 'Trimming Index Words-by-Book' "
        log_heading_common(log,header_s1,header_s2,header_s3,header_s4,header_s5)
        #----------------------------------------------------------------------------------------------------------------

        log(" ")

        mcs_trim_index_word_by_book_control(my_db,my_cursor,log,notifications,trim_type,selected_books_list)

        if not my_terminate_early:
            notifications.put((0.99, 'Almost Finished.  Performing Housekeeping'))

        count_records_in_index(my_db,my_cursor)
    finally:
        #~ Release the connection (and its file locks) even if a trimming step failed.
        my_db.close()

    log(' ')
    log(' ')
    log("You should now defragment/vacuum this library's metadata.db by invoking Calibre menu path: Library > Library Maintenance > Check Library.")
    log(' ')
    log(' ')
    log('MCS Word-by-Book Index Trimming is Complete. ')
    log(' ')
    log(' ')
#---------------------------------------------------------------------------------------------------------------------------------------------
def _log_index_record_count(my_db,my_cursor,log):
    """Refresh the stored record count of the index and write it to the log."""
    count_records_in_index(my_db,my_cursor)
    n =  prefs['WORD_INDEX_LATEST_RECORD_COUNT']
    log("")
    log("Number of records currently in the index: " + as_unicode(n))
#---------------------------------------------------------------------------------------------------------------------------------------------
def mcs_trim_index_word_by_book_control(my_db,my_cursor,log,notifications,trim_type,selected_books_list):
    """Run the trimming steps selected by `trim_type`.

    Index rows of books that no longer exist are always removed first.  Then:
      * "advanced" -- each optional step whose preference equals the text "True" is run,
                      in a fixed order, followed by a fresh record count in the log.
      * "basic"    -- perform_basic_trim() is run for `selected_books_list`.
      * otherwise  -- a message is logged and nothing else happens.

    `trim_type` is matched case-insensitively and ignoring surrounding whitespace.  The
    previous code accepted only the exact spellings "advanced" and "Basic", so a caller
    using one consistent capitalisation could never reach one of the two branches.

    Parameters:
        my_db, my_cursor     -- open APSW connection and a cursor on it.
        log                  -- callable taking one message string.
        notifications        -- queue-like object with put((fraction, message)).
        trim_type            -- "advanced" or "basic" (any capitalisation).
        selected_books_list  -- book ids; only used by the basic trim.
    """
    _log_index_record_count(my_db,my_cursor,log)
    log("")

    remove_deleted_books_from_index(my_db,my_cursor,log)

    trim_kind = as_unicode(trim_type or "").strip().lower()

    if trim_kind == "advanced":
        #~ (preference key, step) pairs, executed in this order.
        advanced_steps = (
            ('WORD_INDEX_TRIM_ENGLISH_TOP_100', remove_english_top_100_nouns),
            ('WORD_INDEX_TRIM_ENGLISH_BAD_LIST', remove_english_bad_words),
            ('WORD_INDEX_TRIM_HAN', remove_han_words),
            ('WORD_INDEX_TRIM_CSV', remove_csv_words_control),
        )
        for pref_key, trim_step in advanced_steps:
            #~ These preferences are compared as text, not as booleans.
            if prefs[pref_key] == unicode_type("True"):
                trim_step(my_db,my_cursor,log,notifications)
                _log_index_record_count(my_db,my_cursor,log)
    elif trim_kind == "basic":
        perform_basic_trim(my_db,my_cursor,log,notifications,selected_books_list)
    else:
        log("Unknown 'Type of Trim'; nothing to be done.")
#---------------------------------------------------------------------------------------------------------------------------------------------
def remove_english_top_100_nouns(my_db,my_cursor,log,notifications):
    """Delete every word returned by return_english_top_100_noun_list() from the index.

    All deletions run inside one transaction.  Progress notifications are posted at
    0.01 before and 0.25 after the deletions.
    """
    from calibre_plugins.multi_column_search.english_top_100_words_to_delete import return_english_top_100_noun_list

    nouns_to_drop = return_english_top_100_noun_list()

    notifications.put((0.01, 'MCS Trimming Top 100 English Nouns'))

    delete_sql = "DELETE FROM _mcs_word_book_index WHERE word = ?"
    my_cursor.execute("begin")
    for noun in nouns_to_drop:
        my_cursor.execute(delete_sql, [noun])
    my_cursor.execute("commit")

    for line in ("--------------------------------------------------------------",
                 "",
                 "Top 100 English Nouns removed from the index.",
                 " "):
        log(line)

    notifications.put((0.25, 'Top 100 English Nouns removed from the index; Continuing...'))
#---------------------------------------------------------------------------------------------------------------------------------------------
def remove_english_bad_words(my_db,my_cursor,log,notifications):
    """Delete every word returned by create_english_words_to_delete() from the index.

    Deletions are committed in batches of 1000 statements.  When the word collection is
    empty the function returns right after logging the count, without posting a
    notification.  A failure of the final commit is logged instead of being silently
    discarded.

    Parameters:
        my_db, my_cursor  -- open APSW connection and a cursor on it.
        log               -- callable taking one message string.
        notifications     -- queue-like object with put((fraction, message)).
    """
    from calibre_plugins.multi_column_search.english_bad_words_to_delete import create_english_words_to_delete
    english_words_to_delete_set = create_english_words_to_delete()

    n_total_english_words_to_delete = len(english_words_to_delete_set)

    log("--------------------------------------------------------------")
    log(" ")
    log("Number of English non-nouns to be removed from the index if they exist: " + as_unicode(n_total_english_words_to_delete))
    log(" ")

    if n_total_english_words_to_delete == 0:
        return

    mysql = "DELETE FROM _mcs_word_book_index WHERE word = ?"
    my_cursor.execute("begin")
    i = 0
    for word in english_words_to_delete_set:
        my_cursor.execute(mysql,([word]))
        i = i + 1
        if i >= 1000:
            #~ Keep individual transactions small; a new one is opened immediately.
            my_cursor.execute("commit")
            my_cursor.execute("begin")
            i = 0
    #END FOR
    try:
        #~ A transaction is always open here, so a failure is a real error worth reporting.
        my_cursor.execute("commit")
    except Exception as e:
        log("Final commit while removing English Non-Nouns failed: " + as_unicode(e))
    log("English Non-Nouns, if any, have been removed from the index")

    notifications.put((0.50, 'English Non-Nouns, if any, have been removed from the index; Continuing...'))
#---------------------------------------------------------------------------------------------------------------------------------------------
def remove_han_words(my_db,my_cursor,log,notifications):
    """Delete index words whose first character lies in the Han / Kana / Hangul-Jamo-Extended ranges.

    The previous SQL compared `char(word)` with `char(n)`.  SQLite's char() builds a string
    *from integer code points*, so a non-numeric word was coerced to 0 and never matched,
    while a purely numeric word such as "20000" was treated as code point 20000 and deleted.
    SQLite's unicode() returns the code point of the first character of a string, which is
    what the range test needs.

    The ranges below are the union of the ranges used before; overlapping and adjacent
    ones are merged.  Deleting per range over the whole table is equivalent to the former
    per-book loop, because every book holding an in-range word was in that loop.

    NOTE(review): the Hangul Syllables block (U+AC00-U+D7AF) is not part of these ranges,
    exactly as before -- confirm whether ordinary Korean words should be trimmed as well.

    Parameters:
        my_db, my_cursor  -- open connection and a cursor on it.
        log               -- callable taking one message string.
        notifications     -- queue-like object with put((fraction, message)).

    Returns None.  Returns early, without a notification, when no book has a word at or
    above the lowest range.  If a DELETE fails the transaction is rolled back and the
    exception propagates.
    """
    #~ Inclusive code point ranges.
    code_point_ranges = (
        (0x3040, 0x30FF),       #  (12352, 12543)  Hiragana 3040-309F + Katakana 30A0-30FF
        (0x4E00, 0x9FFF),       #  (19968, 40959)  Kanji / Han: 4E00-9FBF and 4E00-62FF ... 8D00-9FFF
        (0xA960, 0xA97F),       #  (43360, 43391)  Hangul Jamo Extended-A
        (0xD7B0, 0xD7FF),       #  (55216, 55295)  Hangul Jamo Extended-B
    )
    lowest_code_point = min(first for first, _last in code_point_ranges)     # bottom of the possible ranges

    log("--------------------------------------------------------------")
    log(" ")
    log("Removing Han (Chinese, related Japanese, old Korean, old Vietnamese) and Hangul (Korean) pictographs, if any.")
    log(" ")

    mysql = "SELECT DISTINCT book FROM _mcs_word_book_index WHERE unicode(word) >= ?"
    my_cursor.execute(mysql,(lowest_code_point,))
    book_rows = my_cursor.fetchall()
    if not book_rows:
        log("No Chinese (or related) or Korean pictographs were found to trim from the index")
        return
    log("Number of books with Chinese (or related) or Korean pictographs: {0}".format(len(book_rows)))
    log(" ")
    del book_rows

    mysql = "DELETE FROM _mcs_word_book_index WHERE unicode(word) >= ? AND unicode(word) <= ?  "
    my_cursor.execute("begin")
    committed = False
    try:
        for first, last in code_point_ranges:
            my_cursor.execute(mysql,(first,last))
        #END FOR
        my_cursor.execute("commit")
        committed = True
    finally:
        if not committed:
            #~ Do not leave a half-finished transaction open on the shared connection.
            try:
                my_cursor.execute("rollback")
            except Exception:
                pass

    notifications.put((0.75, 'MCS Completed Removing Pictographs; Continuing...'))
    log(" ")
    log("Chinese, Japanese, Old Korean, Old Vietnamese, and Korean words have been removed from the index.")
    log(" ")

    log("--------------------------------------------------------------")
#---------------------------------------------------------------------------------------------------------------------------------------------
def remove_csv_words_control(my_db,my_cursor,log,notifications):
    """Load the user's CSV word list and delete those words from the index.

    A separator line is logged first.  When upload_csv_file() yields no words a message
    is logged and nothing is deleted; otherwise remove_csv_words() runs and a closing
    separator line is logged.
    """
    separator = "--------------------------------------------------------------"
    log(separator)

    words_from_csv = upload_csv_file(my_db,my_cursor,log,notifications)
    if len(words_from_csv) > 0:
        remove_csv_words(my_db,my_cursor,log,notifications,words_from_csv)
        log(separator)
    else:
        log("CSV File was empty.  Nothing done.")
#---------------------------------------------------------------------------------------------------------------------------------------------
def upload_csv_file(my_db,my_cursor,log,notifications):
    """Read the first column of the CSV file named in the preferences.

    The path comes from prefs['WORD_INDEX_TRIM_CSV_CHOSEN_FILE_PATH'].  Any error while
    opening or parsing the file is logged; the words read up to that point are returned.

    Fixes over the previous version:
      * On Python 3 the csv module needs a text-mode file; the former 'rb' mode made
        every read fail, so the result was always empty.
      * Blank rows are skipped instead of raising IndexError and aborting the import.

    Parameters `my_db`, `my_cursor` and `notifications` are unused here.

    Returns a list with one entry per non-empty CSV row (possibly an empty list).
    """
    csv_word_list  = []

    csv_path = prefs['WORD_INDEX_TRIM_CSV_CHOSEN_FILE_PATH']

    log(" ")
    log("CSV File path: " + as_unicode(csv_path))

    if not csv_path:
        return csv_word_list

    try:
        if sys.version_info[0] >= 3:
            #~ newline='' is what the csv module requires for text files.  'utf-8-sig' also
            #~ drops a leading byte order mark.
            #~ NOTE(review): assumes the file is UTF-8 encoded; undecodable bytes are replaced.
            csvfile = open(csv_path, 'r', newline='', encoding='utf-8-sig', errors='replace')
        else:
            #~ The Python 2 csv module works on byte strings.
            csvfile = open(csv_path, 'rb')
        with csvfile:
            for row in csv.reader(csvfile,dialect='excel'):
                if row:
                    csv_word_list.append(row[0])
            #END FOR
    except Exception as e:
        log("CSV File Error: " + as_unicode(e))

    return csv_word_list
#---------------------------------------------------------------------------------------------------------------------------------------------
def remove_csv_words(my_db,my_cursor,log,notifications,csv_word_list):
    """Delete the given words from the index, committing every 100 deletions.

    Each distinct entry of `csv_word_list` is decoded from UTF-8 if it is a byte string,
    stripped of single and double quotes and surrounding whitespace, and then deleted
    from _mcs_word_book_index if anything is left.  A failure for one word is logged and
    the remaining words are still processed.

    Fixes over the previous version:
      * Progress was computed from an undefined name, raising NameError at the 100th
        word.  The transaction was then never re-opened and every later word logged an error.
      * The result of the UTF-8 decode was discarded; byte strings are now converted.
      * A failure of the final commit is logged instead of being silently discarded.

    Parameters:
        my_db, my_cursor  -- open connection and a cursor on it.
        log               -- callable taking one message string.
        notifications     -- queue-like object with put((fraction, message)).
        csv_word_list     -- iterable of words (text or UTF-8 bytes); duplicates are ignored.
    """
    unique_words = list(set(csv_word_list))
    n_total_csv_words_to_delete = len(unique_words)

    log(" ")
    log("Number of unique words in the specified CSV file to be removed from the index if they exist: {0}".format(n_total_csv_words_to_delete))
    log(" ")

    notifications.put((0.02, 'MCS Removing CSV Words'))

    mysql = "DELETE FROM _mcs_word_book_index WHERE word = ?"
    my_cursor.execute("begin")
    i = 0
    n_total = 0
    for word in unique_words:
        try:
            if not word:
                continue
            if isinstance(word,bytes):
                word = word.decode('utf-8', 'replace')
            word = word.replace('"',"").replace("'","").strip()
            if not word:
                continue
            my_cursor.execute(mysql,(word,))
            n_total = n_total + 1
            i = i + 1
            if i >= 100:
                my_cursor.execute("commit")
                my_cursor.execute("begin")
                i = 0
                #~ Capped below 1.0: the final commit has not happened yet.
                n_progress = min(0.99, float(n_total) / n_total_csv_words_to_delete)
                notifications.put((n_progress, 'MCS Removing CSV Words'))
        except Exception as e:
            log("CSV data error for word: " + as_unicode(word) + " with the reason: " + as_unicode(e) )
    #END FOR
    try:
        my_cursor.execute("commit")
    except Exception as e:
        log("Final commit while removing CSV words failed: " + as_unicode(e))

    log("Number of CSV words processed: {0}".format(n_total))
    log(" ")
    log("CSV words, if any, have been removed from the index")

    notifications.put((0.99, 'CSV words, if any, have been removed from the index'))
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
def remove_deleted_books_from_index(my_db,my_cursor,log):
    """Delete index rows whose book id no longer exists in the `books` table.

    The deletion runs in its own transaction.  This is best effort: a failure never
    propagates to the caller, but it is now logged and the transaction is rolled back
    instead of being silently committed.

    Parameters:
        my_db      -- unused here.
        my_cursor  -- cursor on the open connection.
        log        -- callable taking one message string.
    """
    mysql = "DELETE FROM _mcs_word_book_index WHERE book NOT IN(SELECT id FROM books WHERE id = _mcs_word_book_index.book)"
    transaction_started = False
    try:
        my_cursor.execute("begin")
        transaction_started = True
        my_cursor.execute(mysql)
        my_cursor.execute("commit")
    except Exception as e:
        log("Deleted Books could not be removed from the Index: " + repr(e))
        #~ Only undo a transaction that this function itself opened.
        if transaction_started:
            try:
                my_cursor.execute("rollback")
            except Exception:
                pass
        return
    log("Deleted Books, if any, have been removed from the Index.")
#---------------------------------------------------------------------------------------------------------------------------------------------
def count_records_in_index(my_db,my_cursor):
    """Count the rows of _mcs_word_book_index and store the count, as text, in the preferences.

    The result is written to prefs['WORD_INDEX_LATEST_RECORD_COUNT']; callers read it back
    from there.  Nothing is returned.  `my_db` is unused here.
    """
    mysql = "SELECT count(*) FROM _mcs_word_book_index "
    my_cursor.execute(mysql)
    tmp_rows = my_cursor.fetchall()
    #~ count(*) yields a single row with a single column; fall back to 0 if nothing came back.
    n_records = tmp_rows[0][0] if tmp_rows else 0
    prefs['WORD_INDEX_LATEST_RECORD_COUNT'] = unicode_type(n_records)
    #~ The former bare `prefs` expression statement on the next line had no effect and was removed.
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
def _delete_from_index_per_book(my_cursor,mysql,selected_books_list,value):
    """Run `mysql` once per book with bindings (book, value) inside one transaction; roll back on failure."""
    my_cursor.execute("begin")
    committed = False
    try:
        for book in selected_books_list:
            my_cursor.execute(mysql,(book,value))
        #END FOR
        my_cursor.execute("commit")
        committed = True
    finally:
        if not committed:
            #~ Do not leave a half-finished transaction open on the shared connection.
            try:
                my_cursor.execute("rollback")
            except Exception:
                pass
#---------------------------------------------------------------------------------------------------------------------------------------------
def perform_basic_trim(my_db,my_cursor,log,notifications,selected_books_list):
    """Trim short words, and optionally regex-matching words, from the index of the selected books.

    Step 1 deletes words shorter than prefs['WORD_INDEX_MINIMUM_NUMBER_OF_LETTERS'].
    Step 2 runs only when prefs['WORD_INDEX_DELETION_REGEX'] is set: it deletes words
    matching that regular expression, via the SQLite REGEXP operator.

    Fixes over the previous version:
      * The short-word DELETE had no `book = ?` filter, so it trimmed every book in the
        library (once per selected book) instead of only the selected books.
      * The minimum length is bound as an integer.  SQLite orders any integer below any
        TEXT value, so a text binding would have made `length(word) < ?` true for all rows.
      * An unset (None) regex no longer raises on Python 3, and an invalid regex is
        reported once instead of failing for every row.

    Parameters:
        my_db, my_cursor     -- open APSW connection and a cursor on it.
        log                  -- callable taking one message string.
        notifications        -- queue-like object with put((fraction, message)).
        selected_books_list  -- ids of the books whose index rows are trimmed.
    """
    notifications.put((0.02, 'Trimming Short Words in Index'))

    log(" ")
    log("Trimming the index for selected books: " + as_unicode(len(selected_books_list)))
    log(" ")

    try:
        minimum_letters = int(prefs['WORD_INDEX_MINIMUM_NUMBER_OF_LETTERS'])
    except (TypeError, ValueError):
        minimum_letters = None
        log("The minimum number of letters is not a whole number; short words were not trimmed.")

    if minimum_letters is not None:
        mysql = "DELETE FROM _mcs_word_book_index WHERE book = ? AND length(word) < ? "
        _delete_from_index_per_book(my_cursor,mysql,selected_books_list,minimum_letters)

    regex = prefs['WORD_INDEX_DELETION_REGEX']
    if regex and regex > " ":
        notifications.put((0.50, 'Trimming Words Matching REGEX in Index'))
        try:
            re.compile(regex)
            is_valid = True
        except (re.error, TypeError) as e:
            is_valid = False
            log("The deletion REGEX is not valid; nothing was trimmed with it: " + as_unicode(e))
        if is_valid:
            is_valid = apsw_create_regexp_user_function(my_db,my_cursor)
        if is_valid:
            mysql = "DELETE FROM _mcs_word_book_index WHERE book = ? AND word REGEXP ?  "
            _delete_from_index_per_book(my_cursor,mysql,selected_books_list,regex)

    notifications.put((0.99, 'Trimming(s) Finished'))

    count_records_in_index(my_db,my_cursor)
    n =  prefs['WORD_INDEX_LATEST_RECORD_COUNT']
    log("")
    log("Number of records currently in the index: " + as_unicode(n))
#---------------------------------------------------------------------------------------------------------------------------------------------
def apsw_create_regexp_user_function(my_db,my_cursor):
    """Register apsw_user_function_regexp as the SQL function "regexp" on `my_db`.

    The outcome is printed.  Returns True when registration succeeded, False when it
    raised.  `my_cursor` is unused here.
    """
    try:
        my_db.createscalarfunction("regexp", apsw_user_function_regexp)
        print("Create_SQLite_User_Function 3 was successful...")
        registered = True
    except Exception as err:
        print("Create_SQLite_User_Function [3] failed...cannot proceed...")
        print(as_unicode(err))
        registered = False
    return registered
#--------------------------------------------------------------------------------------------------
def apsw_user_function_regexp(regexpr,avalue):
    """Implementation of the SQL REGEXP operator: does `regexpr` match anywhere in `avalue`?

    http://www.sqlite.org/lang_expr.html:  The "X REGEXP Y" operator will be implemented
    as a call to "regexp(Y,X)", so the pattern arrives first and the tested value second.

    Matching ignores case and uses the DOTALL and MULTILINE flags.  Always returns a
    bool: False when either argument is empty or None (previously an implicit None),
    and False when the pattern cannot be compiled (the error is printed).
    """
    if not regexpr or not avalue:
        return False
    try:
        s_string = unicode_type(avalue)
        re_string = unicode_type(regexpr)
        p = re.compile(re_string, re.IGNORECASE|re.DOTALL|re.MULTILINE)
        return p.search(s_string) is not None
    except Exception as e:
        print(as_unicode(e))
        return False
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------------------------------------------
#END of main.py
