#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, re, sys, json, socket, tempfile, zipfile, shutil, time
from urllib.parse import unquote
from urllib.request import urlopen, urlretrieve
from xml.sax.saxutils import escape
from contextlib import contextmanager
from datetime import datetime, timedelta
from distutils.version import StrictVersion
# True when running on Windows; charoffset() uses a different offset formula there
iswindows = sys.platform.startswith('win')

# code provided by DiapDealer
@contextmanager
def make_temp_directory():
    ''' creates a temporary folder and removes it when the with block ends

    Yields the path of the new folder. The folder is deleted even if the
    body of the with statement raises an exception.
    '''
    temp_dir = tempfile.mkdtemp()
    try:
        yield temp_dir
    finally:
        # always clean up, otherwise a failed download/unzip leaves the folder behind
        shutil.rmtree(temp_dir, ignore_errors=True)

# code provided by DiapDealer
def is_connected():
    ''' tests Internet connectivity

    Tries to open a TCP connection to 8.8.8.8 on port 53 with a one second
    timeout. Returns True if the connection succeeds, otherwise False.
    '''
    try:
        # the with statement closes the socket again right away
        with socket.create_connection(('8.8.8.8', 53), 1):
            return True
    except OSError:
        # covers timeouts, unreachable networks and refused connections
        return False

# code provided by DiapDealer
def string_to_date(datestring):
    ''' converts a date string to a datetime object

    datestring: a time stamp as produced by str(datetime), i.e.
    "YYYY-MM-DD HH:MM:SS.ffffff" or "YYYY-MM-DD HH:MM:SS".
    Raises ValueError if the string matches neither format.
    '''
    try:
        return datetime.strptime(datestring, "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        # str(datetime) leaves out the fractional part when microsecond == 0
        return datetime.strptime(datestring, "%Y-%m-%d %H:%M:%S")

# get latest Grammalecte version and download link
def latest_gl_version():
    ''' returns the version number and download link of the latest grammalecte version

    Scrapes https://grammalecte.net for the version number and the download
    button. Returns a (latest_version, browser_download_url) tuple; either
    item is an empty string if there is no Internet connection, the website
    could not be read, or the corresponding pattern was not found.
    '''
    from urllib.parse import urljoin

    latest_version = ''
    browser_download_url = ''

    # make sure we have an Internet connection
    if not is_connected():
        return latest_version, browser_download_url

    # version and download link search patterns
    version_pattern = re.compile(r'<p id="version_num">([^<]+)</p>')
    filename_pattern = re.compile(r'<a class="button piwik_download" href="([^"]+)">')

    # scrape website; the with statement closes the connection again
    try:
        with urlopen('https://grammalecte.net', timeout=10) as response:
            response_text = response.read().decode('utf-8', 'ignore')
    except OSError:
        # URLError, HTTPError and socket timeouts are all OSError subclasses
        return latest_version, browser_download_url

    # get version info
    version_info = version_pattern.search(response_text)
    if version_info is not None:
        latest_version = version_info.group(1)

    # get browser download url (urljoin also copes with a leading slash or an absolute link)
    filename_info = filename_pattern.search(response_text)
    if filename_info is not None:
        browser_download_url = urljoin('https://grammalecte.net/', filename_info.group(1))

    return latest_version, browser_download_url

#--------------------------------
# offset code provided by KevinH
#--------------------------------
def generate_line_offsets(s):
    ''' returns a list that starts with 0, followed by the index of every newline in s '''
    newline_positions = [found.start() for found in re.finditer('\n', s)]
    return [0] + newline_positions

# code provided by KevinH
def charoffset(line, col, offlst):
    ''' returns the character offset for a 1-based line and column number

    offlst is the list returned by generate_line_offsets().
    '''
    line_start = offlst[line - 1]
    if iswindows:
        # the Windows formula additionally subtracts the line number
        # (presumably to compensate for CRLF line endings -- TODO confirm)
        position = line_start + col + 1 - line
    else:
        position = line_start + col
    # the first line has no preceding newline character
    return position - 1 if line == 1 else position

# load optional Grammalecte dictionary
def loadDictionary(spf):
    ''' loads an optional user JSON dictionary

    spf: path of the JSON dictionary file.
    Returns the parsed JSON object, or None (after printing an error
    message) if the file does not exist, is not UTF-8 text, or is not
    valid JSON.
    '''
    if not os.path.isfile(spf):
        print("# Error: file <" + spf + "> not found.")
        return None

    try:
        # the with statement makes sure the file handle is closed again
        with open(spf, "r", encoding="utf-8") as dict_file:
            return json.load(dict_file)
    except ValueError:
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
        print("# Error. File <" + spf + "> is not a valid JSON file.")
        return None

#-------------------------------------
# remove all html tags and entities
#-------------------------------------
def _mask_markup(match):
    ''' returns zero width joiners in place of the matched markup, keeping its length and line breaks '''
    zwj = u'\u200d'
    markup = match.group(0)
    if '\n' not in markup:
        return zwj * len(markup)
    # multi-line markup: keep CR, LF and spaces so that line and column numbers stay valid
    return ''.join(char if char in '\r\n ' else zwj for char in markup)


def _replace_entity(match):
    ''' returns a replacement of the same length for the matched entity '''
    zwj = u'\u200d'
    entity = match.group(0)
    # non-breaking space
    if entity in ('&#160;', '&nbsp;', '&#xa0;'):
        return zwj * (len(entity) - 1) + u'\u00A0'
    # en dash
    if entity in ('&#8211;', '&ndash;', '&#x2013;'):
        return zwj * (len(entity) // 2) + u'\u2013' + zwj * 3
    # em dash
    if entity in ('&#8212;', '&mdash;', '&#x2014;'):
        return zwj * (len(entity) // 2) + u'\u2014' + zwj * 3
    # all other entities
    return ' ' * len(entity)


def DeleteHTMLTags(html):
    ''' removes all HTML tags and entities

    Every removed item is overwritten with filler characters of the same
    length, so the returned string has exactly the same length and the same
    line breaks as html:
      - XML declarations, the doctype and script sections become zero width joiners
      - tags become spaces
      - non-breaking space and dash entities become the real character padded
        with zero width joiners; all other entities become spaces
    '''
    # replace XML declaration
    cleaned_html = re.sub(r'<\?xml[^>]+>', _mask_markup, html)

    # replace doctype
    cleaned_html = re.sub(r'<!DOCTYPE[^>]+>', _mask_markup, cleaned_html, count=1)

    # replace script sections; the non-greedy match handles each section on its own,
    # so text between two script sections survives, and (?<!/) skips self-closing
    # script tags, which have no matching </script>
    cleaned_html = re.sub(r'<script[^>]*(?<!/)>.*?</script>', _mask_markup, cleaned_html, flags=re.DOTALL)

    # replace standard HTML tags
    tag_pattern = r'''</?([A-Za-z][^\s>/]*)(?:=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)|[^>])*(?:>|$)'''
    cleaned_html = re.sub(tag_pattern, lambda match: ' ' * len(match.group(0)), cleaned_html)

    # replace entities
    cleaned_html = re.sub(r'&[^;]+;', _replace_entity, cleaned_html)

    return cleaned_html

#------------------------------------------
# parse Grammalecte JSON error messages
#------------------------------------------
def ParseErrorMessage(error):
    ''' parses a Grammalecte error entry

    Returns a (column_number, context, message) tuple: the 1-based start
    column, the underlined text (with its surrounding text if available)
    and the error message followed by the suggestions, if any.
    '''
    column_number = error['nStart'] + 1
    message = error['sMessage'] if 'sMessage' in error else ''

    # append the suggestions, using the singular label for a single suggestion
    candidates = error['aSuggestions']
    joined = ', '.join(candidates).strip()
    if joined:
        label = ' Suggestions: ' if len(candidates) > 1 else ' Suggestion: '
        message += label + joined

    # build the context string
    if 'sUnderlined' not in error:
        context = ''
    elif 'sBefore' in error and 'sAfter' in error:
        context = '{}■{}■{}'.format(error['sBefore'], error['sUnderlined'], error['sAfter'])
    else:
        context = error['sUnderlined']

    return column_number, context, message

#----------------------------
# main plugin routine
#----------------------------
def run(bk):
    ''' main routine

    Loads the plugin preferences, runs the optional Grammalecte update
    check, checks the selected XHTML files (or all text files) with
    Grammalecte and reports every error via bk.add_extended_result().

    bk: the book container object that the plugin launcher passes in.
    Returns 0 on success, or -1 if the grammalecte package can't be imported.
    '''

    # get/set Grammalecte prefs
    prefs = bk.getPrefs()

    # write initial JSON file
    # NOTE(review): 'nbsp' is written as True here, but its fallback default below is False -- confirm which one is intended
    if prefs == {}:
        prefs['update_check'] = True
        prefs['last_time_checked'] = str(datetime.now() - timedelta(days=7))
        prefs['check_interval'] = 7
        prefs['html'] = True
        prefs['nbsp'] = True
        prefs['error_types'] = ['lGrammarErrors']
        bk.savePrefs(prefs)

    # set Grammalecte defaults, in case the JSON file was invalid or is missing entries
    prefs.defaults['update_check'] = True
    prefs.defaults['last_time_checked'] = str(datetime.now() - timedelta(days=7))
    prefs.defaults['check_interval'] = 7
    prefs.defaults['html'] = True
    prefs.defaults['nbsp'] = False
    prefs.defaults['error_types'] = ['lGrammarErrors']

    # get plugin defaults
    debug = prefs.get('debug', False)
    update_check = prefs.get('update_check', True)
    last_time_checked = prefs.get('last_time_checked', str(datetime.now() - timedelta(days=7)))
    check_interval = prefs.get('check_interval', 7)
    all_files = prefs.get('all_files', False)
    error_types = prefs.get('error_types', ['lGrammarErrors'])

    # migrate/add update preference settings
    if 'update_check' not in prefs:
        prefs['update_check'] = update_check
        prefs['last_time_checked'] = last_time_checked
        prefs['check_interval'] = check_interval
        bk.savePrefs(prefs)

    #----------------------------
    # run update check
    #----------------------------

    # define paths
    plugin_path = os.path.join(bk._w.plugin_dir, bk._w.plugin_name)
    pythonpath_dir = os.path.join(plugin_path, 'grammalecte')

    # reset time stamp to trigger an update check, if the grammalecte folder is missing
    if not os.path.isdir(pythonpath_dir):
        update_check = True
        last_time_checked = str(datetime.now() - timedelta(days=2))

        check_interval = 1
        if debug:
            print('DEBUG: Grammalecte re-download triggered')

    # get current version number (can't use imports)
    current_version = None
    gc_engine_path = os.path.join(plugin_path, 'grammalecte', 'fr', 'gc_engine.py')
    if os.path.exists(gc_engine_path):
        with open(gc_engine_path, 'r', encoding='utf-8') as f:
            response_text = f.read()

        version_pattern = re.compile(r'__version__ = "([^"]+)"')
        version_info = version_pattern.search(response_text)
        if version_info is not None:
            if len(version_info.groups()) == 1:
                current_version = version_info.group(1)
                if debug:
                    print('DEBUG: Current_version: ', current_version)
    else:
        if debug:
            print('DEBUG: gc_engine.py not found!')

    # run update check
    if update_check:

        # make sure we have an Internet connection
        if is_connected():

            # compare current date against last update check date
            try:
                time_delta = (datetime.now() - string_to_date(last_time_checked)).days
            except (TypeError, ValueError):
                # unreadable time stamp: treat the update check as overdue instead of crashing
                time_delta = check_interval
            if time_delta >= check_interval:

                # display running update check message
                print('Recherche de mises à jour...\n')

                # get latest version and download url
                latest_version, browser_download_url = latest_gl_version()

                # update time stamp
                prefs['last_time_checked'] = str(datetime.now())
                bk.savePrefs(prefs)

                # update check; empty strings mean that the website couldn't be read, and
                # a missing current version means that Grammalecte has to be downloaded.
                # The empty/None checks must come first: StrictVersion can't handle them.
                update_found = False
                if latest_version != '' and browser_download_url != '':
                    if current_version is None:
                        update_found = True
                    else:
                        try:
                            update_found = StrictVersion(latest_version) > StrictVersion(current_version)
                        except ValueError:
                            # one of the version numbers isn't in StrictVersion format
                            if debug:
                                print('DEBUG: Invalid version number: ', latest_version, current_version)

                if update_found:

                    # display update found message
                    print('Mise à jour trouvée : Grammalecte {}\n'.format(latest_version))

                    # get base name
                    base_name = os.path.basename(browser_download_url)

                    # create temp directory
                    with make_temp_directory() as td:
                        zip_file_name = os.path.join(td, base_name)
                        # display file download message
                        print('Téléchargement du fichier {}...\n'.format(base_name))
                        urlretrieve(browser_download_url, zip_file_name)

                        # make sure the file was actually downloaded
                        if os.path.exists(zip_file_name):
                            # display filed downloaded message
                            print('{0} a bien été téléchargé.\n'.format(base_name))

                            # read zip file (the with statement closes it even if the extraction fails)
                            # https://stackoverflow.com/q/19618268/2614117
                            with zipfile.ZipFile(zip_file_name) as archive:
                                files = archive.namelist()
                                files_to_extract = [m for m in files if m.startswith('pythonpath/grammalecte')]
                                archive.extractall(td, files_to_extract)

                            # Grammalecte pythonpath temp folder location
                            temp_pythonpath_dir = os.path.join(td, 'pythonpath', 'grammalecte')

                            # make sure the files were actually extracted
                            if os.path.isdir(temp_pythonpath_dir):

                                # delete /pythonpath folder
                                if os.path.isdir(pythonpath_dir):
                                    shutil.rmtree(pythonpath_dir, ignore_errors=True)

                                # don't know why this is needed!
                                if os.path.isdir(pythonpath_dir):
                                    if debug:
                                        print('DEBUG: grammalecte dir not empty')
                                    shutil.rmtree(pythonpath_dir, ignore_errors=True)

                                # move new files to the plugin folder
                                # this part occasionally fails!!!
                                try:
                                    shutil.move(temp_pythonpath_dir, plugin_path)
                                    if debug:
                                        print('DEBUG: TRY: shutil succeeded.')
                                except Exception:
                                    if debug:
                                        print('DEBUG: EXCEPT: shutil failed.')
                                    # retry once after deleting the old folder again
                                    shutil.rmtree(pythonpath_dir, ignore_errors=True)
                                    shutil.move(temp_pythonpath_dir, plugin_path)

                                # update version number
                                version = latest_version

                                # display update successful message
                                print('Grammalecte a été mise à jour vers la version {}.\n'.format(version))

                            else:
                                # display Grammalecte files couldn't be unzipped error message
                                print('Les fichiers Grammalecte n\'ont pas pu être décompressés.\n')

                        else:
                            # display Grammalecte download failed message
                            print('Échec du téléchargement du fichier : {}.\n'.format(browser_download_url))
                            # wait for 2 seconds
                            time.sleep(2)
                else:
                    print('Aucune mise à jour trouvée.\n')
            else:
                if debug:
                    print('DEBUG: Update check skipped. Time delta: ', time_delta)
        else:
            # display no internet; update skipped message
            print('Recherche de mises à jour n’a pas été effectué : aucune connexion Internet.\n')
        if debug:
            # pause for 5 seconds in debug mode
            time.sleep(5)

    #-------------------------
    # set up Grammalecte
    #-------------------------
    try:
        import grammalecte
        import grammalecte.text as txt
    except Exception:
        # any failure while importing the package is reported as a broken installation
        print('Grammalecte introuvable. Veuillez réinstaller le plug-in.')
        return -1

    oGrammarChecker = grammalecte.GrammarChecker("fr")
    oGrammarChecker.gce.setOptions(prefs)
    version = oGrammarChecker.gce.version
    #if debug:
        #oGrammarChecker.gce.displayOptions("fr")

    # set up optional spell checker
    if 'lSpellingErrors' in error_types:
        oSpellChecker = oGrammarChecker.getSpellChecker()

        # look for a custom dictionary
        personal_dict = os.path.join(plugin_path, 'fr.personal.json')
        if os.path.isfile(personal_dict):

            oJSON = loadDictionary(personal_dict)
            if oJSON:
                oSpellChecker.setPersonalDictionary(oJSON)

    #-----------------------------------------
    # get files to be processed
    #-----------------------------------------
    selected_files = []
    for file_name in list(bk.selected_iter()):
        if bk.id_to_mime(file_name[1]) == 'application/xhtml+xml':
            selected_files.append((file_name[1], bk.id_to_href(file_name[1])))

    # select files to be processed
    if selected_files != [] and not all_files:
        # only selected files
        file_list = selected_files
    else:
        # ALL HTML files
        file_list = list(bk.text_iter())

    #--------------------------------
    # process all selected files
    #--------------------------------
    print('Grammalecte {} en cours d\'exécution ... veuillez patienter.\n'.format(version))

    # check for older versions, which used the generateParagraphAsJSON attribute
    # (the version doesn't change while the files are processed, so compare it only once)
    use_legacy_api = StrictVersion(version) <= StrictVersion('1.7.0')

    for (html_id, href) in file_list:

        # read file contents
        html = bk.readfile(html_id)
        filename = os.path.basename(href)

        # generate offset list
        offlst = generate_line_offsets(html)

        # blank out all markup (the text keeps its length and line breaks)
        sText = DeleteHTMLTags(html)

        # check each paragraph
        for i, sParagraph in enumerate(txt.getParagraph(sText)):

            if use_legacy_api:
                sJSON = oGrammarChecker.generateParagraphAsJSON(i, sParagraph, bContext=True, bEmptyIfNoErrors=True, bSpellSugg=True)
            else:
                sJSON = oGrammarChecker.getParagraphErrorsAsJSON(i, sParagraph, bContext=True, bEmptyIfNoErrors=True, bSpellSugg=True)

            # ignore empty lines
            if sJSON != '':
                parsed_json = json.loads(sJSON)
                line_number = parsed_json['iParagraph'] + 1

                # parse all errors
                for error_type in error_types:

                    # an error type that is missing from the result is skipped
                    for GLError in parsed_json.get(error_type, []):
                        column_number, context, message = ParseErrorMessage(GLError)

                        # ignore certain whitespace errors
                        ignore_this_error = False
                        sRuleId = ''
                        if 'sRuleId' in GLError:
                            sRuleId = GLError['sRuleId']
                            if sRuleId in ['tab_début_ligne', 'tab_fin_ligne', 'esp_début_ligne', 'esp_fin_ligne', 'esp_milieu_ligne', 'esp_mélangés2']:
                                ignore_this_error = True

                        # get typo
                        if error_type == 'lSpellingErrors':
                            nStart = GLError['nStart']
                            nEnd = GLError['nEnd']
                            Typo = sParagraph[nStart:nEnd]
                            context = 'Typo: ' + Typo

                        # add error message to validation pane
                        if not ignore_this_error:
                            coffset = charoffset(int(line_number), int(column_number), offlst)
                            print(filename, line_number, column_number, coffset, context, message)
                            msg = escape('Col: ' + str(column_number) + ' ' + context + ' ' + message).replace('"', '&quot;')

                            # Sigil 1.x (and higher) requires a file path instead of a file name
                            if  bk.launcher_version() >= 20190927:
                                bookpath = bk.id_to_bookpath(html_id)
                                bk.add_extended_result('error', bookpath, line_number, coffset, msg)
                            else:
                                bk.add_extended_result('error', filename, line_number, coffset, msg)

    return 0

def main():
    ''' prints a notice that main() should not have been reached and returns -1 '''
    notice = 'I reached main when I should not have\n'
    print(notice)
    return -1

if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
