#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab

# Copyright 2023 Kevin B. Hendricks, Stratford Ontario

# This plugin's source code is available under the GNU LGPL Version 2.1 or GNU LGPL Version 3 License.
# See https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html or
# https://www.gnu.org/licenses/lgpl.html for the complete text of the license.

import sys
import os

import navprocessor
import quickparser
from hrefutils import urlencodepart, urldecodepart

import sigil_gumbo_bs4_adapter as gumbo_bs4

# Home directory of the user running the plugin ("~" expanded by os.path).
# NOTE(review): not referenced by any code visible in this file - confirm
# whether it is still needed.
_USER_HOME = os.path.expanduser("~")


# Skeleton of a new epub3 list-of-illustrations document.
# get_loi_template() fills the placeholders by plain string replacement:
#   %1 - language code (used for both xml:lang and lang)
#   %2 - title text (used for both <title> and the heading)
#   %3 - heading tag name (h1 .. h6)
#   %4 - extra attribute text for the <ol> tag (may be empty)
# The empty <ol> is later filled with the generated list items by run().
_LOI_SOURCE_3 = """<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"  xml:lang="%1" lang="%1">
  <head>
    <title>%2</title>
  </head>
  <body>
    <nav epub:type="loi">
      <%3>%2</%3>
      <ol%4>
      </ol>
    </nav>
  </body>
</html>
"""

# Skeleton of a new epub2 list-of-illustrations document.
# get_loi_template() fills the placeholders by plain string replacement:
#   %1 - language code (xml:lang)
#   %2 - title text (used for both <title> and the heading)
#   %3 - heading tag name (h1 .. h6); it must appear in BOTH the opening
#        and the closing heading tag so the result is well-formed
#   %4 - extra attribute text for the <ol> tag (may be empty)
# The empty <ol> is later filled with the generated list items by run().
_LOI_SOURCE_2 = """<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" 
  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="%1">
  <head>
    <title>%2</title>
  </head>
  <body>
    <div>
      <%3>%2</%3>
      <ol%4>
      </ol>
    </div>
  </body>
</html>
"""

# Tag names an img/image must have as its parent (or as the parent of its
# wrapping <a>) for run() to consider it a candidate illustration.
_PARENT_IMAGE_TAGS = [ "div", "figure", "svg"]

# Translations of the phrase "List of Illustrations", keyed by language code.
# Looked up by get_translated_title().  The values are substituted into the
# XHTML template without further escaping, which is why the 'ca' entry
# carries an already-encoded &apos; entity.
_XLATE_TABLE = {
    'az'    : 'İllüstrasiyaların siyahısı',
    'ca'    : 'Llista d&apos;il·lustracions',
    'cs'    : 'Seznam kreseb',
    'da'    : 'Billedliste',
    'de'    : 'Bilderverzeichnis',
    'en'    : 'List of Illustrations',
    'es'    : 'Lista de ilustraciones',
    'fi'    : 'Kuvaluettelo',
    'fr'    : 'Liste des illustrations',
    'gl'    : 'Lista de Ilustracións',
    'it'    : 'Lista delle Illustrazioni',
    'ja'    : 'イラスト一覧',
    'ko'    : '일러스트레이션 목록',
    'nl'    : 'Lijst van illustraties',
    'pl'    : 'Spis ilustracji',
    'pt'    : 'Lista de figuras',
    'ru'    : 'Список иллюстраций',
    'sr'    : 'Листа илустрација',
    'sv'    : 'Förteckning över illustrationer',
    'th'    : 'รายชื่อภาพประกอบ',
    'tr'    : 'Çizimler listesi',
    'uk'    : 'Список ілюстрацій',
    'zh_CN' : '插图列表',
    'zh_TW' : '插圖清單'
}


# escape the xml special characters & < > " in a string
def xmlencode(data):
    """Return *data* with ``&``, ``<``, ``>`` and ``"`` replaced by entities.

    ``None`` yields the empty string.  The ampersand is handled first so
    that the ampersands introduced by the other entities are not escaped
    a second time.
    """
    if data is None:
        return ''
    escaped = data
    for raw, entity in (('&', '&amp;'),
                        ('<', '&lt;'),
                        ('>', '&gt;'),
                        ('"', '&quot;')):
        escaped = escaped.replace(raw, entity)
    return escaped


# turn the entities &quot; &gt; &lt; &amp; back into plain characters
def xmldecode(data):
    """Return *data* with the four basic XML entities decoded.

    ``None`` yields the empty string.  ``&amp;`` is decoded last so that
    text such as ``&amp;lt;`` becomes ``&lt;`` rather than ``<``.
    """
    if data is None:
        return ''
    plain = data
    for entity, raw in (('&quot;', '"'),
                        ('&gt;', '>'),
                        ('&lt;', '<'),
                        ('&amp;', '&')):
        plain = plain.replace(entity, raw)
    return plain


# utility routine: accept a candidate description only when it
# still has content after stripping, otherwise return None
def set_if_valid(txt):
    """Return *txt* stripped of surrounding whitespace, or ``None``.

    ``None`` is returned when *txt* is falsy or contains only whitespace.
    """
    if not txt:
        return None
    stripped = txt.strip()
    return stripped if stripped else None


# parse the opf metadata xml and return the text of the first dc:language
def get_book_lang(mdata):
    """Return the stripped text of the first ``dc:language`` element.

    *mdata* is the metadata xml handed to the quickparser.  When no text is
    found inside a ``dc:language`` element the default ``"en"`` is returned.
    """
    parser = quickparser.QuickXHTMLParser()
    parser.setContent(mdata)
    inside_language = False
    for text, _prefix, tagname, tagtype, _attrs in parser.parse_iter():
        if text is not None:
            # first text seen while inside dc:language is the answer
            if inside_language:
                return text.strip()
            continue
        if tagname != "dc:language":
            continue
        if tagtype == "begin":
            inside_language = True
        elif tagtype == "end":
            inside_language = False
    return "en"


# use lang to translate the string "List of Illustrations"
def get_translated_title(lang):
    """Return the translation of "List of Illustrations" for *lang*.

    Lookup order in ``_XLATE_TABLE``:
      1. *lang* exactly as given;
      2. *lang* with hyphens turned into underscores, so a tag such as
         ``zh-CN`` finds the ``zh_CN`` entry;
      3. the primary language subtag, so ``fr-FR`` / ``fr_FR`` find ``fr``.
    The English phrase is returned when nothing matches or *lang* is empty.
    """
    default_title = "List of Illustrations"
    if not lang:
        return default_title
    if lang in _XLATE_TABLE:
        return _XLATE_TABLE[lang]
    # the table keys use "_" as the region separator
    normalized = lang.replace('-', '_')
    if normalized in _XLATE_TABLE:
        return _XLATE_TABLE[normalized]
    # fall back to the bare language code in front of any region/script part
    short_lang = normalized.split('_')[0]
    return _XLATE_TABLE.get(short_lang, default_title)


# get appropriate template source for new loi.xhtml
def get_loi_template(epubversion, lang, title, heading, use_numbers):
    """Return the filled-in XHTML source for a new, empty loi document.

    The epub3 template is used when *epubversion* starts with "3", the
    epub2 template otherwise.  *lang*, *title* and *heading* replace the
    %1, %2 and %3 placeholders verbatim.  %4 becomes an empty string when
    *use_numbers* is true, otherwise a style attribute that turns list
    numbering off.
    """
    template = _LOI_SOURCE_3 if epubversion.startswith("3") else _LOI_SOURCE_2
    ol_attr = '' if use_numbers else ' style="list-style-type:none;"'
    substitutions = (
        ('%1', lang),
        ('%2', title),
        ('%3', heading),
        ('%4', ol_attr),
    )
    for marker, value in substitutions:
        template = template.replace(marker, value)
    return template


# generate list items for the loi
def build_loi_items_xhtml(bk, mode, ilvl, loi_bookpath, entries):
    """Return the loi list items as a list of newline-terminated strings.

    bk           - book container, used to compute relative paths
    mode         - plugin mode; only "comment" changes the output here
    ilvl         - number of spaces each item is indented by
    loi_bookpath - book path of the loi file the links are relative to
    entries      - iterable of (item_type, bookpath, id_tgt, desc)

    Each entry becomes one line of the form
        <li><p><a href="preface.xhtml#the-edge-of-the-world">The Edge of the World</a></p></li>
    In "comment" mode entries whose item_type is "comment" are wrapped in an
    XML comment so the user can decide later whether to enable them.
    """
    indent = " " * ilvl
    res = []
    for item_type, bookpath, id_tgt, desc in entries:
        href = bk.get_relativepath(loi_bookpath, bookpath)
        # decode first and then encode again - presumably so that the path
        # ends up percent-encoded exactly once (hrefutils semantics, confirm)
        href = urlencodepart(urldecodepart(href))
        href = href + "#" + urlencodepart(id_tgt)
        rec = '<li><p><a href="' + href + '">' + xmlencode(desc) + '</a></p></li>'
        if mode == "comment" and item_type == "comment":
            # "--" is not allowed inside an XML comment.  Rewrite the second
            # hyphen of every pair as a character reference: the comment
            # stays well-formed and the reference resolves back to "-" in
            # both the href and the text once the user uncomments the line.
            rec = '<!-- ' + rec.replace('--', '-&#45;') + '-->'
        res.append(indent + rec + "\n")
    return res


# update the opf guide or the nav landmarks with loi semantics
def update_guide_landmarks(bk, epubversion, loi_bookpath, xtitle):
    """Register the loi file in the book's landmarks (epub3) or guide (epub2).

    bk           - book container
    epubversion  - version string; a leading "3" selects the nav landmarks
    loi_bookpath - book path of the loi file
    xtitle       - title stored with the new entry

    epub3: a ["loi", href, title] entry is appended to the nav landmarks and
    the nav file is rewritten.  Otherwise a ["loi", title, href] entry is
    appended to the opf guide.  Returns None.
    """
    # NOTE(review): the relative href is stored exactly as returned by
    # get_relativepath, without a decode/encode pass - confirm this is what
    # NavProcessor.setLandmarks and bk.setguide expect.
    if epubversion.startswith("3"):
        nav_id = bk.getnavid()
        nav_bookpath = bk.id_to_bookpath(nav_id)
        navsrc = bk.readfile(nav_id)  # data is utf-8 encoded
        np = navprocessor.NavProcessor(navsrc)
        lmarks = np.getLandmarks()
        ehref = bk.get_relativepath(nav_bookpath, loi_bookpath)
        lmarks.append(["loi", ehref, xtitle])
        np.setLandmarks(lmarks)
        bk.writefile(nav_id, np.getNavSrc())
    else:
        opf_bookpath = bk.get_opfbookpath()
        ghref = bk.get_relativepath(opf_bookpath, loi_bookpath)
        opf_guide = bk.getguide()
        # note the guide keeps title and href in the opposite order
        opf_guide.append(["loi", xtitle, ghref])
        bk.setguide(opf_guide)


# parse text that precedes tag in quickparser
# to determine tag indent level count
def determine_indent_count(txt, tabwidth):
    """Return the column width of the text following the last newline in *txt*.

    A tab counts as *tabwidth* columns and every other character as one.
    ``None`` or an empty string yields 0, so a caller that has not seen any
    text before the tag gets "no indent" instead of an exception.
    """
    if not txt:
        return 0
    # a newline resets the count, so only the final line can contribute
    last_line = txt.rsplit("\n", 1)[-1]
    tabs = last_line.count("\t")
    return tabs * tabwidth + (len(last_line) - tabs)


# return the image reference of an img (src) or svg image
# (xlink:href / href) node, or None when the attribute is missing
def _image_reference(node):
    if node.name == 'img':
        if node.has_attr('src'):
            return node['src']
        return None
    link_dest = None
    if node.has_attr('xlink:href'):
        link_dest = node['xlink:href']
    if node.has_attr('href'):
        # a plain href wins over xlink:href when both are present
        link_dest = node['href']
    return link_dest


# the plugin entry point
def run(bk):
    """Build or refresh the book's list of illustrations (loi).

    Steps performed on the book container *bk*:
      1. read the plugin preferences (mode, tabwidth, heading, numbers);
      2. map book paths to their landmark (epub3 nav) or guide (epub2 opf)
         type and remember an already existing loi file;
      3. walk the spine, skipping files with list/cover style landmark
         types, and collect every img / svg image whose parent (or the
         parent of its wrapping <a>) is one of _PARENT_IMAGE_TAGS, together
         with a description and a link target id (created when missing);
      4. save the preferences;
      5. create a new loi file if none exists and register it in the
         landmarks / guide;
      6. replace the contents of the first <ol> of the loi file with the
         collected entries (or add an <ol> before </body> if there is none).

    Returns 0 on success (including "no suitable images found") and -1 when
    the launcher version is older than 20190927.
    """

    if bk.launcher_version() < 20190927:
        print("Error: LOI-Generator requires Sigil 1.0 or later and Python 3")
        return -1

    prefs = bk.getPrefs()
    # possible mode values are "drop", "keep", "comment"
    prefs.defaults['mode'] = "comment"
    prefs.defaults['tabwidth'] = 4
    prefs.defaults['heading'] = 'h1'
    prefs.defaults['numbers'] = "no"
    mode = prefs['mode']
    tabwidth = prefs['tabwidth']
    hval = prefs['heading']
    if hval in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
        heading = hval
    else:
        heading = 'h1'
    numbers = prefs['numbers']
    use_numbers = numbers in ['YES', 'Yes', 'yes', 'ON', 'On', 'on', 'TRUE', 'True', 'true']

    epubversion = bk.epub_version()
    epublang = get_book_lang(bk.getmetadataxml())

    loi_id = None
    loi_bookpath = None

    # build up dictionary of book paths to landmark / guide types
    landmark_info = {}
    if epubversion.startswith("3"):
        # parse nav landmarks to build dictionary of book paths and landmark types
        nav_id = bk.getnavid()
        nav_bookpath = bk.id_to_bookpath(nav_id)
        nav_dir = bk.get_startingdir(nav_bookpath)
        navsrc = bk.readfile(nav_id)
        np = navprocessor.NavProcessor(navsrc)
        lmarks = np.getLandmarks()
        for etype, ehref, etitle in lmarks:
            # split off any anchor id info from href
            ehref = ehref.split('#')[0]
            bookpath = bk.build_bookpath(ehref, nav_dir)
            landmark_info[bookpath] = etype
            if etype == 'loi':
                loi_id = bk.bookpath_to_id(bookpath)
                loi_bookpath = bookpath
    else:
        # use opf guide to generate dictionary of book paths and guide types
        opf_bookpath = bk.get_opfbookpath()
        opf_dir = bk.get_startingdir(opf_bookpath)
        opf_guide = bk.getguide()
        for gtype, gtitle, ghref in opf_guide:
            # split off any anchor id info from href
            ghref = ghref.split('#')[0]
            bookpath = bk.build_bookpath(ghref, opf_dir)
            landmark_info[bookpath] = gtype
            if gtype == 'loi':
                loi_id = bk.bookpath_to_id(bookpath)
                loi_bookpath = bookpath

    # process all xhtml files in spine order looking for img tags that meet
    # the conditions for inclusion into loi

    loi_info = []
    for idref, linear, href in bk.spine_iter():

        bookpath = bk.id_to_bookpath(idref)

        print("processing ... ", bookpath)

        lm = None
        if bookpath and bookpath in landmark_info:
            lm = landmark_info[bookpath]

        # do not process files of specific types
        if lm and lm in ["loi", "cover", "toc", "loa", "lov", "index", "other.loa", "other.lov", "lot"]:
            continue

        # now use bs4 to parse each file looking for suitable img tags
        src = bk.readfile(idref)
        soup = gumbo_bs4.parse(src)

        # and build the set of all id attribute values used in that file
        # so that newly generated ids never collide with existing ones
        used_ids = set()
        for node in soup.find_all(id=True):
            used_ids.add(node['id'])

        basename = bk.href_to_basename(href)
        cnt = 0
        file_modified = False

        # search for img tags whose direct parents are in _PARENT_IMAGE_TAGS
        for node in soup.find_all(['img', 'image']):
            image_ref = _image_reference(node)
            # log the image; a missing src / href attribute must not abort
            # the whole run, so a placeholder is printed instead
            if node.name == 'img':
                shown = image_ref if image_ref is not None else "unknown image src"
                print("    found img: ", shown, node.parent.name, end='')
            else:
                shown = image_ref if image_ref is not None else "unknown image href"
                print("    found image: ", shown, node.parent.name, end='')

            id_tgt = None
            aparent = node.parent

            # check if image wrapped in an anchor tag and if so record its
            # id but use its parent instead
            if aparent.name == 'a':
                if aparent.has_attr('id'):
                    id_tgt = aparent['id']
                aparent = aparent.parent

            aparent_tag = aparent.name

            if aparent_tag not in _PARENT_IMAGE_TAGS:
                # not a candidate: just terminate the log line started above
                print("  skipped")
                continue

            # we have found a potential target img/image
            # determine if a proper caption or title can be found
            desc = None

            # first look if a title attribute exists on the parent tag
            if aparent.has_attr('title'):
                desc = set_if_valid(aparent['title'])

            # if the parent is a svg tag look for title tag child
            # and if none, then look for desc tag child
            if not desc and aparent_tag == 'svg':
                if aparent.title:
                    desc = set_if_valid(aparent.title.get_text())
                if not desc and aparent.desc:
                    desc = set_if_valid(aparent.desc.get_text())

            # if the parent is a figure tag look for a figcaption
            if not desc and aparent_tag == 'figure' and aparent.figcaption:
                desc = set_if_valid(aparent.figcaption.get_text())

            # if the parent is a div tag look for a child div used for title or caption
            if not desc and aparent_tag == 'div' and aparent.div:
                desc = set_if_valid(aparent.div.get_text())

            # if the parent is a div tag look for a child p used for title or caption
            if not desc and aparent_tag == 'div' and aparent.p:
                desc = set_if_valid(aparent.p.get_text())

            # last resort for a real description: the alt text of the image
            if not desc and node.has_attr('alt'):
                desc = set_if_valid(node['alt'])

            # now look for an existing id
            if not id_tgt and aparent.has_attr('id'):
                id_tgt = aparent['id']

            if not id_tgt and node.has_attr('id'):
                id_tgt = node['id']

            if not id_tgt:
                # we need to create an id on the parent tag
                # but do not if operating in drop mode when no desc exists
                if desc or mode in ["comment", "keep"]:
                    id_tgt = 'loi' + basename + str(cnt)
                    while id_tgt in used_ids:
                        cnt = cnt + 1
                        id_tgt = 'loi' + basename + str(cnt)
                    aparent['id'] = id_tgt
                    used_ids.add(id_tgt)
                    file_modified = True

            item_type = "keep"
            # search for a suitable descriptor string
            if not desc:
                if mode == "drop":
                    item_type = "drop"
                if mode in ["keep", "comment"]:
                    if mode == "comment":
                        item_type = "comment"
                    # this image has no caption or title info and is most likely
                    # not an image meant for a loi but if in keep or comment mode
                    # then fabricate a description from the image href
                    desc = image_ref
                    if not desc:
                        desc = "Unknown Image"

            print(" ", item_type)
            if desc:
                loi_entry = [item_type, bookpath, id_tgt, desc]
                loi_info.append(loi_entry)

        if file_modified:
            # we need to serialize the new tree and write it back to the file
            new_src = soup.serialize_xhtml('utf-8')
            bk.writefile(idref, new_src)

    # store the (validated) preferences for the next run
    prefs['mode'] = mode
    prefs['tabwidth'] = tabwidth
    prefs['heading'] = heading
    if use_numbers:
        prefs['numbers'] = 'yes'
    else:
        prefs['numbers'] = 'no'
    bk.savePrefs(prefs)

    # exit early if no suitable images were found
    if len(loi_info) == 0:
        print("No suitable images were found, exiting with no changes")
        return 0

    # create the loi source if needed
    loi_src = None
    if loi_id:
        # use existing loi file
        loi_src = bk.readfile(loi_id)
        loi_bookpath = bk.id_to_bookpath(loi_id)
    else:
        # build an empty loi_file with translated title and proper lang info
        loi_title = get_translated_title(epublang)
        loi_src = get_loi_template(epubversion, epublang, loi_title, heading, use_numbers)
        # create the new loi file and add it to the end of the spine
        loi_name = "loi.xhtml"
        loi_id = "loi.xhtml"
        bk.addfile(loi_id, loi_name, loi_src)
        bk.spine_insert_before(-1, loi_id, "yes")
        loi_bookpath = bk.id_to_bookpath(loi_id)
        update_guide_landmarks(bk, epubversion, loi_bookpath, loi_title)

    # now parse the existing source and replace just the contents of the
    # first ol tag and nothing else
    ps = quickparser.QuickXHTMLParser()
    ps.setContent(loi_src)
    injection_performed = False
    in_ol = False        # True while inside the ol whose contents are replaced
    nested_ol = 0        # depth of ol tags nested inside that replaced ol
    last_txt = None
    res = []
    for text, tagprefix, tagname, tagtype, tagattr in ps.parse_iter():
        if text is not None:
            last_txt = text
            if not in_ol:
                res.append(text)
            continue

        if tagname == "ol" and tagtype == "begin":
            if in_ol:
                # a list nested in the old contents: it is being dropped, but
                # its end tag must not be mistaken for the end of the outer ol
                nested_ol = nested_ol + 1
                continue
            if not injection_performed:
                res.append(ps.tag_info_to_xml(tagname, tagtype, tagattr) + "\n")
                # last_txt is None when no text preceded the tag
                ilvl = determine_indent_count(last_txt or "", tabwidth) + 2
                # now inject the new loi entries
                res.extend(build_loi_items_xhtml(bk, mode, ilvl, loi_bookpath, loi_info))
                in_ol = True
                injection_performed = True
                continue
            # any later, unrelated ol is copied through unchanged below
        elif tagname == "ol" and tagtype == "end" and in_ol:
            if nested_ol > 0:
                nested_ol = nested_ol - 1
                continue
            # the whitespace before the end tag was dropped together with the
            # old contents, so rebuild the indent of the closing tag
            ilvl = determine_indent_count(last_txt or "", tabwidth)
            res.append(" " * ilvl)
            in_ol = False
        elif tagname == "body" and tagtype == "end" and not injection_performed:
            # inject the new loi entries wrapped in its own ol tag
            # just inside the closing body tag since no ol tag found
            res.append('    <ol style="list-style-type: none;">\n')
            ilvl = 6
            res.extend(build_loi_items_xhtml(bk, mode, ilvl, loi_bookpath, loi_info))
            res.append("    </ol>\n")

        if not in_ol:
            res.append(ps.tag_info_to_xml(tagname, tagtype, tagattr))
    loi_src = "".join(res)
    bk.writefile(loi_id, loi_src)

    print("LOI Generation Complete")
    # Setting the proper Return value is important.
    # 0 - means success
    # anything else means failure
    return 0
 

def main():
    """Report that the module was started directly and return -1.

    The real work is done by run(bk); this function only exists so that a
    direct invocation of the file ends with a failure status.
    """
    notice = "I reached main when I should not have\n"
    failure = -1
    print(notice)
    return failure


if __name__ == "__main__":
    sys.exit(main())
