# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2015,2016,2017,2018,2019,2020,2021,2022,2023 DaltonST 2024 DJG'
__my_version__ = "2.0.1"

import re
import time
from calibre import browser
from calibre.constants import DEBUG
# from calibre.ebooks.BeautifulSoup import BeautifulSoup
# import requests
import xml.etree.ElementTree as ET

STDNBR = "stdnbr"

#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
def oclc_classify_webscraping_stdnbr(paramtype, paramvalue):
    """Look up classification data for a standard number at the Library of Congress.

    Sends an SRU ``searchRetrieve`` request (MODS schema, at most 20 records)
    to ``lx2.loc.gov`` with ``dc.identifier`` set to *paramvalue*, then pulls
    the classification numbers and subject topics out of the XML response.
    Every failure path (unsupported type, empty value, network error, empty
    or malformed response) yields the "not found" result instead of raising.

    Args:
        paramtype: Kind of lookup requested. Only ``STDNBR`` is handled; any
            other value returns the "not found" result without a request.
        paramvalue: The identifier to search for. Converted with ``str()``
            and percent-encoded before being placed in the URL.

    Returns:
        A 7-tuple ``(ddc, lcc, subjects, oclc_owi, oclc_wi, oclc_worldcat,
        viaf_author_id)``. ``ddc`` and ``lcc`` are strings, ``"NONE"`` when
        not found. ``subjects`` is a list of topic strings (possibly empty).
        The last four items are always ``"NONE"`` here; they only keep the
        tuple shape that callers unpack.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import quote

    # Fixed pause before every lookup (presumably to throttle requests
    # against the remote service -- the reason is not stated in this file).
    time.sleep(1.6)
    if DEBUG:
        print("paramtype: ", str(paramtype))
        print("paramvalue: ", str(paramvalue))

    ddc_return = "NONE"
    lcc_return = "NONE"
    fast_return_list = []
    oclc_owi_return = "NONE"
    oclc_wi_return = "NONE"
    oclc_worldcat_return = "NONE"
    viaf_author_id_return = "NONE"

    def _results():
        # Reads the enclosing variables at call time, so early exits return
        # the defaults and the final exit returns whatever was found.
        return (ddc_return, lcc_return, fast_return_list, oclc_owi_return,
                oclc_wi_return, oclc_worldcat_return, viaf_author_id_return)

    # Previously an unsupported paramtype left the URL variable unbound and
    # only "worked" because the resulting error was swallowed further down.
    if paramtype != STDNBR:
        if DEBUG:
            print("Unsupported paramtype; nothing to look up: ", str(paramtype))
        return _results()

    stdnbr = "" if paramvalue is None else str(paramvalue).strip()
    if not stdnbr:
        if DEBUG:
            print("paramvalue is empty; nothing to look up")
        return _results()

    base_stdnr = "http://lx2.loc.gov:210/LCDB?version=1.1&operation=searchRetrieve&recordSchema=mods&maximumRecords=20&query=dc.identifier%3D[STDNBRGOESHERE]"
    # Percent-encode the value so characters such as '&', '#' or spaces cannot
    # corrupt the query string. '+' and '%' are left alone so a value that a
    # caller already encoded is not encoded twice.
    base_url = base_stdnr.replace("[STDNBRGOESHERE]", quote(stdnbr, safe="+%"))

    br = browser()
    # NOTE(review): mechanize timeouts are normally given in seconds, which
    # would make this ~2.8 hours; possibly meant as milliseconds -- confirm
    # before changing, the value is kept as-is here.
    timeout = 10000
    try:
        raw = br.open_novisit(base_url, timeout=timeout).read().strip()
    except Exception as e:
        # Best-effort lookup: any network/HTTP failure means "not found".
        if DEBUG:
            print("Error in br.open_novisit: ", str(e))
        return _results()

    if not raw:
        if DEBUG:
            print("raw is None; returning from url: ", base_url)
        return _results()

    try:
        root = ET.fromstring(raw)
    except ET.ParseError as e:
        if DEBUG:
            print("Error parsing XML: ", str(e))
        return _results()

    namespaces = {
        'mods': 'http://www.loc.gov/mods/v3'
    }

    def _first_text(xpath):
        # Stripped text of the first match anywhere in the response, or ""
        # when there is no match or the element has no text (text is None).
        element = root.find(xpath, namespaces)
        if element is None or element.text is None:
            return ""
        return element.text.strip()

    # The './/' searches span every record in the response, so the first
    # match in document order wins even if several records were returned.
    ddc_text = _first_text(".//mods:classification[@authority='ddc']")
    if ddc_text:
        ddc_return = ddc_text
        if DEBUG:
            print("DDC: ", ddc_return)

    lcc_text = _first_text(".//mods:classification[@authority='lcc']")
    if lcc_text:
        lcc_return = lcc_text
        if DEBUG:
            print("LCC: ", lcc_return)

    # The list returned in the "FAST" slot is filled from topics tagged with
    # authority 'lcsh'.
    # NOTE(review): LoC MODS output often carries authority="lcsh" on
    # <subject> rather than on <topic>; confirm this XPath matches real
    # responses.
    topics = root.findall(".//mods:subject/mods:topic[@authority='lcsh']", namespaces)
    for topic in topics:
        # Skip empty elements instead of letting one abort the whole loop.
        topic_text = (topic.text or "").strip()
        if topic_text:
            fast_return_list.append(topic_text)
    if DEBUG:
        print("FAST subjects: ", fast_return_list)
        print("Returning Results: DDC: ", ddc_return, ", LCC: ", lcc_return, ", FAST subjects: ", fast_return_list)

    return _results()

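#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
# NOTE: the triple-quoted string literals that follow the imports below hold disabled
# legacy code (record-matching helpers and an earlier author/title lookup). They are bare
# string expressions and are never executed; the live implementation is the
# oclc_classify_webscraping_author_title() function defined after them.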
import re
import xml.etree.ElementTree as ET
from calibre import browser
from calibre.constants import DEBUG
'''
def parse_author(record, target_author):
    def normalize_name(name):
        name = name.strip().lower()
        name = re.sub(r'[^\w\s]', '', name)
        parts = name.split()
        return parts

    target_author_parts = normalize_name(target_author)

    author_fields = record.findall(".//datafield[@tag='100']") + record.findall(".//datafield[@tag='700']")
    for author_field in author_fields:
        subfield_a = author_field.find(".//subfield[@code='a']")
        if subfield_a is not None:
            record_author = subfield_a.text.strip()
            record_author_parts = normalize_name(record_author)

            if set(target_author_parts).intersection(set(record_author_parts)):
                return True

    return False

def parse_title(record, target_title):
    def normalize_title(title):
        title = title.strip().lower()
        title = re.sub(r'[^\w\s]', '', title)
        parts = title.split()
        return parts

    target_title_parts = normalize_title(target_title)

    title_fields = record.findall(".//datafield[@tag='245']")
    for title_field in title_fields:
        subfield_a = title_field.find(".//subfield[@code='a']")
        if subfield_a is not None:
            record_title = subfield_a.text.strip()
            record_title_parts = normalize_title(record_title)

            if set(target_title_parts).intersection(set(record_title_parts)):
                return True

    return False

''' '''def parse_publisher(record, target_publisher):
    def normalize_publisher(publisher):
        publisher = publisher.strip().lower()
        publisher = re.sub(r'[^\w\s]', '', publisher)
        parts = publisher.split()
        return parts

    target_publisher_parts = normalize_publisher(target_publisher)

    publisher_fields = record.findall(".//datafield[@tag='260']") + record.findall(".//datafield[@tag='264']")
    for publisher_field in publisher_fields:
        subfield_b = publisher_field.find(".//subfield[@code='b']")
        if subfield_b is not None:
            record_publisher = subfield_b.text.strip()
            record_publisher_parts = normalize_publisher(record_publisher)

            if set(target_publisher_parts).intersection(set(record_publisher_parts)):
                return True

    return False'''
'''
def parse_records(raw):
    try:
        root = ET.fromstring(raw)
    except ET.ParseError as e:
        if DEBUG: print("Error parsing XML: ", str(e))
        return []

    records = []
    for record in root.findall('.//record'):
        records.append(record)
    
    return records

def select_best_record(records, author, title):
    best_record = None
    for record in records:
        if parse_author(record, author) and parse_title(record, title):
            best_record = record
            break
    return best_record

def fetch_mods_record(lccn):
    mods_query_url = f"http://lccn.loc.gov/{lccn}/mods"
    
    br = browser()
    try:
        mods_raw = br.open_novisit(mods_query_url).read().strip()
        return mods_raw
    except Exception as e:
        if DEBUG: print("Error in fetching MODS record: ", str(e))
        return None

def parse_mods_record(mods_raw):
    try:
        root = ET.fromstring(mods_raw)
    except ET.ParseError as e:
        if DEBUG: print("Error parsing MODS XML: ", str(e))
        return wi_record, lcc_record, ddc_record

    wi_record = "NONE"
    lcc_record = "NONE"
    ddc_record = "NONE"

    namespaces = {'mods': 'http://www.loc.gov/mods/v3'}

    try:
        wi_element = root.find(".//mods:recordIdentifier", namespaces)
        if wi_element is not None:
            wi_record = wi_element.text.strip()
            if DEBUG: print("WI Record: ", wi_record)
    except Exception as e:
        if DEBUG: print("Exception in finding WI record: ", str(e))

    try:
        ddc_element = root.find(".//mods:classification[@authority='ddc']", namespaces)
        if ddc_element is not None:
            ddc_record = ddc_element.text.strip()
            if DEBUG: print("DDC: ", ddc_record)
    except Exception as e:
        if DEBUG: print("Exception in finding DDC: ", str(e))

    try:
        lcc_element = root.find(".//mods:classification[@authority='lcc']", namespaces)
        if lcc_element is not None:
            lcc_record = lcc_element.text.strip()
            if DEBUG: print("LCC: ", lcc_record)
    except Exception as e:
        if DEBUG: print("Exception in finding LCC: ", str(e))

    return wi_record, lcc_record, ddc_record
''' '''
def oclc_classify_webscraping_author_title(param_dict):
    if DEBUG:
        for k,v in param_dict.items():
            print("param_dict: ", k,v)
        #END FOR
#    if DEBUG:
#        print("author: ", str(author))
#        print("title: ", str(title))
#        print("publisher: ", str(publisher))

    ddc_return = "NONE"
    lcc_return = "NONE"
    wi_return = "NONE"

    author = param_dict["author"]
    title = param_dict["title"]


    base_author_title_url = "http://lx2.loc.gov:210/LCDB?query=dc.creator=[AUTHORGOESHERE]+AND+dc.title=[TITLEGOESHERE]&version=1.1&operation=searchRetrieve&recordSchema=mods&maximumRecords=10"
    br = browser()
    timeout = 10000

    query_url = base_author_title_url.replace("[TITLEGOESHERE]", title).replace("[AUTHORGOESHERE]", author)

    try:
        raw = br.open_novisit(query_url, timeout=timeout).read().strip()
    except Exception as e:
        if DEBUG: print("Error in br.open_novisit: ", str(e))
        return wi_return, lcc_return, ddc_return, "NONE", "NONE", "NONE", "NONE"

    if not raw:
        if DEBUG: print("raw is None; returning from url: ", query_url)
        return wi_return, lcc_return, ddc_return, "NONE", "NONE", "NONE", "NONE"

    records = parse_records(raw)
    best_record = select_best_record(records, author, title)

    if best_record:
        lccn_element = best_record.find('.//controlfield[@tag="010"]')
        if lccn_element is not None:
            lccn = lccn_element.text.strip()
            mods_raw = fetch_mods_record(lccn)
            if mods_raw:
                wi_return, lcc_return, ddc_return = parse_mods_record(mods_raw)

    if DEBUG: print("Returning Results: WI: ", wi_return, ", DDC: ", ddc_return, ", LCC: ", lcc_return)

    return wi_return, lcc_return, ddc_return, "NONE", "NONE", "NONE", "NONE"
'''
def oclc_classify_webscraping_author_title(param_dict):
    """Look up classification data by author and title at the Library of Congress.

    Sends an SRU ``searchRetrieve`` request (MODS schema, at most 10 records)
    to ``lx2.loc.gov`` combining ``dc.creator`` and ``dc.title``, then pulls
    the classification numbers and subject topics out of the XML response.
    Empty author/title, network errors, and empty or malformed responses all
    yield the "not found" result instead of raising.

    Args:
        param_dict: Mapping that must contain the keys ``"author"`` and
            ``"title"``. Both values are converted with ``str()`` and
            percent-encoded before being placed in the URL.

    Returns:
        A 7-tuple ``(ddc, lcc, subjects, oclc_owi, oclc_wi, oclc_worldcat,
        viaf_author_id)``. ``ddc`` and ``lcc`` are strings, ``"NONE"`` when
        not found. ``subjects`` is a list of topic strings (possibly empty).
        The last four items are always ``"NONE"`` here; they only keep the
        tuple shape that callers unpack.

    Raises:
        KeyError: If ``"author"`` or ``"title"`` is missing from *param_dict*.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import quote

    # Fixed pause before every lookup (presumably to throttle requests
    # against the remote service -- the reason is not stated in this file).
    time.sleep(1.6)
    ddc_return = "NONE"
    lcc_return = "NONE"
    fast_return_list = []
    oclc_owi_return = "NONE"
    oclc_wi_return = "NONE"
    oclc_worldcat_return = "NONE"
    viaf_author_id_return = "NONE"

    def _results():
        # Reads the enclosing variables at call time, so early exits return
        # the defaults and the final exit returns whatever was found.
        return (ddc_return, lcc_return, fast_return_list, oclc_owi_return,
                oclc_wi_return, oclc_worldcat_return, viaf_author_id_return)

    if DEBUG:
        for k, v in param_dict.items():
            print("param_dict: ", k, v)

    author = param_dict["author"]
    title = param_dict["title"]

    if DEBUG:
        print("author: ", str(author))
        print("title: ", str(title))

    author_text = "" if author is None else str(author).strip()
    title_text = "" if title is None else str(title).strip()
    if not author_text or not title_text:
        # The query needs both terms; without them it cannot match anything.
        if DEBUG:
            print("author or title is empty; nothing to look up")
        return _results()

    base_author_title_url = "http://lx2.loc.gov:210/LCDB?query=dc.creator%3D[AUTHORGOESHERE]+AND+dc.title%3D[TITLEGOESHERE]&version=1.1&operation=searchRetrieve&recordSchema=mods&&maximumRecords=10"
    # Percent-encode both values so spaces, '&', '#', non-ASCII letters, etc.
    # cannot corrupt the query string. '+' and '%' are left alone so values
    # that a caller already encoded are not encoded twice.
    query_url = base_author_title_url.replace("[TITLEGOESHERE]", quote(title_text, safe="+%"))
    query_url = query_url.replace("[AUTHORGOESHERE]", quote(author_text, safe="+%"))

    br = browser()
    # NOTE(review): mechanize timeouts are normally given in seconds, which
    # would make this ~2.8 hours; possibly meant as milliseconds -- confirm
    # before changing, the value is kept as-is here.
    timeout = 10000
    try:
        raw = br.open_novisit(query_url, timeout=timeout).read().strip()
    except Exception as e:
        # Best-effort lookup: any network/HTTP failure means "not found".
        if DEBUG:
            print("Error in br.open_novisit: ", str(e))
        return _results()

    if not raw:
        # The original message referenced an undefined name here and raised
        # NameError whenever DEBUG was enabled.
        if DEBUG:
            print("raw is None; returning from url: ", query_url)
        return _results()

    try:
        root = ET.fromstring(raw)
    except ET.ParseError as e:
        if DEBUG:
            print("Error parsing XML: ", str(e))
        return _results()

    namespaces = {
        'mods': 'http://www.loc.gov/mods/v3'
    }

    def _first_text(xpath):
        # Stripped text of the first match anywhere in the response, or ""
        # when there is no match or the element has no text (text is None).
        element = root.find(xpath, namespaces)
        if element is None or element.text is None:
            return ""
        return element.text.strip()

    # The './/' searches span every record in the response, so the first
    # match in document order wins even if several records were returned.
    ddc_text = _first_text(".//mods:classification[@authority='ddc']")
    if ddc_text:
        ddc_return = ddc_text
        if DEBUG:
            print("DDC: ", ddc_return)

    lcc_text = _first_text(".//mods:classification[@authority='lcc']")
    if lcc_text:
        lcc_return = lcc_text
        if DEBUG:
            print("LCC: ", lcc_return)

    # The list returned in the "FAST" slot is filled from topics tagged with
    # authority 'lcsh'.
    # NOTE(review): LoC MODS output often carries authority="lcsh" on
    # <subject> rather than on <topic>; confirm this XPath matches real
    # responses.
    topics = root.findall(".//mods:subject/mods:topic[@authority='lcsh']", namespaces)
    for topic in topics:
        # Skip empty elements instead of letting one abort the whole loop.
        topic_text = (topic.text or "").strip()
        if topic_text:
            fast_return_list.append(topic_text)
    if DEBUG:
        print("FAST subjects: ", fast_return_list)
        print("Returning Results: DDC: ", ddc_return, ", LCC: ", lcc_return, ", FAST subjects: ", fast_return_list)

    return _results()
#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

