# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__   = 'GPL v3'
__copyright__ = '2015,2016,2017,2018,2019,2020 DaltonST <DaltonShiTzu@outlook.com>'
__my_version__ = "1.0.45"  # Worldcat site changes

import mechanize
import re
import socket
socket.setdefaulttimeout(15)

from calibre.ebooks.BeautifulSoup import BeautifulSoup       #  bs4 .find_all and not bs3 .findAll
from calibre.constants import DEBUG

from polyglot.builtins import as_unicode, is_py3, iteritems

if is_py3:
    from urllib.request import urlopen
else:
    from urllib2 import urlopen
myurlopen = urlopen

from calibre_plugins.library_codes.ui import SOURCE_TYPE_VIAF, SOURCE_TYPE_OCLC, SOURCE_TYPE_XID_OWI, SOURCE_TYPE_XID_OCLC
#~ SOURCE_TYPE_VIAF = "http://viaf.org/viaf/[REFERENCENUMBER]/"  # get isni and lccn using viaf
#~ SOURCE_TYPE_OCLC = "https://www.worldcat.org/oclc/[REFERENCENUMBER]/"  # get lcead using oclc
#~ SOURCE_TYPE_XID_OWI = "https://www.worldcat.org/search?q=kw%3A[REFERENCENUMBER]"  # get oclc                xid was previously used by worldcat until version "1.0.45"
#~ SOURCE_TYPE_XID_OCLC = "https://www.worldcat.org/oclc/[REFERENCENUMBER]"  # get loc_lccn using oclc           xid was previously used by worldcat until version "1.0.45"

#--------------------------------------------------------------------------------------------
def library_codes_generic_webscraping_api(source_type,source_dict,isbn_dict,oclc_other_dict,final_list):
    """Scrape library codes from VIAF / WorldCat for every book in final_list.

    source_type      -- one of the SOURCE_TYPE_* constants; selects both the
                        target url template and the parser that is used.
    source_dict      -- maps book -> reference value that is substituted for
                        [REFERENCENUMBER] in the target url.
    isbn_dict        -- maps book -> isbn (only read for SOURCE_TYPE_XID_OWI).
    oclc_other_dict  -- maps book -> oclc number that is already known.
    final_list       -- the books to process, in order.

    Returns a dict mapping each reference value to the non-empty list of
    result rows found for it.  Each row is a tuple whose first element names
    the code ("isni", "lccn", "lcead", "oclc" or "loc_lccn").  An unknown
    source_type returns an empty dict.
    """

    source_type = as_unicode(source_type)

    if source_type == SOURCE_TYPE_VIAF:
        active_target_url = "http://viaf.org/viaf/[REFERENCENUMBER]/"
    elif source_type == SOURCE_TYPE_OCLC:
        active_target_url = "https://www.worldcat.org/oclc/[REFERENCENUMBER]/"
    elif source_type == SOURCE_TYPE_XID_OWI:
        active_target_url = "https://www.worldcat.org/search?q=kw%3A[REFERENCENUMBER]"
    elif source_type == SOURCE_TYPE_XID_OCLC:
        active_target_url = "https://www.worldcat.org/oclc/[REFERENCENUMBER]"
    else:
        return {}

    #-----------------------------------------------
    # MAIN FUNCTION
    #-----------------------------------------------

    final_results_dict = {}

    if DEBUG: print(" ")

    for book in final_list:
        book = as_unicode(book)
        # a missing key and an explicit None are both "nothing to look up"
        if book not in source_dict or source_dict[book] is None:
            if DEBUG: print("source_v == None; skipping:   book, source_type ", book, source_type)
            continue
        source_v = source_dict[book]

        if DEBUG: print("..........library_codes_generic_webscraping_api: main function.    source_dict: ", book,source_v, source_type)

        isbn = None
        oclc_owi = None
        oclc = None

        if source_type == SOURCE_TYPE_XID_OWI:
            oclc_owi = source_v
            if book in isbn_dict:
                isbn = as_unicode(isbn_dict[book])

        # coerce to text so a numeric reference cannot make replace() raise
        target_url = active_target_url.replace("[REFERENCENUMBER]",as_unicode(source_v))
        if DEBUG: print("-----------------------------------------------")
        if DEBUG: print("current target_url is: ", target_url)

        # every branch fills new_rows with the candidate rows (None == not found)
        new_rows = ()

        if source_type == SOURCE_TYPE_VIAF:
            if DEBUG: print("[1] SOURCE_TYPE_VIAF")
            html_page,soup,html_raw = download_html(source_type,target_url)
            new_row1,new_row2 = parse_html_viaf(source_v,html_page,html_raw)
            if DEBUG: print("new_row1: ", as_unicode(new_row1),"new_row2: ", as_unicode(new_row2))
            new_rows = (new_row1,new_row2)

        elif source_type == SOURCE_TYPE_OCLC:
            if DEBUG: print("[2] SOURCE_TYPE_OCLC")
            if DEBUG: print("Looking for multiple LC Extra Author Detail rows...")
            html_page,soup,html_raw = download_html(source_type,target_url)
            new_rows = parse_html_oclc(source_v,soup)        # returns multiple LC Extra Author Detail rows

        elif source_type == SOURCE_TYPE_XID_OWI:
            if DEBUG: print("[3] SOURCE_TYPE_XID_OWI using isbn,oclc_owi: ", as_unicode(isbn), as_unicode(oclc_owi))
            if book in oclc_other_dict:  # already have it
                oclc = oclc_other_dict[book]
                new_row1 = "oclc",source_v,oclc
                if DEBUG: print("new_row1: ", "oclc",source_v,oclc)
            else:
                if DEBUG: print("SOURCE_TYPE_XID_OWI: do not already have  *oclc*  from classify api, so now get from worldcat...book,isbn,oclc_owi: ", book, isbn, oclc_owi)
                html_page,soup,html_raw = download_html(source_type,target_url,isbn,oclc_owi)
                new_row1 = parse_html_xid(source_v,soup,html_page,"owi",isbn,oclc_owi,oclc)     #call_type = "owi"  returns oclc for isbn/owi combination
                if DEBUG: print("new_row1: ", as_unicode(new_row1))
            new_rows = (new_row1,)

        else:  # SOURCE_TYPE_XID_OCLC (the only value left after the validation above)
            if DEBUG: print("[4] SOURCE_TYPE_XID_OCLC")
            if DEBUG: print("Looking for loc_lccn (Library of Congress LCCN) using oclc")
            # the page is only parsed when the oclc is known, so only download it then
            if book in oclc_other_dict:
                oclc = oclc_other_dict[book]
                html_page,soup,html_raw = download_html(source_type,target_url)
                new_row1 = parse_html_xid(source_v,soup,html_page,"oclc",None,None,oclc)   #call_type = "oclc"  returns loc_lccn for oclc
                if DEBUG: print("new_row1: ", as_unicode(new_row1))
                new_rows = (new_row1,)

        results_list = [row for row in new_rows if row]
        if results_list:
            final_results_dict[source_v] = results_list

    #END FOR
    if DEBUG: print("-----------------------------------------------")

    return final_results_dict
#--------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------
def download_html(source_type,target_url,isbn=None,oclc_owi=None):
    """Download target_url and parse it.

    SOURCE_TYPE_XID_OCLC pages are fetched with a mechanize Browser; every
    other source type is fetched with myurlopen.  isbn and oclc_owi are
    accepted for the callers' convenience but are not used here.

    Returns (html_page, soup, html_raw): the prettified markup, the parsed
    BeautifulSoup object and the raw response body.  Any element that could
    not be produced is returned as "" -- this function is best-effort and
    does not raise for download or parsing failures.
    """
    soup = ""
    html_page = ""
    html_raw = ""

    try:
        if source_type == SOURCE_TYPE_XID_OCLC:
            try:
                br = mechanize.Browser()
                try:
                    response1 = br.open(target_url)
                    try:
                        html_raw = response1.read()
                    finally:
                        # release the response even when read() fails
                        response1.close()
                finally:
                    # release the browser even when open()/read() fails
                    br.close()
            except Exception as e:
                # deliberately non-fatal: carry on with whatever html_raw holds
                if DEBUG: print("mechanize exception: ", as_unicode(e))
        else:
            connection = myurlopen(target_url)
            try:
                html_raw = connection.read()
            finally:
                # release the connection even when read() fails
                connection.close()
        #~ --------------------------------------
        soup = BeautifulSoup(html_raw)
        html_page = soup.prettify()    # take note...
    except Exception as e:
        if DEBUG: print(as_unicode(e))

    # normalise every "nothing" value to the empty string the parsers test for
    if not soup:
        if DEBUG: print("not soup")
        soup = ""
    if not html_page:
        html_page = ""
    if not html_raw:
        html_raw = ""

    return html_page,soup,html_raw
#-----------------------------------------------
#-----------------------------------------------
#-----------------------------------------------
def parse_html_viaf(v,html_page,html_raw):
    """Extract the ISNI and the LC name-authority number from a VIAF page.

    v          -- the viaf reference the page was downloaded for.
    html_page  -- prettified markup; searched for the ISNI.
    html_raw   -- raw response body; searched for the LC number.

    Returns (new_row1, new_row2) where new_row1 is ("isni", v, isni) and
    new_row2 is ("lccn", v, "lccn-n...").  Either element is None when the
    value was not found.  Exceptions are only reported in DEBUG mode.
    """

    # v = viaf

    new_row1 = None
    new_row2 = None

    try:
        #  <a href="/viaf/sourceID/ISNI|0000000121340483">ISNI|0000000121340483</a>
        # Anchor on the "ISNI|" marker itself: taking the text after the first
        # "|" in a chunk could pick up the id of a different source.
        html_page = as_unicode(html_page)
        isni_match = re.search(r'ISNI\|([0-9]+)', html_page)
        if isni_match:
            isni = isni_match.group(1)
            if DEBUG: print("isni is: ", as_unicode(isni))
            new_row1 = "isni",v,isni

        #  <a href="http://www.worldcat.org/identities/lccn-n2001-90808">WorldCat Identities</a>
        #~ <ns2:source nsid="n50012900">LC|n  50012900</ns2:source>
        #~  LC|n  50012900
        contents = as_unicode(html_raw)
        for row in contents.split("<ns2:"):
            if 'LC|n' not in row:
                continue
            # text that follows the first marker in this row
            lccn = row.split('LC|n')[1].strip()
            # the "n" consumed by the marker is restored by the prefix
            lccn = ("lccn-n" + lccn).replace(" ","")
            # cut at the end of the element text, then at the end of an attribute value
            for terminator in ('<', '"'):
                n = lccn.find(terminator)
                if n > 0:
                    lccn = lccn[0:n]
            if DEBUG: print("lccn is: ", as_unicode(lccn))
            new_row2 = "lccn",v,lccn
            break
        else:
            if DEBUG: print("no data for lccn")
        #END FOR
    except Exception as e:
        if DEBUG: print(as_unicode(e))

    return new_row1,new_row2
#-----------------------------------------------
#-----------------------------------------------
#-----------------------------------------------
def parse_html_oclc(v,soup):
    """Extract up to five "lcead" keyword rows from a WorldCat oclc page.

    v     -- the oclc reference the page was downloaded for.
    soup  -- the parsed page, or "" when the download failed.

    The keywords come from <meta name="keywords" content="..."> in <head>:
    the content is split on commas, quotes and periods are removed, and
    keywords that occur in the page title are dropped.

    Returns a 5-tuple of ("lcead", v, keyword) rows, padded with None.
    Exceptions are only reported in DEBUG mode; rows collected before the
    exception are still returned.
    """

    # v = oclc

    #~ <head>
        #~ <title>The Andy Warhol diaries (Book, 2014) [WorldCat.org]</title>
        #~ <meta name="keywords" content="Warhol, Andy, 1928-1987 Diaries, Artists United States Diaries, Artists United States Biography, Warhol, Andy, 1928-1987, Artists, United States." />
    #~ </head>
    max_rows = 5
    rows = []
    seen = set()   # stripped keywords already handled
    try:
        if soup == "":
            if DEBUG: print("not soup.........")
            return None,None,None,None,None
        head = soup.head
        if head:
            # a page without a usable <title> must not prevent keyword extraction
            title = ""
            ttitle = head.find('title')
            if ttitle is not None and ttitle.string:
                title = as_unicode(ttitle.string)
            title = title.replace("[WorldCat.org]","")
            if DEBUG: print("https://www.worldcat.org/oclc/  --  title is: ", title)
            for item in head.find_all('meta', {"name": "keywords"}):
                content = item.get('content')
                if content is None:   # no content attribute: nothing to split
                    continue
                long_string = as_unicode(content)
                long_string = long_string.replace('"','')   # e.g.  arts."   or    "Warhol
                long_string = long_string.replace('.','')   # e.g.  arts.  which is now identical to another value = arts; no duplicates.
                long_string = long_string.replace('/>','')
                # sort the raw pieces (keeps the established ordering), then
                # de-duplicate on the stripped text so " Artists" == "Artists"
                for raw in sorted(set(long_string.split(","))):
                    row = as_unicode(raw).strip()
                    if not row or row in title:   # e.g. Warhol is in the title...
                        continue
                    if row in seen:
                        continue
                    seen.add(row)
                    if DEBUG: print("keyword: ", row)
                    if len(rows) < max_rows:
                        rows.append(("lcead",v,row))
                #END FOR
            #END FOR
        else:
            if DEBUG: print("no lcead keywords were found for url...")
    except Exception as e:
        if DEBUG: print(as_unicode(e))

    rows.extend([None] * (max_rows - len(rows)))
    return tuple(rows)
#-----------------------------------------------
#-----------------------------------------------
#-----------------------------------------------
def parse_html_xid(v,soup,html_page,call_type,isbn,oclc_owi,oclc):
    """Dispatch to the "owi" or "oclc" page parser selected by call_type.

    "owi"   -> parse_html_xid_owi,  looking for a row of the form ("oclc", v, oclc)
    "oclc"  -> parse_html_xid_oclc, looking for a row of the form ("loc_lccn", oclc, loc_lccn)

    Any other call_type, and any exception raised by the parser, yields None.
    """

    if DEBUG: print("parse_html_xid:  source_type is: ", call_type, "  of:  ", v, "  with an isbn of: ", as_unicode(isbn), \
    "  and an oclc_owi of: ", as_unicode(oclc_owi), "  and an oclc of: ", as_unicode(oclc))

    found_row = None

    try:
        if call_type == "oclc":
            # for this call type the oclc number itself acts as the reference value
            found_row = parse_html_xid_oclc(oclc,soup,html_page,call_type,oclc)
        elif call_type == "owi":
            found_row = parse_html_xid_owi(v,soup,html_page,call_type,isbn)
    except Exception as err:
        if DEBUG: print("Exception:  parse_html_xid: ", call_type, as_unicode(err))

    if DEBUG: print("new_row1: ", as_unicode(found_row))

    return found_row
#-----------------------------------------------
#-----------------------------------------------
def parse_html_xid_owi(v,soup,html_page,call_type,isbn):
    """Return ("oclc", v, oclc) for the first oclc number found in soup.

    Scans every <td class="result details"> for a <div class="oclc_number">
    and uses the stripped first child of the first such div.  Returns None
    when no such div exists.  html_page, call_type and isbn are not used.
    """
    #---------------------------------------------------------------------------
    #  OWI              v = oclc-owi         purpose:  retrieve oclc for isbn
    #---------------------------------------------------------------------------
    for cell in soup.find_all('td', {"class": "result details"}):
        for number_div in cell.find_all('div', {"class": "oclc_number"}):
            oclc = number_div.contents[0].strip()
            if DEBUG: print("parse_html_xid_owi --- oclc: ", oclc)
            # the first hit wins; nothing else needs to be scanned
            return "oclc",v,oclc
    return None
#-----------------------------------------------
#-----------------------------------------------
def parse_html_xid_oclc(v,soup,html_page,call_type,oclc):
    """Return ("loc_lccn", oclc, loc_lccn) taken from a catdir.loc.gov link.

    oclc falls back to v when it is None; when both are None nothing is done
    and None is returned.  Every <a> in soup whose markup contains both
    "/wcpa/oclc/<oclc>" and "catdir.loc.gov" is examined: the text before the
    first ".html" (with the oclc number removed) must end in five or more
    digits, and those digits are the loc_lccn.  The first match wins; None is
    returned when there is none.  html_page and call_type are not used.
    """
    if oclc is None:
        oclc = v
    if oclc is None:
        if DEBUG: print("oclc is None; returning with nothing done about loc_lccn...")
        return None

    trailing_digits = re.compile("[0-9][0-9][0-9][0-9][0-9]+$",re.IGNORECASE|re.DOTALL|re.MULTILINE)
    link_marker = '/wcpa/oclc/' + oclc

    for anchor in soup.find_all('a'):
        markup = as_unicode(anchor)
        # <a href="/wcpa/oclc/38765078?page=frame&amp;url=http%3A%2F%2Fcatdir.loc.gov%2Fcatdir%2Fdescription%2Frandom044%2F98015575.html%26checksum%3D...">Publisher description</a>
        if markup.find(link_marker) == -1 or markup.find("catdir.loc.gov") == -1:
            continue
        pieces = markup.split(".html")
        if not pieces:
            if DEBUG: print("s_list has no rows...")
            continue
        # want loc_lccn for the regex, not oclc...
        candidate = pieces[0].strip().replace(oclc,"")
        hit = trailing_digits.search(candidate)
        if hit is None:
            continue
        loc_lccn = hit.group(0).strip()
        if DEBUG: print("parse_html_xid_oclc  ---  loc_lccn: ", loc_lccn)
        return "loc_lccn",oclc,loc_lccn
    #END FOR
    return None
#-----------------------------------------------
#-----------------------------------------------
#-----------------------------------------------
#END OF library_codes_webscraping_api
