﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import, print_function)

import dateutil
import json
import re

from calibre.utils.date import parse_only_date
#from calibre.utils.config_base import tweaks

from calibre_plugins.overdrive_link.numbers import (value_unit)
from calibre_plugins.overdrive_link.book import (LibraryBook, InfoBook, unique_authors)
from calibre_plugins.overdrive_link.formats import (
    FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF, FORMAT_BOOKREADER, FORMAT_OPEN_PDF, FORMAT_OPEN_EPUB,
    FORMAT_MOBI_EBOOK, FORMAT_PROTECTED_DAISY, FORMAT_DJVU, FORMAT_CBR, FORMAT_CBZ)
from calibre_plugins.overdrive_link.titlecase import titlecase
from calibre_plugins.overdrive_link.json import js_value
from calibre_plugins.overdrive_link.language import (LANGUAGE_CODE, LANGUAGE_NAME)
from calibre_plugins.overdrive_link.library import SearchableLibrary
from calibre_plugins.overdrive_link.net import open_url
from calibre_plugins.overdrive_link.author_prep import normalize_author
from calibre_plugins.overdrive_link.title_prep import normalize_title
#from calibre_plugins.overdrive_link.tweak import TWEAK_SAVE_RESPONSES_ON_ERROR
from calibre_plugins.overdrive_link.parseweb import (LibraryError, beautiful_soup_fix, class_contains, text_only, valid_isbn)

from .python_transition import (IS_PYTHON2)
if IS_PYTHON2:
    from .python_transition import (http, repr, str, urllib)
else:
    import http.cookiejar
    import urllib.parse

try:
    from calibre_plugins.overdrive_link_debug.config import DEBUG_MODE
except ImportError:
    DEBUG_MODE = False


__license__ = 'GPL v3'
__copyright__ = '2012-2022, John Howell <jhowell@acm.org>'


# Maps a lower-cased Internet Archive file-format name to the plugin's format constant.
# Format names that are not listed here are ignored by the code that consults this table.
IA_FORMATS = {
    # PDF variants
    'pdf': FORMAT_OPEN_PDF,
    'pdf with text': FORMAT_OPEN_PDF,
    'additional text pdf': FORMAT_OPEN_PDF,
    'b/w pdf': FORMAT_OPEN_PDF,
    'image container pdf': FORMAT_OPEN_PDF,

    # EPUB
    'epub': FORMAT_OPEN_EPUB,

    # Kindle / Mobipocket
    'kindle': FORMAT_MOBI_EBOOK,
    'mobi': FORMAT_MOBI_EBOOK,

    # DAISY
    'daisy': FORMAT_PROTECTED_DAISY,
    'encrypted daisy': FORMAT_PROTECTED_DAISY,

    # DjVu variants
    'djvu': FORMAT_DJVU,
    'djvutxt': FORMAT_DJVU,
    'djvu xml': FORMAT_DJVU,

    # comic book archives
    'cbz': FORMAT_CBZ,
    'comic book zip': FORMAT_CBZ,
    'comic book rar': FORMAT_CBR,

    # page images shown by the online BookReader
    'single page processed jp2 zip': FORMAT_BOOKREADER,
    }


class BookData(object):
    '''
    Plain attribute holder with no behavior of its own.

    InternetArchive.get_book_data creates an instance and assigns the attributes
    formats, have_checked_out, hold_position_overall, library_copies,
    available_copies and number_waiting_overall on it.
    '''


class InternetArchive(SearchableLibrary):
    id = 'ia'
    name = 'Internet Archive'
    formats_supported = {
        FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF, FORMAT_BOOKREADER, FORMAT_OPEN_PDF, FORMAT_OPEN_EPUB,
        FORMAT_MOBI_EBOOK, FORMAT_PROTECTED_DAISY, FORMAT_DJVU, FORMAT_CBR, FORMAT_CBZ}
    sign_in_affects_get_current_availability = True     # sign in needed to detect current holds

    @staticmethod
    def validate_library_id(library_id, migrate=True, config=None):
        if library_id != '':
            raise ValueError('Internet Archive library-id must be blank')

        return library_id

    @staticmethod
    def validate_book_id(book_id, library_id):
        return book_id

    @staticmethod
    def book_url(library_id, book_id):
        return "https://archive.org/details/%s" % book_id

    @staticmethod
    def book_key_library_id(library_id):
        return library_id   # has same book ids at all libraries, but different available formats

    def __init__(self):
        self.cookiejar = http.cookiejar.CookieJar()

    def sign_in(self, use_credentials):
        '''
        Sign in to archive.org with the configured card number and PIN.

        Nothing happens when there is no card number or use_credentials is false.
        Otherwise the cookie jar is cleared, the login page is visited, the
        credentials are posted and the home page is loaded. On success
        self.signed_in is set to True.

        Raises Exception if the login response does not report status "ok".
        '''
        if not self.card_number or not use_credentials:
            return  # cannot sign in

        self.log.info('Signing in to %s' % self.name)
        self.signin_required = True
        self.cookiejar.clear()

        # load the login page first using the (now empty) cookie jar
        open_url(self.log, "https://archive.org/account/login.php", cookiejar=self.cookiejar)

        form_fields = {
            "username": self.card_number,
            "password": self.card_pin,
            "remember": "true",
            "referer": "https://archive.org/",
            "login": "true",
            "submit_by_js": "true",
            }
        encoded_form = urllib.parse.urlencode(form_fields)

        response = open_url(self.log, "https://archive.org/account/login", encoded_form, cookiejar=self.cookiejar)
        self.log.info("login response: %s" % response.data_string)

        login_result = json.loads(response.data_string)
        status = login_result.get("status")

        if status != "ok":
            self.log.info("Login status=%s, message=%s" % (status, login_result.get("message")))
            raise Exception('Sign in unsuccessful.')

        open_url(self.log, "https://archive.org/", cookiejar=self.cookiejar)
        self.log.info('Sign in successful')
        self.signed_in = True

    def find_books(self, books, search_author, search_title, keyword_search):
        '''
        Search archive.org for texts matching the given author and/or title and add
        a LibraryBook for every distinct result to books.

        Parameters:
            books: collection that receives results through its add() method.
            search_author: author text matched against "creator"; skipped when empty.
            search_title: title text, matched against "subject" when keyword_search is
                true and against "title" otherwise; skipped when empty.
            keyword_search: selects subject matching instead of title matching.

        The search is further limited to the comma-separated collections named in
        self.branch_id (when set) and to the configured search language (when it has
        an entry in LANGUAGE_CODE). Results are requested 500 at a time until all
        pages have been read.

        Returns False in all cases. NOTE(review): presumably a flag interpreted by the
        SearchableLibrary caller -- confirm its meaning there.

        Raises LibraryError if the search reports a non-zero status, if a page does
        not start at the expected result offset, or if the number of results
        processed differs from the total reported by the search.
        '''
        RESULTS_PER_PAGE = 500

        # fields requested for each result
        # ('format' is deliberately absent: incomplete, not usable -- get from book details)
        RESULT_FIELDS = (
            'creator', 'date', 'identifier', 'isbn', 'language', 'mediatype',
            'publisher', 'related-external-id', 'title')

        # The search terms are the same for every page, so build them once.
        # see: https://archive.org/advancedsearch.php
        q = ['mediatype:"texts"']        # possibly audio for audiobooks?

        if search_author:
            q.append('creator:(%s)' % strip_parens(search_author))

        if search_title:
            if keyword_search:
                q.append('subject:(%s)' % strip_parens(search_title))
            else:
                q.append('title:(%s)' % strip_parens(search_title))

        if self.branch_id:
            collections = [collection.strip() for collection in self.branch_id.split(",")]
            q.append('collection:(%s)' % (" OR ".join(['"%s"' % collection for collection in collections])))

        if self.config.search_language in LANGUAGE_CODE:
            q.append('language:"%s"' % LANGUAGE_CODE[self.config.search_language])

        search_terms = " AND ".join(q)

        # "<title> (<series>, Book <n>)" -- the whole number is one capture group so that
        # multi-digit series indexes are kept intact
        series_pattern = re.compile(r' \(([^)]+), book ([0-9]+)\)$', flags=re.IGNORECASE)

        results_processed = 0
        total_pages = 1
        page_num = 1
        book_ids_found = set()      # shared by all pages so a repeated id is only reported once

        while page_num <= total_pages:
            query = [('q', search_terms)]
            query.extend([('fl[]', field) for field in RESULT_FIELDS])
            query.extend([
                ('sort[]', ''),
                ('rows', "%d" % RESULTS_PER_PAGE),
                ('page', "%d" % page_num),
                ('output', 'json'),
                ('callback', ''),        # empty for pure json
                ('save', 'yes'),
                ])

            response = open_url(self.log, 'https://archive.org/advancedsearch.php?%s' % urllib.parse.urlencode(query), warn_on_retry=False)
            result = json.loads(response.data_string)

            status = result.get("responseHeader", {}).get("status", -1)
            if status != 0:
                self.log.info("response: %s" % response.data_string)
                raise LibraryError('Internet Archive search status: %s' % status)

            resp = result["response"]

            total_results = resp["numFound"]
            total_pages = ((total_results - 1) // RESULTS_PER_PAGE) + 1

            if resp["start"] != results_processed:
                raise LibraryError('Expected start %d on page %d but found %d' % (results_processed, page_num, resp["start"]))

            for doc in resp["docs"]:
                results_processed += 1
                book_id = doc["identifier"]

                if book_id in book_ids_found:
                    continue    # prevent same book ID multiple times

                book_ids_found.add(book_id)

                if doc['mediatype'] != 'texts':
                    self.log.warn("Unexpected media type %s for %s" % (doc['mediatype'], book_id))
                    continue

                title = normalize_title(titlecase(": ".join([t.strip() for t in self.aslist(doc["title"])])))

                # A result may have no creator field at all; treat that as "no authors"
                # instead of failing the whole search. Duplicates are removed while
                # preserving order.
                authors = unique_authors([
                    normalize_author(a, unreverse=True, fix_case=True, fix_ia=True)
                    for a in self.aslist(doc.get("creator", []))])

                publisher = self.asvalue(doc, "publisher", "", only_single=False)

                try:
                    pubdate = parse_only_date(str(self.asvalue(doc, "date", "", only_single=False)), assume_utc=True) if "date" in doc else None
                except dateutil.parser.ParserError:
                    pubdate = None      # ignore badly formatted dates

                isbn = self.asvalue(doc, "isbn", "", only_single=False)
                if not isbn:
                    # fall back to the first ISBN listed among the related external ids
                    for extid in self.aslist(doc.get('related-external-id', [])):
                        if extid.startswith("urn:isbn:"):
                            isbn = valid_isbn(extid[9:])
                            break

                language = ''
                if (self.config.search_language in LANGUAGE_CODE) and ("language" in doc):
                    languages = self.aslist(doc["language"])
                    if LANGUAGE_CODE[self.config.search_language] in languages:
                        language = self.config.search_language
                    else:
                        language = languages[0]
                        if language in LANGUAGE_NAME:
                            language = LANGUAGE_NAME[language]

                series = ''
                series_index = 0.0

                m = series_pattern.search(title)
                if m:
                    title = re.sub(r' \([^)]+\)$', '', title)
                    series = re.sub(' series$', '', m.group(1), flags=re.IGNORECASE)
                    series_index = int(m.group(2))

                lbook = LibraryBook(
                        authors=authors, title=title, series=series, series_index=series_index,
                        isbn=isbn, publisher=publisher, pubdate=pubdate, language=language,
                        available=True, lib=self, book_id=book_id, search_author=search_author)

                self.log.info('Found %s' % repr(lbook))
                books.add(lbook)

            page_num += 1

        if results_processed != total_results:
            raise LibraryError('Expected %s but found %d' % (value_unit(total_results, 'result'), results_processed))

        return False

    def get_book_info(self, book_id, cache):
        book_data = self.get_book_data(book_id)

        if book_data is None:
            return None

        #if not book_data.formats:
        #    self.log.warn("No formats")

        return InfoBook(formats=book_data.formats, lib=self, book_id=book_id)

    def get_current_book_availability(self, book_id):
        book_data = self.get_book_data(book_id)

        if book_data is None:
            return False

        if not book_data.formats:
            return None

        wait_weeks = self.calculate_wait_weeks(
                library_copies=book_data.library_copies,
                available_copies=book_data.available_copies,
                have_checked_out=book_data.have_checked_out,
                hold_position_overall=book_data.hold_position_overall,
                number_waiting_overall=book_data.number_waiting_overall,
                avg_loan_weeks=1.5)     # 2 weeks nominal, allow for early returns

        return (wait_weeks, book_data.hold_position_overall is not None, book_id)

    def get_book_data(self, book_id):
        '''
        Collect format and lending information for book_id from archive.org.

        Returns a BookData object with the attributes formats, have_checked_out,
        hold_position_overall, library_copies, available_copies and
        number_waiting_overall, or None when the details page returns HTTP 404.

        Three sources are read: the book's details web page (download links and the
        location of the BookReader data), the BookReader JSON (lending status) and the
        details page requested with "?output=json" (metadata and file list).

        Background from archive.org:
        All metadata for archive.org items are stored in <identifier>_meta.xml and <identifier>_files.xml. The meta.xml file contains all of the
        item-level metadata for an item (e.g. title, description, creator, etc.). The files.xml file contains all of the file-level metadata
        (e.g. track title, checksums, etc.). While these two files are the canonical sources of metadata for archive.org items, most users will
        interact with an item's metadata via the metadata API. For example, nasa_meta.xml correlates to /metadata/nasa/metadata and
        nasa_files.xml to /metadata/nasa/files.

        https://archive.org/metadata/dragongeorge00dick/metadata        metadata from <identifier>_meta.xml in json
        https://archive.org/metadata/dragongeorge00dick/files           metadata from <identifier>_files.xml in json
        https://archive.org/metadata/dragongeorge00dick                 metadata combined json
        https://archive.org/details/dragongeorge00dick?output=json      add "misc" and "item" metadata
        '''

        book_data = BookData()
        book_data.formats = set()

        # available format info is more complete from web page vs API for unknown reason
        response = open_url(self.log, self.book_url(self.library_id, book_id), cookiejar=self.cookiejar, expect_errors=[404])

        if response.is_httperror_exception:
            self.log.info('Book does not exist')
            return None

        soup = beautiful_soup_fix(response.data_string)
        br_data = None

        ia_book_theater = soup.find("ia-book-theater")      # 7/2022
        if ia_book_theater is not None:
            book_manifest_url = ia_book_theater.get("bookManifestUrl") or ia_book_theater.get("bookmanifesturl")
            if book_manifest_url is not None:
                br_data = self._fetch_bookreader_data(book_manifest_url)

        if br_data is None and soup.find('div', attrs={'id': 'IABookReaderWrapper'}):
            # Fallback, used only when the ia-book-theater element produced no data:
            # take the BookReader URL from the first script that mentions it.
            for script in soup.findAll('script'):
                content = str(script)
                if '/BookReader/BookReaderJSIA.php?' in content:
                    br_data = self._fetch_bookreader_data(js_value(self.log, content, "url: "))
                    break

        if br_data is not None:
            metadata = br_data.get("metadata", {})      # loans__status information in metadata is out-of-date
            is_access_restricted = self.asvalue(metadata, "access-restricted-item", "false") == "true"

            lending_info = br_data.get("lendingInfo", {})   # browse is read subset, lend is 1 hr online only, borrow is 2 week + ADE
            book_data.have_checked_out = lending_info.get("userHasBorrowed", False) or lending_info.get("userHoldIsReady", False)

            userOnWaitingList = lending_info.get("userOnWaitingList", False)
            userWaitlistPosition = lending_info.get("userWaitlistPosition", 0)
            book_data.hold_position_overall = userWaitlistPosition + 1 if userWaitlistPosition >= 0 and userOnWaitingList else None

            lending_status = lending_info.get("lendingStatus") or {}
            is_lendable = lending_status.get("is_lendable", False)
            is_browser_borrowable = lending_info.get("isBrowserBorrowable", False)
            is_borrowable = is_lendable and is_browser_borrowable
            available_to_borrow = is_browser_borrowable and lending_status.get("available_to_borrow", False)

            book_data.library_copies = lending_status.get("max_borrowable_copies", 0) if is_borrowable else 0
            book_data.available_copies = (
                (lending_status.get("available_borrowable_copies", 0) if is_borrowable and available_to_borrow else 0)
                if is_access_restricted else True)
            book_data.number_waiting_overall = lending_status.get("users_on_waitlist", 0)

            if is_lendable or not is_access_restricted:
                book_data.formats.add(FORMAT_BOOKREADER)

            if lending_status.get("is_printdisabled", False):
                book_data.formats.add(FORMAT_PROTECTED_DAISY)             # shows up on web page but not in json files list

        details_response = open_url(self.log, self.book_url(self.library_id, book_id) + "?output=json", cookiejar=self.cookiejar)
        details = json.loads(details_response.data_string)       # values are strings or arrays of strings

        if br_data is None:
            # no BookReader data: derive what is possible from the item metadata alone
            metadata = details.get("metadata", {})      # loans__status information in metadata is out-of-date
            is_borrowable = is_access_restricted = self.asvalue(metadata, "access-restricted-item", "false") == "true"
            book_data.have_checked_out = False
            book_data.hold_position_overall = None
            book_data.library_copies = 0 if is_access_restricted else 1
            book_data.available_copies = False if is_access_restricted else True
            book_data.number_waiting_overall = 0

        is_restricted_borrowable = is_access_restricted and is_borrowable and book_data.library_copies

        # incomplete files information, same as from https://archive.org/metadata/<id>/files or <id>_files.xml
        for file_info in details.get("files", {}).values():
            file_format = file_info.get("format", "unknown").lower().partition('"')[0].strip()

            if is_restricted_borrowable:
                if file_format == "acs encrypted epub":
                    book_data.formats.add(FORMAT_ADOBE_EPUB)
                elif file_format == "acs encrypted pdf":
                    book_data.formats.add(FORMAT_ADOBE_PDF)

            if file_info.get("private", "false") != "true" and not is_access_restricted:
                # do not report unknown formats, over 100 found
                if file_format in IA_FORMATS:
                    book_data.formats.add(IA_FORMATS[file_format])

        # downloadable files listed in details HTML page
        download_section = soup.find('section', attrs=class_contains('item-download-options'))
        if download_section:
            for format_group in download_section.findAll('div', attrs={'class': 'format-group'}):
                dl_link = format_group.find('a', attrs=class_contains('download-pill'))
                if dl_link:
                    file_format = text_only(dl_link).lower().partition('"')[0].strip()
                    if file_format.endswith(' download'):
                        file_format = file_format[0:-9]

                    if '.' in file_format:
                        file_format = file_format.rpartition('.')[2]

                    # do not report unknown formats, over 100 found
                    if file_format in IA_FORMATS:
                        book_data.formats.add(IA_FORMATS[file_format])

            if is_restricted_borrowable:
                dl_lending = download_section.find('p', attrs=class_contains('download-lending-message'))
                if dl_lending:
                    dl_lending_text = text_only(dl_lending).lower().strip()
                    if '14 day loan required' in dl_lending_text:
                        if ' epub ' in dl_lending_text:
                            book_data.formats.add(FORMAT_ADOBE_EPUB)
                        if ' pdf ' in dl_lending_text:
                            book_data.formats.add(FORMAT_ADOBE_PDF)

        # find_books treats branch_id as a comma-separated list of collections, so
        # commas (and any surrounding whitespace) are accepted as separators here too
        desired_collections = self.branch_id.replace(",", " ").split()
        if "printdisabled" not in desired_collections:
            book_data.formats.discard(FORMAT_PROTECTED_DAISY)         # only keep if printdisabled is a desired collection

        return book_data

    def _fetch_bookreader_data(self, partial_url):
        '''
        Request the BookReader JSON found at partial_url and return its "data" member.

        partial_url may lack a scheme, in which case https is used, and may contain
        HTML-escaped ampersands. Returns None, after logging the status code, when
        the request fails with HTTP 400, 402, 403 or 404.
        '''
        parsed_url = urllib.parse.urlparse(partial_url, 'https')
        book_script_url = urllib.parse.urlunparse(parsed_url).replace("&amp;", "&")   # build full url

        br_response = open_url(self.log, book_script_url, cookiejar=self.cookiejar, expect_errors=[400, 402, 403, 404], addheaders=[
            ('Referer', 'https://archive.org/'), ('Sec-Fetch-Site', 'same-site'), ('Sec-Fetch-Mode', 'no-cors'),
            ('Sec-Fetch-Dest', 'script'), ('Accept', '*/*')])

        if br_response.is_httperror_exception:
            self.log.info('BookReaderJSIA not accessible, code %d' % br_response.code)
            return None

        return json.loads(br_response.data_string)["data"]

    def aslist(self, x):
        '''
        Return x itself when it is a list, otherwise a new one-element list holding x.
        '''
        if isinstance(x, list):
            return x

        return [x]

    def asvalue(self, dic, key, default, only_single=True):
        x = dic.get(key, default)
        lst = self.aslist(x)
        if len(lst) != 1 and only_single:
            self.log.warn("Expected single value for %s, found %d: %s" % (key, len(lst), repr(lst)))

        return lst[0] if len(lst) > 0 else None


def strip_parens(s):
    '''Return s with every "(" and ")" character removed.'''
    for paren in ("(", ")"):
        s = s.replace(paren, "")

    return s
