﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import, print_function)

import gzip
import io
import re
import ssl
import sys
import time
import zlib

from calibre.utils.config_base import tweaks

from calibre_plugins.overdrive_link.parseweb import (CARD_NUMBER_FIELDS, CARD_PIN_FIELDS)
from calibre_plugins.overdrive_link.tweak import (
    TWEAK_DISABLE_SSL_VERIFICATION, TWEAK_SAVE_ALL_RESPONSES, TWEAK_SAVE_RESPONSES_ON_ERROR)

from .python_transition import (IS_PYTHON2)
if IS_PYTHON2:
    from .python_transition import (http, repr, str, urllib)
else:
    import http.client
    import urllib.parse
    import urllib.request


'''
    Note: Using chrome to see what a website does: Press F12 for debug mode, Open network tab, Perform action,
    Find request in list, copy as cURL to see all data in the request sent to the site.
'''

__license__ = 'GPL v3'
__copyright__ = '2012-2025, John Howell <jhowell@acm.org>'


# Fallback value used when the TWEAK_DISABLE_SSL_VERIFICATION tweak is not set (see the tweaks.get()
# calls in open_url and browse_url). It is re-evaluated further down so that it is only true on
# Python versions newer than 2.7.8.
DISABLE_SSL_VERIFICATION_DEFAULT = True     # Fix expired certificate preventing OverDrive access as of 9/29/21

# User-Agent header sent by open_url and browse_url unless the caller supplies a different uagent.
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
# Android/Chrome User-Agent string; not referenced in this module -- presumably passed as uagent by callers elsewhere.
EVERAND_MOBILE_UAGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Safari/537.36'

# Pool of desktop Chrome-on-Windows User-Agent strings.
# Not referenced in this module; presumably callers choose one of these as the uagent
# for Amazon requests -- verify against the code that imports it.
AMAZON_UAGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]


# Pacing state: written by set_num_simultaneous_queries() and read/updated by delay_query().
num_simultaneous_queries = 1    # (max) number of simultaneous queries for pacing
last_query_time_of_host = {}    # keep track of last request for pacing; maps host name -> time.time() of the last query

# verification of SSL certificates began with Python 2.7.9
# (so on older interpreters the default is forced to False and no unverified context is requested)
DISABLE_SSL_VERIFICATION_DEFAULT = DISABLE_SSL_VERIFICATION_DEFAULT and sys.version_info[:3] > (2, 7, 8)


# fix unquoted redirect urls from Open Library
class HTTPRedirectHandler_fix(urllib.request.HTTPRedirectHandler):
    """Redirect handler that percent-quotes the last path segment of a redirect target.

    If the text after the final "/" of the redirect URL contains any character
    outside the printable ASCII range (space through "~"), that segment is passed
    through urllib.parse.quote before the standard handler builds the new request.
    """

    def redirect_request(self, req, fp, code, msg, hdrs, newurl):
        # Pick the separator and the printable-ASCII bounds in the same string type as newurl.
        if isinstance(newurl, str):
            slash, lowest, highest = "/", " ", "~"          # Python 3
        else:
            slash, lowest, highest = b"/", b" ", b"~"       # Python 2

        head, sep, tail = newurl.rpartition(slash)

        if any(ch < lowest or ch > highest for ch in tail):
            newurl2 = head + sep + urllib.parse.quote(tail)
            print("redirect_request: code=%s newurl=%s --> %s" % (code, newurl, newurl2))
            newurl = newurl2

        return urllib.request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, hdrs, newurl)


# Reference to the stock class, kept so that a replacement of http.client.HTTPSConnection can be undone.
HTTPSConnection_original = http.client.HTTPSConnection


class HTTPSConnection_patched(http.client.HTTPSConnection):
    """HTTPSConnection that does not verify certificates unless the caller supplies a context."""
    # https://docs.python.org/3/library/http.client.html

    def __init__(self, *args, **kwargs):
        if kwargs.get('context') is None:
            # default to no verification of SSL certificates
            kwargs['context'] = ssl._create_unverified_context()

        # call the saved stock class explicitly, since the module attribute may be replaced by this class
        HTTPSConnection_original.__init__(self, *args, **kwargs)


class HTTPSConnection_patched3(http.client.HTTPSConnection):
    """Holder for a replacement HTTPSConnection.__init__ that skips certificate verification.

    browse_url installs this __init__ on http.client.HTTPSConnection after saving the
    stock initializer on the class as __init_original__, which is what gets called here.
    """
    # https://docs.python.org/3/library/http.client.html
    def __init__(self, *args, **kwargs):
        if kwargs.get('context') is None:
            # default to no verification of SSL certificates
            kwargs['context'] = ssl._create_unverified_context()

        self.__init_original__(*args, **kwargs)


def set_num_simultaneous_queries(i):
    """Set the module-wide number of simultaneous queries that delay_query() uses for pacing.

    i: the (maximum) number of queries expected to be in flight at the same time.
    """
    global num_simultaneous_queries
    num_simultaneous_queries = i


def delay_query(url, qps):
    """Sleep as needed so that queries to the host of *url* respect a rate limit.

    url: the URL about to be requested; only its host name is used.
    qps: allowed queries per second; a false value (None, 0) disables pacing.

    The minimum spacing between queries to one host is num_simultaneous_queries / qps
    seconds, measured from the time recorded for the previous query to that host.
    """
    if qps:
        host = hostname_from_url(url)
        elapsed = time.time() - last_query_time_of_host.get(host, 0)
        wait = (num_simultaneous_queries / qps) - elapsed

        if wait > 0:
            time.sleep(wait)

        # record the time after any sleep so the next query is spaced from now
        last_query_time_of_host[host] = time.time()


def inflate_response(content, content_encoding):
    """Return *content* decompressed according to its HTTP Content-Encoding.

    content: the raw response body (bytes).
    content_encoding: value of the Content-Encoding header, or None if absent.

    'deflate' bodies are accepted both with the zlib wrapper and as a raw deflate
    stream; 'gzip' and 'x-gzip' bodies are read as gzip. The encoding name is
    matched case-insensitively. An empty body, or a missing or unrecognized
    encoding, returns *content* unchanged.

    Raises zlib.error (deflate) or the gzip module's read errors when the data is corrupt.
    """
    encoding = (content_encoding or '').strip().lower()     # content-coding names are case-insensitive

    if not content or not encoding:
        return content      # nothing to decompress

    if encoding == 'deflate':
        try:
            return zlib.decompress(content)
        except zlib.error:
            # some servers send a raw deflate stream without the zlib header and checksum
            return zlib.decompress(content, -zlib.MAX_WBITS)

    if encoding in ('gzip', 'x-gzip'):
        return gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb').read()

    return content


def _redact_post_data(data):
    """Return POST *data* as text with bulky or sensitive fields replaced by {REMOVED}, for logging."""
    if isinstance(data, bytes):
        # the patterns below are text patterns; decode so a bytes payload can be logged
        data = data.decode('utf-8', 'replace')

    # reduce logging detail for known unimportant fields
    log_data = re.sub(r'((?:__EVENTVALIDATION)|(?:__VIEWSTATE)|(?:__CUSTOMVIEWSTATE))=.*?(&|$)', r'\1={REMOVED}\2', data)
    log_data = re.sub(r'"query":"query FilterSearch.*?"', r'"query":REMOVED', log_data)     # Hoopla

    for field in CARD_NUMBER_FIELDS + CARD_PIN_FIELDS:
        log_data = re.sub(r'(^|&)%s=.*?(&|$)' % field, r'\1%s={REMOVED}\2' % field, log_data)    # protect user names & passwords

    return log_data


def _response_content_type(info):
    """Return the content type of a response header object.

    Uses gettype() when the object provides it (Python 2 style message objects) and
    otherwise get_content_type(), which is what email.message based headers offer.
    """
    gettype = getattr(info, 'gettype', None)
    if gettype is not None:
        return gettype()

    return info.get_content_type()


def open_url(log, url, data=None, timeout=60, cookiejar=None, uagent=DEFAULT_USER_AGENT,
             referer=None, origin=None, addheaders=None, allow_gzip=True, save=False,
             log_request=True, qps=None, content_type=None, max_tries=3, retry_on_internal_error=True,
             retry_on_not_found=False, warn_on_retry=True, expect_errors=(), disable_ssl_verify=False,
             no_response_save=False, no_unicode_decode=False, fix_redirect=False):
    """Fetch *url* with urllib (a POST when *data* is given, else a GET), retrying on failures.

    log: logger object; its clear_response, info, warn, save_response and response methods are called.
    url: address to request.
    data: POST body (text is encoded as UTF-8), or None for a GET.
    timeout: timeout in seconds passed to the opener.
    cookiejar: cookie jar to attach to the opener, or None.
    uagent, referer, origin, addheaders: request headers to send; addheaders is a list of (name, value).
    allow_gzip: send "Accept-Encoding: gzip, deflate".
    save: call log.response() after a successful fetch (also done when TWEAK_SAVE_ALL_RESPONSES is set).
    log_request: log the request, with known bulky/sensitive POST fields redacted.
    qps: queries-per-second limit handed to delay_query before the first attempt.
    content_type: Content-Type header to send with the request.
    max_tries: total number of attempts before the last exception is raised.
    retry_on_internal_error / retry_on_not_found: whether HTTP 500 / 404 are retried instead of raised at once.
    warn_on_retry: log retries as warnings rather than info.
    expect_errors: HTTP status codes for which the HTTPError is returned instead of raised.
    disable_ssl_verify: skip certificate verification (also enabled by TWEAK_DISABLE_SSL_VERIFICATION).
    no_response_save: do not hand the response to log.save_response.
    no_unicode_decode: leave data_string as None instead of decoding the body.
    fix_redirect: install HTTPRedirectHandler_fix to quote redirect targets.

    Returns the response object with these attributes added: data_bytes (decompressed body),
    response_type (content type), data_string (decoded text, or None for images, bzip2 and
    no_unicode_decode) and is_httperror_exception = False. For a status listed in
    expect_errors the HTTPError itself is returned, with response_data set and
    is_httperror_exception = True.

    Raises the underlying exception when retries are exhausted; HTTP 401, 403 and 410 are
    raised immediately, as are 404 and 500 unless the corresponding retry flag is set.
    """

    log.clear_response()

    if log_request:
        if data is not None:
            log.info('Post %s' % url)
            log.info('Data %s' % _redact_post_data(data))
        else:
            log.info('Get %s' % url)

    if isinstance(data, str):
        data = data.encode("utf8")      # must be bytes for Python 3

    # Handlers prefixed by: ProxyHandler, UnknownHandler, HTTPHandler, HTTPDefaultErrorHandler, HTTPRedirectHandler, FTPHandler, FileHandler, HTTPErrorProcessor
    # Also  HTTPSHandler (if "import ssl" succeeds)
    # opener = urllib.request.build_opener(urllib.request.HTTPHandler(debuglevel=1), ...) -- for debug

    handlers = []

    if fix_redirect:
        handlers.append(HTTPRedirectHandler_fix())

    if (disable_ssl_verify or tweaks.get(TWEAK_DISABLE_SSL_VERIFICATION, DISABLE_SSL_VERIFICATION_DEFAULT)):
        # do not verify SSL certificates
        handlers.append(urllib.request.HTTPSHandler(context=ssl._create_unverified_context()))

    opener = urllib.request.build_opener(*handlers)

    if cookiejar is not None:
        opener.add_handler(urllib.request.HTTPCookieProcessor(cookiejar))

    opener.addheaders = []

    if origin:
        opener.addheaders.append(('Origin', origin))

    if referer:
        opener.addheaders.append(('Referer', referer))

    if addheaders:
        opener.addheaders.extend(addheaders)

    if uagent:
        opener.addheaders.append(('User-Agent', uagent))

    if allow_gzip:
        opener.addheaders.append(('Accept-Encoding', 'gzip, deflate'))

    try_count = 0

    delay_query(url, qps)

    while True:
        try:
            if content_type:
                # build a request to change the Content-Type of a POST
                req = urllib.request.Request(url, None, {'Content-Type': content_type})
                response = opener.open(req, data, timeout)
            else:
                response = opener.open(url, data, timeout)

            # Note - keep following under 'try' since errors, such as timeout, can occur when
            # returned data is read.

            response.data_bytes = inflate_response(response.read(), response.info().get("Content-Encoding"))    # getheader

        except Exception as e:
            try_count += 1

            if isinstance(e, urllib.request.HTTPError):
                e.response_data = ''

                try:
                    # attempt to save any response received
                    e.response_data = str(
                            inflate_response(e.fp.read(), e.headers.get("Content-Encoding")),
                            encoding='utf-8', errors='ignore')

                    if tweaks.get(TWEAK_SAVE_RESPONSES_ON_ERROR, False) and not no_response_save:
                        log.save_response(url, e.response_data, 'x-unknown')
                        if save is not None:
                            log.response()
                except Exception:
                    # best effort only: failing to capture the error body must not mask the HTTP error
                    log.info('Exception occurred getting response on HTTPError')

                if e.code in expect_errors:
                    e.is_httperror_exception = True
                    return e

                if e.code == 400:
                    # Open Library returns redirect URLS with non-quoted unicode chars that it rejects when requested 1/2018
                    redirect_url = e.geturl()
                    if redirect_url != url:
                        # URL should already be partly quoted so only quote unicode chars
                        corrected_url = urllib.parse.quote(redirect_url, safe=" !\"#$%&'()*+.=,/:;<=>?[\\]^_`{|}~")
                        if corrected_url != redirect_url and try_count < max_tries:
                            url = corrected_url
                            log.info("Bad Request: Retrying with corrected redirect URL")
                            continue    # retry immediately, without the delay below

                if e.code in [401, 403, 410]:
                    raise       # 401 (Unauthorized), 403 (Forbidden), 410 (Gone) are assumed to be permanent

                if e.code == 404 and not retry_on_not_found:
                    raise       # Not Found

                if e.code == 500 and not retry_on_internal_error:
                    raise       # Internal server error

                if e.code == 503:
                    time.sleep(30)  # service unavailable, possibly throttled - add extra delay

            if try_count >= max_tries:
                raise e

            msg = 'Retrying %s on exception %s' % (hostname_from_url(url), repr(e))
            if warn_on_retry:
                log.warn(msg)
            else:
                log.info(msg)

            time.sleep(30)  # delay before retry to give temporary problems time to clear

        else:
            break

    headers = response.info()
    response.response_type = _response_content_type(headers)

    if not no_response_save:
        log.save_response(url, response.data_bytes, response.response_type)

        if save or tweaks.get(TWEAK_SAVE_ALL_RESPONSES, False):
            log.response()

    if response.response_type != 'application/x-bzip2' and (not response.response_type.startswith("image/")) and (not no_unicode_decode):
        # fix occasional UnicodeDecodeError: decode results to unicode using the proper character set
        charset = headers.getparam("charset") if IS_PYTHON2 else headers.get_content_charset()
        if not charset:
            charset = 'utf-8'   # default if unknown

        try:
            response.data_string = str(response.data_bytes, encoding=charset, errors='ignore')
        except LookupError:
            # the server declared a character set that Python does not know; fall back to the default
            response.data_string = str(response.data_bytes, encoding='utf-8', errors='ignore')
    else:
        response.data_string = None

    response.is_httperror_exception = False
    return response


def browse_url(log, br, request=None, cookiejar=None, allow_gzip=False, timeout=60,
               uagent=DEFAULT_USER_AGENT, addheaders=None, referer=None, save=False,
               disable_ssl_verify=False):
    """Retrieve a page through the browser object *br*, retrying on common errors.

    log: logger object; its clear_response, info, warn, save_response and response methods are called.
    br: browser object (mechanize, per the monkey-patch note below) used to open or submit.
    request: request to open; when None the browser's current form is submitted instead.
    cookiejar: cookie jar to install on the browser, or None to leave the browser's jar alone.
    allow_gzip: passed to br.set_handle_gzip.
    timeout: timeout in seconds for br.open (not supported for form submission).
    uagent, addheaders, referer: request headers to send; addheaders is a list of (name, value).
    save: call log.response() after the fetch (also done when TWEAK_SAVE_ALL_RESPONSES is set).
    disable_ssl_verify: skip certificate verification (also enabled by TWEAK_DISABLE_SSL_VERIFICATION).

    Returns the response body: decoded text, or the raw bytes for image/* responses.

    Raises the underlying exception after three attempts. HTTP 404, BrowserStateError and,
    for form submissions, anything other than HTTP 500 are raised without a retry.
    """
    # Attempt to retrieve a page from web site, retrying on common errors

    disable_ssl_verify = disable_ssl_verify or tweaks.get(TWEAK_DISABLE_SSL_VERIFICATION, DISABLE_SSL_VERIFICATION_DEFAULT)

    if disable_ssl_verify:
        # monkey patch since http.client is used by mechanize
        # NOTE(review): the patch is global to the process and not guarded by a lock; overlapping
        # calls from several threads could interfere with each other -- confirm callers serialize.
        if IS_PYTHON2:
            http.client.HTTPSConnection = HTTPSConnection_patched
        else:
            http.client.HTTPSConnection.__init_original__ = http.client.HTTPSConnection.__init__
            http.client.HTTPSConnection.__init__ = HTTPSConnection_patched3.__init__

    try:
        log.clear_response()

        if request:
            url = '%s %s' % (request.get_full_url(), request.get_data())
        else:
            url = 'submit "%s"' % br.form.attrs.get('action', '')

        log.info('Browse: %s' % url)

        br.set_handle_redirect(True)
        br.set_handle_robots(False)
        br.set_handle_refresh(False)

        br.addheaders = []

        if referer:
            br.addheaders.append(('Referer', referer))

        if addheaders:
            br.addheaders.extend(addheaders)

        if uagent:
            br.addheaders.append(('User-Agent', uagent))

        if cookiejar is not None:
            # compare with None: a cookie jar that holds no cookies yet is falsy but must still be installed
            br.set_cookiejar(cookiejar)

        try_count = 0
        MAX_TRIES = 3

        br.set_handle_gzip(allow_gzip)

        while True:
            try:
                if request is None:
                    br.submit()     # timeout not supported
                else:
                    br.open(request, timeout=timeout)

            except Exception as e:
                if type(e).__name__ == 'httperror_seek_wrapper':
                    log.info("httperror_seek_wrapper: %s" % str(e.__dict__))
                    log.info("wrapped: %s" % str(e.wrapped.__dict__))

                if isinstance(e, urllib.request.HTTPError) and e.code == 404:
                    raise   # 404 error (Not Found) is assumed to be permanent

                if type(e).__name__ == 'BrowserStateError':
                    raise   # assumed to be permanent

                if (request is None) and not (isinstance(e, urllib.request.HTTPError) and e.code == 500):
                    raise   # assumed to be permanent

                try_count += 1
                if try_count >= MAX_TRIES:
                    raise

                if request:
                    log.warn('Retrying %s on exception %s' % (hostname_from_url(url), repr(e)))
                else:
                    log.warn('Retrying %s on exception %s' % (url, repr(e)))

                time.sleep(30)  # delay before retry to give temporary problems time to clear

            else:
                break

        response = br.response()
        headers = response.info()

        # header objects offer gettype() in the Python 2 style API and get_content_type() otherwise
        gettype = getattr(headers, 'gettype', None)
        response_type = gettype() if gettype is not None else headers.get_content_type()

        if not response_type.startswith("image/"):
            # fix occasional UnicodeDecodeError: decode results to unicode using the proper character set
            charset = headers.getparam("charset") if IS_PYTHON2 else headers.get_content_charset()
            if not charset:
                charset = 'utf-8'   # default if unknown

            body = response.read()
            try:
                data = str(body, encoding=charset, errors='ignore')
            except LookupError:
                # the server declared a character set that Python does not know; fall back to the default
                data = str(body, encoding='utf-8', errors='ignore')

        else:
            data = response.read()

        br.response().seek(0)   # restore data
        log.save_response(url, data, response_type)

        if save or tweaks.get(TWEAK_SAVE_ALL_RESPONSES, False):
            log.response()

        return data

    finally:
        if disable_ssl_verify:
            # undo monkey patch
            if IS_PYTHON2:
                http.client.HTTPSConnection = HTTPSConnection_original
            else:
                http.client.HTTPSConnection.__init__ = http.client.HTTPSConnection.__init_original__


# Utility routines

# One host name label: 1 to 63 ASCII letters, digits or hyphens, not starting or ending with a hyphen.
# The character class is spelled out (no \d, no IGNORECASE) so that only ASCII is accepted, and
# \Z is used instead of $ so that a trailing newline is not accepted.
_HOSTNAME_LABEL_RE = re.compile(r"(?!-)[A-Za-z0-9-]{1,63}(?<!-)\Z")


def isValidHostname(hostname):
    """Return True if *hostname* is a syntactically valid host name.

    hostname: text to check; one trailing dot (fully qualified form) is allowed.

    The name may be at most 255 characters long and every dot-separated label must be
    1 to 63 ASCII letters, digits or hyphens without a leading or trailing hyphen.
    The empty string is not valid.
    """
    if len(hostname) > 255:
        return False

    if hostname[-1:] == ".":
        hostname = hostname[:-1]    # strip exactly one dot from the right, if present

    return all(_HOSTNAME_LABEL_RE.match(label) for label in hostname.split("."))


def hostname_from_url(url):
    """Return the host name part of *url*, or '' when the URL has none."""
    return urllib.parse.urlparse(url).hostname or ''


def netloc_from_url(url):
    """Return the network location part of *url* (user info, host and port as written)."""
    parts = urllib.parse.urlparse(url)
    return parts.netloc
