﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2016, John Howell <jhowell@acm.org>'
__docformat__ = 'restructuredtext en'

import gzip
import re
import StringIO
import time
import urllib2
import urlparse
import zlib
import ssl
import httplib
import sys

from calibre.utils.config_base import tweaks

from calibre_plugins.overdrive_link.log import exception_str
from calibre_plugins.overdrive_link.tweak import (TWEAK_DISABLE_SSL_VERIFICATION,
                TWEAK_SAVE_ALL_RESPONSES, TWEAK_SAVE_RESPONSES_ON_ERROR)

'''
    Note: To see what a website does using Chrome: press F12 to open the developer tools, open the Network tab,
    perform the action, find the request in the list, and choose "Copy as cURL" to see all of the data in the
    request that was sent to the site.
'''

# User-Agent header value sent when the caller does not supply one.
# More UA strings can be found in amazon.py
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"


# verification of SSL certificates began with Python 2.7.9
has_ssl_verify = sys.version_info[:3] >= (2, 7, 9)

# Pacing state used by set_num_simultaneous_queries() and delay_query()
num_simultaneous_queries = 1        # (max) number of simultaneous queries
last_query_time_of_host = dict()    # host name -> time.time() of the last request made to it



# Reference to the unpatched class: used as the base initializer by HTTPSConnection_patched
# and restored into httplib by cleanup_ssl().
HTTPSConnection_original = httplib.HTTPSConnection

class HTTPSConnection_patched(httplib.HTTPSConnection):
    """
    HTTPS connection that defaults to an unverified SSL context.

    A 'context' keyword argument that is present and not None is passed through
    untouched; otherwise an unverified context is substituted before the
    original initializer runs.
    """

    def __init__(self, *args, **kwargs):
        # a missing context and an explicit None are treated alike:
        # default to no verification of SSL certificates
        if kwargs.get('context') is None:
            kwargs['context'] = ssl._create_unverified_context()

        HTTPSConnection_original.__init__(self, *args, **kwargs)

def init_ssl(log):
    """
    Install the SSL monkey patch when the disable-verification tweak is set.

    Does nothing unless the tweak is enabled and this Python verifies
    certificates (has_ssl_verify). log.info() is used to report the change.
    """
    verification_disabled = tweaks.get(TWEAK_DISABLE_SSL_VERIFICATION, False)
    if not (verification_disabled and has_ssl_verify):
        return

    log.info('SSL certificate verification disabled')

    # monkey patch since httplib is used by mechanize
    httplib.HTTPSConnection = HTTPSConnection_patched
        

def cleanup_ssl():
    """
    Undo the monkey patch installed by init_ssl().

    The original class is restored under the same condition that init_ssl()
    uses to install the patch (tweak enabled and has_ssl_verify).
    """
    verification_disabled = tweaks.get(TWEAK_DISABLE_SSL_VERIFICATION, False)
    if not (verification_disabled and has_ssl_verify):
        return

    # undo monkey patch
    httplib.HTTPSConnection = HTTPSConnection_original


  
def set_num_simultaneous_queries(i):
    """
    Set the module-wide (max) number of simultaneous queries.

    delay_query() divides this value by the allowed queries-per-second to
    obtain the minimum spacing between requests to the same host.
    """
    global num_simultaneous_queries
    num_simultaneous_queries = i
    
  
def delay_query(url, qps):
    """
    Pace requests to the host named in url.

    url: address about to be requested; only its host name is used
    qps: allowed queries per second for that host; a false value (0, None)
         disables pacing and nothing is recorded

    Sleeps until at least (num_simultaneous_queries / qps) seconds have passed
    since the previous paced request to the same host, then records the
    current time for that host.
    """
    if qps:
        host = hostname_from_url(url)
        elapsed = time.time() - last_query_time_of_host.get(host, 0)
        remaining = (num_simultaneous_queries / qps) - elapsed

        if remaining > 0:
            time.sleep(remaining)

        last_query_time_of_host[host] = time.time()
    
    

def inflate_response(content, content_encoding):
    """
    Decompress an HTTP response body according to its Content-Encoding header.

    content: raw body (byte string) as received
    content_encoding: value of the Content-Encoding header, or None if absent

    Returns the decompressed bytes for 'gzip', 'x-gzip' and 'deflate' (compared
    ignoring case and surrounding whitespace). Any other value, including None,
    returns content unchanged.

    Raises whatever zlib/gzip raise if the body is not valid compressed data.
    """
    import io   # local import: only needed to wrap the bytes for GzipFile

    encoding = (content_encoding or '').strip().lower()

    if encoding == 'deflate':
        try:
            # zlib-wrapped stream, which is what the header is supposed to mean
            return zlib.decompress(content)
        except zlib.error:
            # some servers send a raw deflate stream without the zlib header
            return zlib.decompress(content, -zlib.MAX_WBITS)

    if encoding in ('gzip', 'x-gzip'):
        return gzip.GzipFile('', 'rb', 9, io.BytesIO(content)).read()

    return content
  
    
def open_url(log, url, data=None, timeout=60, cookiejar=None, uagent=DEFAULT_USER_AGENT, 
        referer=None, origin=None, addheaders=None, allow_gzip=True, save=False,
        log_request=True, qps=0, content_type=None, max_tries=3, retry_on_internal_error=True,
        retry_on_not_found=False):
    """
    Fetch url with urllib2, retrying on errors, and return the response.

    log: logger providing info(), warn(), clear_response(), save_response() and response()
    url: address to request
    data: body of a POST; None performs a GET
    timeout: timeout in seconds passed to the opener
    cookiejar: cookie jar to use for the request, or None for no cookie handling
    uagent, referer, origin: values for the User-Agent, Referer and Origin headers (omitted if false)
    addheaders: additional (name, value) header tuples
    allow_gzip: send "Accept-Encoding: gzip, deflate"
    save: call log.response() after a successful request
    log_request: log the request (and an abbreviated copy of the POST data)
    qps: queries per second allowed for the host, see delay_query(); 0 disables pacing
    content_type: Content-Type header value for a POST
    max_tries: total number of attempts before the last exception is raised
    retry_on_internal_error: retry (rather than raise) on HTTP 500
    retry_on_not_found: retry (rather than raise) on HTTP 404

    Returns the response object with two added attributes: response_type (the
    MIME type) and data (the inflated body, decoded to unicode unless the type
    is application/x-bzip2 or image/*).

    Raises the exception from the last attempt. A urllib2.HTTPError carries an
    added response_data attribute holding whatever error body could be read.
    HTTP 401, 403 and 410 are never retried.
    """

    log.clear_response()
    
    if log_request:
        if data is not None:
            log.info('Post %s' % url)
            
            # reduce logging detail for known unimportant fields
            log_data = re.sub(r'((?:__EVENTVALIDATION)|(?:__VIEWSTATE)|(?:__CUSTOMVIEWSTATE))=.{20,}?(&|$)', r'\1=...\2', data)
            log.info('Data %s' % log_data)
        else:
            log.info('Get %s' % url)
        
    # Handlers added by build_opener by default: ProxyHandler, UnknownHandler, HTTPHandler,
    # HTTPDefaultErrorHandler, HTTPRedirectHandler, FTPHandler, FileHandler, HTTPErrorProcessor
    # Also HTTPSHandler (if "import ssl" succeeds)
    # opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1), ...) -- for debug

    if tweaks.get(TWEAK_DISABLE_SSL_VERIFICATION, False) and has_ssl_verify:
        opener = urllib2.build_opener(urllib2.HTTPSHandler(context=ssl._create_unverified_context()))   # do not verify SSL certificates
    else:
        opener = urllib2.build_opener()
        
    if cookiejar is not None:
        opener.add_handler(urllib2.HTTPCookieProcessor(cookiejar))
        
    opener.addheaders = []
    
    if origin:
        opener.addheaders.append(('Origin', origin))
        
    if referer:
        opener.addheaders.append(('Referer', referer))
        
    if addheaders:
        opener.addheaders.extend(addheaders)
    
    if uagent:
        opener.addheaders.append(('User-Agent', uagent))
    
    if allow_gzip:
        opener.addheaders.append(('Accept-Encoding', 'gzip, deflate'))

    try_count = 0
    
    delay_query(url, qps)
        
    while True:
        try:
            if content_type:
                # build a request to change the Content-Type of a POST
                req = urllib2.Request(url, None, {'Content-Type': content_type})
                response = opener.open(req, data, timeout)
            else:
                response = opener.open(url, data, timeout)

            # Note - keep following under 'try' since errors, such as timeout, can occur when
            # returned data is read.
            response.data = inflate_response(response.read(), response.info().getheader("Content-Encoding"))    

        except Exception as e:
            retry_delay = 30    # delay before retry to give temporary problems time to clear
            
            if isinstance(e, urllib2.HTTPError):
                e.response_data = ''
                
                try:
                    # attempt to save any response received
                    e.response_data = unicode(inflate_response(e.fp.read(), e.headers.get("Content-Encoding")), 
                                            encoding='utf-8', errors='ignore')
                    
                    if tweaks.get(TWEAK_SAVE_RESPONSES_ON_ERROR, False):
                        log.save_response(url, e.response_data, 'x-unknown')
                        # NOTE(review): save defaults to False, so this test only fails when a
                        # caller passes save=None explicitly -- confirm that is the intent
                        if save is not None:
                            log.response()
                except Exception:
                    log.info('Exception occured getting reponse on HTTPError')
                
                # Raise e explicitly below: under Python 2 a bare "raise" re-raises the most
                # recently handled exception, which would be the one swallowed just above
                # (if any) rather than the HTTPError.
                
                if e.code in [401, 403, 410]:
                    raise e     # 401 (Unauthorized), 403 (Forbidden), 410 (Gone) are assumed to be permanent
                    
                if e.code == 404 and not retry_on_not_found:
                    raise e     # Not Found
                    
                if e.code == 500 and not retry_on_internal_error:
                    raise e     # Internal server error
                    
                if e.code == 503:
                    retry_delay += 30   # service unavailable, possibly throttled - add extra delay
                 
            try_count += 1
            if try_count >= max_tries:
                raise e     # out of attempts: fail now instead of sleeping first
                
            log.warn('Retrying %s on exception %s'%(hostname_from_url(url), exception_str(e)))
            time.sleep(retry_delay)
            
        else:
            break
            
    response.response_type = response.info().gettype()
            
    log.save_response(url, response.data, response.response_type)
    
    if save or tweaks.get(TWEAK_SAVE_ALL_RESPONSES, False):
        log.response()
    
    if response.response_type != 'application/x-bzip2' and not response.response_type.startswith("image/"):
        # fix occasional UnicodeDecodeError: decode results to unicode using the proper character set
        charset = response.info().getparam("charset")
        if not charset:
            charset = 'utf-8'   # default if unknown
            
        try:
            response.data = unicode(response.data, encoding=charset, errors='ignore') 
        except LookupError:
            # server named a character set Python does not know - fall back to the default
            response.data = unicode(response.data, encoding='utf-8', errors='ignore') 
    
    return response
    
    
    
    
    
    
def browse_url(log, br, request=None, cookiejar=None, allow_gzip=False, timeout=60, 
                uagent=DEFAULT_USER_AGENT, addheaders=None, referer=None, save=False):
    """
    Attempt to retrieve a page from a web site using a browser object, retrying on common errors.

    log: logger providing info(), warn(), clear_response(), save_response() and response()
    br: browser object (presumably a mechanize Browser -- confirm against callers)
    request: request object to open; None submits the form currently selected in br
    cookiejar: cookie jar to install in the browser, or None to leave the browser's jar alone
    allow_gzip: passed to br.set_handle_gzip()
    timeout: timeout in seconds for br.open(); not applied to form submission
    uagent, referer: values for the User-Agent and Referer headers (omitted if false)
    addheaders: additional (name, value) header tuples
    save: call log.response() after a successful request

    Returns the response body: decoded to unicode unless the content type is image/*,
    in which case the raw bytes are returned.

    Raises the underlying exception immediately for HTTP 404, for BrowserStateError
    and for any failed form submission; otherwise after 3 failed attempts.
    """
    
    log.clear_response()
    
    if request is not None:
        url = '%s %s' % (request.get_full_url(), request.get_data())
    else:
        url = 'submit "%s"' % br.form.attrs.get('action', '')
        
    log.info('Browse: %s' % url)
    
    br.set_handle_redirect(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(False)
    
    br.addheaders = []
    
    if referer:
        br.addheaders.append(('Referer', referer))
        
    if addheaders:
        br.addheaders.extend(addheaders)
    
    if uagent:
        br.addheaders.append(('User-Agent', uagent))
    
    # compare with None: an empty cookie jar has length zero and so is false in a boolean test
    if cookiejar is not None:
        br.set_cookiejar(cookiejar)
        
    try_count = 0
    MAX_TRIES = 3
    
    br.set_handle_gzip(allow_gzip)
    
    while True:
        try:
            if request is None:
                br.submit()     # timeout not supported
            else:
                br.open(request, timeout=timeout)
    
        except Exception as e:
            # isinstance so that HTTPError subclasses raised by the browser are recognized too
            if isinstance(e, urllib2.HTTPError) and e.code == 404:
                raise   # 404 error (Not Found) is assumed to be permanent
                
            if (type(e).__name__ == 'BrowserStateError') or (request is None):
                raise   # assumed to be permanent
                
            try_count += 1
            if try_count >= MAX_TRIES:
                raise
               
            # request is never None here since form submissions are not retried
            log.warn('Retrying %s on exception %s'%(hostname_from_url(url), exception_str(e)))
            
            time.sleep(30)  # delay before retry to give temporary problems time to clear
            
        else:
            break
    
    response = br.response()
    response_type = response.info().gettype()
        
    if not response_type.startswith("image/"):
        # fix occasional UnicodeDecodeError: decode results to unicode using the proper character set
        charset = response.info().getparam("charset")
        if not charset:
            charset = 'utf-8'   # default if unknown
            
        raw_data = response.read()
        
        try:
            data = unicode(raw_data, encoding=charset, errors='ignore')
        except LookupError:
            # server named a character set Python does not know - fall back to the default
            data = unicode(raw_data, encoding='utf-8', errors='ignore')
        
    else:
        data = response.read()
        
    # NOTE(review): br.response() is called again here instead of reusing `response`; if it
    # returns a copy, this may not change the response held by the browser -- confirm.
    br.response().set_data(data)
    log.save_response(url, data, response_type)
    
    if save or tweaks.get(TWEAK_SAVE_ALL_RESPONSES, False):
        log.response()
        
    return data




# Utility routines

def isValidHostname(hostname):
    """
    Return True if hostname is a syntactically valid host name.

    The name may be at most 255 characters long and may end with a single dot.
    Each dot-separated label must be 1 to 63 characters drawn from ASCII
    letters, digits and '-', and must not begin or end with '-'.
    An empty string is not valid.
    """
    if len(hostname) > 255:
        return False

    if hostname[-1:] == ".":
        hostname = hostname[:-1]    # strip exactly one dot from the right, if present

    # \Z rather than $ so that a label followed by a newline is rejected ($ also
    # matches just before a trailing newline); explicit ASCII ranges keep the
    # check independent of the regex engine's Unicode case-folding rules.
    allowed = re.compile(r"(?!-)[A-Za-z0-9-]{1,63}(?<!-)\Z")
    return all(allowed.match(label) for label in hostname.split("."))
  
 

def hostname_from_url(url):
    """Return the host name component of url, or an empty string if it has none."""
    host = urlparse.urlparse(url).hostname

    if host is None:
        return ''

    return host
    
    
def netloc_from_url(url):
    """Return the network location component of url exactly as urlparse reports it."""
    parsed = urlparse.urlparse(url)
    return parsed.netloc
