#!/usr/bin/env python
 # -*- coding: utf-8 -*-
from __future__ import unicode_literals, division, absolute_import, print_function

try:
    from sigil_bs4 import BeautifulSoup
except:
    from bs4 import BeautifulSoup

import sys, os, urllib, os.path, time, socket
import urllib.request
from urllib.parse import urlparse
from urllib.error import *

# Browser-like request headers: some websites require valid HTTP headers
# before they answer a request.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0'
accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
accept_language = 'en-US;q=0.8,en;q=0.5'
accept_encoding = 'gzip, deflate'

# str.translate() table that deletes the characters forbidden in XML
forbidden_chars = {ord(char): None for char in '<>'}

# current date/time stamp & home folder location
today = time.strftime("%Y%m%d-%H%M%S")
home = os.path.expanduser('~')

# The log file goes to the Desktop folder when it exists, otherwise straight
# into the home folder.
log_file_name = os.path.join(home, 'Desktop')
if not os.path.isdir(log_file_name):
    log_file_name = home
log_file_name = os.path.join(log_file_name, 'URLChecker_' + today + '.log')

# find the line numbers on which a piece of text occurs
def getlinenumber(html, text):
    """Return the 1-based numbers of all lines of *html* that contain *text*.

    *html* is split with ``str.splitlines()``; every line in which *text*
    occurs as a substring contributes its line number. The numbers are
    returned in ascending order, and the list is empty when nothing matches.
    """
    return [
        number
        for number, content in enumerate(html.splitlines(), start=1)
        if text in content
    ]

# decide whether a href is an absolute http(s) URL worth checking
def _is_checkable_url(href):
    """Return True when *href* starts with 'http' and names a network location."""
    if not href.startswith('http'):
        return False
    try:
        return urlparse(href).netloc != ''
    except ValueError:
        # urlparse() rejects a malformed authority (e.g. unbalanced IPv6
        # brackets). Let the connection attempt report the link as broken
        # instead of aborting the whole run.
        return True


# try to connect to a single URL
def _check_url(href):
    """Open *href* and return ``(message, has_error)`` describing the outcome.

    *message* is ``'OK : <href>'`` on success; otherwise it starts with the
    HTTP status code, the URLError reason, ``'Socket timeout'`` or the
    exception type, followed by ``' : <href>'``.
    """
    try:
        request = urllib.request.Request(href)
        # headers sometimes required by websites
        request.add_header("User-Agent", user_agent)
        request.add_header("Accept", accept)
        request.add_header("Accept-Language", accept_language)
        request.add_header("Accept-Encoding", accept_encoding)
        # No explicit Host header: urllib supplies it from the URL (port
        # included). One set via add_header() would be copied onto redirected
        # requests, sending the old host name to the new server.
        # The body is not needed; the context manager only makes sure the
        # connection is closed again.
        with urllib.request.urlopen(request, None, 5):
            pass
    except HTTPError as http_error:
        return str(http_error.code) + ' : ' + href, True
    except URLError as url_error:
        return str(url_error.reason) + ' : ' + href, True
    except socket.timeout:
        return 'Socket timeout : ' + href, True
    except Exception as error:
        # '<' and '>' are stripped from the type name (see forbidden_chars)
        return str(type(error)).translate(forbidden_chars) + ' : ' + href, True
    return 'OK : ' + href, False


# main routine
def run(bk):
    """Check every external link in the book's XHTML spine files.

    For each spine item whose media type is ``application/xhtml+xml`` all
    ``<a href>`` values are collected. Every distinct href (per file) that
    looks like an http(s) URL is requested once; the outcome is printed and
    logged, and failures are passed to ``bk.add_result`` for each line of the
    file containing the URL. All other hrefs, including repeats of a URL
    already checked in the same file, are logged as ``NOT CHECKED``.
    Finally the log is written to ``log_file_name``.

    Args:
        bk: the book container supplied by the plugin host; it must provide
            ``getspine``, ``id_to_mime``, ``id_to_href``, ``readfile`` and
            ``add_result``.

    Returns:
        Always 0.
    """
    log_parts = []

    # get all files in the spine
    for html_id, _linear in bk.getspine():
        # process only html files
        if bk.id_to_mime(html_id) != 'application/xhtml+xml':
            continue

        filename = os.path.basename(bk.id_to_href(html_id))
        heading = '\nProcessing ' + filename + '...\n'
        print(heading)
        log_parts.append(heading)

        # load html code into BeautifulSoup
        html = bk.readfile(html_id)
        soup = BeautifulSoup(html, 'html.parser')

        # URLs already checked in this file
        checked_urls = set()
        for link in soup.find_all('a'):
            href = link.get('href')

            # check only anchors that actually have a href
            if href is None:
                continue

            # non-URLs and URLs that have already been checked only get logged
            if href in checked_urls or not _is_checkable_url(href):
                log_parts.append(href + ': NOT CHECKED\n')
                continue

            message, has_error = _check_url(href)

            # output message whatever it is
            print(message)
            log_parts.append(message + '\n')

            if has_error:
                line_numbers = set(getlinenumber(html, href))
                if '&' in href:
                    # The parser returns the href with entities decoded, while
                    # the file itself spells '&' as '&amp;'; search for that
                    # form too so the link is not silently left unreported.
                    escaped = href.replace('&', '&amp;')
                    line_numbers.update(getlinenumber(html, escaped))
                for line_number in sorted(line_numbers):
                    bk.add_result('error', filename, line_number, message)

            # remember the URL so it is requested only once per file
            checked_urls.add(href)

    # write log file; explicit UTF-8 so non-ASCII hrefs cannot fail to encode
    with open(log_file_name, 'w', encoding='utf-8') as log:
        log.write(''.join(log_parts))
    print('Log file writen to: ' + log_file_name)
    print('\nDone.\n\nPlease click OK to close the Plugin Runner window.')

    return 0


def main():
    """Print a notice that the script was run directly and return -1.

    The returned value is used as the process exit status by the
    ``__main__`` guard.
    """
    exit_status = -1
    print('I reached main when I should not have\n')
    return exit_status

# When this file is executed as a script, exit with main()'s return value as
# the process status.
if __name__ == "__main__":
    sys.exit(main())
