﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import, print_function)

import re

from calibre.ebooks.BeautifulSoup import (BeautifulSoup, SoupStrainer)

from .python_transition import (IS_PYTHON2)
if IS_PYTHON2:
    from .python_transition import (html, str)
else:
    import html


__license__ = 'GPL v3'
__copyright__ = '2012-2022, John Howell <jhowell@acm.org>'


# Module-level alias for the SoupStrainer class imported from calibre's BeautifulSoup package
soup_strainer = SoupStrainer


# Substrings that mark a form as a sign-in form: is_sign_in_form() reports a match
# when any entry occurs in the form's 'action' attribute. The trailing comments
# record the site an entry is associated with.
SIGNIN_ACTIONS = [
    'login.aspx?',              # EBSCOhost, Brooklyn Public Library login form
    'validate_form.cfm?',       # Free Library of Philadelphia
    'dbcheck',
    'login',
    'Auth.asp',
    'cardauth',                 # ocls.freading.com (05/2016)
    'cosign.cgi',
    'freading.asp',             # ocls.freading.com (old)
    #'sndlogin',                # fairfaxcounty.freading.com
    #'plogin',                  # freelibrary.freading.com
    '/users/kansas',            # kslib.freading.com
    ]

# Substrings that mark a form as a redirection form: is_redirect_form() reports a
# match when any entry occurs in the form's 'action' attribute.
REDIRECT_ACTIONS = [
    'elecres/netlibrary.cfm',   # Free Library of Philadelphia
    ]

# Lower-case name fragments used by card_number_field() to locate the user/card
# number control of a form. Control names are lower-cased before comparison and
# are tried first as a suffix match, then as a substring match; entries earlier
# in this list take priority within each pass.
CARD_NUMBER_FIELDS = [
    'librarycardnumber',    # OverDrive
    'lcn',                  # OverDrive 03/2015
    'user',
    'userid',
    'username',
    'barcode',
    'pid',
    'login',
    'cn',                   # ocls.freading.com (old)
    'card_number',          # ocls.freading.com (05/2016)
    '[card]',               # freading.com
    ]

# Lower-case name fragments used by card_pin_field() to locate the password/PIN
# control of a form. Control names are lower-cased before comparison and are
# tried first as a suffix match, then as a substring match; entries earlier in
# this list take priority within each pass.
CARD_PIN_FIELDS = [
    'librarycardpin',       # OverDrive
    'extra1',               # OverDrive (zip code, etc.)
    'password',
    'pass',
    'pin',
    'phone',
    'pn',                   # ocls.freading.com (old)
    '[pin]',                # freading.com
    ]


# Type of a compiled regular expression object, obtained by compiling a throwaway
# pattern; pattern_str() uses it in an isinstance() check.
COMPILED_RE_TYPE = type(re.compile(' '))

# HTML_ENTITIES is only present on the BeautifulSoup 3 class, so its presence
# tells beautiful_soup() whether BS3-style keyword arguments are required.
have_bs3 = hasattr(BeautifulSoup, 'HTML_ENTITIES')


def is_sign_in_form(f):
    '''
    Return True if the form's 'action' attribute contains any of the
    SIGNIN_ACTIONS substrings, i.e. the form looks like a login form.

    A form without an 'action' attribute is never a sign-in form.
    '''
    if 'action' not in f.attrs:
        return False

    action = f.attrs['action']
    return any(marker in action for marker in SIGNIN_ACTIONS)


def is_redirect_form(f):
    '''
    Return True if the form's 'action' attribute contains any of the
    REDIRECT_ACTIONS substrings, i.e. the form is a redirection form.

    A form without an 'action' attribute is never a redirection form.
    '''
    if 'action' not in f.attrs:
        return False

    action = f.attrs['action']
    return any(marker in action for marker in REDIRECT_ACTIONS)


def set_card_number(log, form, card_number):
    '''
    Store card_number in the form control chosen by card_number_field().

    Nothing is done when card_number is empty. If the form has no matching
    control an error is written to log and the form is left untouched.
    '''
    if not card_number:
        return

    target = card_number_field(form)
    if target is None:
        log.error('Sign in form has missing user/card field.')
        return

    form[target] = card_number


def set_card_pin(log, form, card_pin):
    '''
    Store card_pin in the form control chosen by card_pin_field().

    Nothing is done when card_pin is empty. If the form has no matching
    control a warning is written to log and the form is left untouched.
    '''
    if not card_pin:
        return

    target = card_pin_field(form)
    if target is None:
        log.warn('Sign in form has missing password/pin field. (PIN configured when not needed.)')
        return

    form[target] = card_pin


def card_number_field(form):
    '''
    Return the name of the form control that holds the user/card number,
    or None if no control name matches CARD_NUMBER_FIELDS.

    Only controls with a non-empty name are considered. A name ending with
    one of the known fragments is preferred over a name that merely contains
    one; within each pass the order of CARD_NUMBER_FIELDS decides.
    '''
    names = [control.name for control in form.controls if control.name]

    # first pass: control name ends with a known fragment
    by_suffix = next(
        (name for key in CARD_NUMBER_FIELDS for name in names if name.lower().endswith(key)),
        None)
    if by_suffix is not None:
        return by_suffix

    # second pass: known fragment appears anywhere in the control name
    return next(
        (name for key in CARD_NUMBER_FIELDS for name in names if key in name.lower()),
        None)


def card_pin_field(form):
    '''
    Return the name of the form control that holds the password/PIN,
    or None if no control name matches CARD_PIN_FIELDS.

    Only controls with a non-empty name are considered. A name ending with
    one of the known fragments is preferred over a name that merely contains
    one; within each pass the order of CARD_PIN_FIELDS decides.
    '''
    names = [control.name for control in form.controls if control.name]

    # first pass: control name ends with a known fragment
    by_suffix = next(
        (name for key in CARD_PIN_FIELDS for name in names if name.lower().endswith(key)),
        None)
    if by_suffix is not None:
        return by_suffix

    # second pass: known fragment appears anywhere in the control name
    return next(
        (name for key in CARD_PIN_FIELDS for name in names if key in name.lower()),
        None)


def parse_entities(s):
    '''
    Return s with HTML character references replaced by the characters
    they stand for (delegates to html.unescape).
    '''
    decoded = html.unescape(s)
    return decoded


class LibraryError(Exception):
    '''
    Error carrying a reason string, rendered as
    "Server response error (<reason>)" by both str() and repr().

    Raised by the must_find* helpers in this module when a required
    element is missing.
    '''

    def __init__(self, reason):
        # keep the reason available to handlers that want the bare text
        self.reason = reason

    def __repr__(self):
        message = 'Server response error (%s)' % self.reason
        return message

    def __str__(self):
        # str() and repr() intentionally produce the same text
        return self.__repr__()


class ResultPages(object):
    '''
    Handle the sequencing of search results broken up across multiple pages
    '''

    def __init__(self):
        # Stored under a private name: assigning an instance attribute called
        # "next_page" would shadow the accessor of the same name and make
        # obj.next_page() fail with "'int' object is not callable".
        self._next_page = 1

    def sequence(self, first_item=None, last_item=None, total_items=None, current_page=None, total_pages=None):
        # to do ???
        pass

    @property
    def next_page(self):
        '''Number of the next result page; starts at 1.'''
        return self._next_page

    @next_page.setter
    def next_page(self, page):
        # keep plain attribute assignment (obj.next_page = n) working
        self._next_page = page


def valid_isbn(isbn):
    '''
    Normalize an ISBN and check that it has a valid format.

    Hyphens and all whitespace are removed and a lower-case "x" check digit
    is upper-cased. The cleaned value is returned when it consists of
    10 digits, 9 digits followed by "X", or 13 digits; otherwise (including
    for an empty or None argument) an empty string is returned.

    The check digit itself is not verified.
    '''
    if not isbn:
        return ''

    # split()/join drops every kind of whitespace, including a trailing
    # newline that would otherwise survive into the returned value
    cleaned = ''.join(isbn.replace('-', '').split()).upper()

    # \Z anchors at the true end of the string ("$" also matches just before
    # a final newline)
    if re.match(r'(?:[0-9]{9}[0-9X]|[0-9]{13})\Z', cleaned):
        return cleaned

    return ''


def fix_script(s):
    '''
    Escape every "</" inside a script element as "<\\/", leaving the final
    nine characters of s (the length of a closing "</script>" tag) unchanged.
    '''
    content, tail = s[:-9], s[-9:]
    return content.replace('</', r'<\/') + tail


def fix_html_for_beautifulsoup(page):
    '''
    Return page with a fixed sequence of regular expression clean-ups applied
    before it is handed to BeautifulSoup.

    The substitutions run in order, each on the result of the previous one.
    '''
    def escape_script(match):
        return fix_script(match.group())

    substitutions = (
        # prevent error: ValueError: unichr() arg not in range(0x10000) (narrow Python build)
        (r'&#x[0-9a-fA-F]{5,};', '?', 0),
        # eliminate conditional comments
        (r'<!--\[if .*?<!\[endif]-->', '', re.DOTALL),
        # eliminate malformed comments causing parse errors
        (r'<!-[^\-].*?-->', '', re.DOTALL),
        # prevent BeautifulSoup from missing the end of script tags
        ('<script.*?</script>', escape_script, re.DOTALL),
    )

    for pattern, replacement, flags in substitutions:
        page = re.sub(pattern, replacement, page, flags=flags)

    return page


def double_quote(s):
    '''
    Wrap s in double quotes, for searches that require a quoted search
    string. An empty string is returned as an empty string, without quotes.
    '''
    if s == '':
        return ''

    return '"%s"' % s


def text_only(element):
    '''
    Extract clean visible text from a web page element.

    Returns '' for None. For a NavigableString its own text is used; for any
    other element every <br> is first replaced (in place) with a space and
    the visible text nodes are concatenated. "&nbsp;" is treated as a space
    and all runs of whitespace are collapsed to single spaces.
    '''
    hidden_parents = ['style', 'script', '[document]', 'head', 'title']

    def visible(node):
        # text inside non-rendered containers, and comments, are not visible
        if node.parent.name in hidden_parents:
            return False
        return not re.match('<!--.*-->', str(node))

    if element is None:
        return ''

    if 'NavigableString' in str(type(element)):
        raw = str(element)
    else:
        # change to some text so that it doesn't disappear completely
        for line_break in element.findAll('br'):
            line_break.replaceWith(' ')

        raw = ''.join(node for node in element.findAll(text=True) if visible(node))

    words = raw.replace('&nbsp;', ' ').strip().split()
    return ' '.join(words)


def strip_scripts(html):
    '''
    Return html with every <script>...</script> element removed.

    Used to drop inline scripts that cause BeautifulSoup to parse some
    pages incorrectly.
    '''
    script_element = re.compile(r'<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>')
    return script_element.sub('', html)


def beautiful_soup(html, **kwargs):
    '''
    Parse html with BeautifulSoup, accepting BS4-style keyword arguments
    regardless of which BeautifulSoup version is in use.

    See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#porting-code-to-bs4
    '''
    if not have_bs3:
        # all entities automatically converted to string in bs4
        return BeautifulSoup(html, **kwargs)

    options = dict(kwargs)
    options["convertEntities"] = BeautifulSoup.HTML_ENTITIES     # HTML_ENTITIES only in bs3

    if "parse_only" in options:
        options["parseOnlyThese"] = options.pop("parse_only")    # old argument name

    return BeautifulSoup(html, **options)


def beautiful_soup_fix(html):
    '''
    Parse html with beautiful_soup() after first cleaning it up with
    fix_html_for_beautifulsoup().
    '''
    repaired = fix_html_for_beautifulsoup(html)
    return beautiful_soup(repaired)


def strip_html(html):
    '''
    Return the visible text of an HTML fragment: the fragment is prefixed
    with "<div>", parsed with beautiful_soup() and reduced by text_only().
    '''
    wrapped = "<div>" + html
    soup = beautiful_soup(wrapped)
    return text_only(soup)


def class_contains(a_class):
    '''
    Build an attrs dictionary whose 'class' entry is a compiled pattern
    matching a_class as a whole word. a_class is inserted into the pattern
    as given (it is not escaped).
    '''
    whole_word = re.compile(r'\b%s\b' % a_class)
    return {'class': whole_word}


def class_contains_all(classes):
    '''
    Build an attrs dictionary whose 'class' entry is a compiled pattern that
    matches only when every whitespace-separated name in classes occurs as a
    whole word, in any order. Names are inserted into the pattern as given.
    '''
    # one look-ahead per required name, followed by a match of the whole value
    parts = [r'(?=.*?\b%s\b)' % name for name in classes.split()]
    parts.append(r'^.*$')
    return {'class': re.compile(''.join(parts))}


def must_find(element, tag, attrs=None):
    '''
    Return the result of element.find(tag, attrs=attrs), raising
    LibraryError if nothing is found.

    attrs is an optional dictionary of attribute filters; values may be
    strings or compiled patterns. When omitted, a fresh empty dictionary is
    used for each call, so nothing can leak from one call to the next through
    a shared default.

    The error text has the form: missing "<tag> <name>=<value> ..."
    '''
    if attrs is None:
        attrs = {}

    result = element.find(tag, attrs=attrs)

    if not result:
        description = [tag] + ['%s=%s' % (a, pattern_str(v)) for a, v in attrs.items()]
        raise LibraryError('missing "%s"' % ' '.join(description))

    return result


def must_findAll(element, tag, attrs=None, recursive=True):
    '''
    Return the result of element.findAll(tag, attrs=attrs,
    recursive=recursive), raising LibraryError if the result is empty.

    attrs is an optional dictionary of attribute filters; values may be
    strings or compiled patterns. When omitted, a fresh empty dictionary is
    used for each call, so nothing can leak from one call to the next through
    a shared default.

    The error text has the form: missing "<tag> <name>=<value> ..."
    '''
    if attrs is None:
        attrs = {}

    result = element.findAll(tag, attrs=attrs, recursive=recursive)

    if not result:
        description = [tag] + ['%s=%s' % (a, pattern_str(v)) for a, v in attrs.items()]
        raise LibraryError('missing "%s"' % ' '.join(description))

    return result


def pattern_str(p):
    '''
    Return p as a string: the source pattern for a compiled regular
    expression, or the string form of anything else.
    '''
    source = p.pattern if isinstance(p, COMPILED_RE_TYPE) else p
    return str(source)


def must_findall_xpath(element, path):
    '''
    Return all results of element.xpath(path) (lxml), raising LibraryError
    with the text: missing "<path>" when there are none.
    '''
    found = element.xpath(path)

    if len(found) > 0:
        return found

    raise LibraryError('missing "%s"' % path)


def must_find_xpath(element, path):
    '''
    Return the first result of must_findall_xpath(element, path); that helper
    raises LibraryError when nothing matches.
    '''
    matches = must_findall_xpath(element, path)
    return matches[0]


def find_xpath(element, path):
    '''
    Return the first result of element.xpath(path), or None when there
    are no results.
    '''
    matches = element.xpath(path)

    if len(matches) == 0:
        return None

    return matches[0]


def lxml_text_only(element):
    '''
    Return the text of an lxml element with all runs of whitespace collapsed
    to single spaces and leading/trailing whitespace removed.
    '''
    # same as: etree.tostring(element, method="text")
    text = element.xpath("string()")
    words = text.strip().split()
    return ' '.join(words)
