﻿#!/usr/bin/env python
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2016, John Howell <jhowell@acm.org>'
__docformat__ = 'restructuredtext en'

import re
import urllib
import json
import cookielib
import mechanize
import urlparse

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_only_date

from calibre_plugins.overdrive_link.numbers import value_unit
from calibre_plugins.overdrive_link.formats import (
    FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF, FORMAT_ONLINE_READER, 
    FORMAT_M4B, FORMAT_MP3, FORMAT_WMA)
from calibre_plugins.overdrive_link.book import LibraryBook
from calibre_plugins.overdrive_link.language import LANGUAGE_CODE
from calibre_plugins.overdrive_link.library import SearchableLibrary
from calibre_plugins.overdrive_link.net import (browse_url, open_url, hostname_from_url, netloc_from_url)
from calibre_plugins.overdrive_link.match import (normalize_author, normalize_title)
from calibre_plugins.overdrive_link.parseweb import (
    LibraryError, class_contains, must_find, text_only, is_sign_in_form, is_redirect_form,
    set_card_number, set_card_pin)

'''
EBSCOhost
'''

# Host name used to build the EBSCOhost login, permalink and direct search urls
SEARCH_HOST = 'search.ebscohost.com'

# Database codes: the eBook database is the default (it is implied when a book id
# carries no database prefix); the audiobook database is added to a search only
# when an audiobook format is wanted.
EBOOK_DATABASE = 'nlebk'
AUDIOBOOK_DATABASE = 'nlabk'
PROFILE = 'ehost'           # value of the "profile" parameter in login urls
BOOK_ID_SEPARATOR = '/'     # separates the database from the accession number in a book id

RESERVED = {'and', 'or', 'not'}    # reserved words for searches (dropped from author/title terms)

IDS = ['sid', 'vid', 'tid', 'hid']  # session ids (url query parameters carried between requests)

def decode_book_id(book_id):
    '''
    Split a book id into its (database, accession number) parts.

    A book id is either a bare accession number, in which case the eBook
    database is assumed, or "<database><separator><accession number>".

    The result is always a 2-tuple. The id is split at the first separator
    only, so an id containing several separators still unpacks into two values
    (the remainder ends up in the accession number, where validation can
    report it).
    '''
    database, separator, accession_number = book_id.partition(BOOK_ID_SEPARATOR)

    if not separator:
        return (EBOOK_DATABASE, book_id)    # assume book database if not specified

    return (database, accession_number)
    
    
def encode_book_id(database, accession_number):
    '''
    Build a book id from a database code and an accession number.

    Ids in the eBook database are stored as the bare accession number; ids in
    any other database are prefixed with the database code and the separator.
    '''
    if database != EBOOK_DATABASE:
        return '%s%s%s' % (database, BOOK_ID_SEPARATOR, accession_number)

    # the eBook database is implied when an id has no database prefix
    return accession_number
        
                
class EBSCOhost(SearchableLibrary):
    id = 'eh'
    name = 'EBSCOhost'
    formats_supported = {FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF, FORMAT_ONLINE_READER, FORMAT_M4B, FORMAT_MP3, FORMAT_WMA}
    
    @staticmethod    
    def validate_library_id(library_id, migrate=True, config=None):
        # library_id = custid (library's EBSCOhost customer ID)
        
        if not re.match(r'^([0-9a-zA-Z]+)$', library_id):
            raise ValueError('EBSCOhost library id (customer id) must be alphanumeric: "%s"'%library_id)
                
        return library_id.lower()
         
    @staticmethod    
    def validate_book_id(book_id, library_id):
        '''
        Check and normalize a book id.

        A book id is an accession number, optionally prefixed by a database code
        and the book id separator. The id is lower-cased before checking.
        library_id is accepted but not used here.

        Returns the lower-cased book id.
        Raises ValueError if the id has more than one separator, the database
        code is not alphanumeric or the accession number is not numeric.
        '''
        book_id = book_id.lower()
        parts = decode_book_id(book_id)

        if len(parts) != 2:
            # report a malformed id clearly instead of failing on tuple unpacking below
            raise ValueError('EBSCOhost book id must contain at most one "%s": "%s"' % (BOOK_ID_SEPARATOR, book_id))

        database, accession_number = parts

        # database codes may contain digits as well as letters, so the message says alphanumeric
        if not re.match(r'^([0-9a-z]+)$', database):
            raise ValueError('EBSCOhost book id database must be alphanumeric: "%s"'%book_id)

        if not re.match(r'^([0-9]+)$', accession_number):
            raise ValueError('EBSCOhost book id accession number must be numeric: "%s"'%book_id)

        return book_id
            
    @staticmethod    
    def book_url(library_id, book_id):
        '''
        Return the permalink of an eBook/audiobook (article) record.

        library_id is not part of the link: the customer id (library) is ignored
        in EBSCOhost permalinks, so the user must log in to set cookies before
        the link can be followed.
        '''
        db_code, an = decode_book_id(book_id)
        link_fields = (SEARCH_HOST, db_code, an)

        return 'http://%s/login.aspx?direct=true&scope=site&db=%s&AN=%s' % link_fields
        

    @staticmethod    
    def book_key_library_id(library_id):
        return library_id   # has same book ids at all libraries, but different available formats

         
    def __init__(self):
        self.signed_in = False
        self.cookiejar = cookielib.CookieJar()

    
    
    def sign_in(self, use_credentials):
        '''
        Sign in to EBSCOhost using the authentication method named by the branch id.

        EBSCOhost requires authentication to perform searches

        EBSCO offers several different methods of authentication for users:
            IP Address (authtype=ip) - sign in from a computer located at the library or via VPN
            Referring URL (authtype=url) - redirected from library website after sign in
            Patterned IDs (authtype=cpid&custid=custid) - library-specific card number and pin
            Patron ID files (authtype=custuid&custid=custid) - library-specific user id and password
            User ID and Password (authtype=uid) - EBSCO user id and password
            Personal User Authentication (authtype=user) - EBSCO user id and password

            Cookie Authentication (http://search.ebscohost.com/login.aspx?authtype=cookie) - not supported (used for searches)
            Athens Authentication (authtype=athens) - not supported
            Shibboleth Authentication (authtype=shib) - not supported
            HTTPS Authentication - not supported

        Accepted branch ids:
            lib=<url> - load <url> and sign in through the form found on that page
            ref=<url> - request the authtype=url login page with <url> as the referer
            cpid, custuid, ip, uid, user - request the matching EBSCOhost login page

        A url given without a scheme gets "http://" prepended; an explicit http:// or
        https:// scheme is kept as given.

        use_credentials: only affects logging. Whenever a sign in form has to be filled in,
            the card number and pin are used even if this is False.

        On success self.signed_in is set. self.search_url and self.ids are filled in from
        the url that the sign in ended on.

        Raises LibraryError if the method is unsupported/unknown, credentials are missing
        or the sign in fails. Raises ValueError if no sign in cookie is found or the
        customer id reported by EBSCOhost differs from the library id.
        '''

        def with_scheme(address):
            # previously only "http://" was recognized, which turned https urls into "http://https://..."
            if address.lower().startswith(('http://', 'https://')):
                return address

            return 'http://' + address

        self.log.info('Using %s authentication for %s' % (self.branch_id, self.name))

        self.search_url = None
        self.ids = {}

        br = mechanize.Browser()
        br.set_cookiejar(self.cookiejar)

        referer = None
        if self.branch_id.startswith('lib='):
            url = with_scheme(self.branch_id[4:])

        elif self.branch_id.startswith('ref='):
            referer = with_scheme(self.branch_id[4:])

            url = 'http://%s/login.aspx?authtype=url&profile=%s&defaultdb=%s' % \
                (SEARCH_HOST, PROFILE, EBOOK_DATABASE)

        elif self.branch_id in ['cpid', 'custuid']:
            url = 'http://%s/login.aspx?authtype=%s&custid=%s&profile=%s&defaultdb=%s' % \
                (SEARCH_HOST, self.branch_id, self.library_id, PROFILE, EBOOK_DATABASE)

        elif self.branch_id in ['ip', 'uid', 'user']:
            url = 'http://%s/login.aspx?authtype=%s&profile=%s&defaultdb=%s' % \
                (SEARCH_HOST, self.branch_id, PROFILE, EBOOK_DATABASE)

        elif self.branch_id == 'url':
            raise LibraryError('Referring URL authentication not supported. Try lib=... instead.')

        elif self.branch_id == 'athens':
            raise LibraryError('Athens authentication not supported')

        elif self.branch_id == 'shib':
            raise LibraryError('Shibboleth authentication not supported')

        else:
            raise LibraryError('Unknown authentication type (branch id): %s' % self.branch_id)

        request = mechanize.Request(url)

        # referer and ip authentication need no credentials; every other method uses a sign in form
        if not (referer or self.branch_id == 'ip'):
            # load the sign in page
            browse_url(self.log, br, request)

            # Select the sign in form
            try:
                if len(list(br.forms())) == 1:
                    br.select_form(nr=0)    # only one form on page
                else:
                    br.select_form(predicate=is_sign_in_form)   # Exception if not found

            except Exception:
                # not a bare except: KeyboardInterrupt/SystemExit must not be reported as a missing form
                raise LibraryError('Missing sign in form')

            # User credentials
            if not self.card_number:
                raise LibraryError('Library requires credentials for sign in.')

            if not use_credentials:
                self.log.warn('Library requires credentials for sign in. -- Ignoring skip login tweak.')

            set_card_number(self.log, br.form, self.card_number)
            set_card_pin(self.log, br.form, self.card_pin)

            request = None # Submit form

        # sign in
        while True:
            page = browse_url(self.log, br, request, referer=referer)

            redirect_url = br.geturl()
            soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)

            error = soup.find('span', attrs={'class':'error'})
            if error:
                raise LibraryError('Sign in failed. %s'%text_only(error))

            if 'Sorry, we could not validate' in page:
                raise LibraryError('Sign in failed - Incorrect card number or PIN')

            if '[Authentication Error Code ' in page:
                self.log.info(text_only(soup))
                raise LibraryError('Sign in failed - Authentication Error')

            self.log.info('Redirected to %s' % redirect_url)
            redirect_hostname = hostname_from_url(redirect_url)

            # substring test rather than endswith, so that host names rewritten by a proxy still match
            if 'ebscohost.com' in redirect_hostname:
                # possible access via a proxy. save url for future use in searching
                self.search_url = urlparse.urljoin(redirect_url, '/ehost/Search/PerformSearch')

                parsed_query = urlparse.parse_qs(urlparse.urlparse(redirect_url).query)
                for id_name in IDS:
                    self.ids[id_name] = parsed_query.get(id_name, [''])[0]

                break

            # may need to submit a form to perform redirection
            try:
                br.select_form(predicate=is_redirect_form)  # Exception if not found

            except Exception:
                raise LibraryError('Sign in failed - unexpected url: %s'%redirect_url)

            # Submit the selected form on the next pass. Without this the ip/ref= paths would
            # request the original login url again instead of following the redirection.
            # NOTE(review): relies on browse_url submitting the selected form when request is None,
            # as the sign in form path above does - confirm against browse_url.
            request = None

        if not self.search_url:
            # expect a cookie named "webauth" for ".ebscohost.com"
            if not any(cookie.name == 'webauth' for cookie in self.cookiejar):
                self.log.info('cookiejar: %s' % unicode(self.cookiejar))
                raise ValueError('Failed to find sign in cookie')

        # verify customer id
        eis_cust = br.response().info().getheader("eis-cust")   # may be present in response header
        if not eis_cust:
            match = re.search(r'<!-- ([a-zA-Z0-9._]+) +-->', page) # comment: <!-- user: brooklyn.main.ehost  -->
            if match:
                eis_cust = match.group(1)

        if eis_cust:
            custid = eis_cust.split('.')[0]     # customer id is the first dot-separated field

            if custid != self.library_id:
                raise ValueError('Library id must match EBSCOhost customer id "%s"'%custid)

        self.log.info('Sign in successful')
        self.signed_in = True
    
    def find_books(self, books, search_author, search_title, keyword_search, find_recommendable):
        '''
        Search EBSCOhost for books that match an author/title (or subsets thereof).

        Every result found is added to books as a LibraryBook. Details of each result
        (id, title, authors, ISBN, publisher, date) come from a separate "hover preview"
        request per result.

        books: collection that receives the results through its add method
        search_author: author words to search for (may be empty)
        search_title: title words to search for (may be empty)
        keyword_search, find_recommendable: not used by this library

        Returns True when the search matches more than MAX_RESULTS_ALLOWED results, in
        which case no results are collected (presumably this tells the caller that the
        search was too broad - confirm against the caller). Returns False otherwise,
        including when the library is not signed in.

        Raises LibraryError if EBSCOhost reports an error, a page cannot be interpreted or
        the number of results processed does not match the number reported.
        '''

        if not self.signed_in:
            self.log.info('Cannot perform search at %s due to sign in failure' % self.name)
            return False

        def to_count(text):
            # accept thousands separators (e.g. "1,234") so that a large result count
            # reaches the MAX_RESULTS_ALLOWED check instead of failing in int()
            return int(text.replace(',', ''))

        search_formats = self.config.search_formats
        dbs = set()
        fmt = set()

        if FORMAT_ADOBE_EPUB in search_formats:
            dbs.add(EBOOK_DATABASE)
            fmt.add('EK')   # EPUB

        if FORMAT_ADOBE_PDF in search_formats:
            dbs.add(EBOOK_DATABASE)
            fmt.add('EB')   # PDF

        if FORMAT_ONLINE_READER in search_formats:
            dbs.add(EBOOK_DATABASE)
            fmt.add('EK')   # EPUB
            fmt.add('EB')   # PDF

        if FORMAT_MP3 in search_formats:
            dbs.add(AUDIOBOOK_DATABASE)
            fmt.add('E3')   # MP3

        if FORMAT_WMA in search_formats:
            dbs.add(AUDIOBOOK_DATABASE)
            fmt.add('EA')   # WMA

        if FORMAT_M4B in search_formats:
            dbs.add(AUDIOBOOK_DATABASE)
            fmt.add('E4')   # M4B

        # always specify ebook database to avoid search failures at libs without audiobooks
        # (so the database list can never be empty)
        dbs.add(EBOOK_DATABASE)

        databases = ','.join(sorted(dbs))   # sorted to keep the request url deterministic

        qry = []

        if search_author:
            qry.append('AU (%s)'%(' AND '.join([a for a in search_author.lower().split() if a not in RESERVED])))

        if search_title:
            qry.append('TI (%s)'%(' AND '.join([t for t in search_title.lower().split() if t not in RESERVED])))

        if fmt:
            qry.append('FM (%s)'%(' OR '.join(sorted(fmt))))

        if self.config.search_language in LANGUAGE_CODE:
            language = self.config.search_language
            qry.append('LA (%s)'%LANGUAGE_CODE[self.config.search_language])    # Searches for the language in which a book is published
        else:
            language = ''

        query = ' AND '.join(qry)
        # eg: TI+(leadership)+AND+AU+(charles+AND+m.+AND+carroll)

        page_num = 1
        total_results = 0
        results_processed = 0

        RESULTS_PER_PAGE = 10           # only used to report the page count in the log
        MAX_RESULTS_ALLOWED = 500

        if self.search_url:
            # have a specific url to use when performing searches

            # assume that the default database selection is appropriate for searching e/audio books

            data = {}
            for id_name in IDS:
                if self.ids[id_name]:
                    data[id_name] = self.ids[id_name]

            url = '%s?%s' % (self.search_url, urllib.urlencode(data))

            # form fields posted for an advanced search
            data = {}
            data["RelRequestPath"] = "search/advanced"
            data["__EVENTTARGET"] = ""
            data["__EVENTARGUMENT"] = ""
            data["__sid"] = self.ids.get('sid', '')
            data["__vid"] = self.ids.get('vid', '')
            data["__CUSTOMVIEWSTATE"] = ""
            data["__VIEWSTATE"] = ""
            data["SearchTerm"] = query
            data["PerformSearchSettingValue"] = "3"
            data["searchMode"] = "Bool"
            data["ajax"] = "enabled"

            enc_data = urllib.urlencode(data)  # urlencode uses quote_plus without safe values

        else:
            url = 'http://%s/login.aspx?direct=true&bQuery=%s&db=%s&site=ehost-live' % (
                SEARCH_HOST, urllib.quote_plus(query.encode("utf-8"), "()'"), databases)
            enc_data = None

        while True:

            response = open_url(self.log, url, enc_data, cookiejar=self.cookiejar)

            # remember the session ids of the page actually reached; they are needed for detail requests
            response_url = response.geturl()
            parsed_query = urlparse.parse_qs(urlparse.urlparse(response_url).query)
            for id_name in IDS:
                self.ids[id_name] = parsed_query.get(id_name, [''])[0]

            # Parse the html results for analysis
            soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

            error_s = soup.find('span', attrs={'id':"lblErrMsg"})
            if error_s:
                raise LibraryError(text_only(error_s))

            warning_s = soup.find('span', attrs={'class':"std-warning-text"})
            if warning_s:
                warning = text_only(warning_s)
                if 'No results were found.' in warning:
                    break

                raise LibraryError(warning)

            warning_s = soup.find('span', attrs={'class':"smart-text-ran-warning"})
            if warning_s:
                warning = text_only(warning_s)
                if 'Your initial search query did not yield any results.' in warning:
                    break

                raise LibraryError(warning)

            if 'Please enter your search terms again.' in response.data:
                raise LibraryError('Malformed query')

            if 'The page you have tried to access has expired.' in response.data:
                raise LibraryError('Session expired')

            if 'The article or journal you have requested is <b>not</b> available' in response.data:
                break   # database not available at this library

            # determine total number of pages of results
            page_title_h2 = must_find(soup, 'h2', attrs={'class':'page-title alt'})

            # EG: "Search Results: 21 - 30 of 244"
            result_count_text = text_only(page_title_h2)
            result_count_list = result_count_text.split()

            if (len(result_count_list) < 7 or result_count_list[0] != 'Search' or result_count_list[1] != 'Results:' or
                        result_count_list[3] != '-' or result_count_list[5] != 'of'):
                raise LibraryError('Unexpected pagingTitleCount %s'%result_count_text)

            # each page must continue exactly where the previous one ended
            first_result = to_count(result_count_list[2])
            if first_result != results_processed + 1:
                raise LibraryError('Unexpected first result %d instead of %d'%(first_result, results_processed + 1))

            last_result = to_count(result_count_list[4])

            new_total_results = to_count(result_count_list[6])
            if total_results and (new_total_results != total_results):
                raise LibraryError('Total results changed from %d to %d'%(total_results, new_total_results))

            total_results = new_total_results
            total_pages = ((total_results - 1) // RESULTS_PER_PAGE) + 1  # floor division

            self.log.info('Response: page %d of %d. %s-%s of %d total results'%(page_num,
                total_pages, first_result, last_result, total_results))

            if total_results > MAX_RESULTS_ALLOWED:
                return True

            result_list_ctl = must_find(soup, 'div', attrs={'id':'resultListControl'})

            for result in result_list_ctl.findAll('li', attrs={'class':'result-list-li'}, recursive=True):
                # Start of book info

                formats = set()

                ebook_full_text_a = result.find('a', attrs={'id':re.compile('^ebookft[0-9]+$')})
                if ebook_full_text_a:
                    formats.add(FORMAT_ONLINE_READER)

                ebook_download_a = result.find('a', attrs={'id':re.compile('^eBookDownload[0-9]+$')})
                if ebook_download_a:
                    data_ebook = ebook_download_a.get('data-ebook','')  # NL$149143$PDF
                    if '$EPUB' in data_ebook:
                        formats.add(FORMAT_ADOBE_EPUB)
                    elif '$PDF' in data_ebook:
                        formats.add(FORMAT_ADOBE_PDF)
                    else:
                        # unknown - assume both (How are multiple formats for same AN indicated?)
                        formats.add(FORMAT_ADOBE_EPUB)
                        formats.add(FORMAT_ADOBE_PDF)

                audiobook_download_a = result.find('a', attrs={'id':re.compile('^aBookDownload[0-9]+$')})
                if audiobook_download_a:
                    # can't tell which specific format - assume all
                    formats.add(FORMAT_MP3)
                    formats.add(FORMAT_M4B)
                    formats.add(FORMAT_WMA)
                    # data_abook = audiobook_download_a.get('data-abook','') # NL$343313$CVR

                # translate link to full book display into jquery for book details

                # ebookviewer/ebook/bmxlYmtfXzIwMDkyMzVfX0FO0?
                #  sid=840e9b6d-cc86-43c4-989b-1f865f4c2adf@sessionmgr198&vid=1&hid=124&format=EB

                for record_type_a in result.findAll('a', attrs=class_contains('record-type'), recursive=True):
                    # class = 'record-type ebook-ft', 'record-type pdf-ft', 'record-type epub'
                    href = record_type_a.get('href', '#')   # a link without href is skipped like a '#' placeholder
                    if href != '#':
                        self.log.info('%s href: %s' % (record_type_a['class'], href))
                        break
                else:
                    raise LibraryError('Missing record-type viewer link')

                # the last path segment of the viewer link is the encoded record id
                purl = urlparse.urlparse(href)
                encid = purl.path.rsplit('/', 1)[1]
                parsed_query = urlparse.parse_qs(purl.query)

                # http://web.b.ebscohost.com/ehost/Detail/HoverPreview/bmxlYmtfXzIwMDkyMzVfX0FO0?
                #  sid=840e9b6d-cc86-43c4-989b-1f865f4c2adf@sessionmgr198&vid=1&theIsRelatedInfo=false&theResultListId=

                # a relative viewer link has no host, so fall back to the host of the results page
                detail_url = 'http://%s/ehost/Detail/HoverPreview/%s?sid=%s&vid=%s&theIsRelatedInfo=false&theResultListId=' % \
                    (purl.netloc if purl.netloc else netloc_from_url(response_url), encid,
                    self.ids.get('sid',''), parsed_query.get('vid',[''])[0])

                detail_response = open_url(self.log, detail_url, cookiejar=self.cookiejar)

                detail = json.loads(detail_response.data)      # Parse the json results

                # Example of the json returned:
                # {"TargetId":null,"HeaderLinks":[],"FooterLinks":[],"RecordHasEisExtLink":false,"RecordHasLinkOut":false,"LinkOutUrlIp":"",
                # "DB":"nlebk","MID":"","UITerm":"2009235","UITag":"AN","ResultID":"5","IsInFolder":false,"ResultListIndex":"5","ThumbnailId":null,
                # "Title":"Jabberwocky","PubType":"eBook","IssnIsbn":"9780585019376","JnBookTitle":"Jabberwocky","ContentType":"eBook",
                # "IsShowDbNameOnResultListPreview":false,"DisplayValues":{"Title":"Jabberwocky","Authors":["Carroll, Lewis"],
                # "Source":"","Date":"","Publication":"Wiretap, ","PublicationType":"eBook",
                # "Subjects":["Alice (Fictitious character : Carroll)--Juvenile fiction; Children\u0027s poetry, English"],
                # "Abstract":"","Database":"","Duration":""}}

                # the book id is built from the accession number, so UITerm must be tagged as one
                if detail["UITag"] != "AN":
                    raise LibraryError('Unexpected UITag %s'%detail["UITag"])

                book_id = encode_book_id(detail["DB"], detail["UITerm"])

                display_values = detail["DisplayValues"]

                title = normalize_title(display_values["Title"])

                authors = [normalize_author(author, unreverse=True) for author in display_values["Authors"]]

                isbn = detail.get("IssnIsbn", "")

                # publication is "publisher, date"; either part may be empty (eg: "Wiretap, ")
                publication = display_values.get("Publication", '')

                if ',' in publication:
                    publisher, pubdate_s = publication.rsplit(',', 1)
                else:
                    publisher = publication
                    pubdate_s = ''

                pubdate_s = pubdate_s.strip()

                if pubdate_s:
                    pubdate = parse_only_date(pubdate_s, assume_utc=True)
                else:
                    pubdate = None

                lbook = LibraryBook(authors=authors, title=title, formats=formats, pubdate=pubdate,
                        available=True, lib=self, book_id=book_id, isbn=isbn, publisher=publisher,
                        language=language, search_author=search_author)

                self.log.info('Found %s'%repr(lbook))
                books.add(lbook)

                results_processed += 1

            page_num += 1

            if results_processed >= total_results:
                break

            # prepare request for next page

            aspnet_form = must_find(soup, 'form', attrs={'id':"aspnetForm"})
            url = urlparse.urljoin(response_url, aspnet_form['action'])

            data = {}

            # carry over the state fields of the current page
            for input_field in ["RelRequestPath", "__sid", "__vid", "__CUSTOMVIEWSTATE","__VIEWSTATE"]:
                inf = aspnet_form.find('input', id=input_field)
                if inf:
                    data[input_field] = inf.get('value', '')    # an input without a value posts as empty

            data["__EVENTTARGET"] = "ctl00$ctl00$MainContentArea$MainContentArea$bottomMultiPage$lnkNext" # next page
            data["__EVENTARGUMENT"] = ""

            data["SearchTerm"] = query
            data["PerformSearchSettingValue"] = "0"
            data["_sort_"] = "Hits"
            data["_sort_order_"] = "Desc"
            data["_clusterId_"] = "Subject"
            data["ajax"] = "enabled"

            enc_data = urllib.urlencode(data)

        if results_processed != total_results:
            raise LibraryError('Expected %s but found %d'%(value_unit(total_results,'result'), results_processed))

        return False
