﻿#!/usr/bin/env python
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2016, John Howell <jhowell@acm.org>'
__docformat__ = 'restructuredtext en'

import re
import urllib
import urllib2
import time
import json
import mechanize
import cookielib
import urlparse
import random

from calibre.utils.config_base import tweaks
from calibre.utils.date import parse_only_date
from calibre.ebooks.BeautifulSoup import BeautifulSoup

from calibre_plugins.overdrive_link.numbers import (value_unit)
from calibre_plugins.overdrive_link.book import (LibraryBook, InfoBook, unique_authors)
from calibre_plugins.overdrive_link.formats import (FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF,
    FORMAT_BOOKREADER, FORMAT_OPEN_PDF, FORMAT_OPEN_EPUB,
    FORMAT_MOBI_EBOOK, FORMAT_PROTECTED_DAISY, FORMAT_DJVU)
from calibre_plugins.overdrive_link.titlecase import titlecase
from calibre_plugins.overdrive_link.language import (LANGUAGE_CODE, LANGUAGE_NAME)
from calibre_plugins.overdrive_link.library import SearchableLibrary
from calibre_plugins.overdrive_link.net import (browse_url, open_url)
from calibre_plugins.overdrive_link.match import (normalize_author, normalize_title)
from calibre_plugins.overdrive_link.parseweb import (LibraryError, valid_isbn, text_only, must_find, class_contains)
from calibre_plugins.overdrive_link.json import js_value
from calibre_plugins.overdrive_link.numbers import range_str
from calibre_plugins.overdrive_link.tweak import (TWEAK_OL_BORROW_URL, TWEAK_OL_SCALE, TWEAK_OL_START_FROM)

# Name used inside the plugin for the Open Library 'Read' collection
READ_COLLECTION = '*READ*'

# Serves as a library id and/or as a collection name
PRINTDISABLED = 'printdisabled'

# Attribute names that the members of each LIBRARY_ID_INFO tuple are stored
# under, in tuple order (see OpenLibrary.sign_in)
LIBRARY_ID_INFO_ATTRS = ('collections', 'subjects', 'default_formats')

# library id -> (collection set, subject set, default format set)
LIBRARY_ID_INFO = {
    # open 'Read' books, available to all, open access - no need to borrow
    'read': (
        {READ_COLLECTION},
        set(),
        set(),
    ),

    # open 'Lending Library' collection, available to all
    'lendinglibrary': (
        {'lendinglibrary'},
        {'lending library'},
        {FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF, FORMAT_BOOKREADER},
    ),

    # 'In Library' collection, restricted to library patrons
    # (the open lending library is included as well)
    'inlibrary': (
        {'inlibrary', 'lendinglibrary'},
        {'in library', 'lending library'},
        {FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF, FORMAT_BOOKREADER},
    ),

    # Accessible editions for the disabled
    PRINTDISABLED: (
        {PRINTDISABLED},
        {'accessible book', 'protected daisy'},
        {FORMAT_PROTECTED_DAISY},
    ),
}

# Every library id accepted by OpenLibrary.validate_library_id
ALLOWED_LIBRARY_IDS = set(LIBRARY_ID_INFO)


# Text of a download link on an archive.org book page (lower case, with any
# trailing ' download' removed) -> the format that link represents.
#
# Deliberately absent:
#   'read online' (FORMAT_BOOKREADER) -- no longer used by IA; the reader is
#       embedded on the book page.
#   'daisy' (FORMAT_PROTECTED_DAISY) -- intentionally removed to prevent false
#       detection of 'read' books.
IA_LINK_FORMATS = {
    'djvu': FORMAT_DJVU,
    'epub': FORMAT_OPEN_EPUB,
    'kindle': FORMAT_MOBI_EBOOK,
    'pdf': FORMAT_OPEN_PDF,
}
    
    
def ia_formats_key(ia_ident):
    '''
    Return the cache key under which the formats found for the Internet
    Archive identifier ia_ident are stored.
    '''
    key_template = "%s@ia-formats/"
    return key_template % ia_ident
    

def ia_collections_key(ia_ident):
    '''
    Return the cache key under which the collections found for the Internet
    Archive identifier ia_ident are stored.
    '''
    key_template = "%s@ia-collections/"
    return key_template % ia_ident
    

class OpenLibrary(SearchableLibrary):
    '''
    Searchable library implementation for Open Library (openlibrary.org), whose
    books are hosted at the Internet Archive (archive.org).

    The library id selects the collections, subjects and default formats that
    are used (see LIBRARY_ID_INFO); they are attached to the instance by sign_in.

    Attributes such as log, config, cache, library_id, card_number, card_pin and
    the when_obtainable method are not defined here -- presumably they are
    supplied by SearchableLibrary (confirm against that class).
    '''
    id = 'ol'
    name = 'Open Library'
    formats_supported = {FORMAT_ADOBE_EPUB, FORMAT_ADOBE_PDF, FORMAT_BOOKREADER, FORMAT_PROTECTED_DAISY,
        FORMAT_OPEN_PDF, FORMAT_OPEN_EPUB, FORMAT_MOBI_EBOOK, FORMAT_DJVU}
    allow_format_merge = True           # different formats may be discovered for different instances of the same book key
    max_expected_links_per_book = 6     # higher limit for Open Library since many editions may be available
    sign_in_affects_check_obtainable = True     # sign in needed to detect current holds

    @staticmethod
    def validate_library_id(library_id, migrate=True, config=None):
        '''
        Return the validated library id.

        An empty id is replaced by 'inlibrary' when migrate is true (migration
        from old plugin versions). Raises ValueError for any id that is not in
        ALLOWED_LIBRARY_IDS. The config argument is not used here.
        '''
        if migrate and library_id == '':
            library_id = 'inlibrary'    # migrate from old plugin versions

        if library_id not in ALLOWED_LIBRARY_IDS:
            # sorted so that the message is the same on every run
            raise ValueError('Open Library library-id may only be %s, found: "%s"' % (
                ' or '.join(sorted(ALLOWED_LIBRARY_IDS)), library_id))

        return library_id

    @staticmethod
    def validate_book_id(book_id, library_id):
        '''
        Return book_id if it has the form OL###W (a work) or OL###M (an edition
        of a work); raise ValueError otherwise. The library_id is not used.
        '''
        if not re.match(r'^OL([0-9]+)[MW]$', book_id):
            raise ValueError('Open Library book id must be OL###M or OL###W: "%s"'%book_id)

        return book_id

    @staticmethod
    def book_url(library_id, book_id):
        '''
        Return the openlibrary.org page URL for an edition (id ending in 'M') or
        a work (id ending in 'W'). Raises ValueError for any other id.
        '''
        if book_id.endswith('M'):
            return 'https://openlibrary.org/books/%s'%book_id   # single edition

        if book_id.endswith('W'):
            return 'https://openlibrary.org/works/%s'%book_id   # work (set of editions)

        raise ValueError('Open Library book id must be OL###M or OL###W: "%s"'%book_id)

    @staticmethod
    def book_key_library_id(library_id):
        '''Return the library id unchanged.'''
        return library_id   # has same book ids at all libraries, but different available formats

    def __init__(self):
        self.cookiejar = cookielib.CookieJar()
        self.ia_idents_cache = {}       # book id -> list of Internet Archive identifiers from the last search
        self.verify_collections = set() # book id's needing extra check of collections from IA

    def sign_in(self, use_credentials):
        '''
        Attach the collection/subject/format sets for this library id to the
        instance and, when a card number is configured and use_credentials is
        true, sign in to openlibrary.org using the shared cookie jar.

        Raises LibraryError if the resulting page does not show a successful
        sign in.
        '''
        # always needed, signed in or not
        for attrib, value in zip(LIBRARY_ID_INFO_ATTRS, LIBRARY_ID_INFO[self.library_id]):
            setattr(self, attrib, value)

        if not (self.card_number and use_credentials):
            return  # cannot sign in

        self.log.info('Signing in to %s' % self.name)

        self.signin_required = True
        br = mechanize.Browser()
        br.set_cookiejar(self.cookiejar)

        browse_url(self.log, br, mechanize.Request('https://openlibrary.org/account/login'))

        # Select the Sign in form
        br.select_form(predicate=lambda f: 'id' in f.attrs and f.attrs['id'] == "register") # Exception if not found

        # User credentials
        br.form["username"] = self.card_number
        br.form["password"] = self.card_pin

        # Login
        page = browse_url(self.log, br, None)   # submit form

        if "The username you entered isn't in the Open Library system." in page:
            raise LibraryError('Sign in failed. Check card number and PIN.')

        if '>Log out</a>' not in page:
            raise LibraryError('Sign in unsuccessful.')

        self.log.info('Sign in successful')
        self.signed_in = True

    def find_books(self, books, search_author, search_title, keyword_search, find_recommendable):
        '''
        Search Open Library for books that match an author/title (or subsets thereof).

        books = Set of Books to be updated
        search_author, search_title = search terms; an empty value is left out of the query
        keyword_search = if true, search_title is sent as a subject instead of a title
        find_recommendable = not used by this library

        Raises LibraryError on an error reported by Open Library or when the
        paging of the results is not as expected. Always returns False.
        '''
        results_processed = 0
        total_results = 0
        total_pages = 1
        page_num = 1
        retry_count = 0

        RESULTS_PER_PAGE = 100
        MAX_RETRIES = 5

        while page_num <= total_pages:
            # see: https://openlibrary.org/developers/api
            query = {'has_fulltext': 'true'}    # show only eBooks

            if search_author:
                query['author'] = search_author

            if search_title:
                if keyword_search:
                    query['subject'] = search_title
                else:
                    query['title'] = search_title

            if page_num > 1:
                query['page'] = unicode(page_num)

            # query returns no results when these search options are used
            # if self.config.search_language in LANGUAGE_CODE: query['language'] = LANGUAGE_CODE[self.config.search_language]

            # Note: OpenLibrary allows multiple subject_facet keys in a query, but does 'and', not 'or'
            # query['subject_facet'] = list(self.subjects)

            response = open_url(self.log, 'https://openlibrary.org/search.json?%s'%urllib.urlencode(query, doseq=True))
            result = json.loads(response.data)      # Parse the json results

            error = result.get("error", "")
            if error:
                if error.startswith("No JSON object could be decoded"):
                    # This occurs occasionally due to an internal problem at Open Library
                    retry_count += 1
                    if retry_count <= MAX_RETRIES:
                        self.log.warn('Retrying on Open Library error: %s' % error)
                        time.sleep(30)  # delay before retry to give temporary problems time to clear
                        continue        # same page again

                raise LibraryError('Open Library error: %s' % error)

            retry_count = 0
            total_results = result["numFound"]
            total_pages = ((total_results - 1) // RESULTS_PER_PAGE) + 1  # floor division

            if result["start"] != results_processed:
                raise LibraryError('Expected start %d on page %d but found %d'%(results_processed, page_num, result["start"]))

            for doc in result["docs"]:
                lbook = self._book_from_doc(doc, search_author)
                if lbook is not None:
                    self.log.info('Found %s'%repr(lbook))
                    books.add(lbook)

                # every doc counts toward the total, whether or not it was usable
                results_processed += 1

            page_num += 1

        if results_processed != total_results:
            raise LibraryError('Expected %s but found %d'%(value_unit(total_results,'result'), results_processed))

        return False

    def _book_from_doc(self, doc, search_author):
        '''
        Build a LibraryBook from one search result doc, or return None when the
        doc is incomplete or does not belong to this library.

        A doc describes a 'work' rather than the individual editions that are
        lendable. As a side effect the Internet Archive identifiers of the doc
        are remembered in ia_idents_cache, and the book id is added to
        verify_collections when only its subjects (not its collections) match.
        '''
        if not (doc.get("has_fulltext", False) and
                ("key" in doc) and ("title" in doc) and ("author_name" in doc)):
            return None

        book_id = doc["key"].rpartition('/books/')[2].rpartition('/works/')[2]  # drop prefix if present
        self.ia_idents_cache[book_id] = doc.get("ia", [])

        if 'ia_collection_s' in doc:
            collections = set(doc['ia_collection_s'].split(';'))
        else:
            collections = set()

        subjects = {s.lower() for s in (doc.get('subject') or [])}

        is_in_lib_collection = not self.collections.isdisjoint(collections)

        if (not is_in_lib_collection) and (not self.subjects.isdisjoint(subjects)):
            # collection info seems to be incomplete sometimes. Possibly just for the first edition listed of the work,
            # which is usually, but not always the one with an e-book available (if any). Can double check by getting
            # the collection info for the book found at the Internet Archive.
            self.verify_collections.add(book_id)

        if not (is_in_lib_collection or (book_id in self.verify_collections) or (READ_COLLECTION in self.collections)):
            return None

        title = titlecase(doc["title"])
        if "subtitle" in doc:
            title = '%s: %s'%(title, titlecase(doc["subtitle"]))

        title = normalize_title(title)

        # remove duplicates while preserving order
        authors = unique_authors([normalize_author(a, unreverse=True, fix_case=True) for a in doc["author_name"]])

        # list valued fields may be present but empty, so test before taking the first entry
        publishers = doc.get("publisher")
        publisher = publishers[0] if publishers else ''

        # "publish_date" is not used since it comes in odd formats
        if "first_publish_year" in doc:
            pubdate = parse_only_date(unicode(doc["first_publish_year"]), assume_utc=True)
        elif doc.get("publish_year"):
            pubdate = parse_only_date(unicode(doc["publish_year"][0]), assume_utc=True)
        else:
            pubdate = None

        isbns = doc.get("isbn")
        isbn = valid_isbn(isbns[0]) if isbns else ''

        language = ''
        languages = doc.get("language")
        if (self.config.search_language in LANGUAGE_CODE) and languages:
            if LANGUAGE_CODE[self.config.search_language] in languages:
                language = self.config.search_language
            else:
                # fall back to the first language listed, by name when the code is known
                language = languages[0]
                if language in LANGUAGE_NAME:
                    language = LANGUAGE_NAME[language]

        formats = set()
        if is_in_lib_collection:
            formats.update(self.default_formats)

        return LibraryBook(authors=authors, title=title,
                isbn=isbn, publisher=publisher, pubdate=pubdate, language=language, formats=formats,
                available=True, lib=self, book_id=book_id, search_author=search_author)

    def _open_ia_page(self, url, ia_ident):
        '''
        Fetch url from archive.org and return the response, or return None when
        the server answers 404 (Not Found), which can occur for links to
        non-existent books at IA. Any other error is propagated.
        '''
        try:
            return open_url(self.log, url)

        except urllib2.HTTPError as e:
            if e.code != 404:
                raise

            self.log.info('Missing book page for %s at archive.org' % ia_ident)
            return None

    def get_book_info(self, book_id):
        '''
        Return an InfoBook holding the formats available for book_id, or None
        when a book flagged for collection verification turns out not to be in
        any collection of this library.

        The Internet Archive identifiers recorded by find_books are used, so
        book_id must have been seen in a search by this instance (KeyError
        otherwise when identifiers are needed). Per-identifier results are
        stored through self.cache.
        '''
        self.log.info('Getting book info for %s at %s' % (book_id, self.name))
        collections = set()
        formats = set()

        if READ_COLLECTION in self.collections:
            # Open readable Internet Archive books - check IA links for formats available (if any)
            for ia_ident in self.ia_idents_cache[book_id]:
                ia_formats = self.cache.get_data(ia_formats_key(ia_ident))

                if ia_formats is None:
                    # need to check web page for available formats since API can list more than are actually allowed
                    ia_response = self._open_ia_page('https://archive.org/details/%s' % ia_ident, ia_ident)
                    if ia_response is None:
                        continue    # no such book at IA

                    ia_formats = set()

                    soup = BeautifulSoup(ia_response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

                    for format_group in soup.findAll('div', attrs={'class': 'format-group'}):
                        dl_link = format_group.find('a', attrs=class_contains('download-pill'))
                        if dl_link:
                            dl_link_text = text_only(dl_link).lower().strip()
                            if dl_link_text.endswith(' download'):
                                dl_link_text = dl_link_text[:-len(' download')]

                            if dl_link_text in IA_LINK_FORMATS:
                                ia_formats.add(IA_LINK_FORMATS[dl_link_text])
                                ia_formats.add(FORMAT_BOOKREADER)   # assume online reader also if any other format is found

                    self.cache.save_data(ia_formats_key(ia_ident), list(ia_formats))

                if ia_formats:
                    formats.update(ia_formats)
                    collections.add(READ_COLLECTION)

        if book_id in self.verify_collections:
            for ia_ident in self.ia_idents_cache[book_id]:
                ia_collections = self.cache.get_data(ia_collections_key(ia_ident))

                if ia_collections is None:
                    ia_response = self._open_ia_page('https://archive.org/details/%s&output=json' % ia_ident, ia_ident)
                    if ia_response is None:
                        continue    # no such book at IA

                    ia_result = json.loads(ia_response.data)      # Parse the json results
                    ia_collections = set(ia_result.get("metadata", {}).get("collection", []))
                    self.cache.save_data(ia_collections_key(ia_ident), list(ia_collections))

                if ia_collections:
                    collections.update(ia_collections)

            if self.collections.isdisjoint(collections):
                return None     # not available

        formats.update(self.default_formats)

        # don't cache this InfoBook since the individual IA lookups are already cached and the
        # set of these may change over time
        return InfoBook(formats=formats, lib=self, book_id=book_id, cache_allowed=False)

    def check_book_obtainable(self, book_id):
        '''
        Determine when book_id can be obtained.

        Returns 0 right away for the 'read' and print-disabled collections.
        Otherwise the Open Library book page (and any waiting list pages linked
        from it) are parsed and the findings are handed to self.when_obtainable,
        whose result is returned. Raises LibraryError if a waiting list page
        does not have the expected content.
        '''
        if (READ_COLLECTION in self.collections) or (PRINTDISABLED in self.collections):
            return 0    # 'read' titles are always available

        # no API for book waiting so need to parse pages instead
        book_url = self.book_url(self.library_id, book_id)
        response = open_url(self.log, book_url, cookiejar=self.cookiejar)
        soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

        have_checked_out = False
        available_copies = 0
        wait_links = set()

        for s in soup.findAll('span', recursive=True):
            if text_only(s) == 'You have this book checked out.':
                have_checked_out = True

        for a in soup.findAll('a', recursive=True):
            a_text = text_only(a)

            if a_text == 'eBook' and a.get('title', '') == 'Borrow from Internet Archive':
                available_copies += 1
            elif 'waiting list' in a_text:
                # "Join the waiting list?" or just "waiting list" or if user has book already on hold
                href = a.get('href')
                if href:    # an anchor without a target cannot be followed
                    wait_links.add(href)

        library_copies = max(available_copies, 1)   # use 1 if none available since can only wait for a single copy
        number_waiting_overall = None
        hold_position_overall = None

        if wait_links and not available_copies:
            # find shortest wait among borrowable editions
            for url in wait_links:
                wait_response = open_url(self.log, urlparse.urljoin(book_url, url), cookiejar=self.cookiejar)
                wait_soup = BeautifulSoup(wait_response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)
                info = must_find(wait_soup, 'div', attrs={'class':'message info'})
                info_text = text_only(info)
                if not info_text.startswith('This book is checked out'):
                    raise LibraryError('Unexpected info for checked out book: %s' % info_text)

                # "This book is checked out and you are on the waiting list. You are the only person waiting for this book."
                if 'You are the only person waiting' in info_text:
                    hold_position_overall = 1
                    number_waiting_overall = None   # the user's own hold takes precedence over other editions
                    break

                # "This book is checked out and you are on the waiting list. You are #2 among 2 people waiting for this book."
                m = re.search(r'You are #([0-9]+) ', info_text, flags=re.IGNORECASE)
                if m:
                    hold_position_overall = int(m.group(1))
                    number_waiting_overall = None   # the user's own hold takes precedence over other editions
                    break

                # "This book is checked out and 2 people are on the waiting list." or "and 1 person is"
                m = re.search(r'checked out and ([0-9]+) ', info_text, flags=re.IGNORECASE)
                waiting = int(m.group(1)) if m else 0

                if number_waiting_overall is None or waiting < number_waiting_overall:
                    number_waiting_overall = waiting

        return self.when_obtainable(library_copies=library_copies, available_copies=available_copies,
                                        have_checked_out=have_checked_out,
                                        hold_position_overall=hold_position_overall,
                                        number_waiting_overall=number_waiting_overall)



def read_openlib_book_online(abort, log, status, config):
    '''
    Retrieve pages in jpeg format to current directory

    The book is selected by the TWEAK_OL_BORROW_URL tweak. TWEAK_OL_SCALE
    (default 2) and TWEAK_OL_START_FROM (default 0) set the image scale and the
    first page number to fetch. The first enabled Open Library configuration
    that has a card number is used for sign in.

    abort  = accepted but not consulted by this function
    log    = logger; receives progress, any exception and a final summary
    status = progress reporter; update() is called after each page
    config = plugin configuration providing enabled_libraries

    Raises LibraryError if Open Library is not configured with credentials or
    sign in fails. Errors during the download itself are logged, not raised.
    '''

    borrow_url = tweaks[TWEAK_OL_BORROW_URL]         # "https://openlibrary.org/books/OL000000M/Book_title/borrow"
    scale = tweaks.get(TWEAK_OL_SCALE, 2)            # 2**n, 1= full resolution (600dpi), 2=half (300dpi)
    continue_from = tweaks.get(TWEAK_OL_START_FROM, 0)  # to allow continuation
    page_delay = 10  # seconds

    pages_written = 0

    for lending_lib in config.enabled_libraries:
        if lending_lib.provider_id == OpenLibrary.id and lending_lib.card_number != '':
            break
    else:
        raise LibraryError('Open Library not configured or missing sign-in credentials')

    lib = SearchableLibrary.create(log, config, lending_lib)
    lib.sign_in(True)     # Sign in to Open Library to allow borrowing

    if not lib.signed_in:
        raise LibraryError('Signin required to access online books')

    log.context('read_openlib_book_online')
    try:
        br = mechanize.Browser()
        br.set_cookiejar(lib.cookiejar)

        done = False

        # Each pass opens the book through the borrow page again and downloads
        # pages until either all are done or the access window has run out.
        while not done:
            log.clear_response()

            last_auth_time = time.time()

            browse_url(log, br, mechanize.Request(borrow_url))

            br.select_form(predicate=lambda f: 'class' in f.attrs and f.attrs['class'] == "LoanReadForm") # Exception if not found
            page = browse_url(log, br, None)   # submit form

            # The page contains a reference to a js file with book info:
            #   <script type="text/javascript"
            #   src="//ia601201.us.archive.org/BookReader/BookReaderJSIA.php?id=BOOK_ID&itemPath=PATH&server=SERVER&subPrefix=SUB-PREFIX&version=3.0.9">
            soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)

            for script in soup.findAll('script'):
                if script.get('type') == "text/javascript":
                    partial_script_url = script.get('src','')
                    if '/BookReader/BookReaderJSIA.php' in partial_script_url:
                        break

            else:
                raise LibraryError('Missing BookReaderJSIA.php script')

            parsed_url = urlparse.urlparse(partial_script_url, 'https')     # 'https' is used when the src has no scheme
            book_script_url = urlparse.urlunparse(parsed_url) # build full url

            parsed_query = urlparse.parse_qs(parsed_url.query)

            item_path = parsed_query['itemPath'][0]     # path where zip file is located
            sub_prefix = parsed_query['subPrefix'][0]   # file name prefix (same as book id)
            server = parsed_query['server'][0]          # "xxxx.us.archive.org"

            response = open_url(log, book_script_url, cookiejar=lib.cookiejar)

            # parse resulting js for additional values

            image_format = js_value(log, response.data, 'br.imageFormat =') # "jp2"

            leaf_map = js_value(log, response.data, 'br.leafMap =') # list of page numbers, each expanded to 4-char with 0 prefix
            log.info('found pages: %s' % range_str(leaf_map))
            if leaf_map != sorted(leaf_map):
                raise ValueError('br.leafMap is not sorted')

            for leaf in leaf_map:
                if leaf < continue_from:
                    continue    # already retrieved

                if time.time() - last_auth_time > 300:
                    # Access fails with "HTTP Error 403: Forbidden" after 10 minutes. Re-authenticating
                    # every 5 minutes through
                    #   https://archive.org/bookreader/BookReaderAuthProxy.php?
                    #   id=BOOK_ID&rand=0.0-0.999&loan=loan-BOOK_ID&token=xxxx&callback=olAuth.initCallback
                    # (token taken from the "loan-BOOK_ID" cookie) was tried and did not prevent the
                    # failure, so go back to the start (the borrow page) instead.
                    break

                # https://ia601201.us.archive.org/BookReader/BookReaderImages.php?
                # zip=ITEM-PATH/SUB_PREFIX_jp2.zip&file=SUB_PREFIX_jp2/SUBPREFIX_LEAF.jp2&scale=1&rotate=0
                data = {}
                data['zip'] = '%s/%s_%s.zip'%(item_path, sub_prefix, image_format)
                data['file'] = '%s_%s/%s_%04d.%s'%(sub_prefix, image_format, sub_prefix, leaf, image_format)
                data['scale'] = unicode(scale)
                data['rotate'] = '0'

                url = urlparse.urlunparse(urlparse.ParseResult(scheme=parsed_url.scheme, netloc=server,
                    path=parsed_url.path.replace('BookReaderJSIA.php', 'BookReaderImages.php'),
                    params='', query=urllib.urlencode(data), fragment=''))

                response = open_url(log, url, cookiejar=lib.cookiejar)

                with open('%s_%04d.jpg'%(sub_prefix,leaf), 'wb') as of:
                    of.write(response.data)

                pages_written += 1

                # NOTE(review): the fraction is based on the page number rather than its position in
                # leaf_map, so it can pass 1.0 if page numbers have gaps -- confirm what status expects
                status.update(leaf / len(leaf_map), 'page %d of %d' % (leaf, leaf_map[-1]))

                time.sleep(page_delay)  # pause for reading

                continue_from = leaf + 1    # resume here after a restart

            else:
                # loop ran to the end without a restart being needed
                done = True

    except Exception as e:
        log.exception('', e)

    finally:
        # leave the log context clean however the download ended
        log.context(None)

    log.summary('Saved %s from Open Library' % value_unit(pages_written, 'page'))
