#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
from calibre.db.write import identifiers
import six

__license__   = 'GPL v3'
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>, 2016-2018 updates by David Forrester <davidfor@internode.on.net>'
__docformat__ = 'restructuredtext en'

import time, re, six.moves.html_parser
from six.moves.urllib.parse import quote, unquote
from six.moves.queue import Queue, Empty

from lxml.html import fromstring, tostring

from calibre import as_unicode
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.icu import lower
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.localization import get_udc

from calibre.devices.usbms.driver import debug_print

class Baen(Source):
    '''
    calibre metadata source plugin for the Baen web site (see BASE_URL).

    Provides identification of books (by a stored Baen id or by a title
    search) and cover download, as declared in ``capabilities``.
    '''

    # Plugin metadata shown by calibre.
    name                    = 'Baen'
    description             = _('Downloads metadata and covers from Baen (was Baen.net)')
    author                  = 'Grant Drake with updates by David Forrester'
    version                 = (1, 2, 0)
    minimum_calibre_version = (3, 48, 0)

    # Key under which this source's id is looked up in identifier mappings.
    ID_NAME = 'baen'
    # Operations this plugin implements: identify() and download_cover().
    capabilities = frozenset(['identify', 'cover'])
    # Metadata fields this plugin declares it may fill in.
    touched_fields = frozenset(['title', 'authors', 'identifier:baen',
                                'comments', 'publisher', 'pubdate', 'rating'])
    # Declares that the comments returned by this plugin are HTML.
    has_html_comments = True
    # Declares that the plugin's browser may accept gzip-encoded responses.
    supports_gzip_transfer_encoding = True

    # Root of every URL built or recognised by this plugin.
    BASE_URL = 'https://www.baen.com'

    def get_baen_id(self, identifiers):
        baen_id = identifiers.get(self.ID_NAME, None)
#         debug_print("get_baen_id: match_group=", baen_id)
        if baen_id:
            match_groups = re.search('p-\d+-(.*)', baen_id)
#             debug_print("get_baen_id: match_group=", match_groups)
#             debug_print("get_baen_id: match_group=", match_groups.groups(0))
#             debug_print("get_baen_id: match_group=", match_groups.groups(0)[0])
            if match_groups and len(match_groups.groups(0)) == 1:
                baen_id = match_groups.groups(0)[0]
        return baen_id
        
    def get_book_url(self, identifiers):
        baen_id = self.get_baen_id(identifiers)
        if baen_id:
            return (self.ID_NAME, baen_id,
                    '%s/%s.html'%(self.BASE_URL, baen_id))

    def id_from_url(self, url):
        match = re.match(self.BASE_URL + "/(.*)\.htm.*", url)
        if match:
            return (self.ID_NAME, match.groups(0)[0])
        return None
        
    def get_cached_cover_url(self, identifiers):
        '''
        Look up a previously cached cover URL for the Baen id in
        ``identifiers``.

        :param identifiers: mapping of identifier type to identifier value.
        :return: whatever ``cached_identifier_to_cover_url`` yields for the
            id, or ``None`` when ``identifiers`` carries no Baen id.
        '''
        debug_print("get_cached_cover_url: identifiers=", identifiers)
        slug = self.get_baen_id(identifiers)
        if slug is None:
            return None
        return self.cached_identifier_to_cover_url(slug)

    def create_title_query(self, log, title=None):
        '''
        Build the catalogue search URL for ``title``.

        The title is passed through ``get_udc().decode`` (presumably an ASCII
        transliteration -- confirm against calibre), split into tokens with
        ``get_title_tokens`` (subtitle stripped, joiner words kept), and the
        URL-quoted tokens are joined with ``+``.

        :param log: accepted for interface symmetry; not used here.
        :param title: book title, or ``None``.
        :return: the search URL, or ``None`` when no query text results.
        '''
        if not title:
            return None

        decoded_title = get_udc().decode(title)
        raw_tokens = list(self.get_title_tokens(decoded_title,
                          strip_joiners=False, strip_subtitle=True))

        quoted = []
        for token in raw_tokens:
            # quote() is fed UTF-8 bytes for text tokens.
            if isinstance(token, six.text_type):
                token = token.encode('utf-8')
            quoted.append(quote(token))

        query_text = '+'.join(quoted)
        if not query_text:
            return None
        return '%s/catalogsearch/result/?dir=desc&order=relevance&q=%s' % (Baen.BASE_URL, query_text)

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=30):
        '''
        Find candidate book pages and start one Worker thread per candidate.

        When ``identifiers`` holds a Baen id the book URL is built directly
        and no search request is made. Otherwise the catalogue is searched by
        title and the result rows are filtered by ``_parse_search_results``.
        There is no fallback from the identifier path to a title search.

        :param log: calibre log object.
        :param result_queue: queue handed to every Worker.
        :param abort: event checked between steps; when set, the method
            returns without waiting for the workers.
        :param title: book title used for the search query.
        :param authors: author names; their tokens are passed to the Workers.
        :param identifiers: mapping of identifier type to value. The shared
            default dict is only read, never mutated.
        :param timeout: timeout in seconds for the search request.
        :return: ``None`` normally (also when nothing is found or the run is
            aborted); the error text when the search request itself fails.
        '''
        matches = []
        # Bound up front so the "no matches" message can always refer to it,
        # whichever branch below was taken.
        query = None

        # If we have a Baen id then we do not need to fire a "search".
        # Instead we will go straight to the URL for that book.
        baen_id = self.get_baen_id(identifiers)
        br = self.browser
        if baen_id:
            url = self.get_book_url(identifiers)[2]
            # No publisher is known on this path.
            matches.append((url, None))
        else:
            query = self.create_title_query(log, title=title)
            if query is None:
                log.error('Insufficient metadata to construct query')
                return
            try:
                log.info('Querying: %s'%query)
                raw = br.open_novisit(query, timeout=timeout).read()
            except Exception as e:
                err = 'Failed to make identify query: %r'%query
                log.exception(err)
                return as_unicode(e)
            root = fromstring(clean_ascii_chars(raw))
            # Now grab the match from the search result, provided the
            # title appears to be for the same book
            self._parse_search_results(log, title, root, matches, timeout)

        if abort.is_set():
            return

        if not matches:
            log.error('No matches found with query: %r'%query)
            return
        log.info('_parse_search_results: matches=', matches)
        from calibre_plugins.baen.worker import Worker
        author_tokens = list(self.get_author_tokens(authors))
        # The 1-based position in the match list is passed to each Worker.
        workers = [Worker(data[0], data[1], author_tokens, result_queue, br, log, i+1, self) for i, data in
                enumerate(matches)]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        # Poll the workers in short joins so an abort is noticed promptly.
        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None

    def _parse_search_results(self, log, orig_title, root, matches, timeout):
        '''
        Append up to five plausible hits from a parsed search page to
        ``matches``.

        Each row of the results table contributes ``(url, publisher)``: the
        URL is the link in the first cell, the title is the first link text
        in the second cell. A row is kept when any token of ``orig_title``
        occurs in its title (or when there are no tokens at all). The
        publisher is the text after the second ``<br>`` of the second cell,
        or after the third one when the second starts with "edited by";
        it is ``None`` when that text is absent.

        :param log: calibre log object.
        :param orig_title: title the user searched for.
        :param root: parsed search-results document supporting ``xpath``.
        :param matches: list extended in place with ``(url, publisher)``.
        :param timeout: accepted for interface symmetry; not used here.
        '''
        log.info('_parse_search_results: start')
        title_tokens = list(self.get_title_tokens(orig_title))
        max_results = 5

        def ismatch(title):
            # Any single shared token is enough; with no tokens to compare,
            # every row is accepted.
            title = lower(title)
            return not title_tokens or any(lower(t) in title for t in title_tokens)

        def tail_text(nodes):
            # Stripped text following the first node, or None when the node
            # or its tail is missing, so an unexpected row layout cannot
            # raise and abort the whole search.
            if not nodes or nodes[0].tail is None:
                return None
            return nodes[0].tail.strip()

        for row in root.xpath('//div[@class="category-products"]/table/tbody/tr'):
            url = ''.join(row.xpath('./td[1]/a/@href'))
            if not url:
                continue
            log.info('_parse_search_results: url=%s' % url)

            title = ''.join(row.xpath('./td[2]/a[1]/text()')).strip()
            log.info('_parse_search_results: title=%s' % title)
            if not ismatch(title):
                log.error('Rejecting as not close enough match: %s'%(title))
                continue

            publisher = tail_text(row.xpath('./td[2]/br[2]'))
            if publisher and publisher.lower().startswith('edited by'):
                # That line is an editor credit; take the following line.
                publisher = tail_text(row.xpath('./td[2]/br[3]'))
            log.info('_parse_search_results: 4 publisher="%s"' % publisher)

            matches.append((url, publisher))
            if len(matches) >= max_results:
                break


    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30):
        '''
        Download the cover for a book and put ``(self, data)`` on
        ``result_queue``.

        A cached cover URL for the Baen id in ``identifiers`` is used when
        available. Otherwise ``identify`` is run into a private queue, its
        results are sorted with ``identify_results_keygen``, and the first
        result that has a cached cover URL is used.

        :param log: calibre log object.
        :param result_queue: queue receiving ``(plugin, cover_bytes)``.
        :param abort: event; when set the method returns early.
        :param title: book title, forwarded to ``identify``.
        :param authors: author names, forwarded to ``identify``.
        :param identifiers: mapping of identifier type to value. The shared
            default dict is only read, never mutated.
        :param timeout: timeout in seconds for the cover request only; the
            nested ``identify`` call uses its own default.
        :return: ``None``. Download failures are logged, not raised.
        '''
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers)
            if abort.is_set():
                return
            # Drain everything identify produced.
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except Exception:
            # Best effort: log and carry on, but let KeyboardInterrupt and
            # SystemExit propagate instead of being swallowed.
            log.exception('Failed to download cover from:', cached_url)


if __name__ == '__main__':
    # Self-test entry point. Run with:
    #     calibre-debug -e __init__.py
    from calibre.ebooks.metadata.sources.test import (
        test_identify_plugin, title_test, authors_test, series_test)

    # NOTE(review): the books below and the 'ff' identifier key do not look
    # Baen-specific -- presumably carried over from another plugin's tests.
    # Confirm and replace with titles/ids that exist on the Baen site.

    # Lookup by title and author only.
    title_only_case = (
        {'title':"Harry Potter and the Sorcerer's Stone", 'authors':['J.K. Rowling']},
        [
            title_test("Harry Potter and the Sorcerer's Stone", exact=True),
            authors_test(['J. K. Rowling']),
            series_test('Harry Potter', 1.0),
        ],
    )

    # Lookup with an ISBN identifier present.
    isbn_case = (
        {'identifiers':{'isbn': '9780439064866'},
            'title':'Chamber of Secrets', 'authors':['J.K. Rowling']},
        [
            title_test('Harry Potter and the Chamber of Secrets', exact=True),
            authors_test(['J. K. Rowling']),
            series_test('Harry Potter', 2.0),
        ],
    )

    # Lookup with a source-specific identifier (keyed 'ff', not 'baen').
    source_id_case = (
        {'identifiers':{'ff': '61-Hours/Lee-Child/e/9780440243694'},
            'title':'61 Hours', 'authors':['Lee Child']},
        [
            title_test('61 Hours', exact=True),
            authors_test(['Lee Child']),
            series_test('Jack Reacher', 14.0),
        ],
    )

    test_identify_plugin(Baen.name,
                         [title_only_case, isbn_case, source_id_case])


