﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
# *-* coding: utf-8 *-*
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2019, Daniel Prazak <kret33n@gmail.com>, 2020-2022, Jindroush <jindroush@seznam.cz>'
__docformat__ = 'restructuredtext cs'

import time

#these are for Python2/3 compatibility
try:
    from urllib.parse import quote
except ImportError:
    from urllib2 import quote

try:
    from queue import Empty, Queue
except ImportError:
    from Queue import Empty, Queue

from lxml.html import fromstring
from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.icu import lower
from calibre.utils.cleantext import clean_ascii_chars
import lxml
import sys, traceback, urllib
import re


class databazeknih(Source):
    '''
    Calibre metadata source plugin for the databazeknih.cz book database.

    The attributes below describe the plugin (name, version, what it can do
    and which metadata fields it fills); the methods implement searching
    the site, identifying books and downloading covers.
    '''
    name                   = 'databazeknih.cz'
    description            = _('Downloads metadata and covers from databazeknih.cz')
    author                 = 'Daniel Prazak (Kret33n) based on bagdira version with fixes by JK, Pepyk'
    version                = (1, 4, 26)
    minimum_calibre_version= (0, 8, 0)

    #what the plugin can do: identify books and download covers
    capabilities = frozenset(['identify', 'cover'])
    #identify-only variant, kept for reference:
    #capabilities = frozenset(['identify'])

    #metadata fields this plugin may fill in
    touched_fields = frozenset(['title', 'authors', 'identifier:databazeknih', 'tags', 'comments', 'rating', 'series', 'publisher','pubdate','languages'])
    #NOTE(review): presumably tells calibre the comments are plain text, not HTML - confirm against the Source base class
    has_html_comments = False
    #NOTE(review): presumably disables gzip transfer encoding for requests - confirm against the Source base class
    supports_gzip_transfer_encoding = False

    #root of the site; book and search URLs are built from it
    BASE_URL = "https://www.databazeknih.cz/"
    
    def config_widget(self):
        '''
        Return the plugin's own configuration widget, replacing the default
        configuration screen.

        :return: a ConfigWidget instance constructed with this plugin
        '''
        #imported at call time, not at module load
        from calibre_plugins.databazeknih.config import ConfigWidget as widget_class
        widget = widget_class(self)
        return widget

        
    def get_book_url(self, identifiers):
        databazeknih_id = identifiers.get('databazeknih', None)
        if databazeknih_id:
            return (self.name, databazeknih_id, databazeknih.BASE_URL + 'knihy/' + databazeknih_id + '?show=alldesc')


    #creates query based on isbn, or title. Don't know if authors makes sense here
    def create_search_query(self, log, title=None, authors=None, isbn=None):
        if isbn is not None:
            search_title = quote( isbn.encode('utf8'))
        elif title is not None:
            search_title = quote( title.encode('utf8'))
        else:
            search_title = ''
            
        search_page = 'https://www.databazeknih.cz/index.php?stranka=search&q=%s'%search_title
        return search_page


    def get_cached_cover_url(self, identifiers):
        '''
        Look up a previously cached cover URL for a book.

        The databazeknih identifier is used directly when present; otherwise
        a valid ISBN is translated to an identifier through the plugin cache.

        :param identifiers: mapping of identifier type to value
        :return: the cached cover URL, or None when no identifier could be
                 determined or nothing is cached for it
        '''
        book_id = identifiers.get(u'databazeknih', None)

        if book_id is None:
            #no direct id, try to get one from a valid isbn
            valid_isbn = check_isbn(identifiers.get(u'isbn', None))
            if valid_isbn is not None:
                book_id = self.cached_isbn_to_identifier(valid_isbn)

        if book_id is None:
            return None
        return self.cached_identifier_to_cover_url(book_id)


    def identify(self, log, result_queue, abort, title, authors, identifiers={}, timeout=30):
        '''
        Find candidate book pages on databazeknih.cz and start one worker per
        candidate page.

        When a databazeknih identifier is stored, the book URL is built
        directly from it. Otherwise a search is made by ISBN or title and the
        search response is parsed for candidate URLs.

        Note this method will retry without identifiers automatically if no
        match is found with identifiers.

        :param log: logger
        :param result_queue: queue handed over to the workers
        :param abort: event; when set, the method stops as soon as possible
        :param title: book title, or None
        :param authors: list of author names, or None
        :param identifiers: mapping of identifier type to value; only read,
                            never modified
        :param timeout: timeout in seconds for the search request
        :return: None when finished, aborted or nothing was found; a text
                 describing the problem when the search request or the
                 parsing of its response failed
        '''

        #this is list of potential matches
        matches = []
        #stays None when the book url is built from the stored id, so the
        #final log line never hits an undefined name
        query = None

        #read stored dbk identifier and stored isbn (only valid isbn, which actually may be a problem)
        databazeknih_id = identifiers.get('databazeknih', None)
        isbn = check_isbn(identifiers.get('isbn', None))

        log.info(u'\nIn __init__/identify\nTitle:%s\nAuthors:%s\nISBN:%s\n'%(title, authors, isbn))
        br = self.browser

        if databazeknih_id:
            #if we have dbk id, we just construct the book url
            matches.append(databazeknih.BASE_URL + 'knihy/' + databazeknih_id + '?show=alldesc')
        else:
            #otherwise we do a search on isbn or title
            query = self.create_search_query(log, title=title, authors=authors, isbn=isbn)
            if query is None:
                log.error('Insufficient metadata to construct query')
                return

            response = None
            try:
                log.info(u'Querying: %s'%query)
                response = br.open(query, timeout=timeout)
            except Exception as e:
                if isbn and callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                    #unknown isbn is not an error: there is no response to
                    #parse, so we fall through to the retry by title below
                    log.info('Failed to find match for ISBN: %s'%isbn)
                else:
                    err = 'Failed to make identify query: %r'%query
                    log.info(err)
                    return as_unicode(e)

            if response is not None:
                try:
                    #we read search query response
                    raw = response.read().strip()
                    #and also the response url, as sometime we may be 302 redirected to final page
                    resp_url = response.geturl()
                    raw = raw.decode('utf-8', errors='replace')
                    if not raw:
                        log.error('Failed to get raw result for query: %r'%query)
                        return
                    root = fromstring(clean_ascii_chars(raw))
                except Exception:
                    msg = 'Failed to parse databazeknih page for query: %r'%query
                    log.exception(msg)
                    return msg

                #and we actually parse the search results here
                self._parse_search_results(log, title, authors, root, resp_url, matches, timeout)

        if abort.is_set():
            return

        if not matches:
            if identifiers and title and authors:
                log.info('No matches found with identifiers, retrying using only title')
                return self.identify(log, result_queue, abort, title=title,
                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r'%query)
            return

        #we have some matches here, so we can start workers
        log.debug('Starting workers for: %s' % (matches,))
        from calibre_plugins.databazeknih.worker import Worker
        workers = [Worker(url, result_queue, br, log, i, self) for i, url in
                enumerate(matches)]

        for w in workers:
            w.start()
            #short pause between worker starts
            time.sleep(0.1)

        #wait until all workers finish or the abort is requested
        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None


    def _parse_search_results(self, log, orig_title, orig_authors, root, ext_url, matches, timeout):
        log.info('in __init__/_parse_search_results')
        results = root.xpath('//*[@class="new"]')
        results_cnt = len(results)
        log.info( 'search results count: %d'%results_cnt )

        if not results_cnt:
            #seems that dbk always redirects to the first found book for that isbn
            #<link rel="canonical" href="https://www.databazeknih.cz/knihy/bookname-bookid" />
            result_urls = root.xpath('//link[@rel="canonical"]/@href')
            if len(result_urls)>0:
                #found link as above
                result_url = result_urls[0] + '?show=alldesc'
                log.info('(no new) Result URL: %r'%result_url)
                matches.append(result_url)
                return
            elif re.search('/knihy/', ext_url):
                #we've been redirected to /knihy/ immediatelly
                result_url = ext_url + '?show=alldesc'
                log.info('(ext url) Result URL: %r'%result_url)
                matches.append(result_url)
                return
            log.info( 'no results found')
            return

        #TODO: check if this is supposed to limit the number of results
        import calibre_plugins.databazeknih.config as cfg
        max_results = 10
        i = 0

        for result in results:
            title = results[i].xpath('a//text()')
            log.info('%d) book title: %s'%(i,title))
            log.info('%d) orig author: %s'%(i,orig_authors))
            product = results[i].xpath('span[@class="smallfind"]//text()')
            vlozit = False

            if product and isinstance(product,list):
                product = product[0]
                product = product.split()
                prijmeni = product[len(product)-1]

                if prijmeni == '(pseudonym)':
                    prijmeni = product[len(product)-2]
                log.info('found_surname: %s'%prijmeni)

                if orig_authors:
                    for o_jmena in orig_authors:
                        log.info('name: %s'%o_jmena)
                        #fixed: only space split to regexp split on comma and space
                        #os_jmena = o_jmena.lower().split()
                        os_jmena = re.split( "[, ]+", o_jmena.lower() )
                        log.info('name0 :%s'%os_jmena[0])
                        log.info('name1 :%s'%os_jmena[1])
                        if prijmeni.lower() in os_jmena:
                            log.info('include')
                            vlozit=True
                        elif prijmeni.lower() + 'ová' in os_jmena:
                            log.info('include +ová')
                            vlozit=True
                elif prijmeni:
                    vlozit=True
                #endif orig_authors

                book_url = results[i].xpath('a/@href')
                log.info('%d) book URL: %r'%(i,book_url))
                result_url = 'https://www.databazeknih.cz/' + book_url[0] + '?show=alldesc'
                log.info('%d) result URL: %r'%(i,result_url))
            i = i+1
            if vlozit:
                matches.append(result_url)
            if len(matches) >= max_results:
                break

    def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
        '''
        Download the book cover and put it into result_queue.

        The cached cover URL is used when there is one. Otherwise identify is
        run first and the cover URL is looked up for its results, taken in
        the order given by identify_results_keygen.

        :param log: logger
        :param result_queue: queue receiving the tuple (this plugin, cover data)
        :param abort: event; when set, the method stops as soon as possible
        :param title: book title, or None
        :param authors: list of author names, or None
        :param identifiers: mapping of identifier type to value; only read,
                            never modified
        :param timeout: timeout in seconds, used for the identify run and for
                        the cover download
        :return: None
        '''
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            #same timeout as the one requested for the cover download
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers, timeout=timeout)
            if abort.is_set():
                return

            #drain everything identify has put into the queue
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(title=title, authors=authors, identifiers=identifiers))

            #take the first result having a cached cover url
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break

        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        br = self.browser
        log.info('Downloading cover from: ', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except Exception:
            #best effort: the failure is only logged
            log.exception('Failed to download cover from: ', cached_url)
