﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import, print_function)

__license__   = 'GPL v3'
__copyright__ = '2011-2012, Hoffer Csaba <csaba.hoffer@gmail.com>, Kloon <kloon@techgeek.co.in>, 2020, Hokutya <mail@hokutya.com>'
__docformat__ = 'restructuredtext hu'

import time
import lxml, sys
import lxml.html as lh
from calibre import browser
from calibre import as_unicode
from lxml.html import fromstring
from calibre.utils.icu import lower
from six.moves.queue import Queue, Empty
from calibre.ebooks.metadata import check_isbn
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.metadata.sources.base import Source, Option

class Libri_hu(Source):
	'''
	Calibre metadata source plugin for the Hungarian bookshop libri.hu.

	It declares the 'identify' and 'cover' capabilities and exposes one
	user option: the maximum number of search results to process.
	'''
	# Plugin identification shown by calibre
	name								= 'Libri_hu'
	description						= _('Downloads metadata and cover from libri.hu')
	author								= 'Hoffer Csaba & Kloon & Hokutya'
	version								= (1, 0, 8)
	minimum_calibre_version  = (0, 8, 0)

	# What this source can do and which metadata fields it may fill in
	capabilities = frozenset(['identify', 'cover'])
	touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:libri_hu', 'tags', 'comments', 'publisher', 'pubdate', 'series', 'rating', 'language', 'languages'])
	has_html_comments = False
	supports_gzip_transfer_encoding = False
	
	# Preference key of the "maximum number of books" option declared below
	KEY_MAX_BOOKS = 'max_books'
	# Site URLs: book pages live under BOOK_URL, SEARCH_URL is the detailed
	# search form submitted by create_query
	BASE_URL = 'http://www.libri.hu'
	BOOK_URL = BASE_URL + '/konyv'
	SEARCH_URL = BASE_URL + '/reszletes_kereso/'
	
	# User-configurable options; the default for KEY_MAX_BOOKS is 3
	options = [Option(KEY_MAX_BOOKS, 'number', 3, _('Maximum number of books to get'),
                      _('The maximum number of books to process from the libri.hu search result')),
	
	]

	def identify(self, log, result_queue, abort, title, authors, identifiers={}, timeout=30):
		'''
		Find candidate libri.hu book pages and start a Worker for each of them.

		A ``libri_hu`` identifier is turned directly into a book URL. Otherwise
		the detailed search form is submitted through create_query (which uses
		the ISBN when a valid one is supplied, else title and first author) and
		the result page is scanned for book links. One Worker thread is started
		per candidate URL and this method waits until all of them have finished
		or ``abort`` is set.

		Note this method will retry without identifiers automatically if no
		match is found with identifiers.

		:param log: log object; its info/error/exception methods are used
		:param result_queue: queue passed on to every Worker
		:param abort: event-like object; processing stops once it is set
		:param title: title to search for, may be None
		:param authors: list of author names, may be None
		:param identifiers: dict of identifiers ('libri_hu' and 'isbn' are used)
		:param timeout: timeout in seconds forwarded to the search request
		:return: None when finished, aborted or nothing was found; a string
			describing the problem when the query or the parsing failed
		'''
		matches = []
		query = None
		br = browser()
		log.info(u'\nTitle:%s\nAuthors:%s\n'%(title, authors))
		libri_id = identifiers.get('libri_hu', None)
		if libri_id:
			matches.append('%s/%s.html'%(Libri_hu.BOOK_URL, libri_id))
		else:
			# Building the query already talks to the site, so a network
			# failure here is reported the same way as a failed search.
			try:
				query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
			except Exception as e:
				log.exception('Failed to build the libri.hu search query')
				return as_unicode(e)
			if query is None:
				log.error('Insufficient metadata to construct query')
				return
			try:
				log.info('Querying: %s\n'%query)
				response = br.open(query, timeout=timeout)
			except Exception as e:
				err = 'Failed to make identify query: %r'%query
				log.exception(err)
				return as_unicode(e)
			try:
				raw = response.read().strip()
				raw = raw.decode('latin-1', errors='replace')
				if not raw:
					log.error('Failed to get raw result for query: %r'%query)
					return
				root = fromstring(clean_ascii_chars(raw))
			except Exception:
				msg = 'Failed to parse Libri page for query: %r'%query
				log.exception(msg)
				return msg
			# Collect the book URLs listed on the search result page
			self._parse_search_results(log, title, authors, root, matches, timeout)
		if abort.is_set():
			return
		if not matches:
			if identifiers and title and authors:
				log.info('No matches found with identifiers, retrying using only title and authors')
				return self.identify(log, result_queue, abort, title=title,
						authors=authors, timeout=timeout)
			log.error('No matches found with query: %r'%query)
			return
		# Imported here rather than at module level, as in the original code
		from calibre_plugins.libri_hu.worker import Worker
		workers = [Worker(url, result_queue, br, log, i, self) for i, url in
				enumerate(matches)]
		for w in workers:
			w.start()
			# Don't send all requests at the same time
			time.sleep(0.1)
		# Poll the workers so that an abort request is noticed quickly
		while not abort.is_set():
			a_worker_is_alive = False
			for w in workers:
				w.join(0.2)
				if abort.is_set():
					break
				if w.is_alive():
					a_worker_is_alive = True
			if not a_worker_is_alive:
				break
		return None

	def create_query(self, log, title=None, authors=None, identifiers={}):
		'''
		Build the search URL by submitting the libri.hu detailed search form.

		A valid ISBN in ``identifiers`` takes precedence; otherwise the form is
		filled with the title and the first author.

		:param log: log object; its info method is used
		:param title: title to search for, may be None or empty
		:param authors: list of author names, may be None or empty
		:param identifiers: dict of identifiers; only 'isbn' is used
		:return: URL of the page reached by submitting the form, or None when
			there is neither a valid ISBN nor a title nor an author
		'''
		isbn = check_isbn(identifiers.get('isbn', None))
		# An empty author list must not raise IndexError
		first_author = authors[0] if authors else None
		if not (isbn or title or first_author):
			# Nothing to search for: let the caller report insufficient
			# metadata instead of submitting an empty form.
			return None
		br = browser()
		br.open(Libri_hu.SEARCH_URL)
		br.select_form(name='detailed_search_form')
		if isbn:
			log.info('Searching based on the ISBN: %s'%isbn)
			br['isbn'] = isbn
		else:
			log.info('Searching based on:')
			if title:
				log.info(' Title: %s'%title)
			br['cim'] = title or ''
			if first_author:
				log.info(' Author: %s'%first_author)
			br['szerzo'] = first_author or ''
		search_page = br.submit()
		return search_page.geturl()

	def _parse_search_results(self, log, orig_title, orig_authors, root, matches, timeout):
		results = root.xpath('//*[@id="tab_content_authorPrList"]/section/section')
		if not results:
			return
		max_results = self.prefs[Libri_hu.KEY_MAX_BOOKS]
		#book_urls = results[0].xpath('//a[@class="book-title"]/@href')
		book_urls = results[0].xpath('//a[@class="book-title gtm-click"]/@href')
		for book_url in book_urls:
			result_url = Libri_hu.BASE_URL + book_url
			if '/konyv' in result_url:
				# Get the detailed url to query next
				matches.append(result_url)
				if len(matches) >= max_results:
					return

	def get_cached_cover_url(self, identifiers):
		url = None
		libri_id = identifiers.get('libri_hu', None)
		if libri_id is None:
			isbn = identifiers.get('isbn', None)
			if isbn is not None:
				libri_id = self.cached_isbn_to_identifier(isbn)
		if libri_id is not None:
			url = self.cached_identifier_to_cover_url(libri_id)
		return url
		
	def cached_identifier_to_cover_url(self, id_):
		with self.cache_lock:
			url = self._get_cached_identifier_to_cover_url(id_)
			if not url:
				# Try for a "small" image in the cache
				url = self._get_cached_identifier_to_cover_url('small/'+id_)
			return url
			
	def _get_cached_identifier_to_cover_url(self, id_):
		# This must only be called once we have the cache lock
		url = self._identifier_to_cover_url_cache.get(id_, None)
		if not url:
			key_prefix = id_.rpartition('/')[0]
			for key in self._identifier_to_cover_url_cache.keys():
				if key.startswith('key_prefix'):
					return self._identifier_to_cover_url_cache[key]
		return url

	def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
		cached_url = self.get_cached_cover_url(identifiers)
		if cached_url is None:
			log.info('No cached cover found, running identify')
			rq = Queue()
			self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers)
			if abort.is_set():
				return
			results = []
			while True:
				try:
					results.append(rq.get_nowait())
				except Empty:
					break
			results.sort(key=self.identify_results_keygen(title=title, authors=authors, identifiers=identifiers))
			for mi in results:
				cached_url = self.get_cached_cover_url(mi.identifiers)
				if cached_url is not None:
					break
		if cached_url is None:
			log.info('No cover found')
			return
		if abort.is_set():
			return
		br = self.browser
		log('Downloading cover from:', cached_url)
		try:
			cdata = br.open_novisit(cached_url, timeout=timeout).read()
			result_queue.put((self, cdata))
		except:
			log.exception('Failed to download cover from:', cached_url)