#!/usr/bin/env python
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.metadata.book.base import Metadata
from lxml.html import fromstring
from threading import Thread
from calibre_plugins.Adlibris import Adlibris

__license__ = "GPL v3"
__copyright__ = "2022, J-H based on the work by Pr.BarnArt and Grant Drake"
__docformat__ = "restructuredtext en"

import socket
import datetime
import json
import re


class Worker(Thread):
    """
    Get book details from Adlibris.com book page in a separate thread
    """

    name = "Worker"
    description = "Get book details from adlibris.com book page in a separate thread"
    author = "J-H"
    version = (0, 3, 0)
    minimum_calibre_version = (0, 6, 0)

    def __init__(
        self, match_i, result_queue, browser, log, relevance, plugin, timeout=20
    ):
        Thread.__init__(self)
        self.daemon = True
        self.url = match_i
        self.result_queue = result_queue
        self.log, self.timeout = log, timeout
        self.relevance, self.plugin = relevance, plugin
        self.browser = browser.clone_browser()
        self.cover_url = self.isbn = None

    def run(self):
        try:
            self.get_details()
        except:
            self.log.exception(f"get_details failed for url: {self.url}")

    def get_details(self):
        try:
            raw = (
                self.browser.open_novisit(
                    self.url, timeout=self.timeout).read().strip()
            )
        except Exception as e:
            if callable(getattr(e, "getcode", None)) and e.getcode() == 404:
                self.log.error(f"URL malformed: {self.url}")
                return
            attr = getattr(e, "args", [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = "Adlibris.com timed out. Try again later."
                self.log.error(msg)
            else:
                msg = f"Failed to make details query: {self.url}"
                self.log.exception(msg)
            return
        
        raw = raw.decode("utf-8", errors="replace")

        if "<title>404 - " in raw:
            self.log.error(f"URL malformed: {self.url}")
            return
        
        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = f"Failed to parse Adlibris.com details page: {self.url}"
            self.log.exception(msg)
            return
        
        self.parse_details(root)

    def parse_details(self, root):
        """
        Parse book details from JSON-LD structured data
        """
        json_ld_scripts = root.xpath('//script[@type="application/ld+json"]/text()')
        
        if not json_ld_scripts:
            self.log.error(f"No JSON-LD data found for {self.url}")
            return
        
        book_data = None
        for script_text in json_ld_scripts:
            try:
                data = json.loads(script_text)
                data_type = data.get('@type', '')
                
                if isinstance(data_type, list):
                    if 'Book' in data_type or 'Product' in data_type:
                        book_data = data
                        break
                elif data_type in ['Book', 'Product']:
                    book_data = data
                    break
                    
            except (json.JSONDecodeError, Exception) as e:
                self.log.debug(f"Failed to parse JSON-LD block: {e}")
                continue
        
        if not book_data:
            self.log.error(f"No book data found in JSON-LD for {self.url}")
            return
        
        # Extract basic metadata
        title = book_data.get('name')
        if not title:
            self.log.error(f"Could not find title for {self.url}")
            return
        
        # Extract authors
        authors = self._extract_authors(book_data)
        if not authors:
            self.log.error(f"Could not find authors for {self.url}")
            return

        mi = Metadata(title, authors)

        # Extract ISBN
        self.isbn = book_data.get('isbn')
        
        # Extract and clean description
        description = book_data.get('description', '')
        if description:
            mi.comments = self._clean_description_html(description)
        
        # Extract cover URL
        self.cover_url = book_data.get('image')
        mi.has_cover = bool(self.cover_url)

        # Extract publisher
        publisher = book_data.get('publisher')
        if isinstance(publisher, dict):
            mi.publisher = publisher.get('name')
        elif isinstance(publisher, str):
            mi.publisher = publisher

        # Extract publication date
        self._extract_publication_date(book_data, mi)

        # Extract language
        language = book_data.get('inLanguage')
        if language:
            if isinstance(language, str):
                mi.language = language.lower()
            elif isinstance(language, dict):
                mi.language = language.get('name', '').lower()

        # Extract series information
        series_info = self._extract_series_from_nextjs(root)
        if series_info:
            self._set_series_info(mi, series_info)

        mi.source_relevance = self.relevance

        # Set identifiers and cache cover URL
        if self.isbn:
            mi.set_identifier(Adlibris.ID_NAME, self.isbn)
            if self.cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    self.isbn, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
    
    def _extract_authors(self, book_data):
        """
        Return the author names found in ``book_data``.

        The ``author`` entry is read first; if it yields no names, ``editor``
        is used instead. Either entry may be a single person or a list of
        persons, and a person may be a dict with a ``name`` key or a plain
        string. Blank and non-string names are skipped; surrounding
        whitespace is removed.

        :param book_data: JSON-LD node (dict) describing the book.
        :return: List of names in document order; empty if none were found.
        """
        def names_from(value):
            # Normalise "one person" and "list of persons" to a list.
            people = value if isinstance(value, list) else [value]
            names = []
            for person in people:
                name = person.get('name') if isinstance(person, dict) else person
                if isinstance(name, str) and name.strip():
                    names.append(name.strip())
            return names

        return names_from(book_data.get('author')) or names_from(
            book_data.get('editor')
        )
    
    def _extract_publication_date(self, book_data, mi):
        """Extract and parse publication date"""
        date_text = book_data.get('datePublished')
        if not date_text:
            return
        
        try:
            from calibre.utils.date import utc_tz
            
            if len(date_text) >= 10:
                # Full date: YYYY-MM-DD
                year = int(date_text[:4])
                month = int(date_text[5:7])
                day = int(date_text[8:10])
                mi.pubdate = datetime.datetime(year, month, day, tzinfo=utc_tz)
            elif len(date_text) == 4:
                # Year only
                year = int(date_text)
                mi.pubdate = datetime.datetime(year, 1, 1, tzinfo=utc_tz)
        except:
            self.log.exception(f"Error parsing published date for url: {self.url}")
    
    def _set_series_info(self, mi, series_info):
        """Set series name and index on metadata object"""
        series_name = series_info.get('series')
        series_order = series_info.get('series_order')
        
        if not series_name:
            return
        
        # Check if series name contains embedded number (e.g., "Series Name #2")
        series_pattern = r'^(.+?)\s*#(\d+)$'
        series_match = re.match(series_pattern, series_name)
        
        if series_match:
            mi.series = series_match.group(1).strip()
            try:
                series_num = float(series_match.group(2))
                if 0 < series_num < 3000:
                    mi.series_index = series_num
            except (ValueError, TypeError):
                pass
        else:
            mi.series = series_name
            if series_order:
                try:
                    series_num = float(series_order)
                    # Convert 0 to 1 for series index
                    if series_num == 0:
                        series_num = 1
                    if 0 < series_num < 3000:
                        mi.series_index = series_num
                except (ValueError, TypeError):
                    self.log.debug(f"Could not parse series order: {series_order}")
    
    def _extract_series_from_nextjs(self, root):
        """
        Extract series information from Next.js data stream
        """
        try:
            nextjs_scripts = root.xpath('//script[contains(text(), "self.__next_f.push")]/text()')
            
            for script_text in nextjs_scripts:
                if r'\"series\"' in script_text and r'\"series_order\"' in script_text:
                    pattern = r'\\"series\\":\\"([^"\\]+?)\\"[,}].*?\\"series_order\\":\\"(\d+)\\"'
                    matches = re.findall(pattern, script_text)
                    
                    if matches:
                        series_name, series_order = matches[0]
                        
                        try:
                            from html import unescape
                            series_name = unescape(series_name)
                        except:
                            pass
                        
                        return {
                            'series': series_name,
                            'series_order': series_order
                        }
            
            return None
            
        except Exception as e:
            self.log.debug(f"Failed to extract series from Next.js data: {e}")
            return None
    
    def _clean_description_html(self, text):
        """Clean up description HTML to fix formatting issues"""
        if not text:
            return text
        
        try:
            # Remove div tags
            text = re.sub(r'</?div[^>]*>', '', text, flags=re.IGNORECASE)
            
            # Remove empty paragraphs with br
            text = re.sub(r'<p>\s*<br\s*/?>\s*</p>', '', text, flags=re.IGNORECASE)
            
            # Remove empty paragraphs
            text = re.sub(r'<p>\s*</p>', '', text, flags=re.IGNORECASE)
            
            # Remove br tags between paragraphs (this causes double spacing)
            text = re.sub(r'</p>\s*<br\s*/?>\s*<p>', '</p><p>', text, flags=re.IGNORECASE)
            
            # Remove br at start
            text = re.sub(r'^\s*<br\s*/?>\s*', '', text, flags=re.IGNORECASE)
            
            # Remove br at end
            text = re.sub(r'\s*<br\s*/?>\s*$', '', text, flags=re.IGNORECASE)
            
            return text.strip()
        except Exception as e:
            self.log.exception(f"Error cleaning description HTML: {e}")
            return text