#!/usr/bin/env python
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.metadata.book.base import Metadata
from lxml.html import fromstring
from threading import Thread
from calibre_plugins.Storytel import Storytel

__license__ = "GPL v3"
__copyright__ = "2025, Computer"
__docformat__ = "restructuredtext en"

import socket
import datetime
import json
import re


class Worker(Thread):
    """
    Background thread that downloads a single Storytel book page and converts
    the JSON embedded in it into a calibre ``Metadata`` object.

    The finished metadata is delivered through ``result_queue``. When the
    download or parsing fails, the error is logged and nothing is queued.
    """

    # NOTE(review): these descriptor-style fields are not read by any method
    # visible in this file. ``name`` also shadows the ``threading.Thread.name``
    # property for instances of this class -- confirm that this is intended.
    name = "Storytel"
    description = "Downloads metadata from Storytel.se (ebooks only)"
    author = "Computer"
    version = (0, 1, 0)
    minimum_calibre_version = (0, 6, 0)

    def __init__(
        self, match_i, result_queue, browser, log, relevance, plugin, timeout=20
    ):
        """
        Prepare a daemon worker for one book page.

        :param match_i: URL of the book page to fetch (stored as ``self.url``).
        :param result_queue: queue that receives the finished metadata object.
        :param browser: browser object; a clone of it is used by this worker.
        :param log: logger used for all progress and error reporting.
        :param relevance: value copied to the metadata's ``source_relevance``.
        :param plugin: owning plugin, used for caching and metadata clean-up.
        :param timeout: network timeout in seconds passed to the browser.
        """
        super().__init__()
        self.daemon = True

        # Where to fetch from and where to deliver the result.
        self.url = match_i
        self.result_queue = result_queue

        self.log = log
        self.timeout = timeout
        self.relevance = relevance
        self.plugin = plugin

        # Each worker works on its own clone of the browser.
        self.browser = browser.clone_browser()

        # Filled in by get_details() once the page has been parsed.
        self.isbn = None
        self.cover_url = None

    def run(self):
        try:
            self.get_details()
        except Exception as e:
            self.log.exception(f"get_details failed for url: {self.url}")
            # Don't put anything in the queue if we fail - let other workers continue

    def get_details(self):
        """
        Download ``self.url``, parse the embedded book JSON and queue the result.

        Download errors are logged and swallowed: a 404 is reported as a
        malformed URL, a timeout gets a short message, and anything else is
        logged with its traceback. When the page holds no usable ebook data
        nothing is queued either. Errors raised by the extraction helpers
        are not caught here; they propagate to the caller.

        On success ``self.isbn`` and ``self.cover_url`` are updated, the cover
        URL is cached on the plugin when an ISBN is known, and the cleaned
        metadata object is put on ``self.result_queue``.
        """
        try:
            self.log.info(f"Fetching details from: {self.url}")
            response = self.browser.open_novisit(self.url, timeout=self.timeout)
            raw = response.read().strip()
        except Exception as e:
            if callable(getattr(e, "getcode", None)) and e.getcode() == 404:
                self.log.error(f"URL malformed: {self.url}")
                return
            attr = getattr(e, "args", None) or [None]
            # The timeout can be raised directly or arrive wrapped as the
            # first argument of another exception; treat both the same way.
            if isinstance(e, socket.timeout) or isinstance(attr[0], socket.timeout):
                msg = "Storytel.com timed out. Try again later."
                self.log.error(msg)
            else:
                msg = f"Failed to make details query: {self.url}"
                self.log.exception(msg)
            return

        # Parse HTML and extract embedded JSON from __NEXT_DATA__ only
        book_data = self._extract_book_data(raw)
        if not book_data:
            self.log.error(f"No book data found on page: {self.url}")
            return

        mi = Metadata(None)

        # Title (fall back to a placeholder so the record is never untitled)
        title = book_data.get('name') or book_data.get('title')
        if title:
            mi.title = title
            mi.title_sort = title
        else:
            mi.title = "Unknown"

        authors = self._extract_authors(book_data)
        if authors:
            mi.authors = authors

        isbn = self._extract_isbn(book_data)
        if isbn:
            self.isbn = isbn
            mi.set_identifier(Storytel.ID_NAME, isbn)

        # The description is either plain text or a dict carrying a 'text' key
        description = book_data.get('description')
        if isinstance(description, dict):
            description = description.get('text')
        if description and isinstance(description, str):
            mi.comments = description

        publisher = self._extract_publisher(book_data)
        if publisher:
            mi.publisher = publisher

        self._extract_publication_date(book_data, mi)

        language = self._language_to_code(book_data.get('language'))
        if language:
            mi.languages = [language]

        series_info = self._extract_series(book_data)
        if series_info:
            self._set_series_info(mi, series_info)

        tags = self._extract_tags(book_data)
        if tags:
            mi.tags = tags
            self.log.info(f"Set tags to: {mi.tags}")

        rating = self._extract_rating(book_data)
        if rating:
            mi.rating = rating
            self.log.info(f"Set rating to: {mi.rating}")

        self.cover_url = self._extract_cover_url(book_data)
        if self.cover_url:
            mi.has_cover = True
            self.log.info(f"Set cover URL to: {self.cover_url}")

        mi.source_relevance = self.relevance

        # The cover cache is keyed by ISBN, so it needs both values
        if self.isbn and self.cover_url:
            self.plugin.cache_identifier_to_cover_url(self.isbn, self.cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.log.info(f"About to put metadata in queue - Title: {repr(mi.title)}")
        self.result_queue.put(mi)
        self.log.info("Metadata put in result queue successfully")

    @staticmethod
    def _language_to_code(language):
        """
        Map the page's ``language`` field to a three-letter language code.

        ``language`` may be a string or a dict carrying the code under
        ``iso``, ``code`` or ``language``. Swedish, English, Norwegian, Danish
        and Finnish are recognised by prefix, so region-tagged codes such as
        ``sv-SE`` or ``en_US`` work. For anything else the part before the
        first ``-``/``_`` is returned, truncated to three characters.
        Returns None when no usable string code is present.
        """
        if isinstance(language, dict):
            lang_code = (
                language.get('iso')
                or language.get('code')
                or language.get('language')
            )
        else:
            lang_code = language

        if not isinstance(lang_code, str) or not lang_code:
            return None

        lang_code = lang_code.lower()
        known_prefixes = (
            ('sv', 'swe'),
            ('swe', 'swe'),  # also covers the spelled-out "swedish"
            ('en', 'eng'),
            ('no', 'nor'),
            ('da', 'dan'),
            ('fi', 'fin'),
        )
        for prefix, code in known_prefixes:
            if lang_code.startswith(prefix):
                return code

        # Unknown language: keep only the primary subtag so "de-DE" becomes
        # "de" rather than the malformed "de-".
        primary = re.split(r'[-_]', lang_code, maxsplit=1)[0]
        return primary[:3] or None
    
    def _extract_book_data(self, html):
        """
        Return the book ("consumable") dict found in the page's __NEXT_DATA__.

        The dict is looked up under ``pageProps['book']``, then
        ``pageProps['consumable']``, then in the dehydrated query state. It is
        only returned when it describes an ebook: either its ``format`` is
        ``'ebook'`` or its ``formats`` list has an EBOOK entry, in which case
        that entry may be stored under the ``'ebook'`` key of the returned
        dict. Returns None, after logging the reason, in every other case,
        including any exception raised while parsing.
        """

        def first_ebook_entry(formats):
            """Return the first entry of *formats* typed EBOOK, else None."""
            for fmt in formats:
                if isinstance(fmt, dict):
                    fmt_type = fmt.get('type')
                elif isinstance(fmt, str):
                    fmt_type = fmt
                else:
                    fmt_type = None
                if fmt_type and fmt_type.upper() == 'EBOOK':
                    return fmt
            return None

        try:
            root = fromstring(html)
            next_data = root.xpath('//script[@id="__NEXT_DATA__"]/text()')
            if not next_data:
                self.log.error("No __NEXT_DATA__ script found")
                return None

            data = json.loads(next_data[0])
            page_props = data.get('props', {}).get('pageProps', {})

            # Locate the consumable: the direct keys win over the query cache.
            consumable = None
            for key in ('book', 'consumable'):
                if key in page_props:
                    consumable = page_props[key]
                    break
            else:
                if 'dehydratedState' in page_props:
                    queries = page_props['dehydratedState'].get('queries', [])
                    for query in queries:
                        query_data = query.get('state', {}).get('data', {})
                        if query_data and ('format' in query_data or 'formats' in query_data):
                            consumable = query_data
                            break

            if not consumable:
                self.log.error(f"Could not find book data. Available pageProps keys: {list(page_props.keys())}")
                return None

            # No explicit format: derive it from the formats list, keeping any
            # 'ebook' entry the consumable already carries.
            book_format = consumable.get('format')
            if not book_format:
                formats = consumable.get('formats', [])
                if formats:
                    entry = first_ebook_entry(formats)
                    if entry is not None:
                        book_format = 'ebook'
                        if isinstance(entry, dict) and 'ebook' not in consumable:
                            consumable['ebook'] = entry

            if book_format == 'ebook':
                return consumable

            # Declared as something else: accept it only if an ebook edition
            # is listed, and expose that edition under 'ebook'.
            entry = first_ebook_entry(consumable.get('formats', []))
            if entry is None:
                self.log.error(f"Not an ebook, format is: {book_format}")
                return None
            if isinstance(entry, dict):
                consumable['ebook'] = entry
            return consumable

        except Exception as e:
            self.log.exception(f"Error extracting book data: {e}")
            return None
    
    def _extract_isbn(self, book_data):
        """
        Return the ebook ISBN from the book data, or None when there is none.

        Sources are tried in order: the top-level ``isbn`` field (a string, or
        a dict keyed by format where ``EBOOK`` is used), the ``ebook`` dict,
        then the EBOOK entries of the ``formats`` list. Only non-empty strings
        count as an ISBN. Null or unexpectedly typed fields are skipped.
        """

        def usable(value):
            """Return *value* when it is a non-empty string, else None."""
            return value if isinstance(value, str) and value else None

        isbn_data = book_data.get('isbn')
        if isinstance(isbn_data, dict):
            isbn_data = isbn_data.get('EBOOK')
        isbn = usable(isbn_data)
        if isbn:
            return isbn

        ebook_data = book_data.get('ebook')
        if isinstance(ebook_data, dict):
            isbn = usable(ebook_data.get('isbn'))
            if isbn:
                return isbn

        # 'formats' may be missing or an explicit JSON null
        formats = book_data.get('formats')
        if not isinstance(formats, (list, tuple)):
            formats = ()
        for fmt in formats:
            if not isinstance(fmt, dict):
                continue
            fmt_type = fmt.get('type')
            if isinstance(fmt_type, str) and fmt_type.upper() == 'EBOOK':
                isbn = usable(fmt.get('isbn'))
                if isbn:
                    return isbn

        return None
    
    def _extract_publisher(self, book_data):
        """
        Return the publisher name for the ebook, or None when there is none.

        Candidates are tried in order: ``publishers['EBOOK']``,
        ``prioritizedPublisher``, then ``publisher``. Each may be a plain
        string or a dict with a ``name`` key. A candidate that yields no
        non-empty string name is skipped so the later fallbacks still run.
        """

        def name_of(candidate):
            """Return the publisher name held by *candidate*, else None."""
            if isinstance(candidate, dict):
                candidate = candidate.get('name')
            return candidate if isinstance(candidate, str) and candidate else None

        publishers_data = book_data.get('publishers')
        ebook_publisher = (
            publishers_data.get('EBOOK') if isinstance(publishers_data, dict) else None
        )

        candidates = (
            ebook_publisher,
            book_data.get('prioritizedPublisher'),
            book_data.get('publisher'),
        )
        for candidate in candidates:
            name = name_of(candidate)
            if name:
                return name

        return None
    
    def _extract_authors(self, book_data):
        """
        Return the list of author names, in page order and without duplicates.

        Each entry of ``authors`` is either a name string or a dict with a
        ``name`` key. A dict entry whose ``format`` is a string other than
        EBOOK/BOTH (case-insensitive) is skipped, since it belongs to another
        edition. A missing or null ``authors`` field gives an empty list, and
        a bare string is treated as a single author.
        """
        authors_data = book_data.get('authors')
        if isinstance(authors_data, str):
            # Avoid iterating over the characters of a lone name
            authors_data = [authors_data]
        elif not isinstance(authors_data, (list, tuple)):
            return []

        authors = []
        seen = set()
        for author in authors_data:
            if isinstance(author, dict):
                author_format = author.get('format')
                if (
                    isinstance(author_format, str)
                    and author_format
                    and author_format.upper() not in ('EBOOK', 'BOTH')
                ):
                    continue
                author_name = author.get('name')
            else:
                author_name = author

            # Only non-empty string names are usable
            if not isinstance(author_name, str) or not author_name:
                continue

            if author_name not in seen:
                seen.add(author_name)
                authors.append(author_name)

        return authors
    
    def _extract_publication_date(self, book_data, mi):
        """
        Find the ebook's release date in the book data and set ``mi.pubdate``.

        The date text is looked up, in order, in ``releaseDate`` (a string, or
        a dict keyed by format where EBOOK is preferred), ``ebook.releaseDate``,
        the EBOOK entries of ``formats``, and ``publishedDate``. Missing, null
        or unexpectedly typed fields are skipped.

        Recognised formats are RFC 2822 ("Wed, 04 Oct 2023 00:00:00 GMT"),
        ISO "YYYY-MM-DD" with an optional "T..." time part, and a bare
        four-digit year. ``mi.pubdate`` is set to a UTC datetime on success.
        Otherwise ``mi`` is left untouched and the reason is logged; this
        method does not raise for unparseable dates.
        """
        date_text = None

        release_date = book_data.get('releaseDate')
        if release_date:
            self.log.debug(f"Checking direct releaseDate: {release_date}")
            if isinstance(release_date, dict):
                date_text = release_date.get('EBOOK') or release_date.get('ebook')
                if not date_text:
                    # Fall back to the first edition that actually has a date
                    date_text = next(
                        (
                            value
                            for key, value in release_date.items()
                            if key != '__typename' and value
                        ),
                        None,
                    )
            elif isinstance(release_date, str):
                date_text = release_date

        if not date_text:
            ebook_data = book_data.get('ebook')
            if isinstance(ebook_data, dict):
                date_text = ebook_data.get('releaseDate')
                self.log.debug(f"Checking ebook.releaseDate: {date_text}")

        if not date_text:
            formats = book_data.get('formats')
            if not isinstance(formats, (list, tuple)):
                formats = ()
            for fmt in formats:
                if not isinstance(fmt, dict):
                    continue
                fmt_type = fmt.get('type')
                if isinstance(fmt_type, str) and fmt_type.lower() == 'ebook':
                    date_text = fmt.get('releaseDate')
                    self.log.debug(f"Checking format.releaseDate: {date_text}")
                    if date_text:
                        break

        if not date_text:
            date_text = book_data.get('publishedDate')
            self.log.debug(f"Checking publishedDate: {date_text}")

        if not date_text:
            self.log.warning("No publication date found")
            return

        if not isinstance(date_text, str):
            self.log.warning(
                f"Unsupported publication date value {date_text!r} for url: {self.url}"
            )
            return

        try:
            from calibre.utils.date import utc_tz
            import email.utils

            # Format 1: "Wed, 04 Oct 2023 00:00:00 GMT" (RFC 2822)
            if ',' in date_text and ('GMT' in date_text or 'UTC' in date_text):
                try:
                    parsed_tuple = email.utils.parsedate_tz(date_text)
                    if parsed_tuple:
                        timestamp = email.utils.mktime_tz(parsed_tuple)
                        mi.pubdate = datetime.datetime.fromtimestamp(timestamp, tz=utc_tz)
                        self.log.info(f"Set publication date to: {mi.pubdate}")
                        return
                except (TypeError, ValueError, OverflowError, OSError) as e:
                    # Not a usable RFC date after all; try the ISO forms below
                    self.log.debug(f"Failed to parse RFC date: {e}")

            # Format 2: ISO "2023-05-04T00:00:00Z" -> keep only the date part
            iso_text = date_text.split('T')[0]

            pubdate = None
            if len(iso_text) >= 10:
                # Full date: YYYY-MM-DD
                pubdate = datetime.datetime(
                    int(iso_text[:4]),
                    int(iso_text[5:7]),
                    int(iso_text[8:10]),
                    tzinfo=utc_tz,
                )
            elif len(iso_text) == 4:
                # Year only
                pubdate = datetime.datetime(int(iso_text), 1, 1, tzinfo=utc_tz)

            if pubdate is None:
                # Do not report success when nothing was parsed
                self.log.warning(
                    f"Unrecognized publication date format '{date_text}' for url: {self.url}"
                )
                return

            mi.pubdate = pubdate
            self.log.info(f"Set publication date to: {mi.pubdate}")
        except Exception:
            self.log.exception(f"Error parsing published date '{date_text}' for url: {self.url}")
    
    def _extract_tags(self, book_data):
        """
        Return the sorted, de-duplicated tag names, or None when there are none.

        Names come from the ``tags`` list, whose entries are strings or dicts
        with the name under ``details`` (a string, or a dict with ``name``),
        and from ``category`` (a string, or a dict with ``name``). A missing
        or null ``tags`` field and any name that is not a non-empty string are
        ignored, so the result can always be sorted.
        """
        tags = set()

        tags_data = book_data.get('tags')
        if not isinstance(tags_data, (list, tuple)):
            tags_data = ()

        for tag in tags_data:
            if isinstance(tag, dict):
                tag_name = tag.get('details')
                if isinstance(tag_name, dict):
                    tag_name = tag_name.get('name')
            else:
                tag_name = tag

            if isinstance(tag_name, str) and tag_name:
                tags.add(tag_name)

        category = book_data.get('category')
        if isinstance(category, dict):
            category = category.get('name')
        if isinstance(category, str) and category:
            tags.add(category)

        return sorted(tags) if tags else None
    
    def _extract_series(self, book_data):
        """
        Return series information from the book data, or None when there is none.

        The result is a dict with ``series`` (the title) and/or
        ``series_order`` (a float). For a dict-shaped ``series`` field the
        title is read from ``details.title`` / ``details.name``, then from
        ``name`` / ``title``, and the position from ``orderInSeries``. A plain
        string is taken as the title. Any other shape gives None, and a
        position that cannot be converted to a number is logged and dropped.
        """
        series_data = book_data.get('series')
        if not series_data:
            return None

        if isinstance(series_data, str):
            return {'series': series_data}
        if not isinstance(series_data, dict):
            return None

        series_info = {}

        details = series_data.get('details')
        series_title = None
        if isinstance(details, dict):
            series_title = details.get('title') or details.get('name')
        if not series_title:
            series_title = series_data.get('name') or series_data.get('title')
        if isinstance(series_title, str) and series_title:
            series_info['series'] = series_title

        order_in_series = series_data.get('orderInSeries')
        if order_in_series:
            try:
                series_info['series_order'] = float(order_in_series)
            except (TypeError, ValueError):
                # Keep the title even when the position is unusable
                self.log.warning(f"Ignoring invalid series order: {order_in_series!r}")

        return series_info if series_info else None
    
    def _set_series_info(self, mi, series_info):
        """
        Set ``mi.series`` and, when known, ``mi.series_index``.

        ``series_info`` is a dict with optional ``series`` (title) and
        ``series_order`` (number) keys. A title of the form "Name #2" is split:
        the name becomes the series and the embedded number is used as the
        index unless ``series_order`` already provides one. Nothing is changed
        when the title is missing or is not a string.
        """
        series_name = series_info.get('series')
        if not series_name:
            return
        if not isinstance(series_name, str):
            self.log.warning(f"Ignoring non-text series name: {series_name!r}")
            return

        series_order = series_info.get('series_order')

        # Check if series name contains embedded number (e.g., "Series Name #2")
        match = re.match(r'^(.+?)\s+#(\d+)$', series_name)
        if match:
            mi.series = match.group(1).strip()
            if not series_order:
                try:
                    series_order = float(match.group(2))
                except ValueError:
                    # The pattern only captures digits, so this is not
                    # expected; the index is simply left unset.
                    self.log.warning(f"Could not read series number from: {series_name!r}")
        else:
            mi.series = series_name

        if series_order:
            mi.series_index = series_order

        self.log.info(f"Set series to: {mi.series}, index: {mi.series_index}")
    
    def _extract_rating(self, book_data):
        """
        Return the book's average rating as a float, or None.

        The value is read from ``rating['averageRating']``. None is returned
        when there is no rating, when the value lies outside 0-5, or when it
        cannot be read or converted (the error is logged, not raised).
        """
        rating_data = book_data.get('rating')
        if not rating_data:
            self.log.debug("No rating found in book data")
            return None

        try:
            rating_value = rating_data.get('averageRating')
            self.log.debug(f"Found rating value: {rating_value} (type: {type(rating_value)})")
            if rating_value is None:
                return None

            rating = float(rating_value)
            self.log.debug(f"Storytel rating: {rating}")

            # Only values on the expected 0-5 scale are passed on
            if not (0 <= rating <= 5):
                self.log.warning(f"Rating {rating} out of expected 0-5 range")
                return None
            return rating
        except Exception as e:
            self.log.exception(f"Error parsing rating: {e}")
            return None
    
    def _extract_cover_url(self, book_data):
        """
        Return the cover URL with ``?quality=100`` as its only query string.

        ``cover`` may be a URL string or a dict with a ``url`` key. Any query
        string already on the URL is dropped. Returns None when no cover URL
        is available.
        """
        cover_data = book_data.get('cover')
        if not cover_data:
            return None

        if isinstance(cover_data, dict):
            cover_url = cover_data.get('url')
        elif isinstance(cover_data, str):
            cover_url = cover_data
        else:
            cover_url = None

        if not cover_url:
            return None

        # Drop any existing query string, then request the best quality
        if '?' in cover_url:
            cover_url = cover_url.split('?')[0]
        cover_url = f"{cover_url}?quality=100"

        self.log.info(f"Found cover: {cover_url}")
        return cover_url