#!/usr/bin/env python
import datetime
import dateutil
import json
from queue import Empty, Queue
import re
import time
from threading import Thread


from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn

import cloudscraperv3


def clean_html(raw):
    """Convert ``raw`` markup to text and strip it with ``clean_ascii_chars``.

    ``raw`` is passed through calibre's ``xml_to_unicode`` (encoding
    declarations stripped, entities resolved, UTF-8 assumed); the first item
    of its result is then handed to ``clean_ascii_chars`` and returned.
    """
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    converted = xml_to_unicode(
        raw,
        strip_encoding_pats=True,
        resolve_entities=True,
        assume_utf8=True,
    )
    text = converted[0]
    return clean_ascii_chars(text)


def parse_html(raw):
    """Clean ``raw`` with ``clean_html`` and parse it with ``html5_parser``.

    Returns whatever ``html5_parser.parse`` returns for the cleaned markup.
    """
    cleaned = clean_html(raw)
    # Imported lazily, after cleaning, exactly like the other parser imports
    # in this module.
    from html5_parser import parse as parse_document

    return parse_document(cleaned)


class Worker(Thread):
    """Thread that fetches one StoryGraph book page and queues its metadata.

    ``run`` downloads the book page and its ``/community_reviews`` page with
    ``br``, parses them and puts the resulting metadata object on
    ``result_queue``.  Failures are logged through ``log`` and are never
    raised out of the thread.
    """

    def __init__(
        self, guid, relevance, result_queue, br, timeout, log, plugin, testing=False
    ):
        """Store the job parameters.

        guid: StoryGraph book id; inserted into the book URL.
        relevance: value stored as ``source_relevance`` on the result.
        result_queue: queue that receives the parsed metadata object.
        br: HTTP client; only ``br.get(url, timeout=...)`` is used.
        timeout: timeout handed to every ``br.get`` call.
        log: logger providing ``exception`` and ``warning``.
        plugin: owning plugin; used for prefs, cover-URL caching and
            ``clean_downloaded_metadata``.
        testing: when true, a dummy series is always set and a dummy
            publisher is set when the page does not provide one.
        """
        Thread.__init__(self)
        self.daemon = True
        self.guid = guid
        self.br, self.log, self.timeout = br, log, timeout
        self.result_queue, self.plugin = result_queue, plugin
        self.relevance = relevance
        self.testing = testing

    def run(self):
        """Download, parse and enqueue the metadata for ``self.guid``."""
        url = "https://app.thestorygraph.com/books/{0}".format(self.guid)
        try:
            raw = self.br.get(url, timeout=self.timeout).text
        except Exception:
            self.log.exception("Failed to load book page: %r" % url)
            return

        # The rating lives on a separate page; it is optional, so a failed
        # download only costs us the rating, not the whole result.
        raw_rating = None
        try:
            raw_rating = self.br.get(
                url + "/community_reviews", timeout=self.timeout
            ).text
        except Exception:
            self.log.exception("Failed to load book rating page: %s" % self.guid)

        try:
            mi = self.parse(raw)
            if raw_rating is not None:
                try:
                    mi.rating = self.parse_rating(raw_rating)
                except Exception:
                    self.log.exception("Failed to parse rating")
            mi.source_relevance = self.relevance
            self.plugin.clean_downloaded_metadata(mi)
            self.result_queue.put(mi)
        except Exception:
            self.log.exception("Failed to parse details for guid: %s" % self.guid)

    def parse(self, raw):
        """Build a calibre ``Metadata`` object from the book page HTML ``raw``.

        The title/author element is required; if it is missing the lookup
        raises and ``run`` logs the failure.  Every other section (series,
        edition info, tags, publisher, dates, description, cover) is
        optional and is skipped when it is absent or cannot be parsed.
        """
        from css_selectors import Select
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date
        from calibre.utils.localization import canonicalize_lang

        root = Select(parse_html(raw))
        main_info = next(root(".book-title-author-and-series > :only-child"))
        title = main_info.text.strip()
        main_info = Select(main_info)
        authors = [author.text for author in main_info('a[href^="/authors/"]')]
        mi = Metadata(title, authors)

        # Series
        series_info = [series.text for series in main_info('a[href^="/series/"]')]
        if self.testing:
            mi.series, mi.series_index = "Dummy series for testing", 1
        elif series_info:
            mi.series = series_info[0]
            if len(series_info) > 1:
                series_index = self._parse_series_index(series_info[1])
                if series_index is not None:
                    mi.series_index = series_index

        # Edition details are "<span>Label:</span> value" pairs: the label is
        # the span text, the value is the text following the span (its tail).
        edition_info = {}
        edition_root = next(root(".edition-info"), None)
        if edition_root is not None:
            for field in Select(edition_root)("span"):
                if field.text:
                    label = field.text.strip().lower().rstrip(":")
                    edition_info[label] = (field.tail or "").strip()

        # Identifiers
        if edition_info.get("isbn/uid"):
            mi.isbn = edition_info["isbn/uid"]
        mi.set_identifier("storygraph", self.guid)

        # Language
        if edition_info.get("language"):
            mi.language = canonicalize_lang(edition_info["language"])

        # Tags
        tag_section = next(root(".book-page-tag-section"), None)
        if tag_section is not None:
            tags = [tag.text.strip() for tag in tag_section if tag.text]
            tags = [tag for tag in tags if tag]
            if tags:
                mi.tags = tags

        # Publisher
        publisher = edition_info.get("publisher")
        if publisher and publisher != "Not specified":
            mi.publisher = publisher
        elif self.testing:
            mi.publisher = "Dummy publisher for testing"

        # Pubdate: either the year of first publication (last word of the
        # edition toggle link) or the date of this edition.
        toggle = next(root(".toggle-edition-info-link"), None)
        words = (toggle.text or "").split() if toggle is not None else []
        first_pubyear = words[-1] if words else ""
        pubdate = None
        # isdecimal() (unlike isnumeric()) guarantees int() accepts the text.
        if self.plugin.prefs["usefirstpub"] and first_pubyear.isdecimal():
            try:
                pubdate = datetime.datetime(
                    year=int(first_pubyear), month=1, day=31
                )
            except ValueError:
                # Year outside the range datetime supports; fall back to the
                # edition date below.
                pubdate = None
        if pubdate is None:
            try:
                pubdate = parse_only_date(edition_info["edition pub date"])
            except (ValueError, OverflowError, KeyError):
                # dateutil's ParserError is a ValueError subclass.
                self.log.warning("Failed to parse date")
        if pubdate is not None:
            mi.pubdate = pubdate

        # Comments: the description is embedded in an inline script.
        try:
            script = next(
                node
                for node in root("script:not([src])")
                if node.text and "read-more-btn" in node.text
            )
            mi.comments = self.parse_comments(script.text)
        except StopIteration:
            self.log.warning("Failed to parse description")
        except Exception:
            # A malformed description must not discard the other metadata.
            self.log.exception("Failed to parse description")

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.guid) is not None
        if not mi.has_cover:
            cover_img = next(root(".book-cover img"), None)
            cover = cover_img.get("src") if cover_img is not None else None
            if cover and "placeholder" not in cover:
                self.plugin.cache_identifier_to_cover_url(self.guid, cover)
                mi.has_cover = True

        return mi

    @staticmethod
    def _parse_series_index(text):
        """Return the series position in ``text`` (e.g. ``"#3"``) as a float.

        Returns ``None`` for empty text, for entries containing ``-`` or
        ``+`` (Calibre cannot handle multiple series entries per book) and
        for anything that is not a number.
        """
        if not text or "-" in text or "+" in text:
            return None
        try:
            return float(text.strip().lstrip("#"))
        except ValueError:
            return None

    @staticmethod
    def _extract_description_html(comments):
        """Return the description HTML embedded in the script text ``comments``.

        The first ``<div ...</div>`` span is taken from the script and its
        JavaScript string escaping is undone.  Raises ``ValueError`` when no
        such span exists or the unescaped text is not a valid string literal.
        """
        match = re.search(r"(<div.*</div>)", comments)
        if match is None:
            raise ValueError("No description markup found in script")
        comments = match.group(1)
        # Collapse any run of backslashes before a double quote to exactly
        # one, so the text can be decoded as a JSON string below.
        comments = re.sub(r'([^\\])\\*"', r"\1\"", comments)
        # Drop escapes JSON does not know: \' \$ and \/.
        comments = re.sub(r"\\+(['$])", r"\1", comments).replace(r"\/", "/")
        # Decode \xNN escapes by hand; JSON only understands \uNNNN.
        comments = re.sub(
            r"\\+x(..)", lambda match: chr(int(match.group(1), 16)), comments
        )
        return json.loads(f'"{comments}"')

    @staticmethod
    def parse_comments(comments):
        """Turn the description script text into sanitized comments HTML."""
        html = Worker._extract_description_html(comments)
        return Worker.render_comments(parse_html(html))

    @staticmethod
    def render_comments(desc):
        """Serialize the parsed description tree ``desc`` to sanitized HTML.

        ``<noscript>`` elements and the first ``<h4>`` are removed, links are
        turned into plain ``<span>`` elements, all tag attributes and HTML
        comments are stripped, and the result is passed through calibre's
        ``sanitize_comments_html``.
        """
        from lxml import etree

        from calibre.library.comments import sanitize_comments_html

        for c in desc.xpath("descendant::noscript"):
            c.getparent().remove(c)

        # Remove first heading, which is just "Description"
        headings = desc.xpath("descendant::h4")
        if headings:
            headings[0].getparent().remove(headings[0])

        for a in desc.xpath("descendant::a[@href]"):
            del a.attrib["href"]
            a.tag = "span"
        desc = etree.tostring(desc, method="html", encoding="unicode").strip()

        # remove all attributes from tags
        desc = re.sub(r"<([a-zA-Z0-9]+)\s[^>]+>", r"<\1>", desc)
        # Remove comments
        desc = re.sub(r"(?s)<!--.*?-->", "", desc)
        return sanitize_comments_html(desc)

    def parse_rating(self, raw):
        """Return the number in ``.average-star-rating`` of the reviews page.

        Raises if the element is missing or its text is not a number; ``run``
        logs that and carries on without a rating.
        """
        from css_selectors import Select

        root = Select(parse_html(raw))
        return float(next(root(".average-star-rating")).text.strip())


class StorygraphImpl:
    """StoryGraph search, identify and cover lookup on behalf of ``plugin``.

    ``plugin`` supplies the helpers used here: token extraction, the
    ISBN/cover-URL caches and result ranking.
    """

    def __init__(self, plugin):
        self.plugin = plugin

    @property
    def session(self):
        """Return the shared scraper session, creating it on first use."""
        if not hasattr(self, "_session"):
            self._session = cloudscraperv3.create_scraper(
                debug=True,
                auto_refresh_on_403=False,
                # browser={
                #    "custom": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
                # },
                # interpreter="v8",
                # ecdhCurve="secp384r1",
                delay=10,
            )
        return self._session

    def _get_webpage(self, url: str, timeout: int, log):
        """Fetch ``url`` and return its parsed root element, or ``None``.

        The request is retried up to 15 times, one second apart, while the
        response still looks like a Cloudflare challenge page.  ``None`` is
        returned when every attempt is challenged or any exception occurs;
        both cases are logged.
        """
        import html5lib

        session = self.session
        try:
            for _ in range(15):
                resp = session.get(url, timeout=timeout)
                page = html5lib.parse(
                    resp.text, namespaceHTMLElements=False, treebuilder="lxml"
                ).getroot()
                # If we failed to get past the cloudflare protection, we get a page with one of these classes
                challenged = (
                    page.xpath("//form[@class='challenge-form']")
                    or page.xpath("//form[@id='challenge-form']")
                    or page.xpath("//span[@id='challenge-error-text']")
                )
                if not challenged:
                    return page
                log.info(
                    "Could not defeat cloudflare protection - trying again for %s" % url
                )
                time.sleep(1.0)
            log.error("Could not defeat cloudflare protection - giving up for %s" % url)
            return None
        except Exception as e:
            log.error("Got exception while opening url: %s\n%s" % (url, e))
            return None

    def create_query(self, log, title=None, authors=None, identifiers=None):
        """Return the search URL for the given metadata, or ``None``.

        A valid ISBN in ``identifiers`` takes precedence; otherwise the
        plugin's title tokens and first-author tokens are used.  ``None`` is
        returned when neither yields a search term.  ``log`` is accepted for
        interface compatibility and is not used.
        """
        from urllib.parse import quote

        BASE_URL = "https://app.thestorygraph.com/browse?search_term="
        identifiers = identifiers or {}

        isbn = check_isbn(identifiers.get("isbn", None))
        if isbn is not None:
            return BASE_URL + quote(isbn)

        if title:
            title_tokens = list(self.plugin.get_title_tokens(title))
            author_tokens = list(
                self.plugin.get_author_tokens(authors, only_first_author=True)
            )
            search_str = quote(" ".join(title_tokens + author_tokens))
            if search_str:
                return BASE_URL + search_str

        return None

    def identify(
        self,
        log,
        result_queue,
        abort,
        title=None,
        authors=None,
        identifiers=None,
        timeout=30,
    ):
        """Find matching books and put their metadata on ``result_queue``.

        With a ``storygraph`` identifier that book is fetched directly;
        otherwise the search page is queried and up to five results are
        fetched by ``Worker`` threads.  If a search that used identifiers
        finds nothing, it is retried once with title and authors only.

        Returns ``None`` normally, or an error string when the search page
        could not be loaded or read.
        """
        from css_selectors import Select

        identifiers = identifiers or {}
        # NOTE(review): this reads the flag from this object, not from
        # self.plugin - confirm that whoever enables test mode sets it here.
        testing = getattr(self, "running_a_test", False)

        if "storygraph" in identifiers:
            items = [identifiers["storygraph"]]
        else:
            items = []
            query = self.create_query(
                log, title=title, authors=authors, identifiers=identifiers
            )
            if not query:
                log.error("Insufficient metadata to construct query")
                return
            log("Using query URL:", query)
            page = self._get_webpage(query, timeout=timeout, log=log)
            if page is None:
                # _get_webpage has already logged the underlying reason.
                log.error("Failed to make identify query: %r" % query)
                return "Failed to load search page: %s" % query
            try:
                # The selector yields lazily; materialise it so that an empty
                # result can be detected and errors surface inside this try.
                books = list(Select(page)(".book-pane"))
            except Exception as e:
                log.exception("Failed to make identify query: %r" % query)
                return as_unicode(e)
            if not books:
                log.error("No results")
            for book in books:
                guid = book.get("data-book-id")
                if not guid:
                    continue
                items.append(guid)
                self._cache_search_result(Select(book), guid)

        if not items and identifiers and title and authors and not abort.is_set():
            return self.identify(
                log, result_queue, abort, title=title, authors=authors, timeout=timeout
            )

        if not items:
            log.error("Failed to get list of matching items")
            log.debug("No book entries found in search response for:", query)
            return

        limit = 1 if testing else 5
        workers = [
            Worker(
                guid,
                i,
                result_queue,
                self.session,
                timeout,
                log,
                self.plugin,
                testing=testing,
            )
            for i, guid in enumerate(items[:limit])
        ]

        started = []
        for w in workers:
            if abort.is_set():
                break
            # Don't send all requests at the same time
            time.sleep(1)
            w.start()
            started.append(w)

        # Only started threads may be joined.
        while not abort.is_set():
            a_worker_is_alive = False
            for w in started:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

    def _cache_search_result(self, book, guid):
        """Cache the ISBN and cover URL shown for one search result.

        ``book`` is a selector over the result's element.  Both values are
        optional: whatever is missing is simply not cached.
        """
        isbn_label = next(book(".edition-info > :first-child > :only-child"), None)
        isbn = (isbn_label.tail or "").strip() if isbn_label is not None else ""
        if isbn:
            self.plugin.cache_isbn_to_identifier(isbn, guid)
        img = next(book("img"), None)
        cover = img.get("src") if img is not None else None
        if cover and "placeholder" not in cover:
            self.plugin.cache_identifier_to_cover_url(guid, cover)

    def get_cover_url(
        self, log, abort, title=None, authors=None, identifiers=None, timeout=30
    ):
        """Run ``identify`` and return the best result's cached cover URL.

        Results are ordered with the plugin's ``identify_results_keygen``;
        the first one for which the plugin has a cached cover URL wins.
        Returns ``None`` when aborted or when no result has a cover.
        """
        identifiers = identifiers or {}
        rq = Queue()
        self.identify(
            log,
            rq,
            abort,
            title=title,
            authors=authors,
            identifiers=identifiers,
            timeout=timeout,
        )
        if abort.is_set():
            return
        results = []
        while True:
            try:
                results.append(rq.get_nowait())
            except Empty:
                break
        # Ranking and the cover cache are provided by the plugin, not by
        # this class.
        results.sort(
            key=self.plugin.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers
            )
        )
        for mi in results:
            cached_url = self.plugin.get_cached_cover_url(mi.identifiers)
            if cached_url:
                return cached_url

    def get_cover(self, cached_url, timeout=30):
        """Download ``cached_url`` with the shared session; return the bytes."""
        br = self.session
        return br.get(cached_url, timeout=timeout).content
