﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import, print_function)

import re
import time
import json

from calibre.utils.date import parse_only_date

from calibre_plugins.overdrive_link.book import (LibraryBook, InfoBook)
from calibre_plugins.overdrive_link.formats import (FORMAT_SCRIBD_BOOK_READER, FORMAT_SCRIBD_COMIC_READER, FORMAT_SCRIBD_AUDIOBOOK)
from calibre_plugins.overdrive_link.library import SearchableLibrary
from calibre_plugins.overdrive_link.net import open_url
from calibre_plugins.overdrive_link.author_prep import normalize_author
from calibre_plugins.overdrive_link.title_prep import normalize_title
from calibre_plugins.overdrive_link.parseweb import (LibraryError, text_only, beautiful_soup)

from .python_transition import (IS_PYTHON2)
if IS_PYTHON2:
    from .python_transition import (http, repr, str, urllib)
else:
    import http.cookiejar
    import urllib.parse


__license__ = 'GPL v3'
__copyright__ = '2012-2022, John Howell <jhowell@acm.org>'


# Request pacing for Scribd.
#
# Scribd answers with a 'forbidden' error when requests arrive too quickly.
# Requests are paced to try to avoid that, and a long pause is taken whenever
# throttling is detected anyway.

# maximum queries per second for Scribd across all jobs
OVERALL_MAX_QPS = 2.0

# seconds to delay when throttled by Scribd (five minutes)
THROTTLE_DELAY = 5 * 60.0

# seconds to delay when Scribd has an internal error
OOPS_DELAY = 30.0

# Language name -> numeric id. The id is what gets sent as the 'language'
# parameter of a Scribd search query. Entry order is kept as originally listed.
LANGUAGES = {
    'English': 1, 'Chinese': 6, 'Spanish': 4, 'Arabic': 11,
    'Portuguese': 13, 'Japanese': 3, 'German': 9, 'French': 5,
    'Korean': 7, 'Turkish': 78, 'Vietnamese': 103, 'Russian': 14,
    'Tamil': 104, 'Italian': 8, 'Thai': 60, 'Polish': 89,
}

# Lower-case words whose presence in a book description suggests that the book
# is not written in English. Compared against each lower-cased, whitespace
# separated word of the description.
NON_ENGLISH_WORDS = {
    # German
    "aber", "als", "andere", "anderer", "anderes", "auch", "auf", "aus", "bei", "beispiel",
    "bin", "bis", "da", "damit", "dann", "das", "dass", "dem", "den", "denn", "der", "dich", "dir", "doch",
    "du", "durch", "eigentlich", "ein", "eine", "einen", "er", "erste", "erster", "erstes", "es", "für", "ganz",
    "geben", "gehen", "groß", "habe", "haben", "hier", "ich", "ihm", "ihn", "ihr", "immer", "ist", "ja", "jahr",
    "jede", "jeder", "jedes", "jetzt", "können", "kann", "kein", "kommen", "lassen", "müssen", "machen", "mehr",
    "mein", "mich", "mir", "mit", "nach", "nein", "neu", "nicht", "noch", "nur", "oben", "oder", "sagen", "schon",
    "sehen", "sehr", "sein", "selber", "selbst", "sich", "sie", "sind", "sollen", "stehen", "über", "um", "und",
    "uns", "unser", "unter", "viel", "von", "vor", "weil", "wenn", "werden", "wie", "wieder", "wir", "wissen",
    "wo", "wollen", "zeit", "zu", "zwei",

    # NOTE(review): this group was unlabeled; the words appear to be Romanian - confirm
    "la", "de", "și", "ce", "ca", "un", "cu", "nu", "va", "să", "că", "fi", "mai",
    }

# Accented/special letters that also mark a word as non-English, as a re pattern
# (a single character class, each letter listed once).
NON_ENGLISH_LETTERS = r"[àáâãäăåāąèéêëēĕėęěìíîïĩīĭįıòóôõöōŏőơùúûüũūŭůűųñńņňýÿƴßșțçćĉċč]"


# Scribd search 'content_type' value -> the plugin format that it provides.
FORMAT_OF_CONTENT_TYPE = {
    'books': FORMAT_SCRIBD_BOOK_READER,
    'audiobooks': FORMAT_SCRIBD_AUDIOBOOK,
    'comics': FORMAT_SCRIBD_COMIC_READER,
}

# NOTE(review): not referenced anywhere in the visible part of this file -
# presumably a marker once looked for in Scribd search pages; confirm before removing
SEARCH_APP = 'React.createElement(Scribd.Search.App,'


class Scribd(SearchableLibrary):
    """
    Scribd (scribd.com) library support.

    Searching uses Scribd's JSON search endpoint and book details are read from
    the HTML of the book page. All requests go through open_scribd_url(), which
    paces them and handles Scribd's throttling and error responses.
    """

    id = 'sc'
    name = 'Scribd'
    formats_supported = {FORMAT_SCRIBD_BOOK_READER, FORMAT_SCRIBD_COMIC_READER, FORMAT_SCRIBD_AUDIOBOOK}

    @staticmethod
    def validate_library_id(library_id, migrate=True, config=None):
        """
        Validate a configured library id, which must be blank for Scribd.

        'migrate' and 'config' are accepted but not used here.
        Returns the (blank) library id; raises ValueError for any non-blank value.
        """
        if library_id:
            raise ValueError('Scribd library id must be left blank: "%s"' % library_id)

        return library_id

    @staticmethod
    def validate_book_id(book_id, library_id):
        """
        Validate a Scribd book id, which must consist only of the digits 0-9.

        Returns the book id unchanged; raises ValueError if it is not numeric.
        'library_id' is not used.
        """
        # \Z rather than $ so that an id followed by a newline is rejected
        if not re.match(r'^[0-9]+\Z', book_id):
            raise ValueError('Scribd book id must be numeric: "%s"' % book_id)

        return book_id

    @staticmethod
    def book_url(library_id, book_id):
        """Return the URL of the Scribd page for a book id. 'library_id' is not used."""
        # Using /book/ will redirect properly for audiobooks
        return 'https://www.scribd.com/book/%s' % book_id

    def __init__(self):
        # cookies are shared by every Scribd request made through this object
        self.cookiejar = http.cookiejar.CookieJar()

        # NOTE(review): the attributes below are set here but not referenced by any
        # other method of this class - presumably used elsewhere; confirm before removing
        self.authors_searched = set()
        self.ids_of_author = {}
        self.author_content_types = {}
        self.filter_reported = False

    def sign_in(self, use_credentials):
        """Do nothing: Scribd sign in is intentionally not performed."""
        # sign in not working as of November 2021, 500-error, possibly needs recaptcha result
        pass

    def open_scribd_url(self, url, **kwargs):
        """
        Fetch a Scribd URL, coping with throttling and server errors.

        Keyword arguments are handed to open_url(), except for:
            book_id - optional id of the book being fetched. When given, HTTP 410
                      and 500 responses are logged and treated as "book not
                      available" instead of being raised.
        The 'expect_errors' and 'qps' arguments are always set by this method.

        Returns the response, or None if the book is not available or Scribd keeps
        answering with its "Oops!" error page.

        Raises the HTTP error response itself for any other HTTP error, and
        LibraryError if Scribd redirected the request to its home page.
        """
        book_id = kwargs.pop('book_id', None)
        kwargs['expect_errors'] = [403, 410, 500]
        kwargs['qps'] = OVERALL_MAX_QPS

        MAX_TRIES = 3       # maximum number of attempts that may end in the "Oops!" page
        oops_tries = 0

        while True:
            response = open_url(self.log, url, **kwargs)

            if response.is_httperror_exception:
                # error 403 (forbidden) occurs for throttling if requests are received too quickly by scribd.
                if (response.code == 403 and
                        'your computer or network may be sending automated search queries' in response.response_data):
                    # NOTE(review): throttling retries are not limited, so persistent throttling
                    # repeats this long delay indefinitely - confirm that this is intended
                    self.log.info('Delaying due to throttling')
                    time.sleep(THROTTLE_DELAY)
                    continue

                if book_id:
                    if response.code == 410:
                        # error 410 (gone) occurs if book is not authorized or has been removed from scribd.
                        if 'This document is not publicly available.' in response.response_data:
                            self.log.info('Access denied: %s is not publically available' % book_id)
                        elif 'Deletion notice' in response.response_data:
                            self.log.info('Deleted: %s is no longer available' % book_id)
                        else:
                            self.log.info('Gone: %s is not available' % book_id)

                        return None

                    if response.code == 500:
                        # occurs consistently for some books that are not accessible
                        self.log.warn('Internal Server Error: %s is not available' % book_id)
                        return None

                raise response

            if "<h1>Oops! Something went wrong.</h1>" in response.data_string:
                # only "Oops!" responses count toward the limit, throttling delays do not
                oops_tries += 1
                if oops_tries >= MAX_TRIES:
                    self.log.warn('Server Error: Oops! Something went wrong.')
                    return None

                self.log.info('Server Error: Oops! Something went wrong.')
                time.sleep(OOPS_DELAY)
                continue

            # a final URL with (almost) no path means the request ended up at the home page
            redirect_path = urllib.parse.urlparse(response.geturl()).path
            if len(redirect_path) < 2:
                raise LibraryError('Scribd redirected query to home page - Retry search later')

            return response

    def find_books(self, books, search_author, search_title, keyword_search):
        """
        Search Scribd and add every available matching book to 'books'.

            books          - collection that found LibraryBook objects are add()ed to
            search_author  - author text to search for (may be empty)
            search_title   - title text to search for (may be empty)
            keyword_search - if true a larger number of results is accepted

        Returns True if processing stopped because the result limit was reached,
        otherwise False.

        Raises LibraryError if Scribd did not provide any search response.
        """
        content_types = [content_type for content_type, search_format in FORMAT_OF_CONTENT_TYPE.items()
                         if search_format in self.config.search_formats]

        MAX_RESULTS_ALLOWED = 500 if keyword_search else 200

        query = []

        if search_author:
            query.append(search_author.replace("'", "").replace('.', ''))   # extraneous chars cause huge number of results

        if search_title:
            query.append(search_title)

        data = {'query': ' '.join(query)}

        if len(content_types) == 1:
            # tops (all), books, audiobooks, comics, authors, documents, sheet_music, collections, users
            data['content_type'] = content_types[0]     # only specify if only one is desired

        if self.config.search_language and self.config.search_language in LANGUAGES:
            language = self.config.search_language
            data['language'] = '%d' % LANGUAGES[language]
        else:
            language = ''

        response = self.open_scribd_url(
            'https://www.scribd.com/search/query?%s' % urllib.parse.urlencode(data),
            addheaders=[('accept', 'application/json')], cookiejar=self.cookiejar)

        if response is None:
            # open_scribd_url gave up (already logged); there is nothing to parse
            raise LibraryError('Scribd search failed due to server error - Retry search later')

        results = json.loads(response.data_string)

        # Any level of the result structure may be missing (or null), so fall back to an empty value at each step
        top_content = ((results.get('top_result') or {}).get('mixed') or {}).get('content') or {}
        docs = list(top_content.get('documents') or [])

        results_by_type = results.get('results') or {}
        if isinstance(results_by_type, dict):
            for type_results in results_by_type.values():
                docs.extend(((type_results or {}).get('content') or {}).get('documents') or [])

        results_processed = 0

        for doc in docs:
            # a document is available when it has no availability text
            available = not doc['availability']['text']

            lbook = LibraryBook(authors=[doc['author']], title=doc['title'], language=language,
                                available=available, recommendable=not available, lib=self,
                                book_id=str(doc['id']), search_author=search_author)

            if available:
                self.log.info('Found: %s' % repr(lbook))
                books.add(lbook)
            else:
                self.log.info('Ignoring unavailable: %s' % repr(lbook))

            results_processed += 1

            if results_processed >= MAX_RESULTS_ALLOWED:
                return True

        return False

    @staticmethod
    def _metadata_value_text(content, data_e2e):
        """Return the text of the 'value' div within the div tagged with 'data_e2e', or '' if either is missing."""
        container_d = content.find('div', attrs={'data-e2e': data_e2e})
        if not container_d:
            return ''

        value_d = container_d.find('div', attrs={'data-e2e': 'value'})
        if not value_d:
            return ''

        return text_only(value_d)

    @staticmethod
    def _parse_release_date(release_date_text):
        """Return the date found in text such as 'Jan 5, 2001', or None if there is no valid date."""
        m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [0-9]+, [0-9]+', release_date_text, flags=re.IGNORECASE)
        if not m:
            return None

        try:
            return parse_only_date(m.group(0), assume_utc=True)
        except Exception:
            # deliberately ignore invalid dates such as Feb 30, 1970 on
            # https://www.scribd.com/book/150477853/Tales-from-the-White-Hart
            return None

    @staticmethod
    def _looks_non_english(description):
        """
        Return True if a description contains enough non-English words to be considered not English.

        A word counts as non-English if it is listed in NON_ENGLISH_WORDS or contains
        any of NON_ENGLISH_LETTERS. A description without any words is not flagged.
        """
        non_english_letters = re.compile(NON_ENGLISH_LETTERS)     # compile once instead of once per word
        words = description.split()
        total_word_count = len(words)

        if total_word_count == 0:
            return False    # nothing to judge (also avoids dividing by zero below)

        non_english_word_count = sum(
            1 for word in words if (word.lower() in NON_ENGLISH_WORDS) or non_english_letters.search(word))

        # short descriptions: any non-English word is enough; otherwise more than 5% of the words
        return ((total_word_count < 10 and non_english_word_count > 0) or
                (float(non_english_word_count) / float(total_word_count) > 0.05))

    def get_book_info(self, book_id, cache):
        """
        Obtain the details of a book from its Scribd page.

            book_id - Scribd id of the book
            cache   - not used here

        Returns an InfoBook, or None if the page is unavailable, is not a book or
        audiobook, or lacks the expected content. An English-language search that
        turns up a book with a non-English description yields an InfoBook with no formats.
        """
        response = self.open_scribd_url(self.book_url(self.library_id, book_id),
                                        cookiejar=self.cookiejar, book_id=book_id)

        if response is None:
            return None

        # the type of item is determined from the path of the (possibly redirected) page URL
        book_path = urllib.parse.urlparse(response.geturl()).path
        if book_path.startswith('/book/'):
            formats = {FORMAT_SCRIBD_BOOK_READER}    # comics are identified as "book" in URL
        elif book_path.startswith('/audiobook/'):
            formats = {FORMAT_SCRIBD_AUDIOBOOK}
        elif book_path.startswith(('/doc/', '/document/')):
            return None     # ignored types
        else:
            self.log.info('unknown document_type=%s' % book_path)
            return None

        soup = beautiful_soup(response.data_string)

        content = soup.find('main', attrs={'data-e2e': 'content-preview-app'})
        if not content:
            self.log.error('Missing book information')
            return None

        title = ''
        title_h = content.find('h1', attrs={'data-e2e': 'desktop-content-title'})
        if title_h:
            title = normalize_title(text_only(title_h))

        authors = []
        authors_d = content.find('p', attrs={'data-e2e': 'authors'})
        if authors_d:
            authors = [normalize_author(text_only(author_a), unreverse=False)
                       for author_a in authors_d.findAll('a', recursive=True)]

        pubdate = self._parse_release_date(
            self._metadata_value_text(content, 'content-preview-metadata-release-date'))
        publisher = self._metadata_value_text(content, 'metadata-publisher')
        isbn = self._metadata_value_text(content, 'metadata-isbn')

        if self.config.search_language == 'English':
            # detect books in other languages mistakenly found when doing English search
            description_d = content.find('div', attrs={'data-e2e': 'description'})
            if description_d and self._looks_non_english(text_only(description_d)):
                self.log.info('Ignoring non-English book found using English search')
                formats = set()    # not available

        return InfoBook(authors=authors, title=title, isbn=isbn, publisher=publisher, pubdate=pubdate,
                        formats=formats, lib=self, book_id=book_id)

    def is_book_available(self, book_id):
        """Return True if the book is currently available; False if it is not or if the check fails."""
        try:
            # only an explicit False means unavailable (0 is the normal "available" result)
            return (self.get_current_book_availability(book_id) is not False)

        except Exception as e:
            self.log.exception('Checking current availability for %s' % book_id, e)
            return False

    def get_current_book_availability(self, book_id):
        """
        Check the current availability of a book by fetching its Scribd page.

        Returns False if the page could not be obtained, otherwise 0.
        """
        response = self.open_scribd_url(self.book_url(self.library_id, book_id),
                                        cookiejar=self.cookiejar, book_id=book_id)

        if response is None:
            return False

        self.log.info('Book has normal availability')
        return 0    # always available, assuming no pre-release titles
