﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import, print_function)

import io
from lxml import etree
import tarfile
import re
import time
import datetime

from calibre_plugins.overdrive_link import ActionOverdriveLink
from calibre_plugins.overdrive_link.book import (LibraryBook, InfoBook)
from calibre_plugins.overdrive_link.formats import (
    FORMAT_OPEN_PDF, FORMAT_OPEN_EPUB, FORMAT_MOBI_EBOOK, FORMAT_TEXT, FORMAT_HTML)
from calibre_plugins.overdrive_link.library import SearchableLibrary
from calibre_plugins.overdrive_link.net import open_url
from calibre_plugins.overdrive_link.author_prep import (alternate_author_names, normalize_author)
from calibre_plugins.overdrive_link.title_prep import normalize_title
from calibre_plugins.overdrive_link.language import LANGUAGE_NAME2
from calibre_plugins.overdrive_link.cache import ProjectGutenbergIndex
from calibre_plugins.overdrive_link.numbers import value_unit

from .python_transition import (IS_PYTHON2)
if IS_PYTHON2:
    from .python_transition import (http, repr)
else:
    import http

try:
    from calibre_plugins.overdrive_link_debug.config import DEBUG_MODE
except ImportError:
    DEBUG_MODE = False

__license__ = 'GPL v3'
__copyright__ = '2012-2022, John Howell <jhowell@acm.org>'


# Maps the trailing suffix of a Project Gutenberg e-book file URL to the plugin's
# format constant. A value of None marks a suffix that is recognized but does not
# contribute a format (it only suppresses the "unknown format" warning).
# Order is significant: entries are tested in sequence with str.endswith() and the
# first match wins, so a suffix that is a sub-string of another suffix must be
# listed after the longer one.
FORMAT_OF_SUFFIX = (
    ('.epub.noimages', FORMAT_OPEN_EPUB),
    ('.epub.images', FORMAT_OPEN_EPUB),
    ('.kindle.noimages', FORMAT_MOBI_EBOOK),
    ('.kindle.images', FORMAT_MOBI_EBOOK),
    ('.pdf.gen', FORMAT_OPEN_PDF),
    ('.pdf.noimages', FORMAT_OPEN_PDF),
    ('.pdf.images', FORMAT_OPEN_PDF),
    ('.txt.utf-8', FORMAT_TEXT),
    ('.html.gen', FORMAT_HTML),
    ('.html.noimages', FORMAT_HTML),
    ('.html.images', FORMAT_HTML),
    ('.plucker', None),         # Palm format
    ('.qioo', None),            # QiOO Mobile eBook (no images)
    ('.rst.gen', None),         # ?
    ('.rdf', None),             # metadata (the file being parsed)
    )


class ProjectGutenberg(SearchableLibrary):
    """Project Gutenberg as a searchable library, answered from a stored index.

    Searches are served from the index read through ProjectGutenbergIndex (the
    one written by build_gutenberg_index) instead of querying the web site.

    NOTE(review): self.log and self.config are read but never assigned in this
    class; presumably they are supplied by SearchableLibrary or by the caller.
    """

    id = 'pg'
    name = 'Project Gutenberg'
    formats_supported = {FORMAT_OPEN_EPUB, FORMAT_OPEN_PDF, FORMAT_MOBI_EBOOK, FORMAT_TEXT, FORMAT_HTML}

    is_project_gutenberg = True

    @staticmethod
    def validate_library_id(library_id, migrate=True, config=None):
        """Return library_id if it is the empty string; raise ValueError otherwise.

        migrate and config are accepted but not used here.
        """
        if library_id != '':
            raise ValueError('Project Gutenberg library-id must be blank, found %s' % library_id)

        return library_id

    @staticmethod
    def validate_book_id(book_id, library_id):
        """Return book_id if it consists solely of ASCII digits; raise ValueError otherwise.

        library_id is accepted but not examined.
        """
        # \Z rather than $: '$' also matches just before a trailing newline, which
        # would let an id such as '123\n' through as "numeric".
        if not re.match(r'^[0-9]+\Z', book_id):
            raise ValueError('Project Gutenberg book id must be numeric: "%s"' % book_id)

        return book_id

    @staticmethod
    def book_url(library_id, book_id):
        """Return the gutenberg.org page URL for book_id (library_id is not used)."""
        return 'http://www.gutenberg.org/ebooks/%s' % book_id

    def __init__(self):
        # Nothing to set up here; the index is loaded later by sign_in().
        pass

    def sign_in(self, use_credentials):
        """Load the Project Gutenberg index into this instance.

        No actual sign-in takes place and use_credentials is not used. If the
        index is missing or was written by a different plugin version, an error
        is logged and the instance is left with empty lookup tables.
        """
        self.authors_book_ids = {}  # default in case of index load failure
        self.books_by_id = {}       # default in case of index load failure

        pg_index = ProjectGutenbergIndex(self.log, self.config).read()

        if pg_index is None:
            self.log.error('The Project Gutenberg index file does not exist. You must build the index before searching.')
            return

        if tuple(pg_index.get('version', (0,))) != ActionOverdriveLink.version:
            self.log.error('The Project Gutenberg index file was created by a different version of the plugin.'
                           ' You must rebuild the index before searching.')
            return

        # Every top-level entry of the index becomes an instance attribute
        # (version, time_created, authors_book_ids, books_by_id).
        for pg_key, pg_val in pg_index.items():
            self.__dict__[pg_key] = pg_val

        age = int((time.time() - self.time_created) // 86400)  # seconds per day

        self.log.info('Project Gutenberg index has %s by %s created %s (%s ago)' % (
                value_unit(len(self.books_by_id), 'book'),
                value_unit(len(self.authors_book_ids), 'author'),
                datetime.datetime.fromtimestamp(self.time_created).strftime('%Y-%m-%d %H:%M:%S'),
                value_unit(age, 'day')))

        if age > 30:
            self.log.warn('The Project Gutenberg index is %s old. Creation of a new index is recommended.' % value_unit(age, 'day'))

    def find_books(self, books, search_author, search_title, keyword_search):
        """Add a LibraryBook to books for each indexed book listed under search_author.

        The lookup is an exact key match on search_author in the author index;
        search_title is not used. Keyword searches add nothing.

        Always returns False.
        NOTE(review): the meaning of the return value is defined by the caller /
        SearchableLibrary, which is not visible here -- confirm.
        """
        if keyword_search:
            return False    # requires index by title, not implemented

        for book_id in self.authors_book_ids.get(search_author, []):
            ibook = InfoBook(from_json=self.books_by_id[book_id])

            lbook = LibraryBook(
                    authors=ibook.authors, title=ibook.title, publisher=ibook.publisher,
                    language=ibook.language, formats=ibook.formats,
                    available=True, recommendable=False, lib=self, book_id=ibook.book_id,
                    search_author=search_author)

            self.log.info('Found: %s' % repr(lbook))
            books.add(lbook)

        return False

    def get_book_info(self, book_id, cache):
        """Return None; book_id and cache are not used."""
        return None     # all available book info is provided by find_books

    def get_current_book_availability(self, book_id):
        """Log that the book is assumed available and return 0 (book_id is not used)."""
        self.log.info('Assuming book always available at Project Gutenberg')
        return 0    # always available


def _parse_gutenberg_rdf(rdf_data, rdf_name, log):
    """Build an InfoBook from the RDF/XML metadata of one Project Gutenberg e-book.

    rdf_data: raw contents of the .rdf file (XML)
    rdf_name: name of the archive member, used only in error and warning messages
    log: logger that receives a warning for each unrecognized format file suffix

    Raises ValueError if the document has no pgterms:ebook element, or if that
    element has no rdf:about attribute from which to take the book id.
    """
    root = etree.fromstring(rdf_data)

    # xmlns:cc="http://web.resource.org/cc/"
    # xmlns:dcam="http://purl.org/dc/dcam/"
    # xmlns:dcterms="http://purl.org/dc/terms/"
    # xmlns:marcrel="http://www.loc.gov/loc.terms/relators/"
    # xmlns:pgterms="http://www.gutenberg.org/2009/pgterms/"
    # xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    # xml:base="http://www.gutenberg.org/"

    ebook = root.find('{http://www.gutenberg.org/2009/pgterms/}ebook')
    if ebook is None:
        raise ValueError('Missing pgterms:ebook in %s' % rdf_name)

    about = ebook.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about')
    if about is None:
        raise ValueError('Missing rdf:about for pgterms:ebook in %s' % rdf_name)

    book_id = about.rpartition('ebooks/')[2]    # text following the last 'ebooks/'

    title = ''
    book_title = ebook.findtext('{http://purl.org/dc/terms/}title')
    if book_title:
        title = normalize_title(book_title.replace('\n', ': '))

    authors = []
    for creator in ebook.findall('.//{http://purl.org/dc/terms/}creator'):
        for name in creator.findall('.//{http://www.gutenberg.org/2009/pgterms/}name'):
            if name.text is None:
                continue    # empty name element: nothing to normalize

            n = re.sub(r'\([^)]*\)', '', name.text)   # Bradford, Sarah H. (Sarah Hopkins)
            n = re.sub(r'\[[^\]]*\]', '', n)          # Braga Jr., [pseud.]
            author = normalize_author(n.strip(), unreverse=True)
            if author:
                authors.append(author)

    # Collect candidate file URLs from both the hasFormat resource attribute and
    # any nested pgterms:file elements; a set removes duplicates.
    fmt_filenames = set()
    for hasfmt in ebook.findall('.//{http://purl.org/dc/terms/}hasFormat'):
        fmt_filenames.add(hasfmt.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource', ''))

        for fmt_file in hasfmt.findall('.//{http://www.gutenberg.org/2009/pgterms/}file'):
            fmt_filenames.add(fmt_file.get('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about', ''))

    formats = set()
    for fmt_filename in fmt_filenames:
        if fmt_filename.startswith('http://www.gutenberg.org/ebooks'):
            for (suffix, book_format) in FORMAT_OF_SUFFIX:
                if fmt_filename.endswith(suffix):
                    if book_format:     # None marks a known suffix that is ignored
                        formats.add(book_format)

                    break
            else:
                log.warn('Unknown format file type "%s" in %s' % (fmt_filename, rdf_name))

    language = ''
    lang = ebook.find('.//{http://purl.org/dc/terms/}language')
    if lang is not None:
        language_code = lang.xpath("string()").strip()
        # treat unknown code as language name
        language = LANGUAGE_NAME2.get(language_code, language_code)

    publisher = ebook.findtext('{http://purl.org/dc/terms/}publisher')
    if publisher is None:
        publisher = ''

    return InfoBook(authors=authors, title=title, publisher=publisher, language=language, formats=formats,
                    provider_id=ProjectGutenberg.id, library_id='', book_id=book_id)


def build_gutenberg_index(abort, log, status, config):
    """Download the Project Gutenberg catalog and write a searchable index from it.

    Downloads the archive of per-book RDF files, parses each one into an
    InfoBook, builds a book-id table and an author-name table (including the
    alternate spellings produced by alternate_author_names), and saves the
    result through ProjectGutenbergIndex.

    abort: accepted but not used by this function
    log: logger for progress, warnings and the final summary
    status: progress reporter; update() and subrange() are called on it
    config: passed to alternate_author_names and ProjectGutenbergIndex

    Raises ValueError if the download is blocked by a captcha page, if the
    response is not a bzip2 archive, or if an RDF file lacks required metadata.
    """
    log.info('Downloading Project Gutenberg e-book catalog')

    status.update(0.01, 'downloading catalog')
    start_time = time.time()
    cookiejar = http.cookiejar.CookieJar()
    open_url(log, 'http://www.gutenberg.org/ebooks/offline_catalogs.html', cookiejar=cookiejar)    # establish cookie to allow download

    # link from: http://www.gutenberg.org/ebooks/offline_catalogs.html -- 08/2020
    response = open_url(log, 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2', cookiejar=cookiejar)

    # old: http://www.gutenberg.org/wiki/Gutenberg:Feeds
    # old: http://www.gutenberg.org/feeds/rdf-files.tar.bz2
    # working 08/2020: http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2
    # http://www.gutenberg.org/feeds/catalog.rdf.bz2

    if response.response_type == 'text/html' and 'we ask you to resolve this captcha' in response.data_string:
        raise ValueError('Project Gutenberg is blocking download. Browse to www.gutenberg.org, answer the captcha, and then retry.')

    if response.response_type != 'application/x-bzip2':
        raise ValueError('Unexpected response type %s' % response.response_type)

    tar_data_file = io.BytesIO(response.data_bytes)

    log.info('Downloaded %s in %.1f seconds' % (value_unit(len(response.data_bytes), 'byte'), time.time() - start_time))

    log.info('Parsing catalog')
    start_time = time.time()
    books = []
    parse_status = status.subrange(0.10, 0.80, 'parse catalog')

    total_files = 70000         # estimate since using len(tar_file.getnames()) causes entire archive to be read
    file_num = 0

    # The with-statement closes the archive even if parsing raises.
    with tarfile.open(fileobj=tar_data_file, mode='r:bz2') as tar_file:
        tar_info = tar_file.next()

        while tar_info is not None:
            if tar_info.isfile() and tar_info.name.endswith('.rdf'):
                if (file_num % 100) == 0:
                    # total_files is only an estimate, so keep the fraction within range
                    parse_status.update(min(file_num / total_files, 1.0), tar_info.name)

                fileobj = tar_file.extractfile(tar_info)
                try:
                    rdf_data = fileobj.read()   # xml format
                finally:
                    fileobj.close()

                books.append(_parse_gutenberg_rdf(rdf_data, tar_info.name, log))

            tar_info = tar_file.next()
            file_num += 1   # counts every archive member, not just .rdf files

    if DEBUG_MODE and file_num > total_files:
        log.warn('Adjust ProjectGutenberg total_files beyond %d' % file_num)

    log.info('Processed %s in %.1f seconds' % (value_unit(len(books), 'book'), time.time() - start_time))

    log.info('Creating index of books')
    start_time = time.time()
    index_status = status.subrange(0.80, 0.95, 'create index')

    authors_book_ids = {}
    books_by_id = {}

    for book_num, ibook in enumerate(books):
        if (book_num % 100) == 0:
            index_status.update(book_num / len(books), '')

        books_by_id[ibook.book_id] = ibook.to_json()

        for author in ibook.authors:
            for name in alternate_author_names(author, config):  # check for variants of the author name
                authors_book_ids.setdefault(name, []).append(ibook.book_id)

    pg_index = {
        'version': ActionOverdriveLink.version,
        'time_created': time.time(),
        'authors_book_ids': authors_book_ids,
        'books_by_id': books_by_id,
    }

    log.info('Created index of %s by %s in %.1f seconds' % (
            value_unit(len(books_by_id), 'book'), value_unit(len(authors_book_ids), 'author'), time.time() - start_time))

    status.update(0.95, 'saving results')
    ProjectGutenbergIndex(log, config).write(pg_index)

    log.summary('Project Gutenberg index created')
