#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import, print_function)

__license__   = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import traceback, socket, re, sys, urllib

from lxml import html
from Queue import Queue, Empty

from calibre import browser, prints
from calibre.constants import DEBUG
from calibre.ebooks.metadata.covers import CoverDownload

class GoodreadsCovers(CoverDownload): # {{{
    'Download covers from Goodreads.com'

    GOODREADS_URL = 'http://www.goodreads.com/search?search_type=%22books%22&search[query]='
    name        = 'Goodreads Covers'
    description = 'Download covers from Goodreads.com'
    author      = 'Grant Drake'
    version     = (1, 1, 1)
    # Note that it is not currently possible in Calibre to download covers for books that
    # do not have ISBNs. Line 120 of bulk_download.py prevents this from taking place as
    # an "optimisation". The commented out line fragments below can be uncommented when/if
    # this restriction is removed.

    # Class-level cache shared by all instances. Keys are either an ISBN or a
    # title+author string; values are the cover image url that was found, or
    # None when a completed ISBN lookup found no usable cover.
    goodreads_img_url_cache = {}

    def get_cover_url(self, isbn, br, timeout=5.):
        '''
        Return the cover image url for ``isbn``, or None if none was found.

        The ISBN is appended to GOODREADS_URL and the resulting page is
        scraped by get_cover_url_from_site_page(). Results, including a
        negative (None) result, are cached per ISBN.

        :param isbn: ISBN string appended verbatim to the search url
        :param br: browser object used to perform the requests
        :param timeout: timeout passed through to the browser requests
        :raises Exception: whatever get_cover_url_from_site_page() raises
        '''
        try:
            return self.goodreads_img_url_cache[isbn]
        except KeyError:
            pass
        site_page_url = self.GOODREADS_URL + isbn
        cover_url = self.get_cover_url_from_site_page(site_page_url, br, timeout) or None
        # Only record the outcome once the lookup has actually completed. If
        # the request raised (e.g. a timeout) nothing is cached, so a transient
        # failure does not permanently mark this ISBN as having no cover.
        self.goodreads_img_url_cache[isbn] = cover_url
        return cover_url

    def has_cover(self, mi, ans, timeout=5.):
        '''
        Call ``ans.set()`` if a Goodreads cover url can be found for ``mi.isbn``.

        Returns False immediately when ``mi`` has no ISBN, otherwise returns
        None. Any exception raised by the lookup is passed to self.debug()
        and not propagated.

        :param mi: metadata object; only its ``isbn`` attribute is read
        :param ans: object with a ``set()`` method, called on success
        :param timeout: timeout passed through to the browser requests
        '''
        if not mi.isbn: #and not mi.title and not mi.authors:
            return False
        br = browser()
        try:
            #if mi.isbn:
            if self.get_cover_url(mi.isbn, br, timeout=timeout) is not None:
                self.debug('Goodreads cover for', mi.isbn, 'found')
                ans.set()
            #elif mi.title and mi.authors:
            #    if self.get_cover_url_from_title_author_search(mi.title, mi.authors, br, timeout=timeout) is not None:
            #        self.debug('Goodreads cover for', mi.title, 'found')
            #        ans.set()
        except Exception as e:
            self.debug(e)

    def get_covers(self, mi, result_queue, abort, timeout=5.):
        '''
        Download the cover for ``mi.isbn`` and put the outcome on ``result_queue``.

        Exactly one tuple is queued when ``mi`` has an ISBN:

        * ``(True, cover_data, 'jpg', self.name)`` on success
        * ``(False, message, details, self.name)`` when no cover url was found
          or an exception occurred (``details`` is then the formatted traceback)

        Returns False without queueing anything when ``mi`` has no ISBN.

        :param mi: metadata object; only its ``isbn`` attribute is read
        :param result_queue: object with a ``put()`` method receiving the result
        :param abort: not used by this implementation
        :param timeout: timeout passed through to the cover url lookup
        '''
        if not mi.isbn: #and not mi.title and not mi.authors:
            return False
        br = browser()
        try:
            # Grant - we don't have a "nice" way of reporting no cover found when invoked
            # from the Edit Metadata screen due to a flaw in the logic in it. We could create
            # an error on the queue, but that looks worse from a user perspective
            #if mi.isbn:
            url = self.get_cover_url(mi.isbn, br, timeout=timeout)
            if not url:
                result_queue.put((False, _('Cover for ISBN %s not found')%mi.isbn, '', self.name))
                return
            #else:
            #    url = self.get_cover_url_from_title_author_search(mi.title, mi.authors, br, timeout=timeout)
            #    if not url:
            #        result_queue.put((False, _('Cover for Title: %s not found')%mi.title, '', self.name))
            #        return
            cover_data = br.open_novisit(url).read()
            result_queue.put((True, cover_data, 'jpg', self.name))
        except Exception as e:
            # exception_to_string() is not defined in this class; presumably
            # it is provided by the CoverDownload base class.
            result_queue.put((False, self.exception_to_string(e),
                traceback.format_exc(), self.name))

    def cleanup_for_compare(self, text):
        '''
        Return ``text`` lower-cased with every character other than a-z and
        0-9 removed, for crude title/author comparison.
        '''
        # When we compare titles and authors between Calibre and Goodreads, mung the text together
        return re.sub('([^a-z0-9])','', text.lower())

    def get_cover_url_from_site_page(self, site_page_url, br, timeout=5.):
        '''
        Scrape ``site_page_url`` and return the url of its cover image.

        Returns None when the page has no ``imagecol`` image, when the image
        url reports a Content-Length of 0, or when checking the image url
        fails for any reason.

        :param site_page_url: url of the Goodreads page to scrape
        :param br: browser object used to perform the requests
        :param timeout: timeout passed through to the browser requests
        :raises Exception: if fetching the page itself fails. When the first
            argument of the failure is a socket.timeout, a new Exception with
            a user-facing message is raised instead of the original.
        '''
        try:
            src = br.open_novisit(site_page_url, timeout=timeout).read()
        except Exception as err:
            # An exception may carry an empty args tuple; fall back to [None]
            # so the inspection below cannot itself raise IndexError.
            err_args = getattr(err, 'args', None) or [None]
            if isinstance(err_args[0], socket.timeout):
                raise Exception(_('Goodreads.com API timed out. Try again later.'))
            # Bare raise keeps the original traceback intact
            raise

        root = html.fromstring(src)
        try:
            imgcol_node = root.xpath('//div[@id="imagecol"]/a/img/@src')
            if not imgcol_node:
                return None
            img_url = imgcol_node[0]
            # Unfortunately Goodreads sometimes have broken links so we need to do
            # an additional request to see if the URL actually exists
            info = br.open_novisit(img_url, timeout=timeout).info()
            if info.getheader('Content-Length') == '0':
                self.debug('Broken image url:', img_url)
                return None
        except Exception as e:
            # Best effort: any failure while locating or probing the image is
            # treated as "no cover", but is reported rather than hidden.
            self.debug('Failed to resolve Goodreads cover url:', e)
            return None
        return img_url

    def get_cover_url_from_title_author_search(self, title, authors, br, timeout=5.):
        '''
        Search Goodreads by title and first author and return the cover url.

        The first search result is only accepted when its title contains the
        (simplified) Calibre title and its author contains the (simplified)
        first Calibre author, also trying a "LN, FN" -> "FN LN" swap.
        Successful results are cached under the key title+author.

        :param title: book title
        :param authors: sequence of author names; only the first is used, and
            it is ignored if it equals the translated string 'Unknown'
        :param br: browser object used to perform the requests
        :param timeout: timeout passed through to the browser requests
        :return: the full size cover image url, or None if no acceptable
            match with a cover was found
        '''
        author = ''
        if authors and authors[0] != _('Unknown'):
            author = authors[0]
        cache_key = title+author
        if cache_key in self.goodreads_img_url_cache:
            return self.goodreads_img_url_cache[cache_key]

        # Goodreads does not like certain characters in the query for title and will return no matches.
        # e.g. "1,000 Places to see before you do" "Patricia Schultz" returns results via web search
        # but not when invoked directly (must be to do with the quote_plus stuff)
        pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
        title = pat.sub(' ', title)
        query = title.replace('  ',' ').strip() + ' ' + author
        q = urllib.quote_plus(query.strip().encode('utf-8'))
        url = self.GOODREADS_URL + q
        raw = br.open_novisit(url, timeout=timeout).read()
        if not raw:
            return None
        # Positional error handler: keyword arguments to decode() are only
        # accepted from Python 2.7 onwards
        raw = raw.decode('utf-8', 'replace')
        root = html.fromstring(raw)
        first_result = root.xpath('//table[@class="tableList"]/tr/td[2]/a')
        # Two anchors are needed: the first is read as the title and the
        # second as the author of the first search result
        if len(first_result) < 2:
            # Goodreads did not find a match
            if DEBUG:
                prints('No Goodreads cover title/author match found for \'%s\' by \'%s\'' %(title, author))
            return None
        goodreads_title = first_result[0].text_content().strip()
        goodreads_author = first_result[1].text_content().strip()
        # For comparison purposes simplify and compact the title/author
        match_title = self.cleanup_for_compare(goodreads_title)
        calibre_title = self.cleanup_for_compare(title)
        match_author = self.cleanup_for_compare(goodreads_author)
        calibre_author = self.cleanup_for_compare(author)
        # We will keep our comparison fairly crude by requiring it to be fairly exact
        if calibre_title not in match_title:
            if DEBUG:
                prints('Rejecting Goodreads cover result for title \'%s\' as found \'%s\'' %(title, goodreads_title))
            return None
        # Author could be quite hard, as we have the FN LN issue to address
        # Again will keep it very crude and only compare first author
        is_author_match = calibre_author in match_author
        if not is_author_match and ',' in author:
            # Didn't match as exact but we have a comma in Calibre author.
            # Perhaps user is storing in LN, FN format so we will do a crude switch
            # around and compare
            author_parts = author.partition(',')
            rev_calibre_author = self.cleanup_for_compare(author_parts[2] + author_parts[0])
            is_author_match = rev_calibre_author in match_author
        if not is_author_match:
            if DEBUG:
                prints('Rejecting Goodreads cover result for author \'%s\' as found \'%s\'' %(author, goodreads_author))
            return None
        img_url_first_result = root.xpath('//table[@class="tableList"]/tr/td[1]/a[2]/img/@src')
        if not img_url_first_result:
            return None
        # If URL contains "nocover-", the book doesn't have a cover
        if 'nocover-' in img_url_first_result[0]:
            return None
        # The img on the search screen is too small to use as a cover
        # Instead we have to do an additional scrape to get the full size cover url
        url_first_result = root.xpath('//table[@class="tableList"]/tr/td[1]/a[2]/@href')
        if not url_first_result:
            return None
        site_page_url = 'http://www.goodreads.com%s' % url_first_result[0]
        # Now we have a page for a search result for the book with an image,
        # visit that page to actually get the url of the full size image.
        cover_url = self.get_cover_url_from_site_page(site_page_url, br, timeout)
        if cover_url:
            self.goodreads_img_url_cache[cache_key] = cover_url
            return cover_url
        return None

# }}}

def test(isbn, title, author): # {{{
    '''
    Exercise the GoodreadsCovers plugin for one book and print the outcome.

    First reports whether has_cover() signalled a cover; if it did, runs
    get_covers() and reports either a successful download or each queued
    error.
    '''
    from threading import Event
    from calibre.ebooks.metadata import MetaInformation

    book = MetaInformation(title, [author])
    prints('Testing Book:', isbn, title, author)
    book.isbn = isbn
    plugin = GoodreadsCovers(None)

    cover_found = Event()
    plugin.has_cover(book, cover_found)
    has_cover = cover_found.is_set()
    prints('Has cover:', has_cover)
    if not has_cover:
        return

    abort = Event()
    queue = Queue()
    plugin.get_covers(book, queue, abort)

    # Drain everything the plugin queued
    queued = []
    while True:
        try:
            queued.append(queue.get_nowait())
        except Empty:
            break

    # Split into the (last) successful payload and the failure tuples
    cover_data, failures = None, []
    for item in queued:
        if item[0]:
            cover_data = item[1]
        else:
            failures.append(item)

    if cover_data is None:
        prints('Download failed:')
        for failure in failures:
            prints('\t', failure[-1]+':', failure[1])
    else:
        prints('Cover downloaded!')
# }}}

# For testing, run from command line with this:
#    calibre-debug -e goodreads_covers_plugin.py
# For testing with a specific ISBN:
#    calibre-debug -e goodreads_covers_plugin.py 9780307272119
# ISBN with no cover:
#    calibre-debug -e goodreads_covers_plugin.py 9780440201151
# or for testing of title/author:
#    calibre-debug -e goodreads_covers_plugin.py "1984" "Orwell, George"
def main(args=sys.argv):
    '''
    Command line entry point: run test() for the book described by ``args``.

    One argument is taken as an ISBN, two arguments as a title followed by
    an author. With any other argument count a built-in ISBN is used.
    Always returns 0.
    '''
    # Fallback test data in case no args specified on command line
    # (alternative fallback: title 'The Shadow of the Wind', author 'Carlos Ruiz Zafón')
    isbn, title, author = '9780340734810', None, None
    argc = len(args)
    if argc == 3:
        isbn, title, author = None, args[1], args[2]
    elif argc == 2:
        isbn = args[1]
    test(isbn, title, author)
    return 0

# Run the command line test harness when executed as a script
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)