#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import, print_function)

# Module-level metadata: licence, copyright holder and docstring markup format.
__license__   = 'GPL v3'
__copyright__ = '2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'

import sys, traceback, urllib, re, datetime

from lxml import html

from calibre import browser, prints
from calibre.constants import DEBUG
from calibre.ebooks.metadata.fetch import MetadataSource, filter_metadata_results
from calibre.ebooks.metadata.book.base import Metadata

class Goodreads(MetadataSource):
    """Metadata source that looks a book up on goodreads.com.

    ``fetch`` stores at most one result in ``self.results``; any exception
    raised during the lookup is recorded on ``self.exception`` / ``self.tb``
    instead of being propagated.
    """

    name                = 'Goodreads Metadata'
    metadata_type       = 'basic'
    description         = _('Downloads all metadata where available from goodreads.com')
    supported_platforms = ['windows', 'osx', 'linux']
    author              = 'Grant Drake'
    version             = (1, 3, 3)

    def fetch(self):
        """Look the book up and populate ``self.results``.

        Does nothing when there is neither an ISBN nor a title/author pair
        to search with.
        """
        try:
            # A search needs an ISBN, or failing that both a title and an author
            if self.isbn is None and (not self.title or not self.book_author):
                return
            # Only a single result is ever returned: Goodreads appears to use
            # one book page to represent all editions, and fetching just one
            # page keeps the lookup quick.
            book = get_social_metadata(self.title, self.book_author,
                    self.publisher, self.isbn)
            self.results = [book] if book else []
        except Exception as e:
            # Record the failure on the instance rather than raising
            self.exception = e
            self.tb = traceback.format_exc()

# The "Popular shelves" (genres) of a Goodreads book are used as candidate tags.
# Many shelves would never be wanted as tags - too many to maintain as a list of
# exclusions - so this is an "inclusion" dict instead: it maps a Goodreads genre
# name to the Calibre tag to apply. A value may hold several comma separated
# tags. Genres missing from this dict are ignored.
# Perhaps one day this will be configurable.
calibre_tag_lookup = {
    'Anthologies': 'Anthologies',
    'Adventure': 'Adventure',
    'Adult Fiction': 'Adult',
    'Adult': 'Adult',
    'Art': 'Art',
    'Biography': 'Biography',
    'Biography Memoir': 'Biography',
    'Business': 'Business',
    'Chick-lit': 'Chick-lit',
    'Childrens': 'Childrens',
    'Classics': 'Classics',
    'Comics': 'Comics',
    'Graphic Novels Comics': 'Comics',
    'Contemporary': 'Contemporary',
    'Cookbooks': 'Cookbooks',
    'Crime': 'Crime',
    'Fantasy': 'Fantasy',
    'Feminism': 'Feminism',
    'Gardening': 'Gardening',
    'Gay': 'Gay',
    'Glbt': 'Gay',
    'Health': 'Health',
    'History': 'History',
    'Historical Fiction': 'Historical',
    'Horror': 'Horror',
    'Comedy': 'Humour',
    'Humor': 'Humour',
    'Inspirational': 'Inspirational',
    'Sequential Art > Manga': 'Manga',
    'Modern': 'Modern',
    'Music': 'Music',
    'Mystery': 'Mystery',
    'Non Fiction': 'Non-Fiction',
    'Paranormal': 'Paranormal',
    'Religion': 'Religion',
    'Philosophy': 'Philosophy',
    'Politics': 'Politics',
    'Poetry': 'Poetry',
    'Psychology': 'Psychology',
    'Reference': 'Reference',
    'Romance': 'Romance',
    'Science': 'Science',
    'Science Fiction': 'Science Fiction',
    'Science Fiction Fantasy': 'Science Fiction,Fantasy',
    'Self Help': 'Self Help',
    'Sociology': 'Sociology',
    'Spirituality': 'Spirituality',
    'Suspense': 'Suspense',
    'Thriller': 'Thriller',
    'Travel': 'Travel',
    'Paranormal > Vampires': 'Vampires',
    'War': 'War',
    'Western': 'Western',
    'Language > Writing': 'Writing',
    'Writing > Essays': 'Writing',
    'Young Adult': 'Young Adult',
}

def convert_date_text(date_text):
    """Convert Goodreads date text into a ``datetime.date``.

    The text may be ``"2003"``, ``"December 2003"`` or ``"December 10th 2003"``.
    A missing (or unrecognised) month falls back to January and a missing
    (or unparseable) day falls back to the 1st. Surrounding whitespace and
    repeated spaces between the parts are tolerated.

    Raises ``ValueError`` if the trailing year is not numeric or the
    resulting date is invalid.
    """
    # Callers extract this text with a regex that can capture trailing
    # whitespace/newlines, which would otherwise end up in the year slice.
    date_text = date_text.strip()
    year = int(date_text[-4:])
    month = 1
    day = 1
    # Whatever precedes the year is either nothing, a month name, or a
    # month name followed by a day such as "10th".
    leading_parts = date_text[:-4].split()
    if leading_parts:
        # Need to convert the month name into a numeric value.
        # This assumes the Goodreads website only displays month names in
        # English; anything unrecognised falls back to January.
        month_dict = {"January":1, "February":2, "March":3, "April":4, "May":5, "June":6,
            "July":7, "August":8, "September":9, "October":10, "November":11, "December":12}
        month = month_dict.get(leading_parts[0], 1)
        if len(leading_parts) > 1:
            # Take the leading digits of the day, ignoring an ordinal suffix
            day_match = re.match(r'[0-9]+', leading_parts[1])
            if day_match is not None:
                day = int(day_match.group(0))
    return datetime.date(year, month, day)

def convert_genres_to_calibre_tags(genre_tags):
    """Map Goodreads genre names to a sorted list of unique Calibre tags.

    Genres without an entry in ``calibre_tag_lookup`` are dropped; an entry
    may expand to several comma separated tags.
    """
    calibre_tags = set()
    for genre in genre_tags:
        mapped = calibre_tag_lookup.get(genre)
        if mapped:
            # One genre can map onto more than one tag
            calibre_tags.update(mapped.split(','))
    return sorted(calibre_tags)

def cleanup_for_compare(text):
    """Reduce a title or author to a compact form for crude comparison.

    The text is lower-cased, everything from the first ``(`` to the last
    ``)`` is removed, and then every character other than ``a-z`` and
    ``0-9`` is dropped (so spaces, punctuation and accented letters vanish).
    """
    # First strip anything in parenthesis (greedy: first '(' to last ')')
    text = re.sub(r'(\(.*\))', '', text.lower()).strip()
    # Then mung what is left together into a single alphanumeric run
    text = re.sub(r'([^a-z0-9])', '', text)
    return text

def get_page_title(root):
    """Return the stripped text of the page's first <title> element, or None."""
    title_nodes = root.xpath('//title')
    return title_nodes[0].text_content().strip() if title_nodes else None

def get_title_series(root):
    """Extract ``(title, series_name, series_index)`` from a book page.

    Returns ``(None, None, None)`` when the title heading is not found.
    When the heading carries no recognisable series information, or the
    series number cannot be converted to a float, the whole heading text is
    returned as the title with ``None`` for series name and index.

    Heading forms currently handled:

    * "Some title (Omnibus)"
    * "Some title (#1-3)"
    * "Some title (Series #1)"
    * "Some title (Series (digital) #1)"
    * "Some title (Series #1-5)"
    * "Some title (Omnibus) (Series #1)"
    * "Some title (Omnibus) (Series (digital) #1)"
    * "Some title (Omnibus) (Series (digital) #1-5)"
    """
    title_node = root.xpath('//div[@id="metacol"]/h1[@id="bookTitle"]')
    if not title_node:
        return (None, None, None)
    title_text = title_node[0].text_content().strip()
    if '(' not in title_text:
        return (title_text, None, None)

    title, _sep, series_info = title_text.rpartition('(')
    if series_info.find('#') <= 0:
        # Cannot find the series # in expression, or it is at the start
        # like (#1-7), so consider the whole thing just as title
        return (title_text, None, None)

    # Strip off the trailing ')' - but only if that is what is there
    if series_info.endswith(')'):
        series_info = series_info[:-1]
    # The series name may itself contain parentheses, e.g. "Series (digital)",
    # in which case the split above cut it short. Pull further '(' segments
    # back off the title until the parentheses balance.
    while series_info.count(')') != series_info.count('('):
        title, _sep, segment = title.rpartition('(')
        title = title.strip()
        series_info = segment + '(' + series_info

    series_name, _sep, series_index = series_info.rpartition('#')
    series_name = series_name.strip()
    series_index = series_index.strip()
    if series_index.find('-') > 0:
        # The series is specified as a range like 1-3, 1-7 etc.
        # In future there may be config options to decide what to do,
        # such as "Use start number", "Use value xxx" like 0 etc.
        # For now just take the start number and use that.
        series_index = series_index.partition('-')[0].strip()
    try:
        index_value = float(series_index)
    except ValueError:
        # Not a plain number (e.g. "1 of 3"); rather than fail the whole
        # lookup, treat the heading as a title without series information
        return (title_text, None, None)
    return (title.strip(), series_name, index_value)

def get_authors(root):
    """Return the list of author names shown on a book page.

    Returns None when the page has no author links.
    """
    author_links = root.xpath('//div[@id="metacol"]/div[@id="bookAuthors"]/a')
    if not author_links:
        return None
    names = []
    for link in author_links:
        name = html.tostring(link, method='text', encoding=unicode).strip()
        # With multiple authors, some listed as editors, the link text can
        # come back with a trailing ',' which is not part of the name
        if name.endswith(','):
            name = name[:-1]
        names.append(name)
    return names

def get_publisher_and_date(root):
    """Extract ``(publisher, pub_date)`` from the details section of a book page.

    Either element is None when it cannot be found. ``publisher`` is the
    text following " by " (minus any trailing "(first published ...)" note)
    and ``pub_date`` is whatever ``convert_date_text`` returns for the chosen
    date text. Errors raised by ``convert_date_text`` are not caught here.
    """
    publisher = None
    pub_date = None
    publisher_node = root.xpath('//div[@id="metacol"]/div[@id="details"]/div[2]')
    if publisher_node:
        # Publisher is specified within the div above with variations of:
        #  Published December 2003 by Books On Tape <nobr class="greyText">(first published 1982)</nobr>
        #  Published June 30th 2010
        # Note that the date could be "2003", "December 2003" or "December 10th 2003"
        publisher_node_text = html.tostring(publisher_node[0], method='text', encoding=unicode)
        # See if we can find the publisher name: it is whatever follows the
        # first " by " in the text
        pub_text_parts = publisher_node_text.partition(' by ')
        if pub_text_parts[2]:
            publisher = pub_text_parts[2].strip()
            if publisher.find('(first') != -1:
                # The publisher name is followed by (first published xxx) so strip that off
                publisher = publisher.rpartition('(first')[0].strip()
        # Now look for the pubdate of this edition. It is expected at the start
        # of the string, before any " by " (when there is no " by " the
        # partition leaves the whole text in element 0).
        pubdate_text_match = re.search('Published[\n\s]*([\w\s]+)', pub_text_parts[0].strip())
        pubdate_text = None
        if pubdate_text_match is not None:
            # The run of word/whitespace characters following "Published"
            pubdate_text = pubdate_text_match.groups(0)[0]
        # If we have a first published section of text use that for the date.
        if publisher_node_text.find('(first') != -1:
            # For the publication date we will use first published date
            # Note this date could be just a year, or it could be monthname year
            pubdate_text_match = re.search('.*\(first published ([\w\s]+)', publisher_node_text)
            if pubdate_text_match is not None:
                first_pubdate_text = pubdate_text_match.groups(0)[0]
                if pubdate_text and first_pubdate_text[-4:] == pubdate_text[-4:]:
                    # Both dates end in the same year (last 4 characters match):
                    # keep the edition date found above as it could be more
                    # accurate (it may include a month and day)
                    pass
                else:
                    # Different year, or no edition date at all: prefer the
                    # first published date
                    pubdate_text = first_pubdate_text
        if pubdate_text:
            pub_date = convert_date_text(pubdate_text)
    return (publisher, pub_date)

def get_rating(root):
    """Return the book's rating as a float, or None if no rating node exists.

    All non-digit characters are removed from the rating text before
    conversion, so a value of 100 or more is taken to have lost its decimal
    point and is divided by 100.
    """
    rating_nodes = root.xpath('//div[@id="metacol"]/div[@id="bookMeta"]/span[@class="value rating"]/span')
    if not rating_nodes:
        return None
    raw_text = html.tostring(rating_nodes[0], method='text', encoding=unicode)
    digits_only = re.sub('[^0-9]', '', raw_text)
    rating = float(digits_only)
    return rating / 100 if rating >= 100 else rating

def get_comments(root):
    """Return the book description prefixed with 'SUMMARY:', or None.

    The '(less)' link text is removed and runs of spaces are collapsed to a
    single space.
    """
    description_spans = root.xpath('//div[@id="metacol"]/div[@id="description"]/span')
    if not description_spans:
        return None
    # A sufficiently long description is held in a second span
    if len(description_spans) == 1:
        desc = description_spans[0]
    else:
        desc = description_spans[1]
    desc_text = html.tostring(desc, method='text',encoding=unicode)
    comments = 'SUMMARY:\n' + desc_text.strip().replace('(less)','')
    while '  ' in comments:
        comments = comments.replace('  ',' ')
    return comments

def get_tags(root):
    """Return Calibre tags derived from the page's genres, or None if there are none.

    Goodreads does not have "tags", but it does have Genres (a wrapper around
    popular shelves). Each genre row's links are joined with ' > ' to form
    the genre name, which is then mapped through
    ``convert_genres_to_calibre_tags``.
    """
    genre_rows = root.xpath('//div[@class="stacked"]/div/div/div[@class="bigBoxContent"]/div/div')
    if not genre_rows:
        return None
    genres = set()
    for row in genre_rows:
        link_texts = [link.text_content().strip() for link in row.xpath('a')]
        if link_texts:
            genres.add(' > '.join(link_texts))
    calibre_tags = convert_genres_to_calibre_tags(genres)
    return calibre_tags if calibre_tags else None

def get_isbn(root):
    """Return the ISBN shown in the book data box, or None.

    The first cell is the identifier label and the second its value. For an
    'ISBN' row the ISBN13 embedded in the value is preferred when present,
    otherwise the first 10 characters are returned. For an 'ISBN13' row the
    whole value is returned. Any other label yields None.
    """
    cells = root.xpath('//div[@id="metacol"]/div[@id="details"]/div[@class="buttons"]/div[@id="bookDataBox"]/div/div')
    if not cells:
        return None
    label = html.tostring(cells[0], method='text', encoding=unicode).strip()
    if label == 'ISBN13':
        # We have just an ISBN13, without an ISBN10
        return html.tostring(cells[1], method='text', encoding=unicode).strip()
    if label != 'ISBN':
        return None
    value = html.tostring(cells[1], method='text', encoding=unicode).strip()
    marker = value.find('ISBN13:')
    if marker == -1:
        return value[:10]
    # Skip the 'ISBN13:' marker plus one following character, then take 13 characters
    start = marker + 8
    return value[start:start + 13]

def scrape_link_for_book_metadata(mi, url, scrape_isbn=False):
    """Download ``url`` and copy the book details found there onto ``mi``.

    ``mi`` is updated in place (title, series, authors, publisher, pubdate,
    rating, comments, tags and - when ``scrape_isbn`` is true - isbn).

    Returns False when the response is empty or the page does not look like
    a book details page, otherwise True.
    """
    br = browser()
    raw = br.open_novisit(url).read()
    if not raw:
        return False
    # Undecodable bytes are replaced rather than raising
    raw = raw.decode('utf-8', errors='replace')
    root = html.fromstring(raw)

    # Look at the <title> attribute for page to make sure that we were actually returned
    # a details page for a book. If the user had specified an invalid ISBN, then the results
    # page will just do a textual search.
    page_title = get_page_title(root)
    if page_title is None or page_title.find('search results for') != -1:
        return False
    (title, series, series_index) = get_title_series(root)
    mi.title = title
    # Series fields are only touched when a series name was found
    if series:
        mi.series = series
        mi.series_index = series_index
    mi.authors = get_authors(root)
    (publisher, pubdate) = get_publisher_and_date(root)
    mi.publisher = publisher
    mi.pubdate = pubdate
    # Calibre currently has some checks to exclude results where the publisher
    # indicates it is audiobook information. As Goodreads only has one search result
    # for a search and that "might" be the audiobook version, we need to bypass this
    if publisher and not filter_metadata_results(mi):
        # Publisher would force this result to be ignored, so blank the publisher
        # out so that the rest of the data will be imported. Better that than none!
        mi.publisher = None
    mi.rating = get_rating(root)
    mi.comments = get_comments(root)
    mi.tags = get_tags(root)
    # The ISBN is only scraped on request; otherwise mi.isbn is left as the caller set it
    if scrape_isbn:
        mi.isbn = get_isbn(root)
    return True

def scrape_link_for_title_authors(title, author):
    """Search Goodreads by title/author and return the first result's book URL.

    The first search result is only accepted when its title contains the
    Calibre title and its author contains the Calibre author (both compared
    in the compact form produced by ``cleanup_for_compare``; an author held
    as "LN, FN" is also tried as "FN LN").

    Returns the absolute URL of the book page, or None when the response is
    empty, no usable result is found, or the result is rejected.
    """
    def cleanup_title(title):
        title = title.lower().replace(',','').replace(':',' ')
        title = re.sub(r'(\(.*\))', '', title).strip()
        return title.replace(' & ', ' and ').strip()
    # Goodreads does not like commas in the query for title and will return no matches!
    # e.g. "1,000 Places to see before you do" "Patricia Schultz" returns results via web search
    # but not when invoked directly (must be to do with the quote_plus stuff)
    query = cleanup_title(title)
    if author:
        query = query + ' ' + author
    # Collapse every run of whitespace (stripping a parenthesised section
    # can leave several spaces behind) before quoting the query
    q = urllib.quote_plus(' '.join(query.split()).encode('utf-8'))
    url = 'http://www.goodreads.com/search?search_type=%22books%22&search[query]=' + q
    br = browser()
    raw = br.open_novisit(url).read()
    if not raw:
        return
    raw = raw.decode('utf-8', errors='replace')
    root = html.fromstring(raw)
    first_result = root.xpath('//table[@class="tableList"]/tr/td[2]/a')
    # The first link is read as the title and the second as the author, so
    # anything less than two links cannot be checked and is treated as no match
    if len(first_result) < 2:
        # Goodreads did not find a match
        if DEBUG:
            prints('No Goodreads title/author match found for \'%s\' by \'%s\'' %(title, author))
        return
    goodreads_title = first_result[0].text_content().strip()
    goodreads_author = first_result[1].text_content().strip()
    # For comparison purposes simplify and compact the title/author
    match_title = cleanup_for_compare(goodreads_title)
    calibre_title = cleanup_for_compare(title)
    match_author = cleanup_for_compare(goodreads_author)
    calibre_author = cleanup_for_compare(author)
    # We will keep our comparison fairly crude by requiring it to be fairly exact
    if match_title.find(calibre_title) == -1:
        if DEBUG:
            prints('Rejecting Goodreads result for title \'%s\' as found \'%s\'' %(title, goodreads_title))
        return
    # Author could be quite hard, as we have the FN LN issue to address
    # Again will keep it very crude and only compare first author
    is_author_match = match_author.find(calibre_author) != -1
    if not is_author_match and author.find(',') != -1:
        # Didn't match as exact but we have a comma in Calibre author.
        # Perhaps user is storing in LN, FN format so we will do a crude switch
        # around and compare
        author_parts = author.partition(',')
        rev_calibre_author = cleanup_for_compare(author_parts[2] + author_parts[0])
        is_author_match = match_author.find(rev_calibre_author) != -1
    if not is_author_match:
        if DEBUG:
            prints('Rejecting Goodreads result for author \'%s\' as found \'%s\'' %(author, goodreads_author))
        return
    url_first_result = root.xpath('//table[@class="tableList"]/tr/td[1]/a[2]/@href')
    if url_first_result:
        # The href is site-relative, so prefix the host
        result_url = 'http://www.goodreads.com%s' % url_first_result[0]
        return result_url

def get_social_metadata(title, author, publisher, isbn):
    """Look a book up on Goodreads and return a populated Metadata, or None.

    An ISBN, when given, is searched for directly; otherwise a title/author
    search is used to locate the book page first. ``publisher`` is accepted
    but not used by the lookup.
    """
    # Only a single author is passed in here (even if the Calibre field
    # holds several), so the authors list starts out with just that one.
    mi = Metadata(title, [author])
    if isbn:
        # With an ISBN we can go straight to the book page on the Goodreads
        # website by searching for the quoted ISBN
        mi.isbn = isbn
        url = 'http://www.goodreads.com/search?search_type=%22books%22&search[query]=%22'+isbn+'%22'
        found = scrape_link_for_book_metadata(mi, url)
    elif title and author:
        # Without an ISBN, search for title/author, take the first book url
        # from the search results page and then look that book up
        url = scrape_link_for_title_authors(title, author)
        found = scrape_link_for_book_metadata(mi, url, scrape_isbn=True) if url else False
    else:
        found = False
    return mi if found else None

# For testing, run from command line with this:
#    calibre-debug -e goodreads_metadata_plugin.py
# or for testing of isbns:
#    calibre-debug -e goodreads_metadata_plugin.py 9780451524935
# or for testing of title/author:
#    calibre-debug -e goodreads_metadata_plugin.py "1984" "Orwell, George"
def main(args=sys.argv):
    """Command line test harness: look a book up and print the result.

    One argument is treated as an ISBN, two arguments as title and author.
    With any other argument count the built-in fallback title/author is used.
    Always returns 0.
    """
    # Fallback test data in case no args specified on command line
    isbn = None
    title = 'Trapped Nerves'
    author = 'Drew Hunt'
    arg_count = len(args)
    if arg_count == 2:
        isbn, title, author = args[1], None, None
    elif arg_count == 3:
        isbn, title, author = None, args[1], args[2]
    prints(get_social_metadata(title, author, None, isbn))
    return 0

# Run the test harness only when executed as a script, using main()'s
# return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
