#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=56:sw=4:sta:et:sts=4:ai softtabstop=0 noexpandtab

__license__   = 'GPL v3'
__copyright__ = '2018, Steven Dick <kg4ydw@gmail.com>'

from urllib import quote
from Queue import Queue, Empty
from collections import OrderedDict

from lxml.html import fromstring, tostring

from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source, Option
from calibre.ebooks.metadata.book.base import Metadata
import re
from datetime import datetime
from calibre.ebooks.metadata.sources.prefs import msprefs
from calibre_plugins.wikidata.SPARQLWrapper import SPARQLWrapper, JSON

from calibre_plugins.wikidata.urlfixer import UrlFixer

class Wikidata(Source):
    """calibre metadata source that looks books up on wikidata.

    Metadata is fetched with SPARQL queries against the public wikidata
    query service; cover images come from the image (P18) statements of the
    matched entity.  URL <-> identifier translation is delegated to the
    shared UrlFixer instance.
    """
    name = 'Wikidata'
    description = _('Downloads metadata and covers via wikidata')
    author = 'Steven Dick'
    version = (1, 3, 0)
    minimum_calibre_version = (0, 8, 0)

    capabilities = frozenset(['identify', 'cover'])
    # Note: metadata options don't allow deselection of identifiers in downloads
    touched_fields = frozenset(
        [ 'pubdate', 'identifier:wd', 'identifier:gutenberg', 'tags', 'series',
          'series_index', 'identifier:isbn'
      ])
    # Fields to add later: 'comments', 'publisher'
    # (which would also need: has_html_comments = True)
    supports_gzip_transfer_encoding = True
    prefer_results_with_isbn = False

    ## cover stuff
    can_get_multiple_covers = True

    # One UrlFixer shared by all instances; used by the URL helper methods.
    urlfixer = UrlFixer()

    # Not enabled:
    #config_help_message = 'Select fields to download from wikidata'

    # User-configurable preferences, read elsewhere through self.prefs[name].
    # Each Option is (name, type, default, short label, long description
    # [, choices]).
    options = [
        Option('slowsearch', 'bool', False, 'Slow search',
               'Enable slow regex search'),
        Option('gutensave', 'bool', True, 'Save gutenberg IDs',
               'Convert gutenberg URIs to IDs or save if found'),
        Option('gutenfix', 'bool', False, 'Save found gutenberg IDs',
               'Inject found gutenberg IDs in all matches without one'),
        Option('gutensearch', 'bool', True, 'Search by gutenberg ID',
               'Search wikidata by gutenberg ID'),
        Option('save_isbn', 'bool', False, 'Save ISBNs found in wikidata',
                'Save ISBNs found in wikidata; may not match the actual edition you have and will overwrite existing ISBNs'),
        Option('instance_tags', 'bool', True, 'Save instance tags',
               'Save wikidata instance of as tags'),
        Option('genre_tags', 'bool', False, 'Save genre tags',
               'Save wikidata genres as tags'),
        Option('tagmode', 'choices', 'label', 'Tag format mode',
               'Save tags as wikidata Q codes or native language labels, use the metadata tag filter to translate either',
               { 'label': 'Native language descriptive label',
                 'wdcode': 'Wikidata numeric Q code',
                 'both' : 'Both labels and codes'} ),
        Option('lang','string','en','Language code',
               "Language code to use for exact matches and labels"),
        Option('inexact', 'bool', False, 'Try inexact searches',
               'Inexact searches (ignoring author, fuzzy title) may have false positives, check results carefully'),
        #Option('tryharder', 'bool', False, 'Try all fuzzy searches',
        #       "Don't stop on first successful fuzzy search"),
        Option('strip_subtitle', 'bool', False, 'Strip subtitle',
               'Strip subtitle before performing a title word search'),
        # this should be number, but number apparently is percent
        Option('inexact_limit','string', '100', 'Search limit',
               'Limit of how many items to check for inexact author search'),
        # Double quotes here: the previous 'don''t' was two adjacent string
        # literals, which Python joins into "dont" (no apostrophe).
        Option('ignorewdid', 'bool', False, 'Ignore existing wdid',
               "Ignore existing wikidata ID (don't use previous search results)"),
        Option('debug', 'bool', False, 'debug', 'Emit more messages in log'),
        ]

    def id_from_url(self, url):
        """Translate a book URL into an identifier via the URL fixer.

        Returns whatever ``self.urlfixer.id_from_url(url)`` returns, or None
        when the translation raises.
        """
        try:
            return self.urlfixer.id_from_url(url)
        except Exception:
            # Best effort: an unrecognised URL simply yields no identifier.
            # Only Exception is caught so that KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            return None

    def get_book_urls(self, identifiers):
        """Return the book URLs the URL fixer builds for ``identifiers``.

        The result of ``self.urlfixer.get_book_urls`` is passed through
        unchanged; no extra entries (such as isfdb links) are appended here.
        """
        return self.urlfixer.get_book_urls(identifiers)

    def get_book_url(self, identifiers):
        """Build the wikidata entity URL for the book, if it has a 'wd' id.

        Returns a ``('wd', id, url)`` tuple, or None when ``identifiers`` has
        no (non-empty) 'wd' entry.
        """
        wdid = identifiers.get('wd')
        if not wdid:
            return None
        return ('wd', wdid, "http://www.wikidata.org/entity/" + wdid)
        
    def get_book_url_name(self, idtype, idval, url):
        """Return the display name for an identifier link.

        'wd' identifiers are always called "wikidata"; every other type is
        named by the URL fixer.
        """
        if idtype != "wd":
            return self.urlfixer.get_book_url_name(idtype, idval, url)
        return "wikidata"

    def wikidata_tag_from_url(self, url):
        """Extract the trailing wikidata Q code from an entity URL.

        Only the final path component is inspected; the rest of the URL is
        ignored.  Returns e.g. 'Q571', or None if the URL does not end in
        '/Q<digits>'.
        """
        found = re.search(r"/(Q\d+)$", url)
        return found.group(1) if found else None
  
    # instead of caching covers, do a separate query for them
    def download_cover(self, log, result_queue, abort,
                       title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        """Download the cover images wikidata lists for this book.

        The book must already carry a 'wd' identifier; its image (P18)
        statements are fetched with a dedicated SPARQL query (at most 10)
        and each image is downloaded with the plugin's browser.

        log            -- logger for progress and failures.
        result_queue   -- receives one ``(self, image_data)`` tuple per
                          successfully downloaded image.
        abort          -- checked with is_set() before every download;
                          assumes the Event-like object calibre passes to
                          source plugins.
        title, authors -- unused; covers are looked up by wikidata ID only.
        identifiers    -- identifier dict; only read, never modified.
        timeout        -- timeout in seconds for each image download (not
                          applied to the SPARQL query).
        get_best_cover -- stop after the first image that was downloaded.

        Always returns None.
        """
        wdid = identifiers.get('wd', None)
        if not wdid:
            log.info("No wikidata ID set")
            return None
        try:
            sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
            sparql.setQuery("""SELECT ?image WHERE { wd:%s wdt:P18 ?image. } LIMIT 10"""%wdid)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            if not (results and results['results'] and results['results']['bindings']):
                return None
            bindings = results['results']['bindings']
            log.info("Found %d images\n" % len(bindings))
            urls = [row['image']['value'] for row in bindings]
        except Exception as ack:
            # Network, service and response-shape problems all end up here;
            # a failed cover lookup is not fatal.
            log.info("Cover query failed:\n%s"%ack)
            return None

        br = self.browser
        for url in urls:
            if abort.is_set():
                return None
            log('Downloading cover from: ', url)
            try:
                cdata = br.open_novisit(url, timeout=timeout).read()
            except Exception:
                # One broken image must not prevent trying the others.
                log.exception('Failed to download cover from:', url)
                continue
            if cdata:
                result_queue.put((self, cdata))
                if get_best_cover:
                    return None
        return None


    def doquery(self, log, early='', late='', debug=False, limit=10):
        # wdt:P31 wd:Q571.  --> instance of book
        # wdt:P31/wdt:P279*  wd:Q47461344. --> instance of subclasses of written works
        # wdt:Q732577 publication
        # wdt:Q47461344 written work
        # (GROUP_CONCAT(DISTINCT ?isbn13; SEPARATOR=", ") as ?isbn13s)
        # (GROUP_CONCAT(DISTINCT ?isb10; SEPARATOR=", ") as ?isbn10s)
        moreselect = ''
        moreservice = ''
        moregroup = ''
        if self.prefs['tagmode']=='label' or self.prefs['tagmode']=='both':
            moreservice +="""
            ?instance rdfs:label ?instanceLabel.
            ?genre rdfs:label ?genreLabel."""
            moreselect += """
        (GROUP_CONCAT(DISTINCT ?instanceLabel; SEPARATOR="&") as ?instanceLabels)
        (GROUP_CONCAT(DISTINCT ?genreLabel; SEPARATOR="&") as ?genreLabels)"""
        if self.prefs['tagmode']=='wdcode' or self.prefs['tagmode']=='both':
            moreselect += """
        (GROUP_CONCAT(DISTINCT ?instance; SEPARATOR=" ") as ?instances)
        (GROUP_CONCAT(DISTINCT ?genre; SEPARATOR=" ") as ?genres)"""
        if 'series' not in msprefs["ignore_fields"]:
            moreselect += "?seriesLabel ?seriesordinal "
            moregroup +=  "?seriesLabel ?seriesordinal "
            late += """ OPTIONAL { ?book p:P179 ?seriesstatement.
            ?seriesstatement ps:P179 ?series.
            OPTIONAL { ?seriesstatement pq:P1545 ?seriesordinal. }}"""
            moreservice += " ?series rdfs:label ?seriesLabel."
        if self.prefs['save_isbn']:
            moreselect += "?isbn10 ?isbn13 "
            moregroup +=  "?isbn10 ?isbn13 "
            late += "OPTIONAL { ?book wdt:P212 ?isbn13. } OPTIONAL { ?book wdt:P957 ?isbn10. }"
        lang = self.prefs['lang']
        sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
        # leave a comment in the query so the wikidata people know about us
        query="""# calibre wikidata plugin
SELECT ?book ?bookLabel (MIN(?pubdate) as ?firstpub) 
 (GROUP_CONCAT(DISTINCT ?authorLabel; SEPARATOR="&") as ?authors)
 (GROUP_CONCAT(DISTINCT ?gutenbergid; SEPARATOR=" ") as ?gutenbergids)
 %s
 WHERE {
 SERVICE wikibase:label {
 bd:serviceParam wikibase:language "[AUTO_LANGUAGE],%s". 
 ?book rdfs:label ?bookLabel.
 ?author rdfs:label ?authorLabel.
 %s
 } 
 VALUES ?worktype { wd:Q47461344 wd:Q732577 }
  ?book wdt:P31/wdt:P279*  ?worktype.
  %s
  OPTIONAL { ?book wdt:P577 ?pubdate. }
  OPTIONAL { ?book wdt:P50 ?author. }
  OPTIONAL { ?book wdt:P2034 ?gutenbergid. }
  OPTIONAL { ?book wdt:P31 ?instance. }
  OPTIONAL { ?book wdt:P136 ?genre. }
  %s
 }
GROUP BY ?book ?bookLabel %s
LIMIT %d""" % (moreselect, lang, moreservice, early, late,moregroup,limit)
        if self.prefs['debug']: log.info(query)
        try:
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            if results and results['results'] and results['results']['bindings']:
                return results['results']['bindings']
        except Exception as ack:
            log.info('query exception:\n%s\n\n%s'%(query,ack))
        return None

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=30):
        '''
        Try progressively wider and slower queries until we get something.

        Search order: existing wikidata ID (unless 'ignorewdid' is set),
        gutenberg ID (if 'gutensearch' is set), ISBN, exact title plus first
        author, and then, only with the 'inexact' preference, exact title
        alone, all works by the first author and finally (with 'slowsearch')
        a regex match on the title.  The first query returning rows wins.

        Each returned row is put on result_queue as a Metadata object
        carrying the wikidata ID, and where available gutenberg ID, first
        publication date, tags, ISBN and series.  Its source_relevance
        counts matching author words, matching title words (fuzzy searches
        only), a found publication date and a big bonus for a matching
        gutenberg ID.  If a gutenberg ID was discovered but could not be
        attached to any result, a placeholder result from fake_book() may be
        queued as well.

        log          -- logger.
        result_queue -- queue receiving the Metadata results.
        abort        -- accepted for interface compatibility; not used here.
        title        -- title to search for, may be None.
        authors      -- list of author names, may be None; only the first
                        one is used for searching.
        identifiers  -- identifier dict; only read, never modified.
        timeout      -- accepted for interface compatibility; not used here.

        Returns None if at least one result was queued, otherwise the string
        'no matches found'.
        '''
        def quoted(text):
            # Escape text for use inside a double quoted SPARQL string
            # literal.  Without this a title such as 'The "Real" Story'
            # produced a syntactically invalid query.  Backslash must be
            # handled first so the other escapes are not doubled.
            for plain, escaped in (('\\', '\\\\'), ('"', '\\"'),
                                   ('\n', '\\n'), ('\r', '\\r')):
                text = text.replace(plain, escaped)
            return text

        lang = self.prefs['lang']
        matches = []
        titleexact = True
        gid_saved = False

        # make a fake book instead of inserting the gid if dups are kept
        insert_gid = self.prefs['gutenfix'] and not (
            ('keep_dups' in msprefs and msprefs['keep_dups']) or
            ('keep_dups' in msprefs.defaults and msprefs.defaults['keep_dups']))
        log.info("****************\nStarting search:\n")

        # find gutenberg ID just in case we can use it or save it later
        gid = identifiers.get('gutenberg', None)
        if gid and self.prefs['gutensearch']:
            log.info("gutenberg ID was supplied: "+gid)
        else:   # try harder (or if not gutensearch, try again)
            uri = identifiers.get('uri', None)
            if uri:
                result = re.match(r"https?://www\.gutenberg\.org/ebooks/(\d+)", uri)
                if result:
                    gid = result.group(1)
                    log.info("Found gutenberg ID in URI: "+gid)
            if not gid: # try stealing a uri from overdrive plugin
                # XXX sometimes this is a false positive!  need to verify it
                odid = identifiers.get('odid', None)
                if odid:
                    # XXX how to handle multiple IDs (separated by &)
                    log.info("Found overdrive id "+odid)
                    result = re.match(r"(\d+)@pg/", odid)
                    if result:
                        gid = result.group(1)
                        log.info("Found gutenberg ID in overdrive ID: "+gid)

        # query by wikidata ID
        wikidata_id = identifiers.get('wd', None)
        if wikidata_id and not self.prefs['ignorewdid']:
            log.info( "Found wikidata id, searching for entity '%s'" % wikidata_id)
            matches = self.doquery(log,late="""{ BIND(wd:%s AS ?book) . }"""%wikidata_id)

        # query wikidata by gutenberg ID
        if not matches and self.prefs['gutensearch']:
            if gid:
                log.info("Searching for gutenberg id "+gid)
                matches = self.doquery(log, early="""?book wdt:P2034 "%s". """ % quoted(gid))

        # check by isbn (REs copied from ISBN wikidata.org/wiki/Property: )
        # Note: split isbn code works, tested with a sample size of 1 (!!)
        # calibre seems to hand over ISBNs without punctuation, while
        # wikidata stores them hyphenated, so bare ISBNs are re-split with
        # fixed group sizes; the hyphenated branches may never be reached.
        isbn = identifiers.get('isbn', None)
        if not matches and isbn:
            # Note: tests searching for simple format ISBNs fails in wikidata
            bare10 = re.match(r"(\d)(\d{3})(\d{5})(\d|X)$", isbn)
            bare13 = re.match(r"(\d{3})(\d)(\d{3})(\d{5})(\d)$", isbn)
            if bare10:
                log.info( "Searching for split ISBN10 = "+isbn)
                matches = self.doquery(log,early="""?book wdt:P957 "%s".""" % "-".join(bare10.groups()))
            elif bare13:
                log.info( "Searching for split ISBN13 = " + isbn)
                matches = self.doquery(log,early="""?book wdt:P212 "%s". """ % "-".join(bare13.groups()))
            elif re.match(r"(\d{9}(\d|X)$)|(\d{1,5}-\d{1,7}-\d{1,6}-[0-9X]$)",isbn):
                log.info( "Searching for ISBN10 = "+isbn)
                matches = self.doquery(log,early="""?book wdt:P957 "%s".""" % quoted(isbn))
            # isbn13 regex from wikidata
            elif re.match(r"97[89]-([0-57]-(\d-\d{7}|\d\d-\d{6}|\d\d\d-\d{5}|\d{4}-\d{4}|\d{5}-\d\d\d|\d{6}-\d\d|\d{7}-\d)|[89]\d-(\d-\d{6}|\d\d-\d{5}|\d\d\d-\d{4}|\d{4}-\d\d\d|\d{5}-\d\d|\d{6}-\d)|[69]\d\d-(\d-\d{5}|\d\d-\d{4}|\d\d\d-\d\d\d|\d{4}-\d\d|\d{5}-\d)|99[0-8]\d-\d-\d{4}|99[0-8]\d-\d\d-\d\d\d|99[0-8]\d-\d\d\d-\d\d|99[0-8]\d-\d{4}-\d|999\d\d-\d-\d\d\d|999\d\d-\d\d-\d\d|999\d\d-\d\d\d-\d)-\d", isbn):
                log.info( "Searching for ISBN13 = " + isbn)
                matches = self.doquery(log,early="""?book wdt:P212 "%s". """ % quoted(isbn))
            else:
                log.info("ISBN not parsed correctly")

        # try author  XXX should use all authors? or choose one more carefully?
        if not matches and authors:
            author = authors[0]
            log.info("Author = "+author)
        else:
            author = None

        # try exact author / title match (for first author)
        # also try alternate title
        if not matches and (author and title):
            log.info( "Searching for author='%s' title='%s'" % (author,title))
            # I'd use format() except we need {} for sparql
            matches = self.doquery(log,early="""{ ?book rdfs:label "%s"@%s. } UNION { ?book skos:altLabel "%s"@%s. } ?book wdt:P50 ?author. ?author rdfs:label "%s"@%s."""% (quoted(title), lang, quoted(title), lang, quoted(author), lang))

        # then try just title (but should this skip the rest if successful?)
        if not matches and self.prefs['inexact'] and title:
            log.info( "Searching for title='%s'" % title)
            matches = self.doquery(log,early="""{?book rdfs:label "%s"@%s.} UNION { ?book skos:altLabel "%s"@%s. }"""% (quoted(title), lang, quoted(title), lang))

        ### all following queries do fuzzy title matches
        if not matches:
            titleexact = False

        # try searching for all works by the author and hope the title word search weeds it out
        # XXX possible more refined fuzzy searches:
        # If there are multiple authors, this could try each one?
        # Could try regex search on title keywords plus author search?
        # Also get alternate title and use for word count matching
        if not matches and self.prefs['inexact'] and author:
            log.info("Searching all works by author "+author)
            try:
                limit = int(self.prefs['inexact_limit'])
            except (TypeError, ValueError):
                # the preference is a free-form string; fall back to default
                limit = 100
            matches = self.doquery(log, early="""?book wdt:P50 ?author. ?author rdfs:label "%s"@%s. """%(quoted(author), lang), limit=limit)

        # try regex match on title (slow) (this should be user directed?)
        # XXX this should clean the title first to make it more regex like;
        # for now it is only escaped as a string literal, so regex
        # metacharacters in the title keep their regex meaning.
        if not matches and self.prefs['inexact'] and self.prefs['slowsearch'] and title:
            log.info( "Searching for regex title='%s'" % title)
            matches = self.doquery(log,early="""?book wdt:P1476 ?t.""",
                                   late="""FILTER(REGEX(?t, "%s", 'i'))"""% quoted(title))
        # XXX add more optional searches: trim title and regex
        if matches:
            log.info("Found %d matches" %len(matches))
        else:
            log.info("Found no matches")
        log.info('matches=%s'%matches)

        if not matches:
            # fake a result for gutenberg or give up?
            md = self.fake_book(log, title, authors,identifiers, gid)
            if md:
                result_queue.put(md)
                return None
            return 'no matches found'

        # wikidata should have already merged duplicates for us
        # Weigh quality of match by counting matching words in a,t
        # Don't bother trying exact matches, that already failed.
        # simplistic word match more simple than default algorithm
        authorwords = set()
        if authors:
            for name in authors:
                authorwords.update(name.split(' '))
        titlewords = frozenset()
        if not titleexact:
            titlewords = frozenset(self.get_title_tokens(title,self.prefs['strip_subtitle']))
        for match in matches:
            source_relevance = 0
            wdid = match['book']['value']
            (wdid, ok) = re.subn(r"http://www.wikidata.org/entity/", "", wdid)
            log.info('found '+wdid)
            if ok!=1:
                log.warn("Unparsable wikidata ID: "+wdid)
                # skip this book
                continue
            log.info('==='*3,wdid,'==='*3)
            mtitle = match['bookLabel']['value']
            mauthors = None
            if 'authors' in match:
                mauthors = match['authors']['value'].split('&')
                md = Metadata(mtitle, mauthors)
                # get all the author words and add weight for matches
                aw = set()
                for name in mauthors:
                    aw.update(name.split(' '))
                source_relevance += len(authorwords.intersection(aw))
            else:
                md = Metadata(mtitle, None)
            if not titleexact:  # weigh in on the closeness of the title
                source_relevance += len(titlewords.intersection(
                    set(self.get_title_tokens(mtitle))))
            # always add the wikidata ID
            md.set_identifier('wd', wdid)

            # add gutenberg ID either from wikidata or from found data
            if 'gutenbergids' in match:
                # there's suppose to be only one, but sometimes more
                mgids = match['gutenbergids']['value'].split(' ')
                # give extra bonus points for matching gid
                if gid and gid in mgids:
                    source_relevance += 100
                    gid_saved = True
                if self.prefs['gutensave']:
                    for mgid in mgids: # XXX can we save more than one?
                        md.set_identifier('gutenberg',mgid)
                    log.info("Saving wikidata gutenberg:%s for %s"%(match['gutenbergids']['value'],wdid))
            elif self.prefs['gutensave'] and gid and (
                    insert_gid or (title==mtitle and authors==mauthors)):
                # save it for exact matches or we'll be merged anyway
                gid_saved = True
                md.set_identifier('gutenberg', gid)
                log.info("Saving found gutenberg:%s for %s"%(gid,wdid))

            # set pubdate
            if 'firstpub' in match:
                try:
                    md.pubdate = datetime.strptime(match['firstpub']['value'],"%Y-%m-%dT%H:%M:%SZ")
                    source_relevance += 1   # give a point for pubdate
                except (TypeError, ValueError):
                    log.warn("unparseable date: %s"%match['firstpub']['value'])

            # look for tags to add; watch out for tags containing commas
            # which kinds of tags we get is filtered in the query itself
            tags = set()
            if self.prefs['instance_tags'] and 'instanceLabels' in match:
                t = match['instanceLabels']['value'].replace(',','')
                tags.update(t.split('&'))
            if self.prefs['genre_tags'] and 'genreLabels' in match:
                t = match['genreLabels']['value'].replace(',','')
                tags.update(t.split('&'))
            if self.prefs['instance_tags'] and 'instances' in match:
                t = match['instances']['value'].split(' ')
                tags.update(map(self.wikidata_tag_from_url, t))
            if self.prefs['genre_tags'] and 'genres' in match:
                t = match['genres']['value'].split(' ')
                tags.update(map(self.wikidata_tag_from_url, t))
            # drop empty strings / None and produce a sorted list; sorted()
            # works whether filter() returns a list or an iterator
            md.tags = sorted(tag for tag in tags if tag)

            # ISBN gets its own field; isbn13 is applied last so it wins
            # when both are present.
            # note: save_isbn preference not checked here
            # It should not be in the search results if pref is off
            if 'isbn10' in match:
                isbn = check_isbn(match['isbn10']['value'])
                if isbn: md.isbn = isbn
            if 'isbn13' in match:
                isbn = check_isbn(match['isbn13']['value'])
                if isbn: md.isbn = isbn

            # set series and series index
            if 'seriesLabel' in match:
                md.series = match['seriesLabel']['value']
            if 'seriesordinal' in match:
                try:
                    # float, not int, so that ordinals such as "1.5" survive
                    md.series_index = float(match['seriesordinal']['value'])
                except (TypeError, ValueError):
                    log.info("Series index {} rejected".format(match['seriesordinal']))
                    md.series_index = 0
            else:
                md.series_index = 0

            # XXX more metadata we could import
            # publisher (how to handle multiple results?)
            # comments (from where?)
            # more IDs

            md.source_relevance = source_relevance
            self.clean_downloaded_metadata(md)
            result_queue.put(md)
            log.info("source_relevance=%s\n"%source_relevance,md)
        #### end of match processing

        # add a fake book for the gutenberg id if appropriate
        if not insert_gid and not gid_saved:
            md = self.fake_book(log, title, authors,identifiers, gid)
            if md:
                result_queue.put(md)
        log.info(result_queue)
        return None

    def fake_book(self, log, title, authors, identifiers, gid):
        """Build a placeholder result whose only purpose is to carry ``gid``.

        A placeholder is produced only when a gutenberg ID is known, the
        'gutensave' preference is on and the book does not already have a
        'gutenberg' identifier.  It reuses the given title and authors and
        gets a very high source_relevance (1000) so that it sorts first.

        Returns the Metadata object, or None when no placeholder is needed.
        """
        if not gid or not self.prefs['gutensave'] or identifiers.get('gutenberg'):
            return None
        log.info("Creating a fake result to save gutenberg ID")
        placeholder = Metadata(title, authors)
        placeholder.set_identifier('gutenberg', gid)
        placeholder.source_relevance = 1000  # sort ahead of every real match
        return placeholder
        

    def identify_results_keygen(self, title=None, authors=None, identifiers={}):
        """Return the sort key function for identify results.

        The key is the negated source_relevance, so results with a higher
        relevance sort first.  The arguments are not used.
        """
        return lambda mi: -mi.source_relevance
