#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2012, Ian Stott <I_P_S@hotmail.com>'
__docformat__ = 'restructuredtext en'

import os, traceback, time, cPickle

from calibre.ebooks import DRMError
from calibre.utils.ipc.server import Server
from calibre.utils.ipc.job import ParallelJob

from calibre_plugins.similar_stories.statistics import get_Term_Frequency
from calibre_plugins.similar_stories.common_utils import WriteDebugLog
from calibre_plugins.similar_stories.config import SIMILARITY_ALGORITHMS,INDEXING_METHODS

from math import log10, sqrt, exp, log
from sets import Set

def _load_term_frequencies(job_result, in_memory):
    '''
    Turn the result of a child job into a term-frequency mapping.

    :param job_result: the value produced by a child job. When ``in_memory``
        is true this is used directly as the mapping; otherwise it is treated
        as the path of a pickled mapping.
    :param in_memory: True if the child returned the mapping itself.
    :return: the mapping, or None when the child produced nothing usable
        (``None``, a ``(None, None)`` failure tuple, an unreadable file, ...).
        This helper never raises, so one bad book cannot abort the master job.
    '''
    if in_memory:
        tf_dict = job_result
    else:
        # A failed child yields None or a tuple rather than a file name;
        # passing either to open() would raise and kill the master job.
        if not job_result or isinstance(job_result, (tuple, list)):
            return None
        try:
            # NOTE(review): unpickling executes arbitrary code. The file is
            # presumably written by this plugin's own child job; never point
            # this at data from an untrusted source.
            with open(job_result, 'rb') as f:
                tf_dict = cPickle.load(f)
        except Exception:
            traceback.print_exc()
            return None
    # Anything that does not look like a mapping cannot be scored.
    if not hasattr(tf_dict, 'keys'):
        return None
    return tf_dict


def _weighted_dot_products(TF_dict, IDF, target_TFIDF):
    '''
    Return ``(A.A, A.B)`` where A is the TF*IDF vector of the book described
    by ``TF_dict`` and B is the target's TF*IDF vector (``target_TFIDF``).
    Every word of ``TF_dict`` must be present in ``IDF`` and ``target_TFIDF``.
    '''
    A_dot_A = 0
    A_dot_B = 0
    for word in TF_dict:
        TFIDF_word = TF_dict[word] * IDF[word]
        A_dot_A += TFIDF_word ** 2
        A_dot_B += TFIDF_word * target_TFIDF[word]
    return A_dot_A, A_dot_B


def do_count_statistics(books_to_scan, similarity_algorithm, indexing_method, target_book_id, cpus, notification=lambda x,y:x):
    '''
    Master job: launch one child job per book to index it, then score every
    successfully indexed book against the target book.

    :param books_to_scan: sized iterable of
        ``(book_id, title, format, book_path)`` tuples.
    :param similarity_algorithm: key into ``SIMILARITY_ALGORITHMS``.
    :param indexing_method: key into ``INDEXING_METHODS``. When the looked-up
        name is ``'memory'`` the child results are used directly as
        term-frequency mappings; otherwise they are treated as paths of
        pickled mappings.
    :param target_book_id: id of the book all others are compared with.
    :param cpus: pool size handed to the job ``Server``.
    :param notification: callable ``(fraction, message)`` used for progress.
    :return: dict mapping book_id -> similarity score. Empty when
        ``books_to_scan`` is empty.
    :raises ValueError: if the target book could not be indexed, since no
        similarity can then be calculated.
    '''
    WriteDebugLog('starting do_count_statistics, target_book_id: %d'%target_book_id)

    book_similarity_map = dict()
    total = len(books_to_scan)
    if total == 0:
        # Nothing would ever arrive on the job queue, so waiting on it would
        # block forever.
        return book_similarity_map

    # Resolve the configuration before any worker is started, so that a bad
    # key fails fast instead of leaving a running server behind.
    similarity_algorithm_name = SIMILARITY_ALGORITHMS[similarity_algorithm]
    indexing_method_name = INDEXING_METHODS[indexing_method]
    in_memory = (indexing_method_name == 'memory')

    count = 0
    book_results_map = dict()
    # library dictionary: word -> number of books the word appears in
    library_dict = {}
    target_TF_dict = None
    # proportion of the time spent indexing the books, as opposed to working out the similarity
    indexing_fraction = 0.9

    server = Server(pool_size=cpus)
    try:
        # Queue all the jobs
        for book_id, title, fmt, book_path in books_to_scan:
            WriteDebugLog('Queue for book_id: %d, title: %s, format: %s, book_path: %s'%(book_id, title, fmt, book_path))
            args = ['calibre_plugins.similar_stories.jobs', 'do_statistics_for_book',
                    (fmt, book_path, similarity_algorithm, indexing_method)]
            job = ParallelJob('arbitrary', str(book_id), done=None, args=args)
            job._book_id = book_id
            job._title = title
            job._format = fmt
            job._similarity_algorithm = similarity_algorithm
            job._indexing_method = indexing_method
            server.add_job(job)

        # This server is an arbitrary_n job, so there is a notifier available.
        # Set the % complete to a small number to avoid the 'unavailable' indicator
        notification(0.01, 'Calculating Story Similarities')

        start_time = time.time()

        # dequeue the job results as they arrive, saving the results
        while count < total:
            job = server.changed_jobs_queue.get()
            # A job can 'change' when it is not finished, for example if it
            # produces a notification. Ignore these.
            job.update()
            if not job.is_finished:
                continue
            # A job really finished. Get the information.
            job_result = job.result
            book_id = job._book_id

            count += 1
            notification(float(count)/(total/indexing_fraction), 'Indexing Book %d of %d'%(count,total))
            # Add this job's output to the current log
            print('-------------------------------')
            print('Book %d of %d after %f'%(count,total,(time.time()-start_time)))
            print('Logfile for book ID %d (%s - %s)'%(book_id, job._title, job._format))
            if job._similarity_algorithm is None:
                continue

            TF_dict = _load_term_frequencies(job_result, in_memory)
            if not TF_dict:
                print('\tFAILED TO calculate Similarity Score for %s'%job._title)
                continue

            print('\tDictionary size of %d'%len(TF_dict))
            # add words of current book to library dictionary
            for word in TF_dict:
                library_dict[word] = library_dict.get(word, 0) + 1
            # if current book is the target, then keep its dictionary
            if target_book_id == book_id:
                target_TF_dict = TF_dict
            # store the raw job result (mapping or file name) for the scoring pass
            book_results_map[book_id] = job_result
    finally:
        # Always release the worker pool, even if queueing or dequeuing failed.
        server.close()

    notification(float(count)/(total/indexing_fraction), 'Indexing Target Book')

    if target_TF_dict is None:
        raise ValueError('Target book ID %d could not be indexed, so no similarities can be calculated'%target_book_id)

    # now that the library dictionary has been compiled,
    # calculate the IDF score for words
    # IDF = inverse document frequency
    # IDF(word) = log10( <number of documents in library> / <number of documents the word appears in> )
    IDF = dict()
    for word, document_count in library_dict.items():
        IDF[word] = log10(float(total) / document_count)

    # calculate target TF*IDF for every library word (0 when absent from the target)
    target_TFIDF = dict()
    # B_dot_B = sum( target_TFIDF[word] ^ 2), used for Tanimoto and Cosine similarity
    B_dot_B = 0
    for word in library_dict:
        if word in target_TF_dict:
            target_TFIDF[word] = target_TF_dict[word] * IDF[word]
            B_dot_B += (target_TFIDF[word] ** 2)
        else:
            target_TFIDF[word] = 0

    print('Magnitude of target book: B_dot_B = %f\n'%B_dot_B)
    sqrt_B_dot_B = sqrt(B_dot_B)
    # for binary tanimoto
    b = set(target_TF_dict.keys())

    # PMRA: self-similarity of the target, used to normalise every score
    if similarity_algorithm_name == 'PMRA':
        target_similarity_score = 0
        for word in library_dict:
            if word in target_TF_dict:
                target_similarity_score += target_TF_dict[word] * target_TF_dict[word] * IDF[word]

        # avoid dividing by (almost) zero further down
        if target_similarity_score < 0.00000001:
            target_similarity_score = 1
        print('\tPMRA target self-simililarity score = %f'%(target_similarity_score))

    # go through each book dict and calculate the similarity score from the
    # comparison of the TFIDF fingerprints
    print('\tsimilarity method %d = %s'%(similarity_algorithm,similarity_algorithm_name))
    print('\tindexing method %d = %s'%(indexing_method,indexing_method_name))
    for current_book, book_id in enumerate(book_results_map, 1):
        count += (1-indexing_fraction)/indexing_fraction
        notification(float(count)/((total/indexing_fraction)), 'Calculating Similarity for Book %d of %d'%(current_book,total))
        print('Calculating similarity score for book # %d, %d of %d'%(book_id,current_book,total))
        WriteDebugLog('Calculating similarity score for book # %d, %d of %d'%(book_id,current_book,total))

        # get dictionary for current book
        # from memory or file, depending upon the indexing method
        TF_dict = _load_term_frequencies(book_results_map[book_id], in_memory)
        if TF_dict is None:
            print('\tFAILED TO calculate Similarity Score for book # %d'%book_id)
            continue

        similarity = 0
        if similarity_algorithm_name == 'Euclid':
            # distance = sum ( (TFIDF_dict(word) - target_TFIDF(word))^2 )
            distance = 0
            for word in library_dict:
                if word in TF_dict:
                    TFIDF_word = TF_dict[word] * IDF[word]
                else:
                    TFIDF_word = 0
                d = TFIDF_word - target_TFIDF[word]
                distance += d * d
            similarity = 1 - 1000*distance

        elif similarity_algorithm_name == 'Tanimoto':
            # similarity =  A.B / (A^2 + B^2 - A.B)
            # A = current Book, B = target
            A_dot_A, A_dot_B = _weighted_dot_products(TF_dict, IDF, target_TFIDF)
            denominator = (A_dot_A + B_dot_B - A_dot_B)
            if denominator != 0:
                similarity = A_dot_B / denominator
            print('\tTanimoto values:\n\t\tA.A = %f\n\t\tB.B = %f\n\t\tA.B = %f\n\t\tsimilarity = %f\n'%(A_dot_A,B_dot_B,A_dot_B,similarity))

        elif similarity_algorithm_name == 'Cosine':
            # similarity =  A.B / (|A| * |B|)
            # A = current Book, B = target
            A_dot_A, A_dot_B = _weighted_dot_products(TF_dict, IDF, target_TFIDF)
            denominator = sqrt(A_dot_A) * sqrt_B_dot_B
            if denominator != 0:
                similarity = A_dot_B / denominator
            print('\tCosine values: %f / (sqrt(%f) * sqrt(%f) = %f\n'%(A_dot_B,A_dot_A,B_dot_B,similarity))

        elif similarity_algorithm_name == 'Tanimoto (binary)':
            # similarity =  T(a,b) = N(ab)/(N(a) + N(b) - N(ab))
            # a = words of current Book, b = words of target
            a = set(TF_dict.keys())
            Nab = len(a & b)
            Na = len(a)
            Nb = len(b)

            denominator = Na + Nb - Nab
            if denominator != 0:
                similarity = (1.0 * Nab) / (1.0*denominator)
            print('\tTanimoto (binary) values: %d / ( %d + %d - %d) = %f\n'%(Nab, Na, Nb, Nab,similarity))

        elif similarity_algorithm_name == 'PMRA':
            # similarity =  w(i,a).w(i,b), normalised by the target's self-similarity
            # A = current Book, B = target
            for word in TF_dict:
                similarity += TF_dict[word] * target_TFIDF[word]
            similarity = similarity / target_similarity_score
            print('\tPMRA values:\n\t\tsimilarity = %f\n'%(similarity))

        book_similarity_map[book_id] = similarity

    # return the map as the job result
    return book_similarity_map


def do_statistics_for_book(format, book_path, similarity_algorithm, indexing_method):
    '''
    Child job: compute the term-frequency data for one specific book.

    :param format: format of the book, passed through to get_Term_Frequency.
    :param book_path: path of the book file to read.
    :param similarity_algorithm: selected algorithm; when None nothing is
        computed and None is returned.
    :param indexing_method: selected indexing method, passed through to
        get_Term_Frequency.
    :return: whatever get_Term_Frequency returns, or None when nothing was
        computed or the book could not be processed. A single None (rather
        than a tuple) is returned on failure so that callers can rely on a
        plain truthiness test of the result.
    '''
    WriteDebugLog('starting do_statistics_for_book: book_path: %s'%book_path)
    if similarity_algorithm is None:
        return None
    try:
        return get_Term_Frequency(format, book_path, similarity_algorithm, indexing_method)
    except DRMError:
        print('\tCannot read book due to DRM Encryption')
    except Exception:
        # Best effort: record why this book failed in the job log and let the
        # master job carry on with the remaining books.
        traceback.print_exc()
    return None

