#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2012, Ian Stott <I_P_S@hotmail.com>'
__docformat__ = 'restructuredtext en'

if False:
    # This is here to keep my python error checker from complaining about
    # the builtin functions that will be defined by the plugin loading system
    # You do not need this code in your plugins
    get_icons = get_resources = None

import os
import re, cPickle

#from PyQt4.Qt import QDialog, QVBoxLayout, QPushButton, QMessageBox, QLabel
#from calibre_plugins.similar_stories.config import prefs

#from lxml import etree
from calibre.ebooks.oeb.iterator import EbookIterator
from calibre_plugins.similar_stories.common_utils import WriteDebugLog
from calibre_plugins.similar_stories.config import SIMILARITY_ALGORITHMS,INDEXING_METHODS
from math import log10, sqrt, exp, log

# Matches '<style', then one or more characters that are not '<', then
# '</style>'.  Used to drop embedded CSS before tags are stripped.  A style
# element whose body contains a '<' character is therefore not matched.
# re.MULTILINE has no effect here because the pattern has no ^ or $ anchors.
RE_STRIP_STYLE = re.compile(u'<style[^<]+</style>', re.MULTILINE | re.UNICODE)
# Matches one tag-like span: '<', one or more characters that are not '>',
# then '>'.  Substituting '' for it removes markup and keeps the text between.
RE_STRIP_MARKUP = re.compile(u'<[^>]+>', re.UNICODE)


def get_Term_Frequency(format, book_path, similarity_algorithm, indexing_method):
    '''
    Load, or build and cache, the per-word weights of one book.

    The weights are cached as a pickle next to the book: the book path with
    its extension replaced by '.term_PMRA_weight' when the selected algorithm
    is named 'PMRA', and by '.term_frequency' otherwise.  If that file can be
    read it is used as is; otherwise the book text is read, tokenised,
    weighted and the cache file is written.

    For every algorithm except 'PMRA' the weight is the term frequency:
    TF(word) = <number of occurrences of word> / <number of words in book>

    :param format: 'MOBI' or 'EPUB' (compared case-insensitively)
    :param book_path: path of the book file to index
    :param similarity_algorithm: index into SIMILARITY_ALGORITHMS
    :param indexing_method: index into INDEXING_METHODS
    :return: the word -> weight dict when the indexing method is 'memory',
             otherwise the name of the cache file.  Returns 0 when the format
             is not supported or no text could be extracted from the book.
    '''
    WriteDebugLog('Start of get_Term_Frequency for %s'%book_path)
    WriteDebugLog('format = %s'%format)
    WriteDebugLog('indexing method = %d'%indexing_method)

    similarity_algorithm_name = SIMILARITY_ALGORITHMS[similarity_algorithm]

    WriteDebugLog('similarity_algorithm = %d (%s)'%(similarity_algorithm,similarity_algorithm_name))

    use_pmra = (similarity_algorithm_name == 'PMRA')

    # splitext only removes a real trailing extension, so a dot inside a
    # directory name or a path without any extension is left intact.
    full_path = os.path.splitext(book_path)[0]
    if use_pmra:
        file_extension = '.term_PMRA_weight'
    else:
        file_extension = '.term_frequency'
    results_filename = full_path + file_extension

    WriteDebugLog('results results_filename: %s'%results_filename)

    # Simply attempt the read instead of testing os.path.exists() first; the
    # original author noted that the existence check seemed slower.
    # NOTE(security): unpickling runs code embedded in the file, so the cache
    # is only as trustworthy as the directory the book is stored in.
    cache_loaded = False
    try:
        with open(results_filename, 'rb') as f:
            TF_dict = cPickle.load(f)
        cache_loaded = True
    except (IOError, EOFError, cPickle.UnpicklingError):
        # missing, unreadable or truncated cache: rebuild it below
        pass

    if cache_loaded:
        WriteDebugLog('read in existing term_frequency')
        WriteDebugLog('len = %d'%len(TF_dict))
    else:
        book_format = format.strip().upper()
        if book_format == 'MOBI':
            text = _read_mobi_file(book_path, strip_html=True)
            WriteDebugLog('read MOBI format')
        elif book_format == 'EPUB':
            text = _read_epub_file(book_path, strip_html=True)
            WriteDebugLog('read EPUB format')
        else:
            WriteDebugLog('bad format')
            return 0

        if text is None:
            # _read_mobi_file returns None for an encrypted (DRM) book
            WriteDebugLog('no text could be read from %s'%book_path)
            return 0

        text = unicode(RE_STRIP_STYLE.sub('', text))
        text = unicode(RE_STRIP_MARKUP.sub('', text))
        # collapse every run of non-word characters into a single space
        text = re.sub(r'\W+', ' ', text)
        # drop runs of one or two character words
        text = re.sub(r'( \w{1,2})+ ', ' ', text)
        # NOTE: a leading or trailing space yields an empty-string token here;
        # it is kept so that weights stay identical to already cached indexes.
        text = re.split(' ', text.lower())

        WriteDebugLog('created word list for %s'%book_path)

        book_dict = {}
        for word in text:
            book_dict[word] = book_dict.get(word, 0) + 1

        book_length = len(text)
        WriteDebugLog('length of %s is %d words'%(book_path,book_length))

        TF_dict = {}
        if use_pmra:
            PMRA_lambda = 0.022
            PMRA_mu     = 0.013
            ln_mu_lambda = log(PMRA_mu / PMRA_lambda)
            l = sqrt(book_length)
            # the second term of the exponent does not depend on the word
            length_term = (PMRA_lambda - PMRA_mu) * l

            for word, k in book_dict.items():
                exponent = (k - 1) * ln_mu_lambda + length_term
                if exponent < 500:
                    weight_i = 1.0 / (1 + exp(exponent))
                else:
                    # exp() would overflow for very large exponents and the
                    # weight is vanishingly small anyway
                    weight_i = 0
                TF_dict[word] = weight_i

            WriteDebugLog('created TD_count for %s of len %d'%(book_path,len(TF_dict)))
        else:
            # float() keeps this a true division even without the module's
            # "from __future__ import division"
            total_words = float(book_length)
            for word, count in book_dict.items():
                TF_dict[word] = count / total_words
            WriteDebugLog('created TD_dict for %s of len %d'%(book_path,len(TF_dict)))

        with open(results_filename, 'wb') as f:
            cPickle.dump(TF_dict, f, -1)

        WriteDebugLog('saved term_frequency dict for %s '%book_path)

    if INDEXING_METHODS[indexing_method] == 'memory':
        return TF_dict
    return results_filename
    
    

def get_similarity_score(format, book_path, similarity_algorithm, indexing_method, target_dict):
    '''
    Index a book and log a provisional similarity score against a target.

    The score computed here is only the difference between the size of the
    book's index and the size of target_dict; it is written to the debug log
    and not returned.

    :param format: book format, passed through to get_Term_Frequency
    :param book_path: path of the book file
    :param similarity_algorithm: index into SIMILARITY_ALGORITHMS
    :param indexing_method: index into INDEXING_METHODS
    :param target_dict: index of the target book (only its length is used)
    :return: whatever get_Term_Frequency returned for the book
    '''
    WriteDebugLog('book %s with target having %d unique words'%(book_path, len(target_dict)))

    TF_dict = get_Term_Frequency(format, book_path, similarity_algorithm, indexing_method)
    if isinstance(TF_dict, int):
        # get_Term_Frequency returns 0 when the book could not be indexed;
        # there is nothing to measure, so hand the sentinel back unchanged.
        WriteDebugLog('book %s could not be indexed'%book_path)
        return TF_dict

    # When the indexing method is not 'memory', TF_dict is the cache file
    # name, so len() is then the length of that name, not a word count.
    WriteDebugLog('book %s with %d unique words'%(book_path, len(TF_dict)))

    similarity = len(TF_dict) - len(target_dict)
    WriteDebugLog('book %s with similarity score of %f'%(book_path, similarity))
    return TF_dict



def _read_epub_file(book_path, strip_html=False):
    '''
    Return the text of every spine item of an EPUB joined into one string.

    Each spine file is read as bytes and decoded as UTF-8 with undecodable
    bytes replaced.  With strip_html set, <style> elements and then all
    remaining tags are removed from each file before joining.
    '''
    iterator = EbookIterator(book_path)
    try:
        # entered by hand because __enter__ is given a keyword argument
        iterator.__enter__(only_input_plugin=True)
        pieces = []
        for spine_path in iterator.spine:
            with open(spine_path, 'rb') as spine_file:
                raw = spine_file.read()
            markup = raw.decode('utf-8', 'replace')
            if strip_html:
                without_css = unicode(RE_STRIP_STYLE.sub('', markup))
                markup = unicode(RE_STRIP_MARKUP.sub('', without_css))
            pieces.append(markup)
        return ''.join(pieces)
    finally:
        if iterator:
            iterator.__exit__()


def _read_mobi_file(book_path, strip_html=False):
    '''
    Return the text of a MOBI file as one string, or None if it is encrypted.

    The extracted HTML is decoded as UTF-8 with undecodable bytes replaced.
    With strip_html set, <style> elements and then all remaining tags are
    removed.
    '''
    from calibre.ebooks.mobi.reader import MobiReader
    from calibre.utils.logging import default_log

    reader = MobiReader(book_path, default_log)
    is_encrypted = reader.book_header.encryption_type != 0
    if is_encrypted:
        # DRMed book, nothing can be read
        return None

    reader.extract_text()
    content = reader.mobi_html.decode('utf-8', 'replace')
    if not strip_html:
        return content
    return RE_STRIP_MARKUP.sub('', RE_STRIP_STYLE.sub('', content))
    
def get_epub_standard_word_count(book_path):
    '''
    Return the word count of an EPUB, measured on its tag-stripped text
    with calibre's wordcount helper.
    '''
    from calibre.utils.wordcount import get_wordcount_obj
    plain_text = _read_epub_file(book_path, strip_html=True)
    return get_wordcount_obj(plain_text).words

def get_mobi_standard_word_count(book_path):
    '''
    Return the word count of a MOBI file, measured on its tag-stripped text
    with calibre's wordcount helper.  Returns None when no text was read
    (for example an encrypted book).
    '''
    from calibre.utils.wordcount import get_wordcount_obj
    plain_text = _read_mobi_file(book_path, strip_html=True)
    if plain_text:
        return get_wordcount_obj(plain_text).words
    return None

def read_words(full_path, format, library_dict):
    '''
    Count the words of one book and fold them into the library totals.

    The per-book word counts are cached as a pickle in full_path + '.dict'.
    If that file can be read, its size is returned and library_dict is left
    untouched.  Otherwise the book is read (MOBI is preferred over EPUB when
    both are listed), the counts are built and cached, and library_dict is
    updated: 'number of books' is incremented and the entry of every distinct
    word in the book is incremented by one.

    :param full_path: path of the book without its format extension
    :param format: comma separated list of available formats, any case
    :param library_dict: dict of library-wide counts; it must already
                         contain the key 'number of books'
    :return: number of distinct words in the book, or 0 when neither a MOBI
             nor an EPUB format is available or no text could be extracted
    '''
    dict_filename = full_path + '.dict'

    # NOTE(security): unpickling runs code embedded in the file, so the cache
    # is only as trustworthy as the directory the book is stored in.
    try:
        with open(dict_filename, 'rb') as f:
            book_dict = cPickle.load(f)
        return len(book_dict)
    except (IOError, EOFError, cPickle.UnpicklingError):
        # missing, unreadable or truncated cache: build & write below
        pass

    formats = re.split(',', format.lower())
    if 'mobi' in formats:
        text = _read_mobi_file(full_path + '.mobi', strip_html=True)
    elif 'epub' in formats:
        text = _read_epub_file(full_path + '.epub', strip_html=True)
    else:
        return 0

    if text is None:
        # _read_mobi_file returns None for an encrypted (DRM) book
        return 0

    text = unicode(RE_STRIP_STYLE.sub('', text))
    text = unicode(RE_STRIP_MARKUP.sub('', text))
    # collapse every run of non-word characters into a single space
    text = re.sub(r'\W+', ' ', text)
    # drop runs of one or two character words
    text = re.sub(r'( \w{1,2})+ ', ' ', text)
    # words keep their original case here
    text = re.split(' ', text)

    book_dict = {}
    library_dict['number of books'] += 1

    for word in text:
        book_dict[word] = book_dict.get(word, 0) + 1

    # each distinct word counts once per book in the library totals
    for word in book_dict:
        library_dict[word] = library_dict.get(word, 0) + 1

    with open(dict_filename, 'wb') as f:
        cPickle.dump(book_dict, f, -1)

    return len(book_dict)



