# Proof-of-concept for the "compare text and metadata separately" mode of "Find Duplicates"

import difflib as dl
import pdftotext
import hashlib
import PyPDF2
import docx2txt
import os.path
import time
import mimetypes
from PyPDF2 import PdfFileReader


def get_file_info(file):
    """Collect path components, guessed MIME type, mtime and size of *file*.

    The timestamps and the size are also printed to stdout.

    Args:
        file: Path of the file to inspect.

    Returns:
        A dict with the keys 'drive', 'path', 'filename', 'extension',
        'filetype', 'modified' and 'size'. 'modified' is a ``time.ctime``
        string and 'size' is the size in bytes; both are ``None`` when the
        file cannot be stat'ed.
    """
    d1 = get_mimetype(file)
    d2 = split_path(file)

    try:
        # One stat call serves every value below, so they are all consistent.
        st = os.stat(file)
    except OSError:
        print('Failed to get information about {0}'.format(file))
        # Keep the result shape stable instead of failing again in the return.
        modified = None
        size = None
    else:
        modified = time.ctime(st.st_mtime)
        size = st.st_size
        print('File         :', file)
        print('Access time  :', time.ctime(st.st_atime))
        print('Modified time:', modified)
        print('Change time  :', time.ctime(st.st_ctime))
        print('Size         :', size)
    return {
        'drive': d2['drive'],
        'path': d2['path'],
        'filename': d2['filename'],
        'extension': d2['extension'],
        'filetype': d1['filetype'],
        'modified': modified,
        'size': size,
    }


def get_mimetype(file):
    """Guess the MIME type and encoding of *file* from its name.

    The guess is printed to stdout as well.

    Args:
        file: File name or path; only the name is examined.

    Returns:
        A dict with the keys 'filetype' and 'fileencoding'; either value is
        ``None`` when ``mimetypes.guess_type`` cannot determine it.
    """
    guess = mimetypes.guess_type(file)
    print('Filetype is {0}, file encoding is {1}.'.format(*guess))
    return dict(zip(('filetype', 'fileencoding'), guess))


def split_path(path):
    """Break *path* into drive, directory, base name and extension.

    Args:
        path: The path string to split.

    Returns:
        A dict with the keys 'drive', 'path' (directory part), 'filename'
        (base name without extension) and 'extension' (including the dot).
    """
    drive, remainder = os.path.splitdrive(path)
    directory, basename = os.path.split(remainder)
    stem, suffix = os.path.splitext(basename)
    return dict(drive=drive, path=directory, filename=stem, extension=suffix)


def find_title(file):
    """Guess the MIME type and encoding of *file* and print both.

    NOTE(review): despite its name this function does not look for a title;
    it only wraps ``mimetypes.guess_type``.

    Args:
        file: File name or path; only the name is examined.

    Returns:
        A dict with the keys 'filetype' and 'fileencoding'.
    """
    guess = mimetypes.guess_type(file)
    print(*guess)
    return dict(zip(('filetype', 'fileencoding'), guess))


def doc_text_extractor(path):
    """Return the text that ``docx2txt.process`` extracts from *path*.

    Args:
        path: Path of the document handed to docx2txt.

    Returns:
        The extracted text as returned by docx2txt.
    """
    return docx2txt.process(path)


def get_file_content(file):
    """Read *file* as UTF-8 text and return its whole content.

    Args:
        file: Path of the text file.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises.
    with open(file, "r", encoding='utf-8') as f:
        return f.read()


def get_pdf_info(path):
    """Read the document metadata of the PDF at *path*.

    Args:
        path: Path of the PDF file.

    Returns:
        A dict with the keys 'title', 'author', 'subject', 'creator',
        'producer', 'pdftype', 'keywords' and 'pages' (in that order).
        'title' falls back to the file name without extension, 'author' to
        'Unbekannt', 'subject' and 'keywords' to ''. 'pdftype' is 'Nur Bild'
        when ``get_pdf_content`` finds no text, otherwise 'PDF mit Text'.
        'pages' is the page count reported by PyPDF2.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        doc_info = pdf.getDocumentInfo()
        num_of_pages = pdf.getNumPages()

        # Pull every field while the file is still open; a PDF without an
        # info dictionary yields None instead of a metadata object.
        if doc_info is None:
            title = author = subject = creator = producer = None
            keywords = ''
        else:
            title = doc_info.title
            author = doc_info.author
            subject = doc_info.subject
            creator = doc_info.creator
            producer = doc_info.producer
            keywords = doc_info['/Keywords'] if '/Keywords' in doc_info else ''

    path_parts = split_path(path)

    # Text extraction is expensive, so classify the PDF only once.
    has_text = get_pdf_content(path) != ""

    return {
        'title': title if title else path_parts['filename'],
        'author': author if author else u'Unbekannt',
        'subject': subject if subject else '',
        'creator': creator,
        'producer': producer,
        'pdftype': u'PDF mit Text' if has_text else 'Nur Bild',
        'keywords': keywords,
        # Always present, so every result has the same set of keys.
        'pages': num_of_pages,
    }


def get_pdf_content(filename):
    """Extract the text of all pages of a PDF with pdftotext.

    Args:
        filename: Path of the PDF file.

    Returns:
        The text of all pages joined into one string, or '' when the
        extracted text consists of whitespace only.
    """
    with open(filename, "rb") as f:
        pdf = pdftotext.PDF(f)
        text = ''.join(pdf)  # Read all the text into one string

    # A PDF without text can still yield form feeds ('\x0c'), e.g. one per
    # page; str.strip() treats them as whitespace, so this covers both the
    # single-page and the multi-page case.
    if not text.strip():
        return ''
    return text


def get_pdf_content_lines(file):
    """Extract the text of a PDF with PyPDF2 as a list of lines.

    Args:
        file: Path of the PDF file.

    Returns:
        The lines of all pages in page order; line endings are kept.
    """
    collected = []
    with open(file, 'rb') as f:
        reader = PyPDF2.PdfFileReader(f)
        for page in reader.pages:
            collected.extend(page.extractText().splitlines(keepends=True))
    return collected


def get_file_hash(filename):
    """Return the SHA-256 hex digest of the content of *filename*.

    Args:
        filename: Path of the file to hash.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(filename, "rb") as f:
        # Feed the hash in chunks so large files are never loaded in full.
        for chunk in iter(lambda: f.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()


def process_file(file):
    """Hash *file* and extract its text lines and metadata.

    PDF files (by guessed MIME type), Word documents (by MIME type or
    '.docx' extension) and '.txt' files are supported.

    Args:
        file: Path of the file to process.

    Returns:
        A tuple ``(file_hash, text, metadata)``: the SHA-256 hex digest, the
        text as a list of lines (empty for an image-only PDF, because OCR is
        skipped in this proof-of-concept) and the metadata dict (empty for
        non-PDF files).

    Raises:
        SystemExit: With code 1 if the document type is not supported.
    """
    file_hash = get_file_hash(file)

    print('File info for file {0}:'.format(file))
    file_info = get_file_info(file)

    filetype = file_info['filetype']  # None when the type cannot be guessed
    extension = file_info['extension'].lower()

    # Compare the extension with the ones registered for the MIME type rather
    # than with characters of the MIME string, which rarely ends in the extension.
    if filetype is not None and extension not in mimetypes.guess_all_extensions(filetype):
        print('Warning! Extension and mimetype do not match!')

    if filetype == 'application/pdf':
        print('Checking PDF file for text content...')
        metadata = get_pdf_info(file)

        # 'Nur Bild' is the label get_pdf_info assigns to PDFs without text.
        if metadata['pdftype'] == 'Nur Bild':
            print('PDF file contains image only. Stand by for OCR...')
            print('Skipping OCR for this proof-of-concept.')
            text = []
        else:
            print('PDF file contains text.')
            text = get_pdf_content_lines(file)

    elif filetype == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' or \
            extension == '.docx':
        print('MS Word document. Extract info and text...')
        text = doc_text_extractor(file).splitlines(keepends=True)
        metadata = {}

    elif extension == '.txt':
        print('text document. Extract text...')
        text = get_file_content(file).splitlines(keepends=True)
        metadata = {}

    else:
        print('Not supported document type. Giving up.')
        # Same effect as exit(1), without relying on the site-provided builtin.
        raise SystemExit(1)

    return file_hash, text, metadata


def get_text_diff(text1, text2):
    """Compare two sequences of lines with ``difflib.Differ``.

    The resulting delta is printed to stdout as well.

    Args:
        text1: First sequence of lines.
        text2: Second sequence of lines.

    Returns:
        The Differ delta as a list of strings.
    """
    delta = [entry for entry in dl.Differ().compare(text1, text2)]
    print(delta)
    return delta


def get_unified_diff(list1, list2):
    """Return the unified diff of two sequences as a list of lines.

    Every item is converted with ``str()`` and gets a newline appended before
    the comparison, so non-string items such as ``None`` or numbers are
    accepted. The input sequences are left unmodified.

    Args:
        list1: First sequence of items.
        list2: Second sequence of items.

    Returns:
        The lines produced by ``difflib.unified_diff`` (two header lines,
        then the hunks); an empty list if both sequences are equal.
    """
    # Work on copies: callers may still need their lists unchanged.
    lines1 = [str(item) + '\n' for item in list1]
    lines2 = [str(item) + '\n' for item in list2]
    return list(dl.unified_diff(lines1, lines2))


if __name__ == '__main__':

    files = ['Test 1.pdf', 'Test 2.pdf', 'Test 1.docx', 'Test 2.docx']

    # Files are compared pairwise: (files[0], files[1]), (files[2], files[3]), ...
    # A trailing file without a partner is ignored.
    for pair_no, (file1, file2) in enumerate(zip(files[::2], files[1::2]), start=1):
        print('Processing pair {0} of files.'.format(pair_no))
        hash1, text1, metadata1 = process_file(file1)
        hash2, text2, metadata2 = process_file(file2)
        if hash1 == hash2:
            print('Files are binary identical.')
        else:
            print('Files are binary different.')

        # Get diff for text content
        print(text1)
        print(text2)
        diff = get_unified_diff(text1, text2)
        print(diff)
        # The first three lines ('---', '+++' and the first '@@' header) are
        # skipped, because the two file headers also start with '-' and '+'.
        diffsum = sum(1 for e in diff[3:] if e.startswith(('+', '-')))
        if diffsum == 0:
            print('No difference between text contents of file 1 and 2.')
        else:
            print('Text contents of file 1 and 2 are different.')

        # Get diff for file metadata
        if not metadata1 and not metadata2:
            print('No metadata in files.')
        else:
            # dict() accepts a mapping as well as an empty list, and str()
            # makes None or numeric values comparable as diff lines.
            values1 = [str(value) for value in dict(metadata1).values()]
            values2 = [str(value) for value in dict(metadata2).values()]
            difflist = get_unified_diff(values1, values2)
            print(metadata1)
            print(metadata2)
            print(difflist)
            diffsum = sum(1 for e in difflist[3:] if e.startswith(('+', '-')))
            if diffsum == 0:
                print('No difference between metadata of file 1 and 2.')
            else:
                print('Metadata of file 1 and 2 has {0} differences.'.format(diffsum))

        print('##########')
