/*
 * File Name: text_model.cpp
 */

/*
 * This file is part of uds-plugin-plaintext.
 *
 * uds-plugin-plaintext is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * uds-plugin-plaintext is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Copyright (C) 2008 iRex Technologies B.V.
 * All rights reserved.
 */

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <glib.h>
#include "text_model.h"
#include "utils.h"
#include "log.h"
#include "nsUniversalDetectorImpl.h"

namespace text
{

/// Size, in bytes, of the sample that detect_encoding() reads from the file
/// and feeds to the charset detector.
/// A short sample may leave the detector without a result; detect_encoding()
/// then falls back to DEFAULT_ENCODING.
static const int SAMPLE_LEN = 1024 * 4;
/// Size, in bytes, of the chunks in which the file is read from disk.
static const int BLOCK_SIZE = 4096;
/// Encoding of the in-memory text; files detected as this encoding are read
/// without conversion, everything else is converted to it with iconv.
static const std::string TARGET_CODESET = "utf-8";

/// Construct an empty model: no file attached, no encoding chosen and
/// nothing opened yet.
TextModel::TextModel()
    : file_p(NULL),
      encoding(),
      path(),
      b_open(false),
      aborting_search_task_id(0),
      incomplete_line(false)
{
}

/// Release the document if one is still open.
TextModel::~TextModel()
{
    if (!b_open)
    {
        // Nothing was opened, so there is nothing to release.
        return;
    }

    close();
}

/// Open a document without specifying its encoding.
/// Forwards to the two-argument overload with an empty encoding string.
PluginStatus TextModel::open(const std::string& doc_path)
{
    const std::string unspecified_encoding;
    return open(doc_path, unspecified_encoding);
}

/// Open a document and load its paragraphs into memory.
///
/// @param doc_path  Path of the file to open.
/// @param encoding  Encoding of the file. When empty, the encoding is
///                  detected from the file content.
/// @return PLUGIN_OK on success, PLUGIN_FAIL if the file cannot be opened,
///         or whatever error read_text() reports (the model is closed again
///         in that case).
PluginStatus TextModel::open(const std::string& doc_path,
                             const std::string& encoding)
{
    // Take a private copy first: the parameter shadows the member of the same
    // name and may even alias it, and close() below clears that member.
    const std::string requested_encoding(encoding);

    // The document was opened already, close it first.
    if (b_open)
    {
        close();
    }

    // Try to open specified file.
    file_p = fopen(doc_path.c_str(), "r");
    if (file_p == NULL)
    {
        return PLUGIN_FAIL;
    }

    if (requested_encoding.empty())
    {
        // No encoding given by the caller, detect it from the content.
        detect_encoding();
    }
    else
    {
        // Honour the encoding requested by the caller.
        this->encoding = requested_encoding;
    }

    // Update document information.
    path = doc_path;
    b_open = true;

    // Build up paragraphs.
    const PluginStatus result = read_text();
    if (result != PLUGIN_OK)
    {
        close();
        return result;
    }

    if (doc.empty())
    {
        // Keep at least one paragraph so the document is never empty.
        doc.push_back(Paragraph(0, new std::string(" ")));
    }

    return result;
}

/// Release the paragraphs and the underlying file, and reset the model to
/// the "not open" state. Safe to call when no file is open.
void TextModel::close()
{
    clear();

    // fclose(NULL) is undefined behaviour, so only close a real handle.
    if (file_p != NULL)
    {
        fclose(file_p);
        file_p = NULL;
    }

    b_open = false;
    encoding.clear();
}

/// Before using this function, make sure the file is already opened.
void TextModel::detect_encoding()
{
    char buf[SAMPLE_LEN];
    size_t bytes_read = fread(buf, 1, SAMPLE_LEN, file_p);

    nsUniversalDetectorImpl charset_detector;
    charset_detector.HandleData(buf, static_cast<unsigned int>(bytes_read));
    charset_detector.DataEnd();
    encoding = charset_detector.get_enc();

    if (encoding.empty())
    {
        // Fall back to ISO8859-1.
        ERRORPRINTF("Can't detect encodings, fall back to ISO8859-1.");
        encoding = DEFAULT_ENCODING;
    }
}

void TextModel::clear()
{
    // Clear current paragraphs
    for (TextDocumentIter it = doc.begin(); it != doc.end(); ++it)
    {
        delete (*it).text;
    }
    doc.clear();
}

/// (Re)load all paragraphs from the opened file, converting the content to
/// the target codeset when the file is stored in another encoding.
PluginStatus TextModel::read_text()
{
    assert(b_open);

    clear();

    // Files that are already in the target codeset are read verbatim,
    // everything else goes through the converter.
    const bool needs_conversion = !(encoding == TARGET_CODESET);
    return needs_conversion ? read_non_utf8_text() : read_utf8_text();
}

/// Read the whole file block by block, without any conversion, and split it
/// into paragraphs.
///
/// @return PLUGIN_OK when the file was read up to its end, PLUGIN_FAIL when
///         a read error occurred.
PluginStatus TextModel::read_utf8_text()
{
    char buf[BLOCK_SIZE];
    size_t bytes_read = 0;

    fseek(file_p, 0, SEEK_SET);
    // Drop any stale error flag so that ferror() below reflects this pass.
    clearerr(file_p);

    // Loop on the fread() result rather than on feof(): after a read error
    // feof() never becomes true and the loop would spin forever.
    while ((bytes_read = fread(buf, 1, sizeof(buf), file_p)) > 0)
    {
        save_block_with_paragraphs(buf, bytes_read);
    }

    return ferror(file_p) ? PLUGIN_FAIL : PLUGIN_OK;
}

/// Convert as much of the input buffer as possible with iconv.
///
/// Input bytes that iconv cannot convert are replaced by a blank in the
/// output. Conversion stops when at most 3 input bytes are left (they may be
/// the head of a multi-byte character whose tail has not been read yet) or
/// when the output buffer has no room for more data.
///
/// @param cd              Conversion descriptor obtained from iconv_open().
/// @param in_buf          In/out: current read position in the input.
/// @param in_bytes_left   In/out: number of input bytes not yet consumed.
/// @param out_buf         In/out: current write position in the output.
/// @param out_bytes_left  In/out: free space left in the output, in bytes.
void TextModel::convert(iconv_t cd, char **in_buf, size_t *in_bytes_left, char **out_buf, size_t *out_bytes_left)
{
    while (*in_bytes_left > 3)
    {
        const size_t bytes_to_be_converted = *in_bytes_left;
#ifdef WIN32
        const size_t rc = iconv(cd, const_cast<const char **>(in_buf), in_bytes_left, out_buf, out_bytes_left);
#else
        const size_t rc = iconv(cd, in_buf, in_bytes_left, out_buf, out_bytes_left);
#endif

        if (rc == static_cast<size_t>(-1) && errno == E2BIG)
        {
            // The output buffer is full. This is not an input error, so do
            // not blank anything; leave the rest of the input to the caller.
            break;
        }

        if (*in_bytes_left == bytes_to_be_converted)
        {
            if (*out_bytes_left == 0)
            {
                // No room left for the replacement character.
                break;
            }

            // 0 bytes converted, maybe we have incorrect characters at start
            // Replace it with blank.
            ++(*in_buf);
            --(*in_bytes_left);
            *(*out_buf)++ = ' ';
            --(*out_bytes_left);
        }
    }
}

/// Read the whole file block by block, convert every block from `encoding`
/// to the target codeset and split the converted text into paragraphs.
///
/// @return PLUGIN_OK on success, PLUGIN_UNSUPPORTED_ENCODING when iconv has
///         no converter for `encoding`, PLUGIN_FAIL when a read error
///         occurred.
PluginStatus TextModel::read_non_utf8_text()
{
    // Read content from disk file
    char in_buf[BLOCK_SIZE];
    char out_buf[3*BLOCK_SIZE];

    iconv_t conv = iconv_open(TARGET_CODESET.c_str(), encoding.c_str());
    if (conv == (iconv_t)(-1))
    {
        return PLUGIN_UNSUPPORTED_ENCODING;
    }

    // Number of bytes at the start of in_buf carried over from the previous
    // block because they could not be converted yet.
    size_t partial_chars = 0;

    fseek(file_p, 0, SEEK_SET);
    // Drop any stale error flag so that ferror() below reflects this pass.
    clearerr(file_p);

    for (;;)
    {
        // Read 1 block from disk file, right behind the carried-over bytes.
        // Stop on a zero-length read instead of testing feof(): after a read
        // error feof() never becomes true.
        const size_t bytes_read = fread(in_buf+partial_chars, 1, BLOCK_SIZE-partial_chars, file_p);
        if (bytes_read == 0)
        {
            break;
        }

        char *in_p  = in_buf;
        char *out_p = out_buf;

        // The number of bytes to be converted equals to the bytes read plus
        // the bytes unconverted since last conversion.
        size_t in_bytes_left  = bytes_read + partial_chars;
        size_t out_bytes_left = sizeof(out_buf);

        // Do conversion, use wrapper instead of using iconv directly.
        convert(conv, &in_p, &in_bytes_left, &out_p, &out_bytes_left);

        // Put converted string to paragraph list
        save_block_with_paragraphs(out_buf, sizeof(out_buf)-out_bytes_left);

        // Move the unconverted tail to the front of the buffer. in_p is
        // where the converter stopped, which is the right source even when
        // fewer than BLOCK_SIZE bytes were read. The regions may overlap.
        partial_chars = in_bytes_left;
        if (partial_chars > 0)
        {
            memmove(in_buf, in_p, partial_chars);
        }
    }

    // convert() never touches the last (up to 3) bytes because they might be
    // an incomplete character. At the end of the file nothing more will
    // arrive, so convert them now instead of dropping them.
    if (partial_chars > 0)
    {
        char *in_p  = in_buf;
        char *out_p = out_buf;
        size_t in_bytes_left  = partial_chars;
        size_t out_bytes_left = sizeof(out_buf);

        while (in_bytes_left > 0 && out_bytes_left > 0)
        {
            const size_t bytes_to_be_converted = in_bytes_left;
#ifdef WIN32
            iconv(conv, const_cast<const char **>(&in_p), &in_bytes_left, &out_p, &out_bytes_left);
#else
            iconv(conv, &in_p, &in_bytes_left, &out_p, &out_bytes_left);
#endif
            if (in_bytes_left == bytes_to_be_converted)
            {
                // Invalid or truncated character: replace one byte by a
                // blank, the same policy convert() applies.
                ++in_p;
                --in_bytes_left;
                *out_p++ = ' ';
                --out_bytes_left;
            }
        }

        save_block_with_paragraphs(out_buf, sizeof(out_buf)-out_bytes_left);
    }

    const bool read_failed = (ferror(file_p) != 0);
    iconv_close(conv);
    return read_failed ? PLUGIN_FAIL : PLUGIN_OK;
}

/*
void TextModel::read_non_utf8_text()
{
    // Read content from disk file
    char in_buf[BLOCK_SIZE];

    IConvEncodingConverter conv(enc);
    fseek(file_p, 0, SEEK_SET);
    while (!feof(file_p))
    {
        // Read 1 block from disk file
        size_t bytes_read = fread(in_buf, 1, BLOCK_SIZE, file_p);
        std::string str = "";
        conv.convert(str, in_buf, in_buf + bytes_read);
        save_block_with_paragraphs(str.c_str(), str.size());
    }
}
*/

void TextModel::save_block_with_paragraphs(const char *blk, size_t blk_size)
{
    if (doc.empty())
    {
        // We are saving the first block
        incomplete_line = false;
    }

    const char *end_p = blk + blk_size; // end_p points to '\0'
    const char *p     = blk;

    while (p < end_p)
    {
        gchar* find_p = g_utf8_strchr(p, static_cast<gssize>(end_p - p), '\n');
        if (find_p != NULL)
        {
            // We find a new paragraph, append it to the paragraph list
            if (incomplete_line)
            {
                // We have a incomplete line since last read, so this time we
                // must append the string to the last paragraph
                doc.back().text->append(p, find_p - p + 1);
            }
            else
            {
                // Just create a new string and append it to the end of the
                // paragraph list
                size_t start_file_pos = 0;
                if (doc.size() > 0)
                {
                    start_file_pos = doc.back().start_file_pos + doc.back().text->size();
                }
                doc.push_back(Paragraph(start_file_pos, new std::string(p, find_p - p + 1)));
            }
            
            p = find_p + 1;
            incomplete_line = false;
        }
        else
        {
            // Can't find paragraph
            size_t start_file_pos = 0;
            if (doc.size() > 0)
            {
                start_file_pos = doc.back().start_file_pos + doc.back().text->size();
            }
            doc.push_back(Paragraph(start_file_pos, new std::string(p, end_p - p)));

            // We have an incomplete paragraph, mark it
            incomplete_line = true;
            break;
        }
    }
}

/// Search for the pattern inside a single paragraph, the one addressed by
/// sc->from.paragraph.
///
/// Matches are appended to result_ranges as Range(start, end), where start is
/// the byte offset of the match in the paragraph and end is the byte offset
/// of the last character of the match.
///
/// sc->from is updated in place when the paragraph holds no (further) match,
/// so that it addresses the next paragraph (forward) or the end of the
/// previous paragraph (backward).
///
/// @param result_ranges  Receives the ranges of the matches found.
/// @param sc             Search criteria and current search position.
/// @return true when the search is finished: a match was found (forward
///         SEARCH_NEXT, or any backward search), or the last/first paragraph
///         has been searched. false when sc->from has been moved to another
///         paragraph that still has to be searched; presumably the caller
///         then calls search() again.
///
/// NOTE(review): sc->from.paragraph is used to index doc without a bounds
/// check, and with an empty pattern pattern_len is 0, so the forward loop does
/// not advance p after a hit. Confirm that callers rule out both cases.
bool TextModel::search(std::vector<Range>& result_ranges, SearchContext* sc)
{
    // Exact search type and search criteria from search context.
    SearchType search_type      = sc->search_type;
    Position   &from            = sc->from;   // reference: updates are visible to the caller
    const char *pattern         = sc->pattern.c_str();
    bool       case_sensitive   = sc->case_sensitive;
    bool       forward          = sc->forward;
    bool       match_whole_word = sc->match_whole_word;

    size_t pattern_len = strlen(pattern);
    const char *paragraph_head = doc[from.paragraph].text->c_str();

    if (forward)
    {
        // Forward search starts at the current offset inside the paragraph.
        const char *p = paragraph_head + from.offset;
        while (true)
        {
            const char* find = utf8_strstr(p, pattern, case_sensitive);
            if (find)
            {
                // See if matching whole word.
                if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
                {
                    // Pattern found.
                    Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
                    // The range end addresses the last character of the match.
                    const char* last_char = g_utf8_prev_char(find + pattern_len);
                    Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
                    result_ranges.push_back(Range(start, end));
                    if (search_type == SEARCH_NEXT)
                    {
                        // Search complete.
                        return true;
                    }

                    // If SEARCH_ALL we must continue with current paragraph.
                }

                // Continue behind this hit (also when it was rejected by the
                // whole-word check).
                p = find + pattern_len;
            }
            else
            {
                // Can't find any match in current paragraph.
                // Move on to the start of the next paragraph; the search is
                // finished once we stepped past the last paragraph.
                from.offset = 0;
                return ++(from.paragraph) == doc.size();
            }
        }
    }
    else
    {
        // Backward search.
        // Only the first `len` bytes of the paragraph are handed to the
        // reverse search helper.
        int len = static_cast<int>(from.offset);
        while (true)
        {
            const char *find = utf8_strrstr(paragraph_head, len, pattern, case_sensitive);
            if (find)
            {
                // See if matching whole word.
                if (!match_whole_word || is_whole_word(paragraph_head, find, pattern_len))
                {
                    // Pattern found.
                    Position start(from.paragraph, static_cast<unsigned int>(find - paragraph_head));
                    // The range end addresses the last character of the match.
                    const char* last_char = g_utf8_prev_char(find + pattern_len);
                    Position end(from.paragraph, static_cast<unsigned int>(last_char - paragraph_head));
                    result_ranges.push_back(Range(start, end));
                    return true;
                }

                // Hit rejected by the whole-word check: retry in the text
                // that precedes it.
                len = static_cast<int>(find - paragraph_head);
            }
            else
            {
                // Can't find any match in current paragraph.
                if (from.paragraph == 0)
                {
                    // Already at the first paragraph, the search is finished.
                    return true;
                }
                else
                {
                    // Continue at the end of the previous paragraph.
                    from.paragraph--;
                    from.offset = static_cast<unsigned int>(doc[from.paragraph].text->size());
                    return false;
                }
            }
        }
    }
}

/// Tell whether a position addresses an existing byte of the document, i.e.
/// an existing paragraph and an offset inside the text of that paragraph.
bool TextModel::has_anchor(const Position &pos)
{
    // The offset is only looked at when the paragraph index is valid.
    return pos.paragraph < doc.size()
        && pos.offset < doc[pos.paragraph].text->size();
}

/// Translate a position inside the document into a linear position.
///
/// @param file_pos  Receives the start position of the paragraph plus the
///                  offset inside it; left untouched on failure.
/// @param pos       The position to translate.
/// @return false when pos does not address an existing byte, true otherwise.
bool TextModel::get_file_pos_from_anchor(size_t& file_pos, const Position &pos)
{
    if (pos.paragraph < doc.size())
    {
        if (pos.offset < doc[pos.paragraph].text->size())
        {
            file_pos = doc[pos.paragraph].start_file_pos + pos.offset;
            return true;
        }
    }

    // Either the paragraph or the offset is out of range.
    return false;
}

bool TextModel::is_seperator(const char* p)
{
    gunichar ch = g_utf8_get_char(p);

    if (g_unichar_isspace(ch))
    {
        return true; 
    }

    if (g_unichar_ispunct(ch))
    {
        // Punctuation.
        if (*p != '\'' && *p != '\"')
        {
            return true;
        }
    }

    return false;
}

/// Find the word that contains the character at a given position.
///
/// @param pos             Position of a character inside the document.
/// @param word_start_pos  Receives the position of the first character of
///                        the word.
/// @param word_end_pos    Receives the position of the last character of the
///                        word (inclusive).
/// @return true when a word was found. false when pos is outside the
///         document or addresses a separator; both output positions are then
///         set to pos.
bool TextModel::get_word_from_anchor(const Position& pos,
                                     Position& word_start_pos,
                                     Position& word_end_pos)
{
    word_start_pos.paragraph = word_end_pos.paragraph = pos.paragraph;

    // Sanity check, never index outside the document or the paragraph.
    if (pos.paragraph >= doc.size() || pos.offset >= doc[pos.paragraph].text->size())
    {
        word_start_pos.offset = word_end_pos.offset = pos.offset;
        return false;
    }

    const char* paragraph = doc[pos.paragraph].text->c_str();
    const char* anchor = paragraph + pos.offset;

    // Check if the character at pos is a seperator.
    if (is_seperator(anchor))
    {
        // Then there is no word at pos.
        word_start_pos.offset = word_end_pos.offset = pos.offset;
        return false;
    }

    // Walk backwards as long as the preceding character belongs to the word.
    // Looking at the preceding character (instead of the current one) also
    // handles a seperator sitting at the very start of the paragraph.
    const char* first = anchor;
    while (first > paragraph)
    {
        const char* prev = g_utf8_prev_char(first);
        if (is_seperator(prev))
        {
            break;
        }
        first = prev;
    }
    word_start_pos.offset = static_cast<int>(first - paragraph);

    // Walk forwards as long as the following character belongs to the word.
    // `last` always addresses a character of the word, so the end position
    // is inclusive even when the word runs up to the end of the paragraph.
    const char* last = anchor;
    for (const char* next = g_utf8_next_char(last);
         *next != 0 && !is_seperator(next);
         next = g_utf8_next_char(next))
    {
        last = next;
    }
    word_end_pos.offset = static_cast<int>(last - paragraph);

    return true;
}

bool TextModel::get_words_from_range(const Position& range_start,
                                     const Position& range_end,
                                     Position& words_start,
                                     Position& words_end)
{
    if (range_end < range_start)
    {
        ERRORPRINTF("Invalid range, range_start = %s, range_end = %s",
            range_start.to_string().c_str(),
            range_end.to_string().c_str());
        return false;
    }

    Position tmp;

    // Get the object range the range_start anchor points to.
    get_word_from_anchor(range_start, words_start, tmp);

    // Get the object range the range_end anchor points to.
    get_word_from_anchor(range_end, tmp, words_end);

    // Strip any leading seperators.
    const char* start_paragraph = 0;
    const char* p = 0;
    while (true)
    {
        start_paragraph = doc[words_start.paragraph].text->c_str();
        for (p = start_paragraph + words_start.offset; *p != 0; p = g_utf8_next_char(p))
        {
            if (!is_seperator(p))
            {
                break;
            }
        }

        if (*p == 0)
        {
            words_start.paragraph++;
            words_start.offset = 0;
        }
        else
        {
            break;
        }
    }
    words_start.offset = static_cast<int>(p - start_paragraph);

    // Strip any trailing seperators.
    const char* end_paragraph = doc[words_end.paragraph].text->c_str();
    for (p = end_paragraph + words_end.offset; p > end_paragraph; p = g_utf8_prev_char(p))
    {
        if (!is_seperator(p))
        {
            break;
        }
    }
    words_end.offset = static_cast<int>(p - end_paragraph);

    return words_end >= words_start;
}

/// Append the text between two positions (both inclusive) to a string.
///
/// Paragraphs beyond the end of the document are ignored and offsets beyond
/// the end of a paragraph are clamped to the paragraph length, so the
/// function never reads outside the stored text.
///
/// @param result     The text of the range is appended to this string.
/// @param start_pos  Position of the first character of the range.
/// @param end_pos    Position of the last character of the range.
/// @return always true.
bool TextModel::get_text_from_range(std::string& result,
                                    const Position& start_pos,
                                    const Position& end_pos)
{
    const unsigned int start_paragraph = start_pos.paragraph;
    const unsigned int end_paragraph = end_pos.paragraph;

    for (unsigned int i = start_paragraph;
            (i <= end_paragraph) && (i < doc.size());
            i++)
    {
        if (!doc[i].text)
        {
            continue;
        }

        const char* const text = doc[i].text->c_str();
        const size_t text_len = doc[i].text->length();

        // First byte to copy: the start offset in the first paragraph, the
        // paragraph head in all others.
        size_t first = 0;
        if (i == start_paragraph)
        {
            first = static_cast<size_t>(start_pos.offset);
            if (first > text_len)
            {
                first = text_len;
            }
        }

        // One past the last byte to copy: the end of the character at the
        // end offset in the last paragraph, the paragraph end in all others.
        size_t stop = text_len;
        if (i == end_paragraph && static_cast<size_t>(end_pos.offset) < text_len)
        {
            const char* last_char = text + end_pos.offset;
            stop = static_cast<size_t>(g_utf8_next_char(last_char) - text);
            if (stop > text_len)
            {
                // Truncated multi-byte character, do not copy the terminator.
                stop = text_len;
            }
        }

        // An end before the start yields nothing instead of a huge length.
        if (stop > first)
        {
            result.append(text + first, stop - first);
        }
    }

    return true;
}

/// Write the text of all paragraphs to "<path>.converted".
/// Nothing happens when that file cannot be created.
void TextModel::dump()
{
    const std::string target = path + ".converted";

    FILE* out = fopen(target.c_str(), "w");
    if (out == NULL)
    {
        return;
    }

    // Paragraphs are written back to back, in document order.
    for (unsigned int idx = 0; idx < doc.size(); ++idx)
    {
        fputs(doc[idx].text->c_str(), out);
    }

    fclose(out);
}

}
