#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
#
# This script tries to clean up text files so that they are more suitable
# for processing with LaTeX.
#
# Perttu Haimi 2009
#
# The author or authors of this code dedicate any and all copyright 
# interest in this code to the public domain. We make this dedication 
# for the benefit of the public at large and to the detriment of our 
# heirs and successors. We intend this dedication to be an overt act of 
# relinquishment in perpetuity of all present and future rights to this 
# code under copyright law.


import sys
import getopt
import fileinput


# finds the most common line length
def mode_length(cnts):
    """Return the most frequent line length recorded in cnts.

    cnts maps a line length to the number of lines having that length
    (positive occurrence counts).  When several lengths share the highest
    count, the one met first while iterating over the dict is returned.
    Returns 0 when cnts is empty.
    """
    if not cnts:
        return 0

    # Iterating the dict directly (instead of the Python 2 only
    # iteritems()) keeps this working on Python 2 and Python 3.
    return max(cnts, key=cnts.get)

def quartiles_length(cnts):
    """Return (q_25, median, q_75) of the distinct line lengths in cnts.

    cnts maps a line length to its occurrence count.  Only the keys are
    used: the three values are picked by position from the sorted list of
    distinct lengths and are not weighted by how often a length occurs.
    Returns (0, 0, 0) when cnts is empty.
    """
    lengths = sorted(cnts)
    n = len(lengths)
    if n == 0:
        # Nothing to index into; avoid an IndexError for files that have
        # no long lines at all.
        return (0, 0, 0)

    # Floor division keeps the indices integral on Python 2 and 3 alike.
    # For the upper quartile multiply before dividing, otherwise the
    # position is truncated too early (e.g. 7 // 4 * 3 == 3, the median).
    return (lengths[n // 4], lengths[n // 2], lengths[3 * n // 4])

def process_chapterformatted_file(fn):
    """Print file fn treating every input line as a complete paragraph.

    Trailing whitespace is stripped from each line and the line is written
    to standard output followed by an empty line.
    """
    try:
        for line in fileinput.input([fn]):
            # print() terminates the line; the explicit "\n" adds the
            # blank separator line after it.
            print(line.rstrip() + "\n")
    finally:
        # Release the module-level fileinput state even if reading fails,
        # so that a later fileinput.input() call is not rejected as
        # "already active".
        fileinput.close()

def process_lineformatted_file(fn, mode, dashfreq):
    """Re-flow the hard-wrapped text file fn into one line per paragraph.

    Consecutive input lines are joined into a single paragraph, and each
    finished paragraph is printed to standard output followed by an empty
    line.  The text still pending at end of file is printed as is.

    fn       -- path of the file to read
    mode     -- the typical (most common) length of a full text line; lines
                close to this length are taken to continue the paragraph
    dashfreq -- fraction of lines ending in '-'; above 0.05 a trailing dash
                is treated as hyphenation and removed when joining
    """
    current = ""     # paragraph text collected so far
    mustend = False  # the previous line was short, so its paragraph is over

    try:
        for line in fileinput.input([fn]):
            line = line.rstrip()
            length = len(line)

            # An indented line that follows sentence-ending punctuation
            # is the starting line of a new paragraph.
            pending = current.rstrip()
            starts_paragraph = (length > 0 and line[0] in "\t "
                                and pending.endswith((".", "!", "?")))
            if mustend or starts_paragraph:
                print(pending + "\n")
                current = ""

            # A line counts as a full paragraph line when it is within 10
            # characters of the typical length, or within 20 and does not
            # end a sentence.
            is_long = length > 0 and (
                length > mode - 10
                or (length > mode - 20 and line[-1] not in ".!?"))

            if is_long:
                if line[-1] == '-':
                    if dashfreq > 0.05:   # probably hyphenated
                        line = line[:-1]  # -> remove the last dash
                    # No separating blank: the next line continues the word.
                    current += line
                else:
                    current += line + " "
                mustend = False
            else:                         # short last line of paragraph
                current += line + " "
                mustend = True
    finally:
        # Release the module-level fileinput state even if reading fails.
        fileinput.close()

    print(current)


# Remove the script name, so that sys.argv holds only the command-line
# arguments from here on.
progname = sys.argv.pop(0)


def usage():
    """Print a one-line usage summary to standard output."""
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Usage: %s [-h ] file' % progname)

def _line_statistics(fn):
    """Scan file fn once and collect the figures used to classify it.

    Returns a tuple (cnts, linecount, dashcount):
      cnts      -- dict mapping line length -> number of lines of that
                   length, counting only lines longer than 50 characters
      linecount -- total number of lines read
      dashcount -- number of lines whose last non-blank character is '-'
    """
    cnts = {}
    linecount = 0
    dashcount = 0
    try:
        for line in fileinput.input([fn]):
            # Work on the line without its terminator: the raw line
            # normally ends in a newline, which would hide a trailing dash.
            line = line.rstrip()
            length = len(line)
            if length > 50:  # consider only longish lines
                cnts[length] = cnts.get(length, 0) + 1

            if line.endswith('-'):
                dashcount += 1

            linecount += 1
    finally:
        # Release the module-level fileinput state even if reading fails.
        fileinput.close()

    return cnts, linecount, dashcount


def main(argv):
    """Clean up every file named in argv, printing the result to stdout.

    argv is the argument list without the script name.  If the options
    cannot be parsed, the usage line is printed and the process exits with
    status 2.
    """
    try:
        # TODO: implement histogram output from the command line; "-h ARG"
        # is accepted but its argument is currently ignored.
        _opts, files = getopt.gnu_getopt(argv, 'h:')
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for fn in files:
        cnts, linecount, dashcount = _line_statistics(fn)

        mode = mode_length(cnts)
        # An empty file has no lines; avoid dividing by zero.
        dashfreq = dashcount / float(linecount) if linecount else 0.0

        # Without any long line there is nothing to take quartiles of; a
        # median of 0 routes the file to the line-formatted path.
        median = quartiles_length(cnts)[1] if cnts else 0

        if median < 120:
            process_lineformatted_file(fn, mode, dashfreq)
        else:
            process_chapterformatted_file(fn)


if __name__ == "__main__":
    # The script name has already been popped off sys.argv further up.
    main(sys.argv)


