#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import getopt
import io
import os
import re
import sys

def main():
	"""Command-line entry point: parse options, clean the text, write the result.

	The file named by -i/--input is read as UTF-8 (falling back to Latin-1 when
	it is not valid UTF-8), split into paragraphs by getTome() and run through
	the passes selected on the command line.  The result is written as UTF-8
	to "output.txt", or to FILE when --output=FILE is given.

	Always terminates through sys.exit(): status 2 on a command-line error,
	otherwise the default status.
	"""
	print("pacify v0.2 - Copyright 2009 Pax Librorum (www.PaxLibrorum.com)")
	print("")
	try:
		# "output" used to be declared without "=" and without a handler, so
		# passing it always ended in the "unhandled option" assertion.
		opts, args = getopt.getopt(sys.argv[1:], "hP:i:vprcql", ["help", "profile=", "input=", "output=", "verbose"])
	except getopt.GetoptError as err:
		# print help information and exit:
		print(str(err))  # will print something like "option -a not recognized"
		usage()
		sys.exit(2)
	infilename = None
	outfilename = 'output.txt'

	optParfix = False
	optRtrim = False
	optCharsub = False
	optQuotefix = False
	optLatex = False
	# latexTome() reads the profile through this module-level name.
	global profile
	profile = 'en'

	for o, a in opts:
		if o in ("-v", "--verbose"):
			# Accepted so the flag does not abort the run; nothing in this
			# function produces extra output for it.
			pass
		elif o == "-p":
			optParfix = True
		elif o == "-r":
			optRtrim = True
		elif o == "-c":
			optCharsub = True
		elif o == "-q":
			optQuotefix = True
		elif o == "-l":
			optLatex = True
		elif o in ("-h", "--help"):
			usage()
			sys.exit()
		elif o in ("-P", "--profile"):
			profile = a
			if profile not in ("en", "hu"):
				print('profile must either be "en" or "hu"')
				print("")
				usage()
				sys.exit(2)
		elif o in ("-i", "--input"):
			infilename = a
		elif o == "--output":
			outfilename = a
		else:
			# Not an assert statement: those are stripped under -O.
			raise AssertionError("unhandled option")

	if infilename is None:
		usage()
		sys.exit()

	try:
		with codecs.open(infilename, 'r', 'utf-8') as infile:
			inbuffer = infile.read()
	except UnicodeDecodeError:
		# Not UTF-8: decode as Latin-1, where every byte is valid and byte 176
		# is the degree sign the previous fallback special-cased.  The old
		# fallback crashed on any other non-ASCII byte.
		with open(infilename, 'rb') as infile:
			inbuffer = infile.read().decode('latin-1')

	if optRtrim:
		print("Right-trimming lines... ")
		print("")
		# Each pass removes one trailing space per line, so repeat until done.
		while " \n" in inbuffer:
			inbuffer = inbuffer.replace(" \n", "\n")
		while " \r" in inbuffer:
			inbuffer = inbuffer.replace(" \r", "\r")

	theTome = getTome(inbuffer)

	if optParfix:
		theTome = parfixTome(theTome)

	if optCharsub:
		theTome = miscfixTome(theTome)

	if optQuotefix:
		# Single quotes first: quoteTome() emits `` and '' pairs, and running
		# squoteTome() afterwards rewrote those apostrophes (e.g. '' -> `').
		theTome = squoteTome(theTome)
		theTome = quoteTome(theTome)

	if profile == 'hu':
		theTome = hungfixTome(theTome)

	if optLatex:
		theTome = latexTome(theTome)

	print("Writing to " + outfilename + " ...")
	print("")
	# Text mode keeps the platform newline translation of the plain open()
	# used before; theTome[0] is the configuration entry, not text.
	with io.open(outfilename, 'w', encoding='utf-8') as outfile:
		for line in theTome[1:]:
			outfile.write(line + u"\n")
	print("Done!")
	print("")

	sys.exit()

def hungfixTome(theTome):
	"""Replace tilde/circumflex vowels with the Hungarian double-acute ones.

	theTome is a list whose first element is a configuration entry and whose
	remaining elements are text lines.  Returns a new list with the first
	element carried over unchanged and every line (including the last one,
	which the previous index range skipped) converted.
	"""
	print("Fixing Hungarian characters...")
	print("")

	corrections = (
		(u'õ', u'ő'),
		(u'Õ', u'Ő'),
		(u'û', u'ű'),
		(u'Û', u'Ű'),
	)

	newTome = [theTome[0]]
	for line in theTome[1:]:
		for wrong, right in corrections:
			line = line.replace(wrong, right)
		newTome.append(line)
	return newTome

def latexTome(theTome):
	"""Rewrite typographic characters as LaTeX markup.

	theTome is a list whose first element is a configuration entry and whose
	remaining elements are text lines.  Returns a new list with the first
	element carried over and every line converted; the previous index range
	stopped one line short.  When the module-level `profile` is 'hu', the
	Hungarian long vowels and the acute i are converted as well.
	"""
	print("Preparing for LaTeX...")
	print("")

	# Backslashes are doubled explicitly; the resulting strings are the same
	# as the former '\ldots{}' / '\H{o}' literals, without relying on
	# unrecognised escape sequences.
	rules = [
		(u'—', u'---'),
		(u'…', u'\\ldots{}'),
		(u'“', u'``'),
		(u'”', u"''"),
	]
	# `profile` is a module global assigned by main(); fall back to the
	# default profile when this function is used without main() having run.
	if globals().get('profile', 'en') == 'hu':
		rules.extend([
			(u'ő', u'\\H{o}'),
			(u'Ő', u'\\H{O}'),
			(u'ű', u'\\H{u}'),
			(u'Ű', u'\\H{U}'),
			(u'í', u"\\'{ı}"),
		])

	newTome = [theTome[0]]
	for line in theTome[1:]:
		for plain, markup in rules:
			line = line.replace(plain, markup)
		newTome.append(line)

	return newTome

def miscfixTome(theTome):
	"""Normalise dashes, ellipses and quotation marks in every line.

	theTome is a list whose first element is a configuration entry and whose
	remaining elements are text lines.  Returns a new list with the first
	element carried over and every line converted; the previous index range
	stopped one line short.  Typographic double quotes are turned into plain
	'"' characters.
	"""
	print("Miscellaneous fixes...")
	print("")

	# Applied strictly in this order: later rules see the output of earlier ones.
	substitutions = (
		# NOTE(review): the first two pairs read as identity mappings here;
		# they may originally have held look-alike code points - confirm.
		(u'—', u'—'),
		(u'…', u'…'),
		(u'. . .', u'…'),
		(u'...', u'…'),
		(u'“', u'"'),
		(u'”', u'"'),
		# NOTE(review): this search string appeared empty, and replacing u''
		# inserts an em dash between every character.  Presumably it held the
		# invisible C1 control U+0097 (the Windows-1252 em dash byte as seen
		# after a Latin-1 decode), so it is spelled as an escape - confirm.
		(u'\x97', u'—'),
		(u' …', u'…'),
		(u'---', u'—'),
		(u'--', u'—'),
		(u'&quot;', u'"'),
		(u'„', u'"'),
		(u'- ', u'— '),
		(u'-,', u'—,'),
		(u'-;', u'—;'),
		(u'-:', u'—:'),
		(u'-?', u'—?'),
		(u'-!', u'—!'),
		(u'-.', u'—.'),
		(u' -', u'-'),
	)

	newTome = [theTome[0]]
	for line in theTome[1:]:
		for old, new in substitutions:
			line = line.replace(old, new)
		newTome.append(line)

	return newTome

def _parfixEmit(text):
	"""Print one diagnostic line that may contain non-ASCII characters."""
	# Python 2's print needs UTF-8 bytes for such text (the encoding this
	# pass has always used); Python 3 prints text directly.
	if str is bytes:
		text = text.encode('utf-8')
	print(text)


def parfixTome(theTome):
	"""Re-join paragraphs that were split in the middle of a sentence.

	theTome is a list whose first element is a configuration entry and whose
	remaining elements are text entries.  An entry is joined (with a single
	space) to the entry two positions later when all of these hold: it is
	longer than 45 characters, its last character is not in the
	punctuation/upper-case set, the later entry starts with a lower-case
	letter, and neither of the two is entirely upper-case.  The entry in
	between is dropped along with the one that was merged.

	Returns a new list.  Every entry is now visited, and entries without a
	partner two positions later are copied unchanged; previously the last
	entry was dropped and a candidate near the end raised IndexError.
	"""
	print("Fixing erroneous paragraph breaks...")
	print("")

	strPunct = u' .?!-)‒–—―…»"\':”’AÁÀÄBCDEÉÈËFGHIÍÌÏJKLMNOÒÓÖŐPQRSTUÙÚÜŰVWXYZ'
	strLowerAlpha = u'aáàäbcdeéèëfghiíìïjklmnoòóöőpqrstuùúüűvwxyz'

	newTome = [theTome[0]]
	total = len(theTome)
	skip = 0  # entries still to be skipped after a merge
	for idx in range(1, total):
		if skip > 0:
			skip -= 1
			continue

		current = theTome[idx]
		if len(current) > 1 and idx + 2 < total:
			following = theTome[idx + 2]
			tail = current[-1:]
			head = following[0:1]
			# An empty `head` counts as a match here (as with str.find), but
			# the all-upper-case test below rejects an empty partner.
			brokenOff = tail not in strPunct and head in strLowerAlpha
			mixedCase = current != current.upper() and following != following.upper()
			if brokenOff and mixedCase and len(current) > 45:
				newTome.append(current + ' ' + following)
				print("  Concatenating: ")
				print("  {")
				_parfixEmit(u"    A: " + current)
				_parfixEmit(u"       " + str(strPunct.find(tail)) + u" (" + tail + u")")
				_parfixEmit(u"    B: " + following)
				# Reports the position of the character actually shown; it
				# used to look up the second character instead.
				_parfixEmit(u"       " + str(strLowerAlpha.find(head)) + u" (" + head + u")")
				print("  }")
				print("")
				skip = 2
				continue
		newTome.append(current)

	return newTome

def quoteTome(theTome):
	"""Turn plain double quotes into LaTeX-style `` and '' pairs.

	theTome is a list whose first element is a configuration entry and whose
	remaining elements are text lines.  Returns a new list with the first
	element carried over and every line converted.

	A '"' with a letter on both sides is left alone.  Otherwise it opens when
	preceded by a space (or the start of the line), closes when followed by a
	space (or the end of the line), and in all other cases alternates,
	starting with an opening quote on each line.
	"""
	print("Correcting double quotation marks...")
	print("")

	strAlpha = u'aáàäbcdeéèëfghiíìïjklmnoòóöőpqrstuùúüűvwxyzAÁÀÄBCDEÉÈËFGHIÍÌÏJKLMNOÒÓÖŐPQRSTUÙÚÜŰVWXYZ'

	newTome = [theTome[0]]

	for line in theTome[1:]:
		# One space of padding on each side, so the neighbours of the first
		# and last characters exist and read as spaces.
		padded = u' ' + line + u' '
		pieces = []
		isOpen = False  # state of the most recently converted quote on this line
		for pos, char in enumerate(line):
			if char != u'"':
				pieces.append(char)
				continue
			before = padded[pos]
			after = padded[pos + 2]
			if before in strAlpha and after in strAlpha:
				pieces.append(u'"')
				continue
			if before == u' ':
				isOpen = True
			elif after == u' ':
				isOpen = False
			else:
				# No space on either side: alternate.  The old extra test for
				# a quote at index len(line) could never be true.
				isOpen = not isOpen
			if isOpen:
				pieces.append(u"``")
			else:
				pieces.append(u"''")
		newTome.append(u"".join(pieces))

	return newTome

def squoteTome(theTome):
	"""Turn plain single quotes into LaTeX-style ` and ' pairs.

	theTome is a list whose first element is a configuration entry and whose
	remaining elements are text lines.  Returns a new list with the first
	element carried over and every line converted.

	An apostrophe with a letter on both sides (as in "don't") is left alone.
	Otherwise it opens when preceded by a space (or the start of the line),
	closes when followed by a space (or the end of the line), and in all
	other cases alternates, starting with an opening quote on each line.
	"""
	print("Correcting single quotation marks...")
	print("")

	strAlpha = u'aáàäbcdeéèëfghiíìïjklmnoòóöőpqrstuùúüűvwxyzAÁÀÄBCDEÉÈËFGHIÍÌÏJKLMNOÒÓÖŐPQRSTUÙÚÜŰVWXYZ'

	newTome = [theTome[0]]

	for line in theTome[1:]:
		# One space of padding on each side, so the neighbours of the first
		# and last characters exist and read as spaces.
		padded = u' ' + line + u' '
		pieces = []
		isOpen = False  # state of the most recently converted quote on this line
		for pos, char in enumerate(line):
			if char != u"'":
				pieces.append(char)
				continue
			before = padded[pos]
			after = padded[pos + 2]
			if before in strAlpha and after in strAlpha:
				pieces.append(u"'")
				continue
			if before == u' ':
				isOpen = True
			elif after == u' ':
				isOpen = False
			else:
				# No space on either side: alternate.  The old extra test for
				# a quote at index len(line) could never be true.
				isOpen = not isOpen
			if isOpen:
				pieces.append(u"`")
			else:
				pieces.append(u"'")
		newTome.append(u"".join(pieces))

	return newTome

# One-letter codes describing a run of whitespace.  LF is coded 'r' and CR is
# coded 'n'; the codes are shown in the diagnostics, so they are kept as-is.
_WS_CODES = {u' ': u's', u'\t': u't', u'\n': u'r', u'\r': u'n'}
_WS_CHARS = dict((code, char) for char, code in _WS_CODES.items())


def _whitespaceRuns(inbuffer):
	"""Yield (run, char) for every non-whitespace char, then (run, None) at the end.

	`run` is the coded whitespace run that directly precedes `char`; it is
	empty when two non-whitespace characters are adjacent.
	"""
	run = u""
	for char in inbuffer:
		code = _WS_CODES.get(char)
		if code is None:
			yield run, char
			run = u""
		else:
			run += code
	yield run, None


def _mostFrequentRun(counts, exclude, breaksOnly):
	"""Return (run, count) of the most frequent run not in `exclude`.

	With `breaksOnly`, only runs containing a line-break code ('r' or 'n')
	are considered.  Returns (u"", 0) when nothing qualifies.
	"""
	best = u""
	bestCount = 0
	for run in counts:
		if run in exclude:
			continue
		if breaksOnly and u'r' not in run and u'n' not in run:
			continue
		if counts[run] > bestCount:
			bestCount = counts[run]
			best = run
	return best, bestCount


def getTome(inbuffer):
	"""Split raw text into paragraphs using whitespace-run statistics.

	The frequencies of the whitespace runs between characters decide which
	run is a line break inside a paragraph and which one separates
	paragraphs.  Returns a list whose first element is the dict {'idx': 0},
	followed by each paragraph and an empty string after it.

	Text after the last paragraph break is now kept as a final paragraph; it
	used to be discarded unless the input ended with a paragraph break.

	Prints its findings, and calls sys.exit() when the statistics do not
	single out a paragraph break (for instance on very short input).
	"""
	theTome = [{'idx': 0}]

	print("Parsing...")
	print("")

	# The empty run between adjacent characters is counted as well, so it is
	# normally the most frequent one; the thresholds below rely on that.
	# Trailing whitespace at the end of the input counts only when non-empty.
	counts = {}
	for run, char in _whitespaceRuns(inbuffer):
		if char is None and not run:
			continue
		counts[run] = counts.get(run, 0) + 1

	wordspace, top1 = _mostFrequentRun(counts, (), False)
	linebreak, top2 = _mostFrequentRun(counts, (wordspace,), True)
	parbreak, top3 = _mostFrequentRun(counts, (wordspace, linebreak), True)
	top4 = _mostFrequentRun(counts, (wordspace, linebreak, parbreak), True)[1]

	if top3 > (top4 * 5) and top2 > (top3 * 2) and top1 > (top2 * 5):
		# Hard-wrapped text: a frequent line break plus a rarer paragraph break.
		print("  line break: " + linebreak + " (" + str(top2) + ")")
		print("  paragraph break: " + parbreak + " (" + str(top3) + ")")
		print("  ...: " + " (" + str(top4) + ")")
		print("")
		print("Removing intraparagraph linebreaks...")
		print("")
		joiners = (wordspace, linebreak)
	elif top2 > (top3 * 5) and top1 > (top2 * 5):
		# One paragraph per line: the dominant line break separates paragraphs.
		parbreak = linebreak
		print("  paragraph break: " + parbreak + " (" + str(top2) + ")")
		print("  ...: " + " (" + str(top3) + ")")
		print("  ...: " + " (" + str(top4) + ")")
		print("")
		joiners = (wordspace,)
	else:
		print("  ...: " + linebreak + " (" + str(top2) + ")")
		print("  ...: " + parbreak + " (" + str(top3) + ")")
		print("  ...: " + " (" + str(top4) + ")")
		print("")
		print("Failed to correctly parse file.  Might be too short.")
		sys.exit()

	pieces = []
	for run, char in _whitespaceRuns(inbuffer):
		if run:
			if run in joiners:
				# A trailing joiner at the end of the input adds nothing.
				if char is not None:
					pieces.append(u" ")
			elif run == parbreak:
				theTome.append(u"".join(pieces))
				theTome.append(u"")
				pieces = []
			elif char is not None:
				# Any other run is restored literally.
				pieces.append(u"".join(_WS_CHARS[code] for code in run))
		if char is not None:
			pieces.append(char)

	if pieces:
		# Text after the last paragraph break.
		theTome.append(u"".join(pieces))
		theTome.append(u"")

	return theTome

def usage():
  """Print the command-line synopsis and the list of processing options."""
  helpLines = (
    "Usage: pacify -P [en|hu] -i input.txt [options]",
    "",
    "Options:        -r : preprocess lines by right-trimming",
    "                -p : fix erroneous paragraph breaks",
    "                -c : perform character substitutions (dulls quot. marks)",
    "                -q : fix quotation marks (smartens quot. marks)",
    "                -l : prepare for LaTeX",
    "",
  )
  for helpLine in helpLines:
    print(helpLine)

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  main()
