#!/usr/bin/env python
# -*- coding: utf-8 -*-
import getopt, os, re, sys, codecs, time

def _decodeArg(value):
	"""Return a command-line value as unicode text.
	
	Byte strings (what Python 2 puts in sys.argv) are decoded with the
	file-system encoding, replacing undecodable bytes; values that are
	already unicode are returned unchanged.
	"""
	if isinstance(value, type(u'')):
		return value
	return value.decode(sys.getfilesystemencoding() or 'utf-8', 'replace')


def _latexizeArg(value):
	"""Decode a command-line value and rewrite accented i as LaTeX \\'{ı}.
	
	The three spellings \\'i, í and \\'{\\i} are all normalised to \\'{ı}.
	Decoding first avoids the implicit ASCII decode that a unicode
	replace() on a byte string performs.
	"""
	text = _decodeArg(value)
	return text.replace(u'\\\'i', u'\\\'{ı}').replace(u'í', u'\\\'{ı}').replace(u'\\\'{\\i}', u'\\\'{ı}')


def _readInput(infilename):
	"""Return the contents of infilename as unicode text.
	
	The raw bytes are decoded as UTF-8; when that fails they are decoded as
	Latin-1, which maps every byte to the code point of the same number.
	Raises IOError/OSError when the file cannot be opened or read.
	"""
	infile = open(infilename, 'rb')
	try:
		rawbuffer = infile.read()
	finally:
		infile.close()
	try:
		return rawbuffer.decode('utf-8')
	except UnicodeDecodeError:
		return rawbuffer.decode('latin-1')


def _buildLatexPreamble(settings, sortname, truename):
	"""Return the LaTeX preamble and title page as one unicode string.
	
	settings supplies 'title', 'subtitle' and 'identity'; sortname goes into
	the PDF author field and truename onto the title page.  The subtitle
	block and the `Ex Libris' line are only emitted when the corresponding
	setting is non-empty.
	"""
	latexBegin = u'''\\documentclass[11pt,final,openany]{memoir}

\\setlength{\\stockwidth}{9cm}
\\setlength{\\stockheight}{12cm}

\\usepackage[paperwidth=9cm, paperheight=12cm, headsep=0ex, hmargin={0.075in, 0.075in}, vmargin={0.165in, 0.05in}]{geometry}

\\usepackage[dvipdfm, pdftitle={'''+settings['title']+u'''},pdfauthor={'''+sortname+u'''}]{hyperref}
\\usepackage{xunicode}
\\usepackage{xltxtra}

%\\PassOptionsToPackage{defaults=hu-min}{magyar.ldf}
\\usepackage[french,magyar]{babel}

%\\usepackage[utf8]{inputenc}
\\usepackage{fontenc}

\\defaultfontfeatures{Mapping=tex-text}

\\setromanfont[Ligatures={Common},Numbers={OldStyle}]{Garamond Premier Pro}

\\widowpenalty 500
\\clubpenalty 5000
\\raggedbottom

\\setlength\\beforechapskip{2cm}

\\pagestyle{ruled}
\\makeevenhead{ruled}{\\tiny\\scshape '''+settings['title']+u'''}{}{\\tiny\\rightmark}
\\makeatletter
\\makeoddhead{ruled}{\\tiny\\scshape '''+settings['title']+u'''}{}{\\tiny\\rightmark}

\\newcommand{\\lguill}{\\selectlanguage{french}\\fg{}\\selectlanguage{magyar}}
\\newcommand{\\rguill}{\\selectlanguage{french}\\og{}\\selectlanguage{magyar}}

\\newcommand*{\\parbreak}{\\begin{center}*\\quad*\\quad*\\end{center} \\vspace*{-1ex}}

\\newcommand{\\dochapter}[1]{\\chapter*{#1}\\addcontentsline{toc}{chapter}{#1}\\markright{#1}}
\\newcommand{\\doemphchapter}[1]{\\chapter*{\\emph{#1}}\\addcontentsline{toc}{chapter}{\\emph{#1}}\\markright{\\emph{#1}}}

\\newcommand{\\donothing}[1]{#1}

\\begin{document}

\\thispagestyle{empty}
\\pagestyle{empty}

\\begin{center}

\\vspace*{0.2in}

\\Large ''' + truename + u'''

\\vspace*{-0.0in}

\\HUGE \\textsc{''' + settings['title'] + u'''}'''
	if len(settings['subtitle']) > 0:
		latexBegin += u'''

\\vspace*{0.18in}

\\hrule

\\vspace*{0.18in}

{\\Large{}''' + settings['subtitle'] + u'''}

'''
	latexBegin += u'''\\vfil\n\n'''
	if len(settings['identity']) > 0:
		latexBegin += u'''\\normalsize \\scshape Ex Libris ''' + settings['identity']
	else:
		latexBegin += u'''\\normalsize \\scshape ~ '''
	latexBegin += u'''\n\n\\vfilneg

\\end{center}

\\clearpage{}

\\tableofcontents*{}

\\begin{sloppypar}

\\clearpage{}

\\thispagestyle{ruled}
\\pagestyle{ruled}

'''
	return latexBegin


def main():
	"""Parse the command line, run the selected passes and write the output.
	
	Initialises the module globals log, options and debug, reads the input
	file, applies the requested fixes and writes <input>_pacified.htm or
	<input>_pacified.tex next to the input file.
	
	Returns 0 on success.  Exits with status 2 on command-line errors and
	with status 1 when the input file cannot be read.
	"""
	
	#####################################################################
	#                                                                   #
	#                     Initialization Begins Here                    #
	#                                                                   #
	#####################################################################
	
	global log, options, debug
	debug = False
	log = u''
	options = {}
	
	infilename = u''
	outfilename = u'output.txt'
	
	options['title'] = u''
	options['author'] = u''
	options['subtitle'] = u''
	options['identity'] = u''	# identity of bookmaker -- optionally used in latex
	
	options['profile'] = u'en'	# profile defaults to english, not hungarian
	options['interactive'] = False	# run in non-interactive mode
	options['pp_rtf'] = False	# preprocess: file is text, not rtf
	options['pp_html'] = False	# preprocess: file is text, not html
	options['pp_rstrip'] = False	# preprocess: no random line-ending spaces
	options['rp_parfix'] = False	# r. process: no erroneous par. breaks fix
	options['rp_charfix'] = False	# r. process: no miscellaneous characters fix
	options['rp_quotfix'] = False	# r. process: no quotation marks fix
	options['out_html'] = True	# output html
	options['out_text'] = False	# output plaintext
	options['out_latex'] = False	# output latex
	options['l_profile'] = u''	# latex profile defaults to nothing
	
	# Accepted values for -l: the x_ profiles are for XeLaTeX, the l_ ones
	# for plain LaTeX (georgia, palatino, garamond, garamond premier pro
	# [caption], arno pro [caption], computer modern, palatino, kerkis).
	l_profiles = [
		u'x_georgia',
		u'x_palatino',
		u'x_garamond',
		u'x_gppro',
		u'x_gpprocap',
		u'x_arpro',
		u'x_arprocap',
		u'l_cmodern',
		u'l_palatino',
		u'l_kerkis',
	]
	
	logmsg(u'pacify v0.3.1 (2009-08-29) - Copyright 2009 Pax Librorum (www.PaxLibrorum.com)\n')
	
	try:
		opts, args = getopt.getopt(sys.argv[1:], "?huP:i:vprcql:RI:T:A:S:", ["help", "profile=", "input=", "output", "verbose"])
	except getopt.GetoptError as err:
		# report the problem (something like "option -a not recognized") and exit
		logmsg(u'        ERROR: ' + str(err)+u'\n')
		usage()
		sys.exit(2)	# usage() already exits; kept so opts can never be unbound below
	
	for o, a in opts:
		if o in ("-?", "-h", "-u", "--help"):
			usage()
		elif o == "-I":
			options['identity'] = _latexizeArg(a)
		elif o == "-T":
			options['title'] = _latexizeArg(a)
		elif o == "-A":
			options['author'] = _latexizeArg(a)
		elif o == "-S":
			options['subtitle'] = _latexizeArg(a)
		elif o in ("-i", "--input"):
			infilename = a
			lowered = infilename.lower()
			if lowered.endswith('.rtf'):
				options['pp_rtf'] = True
			elif lowered.endswith('.htm') or lowered.endswith('.html'):
				logmsg(u'        ERROR: html input files are not supported at this time\n')
				sys.exit(2)
		elif o in ("-P", "--profile"):
			profile = a.lower()
			if profile in ('en', 'hu'):
				# store the normalised value: later stages compare against 'hu'
				options['profile'] = profile
			else:
				logmsg(u'        ERROR: profile must be \'en\' (english) or \'hu\' (magyar) \n')
				sys.exit(2)
		elif o == "-r": options['pp_rstrip'] = True
		elif o == "-p": options['rp_parfix'] = True
		elif o == "-c": options['rp_charfix'] = True
		elif o == "-q": options['rp_quotfix'] = True
		elif o == "-l":
			options['out_latex'] = True
			options['out_html'] = False
			# exact match only; a substring search would accept e.g. "x_"
			if a in l_profiles:
				options['l_profile'] = a
			else:
				logmsg(u'        ERROR: incorrect latex profile\n')
				usage()
				sys.exit(2)
		# -v, -R, --output and --verbose are accepted by getopt but have no effect
	
	if infilename == u'':
		logmsg(u'        ERROR: input file required, but not specified\n')
		usage()
		sys.exit(2)
	
	logmsg(u'\n')
	
	try:
		inbuffer = _readInput(infilename)
	except (IOError, OSError):
		logmsg(u'        ERROR: cannot open input file\n')
		sys.exit(1)
	
	#####################################################################
	#                                                                   #
	#                     Processing Begins Here                        #
	#                                                                   #
	#####################################################################
	
	if options['pp_rtf']:
		inbuffer = rtfExtract(inbuffer)
		inbuffer = postRtfCleanup(inbuffer)
	
	theTome = getTome(inbuffer)
	
	if options['rp_parfix']:
		theTome = parfixTome(theTome)
	
	if options['rp_quotfix']:
		theTome = quotedumbenTome(theTome)
		theTome = quoteTome(theTome)
		theTome = squoteTome(theTome)
	
	if options['rp_charfix']:
		theTome = charfixTome(theTome)
	
	if options['profile'] == 'hu':
		theTome = hungfixTome(theTome)
	
	theTome = htmlizeTome(theTome)
	
	if options['out_latex']:
		theTome = latexTome(theTome)
	
	# The author is given as "family, given"; Hungarian names keep that order.
	sortname = u''
	truename = u''
	author = options['author']
	if len(author) > 0:
		if author.find(',') == -1:
			sortname = author
			truename = author
		else:
			familyname, givenname = [part.strip() for part in author.split(u',', 1)]
			if options['profile'] == 'hu':
				sortname = familyname + u' ' + givenname
				truename = sortname
			else:
				sortname = familyname + u', ' + givenname
				truename = givenname + u' ' + familyname
	
	latexBegin = _buildLatexPreamble(options, sortname, truename)
	latexCease = u'\n\n\\end{sloppypar}\n\n\\end{document}'
	
	# splitext copes with extensions of any length (or none at all)
	stem = os.path.splitext(infilename)[0]
	if options['out_html']:
		outfilename = stem + '_pacified' + '.htm'
	elif options['out_latex']:
		outfilename = stem + '_pacified' + '.tex'
	
	outfile = open(outfilename, 'w')
	try:
		logmsg(u'Writing to ' + _decodeArg(outfilename) + u' ...\n')
		if options['out_latex']:
			outfile.write(latexBegin.encode('utf-8'))
		# entry 0 of the tome is its configuration record, not text
		for line in theTome[1:]:
			outfile.write(line.encode('utf-8')+chr(10))
		if options['out_latex']:
			outfile.write(latexCease.encode('utf-8'))
	finally:
		outfile.close()
	logmsg("Done!\n")
	
	return 0



	#####################################################################
	#                                                                   #
	#                 Helper and Utility Functions Begin                #
	#                                                                   #
	#####################################################################

def charfixTome(theTome):
	"""Replace dashes, ellipses, guillemets, bullets and curly quotes by HTML entities.
	
	theTome is a list whose first entry is the configuration record (copied
	through unchanged) followed by text entries.  A new list is returned.
	With the 'hu' profile "--" becomes an en dash and opening quotes become
	low-9 quotes; otherwise "--" becomes an em dash and opening quotes stay
	left quotes.
	"""
	global options
	
	logmsg("Fixing miscellaneous characters...\n")
	
	hungarian = options['profile'] == 'hu'
	
	# NOTE(review): the four \x.. keys were empty/invisible literals in the
	# source.  An empty search string would make replace() insert the entity
	# between every character, so they are spelled out here on the assumption
	# that they were the raw Windows-1252 bytes 0x96, 0x97, 0x85 and 0x95
	# (en dash, em dash, ellipsis, bullet) -- please confirm.
	fixes = [
		(u'--', u'&ndash;' if hungarian else u'&mdash;'),
		(u'–', u'&ndash;'),
		(u'—', u'&mdash;'),
		(u'\x96', u'&ndash;'),
		(u'\x97', u'&mdash;'),
		(u'»', u'&raquo;'),
		(u'«', u'&laquo;'),
		(u'\x85', u'&hellip;'),
		(u'…', u'&hellip;'),
		(u'...', u'&hellip;'),
		(u'. . .', u'&hellip;'),
		(u'\x95', u'&bull;'),
		(u'‘', u'&sbquo;' if hungarian else u'&lsquo;'),
		(u'’', u'&rsquo;'),
		(u'“', u'&bdquo;' if hungarian else u'&ldquo;'),
		(u'”', u'&rdquo;'),
	]
	
	newTome = [theTome[0]]
	for outbuffer in theTome[1:]:
		for key, value in fixes:
			outbuffer = outbuffer.replace(key, value)
		newTome.append(outbuffer)
	return newTome


def hungfixTome(theTome):
	"""Turn õ/Õ/û/Û into the numeric entities for ő/Ő/ű/Ű.
	
	The first entry of theTome (the configuration record) is copied through
	unchanged; every later entry has the four substitutions applied.  A new
	list is returned.
	"""
	
	logmsg("Fixing Hungarian characters...\n")
	
	replacements = {
		u'õ': u'&#337;',
		u'Õ': u'&#336;',
		u'û': u'&#369;',
		u'Û': u'&#368;',
	}
	
	def fixEntry(entry):
		# the four keys are single, distinct characters, so order is irrelevant
		for wrong, entity in replacements.items():
			entry = entry.replace(wrong, entity)
		return entry
	
	return [theTome[0]] + [fixEntry(entry) for entry in theTome[1:]]

def latexTome(theTome):
	"""Convert the tome's HTML-style markup and entities to LaTeX.
	
	The first entry of theTome (the configuration record) is copied through
	unchanged.  Every later entry is converted in two stages: first tags and
	entities are replaced by LaTeX commands, then the characters & # [ ]
	that remain are escaped.  The second stage must run after the first,
	because the entities themselves contain & and #.  A new list is returned.
	"""
	global options
	# the original message said "Fixing Hungarian characters...", copied from hungfixTome
	logmsg("Converting to LaTeX...\n")
	
	markupFixes = []
	if options['profile'] == 'hu':
		markupFixes.extend([
			(u'&#337;', u'\\H{o}'),
			(u'&#336;', u'\\H{O}'),
			(u'&#369;', u'\\H{u}'),
			(u'&#368;', u'\\H{U}'),
		])
	markupFixes.extend([
		(u'í', u'\\\'{ı}'),
		(u'<footnote>', u'\\footnote{'),
		(u'</footnote>', u'}'),
		(u'<p>', u''),
		(u'</p>', u''),
		(u'<strong>', u'\\textbf{'),
		(u'<emph>', u'\\emph{'),
		(u'</strong>', u'}'),
		(u'</emph>', u'}'),
		(u'&ndash;', u'--'),
		(u'&mdash;', u'---'),
		(u'&raquo;', u'»'),
		(u'&laquo;', u'«'),
		(u'&hellip;', u'\\ldots{}'),
		(u'&bull;', u'\\textperiodcentered{}'),
		(u'&sbquo;', u'{`}'),
		(u'&lsquo;', u'{`}'),
		(u'&rsquo;', u'{\'}'),
		(u'&bdquo;', u'{``}'),
		(u'&ldquo;', u'{``}'),
		(u'&rdquo;', u'{\'\'}'),
	])
	
	escapeFixes = [
		(u'&', u'{\\&}'),
		(u'#', u'{\\#}'),
		(u'[', u'$[$'),
		(u']', u'$]$'),
	]
	
	newestTome = [theTome[0]]
	for outbuffer in theTome[1:]:
		for key, value in markupFixes:
			outbuffer = outbuffer.replace(key, value)
		for key, value in escapeFixes:
			outbuffer = outbuffer.replace(key, value)
		newestTome.append(outbuffer)
	
	return newestTome

def htmlizeTome(theTome):
	"""Return a copy of theTome; no markup is added at present.
	
	The paragraph-wrapping step (<p>...</p>, with left padding for indented
	lines) is disabled, so every entry is carried over unchanged.
	"""
	
	logmsg("HTMLizing...\n")
	
	configRecord = theTome[0]
	return [configRecord] + list(theTome[1:])

def parfixTome(theTome):
	"""Re-join paragraphs that were split in the middle of a sentence.
	
	Entry idx is merged with entry idx+2 (idx+1 is skipped) when all of the
	following hold: entry idx is longer than 45 characters and does not end
	in a character from strPunct (punctuation, space or an upper-case
	letter); entry idx+2 starts with a character from strLowerAlpha (space
	or a lower-case letter); and neither entry is entirely upper case.  The
	first entry (the configuration record) is copied through unchanged and a
	new list is returned.
	"""
	
	logmsg("Fixing erroneous paragraph breaks...\n")
	
	newTome = [theTome[0]]
	
	strPunct = u' .?!-)‒–—―…»"\':;”’AÁÀÄBCDEÉÈËFGHIÍÌÏJKLMNOÒÓÖŐPQRSTUÙÚÜŰVWXYZ'
	strLowerAlpha = u' aáàäbcdeéèëfghiíìïjklmnoòóöőpqrstuùúüűvwxyz'
	toSkip = 0
	for idx in range(1, len(theTome)):
		if toSkip > 0:
			# this entry was consumed by a merge on an earlier iteration
			toSkip -= 1
			continue
		current = theTome[idx]
		if len(current) > 1 and len(theTome) > (idx+2):
			following = theTome[idx+2]
			lastChar = current[-1:]
			firstChar = following[0:1]
			endsOpen = strPunct.find(lastChar) == -1
			continues = strLowerAlpha.find(firstChar) > -1
			notHeadings = (current != current.upper()) and (following != following.upper())
			if endsOpen and continues and notHeadings and (len(current) > 45):
				newTome.append(current + ' ' + following)
				# Log unicode text, not UTF-8 bytes: logmsg appends to a
				# unicode log, which fails on non-ASCII byte strings.
				logmsg(u"  Concatenating: ")
				logmsg(u"  {")
				logmsg(u"    A: " + current)
				logmsg(u"       " + str(strPunct.find(lastChar)) + u" (" + lastChar + u")")
				logmsg(u"    B: " + following)
				# report the index of the character that was actually tested
				logmsg(u"       " + str(strLowerAlpha.find(firstChar)) + u" (" + firstChar + u")")
				logmsg(u"  }\n")
				toSkip = 2
				continue
		newTome.append(current)
	
	return newTome

def quotedumbenTome(theTome):
	"""Replace typographic quotation marks by plain ASCII " and '.
	
	The first entry of theTome (the configuration record) is copied through
	unchanged; a new list is returned.
	"""
	logmsg("Dumbening quotation marks...\n")
	
	# NOTE(review): the last two marks were empty/invisible literals in the
	# source.  An empty search string would make replace() insert a quote
	# between every character, so they are spelled out here on the
	# assumption that they were the raw Windows-1252 bytes 0x93 and 0x94
	# (left/right double quote) -- please confirm.
	doubleMarks = (u'&quot;', u'„', u'“', u'”', u'\x93', u'\x94')
	singleMarks = (u'‘', u'’')
	
	newTome = [theTome[0]]
	for line in theTome[1:]:
		for mark in doubleMarks:
			line = line.replace(mark, u'"')
		for mark in singleMarks:
			line = line.replace(mark, u'\'')
		newTome.append(line)
	
	return newTome

def quoteTome(theTome):
	"""Replace plain double quotes by opening/closing curly quotes.
	
	A quote with a letter on both sides is left alone.  Otherwise a quote
	preceded by a space (or at the start of the entry) opens, one followed
	by a space (or at the end of the entry) closes, and any other quote
	alternates: it closes if the previous decision was "open" and opens
	otherwise.  The state is reset for every entry.  The first entry (the
	configuration record) is copied through unchanged.
	"""
	
	logmsg("Correcting double quotation marks...\n")
	
	letters = u'aáàäbcdeéèëfghiíìïjklmnoòóöőpqrstuùúüűvwxyzAÁÀÄBCDEÉÈËFGHIÍÌÏJKLMNOÒÓÖŐPQRSTUÙÚÜŰVWXYZ'
	
	result = [theTome[0]]
	
	for line in theTome[1:]:
		# two spaces of padding on each side, so the neighbours of the
		# character at position pos are padded[pos+1] and padded[pos+3]
		padded = '  ' + line + '  '
		lastWasOpen = False
		pieces = []
		for pos, char in enumerate(line):
			if char != '"':
				pieces.append(char)
				continue
			before = padded[pos+1:pos+2]
			after = padded[pos+3:pos+4]
			if letters.find(before) != -1 and letters.find(after) != -1:
				# wedged between two letters: keep the plain mark
				pieces.append('"')
				continue
			if before == u' ':
				opening = True
			elif after == u' ':
				opening = False
			else:
				opening = not lastWasOpen
			if opening:
				pieces.append(u"“")
			else:
				pieces.append(u"”")
			lastWasOpen = opening
		result.append(u"".join(pieces))
	
	return result

def squoteTome(theTome):
	"""Replace plain single quotes by opening/closing curly quotes.
	
	A quote with a letter on both sides (an apostrophe inside a word) is
	left alone.  Otherwise a quote preceded by a space (or at the start of
	the entry) opens, one followed by a space (or at the end of the entry)
	closes, and any other quote alternates: it closes if the previous
	decision was "open" and opens otherwise.  The state is reset for every
	entry.  The first entry (the configuration record) is copied through
	unchanged.
	"""
	
	logmsg("Correcting single quotation marks...\n")
	
	letters = u'aáàäbcdeéèëfghiíìïjklmnoòóöőpqrstuùúüűvwxyzAÁÀÄBCDEÉÈËFGHIÍÌÏJKLMNOÒÓÖŐPQRSTUÙÚÜŰVWXYZ'
	
	result = [theTome[0]]
	
	for line in theTome[1:]:
		# two spaces of padding on each side, so the neighbours of the
		# character at position pos are padded[pos+1] and padded[pos+3]
		padded = '  ' + line + '  '
		lastWasOpen = False
		pieces = []
		for pos, char in enumerate(line):
			if char != '\'':
				pieces.append(char)
				continue
			before = padded[pos+1:pos+2]
			after = padded[pos+3:pos+4]
			if letters.find(before) != -1 and letters.find(after) != -1:
				# wedged between two letters: keep the plain mark
				pieces.append('\'')
				continue
			if before == u' ':
				opening = True
			elif after == u' ':
				opening = False
			else:
				opening = not lastWasOpen
			if opening:
				pieces.append(u"‘")
			else:
				pieces.append(u"’")
			lastWasOpen = opening
		result.append(u"".join(pieces))
	
	return result

def postRtfCleanup(inbuffer):
	"""Remove <strong>/<emph> pairs that enclose nothing but 0-99 spaces.
	
	Both properly nested and crossed combinations of the two tags are
	removed.  The whole sweep runs three times, so pairs that only become
	empty once an inner pair has been removed are caught as well.
	"""
	
	tagPairs = (
		(u'<strong>', '</strong>'),
		(u'<emph>', '</emph>'),
		(u'<strong><emph>', '</emph></strong>'),
		(u'<emph><strong>', '</strong></emph>'),
		(u'<strong><emph>', '</strong></emph>'),
		(u'<emph><strong>', '</emph></strong>'),
	)
	
	cleaned = inbuffer
	for sweep in range(3):
		# widest padding first, down to no padding at all
		for width in range(99, -1, -1):
			padding = ' ' * width
			for opening, closing in tagPairs:
				cleaned = cleaned.replace(opening + padding + closing, '')
	
	return cleaned

def _rtfClosingTags(openTags):
	"""Return the closing tags for the open formats in openTags, innermost first."""
	closers = {u'b': u'</strong>', u'i': u'</emph>'}
	return [closers[tag] for tag in reversed(openTags) if tag in closers]


def _rtfOpeningTags(openTags):
	"""Return the opening tags for the formats in openTags, outermost first."""
	openers = {u'b': u'<strong>', u'i': u'<emph>'}
	return [openers[tag] for tag in openTags if tag in openers]


def rtfExtract(inbuffer):
	"""Extract the text of an RTF document, keeping bold/italic/footnote markup.
	
	inbuffer is scanned one character at a time by a small state machine
	('normal', 'backslash', 'command', 'footnote', 'skipbraced').  Bold and
	italic runs become <strong>/<emph>, \\par becomes a blank line, \\line a
	line break, \\tab a tab, \\~ a space, \\'hh the character with that hex
	code and footnotes <footnote>...</footnote>.  Groups opened by commands
	that start with f or s, or by one of the names in skippedGroups, are
	dropped.
	
	The first and the last character of inbuffer are never examined; for a
	complete document these are the outermost braces.  Line breaks in the
	input are removed before scanning.  Returns the extracted text.
	
	Reads the module global debug, which must be set before the call.
	"""
	
	global debug
	
	if len(inbuffer) > ((1024*5)-1):
		logmsg(u'Extracting text from RTF...\n')
	
	numeric = u'0123456789'
	# characters that may be part of a control word after the backslash
	commandChars = u'abcdefghijklmnopqrstuvwxyz' + u'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + numeric + u'*\'-~_:'
	# control words whose whole group is dropped when they directly follow '{'
	skippedGroups = (u'operator', u'author', u'fonttbl', u'colortbl', u'stylesheet', u'info', u'*', u'header', u'pict')
	
	openTags = u''	# currently open formats: 'b' and/or 'i', in opening order
	outbuffer = []
	
	state = 'normal'
	bracelevel = 0
	progress = 0
	command = u''
	footnote = u''
	
	inbuffer = inbuffer.replace(u'\r', u'').replace(u'\n', u'')
	
	idx = 0
	while idx < len(inbuffer)-2:
		idx += 1
		progress += 1
		
		if debug == True and progress > 435050: sys.exit(0)
		if debug == True: print(inbuffer[idx:idx+1] + "      (" + state + ") " + inbuffer[idx:idx+5])
		
		if progress > ((1024 * 512)-1):
			progress = 0
			logmsg("  ... progress: " + str(round((float(idx)/float(len(inbuffer)))*100, 2)) + "% (" + str(round(float(idx)/1024/1024,1)) + " MB of " + str(round(float(len(inbuffer))/1024/1024,1)) + " MB)")
		
		digit = inbuffer[idx:idx+1]
		
		if state == 'footnote':
			# collect the footnote group verbatim until its brace closes
			justclosed = False
			footnote += digit
			if digit == u'{':
				bracelevel += 1
			elif digit == u'}':
				justclosed = True
				bracelevel -= 1
			if bracelevel == 0 and justclosed == True:
				body = footnote[0:-1]	# drop the closing brace
				# The recursive call never examines its first and last
				# character.  A leading delimiter space may be sacrificed;
				# anything else is protected by a sentinel space.
				if body[0:1] != u' ':
					body = u' ' + body
				outbuffer.append('<footnote>'+rtfExtract(body + u' ')+'</footnote>')
				footnote = u''
				state = 'normal'
			continue
		elif state == 'skipbraced':
			# discard everything until the group's brace closes
			justclosed = False
			if digit == u'{':
				bracelevel += 1
			elif digit == u'}':
				justclosed = True
				bracelevel -= 1
			if bracelevel == 0 and justclosed == True:
				state = 'normal'
			continue
		elif state == 'backslash':
			# accumulate the control word that follows a backslash
			if commandChars.find(digit) > -1:
				if (len(command) > 0) and command[0] == '\'' and len(command) == 2:
					# second hex digit of \'hh completes the command
					command += digit
					state = 'command'
					continue
				elif (len(command) > 0) and (numeric.find(command[-1]) > -1) and (numeric.find(digit) == -1):
					# a numeric parameter ends at the first non-digit
					idx -= 1
					state = 'command'
					continue
				else:
					command += digit
			else:
				# any other character ends the control word; look at it again
				idx -= 1
				state = 'command'
				continue
		elif state == 'command':
			
			if debug == True: print("Command: \\" + command)
			
			if command == u'':
				# A backslash followed by a non-command character.  \{ \}
				# and \\ are escaped literals; anything else is re-read as
				# ordinary text.
				if digit in (u'{', u'}', u'\\'):
					outbuffer.append(digit)
				else:
					idx -= 1
				state = 'normal'
				continue
			
			# True when the backslash of this command directly follows '{'
			precededByBrace = inbuffer[idx-len(command)-2:idx-len(command)-1] == '{'
			
			if len(command) == 3 and command[0] == u'\'':
				outbuffer.append(unichr(int(command[1:3], 16)))
			elif command == u'~':
				outbuffer.append(u' ')
			elif command == u'b':
				outbuffer.append(u'<strong>')
				openTags += u'b'
			elif command == u'b0':
				if openTags.find(u'b') > -1:
					outbuffer.append(u'</strong>')
					openTags = openTags[0:openTags.rfind('b')]+openTags[openTags.rfind('b')+1:]
			elif command == u'i':
				outbuffer.append(u'<emph>')
				openTags += u'i'
			elif command == u'i0':
				if openTags.find(u'i') > -1:
					outbuffer.append(u'</emph>')
					openTags = openTags[0:openTags.rfind('i')]+openTags[openTags.rfind('i')+1:]
			elif command == u'par':
				# close the open formats, break the paragraph, reopen them
				outbuffer.extend(_rtfClosingTags(openTags))
				outbuffer.append(u'\n\n')
				outbuffer.extend(_rtfOpeningTags(openTags))
			elif command == u'pard':
				# paragraph reset: close and forget all open formats
				outbuffer.extend(_rtfClosingTags(openTags))
				openTags = u''
			elif command == u'tab':
				outbuffer.append(u'\t')
			elif command == u'line':
				outbuffer.extend(_rtfClosingTags(openTags))
				outbuffer.append(u'\n')
				outbuffer.extend(_rtfOpeningTags(openTags))
			elif command == u'footnote' and precededByBrace:
				bracelevel = 1
				state = 'footnote'
				idx -= 1	# the delimiter becomes the first footnote character
				command = u''
				continue
			elif precededByBrace and (command[0] in (u'f', u's') or command in skippedGroups):
				# the current character is consumed without brace counting,
				# exactly as before
				bracelevel = 1
				state = 'skipbraced'
				command = u''
				continue
			
			oldcommand = command
			command = u''
			if digit == ' ':
				# A space delimits the control word and is consumed, but
				# after \'hh it is real text.
				if (len(oldcommand) == 3) and (oldcommand[0] == '\''):
					outbuffer.append(u' ')
				state = 'normal'
				continue
			else:
				idx -= 1
				state = 'normal'
				continue
		elif state == 'normal':
			if digit == u'{' or digit == u'}':
				# group boundaries carry no text
				pass
			elif digit == u'\\':
				state = 'backslash'
			elif digit == u'\n' or digit == u'\r':
				pass
			else:
				outbuffer.append(digit)
	
	if len(inbuffer) > ((1024*5)-1):
		logmsg("  ... progress: " + str(round((float(idx)/float(len(inbuffer)))*100, 2)) + "% (" + str(round(float(idx)/1024/1024,1)) + " MB of " + str(round(float(len(inbuffer))/1024/1024,1)) + " MB)\n")
	
	return ''.join(outbuffer)

# One-letter codes used to describe a run of whitespace.  Note that line
# feed (chr(10)) is coded 'r' and carriage return (chr(13)) is coded 'n'.
_SPACE_CODES = {u' ': u's', u'\t': u't', u'\n': u'r', u'\r': u'n'}
_SPACE_CHARS = dict((code, char) for char, code in _SPACE_CODES.items())


def _countSpaceRuns(inbuffer):
	"""Count, per whitespace run, how often it precedes a non-whitespace character.
	
	Runs are keyed by their code string (e.g. u'rr').  The empty run is
	counted too -- once for every character that directly follows another
	non-whitespace character -- and a trailing run at the end of the text
	is counted once.  The thresholds in getTome rely on the empty run
	being included.
	"""
	counts = {}
	collSpace = u""
	for digit in inbuffer:
		code = _SPACE_CODES.get(digit)
		if code is not None:
			collSpace += code
		else:
			counts[collSpace] = counts.get(collSpace, 0) + 1
			collSpace = u""
	if collSpace != u"":
		counts[collSpace] = counts.get(collSpace, 0) + 1
	return counts


def _mostFrequentRun(counts, excluded, breaksOnly):
	"""Return (run, count) for the most frequent run, or (u"", 0) if none qualifies.
	
	Runs listed in excluded are ignored; with breaksOnly set, only runs
	containing a line feed or carriage return code are considered.
	"""
	best = u""
	highest = 0
	for key in counts:
		if counts[key] <= highest or key in excluded:
			continue
		if breaksOnly and key.find('r') == -1 and key.find('n') == -1:
			continue
		best = key
		highest = counts[key]
	return best, highest


def _expandRun(collSpace):
	"""Turn a run code string back into the whitespace it stands for."""
	return u"".join([_SPACE_CHARS[code] for code in collSpace])


def _splitParagraphs(inbuffer, wordspace, linebreak, parbreak):
	"""Split inbuffer into tome entries.
	
	A run equal to wordspace or linebreak becomes a single space, a run
	equal to parbreak ends the entry and adds an empty separator entry, and
	any other run is kept as it was.  Pass None for linebreak when line
	breaks are not to be folded.  Trailing whitespace that is a word space
	or line break is dropped; the last entry is appended exactly once.
	"""
	entries = []
	pieces = []
	collSpace = u""
	for digit in inbuffer:
		code = _SPACE_CODES.get(digit)
		if code is not None:
			collSpace += code
			continue
		if collSpace != u"":
			if collSpace == wordspace or collSpace == linebreak:
				pieces.append(u" ")
			elif collSpace == parbreak:
				entries.append(u"".join(pieces))
				entries.append(u"")
				pieces = []
			else:
				pieces.append(_expandRun(collSpace))
			collSpace = u""
		pieces.append(digit)
	
	if collSpace == u"" or collSpace == wordspace or collSpace == linebreak:
		# This used to append the last entry twice (once plain, once with
		# a trailing space) when the text ended in a word space or line break.
		entries.append(u"".join(pieces))
	elif collSpace == parbreak:
		entries.append(u"".join(pieces))
		entries.append(u"")
	else:
		pieces.append(_expandRun(collSpace))
		entries.append(u"".join(pieces))
	return entries


def getTome(inbuffer):
	"""Split the text in inbuffer into a "tome": a list of paragraph entries.
	
	Entry 0 is the configuration record {'idx': 0}.  The remaining entries
	are the paragraphs, each followed by an empty separator entry wherever a
	paragraph break was found.
	
	Which whitespace run is the line break and which the paragraph break is
	guessed from how often each run occurs.  If the text clearly shows both
	(hard-wrapped paragraphs), line breaks inside a paragraph are replaced
	by spaces.  Otherwise the most frequent run containing a line break is
	taken to be the paragraph break.  Progress is printed to stdout.
	"""
	
	theTome = [{'idx': 0}]
	
	print("Parsing...")
	print("")
	
	spaceDict = _countSpaceRuns(inbuffer)
	wordspace, top1 = _mostFrequentRun(spaceDict, (), False)
	linebreak, top2 = _mostFrequentRun(spaceDict, (wordspace,), True)
	parbreak, top3 = _mostFrequentRun(spaceDict, (wordspace, linebreak), True)
	fourth, top4 = _mostFrequentRun(spaceDict, (wordspace, linebreak, parbreak), True)
	
	if top3 > (top4 * 5) and top2 > (top3 * 2) and top1 > (top2 * 15) and top3 > 100:
		print("  line break: " + linebreak + " (" + str(top2) + ")")
		print("  paragraph break: " + parbreak + " (" + str(top3) + ")")
		print("  ...: " + " (" + str(top4) + ")")
		print("")
		print("Removing intraparagraph linebreaks...")
		print("")
		
		theTome.extend(_splitParagraphs(inbuffer, wordspace, linebreak, parbreak))
		
	elif top2 > (top3 * 5) and top1 > (top2 * 5):
		parbreak = linebreak
		print("  paragraph break: " + parbreak + " (" + str(top2) + ")")
		print("  ...: " + " (" + str(top3) + ")")
		print("  ...: " + " (" + str(top4) + ")")
		print("")
		
		theTome.extend(_splitParagraphs(inbuffer, wordspace, None, parbreak))
		
	else:
		print("  ...: " + linebreak + " (" + str(top2) + ")")
		print("  ...: " + parbreak + " (" + str(top3) + ")")
		print("  ...: " + " (" + str(top4) + ")")
		print("")
		print("Failed to correctly parse file.  Might be too short.")
		print("Assuming linebreaks are paragraph breaks.")
		print("")
		parbreak = linebreak
		print("  paragraph break: " + parbreak + " (" + str(top2) + ")")
		print("  ...: " + " (" + str(top3) + ")")
		print("  ...: " + " (" + str(top4) + ")")
		print("")
		
		theTome.extend(_splitParagraphs(inbuffer, wordspace, None, parbreak))
	
	return theTome

def logmsg(text):
	"""Append text to the global log and print it.
	
	text may be unicode or a UTF-8 encoded byte string; byte strings are
	decoded first, because appending non-ASCII bytes to the unicode log
	raises UnicodeDecodeError.  If stdout cannot encode the text, it is
	printed with backslash escapes instead of failing.
	
	The module global log must have been initialised (main does this).
	"""
	global log
	if not isinstance(text, type(u'')):
		text = text.decode('utf-8', 'replace')
	log += text
	try:
		print(text)
	except UnicodeEncodeError:
		# e.g. output piped through an ASCII-only stream
		print(text.encode('ascii', 'backslashreplace').decode('ascii'))

def usage():
	"""Print the command-line help to stdout and exit with status 2."""
	# Only the x_gppro LaTeX profile is advertised; the other profile names
	# accepted by -l are deliberately left out of the help text.
	helpLines = (
		u"Usage: pacify.py -P [en|hu] -i input.[txt|rtf] [options]",
		u"",
		u"Preprocessing options:",
		u"",
		u"        -r : preprocess lines by right-trimming (use if lines are randomly,",
		u"                 NOT systematically, ended with one or more spaces)",
		u"",
		u"Regular processing options:",
		u"",
		u"        -p : fix erroneous paragraph breaks",
		u"        -c : perform miscellaneous character substitutions/clean-ups",
		u"        -q : fix quotation marks (smartens quotation marks)",
		u"",
		u"Output options:",
		u"",
		u"        -l latex_profile : prepare for LaTeX",
		u"                           latex_profile can be:",
		u"",
		u"                               x_gppro: XeLaTeX with Garamond Premier Pro",
		u"",
		u"        -I \"identity\" : include `Ex Libris identity' on LaTeX title page",
		u"        -T \"title\" : include on LaTeX title page",
		u"        -A \"last name, first name\" : include on LaTeX title page",
		u"        -S \"subtitle\" : include on LaTeX title page",
	)
	for helpLine in helpLines:
		print(helpLine)
	sys.exit(2)

# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
	sys.exit(main())
