#!/usr/bin/env python
# -*- coding: utf-8 -*-

################################################################
#                                                              #
#               class P A C I F Y begins here                  #
#                                                              #
################################################################

import time, codecs

class Pacify():
	"""Cleans up and converts eBooks between a variety of formats."""
	
	def __init__(self, args):
		"""Run the whole conversion for one input file.
		
		Constructing the object does all the work: the input file is read,
		its formatting is cleaned, the text is analysed, quotes are fixed,
		the text is "enlightened", the result is written to the output file
		and finally the log is dumped to the log file.
		
		``args`` is a dict that must provide the keys ``filename``,
		``infilename``, ``type`` ('txt', 'rtf' or 'html' are recognised),
		``output`` ('txt', 'rtf', 'html' or 'tex' are recognised) and
		``profile``.  The keys ``outfilename`` and ``logfilename`` are
		added to that same dict here.
		"""
		self.version = u'0.4.0'
		self.date = u'2009-09-13'
		
		self.args = args
		self.args['outfilename'] = self.args['filename'] + u'-pacified.' + self.args['output']
		self.args['logfilename'] = self.args['filename'] + u'-pacified.log'
		
		self.log = pLogger()
		
		self.log += u'1:pacify v' + self.version + u' (' + self.date + u') - Copyright 2009 Pax Librorum (www.PaxLibrorum.com)\n'
		
		self.log += u'1:\tProfile:\t' + args['profile']
		self.log += u'1:\tInput file:\t' + args['infilename']
		self.log += u'1:\tOutput file:\t' + args['outfilename']
		self.log += u'1:\tLog file:\t' + args['logfilename'] + u'\n'
		
		self.info = {}
		
		# An unrecognised input type leaves the buffer empty.
		self.inbuffer = pString()
		
		if self.args['type'] == 'txt':
			self.inbuffer = self.ReadTXT()
		elif self.args['type'] == 'rtf':
			self.inbuffer = self.ReadRTF()
		elif self.args['type'] == 'html':
			self.inbuffer = self.ReadHTML()
		
		self.CleanFormatting()
		
		self.AnalyzeText()
		
		self.FixQuotes()
		
		self.EnlightenText()
		
		self.log += u'1:Writing ' + args['outfilename'] + u'...\n'
		outfile = open(args['outfilename'], 'w')
		try:
			# 'txt', 'rtf' and 'html' all receive the bare text; only 'tex'
			# goes through a real converter.  Any other output type leaves
			# the (already created) file empty.
			if self.args['output'] in ('txt', 'rtf', 'html'):
				outfile.write(self.inbuffer.pText.encode('utf-8'))
			elif self.args['output'] == 'tex':
				outfile.write(self.GetAsLaTeX().encode('utf-8'))
		finally:
			# Release the handle even when conversion or writing fails.
			outfile.close()
		
		self.log += u'1:\tDone!\n'
		
		self.log.dump(args['logfilename'])
		
		
	
	def FixQuotes(self):
		
		if not(self.inbuffer.pText.find(u'"')) and not(self.inbuffer.pText.find(u'\'')): return
		
		self.log += u'1:Correcting quotation marks...\n'
		
		strAlpha = u'aáàäbcdeéèëfghiíìïjklmnoòóöőpqrstuùúüűvwxyzAÁÀÄBCDEÉÈËFGHIÍÌÏJKLMNOÒÓÖŐPQRSTUÙÚÜŰVWXYZ'
		
		tmpbuffer = []
		tmpfbuffer = []
		
		dquote = ''
		
		empty = pString()
		empty += u' '
		
		squote = 'unset'
		dquote = 'unset'
		
		print "\t",
		for idx in range(0, len(self.inbuffer)):
			if (idx % (1024 * (1024/2)) == 0) and idx > 0: print ".",
			if idx == 0:
				prevdigit = empty
				digit = self.inbuffer[idx]
				nextdigit = self.inbuffer[idx+1]
			elif idx == 1:
				prevdigit = self.inbuffer[idx-1]
				digit = self.inbuffer[idx]
				nextdigit = self.inbuffer[idx+1]
			elif idx == (len(self.inbuffer)-2):
				prevdigit = self.inbuffer[idx-1]
				digit = self.inbuffer[idx]
				nextdigit = self.inbuffer[idx+1]
			elif idx == (len(self.inbuffer)-1):
				prevdigit = self.inbuffer[idx-1]
				digit = self.inbuffer[idx]
				nextdigit = empty
			else:
				prevdigit = self.inbuffer[idx-1]
				digit = self.inbuffer[idx]
				nextdigit = self.inbuffer[idx+1]
			
			if digit.pText == '"':
				if (strAlpha.find(prevdigit.pText) == -1) or (strAlpha.find(nextdigit.pText) == -1):
					if u' \n'.find(prevdigit.pText) > -1:
							tmpbuffer.append(u"“")
							tmpfbuffer.append(digit.format[0])
							dquote = 'open'
					elif u' \n'.find(nextdigit.pText) > -1:
							tmpbuffer.append(u"”")
							tmpfbuffer.append(digit.format[0])
							dquote = 'closed'
					else:
						if dquote == 'unset':
							if prevdigit.pText == u'\n':
								tmpbuffer.append(u"”")
								tmpfbuffer.append(digit.format[0])
								dquote = 'closed'
							else:
								tmpbuffer.append(u"“")
								tmpfbuffer.append(digit.format[0])
								dquote = 'open'
						elif dquote == 'open':
							tmpbuffer.append(u"”")
							tmpfbuffer.append(digit.format[0])
						elif dquote == 'closed':
							tmpbuffer.append(u"“")
							tmpfbuffer.append(digit.format[0])
							dquote = 'open'
				else:
					tmpbuffer.append(u"\"")
					tmpfbuffer.append(digit.format[0])
			elif digit.pText == '\'':
				if (strAlpha.find(prevdigit.pText) == -1) or (strAlpha.find(nextdigit.pText) == -1):
					if u' \n'.find(prevdigit.pText) > -1:
							tmpbuffer.append(u"‘")
							tmpfbuffer.append(digit.format[0])
							squote = 'open'
					elif u' \n'.find(nextdigit.pText) > -1:
							tmpbuffer.append(u"’")
							tmpfbuffer.append(digit.format[0])
							squote = 'closed'
					else:
						if squote == 'unset':
							if prevdigit.pText == u'\n':
								tmpbuffer.append(u"’")
								tmpfbuffer.append(digit.format[0])
								squote = 'closed'
							else:
								tmpbuffer.append(u"‘")
								tmpfbuffer.append(digit.format[0])
								squote = 'open'
						elif squote == 'open':
							tmpbuffer.append(u"’")
							tmpfbuffer.append(digit.format[0])
						elif squote == 'closed':
							tmpbuffer.append(u"‘")
							tmpfbuffer.append(digit.format[0])
							squote = 'open'
				else:
					tmpbuffer.append(u"\'")
					tmpfbuffer.append(digit.format[0])
			else:
				tmpbuffer.append(digit.pText[0])
				tmpfbuffer.append(digit.format[0])
		
		output = pString()
		output.pText = ''.join(tmpbuffer)
		output.format.extend(tmpfbuffer)
		
		self.inbuffer = output
		return
	
	
	
	def GetAsLaTeX(self):
		"""Return the contents of ``self.inbuffer`` as a LaTeX unicode string.
		
		LaTeX special characters are escaped inside ``self.inbuffer`` itself
		(the buffer is replaced by the escaped version), runs of bold, italic,
		small-caps and underlined text are wrapped in the matching LaTeX
		commands, and a few typographic and accented characters are replaced
		by LaTeX sequences.
		"""
		self.log += u'1:\tConverting to LaTeX...\n'
		
		# (character, LaTeX replacement), applied in this order.  The braces
		# are escaped before any replacement that itself contains braces.
		# NOTE(review): u'\\\\' is LaTeX's line-break command, so a literal
		# backslash is probably not rendered as intended.  Switching to
		# \textbackslash{} needs a single-pass escaper, because the brace
		# replacements below would re-escape it.
		escapes = [
			(u'\\', u'\\\\'),
			(u'{', u'\\{'),
			(u'}', u'\\}'),
			(u'>', u'\\textgreater{}'),
			(u'<', u'\\textless{}'),
			(u'~', u'\\textasciitilde{}'),
			(u'^', u'\\textasciicircum{}'),
			(u'&', u'\\&'),
			(u'#', u'\\#'),
			(u'_', u'\\_'),
			(u'$', u'\\$'),
			(u'%', u'\\%'),
		]
		for number, (char, escaped) in enumerate(escapes):
			message = u'1:\t\tReplacing ' + char + u'\'s'
			if number == len(escapes) - 1:
				message += u'\n'
			self.log += message
			self.inbuffer = self.inbuffer.Replace(char, escaped)
		
		# (format attribute, LaTeX command that opens the group)
		styles = [
			('isBold', u'\\textbf{'),
			('isItalic', u'\\textit{'),
			('isSmallcap', u'\\textsc{'),
			('isUnderline', u'\\underline{'),
		]
		
		tmpbuffer = []
		curFormat = pStringFormat()
		
		self.log += u'1:\t\tFormatting...\n'
		# NOTE(review): groups are closed with a bare '}' in the fixed order
		# above, so overlapping (not properly nested) styles may end up
		# closing a different group than intended.
		for idx in range(0, len(self.inbuffer)):
			newFormat = self.inbuffer.format[idx]
			if curFormat != newFormat:
				for attr, opener in styles:
					was = getattr(curFormat, attr)
					now = getattr(newFormat, attr)
					if was == False and now == True:
						tmpbuffer.append(opener)
					elif was == True and now == False:
						tmpbuffer.append(u'}')
			
			curFormat = newFormat
			
			tmpbuffer.append(self.inbuffer.pText[idx])
		
		# Close every group that is still open when the text ends, otherwise
		# the braces in the result are unbalanced.
		for attr, opener in styles:
			if getattr(curFormat, attr) == True:
				tmpbuffer.append(u'}')
		
		output = u''.join(tmpbuffer)
		
		output = output.replace(u'…', u'\\ldots{}')
		output = output.replace(u'“', u'{``}')
		output = output.replace(u'„', u'{``}')
		output = output.replace(u'”', u'{\'\'}')
		output = output.replace(u'–', u'--')
		output = output.replace(u'—', u'---')
		
		output = output.replace(u'í', u'\\\'{ı}')
		output = output.replace(u'ő', u'\\H{o}')
		output = output.replace(u'Ő', u'\\H{O}')
		output = output.replace(u'ű', u'\\H{u}')
		output = output.replace(u'Ű', u'\\H{U}')
		
		# '$' and '&' were already escaped in the buffer above; escaping them
		# again here would turn '\$' into '\\$'.
		
		return output
		
		
	
	def EnlightenText(self):
		"""Replace stray characters in ``self.inbuffer`` by the intended ones.
		
		Leftover control characters are mapped to the ellipsis and curly
		quotes they stand for.  With the 'hu' profile, and only when the text
		contains no Hungarian double-acute letter at all, the opening curly
		quote becomes a low quote and the tilde/circumflex letters are
		replaced by their double-acute counterparts.
		"""
		self.log += u'1:Enlightening text...\n'
		
		# An empty search string would make Replace() scan forever, so the
		# characters to look for are spelled out as escapes.
		# NOTE(review): the code points are an assumption - they are the
		# Windows-1252 ellipsis and double quotes as they appear after a
		# byte-for-byte (Latin-1 style) decode.  Confirm against the original
		# file.
		leftovers = [
			(u'\x85', u'…'),
			(u'\x93', u'“'),
			(u'\x84', u'“'),
			(u'\x94', u'”'),
		]
		for stray, proper in leftovers:
			self.inbuffer = self.inbuffer.Replace(stray, proper)
		
		if self.args['profile'] == 'hu':
			text = self.inbuffer.pText
			if text.find(u'ő') == -1 and text.find(u'Ő') == -1 and text.find(u'ű') == -1 and text.find(u'Ű') == -1:
				hungarian = [
					(u'“', u'„'),
					(u'õ', u'ő'),
					(u'Õ', u'Ő'),
					(u'û', u'ű'),
					(u'Û', u'Ű'),
				]
				for wrong, right in hungarian:
					self.inbuffer = self.inbuffer.Replace(wrong, right)
		
		self.log += u'1:\tDone!\n'
		
	
	
	def ReadTXT(self):
		"""Read the plain-text input file and return it as a ``pString``.
		
		The file named by ``self.args['infilename']`` is decoded as UTF-8.
		When that fails, every byte is mapped to the code point of the same
		value (a Latin-1 decode).  Start and finish times are logged.
		"""
		self.log += u'1:Reading ' + self.args['infilename'] + u'...\n'
		
		self.log += u'1:\tStarting on ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
		self.log += u'1:\n'
		
		# Read the raw bytes once; the handle is closed even if reading fails.
		infile = open(self.args['infilename'], 'rb')
		try:
			raw = infile.read()
		finally:
			infile.close()
		
		try:
			inbuffer = raw.decode('utf-8')
		except UnicodeDecodeError:
			# Not valid UTF-8: keep every byte as the character with the same
			# number instead of giving up.
			inbuffer = raw.decode('latin-1')
		
		output = pString()
		output += inbuffer
		
		self.log += u'1:\tFinished on ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()) + u'\n'
		
		return output
	
	def ReadRTF(self):
		"""Parse the RTF input file into a formatted ``pString``.
		
		The file named by ``self.args['infilename']`` is scanned one character
		at a time by a small state machine.  Plain text is collected together
		with the bold/italic/small-caps/underline state in force at that
		point.  A handful of control words are translated into text or format
		changes, some destinations (font table, pictures, footnotes, ...) and
		every destination introduced by a backslash-asterisk are skipped, and
		all other control words are ignored.
		
		Returns a ``pString`` whose ``pText`` is the extracted text and whose
		``format`` list received one format entry per extracted character.
		"""
		self.log += u'1:Reading ' + self.args['infilename'] + u'...\n'
		
		self.log += u'1:\tStarting on ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
		self.log += u'1:\n'
		
		letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
		# Destinations whose whole group is dropped.
		skipped = (u'footnote', u'header', u'footer', u'pict', u'info', u'fonttbl', u'stylesheet', u'colortbl')
		# Control words and symbols that stand for literal text.
		literals = {
			u'~': u' ',
			u'_': u'-',
			u'rquote': u'’',
			u'par': u'\n\n',
			u'line': u'\n',
			u'emdash': u'—',
			u'endash': u'–',
		}
		# Control words that switch a style on (no argument) or off (argument 0).
		toggles = {u'b': 'bold', u'i': 'italic', u'ul': 'underline', u'scaps': 'smallcap'}
		
		tmpbuffer = []
		tmpfbuffer = []
		command = u''
		commandarg = u''
		pstate = u''
		bcount = 0
		fstack = []
		tformat = pStringFormat()
		
		infile = open(self.args['infilename'], 'r')
		try:
			infile.seek(0, 2)
			filesize = infile.tell()
			infile.seek(0, 0)
			
			filetoolarge = False
			inbuffer = ''
			try:
				inbuffer = infile.read()
			except MemoryError:
				# Fall back to fetching one character at a time from the file.
				filetoolarge = True
				self.log += u'1:File too large to read fully into memory.\n'
			
			idx = 0
			abscounter = 0
			# Each pass handles the character at idx in the current state.
			# 'continue' re-examines the same character in the new state;
			# falling through to the end of the loop body consumes it.
			while True:
				if filetoolarge:
					infile.seek(idx, 0)
					digit = infile.read(1)
				elif idx < len(inbuffer):
					digit = inbuffer[idx]
				else:
					digit = False
				if not(digit): break
				abscounter += 1
				if abscounter % (5 * 1024 * 1024) == 0: self.log += u'1:\t    ' + str(round((float(idx)/float(filesize))*100, 2)) + "% processed (" + str(round(float(idx)/1024/1024,1)) + " MB of " + str(round(float(filesize)/1024/1024,1)) + " MB)"
				
				if pstate == u'':
					if digit == '{':
						# a group starts: remember the format to restore later
						fstack.append(tformat)
					elif digit == '}':
						# an unbalanced '}' keeps the current format
						if fstack:
							tformat = fstack.pop()
					elif digit == u'\\':
						pstate = u'GetCommand'
						command = u''
						commandarg = u''
					elif digit != u'\n' and digit != u'\r':
						tmpbuffer.append(digit)
						tmpfbuffer.append(tformat)
				elif pstate == u'CharCode':
					# command already holds the apostrophe; collect two more characters
					if len(command) == 3:
						pstate = u'GotCommand'
						continue
					else:
						command += digit
				elif pstate == u'GetCommand':
					# Control symbols are only symbols directly after the
					# backslash; later on they end the control word instead.
					fresh = (command == u'' and commandarg == u'')
					if letters.find(digit) > -1:
						command += digit
					elif fresh and '{}\\'.find(digit) > -1:
						# escaped brace or backslash: literal text
						tmpbuffer.append(digit)
						tmpfbuffer.append(tformat)
						pstate = u''
					elif fresh and '|~-_:'.find(digit) > -1:
						command += digit
						pstate = u'GotCommand'
					elif fresh and digit == '*':
						pstate = u'GetSkippableCommand'
						command = u''
						commandarg = u''
					elif fresh and digit == '\'':
						command += digit
						pstate = u'CharCode'
					elif '-0123456789'.find(digit) > -1:
						commandarg += digit
					elif digit == ' ':
						# the delimiting space belongs to the control word
						pstate = u'GotCommand'
					else:
						pstate = u'GotCommand'
						continue
				elif pstate == u'GetSkippableCommand':
					if letters.find(digit) > -1:
						command += digit
					elif '-0123456789'.find(digit) > -1:
						commandarg += digit
					elif digit == ' ':
						pstate = u'GotSkippableCommand'
					else:
						pstate = u'GotSkippableCommand'
						continue
				elif pstate == u'GotCommand':
					if command in skipped:
						pstate = u'SkipSection'
						bcount = 1
						continue
					if len(command) == 3 and command[0] == u'\'':
						# apostrophe plus two hex digits: a character given by its code
						try:
							tmpbuffer.append(unichr(int(command[1:], 16)))
							tmpfbuffer.append(tformat)
						except ValueError:
							# not a hex pair: drop it rather than abort the conversion
							pass
					elif command in literals:
						text = literals[command]
						tmpbuffer.append(text)
						# one format entry per character of the inserted text
						tmpfbuffer.extend([tformat] * len(text))
					elif command == u'pard':
						for style in ('bold', 'italic', 'smallcap', 'underline'):
							tformat = tformat.Clear(style)
					elif command in toggles:
						if commandarg == u'':
							tformat = tformat.Set(toggles[command])
						elif commandarg == u'0':
							tformat = tformat.Clear(toggles[command])
					# The optional hyphen and every other control word produce nothing.
					pstate = u''
					continue
				elif pstate == u'GotSkippableCommand':
					pstate = u'SkipSection'
					bcount = 1
					continue
				elif pstate == u'SkipSection':
					# swallow everything up to the brace closing the skipped group
					if digit == '{':
						fstack.append(tformat)
						bcount += 1
					elif digit == '}':
						if fstack:
							tformat = fstack.pop()
						bcount -= 1
					if bcount == 0:
						pstate = u''
				
				idx += 1
		finally:
			infile.close()
		
		output = pString()
		output.pText = u''.join(tmpbuffer)
		output.format = tmpfbuffer
		
		# The final progress line always reports 100%; writing it as a constant
		# avoids dividing by the size of an empty file.
		sizemb = str(round(float(filesize)/1024/1024,1))
		self.log += u'1:\t    ' + u'100.0' + "% processed (" + sizemb + " MB of " + sizemb + " MB)"
		self.log += u'1:\n'
		self.log += u'1:\tFinished on ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()) + u'\n'
		
		return output
	
	def ReadHTML(self):
		"""Parse the HTML input file into a formatted ``pString``.
		
		The file named by ``self.args['infilename']`` is decoded as UTF-8
		(falling back to Latin-1 when it is not valid UTF-8) and scanned one
		character at a time by a small state machine.  Only text between the
		body tags is kept; runs of whitespace are collapsed, block-level tags
		become line breaks, headings are set in small caps, and the bold,
		italic and underline tags change the format of the text they enclose.
		A fixed list of named entities and all numeric character references
		are translated; other entities and tags are dropped.
		
		Returns a ``pString`` holding the text and one format entry per
		character.
		"""
		self.log += u'1:Reading ' + self.args['infilename'] + u'...\n'
		
		self.log += u'1:\tStarting on ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
		self.log += u'1:\n'
		
		# named entity -> character
		entities = {
			'mdash': u'—',
			'ndash': u'–',
			'nbsp': u' ',
			'copy': u'©',
			'hellip': u'…',
			'amp': u'&',
			'lt': u'<',
			'gt': u'>',
			'quot': u'\"',
			'lsquo': u'‘',
			'rsquo': u'’',
			'sbquo': u'‚',
			'ldquo': u'“',
			'rdquo': u'”',
			'bdquo': u'„',
		}
		# tag -> text inserted for it.  The marker character after a newline
		# is removed again at the end; while present it keeps the whitespace
		# logic from putting a space at the start of the new line.
		breaks = {}
		for name in ('br', 'br/', 'div', 'p', 'pre', 'blockquote'):
			breaks[name] = u'\n␢'
		for name in ('/div', '/p', '/pre', '/blockquote'):
			breaks[name] = u'\n'
		headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
		styles = {'b': 'bold', 'strong': 'bold', 'i': 'italic', 'em': 'italic', 'u': 'underline'}
		
		tmpbuffer = []
		tmpfbuffer = []
		
		def emit(text, fmt):
			# Append text one character per list element: the whitespace
			# logic below inspects the last element as a single character.
			for char in text:
				tmpbuffer.append(char)
				tmpfbuffer.append(fmt)
		
		command = u''
		pstate = u''
		tformat = pStringFormat()
		inbody = False
		isnewline = True
		isspace = False
		
		infile = codecs.open(self.args['infilename'], 'r', 'utf-8')
		try:
			infile.seek(0, 2)
			filesize = infile.tell()
			infile.seek(0, 0)
			
			filetoolarge = False
			inbuffer = u''
			try:
				inbuffer = infile.read()
			except MemoryError:
				# Fall back to fetching one character at a time from the file.
				filetoolarge = True
				self.log += u'1:File too large to read fully into memory.\n'
			except UnicodeDecodeError:
				# Not valid UTF-8: keep every byte as the character with the
				# same number, like ReadTXT does.
				self.log += u'1:\tInput is not valid UTF-8; reading it as Latin-1.\n'
				infile.close()
				infile = codecs.open(self.args['infilename'], 'r', 'latin-1')
				inbuffer = infile.read()
			
			idx = 0
			abscounter = 0
			# Each pass handles the character at idx in the current state.
			# 'continue' re-examines the same character in the new state;
			# falling through to the end of the loop body consumes it.
			while True:
				if filetoolarge:
					infile.seek(idx, 0)
					digit = infile.read(1)
				elif idx < len(inbuffer):
					digit = inbuffer[idx]
				else:
					digit = False
				if not(digit): break
				abscounter += 1
				if abscounter % (5 * 1024 * 1024) == 0: self.log += u'1:\t    ' + str(round((float(idx)/float(filesize))*100, 2)) + "% processed (" + str(round(float(idx)/1024/1024,1)) + " MB of " + str(round(float(filesize)/1024/1024,1)) + " MB)"
				
				if pstate == u'':
					if digit == '<':
						pstate = 'GetTag'
						command = ''
					elif digit == '&':
						pstate = 'GetEntity'
						command = ''
					elif inbody:
						if digit == u'\r' or digit == u'\n':
							isnewline = True
						elif digit == u' ' or digit == u'\t':
							isspace = True
						else:
							# A separating space is inserted when exactly one
							# of the two whitespace flags is set.
							if isspace != isnewline:
								# Nothing to separate from while the buffer is
								# still empty or already ends in whitespace.
								if tmpbuffer and tmpbuffer[-1] != u' ' and tmpbuffer[-1] != u'\n' and tmpbuffer[-1] != u'␢':
									emit(u' ', tformat)
							emit(digit, tformat)
							isspace = False
							isnewline = False
				elif pstate == 'GetEntity':
					if u'#0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'.find(digit) > -1:
						command += digit
					elif digit == u';':
						pstate = 'GotEntity'
					else:
						pstate = 'GotEntity'
						continue
				elif pstate == 'GotEntity':
					if command in entities:
						emit(entities[command], tformat)
					elif command[:1] == u'#':
						# numeric character reference, decimal or hexadecimal
						try:
							if command[1:2] == u'x' or command[1:2] == u'X':
								code = int(command[2:], 16)
							else:
								code = int(command[1:])
							emit(unichr(code), tformat)
						except (ValueError, OverflowError):
							# malformed or out-of-range reference: drop it
							pass
					pstate = u''
					continue
				elif pstate == 'GetTag':
					if u'/!0123456789abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ'.find(digit) > -1:
						command += digit
					else:
						pstate = 'GotTag'
						continue
				elif pstate == 'GotTag':
					# skip attributes and anything else up to the closing '>'
					if digit == '>':
						pstate = 'DoneTag'
						command = command.lower().strip()
				elif pstate == 'DoneTag':
					if command == 'body':
						inbody = True
					elif command == '/body':
						inbody = False
					elif command in breaks:
						emit(breaks[command], tformat)
					elif command in headings:
						emit(u'\n\n␢', tformat)
						tformat = tformat.Set('smallcap')
					elif command[:1] == '/' and command[1:] in headings:
						tformat = tformat.Clear('smallcap')
						emit(u'\n', tformat)
						if command == '/h4':
							# The end of an h4 heading gets a second line
							# break; kept exactly as the original did.
							emit(u'\n', tformat)
					elif command in styles:
						tformat = tformat.Set(styles[command])
					elif command[:1] == '/' and command[1:] in styles:
						tformat = tformat.Clear(styles[command[1:]])
					pstate = u''
					continue
				
				idx += 1
		finally:
			infile.close()
		
		output = pString()
		output.pText = u''.join(tmpbuffer)
		output.format = tmpfbuffer
		
		# Drop lines holding nothing but the marker, then the markers themselves.
		output = output.Replace(u'\n␢\n', u'')
		output = output.Replace(u'␢', u'')
		
		return output
	
	def CleanFormatting(self):
		"""Strip formatting from line breaks and from leading whitespace.
		
		Every newline, and every space or tab found between a newline (or the
		start of the text) and the first other character of the line, gets a
		plain format entry in ``self.inbuffer.format``.  Start and finish times
		are logged.
		"""
		stamp = "%a, %d %b %Y %H:%M:%S"
		
		self.log += u'1:Cleaning formatting...\n'
		
		self.log += u'1:\tStarting on ' + time.strftime(stamp, time.localtime())
		self.log += u'1:\n'
		
		plain = pStringFormat()
		# True until the first non-whitespace character of the current line.
		atLineStart = True
		for pos in range(0, len(self.inbuffer)):
			char = self.inbuffer.pText[pos]
			if char == '\n':
				atLineStart = True
				self.inbuffer.format[pos] = plain
			elif char == ' ' or char == '\t':
				if atLineStart:
					self.inbuffer.format[pos] = plain
			else:
				atLineStart = False
		
		self.log += u'1:\n'
		self.log += u'1:\tFinished on ' + time.strftime(stamp, time.localtime()) + u'\n'
		
	
	
	def AnalyzeText(self):
		"""Log statistics about the whitespace runs in ``self.inbuffer``.
		
		Windows line breaks are first reduced to plain newlines (this
		replaces ``self.inbuffer``).  Every run of spaces, tabs and newlines
		that is followed by another character is then given a weight (space
		0.245, tab 1, newline 8, summed and rounded) and the weights are
		counted.  The counts, ranked from most to least frequent, and a guess
		whether the text uses line breaks or paragraph breaks are written to
		the log.  Nothing is returned.
		"""
		self.log += u'1:Analyzing text...\n'
		
		self.log += u'1:\tStarting on ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
		self.log += u'1:\n'
		
		self.log += u'1:\t    Simplifying linebreaks...'
		self.inbuffer = self.inbuffer.Replace('\r\n', '\n')
		
		wsval = {' ': 0.245, '\t': 1.00, '\n': 8.00}
		
		# rounded weight of a whitespace run -> number of such runs
		wsdict = {}
		
		total = len(self.inbuffer)
		totalmb = str(round(float(total)/1024/1024,1))
		
		self.log += u'1:\t    Analyzing whitespace patterns...'
		
		abscounter = 0
		runweight = 0
		inrun = False
		for digit in self.inbuffer.pText:
			
			abscounter += 1
			if abscounter % (5 * 1024 * 1024) == 0: self.log += u'1:\t    ' + str(round((float(abscounter)/float(total))*100, 2)) + "% processed (" + str(round(float(abscounter)/1024/1024,1)) + " MB of " + totalmb + " MB)"
			
			if digit in wsval:
				runweight += wsval[digit]
				inrun = True
			elif inrun:
				# first character after a run: record the run
				wstotal = round(runweight, 0)
				wsdict[wstotal] = wsdict.get(wstotal, 0) + 1
				runweight = 0
				inrun = False
		
		# The final progress line always reports 100%; writing it as a constant
		# avoids dividing by the length of an empty text.
		self.log += u'1:\t    ' + u'100.0' + "% processed (" + totalmb + " MB of " + totalmb + " MB)"
		self.log += u'1:\n'
		self.log += u'1:\tFinished on ' + time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()) + u'\n'
		
		# (count, weight) pairs, most frequent first.  The sort is stable, so
		# equal counts stay in the dictionary's own order.
		ordered = sorted(wsdict.items(), key=lambda item: item[1], reverse=True)
		ranking = [(count, weight) for weight, count in ordered]
		
		self.log += u'1:\tFilesize:'
		self.log += u'1:\t' + str(total)
		self.log += u'1:\n'
		self.log += u'1:\tWhitespace analysis:'
		self.log += u'1:\t' + str(ranking)
		self.log += u'1:\n'
		
		# occurrences per 10000 characters for the five most frequent runs
		rates = [float(count)/total * 10000 for count, weight in ranking[:5]]
		for rate in rates:
			self.log += u'1:\t' + str(rate)
		self.log += u'1:\n'
		
		if len(rates) > 3 and rates[1] > 30 and rates[2] > 8 and (rates[2] > rates[3]*4):
			self.log += u'1:\t... appears to be a file with line-breaks.\n'
		elif len(rates) > 2 and rates[1] > 30 and (rates[1] > rates[2]*10):
			self.log += u'1:\t... appears to be a file with paragraph breaks.\n'
		else:
			self.log += u'1:\t... assumed to be a file with paragraph breaks.\n'
	
	
	
	
	
	
	
	
	

################################################################
#                                                              #
#               class P A C I F Y ceases here                  #
#                                                              #
################################################################



class pStringFormat():
	"""Manages formatting settings.
	
	An instance holds four independent flags: bold, italic, small caps and
	underline.  ``Set`` and ``Clear`` never modify the instance they are
	called on; they return a different instance when a flag has to change
	and the instance itself otherwise.
	"""
	
	# style name accepted by Set()/Clear() -> attribute holding that flag
	_FLAGS = {
		'bold': 'isBold',
		'italic': 'isItalic',
		'smallcap': 'isSmallcap',
		'underline': 'isUnderline',
	}
	
	def __init__(self):
		"""Create a format with every flag switched off."""
		self.isBold = False
		self.isItalic = False
		self.isSmallcap = False
		self.isUnderline = False
	
	def _State(self):
		"""Return the four flags as a tuple (bold, italic, smallcap, underline)."""
		return (self.isBold, self.isItalic, self.isSmallcap, self.isUnderline)
	
	def __eq__(self, other):
		"""Two formats are equal when all four flags agree.
		
		Objects that do not carry the four flags are not comparable;
		NotImplemented lets Python fall back to its default comparison
		instead of raising AttributeError.
		"""
		try:
			theirs = (other.isBold, other.isItalic, other.isSmallcap, other.isUnderline)
		except AttributeError:
			return NotImplemented
		return self._State() == theirs
	
	def __ne__(self, other):
		"""Inverse of ``__eq__``."""
		same = self.__eq__(other)
		if same is NotImplemented:
			return same
		return not same
	
	def __repr__(self):
		"""Return the initials of the flags that are set (B, I, S, U), or '-'."""
		output = u''
		if self.isBold:
			output += u'B'
		if self.isItalic:
			output += u'I'
		if self.isSmallcap:
			output += u'S'
		if self.isUnderline:
			output += u'U'
		if output == u'': output = u'-'
		return output
	
	def _Changed(self, f, value, caller):
		"""Return a format whose flag named ``f`` equals ``value``.
		
		``f`` is matched case-insensitively and ignoring surrounding
		whitespace.  ``self`` is returned when the flag already has that
		value.  An unknown name prints a message naming ``caller`` and also
		returns ``self``.
		"""
		key = f.lower().strip()
		attr = self._FLAGS.get(key)
		if attr is None:
			print(u'5:Erroneous function call: pFormat.' + caller + u'(' + key + u')')
			return self
		if bool(getattr(self, attr)) == value:
			return self
		
		output = pStringFormat()
		output.isBold = self.isBold
		output.isItalic = self.isItalic
		output.isSmallcap = self.isSmallcap
		output.isUnderline = self.isUnderline
		setattr(output, attr, value)
		return output
	
	def Set(self, f):
		"""Return a format with the flag ``f`` ('bold', 'italic', 'smallcap' or 'underline') switched on."""
		return self._Changed(f, True, u'Set')
	
	def Clear(self, f):
		"""Return a format with the flag ``f`` ('bold', 'italic', 'smallcap' or 'underline') switched off."""
		return self._Changed(f, False, u'Clear')
	
	def IsBold(self):
		"""Return the bold flag."""
		return self.isBold
	
	def IsItalic(self):
		"""Return the italic flag."""
		return self.isItalic
	
	def IsSmallcap(self):
		"""Return the small-caps flag."""
		return self.isSmallcap
	
	def IsUnderline(self):
		"""Return the underline flag."""
		return self.isUnderline

class pString:
	"""Contains and manages formatted strings.
	
	The characters live in `pText`.  `format` is a parallel list that holds
	one format object per character of `pText`.  `curFormat` is the format
	stamped onto text appended with `+=`, and `idx` is a cursor moved by
	Reset(), Next() and Prev().  `classification` starts as an empty string
	and is not used by this class itself.
	"""
	
	def __init__(self):
		"""Create an empty string with a default current format."""
		
		self.idx = 0
		self.curFormat = pStringFormat()
		self.pText = u''
		self.format = []
		self.classification = u''
	
	def replace(self, st, rt):
		"""Lower-case alias for Replace()."""
		return self.Replace(st, rt)
	
	def Replace(self, st, rt):
		"""Replace every non-overlapping occurrence of `st` with `rt`, in place.
		
		Matches are found left to right.  `format` is kept in step with
		`pText`: when `st` and `rt` have the same length the formats are left
		untouched; otherwise each inserted character takes the format of the
		character immediately before the insertion point.  For a match at the
		very start of the text (nothing precedes it) the format of the first
		replaced character is used instead.
		
		An empty `st`, or an `st` that does not occur, leaves the string
		unchanged.
		
		Returns this same object, so calls can be chained.
		"""
		
		# An empty search string matches everywhere; treat it as "nothing to do"
		# rather than scanning forever.
		if not st or self.pText.find(st) < 0:
			return self
		
		if len(st) == len(rt):
			# Same length: character positions do not move, formats stay valid.
			self.pText = self.pText.replace(st, rt)
			return self
		
		oldText = self.pText
		oldFormat = self.format
		pieces = []
		formats = []
		copiedUpTo = 0
		
		pos = oldText.find(st)
		while pos > -1:
			# Carry over the untouched stretch in front of this match.
			pieces.append(oldText[copiedUpTo:pos])
			formats.extend(oldFormat[copiedUpTo:pos])
			
			if formats:
				fill = formats[-1]
			elif pos < len(oldFormat):
				fill = oldFormat[pos]
			else:
				# Only reachable if `format` is shorter than `pText`.
				fill = self.curFormat
			
			pieces.append(rt)
			formats.extend([fill] * len(rt))
			
			# Always step over the text that was matched (`st`), never `rt`.
			copiedUpTo = pos + len(st)
			pos = oldText.find(st, copiedUpTo)
		
		pieces.append(oldText[copiedUpTo:])
		formats.extend(oldFormat[copiedUpTo:])
		
		self.pText = ''.join(pieces)
		self.format = formats
		return self
	
	def __add__(self, other):
		"""Return a new pString holding this text and formats followed by `other`'s.
		
		`other` must provide `pText` and `format`.  The result gets a default
		`curFormat`; neither operand is modified.
		"""
		
		combined = pString()
		combined.pText = self.pText + other.pText
		combined.format = self.format + other.format
		return combined
	
	def __str__(self):
		"""Return the raw text without formatting information."""
		return self.pText
	
	def __unicode__(self):
		"""Return the raw text without formatting information."""
		return self.pText
	
	def __eq__(self, other):
		"""Compare the raw text with `other`; formatting is ignored."""
		return bool(self.pText == other)
	
	def __ne__(self, other):
		"""Inverse of __eq__."""
		return not self.__eq__(other)
	
	def __len__(self):
		"""Return the number of characters in the text."""
		return len(self.pText)
	
	def __getitem__(self, key):
		"""Return a new pString for an index or a slice of this one.
		
		The result carries the selected characters together with their
		formats.  Its `curFormat` is the format of its last character, or the
		default format when the selection is empty.  An integer index that is
		out of range raises IndexError.
		"""
		
		picked = pString()
		if isinstance(key, slice):
			# Native slicing copes with omitted bounds, negative bounds,
			# out-of-range bounds and steps for both the text and the list.
			picked.pText = self.pText[key]
			picked.format = self.format[key]
			if picked.format:
				picked.curFormat = picked.format[-1]
		else:
			picked.curFormat = self.format[key]
			picked += self.pText[key]
		return picked
	
	def __setitem__(self, key, value):
		"""Item assignment is not implemented.
		
		The call only prints its own name and leaves the string unchanged.
		"""
		print("__setitem__")
	
	def __iadd__(self, text):
		"""Append `text`, giving every appended character the current format."""
		
		self.pText = self.pText + text
		self.format.extend([self.curFormat] * len(text))
		return self
	
	def Reset(self):
		"""Move the cursor back to 0."""
		self.idx = 0
	
	def Next(self):
		"""Advance the cursor by one.
		
		Returns False, and leaves the cursor at len(self), if the step would
		move it beyond len(self); returns True otherwise.
		"""
		
		self.idx += 1
		if self.idx > len(self):
			self.idx = len(self)
			return False
		return True
	
	def Prev(self):
		"""Move the cursor back by one.
		
		Returns False, and leaves the cursor at 0, if the step would move it
		below 0; returns True otherwise.
		"""
		
		self.idx -= 1
		if self.idx < 0:
			self.idx = 0
			return False
		return True
	
	# The methods below change only `curFormat`, i.e. the format applied to
	# text appended afterwards; characters already stored keep their formats.
	
	def Bold(self):
		"""Turn bold on in the current format."""
		self.curFormat = self.curFormat.Set(u'bold')
	
	def Unbold(self):
		"""Turn bold off in the current format."""
		self.curFormat = self.curFormat.Clear(u'bold')
	
	def Italic(self):
		"""Turn italic on in the current format."""
		self.curFormat = self.curFormat.Set(u'italic')
	
	def Unitalic(self):
		"""Turn italic off in the current format."""
		self.curFormat = self.curFormat.Clear(u'italic')
	
	def Smallcap(self):
		"""Turn small caps on in the current format."""
		self.curFormat = self.curFormat.Set(u'smallcap')
	
	def Unsmallcap(self):
		"""Turn small caps off in the current format."""
		self.curFormat = self.curFormat.Clear(u'smallcap')
	
	def Underline(self):
		"""Turn underline on in the current format."""
		self.curFormat = self.curFormat.Set(u'underline')
	
	def Ununderline(self):
		"""Turn underline off in the current format."""
		self.curFormat = self.curFormat.Clear(u'underline')
	
	

class pTome:
	"""Contains and manages pBlock objects."""
	
	def __init__(self):
		"""Start with an empty `pBlocks` list and `idx` set to 0."""
		
		self.idx, self.pBlocks = 0, []
	
	

class pLogger:
	"""Logs and optionally displays status messages.
	
	Messages are added with `+=` as strings whose first character is a
	numeric level and whose text starts at the third character, e.g.
	u'1:Reading file'.  Every message is stored in `itsLog`; it is also
	printed when its level does not exceed `itsThreshold`.
	"""
	
	def __init__(self):
		"""Create an empty log with threshold 1 and print a blank line."""
		
		self.itsLog = []
		self.itsThreshold = 1
		print('')
	
	def __iadd__(self, text):
		"""Record `text` and print it if its level is within the threshold.
		
		A message whose body is exactly a newline is stored and printed as a
		blank line.  Any other message is stored as u'[N]: body' followed by a
		newline.  Raises ValueError if the first character is not a digit and
		IndexError if `text` is empty.
		"""
		
		body = text[2:]
		if body == u'\n':
			self.itsLog.append(u'\n')
			if self.itsThreshold >= int(text[0:1]):
				print('')
		else:
			self.itsLog.append(u'[' + text[0] + u']: ' + body + u'\n')
			if self.itsThreshold >= int(text[0:1]):
				print(body)
		return self
	
	def dump(self, filename):
		"""Write the whole log to `filename`, encoded as UTF-8.
		
		The 'Writing logfile...' notice is logged before the write and so ends
		up in the file; the 'Done!' notice is logged after the file is closed.
		"""
		
		outfile = open(filename, 'w')
		try:
			self += u'2:Writing logfile...\n'
			outfile.write(''.join(self.itsLog).encode('utf-8'))
		finally:
			# Release the handle even if logging or writing fails.
			outfile.close()
		self += u'2:\tDone!\n'
	
	



if __name__ == "__main__":
    # This file only provides classes; say so when it is run as a script.
    print("\tThis python file is a module.  Cannot execute directly.\n")
