#!/usr/bin/env python
"""
	Program: striphtml

	Author: EatingPie <pie@storybytes.com>

	Strips out HTML that pielrf does not understand.  LEAVES IN HTML THAT
	PIELRF DOES UNDERSTAND!

	This is equivalent to supplying the "--strip-html" option to pielrf,
	but is provided as a separate program for two reasons:

		(1) If html is complex, running this program first to strip makes
		    editing the new file easier (ie adding <chapter> tags).

		(2) Using "--strip-html" in pielrf may not produce the desired
		    results, so stripping it first allows the user to edit
			any undesirable results.

	CAVEAT

		Not meant for serious web-pages!
		Used primarily with HTML Files produced by ConvertLit

"""

from pylrs.pylrs			import *
from pielrf.chapterbook		import *
from pielrf.textconvert		import *

from sys					import *
from optparse				import OptionParser
import os
import re


#############################################################
# def parse_cmdline():										#
#															#
#############################################################
def parse_cmdline():
	"""
	Parse and validate the command line for striphtml.

	Both "-i/--infile" and "-o/--outfile" are required, the input file
	must exist, and the two options must not name the same file.

	Returns the optparse options object (carrying "infile" and "outfile")
	on success.  On any validation failure the help text followed by a
	one-line reason is printed and None is returned.
	"""

	# Both options are mandatory, so the usage line names both of them.
	cmdline = OptionParser(usage="usage: %prog -i infile -o outfile")

	cmdline.add_option("-i", "--infile", dest="infile",
					   action="store",   type="string",
					   help="Input html file to strip")
	cmdline.add_option("-o", "--outfile", dest="outfile",
					   action="store",   type="string",
					   help="Output text file, stripped!")

	# Positional arguments are not used by this program.
	(options, _args) = cmdline.parse_args()

	infile  = options.infile
	outfile = options.outfile

	# Find the first validation failure, if any.  Order matters: the
	# later checks rely on both names being present.
	problem = None
	if infile is None:
		problem = "Must specify Input File"
	elif outfile is None:
		problem = "Must specify Output File"
	elif not os.path.exists(infile):
		problem = "Input File \"" + infile + "\" does not exist"
	else:
		# Compare resolved paths rather than raw strings so that different
		# spellings of one file ("a.html" vs "./a.html", or a symlink)
		# cannot lead to the input being overwritten by the output.
		resolved_in  = os.path.normcase(os.path.realpath(infile))
		resolved_out = os.path.normcase(os.path.realpath(outfile))
		if resolved_in == resolved_out:
			problem = "Input and Output files were the same: \"" + infile + "\""
		#endif
	#endif

	if problem is not None:
		cmdline.print_help()
		# Single-argument print: identical output with or without
		# print-as-function semantics.
		print(problem)
		return None
	#endif

	return options

#enddef parse_cmdline


#############################################################
# def striphtml():											#
#############################################################
def striphtml():
	"""
	Program entry point: strip unknown HTML from the input file and
	write the result to the output file.

	File names come from parse_cmdline(); if it reports a problem
	(returns None) this function returns without doing anything.

	The stripping itself is done by eat_unknown_tags_yum(), which is not
	defined in this file (presumably provided by one of the star imports
	above -- confirm).
	"""

	cmdopts = parse_cmdline()
	if not cmdopts :
		return

	# Locals
	infile  = cmdopts.infile
	outfile = cmdopts.outfile

	#
	# Read whole file in all at once
	#
	print("Reading " + infile + "...")
	src = open(infile, 'rb')
	try:
		data = src.read()
	finally:
		# Close the handle even if the read fails.
		src.close()

	print("Stripping Unknown HTML...")
	stripped = eat_unknown_tags_yum(data)

	#
	# Open the output only after stripping has succeeded, so a failure
	# above cannot leave behind an empty or truncated output file.
	#
	print("Writing " + outfile + "...")
	dst = open(outfile, 'wb')
	try:
		dst.write(stripped)
	finally:
		dst.close()

	print("Done.")

# enddef striphtml()

# Run the stripper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
	striphtml()
