#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import print_function
from __future__ import unicode_literals

import os, sys
import codecs, re
from HTMLParser import HTMLParser
from zipfile import ZipFile
from traceback import print_exc

try:
	import xml.etree.cElementTree as ET
except ImportError:
	import xml.etree.ElementTree as ET

# Expected content of the 'mimetype' entry of an ePub container.
MIMETYPE = 'application/epub+zip'
# Manifest media-type that marks an item as an (x)html content document.
MEDIATYPE = 'application/xhtml+xml'
# Namespace prefixes in ElementTree's '{uri}' notation, used to build find() paths.
NSMAP = {'opf': '{http://www.idpf.org/2007/opf}',
		 'con': '{urn:oasis:names:tc:opendocument:xmlns:container}'}

class Unbuffered:
	'''Thin wrapper around a stream that flushes it after every write.

	Anything other than write() is looked up on the wrapped stream.'''

	def __init__(self, stream):
		self.stream = stream

	def __getattr__(self, attr):
		# Only consulted for names missing on the wrapper itself, so the
		# wrapped stream answers for everything except write().
		wrapped = self.stream
		return getattr(wrapped, attr)

	def write(self, data):
		'''Write data to the wrapped stream, then flush it straight away.'''
		wrapped = self.stream
		wrapped.write(data)
		wrapped.flush()

class myargparse:
	'''Small getopt-based command-line parser (argparse only became part of
	the standard library with python 2.7).

	After a successful get_args() the parsed values are available as the
	attributes file, outfile, encoding and cssclass. On failure the reasons
	are collected in the errors list.'''

	def __init__(self, argv):
		'''Store argv (program name first) and set every option to its default.'''
		self.description = \
'''This script will parse an epub/html/text file and generate a list of unique characters used in that file.'''
		self.argv = argv
		self.progname = argv[0]
		self.errors = []
		self.file = None
		self.outfile = None
		self.encoding = 'utf-8'
		self.cssclass = None

	def get_args(self):
		'''Parse self.argv[1:].

		Returns True when the options are valid and exactly one existing
		input file was given; otherwise appends a message to self.errors
		and returns False. -h/--help prints the full help text and exits
		the process with status 2.'''
		import getopt
		try:
			opts, args = getopt.getopt(self.argv[1:], 'ho:e:c:',
					['help', 'outfile=', 'encoding=', 'cssclass='])
		except getopt.GetoptError as err:
			self.errors.append(str(err))
			return False

		# Every spelling of a value-taking option, mapped to the attribute it sets.
		targets = {'-o': 'outfile', '--outfile': 'outfile',
				'-e': 'encoding', '--encoding': 'encoding',
				'-c': 'cssclass', '--cssclass': 'cssclass'}
		for o, a in opts:
			flag = o.lower()
			if flag in ('-h', '--help'):
				self.usage(True)
				sys.exit(2)
			elif flag in targets:
				# getopt happily consumes the next option as this option's
				# value (e.g. "-o -e utf-8"), so a value that looks like an
				# option is reported as a missing argument.
				if a is not None and not str(a).startswith('-'):
					setattr(self, targets[flag], a)
				else:
					self.errors.append('Missing an argument for option %s' % str(o))
			else:
				self.errors.append('Unrecognized option %s' % str(o))
		if self.errors:
			return False

		if len(args) == 0:
			self.errors.append('An input file is required!')
			return False
		if len(args) > 1:
			self.errors.append('Only one input file is accepted!')
			return False
		if not os.path.isfile(args[0]):
			self.errors.append('%s is not an existing file!' % str(args[0]))
			return False
		self.file = args[0]
		return True

	def usage(self, help_flag=False):
		'''Print the one-line usage summary; with help_flag set, also print the
		description and the list of arguments.'''
		import textwrap
		wrapper = textwrap.TextWrapper(width=80, subsequent_indent='\t\t')
		msg = 'USAGE: %s [-h] [-o OUTFILE] [-e ENCODING] [-c CSSCLASS] FILE' % self.progname
		print('')
		print(msg)
		if help_flag:
			print('')
			print(textwrap.fill(self.description))
			print('')
			print('Positional arguments:')
			print('   FILE\t\tInput file (epub html text).')
			print('')
			print('Optional arguments:')
			print('   -h, --help\t\tshow this help message and exit.')
			print('   -o OUTFILE, --outfile OUTFILE')
			print(wrapper.fill('\t\tOutput file for unique character list. (default: None)'))
			print('   -e ENCODING, --encoding ENCODING')
			print(wrapper.fill('\t\tCharacter encoding of input file. (default: utf-8)'))
			print('   -c CSSCLASS, --cssclass CSSCLASS')
			print(wrapper.fill('\t\tRestrict results to a specific CSS class. (default: None)'))
			print('')

class _DeHTMLParser(HTMLParser):
	'''Subclass to override the methods of the HTMLParser module. Provides a simple 
	way to remove html tags and include/exclude text based on css class criteria'''
	def __init__(self, class_filter=None):
		HTMLParser.__init__(self)
		self.past_body = False
		self.script_tag = False;
		self.class_filter = class_filter
		self.filter_elem = None
		self.nested = 0
		self.filterOn = False
		if class_filter is not None: #If no filter specified, capture is always on.
			self.filterOn = True
		self.__text = [] 

	def handle_data(self, data):
		'''We're only going to parse the stuff contained in the body of the document, 
		skipping any script tags encountered in the process (just in case).'''
		if not self.past_body:
			return
		elif self.script_tag:
			return
		elif self.filterOn: #If filtering, and filter is on ... do nothing
			return
		text = data.strip()
		if len(text) > 0:
			'''Not really critical, but part of the 'prettifying' of deHTMLed text.'''
			text = re.sub('[ \t\r\n]+', ' ', text)
			self.__text.append(text + ' ')

	def handle_endtag(self, tag):
		'''Processing to be done when an html end-tag is encountered.'''
		if self.filter_elem is not None and tag.lower() == self.filter_elem and not self.nested:
			self.filter_elem = None
			self.filterOn = True #Stop capturing text.
		elif self.filter_elem is not None and tag.lower() == self.filter_elem and self.nested > 0:
			self.nested -= 1
		if tag.lower() == 'script':
			self.script_tag = False
		elif tag.lower() == 'body':
			self.past_body = False

	'''def handle_startendtag(self, tag, attrs):
		if tag.lower() == 'br':
			self.__text.append('\n')'''

	def handle_starttag(self, tag, attrs):
		'''Processing to be done when an html start-tag is encountered.'''
		if self.class_filter is not None and self.filter_elem is None:
			for attr in attrs:
				if attr[0] == 'class' and self.class_filter in attr[1].split(' '):
					self.filter_elem = tag.lower()
					self.filterOn = False #Start capturing text.
		elif self.class_filter is not None and self.filter_elem is not None and tag.lower() == self.filter_elem:
			self.nested += 1
		if tag.lower() == 'body':
			self.past_body = True
		elif tag.lower() == 'script':
			self.script_tag = True
		'''
		elif tag.lower() == 'p':
			self.__text.append('\n\n')
		elif tag.lower() == 'br':
			self.__text.append('\n')
		'''

	def text(self):
		'''Return the collected text.'''
		return ''.join(self.__text).strip()

class ePubReader:
	'''Class to check the (basic) integrity of an ePub container and to gather/read
	the files that are manifested in the OPF file as media-type application/xhtml+xml'''

	def __init__(self, inzip):
		'''Open inzip (anything ZipFile accepts) for reading.'''
		self.inzip = ZipFile(inzip, 'r')
		self.files = []
		self.opfPath = None	# set by hasValidContainer()

	def hasValidMimeType(self):
		'''Return True when the archive has a readable 'mimetype' entry whose
		content equals MIMETYPE exactly.'''
		try:
			raw = self.inzip.read('mimetype')
		except Exception:
			# Entry missing or unreadable: not a usable ePub.
			return False
		try:
			# Decode explicitly so the comparison is text-to-text.
			return raw.decode('ascii') == MIMETYPE
		except UnicodeDecodeError:
			return False

	def hasValidContainer(self):
		'''Read META-INF/container.xml and remember the rootfile's full-path
		in self.opfPath. Return True on success, False when the entry is
		missing, unparsable, or names no OPF path.'''
		try:
			containerData = self.inzip.read('META-INF/container.xml')
			xmltree = ET.fromstring(containerData)
		except Exception:
			return False
		expr = './/%srootfiles/%srootfile' % (NSMAP['con'], NSMAP['con'])
		elem = xmltree.find(expr)
		if elem is None:
			return False
		opfPath = elem.get('full-path', None)
		if opfPath is None:
			return False
		self.opfPath = opfPath
		return True

	def getXhtmlFiles(self):
		'''Iterate the OPF's manifest and get all hrefs of the items with a
		media-type of application/xhtml+xml.

		Returns a (files, error) tuple: error is None on success, otherwise
		a message describing what went wrong.'''
		try:
			opfData = self.inzip.read(self.opfPath)
		except Exception:
			return [], 'Error reading/finding OPF file.'

		# A malformed OPF is reported through the return value like every
		# other failure instead of escaping as a parser exception.
		try:
			xmltree = ET.fromstring(opfData)
		except Exception:
			return [], 'Error parsing OPF manifest.'

		# Manifest hrefs are joined onto the OPF's own directory so that they
		# match the entry names in the zip archive.
		prefix = os.path.split(self.opfPath)[0]
		if prefix != '':
			prefix += '/'

		files = []
		manifest_expr = './/%smanifest/%sitem' % (NSMAP['opf'], NSMAP['opf'])
		for elem in xmltree.findall(manifest_expr):
			if elem.get('media-type') != MEDIATYPE:
				continue
			href = elem.get('href')
			if href is None:
				return files, 'Error parsing OPF manifest.'
			files.append(prefix + href)
		if not files:
			return files, 'No items of media-type "application/xhtml+xml" found in OPF\'s manifest.'
		return files, None

	def readXhtmlFile(self, xhtml_file, encoding):
		'''Read the archive entry xhtml_file and return it decoded with encoding.'''
		data = self.inzip.read(xhtml_file).decode(encoding)
		return data

	def close(self):
		'''Close the zip (epub) file.'''
		self.inzip.close()

def deHTMLize(data, class_filter):
	'''Strip the markup from an (x)html document and return its text.

	data: the document as decoded text.
	class_filter: css class to restrict the result to, or None for all body text.'''
	parser = _DeHTMLParser(class_filter)
	# NOTE(review): entities are unescaped before the document is parsed, so
	# escaped markup such as &lt;script&gt; reaches feed() as a real tag --
	# confirm that this is intended.
	unescaped = parser.unescape(data)
	parser.reset()
	parser.feed(unescaped)
	parser.close()
	return parser.text()

def main():
	'''Command-line entry point.

	Parses sys.argv, gathers the set of unique characters from the input
	file (every xhtml item of an ePub, a single (x)html file, or plain text)
	and either writes them to the requested output file or prints them.
	Every failure exits the process with status 2; on success the function
	returns None.'''
	# Only ZipFile is imported at module level.
	from zipfile import BadZipfile

	args = myargparse(sys.argv)
	if not args.get_args():
		for error in args.errors:
			print('')
			print(error)
		args.usage()
		sys.exit(2)

	try:
		codecs.lookup(args.encoding)
	except LookupError:
		print('   \ncodec %s not found/registered.' % args.encoding)
		sys.exit(2)

	print('\nProcessing %s\n' % args.file)
	s = set()
	if os.path.splitext(args.file)[1].lower() == '.epub': #We have something that at least resembles an ePub.
		try:
			epub = ePubReader(args.file)
		except (BadZipfile, IOError):
			# Not a zip archive at all (or unreadable).
			print('   Problem reading %s. May not be a valid ePub file.' % args.file)
			sys.exit(2)
		try:
			if not (epub.hasValidMimeType() and epub.hasValidContainer()):
				print('   Problem reading %s. May not be a valid ePub file.' % args.file)
				sys.exit(2)
			file_list, error = epub.getXhtmlFiles()
			if error is not None:
				print('   %s' % error)
				sys.exit(2)

			# Parse each (x)html file in the epub.
			for f in file_list:
				print('   Parsing %s' % f)
				try:
					data = epub.readXhtmlFile(f, args.encoding)
					text = deHTMLize(data, args.cssclass)
				except Exception:
					print_exc(file=sys.stderr)
					sys.exit(2)
				# This is the payoff. The set of unique chars.
				if text:
					s.update(text)
				else:
					print('   ... No qualifying characters found in %s' % f)
			print('')
		finally:
			# Runs on every path out of the block, sys.exit() included.
			epub.close()

	else: #Single (x)html or text file.
		try:
			with codecs.open(args.file, 'rb', args.encoding) as inf:
				data = inf.read()
			# Cheesy (x)html detection system; tag names are case-insensitive.
			if re.search('<html', data, re.IGNORECASE):
				text = deHTMLize(data, args.cssclass)
			else: #Assume straight-up text.
				text = data
		except Exception:
			print_exc(file=sys.stderr)
			sys.exit(2)
		# This is the payoff. The set of unique chars.
		if text:
			s.update(text)
		else:
			print('   ... No qualifying characters found in %s.\n' % args.file)

	# Write or print the set of unique characters.
	if s:
		# Sorted so that repeated runs produce identical output.
		chars = ''.join(sorted(s))
		if args.outfile:
			print('Writing unique chars to file: %s' % args.outfile)
			with codecs.open(args.outfile, 'wb', args.encoding) as outf:
				outf.write(chars)
		else:
			# Probably won't ever work entirely in a Windows console (depending on
			# what unicode characters are present) so error-checking is not strict.
			# sys.stdout.encoding can be None (e.g. when output is redirected).
			out_encoding = sys.stdout.encoding or 'utf-8'
			message = '\nUnique Chars: %s' % chars
			print(message.encode(out_encoding, 'replace'))
	else:
		print('   Nothing to do. Verify input file, encoding and css class filter.')
	
if __name__ == '__main__':
	# Route all output through the flushing wrapper before anything is printed.
	sys.stdout = Unbuffered(sys.stdout)
	exit_status = main()
	sys.exit(exit_status)