View Single Post
Old 10-09-2012, 10:16 AM   #8
DiapDealer
Grand Sorcerer
DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.DiapDealer ought to be getting tired of karma fortunes by now.
 
DiapDealer's Avatar
 
Posts: 27,548
Karma: 193191846
Join Date: Jan 2010
Device: Nexus 7, Kindle Fire HD
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

import sys, argparse, codecs
from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc

class _DeHTMLParser(HTMLParser):
	"""HTMLParser subclass that accumulates the text content of a document.

	Text nodes are stripped, runs of space/tab/CR/LF inside them are
	collapsed to a single space, and each non-empty node is stored followed
	by one space.  A ``<p>`` start tag contributes a blank line, a ``<br>``
	start tag contributes one newline, and a self-closing ``<br/>``
	contributes two newlines.  All other tags contribute nothing.
	"""

	def __init__(self):
		# Explicit base-class call (not super()) to match the imported
		# HTMLParser as the original code did.
		HTMLParser.__init__(self)
		# Ordered fragments of extracted text; joined on demand by text().
		self.__pieces = []

	def handle_data(self, data):
		"""Store a text node after trimming and whitespace-collapsing it."""
		stripped = data.strip()
		if not stripped:
			# Whitespace-only nodes add nothing to the output.
			return
		collapsed = sub('[ \t\r\n]+', ' ', stripped)
		self.__pieces.append(collapsed + ' ')

	def handle_starttag(self, tag, attrs):
		"""Emit the line break(s) associated with an opening ``p``/``br``."""
		separator = {'p': '\n\n', 'br': '\n'}.get(tag)
		if separator is not None:
			self.__pieces.append(separator)

	def handle_startendtag(self, tag, attrs):
		"""Emit two newlines for a self-closing ``br``; ignore other tags."""
		if tag != 'br':
			return
		self.__pieces.append('\n\n')

	def text(self):
		"""Return everything collected so far as one stripped string."""
		joined = ''.join(self.__pieces)
		return joined.strip()

def main():
	"""Write the unique characters found in the text of HTML file(s).

	Command line: ``file [file ...] [-o OUTFILE] [-c CODEC]``.

	Each input file is decoded with CODEC (``utf-8`` when the option is
	missing or empty), run through ``_DeHTMLParser`` (defined in this file)
	to extract its text, and every character of that text is added to one
	set shared by all files.  The set is then emitted in sorted order so
	repeated runs give identical output: written to OUTFILE encoded with
	CODEC when ``-o`` is given, otherwise encoded with CODEC and written to
	stdout followed by a newline.

	A file that cannot be read, decoded or parsed gets its traceback
	printed to stderr and is skipped; processing continues with the rest.

	Returns None.
	"""
	parser = argparse.ArgumentParser(description='''This script will accept utf-8 text files and write a list of unique characters to stdout or an output file''')
	parser.add_argument("file", nargs='+',help="input (utf-8) file(s) for character counting")
	parser.add_argument("-o", "--outfile", help="outputfile")
	parser.add_argument("-c", "--codec", help="input char encoding")
	args = parser.parse_args()

	# Characters to leave out of the result; empty means nothing is excluded.
	disallowed = set()
	# "or" (rather than an argparse default) keeps the original fallback to
	# utf-8 for an explicitly empty -c value as well as a missing one.
	file_codec = args.codec or 'utf-8'

	unique_chars = set()
	for path in args.file:
		try:
			# The context manager closes the input promptly; the original
			# file(...).read() never closed its handle.
			with codecs.open(path, 'r', file_codec) as infile:
				markup = infile.read()
			html_parser = _DeHTMLParser()
			html_parser.feed(markup)
			html_parser.close()
			# A string iterates character by character, so update() adds
			# each character of the extracted text.
			unique_chars.update(html_parser.text())
		except Exception:
			# Best effort per file: report and move on.  Exception (not a
			# bare except) lets Ctrl-C and SystemExit propagate.
			print_exc(file=stderr)
	unique_chars -= disallowed

	# Sorted so the output does not depend on arbitrary set ordering.
	result = ''.join(sorted(unique_chars))
	if args.outfile:
		# Single parenthesised argument: valid as both the Python 2 print
		# statement and the Python 3 print function.
		print('Writing to file: ' + args.outfile)
		with codecs.open(args.outfile, 'w', file_codec) as outfile:
			outfile.write(result)
	else:
		# Emit the encoded bytes directly; on Python 3 that requires the
		# underlying binary buffer, on Python 2 stdout already takes bytes.
		encoded = result.encode(file_codec) + b'\n'
		getattr(sys.stdout, 'buffer', sys.stdout).write(encoded)
	
# Run main() only when this file is executed as a script (not on import);
# main()'s return value is passed to sys.exit() as the process exit status.
if __name__ == '__main__':
	sys.exit(main())


usage: uniquechars.py [-h] [-c CODEC] [-o OUTFILE] file [file ...]

An attempt to modify the script so that only the text of an HTML document is parsed, and so that other character encodings can be used for input and output. The encoding defaults to utf-8 if none is specified on the command line. I got it to work with both utf-8 and windows-1252.
DiapDealer is offline   Reply With Quote