Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
import sys, argparse, codecs
from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
class _DeHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.__text = []
def handle_data(self, data):
text = data.strip()
if len(text) > 0:
text = sub('[ \t\r\n]+', ' ', text)
self.__text.append(text + ' ')
def handle_starttag(self, tag, attrs):
if tag == 'p':
self.__text.append('\n\n')
elif tag == 'br':
self.__text.append('\n')
def handle_startendtag(self, tag, attrs):
if tag == 'br':
self.__text.append('\n\n')
def text(self):
return ''.join(self.__text).strip()
def main():
    """Write the unique characters found in the text of HTML files.

    Parses the command line, strips the markup from every input file with
    ``_DeHTMLParser``, collects the distinct characters of the remaining
    text, and writes them to ``--outfile`` if given, otherwise to stdout.
    The same codec is used for reading and writing: ``--codec`` when
    supplied, ``utf-8`` otherwise.

    A file that cannot be opened, decoded or parsed is reported on stderr
    with a traceback and skipped; the remaining files are still processed.
    The characters are written in sorted order so that repeated runs give
    identical output.
    """
    parser = argparse.ArgumentParser(description='''This script will accept utf-8 text files and write a list of unique characters to stdout or an output file''')
    parser.add_argument("file", nargs='+', help="input (utf-8) file(s) for character counting")
    parser.add_argument("-o", "--outfile", help="outputfile")
    parser.add_argument("-c", "--codec", help="input char encoding")
    args = parser.parse_args()

    file_codec = args.codec or 'utf-8'
    # Characters to leave out of the result; empty in this script.
    disallowed = set()
    unique_chars = set()

    for path in args.file:
        try:
            # codecs.open decodes while reading, and the with-block makes
            # sure the handle is closed even when decoding fails.
            with codecs.open(path, 'r', file_codec) as source:
                markup = source.read()
            html_parser = _DeHTMLParser()
            html_parser.feed(markup)
            html_parser.close()
            unique_chars.update(
                char for char in html_parser.text()
                if char not in disallowed)
        except Exception:
            # Best effort per file: report and carry on with the next one.
            # Ctrl-C and SystemExit are deliberately not caught here.
            print_exc(file=stderr)

    result = u''.join(sorted(unique_chars))

    if args.outfile:
        print('Writing to file: ' + args.outfile)
        with codecs.open(args.outfile, 'w', file_codec) as sink:
            sink.write(result)
    else:
        # Write the encoded bytes themselves, followed by a newline.  On
        # Python 3 that needs the underlying binary stream; on Python 2
        # sys.stdout already accepts byte strings.
        byte_stream = getattr(sys.stdout, 'buffer', sys.stdout)
        byte_stream.write(result.encode(file_codec) + b'\n')
if __name__ == '__main__':
    # Script entry point: pass whatever main() returns on as the exit status.
    exit_status = main()
    sys.exit(exit_status)
usage: uniquechars.py [-h] [-c CODEC] [-o OUTFILE] file [file ...]
This is an attempt to modify the script so that only the text of an HTML document is parsed, and so that other character encodings can be used for input and output. The default is utf-8 if no codec is specified on the command line. I got it to work with either utf-8 or windows-1252.