import java.io.File;
import java.io.FileReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.IOException;
import java.util.zip.ZipInputStream;
import java.util.HashSet;
import java.util.Iterator;

/**
 * This class provides an RTF pull parser. Roughly themed by the XPP3 class.
 *
 * <p>Typical use: call {@link #setInput(Reader)}, then repeatedly call {@link #next()}
 * until it returns {@link #END_DOCUMENT}, inspecting {@link #getName()},
 * {@link #getArgument()} or {@link #getTextCharacters()} depending on the event type.
 *
 * <p>Parsing notes:
 * <ul>
 * <li>Control characters (as defined by <code>Character.isISOControl</code>) are dropped
 *     from text and act as consumed delimiters after a command.</li>
 * <li>Escaped characters written as a backslash followed by a backslash or a brace are
 *     delivered as literal text.</li>
 * <li>Hex escapes (backslash, apostrophe, two characters) are parsed and discarded.</li>
 * <li>While reading a command name every character that is not a delimiter, a digit or
 *     <code>-</code> is appended to the name, so <code>\colortbl;</code> is reported with
 *     the name <code>colortbl;</code>.</li>
 * </ul>
 *
 * @author Scott Turner (scotty1024@mac.com)
 * @version $Revision: 1.1 $
 * @Date October 18, 2006
 */
public class RTFPullParser {

    /**
     * This constant is returned as the event type when positioned at the start of the RTF.
     */
    public static final int START_DOCUMENT = 0;

    /**
     * This constant is returned as the event type when a command is parsed from the RTF.
     */
    public static final int COMMAND = 1;

    /**
     * This constant is returned as the event type when a GROUP begin is parsed from the RTF.
     */
    public static final int GROUP_BEGIN = 2;

    /**
     * This constant is returned as the event type when a GROUP end is parsed from the RTF.
     */
    public static final int GROUP_END = 3;

    /**
     * This constant is returned as the event type when a block of TEXT is parsed from the RTF.
     */
    public static final int TEXT = 4;

    /**
     * This constant is returned as the event type when the end of the document has been reached.
     */
    public static final int END_DOCUMENT = 666;

    /**
     * This main method provides a means to test this class. It accepts an RTF file as its
     * sole command line argument and will parse it and report on the events generated.
     * If the file name ends in ".zip" the first entry of the archive is parsed instead.
     *
     * @param args a <code>String[]</code> value; <code>args[0]</code> is the file to parse
     * @exception IOException if an error occurs
     */
    public static void main(String[] args)
        throws IOException
    {
        if (args.length < 1) {
            System.err.println("Usage: java RTFPullParser <file.rtf | file.zip>");
            System.exit(1);
        }

        ZipInputStream zipFile = null;
        Reader input;
        if (args[0].toLowerCase().endsWith(".zip")) {
            zipFile = new ZipInputStream(new FileInputStream(args[0]));
            input = new InputStreamReader(zipFile);
        } else {
            input = new FileReader(new File(args[0]));
        }

        try {
            if (zipFile != null) {
                // Position the stream on the first archive entry.
                zipFile.getNextEntry();
            }

            RTFPullParser rpp = new RTFPullParser();
            rpp.setInput(input);

            HashSet<String> seenCommands = new HashSet<String>(32);

            for (int eventType = rpp.getEventType();
                 eventType != RTFPullParser.END_DOCUMENT;
                 eventType = rpp.next()) {
                if (eventType == COMMAND) {
                    if (rpp.hasArgument()) {
                        System.out.println("Command -> " + rpp.getName() + " argument: " + rpp.getArgument());
                    } else {
                        System.out.println("Command -> " + rpp.getName());
                    }
                    seenCommands.add(rpp.getName());
                } else if (eventType == START_DOCUMENT) {
                    System.out.println("Start of Document");
                } else if (eventType == TEXT) {
                    System.out.println("Text -> " + rpp.getTextCharacters());
                } else if (eventType == GROUP_BEGIN) {
                    System.out.println("Group Start");
                } else if (eventType == GROUP_END) {
                    System.out.println("Group End");
                } else {
                    System.out.println("Event: " + eventType);
                }
            }

            System.out.println("Encountered " + seenCommands.size() + " commands.");
            for (String command : seenCommands) {
                System.out.println(command);
            }
        } finally {
            // Closing the reader also closes the underlying file / zip stream.
            input.close();
        }
    }

    private Reader reader = null;

    /**
     * This method sets the RTF stream to be parsed and resets the parser to
     * {@link #START_DOCUMENT}, discarding any token, push-back character or buffered
     * text left over from a previously parsed stream.
     *
     * @param aReader a <code>Reader</code> value containing the stream to be parsed.
     */
    public void setInput(Reader aReader) {
        reader = aReader;
        state = NONE;
        token = START_DOCUMENT;
        push = -1;
        textBuffer.setLength(0);
        name.setLength(0);
        argument.setLength(0);
    }

    /**
     * This method returns the current event type for the currently parsed token.
     *
     * @return an <code>int</code> value e.g. COMMAND
     */
    public int getEventType() {
        return token;
    }

    /**
     * This method parses the next token from the stream and then returns the event
     * type for that token. Once the stream is exhausted every call returns
     * {@link #END_DOCUMENT}.
     *
     * @return an <code>int</code> value e.g. COMMAND
     * @exception IOException if an error occurs in reading the stream.
     */
    public int next()
        throws IOException
    {
        parseNextToken();
        return getEventType();
    }

    /**
     * This method returns the name of the current COMMAND.
     *
     * @return a <code>String</code> value containing the name of the current COMMAND e.g. par
     */
    public String getName() {
        return name.toString();
    }

    /**
     * This method returns the text of the current TEXT.
     *
     * @return a <code>String</code> value containing the text in the current TEXT block.
     */
    public String getTextCharacters() {
        return textBuffer.toString();
    }

    /**
     * This method returns true if the current COMMAND has an argument, otherwise it returns
     * false.
     *
     * @return a <code>boolean</code> value that is true if the current COMMAND has an argument.
     */
    public boolean hasArgument() {
        return argument.length() != 0;
    }

    /**
     * This method returns the argument for the current COMMAND. You should first invoke
     * hasArgument() to see if the current COMMAND has an argument.
     *
     * @return the argument as an <code>int</code>, or 0 when there is no argument or the
     *         collected characters do not form a valid <code>int</code> (for example a
     *         lone <code>-</code> or a value that overflows)
     */
    public int getArgument() {
        try {
            return Integer.parseInt(argument.toString());
        } catch (NumberFormatException ignored) {
            // Best effort: callers are expected to check hasArgument(); an unparsable
            // argument is reported as 0 rather than aborting the parse.
            return 0;
        }
    }

    /* Private state for parser's state machine */
    private static final int NONE = 0;      // Idle state, between tokens
    private static final int ESCAPE = 1;    // Seen \ while idle; next char decides text vs command
    private static final int NAME = 2;      // Parsing command name
    private static final int ARGUMENT = 3;  // Parsing command argument
    private static final int IN_TEXT = 4;   // Accumulating a TEXT block (textBuffer is non-empty)
    private static final int SLASH = 5;     // Seen \ inside a TEXT block
    private int state = NONE;               // Current state of parser
    private int token = START_DOCUMENT;     // Current token
    private int push = -1;                  // 1 level deep push back for parser, -1 when empty

    /* Current token state information */
    private StringBuffer textBuffer = new StringBuffer(16384);  // current TEXT
    private StringBuffer name = new StringBuffer(32);           // current COMMAND name
    private StringBuffer argument = new StringBuffer(32);       // current COMMAND argument

    /**
     * This method parses the next token from the RTF stream, starting with the pushed
     * back character if there is one. When the stream ends while a TEXT block or a
     * COMMAND is still being accumulated, that token is delivered first and
     * END_DOCUMENT is reported by the following call.
     *
     * @exception IOException if an error occurs in reading the stream.
     */
    private void parseNextToken()
        throws IOException
    {
        int c = push;
        push = -1;
        if (c == -1) {
            c = reader.read();
        }
        while (true) {
            if (c == -1) {
                finishAtEndOfInput();
                return;
            }
            if (consume(c)) {
                return;
            }
            c = reader.read();
        }
    }

    /**
     * Feeds one character to the state machine.
     *
     * @param c the character read from the stream (never -1)
     * @return true when a complete token has been recognised
     * @exception IOException if an error occurs in reading the stream.
     */
    private boolean consume(int c)
        throws IOException
    {
        switch (state) {
        case NONE:
            return consumeIdle(c);
        case ESCAPE:
            return consumeEscape(c);
        case NAME:
            return consumeName(c);
        case ARGUMENT:
            return consumeArgument(c);
        case IN_TEXT:
            return consumeText(c);
        case SLASH:
            return consumeSlash(c);
        default:
            throw new IllegalStateException("Unknown parser state: " + state);
        }
    }

    /** Handles a character seen between tokens. */
    private boolean consumeIdle(int c) {
        if (c == '\\') {
            state = ESCAPE;
            return false;
        }
        if (c == '{') {
            token = GROUP_BEGIN;
            return true;
        }
        if (c == '}') {
            token = GROUP_END;
            return true;
        }
        if (!Character.isISOControl((char) c)) {
            state = IN_TEXT;
            textBuffer.setLength(0);
            textBuffer.append((char) c);
        }
        // Control characters between tokens are ignored.
        return false;
    }

    /** Handles the character following a backslash that was seen while idle. */
    private boolean consumeEscape(int c)
        throws IOException
    {
        if (isEscapedLiteral(c)) {
            // An escaped backslash or brace is literal text, not a command or a group.
            state = IN_TEXT;
            textBuffer.setLength(0);
            textBuffer.append((char) c);
            return false;
        }
        if (c == '\'') {
            skipHexEscape();
            state = NONE; // nothing was added, so no TEXT block has started yet
            return false;
        }
        beginCommand();
        return consumeName(c);
    }

    /** Handles a character while the command name is being collected. */
    private boolean consumeName(int c) {
        if (isCommandDelimiter(c)) {
            endCommand(c);
            return true;
        }
        if (Character.isDigit(c) || (c == '-')) {
            state = ARGUMENT;
            argument.setLength(0);
            argument.append((char) c);
            return false;
        }
        name.append((char) c);
        return false;
    }

    /** Handles a character while the numeric command argument is being collected. */
    private boolean consumeArgument(int c) {
        if (Character.isDigit(c)) {
            argument.append((char) c);
            return false;
        }
        // Any non-digit ends the command. Characters other than space and control
        // characters belong to whatever follows, so they are pushed back.
        endCommand(c);
        return true;
    }

    /** Handles a character inside a TEXT block. */
    private boolean consumeText(int c) {
        if (c == '\\') {
            state = SLASH;
            return false;
        }
        if ((c == '{') || (c == '}')) {
            // The brace is its own token; deliver the text first.
            push = c;
            state = NONE;
            token = TEXT;
            return true;
        }
        if (!Character.isISOControl((char) c)) {
            textBuffer.append((char) c);
        }
        return false;
    }

    /** Handles the character following a backslash that was seen inside a TEXT block. */
    private boolean consumeSlash(int c)
        throws IOException
    {
        if (isEscapedLiteral(c)) {
            textBuffer.append((char) c);
            state = IN_TEXT;
            return false;
        }
        if (c == '\'') {
            skipHexEscape();
            state = IN_TEXT;
            return false;
        }
        // A command starts here: deliver the accumulated text now and let the next
        // call treat this character as the first character after the backslash.
        push = c;
        beginCommand();
        token = TEXT;
        return true;
    }

    /**
     * Reads and discards the two characters of a hex escape.
     * AT PRESENT PDFBox only gives us access to the "Standard" encoding, which pretty
     * much means all of the escaped character codes are unavailable. So we parse it,
     * then toss it. If the stream ends inside the escape, the caller's next read
     * reports the end of input.
     */
    private void skipHexEscape()
        throws IOException
    {
        reader.read();
        reader.read();
    }

    /** Clears the command buffers and switches to collecting a command name. */
    private void beginCommand() {
        state = NAME;
        name.setLength(0);
        argument.setLength(0);
    }

    /**
     * Completes the current COMMAND. A terminating space or control character is
     * consumed; any other terminator is pushed back for the next token.
     */
    private void endCommand(int terminator) {
        if ((terminator != ' ') && !Character.isISOControl((char) terminator)) {
            push = terminator;
        }
        token = COMMAND;
        state = NONE;
    }

    /** Returns true for the characters that may be escaped with a backslash. */
    private static boolean isEscapedLiteral(int c) {
        return (c == '\\') || (c == '{') || (c == '}');
    }

    /** Returns true for the characters that end a command name. */
    private static boolean isCommandDelimiter(int c) {
        return (c == '\\')
            || (c == '{')
            || (c == '}')
            || (c == ' ')
            || Character.isISOControl((char) c);
    }

    /**
     * Called when the stream is exhausted: delivers a token that was still being
     * accumulated, or END_DOCUMENT when nothing is pending.
     */
    private void finishAtEndOfInput() {
        switch (state) {
        case IN_TEXT:
        case SLASH:
            // A dangling trailing backslash is dropped; the buffered text is kept.
            token = TEXT;
            break;
        case NAME:
        case ARGUMENT:
            token = COMMAND;
            break;
        default:
            token = END_DOCUMENT;
            break;
        }
        state = NONE;
    }
}