/* 
 * Copyright (C) 2005 and 2006, Scott Turner scotty1024@mac.com
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.Vector;


import java.io.IOException;

/**
 * A simple HTML parser which mainly focuses on extracting text from HTML in a highly
 * reflowable manner for the BBeB LRF file format.
 *
 * <p>The input file is read into memory as raw bytes and every byte is treated as one
 * character in the range 0-255. Text found inside the {@code <body>} element is sent to
 * the {@link BBeBook} through {@code outAppend}, with runs of whitespace and control
 * characters collapsed so that each word is preceded by a single space. A small set of
 * tags ({@code p}, {@code pre}, {@code br}, {@code i}, {@code sup}, {@code sub}) is
 * translated into control codes; all other tags are ignored.
 *
 * @author Scott Turner <scotty1024@mac.com>
 */
public class HtmlParser {
    /** Debug switch; not referenced anywhere in this class. */
    private final boolean debugHTML = false;

    // Control codes for bold text. They are defined but not emitted: <b> and </b> are
    // currently ignored by the parser. Alternative pairs that were previously commented
    // out at this spot: 0xf56c/0xf56d and 0xf56e/0xf56f.
    static final char START_BOLD = (char)0xf5c1;
    static final char END_BOLD = (char)0xf5c2;

    static final char START_SUP = (char)0xf5b7;
    static final char END_SUP = (char)0xf5b8;
    static final char START_SUB = (char)0xf5b9;
    static final char END_SUB = (char)0xf5ba;

    // The codes below are deliberately typed int: the original call sites passed int
    // literals to book.outAppend, and the argument type is kept unchanged.
    private static final int START_ITALIC = 0xf581;
    private static final int END_ITALIC = 0xf582;
    private static final int PARAGRAPH_INDENT = 0xf5ca;
    private static final int PARAGRAPH_INDENT_AMOUNT = 180;
    private static final int LINE_BREAK = 0xf5d2;
    private static final int START_PAGE = 0xf5a1;
    private static final int END_PAGE = 0xf5a2;

    /** Output buffer offset above which a line break is turned into a page break. */
    private static final int PAGE_BREAK_THRESHOLD = 48000;

    /** Entity values that are dropped instead of emitted: no-break space and soft hyphen. */
    private static final int NBSP = 160;
    private static final int SOFT_HYPHEN = 173;

    private BBeBook book;
    private byte[] buf;

    /**
     * Creates a parser for one HTML file and reads the whole file into memory.
     *
     * @param aBook the book that receives the extracted text and control codes
     * @param aHtmlFileName path of the HTML file to read
     * @throws IOException declared for callers; read failures are currently reported by
     *         {@link #readWholeFile(String)}, which terminates the JVM instead of throwing
     */
    public HtmlParser(BBeBook aBook, String aHtmlFileName)
        throws IOException
    {
        book = aBook;
        buf = readWholeFile(aHtmlFileName);
    }

    /**
     * Reads the complete contents of a file into a byte array.
     *
     * <p>On any failure (file missing, read error, or fewer bytes available than the
     * file length reported) an error is printed to {@code System.err} and the JVM is
     * terminated with exit status 1.
     *
     * @param aFileName path of the file to read
     * @return the file contents
     */
    byte[] readWholeFile(String aFileName) {
        File file = new File(aFileName);

        int fileSize = (int)file.length();
        byte[] data = new byte[fileSize];

        InputStream in = null;
        try {
            in = new FileInputStream(file);

            // A single read() may return fewer bytes than requested, so keep reading
            // until the buffer is full or the stream reports end of file.
            int offset = 0;
            while (offset < data.length) {
                int count = in.read(data, offset, data.length - offset);
                if (count < 0) {
                    break;
                }
                offset += count;
            }

            if (offset != data.length) {
                System.err.println("Error reading file: " + file);
                System.exit(1);
            }

            return data;
        } catch (Exception e) {
            System.err.println("Error loading HTML file: " + file + " message: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The data has already been read (or the failure already reported);
                    // a failure while closing an input stream changes nothing.
                }
            }
        }
        return null;
    }

    /**
     * Walks the file contents once and feeds the body text to the book.
     *
     * <p>The scanner is a small state machine with three states: inside a tag (after
     * {@code <}, until {@code >}), inside an entity (after {@code &}, until {@code ;}),
     * and plain text. Tag names are lower-cased while they are collected. Text and
     * entities are only processed between {@code <body>} and {@code </body>}. A page is
     * started before scanning and ended after the last byte.
     *
     * <p>NOTE(review): an {@code &} that is never followed by {@code ;} keeps the scanner
     * in the entity state, so everything up to the next {@code ;} (tags included) is
     * swallowed into the entity name.
     *
     * @throws IOException declared for callers; nothing in this method throws it directly
     */
    protected void parsePages()
        throws IOException
    {
        boolean inTag = false;
        boolean inEntity = false;
        boolean inBody = false;
        boolean inWord = false;
        StringBuffer tagBuffer = new StringBuffer(32);
        StringBuffer entityBuffer = new StringBuffer(32);

        startPage();

        for (int bufIndex = 0; bufIndex < buf.length; bufIndex++) {
            // Mask to 0-255 so bytes above 0x7f do not become negative characters.
            char c = (char)(buf[bufIndex] & 0x00ff);

            if (inTag) {
                if (c == '>') {
                    inTag = false;
                    inBody = handleTag(tagBuffer.toString(), inBody);
                } else {
                    tagBuffer.append(Character.toLowerCase(c));
                }
            } else if (inEntity) {
                if (c == ';') {
                    inEntity = false;
                    handleEntity(entityBuffer.toString());
                } else {
                    entityBuffer.append(c);
                }
            } else if (c == '<') {
                inTag = true;
                tagBuffer.setLength(0);
            } else if (inBody) {
                if (c == '&') {
                    inEntity = true;
                    entityBuffer.setLength(0);
                } else if (Character.isISOControl(c) || Character.isWhitespace(c)) {
                    // Whitespace is not copied; it only marks the end of the current word.
                    inWord = false;
                } else {
                    if (!inWord) {
                        // First character of a new word: emit the single separating space.
                        book.outAppend(' ');
                    }
                    book.outAppend(c);
                    inWord = true;
                }
            }
        }
        endPage();
    }

    /**
     * Acts on one complete tag.
     *
     * @param tag lower-cased tag text between {@code <} and {@code >}; may be empty
     * @param inBody whether the scanner is currently inside the body element
     * @return the new "inside body" state
     */
    private boolean handleTag(String tag, boolean inBody) {
        boolean closing = tag.startsWith("/");
        String name = tagName(tag, closing ? 1 : 0);

        if (!inBody) {
            // Outside the body only the opening <body> tag matters.
            return !closing && "body".equals(name);
        }

        if (closing) {
            if ("body".equals(name)) {
                return false;
            } else if ("p".equals(name) || "pre".equals(name)) {
                endParagraph();
            } else if ("i".equals(name)) {
                book.outAppend(END_ITALIC);
            } else if ("sup".equals(name)) {
                book.outAppend(END_SUP); // End Superscript
            } else if ("sub".equals(name)) {
                book.outAppend(END_SUB); // End Sub-script
            }
            // </b> is recognised by HTML but produces no output (bold is not emitted).
        } else {
            if ("p".equals(name) || "pre".equals(name)) {
                startParagraph();
            } else if ("i".equals(name)) {
                book.outAppend(START_ITALIC);
            } else if ("sup".equals(name)) {
                book.outAppend(START_SUP); // Start Superscript
            } else if ("sub".equals(name)) {
                book.outAppend(START_SUB); // Start Sub-script
            } else if ("br".equals(name)) {
                breakLine();
            }
            // <b> produces no output (bold is not emitted).
        }
        return true;
    }

    /**
     * Extracts the element name from tag text, so that attributes and a trailing
     * {@code /} (as in {@code <p class="x">} or {@code <br/>}) do not prevent a match.
     *
     * @param tag the tag text
     * @param start index of the first character of the name
     * @return the characters from {@code start} up to the first whitespace or {@code /},
     *         possibly empty
     */
    private static String tagName(String tag, int start) {
        int end = start;
        while (end < tag.length()) {
            char ch = tag.charAt(end);
            if (ch == '/' || Character.isWhitespace(ch)) {
                break;
            }
            end++;
        }
        return tag.substring(start, end);
    }

    /**
     * Emits the character for one complete entity reference.
     *
     * <p>Numeric references may be decimal ({@code &#233;}) or hexadecimal
     * ({@code &#xE9;}). Named references are looked up in {@link #entityToCharacter};
     * unknown names produce no output. No-break space (160) and soft hyphen (173) are
     * dropped in both forms.
     *
     * @param entity the text between {@code &} and {@code ;}; may be empty
     */
    private void handleEntity(String entity) {
        if (entity.length() == 0) {
            return; // "&;" carries nothing to decode
        }

        if (entity.charAt(0) == '#') {
            try {
                int value;
                if ((entity.length() > 2) &&
                    ((entity.charAt(1) == 'x') || (entity.charAt(1) == 'X'))) {
                    value = Integer.parseInt(entity.substring(2), 16);
                } else {
                    value = Integer.parseInt(entity.substring(1));
                }
                if ((value != NBSP) && (value != SOFT_HYPHEN)) {
                    book.outAppend(value);
                }
            } catch (NumberFormatException e) {
                System.err.println("Bad entity # value: " + entity);
            }
        } else {
            // The map is keyed by String; looking it up with the StringBuffer itself
            // can never match because StringBuffer does not override equals/hashCode.
            Character entityValue = (Character)entityToCharacter.get(entity);
            if (entityValue != null) {
                char ch = entityValue.charValue();
                if ((ch != NBSP) && (ch != SOFT_HYPHEN)) {
                    book.outAppend(ch);
                }
            }
        }
    }

    /** Emits the paragraph indent code followed by its amount (180; "18pt" per the original note). */
    void startParagraph() {
        book.outAppend(PARAGRAPH_INDENT);
        book.outAppend(PARAGRAPH_INDENT_AMOUNT);
    }

    /** Ends a paragraph with two line breaks (either of which may become a page break). */
    void endParagraph() {
        breakLine();
        breakLine();
    }

    /**
     * Emits a line break, or starts a new page instead when the book's output buffer
     * offset has grown past {@link #PAGE_BREAK_THRESHOLD}.
     */
    private void breakLine() {
        if (book.outBufOffset > PAGE_BREAK_THRESHOLD) {
            breakPage();
        } else {
            book.outAppend(LINE_BREAK);
        }
    }

    /** Emits the start-of-page marker followed by two zero values. */
    private void startPage() {
        book.outAppend(START_PAGE);
        book.outAppend(0x0000);
        book.outAppend(0x0000);
    }

    /** Closes the current page and immediately opens a new one. */
    private void breakPage() {
        endPage();
        startPage();
    }

    /** Emits the end-of-page marker and hands the output buffer to the book as a text page. */
    private void endPage() {
        book.outAppend(END_PAGE);

        book.addOutBufAsTextPage();
    }

    /**
     * Command-line driver: parses the HTML file named by the first argument.
     *
     * <p>NOTE(review): the parser is constructed with a {@code null} book, so
     * {@link #parsePages()} fails with a {@code NullPointerException} at its first
     * {@code book.outAppend} call; a real {@code BBeBook} needs to be supplied here.
     *
     * @param args command-line arguments; {@code args[0]} is the HTML file name
     * @throws Exception if construction or parsing fails
     */
    public static void main(String[] args)
        throws Exception
    {
        if (args.length < 1) {
            System.err.println("Usage: HtmlParser <html-file>");
            return;
        }
        HtmlParser parser = new HtmlParser(null, args[0]);
        parser.parsePages();
    }

    /** Maps entity names (String, without {@code &} and {@code ;}) to their Character value. */
    static final HashMap entityToCharacter;

    /**
     * Names of the entities for character codes 160 through 255, in code order.
     * A {@code null} slot means the code has no entry in the map ("shy", code 173).
     */
    private static final String[] LATIN1_ENTITY_NAMES = {
        "nbsp",   "iexcl",  "cent",   "pound",  "curren", "yen",    "brvbar", "sect",
        "uml",    "copy",   "ordf",   "laquo",  "not",    null,     "reg",    "macr",
        "deg",    "plusmn", "sup2",   "sup3",   "acute",  "micro",  "para",   "middot",
        "cedil",  "sup1",   "ordm",   "raquo",  "frac14", "frac12", "frac34", "iquest",
        "Agrave", "Aacute", "Acirc",  "Atilde", "Auml",   "Aring",  "AElig",  "Ccedil",
        "Egrave", "Eacute", "Ecirc",  "Euml",   "Igrave", "Iacute", "Icirc",  "Iuml",
        "ETH",    "Ntilde", "Ograve", "Oacute", "Ocirc",  "Otilde", "Ouml",   "times",
        "Oslash", "Ugrave", "Uacute", "Ucirc",  "Uuml",   "Yacute", "THORN",  "szlig",
        "agrave", "aacute", "acirc",  "atilde", "auml",   "aring",  "aelig",  "ccedil",
        "egrave", "eacute", "ecirc",  "euml",   "igrave", "iacute", "icirc",  "iuml",
        "eth",    "ntilde", "ograve", "oacute", "ocirc",  "otilde", "ouml",   "divide",
        "oslash", "ugrave", "uacute", "ucirc",  "uuml",   "yacute", "thorn",  "yuml"
    };

    static {
        entityToCharacter = new HashMap(192*2);

        for (int i = 0; i < LATIN1_ENTITY_NAMES.length; i++) {
            String name = LATIN1_ENTITY_NAMES[i];
            if (name != null) {
                entityToCharacter.put(name, new Character((char)(160 + i)));
            }
        }

        // Markup-significant characters, which HTML text must write as entities.
        entityToCharacter.put("quot", new Character('"'));
        entityToCharacter.put("amp",  new Character('&'));
        entityToCharacter.put("apos", new Character('\''));
        entityToCharacter.put("lt",   new Character('<'));
        entityToCharacter.put("gt",   new Character('>'));
    }
}
