/* 
 * Copyright (C) 2005 and 2006, Scott Turner scotty1024@mac.com
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

/**
 * Copyright (c) 2003-2005, www.pdfbox.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of pdfbox; nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://www.pdfbox.org
 *
 */
import java.util.ArrayList;
//import java.util.Comparable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.Vector;

import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSStream;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;

import org.pdfbox.pdmodel.common.PDRectangle;
import org.pdfbox.pdmodel.common.PDStream;

import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.pdfbox.pdmodel.encryption.PDStandardEncryption;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;

import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;

import java.awt.BasicStroke;
import java.awt.Color;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.RenderingHints;

import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;

import java.io.IOException;

import java.util.List;
import java.util.Map;

import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSNumber;

import org.pdfbox.pdmodel.PDPage;

import org.pdfbox.pdmodel.font.PDFont;
import org.pdfbox.pdmodel.graphics.xobject.PDInlinedImage;
import org.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;

import org.pdfbox.util.ImageParameters;
import org.pdfbox.util.Matrix;
import org.pdfbox.util.PDFOperator;
import org.pdfbox.util.PDFStreamEngine;
import org.pdfbox.util.ResourceLoader;
import org.pdfbox.util.TextPosition;

/**
 * Uses PDFBox's PDFStreamEngine to walk the content streams of a PDF document
 * and hand the extracted text and inline images to a {@link BBeBook}.
 * The code is modeled on PageDrawer from PDFBox 0.7.1.
 *
 * When the parser is created without a book (as {@link #main(String[])} does),
 * nothing is written to a book; the collected text is dumped to System.out
 * instead.
 *
 * @author Ben Litchfield (ben@benlitchfield.com)
 * @version $Revision: 1.15 $
 */
public class PageParser extends PDFStreamEngine {
    /** Compile-time switch for the verbose layout trace printed to System.out. */
    private static final boolean DEBUG_PDF = false;

    /**
     * Compile-time switch for sorting text into article (thread bead) divisions.
     * It is switched off, so every piece of text is filed under division 0.
     */
    private static final boolean USE_ARTICLE_BEADS = false;

    /** Value appended to the book's output buffer before the text of a page. */
    private static final int START_OF_PAGE_MARKER = 0xf5a1;

    /** Value appended to the book's output buffer after the text of a page. */
    private static final int END_OF_PAGE_MARKER = 0xf5a2;

    /** Value appended to the book's output buffer when a new line starts. */
    private static final int NEW_LINE_MARKER = 0xf5d2;

    /** Value appended to the book's output buffer between two words. */
    private static final int WORD_SEPARATOR = 0x0020;

    /** Fraction of the word spacing that a gap must exceed to count as a word break. */
    private static final float WORD_GAP_FACTOR = 0.50f;

    /** Fraction of the font size by which text may drift vertically and stay on the line. */
    private static final float LINE_OVERLAP_FACTOR = 0.9f;

    private BBeBook book;
    private PDDocument document;
    private PDPage page;

    private List pageArticles = null;

    /**
     * The charactersByArticle is used to extract text by article divisions.  For example
     * a PDF that has two columns like a newspaper, we want to extract the first column and
     * then the second column.  In this example the PDF would have 2 beads (or articles), one for
     * each column.  The size of the charactersByArticle would be 5, because not all text on the
     * screen will fall into one of the articles.  The five divisions are shown below
     *
     * Text before first article
     * first article text
     * text between first article and second article
     * second article text
     * text after second article
     *
     * Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
     */
    private Vector charactersByArticle = new Vector();

    /**
     * Text collected since the last flush, ordered top-to-bottom and then
     * left-to-right by {@link TextWrapper#compareTo(Object)}.
     */
    TreeSet textTree = new TreeSet();

    /**
     * Creates a parser for one PDF file. The operator properties are loaded from
     * "Resources/PageDrawer.properties", the document is loaded, and if it is
     * encrypted a decryption with the empty password is attempted.
     *
     * @param aBook The book that receives text and images; may be null, in which
     *              case the text is only dumped to System.out.
     * @param aPDFFileName The name of the PDF file to load.
     *
     * @throws IOException If there is an error loading the properties or the document.
     * @throws RuntimeException If the document is encrypted and cannot be decrypted
     *                          with the empty password; the original failure is the cause.
     */
    public PageParser(BBeBook aBook, String aPDFFileName)
        throws IOException
    {
        super(ResourceLoader.loadProperties("Resources/PageDrawer.properties"));
        book = aBook;
        document = PDDocument.load(aPDFFileName);

        if (document.isEncrypted()) {
            try {
                // Try decrypting with empty password
                document.decrypt("");
            } catch (CryptographyException e) {
                throw encryptedDocumentFailure(e);
            } catch (InvalidPasswordException e) {
                // No password was supplied and the default of "" was wrong.
                throw encryptedDocumentFailure(e);
            }
        }
    }

    /**
     * Releases the loaded document and builds the exception reported when the
     * document cannot be decrypted.
     *
     * @param cause The decryption failure.
     * @return The exception to throw.
     */
    private RuntimeException encryptedDocumentFailure(Exception cause) {
        try {
            closeDocument();
        } catch (IOException ignored) {
            // The decryption failure is the error worth reporting.
        }
        return new RuntimeException("Error: The document is encrypted.", cause);
    }

    /**
     * Closes the underlying PDF document. Calling it more than once is harmless;
     * after it has been called {@link #parsePages()} can no longer be used.
     *
     * @throws IOException If closing the document fails.
     */
    public void closeDocument()
        throws IOException
    {
        if (document != null) {
            PDDocument loaded = document;
            document = null;
            loaded.close();
        }
    }

    /**
     * Processes every page of the document in order: the page's content stream is
     * run through the stream engine and the collected text is then flushed.
     * A page without a content stream contributes no text but is still flushed.
     * A progress line is printed to System.out after each page.
     *
     * @throws IOException If there is an error processing a page.
     * @throws IllegalStateException If the document has already been closed.
     */
    protected void parsePages()
        throws IOException
    {
        if (document == null) {
            throw new IllegalStateException("The document has already been closed.");
        }

        List pages = document.getDocumentCatalog().getAllPages();
        int pageCount = pages.size();
        for (int i = 0; i < pageCount; i++) {
            page = (PDPage) pages.get(i);
            pageArticles = page.getThreadBeads();
            resetArticleDivisions(1 + pageArticles.size() * 2);

            textTree = new TreeSet();
            PDStream contents = page.getContents();
            // A page is not required to have a content stream.
            if (contents != null) {
                processStream(page, page.findResources(), contents.getStream());
            }
            flushText();

            System.out.print("Processed PDF page " + (i + 1) + " of " + pageCount + "\r");
        }
    }

    /**
     * Resizes charactersByArticle to the given number of divisions and gives every
     * division an empty list. Existing lists are cleared and reused only when the
     * new size is smaller than the previous one; otherwise fresh lists are stored.
     *
     * @param numberOfArticleSections The number of divisions needed for the page.
     */
    private void resetArticleDivisions(int numberOfArticleSections) {
        int originalSize = charactersByArticle.size();
        charactersByArticle.setSize(numberOfArticleSections);
        boolean reuseExistingLists = numberOfArticleSections < originalSize;
        for (int j = 0; j < numberOfArticleSections; j++) {
            if (reuseExistingLists) {
                ((List) charactersByArticle.get(j)).clear();
            } else {
                charactersByArticle.set(j, new ArrayList());
            }
        }
    }

    /**
     * Records the given page as the current page. No parsing is performed here;
     * use {@link #parsePages()} to process the document.
     *
     * @param p The page to remember as the current page.
     */
    public void parsePage(PDPage p) {
        page = p;
    }

    /**
     * This method is invoked as characters are pulled from the PDF page. The text
     * is filed under its article division and added to the sorted text tree.
     *
     * @param text The text that was encountered.
     */
    protected void showCharacter(TextPosition text) {
        float x = text.getX();
        float y = text.getY();

        int articleDivisionIndex = 0;
        if (USE_ARTICLE_BEADS) {
            articleDivisionIndex = findArticleDivision(x, y);
        }

        List textList = (List) charactersByArticle.get(articleDivisionIndex);
        textList.add(text);
        // Note: the tree keeps only the first text seen at any given (x, y).
        textTree.add(new TextWrapper(text));
    }

    /**
     * Determines which article division a position belongs to. Only used when
     * {@link #USE_ARTICLE_BEADS} is switched on.
     *
     * @param x The x coordinate of the text.
     * @param y The y coordinate of the text.
     * @return The index into charactersByArticle.
     */
    private int findArticleDivision(float x, float y) {
        int firstLeftAndAbove = -1;
        int firstLeft = -1;
        int firstAbove = -1;

        for (int i = 0; i < pageArticles.size(); i++) {
            PDThreadBead bead = (PDThreadBead) pageArticles.get(i);
            PDRectangle rect = bead.getRectangle();
            if (rect.contains(x, y)) {
                // Inside bead i: odd indexes hold the article text itself.
                return i * 2 + 1;
            } else if (((x < rect.getLowerLeftX()) ||
                        (y < rect.getUpperRightY())) &&
                       (firstLeftAndAbove == -1)) {
                firstLeftAndAbove = i * 2;
            } else if ((x < rect.getLowerLeftX()) &&
                       (firstLeft == -1)) {
                firstLeft = i * 2;
            } else if ((y < rect.getUpperRightY()) &&
                       (firstAbove == -1)) {
                firstAbove = i * 2;
            }
        }

        if (firstLeftAndAbove != -1) {
            return firstLeftAndAbove;
        }
        if (firstLeft != -1) {
            return firstLeft;
        }
        if (firstAbove != -1) {
            return firstAbove;
        }
        // Not inside or before any bead: file it under the last division.
        return charactersByArticle.size() - 1;
    }

    /**
     * Prints every entry of the text tree, in sorted order, to System.out.
     */
    private void dumpTree() {
        Iterator treeWalker = textTree.iterator();
        while (treeWalker.hasNext()) {
            TextPosition text = ((TextWrapper) treeWalker.next()).text;
            System.out.println("Flushing: y =" + text.getY() +
                               " x =" + text.getX() +
                               " '" + text.getCharacter() + "'");
        }
    }

    /**
     * Alternative flush that places every piece of collected text on a book page
     * at its own coordinates instead of building lines. Without a book the text
     * is dumped to System.out. The collected text is discarded afterwards.
     *
     * @throws IOException If there is an error writing the text.
     */
    protected void flushText2()
        throws IOException
    {
        if (book == null) {
            dumpTree();
            textTree.clear();
            return;
        }

        // Start page
        book.startPage();

        // Place text on page
        Iterator treeWalker = textTree.iterator();
        while (treeWalker.hasNext()) {
            TextPosition position = ((TextWrapper) treeWalker.next()).text;
            book.addTextToPage(position.getCharacter(),
                               (int) position.getX(),
                               (int) position.getY(),
                               (int) position.getWidth(),
                               (int) position.getFontSize());
        }

        // End page
        book.endPage();
        textTree.clear();
    }

    /**
     * Writes the collected text to the book's output buffer, framed by the page
     * markers, inserting new-line markers and word separators derived from the
     * text positions, and then calls book.addOutBufAsTextPage(). Without a book
     * the text is dumped to System.out instead.
     *
     * The collected text is discarded afterwards, so text that was flushed in the
     * middle of a page (before an inline image) is not written a second time when
     * the page ends.
     *
     * @throws IOException If there is an error writing the text.
     */
    protected void flushText()
        throws IOException
    {
        if (book == null) {
            dumpTree();
            textTree.clear();
            return;
        }

        // -1 is used throughout as the "not set yet" value.
        float currentY = -1;
        float lastBaselineFontSize = -1;
        float endOfLastTextX = -1;
        float startOfNextWordX = -1;
        float lastWordSpacing = -1;
        TextPosition lastProcessedCharacter = null;

        // Place start of Page marker, followed by two zero values.
        // NOTE(review): the meaning of the two zero values is defined by BBeBook's
        // output format, which is not visible here.
        book.outAppend(START_OF_PAGE_MARKER);
        book.outAppend(0x0000);
        book.outAppend(0x0000);

        Iterator treeWalker = textTree.iterator();
        while (treeWalker.hasNext()) {
            TextPosition position = ((TextWrapper) treeWalker.next()).text;
            String characterValue = position.getCharacter();

            // try to get width of a space character
            float wordSpacing = position.getWidthOfSpace();
            // if still zero fall back to getting the width of the current character
            if (wordSpacing == 0) {
                wordSpacing = position.getWidth();
            }

            // RDD - We add a conservative approximation for space determination.
            // basically if there is a blank area between two characters that is
            // equal to some percentage of the word spacing then that will be the
            // start of the next word
            if (lastWordSpacing <= 0) {
                startOfNextWordX = endOfLastTextX + (wordSpacing * WORD_GAP_FACTOR);
            } else {
                startOfNextWordX = endOfLastTextX
                    + (((wordSpacing + lastWordSpacing) / 2f) * WORD_GAP_FACTOR);
            }

            lastWordSpacing = wordSpacing;

            // RDD - Here we determine whether this text object is on the current
            // line.  We use the lastBaselineFontSize to handle the superscript
            // case, and the size of the current font to handle the subscript case.
            // Text must overlap with the last rendered baseline text by at least
            // a small amount in order to be considered as being on the same line.
            int verticalScaling = 1;
            if ((lastBaselineFontSize < 0) || (position.getFontSize() < 0)) {
                verticalScaling = -1;
            }
            if (currentY != -1 &&
                ((position.getY() < (currentY - (lastBaselineFontSize * LINE_OVERLAP_FACTOR * verticalScaling))) ||
                 (position.getY() > (currentY + (position.getFontSize() * LINE_OVERLAP_FACTOR * verticalScaling)))))
            {
                if (DEBUG_PDF) {
                    System.out.println("<newline currentY=" + currentY + ", y=" + position.getY() +
                                       " fs=" + position.getFontSize() + " lb fs=" + lastBaselineFontSize + ">");
                    System.out.println("NewLine");
                }
                // Start new line (we build entire paragraph into one line)
                book.outAppend(NEW_LINE_MARKER);
                endOfLastTextX = -1;
                startOfNextWordX = -1;
                currentY = -1;
                lastBaselineFontSize = -1;
            }

            if ((startOfNextWordX != -1) && (startOfNextWordX < position.getX()) &&
                (lastProcessedCharacter != null) &&
                // only bother adding a space if the last character was not a space
                (lastProcessedCharacter.getCharacter() != null) &&
                !lastProcessedCharacter.getCharacter().endsWith(" ")) {
                if (DEBUG_PDF) {
                    System.out.println("<space startOfNextWordX=" + startOfNextWordX + ", x=" + position.getX() + ">");
                }
                // Write out a word separator
                book.outAppend(WORD_SEPARATOR);
            }

            if (DEBUG_PDF) {
                System.out.println("flushText" +
                                   " y=" + position.getY() +
                                   " x=" + position.getX() +
                                   " yScale=" + position.getYScale() +
                                   " xScale=" + position.getXScale() +
                                   " width=" + position.getWidth() +
                                   " currentY=" + currentY +
                                   " endOfLastTextX=" + endOfLastTextX +
                                   " startOfNextWordX=" + startOfNextWordX +
                                   " fontSize=" + position.getFontSize() +
                                   " wordSpacing=" + wordSpacing +
                                   " string=\"" + characterValue + "\"");
            }

            if (currentY == -1) {
                currentY = position.getY();
            }

            if (currentY == position.getY()) {
                lastBaselineFontSize = position.getFontSize();
            }

            // RDD - endX is what PDF considers to be the x coordinate of the
            // end position of the text.  We use it in computing our metrics below.
            endOfLastTextX = position.getX() + position.getWidth();

            if (characterValue != null) {
                if (DEBUG_PDF) {
                    System.out.println("Flushing: y =" + position.getY() +
                                       " x =" + position.getX() +
                                       " '" + characterValue + "'");
                }
                // Write character to book
                for (int j = 0; j < characterValue.length(); j++) {
                    book.outAppend(characterValue.charAt(j));
                }
            }
            lastProcessedCharacter = position;
        }

        // End of Page marker
        book.outAppend(END_OF_PAGE_MARKER);

        book.addOutBufAsTextPage();

        // Everything collected so far has been written; do not write it again.
        textTree.clear();
    }

    /**
     * This is used to handle an operation. After the engine has processed the
     * operator, an inline image ("BI") causes the text collected so far to be
     * flushed and the image to be added to the book. Without a book only the
     * text is flushed (dumped). "Do" (XObject) operators are currently not
     * acted upon.
     *
     * @param operator The operation to perform.
     * @param arguments The list of arguments.
     *
     * @throws IOException If there is an error processing the operation.
     */
    protected void processOperator(PDFOperator operator, List arguments)
        throws IOException
    {
        super.processOperator(operator, arguments);
        String operation = operator.getOperation();

        if (operation.equals("BI")) {
            // begin inline image object
            ImageParameters params = operator.getImageParameters();
            PDInlinedImage image = new PDInlinedImage();
            image.setImageParameters(params);
            image.setImageData(operator.getImageData());
            try {
                flushText();
                if (book != null) {
                    book.addBufferedImage(image.createImage());
                }
            } catch (IOException e) {
                // Best effort: a failing image does not abort the whole conversion.
                e.printStackTrace();
            }
        } else if (operation.equals("Do")) {
            // XObject image extraction is disabled; the previous code is kept for reference.
            /*
            COSName objectName = (COSName)arguments.get( 0 );
            Map xobjects = getResources().getXObjects();
            PDXObject xobject = (PDXObject)xobjects.get( objectName.getName() );
            if(xobject instanceof PDXObjectImage) {
                PDXObjectImage image = (PDXObjectImage)xobject;
                try {
                    flushText();
                    book.addBufferedImage(image.getRGBImage());
                } catch (Exception e) {
                    e.printStackTrace();
                }
            } else {
                System.out.println( "Unknown xobject type:" + xobject );
            }
            */
        }
    }

    /**
     * Orders two coordinates numerically. Equal values (including -0.0 and 0.0)
     * compare as equal; if a NaN is involved the order defined by
     * Float.compare is used so that the ordering stays consistent.
     *
     * @param a The first coordinate.
     * @param b The second coordinate.
     * @return A negative value, zero or a positive value.
     */
    private static int compareCoordinate(float a, float b) {
        if (a < b) {
            return -1;
        }
        if (a > b) {
            return 1;
        }
        if (a == b) {
            return 0;
        }
        // At least one operand is NaN.
        return Float.compare(a, b);
    }

    /**
     * Wraps a TextPosition so that it can be kept in a sorted set, ordered by
     * y and then by x. Two wrappers at the same (x, y) compare as equal, so a
     * sorted set keeps only one of them. The ordering is not consistent with
     * equals, which is not overridden.
     */
    class TextWrapper implements Comparable {

        TextPosition text;
        float x;
        float y;

        TextWrapper(TextPosition aText) {
            text = aText;
            x = text.getX();
            y = text.getY();
        }

        /**
         * Implements Comparable. We need to sort by text.y and then by text.x.
         * 0,0 is in the upper left.
         *
         * @param o The other TextWrapper.
         * @return A negative value, zero or a positive value as this text comes
         *         before, at the same place as, or after the other text.
         */
        public int compareTo(Object o) {
            final TextWrapper other = (TextWrapper) o;

            int byRow = compareCoordinate(y, other.y);
            if (byRow != 0) {
                return byRow;
            }
            return compareCoordinate(x, other.x);
        }
    }

    /**
     * Parses the PDF named by the first argument without a book, which dumps the
     * extracted text to System.out.
     *
     * @param args The command line arguments; args[0] is the PDF file name.
     *
     * @throws Exception If parsing fails or no file name was given.
     */
    public static void main(String[] args)
        throws Exception
    {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: java PageParser <pdf-file>");
        }

        PageParser parser = new PageParser(null, args[0]);
        try {
            parser.parsePages();
        } finally {
            parser.closeDocument();
        }
    }
}
