#!/usr/bin/env perl

#    Mobi2IMP, Copyright (C) 2008,2009 Nick Rapallo, nrapallo@yahoo.ca
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#    mobi2html, base code (C) 2008 Tommy Persson, tpe@ida.liu.se
#    The additional source code used to build the binary is available at
#    <http://www.ida.liu.se/~tompe/mobiperl/>, the MobiPerl repository.

use Encode;

use FindBin qw($RealBin);
use lib "$RealBin";

use HTML::TreeBuilder;
use Palm::PDB;
use Palm::Doc;
use Image::Size;
use Date::Parse;
use Date::Format;
use Getopt::Mixed;

use MobiPerl::EXTH;
use MobiPerl::Util;
use MobiPerl::MobiHeader;

use GD::Image;
use File::Basename;

use strict;

use vars qw ($opt_rawhtml $opt_record0 $opt_saveallrecords $opt_mobihtml $opt_explodedir
             $opt_1200 $opt_1150 $opt_1100 $opt_oeb $opt_epub $opt_verbose $opt_debug $opt_out 
             $opt_BDbig $opt_BDnewpage $opt_BDremove $opt_noBRfix $opt_bgcolor
             $opt_nomargins $opt_LRmargins $opt_noimagefix $opt_nomarginfix
             $opt_smallerfont $opt_largerfont $opt_nojustify $opt_nopara
             $opt_indent $opt_header_hr $opt_header_color $opt_cover);

Getopt::Mixed::getOptions ("rawhtml record0 saveallrecords mobihtml explodedir=s
                            1200 1150 1100 oeb epub verbose debug out=s
                            BDbig BDnewpage BDremove noBRfix bgcolor=s
                            nomargins LRmargins=s noimagefix nomarginfix
                            smallerfont largerfont nojustify nopara indent
                            header_hr header_color=s cover");

my $fullpathfilename = shift;

###################################################################
#For .IMP start (version 9.4e - 11 Aug 2009)
#
my $impversion = "version 9.4e";
my $usage = 'Mobi2IMP [options] MobiSource [Category [Authorname [Title]]]';
my $category = shift;
my $author = shift;
my $title = shift;

print "\nMobi2IMP ($impversion) Copyright (C) 2009 Nick Rapallo (nrapallo)\n";

if (not defined $fullpathfilename){
    print "Usage: $usage\n";
    print "\n";
    print "Where [options] are:\n";
    print "   [ --verbose ] printout messages about this conversion\n";
    print "   [ --debug ] printout more detailed messages about conversion\n";
    print " Output related:\n";     
    print "   [ --explodedir DIR ] set 'ExplodeDIR' to use instead of default 'Filename'\n";
    print "   [ --out IMPFILENAME ] set .IMP filename to use (overrides default naming)\n";
    print "   [ --1200 | --1100 | --oeb | --epub ] simultaneously create these versions\n";
    print "   [ --1150 ] conversely do not create the 1150 .IMP version\n";
    print "   [ --cover ] add cover page using the existing cover embedded in MobiSource\n";
    print " Style related:\n";
    print "   [ --LRmargins LR] set left-right margins to LR=2px,5%,8% not default 2%\n";
    print "   [ --nomargins ] set left-right margins to 0% instead of default 2%\n";
    print "   [ --smallerfont ] use 'x-small' font size for body text not 'small'\n";
    print "   [ --largerfont ] use 'medium' font size for body text not 'small'\n";
    print "   [ --nojustify ] no full justification (i.e. left-aligned) not 'justify'\n";
    print "   [ --nopara ] use no paragraph separation not 'blank line' (1em) separation\n";
    print "   [ --indent ] use small (1em) indent instead of no (0em) indent\n";
    print "   [ --bgcolor #FF80FF ] use color #FF80FF as background color for every page\n";
    print "   [ --header_hr ] use header text 'Title Author' followed by <hr>\n";
    print "   [ --header_color #FF80FF ] use color #FF80FF as header background color only\n";
    print " HTML/BD Fixes:\n";
    print "   [ --BDbig ] make BookDesigner notice at the end 'big print' not small\n";
    print "   [ --BDnewpage ] put BookDesigner notice at the end on a newpage\n";
    print "   [ --BDremove ] remove BookDesigner notice at the end\n";
    print "   [ --noBRfix ] when using '--nopara', don't fix (broken) <br />'s issue\n";
    print "   [ --noimagefix ] do not pad/resize images to device's aspect ratio\n";
    print "   [ --noimagefix ] also leave image link locations/blank lines before captions\n";
    print "   [ --nomarginfix ] do not convert Mobipocket-specific width= & height= tags\n";
    print "\n";
    print "Required parameters: MobiSource\n";
    print "   MobiSource is the Mobipocket (.prc/.mobi/.pdb) file to convert to .IMP\n";
    print "      if contains spaces, then surround with quotes (i.e. \"My Source.prc\")\n";
    print "   [now --explodedir is directory where to explode mobi file (left behind!)\n";
    print "      can also be \".\" for current directory]\n";
    print "\n";
    print "Optional parameters: [Category [Authorname [Title]]]\n";
    print "   Are all optional; with same being extracted from source file, if present.\n";
    print "\n";
    print "If 'Build' successful, then default .IMP filename is 'Author - Title.ext'\n";
    print "If 'Build' method fails to create ebook, load .opf and manually 'Build' it!\n";
    print "If resulting .IMP is wrong version in viewer, just re-install eBook Publisher!\n";
    die "\n" unless 0;
}

my ($fnbody,$fnpath,$fnext) = fileparse("$fullpathfilename",'\.\w+');
my ($onbody,$onpath,$onext) = fileparse("$opt_out",'\.\w+') unless not defined $opt_out;
$fnpath = $onpath unless not defined $opt_out;
my $filename = $fnbody . $fnext;
my $explodedir = $fnbody unless defined $opt_explodedir;

chdir $fnpath;
mkdir $explodedir;

my $htmlfile = $filename;
$htmlfile =~ s/\.mobi/.html/;
$htmlfile =~ s/\.prc/.html/;
$htmlfile =~ s/\.pdb/.html/;
$htmlfile =~ s/\.azw/.html/;

#
#For .IMP end
###################################################################

my $globalcodepage = 0;

print "FULLFILENAME: $fullpathfilename\n";
print "FILENAME    : $filename\n";
print "EXPLODEPATH : $fnpath\n";
print "EXPLODEDIR  : $explodedir\n";

my $pdb = new Palm::PDB;
$pdb->Load($fullpathfilename);

my $name = $pdb->{"name"};
my $version = $pdb->{"version"};
my $type = $pdb->{"type"};
my $creator = $pdb->{"creator"};
my $seed = $pdb->{"uniqueIDseed"};
my $ctime = $pdb->{"ctime"};
my $mtime = $pdb->{"mtime"};
my $sctime = ctime ($ctime);
my $smtime = ctime ($mtime);

print "DatabaseName: $name\n";
print "Version     : $version\n";
print "Type        : $type\n";
print "Creator     : $creator\n";
##print "Seed        : $seed\n";
##print "Resdb       : " . $pdb->{"attributes"}{"ResDB"} . "\n";
##print "AppInfoDirty: " . $pdb->{"attributes"}{"AppInfoDirty"} . "\n";
##print "ctime       : $ctime - $sctime\n";
##print "mtime       : $mtime - $smtime\n";
print "baktime     : " . $pdb->{"baktime"} . "\n";

my @records = @{$pdb->{"records"}};
print "# of records: " . $#records . "\n";
print "---------------------------------------------------\n";

my $image_index = 0;
my %image_index_to_filename = ();
my $coverimageid = -1;
my $longtitle = "";

foreach my $r (@records) {
	my $id = $r->{"id"};
	my $cat = $r->{"category"};
	my $offset = $r->{"offset"};
	my $data = $r->{"data"};
	my $size = length ($data);
	my $filename = "record-$id";
	my ($x, $y, $type) = imgsize(\$data);
	if (defined $x) {
	    if (defined $opt_verbose){ print "Image_Index: $image_index - $filename - $cat - $offset - $size - $x x $y\n"; }
	    $image_index++;
	    $image_index_to_filename{$image_index} = "$filename.$type";
	    open DATA, ">$explodedir/$filename.$type";
	    binmode (DATA);
	    print DATA $data;
	    close DATA;
	} else {
	    if (defined $opt_record0 or defined $opt_saveallrecords) {
		open DATA, ">$explodedir/$filename";
		print DATA $data;
		close DATA;
	    }
	}
	if (defined $opt_record0) {
	    exit (0);
	}
}

my $r0 = $records[0];
parse_record_0 ($r0->{"data"});

#my @resources = @{$pdb->{"resources"}};
#print "Number of resources: " . $#resources . "\n";

my $text = $pdb->text;

#{
#    local $/;
#    $text =~ s/\r//g;
#}

if (defined $opt_rawhtml) {
    binmode (STDOUT);
    print $text;
}

# this doesn't work always with \f
#if (substr ($text, 0, 6) ne "<html>" and 0) {  
#    #$text =~ s/\cM//g;                  # Unix line endings
#    $text =~ s/\f/\n/g;                  # Windows form feeds
#    $text =~ s/\n/\x01/gi;               # Collapse lines
#    $text =~ s/\x01\x01/<\/p>\n\n<p>/g;  # Separate paragraphs
#    $text =~ s/\x01/ /g;                 # Insert whitespace
#
#    $text = "<html><head></head><body><p>" . $text . "</p></body></html>";
#}

# this works better than above but sloppy
if (substr ($text, 0, 6) ne "<html>" && substr ($text, 0, 6) ne "<HTML>" ) {                                       
    open TEMPFILE, ">$explodedir/$htmlfile.txt" or die "Cannot create .txt file";
    binmode (TEMPFILE);
    print TEMPFILE $text;
    close TEMPFILE;

    open TEMPFILE, "$explodedir/$htmlfile.txt" or die "Cannot open .txt file";
    my @lines = <TEMPFILE>;               # Read it into an array
    close TEMPFILE;                       # Close the file

    #unlink "$explodedir/$htmlfile.txt";

    $text = "<html><head></head>\n<body>\n" ;

    my $line;
    foreach $line (@lines) {              # assign @lines to $line, one at a time
       $text .= "<p>" . $line . "</p>" unless (length($line) == 1);  
    }

    $text .= "\n</body>\n</html>\n";

}

my %fileposmap;

print "---------------------------------------------------\n";
print "Found $image_index images\n";
print "Looking for filepos\n";
my $cp = 0;
my $fpcount = 0;
my $len = length ($text);
while ($cp < $len) {
    my $s = substr ($text, $cp, 50);
    if (substr ($s, 0, 7) eq "filepos") {
	if ($s =~ /^filepos=(\d+)/) {
	    $fpcount++;
	    if (defined $opt_debug){ print "FILEPOS $fpcount: $cp - $1\n"; }
	    $fileposmap{$1} = 1;
	}
	if ($s =~ /^filepos=\"(\d+)\"/) {
	    $fpcount++;
	    if (defined $opt_debug){ print "FILEPOS $fpcount: $cp - $1\n"; }
	    $fileposmap{$1} = 1;
	}
    }
    $cp++;
}
print "Found $fpcount filepos\n";

my $offset = 0;
my $nacount = 0;

print "Adding name attributes\n";
foreach my $pos (sort keys %fileposmap) {
    $nacount++;
    if (defined $opt_debug){ print "NAMEPOS $nacount: $pos\n"; }
    my $a = substr ($text, $pos+$offset, 2);
    if (substr ($a, 0, 1) eq "<" and substr ($a, 1, 1) ne "</") {
	substr ($text, $pos+$offset, 2, "<a name=\"" . $pos . "\"></a>$a");
	$offset += (15 + length ($pos));
	next;
    }
###################################################################
#For .IMP start - Kludge mainly for UTF-8 encoded .prc files (ignore warning)
#
my $donefix = 0;
my $ii = 0;
my $b;
#be aggressive in finding a "home" for our link (filepos) despite bad placement!
while (not $donefix) {
    $ii++;  
    $b = substr ($text, $pos+$offset+$ii, 2);
    if (substr ($b, 0, 1) eq "<" and substr ($b, 1, 1) ne "/") {
        # Put an empty anchor before start of any HTML tag i.e. "<"
        substr ($text, $pos+$offset+$ii, 2, "<a name=\"" . $pos . "\"></a>$b");
        #$b = substr ($text, $pos+$offset, 2+$ii);
        $b = substr ($text, $pos+$offset, 8);
        $donefix = 1;
        $offset += (15 + length ($pos));
        if (defined $opt_verbose || defined $opt_debug){ 
            print "FIXED $nacount: $pos ($ii) - Wasn't an anchor: $b\n";
        }
    }
    if (($ii+1) % 20 == 0 and defined $opt_debug) { print "."; }
    if ($ii > 256) {
        $donefix = 1;
    }
}
if ($ii < 256) {
    next;
}
if (defined $opt_verbose || defined $opt_debug){ 
    substr ($text, $pos+$offset, 2, "<a name=\"" . $pos . "\"></a>$a");
    $offset += (15 + length ($pos));
    print "WARNING $nacount: $pos - Not an anchor: $a\n";
}
#
#For .IMP end
###################################################################
}

my $tree = new HTML::TreeBuilder ();
$tree->ignore_unknown (0);
if ($globalcodepage == 1252 or $globalcodepage == 0) {
    $tree->parse ($text); # seems to generate the corrrect entities...
}
if ($globalcodepage == 65001) {
    print STDERR "UTF-8 detected: converting before TreeBuilder parse\n";
    $tree->parse (decode_utf8 $text); # seems to generate the corrrect entities...
}
$tree->eof ();

#my $tree = HTML::TreeBuilder->new_from_content ($text);

fix_filepos_attributes ($tree);
fix_image_tags ($tree);

open HTML, ">$explodedir/$htmlfile" or die "Could not open file $explodedir/$htmlfile";

my $html = $tree->as_HTML(undef,undef,{});    # does not strip </p>, </li> 

my $booktitle = $name; # mobi provided name for ebook
if ($longtitle) { $booktitle = $longtitle; } # mobi provided long title for ebook
$booktitle = $title unless not defined $title;
$category = "Converted Mobipocket" unless defined $category;
$author = "Mobipocket" unless defined $author;

$booktitle =~ s/[\/?<>\\:\*\|"_]/ /g; # remove characters that are invalid for filenames
#$category =~ s/[\/?<>\\:\*\|"]/ /g; # remove characters that are invalid 
$author =~ s/[\/?<>\\:\*\|"_]/ /g; # remove characters that are invalid for filenames

my $bookname = $author . " - " . $booktitle;
if (defined $opt_out) {
    if ($onbody ne "Default naming (Author - Title.ext)") {
        $bookname = $onbody;
    }
}

if (not defined $opt_mobihtml) {
###################################################################
#For .IMP start
#
#    $html =~ s/<mbp:pagebreak\s*\//<br style=\"page-break-after:always\" \//g;
#    $html =~ s/<mbp:pagebreak\s*/<br style=\"page-break-after:always\" \//g;
#
#For .IMP end
###################################################################
#    $html =~ s/<mbp:pagebreak>/<br style=\"page-break-after:always\"\/>/g;
#    $html =~ s/<mbp:pagebreak>//g;
$html =~ s/<\/mbp:pagebreak>//gi;
$html =~ s/<\/mbpagebreak>//gi;
$html =~ s/<guide>.*?<\/guide>//gi;
$html =~ s/<mbp:nu>//gi;
$html =~ s/<\/mbp:nu>//gi;
$html =~ s/<mbp:section>//gi;
$html =~ s/<\/mbp:section>//gi;
$html =~ s/<mbp:frameset>//gi;
$html =~ s/<\/mbp:frameset>//gi;
$html =~ s/<mbp:slave-frame>//gi;
$html =~ s/<\/mbp:slave-frame>//gi;
$html =~ s/\/div>/\/div>\n/gi; 

if ($globalcodepage == 1252) {
    $html =~ s/<head>/<head><meta http-equiv="Content-Type" content="text\/html\; charset=windows-1252" \/>/i;
}
if ($globalcodepage == 65001) {
    $html =~ s/<head>/<head><meta http-equiv="Content-Type" content="text\/html\; charset=UTF-8" \/>/i;
}

###################################################################
#For .IMP start
#
my $margintopfix = "";
if (not defined $opt_nomarginfix) {
    $margintopfix = "<ul><\/ul>\n";
}
my $headerhr = "\n<header><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" width=\"100%\"><tr>\n";
$headerhr .= "<td align=\"left\" style=\"font-family:smallfont\"><small>" . $booktitle . "</small></td>\n";
$headerhr .= "<td align=\"right\" style=\"font-family:smallfont\"><small>" . $author . "</small></td></tr></table><hr></header>\n";

my $headercolor = "\n<header><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" width=\"100%\"><tr>\n";
$headercolor .= "<td align=\"left\" style=\"font-family:smallfont\" bgcolor=\"" . $opt_header_color . "\"><small>" . $booktitle . "</small></td>\n"; 
$headercolor .= "<td align=\"right\" style=\"font-family:smallfont\" bgcolor=\"" . $opt_header_color . "\"><small>" . $author . "</small></td></tr></table></header>\n";

if (defined $opt_bgcolor) {
    $html =~ s/<body([^>])*>/\n<body bgcolor=$opt_bgcolor>\n$margintopfix/i;  #remove .mobi defaults in <body> and insert bgcolor
} else {
    $html =~ s/<body([^>])*>/\n<body>\n$margintopfix/i;                       #remove .mobi defaults in <body> (added <ul></ul> kludge to fix "margin-top" on title pages)
}

if (defined $opt_header_hr) {
        $html =~ s/<body([^>])*>/<body$1>\n$headerhr/i;          #remove .mobi defaults in <body> and insert header-hr
} elsif (defined $opt_header_color) {
        $html =~ s/<body([^>])*>/<body$1>\n$headercolor/i;       #remove .mobi defaults in <body> and insert header-color
}

if (defined $opt_nopara and not defined $opt_noBRfix) {
    $html =~ s/<br([^>])*><div/<br \/><br \/><div/gi;            #force <br /> to work in Ebook Publisher
}

if (defined $opt_indent) {
    #indent (~2 characters)
    if (defined $opt_nopara) {
        $html =~ s/<\/head>/<style type="text\/css">p {text-indent:1em; margin-top:0em; margin-bottom:0em} header {display:none; display:oeb-page-head}<\/style><\/head>/i;   #nopara separation (--nopara)
    } else {
        $html =~ s/<\/head>/<style type="text\/css">p {text-indent:1em; padding-top:0em; padding-bottom:1em} header {display:none; display:oeb-page-head}<\/style><\/head>/i;   #para separation (default)
    }
} else {
    #noindent (default)
    if (defined $opt_nopara) {
        $html =~ s/<\/head>/<style type="text\/css">p {text-indent:0em; margin-top:0em; margin-bottom:0em} header {display:none; display:oeb-page-head}<\/style><\/head>/i;   #nopara separation (--nopara)
    } else {
        $html =~ s/<\/head>/<style type="text\/css">p {text-indent:0em; padding-top:0em; padding-bottom:1em} header {display:none; display:oeb-page-head}<\/style><\/head>/i;   #para separation (default)
    }
}

my $LRmargins = "2%";
if (defined $opt_LRmargins) { $LRmargins = $opt_LRmargins; }
if (defined $opt_nomargins) { $LRmargins = "0%"; }
if (defined $opt_nojustify) {
    #nojustify body text (left-align)
    if (defined $opt_smallerfont) {
        $html =~ s/<body/<body style="margin-left:$LRmargins; margin-right:$LRmargins; font-size:x-small; text-align:left"/i; # add small margins and left-align text
    } elsif (defined $opt_largerfont) {
        $html =~ s/<body/<body style="margin-left:$LRmargins; margin-right:$LRmargins; font-size:medium; text-align:left"/i; # add small margins and left-align text
    } else {
        $html =~ s/<body/<body style="margin-left:$LRmargins; margin-right:$LRmargins; text-align:left"/i; # add small margins and left-align text
    }
} else {
    #justify body text (default)
    if (defined $opt_smallerfont) {
        $html =~ s/<body/<body style="margin-left:$LRmargins; margin-right:$LRmargins; font-size:x-small; text-align:justify"/i; # add small margins and justified text
    } elsif (defined $opt_largerfont) {
        $html =~ s/<body/<body style="margin-left:$LRmargins; margin-right:$LRmargins; font-size:large; text-align:justify"/i; # add small margins and justified text
    } else {
        $html =~ s/<body/<body style="margin-left:$LRmargins; margin-right:$LRmargins; text-align:justify"/i; # add small margins and justified text
    }
}

# make BD notice small text
if (not defined $opt_BDbig) {
    $html =~ s/<div align="center">This file was created with(<br([^>])*>)*(\s)*BookDesigner program/<font size=1><div align="center">This file was created with BookDesigner program/i;
}
# put BD notice on new page and in small text if specified
if (defined $opt_BDnewpage) {
    $html =~ s/<div align="center">This file was created with(<br([^>])*>)*(\s)*BookDesigner program/<p style="page-break-before:always"><div align="center">This file was created with BookDesigner program/i;
}
# remove BD notice!
if (defined $opt_BDremove) {
    $html =~ s/<div align="center">This file was created with(<br([^>])*>)*(\s)*BookDesigner program(.+)*/<\/body><\/html>/is;
}

$html =~ s/<mbp:pagebreak/<p style="page-break-before:always"/gi; # insert proper page-breaks
$html =~ s/<mbpagebreak/<p style="page-break-before:always"/gi;   # insert proper page-breaks
$html =~ s/<img align="baseline"/<img/gi;                         # remove the troublesome baseline keyword
$html =~ s/(<br \/><br \/>)+<div align="center"><img/<br \/><div align="center"><p align="center"><img/gi;  # only allow one <br> before an image to avoid after a page-break   
$html =~ s/<div align="center"><img/<div align="center"><p align="center"><img/gi;                          # kludge to get eBook Publisher to center images   

# relocate HarryT's links to illustrations and remove any blank lines between the image and caption!
if (not defined $opt_noimagefix) {
    $html =~ s/<div align="center">(\s)*<p align="center"><img (([^>])*)><\/div>(\s)*(<div>(&nbsp;)*<\/div>)*(\s*)(<br \/>)*<div align="center"><i>(<a name=([^>])*>)/<div align="center">$1<p align="center">$9<img $2><\/div><div align="center"><i>/gi;
}

#fix up feedbooks.com news feeds quirks 
$html =~ s/<br(\s)*\\>/<br \/>/gi;                                # fix ill-coded <br>
$html =~ s/<a href([^>]*)><a name([^>]*)><\/a>/<a name$2><\/a><a href$1>/gi;                   # any <a name> embedded within a <a href> is placed in front now

#fix up HTML number codes that cause .imp creation to fail or have strange characters (this was found by trial and error)
$html =~ s/(\x00|&#0;)//gi;                                       # remove odd insertion of null chars
$html =~ s/(\x10|&#16;)//gi;                                      # char 16 causes the .imp creation to abort abruptly.
$html =~ s/(\x01|&#1;)/ /gi;                                      # chars 1-8,17,18,21,22,23,26,27,30,31 converted to " " char (just to be safe)
$html =~ s/(\x02|&#2;)/ /gi;       
$html =~ s/(\x03|&#3;)/ /gi;       
$html =~ s/(\x04|&#4;)/ /gi;       
$html =~ s/(\x05|&#5;)/ /gi;       
$html =~ s/(\x06|&#6;)/ /gi;       
$html =~ s/(\x07|&#7;)/ /gi;       
$html =~ s/(\x08|&#8;)/ /gi;       
$html =~ s/(\x11|&#17;)/ /gi;
$html =~ s/(\x12|&#18;)/ /gi;
$html =~ s/(\x15|&#21;)/ /gi;
$html =~ s/(\x16|&#22;)/ /gi;
$html =~ s/(\x17|&#23;)/ /gi;
$html =~ s/(\x1a|&#26;)/ /gi;
$html =~ s/(\x1b|&#27;)/ /gi;
$html =~ s/(\x1e|&#30;)/ /gi;
$html =~ s/(\x1f|&#31;)/ /gi;
$html =~ s/(\x13|&#19;)/&ndash;/gi;                               # char 19 should be an "ndash" char
$html =~ s/(\x14|&#20;)/&mdash;/gi;                               # char 20 should be an "mdash" char
$html =~ s/(\x18|&#24;)/&lsquo;/gi;                               # char 24 should be an "left single quote" char
$html =~ s/(\x19|&#25;)/&rsquo;/gi;                               # char 25 should be an "right single quote" char
$html =~ s/(\x1c|&#28;)/&ldquo;/gi;                               # char 28 should be an "left double quote" char
$html =~ s/(\x1d|&#29;)/&rdquo;/gi;                               # char 29 should be an "right double quote" char

#fix up Calibre news feeds quirks 
$html =~ s/&amp;#160;/ /gi;                                       # convert ill-formed &nbsp;
$html =~ s/(<a name[^>]*>(<\/a>)*(\s)*)(<p style="page-break-before\:always">)/$4$3$1$2/gi;    # reorder as <a name> should be after page-break 
$html =~ s/<img .*?(src="[^"]*")[^>]*>/<img $1 \/>/gi;            # strip width, border, height from <img> to avoid squished pictures or even disappearing ones
$html =~ s/<a href="\/">/<a href="">/gi;                          # remove directory slash from missing links as eBook Publisher doesn't like this
$html =~ s/ ="" / /gi;                                            # repair any bad id or classs constructs where nulls were

#fix up blank lines (unwanted) before page-break
$html =~ s/((<div([^>])*>(&nbsp;)*<\/div>)*(\s)*(<br([^>])*>)*(\s)*)*<p style="page-break-before/\n<p style="page-break-before/gi;
$html =~ s/((<br([^>])*>)*(\s)*)*<p style="page-break-before/\n<p style="page-break-before/gi;
            
#convert Mobipocket-specific tags for paragraph/block width and height 
if (not defined $opt_nomarginfix) {
    $html =~ s/<([^ ]*) (.*?)width="([^"]*)"([^>]*)>/<$1 $2style="text-indent:$3"$4>/gi;
    $html =~ s/<([^ ]*) (.*?)height="([^"]*)"([^>]*)>/<$1 $2style="margin-top:$3"$4>/gi;
    $html =~ s/<div style="margin-top:0em"><\/div>//gi;
}

$html =~ s/(<p style="page-break-before:always">(\s)*)*<\/body>/<\/body>/gi;                        #fix up last (unwanted) page-break
$html =~ s/((<br([^>])*>)*(\s)*)*<\/body>/\n<\/body>/gi;         # fix up blank lines (unwanted) at end

$html =~ s/<p/\n<p/gi;                                           # insert newline before '<p' construct
#
#For .IMP end
###################################################################

}

print HTML $html;

###################################################################
#For .IMP start
#

flush HTML;

###################################################################
#
# Adapted by Nick Rapallo (January 2008)
#
# Modified code taken directly from "SBPubX.doc" (installed by the eBook Publisher
# software).  Given a single .html it creates .opf project file for later use as well
# as .IMP for GEB/EBW 1150; can change the latter to REB 1200 or REB 1100 or OEBFF 
# (.oeb or .epub) by passing opt switch for the {BuildTarget} lines below.

use Win32::OLE;
use Win32::OLE qw(EVENTS);
Win32::OLE->Initialize(Win32::OLE::COINIT_APARTMENTTHREADED);

#my $usage='mobi2imp.pl Source.prc [Category [Authorname [Title]]]';

###################################################################
#
# get the interfaces, complain and quit if we cannot
#
# .IMP creation requires eBook Publisher to be installed first!
my $project = Win32::OLE->new("SBPublisher.Project") or
	die ".IMP creation requires eBook Publisher to be installed first!\nUnable to get IProject interface\n";

my $builder = Win32::OLE->new("SBPublisher.Builder") or
	die "Unable to get IBuilder interface\n";

# Setup the event handling.
#
#Win32::OLE->WithEvents($builder, 'EventHandlers');

###################################################################
#
# Create a new project and add our document file with optional cover.
#
$project->ClearAll();
if (-e $fnpath . "cover.htm") {
         print "Adding default cover found in $fnpath\n";
         $project->AddSourceFile("cover.htm");
}
if (defined $opt_cover and create_cover_html()) {
         $project->AddSourceFile("$explodedir/cover_nr.html");
}
$project->AddSourceFile("$explodedir/$htmlfile");
 
###################################################################
#
# Set the various "metadata" items for the publication
#
$project->{AuthorFirstName} = $author;
$project->{BookTitle}       = $booktitle;
$project->{Category}        = $category;
#$project->{ISBN} = $project->CanonicalizeISBN("0448163004 ");
#$project->{BISAC} = "FIC004000";

###################################################################
#
# Now build the output
#
$project->{OutputDirectory} = ".";
$project->{Compress}        = 1;   #True
$project->{Encrypt}         = 0;   #False
$project->{KeepAnchors}     = 1;   #True
$project->{Language}        = "en";
$project->{RequireISBN}     = 0;   #False
$project->{Zoom}            = 2;

if (not defined $opt_1200 or not defined $opt_1150 or defined $opt_1100 or defined $opt_oeb or defined $opt_epub) {
        print "Fixing image aspect ratio for EBW1150 screen\n";
        redo_image_tags ($tree, "1150");
}

###################################################################
#
# Now (default) build the EBW/GEB 1150 (gray HalfVga) .IMP output
#
if (not defined $opt_1150) {
	$project->{BookFileName}    = $bookname;
	$project->Save($bookname . ".opf");

	$project->{BuildTarget}     = 2;

	# Now generate the .IMP output
	$builder->Build($project);
	if (Win32::OLE->LastError() != 0) {
		print "ERROR: Build method failed for EBW 1150.\n";
	} else {
		print "EBW 1150 ebook created!\n";
	}
}

###################################################################
#
# Now (optionally) build the REB 1100 (mono HalfVGA) .RB output
#
if (defined $opt_1100) {
	$project->{BookFileName}    = $bookname;

	$project->{BuildTarget}     = 3;
	
	# Now generate the .RB output
	$builder->Build($project);
	if (Win32::OLE->LastError() != 0) {
		print "ERROR: Build method failed for REB 1100.\n";
	} else {
		print "REB 1100 ebook created!\n";
	}
}

###################################################################
#
# Now (optionally)  build OCF (.epub) output  (with 'OEBFF .oeb' option)
# and (necessarily) build OEBFF (.oeb) output (with 'debug info' option)
#
if (defined $opt_oeb or defined $opt_epub) {
	$project->{BookFileName}    = $bookname;
	
	$project->{BuildTarget}     = 0;

	#Kludge: need .oeb produced for image support within .epub
	if (defined $opt_debug or 1) {    
		# Now generate the OEBFF output
		$builder->GenerateOEBFF($project, 1);
		if (Win32::OLE->LastError() != 0) {
			print "ERROR: GenerateOEBFF method failed for .oeb package.\n";
		} else {
			print "OEBFF (.oeb) ebook created!\n";
		}
	}

	# Now generate the OCF output
	$builder->GenerateOCF($project, 1);
	if (Win32::OLE->LastError() != 0) {
		print "ERROR: GenerateOCF method failed for .epub package.\n";
	} else {
		print "OCF (.epub) ebook created!\n";
	}

	#Kludge: delete .oeb produced if not needed per 'debug info'
	if (not defined $opt_debug) {    	
		unlink $bookname . "oeb";
		print "OEBFF (.oeb) ebook now removed as 'debug info' not selected.\n";
	}
}

###################################################################
#
# Now (optionally) build the REB 1200 (FullVga) .IMP output
#
if (defined $opt_1200) {
        print "Fixing image aspect ratio for REB1200 screen\n";
        redo_image_tags ($tree, "1200");
	$project->{BookFileName}    = $bookname . "_1200";
	$project->Save($bookname . "_1200.opf");
	
	$project->{BuildTarget}     = 1;

	# Now generate the .IMP output
	$builder->Build($project);
	if (Win32::OLE->LastError() != 0) {
		print "ERROR: Build method failed for REB 1200.\n";
	} else {
		print "REB 1200 ebook created!\n";
	}
}

Win32::OLE->Uninitialize();
#
#For .IMP end
###################################################################

#Print some summary info
print "BOOKNAME: $bookname\nTITLE   : $booktitle\nAUTHOR  : $author\nCATEGORY: $category\n";

close HTML;


sub fix_image_tags {
    my $tree = shift;
    my @imgel = $tree->find ("img");
    foreach my $img (@imgel) {
	my $recindex = $img->attr ("recindex");
	my $ind = int ($recindex);
	my $filename = $image_index_to_filename{$ind};
###################################################################
#For .IMP start - fix aspect ratio of .jpg/.png/.gif image files (also fixes non-stnd pics)
#
        #new in version 7 - bugfix for poorly constructed .prc where no image exists (filename=XXXXX)
        if (!$ind) {
            print "ERROR: IMAGE/TAGS not found: $recindex - $ind - $filename (replaced with 'no_image.jpg')\n";
            $img->attr ("recindex", undef);
            $img->attr ("src", "no_image.jpg");
            next;
        }
#
#For .IMP end
###################################################################
	$img->attr ("recindex", undef);
	$img->attr ("src", $filename);
    }
}

sub redo_image_tags {
    my $tree = shift;
    my $dev_type = shift;
    my @imgel = $tree->find ("img");
    my $redoind = 1;
    foreach my $img (@imgel) {
	my $filename = $img->attr ("src");
###################################################################
#For .IMP start - fix aspect ratio of .jpg/.png/.gif image files (also fixes non-stnd pics)
#
        #new in version 7 - bugfix for poorly constructed .prc where no image exists (filename=XXXXX)
        if ($filename eq "no_image.jpg") {
            next;
        }
        fix_image_aspect_ratio($redoind++, $filename, $dev_type);
    }
    if ($coverimageid >= 0) {   
        my $coverimagefilename = $image_index_to_filename{$coverimageid+1};

        # For cover image - fix aspect ratio of .jpg/.png/.gif image files (also fixes non-stnd pics)
        if (defined $opt_verbose || defined $opt_debug){ print "Fixing cover image\n"; }
        fix_image_aspect_ratio($coverimageid+1, $coverimagefilename, $dev_type);
   }
#
#For .IMP end
###################################################################
}

sub fix_image_aspect_ratio { 
    my $fid = shift;
    my $fname = shift;
    my $dev_type = shift;
    $fid -= 1;
    my $imagefile = "$explodedir/$fname";
    my $supportedimage = 0;
    my $MAXASPECT = 0.725;    #  For 1150:  315 x 430 usable space;
    my $MAXHEIGHT = 446;
    if ($dev_type ne "1150") {
        $MAXASPECT = 0.795;   #  For 1200:  468 x 579 usable space
        $MAXHEIGHT = 595;
    }
 
    my $im = $imagefile;
    $im =~ /\.jpe?g$/i and $im = GD::Image->newFromJpeg($imagefile,1) and $supportedimage = 1;
    $im =~ /\.gif$/i   and $im = GD::Image->newFromGif($imagefile) and $supportedimage = 1 and $im->transparent(-1);  #use only non-transparent .gifs
    $im =~ /\.png$/i   and $im = GD::Image->newFromPng($imagefile) and $supportedimage = 1;
    #$im =~ /\.bmp$/i  and next; # not yet implemented in GD, but hopefully not needed here!
 
    if ($supportedimage) {
        my ($width, $height) = $im->getBounds();
    
        my $newwidth = $width;
        my $padwidth = 0;
        if ($height * $MAXASPECT > $width and not defined $opt_noimagefix and $height > $MAXHEIGHT) {
            $newwidth = int ($height * $MAXASPECT);
            $padwidth = int (($newwidth - $width) /2);
        }
        my $target_im = new GD::Image($newwidth,$height,1);
        my $white = $target_im->colorAllocate(255,255,255);
        $target_im->filledRectangle(0,0,$newwidth,$height,$white);
        $target_im->copy($im,$padwidth,0,0,0,$width,$height);

        if (defined $opt_verbose || defined $opt_debug) {
            print "FIXED IMAGE/TAGS: $fid - $fname + $padwidth padding = $newwidth x $height\n";
        }

        open(IMAGE, "> $imagefile");
        binmode(IMAGE);
        if ($imagefile =~ /\.jpe?g$/i){
            print IMAGE $target_im->jpeg(85);
        } elsif ($imagefile =~ /\.gif$/i) {
           print IMAGE $target_im->gif();
        } elsif ($imagefile =~ /\.png$/i) {
           print IMAGE $target_im->png(6);
        } #elsif ($imagefile =~ /\.bmp$/i) {
          #  print IMAGE $target_im->bmp();
          #}
        close(IMAGE);
    }
}

sub fix_filepos_attributes {
    my $tree = shift;
    my @ael = $tree->find ("a");
    print "Fixing filepos attribute\n";
    foreach my $a (@ael) {
	my $filepos = $a->attr ("filepos");
	if ($filepos) {
	    $a->attr ("href", "\#$filepos");
	    $a->attr ("filepos", undef);
	    if (defined $opt_debug){ print "FIX FILEPOS ATTR: $filepos\n"; }
	}
    }
}

sub parse_record_0 {
    my $rec = shift;
    my $palmdocheader = substr ($rec, 0, 16);
    parse_palmdoc_header ($palmdocheader);
    if ($type eq "BOOK" and $creator eq "MOBI") {
	my $mobiheader = substr ($rec, 16);
	parse_mobi_header ($mobiheader);
    }
}

sub parse_palmdoc_header {
    my $data = shift;
    my ($version, $length, $nrecords, $recsize, $unknown) =
	unpack ("nxxNnnN", $data);
    if (defined $opt_verbose || defined $opt_debug){ 
        print "PDHEADER     Version: $version\n";
        print "PDHEADER      Length: $length\n";
        print "PDHEADER    NRecords: $nrecords\n";
        print "PDHEADER     Recsize: $recsize\n";
        print "PDHEADER     Unknown: $unknown\n";
    }
}

sub parse_mobi_header {
    my $data = shift;
    my ($doctype, $length, $type, $codepage, $uniqueid, $ver) =
	unpack ("a4NNNNN", $data);
    my ($exthflg) = unpack ("N", substr ($data, 0x70));
    my $extradataflag = unpack ("n", substr ($data, 242-16));
    if ($ver > 3) {
	my ($ciflg, $ciptr) = unpack ("nn", substr ($data, 0xb0));
	if (defined $opt_verbose || defined $opt_debug) {
	    print "MOBIHEADER     ciflg: $ciflg\n";
	    print "MOBIHEADER     ciptr: $ciptr\n";
            print "MOBIHEADER  xtradata: $extradataflag\n";
            
            $pdb->{multibyteoverlap} = $extradataflag & 1;
}
    }

    my $langcode = MobiPerl::MobiHeader::get_mh_language_code ($data);
    my $typedesc = MobiPerl::MobiHeader::get_booktype_desc ($type);
    my $langdesc = MobiPerl::MobiHeader::get_language_desc ($langcode);
    $longtitle = MobiPerl::MobiHeader::get_extended_title ($data);

    $globalcodepage = $codepage;

    if (defined $opt_verbose || defined $opt_debug){ 
        print "MOBIHEADER   doctype: $doctype\n";
        print "MOBIHEADER    length: $length\n";
        print "MOBIHEADER  booktype: $type - $typedesc\n";
        print "MOBIHEADER     codep: $codepage\n";
        print "MOBIHEADER    uniqid: $uniqueid\n";
        print "MOBIHEADER       ver: $ver\n";
        print "MOBIHEADER   exthflg: $exthflg\n";
        print "MOBIHEADER  codepage: $codepage\n";
        print "MOBIHEADER  language: $langcode - $langdesc\n";
        print "MOBIHEADER LONGTITLE: $longtitle\n"; 
    }

    if ($exthflg & 0x40) {
	my $exth = substr ($data, $length);
	parse_mobi_exth ($exth);
    }
}

sub parse_mobi_exth {
    my $data = shift;
    my ($doctype, $len, $n_items) = unpack ("a4NN", $data);
    print "EXTH doctype: $doctype\n";
    print "EXTH  length: $len\n";
    print "EXTH n_items: $n_items\n";
    my $pos = 12;
    foreach (1..$n_items) {
	my ($id, $size) = unpack ("NN", substr ($data, $pos));
	my $contlen = $size-8;
	my ($id, $size, $content) = unpack ("NNa$contlen", substr ($data, $pos));
	my $hid = sprintf ("%x", $id);
	my $hsize = sprintf ("%x", $size);
	my $desc = MobiPerl::EXTH::get_description ($id);
	if (MobiPerl::EXTH::is_binary_data ($id)) {
	    $content = MobiPerl::Util::iso2hex ($content);
	}
	print "EXTH    item: $id (x$hid) - $desc - $contlen - $content\n";
###################################################################
#For .IMP start
#
	if ($id == 0x64) { $author = $content unless defined $author; } 
	if ($id == 0x69) { $category = $content unless defined $category; } 
	if ($id == 0xC9) {
	    $coverimageid = hex($content);
	print "------> Cover Image_Index: $coverimageid\n";
	}
#
#For .IMP end
###################################################################
	$pos += $size;
    }
}

sub create_cover_html {     
    if ($coverimageid < 0) {   
        return 0;
    } else {
        my $coverimagefilename = $image_index_to_filename{$coverimageid+1};

        open TEMPFILE, ">$explodedir/cover_nr.html" or die "Cannot create cover.html file";
        binmode (TEMPFILE);

        my $coverhtml = "<html><head><style type=\"text/css\">p {text-indent:0em; margin-left:2px; margin-right:2px}</style></head>\n<body>\n";
        $coverhtml .= "<p align=center><center><img src=\"$coverimagefilename\"></center></p>\n</BODY>\n</HTML>\n" ;

        print TEMPFILE $coverhtml;
        close TEMPFILE;
        print "Adding cover image: '$explodedir/$coverimagefilename'\n";
        return 1;
    }
}