# bd2epub - converts a Book Designer ".html0" export into an EPUB archive.
#
# Usage (per the script's own usage text): bd2epub.exe filename
# Options parsed by GetOptions: --jpeg, --author=STR, --title=STR,
# --cover=FILE, --keepBuild.
#
# Processing steps, in the order the statements appear:
#   1. Take the input path from the first or, failing that, the last
#      command-line argument (whichever exists on disk). The suffix must be
#      ".html0" (compared case-insensitively); otherwise print usage and exit.
#   2. chdir into the input file's directory, then slurp the file into
#      $contents.
#   3. Rewrite the markup in $contents with a long series of s/// substitutions
#      (align= attributes -> class="...", B/I/U/SUP/FONT/SPAN handling,
#      IMG alt text, XHTML prologue, heading clean-up).
#   4. Split $contents on "<hr/>" and write each piece to
#      epubBuild/<filename>-title.html (first piece) or
#      epubBuild/<filename>-<n>.html. For every piece after the first, the
#      text of its <h2 class="chapterTitle"> (or " - " when there is none)
#      is pushed onto @chapterNames.
#   5. When --title / --author were not given, fall back to the text found
#      after the BookTitle / BookAuthor headings.
#   6. Write epubBuild/mimetype, META-INF/container.xml, content.opf and
#      toc.ncx; a cover page is added when "<title>.jpg" exists.
#   7. Copy *.jpg, *.png, *.gif and page-template.xpgt (from the script's own
#      directory) into epubBuild, delete "<title>.thumb.jpg" if present, run
#      zip to create "<title>.epub", then remove epubBuild unless --keepBuild.
#
# NOTE(review): this copy appears to have lost its original line breaks and
#   some markup inside regex patterns / the file read ("$contents = ;",
#   "s///sig", "s//.../sg"). As stored, everything after the first "#" on a
#   physical line is a comment, so the file does not look runnable as is.
#   Restore it from a pristine copy rather than guessing the missing patterns;
#   note that an empty pattern in s/// reuses the last successful match.
# NOTE(review): chapter files are written using $filename, but the OPF/NCX
#   hrefs are built from $title - these only agree when the two are equal.
# NOTE(review): dc:identifier and the <guide> hrefs hard-code "The Stand".
# NOTE(review): $fileCount is compared with the string operators eq / gt.
# NOTE(review): output files are opened with 2-arg bareword open and the
#   result is never checked.
# NOTE(review): "> nul" and the backslash paths look Windows-specific.
# NOTE(review): $jpegParam, $coverFile and makeURLID are not used anywhere in
#   the visible code.
# NOTE(review): the input is opened via $inFileName after the chdir;
#   presumably a relative path with a directory part then fails (see the
#   original TODO about relative paths) - confirm.
##TODO - should use relative paths use File::Basename; use Getopt::Long; use File::NCopy qw(copy); use Cwd; use File::Path; sub makeURLID { my $url = $_[0]; $url =~ s/([^A-Za-z0-9])//seg; return $url; } sub getXMLDate() { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime; $year = $year + 1900; $mon = $mon + 1; $result = $year."-"; if ($mon < 10) {$result .= "0";} $result .= "$mon"."-"; if ($mday < 10) {$result .= "0";} $result .= "$mday"; return $result; } my $jpegParam = 0; my $author = ""; my $title = ""; my $keepBuild = 0; my ($filename, $directories, $suffix); my $fileCount = 0; my @chapters; my @chapterNames; GetOptions ( "jpeg" => \$jpegParam, "author=s" => \$author, "title=s" => \$title, "cover=s" => \$coverFile, "keepBuild" => \$keepBuild); ## let's see if the arguments look right if (-e $ARGV[0]) { $inFileName = $ARGV[0]; } elsif (-e $ARGV[$#ARGV]) { $inFileName = $ARGV[$#ARGV]; } $invalidFile = 0; if (-e $inFileName) { ($filename, $directories, $suffix) = fileparse($inFileName, qr/\.[^.]*/); } else { $invalidFile = 1; } if (($invalidFile == 1) || (lc($suffix) ne ".html0")) { print "usage:\nbd2epub.exe filename"; print "\n\nwhere filename is the name of the html0 book designer file.\nNote that any file names that contain spaces will need to be quoted or escaped properly."; exit; } chdir $directories; $outFileName = $directories."/".$filename."-epub.html"; ## Open the file open (IN, $inFileName) || die $!; $parseHold = $/; $/ = undef; $contents = ; close (IN); $/ = $parseHold; # if (($contents =~ m/(<[^>]*>)*([^<]*)(<[^>]*>)*\n<\/SPAN>/

$2<\/h2>/sig; $contents =~ s/(<[^>]*>)*([^<]*)(<[^>]*>)*\n<\/SPAN>/

$2<\/h3>/sig; $contents =~ s/
/<\/div>/sg; $contents =~ s//—/sg; $contents =~ s/ align=justify/ class="justify"/sig; $contents =~ s/ align=center/ class="center"/sig; $contents =~ s/ align=right/ class="right"/sig; $contents =~ s///sig; $contents =~ s/<\/B>/<\/strong>/sig; $contents =~ s///sig; $contents =~ s//“/sig; $contents =~ s//”/sig; $contents =~ s//‘/sig; $contents =~ s//’/sig; $contents =~ s//…/sig; $contents =~ s/
/<\/body>\n<\/html>\n\n\n\n<\/title>\n<\/head><body>/sig; $contents =~ s/<\/I>/<\/em>/sig; $contents =~ s/<HTML>/<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE html PUBLIC "-\/\/W3C\/\/DTD XHTML 1.1\/\/EN" "http:\/\/www.w3.org\/TR\/xhtml11\/DTD\/xhtml11.dtd">\n<html xmlns="http:\/\/www.w3.org\/1999\/xhtml" xml:lang="en">\n<head>\n<title><\/title>\n<\/head>/sig; $contents =~ s/<meta content="text\/html; charset=windows-1252" http-equiv="Content-Type">\n//sig; $contents =~ s/<BODY style="FONT-WEIGHT:normal; FONT-SIZE:small; MARGIN-BOTTOM:245px; LINE-HEIGHT:normal; FONT-FAMILY:Times New Roman" text=#000000 bgColor=#ffffff>/<body>/sig; # $contents =~ s/<HTML>\n//sig; # $contents =~ s/<meta content="text\/html; charset=windows-1252" http-equiv="Content-Type">//sig; # $contents =~ s/<BODY style="FONT-WEIGHT:normal; FONT-SIZE:small; MARGIN-BOTTOM:245px; LINE-HEIGHT:normal; FONT-FAMILY:Times New Roman" text=#000000 bgColor=#ffffff>\n//sig; $contents =~ s/<\/BODY>/<\/body>/sg; $contents =~ s/<\/HTML>/<\/html>/sg; $contents =~ s/<H2 id=/<h2 class=/sig; $contents =~ s/<\/H(\d)>/<\/h$1>/sig; $contents =~ s/class=BookTitle class="center"/class="BookTitle"/sig; $contents =~ s/class=BookAuthor class="center"/class="BookAuthor"/sig; $contents =~ s/color=(#[^>\s]*)/color="$1"/sig; #this may not work? Test css $contents =~ s/<SPAN id=cite>/<div class="cite">/sig; $contents =~ s/<\/SPAN>/<\/div>/sig; $contents =~ s/<IMG([^>]*)/<img$1 alt="illustration"\//sig; #todo - eliminate font??? 
or just change to proper format/css #$contents =~ s/<FONT/<font/sig; #$contents =~ s/<\/FONT/<\/font/sig; $contents =~ s/<FONT[^>]*>//sig; $contents =~ s/<\/FONT>//sig; $contents =~ s/<U>/<span class="underline">/sig; $contents =~ s/<\/U>/<\/span>/sig; $contents =~ s/<SUP>/<sup>/sig; $contents =~ s/<\/SUP>/<\/sup>/sig; $contents =~ s/<SPAN id=([^>]*)>/<div class="$1">/sig; # (<h2 id=(?!BookTitle\b)(?!"BookTitle"\b))"*([^">]*)"*(.*) # $contents =~ s/<SPAN id=title>(<[^>]*>)*([^<]*)(<[^>]*>)*\n<\/SPAN>/<h2 id=\"$2\">$2<\/h2>/sig; # $contents =~ s/(<h2 id=(?!BookTitle\b)(?!"BookTitle"\b))"*([^">]*)"*(.*)/$1$3/sig; #open FOUT, ">$outFileName"; #print FOUT $contents; #close(FOUT); @chapters = split/<hr\/>/, $contents; mkdir "epubBuild"; foreach $chapter (@chapters) { if ($fileCount eq 0) { $chapterFile = "$directories/epubBuild/$filename-title.html"; } else { $chapterFile = "$directories/epubBuild/$filename-$fileCount.html"; } open FOUT, ">$chapterFile"; print FOUT $chapter; close FOUT; if ($fileCount gt 0) { if ($chapter =~ /<h2 class=\"chapterTitle\">([^<]*)<\/h2>/i) { push @chapterNames, $1; } else { push @chapterNames, " - "; } } $fileCount++; } if ($contents =~ /<h2 class="BookTitle">[^>]*>(<[^>]*>)*([^<]*)/i) { $discoveredTitle = $2; } if ($contents =~ /<h2 class="BookAuthor"[^>]*>(<[^>]*>)*([^<]*)/i) { $discoveredAuthor = $2; } if (!$author) { $author = $discoveredAuthor; } if (!$title) { $title = $discoveredTitle; } open MF, ">epubBuild/mimetype"; print MF "application/epub+zip"; close MF; mkdir "epubBuild/META-INF"; open CON, ">epubBuild/META-INF/container.xml"; print CON "<?xml version=\"1.0\"?>\n"; print CON "<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\n<rootfiles>\n"; print CON "<rootfile full-path=\"content.opf\" media-type=\"application/oebps-package+xml\"/>\n</rootfiles>\n</container>"; close CON; open OPF, ">epubBuild/content.opf"; print OPF "<?xml version=\"1.0\"?>\n<package 
xmlns=\"http:\/\/www.idpf.org\/2007\/opf\" unique-identifier=\"dcidid\" version=\"2.0\">\n"; print OPF "<metadata xmlns:dc=\"http:\/\/purl.org\/dc\/elements\/1.1\/\" "; print OPF "xmlns:dcterms=\"http:\/\/purl.org\/dc\/terms\/\"\n"; print OPF "xmlns:xsi=\"http:\/\/www.w3.org\/2001\/XMLSchema-instance\"\n"; print OPF "xmlns:opf=\"http:\/\/www.idpf.org\/2007\/opf\">\n"; print OPF "<dc:title>$title</dc:title>\n"; print OPF "<dc:language xsi:type=\"dcterms:RFC3066\">en</dc:language>\n"; print OPF "<dc:identifier id=\"dcidid\" opf:scheme=\"URI\">http://privacy.org/The Stand.epub</dc:identifier>\n"; print OPF "<dc:creator>$author</dc:creator>\n"; print OPF "<dc:date xsi:type=\"dcterms:W3CDTF\">".getXMLDate."</dc:date>\n"; print OPF "</metadata>\n"; print OPF "\n"; print OPF "<manifest>\n"; print OPF "<item id=\"ncx\" href=\"toc.ncx\" media-type=\"application/x-dtbncx+xml\" />\n"; print OPF "<item id=\"pt\" href=\"page-template.xpgt\" media-type=\"application/vnd.adobe-page-template+xml\" />\n"; if (-e "$title.jpg") { print OPF "<item id=\"cover\" href=\"$title-cover.html\" media-type=\"application/xhtml+xml\" />\n"; open CVR, ">epubBuild/$title-cover.html"; print CVR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-\/\/W3C\/\/DTD XHTML 1.1\/\/EN\" \"http:\/\/www.w3.org\/TR\/xhtml11\/DTD\/xhtml11.dtd\">\n<html xmlns=\"http:\/\/www.w3.org\/1999\/xhtml\" xml:lang=\"en\"><head>\n<title><\/title>\n<\/head><body>"; print CVR "<p align=\"center\"><img src=\"$title.jpg\" alt=\"cover image\"\/></p>\n"; print CVR "</body></html>"; close CVR; } print OPF "<item id=\"title\" href=\"$title-title.html\" media-type=\"application/xhtml+xml\" />\n"; open NCX, ">epubBuild/toc.ncx"; print NCX "<?xml version=\"1.0\"?>\n"; print NCX "<!DOCTYPE ncx PUBLIC \"-//NISO//DTD ncx 2005-1//EN\" \"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd\">\n"; print NCX "<ncx xmlns=\"http://www.daisy.org/z3986/2005/ncx/\" version=\"2005-1\">\n"; print NCX "<head>\n"; print NCX "<meta 
name=\"dtb:uid\" content=\"http://privacy.org/$title.epub\"/>\n"; print NCX "<meta name=\"dtb:depth\" content=\"1\"/>\n"; print NCX "<meta name=\"dtb:totalPageCount\" content=\"0\"/>\n"; print NCX "<meta name=\"dtb:maxPageNumber\" content=\"0\"/>\n"; print NCX "</head>\n"; print NCX "<docTitle>\n"; print NCX "<text>$title</text>\n"; print NCX "</docTitle>\n"; print NCX "<navMap>\n"; if (-e "$title.jpg") { print NCX "<navPoint id=\"navPoint-0\" playOrder=\"0\">\n"; print NCX "<navLabel>\n"; print NCX "<text>Cover Page</text>\n"; print NCX "</navLabel>\n"; print NCX "<content src=\"$title-cover.html\"/>\n"; print NCX "</navPoint>\n"; } print NCX "<navPoint id=\"navPoint-1\" playOrder=\"1\">\n"; print NCX "<navLabel>\n"; print NCX "<text>Title Page</text>\n"; print NCX "</navLabel>\n"; print NCX "<content src=\"$title-title.html\"/>\n"; print NCX "</navPoint>\n"; $ii = 1; foreach $name (@chapterNames) { my $altIndex = $ii + 1; print OPF "<item id=\"content-$ii\" href=\"$title-$ii.html\" media-type=\"application/xhtml+xml\" />\n"; print NCX "<navPoint id=\"navPoint-$altIndex\" playOrder=\"$altIndex\">\n"; print NCX "<navLabel>\n"; print NCX "<text>$name</text>\n"; print NCX "</navLabel>\n"; print NCX "<content src=\"$title-$ii.html\"/>\n"; print NCX "</navPoint>\n"; $ii++; } print NCX "</navMap>\n\n</ncx>"; close NCX; print OPF "</manifest>\n\n"; print OPF "<spine toc=\"ncx\">\n"; if (-e "$title.jpg") { print OPF "<itemref idref=\"cover\" />\n" } print OPF "<itemref idref=\"title\" />\n"; $ii = 1; foreach $name (@chapterNames) { print OPF "<itemref idref=\"content-$ii\" />"; $ii++; } print OPF "</spine>\n\n<guide>\n"; print OPF "<reference type=\"title-page\" title=\"Title Page\" href=\"$title-title.html\" />\n"; $ii = 1; foreach $name (@chapterNames) { print OPF "<reference type=\"text\" title=\"$name\" href=\"The Stand-$ii.html\" />\n"; $ii++; } print OPF "</guide>\n</package>\n"; close OPF; # re-open to build TOC, etc #open (IN, $inFileName) || die $!; #@contents = 
<IN>; #close (IN); #open FOUT, ">$directories/toc.html"; #print FOUT "<html>\n<body>\n<h2 align='center'>TABLE OF CONTENTS</h2>\n"; copy "*.jpg", "epubBuild/."; copy "*.png", "epubBuild/."; copy "*.gif", "epubBuild/."; ($tmp1, $currScriptDir, $tmp2) = fileparse($0, qr/\.[^.]*/); copy $currScriptDir."page-template.xpgt", "epubBuild/"; if (-e "epubBuild\\$title.thumb.jpg") { unlink "epubBuild\\$title.thumb.jpg"; } chdir "epubBuild"; system "zip -Xr9D \"..\\$title.epub\" mimetype * > nul"; chdir "..\/"; if (!$keepBuild) { rmtree("epubBuild", 0, 1); } print "done processing $title by $author \n";