I just found a solution to the line-break problem; it turned out to be easier to do with two regexes. Here's the code for the area I changed:
Code:
def wrap_lines(match):
    """Return the replacement text for a wrapped-line match.

    ``match`` must come from a pattern that defines an optional named group
    ``ital`` (a closing ``</i>`` tag in the un-wrap rule that uses this
    callback). The matched line break is collapsed to a single space; when
    the ``ital`` group took part in the match, its text is kept in front of
    that space so the tag is not lost.
    """
    ital = match.group('ital')
    if not ital:
        # Group absent (None) or empty: just join the two lines.
        return ' '
    return ital + ' '
def chap_head(match):
    """Build the ``<h1>`` heading markup for a detected chapter heading.

    ``match`` must come from a pattern that defines the named groups
    ``chap`` (the chapter label) and ``title`` (an optional title that
    follows it). Without a title the heading contains only the label; with
    one, label and title are separated by ``<br/>`` inside the same
    ``<h1>``. In both cases a trailing ``<br/>`` follows the heading.
    """
    chap = match.group('chap')
    title = match.group('title')
    if not title:
        # Title group absent (None) or empty.
        return '<h1>' + chap + '</h1><br/>'
    return '<h1>' + chap + '<br/>' + title + '</h1><br/>'
class PreProcessor(object):
    """Tables of regex clean-up rules for raw HTML.

    Every entry is a ``(compiled_pattern, replacement)`` pair whose
    ``replacement`` is a callable that receives the match object and returns
    the substitute text, i.e. the shape accepted by ``pattern.sub()``.

    NOTE(review): the code that applies these tables is not part of this
    excerpt; the rules look order-dependent, so presumably they are applied
    sequentially in list order -- confirm against the caller.
    ``sanitize_head`` and ``convert_entities`` are defined elsewhere.
    """

    PREPROCESS = [
        # Some HTML generators (e.g. Frontpage) put all sorts of junk into
        # <head>, which messes up lxml; hand the whole element to
        # sanitize_head.
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
         sanitize_head),
        # Convert all entities, since lxml doesn't handle them well
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if/endif tags inserted by MS Word
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         lambda match: ''),
    ]

    # Fix pdftohtml markup
    PDFTOHTML = [
        # Replace <hr> tags with a line break
        (re.compile(r'<hr[^>]*>', re.IGNORECASE), lambda match: '<br />'),
        # Remove page links
        (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),
        # Remove page numbers: a number on its own directly before a break
        (re.compile(r'(?<!\w)\s(\d+)\s?(<br>|<br/>|</p><p>)', re.IGNORECASE), lambda match: ''),
        # Replace <br><br> with <p>
        (re.compile(r'<br[^>]*>\s*<br[^>]*>', re.IGNORECASE), lambda match: '<p>'),
        # Drop a remaining <br> unless the text before it on the same line
        # starts with a tag or is shorter than 40 characters; in those two
        # cases the match is returned unchanged.
        (re.compile(r'(.*)<br[^>]*>', re.IGNORECASE),
         lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                       else match.group(1)),
        # un-wrap wrapped lines - uses two regexes
        # 1) a newline after a lowercase letter, comma or 'I' (optionally
        #    after </i> and/or </p><p>) when the next line starts with a
        #    word character or <i>
        (re.compile(r'(?<=[a-z,I])\s*(?P<ital></i>)?\s*(</p><p>)?\n\r?\s?(?=(<i>)?\w)', re.DOTALL), wrap_lines),
        # 2) a <p>/<br> tag that follows at least 85 characters of text and
        #    is itself followed by a word character
        (re.compile(r'(?<=.{85}[a-z,I])\s*(<p[^>]*>|<br[^>]*>)\s*(?=\w)', re.UNICODE), lambda match: ' '),
        # Add space before italics
        (re.compile(r'<i>'), lambda match: '<i> '),
        # Remove hyphenation
        (re.compile(r'-\n\r?'), lambda match: ''),
        # Remove gray background
        (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),
        # Remove non breaking spaces.
        # u'...' instead of ur'...': same one-character pattern on Python 2,
        # and unlike the ur prefix it also parses on Python 3.3+.
        (re.compile(u'\u00a0'), lambda match : ' '),
        # Detect Chapters.
        # The whitespace skip in front of the chapter label is \s*; it was
        # previously a bare "s*", which only matched literal "s" characters.
        (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?\s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),
        # Have paragraphs show better: a <br> ending a line of 85+ characters
        # becomes a paragraph start
        (re.compile(r'(?<=.{85})<br[^>]*>\n'), lambda match : '<p>\n'),
        # terminate unterminated lines.
        (re.compile(r'(?<!>)\s*\n'), lambda match : '<br/>\n'),
        (re.compile(r'</i>\s*\n'), lambda match : '</i><br/>\n'),
    ]
So far I've only tested this with a single book; I will test it across some more books to see if there are any problems.