MobileRead Forums - View Single Post - Regexes to improve pdf to epub conversion

ldolse · 04-12-2009, 11:15 PM

I just found a solution to the line-break problem; it turned out to be easier to handle with two regexes. Here's the code for the area I changed:

Code:

def wrap_lines(match):
    """Return the replacement text for a matched wrapped-line break.

    The result is a single space. When the ``ital`` named group captured
    text, that text is kept and placed in front of the space.
    """
    closing_tag = match.group('ital')
    return closing_tag + ' ' if closing_tag else ' '

def chap_head(match):
    """Return ``<h1>`` markup for a matched chapter heading.

    The ``chap`` named group supplies the heading text. When the ``title``
    named group captured text, it is appended inside the same ``<h1>`` after
    a ``<br/>``. The heading is always followed by a trailing ``<br/>``.
    """
    heading = match.group('chap')
    subtitle = match.group('title')
    if subtitle:
        heading = heading + '<br/>' + subtitle
    return '<h1>' + heading + '</h1><br/>'
    
    
class PreProcessor(object):
    """Regex rule tables for cleaning up HTML markup.

    Each table is a list of ``(compiled_pattern, replacement)`` pairs, where
    ``replacement`` is a callable that takes a match object and returns the
    replacement string.
    NOTE(review): the code that applies these tables is not in this snippet;
    presumably each pair is passed to ``pattern.sub(replacement, html)`` in
    list order -- confirm against the caller.
    """

    # General clean-up rules.
    PREPROCESS = [
                  # Some HTML generators (Frontpage, for one) put all sorts
                  # of junk into <head>, which messes up lxml.
                  (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
                   sanitize_head),
                  # Convert all entities, since lxml doesn't handle them well
                  (re.compile(r'&(\S+?);'), convert_entities),
                  # Remove the <![if/endif tags inserted by MS Word
                  (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
                   lambda match: ''),
                  ]

    # Fix pdftohtml markup
    PDFTOHTML  = [
                  # Replace <hr> tags with line breaks
                  (re.compile(r'<hr[^>]*>', re.IGNORECASE), lambda match: '<br />'),

                  # Remove page links
                  (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),

                  # Remove page numbers: a whitespace-preceded run of digits
                  # directly followed by a line or paragraph break
                  (re.compile(r'(?<!\w)\s(\d+)\s?(<br>|<br/>|</p><p>)', re.IGNORECASE), lambda match: ''),
                  # Replace <br><br> with <p>
                  (re.compile(r'<br[^>]*>\s*<br[^>]*>', re.IGNORECASE), lambda match: '<p>'),
                  # Drop a trailing <br> unless the line starts with a tag or
                  # is shorter than 40 characters, in which case it is kept
                  (re.compile(r'(.*)<br[^>]*>', re.IGNORECASE),
                   lambda match: match.group() if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
                                 else match.group(1)),

                  # un-wrap wrapped lines - uses two regexes
                  # 1) a newline following a lowercase letter, comma or "I"
                  #    (optionally with </i> and </p><p> in between)
                  (re.compile(r'(?<=[a-z,I])\s*(?P<ital></i>)?\s*(</p><p>)?\n\r?\s?(?=(<i>)?\w)', re.DOTALL), wrap_lines),
                  # 2) a <p>/<br> tag that follows at least 85 characters of
                  #    text and precedes a word character
                  (re.compile(r'(?<=.{85}[a-z,I])\s*(<p[^>]*>|<br[^>]*>)\s*(?=\w)', re.UNICODE), lambda match: ' '),

                  # Add space before italics (the space is inserted just
                  # inside the opening tag)
                  (re.compile(r'<i>'), lambda match: '<i> '),

                  # Remove hyphenation
                  (re.compile(r'-\n\r?'), lambda match: ''),

                  # Remove gray background (drops every attribute on <BODY>)
                  (re.compile(r'<BODY[^<>]+>'), lambda match : '<BODY>'),

                  # Remove non breaking spaces.
                  # Written with the plain ``u`` prefix: ``ur''`` is a syntax
                  # error on Python 3, while ``u'\u00a0'`` is the same
                  # character on both Python 2 and Python 3.
                  (re.compile(u'\u00a0'), lambda match : ' '),

                  # Detect Chapters
                  # ``\s*`` skips whitespace between the opening tag and the
                  # chapter keyword (this was a bare ``s*``, which only
                  # matched literal "s" characters).
                  (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?\s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE), chap_head),

                  # Have paragraphs show better: a <br> after at least 85
                  # characters on the line becomes a paragraph start
                  (re.compile(r'(?<=.{85})<br[^>]*>\n'), lambda match : '<p>\n'),

                  # terminate unterminated lines.
                  (re.compile(r'(?<!>)\s*\n'), lambda match : '<br/>\n'),
                  (re.compile(r'</i>\s*\n'), lambda match : '</i><br/>\n'),

                  ]

So far I've only tested this with a single book; I'll test it across some more to see if there are any problems.