View Single Post
Old 04-12-2009, 11:15 PM   #16
ldolse
Wizard
ldolse is an accomplished Snipe hunter.
 
Posts: 1,337
Karma: 123455
Join Date: Apr 2009
Location: Malaysia
Device: PRS-650, iPhone
I just found the solution to the line breaks; it's easier to do with two regexes. Here's the code for the area I changed:

Code:
def wrap_lines(match):
    """Return the replacement text for a matched line wrap.

    The match collapses to a single space. If the named group ``ital``
    captured anything, that captured text is kept in front of the space.
    """
    captured = match.group('ital')
    return captured + ' ' if captured else ' '

def chap_head(match):
    """Return ``<h1>`` heading markup for a matched chapter heading.

    The named group ``chap`` supplies the heading text. If the named group
    ``title`` captured a non-empty string, it is appended to the heading
    after a ``<br/>``. The heading is always followed by a ``<br/>``.
    """
    heading = match.group('chap')
    subtitle = match.group('title')
    if subtitle:
        heading = heading + '<br/>' + subtitle
    return '<h1>' + heading + '</h1><br/>'
    
    
class PreProcessor(object):
    """Tables of regex clean-up rules for HTML markup.

    Every rule is a ``(compiled_pattern, replacement)`` pair in which the
    replacement is a callable that receives the match object and returns the
    text to substitute.  The code that applies the rules is not part of this
    excerpt; presumably each pair is fed to ``pattern.sub`` in list order, so
    the order of the entries should be treated as significant.
    """

    # Generic clean-up rules.
    PREPROCESS = [
        # Some HTML generators (Frontpage, for one) put all sorts of junk
        # into <head>, which messes up lxml.  The whole <head> element is
        # handed to sanitize_head (defined elsewhere).
        (re.compile(r'<head[^>]*>(.*?)</head>', re.IGNORECASE|re.DOTALL),
         sanitize_head),
        # Convert all entities, since lxml doesn't handle them well.
        (re.compile(r'&(\S+?);'), convert_entities),
        # Remove the <![if]> / <![endif]> tags inserted by MS Word.
        (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE),
         lambda match: ''),
    ]

    # Fix pdftohtml markup.
    PDFTOHTML = [
        # Replace <hr> tags with a line break.
        (re.compile(r'<hr[^>]*>', re.IGNORECASE), lambda match: '<br />'),

        # Remove page links.
        (re.compile(r'<a name=\d+></a>', re.IGNORECASE), lambda match: ''),

        # Remove page numbers: a whitespace-preceded number (not directly
        # after a word character) that is followed by a break.
        (re.compile(r'(?<!\w)\s(\d+)\s?(<br>|<br/>|</p><p>)', re.IGNORECASE),
         lambda match: ''),

        # Replace <br><br> with <p>.
        (re.compile(r'<br[^>]*>\s*<br[^>]*>', re.IGNORECASE),
         lambda match: '<p>'),
        # Drop a trailing <br> unless the text before it starts with a tag
        # or is shorter than 40 characters; in those cases the match is
        # kept as is.
        (re.compile(r'(.*)<br[^>]*>', re.IGNORECASE),
         lambda match: match.group()
         if re.match('<', match.group(1).lstrip()) or len(match.group(1)) < 40
         else match.group(1)),

        # Un-wrap wrapped lines - uses two regexes.
        # NOTE(review): '\n\r?' here and in the hyphenation rule below looks
        # like it may have been meant as '\r?\n'; left unchanged - confirm
        # what line endings the input actually has.
        (re.compile(r'(?<=[a-z,I])\s*(?P<ital></i>)?\s*(</p><p>)?\n\r?\s?(?=(<i>)?\w)', re.DOTALL),
         wrap_lines),
        (re.compile(r'(?<=.{85}[a-z,I])\s*(<p[^>]*>|<br[^>]*>)\s*(?=\w)', re.UNICODE),
         lambda match: ' '),

        # Add space before italics.
        (re.compile(r'<i>'), lambda match: '<i> '),

        # Remove hyphenation.
        (re.compile(r'-\n\r?'), lambda match: ''),

        # Remove gray background (strips all attributes from <BODY ...>).
        (re.compile(r'<BODY[^<>]+>'), lambda match: '<BODY>'),

        # Replace non-breaking spaces with ordinary spaces.  A plain u''
        # literal is used instead of ur'' so the pattern is the same single
        # U+00A0 character on Python 2 and also parses on Python 3.
        (re.compile(u'\u00a0'), lambda match: ' '),

        # Detect chapters.  Optional leading <br>/<p> tags and whitespace,
        # then a chapter keyword with an optional number/word, a closing
        # tag, and optionally a short title line captured as 'title'.
        # The whitespace run after the leading tags is '\s*'; it was
        # previously written as 's*', which matched literal "s" characters.
        (re.compile(r'(<br[^>]*>)?(</?p[^>]*>)?\s*(?P<chap>(Chapter|Epilogue|Prologue|Book|Part)\s*(\d+|\w+)?)(</?p[^>]*>|<br[^>]*>)\n?((?=(<i>)?\s*\w+(\s+\w+)?(</i>)?(<br[^>]*>|</?p[^>]*>))((?P<title>.*)(<br[^>]*>|</?p[^>]*>)))?', re.IGNORECASE),
         chap_head),

        # Have paragraphs show better: a <br> that ends a line of at least
        # 85 characters becomes a paragraph start.
        (re.compile(r'(?<=.{85})<br[^>]*>\n'), lambda match: '<p>\n'),

        # Terminate unterminated lines.
        (re.compile(r'(?<!>)\s*\n'), lambda match: '<br/>\n'),
        (re.compile(r'</i>\s*\n'), lambda match: '</i><br/>\n'),
    ]
I've only been testing with a single book so far; I'll test across some more to see if there are any problems.
ldolse is offline   Reply With Quote