Quote:
Originally Posted by KevinH
Hi Doitsu,
I have slightly modified your GumboOffset example so that it does what we do inside Sigil, which makes it more general:
Code:
import os
import sys
# NOTE(review): the posted snippet never imports `escape`; the saxutils version
# is assumed because run() escapes double quotes itself -- confirm.
from xml.sax.saxutils import escape

import sigil_gumbo_bs4_adapter as gumbo_bs4
# Characters skipped after the XML declaration.  Every entry must be its own
# single-character string: adjacent literals without a comma are concatenated,
# which would make "\v" and "\f" unmatchable by the membership test below.
wspace = (" ", "\n", "\r", "\t", "\v", "\f")


def preprocess(src):
    """Strip a leading XML declaration and the whitespace that follows it.

    Args:
        src: Document text to prepare for parsing.

    Returns:
        A tuple ``(newsrc, line_offset, pos_offset)``:

        * ``newsrc`` -- ``src`` without the declaration and the whitespace
          after it, or ``src`` unchanged when there is nothing to strip.
        * ``line_offset`` -- number of ``"\\n"`` characters removed.
        * ``pos_offset`` -- number of characters removed from the front.

        Both offsets are 0 whenever ``newsrc`` is ``src`` unchanged, so adding
        them to a position in ``newsrc`` always gives the position in ``src``.
    """
    unchanged = (src, 0, 0)
    if not src.startswith("<?xml"):
        return unchanged

    # remove any xml header line and trailing whitespace
    end = src.find('>', 5)
    if end == -1:
        # Unterminated declaration: nothing can be stripped safely.
        return unchanged
    end += 1

    n = len(src)
    skipped_lines = 0
    while end < n and src[end] in wspace:
        if src[end] == "\n":
            skipped_lines += 1
        end += 1

    if end >= n:
        # Only the declaration (plus whitespace) is present.  The text is left
        # untouched, so no offsets may be reported for it.
        return unchanged
    return (src[end:], skipped_lines, end)
def run(bk):
    """Report the Gumbo-derived position of every <p> element in the selection.

    For each item yielded by ``bk.selected_iter()`` the file is read, CRLF line
    endings are normalised to LF, the XML declaration is stripped with
    ``preprocess`` and the remainder is parsed with ``gumbo_bs4``.  One 'info'
    result is added through ``bk.add_extended_result`` per paragraph found.

    Args:
        bk: Plugin container object; must provide ``selected_iter``,
            ``id_to_href``, ``readfile`` and ``add_extended_result``.

    Returns:
        Always 0.
    """
    # The first item of each pair (the id type) is not needed here.
    for _id_type, manifest_id in bk.selected_iter():
        filename = os.path.basename(bk.id_to_href(manifest_id))
        html = bk.readfile(manifest_id).replace('\r\n', '\n')
        (html, line_offset, pos_offset) = preprocess(html)
        soup = gumbo_bs4.parse(html)
        for para in soup.find_all('p'):
            # The parser saw the text without its XML header, so shift the
            # reported line and offset back by what preprocess() removed.
            # NOTE(review): offsets refer to the LF-normalised text -- confirm
            # this matches what the result consumer expects.
            linenumber = para.line + line_offset
            colnumber = para.col
            offset = para.offset + pos_offset
            # Double quotes are escaped separately from escape().
            message = escape(str(para)).replace('"', "&quot;")
            bk.add_extended_result('info', filename, linenumber, offset, 'Line: ' + str(linenumber) + ' Col: ' + str(colnumber) + ' Gumbo method: ' + message)
    return 0
def main():
    """Fallback entry point used when the file is executed directly.

    Prints a notice that this path should not have been reached and returns
    -1 as the failure status.
    """
    failure_status = -1
    print('I reached main when I should not have\n')
    return failure_status
if __name__ == "__main__":
    # Direct execution: hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
We could even move your replacement of "\r\n" with "\n" into the new "preprocess" routine, if desired, so that the routine more closely matches how text is parsed and handled inside Sigil.
|

Thanks for the updated code, I'll check it out tomorrow.