Okay, here is a first take at a script that adds a chapter section header and removes the chapter strings from the note headers. It prefixes each chapter string with a bullet character so that it is set off from the 'higher level' section header. It writes new HTML files with '-new' appended to the original file name (before the '.html' extension).
If it does not find the 'chapter pattern' in any of the note headings, it adds no section headers and leaves the note headings unchanged (a '-new' file is still written).
You can use calibre to run it:
Code:
[path to calibre executables]calibre-debug gather_chapter_notes.py html1 [html2, ...]
gather_chapter_notes.py
Code:
from pathlib import Path
from re import match, DOTALL
from sys import argv

from bs4 import BeautifulSoup
# Regex for note-heading text of the form "<prefix> - <chapter> > <rest>":
# group 1 captures <chapter> (lazily, up to the first " > ") and group 2
# captures the " > " separator itself.
chapter_pattern = r".*? - (.*?)( > ).*"
def gather_chapter_notes(html: str) -> str:
    """Group note headings under per-chapter section headings.

    Every ``div.noteHeading`` whose trailing text matches ``chapter_pattern``
    has its ``"<chapter> > "`` part removed, and a new ``div.sectionHeading``
    containing ``"● <chapter>"`` is inserted before the first heading seen
    for each chapter.

    Args:
        html: The HTML document to process.

    Returns:
        The re-serialized document. When no heading matches, no section
        headings are added and no heading text is changed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    first_heading_by_title = {}

    for note_heading in soup.find_all('div', class_='noteHeading'):
        # The chapter string is looked up in the last child of the heading;
        # skip empty headings and headings that end in a tag rather than text.
        if not note_heading.contents:
            continue
        content = note_heading.contents[-1]
        if not isinstance(content, str):
            continue
        if not (matches := match(chapter_pattern, content, flags=DOTALL)):
            continue

        title = matches.group(1)
        first_heading_by_title.setdefault(title, note_heading)

        # Strip "<title> > " from the text node itself. Replacing on the
        # serialized document instead is unreliable: the serializer may emit
        # ">" as "&gt;", and a global replace can also hit note bodies.
        content.replace_with(
            content[:matches.start(1)] + content[matches.end(2):]
        )

    for title, node in first_heading_by_title.items():
        title_section = soup.new_tag('div', attrs={'class': 'sectionHeading'})
        title_section.string = f'● {title}'
        node.insert_before(title_section)

    return str(soup)
# Process every HTML file named on the command line and write the result
# next to it as "<name>-new<ext>".
for arg in argv[1:]:
    source_path = Path(arg)
    # Derive the output name from the path's stem so that an input without a
    # literal ".html" suffix (".htm", ".HTML", ...) is never overwritten.
    target_path = source_path.with_name(
        f'{source_path.stem}-new{source_path.suffix}'
    )
    # An explicit encoding is required: the generated headings contain "●",
    # which many platform default encodings cannot represent.
    # NOTE(review): assumes the input files are UTF-8 — confirm.
    html_text = source_path.read_text(encoding='utf-8')
    new_html = gather_chapter_notes(html=html_text)
    target_path.write_text(new_html, encoding='utf-8')