|
Sigil Developer
Posts: 9,419
Karma: 6733960
Join Date: Nov 2009
Device: many
|
That is interesting. I had never heard of Perplexity before. Did you ask Claude to read the Sigil Plugin Framework document and use its api, or just give it a few example Sigil plugins? If not, how else would it know the structure a Sigil plugin requires? As the author of the Sigil Plugin Framework, I would be okay with that, especially if it helps novices write their own Sigil plugins.
And, I just noticed that Anthropic is giving away 10,000 free Claude Max accounts for 6 months to active open-source developers with public projects on GitHub with more than 5,000 stars. Sigil has 6,700 stars, so I think DiapDealer or I (or both of us) may qualify. If not, I will look at other free account options.
Thanks for your tips!
KevinH
Quote:
Originally Posted by Doitsu
Free subscriptions are available. But you'll need to sign up for a free account.
I tested both Claude and Perplexity. They can both generate simple plugins with minimal corrections. BeckyEbook also experimented with this.
It can be helpful for parts of a plugin. (I only used it for a particular function that I had problems with.)
This kind of simple plugin should be doable with Claude, Perplexity, and possibly other AIs.
I tested this with Claude. The first version was this:
Spoiler:
Code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HTMLCleaner - Sigil Plugin
Strips elements that cause Sigil's XML parser to choke:
- <script> blocks
- <noscript> blocks
- <link> tags referencing external stylesheets
- Bare & in attribute values (escaped to &amp;)
- External http/https resource references in src/href attributes
- HTML comments containing conditional IE syntax
"""
import sys
import os
import re
def clean_html(raw):
    """
    Apply a series of regex-based cleaning passes to raw HTML text.

    The passes run in a fixed order:
      1. remove <script>...</script> blocks
      2. remove <noscript>...</noscript> blocks
      3. remove <link> tags whose href starts with http:// or https://
      4. remove <link rel="stylesheet"> tags
      5. remove <meta> tags carrying http-equiv
      6. escape bare & inside tags to &amp;
      7. remove IE conditional comments
      8. blank external src= URLs on img/iframe/audio/video/source tags

    raw is the HTML text as a str.
    Returns (cleaned_text, list_of_changes), where list_of_changes holds one
    human-readable message per pass that actually modified the text.
    """
    changes = []
    text = raw

    def remove_all(pattern, what):
        # re.subn deletes and counts in one scan, so the reported number is
        # always the number of matches that were really removed.
        nonlocal text
        text, hits = re.subn(pattern, '', text, flags=re.IGNORECASE)
        if hits:
            changes.append(f"Removed {hits} {what}")

    # 1. Remove all <script>...</script> blocks (including multi-line)
    remove_all(r'<script[\s\S]*?</script>', "<script> block(s)")

    # 2. Remove <noscript>...</noscript> blocks
    remove_all(r'<noscript[\s\S]*?</noscript>', "<noscript> block(s)")

    # 3. Remove <link> tags that reference external (http/https) resources
    remove_all(r'<link[^>]+href=["\']https?://[^>]+>', "external <link> tag(s)")

    # 4. Remove <link> tags for stylesheets by rel type.
    # NOTE: this matches on rel="stylesheet" alone, so links to local
    # stylesheets are removed as well as external ones.
    remove_all(r'<link[^>]+rel=["\']stylesheet["\'][^>]*>', "stylesheet <link> tag(s)")

    # 5. Remove <meta> tags with http-equiv
    remove_all(r'<meta[^>]+http-equiv[^>]*>', "http-equiv <meta> tag(s)")

    # 6. Fix bare & inside tags (between < and >): an & that does not start a
    # named or numeric entity is rewritten as &amp;. Existing entities are
    # skipped by the negative lookahead, so nothing is double-escaped.
    amp_count = 0

    def fix_ampersands_in_tag(m):
        nonlocal amp_count
        fixed, hits = re.subn(
            r'&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);)',
            '&amp;',
            m.group(0),
        )
        amp_count += hits
        return fixed

    text = re.sub(r'<[^>]+>', fix_ampersands_in_tag, text)
    if amp_count > 0:
        changes.append(f"Escaped {amp_count} bare & character(s) in tags to &amp;")

    # 7. Remove IE conditional comments <!--[if ...]> ... <![endif]-->
    remove_all(r'<!--\[if[\s\S]*?<!\[endif\]-->', "IE conditional comment(s)")

    # 8. Replace external src attributes with an empty string on media tags
    def blank_external_src(m):
        return re.sub(r'(src=["\'])https?://[^"\']+(["\'])', r'\1\2', m.group(0))

    old_text = text
    text = re.sub(
        r'<(?:img|iframe|audio|video|source)[^>]+>',
        blank_external_src,
        text,
        flags=re.IGNORECASE,
    )
    if old_text != text:
        changes.append("Blanked external src= URLs in media tags")

    return text, changes
def run(bc):
    """
    Entry point called by Sigil.

    bc is the book container object; only its text_iter(), readfile() and
    writefile() methods are used here. Each text file is passed through
    clean_html() and written back only when at least one change was
    reported. A per-file summary is printed and 0 is always returned.
    """
    report = []
    for manifest_id, href in bc.text_iter():
        content = bc.readfile(manifest_id)
        # Sigil may hand us bytes or str depending on version
        if isinstance(content, bytes):
            content = content.decode('utf-8', errors='replace')
        cleaned, changes = clean_html(content)
        if not changes:
            continue
        bc.writefile(manifest_id, cleaned)
        report.append((href, changes))

    # Print a summary to Sigil's plugin output window
    if not report:
        print("HTMLCleaner: nothing to clean — all files look fine.")
        return 0

    print("HTMLCleaner: changes made\n" + "="*40)
    for href, changes in report:
        print(f"\n{href}:")
        for change in changes:
            print(f" • {change}")
    return 0
def main():
    """
    Command-line entry point, kept for completeness.

    Returns 1 (after printing a message to stderr) when no argument was
    supplied; otherwise builds a BookContainer from the first argument and
    returns whatever run() returns.

    NOTE(review): constructing BookContainer directly from sys.argv[1] is
    unverified against Sigil's real launcher -- confirm before relying on it.
    """
    args = sys.argv[1:]
    if not args:
        print("This script must be run as a Sigil plugin.", file=sys.stderr)
        return 1

    # Imported lazily: the module is expected to come from Sigil's plugin
    # runtime, so it is only needed on this path.
    from bookcontainer import BookContainer  # provided by Sigil's plugin runtime
    return run(BookContainer(args[0]))


if __name__ == "__main__":
    sys.exit(main())
It hallucinated quite a bit. When I pointed out that Sigil uses bk not bc it slightly changed the code and came up with:
Spoiler:
Code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HTMLCleaner - Sigil Plugin
Strips elements that cause Sigil's XML parser to choke:
- <script> blocks
- <noscript> blocks
- <link> tags referencing external stylesheets
- Bare & in attribute values (escaped to &amp;)
- External http/https resource references in src/href attributes
- IE conditional comments
"""
import re
def clean_html(raw):
    """
    Apply regex-based cleaning passes to raw HTML text.

    Passes, in order: remove <script> blocks, remove <noscript> blocks,
    remove <link> tags with an http/https href, remove <link rel="stylesheet">
    tags, remove <meta http-equiv> tags, escape bare & inside tags to &amp;,
    remove IE conditional comments, and blank external src= URLs on
    img/iframe/audio/video/source tags.

    raw is the HTML text as a str.
    Returns (cleaned_text, list_of_changes); list_of_changes contains one
    message for each pass that modified the text.
    """
    changes = []
    text = raw

    def strip_matches(pattern, label):
        # A single re.subn call both deletes the matches and reports how many
        # there were, keeping the logged count in step with the edit.
        nonlocal text
        text, removed = re.subn(pattern, '', text, flags=re.IGNORECASE)
        if removed:
            changes.append(f"Removed {removed} {label}")

    # 1. Remove all <script>...</script> blocks
    strip_matches(r'<script[\s\S]*?</script>', "<script> block(s)")

    # 2. Remove <noscript>...</noscript> blocks
    strip_matches(r'<noscript[\s\S]*?</noscript>', "<noscript> block(s)")

    # 3. Remove <link> tags pointing to external (http/https) resources
    strip_matches(r'<link[^>]+href=["\']https?://[^>]+>', "external <link> tag(s)")

    # 4. Remove <link rel="stylesheet"> tags.
    # NOTE: the pattern keys on rel="stylesheet" only, so it also removes
    # links to stylesheets stored inside the book.
    strip_matches(r'<link[^>]+rel=["\']stylesheet["\'][^>]*>', "stylesheet <link> tag(s)")

    # 5. Remove <meta http-equiv> tags
    strip_matches(r'<meta[^>]+http-equiv[^>]*>', "http-equiv <meta> tag(s)")

    # 6. Fix bare & in tag attributes. The negative lookahead skips anything
    # that already is a named, decimal or hex entity, avoiding double-escaping.
    amp_count = 0

    def fix_ampersands_in_tag(m):
        nonlocal amp_count
        fixed, escaped = re.subn(
            r'&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#x[0-9a-fA-F]+);)',
            '&amp;',
            m.group(0),
        )
        amp_count += escaped
        return fixed

    text = re.sub(r'<[^>]+>', fix_ampersands_in_tag, text)
    if amp_count > 0:
        changes.append(f"Escaped {amp_count} bare & character(s) in tags to &amp;")

    # 7. Remove IE conditional comments
    strip_matches(r'<!--\[if[\s\S]*?<!\[endif\]-->', "IE conditional comment(s)")

    # 8. Blank external src= URLs on media tags
    def blank_external_src(m):
        return re.sub(r'(src=["\'])https?://[^"\']+(["\'])', r'\1\2', m.group(0))

    old_text = text
    text = re.sub(
        r'<(?:img|iframe|audio|video|source)[^>]+>',
        blank_external_src,
        text,
        flags=re.IGNORECASE,
    )
    if old_text != text:
        changes.append("Blanked external src= URLs in media tags")

    return text, changes
def run(bk):
    """
    Sigil calls run(bk) -- bk is the book wrapper object provided by Sigil.

    Only bk.text_iter(), bk.readfile() and bk.writefile() are used. Each
    text file goes through clean_html(); files with reported changes are
    written back and listed in the printed summary. Always returns 0.
    """
    touched = []
    for file_id, href in bk.text_iter():
        source = bk.readfile(file_id)
        # Decode defensively in case the content arrives as bytes.
        if isinstance(source, bytes):
            source = source.decode('utf-8', errors='replace')
        cleaned, changes = clean_html(source)
        if not changes:
            continue
        bk.writefile(file_id, cleaned)
        touched.append((href, changes))

    if not touched:
        print("HTMLCleaner: nothing to clean - all files look fine.")
        return 0

    print("HTMLCleaner: changes made\n" + "=" * 40)
    for href, changes in touched:
        print(f"\n{href}:")
        for entry in changes:
            print(f" - {entry}")
    return 0
It also generated this incorrect plugin.xml file:
After fixing plugin.xml the plugin kind of works. For example it'll remove:
Code:
<link rel="manifest" href="https://www.alternatehistory.com/forum/webmanifest.php">
But it'll also remove content that users want to keep. E.g.
Code:
<link href="../Styles/sgc-nav.css" rel="stylesheet" type="text/css"/>
So it is possible to generate a basic plugin with an AI tool.
Claude also has a skills concept. I haven't tested it, though. Maybe you could define a Sigil plugin skill?
|
|