Here's the current code to extract metadata from HTML files (it looks for metadata in comment sections):
Code:
def _comment_field(src, name):
    """Return the quoted value of ``name="..."`` found inside an HTML comment.

    The value may be delimited by single or double quotes; the closing quote
    must be the same character as the opening one. The whole match is kept
    inside a single ``<!-- ... -->`` comment. Returns ``None`` when no
    comment in *src* carries the field.
    """
    # "Any character that does not begin a comment terminator": using this
    # instead of a bare ``.`` stops the match from running past ``-->`` into
    # the rest of the document.
    inside = r'(?:(?!-->).)'
    pat = re.compile(
        r'<!--' + inside + r'*?' + re.escape(name)
        # Non-greedy capture: stop at the first closing quote, so other
        # fields in the same comment are not swallowed into the value.
        + r'=(?P<q>[\'"])(?P<value>' + inside + r'+?)(?P=q)'
        + inside + r'*?-->',
        re.DOTALL)
    match = pat.search(src)
    return match.group('value') if match else None


def get_metadata(stream):
    """Extract metadata from an HTML document held in *stream*.

    Metadata is read from HTML comments of the form ``<!-- FIELD="value" -->``
    for the fields TITLE, AUTHOR, PUBLISHER and ISBN. If no TITLE comment is
    present, the text of the ``<title>`` element is used as the title.

    :param stream: object whose ``read()`` returns the HTML source.
    :return: a ``MetaInformation`` built from the title and a one-element
        author list (``None`` when no author was found), with ``publisher``
        and ``isbn`` set only when the corresponding comment field exists.
    """
    # NOTE(review): the patterns below are text patterns, so this presumably
    # expects stream.read() to return text rather than bytes -- confirm
    # against callers.
    src = stream.read()

    # Title: prefer the comment field, fall back to the <title> element.
    title = _comment_field(src, 'TITLE')
    if title is None:
        match = re.search('<title>([^<>]+?)</title>', src, re.IGNORECASE)
        if match:
            title = match.group(1)

    # Author: commas are replaced by semicolons and the result is passed on
    # as a single list entry.
    author = _comment_field(src, 'AUTHOR')
    if author is not None:
        author = author.replace(',', ';')
    mi = MetaInformation(title, [author] if author else None)

    # Publisher
    publisher = _comment_field(src, 'PUBLISHER')
    if publisher is not None:
        mi.publisher = publisher

    # ISBN: keep only digits and the X check character.
    isbn = _comment_field(src, 'ISBN')
    if isbn is not None:
        mi.isbn = re.sub(r'[^0-9xX]', '', isbn)

    return mi
I don't think adding support for LRF-specific metadata is worthwhile, but adding support for reading more generic kinds of metadata (basically extending the above code) is easy enough to do.
You can get a good idea of what kinds of OPF metadata calibre supports by using the GUI to save an ebook. The GUI will create an OPF file with entries for all the metadata it knows about.