Using BeautifulSoup, here's a quick way to strip all of the unwanted proprietary attributes out of an HTML file:
Code:
try:
import os.path
from sigil_bs4 import BeautifulSoup
except:
from bs4 import BeautifulSoup
def fixHTML(work_dir, file)
output = os.path.join(work_dir, 'clean_html.htm')
outfp = open(output, 'wt', encoding=('utf-8'))
html = open(file, 'rt', encoding='utf-8').read()
soup = BeautifulSoup(html, 'html.parser')
# remove all unwanted proprietary attributes from the html file
search_tags = ['p', 'span', 'div', 'body', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']
search_attribs = ['dir', 'name', 'title', 'link', 'id' ,'text', 'lang', 'clear']
for tag in soup.findAll(search_tags):
for attribute in search_attribs:
del tag[attribute]
outfp.writelines(str(soup))
outfp.close()
os.remove(file)
os.rename(output, file)
return(file)