Using BeautifulSoup, here's a quick way to strip all of the unwanted proprietary attributes out of an HTML file:
Code:
try:
import os.path
from sigil_bs4 import BeautifulSoup
except:
from bs4 import BeautifulSoup
def fixHTML(work_dir, file)
output = os.path.join(work_dir, 'clean_html.htm')
outfp = open(output, 'wt', encoding=('utf-8'))
html = open(file, 'rt', encoding='utf-8').read()
soup = BeautifulSoup(html, 'html.parser')
# remove all unwanted proprietary attributes from the html file
search_tags = ['p', 'span', 'div', 'body', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']
search_attribs = ['dir', 'name', 'title', 'link', 'id' ,'text', 'lang', 'clear']
for tag in soup.findAll(search_tags):
for attribute in search_attribs:
del tag[attribute]
outfp.writelines(str(soup))
outfp.close()
os.remove(file)
os.rename(output, file)
return(file)