Skip to content

Instantly share code, notes, and snippets.

@rjpower
Created October 11, 2024 18:27
Show Gist options
  • Save rjpower/6bda41b4a65e41135f38d568294250ed to your computer and use it in GitHub Desktop.
Save rjpower/6bda41b4a65e41135f38d568294250ed to your computer and use it in GitHub Desktop.
def cleanup_html(
    html: str,
    strip_tags=("head", "script", "svg", "style", "doctype"),
    attrs=("data-.*", "^_.*", "aria-.*", "^value$", "^style$"),
) -> str:
    """Return a compact, de-cluttered version of an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser`` and reduced in
    the following order:

    1. ``<!DOCTYPE ...>`` declarations are removed (case-insensitively).
    2. Every tag whose name is in ``strip_tags`` is removed together with
       its contents.
    3. On each remaining tag, attributes whose *name* matches any regular
       expression in ``attrs`` (via ``re.search``) are deleted; a ``class``
       attribute is deleted when its space-joined value is blank or longer
       than 12 characters; an ``href`` pointing at a ``javascript:`` URL is
       deleted.
    4. HTML comments are removed.
    5. Tags that contain no non-whitespace text are removed. Elements that
       never carry text (``<br>``, ``<img>``, ``<input>``, ...) therefore
       disappear as well.
    6. Whitespace runs inside text nodes are collapsed to a single space,
       each text node is stripped, and whitespace between adjacent tags in
       the serialized output is dropped.

    Args:
        html: The HTML source to clean.
        strip_tags: Tag names to remove entirely. Anything accepted by
            ``BeautifulSoup.find_all`` as a name filter also works.
        attrs: Regular expressions matched against attribute names;
            matching attributes are deleted.

    Returns:
        The cleaned document serialized as a string.
    """
    # Classes longer than this are dropped.
    # NOTE(review): presumably long values are generated utility-class
    # noise; confirm the threshold with the author.
    max_class_length = 12

    # The defaults are immutable tuples so no state is shared between calls.
    # find_all() is documented as taking a *list* of names, so pass one.
    if isinstance(strip_tags, tuple):
        strip_tags = list(strip_tags)

    # Compile once instead of going through re.search's pattern lookup for
    # every attribute of every tag.
    attr_patterns = [re.compile(pattern) for pattern in attrs]

    html = re.sub(r"<!DOCTYPE[^>]*>", "", html, flags=re.IGNORECASE)
    soup = bs4.BeautifulSoup(html, "html.parser")

    for tag in soup.find_all(strip_tags):
        tag.decompose()

    for tag in soup.find_all(True):
        # Iterate over a snapshot of the names: we delete while looping.
        for attr in list(tag.attrs):
            if any(pattern.search(attr) for pattern in attr_patterns):
                del tag.attrs[attr]

        if "class" in tag.attrs:
            class_value = " ".join(tag["class"])
            if len(class_value) > max_class_length or not class_value.strip():
                del tag.attrs["class"]

        if "href" in tag.attrs:
            # Browsers ignore leading whitespace in URLs, so strip it before
            # testing the scheme; " javascript:..." is still a script link.
            href = tag["href"].strip().lower()
            if href.startswith("javascript:"):
                del tag.attrs["href"]

    for comment in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
        comment.extract()

    # Remove empty tags (anything without non-whitespace text).
    for tag in soup.find_all():
        if not tag.get_text(strip=True):
            tag.extract()

    # Clean up spaces inside text nodes.
    for element in soup.find_all(string=True):
        element.replace_with(re.sub(r"\s+", " ", element).strip())

    # Drop any whitespace left between adjacent tags in the output.
    return re.sub(r">\s+<", "><", str(soup))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment