rjpower · October 11, 2024 18:27
diff --git a/cleanup.py b/cleanup.py

 def cleanup_html(
     html: str,
     strip_tags=["head", "script", "svg", "style", "doctype"],
     attrs=["data-.*", "^_.*", "aria-.*", "^value$", "^style$"],
 ):
     """Return a compact copy of *html* with non-content markup removed.

     The document is parsed with BeautifulSoup's ``html.parser`` and then
     reduced in these steps:

     1. ``<!DOCTYPE ...>`` declarations are removed from the raw text.
     2. Every tag named in *strip_tags* is removed together with its
        contents.
     3. Attributes whose name matches any regex in *attrs* are dropped.
        ``class`` is dropped when its joined value is longer than 12
        characters or blank, and ``href`` is dropped when it is a
        ``javascript:`` URL.
     4. HTML comments are removed.
     5. Tags with no text content are removed. This also removes text-less
        elements such as ``<img>`` or ``<br>``.
     6. Whitespace runs inside text nodes are collapsed to one space, each
        text node is stripped, and whitespace between adjacent tags is
        removed from the serialized output.

     Args:
         html: Raw HTML source.
         strip_tags: Tag names to remove entirely. The default list is
             only read, never mutated.
         attrs: Regular expressions applied with ``search`` to attribute
             names; matching attributes are deleted. The default list is
             only read, never mutated.

     Returns:
         The cleaned HTML serialized as a string.
     """
     # Compile once instead of going through ``re.search`` for every
     # attribute of every tag.
     attr_patterns = [re.compile(pat) for pat in attrs]

     html = re.sub(r"<!DOCTYPE[^>]*>", "", html, flags=re.IGNORECASE)
     soup = bs4.BeautifulSoup(html, "html.parser")

     for tag in soup.find_all(strip_tags):
         tag.decompose()

     for tag in soup.find_all(True):
         # Iterate over a snapshot because attributes are deleted in place.
         for attr in list(tag.attrs):
             if any(pat.search(attr) for pat in attr_patterns):
                 del tag.attrs[attr]

         if "class" in tag.attrs:
             class_value = " ".join(tag["class"])
             # Long class strings and blank ones are both dropped.
             if len(class_value) > 12 or not class_value.strip():
                 del tag.attrs["class"]

         if "href" in tag.attrs:
             # URL parsers discard leading whitespace, so " javascript:..."
             # is still a script link and must be caught as well.
             target = tag["href"].lstrip().lower()
             if target.startswith("javascript:"):
                 del tag.attrs["href"]

     for comment in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
         comment.extract()

     # Remove empty tags
     for tag in soup.find_all():
         if not tag.get_text(strip=True):
             tag.extract()

     # cleanup spaces
     for element in soup.find_all(string=True):
         element.replace_with(re.sub(r"\s+", " ", element).strip())

     pretty_html = re.sub(r">\s+<", "><", str(soup))
     return pretty_html

	def cleanup_html(
	    html: str,
	    strip_tags=["head", "script", "svg", "style", "doctype"],
	    attrs=["data-.*", "^_.*", "aria-.*", "^value$", "^style$"],
	):
	    """Return a compact copy of *html* with non-content markup removed.

	    The document is parsed with BeautifulSoup's ``html.parser`` and then
	    reduced in these steps:

	    1. ``<!DOCTYPE ...>`` declarations are removed from the raw text.
	    2. Every tag named in *strip_tags* is removed together with its
	       contents.
	    3. Attributes whose name matches any regex in *attrs* are dropped.
	       ``class`` is dropped when its joined value is longer than 12
	       characters or blank, and ``href`` is dropped when it is a
	       ``javascript:`` URL.
	    4. HTML comments are removed.
	    5. Tags with no text content are removed. This also removes text-less
	       elements such as ``<img>`` or ``<br>``.
	    6. Whitespace runs inside text nodes are collapsed to one space, each
	       text node is stripped, and whitespace between adjacent tags is
	       removed from the serialized output.

	    Args:
	        html: Raw HTML source.
	        strip_tags: Tag names to remove entirely. The default list is
	            only read, never mutated.
	        attrs: Regular expressions applied with ``search`` to attribute
	            names; matching attributes are deleted. The default list is
	            only read, never mutated.

	    Returns:
	        The cleaned HTML serialized as a string.
	    """
	    # Compile once instead of going through ``re.search`` for every
	    # attribute of every tag.
	    attr_patterns = [re.compile(pat) for pat in attrs]

	    html = re.sub(r"<!DOCTYPE[^>]*>", "", html, flags=re.IGNORECASE)
	    soup = bs4.BeautifulSoup(html, "html.parser")

	    for tag in soup.find_all(strip_tags):
	        tag.decompose()

	    for tag in soup.find_all(True):
	        # Iterate over a snapshot because attributes are deleted in place.
	        for attr in list(tag.attrs):
	            if any(pat.search(attr) for pat in attr_patterns):
	                del tag.attrs[attr]

	        if "class" in tag.attrs:
	            class_value = " ".join(tag["class"])
	            # Long class strings and blank ones are both dropped.
	            if len(class_value) > 12 or not class_value.strip():
	                del tag.attrs["class"]

	        if "href" in tag.attrs:
	            # URL parsers discard leading whitespace, so " javascript:..."
	            # is still a script link and must be caught as well.
	            target = tag["href"].lstrip().lower()
	            if target.startswith("javascript:"):
	                del tag.attrs["href"]

	    for comment in soup.find_all(string=lambda text: isinstance(text, bs4.Comment)):
	        comment.extract()

	    # Remove empty tags
	    for tag in soup.find_all():
	        if not tag.get_text(strip=True):
	            tag.extract()

	    # cleanup spaces
	    for element in soup.find_all(string=True):
	        element.replace_with(re.sub(r"\s+", " ", element).strip())

	    pretty_html = re.sub(r">\s+<", "><", str(soup))
	    return pretty_html