Skip to content

Instantly share code, notes, and snippets.

@jredrejo
Last active March 7, 2024 18:07
Show Gist options
  • Save jredrejo/ccc687e9dd9849fd9b0ddc2d83205e00 to your computer and use it in GitHub Desktop.
Save jredrejo/ccc687e9dd9849fd9b0ddc2d83205e00 to your computer and use it in GitHub Desktop.
Remove Google tag and Google Ads code from an HTML file using bs4
from bs4 import BeautifulSoup, Comment
# Shared HTTP session object; download_page sends its requests through it.
# NOTE(review): `requests` is not imported in the visible part of this file --
# confirm an `import requests` exists before this line.
SESSION = requests.Session()
def download_page(url, timeout=30):
    """
    Download `url` (following redirects) and soupify response contents.

    Args:
        url: address to fetch with a GET request through the shared SESSION.
        timeout: seconds to wait for the server before giving up. It is
            handed to requests unchanged; without it a stalled server
            would block the call indefinitely.

    Returns:
        (final_url, page) where final_url is the URL after following
        redirects and page is the BeautifulSoup document parsed with
        "html.parser". Returns (None, None) when the response evaluates
        as false (requests does this for 4xx/5xx status codes).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    }
    response = SESSION.request("GET", url, headers=headers, timeout=timeout)
    if not response:
        return (None, None)
    page = BeautifulSoup(response.text, "html.parser")
    # Lazy %-style arguments: the message is only built when DEBUG is enabled.
    LOGGER.debug("Downloaded page %s", url)
    return (response.url, page)
def remove_google_tag_manager(page):
    """
    Strip Google Tag Manager snippets from a parsed page, in place.

    A snippet is everything between a comment containing
    "Google Tag Manager" and the next comment containing
    "End Google Tag Manager". Every such pair in the document is processed.
    The two marker comments themselves stay in the document; only the nodes
    between them are extracted.

    Args:
        page: BeautifulSoup document (or tag) to clean; it is modified.

    Returns:
        The same `page` object that was passed in.
    """

    def is_start_marker(text):
        # "End Google Tag Manager" also contains "Google Tag Manager", so the
        # closing marker has to be excluded explicitly.
        return (
            isinstance(text, Comment)
            and "Google Tag Manager" in text
            and "End Google Tag Manager" not in text
        )

    def is_end_marker(text):
        return isinstance(text, Comment) and "End Google Tag Manager" in text

    start_comment = page.find(string=is_start_marker)
    while start_comment is not None:
        # Search forward from the start marker so that a stray end marker
        # earlier in the document is never paired with it.
        end_comment = start_comment.find_next(string=is_end_marker)
        if end_comment is None:
            # Unterminated snippet: without a closing marker there is no
            # safe boundary, so nothing further is removed.
            break

        # Containers holding the end marker must survive; extracting one of
        # them would also drop the content that follows the snippet.
        protected = {id(parent) for parent in end_comment.parents}

        # Get all elements between start and end comments. Identity checks
        # are used because text nodes compare (and test truthy) as strings.
        elements_to_remove = []
        elem = start_comment.next_element
        while elem is not None and elem is not end_comment:
            if id(elem) not in protected:
                elements_to_remove.append(elem)
            elem = elem.next_element

        # Remove the elements
        for elem in elements_to_remove:
            elem.extract()

        # Continue after this snippet, e.g. with the "(noscript)" block.
        start_comment = end_comment.find_next(string=is_start_marker)
    return page
if __name__ == "__main__":
    import sys

    # The page address is taken from the command line.
    if len(sys.argv) != 2:
        sys.exit("usage: %s URL" % sys.argv[0])
    url = sys.argv[1]

    _, page = download_page(url)
    # download_page returns (None, None) on a failed request; stop here
    # instead of passing None on to the cleaner.
    if page is None:
        sys.exit("could not download %s" % url)
    page = remove_google_tag_manager(page)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment