Last active
March 7, 2024 18:07
-
-
Save jredrejo/ccc687e9dd9849fd9b0ddc2d83205e00 to your computer and use it in GitHub Desktop.
Remove Google tag and Google Ads code from an HTML file using bs4
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging

import requests
from bs4 import BeautifulSoup, Comment
# One shared HTTP session for every download made by this module.
SESSION = requests.Session()

# Module-level logger; download_page reports each fetched URL at DEBUG level.
LOGGER = logging.getLogger(__name__)
def download_page(url, timeout=30):
    """
    Download `url` (following redirects) and soupify response contents.

    Args:
        url: Address to fetch with an HTTP GET through the shared SESSION.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        (final_url, page) where final_url is the URL after following
        redirects and page is the parsed BeautifulSoup document, or
        (None, None) when the response is falsy (requests evaluates 4xx/5xx
        responses as falsy).

    Raises:
        requests.RequestException: network-level failures (including
            timeouts) are not caught here and propagate to the caller.
    """
    # A desktop browser User-Agent is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    }
    response = SESSION.get(url, headers=headers, timeout=timeout)
    if not response:
        return (None, None)

    page = BeautifulSoup(response.text, "html.parser")
    # Lazy %-formatting: the message is only built when DEBUG is enabled.
    LOGGER.debug("Downloaded page %s", url)
    return (response.url, page)
def _extract_between(start_comment, end_comment):
    """Detach every node located between two marker comments.

    Both marker comments are left in the tree. Nothing is removed when
    `end_comment` is not reachable by walking forward from `start_comment`.
    """
    # Tags enclosing the end marker must survive: extracting one of them
    # would also drop the marker and any unrelated content that follows it.
    end_ancestors = {id(parent) for parent in end_comment.parents}

    doomed = []
    node = start_comment.next_element
    # Compare by identity: Comment subclasses str, so `!=` compares the
    # comment text rather than checking for the actual end node.
    while node is not None and node is not end_comment:
        if id(node) not in end_ancestors:
            doomed.append(node)
        node = node.next_element

    if node is None:
        # Ran off the end of the document without meeting the end marker;
        # removing what was collected would wipe the rest of the page.
        return

    # Extract only after the walk so the traversal is not disturbed.
    for node in doomed:
        node.extract()


def remove_google_tag_manager(page):
    """Strip every Google Tag Manager snippet from a parsed page.

    A snippet is whatever sits between a comment containing
    "Google Tag Manager" and the next comment containing
    "End Google Tag Manager". All such pairs are processed, so both the
    `<head>` script and the `(noscript)` body fallback are removed when each
    is wrapped in its own pair of comments. The marker comments themselves
    are kept.

    Args:
        page: BeautifulSoup document, or None (e.g. a failed download).

    Returns:
        The same `page` object, modified in place; None is returned as-is.
    """
    if page is None:
        return page

    start_comment = None
    # find_all returns a materialised list in document order, so the tree can
    # be modified safely while looping over it.
    for comment in page.find_all(string=lambda text: isinstance(text, Comment)):
        # Test the end marker first: its text also contains the start text.
        if "End Google Tag Manager" in comment:
            if start_comment is not None:
                _extract_between(start_comment, comment)
                start_comment = None
        elif "Google Tag Manager" in comment and start_comment is None:
            start_comment = comment
    return page
if __name__ == "__main__":
    import sys

    # The page to clean is given as the only command-line argument.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} URL")
    url = sys.argv[1]

    _, page = download_page(url)
    if page is None:
        # download_page returns (None, None) for an unsuccessful response.
        sys.exit(f"could not download {url}")

    page = remove_google_tag_manager(page)
    # Emit the cleaned document so the result is not silently discarded.
    print(page)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment