Created
January 19, 2015 17:17
-
-
Save flakshack/652b38ba0bb4bd6404cc to your computer and use it in GitHub Desktop.
Script to edit HTML files in place, removing hyperlinks that point to a specific domain while keeping the link text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Strip hyperlinks whose href matches a given domain from a set of HTML files.
# Each <a> tag that matches is replaced by its own children (the link text and
# any nested markup), and the file is then rewritten in place.
from BeautifulSoup import BeautifulSoup
import re
import glob

# Grab a list of all HTML files
path = "/some/path/tofiles/*.html"

# Compiled once, outside the loop. A raw string is used so that "\." reaches
# the regex engine as an escaped dot instead of being an invalid string escape.
link_pattern = re.compile(r"somedomain\.com")

for file_name in glob.glob(path):
    print(file_name)

    # Open the file and read the contents into a string. The with-block closes
    # the handle on exit, so no explicit close() is needed.
    with open(file_name, "r") as file_handle:
        file_contents = file_handle.read()

    soup = BeautifulSoup(file_contents)

    # Replace the hyperlink with the contents (the text)
    for match in soup.findAll('a', href=link_pattern):
        match.replaceWithChildren()

    # Write the contents back to the file. The with-block guarantees the handle
    # is closed even if prettify() or write() raises.
    # NOTE(review): prettify() re-serializes the whole document, so the file's
    # formatting may change even when no link was removed - confirm acceptable.
    with open(file_name, "w") as file_handle:
        file_handle.write(soup.prettify())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment