Last active
March 5, 2021 19:59
-
-
Save markpbaggett/42933b6918a57f550c2c4fa3e431fc6a to your computer and use it in GitHub Desktop.
Check Files in a Directory for Bad Unicode Characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from unicodedata import category | |
import os | |
class FileSet:
    """Collect every file under a directory tree that contains flagged characters.

    The scan runs once, at construction time.

    Attributes:
        path: Root directory handed to ``os.walk``.
        bad_files: List of ``(file_path, findings)`` tuples, one per file whose
            check returned a non-empty result. ``findings`` is whatever
            ``DataChecker.check_if_contains_control_characters`` returned for
            that file.
    """

    def __init__(self, path):
        self.path = path
        self.bad_files = self.__check_files()

    def __check_files(self):
        """Walk ``self.path`` recursively and return the offending files.

        This has to be an instance method: it reads ``self.path``. Declaring
        it as a static method with no parameters raised ``NameError`` on every
        construction.

        Returns:
            A list of ``(file_path, findings)`` tuples; empty when nothing is
            flagged or when the directory yields no files.
        """
        bad_files = []
        for directory, _subdirectories, files in os.walk(self.path):
            for file in files:
                # Build the path once so the checked file and the reported
                # file can never drift apart.
                full_path = os.path.join(directory, file)
                findings = DataChecker(full_path).check_if_contains_control_characters()
                if findings:
                    bad_files.append((full_path, findings))
        return bad_files
class DataChecker:
    """Check a single text file for unwanted Unicode control characters.

    Attributes:
        filename: Path of the file to read.
        good: Control characters that are tolerated and never reported
            (tab, line feed, carriage return).
    """

    def __init__(self, filename):
        self.filename = filename
        # "\u0009" and "\u000A" are the same characters as "\t" and "\n", so
        # three entries cover the whole allow-list.
        self.good = ("\t", "\n", "\r")

    def check_if_contains_control_characters(self):
        """Return a description of every disallowed control character in the file.

        A character is reported when its Unicode general category is ``Cc``
        and it is not listed in ``self.good``.

        Returns:
            A list of strings of the form ``"Cc > b'\\\\x00'"`` (the category,
            then the ``unicode_escape`` encoding of the character), in file
            order. The list is empty when nothing is flagged.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        bad_characters = []
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(self.filename, "r", encoding="utf-8") as current_file:
            for line in current_file:
                for character in line:
                    if category(character) == "Cc" and character not in self.good:
                        bad_characters.append(
                            f"{category(character)} > {character.encode('unicode_escape')}"
                        )
        return bad_characters
if __name__ == "__main__":
    # When run as a script, scan the "data" directory and print the list of
    # files that contain flagged characters.
    scan = FileSet("data")
    print(scan.bad_files)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment