Skip to content

Instantly share code, notes, and snippets.

@markpbaggett
Last active March 5, 2021 19:59
Show Gist options
  • Save markpbaggett/42933b6918a57f550c2c4fa3e431fc6a to your computer and use it in GitHub Desktop.
Save markpbaggett/42933b6918a57f550c2c4fa3e431fc6a to your computer and use it in GitHub Desktop.
Check Files in a Directory for Bad Unicode Characters
from unicodedata import category
import os
class FileSet:
    """Scan a directory tree for files that contain bad control characters.

    The scan runs eagerly: constructing a ``FileSet`` walks ``path``
    immediately and stores the findings in ``bad_files``.
    """

    def __init__(self, path):
        """Walk *path* and record every file that has findings.

        Args:
            path: Root directory to scan; subdirectories are included.
        """
        self.path = path
        # List of (file path, findings) tuples, one entry per offending file.
        self.bad_files = self.__check_files()

    def __check_files(self):
        """Return ``(file_path, findings)`` for each file with findings.

        This must be an instance method: it reads ``self.path``. (It was
        previously declared as a ``@staticmethod`` without parameters, which
        raised ``NameError`` on ``self`` as soon as it was called.)

        Returns:
            A list of tuples. The first item is the file path built as
            ``"{directory}/{file name}"``; the second is the non-empty list
            returned by ``DataChecker.check_if_contains_control_characters``.
            The list is empty when no file has findings, including when
            ``os.walk`` yields no files at all.
        """
        bad_files = []
        for path, _directories, files in os.walk(self.path):
            for file in files:
                # Keep the "/"-joined form so reported paths stay unchanged.
                file_path = f"{path}/{file}"
                findings = DataChecker(
                    file_path
                ).check_if_contains_control_characters()
                if findings:
                    bad_files.append((file_path, findings))
        return bad_files
class DataChecker:
    """Find disallowed Unicode control characters in a single text file."""

    def __init__(self, filename, encoding=None):
        """Remember which file to inspect and how to decode it.

        Args:
            filename: Path of the file to inspect.
            encoding: Text encoding passed to ``open``. ``None`` (the
                default) leaves the choice to ``open``, i.e. the platform
                default, which is what this class always did before the
                parameter existed.
        """
        self.filename = filename
        self.encoding = encoding
        # Control characters that are NOT reported: tab, line feed and
        # carriage return. "\n"/"\u000A" and "\t"/"\u0009" are the same
        # characters spelled two ways; the tuple is kept as-is because it is
        # a public attribute.
        self.good = ("\n", "\t", "\u0009", "\u000A", "\u000D")

    def check_if_contains_control_characters(self):
        """Return a description of every disallowed control character.

        A character is reported when its Unicode general category is ``Cc``
        and it is not listed in ``self.good``.

        Returns:
            A list of strings of the form ``"Cc > b'\\\\x00'"`` (category,
            then the ``unicode_escape`` encoding of the character), in file
            order, one entry per occurrence. Empty when nothing is found.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file cannot be decoded with the
                selected encoding.
        """
        bad_characters = []
        with open(self.filename, "r", encoding=self.encoding) as current_file:
            for line in current_file:
                for character in line:
                    # Look the category up once and reuse it in the message.
                    kind = category(character)
                    if kind == "Cc" and character not in self.good:
                        bad_characters.append(
                            f"{kind} > {character.encode('unicode_escape')}"
                        )
        return bad_characters
if __name__ == "__main__":
    # When run as a script, scan the "data" directory relative to the
    # current working directory and print the list of offending files.
    file_set = FileSet("data")
    print(file_set.bad_files)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment