This Python script identifies and removes duplicate files in a specified directory based on the Levenshtein distance between their names. Only the first file in each group of similar files is kept, and the rest are deleted. The script excludes directories and only processes files.
- Python 3.x
- colorama library for colored console output
Install the colorama library using pip:
pip install colorama

import os
from collections import defaultdict
import colorama
from colorama import Fore, Style
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only two rows of the dynamic-programming table are held at a time.
    """
    # Put the longer string on the outer loop so each row is as short
    # as possible; the distance is symmetric, so swapping is harmless.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    # If even the longer string is empty, both are empty.
    if len(longer) == 0:
        return len(shorter)

    # Row 0: cost of building each prefix of `shorter` from nothing.
    above = list(range(len(shorter) + 1))
    for row_number, long_char in enumerate(longer, start=1):
        # Column 0: cost of deleting the first `row_number` characters.
        row = [row_number]
        for col, short_char in enumerate(shorter):
            via_insert = above[col + 1] + 1
            via_delete = row[col] + 1
            via_replace = above[col] + (long_char != short_char)
            row.append(min(via_insert, via_delete, via_replace))
        above = row
    return above[-1]
def main(folder_path='D:\\RetroBat\\roms\\mame', max_distance=5, dry_run=False):
    """Group similarly named files in a folder and delete all but one per group.

    Every regular file directly inside ``folder_path`` (sub-directories are
    skipped) is compared, by its name without extension, against the key of
    each existing group. It joins the first group whose key is within
    ``max_distance`` Levenshtein edits; otherwise it starts a new group keyed
    by its own base name. For each group with more than one member the first
    file is kept and the rest are removed.

    Args:
        folder_path: Directory to scan. Defaults to the original hard-coded
            location so calling ``main()`` behaves as before.
        max_distance: Largest edit distance at which two base names are
            treated as duplicates. NOTE: with short file names a value of 5
            can group unrelated files, so lower it if names are short.
        dry_run: When true, report what would be deleted without removing
            anything.

    Raises:
        OSError: If ``folder_path`` cannot be listed (for example, it does
            not exist). Failures to delete an individual file are reported
            and do not stop the run.
    """
    colorama.init()
    try:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # choice of which file is "first" (and therefore kept) deterministic.
        file_names = sorted(
            name for name in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, name))
        )

        grouped_files = defaultdict(list)
        print(Fore.CYAN + 'Analyzing files for grouping...' + Style.RESET_ALL)
        for file_name in file_names:
            base_name = os.path.splitext(file_name)[0]
            for key in grouped_files:
                if levenshtein(base_name, key) <= max_distance:
                    grouped_files[key].append(file_name)
                    break
            else:
                # No existing group was close enough: start a new one.
                grouped_files[base_name].append(file_name)

        print(Fore.GREEN + 'Processing groups and removing duplicates...' + Style.RESET_ALL)
        for group, members in grouped_files.items():
            if len(members) < 2:
                continue
            print(Fore.YELLOW + f"Group based on '{group}': {members}" + Style.RESET_ALL)
            keeper, *duplicates = members  # Keep the first file
            print(Fore.GREEN + f"Keeping: {keeper}" + Style.RESET_ALL)
            for file_to_delete in duplicates:
                file_path = os.path.join(folder_path, file_to_delete)
                if dry_run:
                    print(Fore.RED + f"Would delete: {file_to_delete}" + Style.RESET_ALL)
                    continue
                print(Fore.RED + f"Deleting: {file_to_delete}" + Style.RESET_ALL)
                try:
                    os.remove(file_path)
                except OSError as exc:
                    # One locked or missing file should not abort the whole run.
                    print(Fore.RED + f"Could not delete {file_to_delete}: {exc}" + Style.RESET_ALL)
        print(Fore.GREEN + 'Duplication removal process completed.' + Style.RESET_ALL)
    finally:
        # Always restore the original stdout/stderr, even after an error.
        colorama.deinit()
# Run the duplicate-removal routine only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
- Save the script as manage_files.py.
- Open a terminal or command prompt.
- Navigate to the directory where the script is saved.
- Run the script with Python: python manage_files.py
Make sure to have backups of your files before running the script, especially if running it on important data. Test the script on a sample directory to ensure it behaves as expected.