Skip to content

Instantly share code, notes, and snippets.

@9000cats
Last active April 10, 2025 18:54
Show Gist options
  • Save 9000cats/37410a57b7f8493d869e0197f63225fb to your computer and use it in GitHub Desktop.
Save 9000cats/37410a57b7f8493d869e0197f63225fb to your computer and use it in GitHub Desktop.
<#
.SYNOPSIS
Scans one or more folders for potential duplicate files using filename similarity, fuzzy content hashing, or both.
.DESCRIPTION
This script compares files using two techniques:
• Filename similarity using the Levenshtein distance algorithm (ignores file extensions)
• Fuzzy content similarity using Context-Triggered Piecewise Hashing (CTPH) via ssdeep
Users can select one or more folders through a Windows Forms folder picker.
Files are compared recursively, and a file size check is used before any content-based hash comparison.
Three comparison modes are available:
Name – Levenshtein distance on filenames (extensions ignored)
Content – ssdeep fuzzy hash similarity on file bytes (for any file types)
Hybrid – Filename similarity pre-filter (same extension only), then confirm with ssdeep
.NOTES
Author: Warren Held, ChatGPT-o1, ChatGPT-o3-mini-high
Date: 04/10/2025
Requires: PowerShell 5.1 or higher, Windows Forms, ssdeep.exe in $Env:PATH
.PARAMETER Mode
Comparison mode: Name | Content | Hybrid. Default is Name.
.PARAMETER NameThreshold
Minimum filename similarity (1–100) for Name or Hybrid mode. Default is 80. In Hybrid mode the name pre-filter is relaxed to 75% of this value.
.PARAMETER ContentThreshold
Minimum fuzzy hash similarity (1–100) for Content or Hybrid mode. Default is 90.
.PARAMETER ExportCsv
If specified, results will be exported to "DuplicateFiles.csv" in the first selected directory.
.EXAMPLE
Basic filename comparison:
.\Find-SimilarFiles.ps1
Content-based comparison with export:
.\Find-SimilarFiles.ps1 -Mode Content -ContentThreshold 85 -ExportCsv
Hybrid filename and content comparison:
.\Find-SimilarFiles.ps1 -Mode Hybrid -NameThreshold 85 -ContentThreshold 85
.LINK
https://en.wikipedia.org/wiki/Levenshtein_distance
https://ssdeep-project.github.io/ssdeep/index.html
#>
param(
    # Comparison strategy; falls back to 'Name' when not supplied.
    [ValidateSet('Name','Content','Hybrid')]
    [string]$Mode = 'Name',

    # Minimum filename similarity percentage; falls back to 80.
    [ValidateRange(1,100)]
    [int]$NameThreshold = 80,

    # Minimum ssdeep similarity percentage; falls back to 90.
    [ValidateRange(1,100)]
    [int]$ContentThreshold = 90,

    # When present, results are also written to a CSV file.
    [switch]$ExportCsv
)

#Requires -Version 5.1

# Windows Forms supplies the folder picker used by Get-MultipleDirectoriesFromDialog.
Add-Type -AssemblyName System.Windows.Forms
function Get-MultipleDirectoriesFromDialog {
    <#
    .SYNOPSIS
    Shows a folder picker repeatedly and returns the distinct folders the user chose.
    .DESCRIPTION
    The dialog is re-opened after every confirmed selection; pressing Cancel ends the loop.
    A folder that was already picked is reported and not added twice.
    If nothing was selected the script is terminated with exit code 1.
    .OUTPUTS
    The selected folder paths. PowerShell unrolls a one-element array on output, so a
    single selection reaches the caller as a plain string; wrap the call in @() when an
    array is required.
    #>
    [System.Windows.Forms.Application]::EnableVisualStyles()
    $selected = @()
    while ($true) {
        $dlg = New-Object System.Windows.Forms.FolderBrowserDialog
        try {
            $dlg.Description = 'Select a folder to scan. Click Cancel when finished.'
            # Anything other than OK (Cancel / closing the window) ends the selection loop.
            if ($dlg.ShowDialog() -ne [System.Windows.Forms.DialogResult]::OK) { break }
            if ($selected -notcontains $dlg.SelectedPath) { $selected += $dlg.SelectedPath }
            else { Write-Host "Already selected: $($dlg.SelectedPath)" }
        }
        finally {
            # A new dialog is created on every pass; release its native resources each time.
            $dlg.Dispose()
        }
    }
    if (-not $selected) { Write-Warning 'No folders selected – script aborted.'; exit 1 }
    return $selected
}
function Get-LevenshteinDistance {
    <#
    .SYNOPSIS
    Computes the Levenshtein (edit) distance between two strings, case-sensitively.
    .PARAMETER First
    First string to compare.
    .PARAMETER Second
    Second string to compare.
    .OUTPUTS
    Number of single-character insertions, deletions or substitutions needed to turn
    First into Second.
    .NOTES
    Pure-PowerShell counterpart of [FastStringMetrics]::Distance. All comparisons use
    -ceq so both implementations agree: the plain -eq operator is case-insensitive for
    strings, which made the equality shortcut report 0 for 'ABC' vs 'abc'.
    #>
    param(
        [string]$First,
        [string]$Second
    )
    # Handle trivial cases quickly
    if ($First -ceq $Second) { return 0 }
    if ([string]::IsNullOrEmpty($First))  { return $Second.Length }
    if ([string]::IsNullOrEmpty($Second)) { return $First.Length }

    $n = $First.Length
    $m = $Second.Length

    # Only two rows of the DP matrix are kept alive at any time.
    $prev = [int[]]::new($m + 1)
    $curr = [int[]]::new($m + 1)

    # Row 0: turning the empty prefix into Second[0..j) costs j insertions.
    for ($j = 0; $j -le $m; $j++) { $prev[$j] = $j }

    for ($i = 1; $i -le $n; $i++) {
        # Column 0: turning First[0..i) into the empty prefix costs i deletions.
        $curr[0] = $i
        for ($j = 1; $j -le $m; $j++) {
            $cost    = if ($First[$i-1] -ceq $Second[$j-1]) { 0 } else { 1 }
            $insert  = $curr[$j-1] + 1
            $delete  = $prev[$j] + 1
            $replace = $prev[$j-1] + $cost
            $curr[$j] = [Math]::Min([Math]::Min($insert,$delete),$replace)
        }
        # Roll the rows: the row just filled becomes "previous" for the next pass.
        $tmp  = $prev
        $prev = $curr
        $curr = $tmp
    }
    # After the final swap the last completed row lives in $prev.
    return $prev[$m]
}
# --- Compiled C# implementation of the Levenshtein distance algorithm, for speed ----
# Defines the static class FastStringMetrics with a single method, Distance(a, b),
# which uses the same two-row dynamic-programming scheme as Get-LevenshteinDistance.
# The C# comparison (a == b, a[i-1] == b[j-1]) is case-sensitive; Get-NameSimilarity
# lower-cases its inputs before calling it.
# NOTE(review): the comment inside the C# source calls the method "Span-based", but the
# code below works on plain int[] arrays and string indexers; the "about 4x faster"
# figure is the original author's claim and is not verifiable from this file.
# The 'FastStringMetrics' -as [type] probe yields $null until the type exists, so
# Add-Type is skipped when the type is already loaded in the current session.
if (-not ('FastStringMetrics' -as [type])) { # only compile once per session
$csharp = @'
using System;
public static class FastStringMetrics
{
// Span‑based Levenshtein – about 4× faster than pure PowerShell
public static int Distance(string a, string b)
{
if (a == b) return 0;
if (a.Length == 0) return b.Length;
if (b.Length == 0) return a.Length;
var prev = new int[b.Length + 1];
var curr = new int[b.Length + 1];
for (int j = 0; j <= b.Length; j++) prev[j] = j;
for (int i = 1; i <= a.Length; i++)
{
curr[0] = i;
for (int j = 1; j <= b.Length; j++)
{
int cost = a[i - 1] == b[j - 1] ? 0 : 1;
int insert = curr[j - 1] + 1;
int delete = prev[j] + 1;
int replace = prev[j - 1] + cost;
curr[j] = Math.Min(Math.Min(insert, delete), replace);
}
var tmp = prev; prev = curr; curr = tmp; // swap
}
return prev[b.Length];
}
}
'@
# Compile the source above into the current session.
Add-Type -TypeDefinition $csharp -Language CSharp
}
# -----------------------------------------------------------------------------------
function Get-NameSimilarity {
    <#
    .SYNOPSIS
    Returns a 0-100 similarity percentage for two names, ignoring letter case.
    .DESCRIPTION
    Both inputs are lower-cased and passed to [FastStringMetrics]::Distance. The edit
    distance is then normalised by the length of the longer input and rounded to two
    decimals. Two empty inputs are reported as 100.
    #>
    param([string]$First,[string]$Second)

    $lowerFirst  = $First.ToLower()
    $lowerSecond = $Second.ToLower()
    $edits = [FastStringMetrics]::Distance($lowerFirst, $lowerSecond)

    $longest = [Math]::Max($First.Length,$Second.Length)
    if ($longest -eq 0) {
        # Nothing to compare: treat two empty names as identical.
        return 100
    }

    $matchRatio = 1 - ($edits / $longest)
    return [Math]::Round($matchRatio*100,2)
}
function Test-SsdeepInstalled {
    <#
    .SYNOPSIS
    Throws when the ssdeep.exe command cannot be resolved; otherwise returns nothing.
    #>
    $ssdeepCommand = Get-Command 'ssdeep.exe' -ErrorAction SilentlyContinue
    if ($ssdeepCommand) { return }

    throw 'ssdeep.exe not found in $Env:PATH. Download from https://ssdeep-project.github.io/ssdeep/ and try again.'
}
function Get-FuzzySimilarity {
    <#
    .SYNOPSIS
    Returns the ssdeep similarity score (0-100) reported for two files.
    .PARAMETER File1
    Path of the first file.
    .PARAMETER File2
    Path of the second file.
    .OUTPUTS
    The integer score parsed from ssdeep's output, or 0 when ssdeep exits with a
    non-zero code, prints nothing, or prints no line ending in a "(score)" group.
    #>
    param(
        [string]$File1,
        [string]$File2
    )
    # -d : compare every FILE against every other FILE
    # -s : suppress non‑fatal error messages
    $output = & ssdeep.exe -d -s -- "$File1" "$File2" 2>$null
    if ($LASTEXITCODE -ne 0 -or -not $output) { return 0 }

    # ssdeep prints lines like:
    #   C:\path\file1.txt matches C:\path\file2.txt (100)
    # The score is the parenthesised number at the END of the line. Anchoring there
    # matters because file names such as "report (1).txt" contain "(digits)" too, and
    # an unanchored match would pick that up instead of the score.
    foreach ($line in @($output)) {
        $m = [regex]::Match([string]$line, '\((\d+)\)\s*$')
        # if/else instead of the ternary operator: the script targets PowerShell 5.1,
        # where "cond ? a : b" is a syntax error.
        if ($m.Success) { return [int]$m.Groups[1].Value }
    }
    return 0
}
# === Main ===
# @() keeps $paths an array even when a single folder was chosen; without it a lone
# selection arrives as a plain string and $paths[0] would be its first character.
$paths = @(Get-MultipleDirectoriesFromDialog)
Write-Host "`nSelected directories:"; $paths | ForEach-Object { Write-Host " - $_" }

# -LiteralPath: folder names may contain wildcard characters such as [ and ].
# The hash set drops files reached twice (e.g. a folder and one of its sub-folders were
# both selected), which would otherwise be reported as duplicates of themselves.
$seenFiles = [System.Collections.Generic.HashSet[string]]::new([System.StringComparer]::OrdinalIgnoreCase)
$files = $paths |
    ForEach-Object { Get-ChildItem -LiteralPath $_ -Recurse -File } |
    Where-Object { $seenFiles.Add($_.FullName) }
if (-not $files) { Write-Warning 'No files found.'; exit }

# Build catalog once; @() guarantees an indexable array with a usable Count even when
# only one file was found.
$catalog = @($files | ForEach-Object {
    [PSCustomObject]@{
        FullName  = $_.FullName
        NameNoExt = $_.BaseName # filename without extension
        Extension = $_.Extension.ToLower()
        Size      = $_.Length
    }
})

# Content-based modes: warn up front when the smallest file is under 4 KiB.
if ($Mode -eq 'Content' -or $Mode -eq 'Hybrid') {
    $minSize = ($catalog | Measure-Object -Minimum Size).Minimum
    if ($minSize -lt 4096) {
        # The original message was single-quoted and started with `n, which printed
        # a literal backtick-n instead of a line break.
        Write-Warning 'One or more files are < 4KiB; ssdeep may return only 0% or 100% for such files.'
    }
}
# Result rows: File1, File2 and a human-readable Similarity string.
$duplicates = New-Object System.Collections.Generic.List[object]

# Work on a guaranteed array so indexing and .Count behave even for a single entry.
$entries = @($catalog)

# Returns $true when two file sizes differ by at most $Tolerance (default 20%) of the
# LARGER size. Dividing by the larger size makes the check independent of argument
# order; the previous form divided by the first file's size only, so the same pair
# could be kept or skipped depending on enumeration order. Two empty files pass.
function Test-FileSizeComparable {
    param([long]$SizeA, [long]$SizeB, [double]$Tolerance = 0.2)
    $larger = [Math]::Max($SizeA, $SizeB)
    if ($larger -eq 0) { return $true }
    return (([Math]::Abs($SizeA - $SizeB) / [double]$larger) -le $Tolerance)
}

# Every mode walks each unordered pair (i < j) exactly once.
switch ($Mode) {
    'Name' {
        Write-Host "`nMode: Filename similarity (extensions ignored)"
        for ($i=0; $i -lt $entries.Count; $i++) {
            $left = $entries[$i]
            for ($j=$i+1; $j -lt $entries.Count; $j++) {
                $right = $entries[$j]
                $score = Get-NameSimilarity $left.NameNoExt $right.NameNoExt
                if ($score -ge $NameThreshold) {
                    $duplicates.Add([PSCustomObject]@{
                        File1      = $left.FullName
                        File2      = $right.FullName
                        Similarity = "$score% (name)"
                    }) | Out-Null
                }
            }
        }
    }
    'Content' {
        Write-Host "`nMode: Content similarity via ssdeep (heterogeneous types allowed)"
        Test-SsdeepInstalled
        for ($i=0; $i -lt $entries.Count; $i++) {
            $left = $entries[$i]
            for ($j=$i+1; $j -lt $entries.Count; $j++) {
                $right = $entries[$j]
                # quick size filter – skip if >20% different (avoids an ssdeep launch)
                if (-not (Test-FileSizeComparable $left.Size $right.Size)) { continue }
                $score = Get-FuzzySimilarity $left.FullName $right.FullName
                if ($score -ge $ContentThreshold) {
                    $duplicates.Add([PSCustomObject]@{
                        File1      = $left.FullName
                        File2      = $right.FullName
                        Similarity = "$score% (content)"
                    }) | Out-Null
                }
            }
        }
    }
    'Hybrid' {
        Write-Host "`nMode: Hybrid – name pre‑filter then content confirmation (same‑extension only)"
        Test-SsdeepInstalled
        for ($i=0; $i -lt $entries.Count; $i++) {
            $left = $entries[$i]
            for ($j=$i+1; $j -lt $entries.Count; $j++) {
                $right = $entries[$j]
                if ($left.Extension -ne $right.Extension) { continue } # homogeneous only
                $nameScore = Get-NameSimilarity $left.NameNoExt $right.NameNoExt
                # relaxed pre‑filter: only 75% of the name threshold is required here,
                # the content score below makes the final decision
                if ($nameScore -lt ($NameThreshold*0.75)) { continue }
                # size sanity‑check
                if (-not (Test-FileSizeComparable $left.Size $right.Size)) { continue }
                $contentScore = Get-FuzzySimilarity $left.FullName $right.FullName
                if ($contentScore -ge $ContentThreshold) {
                    $duplicates.Add([PSCustomObject]@{
                        File1      = $left.FullName
                        File2      = $right.FullName
                        Similarity = "Name $nameScore% / Content $contentScore%"
                    }) | Out-Null
                }
            }
        }
    }
}
# === Results ===
# Extracts the numeric value of the LAST "<number>%" in a Similarity display string
# ("87.5% (name)", "93% (content)", "Name 85% / Content 92%") so rows can be ordered
# numerically. Sorting the strings themselves is lexicographic and would rank "95%"
# above "100%". Returns 0 when no percentage is present.
function Get-SimilaritySortKey {
    param([string]$Similarity)
    $hits = [regex]::Matches($Similarity, '(\d+(?:\.\d+)?)%')
    if ($hits.Count -eq 0) { return [double]0 }
    $lastValue = $hits[$hits.Count - 1].Groups[1].Value
    # The scores were formatted through string interpolation, which uses the invariant
    # culture, so parse them the same way regardless of the user's locale.
    return [double]::Parse($lastValue, [System.Globalization.CultureInfo]::InvariantCulture)
}

if ($duplicates.Count) {
    Write-Host "`nPotential duplicates found:`n"
    $duplicates |
        Sort-Object -Property @{ Expression = { Get-SimilaritySortKey $_.Similarity }; Descending = $true } |
        Format-Table -AutoSize
    if ($ExportCsv) {
        # @() protects against $paths being a single string (one folder selected), in
        # which case $paths[0] would yield just its first character.
        $firstFolder = @($paths)[0]
        $csvPath = Join-Path -Path $firstFolder -ChildPath 'DuplicateFiles.csv'
        $duplicates | Export-Csv -Path $csvPath -NoTypeInformation -Encoding UTF8
        Write-Host "`nResults exported to: $csvPath"
    }
} else {
    Write-Host "`nNo duplicates detected under the chosen thresholds."
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment