Last active
April 10, 2025 18:54
-
-
Save 9000cats/37410a57b7f8493d869e0197f63225fb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
    Scans one or more folders for potential duplicate files using filename similarity, fuzzy content hashing, or both.
.DESCRIPTION
    This script compares files using two techniques:
      • Filename similarity using the Levenshtein distance algorithm (ignores file extensions)
      • Fuzzy content similarity using Context-Triggered Piecewise Hashing (CTPH) via ssdeep
    Users can select one or more folders through a Windows Forms folder picker.
    Files are compared recursively, and a file size check is used before any content-based hash comparison.
    Three comparison modes are available:
      Name    – Levenshtein distance on filenames (extensions ignored)
      Content – ssdeep fuzzy hash similarity on file bytes (for any file types)
      Hybrid  – Filename similarity pre-filter (same extension only), then confirm with ssdeep
.NOTES
    Author: Warren Held, ChatGPT-o1, ChatGPT-o3-mini-high
    Date: 04/10/2025
    Requires: PowerShell 5.1 or higher, Windows Forms, ssdeep.exe in $Env:PATH
.PARAMETER Mode
    Comparison mode: Name | Content | Hybrid. Default is Name.
.PARAMETER NameThreshold
    Minimum filename similarity (1–100) for Name or Hybrid mode. Default is 80.
.PARAMETER ContentThreshold
    Minimum fuzzy hash similarity (1–100) for Content or Hybrid mode. Default is 90.
.PARAMETER ExportCsv
    If specified, results will be exported to "DuplicateFiles.csv" in the first selected directory.
.EXAMPLE
    Basic filename comparison:
    .\Find-SimilarFiles.ps1
    Content-based comparison with export:
    .\Find-SimilarFiles.ps1 -Mode Content -ContentThreshold 85 -ExportCsv
    Hybrid filename and content comparison:
    .\Find-SimilarFiles.ps1 -Mode Hybrid -NameThreshold 85 -ContentThreshold 85
.LINK
    https://en.wikipedia.org/wiki/Levenshtein_distance
    https://ssdeep-project.github.io/ssdeep/index.html
#>
param(
    # Which comparison strategy to run.
    [ValidateSet('Name', 'Content', 'Hybrid')]
    [string]$Mode = 'Name',

    # Minimum filename similarity percentage for a pair to be reported.
    [ValidateRange(1, 100)]
    [int]$NameThreshold = 80,

    # Minimum ssdeep similarity percentage for a pair to be reported.
    [ValidateRange(1, 100)]
    [int]$ContentThreshold = 90,

    # When set, results are also written to a CSV file.
    [switch]$ExportCsv
)

# Defaults are declared directly on the parameters above, so no post-binding fix-up is needed.
#Requires -Version 5.1

# The folder picker used further down lives in Windows Forms.
Add-Type -AssemblyName System.Windows.Forms
function Get-MultipleDirectoriesFromDialog {
    <#
    .SYNOPSIS
        Shows a folder picker repeatedly and returns every distinct folder the user chose.
    .DESCRIPTION
        A new FolderBrowserDialog is shown in a loop. Each OK adds the chosen path (unless it
        was already chosen); the first Cancel ends the loop. If nothing was selected the
        function writes a warning and terminates the script with exit code 1.
    .OUTPUTS
        System.String. One path per selected folder. PowerShell enumerates the array on
        output, so a single selection reaches the caller as a plain string rather than a
        one-element array.
    #>
    [System.Windows.Forms.Application]::EnableVisualStyles()
    $selected = @()
    while ($true) {
        $dlg = New-Object System.Windows.Forms.FolderBrowserDialog
        try {
            $dlg.Description = 'Select a folder to scan. Click Cancel when finished.'
            # Anything other than OK (Cancel / closing the window) means "done selecting".
            if ($dlg.ShowDialog() -ne [System.Windows.Forms.DialogResult]::OK) { break }

            if ($selected -notcontains $dlg.SelectedPath) { $selected += $dlg.SelectedPath }
            else { Write-Host "Already selected: $($dlg.SelectedPath)" }
        }
        finally {
            # The dialog is IDisposable; release it on every iteration,
            # including the one that breaks out of the loop.
            $dlg.Dispose()
        }
    }
    if (-not $selected) { Write-Warning 'No folders selected – script aborted.'; exit 1 }
    return $selected
}
function Get-LevenshteinDistance {
    <#
    .SYNOPSIS
        Computes the case-sensitive Levenshtein edit distance between two strings.
    .DESCRIPTION
        Pure-PowerShell implementation that keeps only two rows of the dynamic-programming
        table. All comparisons use the case-sensitive -ceq operator so that the equality
        shortcut and the per-character cost agree with each other and with
        [FastStringMetrics]::Distance. Callers that want case-insensitive results should
        normalise case before calling.
    .PARAMETER First
        First string. $null is bound as an empty string by the [string] type.
    .PARAMETER Second
        Second string. $null is bound as an empty string by the [string] type.
    .OUTPUTS
        System.Int32. The minimum number of single-character insertions, deletions and
        substitutions needed to turn First into Second.
    #>
    param(
        [string]$First,
        [string]$Second
    )
    # Handle trivial cases quickly. -ceq (not -eq) keeps this shortcut case-sensitive,
    # matching the character comparison in the main loop.
    if ($First -ceq $Second) { return 0 }
    if (-not $First) { return $Second.Length }
    if (-not $Second) { return $First.Length }

    $n = $First.Length
    $m = $Second.Length

    # Two strongly-typed 1-D rows: $prev is row i-1, $curr is row i.
    $prev = [int[]]::new($m + 1)
    $curr = [int[]]::new($m + 1)

    # Row 0: turning an empty prefix into the first j characters costs j insertions.
    for ($j = 0; $j -le $m; $j++) { $prev[$j] = $j }

    for ($i = 1; $i -le $n; $i++) {
        # Column 0: turning the first i characters into an empty prefix costs i deletions.
        $curr[0] = $i
        for ($j = 1; $j -le $m; $j++) {
            $cost = if ($First[$i - 1] -ceq $Second[$j - 1]) { 0 } else { 1 }
            $insert = $curr[$j - 1] + 1
            $delete = $prev[$j] + 1
            $replace = $prev[$j - 1] + $cost
            $curr[$j] = [Math]::Min([Math]::Min($insert, $delete), $replace)
        }
        # Roll the rows: the row just filled becomes "previous" for the next iteration.
        $tmp = $prev
        $prev = $curr
        $curr = $tmp
    }
    # After the final swap the last completed row is in $prev.
    return $prev[$m]
}
# --- JIT compiled C# to implement Levenshtein distance algorithm for speed ---------
# Add-Type cannot redefine a type that is already loaded, so compile only once per session.
if (-not ('FastStringMetrics' -as [type])) {
    $csharp = @'
using System;

/// <summary>String metrics compiled to IL so the hot comparison loop avoids the PowerShell interpreter.</summary>
public static class FastStringMetrics
{
    /// <summary>
    /// Computes the ordinal (case-sensitive) Levenshtein distance between two strings
    /// using two rolling rows of the dynamic-programming table.
    /// </summary>
    /// <param name="a">First string; null is treated as empty.</param>
    /// <param name="b">Second string; null is treated as empty.</param>
    /// <returns>Minimum number of single-character insertions, deletions and substitutions.</returns>
    public static int Distance(string a, string b)
    {
        // Treat null as empty so a.Length / b.Length below can never throw.
        a = a ?? string.Empty;
        b = b ?? string.Empty;

        if (a == b)
        {
            return 0;
        }
        if (a.Length == 0)
        {
            return b.Length;
        }
        if (b.Length == 0)
        {
            return a.Length;
        }

        var prev = new int[b.Length + 1];
        var curr = new int[b.Length + 1];

        for (int j = 0; j <= b.Length; j++)
        {
            prev[j] = j;
        }

        for (int i = 1; i <= a.Length; i++)
        {
            curr[0] = i;
            for (int j = 1; j <= b.Length; j++)
            {
                int cost = a[i - 1] == b[j - 1] ? 0 : 1;
                int insert = curr[j - 1] + 1;
                int delete = prev[j] + 1;
                int replace = prev[j - 1] + cost;
                curr[j] = Math.Min(Math.Min(insert, delete), replace);
            }

            // Swap rows: the row just filled becomes the previous row.
            var tmp = prev;
            prev = curr;
            curr = tmp;
        }

        return prev[b.Length];
    }
}
'@
    Add-Type -TypeDefinition $csharp -Language CSharp
}
# -----------------------------------------------------------------------------------
function Get-NameSimilarity {
    <#
    .SYNOPSIS
        Returns a case-insensitive similarity percentage (0-100) for two names.
    .DESCRIPTION
        Both inputs are lower-cased with the invariant culture so the result does not depend
        on the machine's locale, then compared with [FastStringMetrics]::Distance. The score
        is (1 - distance / longerLength) * 100, rounded to two decimals. Two empty names
        are considered identical (100).
    .PARAMETER First
        First name to compare.
    .PARAMETER Second
        Second name to compare.
    .OUTPUTS
        A number between 0 and 100.
    #>
    param([string]$First, [string]$Second)

    $left = $First.ToLowerInvariant()
    $right = $Second.ToLowerInvariant()

    # Measure the length on the same normalised strings that the distance is computed on.
    $maxLength = [Math]::Max($left.Length, $right.Length)
    if ($maxLength -eq 0) { return 100 }

    $distance = [FastStringMetrics]::Distance($left, $right)
    return [Math]::Round((1 - ($distance / $maxLength)) * 100, 2)
}
function Test-SsdeepInstalled {
    # Guard used before any content comparison: returns silently when ssdeep.exe can be
    # resolved as a command, otherwise throws with download instructions.
    $ssdeepCommand = Get-Command 'ssdeep.exe' -ErrorAction SilentlyContinue
    if ($ssdeepCommand) { return }

    throw 'ssdeep.exe not found in $Env:PATH. Download from https://ssdeep-project.github.io/ssdeep/ and try again.'
}
function Get-FuzzySimilarity {
    <#
    .SYNOPSIS
        Returns the ssdeep similarity score for two files, or 0 when no match is reported.
    .DESCRIPTION
        Runs ssdeep.exe on the two files and parses the score from its output. Returns 0
        when ssdeep exits with a non-zero code, prints nothing, or prints nothing that
        ends in a parenthesised number.
    .PARAMETER File1
        Path of the first file.
    .PARAMETER File2
        Path of the second file.
    .OUTPUTS
        System.Int32. The score printed by ssdeep, or 0.
    #>
    param(
        [string]$File1,
        [string]$File2
    )
    # -d : compare every FILE against every other FILE
    # -s : suppress non-fatal error messages
    $output = & ssdeep.exe -d -s -- "$File1" "$File2" 2>$null
    if ($LASTEXITCODE -ne 0 -or -not $output) { return 0 }

    # ssdeep prints lines like:
    #   C:\path\file1.txt matches C:\path\file2.txt (100)
    # Anchor the pattern to the end of a line so a parenthesised number inside a file
    # name (e.g. "report (1).pdf") is not mistaken for the score.
    $text = $output -join "`n"
    $m = [regex]::Match($text, '\((\d+)\)\s*$', [System.Text.RegularExpressions.RegexOptions]::Multiline)

    # Plain if/else instead of the ternary operator, which is not available in
    # Windows PowerShell 5.1 (the version this script declares via #Requires).
    if ($m.Success) { return [int]$m.Groups[1].Value }
    return 0
}
# === Main ===

function Test-SizeWithinTolerance {
    # Cheap pre-filter before the expensive content hash: true when the two sizes differ
    # by at most $Tolerance relative to the larger one. Using the larger size makes the
    # check independent of argument order and avoids dividing by zero for empty files.
    param([long]$SizeA, [long]$SizeB, [double]$Tolerance = 0.2)
    $larger = [Math]::Max($SizeA, $SizeB)
    if ($larger -eq 0) { return $true }   # both files are empty
    return ([Math]::Abs($SizeA - $SizeB) / [double]$larger) -le $Tolerance
}

# @() keeps $paths an array even when a single folder was chosen; otherwise $paths[0]
# below would be the first CHARACTER of the path instead of the first folder.
$paths = @(Get-MultipleDirectoriesFromDialog)
Write-Host "`nSelected directories:"; $paths | ForEach-Object { Write-Host " - $_" }

# -LiteralPath: folder names may contain wildcard characters such as [ or ].
# Sort -Unique on FullName: overlapping selections (a folder and one of its sub-folders)
# must not list the same file twice, or it would be reported as a duplicate of itself.
$files = @(
    $paths |
        ForEach-Object { Get-ChildItem -LiteralPath $_ -Recurse -File } |
        Sort-Object -Property FullName -Unique
)
if ($files.Count -eq 0) { Write-Warning 'No files found.'; exit }

# Build catalog once
$catalog = @($files | ForEach-Object {
    [PSCustomObject]@{
        FullName  = $_.FullName
        NameNoExt = $_.BaseName                        # filename without extension
        Extension = $_.Extension.ToLowerInvariant()
        Size      = $_.Length
    }
})

if ($Mode -eq 'Content' -or $Mode -eq 'Hybrid') {
    $minSize = ($catalog | Measure-Object -Property Size -Minimum).Minimum
    if ($minSize -lt 4096) {
        Write-Warning 'One or more files are < 4KiB; ssdeep may return only 0% or 100% for such files.'
    }
}

# Announce the mode and, for content-based modes, fail fast if ssdeep is missing.
switch ($Mode) {
    'Name' {
        Write-Host "`nMode: Filename similarity (extensions ignored)"
    }
    'Content' {
        Write-Host "`nMode: Content similarity via ssdeep (heterogeneous types allowed)"
        Test-SsdeepInstalled
    }
    'Hybrid' {
        Write-Host "`nMode: Hybrid – name pre‑filter then content confirmation (same‑extension only)"
        Test-SsdeepInstalled
    }
}

$useName    = $Mode -ne 'Content'
$useContent = $Mode -ne 'Name'
# Hybrid uses a relaxed (75%) name threshold because content similarity confirms the match.
$nameFloor  = if ($Mode -eq 'Hybrid') { $NameThreshold * 0.75 } else { $NameThreshold }

$duplicates = New-Object System.Collections.Generic.List[object]
for ($i = 0; $i -lt $catalog.Count; $i++) {
    $left = $catalog[$i]
    for ($j = $i + 1; $j -lt $catalog.Count; $j++) {
        $right = $catalog[$j]
        $nameScore = $null
        $contentScore = $null

        if ($useName) {
            # Hybrid compares homogeneous file types only.
            if ($Mode -eq 'Hybrid' -and $left.Extension -ne $right.Extension) { continue }
            $nameScore = Get-NameSimilarity $left.NameNoExt $right.NameNoExt
            if ($nameScore -lt $nameFloor) { continue }
        }

        if ($useContent) {
            # quick size filter – skip if >20% different
            if (-not (Test-SizeWithinTolerance $left.Size $right.Size)) { continue }
            $contentScore = Get-FuzzySimilarity $left.FullName $right.FullName
            if ($contentScore -lt $ContentThreshold) { continue }
        }

        switch ($Mode) {
            'Name'    { $label = "$nameScore% (name)";       $primary = $nameScore;    $secondary = 0 }
            'Content' { $label = "$contentScore% (content)"; $primary = $contentScore; $secondary = 0 }
            'Hybrid'  { $label = "Name $nameScore% / Content $contentScore%"; $primary = $contentScore; $secondary = $nameScore }
        }

        $duplicates.Add([PSCustomObject]@{
            File1      = $left.FullName
            File2      = $right.FullName
            Similarity = $label
            # Numeric keys used only for ranking; the Similarity text would sort
            # lexically ("95%" above "100%").
            Primary    = [double]$primary
            Secondary  = [double]$secondary
        })
    }
}

# === Results ===
if ($duplicates.Count) {
    Write-Host "`nPotential duplicates found:`n"
    # Rank numerically, then drop the helper columns so the output shape is unchanged.
    $ranked = $duplicates |
        Sort-Object -Property Primary, Secondary -Descending |
        Select-Object -Property File1, File2, Similarity
    $ranked | Format-Table -AutoSize
    if ($ExportCsv) {
        $csvPath = Join-Path -Path $paths[0] -ChildPath 'DuplicateFiles.csv'
        $ranked | Export-Csv -LiteralPath $csvPath -NoTypeInformation -Encoding UTF8
        Write-Host "`nResults exported to: $csvPath"
    }
} else {
    Write-Host "`nNo duplicates detected under the chosen thresholds."
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment