Created
November 5, 2018 05:38
-
-
Save septagon/824d662e03662ba63fb39d6dbffb1118 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Diagnostics; | |
using System.Linq; | |
/** | |
* Theory described here: https://en.wikipedia.org/wiki/Levenshtein_distance | |
* Algorithm described here: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm | |
* | |
* A basic, barely-tested implementation of the Wagner-Fischer algorithm for | |
* Levenshtein distance, modified to support variable costs of insertion, deletion, | |
* and substitution. I'm pretty sure this isn't novel at all (the possibility of | |
* doing this was mentioned on the Wikipedia page, even) but I didn't immediately | |
* find an existing implementation that did what I wanted, so I made this one. As | |
* stated before, this is only slightly tested and has basically no optimization | |
* (uses a full matrix, etc.). I may update this gist as I test and tweak this | |
* implementation; but for now, this is what I've got. | |
* | |
* | |
* No copyright, public domain. Go nuts. | |
*/ | |
namespace Levenshtein | |
{ | |
// Alphabet class to support arbitrary alphabets, making the usage a little | |
// more general. | |
public class Alphabet<T> | |
{ | |
private Dictionary<T, int> characterToIdx; | |
private float[] insertionCosts; | |
private float[] deletionCosts; | |
private float[,] substitutionCosts; | |
public int GetCharacterIdx(T character) | |
{ | |
return this.characterToIdx[character]; | |
} | |
public float GetInsertionCost(int idx) | |
{ | |
return this.insertionCosts[idx]; | |
} | |
public float GetDeletionCost(int idx) | |
{ | |
return this.deletionCosts[idx]; | |
} | |
public float GetSubstitutionCost(int idx1, int idx2) | |
{ | |
int min = Math.Min(idx1, idx2); | |
int max = Math.Max(idx1, idx2); | |
return this.substitutionCosts[min, max]; | |
} | |
// Basic constructor, in case you don't want to customize your costs. | |
public Alphabet(T[] characters) : this(characters, _ => 1f, _ => 1f, (a, b) => a.Equals(b) ? 0f : 1f) { } | |
// Fully-featured constructor with complete control of edit costs. | |
public Alphabet( | |
T[] characters, | |
Func<T, float> charToInsertionCost, | |
Func<T, float> charToDeletionCost, | |
Func<T, T, float> charsToSubstitutionCost) | |
{ | |
this.characterToIdx = new Dictionary<T, int>(); | |
this.insertionCosts = new float[characters.Length]; | |
this.deletionCosts = new float[characters.Length]; | |
this.substitutionCosts = new float[characters.Length, characters.Length]; | |
for (int outerIdx = 0; outerIdx < characters.Length; outerIdx++) | |
{ | |
T c = characters[outerIdx]; | |
this.characterToIdx[c] = outerIdx; | |
this.insertionCosts[outerIdx] = charToInsertionCost(c); | |
this.deletionCosts[outerIdx] = charToDeletionCost(c); | |
for (int innerIdx = outerIdx; innerIdx < characters.Length; innerIdx++) | |
{ | |
this.substitutionCosts[outerIdx, innerIdx] = | |
charsToSubstitutionCost(c, characters[innerIdx]); | |
} | |
} | |
} | |
} | |
public struct String<T> | |
{ | |
private Alphabet<T> alphabet; | |
private int[] characters; | |
private int Length | |
{ | |
get | |
{ | |
return this.characters.Length; | |
} | |
} | |
private int this[int idx] | |
{ | |
get | |
{ | |
return this.characters[idx]; | |
} | |
} | |
public String(T[] characters, Alphabet<T> alphabet) | |
{ | |
this.alphabet = alphabet; | |
this.characters = characters.Select(c => alphabet.GetCharacterIdx(c)).ToArray(); | |
} | |
public float Distance(String<T> other) | |
{ | |
return Distance(this, other); | |
} | |
// This is the core of it: an implementation of the Wagner-Fisher algorithm. | |
public static float Distance(String<T> from, String<T> to) | |
{ | |
// Comparing strings built from different alphabets is nonsense. | |
Debug.Assert(from.alphabet == to.alphabet); | |
var alphabet = from.alphabet; | |
// Initialize the cost matrix, populating the first row and column. | |
float[,] costMatrix = new float[from.Length + 1, to.Length + 1]; | |
costMatrix[0, 0] = 0f; | |
for (int idx = 0; idx < from.Length; idx++) | |
{ | |
costMatrix[idx + 1, 0] = costMatrix[idx, 0] + alphabet.GetInsertionCost(from[idx]); | |
} | |
for (int idx = 0; idx < to.Length; idx++) | |
{ | |
costMatrix[0, idx + 1] = costMatrix[0, idx] + alphabet.GetInsertionCost(to[idx]); | |
} | |
// Populate the cost matrix. | |
for (int fromIdx = 0; fromIdx < from.Length; fromIdx++) | |
{ | |
for (int toIdx = 0; toIdx < to.Length; toIdx++) | |
{ | |
float insertionCost = costMatrix[fromIdx + 1, toIdx] + | |
alphabet.GetInsertionCost(to[toIdx]); | |
float deletionCost = costMatrix[fromIdx, toIdx + 1] + | |
alphabet.GetDeletionCost(from[fromIdx]); | |
float substitutionCost = costMatrix[fromIdx, toIdx] + | |
alphabet.GetSubstitutionCost(from[fromIdx], to[toIdx]); | |
costMatrix[fromIdx + 1, toIdx + 1] = Min(insertionCost, deletionCost, substitutionCost); | |
} | |
} | |
// Could reconstruct the edit path, if you care; if not, return the distance. | |
return costMatrix[from.Length, to.Length]; | |
} | |
private static float Min(params float[] values) | |
{ | |
return values.Min(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment