Created
November 29, 2012 12:33
-
-
Save digitalex/4168735 to your computer and use it in GitHub Desktop.
Levenshtein w/diagonal stripe optimization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.*; | |
import java.io.*; | |
public class Dictionary { | |
private List<String> words; | |
public Dictionary(List<String> list) { | |
this.words = list; | |
} | |
public static Dictionary readFromFile(String filename) throws IOException { | |
Scanner scanner = new Scanner(new File(filename)); | |
List<String> words = new ArrayList<String>(178691); | |
while (scanner.hasNext()) { | |
words.add(scanner.next().trim().toUpperCase()); | |
} | |
return new Dictionary(words); | |
} | |
public final int getMinDistance(String word) { | |
String prevWord = ""; | |
int bestSoFar = 1000; | |
for (String w : this.words) { | |
int lowerBound = Math.abs(w.length() - word.length()); | |
if (bestSoFar > lowerBound) { | |
int prefixLength = commonPrefixLength(prevWord, w); | |
int k = bestSoFar - 1; | |
int d = levenshtein(w, word.toUpperCase(), k, prefixLength); | |
bestSoFar = Math.min(bestSoFar, d); | |
if (bestSoFar == 0) { return 0; } | |
prevWord = w; | |
} | |
} | |
return bestSoFar; | |
} | |
public static final int commonPrefixLength(String s, String t) { | |
int i = 0, n = Math.min(s.length(), t.length()); | |
while (i < n && s.charAt(i) == t.charAt(i)) { i++; } | |
return i; | |
} | |
private final int levenshtein(String s, String t, int k, int firstRow) { | |
if (s.equals(t)) { return 0; } | |
int m = s.length() + 1; | |
int n = t.length() + 1; | |
int lowerBound = Integer.MAX_VALUE; | |
for (int i = firstRow + 1; i < m; i++) { | |
for (int j = Math.max(i-k, 1); j < Math.min(i+k+1, n); j++) { | |
if (s.charAt(i-1) == t.charAt(j-1)) { | |
this.m[i][j] = this.m[i-1][j-1]; | |
} | |
else { | |
// substitution | |
this.m[i][j] = this.m[i-1][j-1] + 1; | |
// deletion | |
if (j < i+k) // ignore [i-1][j] if it's outside of diagonal stripe | |
this.m[i][j] = Math.min(this.m[i][j], this.m[i-1][j] + 1); | |
// insertion | |
if (j > i-k) // ignore [i][j-1] if it's outside of diagonal stripe | |
this.m[i][j] = Math.min(this.m[i][j], this.m[i][j-1] + 1); | |
} | |
lowerBound = Math.min(lowerBound, this.m[i][j]); | |
} | |
// Since the result can't possibly be less than best so far; | |
if (lowerBound > k) { | |
return Integer.MAX_VALUE; | |
} | |
} | |
return this.m[m-1][n-1]; | |
} | |
private int[][] m = { | |
{0 ,1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}, | |
{1 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{2 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{3 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{4 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{5 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{6 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{7 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{8 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{9 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{10,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{11,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{12,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{13,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{14,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{15,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{16,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{17,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{18,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{19,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{20,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{21,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{22,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{23,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{24,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, | |
{25,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment