Last active
August 31, 2023 08:48
-
-
Save Mark-McCracken/84b5e5cf17f760ce6a55e7f9e8d3082c to your computer and use it in GitHub Desktop.
BigQuery string fuzzy matching damerau-levenshtein distance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE FUNCTION `functions.fuzzy_match_damerau_levenshtein_distance`(s1 STRING, s2 STRING) RETURNS INT64 LANGUAGE js AS """ | |
/* | |
* Based off https://github.com/fabvalaaah/damerau-levenshtein-js | |
* input: Two strings to compare the edit distance of. You probably want to lowercase/trim these before passing to this function | |
* returns: Integer of the edit distance. | |
*/ | |
const initMatrix = (s1, s2) => { | |
if (undefined == s1 || undefined == s2) { | |
return null; | |
} | |
let d = []; | |
for (let i = 0; i <= s1.length; i++) { | |
d[i] = []; | |
d[i][0] = i; | |
} | |
for (let j = 0; j <= s2.length; j++) { | |
d[0][j] = j; | |
} | |
return d; | |
}; | |
const damerau = (i, j, s1, s2, d, cost) => { | |
if (i > 1 && j > 1 && s1[i - 1] === s2[j - 2] && s1[i - 2] === s2[j - 1]) { | |
d[i][j] = Math.min.apply(null, [d[i][j], d[i - 2][j - 2] + cost]); | |
} | |
}; | |
if ( | |
undefined == s1 || | |
undefined == s2 || | |
"string" !== typeof s1 || | |
"string" !== typeof s2 | |
) { | |
return -1; | |
} | |
let d = initMatrix(s1, s2); | |
if (null === d) { | |
return -1; | |
} | |
for (var i = 1; i <= s1.length; i++) { | |
let cost; | |
for (let j = 1; j <= s2.length; j++) { | |
if (s1.charAt(i - 1) === s2.charAt(j - 1)) { | |
cost = 0; | |
} else { | |
cost = 1; | |
} | |
d[i][j] = Math.min.apply(null, [ | |
d[i - 1][j] + 1, | |
d[i][j - 1] + 1, | |
d[i - 1][j - 1] + cost | |
]); | |
damerau(i, j, s1, s2, d, cost); | |
} | |
} | |
return d[s1.length][s2.length]; | |
"""; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment