Created
May 1, 2023 15:37
-
-
Save gustavonovaes/7bd983bf1b426ee827d7bbcf06b10585 to your computer and use it in GitHub Desktop.
Levenshtein Distance vs Cosine Similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const data = [ | |
["The quick brown fox jumps", ""], | |
["", "The quick brown box jumps"], | |
["The quick brown fox jumps", " "], | |
[" ", "The quick brown box jumps"], | |
["The quick brown fox jumps", " "], | |
[" ", "The quick brown box jumps"], | |
["The quick brown fox jumps", "The quick brown box jumps"], | |
["The quick brown fox jumps", "The fast brown dog jumps"], | |
["The red car drives fast", "The blue car drives slow"], | |
["The old man walks slowly", "The young boy walks quickly"], | |
["The tall tree sways gently", "The short bush sways wildly"], | |
["The sweet apple tastes good", "The sour lemon tastes bad"], | |
["The loud music plays softly", "The quiet music plays loudly"], | |
["The loud music plays softly", "The quiet music plays loudly The quiet music plays loudly The quiet music plays loudly"], | |
["The loud music plays softly The loud music plays softly The loud music plays softly", "The quiet music plays loudly"], | |
] | |
/** | |
* @param {string} str1 | |
* @param {string} str2 | |
* @returns {number} similarity | |
* @see https://en.wikipedia.org/wiki/Levenshtein_distance | |
*/ | |
function levenshteinDistance(str1, str2) { | |
const len1 = str1.length; | |
const len2 = str2.length; | |
const matrix = Array(len1 + 1).fill().map(() => Array(len2 + 1).fill(0)); | |
for (let i = 0; i <= len1; i++) { | |
matrix[i][0] = i; | |
} | |
for (let j = 0; j <= len2; j++) { | |
matrix[0][j] = j; | |
} | |
for (let i = 1; i <= len1; i++) { | |
for (let j = 1; j <= len2; j++) { | |
if (str1[i - 1] === str2[j - 1]) { | |
matrix[i][j] = matrix[i - 1][j - 1]; | |
} else { | |
matrix[i][j] = Math.min( | |
matrix[i - 1][j] + 1, // deletion | |
matrix[i][j - 1] + 1, // insertion | |
matrix[i - 1][j - 1] + 1 // substitution | |
); | |
} | |
} | |
} | |
return 1 - (matrix[len1][len2] / Math.max(len1, len2)); | |
} | |
/** | |
* @param {string} str1 | |
* @param {string} str2 | |
* @returns {number} similarity | |
* @see https://en.wikipedia.org/wiki/Cosine_similarity | |
* @see https://stackoverflow.com/questions/17388213/find-the-similarity-percent-between-two-strings | |
*/ | |
function cosineSimilarity(str1, str2) { | |
const wordSet = new Set([...str1.split(' '), ...str2.split(' ')]); | |
const vec1 = Array.from(wordSet).map(word => { | |
const count = (str1.match(new RegExp(word, 'g')) || []).length; | |
return count; | |
}); | |
const vec2 = Array.from(wordSet).map(word => { | |
const count = (str2.match(new RegExp(word, 'g')) || []).length; | |
return count; | |
}); | |
const dotProduct = vec1.reduce((acc, val, i) => acc + val * vec2[i], 0); | |
const vec1Magnitude = Math.sqrt(vec1.reduce((acc, val) => acc + val * val, 0)); | |
const vec2Magnitude = Math.sqrt(vec2.reduce((acc, val) => acc + val * val, 0)); | |
const similarity = dotProduct / (vec1Magnitude * vec2Magnitude); | |
return similarity; | |
} | |
data.forEach(([str1, str2]) => { | |
console.log(`String 1: ${str1}`); | |
console.log(`String 2: ${str2}`); | |
console.log(`String similarity: ${levenshteinDistance(str1, str2).toFixed(1)}`); | |
console.log(`Cosine similarity: ${cosineSimilarity(str1, str2).toFixed(1)}`); | |
console.log(); | |
}); | |
for (let i = 0; i < 10e2; i++) { | |
data.forEach(([str1, str2]) => { | |
levenshteinDistance(str1, str2); | |
cosineSimilarity(str1, str2); | |
}); | |
} | |
console.time('levenshteinDistance'); | |
for (let i = 0; i < 10e4; i++) { | |
data.forEach(([str1, str2]) => levenshteinDistance(str1, str2)); | |
} | |
console.timeEnd('levenshteinDistance'); | |
console.time('cosineSimilarity'); | |
for (let i = 0; i < 10e4; i++) { | |
data.forEach(([str1, str2]) => cosineSimilarity(str1, str2)); | |
} | |
console.timeEnd('cosineSimilarity'); | |
// levenshteinDistance: 11.460s | |
// cosineSimilarity: 6.029s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment