Created
February 11, 2025 13:34
-
-
Save thedumbtechguy/a589dfa2911cc1a235513301af5f93f6 to your computer and use it in GitHub Desktop.
Ruby Name Matcher
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'jaro_winkler' | |
# Fuzzy matcher for person and company names.
#
# Names are normalized (lower-cased, stripped of titles, generational suffixes
# and conjunctions, company terms abbreviated, tokens sorted) and then compared
# token by token using Jaro-Winkler similarity.
class NameMatcher
  # Two tokens count as the same word when their similarity exceeds this value.
  ACCEPTANCE_THRESHOLD = 0.9
  COMMON_TITLES = /(Mr|Mrs|Ms|Miss|Dr|Prof)/i
  CONJUNCTIONS = /(and)/i
  COMMON_SUFFIXES = /(Jr|Sr|III|II|IV)/i

  # TODO: Some weird ones we've seen are
  # LC = Limited Company
  COMPANY_TERMS = {
    "private" => "priv",
    "corporation" => "corp",
    "incorporated" => "inc",
    "limited" => "ltd",
    "company" => "co",
    "enterprise" => "ent",
  }.freeze

  # A token that consists *only* of a title, suffix or conjunction. Anchoring to
  # the whole token keeps real names intact ("Andy" must not lose its "and").
  NOISE_TOKEN = /\A(?:#{COMMON_TITLES}|#{COMMON_SUFFIXES}|#{CONJUNCTIONS})\z/

  # A suffix sitting at the end of a word, e.g. the "jr" in "doejr".
  TRAILING_SUFFIX = /#{COMMON_SUFFIXES}\b/

  # Any full-length company term as a whole word. `.source` is used so the
  # case-insensitive flag of this literal applies to the alternation.
  COMPANY_TERM = /\b(?:#{Regexp.union(COMPANY_TERMS.keys).source})\b/i

  # Matches two names by comparing individual parts and returns the score.
  #
  # The score is measured relative to +name_against+: it is the share of its
  # parts that were found in +name+. When +name_against+ has more than two
  # parts, one of them is allowed to be missing for a perfect score.
  #
  # @param [String, nil] name First name to match.
  # @param [String, nil] name_against Second name to match against.
  # @return [Float] Similarity score between 0.0 and 1.0. Returns 0.0 when
  #   either name has no comparable parts left after normalization.
  def match(name, name_against)
    normalized_name = normalize_name(name)
    normalized_name_against = normalize_name(name_against)
    compute_score(normalized_name, normalized_name_against)
  end

  private

  # Computes the similarity score between two names by comparing individual parts.
  # @param [String] name Normalized first name.
  # @param [String] name_against Normalized second name.
  # @return [Float] The computed similarity score.
  def compute_score(name, name_against)
    parts_to_match = name.split
    parts_to_match_against = name_against.split

    # Without this guard an empty `name_against` would need zero matches and
    # therefore score a perfect 1.0 against any name.
    return 0.0 if parts_to_match.empty? || parts_to_match_against.empty?

    # Each part of `name_against` may be claimed only once, so repeated words in
    # `name` ("John John") cannot be counted as several distinct matches.
    unclaimed = parts_to_match_against.dup
    total_matches = parts_to_match.count do |part|
      index = unclaimed.index { |against_part| compare_part(part, against_part) > ACCEPTANCE_THRESHOLD }
      # `delete_at` returns the removed (truthy) part; `nil` when nothing matched.
      unclaimed.delete_at(index) if index
    end

    against_size = parts_to_match_against.size
    # Longer names tolerate one missing part (middle names, initials, ...).
    required_matches = against_size > 2 ? against_size - 1 : against_size
    total_matches >= required_matches ? 1.0 : total_matches.to_f / against_size
  end

  # Compares two parts of the name using Jaro-Winkler distance.
  # @param [String] part First part of the name.
  # @param [String] against_part Second part of the name to compare against.
  # @return [Float] The similarity score of the two parts.
  def compare_part(part, against_part)
    JaroWinkler.distance(part, against_part, ignore_case: true)
  end

  # Normalizes the name by downcasing, separating suffixes, splitting, and normalizing each part.
  # @param [String, nil] name The name to normalize; +nil+ is treated as an empty name.
  # @return [String] The normalized name: surviving parts, sorted and joined by single spaces.
  def normalize_name(name)
    separated = separate_suffixes(name.to_s.downcase)
    separated
      .split(/[^a-z]+/)
      .map { |part| normalize_part(part) }
      .reject(&:empty?) # dropped titles/suffixes/conjunctions leave empty parts behind
      .sort
      .join(' ')
  end

  # Normalizes a single part of the name: drops it when it is a title, suffix
  # or conjunction, otherwise abbreviates company terms.
  # @param [String] part The part of the name to normalize.
  # @return [String] The normalized part, or an empty string when the part is dropped.
  def normalize_part(part)
    return '' if NOISE_TOKEN.match?(part)

    handle_company_terms(part)
  end

  # Handles common company terms by abbreviating them.
  # @param [String] name The name to process (not modified).
  # @return [String] A copy of the name with company terms abbreviated.
  def handle_company_terms(name)
    name.gsub(COMPANY_TERM) { |term| COMPANY_TERMS.fetch(term.downcase) }
  end

  # Separates suffixes that are directly appended to the last name
  # (e.g. "doejr" becomes "doe jr"). Only a suffix at the end of a word is split
  # off, so suffix-like letters inside a word ("olivia") are left alone.
  # NOTE: a word that genuinely ends in one of the suffixes (e.g. "yaniv") is
  # still split; this is a known limitation of the heuristic.
  # @param [String] name The name to process.
  # @return [String] The name with suffixes separated.
  def separate_suffixes(name)
    name.gsub(TRAILING_SUFFIX) { |suffix| " #{suffix}" }
  end
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative "../config/environment" | |
require 'minitest/autorun' | |
require_relative '../packages/verify/app/lib/verify/name_matcher.rb' # Adjust the path according to your file structure | |
# Table-driven test for the name matcher: every row is
# [name, name_against, expected score].
class NameMatcherTest < Minitest::Test
  def setup
    @matcher = Verify::NameMatcher.new
  end

  # Runs every row through the matcher and reports all mismatches together
  # instead of stopping at the first failing row.
  def test_name_matching
    test_cases = [
      ['John Doe', 'John Doe', 1.0],
      ['John Doe', 'john doe', 1.0],
      ['John Doe', 'John', 1.0],
      ['John Doe', 'Doe', 1.0],
      ['Doe', 'John Doe', 0.5],
      ['Mr. John Doe', 'John Doe', 1.0],
      ['Dr. John Doe', 'John Doe', 1.0],
      ['John Doe Jr.', 'John Doe', 1.0],
      ['John Doe Sr.', 'John Doe', 1.0],
      ['John Doe Private', 'John Doe Priv', 1.0],
      ['John Doe Corporation', 'John Doe Corp', 1.0],
      ['John Doe', 'Jane Smith', 0.0],
      ['Mr. John A. Doe III', 'John Doe', 1.0],
      ['John Doe Incorporated', 'John Doe Inc', 1.0],
      ['John Doe Incorporated', 'John Doe', 1.0],
      ["NbWilmot Solutions", "Nb Wilmot Solutions", 1.0],
      ["Milwakee Ent", "Milwakee Enterprise", 1.0],
      ["C&S FOODS GH. LTD", "C and S FOODS Ghana LTD", 1.0]
    ]

    # Collect the message of each failed assertion so one run shows every bad row.
    failures = test_cases.each_with_object([]) do |test_case, messages|
      name, name_against, expected = test_case
      begin
        assert_equal expected, @matcher.match(name, name_against), "Failed for case: #{test_case}"
      rescue Minitest::Assertion => error
        messages << error.message
      end
    end

    flunk "#{failures.size} failures:\n" + failures.join("\n") unless failures.empty?
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment