Skip to content

Instantly share code, notes, and snippets.

@thedumbtechguy
Created February 11, 2025 13:34
Show Gist options
  • Save thedumbtechguy/a589dfa2911cc1a235513301af5f93f6 to your computer and use it in GitHub Desktop.
Save thedumbtechguy/a589dfa2911cc1a235513301af5f93f6 to your computer and use it in GitHub Desktop.
Ruby Name Matcher
require 'jaro_winkler'
# Fuzzy matcher for personal and company names.
#
# Both names are normalized (titles, generational suffixes and conjunctions
# dropped; company terms abbreviated; words sorted) and then compared word by
# word with Jaro-Winkler similarity.
#
# The score is asymmetric: it measures how much of +name_against+ is found in
# +name+, so extra words in +name+ are not penalized.
class NameMatcher
  # Two words count as the same when their Jaro-Winkler similarity is strictly
  # greater than this value.
  ACCEPTANCE_THRESHOLD = 0.9
  COMMON_TITLES = /(Mr|Mrs|Ms|Miss|Dr|Prof)/i
  CONJUNCTIONS = /(and)/i
  COMMON_SUFFIXES = /(Jr|Sr|III|II|IV)/i

  # TODO: Some weird ones we've seen are
  # LC = Limited Company
  COMPANY_TERMS = {
    "private" => "priv",
    "corporation" => "corp",
    "incorporated" => "inc",
    "limited" => "ltd",
    "company" => "co",
    "enterprise" => "ent",
  }.freeze

  # A whole word that is nothing but a title, suffix or conjunction. Anchoring
  # matters: the unanchored patterns above would also hit the "dr" in "Andrew"
  # or the "iv" in "Olivia".
  NOISE_TOKEN = /\A(?:#{COMMON_TITLES}|#{COMMON_SUFFIXES}|#{CONJUNCTIONS})\z/

  # Zero-width position between a lowercase letter and a suffix glued onto it
  # ("DoeJr", "SmithIII"). Built from the suffix pattern's +source+ so it is
  # case-sensitive and relies on the capitalization written in COMMON_SUFFIXES;
  # that is what keeps ordinary words such as "Olivia" intact.
  ATTACHED_SUFFIX_BOUNDARY = /(?<=\p{Ll})(?=#{COMMON_SUFFIXES.source}\b)/

  # Any full-length company term appearing as a whole word.
  COMPANY_TERM = /\b(?:#{Regexp.union(COMPANY_TERMS.keys).source})\b/i

  private_constant :NOISE_TOKEN, :ATTACHED_SUFFIX_BOUNDARY, :COMPANY_TERM

  # Matches two names by comparing individual parts and returns the score.
  # @param [String, nil] name First name to match.
  # @param [String, nil] name_against Second name to match against.
  # @return [Float] Similarity score between 0.0 and 1.0. Returns 0.0 when
  #   +name_against+ contains no comparable words (nil, blank, or only titles).
  def match(name, name_against)
    compute_score(normalize_name(name), normalize_name(name_against))
  end

  private

  # Computes the similarity score between two names by comparing individual parts.
  #
  # Every word of +name_against+ can be claimed by at most one word of +name+,
  # so a repeated word in +name+ cannot inflate the score.
  # @param [String] name Normalized first name.
  # @param [String] name_against Normalized second name.
  # @return [Float] 1.0 when enough words matched, otherwise the fraction of
  #   +name_against+ words that were matched.
  def compute_score(name, name_against)
    parts_to_match_against = name_against.split
    # Nothing to compare against is "no match", not a perfect match.
    return 0.0 if parts_to_match_against.empty?

    unmatched = parts_to_match_against.dup
    total_matches = name.split.count do |part|
      hit = unmatched.index { |against_part| compare_part(part, against_part) > ACCEPTANCE_THRESHOLD }
      unmatched.delete_at(hit) if hit
      hit
    end

    expected = parts_to_match_against.size
    # Names with three or more words tolerate one missing word (e.g. a middle name).
    required_matches = expected > 2 ? expected - 1 : expected
    total_matches >= required_matches ? 1.0 : total_matches.to_f / expected
  end

  # Compares two parts of the name using Jaro-Winkler distance.
  # @param [String] part First part of the name.
  # @param [String] against_part Second part of the name to compare against.
  # @return [Float] The similarity score of the two parts.
  def compare_part(part, against_part)
    JaroWinkler.distance(part, against_part, ignore_case: true)
  end

  # Normalizes the name by separating attached suffixes, downcasing, splitting
  # into words, normalizing each word and sorting the survivors.
  # @param [String, nil] name The name to normalize.
  # @return [String] The normalized name: lowercase words joined by single spaces.
  def normalize_name(name)
    # Suffix separation must see the original casing, so it runs before downcase.
    separate_suffixes(name.to_s)
      .downcase
      .scan(/[\p{L}\p{M}]+/) # runs of letters, including accented ones
      .map { |part| normalize_part(part) }
      .reject(&:empty?)
      .sort
      .join(' ')
  end

  # Normalizes a single word: drops it when it is a title, suffix or
  # conjunction, otherwise abbreviates it if it is a known company term.
  # @param [String] part The lowercase word to normalize.
  # @return [String] The normalized word, or an empty string when it is dropped.
  def normalize_part(part)
    return '' if NOISE_TOKEN.match?(part)

    handle_company_terms(part)
  end

  # Handles common company terms by abbreviating them.
  # @param [String] name The name to process.
  # @return [String] A new string with whole-word company terms abbreviated.
  def handle_company_terms(name)
    name.gsub(COMPANY_TERM) { |term| COMPANY_TERMS.fetch(term.downcase) }
  end

  # Separates suffixes that are directly appended to the last name,
  # e.g. "DoeJr" becomes "Doe Jr".
  # @param [String] name The name to process, in its original casing.
  # @return [String] The name with attached suffixes separated by a space.
  def separate_suffixes(name)
    name.gsub(ATTACHED_SUFFIX_BOUNDARY, ' ')
  end
end
require_relative "../config/environment"
require 'minitest/autorun'
require_relative '../packages/verify/app/lib/verify/name_matcher.rb' # Adjust the path according to your file structure
# Table-driven tests for Verify::NameMatcher#match.
class NameMatcherTest < Minitest::Test
  def setup
    @matcher = Verify::NameMatcher.new
  end

  # Runs every [name, name_against, expected_score] row and reports all
  # mismatching rows together instead of stopping at the first one.
  def test_name_matching
    test_cases = [
      ['John Doe', 'John Doe', 1.0],
      ['John Doe', 'john doe', 1.0],
      ['John Doe', 'John', 1.0],
      ['John Doe', 'Doe', 1.0],
      ['Doe', 'John Doe', 0.5],
      ['Mr. John Doe', 'John Doe', 1.0],
      ['Dr. John Doe', 'John Doe', 1.0],
      ['John Doe Jr.', 'John Doe', 1.0],
      ['John Doe Sr.', 'John Doe', 1.0],
      ['John Doe Private', 'John Doe Priv', 1.0],
      ['John Doe Corporation', 'John Doe Corp', 1.0],
      ['John Doe', 'Jane Smith', 0.0],
      ['Mr. John A. Doe III', 'John Doe', 1.0],
      ['John Doe Incorporated', 'John Doe Inc', 1.0],
      ['John Doe Incorporated', 'John Doe', 1.0],
      ["NbWilmot Solutions", "Nb Wilmot Solutions", 1.0],
      ["Milwakee Ent", "Milwakee Enterprise", 1.0],
      ["C&S FOODS GH. LTD", "C and S FOODS Ghana LTD", 1.0]
    ]

    # Collect the message of every failed row; rows that pass add nothing.
    failures = test_cases.each_with_object([]) do |test_case, messages|
      name, name_against, expected = test_case
      begin
        assert_equal expected, @matcher.match(name, name_against), "Failed for case: #{test_case}"
      rescue Minitest::Assertion => e
        messages << e.message
      end
    end

    return if failures.empty?

    flunk "#{failures.size} failures:\n" + failures.join("\n")
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment