Last active
December 20, 2015 17:48
-
-
Save dmarkow/6170832 to your computer and use it in GitHub Desktop.
Levenshtein distance of company names
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with foo as (select company, | |
length(company), | |
levenshtein('Company Name', company) from contacts) | |
select company from contacts where | |
company in | |
(select company from foo where company != contacts.company and | |
( | |
select (1 - (cast(levenshtein as real)/greatest(length(contacts.company),length))) | |
) > 0.8 | |
); | |
with foo as (select company, | |
length(company), | |
levenshtein('Company Name', company) from contacts) | |
select company from contacts where | |
(select count(*) from foo where company != contacts.company and | |
( | |
select (1 - (cast(levenshtein as real)/greatest(length(contacts.company),length))) | |
) > 0.8 | |
) > 0; | |
with foo as (select company, | |
length(company), | |
levenshtein('Company Name', company) from contacts) | |
select company from contacts where | |
(select count(id) from contacts as sub where sub.company != contacts.company and | |
( | |
select (1 - (cast(levenshtein(contacts.company, sub.company) as real)/greatest(length(contacts.company),length(sub.company)))) | |
) > 0.8 | |
) > 0; | |
select count(company) from contacts where company != 'Sedgwick Detert Moran & Arnold LLP' and id in | |
(select id from contacts as sub where | |
( | |
select (1 - (cast(levenshtein('Sedgwick Detert Moran & Arnold LLP', sub.company) as real)/greatest(length('Sedgwick Detert Moran & Arnold LLP'),length(sub.company)))) | |
) > 0.50 | |
); | |
with main as (select distinct(company) from contacts order by company) | |
select company from main where company != '' and company is not null and (select count(company) from contacts where company != main.company and id in | |
(select id from contacts as sub where | |
( | |
select (1 - (cast(levenshtein(main.company, sub.company) as real)/greatest(length(main.company),length(sub.company)))) | |
) > 0.80 | |
)) > 0 limit 50; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment