Last active
May 7, 2021 09:55
-
-
Save langner/9e732e662b81d45af52a to your computer and use it in GitHub Desktop.
Python function for testing similarity of two article title fields
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import difflib | |
import string | |
def _to_lower_text(title, debug=None):
    """Return *title* as lower-cased text, decoding UTF-8 bytes if needed.

    Text input is only lower-cased. Byte input is decoded as UTF-8 first so
    that lower-casing also applies to non-ASCII letters. If the bytes are not
    valid UTF-8 the problem is reported through *debug* (when given) and the
    title is decoded with replacement characters instead of being left
    half-processed.
    """
    if isinstance(title, bytes):
        try:
            title = title.decode('utf-8')
        except UnicodeDecodeError as e:
            if debug:
                debug("There was a problem decoding a title: %s" % e)
                debug("Offending title: %s" % title)
            # Best effort: keep going with whatever can be decoded.
            title = title.decode('utf-8', 'replace')
    return title.lower()


def similar_titles(t1, t2, accuracy=1.00, debug=None):
    """Determine whether two titles are similar.

    As a rule, we want titles to be identical after removing whitespace
    and punctuation. Other discrepancies should be dealt with manually by
    ensuring the titles are correct, or by replacing strings in all titles,
    in this function, before comparing them.

    Parameters:
        t1, t2: the titles to compare, as text or as UTF-8 encoded bytes.
        accuracy: minimum ``difflib.SequenceMatcher`` ratio (0.0 to 1.0) for
            the normalized titles to count as similar. The default of 1.00
            requires the normalized titles to be identical.
        debug: optional callable taking one message string; it is called
            when a title given as bytes cannot be decoded as UTF-8.

    Returns:
        True if the similarity ratio of the normalized titles is at least
        *accuracy*, False otherwise.

    This function relies on two names that are not defined in this block:
    ``greek_alphabet`` (used as a mapping merged into the replacement table)
    and ``unidecode`` (called with a title and expected to return text).
    """
    t1 = _to_lower_text(t1, debug)
    t2 = _to_lower_text(t2, debug)

    to_replace = {
        # Remove some prefixes that are sometimes prepended to titles.
        "letter to the editor: ": "", "tech sight. ": "",
        # Expand Unicode symbols and some signs.
        u"α": "alpha", u"β": "beta", u"γ": "gamma",
        u"κ": "kappa", u"δ": "delta",
        # NOTE(review): titles are already lower-cased at this point, so the
        # upper-case key below cannot match; the lower-case letter is
        # presumably handled by unidecode further down -- confirm.
        u"Å": "a", "angstrom": "a", 'angstroms': 'angstrom',
        "+": "plus",
        # Several names are often shortened.
        "h. pylori": "helicobacter pylori",
        # The formatting of isotopes varies.
        "h-1": "1h", "c-13": "13c", "n-15": "15n",
        # WoS is not capable of printing vertical bars for some odd reason,
        # a response I received from the Customer Support.
        '|': ' vertical bar ',
    }
    # Entries from greek_alphabet take precedence over the ones above.
    to_replace.update(greek_alphabet)

    # Apply longer patterns first so that a pattern is never pre-empted by a
    # shorter one it contains ("angstroms" must be rewritten before
    # "angstrom"), independent of dict iteration order.
    for tr in sorted(to_replace, key=len, reverse=True):
        t1 = t1.replace(tr, to_replace[tr])
        t2 = t2.replace(tr, to_replace[tr])

    # Replace any remaining Unicode with ASCII equivalents
    t1 = unidecode(t1)
    t2 = unidecode(t2)

    # Whitespace and punctuation never count towards the comparison.
    exclude = frozenset(string.whitespace + string.punctuation)
    t1 = ''.join(c for c in t1 if c not in exclude)
    t2 = ''.join(c for c in t2 if c not in exclude)

    return difflib.SequenceMatcher(None, t1, t2).ratio() >= accuracy
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment