langner · May 7, 2021 09:55
diff --git a/similar_titles.py b/similar_titles.py
 import difflib
 import string

 def similar_titles(t1, t2, accuracy=1.00, debug=None):
    """Determine whether two titles are similar.

    As a rule, we want titles to be identical after removing whitespace
    and punctuation. Other discrepancies should be dealt with manually by
    ensuring the titles are correct, or by replacing strings in all titles,
    in this function, before comparing them.

    Args:
        t1, t2: The titles to compare, as text or as UTF-8 encoded bytes.
        accuracy: Minimum ``difflib.SequenceMatcher`` ratio (0.0 to 1.0)
            the normalized titles must reach; 1.00 demands an exact match.
        debug: Optional callable that is handed diagnostic messages when
            a title cannot be decoded cleanly.

    Returns:
        True if the similarity ratio of the normalized titles is at
        least ``accuracy``, otherwise False.
    """
    # Imported locally so this function does not depend on a module-level
    # import that may not be present.
    import collections

    def _as_lowercase_text(title):
        # Decode only real byte strings; text input has no bytes to decode.
        if isinstance(title, bytes):
            try:
                title = title.decode('utf-8')
            except UnicodeDecodeError as e:
                if debug:
                    debug("There was a problem decoding a title: %s" % e)
                    debug("Offending title: %s" % title)
                # Best effort: keep whatever decodes and substitute the
                # rest, so the comparison can still be carried out.
                title = title.decode('utf-8', 'replace')
        # Lowercase after decoding so non-ASCII letters are folded too,
        # and so that a title is lowercased whichever path it took.
        return title.lower()

    t1 = _as_lowercase_text(t1)
    t2 = _as_lowercase_text(t2)

    # The replacements are applied in the order listed. This matters when
    # one key contains another: "angstroms" has to be reduced to "angstrom"
    # before "angstrom" is reduced to "a".
    to_replace = collections.OrderedDict([
        # Remove some prefixes that are sometimes prepended to titles.
        ("letter to the editor: ", ""),
        ("tech sight. ", ""),
        # Expand Unicode symbols and some signs.
        (u"α", "alpha"),
        (u"β", "beta"),
        (u"γ", "gamma"),
        (u"κ", "kappa"),
        (u"δ", "delta"),
        # Titles are lowercase at this point, so match the lowercase form.
        (u"å", "a"),
        ("angstroms", "angstrom"),
        ("angstrom", "a"),
        ("+", "plus"),
        # Several names are often shortened.
        ("h. pylori", "helicobacter pylori"),
        # The formatting of isotopes varies.
        ("h-1", "1h"),
        ("c-13", "13c"),
        ("n-15", "15n"),
        # WoS does not print vertical bars (according to a response from
        # its customer support), so they are written out in words.
        ("|", " vertical bar "),
    ])
    # Entries from greek_alphabet override the ones above on equal keys.
    to_replace.update(greek_alphabet)
    for old, new in to_replace.items():
        t1 = t1.replace(old, new)
        t2 = t2.replace(old, new)

    # Replace any remaining Unicode with ASCII equivalents
    t1 = unidecode(t1)
    t2 = unidecode(t2)

    # A set gives constant-time membership tests while filtering.
    exclude = frozenset(' ' + string.whitespace + string.punctuation)
    t1 = ''.join(c for c in t1 if c not in exclude)
    t2 = ''.join(c for c in t2 if c not in exclude)
    return difflib.SequenceMatcher(None, t1, t2).ratio() >= accuracy
	import difflib
	import string

	def similar_titles(t1, t2, accuracy=1.00, debug=None):
	    """Determine whether two titles are similar.

	    As a rule, we want titles to be identical after removing whitespace
	    and punctuation. Other discrepancies should be dealt with manually by
	    ensuring the titles are correct, or by replacing strings in all titles,
	    in this function, before comparing them.

	    Args:
	        t1, t2: The titles to compare, as text or as UTF-8 encoded bytes.
	        accuracy: Minimum ``difflib.SequenceMatcher`` ratio (0.0 to 1.0)
	            the normalized titles must reach; 1.00 demands an exact match.
	        debug: Optional callable that is handed diagnostic messages when
	            a title cannot be decoded cleanly.

	    Returns:
	        True if the similarity ratio of the normalized titles is at
	        least ``accuracy``, otherwise False.
	    """
	    # Imported locally so this function does not depend on a module-level
	    # import that may not be present.
	    import collections

	    def _as_lowercase_text(title):
	        # Decode only real byte strings; text input has no bytes to decode.
	        if isinstance(title, bytes):
	            try:
	                title = title.decode('utf-8')
	            except UnicodeDecodeError as e:
	                if debug:
	                    debug("There was a problem decoding a title: %s" % e)
	                    debug("Offending title: %s" % title)
	                # Best effort: keep whatever decodes and substitute the
	                # rest, so the comparison can still be carried out.
	                title = title.decode('utf-8', 'replace')
	        # Lowercase after decoding so non-ASCII letters are folded too,
	        # and so that a title is lowercased whichever path it took.
	        return title.lower()

	    t1 = _as_lowercase_text(t1)
	    t2 = _as_lowercase_text(t2)

	    # The replacements are applied in the order listed. This matters when
	    # one key contains another: "angstroms" has to be reduced to "angstrom"
	    # before "angstrom" is reduced to "a".
	    to_replace = collections.OrderedDict([
	        # Remove some prefixes that are sometimes prepended to titles.
	        ("letter to the editor: ", ""),
	        ("tech sight. ", ""),
	        # Expand Unicode symbols and some signs.
	        (u"α", "alpha"),
	        (u"β", "beta"),
	        (u"γ", "gamma"),
	        (u"κ", "kappa"),
	        (u"δ", "delta"),
	        # Titles are lowercase at this point, so match the lowercase form.
	        (u"å", "a"),
	        ("angstroms", "angstrom"),
	        ("angstrom", "a"),
	        ("+", "plus"),
	        # Several names are often shortened.
	        ("h. pylori", "helicobacter pylori"),
	        # The formatting of isotopes varies.
	        ("h-1", "1h"),
	        ("c-13", "13c"),
	        ("n-15", "15n"),
	        # WoS does not print vertical bars (according to a response from
	        # its customer support), so they are written out in words. The
	        # key is a bare pipe: a backslash-escaped one would never match.
	        ("|", " vertical bar "),
	    ])
	    # Entries from greek_alphabet override the ones above on equal keys.
	    to_replace.update(greek_alphabet)
	    for old, new in to_replace.items():
	        t1 = t1.replace(old, new)
	        t2 = t2.replace(old, new)

	    # Replace any remaining Unicode with ASCII equivalents
	    t1 = unidecode(t1)
	    t2 = unidecode(t2)

	    # A set gives constant-time membership tests while filtering.
	    exclude = frozenset(' ' + string.whitespace + string.punctuation)
	    t1 = ''.join(c for c in t1 if c not in exclude)
	    t2 = ''.join(c for c in t2 if c not in exclude)
	    return difflib.SequenceMatcher(None, t1, t2).ratio() >= accuracy