Last active
August 29, 2015 14:05
-
-
Save langner/ce600127262b04db82d2 to your computer and use it in GitHub Desktop.
Python function for testing similarity of two article pages fields
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def similar_pages(pages1, pages2):
    """Determine whether two page strings describe the same pages.

    Both strings are stripped, lower-cased and normalized before being
    compared, so that differently formatted but equivalent page fields match:

    - Redundant leading digits in the end page are ignored when start and end
      have the same length, so ``1660-1661`` is reduced to ``1660-1``.
    - The end page (and hyphen) is dropped for a single page, i.e. when
      start and end are equal or the end is missing (``123-123``, ``123-``).
    - For some journals WoK replaces the end page with something else, for
      example ``241-+`` instead of ``241-247.e9`` (supplementary info); in
      these cases only the start page is kept.
    - When the start page begins with ``O`` (e.g. ``O1125-U144``) only the
      start page is kept.
    - WoK sometimes adds ``UNSP`` before an article number that is given as
      a page; that prefix is removed.
    - If, after normalization, both values still contain a hyphen and either
      end page begins with ``U`` (e.g. ``540-U32``), only the start pages are
      compared.

    Args:
        pages1: First pages string (may be empty or None).
        pages2: Second pages string (may be empty or None).

    Returns:
        True if the normalized page strings match, False otherwise. False is
        also returned when either argument is empty, None or only whitespace.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import os`` being present in the file.
    import os.path

    if not (pages1 and pages2):
        return False

    def standardize_pages(p):
        """Reduce one stripped, lower-cased pages string to canonical form."""
        if p.count('-') == 1:
            start, end = p.split('-')
            if not end or start == end:
                # Single page, or a dangling hyphen with no end page.
                p = start
            elif start.startswith('o'):
                # startswith() is safe when the start page is empty,
                # unlike indexing with start[0].
                p = start
            elif end == "+" or ".e" in end:
                p = start
            elif len(start) == len(end):
                shared = len(os.path.commonprefix([start, end]))
                p = start + "-" + end[shared:]
        tokens = p.split()
        if len(tokens) > 1 and tokens[0].upper() == "UNSP":
            p = ' '.join(tokens[1:])
        return p

    stripped1 = pages1.strip().lower()
    stripped2 = pages2.strip().lower()
    # Whitespace-only input carries no page information, so treat it like
    # an empty string rather than letting two blanks compare as equal.
    if not (stripped1 and stripped2):
        return False

    sp1 = standardize_pages(stripped1)
    sp2 = standardize_pages(stripped2)

    # Sometimes WoK has pages like '540-U32', but the actual pages are regular,
    # so the 'U' will not be present in our database. In this case adjust the
    # pages for both values (taking just the first page for matching).
    if '-' in sp1 and '-' in sp2:
        parts1 = sp1.split('-')
        parts2 = sp2.split('-')
        # startswith() tolerates an empty segment after the hyphen.
        if parts1[1].startswith("u") or parts2[1].startswith("u"):
            sp1 = parts1[0]
            sp2 = parts2[0]

    return sp1 == sp2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment