Skip to content

Instantly share code, notes, and snippets.

@eckardm
Last active August 29, 2015 14:25
Show Gist options
  • Save eckardm/21ae8c2f37b3ce288d63 to your computer and use it in GitHub Desktop.
Save eckardm/21ae8c2f37b3ce288d63 to your computer and use it in GitHub Desktop.
Creates a dictionary using the Name and LC Record Link columns from OpenRefine.
import csv
# you'll need to install fuzzywuzzy -- from a command line: 'pip install fuzzywuzzy'
from fuzzywuzzy import fuzz
# where are the input files?
persname_csv_filepath = 'path/to/persname/csv/output.csv'
corpname_csv_filepath = 'path/to/corpname/csv/output.csv'
# what should we name the output file?
output_file = 'constants.py'
# sets up the authority dictionary. Contains a dictionary for each authority type you'll be parsing
auth_dict = {'persnames': {}, 'corpnames': {}}
# Function to populate the dictionaries
# Takes a path to the input csv and an authority-type string ('persname', 'corpname', etc.).
# The authority type should match one of the auth_dict keys
def populate_dictionary(path_to_refined_csv, auth_type):
with open(path_to_refined_csv, 'rb') as f:
reader = csv.reader(f)
next(reader) # skip the first row (this is assuming it starts with a header)
# update the dictionary
for row in reader:
original_name, lc_name, lc_link = row
if lc_link is not None and lc_link.startswith('http://id.loc.gov/authorities/names/'):
fuzz_ratio = fuzz.ratio(original_name, lc_name)
if fuzz_ratio >= 80:
auth_dict[auth_type][original_name] = lc_link
# call the function with our csv files as inputs
populate_dictionary(persname_csv_filepath, auth_type='persnames')
populate_dictionary(corpname_csv_filepath, auth_type='corpnames')
# write the new dictionaries to constants.py
with open(output_file, "w") as txt_file:
txt_file.write('# dictionary for persnames\npersnames_dictionary = ' + str(auth_dict['persnames']))
txt_file.write('\n\n# dictionary for corpnames\ncorpnames_dictionary = ' + str(auth_dict['corpnames']))
print('Written to {0}'.format(output_file))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment