Last active
August 29, 2015 14:25
-
-
Save eckardm/21ae8c2f37b3ce288d63 to your computer and use it in GitHub Desktop.
Creates a dictionary using the Name and LC Record Link columns from OpenRefine.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
# you'll need to install fuzzywuzzy -- from a command line: 'pip install fuzzywuzzy' | |
from fuzzywuzzy import fuzz | |
# where are the input files? | |
persname_csv_filepath = 'path/to/persname/csv/output.csv' | |
corpname_csv_filepath = 'path/to/corpname/csv/output.csv' | |
# what should we name the output file? | |
output_file = 'constants.py' | |
# sets up the authority dictionary. Contains a dictionary for each authority type you'll be parsing | |
auth_dict = {'persnames': {}, 'corpnames': {}} | |
# Function to populate the dictionaries | |
# Takes a path to the input csv and an authority-type string ('persname', 'corpname', etc.). | |
# The authority type should match one of the auth_dict keys | |
def populate_dictionary(path_to_refined_csv, auth_type): | |
with open(path_to_refined_csv, 'rb') as f: | |
reader = csv.reader(f) | |
next(reader) # skip the first row (this is assuming it starts with a header) | |
# update the dictionary | |
for row in reader: | |
original_name, lc_name, lc_link = row | |
if lc_link is not None and lc_link.startswith('http://id.loc.gov/authorities/names/'): | |
fuzz_ratio = fuzz.ratio(original_name, lc_name) | |
if fuzz_ratio >= 80: | |
auth_dict[auth_type][original_name] = lc_link | |
# call the function with our csv files as inputs | |
populate_dictionary(persname_csv_filepath, auth_type='persnames') | |
populate_dictionary(corpname_csv_filepath, auth_type='corpnames') | |
# write the new dictionaries to constants.py | |
with open(output_file, "w") as txt_file: | |
txt_file.write('# dictionary for persnames\npersnames_dictionary = ' + str(auth_dict['persnames'])) | |
txt_file.write('\n\n# dictionary for corpnames\ncorpnames_dictionary = ' + str(auth_dict['corpnames'])) | |
print('Written to {0}'.format(output_file)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment