Last active
August 29, 2015 14:14
-
-
Save inky/d0b812a2af8c07d2aa4d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
This script aims to make it a little easier to work with ConceptNet | |
relationships using the raw data available. It extracts all of the semantic | |
relationships for English terms, and saves the results to multiple JSON files | |
-- one for each letter in the alphabet. | |
As of Jan 2015, this will require about 9 GB of disk space: | |
* conceptnet5_flat_json_5.3.tar.bz2 is 644 MB compressed, | |
7.3 GB uncompressed. | |
* Using the file above, the output of this script is 747 MB uncompressed. | |
To run: | |
1. Download the 'flat json' file from | |
http://conceptnet5.media.mit.edu/downloads/current/ | |
2. Extract the file, and then change to the data directory. For example: | |
tar xvjf conceptnet5_flat_json_5.3.tar.bz2 | |
cd data | |
3. Run this script to parse the data files. The results will be saved to | |
a directory named 'parsed'. | |
python convert_rels.py | |
ls -l parsed/ | |
du -sh parsed/ | |
Sample output (Jan 2015): | |
$ python convert_rels.py | |
Reading... | |
assertions/part_00.jsons (1065963 items) | |
assertions/part_01.jsons (1066531 items) | |
assertions/part_02.jsons (1065944 items) | |
assertions/part_03.jsons (1066361 items) | |
assertions/part_04.jsons (1066461 items) | |
assertions/part_05.jsons (1067644 items) | |
assertions/part_06.jsons (1066130 items) | |
assertions/part_07.jsons (1067429 items) | |
Writing... | |
relationship types | |
0 (47110 items) | |
a (169995 items) | |
b (166081 items) | |
c (194629 items) | |
d (117090 items) | |
e (80352 items) | |
f (83671 items) | |
g (100831 items) | |
h (102779 items) | |
i (61977 items) | |
j (78533 items) | |
k (78580 items) | |
l (110009 items) | |
m (177304 items) | |
n (89463 items) | |
o (55490 items) | |
p (162473 items) | |
q (9865 items) | |
r (107025 items) | |
s (242414 items) | |
t (119849 items) | |
u (52904 items) | |
v (43502 items) | |
w (80200 items) | |
x (5250 items) | |
y (16658 items) | |
z (14671 items) | |
Done | |
""" | |
import codecs | |
import json | |
import os | |
import string | |
import sys | |
from collections import defaultdict | |
from glob import glob | |
def main():
    """Parse ConceptNet 'flat json' assertion files into per-letter JSON.

    Reads every ``assertions/*.jsons`` file under the current directory,
    keeps only assertions whose start AND end terms are English
    (``/c/en/...``), and writes into a ``parsed/`` directory:

      * ``parsed/rel_types.txt`` -- count of entries per relationship type
      * ``parsed/terms_<letter>.json`` -- triplets grouped by the first
        letter of the start term (the ``0`` bucket collects terms whose
        first character is not a-z)

    Returns None (so the caller's ``sys.exit`` reports status 0).
    """
    try:
        os.mkdir('parsed')
    except OSError:
        # directory already exists -- that's fine
        pass
    # statistics on the types of relationships used, in the format:
    # { '/r/SomeRelationship': number_of_entries, ... }
    rel_types = defaultdict(int)
    # relationship triplets for each term, in the format:
    # {
    #     'a': {
    #         'a word': [
    #             (start_term, relationship, end_term),
    #             ...
    #         ],
    #         ...
    #     },
    #     ...
    # }
    rel_entries = {}
    # string.ascii_lowercase instead of the Python-2-only string.lowercase,
    # so the script runs unchanged on Python 2 and 3
    for letter in (string.ascii_lowercase + '0'):
        rel_entries[letter] = defaultdict(list)
    # prefix used for English terms
    en_term_prefix = '/c/en/'
    en_term_prefix_len = len(en_term_prefix)
    print('Reading...')
    for fn in glob('assertions/*.jsons'):
        sys.stdout.write(' %s ' % fn)
        added = 0
        with codecs.open(fn, encoding='utf-8') as fp:
            # parse each line as a json object;
            # only parse English terms
            for line in fp:
                # cheap substring pre-filter before paying for json.loads
                if en_term_prefix not in line:
                    continue
                item = json.loads(line)
                start, rel, end = item['start'], item['rel'], item['end']
                # both endpoints must be English terms
                if not (start.startswith(en_term_prefix) and
                        end.startswith(en_term_prefix)):
                    continue
                # remove the prefix
                start = start[en_term_prefix_len:]
                end = end[en_term_prefix_len:]
                # guard: a bare '/c/en/' term would leave start empty and
                # make start[0] raise IndexError
                if not start:
                    continue
                letter = start[0].lower()
                if letter not in string.ascii_lowercase:
                    letter = '0'
                # add the relationship
                rel_types[rel] += 1
                rel_entries[letter][start].append((start, rel, end))
                added += 1
        sys.stdout.write('(%d items)\n' % added)
    print('Writing...')
    with codecs.open('parsed/rel_types.txt', 'w', encoding='utf-8') as fp:
        print(' relationship types')
        for rel, count in sorted(rel_types.items()):
            fp.write('%7d %s\n' % (count, rel))
    for letter in sorted(rel_entries.keys()):
        entries = rel_entries[letter]
        print(' %s (%d items)' % (letter, len(entries)))
        with codecs.open('parsed/terms_%s.json' % letter, 'w',
                         encoding='utf-8') as fp:
            json.dump(entries, fp, indent=2)
    print('Done')
if __name__ == '__main__':
    # Run the converter. Ctrl-C prints a newline (so the shell prompt is
    # not glued to partial progress output) and exits with status 1;
    # otherwise the exit status is main()'s return value (None -> 0).
    try:
        exit_code = main()
    except KeyboardInterrupt:
        print('')
        exit_code = 1
    sys.exit(exit_code)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment