-
-
Save kaustumbh7/afbd9788cc6b5526f11e70be9e41935a to your computer and use it in GitHub Desktop.
# Convert json file to spaCy format.
import plac | |
import logging | |
import argparse | |
import sys | |
import os | |
import json | |
import pickle | |
@plac.annotations(
    input_file=("Input file", "option", "i", str),
    output_file=("Output file", "option", "o", str),
)
def main(input_file=None, output_file=None):
    """Convert a JSON-lines annotation file into pickled spaCy training data.

    Every non-blank line of ``input_file`` is parsed as one JSON object that
    provides a ``content`` string and an ``annotation`` list. Each annotation
    contributes one ``(start, end + 1, label)`` tuple for every combination
    of its ``points`` and ``label`` entries. The resulting list of
    ``(text, {"entities": [...]})`` tuples is printed and then pickled to
    ``output_file``.

    Args:
        input_file: Path of the JSON-lines file to read.
        output_file: Path of the pickle file to write.

    Returns:
        None. Any failure is logged with its traceback instead of being
        raised, so a bad input never surfaces as an exception to the caller.
    """
    try:
        training_data = []
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline) are not JSON documents.
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                entities = []
                # NOTE(review): a null "annotation" is treated as "no
                # entities" -- confirm this matches the export format.
                for annotation in data['annotation'] or []:
                    points = annotation['points']
                    if not isinstance(points, list):
                        points = [points]
                    labels = annotation['label']
                    if not isinstance(labels, list):
                        labels = [labels]
                    # Emit every point, not just the first one, so repeated
                    # occurrences within one annotation are kept.
                    for point in points:
                        for label in labels:
                            # The source "end" is shifted by one to produce
                            # the end offset stored in the entity tuple.
                            entities.append((point['start'], point['end'] + 1, label))
                training_data.append((text, {"entities": entities}))

        print(training_data)

        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)
    except Exception as e:
        # Lazy %-formatting keeps the handler from raising when input_file is
        # None, which string concatenation would do.
        logging.exception("Unable to process %s\nerror = %s", input_file, e)
        return None
# Run the converter only when this file is executed as a script; plac.call
# invokes main with arguments taken from the command line.
if __name__ == '__main__':
    plac.call(main)
Hi @kaustumbh7, I'm trying to convert a JSON file to the spaCy format.
The command I run is
python json_to_spacy.py -i input_file -o output_file
But I got an error:
output_file is not UTF-8 encoded Saving disabled. See Console for more details.
Here is the console screenshot.
Hello @kaustumbh7,
I believe that there is a mistake in this section of the code:
for annotation in data['annotation']:
point = annotation['points'][0]
labels = annotation['label']
if not isinstance(labels, list):
labels = [labels]
for label in labels:
entities.append((point['start'], point['end'] + 1 ,label))
Namely, the "annotation['points'][0]" part always takes the first point and appends only that one to the entities. Furthermore:
for label in labels:
entities.append((point['start'], point['end'] + 1 ,label))
will append only one (start, end, label) tuple for each label (because each annotation contains exactly one label), so if a label has more than one entity, the others will be discarded.
I suggest using the following code instead, to allow more than one point per label:
for annotation in data['annotation']:
points = annotation['points']
label = annotation['label'][0]
if not isinstance(points, list):
points = [points]
for point in points:
entities.append((point['start'], point['end'] + 1, label))
With this we can get the data as such:
('I like London and Berlin.',
[(7, 13, 'LOC'), (18, 24, 'LOC')])
Instead of
('I like London and Berlin.',
[(7, 13, 'LOC')])
Thank you Very Much!!! Helped Me a lot.....!!