Created
March 24, 2019 06:26
-
-
Save kaustumbh7/afbd9788cc6b5526f11e70be9e41935a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert json file to spaCy format. | |
import plac | |
import logging | |
import argparse | |
import sys | |
import os | |
import json | |
import pickle | |
def _extract_entities(annotations):
    """Flatten annotation records into ``(start, end, label)`` tuples.

    Each annotation is expected to be a mapping with a ``'points'`` entry
    (one span dict or a list of them, each with ``'start'`` and ``'end'``)
    and a ``'label'`` entry (one label or a list of labels).  Every span is
    paired with every label of its annotation.

    Args:
        annotations: Iterable of annotation mappings, or ``None``.

    Returns:
        A list of ``(start, end + 1, label)`` tuples, in input order.
    """
    entities = []
    # A null/missing annotation list is treated as "no entities" instead of
    # aborting the whole conversion.
    for annotation in annotations or []:
        points = annotation['points']
        if not isinstance(points, list):
            points = [points]
        labels = annotation['label']
        if not isinstance(labels, list):
            labels = [labels]
        # Keep every span, not just the first one, so repeated mentions
        # under the same annotation are not lost.
        for point in points:
            for label in labels:
                # NOTE(review): the +1 presumably converts an inclusive end
                # offset into the exclusive end spaCy expects -- confirm
                # against the annotation tool's export format.
                entities.append((point['start'], point['end'] + 1, label))
    return entities


@plac.annotations(input_file=("Input file", "option", "i", str), output_file=("Output file", "option", "o", str))
def main(input_file=None, output_file=None):
    """Convert a JSON-lines annotation file into pickled spaCy training data.

    Each non-blank line of ``input_file`` is parsed as a JSON object with a
    ``'content'`` string and an ``'annotation'`` list.  The result is a list
    of ``(text, {"entities": [(start, end, label), ...]})`` tuples, which is
    printed and then pickled to ``output_file``.

    Args:
        input_file: Path of the JSON-lines file to read.
        output_file: Path of the pickle file to write.

    Returns:
        None. Any failure is logged with its traceback and swallowed, so the
        script stays best-effort.
    """
    try:
        training_data = []
        # JSON text is UTF-8; do not depend on the platform default encoding,
        # which would also shift character offsets on mis-decoded input.
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                entities = _extract_entities(data['annotation'])
                training_data.append((text, {"entities": entities}))

        print(training_data)

        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)

    except Exception as e:
        # Lazy %-formatting keeps the handler from raising TypeError itself
        # when input_file is None (the default when -i is omitted).
        logging.exception("Unable to process %s\nerror = %s", input_file, e)
        return None
# Script entry point: only run the converter when this file is executed
# directly; plac.call drives main from the command-line arguments.
if __name__ == "__main__":
    plac.call(main)
Hello @kaustumbh7,
I believe that there is a mistake in this section of the code:
for annotation in data['annotation']:
point = annotation['points'][0]
labels = annotation['label']
if not isinstance(labels, list):
labels = [labels]
for label in labels:
entities.append((point['start'], point['end'] + 1 ,label))
Specifically, the "annotation['points'][0]" expression always takes only the first point, so only that one is appended to the entities. Furthermore:
for label in labels:
entities.append((point['start'], point['end'] + 1 ,label))
will append only one (start, end, label) tuple per label (because each annotation contains exactly one label), so if a label has more than one entity, the additional entities will be discarded.
I suggest using the following code instead, to allow more than one point per label:
for annotation in data['annotation']:
points = annotation['points']
label = annotation['label'][0]
if not isinstance(points, list):
points = [points]
for point in points:
entities.append((point['start'], point['end'] + 1, label))
With this we can get the data as such:
('I like London and Berlin.',
[(7, 13, 'LOC'), (18, 24, 'LOC')])
Instead of
('I like London and Berlin.',
[(7, 13, 'LOC')])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @kaustumbh7, I'm trying to convert a JSON file to the spaCy format.

The command I run is
python json_to_spacy.py -i input_file -o output_file
But I got an error:
output_file is not UTF-8 encoded Saving disabled. See Console for more details.
Here is the console screenshot.