Created
March 24, 2019 06:26
-
-
Save kaustumbh7/afbd9788cc6b5526f11e70be9e41935a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert json file to spaCy format. | |
import plac | |
import logging | |
import argparse | |
import sys | |
import os | |
import json | |
import pickle | |
@plac.annotations(input_file=("Input file", "option", "i", str), output_file=("Output file", "option", "o", str))
def main(input_file=None, output_file=None):
    """Convert a JSON-lines annotation file into pickled spaCy training data.

    Each non-blank line of ``input_file`` is parsed as one JSON object with a
    ``content`` string and an ``annotation`` list. Every annotation carries a
    ``label`` (a single value or a list) and a list of ``points``, each with
    ``start`` and ``end`` offsets. The result is a list of
    ``(text, {"entities": [(start, end + 1, label), ...]})`` tuples, which is
    printed and then pickled to ``output_file``.

    Args:
        input_file: Path of the JSON-lines file to read.
        output_file: Path of the pickle file to write.

    Returns:
        None. Any failure is logged with its traceback and swallowed; the
        output file is only written after every line converted successfully.
    """
    try:
        training_data = []
        # JSON text is UTF-8; decoding with a locale default could corrupt
        # the text the entity offsets refer to.
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line (e.g. a trailing newline) is not a record.
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                entities = []
                # A null/empty annotation field yields a text with no entities
                # instead of aborting the whole conversion.
                for annotation in data['annotation'] or []:
                    labels = annotation['label']
                    if not isinstance(labels, list):
                        labels = [labels]
                    # Emit every point of the annotation, not just the first,
                    # so repeated occurrences of a label are kept.
                    for point in annotation['points']:
                        for label in labels:
                            # 'end' is treated as inclusive in the input; the
                            # output uses an exclusive end, hence the + 1.
                            entities.append((point['start'], point['end'] + 1, label))
                training_data.append((text, {"entities": entities}))

        print(training_data)

        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)
    except Exception as e:
        # Lazy %-formatting keeps the handler from raising a second error
        # when input_file is None (the default when -i is not given).
        logging.exception("Unable to process %s\nerror = %s", input_file, e)
        return None
# Script entry point: plac builds the command-line parser from main's
# annotations (-i input file, -o output file) and then calls main.
if __name__ == "__main__":
    plac.call(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello @kaustumbh7,
I believe that there is a mistake in this section of the code:
Namely, the "annotation['points'][0]" part: it always takes the first point and appends only that one to the entities. Furthermore:
will append only one (start, end, label) tuple for each label (because each annotation contains exactly one label), so if a label has more than one entity, the additional entities will be discarded.
I suggest using the following code instead, to allow more than one point per label:
With this we can get the data as such:
Instead of