-
-
Save kaustumbh7/afbd9788cc6b5526f11e70be9e41935a to your computer and use it in GitHub Desktop.
# Convert a JSON-lines annotation file (one JSON object per line) to spaCy training-data format.
import plac | |
import logging | |
import argparse | |
import sys | |
import os | |
import json | |
import pickle | |
def _extract_entities(annotations):
    """Flatten raw annotation records into ``(start, end, label)`` tuples.

    Each record is expected to carry a ``'points'`` entry (one span dict or a
    list of span dicts with ``'start'``/``'end'`` keys) and a ``'label'`` entry
    (one label or a list of labels). Every span is paired with every label of
    its record, so records listing several spans no longer lose all but the
    first one.

    A falsy ``annotations`` value (e.g. JSON ``null``) yields an empty list.
    """
    entities = []
    for annotation in annotations or []:
        points = annotation['points']
        if not isinstance(points, list):
            points = [points]
        labels = annotation['label']
        if not isinstance(labels, list):
            labels = [labels]
        for point in points:
            for label in labels:
                # The +1 is kept from the original conversion; presumably the
                # input 'end' offset is inclusive while spaCy expects an
                # exclusive end -- confirm against the annotation tool's export.
                entities.append((point['start'], point['end'] + 1, label))
    return entities


@plac.annotations(input_file=("Input file", "option", "i", str), output_file=("Output file", "option", "o", str))
def main(input_file=None, output_file=None):
    """Convert a JSON-lines annotation file into pickled spaCy training data.

    Every non-blank line of ``input_file`` is parsed as one JSON object with a
    ``'content'`` string and an ``'annotation'`` list. The result is a list of
    ``(text, {"entities": [(start, end, label), ...]})`` tuples, which is
    printed to stdout and pickled to ``output_file``.

    Args:
        input_file: Path of the JSON-lines file to read (``-i`` on the CLI).
        output_file: Path of the pickle file to write (``-o`` on the CLI).

    Returns:
        None. Any failure (missing path, malformed JSON, missing keys, ...) is
        logged with its traceback instead of being raised.
    """
    try:
        training_data = []
        # JSON text is UTF-8; relying on the platform default encoding can
        # mis-decode non-ASCII text and shift the character offsets.
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                entities = _extract_entities(data['annotation'])
                training_data.append((text, {"entities": entities}))

        print(training_data)

        with open(output_file, 'wb') as fp:
            pickle.dump(training_data, fp)
    except Exception as e:
        # Lazy %-formatting keeps the handler itself from failing when
        # input_file is None (string concatenation would raise TypeError).
        logging.exception("Unable to process %s\nerror = %s", input_file, e)
        return None
# Script entry point: plac derives the -i/-o command-line options from the
# annotations on main() and then invokes it.
if __name__ == '__main__':
    plac.call(main)
Hey @sonalisss,
Are you trying to convert `tsv -> json` or `json -> spacy`?
According to your code it seems that you are trying to convert `csv -> json`. You can convert `tsv -> json` using this script. Make sure that you pass a `tsv` file.
If you have a `json` file which you would like to convert to `spacy` format, then please share your `json` file with me so that I can spot where the problem is.
Hello,
When I give the command-
It gives me:
python: can't open file 'json_to_spacy.py': [Errno 2] No such file or directory
Can you help please!
Hi @kaustumbh7, I was trying to train spaCy NER on a custom dataset. I am not able to get correct results on the test dataset after saving the model.
However, if I pass the test dataset directly on the model variable, I get good result.
Any tips to solve this error ?
Here is the link of the train dataset.
https://drive.google.com/file/d/1nUvMht6BWhBPVooqL_uNlpUvBOvXh17m/view?usp=sharing
And test dataset
https://drive.google.com/file/d/1H9SrZsueuZ9pescNxNowQqYFOvPqswMB/view?usp=sharing
Thank you Very Much!!! Helped Me a lot.....!!
Hi @kaustumbh7, I'm trying to convert a json file to the spaCy format.
The command I run is
python json_to_spacy.py -i input_file -o output_file
But I got an error:
output_file is not UTF-8 encoded Saving disabled. See Console for more details.
Here is the console screenshot.
Hello @kaustumbh7,
I believe that there is a mistake in this section of the code:
for annotation in data['annotation']:
point = annotation['points'][0]
labels = annotation['label']
if not isinstance(labels, list):
labels = [labels]
for label in labels:
entities.append((point['start'], point['end'] + 1 ,label))
Namely the "annotation['points'][0]" part, it's always getting the first point and appending only that one to the entities, furthermore:
for label in labels:
entities.append((point['start'], point['end'] + 1 ,label))
will append only one (start, end, label) tuple for each label (because each annotation contains exactly one label), so if a label has more than one entity, the others will be discarded.
I suggest using the following code instead, to allow more than one point per label:
for annotation in data['annotation']:
points = annotation['points']
label = annotation['label'][0]
if not isinstance(points, list):
points = [points]
for point in points:
entities.append((point['start'], point['end'] + 1, label))
With this we can get the data as such:
('I like London and Berlin.',
[(7, 13, 'LOC'), (18, 24, 'LOC')])
Instead of
('I like London and Berlin.',
[(7, 13, 'LOC')])
Hey @sonalisss
You get `KeyError: 'annotation'` because your `json` file does not have any key called `annotation`. Can you please share your `json` file so that I can get a better understanding of your data? If you are trying to make your custom NER using spaCy then please refer to this