Created
November 5, 2020 17:59
-
-
Save davidefiocco/eb92fc5427d2924e52858361ce98ef1a to your computer and use it in GitHub Desktop.
Prodigy recipe to categorize differences in text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"removed":"These results suggested that the distribution of bacterial communities was driven more by sample types than the separate caves from which samples were collected.","added":"These results suggest that the distribution of bacterial communities is driven more by sample types than the separate caves from which samples were collected.","meta":{"score":1}} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import prodigy | |
from prodigy.components.loaders import JSONL | |
# Run with:
#   python -m prodigy diff-textcat copyedits copyedits.jsonl -F .\diff_text.py
def add_label_to_stream(stream, label):
    """Lazily yield each task from *stream* with a ``"label"`` key set.

    Args:
        stream: Iterable of mutable, dict-like tasks.
        label: Indexable collection of labels; only its first element is
            used. Per the original author's note, the value coming from
            the command line is a list and is assumed to hold one label.

    Yields:
        The same task objects, updated in place with ``task["label"]``.
    """

    def _tag(task):
        # Look the label up per task so an empty stream never indexes
        # into ``label`` at all.
        task["label"] = label[0]
        return task

    yield from map(_tag, stream)
@prodigy.recipe(
    "diff-textcat",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a JSONL file", "positional", None, str),
)
def copyedit(dataset, source):
    """Prodigy recipe combining a diff block with a classification block.

    Args:
        dataset: Name of the dataset, passed through unchanged in the
            returned recipe components.
        source: The source data as a JSONL file; it is handed to the
            ``JSONL`` loader to build the stream.

    Returns:
        A dict with the keys ``"stream"``, ``"dataset"``, ``"view_id"``
        (always ``"blocks"``) and ``"config"`` (holding the block list).
    """
    stream = JSONL(source)
    # Two stacked interfaces per task: the diff first, then classification.
    # Presumably the diff block reads each task's "removed"/"added" fields
    # -- verify against the Prodigy interface documentation.
    blocks = [
        {"view_id": "diff"},
        {"view_id": "classification"},
    ]
    # NOTE(review): a no-op `stream = stream` used to sit here, and
    # add_label_to_stream() in this file is never applied to the stream.
    # Confirm whether a label was meant to be attached to each task.
    return {
        "stream": stream,
        "dataset": dataset,
        "view_id": "blocks",
        "config": {
            "blocks": blocks,
        },
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment