''' Script for downloading all GLUE data.

Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).

mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi

1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.

2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''

import os
import sys
import shutil
import argparse
import tempfile
import io
import urllib.request
import zipfile

URLLIB = urllib.request

TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
             "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
             "QQP":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
             "STS":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
             "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
             "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
             "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
             "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
             "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv'}

MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'

def download_and_extract(task, data_dir):
    print("Downloading and extracting %s..." % task)
    if task == "MNLI":
        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
    data_file = "%s.zip" % task
    urllib.request.urlretrieve(TASK2PATH[task], data_file)
    with zipfile.ZipFile(data_file) as zip_ref:
        zip_ref.extractall(data_dir)
    os.remove(data_file)
    print("\tCompleted!")

def format_mrpc(data_dir, path_to_data):
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    if not os.path.isdir(mrpc_dir):
        os.mkdir(mrpc_dir)
    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        try:
            mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
            mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
            URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
            URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file)
        except urllib.error.HTTPError:
            print("Error downloading MRPC")
            return
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
        header = data_fh.readline()
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            label, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))

    try:
        URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))
    except (KeyError, urllib.error.HTTPError):
        print("\tError downloading standard development IDs for MRPC. You will need to manually split your data.")
        return

    dev_ids = []
    with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh:
        for row in ids_fh:
            dev_ids.append(row.strip().split('\t'))

    with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
            io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            if [id1, id2] in dev_ids:
                dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
            else:
                train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
    print("\tCompleted!")

def download_diagnostic(data_dir):
    print("Downloading and extracting diagnostic...")
    if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
        os.mkdir(os.path.join(data_dir, "diagnostic"))
    data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
    print("\tCompleted!")
    return

def get_tasks(task_names):
    task_names = task_names.split(',')
    if "all" in task_names:
        tasks = TASKS
    else:
        tasks = []
        for task_name in task_names:
            assert task_name in TASKS, "Task %s not found!" % task_name
            tasks.append(task_name)
    return tasks

def main(arguments):
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    if not os.path.isdir(args.data_dir):
        os.mkdir(args.data_dir)
    tasks = get_tasks(args.tasks)

    for task in tasks:
        if task == 'MRPC':
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)

if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
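For anyone unsure how to run it: a quick, hypothetical driver based on the argparse flags above. The module name download_glue_data matches the file name in the tracebacks below, and the directory names are only examples.

```python
# Equivalent to running, from a shell:
#   python download_glue_data.py --data_dir glue_data --tasks all
#   python download_glue_data.py --data_dir glue_data --tasks MRPC --path_to_mrpc MRPC
# Assumes the script is saved as download_glue_data.py in the current directory.
from download_glue_data import main

main(['--data_dir', 'glue_data', '--tasks', 'all'])        # fetch every GLUE task
main(['--data_dir', 'glue_data', '--tasks', 'MRPC',        # or just MRPC, pointing at a
      '--path_to_mrpc', 'MRPC'])                           # locally extracted MRPC folder
```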
Got a NameError: 'URLLIB' is not defined here:

Traceback (most recent call last):
  File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 150, in <module>
    sys.exit(main(sys.argv[1:]))
  File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 142, in main
    format_mrpc(args.data_dir, args.path_to_mrpc)
  File "C:\Users\cpeng4\Downloads\download_glue_data\download_glue_data.py", line 65, in format_mrpc
    URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file)
NameError: name 'URLLIB' is not defined

I managed to get the MRPC download completed by adding the following three lines:

import io
URLLIB = urllib.request
'MRPC': 'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict (creds to @laouer)
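In case the formatting above is hard to follow, here is roughly where those three additions sit relative to the script (a sketch only; the dev_ids URL is the community-hosted copy quoted above, not an official GLUE mirror):

```python
import io                  # needed for the io.open() calls in format_mrpc
import urllib.request

URLLIB = urllib.request    # alias the MRPC code path expects

TASK2PATH = {
    # ... the existing GLUE entries stay unchanged ...
    # Extra entry so format_mrpc can fetch dev_ids.tsv instead of asking you
    # to split the data manually (community-hosted copy, from the comment above):
    "MRPC": 'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv',
}
```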
Thanks!
"QQP":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
"STS":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip'
The URLs of the two datasets are reversed.
Thanks! @stevenwjy
Thanks!
"QQP":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
"STS":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip'
The urls of the two datasets are reversed.
Yes, you are right. Thanks!
I'm getting the following warning:
Downloading and extracting MNLI...
Note (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.
Here is the script with all the fixes so far
I am getting the following error when trying to download the MRPC files for the BERT classifier:
Processing MRPC...
Error downloading standard development IDs for MRPC. You will need to manually split your data.
***** Task data directory: glue_data/MRPC *****
msr_paraphrase_test.txt msr_paraphrase_train.txt test.tsv
***** Model output directory: gs://capstone-testing/bert-tfhub/models/MRPC *****
What is the best method to resolve this?
import io
URLLIB = urllib.request
'MRPC': 'https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv' inside the TASK2PATH dict
Thanks a lot! It works for me, too
+1, I'm getting the same MNLI warning.
You should download the SNLI dataset manually from https://nlp.stanford.edu/projects/snli/snli_1.0.zip
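In case it helps, a minimal sketch of that manual step, following the same download-and-unzip pattern the script uses. It only fetches and extracts the archive; any reformatting your pipeline needs is still up to you, and the glue_data directory name is just an example.

```python
import os
import urllib.request
import zipfile

SNLI_URL = 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip'

def download_snli(data_dir='glue_data'):
    """Manually fetch SNLI next to the GLUE tasks (the main script no longer does this)."""
    os.makedirs(data_dir, exist_ok=True)
    data_file = os.path.join(data_dir, 'snli_1.0.zip')
    urllib.request.urlretrieve(SNLI_URL, data_file)   # plain download, as for the other tasks
    with zipfile.ZipFile(data_file) as zip_ref:
        zip_ref.extractall(data_dir)                  # unpacks an snli_1.0/ folder into data_dir
    os.remove(data_file)                              # drop the zip, keep the extracted files

if __name__ == '__main__':
    download_snli()
```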
Thanks
Thanks for saving some time for all of us :)
Here is the fixed script in case anyone else needs it: https://gist.github.com/vlasenkoalexey/fef1601580f269eca73bf26a198595f3
Nice! Worked for me.
It works for me. Thanks a lot!!
thanks a lot!
it really works!
Thanks a lot!!
u r real hero
Why does this keep getting referenced and used for years with some obvious errors? Is this a test?
fine!