Last active
June 30, 2021 07:52
-
-
Save kwang2049/e16a2f5d8a9cf4fa2f8c685d517ac9ec to your computer and use it in GitHub Desktop.
Generate a 1M-document version of MS MARCO by keeping the dev/test qrels and randomly sampling other negatives. The msmarco-1m.zip archive is available at https://public.ukp.informatik.tu-darmstadt.de/kwang/datasets/ir/msmarco-1m.zip.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import logging
import os
import pathlib
import random

from beir import LoggingHandler, util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval import models
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

#### Configure logging: INFO level, timestamped messages, emitted through BEIR's LoggingHandler
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
#### /logging configuration
#### Download the zipped dataset from the BeIR repo and unpack it into ./datasets next to this script
dataset = "msmarco"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
script_dir = pathlib.Path(__file__).parent.absolute()
out_dir = os.path.join(script_dir, "datasets")
data_path = util.download_and_unzip(url, out_dir)

#### Load data
# Corpus and queries are taken from the train split; the dev and test loads
# are only used for their qrels. A separate loader is created for each split.
corpus, queries, qrels_train = GenericDataLoader(data_folder=data_path).load(split="train")
_, _, qrels_dev = GenericDataLoader(data_folder=data_path).load(split="dev")
_, _, qrels_test = GenericDataLoader(data_folder=data_path).load(split="test")
#### Gather every document id that is referenced by any qrels entry
# NOTE(review): the train qrels are included here as well as dev/test, so the
# "eval" in the name covers all three splits — confirm this is intended.
eval_dids = {
    did
    for split_qrels in (qrels_train, qrels_dev, qrels_test)
    for judged_docs in split_qrels.values()
    for did in judged_docs
}
# Sample random negatives
ntotal = 1000000  # 1M: target number of documents in the reduced corpus
seed = 42  # fixed so that re-running the script selects the same negatives

# Candidate negatives are all documents not referenced by any qrels entry.
# They are sorted before sampling because the iteration order of a set of
# strings can differ between interpreter runs (hash randomisation), which
# would make a seeded sample non-reproducible.
# assumes document ids are mutually orderable (e.g. all strings) — confirm
negs = sorted(set(corpus.keys()) - eval_dids)

# Number of negatives needed to fill the corpus up to `ntotal` documents.
nnegs = ntotal - len(eval_dids)
if not 0 <= nnegs <= len(negs):
    # random.sample would raise ValueError here too; say what actually went wrong.
    raise ValueError(
        "Cannot build a {}-document corpus: {} documents are referenced by the qrels "
        "and {} other documents are available for sampling".format(
            ntotal, len(eval_dids), len(negs)
        )
    )

# A private Random instance leaves the module-level generator state untouched.
negs = random.Random(seed).sample(negs, nnegs)
# Save new corpus
# The reduced corpus is the union of the sampled negatives and every document
# referenced by the qrels.
corpus_new = {did: corpus[did] for did in set(negs) | eval_dids}

# One JSON object per line, written to corpus.jsonl in the current working
# directory. The encoding is pinned so the file does not depend on the
# platform's default locale.
with open('corpus.jsonl', 'w', encoding='utf-8') as f:
    for did, val_dict in corpus_new.items():
        # Work on a copy: the value dicts are shared with `corpus`, and adding
        # the id in place would silently modify the loaded dataset.
        record = dict(val_dict)
        record['_id'] = did
        f.write(json.dumps(record) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment