Skip to content

Instantly share code, notes, and snippets.

@kwang2049
Last active June 30, 2021 07:52
Show Gist options
  • Save kwang2049/e16a2f5d8a9cf4fa2f8c685d517ac9ec to your computer and use it in GitHub Desktop.
Save kwang2049/e16a2f5d8a9cf4fa2f8c685d517ac9ec to your computer and use it in GitHub Desktop.
Generate a 1M-document version of MS MARCO by keeping the documents from the train/dev/test qrels and randomly sampling the remaining documents as negatives. The msmarco-1m.zip is available at https://public.ukp.informatik.tu-darmstadt.de/kwang/datasets/ir/msmarco-1m.zip.
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
import logging
import pathlib, os
import random
import json
#### Just some code to print debug information to stdout
# Route all INFO-level logs through BEIR's LoggingHandler so progress of the
# download/load steps below is visible on stdout with timestamps.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout
dataset = "msmarco"
#### Download msmarco.zip dataset and unzip the dataset from the BeIR repo
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
# Datasets are unpacked into a ./datasets directory next to this script;
# data_path ends up pointing at the unzipped msmarco folder (it is passed
# as data_folder to GenericDataLoader below).
out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
#### Load data
# As consumed below: `corpus` maps doc-id -> dict of document fields, and each
# qrels_* maps query-id -> {doc-id: relevance-label} for its split.  The
# corpus/queries are taken from the first call; the dev/test calls only
# contribute their qrels.
corpus, queries, qrels_train = GenericDataLoader(data_folder=data_path).load(split="train")
_, _, qrels_dev = GenericDataLoader(data_folder=data_path).load(split="dev")
_, _, qrels_test = GenericDataLoader(data_folder=data_path).load(split="test")
#### Store document ids used in qrels
# Every judged document (train/dev/test) must survive the down-sampling, so
# collect their ids up front.  The relevance labels themselves are irrelevant
# here — only the document ids are needed — so iterate the three split dicts
# in one loop instead of three copy-pasted nested loops over .items().
eval_dids = set()
for qrels in (qrels_train, qrels_dev, qrels_test):
    for docs in qrels.values():
        eval_dids.update(docs)  # dict update: adds the doc-id keys
# Sample random negatives
# Top the corpus up to `ntotal` documents: draw uniform random distractors
# from the documents that carry no relevance judgement at all.
# NOTE(review): random is unseeded, so the sampled subset differs per run —
# seed it if exact reproducibility of the 1M corpus is desired.
ntotal = 1000000  # 1M target corpus size
unjudged = set(corpus.keys()) - eval_dids
negs = random.sample(list(unjudged), ntotal - len(eval_dids))
# Save new corpus
# Keep every judged document plus the sampled negatives, and write them out in
# BEIR's corpus.jsonl layout: one JSON object per line with the document id
# stored inline under '_id'.  The file is opened with an explicit utf-8
# encoding so the output does not depend on the platform's locale default.
corpus_new = {did: corpus[did] for did in set(negs) | eval_dids}
with open('corpus.jsonl', 'w', encoding='utf-8') as f:
    for did, val_dict in corpus_new.items():
        # Direct key assignment instead of update({...}), which built a
        # throwaway one-entry dict per document.
        val_dict['_id'] = did
        f.write(json.dumps(val_dict) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment