# bm25s/examples/evaluate_on_beir.py
"""
Sometimes you might be interested in benchmarking BM25 on BEIR. bm25s makes this straightforward in Python.
To install:
```
pip install bm25s[core] beir
```
Then run this script; you can modify the `run_benchmark()` call to use the dataset you want to test on.
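For example, with the default dataset used in `__main__` ("scidocs"), simply run:
```
python evaluate_on_beir.py
```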
"""
import json
import os
from pathlib import Path
import time
import beir.util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
import numpy as np
from tqdm.auto import tqdm
import Stemmer
import bm25s
from bm25s.utils.benchmark import get_max_memory_usage, Timer
from bm25s.utils.beir import (
BASE_URL,
clean_results_keys,
)


def postprocess_results_for_eval(results, scores, query_ids):
"""
Given the queried results and scores output by BM25S, postprocess them
to be compatible with BEIR evaluation functions.
query_ids is a list of query ids in the same order as the results.
"""
results_record = [
{"id": qid, "hits": results[i], "scores": list(scores[i])}
for i, qid in enumerate(query_ids)
]
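    # BEIR's evaluator expects results as {query_id: {doc_id: score}}, so convert
    # the per-query hit/score lists into that nested-dict format.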
result_dict_for_eval = {
res["id"]: {
docid: float(score) for docid, score in zip(res["hits"], res["scores"])
}
for res in results_record
}
return result_dict_for_eval


def run_benchmark(dataset, save_dir="datasets"):
    #### Download and unzip the dataset
data_path = beir.util.download_and_unzip(BASE_URL.format(dataset), save_dir)
split = "test" if dataset != "msmarco" else "dev"
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split=split)
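    # corpus: {doc_id: {"title": ..., "text": ...}}, queries: {query_id: text},
    # qrels: the relevance judgments used by the BEIR evaluator.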
corpus_ids, corpus_lst = [], []
for key, val in corpus.items():
corpus_ids.append(key)
corpus_lst.append(val["title"] + " " + val["text"])
del corpus
qids, queries_lst = [], []
for key, val in queries.items():
qids.append(key)
queries_lst.append(val)
stemmer = Stemmer.Stemmer("english")
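    # PyStemmer's English (Snowball) stemmer, shared by corpus and query tokenization.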
corpus_tokens = bm25s.tokenize(
corpus_lst, stemmer=stemmer, leave=False
)
del corpus_lst
query_tokens = bm25s.tokenize(
queries_lst, stemmer=stemmer, leave=False
)
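    # Lucene-style BM25 scoring with the standard k1=1.2, b=0.75 parameters.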
model = bm25s.BM25(method="lucene", k1=1.2, b=0.75)
model.index(corpus_tokens, leave_progress=False)
############## BENCHMARKING BEIR HERE ##############
queried_results, queried_scores = model.retrieve(
query_tokens, corpus=corpus_ids, k=1000, n_threads=4
)
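    # queried_results holds the top-k document ids per query (since corpus=corpus_ids),
    # and queried_scores holds the corresponding BM25 scores.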
results_dict = postprocess_results_for_eval(queried_results, queried_scores, qids)
ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(
qrels, results_dict, [1, 10, 100, 1000]
)
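    # Each metric is a dict keyed by cutoff, e.g. {"NDCG@1": ..., "NDCG@10": ..., "NDCG@100": ..., "NDCG@1000": ...}.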
print(ndcg)
print(recall)
return ndcg, _map, recall, precision


if __name__ == "__main__":
    ndcg, _map, recall, precision = run_benchmark("scidocs")  # Change to the BEIR dataset you want to evaluate