import datasets
import evaluate
import numpy as np

_CITATION = """\
@inproceedings{palotti2019,
  author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
  title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
  series = {SIGIR'19},
  year = {2019},
  location = {Paris, France},
  publisher = {ACM}
}
"""
_DESCRIPTION = """\
A metric to evaluate ranking tasks using the TREC evaluation tool. It compares predicted rankings of items (e.g., documents) against their true relevance scores. The metric takes two inputs, references (true relevance scores) and predictions (predicted scores), both as lists of lists, where entry (i, j) is the true or predicted score of document j in the collection for query i. In a nutshell, it simplifies the use of TREC to compute ranking metrics given scores per sample.
"""
_KWARGS_DESCRIPTION = """\
Computes MAP, P@K, RR, and NDCG using the TREC evaluation tool.
Args:
    references (list(list(float))): true relevance scores for each query
    predictions (list(list(float))): predicted scores for each query
Returns:
    dict: the set of TREC metric scores
Example:
    # Entry (i, j) is the true/predicted score of document j in the collection for query i.
    references = [[5, 0, 3, 0, 0, 2, 1],
                  [5, 0, 3, 0, 0, 2, 1],
                  [5, 0, 3, 0, 0, 2, 1],
                  [0, 1, 2]]
    predictions = [[3, 4, 2, 0, 1, 5, 0],
                   [2, 0, 4, 5, 0, 1, 3],
                   [0, 3, 2, 1, 5, 0, 4],
                   [5, 3, 2]]
    metric = evaluate.load("symanto/ranking_evaluator")
    metric.compute(references=references, predictions=predictions)
"""

class RankingEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("float32")),
                    "references": datasets.Sequence(datasets.Value("float32")),
                }
            ),
        )
    def _download_and_prepare(self, dl_manager):
        # Reuse the official trec_eval metric as the computation backend.
        self.trec_eval = evaluate.load("trec_eval")
    def _compute(
        self, references: list[list[float]], predictions: list[list[float]]
    ) -> dict:
        """
        Calculates MAP, P@K, RR, and NDCG using the TREC evaluation tool.

        Args:
            references (list(list(float))): true relevance scores for each query
            predictions (list(list(float))): predicted scores for each query

        Returns:
            dict: the set of TREC metric scores

        Example:
            # Entry (i, j) is the true/predicted score of document j in the collection for query i.
            references = [[5, 0, 3, 0, 0, 2, 1],
                          [5, 0, 3, 0, 0, 2, 1],
                          [5, 0, 3, 0, 0, 2, 1],
                          [0, 1, 2]]
            predictions = [[3, 4, 2, 0, 1, 5, 0],
                           [2, 0, 4, 5, 0, 1, 3],
                           [0, 3, 2, 1, 5, 0, 4],
                           [5, 3, 2]]
            metric = evaluate.load("symanto/ranking_evaluator")
            metric.compute(references=references, predictions=predictions)
        """
        qrel = {}
        run = {}

        # Fill qrel: one row per relevant (relevance > 0) document per query.
        for query_idx, truth in enumerate(references):
            for item_idx, relevance in enumerate(truth):
                if relevance > 0:
                    qrel.setdefault("query", []).append(query_idx)
                    qrel.setdefault("q0", []).append("q0")
                    qrel.setdefault("docid", []).append(f"doc_{item_idx}")
                    qrel.setdefault("rel", []).append(relevance)

        # Fill run: one row per retrieved (score > 0) document per query.
        for query_idx, pred in enumerate(predictions):
            # Double argsort: rank 0 is the document with the highest predicted score.
            ranking = np.argsort(np.argsort(pred)[::-1])
            for item_idx, score in enumerate(pred):
                if score > 0:
                    run.setdefault("query", []).append(query_idx)
                    run.setdefault("q0", []).append("q0")
                    run.setdefault("docid", []).append(f"doc_{item_idx}")
                    run.setdefault("score", []).append(score)
                    run.setdefault("system", []).append("sys")
                    run.setdefault("rank", []).append(ranking[item_idx])

        return self.trec_eval.compute(references=[qrel], predictions=[run])
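

# Minimal usage sketch (assumes this module is loadable from the Hub under the
# "symanto/ranking_evaluator" id used in the docstrings above; a local checkout
# could instead be loaded by passing this script's path to `evaluate.load`).
if __name__ == "__main__":
    references = [[5, 0, 3, 0, 0, 2, 1], [0, 1, 2]]
    predictions = [[3, 4, 2, 0, 1, 5, 0], [5, 3, 2]]
    metric = evaluate.load("symanto/ranking_evaluator")
    print(metric.compute(references=references, predictions=predictions))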