import datasets
import evaluate
import numpy as np

_CITATION = """\
@inproceedings{palotti2019,
    author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
    title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
    series = {SIGIR'19},
    year = {2019},
    location = {Paris, France},
    publisher = {ACM}
}
"""

_DESCRIPTION = """\
A metric to evaluate ranking tasks using the TREC evaluation tool.
It compares predicted rankings of items (e.g., documents) against their true relevance scores.
The metric takes two inputs, references (true relevance scores) and predictions (predicted scores),
both as lists of lists, where entry (i, j) is the true or predicted score of document j in the
collection for query i. In a nutshell, it simplifies using TREC to compute ranking metrics from
per-sample scores.
"""

_KWARGS_DESCRIPTION = """
Computes MAP, P@K, RR, and NDCG using the TREC evaluation tool.

Args:
    references (list(list(float))): true scores for each query
    predictions (list(list(float))): predicted scores for each query

Returns:
    Dict: the set of TREC's metric scores

Example:
    # Entry (i, j) is the true/predicted score of document j in the collection for query i
    references = [[5, 0, 3, 0, 0, 2, 1], [5, 0, 3, 0, 0, 2, 1], [5, 0, 3, 0, 0, 2, 1], [0, 1, 2]]
    predictions = [[3, 4, 2, 0, 1, 5, 0], [2, 0, 4, 5, 0, 1, 3], [0, 3, 2, 1, 5, 0, 4], [5, 3, 2]]
    metric = evaluate.load("symanto/ranking_evaluator")
    metric.compute(references=references, predictions=predictions)
"""


class RankingEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("float32")),
                    "references": datasets.Sequence(datasets.Value("float32")),
                }
            ),
        )

    def _download_and_prepare(self, dl_manager):
        # Delegate the actual scoring to the `trec_eval` metric.
        self.trec_eval = evaluate.load("trec_eval")

    def _compute(
        self, references: list[list[float]], predictions: list[list[float]]
    ) -> dict:
        """
        Calculates MAP, P@K, RR, and NDCG using the TREC evaluation tool.

        Args:
            references (list(list(float))): true scores for each query
            predictions (list(list(float))): predicted scores for each query

        Returns:
            Dict: the set of TREC's metric scores

        Example:
            # Entry (i, j) is the true/predicted score of document j for query i
            references = [[5, 0, 3, 0, 0, 2, 1], [5, 0, 3, 0, 0, 2, 1], [5, 0, 3, 0, 0, 2, 1], [0, 1, 2]]
            predictions = [[3, 4, 2, 0, 1, 5, 0], [2, 0, 4, 5, 0, 1, 3], [0, 3, 2, 1, 5, 0, 4], [5, 3, 2]]
            metric = evaluate.load("symanto/ranking_evaluator")
            metric.compute(references=references, predictions=predictions)
        """
        qrel = {}
        run = {}

        # Fill qrel: one row per (query, document) pair with a positive true
        # relevance score; zero-relevance documents are omitted, as in TREC qrels.
        for query_idx, truth in enumerate(references):
            for item_idx, relevance in enumerate(truth):
                if relevance > 0:
                    qrel.setdefault("query", []).append(query_idx)
                    qrel.setdefault("q0", []).append("q0")
                    qrel.setdefault("docid", []).append(f"doc_{item_idx}")
                    qrel.setdefault("rel", []).append(relevance)

        # Fill run: one row per (query, document) pair with a positive predicted score.
        for query_idx, pred in enumerate(predictions):
            # Double argsort converts scores into ranks: rank 0 is the highest-scored document.
            ranking = np.argsort(np.argsort(pred)[::-1])
            for item_idx, score in enumerate(pred):
                if score > 0:
                    run.setdefault("query", []).append(query_idx)
                    run.setdefault("q0", []).append("q0")
                    run.setdefault("docid", []).append(f"doc_{item_idx}")
                    run.setdefault("score", []).append(score)
                    run.setdefault("system", []).append("sys")
                    run.setdefault("rank", []).append(ranking[item_idx])

        return self.trec_eval.compute(references=[qrel], predictions=[run])
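

# Minimal usage sketch (not part of the original module). It mirrors the docstring
# example but instantiates RankingEvaluator directly instead of going through
# evaluate.load("symanto/ranking_evaluator"); calling download_and_prepare() explicitly
# triggers _download_and_prepare so the underlying `trec_eval` metric gets loaded,
# which assumes that metric is available locally or downloadable.
if __name__ == "__main__":
    references = [[5, 0, 3, 0, 0, 2, 1], [5, 0, 3, 0, 0, 2, 1], [5, 0, 3, 0, 0, 2, 1], [0, 1, 2]]
    predictions = [[3, 4, 2, 0, 1, 5, 0], [2, 0, 4, 5, 0, 1, 3], [0, 3, 2, 1, 5, 0, 4], [5, 3, 2]]

    metric = RankingEvaluator()
    metric.download_and_prepare()  # loads the `trec_eval` dependency
    results = metric.compute(references=references, predictions=predictions)
    print(results)  # dict of TREC scores (MAP, P@K, RR, NDCG, ...) as reported by trec_eval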