first commit
- README.md +1 -1
- app.py +5 -0
- gradio_tst.py +143 -0
- ranking_evaluator.py +115 -0
- requirements.txt +2 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Ranking Evaluator
-emoji:
+emoji: π
 colorFrom: gray
 colorTo: green
 sdk: gradio
app.py
ADDED
@@ -0,0 +1,5 @@
+import evaluate
+from gradio_tst import launch_gradio_widget2
+
+module = evaluate.load("ranking_evaluator.py")
+launch_gradio_widget2(module)
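For context, the loaded module can also be called directly in Python without the widget. A minimal sketch, reusing the scores from the docstring example in ranking_evaluator.py; the exact keys of the returned dict come from the underlying trec_eval module.

import evaluate

# Load the metric script added in this commit (run from the repo root).
module = evaluate.load("ranking_evaluator.py")

# Truth and predicted scores per query, as in the module's docstring example.
references = [[5, 0, 3, 0, 0, 2, 1], [0, 1, 2]]
predictions = [[3, 4, 2, 0, 1, 5, 0], [5, 3, 2]]

# Returns a dict of TREC metrics (MAP, P@K, RR, NDCG, ...).
results = module.compute(references=references, predictions=predictions)
print(results)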
gradio_tst.py
ADDED
@@ -0,0 +1,143 @@
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+
+import numpy as np
+from datasets import Value
+
+REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")
+
+
+def infer_gradio_input_types(feature_types):
+    """
+    Maps metric feature types to input types for gradio Dataframes:
+    - float/int -> numbers
+    - string -> strings
+    - any other -> json
+    Note that json is not a native gradio type but will be treated as a string
+    that is then parsed as JSON.
+    """
+    input_types = []
+    for feature_type in feature_types:
+        input_type = "json"
+        if isinstance(feature_type, Value):
+            if feature_type.dtype.startswith(
+                "int"
+            ) or feature_type.dtype.startswith("float"):
+                input_type = "number"
+            elif feature_type.dtype == "string":
+                input_type = "str"
+        input_types.append(input_type)
+    return input_types
+
+
+def json_to_string_type(input_types):
+    """Maps json input type to str."""
+    return ["str" if i == "json" else i for i in input_types]
+
+
+def parse_readme(filepath):
+    """Parses a repository's README and strips the YAML preamble."""
+    if not os.path.exists(filepath):
+        return "No README.md found."
+    with open(filepath, "r") as f:
+        text = f.read()
+        match = REGEX_YAML_BLOCK.search(text)
+        if match:
+            text = text[match.end() :]
+    return text
+
+
+def parse_gradio_data(data, input_types):
+    """Parses data from gradio Dataframe for use in metric."""
+    metric_inputs = {}
+    data.replace("", np.nan, inplace=True)
+    data.dropna(inplace=True)
+    for feature_name, input_type in zip(data, input_types):
+        if input_type == "json":
+            metric_inputs[feature_name] = [
+                json.loads(d) for d in data[feature_name].to_list()
+            ]
+        elif input_type == "str":
+            metric_inputs[feature_name] = [
+                d.strip('"') for d in data[feature_name].to_list()
+            ]
+        else:
+            metric_inputs[feature_name] = data[feature_name]
+    return metric_inputs
+
+
+def parse_test_cases(test_cases, feature_names, input_types):
+    """
+    Parses test cases to be used in gradio Dataframe. Note that quotes are added
+    around strings to follow the JSON format.
+    """
+    if len(test_cases) == 0:
+        return None
+    examples = []
+    for test_case in test_cases:
+        parsed_cases = []
+        for feat, input_type in zip(feature_names, input_types):
+            if input_type == "json":
+                parsed_cases.append(
+                    [str(element) for element in test_case[feat]]
+                )
+            elif input_type == "str":
+                parsed_cases.append(
+                    ['"' + element + '"' for element in test_case[feat]]
+                )
+            else:
+                parsed_cases.append(test_case[feat])
+        examples.append([list(i) for i in zip(*parsed_cases)])
+    return examples
+
+
+def launch_gradio_widget2(metric):
+    """Launches `metric` widget with Gradio."""
+
+    try:
+        import gradio as gr
+    except ImportError as error:
+        logging.error(
+            "To create a metric widget with Gradio make sure gradio is installed."
+        )
+        raise error
+
+    local_path = Path(sys.path[0])
+    # if there are several input types, use first as default.
+    if isinstance(metric.features, list):
+        (feature_names, feature_types) = zip(*metric.features[0].items())
+    else:
+        (feature_names, feature_types) = zip(*metric.features.items())
+    gradio_input_types = infer_gradio_input_types(feature_types)
+
+    def compute(data):
+        return metric.compute(**parse_gradio_data(data, gradio_input_types))
+
+    test_cases = [
+        {
+            "predictions": [[0.0, 5.0, 4.0, 3.0], [1.0, 4.0, 2.0]],
+            "references": [[0.0, 5.0, 4.0, 3.0], [4.0, 3.0, 1.0]],
+        }
+    ]
+    iface = gr.Interface(
+        fn=compute,
+        inputs=gr.Dataframe(
+            headers=feature_names,
+            col_count=len(feature_names),
+            row_count=1,
+            datatype=json_to_string_type(gradio_input_types),
+        ),
+        outputs=gr.Textbox(label=metric.name),
+        description=(metric.info.description),
+        title=f"Metric: {metric.name}",
+        article=parse_readme(local_path / "README.md"),
+        examples=[
+            parse_test_cases(test_cases, feature_names, gradio_input_types)
+        ],
+    )
+
+    iface.launch(share=True)
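As a rough sketch (not part of the commit) of how these helpers behave for this metric's feature schema: both columns are sequences of floats, so they fall through to the "json" input type, and test cases are stringified element by element so each Dataframe cell holds JSON-parseable text.

from datasets import Features, Sequence, Value

from gradio_tst import infer_gradio_input_types, json_to_string_type, parse_test_cases

# Same feature schema as ranking_evaluator.py declares in _info().
features = Features(
    {
        "predictions": Sequence(Value("float32")),
        "references": Sequence(Value("float32")),
    }
)
feature_names, feature_types = zip(*features.items())

input_types = infer_gradio_input_types(feature_types)
print(input_types)                       # ['json', 'json'] -- a Sequence is not a Value
print(json_to_string_type(input_types))  # ['str', 'str'], the datatype gr.Dataframe receives

test_cases = [{"predictions": [[0.0, 5.0]], "references": [[1.0, 0.0]]}]
print(parse_test_cases(test_cases, feature_names, input_types))
# [[['[0.0, 5.0]', '[1.0, 0.0]']]] -- one Dataframe row per sample, elements stringified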
ranking_evaluator.py
ADDED
@@ -0,0 +1,115 @@
+import datasets
+import evaluate
+import numpy as np
+
+_CITATION = """\
+@inproceedings{palotti2019,
+ author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
+ title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
+ series = {SIGIR'19},
+ year = {2019},
+ location = {Paris, France},
+ publisher = {ACM}
+}
+"""
+
+_DESCRIPTION = """\
+A metric to evaluate ranking tasks using the TREC evaluation tool. It compares predicted rankings of items (e.g., documents) with their true relevance scores. The metric takes two inputs: references (true relevance scores) and predictions (predicted scores), both as lists of lists, where each (i, j) is the truth or the predicted score of the document j in the query i. In a nutshell: simplifies the usage of TREC to compute ranking metrics given scores per sample.
+"""
+
+_KWARGS_DESCRIPTION = """ Computes MAP, P@K, RR, and NDCG using the TREC evaluation tool.
+
+Args:
+    references (list(list(float))): true scores for each query
+    predictions (list(list(float))): pred scores for each query
+
+Returns:
+    Dict: the set of TREC's metrics scores
+
+Example:
+    # (i, j) means the truth/predicted score of the document j in the query i
+    references = [[5, 0, 3, 0, 0, 2, 1],
+                  [5, 0, 3, 0, 0, 2, 1],
+                  [5, 0, 3, 0, 0, 2, 1],
+                  [0, 1, 2]]
+
+    predictions = [[3, 4, 2, 0, 1, 5, 0],
+                   [2, 0, 4, 5, 0, 1, 3],
+                   [0, 3, 2, 1, 5, 0, 4],
+                   [5, 3, 2]]
+
+    metric = evaluate.load("symanto/ranking_evaluator")
+    metric.compute(references=references, predictions=predictions)
+"""
+
+
+class RankingEvaluator(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Sequence(datasets.Value("float32")),
+                    "references": datasets.Sequence(datasets.Value("float32")),
+                }
+            ),
+        )
+
+    def _download_and_prepare(self, dl_manager):
+        self.trec_eval = evaluate.load("trec_eval")
+
+    def _compute(
+        self, references: list[list[float]], predictions: list[list[float]]
+    ) -> dict:
+        """
+        Calculates MAP, P@K, RR, and NDCG using the TREC evaluation tool.
+
+        Args:
+            references (list(list(float))): true scores for each query
+            predictions (list(list(float))): pred scores for each query
+
+        Returns:
+            Dict: the set of TREC's metrics scores
+
+        Example:
+            # (i, j) means the truth/predicted score of the document j in the query i
+            references = [[5, 0, 3, 0, 0, 2, 1],
+                          [5, 0, 3, 0, 0, 2, 1],
+                          [5, 0, 3, 0, 0, 2, 1],
+                          [0, 1, 2]]
+
+            predictions = [[3, 4, 2, 0, 1, 5, 0],
+                           [2, 0, 4, 5, 0, 1, 3],
+                           [0, 3, 2, 1, 5, 0, 4],
+                           [5, 3, 2]]
+
+            metric = evaluate.load("symanto/ranking_evaluator")
+            metric.compute(references=references, predictions=predictions)
+        """
+        qrel = {}
+        run = {}
+
+        # Fill qrel
+        for query_idx, truth in enumerate(references):
+            for item_idx, relevance in enumerate(truth):
+                if relevance > 0:
+                    qrel.setdefault("query", []).append(query_idx)
+                    qrel.setdefault("q0", []).append("q0")
+                    qrel.setdefault("docid", []).append(f"doc_{item_idx}")
+                    qrel.setdefault("rel", []).append(relevance)
+
+        # Fill run
+        for query_idx, pred in enumerate(predictions):
+            ranking = np.argsort(np.argsort(pred)[::-1])
+            for item_idx, score in enumerate(pred):
+                if score > 0:
+                    run.setdefault("query", []).append(query_idx)
+                    run.setdefault("q0", []).append("q0")
+                    run.setdefault("docid", []).append(f"doc_{item_idx}")
+                    run.setdefault("score", []).append(score)
+                    run.setdefault("system", []).append("sys")
+                    run.setdefault("rank", []).append(ranking[item_idx])
+
+        return self.trec_eval.compute(references=[qrel], predictions=[run])
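To make the qrel/run translation concrete, here is a standalone sketch (not part of the commit) that mirrors the loops in _compute for a single query. The column names (query, q0, docid, rel, score, system, rank) are the columnar format the trec_eval module consumes.

import numpy as np

# One query: item 1 is the most relevant, item 0 is irrelevant, item 2 is mildly relevant.
references = [[0.0, 5.0, 2.0]]
predictions = [[1.0, 4.0, 3.0]]

qrel, run = {}, {}

# Only items with a positive truth score enter the qrel (same filter as _compute).
for query_idx, truth in enumerate(references):
    for item_idx, relevance in enumerate(truth):
        if relevance > 0:
            qrel.setdefault("query", []).append(query_idx)
            qrel.setdefault("q0", []).append("q0")
            qrel.setdefault("docid", []).append(f"doc_{item_idx}")
            qrel.setdefault("rel", []).append(relevance)

# Rank 0 goes to the highest predicted score; items with score 0 are dropped.
for query_idx, pred in enumerate(predictions):
    ranking = np.argsort(np.argsort(pred)[::-1])
    for item_idx, score in enumerate(pred):
        if score > 0:
            run.setdefault("query", []).append(query_idx)
            run.setdefault("q0", []).append("q0")
            run.setdefault("docid", []).append(f"doc_{item_idx}")
            run.setdefault("score", []).append(score)
            run.setdefault("system", []).append("sys")
            run.setdefault("rank", []).append(ranking[item_idx])

print(qrel)  # {'query': [0, 0], 'q0': ['q0', 'q0'], 'docid': ['doc_1', 'doc_2'], 'rel': [5.0, 2.0]}
print(run)   # doc_1 gets rank 0 (highest predicted score), doc_2 rank 1, doc_0 rank 2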
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+trectools
+git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d