jogonba2 committed on
Commit d891c3c · 1 Parent(s): 4929743

first commit

Files changed (5)
  1. README.md +1 -1
  2. app.py +5 -0
  3. gradio_tst.py +143 -0
  4. ranking_evaluator.py +115 -0
  5. requirements.txt +2 -0
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Ranking Evaluator
- emoji: 🐨
+ emoji: 🏆
  colorFrom: gray
  colorTo: green
  sdk: gradio
app.py ADDED
@@ -0,0 +1,5 @@
+ import evaluate
+ from gradio_tst import launch_gradio_widget2
+
+ module = evaluate.load("ranking_evaluator.py")
+ launch_gradio_widget2(module)
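
For reference, the loaded module can also be called programmatically instead of through the Gradio widget. A minimal sketch, assuming it runs from the repository root (so that `ranking_evaluator.py` resolves) and reusing the sample data from the widget's built-in test case:

import evaluate

# Load the metric from the local script, as app.py does.
module = evaluate.load("ranking_evaluator.py")

# Each inner list holds per-document scores for one query
# (data taken from the test case defined in gradio_tst.py).
references = [[0.0, 5.0, 4.0, 3.0], [4.0, 3.0, 1.0]]
predictions = [[0.0, 5.0, 4.0, 3.0], [1.0, 4.0, 2.0]]

# Returns a dict of TREC metric scores (MAP, P@K, RR, NDCG, ...).
results = module.compute(references=references, predictions=predictions)
print(results)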
gradio_tst.py ADDED
@@ -0,0 +1,143 @@
+ import json
+ import logging
+ import os
+ import re
+ import sys
+ from pathlib import Path
+
+ import numpy as np
+ from datasets import Value
+
+ REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")
+
+
+ def infer_gradio_input_types(feature_types):
+     """
+     Maps metric feature types to input types for gradio Dataframes:
+     - float/int -> numbers
+     - string -> strings
+     - any other -> json
+     Note that json is not a native gradio type but is treated as a string
+     that is then parsed as JSON.
+     """
+     input_types = []
+     for feature_type in feature_types:
+         input_type = "json"
+         if isinstance(feature_type, Value):
+             if feature_type.dtype.startswith("int") or feature_type.dtype.startswith("float"):
+                 input_type = "number"
+             elif feature_type.dtype == "string":
+                 input_type = "str"
+         input_types.append(input_type)
+     return input_types
+
+
+ def json_to_string_type(input_types):
+     """Maps the json input type to str."""
+     return ["str" if i == "json" else i for i in input_types]
+
+
+ def parse_readme(filepath):
+     """Parses a repository's README and removes its YAML front matter."""
+     if not os.path.exists(filepath):
+         return "No README.md found."
+     with open(filepath, "r") as f:
+         text = f.read()
+         match = REGEX_YAML_BLOCK.search(text)
+         if match:
+             text = text[match.end():]
+     return text
+
+
+ def parse_gradio_data(data, input_types):
+     """Parses data from a gradio Dataframe for use in the metric."""
+     metric_inputs = {}
+     data.replace("", np.nan, inplace=True)
+     data.dropna(inplace=True)
+     for feature_name, input_type in zip(data, input_types):
+         if input_type == "json":
+             metric_inputs[feature_name] = [
+                 json.loads(d) for d in data[feature_name].to_list()
+             ]
+         elif input_type == "str":
+             metric_inputs[feature_name] = [
+                 d.strip('"') for d in data[feature_name].to_list()
+             ]
+         else:
+             metric_inputs[feature_name] = data[feature_name]
+     return metric_inputs
+
+
+ def parse_test_cases(test_cases, feature_names, input_types):
+     """
+     Parses test cases to be used in a gradio Dataframe. Note that quotation marks
+     are added to strings to follow the JSON format.
+     """
+     if len(test_cases) == 0:
+         return None
+     examples = []
+     for test_case in test_cases:
+         parsed_cases = []
+         for feat, input_type in zip(feature_names, input_types):
+             if input_type == "json":
+                 parsed_cases.append([str(element) for element in test_case[feat]])
+             elif input_type == "str":
+                 parsed_cases.append(['"' + element + '"' for element in test_case[feat]])
+             else:
+                 parsed_cases.append(test_case[feat])
+         examples.append([list(i) for i in zip(*parsed_cases)])
+     return examples
+
+
+ def launch_gradio_widget2(metric):
+     """Launches the `metric` widget with Gradio."""
+
+     try:
+         import gradio as gr
+     except ImportError as error:
+         logging.error("To create a metric widget with Gradio make sure gradio is installed.")
+         raise error
+
+     local_path = Path(sys.path[0])
+     # If there are several input types, use the first as default.
+     if isinstance(metric.features, list):
+         (feature_names, feature_types) = zip(*metric.features[0].items())
+     else:
+         (feature_names, feature_types) = zip(*metric.features.items())
+     gradio_input_types = infer_gradio_input_types(feature_types)
+
+     def compute(data):
+         return metric.compute(**parse_gradio_data(data, gradio_input_types))
+
+     test_cases = [
+         {
+             "predictions": [[0.0, 5.0, 4.0, 3.0], [1.0, 4.0, 2.0]],
+             "references": [[0.0, 5.0, 4.0, 3.0], [4.0, 3.0, 1.0]],
+         }
+     ]
+     iface = gr.Interface(
+         fn=compute,
+         inputs=gr.Dataframe(
+             headers=feature_names,
+             col_count=len(feature_names),
+             row_count=1,
+             datatype=json_to_string_type(gradio_input_types),
+         ),
+         outputs=gr.Textbox(label=metric.name),
+         description=metric.info.description,
+         title=f"Metric: {metric.name}",
+         article=parse_readme(local_path / "README.md"),
+         examples=[parse_test_cases(test_cases, feature_names, gradio_input_types)],
+     )
+
+     iface.launch(share=True)
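
To illustrate how the helpers above fit together, here is a minimal sketch of what the widget's compute() callback sees, assuming the helpers are imported from gradio_tst.py and that Gradio hands the callback a pandas DataFrame whose cells are JSON-encoded strings:

import pandas as pd
from datasets import Sequence, Value

from gradio_tst import infer_gradio_input_types, parse_gradio_data

# Sequence features are not plain Values, so both columns map to "json".
feature_types = [Sequence(Value("float32")), Sequence(Value("float32"))]
assert infer_gradio_input_types(feature_types) == ["json", "json"]

# Simulated Dataframe input: one row per query, JSON-encoded score lists.
df = pd.DataFrame(
    {
        "predictions": ["[0.0, 5.0, 4.0, 3.0]", "[1.0, 4.0, 2.0]"],
        "references": ["[0.0, 5.0, 4.0, 3.0]", "[4.0, 3.0, 1.0]"],
    }
)

# Each "json" column is decoded cell by cell into lists of floats, which is
# the shape the metric's compute() expects.
metric_inputs = parse_gradio_data(df, ["json", "json"])
# metric_inputs == {"predictions": [[0.0, 5.0, 4.0, 3.0], [1.0, 4.0, 2.0]],
#                   "references": [[0.0, 5.0, 4.0, 3.0], [4.0, 3.0, 1.0]]}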
ranking_evaluator.py ADDED
@@ -0,0 +1,115 @@
+ import datasets
+ import evaluate
+ import numpy as np
+
+ _CITATION = """\
+ @inproceedings{palotti2019,
+     author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
+     title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
+     series = {SIGIR'19},
+     year = {2019},
+     location = {Paris, France},
+     publisher = {ACM}
+ }
+ """
+
+ _DESCRIPTION = """\
+ A metric to evaluate ranking tasks using the TREC evaluation tool. It compares predicted rankings of items (e.g., documents) with their true relevance scores. The metric takes two inputs, references (true relevance scores) and predictions (predicted scores), both as lists of lists, where entry (i, j) is the true or predicted score of document j in query i. In a nutshell, it simplifies using TREC to compute ranking metrics from per-sample scores.
+ """
+
+ _KWARGS_DESCRIPTION = """Computes MAP, P@K, RR, and NDCG using the TREC evaluation tool.
+
+ Args:
+     references (list(list(float))): true scores for each query
+     predictions (list(list(float))): predicted scores for each query
+
+ Returns:
+     Dict: the set of TREC metric scores
+
+ Example:
+     # (i, j) is the true/predicted score of document j in query i
+     references = [[5, 0, 3, 0, 0, 2, 1],
+                   [5, 0, 3, 0, 0, 2, 1],
+                   [5, 0, 3, 0, 0, 2, 1],
+                   [0, 1, 2]]
+
+     predictions = [[3, 4, 2, 0, 1, 5, 0],
+                    [2, 0, 4, 5, 0, 1, 3],
+                    [0, 3, 2, 1, 5, 0, 4],
+                    [5, 3, 2]]
+
+     metric = evaluate.load("symanto/ranking_evaluator")
+     metric.compute(references=references, predictions=predictions)
+ """
+
+
+ class RankingEvaluator(evaluate.Metric):
+     def _info(self):
+         return evaluate.MetricInfo(
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=datasets.Features(
+                 {
+                     "predictions": datasets.Sequence(datasets.Value("float32")),
+                     "references": datasets.Sequence(datasets.Value("float32")),
+                 }
+             ),
+         )
+
+     def _download_and_prepare(self, dl_manager):
+         self.trec_eval = evaluate.load("trec_eval")
+
+     def _compute(
+         self, references: list[list[float]], predictions: list[list[float]]
+     ) -> dict:
+         """
+         Calculates MAP, P@K, RR, and NDCG using the TREC evaluation tool.
+
+         Args:
+             references (list(list(float))): true scores for each query
+             predictions (list(list(float))): predicted scores for each query
+
+         Returns:
+             Dict: the set of TREC metric scores
+
+         Example:
+             # (i, j) is the true/predicted score of document j in query i
+             references = [[5, 0, 3, 0, 0, 2, 1],
+                           [5, 0, 3, 0, 0, 2, 1],
+                           [5, 0, 3, 0, 0, 2, 1],
+                           [0, 1, 2]]
+
+             predictions = [[3, 4, 2, 0, 1, 5, 0],
+                            [2, 0, 4, 5, 0, 1, 3],
+                            [0, 3, 2, 1, 5, 0, 4],
+                            [5, 3, 2]]
+
+             metric = evaluate.load("symanto/ranking_evaluator")
+             metric.compute(references=references, predictions=predictions)
+         """
+         qrel = {}
+         run = {}
+
+         # Fill qrel: only documents with relevance > 0 are included.
+         for query_idx, truth in enumerate(references):
+             for item_idx, relevance in enumerate(truth):
+                 if relevance > 0:
+                     qrel.setdefault("query", []).append(query_idx)
+                     qrel.setdefault("q0", []).append("q0")
+                     qrel.setdefault("docid", []).append(f"doc_{item_idx}")
+                     qrel.setdefault("rel", []).append(relevance)
+
+         # Fill run: rank documents by descending predicted score and keep
+         # only those with a score > 0.
+         for query_idx, pred in enumerate(predictions):
+             ranking = np.argsort(np.argsort(pred)[::-1])
+             for item_idx, score in enumerate(pred):
+                 if score > 0:
+                     run.setdefault("query", []).append(query_idx)
+                     run.setdefault("q0", []).append("q0")
+                     run.setdefault("docid", []).append(f"doc_{item_idx}")
+                     run.setdefault("score", []).append(score)
+                     run.setdefault("system", []).append("sys")
+                     run.setdefault("rank", []).append(ranking[item_idx])
+
+         return self.trec_eval.compute(references=[qrel], predictions=[run])
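
To make the conversion in _compute concrete, here is a hand-traced sketch of the qrel and run tables built for a single toy query; the values follow directly from the loops above, while the final metric values depend on the underlying trec_eval module and are not shown:

references = [[5.0, 0.0, 3.0]]   # doc_0 highly relevant, doc_1 irrelevant, doc_2 relevant
predictions = [[2.0, 0.0, 4.0]]  # the model scores doc_2 highest, then doc_0

# Only documents with relevance > 0 enter the qrel table.
qrel = {
    "query": [0, 0],
    "q0": ["q0", "q0"],
    "docid": ["doc_0", "doc_2"],
    "rel": [5.0, 3.0],
}

# Only documents with score > 0 enter the run table; "rank" is the 0-based
# position in descending score order (doc_2 first, then doc_0).
run = {
    "query": [0, 0],
    "q0": ["q0", "q0"],
    "docid": ["doc_0", "doc_2"],
    "score": [2.0, 4.0],
    "system": ["sys", "sys"],
    "rank": [1, 0],
}

# These per-query tables are what gets forwarded to the underlying metric:
# self.trec_eval.compute(references=[qrel], predictions=[run])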
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ trectools
+ git+https://github.com/huggingface/evaluate@a4bdc10c48a450b978d91389a48dbb5297835c7d