Spaces:
Running
Running
Muennighoff
committed on
Commit
·
6181979
1
Parent(s):
3be8255
Fix metric names & metadata new format
Browse files- EXTERNAL_MODEL_RESULTS.json +0 -0
- app.py +24 -18
- config.yaml +9 -9
EXTERNAL_MODEL_RESULTS.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
app.py
CHANGED
@@ -23,7 +23,15 @@ PRETTY_NAMES = {
|
|
23 |
"BitextMining": "Bitext Mining",
|
24 |
}
|
25 |
|
26 |
-
TASK_TO_METRIC = {k: v["metric"] for k, v in TASKS_CONFIG.items()}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
def make_clickable_model(model_name, link=None):
|
29 |
if link is None:
|
@@ -93,16 +101,16 @@ def add_task(examples):
|
|
93 |
examples["mteb_task"] = "Unknown"
|
94 |
return examples
|
95 |
|
96 |
-
def filter_metric_external(x, task,
|
97 |
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
98 |
if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
|
99 |
return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
|
100 |
else:
|
101 |
-
return x["mteb_task"] == task and x["metric"]
|
102 |
|
103 |
-
def filter_metric_fetched(name, metric,
|
104 |
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
105 |
-
return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric
|
106 |
|
107 |
if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
|
108 |
with open("EXTERNAL_MODEL_RESULTS.json") as f:
|
@@ -112,9 +120,9 @@ if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
|
|
112 |
for model in EXTERNAL_MODELS:
|
113 |
if model not in EXTERNAL_MODEL_RESULTS:
|
114 |
models_to_run.append(model)
|
115 |
-
EXTERNAL_MODEL_RESULTS[model] = {k: {v: []} for k, v in TASK_TO_METRIC.items()}
|
116 |
else:
|
117 |
-
EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
|
118 |
models_to_run = EXTERNAL_MODELS
|
119 |
|
120 |
pbar = tqdm(models_to_run, desc="Fetching external model results")
|
@@ -127,10 +135,11 @@ for model in pbar:
|
|
127 |
ds = ds.map(add_task)
|
128 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
129 |
|
130 |
-
for task,
|
131 |
-
ds_dict = ds.filter(lambda x: filter_metric_external(x, task,
|
132 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
133 |
-
|
|
|
134 |
|
135 |
# Save & cache EXTERNAL_MODEL_RESULTS
|
136 |
with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
|
@@ -204,9 +213,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
204 |
results_list = []
|
205 |
for task in tasks:
|
206 |
# Not all models have InstructionRetrieval, other new tasks
|
207 |
-
if task not in EXTERNAL_MODEL_RESULTS[model]:
|
208 |
-
|
209 |
-
results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]
|
210 |
|
211 |
if len(datasets) > 0:
|
212 |
res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
|
@@ -262,7 +270,8 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
262 |
# import pdb; pdb.set_trace()
|
263 |
try:
|
264 |
out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
|
265 |
-
except:
|
|
|
266 |
print("ERROR", model.modelId)
|
267 |
continue
|
268 |
out = {k: v for d in out for k, v in d.items()}
|
@@ -304,10 +313,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
304 |
if len(datasets) > 0:
|
305 |
# Update legacy column names to be merged with newer ones
|
306 |
# Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
|
307 |
-
#if ('MLSUMClusteringP2P (fr)' in datasets):
|
308 |
-
# import pdb; pdb.set_trace()
|
309 |
if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
|
310 |
-
#import pdb; pdb.set_trace()
|
311 |
df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
|
312 |
datasets.remove('MLSUMClusteringP2P')
|
313 |
if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
|
@@ -656,7 +662,7 @@ with gr.Blocks(css=css) as block:
|
|
656 |
gr.Markdown(f"""
|
657 |
{item['description']}
|
658 |
|
659 |
-
- **Metric:** {
|
660 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
661 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
662 |
""")
|
|
|
23 |
"BitextMining": "Bitext Mining",
|
24 |
}
|
25 |
|
26 |
+
TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
|
27 |
+
# Add legacy metric names
|
28 |
+
TASK_TO_METRIC["STS"].append("cos_sim_spearman")
|
29 |
+
TASK_TO_METRIC["STS"].append("cosine_spearman")
|
30 |
+
TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
|
31 |
+
TASK_TO_METRIC["Summarization"].append("cosine_spearman")
|
32 |
+
TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
|
33 |
+
TASK_TO_METRIC["PairClassification"].append("cosine_ap")
|
34 |
+
|
35 |
|
36 |
def make_clickable_model(model_name, link=None):
|
37 |
if link is None:
|
|
|
101 |
examples["mteb_task"] = "Unknown"
|
102 |
return examples
|
103 |
|
104 |
+
def filter_metric_external(x, task, metrics):
|
105 |
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
106 |
if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
|
107 |
return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
|
108 |
else:
|
109 |
+
return x["mteb_task"] == task and x["metric"] in metrics
|
110 |
|
111 |
+
def filter_metric_fetched(name, metric, expected_metrics):
|
112 |
# This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
|
113 |
+
return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric in expected_metrics
|
114 |
|
115 |
if os.path.exists("EXTERNAL_MODEL_RESULTS.json"):
|
116 |
with open("EXTERNAL_MODEL_RESULTS.json") as f:
|
|
|
120 |
for model in EXTERNAL_MODELS:
|
121 |
if model not in EXTERNAL_MODEL_RESULTS:
|
122 |
models_to_run.append(model)
|
123 |
+
EXTERNAL_MODEL_RESULTS[model] = {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()}
|
124 |
else:
|
125 |
+
EXTERNAL_MODEL_RESULTS = {model: {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
|
126 |
models_to_run = EXTERNAL_MODELS
|
127 |
|
128 |
pbar = tqdm(models_to_run, desc="Fetching external model results")
|
|
|
135 |
ds = ds.map(add_task)
|
136 |
base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
|
137 |
|
138 |
+
for task, metrics in TASK_TO_METRIC.items():
|
139 |
+
ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metrics))["test"].to_dict()
|
140 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
141 |
+
# metrics[0] is the main name for this metric; other names in the list are legacy for backward-compat
|
142 |
+
EXTERNAL_MODEL_RESULTS[model][task][metrics[0]].append({**base_dict, **ds_dict})
|
143 |
|
144 |
# Save & cache EXTERNAL_MODEL_RESULTS
|
145 |
with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
|
|
|
213 |
results_list = []
|
214 |
for task in tasks:
|
215 |
# Not all models have InstructionRetrieval, other new tasks
|
216 |
+
if task not in EXTERNAL_MODEL_RESULTS[model]: continue
|
217 |
+
results_list += EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task][0]]
|
|
|
218 |
|
219 |
if len(datasets) > 0:
|
220 |
res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
|
|
|
270 |
# import pdb; pdb.set_trace()
|
271 |
try:
|
272 |
out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
|
273 |
+
except Exception as e:
|
274 |
+
import pdb; pdb.set_trace()
|
275 |
print("ERROR", model.modelId)
|
276 |
continue
|
277 |
out = {k: v for d in out for k, v in d.items()}
|
|
|
313 |
if len(datasets) > 0:
|
314 |
# Update legacy column names to be merged with newer ones
|
315 |
# Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
|
|
|
|
|
316 |
if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
|
|
|
317 |
df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
|
318 |
datasets.remove('MLSUMClusteringP2P')
|
319 |
if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
|
|
|
662 |
gr.Markdown(f"""
|
663 |
{item['description']}
|
664 |
|
665 |
+
- **Metric:** {specific_metric}
|
666 |
- **Languages:** {item['language_long'] if 'language_long' in item else item['language']}
|
667 |
{"- **Credits:** " + item['credits'] if ("credits" in item and item["credits"] is not None) else ''}
|
668 |
""")
|
config.yaml
CHANGED
@@ -16,12 +16,12 @@ tasks:
|
|
16 |
Clustering:
|
17 |
icon: "✨"
|
18 |
metric: v_measure
|
19 |
-
metric_description: "Validity Measure (
|
20 |
task_description: "Clustering is the task of grouping similar documents together."
|
21 |
PairClassification:
|
22 |
icon: "π"
|
23 |
-
metric:
|
24 |
-
metric_description: "Average Precision based on
|
25 |
task_description: "Pair classification is the task of determining whether two texts are similar."
|
26 |
Reranking:
|
27 |
icon: "π₯"
|
@@ -31,22 +31,22 @@ tasks:
|
|
31 |
Retrieval:
|
32 |
icon: "π"
|
33 |
metric: ndcg_at_10
|
34 |
-
metric_description: "Normalized Discounted Cumulative Gain @
|
35 |
task_description: "Retrieval is the task of finding relevant documents for a query."
|
36 |
STS:
|
37 |
icon: "βοΈ"
|
38 |
-
metric:
|
39 |
-
metric_description: "Spearman correlation based on
|
40 |
task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
|
41 |
Summarization:
|
42 |
icon: "π"
|
43 |
-
metric:
|
44 |
-
metric_description: "Spearman correlation
|
45 |
task_description: "Summarization is the task of generating a summary of a text."
|
46 |
InstructionRetrieval:
|
47 |
icon: "ππ"
|
48 |
metric: "p-MRR"
|
49 |
-
metric_description: "paired mean reciprocal rank"
|
50 |
task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
|
51 |
boards:
|
52 |
en:
|
|
|
16 |
Clustering:
|
17 |
icon: "✨"
|
18 |
metric: v_measure
|
19 |
+
metric_description: "Validity Measure (V-measure)"
|
20 |
task_description: "Clustering is the task of grouping similar documents together."
|
21 |
PairClassification:
|
22 |
icon: "π"
|
23 |
+
metric: ap
|
24 |
+
metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
|
25 |
task_description: "Pair classification is the task of determining whether two texts are similar."
|
26 |
Reranking:
|
27 |
icon: "π₯"
|
|
|
31 |
Retrieval:
|
32 |
icon: "π"
|
33 |
metric: ndcg_at_10
|
34 |
+
metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)"
|
35 |
task_description: "Retrieval is the task of finding relevant documents for a query."
|
36 |
STS:
|
37 |
icon: "βοΈ"
|
38 |
+
metric: spearman
|
39 |
+
metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
|
40 |
task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
|
41 |
Summarization:
|
42 |
icon: "π"
|
43 |
+
metric: spearman
|
44 |
+
metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
|
45 |
task_description: "Summarization is the task of generating a summary of a text."
|
46 |
InstructionRetrieval:
|
47 |
icon: "ππ"
|
48 |
metric: "p-MRR"
|
49 |
+
metric_description: "paired mean reciprocal rank (p-MRR)"
|
50 |
task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
|
51 |
boards:
|
52 |
en:
|