# Spaces: Running
import gradio as gr
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.repocard import metadata_load
# Task types that have a leaderboard; each one is also a key of TASK_TO_METRIC.
TASKS = [
    "BitextMining",
    "Classification",
    "Clustering",
    "PairClassification",
    "Reranking",
    "Retrieval",
    "STS",
    "Summarization",
]
# Dataset names (as they appear as leaderboard column headers) that make up
# each per-task average of the English leaderboard.
TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification (en)",
    "AmazonPolarityClassification",
    "AmazonReviewsClassification (en)",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification (en)",
    "MassiveScenarioClassification (en)",
    "MTOPDomainClassification (en)",
    "MTOPIntentClassification (en)",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification",
]

TASK_LIST_CLUSTERING = [
    "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",
]

TASK_LIST_PAIR_CLASSIFICATION = [
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
]

TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    "MindSmallReranking",
    "SciDocsRR",
    "StackOverflowDupQuestions",
]

TASK_LIST_RETRIEVAL = [
    "ArguAna",
    "ClimateFEVER",
    "CQADupstackRetrieval",
    "DBPedia",
    "FEVER",
    "FiQA2018",
    "HotpotQA",
    "MSMARCO",
    "NFCorpus",
    "NQ",
    "QuoraRetrieval",
    "SCIDOCS",
    "SciFact",
    "Touche2020",
    "TRECCOVID",
]

TASK_LIST_STS = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17 (en-en)",
    "STS22 (en)",
    "STSBenchmark",
]

TASK_LIST_SUMMARIZATION = [
    "SummEval",
]

# Every dataset that counts towards the overall English average, in task order.
TASK_LIST_EN = (
    TASK_LIST_CLASSIFICATION
    + TASK_LIST_CLUSTERING
    + TASK_LIST_PAIR_CLASSIFICATION
    + TASK_LIST_RERANKING
    + TASK_LIST_RETRIEVAL
    + TASK_LIST_STS
    + TASK_LIST_SUMMARIZATION
)
# For each task type, the metric "type" whose value is shown on the leaderboard.
TASK_TO_METRIC = {
    "BitextMining": "f1",
    "Clustering": "v_measure",
    "Classification": "accuracy",
    "PairClassification": "cos_sim_ap",
    "Reranking": "map",
    "Retrieval": "ndcg_at_10",
    "STS": "cos_sim_spearman",
    "Summarization": "cos_sim_spearman",
}
def make_clickable_model(model_name):
    """Return an HTML anchor that links to the model's page on huggingface.co.

    Args:
        model_name: Hub model id, e.g. ``"user/model"`` or ``"model"``.

    Returns:
        An ``<a>`` tag (opening in a new tab) whose href is the full model id
        and whose text is the id with the leading ``user/`` part removed.
    """
    # Remove user from model name. Ids without a "/" have no user part, so
    # fall back to the full id instead of rendering a link with empty text.
    model_name_show = " ".join(model_name.split("/")[1:]) or model_name
    link = "https://huggingface.co/" + model_name
    return (
        f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name_show}</a>'
    )
def _model_index_results(meta):
    """Return the ``results`` list of the first ``model-index`` entry, or ``[]`` if it is absent or malformed."""
    try:
        results = meta["model-index"][0]["results"]
    except (KeyError, IndexError, TypeError):
        return []
    return results if isinstance(results, list) else []


def _select_scores(results, tasks, allowed_configs, task_to_metric):
    """Map dataset name -> rounded score for the results matching ``tasks`` and ``allowed_configs``."""
    scores = {}
    for res in results:
        if not isinstance(res, dict):
            continue
        task_type = (res.get("task") or {}).get("type", "")
        if task_type not in tasks:
            continue
        dataset = res.get("dataset") or {}
        if allowed_configs is not None and dataset.get("config", "default") not in allowed_configs:
            continue
        name = dataset.get("name")
        if not isinstance(name, str):
            continue
        wanted_metric = task_to_metric.get(task_type)
        # Take the first metric entry of the wanted type that has a numeric value.
        for metric in res.get("metrics") or []:
            if not isinstance(metric, dict) or metric.get("type") != wanted_metric:
                continue
            value = metric.get("value")
            if isinstance(value, (int, float)):
                scores[name.replace("MTEB ", "")] = round(value, 2)
                break
    return scores


def get_mteb_data(tasks=("Clustering",), langs=(), cast_to_str=True, task_to_metric=TASK_TO_METRIC):
    """Build a table of scores from the README metadata of all Hub models tagged ``mteb``.

    Args:
        tasks: Task type(s) to keep. A single string is accepted and treated
            as a one-element list.
        langs: Dataset configs to keep in addition to ``"default"``. When
            empty, results are not filtered by config.
        cast_to_str: If true, every cell of the returned frame is cast to ``str``.
        task_to_metric: Maps a task type to the metric type whose value is reported.

    Returns:
        A ``pandas.DataFrame`` with one row per model: a ``"Model"`` column
        (HTML link) first, followed by one column per dataset in sorted order.
        Missing scores are ``""``.
    """
    # Gradio state inputs may deliver a bare string; with a string on the right
    # hand side, `in` would do substring matching ("Classification" would match
    # "PairClassification" and "" would match everything).
    if isinstance(tasks, str):
        tasks = [tasks]
    if isinstance(langs, str):
        langs = [langs]
    allowed_configs = ("default", *langs) if langs else None

    api = HfApi()
    models = api.list_models(filter="mteb")
    df_list = []
    for model in models:
        readme_path = hf_hub_download(model.modelId, filename="README.md")
        meta = metadata_load(readme_path)
        # meta['model-index'][0]["results"] is list of elements like:
        # {
        #     "task": {"type": "Classification"},
        #     "dataset": {
        #         "type": "mteb/amazon_massive_intent",
        #         "name": "MTEB MassiveIntentClassification (nb)",
        #         "config": "nb",
        #         "split": "test",
        #     },
        #     "metrics": [
        #         {"type": "accuracy", "value": 39.81506388702084},
        #         {"type": "f1", "value": 38.809586587791664},
        #     ],
        # },
        # Incompatible metadata is skipped instead of erroring out; a model
        # without usable results still gets a row with empty scores.
        out = _select_scores(_model_index_results(meta), tasks, allowed_configs, task_to_metric)
        out["Model"] = make_clickable_model(model.modelId)
        df_list.append(out)

    df = pd.DataFrame(df_list)
    if "Model" not in df.columns:
        # No model was found at all: return an empty table rather than failing.
        df = pd.DataFrame(columns=["Model"])
    # Put 'Model' column first
    cols = ["Model"] + sorted(col for col in df.columns if col != "Model")
    df = df[cols].fillna("")
    if cast_to_str:
        return df.astype(str)  # Cast to str as Gradio does not accept floats
    return df
def get_mteb_average(get_all_avgs=False):
    """Rebuild the English leaderboard tables and return the overall one.

    Calls ``get_mteb_data`` for the seven English task types, computes the
    overall and per-task averages, ranks models by the overall average and
    stores the results in the module-level ``DATA_*`` globals.

    Args:
        get_all_avgs: Currently unused; kept so existing callers keep working.

    Returns:
        The overall table: ``Rank``, ``Model`` and the eight average columns,
        all cells as strings. An average is empty when the model lacks a score
        for any of the datasets it covers.
    """
    global DATA_OVERALL, DATA_CLASSIFICATION_EN, DATA_CLUSTERING, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_SUMMARIZATION

    raw = get_mteb_data(
        tasks=[
            "Classification",
            "Clustering",
            "PairClassification",
            "Reranking",
            "Retrieval",
            "STS",
            "Summarization",
        ],
        langs=["en", "en-en"],
        cast_to_str=False,
    )

    # (column title, datasets averaged) in the order the columns are displayed.
    overall_title = f"Average ({len(TASK_LIST_EN)} datasets)"
    averages = [
        (overall_title, TASK_LIST_EN),
        (f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", TASK_LIST_CLASSIFICATION),
        (f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", TASK_LIST_CLUSTERING),
        (f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", TASK_LIST_PAIR_CLASSIFICATION),
        (f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", TASK_LIST_RERANKING),
        (f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", TASK_LIST_RETRIEVAL),
        (f"STS Average ({len(TASK_LIST_STS)} datasets)", TASK_LIST_STS),
        (f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)", TASK_LIST_SUMMARIZATION),
    ]

    # get_mteb_data fills missing scores with "", which leaves those columns as
    # object dtype. Convert back to numbers (gaps become NaN) before averaging,
    # and add any dataset column no model reported so the lookups cannot fail.
    scores = raw.reindex(columns=TASK_LIST_EN).apply(pd.to_numeric, errors="coerce")
    table = pd.concat([raw.reindex(columns=["Model"]), scores], axis=1)

    # skipna=False: a model only gets an average if it has every dataset.
    for position, (title, datasets) in enumerate(averages, start=1):
        table.insert(position, title, scores[datasets].mean(axis=1, skipna=False))

    table.sort_values(overall_title, ascending=False, inplace=True)
    # Start ranking from 1
    table.insert(0, "Rank", list(range(1, len(table) + 1)))

    # Render as strings for Gradio, showing missing values as empty cells.
    rounded = table.round(2)
    table = rounded.astype(str).where(rounded.notna(), "")

    DATA_CLASSIFICATION_EN = table[["Model"] + TASK_LIST_CLASSIFICATION]
    DATA_CLUSTERING = table[["Model"] + TASK_LIST_CLUSTERING]
    DATA_PAIR_CLASSIFICATION = table[["Model"] + TASK_LIST_PAIR_CLASSIFICATION]
    DATA_RERANKING = table[["Model"] + TASK_LIST_RERANKING]
    DATA_RETRIEVAL = table[["Model"] + TASK_LIST_RETRIEVAL]
    DATA_STS_EN = table[["Model"] + TASK_LIST_STS]
    DATA_SUMMARIZATION = table[["Model"] + TASK_LIST_SUMMARIZATION]

    DATA_OVERALL = table[["Rank", "Model"] + [title for title, _ in averages]]
    return DATA_OVERALL
# Populate the module-level DATA_* tables once at start-up; the components
# below use them as their initial values.
get_mteb_average()

block = gr.Blocks()

with block:
    gr.Markdown(f"""
    Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗
    - **Total Scores**: TODO
    - **Total Models**: {len(DATA_OVERALL)}
    - **Total Users**: TODO
    """)
    with gr.Tabs():
        with gr.TabItem("Overall"):
            with gr.Row():
                gr.Markdown("""
                **Overall MTEB English leaderboard 🔮**
                - **Metric:** Various, refer to task tabs
                - **Languages:** English, refer to task tabs for others
                """)
            with gr.Row():
                data_overall = gr.components.Dataframe(
                    DATA_OVERALL,
                    datatype=["markdown"] * len(DATA_OVERALL.columns) * 2,
                    type="pandas",
                    wrap=True,
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
        with gr.TabItem("BitextMining"):
            with gr.Row():
                # The reported metric is TASK_TO_METRIC["BitextMining"], i.e. f1.
                gr.Markdown("""
                **Bitext Mining Leaderboard 🎌**
                - **Metric:** F1 (f1)
                - **Languages:** 117
                """)
            with gr.Row():
                data_bitext_mining = gr.components.Dataframe(
                    datatype=["markdown"] * 500,  # hack when we don't know how many columns
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                # Task values are lists so that get_mteb_data's `in tasks`
                # check is an exact match rather than a substring test.
                task_bitext_mining = gr.Variable(value=["BitextMining"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_bitext_mining],
                    outputs=data_bitext_mining,
                )
        with gr.TabItem("Classification"):
            with gr.TabItem("English"):
                with gr.Row():
                    gr.Markdown("""
                    **Classification Leaderboard ❤️**
                    - **Metric:** Accuracy (accuracy)
                    - **Languages:** English
                    """)
                with gr.Row():
                    data_classification_en = gr.components.Dataframe(
                        DATA_CLASSIFICATION_EN,
                        datatype=["markdown"] * len(DATA_CLASSIFICATION_EN.columns) * 20,
                        type="pandas",
                    )
                with gr.Row():
                    data_run_classification_en = gr.Button("Refresh")
                    task_classification_en = gr.Variable(value=["Classification"])
                    lang_classification_en = gr.Variable(value=["en"])
                    data_run_classification_en.click(
                        get_mteb_data,
                        inputs=[
                            task_classification_en,
                            lang_classification_en,
                        ],
                        outputs=data_classification_en,
                    )
            with gr.TabItem("Multilingual"):
                with gr.Row():
                    gr.Markdown("""
                    **Classification Multilingual Leaderboard 💜💚💙**
                    - **Metric:** Accuracy (accuracy)
                    - **Languages:** 51
                    """)
                with gr.Row():
                    data_classification = gr.components.Dataframe(
                        datatype=["markdown"] * 500,  # hack when we don't know how many columns
                        type="pandas",
                    )
                with gr.Row():
                    data_run = gr.Button("Refresh")
                    task_classification = gr.Variable(value=["Classification"])
                    data_run.click(
                        get_mteb_data,
                        inputs=[task_classification],
                        outputs=data_classification,
                    )
        with gr.TabItem("Clustering"):
            with gr.Row():
                gr.Markdown("""
                **Clustering Leaderboard ✨**
                - **Metric:** Validity Measure (v_measure)
                - **Languages:** English
                """)
            with gr.Row():
                data_clustering = gr.components.Dataframe(
                    DATA_CLUSTERING,
                    datatype="markdown",
                    type="pandas",
                    col_count=(len(DATA_CLUSTERING.columns), "fixed"),
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_clustering = gr.Variable(value=["Clustering"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_clustering],
                    outputs=data_clustering,
                )
        with gr.TabItem("Pair Classification"):
            with gr.Row():
                gr.Markdown("""
                **Pair Classification Leaderboard 🎭**
                - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
                - **Languages:** English
                """)
            with gr.Row():
                data_pair_classification = gr.components.Dataframe(
                    DATA_PAIR_CLASSIFICATION,
                    datatype="markdown",
                    type="pandas",
                    col_count=(len(DATA_PAIR_CLASSIFICATION.columns), "fixed"),
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_pair_classification = gr.Variable(value=["PairClassification"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_pair_classification],
                    outputs=data_pair_classification,
                )
        with gr.TabItem("Retrieval"):
            with gr.Row():
                gr.Markdown("""
                **Retrieval Leaderboard 🔎**
                - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
                - **Languages:** English
                """)
            with gr.Row():
                data_retrieval = gr.components.Dataframe(
                    DATA_RETRIEVAL,
                    datatype=["markdown"] * len(DATA_RETRIEVAL.columns) * 2,
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_retrieval = gr.Variable(value=["Retrieval"])
                data_run.click(
                    get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
                )
        with gr.TabItem("Reranking"):
            with gr.Row():
                gr.Markdown("""
                **Reranking Leaderboard 🥇**
                - **Metric:** Mean Average Precision (MAP)
                - **Languages:** English
                """)
            with gr.Row():
                data_reranking = gr.components.Dataframe(
                    DATA_RERANKING,
                    datatype="markdown",
                    type="pandas",
                    col_count=(len(DATA_RERANKING.columns), "fixed"),
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_reranking = gr.Variable(value=["Reranking"])
                data_run.click(
                    get_mteb_data, inputs=[task_reranking], outputs=data_reranking
                )
        with gr.TabItem("STS"):
            with gr.TabItem("English"):
                with gr.Row():
                    gr.Markdown("""
                    **STS Leaderboard 🤖**
                    - **Metric:** Spearman correlation based on cosine similarity
                    - **Languages:** English
                    """)
                with gr.Row():
                    data_sts_en = gr.components.Dataframe(
                        DATA_STS_EN,
                        datatype="markdown",
                        type="pandas",
                        col_count=(len(DATA_STS_EN.columns), "fixed"),
                    )
                with gr.Row():
                    data_run_en = gr.Button("Refresh")
                    task_sts_en = gr.Variable(value=["STS"])
                    lang_sts_en = gr.Variable(value=["en", "en-en"])
                    # Wire this tab's own button; `data_run` at this point still
                    # refers to the Reranking tab's button.
                    data_run_en.click(
                        get_mteb_data,
                        inputs=[task_sts_en, lang_sts_en],
                        outputs=data_sts_en,
                    )
            with gr.TabItem("Multilingual"):
                with gr.Row():
                    gr.Markdown("""
                    **STS Multilingual Leaderboard 👽**
                    - **Metric:** Spearman correlation based on cosine similarity
                    - **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish
                    """)
                with gr.Row():
                    data_sts = gr.components.Dataframe(
                        datatype=["markdown"] * 50,  # hack when we don't know how many columns
                        type="pandas",
                    )
                with gr.Row():
                    data_run = gr.Button("Refresh")
                    task_sts = gr.Variable(value=["STS"])
                    data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
        with gr.TabItem("Summarization"):
            with gr.Row():
                gr.Markdown("""
                **Summarization Leaderboard 📜**
                - **Metric:** Spearman correlation based on cosine similarity
                - **Languages:** English
                """)
            with gr.Row():
                data_summarization = gr.components.Dataframe(
                    DATA_SUMMARIZATION,
                    datatype="markdown",
                    type="pandas",
                    col_count=(len(DATA_SUMMARIZATION.columns), "fixed"),
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_summarization = gr.Variable(value=["Summarization"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_summarization],
                    outputs=data_summarization,
                )
    # Running the function on page load in addition to when the button is clicked
    # This is optional - if deactivated, the data loaded at "build time" is shown, like for the Overall tab
    block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
    block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
    block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
    block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
    block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
    block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
    block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
    block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
    block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
    block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)

block.launch()
# Possible changes:
# Could check if tasks are valid (Currently users could just invent new tasks - similar for languages)
# Could make it load in the background without the Gradio logo closer to the Deep RL space
# Could add graphs / other visual content

# Sources:
# https://huggingface.co/spaces/gradio/leaderboard
# https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard
# https://getemoji.com/