Muennighoff committed on
Commit
2458a90
•
1 Parent(s): fa91720

Add new CLF, BTM leaderboards

Browse files
Files changed (1) hide show
  1. app.py +390 -146
app.py CHANGED
@@ -17,6 +17,9 @@ TASKS = [
17
  "Summarization",
18
  ]
19
 
 
 
 
20
  TASK_LIST_CLASSIFICATION = [
21
  "AmazonCounterfactualClassification (en)",
22
  "AmazonPolarityClassification",
@@ -34,6 +37,38 @@ TASK_LIST_CLASSIFICATION = [
34
 
35
  TASK_LIST_CLASSIFICATION_NORM = [x.replace(" (en)", "") for x in TASK_LIST_CLASSIFICATION]
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  TASK_LIST_CLUSTERING = [
38
  "ArxivClusteringP2P",
39
  "ArxivClusteringS2S",
@@ -48,6 +83,7 @@ TASK_LIST_CLUSTERING = [
48
  "TwentyNewsgroupsClustering",
49
  ]
50
 
 
51
  TASK_LIST_CLUSTERING_DE = [
52
  "BlurbsClusteringP2P",
53
  "BlurbsClusteringS2S",
@@ -86,7 +122,8 @@ TASK_LIST_RETRIEVAL = [
86
  "TRECCOVID",
87
  ]
88
 
89
- TASK_LIST_RETRIEVAL_NORM = TASK_LIST_RETRIEVAL + ["CQADupstackAndroidRetrieval",
 
90
  "CQADupstackEnglishRetrieval",
91
  "CQADupstackGamingRetrieval",
92
  "CQADupstackGisRetrieval",
@@ -124,7 +161,6 @@ TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_
124
  TASK_TO_METRIC = {
125
  "BitextMining": "f1",
126
  "Clustering": "v_measure",
127
- "Clustering (DE)": "v_measure",
128
  "Classification": "accuracy",
129
  "PairClassification": "cos_sim_ap",
130
  "Reranking": "map",
@@ -143,16 +179,23 @@ def make_clickable_model(model_name, link=None):
143
 
144
  # Models without metadata, thus we cannot fetch their results naturally
145
  EXTERNAL_MODELS = [
146
- "LASER2",
147
- "LaBSE",
148
  "all-MiniLM-L12-v2",
149
  "all-MiniLM-L6-v2",
150
  "all-mpnet-base-v2",
151
  "allenai-specter",
 
152
  "bert-base-uncased",
153
  "contriever-base-msmarco",
154
  "cross-en-de-roberta-sentence-transformer",
 
 
155
  "distiluse-base-multilingual-cased-v2",
 
 
 
 
 
 
156
  "gbert-base",
157
  "gbert-large",
158
  "gelectra-base",
@@ -164,9 +207,19 @@ EXTERNAL_MODELS = [
164
  "gtr-t5-xl",
165
  "gtr-t5-xxl",
166
  "komninos",
 
 
167
  "msmarco-bert-co-condensor",
 
 
 
 
 
 
 
168
  "paraphrase-multilingual-MiniLM-L12-v2",
169
  "paraphrase-multilingual-mpnet-base-v2",
 
170
  "sentence-t5-base",
171
  "sentence-t5-large",
172
  "sentence-t5-xl",
@@ -184,20 +237,58 @@ EXTERNAL_MODELS = [
184
  "text-search-davinci-001",
185
  "unsup-simcse-bert-base-uncased",
186
  "use-cmlm-multilingual",
 
187
  "xlm-roberta-large",
188
  ]
189
 
190
  EXTERNAL_MODEL_TO_LINK = {
191
- "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large",
192
- "use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
 
 
 
 
 
 
193
  "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
 
194
  "distiluse-base-multilingual-cased-v2": "https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2",
 
 
 
 
 
 
 
195
  "gbert-base": "https://huggingface.co/deepset/gbert-base",
196
  "gbert-large": "https://huggingface.co/deepset/gbert-large",
197
  "gelectra-base": "https://huggingface.co/deepset/gelectra-base",
198
  "gelectra-large": "https://huggingface.co/deepset/gelectra-large",
 
199
  "gottbert-base": "https://huggingface.co/uklfr/gottbert-base",
 
 
 
 
 
200
  "LASER2": "https://github.com/facebookresearch/LASER",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
202
  "text-similarity-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
203
  "text-similarity-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
@@ -209,173 +300,192 @@ EXTERNAL_MODEL_TO_LINK = {
209
  "text-search-curie-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
210
  "text-search-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
211
  "text-search-davinci-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
212
- "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
213
- "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
214
- "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
215
- "sentence-t5-large": "https://huggingface.co/sentence-transformers/sentence-t5-large",
216
- "sentence-t5-base": "https://huggingface.co/sentence-transformers/sentence-t5-base",
217
- "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl",
218
- "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl",
219
- "gtr-t5-large": "https://huggingface.co/sentence-transformers/gtr-t5-large",
220
- "gtr-t5-base": "https://huggingface.co/sentence-transformers/gtr-t5-base",
221
- "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl",
222
- "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl",
223
- "gtr-t5-large": "https://huggingface.co/sentence-transformers/gtr-t5-large",
224
- "gtr-t5-base": "https://huggingface.co/sentence-transformers/gtr-t5-base",
225
- "bert-base-uncased": "https://huggingface.co/bert-base-uncased",
226
- "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
227
- "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
228
  "unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
229
- "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
230
- "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
231
- "glove.6B.300d": "https://huggingface.co/sentence-transformers/average_word_embeddings_glove.6B.300d",
232
- "msmarco-bert-co-condensor": "https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor",
233
- "all-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2",
234
- "all-MiniLM-L6-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
235
- "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
236
- "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
237
- "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
238
- "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
239
  }
240
 
241
  EXTERNAL_MODEL_TO_DIM = {
242
- "xlm-roberta-large": 1024,
243
- "use-cmlm-multilingual": 768,
244
- "gottbert-base": 768,
245
- "cross-en-de-roberta-sentence-transformer": 768,
246
- "distiluse-base-multilingual-cased-v2": 512,
247
- "gbert-base": 768,
248
- "gbert-large": 1024,
249
- "gelectra-base": 768,
250
- "gelectra-large": 1024,
251
- "gottbert-base": 768,
252
-
253
- "LASER2": 1024,
254
- "LaBSE": 768,
255
  "all-MiniLM-L12-v2": 384,
256
  "all-MiniLM-L6-v2": 384,
257
  "all-mpnet-base-v2": 768,
258
- "allenai-specter": 768,
 
259
  "bert-base-uncased": 768,
260
  "contriever-base-msmarco": 768,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  "glove.6B.300d": 300,
 
262
  "gtr-t5-base": 768,
263
  "gtr-t5-large": 768,
264
  "gtr-t5-xl": 768,
265
  "gtr-t5-xxl": 768,
266
  "komninos": 300,
267
  "msmarco-bert-co-condensor": 768,
 
 
 
 
 
 
 
268
  "paraphrase-multilingual-MiniLM-L12-v2": 384,
269
  "paraphrase-multilingual-mpnet-base-v2": 768,
 
270
  "sentence-t5-base": 768,
271
  "sentence-t5-large": 768,
272
  "sentence-t5-xl": 768,
273
  "sentence-t5-xxl": 768,
274
  "sup-simcse-bert-base-uncased": 768,
275
-
 
276
  "text-embedding-ada-002": 1536,
277
-
278
  "text-similarity-ada-001": 1024,
279
  "text-similarity-babbage-001": 2048,
280
  "text-similarity-curie-001": 4096,
281
  "text-similarity-davinci-001": 12288,
282
-
283
  "text-search-ada-doc-001": 1024,
284
  "text-search-ada-query-001": 1024,
285
  "text-search-ada-001": 1024,
286
  "text-search-babbage-001": 2048,
287
  "text-search-curie-001": 4096,
288
- "text-search-davinci-001": 12288,
289
-
290
- "unsup-simcse-bert-base-uncased": 768,
291
  }
292
 
293
 
294
  EXTERNAL_MODEL_TO_SEQLEN = {
295
- "xlm-roberta-large": 514,
296
- "use-cmlm-multilingual": 512,
297
- "gottbert-base": 512,
298
- "cross-en-de-roberta-sentence-transformer": 514,
299
- "distiluse-base-multilingual-cased-v2": 512,
300
- "gbert-base": 512,
301
- "gbert-large": 512,
302
- "gelectra-base": 512,
303
- "gelectra-large": 512,
304
- "gottbert-base": 512,
305
-
306
- "LASER2": "N/A",
307
- "LaBSE": 512,
308
  "all-MiniLM-L12-v2": 512,
309
  "all-MiniLM-L6-v2": 512,
310
  "all-mpnet-base-v2": 514,
311
  "allenai-specter": 512,
 
312
  "bert-base-uncased": 512,
313
  "contriever-base-msmarco": 512,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  "glove.6B.300d": "N/A",
315
  "gtr-t5-base": 512,
316
  "gtr-t5-large": 512,
317
  "gtr-t5-xl": 512,
318
  "gtr-t5-xxl": 512,
319
  "komninos": "N/A",
 
 
320
  "msmarco-bert-co-condensor": 512,
 
 
 
 
 
 
 
321
  "paraphrase-multilingual-MiniLM-L12-v2": 512,
322
  "paraphrase-multilingual-mpnet-base-v2": 514,
 
323
  "sentence-t5-base": 512,
324
  "sentence-t5-large": 512,
325
  "sentence-t5-xl": 512,
326
  "sentence-t5-xxl": 512,
327
  "sup-simcse-bert-base-uncased": 512,
328
-
329
  "text-embedding-ada-002": 8191,
330
-
331
  "text-similarity-ada-001": 2046,
332
  "text-similarity-babbage-001": 2046,
333
  "text-similarity-curie-001": 2046,
334
  "text-similarity-davinci-001": 2046,
335
-
336
  "text-search-ada-doc-001": 2046,
337
  "text-search-ada-query-001": 2046,
338
  "text-search-ada-001": 2046,
339
  "text-search-babbage-001": 2046,
340
  "text-search-curie-001": 2046,
341
  "text-search-davinci-001": 2046,
342
-
343
  "unsup-simcse-bert-base-uncased": 512,
 
 
344
  }
345
 
346
  EXTERNAL_MODEL_TO_SIZE = {
347
- "gtr-t5-xxl": 9.73,
348
- "gtr-t5-xl": 2.48,
349
- "gtr-t5-large": 0.67,
350
- "gtr-t5-base": 0.22,
351
- "sentence-t5-xxl": 9.73,
352
- "sentence-t5-xl": 2.48,
353
- "sentence-t5-large": 0.67,
354
- "sentence-t5-base": 0.22,
355
- "all-mpnet-base-v2": 0.44,
356
  "all-MiniLM-L12-v2": 0.13,
357
  "all-MiniLM-L6-v2": 0.09,
358
- "contriever-base-msmarco": 0.44,
359
- "paraphrase-multilingual-mpnet-base-v2": 1.11,
360
- "paraphrase-multilingual-MiniLM-L12-v2": 0.47,
361
- "msmarco-bert-co-condensor": 0.44,
362
- "sup-simcse-bert-base-uncased": 0.44,
363
- "unsup-simcse-bert-base-uncased": 0.44,
364
- "LaBSE": 1.88,
365
- "komninos": 0.27,
366
- "glove.6B.300d": 0.48,
367
- "allenai-specter": 0.44,
368
- "bert-base-uncased": 0.44,
369
- "LASER2": 0.17,
370
  "cross-en-de-roberta-sentence-transformer": 1.11,
 
 
371
  "distiluse-base-multilingual-cased-v2": 0.54,
 
 
 
 
 
 
 
372
  "gbert-base": 0.44,
373
  "gbert-large": 1.35,
374
  "gelectra-base": 0.44,
375
  "gelectra-large": 1.34,
376
- "use-cmlm-multilingual": 1.89,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  "xlm-roberta-large": 2.24,
378
- "gottbert-base": 0.51
379
  }
380
 
381
  MODELS_TO_SKIP = {
@@ -413,7 +523,7 @@ def add_lang(examples):
413
 
414
  def add_task(examples):
415
  # Could be added to the dataset loading script instead
416
- if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM:
417
  examples["mteb_task"] = "Classification"
418
  elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE:
419
  examples["mteb_task"] = "Clustering"
@@ -547,6 +657,11 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
547
  out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
548
  df_list.append(out)
549
  df = pd.DataFrame(df_list)
 
 
 
 
 
550
  # Put 'Model' column first
551
  cols = sorted(list(df.columns))
552
  cols.insert(0, cols.pop(cols.index("Model")))
@@ -607,8 +722,12 @@ def get_mteb_average():
607
  return DATA_OVERALL
608
 
609
  get_mteb_average()
610
- DATA_BITEXT_MINING = get_mteb_data(["BitextMining"])
611
- DATA_CLASSIFICATION = get_mteb_data(["Classification"])
 
 
 
 
612
  DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
613
  DATA_STS = get_mteb_data(["STS"])
614
 
@@ -616,7 +735,7 @@ DATA_STS = get_mteb_data(["STS"])
616
  NUM_SCORES = 0
617
  DATASETS = []
618
  # LANGUAGES = []
619
- for d in [DATA_BITEXT_MINING, DATA_CLASSIFICATION, DATA_CLUSTERING, DATA_CLUSTERING_GERMAN, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS, DATA_SUMMARIZATION]:
620
  # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
621
  cols_to_ignore = 3 if "Average" in d.columns else 2
622
  # Count number of scores including only non-nan floats & excluding the rank column
@@ -634,7 +753,7 @@ with block:
634
  Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
635
 
636
  - **Total Datasets**: {NUM_DATASETS}
637
- - **Total Languages**: 112
638
  - **Total Scores**: {NUM_SCORES}
639
  - **Total Models**: {len(DATA_OVERALL)}
640
  """)
@@ -656,29 +775,61 @@ with block:
656
  )
657
  with gr.Row():
658
  data_run = gr.Button("Refresh")
659
- data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
660
  with gr.TabItem("Bitext Mining"):
661
- with gr.Row():
662
- gr.Markdown("""
663
- **Bitext Mining Leaderboard 🎌**
664
-
665
- - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
666
- - **Languages:** 117
667
- """)
668
- with gr.Row():
669
- data_bitext_mining = gr.components.Dataframe(
670
- DATA_BITEXT_MINING,
671
- datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING.columns),
672
- type="pandas",
673
- )
674
- with gr.Row():
675
- data_run = gr.Button("Refresh")
676
- task_bitext_mining = gr.Variable(value=["BitextMining"])
677
- data_run.click(
678
- get_mteb_data,
679
- inputs=[task_bitext_mining],
680
- outputs=data_bitext_mining,
681
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  with gr.TabItem("Classification"):
683
  with gr.TabItem("English"):
684
  with gr.Row():
@@ -706,28 +857,121 @@ with block:
706
  ],
707
  outputs=data_classification_en,
708
  )
709
- with gr.TabItem("Multilingual"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
  with gr.Row():
711
  gr.Markdown("""
712
- **Classification Multilingual Leaderboard 💜💚💙**
713
 
714
  - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
715
- - **Languages:** 51
716
  """)
717
  with gr.Row():
718
  data_classification = gr.components.Dataframe(
719
- DATA_CLASSIFICATION,
720
- datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION) * 10,
721
  type="pandas",
722
  )
723
  with gr.Row():
724
  data_run = gr.Button("Refresh")
725
  task_classification = gr.Variable(value=["Classification"])
 
 
726
  data_run.click(
727
  get_mteb_data,
728
- inputs=[task_classification],
 
 
 
 
729
  outputs=data_classification,
730
- )
731
  with gr.TabItem("Clustering"):
732
  with gr.TabItem("English"):
733
  with gr.Row():
@@ -756,7 +1000,7 @@ with block:
756
  with gr.TabItem("German"):
757
  with gr.Row():
758
  gr.Markdown("""
759
- **Clustering Leaderboard ✨🇩🇪**
760
 
761
  - **Metric:** Validity Measure (v_measure)
762
  - **Languages:** German
@@ -800,48 +1044,48 @@ with block:
800
  inputs=[task_pair_classification],
801
  outputs=data_pair_classification,
802
  )
803
- with gr.TabItem("Retrieval"):
804
  with gr.Row():
805
  gr.Markdown("""
806
- **Retrieval Leaderboard 🔎**
807
 
808
- - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
809
  - **Languages:** English
810
  """)
811
  with gr.Row():
812
- data_retrieval = gr.components.Dataframe(
813
- DATA_RETRIEVAL,
814
- # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
815
- datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
816
  type="pandas",
817
  )
818
  with gr.Row():
819
  data_run = gr.Button("Refresh")
820
- task_retrieval = gr.Variable(value=["Retrieval"])
 
821
  data_run.click(
822
- get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
823
  )
824
- with gr.TabItem("Reranking"):
825
  with gr.Row():
826
  gr.Markdown("""
827
- **Reranking Leaderboard 🥈**
828
 
829
- - **Metric:** Mean Average Precision (MAP)
830
  - **Languages:** English
831
  """)
832
  with gr.Row():
833
- data_reranking = gr.components.Dataframe(
834
- DATA_RERANKING,
835
- datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
 
836
  type="pandas",
837
  )
838
  with gr.Row():
839
  data_run = gr.Button("Refresh")
840
- task_reranking = gr.Variable(value=["Reranking"])
841
- metric_reranking = gr.Variable(value="map")
842
  data_run.click(
843
- get_mteb_data, inputs=[task_reranking], outputs=data_reranking
844
- )
845
  with gr.TabItem("STS"):
846
  with gr.TabItem("English"):
847
  with gr.Row():
 
17
  "Summarization",
18
  ]
19
 
20
+ TASK_LIST_BITEXT_MINING = ['BUCC (de-en)', 'BUCC (fr-en)', 'BUCC (ru-en)', 'BUCC (zh-en)', 'Tatoeba (afr-eng)', 'Tatoeba (amh-eng)', 'Tatoeba (ang-eng)', 'Tatoeba (ara-eng)', 'Tatoeba (arq-eng)', 'Tatoeba (arz-eng)', 'Tatoeba (ast-eng)', 'Tatoeba (awa-eng)', 'Tatoeba (aze-eng)', 'Tatoeba (bel-eng)', 'Tatoeba (ben-eng)', 'Tatoeba (ber-eng)', 'Tatoeba (bos-eng)', 'Tatoeba (bre-eng)', 'Tatoeba (bul-eng)', 'Tatoeba (cat-eng)', 'Tatoeba (cbk-eng)', 'Tatoeba (ceb-eng)', 'Tatoeba (ces-eng)', 'Tatoeba (cha-eng)', 'Tatoeba (cmn-eng)', 'Tatoeba (cor-eng)', 'Tatoeba (csb-eng)', 'Tatoeba (cym-eng)', 'Tatoeba (dan-eng)', 'Tatoeba (deu-eng)', 'Tatoeba (dsb-eng)', 'Tatoeba (dtp-eng)', 'Tatoeba (ell-eng)', 'Tatoeba (epo-eng)', 'Tatoeba (est-eng)', 'Tatoeba (eus-eng)', 'Tatoeba (fao-eng)', 'Tatoeba (fin-eng)', 'Tatoeba (fra-eng)', 'Tatoeba (fry-eng)', 'Tatoeba (gla-eng)', 'Tatoeba (gle-eng)', 'Tatoeba (glg-eng)', 'Tatoeba (gsw-eng)', 'Tatoeba (heb-eng)', 'Tatoeba (hin-eng)', 'Tatoeba (hrv-eng)', 'Tatoeba (hsb-eng)', 'Tatoeba (hun-eng)', 'Tatoeba (hye-eng)', 'Tatoeba (ido-eng)', 'Tatoeba (ile-eng)', 'Tatoeba (ina-eng)', 'Tatoeba (ind-eng)', 'Tatoeba (isl-eng)', 'Tatoeba (ita-eng)', 'Tatoeba (jav-eng)', 'Tatoeba (jpn-eng)', 'Tatoeba (kab-eng)', 'Tatoeba (kat-eng)', 'Tatoeba (kaz-eng)', 'Tatoeba (khm-eng)', 'Tatoeba (kor-eng)', 'Tatoeba (kur-eng)', 'Tatoeba (kzj-eng)', 'Tatoeba (lat-eng)', 'Tatoeba (lfn-eng)', 'Tatoeba (lit-eng)', 'Tatoeba (lvs-eng)', 'Tatoeba (mal-eng)', 'Tatoeba (mar-eng)', 'Tatoeba (max-eng)', 'Tatoeba (mhr-eng)', 'Tatoeba (mkd-eng)', 'Tatoeba (mon-eng)', 'Tatoeba (nds-eng)', 'Tatoeba (nld-eng)', 'Tatoeba (nno-eng)', 'Tatoeba (nob-eng)', 'Tatoeba (nov-eng)', 'Tatoeba (oci-eng)', 'Tatoeba (orv-eng)', 'Tatoeba (pam-eng)', 'Tatoeba (pes-eng)', 'Tatoeba (pms-eng)', 'Tatoeba (pol-eng)', 'Tatoeba (por-eng)', 'Tatoeba (ron-eng)', 'Tatoeba (rus-eng)', 'Tatoeba (slk-eng)', 'Tatoeba (slv-eng)', 'Tatoeba (spa-eng)', 'Tatoeba (sqi-eng)', 'Tatoeba (srp-eng)', 'Tatoeba 
(swe-eng)', 'Tatoeba (swg-eng)', 'Tatoeba (swh-eng)', 'Tatoeba (tam-eng)', 'Tatoeba (tat-eng)', 'Tatoeba (tel-eng)', 'Tatoeba (tgl-eng)', 'Tatoeba (tha-eng)', 'Tatoeba (tuk-eng)', 'Tatoeba (tur-eng)', 'Tatoeba (tzl-eng)', 'Tatoeba (uig-eng)', 'Tatoeba (ukr-eng)', 'Tatoeba (urd-eng)', 'Tatoeba (uzb-eng)', 'Tatoeba (vie-eng)', 'Tatoeba (war-eng)', 'Tatoeba (wuu-eng)', 'Tatoeba (xho-eng)', 'Tatoeba (yid-eng)', 'Tatoeba (yue-eng)', 'Tatoeba (zsm-eng)']
21
+ TASK_LIST_BITEXT_MINING_OTHER = ["BornholmBitextMining"]
22
+
23
  TASK_LIST_CLASSIFICATION = [
24
  "AmazonCounterfactualClassification (en)",
25
  "AmazonPolarityClassification",
 
37
 
38
  TASK_LIST_CLASSIFICATION_NORM = [x.replace(" (en)", "") for x in TASK_LIST_CLASSIFICATION]
39
 
40
+ TASK_LIST_CLASSIFICATION_DA = [
41
+ "AngryTweetsClassification",
42
+ "DanishPoliticalCommentsClassification",
43
+ "DKHateClassification",
44
+ "LccSentimentClassification",
45
+ "MassiveIntentClassification (da)",
46
+ "MassiveScenarioClassification (da)",
47
+ "NordicLangClassification",
48
+ "ScalaDaClassification",
49
+ ]
50
+
51
+ TASK_LIST_CLASSIFICATION_NB = [
52
+ "NoRecClassification",
53
+ "NordicLangClassification",
54
+ "NorwegianParliament",
55
+ "MassiveIntentClassification (nb)",
56
+ "MassiveScenarioClassification (nb)",
57
+ "ScalaNbClassification (nb)",
58
+ ]
59
+
60
+ TASK_LIST_CLASSIFICATION_SV = [
61
+ "DalajClassification",
62
+ "MassiveIntentClassification (sv)",
63
+ "MassiveScenarioClassification (sv)",
64
+ "NordicLangClassification",
65
+ "ScalaNbClassification",
66
+ "ScalaSvClassification",
67
+ "SweRecClassification",
68
+ ]
69
+
70
+ TASK_LIST_CLASSIFICATION_OTHER = ['AmazonCounterfactualClassification (de)', 'AmazonCounterfactualClassification (ja)', 'AmazonReviewsClassification (de)', 'AmazonReviewsClassification (es)', 'AmazonReviewsClassification (fr)', 'AmazonReviewsClassification (ja)', 'AmazonReviewsClassification (zh)', 'MTOPDomainClassification (de)', 'MTOPDomainClassification (es)', 'MTOPDomainClassification (fr)', 'MTOPDomainClassification (hi)', 'MTOPDomainClassification (th)', 'MTOPIntentClassification (de)', 'MTOPIntentClassification (es)', 'MTOPIntentClassification (fr)', 'MTOPIntentClassification (hi)', 'MTOPIntentClassification (th)', 'MassiveIntentClassification (af)', 'MassiveIntentClassification (am)', 'MassiveIntentClassification (ar)', 'MassiveIntentClassification (az)', 'MassiveIntentClassification (bn)', 'MassiveIntentClassification (cy)', 'MassiveIntentClassification (de)', 'MassiveIntentClassification (el)', 'MassiveIntentClassification (es)', 'MassiveIntentClassification (fa)', 'MassiveIntentClassification (fi)', 'MassiveIntentClassification (fr)', 'MassiveIntentClassification (he)', 'MassiveIntentClassification (hi)', 'MassiveIntentClassification (hu)', 'MassiveIntentClassification (hy)', 'MassiveIntentClassification (id)', 'MassiveIntentClassification (is)', 'MassiveIntentClassification (it)', 'MassiveIntentClassification (ja)', 'MassiveIntentClassification (jv)', 'MassiveIntentClassification (ka)', 'MassiveIntentClassification (km)', 'MassiveIntentClassification (kn)', 'MassiveIntentClassification (ko)', 'MassiveIntentClassification (lv)', 'MassiveIntentClassification (ml)', 'MassiveIntentClassification (mn)', 'MassiveIntentClassification (ms)', 'MassiveIntentClassification (my)', 'MassiveIntentClassification (nl)', 'MassiveIntentClassification (pl)', 'MassiveIntentClassification (pt)', 'MassiveIntentClassification (ro)', 'MassiveIntentClassification (ru)', 'MassiveIntentClassification (sl)', 'MassiveIntentClassification (sq)', 'MassiveIntentClassification (sw)', 
'MassiveIntentClassification (ta)', 'MassiveIntentClassification (te)', 'MassiveIntentClassification (th)', 'MassiveIntentClassification (tl)', 'MassiveIntentClassification (tr)', 'MassiveIntentClassification (ur)', 'MassiveIntentClassification (vi)', 'MassiveIntentClassification (zh-CN)', 'MassiveIntentClassification (zh-TW)', 'MassiveScenarioClassification (af)', 'MassiveScenarioClassification (am)', 'MassiveScenarioClassification (ar)', 'MassiveScenarioClassification (az)', 'MassiveScenarioClassification (bn)', 'MassiveScenarioClassification (cy)', 'MassiveScenarioClassification (de)', 'MassiveScenarioClassification (el)', 'MassiveScenarioClassification (es)', 'MassiveScenarioClassification (fa)', 'MassiveScenarioClassification (fi)', 'MassiveScenarioClassification (fr)', 'MassiveScenarioClassification (he)', 'MassiveScenarioClassification (hi)', 'MassiveScenarioClassification (hu)', 'MassiveScenarioClassification (hy)', 'MassiveScenarioClassification (id)', 'MassiveScenarioClassification (is)', 'MassiveScenarioClassification (it)', 'MassiveScenarioClassification (ja)', 'MassiveScenarioClassification (jv)', 'MassiveScenarioClassification (ka)', 'MassiveScenarioClassification (km)', 'MassiveScenarioClassification (kn)', 'MassiveScenarioClassification (ko)', 'MassiveScenarioClassification (lv)', 'MassiveScenarioClassification (ml)', 'MassiveScenarioClassification (mn)', 'MassiveScenarioClassification (ms)', 'MassiveScenarioClassification (my)', 'MassiveScenarioClassification (nl)', 'MassiveScenarioClassification (pl)', 'MassiveScenarioClassification (pt)', 'MassiveScenarioClassification (ro)', 'MassiveScenarioClassification (ru)', 'MassiveScenarioClassification (sl)', 'MassiveScenarioClassification (sq)', 'MassiveScenarioClassification (sw)', 'MassiveScenarioClassification (ta)', 'MassiveScenarioClassification (te)', 'MassiveScenarioClassification (th)', 'MassiveScenarioClassification (tl)', 'MassiveScenarioClassification (tr)', 'MassiveScenarioClassification 
(ur)', 'MassiveScenarioClassification (vi)', 'MassiveScenarioClassification (zh-CN)', 'MassiveScenarioClassification (zh-TW)']
71
+
72
  TASK_LIST_CLUSTERING = [
73
  "ArxivClusteringP2P",
74
  "ArxivClusteringS2S",
 
83
  "TwentyNewsgroupsClustering",
84
  ]
85
 
86
+
87
  TASK_LIST_CLUSTERING_DE = [
88
  "BlurbsClusteringP2P",
89
  "BlurbsClusteringS2S",
 
122
  "TRECCOVID",
123
  ]
124
 
125
+ TASK_LIST_RETRIEVAL_NORM = TASK_LIST_RETRIEVAL + [
126
+ "CQADupstackAndroidRetrieval",
127
  "CQADupstackEnglishRetrieval",
128
  "CQADupstackGamingRetrieval",
129
  "CQADupstackGisRetrieval",
 
161
  TASK_TO_METRIC = {
162
  "BitextMining": "f1",
163
  "Clustering": "v_measure",
 
164
  "Classification": "accuracy",
165
  "PairClassification": "cos_sim_ap",
166
  "Reranking": "map",
 
179
 
180
  # Models without metadata, thus we cannot fetch their results naturally
181
  EXTERNAL_MODELS = [
 
 
182
  "all-MiniLM-L12-v2",
183
  "all-MiniLM-L6-v2",
184
  "all-mpnet-base-v2",
185
  "allenai-specter",
186
+ "bert-base-swedish-cased",
187
  "bert-base-uncased",
188
  "contriever-base-msmarco",
189
  "cross-en-de-roberta-sentence-transformer",
190
+ "dfm-encoder-large-v1",
191
+ "dfm-sentence-encoder-large-1",
192
  "distiluse-base-multilingual-cased-v2",
193
+ "DanskBERT",
194
+ "e5-base",
195
+ "e5-large",
196
+ "e5-small",
197
+ "electra-small-nordic",
198
+ "electra-small-swedish-cased-discriminator",
199
  "gbert-base",
200
  "gbert-large",
201
  "gelectra-base",
 
207
  "gtr-t5-xl",
208
  "gtr-t5-xxl",
209
  "komninos",
210
+ "LASER2",
211
+ "LaBSE",
212
  "msmarco-bert-co-condensor",
213
+ "multilingual-e5-base",
214
+ "multilingual-e5-large",
215
+ "multilingual-e5-small",
216
+ "nb-bert-base",
217
+ "nb-bert-large",
218
+ "norbert3-base",
219
+ "norbert3-large",
220
  "paraphrase-multilingual-MiniLM-L12-v2",
221
  "paraphrase-multilingual-mpnet-base-v2",
222
+ "sentence-bert-swedish-cased",
223
  "sentence-t5-base",
224
  "sentence-t5-large",
225
  "sentence-t5-xl",
 
237
  "text-search-davinci-001",
238
  "unsup-simcse-bert-base-uncased",
239
  "use-cmlm-multilingual",
240
+ "xlm-roberta-base",
241
  "xlm-roberta-large",
242
  ]
243
 
244
  EXTERNAL_MODEL_TO_LINK = {
245
+ "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
246
+ "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
247
+ "all-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2",
248
+ "all-MiniLM-L6-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
249
+ "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
250
+ "bert-base-swedish-cased": "https://huggingface.co/KB/bert-base-swedish-cased",
251
+ "bert-base-uncased": "https://huggingface.co/bert-base-uncased",
252
+ "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
253
  "cross-en-de-roberta-sentence-transformer": "https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer",
254
+ "DanskBERT": "https://huggingface.co/vesteinn/DanskBERT",
255
  "distiluse-base-multilingual-cased-v2": "https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2",
256
+ "dfm-encoder-large-v1": "https://huggingface.co/chcaa/dfm-encoder-large-v1",
257
+ "dfm-sentence-encoder-large-1": "https://huggingface.co/chcaa/dfm-encoder-large-v1",
258
+ "e5-base": "https://huggingface.co/intfloat/e5-base",
259
+ "e5-large": "https://huggingface.co/intfloat/e5-large",
260
+ "e5-small": "https://huggingface.co/intfloat/e5-small",
261
+ "electra-small-nordic": "https://huggingface.co/jonfd/electra-small-nordic",
262
+ "electra-small-swedish-cased-discriminator": "https://huggingface.co/KBLab/electra-small-swedish-cased-discriminator",
263
  "gbert-base": "https://huggingface.co/deepset/gbert-base",
264
  "gbert-large": "https://huggingface.co/deepset/gbert-large",
265
  "gelectra-base": "https://huggingface.co/deepset/gelectra-base",
266
  "gelectra-large": "https://huggingface.co/deepset/gelectra-large",
267
+ "glove.6B.300d": "https://huggingface.co/sentence-transformers/average_word_embeddings_glove.6B.300d",
268
  "gottbert-base": "https://huggingface.co/uklfr/gottbert-base",
269
+ "gtr-t5-base": "https://huggingface.co/sentence-transformers/gtr-t5-base",
270
+ "gtr-t5-large": "https://huggingface.co/sentence-transformers/gtr-t5-large",
271
+ "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl",
272
+ "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl",
273
+ "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
274
  "LASER2": "https://github.com/facebookresearch/LASER",
275
+ "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
276
+ "msmarco-bert-co-condensor": "https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor",
277
+ "multilingual-e5-base": "https://huggingface.co/intfloat/multilingual-e5-base",
278
+ "multilingual-e5-large": "https://huggingface.co/intfloat/multilingual-e5-large",
279
+ "multilingual-e5-small": "https://huggingface.co/intfloat/multilingual-e5-small",
280
+ "nb-bert-base": "https://huggingface.co/NbAiLab/nb-bert-base",
281
+ "nb-bert-large": "https://huggingface.co/NbAiLab/nb-bert-large",
282
+ "norbert3-base": "https://huggingface.co/ltg/norbert3-base",
283
+ "norbert3-large": "https://huggingface.co/ltg/norbert3-large",
284
+ "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
285
+ "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
286
+ "sentence-bert-swedish-cased": "https://huggingface.co/KBLab/sentence-bert-swedish-cased",
287
+ "sentence-t5-base": "https://huggingface.co/sentence-transformers/sentence-t5-base",
288
+ "sentence-t5-large": "https://huggingface.co/sentence-transformers/sentence-t5-large",
289
+ "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
290
+ "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
291
+ "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
292
  "text-embedding-ada-002": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
293
  "text-similarity-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
294
  "text-similarity-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
 
300
  "text-search-curie-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
301
  "text-search-babbage-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
302
  "text-search-davinci-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  "unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
304
+ "use-cmlm-multilingual": "https://huggingface.co/sentence-transformers/use-cmlm-multilingual",
305
+ "xlm-roberta-base": "https://huggingface.co/xlm-roberta-base",
306
+ "xlm-roberta-large": "https://huggingface.co/xlm-roberta-large",
 
 
 
 
 
 
 
307
  }
308
 
309
  EXTERNAL_MODEL_TO_DIM = {
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  "all-MiniLM-L12-v2": 384,
311
  "all-MiniLM-L6-v2": 384,
312
  "all-mpnet-base-v2": 768,
313
+ "allenai-specter": 768,
314
+ "bert-base-swedish-cased": 768,
315
  "bert-base-uncased": 768,
316
  "contriever-base-msmarco": 768,
317
+ "cross-en-de-roberta-sentence-transformer": 768,
318
+ "DanskBERT": 768,
319
+ "distiluse-base-multilingual-cased-v2": 512,
320
+ "dfm-encoder-large-v1": 1024,
321
+ "dfm-sentence-encoder-large-1": 1024,
322
+ "e5-base": 768,
323
+ "e5-small": 384,
324
+ "e5-large": 1024,
325
+ "electra-small-nordic": 256,
326
+ "electra-small-swedish-cased-discriminator": 256,
327
+ "LASER2": 1024,
328
+ "LaBSE": 768,
329
+ "gbert-base": 768,
330
+ "gbert-large": 1024,
331
+ "gelectra-base": 768,
332
+ "gelectra-large": 1024,
333
  "glove.6B.300d": 300,
334
+ "gottbert-base": 768,
335
  "gtr-t5-base": 768,
336
  "gtr-t5-large": 768,
337
  "gtr-t5-xl": 768,
338
  "gtr-t5-xxl": 768,
339
  "komninos": 300,
340
  "msmarco-bert-co-condensor": 768,
341
+ "multilingual-e5-base": 768,
342
+ "multilingual-e5-small": 384,
343
+ "multilingual-e5-large": 1024,
344
+ "nb-bert-base": 768,
345
+ "nb-bert-large": 1024,
346
+ "norbert3-base": 768,
347
+ "norbert3-large": 1024,
348
  "paraphrase-multilingual-MiniLM-L12-v2": 384,
349
  "paraphrase-multilingual-mpnet-base-v2": 768,
350
+ "sentence-bert-swedish-cased": 768,
351
  "sentence-t5-base": 768,
352
  "sentence-t5-large": 768,
353
  "sentence-t5-xl": 768,
354
  "sentence-t5-xxl": 768,
355
  "sup-simcse-bert-base-uncased": 768,
356
+ "use-cmlm-multilingual": 768,
357
+ "unsup-simcse-bert-base-uncased": 768,
358
  "text-embedding-ada-002": 1536,
 
359
  "text-similarity-ada-001": 1024,
360
  "text-similarity-babbage-001": 2048,
361
  "text-similarity-curie-001": 4096,
362
  "text-similarity-davinci-001": 12288,
 
363
  "text-search-ada-doc-001": 1024,
364
  "text-search-ada-query-001": 1024,
365
  "text-search-ada-001": 1024,
366
  "text-search-babbage-001": 2048,
367
  "text-search-curie-001": 4096,
368
+ "text-search-davinci-001": 12288,
369
+ "xlm-roberta-base": 768,
370
+ "xlm-roberta-large": 1024,
371
  }
372
 
373
 
374
  EXTERNAL_MODEL_TO_SEQLEN = {
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  "all-MiniLM-L12-v2": 512,
376
  "all-MiniLM-L6-v2": 512,
377
  "all-mpnet-base-v2": 514,
378
  "allenai-specter": 512,
379
+ "bert-base-swedish-cased": 512,
380
  "bert-base-uncased": 512,
381
  "contriever-base-msmarco": 512,
382
+ "cross-en-de-roberta-sentence-transformer": 514,
383
+ "DanskBERT": 514,
384
+ "dfm-encoder-large-v1": 512,
385
+ "dfm-sentence-encoder-large-1": 512,
386
+ "distiluse-base-multilingual-cased-v2": 512,
387
+ "e5-base": 512,
388
+ "e5-large": 512,
389
+ "e5-small": 512,
390
+ "electra-small-nordic": 512,
391
+ "electra-small-swedish-cased-discriminator": 512,
392
+ "gbert-base": 512,
393
+ "gbert-large": 512,
394
+ "gelectra-base": 512,
395
+ "gelectra-large": 512,
396
+ "gottbert-base": 512,
397
  "glove.6B.300d": "N/A",
398
  "gtr-t5-base": 512,
399
  "gtr-t5-large": 512,
400
  "gtr-t5-xl": 512,
401
  "gtr-t5-xxl": 512,
402
  "komninos": "N/A",
403
+ "LASER2": "N/A",
404
+ "LaBSE": 512,
405
  "msmarco-bert-co-condensor": 512,
406
+ "multilingual-e5-base": 514,
407
+ "multilingual-e5-large": 514,
408
+ "multilingual-e5-small": 512,
409
+ "nb-bert-base": 512,
410
+ "nb-bert-large": 512,
411
+ "norbert3-base": 512,
412
+ "norbert3-large": 512,
413
  "paraphrase-multilingual-MiniLM-L12-v2": 512,
414
  "paraphrase-multilingual-mpnet-base-v2": 514,
415
+ "sentence-bert-swedish-cased": 512,
416
  "sentence-t5-base": 512,
417
  "sentence-t5-large": 512,
418
  "sentence-t5-xl": 512,
419
  "sentence-t5-xxl": 512,
420
  "sup-simcse-bert-base-uncased": 512,
 
421
  "text-embedding-ada-002": 8191,
 
422
  "text-similarity-ada-001": 2046,
423
  "text-similarity-babbage-001": 2046,
424
  "text-similarity-curie-001": 2046,
425
  "text-similarity-davinci-001": 2046,
 
426
  "text-search-ada-doc-001": 2046,
427
  "text-search-ada-query-001": 2046,
428
  "text-search-ada-001": 2046,
429
  "text-search-babbage-001": 2046,
430
  "text-search-curie-001": 2046,
431
  "text-search-davinci-001": 2046,
432
+ "use-cmlm-multilingual": 512,
433
  "unsup-simcse-bert-base-uncased": 512,
434
+ "xlm-roberta-base": 514,
435
+ "xlm-roberta-large": 514,
436
  }
437
 
438
  EXTERNAL_MODEL_TO_SIZE = {
439
+ "allenai-specter": 0.44,
 
 
 
 
 
 
 
 
440
  "all-MiniLM-L12-v2": 0.13,
441
  "all-MiniLM-L6-v2": 0.09,
442
+ "all-mpnet-base-v2": 0.44,
443
+ "bert-base-uncased": 0.44,
444
+ "bert-base-swedish-cased": 0.50,
 
 
 
 
 
 
 
 
 
445
  "cross-en-de-roberta-sentence-transformer": 1.11,
446
+ "contriever-base-msmarco": 0.44,
447
+ "DanskBERT": 0.50,
448
  "distiluse-base-multilingual-cased-v2": 0.54,
449
+ "dfm-encoder-large-v1": 1.42,
450
+ "dfm-sentence-encoder-large-1": 1.63,
451
+ "e5-base": 0.44,
452
+ "e5-small": 0.13,
453
+ "e5-large": 1.34,
454
+ "electra-small-nordic": 0.09,
455
+ "electra-small-swedish-cased-discriminator": 0.06,
456
  "gbert-base": 0.44,
457
  "gbert-large": 1.35,
458
  "gelectra-base": 0.44,
459
  "gelectra-large": 1.34,
460
+ "glove.6B.300d": 0.48,
461
+ "gottbert-base": 0.51,
462
+ "gtr-t5-base": 0.22,
463
+ "gtr-t5-large": 0.67,
464
+ "gtr-t5-xl": 2.48,
465
+ "gtr-t5-xxl": 9.73,
466
+ "komninos": 0.27,
467
+ "LASER2": 0.17,
468
+ "LaBSE": 1.88,
469
+ "msmarco-bert-co-condensor": 0.44,
470
+ "multilingual-e5-base": 1.11,
471
+ "multilingual-e5-small": 0.47,
472
+ "multilingual-e5-large": 2.24,
473
+ "nb-bert-base": 0.71,
474
+ "nb-bert-large": 1.42,
475
+ "norbert3-base": 0.52,
476
+ "norbert3-large": 1.47,
477
+ "paraphrase-multilingual-mpnet-base-v2": 1.11,
478
+ "paraphrase-multilingual-MiniLM-L12-v2": 0.47,
479
+ "sentence-bert-swedish-cased": 0.50,
480
+ "sentence-t5-base": 0.22,
481
+ "sentence-t5-large": 0.67,
482
+ "sentence-t5-xl": 2.48,
483
+ "sentence-t5-xxl": 9.73,
484
+ "sup-simcse-bert-base-uncased": 0.44,
485
+ "unsup-simcse-bert-base-uncased": 0.44,
486
+ "use-cmlm-multilingual": 1.89,
487
+ "xlm-roberta-base": 1.12,
488
  "xlm-roberta-large": 2.24,
 
489
  }
490
 
491
  MODELS_TO_SKIP = {
 
523
 
524
  def add_task(examples):
525
  # Could be added to the dataset loading script instead
526
+ if examples["mteb_dataset_name"] in TASK_LIST_CLASSIFICATION_NORM + TASK_LIST_CLASSIFICATION_DA + TASK_LIST_CLASSIFICATION_SV + TASK_LIST_CLASSIFICATION_NB:
527
  examples["mteb_task"] = "Classification"
528
  elif examples["mteb_dataset_name"] in TASK_LIST_CLUSTERING + TASK_LIST_CLUSTERING_DE:
529
  examples["mteb_task"] = "Clustering"
 
657
  out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
658
  df_list.append(out)
659
  df = pd.DataFrame(df_list)
660
+ # If there are any models that are the same, merge them
661
+ # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
662
+ # Save to csv
663
+ df.to_csv("mteb.csv", index=False)
664
+ df = df.groupby("Model", as_index=False).first()
665
  # Put 'Model' column first
666
  cols = sorted(list(df.columns))
667
  cols.insert(0, cols.pop(cols.index("Model")))
 
722
  return DATA_OVERALL
723
 
724
  get_mteb_average()
725
+ DATA_BITEXT_MINING = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING)
726
+ DATA_BITEXT_MINING_OTHER = get_mteb_data(["BitextMining"], [], TASK_LIST_BITEXT_MINING_OTHER)
727
+ DATA_CLASSIFICATION_DA = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_DA)
728
+ DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_NB)
729
+ DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
730
+ DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
731
  DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
732
  DATA_STS = get_mteb_data(["STS"])
733
 
 
735
  NUM_SCORES = 0
736
  DATASETS = []
737
  # LANGUAGES = []
738
+ for d in [DATA_BITEXT_MINING, DATA_BITEXT_MINING_OTHER, DATA_CLASSIFICATION_EN, DATA_CLASSIFICATION_DA, DATA_CLASSIFICATION_NB, DATA_CLASSIFICATION_SV, DATA_CLASSIFICATION_OTHER, DATA_CLUSTERING, DATA_CLUSTERING_GERMAN, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS, DATA_SUMMARIZATION]:
739
  # NUM_SCORES += d.iloc[:, 1:].apply(lambda x: sum([1 for y in x if isinstance(y, float) and not np.isnan(y)]), axis=1).sum()
740
  cols_to_ignore = 3 if "Average" in d.columns else 2
741
  # Count number of scores including only non-nan floats & excluding the rank column
 
753
  Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗 Refer to the [MTEB paper](https://arxiv.org/abs/2210.07316) for details on metrics, tasks and models.
754
 
755
  - **Total Datasets**: {NUM_DATASETS}
756
+ - **Total Languages**: 113
757
  - **Total Scores**: {NUM_SCORES}
758
  - **Total Models**: {len(DATA_OVERALL)}
759
  """)
 
775
  )
776
  with gr.Row():
777
  data_run = gr.Button("Refresh")
778
+ data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
779
  with gr.TabItem("Bitext Mining"):
780
+ with gr.TabItem("English-X"):
781
+ with gr.Row():
782
+ gr.Markdown("""
783
+ **Bitext Mining Leaderboard 🏴󠁧󠁢󠁳󠁣󠁴󠁿**
784
+
785
+ - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
786
+ - **Languages:** 117 (Pairs of: English & other language)
787
+ """)
788
+ with gr.Row():
789
+ data_bitext_mining = gr.components.Dataframe(
790
+ DATA_BITEXT_MINING,
791
+ datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING.columns),
792
+ type="pandas",
793
+ )
794
+ with gr.Row():
795
+ data_run = gr.Button("Refresh")
796
+ task_bitext_mining = gr.Variable(value=["BitextMining"])
797
+ lang_bitext_mining_other = gr.Variable(value=[])
798
+ datasets_bitext_mining_other = gr.Variable(value=TASK_LIST_BITEXT_MINING)
799
+ data_run.click(
800
+ get_mteb_data,
801
+ inputs=[task_bitext_mining, lang_bitext_mining_other, datasets_bitext_mining_other],
802
+ outputs=data_bitext_mining,
803
+ )
804
+ with gr.TabItem("Other"):
805
+ with gr.Row():
806
+ gr.Markdown("""
807
+ **Bitext Mining Other Leaderboard 🎌**
808
+
809
+ - **Metric:** [F1](https://huggingface.co/spaces/evaluate-metric/f1)
810
+ - **Languages:** 2 (Pair of: Danish & Bornholmsk)
811
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen)
812
+ """)
813
+ with gr.Row():
814
+ data_bitext_mining_other = gr.components.Dataframe(
815
+ DATA_BITEXT_MINING_OTHER,
816
+ datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
817
+ type="pandas",
818
+ )
819
+ with gr.Row():
820
+ data_run = gr.Button("Refresh")
821
+ task_bitext_mining_other = gr.Variable(value=["BitextMining"])
822
+ lang_bitext_mining_other = gr.Variable(value=[])
823
+ datasets_bitext_mining_other = gr.Variable(value=TASK_LIST_BITEXT_MINING_OTHER)
824
+ data_run.click(
825
+ get_mteb_data,
826
+ inputs=[
827
+ task_bitext_mining_other,
828
+ lang_bitext_mining_other,
829
+ datasets_bitext_mining_other,
830
+ ],
831
+ outputs=data_bitext_mining_other,
832
+ )
833
  with gr.TabItem("Classification"):
834
  with gr.TabItem("English"):
835
  with gr.Row():
 
857
  ],
858
  outputs=data_classification_en,
859
  )
860
+ with gr.TabItem("Danish"):
861
+ with gr.Row():
862
+ gr.Markdown("""
863
+ **Classification Danish Leaderboard 🤍🇩🇰**
864
+
865
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
866
+ - **Languages:** Danish
867
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen)
868
+ """)
869
+ with gr.Row():
870
+ data_classification_da = gr.components.Dataframe(
871
+ DATA_CLASSIFICATION_DA,
872
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_DA.columns),
873
+ type="pandas",
874
+ )
875
+ with gr.Row():
876
+ data_run_classification_da = gr.Button("Refresh")
877
+ task_classification_da = gr.Variable(value=["Classification"])
878
+ lang_classification_da = gr.Variable(value=[])
879
+ datasets_classification_da = gr.Variable(value=TASK_LIST_CLASSIFICATION_DA)
880
+ data_run_classification_da.click(
881
+ get_mteb_data,
882
+ inputs=[
883
+ task_classification_da,
884
+ lang_classification_da,
885
+ datasets_classification_da,
886
+ ],
887
+ outputs=data_classification_da,
888
+ )
889
+ with gr.TabItem("Norwegian"):
890
+ with gr.Row():
891
+ gr.Markdown("""
892
+ **Classification Norwegian Leaderboard 💙🇳🇴**
893
+
894
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
895
+ - **Languages:** Norwegian Bokmål
896
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen)
897
+ """)
898
+ with gr.Row():
899
+ data_classification_nb = gr.components.Dataframe(
900
+ DATA_CLASSIFICATION_NB,
901
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_NB.columns),
902
+ type="pandas",
903
+ )
904
+ with gr.Row():
905
+ data_run_classification_nb = gr.Button("Refresh")
906
+ task_classification_nb = gr.Variable(value=["Classification"])
907
+ lang_classification_nb = gr.Variable(value=[])
908
+ datasets_classification_nb = gr.Variable(value=TASK_LIST_CLASSIFICATION_NB)
909
+ data_run_classification_nb.click(
910
+ get_mteb_data,
911
+ inputs=[
912
+ task_classification_nb,
913
+ lang_classification_nb,
914
+ datasets_classification_nb,
915
+ ],
916
+ outputs=data_classification_nb,
917
+ )
918
+ with gr.TabItem("Swedish"):
919
+ with gr.Row():
920
+ gr.Markdown("""
921
+ **Classification Swedish Leaderboard 💛🇸🇪**
922
+
923
+ - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
924
+ - **Languages:** Swedish
925
+ - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen)
926
+ """)
927
+ with gr.Row():
928
+ data_classification_sv = gr.components.Dataframe(
929
+ DATA_CLASSIFICATION_SV,
930
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_SV.columns),
931
+ type="pandas",
932
+ )
933
+ with gr.Row():
934
+ data_run_classification_sv = gr.Button("Refresh")
935
+ task_classification_sv = gr.Variable(value=["Classification"])
936
+ lang_classification_sv = gr.Variable(value=[])
937
+ datasets_classification_sv = gr.Variable(value=TASK_LIST_CLASSIFICATION_SV)
938
+ data_run_classification_sv.click(
939
+ get_mteb_data,
940
+ inputs=[
941
+ task_classification_sv,
942
+ lang_classification_sv,
943
+ datasets_classification_sv,
944
+ ],
945
+ outputs=data_classification_sv,
946
+ )
947
+ with gr.TabItem("Other"):
948
  with gr.Row():
949
  gr.Markdown("""
950
+ **Classification Other Languages Leaderboard 💜💚💙**
951
 
952
  - **Metric:** [Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)
953
+ - **Languages:** 47 (Only languages not included in the other tabs)
954
  """)
955
  with gr.Row():
956
  data_classification = gr.components.Dataframe(
957
+ DATA_CLASSIFICATION_OTHER,
958
+ datatype=["number", "markdown"] + ["number"] * len(DATA_CLASSIFICATION_OTHER) * 10,
959
  type="pandas",
960
  )
961
  with gr.Row():
962
  data_run = gr.Button("Refresh")
963
  task_classification = gr.Variable(value=["Classification"])
964
+ lang_classification = gr.Variable(value=[])
965
+ datasets_classification = gr.Variable(value=TASK_LIST_CLASSIFICATION_OTHER)
966
  data_run.click(
967
  get_mteb_data,
968
+ inputs=[
969
+ task_classification,
970
+ lang_classification,
971
+ datasets_classification,
972
+ ],
973
  outputs=data_classification,
974
+ )
975
  with gr.TabItem("Clustering"):
976
  with gr.TabItem("English"):
977
  with gr.Row():
 
1000
  with gr.TabItem("German"):
1001
  with gr.Row():
1002
  gr.Markdown("""
1003
+ **Clustering German Leaderboard ✨🇩🇪**
1004
 
1005
  - **Metric:** Validity Measure (v_measure)
1006
  - **Languages:** German
 
1044
  inputs=[task_pair_classification],
1045
  outputs=data_pair_classification,
1046
  )
1047
+ with gr.TabItem("Reranking"):
1048
  with gr.Row():
1049
  gr.Markdown("""
1050
+ **Reranking Leaderboard 🥈**
1051
 
1052
+ - **Metric:** Mean Average Precision (MAP)
1053
  - **Languages:** English
1054
  """)
1055
  with gr.Row():
1056
+ data_reranking = gr.components.Dataframe(
1057
+ DATA_RERANKING,
1058
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RERANKING.columns),
 
1059
  type="pandas",
1060
  )
1061
  with gr.Row():
1062
  data_run = gr.Button("Refresh")
1063
+ task_reranking = gr.Variable(value=["Reranking"])
1064
+ metric_reranking = gr.Variable(value="map")
1065
  data_run.click(
1066
+ get_mteb_data, inputs=[task_reranking], outputs=data_reranking
1067
  )
1068
+ with gr.TabItem("Retrieval"):
1069
  with gr.Row():
1070
  gr.Markdown("""
1071
+ **Retrieval Leaderboard 🔎**
1072
 
1073
+ - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
1074
  - **Languages:** English
1075
  """)
1076
  with gr.Row():
1077
+ data_retrieval = gr.components.Dataframe(
1078
+ DATA_RETRIEVAL,
1079
+ # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
1080
+ datatype=["number", "markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
1081
  type="pandas",
1082
  )
1083
  with gr.Row():
1084
  data_run = gr.Button("Refresh")
1085
+ task_retrieval = gr.Variable(value=["Retrieval"])
 
1086
  data_run.click(
1087
+ get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
1088
+ )
1089
  with gr.TabItem("STS"):
1090
  with gr.TabItem("English"):
1091
  with gr.Row():