Muennighoff committed on
Commit
a1e84d6
1 Parent(s): e220002

Add Polish Retrieval

Browse files
Files changed (1) hide show
  1. app.py +34 -31
app.py CHANGED
@@ -398,8 +398,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
398
  "dfm-sentence-encoder-large-1": 512,
399
  "distiluse-base-multilingual-cased-v2": 512,
400
  "e5-base": 512,
401
- "e5-large": 512,
402
- "e5-small": 512,
403
  "electra-small-nordic": 512,
404
  "electra-small-swedish-cased-discriminator": 512,
405
  "gbert-base": 512,
@@ -452,18 +452,18 @@ EXTERNAL_MODEL_TO_SIZE = {
452
  "allenai-specter": 0.44,
453
  "all-MiniLM-L12-v2": 0.13,
454
  "all-MiniLM-L6-v2": 0.09,
455
- "all-mpnet-base-v2": 0.44,
456
- "bert-base-uncased": 0.44,
457
  "bert-base-swedish-cased": 0.50,
458
  "cross-en-de-roberta-sentence-transformer": 1.11,
459
- "contriever-base-msmarco": 0.44,
460
  "DanskBERT": 0.50,
461
  "distiluse-base-multilingual-cased-v2": 0.54,
462
  "dfm-encoder-large-v1": 1.42,
463
  "dfm-sentence-encoder-large-1": 1.63,
464
  "e5-base": 0.44,
465
  "e5-small": 0.13,
466
- "e5-large": 1.34,
467
  "electra-small-nordic": 0.09,
468
  "electra-small-swedish-cased-discriminator": 0.06,
469
  "gbert-base": 0.44,
@@ -471,18 +471,18 @@ EXTERNAL_MODEL_TO_SIZE = {
471
  "gelectra-base": 0.44,
472
  "gelectra-large": 1.34,
473
  "glove.6B.300d": 0.48,
474
- "gottbert-base": 0.51,
475
  "gtr-t5-base": 0.22,
476
  "gtr-t5-large": 0.67,
477
  "gtr-t5-xl": 2.48,
478
  "gtr-t5-xxl": 9.73,
479
- "komninos": 0.27,
480
  "LASER2": 0.17,
481
  "LaBSE": 1.88,
482
  "msmarco-bert-co-condensor": 0.44,
483
  "multilingual-e5-base": 1.11,
484
  "multilingual-e5-small": 0.47,
485
- "multilingual-e5-large": 2.24,
486
  "nb-bert-base": 0.71,
487
  "nb-bert-large": 1.42,
488
  "norbert3-base": 0.52,
@@ -496,7 +496,7 @@ EXTERNAL_MODEL_TO_SIZE = {
496
  "sentence-t5-xxl": 9.73,
497
  "sup-simcse-bert-base-uncased": 0.44,
498
  "unsup-simcse-bert-base-uncased": 0.44,
499
- "use-cmlm-multilingual": 1.89,
500
  "xlm-roberta-base": 1.12,
501
  "xlm-roberta-large": 2.24,
502
  }
@@ -522,6 +522,7 @@ MODELS_TO_SKIP = {
522
  "newsrx/instructor-large",
523
  "newsrx/instructor-xl",
524
  "dmlls/all-mpnet-base-v2",
 
525
  }
526
 
527
 
@@ -544,7 +545,7 @@ def add_task(examples):
544
  examples["mteb_task"] = "PairClassification"
545
  elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING:
546
  examples["mteb_task"] = "Reranking"
547
- elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM:
548
  examples["mteb_task"] = "Retrieval"
549
  elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM:
550
  examples["mteb_task"] = "STS"
@@ -749,7 +750,7 @@ DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIF
749
  DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
750
  DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
751
  DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
752
- #DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
753
  DATA_STS = get_mteb_data(["STS"])
754
 
755
  # Exact, add all non-nan integer values for every dataset
@@ -815,11 +816,11 @@ with block:
815
  with gr.Row():
816
  data_run = gr.Button("Refresh")
817
  task_bitext_mining = gr.Variable(value=["BitextMining"])
818
- lang_bitext_mining_other = gr.Variable(value=[])
819
- datasets_bitext_mining_other = gr.Variable(value=TASK_LIST_BITEXT_MINING)
820
  data_run.click(
821
  get_mteb_data,
822
- inputs=[task_bitext_mining, lang_bitext_mining_other, datasets_bitext_mining_other],
823
  outputs=data_bitext_mining,
824
  )
825
  with gr.TabItem("Danish"):
@@ -832,24 +833,24 @@ with block:
832
  - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
833
  """)
834
  with gr.Row():
835
- data_bitext_mining_other = gr.components.Dataframe(
836
  DATA_BITEXT_MINING_OTHER,
837
  datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
838
  type="pandas",
839
  )
840
  with gr.Row():
841
  data_run = gr.Button("Refresh")
842
- task_bitext_mining_other = gr.Variable(value=["BitextMining"])
843
- lang_bitext_mining_other = gr.Variable(value=[])
844
- datasets_bitext_mining_other = gr.Variable(value=TASK_LIST_BITEXT_MINING_OTHER)
845
  data_run.click(
846
  get_mteb_data,
847
  inputs=[
848
- task_bitext_mining_other,
849
- lang_bitext_mining_other,
850
- datasets_bitext_mining_other,
851
  ],
852
- outputs=data_bitext_mining_other,
853
  )
854
  with gr.TabItem("Classification"):
855
  with gr.TabItem("English"):
@@ -1011,11 +1012,11 @@ with block:
1011
  with gr.Row():
1012
  data_run = gr.Button("Refresh")
1013
  task_clustering = gr.Variable(value=["Clustering"])
1014
- empty = gr.Variable(value=[])
1015
  datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
1016
  data_run.click(
1017
  get_mteb_data,
1018
- inputs=[task_clustering, empty, datasets_clustering],
1019
  outputs=data_clustering,
1020
  )
1021
  with gr.TabItem("German"):
@@ -1036,11 +1037,11 @@ with block:
1036
  with gr.Row():
1037
  data_run = gr.Button("Refresh")
1038
  task_clustering_de = gr.Variable(value=["Clustering"])
1039
- empty_de = gr.Variable(value=[])
1040
  datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
1041
  data_run.click(
1042
  get_mteb_data,
1043
- inputs=[task_clustering_de, empty_de, datasets_clustering_de],
1044
  outputs=data_clustering_de,
1045
  )
1046
  with gr.TabItem("Pair Classification"):
@@ -1108,7 +1109,6 @@ with block:
1108
  data_run.click(
1109
  get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
1110
  )
1111
- '''
1112
  with gr.TabItem("Polish"):
1113
  with gr.Row():
1114
  gr.Markdown("""
@@ -1128,10 +1128,13 @@ with block:
1128
  with gr.Row():
1129
  data_run = gr.Button("Refresh")
1130
  task_retrieval_pl = gr.Variable(value=["Retrieval"])
 
 
1131
  data_run.click(
1132
- get_mteb_data, inputs=[task_retrieval_pl], outputs=data_retrieval_pl
1133
- )
1134
- '''
 
1135
  with gr.TabItem("STS"):
1136
  with gr.TabItem("English"):
1137
  with gr.Row():
 
398
  "dfm-sentence-encoder-large-1": 512,
399
  "distiluse-base-multilingual-cased-v2": 512,
400
  "e5-base": 512,
401
+ "e5-large": 512,
402
+ "e5-small": 512,
403
  "electra-small-nordic": 512,
404
  "electra-small-swedish-cased-discriminator": 512,
405
  "gbert-base": 512,
 
452
  "allenai-specter": 0.44,
453
  "all-MiniLM-L12-v2": 0.13,
454
  "all-MiniLM-L6-v2": 0.09,
455
+ "all-mpnet-base-v2": 0.44,
456
+ "bert-base-uncased": 0.44,
457
  "bert-base-swedish-cased": 0.50,
458
  "cross-en-de-roberta-sentence-transformer": 1.11,
459
+ "contriever-base-msmarco": 0.44,
460
  "DanskBERT": 0.50,
461
  "distiluse-base-multilingual-cased-v2": 0.54,
462
  "dfm-encoder-large-v1": 1.42,
463
  "dfm-sentence-encoder-large-1": 1.63,
464
  "e5-base": 0.44,
465
  "e5-small": 0.13,
466
+ "e5-large": 1.34,
467
  "electra-small-nordic": 0.09,
468
  "electra-small-swedish-cased-discriminator": 0.06,
469
  "gbert-base": 0.44,
 
471
  "gelectra-base": 0.44,
472
  "gelectra-large": 1.34,
473
  "glove.6B.300d": 0.48,
474
+ "gottbert-base": 0.51,
475
  "gtr-t5-base": 0.22,
476
  "gtr-t5-large": 0.67,
477
  "gtr-t5-xl": 2.48,
478
  "gtr-t5-xxl": 9.73,
479
+ "komninos": 0.27,
480
  "LASER2": 0.17,
481
  "LaBSE": 1.88,
482
  "msmarco-bert-co-condensor": 0.44,
483
  "multilingual-e5-base": 1.11,
484
  "multilingual-e5-small": 0.47,
485
+ "multilingual-e5-large": 2.24,
486
  "nb-bert-base": 0.71,
487
  "nb-bert-large": 1.42,
488
  "norbert3-base": 0.52,
 
496
  "sentence-t5-xxl": 9.73,
497
  "sup-simcse-bert-base-uncased": 0.44,
498
  "unsup-simcse-bert-base-uncased": 0.44,
499
+ "use-cmlm-multilingual": 1.89,
500
  "xlm-roberta-base": 1.12,
501
  "xlm-roberta-large": 2.24,
502
  }
 
522
  "newsrx/instructor-large",
523
  "newsrx/instructor-xl",
524
  "dmlls/all-mpnet-base-v2",
525
+ "cgldo/semanticClone",
526
  }
527
 
528
 
 
545
  examples["mteb_task"] = "PairClassification"
546
  elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING:
547
  examples["mteb_task"] = "Reranking"
548
+ elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL:
549
  examples["mteb_task"] = "Retrieval"
550
  elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM:
551
  examples["mteb_task"] = "STS"
 
750
  DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
751
  DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
752
  DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
753
+ DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
754
  DATA_STS = get_mteb_data(["STS"])
755
 
756
  # Exact, add all non-nan integer values for every dataset
 
816
  with gr.Row():
817
  data_run = gr.Button("Refresh")
818
  task_bitext_mining = gr.Variable(value=["BitextMining"])
819
+ lang_bitext_mining = gr.Variable(value=[])
820
+ datasets_bitext_mining = gr.Variable(value=TASK_LIST_BITEXT_MINING)
821
  data_run.click(
822
  get_mteb_data,
823
+ inputs=[task_bitext_mining, lang_bitext_mining, datasets_bitext_mining],
824
  outputs=data_bitext_mining,
825
  )
826
  with gr.TabItem("Danish"):
 
833
  - **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
834
  """)
835
  with gr.Row():
836
+ data_bitext_mining_da = gr.components.Dataframe(
837
  DATA_BITEXT_MINING_OTHER,
838
  datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
839
  type="pandas",
840
  )
841
  with gr.Row():
842
  data_run = gr.Button("Refresh")
843
+ task_bitext_mining_da = gr.Variable(value=["BitextMining"])
844
+ lang_bitext_mining_da = gr.Variable(value=[])
845
+ datasets_bitext_mining_da = gr.Variable(value=TASK_LIST_BITEXT_MINING_OTHER)
846
  data_run.click(
847
  get_mteb_data,
848
  inputs=[
849
+ task_bitext_mining_da,
850
+ lang_bitext_mining_da,
851
+ datasets_bitext_mining_da,
852
  ],
853
+ outputs=data_bitext_mining_da,
854
  )
855
  with gr.TabItem("Classification"):
856
  with gr.TabItem("English"):
 
1012
  with gr.Row():
1013
  data_run = gr.Button("Refresh")
1014
  task_clustering = gr.Variable(value=["Clustering"])
1015
+ lang_clustering = gr.Variable(value=[])
1016
  datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
1017
  data_run.click(
1018
  get_mteb_data,
1019
+ inputs=[task_clustering, lang_clustering, datasets_clustering],
1020
  outputs=data_clustering,
1021
  )
1022
  with gr.TabItem("German"):
 
1037
  with gr.Row():
1038
  data_run = gr.Button("Refresh")
1039
  task_clustering_de = gr.Variable(value=["Clustering"])
1040
+ lang_clustering_de = gr.Variable(value=[])
1041
  datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
1042
  data_run.click(
1043
  get_mteb_data,
1044
+ inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
1045
  outputs=data_clustering_de,
1046
  )
1047
  with gr.TabItem("Pair Classification"):
 
1109
  data_run.click(
1110
  get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
1111
  )
 
1112
  with gr.TabItem("Polish"):
1113
  with gr.Row():
1114
  gr.Markdown("""
 
1128
  with gr.Row():
1129
  data_run = gr.Button("Refresh")
1130
  task_retrieval_pl = gr.Variable(value=["Retrieval"])
1131
+ lang_retrieval_pl = gr.Variable(value=[])
1132
+ datasets_retrieval_pl = gr.Variable(value=TASK_LIST_RETRIEVAL_PL)
1133
  data_run.click(
1134
+ get_mteb_data,
1135
+ inputs=[task_retrieval_pl, lang_retrieval_pl, datasets_retrieval_pl],
1136
+ outputs=data_retrieval_pl
1137
+ )
1138
  with gr.TabItem("STS"):
1139
  with gr.TabItem("English"):
1140
  with gr.Row():