Spaces:
Running
Running
Muennighoff
committed on
Commit
•
a1e84d6
1
Parent(s):
e220002
Add Polish Retrieval
Browse files
app.py
CHANGED
@@ -398,8 +398,8 @@ EXTERNAL_MODEL_TO_SEQLEN = {
|
|
398 |
"dfm-sentence-encoder-large-1": 512,
|
399 |
"distiluse-base-multilingual-cased-v2": 512,
|
400 |
"e5-base": 512,
|
401 |
-
"e5-large": 512,
|
402 |
-
"e5-small": 512,
|
403 |
"electra-small-nordic": 512,
|
404 |
"electra-small-swedish-cased-discriminator": 512,
|
405 |
"gbert-base": 512,
|
@@ -452,18 +452,18 @@ EXTERNAL_MODEL_TO_SIZE = {
|
|
452 |
"allenai-specter": 0.44,
|
453 |
"all-MiniLM-L12-v2": 0.13,
|
454 |
"all-MiniLM-L6-v2": 0.09,
|
455 |
-
"all-mpnet-base-v2": 0.44,
|
456 |
-
"bert-base-uncased": 0.44,
|
457 |
"bert-base-swedish-cased": 0.50,
|
458 |
"cross-en-de-roberta-sentence-transformer": 1.11,
|
459 |
-
"contriever-base-msmarco": 0.44,
|
460 |
"DanskBERT": 0.50,
|
461 |
"distiluse-base-multilingual-cased-v2": 0.54,
|
462 |
"dfm-encoder-large-v1": 1.42,
|
463 |
"dfm-sentence-encoder-large-1": 1.63,
|
464 |
"e5-base": 0.44,
|
465 |
"e5-small": 0.13,
|
466 |
-
"e5-large": 1.34,
|
467 |
"electra-small-nordic": 0.09,
|
468 |
"electra-small-swedish-cased-discriminator": 0.06,
|
469 |
"gbert-base": 0.44,
|
@@ -471,18 +471,18 @@ EXTERNAL_MODEL_TO_SIZE = {
|
|
471 |
"gelectra-base": 0.44,
|
472 |
"gelectra-large": 1.34,
|
473 |
"glove.6B.300d": 0.48,
|
474 |
-
"gottbert-base": 0.51,
|
475 |
"gtr-t5-base": 0.22,
|
476 |
"gtr-t5-large": 0.67,
|
477 |
"gtr-t5-xl": 2.48,
|
478 |
"gtr-t5-xxl": 9.73,
|
479 |
-
"komninos": 0.27,
|
480 |
"LASER2": 0.17,
|
481 |
"LaBSE": 1.88,
|
482 |
"msmarco-bert-co-condensor": 0.44,
|
483 |
"multilingual-e5-base": 1.11,
|
484 |
"multilingual-e5-small": 0.47,
|
485 |
-
"multilingual-e5-large": 2.24,
|
486 |
"nb-bert-base": 0.71,
|
487 |
"nb-bert-large": 1.42,
|
488 |
"norbert3-base": 0.52,
|
@@ -496,7 +496,7 @@ EXTERNAL_MODEL_TO_SIZE = {
|
|
496 |
"sentence-t5-xxl": 9.73,
|
497 |
"sup-simcse-bert-base-uncased": 0.44,
|
498 |
"unsup-simcse-bert-base-uncased": 0.44,
|
499 |
-
"use-cmlm-multilingual": 1.89,
|
500 |
"xlm-roberta-base": 1.12,
|
501 |
"xlm-roberta-large": 2.24,
|
502 |
}
|
@@ -522,6 +522,7 @@ MODELS_TO_SKIP = {
|
|
522 |
"newsrx/instructor-large",
|
523 |
"newsrx/instructor-xl",
|
524 |
"dmlls/all-mpnet-base-v2",
|
|
|
525 |
}
|
526 |
|
527 |
|
@@ -544,7 +545,7 @@ def add_task(examples):
|
|
544 |
examples["mteb_task"] = "PairClassification"
|
545 |
elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING:
|
546 |
examples["mteb_task"] = "Reranking"
|
547 |
-
elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM:
|
548 |
examples["mteb_task"] = "Retrieval"
|
549 |
elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM:
|
550 |
examples["mteb_task"] = "STS"
|
@@ -749,7 +750,7 @@ DATA_CLASSIFICATION_NB = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIF
|
|
749 |
DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
|
750 |
DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
|
751 |
DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
|
752 |
-
|
753 |
DATA_STS = get_mteb_data(["STS"])
|
754 |
|
755 |
# Exact, add all non-nan integer values for every dataset
|
@@ -815,11 +816,11 @@ with block:
|
|
815 |
with gr.Row():
|
816 |
data_run = gr.Button("Refresh")
|
817 |
task_bitext_mining = gr.Variable(value=["BitextMining"])
|
818 |
-
|
819 |
-
|
820 |
data_run.click(
|
821 |
get_mteb_data,
|
822 |
-
inputs=[task_bitext_mining,
|
823 |
outputs=data_bitext_mining,
|
824 |
)
|
825 |
with gr.TabItem("Danish"):
|
@@ -832,24 +833,24 @@ with block:
|
|
832 |
- **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
|
833 |
""")
|
834 |
with gr.Row():
|
835 |
-
|
836 |
DATA_BITEXT_MINING_OTHER,
|
837 |
datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
|
838 |
type="pandas",
|
839 |
)
|
840 |
with gr.Row():
|
841 |
data_run = gr.Button("Refresh")
|
842 |
-
|
843 |
-
|
844 |
-
|
845 |
data_run.click(
|
846 |
get_mteb_data,
|
847 |
inputs=[
|
848 |
-
|
849 |
-
|
850 |
-
|
851 |
],
|
852 |
-
outputs=
|
853 |
)
|
854 |
with gr.TabItem("Classification"):
|
855 |
with gr.TabItem("English"):
|
@@ -1011,11 +1012,11 @@ with block:
|
|
1011 |
with gr.Row():
|
1012 |
data_run = gr.Button("Refresh")
|
1013 |
task_clustering = gr.Variable(value=["Clustering"])
|
1014 |
-
|
1015 |
datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
|
1016 |
data_run.click(
|
1017 |
get_mteb_data,
|
1018 |
-
inputs=[task_clustering,
|
1019 |
outputs=data_clustering,
|
1020 |
)
|
1021 |
with gr.TabItem("German"):
|
@@ -1036,11 +1037,11 @@ with block:
|
|
1036 |
with gr.Row():
|
1037 |
data_run = gr.Button("Refresh")
|
1038 |
task_clustering_de = gr.Variable(value=["Clustering"])
|
1039 |
-
|
1040 |
datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
|
1041 |
data_run.click(
|
1042 |
get_mteb_data,
|
1043 |
-
inputs=[task_clustering_de,
|
1044 |
outputs=data_clustering_de,
|
1045 |
)
|
1046 |
with gr.TabItem("Pair Classification"):
|
@@ -1108,7 +1109,6 @@ with block:
|
|
1108 |
data_run.click(
|
1109 |
get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
|
1110 |
)
|
1111 |
-
'''
|
1112 |
with gr.TabItem("Polish"):
|
1113 |
with gr.Row():
|
1114 |
gr.Markdown("""
|
@@ -1128,10 +1128,13 @@ with block:
|
|
1128 |
with gr.Row():
|
1129 |
data_run = gr.Button("Refresh")
|
1130 |
task_retrieval_pl = gr.Variable(value=["Retrieval"])
|
|
|
|
|
1131 |
data_run.click(
|
1132 |
-
get_mteb_data,
|
1133 |
-
|
1134 |
-
|
|
|
1135 |
with gr.TabItem("STS"):
|
1136 |
with gr.TabItem("English"):
|
1137 |
with gr.Row():
|
|
|
398 |
"dfm-sentence-encoder-large-1": 512,
|
399 |
"distiluse-base-multilingual-cased-v2": 512,
|
400 |
"e5-base": 512,
|
401 |
+
"e5-large": 512,
|
402 |
+
"e5-small": 512,
|
403 |
"electra-small-nordic": 512,
|
404 |
"electra-small-swedish-cased-discriminator": 512,
|
405 |
"gbert-base": 512,
|
|
|
452 |
"allenai-specter": 0.44,
|
453 |
"all-MiniLM-L12-v2": 0.13,
|
454 |
"all-MiniLM-L6-v2": 0.09,
|
455 |
+
"all-mpnet-base-v2": 0.44,
|
456 |
+
"bert-base-uncased": 0.44,
|
457 |
"bert-base-swedish-cased": 0.50,
|
458 |
"cross-en-de-roberta-sentence-transformer": 1.11,
|
459 |
+
"contriever-base-msmarco": 0.44,
|
460 |
"DanskBERT": 0.50,
|
461 |
"distiluse-base-multilingual-cased-v2": 0.54,
|
462 |
"dfm-encoder-large-v1": 1.42,
|
463 |
"dfm-sentence-encoder-large-1": 1.63,
|
464 |
"e5-base": 0.44,
|
465 |
"e5-small": 0.13,
|
466 |
+
"e5-large": 1.34,
|
467 |
"electra-small-nordic": 0.09,
|
468 |
"electra-small-swedish-cased-discriminator": 0.06,
|
469 |
"gbert-base": 0.44,
|
|
|
471 |
"gelectra-base": 0.44,
|
472 |
"gelectra-large": 1.34,
|
473 |
"glove.6B.300d": 0.48,
|
474 |
+
"gottbert-base": 0.51,
|
475 |
"gtr-t5-base": 0.22,
|
476 |
"gtr-t5-large": 0.67,
|
477 |
"gtr-t5-xl": 2.48,
|
478 |
"gtr-t5-xxl": 9.73,
|
479 |
+
"komninos": 0.27,
|
480 |
"LASER2": 0.17,
|
481 |
"LaBSE": 1.88,
|
482 |
"msmarco-bert-co-condensor": 0.44,
|
483 |
"multilingual-e5-base": 1.11,
|
484 |
"multilingual-e5-small": 0.47,
|
485 |
+
"multilingual-e5-large": 2.24,
|
486 |
"nb-bert-base": 0.71,
|
487 |
"nb-bert-large": 1.42,
|
488 |
"norbert3-base": 0.52,
|
|
|
496 |
"sentence-t5-xxl": 9.73,
|
497 |
"sup-simcse-bert-base-uncased": 0.44,
|
498 |
"unsup-simcse-bert-base-uncased": 0.44,
|
499 |
+
"use-cmlm-multilingual": 1.89,
|
500 |
"xlm-roberta-base": 1.12,
|
501 |
"xlm-roberta-large": 2.24,
|
502 |
}
|
|
|
522 |
"newsrx/instructor-large",
|
523 |
"newsrx/instructor-xl",
|
524 |
"dmlls/all-mpnet-base-v2",
|
525 |
+
"cgldo/semanticClone",
|
526 |
}
|
527 |
|
528 |
|
|
|
545 |
examples["mteb_task"] = "PairClassification"
|
546 |
elif examples["mteb_dataset_name"] in TASK_LIST_RERANKING:
|
547 |
examples["mteb_task"] = "Reranking"
|
548 |
+
elif examples["mteb_dataset_name"] in TASK_LIST_RETRIEVAL_NORM + TASK_LIST_RETRIEVAL_PL:
|
549 |
examples["mteb_task"] = "Retrieval"
|
550 |
elif examples["mteb_dataset_name"] in TASK_LIST_STS_NORM:
|
551 |
examples["mteb_task"] = "STS"
|
|
|
750 |
DATA_CLASSIFICATION_SV = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_SV)
|
751 |
DATA_CLASSIFICATION_OTHER = get_mteb_data(["Classification"], [], TASK_LIST_CLASSIFICATION_OTHER)
|
752 |
DATA_CLUSTERING_GERMAN = get_mteb_data(["Clustering"], [], TASK_LIST_CLUSTERING_DE)
|
753 |
+
DATA_RETRIEVAL_PL = get_mteb_data(["Retrieval"], [], TASK_LIST_RETRIEVAL_PL)
|
754 |
DATA_STS = get_mteb_data(["STS"])
|
755 |
|
756 |
# Exact, add all non-nan integer values for every dataset
|
|
|
816 |
with gr.Row():
|
817 |
data_run = gr.Button("Refresh")
|
818 |
task_bitext_mining = gr.Variable(value=["BitextMining"])
|
819 |
+
lang_bitext_mining = gr.Variable(value=[])
|
820 |
+
datasets_bitext_mining = gr.Variable(value=TASK_LIST_BITEXT_MINING)
|
821 |
data_run.click(
|
822 |
get_mteb_data,
|
823 |
+
inputs=[task_bitext_mining, lang_bitext_mining, datasets_bitext_mining],
|
824 |
outputs=data_bitext_mining,
|
825 |
)
|
826 |
with gr.TabItem("Danish"):
|
|
|
833 |
- **Credits:** [Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)
|
834 |
""")
|
835 |
with gr.Row():
|
836 |
+
data_bitext_mining_da = gr.components.Dataframe(
|
837 |
DATA_BITEXT_MINING_OTHER,
|
838 |
datatype=["number", "markdown"] + ["number"] * len(DATA_BITEXT_MINING_OTHER.columns),
|
839 |
type="pandas",
|
840 |
)
|
841 |
with gr.Row():
|
842 |
data_run = gr.Button("Refresh")
|
843 |
+
task_bitext_mining_da = gr.Variable(value=["BitextMining"])
|
844 |
+
lang_bitext_mining_da = gr.Variable(value=[])
|
845 |
+
datasets_bitext_mining_da = gr.Variable(value=TASK_LIST_BITEXT_MINING_OTHER)
|
846 |
data_run.click(
|
847 |
get_mteb_data,
|
848 |
inputs=[
|
849 |
+
task_bitext_mining_da,
|
850 |
+
lang_bitext_mining_da,
|
851 |
+
datasets_bitext_mining_da,
|
852 |
],
|
853 |
+
outputs=data_bitext_mining_da,
|
854 |
)
|
855 |
with gr.TabItem("Classification"):
|
856 |
with gr.TabItem("English"):
|
|
|
1012 |
with gr.Row():
|
1013 |
data_run = gr.Button("Refresh")
|
1014 |
task_clustering = gr.Variable(value=["Clustering"])
|
1015 |
+
lang_clustering = gr.Variable(value=[])
|
1016 |
datasets_clustering = gr.Variable(value=TASK_LIST_CLUSTERING)
|
1017 |
data_run.click(
|
1018 |
get_mteb_data,
|
1019 |
+
inputs=[task_clustering, lang_clustering, datasets_clustering],
|
1020 |
outputs=data_clustering,
|
1021 |
)
|
1022 |
with gr.TabItem("German"):
|
|
|
1037 |
with gr.Row():
|
1038 |
data_run = gr.Button("Refresh")
|
1039 |
task_clustering_de = gr.Variable(value=["Clustering"])
|
1040 |
+
lang_clustering_de = gr.Variable(value=[])
|
1041 |
datasets_clustering_de = gr.Variable(value=TASK_LIST_CLUSTERING_DE)
|
1042 |
data_run.click(
|
1043 |
get_mteb_data,
|
1044 |
+
inputs=[task_clustering_de, lang_clustering_de, datasets_clustering_de],
|
1045 |
outputs=data_clustering_de,
|
1046 |
)
|
1047 |
with gr.TabItem("Pair Classification"):
|
|
|
1109 |
data_run.click(
|
1110 |
get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
|
1111 |
)
|
|
|
1112 |
with gr.TabItem("Polish"):
|
1113 |
with gr.Row():
|
1114 |
gr.Markdown("""
|
|
|
1128 |
with gr.Row():
|
1129 |
data_run = gr.Button("Refresh")
|
1130 |
task_retrieval_pl = gr.Variable(value=["Retrieval"])
|
1131 |
+
lang_retrieval_pl = gr.Variable(value=[])
|
1132 |
+
datasets_retrieval_pl = gr.Variable(value=TASK_LIST_RETRIEVAL_PL)
|
1133 |
data_run.click(
|
1134 |
+
get_mteb_data,
|
1135 |
+
inputs=[task_retrieval_pl, lang_retrieval_pl, datasets_retrieval_pl],
|
1136 |
+
outputs=data_retrieval_pl
|
1137 |
+
)
|
1138 |
with gr.TabItem("STS"):
|
1139 |
with gr.TabItem("English"):
|
1140 |
with gr.Row():
|