Spaces:
Running
Running
Muennighoff
committed on
Commit
•
bcadbe0
1
Parent(s):
2e5b810
Add seqlen
Browse files
app.py
CHANGED
@@ -288,6 +288,59 @@ EXTERNAL_MODEL_TO_DIM = {
|
|
288 |
"unsup-simcse-bert-base-uncased": 768,
|
289 |
}
|
290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
MODELS_TO_SKIP = {
|
292 |
"baseplate/instructor-large-1", # Duplicate
|
293 |
"radames/e5-large", # Duplicate
|
@@ -341,26 +394,22 @@ for model in EXTERNAL_MODELS:
|
|
341 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
342 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
343 |
|
344 |
-
def
|
345 |
filenames = [sib.rfilename for sib in model.siblings]
|
346 |
-
dim = ""
|
347 |
if "1_Pooling/config.json" in filenames:
|
348 |
st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
|
349 |
dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
|
350 |
elif "2_Pooling/config.json" in filenames:
|
351 |
st_config_path = hf_hub_download(model.modelId, filename="2_Pooling/config.json")
|
352 |
dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
|
353 |
-
|
354 |
config_path = hf_hub_download(model.modelId, filename="config.json")
|
355 |
config = json.load(open(config_path))
|
356 |
-
if
|
357 |
-
dim = config
|
358 |
-
|
359 |
-
|
360 |
-
elif "d_model" in config:
|
361 |
-
dim = config["d_model"]
|
362 |
-
return dim
|
363 |
-
|
364 |
|
365 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
|
366 |
api = HfApi()
|
@@ -381,6 +430,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
381 |
if len(res) > 1:
|
382 |
if add_emb_dim:
|
383 |
res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
|
|
|
384 |
df_list.append(res)
|
385 |
|
386 |
for model in models:
|
@@ -414,7 +464,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
|
|
414 |
# Model & at least one result
|
415 |
if len(out) > 1:
|
416 |
if add_emb_dim:
|
417 |
-
out["Embedding Dimensions"] =
|
418 |
df_list.append(out)
|
419 |
df = pd.DataFrame(df_list)
|
420 |
# Put 'Model' column first
|
@@ -472,7 +522,7 @@ def get_mteb_average():
|
|
472 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
473 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
474 |
|
475 |
-
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
476 |
|
477 |
return DATA_OVERALL
|
478 |
|
|
|
288 |
"unsup-simcse-bert-base-uncased": 768,
|
289 |
}
|
290 |
|
291 |
+
|
292 |
+
# Sequence length reported for each external model, keyed by model name.
# Values are ints, or the string "N/A" for entries that carry no numeric
# limit here (presumably models without a fixed maximum input length --
# TODO confirm). Looked up with `.get(model, "")` to fill the
# "Sequence Length" column, so models missing from this table show an
# empty cell.
EXTERNAL_MODEL_TO_SEQLEN = {
    "xlm-roberta-large": 514,
    "use-cmlm-multilingual": 512,
    "gottbert-base": 512,
    "cross-en-de-roberta-sentence-transformer": 514,
    "gbert-base": 512,
    "gbert-large": 512,
    "gelectra-base": 512,
    "gelectra-large": 512,
    # NOTE: a second, identical "gottbert-base" entry used to sit here; a
    # repeated key in a dict literal silently overwrites the earlier one.

    "LASER2": "N/A",
    "LaBSE": 512,
    "all-MiniLM-L12-v2": 512,
    "all-MiniLM-L6-v2": 512,
    "all-mpnet-base-v2": 514,
    "allenai-specter": 512,
    "bert-base-uncased": 512,
    "contriever-base-msmarco": 512,
    "glove.6B.300d": "N/A",
    "gtr-t5-base": 512,
    "gtr-t5-large": 512,
    "gtr-t5-xl": 512,
    "gtr-t5-xxl": 512,
    "komninos": "N/A",
    "msmarco-bert-co-condensor": 512,
    "paraphrase-multilingual-MiniLM-L12-v2": 512,
    "paraphrase-multilingual-mpnet-base-v2": 514,
    "sentence-t5-base": 512,
    "sentence-t5-large": 512,
    "sentence-t5-xl": 512,
    "sentence-t5-xxl": 512,
    "sup-simcse-bert-base-uncased": 512,

    "text-embedding-ada-002": 8191,

    "text-similarity-ada-001": 2046,
    "text-similarity-babbage-001": 2046,
    "text-similarity-curie-001": 2046,
    "text-similarity-davinci-001": 2046,

    "text-search-ada-doc-001": 2046,
    "text-search-ada-query-001": 2046,
    "text-search-ada-001": 2046,
    "text-search-babbage-001": 2046,
    "text-search-curie-001": 2046,
    "text-search-davinci-001": 2046,

    "unsup-simcse-bert-base-uncased": 512,
}
|
342 |
+
|
343 |
+
|
344 |
MODELS_TO_SKIP = {
|
345 |
"baseplate/instructor-large-1", # Duplicate
|
346 |
"radames/e5-large", # Duplicate
|
|
|
394 |
ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
|
395 |
EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
396 |
|
397 |
+
def get_dim_seq(model):
    """Return ``(dim, seq)`` for a hub model, read from its config files.

    Args:
        model: Object exposing ``siblings`` (an iterable of items with an
            ``rfilename`` attribute) and ``modelId``. Presumably a
            ``huggingface_hub`` model-info object -- verify against caller.

    Returns:
        A 2-tuple ``(dim, seq)``:

        * ``dim`` -- ``word_embedding_dimension`` from ``1_Pooling/config.json``
          (or, if that file is absent, ``2_Pooling/config.json``). When that
          yields nothing, falls back to the first of ``hidden_dim``,
          ``hidden_size``, ``d_model`` found in ``config.json``.
        * ``seq`` -- the first of ``n_positions``, ``max_position_embeddings``,
          ``n_ctx``, ``seq_length`` found in ``config.json``.

        Either element is ``""`` when no source provides a value.
    """
    filenames = [sib.rfilename for sib in model.siblings]
    dim, seq = "", ""

    # Only the first pooling config that exists is consulted, matching the
    # original if/elif: "2_Pooling" is a fallback, never an override.
    for pooling_file in ("1_Pooling/config.json", "2_Pooling/config.json"):
        if pooling_file in filenames:
            st_config_path = hf_hub_download(model.modelId, filename=pooling_file)
            # Context manager so the downloaded file's handle is closed
            # promptly instead of being left to the garbage collector.
            with open(st_config_path, encoding="utf-8") as f:
                dim = json.load(f).get("word_embedding_dimension", "")
            break

    if "config.json" in filenames:
        config_path = hf_hub_download(model.modelId, filename="config.json")
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        if not dim:
            # Different architectures name the hidden width differently.
            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
    return dim, seq
|
|
|
|
|
|
|
|
|
413 |
|
414 |
def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
|
415 |
api = HfApi()
|
|
|
430 |
if len(res) > 1:
|
431 |
if add_emb_dim:
|
432 |
res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
|
433 |
+
res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
|
434 |
df_list.append(res)
|
435 |
|
436 |
for model in models:
|
|
|
464 |
# Model & at least one result
|
465 |
if len(out) > 1:
|
466 |
if add_emb_dim:
|
467 |
+
out["Embedding Dimensions"], out["Sequence Length"] = get_dim_seq(model)
|
468 |
df_list.append(out)
|
469 |
df = pd.DataFrame(df_list)
|
470 |
# Put 'Model' column first
|
|
|
522 |
DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
|
523 |
DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
|
524 |
|
525 |
+
DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
|
526 |
|
527 |
return DATA_OVERALL
|
528 |
|