Muennighoff committed
Commit bcadbe0
1 Parent(s): 2e5b810

Add seqlen

Files changed (1):
  1. app.py +63 -13
app.py CHANGED

@@ -288,6 +288,59 @@ EXTERNAL_MODEL_TO_DIM = {
     "unsup-simcse-bert-base-uncased": 768,
 }
 
+
+EXTERNAL_MODEL_TO_SEQLEN = {
+    "xlm-roberta-large": 514,
+    "use-cmlm-multilingual": 512,
+    "gottbert-base": 512,
+    "cross-en-de-roberta-sentence-transformer": 514,
+    "gbert-base": 512,
+    "gbert-large": 512,
+    "gelectra-base": 512,
+    "gelectra-large": 512,
+    "gottbert-base": 512,
+
+    "LASER2": "N/A",
+    "LaBSE": 512,
+    "all-MiniLM-L12-v2": 512,
+    "all-MiniLM-L6-v2": 512,
+    "all-mpnet-base-v2": 514,
+    "allenai-specter": 512,
+    "bert-base-uncased": 512,
+    "contriever-base-msmarco": 512,
+    "glove.6B.300d": "N/A",
+    "gtr-t5-base": 512,
+    "gtr-t5-large": 512,
+    "gtr-t5-xl": 512,
+    "gtr-t5-xxl": 512,
+    "komninos": "N/A",
+    "msmarco-bert-co-condensor": 512,
+    "paraphrase-multilingual-MiniLM-L12-v2": 512,
+    "paraphrase-multilingual-mpnet-base-v2": 514,
+    "sentence-t5-base": 512,
+    "sentence-t5-large": 512,
+    "sentence-t5-xl": 512,
+    "sentence-t5-xxl": 512,
+    "sup-simcse-bert-base-uncased": 512,
+
+    "text-embedding-ada-002": 8191,
+
+    "text-similarity-ada-001": 2046,
+    "text-similarity-babbage-001": 2046,
+    "text-similarity-curie-001": 2046,
+    "text-similarity-davinci-001": 2046,
+
+    "text-search-ada-doc-001": 2046,
+    "text-search-ada-query-001": 2046,
+    "text-search-ada-001": 2046,
+    "text-search-babbage-001": 2046,
+    "text-search-curie-001": 2046,
+    "text-search-davinci-001": 2046,
+
+    "unsup-simcse-bert-base-uncased": 512,
+}
+
+
 MODELS_TO_SKIP = {
     "baseplate/instructor-large-1", # Duplicate
     "radames/e5-large", # Duplicate
@@ -341,26 +394,22 @@ for model in EXTERNAL_MODELS:
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 
-def get_emb_dim(model):
+def get_dim_seq(model):
     filenames = [sib.rfilename for sib in model.siblings]
-    dim = ""
+    dim, seq = "", ""
     if "1_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
     elif "2_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="2_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
-    elif "config.json" in filenames:
+    if "config.json" in filenames:
         config_path = hf_hub_download(model.modelId, filename="config.json")
         config = json.load(open(config_path))
-        if "hidden_dim" in config:
-            dim = config["hidden_dim"]
-        elif "hidden_size" in config:
-            dim = config["hidden_size"]
-        elif "d_model" in config:
-            dim = config["d_model"]
-    return dim
-
+        if not dim:
+            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
+        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
+    return dim, seq
 
 def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
     api = HfApi()
@@ -381,6 +430,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
         if len(res) > 1:
             if add_emb_dim:
                 res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
+                res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
             df_list.append(res)
 
     for model in models:
@@ -414,7 +464,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
         # Model & at least one result
         if len(out) > 1:
             if add_emb_dim:
-                out["Embedding Dimensions"] = get_emb_dim(model)
+                out["Embedding Dimensions"], out["Sequence Length"] = get_dim_seq(model)
             df_list.append(out)
     df = pd.DataFrame(df_list)
     # Put 'Model' column first
@@ -472,7 +522,7 @@ def get_mteb_average():
     DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
     DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
 
-    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
+    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
     return DATA_OVERALL
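
For context, the get_dim_seq helper introduced above can be exercised on its own. The following is a minimal sketch, not part of the commit: it assumes only that huggingface_hub is installed, and the repo id "sentence-transformers/all-MiniLM-L6-v2" is an arbitrary example.

import json

from huggingface_hub import HfApi, hf_hub_download

def get_dim_seq(model):
    # Same lookup order as the helper added in this commit: prefer the
    # sentence-transformers pooling config for the embedding dimension,
    # then fall back to config.json, which is also the only source for
    # the maximum sequence length.
    filenames = [sib.rfilename for sib in model.siblings]
    dim, seq = "", ""
    if "1_Pooling/config.json" in filenames:
        st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
        dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
    elif "2_Pooling/config.json" in filenames:
        st_config_path = hf_hub_download(model.modelId, filename="2_Pooling/config.json")
        dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
    if "config.json" in filenames:
        config_path = hf_hub_download(model.modelId, filename="config.json")
        config = json.load(open(config_path))
        if not dim:
            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
    return dim, seq

if __name__ == "__main__":
    # model_info() returns the repo metadata (including .siblings) that
    # the helper inspects.
    model = HfApi().model_info("sentence-transformers/all-MiniLM-L6-v2")
    print(get_dim_seq(model))  # expected: (384, 512)

Models that are not hosted as Hub repos (the OpenAI API models, LASER2, GloVe, etc.) have no config files to read, which is presumably why their lengths come from the hand-maintained EXTERNAL_MODEL_TO_SEQLEN map instead.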