---
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:234000
  - loss:MSELoss
base_model: google-bert/bert-base-multilingual-cased
widget:
  - source_sentence: who sings in spite of ourselves with john prine
    sentences:
      - es
      - når ble michael jordan draftet til nba
      - quien canta en spite of ourselves con john prine
  - source_sentence: who wrote when you look me in the eyes
    sentences:
      - متى بدأت الفتاة الكشفية في بيع ملفات تعريف الارتباط
      - A écrit when you look me in the eyes
      - fr
  - source_sentence: when was fathers day made a national holiday
    sentences:
      - wann wurde der Vatertag zum nationalen Feiertag
      - de
      - ' អ្នកណាច្រៀង i want to sing you a love song'
  - source_sentence: what is the density of the continental crust
    sentences:
      - cuál es la densidad de la corteza continental
      - wie zingt i want to sing you a love song
      - es
  - source_sentence: who wrote the song i shot the sheriff
    sentences:
      - Quel est l'âge légal pour consommer du vin au Canada?
      - i shot the sheriff şarkısını kim besteledi
      - tr
pipeline_tag: sentence-similarity
library_name: sentence-transformers
metrics:
  - negative_mse
model-index:
  - name: SentenceTransformer based on google-bert/bert-base-multilingual-cased
    results:
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ar
          type: MSE-val-en-to-ar
        metrics:
          - type: negative_mse
            value: -18.93259286880493
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to da
          type: MSE-val-en-to-da
        metrics:
          - type: negative_mse
            value: -15.68576693534851
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to de
          type: MSE-val-en-to-de
        metrics:
          - type: negative_mse
            value: -16.125640273094177
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to en
          type: MSE-val-en-to-en
        metrics:
          - type: negative_mse
            value: -13.388358056545258
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to es
          type: MSE-val-en-to-es
        metrics:
          - type: negative_mse
            value: -15.648126602172852
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to fi
          type: MSE-val-en-to-fi
        metrics:
          - type: negative_mse
            value: -17.174141108989716
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to fr
          type: MSE-val-en-to-fr
        metrics:
          - type: negative_mse
            value: -15.814268589019775
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to he
          type: MSE-val-en-to-he
        metrics:
          - type: negative_mse
            value: -18.483880162239075
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to hu
          type: MSE-val-en-to-hu
        metrics:
          - type: negative_mse
            value: -17.58536398410797
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to it
          type: MSE-val-en-to-it
        metrics:
          - type: negative_mse
            value: -15.706634521484375
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ja
          type: MSE-val-en-to-ja
        metrics:
          - type: negative_mse
            value: -17.800691723823547
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ko
          type: MSE-val-en-to-ko
        metrics:
          - type: negative_mse
            value: -19.26662176847458
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to km
          type: MSE-val-en-to-km
        metrics:
          - type: negative_mse
            value: -28.38749885559082
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ms
          type: MSE-val-en-to-ms
        metrics:
          - type: negative_mse
            value: -15.783128142356873
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to nl
          type: MSE-val-en-to-nl
        metrics:
          - type: negative_mse
            value: -15.027229487895966
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to no
          type: MSE-val-en-to-no
        metrics:
          - type: negative_mse
            value: -15.598368644714355
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to pl
          type: MSE-val-en-to-pl
        metrics:
          - type: negative_mse
            value: -16.64138436317444
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to pt
          type: MSE-val-en-to-pt
        metrics:
          - type: negative_mse
            value: -15.76906442642212
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to ru
          type: MSE-val-en-to-ru
        metrics:
          - type: negative_mse
            value: -16.91163182258606
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to sv
          type: MSE-val-en-to-sv
        metrics:
          - type: negative_mse
            value: -15.555775165557861
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to th
          type: MSE-val-en-to-th
        metrics:
          - type: negative_mse
            value: -18.37025284767151
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to tr
          type: MSE-val-en-to-tr
        metrics:
          - type: negative_mse
            value: -16.945864260196686
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to vi
          type: MSE-val-en-to-vi
        metrics:
          - type: negative_mse
            value: -16.482776403427124
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh cn
          type: MSE-val-en-to-zh_cn
        metrics:
          - type: negative_mse
            value: -16.996394097805023
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh hk
          type: MSE-val-en-to-zh_hk
        metrics:
          - type: negative_mse
            value: -16.82070791721344
            name: Negative Mse
      - task:
          type: knowledge-distillation
          name: Knowledge Distillation
        dataset:
          name: MSE val en to zh tw
          type: MSE-val-en-to-zh_tw
        metrics:
          - type: negative_mse
            value: -17.381685972213745
            name: Negative Mse
---

SentenceTransformer based on google-bert/bert-base-multilingual-cased

This is a sentence-transformers model finetuned from google-bert/bert-base-multilingual-cased. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: google-bert/bert-base-multilingual-cased
  • Maximum Sequence Length: 128 tokens
  • Output Dimensionality: 768 dimensions
  • Similarity Function: Cosine Similarity
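
Both the maximum sequence length and the output dimensionality can be checked programmatically. A small sanity check, assuming the model loads as shown in the Usage section below:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("luanafelbarros/bert-base-multilingual-cased-matryoshka-mkqa")
print(model.max_seq_length)                      # 128
print(model.get_sentence_embedding_dimension())  # 768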

Model Sources

  • Documentation: Sentence Transformers Documentation (https://sbert.net)
  • Repository: Sentence Transformers on GitHub (https://github.com/UKPLab/sentence-transformers)
  • Hugging Face: Sentence Transformers on Hugging Face (https://huggingface.co/models?library=sentence-transformers)

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
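
The Pooling block above averages token embeddings over the attention mask (pooling_mode_mean_tokens). The sketch below reproduces that step with plain transformers for illustration; loading the checkpoint's transformer weights directly through AutoModel is an assumption that holds for standard Sentence Transformers repositories.

import torch
from transformers import AutoTokenizer, AutoModel

model_id = "luanafelbarros/bert-base-multilingual-cased-matryoshka-mkqa"
tokenizer = AutoTokenizer.from_pretrained(model_id)
bert = AutoModel.from_pretrained(model_id)

encoded = tokenizer(["who wrote the song i shot the sheriff"],
                    padding=True, truncation=True, max_length=128, return_tensors="pt")
with torch.no_grad():
    token_embeddings = bert(**encoded).last_hidden_state  # (batch, seq_len, 768)

# Mean over real tokens only: weight each position by its attention mask.
mask = encoded["attention_mask"].unsqueeze(-1).float()
sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(sentence_embedding.shape)  # torch.Size([1, 768])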

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("luanafelbarros/bert-base-multilingual-cased-matryoshka-mkqa")
# Run inference
sentences = [
    'who wrote the song i shot the sheriff',
    'i shot the sheriff şarkısını kim besteledi',
    'tr',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
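
Because the model embeds all languages into a shared space, an English query can retrieve its translations directly. A minimal cross-lingual semantic search sketch, using a small hypothetical corpus drawn from the widget examples above:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("luanafelbarros/bert-base-multilingual-cased-matryoshka-mkqa")

corpus = [
    "quien canta en spite of ourselves con john prine",
    "wann wurde der Vatertag zum nationalen Feiertag",
    "cuál es la densidad de la corteza continental",
]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode("who sings in spite of ourselves with john prine",
                               convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)
for hit in hits[0]:
    print(corpus[hit["corpus_id"]], round(hit["score"], 4))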

Evaluation

Metrics

Knowledge Distillation

  • Datasets: MSE-val-en-to-ar, MSE-val-en-to-da, MSE-val-en-to-de, MSE-val-en-to-en, MSE-val-en-to-es, MSE-val-en-to-fi, MSE-val-en-to-fr, MSE-val-en-to-he, MSE-val-en-to-hu, MSE-val-en-to-it, MSE-val-en-to-ja, MSE-val-en-to-ko, MSE-val-en-to-km, MSE-val-en-to-ms, MSE-val-en-to-nl, MSE-val-en-to-no, MSE-val-en-to-pl, MSE-val-en-to-pt, MSE-val-en-to-ru, MSE-val-en-to-sv, MSE-val-en-to-th, MSE-val-en-to-tr, MSE-val-en-to-vi, MSE-val-en-to-zh_cn, MSE-val-en-to-zh_hk and MSE-val-en-to-zh_tw
  • Evaluated with MSEEvaluator
| Dataset | negative_mse |
|:--|--:|
| MSE-val-en-to-ar | -18.9326 |
| MSE-val-en-to-da | -15.6858 |
| MSE-val-en-to-de | -16.1256 |
| MSE-val-en-to-en | -13.3884 |
| MSE-val-en-to-es | -15.6481 |
| MSE-val-en-to-fi | -17.1741 |
| MSE-val-en-to-fr | -15.8143 |
| MSE-val-en-to-he | -18.4839 |
| MSE-val-en-to-hu | -17.5854 |
| MSE-val-en-to-it | -15.7066 |
| MSE-val-en-to-ja | -17.8007 |
| MSE-val-en-to-ko | -19.2666 |
| MSE-val-en-to-km | -28.3875 |
| MSE-val-en-to-ms | -15.7831 |
| MSE-val-en-to-nl | -15.0272 |
| MSE-val-en-to-no | -15.5984 |
| MSE-val-en-to-pl | -16.6414 |
| MSE-val-en-to-pt | -15.7691 |
| MSE-val-en-to-ru | -16.9116 |
| MSE-val-en-to-sv | -15.5558 |
| MSE-val-en-to-th | -18.3703 |
| MSE-val-en-to-tr | -16.9459 |
| MSE-val-en-to-vi | -16.4828 |
| MSE-val-en-to-zh_cn | -16.9964 |
| MSE-val-en-to-zh_hk | -16.8207 |
| MSE-val-en-to-zh_tw | -17.3817 |
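
MSEEvaluator encodes the English source sentences with the teacher and the translated sentences with the student, then reports -100 × MSE between the two embedding matrices, so values closer to zero are better. A minimal sketch of one such evaluator; the card does not name the teacher model, so the identifier below is purely a placeholder:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import MSEEvaluator

student = SentenceTransformer("luanafelbarros/bert-base-multilingual-cased-matryoshka-mkqa")
teacher = SentenceTransformer("hypothetical/teacher-model")  # placeholder, not named in the card

english = ["who wrote the song i shot the sheriff"]
turkish = ["i shot the sheriff şarkısını kim besteledi"]

evaluator = MSEEvaluator(
    source_sentences=english,   # encoded by the teacher
    target_sentences=turkish,   # encoded by the student
    teacher_model=teacher,
    name="val-en-to-tr",
)
print(evaluator(student))  # {'val-en-to-tr_negative_mse': ...}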

Training Details

Training Dataset

Unnamed Dataset

  • Size: 234,000 training samples
  • Columns: english, non-english, target, and label
  • Approximate statistics based on the first 1000 samples:

    |         | english | non-english | target | label |
    |:--------|:--------|:------------|:-------|:------|
    | type    | string  | string      | string | list  |
    | details | min: 10 tokens, mean: 12.34 tokens, max: 18 tokens | min: 3 tokens, mean: 14.41 tokens, max: 49 tokens | min: 3 tokens, mean: 3.38 tokens, max: 7 tokens | size: 768 elements |
  • Samples:

    | english | non-english | target | label |
    |:--------|:------------|:-------|:------|
    | who plays hope on days of our lives | من الذي يلعب الأمل في أيام حياتنا | ar | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...] |
    | who plays hope on days of our lives | hvem spiller hope i Horton-sagaen | da | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...] |
    | who plays hope on days of our lives | Wer spielt die Hope in Zeit der Sehnsucht? | de | [0.2171212136745453, 0.5138550996780396, 0.5517176389694214, -1.0655105113983154, 1.5853567123413086, ...] |
  • Loss: MSELoss
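
A hedged sketch of how rows with these columns feed MSELoss: every text column is embedded by the student and pulled toward the teacher vector stored in label. The teacher identifier and the one-row dataset are illustrative only.

from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MSELoss

student = SentenceTransformer("google-bert/bert-base-multilingual-cased")
teacher = SentenceTransformer("hypothetical/teacher-model")  # placeholder, not named in the card

english = ["who plays hope on days of our lives"]
train_dataset = Dataset.from_dict({
    "english": english,
    "non-english": ["Wer spielt die Hope in Zeit der Sehnsucht?"],
    "target": ["de"],  # language code, embedded like any other text column
    "label": teacher.encode(english).tolist(),  # 768 floats per row
})

# MSELoss minimizes the squared distance between the student's embedding of
# each input column and the teacher embedding stored in "label".
loss = MSELoss(student)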

Evaluation Dataset

Unnamed Dataset

  • Size: 13,000 evaluation samples
  • Columns: english, non-english, target, and label
  • Approximate statistics based on the first 1000 samples:

    |         | english | non-english | target | label |
    |:--------|:--------|:------------|:-------|:------|
    | type    | string  | string      | string | list  |
    | details | min: 10 tokens, mean: 12.44 tokens, max: 16 tokens | min: 3 tokens, mean: 14.48 tokens, max: 49 tokens | min: 3 tokens, mean: 3.38 tokens, max: 7 tokens | size: 768 elements |
  • Samples:

    | english | non-english | target | label |
    |:--------|:------------|:-------|:------|
    | who played prudence on nanny and the professor | من لعب دور "prudence" فى "nanny and the professor" | ar | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...] |
    | who played prudence on nanny and the professor | hvem spiller prudence på nanny and the professor | da | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...] |
    | who played prudence on nanny and the professor | Wer spielte Prudence in Nanny and the Professor | de | [-0.2837616801261902, -0.4943353235721588, 0.020107418298721313, 0.7796109318733215, -0.47365888953208923, ...] |
  • Loss: MSELoss

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 64
  • per_device_eval_batch_size: 64
  • learning_rate: 1e-05
  • num_train_epochs: 4
  • warmup_ratio: 0.1
  • fp16: True
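
These settings map one-to-one onto SentenceTransformerTrainingArguments. A minimal sketch wiring them into a trainer, assuming the student model, datasets, loss, and evaluator objects from the earlier snippets; output_dir is an assumption, since the card does not record it:

from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="outputs",  # assumption: not stated in the card
    eval_strategy="steps",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=1e-5,
    num_train_epochs=4,
    warmup_ratio=0.1,
    fp16=True,
)
trainer = SentenceTransformerTrainer(
    model=student,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # assumption: built like train_dataset
    loss=loss,
    evaluator=evaluator,
)
trainer.train()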

All Hyperparameters

  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 64
  • per_device_eval_batch_size: 64
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 1
  • eval_accumulation_steps: None
  • torch_empty_cache_steps: None
  • learning_rate: 1e-05
  • weight_decay: 0.0
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 4
  • max_steps: -1
  • lr_scheduler_type: linear
  • lr_scheduler_kwargs: {}
  • warmup_ratio: 0.1
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: True
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: False
  • resume_from_checkpoint: None
  • hub_model_id: None
  • hub_strategy: every_save
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • include_for_metrics: []
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • use_liger_kernel: False
  • eval_use_gather_object: False
  • average_tokens_across_devices: False
  • prompts: None
  • batch_sampler: batch_sampler
  • multi_dataset_batch_sampler: proportional

Training Logs

The per-language columns abbreviate the corresponding MSE-val-en-to-*_negative_mse metric.

| Epoch | Step | Training Loss | Validation Loss | ar | da | de | en | es | fi | fr | he | hu | it | ja | ko | km | ms | nl | no | pl | pt | ru | sv | th | tr | vi | zh_cn | zh_hk | zh_tw |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.1367 | 500 | 0.3783 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.2734 | 1000 | 0.3256 | 0.3071 | -30.0050 | -29.7152 | -29.7584 | -29.5204 | -29.6875 | -29.9032 | -29.6918 | -29.9795 | -29.9430 | -29.7142 | -29.8220 | -30.0745 | -32.1218 | -29.8042 | -29.7132 | -29.7625 | -29.7677 | -29.6658 | -29.8250 | -29.8242 | -30.1233 | -29.8640 | -29.7497 | -29.6833 | -29.7296 | -29.7063 |
| 0.4102 | 1500 | 0.3007 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.5469 | 2000 | 0.2795 | 0.2663 | -25.0193 | -23.8364 | -23.9924 | -22.8145 | -23.7158 | -24.4490 | -23.7719 | -24.6885 | -24.5973 | -23.7662 | -24.4998 | -25.3625 | -30.9153 | -24.0474 | -23.5674 | -23.7934 | -24.1332 | -23.6279 | -24.1308 | -23.8860 | -25.4166 | -24.4840 | -24.1931 | -24.0816 | -24.0634 | -24.2529 |
| 0.6836 | 2500 | 0.2659 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 0.8203 | 3000 | 0.2562 | 0.2487 | -22.9862 | -21.2544 | -21.4573 | -19.8714 | -21.1251 | -22.1884 | -21.1984 | -22.6963 | -22.3069 | -21.1959 | -22.3180 | -23.4410 | -30.2373 | -21.4324 | -20.8799 | -21.1834 | -21.7427 | -21.1291 | -21.7291 | -21.3003 | -23.2994 | -22.1537 | -21.7480 | -21.7521 | -21.6844 | -21.9702 |
| 0.9571 | 3500 | 0.2475 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 1.0938 | 4000 | 0.2411 | 0.2375 | -21.8220 | -19.6064 | -19.9128 | -17.9872 | -19.5372 | -20.7666 | -19.6563 | -21.4985 | -20.9295 | -19.6182 | -20.9963 | -22.2441 | -29.7291 | -19.8001 | -19.2003 | -19.5189 | -20.2697 | -19.5946 | -20.3160 | -19.6652 | -21.9553 | -20.6678 | -20.2305 | -20.3719 | -20.2700 | -20.6528 |
| 1.2305 | 4500 | 0.2351 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 1.3672 | 5000 | 0.23 | 0.2296 | -21.0058 | -18.4861 | -18.7926 | -16.6395 | -18.4034 | -19.7517 | -18.5299 | -20.6663 | -19.9769 | -18.4977 | -20.0496 | -21.4171 | -29.3272 | -18.6213 | -17.9746 | -18.3449 | -19.2392 | -18.4960 | -19.3377 | -18.5079 | -20.9805 | -19.5803 | -19.1385 | -19.4256 | -19.2708 | -19.7140 |
| 1.5040 | 5500 | 0.2257 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 1.6407 | 6000 | 0.2222 | 0.2245 | -20.4317 | -17.7592 | -18.1037 | -15.7487 | -17.6947 | -19.0287 | -17.8518 | -20.1401 | -19.3864 | -17.7539 | -19.4615 | -20.8562 | -29.1081 | -17.8707 | -17.1892 | -17.6230 | -18.5879 | -17.7857 | -18.7075 | -17.7347 | -20.2941 | -18.8814 | -18.4449 | -18.8036 | -18.6146 | -19.1169 |
| 1.7774 | 6500 | 0.2186 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 1.9141 | 7000 | 0.2158 | 0.2199 | -19.9961 | -17.0956 | -17.4488 | -14.9930 | -17.0238 | -18.4442 | -17.1720 | -19.6005 | -18.7765 | -17.1020 | -18.8972 | -20.3720 | -28.8656 | -17.1949 | -16.4824 | -16.9655 | -17.9687 | -17.1229 | -18.0911 | -17.0128 | -19.6600 | -18.2823 | -17.8109 | -18.2341 | -18.0582 | -18.5735 |
| 2.0509 | 7500 | 0.2135 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 2.1876 | 8000 | 0.2109 | 0.2167 | -19.6376 | -16.6362 | -17.0307 | -14.4461 | -16.5766 | -18.0419 | -16.7080 | -19.2403 | -18.3971 | -16.6443 | -18.5251 | -20.0263 | -28.7414 | -16.7279 | -15.9992 | -16.5092 | -17.5170 | -16.6766 | -17.7151 | -16.5403 | -19.2861 | -17.8316 | -17.3764 | -17.8453 | -17.6606 | -18.1844 |
| 2.3243 | 8500 | 0.2088 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 2.4610 | 9000 | 0.2074 | 0.2149 | -19.4358 | -16.3728 | -16.7740 | -14.1447 | -16.3289 | -17.8191 | -16.4582 | -19.0369 | -18.1738 | -16.3903 | -18.3565 | -19.8207 | -28.6133 | -16.4804 | -15.7354 | -16.2673 | -17.3034 | -16.4190 | -17.4826 | -16.2566 | -18.9971 | -17.5950 | -17.1273 | -17.6066 | -17.4124 | -17.9799 |
| 2.5978 | 9500 | 0.2059 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 2.7345 | 10000 | 0.2047 | 0.2134 | -19.2764 | -16.1718 | -16.5449 | -13.8928 | -16.1098 | -17.5866 | -16.2421 | -18.8665 | -17.9798 | -16.1538 | -18.1695 | -19.6218 | -28.5605 | -16.2479 | -15.4962 | -16.0522 | -17.0797 | -16.2106 | -17.3130 | -16.0278 | -18.8206 | -17.3910 | -16.9231 | -17.4203 | -17.2266 | -17.7903 |
| 2.8712 | 10500 | 0.2033 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 3.0079 | 11000 | 0.2024 | 0.2120 | -19.1026 | -15.9149 | -16.3497 | -13.6750 | -15.8828 | -17.3842 | -16.0397 | -18.6612 | -17.7796 | -15.9436 | -17.9779 | -19.4370 | -28.4678 | -16.0245 | -15.2818 | -15.8265 | -16.8594 | -15.9988 | -17.1163 | -15.8106 | -18.5870 | -17.1548 | -16.7074 | -17.2082 | -17.0233 | -17.5910 |
| 3.1447 | 11500 | 0.201 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 3.2814 | 12000 | 0.2004 | 0.2112 | -19.0406 | -15.8196 | -16.2516 | -13.5420 | -15.7688 | -17.2734 | -15.9280 | -18.5894 | -17.6966 | -15.8265 | -17.8933 | -19.3785 | -28.4539 | -15.9129 | -15.1631 | -15.7175 | -16.7540 | -15.8974 | -17.0251 | -15.6875 | -18.4807 | -17.0615 | -16.6087 | -17.1051 | -16.9423 | -17.4923 |
| 3.4181 | 12500 | 0.1997 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 3.5548 | 13000 | 0.1995 | 0.2108 | -18.9779 | -15.7524 | -16.1996 | -13.4723 | -15.7211 | -17.2272 | -15.8790 | -18.5412 | -17.6416 | -15.7862 | -17.8502 | -19.3124 | -28.4179 | -15.8513 | -15.1030 | -15.6645 | -16.7053 | -15.8355 | -16.9742 | -15.6246 | -18.4384 | -17.0053 | -16.5478 | -17.0674 | -16.8851 | -17.4527 |
| 3.6916 | 13500 | 0.1991 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |
| 3.8283 | 14000 | 0.1987 | 0.2103 | -18.9326 | -15.6858 | -16.1256 | -13.3884 | -15.6481 | -17.1741 | -15.8143 | -18.4839 | -17.5854 | -15.7066 | -17.8007 | -19.2666 | -28.3875 | -15.7831 | -15.0272 | -15.5984 | -16.6414 | -15.7691 | -16.9116 | -15.5558 | -18.3703 | -16.9459 | -16.4828 | -16.9964 | -16.8207 | -17.3817 |
| 3.9650 | 14500 | 0.1989 | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - | - |

Framework Versions

  • Python: 3.10.12
  • Sentence Transformers: 3.3.1
  • Transformers: 4.46.3
  • PyTorch: 2.5.1+cu121
  • Accelerate: 1.1.1
  • Datasets: 3.1.0
  • Tokenizers: 0.20.3

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

MSELoss

@inproceedings{reimers-2020-multilingual-sentence-bert,
    title = "Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2020",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/2004.09813",
}