# Training configuration for Matryoshka embeddings on sarvam-1.
# Fixed: the file had been collapsed onto a single line, which is invalid
# TOML (each key/value pair must be on its own line). Keys and values are
# unchanged.

[model]
dim = 2048                          # hidden size of the base model
n_vocab = 68096                     # vocabulary size
hf_model_id = "sarvamai/sarvam-1"
# NOTE(review): empty pad token — presumably the consumer substitutes a
# real token (e.g. eos) at load time; confirm before training.
pad_token = ""

[tokenizer]
return_tensors = "pt"
return_attention_mask = true
max_length = 256                    # tokens; pairs with truncation below
padding = "longest"                 # pad to longest sequence in the batch
truncation = true
add_special_tokens = false

[training]
output_dir = "output/matryoshka_sarvam1"
num_train_epochs = 20
per_device_train_batch_size = 128
warmup_steps = 256
evaluation_strategy = "steps"       # evaluate every eval_steps
eval_steps = 2000
save_steps = 2000                   # kept equal to eval_steps so checkpoints align with evals
fp16 = true
include_num_input_tokens_seen = false
learning_rate = 3e-4
multi_dataset_batch_sampler = "PROPORTIONAL"
# NOTE(review): straight-through-estimator surrogate for binarization —
# semantics defined by the consuming trainer; verify "tanh" is a valid choice.
binarizer_ste = "tanh"

[matryoshka]
# Nested embedding dimensions, largest first; each prefix of the embedding
# is trained to be usable on its own.
dims = [1024, 512, 256, 128, 64]