# Using tevatron (unpushed code): distill an mBERT student from a monoXLMR
# teacher with margin-MSE on MS MARCO passages.
#
# The per-device batch size is derived from the target effective batch size:
#   effective_bs = per_device_train_batch_size * gradient_accumulation_steps
bs=512
lr=1e-5
gradient_accumulation_steps=8
real_bs=$(( bs / gradient_accumulation_steps ))
echo "real_bs: $real_bs"
echo "expected_bs: $bs"

epoch=5
teacher=crystina-z/monoXLMR.pft-msmarco
dataset=Tevatron/msmarco-passage
dataset_name=enMarco

# commit_id tags the output dir; default to "unknown" when the caller does
# not export it (the original referenced an unset $commit_id, leaving a
# dangling "." at the end of the directory name).
commit_id=${commit_id:-unknown}

output_dir=margin-mse.distill/teacher-$(basename "$teacher").student-mbert.epoch-${epoch}.${bs}x2.lr.${lr}.data-${dataset_name}.${commit_id}
mkdir -p "$output_dir"

# BUGFIX: the original hard-coded --gradient_accumulation_steps 4, which
# contradicts real_bs (computed with 8) and silently gives an effective
# batch of 64 * 4 = 256 instead of the echoed 512. Pass the variable so the
# accumulation steps and real_bs always agree.
WANDB_PROJECT=distill \
python examples/distill_marginmse/distil_train.py \
  --output_dir "$output_dir" \
  --model_name_or_path bert-base-multilingual-cased \
  --teacher_model_name_or_path "$teacher" \
  --save_steps 1000 \
  --dataset_name "$dataset" \
  --fp16 \
  --per_device_train_batch_size "$real_bs" \
  --gradient_accumulation_steps "$gradient_accumulation_steps" \
  --train_n_passages 2 \
  --learning_rate "$lr" \
  --q_max_len 16 \
  --p_max_len 128 \
  --num_train_epochs "$epoch" \
  --logging_steps 500 \
  --overwrite_output_dir \
  --dataloader_num_workers 4