|
---
datasets:
- Tevatron/msmarco-passage
---
|
Trained with Tevatron (margin-MSE distillation of an mBERT student from a monoXLMR teacher); the training code has not been pushed upstream yet. Training script:

```bash
bs=32
lr=7e-6

gradient_accumulation_steps=1
real_bs=$(($bs / $gradient_accumulation_steps))
echo "real_bs: $real_bs"
echo "expected_bs: $bs"
sleep 1s

epoch=5
teacher=crystina-z/monoXLMR.pft-msmarco

dataset=Tevatron/msmarco-passage && dataset_name=enMarco |
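# NOTE: $device (GPU id for CUDA_VISIBLE_DEVICES) and $commit_id (a code-version
# tag appended to the run name, e.g. a short git commit hash) are assumed to be
# set in the environment before running this script.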
|
output_dir=margin-mse.distill/teacher-$(basename $teacher).student-mbert.epoch-${epoch}.${bs}x2.lr.$lr.data-$dataset_name.$commit_id
mkdir -p $output_dir

CUDA_VISIBLE_DEVICES=$device WANDB_PROJECT=distill \
python examples/distill_marginmse/distil_train.py \
  --output_dir $output_dir \
  --model_name_or_path bert-base-multilingual-cased \
  --teacher_model_name_or_path $teacher \
  --save_steps 1000 \
  --dataset_name $dataset \
  --fp16 \
  --per_device_train_batch_size $real_bs \
  --gradient_accumulation_steps $gradient_accumulation_steps \
  --train_n_passages 2 \
  --learning_rate $lr \
  --q_max_len 16 \
  --p_max_len 128 \
  --num_train_epochs $epoch \
  --logging_steps 500 \
  --overwrite_output_dir \
  --dataloader_num_workers 4
``` |
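
For reference, here is a minimal sketch of how the distilled student could be used to score a query–passage pair, assuming the checkpoint saved in `$output_dir` loads as a plain `transformers` encoder and that CLS-token pooling with dot-product similarity applies (Tevatron's usual dense setup); the checkpoint path and example texts below are placeholders:

```python
# Hedged sketch: load the distilled mBERT student and score one query-passage pair.
# Assumptions: the checkpoint is a standard transformers encoder, the [CLS] hidden
# state is the dense representation, and relevance is the dot product of embeddings.
import torch
from transformers import AutoModel, AutoTokenizer

ckpt = "margin-mse.distill/..."  # placeholder: path to the trained $output_dir
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt).eval()

def encode(text: str, max_len: int) -> torch.Tensor:
    # Tokenize, run the encoder, and keep the [CLS] vector as the embedding.
    inputs = tokenizer(text, truncation=True, max_length=max_len, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    return hidden[:, 0]

q_emb = encode("what is margin-mse distillation", max_len=16)  # matches --q_max_len
p_emb = encode("Margin-MSE trains a dense student to match a cross-encoder "
               "teacher's score margins.", max_len=128)        # matches --p_max_len
print((q_emb * p_emb).sum(-1).item())  # higher dot product = more relevant
```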