MODEL="t5-base-dutch" MODEL_DIR="${HOME}/${MODEL}" mkdir -p "${MODEL_DIR}/runs" # T5 paper lr 0.01 with batch size 128 # We have a batch size of 8 devices * 32 = 256, so lr = 0.01/2 #SEED=9200 # #./run_t5_mlm_flax_custom_dataset.py \ # --output_dir="${MODEL_DIR}" \ # --model_type="t5" \ # --config_name="flax-community/${MODEL}" \ # --tokenizer_name="${MODEL_DIR}" \ # --seed="${SEED}" \ # --preprocessing_num_workers="96" \ # --do_train --do_eval \ # --adafactor \ # --max_seq_length="512" \ # --per_device_train_batch_size="32" \ # --per_device_eval_batch_size="32" \ # --dtype="bfloat16" \ # --learning_rate="5e-3" \ # --overwrite_output_dir \ # --num_train_epochs="3" \ # --logging_steps="50" \ # --save_steps="100" \ # --eval_steps="5000" \ # --warmup_steps="3413" #exit while true; do # Set the seed to random before each run, so date shuffling per epoch is different each run. # This kills reproducibility, but is required as long as during training ValueError can be raised. # SEED=$RANDOM SEED=22384 ./run_t5_mlm_flax_custom_dataset.py \ --output_dir="${MODEL_DIR}" \ --model_type="t5" \ --config_name="flax-community/${MODEL}" \ --tokenizer_name="${MODEL_DIR}" \ --seed="${SEED}" \ --preprocessing_num_workers="96" \ --do_train --do_eval \ --adafactor \ --max_seq_length="512" \ --per_device_train_batch_size="16" \ --per_device_eval_batch_size="16" \ --dtype="bfloat16" \ --learning_rate="1e-3" \ --overwrite_output_dir \ --num_train_epochs="1" \ --logging_steps="50" \ --save_steps="500" \ --eval_steps="5000" \ --resume_from_checkpoint="${MODEL_DIR}" \ --warmup_steps="6519" # \ # --push_to_hub echo "RESTARTING" sleep 20 done # # \ #git add pytorch_model.bin #git commit -m "Update pytorch model after training" #git push origin main # --gradient_accumulation_steps="2" \ # --resume_from_checkpoint="${MODEL_DIR}/ckpt-18000" \