|
{ |
|
"best_metric": 0.8064516129032258, |
|
"best_model_checkpoint": "distilbert-base-multilingual-cased-hyper-matt/run-rtpz8b71/checkpoint-800", |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 800, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.5273020267486572, |
|
"learning_rate": 3.379454045530049e-05, |
|
"loss": 0.672, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 7.657852649688721, |
|
"learning_rate": 3.336676146219542e-05, |
|
"loss": 0.53, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.539841651916504, |
|
"learning_rate": 3.293898246909035e-05, |
|
"loss": 0.5481, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.097090244293213, |
|
"learning_rate": 3.251120347598528e-05, |
|
"loss": 0.2841, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.80979061126709, |
|
"learning_rate": 3.208342448288021e-05, |
|
"loss": 0.6117, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.640440464019775, |
|
"learning_rate": 3.1655645489775144e-05, |
|
"loss": 0.5832, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.1154446601867676, |
|
"learning_rate": 3.1227866496670074e-05, |
|
"loss": 0.4628, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.84663724899292, |
|
"learning_rate": 3.0800087503565005e-05, |
|
"loss": 0.4354, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 15.556452751159668, |
|
"learning_rate": 3.0372308510459932e-05, |
|
"loss": 0.3762, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 5.357182025909424, |
|
"learning_rate": 2.9944529517354863e-05, |
|
"loss": 0.4135, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 5.593770980834961, |
|
"learning_rate": 2.9516750524249797e-05, |
|
"loss": 0.3815, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.29806661605835, |
|
"learning_rate": 2.9088971531144725e-05, |
|
"loss": 0.4898, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.596214771270752, |
|
"learning_rate": 2.8661192538039655e-05, |
|
"loss": 0.2377, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 13.040666580200195, |
|
"learning_rate": 2.8233413544934586e-05, |
|
"loss": 0.4523, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.686233997344971, |
|
"learning_rate": 2.7805634551829517e-05, |
|
"loss": 0.2944, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.4252870082855225, |
|
"learning_rate": 2.7377855558724448e-05, |
|
"loss": 0.4242, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 5.249920845031738, |
|
"learning_rate": 2.695007656561938e-05, |
|
"loss": 0.3134, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.831853866577148, |
|
"learning_rate": 2.652229757251431e-05, |
|
"loss": 0.4533, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 5.20128059387207, |
|
"learning_rate": 2.6094518579409237e-05, |
|
"loss": 0.3014, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5.756709098815918, |
|
"learning_rate": 2.5666739586304167e-05, |
|
"loss": 0.4128, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.845, |
|
"eval_f1": 0.6804123711340206, |
|
"eval_loss": 0.3612925708293915, |
|
"eval_precision": 0.9295774647887324, |
|
"eval_recall": 0.5365853658536586, |
|
"eval_runtime": 1.5276, |
|
"eval_samples_per_second": 261.844, |
|
"eval_steps_per_second": 16.365, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 5.490586757659912, |
|
"learning_rate": 2.52389605931991e-05, |
|
"loss": 0.3643, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 6.718981742858887, |
|
"learning_rate": 2.481118160009403e-05, |
|
"loss": 0.2173, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.35438114404678345, |
|
"learning_rate": 2.438340260698896e-05, |
|
"loss": 0.2357, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 7.001437664031982, |
|
"learning_rate": 2.395562361388389e-05, |
|
"loss": 0.2616, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 6.03153133392334, |
|
"learning_rate": 2.352784462077882e-05, |
|
"loss": 0.1749, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 29.80844497680664, |
|
"learning_rate": 2.3100065627673752e-05, |
|
"loss": 0.3992, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.6485345363616943, |
|
"learning_rate": 2.2672286634568683e-05, |
|
"loss": 0.1931, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.2925957143306732, |
|
"learning_rate": 2.2244507641463614e-05, |
|
"loss": 0.3383, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 11.773463249206543, |
|
"learning_rate": 2.181672864835854e-05, |
|
"loss": 0.4417, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.6896932125091553, |
|
"learning_rate": 2.1388949655253475e-05, |
|
"loss": 0.3196, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.8070700168609619, |
|
"learning_rate": 2.0961170662148406e-05, |
|
"loss": 0.1319, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.24381496012210846, |
|
"learning_rate": 2.0533391669043333e-05, |
|
"loss": 0.3669, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 7.7754669189453125, |
|
"learning_rate": 2.0105612675938267e-05, |
|
"loss": 0.397, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 3.648085594177246, |
|
"learning_rate": 1.9677833682833195e-05, |
|
"loss": 0.4326, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 19.058910369873047, |
|
"learning_rate": 1.9250054689728126e-05, |
|
"loss": 0.3497, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.9193096160888672, |
|
"learning_rate": 1.882227569662306e-05, |
|
"loss": 0.3383, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 5.150022029876709, |
|
"learning_rate": 1.8394496703517987e-05, |
|
"loss": 0.2809, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.5875343680381775, |
|
"learning_rate": 1.7966717710412918e-05, |
|
"loss": 0.2199, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 9.385665893554688, |
|
"learning_rate": 1.753893871730785e-05, |
|
"loss": 0.4043, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.588594913482666, |
|
"learning_rate": 1.711115972420278e-05, |
|
"loss": 0.1863, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.865, |
|
"eval_f1": 0.7906976744186046, |
|
"eval_loss": 0.33189040422439575, |
|
"eval_precision": 0.7555555555555555, |
|
"eval_recall": 0.8292682926829268, |
|
"eval_runtime": 1.5225, |
|
"eval_samples_per_second": 262.722, |
|
"eval_steps_per_second": 16.42, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.32351887226104736, |
|
"learning_rate": 1.668338073109771e-05, |
|
"loss": 0.1385, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.6892914175987244, |
|
"learning_rate": 1.625560173799264e-05, |
|
"loss": 0.3113, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.4579280912876129, |
|
"learning_rate": 1.5827822744887572e-05, |
|
"loss": 0.1769, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 9.545488357543945, |
|
"learning_rate": 1.5400043751782503e-05, |
|
"loss": 0.2754, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.9151611328125, |
|
"learning_rate": 1.4972264758677432e-05, |
|
"loss": 0.2701, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.5319097638130188, |
|
"learning_rate": 1.4544485765572362e-05, |
|
"loss": 0.098, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.9106752276420593, |
|
"learning_rate": 1.4116706772467293e-05, |
|
"loss": 0.0424, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 4.542623043060303, |
|
"learning_rate": 1.3688927779362224e-05, |
|
"loss": 0.3205, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 2.6717588901519775, |
|
"learning_rate": 1.3261148786257155e-05, |
|
"loss": 0.3126, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 63.507198333740234, |
|
"learning_rate": 1.2833369793152084e-05, |
|
"loss": 0.2623, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 33.924095153808594, |
|
"learning_rate": 1.2405590800047015e-05, |
|
"loss": 0.1062, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 8.15170669555664, |
|
"learning_rate": 1.1977811806941945e-05, |
|
"loss": 0.3471, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 34.99510192871094, |
|
"learning_rate": 1.1550032813836876e-05, |
|
"loss": 0.3089, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 9.204129219055176, |
|
"learning_rate": 1.1122253820731807e-05, |
|
"loss": 0.2824, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 7.578718185424805, |
|
"learning_rate": 1.0694474827626738e-05, |
|
"loss": 0.0619, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 3.9372365474700928, |
|
"learning_rate": 1.0266695834521667e-05, |
|
"loss": 0.341, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 5.813101291656494, |
|
"learning_rate": 9.838916841416597e-06, |
|
"loss": 0.0413, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 54.38895034790039, |
|
"learning_rate": 9.41113784831153e-06, |
|
"loss": 0.2031, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 42.406883239746094, |
|
"learning_rate": 8.983358855206459e-06, |
|
"loss": 0.2095, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.2973996102809906, |
|
"learning_rate": 8.55557986210139e-06, |
|
"loss": 0.0812, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8725, |
|
"eval_f1": 0.7866108786610879, |
|
"eval_loss": 0.4548227787017822, |
|
"eval_precision": 0.8103448275862069, |
|
"eval_recall": 0.7642276422764228, |
|
"eval_runtime": 1.5153, |
|
"eval_samples_per_second": 263.982, |
|
"eval_steps_per_second": 16.499, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.10320937633514404, |
|
"learning_rate": 8.12780086899632e-06, |
|
"loss": 0.0107, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.14922451972961426, |
|
"learning_rate": 7.700021875891251e-06, |
|
"loss": 0.0992, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 58.235931396484375, |
|
"learning_rate": 7.272242882786181e-06, |
|
"loss": 0.0839, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.5583060383796692, |
|
"learning_rate": 6.844463889681112e-06, |
|
"loss": 0.188, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.6185976266860962, |
|
"learning_rate": 6.416684896576042e-06, |
|
"loss": 0.1084, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.10940929502248764, |
|
"learning_rate": 5.988905903470973e-06, |
|
"loss": 0.0349, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 1.7489935159683228, |
|
"learning_rate": 5.561126910365903e-06, |
|
"loss": 0.0542, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.22723452746868134, |
|
"learning_rate": 5.133347917260833e-06, |
|
"loss": 0.1929, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.11978468298912048, |
|
"learning_rate": 4.705568924155765e-06, |
|
"loss": 0.0968, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.12295297533273697, |
|
"learning_rate": 4.277789931050695e-06, |
|
"loss": 0.0746, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 1.1583808660507202, |
|
"learning_rate": 3.850010937945626e-06, |
|
"loss": 0.0119, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.05860567465424538, |
|
"learning_rate": 3.422231944840556e-06, |
|
"loss": 0.1999, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.3195663094520569, |
|
"learning_rate": 2.9944529517354863e-06, |
|
"loss": 0.1388, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.07110361754894257, |
|
"learning_rate": 2.5666739586304167e-06, |
|
"loss": 0.2383, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.10332443565130234, |
|
"learning_rate": 2.1388949655253474e-06, |
|
"loss": 0.1156, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.222213014960289, |
|
"learning_rate": 1.711115972420278e-06, |
|
"loss": 0.0304, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.5618990659713745, |
|
"learning_rate": 1.2833369793152083e-06, |
|
"loss": 0.1119, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 5.86262845993042, |
|
"learning_rate": 8.55557986210139e-07, |
|
"loss": 0.1783, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.07000931352376938, |
|
"learning_rate": 4.277789931050695e-07, |
|
"loss": 0.0181, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.0716305747628212, |
|
"learning_rate": 0.0, |
|
"loss": 0.1302, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.88, |
|
"eval_f1": 0.8064516129032258, |
|
"eval_loss": 0.5255711078643799, |
|
"eval_precision": 0.8, |
|
"eval_recall": 0.8130081300813008, |
|
"eval_runtime": 3.5354, |
|
"eval_samples_per_second": 113.14, |
|
"eval_steps_per_second": 7.071, |
|
"step": 800 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 800, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 847261481803776.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": { |
|
"_wandb": {}, |
|
"assignments": {}, |
|
"learning_rate": 3.422231944840556e-05, |
|
"metric": "eval/loss", |
|
"num_train_epochs": 4, |
|
"per_device_train_batch_size": 8, |
|
"seed": 28 |
|
} |
|
} |
|
|