|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.5384615384615383, |
|
"eval_steps": 9, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"grad_norm": 0.1967356950044632, |
|
"learning_rate": 1e-05, |
|
"loss": 10.3769, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"eval_loss": 10.386632919311523, |
|
"eval_runtime": 0.079, |
|
"eval_samples_per_second": 1380.359, |
|
"eval_steps_per_second": 50.655, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"grad_norm": 0.2022944986820221, |
|
"learning_rate": 2e-05, |
|
"loss": 10.3764, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"grad_norm": 0.18788686394691467, |
|
"learning_rate": 3e-05, |
|
"loss": 10.382, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 0.19782385230064392, |
|
"learning_rate": 4e-05, |
|
"loss": 10.3854, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 0.2000645250082016, |
|
"learning_rate": 5e-05, |
|
"loss": 10.3846, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"grad_norm": 0.21505430340766907, |
|
"learning_rate": 6e-05, |
|
"loss": 10.3951, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"grad_norm": 0.19195610284805298, |
|
"learning_rate": 7e-05, |
|
"loss": 10.4099, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 0.20345796644687653, |
|
"learning_rate": 8e-05, |
|
"loss": 10.4002, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"grad_norm": 0.2024480700492859, |
|
"learning_rate": 9e-05, |
|
"loss": 10.4227, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"eval_loss": 10.384278297424316, |
|
"eval_runtime": 0.0734, |
|
"eval_samples_per_second": 1484.068, |
|
"eval_steps_per_second": 54.461, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.20689363777637482, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3974, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"grad_norm": 0.2552158832550049, |
|
"learning_rate": 9.996740476948385e-05, |
|
"loss": 10.3596, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 0.2340978980064392, |
|
"learning_rate": 9.98696615758975e-05, |
|
"loss": 10.383, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2439499795436859, |
|
"learning_rate": 9.970689785771798e-05, |
|
"loss": 10.3855, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"grad_norm": 0.21027112007141113, |
|
"learning_rate": 9.947932582778188e-05, |
|
"loss": 10.3948, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 0.2078929841518402, |
|
"learning_rate": 9.918724219660013e-05, |
|
"loss": 10.3575, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 0.20251798629760742, |
|
"learning_rate": 9.883102778550434e-05, |
|
"loss": 10.3808, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"grad_norm": 0.19867950677871704, |
|
"learning_rate": 9.841114703012817e-05, |
|
"loss": 10.3645, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"grad_norm": 0.22018198668956757, |
|
"learning_rate": 9.792814737487207e-05, |
|
"loss": 10.4185, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"eval_loss": 10.378647804260254, |
|
"eval_runtime": 0.0748, |
|
"eval_samples_per_second": 1456.458, |
|
"eval_steps_per_second": 53.448, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"grad_norm": 0.24947059154510498, |
|
"learning_rate": 9.738265855914013e-05, |
|
"loss": 10.3531, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.2289813756942749, |
|
"learning_rate": 9.677539179628005e-05, |
|
"loss": 10.3866, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"grad_norm": 0.22504808008670807, |
|
"learning_rate": 9.610713884629666e-05, |
|
"loss": 10.3638, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"grad_norm": 0.24337053298950195, |
|
"learning_rate": 9.537877098354786e-05, |
|
"loss": 10.3807, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"grad_norm": 0.22690436244010925, |
|
"learning_rate": 9.459123786076912e-05, |
|
"loss": 10.3491, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 0.2087947279214859, |
|
"learning_rate": 9.374556627090749e-05, |
|
"loss": 10.3742, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.21224865317344666, |
|
"learning_rate": 9.284285880837946e-05, |
|
"loss": 10.3702, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.2412348985671997, |
|
"learning_rate": 9.188429243149824e-05, |
|
"loss": 10.3665, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"grad_norm": 0.2303854376077652, |
|
"learning_rate": 9.087111692794459e-05, |
|
"loss": 10.3636, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"eval_loss": 10.372750282287598, |
|
"eval_runtime": 0.0826, |
|
"eval_samples_per_second": 1319.489, |
|
"eval_steps_per_second": 48.422, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 0.2295921891927719, |
|
"learning_rate": 8.980465328528219e-05, |
|
"loss": 10.3577, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"grad_norm": 0.2032311111688614, |
|
"learning_rate": 8.868629196864182e-05, |
|
"loss": 10.3765, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.22687426209449768, |
|
"learning_rate": 8.751749110782012e-05, |
|
"loss": 10.3815, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"grad_norm": 0.22927479445934296, |
|
"learning_rate": 8.629977459615655e-05, |
|
"loss": 10.3654, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 0.23933324217796326, |
|
"learning_rate": 8.503473010366713e-05, |
|
"loss": 10.3814, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"grad_norm": 0.32213953137397766, |
|
"learning_rate": 8.37240070070257e-05, |
|
"loss": 14.7436, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"grad_norm": 0.2868359386920929, |
|
"learning_rate": 8.236931423909138e-05, |
|
"loss": 11.6626, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 0.21313560009002686, |
|
"learning_rate": 8.097241806078615e-05, |
|
"loss": 9.8745, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 0.2324608415365219, |
|
"learning_rate": 7.953513975822755e-05, |
|
"loss": 9.8428, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"eval_loss": 10.366357803344727, |
|
"eval_runtime": 0.0742, |
|
"eval_samples_per_second": 1469.155, |
|
"eval_steps_per_second": 53.914, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"grad_norm": 0.27083510160446167, |
|
"learning_rate": 7.805935326811912e-05, |
|
"loss": 9.634, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"grad_norm": 0.30902621150016785, |
|
"learning_rate": 7.654698273449435e-05, |
|
"loss": 11.505, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.35695773363113403, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 11.8729, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 0.23437827825546265, |
|
"learning_rate": 7.342042203498951e-05, |
|
"loss": 9.4989, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"grad_norm": 0.2547559440135956, |
|
"learning_rate": 7.181030830777837e-05, |
|
"loss": 8.6384, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"grad_norm": 0.26535457372665405, |
|
"learning_rate": 7.017175809949044e-05, |
|
"loss": 9.577, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"grad_norm": 0.34555530548095703, |
|
"learning_rate": 6.850690776699573e-05, |
|
"loss": 13.4974, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 0.2884383499622345, |
|
"learning_rate": 6.681792795750875e-05, |
|
"loss": 10.2425, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 0.26960813999176025, |
|
"learning_rate": 6.510702077847863e-05, |
|
"loss": 8.4271, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 10.359579086303711, |
|
"eval_runtime": 0.0835, |
|
"eval_samples_per_second": 1305.361, |
|
"eval_steps_per_second": 47.903, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"grad_norm": 0.37105077505111694, |
|
"learning_rate": 6.337641692646106e-05, |
|
"loss": 10.7346, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"grad_norm": 0.31993967294692993, |
|
"learning_rate": 6.162837277871553e-05, |
|
"loss": 10.932, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 0.31407782435417175, |
|
"learning_rate": 5.9865167451320005e-05, |
|
"loss": 10.4683, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"grad_norm": 0.29343822598457336, |
|
"learning_rate": 5.808909982763825e-05, |
|
"loss": 9.5898, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.40266671776771545, |
|
"learning_rate": 5.6302485561014475e-05, |
|
"loss": 12.5307, |
|
"step": 50 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 97, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 10460489318400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|