{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5384615384615383, "eval_steps": 9, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03076923076923077, "grad_norm": 0.1967356950044632, "learning_rate": 1e-05, "loss": 10.3769, "step": 1 }, { "epoch": 0.03076923076923077, "eval_loss": 10.386632919311523, "eval_runtime": 0.079, "eval_samples_per_second": 1380.359, "eval_steps_per_second": 50.655, "step": 1 }, { "epoch": 0.06153846153846154, "grad_norm": 0.2022944986820221, "learning_rate": 2e-05, "loss": 10.3764, "step": 2 }, { "epoch": 0.09230769230769231, "grad_norm": 0.18788686394691467, "learning_rate": 3e-05, "loss": 10.382, "step": 3 }, { "epoch": 0.12307692307692308, "grad_norm": 0.19782385230064392, "learning_rate": 4e-05, "loss": 10.3854, "step": 4 }, { "epoch": 0.15384615384615385, "grad_norm": 0.2000645250082016, "learning_rate": 5e-05, "loss": 10.3846, "step": 5 }, { "epoch": 0.18461538461538463, "grad_norm": 0.21505430340766907, "learning_rate": 6e-05, "loss": 10.3951, "step": 6 }, { "epoch": 0.2153846153846154, "grad_norm": 0.19195610284805298, "learning_rate": 7e-05, "loss": 10.4099, "step": 7 }, { "epoch": 0.24615384615384617, "grad_norm": 0.20345796644687653, "learning_rate": 8e-05, "loss": 10.4002, "step": 8 }, { "epoch": 0.27692307692307694, "grad_norm": 0.2024480700492859, "learning_rate": 9e-05, "loss": 10.4227, "step": 9 }, { "epoch": 0.27692307692307694, "eval_loss": 10.384278297424316, "eval_runtime": 0.0734, "eval_samples_per_second": 1484.068, "eval_steps_per_second": 54.461, "step": 9 }, { "epoch": 0.3076923076923077, "grad_norm": 0.20689363777637482, "learning_rate": 0.0001, "loss": 10.3974, "step": 10 }, { "epoch": 0.3384615384615385, "grad_norm": 0.2552158832550049, "learning_rate": 9.996740476948385e-05, "loss": 10.3596, "step": 11 }, { "epoch": 0.36923076923076925, "grad_norm": 0.2340978980064392, "learning_rate": 9.98696615758975e-05, "loss": 10.383, "step": 12 }, { "epoch": 0.4, "grad_norm": 0.2439499795436859, "learning_rate": 9.970689785771798e-05, "loss": 10.3855, "step": 13 }, { "epoch": 0.4307692307692308, "grad_norm": 0.21027112007141113, "learning_rate": 9.947932582778188e-05, "loss": 10.3948, "step": 14 }, { "epoch": 0.46153846153846156, "grad_norm": 0.2078929841518402, "learning_rate": 9.918724219660013e-05, "loss": 10.3575, "step": 15 }, { "epoch": 0.49230769230769234, "grad_norm": 0.20251798629760742, "learning_rate": 9.883102778550434e-05, "loss": 10.3808, "step": 16 }, { "epoch": 0.5230769230769231, "grad_norm": 0.19867950677871704, "learning_rate": 9.841114703012817e-05, "loss": 10.3645, "step": 17 }, { "epoch": 0.5538461538461539, "grad_norm": 0.22018198668956757, "learning_rate": 9.792814737487207e-05, "loss": 10.4185, "step": 18 }, { "epoch": 0.5538461538461539, "eval_loss": 10.378647804260254, "eval_runtime": 0.0748, "eval_samples_per_second": 1456.458, "eval_steps_per_second": 53.448, "step": 18 }, { "epoch": 0.5846153846153846, "grad_norm": 0.24947059154510498, "learning_rate": 9.738265855914013e-05, "loss": 10.3531, "step": 19 }, { "epoch": 0.6153846153846154, "grad_norm": 0.2289813756942749, "learning_rate": 9.677539179628005e-05, "loss": 10.3866, "step": 20 }, { "epoch": 0.6461538461538462, "grad_norm": 0.22504808008670807, "learning_rate": 9.610713884629666e-05, "loss": 10.3638, "step": 21 }, { "epoch": 0.676923076923077, "grad_norm": 0.24337053298950195, "learning_rate": 9.537877098354786e-05, "loss": 10.3807, "step": 22 }, { "epoch": 0.7076923076923077, "grad_norm": 0.22690436244010925, "learning_rate": 9.459123786076912e-05, "loss": 10.3491, "step": 23 }, { "epoch": 0.7384615384615385, "grad_norm": 0.2087947279214859, "learning_rate": 9.374556627090749e-05, "loss": 10.3742, "step": 24 }, { "epoch": 0.7692307692307693, "grad_norm": 0.21224865317344666, "learning_rate": 9.284285880837946e-05, "loss": 10.3702, "step": 25 }, { "epoch": 0.8, "grad_norm": 0.2412348985671997, "learning_rate": 9.188429243149824e-05, "loss": 10.3665, "step": 26 }, { "epoch": 0.8307692307692308, "grad_norm": 0.2303854376077652, "learning_rate": 9.087111692794459e-05, "loss": 10.3636, "step": 27 }, { "epoch": 0.8307692307692308, "eval_loss": 10.372750282287598, "eval_runtime": 0.0826, "eval_samples_per_second": 1319.489, "eval_steps_per_second": 48.422, "step": 27 }, { "epoch": 0.8615384615384616, "grad_norm": 0.2295921891927719, "learning_rate": 8.980465328528219e-05, "loss": 10.3577, "step": 28 }, { "epoch": 0.8923076923076924, "grad_norm": 0.2032311111688614, "learning_rate": 8.868629196864182e-05, "loss": 10.3765, "step": 29 }, { "epoch": 0.9230769230769231, "grad_norm": 0.22687426209449768, "learning_rate": 8.751749110782012e-05, "loss": 10.3815, "step": 30 }, { "epoch": 0.9538461538461539, "grad_norm": 0.22927479445934296, "learning_rate": 8.629977459615655e-05, "loss": 10.3654, "step": 31 }, { "epoch": 0.9846153846153847, "grad_norm": 0.23933324217796326, "learning_rate": 8.503473010366713e-05, "loss": 10.3814, "step": 32 }, { "epoch": 1.0153846153846153, "grad_norm": 0.32213953137397766, "learning_rate": 8.37240070070257e-05, "loss": 14.7436, "step": 33 }, { "epoch": 1.0461538461538462, "grad_norm": 0.2868359386920929, "learning_rate": 8.236931423909138e-05, "loss": 11.6626, "step": 34 }, { "epoch": 1.0769230769230769, "grad_norm": 0.21313560009002686, "learning_rate": 8.097241806078615e-05, "loss": 9.8745, "step": 35 }, { "epoch": 1.1076923076923078, "grad_norm": 0.2324608415365219, "learning_rate": 7.953513975822755e-05, "loss": 9.8428, "step": 36 }, { "epoch": 1.1076923076923078, "eval_loss": 10.366357803344727, "eval_runtime": 0.0742, "eval_samples_per_second": 1469.155, "eval_steps_per_second": 53.914, "step": 36 }, { "epoch": 1.1384615384615384, "grad_norm": 0.27083510160446167, "learning_rate": 7.805935326811912e-05, "loss": 9.634, "step": 37 }, { "epoch": 1.1692307692307693, "grad_norm": 0.30902621150016785, "learning_rate": 7.654698273449435e-05, "loss": 11.505, "step": 38 }, { "epoch": 1.2, "grad_norm": 0.35695773363113403, "learning_rate": 7.500000000000001e-05, "loss": 11.8729, "step": 39 }, { "epoch": 1.2307692307692308, "grad_norm": 0.23437827825546265, "learning_rate": 7.342042203498951e-05, "loss": 9.4989, "step": 40 }, { "epoch": 1.2615384615384615, "grad_norm": 0.2547559440135956, "learning_rate": 7.181030830777837e-05, "loss": 8.6384, "step": 41 }, { "epoch": 1.2923076923076924, "grad_norm": 0.26535457372665405, "learning_rate": 7.017175809949044e-05, "loss": 9.577, "step": 42 }, { "epoch": 1.323076923076923, "grad_norm": 0.34555530548095703, "learning_rate": 6.850690776699573e-05, "loss": 13.4974, "step": 43 }, { "epoch": 1.353846153846154, "grad_norm": 0.2884383499622345, "learning_rate": 6.681792795750875e-05, "loss": 10.2425, "step": 44 }, { "epoch": 1.3846153846153846, "grad_norm": 0.26960813999176025, "learning_rate": 6.510702077847863e-05, "loss": 8.4271, "step": 45 }, { "epoch": 1.3846153846153846, "eval_loss": 10.359579086303711, "eval_runtime": 0.0835, "eval_samples_per_second": 1305.361, "eval_steps_per_second": 47.903, "step": 45 }, { "epoch": 1.4153846153846155, "grad_norm": 0.37105077505111694, "learning_rate": 6.337641692646106e-05, "loss": 10.7346, "step": 46 }, { "epoch": 1.4461538461538461, "grad_norm": 0.31993967294692993, "learning_rate": 6.162837277871553e-05, "loss": 10.932, "step": 47 }, { "epoch": 1.476923076923077, "grad_norm": 0.31407782435417175, "learning_rate": 5.9865167451320005e-05, "loss": 10.4683, "step": 48 }, { "epoch": 1.5076923076923077, "grad_norm": 0.29343822598457336, "learning_rate": 5.808909982763825e-05, "loss": 9.5898, "step": 49 }, { "epoch": 1.5384615384615383, "grad_norm": 0.40266671776771545, "learning_rate": 5.6302485561014475e-05, "loss": 12.5307, "step": 50 } ], "logging_steps": 1, "max_steps": 97, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 10460489318400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }