{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0784, "eval_steps": 500, "global_step": 980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 62.10089111328125, "learning_rate": 1e-06, "loss": 4.5777, "step": 20 }, { "epoch": 0.0, "grad_norm": 39.39016342163086, "learning_rate": 2e-06, "loss": 4.4077, "step": 40 }, { "epoch": 0.0, "grad_norm": 54.24020767211914, "learning_rate": 3e-06, "loss": 4.4807, "step": 60 }, { "epoch": 0.01, "grad_norm": 30.161609649658203, "learning_rate": 4e-06, "loss": 4.5756, "step": 80 }, { "epoch": 0.01, "grad_norm": 40.131675720214844, "learning_rate": 4.9999999999999996e-06, "loss": 4.4352, "step": 100 }, { "epoch": 0.01, "grad_norm": 52.3621940612793, "learning_rate": 6e-06, "loss": 4.5096, "step": 120 }, { "epoch": 0.01, "grad_norm": 49.86561584472656, "learning_rate": 7e-06, "loss": 4.493, "step": 140 }, { "epoch": 0.01, "grad_norm": 20.034923553466797, "learning_rate": 8e-06, "loss": 4.4088, "step": 160 }, { "epoch": 0.01, "grad_norm": 50.790679931640625, "learning_rate": 9e-06, "loss": 4.4901, "step": 180 }, { "epoch": 0.02, "grad_norm": 48.5693473815918, "learning_rate": 9.999999999999999e-06, "loss": 4.3628, "step": 200 }, { "epoch": 0.02, "grad_norm": 37.95353698730469, "learning_rate": 1.1e-05, "loss": 4.3298, "step": 220 }, { "epoch": 0.02, "grad_norm": 35.7153434753418, "learning_rate": 1.2e-05, "loss": 4.2839, "step": 240 }, { "epoch": 0.02, "grad_norm": 91.47773742675781, "learning_rate": 1.3000000000000001e-05, "loss": 4.1238, "step": 260 }, { "epoch": 0.02, "grad_norm": 23.16193389892578, "learning_rate": 1.4e-05, "loss": 4.1245, "step": 280 }, { "epoch": 0.02, "grad_norm": 28.304485321044922, "learning_rate": 1.5e-05, "loss": 4.2198, "step": 300 }, { "epoch": 0.03, "grad_norm": 34.03230285644531, "learning_rate": 1.6e-05, "loss": 4.0958, "step": 320 }, { "epoch": 0.03, "grad_norm": 29.786975860595703, "learning_rate": 1.7e-05, "loss": 4.024, "step": 340 }, { "epoch": 0.03, "grad_norm": 33.04754638671875, "learning_rate": 1.8e-05, "loss": 4.0832, "step": 360 }, { "epoch": 0.03, "grad_norm": 28.68460464477539, "learning_rate": 1.9e-05, "loss": 3.9827, "step": 380 }, { "epoch": 0.03, "grad_norm": 26.463253021240234, "learning_rate": 1.9999999999999998e-05, "loss": 3.9454, "step": 400 }, { "epoch": 0.03, "grad_norm": 19.407127380371094, "learning_rate": 2.1e-05, "loss": 4.0119, "step": 420 }, { "epoch": 0.04, "grad_norm": 26.383380889892578, "learning_rate": 2.2e-05, "loss": 3.9554, "step": 440 }, { "epoch": 0.04, "grad_norm": 33.225223541259766, "learning_rate": 2.3000000000000003e-05, "loss": 3.8172, "step": 460 }, { "epoch": 0.04, "grad_norm": 26.000978469848633, "learning_rate": 2.4e-05, "loss": 3.8934, "step": 480 }, { "epoch": 0.04, "grad_norm": 28.714366912841797, "learning_rate": 2.5e-05, "loss": 3.9194, "step": 500 }, { "epoch": 0.04, "grad_norm": 28.721248626708984, "learning_rate": 2.6000000000000002e-05, "loss": 3.8144, "step": 520 }, { "epoch": 0.04, "grad_norm": 24.934555053710938, "learning_rate": 2.7000000000000002e-05, "loss": 3.9166, "step": 540 }, { "epoch": 0.04, "grad_norm": 23.113840103149414, "learning_rate": 2.8e-05, "loss": 3.8248, "step": 560 }, { "epoch": 0.05, "grad_norm": 21.58758544921875, "learning_rate": 2.9e-05, "loss": 3.7538, "step": 580 }, { "epoch": 0.05, "grad_norm": 22.339618682861328, "learning_rate": 3e-05, "loss": 3.726, "step": 600 }, { "epoch": 0.05, "grad_norm": 49.25693893432617, "learning_rate": 2.999992132854894e-05, "loss": 3.8692, "step": 620 }, { "epoch": 0.05, "grad_norm": 45.1494026184082, "learning_rate": 2.999968531502098e-05, "loss": 3.7374, "step": 640 }, { "epoch": 0.05, "grad_norm": 72.25853729248047, "learning_rate": 2.99992919618918e-05, "loss": 3.7735, "step": 660 }, { "epoch": 0.05, "grad_norm": 39.445220947265625, "learning_rate": 2.999874127328748e-05, "loss": 3.759, "step": 680 }, { "epoch": 0.06, "grad_norm": 21.18370246887207, "learning_rate": 2.9998033254984483e-05, "loss": 3.7841, "step": 700 }, { "epoch": 0.06, "grad_norm": 24.310373306274414, "learning_rate": 2.999716791440959e-05, "loss": 3.679, "step": 720 }, { "epoch": 0.06, "grad_norm": 36.432350158691406, "learning_rate": 2.9996145260639812e-05, "loss": 3.6796, "step": 740 }, { "epoch": 0.06, "grad_norm": 32.12275314331055, "learning_rate": 2.9994965304402304e-05, "loss": 3.7613, "step": 760 }, { "epoch": 0.06, "grad_norm": 38.32442855834961, "learning_rate": 2.999362805807425e-05, "loss": 3.7586, "step": 780 }, { "epoch": 0.06, "grad_norm": 30.289432525634766, "learning_rate": 2.9992133535682725e-05, "loss": 3.6919, "step": 800 }, { "epoch": 0.07, "grad_norm": 32.69138717651367, "learning_rate": 2.9990481752904566e-05, "loss": 3.6855, "step": 820 }, { "epoch": 0.07, "grad_norm": 46.554874420166016, "learning_rate": 2.9988672727066197e-05, "loss": 3.7201, "step": 840 }, { "epoch": 0.07, "grad_norm": 28.671123504638672, "learning_rate": 2.9986706477143436e-05, "loss": 3.6594, "step": 860 }, { "epoch": 0.07, "grad_norm": 49.44480895996094, "learning_rate": 2.9984583023761318e-05, "loss": 3.7271, "step": 880 }, { "epoch": 0.07, "grad_norm": 26.61457061767578, "learning_rate": 2.998230238919386e-05, "loss": 3.7376, "step": 900 }, { "epoch": 0.07, "grad_norm": 27.453275680541992, "learning_rate": 2.9979864597363846e-05, "loss": 3.6716, "step": 920 }, { "epoch": 0.08, "grad_norm": 22.791175842285156, "learning_rate": 2.9977269673842554e-05, "loss": 3.6172, "step": 940 }, { "epoch": 0.08, "grad_norm": 58.2718620300293, "learning_rate": 2.997451764584951e-05, "loss": 3.7494, "step": 960 }, { "epoch": 0.08, "grad_norm": 33.610286712646484, "learning_rate": 2.9971608542252175e-05, "loss": 3.7077, "step": 980 } ], "logging_steps": 20, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "total_flos": 2331220953563136.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }