{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5780346820809249, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005780346820809248, "eval_loss": 1.637251377105713, "eval_runtime": 12.7741, "eval_samples_per_second": 22.859, "eval_steps_per_second": 2.896, "step": 1 }, { "epoch": 0.017341040462427744, "grad_norm": 1.9862357378005981, "learning_rate": 1.5e-05, "loss": 6.4468, "step": 3 }, { "epoch": 0.03468208092485549, "grad_norm": 2.358619213104248, "learning_rate": 3e-05, "loss": 6.4951, "step": 6 }, { "epoch": 0.05202312138728324, "grad_norm": 2.2806968688964844, "learning_rate": 4.5e-05, "loss": 6.5117, "step": 9 }, { "epoch": 0.05202312138728324, "eval_loss": 1.5940098762512207, "eval_runtime": 12.9111, "eval_samples_per_second": 22.616, "eval_steps_per_second": 2.866, "step": 9 }, { "epoch": 0.06936416184971098, "grad_norm": 2.385481119155884, "learning_rate": 4.993910125649561e-05, "loss": 6.2864, "step": 12 }, { "epoch": 0.08670520231213873, "grad_norm": 3.0656661987304688, "learning_rate": 4.962019382530521e-05, "loss": 6.0474, "step": 15 }, { "epoch": 0.10404624277456648, "grad_norm": 2.061871290206909, "learning_rate": 4.9031542398457974e-05, "loss": 5.6145, "step": 18 }, { "epoch": 0.10404624277456648, "eval_loss": 1.4199366569519043, "eval_runtime": 13.0476, "eval_samples_per_second": 22.38, "eval_steps_per_second": 2.836, "step": 18 }, { "epoch": 0.12138728323699421, "grad_norm": 1.7540128231048584, "learning_rate": 4.817959636416969e-05, "loss": 5.5856, "step": 21 }, { "epoch": 0.13872832369942195, "grad_norm": 1.9189330339431763, "learning_rate": 4.707368982147318e-05, "loss": 5.3653, "step": 24 }, { "epoch": 0.15606936416184972, "grad_norm": 1.8402730226516724, "learning_rate": 4.572593931387604e-05, "loss": 5.1259, "step": 27 }, { "epoch": 0.15606936416184972, "eval_loss": 1.310396432876587, "eval_runtime": 12.9831, "eval_samples_per_second": 22.491, "eval_steps_per_second": 2.85, "step": 27 }, { "epoch": 0.17341040462427745, "grad_norm": 1.8337098360061646, "learning_rate": 4.415111107797445e-05, "loss": 5.1098, "step": 30 }, { "epoch": 0.1907514450867052, "grad_norm": 1.904125452041626, "learning_rate": 4.2366459261474933e-05, "loss": 5.0332, "step": 33 }, { "epoch": 0.20809248554913296, "grad_norm": 1.740017294883728, "learning_rate": 4.039153688314145e-05, "loss": 4.6888, "step": 36 }, { "epoch": 0.20809248554913296, "eval_loss": 1.2403978109359741, "eval_runtime": 13.0329, "eval_samples_per_second": 22.405, "eval_steps_per_second": 2.839, "step": 36 }, { "epoch": 0.2254335260115607, "grad_norm": 2.048555850982666, "learning_rate": 3.824798160583012e-05, "loss": 5.1037, "step": 39 }, { "epoch": 0.24277456647398843, "grad_norm": 1.9620063304901123, "learning_rate": 3.5959278669726935e-05, "loss": 4.8165, "step": 42 }, { "epoch": 0.26011560693641617, "grad_norm": 1.8515405654907227, "learning_rate": 3.355050358314172e-05, "loss": 4.609, "step": 45 }, { "epoch": 0.26011560693641617, "eval_loss": 1.1901941299438477, "eval_runtime": 13.0297, "eval_samples_per_second": 22.41, "eval_steps_per_second": 2.84, "step": 45 }, { "epoch": 0.2774566473988439, "grad_norm": 2.143303632736206, "learning_rate": 3.104804738999169e-05, "loss": 4.8827, "step": 48 }, { "epoch": 0.2947976878612717, "grad_norm": 2.159912347793579, "learning_rate": 2.8479327524001636e-05, "loss": 4.6632, "step": 51 }, { "epoch": 0.31213872832369943, "grad_norm": 2.170741558074951, "learning_rate": 2.587248741756253e-05, "loss": 4.6249, "step": 54 }, { "epoch": 0.31213872832369943, "eval_loss": 1.1553400754928589, "eval_runtime": 13.0239, "eval_samples_per_second": 22.42, "eval_steps_per_second": 2.841, "step": 54 }, { "epoch": 0.32947976878612717, "grad_norm": 2.413362979888916, "learning_rate": 2.3256088156396868e-05, "loss": 4.572, "step": 57 }, { "epoch": 0.3468208092485549, "grad_norm": 2.0913474559783936, "learning_rate": 2.0658795558326743e-05, "loss": 4.4474, "step": 60 }, { "epoch": 0.36416184971098264, "grad_norm": 2.158571720123291, "learning_rate": 1.8109066104575023e-05, "loss": 4.4169, "step": 63 }, { "epoch": 0.36416184971098264, "eval_loss": 1.1319787502288818, "eval_runtime": 13.0396, "eval_samples_per_second": 22.393, "eval_steps_per_second": 2.838, "step": 63 }, { "epoch": 0.3815028901734104, "grad_norm": 2.003814458847046, "learning_rate": 1.56348351646022e-05, "loss": 4.4698, "step": 66 }, { "epoch": 0.3988439306358382, "grad_norm": 2.0732972621917725, "learning_rate": 1.3263210930352737e-05, "loss": 4.2751, "step": 69 }, { "epoch": 0.4161849710982659, "grad_norm": 2.3835160732269287, "learning_rate": 1.1020177413231334e-05, "loss": 4.6411, "step": 72 }, { "epoch": 0.4161849710982659, "eval_loss": 1.1185858249664307, "eval_runtime": 12.9977, "eval_samples_per_second": 22.466, "eval_steps_per_second": 2.847, "step": 72 }, { "epoch": 0.43352601156069365, "grad_norm": 2.16192626953125, "learning_rate": 8.930309757836517e-06, "loss": 4.5164, "step": 75 }, { "epoch": 0.4508670520231214, "grad_norm": 2.1548967361450195, "learning_rate": 7.016504991533726e-06, "loss": 4.3064, "step": 78 }, { "epoch": 0.4682080924855491, "grad_norm": 2.206399440765381, "learning_rate": 5.299731159831953e-06, "loss": 4.4663, "step": 81 }, { "epoch": 0.4682080924855491, "eval_loss": 1.1104938983917236, "eval_runtime": 13.0351, "eval_samples_per_second": 22.401, "eval_steps_per_second": 2.838, "step": 81 }, { "epoch": 0.48554913294797686, "grad_norm": 2.343303680419922, "learning_rate": 3.798797596089351e-06, "loss": 4.3065, "step": 84 }, { "epoch": 0.5028901734104047, "grad_norm": 2.413022756576538, "learning_rate": 2.5301488425208296e-06, "loss": 4.3311, "step": 87 }, { "epoch": 0.5202312138728323, "grad_norm": 2.3520829677581787, "learning_rate": 1.5076844803522922e-06, "loss": 4.6312, "step": 90 }, { "epoch": 0.5202312138728323, "eval_loss": 1.1076910495758057, "eval_runtime": 13.0433, "eval_samples_per_second": 22.387, "eval_steps_per_second": 2.837, "step": 90 }, { "epoch": 0.5375722543352601, "grad_norm": 2.2418923377990723, "learning_rate": 7.426068431000882e-07, "loss": 4.202, "step": 93 }, { "epoch": 0.5549132947976878, "grad_norm": 8.558448791503906, "learning_rate": 2.4329828146074095e-07, "loss": 4.4147, "step": 96 }, { "epoch": 0.5722543352601156, "grad_norm": 2.1289567947387695, "learning_rate": 1.522932452260595e-08, "loss": 4.1999, "step": 99 }, { "epoch": 0.5722543352601156, "eval_loss": 1.1074049472808838, "eval_runtime": 13.0348, "eval_samples_per_second": 22.402, "eval_steps_per_second": 2.839, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.36836603346944e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }