{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9946268656716417, "eval_steps": 500, "global_step": 1254, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023880597014925373, "grad_norm": 10.273793568807084, "learning_rate": 5e-06, "loss": 0.8832, "step": 10 }, { "epoch": 0.04776119402985075, "grad_norm": 17.413079986249713, "learning_rate": 5e-06, "loss": 0.7826, "step": 20 }, { "epoch": 0.07164179104477612, "grad_norm": 1.7682355694433471, "learning_rate": 5e-06, "loss": 0.7785, "step": 30 }, { "epoch": 0.0955223880597015, "grad_norm": 1.2119658541333866, "learning_rate": 5e-06, "loss": 0.7537, "step": 40 }, { "epoch": 0.11940298507462686, "grad_norm": 0.8476848561416307, "learning_rate": 5e-06, "loss": 0.7389, "step": 50 }, { "epoch": 0.14328358208955225, "grad_norm": 0.7667403306996323, "learning_rate": 5e-06, "loss": 0.7217, "step": 60 }, { "epoch": 0.16716417910447762, "grad_norm": 0.8702244781914732, "learning_rate": 5e-06, "loss": 0.7085, "step": 70 }, { "epoch": 0.191044776119403, "grad_norm": 1.3853703465833076, "learning_rate": 5e-06, "loss": 0.6971, "step": 80 }, { "epoch": 0.21492537313432836, "grad_norm": 1.2094222885220798, "learning_rate": 5e-06, "loss": 0.6882, "step": 90 }, { "epoch": 0.23880597014925373, "grad_norm": 0.7689681572000278, "learning_rate": 5e-06, "loss": 0.6849, "step": 100 }, { "epoch": 0.2626865671641791, "grad_norm": 0.7780597190717987, "learning_rate": 5e-06, "loss": 0.6818, "step": 110 }, { "epoch": 0.2865671641791045, "grad_norm": 0.5000855152810337, "learning_rate": 5e-06, "loss": 0.6816, "step": 120 }, { "epoch": 0.31044776119402984, "grad_norm": 0.7276396850932823, "learning_rate": 5e-06, "loss": 0.6736, "step": 130 }, { "epoch": 0.33432835820895523, "grad_norm": 0.6186133127584187, "learning_rate": 5e-06, "loss": 0.6818, "step": 140 }, { "epoch": 0.3582089552238806, "grad_norm": 0.48005981720062146, "learning_rate": 5e-06, "loss": 0.673, "step": 150 }, { "epoch": 0.382089552238806, "grad_norm": 0.5023606142200144, "learning_rate": 5e-06, "loss": 0.6773, "step": 160 }, { "epoch": 0.4059701492537313, "grad_norm": 0.5808170399886017, "learning_rate": 5e-06, "loss": 0.6677, "step": 170 }, { "epoch": 0.4298507462686567, "grad_norm": 0.4379969982797677, "learning_rate": 5e-06, "loss": 0.6685, "step": 180 }, { "epoch": 0.4537313432835821, "grad_norm": 0.5347993662162854, "learning_rate": 5e-06, "loss": 0.6752, "step": 190 }, { "epoch": 0.47761194029850745, "grad_norm": 0.5389725567643092, "learning_rate": 5e-06, "loss": 0.6648, "step": 200 }, { "epoch": 0.5014925373134328, "grad_norm": 0.48052530490608086, "learning_rate": 5e-06, "loss": 0.6641, "step": 210 }, { "epoch": 0.5253731343283582, "grad_norm": 0.5805014519700745, "learning_rate": 5e-06, "loss": 0.6591, "step": 220 }, { "epoch": 0.5492537313432836, "grad_norm": 0.48958097076386, "learning_rate": 5e-06, "loss": 0.6655, "step": 230 }, { "epoch": 0.573134328358209, "grad_norm": 0.49437501765783676, "learning_rate": 5e-06, "loss": 0.6683, "step": 240 }, { "epoch": 0.5970149253731343, "grad_norm": 0.45944903239035917, "learning_rate": 5e-06, "loss": 0.659, "step": 250 }, { "epoch": 0.6208955223880597, "grad_norm": 0.4748455280427033, "learning_rate": 5e-06, "loss": 0.6583, "step": 260 }, { "epoch": 0.6447761194029851, "grad_norm": 0.507103510913949, "learning_rate": 5e-06, "loss": 0.6583, "step": 270 }, { "epoch": 0.6686567164179105, "grad_norm": 0.48478059033960125, "learning_rate": 5e-06, "loss": 0.6587, "step": 280 }, { "epoch": 0.6925373134328359, "grad_norm": 0.5180513023339883, "learning_rate": 5e-06, "loss": 0.6557, "step": 290 }, { "epoch": 0.7164179104477612, "grad_norm": 0.6977133509211488, "learning_rate": 5e-06, "loss": 0.6596, "step": 300 }, { "epoch": 0.7402985074626866, "grad_norm": 0.6751977610973682, "learning_rate": 5e-06, "loss": 0.6631, "step": 310 }, { "epoch": 0.764179104477612, "grad_norm": 0.5497977666993439, "learning_rate": 5e-06, "loss": 0.6538, "step": 320 }, { "epoch": 0.7880597014925373, "grad_norm": 0.4930749400299794, "learning_rate": 5e-06, "loss": 0.6533, "step": 330 }, { "epoch": 0.8119402985074626, "grad_norm": 0.42531288506677356, "learning_rate": 5e-06, "loss": 0.654, "step": 340 }, { "epoch": 0.835820895522388, "grad_norm": 0.3981313944033897, "learning_rate": 5e-06, "loss": 0.6588, "step": 350 }, { "epoch": 0.8597014925373134, "grad_norm": 0.5280916740215565, "learning_rate": 5e-06, "loss": 0.6467, "step": 360 }, { "epoch": 0.8835820895522388, "grad_norm": 0.42664364036263097, "learning_rate": 5e-06, "loss": 0.6547, "step": 370 }, { "epoch": 0.9074626865671642, "grad_norm": 0.4407169068422887, "learning_rate": 5e-06, "loss": 0.6576, "step": 380 }, { "epoch": 0.9313432835820895, "grad_norm": 0.5863915466432706, "learning_rate": 5e-06, "loss": 0.6517, "step": 390 }, { "epoch": 0.9552238805970149, "grad_norm": 0.591205757573771, "learning_rate": 5e-06, "loss": 0.6489, "step": 400 }, { "epoch": 0.9791044776119403, "grad_norm": 0.4729812620066271, "learning_rate": 5e-06, "loss": 0.6443, "step": 410 }, { "epoch": 0.9982089552238806, "eval_loss": 0.6459853649139404, "eval_runtime": 145.5401, "eval_samples_per_second": 77.532, "eval_steps_per_second": 0.612, "step": 418 }, { "epoch": 1.0029850746268656, "grad_norm": 0.7406302510482085, "learning_rate": 5e-06, "loss": 0.6464, "step": 420 }, { "epoch": 1.026865671641791, "grad_norm": 0.5814842349080193, "learning_rate": 5e-06, "loss": 0.6064, "step": 430 }, { "epoch": 1.0507462686567164, "grad_norm": 0.49550119932649633, "learning_rate": 5e-06, "loss": 0.6082, "step": 440 }, { "epoch": 1.0746268656716418, "grad_norm": 0.5636659335114863, "learning_rate": 5e-06, "loss": 0.6113, "step": 450 }, { "epoch": 1.0985074626865672, "grad_norm": 0.5990578133097235, "learning_rate": 5e-06, "loss": 0.6025, "step": 460 }, { "epoch": 1.1223880597014926, "grad_norm": 0.5395914783918955, "learning_rate": 5e-06, "loss": 0.6096, "step": 470 }, { "epoch": 1.146268656716418, "grad_norm": 0.5134850748551497, "learning_rate": 5e-06, "loss": 0.6127, "step": 480 }, { "epoch": 1.1701492537313434, "grad_norm": 0.6291645545602457, "learning_rate": 5e-06, "loss": 0.6016, "step": 490 }, { "epoch": 1.1940298507462686, "grad_norm": 0.5896587056027497, "learning_rate": 5e-06, "loss": 0.6078, "step": 500 }, { "epoch": 1.217910447761194, "grad_norm": 0.44004057090652665, "learning_rate": 5e-06, "loss": 0.6028, "step": 510 }, { "epoch": 1.2417910447761193, "grad_norm": 0.6082489861086471, "learning_rate": 5e-06, "loss": 0.6057, "step": 520 }, { "epoch": 1.2656716417910447, "grad_norm": 0.4757708288590624, "learning_rate": 5e-06, "loss": 0.6128, "step": 530 }, { "epoch": 1.2895522388059701, "grad_norm": 0.4866590721782206, "learning_rate": 5e-06, "loss": 0.6133, "step": 540 }, { "epoch": 1.3134328358208955, "grad_norm": 0.4646472106430823, "learning_rate": 5e-06, "loss": 0.6089, "step": 550 }, { "epoch": 1.337313432835821, "grad_norm": 0.44517915287397264, "learning_rate": 5e-06, "loss": 0.6079, "step": 560 }, { "epoch": 1.3611940298507463, "grad_norm": 0.6694478573734268, "learning_rate": 5e-06, "loss": 0.6069, "step": 570 }, { "epoch": 1.3850746268656717, "grad_norm": 0.5078303398930042, "learning_rate": 5e-06, "loss": 0.6069, "step": 580 }, { "epoch": 1.408955223880597, "grad_norm": 0.503001629794063, "learning_rate": 5e-06, "loss": 0.6111, "step": 590 }, { "epoch": 1.4328358208955223, "grad_norm": 0.5438514985018119, "learning_rate": 5e-06, "loss": 0.613, "step": 600 }, { "epoch": 1.4567164179104477, "grad_norm": 0.5056414273287846, "learning_rate": 5e-06, "loss": 0.6031, "step": 610 }, { "epoch": 1.480597014925373, "grad_norm": 0.4588677461940861, "learning_rate": 5e-06, "loss": 0.6126, "step": 620 }, { "epoch": 1.5044776119402985, "grad_norm": 0.5151479126151606, "learning_rate": 5e-06, "loss": 0.608, "step": 630 }, { "epoch": 1.528358208955224, "grad_norm": 0.46401616793533473, "learning_rate": 5e-06, "loss": 0.6031, "step": 640 }, { "epoch": 1.5522388059701493, "grad_norm": 0.48451525361860803, "learning_rate": 5e-06, "loss": 0.605, "step": 650 }, { "epoch": 1.5761194029850745, "grad_norm": 0.47997260631095534, "learning_rate": 5e-06, "loss": 0.6109, "step": 660 }, { "epoch": 1.6, "grad_norm": 0.4897371630270623, "learning_rate": 5e-06, "loss": 0.6058, "step": 670 }, { "epoch": 1.6238805970149253, "grad_norm": 0.5654442922228684, "learning_rate": 5e-06, "loss": 0.6123, "step": 680 }, { "epoch": 1.6477611940298509, "grad_norm": 0.5784991723951006, "learning_rate": 5e-06, "loss": 0.5942, "step": 690 }, { "epoch": 1.671641791044776, "grad_norm": 0.4722303828551024, "learning_rate": 5e-06, "loss": 0.6106, "step": 700 }, { "epoch": 1.6955223880597015, "grad_norm": 0.4839424076756933, "learning_rate": 5e-06, "loss": 0.6091, "step": 710 }, { "epoch": 1.7194029850746269, "grad_norm": 0.47764784827023116, "learning_rate": 5e-06, "loss": 0.6123, "step": 720 }, { "epoch": 1.7432835820895523, "grad_norm": 0.6621875079490905, "learning_rate": 5e-06, "loss": 0.6019, "step": 730 }, { "epoch": 1.7671641791044777, "grad_norm": 0.41773352573318373, "learning_rate": 5e-06, "loss": 0.6012, "step": 740 }, { "epoch": 1.7910447761194028, "grad_norm": 0.5339311769344796, "learning_rate": 5e-06, "loss": 0.6088, "step": 750 }, { "epoch": 1.8149253731343284, "grad_norm": 0.41758051779371724, "learning_rate": 5e-06, "loss": 0.602, "step": 760 }, { "epoch": 1.8388059701492536, "grad_norm": 0.48493110488320246, "learning_rate": 5e-06, "loss": 0.609, "step": 770 }, { "epoch": 1.8626865671641792, "grad_norm": 0.503079485131092, "learning_rate": 5e-06, "loss": 0.6108, "step": 780 }, { "epoch": 1.8865671641791044, "grad_norm": 0.49999802261208487, "learning_rate": 5e-06, "loss": 0.6059, "step": 790 }, { "epoch": 1.9104477611940298, "grad_norm": 0.49289621093364544, "learning_rate": 5e-06, "loss": 0.6071, "step": 800 }, { "epoch": 1.9343283582089552, "grad_norm": 0.5213797280676552, "learning_rate": 5e-06, "loss": 0.6006, "step": 810 }, { "epoch": 1.9582089552238806, "grad_norm": 0.48494536428497287, "learning_rate": 5e-06, "loss": 0.606, "step": 820 }, { "epoch": 1.982089552238806, "grad_norm": 0.6304797916900879, "learning_rate": 5e-06, "loss": 0.6046, "step": 830 }, { "epoch": 1.9988059701492538, "eval_loss": 0.6354221701622009, "eval_runtime": 145.6955, "eval_samples_per_second": 77.449, "eval_steps_per_second": 0.611, "step": 837 }, { "epoch": 2.005970149253731, "grad_norm": 0.7544412739259527, "learning_rate": 5e-06, "loss": 0.6034, "step": 840 }, { "epoch": 2.029850746268657, "grad_norm": 0.6076312693154532, "learning_rate": 5e-06, "loss": 0.5647, "step": 850 }, { "epoch": 2.053731343283582, "grad_norm": 0.5264338967840297, "learning_rate": 5e-06, "loss": 0.5566, "step": 860 }, { "epoch": 2.0776119402985076, "grad_norm": 0.5190037883143301, "learning_rate": 5e-06, "loss": 0.5679, "step": 870 }, { "epoch": 2.1014925373134328, "grad_norm": 0.46674031058869914, "learning_rate": 5e-06, "loss": 0.5611, "step": 880 }, { "epoch": 2.1253731343283584, "grad_norm": 0.5072546488653424, "learning_rate": 5e-06, "loss": 0.565, "step": 890 }, { "epoch": 2.1492537313432836, "grad_norm": 0.6009874453432602, "learning_rate": 5e-06, "loss": 0.5611, "step": 900 }, { "epoch": 2.173134328358209, "grad_norm": 0.46110082010928216, "learning_rate": 5e-06, "loss": 0.5618, "step": 910 }, { "epoch": 2.1970149253731344, "grad_norm": 0.5163994956633559, "learning_rate": 5e-06, "loss": 0.568, "step": 920 }, { "epoch": 2.2208955223880595, "grad_norm": 0.6176077747391998, "learning_rate": 5e-06, "loss": 0.5698, "step": 930 }, { "epoch": 2.244776119402985, "grad_norm": 0.46798047694883976, "learning_rate": 5e-06, "loss": 0.5607, "step": 940 }, { "epoch": 2.2686567164179103, "grad_norm": 0.5458770025747796, "learning_rate": 5e-06, "loss": 0.5657, "step": 950 }, { "epoch": 2.292537313432836, "grad_norm": 0.4522021510300349, "learning_rate": 5e-06, "loss": 0.5582, "step": 960 }, { "epoch": 2.316417910447761, "grad_norm": 0.4993495813706899, "learning_rate": 5e-06, "loss": 0.5673, "step": 970 }, { "epoch": 2.3402985074626868, "grad_norm": 0.5148727234502507, "learning_rate": 5e-06, "loss": 0.5627, "step": 980 }, { "epoch": 2.364179104477612, "grad_norm": 0.4746669105759272, "learning_rate": 5e-06, "loss": 0.5632, "step": 990 }, { "epoch": 2.388059701492537, "grad_norm": 0.49984909528425026, "learning_rate": 5e-06, "loss": 0.5683, "step": 1000 }, { "epoch": 2.4119402985074627, "grad_norm": 0.44986277151799875, "learning_rate": 5e-06, "loss": 0.5616, "step": 1010 }, { "epoch": 2.435820895522388, "grad_norm": 0.48588485615374555, "learning_rate": 5e-06, "loss": 0.5607, "step": 1020 }, { "epoch": 2.4597014925373135, "grad_norm": 0.48231851620423527, "learning_rate": 5e-06, "loss": 0.564, "step": 1030 }, { "epoch": 2.4835820895522387, "grad_norm": 0.5353905043801261, "learning_rate": 5e-06, "loss": 0.572, "step": 1040 }, { "epoch": 2.5074626865671643, "grad_norm": 0.4911575383473398, "learning_rate": 5e-06, "loss": 0.561, "step": 1050 }, { "epoch": 2.5313432835820895, "grad_norm": 0.5322320223748976, "learning_rate": 5e-06, "loss": 0.5702, "step": 1060 }, { "epoch": 2.5552238805970147, "grad_norm": 0.47802103841875787, "learning_rate": 5e-06, "loss": 0.5648, "step": 1070 }, { "epoch": 2.5791044776119403, "grad_norm": 0.5461878411480141, "learning_rate": 5e-06, "loss": 0.5699, "step": 1080 }, { "epoch": 2.602985074626866, "grad_norm": 0.5723521540745492, "learning_rate": 5e-06, "loss": 0.5687, "step": 1090 }, { "epoch": 2.626865671641791, "grad_norm": 0.43486752386685085, "learning_rate": 5e-06, "loss": 0.5647, "step": 1100 }, { "epoch": 2.6507462686567163, "grad_norm": 0.4851163868024419, "learning_rate": 5e-06, "loss": 0.5628, "step": 1110 }, { "epoch": 2.674626865671642, "grad_norm": 0.4819830185138904, "learning_rate": 5e-06, "loss": 0.5672, "step": 1120 }, { "epoch": 2.698507462686567, "grad_norm": 0.4782708772767338, "learning_rate": 5e-06, "loss": 0.5715, "step": 1130 }, { "epoch": 2.7223880597014927, "grad_norm": 0.4564853695633748, "learning_rate": 5e-06, "loss": 0.5664, "step": 1140 }, { "epoch": 2.746268656716418, "grad_norm": 0.5672636342724084, "learning_rate": 5e-06, "loss": 0.5703, "step": 1150 }, { "epoch": 2.7701492537313435, "grad_norm": 0.46200475373217254, "learning_rate": 5e-06, "loss": 0.568, "step": 1160 }, { "epoch": 2.7940298507462686, "grad_norm": 0.439779919962293, "learning_rate": 5e-06, "loss": 0.5617, "step": 1170 }, { "epoch": 2.817910447761194, "grad_norm": 0.48759282182632596, "learning_rate": 5e-06, "loss": 0.5632, "step": 1180 }, { "epoch": 2.8417910447761194, "grad_norm": 0.5417263063839436, "learning_rate": 5e-06, "loss": 0.5659, "step": 1190 }, { "epoch": 2.8656716417910446, "grad_norm": 0.4780631102145764, "learning_rate": 5e-06, "loss": 0.5677, "step": 1200 }, { "epoch": 2.8895522388059702, "grad_norm": 0.500379213535652, "learning_rate": 5e-06, "loss": 0.5756, "step": 1210 }, { "epoch": 2.9134328358208954, "grad_norm": 0.5402151963225374, "learning_rate": 5e-06, "loss": 0.5686, "step": 1220 }, { "epoch": 2.937313432835821, "grad_norm": 0.4522864472397569, "learning_rate": 5e-06, "loss": 0.5734, "step": 1230 }, { "epoch": 2.961194029850746, "grad_norm": 0.4647996850215528, "learning_rate": 5e-06, "loss": 0.5699, "step": 1240 }, { "epoch": 2.9850746268656714, "grad_norm": 0.4918905334543915, "learning_rate": 5e-06, "loss": 0.5716, "step": 1250 }, { "epoch": 2.9946268656716417, "eval_loss": 0.636893630027771, "eval_runtime": 143.3398, "eval_samples_per_second": 78.722, "eval_steps_per_second": 0.621, "step": 1254 }, { "epoch": 2.9946268656716417, "step": 1254, "total_flos": 2100077946470400.0, "train_loss": 0.6183488205479283, "train_runtime": 21046.2129, "train_samples_per_second": 30.56, "train_steps_per_second": 0.06 } ], "logging_steps": 10, "max_steps": 1254, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2100077946470400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }