{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009173259947253756, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 5.8857, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 5.8613, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 5.8757, "step": 3 }, { "epoch": 0.0, "grad_norm": 22.244311559060773, "learning_rate": 3.816793893129771e-06, "loss": 6.0653, "step": 4 }, { "epoch": 0.0, "grad_norm": 15.057639900452392, "learning_rate": 7.633587786259541e-06, "loss": 5.9572, "step": 5 }, { "epoch": 0.0, "grad_norm": 38.092887646390466, "learning_rate": 1.1450381679389314e-05, "loss": 5.7617, "step": 6 }, { "epoch": 0.0, "grad_norm": 15.31117053293346, "learning_rate": 1.5267175572519083e-05, "loss": 5.8945, "step": 7 }, { "epoch": 0.0, "grad_norm": 21.702345688707492, "learning_rate": 1.9083969465648855e-05, "loss": 5.9337, "step": 8 }, { "epoch": 0.0, "grad_norm": 21.702345688707492, "learning_rate": 1.9083969465648855e-05, "loss": 5.8771, "step": 9 }, { "epoch": 0.0, "grad_norm": 26.25713504824, "learning_rate": 2.2900763358778628e-05, "loss": 5.879, "step": 10 }, { "epoch": 0.0, "grad_norm": 88.79206625984197, "learning_rate": 2.6717557251908397e-05, "loss": 5.6244, "step": 11 }, { "epoch": 0.0, "grad_norm": 13.979462978780887, "learning_rate": 3.0534351145038166e-05, "loss": 5.7982, "step": 12 }, { "epoch": 0.0, "grad_norm": 13.979462978780887, "learning_rate": 3.0534351145038166e-05, "loss": 5.8216, "step": 13 }, { "epoch": 0.0, "grad_norm": 58.311502538831355, "learning_rate": 3.435114503816794e-05, "loss": 5.7539, "step": 14 }, { "epoch": 0.0, "grad_norm": 20.71528208128746, "learning_rate": 3.816793893129771e-05, "loss": 5.7114, "step": 15 }, { "epoch": 0.0, "grad_norm": 33.69759468630642, "learning_rate": 4.198473282442748e-05, "loss": 5.6904, "step": 16 }, { "epoch": 0.0, "grad_norm": 17.28319140833229, "learning_rate": 4.5801526717557256e-05, "loss": 5.6718, "step": 17 }, { "epoch": 0.0, "grad_norm": 44.39956377017333, "learning_rate": 4.9618320610687025e-05, "loss": 5.4446, "step": 18 }, { "epoch": 0.0, "grad_norm": 91.68382396043916, "learning_rate": 5.3435114503816794e-05, "loss": 5.6799, "step": 19 }, { "epoch": 0.0, "grad_norm": 91.68382396043916, "learning_rate": 5.3435114503816794e-05, "loss": 5.606, "step": 20 }, { "epoch": 0.0, "grad_norm": 192.72226388224067, "learning_rate": 5.725190839694656e-05, "loss": 5.5796, "step": 21 }, { "epoch": 0.0, "grad_norm": 45.566936397354795, "learning_rate": 6.106870229007633e-05, "loss": 5.6014, "step": 22 }, { "epoch": 0.0, "grad_norm": 54.111992406676734, "learning_rate": 6.488549618320611e-05, "loss": 5.4642, "step": 23 }, { "epoch": 0.0, "grad_norm": 47.77738772861109, "learning_rate": 6.870229007633588e-05, "loss": 5.3002, "step": 24 }, { "epoch": 0.0, "grad_norm": 52.88816210898902, "learning_rate": 7.251908396946565e-05, "loss": 5.8573, "step": 25 }, { "epoch": 0.0, "grad_norm": 43.38379795566033, "learning_rate": 7.633587786259542e-05, "loss": 5.5942, "step": 26 }, { "epoch": 0.0, "grad_norm": 112.9679807995391, "learning_rate": 8.015267175572518e-05, "loss": 5.7442, "step": 27 }, { "epoch": 0.0, "grad_norm": 112.9679807995391, "learning_rate": 8.015267175572518e-05, "loss": 5.447, "step": 28 }, { "epoch": 0.0, "grad_norm": 164.11451980650267, "learning_rate": 8.396946564885496e-05, "loss": 5.7154, "step": 29 }, { "epoch": 0.0, "grad_norm": 115.9334173332188, "learning_rate": 8.778625954198472e-05, "loss": 5.9088, "step": 30 }, { "epoch": 0.0, "grad_norm": 145.9836977981191, "learning_rate": 9.160305343511451e-05, "loss": 5.4605, "step": 31 }, { "epoch": 0.0, "grad_norm": 114.64052697776405, "learning_rate": 9.541984732824429e-05, "loss": 5.697, "step": 32 }, { "epoch": 0.0, "grad_norm": 202.12636675775389, "learning_rate": 9.923664122137405e-05, "loss": 6.0274, "step": 33 }, { "epoch": 0.0, "grad_norm": 160.88000887426793, "learning_rate": 0.00010305343511450383, "loss": 6.2896, "step": 34 }, { "epoch": 0.0, "grad_norm": 145.16182847186317, "learning_rate": 0.00010687022900763359, "loss": 5.9883, "step": 35 }, { "epoch": 0.0, "grad_norm": 104.5781091944148, "learning_rate": 0.00011068702290076336, "loss": 6.1505, "step": 36 }, { "epoch": 0.0, "grad_norm": 55.72279835011099, "learning_rate": 0.00011450381679389313, "loss": 6.458, "step": 37 }, { "epoch": 0.0, "grad_norm": 72.60539121615658, "learning_rate": 0.0001183206106870229, "loss": 6.4766, "step": 38 }, { "epoch": 0.0, "grad_norm": 152.31919342671264, "learning_rate": 0.00012213740458015266, "loss": 6.6228, "step": 39 }, { "epoch": 0.0, "grad_norm": 195.38778604806365, "learning_rate": 0.00012595419847328244, "loss": 6.5874, "step": 40 }, { "epoch": 0.0, "grad_norm": 98.21218214875543, "learning_rate": 0.00012977099236641222, "loss": 6.2979, "step": 41 }, { "epoch": 0.0, "grad_norm": 117.40378533203793, "learning_rate": 0.000133587786259542, "loss": 6.1422, "step": 42 }, { "epoch": 0.0, "grad_norm": 76.43242080692808, "learning_rate": 0.00013740458015267177, "loss": 6.0982, "step": 43 }, { "epoch": 0.01, "grad_norm": 161.5295826913437, "learning_rate": 0.00014122137404580154, "loss": 5.9792, "step": 44 }, { "epoch": 0.01, "grad_norm": 54.30211860707633, "learning_rate": 0.0001450381679389313, "loss": 6.0895, "step": 45 }, { "epoch": 0.01, "grad_norm": 96.35953226922737, "learning_rate": 0.00014885496183206107, "loss": 6.1023, "step": 46 }, { "epoch": 0.01, "grad_norm": 49.71381292121367, "learning_rate": 0.00015267175572519084, "loss": 5.9927, "step": 47 }, { "epoch": 0.01, "grad_norm": 92.40570872689418, "learning_rate": 0.00015648854961832062, "loss": 5.8947, "step": 48 }, { "epoch": 0.01, "grad_norm": 70.58634543270558, "learning_rate": 0.00016030534351145037, "loss": 5.5419, "step": 49 }, { "epoch": 0.01, "grad_norm": 99.21861402306824, "learning_rate": 0.00016412213740458014, "loss": 5.533, "step": 50 }, { "epoch": 0.01, "grad_norm": 60.43737769128788, "learning_rate": 0.00016793893129770992, "loss": 5.7271, "step": 51 }, { "epoch": 0.01, "grad_norm": 40.38259047816709, "learning_rate": 0.0001717557251908397, "loss": 5.7707, "step": 52 }, { "epoch": 0.01, "grad_norm": 50.37624352755525, "learning_rate": 0.00017557251908396944, "loss": 5.4807, "step": 53 }, { "epoch": 0.01, "grad_norm": 105.31786701509579, "learning_rate": 0.00017938931297709925, "loss": 5.5782, "step": 54 }, { "epoch": 0.01, "grad_norm": 58.697213953188964, "learning_rate": 0.00018320610687022902, "loss": 5.5337, "step": 55 }, { "epoch": 0.01, "grad_norm": 110.55644774315732, "learning_rate": 0.0001870229007633588, "loss": 5.5836, "step": 56 }, { "epoch": 0.01, "grad_norm": 14.426822607818815, "learning_rate": 0.00019083969465648857, "loss": 5.6155, "step": 57 }, { "epoch": 0.01, "grad_norm": 26.228166827626183, "learning_rate": 0.00019465648854961832, "loss": 5.6817, "step": 58 }, { "epoch": 0.01, "grad_norm": 27.269174056089717, "learning_rate": 0.0001984732824427481, "loss": 5.2749, "step": 59 }, { "epoch": 0.01, "grad_norm": 21.695242218914665, "learning_rate": 0.00020229007633587788, "loss": 5.5323, "step": 60 }, { "epoch": 0.01, "grad_norm": 68.62936938874972, "learning_rate": 0.00020610687022900765, "loss": 5.804, "step": 61 }, { "epoch": 0.01, "grad_norm": 25.97127488754509, "learning_rate": 0.0002099236641221374, "loss": 5.4587, "step": 62 }, { "epoch": 0.01, "grad_norm": 15.325961009638357, "learning_rate": 0.00021374045801526718, "loss": 5.4342, "step": 63 }, { "epoch": 0.01, "grad_norm": 19.875772589083574, "learning_rate": 0.00021755725190839695, "loss": 5.5249, "step": 64 }, { "epoch": 0.01, "grad_norm": 69.24118271312939, "learning_rate": 0.00022137404580152673, "loss": 5.1242, "step": 65 }, { "epoch": 0.01, "grad_norm": 26.355890547603202, "learning_rate": 0.00022519083969465648, "loss": 5.5468, "step": 66 }, { "epoch": 0.01, "grad_norm": 10.563329361046026, "learning_rate": 0.00022900763358778625, "loss": 5.2759, "step": 67 }, { "epoch": 0.01, "grad_norm": 91.00091143398366, "learning_rate": 0.00023282442748091603, "loss": 5.4683, "step": 68 }, { "epoch": 0.01, "grad_norm": 31.924406772853743, "learning_rate": 0.0002366412213740458, "loss": 5.4741, "step": 69 }, { "epoch": 0.01, "grad_norm": 59.162721435471, "learning_rate": 0.00024045801526717558, "loss": 5.2497, "step": 70 }, { "epoch": 0.01, "grad_norm": 31.455685925896024, "learning_rate": 0.00024427480916030533, "loss": 5.3254, "step": 71 }, { "epoch": 0.01, "grad_norm": 67.5878959609375, "learning_rate": 0.00024809160305343513, "loss": 5.4352, "step": 72 }, { "epoch": 0.01, "grad_norm": 42.716427641408124, "learning_rate": 0.0002519083969465649, "loss": 5.1745, "step": 73 }, { "epoch": 0.01, "grad_norm": 51.763664942049346, "learning_rate": 0.00025572519083969463, "loss": 5.3703, "step": 74 }, { "epoch": 0.01, "grad_norm": 45.70575367441054, "learning_rate": 0.00025954198473282443, "loss": 5.3987, "step": 75 }, { "epoch": 0.01, "grad_norm": 22.180135968343524, "learning_rate": 0.0002633587786259542, "loss": 5.4224, "step": 76 }, { "epoch": 0.01, "grad_norm": 60.31973560441679, "learning_rate": 0.000267175572519084, "loss": 5.6587, "step": 77 }, { "epoch": 0.01, "grad_norm": 33.89974772165083, "learning_rate": 0.00027099236641221373, "loss": 5.6768, "step": 78 }, { "epoch": 0.01, "grad_norm": 34.7175502203455, "learning_rate": 0.00027480916030534353, "loss": 5.4154, "step": 79 }, { "epoch": 0.01, "grad_norm": 34.357468729191154, "learning_rate": 0.0002786259541984733, "loss": 5.4172, "step": 80 } ], "logging_steps": 1.0, "max_steps": 8721, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "total_flos": 419593936896.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }