{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.009173259947253756,
  "eval_steps": 500,
  "global_step": 80,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 5.8857,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 5.8613,
      "step": 2
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 5.8757,
      "step": 3
    },
    {
      "epoch": 0.0,
      "grad_norm": 22.244311559060773,
      "learning_rate": 3.816793893129771e-06,
      "loss": 6.0653,
      "step": 4
    },
    {
      "epoch": 0.0,
      "grad_norm": 15.057639900452392,
      "learning_rate": 7.633587786259541e-06,
      "loss": 5.9572,
      "step": 5
    },
    {
      "epoch": 0.0,
      "grad_norm": 38.092887646390466,
      "learning_rate": 1.1450381679389314e-05,
      "loss": 5.7617,
      "step": 6
    },
    {
      "epoch": 0.0,
      "grad_norm": 15.31117053293346,
      "learning_rate": 1.5267175572519083e-05,
      "loss": 5.8945,
      "step": 7
    },
    {
      "epoch": 0.0,
      "grad_norm": 21.702345688707492,
      "learning_rate": 1.9083969465648855e-05,
      "loss": 5.9337,
      "step": 8
    },
    {
      "epoch": 0.0,
      "grad_norm": 21.702345688707492,
      "learning_rate": 1.9083969465648855e-05,
      "loss": 5.8771,
      "step": 9
    },
    {
      "epoch": 0.0,
      "grad_norm": 26.25713504824,
      "learning_rate": 2.2900763358778628e-05,
      "loss": 5.879,
      "step": 10
    },
    {
      "epoch": 0.0,
      "grad_norm": 88.79206625984197,
      "learning_rate": 2.6717557251908397e-05,
      "loss": 5.6244,
      "step": 11
    },
    {
      "epoch": 0.0,
      "grad_norm": 13.979462978780887,
      "learning_rate": 3.0534351145038166e-05,
      "loss": 5.7982,
      "step": 12
    },
    {
      "epoch": 0.0,
      "grad_norm": 13.979462978780887,
      "learning_rate": 3.0534351145038166e-05,
      "loss": 5.8216,
      "step": 13
    },
    {
      "epoch": 0.0,
      "grad_norm": 58.311502538831355,
      "learning_rate": 3.435114503816794e-05,
      "loss": 5.7539,
      "step": 14
    },
    {
      "epoch": 0.0,
      "grad_norm": 20.71528208128746,
      "learning_rate": 3.816793893129771e-05,
      "loss": 5.7114,
      "step": 15
    },
    {
      "epoch": 0.0,
      "grad_norm": 33.69759468630642,
      "learning_rate": 4.198473282442748e-05,
      "loss": 5.6904,
      "step": 16
    },
    {
      "epoch": 0.0,
      "grad_norm": 17.28319140833229,
      "learning_rate": 4.5801526717557256e-05,
      "loss": 5.6718,
      "step": 17
    },
    {
      "epoch": 0.0,
      "grad_norm": 44.39956377017333,
      "learning_rate": 4.9618320610687025e-05,
      "loss": 5.4446,
      "step": 18
    },
    {
      "epoch": 0.0,
      "grad_norm": 91.68382396043916,
      "learning_rate": 5.3435114503816794e-05,
      "loss": 5.6799,
      "step": 19
    },
    {
      "epoch": 0.0,
      "grad_norm": 91.68382396043916,
      "learning_rate": 5.3435114503816794e-05,
      "loss": 5.606,
      "step": 20
    },
    {
      "epoch": 0.0,
      "grad_norm": 192.72226388224067,
      "learning_rate": 5.725190839694656e-05,
      "loss": 5.5796,
      "step": 21
    },
    {
      "epoch": 0.0,
      "grad_norm": 45.566936397354795,
      "learning_rate": 6.106870229007633e-05,
      "loss": 5.6014,
      "step": 22
    },
    {
      "epoch": 0.0,
      "grad_norm": 54.111992406676734,
      "learning_rate": 6.488549618320611e-05,
      "loss": 5.4642,
      "step": 23
    },
    {
      "epoch": 0.0,
      "grad_norm": 47.77738772861109,
      "learning_rate": 6.870229007633588e-05,
      "loss": 5.3002,
      "step": 24
    },
    {
      "epoch": 0.0,
      "grad_norm": 52.88816210898902,
      "learning_rate": 7.251908396946565e-05,
      "loss": 5.8573,
      "step": 25
    },
    {
      "epoch": 0.0,
      "grad_norm": 43.38379795566033,
      "learning_rate": 7.633587786259542e-05,
      "loss": 5.5942,
      "step": 26
    },
    {
      "epoch": 0.0,
      "grad_norm": 112.9679807995391,
      "learning_rate": 8.015267175572518e-05,
      "loss": 5.7442,
      "step": 27
    },
    {
      "epoch": 0.0,
      "grad_norm": 112.9679807995391,
      "learning_rate": 8.015267175572518e-05,
      "loss": 5.447,
      "step": 28
    },
    {
      "epoch": 0.0,
      "grad_norm": 164.11451980650267,
      "learning_rate": 8.396946564885496e-05,
      "loss": 5.7154,
      "step": 29
    },
    {
      "epoch": 0.0,
      "grad_norm": 115.9334173332188,
      "learning_rate": 8.778625954198472e-05,
      "loss": 5.9088,
      "step": 30
    },
    {
      "epoch": 0.0,
      "grad_norm": 145.9836977981191,
      "learning_rate": 9.160305343511451e-05,
      "loss": 5.4605,
      "step": 31
    },
    {
      "epoch": 0.0,
      "grad_norm": 114.64052697776405,
      "learning_rate": 9.541984732824429e-05,
      "loss": 5.697,
      "step": 32
    },
    {
      "epoch": 0.0,
      "grad_norm": 202.12636675775389,
      "learning_rate": 9.923664122137405e-05,
      "loss": 6.0274,
      "step": 33
    },
    {
      "epoch": 0.0,
      "grad_norm": 160.88000887426793,
      "learning_rate": 0.00010305343511450383,
      "loss": 6.2896,
      "step": 34
    },
    {
      "epoch": 0.0,
      "grad_norm": 145.16182847186317,
      "learning_rate": 0.00010687022900763359,
      "loss": 5.9883,
      "step": 35
    },
    {
      "epoch": 0.0,
      "grad_norm": 104.5781091944148,
      "learning_rate": 0.00011068702290076336,
      "loss": 6.1505,
      "step": 36
    },
    {
      "epoch": 0.0,
      "grad_norm": 55.72279835011099,
      "learning_rate": 0.00011450381679389313,
      "loss": 6.458,
      "step": 37
    },
    {
      "epoch": 0.0,
      "grad_norm": 72.60539121615658,
      "learning_rate": 0.0001183206106870229,
      "loss": 6.4766,
      "step": 38
    },
    {
      "epoch": 0.0,
      "grad_norm": 152.31919342671264,
      "learning_rate": 0.00012213740458015266,
      "loss": 6.6228,
      "step": 39
    },
    {
      "epoch": 0.0,
      "grad_norm": 195.38778604806365,
      "learning_rate": 0.00012595419847328244,
      "loss": 6.5874,
      "step": 40
    },
    {
      "epoch": 0.0,
      "grad_norm": 98.21218214875543,
      "learning_rate": 0.00012977099236641222,
      "loss": 6.2979,
      "step": 41
    },
    {
      "epoch": 0.0,
      "grad_norm": 117.40378533203793,
      "learning_rate": 0.000133587786259542,
      "loss": 6.1422,
      "step": 42
    },
    {
      "epoch": 0.0,
      "grad_norm": 76.43242080692808,
      "learning_rate": 0.00013740458015267177,
      "loss": 6.0982,
      "step": 43
    },
    {
      "epoch": 0.01,
      "grad_norm": 161.5295826913437,
      "learning_rate": 0.00014122137404580154,
      "loss": 5.9792,
      "step": 44
    },
    {
      "epoch": 0.01,
      "grad_norm": 54.30211860707633,
      "learning_rate": 0.0001450381679389313,
      "loss": 6.0895,
      "step": 45
    },
    {
      "epoch": 0.01,
      "grad_norm": 96.35953226922737,
      "learning_rate": 0.00014885496183206107,
      "loss": 6.1023,
      "step": 46
    },
    {
      "epoch": 0.01,
      "grad_norm": 49.71381292121367,
      "learning_rate": 0.00015267175572519084,
      "loss": 5.9927,
      "step": 47
    },
    {
      "epoch": 0.01,
      "grad_norm": 92.40570872689418,
      "learning_rate": 0.00015648854961832062,
      "loss": 5.8947,
      "step": 48
    },
    {
      "epoch": 0.01,
      "grad_norm": 70.58634543270558,
      "learning_rate": 0.00016030534351145037,
      "loss": 5.5419,
      "step": 49
    },
    {
      "epoch": 0.01,
      "grad_norm": 99.21861402306824,
      "learning_rate": 0.00016412213740458014,
      "loss": 5.533,
      "step": 50
    },
    {
      "epoch": 0.01,
      "grad_norm": 60.43737769128788,
      "learning_rate": 0.00016793893129770992,
      "loss": 5.7271,
      "step": 51
    },
    {
      "epoch": 0.01,
      "grad_norm": 40.38259047816709,
      "learning_rate": 0.0001717557251908397,
      "loss": 5.7707,
      "step": 52
    },
    {
      "epoch": 0.01,
      "grad_norm": 50.37624352755525,
      "learning_rate": 0.00017557251908396944,
      "loss": 5.4807,
      "step": 53
    },
    {
      "epoch": 0.01,
      "grad_norm": 105.31786701509579,
      "learning_rate": 0.00017938931297709925,
      "loss": 5.5782,
      "step": 54
    },
    {
      "epoch": 0.01,
      "grad_norm": 58.697213953188964,
      "learning_rate": 0.00018320610687022902,
      "loss": 5.5337,
      "step": 55
    },
    {
      "epoch": 0.01,
      "grad_norm": 110.55644774315732,
      "learning_rate": 0.0001870229007633588,
      "loss": 5.5836,
      "step": 56
    },
    {
      "epoch": 0.01,
      "grad_norm": 14.426822607818815,
      "learning_rate": 0.00019083969465648857,
      "loss": 5.6155,
      "step": 57
    },
    {
      "epoch": 0.01,
      "grad_norm": 26.228166827626183,
      "learning_rate": 0.00019465648854961832,
      "loss": 5.6817,
      "step": 58
    },
    {
      "epoch": 0.01,
      "grad_norm": 27.269174056089717,
      "learning_rate": 0.0001984732824427481,
      "loss": 5.2749,
      "step": 59
    },
    {
      "epoch": 0.01,
      "grad_norm": 21.695242218914665,
      "learning_rate": 0.00020229007633587788,
      "loss": 5.5323,
      "step": 60
    },
    {
      "epoch": 0.01,
      "grad_norm": 68.62936938874972,
      "learning_rate": 0.00020610687022900765,
      "loss": 5.804,
      "step": 61
    },
    {
      "epoch": 0.01,
      "grad_norm": 25.97127488754509,
      "learning_rate": 0.0002099236641221374,
      "loss": 5.4587,
      "step": 62
    },
    {
      "epoch": 0.01,
      "grad_norm": 15.325961009638357,
      "learning_rate": 0.00021374045801526718,
      "loss": 5.4342,
      "step": 63
    },
    {
      "epoch": 0.01,
      "grad_norm": 19.875772589083574,
      "learning_rate": 0.00021755725190839695,
      "loss": 5.5249,
      "step": 64
    },
    {
      "epoch": 0.01,
      "grad_norm": 69.24118271312939,
      "learning_rate": 0.00022137404580152673,
      "loss": 5.1242,
      "step": 65
    },
    {
      "epoch": 0.01,
      "grad_norm": 26.355890547603202,
      "learning_rate": 0.00022519083969465648,
      "loss": 5.5468,
      "step": 66
    },
    {
      "epoch": 0.01,
      "grad_norm": 10.563329361046026,
      "learning_rate": 0.00022900763358778625,
      "loss": 5.2759,
      "step": 67
    },
    {
      "epoch": 0.01,
      "grad_norm": 91.00091143398366,
      "learning_rate": 0.00023282442748091603,
      "loss": 5.4683,
      "step": 68
    },
    {
      "epoch": 0.01,
      "grad_norm": 31.924406772853743,
      "learning_rate": 0.0002366412213740458,
      "loss": 5.4741,
      "step": 69
    },
    {
      "epoch": 0.01,
      "grad_norm": 59.162721435471,
      "learning_rate": 0.00024045801526717558,
      "loss": 5.2497,
      "step": 70
    },
    {
      "epoch": 0.01,
      "grad_norm": 31.455685925896024,
      "learning_rate": 0.00024427480916030533,
      "loss": 5.3254,
      "step": 71
    },
    {
      "epoch": 0.01,
      "grad_norm": 67.5878959609375,
      "learning_rate": 0.00024809160305343513,
      "loss": 5.4352,
      "step": 72
    },
    {
      "epoch": 0.01,
      "grad_norm": 42.716427641408124,
      "learning_rate": 0.0002519083969465649,
      "loss": 5.1745,
      "step": 73
    },
    {
      "epoch": 0.01,
      "grad_norm": 51.763664942049346,
      "learning_rate": 0.00025572519083969463,
      "loss": 5.3703,
      "step": 74
    },
    {
      "epoch": 0.01,
      "grad_norm": 45.70575367441054,
      "learning_rate": 0.00025954198473282443,
      "loss": 5.3987,
      "step": 75
    },
    {
      "epoch": 0.01,
      "grad_norm": 22.180135968343524,
      "learning_rate": 0.0002633587786259542,
      "loss": 5.4224,
      "step": 76
    },
    {
      "epoch": 0.01,
      "grad_norm": 60.31973560441679,
      "learning_rate": 0.000267175572519084,
      "loss": 5.6587,
      "step": 77
    },
    {
      "epoch": 0.01,
      "grad_norm": 33.89974772165083,
      "learning_rate": 0.00027099236641221373,
      "loss": 5.6768,
      "step": 78
    },
    {
      "epoch": 0.01,
      "grad_norm": 34.7175502203455,
      "learning_rate": 0.00027480916030534353,
      "loss": 5.4154,
      "step": 79
    },
    {
      "epoch": 0.01,
      "grad_norm": 34.357468729191154,
      "learning_rate": 0.0002786259541984733,
      "loss": 5.4172,
      "step": 80
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 8721,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "total_flos": 419593936896.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}
|