|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.246376811594203, |
|
"eval_steps": 500, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07246376811594203, |
|
"grad_norm": 1.0494204759597778, |
|
"learning_rate": 0.00019800000000000002, |
|
"loss": 4.2586, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.14492753623188406, |
|
"grad_norm": 2.3070778846740723, |
|
"learning_rate": 0.000196, |
|
"loss": 3.4035, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 3.2612266540527344, |
|
"learning_rate": 0.000194, |
|
"loss": 2.6736, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2898550724637681, |
|
"grad_norm": 2.4876601696014404, |
|
"learning_rate": 0.0001922, |
|
"loss": 2.1857, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.36231884057971014, |
|
"grad_norm": 2.367360830307007, |
|
"learning_rate": 0.0001902, |
|
"loss": 2.0017, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 1.9053794145584106, |
|
"learning_rate": 0.0001882, |
|
"loss": 1.7045, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5072463768115942, |
|
"grad_norm": 2.813183069229126, |
|
"learning_rate": 0.00018620000000000003, |
|
"loss": 1.777, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5797101449275363, |
|
"grad_norm": 1.703532099723816, |
|
"learning_rate": 0.0001842, |
|
"loss": 1.8335, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 2.5876986980438232, |
|
"learning_rate": 0.0001822, |
|
"loss": 1.932, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7246376811594203, |
|
"grad_norm": 2.3653035163879395, |
|
"learning_rate": 0.00018020000000000002, |
|
"loss": 1.5939, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7971014492753623, |
|
"grad_norm": 1.8726710081100464, |
|
"learning_rate": 0.00017820000000000002, |
|
"loss": 1.816, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 1.8513643741607666, |
|
"learning_rate": 0.0001762, |
|
"loss": 1.4941, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9420289855072463, |
|
"grad_norm": 2.2692129611968994, |
|
"learning_rate": 0.0001742, |
|
"loss": 1.5971, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0144927536231885, |
|
"grad_norm": 2.6734588146209717, |
|
"learning_rate": 0.0001722, |
|
"loss": 1.6418, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0869565217391304, |
|
"grad_norm": 3.8182976245880127, |
|
"learning_rate": 0.00017020000000000002, |
|
"loss": 1.4724, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1594202898550725, |
|
"grad_norm": 2.3226006031036377, |
|
"learning_rate": 0.0001682, |
|
"loss": 1.3599, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2318840579710144, |
|
"grad_norm": 6.185395240783691, |
|
"learning_rate": 0.0001662, |
|
"loss": 1.3629, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 2.1148927211761475, |
|
"learning_rate": 0.0001642, |
|
"loss": 1.3997, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3768115942028984, |
|
"grad_norm": 3.705061197280884, |
|
"learning_rate": 0.0001622, |
|
"loss": 1.3026, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4492753623188406, |
|
"grad_norm": 3.832019090652466, |
|
"learning_rate": 0.00016020000000000002, |
|
"loss": 1.5701, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5217391304347827, |
|
"grad_norm": 2.6830215454101562, |
|
"learning_rate": 0.00015820000000000002, |
|
"loss": 1.6283, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5942028985507246, |
|
"grad_norm": 4.285362243652344, |
|
"learning_rate": 0.0001562, |
|
"loss": 1.3493, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 3.6948392391204834, |
|
"learning_rate": 0.0001542, |
|
"loss": 1.3661, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 3.770359992980957, |
|
"learning_rate": 0.0001522, |
|
"loss": 1.2724, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.8115942028985508, |
|
"grad_norm": 4.342422008514404, |
|
"learning_rate": 0.00015020000000000002, |
|
"loss": 1.2295, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.8840579710144927, |
|
"grad_norm": 4.003652572631836, |
|
"learning_rate": 0.0001482, |
|
"loss": 1.3862, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.9565217391304348, |
|
"grad_norm": 3.8167941570281982, |
|
"learning_rate": 0.0001462, |
|
"loss": 1.3623, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.028985507246377, |
|
"grad_norm": 4.051051139831543, |
|
"learning_rate": 0.0001442, |
|
"loss": 0.9666, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.101449275362319, |
|
"grad_norm": 2.2572319507598877, |
|
"learning_rate": 0.0001422, |
|
"loss": 0.9071, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"grad_norm": 5.0580878257751465, |
|
"learning_rate": 0.0001402, |
|
"loss": 0.9925, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.246376811594203, |
|
"grad_norm": 4.157490253448486, |
|
"learning_rate": 0.0001382, |
|
"loss": 1.2288, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.318840579710145, |
|
"grad_norm": 4.6029510498046875, |
|
"learning_rate": 0.0001362, |
|
"loss": 0.8646, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.391304347826087, |
|
"grad_norm": 6.775791645050049, |
|
"learning_rate": 0.0001342, |
|
"loss": 1.0872, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.463768115942029, |
|
"grad_norm": 4.0972113609313965, |
|
"learning_rate": 0.00013220000000000001, |
|
"loss": 0.9001, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.536231884057971, |
|
"grad_norm": 8.093110084533691, |
|
"learning_rate": 0.00013020000000000002, |
|
"loss": 0.9329, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 5.541107177734375, |
|
"learning_rate": 0.0001282, |
|
"loss": 1.1102, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.681159420289855, |
|
"grad_norm": 4.768208980560303, |
|
"learning_rate": 0.0001262, |
|
"loss": 0.9413, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.753623188405797, |
|
"grad_norm": 6.943519115447998, |
|
"learning_rate": 0.0001242, |
|
"loss": 1.1666, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.8260869565217392, |
|
"grad_norm": 4.54674768447876, |
|
"learning_rate": 0.00012220000000000002, |
|
"loss": 0.9934, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.898550724637681, |
|
"grad_norm": 4.663645267486572, |
|
"learning_rate": 0.00012020000000000001, |
|
"loss": 1.2474, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.971014492753623, |
|
"grad_norm": 4.170300483703613, |
|
"learning_rate": 0.0001182, |
|
"loss": 1.1413, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.0434782608695654, |
|
"grad_norm": 3.5200247764587402, |
|
"learning_rate": 0.00011619999999999999, |
|
"loss": 1.1336, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.1159420289855073, |
|
"grad_norm": 6.17999792098999, |
|
"learning_rate": 0.0001142, |
|
"loss": 0.8005, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.1884057971014492, |
|
"grad_norm": 6.850672245025635, |
|
"learning_rate": 0.00011220000000000002, |
|
"loss": 0.8476, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.260869565217391, |
|
"grad_norm": 5.512606620788574, |
|
"learning_rate": 0.00011020000000000001, |
|
"loss": 0.9174, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 5.102043628692627, |
|
"learning_rate": 0.00010820000000000001, |
|
"loss": 0.7113, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.4057971014492754, |
|
"grad_norm": 3.110646963119507, |
|
"learning_rate": 0.0001062, |
|
"loss": 0.752, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 4.310419082641602, |
|
"learning_rate": 0.00010420000000000001, |
|
"loss": 0.8261, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.550724637681159, |
|
"grad_norm": 6.366318225860596, |
|
"learning_rate": 0.0001022, |
|
"loss": 0.793, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.6231884057971016, |
|
"grad_norm": 7.170370578765869, |
|
"learning_rate": 0.00010020000000000001, |
|
"loss": 1.0607, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.6956521739130435, |
|
"grad_norm": 5.464928150177002, |
|
"learning_rate": 9.82e-05, |
|
"loss": 0.5825, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.7681159420289854, |
|
"grad_norm": 5.7281951904296875, |
|
"learning_rate": 9.620000000000001e-05, |
|
"loss": 0.786, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.8405797101449277, |
|
"grad_norm": 5.510980129241943, |
|
"learning_rate": 9.42e-05, |
|
"loss": 0.6948, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.9130434782608696, |
|
"grad_norm": 1.936035394668579, |
|
"learning_rate": 9.22e-05, |
|
"loss": 0.6707, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.9855072463768115, |
|
"grad_norm": 3.7161924839019775, |
|
"learning_rate": 9.020000000000001e-05, |
|
"loss": 0.8582, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.057971014492754, |
|
"grad_norm": 4.582805156707764, |
|
"learning_rate": 8.82e-05, |
|
"loss": 0.5581, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.130434782608695, |
|
"grad_norm": 6.6975250244140625, |
|
"learning_rate": 8.620000000000001e-05, |
|
"loss": 0.6891, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.202898550724638, |
|
"grad_norm": 4.396116256713867, |
|
"learning_rate": 8.42e-05, |
|
"loss": 0.6131, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.27536231884058, |
|
"grad_norm": 8.45380687713623, |
|
"learning_rate": 8.22e-05, |
|
"loss": 0.49, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 5.600996017456055, |
|
"learning_rate": 8.020000000000001e-05, |
|
"loss": 0.6265, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.420289855072464, |
|
"grad_norm": 9.154874801635742, |
|
"learning_rate": 7.82e-05, |
|
"loss": 0.6905, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.492753623188406, |
|
"grad_norm": 6.6350202560424805, |
|
"learning_rate": 7.620000000000001e-05, |
|
"loss": 0.5512, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.565217391304348, |
|
"grad_norm": 5.929750442504883, |
|
"learning_rate": 7.42e-05, |
|
"loss": 0.6337, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.63768115942029, |
|
"grad_norm": 4.856590270996094, |
|
"learning_rate": 7.22e-05, |
|
"loss": 0.5576, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.710144927536232, |
|
"grad_norm": 6.00139856338501, |
|
"learning_rate": 7.02e-05, |
|
"loss": 0.839, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.782608695652174, |
|
"grad_norm": 7.68943452835083, |
|
"learning_rate": 6.82e-05, |
|
"loss": 0.6007, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.855072463768116, |
|
"grad_norm": 3.7272567749023438, |
|
"learning_rate": 6.620000000000001e-05, |
|
"loss": 0.6239, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.927536231884058, |
|
"grad_norm": 4.914477825164795, |
|
"learning_rate": 6.42e-05, |
|
"loss": 0.6188, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 9.873302459716797, |
|
"learning_rate": 6.220000000000001e-05, |
|
"loss": 0.5975, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 5.072463768115942, |
|
"grad_norm": 8.113228797912598, |
|
"learning_rate": 6.02e-05, |
|
"loss": 0.4262, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.144927536231884, |
|
"grad_norm": 2.4135184288024902, |
|
"learning_rate": 5.82e-05, |
|
"loss": 0.3966, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.217391304347826, |
|
"grad_norm": 3.7978782653808594, |
|
"learning_rate": 5.620000000000001e-05, |
|
"loss": 0.5201, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.2898550724637685, |
|
"grad_norm": 4.620602607727051, |
|
"learning_rate": 5.420000000000001e-05, |
|
"loss": 0.5306, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.36231884057971, |
|
"grad_norm": 7.97003173828125, |
|
"learning_rate": 5.22e-05, |
|
"loss": 0.4757, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.434782608695652, |
|
"grad_norm": 3.4005777835845947, |
|
"learning_rate": 5.02e-05, |
|
"loss": 0.3991, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 5.507246376811594, |
|
"grad_norm": 9.03802490234375, |
|
"learning_rate": 4.82e-05, |
|
"loss": 0.4278, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.579710144927536, |
|
"grad_norm": 4.8757123947143555, |
|
"learning_rate": 4.6200000000000005e-05, |
|
"loss": 0.4691, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 5.6521739130434785, |
|
"grad_norm": 7.352402210235596, |
|
"learning_rate": 4.4200000000000004e-05, |
|
"loss": 0.4877, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.72463768115942, |
|
"grad_norm": 4.516758918762207, |
|
"learning_rate": 4.22e-05, |
|
"loss": 0.506, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 5.797101449275362, |
|
"grad_norm": 6.949781894683838, |
|
"learning_rate": 4.02e-05, |
|
"loss": 0.688, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.869565217391305, |
|
"grad_norm": 8.907429695129395, |
|
"learning_rate": 3.82e-05, |
|
"loss": 0.5848, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.942028985507246, |
|
"grad_norm": 8.472686767578125, |
|
"learning_rate": 3.62e-05, |
|
"loss": 0.4619, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 6.0144927536231885, |
|
"grad_norm": 3.424809217453003, |
|
"learning_rate": 3.4200000000000005e-05, |
|
"loss": 0.4204, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 6.086956521739131, |
|
"grad_norm": 6.555367946624756, |
|
"learning_rate": 3.2200000000000003e-05, |
|
"loss": 0.3869, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.159420289855072, |
|
"grad_norm": 7.202473163604736, |
|
"learning_rate": 3.02e-05, |
|
"loss": 0.4058, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.231884057971015, |
|
"grad_norm": 9.05301570892334, |
|
"learning_rate": 2.8199999999999998e-05, |
|
"loss": 0.3875, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.304347826086957, |
|
"grad_norm": 8.509578704833984, |
|
"learning_rate": 2.6200000000000003e-05, |
|
"loss": 0.4036, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.3768115942028984, |
|
"grad_norm": 4.049665451049805, |
|
"learning_rate": 2.4200000000000002e-05, |
|
"loss": 0.4272, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 6.449275362318841, |
|
"grad_norm": 3.557060718536377, |
|
"learning_rate": 2.22e-05, |
|
"loss": 0.3636, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 6.521739130434782, |
|
"grad_norm": 8.958136558532715, |
|
"learning_rate": 2.0200000000000003e-05, |
|
"loss": 0.4378, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.594202898550725, |
|
"grad_norm": 2.7690534591674805, |
|
"learning_rate": 1.8200000000000002e-05, |
|
"loss": 0.3435, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 5.819123268127441, |
|
"learning_rate": 1.62e-05, |
|
"loss": 0.4098, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 6.739130434782608, |
|
"grad_norm": 6.891845226287842, |
|
"learning_rate": 1.42e-05, |
|
"loss": 0.3363, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 6.811594202898551, |
|
"grad_norm": 7.646413326263428, |
|
"learning_rate": 1.22e-05, |
|
"loss": 0.4361, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 6.884057971014493, |
|
"grad_norm": 7.139030933380127, |
|
"learning_rate": 1.02e-05, |
|
"loss": 0.3213, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 6.956521739130435, |
|
"grad_norm": 8.147725105285645, |
|
"learning_rate": 8.200000000000001e-06, |
|
"loss": 0.4433, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 7.028985507246377, |
|
"grad_norm": 9.252585411071777, |
|
"learning_rate": 6.2e-06, |
|
"loss": 0.4423, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 7.101449275362318, |
|
"grad_norm": 3.595215320587158, |
|
"learning_rate": 4.2000000000000004e-06, |
|
"loss": 0.389, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 7.173913043478261, |
|
"grad_norm": 2.2256908416748047, |
|
"learning_rate": 2.2e-06, |
|
"loss": 0.3653, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 7.246376811594203, |
|
"grad_norm": 7.12455415725708, |
|
"learning_rate": 2.0000000000000002e-07, |
|
"loss": 0.3505, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 787356038406144.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|