|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.65848934796643, |
|
"eval_steps": 500, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.3856842517852783, |
|
"learning_rate": 1.9868055555555558e-05, |
|
"loss": 1.1287, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.4272608458995819, |
|
"learning_rate": 1.9736111111111115e-05, |
|
"loss": 1.0586, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.4174883961677551, |
|
"learning_rate": 1.9597222222222224e-05, |
|
"loss": 1.0166, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.35379958152770996, |
|
"learning_rate": 1.9458333333333333e-05, |
|
"loss": 0.8919, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.3820742964744568, |
|
"learning_rate": 1.9319444444444446e-05, |
|
"loss": 0.7971, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.34241142868995667, |
|
"learning_rate": 1.918055555555556e-05, |
|
"loss": 0.7466, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.48230335116386414, |
|
"learning_rate": 1.9041666666666668e-05, |
|
"loss": 0.7468, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.45057639479637146, |
|
"learning_rate": 1.890277777777778e-05, |
|
"loss": 0.7373, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.43228989839553833, |
|
"learning_rate": 1.876388888888889e-05, |
|
"loss": 0.756, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.460918664932251, |
|
"learning_rate": 1.8625000000000002e-05, |
|
"loss": 0.7186, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.8221462368965149, |
|
"learning_rate": 1.8486111111111115e-05, |
|
"loss": 0.6772, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.547721266746521, |
|
"learning_rate": 1.8347222222222224e-05, |
|
"loss": 0.6809, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.6005927920341492, |
|
"learning_rate": 1.8208333333333333e-05, |
|
"loss": 0.6651, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.7613247632980347, |
|
"learning_rate": 1.8069444444444446e-05, |
|
"loss": 0.641, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.668274462223053, |
|
"learning_rate": 1.793055555555556e-05, |
|
"loss": 0.6296, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.6101451516151428, |
|
"learning_rate": 1.7791666666666668e-05, |
|
"loss": 0.6033, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.8970944285392761, |
|
"learning_rate": 1.765277777777778e-05, |
|
"loss": 0.5939, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.9417675137519836, |
|
"learning_rate": 1.751388888888889e-05, |
|
"loss": 0.5966, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 1.3364688158035278, |
|
"learning_rate": 1.7375000000000002e-05, |
|
"loss": 0.5771, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.7763314843177795, |
|
"learning_rate": 1.7236111111111115e-05, |
|
"loss": 0.5588, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 1.156174659729004, |
|
"learning_rate": 1.7097222222222224e-05, |
|
"loss": 0.5571, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.8457236886024475, |
|
"learning_rate": 1.6958333333333333e-05, |
|
"loss": 0.5431, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 1.1372079849243164, |
|
"learning_rate": 1.6819444444444446e-05, |
|
"loss": 0.4839, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 1.0457851886749268, |
|
"learning_rate": 1.668055555555556e-05, |
|
"loss": 0.5041, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 1.1884819269180298, |
|
"learning_rate": 1.6541666666666668e-05, |
|
"loss": 0.468, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 1.5204098224639893, |
|
"learning_rate": 1.6402777777777777e-05, |
|
"loss": 0.4778, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"grad_norm": 1.3218625783920288, |
|
"learning_rate": 1.626388888888889e-05, |
|
"loss": 0.4682, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 1.2012380361557007, |
|
"learning_rate": 1.6125000000000002e-05, |
|
"loss": 0.4326, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 5.99, |
|
"grad_norm": 1.4303064346313477, |
|
"learning_rate": 1.5986111111111115e-05, |
|
"loss": 0.4358, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 1.4230263233184814, |
|
"learning_rate": 1.5847222222222224e-05, |
|
"loss": 0.4384, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 1.5604521036148071, |
|
"learning_rate": 1.5708333333333333e-05, |
|
"loss": 0.3908, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 1.7824558019638062, |
|
"learning_rate": 1.5569444444444446e-05, |
|
"loss": 0.4017, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 1.4250575304031372, |
|
"learning_rate": 1.543055555555556e-05, |
|
"loss": 0.3759, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 1.5560147762298584, |
|
"learning_rate": 1.5291666666666668e-05, |
|
"loss": 0.3869, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"grad_norm": 1.7027453184127808, |
|
"learning_rate": 1.5152777777777779e-05, |
|
"loss": 0.3591, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 1.5872056484222412, |
|
"learning_rate": 1.501388888888889e-05, |
|
"loss": 0.3573, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 1.5370341539382935, |
|
"learning_rate": 1.4875000000000002e-05, |
|
"loss": 0.3488, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 1.853333830833435, |
|
"learning_rate": 1.4736111111111113e-05, |
|
"loss": 0.3416, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 1.5671972036361694, |
|
"learning_rate": 1.4597222222222223e-05, |
|
"loss": 0.3282, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"grad_norm": 1.6593025922775269, |
|
"learning_rate": 1.4458333333333334e-05, |
|
"loss": 0.3112, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 8.47, |
|
"grad_norm": 1.7826825380325317, |
|
"learning_rate": 1.4319444444444446e-05, |
|
"loss": 0.286, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"grad_norm": 1.5925076007843018, |
|
"learning_rate": 1.4180555555555557e-05, |
|
"loss": 0.3281, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 1.7169785499572754, |
|
"learning_rate": 1.4041666666666666e-05, |
|
"loss": 0.3092, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"grad_norm": 1.8225599527359009, |
|
"learning_rate": 1.3902777777777779e-05, |
|
"loss": 0.2728, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 9.3, |
|
"grad_norm": 2.113538980484009, |
|
"learning_rate": 1.376388888888889e-05, |
|
"loss": 0.2893, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 2.0112457275390625, |
|
"learning_rate": 1.3625e-05, |
|
"loss": 0.277, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 9.71, |
|
"grad_norm": 1.9079582691192627, |
|
"learning_rate": 1.3486111111111113e-05, |
|
"loss": 0.261, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 1.793885350227356, |
|
"learning_rate": 1.3347222222222223e-05, |
|
"loss": 0.267, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 10.12, |
|
"grad_norm": 1.770896315574646, |
|
"learning_rate": 1.3208333333333334e-05, |
|
"loss": 0.2719, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 10.33, |
|
"grad_norm": 1.849857211112976, |
|
"learning_rate": 1.3069444444444446e-05, |
|
"loss": 0.2399, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 10.54, |
|
"grad_norm": 1.8785293102264404, |
|
"learning_rate": 1.2930555555555557e-05, |
|
"loss": 0.2316, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 10.74, |
|
"grad_norm": 1.7189656496047974, |
|
"learning_rate": 1.2791666666666666e-05, |
|
"loss": 0.243, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 10.95, |
|
"grad_norm": 1.6181199550628662, |
|
"learning_rate": 1.2652777777777779e-05, |
|
"loss": 0.2347, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 11.16, |
|
"grad_norm": 2.0756866931915283, |
|
"learning_rate": 1.251388888888889e-05, |
|
"loss": 0.2407, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 11.36, |
|
"grad_norm": 1.9992755651474, |
|
"learning_rate": 1.2375000000000001e-05, |
|
"loss": 0.2205, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 11.57, |
|
"grad_norm": 2.19657039642334, |
|
"learning_rate": 1.2236111111111114e-05, |
|
"loss": 0.1992, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 11.78, |
|
"grad_norm": 1.8114335536956787, |
|
"learning_rate": 1.2097222222222223e-05, |
|
"loss": 0.219, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 11.98, |
|
"grad_norm": 1.841556429862976, |
|
"learning_rate": 1.1958333333333334e-05, |
|
"loss": 0.2051, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 12.19, |
|
"grad_norm": 1.9361953735351562, |
|
"learning_rate": 1.1819444444444446e-05, |
|
"loss": 0.2025, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"grad_norm": 1.841424584388733, |
|
"learning_rate": 1.1680555555555557e-05, |
|
"loss": 0.2109, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 12.6, |
|
"grad_norm": 2.569114923477173, |
|
"learning_rate": 1.1541666666666667e-05, |
|
"loss": 0.1933, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 12.81, |
|
"grad_norm": 1.8263764381408691, |
|
"learning_rate": 1.1402777777777777e-05, |
|
"loss": 0.1814, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 13.01, |
|
"grad_norm": 1.7250192165374756, |
|
"learning_rate": 1.126388888888889e-05, |
|
"loss": 0.1842, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 13.22, |
|
"grad_norm": 1.8750269412994385, |
|
"learning_rate": 1.1125000000000001e-05, |
|
"loss": 0.1803, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 13.43, |
|
"grad_norm": 2.2998244762420654, |
|
"learning_rate": 1.0986111111111114e-05, |
|
"loss": 0.167, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 13.63, |
|
"grad_norm": 1.663015604019165, |
|
"learning_rate": 1.0847222222222223e-05, |
|
"loss": 0.1723, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 13.84, |
|
"grad_norm": 2.0710666179656982, |
|
"learning_rate": 1.0708333333333334e-05, |
|
"loss": 0.1895, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 14.05, |
|
"grad_norm": 1.8963505029678345, |
|
"learning_rate": 1.0569444444444445e-05, |
|
"loss": 0.1783, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 2.3760714530944824, |
|
"learning_rate": 1.0430555555555557e-05, |
|
"loss": 0.1674, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 14.46, |
|
"grad_norm": 1.8337997198104858, |
|
"learning_rate": 1.0291666666666667e-05, |
|
"loss": 0.1677, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 14.67, |
|
"grad_norm": 1.8879417181015015, |
|
"learning_rate": 1.0152777777777778e-05, |
|
"loss": 0.1602, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 14.87, |
|
"grad_norm": 1.9163988828659058, |
|
"learning_rate": 1.001388888888889e-05, |
|
"loss": 0.1536, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 15.08, |
|
"grad_norm": 1.772133469581604, |
|
"learning_rate": 9.875000000000001e-06, |
|
"loss": 0.1422, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 15.29, |
|
"grad_norm": 2.173783540725708, |
|
"learning_rate": 9.736111111111112e-06, |
|
"loss": 0.1634, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 15.49, |
|
"grad_norm": 2.1147613525390625, |
|
"learning_rate": 9.597222222222223e-06, |
|
"loss": 0.14, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 15.7, |
|
"grad_norm": 2.0401387214660645, |
|
"learning_rate": 9.458333333333334e-06, |
|
"loss": 0.1498, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 15.91, |
|
"grad_norm": 2.0865604877471924, |
|
"learning_rate": 9.319444444444445e-06, |
|
"loss": 0.1395, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 16.11, |
|
"grad_norm": 1.884482502937317, |
|
"learning_rate": 9.180555555555556e-06, |
|
"loss": 0.1461, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 16.32, |
|
"grad_norm": 2.2012784481048584, |
|
"learning_rate": 9.041666666666667e-06, |
|
"loss": 0.1297, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 16.53, |
|
"grad_norm": 2.0317113399505615, |
|
"learning_rate": 8.902777777777778e-06, |
|
"loss": 0.15, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 16.73, |
|
"grad_norm": 1.4792850017547607, |
|
"learning_rate": 8.76388888888889e-06, |
|
"loss": 0.1359, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 16.94, |
|
"grad_norm": 2.3025472164154053, |
|
"learning_rate": 8.625000000000001e-06, |
|
"loss": 0.1298, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 17.15, |
|
"grad_norm": 1.8987499475479126, |
|
"learning_rate": 8.486111111111112e-06, |
|
"loss": 0.1287, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 17.35, |
|
"grad_norm": 3.6023619174957275, |
|
"learning_rate": 8.347222222222223e-06, |
|
"loss": 0.1584, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 17.56, |
|
"grad_norm": 1.6749480962753296, |
|
"learning_rate": 8.208333333333334e-06, |
|
"loss": 0.1071, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 17.77, |
|
"grad_norm": 1.894116759300232, |
|
"learning_rate": 8.069444444444445e-06, |
|
"loss": 0.1233, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 17.97, |
|
"grad_norm": 1.9405465126037598, |
|
"learning_rate": 7.930555555555556e-06, |
|
"loss": 0.1143, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"grad_norm": 1.8571505546569824, |
|
"learning_rate": 7.791666666666667e-06, |
|
"loss": 0.1197, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 18.39, |
|
"grad_norm": 1.8494367599487305, |
|
"learning_rate": 7.652777777777778e-06, |
|
"loss": 0.133, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 18.59, |
|
"grad_norm": 1.8017449378967285, |
|
"learning_rate": 7.51388888888889e-06, |
|
"loss": 0.1036, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"grad_norm": 1.9734400510787964, |
|
"learning_rate": 7.375000000000001e-06, |
|
"loss": 0.1135, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 19.01, |
|
"grad_norm": 1.88029944896698, |
|
"learning_rate": 7.2361111111111115e-06, |
|
"loss": 0.1159, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 19.21, |
|
"grad_norm": 1.6627161502838135, |
|
"learning_rate": 7.097222222222223e-06, |
|
"loss": 0.1138, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 19.42, |
|
"grad_norm": 1.9287075996398926, |
|
"learning_rate": 6.958333333333333e-06, |
|
"loss": 0.1154, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 19.63, |
|
"grad_norm": 1.843787670135498, |
|
"learning_rate": 6.819444444444445e-06, |
|
"loss": 0.1052, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 19.83, |
|
"grad_norm": 2.0632147789001465, |
|
"learning_rate": 6.680555555555556e-06, |
|
"loss": 0.1157, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 20.04, |
|
"grad_norm": 1.6356507539749146, |
|
"learning_rate": 6.541666666666667e-06, |
|
"loss": 0.0993, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 20.25, |
|
"grad_norm": 2.1572136878967285, |
|
"learning_rate": 6.402777777777778e-06, |
|
"loss": 0.0973, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 20.45, |
|
"grad_norm": 1.834170937538147, |
|
"learning_rate": 6.26388888888889e-06, |
|
"loss": 0.0947, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 20.66, |
|
"grad_norm": 1.80930495262146, |
|
"learning_rate": 6.125000000000001e-06, |
|
"loss": 0.1176, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 2880, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"total_flos": 4.626131694005453e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|