|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 25.0, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 1.5333333333333334e-05, |
|
"loss": 1.135, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 3.1333333333333334e-05, |
|
"loss": 0.9975, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 4.7333333333333336e-05, |
|
"loss": 0.8521, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 6.333333333333333e-05, |
|
"loss": 0.6949, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"learning_rate": 7.933333333333334e-05, |
|
"loss": 0.5953, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 9.533333333333334e-05, |
|
"loss": 0.5087, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 0.00011133333333333333, |
|
"loss": 0.4767, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"learning_rate": 0.00012733333333333336, |
|
"loss": 0.4546, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"learning_rate": 0.00014333333333333334, |
|
"loss": 0.4363, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 0.00015933333333333332, |
|
"loss": 0.4296, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"learning_rate": 0.00017533333333333336, |
|
"loss": 0.3956, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 0.00019133333333333334, |
|
"loss": 0.3875, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"learning_rate": 0.0001991851851851852, |
|
"loss": 0.3885, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 0.00019740740740740743, |
|
"loss": 0.3827, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 0.00019562962962962964, |
|
"loss": 0.378, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 0.00019385185185185187, |
|
"loss": 0.3337, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"learning_rate": 0.00019207407407407408, |
|
"loss": 0.3448, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 0.00019029629629629632, |
|
"loss": 0.3409, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"learning_rate": 0.00018851851851851853, |
|
"loss": 0.3446, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 0.00018674074074074074, |
|
"loss": 0.3396, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"learning_rate": 0.00018496296296296297, |
|
"loss": 0.2972, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 0.00018318518518518518, |
|
"loss": 0.3032, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"learning_rate": 0.00018140740740740742, |
|
"loss": 0.3051, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"learning_rate": 0.00017962962962962963, |
|
"loss": 0.307, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 0.00017785185185185186, |
|
"loss": 0.3093, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"learning_rate": 0.00017607407407407407, |
|
"loss": 0.261, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"learning_rate": 0.0001742962962962963, |
|
"loss": 0.2686, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"learning_rate": 0.00017251851851851852, |
|
"loss": 0.2753, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"learning_rate": 0.00017074074074074075, |
|
"loss": 0.2802, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"learning_rate": 0.00016896296296296296, |
|
"loss": 0.2805, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"learning_rate": 0.0001671851851851852, |
|
"loss": 0.2335, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 0.0001654074074074074, |
|
"loss": 0.234, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"learning_rate": 0.00016362962962962964, |
|
"loss": 0.2434, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"learning_rate": 0.00016185185185185185, |
|
"loss": 0.253, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"learning_rate": 0.0001600740740740741, |
|
"loss": 0.2481, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"learning_rate": 0.0001582962962962963, |
|
"loss": 0.2053, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"learning_rate": 0.00015651851851851854, |
|
"loss": 0.2058, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"learning_rate": 0.00015474074074074074, |
|
"loss": 0.213, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"learning_rate": 0.00015296296296296298, |
|
"loss": 0.2155, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 0.0001511851851851852, |
|
"loss": 0.2218, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"learning_rate": 0.00014940740740740743, |
|
"loss": 0.1721, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"learning_rate": 0.00014762962962962964, |
|
"loss": 0.1812, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"learning_rate": 0.00014585185185185187, |
|
"loss": 0.1858, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"learning_rate": 0.00014407407407407408, |
|
"loss": 0.187, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"learning_rate": 0.00014229629629629632, |
|
"loss": 0.1929, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"learning_rate": 0.00014051851851851853, |
|
"loss": 0.1449, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"learning_rate": 0.00013874074074074076, |
|
"loss": 0.1539, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"learning_rate": 0.00013696296296296297, |
|
"loss": 0.1556, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"learning_rate": 0.00013518518518518518, |
|
"loss": 0.1615, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 0.00013340740740740742, |
|
"loss": 0.1704, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 10.2, |
|
"learning_rate": 0.00013162962962962963, |
|
"loss": 0.1237, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"learning_rate": 0.00012985185185185186, |
|
"loss": 0.1282, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 10.6, |
|
"learning_rate": 0.00012807407407407407, |
|
"loss": 0.1335, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"learning_rate": 0.0001262962962962963, |
|
"loss": 0.1401, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"learning_rate": 0.00012451851851851852, |
|
"loss": 0.14, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"learning_rate": 0.00012274074074074075, |
|
"loss": 0.0996, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 11.4, |
|
"learning_rate": 0.00012096296296296296, |
|
"loss": 0.1093, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"learning_rate": 0.0001191851851851852, |
|
"loss": 0.1096, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 11.8, |
|
"learning_rate": 0.00011740740740740741, |
|
"loss": 0.1137, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 0.00011562962962962964, |
|
"loss": 0.1187, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 12.2, |
|
"learning_rate": 0.00011385185185185185, |
|
"loss": 0.0852, |
|
"step": 1464 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"learning_rate": 0.00011207407407407409, |
|
"loss": 0.0901, |
|
"step": 1488 |
|
}, |
|
{ |
|
"epoch": 12.6, |
|
"learning_rate": 0.0001102962962962963, |
|
"loss": 0.0909, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 0.00010851851851851853, |
|
"loss": 0.094, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"learning_rate": 0.00010674074074074074, |
|
"loss": 0.1011, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"learning_rate": 0.00010496296296296298, |
|
"loss": 0.0703, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"learning_rate": 0.00010318518518518519, |
|
"loss": 0.0726, |
|
"step": 1608 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"learning_rate": 0.00010140740740740741, |
|
"loss": 0.0769, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 13.8, |
|
"learning_rate": 9.962962962962963e-05, |
|
"loss": 0.0787, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"learning_rate": 9.785185185185186e-05, |
|
"loss": 0.0848, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 14.2, |
|
"learning_rate": 9.607407407407408e-05, |
|
"loss": 0.0604, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"learning_rate": 9.42962962962963e-05, |
|
"loss": 0.0586, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 14.6, |
|
"learning_rate": 9.251851851851852e-05, |
|
"loss": 0.0616, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"learning_rate": 9.074074074074075e-05, |
|
"loss": 0.0663, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"learning_rate": 8.896296296296297e-05, |
|
"loss": 0.0651, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 15.2, |
|
"learning_rate": 8.718518518518519e-05, |
|
"loss": 0.0479, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 15.4, |
|
"learning_rate": 8.540740740740742e-05, |
|
"loss": 0.05, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"learning_rate": 8.362962962962964e-05, |
|
"loss": 0.0521, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 15.8, |
|
"learning_rate": 8.185185185185186e-05, |
|
"loss": 0.0523, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 8.007407407407408e-05, |
|
"loss": 0.0535, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 16.2, |
|
"learning_rate": 7.82962962962963e-05, |
|
"loss": 0.0402, |
|
"step": 1944 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"learning_rate": 7.651851851851853e-05, |
|
"loss": 0.0406, |
|
"step": 1968 |
|
}, |
|
{ |
|
"epoch": 16.6, |
|
"learning_rate": 7.474074074074074e-05, |
|
"loss": 0.0426, |
|
"step": 1992 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"learning_rate": 7.296296296296296e-05, |
|
"loss": 0.0414, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"learning_rate": 7.118518518518518e-05, |
|
"loss": 0.0438, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"learning_rate": 6.94074074074074e-05, |
|
"loss": 0.0336, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 17.4, |
|
"learning_rate": 6.762962962962963e-05, |
|
"loss": 0.0332, |
|
"step": 2088 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"learning_rate": 6.585185185185185e-05, |
|
"loss": 0.0342, |
|
"step": 2112 |
|
}, |
|
{ |
|
"epoch": 17.8, |
|
"learning_rate": 6.407407407407407e-05, |
|
"loss": 0.0342, |
|
"step": 2136 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"learning_rate": 6.22962962962963e-05, |
|
"loss": 0.0356, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 18.2, |
|
"learning_rate": 6.051851851851852e-05, |
|
"loss": 0.0259, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"learning_rate": 5.874074074074074e-05, |
|
"loss": 0.0269, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 18.6, |
|
"learning_rate": 5.6962962962962965e-05, |
|
"loss": 0.0282, |
|
"step": 2232 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"learning_rate": 5.518518518518519e-05, |
|
"loss": 0.0291, |
|
"step": 2256 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"learning_rate": 5.34074074074074e-05, |
|
"loss": 0.0297, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"learning_rate": 5.1629629629629626e-05, |
|
"loss": 0.0236, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 19.4, |
|
"learning_rate": 4.9851851851851855e-05, |
|
"loss": 0.0234, |
|
"step": 2328 |
|
}, |
|
{ |
|
"epoch": 19.6, |
|
"learning_rate": 4.807407407407408e-05, |
|
"loss": 0.0233, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 19.8, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 0.0231, |
|
"step": 2376 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 4.4518518518518523e-05, |
|
"loss": 0.0231, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 20.2, |
|
"learning_rate": 4.274074074074074e-05, |
|
"loss": 0.0183, |
|
"step": 2424 |
|
}, |
|
{ |
|
"epoch": 20.4, |
|
"learning_rate": 4.096296296296296e-05, |
|
"loss": 0.0196, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 20.6, |
|
"learning_rate": 3.9185185185185185e-05, |
|
"loss": 0.0197, |
|
"step": 2472 |
|
}, |
|
{ |
|
"epoch": 20.8, |
|
"learning_rate": 3.740740740740741e-05, |
|
"loss": 0.0198, |
|
"step": 2496 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"learning_rate": 3.562962962962963e-05, |
|
"loss": 0.0196, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 21.2, |
|
"learning_rate": 3.385185185185185e-05, |
|
"loss": 0.0162, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 21.4, |
|
"learning_rate": 3.2074074074074075e-05, |
|
"loss": 0.016, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 21.6, |
|
"learning_rate": 3.02962962962963e-05, |
|
"loss": 0.0166, |
|
"step": 2592 |
|
}, |
|
{ |
|
"epoch": 21.8, |
|
"learning_rate": 2.851851851851852e-05, |
|
"loss": 0.0168, |
|
"step": 2616 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"learning_rate": 2.6740740740740743e-05, |
|
"loss": 0.0167, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 22.2, |
|
"learning_rate": 2.4962962962962963e-05, |
|
"loss": 0.0145, |
|
"step": 2664 |
|
}, |
|
{ |
|
"epoch": 22.4, |
|
"learning_rate": 2.318518518518519e-05, |
|
"loss": 0.014, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 22.6, |
|
"learning_rate": 2.1407407407407408e-05, |
|
"loss": 0.0135, |
|
"step": 2712 |
|
}, |
|
{ |
|
"epoch": 22.8, |
|
"learning_rate": 1.962962962962963e-05, |
|
"loss": 0.014, |
|
"step": 2736 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"learning_rate": 1.7851851851851853e-05, |
|
"loss": 0.0144, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 23.2, |
|
"learning_rate": 1.6074074074074076e-05, |
|
"loss": 0.0123, |
|
"step": 2784 |
|
}, |
|
{ |
|
"epoch": 23.4, |
|
"learning_rate": 1.4296296296296297e-05, |
|
"loss": 0.0128, |
|
"step": 2808 |
|
}, |
|
{ |
|
"epoch": 23.6, |
|
"learning_rate": 1.2518518518518518e-05, |
|
"loss": 0.0128, |
|
"step": 2832 |
|
}, |
|
{ |
|
"epoch": 23.8, |
|
"learning_rate": 1.074074074074074e-05, |
|
"loss": 0.0124, |
|
"step": 2856 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"learning_rate": 8.962962962962963e-06, |
|
"loss": 0.0125, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 24.2, |
|
"learning_rate": 7.185185185185185e-06, |
|
"loss": 0.0113, |
|
"step": 2904 |
|
}, |
|
{ |
|
"epoch": 24.4, |
|
"learning_rate": 5.407407407407407e-06, |
|
"loss": 0.0112, |
|
"step": 2928 |
|
}, |
|
{ |
|
"epoch": 24.6, |
|
"learning_rate": 3.6296296296296302e-06, |
|
"loss": 0.0114, |
|
"step": 2952 |
|
}, |
|
{ |
|
"epoch": 24.8, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 0.0118, |
|
"step": 2976 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"learning_rate": 7.407407407407407e-08, |
|
"loss": 0.0116, |
|
"step": 3000 |
|
} |
|
], |
|
"max_steps": 3000, |
|
"num_train_epochs": 25, |
|
"total_flos": 4.87764705411072e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|