{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9946268656716417,
  "eval_steps": 500,
  "global_step": 1254,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023880597014925373,
      "grad_norm": 10.273793568807084,
      "learning_rate": 5e-06,
      "loss": 0.8832,
      "step": 10
    },
    {
      "epoch": 0.04776119402985075,
      "grad_norm": 17.413079986249713,
      "learning_rate": 5e-06,
      "loss": 0.7826,
      "step": 20
    },
    {
      "epoch": 0.07164179104477612,
      "grad_norm": 1.7682355694433471,
      "learning_rate": 5e-06,
      "loss": 0.7785,
      "step": 30
    },
    {
      "epoch": 0.0955223880597015,
      "grad_norm": 1.2119658541333866,
      "learning_rate": 5e-06,
      "loss": 0.7537,
      "step": 40
    },
    {
      "epoch": 0.11940298507462686,
      "grad_norm": 0.8476848561416307,
      "learning_rate": 5e-06,
      "loss": 0.7389,
      "step": 50
    },
    {
      "epoch": 0.14328358208955225,
      "grad_norm": 0.7667403306996323,
      "learning_rate": 5e-06,
      "loss": 0.7217,
      "step": 60
    },
    {
      "epoch": 0.16716417910447762,
      "grad_norm": 0.8702244781914732,
      "learning_rate": 5e-06,
      "loss": 0.7085,
      "step": 70
    },
    {
      "epoch": 0.191044776119403,
      "grad_norm": 1.3853703465833076,
      "learning_rate": 5e-06,
      "loss": 0.6971,
      "step": 80
    },
    {
      "epoch": 0.21492537313432836,
      "grad_norm": 1.2094222885220798,
      "learning_rate": 5e-06,
      "loss": 0.6882,
      "step": 90
    },
    {
      "epoch": 0.23880597014925373,
      "grad_norm": 0.7689681572000278,
      "learning_rate": 5e-06,
      "loss": 0.6849,
      "step": 100
    },
    {
      "epoch": 0.2626865671641791,
      "grad_norm": 0.7780597190717987,
      "learning_rate": 5e-06,
      "loss": 0.6818,
      "step": 110
    },
    {
      "epoch": 0.2865671641791045,
      "grad_norm": 0.5000855152810337,
      "learning_rate": 5e-06,
      "loss": 0.6816,
      "step": 120
    },
    {
      "epoch": 0.31044776119402984,
      "grad_norm": 0.7276396850932823,
      "learning_rate": 5e-06,
      "loss": 0.6736,
      "step": 130
    },
    {
      "epoch": 0.33432835820895523,
      "grad_norm": 0.6186133127584187,
      "learning_rate": 5e-06,
      "loss": 0.6818,
      "step": 140
    },
    {
      "epoch": 0.3582089552238806,
      "grad_norm": 0.48005981720062146,
      "learning_rate": 5e-06,
      "loss": 0.673,
      "step": 150
    },
    {
      "epoch": 0.382089552238806,
      "grad_norm": 0.5023606142200144,
      "learning_rate": 5e-06,
      "loss": 0.6773,
      "step": 160
    },
    {
      "epoch": 0.4059701492537313,
      "grad_norm": 0.5808170399886017,
      "learning_rate": 5e-06,
      "loss": 0.6677,
      "step": 170
    },
    {
      "epoch": 0.4298507462686567,
      "grad_norm": 0.4379969982797677,
      "learning_rate": 5e-06,
      "loss": 0.6685,
      "step": 180
    },
    {
      "epoch": 0.4537313432835821,
      "grad_norm": 0.5347993662162854,
      "learning_rate": 5e-06,
      "loss": 0.6752,
      "step": 190
    },
    {
      "epoch": 0.47761194029850745,
      "grad_norm": 0.5389725567643092,
      "learning_rate": 5e-06,
      "loss": 0.6648,
      "step": 200
    },
    {
      "epoch": 0.5014925373134328,
      "grad_norm": 0.48052530490608086,
      "learning_rate": 5e-06,
      "loss": 0.6641,
      "step": 210
    },
    {
      "epoch": 0.5253731343283582,
      "grad_norm": 0.5805014519700745,
      "learning_rate": 5e-06,
      "loss": 0.6591,
      "step": 220
    },
    {
      "epoch": 0.5492537313432836,
      "grad_norm": 0.48958097076386,
      "learning_rate": 5e-06,
      "loss": 0.6655,
      "step": 230
    },
    {
      "epoch": 0.573134328358209,
      "grad_norm": 0.49437501765783676,
      "learning_rate": 5e-06,
      "loss": 0.6683,
      "step": 240
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 0.45944903239035917,
      "learning_rate": 5e-06,
      "loss": 0.659,
      "step": 250
    },
    {
      "epoch": 0.6208955223880597,
      "grad_norm": 0.4748455280427033,
      "learning_rate": 5e-06,
      "loss": 0.6583,
      "step": 260
    },
    {
      "epoch": 0.6447761194029851,
      "grad_norm": 0.507103510913949,
      "learning_rate": 5e-06,
      "loss": 0.6583,
      "step": 270
    },
    {
      "epoch": 0.6686567164179105,
      "grad_norm": 0.48478059033960125,
      "learning_rate": 5e-06,
      "loss": 0.6587,
      "step": 280
    },
    {
      "epoch": 0.6925373134328359,
      "grad_norm": 0.5180513023339883,
      "learning_rate": 5e-06,
      "loss": 0.6557,
      "step": 290
    },
    {
      "epoch": 0.7164179104477612,
      "grad_norm": 0.6977133509211488,
      "learning_rate": 5e-06,
      "loss": 0.6596,
      "step": 300
    },
    {
      "epoch": 0.7402985074626866,
      "grad_norm": 0.6751977610973682,
      "learning_rate": 5e-06,
      "loss": 0.6631,
      "step": 310
    },
    {
      "epoch": 0.764179104477612,
      "grad_norm": 0.5497977666993439,
      "learning_rate": 5e-06,
      "loss": 0.6538,
      "step": 320
    },
    {
      "epoch": 0.7880597014925373,
      "grad_norm": 0.4930749400299794,
      "learning_rate": 5e-06,
      "loss": 0.6533,
      "step": 330
    },
    {
      "epoch": 0.8119402985074626,
      "grad_norm": 0.42531288506677356,
      "learning_rate": 5e-06,
      "loss": 0.654,
      "step": 340
    },
    {
      "epoch": 0.835820895522388,
      "grad_norm": 0.3981313944033897,
      "learning_rate": 5e-06,
      "loss": 0.6588,
      "step": 350
    },
    {
      "epoch": 0.8597014925373134,
      "grad_norm": 0.5280916740215565,
      "learning_rate": 5e-06,
      "loss": 0.6467,
      "step": 360
    },
    {
      "epoch": 0.8835820895522388,
      "grad_norm": 0.42664364036263097,
      "learning_rate": 5e-06,
      "loss": 0.6547,
      "step": 370
    },
    {
      "epoch": 0.9074626865671642,
      "grad_norm": 0.4407169068422887,
      "learning_rate": 5e-06,
      "loss": 0.6576,
      "step": 380
    },
    {
      "epoch": 0.9313432835820895,
      "grad_norm": 0.5863915466432706,
      "learning_rate": 5e-06,
      "loss": 0.6517,
      "step": 390
    },
    {
      "epoch": 0.9552238805970149,
      "grad_norm": 0.591205757573771,
      "learning_rate": 5e-06,
      "loss": 0.6489,
      "step": 400
    },
    {
      "epoch": 0.9791044776119403,
      "grad_norm": 0.4729812620066271,
      "learning_rate": 5e-06,
      "loss": 0.6443,
      "step": 410
    },
    {
      "epoch": 0.9982089552238806,
      "eval_loss": 0.6459853649139404,
      "eval_runtime": 145.5401,
      "eval_samples_per_second": 77.532,
      "eval_steps_per_second": 0.612,
      "step": 418
    },
    {
      "epoch": 1.0029850746268656,
      "grad_norm": 0.7406302510482085,
      "learning_rate": 5e-06,
      "loss": 0.6464,
      "step": 420
    },
    {
      "epoch": 1.026865671641791,
      "grad_norm": 0.5814842349080193,
      "learning_rate": 5e-06,
      "loss": 0.6064,
      "step": 430
    },
    {
      "epoch": 1.0507462686567164,
      "grad_norm": 0.49550119932649633,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 440
    },
    {
      "epoch": 1.0746268656716418,
      "grad_norm": 0.5636659335114863,
      "learning_rate": 5e-06,
      "loss": 0.6113,
      "step": 450
    },
    {
      "epoch": 1.0985074626865672,
      "grad_norm": 0.5990578133097235,
      "learning_rate": 5e-06,
      "loss": 0.6025,
      "step": 460
    },
    {
      "epoch": 1.1223880597014926,
      "grad_norm": 0.5395914783918955,
      "learning_rate": 5e-06,
      "loss": 0.6096,
      "step": 470
    },
    {
      "epoch": 1.146268656716418,
      "grad_norm": 0.5134850748551497,
      "learning_rate": 5e-06,
      "loss": 0.6127,
      "step": 480
    },
    {
      "epoch": 1.1701492537313434,
      "grad_norm": 0.6291645545602457,
      "learning_rate": 5e-06,
      "loss": 0.6016,
      "step": 490
    },
    {
      "epoch": 1.1940298507462686,
      "grad_norm": 0.5896587056027497,
      "learning_rate": 5e-06,
      "loss": 0.6078,
      "step": 500
    },
    {
      "epoch": 1.217910447761194,
      "grad_norm": 0.44004057090652665,
      "learning_rate": 5e-06,
      "loss": 0.6028,
      "step": 510
    },
    {
      "epoch": 1.2417910447761193,
      "grad_norm": 0.6082489861086471,
      "learning_rate": 5e-06,
      "loss": 0.6057,
      "step": 520
    },
    {
      "epoch": 1.2656716417910447,
      "grad_norm": 0.4757708288590624,
      "learning_rate": 5e-06,
      "loss": 0.6128,
      "step": 530
    },
    {
      "epoch": 1.2895522388059701,
      "grad_norm": 0.4866590721782206,
      "learning_rate": 5e-06,
      "loss": 0.6133,
      "step": 540
    },
    {
      "epoch": 1.3134328358208955,
      "grad_norm": 0.4646472106430823,
      "learning_rate": 5e-06,
      "loss": 0.6089,
      "step": 550
    },
    {
      "epoch": 1.337313432835821,
      "grad_norm": 0.44517915287397264,
      "learning_rate": 5e-06,
      "loss": 0.6079,
      "step": 560
    },
    {
      "epoch": 1.3611940298507463,
      "grad_norm": 0.6694478573734268,
      "learning_rate": 5e-06,
      "loss": 0.6069,
      "step": 570
    },
    {
      "epoch": 1.3850746268656717,
      "grad_norm": 0.5078303398930042,
      "learning_rate": 5e-06,
      "loss": 0.6069,
      "step": 580
    },
    {
      "epoch": 1.408955223880597,
      "grad_norm": 0.503001629794063,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 590
    },
    {
      "epoch": 1.4328358208955223,
      "grad_norm": 0.5438514985018119,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 600
    },
    {
      "epoch": 1.4567164179104477,
      "grad_norm": 0.5056414273287846,
      "learning_rate": 5e-06,
      "loss": 0.6031,
      "step": 610
    },
    {
      "epoch": 1.480597014925373,
      "grad_norm": 0.4588677461940861,
      "learning_rate": 5e-06,
      "loss": 0.6126,
      "step": 620
    },
    {
      "epoch": 1.5044776119402985,
      "grad_norm": 0.5151479126151606,
      "learning_rate": 5e-06,
      "loss": 0.608,
      "step": 630
    },
    {
      "epoch": 1.528358208955224,
      "grad_norm": 0.46401616793533473,
      "learning_rate": 5e-06,
      "loss": 0.6031,
      "step": 640
    },
    {
      "epoch": 1.5522388059701493,
      "grad_norm": 0.48451525361860803,
      "learning_rate": 5e-06,
      "loss": 0.605,
      "step": 650
    },
    {
      "epoch": 1.5761194029850745,
      "grad_norm": 0.47997260631095534,
      "learning_rate": 5e-06,
      "loss": 0.6109,
      "step": 660
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.4897371630270623,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 670
    },
    {
      "epoch": 1.6238805970149253,
      "grad_norm": 0.5654442922228684,
      "learning_rate": 5e-06,
      "loss": 0.6123,
      "step": 680
    },
    {
      "epoch": 1.6477611940298509,
      "grad_norm": 0.5784991723951006,
      "learning_rate": 5e-06,
      "loss": 0.5942,
      "step": 690
    },
    {
      "epoch": 1.671641791044776,
      "grad_norm": 0.4722303828551024,
      "learning_rate": 5e-06,
      "loss": 0.6106,
      "step": 700
    },
    {
      "epoch": 1.6955223880597015,
      "grad_norm": 0.4839424076756933,
      "learning_rate": 5e-06,
      "loss": 0.6091,
      "step": 710
    },
    {
      "epoch": 1.7194029850746269,
      "grad_norm": 0.47764784827023116,
      "learning_rate": 5e-06,
      "loss": 0.6123,
      "step": 720
    },
    {
      "epoch": 1.7432835820895523,
      "grad_norm": 0.6621875079490905,
      "learning_rate": 5e-06,
      "loss": 0.6019,
      "step": 730
    },
    {
      "epoch": 1.7671641791044777,
      "grad_norm": 0.41773352573318373,
      "learning_rate": 5e-06,
      "loss": 0.6012,
      "step": 740
    },
    {
      "epoch": 1.7910447761194028,
      "grad_norm": 0.5339311769344796,
      "learning_rate": 5e-06,
      "loss": 0.6088,
      "step": 750
    },
    {
      "epoch": 1.8149253731343284,
      "grad_norm": 0.41758051779371724,
      "learning_rate": 5e-06,
      "loss": 0.602,
      "step": 760
    },
    {
      "epoch": 1.8388059701492536,
      "grad_norm": 0.48493110488320246,
      "learning_rate": 5e-06,
      "loss": 0.609,
      "step": 770
    },
    {
      "epoch": 1.8626865671641792,
      "grad_norm": 0.503079485131092,
      "learning_rate": 5e-06,
      "loss": 0.6108,
      "step": 780
    },
    {
      "epoch": 1.8865671641791044,
      "grad_norm": 0.49999802261208487,
      "learning_rate": 5e-06,
      "loss": 0.6059,
      "step": 790
    },
    {
      "epoch": 1.9104477611940298,
      "grad_norm": 0.49289621093364544,
      "learning_rate": 5e-06,
      "loss": 0.6071,
      "step": 800
    },
    {
      "epoch": 1.9343283582089552,
      "grad_norm": 0.5213797280676552,
      "learning_rate": 5e-06,
      "loss": 0.6006,
      "step": 810
    },
    {
      "epoch": 1.9582089552238806,
      "grad_norm": 0.48494536428497287,
      "learning_rate": 5e-06,
      "loss": 0.606,
      "step": 820
    },
    {
      "epoch": 1.982089552238806,
      "grad_norm": 0.6304797916900879,
      "learning_rate": 5e-06,
      "loss": 0.6046,
      "step": 830
    },
    {
      "epoch": 1.9988059701492538,
      "eval_loss": 0.6354221701622009,
      "eval_runtime": 145.6955,
      "eval_samples_per_second": 77.449,
      "eval_steps_per_second": 0.611,
      "step": 837
    },
    {
      "epoch": 2.005970149253731,
      "grad_norm": 0.7544412739259527,
      "learning_rate": 5e-06,
      "loss": 0.6034,
      "step": 840
    },
    {
      "epoch": 2.029850746268657,
      "grad_norm": 0.6076312693154532,
      "learning_rate": 5e-06,
      "loss": 0.5647,
      "step": 850
    },
    {
      "epoch": 2.053731343283582,
      "grad_norm": 0.5264338967840297,
      "learning_rate": 5e-06,
      "loss": 0.5566,
      "step": 860
    },
    {
      "epoch": 2.0776119402985076,
      "grad_norm": 0.5190037883143301,
      "learning_rate": 5e-06,
      "loss": 0.5679,
      "step": 870
    },
    {
      "epoch": 2.1014925373134328,
      "grad_norm": 0.46674031058869914,
      "learning_rate": 5e-06,
      "loss": 0.5611,
      "step": 880
    },
    {
      "epoch": 2.1253731343283584,
      "grad_norm": 0.5072546488653424,
      "learning_rate": 5e-06,
      "loss": 0.565,
      "step": 890
    },
    {
      "epoch": 2.1492537313432836,
      "grad_norm": 0.6009874453432602,
      "learning_rate": 5e-06,
      "loss": 0.5611,
      "step": 900
    },
    {
      "epoch": 2.173134328358209,
      "grad_norm": 0.46110082010928216,
      "learning_rate": 5e-06,
      "loss": 0.5618,
      "step": 910
    },
    {
      "epoch": 2.1970149253731344,
      "grad_norm": 0.5163994956633559,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 920
    },
    {
      "epoch": 2.2208955223880595,
      "grad_norm": 0.6176077747391998,
      "learning_rate": 5e-06,
      "loss": 0.5698,
      "step": 930
    },
    {
      "epoch": 2.244776119402985,
      "grad_norm": 0.46798047694883976,
      "learning_rate": 5e-06,
      "loss": 0.5607,
      "step": 940
    },
    {
      "epoch": 2.2686567164179103,
      "grad_norm": 0.5458770025747796,
      "learning_rate": 5e-06,
      "loss": 0.5657,
      "step": 950
    },
    {
      "epoch": 2.292537313432836,
      "grad_norm": 0.4522021510300349,
      "learning_rate": 5e-06,
      "loss": 0.5582,
      "step": 960
    },
    {
      "epoch": 2.316417910447761,
      "grad_norm": 0.4993495813706899,
      "learning_rate": 5e-06,
      "loss": 0.5673,
      "step": 970
    },
    {
      "epoch": 2.3402985074626868,
      "grad_norm": 0.5148727234502507,
      "learning_rate": 5e-06,
      "loss": 0.5627,
      "step": 980
    },
    {
      "epoch": 2.364179104477612,
      "grad_norm": 0.4746669105759272,
      "learning_rate": 5e-06,
      "loss": 0.5632,
      "step": 990
    },
    {
      "epoch": 2.388059701492537,
      "grad_norm": 0.49984909528425026,
      "learning_rate": 5e-06,
      "loss": 0.5683,
      "step": 1000
    },
    {
      "epoch": 2.4119402985074627,
      "grad_norm": 0.44986277151799875,
      "learning_rate": 5e-06,
      "loss": 0.5616,
      "step": 1010
    },
    {
      "epoch": 2.435820895522388,
      "grad_norm": 0.48588485615374555,
      "learning_rate": 5e-06,
      "loss": 0.5607,
      "step": 1020
    },
    {
      "epoch": 2.4597014925373135,
      "grad_norm": 0.48231851620423527,
      "learning_rate": 5e-06,
      "loss": 0.564,
      "step": 1030
    },
    {
      "epoch": 2.4835820895522387,
      "grad_norm": 0.5353905043801261,
      "learning_rate": 5e-06,
      "loss": 0.572,
      "step": 1040
    },
    {
      "epoch": 2.5074626865671643,
      "grad_norm": 0.4911575383473398,
      "learning_rate": 5e-06,
      "loss": 0.561,
      "step": 1050
    },
    {
      "epoch": 2.5313432835820895,
      "grad_norm": 0.5322320223748976,
      "learning_rate": 5e-06,
      "loss": 0.5702,
      "step": 1060
    },
    {
      "epoch": 2.5552238805970147,
      "grad_norm": 0.47802103841875787,
      "learning_rate": 5e-06,
      "loss": 0.5648,
      "step": 1070
    },
    {
      "epoch": 2.5791044776119403,
      "grad_norm": 0.5461878411480141,
      "learning_rate": 5e-06,
      "loss": 0.5699,
      "step": 1080
    },
    {
      "epoch": 2.602985074626866,
      "grad_norm": 0.5723521540745492,
      "learning_rate": 5e-06,
      "loss": 0.5687,
      "step": 1090
    },
    {
      "epoch": 2.626865671641791,
      "grad_norm": 0.43486752386685085,
      "learning_rate": 5e-06,
      "loss": 0.5647,
      "step": 1100
    },
    {
      "epoch": 2.6507462686567163,
      "grad_norm": 0.4851163868024419,
      "learning_rate": 5e-06,
      "loss": 0.5628,
      "step": 1110
    },
    {
      "epoch": 2.674626865671642,
      "grad_norm": 0.4819830185138904,
      "learning_rate": 5e-06,
      "loss": 0.5672,
      "step": 1120
    },
    {
      "epoch": 2.698507462686567,
      "grad_norm": 0.4782708772767338,
      "learning_rate": 5e-06,
      "loss": 0.5715,
      "step": 1130
    },
    {
      "epoch": 2.7223880597014927,
      "grad_norm": 0.4564853695633748,
      "learning_rate": 5e-06,
      "loss": 0.5664,
      "step": 1140
    },
    {
      "epoch": 2.746268656716418,
      "grad_norm": 0.5672636342724084,
      "learning_rate": 5e-06,
      "loss": 0.5703,
      "step": 1150
    },
    {
      "epoch": 2.7701492537313435,
      "grad_norm": 0.46200475373217254,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 1160
    },
    {
      "epoch": 2.7940298507462686,
      "grad_norm": 0.439779919962293,
      "learning_rate": 5e-06,
      "loss": 0.5617,
      "step": 1170
    },
    {
      "epoch": 2.817910447761194,
      "grad_norm": 0.48759282182632596,
      "learning_rate": 5e-06,
      "loss": 0.5632,
      "step": 1180
    },
    {
      "epoch": 2.8417910447761194,
      "grad_norm": 0.5417263063839436,
      "learning_rate": 5e-06,
      "loss": 0.5659,
      "step": 1190
    },
    {
      "epoch": 2.8656716417910446,
      "grad_norm": 0.4780631102145764,
      "learning_rate": 5e-06,
      "loss": 0.5677,
      "step": 1200
    },
    {
      "epoch": 2.8895522388059702,
      "grad_norm": 0.500379213535652,
      "learning_rate": 5e-06,
      "loss": 0.5756,
      "step": 1210
    },
    {
      "epoch": 2.9134328358208954,
      "grad_norm": 0.5402151963225374,
      "learning_rate": 5e-06,
      "loss": 0.5686,
      "step": 1220
    },
    {
      "epoch": 2.937313432835821,
      "grad_norm": 0.4522864472397569,
      "learning_rate": 5e-06,
      "loss": 0.5734,
      "step": 1230
    },
    {
      "epoch": 2.961194029850746,
      "grad_norm": 0.4647996850215528,
      "learning_rate": 5e-06,
      "loss": 0.5699,
      "step": 1240
    },
    {
      "epoch": 2.9850746268656714,
      "grad_norm": 0.4918905334543915,
      "learning_rate": 5e-06,
      "loss": 0.5716,
      "step": 1250
    },
    {
      "epoch": 2.9946268656716417,
      "eval_loss": 0.636893630027771,
      "eval_runtime": 143.3398,
      "eval_samples_per_second": 78.722,
      "eval_steps_per_second": 0.621,
      "step": 1254
    },
    {
      "epoch": 2.9946268656716417,
      "step": 1254,
      "total_flos": 2100077946470400.0,
      "train_loss": 0.6183488205479283,
      "train_runtime": 21046.2129,
      "train_samples_per_second": 30.56,
      "train_steps_per_second": 0.06
    }
  ],
  "logging_steps": 10,
  "max_steps": 1254,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2100077946470400.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}