|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 100, |
|
"global_step": 4233, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01771793054571226, |
|
"grad_norm": 0.24673239886760712, |
|
"learning_rate": 0.0001, |
|
"loss": 3.8313, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03543586109142452, |
|
"grad_norm": 0.23432838916778564, |
|
"learning_rate": 0.0001, |
|
"loss": 3.6164, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05315379163713678, |
|
"grad_norm": 0.4262351393699646, |
|
"learning_rate": 0.0001, |
|
"loss": 3.4113, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.07087172218284904, |
|
"grad_norm": 0.329630047082901, |
|
"learning_rate": 0.0001, |
|
"loss": 3.1874, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07087172218284904, |
|
"eval_loss": 3.0117650032043457, |
|
"eval_runtime": 31.2497, |
|
"eval_samples_per_second": 9.536, |
|
"eval_steps_per_second": 2.4, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0885896527285613, |
|
"grad_norm": 0.39532458782196045, |
|
"learning_rate": 0.0001, |
|
"loss": 3.0928, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.10630758327427356, |
|
"grad_norm": 0.38724836707115173, |
|
"learning_rate": 0.0001, |
|
"loss": 2.8781, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12402551381998582, |
|
"grad_norm": 0.2721017897129059, |
|
"learning_rate": 0.0001, |
|
"loss": 2.9166, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.14174344436569808, |
|
"grad_norm": 0.40747421979904175, |
|
"learning_rate": 0.0001, |
|
"loss": 2.8756, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14174344436569808, |
|
"eval_loss": 2.822810411453247, |
|
"eval_runtime": 32.4779, |
|
"eval_samples_per_second": 9.175, |
|
"eval_steps_per_second": 2.309, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15946137491141035, |
|
"grad_norm": 0.4922279119491577, |
|
"learning_rate": 0.0001, |
|
"loss": 2.9015, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.1771793054571226, |
|
"grad_norm": 0.3577287197113037, |
|
"learning_rate": 0.0001, |
|
"loss": 2.8639, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.19489723600283487, |
|
"grad_norm": 0.26928216218948364, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6776, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.21261516654854712, |
|
"grad_norm": 0.3623720109462738, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7134, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.21261516654854712, |
|
"eval_loss": 2.735769271850586, |
|
"eval_runtime": 32.6152, |
|
"eval_samples_per_second": 9.137, |
|
"eval_steps_per_second": 2.3, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2303330970942594, |
|
"grad_norm": 0.48846569657325745, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7118, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.24805102763997164, |
|
"grad_norm": 0.555242657661438, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7072, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2657689581856839, |
|
"grad_norm": 0.3421138823032379, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7114, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.28348688873139616, |
|
"grad_norm": 0.3518970012664795, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6948, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.28348688873139616, |
|
"eval_loss": 2.6832730770111084, |
|
"eval_runtime": 32.6428, |
|
"eval_samples_per_second": 9.129, |
|
"eval_steps_per_second": 2.298, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.30120481927710846, |
|
"grad_norm": 0.7523891925811768, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7691, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3189227498228207, |
|
"grad_norm": 0.36019715666770935, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7116, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.33664068036853295, |
|
"grad_norm": 0.47855111956596375, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5935, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.3543586109142452, |
|
"grad_norm": 0.3621235489845276, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6386, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3543586109142452, |
|
"eval_loss": 2.6440742015838623, |
|
"eval_runtime": 32.6151, |
|
"eval_samples_per_second": 9.137, |
|
"eval_steps_per_second": 2.3, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3720765414599575, |
|
"grad_norm": 0.6827765703201294, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6515, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.38979447200566975, |
|
"grad_norm": 0.5584849119186401, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7017, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.407512402551382, |
|
"grad_norm": 0.42108777165412903, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6605, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.42523033309709424, |
|
"grad_norm": 0.4811398684978485, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6525, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.42523033309709424, |
|
"eval_loss": 2.6149747371673584, |
|
"eval_runtime": 32.6256, |
|
"eval_samples_per_second": 9.134, |
|
"eval_steps_per_second": 2.299, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.44294826364280654, |
|
"grad_norm": 0.3859870731830597, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6805, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.4606661941885188, |
|
"grad_norm": 0.5074867010116577, |
|
"learning_rate": 0.0001, |
|
"loss": 2.66, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.47838412473423103, |
|
"grad_norm": 0.4459812343120575, |
|
"learning_rate": 0.0001, |
|
"loss": 2.645, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.4961020552799433, |
|
"grad_norm": 0.3960532248020172, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6242, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4961020552799433, |
|
"eval_loss": 2.5855672359466553, |
|
"eval_runtime": 32.5667, |
|
"eval_samples_per_second": 9.15, |
|
"eval_steps_per_second": 2.303, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5138199858256556, |
|
"grad_norm": 0.28771647810935974, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6985, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.5315379163713678, |
|
"grad_norm": 0.3489636182785034, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7126, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5492558469170801, |
|
"grad_norm": 0.5809263586997986, |
|
"learning_rate": 0.0001, |
|
"loss": 2.558, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5669737774627923, |
|
"grad_norm": 0.5202405452728271, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6444, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5669737774627923, |
|
"eval_loss": 2.5701286792755127, |
|
"eval_runtime": 32.5138, |
|
"eval_samples_per_second": 9.165, |
|
"eval_steps_per_second": 2.307, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5846917080085046, |
|
"grad_norm": 0.4100205600261688, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5452, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 0.494243323802948, |
|
"learning_rate": 0.0001, |
|
"loss": 2.566, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.6201275690999292, |
|
"grad_norm": 0.41979244351387024, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6271, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.6378454996456414, |
|
"grad_norm": 0.4423984885215759, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6007, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6378454996456414, |
|
"eval_loss": 2.554037094116211, |
|
"eval_runtime": 32.562, |
|
"eval_samples_per_second": 9.152, |
|
"eval_steps_per_second": 2.303, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6555634301913537, |
|
"grad_norm": 0.49124300479888916, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6006, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.6732813607370659, |
|
"grad_norm": 0.4179531931877136, |
|
"learning_rate": 0.0001, |
|
"loss": 2.634, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6909992912827781, |
|
"grad_norm": 0.35822227597236633, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5601, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7087172218284904, |
|
"grad_norm": 0.44015607237815857, |
|
"learning_rate": 0.0001, |
|
"loss": 2.462, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7087172218284904, |
|
"eval_loss": 2.5418050289154053, |
|
"eval_runtime": 32.5949, |
|
"eval_samples_per_second": 9.143, |
|
"eval_steps_per_second": 2.301, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7264351523742026, |
|
"grad_norm": 0.3899957835674286, |
|
"learning_rate": 0.0001, |
|
"loss": 2.538, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.744153082919915, |
|
"grad_norm": 0.5775083303451538, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5553, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7618710134656272, |
|
"grad_norm": 0.6485064625740051, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5878, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.7795889440113395, |
|
"grad_norm": 0.5756754875183105, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5641, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7795889440113395, |
|
"eval_loss": 2.5314512252807617, |
|
"eval_runtime": 32.673, |
|
"eval_samples_per_second": 9.121, |
|
"eval_steps_per_second": 2.295, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7973068745570517, |
|
"grad_norm": 0.46359655261039734, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5062, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.815024805102764, |
|
"grad_norm": 0.6569529175758362, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5921, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.8327427356484762, |
|
"grad_norm": 0.5323280096054077, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4839, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.8504606661941885, |
|
"grad_norm": 0.46269166469573975, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4672, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8504606661941885, |
|
"eval_loss": 2.5238418579101562, |
|
"eval_runtime": 32.6451, |
|
"eval_samples_per_second": 9.128, |
|
"eval_steps_per_second": 2.297, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8681785967399008, |
|
"grad_norm": 0.4144786596298218, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5174, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.8858965272856131, |
|
"grad_norm": 0.6366602182388306, |
|
"learning_rate": 0.0001, |
|
"loss": 2.512, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.9036144578313253, |
|
"grad_norm": 0.490589439868927, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5243, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.9213323883770376, |
|
"grad_norm": 0.5880224108695984, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5017, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9213323883770376, |
|
"eval_loss": 2.51455020904541, |
|
"eval_runtime": 32.6422, |
|
"eval_samples_per_second": 9.129, |
|
"eval_steps_per_second": 2.298, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9390503189227498, |
|
"grad_norm": 0.5042569637298584, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5238, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.9567682494684621, |
|
"grad_norm": 0.3827306032180786, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5785, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9744861800141743, |
|
"grad_norm": 0.4069231450557709, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5312, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.9922041105598866, |
|
"grad_norm": 0.5515998005867004, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6389, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9922041105598866, |
|
"eval_loss": 2.5082926750183105, |
|
"eval_runtime": 32.6146, |
|
"eval_samples_per_second": 9.137, |
|
"eval_steps_per_second": 2.3, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.009922041105599, |
|
"grad_norm": 0.34569019079208374, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4996, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.0276399716513112, |
|
"grad_norm": 0.4051329493522644, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4935, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.0453579021970234, |
|
"grad_norm": 0.5617458820343018, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5528, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.0630758327427356, |
|
"grad_norm": 0.7300685048103333, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4869, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0630758327427356, |
|
"eval_loss": 2.502122640609741, |
|
"eval_runtime": 32.6521, |
|
"eval_samples_per_second": 9.127, |
|
"eval_steps_per_second": 2.297, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.080793763288448, |
|
"grad_norm": 0.3424386978149414, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5036, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.0985116938341601, |
|
"grad_norm": 0.48358920216560364, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4546, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.1162296243798724, |
|
"grad_norm": 0.620297372341156, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4505, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.1339475549255846, |
|
"grad_norm": 0.41402992606163025, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5302, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.1339475549255846, |
|
"eval_loss": 2.4941623210906982, |
|
"eval_runtime": 32.6717, |
|
"eval_samples_per_second": 9.121, |
|
"eval_steps_per_second": 2.296, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.1516654854712969, |
|
"grad_norm": 0.6208804249763489, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5035, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.1693834160170091, |
|
"grad_norm": 0.3958500623703003, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6296, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.1871013465627214, |
|
"grad_norm": 0.5418444275856018, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5194, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.2048192771084336, |
|
"grad_norm": 0.7059912085533142, |
|
"learning_rate": 0.0001, |
|
"loss": 2.497, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.2048192771084336, |
|
"eval_loss": 2.4886298179626465, |
|
"eval_runtime": 32.6428, |
|
"eval_samples_per_second": 9.129, |
|
"eval_steps_per_second": 2.298, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.222537207654146, |
|
"grad_norm": 0.5447824001312256, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5098, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.2402551381998583, |
|
"grad_norm": 0.508121132850647, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4463, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.2579730687455706, |
|
"grad_norm": 0.5141641497612, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4407, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.2756909992912828, |
|
"grad_norm": 0.7895596623420715, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4965, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.2756909992912828, |
|
"eval_loss": 2.4845895767211914, |
|
"eval_runtime": 32.6353, |
|
"eval_samples_per_second": 9.131, |
|
"eval_steps_per_second": 2.298, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.293408929836995, |
|
"grad_norm": 0.33818376064300537, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3678, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.3111268603827073, |
|
"grad_norm": 0.36670777201652527, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4522, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.3288447909284196, |
|
"grad_norm": 0.46683812141418457, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5355, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.3465627214741318, |
|
"grad_norm": 0.45993196964263916, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5535, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.3465627214741318, |
|
"eval_loss": 2.4782519340515137, |
|
"eval_runtime": 32.6522, |
|
"eval_samples_per_second": 9.126, |
|
"eval_steps_per_second": 2.297, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.364280652019844, |
|
"grad_norm": 0.4003180265426636, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4353, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.3819985825655563, |
|
"grad_norm": 0.4715386629104614, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4086, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.3997165131112685, |
|
"grad_norm": 0.4463669955730438, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5299, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.4174344436569808, |
|
"grad_norm": 0.6194645762443542, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5747, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4174344436569808, |
|
"eval_loss": 2.4732108116149902, |
|
"eval_runtime": 32.5875, |
|
"eval_samples_per_second": 9.145, |
|
"eval_steps_per_second": 2.301, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4351523742026933, |
|
"grad_norm": 0.4712235927581787, |
|
"learning_rate": 0.0001, |
|
"loss": 2.433, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.4528703047484055, |
|
"grad_norm": 0.5619576573371887, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4883, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.44078579545021057, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5575, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.48830616583983, |
|
"grad_norm": 0.44347578287124634, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4534, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.48830616583983, |
|
"eval_loss": 2.467924118041992, |
|
"eval_runtime": 32.6469, |
|
"eval_samples_per_second": 9.128, |
|
"eval_steps_per_second": 2.297, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.5060240963855422, |
|
"grad_norm": 0.4806898832321167, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4605, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.5237420269312545, |
|
"grad_norm": 0.4710817337036133, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5124, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.5414599574769667, |
|
"grad_norm": 0.4697023630142212, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5251, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.559177888022679, |
|
"grad_norm": 0.4275937080383301, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4909, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.559177888022679, |
|
"eval_loss": 2.465703248977661, |
|
"eval_runtime": 32.66, |
|
"eval_samples_per_second": 9.124, |
|
"eval_steps_per_second": 2.296, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.5768958185683912, |
|
"grad_norm": 0.6271502375602722, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4563, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.5946137491141035, |
|
"grad_norm": 0.5484246015548706, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4781, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.6123316796598157, |
|
"grad_norm": 0.7340563535690308, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5204, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.630049610205528, |
|
"grad_norm": 0.43257761001586914, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5192, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.630049610205528, |
|
"eval_loss": 2.461730718612671, |
|
"eval_runtime": 32.5818, |
|
"eval_samples_per_second": 9.146, |
|
"eval_steps_per_second": 2.302, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.6477675407512402, |
|
"grad_norm": 0.7394423484802246, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5862, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.6654854712969525, |
|
"grad_norm": 0.48102429509162903, |
|
"learning_rate": 0.0001, |
|
"loss": 2.519, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.6832034018426647, |
|
"grad_norm": 0.5994846820831299, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4566, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.700921332388377, |
|
"grad_norm": 0.4805436134338379, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4271, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.700921332388377, |
|
"eval_loss": 2.457273006439209, |
|
"eval_runtime": 32.6806, |
|
"eval_samples_per_second": 9.119, |
|
"eval_steps_per_second": 2.295, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.7186392629340892, |
|
"grad_norm": 0.6208567023277283, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3581, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.7363571934798014, |
|
"grad_norm": 0.44081413745880127, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4295, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.7540751240255137, |
|
"grad_norm": 0.4629543721675873, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4569, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.771793054571226, |
|
"grad_norm": 0.518991231918335, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4855, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.771793054571226, |
|
"eval_loss": 2.454190731048584, |
|
"eval_runtime": 32.6137, |
|
"eval_samples_per_second": 9.137, |
|
"eval_steps_per_second": 2.3, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.7895109851169382, |
|
"grad_norm": 0.6166653037071228, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4761, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 1.8072289156626506, |
|
"grad_norm": 0.5490785241127014, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5232, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.824946846208363, |
|
"grad_norm": 0.6279402375221252, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4425, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 1.8426647767540751, |
|
"grad_norm": 0.606396496295929, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4599, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.8426647767540751, |
|
"eval_loss": 2.4530327320098877, |
|
"eval_runtime": 32.6381, |
|
"eval_samples_per_second": 9.13, |
|
"eval_steps_per_second": 2.298, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.8603827072997874, |
|
"grad_norm": 0.5355327129364014, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5615, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.8781006378454996, |
|
"grad_norm": 0.3971356451511383, |
|
"learning_rate": 0.0001, |
|
"loss": 2.445, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.8958185683912119, |
|
"grad_norm": 0.48701226711273193, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5405, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 1.9135364989369241, |
|
"grad_norm": 0.5117021203041077, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4482, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.9135364989369241, |
|
"eval_loss": 2.444391965866089, |
|
"eval_runtime": 32.6264, |
|
"eval_samples_per_second": 9.134, |
|
"eval_steps_per_second": 2.299, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.9312544294826366, |
|
"grad_norm": 0.40752479434013367, |
|
"learning_rate": 0.0001, |
|
"loss": 2.421, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 1.9489723600283488, |
|
"grad_norm": 0.4466327428817749, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4286, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.966690290574061, |
|
"grad_norm": 0.35452115535736084, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4265, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 1.9844082211197733, |
|
"grad_norm": 0.7978025674819946, |
|
"learning_rate": 0.0001, |
|
"loss": 2.493, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.9844082211197733, |
|
"eval_loss": 2.444624423980713, |
|
"eval_runtime": 32.6679, |
|
"eval_samples_per_second": 9.122, |
|
"eval_steps_per_second": 2.296, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.0021261516654856, |
|
"grad_norm": 0.46374988555908203, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5357, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 2.019844082211198, |
|
"grad_norm": 0.4631684422492981, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4015, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.03756201275691, |
|
"grad_norm": 0.4475260376930237, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3658, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 2.0552799433026223, |
|
"grad_norm": 0.47790655493736267, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3527, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.0552799433026223, |
|
"eval_loss": 2.441364049911499, |
|
"eval_runtime": 32.6599, |
|
"eval_samples_per_second": 9.124, |
|
"eval_steps_per_second": 2.296, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.0729978738483346, |
|
"grad_norm": 0.5602151155471802, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4763, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.090715804394047, |
|
"grad_norm": 0.37178730964660645, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4431, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.108433734939759, |
|
"grad_norm": 0.47269827127456665, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5528, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 2.1261516654854713, |
|
"grad_norm": 0.5636725425720215, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5243, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.1261516654854713, |
|
"eval_loss": 2.4375791549682617, |
|
"eval_runtime": 32.6693, |
|
"eval_samples_per_second": 9.122, |
|
"eval_steps_per_second": 2.296, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.1438695960311835, |
|
"grad_norm": 0.5602971315383911, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4726, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 2.161587526576896, |
|
"grad_norm": 0.7102957367897034, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4513, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.179305457122608, |
|
"grad_norm": 0.5028663277626038, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4038, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 2.1970233876683203, |
|
"grad_norm": 0.5358246564865112, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4644, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.1970233876683203, |
|
"eval_loss": 2.433030605316162, |
|
"eval_runtime": 32.6486, |
|
"eval_samples_per_second": 9.128, |
|
"eval_steps_per_second": 2.297, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.2147413182140325, |
|
"grad_norm": 0.5380859971046448, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4342, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 2.2324592487597448, |
|
"grad_norm": 0.8703417181968689, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4462, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.250177179305457, |
|
"grad_norm": 0.44465309381484985, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4428, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 2.2678951098511693, |
|
"grad_norm": 0.4541110396385193, |
|
"learning_rate": 0.0001, |
|
"loss": 2.386, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.2678951098511693, |
|
"eval_loss": 2.4308454990386963, |
|
"eval_runtime": 32.6678, |
|
"eval_samples_per_second": 9.122, |
|
"eval_steps_per_second": 2.296, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.2856130403968815, |
|
"grad_norm": 0.6527560949325562, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3964, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 2.3033309709425938, |
|
"grad_norm": 0.5541362762451172, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4817, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.321048901488306, |
|
"grad_norm": 0.5997689366340637, |
|
"learning_rate": 0.0001, |
|
"loss": 2.496, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 2.3387668320340183, |
|
"grad_norm": 0.5446316003799438, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3762, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.3387668320340183, |
|
"eval_loss": 2.428109645843506, |
|
"eval_runtime": 32.6878, |
|
"eval_samples_per_second": 9.117, |
|
"eval_steps_per_second": 2.294, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.3564847625797305, |
|
"grad_norm": 0.3934761881828308, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5195, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 2.3742026931254427, |
|
"grad_norm": 0.47348254919052124, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5055, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.3919206236711554, |
|
"grad_norm": 0.5292345881462097, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3635, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 2.4096385542168672, |
|
"grad_norm": 0.6056796312332153, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3827, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.4096385542168672, |
|
"eval_loss": 2.4244818687438965, |
|
"eval_runtime": 32.5864, |
|
"eval_samples_per_second": 9.145, |
|
"eval_steps_per_second": 2.302, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.42735648476258, |
|
"grad_norm": 0.2907162010669708, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3354, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 2.445074415308292, |
|
"grad_norm": 0.43741077184677124, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4199, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.4627923458540044, |
|
"grad_norm": 0.36141782999038696, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4165, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 2.4805102763997167, |
|
"grad_norm": 0.5461854338645935, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3487, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.4805102763997167, |
|
"eval_loss": 2.4221482276916504, |
|
"eval_runtime": 32.6681, |
|
"eval_samples_per_second": 9.122, |
|
"eval_steps_per_second": 2.296, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.498228206945429, |
|
"grad_norm": 0.61762934923172, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4136, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 2.515946137491141, |
|
"grad_norm": 0.41114169359207153, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3587, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.5336640680368534, |
|
"grad_norm": 0.5726279020309448, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5009, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 2.5513819985825656, |
|
"grad_norm": 0.4807787239551544, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4737, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.5513819985825656, |
|
"eval_loss": 2.4191701412200928, |
|
"eval_runtime": 32.6743, |
|
"eval_samples_per_second": 9.12, |
|
"eval_steps_per_second": 2.295, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.569099929128278, |
|
"grad_norm": 0.5931722521781921, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4178, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 2.58681785967399, |
|
"grad_norm": 0.4658395051956177, |
|
"learning_rate": 0.0001, |
|
"loss": 2.5162, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.6045357902197024, |
|
"grad_norm": 0.5829235315322876, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3402, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 2.6222537207654146, |
|
"grad_norm": 0.6382436156272888, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4907, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.6222537207654146, |
|
"eval_loss": 2.417147397994995, |
|
"eval_runtime": 32.6259, |
|
"eval_samples_per_second": 9.134, |
|
"eval_steps_per_second": 2.299, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.639971651311127, |
|
"grad_norm": 0.578823983669281, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4281, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 2.657689581856839, |
|
"grad_norm": 0.5311617255210876, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4315, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.6754075124025514, |
|
"grad_norm": 0.4713846743106842, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4268, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 2.6931254429482636, |
|
"grad_norm": 0.7472273111343384, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3967, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.6931254429482636, |
|
"eval_loss": 2.415893077850342, |
|
"eval_runtime": 32.6303, |
|
"eval_samples_per_second": 9.133, |
|
"eval_steps_per_second": 2.298, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.710843373493976, |
|
"grad_norm": 0.5875506401062012, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3541, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 2.728561304039688, |
|
"grad_norm": 0.38152602314949036, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4623, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.7462792345854004, |
|
"grad_norm": 0.35034802556037903, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4392, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 2.7639971651311126, |
|
"grad_norm": 0.3683781027793884, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4772, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.7639971651311126, |
|
"eval_loss": 2.414635181427002, |
|
"eval_runtime": 32.6546, |
|
"eval_samples_per_second": 9.126, |
|
"eval_steps_per_second": 2.297, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.781715095676825, |
|
"grad_norm": 0.632203221321106, |
|
"learning_rate": 0.0001, |
|
"loss": 2.48, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 2.799433026222537, |
|
"grad_norm": 0.4688514173030853, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4058, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.8171509567682493, |
|
"grad_norm": 0.3703823685646057, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3802, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 2.8348688873139616, |
|
"grad_norm": 0.4395906329154968, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4114, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.8348688873139616, |
|
"eval_loss": 2.4105727672576904, |
|
"eval_runtime": 32.6241, |
|
"eval_samples_per_second": 9.134, |
|
"eval_steps_per_second": 2.299, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.852586817859674, |
|
"grad_norm": 0.3975163996219635, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4189, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 2.8703047484053865, |
|
"grad_norm": 0.37457334995269775, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3946, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.8880226789510983, |
|
"grad_norm": 0.3786819279193878, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4683, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 2.905740609496811, |
|
"grad_norm": 0.633921205997467, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4017, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.905740609496811, |
|
"eval_loss": 2.406451463699341, |
|
"eval_runtime": 32.6727, |
|
"eval_samples_per_second": 9.121, |
|
"eval_steps_per_second": 2.295, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.923458540042523, |
|
"grad_norm": 0.9005404710769653, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4099, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.5802463293075562, |
|
"learning_rate": 0.0001, |
|
"loss": 2.4068, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.9588944011339473, |
|
"grad_norm": 0.3155713975429535, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3682, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 2.97661233167966, |
|
"grad_norm": 0.4876560568809509, |
|
"learning_rate": 0.0001, |
|
"loss": 2.3477, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.97661233167966, |
|
"eval_loss": 2.405850648880005, |
|
"eval_runtime": 32.6084, |
|
"eval_samples_per_second": 9.139, |
|
"eval_steps_per_second": 2.3, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.9943302622253722, |
|
"grad_norm": 0.49624720215797424, |
|
"learning_rate": 0.0001, |
|
"loss": 2.445, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 4233, |
|
"total_flos": 9152314711474176.0, |
|
"train_loss": 2.5398848939386913, |
|
"train_runtime": 5263.4986, |
|
"train_samples_per_second": 3.217, |
|
"train_steps_per_second": 0.804 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 4233, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9152314711474176.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|