|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.4140012070006036,
  "eval_steps": 500,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.02, "grad_norm": 53.90464782714844, "learning_rate": 9.946323134728933e-06, "loss": 5.2914, "step": 10 },
    { "epoch": 0.03, "grad_norm": 53.05311965942383, "learning_rate": 9.892646269457864e-06, "loss": 4.9539, "step": 20 },
    { "epoch": 0.05, "grad_norm": 64.5956802368164, "learning_rate": 9.838969404186796e-06, "loss": 4.7374, "step": 30 },
    { "epoch": 0.06, "grad_norm": 54.43781280517578, "learning_rate": 9.785292538915728e-06, "loss": 4.5988, "step": 40 },
    { "epoch": 0.08, "grad_norm": 48.784854888916016, "learning_rate": 9.73161567364466e-06, "loss": 4.4899, "step": 50 },
    { "epoch": 0.1, "grad_norm": 56.800575256347656, "learning_rate": 9.677938808373591e-06, "loss": 4.4391, "step": 60 },
    { "epoch": 0.11, "grad_norm": 51.58082962036133, "learning_rate": 9.624261943102525e-06, "loss": 4.3347, "step": 70 },
    { "epoch": 0.13, "grad_norm": 50.625308990478516, "learning_rate": 9.570585077831455e-06, "loss": 4.3145, "step": 80 },
    { "epoch": 0.14, "grad_norm": 46.56022262573242, "learning_rate": 9.516908212560388e-06, "loss": 4.1742, "step": 90 },
    { "epoch": 0.16, "grad_norm": 51.267852783203125, "learning_rate": 9.463231347289318e-06, "loss": 4.1308, "step": 100 },
    { "epoch": 0.18, "grad_norm": 52.627098083496094, "learning_rate": 9.40955448201825e-06, "loss": 4.1251, "step": 110 },
    { "epoch": 0.19, "grad_norm": 51.49211120605469, "learning_rate": 9.355877616747183e-06, "loss": 4.073, "step": 120 },
    { "epoch": 0.21, "grad_norm": 50.88691329956055, "learning_rate": 9.302200751476115e-06, "loss": 3.9998, "step": 130 },
    { "epoch": 0.23, "grad_norm": 45.862796783447266, "learning_rate": 9.248523886205046e-06, "loss": 3.8666, "step": 140 },
    { "epoch": 0.24, "grad_norm": 51.27406692504883, "learning_rate": 9.194847020933978e-06, "loss": 3.9477, "step": 150 },
    { "epoch": 0.26, "grad_norm": 47.50687026977539, "learning_rate": 9.14117015566291e-06, "loss": 3.7899, "step": 160 },
    { "epoch": 0.27, "grad_norm": 48.58837127685547, "learning_rate": 9.087493290391842e-06, "loss": 3.7295, "step": 170 },
    { "epoch": 0.29, "grad_norm": 48.3990592956543, "learning_rate": 9.033816425120775e-06, "loss": 3.7023, "step": 180 },
    { "epoch": 0.31, "grad_norm": 46.30027770996094, "learning_rate": 8.980139559849705e-06, "loss": 3.6479, "step": 190 },
    { "epoch": 0.32, "grad_norm": 47.15484619140625, "learning_rate": 8.926462694578637e-06, "loss": 3.6161, "step": 200 },
    { "epoch": 0.34, "grad_norm": 49.22386932373047, "learning_rate": 8.87278582930757e-06, "loss": 3.5503, "step": 210 },
    { "epoch": 0.35, "grad_norm": 45.485557556152344, "learning_rate": 8.819108964036502e-06, "loss": 3.4459, "step": 220 },
    { "epoch": 0.37, "grad_norm": 50.84394454956055, "learning_rate": 8.765432098765432e-06, "loss": 3.4918, "step": 230 },
    { "epoch": 0.39, "grad_norm": 43.17815399169922, "learning_rate": 8.711755233494365e-06, "loss": 3.3527, "step": 240 },
    { "epoch": 0.4, "grad_norm": 41.90092849731445, "learning_rate": 8.658078368223295e-06, "loss": 3.347, "step": 250 },
    { "epoch": 0.42, "grad_norm": 43.23625564575195, "learning_rate": 8.60440150295223e-06, "loss": 3.3433, "step": 260 },
    { "epoch": 0.43, "grad_norm": 44.05680847167969, "learning_rate": 8.55072463768116e-06, "loss": 3.2831, "step": 270 },
    { "epoch": 0.45, "grad_norm": 46.9146842956543, "learning_rate": 8.497047772410092e-06, "loss": 3.2932, "step": 280 },
    { "epoch": 0.47, "grad_norm": 48.542701721191406, "learning_rate": 8.443370907139024e-06, "loss": 3.2108, "step": 290 },
    { "epoch": 0.48, "grad_norm": 39.1197509765625, "learning_rate": 8.389694041867955e-06, "loss": 3.1704, "step": 300 },
    { "epoch": 0.5, "grad_norm": 45.300697326660156, "learning_rate": 8.336017176596887e-06, "loss": 3.2239, "step": 310 },
    { "epoch": 0.51, "grad_norm": 47.439823150634766, "learning_rate": 8.28234031132582e-06, "loss": 3.1709, "step": 320 },
    { "epoch": 0.53, "grad_norm": 42.67869567871094, "learning_rate": 8.228663446054752e-06, "loss": 3.0708, "step": 330 },
    { "epoch": 0.55, "grad_norm": 46.0591926574707, "learning_rate": 8.174986580783682e-06, "loss": 3.0959, "step": 340 },
    { "epoch": 0.56, "grad_norm": 42.061805725097656, "learning_rate": 8.121309715512614e-06, "loss": 3.0442, "step": 350 },
    { "epoch": 0.58, "grad_norm": 44.487159729003906, "learning_rate": 8.067632850241547e-06, "loss": 2.8881, "step": 360 },
    { "epoch": 0.6, "grad_norm": 44.4419059753418, "learning_rate": 8.013955984970479e-06, "loss": 2.9994, "step": 370 },
    { "epoch": 0.61, "grad_norm": 40.00767135620117, "learning_rate": 7.96027911969941e-06, "loss": 2.8931, "step": 380 },
    { "epoch": 0.63, "grad_norm": 44.14014434814453, "learning_rate": 7.906602254428342e-06, "loss": 2.9288, "step": 390 },
    { "epoch": 0.64, "grad_norm": 42.418888092041016, "learning_rate": 7.852925389157274e-06, "loss": 2.7776, "step": 400 },
    { "epoch": 0.66, "grad_norm": 49.98628234863281, "learning_rate": 7.799248523886206e-06, "loss": 2.8408, "step": 410 },
    { "epoch": 0.68, "grad_norm": 39.240352630615234, "learning_rate": 7.745571658615137e-06, "loss": 2.7995, "step": 420 },
    { "epoch": 0.69, "grad_norm": 43.185569763183594, "learning_rate": 7.691894793344069e-06, "loss": 2.7486, "step": 430 },
    { "epoch": 0.71, "grad_norm": 37.41790008544922, "learning_rate": 7.638217928073001e-06, "loss": 2.7459, "step": 440 },
    { "epoch": 0.72, "grad_norm": 38.520973205566406, "learning_rate": 7.584541062801934e-06, "loss": 2.6785, "step": 450 },
    { "epoch": 0.74, "grad_norm": 42.01523971557617, "learning_rate": 7.530864197530865e-06, "loss": 2.7745, "step": 460 },
    { "epoch": 0.76, "grad_norm": 40.55296325683594, "learning_rate": 7.477187332259796e-06, "loss": 2.7046, "step": 470 },
    { "epoch": 0.77, "grad_norm": 38.31270217895508, "learning_rate": 7.423510466988728e-06, "loss": 2.6057, "step": 480 },
    { "epoch": 0.79, "grad_norm": 37.31733322143555, "learning_rate": 7.369833601717661e-06, "loss": 2.5229, "step": 490 },
    { "epoch": 0.8, "grad_norm": 37.55893325805664, "learning_rate": 7.316156736446592e-06, "loss": 2.5846, "step": 500 },
    { "epoch": 0.82, "grad_norm": 40.64212417602539, "learning_rate": 7.262479871175524e-06, "loss": 2.4992, "step": 510 },
    { "epoch": 0.84, "grad_norm": 41.34495544433594, "learning_rate": 7.208803005904456e-06, "loss": 2.5575, "step": 520 },
    { "epoch": 0.85, "grad_norm": 38.53882598876953, "learning_rate": 7.155126140633387e-06, "loss": 2.5468, "step": 530 },
    { "epoch": 0.87, "grad_norm": 39.23023986816406, "learning_rate": 7.10144927536232e-06, "loss": 2.5009, "step": 540 },
    { "epoch": 0.89, "grad_norm": 35.68672180175781, "learning_rate": 7.047772410091251e-06, "loss": 2.2929, "step": 550 },
    { "epoch": 0.9, "grad_norm": 38.0786247253418, "learning_rate": 6.994095544820183e-06, "loss": 2.3178, "step": 560 },
    { "epoch": 0.92, "grad_norm": 42.21394348144531, "learning_rate": 6.940418679549115e-06, "loss": 2.4645, "step": 570 },
    { "epoch": 0.93, "grad_norm": 37.197696685791016, "learning_rate": 6.886741814278046e-06, "loss": 2.4797, "step": 580 },
    { "epoch": 0.95, "grad_norm": 40.341514587402344, "learning_rate": 6.833064949006979e-06, "loss": 2.3048, "step": 590 },
    { "epoch": 0.97, "grad_norm": 36.51962661743164, "learning_rate": 6.779388083735911e-06, "loss": 2.4652, "step": 600 },
    { "epoch": 0.98, "grad_norm": 35.32337951660156, "learning_rate": 6.725711218464842e-06, "loss": 2.2979, "step": 610 },
    { "epoch": 1.0, "grad_norm": 40.75404739379883, "learning_rate": 6.672034353193773e-06, "loss": 2.354, "step": 620 },
    { "epoch": 1.01, "grad_norm": 30.498510360717773, "learning_rate": 6.6183574879227065e-06, "loss": 1.8592, "step": 630 },
    { "epoch": 1.03, "grad_norm": 31.13385772705078, "learning_rate": 6.564680622651638e-06, "loss": 1.7776, "step": 640 },
    { "epoch": 1.05, "grad_norm": 34.9401969909668, "learning_rate": 6.511003757380569e-06, "loss": 1.9329, "step": 650 },
    { "epoch": 1.06, "grad_norm": 32.90480041503906, "learning_rate": 6.457326892109501e-06, "loss": 1.8312, "step": 660 },
    { "epoch": 1.08, "grad_norm": 32.94902420043945, "learning_rate": 6.403650026838433e-06, "loss": 1.8556, "step": 670 },
    { "epoch": 1.09, "grad_norm": 32.02881622314453, "learning_rate": 6.349973161567365e-06, "loss": 1.8142, "step": 680 },
    { "epoch": 1.11, "grad_norm": 32.752323150634766, "learning_rate": 6.296296296296297e-06, "loss": 1.7429, "step": 690 },
    { "epoch": 1.13, "grad_norm": 31.938289642333984, "learning_rate": 6.242619431025229e-06, "loss": 1.821, "step": 700 },
    { "epoch": 1.14, "grad_norm": 32.64255142211914, "learning_rate": 6.18894256575416e-06, "loss": 1.7492, "step": 710 },
    { "epoch": 1.16, "grad_norm": 30.172483444213867, "learning_rate": 6.135265700483092e-06, "loss": 1.7661, "step": 720 },
    { "epoch": 1.17, "grad_norm": 32.1895637512207, "learning_rate": 6.081588835212025e-06, "loss": 1.6979, "step": 730 },
    { "epoch": 1.19, "grad_norm": 32.555870056152344, "learning_rate": 6.027911969940956e-06, "loss": 1.778, "step": 740 },
    { "epoch": 1.21, "grad_norm": 31.702539443969727, "learning_rate": 5.974235104669888e-06, "loss": 1.7263, "step": 750 },
    { "epoch": 1.22, "grad_norm": 32.07310104370117, "learning_rate": 5.920558239398819e-06, "loss": 1.5491, "step": 760 },
    { "epoch": 1.24, "grad_norm": 31.130224227905273, "learning_rate": 5.866881374127752e-06, "loss": 1.6249, "step": 770 },
    { "epoch": 1.26, "grad_norm": 39.838436126708984, "learning_rate": 5.8132045088566835e-06, "loss": 1.6721, "step": 780 },
    { "epoch": 1.27, "grad_norm": 33.75567626953125, "learning_rate": 5.759527643585615e-06, "loss": 1.6779, "step": 790 },
    { "epoch": 1.29, "grad_norm": 31.251935958862305, "learning_rate": 5.705850778314546e-06, "loss": 1.6401, "step": 800 },
    { "epoch": 1.3, "grad_norm": 31.644649505615234, "learning_rate": 5.652173913043479e-06, "loss": 1.6647, "step": 810 },
    { "epoch": 1.32, "grad_norm": 30.14424705505371, "learning_rate": 5.598497047772411e-06, "loss": 1.6887, "step": 820 },
    { "epoch": 1.34, "grad_norm": 29.70695686340332, "learning_rate": 5.544820182501342e-06, "loss": 1.5387, "step": 830 },
    { "epoch": 1.35, "grad_norm": 31.330068588256836, "learning_rate": 5.4911433172302745e-06, "loss": 1.6414, "step": 840 },
    { "epoch": 1.37, "grad_norm": 32.08658981323242, "learning_rate": 5.437466451959206e-06, "loss": 1.6158, "step": 850 },
    { "epoch": 1.38, "grad_norm": 33.42084503173828, "learning_rate": 5.383789586688137e-06, "loss": 1.6728, "step": 860 },
    { "epoch": 1.4, "grad_norm": 32.10792922973633, "learning_rate": 5.3301127214170704e-06, "loss": 1.5365, "step": 870 },
    { "epoch": 1.42, "grad_norm": 34.231239318847656, "learning_rate": 5.276435856146002e-06, "loss": 1.584, "step": 880 },
    { "epoch": 1.43, "grad_norm": 32.19587326049805, "learning_rate": 5.222758990874933e-06, "loss": 1.6233, "step": 890 },
    { "epoch": 1.45, "grad_norm": 30.36279296875, "learning_rate": 5.169082125603865e-06, "loss": 1.5246, "step": 900 },
    { "epoch": 1.46, "grad_norm": 33.34714889526367, "learning_rate": 5.115405260332798e-06, "loss": 1.5514, "step": 910 },
    { "epoch": 1.48, "grad_norm": 32.581424713134766, "learning_rate": 5.061728395061729e-06, "loss": 1.495, "step": 920 },
    { "epoch": 1.5, "grad_norm": 33.158203125, "learning_rate": 5.0080515297906606e-06, "loss": 1.5546, "step": 930 },
    { "epoch": 1.51, "grad_norm": 29.796606063842773, "learning_rate": 4.954374664519592e-06, "loss": 1.5141, "step": 940 },
    { "epoch": 1.53, "grad_norm": 31.936180114746094, "learning_rate": 4.900697799248524e-06, "loss": 1.5156, "step": 950 },
    { "epoch": 1.54, "grad_norm": 30.770095825195312, "learning_rate": 4.847020933977456e-06, "loss": 1.5262, "step": 960 },
    { "epoch": 1.56, "grad_norm": 32.497520446777344, "learning_rate": 4.793344068706388e-06, "loss": 1.5303, "step": 970 },
    { "epoch": 1.58, "grad_norm": 31.067218780517578, "learning_rate": 4.739667203435319e-06, "loss": 1.4994, "step": 980 },
    { "epoch": 1.59, "grad_norm": 27.720073699951172, "learning_rate": 4.6859903381642516e-06, "loss": 1.4268, "step": 990 },
    { "epoch": 1.61, "grad_norm": 30.310941696166992, "learning_rate": 4.632313472893184e-06, "loss": 1.4636, "step": 1000 },
    { "epoch": 1.63, "grad_norm": 33.62602996826172, "learning_rate": 4.578636607622115e-06, "loss": 1.4783, "step": 1010 },
    { "epoch": 1.64, "grad_norm": 28.9564266204834, "learning_rate": 4.5249597423510475e-06, "loss": 1.359, "step": 1020 },
    { "epoch": 1.66, "grad_norm": 29.886262893676758, "learning_rate": 4.471282877079979e-06, "loss": 1.4405, "step": 1030 },
    { "epoch": 1.67, "grad_norm": 26.291038513183594, "learning_rate": 4.417606011808911e-06, "loss": 1.3914, "step": 1040 },
    { "epoch": 1.69, "grad_norm": 30.628904342651367, "learning_rate": 4.3639291465378425e-06, "loss": 1.4335, "step": 1050 },
    { "epoch": 1.71, "grad_norm": 27.96939468383789, "learning_rate": 4.310252281266775e-06, "loss": 1.3577, "step": 1060 },
    { "epoch": 1.72, "grad_norm": 29.119224548339844, "learning_rate": 4.256575415995706e-06, "loss": 1.3808, "step": 1070 },
    { "epoch": 1.74, "grad_norm": 30.36097526550293, "learning_rate": 4.202898550724638e-06, "loss": 1.3545, "step": 1080 },
    { "epoch": 1.75, "grad_norm": 30.843242645263672, "learning_rate": 4.14922168545357e-06, "loss": 1.3751, "step": 1090 },
    { "epoch": 1.77, "grad_norm": 29.29217529296875, "learning_rate": 4.095544820182501e-06, "loss": 1.3649, "step": 1100 },
    { "epoch": 1.79, "grad_norm": 30.685625076293945, "learning_rate": 4.0418679549114335e-06, "loss": 1.4354, "step": 1110 },
    { "epoch": 1.8, "grad_norm": 26.101669311523438, "learning_rate": 3.988191089640365e-06, "loss": 1.3355, "step": 1120 },
    { "epoch": 1.82, "grad_norm": 29.12729835510254, "learning_rate": 3.934514224369297e-06, "loss": 1.3568, "step": 1130 },
    { "epoch": 1.83, "grad_norm": 27.82271957397461, "learning_rate": 3.880837359098229e-06, "loss": 1.3702, "step": 1140 },
    { "epoch": 1.85, "grad_norm": 26.432327270507812, "learning_rate": 3.827160493827161e-06, "loss": 1.3231, "step": 1150 },
    { "epoch": 1.87, "grad_norm": 30.632972717285156, "learning_rate": 3.7734836285560927e-06, "loss": 1.3283, "step": 1160 },
    { "epoch": 1.88, "grad_norm": 27.142309188842773, "learning_rate": 3.7198067632850245e-06, "loss": 1.3159, "step": 1170 },
    { "epoch": 1.9, "grad_norm": 27.63045310974121, "learning_rate": 3.6661298980139563e-06, "loss": 1.3777, "step": 1180 },
    { "epoch": 1.92, "grad_norm": 30.256242752075195, "learning_rate": 3.612453032742888e-06, "loss": 1.2845, "step": 1190 },
    { "epoch": 1.93, "grad_norm": 28.592174530029297, "learning_rate": 3.5587761674718204e-06, "loss": 1.3163, "step": 1200 },
    { "epoch": 1.95, "grad_norm": 29.088247299194336, "learning_rate": 3.505099302200752e-06, "loss": 1.3145, "step": 1210 },
    { "epoch": 1.96, "grad_norm": 27.801074981689453, "learning_rate": 3.4514224369296832e-06, "loss": 1.3675, "step": 1220 },
    { "epoch": 1.98, "grad_norm": 28.81484603881836, "learning_rate": 3.3977455716586155e-06, "loss": 1.2854, "step": 1230 },
    { "epoch": 2.0, "grad_norm": 28.966217041015625, "learning_rate": 3.3440687063875473e-06, "loss": 1.3431, "step": 1240 },
    { "epoch": 2.01, "grad_norm": 23.021453857421875, "learning_rate": 3.290391841116479e-06, "loss": 1.0516, "step": 1250 },
    { "epoch": 2.03, "grad_norm": 25.622419357299805, "learning_rate": 3.236714975845411e-06, "loss": 0.9667, "step": 1260 },
    { "epoch": 2.04, "grad_norm": 26.45795249938965, "learning_rate": 3.1830381105743428e-06, "loss": 0.9341, "step": 1270 },
    { "epoch": 2.06, "grad_norm": 26.28618812561035, "learning_rate": 3.1293612453032746e-06, "loss": 0.9024, "step": 1280 },
    { "epoch": 2.08, "grad_norm": 24.80799102783203, "learning_rate": 3.075684380032206e-06, "loss": 0.8784, "step": 1290 },
    { "epoch": 2.09, "grad_norm": 25.70199966430664, "learning_rate": 3.0220075147611383e-06, "loss": 0.9307, "step": 1300 },
    { "epoch": 2.11, "grad_norm": 24.88735580444336, "learning_rate": 2.9683306494900697e-06, "loss": 0.9037, "step": 1310 },
    { "epoch": 2.12, "grad_norm": 26.51141929626465, "learning_rate": 2.914653784219002e-06, "loss": 0.9884, "step": 1320 },
    { "epoch": 2.14, "grad_norm": 25.662946701049805, "learning_rate": 2.8609769189479338e-06, "loss": 0.8631, "step": 1330 },
    { "epoch": 2.16, "grad_norm": 22.733741760253906, "learning_rate": 2.8073000536768656e-06, "loss": 0.9273, "step": 1340 },
    { "epoch": 2.17, "grad_norm": 24.159793853759766, "learning_rate": 2.7536231884057974e-06, "loss": 0.9262, "step": 1350 },
    { "epoch": 2.19, "grad_norm": 23.92421531677246, "learning_rate": 2.699946323134729e-06, "loss": 0.9048, "step": 1360 },
    { "epoch": 2.2, "grad_norm": 28.564496994018555, "learning_rate": 2.646269457863661e-06, "loss": 0.9879, "step": 1370 },
    { "epoch": 2.22, "grad_norm": 25.430883407592773, "learning_rate": 2.5925925925925925e-06, "loss": 0.892, "step": 1380 },
    { "epoch": 2.24, "grad_norm": 23.307687759399414, "learning_rate": 2.5389157273215247e-06, "loss": 0.894, "step": 1390 },
    { "epoch": 2.25, "grad_norm": 25.83247184753418, "learning_rate": 2.4852388620504566e-06, "loss": 0.8817, "step": 1400 },
    { "epoch": 2.27, "grad_norm": 25.72507095336914, "learning_rate": 2.4315619967793884e-06, "loss": 0.9155, "step": 1410 },
    { "epoch": 2.29, "grad_norm": 26.67945098876953, "learning_rate": 2.3778851315083202e-06, "loss": 0.9325, "step": 1420 },
    { "epoch": 2.3, "grad_norm": 25.82522964477539, "learning_rate": 2.324208266237252e-06, "loss": 0.9055, "step": 1430 },
    { "epoch": 2.32, "grad_norm": 22.66315269470215, "learning_rate": 2.270531400966184e-06, "loss": 0.8631, "step": 1440 },
    { "epoch": 2.33, "grad_norm": 25.832313537597656, "learning_rate": 2.2168545356951157e-06, "loss": 0.9539, "step": 1450 },
    { "epoch": 2.35, "grad_norm": 23.865262985229492, "learning_rate": 2.163177670424047e-06, "loss": 0.8461, "step": 1460 },
    { "epoch": 2.37, "grad_norm": 23.32217025756836, "learning_rate": 2.109500805152979e-06, "loss": 0.8595, "step": 1470 },
    { "epoch": 2.38, "grad_norm": 24.299062728881836, "learning_rate": 2.0558239398819112e-06, "loss": 0.899, "step": 1480 },
    { "epoch": 2.4, "grad_norm": 25.582359313964844, "learning_rate": 2.002147074610843e-06, "loss": 0.898, "step": 1490 },
    { "epoch": 2.41, "grad_norm": 25.416006088256836, "learning_rate": 1.948470209339775e-06, "loss": 0.898, "step": 1500 }
  ],
  "logging_steps": 10,
  "max_steps": 1863,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 5730542923874304.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|