{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 2025,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014814814814814815,
      "grad_norm": 26.017102109115793,
      "learning_rate": 4.926108374384237e-07,
      "loss": 0.5936,
      "step": 10
    },
    {
      "epoch": 0.02962962962962963,
      "grad_norm": 12.914864221350799,
      "learning_rate": 9.852216748768474e-07,
      "loss": 0.3977,
      "step": 20
    },
    {
      "epoch": 0.044444444444444446,
      "grad_norm": 7.447780113204344,
      "learning_rate": 1.4778325123152712e-06,
      "loss": 0.2758,
      "step": 30
    },
    {
      "epoch": 0.05925925925925926,
      "grad_norm": 7.2037494981971735,
      "learning_rate": 1.970443349753695e-06,
      "loss": 0.206,
      "step": 40
    },
    {
      "epoch": 0.07407407407407407,
      "grad_norm": 9.39085512125214,
      "learning_rate": 2.4630541871921186e-06,
      "loss": 0.2267,
      "step": 50
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 7.044937181975286,
      "learning_rate": 2.9556650246305424e-06,
      "loss": 0.2088,
      "step": 60
    },
    {
      "epoch": 0.1037037037037037,
      "grad_norm": 5.795730113429066,
      "learning_rate": 3.448275862068966e-06,
      "loss": 0.197,
      "step": 70
    },
    {
      "epoch": 0.11851851851851852,
      "grad_norm": 7.080218597087718,
      "learning_rate": 3.94088669950739e-06,
      "loss": 0.1934,
      "step": 80
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 5.53348676182737,
      "learning_rate": 4.4334975369458135e-06,
      "loss": 0.2117,
      "step": 90
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 5.286987282479059,
      "learning_rate": 4.926108374384237e-06,
      "loss": 0.2408,
      "step": 100
    },
    {
      "epoch": 0.16296296296296298,
      "grad_norm": 6.606219609047498,
      "learning_rate": 5.41871921182266e-06,
      "loss": 0.2066,
      "step": 110
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 4.701177482407085,
      "learning_rate": 5.911330049261085e-06,
      "loss": 0.2281,
      "step": 120
    },
    {
      "epoch": 0.1925925925925926,
      "grad_norm": 5.456763973680646,
      "learning_rate": 6.403940886699508e-06,
      "loss": 0.234,
      "step": 130
    },
    {
      "epoch": 0.2074074074074074,
      "grad_norm": 5.574928605824461,
      "learning_rate": 6.896551724137932e-06,
      "loss": 0.2239,
      "step": 140
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 4.2789352272007095,
      "learning_rate": 7.3891625615763555e-06,
      "loss": 0.2339,
      "step": 150
    },
    {
      "epoch": 0.23703703703703705,
      "grad_norm": 4.6444068197911585,
      "learning_rate": 7.88177339901478e-06,
      "loss": 0.2261,
      "step": 160
    },
    {
      "epoch": 0.2518518518518518,
      "grad_norm": 4.889863953460466,
      "learning_rate": 8.374384236453203e-06,
      "loss": 0.253,
      "step": 170
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 4.819379627675783,
      "learning_rate": 8.866995073891627e-06,
      "loss": 0.2592,
      "step": 180
    },
    {
      "epoch": 0.2814814814814815,
      "grad_norm": 4.783281320760556,
      "learning_rate": 9.359605911330049e-06,
      "loss": 0.2558,
      "step": 190
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 4.6210148664541055,
      "learning_rate": 9.852216748768475e-06,
      "loss": 0.2791,
      "step": 200
    },
    {
      "epoch": 0.3111111111111111,
      "grad_norm": 6.194129626142383,
      "learning_rate": 9.99963580513638e-06,
      "loss": 0.266,
      "step": 210
    },
    {
      "epoch": 0.32592592592592595,
      "grad_norm": 4.306028672789547,
      "learning_rate": 9.997852121279563e-06,
      "loss": 0.2644,
      "step": 220
    },
    {
      "epoch": 0.34074074074074073,
      "grad_norm": 4.550912215301358,
      "learning_rate": 9.994582585118449e-06,
      "loss": 0.2398,
      "step": 230
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 2.7873840728898416,
      "learning_rate": 9.989828168680164e-06,
      "loss": 0.2459,
      "step": 240
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 5.670238707619235,
      "learning_rate": 9.983590285444025e-06,
      "loss": 0.2874,
      "step": 250
    },
    {
      "epoch": 0.3851851851851852,
      "grad_norm": 3.666204083864708,
      "learning_rate": 9.975870789921322e-06,
      "loss": 0.243,
      "step": 260
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.765419867995775,
      "learning_rate": 9.966671977103972e-06,
      "loss": 0.2418,
      "step": 270
    },
    {
      "epoch": 0.4148148148148148,
      "grad_norm": 4.266352929836881,
      "learning_rate": 9.955996581782218e-06,
      "loss": 0.3023,
      "step": 280
    },
    {
      "epoch": 0.42962962962962964,
      "grad_norm": 5.260394697457832,
      "learning_rate": 9.943847777731584e-06,
      "loss": 0.2765,
      "step": 290
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 8.072668515977384,
      "learning_rate": 9.93022917676932e-06,
      "loss": 0.2804,
      "step": 300
    },
    {
      "epoch": 0.45925925925925926,
      "grad_norm": 8.611241020880648,
      "learning_rate": 9.915144827680606e-06,
      "loss": 0.2703,
      "step": 310
    },
    {
      "epoch": 0.4740740740740741,
      "grad_norm": 5.269422886099256,
      "learning_rate": 9.898599215014868e-06,
      "loss": 0.2875,
      "step": 320
    },
    {
      "epoch": 0.4888888888888889,
      "grad_norm": 4.630321430071603,
      "learning_rate": 9.880597257752522e-06,
      "loss": 0.2639,
      "step": 330
    },
    {
      "epoch": 0.5037037037037037,
      "grad_norm": 6.86287315478417,
      "learning_rate": 9.861144307842574e-06,
      "loss": 0.2339,
      "step": 340
    },
    {
      "epoch": 0.5185185185185185,
      "grad_norm": 4.726853171099211,
      "learning_rate": 9.840246148611485e-06,
      "loss": 0.2552,
      "step": 350
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 3.24789768577898,
      "learning_rate": 9.817908993043819e-06,
      "loss": 0.2817,
      "step": 360
    },
    {
      "epoch": 0.5481481481481482,
      "grad_norm": 4.1350169337840965,
      "learning_rate": 9.794139481935108e-06,
      "loss": 0.2641,
      "step": 370
    },
    {
      "epoch": 0.562962962962963,
      "grad_norm": 2.730446652174673,
      "learning_rate": 9.768944681917582e-06,
      "loss": 0.2826,
      "step": 380
    },
    {
      "epoch": 0.5777777777777777,
      "grad_norm": 3.9971570005802266,
      "learning_rate": 9.742332083359252e-06,
      "loss": 0.2559,
      "step": 390
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 3.753217276080861,
      "learning_rate": 9.714309598137045e-06,
      "loss": 0.2306,
      "step": 400
    },
    {
      "epoch": 0.6074074074074074,
      "grad_norm": 15.424898203468624,
      "learning_rate": 9.68488555728462e-06,
      "loss": 0.2682,
      "step": 410
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 3.8975775295207318,
      "learning_rate": 9.654068708515564e-06,
      "loss": 0.2607,
      "step": 420
    },
    {
      "epoch": 0.6370370370370371,
      "grad_norm": 4.141551276129438,
      "learning_rate": 9.621868213622713e-06,
      "loss": 0.2383,
      "step": 430
    },
    {
      "epoch": 0.6518518518518519,
      "grad_norm": 3.9225855004559356,
      "learning_rate": 9.588293645754363e-06,
      "loss": 0.293,
      "step": 440
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 3.2761325115244624,
      "learning_rate": 9.553354986568201e-06,
      "loss": 0.2942,
      "step": 450
    },
    {
      "epoch": 0.6814814814814815,
      "grad_norm": 3.0682621048324945,
      "learning_rate": 9.517062623263768e-06,
      "loss": 0.2524,
      "step": 460
    },
    {
      "epoch": 0.6962962962962963,
      "grad_norm": 3.231493961817089,
      "learning_rate": 9.479427345494366e-06,
      "loss": 0.2623,
      "step": 470
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 5.3337328348116015,
      "learning_rate": 9.440460342159314e-06,
      "loss": 0.2786,
      "step": 480
    },
    {
      "epoch": 0.725925925925926,
      "grad_norm": 4.944222927733353,
      "learning_rate": 9.40017319807751e-06,
      "loss": 0.2276,
      "step": 490
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 4.898395138758685,
      "learning_rate": 9.358577890543277e-06,
      "loss": 0.2781,
      "step": 500
    },
    {
      "epoch": 0.7407407407407407,
      "eval_loss": 0.2680646777153015,
      "eval_runtime": 110.7868,
      "eval_samples_per_second": 2.708,
      "eval_steps_per_second": 1.354,
      "step": 500
    },
    {
      "epoch": 0.7555555555555555,
      "grad_norm": 3.853101145062431,
      "learning_rate": 9.315686785765556e-06,
      "loss": 0.2637,
      "step": 510
    },
    {
      "epoch": 0.7703703703703704,
      "grad_norm": 4.4209909567001535,
      "learning_rate": 9.271512635191427e-06,
      "loss": 0.2707,
      "step": 520
    },
    {
      "epoch": 0.7851851851851852,
      "grad_norm": 4.336113654547109,
      "learning_rate": 9.22606857171515e-06,
      "loss": 0.2376,
      "step": 530
    },
    {
      "epoch": 0.8,
      "grad_norm": 5.107334588558504,
      "learning_rate": 9.179368105773768e-06,
      "loss": 0.2495,
      "step": 540
    },
    {
      "epoch": 0.8148148148148148,
      "grad_norm": 4.730641918906785,
      "learning_rate": 9.131425121330477e-06,
      "loss": 0.2379,
      "step": 550
    },
    {
      "epoch": 0.8296296296296296,
      "grad_norm": 19.960301317231036,
      "learning_rate": 9.082253871746962e-06,
      "loss": 0.2702,
      "step": 560
    },
    {
      "epoch": 0.8444444444444444,
      "grad_norm": 6.6215773453228115,
      "learning_rate": 9.031868975545884e-06,
      "loss": 0.2961,
      "step": 570
    },
    {
      "epoch": 0.8592592592592593,
      "grad_norm": 3.0739713217438274,
      "learning_rate": 8.980285412064827e-06,
      "loss": 0.2381,
      "step": 580
    },
    {
      "epoch": 0.8740740740740741,
      "grad_norm": 3.748817408124034,
      "learning_rate": 8.92751851700297e-06,
      "loss": 0.2258,
      "step": 590
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 3.6234404866008996,
      "learning_rate": 8.873583977861802e-06,
      "loss": 0.2395,
      "step": 600
    },
    {
      "epoch": 0.9037037037037037,
      "grad_norm": 4.04735847376876,
      "learning_rate": 8.818497829281272e-06,
      "loss": 0.2458,
      "step": 610
    },
    {
      "epoch": 0.9185185185185185,
      "grad_norm": 3.6442745029714296,
      "learning_rate": 8.762276448272709e-06,
      "loss": 0.2552,
      "step": 620
    },
    {
      "epoch": 0.9333333333333333,
      "grad_norm": 4.331703349382409,
      "learning_rate": 8.70493654934996e-06,
      "loss": 0.2909,
      "step": 630
    },
    {
      "epoch": 0.9481481481481482,
      "grad_norm": 6.051325269419405,
      "learning_rate": 8.646495179560221e-06,
      "loss": 0.2564,
      "step": 640
    },
    {
      "epoch": 0.9629629629629629,
      "grad_norm": 4.436797287629697,
      "learning_rate": 8.586969713415949e-06,
      "loss": 0.2374,
      "step": 650
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 2.756495280511311,
      "learning_rate": 8.526377847729475e-06,
      "loss": 0.2441,
      "step": 660
    },
    {
      "epoch": 0.9925925925925926,
      "grad_norm": 2.460062031773824,
      "learning_rate": 8.46473759635176e-06,
      "loss": 0.2566,
      "step": 670
    },
    {
      "epoch": 1.0074074074074073,
      "grad_norm": 2.8968674379949215,
      "learning_rate": 8.402067284816919e-06,
      "loss": 0.1853,
      "step": 680
    },
    {
      "epoch": 1.0222222222222221,
      "grad_norm": 3.1957903906863465,
      "learning_rate": 8.338385544894073e-06,
      "loss": 0.127,
      "step": 690
    },
    {
      "epoch": 1.037037037037037,
      "grad_norm": 2.6559772016064587,
      "learning_rate": 8.273711309048145e-06,
      "loss": 0.1254,
      "step": 700
    },
    {
      "epoch": 1.0518518518518518,
      "grad_norm": 3.0419408376224033,
      "learning_rate": 8.208063804811293e-06,
      "loss": 0.1385,
      "step": 710
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 2.324938116275548,
      "learning_rate": 8.141462549066581e-06,
      "loss": 0.1344,
      "step": 720
    },
    {
      "epoch": 1.0814814814814815,
      "grad_norm": 3.4270759875212233,
      "learning_rate": 8.073927342245663e-06,
      "loss": 0.1419,
      "step": 730
    },
    {
      "epoch": 1.0962962962962963,
      "grad_norm": 2.6055600373408105,
      "learning_rate": 8.005478262442132e-06,
      "loss": 0.1308,
      "step": 740
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 2.573600018786121,
      "learning_rate": 7.936135659442355e-06,
      "loss": 0.133,
      "step": 750
    },
    {
      "epoch": 1.125925925925926,
      "grad_norm": 1.3199535934067952,
      "learning_rate": 7.86592014867551e-06,
      "loss": 0.1099,
      "step": 760
    },
    {
      "epoch": 1.1407407407407408,
      "grad_norm": 2.5610850600680113,
      "learning_rate": 7.794852605084661e-06,
      "loss": 0.1429,
      "step": 770
    },
    {
      "epoch": 1.1555555555555554,
      "grad_norm": 2.72955021912288,
      "learning_rate": 7.722954156920675e-06,
      "loss": 0.1227,
      "step": 780
    },
    {
      "epoch": 1.1703703703703703,
      "grad_norm": 3.447611029467879,
      "learning_rate": 7.650246179460826e-06,
      "loss": 0.1499,
      "step": 790
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 4.42799750544096,
      "learning_rate": 7.57675028865397e-06,
      "loss": 0.1564,
      "step": 800
    },
    {
      "epoch": 1.2,
      "grad_norm": 2.9245723364864586,
      "learning_rate": 7.502488334694167e-06,
      "loss": 0.1441,
      "step": 810
    },
    {
      "epoch": 1.2148148148148148,
      "grad_norm": 3.9787651452124524,
      "learning_rate": 7.427482395524646e-06,
      "loss": 0.1317,
      "step": 820
    },
    {
      "epoch": 1.2296296296296296,
      "grad_norm": 3.5447225202330745,
      "learning_rate": 7.35175477027408e-06,
      "loss": 0.1211,
      "step": 830
    },
    {
      "epoch": 1.2444444444444445,
      "grad_norm": 3.013749806838306,
      "learning_rate": 7.2753279726271e-06,
      "loss": 0.1284,
      "step": 840
    },
    {
      "epoch": 1.2592592592592593,
      "grad_norm": 2.8757278244677553,
      "learning_rate": 7.198224724131012e-06,
      "loss": 0.1436,
      "step": 850
    },
    {
      "epoch": 1.2740740740740741,
      "grad_norm": 3.813666156239548,
      "learning_rate": 7.120467947440719e-06,
      "loss": 0.1221,
      "step": 860
    },
    {
      "epoch": 1.2888888888888888,
      "grad_norm": 2.3407714617223885,
      "learning_rate": 7.042080759503866e-06,
      "loss": 0.1162,
      "step": 870
    },
    {
      "epoch": 1.3037037037037038,
      "grad_norm": 2.5559098463338397,
      "learning_rate": 6.963086464688209e-06,
      "loss": 0.1291,
      "step": 880
    },
    {
      "epoch": 1.3185185185185184,
      "grad_norm": 2.5329035033815637,
      "learning_rate": 6.883508547853268e-06,
      "loss": 0.1319,
      "step": 890
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 4.756367202905442,
      "learning_rate": 6.8033706673683276e-06,
      "loss": 0.125,
      "step": 900
    },
    {
      "epoch": 1.348148148148148,
      "grad_norm": 2.4135868103281446,
      "learning_rate": 6.722696648078838e-06,
      "loss": 0.1382,
      "step": 910
    },
    {
      "epoch": 1.362962962962963,
      "grad_norm": 2.1274954103320143,
      "learning_rate": 6.641510474223338e-06,
      "loss": 0.1311,
      "step": 920
    },
    {
      "epoch": 1.3777777777777778,
      "grad_norm": 5.925560885437899,
      "learning_rate": 6.559836282302984e-06,
      "loss": 0.1416,
      "step": 930
    },
    {
      "epoch": 1.3925925925925926,
      "grad_norm": 2.628591269898234,
      "learning_rate": 6.477698353905808e-06,
      "loss": 0.1519,
      "step": 940
    },
    {
      "epoch": 1.4074074074074074,
      "grad_norm": 2.7741467084148925,
      "learning_rate": 6.395121108487855e-06,
      "loss": 0.1179,
      "step": 950
    },
    {
      "epoch": 1.4222222222222223,
      "grad_norm": 2.4497960109998362,
      "learning_rate": 6.312129096113313e-06,
      "loss": 0.1329,
      "step": 960
    },
    {
      "epoch": 1.4370370370370371,
      "grad_norm": 3.192126852233065,
      "learning_rate": 6.228746990155831e-06,
      "loss": 0.0981,
      "step": 970
    },
    {
      "epoch": 1.4518518518518517,
      "grad_norm": 2.180408863260512,
      "learning_rate": 6.144999579963164e-06,
      "loss": 0.116,
      "step": 980
    },
    {
      "epoch": 1.4666666666666668,
      "grad_norm": 2.60239822124948,
      "learning_rate": 6.060911763487353e-06,
      "loss": 0.136,
      "step": 990
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 2.9763711731701745,
      "learning_rate": 5.976508539882604e-06,
      "loss": 0.1417,
      "step": 1000
    },
    {
      "epoch": 1.4814814814814814,
      "eval_loss": 0.23991718888282776,
      "eval_runtime": 110.0876,
      "eval_samples_per_second": 2.725,
      "eval_steps_per_second": 1.363,
      "step": 1000
    },
    {
      "epoch": 1.4962962962962962,
      "grad_norm": 1.5705062635477183,
      "learning_rate": 5.891815002073081e-06,
      "loss": 0.1118,
      "step": 1010
    },
    {
      "epoch": 1.511111111111111,
      "grad_norm": 1.8570339015498183,
      "learning_rate": 5.806856329292839e-06,
      "loss": 0.1202,
      "step": 1020
    },
    {
      "epoch": 1.525925925925926,
      "grad_norm": 2.9744617003579523,
      "learning_rate": 5.721657779600071e-06,
      "loss": 0.1256,
      "step": 1030
    },
    {
      "epoch": 1.5407407407407407,
      "grad_norm": 3.0753703391352056,
      "learning_rate": 5.636244682367937e-06,
      "loss": 0.1402,
      "step": 1040
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 2.464360253185267,
      "learning_rate": 5.5506424307541895e-06,
      "loss": 0.1034,
      "step": 1050
    },
    {
      "epoch": 1.5703703703703704,
      "grad_norm": 2.836329426651843,
      "learning_rate": 5.464876474151835e-06,
      "loss": 0.1302,
      "step": 1060
    },
    {
      "epoch": 1.585185185185185,
      "grad_norm": 4.20442553150441,
      "learning_rate": 5.3789723106230675e-06,
      "loss": 0.128,
      "step": 1070
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.189788584493857,
      "learning_rate": 5.292955479318756e-06,
      "loss": 0.1265,
      "step": 1080
    },
    {
      "epoch": 1.6148148148148147,
      "grad_norm": 2.1857430787318273,
      "learning_rate": 5.206851552885691e-06,
      "loss": 0.1276,
      "step": 1090
    },
    {
      "epoch": 1.6296296296296298,
      "grad_norm": 2.0830677851745283,
      "learning_rate": 5.120686129863882e-06,
      "loss": 0.1131,
      "step": 1100
    },
    {
      "epoch": 1.6444444444444444,
      "grad_norm": 2.399753090852167,
      "learning_rate": 5.0344848270761635e-06,
      "loss": 0.1353,
      "step": 1110
    },
    {
      "epoch": 1.6592592592592592,
      "grad_norm": 2.972662069394894,
      "learning_rate": 4.948273272012363e-06,
      "loss": 0.1256,
      "step": 1120
    },
    {
      "epoch": 1.674074074074074,
      "grad_norm": 2.725058666193907,
      "learning_rate": 4.862077095210284e-06,
      "loss": 0.1223,
      "step": 1130
    },
    {
      "epoch": 1.6888888888888889,
      "grad_norm": 2.403455581216295,
      "learning_rate": 4.775921922635806e-06,
      "loss": 0.1101,
      "step": 1140
    },
    {
      "epoch": 1.7037037037037037,
      "grad_norm": 2.954298808597469,
      "learning_rate": 4.689833368064326e-06,
      "loss": 0.1136,
      "step": 1150
    },
    {
      "epoch": 1.7185185185185186,
      "grad_norm": 2.065869616587133,
      "learning_rate": 4.603837025465829e-06,
      "loss": 0.0936,
      "step": 1160
    },
    {
      "epoch": 1.7333333333333334,
      "grad_norm": 2.5225001244758434,
      "learning_rate": 4.517958461395846e-06,
      "loss": 0.1035,
      "step": 1170
    },
    {
      "epoch": 1.748148148148148,
      "grad_norm": 2.278719889103367,
      "learning_rate": 4.432223207394577e-06,
      "loss": 0.0978,
      "step": 1180
    },
    {
      "epoch": 1.762962962962963,
      "grad_norm": 3.0324188432282204,
      "learning_rate": 4.346656752396388e-06,
      "loss": 0.1234,
      "step": 1190
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 3.15770874304934,
      "learning_rate": 4.261284535152016e-06,
      "loss": 0.1117,
      "step": 1200
    },
    {
      "epoch": 1.7925925925925927,
      "grad_norm": 2.4288815684292007,
      "learning_rate": 4.176131936665669e-06,
      "loss": 0.1151,
      "step": 1210
    },
    {
      "epoch": 1.8074074074074074,
      "grad_norm": 2.2345406971177586,
      "learning_rate": 4.0912242726493e-06,
      "loss": 0.1087,
      "step": 1220
    },
    {
      "epoch": 1.8222222222222222,
      "grad_norm": 1.3631354507880038,
      "learning_rate": 4.006586785996285e-06,
      "loss": 0.1094,
      "step": 1230
    },
    {
      "epoch": 1.837037037037037,
      "grad_norm": 2.759382864621944,
      "learning_rate": 3.922244639276773e-06,
      "loss": 0.1077,
      "step": 1240
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 2.299484616218074,
      "learning_rate": 3.838222907256884e-06,
      "loss": 0.1106,
      "step": 1250
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 3.450427642587667,
      "learning_rate": 3.7545465694440363e-06,
      "loss": 0.1075,
      "step": 1260
    },
    {
      "epoch": 1.8814814814814815,
      "grad_norm": 1.946789383498192,
      "learning_rate": 3.6712405026605792e-06,
      "loss": 0.1097,
      "step": 1270
    },
    {
      "epoch": 1.8962962962962964,
      "grad_norm": 3.1898982494578663,
      "learning_rate": 3.5883294736479612e-06,
      "loss": 0.1004,
      "step": 1280
    },
    {
      "epoch": 1.911111111111111,
      "grad_norm": 2.180121020731452,
      "learning_rate": 3.5058381317036285e-06,
      "loss": 0.0906,
      "step": 1290
    },
    {
      "epoch": 1.925925925925926,
      "grad_norm": 2.4521692799587593,
      "learning_rate": 3.423791001352823e-06,
      "loss": 0.1209,
      "step": 1300
    },
    {
      "epoch": 1.9407407407407407,
      "grad_norm": 2.578764727071429,
      "learning_rate": 3.3422124750574902e-06,
      "loss": 0.1084,
      "step": 1310
    },
    {
      "epoch": 1.9555555555555557,
      "grad_norm": 2.9192101640040526,
      "learning_rate": 3.2611268059644535e-06,
      "loss": 0.0974,
      "step": 1320
    },
    {
      "epoch": 1.9703703703703703,
      "grad_norm": 3.1900819524103254,
      "learning_rate": 3.1805581006949856e-06,
      "loss": 0.1291,
      "step": 1330
    },
    {
      "epoch": 1.9851851851851852,
      "grad_norm": 3.5372332007233585,
      "learning_rate": 3.100530312177956e-06,
      "loss": 0.1028,
      "step": 1340
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.682319850341937,
      "learning_rate": 3.0210672325286806e-06,
      "loss": 0.1026,
      "step": 1350
    },
    {
      "epoch": 2.0148148148148146,
      "grad_norm": 1.3372912715867262,
      "learning_rate": 2.9421924859755525e-06,
      "loss": 0.0428,
      "step": 1360
    },
    {
      "epoch": 2.0296296296296297,
      "grad_norm": 1.3386975885602668,
      "learning_rate": 2.8639295218366115e-06,
      "loss": 0.0373,
      "step": 1370
    },
    {
      "epoch": 2.0444444444444443,
      "grad_norm": 1.2812593118990734,
      "learning_rate": 2.78630160754811e-06,
      "loss": 0.0354,
      "step": 1380
    },
    {
      "epoch": 2.0592592592592593,
      "grad_norm": 1.290388617260846,
      "learning_rate": 2.709331821747133e-06,
      "loss": 0.0265,
      "step": 1390
    },
    {
      "epoch": 2.074074074074074,
      "grad_norm": 1.573529577676281,
      "learning_rate": 2.63304304741037e-06,
      "loss": 0.0278,
      "step": 1400
    },
    {
      "epoch": 2.088888888888889,
      "grad_norm": 2.419667316602129,
      "learning_rate": 2.55745796505105e-06,
      "loss": 0.0487,
      "step": 1410
    },
    {
      "epoch": 2.1037037037037036,
      "grad_norm": 1.0785142520976503,
      "learning_rate": 2.482599045976059e-06,
      "loss": 0.0364,
      "step": 1420
    },
    {
      "epoch": 2.1185185185185187,
      "grad_norm": 2.0952689701078784,
      "learning_rate": 2.408488545605265e-06,
      "loss": 0.045,
      "step": 1430
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 2.823990301006105,
      "learning_rate": 2.3351484968550264e-06,
      "loss": 0.0368,
      "step": 1440
    },
    {
      "epoch": 2.148148148148148,
      "grad_norm": 1.7955041398866707,
      "learning_rate": 2.2626007035878377e-06,
      "loss": 0.0421,
      "step": 1450
    },
    {
      "epoch": 2.162962962962963,
      "grad_norm": 0.8109338878062605,
      "learning_rate": 2.1908667341300923e-06,
      "loss": 0.0315,
      "step": 1460
    },
    {
      "epoch": 2.1777777777777776,
      "grad_norm": 1.9133299721589396,
      "learning_rate": 2.1199679148598434e-06,
      "loss": 0.0348,
      "step": 1470
    },
    {
      "epoch": 2.1925925925925926,
      "grad_norm": 1.2071834999832143,
      "learning_rate": 2.0499253238665284e-06,
      "loss": 0.0496,
      "step": 1480
    },
    {
      "epoch": 2.2074074074074073,
      "grad_norm": 1.0217372762938706,
      "learning_rate": 1.9807597846844737e-06,
      "loss": 0.0287,
      "step": 1490
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 1.0023764675817943,
      "learning_rate": 1.9124918601021124e-06,
      "loss": 0.0362,
      "step": 1500
    },
    {
      "epoch": 2.2222222222222223,
      "eval_loss": 0.25548920035362244,
      "eval_runtime": 109.9471,
      "eval_samples_per_second": 2.729,
      "eval_steps_per_second": 1.364,
      "step": 1500
    },
    {
      "epoch": 2.237037037037037,
      "grad_norm": 3.3820011756317867,
      "learning_rate": 1.845141846048691e-06,
      "loss": 0.0327,
      "step": 1510
    },
    {
      "epoch": 2.251851851851852,
      "grad_norm": 1.8639592123431572,
      "learning_rate": 1.778729765560337e-06,
      "loss": 0.0275,
      "step": 1520
    },
    {
      "epoch": 2.2666666666666666,
      "grad_norm": 1.924019925176325,
      "learning_rate": 1.7132753628272403e-06,
      "loss": 0.0465,
      "step": 1530
    },
    {
      "epoch": 2.2814814814814817,
      "grad_norm": 1.152773684003778,
      "learning_rate": 1.6487980973237434e-06,
      "loss": 0.0308,
      "step": 1540
    },
    {
      "epoch": 2.2962962962962963,
      "grad_norm": 1.894824044203695,
      "learning_rate": 1.5853171380230791e-06,
      "loss": 0.0345,
      "step": 1550
    },
    {
      "epoch": 2.311111111111111,
      "grad_norm": 1.3328611183809627,
      "learning_rate": 1.5228513576984633e-06,
      "loss": 0.0281,
      "step": 1560
    },
    {
      "epoch": 2.325925925925926,
      "grad_norm": 3.750377643295275,
      "learning_rate": 1.4614193273122562e-06,
      "loss": 0.0445,
      "step": 1570
    },
    {
      "epoch": 2.3407407407407406,
      "grad_norm": 3.8759939460354915,
      "learning_rate": 1.401039310494855e-06,
      "loss": 0.0435,
      "step": 1580
    },
    {
      "epoch": 2.3555555555555556,
      "grad_norm": 3.0989225284077917,
      "learning_rate": 1.3417292581149388e-06,
      "loss": 0.0353,
      "step": 1590
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 0.9919766228227254,
      "learning_rate": 1.2835068029427188e-06,
      "loss": 0.0299,
      "step": 1600
    },
    {
      "epoch": 2.3851851851851853,
      "grad_norm": 1.1007285921225456,
      "learning_rate": 1.2263892544077439e-06,
      "loss": 0.0351,
      "step": 1610
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.8350934062145996,
      "learning_rate": 1.1703935934528327e-06,
      "loss": 0.0449,
      "step": 1620
    },
    {
      "epoch": 2.414814814814815,
      "grad_norm": 1.0737399447700786,
      "learning_rate": 1.1155364674856834e-06,
      "loss": 0.0279,
      "step": 1630
    },
    {
      "epoch": 2.4296296296296296,
      "grad_norm": 1.1914051412570683,
      "learning_rate": 1.0618341854296176e-06,
      "loss": 0.0443,
      "step": 1640
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 1.4625234311241986,
      "learning_rate": 1.0093027128749722e-06,
      "loss": 0.0398,
      "step": 1650
    },
    {
      "epoch": 2.4592592592592593,
      "grad_norm": 2.809214532722792,
      "learning_rate": 9.57957667332562e-07,
      "loss": 0.0392,
      "step": 1660
    },
    {
      "epoch": 2.474074074074074,
      "grad_norm": 2.048944419289513,
      "learning_rate": 9.078143135906154e-07,
      "loss": 0.0361,
      "step": 1670
    },
    {
      "epoch": 2.488888888888889,
      "grad_norm": 2.2346039939090168,
      "learning_rate": 8.588875591765838e-07,
      "loss": 0.0424,
      "step": 1680
    },
    {
      "epoch": 2.5037037037037035,
      "grad_norm": 0.8685048120192506,
      "learning_rate": 8.111919499251653e-07,
      "loss": 0.0265,
      "step": 1690
    },
    {
      "epoch": 2.5185185185185186,
      "grad_norm": 5.531554078682163,
      "learning_rate": 7.647416656538464e-07,
      "loss": 0.0379,
      "step": 1700
    },
    {
      "epoch": 2.533333333333333,
      "grad_norm": 2.567773799038788,
      "learning_rate": 7.195505159472726e-07,
      "loss": 0.0269,
      "step": 1710
    },
    {
      "epoch": 2.5481481481481483,
      "grad_norm": 1.5337559661912539,
      "learning_rate": 6.756319360516856e-07,
      "loss": 0.0276,
      "step": 1720
    },
    {
      "epoch": 2.562962962962963,
      "grad_norm": 2.401685833878839,
      "learning_rate": 6.329989828806482e-07,
      "loss": 0.0283,
      "step": 1730
    },
    {
      "epoch": 2.5777777777777775,
      "grad_norm": 1.682913358837789,
      "learning_rate": 5.916643311332438e-07,
      "loss": 0.0391,
      "step": 1740
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 1.689757637438205,
      "learning_rate": 5.516402695259165e-07,
      "loss": 0.0324,
      "step": 1750
    },
    {
      "epoch": 2.6074074074074076,
      "grad_norm": 1.2197797882219985,
      "learning_rate": 5.12938697139056e-07,
      "loss": 0.0298,
      "step": 1760
    },
    {
      "epoch": 2.6222222222222222,
      "grad_norm": 2.191377307550078,
      "learning_rate": 4.755711198794233e-07,
      "loss": 0.0323,
      "step": 1770
    },
    {
      "epoch": 2.637037037037037,
      "grad_norm": 1.103714063244568,
      "learning_rate": 4.395486470594645e-07,
      "loss": 0.0256,
      "step": 1780
    },
    {
      "epoch": 2.651851851851852,
      "grad_norm": 1.6229391835726705,
      "learning_rate": 4.048819880945337e-07,
      "loss": 0.0291,
      "step": 1790
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 1.6366466447258576,
      "learning_rate": 3.7158144931900395e-07,
      "loss": 0.0262,
      "step": 1800
    },
    {
      "epoch": 2.6814814814814816,
      "grad_norm": 1.3157936293009576,
      "learning_rate": 3.396569309222114e-07,
      "loss": 0.0282,
      "step": 1810
    },
    {
      "epoch": 2.696296296296296,
      "grad_norm": 1.3484419903044749,
      "learning_rate": 3.091179240051462e-07,
      "loss": 0.0282,
      "step": 1820
    },
    {
      "epoch": 2.7111111111111112,
      "grad_norm": 2.820403024559756,
      "learning_rate": 2.799735077587695e-07,
      "loss": 0.036,
      "step": 1830
    },
    {
      "epoch": 2.725925925925926,
      "grad_norm": 1.108338803655822,
      "learning_rate": 2.5223234676478193e-07,
      "loss": 0.0236,
      "step": 1840
    },
    {
      "epoch": 2.7407407407407405,
      "grad_norm": 1.1593020570108006,
      "learning_rate": 2.2590268841966357e-07,
      "loss": 0.0306,
      "step": 1850
    },
    {
      "epoch": 2.7555555555555555,
      "grad_norm": 0.9513752742372911,
      "learning_rate": 2.0099236048273407e-07,
      "loss": 0.0306,
      "step": 1860
    },
    {
      "epoch": 2.7703703703703706,
      "grad_norm": 2.6444015892408417,
      "learning_rate": 1.7750876874897627e-07,
      "loss": 0.0317,
      "step": 1870
    },
    {
      "epoch": 2.785185185185185,
      "grad_norm": 2.028461793078949,
      "learning_rate": 1.554588948473068e-07,
      "loss": 0.0303,
      "step": 1880
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.9681404257688637,
      "learning_rate": 1.3484929416495096e-07,
      "loss": 0.0318,
      "step": 1890
    },
    {
      "epoch": 2.814814814814815,
      "grad_norm": 1.7073934684848604,
      "learning_rate": 1.1568609389853546e-07,
      "loss": 0.0403,
      "step": 1900
    },
    {
      "epoch": 2.8296296296296295,
      "grad_norm": 1.5684215370418764,
      "learning_rate": 9.7974991232489e-08,
      "loss": 0.0284,
      "step": 1910
    },
    {
      "epoch": 2.8444444444444446,
      "grad_norm": 1.742009123872279,
      "learning_rate": 8.172125164527312e-08,
      "loss": 0.0408,
      "step": 1920
    },
    {
      "epoch": 2.859259259259259,
      "grad_norm": 1.7024512023509764,
      "learning_rate": 6.692970734397176e-08,
      "loss": 0.038,
      "step": 1930
    },
    {
      "epoch": 2.8740740740740742,
      "grad_norm": 1.0455017584864013,
      "learning_rate": 5.360475582768088e-08,
      "loss": 0.0362,
      "step": 1940
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 1.3244853973367365,
      "learning_rate": 4.175035858013987e-08,
      "loss": 0.0294,
      "step": 1950
    },
    {
      "epoch": 2.9037037037037035,
      "grad_norm": 1.3623719555386105,
      "learning_rate": 3.13700398919925e-08,
      "loss": 0.0303,
      "step": 1960
    },
    {
      "epoch": 2.9185185185185185,
      "grad_norm": 1.417988437962572,
      "learning_rate": 2.2466885813018925e-08,
      "loss": 0.0284,
      "step": 1970
    },
    {
      "epoch": 2.9333333333333336,
      "grad_norm": 0.8742047293170354,
      "learning_rate": 1.504354323466073e-08,
      "loss": 0.0235,
      "step": 1980
    },
    {
      "epoch": 2.948148148148148,
      "grad_norm": 1.9231464337238604,
      "learning_rate": 9.102219103103161e-09,
      "loss": 0.0383,
      "step": 1990
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 2.1190416620342303,
      "learning_rate": 4.644679763155524e-09,
      "loss": 0.0244,
      "step": 2000
    },
    {
      "epoch": 2.962962962962963,
      "eval_loss": 0.2506597936153412,
      "eval_runtime": 109.9607,
      "eval_samples_per_second": 2.728,
      "eval_steps_per_second": 1.364,
      "step": 2000
    },
    {
      "epoch": 2.977777777777778,
      "grad_norm": 1.0361197122874597,
      "learning_rate": 1.6722504331195822e-09,
      "loss": 0.024,
      "step": 2010
    },
    {
      "epoch": 2.9925925925925925,
      "grad_norm": 1.8532792834690945,
      "learning_rate": 1.8581481080415242e-10,
      "loss": 0.0264,
      "step": 2020
    },
    {
      "epoch": 3.0,
      "step": 2025,
      "total_flos": 15334618890240.0,
      "train_loss": 0.13858298796930432,
      "train_runtime": 10686.6427,
      "train_samples_per_second": 0.758,
      "train_steps_per_second": 0.189
    }
  ],
  "logging_steps": 10,
  "max_steps": 2025,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 15334618890240.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}