{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9999335474581903, "eval_steps": 500, "global_step": 16929, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 7.78125, "learning_rate": 1.5e-05, "loss": 1.4908, "step": 5 }, { "epoch": 0.0, "grad_norm": 6.0625, "learning_rate": 3e-05, "loss": 1.5469, "step": 10 }, { "epoch": 0.0, "grad_norm": 6.625, "learning_rate": 4.5e-05, "loss": 1.4631, "step": 15 }, { "epoch": 0.0, "grad_norm": 5.375, "learning_rate": 6e-05, "loss": 1.4561, "step": 20 }, { "epoch": 0.0, "grad_norm": 6.21875, "learning_rate": 5.99999870552063e-05, "loss": 1.452, "step": 25 }, { "epoch": 0.01, "grad_norm": 5.125, "learning_rate": 5.999994822083636e-05, "loss": 1.4263, "step": 30 }, { "epoch": 0.01, "grad_norm": 4.6875, "learning_rate": 5.9999883496923705e-05, "loss": 1.4167, "step": 35 }, { "epoch": 0.01, "grad_norm": 4.5625, "learning_rate": 5.9999792883524176e-05, "loss": 1.4179, "step": 40 }, { "epoch": 0.01, "grad_norm": 4.375, "learning_rate": 5.999967638071599e-05, "loss": 1.3843, "step": 45 }, { "epoch": 0.01, "grad_norm": 4.84375, "learning_rate": 5.9999533988599675e-05, "loss": 1.4072, "step": 50 }, { "epoch": 0.01, "grad_norm": 4.6875, "learning_rate": 5.999936570729811e-05, "loss": 1.3193, "step": 55 }, { "epoch": 0.01, "grad_norm": 4.75, "learning_rate": 5.999917153695653e-05, "loss": 1.3554, "step": 60 }, { "epoch": 0.01, "grad_norm": 5.15625, "learning_rate": 5.999895147774249e-05, "loss": 1.3453, "step": 65 }, { "epoch": 0.01, "grad_norm": 4.6875, "learning_rate": 5.9998705529845914e-05, "loss": 1.3736, "step": 70 }, { "epoch": 0.01, "grad_norm": 3.953125, "learning_rate": 5.9998433693479035e-05, "loss": 1.3863, "step": 75 }, { "epoch": 0.01, "grad_norm": 4.84375, "learning_rate": 5.9998135968876455e-05, "loss": 1.313, "step": 80 }, { "epoch": 0.02, "grad_norm": 4.15625, "learning_rate": 5.999781235629511e-05, "loss": 1.4022, "step": 85 }, { "epoch": 0.02, "grad_norm": 4.84375, "learning_rate": 5.999746285601425e-05, "loss": 1.4179, "step": 90 }, { "epoch": 0.02, "grad_norm": 4.84375, "learning_rate": 5.9997087468335514e-05, "loss": 1.4068, "step": 95 }, { "epoch": 0.02, "grad_norm": 4.65625, "learning_rate": 5.999668619358285e-05, "loss": 1.4107, "step": 100 }, { "epoch": 0.02, "grad_norm": 4.375, "learning_rate": 5.999625903210254e-05, "loss": 1.355, "step": 105 }, { "epoch": 0.02, "grad_norm": 4.53125, "learning_rate": 5.9995805984263235e-05, "loss": 1.4045, "step": 110 }, { "epoch": 0.02, "grad_norm": 4.09375, "learning_rate": 5.99953270504559e-05, "loss": 1.335, "step": 115 }, { "epoch": 0.02, "grad_norm": 4.125, "learning_rate": 5.9994822231093855e-05, "loss": 1.3568, "step": 120 }, { "epoch": 0.02, "grad_norm": 4.5625, "learning_rate": 5.999429152661274e-05, "loss": 1.3344, "step": 125 }, { "epoch": 0.02, "grad_norm": 4.3125, "learning_rate": 5.9993734937470554e-05, "loss": 1.3172, "step": 130 }, { "epoch": 0.02, "grad_norm": 4.5625, "learning_rate": 5.999315246414764e-05, "loss": 1.3562, "step": 135 }, { "epoch": 0.02, "grad_norm": 4.125, "learning_rate": 5.9992544107146624e-05, "loss": 1.3601, "step": 140 }, { "epoch": 0.03, "grad_norm": 4.4375, "learning_rate": 5.9991909866992556e-05, "loss": 1.3281, "step": 145 }, { "epoch": 0.03, "grad_norm": 5.0625, "learning_rate": 5.999124974423274e-05, "loss": 1.4124, "step": 150 }, { "epoch": 0.03, "grad_norm": 4.0625, "learning_rate": 5.999056373943688e-05, "loss": 1.3428, "step": 155 }, { "epoch": 0.03, "grad_norm": 4.4375, "learning_rate": 5.9989851853196974e-05, "loss": 1.3556, "step": 160 }, { "epoch": 0.03, "grad_norm": 4.75, "learning_rate": 5.9989114086127384e-05, "loss": 1.4002, "step": 165 }, { "epoch": 0.03, "grad_norm": 5.0, "learning_rate": 5.998835043886477e-05, "loss": 1.4018, "step": 170 }, { "epoch": 0.03, "grad_norm": 4.9375, "learning_rate": 5.998756091206816e-05, "loss": 1.3767, "step": 175 }, { "epoch": 0.03, "grad_norm": 5.28125, "learning_rate": 5.998674550641891e-05, "loss": 1.4117, "step": 180 }, { "epoch": 0.03, "grad_norm": 4.4375, "learning_rate": 5.9985904222620705e-05, "loss": 1.4244, "step": 185 }, { "epoch": 0.03, "grad_norm": 4.75, "learning_rate": 5.9985037061399545e-05, "loss": 1.2954, "step": 190 }, { "epoch": 0.03, "grad_norm": 4.875, "learning_rate": 5.9984144023503796e-05, "loss": 1.3381, "step": 195 }, { "epoch": 0.04, "grad_norm": 4.53125, "learning_rate": 5.9983225109704137e-05, "loss": 1.3754, "step": 200 }, { "epoch": 0.04, "grad_norm": 4.65625, "learning_rate": 5.9982280320793565e-05, "loss": 1.3988, "step": 205 }, { "epoch": 0.04, "grad_norm": 4.21875, "learning_rate": 5.998130965758742e-05, "loss": 1.3583, "step": 210 }, { "epoch": 0.04, "grad_norm": 4.71875, "learning_rate": 5.9980313120923394e-05, "loss": 1.3655, "step": 215 }, { "epoch": 0.04, "grad_norm": 4.875, "learning_rate": 5.997929071166146e-05, "loss": 1.3826, "step": 220 }, { "epoch": 0.04, "grad_norm": 4.625, "learning_rate": 5.997824243068395e-05, "loss": 1.365, "step": 225 }, { "epoch": 0.04, "grad_norm": 5.125, "learning_rate": 5.9977168278895516e-05, "loss": 1.3666, "step": 230 }, { "epoch": 0.04, "grad_norm": 4.65625, "learning_rate": 5.9976068257223146e-05, "loss": 1.3921, "step": 235 }, { "epoch": 0.04, "grad_norm": 4.53125, "learning_rate": 5.9974942366616116e-05, "loss": 1.3138, "step": 240 }, { "epoch": 0.04, "grad_norm": 4.03125, "learning_rate": 5.9973790608046095e-05, "loss": 1.3864, "step": 245 }, { "epoch": 0.04, "grad_norm": 4.21875, "learning_rate": 5.9972612982507006e-05, "loss": 1.2755, "step": 250 }, { "epoch": 0.05, "grad_norm": 4.9375, "learning_rate": 5.997140949101513e-05, "loss": 1.3516, "step": 255 }, { "epoch": 0.05, "grad_norm": 4.78125, "learning_rate": 5.997018013460906e-05, "loss": 1.3214, "step": 260 }, { "epoch": 0.05, "grad_norm": 4.75, "learning_rate": 5.996892491434972e-05, "loss": 1.3606, "step": 265 }, { "epoch": 0.05, "grad_norm": 4.34375, "learning_rate": 5.9967643831320346e-05, "loss": 1.357, "step": 270 }, { "epoch": 0.05, "grad_norm": 4.46875, "learning_rate": 5.99663368866265e-05, "loss": 1.4333, "step": 275 }, { "epoch": 0.05, "grad_norm": 4.46875, "learning_rate": 5.996500408139605e-05, "loss": 1.4344, "step": 280 }, { "epoch": 0.05, "grad_norm": 4.8125, "learning_rate": 5.9963645416779187e-05, "loss": 1.3336, "step": 285 }, { "epoch": 0.05, "grad_norm": 5.0625, "learning_rate": 5.9962260893948426e-05, "loss": 1.2713, "step": 290 }, { "epoch": 0.05, "grad_norm": 4.8125, "learning_rate": 5.99608505140986e-05, "loss": 1.407, "step": 295 }, { "epoch": 0.05, "grad_norm": 3.96875, "learning_rate": 5.995941427844681e-05, "loss": 1.3888, "step": 300 }, { "epoch": 0.05, "grad_norm": 5.03125, "learning_rate": 5.9957952188232554e-05, "loss": 1.3372, "step": 305 }, { "epoch": 0.05, "grad_norm": 4.625, "learning_rate": 5.9956464244717565e-05, "loss": 1.3894, "step": 310 }, { "epoch": 0.06, "grad_norm": 4.34375, "learning_rate": 5.995495044918594e-05, "loss": 1.35, "step": 315 }, { "epoch": 0.06, "grad_norm": 4.53125, "learning_rate": 5.995341080294405e-05, "loss": 1.2879, "step": 320 }, { "epoch": 0.06, "grad_norm": 4.53125, "learning_rate": 5.995184530732058e-05, "loss": 1.3465, "step": 325 }, { "epoch": 0.06, "grad_norm": 4.8125, "learning_rate": 5.995025396366655e-05, "loss": 1.3824, "step": 330 }, { "epoch": 0.06, "grad_norm": 5.625, "learning_rate": 5.994863677335526e-05, "loss": 1.3866, "step": 335 }, { "epoch": 0.06, "grad_norm": 4.09375, "learning_rate": 5.994699373778232e-05, "loss": 1.3199, "step": 340 }, { "epoch": 0.06, "grad_norm": 4.59375, "learning_rate": 5.994532485836565e-05, "loss": 1.309, "step": 345 }, { "epoch": 0.06, "grad_norm": 4.5625, "learning_rate": 5.994363013654547e-05, "loss": 1.353, "step": 350 }, { "epoch": 0.06, "grad_norm": 4.65625, "learning_rate": 5.99419095737843e-05, "loss": 1.3219, "step": 355 }, { "epoch": 0.06, "grad_norm": 4.65625, "learning_rate": 5.994016317156696e-05, "loss": 1.3854, "step": 360 }, { "epoch": 0.06, "grad_norm": 5.03125, "learning_rate": 5.993839093140058e-05, "loss": 1.4163, "step": 365 }, { "epoch": 0.07, "grad_norm": 4.4375, "learning_rate": 5.993659285481457e-05, "loss": 1.3182, "step": 370 }, { "epoch": 0.07, "grad_norm": 4.78125, "learning_rate": 5.993476894336065e-05, "loss": 1.3262, "step": 375 }, { "epoch": 0.07, "grad_norm": 5.0, "learning_rate": 5.9932919198612836e-05, "loss": 1.2898, "step": 380 }, { "epoch": 0.07, "grad_norm": 4.4375, "learning_rate": 5.9931043622167415e-05, "loss": 1.2814, "step": 385 }, { "epoch": 0.07, "grad_norm": 5.1875, "learning_rate": 5.9929142215643e-05, "loss": 1.3117, "step": 390 }, { "epoch": 0.07, "grad_norm": 4.375, "learning_rate": 5.9927214980680465e-05, "loss": 1.2959, "step": 395 }, { "epoch": 0.07, "grad_norm": 4.5, "learning_rate": 5.9925261918943e-05, "loss": 1.4071, "step": 400 }, { "epoch": 0.07, "grad_norm": 5.21875, "learning_rate": 5.992328303211607e-05, "loss": 1.3745, "step": 405 }, { "epoch": 0.07, "grad_norm": 4.0625, "learning_rate": 5.9921278321907416e-05, "loss": 1.3082, "step": 410 }, { "epoch": 0.07, "grad_norm": 4.1875, "learning_rate": 5.991924779004709e-05, "loss": 1.3505, "step": 415 }, { "epoch": 0.07, "grad_norm": 4.5, "learning_rate": 5.9917191438287385e-05, "loss": 1.3175, "step": 420 }, { "epoch": 0.08, "grad_norm": 4.375, "learning_rate": 5.9915109268402945e-05, "loss": 1.2956, "step": 425 }, { "epoch": 0.08, "grad_norm": 4.5, "learning_rate": 5.991300128219062e-05, "loss": 1.3669, "step": 430 }, { "epoch": 0.08, "grad_norm": 4.96875, "learning_rate": 5.99108674814696e-05, "loss": 1.3332, "step": 435 }, { "epoch": 0.08, "grad_norm": 4.125, "learning_rate": 5.99087078680813e-05, "loss": 1.3186, "step": 440 }, { "epoch": 0.08, "grad_norm": 4.84375, "learning_rate": 5.990652244388945e-05, "loss": 1.3829, "step": 445 }, { "epoch": 0.08, "grad_norm": 5.96875, "learning_rate": 5.9904311210780055e-05, "loss": 1.3311, "step": 450 }, { "epoch": 0.08, "grad_norm": 4.53125, "learning_rate": 5.990207417066134e-05, "loss": 1.2819, "step": 455 }, { "epoch": 0.08, "grad_norm": 4.4375, "learning_rate": 5.989981132546388e-05, "loss": 1.3235, "step": 460 }, { "epoch": 0.08, "grad_norm": 4.375, "learning_rate": 5.989752267714045e-05, "loss": 1.303, "step": 465 }, { "epoch": 0.08, "grad_norm": 4.75, "learning_rate": 5.989520822766614e-05, "loss": 1.3631, "step": 470 }, { "epoch": 0.08, "grad_norm": 4.09375, "learning_rate": 5.989286797903829e-05, "loss": 1.2994, "step": 475 }, { "epoch": 0.09, "grad_norm": 4.34375, "learning_rate": 5.989050193327649e-05, "loss": 1.3148, "step": 480 }, { "epoch": 0.09, "grad_norm": 4.75, "learning_rate": 5.9888110092422605e-05, "loss": 1.4402, "step": 485 }, { "epoch": 0.09, "grad_norm": 5.3125, "learning_rate": 5.988569245854077e-05, "loss": 1.3807, "step": 490 }, { "epoch": 0.09, "grad_norm": 5.0, "learning_rate": 5.988324903371736e-05, "loss": 1.338, "step": 495 }, { "epoch": 0.09, "grad_norm": 4.40625, "learning_rate": 5.988077982006103e-05, "loss": 1.3091, "step": 500 }, { "epoch": 0.09, "grad_norm": 3.875, "learning_rate": 5.987828481970267e-05, "loss": 1.2977, "step": 505 }, { "epoch": 0.09, "grad_norm": 4.875, "learning_rate": 5.9875764034795424e-05, "loss": 1.3424, "step": 510 }, { "epoch": 0.09, "grad_norm": 5.09375, "learning_rate": 5.9873217467514714e-05, "loss": 1.3303, "step": 515 }, { "epoch": 0.09, "grad_norm": 4.75, "learning_rate": 5.987064512005817e-05, "loss": 1.387, "step": 520 }, { "epoch": 0.09, "grad_norm": 4.5625, "learning_rate": 5.986804699464571e-05, "loss": 1.345, "step": 525 }, { "epoch": 0.09, "grad_norm": 5.1875, "learning_rate": 5.9865423093519465e-05, "loss": 1.3771, "step": 530 }, { "epoch": 0.09, "grad_norm": 4.375, "learning_rate": 5.986277341894384e-05, "loss": 1.3298, "step": 535 }, { "epoch": 0.1, "grad_norm": 4.8125, "learning_rate": 5.986009797320546e-05, "loss": 1.2843, "step": 540 }, { "epoch": 0.1, "grad_norm": 4.375, "learning_rate": 5.985739675861319e-05, "loss": 1.3001, "step": 545 }, { "epoch": 0.1, "grad_norm": 5.1875, "learning_rate": 5.985466977749816e-05, "loss": 1.401, "step": 550 }, { "epoch": 0.1, "grad_norm": 4.5625, "learning_rate": 5.985191703221371e-05, "loss": 1.3484, "step": 555 }, { "epoch": 0.1, "grad_norm": 4.46875, "learning_rate": 5.9849138525135416e-05, "loss": 1.3024, "step": 560 }, { "epoch": 0.1, "grad_norm": 4.25, "learning_rate": 5.984633425866109e-05, "loss": 1.4023, "step": 565 }, { "epoch": 0.1, "grad_norm": 5.40625, "learning_rate": 5.9843504235210785e-05, "loss": 1.342, "step": 570 }, { "epoch": 0.1, "grad_norm": 3.828125, "learning_rate": 5.984064845722676e-05, "loss": 1.3334, "step": 575 }, { "epoch": 0.1, "grad_norm": 4.8125, "learning_rate": 5.9837766927173525e-05, "loss": 1.2903, "step": 580 }, { "epoch": 0.1, "grad_norm": 4.65625, "learning_rate": 5.983485964753779e-05, "loss": 1.3846, "step": 585 }, { "epoch": 0.1, "grad_norm": 5.03125, "learning_rate": 5.9831926620828506e-05, "loss": 1.3149, "step": 590 }, { "epoch": 0.11, "grad_norm": 3.859375, "learning_rate": 5.9828967849576834e-05, "loss": 1.3479, "step": 595 }, { "epoch": 0.11, "grad_norm": 4.5625, "learning_rate": 5.982598333633614e-05, "loss": 1.4663, "step": 600 }, { "epoch": 0.11, "grad_norm": 6.28125, "learning_rate": 5.982297308368204e-05, "loss": 1.4407, "step": 605 }, { "epoch": 0.11, "grad_norm": 4.15625, "learning_rate": 5.9819937094212325e-05, "loss": 1.2985, "step": 610 }, { "epoch": 0.11, "grad_norm": 4.90625, "learning_rate": 5.981687537054701e-05, "loss": 1.3391, "step": 615 }, { "epoch": 0.11, "grad_norm": 4.65625, "learning_rate": 5.981378791532833e-05, "loss": 1.2657, "step": 620 }, { "epoch": 0.11, "grad_norm": 4.6875, "learning_rate": 5.9810674731220716e-05, "loss": 1.3232, "step": 625 }, { "epoch": 0.11, "grad_norm": 4.46875, "learning_rate": 5.980753582091079e-05, "loss": 1.412, "step": 630 }, { "epoch": 0.11, "grad_norm": 5.4375, "learning_rate": 5.980437118710741e-05, "loss": 1.2918, "step": 635 }, { "epoch": 0.11, "grad_norm": 4.875, "learning_rate": 5.980118083254159e-05, "loss": 1.3751, "step": 640 }, { "epoch": 0.11, "grad_norm": 5.5625, "learning_rate": 5.9797964759966574e-05, "loss": 1.3412, "step": 645 }, { "epoch": 0.12, "grad_norm": 4.1875, "learning_rate": 5.9794722972157785e-05, "loss": 1.3715, "step": 650 }, { "epoch": 0.12, "grad_norm": 4.625, "learning_rate": 5.979145547191284e-05, "loss": 1.2992, "step": 655 }, { "epoch": 0.12, "grad_norm": 4.21875, "learning_rate": 5.978816226205155e-05, "loss": 1.33, "step": 660 }, { "epoch": 0.12, "grad_norm": 4.84375, "learning_rate": 5.978484334541591e-05, "loss": 1.3168, "step": 665 }, { "epoch": 0.12, "grad_norm": 5.34375, "learning_rate": 5.97814987248701e-05, "loss": 1.3821, "step": 670 }, { "epoch": 0.12, "grad_norm": 4.875, "learning_rate": 5.977812840330049e-05, "loss": 1.2717, "step": 675 }, { "epoch": 0.12, "grad_norm": 4.84375, "learning_rate": 5.9774732383615594e-05, "loss": 1.4104, "step": 680 }, { "epoch": 0.12, "grad_norm": 4.6875, "learning_rate": 5.977131066874615e-05, "loss": 1.3501, "step": 685 }, { "epoch": 0.12, "grad_norm": 4.25, "learning_rate": 5.9767863261645055e-05, "loss": 1.2627, "step": 690 }, { "epoch": 0.12, "grad_norm": 4.75, "learning_rate": 5.976439016528736e-05, "loss": 1.2337, "step": 695 }, { "epoch": 0.12, "grad_norm": 4.6875, "learning_rate": 5.97608913826703e-05, "loss": 1.4027, "step": 700 }, { "epoch": 0.12, "grad_norm": 5.0, "learning_rate": 5.97573669168133e-05, "loss": 1.3029, "step": 705 }, { "epoch": 0.13, "grad_norm": 4.40625, "learning_rate": 5.9753816770757895e-05, "loss": 1.3413, "step": 710 }, { "epoch": 0.13, "grad_norm": 4.59375, "learning_rate": 5.9750240947567835e-05, "loss": 1.3196, "step": 715 }, { "epoch": 0.13, "grad_norm": 4.78125, "learning_rate": 5.974663945032899e-05, "loss": 1.3558, "step": 720 }, { "epoch": 0.13, "grad_norm": 4.0625, "learning_rate": 5.974301228214941e-05, "loss": 1.3365, "step": 725 }, { "epoch": 0.13, "grad_norm": 4.5625, "learning_rate": 5.9739359446159284e-05, "loss": 1.3516, "step": 730 }, { "epoch": 0.13, "grad_norm": 3.78125, "learning_rate": 5.973568094551097e-05, "loss": 1.3054, "step": 735 }, { "epoch": 0.13, "grad_norm": 4.34375, "learning_rate": 5.9731976783378975e-05, "loss": 1.3778, "step": 740 }, { "epoch": 0.13, "grad_norm": 4.0, "learning_rate": 5.972824696295991e-05, "loss": 1.2534, "step": 745 }, { "epoch": 0.13, "grad_norm": 5.25, "learning_rate": 5.9724491487472566e-05, "loss": 1.3194, "step": 750 }, { "epoch": 0.13, "grad_norm": 3.8125, "learning_rate": 5.972071036015789e-05, "loss": 1.2788, "step": 755 }, { "epoch": 0.13, "grad_norm": 4.09375, "learning_rate": 5.9716903584278915e-05, "loss": 1.298, "step": 760 }, { "epoch": 0.14, "grad_norm": 4.15625, "learning_rate": 5.971307116312085e-05, "loss": 1.4129, "step": 765 }, { "epoch": 0.14, "grad_norm": 4.125, "learning_rate": 5.9709213099991016e-05, "loss": 1.3477, "step": 770 }, { "epoch": 0.14, "grad_norm": 4.34375, "learning_rate": 5.970532939821888e-05, "loss": 1.3616, "step": 775 }, { "epoch": 0.14, "grad_norm": 4.6875, "learning_rate": 5.970142006115601e-05, "loss": 1.3422, "step": 780 }, { "epoch": 0.14, "grad_norm": 4.03125, "learning_rate": 5.969748509217611e-05, "loss": 1.29, "step": 785 }, { "epoch": 0.14, "grad_norm": 4.71875, "learning_rate": 5.969352449467501e-05, "loss": 1.2633, "step": 790 }, { "epoch": 0.14, "grad_norm": 4.875, "learning_rate": 5.9689538272070664e-05, "loss": 1.3293, "step": 795 }, { "epoch": 0.14, "grad_norm": 4.1875, "learning_rate": 5.96855264278031e-05, "loss": 1.3215, "step": 800 }, { "epoch": 0.14, "grad_norm": 4.03125, "learning_rate": 5.9681488965334503e-05, "loss": 1.398, "step": 805 }, { "epoch": 0.14, "grad_norm": 4.4375, "learning_rate": 5.967742588814915e-05, "loss": 1.3202, "step": 810 }, { "epoch": 0.14, "grad_norm": 5.0, "learning_rate": 5.9673337199753406e-05, "loss": 1.4, "step": 815 }, { "epoch": 0.15, "grad_norm": 5.125, "learning_rate": 5.9669222903675756e-05, "loss": 1.337, "step": 820 }, { "epoch": 0.15, "grad_norm": 4.90625, "learning_rate": 5.966508300346678e-05, "loss": 1.3281, "step": 825 }, { "epoch": 0.15, "grad_norm": 5.40625, "learning_rate": 5.966091750269918e-05, "loss": 1.3512, "step": 830 }, { "epoch": 0.15, "grad_norm": 5.46875, "learning_rate": 5.96567264049677e-05, "loss": 1.3155, "step": 835 }, { "epoch": 0.15, "grad_norm": 4.5625, "learning_rate": 5.96525097138892e-05, "loss": 1.4283, "step": 840 }, { "epoch": 0.15, "grad_norm": 4.1875, "learning_rate": 5.964826743310263e-05, "loss": 1.3494, "step": 845 }, { "epoch": 0.15, "grad_norm": 3.9375, "learning_rate": 5.964399956626902e-05, "loss": 1.3876, "step": 850 }, { "epoch": 0.15, "grad_norm": 4.21875, "learning_rate": 5.963970611707149e-05, "loss": 1.3442, "step": 855 }, { "epoch": 0.15, "grad_norm": 5.78125, "learning_rate": 5.963538708921522e-05, "loss": 1.3384, "step": 860 }, { "epoch": 0.15, "grad_norm": 4.09375, "learning_rate": 5.9631042486427474e-05, "loss": 1.3665, "step": 865 }, { "epoch": 0.15, "grad_norm": 4.28125, "learning_rate": 5.962667231245757e-05, "loss": 1.2918, "step": 870 }, { "epoch": 0.16, "grad_norm": 4.78125, "learning_rate": 5.962227657107694e-05, "loss": 1.3723, "step": 875 }, { "epoch": 0.16, "grad_norm": 4.46875, "learning_rate": 5.961785526607901e-05, "loss": 1.3158, "step": 880 }, { "epoch": 0.16, "grad_norm": 4.40625, "learning_rate": 5.9613408401279334e-05, "loss": 1.3147, "step": 885 }, { "epoch": 0.16, "grad_norm": 4.53125, "learning_rate": 5.960893598051549e-05, "loss": 1.3635, "step": 890 }, { "epoch": 0.16, "grad_norm": 4.65625, "learning_rate": 5.9604438007647093e-05, "loss": 1.3384, "step": 895 }, { "epoch": 0.16, "grad_norm": 4.5, "learning_rate": 5.959991448655586e-05, "loss": 1.2457, "step": 900 }, { "epoch": 0.16, "grad_norm": 4.1875, "learning_rate": 5.959536542114551e-05, "loss": 1.3478, "step": 905 }, { "epoch": 0.16, "grad_norm": 4.125, "learning_rate": 5.9590790815341845e-05, "loss": 1.3477, "step": 910 }, { "epoch": 0.16, "grad_norm": 4.53125, "learning_rate": 5.958619067309266e-05, "loss": 1.3543, "step": 915 }, { "epoch": 0.16, "grad_norm": 4.875, "learning_rate": 5.958156499836784e-05, "loss": 1.2512, "step": 920 }, { "epoch": 0.16, "grad_norm": 4.5, "learning_rate": 5.957691379515926e-05, "loss": 1.3305, "step": 925 }, { "epoch": 0.16, "grad_norm": 4.5625, "learning_rate": 5.9572237067480845e-05, "loss": 1.3761, "step": 930 }, { "epoch": 0.17, "grad_norm": 4.25, "learning_rate": 5.9567534819368555e-05, "loss": 1.3712, "step": 935 }, { "epoch": 0.17, "grad_norm": 4.9375, "learning_rate": 5.956280705488036e-05, "loss": 1.2846, "step": 940 }, { "epoch": 0.17, "grad_norm": 5.28125, "learning_rate": 5.955805377809627e-05, "loss": 1.2998, "step": 945 }, { "epoch": 0.17, "grad_norm": 4.09375, "learning_rate": 5.955327499311827e-05, "loss": 1.4294, "step": 950 }, { "epoch": 0.17, "grad_norm": 4.90625, "learning_rate": 5.954847070407041e-05, "loss": 1.3406, "step": 955 }, { "epoch": 0.17, "grad_norm": 5.09375, "learning_rate": 5.954364091509871e-05, "loss": 1.3176, "step": 960 }, { "epoch": 0.17, "grad_norm": 4.3125, "learning_rate": 5.953878563037122e-05, "loss": 1.3496, "step": 965 }, { "epoch": 0.17, "grad_norm": 4.09375, "learning_rate": 5.953390485407799e-05, "loss": 1.4221, "step": 970 }, { "epoch": 0.17, "grad_norm": 5.09375, "learning_rate": 5.952899859043105e-05, "loss": 1.2472, "step": 975 }, { "epoch": 0.17, "grad_norm": 4.28125, "learning_rate": 5.952406684366444e-05, "loss": 1.2978, "step": 980 }, { "epoch": 0.17, "grad_norm": 4.0625, "learning_rate": 5.9519109618034204e-05, "loss": 1.3261, "step": 985 }, { "epoch": 0.18, "grad_norm": 4.78125, "learning_rate": 5.9514126917818336e-05, "loss": 1.3967, "step": 990 }, { "epoch": 0.18, "grad_norm": 4.46875, "learning_rate": 5.950911874731685e-05, "loss": 1.3451, "step": 995 }, { "epoch": 0.18, "grad_norm": 4.40625, "learning_rate": 5.9504085110851734e-05, "loss": 1.3612, "step": 1000 }, { "epoch": 0.18, "grad_norm": 4.8125, "learning_rate": 5.949902601276695e-05, "loss": 1.294, "step": 1005 }, { "epoch": 0.18, "grad_norm": 4.53125, "learning_rate": 5.94939414574284e-05, "loss": 1.339, "step": 1010 }, { "epoch": 0.18, "grad_norm": 4.71875, "learning_rate": 5.948883144922402e-05, "loss": 1.2529, "step": 1015 }, { "epoch": 0.18, "grad_norm": 4.59375, "learning_rate": 5.948369599256366e-05, "loss": 1.2141, "step": 1020 }, { "epoch": 0.18, "grad_norm": 4.9375, "learning_rate": 5.947853509187916e-05, "loss": 1.3487, "step": 1025 }, { "epoch": 0.18, "grad_norm": 4.34375, "learning_rate": 5.9473348751624284e-05, "loss": 1.3342, "step": 1030 }, { "epoch": 0.18, "grad_norm": 4.9375, "learning_rate": 5.94681369762748e-05, "loss": 1.3191, "step": 1035 }, { "epoch": 0.18, "grad_norm": 4.34375, "learning_rate": 5.9462899770328374e-05, "loss": 1.3673, "step": 1040 }, { "epoch": 0.19, "grad_norm": 4.4375, "learning_rate": 5.9457637138304664e-05, "loss": 1.3066, "step": 1045 }, { "epoch": 0.19, "grad_norm": 4.90625, "learning_rate": 5.945234908474523e-05, "loss": 1.3902, "step": 1050 }, { "epoch": 0.19, "grad_norm": 4.84375, "learning_rate": 5.94470356142136e-05, "loss": 1.3194, "step": 1055 }, { "epoch": 0.19, "grad_norm": 4.96875, "learning_rate": 5.944169673129522e-05, "loss": 1.3439, "step": 1060 }, { "epoch": 0.19, "grad_norm": 4.125, "learning_rate": 5.943633244059749e-05, "loss": 1.366, "step": 1065 }, { "epoch": 0.19, "grad_norm": 4.625, "learning_rate": 5.9430942746749696e-05, "loss": 1.3423, "step": 1070 }, { "epoch": 0.19, "grad_norm": 4.65625, "learning_rate": 5.942552765440308e-05, "loss": 1.3562, "step": 1075 }, { "epoch": 0.19, "grad_norm": 4.40625, "learning_rate": 5.9420087168230794e-05, "loss": 1.3145, "step": 1080 }, { "epoch": 0.19, "grad_norm": 4.90625, "learning_rate": 5.94146212929279e-05, "loss": 1.3223, "step": 1085 }, { "epoch": 0.19, "grad_norm": 4.65625, "learning_rate": 5.940913003321138e-05, "loss": 1.3113, "step": 1090 }, { "epoch": 0.19, "grad_norm": 4.0625, "learning_rate": 5.9403613393820105e-05, "loss": 1.2579, "step": 1095 }, { "epoch": 0.19, "grad_norm": 4.5, "learning_rate": 5.939807137951486e-05, "loss": 1.4243, "step": 1100 }, { "epoch": 0.2, "grad_norm": 4.46875, "learning_rate": 5.939250399507833e-05, "loss": 1.3892, "step": 1105 }, { "epoch": 0.2, "grad_norm": 4.25, "learning_rate": 5.938691124531509e-05, "loss": 1.3237, "step": 1110 }, { "epoch": 0.2, "grad_norm": 4.78125, "learning_rate": 5.938129313505162e-05, "loss": 1.3506, "step": 1115 }, { "epoch": 0.2, "grad_norm": 4.25, "learning_rate": 5.9375649669136244e-05, "loss": 1.2949, "step": 1120 }, { "epoch": 0.2, "grad_norm": 4.59375, "learning_rate": 5.9369980852439214e-05, "loss": 1.2863, "step": 1125 }, { "epoch": 0.2, "grad_norm": 4.71875, "learning_rate": 5.936428668985265e-05, "loss": 1.3694, "step": 1130 }, { "epoch": 0.2, "grad_norm": 4.8125, "learning_rate": 5.935856718629051e-05, "loss": 1.3392, "step": 1135 }, { "epoch": 0.2, "grad_norm": 5.15625, "learning_rate": 5.935282234668867e-05, "loss": 1.3389, "step": 1140 }, { "epoch": 0.2, "grad_norm": 4.90625, "learning_rate": 5.9347052176004836e-05, "loss": 1.3579, "step": 1145 }, { "epoch": 0.2, "grad_norm": 4.125, "learning_rate": 5.934125667921859e-05, "loss": 1.3148, "step": 1150 }, { "epoch": 0.2, "grad_norm": 4.21875, "learning_rate": 5.9335435861331355e-05, "loss": 1.3288, "step": 1155 }, { "epoch": 0.21, "grad_norm": 5.4375, "learning_rate": 5.9329589727366435e-05, "loss": 1.3332, "step": 1160 }, { "epoch": 0.21, "grad_norm": 5.8125, "learning_rate": 5.932371828236895e-05, "loss": 1.3346, "step": 1165 }, { "epoch": 0.21, "grad_norm": 4.9375, "learning_rate": 5.931782153140588e-05, "loss": 1.3085, "step": 1170 }, { "epoch": 0.21, "grad_norm": 4.65625, "learning_rate": 5.931189947956605e-05, "loss": 1.2766, "step": 1175 }, { "epoch": 0.21, "grad_norm": 4.03125, "learning_rate": 5.9305952131960096e-05, "loss": 1.3103, "step": 1180 }, { "epoch": 0.21, "grad_norm": 4.21875, "learning_rate": 5.929997949372049e-05, "loss": 1.2864, "step": 1185 }, { "epoch": 0.21, "grad_norm": 4.8125, "learning_rate": 5.929398157000156e-05, "loss": 1.3362, "step": 1190 }, { "epoch": 0.21, "grad_norm": 4.75, "learning_rate": 5.9287958365979407e-05, "loss": 1.2611, "step": 1195 }, { "epoch": 0.21, "grad_norm": 4.40625, "learning_rate": 5.9281909886852004e-05, "loss": 1.244, "step": 1200 }, { "epoch": 0.21, "grad_norm": 4.25, "learning_rate": 5.927583613783907e-05, "loss": 1.3666, "step": 1205 }, { "epoch": 0.21, "grad_norm": 5.09375, "learning_rate": 5.926973712418219e-05, "loss": 1.306, "step": 1210 }, { "epoch": 0.22, "grad_norm": 4.875, "learning_rate": 5.926361285114473e-05, "loss": 1.4385, "step": 1215 }, { "epoch": 0.22, "grad_norm": 4.53125, "learning_rate": 5.925746332401183e-05, "loss": 1.3202, "step": 1220 }, { "epoch": 0.22, "grad_norm": 4.4375, "learning_rate": 5.9251288548090484e-05, "loss": 1.3279, "step": 1225 }, { "epoch": 0.22, "grad_norm": 4.65625, "learning_rate": 5.92450885287094e-05, "loss": 1.2451, "step": 1230 }, { "epoch": 0.22, "grad_norm": 4.1875, "learning_rate": 5.923886327121914e-05, "loss": 1.2847, "step": 1235 }, { "epoch": 0.22, "grad_norm": 4.65625, "learning_rate": 5.9232612780991996e-05, "loss": 1.3613, "step": 1240 }, { "epoch": 0.22, "grad_norm": 5.0625, "learning_rate": 5.9226337063422067e-05, "loss": 1.4253, "step": 1245 }, { "epoch": 0.22, "grad_norm": 4.59375, "learning_rate": 5.922003612392521e-05, "loss": 1.3019, "step": 1250 }, { "epoch": 0.22, "grad_norm": 4.5, "learning_rate": 5.9213709967939036e-05, "loss": 1.3341, "step": 1255 }, { "epoch": 0.22, "grad_norm": 4.65625, "learning_rate": 5.920735860092294e-05, "loss": 1.3104, "step": 1260 }, { "epoch": 0.22, "grad_norm": 4.65625, "learning_rate": 5.920098202835807e-05, "loss": 1.3679, "step": 1265 }, { "epoch": 0.23, "grad_norm": 5.375, "learning_rate": 5.919458025574732e-05, "loss": 1.251, "step": 1270 }, { "epoch": 0.23, "grad_norm": 4.65625, "learning_rate": 5.9188153288615304e-05, "loss": 1.3897, "step": 1275 }, { "epoch": 0.23, "grad_norm": 4.03125, "learning_rate": 5.918170113250845e-05, "loss": 1.3073, "step": 1280 }, { "epoch": 0.23, "grad_norm": 4.78125, "learning_rate": 5.9175223792994846e-05, "loss": 1.3152, "step": 1285 }, { "epoch": 0.23, "grad_norm": 4.5625, "learning_rate": 5.916872127566437e-05, "loss": 1.3615, "step": 1290 }, { "epoch": 0.23, "grad_norm": 5.8125, "learning_rate": 5.916219358612859e-05, "loss": 1.3319, "step": 1295 }, { "epoch": 0.23, "grad_norm": 4.8125, "learning_rate": 5.9155640730020824e-05, "loss": 1.2954, "step": 1300 }, { "epoch": 0.23, "grad_norm": 4.0, "learning_rate": 5.9149062712996084e-05, "loss": 1.3416, "step": 1305 }, { "epoch": 0.23, "grad_norm": 4.5, "learning_rate": 5.914245954073111e-05, "loss": 1.3452, "step": 1310 }, { "epoch": 0.23, "grad_norm": 4.0625, "learning_rate": 5.9135831218924354e-05, "loss": 1.3642, "step": 1315 }, { "epoch": 0.23, "grad_norm": 4.84375, "learning_rate": 5.912917775329597e-05, "loss": 1.2947, "step": 1320 }, { "epoch": 0.23, "grad_norm": 4.65625, "learning_rate": 5.91224991495878e-05, "loss": 1.4229, "step": 1325 }, { "epoch": 0.24, "grad_norm": 4.71875, "learning_rate": 5.9115795413563396e-05, "loss": 1.3313, "step": 1330 }, { "epoch": 0.24, "grad_norm": 4.78125, "learning_rate": 5.910906655100797e-05, "loss": 1.3698, "step": 1335 }, { "epoch": 0.24, "grad_norm": 4.8125, "learning_rate": 5.9102312567728466e-05, "loss": 1.3391, "step": 1340 }, { "epoch": 0.24, "grad_norm": 5.28125, "learning_rate": 5.9095533469553453e-05, "loss": 1.3375, "step": 1345 }, { "epoch": 0.24, "grad_norm": 4.875, "learning_rate": 5.908872926233322e-05, "loss": 1.3414, "step": 1350 }, { "epoch": 0.24, "grad_norm": 4.1875, "learning_rate": 5.908189995193969e-05, "loss": 1.389, "step": 1355 }, { "epoch": 0.24, "grad_norm": 4.21875, "learning_rate": 5.907504554426647e-05, "loss": 1.3755, "step": 1360 }, { "epoch": 0.24, "grad_norm": 3.8125, "learning_rate": 5.906816604522882e-05, "loss": 1.3034, "step": 1365 }, { "epoch": 0.24, "grad_norm": 4.0, "learning_rate": 5.906126146076365e-05, "loss": 1.2712, "step": 1370 }, { "epoch": 0.24, "grad_norm": 4.0625, "learning_rate": 5.905433179682952e-05, "loss": 1.3502, "step": 1375 }, { "epoch": 0.24, "grad_norm": 5.125, "learning_rate": 5.9047377059406646e-05, "loss": 1.3117, "step": 1380 }, { "epoch": 0.25, "grad_norm": 5.40625, "learning_rate": 5.904039725449686e-05, "loss": 1.363, "step": 1385 }, { "epoch": 0.25, "grad_norm": 5.125, "learning_rate": 5.903339238812364e-05, "loss": 1.3283, "step": 1390 }, { "epoch": 0.25, "grad_norm": 4.5, "learning_rate": 5.902636246633208e-05, "loss": 1.38, "step": 1395 }, { "epoch": 0.25, "grad_norm": 4.34375, "learning_rate": 5.9019307495188924e-05, "loss": 1.2899, "step": 1400 }, { "epoch": 0.25, "grad_norm": 6.25, "learning_rate": 5.9012227480782505e-05, "loss": 1.3645, "step": 1405 }, { "epoch": 0.25, "grad_norm": 4.21875, "learning_rate": 5.900512242922278e-05, "loss": 1.3104, "step": 1410 }, { "epoch": 0.25, "grad_norm": 4.5625, "learning_rate": 5.899799234664131e-05, "loss": 1.3189, "step": 1415 }, { "epoch": 0.25, "grad_norm": 3.78125, "learning_rate": 5.899083723919126e-05, "loss": 1.2704, "step": 1420 }, { "epoch": 0.25, "grad_norm": 4.4375, "learning_rate": 5.898365711304738e-05, "loss": 1.3263, "step": 1425 }, { "epoch": 0.25, "grad_norm": 4.28125, "learning_rate": 5.8976451974406034e-05, "loss": 1.3194, "step": 1430 }, { "epoch": 0.25, "grad_norm": 5.4375, "learning_rate": 5.896922182948516e-05, "loss": 1.3556, "step": 1435 }, { "epoch": 0.26, "grad_norm": 4.4375, "learning_rate": 5.8961966684524244e-05, "loss": 1.2764, "step": 1440 }, { "epoch": 0.26, "grad_norm": 4.625, "learning_rate": 5.8954686545784414e-05, "loss": 1.3496, "step": 1445 }, { "epoch": 0.26, "grad_norm": 4.625, "learning_rate": 5.894738141954831e-05, "loss": 1.3491, "step": 1450 }, { "epoch": 0.26, "grad_norm": 4.09375, "learning_rate": 5.894005131212015e-05, "loss": 1.3982, "step": 1455 }, { "epoch": 0.26, "grad_norm": 5.21875, "learning_rate": 5.8932696229825736e-05, "loss": 1.3221, "step": 1460 }, { "epoch": 0.26, "grad_norm": 3.84375, "learning_rate": 5.8925316179012384e-05, "loss": 1.3247, "step": 1465 }, { "epoch": 0.26, "grad_norm": 4.375, "learning_rate": 5.8917911166048995e-05, "loss": 1.2892, "step": 1470 }, { "epoch": 0.26, "grad_norm": 4.15625, "learning_rate": 5.891048119732597e-05, "loss": 1.3018, "step": 1475 }, { "epoch": 0.26, "grad_norm": 5.0, "learning_rate": 5.890302627925528e-05, "loss": 1.4015, "step": 1480 }, { "epoch": 0.26, "grad_norm": 4.125, "learning_rate": 5.889554641827043e-05, "loss": 1.3275, "step": 1485 }, { "epoch": 0.26, "grad_norm": 4.6875, "learning_rate": 5.888804162082642e-05, "loss": 1.3106, "step": 1490 }, { "epoch": 0.26, "grad_norm": 4.0625, "learning_rate": 5.888051189339978e-05, "loss": 1.2854, "step": 1495 }, { "epoch": 0.27, "grad_norm": 4.71875, "learning_rate": 5.8872957242488585e-05, "loss": 1.2805, "step": 1500 }, { "epoch": 0.27, "grad_norm": 4.96875, "learning_rate": 5.886537767461238e-05, "loss": 1.3378, "step": 1505 }, { "epoch": 0.27, "grad_norm": 5.09375, "learning_rate": 5.885777319631224e-05, "loss": 1.3369, "step": 1510 }, { "epoch": 0.27, "grad_norm": 4.71875, "learning_rate": 5.885014381415071e-05, "loss": 1.3096, "step": 1515 }, { "epoch": 0.27, "grad_norm": 4.625, "learning_rate": 5.8842489534711856e-05, "loss": 1.3195, "step": 1520 }, { "epoch": 0.27, "grad_norm": 5.03125, "learning_rate": 5.88348103646012e-05, "loss": 1.2828, "step": 1525 }, { "epoch": 0.27, "grad_norm": 4.46875, "learning_rate": 5.882710631044577e-05, "loss": 1.3882, "step": 1530 }, { "epoch": 0.27, "grad_norm": 4.875, "learning_rate": 5.881937737889406e-05, "loss": 1.3717, "step": 1535 }, { "epoch": 0.27, "grad_norm": 3.9375, "learning_rate": 5.8811623576616026e-05, "loss": 1.2351, "step": 1540 }, { "epoch": 0.27, "grad_norm": 4.71875, "learning_rate": 5.8803844910303105e-05, "loss": 1.2742, "step": 1545 }, { "epoch": 0.27, "grad_norm": 4.28125, "learning_rate": 5.879604138666816e-05, "loss": 1.304, "step": 1550 }, { "epoch": 0.28, "grad_norm": 5.03125, "learning_rate": 5.878821301244554e-05, "loss": 1.3395, "step": 1555 }, { "epoch": 0.28, "grad_norm": 4.28125, "learning_rate": 5.878035979439101e-05, "loss": 1.3145, "step": 1560 }, { "epoch": 0.28, "grad_norm": 3.8125, "learning_rate": 5.87724817392818e-05, "loss": 1.3266, "step": 1565 }, { "epoch": 0.28, "grad_norm": 6.375, "learning_rate": 5.876457885391656e-05, "loss": 1.3551, "step": 1570 }, { "epoch": 0.28, "grad_norm": 4.96875, "learning_rate": 5.8756651145115384e-05, "loss": 1.2578, "step": 1575 }, { "epoch": 0.28, "grad_norm": 4.9375, "learning_rate": 5.874869861971976e-05, "loss": 1.4112, "step": 1580 }, { "epoch": 0.28, "grad_norm": 4.59375, "learning_rate": 5.874072128459261e-05, "loss": 1.2546, "step": 1585 }, { "epoch": 0.28, "grad_norm": 5.25, "learning_rate": 5.8732719146618265e-05, "loss": 1.2999, "step": 1590 }, { "epoch": 0.28, "grad_norm": 4.8125, "learning_rate": 5.872469221270246e-05, "loss": 1.3047, "step": 1595 }, { "epoch": 0.28, "grad_norm": 4.21875, "learning_rate": 5.8716640489772335e-05, "loss": 1.3674, "step": 1600 }, { "epoch": 0.28, "grad_norm": 4.5, "learning_rate": 5.870856398477641e-05, "loss": 1.2913, "step": 1605 }, { "epoch": 0.29, "grad_norm": 4.4375, "learning_rate": 5.870046270468459e-05, "loss": 1.3916, "step": 1610 }, { "epoch": 0.29, "grad_norm": 5.0625, "learning_rate": 5.8692336656488196e-05, "loss": 1.3267, "step": 1615 }, { "epoch": 0.29, "grad_norm": 4.53125, "learning_rate": 5.868418584719986e-05, "loss": 1.3203, "step": 1620 }, { "epoch": 0.29, "grad_norm": 4.40625, "learning_rate": 5.867601028385365e-05, "loss": 1.4038, "step": 1625 }, { "epoch": 0.29, "grad_norm": 6.0625, "learning_rate": 5.866780997350494e-05, "loss": 1.3253, "step": 1630 }, { "epoch": 0.29, "grad_norm": 5.21875, "learning_rate": 5.86595849232305e-05, "loss": 1.3407, "step": 1635 }, { "epoch": 0.29, "grad_norm": 4.90625, "learning_rate": 5.865133514012843e-05, "loss": 1.2964, "step": 1640 }, { "epoch": 0.29, "grad_norm": 4.25, "learning_rate": 5.864306063131818e-05, "loss": 1.3601, "step": 1645 }, { "epoch": 0.29, "grad_norm": 4.125, "learning_rate": 5.863476140394053e-05, "loss": 1.3301, "step": 1650 }, { "epoch": 0.29, "grad_norm": 4.9375, "learning_rate": 5.862643746515761e-05, "loss": 1.3416, "step": 1655 }, { "epoch": 0.29, "grad_norm": 4.09375, "learning_rate": 5.8618088822152866e-05, "loss": 1.3301, "step": 1660 }, { "epoch": 0.3, "grad_norm": 5.40625, "learning_rate": 5.8609715482131055e-05, "loss": 1.3167, "step": 1665 }, { "epoch": 0.3, "grad_norm": 4.15625, "learning_rate": 5.860131745231826e-05, "loss": 1.3531, "step": 1670 }, { "epoch": 0.3, "grad_norm": 4.65625, "learning_rate": 5.859289473996186e-05, "loss": 1.3402, "step": 1675 }, { "epoch": 0.3, "grad_norm": 4.75, "learning_rate": 5.8584447352330533e-05, "loss": 1.2596, "step": 1680 }, { "epoch": 0.3, "grad_norm": 4.1875, "learning_rate": 5.8575975296714274e-05, "loss": 1.3238, "step": 1685 }, { "epoch": 0.3, "grad_norm": 4.625, "learning_rate": 5.856747858042435e-05, "loss": 1.3372, "step": 1690 }, { "epoch": 0.3, "grad_norm": 4.125, "learning_rate": 5.85589572107933e-05, "loss": 1.3193, "step": 1695 }, { "epoch": 0.3, "grad_norm": 4.875, "learning_rate": 5.855041119517494e-05, "loss": 1.284, "step": 1700 }, { "epoch": 0.3, "grad_norm": 5.5, "learning_rate": 5.854184054094439e-05, "loss": 1.2943, "step": 1705 }, { "epoch": 0.3, "grad_norm": 4.6875, "learning_rate": 5.853324525549799e-05, "loss": 1.2727, "step": 1710 }, { "epoch": 0.3, "grad_norm": 3.6875, "learning_rate": 5.852462534625336e-05, "loss": 1.3013, "step": 1715 }, { "epoch": 0.3, "grad_norm": 4.5625, "learning_rate": 5.8515980820649354e-05, "loss": 1.2731, "step": 1720 }, { "epoch": 0.31, "grad_norm": 5.5625, "learning_rate": 5.850731168614608e-05, "loss": 1.3445, "step": 1725 }, { "epoch": 0.31, "grad_norm": 4.1875, "learning_rate": 5.849861795022489e-05, "loss": 1.2977, "step": 1730 }, { "epoch": 0.31, "grad_norm": 4.4375, "learning_rate": 5.848989962038836e-05, "loss": 1.3489, "step": 1735 }, { "epoch": 0.31, "grad_norm": 4.03125, "learning_rate": 5.848115670416028e-05, "loss": 1.3006, "step": 1740 }, { "epoch": 0.31, "grad_norm": 4.625, "learning_rate": 5.847238920908566e-05, "loss": 1.357, "step": 1745 }, { "epoch": 0.31, "grad_norm": 5.25, "learning_rate": 5.846359714273075e-05, "loss": 1.37, "step": 1750 }, { "epoch": 0.31, "grad_norm": 4.5, "learning_rate": 5.845478051268296e-05, "loss": 1.315, "step": 1755 }, { "epoch": 0.31, "grad_norm": 4.15625, "learning_rate": 5.844593932655094e-05, "loss": 1.3086, "step": 1760 }, { "epoch": 0.31, "grad_norm": 4.65625, "learning_rate": 5.84370735919645e-05, "loss": 1.2737, "step": 1765 }, { "epoch": 0.31, "grad_norm": 4.40625, "learning_rate": 5.8428183316574645e-05, "loss": 1.2527, "step": 1770 }, { "epoch": 0.31, "grad_norm": 4.8125, "learning_rate": 5.841926850805357e-05, "loss": 1.3043, "step": 1775 }, { "epoch": 0.32, "grad_norm": 4.5625, "learning_rate": 5.8410329174094624e-05, "loss": 1.3782, "step": 1780 }, { "epoch": 0.32, "grad_norm": 3.984375, "learning_rate": 5.8401365322412335e-05, "loss": 1.253, "step": 1785 }, { "epoch": 0.32, "grad_norm": 5.03125, "learning_rate": 5.839237696074237e-05, "loss": 1.3019, "step": 1790 }, { "epoch": 0.32, "grad_norm": 4.53125, "learning_rate": 5.8383364096841584e-05, "loss": 1.349, "step": 1795 }, { "epoch": 0.32, "grad_norm": 4.78125, "learning_rate": 5.837432673848794e-05, "loss": 1.3437, "step": 1800 }, { "epoch": 0.32, "grad_norm": 4.5, "learning_rate": 5.836526489348056e-05, "loss": 1.2328, "step": 1805 }, { "epoch": 0.32, "grad_norm": 4.375, "learning_rate": 5.835617856963968e-05, "loss": 1.2784, "step": 1810 }, { "epoch": 0.32, "grad_norm": 4.71875, "learning_rate": 5.834706777480669e-05, "loss": 1.3552, "step": 1815 }, { "epoch": 0.32, "grad_norm": 4.03125, "learning_rate": 5.8337932516844065e-05, "loss": 1.4127, "step": 1820 }, { "epoch": 0.32, "grad_norm": 4.46875, "learning_rate": 5.832877280363542e-05, "loss": 1.311, "step": 1825 }, { "epoch": 0.32, "grad_norm": 4.8125, "learning_rate": 5.831958864308545e-05, "loss": 1.3119, "step": 1830 }, { "epoch": 0.33, "grad_norm": 5.21875, "learning_rate": 5.831038004311996e-05, "loss": 1.3521, "step": 1835 }, { "epoch": 0.33, "grad_norm": 4.65625, "learning_rate": 5.830114701168586e-05, "loss": 1.3077, "step": 1840 }, { "epoch": 0.33, "grad_norm": 4.4375, "learning_rate": 5.829188955675111e-05, "loss": 1.3005, "step": 1845 }, { "epoch": 0.33, "grad_norm": 4.8125, "learning_rate": 5.828260768630478e-05, "loss": 1.3011, "step": 1850 }, { "epoch": 0.33, "grad_norm": 4.46875, "learning_rate": 5.827330140835699e-05, "loss": 1.3626, "step": 1855 }, { "epoch": 0.33, "grad_norm": 5.15625, "learning_rate": 5.826397073093893e-05, "loss": 1.339, "step": 1860 }, { "epoch": 0.33, "grad_norm": 4.71875, "learning_rate": 5.825461566210285e-05, "loss": 1.2936, "step": 1865 }, { "epoch": 0.33, "grad_norm": 5.1875, "learning_rate": 5.824523620992205e-05, "loss": 1.3297, "step": 1870 }, { "epoch": 0.33, "grad_norm": 4.84375, "learning_rate": 5.823583238249084e-05, "loss": 1.3285, "step": 1875 }, { "epoch": 0.33, "grad_norm": 4.34375, "learning_rate": 5.8226404187924633e-05, "loss": 1.2949, "step": 1880 }, { "epoch": 0.33, "grad_norm": 4.1875, "learning_rate": 5.821695163435982e-05, "loss": 1.4273, "step": 1885 }, { "epoch": 0.33, "grad_norm": 5.15625, "learning_rate": 5.820747472995381e-05, "loss": 1.2731, "step": 1890 }, { "epoch": 0.34, "grad_norm": 5.0625, "learning_rate": 5.819797348288504e-05, "loss": 1.3678, "step": 1895 }, { "epoch": 0.34, "grad_norm": 4.34375, "learning_rate": 5.8188447901352976e-05, "loss": 1.3279, "step": 1900 }, { "epoch": 0.34, "grad_norm": 4.15625, "learning_rate": 5.817889799357805e-05, "loss": 1.2751, "step": 1905 }, { "epoch": 0.34, "grad_norm": 4.65625, "learning_rate": 5.81693237678017e-05, "loss": 1.2946, "step": 1910 }, { "epoch": 0.34, "grad_norm": 4.5, "learning_rate": 5.815972523228636e-05, "loss": 1.3346, "step": 1915 }, { "epoch": 0.34, "grad_norm": 4.15625, "learning_rate": 5.815010239531543e-05, "loss": 1.3255, "step": 1920 }, { "epoch": 0.34, "grad_norm": 4.40625, "learning_rate": 5.8140455265193284e-05, "loss": 1.3371, "step": 1925 }, { "epoch": 0.34, "grad_norm": 5.625, "learning_rate": 5.8130783850245265e-05, "loss": 1.2516, "step": 1930 }, { "epoch": 0.34, "grad_norm": 4.125, "learning_rate": 5.812108815881766e-05, "loss": 1.3151, "step": 1935 }, { "epoch": 0.34, "grad_norm": 4.6875, "learning_rate": 5.8111368199277735e-05, "loss": 1.3182, "step": 1940 }, { "epoch": 0.34, "grad_norm": 5.5625, "learning_rate": 5.810162398001367e-05, "loss": 1.2896, "step": 1945 }, { "epoch": 0.35, "grad_norm": 5.15625, "learning_rate": 5.809185550943461e-05, "loss": 1.3718, "step": 1950 }, { "epoch": 0.35, "grad_norm": 5.1875, "learning_rate": 5.808206279597058e-05, "loss": 1.3822, "step": 1955 }, { "epoch": 0.35, "grad_norm": 4.15625, "learning_rate": 5.8072245848072595e-05, "loss": 1.2806, "step": 1960 }, { "epoch": 0.35, "grad_norm": 4.78125, "learning_rate": 5.80624046742125e-05, "loss": 1.2678, "step": 1965 }, { "epoch": 0.35, "grad_norm": 5.375, "learning_rate": 5.805253928288313e-05, "loss": 1.3332, "step": 1970 }, { "epoch": 0.35, "grad_norm": 5.53125, "learning_rate": 5.804264968259817e-05, "loss": 1.3929, "step": 1975 }, { "epoch": 0.35, "grad_norm": 4.90625, "learning_rate": 5.803273588189221e-05, "loss": 1.346, "step": 1980 }, { "epoch": 0.35, "grad_norm": 4.90625, "learning_rate": 5.8022797889320716e-05, "loss": 1.3509, "step": 1985 }, { "epoch": 0.35, "grad_norm": 4.4375, "learning_rate": 5.801283571346004e-05, "loss": 1.3379, "step": 1990 }, { "epoch": 0.35, "grad_norm": 4.34375, "learning_rate": 5.8002849362907425e-05, "loss": 1.3061, "step": 1995 }, { "epoch": 0.35, "grad_norm": 4.8125, "learning_rate": 5.7992838846280926e-05, "loss": 1.2657, "step": 2000 }, { "epoch": 0.36, "grad_norm": 4.5, "learning_rate": 5.798280417221949e-05, "loss": 1.3385, "step": 2005 }, { "epoch": 0.36, "grad_norm": 3.9375, "learning_rate": 5.797274534938291e-05, "loss": 1.307, "step": 2010 }, { "epoch": 0.36, "grad_norm": 4.53125, "learning_rate": 5.7962662386451805e-05, "loss": 1.3082, "step": 2015 }, { "epoch": 0.36, "grad_norm": 4.1875, "learning_rate": 5.795255529212763e-05, "loss": 1.3263, "step": 2020 }, { "epoch": 0.36, "grad_norm": 5.5, "learning_rate": 5.794242407513268e-05, "loss": 1.3369, "step": 2025 }, { "epoch": 0.36, "grad_norm": 4.75, "learning_rate": 5.7932268744210046e-05, "loss": 1.3142, "step": 2030 }, { "epoch": 0.36, "grad_norm": 4.46875, "learning_rate": 5.7922089308123646e-05, "loss": 1.2688, "step": 2035 }, { "epoch": 0.36, "grad_norm": 4.25, "learning_rate": 5.7911885775658186e-05, "loss": 1.3064, "step": 2040 }, { "epoch": 0.36, "grad_norm": 4.84375, "learning_rate": 5.790165815561918e-05, "loss": 1.2826, "step": 2045 }, { "epoch": 0.36, "grad_norm": 4.96875, "learning_rate": 5.7891406456832925e-05, "loss": 1.2759, "step": 2050 }, { "epoch": 0.36, "grad_norm": 4.90625, "learning_rate": 5.788113068814648e-05, "loss": 1.318, "step": 2055 }, { "epoch": 0.37, "grad_norm": 5.46875, "learning_rate": 5.787083085842772e-05, "loss": 1.3142, "step": 2060 }, { "epoch": 0.37, "grad_norm": 4.5, "learning_rate": 5.786050697656523e-05, "loss": 1.2876, "step": 2065 }, { "epoch": 0.37, "grad_norm": 4.28125, "learning_rate": 5.78501590514684e-05, "loss": 1.3256, "step": 2070 }, { "epoch": 0.37, "grad_norm": 3.703125, "learning_rate": 5.783978709206732e-05, "loss": 1.3258, "step": 2075 }, { "epoch": 0.37, "grad_norm": 4.65625, "learning_rate": 5.782939110731287e-05, "loss": 1.2908, "step": 2080 }, { "epoch": 0.37, "grad_norm": 4.28125, "learning_rate": 5.7818971106176634e-05, "loss": 1.2957, "step": 2085 }, { "epoch": 0.37, "grad_norm": 4.5, "learning_rate": 5.780852709765094e-05, "loss": 1.355, "step": 2090 }, { "epoch": 0.37, "grad_norm": 4.0, "learning_rate": 5.7798059090748805e-05, "loss": 1.2604, "step": 2095 }, { "epoch": 0.37, "grad_norm": 4.09375, "learning_rate": 5.778756709450399e-05, "loss": 1.2796, "step": 2100 }, { "epoch": 0.37, "grad_norm": 4.5625, "learning_rate": 5.7777051117970946e-05, "loss": 1.3614, "step": 2105 }, { "epoch": 0.37, "grad_norm": 4.3125, "learning_rate": 5.77665111702248e-05, "loss": 1.3198, "step": 2110 }, { "epoch": 0.37, "grad_norm": 4.09375, "learning_rate": 5.775594726036139e-05, "loss": 1.2494, "step": 2115 }, { "epoch": 0.38, "grad_norm": 3.984375, "learning_rate": 5.774535939749723e-05, "loss": 1.2933, "step": 2120 }, { "epoch": 0.38, "grad_norm": 4.8125, "learning_rate": 5.77347475907695e-05, "loss": 1.3178, "step": 2125 }, { "epoch": 0.38, "grad_norm": 4.75, "learning_rate": 5.772411184933603e-05, "loss": 1.3002, "step": 2130 }, { "epoch": 0.38, "grad_norm": 4.53125, "learning_rate": 5.771345218237534e-05, "loss": 1.3109, "step": 2135 }, { "epoch": 0.38, "grad_norm": 4.96875, "learning_rate": 5.770276859908655e-05, "loss": 1.3076, "step": 2140 }, { "epoch": 0.38, "grad_norm": 4.5625, "learning_rate": 5.769206110868947e-05, "loss": 1.3253, "step": 2145 }, { "epoch": 0.38, "grad_norm": 4.65625, "learning_rate": 5.76813297204245e-05, "loss": 1.3409, "step": 2150 }, { "epoch": 0.38, "grad_norm": 4.53125, "learning_rate": 5.767057444355269e-05, "loss": 1.3253, "step": 2155 }, { "epoch": 0.38, "grad_norm": 4.40625, "learning_rate": 5.765979528735569e-05, "loss": 1.2958, "step": 2160 }, { "epoch": 0.38, "grad_norm": 4.5625, "learning_rate": 5.7648992261135766e-05, "loss": 1.2519, "step": 2165 }, { "epoch": 0.38, "grad_norm": 4.84375, "learning_rate": 5.7638165374215786e-05, "loss": 1.2364, "step": 2170 }, { "epoch": 0.39, "grad_norm": 4.8125, "learning_rate": 5.7627314635939185e-05, "loss": 1.2876, "step": 2175 }, { "epoch": 0.39, "grad_norm": 4.46875, "learning_rate": 5.7616440055670037e-05, "loss": 1.2515, "step": 2180 }, { "epoch": 0.39, "grad_norm": 4.71875, "learning_rate": 5.7605541642792915e-05, "loss": 1.3361, "step": 2185 }, { "epoch": 0.39, "grad_norm": 4.125, "learning_rate": 5.7594619406713025e-05, "loss": 1.28, "step": 2190 }, { "epoch": 0.39, "grad_norm": 4.34375, "learning_rate": 5.758367335685611e-05, "loss": 1.2606, "step": 2195 }, { "epoch": 0.39, "grad_norm": 4.9375, "learning_rate": 5.7572703502668436e-05, "loss": 1.3897, "step": 2200 }, { "epoch": 0.39, "grad_norm": 6.21875, "learning_rate": 5.756170985361686e-05, "loss": 1.2162, "step": 2205 }, { "epoch": 0.39, "grad_norm": 5.28125, "learning_rate": 5.755069241918873e-05, "loss": 1.3689, "step": 2210 }, { "epoch": 0.39, "grad_norm": 4.5625, "learning_rate": 5.753965120889196e-05, "loss": 1.3492, "step": 2215 }, { "epoch": 0.39, "grad_norm": 4.4375, "learning_rate": 5.752858623225495e-05, "loss": 1.333, "step": 2220 }, { "epoch": 0.39, "grad_norm": 4.78125, "learning_rate": 5.751749749882663e-05, "loss": 1.3392, "step": 2225 }, { "epoch": 0.4, "grad_norm": 4.5625, "learning_rate": 5.750638501817642e-05, "loss": 1.3787, "step": 2230 }, { "epoch": 0.4, "grad_norm": 4.53125, "learning_rate": 5.7495248799894225e-05, "loss": 1.2737, "step": 2235 }, { "epoch": 0.4, "grad_norm": 4.875, "learning_rate": 5.7484088853590474e-05, "loss": 1.2746, "step": 2240 }, { "epoch": 0.4, "grad_norm": 4.09375, "learning_rate": 5.747290518889604e-05, "loss": 1.2974, "step": 2245 }, { "epoch": 0.4, "grad_norm": 4.1875, "learning_rate": 5.7461697815462265e-05, "loss": 1.3759, "step": 2250 }, { "epoch": 0.4, "grad_norm": 4.65625, "learning_rate": 5.745046674296095e-05, "loss": 1.326, "step": 2255 }, { "epoch": 0.4, "grad_norm": 4.84375, "learning_rate": 5.743921198108437e-05, "loss": 1.2786, "step": 2260 }, { "epoch": 0.4, "grad_norm": 4.09375, "learning_rate": 5.7427933539545225e-05, "loss": 1.3348, "step": 2265 }, { "epoch": 0.4, "grad_norm": 5.53125, "learning_rate": 5.7416631428076655e-05, "loss": 1.3611, "step": 2270 }, { "epoch": 0.4, "grad_norm": 5.4375, "learning_rate": 5.740530565643223e-05, "loss": 1.3215, "step": 2275 }, { "epoch": 0.4, "grad_norm": 4.5, "learning_rate": 5.739395623438594e-05, "loss": 1.337, "step": 2280 }, { "epoch": 0.4, "grad_norm": 4.28125, "learning_rate": 5.738258317173217e-05, "loss": 1.3059, "step": 2285 }, { "epoch": 0.41, "grad_norm": 4.5, "learning_rate": 5.737118647828571e-05, "loss": 1.3581, "step": 2290 }, { "epoch": 0.41, "grad_norm": 4.09375, "learning_rate": 5.7359766163881766e-05, "loss": 1.2102, "step": 2295 }, { "epoch": 0.41, "grad_norm": 4.78125, "learning_rate": 5.73483222383759e-05, "loss": 1.3794, "step": 2300 }, { "epoch": 0.41, "grad_norm": 5.0625, "learning_rate": 5.733685471164407e-05, "loss": 1.3096, "step": 2305 }, { "epoch": 0.41, "grad_norm": 4.6875, "learning_rate": 5.73253635935826e-05, "loss": 1.3031, "step": 2310 }, { "epoch": 0.41, "grad_norm": 4.15625, "learning_rate": 5.731384889410814e-05, "loss": 1.2819, "step": 2315 }, { "epoch": 0.41, "grad_norm": 4.75, "learning_rate": 5.730231062315775e-05, "loss": 1.2891, "step": 2320 }, { "epoch": 0.41, "grad_norm": 4.0625, "learning_rate": 5.729074879068878e-05, "loss": 1.3334, "step": 2325 }, { "epoch": 0.41, "grad_norm": 4.8125, "learning_rate": 5.727916340667895e-05, "loss": 1.3738, "step": 2330 }, { "epoch": 0.41, "grad_norm": 4.21875, "learning_rate": 5.7267554481126255e-05, "loss": 1.3433, "step": 2335 }, { "epoch": 0.41, "grad_norm": 4.625, "learning_rate": 5.725592202404907e-05, "loss": 1.2954, "step": 2340 }, { "epoch": 0.42, "grad_norm": 5.4375, "learning_rate": 5.7244266045486025e-05, "loss": 1.323, "step": 2345 }, { "epoch": 0.42, "grad_norm": 4.5625, "learning_rate": 5.723258655549608e-05, "loss": 1.2917, "step": 2350 }, { "epoch": 0.42, "grad_norm": 5.1875, "learning_rate": 5.7220883564158474e-05, "loss": 1.3366, "step": 2355 }, { "epoch": 0.42, "grad_norm": 4.03125, "learning_rate": 5.7209157081572725e-05, "loss": 1.3243, "step": 2360 }, { "epoch": 0.42, "grad_norm": 4.40625, "learning_rate": 5.7197407117858625e-05, "loss": 1.3289, "step": 2365 }, { "epoch": 0.42, "grad_norm": 4.125, "learning_rate": 5.718563368315623e-05, "loss": 1.3062, "step": 2370 }, { "epoch": 0.42, "grad_norm": 4.6875, "learning_rate": 5.7173836787625854e-05, "loss": 1.3856, "step": 2375 }, { "epoch": 0.42, "grad_norm": 4.5, "learning_rate": 5.7162016441448056e-05, "loss": 1.4069, "step": 2380 }, { "epoch": 0.42, "grad_norm": 4.46875, "learning_rate": 5.715017265482363e-05, "loss": 1.3393, "step": 2385 }, { "epoch": 0.42, "grad_norm": 4.6875, "learning_rate": 5.7138305437973613e-05, "loss": 1.3196, "step": 2390 }, { "epoch": 0.42, "grad_norm": 4.4375, "learning_rate": 5.712641480113923e-05, "loss": 1.2874, "step": 2395 }, { "epoch": 0.43, "grad_norm": 4.8125, "learning_rate": 5.7114500754581954e-05, "loss": 1.3249, "step": 2400 }, { "epoch": 0.43, "grad_norm": 4.6875, "learning_rate": 5.710256330858343e-05, "loss": 1.327, "step": 2405 }, { "epoch": 0.43, "grad_norm": 5.0625, "learning_rate": 5.709060247344552e-05, "loss": 1.3701, "step": 2410 }, { "epoch": 0.43, "grad_norm": 4.15625, "learning_rate": 5.707861825949026e-05, "loss": 1.3252, "step": 2415 }, { "epoch": 0.43, "grad_norm": 4.40625, "learning_rate": 5.706661067705984e-05, "loss": 1.3281, "step": 2420 }, { "epoch": 0.43, "grad_norm": 4.3125, "learning_rate": 5.705457973651668e-05, "loss": 1.3507, "step": 2425 }, { "epoch": 0.43, "grad_norm": 4.5625, "learning_rate": 5.7042525448243286e-05, "loss": 1.3503, "step": 2430 }, { "epoch": 0.43, "grad_norm": 5.0, "learning_rate": 5.7030447822642346e-05, "loss": 1.2939, "step": 2435 }, { "epoch": 0.43, "grad_norm": 4.84375, "learning_rate": 5.701834687013669e-05, "loss": 1.2999, "step": 2440 }, { "epoch": 0.43, "grad_norm": 4.65625, "learning_rate": 5.700622260116927e-05, "loss": 1.3386, "step": 2445 }, { "epoch": 0.43, "grad_norm": 4.78125, "learning_rate": 5.699407502620317e-05, "loss": 1.2755, "step": 2450 }, { "epoch": 0.44, "grad_norm": 4.28125, "learning_rate": 5.6981904155721574e-05, "loss": 1.335, "step": 2455 }, { "epoch": 0.44, "grad_norm": 4.09375, "learning_rate": 5.696971000022778e-05, "loss": 1.2703, "step": 2460 }, { "epoch": 0.44, "grad_norm": 4.28125, "learning_rate": 5.695749257024517e-05, "loss": 1.254, "step": 2465 }, { "epoch": 0.44, "grad_norm": 4.09375, "learning_rate": 5.694525187631722e-05, "loss": 1.2533, "step": 2470 }, { "epoch": 0.44, "grad_norm": 4.46875, "learning_rate": 5.6932987929007484e-05, "loss": 1.2611, "step": 2475 }, { "epoch": 0.44, "grad_norm": 4.875, "learning_rate": 5.692070073889958e-05, "loss": 1.3172, "step": 2480 }, { "epoch": 0.44, "grad_norm": 4.28125, "learning_rate": 5.690839031659718e-05, "loss": 1.2717, "step": 2485 }, { "epoch": 0.44, "grad_norm": 4.625, "learning_rate": 5.6896056672724014e-05, "loss": 1.3477, "step": 2490 }, { "epoch": 0.44, "grad_norm": 4.34375, "learning_rate": 5.6883699817923856e-05, "loss": 1.3107, "step": 2495 }, { "epoch": 0.44, "grad_norm": 4.65625, "learning_rate": 5.6871319762860475e-05, "loss": 1.3247, "step": 2500 }, { "epoch": 0.44, "grad_norm": 4.59375, "learning_rate": 5.6858916518217714e-05, "loss": 1.282, "step": 2505 }, { "epoch": 0.44, "grad_norm": 4.9375, "learning_rate": 5.6846490094699386e-05, "loss": 1.2465, "step": 2510 }, { "epoch": 0.45, "grad_norm": 4.90625, "learning_rate": 5.6834040503029334e-05, "loss": 1.2966, "step": 2515 }, { "epoch": 0.45, "grad_norm": 5.96875, "learning_rate": 5.682156775395137e-05, "loss": 1.2818, "step": 2520 }, { "epoch": 0.45, "grad_norm": 4.5625, "learning_rate": 5.6809071858229325e-05, "loss": 1.2737, "step": 2525 }, { "epoch": 0.45, "grad_norm": 4.4375, "learning_rate": 5.679655282664698e-05, "loss": 1.3038, "step": 2530 }, { "epoch": 0.45, "grad_norm": 4.125, "learning_rate": 5.6784010670008073e-05, "loss": 1.314, "step": 2535 }, { "epoch": 0.45, "grad_norm": 5.0625, "learning_rate": 5.6771445399136324e-05, "loss": 1.3201, "step": 2540 }, { "epoch": 0.45, "grad_norm": 4.90625, "learning_rate": 5.6758857024875384e-05, "loss": 1.3113, "step": 2545 }, { "epoch": 0.45, "grad_norm": 4.96875, "learning_rate": 5.674624555808886e-05, "loss": 1.2482, "step": 2550 }, { "epoch": 0.45, "grad_norm": 4.8125, "learning_rate": 5.673361100966025e-05, "loss": 1.3105, "step": 2555 }, { "epoch": 0.45, "grad_norm": 6.28125, "learning_rate": 5.672095339049302e-05, "loss": 1.308, "step": 2560 }, { "epoch": 0.45, "grad_norm": 4.65625, "learning_rate": 5.670827271151051e-05, "loss": 1.3011, "step": 2565 }, { "epoch": 0.46, "grad_norm": 4.46875, "learning_rate": 5.669556898365598e-05, "loss": 1.3295, "step": 2570 }, { "epoch": 0.46, "grad_norm": 4.78125, "learning_rate": 5.668284221789255e-05, "loss": 1.3619, "step": 2575 }, { "epoch": 0.46, "grad_norm": 4.625, "learning_rate": 5.667009242520327e-05, "loss": 1.2631, "step": 2580 }, { "epoch": 0.46, "grad_norm": 4.40625, "learning_rate": 5.665731961659102e-05, "loss": 1.3128, "step": 2585 }, { "epoch": 0.46, "grad_norm": 4.25, "learning_rate": 5.6644523803078564e-05, "loss": 1.2335, "step": 2590 }, { "epoch": 0.46, "grad_norm": 4.1875, "learning_rate": 5.663170499570851e-05, "loss": 1.3206, "step": 2595 }, { "epoch": 0.46, "grad_norm": 4.8125, "learning_rate": 5.661886320554331e-05, "loss": 1.3087, "step": 2600 }, { "epoch": 0.46, "grad_norm": 4.1875, "learning_rate": 5.660599844366527e-05, "loss": 1.2921, "step": 2605 }, { "epoch": 0.46, "grad_norm": 4.5, "learning_rate": 5.6593110721176475e-05, "loss": 1.23, "step": 2610 }, { "epoch": 0.46, "grad_norm": 4.59375, "learning_rate": 5.6580200049198876e-05, "loss": 1.3199, "step": 2615 }, { "epoch": 0.46, "grad_norm": 4.9375, "learning_rate": 5.656726643887419e-05, "loss": 1.2793, "step": 2620 }, { "epoch": 0.47, "grad_norm": 4.21875, "learning_rate": 5.655430990136395e-05, "loss": 1.3256, "step": 2625 }, { "epoch": 0.47, "grad_norm": 5.0625, "learning_rate": 5.654133044784948e-05, "loss": 1.2995, "step": 2630 }, { "epoch": 0.47, "grad_norm": 4.4375, "learning_rate": 5.652832808953185e-05, "loss": 1.3027, "step": 2635 }, { "epoch": 0.47, "grad_norm": 4.15625, "learning_rate": 5.6515302837631935e-05, "loss": 1.2884, "step": 2640 }, { "epoch": 0.47, "grad_norm": 4.0625, "learning_rate": 5.650225470339034e-05, "loss": 1.335, "step": 2645 }, { "epoch": 0.47, "grad_norm": 4.75, "learning_rate": 5.648918369806742e-05, "loss": 1.3514, "step": 2650 }, { "epoch": 0.47, "grad_norm": 4.25, "learning_rate": 5.6476089832943275e-05, "loss": 1.2674, "step": 2655 }, { "epoch": 0.47, "grad_norm": 4.40625, "learning_rate": 5.646297311931775e-05, "loss": 1.3246, "step": 2660 }, { "epoch": 0.47, "grad_norm": 5.0625, "learning_rate": 5.644983356851035e-05, "loss": 1.2846, "step": 2665 }, { "epoch": 0.47, "grad_norm": 4.96875, "learning_rate": 5.643667119186036e-05, "loss": 1.2634, "step": 2670 }, { "epoch": 0.47, "grad_norm": 4.4375, "learning_rate": 5.642348600072672e-05, "loss": 1.3831, "step": 2675 }, { "epoch": 0.47, "grad_norm": 5.09375, "learning_rate": 5.641027800648806e-05, "loss": 1.3404, "step": 2680 }, { "epoch": 0.48, "grad_norm": 3.984375, "learning_rate": 5.6397047220542705e-05, "loss": 1.3466, "step": 2685 }, { "epoch": 0.48, "grad_norm": 4.59375, "learning_rate": 5.6383793654308653e-05, "loss": 1.2297, "step": 2690 }, { "epoch": 0.48, "grad_norm": 4.59375, "learning_rate": 5.637051731922353e-05, "loss": 1.3816, "step": 2695 }, { "epoch": 0.48, "grad_norm": 5.65625, "learning_rate": 5.6357218226744634e-05, "loss": 1.3857, "step": 2700 }, { "epoch": 0.48, "grad_norm": 4.25, "learning_rate": 5.63438963883489e-05, "loss": 1.3098, "step": 2705 }, { "epoch": 0.48, "grad_norm": 4.4375, "learning_rate": 5.6330551815532915e-05, "loss": 1.2842, "step": 2710 }, { "epoch": 0.48, "grad_norm": 4.3125, "learning_rate": 5.631718451981282e-05, "loss": 1.3243, "step": 2715 }, { "epoch": 0.48, "grad_norm": 4.6875, "learning_rate": 5.6303794512724444e-05, "loss": 1.2961, "step": 2720 }, { "epoch": 0.48, "grad_norm": 4.375, "learning_rate": 5.629038180582315e-05, "loss": 1.2975, "step": 2725 }, { "epoch": 0.48, "grad_norm": 4.6875, "learning_rate": 5.627694641068394e-05, "loss": 1.295, "step": 2730 }, { "epoch": 0.48, "grad_norm": 4.46875, "learning_rate": 5.626348833890136e-05, "loss": 1.3153, "step": 2735 }, { "epoch": 0.49, "grad_norm": 4.78125, "learning_rate": 5.6250007602089556e-05, "loss": 1.2853, "step": 2740 }, { "epoch": 0.49, "grad_norm": 6.09375, "learning_rate": 5.623650421188221e-05, "loss": 1.2951, "step": 2745 }, { "epoch": 0.49, "grad_norm": 4.1875, "learning_rate": 5.6222978179932566e-05, "loss": 1.3122, "step": 2750 }, { "epoch": 0.49, "grad_norm": 4.3125, "learning_rate": 5.62094295179134e-05, "loss": 1.3803, "step": 2755 }, { "epoch": 0.49, "grad_norm": 4.5625, "learning_rate": 5.6195858237517024e-05, "loss": 1.3136, "step": 2760 }, { "epoch": 0.49, "grad_norm": 4.65625, "learning_rate": 5.618226435045526e-05, "loss": 1.2738, "step": 2765 }, { "epoch": 0.49, "grad_norm": 5.1875, "learning_rate": 5.616864786845945e-05, "loss": 1.3019, "step": 2770 }, { "epoch": 0.49, "grad_norm": 4.4375, "learning_rate": 5.615500880328044e-05, "loss": 1.3231, "step": 2775 }, { "epoch": 0.49, "grad_norm": 4.4375, "learning_rate": 5.6141347166688534e-05, "loss": 1.3113, "step": 2780 }, { "epoch": 0.49, "grad_norm": 4.5625, "learning_rate": 5.6127662970473556e-05, "loss": 1.3466, "step": 2785 }, { "epoch": 0.49, "grad_norm": 4.59375, "learning_rate": 5.611395622644477e-05, "loss": 1.293, "step": 2790 }, { "epoch": 0.5, "grad_norm": 4.75, "learning_rate": 5.610022694643091e-05, "loss": 1.335, "step": 2795 }, { "epoch": 0.5, "grad_norm": 4.40625, "learning_rate": 5.608647514228015e-05, "loss": 1.3079, "step": 2800 }, { "epoch": 0.5, "grad_norm": 4.3125, "learning_rate": 5.607270082586013e-05, "loss": 1.3268, "step": 2805 }, { "epoch": 0.5, "grad_norm": 4.1875, "learning_rate": 5.605890400905786e-05, "loss": 1.3984, "step": 2810 }, { "epoch": 0.5, "grad_norm": 4.875, "learning_rate": 5.604508470377983e-05, "loss": 1.3175, "step": 2815 }, { "epoch": 0.5, "grad_norm": 4.25, "learning_rate": 5.60312429219519e-05, "loss": 1.2461, "step": 2820 }, { "epoch": 0.5, "grad_norm": 4.09375, "learning_rate": 5.601737867551934e-05, "loss": 1.2589, "step": 2825 }, { "epoch": 0.5, "grad_norm": 4.8125, "learning_rate": 5.6003491976446804e-05, "loss": 1.2646, "step": 2830 }, { "epoch": 0.5, "grad_norm": 4.875, "learning_rate": 5.598958283671832e-05, "loss": 1.2677, "step": 2835 }, { "epoch": 0.5, "grad_norm": 5.0625, "learning_rate": 5.59756512683373e-05, "loss": 1.3203, "step": 2840 }, { "epoch": 0.5, "grad_norm": 4.65625, "learning_rate": 5.5961697283326465e-05, "loss": 1.3257, "step": 2845 }, { "epoch": 0.51, "grad_norm": 4.78125, "learning_rate": 5.5947720893727943e-05, "loss": 1.2212, "step": 2850 }, { "epoch": 0.51, "grad_norm": 5.28125, "learning_rate": 5.593372211160314e-05, "loss": 1.3295, "step": 2855 }, { "epoch": 0.51, "grad_norm": 3.90625, "learning_rate": 5.591970094903284e-05, "loss": 1.3295, "step": 2860 }, { "epoch": 0.51, "grad_norm": 5.0625, "learning_rate": 5.5905657418117084e-05, "loss": 1.31, "step": 2865 }, { "epoch": 0.51, "grad_norm": 4.0, "learning_rate": 5.589159153097526e-05, "loss": 1.3057, "step": 2870 }, { "epoch": 0.51, "grad_norm": 4.5625, "learning_rate": 5.5877503299746044e-05, "loss": 1.3155, "step": 2875 }, { "epoch": 0.51, "grad_norm": 4.5, "learning_rate": 5.586339273658737e-05, "loss": 1.3647, "step": 2880 }, { "epoch": 0.51, "grad_norm": 4.3125, "learning_rate": 5.584925985367647e-05, "loss": 1.3519, "step": 2885 }, { "epoch": 0.51, "grad_norm": 4.4375, "learning_rate": 5.583510466320983e-05, "loss": 1.3283, "step": 2890 }, { "epoch": 0.51, "grad_norm": 4.9375, "learning_rate": 5.582092717740318e-05, "loss": 1.3263, "step": 2895 }, { "epoch": 0.51, "grad_norm": 4.78125, "learning_rate": 5.5806727408491494e-05, "loss": 1.3202, "step": 2900 }, { "epoch": 0.51, "grad_norm": 4.40625, "learning_rate": 5.579250536872897e-05, "loss": 1.3079, "step": 2905 }, { "epoch": 0.52, "grad_norm": 4.75, "learning_rate": 5.577826107038904e-05, "loss": 1.2571, "step": 2910 }, { "epoch": 0.52, "grad_norm": 4.375, "learning_rate": 5.576399452576435e-05, "loss": 1.275, "step": 2915 }, { "epoch": 0.52, "grad_norm": 4.53125, "learning_rate": 5.574970574716672e-05, "loss": 1.3317, "step": 2920 }, { "epoch": 0.52, "grad_norm": 4.875, "learning_rate": 5.5735394746927157e-05, "loss": 1.3375, "step": 2925 }, { "epoch": 0.52, "grad_norm": 4.28125, "learning_rate": 5.572106153739588e-05, "loss": 1.3079, "step": 2930 }, { "epoch": 0.52, "grad_norm": 4.8125, "learning_rate": 5.570670613094223e-05, "loss": 1.314, "step": 2935 }, { "epoch": 0.52, "grad_norm": 3.96875, "learning_rate": 5.569232853995475e-05, "loss": 1.2896, "step": 2940 }, { "epoch": 0.52, "grad_norm": 4.65625, "learning_rate": 5.567792877684109e-05, "loss": 1.2462, "step": 2945 }, { "epoch": 0.52, "grad_norm": 4.71875, "learning_rate": 5.566350685402804e-05, "loss": 1.4189, "step": 2950 }, { "epoch": 0.52, "grad_norm": 4.90625, "learning_rate": 5.564906278396153e-05, "loss": 1.2991, "step": 2955 }, { "epoch": 0.52, "grad_norm": 3.96875, "learning_rate": 5.56345965791066e-05, "loss": 1.3341, "step": 2960 }, { "epoch": 0.53, "grad_norm": 5.4375, "learning_rate": 5.562010825194737e-05, "loss": 1.2373, "step": 2965 }, { "epoch": 0.53, "grad_norm": 4.28125, "learning_rate": 5.560559781498709e-05, "loss": 1.3048, "step": 2970 }, { "epoch": 0.53, "grad_norm": 4.375, "learning_rate": 5.5591065280748035e-05, "loss": 1.3548, "step": 2975 }, { "epoch": 0.53, "grad_norm": 4.625, "learning_rate": 5.557651066177161e-05, "loss": 1.3368, "step": 2980 }, { "epoch": 0.53, "grad_norm": 4.46875, "learning_rate": 5.556193397061824e-05, "loss": 1.212, "step": 2985 }, { "epoch": 0.53, "grad_norm": 4.25, "learning_rate": 5.55473352198674e-05, "loss": 1.2773, "step": 2990 }, { "epoch": 0.53, "grad_norm": 4.6875, "learning_rate": 5.553271442211764e-05, "loss": 1.3321, "step": 2995 }, { "epoch": 0.53, "grad_norm": 4.0, "learning_rate": 5.5518071589986475e-05, "loss": 1.2887, "step": 3000 }, { "epoch": 0.53, "grad_norm": 4.9375, "learning_rate": 5.550340673611049e-05, "loss": 1.3299, "step": 3005 }, { "epoch": 0.53, "grad_norm": 4.46875, "learning_rate": 5.548871987314524e-05, "loss": 1.3793, "step": 3010 }, { "epoch": 0.53, "grad_norm": 5.0625, "learning_rate": 5.547401101376529e-05, "loss": 1.2792, "step": 3015 }, { "epoch": 0.54, "grad_norm": 5.46875, "learning_rate": 5.5459280170664174e-05, "loss": 1.2922, "step": 3020 }, { "epoch": 0.54, "grad_norm": 4.09375, "learning_rate": 5.544452735655443e-05, "loss": 1.2014, "step": 3025 }, { "epoch": 0.54, "grad_norm": 4.84375, "learning_rate": 5.542975258416751e-05, "loss": 1.3246, "step": 3030 }, { "epoch": 0.54, "grad_norm": 4.21875, "learning_rate": 5.5414955866253854e-05, "loss": 1.2227, "step": 3035 }, { "epoch": 0.54, "grad_norm": 5.03125, "learning_rate": 5.540013721558283e-05, "loss": 1.2552, "step": 3040 }, { "epoch": 0.54, "grad_norm": 4.65625, "learning_rate": 5.53852966449427e-05, "loss": 1.345, "step": 3045 }, { "epoch": 0.54, "grad_norm": 4.53125, "learning_rate": 5.53704341671407e-05, "loss": 1.2734, "step": 3050 }, { "epoch": 0.54, "grad_norm": 4.65625, "learning_rate": 5.535554979500295e-05, "loss": 1.2668, "step": 3055 }, { "epoch": 0.54, "grad_norm": 4.71875, "learning_rate": 5.5340643541374435e-05, "loss": 1.291, "step": 3060 }, { "epoch": 0.54, "grad_norm": 4.46875, "learning_rate": 5.532571541911906e-05, "loss": 1.2496, "step": 3065 }, { "epoch": 0.54, "grad_norm": 4.1875, "learning_rate": 5.531076544111958e-05, "loss": 1.3392, "step": 3070 }, { "epoch": 0.54, "grad_norm": 4.3125, "learning_rate": 5.5295793620277634e-05, "loss": 1.3181, "step": 3075 }, { "epoch": 0.55, "grad_norm": 4.03125, "learning_rate": 5.5280799969513686e-05, "loss": 1.2371, "step": 3080 }, { "epoch": 0.55, "grad_norm": 4.34375, "learning_rate": 5.5265784501767056e-05, "loss": 1.3203, "step": 3085 }, { "epoch": 0.55, "grad_norm": 4.46875, "learning_rate": 5.5250747229995884e-05, "loss": 1.3282, "step": 3090 }, { "epoch": 0.55, "grad_norm": 5.15625, "learning_rate": 5.5235688167177135e-05, "loss": 1.3521, "step": 3095 }, { "epoch": 0.55, "grad_norm": 4.65625, "learning_rate": 5.522060732630656e-05, "loss": 1.2624, "step": 3100 }, { "epoch": 0.55, "grad_norm": 4.40625, "learning_rate": 5.520550472039873e-05, "loss": 1.2576, "step": 3105 }, { "epoch": 0.55, "grad_norm": 4.40625, "learning_rate": 5.519038036248698e-05, "loss": 1.3183, "step": 3110 }, { "epoch": 0.55, "grad_norm": 4.34375, "learning_rate": 5.5175234265623426e-05, "loss": 1.3224, "step": 3115 }, { "epoch": 0.55, "grad_norm": 4.75, "learning_rate": 5.5160066442878936e-05, "loss": 1.2699, "step": 3120 }, { "epoch": 0.55, "grad_norm": 4.0, "learning_rate": 5.5144876907343145e-05, "loss": 1.4023, "step": 3125 }, { "epoch": 0.55, "grad_norm": 4.1875, "learning_rate": 5.51296656721244e-05, "loss": 1.304, "step": 3130 }, { "epoch": 0.56, "grad_norm": 4.65625, "learning_rate": 5.5114432750349784e-05, "loss": 1.2936, "step": 3135 }, { "epoch": 0.56, "grad_norm": 4.5625, "learning_rate": 5.509917815516511e-05, "loss": 1.3187, "step": 3140 }, { "epoch": 0.56, "grad_norm": 4.375, "learning_rate": 5.5083901899734875e-05, "loss": 1.27, "step": 3145 }, { "epoch": 0.56, "grad_norm": 4.6875, "learning_rate": 5.50686039972423e-05, "loss": 1.3432, "step": 3150 }, { "epoch": 0.56, "grad_norm": 4.84375, "learning_rate": 5.5053284460889234e-05, "loss": 1.2972, "step": 3155 }, { "epoch": 0.56, "grad_norm": 4.40625, "learning_rate": 5.5037943303896246e-05, "loss": 1.2965, "step": 3160 }, { "epoch": 0.56, "grad_norm": 6.125, "learning_rate": 5.502258053950253e-05, "loss": 1.2841, "step": 3165 }, { "epoch": 0.56, "grad_norm": 4.34375, "learning_rate": 5.500719618096595e-05, "loss": 1.2324, "step": 3170 }, { "epoch": 0.56, "grad_norm": 4.03125, "learning_rate": 5.4991790241563e-05, "loss": 1.2665, "step": 3175 }, { "epoch": 0.56, "grad_norm": 4.5, "learning_rate": 5.497636273458878e-05, "loss": 1.2642, "step": 3180 }, { "epoch": 0.56, "grad_norm": 4.46875, "learning_rate": 5.496091367335703e-05, "loss": 1.3122, "step": 3185 }, { "epoch": 0.57, "grad_norm": 4.40625, "learning_rate": 5.494544307120007e-05, "loss": 1.343, "step": 3190 }, { "epoch": 0.57, "grad_norm": 5.375, "learning_rate": 5.492995094146881e-05, "loss": 1.286, "step": 3195 }, { "epoch": 0.57, "grad_norm": 3.859375, "learning_rate": 5.491443729753276e-05, "loss": 1.2499, "step": 3200 }, { "epoch": 0.57, "grad_norm": 5.90625, "learning_rate": 5.489890215277997e-05, "loss": 1.3874, "step": 3205 }, { "epoch": 0.57, "grad_norm": 4.21875, "learning_rate": 5.4883345520617064e-05, "loss": 1.2641, "step": 3210 }, { "epoch": 0.57, "grad_norm": 4.71875, "learning_rate": 5.486776741446919e-05, "loss": 1.3424, "step": 3215 }, { "epoch": 0.57, "grad_norm": 3.625, "learning_rate": 5.485216784778006e-05, "loss": 1.3308, "step": 3220 }, { "epoch": 0.57, "grad_norm": 4.96875, "learning_rate": 5.4836546834011864e-05, "loss": 1.2457, "step": 3225 }, { "epoch": 0.57, "grad_norm": 4.78125, "learning_rate": 5.482090438664533e-05, "loss": 1.3051, "step": 3230 }, { "epoch": 0.57, "grad_norm": 4.375, "learning_rate": 5.480524051917968e-05, "loss": 1.2924, "step": 3235 }, { "epoch": 0.57, "grad_norm": 4.375, "learning_rate": 5.4789555245132614e-05, "loss": 1.3335, "step": 3240 }, { "epoch": 0.58, "grad_norm": 4.90625, "learning_rate": 5.47738485780403e-05, "loss": 1.3339, "step": 3245 }, { "epoch": 0.58, "grad_norm": 4.75, "learning_rate": 5.475812053145738e-05, "loss": 1.3429, "step": 3250 }, { "epoch": 0.58, "grad_norm": 5.21875, "learning_rate": 5.474237111895695e-05, "loss": 1.3206, "step": 3255 }, { "epoch": 0.58, "grad_norm": 4.8125, "learning_rate": 5.472660035413052e-05, "loss": 1.285, "step": 3260 }, { "epoch": 0.58, "grad_norm": 4.59375, "learning_rate": 5.471080825058807e-05, "loss": 1.2688, "step": 3265 }, { "epoch": 0.58, "grad_norm": 5.0, "learning_rate": 5.469499482195794e-05, "loss": 1.2592, "step": 3270 }, { "epoch": 0.58, "grad_norm": 4.03125, "learning_rate": 5.467916008188692e-05, "loss": 1.3642, "step": 3275 }, { "epoch": 0.58, "grad_norm": 4.84375, "learning_rate": 5.4663304044040157e-05, "loss": 1.2997, "step": 3280 }, { "epoch": 0.58, "grad_norm": 4.96875, "learning_rate": 5.46474267221012e-05, "loss": 1.3506, "step": 3285 }, { "epoch": 0.58, "grad_norm": 3.828125, "learning_rate": 5.463152812977196e-05, "loss": 1.2474, "step": 3290 }, { "epoch": 0.58, "grad_norm": 4.625, "learning_rate": 5.4615608280772714e-05, "loss": 1.2861, "step": 3295 }, { "epoch": 0.58, "grad_norm": 4.4375, "learning_rate": 5.459966718884206e-05, "loss": 1.2688, "step": 3300 }, { "epoch": 0.59, "grad_norm": 4.09375, "learning_rate": 5.458370486773694e-05, "loss": 1.2938, "step": 3305 }, { "epoch": 0.59, "grad_norm": 4.3125, "learning_rate": 5.4567721331232634e-05, "loss": 1.321, "step": 3310 }, { "epoch": 0.59, "grad_norm": 4.125, "learning_rate": 5.455171659312269e-05, "loss": 1.279, "step": 3315 }, { "epoch": 0.59, "grad_norm": 4.375, "learning_rate": 5.4535690667219e-05, "loss": 1.2736, "step": 3320 }, { "epoch": 0.59, "grad_norm": 4.5, "learning_rate": 5.4519643567351704e-05, "loss": 1.3178, "step": 3325 }, { "epoch": 0.59, "grad_norm": 4.65625, "learning_rate": 5.4503575307369225e-05, "loss": 1.2609, "step": 3330 }, { "epoch": 0.59, "grad_norm": 4.59375, "learning_rate": 5.448748590113826e-05, "loss": 1.3433, "step": 3335 }, { "epoch": 0.59, "grad_norm": 4.78125, "learning_rate": 5.447137536254374e-05, "loss": 1.3147, "step": 3340 }, { "epoch": 0.59, "grad_norm": 3.796875, "learning_rate": 5.445524370548885e-05, "loss": 1.3675, "step": 3345 }, { "epoch": 0.59, "grad_norm": 5.0625, "learning_rate": 5.443909094389497e-05, "loss": 1.3018, "step": 3350 }, { "epoch": 0.59, "grad_norm": 5.09375, "learning_rate": 5.442291709170172e-05, "loss": 1.302, "step": 3355 }, { "epoch": 0.6, "grad_norm": 4.40625, "learning_rate": 5.4406722162866915e-05, "loss": 1.2886, "step": 3360 }, { "epoch": 0.6, "grad_norm": 4.3125, "learning_rate": 5.439050617136656e-05, "loss": 1.2772, "step": 3365 }, { "epoch": 0.6, "grad_norm": 4.0625, "learning_rate": 5.437426913119482e-05, "loss": 1.2609, "step": 3370 }, { "epoch": 0.6, "grad_norm": 4.71875, "learning_rate": 5.435801105636403e-05, "loss": 1.2131, "step": 3375 }, { "epoch": 0.6, "grad_norm": 4.125, "learning_rate": 5.434173196090471e-05, "loss": 1.2784, "step": 3380 }, { "epoch": 0.6, "grad_norm": 4.71875, "learning_rate": 5.432543185886548e-05, "loss": 1.2992, "step": 3385 }, { "epoch": 0.6, "grad_norm": 4.875, "learning_rate": 5.4309110764313104e-05, "loss": 1.1948, "step": 3390 }, { "epoch": 0.6, "grad_norm": 4.59375, "learning_rate": 5.429276869133247e-05, "loss": 1.3929, "step": 3395 }, { "epoch": 0.6, "grad_norm": 4.09375, "learning_rate": 5.4276405654026564e-05, "loss": 1.304, "step": 3400 }, { "epoch": 0.6, "grad_norm": 4.9375, "learning_rate": 5.426002166651644e-05, "loss": 1.3069, "step": 3405 }, { "epoch": 0.6, "grad_norm": 4.25, "learning_rate": 5.424361674294128e-05, "loss": 1.2989, "step": 3410 }, { "epoch": 0.61, "grad_norm": 4.40625, "learning_rate": 5.422719089745829e-05, "loss": 1.3277, "step": 3415 }, { "epoch": 0.61, "grad_norm": 4.65625, "learning_rate": 5.421074414424276e-05, "loss": 1.3051, "step": 3420 }, { "epoch": 0.61, "grad_norm": 5.03125, "learning_rate": 5.4194276497488015e-05, "loss": 1.2802, "step": 3425 }, { "epoch": 0.61, "grad_norm": 5.1875, "learning_rate": 5.417778797140539e-05, "loss": 1.3996, "step": 3430 }, { "epoch": 0.61, "grad_norm": 4.78125, "learning_rate": 5.416127858022426e-05, "loss": 1.2566, "step": 3435 }, { "epoch": 0.61, "grad_norm": 4.65625, "learning_rate": 5.414474833819202e-05, "loss": 1.3191, "step": 3440 }, { "epoch": 0.61, "grad_norm": 4.6875, "learning_rate": 5.412819725957403e-05, "loss": 1.2944, "step": 3445 }, { "epoch": 0.61, "grad_norm": 4.59375, "learning_rate": 5.411162535865364e-05, "loss": 1.2609, "step": 3450 }, { "epoch": 0.61, "grad_norm": 4.375, "learning_rate": 5.409503264973217e-05, "loss": 1.3204, "step": 3455 }, { "epoch": 0.61, "grad_norm": 4.75, "learning_rate": 5.407841914712891e-05, "loss": 1.2741, "step": 3460 }, { "epoch": 0.61, "grad_norm": 4.6875, "learning_rate": 5.406178486518108e-05, "loss": 1.2724, "step": 3465 }, { "epoch": 0.61, "grad_norm": 5.28125, "learning_rate": 5.4045129818243844e-05, "loss": 1.3163, "step": 3470 }, { "epoch": 0.62, "grad_norm": 4.53125, "learning_rate": 5.402845402069027e-05, "loss": 1.2788, "step": 3475 }, { "epoch": 0.62, "grad_norm": 5.03125, "learning_rate": 5.401175748691133e-05, "loss": 1.2765, "step": 3480 }, { "epoch": 0.62, "grad_norm": 4.53125, "learning_rate": 5.399504023131593e-05, "loss": 1.3242, "step": 3485 }, { "epoch": 0.62, "grad_norm": 4.1875, "learning_rate": 5.397830226833081e-05, "loss": 1.3258, "step": 3490 }, { "epoch": 0.62, "grad_norm": 4.375, "learning_rate": 5.396154361240061e-05, "loss": 1.2323, "step": 3495 }, { "epoch": 0.62, "grad_norm": 4.90625, "learning_rate": 5.3944764277987825e-05, "loss": 1.3304, "step": 3500 }, { "epoch": 0.62, "grad_norm": 4.25, "learning_rate": 5.3927964279572776e-05, "loss": 1.3493, "step": 3505 }, { "epoch": 0.62, "grad_norm": 4.375, "learning_rate": 5.391114363165365e-05, "loss": 1.2971, "step": 3510 }, { "epoch": 0.62, "grad_norm": 4.59375, "learning_rate": 5.3894302348746404e-05, "loss": 1.298, "step": 3515 }, { "epoch": 0.62, "grad_norm": 4.4375, "learning_rate": 5.3877440445384875e-05, "loss": 1.3231, "step": 3520 }, { "epoch": 0.62, "grad_norm": 4.84375, "learning_rate": 5.386055793612061e-05, "loss": 1.3774, "step": 3525 }, { "epoch": 0.63, "grad_norm": 4.4375, "learning_rate": 5.3843654835523025e-05, "loss": 1.2998, "step": 3530 }, { "epoch": 0.63, "grad_norm": 4.71875, "learning_rate": 5.382673115817923e-05, "loss": 1.2647, "step": 3535 }, { "epoch": 0.63, "grad_norm": 4.875, "learning_rate": 5.380978691869414e-05, "loss": 1.2855, "step": 3540 }, { "epoch": 0.63, "grad_norm": 4.6875, "learning_rate": 5.37928221316904e-05, "loss": 1.2911, "step": 3545 }, { "epoch": 0.63, "grad_norm": 4.15625, "learning_rate": 5.3775836811808384e-05, "loss": 1.2909, "step": 3550 }, { "epoch": 0.63, "grad_norm": 5.03125, "learning_rate": 5.375883097370619e-05, "loss": 1.3173, "step": 3555 }, { "epoch": 0.63, "grad_norm": 4.8125, "learning_rate": 5.374180463205962e-05, "loss": 1.3441, "step": 3560 }, { "epoch": 0.63, "grad_norm": 3.90625, "learning_rate": 5.372475780156219e-05, "loss": 1.3172, "step": 3565 }, { "epoch": 0.63, "grad_norm": 5.0625, "learning_rate": 5.370769049692506e-05, "loss": 1.3002, "step": 3570 }, { "epoch": 0.63, "grad_norm": 4.34375, "learning_rate": 5.369060273287709e-05, "loss": 1.2938, "step": 3575 }, { "epoch": 0.63, "grad_norm": 4.65625, "learning_rate": 5.3673494524164775e-05, "loss": 1.3679, "step": 3580 }, { "epoch": 0.64, "grad_norm": 5.25, "learning_rate": 5.365636588555228e-05, "loss": 1.2639, "step": 3585 }, { "epoch": 0.64, "grad_norm": 4.5625, "learning_rate": 5.3639216831821364e-05, "loss": 1.2428, "step": 3590 }, { "epoch": 0.64, "grad_norm": 4.1875, "learning_rate": 5.362204737777144e-05, "loss": 1.3073, "step": 3595 }, { "epoch": 0.64, "grad_norm": 4.53125, "learning_rate": 5.3604857538219515e-05, "loss": 1.2776, "step": 3600 }, { "epoch": 0.64, "grad_norm": 3.984375, "learning_rate": 5.358764732800017e-05, "loss": 1.2903, "step": 3605 }, { "epoch": 0.64, "grad_norm": 4.625, "learning_rate": 5.357041676196558e-05, "loss": 1.2906, "step": 3610 }, { "epoch": 0.64, "grad_norm": 4.71875, "learning_rate": 5.35531658549855e-05, "loss": 1.212, "step": 3615 }, { "epoch": 0.64, "grad_norm": 4.28125, "learning_rate": 5.353589462194721e-05, "loss": 1.4188, "step": 3620 }, { "epoch": 0.64, "grad_norm": 4.625, "learning_rate": 5.351860307775556e-05, "loss": 1.3063, "step": 3625 }, { "epoch": 0.64, "grad_norm": 4.4375, "learning_rate": 5.350129123733291e-05, "loss": 1.3076, "step": 3630 }, { "epoch": 0.64, "grad_norm": 4.53125, "learning_rate": 5.348395911561912e-05, "loss": 1.2788, "step": 3635 }, { "epoch": 0.65, "grad_norm": 3.875, "learning_rate": 5.346660672757161e-05, "loss": 1.2826, "step": 3640 }, { "epoch": 0.65, "grad_norm": 4.9375, "learning_rate": 5.3449234088165236e-05, "loss": 1.3334, "step": 3645 }, { "epoch": 0.65, "grad_norm": 4.625, "learning_rate": 5.343184121239233e-05, "loss": 1.2731, "step": 3650 }, { "epoch": 0.65, "grad_norm": 4.5, "learning_rate": 5.341442811526273e-05, "loss": 1.3533, "step": 3655 }, { "epoch": 0.65, "grad_norm": 4.125, "learning_rate": 5.339699481180369e-05, "loss": 1.3177, "step": 3660 }, { "epoch": 0.65, "grad_norm": 4.4375, "learning_rate": 5.33795413170599e-05, "loss": 1.3122, "step": 3665 }, { "epoch": 0.65, "grad_norm": 4.59375, "learning_rate": 5.33620676460935e-05, "loss": 1.3433, "step": 3670 }, { "epoch": 0.65, "grad_norm": 4.46875, "learning_rate": 5.334457381398402e-05, "loss": 1.2886, "step": 3675 }, { "epoch": 0.65, "grad_norm": 4.3125, "learning_rate": 5.332705983582841e-05, "loss": 1.2162, "step": 3680 }, { "epoch": 0.65, "grad_norm": 4.59375, "learning_rate": 5.330952572674098e-05, "loss": 1.2925, "step": 3685 }, { "epoch": 0.65, "grad_norm": 4.65625, "learning_rate": 5.329197150185342e-05, "loss": 1.3291, "step": 3690 }, { "epoch": 0.65, "grad_norm": 4.1875, "learning_rate": 5.327439717631479e-05, "loss": 1.2377, "step": 3695 }, { "epoch": 0.66, "grad_norm": 4.5625, "learning_rate": 5.32568027652915e-05, "loss": 1.3072, "step": 3700 }, { "epoch": 0.66, "grad_norm": 4.59375, "learning_rate": 5.323918828396728e-05, "loss": 1.3183, "step": 3705 }, { "epoch": 0.66, "grad_norm": 5.125, "learning_rate": 5.322155374754317e-05, "loss": 1.325, "step": 3710 }, { "epoch": 0.66, "grad_norm": 3.9375, "learning_rate": 5.320389917123756e-05, "loss": 1.2649, "step": 3715 }, { "epoch": 0.66, "grad_norm": 6.0625, "learning_rate": 5.318622457028609e-05, "loss": 1.2945, "step": 3720 }, { "epoch": 0.66, "grad_norm": 4.5, "learning_rate": 5.3168529959941684e-05, "loss": 1.3274, "step": 3725 }, { "epoch": 0.66, "grad_norm": 4.75, "learning_rate": 5.315081535547458e-05, "loss": 1.3552, "step": 3730 }, { "epoch": 0.66, "grad_norm": 4.8125, "learning_rate": 5.3133080772172216e-05, "loss": 1.288, "step": 3735 }, { "epoch": 0.66, "grad_norm": 4.125, "learning_rate": 5.311532622533929e-05, "loss": 1.2358, "step": 3740 }, { "epoch": 0.66, "grad_norm": 4.5, "learning_rate": 5.309755173029775e-05, "loss": 1.2855, "step": 3745 }, { "epoch": 0.66, "grad_norm": 4.5, "learning_rate": 5.307975730238673e-05, "loss": 1.3186, "step": 3750 }, { "epoch": 0.67, "grad_norm": 4.125, "learning_rate": 5.306194295696257e-05, "loss": 1.3483, "step": 3755 }, { "epoch": 0.67, "grad_norm": 4.09375, "learning_rate": 5.304410870939882e-05, "loss": 1.2975, "step": 3760 }, { "epoch": 0.67, "grad_norm": 4.15625, "learning_rate": 5.302625457508618e-05, "loss": 1.3327, "step": 3765 }, { "epoch": 0.67, "grad_norm": 4.3125, "learning_rate": 5.3008380569432526e-05, "loss": 1.3524, "step": 3770 }, { "epoch": 0.67, "grad_norm": 4.25, "learning_rate": 5.299048670786288e-05, "loss": 1.2576, "step": 3775 }, { "epoch": 0.67, "grad_norm": 4.5, "learning_rate": 5.297257300581938e-05, "loss": 1.3048, "step": 3780 }, { "epoch": 0.67, "grad_norm": 4.46875, "learning_rate": 5.295463947876134e-05, "loss": 1.2838, "step": 3785 }, { "epoch": 0.67, "grad_norm": 3.890625, "learning_rate": 5.293668614216513e-05, "loss": 1.2591, "step": 3790 }, { "epoch": 0.67, "grad_norm": 4.03125, "learning_rate": 5.2918713011524216e-05, "loss": 1.271, "step": 3795 }, { "epoch": 0.67, "grad_norm": 4.75, "learning_rate": 5.290072010234919e-05, "loss": 1.2851, "step": 3800 }, { "epoch": 0.67, "grad_norm": 4.65625, "learning_rate": 5.288270743016767e-05, "loss": 1.3198, "step": 3805 }, { "epoch": 0.68, "grad_norm": 4.71875, "learning_rate": 5.286467501052434e-05, "loss": 1.3168, "step": 3810 }, { "epoch": 0.68, "grad_norm": 4.75, "learning_rate": 5.284662285898094e-05, "loss": 1.2694, "step": 3815 }, { "epoch": 0.68, "grad_norm": 4.59375, "learning_rate": 5.2828550991116234e-05, "loss": 1.3057, "step": 3820 }, { "epoch": 0.68, "grad_norm": 4.59375, "learning_rate": 5.2810459422525973e-05, "loss": 1.334, "step": 3825 }, { "epoch": 0.68, "grad_norm": 4.15625, "learning_rate": 5.2792348168822954e-05, "loss": 1.2662, "step": 3830 }, { "epoch": 0.68, "grad_norm": 4.9375, "learning_rate": 5.277421724563693e-05, "loss": 1.3281, "step": 3835 }, { "epoch": 0.68, "grad_norm": 4.71875, "learning_rate": 5.275606666861464e-05, "loss": 1.2915, "step": 3840 }, { "epoch": 0.68, "grad_norm": 4.84375, "learning_rate": 5.2737896453419784e-05, "loss": 1.2695, "step": 3845 }, { "epoch": 0.68, "grad_norm": 4.875, "learning_rate": 5.2719706615733e-05, "loss": 1.2676, "step": 3850 }, { "epoch": 0.68, "grad_norm": 4.5, "learning_rate": 5.270149717125189e-05, "loss": 1.3432, "step": 3855 }, { "epoch": 0.68, "grad_norm": 4.875, "learning_rate": 5.268326813569092e-05, "loss": 1.3136, "step": 3860 }, { "epoch": 0.68, "grad_norm": 5.5, "learning_rate": 5.266501952478153e-05, "loss": 1.3011, "step": 3865 }, { "epoch": 0.69, "grad_norm": 4.03125, "learning_rate": 5.264675135427199e-05, "loss": 1.2212, "step": 3870 }, { "epoch": 0.69, "grad_norm": 4.34375, "learning_rate": 5.26284636399275e-05, "loss": 1.3356, "step": 3875 }, { "epoch": 0.69, "grad_norm": 4.625, "learning_rate": 5.261015639753011e-05, "loss": 1.2184, "step": 3880 }, { "epoch": 0.69, "grad_norm": 4.625, "learning_rate": 5.25918296428787e-05, "loss": 1.2665, "step": 3885 }, { "epoch": 0.69, "grad_norm": 4.5, "learning_rate": 5.2573483391789026e-05, "loss": 1.3721, "step": 3890 }, { "epoch": 0.69, "grad_norm": 4.65625, "learning_rate": 5.255511766009364e-05, "loss": 1.2977, "step": 3895 }, { "epoch": 0.69, "grad_norm": 5.5, "learning_rate": 5.253673246364191e-05, "loss": 1.3042, "step": 3900 }, { "epoch": 0.69, "grad_norm": 5.59375, "learning_rate": 5.251832781830002e-05, "loss": 1.2613, "step": 3905 }, { "epoch": 0.69, "grad_norm": 4.53125, "learning_rate": 5.249990373995092e-05, "loss": 1.3259, "step": 3910 }, { "epoch": 0.69, "grad_norm": 4.09375, "learning_rate": 5.248146024449433e-05, "loss": 1.2898, "step": 3915 }, { "epoch": 0.69, "grad_norm": 4.5, "learning_rate": 5.246299734784675e-05, "loss": 1.2752, "step": 3920 }, { "epoch": 0.7, "grad_norm": 4.1875, "learning_rate": 5.244451506594138e-05, "loss": 1.2829, "step": 3925 }, { "epoch": 0.7, "grad_norm": 5.6875, "learning_rate": 5.2426013414728213e-05, "loss": 1.3093, "step": 3930 }, { "epoch": 0.7, "grad_norm": 5.25, "learning_rate": 5.240749241017389e-05, "loss": 1.2484, "step": 3935 }, { "epoch": 0.7, "grad_norm": 4.6875, "learning_rate": 5.238895206826178e-05, "loss": 1.3355, "step": 3940 }, { "epoch": 0.7, "grad_norm": 4.46875, "learning_rate": 5.2370392404991965e-05, "loss": 1.2665, "step": 3945 }, { "epoch": 0.7, "grad_norm": 4.3125, "learning_rate": 5.235181343638116e-05, "loss": 1.3069, "step": 3950 }, { "epoch": 0.7, "grad_norm": 4.8125, "learning_rate": 5.233321517846277e-05, "loss": 1.3489, "step": 3955 }, { "epoch": 0.7, "grad_norm": 5.15625, "learning_rate": 5.231459764728684e-05, "loss": 1.28, "step": 3960 }, { "epoch": 0.7, "grad_norm": 5.125, "learning_rate": 5.229596085892002e-05, "loss": 1.2844, "step": 3965 }, { "epoch": 0.7, "grad_norm": 4.5625, "learning_rate": 5.227730482944563e-05, "loss": 1.2787, "step": 3970 }, { "epoch": 0.7, "grad_norm": 4.375, "learning_rate": 5.225862957496355e-05, "loss": 1.2874, "step": 3975 }, { "epoch": 0.71, "grad_norm": 4.03125, "learning_rate": 5.223993511159028e-05, "loss": 1.3417, "step": 3980 }, { "epoch": 0.71, "grad_norm": 3.96875, "learning_rate": 5.222122145545888e-05, "loss": 1.3058, "step": 3985 }, { "epoch": 0.71, "grad_norm": 3.984375, "learning_rate": 5.220248862271896e-05, "loss": 1.2472, "step": 3990 }, { "epoch": 0.71, "grad_norm": 5.09375, "learning_rate": 5.218373662953672e-05, "loss": 1.2801, "step": 3995 }, { "epoch": 0.71, "grad_norm": 4.28125, "learning_rate": 5.2164965492094874e-05, "loss": 1.2347, "step": 4000 }, { "epoch": 0.71, "grad_norm": 4.4375, "learning_rate": 5.2146175226592634e-05, "loss": 1.2724, "step": 4005 }, { "epoch": 0.71, "grad_norm": 5.09375, "learning_rate": 5.212736584924576e-05, "loss": 1.3077, "step": 4010 }, { "epoch": 0.71, "grad_norm": 5.21875, "learning_rate": 5.210853737628647e-05, "loss": 1.3172, "step": 4015 }, { "epoch": 0.71, "grad_norm": 4.40625, "learning_rate": 5.20896898239635e-05, "loss": 1.272, "step": 4020 }, { "epoch": 0.71, "grad_norm": 5.03125, "learning_rate": 5.207082320854201e-05, "loss": 1.2565, "step": 4025 }, { "epoch": 0.71, "grad_norm": 4.59375, "learning_rate": 5.205193754630362e-05, "loss": 1.3023, "step": 4030 }, { "epoch": 0.72, "grad_norm": 4.59375, "learning_rate": 5.203303285354642e-05, "loss": 1.2946, "step": 4035 }, { "epoch": 0.72, "grad_norm": 4.4375, "learning_rate": 5.2014109146584905e-05, "loss": 1.2827, "step": 4040 }, { "epoch": 0.72, "grad_norm": 5.03125, "learning_rate": 5.199516644174995e-05, "loss": 1.3168, "step": 4045 }, { "epoch": 0.72, "grad_norm": 4.46875, "learning_rate": 5.197620475538885e-05, "loss": 1.3188, "step": 4050 }, { "epoch": 0.72, "grad_norm": 4.28125, "learning_rate": 5.195722410386529e-05, "loss": 1.2968, "step": 4055 }, { "epoch": 0.72, "grad_norm": 4.5625, "learning_rate": 5.193822450355933e-05, "loss": 1.2972, "step": 4060 }, { "epoch": 0.72, "grad_norm": 4.5625, "learning_rate": 5.191920597086732e-05, "loss": 1.2119, "step": 4065 }, { "epoch": 0.72, "grad_norm": 4.21875, "learning_rate": 5.190016852220202e-05, "loss": 1.3148, "step": 4070 }, { "epoch": 0.72, "grad_norm": 4.53125, "learning_rate": 5.188111217399249e-05, "loss": 1.2963, "step": 4075 }, { "epoch": 0.72, "grad_norm": 5.5, "learning_rate": 5.186203694268408e-05, "loss": 1.3657, "step": 4080 }, { "epoch": 0.72, "grad_norm": 5.0, "learning_rate": 5.1842942844738456e-05, "loss": 1.3618, "step": 4085 }, { "epoch": 0.72, "grad_norm": 4.53125, "learning_rate": 5.182382989663357e-05, "loss": 1.3017, "step": 4090 }, { "epoch": 0.73, "grad_norm": 4.25, "learning_rate": 5.1804698114863634e-05, "loss": 1.2431, "step": 4095 }, { "epoch": 0.73, "grad_norm": 4.28125, "learning_rate": 5.17855475159391e-05, "loss": 1.2422, "step": 4100 }, { "epoch": 0.73, "grad_norm": 4.15625, "learning_rate": 5.176637811638669e-05, "loss": 1.2612, "step": 4105 }, { "epoch": 0.73, "grad_norm": 4.8125, "learning_rate": 5.1747189932749315e-05, "loss": 1.3168, "step": 4110 }, { "epoch": 0.73, "grad_norm": 4.125, "learning_rate": 5.172798298158613e-05, "loss": 1.2903, "step": 4115 }, { "epoch": 0.73, "grad_norm": 5.3125, "learning_rate": 5.170875727947245e-05, "loss": 1.3711, "step": 4120 }, { "epoch": 0.73, "grad_norm": 4.34375, "learning_rate": 5.1689512842999796e-05, "loss": 1.2792, "step": 4125 }, { "epoch": 0.73, "grad_norm": 4.5625, "learning_rate": 5.1670249688775865e-05, "loss": 1.3702, "step": 4130 }, { "epoch": 0.73, "grad_norm": 4.8125, "learning_rate": 5.165096783342448e-05, "loss": 1.2846, "step": 4135 }, { "epoch": 0.73, "grad_norm": 4.46875, "learning_rate": 5.1631667293585637e-05, "loss": 1.2773, "step": 4140 }, { "epoch": 0.73, "grad_norm": 5.375, "learning_rate": 5.161234808591542e-05, "loss": 1.3789, "step": 4145 }, { "epoch": 0.74, "grad_norm": 4.84375, "learning_rate": 5.159301022708603e-05, "loss": 1.3553, "step": 4150 }, { "epoch": 0.74, "grad_norm": 4.125, "learning_rate": 5.15736537337858e-05, "loss": 1.2859, "step": 4155 }, { "epoch": 0.74, "grad_norm": 4.5625, "learning_rate": 5.1554278622719096e-05, "loss": 1.3654, "step": 4160 }, { "epoch": 0.74, "grad_norm": 4.96875, "learning_rate": 5.153488491060638e-05, "loss": 1.3205, "step": 4165 }, { "epoch": 0.74, "grad_norm": 4.5625, "learning_rate": 5.151547261418416e-05, "loss": 1.2788, "step": 4170 }, { "epoch": 0.74, "grad_norm": 4.3125, "learning_rate": 5.149604175020498e-05, "loss": 1.2954, "step": 4175 }, { "epoch": 0.74, "grad_norm": 5.09375, "learning_rate": 5.1476592335437406e-05, "loss": 1.2905, "step": 4180 }, { "epoch": 0.74, "grad_norm": 4.3125, "learning_rate": 5.1457124386666025e-05, "loss": 1.293, "step": 4185 }, { "epoch": 0.74, "grad_norm": 5.375, "learning_rate": 5.143763792069139e-05, "loss": 1.2971, "step": 4190 }, { "epoch": 0.74, "grad_norm": 3.859375, "learning_rate": 5.141813295433007e-05, "loss": 1.2769, "step": 4195 }, { "epoch": 0.74, "grad_norm": 4.28125, "learning_rate": 5.139860950441457e-05, "loss": 1.2683, "step": 4200 }, { "epoch": 0.75, "grad_norm": 4.0625, "learning_rate": 5.137906758779337e-05, "loss": 1.2046, "step": 4205 }, { "epoch": 0.75, "grad_norm": 3.890625, "learning_rate": 5.1359507221330876e-05, "loss": 1.2322, "step": 4210 }, { "epoch": 0.75, "grad_norm": 4.625, "learning_rate": 5.133992842190741e-05, "loss": 1.3289, "step": 4215 }, { "epoch": 0.75, "grad_norm": 4.71875, "learning_rate": 5.132033120641921e-05, "loss": 1.332, "step": 4220 }, { "epoch": 0.75, "grad_norm": 4.875, "learning_rate": 5.130071559177839e-05, "loss": 1.3364, "step": 4225 }, { "epoch": 0.75, "grad_norm": 4.59375, "learning_rate": 5.128108159491297e-05, "loss": 1.3073, "step": 4230 }, { "epoch": 0.75, "grad_norm": 4.65625, "learning_rate": 5.126142923276681e-05, "loss": 1.2453, "step": 4235 }, { "epoch": 0.75, "grad_norm": 5.15625, "learning_rate": 5.1241758522299644e-05, "loss": 1.2716, "step": 4240 }, { "epoch": 0.75, "grad_norm": 4.875, "learning_rate": 5.1222069480487016e-05, "loss": 1.356, "step": 4245 }, { "epoch": 0.75, "grad_norm": 4.9375, "learning_rate": 5.1202362124320285e-05, "loss": 1.21, "step": 4250 }, { "epoch": 0.75, "grad_norm": 4.90625, "learning_rate": 5.118263647080665e-05, "loss": 1.3601, "step": 4255 }, { "epoch": 0.75, "grad_norm": 5.375, "learning_rate": 5.116289253696905e-05, "loss": 1.3043, "step": 4260 }, { "epoch": 0.76, "grad_norm": 5.53125, "learning_rate": 5.114313033984627e-05, "loss": 1.279, "step": 4265 }, { "epoch": 0.76, "grad_norm": 4.34375, "learning_rate": 5.112334989649278e-05, "loss": 1.2923, "step": 4270 }, { "epoch": 0.76, "grad_norm": 4.03125, "learning_rate": 5.110355122397885e-05, "loss": 1.3014, "step": 4275 }, { "epoch": 0.76, "grad_norm": 4.9375, "learning_rate": 5.108373433939045e-05, "loss": 1.4038, "step": 4280 }, { "epoch": 0.76, "grad_norm": 4.21875, "learning_rate": 5.1063899259829276e-05, "loss": 1.3483, "step": 4285 }, { "epoch": 0.76, "grad_norm": 3.84375, "learning_rate": 5.1044046002412736e-05, "loss": 1.3516, "step": 4290 }, { "epoch": 0.76, "grad_norm": 4.65625, "learning_rate": 5.102417458427392e-05, "loss": 1.3442, "step": 4295 }, { "epoch": 0.76, "grad_norm": 4.0625, "learning_rate": 5.1004285022561584e-05, "loss": 1.2925, "step": 4300 }, { "epoch": 0.76, "grad_norm": 4.21875, "learning_rate": 5.0984377334440154e-05, "loss": 1.2403, "step": 4305 }, { "epoch": 0.76, "grad_norm": 4.3125, "learning_rate": 5.096445153708969e-05, "loss": 1.3107, "step": 4310 }, { "epoch": 0.76, "grad_norm": 4.4375, "learning_rate": 5.0944507647705873e-05, "loss": 1.297, "step": 4315 }, { "epoch": 0.77, "grad_norm": 4.96875, "learning_rate": 5.09245456835e-05, "loss": 1.3165, "step": 4320 }, { "epoch": 0.77, "grad_norm": 5.03125, "learning_rate": 5.090456566169899e-05, "loss": 1.3021, "step": 4325 }, { "epoch": 0.77, "grad_norm": 4.4375, "learning_rate": 5.0884567599545316e-05, "loss": 1.325, "step": 4330 }, { "epoch": 0.77, "grad_norm": 4.875, "learning_rate": 5.0864551514297027e-05, "loss": 1.2788, "step": 4335 }, { "epoch": 0.77, "grad_norm": 4.15625, "learning_rate": 5.084451742322774e-05, "loss": 1.2565, "step": 4340 }, { "epoch": 0.77, "grad_norm": 5.0, "learning_rate": 5.0824465343626595e-05, "loss": 1.3245, "step": 4345 }, { "epoch": 0.77, "grad_norm": 4.65625, "learning_rate": 5.0804395292798254e-05, "loss": 1.2891, "step": 4350 }, { "epoch": 0.77, "grad_norm": 5.3125, "learning_rate": 5.078430728806291e-05, "loss": 1.2448, "step": 4355 }, { "epoch": 0.77, "grad_norm": 3.828125, "learning_rate": 5.076420134675621e-05, "loss": 1.241, "step": 4360 }, { "epoch": 0.77, "grad_norm": 4.40625, "learning_rate": 5.0744077486229334e-05, "loss": 1.3236, "step": 4365 }, { "epoch": 0.77, "grad_norm": 4.875, "learning_rate": 5.0723935723848884e-05, "loss": 1.3193, "step": 4370 }, { "epoch": 0.78, "grad_norm": 4.3125, "learning_rate": 5.0703776076996916e-05, "loss": 1.2602, "step": 4375 }, { "epoch": 0.78, "grad_norm": 4.46875, "learning_rate": 5.0683598563070944e-05, "loss": 1.3601, "step": 4380 }, { "epoch": 0.78, "grad_norm": 4.40625, "learning_rate": 5.066340319948387e-05, "loss": 1.2986, "step": 4385 }, { "epoch": 0.78, "grad_norm": 5.09375, "learning_rate": 5.064319000366403e-05, "loss": 1.2844, "step": 4390 }, { "epoch": 0.78, "grad_norm": 4.125, "learning_rate": 5.062295899305512e-05, "loss": 1.3314, "step": 4395 }, { "epoch": 0.78, "grad_norm": 4.46875, "learning_rate": 5.060271018511622e-05, "loss": 1.2696, "step": 4400 }, { "epoch": 0.78, "grad_norm": 4.1875, "learning_rate": 5.0582443597321796e-05, "loss": 1.2538, "step": 4405 }, { "epoch": 0.78, "grad_norm": 4.8125, "learning_rate": 5.056215924716162e-05, "loss": 1.3236, "step": 4410 }, { "epoch": 0.78, "grad_norm": 4.75, "learning_rate": 5.05418571521408e-05, "loss": 1.3391, "step": 4415 }, { "epoch": 0.78, "grad_norm": 4.46875, "learning_rate": 5.052153732977978e-05, "loss": 1.2908, "step": 4420 }, { "epoch": 0.78, "grad_norm": 4.6875, "learning_rate": 5.0501199797614274e-05, "loss": 1.2552, "step": 4425 }, { "epoch": 0.79, "grad_norm": 5.4375, "learning_rate": 5.0480844573195307e-05, "loss": 1.2899, "step": 4430 }, { "epoch": 0.79, "grad_norm": 5.5, "learning_rate": 5.0460471674089145e-05, "loss": 1.2976, "step": 4435 }, { "epoch": 0.79, "grad_norm": 5.3125, "learning_rate": 5.0440081117877326e-05, "loss": 1.2849, "step": 4440 }, { "epoch": 0.79, "grad_norm": 4.6875, "learning_rate": 5.041967292215661e-05, "loss": 1.2499, "step": 4445 }, { "epoch": 0.79, "grad_norm": 4.3125, "learning_rate": 5.0399247104539e-05, "loss": 1.238, "step": 4450 }, { "epoch": 0.79, "grad_norm": 4.25, "learning_rate": 5.037880368265169e-05, "loss": 1.296, "step": 4455 }, { "epoch": 0.79, "grad_norm": 4.75, "learning_rate": 5.035834267413708e-05, "loss": 1.3124, "step": 4460 }, { "epoch": 0.79, "grad_norm": 5.09375, "learning_rate": 5.0337864096652734e-05, "loss": 1.3708, "step": 4465 }, { "epoch": 0.79, "grad_norm": 5.09375, "learning_rate": 5.031736796787138e-05, "loss": 1.3199, "step": 4470 }, { "epoch": 0.79, "grad_norm": 4.34375, "learning_rate": 5.02968543054809e-05, "loss": 1.2688, "step": 4475 }, { "epoch": 0.79, "grad_norm": 4.28125, "learning_rate": 5.0276323127184296e-05, "loss": 1.2993, "step": 4480 }, { "epoch": 0.79, "grad_norm": 4.84375, "learning_rate": 5.0255774450699695e-05, "loss": 1.2572, "step": 4485 }, { "epoch": 0.8, "grad_norm": 4.40625, "learning_rate": 5.023520829376033e-05, "loss": 1.2632, "step": 4490 }, { "epoch": 0.8, "grad_norm": 4.3125, "learning_rate": 5.02146246741145e-05, "loss": 1.2746, "step": 4495 }, { "epoch": 0.8, "grad_norm": 4.15625, "learning_rate": 5.0194023609525604e-05, "loss": 1.3369, "step": 4500 }, { "epoch": 0.8, "grad_norm": 4.28125, "learning_rate": 5.017340511777205e-05, "loss": 1.2647, "step": 4505 }, { "epoch": 0.8, "grad_norm": 4.90625, "learning_rate": 5.0152769216647325e-05, "loss": 1.2443, "step": 4510 }, { "epoch": 0.8, "grad_norm": 4.3125, "learning_rate": 5.013211592395993e-05, "loss": 1.3092, "step": 4515 }, { "epoch": 0.8, "grad_norm": 4.6875, "learning_rate": 5.011144525753339e-05, "loss": 1.2733, "step": 4520 }, { "epoch": 0.8, "grad_norm": 4.8125, "learning_rate": 5.009075723520617e-05, "loss": 1.3003, "step": 4525 }, { "epoch": 0.8, "grad_norm": 4.6875, "learning_rate": 5.007005187483178e-05, "loss": 1.2904, "step": 4530 }, { "epoch": 0.8, "grad_norm": 5.15625, "learning_rate": 5.0049329194278636e-05, "loss": 1.346, "step": 4535 }, { "epoch": 0.8, "grad_norm": 4.59375, "learning_rate": 5.0028589211430144e-05, "loss": 1.2387, "step": 4540 }, { "epoch": 0.81, "grad_norm": 4.59375, "learning_rate": 5.0007831944184615e-05, "loss": 1.3197, "step": 4545 }, { "epoch": 0.81, "grad_norm": 5.25, "learning_rate": 4.9987057410455304e-05, "loss": 1.2982, "step": 4550 }, { "epoch": 0.81, "grad_norm": 4.21875, "learning_rate": 4.9966265628170314e-05, "loss": 1.3044, "step": 4555 }, { "epoch": 0.81, "grad_norm": 5.0625, "learning_rate": 4.9945456615272696e-05, "loss": 1.3252, "step": 4560 }, { "epoch": 0.81, "grad_norm": 4.5625, "learning_rate": 4.992463038972032e-05, "loss": 1.2837, "step": 4565 }, { "epoch": 0.81, "grad_norm": 4.875, "learning_rate": 4.9903786969485956e-05, "loss": 1.3049, "step": 4570 }, { "epoch": 0.81, "grad_norm": 5.03125, "learning_rate": 4.9882926372557183e-05, "loss": 1.3271, "step": 4575 }, { "epoch": 0.81, "grad_norm": 4.34375, "learning_rate": 4.986204861693639e-05, "loss": 1.2451, "step": 4580 }, { "epoch": 0.81, "grad_norm": 5.15625, "learning_rate": 4.984115372064081e-05, "loss": 1.3575, "step": 4585 }, { "epoch": 0.81, "grad_norm": 4.53125, "learning_rate": 4.982024170170244e-05, "loss": 1.3739, "step": 4590 }, { "epoch": 0.81, "grad_norm": 4.4375, "learning_rate": 4.9799312578168083e-05, "loss": 1.2905, "step": 4595 }, { "epoch": 0.82, "grad_norm": 5.71875, "learning_rate": 4.977836636809927e-05, "loss": 1.2897, "step": 4600 }, { "epoch": 0.82, "grad_norm": 4.46875, "learning_rate": 4.97574030895723e-05, "loss": 1.3045, "step": 4605 }, { "epoch": 0.82, "grad_norm": 4.34375, "learning_rate": 4.973642276067818e-05, "loss": 1.2284, "step": 4610 }, { "epoch": 0.82, "grad_norm": 3.9375, "learning_rate": 4.971542539952267e-05, "loss": 1.2265, "step": 4615 }, { "epoch": 0.82, "grad_norm": 4.09375, "learning_rate": 4.969441102422618e-05, "loss": 1.2963, "step": 4620 }, { "epoch": 0.82, "grad_norm": 4.25, "learning_rate": 4.9673379652923835e-05, "loss": 1.294, "step": 4625 }, { "epoch": 0.82, "grad_norm": 4.5625, "learning_rate": 4.9652331303765426e-05, "loss": 1.4051, "step": 4630 }, { "epoch": 0.82, "grad_norm": 4.125, "learning_rate": 4.963126599491538e-05, "loss": 1.2579, "step": 4635 }, { "epoch": 0.82, "grad_norm": 4.625, "learning_rate": 4.961018374455277e-05, "loss": 1.3055, "step": 4640 }, { "epoch": 0.82, "grad_norm": 4.21875, "learning_rate": 4.95890845708713e-05, "loss": 1.2871, "step": 4645 }, { "epoch": 0.82, "grad_norm": 4.03125, "learning_rate": 4.956796849207924e-05, "loss": 1.284, "step": 4650 }, { "epoch": 0.82, "grad_norm": 4.25, "learning_rate": 4.95468355263995e-05, "loss": 1.282, "step": 4655 }, { "epoch": 0.83, "grad_norm": 4.75, "learning_rate": 4.952568569206954e-05, "loss": 1.2939, "step": 4660 }, { "epoch": 0.83, "grad_norm": 5.25, "learning_rate": 4.9504519007341355e-05, "loss": 1.2865, "step": 4665 }, { "epoch": 0.83, "grad_norm": 4.9375, "learning_rate": 4.948333549048151e-05, "loss": 1.2585, "step": 4670 }, { "epoch": 0.83, "grad_norm": 4.84375, "learning_rate": 4.946213515977111e-05, "loss": 1.2563, "step": 4675 }, { "epoch": 0.83, "grad_norm": 5.03125, "learning_rate": 4.944091803350573e-05, "loss": 1.2754, "step": 4680 }, { "epoch": 0.83, "grad_norm": 4.875, "learning_rate": 4.9419684129995454e-05, "loss": 1.3099, "step": 4685 }, { "epoch": 0.83, "grad_norm": 4.3125, "learning_rate": 4.939843346756486e-05, "loss": 1.2882, "step": 4690 }, { "epoch": 0.83, "grad_norm": 3.875, "learning_rate": 4.9377166064552976e-05, "loss": 1.2923, "step": 4695 }, { "epoch": 0.83, "grad_norm": 5.0, "learning_rate": 4.9355881939313264e-05, "loss": 1.2603, "step": 4700 }, { "epoch": 0.83, "grad_norm": 4.8125, "learning_rate": 4.933458111021366e-05, "loss": 1.2562, "step": 4705 }, { "epoch": 0.83, "grad_norm": 4.6875, "learning_rate": 4.931326359563645e-05, "loss": 1.3307, "step": 4710 }, { "epoch": 0.84, "grad_norm": 4.40625, "learning_rate": 4.929192941397839e-05, "loss": 1.3033, "step": 4715 }, { "epoch": 0.84, "grad_norm": 4.84375, "learning_rate": 4.927057858365056e-05, "loss": 1.3993, "step": 4720 }, { "epoch": 0.84, "grad_norm": 4.875, "learning_rate": 4.9249211123078454e-05, "loss": 1.3516, "step": 4725 }, { "epoch": 0.84, "grad_norm": 4.625, "learning_rate": 4.922782705070187e-05, "loss": 1.3344, "step": 4730 }, { "epoch": 0.84, "grad_norm": 5.1875, "learning_rate": 4.9206426384975e-05, "loss": 1.2776, "step": 4735 }, { "epoch": 0.84, "grad_norm": 4.96875, "learning_rate": 4.9185009144366305e-05, "loss": 1.3762, "step": 4740 }, { "epoch": 0.84, "grad_norm": 4.9375, "learning_rate": 4.9163575347358565e-05, "loss": 1.2513, "step": 4745 }, { "epoch": 0.84, "grad_norm": 5.03125, "learning_rate": 4.9142125012448864e-05, "loss": 1.2112, "step": 4750 }, { "epoch": 0.84, "grad_norm": 4.1875, "learning_rate": 4.912065815814854e-05, "loss": 1.29, "step": 4755 }, { "epoch": 0.84, "grad_norm": 4.0625, "learning_rate": 4.90991748029832e-05, "loss": 1.3301, "step": 4760 }, { "epoch": 0.84, "grad_norm": 5.25, "learning_rate": 4.9077674965492675e-05, "loss": 1.3531, "step": 4765 }, { "epoch": 0.85, "grad_norm": 4.90625, "learning_rate": 4.905615866423103e-05, "loss": 1.32, "step": 4770 }, { "epoch": 0.85, "grad_norm": 4.875, "learning_rate": 4.903462591776655e-05, "loss": 1.2463, "step": 4775 }, { "epoch": 0.85, "grad_norm": 4.34375, "learning_rate": 4.901307674468168e-05, "loss": 1.2764, "step": 4780 }, { "epoch": 0.85, "grad_norm": 3.84375, "learning_rate": 4.899151116357307e-05, "loss": 1.3328, "step": 4785 }, { "epoch": 0.85, "grad_norm": 4.78125, "learning_rate": 4.896992919305153e-05, "loss": 1.3076, "step": 4790 }, { "epoch": 0.85, "grad_norm": 3.984375, "learning_rate": 4.894833085174197e-05, "loss": 1.3817, "step": 4795 }, { "epoch": 0.85, "grad_norm": 4.3125, "learning_rate": 4.89267161582835e-05, "loss": 1.2894, "step": 4800 }, { "epoch": 0.85, "grad_norm": 4.59375, "learning_rate": 4.8905085131329287e-05, "loss": 1.2889, "step": 4805 }, { "epoch": 0.85, "grad_norm": 4.96875, "learning_rate": 4.8883437789546605e-05, "loss": 1.3055, "step": 4810 }, { "epoch": 0.85, "grad_norm": 4.40625, "learning_rate": 4.886177415161681e-05, "loss": 1.2963, "step": 4815 }, { "epoch": 0.85, "grad_norm": 5.15625, "learning_rate": 4.884009423623534e-05, "loss": 1.3166, "step": 4820 }, { "epoch": 0.86, "grad_norm": 4.78125, "learning_rate": 4.881839806211167e-05, "loss": 1.272, "step": 4825 }, { "epoch": 0.86, "grad_norm": 4.6875, "learning_rate": 4.8796685647969267e-05, "loss": 1.253, "step": 4830 }, { "epoch": 0.86, "grad_norm": 4.75, "learning_rate": 4.877495701254566e-05, "loss": 1.3501, "step": 4835 }, { "epoch": 0.86, "grad_norm": 5.15625, "learning_rate": 4.875321217459237e-05, "loss": 1.239, "step": 4840 }, { "epoch": 0.86, "grad_norm": 4.59375, "learning_rate": 4.87314511528749e-05, "loss": 1.3132, "step": 4845 }, { "epoch": 0.86, "grad_norm": 4.28125, "learning_rate": 4.870967396617268e-05, "loss": 1.2862, "step": 4850 }, { "epoch": 0.86, "grad_norm": 4.5, "learning_rate": 4.868788063327915e-05, "loss": 1.2742, "step": 4855 }, { "epoch": 0.86, "grad_norm": 4.0625, "learning_rate": 4.866607117300166e-05, "loss": 1.3141, "step": 4860 }, { "epoch": 0.86, "grad_norm": 4.4375, "learning_rate": 4.864424560416146e-05, "loss": 1.3136, "step": 4865 }, { "epoch": 0.86, "grad_norm": 4.125, "learning_rate": 4.862240394559372e-05, "loss": 1.3039, "step": 4870 }, { "epoch": 0.86, "grad_norm": 4.625, "learning_rate": 4.8600546216147496e-05, "loss": 1.1936, "step": 4875 }, { "epoch": 0.86, "grad_norm": 4.46875, "learning_rate": 4.85786724346857e-05, "loss": 1.2879, "step": 4880 }, { "epoch": 0.87, "grad_norm": 5.5, "learning_rate": 4.8556782620085106e-05, "loss": 1.2209, "step": 4885 }, { "epoch": 0.87, "grad_norm": 4.90625, "learning_rate": 4.8534876791236335e-05, "loss": 1.3453, "step": 4890 }, { "epoch": 0.87, "grad_norm": 4.71875, "learning_rate": 4.8512954967043796e-05, "loss": 1.3152, "step": 4895 }, { "epoch": 0.87, "grad_norm": 4.28125, "learning_rate": 4.849101716642573e-05, "loss": 1.3382, "step": 4900 }, { "epoch": 0.87, "grad_norm": 5.0625, "learning_rate": 4.846906340831417e-05, "loss": 1.248, "step": 4905 }, { "epoch": 0.87, "grad_norm": 4.3125, "learning_rate": 4.844709371165489e-05, "loss": 1.2956, "step": 4910 }, { "epoch": 0.87, "grad_norm": 4.0, "learning_rate": 4.842510809540744e-05, "loss": 1.2005, "step": 4915 }, { "epoch": 0.87, "grad_norm": 4.78125, "learning_rate": 4.8403106578545106e-05, "loss": 1.321, "step": 4920 }, { "epoch": 0.87, "grad_norm": 4.71875, "learning_rate": 4.8381089180054905e-05, "loss": 1.3446, "step": 4925 }, { "epoch": 0.87, "grad_norm": 3.6875, "learning_rate": 4.835905591893753e-05, "loss": 1.247, "step": 4930 }, { "epoch": 0.87, "grad_norm": 4.28125, "learning_rate": 4.8337006814207396e-05, "loss": 1.3416, "step": 4935 }, { "epoch": 0.88, "grad_norm": 5.65625, "learning_rate": 4.831494188489257e-05, "loss": 1.3565, "step": 4940 }, { "epoch": 0.88, "grad_norm": 4.40625, "learning_rate": 4.829286115003478e-05, "loss": 1.2465, "step": 4945 }, { "epoch": 0.88, "grad_norm": 6.78125, "learning_rate": 4.827076462868941e-05, "loss": 1.2708, "step": 4950 }, { "epoch": 0.88, "grad_norm": 4.90625, "learning_rate": 4.8248652339925455e-05, "loss": 1.2907, "step": 4955 }, { "epoch": 0.88, "grad_norm": 3.890625, "learning_rate": 4.822652430282549e-05, "loss": 1.3552, "step": 4960 }, { "epoch": 0.88, "grad_norm": 3.96875, "learning_rate": 4.820438053648573e-05, "loss": 1.2673, "step": 4965 }, { "epoch": 0.88, "grad_norm": 4.75, "learning_rate": 4.8182221060015935e-05, "loss": 1.2592, "step": 4970 }, { "epoch": 0.88, "grad_norm": 4.40625, "learning_rate": 4.816004589253943e-05, "loss": 1.3148, "step": 4975 }, { "epoch": 0.88, "grad_norm": 4.8125, "learning_rate": 4.813785505319307e-05, "loss": 1.2941, "step": 4980 }, { "epoch": 0.88, "grad_norm": 5.03125, "learning_rate": 4.811564856112726e-05, "loss": 1.2849, "step": 4985 }, { "epoch": 0.88, "grad_norm": 4.0625, "learning_rate": 4.809342643550588e-05, "loss": 1.214, "step": 4990 }, { "epoch": 0.89, "grad_norm": 4.25, "learning_rate": 4.807118869550634e-05, "loss": 1.2822, "step": 4995 }, { "epoch": 0.89, "grad_norm": 5.03125, "learning_rate": 4.804893536031948e-05, "loss": 1.2748, "step": 5000 }, { "epoch": 0.89, "grad_norm": 4.5, "learning_rate": 4.802666644914964e-05, "loss": 1.2134, "step": 5005 }, { "epoch": 0.89, "grad_norm": 4.75, "learning_rate": 4.8004381981214566e-05, "loss": 1.2885, "step": 5010 }, { "epoch": 0.89, "grad_norm": 4.75, "learning_rate": 4.7982081975745476e-05, "loss": 1.226, "step": 5015 }, { "epoch": 0.89, "grad_norm": 5.6875, "learning_rate": 4.7959766451986947e-05, "loss": 1.2971, "step": 5020 }, { "epoch": 0.89, "grad_norm": 4.28125, "learning_rate": 4.793743542919697e-05, "loss": 1.2564, "step": 5025 }, { "epoch": 0.89, "grad_norm": 4.0, "learning_rate": 4.791508892664691e-05, "loss": 1.2823, "step": 5030 }, { "epoch": 0.89, "grad_norm": 5.03125, "learning_rate": 4.7892726963621504e-05, "loss": 1.306, "step": 5035 }, { "epoch": 0.89, "grad_norm": 4.8125, "learning_rate": 4.78703495594188e-05, "loss": 1.3166, "step": 5040 }, { "epoch": 0.89, "grad_norm": 4.78125, "learning_rate": 4.78479567333502e-05, "loss": 1.2067, "step": 5045 }, { "epoch": 0.89, "grad_norm": 4.25, "learning_rate": 4.782554850474042e-05, "loss": 1.3576, "step": 5050 }, { "epoch": 0.9, "grad_norm": 4.03125, "learning_rate": 4.780312489292742e-05, "loss": 1.2865, "step": 5055 }, { "epoch": 0.9, "grad_norm": 4.0, "learning_rate": 4.778068591726249e-05, "loss": 1.3035, "step": 5060 }, { "epoch": 0.9, "grad_norm": 4.4375, "learning_rate": 4.775823159711016e-05, "loss": 1.2351, "step": 5065 }, { "epoch": 0.9, "grad_norm": 4.84375, "learning_rate": 4.773576195184819e-05, "loss": 1.2538, "step": 5070 }, { "epoch": 0.9, "grad_norm": 4.15625, "learning_rate": 4.771327700086758e-05, "loss": 1.3922, "step": 5075 }, { "epoch": 0.9, "grad_norm": 4.34375, "learning_rate": 4.769077676357253e-05, "loss": 1.2832, "step": 5080 }, { "epoch": 0.9, "grad_norm": 4.9375, "learning_rate": 4.766826125938043e-05, "loss": 1.2454, "step": 5085 }, { "epoch": 0.9, "grad_norm": 4.21875, "learning_rate": 4.764573050772187e-05, "loss": 1.2927, "step": 5090 }, { "epoch": 0.9, "grad_norm": 4.6875, "learning_rate": 4.7623184528040554e-05, "loss": 1.3244, "step": 5095 }, { "epoch": 0.9, "grad_norm": 3.953125, "learning_rate": 4.7600623339793375e-05, "loss": 1.2799, "step": 5100 }, { "epoch": 0.9, "grad_norm": 3.953125, "learning_rate": 4.757804696245031e-05, "loss": 1.3255, "step": 5105 }, { "epoch": 0.91, "grad_norm": 4.84375, "learning_rate": 4.755545541549447e-05, "loss": 1.3351, "step": 5110 }, { "epoch": 0.91, "grad_norm": 3.9375, "learning_rate": 4.753284871842205e-05, "loss": 1.3204, "step": 5115 }, { "epoch": 0.91, "grad_norm": 3.734375, "learning_rate": 4.751022689074232e-05, "loss": 1.2998, "step": 5120 }, { "epoch": 0.91, "grad_norm": 4.40625, "learning_rate": 4.7487589951977604e-05, "loss": 1.2709, "step": 5125 }, { "epoch": 0.91, "grad_norm": 4.1875, "learning_rate": 4.7464937921663267e-05, "loss": 1.3357, "step": 5130 }, { "epoch": 0.91, "grad_norm": 4.4375, "learning_rate": 4.7442270819347705e-05, "loss": 1.3468, "step": 5135 }, { "epoch": 0.91, "grad_norm": 5.1875, "learning_rate": 4.741958866459231e-05, "loss": 1.2886, "step": 5140 }, { "epoch": 0.91, "grad_norm": 4.8125, "learning_rate": 4.739689147697147e-05, "loss": 1.348, "step": 5145 }, { "epoch": 0.91, "grad_norm": 4.46875, "learning_rate": 4.7374179276072554e-05, "loss": 1.2766, "step": 5150 }, { "epoch": 0.91, "grad_norm": 4.4375, "learning_rate": 4.735145208149587e-05, "loss": 1.2971, "step": 5155 }, { "epoch": 0.91, "grad_norm": 4.96875, "learning_rate": 4.7328709912854676e-05, "loss": 1.301, "step": 5160 }, { "epoch": 0.92, "grad_norm": 5.0625, "learning_rate": 4.730595278977514e-05, "loss": 1.2901, "step": 5165 }, { "epoch": 0.92, "grad_norm": 4.28125, "learning_rate": 4.728318073189636e-05, "loss": 1.2202, "step": 5170 }, { "epoch": 0.92, "grad_norm": 5.40625, "learning_rate": 4.726039375887032e-05, "loss": 1.2634, "step": 5175 }, { "epoch": 0.92, "grad_norm": 4.40625, "learning_rate": 4.7237591890361846e-05, "loss": 1.3181, "step": 5180 }, { "epoch": 0.92, "grad_norm": 4.125, "learning_rate": 4.721477514604864e-05, "loss": 1.287, "step": 5185 }, { "epoch": 0.92, "grad_norm": 4.8125, "learning_rate": 4.719194354562124e-05, "loss": 1.2871, "step": 5190 }, { "epoch": 0.92, "grad_norm": 4.6875, "learning_rate": 4.7169097108783e-05, "loss": 1.2688, "step": 5195 }, { "epoch": 0.92, "grad_norm": 3.765625, "learning_rate": 4.714623585525009e-05, "loss": 1.2646, "step": 5200 }, { "epoch": 0.92, "grad_norm": 4.09375, "learning_rate": 4.7123359804751446e-05, "loss": 1.2724, "step": 5205 }, { "epoch": 0.92, "grad_norm": 4.75, "learning_rate": 4.7100468977028794e-05, "loss": 1.2753, "step": 5210 }, { "epoch": 0.92, "grad_norm": 3.953125, "learning_rate": 4.7077563391836605e-05, "loss": 1.2553, "step": 5215 }, { "epoch": 0.93, "grad_norm": 4.90625, "learning_rate": 4.705464306894207e-05, "loss": 1.3248, "step": 5220 }, { "epoch": 0.93, "grad_norm": 4.5, "learning_rate": 4.703170802812512e-05, "loss": 1.2768, "step": 5225 }, { "epoch": 0.93, "grad_norm": 4.78125, "learning_rate": 4.700875828917839e-05, "loss": 1.2913, "step": 5230 }, { "epoch": 0.93, "grad_norm": 4.65625, "learning_rate": 4.6985793871907175e-05, "loss": 1.2903, "step": 5235 }, { "epoch": 0.93, "grad_norm": 4.3125, "learning_rate": 4.696281479612946e-05, "loss": 1.2847, "step": 5240 }, { "epoch": 0.93, "grad_norm": 4.5, "learning_rate": 4.6939821081675855e-05, "loss": 1.3054, "step": 5245 }, { "epoch": 0.93, "grad_norm": 5.375, "learning_rate": 4.6916812748389656e-05, "loss": 1.308, "step": 5250 }, { "epoch": 0.93, "grad_norm": 4.46875, "learning_rate": 4.68937898161267e-05, "loss": 1.3282, "step": 5255 }, { "epoch": 0.93, "grad_norm": 5.15625, "learning_rate": 4.6870752304755475e-05, "loss": 1.217, "step": 5260 }, { "epoch": 0.93, "grad_norm": 4.625, "learning_rate": 4.684770023415704e-05, "loss": 1.2743, "step": 5265 }, { "epoch": 0.93, "grad_norm": 4.5625, "learning_rate": 4.6824633624225006e-05, "loss": 1.298, "step": 5270 }, { "epoch": 0.93, "grad_norm": 4.09375, "learning_rate": 4.680155249486555e-05, "loss": 1.3485, "step": 5275 }, { "epoch": 0.94, "grad_norm": 5.1875, "learning_rate": 4.677845686599737e-05, "loss": 1.2938, "step": 5280 }, { "epoch": 0.94, "grad_norm": 4.71875, "learning_rate": 4.675534675755167e-05, "loss": 1.3163, "step": 5285 }, { "epoch": 0.94, "grad_norm": 4.875, "learning_rate": 4.673222218947215e-05, "loss": 1.2243, "step": 5290 }, { "epoch": 0.94, "grad_norm": 4.78125, "learning_rate": 4.670908318171501e-05, "loss": 1.1996, "step": 5295 }, { "epoch": 0.94, "grad_norm": 4.28125, "learning_rate": 4.668592975424888e-05, "loss": 1.256, "step": 5300 }, { "epoch": 0.94, "grad_norm": 4.40625, "learning_rate": 4.666276192705486e-05, "loss": 1.2628, "step": 5305 }, { "epoch": 0.94, "grad_norm": 5.1875, "learning_rate": 4.6639579720126455e-05, "loss": 1.3257, "step": 5310 }, { "epoch": 0.94, "grad_norm": 5.375, "learning_rate": 4.661638315346961e-05, "loss": 1.2853, "step": 5315 }, { "epoch": 0.94, "grad_norm": 4.03125, "learning_rate": 4.659317224710263e-05, "loss": 1.3387, "step": 5320 }, { "epoch": 0.94, "grad_norm": 4.25, "learning_rate": 4.6569947021056194e-05, "loss": 1.3179, "step": 5325 }, { "epoch": 0.94, "grad_norm": 4.78125, "learning_rate": 4.654670749537338e-05, "loss": 1.2654, "step": 5330 }, { "epoch": 0.95, "grad_norm": 4.125, "learning_rate": 4.652345369010956e-05, "loss": 1.2965, "step": 5335 }, { "epoch": 0.95, "grad_norm": 4.34375, "learning_rate": 4.650018562533246e-05, "loss": 1.3444, "step": 5340 }, { "epoch": 0.95, "grad_norm": 4.8125, "learning_rate": 4.647690332112209e-05, "loss": 1.2672, "step": 5345 }, { "epoch": 0.95, "grad_norm": 4.75, "learning_rate": 4.645360679757077e-05, "loss": 1.3395, "step": 5350 }, { "epoch": 0.95, "grad_norm": 4.34375, "learning_rate": 4.6430296074783066e-05, "loss": 1.2981, "step": 5355 }, { "epoch": 0.95, "grad_norm": 4.46875, "learning_rate": 4.6406971172875825e-05, "loss": 1.2583, "step": 5360 }, { "epoch": 0.95, "grad_norm": 4.40625, "learning_rate": 4.638363211197811e-05, "loss": 1.3008, "step": 5365 }, { "epoch": 0.95, "grad_norm": 4.09375, "learning_rate": 4.63602789122312e-05, "loss": 1.2887, "step": 5370 }, { "epoch": 0.95, "grad_norm": 4.625, "learning_rate": 4.63369115937886e-05, "loss": 1.3329, "step": 5375 }, { "epoch": 0.95, "grad_norm": 5.0625, "learning_rate": 4.631353017681597e-05, "loss": 1.2983, "step": 5380 }, { "epoch": 0.95, "grad_norm": 4.15625, "learning_rate": 4.6290134681491165e-05, "loss": 1.2293, "step": 5385 }, { "epoch": 0.96, "grad_norm": 4.0625, "learning_rate": 4.626672512800417e-05, "loss": 1.3206, "step": 5390 }, { "epoch": 0.96, "grad_norm": 3.5, "learning_rate": 4.6243301536557114e-05, "loss": 1.2805, "step": 5395 }, { "epoch": 0.96, "grad_norm": 5.09375, "learning_rate": 4.621986392736423e-05, "loss": 1.352, "step": 5400 }, { "epoch": 0.96, "grad_norm": 4.875, "learning_rate": 4.619641232065184e-05, "loss": 1.2629, "step": 5405 }, { "epoch": 0.96, "grad_norm": 4.75, "learning_rate": 4.6172946736658366e-05, "loss": 1.295, "step": 5410 }, { "epoch": 0.96, "grad_norm": 5.28125, "learning_rate": 4.61494671956343e-05, "loss": 1.2634, "step": 5415 }, { "epoch": 0.96, "grad_norm": 4.90625, "learning_rate": 4.6125973717842145e-05, "loss": 1.2701, "step": 5420 }, { "epoch": 0.96, "grad_norm": 4.75, "learning_rate": 4.6102466323556464e-05, "loss": 1.2746, "step": 5425 }, { "epoch": 0.96, "grad_norm": 6.15625, "learning_rate": 4.607894503306379e-05, "loss": 1.403, "step": 5430 }, { "epoch": 0.96, "grad_norm": 4.09375, "learning_rate": 4.605540986666269e-05, "loss": 1.2862, "step": 5435 }, { "epoch": 0.96, "grad_norm": 4.25, "learning_rate": 4.6031860844663694e-05, "loss": 1.3084, "step": 5440 }, { "epoch": 0.96, "grad_norm": 3.96875, "learning_rate": 4.600829798738927e-05, "loss": 1.3343, "step": 5445 }, { "epoch": 0.97, "grad_norm": 5.4375, "learning_rate": 4.5984721315173856e-05, "loss": 1.3125, "step": 5450 }, { "epoch": 0.97, "grad_norm": 4.03125, "learning_rate": 4.5961130848363776e-05, "loss": 1.2542, "step": 5455 }, { "epoch": 0.97, "grad_norm": 5.09375, "learning_rate": 4.593752660731731e-05, "loss": 1.2577, "step": 5460 }, { "epoch": 0.97, "grad_norm": 4.1875, "learning_rate": 4.591390861240455e-05, "loss": 1.2446, "step": 5465 }, { "epoch": 0.97, "grad_norm": 4.40625, "learning_rate": 4.589027688400754e-05, "loss": 1.2402, "step": 5470 }, { "epoch": 0.97, "grad_norm": 5.40625, "learning_rate": 4.586663144252011e-05, "loss": 1.2589, "step": 5475 }, { "epoch": 0.97, "grad_norm": 6.125, "learning_rate": 4.584297230834797e-05, "loss": 1.2659, "step": 5480 }, { "epoch": 0.97, "grad_norm": 5.09375, "learning_rate": 4.581929950190861e-05, "loss": 1.2953, "step": 5485 }, { "epoch": 0.97, "grad_norm": 3.796875, "learning_rate": 4.579561304363135e-05, "loss": 1.2839, "step": 5490 }, { "epoch": 0.97, "grad_norm": 4.75, "learning_rate": 4.577191295395728e-05, "loss": 1.2788, "step": 5495 }, { "epoch": 0.97, "grad_norm": 4.90625, "learning_rate": 4.574819925333923e-05, "loss": 1.2983, "step": 5500 }, { "epoch": 0.98, "grad_norm": 4.4375, "learning_rate": 4.5724471962241815e-05, "loss": 1.3001, "step": 5505 }, { "epoch": 0.98, "grad_norm": 5.0, "learning_rate": 4.5700731101141364e-05, "loss": 1.3513, "step": 5510 }, { "epoch": 0.98, "grad_norm": 4.25, "learning_rate": 4.5676976690525895e-05, "loss": 1.2569, "step": 5515 }, { "epoch": 0.98, "grad_norm": 4.5, "learning_rate": 4.565320875089516e-05, "loss": 1.3074, "step": 5520 }, { "epoch": 0.98, "grad_norm": 4.75, "learning_rate": 4.562942730276055e-05, "loss": 1.2869, "step": 5525 }, { "epoch": 0.98, "grad_norm": 4.21875, "learning_rate": 4.560563236664513e-05, "loss": 1.2873, "step": 5530 }, { "epoch": 0.98, "grad_norm": 4.46875, "learning_rate": 4.558182396308361e-05, "loss": 1.2841, "step": 5535 }, { "epoch": 0.98, "grad_norm": 4.78125, "learning_rate": 4.555800211262231e-05, "loss": 1.3453, "step": 5540 }, { "epoch": 0.98, "grad_norm": 5.03125, "learning_rate": 4.5534166835819155e-05, "loss": 1.2259, "step": 5545 }, { "epoch": 0.98, "grad_norm": 4.625, "learning_rate": 4.551031815324367e-05, "loss": 1.2489, "step": 5550 }, { "epoch": 0.98, "grad_norm": 4.6875, "learning_rate": 4.548645608547692e-05, "loss": 1.3278, "step": 5555 }, { "epoch": 0.99, "grad_norm": 4.96875, "learning_rate": 4.5462580653111563e-05, "loss": 1.3266, "step": 5560 }, { "epoch": 0.99, "grad_norm": 4.34375, "learning_rate": 4.543869187675176e-05, "loss": 1.3331, "step": 5565 }, { "epoch": 0.99, "grad_norm": 4.75, "learning_rate": 4.541478977701321e-05, "loss": 1.2837, "step": 5570 }, { "epoch": 0.99, "grad_norm": 4.8125, "learning_rate": 4.5390874374523066e-05, "loss": 1.2806, "step": 5575 }, { "epoch": 0.99, "grad_norm": 4.5, "learning_rate": 4.536694568992002e-05, "loss": 1.3459, "step": 5580 }, { "epoch": 0.99, "grad_norm": 4.6875, "learning_rate": 4.534300374385418e-05, "loss": 1.2935, "step": 5585 }, { "epoch": 0.99, "grad_norm": 4.5, "learning_rate": 4.531904855698714e-05, "loss": 1.2311, "step": 5590 }, { "epoch": 0.99, "grad_norm": 4.15625, "learning_rate": 4.529508014999187e-05, "loss": 1.2909, "step": 5595 }, { "epoch": 0.99, "grad_norm": 4.28125, "learning_rate": 4.5271098543552774e-05, "loss": 1.2652, "step": 5600 }, { "epoch": 0.99, "grad_norm": 4.5625, "learning_rate": 4.524710375836568e-05, "loss": 1.2896, "step": 5605 }, { "epoch": 0.99, "grad_norm": 4.4375, "learning_rate": 4.522309581513773e-05, "loss": 1.2788, "step": 5610 }, { "epoch": 1.0, "grad_norm": 5.0, "learning_rate": 4.5199074734587454e-05, "loss": 1.2915, "step": 5615 }, { "epoch": 1.0, "grad_norm": 4.625, "learning_rate": 4.5175040537444726e-05, "loss": 1.2556, "step": 5620 }, { "epoch": 1.0, "grad_norm": 5.09375, "learning_rate": 4.51509932444507e-05, "loss": 1.3896, "step": 5625 }, { "epoch": 1.0, "grad_norm": 4.21875, "learning_rate": 4.5126932876357896e-05, "loss": 1.2715, "step": 5630 }, { "epoch": 1.0, "grad_norm": 4.375, "learning_rate": 4.510285945393005e-05, "loss": 1.3126, "step": 5635 }, { "epoch": 1.0, "grad_norm": 4.5, "learning_rate": 4.5078772997942224e-05, "loss": 1.3096, "step": 5640 }, { "epoch": 1.0, "grad_norm": 3.890625, "learning_rate": 4.5054673529180676e-05, "loss": 1.2639, "step": 5645 }, { "epoch": 1.0, "grad_norm": 4.5, "learning_rate": 4.503056106844291e-05, "loss": 1.1886, "step": 5650 }, { "epoch": 1.0, "grad_norm": 3.953125, "learning_rate": 4.500643563653767e-05, "loss": 1.1993, "step": 5655 }, { "epoch": 1.0, "grad_norm": 4.1875, "learning_rate": 4.498229725428486e-05, "loss": 1.2049, "step": 5660 }, { "epoch": 1.0, "grad_norm": 4.65625, "learning_rate": 4.4958145942515565e-05, "loss": 1.2029, "step": 5665 }, { "epoch": 1.0, "grad_norm": 4.84375, "learning_rate": 4.493398172207205e-05, "loss": 1.1845, "step": 5670 }, { "epoch": 1.01, "grad_norm": 4.25, "learning_rate": 4.49098046138077e-05, "loss": 1.2204, "step": 5675 }, { "epoch": 1.01, "grad_norm": 3.9375, "learning_rate": 4.4885614638587024e-05, "loss": 1.2347, "step": 5680 }, { "epoch": 1.01, "grad_norm": 5.53125, "learning_rate": 4.486141181728564e-05, "loss": 1.312, "step": 5685 }, { "epoch": 1.01, "grad_norm": 4.65625, "learning_rate": 4.483719617079024e-05, "loss": 1.2256, "step": 5690 }, { "epoch": 1.01, "grad_norm": 4.21875, "learning_rate": 4.481296771999861e-05, "loss": 1.2535, "step": 5695 }, { "epoch": 1.01, "grad_norm": 4.78125, "learning_rate": 4.478872648581956e-05, "loss": 1.2195, "step": 5700 }, { "epoch": 1.01, "grad_norm": 4.5625, "learning_rate": 4.476447248917295e-05, "loss": 1.2377, "step": 5705 }, { "epoch": 1.01, "grad_norm": 4.21875, "learning_rate": 4.4740205750989626e-05, "loss": 1.1829, "step": 5710 }, { "epoch": 1.01, "grad_norm": 4.21875, "learning_rate": 4.471592629221148e-05, "loss": 1.2236, "step": 5715 }, { "epoch": 1.01, "grad_norm": 5.09375, "learning_rate": 4.469163413379132e-05, "loss": 1.2187, "step": 5720 }, { "epoch": 1.01, "grad_norm": 4.875, "learning_rate": 4.4667329296692966e-05, "loss": 1.2311, "step": 5725 }, { "epoch": 1.02, "grad_norm": 4.125, "learning_rate": 4.464301180189115e-05, "loss": 1.2426, "step": 5730 }, { "epoch": 1.02, "grad_norm": 5.03125, "learning_rate": 4.461868167037154e-05, "loss": 1.1811, "step": 5735 }, { "epoch": 1.02, "grad_norm": 4.625, "learning_rate": 4.459433892313069e-05, "loss": 1.2966, "step": 5740 }, { "epoch": 1.02, "grad_norm": 4.375, "learning_rate": 4.456998358117608e-05, "loss": 1.2073, "step": 5745 }, { "epoch": 1.02, "grad_norm": 4.78125, "learning_rate": 4.454561566552602e-05, "loss": 1.2092, "step": 5750 }, { "epoch": 1.02, "grad_norm": 4.8125, "learning_rate": 4.4521235197209686e-05, "loss": 1.2631, "step": 5755 }, { "epoch": 1.02, "grad_norm": 4.65625, "learning_rate": 4.449684219726709e-05, "loss": 1.2517, "step": 5760 }, { "epoch": 1.02, "grad_norm": 4.5625, "learning_rate": 4.447243668674905e-05, "loss": 1.2079, "step": 5765 }, { "epoch": 1.02, "grad_norm": 4.6875, "learning_rate": 4.44480186867172e-05, "loss": 1.2512, "step": 5770 }, { "epoch": 1.02, "grad_norm": 4.1875, "learning_rate": 4.442358821824392e-05, "loss": 1.2197, "step": 5775 }, { "epoch": 1.02, "grad_norm": 4.0625, "learning_rate": 4.439914530241238e-05, "loss": 1.2712, "step": 5780 }, { "epoch": 1.03, "grad_norm": 4.09375, "learning_rate": 4.4374689960316486e-05, "loss": 1.2516, "step": 5785 }, { "epoch": 1.03, "grad_norm": 4.75, "learning_rate": 4.435022221306084e-05, "loss": 1.2293, "step": 5790 }, { "epoch": 1.03, "grad_norm": 3.96875, "learning_rate": 4.4325742081760793e-05, "loss": 1.275, "step": 5795 }, { "epoch": 1.03, "grad_norm": 5.0, "learning_rate": 4.430124958754235e-05, "loss": 1.2475, "step": 5800 }, { "epoch": 1.03, "grad_norm": 4.4375, "learning_rate": 4.4276744751542196e-05, "loss": 1.1762, "step": 5805 }, { "epoch": 1.03, "grad_norm": 4.65625, "learning_rate": 4.4252227594907685e-05, "loss": 1.2229, "step": 5810 }, { "epoch": 1.03, "grad_norm": 4.59375, "learning_rate": 4.422769813879676e-05, "loss": 1.2236, "step": 5815 }, { "epoch": 1.03, "grad_norm": 4.6875, "learning_rate": 4.4203156404378025e-05, "loss": 1.2489, "step": 5820 }, { "epoch": 1.03, "grad_norm": 4.65625, "learning_rate": 4.417860241283064e-05, "loss": 1.2097, "step": 5825 }, { "epoch": 1.03, "grad_norm": 4.78125, "learning_rate": 4.415403618534439e-05, "loss": 1.2719, "step": 5830 }, { "epoch": 1.03, "grad_norm": 4.28125, "learning_rate": 4.412945774311956e-05, "loss": 1.276, "step": 5835 }, { "epoch": 1.03, "grad_norm": 4.125, "learning_rate": 4.410486710736704e-05, "loss": 1.2583, "step": 5840 }, { "epoch": 1.04, "grad_norm": 4.5, "learning_rate": 4.4080264299308166e-05, "loss": 1.2111, "step": 5845 }, { "epoch": 1.04, "grad_norm": 4.6875, "learning_rate": 4.405564934017487e-05, "loss": 1.2251, "step": 5850 }, { "epoch": 1.04, "grad_norm": 4.4375, "learning_rate": 4.4031022251209494e-05, "loss": 1.2126, "step": 5855 }, { "epoch": 1.04, "grad_norm": 4.53125, "learning_rate": 4.400638305366489e-05, "loss": 1.1829, "step": 5860 }, { "epoch": 1.04, "grad_norm": 4.15625, "learning_rate": 4.3981731768804336e-05, "loss": 1.2408, "step": 5865 }, { "epoch": 1.04, "grad_norm": 4.4375, "learning_rate": 4.395706841790157e-05, "loss": 1.1975, "step": 5870 }, { "epoch": 1.04, "grad_norm": 4.8125, "learning_rate": 4.393239302224071e-05, "loss": 1.3135, "step": 5875 }, { "epoch": 1.04, "grad_norm": 4.375, "learning_rate": 4.390770560311628e-05, "loss": 1.1736, "step": 5880 }, { "epoch": 1.04, "grad_norm": 4.21875, "learning_rate": 4.388300618183319e-05, "loss": 1.2355, "step": 5885 }, { "epoch": 1.04, "grad_norm": 5.15625, "learning_rate": 4.38582947797067e-05, "loss": 1.2522, "step": 5890 }, { "epoch": 1.04, "grad_norm": 4.875, "learning_rate": 4.3833571418062416e-05, "loss": 1.2311, "step": 5895 }, { "epoch": 1.05, "grad_norm": 5.1875, "learning_rate": 4.380883611823624e-05, "loss": 1.2135, "step": 5900 }, { "epoch": 1.05, "grad_norm": 4.21875, "learning_rate": 4.3784088901574415e-05, "loss": 1.2067, "step": 5905 }, { "epoch": 1.05, "grad_norm": 4.8125, "learning_rate": 4.375932978943345e-05, "loss": 1.2137, "step": 5910 }, { "epoch": 1.05, "grad_norm": 4.625, "learning_rate": 4.373455880318011e-05, "loss": 1.1993, "step": 5915 }, { "epoch": 1.05, "grad_norm": 4.3125, "learning_rate": 4.370977596419141e-05, "loss": 1.2493, "step": 5920 }, { "epoch": 1.05, "grad_norm": 4.5625, "learning_rate": 4.368498129385461e-05, "loss": 1.3239, "step": 5925 }, { "epoch": 1.05, "grad_norm": 4.34375, "learning_rate": 4.366017481356716e-05, "loss": 1.253, "step": 5930 }, { "epoch": 1.05, "grad_norm": 4.28125, "learning_rate": 4.363535654473672e-05, "loss": 1.2516, "step": 5935 }, { "epoch": 1.05, "grad_norm": 6.1875, "learning_rate": 4.361052650878111e-05, "loss": 1.2904, "step": 5940 }, { "epoch": 1.05, "grad_norm": 4.6875, "learning_rate": 4.358568472712831e-05, "loss": 1.2502, "step": 5945 }, { "epoch": 1.05, "grad_norm": 5.46875, "learning_rate": 4.356083122121644e-05, "loss": 1.2033, "step": 5950 }, { "epoch": 1.06, "grad_norm": 5.1875, "learning_rate": 4.353596601249372e-05, "loss": 1.205, "step": 5955 }, { "epoch": 1.06, "grad_norm": 5.46875, "learning_rate": 4.35110891224185e-05, "loss": 1.1544, "step": 5960 }, { "epoch": 1.06, "grad_norm": 4.96875, "learning_rate": 4.34862005724592e-05, "loss": 1.2133, "step": 5965 }, { "epoch": 1.06, "grad_norm": 5.78125, "learning_rate": 4.3461300384094256e-05, "loss": 1.1757, "step": 5970 }, { "epoch": 1.06, "grad_norm": 4.25, "learning_rate": 4.3436388578812236e-05, "loss": 1.1731, "step": 5975 }, { "epoch": 1.06, "grad_norm": 5.40625, "learning_rate": 4.3411465178111656e-05, "loss": 1.2755, "step": 5980 }, { "epoch": 1.06, "grad_norm": 4.03125, "learning_rate": 4.338653020350108e-05, "loss": 1.3168, "step": 5985 }, { "epoch": 1.06, "grad_norm": 6.03125, "learning_rate": 4.336158367649905e-05, "loss": 1.2721, "step": 5990 }, { "epoch": 1.06, "grad_norm": 4.78125, "learning_rate": 4.333662561863406e-05, "loss": 1.1861, "step": 5995 }, { "epoch": 1.06, "grad_norm": 4.78125, "learning_rate": 4.3311656051444594e-05, "loss": 1.2178, "step": 6000 }, { "epoch": 1.06, "grad_norm": 5.65625, "learning_rate": 4.328667499647903e-05, "loss": 1.2756, "step": 6005 }, { "epoch": 1.07, "grad_norm": 5.40625, "learning_rate": 4.326168247529567e-05, "loss": 1.3336, "step": 6010 }, { "epoch": 1.07, "grad_norm": 4.21875, "learning_rate": 4.323667850946273e-05, "loss": 1.3161, "step": 6015 }, { "epoch": 1.07, "grad_norm": 5.21875, "learning_rate": 4.321166312055829e-05, "loss": 1.3181, "step": 6020 }, { "epoch": 1.07, "grad_norm": 4.78125, "learning_rate": 4.318663633017026e-05, "loss": 1.2342, "step": 6025 }, { "epoch": 1.07, "grad_norm": 4.65625, "learning_rate": 4.316159815989645e-05, "loss": 1.1907, "step": 6030 }, { "epoch": 1.07, "grad_norm": 3.921875, "learning_rate": 4.313654863134444e-05, "loss": 1.2357, "step": 6035 }, { "epoch": 1.07, "grad_norm": 4.875, "learning_rate": 4.3111487766131624e-05, "loss": 1.1942, "step": 6040 }, { "epoch": 1.07, "grad_norm": 4.84375, "learning_rate": 4.308641558588519e-05, "loss": 1.2796, "step": 6045 }, { "epoch": 1.07, "grad_norm": 4.4375, "learning_rate": 4.3061332112242086e-05, "loss": 1.2781, "step": 6050 }, { "epoch": 1.07, "grad_norm": 4.25, "learning_rate": 4.3036237366849e-05, "loss": 1.2587, "step": 6055 }, { "epoch": 1.07, "grad_norm": 4.1875, "learning_rate": 4.301113137136236e-05, "loss": 1.198, "step": 6060 }, { "epoch": 1.07, "grad_norm": 3.71875, "learning_rate": 4.298601414744829e-05, "loss": 1.222, "step": 6065 }, { "epoch": 1.08, "grad_norm": 5.28125, "learning_rate": 4.296088571678261e-05, "loss": 1.2194, "step": 6070 }, { "epoch": 1.08, "grad_norm": 4.34375, "learning_rate": 4.293574610105081e-05, "loss": 1.2135, "step": 6075 }, { "epoch": 1.08, "grad_norm": 4.65625, "learning_rate": 4.291059532194803e-05, "loss": 1.2257, "step": 6080 }, { "epoch": 1.08, "grad_norm": 4.25, "learning_rate": 4.2885433401179054e-05, "loss": 1.2826, "step": 6085 }, { "epoch": 1.08, "grad_norm": 4.15625, "learning_rate": 4.2860260360458256e-05, "loss": 1.1905, "step": 6090 }, { "epoch": 1.08, "grad_norm": 5.25, "learning_rate": 4.2835076221509646e-05, "loss": 1.2797, "step": 6095 }, { "epoch": 1.08, "grad_norm": 4.28125, "learning_rate": 4.2809881006066775e-05, "loss": 1.2085, "step": 6100 }, { "epoch": 1.08, "grad_norm": 4.4375, "learning_rate": 4.278467473587278e-05, "loss": 1.2315, "step": 6105 }, { "epoch": 1.08, "grad_norm": 4.375, "learning_rate": 4.275945743268031e-05, "loss": 1.2416, "step": 6110 }, { "epoch": 1.08, "grad_norm": 4.03125, "learning_rate": 4.2734229118251553e-05, "loss": 1.1506, "step": 6115 }, { "epoch": 1.08, "grad_norm": 4.65625, "learning_rate": 4.270898981435821e-05, "loss": 1.1416, "step": 6120 }, { "epoch": 1.09, "grad_norm": 4.34375, "learning_rate": 4.268373954278145e-05, "loss": 1.2071, "step": 6125 }, { "epoch": 1.09, "grad_norm": 4.59375, "learning_rate": 4.2658478325311905e-05, "loss": 1.1974, "step": 6130 }, { "epoch": 1.09, "grad_norm": 4.21875, "learning_rate": 4.263320618374964e-05, "loss": 1.2206, "step": 6135 }, { "epoch": 1.09, "grad_norm": 4.1875, "learning_rate": 4.260792313990421e-05, "loss": 1.213, "step": 6140 }, { "epoch": 1.09, "grad_norm": 4.46875, "learning_rate": 4.2582629215594504e-05, "loss": 1.2882, "step": 6145 }, { "epoch": 1.09, "grad_norm": 4.96875, "learning_rate": 4.255732443264882e-05, "loss": 1.2809, "step": 6150 }, { "epoch": 1.09, "grad_norm": 4.25, "learning_rate": 4.2532008812904866e-05, "loss": 1.3132, "step": 6155 }, { "epoch": 1.09, "grad_norm": 4.625, "learning_rate": 4.250668237820965e-05, "loss": 1.1401, "step": 6160 }, { "epoch": 1.09, "grad_norm": 4.5, "learning_rate": 4.2481345150419566e-05, "loss": 1.265, "step": 6165 }, { "epoch": 1.09, "grad_norm": 4.71875, "learning_rate": 4.245599715140027e-05, "loss": 1.3294, "step": 6170 }, { "epoch": 1.09, "grad_norm": 4.46875, "learning_rate": 4.243063840302673e-05, "loss": 1.2237, "step": 6175 }, { "epoch": 1.1, "grad_norm": 4.5, "learning_rate": 4.2405268927183214e-05, "loss": 1.2142, "step": 6180 }, { "epoch": 1.1, "grad_norm": 4.0625, "learning_rate": 4.2379888745763224e-05, "loss": 1.2268, "step": 6185 }, { "epoch": 1.1, "grad_norm": 4.15625, "learning_rate": 4.2354497880669514e-05, "loss": 1.2154, "step": 6190 }, { "epoch": 1.1, "grad_norm": 4.1875, "learning_rate": 4.232909635381404e-05, "loss": 1.185, "step": 6195 }, { "epoch": 1.1, "grad_norm": 4.46875, "learning_rate": 4.230368418711798e-05, "loss": 1.3333, "step": 6200 }, { "epoch": 1.1, "grad_norm": 4.40625, "learning_rate": 4.227826140251169e-05, "loss": 1.2565, "step": 6205 }, { "epoch": 1.1, "grad_norm": 5.28125, "learning_rate": 4.225282802193466e-05, "loss": 1.2973, "step": 6210 }, { "epoch": 1.1, "grad_norm": 4.65625, "learning_rate": 4.2227384067335566e-05, "loss": 1.2033, "step": 6215 }, { "epoch": 1.1, "grad_norm": 5.0625, "learning_rate": 4.220192956067219e-05, "loss": 1.3041, "step": 6220 }, { "epoch": 1.1, "grad_norm": 4.15625, "learning_rate": 4.2176464523911406e-05, "loss": 1.2824, "step": 6225 }, { "epoch": 1.1, "grad_norm": 4.34375, "learning_rate": 4.215098897902922e-05, "loss": 1.1925, "step": 6230 }, { "epoch": 1.1, "grad_norm": 4.84375, "learning_rate": 4.2125502948010646e-05, "loss": 1.2424, "step": 6235 }, { "epoch": 1.11, "grad_norm": 4.0, "learning_rate": 4.2100006452849804e-05, "loss": 1.1535, "step": 6240 }, { "epoch": 1.11, "grad_norm": 4.4375, "learning_rate": 4.207449951554979e-05, "loss": 1.2146, "step": 6245 }, { "epoch": 1.11, "grad_norm": 4.46875, "learning_rate": 4.204898215812277e-05, "loss": 1.2141, "step": 6250 }, { "epoch": 1.11, "grad_norm": 4.3125, "learning_rate": 4.2023454402589856e-05, "loss": 1.2369, "step": 6255 }, { "epoch": 1.11, "grad_norm": 4.84375, "learning_rate": 4.1997916270981144e-05, "loss": 1.2443, "step": 6260 }, { "epoch": 1.11, "grad_norm": 4.1875, "learning_rate": 4.197236778533571e-05, "loss": 1.1905, "step": 6265 }, { "epoch": 1.11, "grad_norm": 5.8125, "learning_rate": 4.194680896770154e-05, "loss": 1.2564, "step": 6270 }, { "epoch": 1.11, "grad_norm": 4.65625, "learning_rate": 4.1921239840135525e-05, "loss": 1.2885, "step": 6275 }, { "epoch": 1.11, "grad_norm": 4.3125, "learning_rate": 4.189566042470349e-05, "loss": 1.1706, "step": 6280 }, { "epoch": 1.11, "grad_norm": 4.65625, "learning_rate": 4.18700707434801e-05, "loss": 1.3361, "step": 6285 }, { "epoch": 1.11, "grad_norm": 4.3125, "learning_rate": 4.1844470818548924e-05, "loss": 1.2021, "step": 6290 }, { "epoch": 1.12, "grad_norm": 4.53125, "learning_rate": 4.181886067200233e-05, "loss": 1.2032, "step": 6295 }, { "epoch": 1.12, "grad_norm": 4.3125, "learning_rate": 4.1793240325941505e-05, "loss": 1.2259, "step": 6300 }, { "epoch": 1.12, "grad_norm": 4.625, "learning_rate": 4.176760980247649e-05, "loss": 1.215, "step": 6305 }, { "epoch": 1.12, "grad_norm": 4.25, "learning_rate": 4.174196912372605e-05, "loss": 1.2053, "step": 6310 }, { "epoch": 1.12, "grad_norm": 4.5625, "learning_rate": 4.1716318311817746e-05, "loss": 1.195, "step": 6315 }, { "epoch": 1.12, "grad_norm": 4.53125, "learning_rate": 4.169065738888787e-05, "loss": 1.2607, "step": 6320 }, { "epoch": 1.12, "grad_norm": 5.0625, "learning_rate": 4.166498637708145e-05, "loss": 1.2782, "step": 6325 }, { "epoch": 1.12, "grad_norm": 5.59375, "learning_rate": 4.163930529855223e-05, "loss": 1.231, "step": 6330 }, { "epoch": 1.12, "grad_norm": 4.84375, "learning_rate": 4.1613614175462606e-05, "loss": 1.2344, "step": 6335 }, { "epoch": 1.12, "grad_norm": 4.53125, "learning_rate": 4.158791302998367e-05, "loss": 1.2016, "step": 6340 }, { "epoch": 1.12, "grad_norm": 4.125, "learning_rate": 4.156220188429517e-05, "loss": 1.2371, "step": 6345 }, { "epoch": 1.13, "grad_norm": 4.46875, "learning_rate": 4.153648076058545e-05, "loss": 1.2503, "step": 6350 }, { "epoch": 1.13, "grad_norm": 4.46875, "learning_rate": 4.1510749681051504e-05, "loss": 1.2649, "step": 6355 }, { "epoch": 1.13, "grad_norm": 4.9375, "learning_rate": 4.148500866789888e-05, "loss": 1.2615, "step": 6360 }, { "epoch": 1.13, "grad_norm": 4.03125, "learning_rate": 4.145925774334174e-05, "loss": 1.2473, "step": 6365 }, { "epoch": 1.13, "grad_norm": 4.5625, "learning_rate": 4.143349692960277e-05, "loss": 1.2527, "step": 6370 }, { "epoch": 1.13, "grad_norm": 4.15625, "learning_rate": 4.140772624891319e-05, "loss": 1.2399, "step": 6375 }, { "epoch": 1.13, "grad_norm": 4.40625, "learning_rate": 4.1381945723512756e-05, "loss": 1.2365, "step": 6380 }, { "epoch": 1.13, "grad_norm": 5.0625, "learning_rate": 4.135615537564969e-05, "loss": 1.3182, "step": 6385 }, { "epoch": 1.13, "grad_norm": 4.34375, "learning_rate": 4.133035522758073e-05, "loss": 1.2551, "step": 6390 }, { "epoch": 1.13, "grad_norm": 4.84375, "learning_rate": 4.1304545301571024e-05, "loss": 1.2442, "step": 6395 }, { "epoch": 1.13, "grad_norm": 4.1875, "learning_rate": 4.12787256198942e-05, "loss": 1.2467, "step": 6400 }, { "epoch": 1.14, "grad_norm": 4.6875, "learning_rate": 4.1252896204832294e-05, "loss": 1.2771, "step": 6405 }, { "epoch": 1.14, "grad_norm": 5.09375, "learning_rate": 4.122705707867571e-05, "loss": 1.2577, "step": 6410 }, { "epoch": 1.14, "grad_norm": 5.09375, "learning_rate": 4.120120826372329e-05, "loss": 1.2464, "step": 6415 }, { "epoch": 1.14, "grad_norm": 4.375, "learning_rate": 4.1175349782282184e-05, "loss": 1.204, "step": 6420 }, { "epoch": 1.14, "grad_norm": 4.375, "learning_rate": 4.114948165666791e-05, "loss": 1.2572, "step": 6425 }, { "epoch": 1.14, "grad_norm": 4.1875, "learning_rate": 4.112360390920431e-05, "loss": 1.2345, "step": 6430 }, { "epoch": 1.14, "grad_norm": 4.34375, "learning_rate": 4.1097716562223514e-05, "loss": 1.2233, "step": 6435 }, { "epoch": 1.14, "grad_norm": 4.4375, "learning_rate": 4.107181963806597e-05, "loss": 1.2604, "step": 6440 }, { "epoch": 1.14, "grad_norm": 4.25, "learning_rate": 4.104591315908033e-05, "loss": 1.1759, "step": 6445 }, { "epoch": 1.14, "grad_norm": 4.75, "learning_rate": 4.101999714762356e-05, "loss": 1.1818, "step": 6450 }, { "epoch": 1.14, "grad_norm": 5.15625, "learning_rate": 4.09940716260608e-05, "loss": 1.1738, "step": 6455 }, { "epoch": 1.14, "grad_norm": 4.625, "learning_rate": 4.096813661676545e-05, "loss": 1.2228, "step": 6460 }, { "epoch": 1.15, "grad_norm": 4.25, "learning_rate": 4.0942192142119024e-05, "loss": 1.2425, "step": 6465 }, { "epoch": 1.15, "grad_norm": 5.125, "learning_rate": 4.091623822451129e-05, "loss": 1.2344, "step": 6470 }, { "epoch": 1.15, "grad_norm": 5.0625, "learning_rate": 4.089027488634009e-05, "loss": 1.1715, "step": 6475 }, { "epoch": 1.15, "grad_norm": 4.03125, "learning_rate": 4.086430215001144e-05, "loss": 1.3268, "step": 6480 }, { "epoch": 1.15, "grad_norm": 4.4375, "learning_rate": 4.083832003793945e-05, "loss": 1.2293, "step": 6485 }, { "epoch": 1.15, "grad_norm": 5.3125, "learning_rate": 4.0812328572546336e-05, "loss": 1.2874, "step": 6490 }, { "epoch": 1.15, "grad_norm": 4.71875, "learning_rate": 4.0786327776262355e-05, "loss": 1.2482, "step": 6495 }, { "epoch": 1.15, "grad_norm": 4.375, "learning_rate": 4.076031767152586e-05, "loss": 1.3249, "step": 6500 }, { "epoch": 1.15, "grad_norm": 3.984375, "learning_rate": 4.073429828078319e-05, "loss": 1.3375, "step": 6505 }, { "epoch": 1.15, "grad_norm": 4.75, "learning_rate": 4.070826962648875e-05, "loss": 1.2449, "step": 6510 }, { "epoch": 1.15, "grad_norm": 4.625, "learning_rate": 4.068223173110489e-05, "loss": 1.2831, "step": 6515 }, { "epoch": 1.16, "grad_norm": 4.96875, "learning_rate": 4.065618461710196e-05, "loss": 1.2977, "step": 6520 }, { "epoch": 1.16, "grad_norm": 4.1875, "learning_rate": 4.0630128306958254e-05, "loss": 1.2328, "step": 6525 }, { "epoch": 1.16, "grad_norm": 4.21875, "learning_rate": 4.0604062823160024e-05, "loss": 1.2707, "step": 6530 }, { "epoch": 1.16, "grad_norm": 4.4375, "learning_rate": 4.0577988188201414e-05, "loss": 1.279, "step": 6535 }, { "epoch": 1.16, "grad_norm": 4.375, "learning_rate": 4.0551904424584485e-05, "loss": 1.2234, "step": 6540 }, { "epoch": 1.16, "grad_norm": 4.46875, "learning_rate": 4.0525811554819145e-05, "loss": 1.1966, "step": 6545 }, { "epoch": 1.16, "grad_norm": 5.28125, "learning_rate": 4.049970960142321e-05, "loss": 1.1876, "step": 6550 }, { "epoch": 1.16, "grad_norm": 5.1875, "learning_rate": 4.047359858692229e-05, "loss": 1.2647, "step": 6555 }, { "epoch": 1.16, "grad_norm": 4.3125, "learning_rate": 4.044747853384983e-05, "loss": 1.2892, "step": 6560 }, { "epoch": 1.16, "grad_norm": 4.3125, "learning_rate": 4.042134946474708e-05, "loss": 1.1884, "step": 6565 }, { "epoch": 1.16, "grad_norm": 4.625, "learning_rate": 4.039521140216308e-05, "loss": 1.2284, "step": 6570 }, { "epoch": 1.17, "grad_norm": 4.6875, "learning_rate": 4.036906436865459e-05, "loss": 1.1783, "step": 6575 }, { "epoch": 1.17, "grad_norm": 4.375, "learning_rate": 4.0342908386786175e-05, "loss": 1.2079, "step": 6580 }, { "epoch": 1.17, "grad_norm": 4.75, "learning_rate": 4.031674347913005e-05, "loss": 1.2053, "step": 6585 }, { "epoch": 1.17, "grad_norm": 4.5625, "learning_rate": 4.02905696682662e-05, "loss": 1.1107, "step": 6590 }, { "epoch": 1.17, "grad_norm": 5.03125, "learning_rate": 4.0264386976782256e-05, "loss": 1.2973, "step": 6595 }, { "epoch": 1.17, "grad_norm": 4.71875, "learning_rate": 4.0238195427273515e-05, "loss": 1.2198, "step": 6600 }, { "epoch": 1.17, "grad_norm": 4.875, "learning_rate": 4.021199504234294e-05, "loss": 1.2684, "step": 6605 }, { "epoch": 1.17, "grad_norm": 4.34375, "learning_rate": 4.018578584460107e-05, "loss": 1.2408, "step": 6610 }, { "epoch": 1.17, "grad_norm": 4.96875, "learning_rate": 4.015956785666611e-05, "loss": 1.2367, "step": 6615 }, { "epoch": 1.17, "grad_norm": 3.953125, "learning_rate": 4.0133341101163824e-05, "loss": 1.1958, "step": 6620 }, { "epoch": 1.17, "grad_norm": 5.0, "learning_rate": 4.010710560072753e-05, "loss": 1.2547, "step": 6625 }, { "epoch": 1.17, "grad_norm": 4.46875, "learning_rate": 4.00808613779981e-05, "loss": 1.2206, "step": 6630 }, { "epoch": 1.18, "grad_norm": 4.9375, "learning_rate": 4.005460845562395e-05, "loss": 1.267, "step": 6635 }, { "epoch": 1.18, "grad_norm": 4.3125, "learning_rate": 4.002834685626098e-05, "loss": 1.2417, "step": 6640 }, { "epoch": 1.18, "grad_norm": 4.4375, "learning_rate": 4.000207660257261e-05, "loss": 1.2821, "step": 6645 }, { "epoch": 1.18, "grad_norm": 4.6875, "learning_rate": 3.997579771722967e-05, "loss": 1.2522, "step": 6650 }, { "epoch": 1.18, "grad_norm": 4.28125, "learning_rate": 3.994951022291051e-05, "loss": 1.1806, "step": 6655 }, { "epoch": 1.18, "grad_norm": 5.46875, "learning_rate": 3.992321414230086e-05, "loss": 1.1642, "step": 6660 }, { "epoch": 1.18, "grad_norm": 4.875, "learning_rate": 3.9896909498093895e-05, "loss": 1.2136, "step": 6665 }, { "epoch": 1.18, "grad_norm": 4.0, "learning_rate": 3.9870596312990126e-05, "loss": 1.1851, "step": 6670 }, { "epoch": 1.18, "grad_norm": 4.25, "learning_rate": 3.9844274609697496e-05, "loss": 1.1859, "step": 6675 }, { "epoch": 1.18, "grad_norm": 4.25, "learning_rate": 3.9817944410931276e-05, "loss": 1.2189, "step": 6680 }, { "epoch": 1.18, "grad_norm": 4.8125, "learning_rate": 3.979160573941406e-05, "loss": 1.2484, "step": 6685 }, { "epoch": 1.19, "grad_norm": 5.125, "learning_rate": 3.976525861787574e-05, "loss": 1.2842, "step": 6690 }, { "epoch": 1.19, "grad_norm": 5.15625, "learning_rate": 3.973890306905355e-05, "loss": 1.148, "step": 6695 }, { "epoch": 1.19, "grad_norm": 5.5, "learning_rate": 3.9712539115691944e-05, "loss": 1.2832, "step": 6700 }, { "epoch": 1.19, "grad_norm": 4.3125, "learning_rate": 3.968616678054267e-05, "loss": 1.2673, "step": 6705 }, { "epoch": 1.19, "grad_norm": 4.46875, "learning_rate": 3.965978608636466e-05, "loss": 1.2478, "step": 6710 }, { "epoch": 1.19, "grad_norm": 4.0625, "learning_rate": 3.963339705592413e-05, "loss": 1.2625, "step": 6715 }, { "epoch": 1.19, "grad_norm": 5.03125, "learning_rate": 3.960699971199442e-05, "loss": 1.1947, "step": 6720 }, { "epoch": 1.19, "grad_norm": 4.4375, "learning_rate": 3.9580594077356086e-05, "loss": 1.259, "step": 6725 }, { "epoch": 1.19, "grad_norm": 4.96875, "learning_rate": 3.955418017479682e-05, "loss": 1.2197, "step": 6730 }, { "epoch": 1.19, "grad_norm": 4.1875, "learning_rate": 3.952775802711148e-05, "loss": 1.2944, "step": 6735 }, { "epoch": 1.19, "grad_norm": 4.9375, "learning_rate": 3.9501327657101985e-05, "loss": 1.2542, "step": 6740 }, { "epoch": 1.2, "grad_norm": 5.15625, "learning_rate": 3.947488908757741e-05, "loss": 1.2235, "step": 6745 }, { "epoch": 1.2, "grad_norm": 4.8125, "learning_rate": 3.9448442341353846e-05, "loss": 1.3071, "step": 6750 }, { "epoch": 1.2, "grad_norm": 5.3125, "learning_rate": 3.942198744125449e-05, "loss": 1.1837, "step": 6755 }, { "epoch": 1.2, "grad_norm": 4.375, "learning_rate": 3.939552441010956e-05, "loss": 1.2894, "step": 6760 }, { "epoch": 1.2, "grad_norm": 4.75, "learning_rate": 3.936905327075628e-05, "loss": 1.2089, "step": 6765 }, { "epoch": 1.2, "grad_norm": 4.5625, "learning_rate": 3.934257404603888e-05, "loss": 1.2504, "step": 6770 }, { "epoch": 1.2, "grad_norm": 4.8125, "learning_rate": 3.931608675880857e-05, "loss": 1.2669, "step": 6775 }, { "epoch": 1.2, "grad_norm": 4.09375, "learning_rate": 3.9289591431923505e-05, "loss": 1.2545, "step": 6780 }, { "epoch": 1.2, "grad_norm": 4.78125, "learning_rate": 3.92630880882488e-05, "loss": 1.2244, "step": 6785 }, { "epoch": 1.2, "grad_norm": 4.4375, "learning_rate": 3.9236576750656486e-05, "loss": 1.2, "step": 6790 }, { "epoch": 1.2, "grad_norm": 5.125, "learning_rate": 3.921005744202544e-05, "loss": 1.1967, "step": 6795 }, { "epoch": 1.21, "grad_norm": 4.875, "learning_rate": 3.918353018524151e-05, "loss": 1.2683, "step": 6800 }, { "epoch": 1.21, "grad_norm": 4.25, "learning_rate": 3.915699500319733e-05, "loss": 1.1731, "step": 6805 }, { "epoch": 1.21, "grad_norm": 4.71875, "learning_rate": 3.913045191879239e-05, "loss": 1.1733, "step": 6810 }, { "epoch": 1.21, "grad_norm": 5.03125, "learning_rate": 3.910390095493302e-05, "loss": 1.2795, "step": 6815 }, { "epoch": 1.21, "grad_norm": 4.71875, "learning_rate": 3.907734213453234e-05, "loss": 1.223, "step": 6820 }, { "epoch": 1.21, "grad_norm": 4.4375, "learning_rate": 3.905077548051024e-05, "loss": 1.2375, "step": 6825 }, { "epoch": 1.21, "grad_norm": 4.28125, "learning_rate": 3.902420101579337e-05, "loss": 1.2068, "step": 6830 }, { "epoch": 1.21, "grad_norm": 4.84375, "learning_rate": 3.8997618763315135e-05, "loss": 1.2004, "step": 6835 }, { "epoch": 1.21, "grad_norm": 4.03125, "learning_rate": 3.8971028746015656e-05, "loss": 1.2176, "step": 6840 }, { "epoch": 1.21, "grad_norm": 5.46875, "learning_rate": 3.894443098684175e-05, "loss": 1.21, "step": 6845 }, { "epoch": 1.21, "grad_norm": 4.75, "learning_rate": 3.891782550874691e-05, "loss": 1.1994, "step": 6850 }, { "epoch": 1.21, "grad_norm": 4.46875, "learning_rate": 3.889121233469131e-05, "loss": 1.2446, "step": 6855 }, { "epoch": 1.22, "grad_norm": 4.5, "learning_rate": 3.886459148764174e-05, "loss": 1.1825, "step": 6860 }, { "epoch": 1.22, "grad_norm": 3.96875, "learning_rate": 3.883796299057164e-05, "loss": 1.1753, "step": 6865 }, { "epoch": 1.22, "grad_norm": 4.0, "learning_rate": 3.8811326866461034e-05, "loss": 1.2289, "step": 6870 }, { "epoch": 1.22, "grad_norm": 4.46875, "learning_rate": 3.878468313829651e-05, "loss": 1.2698, "step": 6875 }, { "epoch": 1.22, "grad_norm": 4.34375, "learning_rate": 3.875803182907127e-05, "loss": 1.1907, "step": 6880 }, { "epoch": 1.22, "grad_norm": 4.46875, "learning_rate": 3.8731372961785e-05, "loss": 1.2376, "step": 6885 }, { "epoch": 1.22, "grad_norm": 5.09375, "learning_rate": 3.8704706559443956e-05, "loss": 1.3123, "step": 6890 }, { "epoch": 1.22, "grad_norm": 4.0, "learning_rate": 3.867803264506086e-05, "loss": 1.1943, "step": 6895 }, { "epoch": 1.22, "grad_norm": 4.4375, "learning_rate": 3.865135124165495e-05, "loss": 1.2202, "step": 6900 }, { "epoch": 1.22, "grad_norm": 4.5, "learning_rate": 3.86246623722519e-05, "loss": 1.2567, "step": 6905 }, { "epoch": 1.22, "grad_norm": 5.09375, "learning_rate": 3.859796605988383e-05, "loss": 1.2359, "step": 6910 }, { "epoch": 1.23, "grad_norm": 4.90625, "learning_rate": 3.857126232758931e-05, "loss": 1.2316, "step": 6915 }, { "epoch": 1.23, "grad_norm": 4.34375, "learning_rate": 3.8544551198413276e-05, "loss": 1.2515, "step": 6920 }, { "epoch": 1.23, "grad_norm": 5.34375, "learning_rate": 3.8517832695407084e-05, "loss": 1.2577, "step": 6925 }, { "epoch": 1.23, "grad_norm": 4.1875, "learning_rate": 3.8491106841628414e-05, "loss": 1.2484, "step": 6930 }, { "epoch": 1.23, "grad_norm": 4.46875, "learning_rate": 3.846437366014132e-05, "loss": 1.2581, "step": 6935 }, { "epoch": 1.23, "grad_norm": 4.96875, "learning_rate": 3.8437633174016166e-05, "loss": 1.1939, "step": 6940 }, { "epoch": 1.23, "grad_norm": 5.34375, "learning_rate": 3.841088540632963e-05, "loss": 1.158, "step": 6945 }, { "epoch": 1.23, "grad_norm": 4.9375, "learning_rate": 3.838413038016467e-05, "loss": 1.2439, "step": 6950 }, { "epoch": 1.23, "grad_norm": 4.28125, "learning_rate": 3.83573681186105e-05, "loss": 1.327, "step": 6955 }, { "epoch": 1.23, "grad_norm": 4.3125, "learning_rate": 3.833059864476257e-05, "loss": 1.2533, "step": 6960 }, { "epoch": 1.23, "grad_norm": 5.03125, "learning_rate": 3.8303821981722585e-05, "loss": 1.2454, "step": 6965 }, { "epoch": 1.24, "grad_norm": 4.21875, "learning_rate": 3.8277038152598444e-05, "loss": 1.1471, "step": 6970 }, { "epoch": 1.24, "grad_norm": 3.984375, "learning_rate": 3.82502471805042e-05, "loss": 1.163, "step": 6975 }, { "epoch": 1.24, "grad_norm": 4.4375, "learning_rate": 3.822344908856011e-05, "loss": 1.2945, "step": 6980 }, { "epoch": 1.24, "grad_norm": 4.09375, "learning_rate": 3.819664389989257e-05, "loss": 1.1877, "step": 6985 }, { "epoch": 1.24, "grad_norm": 5.25, "learning_rate": 3.816983163763405e-05, "loss": 1.2883, "step": 6990 }, { "epoch": 1.24, "grad_norm": 5.03125, "learning_rate": 3.8143012324923204e-05, "loss": 1.3264, "step": 6995 }, { "epoch": 1.24, "grad_norm": 5.0, "learning_rate": 3.811618598490471e-05, "loss": 1.2202, "step": 7000 }, { "epoch": 1.24, "grad_norm": 3.890625, "learning_rate": 3.8089352640729335e-05, "loss": 1.1479, "step": 7005 }, { "epoch": 1.24, "grad_norm": 4.34375, "learning_rate": 3.806251231555388e-05, "loss": 1.2507, "step": 7010 }, { "epoch": 1.24, "grad_norm": 4.09375, "learning_rate": 3.8035665032541186e-05, "loss": 1.1932, "step": 7015 }, { "epoch": 1.24, "grad_norm": 4.375, "learning_rate": 3.8008810814860076e-05, "loss": 1.2855, "step": 7020 }, { "epoch": 1.24, "grad_norm": 4.28125, "learning_rate": 3.798194968568539e-05, "loss": 1.2523, "step": 7025 }, { "epoch": 1.25, "grad_norm": 4.9375, "learning_rate": 3.79550816681979e-05, "loss": 1.2133, "step": 7030 }, { "epoch": 1.25, "grad_norm": 4.34375, "learning_rate": 3.792820678558434e-05, "loss": 1.2357, "step": 7035 }, { "epoch": 1.25, "grad_norm": 4.75, "learning_rate": 3.7901325061037354e-05, "loss": 1.2459, "step": 7040 }, { "epoch": 1.25, "grad_norm": 4.34375, "learning_rate": 3.7874436517755506e-05, "loss": 1.242, "step": 7045 }, { "epoch": 1.25, "grad_norm": 4.0625, "learning_rate": 3.784754117894326e-05, "loss": 1.198, "step": 7050 }, { "epoch": 1.25, "grad_norm": 4.84375, "learning_rate": 3.782063906781089e-05, "loss": 1.2639, "step": 7055 }, { "epoch": 1.25, "grad_norm": 4.625, "learning_rate": 3.779373020757456e-05, "loss": 1.2507, "step": 7060 }, { "epoch": 1.25, "grad_norm": 5.03125, "learning_rate": 3.776681462145625e-05, "loss": 1.2248, "step": 7065 }, { "epoch": 1.25, "grad_norm": 4.625, "learning_rate": 3.773989233268375e-05, "loss": 1.2225, "step": 7070 }, { "epoch": 1.25, "grad_norm": 4.375, "learning_rate": 3.771296336449061e-05, "loss": 1.1804, "step": 7075 }, { "epoch": 1.25, "grad_norm": 4.15625, "learning_rate": 3.7686027740116165e-05, "loss": 1.2351, "step": 7080 }, { "epoch": 1.26, "grad_norm": 4.5625, "learning_rate": 3.765908548280548e-05, "loss": 1.214, "step": 7085 }, { "epoch": 1.26, "grad_norm": 4.375, "learning_rate": 3.763213661580937e-05, "loss": 1.2337, "step": 7090 }, { "epoch": 1.26, "grad_norm": 4.0, "learning_rate": 3.7605181162384326e-05, "loss": 1.2682, "step": 7095 }, { "epoch": 1.26, "grad_norm": 5.4375, "learning_rate": 3.757821914579252e-05, "loss": 1.2228, "step": 7100 }, { "epoch": 1.26, "grad_norm": 4.40625, "learning_rate": 3.755125058930183e-05, "loss": 1.2619, "step": 7105 }, { "epoch": 1.26, "grad_norm": 4.5, "learning_rate": 3.752427551618573e-05, "loss": 1.1934, "step": 7110 }, { "epoch": 1.26, "grad_norm": 4.0625, "learning_rate": 3.7497293949723344e-05, "loss": 1.2883, "step": 7115 }, { "epoch": 1.26, "grad_norm": 5.46875, "learning_rate": 3.7470305913199386e-05, "loss": 1.285, "step": 7120 }, { "epoch": 1.26, "grad_norm": 4.53125, "learning_rate": 3.7443311429904164e-05, "loss": 1.2227, "step": 7125 }, { "epoch": 1.26, "grad_norm": 4.46875, "learning_rate": 3.7416310523133556e-05, "loss": 1.1527, "step": 7130 }, { "epoch": 1.26, "grad_norm": 4.375, "learning_rate": 3.7389303216188964e-05, "loss": 1.3002, "step": 7135 }, { "epoch": 1.27, "grad_norm": 4.40625, "learning_rate": 3.736228953237731e-05, "loss": 1.1666, "step": 7140 }, { "epoch": 1.27, "grad_norm": 4.5, "learning_rate": 3.733526949501105e-05, "loss": 1.2881, "step": 7145 }, { "epoch": 1.27, "grad_norm": 4.3125, "learning_rate": 3.730824312740811e-05, "loss": 1.2361, "step": 7150 }, { "epoch": 1.27, "grad_norm": 4.3125, "learning_rate": 3.7281210452891846e-05, "loss": 1.2538, "step": 7155 }, { "epoch": 1.27, "grad_norm": 3.8125, "learning_rate": 3.725417149479111e-05, "loss": 1.2228, "step": 7160 }, { "epoch": 1.27, "grad_norm": 5.25, "learning_rate": 3.722712627644014e-05, "loss": 1.2783, "step": 7165 }, { "epoch": 1.27, "grad_norm": 5.03125, "learning_rate": 3.7200074821178587e-05, "loss": 1.2249, "step": 7170 }, { "epoch": 1.27, "grad_norm": 3.40625, "learning_rate": 3.717301715235149e-05, "loss": 1.2403, "step": 7175 }, { "epoch": 1.27, "grad_norm": 4.6875, "learning_rate": 3.714595329330924e-05, "loss": 1.2009, "step": 7180 }, { "epoch": 1.27, "grad_norm": 4.4375, "learning_rate": 3.711888326740757e-05, "loss": 1.2153, "step": 7185 }, { "epoch": 1.27, "grad_norm": 4.75, "learning_rate": 3.709180709800756e-05, "loss": 1.2137, "step": 7190 }, { "epoch": 1.28, "grad_norm": 5.25, "learning_rate": 3.706472480847556e-05, "loss": 1.3122, "step": 7195 }, { "epoch": 1.28, "grad_norm": 5.0625, "learning_rate": 3.703763642218321e-05, "loss": 1.2654, "step": 7200 }, { "epoch": 1.28, "grad_norm": 4.46875, "learning_rate": 3.701054196250741e-05, "loss": 1.1606, "step": 7205 }, { "epoch": 1.28, "grad_norm": 4.40625, "learning_rate": 3.6983441452830306e-05, "loss": 1.3224, "step": 7210 }, { "epoch": 1.28, "grad_norm": 5.40625, "learning_rate": 3.695633491653929e-05, "loss": 1.229, "step": 7215 }, { "epoch": 1.28, "grad_norm": 4.25, "learning_rate": 3.6929222377026907e-05, "loss": 1.1734, "step": 7220 }, { "epoch": 1.28, "grad_norm": 5.0625, "learning_rate": 3.69021038576909e-05, "loss": 1.3508, "step": 7225 }, { "epoch": 1.28, "grad_norm": 4.25, "learning_rate": 3.687497938193421e-05, "loss": 1.1976, "step": 7230 }, { "epoch": 1.28, "grad_norm": 3.921875, "learning_rate": 3.684784897316485e-05, "loss": 1.2657, "step": 7235 }, { "epoch": 1.28, "grad_norm": 5.125, "learning_rate": 3.682071265479602e-05, "loss": 1.2413, "step": 7240 }, { "epoch": 1.28, "grad_norm": 4.71875, "learning_rate": 3.679357045024596e-05, "loss": 1.2066, "step": 7245 }, { "epoch": 1.28, "grad_norm": 4.28125, "learning_rate": 3.676642238293804e-05, "loss": 1.2192, "step": 7250 }, { "epoch": 1.29, "grad_norm": 4.375, "learning_rate": 3.673926847630067e-05, "loss": 1.2644, "step": 7255 }, { "epoch": 1.29, "grad_norm": 3.78125, "learning_rate": 3.671210875376729e-05, "loss": 1.2022, "step": 7260 }, { "epoch": 1.29, "grad_norm": 4.34375, "learning_rate": 3.668494323877635e-05, "loss": 1.3457, "step": 7265 }, { "epoch": 1.29, "grad_norm": 4.125, "learning_rate": 3.665777195477135e-05, "loss": 1.2585, "step": 7270 }, { "epoch": 1.29, "grad_norm": 4.8125, "learning_rate": 3.663059492520072e-05, "loss": 1.2622, "step": 7275 }, { "epoch": 1.29, "grad_norm": 4.4375, "learning_rate": 3.660341217351785e-05, "loss": 1.2034, "step": 7280 }, { "epoch": 1.29, "grad_norm": 4.53125, "learning_rate": 3.6576223723181104e-05, "loss": 1.1774, "step": 7285 }, { "epoch": 1.29, "grad_norm": 4.25, "learning_rate": 3.654902959765372e-05, "loss": 1.2666, "step": 7290 }, { "epoch": 1.29, "grad_norm": 4.34375, "learning_rate": 3.6521829820403864e-05, "loss": 1.2841, "step": 7295 }, { "epoch": 1.29, "grad_norm": 3.75, "learning_rate": 3.6494624414904575e-05, "loss": 1.2286, "step": 7300 }, { "epoch": 1.29, "grad_norm": 4.15625, "learning_rate": 3.646741340463373e-05, "loss": 1.2205, "step": 7305 }, { "epoch": 1.3, "grad_norm": 4.28125, "learning_rate": 3.644019681307406e-05, "loss": 1.2346, "step": 7310 }, { "epoch": 1.3, "grad_norm": 4.125, "learning_rate": 3.641297466371313e-05, "loss": 1.1978, "step": 7315 }, { "epoch": 1.3, "grad_norm": 4.28125, "learning_rate": 3.638574698004325e-05, "loss": 1.2718, "step": 7320 }, { "epoch": 1.3, "grad_norm": 3.984375, "learning_rate": 3.6358513785561554e-05, "loss": 1.1844, "step": 7325 }, { "epoch": 1.3, "grad_norm": 4.34375, "learning_rate": 3.633127510376991e-05, "loss": 1.2515, "step": 7330 }, { "epoch": 1.3, "grad_norm": 4.46875, "learning_rate": 3.630403095817493e-05, "loss": 1.2636, "step": 7335 }, { "epoch": 1.3, "grad_norm": 4.1875, "learning_rate": 3.6276781372287934e-05, "loss": 1.2779, "step": 7340 }, { "epoch": 1.3, "grad_norm": 4.75, "learning_rate": 3.6249526369624934e-05, "loss": 1.2245, "step": 7345 }, { "epoch": 1.3, "grad_norm": 4.90625, "learning_rate": 3.622226597370663e-05, "loss": 1.2662, "step": 7350 }, { "epoch": 1.3, "grad_norm": 4.65625, "learning_rate": 3.619500020805837e-05, "loss": 1.2206, "step": 7355 }, { "epoch": 1.3, "grad_norm": 4.5625, "learning_rate": 3.616772909621012e-05, "loss": 1.3105, "step": 7360 }, { "epoch": 1.31, "grad_norm": 4.5, "learning_rate": 3.61404526616965e-05, "loss": 1.2933, "step": 7365 }, { "epoch": 1.31, "grad_norm": 4.78125, "learning_rate": 3.6113170928056666e-05, "loss": 1.2003, "step": 7370 }, { "epoch": 1.31, "grad_norm": 4.625, "learning_rate": 3.608588391883441e-05, "loss": 1.2041, "step": 7375 }, { "epoch": 1.31, "grad_norm": 4.9375, "learning_rate": 3.605859165757803e-05, "loss": 1.1733, "step": 7380 }, { "epoch": 1.31, "grad_norm": 4.46875, "learning_rate": 3.603129416784038e-05, "loss": 1.2631, "step": 7385 }, { "epoch": 1.31, "grad_norm": 4.78125, "learning_rate": 3.60039914731788e-05, "loss": 1.1757, "step": 7390 }, { "epoch": 1.31, "grad_norm": 5.1875, "learning_rate": 3.5976683597155156e-05, "loss": 1.2808, "step": 7395 }, { "epoch": 1.31, "grad_norm": 4.5625, "learning_rate": 3.594937056333577e-05, "loss": 1.2256, "step": 7400 }, { "epoch": 1.31, "grad_norm": 4.125, "learning_rate": 3.5922052395291406e-05, "loss": 1.2752, "step": 7405 }, { "epoch": 1.31, "grad_norm": 4.5, "learning_rate": 3.5894729116597277e-05, "loss": 1.3149, "step": 7410 }, { "epoch": 1.31, "grad_norm": 4.5, "learning_rate": 3.586740075083298e-05, "loss": 1.2811, "step": 7415 }, { "epoch": 1.31, "grad_norm": 4.71875, "learning_rate": 3.584006732158255e-05, "loss": 1.229, "step": 7420 }, { "epoch": 1.32, "grad_norm": 4.78125, "learning_rate": 3.581272885243433e-05, "loss": 1.2844, "step": 7425 }, { "epoch": 1.32, "grad_norm": 4.40625, "learning_rate": 3.578538536698104e-05, "loss": 1.1966, "step": 7430 }, { "epoch": 1.32, "grad_norm": 4.40625, "learning_rate": 3.575803688881976e-05, "loss": 1.2922, "step": 7435 }, { "epoch": 1.32, "grad_norm": 4.84375, "learning_rate": 3.573068344155184e-05, "loss": 1.2633, "step": 7440 }, { "epoch": 1.32, "grad_norm": 4.75, "learning_rate": 3.570332504878292e-05, "loss": 1.2893, "step": 7445 }, { "epoch": 1.32, "grad_norm": 4.8125, "learning_rate": 3.567596173412292e-05, "loss": 1.2356, "step": 7450 }, { "epoch": 1.32, "grad_norm": 4.71875, "learning_rate": 3.564859352118601e-05, "loss": 1.1968, "step": 7455 }, { "epoch": 1.32, "grad_norm": 4.8125, "learning_rate": 3.562122043359057e-05, "loss": 1.2618, "step": 7460 }, { "epoch": 1.32, "grad_norm": 4.40625, "learning_rate": 3.5593842494959216e-05, "loss": 1.2348, "step": 7465 }, { "epoch": 1.32, "grad_norm": 4.1875, "learning_rate": 3.556645972891871e-05, "loss": 1.2285, "step": 7470 }, { "epoch": 1.32, "grad_norm": 5.6875, "learning_rate": 3.553907215910001e-05, "loss": 1.2448, "step": 7475 }, { "epoch": 1.33, "grad_norm": 4.375, "learning_rate": 3.5511679809138235e-05, "loss": 1.1997, "step": 7480 }, { "epoch": 1.33, "grad_norm": 5.21875, "learning_rate": 3.548428270267257e-05, "loss": 1.2353, "step": 7485 }, { "epoch": 1.33, "grad_norm": 4.46875, "learning_rate": 3.5456880863346376e-05, "loss": 1.218, "step": 7490 }, { "epoch": 1.33, "grad_norm": 3.921875, "learning_rate": 3.542947431480703e-05, "loss": 1.1787, "step": 7495 }, { "epoch": 1.33, "grad_norm": 5.21875, "learning_rate": 3.540206308070603e-05, "loss": 1.2362, "step": 7500 }, { "epoch": 1.33, "grad_norm": 4.34375, "learning_rate": 3.5374647184698896e-05, "loss": 1.239, "step": 7505 }, { "epoch": 1.33, "grad_norm": 3.875, "learning_rate": 3.534722665044514e-05, "loss": 1.1829, "step": 7510 }, { "epoch": 1.33, "grad_norm": 5.0625, "learning_rate": 3.531980150160834e-05, "loss": 1.2116, "step": 7515 }, { "epoch": 1.33, "grad_norm": 5.0, "learning_rate": 3.529237176185601e-05, "loss": 1.2746, "step": 7520 }, { "epoch": 1.33, "grad_norm": 4.71875, "learning_rate": 3.526493745485962e-05, "loss": 1.2576, "step": 7525 }, { "epoch": 1.33, "grad_norm": 4.1875, "learning_rate": 3.523749860429463e-05, "loss": 1.2221, "step": 7530 }, { "epoch": 1.34, "grad_norm": 4.0625, "learning_rate": 3.521005523384037e-05, "loss": 1.1911, "step": 7535 }, { "epoch": 1.34, "grad_norm": 4.78125, "learning_rate": 3.51826073671801e-05, "loss": 1.2157, "step": 7540 }, { "epoch": 1.34, "grad_norm": 4.1875, "learning_rate": 3.515515502800095e-05, "loss": 1.2778, "step": 7545 }, { "epoch": 1.34, "grad_norm": 4.4375, "learning_rate": 3.512769823999392e-05, "loss": 1.2832, "step": 7550 }, { "epoch": 1.34, "grad_norm": 4.6875, "learning_rate": 3.510023702685381e-05, "loss": 1.2897, "step": 7555 }, { "epoch": 1.34, "grad_norm": 4.375, "learning_rate": 3.507277141227931e-05, "loss": 1.2216, "step": 7560 }, { "epoch": 1.34, "grad_norm": 4.625, "learning_rate": 3.504530141997283e-05, "loss": 1.2385, "step": 7565 }, { "epoch": 1.34, "grad_norm": 4.9375, "learning_rate": 3.501782707364063e-05, "loss": 1.2352, "step": 7570 }, { "epoch": 1.34, "grad_norm": 4.625, "learning_rate": 3.499034839699267e-05, "loss": 1.1935, "step": 7575 }, { "epoch": 1.34, "grad_norm": 4.3125, "learning_rate": 3.496286541374267e-05, "loss": 1.2684, "step": 7580 }, { "epoch": 1.34, "grad_norm": 3.796875, "learning_rate": 3.493537814760808e-05, "loss": 1.2503, "step": 7585 }, { "epoch": 1.34, "grad_norm": 5.15625, "learning_rate": 3.490788662231003e-05, "loss": 1.2168, "step": 7590 }, { "epoch": 1.35, "grad_norm": 4.75, "learning_rate": 3.48803908615733e-05, "loss": 1.2524, "step": 7595 }, { "epoch": 1.35, "grad_norm": 3.921875, "learning_rate": 3.4852890889126404e-05, "loss": 1.2563, "step": 7600 }, { "epoch": 1.35, "grad_norm": 5.09375, "learning_rate": 3.482538672870141e-05, "loss": 1.2519, "step": 7605 }, { "epoch": 1.35, "grad_norm": 4.0, "learning_rate": 3.4797878404034025e-05, "loss": 1.2164, "step": 7610 }, { "epoch": 1.35, "grad_norm": 4.9375, "learning_rate": 3.477036593886357e-05, "loss": 1.3107, "step": 7615 }, { "epoch": 1.35, "grad_norm": 4.28125, "learning_rate": 3.474284935693291e-05, "loss": 1.2211, "step": 7620 }, { "epoch": 1.35, "grad_norm": 4.75, "learning_rate": 3.471532868198848e-05, "loss": 1.2178, "step": 7625 }, { "epoch": 1.35, "grad_norm": 4.34375, "learning_rate": 3.468780393778027e-05, "loss": 1.1606, "step": 7630 }, { "epoch": 1.35, "grad_norm": 4.71875, "learning_rate": 3.466027514806171e-05, "loss": 1.28, "step": 7635 }, { "epoch": 1.35, "grad_norm": 4.375, "learning_rate": 3.46327423365898e-05, "loss": 1.3125, "step": 7640 }, { "epoch": 1.35, "grad_norm": 3.984375, "learning_rate": 3.460520552712497e-05, "loss": 1.2196, "step": 7645 }, { "epoch": 1.36, "grad_norm": 5.40625, "learning_rate": 3.4577664743431095e-05, "loss": 1.2336, "step": 7650 }, { "epoch": 1.36, "grad_norm": 4.34375, "learning_rate": 3.455012000927551e-05, "loss": 1.1643, "step": 7655 }, { "epoch": 1.36, "grad_norm": 4.71875, "learning_rate": 3.452257134842894e-05, "loss": 1.2263, "step": 7660 }, { "epoch": 1.36, "grad_norm": 3.9375, "learning_rate": 3.4495018784665495e-05, "loss": 1.2134, "step": 7665 }, { "epoch": 1.36, "grad_norm": 4.34375, "learning_rate": 3.446746234176266e-05, "loss": 1.2095, "step": 7670 }, { "epoch": 1.36, "grad_norm": 4.375, "learning_rate": 3.443990204350127e-05, "loss": 1.2502, "step": 7675 }, { "epoch": 1.36, "grad_norm": 3.78125, "learning_rate": 3.441233791366546e-05, "loss": 1.1875, "step": 7680 }, { "epoch": 1.36, "grad_norm": 4.71875, "learning_rate": 3.438476997604274e-05, "loss": 1.2597, "step": 7685 }, { "epoch": 1.36, "grad_norm": 4.8125, "learning_rate": 3.435719825442381e-05, "loss": 1.2673, "step": 7690 }, { "epoch": 1.36, "grad_norm": 4.46875, "learning_rate": 3.432962277260273e-05, "loss": 1.2411, "step": 7695 }, { "epoch": 1.36, "grad_norm": 4.40625, "learning_rate": 3.430204355437674e-05, "loss": 1.2837, "step": 7700 }, { "epoch": 1.37, "grad_norm": 4.6875, "learning_rate": 3.427446062354632e-05, "loss": 1.2769, "step": 7705 }, { "epoch": 1.37, "grad_norm": 4.71875, "learning_rate": 3.424687400391518e-05, "loss": 1.283, "step": 7710 }, { "epoch": 1.37, "grad_norm": 4.375, "learning_rate": 3.421928371929018e-05, "loss": 1.2279, "step": 7715 }, { "epoch": 1.37, "grad_norm": 4.84375, "learning_rate": 3.4191689793481346e-05, "loss": 1.2468, "step": 7720 }, { "epoch": 1.37, "grad_norm": 5.21875, "learning_rate": 3.416409225030189e-05, "loss": 1.2283, "step": 7725 }, { "epoch": 1.37, "grad_norm": 3.953125, "learning_rate": 3.413649111356808e-05, "loss": 1.1719, "step": 7730 }, { "epoch": 1.37, "grad_norm": 4.78125, "learning_rate": 3.410888640709933e-05, "loss": 1.2873, "step": 7735 }, { "epoch": 1.37, "grad_norm": 4.25, "learning_rate": 3.408127815471812e-05, "loss": 1.18, "step": 7740 }, { "epoch": 1.37, "grad_norm": 4.96875, "learning_rate": 3.4053666380249986e-05, "loss": 1.2085, "step": 7745 }, { "epoch": 1.37, "grad_norm": 4.4375, "learning_rate": 3.402605110752353e-05, "loss": 1.1924, "step": 7750 }, { "epoch": 1.37, "grad_norm": 4.71875, "learning_rate": 3.3998432360370334e-05, "loss": 1.2658, "step": 7755 }, { "epoch": 1.38, "grad_norm": 4.0, "learning_rate": 3.397081016262499e-05, "loss": 1.1559, "step": 7760 }, { "epoch": 1.38, "grad_norm": 4.875, "learning_rate": 3.39431845381251e-05, "loss": 1.1612, "step": 7765 }, { "epoch": 1.38, "grad_norm": 4.65625, "learning_rate": 3.3915555510711185e-05, "loss": 1.2686, "step": 7770 }, { "epoch": 1.38, "grad_norm": 4.59375, "learning_rate": 3.3887923104226714e-05, "loss": 1.2432, "step": 7775 }, { "epoch": 1.38, "grad_norm": 4.3125, "learning_rate": 3.386028734251808e-05, "loss": 1.0922, "step": 7780 }, { "epoch": 1.38, "grad_norm": 4.34375, "learning_rate": 3.383264824943455e-05, "loss": 1.2121, "step": 7785 }, { "epoch": 1.38, "grad_norm": 4.65625, "learning_rate": 3.38050058488283e-05, "loss": 1.2373, "step": 7790 }, { "epoch": 1.38, "grad_norm": 4.5, "learning_rate": 3.377736016455433e-05, "loss": 1.1687, "step": 7795 }, { "epoch": 1.38, "grad_norm": 5.0, "learning_rate": 3.374971122047048e-05, "loss": 1.3151, "step": 7800 }, { "epoch": 1.38, "grad_norm": 4.5625, "learning_rate": 3.372205904043742e-05, "loss": 1.259, "step": 7805 }, { "epoch": 1.38, "grad_norm": 4.78125, "learning_rate": 3.369440364831861e-05, "loss": 1.1974, "step": 7810 }, { "epoch": 1.38, "grad_norm": 5.5, "learning_rate": 3.366674506798025e-05, "loss": 1.2669, "step": 7815 }, { "epoch": 1.39, "grad_norm": 4.46875, "learning_rate": 3.363908332329134e-05, "loss": 1.1812, "step": 7820 }, { "epoch": 1.39, "grad_norm": 4.125, "learning_rate": 3.361141843812356e-05, "loss": 1.2599, "step": 7825 }, { "epoch": 1.39, "grad_norm": 4.03125, "learning_rate": 3.358375043635133e-05, "loss": 1.216, "step": 7830 }, { "epoch": 1.39, "grad_norm": 4.90625, "learning_rate": 3.3556079341851775e-05, "loss": 1.1921, "step": 7835 }, { "epoch": 1.39, "grad_norm": 4.125, "learning_rate": 3.352840517850465e-05, "loss": 1.1924, "step": 7840 }, { "epoch": 1.39, "grad_norm": 4.53125, "learning_rate": 3.3500727970192374e-05, "loss": 1.1985, "step": 7845 }, { "epoch": 1.39, "grad_norm": 4.6875, "learning_rate": 3.347304774080002e-05, "loss": 1.201, "step": 7850 }, { "epoch": 1.39, "grad_norm": 4.5625, "learning_rate": 3.344536451421522e-05, "loss": 1.2272, "step": 7855 }, { "epoch": 1.39, "grad_norm": 4.40625, "learning_rate": 3.341767831432824e-05, "loss": 1.2578, "step": 7860 }, { "epoch": 1.39, "grad_norm": 4.9375, "learning_rate": 3.338998916503186e-05, "loss": 1.2344, "step": 7865 }, { "epoch": 1.39, "grad_norm": 5.375, "learning_rate": 3.3362297090221466e-05, "loss": 1.28, "step": 7870 }, { "epoch": 1.4, "grad_norm": 4.40625, "learning_rate": 3.333460211379493e-05, "loss": 1.2487, "step": 7875 }, { "epoch": 1.4, "grad_norm": 4.125, "learning_rate": 3.3306904259652626e-05, "loss": 1.274, "step": 7880 }, { "epoch": 1.4, "grad_norm": 4.6875, "learning_rate": 3.3279203551697414e-05, "loss": 1.2464, "step": 7885 }, { "epoch": 1.4, "grad_norm": 4.34375, "learning_rate": 3.3251500013834655e-05, "loss": 1.3006, "step": 7890 }, { "epoch": 1.4, "grad_norm": 4.53125, "learning_rate": 3.32237936699721e-05, "loss": 1.2043, "step": 7895 }, { "epoch": 1.4, "grad_norm": 4.25, "learning_rate": 3.319608454401994e-05, "loss": 1.2738, "step": 7900 }, { "epoch": 1.4, "grad_norm": 5.1875, "learning_rate": 3.316837265989077e-05, "loss": 1.1612, "step": 7905 }, { "epoch": 1.4, "grad_norm": 4.21875, "learning_rate": 3.314065804149958e-05, "loss": 1.2169, "step": 7910 }, { "epoch": 1.4, "grad_norm": 4.1875, "learning_rate": 3.3112940712763685e-05, "loss": 1.2207, "step": 7915 }, { "epoch": 1.4, "grad_norm": 4.6875, "learning_rate": 3.308522069760278e-05, "loss": 1.2492, "step": 7920 }, { "epoch": 1.4, "grad_norm": 4.1875, "learning_rate": 3.305749801993882e-05, "loss": 1.1049, "step": 7925 }, { "epoch": 1.41, "grad_norm": 5.03125, "learning_rate": 3.3029772703696145e-05, "loss": 1.1853, "step": 7930 }, { "epoch": 1.41, "grad_norm": 5.03125, "learning_rate": 3.3002044772801284e-05, "loss": 1.3082, "step": 7935 }, { "epoch": 1.41, "grad_norm": 4.59375, "learning_rate": 3.2974314251183066e-05, "loss": 1.2403, "step": 7940 }, { "epoch": 1.41, "grad_norm": 4.46875, "learning_rate": 3.294658116277257e-05, "loss": 1.2429, "step": 7945 }, { "epoch": 1.41, "grad_norm": 4.6875, "learning_rate": 3.291884553150305e-05, "loss": 1.2617, "step": 7950 }, { "epoch": 1.41, "grad_norm": 4.40625, "learning_rate": 3.2891107381309974e-05, "loss": 1.2781, "step": 7955 }, { "epoch": 1.41, "grad_norm": 4.40625, "learning_rate": 3.286336673613099e-05, "loss": 1.2413, "step": 7960 }, { "epoch": 1.41, "grad_norm": 4.34375, "learning_rate": 3.28356236199059e-05, "loss": 1.2386, "step": 7965 }, { "epoch": 1.41, "grad_norm": 4.375, "learning_rate": 3.280787805657662e-05, "loss": 1.2417, "step": 7970 }, { "epoch": 1.41, "grad_norm": 4.8125, "learning_rate": 3.2780130070087196e-05, "loss": 1.1677, "step": 7975 }, { "epoch": 1.41, "grad_norm": 3.96875, "learning_rate": 3.2752379684383754e-05, "loss": 1.2378, "step": 7980 }, { "epoch": 1.41, "grad_norm": 4.71875, "learning_rate": 3.27246269234145e-05, "loss": 1.2372, "step": 7985 }, { "epoch": 1.42, "grad_norm": 4.34375, "learning_rate": 3.269687181112968e-05, "loss": 1.2368, "step": 7990 }, { "epoch": 1.42, "grad_norm": 3.671875, "learning_rate": 3.2669114371481587e-05, "loss": 1.2243, "step": 7995 }, { "epoch": 1.42, "grad_norm": 4.875, "learning_rate": 3.2641354628424494e-05, "loss": 1.2218, "step": 8000 }, { "epoch": 1.42, "grad_norm": 4.1875, "learning_rate": 3.261359260591468e-05, "loss": 1.2455, "step": 8005 }, { "epoch": 1.42, "grad_norm": 4.625, "learning_rate": 3.25858283279104e-05, "loss": 1.2707, "step": 8010 }, { "epoch": 1.42, "grad_norm": 4.65625, "learning_rate": 3.2558061818371835e-05, "loss": 1.2104, "step": 8015 }, { "epoch": 1.42, "grad_norm": 5.5625, "learning_rate": 3.25302931012611e-05, "loss": 1.3039, "step": 8020 }, { "epoch": 1.42, "grad_norm": 4.78125, "learning_rate": 3.2502522200542216e-05, "loss": 1.2611, "step": 8025 }, { "epoch": 1.42, "grad_norm": 4.625, "learning_rate": 3.247474914018109e-05, "loss": 1.1853, "step": 8030 }, { "epoch": 1.42, "grad_norm": 4.34375, "learning_rate": 3.244697394414549e-05, "loss": 1.3064, "step": 8035 }, { "epoch": 1.42, "grad_norm": 5.65625, "learning_rate": 3.241919663640504e-05, "loss": 1.248, "step": 8040 }, { "epoch": 1.43, "grad_norm": 4.71875, "learning_rate": 3.2391417240931166e-05, "loss": 1.2026, "step": 8045 }, { "epoch": 1.43, "grad_norm": 4.28125, "learning_rate": 3.236363578169707e-05, "loss": 1.2549, "step": 8050 }, { "epoch": 1.43, "grad_norm": 4.9375, "learning_rate": 3.2335852282677844e-05, "loss": 1.2203, "step": 8055 }, { "epoch": 1.43, "grad_norm": 4.125, "learning_rate": 3.230806676785021e-05, "loss": 1.1889, "step": 8060 }, { "epoch": 1.43, "grad_norm": 4.3125, "learning_rate": 3.2280279261192685e-05, "loss": 1.1957, "step": 8065 }, { "epoch": 1.43, "grad_norm": 4.25, "learning_rate": 3.225248978668553e-05, "loss": 1.2039, "step": 8070 }, { "epoch": 1.43, "grad_norm": 4.84375, "learning_rate": 3.222469836831067e-05, "loss": 1.2068, "step": 8075 }, { "epoch": 1.43, "grad_norm": 4.0, "learning_rate": 3.219690503005171e-05, "loss": 1.2231, "step": 8080 }, { "epoch": 1.43, "grad_norm": 4.21875, "learning_rate": 3.2169109795893924e-05, "loss": 1.1968, "step": 8085 }, { "epoch": 1.43, "grad_norm": 4.375, "learning_rate": 3.2141312689824204e-05, "loss": 1.2683, "step": 8090 }, { "epoch": 1.43, "grad_norm": 4.8125, "learning_rate": 3.21135137358311e-05, "loss": 1.2401, "step": 8095 }, { "epoch": 1.44, "grad_norm": 4.28125, "learning_rate": 3.2085712957904704e-05, "loss": 1.2254, "step": 8100 }, { "epoch": 1.44, "grad_norm": 4.09375, "learning_rate": 3.205791038003671e-05, "loss": 1.2193, "step": 8105 }, { "epoch": 1.44, "grad_norm": 4.84375, "learning_rate": 3.203010602622036e-05, "loss": 1.2358, "step": 8110 }, { "epoch": 1.44, "grad_norm": 4.71875, "learning_rate": 3.200229992045044e-05, "loss": 1.2211, "step": 8115 }, { "epoch": 1.44, "grad_norm": 3.765625, "learning_rate": 3.197449208672321e-05, "loss": 1.1589, "step": 8120 }, { "epoch": 1.44, "grad_norm": 5.75, "learning_rate": 3.194668254903648e-05, "loss": 1.2716, "step": 8125 }, { "epoch": 1.44, "grad_norm": 4.21875, "learning_rate": 3.191887133138947e-05, "loss": 1.2663, "step": 8130 }, { "epoch": 1.44, "grad_norm": 4.46875, "learning_rate": 3.1891058457782895e-05, "loss": 1.2231, "step": 8135 }, { "epoch": 1.44, "grad_norm": 4.1875, "learning_rate": 3.186324395221888e-05, "loss": 1.2072, "step": 8140 }, { "epoch": 1.44, "grad_norm": 4.0, "learning_rate": 3.183542783870096e-05, "loss": 1.2664, "step": 8145 }, { "epoch": 1.44, "grad_norm": 5.15625, "learning_rate": 3.180761014123407e-05, "loss": 1.2123, "step": 8150 }, { "epoch": 1.45, "grad_norm": 4.75, "learning_rate": 3.177979088382447e-05, "loss": 1.1911, "step": 8155 }, { "epoch": 1.45, "grad_norm": 4.59375, "learning_rate": 3.175197009047982e-05, "loss": 1.265, "step": 8160 }, { "epoch": 1.45, "grad_norm": 4.59375, "learning_rate": 3.172414778520908e-05, "loss": 1.175, "step": 8165 }, { "epoch": 1.45, "grad_norm": 4.71875, "learning_rate": 3.169632399202252e-05, "loss": 1.3004, "step": 8170 }, { "epoch": 1.45, "grad_norm": 4.8125, "learning_rate": 3.1668498734931665e-05, "loss": 1.2015, "step": 8175 }, { "epoch": 1.45, "grad_norm": 4.34375, "learning_rate": 3.164067203794936e-05, "loss": 1.2129, "step": 8180 }, { "epoch": 1.45, "grad_norm": 4.25, "learning_rate": 3.161284392508966e-05, "loss": 1.2226, "step": 8185 }, { "epoch": 1.45, "grad_norm": 5.0625, "learning_rate": 3.1585014420367823e-05, "loss": 1.338, "step": 8190 }, { "epoch": 1.45, "grad_norm": 4.5625, "learning_rate": 3.155718354780034e-05, "loss": 1.2668, "step": 8195 }, { "epoch": 1.45, "grad_norm": 4.5, "learning_rate": 3.1529351331404876e-05, "loss": 1.3103, "step": 8200 }, { "epoch": 1.45, "grad_norm": 4.21875, "learning_rate": 3.1501517795200246e-05, "loss": 1.2984, "step": 8205 }, { "epoch": 1.45, "grad_norm": 4.09375, "learning_rate": 3.147368296320642e-05, "loss": 1.2456, "step": 8210 }, { "epoch": 1.46, "grad_norm": 4.59375, "learning_rate": 3.1445846859444435e-05, "loss": 1.1962, "step": 8215 }, { "epoch": 1.46, "grad_norm": 4.375, "learning_rate": 3.141800950793652e-05, "loss": 1.2575, "step": 8220 }, { "epoch": 1.46, "grad_norm": 4.21875, "learning_rate": 3.13901709327059e-05, "loss": 1.2426, "step": 8225 }, { "epoch": 1.46, "grad_norm": 4.125, "learning_rate": 3.1362331157776876e-05, "loss": 1.2526, "step": 8230 }, { "epoch": 1.46, "grad_norm": 4.78125, "learning_rate": 3.13344902071748e-05, "loss": 1.2345, "step": 8235 }, { "epoch": 1.46, "grad_norm": 4.71875, "learning_rate": 3.130664810492604e-05, "loss": 1.2419, "step": 8240 }, { "epoch": 1.46, "grad_norm": 5.28125, "learning_rate": 3.127880487505792e-05, "loss": 1.2911, "step": 8245 }, { "epoch": 1.46, "grad_norm": 4.65625, "learning_rate": 3.125096054159879e-05, "loss": 1.2828, "step": 8250 }, { "epoch": 1.46, "grad_norm": 4.1875, "learning_rate": 3.122311512857791e-05, "loss": 1.3087, "step": 8255 }, { "epoch": 1.46, "grad_norm": 4.8125, "learning_rate": 3.11952686600255e-05, "loss": 1.2053, "step": 8260 }, { "epoch": 1.46, "grad_norm": 4.875, "learning_rate": 3.116742115997268e-05, "loss": 1.193, "step": 8265 }, { "epoch": 1.47, "grad_norm": 4.90625, "learning_rate": 3.113957265245145e-05, "loss": 1.2133, "step": 8270 }, { "epoch": 1.47, "grad_norm": 4.84375, "learning_rate": 3.111172316149469e-05, "loss": 1.2015, "step": 8275 }, { "epoch": 1.47, "grad_norm": 4.28125, "learning_rate": 3.108387271113614e-05, "loss": 1.233, "step": 8280 }, { "epoch": 1.47, "grad_norm": 4.21875, "learning_rate": 3.105602132541033e-05, "loss": 1.24, "step": 8285 }, { "epoch": 1.47, "grad_norm": 4.34375, "learning_rate": 3.1028169028352663e-05, "loss": 1.1698, "step": 8290 }, { "epoch": 1.47, "grad_norm": 3.921875, "learning_rate": 3.100031584399925e-05, "loss": 1.3059, "step": 8295 }, { "epoch": 1.47, "grad_norm": 4.21875, "learning_rate": 3.0972461796387027e-05, "loss": 1.2142, "step": 8300 }, { "epoch": 1.47, "grad_norm": 4.0625, "learning_rate": 3.094460690955365e-05, "loss": 1.2578, "step": 8305 }, { "epoch": 1.47, "grad_norm": 4.21875, "learning_rate": 3.091675120753749e-05, "loss": 1.2503, "step": 8310 }, { "epoch": 1.47, "grad_norm": 4.09375, "learning_rate": 3.088889471437766e-05, "loss": 1.288, "step": 8315 }, { "epoch": 1.47, "grad_norm": 5.0625, "learning_rate": 3.086103745411391e-05, "loss": 1.2587, "step": 8320 }, { "epoch": 1.48, "grad_norm": 4.375, "learning_rate": 3.083317945078667e-05, "loss": 1.2464, "step": 8325 }, { "epoch": 1.48, "grad_norm": 3.890625, "learning_rate": 3.080532072843704e-05, "loss": 1.2546, "step": 8330 }, { "epoch": 1.48, "grad_norm": 4.5, "learning_rate": 3.07774613111067e-05, "loss": 1.1686, "step": 8335 }, { "epoch": 1.48, "grad_norm": 4.53125, "learning_rate": 3.074960122283792e-05, "loss": 1.2485, "step": 8340 }, { "epoch": 1.48, "grad_norm": 4.65625, "learning_rate": 3.0721740487673606e-05, "loss": 1.2651, "step": 8345 }, { "epoch": 1.48, "grad_norm": 4.21875, "learning_rate": 3.069387912965717e-05, "loss": 1.2785, "step": 8350 }, { "epoch": 1.48, "grad_norm": 5.09375, "learning_rate": 3.0666017172832594e-05, "loss": 1.3134, "step": 8355 }, { "epoch": 1.48, "grad_norm": 4.15625, "learning_rate": 3.063815464124436e-05, "loss": 1.2162, "step": 8360 }, { "epoch": 1.48, "grad_norm": 4.3125, "learning_rate": 3.061029155893744e-05, "loss": 1.1924, "step": 8365 }, { "epoch": 1.48, "grad_norm": 4.40625, "learning_rate": 3.058242794995731e-05, "loss": 1.2103, "step": 8370 }, { "epoch": 1.48, "grad_norm": 4.28125, "learning_rate": 3.055456383834987e-05, "loss": 1.1494, "step": 8375 }, { "epoch": 1.48, "grad_norm": 4.34375, "learning_rate": 3.052669924816144e-05, "loss": 1.2265, "step": 8380 }, { "epoch": 1.49, "grad_norm": 5.03125, "learning_rate": 3.0498834203438825e-05, "loss": 1.2099, "step": 8385 }, { "epoch": 1.49, "grad_norm": 4.6875, "learning_rate": 3.0470968728229156e-05, "loss": 1.1725, "step": 8390 }, { "epoch": 1.49, "grad_norm": 5.0, "learning_rate": 3.044310284657994e-05, "loss": 1.2158, "step": 8395 }, { "epoch": 1.49, "grad_norm": 4.78125, "learning_rate": 3.041523658253906e-05, "loss": 1.2203, "step": 8400 }, { "epoch": 1.49, "grad_norm": 4.375, "learning_rate": 3.038736996015473e-05, "loss": 1.2895, "step": 8405 }, { "epoch": 1.49, "grad_norm": 5.25, "learning_rate": 3.035950300347544e-05, "loss": 1.284, "step": 8410 }, { "epoch": 1.49, "grad_norm": 3.96875, "learning_rate": 3.0331635736550014e-05, "loss": 1.2822, "step": 8415 }, { "epoch": 1.49, "grad_norm": 4.65625, "learning_rate": 3.0303768183427504e-05, "loss": 1.2586, "step": 8420 }, { "epoch": 1.49, "grad_norm": 4.09375, "learning_rate": 3.0275900368157222e-05, "loss": 1.2682, "step": 8425 }, { "epoch": 1.49, "grad_norm": 4.71875, "learning_rate": 3.0248032314788733e-05, "loss": 1.1847, "step": 8430 }, { "epoch": 1.49, "grad_norm": 4.625, "learning_rate": 3.022016404737175e-05, "loss": 1.2741, "step": 8435 }, { "epoch": 1.5, "grad_norm": 4.40625, "learning_rate": 3.019229558995623e-05, "loss": 1.2583, "step": 8440 }, { "epoch": 1.5, "grad_norm": 4.28125, "learning_rate": 3.0164426966592264e-05, "loss": 1.1743, "step": 8445 }, { "epoch": 1.5, "grad_norm": 4.375, "learning_rate": 3.0136558201330086e-05, "loss": 1.3288, "step": 8450 }, { "epoch": 1.5, "grad_norm": 4.28125, "learning_rate": 3.0108689318220058e-05, "loss": 1.1603, "step": 8455 }, { "epoch": 1.5, "grad_norm": 4.8125, "learning_rate": 3.0080820341312637e-05, "loss": 1.1689, "step": 8460 }, { "epoch": 1.5, "grad_norm": 4.15625, "learning_rate": 3.0052951294658376e-05, "loss": 1.2477, "step": 8465 }, { "epoch": 1.5, "grad_norm": 4.90625, "learning_rate": 3.002508220230788e-05, "loss": 1.2148, "step": 8470 }, { "epoch": 1.5, "grad_norm": 4.4375, "learning_rate": 2.9997213088311784e-05, "loss": 1.2865, "step": 8475 }, { "epoch": 1.5, "grad_norm": 4.03125, "learning_rate": 2.996934397672076e-05, "loss": 1.2427, "step": 8480 }, { "epoch": 1.5, "grad_norm": 4.03125, "learning_rate": 2.9941474891585464e-05, "loss": 1.2736, "step": 8485 }, { "epoch": 1.5, "grad_norm": 4.53125, "learning_rate": 2.9913605856956522e-05, "loss": 1.2205, "step": 8490 }, { "epoch": 1.51, "grad_norm": 5.15625, "learning_rate": 2.9885736896884532e-05, "loss": 1.3078, "step": 8495 }, { "epoch": 1.51, "grad_norm": 4.15625, "learning_rate": 2.985786803542004e-05, "loss": 1.2615, "step": 8500 }, { "epoch": 1.51, "grad_norm": 3.984375, "learning_rate": 2.9829999296613477e-05, "loss": 1.2039, "step": 8505 }, { "epoch": 1.51, "grad_norm": 4.0, "learning_rate": 2.9802130704515188e-05, "loss": 1.2577, "step": 8510 }, { "epoch": 1.51, "grad_norm": 4.71875, "learning_rate": 2.9774262283175376e-05, "loss": 1.2148, "step": 8515 }, { "epoch": 1.51, "grad_norm": 4.21875, "learning_rate": 2.9746394056644106e-05, "loss": 1.293, "step": 8520 }, { "epoch": 1.51, "grad_norm": 4.0625, "learning_rate": 2.9718526048971282e-05, "loss": 1.1719, "step": 8525 }, { "epoch": 1.51, "grad_norm": 4.5625, "learning_rate": 2.9690658284206603e-05, "loss": 1.2483, "step": 8530 }, { "epoch": 1.51, "grad_norm": 4.46875, "learning_rate": 2.966279078639956e-05, "loss": 1.2634, "step": 8535 }, { "epoch": 1.51, "grad_norm": 4.8125, "learning_rate": 2.9634923579599448e-05, "loss": 1.2112, "step": 8540 }, { "epoch": 1.51, "grad_norm": 3.984375, "learning_rate": 2.9607056687855252e-05, "loss": 1.3329, "step": 8545 }, { "epoch": 1.52, "grad_norm": 5.0625, "learning_rate": 2.957919013521574e-05, "loss": 1.274, "step": 8550 }, { "epoch": 1.52, "grad_norm": 4.4375, "learning_rate": 2.9551323945729357e-05, "loss": 1.2245, "step": 8555 }, { "epoch": 1.52, "grad_norm": 4.25, "learning_rate": 2.952345814344423e-05, "loss": 1.2301, "step": 8560 }, { "epoch": 1.52, "grad_norm": 4.625, "learning_rate": 2.949559275240818e-05, "loss": 1.1579, "step": 8565 }, { "epoch": 1.52, "grad_norm": 4.4375, "learning_rate": 2.946772779666864e-05, "loss": 1.2767, "step": 8570 }, { "epoch": 1.52, "grad_norm": 3.890625, "learning_rate": 2.9439863300272683e-05, "loss": 1.2106, "step": 8575 }, { "epoch": 1.52, "grad_norm": 4.0625, "learning_rate": 2.9411999287267002e-05, "loss": 1.2151, "step": 8580 }, { "epoch": 1.52, "grad_norm": 4.71875, "learning_rate": 2.9384135781697858e-05, "loss": 1.2881, "step": 8585 }, { "epoch": 1.52, "grad_norm": 4.90625, "learning_rate": 2.9356272807611054e-05, "loss": 1.2159, "step": 8590 }, { "epoch": 1.52, "grad_norm": 4.6875, "learning_rate": 2.932841038905197e-05, "loss": 1.2455, "step": 8595 }, { "epoch": 1.52, "grad_norm": 4.53125, "learning_rate": 2.930054855006548e-05, "loss": 1.2668, "step": 8600 }, { "epoch": 1.52, "grad_norm": 4.34375, "learning_rate": 2.9272687314695975e-05, "loss": 1.2085, "step": 8605 }, { "epoch": 1.53, "grad_norm": 4.75, "learning_rate": 2.9244826706987314e-05, "loss": 1.1821, "step": 8610 }, { "epoch": 1.53, "grad_norm": 4.90625, "learning_rate": 2.9216966750982808e-05, "loss": 1.2171, "step": 8615 }, { "epoch": 1.53, "grad_norm": 5.1875, "learning_rate": 2.9189107470725232e-05, "loss": 1.225, "step": 8620 }, { "epoch": 1.53, "grad_norm": 5.28125, "learning_rate": 2.9161248890256763e-05, "loss": 1.2498, "step": 8625 }, { "epoch": 1.53, "grad_norm": 5.40625, "learning_rate": 2.913339103361896e-05, "loss": 1.2482, "step": 8630 }, { "epoch": 1.53, "grad_norm": 4.28125, "learning_rate": 2.9105533924852786e-05, "loss": 1.1949, "step": 8635 }, { "epoch": 1.53, "grad_norm": 4.40625, "learning_rate": 2.9077677587998524e-05, "loss": 1.2296, "step": 8640 }, { "epoch": 1.53, "grad_norm": 4.5625, "learning_rate": 2.9049822047095822e-05, "loss": 1.2159, "step": 8645 }, { "epoch": 1.53, "grad_norm": 4.9375, "learning_rate": 2.902196732618363e-05, "loss": 1.2127, "step": 8650 }, { "epoch": 1.53, "grad_norm": 4.46875, "learning_rate": 2.8994113449300177e-05, "loss": 1.2132, "step": 8655 }, { "epoch": 1.53, "grad_norm": 3.984375, "learning_rate": 2.8966260440482975e-05, "loss": 1.2137, "step": 8660 }, { "epoch": 1.54, "grad_norm": 4.9375, "learning_rate": 2.8938408323768812e-05, "loss": 1.2337, "step": 8665 }, { "epoch": 1.54, "grad_norm": 4.96875, "learning_rate": 2.8910557123193657e-05, "loss": 1.1794, "step": 8670 }, { "epoch": 1.54, "grad_norm": 4.71875, "learning_rate": 2.8882706862792727e-05, "loss": 1.2256, "step": 8675 }, { "epoch": 1.54, "grad_norm": 4.25, "learning_rate": 2.885485756660041e-05, "loss": 1.2319, "step": 8680 }, { "epoch": 1.54, "grad_norm": 5.28125, "learning_rate": 2.8827009258650253e-05, "loss": 1.1926, "step": 8685 }, { "epoch": 1.54, "grad_norm": 4.0, "learning_rate": 2.8799161962974983e-05, "loss": 1.2584, "step": 8690 }, { "epoch": 1.54, "grad_norm": 4.71875, "learning_rate": 2.8771315703606418e-05, "loss": 1.2559, "step": 8695 }, { "epoch": 1.54, "grad_norm": 4.46875, "learning_rate": 2.8743470504575492e-05, "loss": 1.2884, "step": 8700 }, { "epoch": 1.54, "grad_norm": 5.0, "learning_rate": 2.871562638991226e-05, "loss": 1.2673, "step": 8705 }, { "epoch": 1.54, "grad_norm": 4.0, "learning_rate": 2.868778338364578e-05, "loss": 1.2837, "step": 8710 }, { "epoch": 1.54, "grad_norm": 4.21875, "learning_rate": 2.8659941509804196e-05, "loss": 1.1962, "step": 8715 }, { "epoch": 1.55, "grad_norm": 4.34375, "learning_rate": 2.863210079241466e-05, "loss": 1.3414, "step": 8720 }, { "epoch": 1.55, "grad_norm": 4.78125, "learning_rate": 2.8604261255503318e-05, "loss": 1.3023, "step": 8725 }, { "epoch": 1.55, "grad_norm": 4.53125, "learning_rate": 2.857642292309533e-05, "loss": 1.2483, "step": 8730 }, { "epoch": 1.55, "grad_norm": 4.96875, "learning_rate": 2.8548585819214768e-05, "loss": 1.2163, "step": 8735 }, { "epoch": 1.55, "grad_norm": 3.6875, "learning_rate": 2.8520749967884678e-05, "loss": 1.1771, "step": 8740 }, { "epoch": 1.55, "grad_norm": 4.03125, "learning_rate": 2.8492915393127036e-05, "loss": 1.2, "step": 8745 }, { "epoch": 1.55, "grad_norm": 4.1875, "learning_rate": 2.8465082118962684e-05, "loss": 1.2173, "step": 8750 }, { "epoch": 1.55, "grad_norm": 4.84375, "learning_rate": 2.8437250169411343e-05, "loss": 1.2842, "step": 8755 }, { "epoch": 1.55, "grad_norm": 4.53125, "learning_rate": 2.8409419568491622e-05, "loss": 1.2555, "step": 8760 }, { "epoch": 1.55, "grad_norm": 3.984375, "learning_rate": 2.838159034022093e-05, "loss": 1.2138, "step": 8765 }, { "epoch": 1.55, "grad_norm": 5.84375, "learning_rate": 2.8353762508615525e-05, "loss": 1.177, "step": 8770 }, { "epoch": 1.55, "grad_norm": 4.8125, "learning_rate": 2.8325936097690432e-05, "loss": 1.1827, "step": 8775 }, { "epoch": 1.56, "grad_norm": 4.4375, "learning_rate": 2.829811113145945e-05, "loss": 1.2602, "step": 8780 }, { "epoch": 1.56, "grad_norm": 4.21875, "learning_rate": 2.8270287633935162e-05, "loss": 1.2557, "step": 8785 }, { "epoch": 1.56, "grad_norm": 4.46875, "learning_rate": 2.8242465629128863e-05, "loss": 1.2532, "step": 8790 }, { "epoch": 1.56, "grad_norm": 5.1875, "learning_rate": 2.8214645141050548e-05, "loss": 1.1997, "step": 8795 }, { "epoch": 1.56, "grad_norm": 5.46875, "learning_rate": 2.818682619370893e-05, "loss": 1.2626, "step": 8800 }, { "epoch": 1.56, "grad_norm": 4.84375, "learning_rate": 2.8159008811111362e-05, "loss": 1.1736, "step": 8805 }, { "epoch": 1.56, "grad_norm": 5.03125, "learning_rate": 2.813119301726388e-05, "loss": 1.1969, "step": 8810 }, { "epoch": 1.56, "grad_norm": 3.890625, "learning_rate": 2.8103378836171118e-05, "loss": 1.1917, "step": 8815 }, { "epoch": 1.56, "grad_norm": 4.875, "learning_rate": 2.807556629183633e-05, "loss": 1.3117, "step": 8820 }, { "epoch": 1.56, "grad_norm": 4.125, "learning_rate": 2.804775540826136e-05, "loss": 1.1878, "step": 8825 }, { "epoch": 1.56, "grad_norm": 5.0625, "learning_rate": 2.8019946209446638e-05, "loss": 1.2779, "step": 8830 }, { "epoch": 1.57, "grad_norm": 4.5625, "learning_rate": 2.7992138719391096e-05, "loss": 1.1921, "step": 8835 }, { "epoch": 1.57, "grad_norm": 5.03125, "learning_rate": 2.796433296209223e-05, "loss": 1.2221, "step": 8840 }, { "epoch": 1.57, "grad_norm": 4.71875, "learning_rate": 2.7936528961546024e-05, "loss": 1.2652, "step": 8845 }, { "epoch": 1.57, "grad_norm": 5.34375, "learning_rate": 2.7908726741746935e-05, "loss": 1.2786, "step": 8850 }, { "epoch": 1.57, "grad_norm": 4.09375, "learning_rate": 2.7880926326687916e-05, "loss": 1.246, "step": 8855 }, { "epoch": 1.57, "grad_norm": 4.71875, "learning_rate": 2.7853127740360328e-05, "loss": 1.2047, "step": 8860 }, { "epoch": 1.57, "grad_norm": 4.25, "learning_rate": 2.7825331006753964e-05, "loss": 1.285, "step": 8865 }, { "epoch": 1.57, "grad_norm": 4.40625, "learning_rate": 2.7797536149857047e-05, "loss": 1.2147, "step": 8870 }, { "epoch": 1.57, "grad_norm": 4.46875, "learning_rate": 2.7769743193656144e-05, "loss": 1.2411, "step": 8875 }, { "epoch": 1.57, "grad_norm": 4.875, "learning_rate": 2.774195216213618e-05, "loss": 1.21, "step": 8880 }, { "epoch": 1.57, "grad_norm": 5.03125, "learning_rate": 2.7714163079280456e-05, "loss": 1.2104, "step": 8885 }, { "epoch": 1.58, "grad_norm": 4.4375, "learning_rate": 2.768637596907055e-05, "loss": 1.2257, "step": 8890 }, { "epoch": 1.58, "grad_norm": 4.53125, "learning_rate": 2.7658590855486366e-05, "loss": 1.2227, "step": 8895 }, { "epoch": 1.58, "grad_norm": 4.78125, "learning_rate": 2.7630807762506068e-05, "loss": 1.2063, "step": 8900 }, { "epoch": 1.58, "grad_norm": 4.625, "learning_rate": 2.7603026714106082e-05, "loss": 1.2168, "step": 8905 }, { "epoch": 1.58, "grad_norm": 4.4375, "learning_rate": 2.7575247734261086e-05, "loss": 1.271, "step": 8910 }, { "epoch": 1.58, "grad_norm": 4.5625, "learning_rate": 2.754747084694395e-05, "loss": 1.2211, "step": 8915 }, { "epoch": 1.58, "grad_norm": 5.0, "learning_rate": 2.7519696076125733e-05, "loss": 1.2754, "step": 8920 }, { "epoch": 1.58, "grad_norm": 4.5, "learning_rate": 2.74919234457757e-05, "loss": 1.198, "step": 8925 }, { "epoch": 1.58, "grad_norm": 4.34375, "learning_rate": 2.746415297986123e-05, "loss": 1.2156, "step": 8930 }, { "epoch": 1.58, "grad_norm": 4.4375, "learning_rate": 2.7436384702347875e-05, "loss": 1.2143, "step": 8935 }, { "epoch": 1.58, "grad_norm": 4.71875, "learning_rate": 2.740861863719926e-05, "loss": 1.2102, "step": 8940 }, { "epoch": 1.59, "grad_norm": 4.3125, "learning_rate": 2.738085480837711e-05, "loss": 1.1612, "step": 8945 }, { "epoch": 1.59, "grad_norm": 4.4375, "learning_rate": 2.735309323984124e-05, "loss": 1.2098, "step": 8950 }, { "epoch": 1.59, "grad_norm": 4.5625, "learning_rate": 2.7325333955549514e-05, "loss": 1.2207, "step": 8955 }, { "epoch": 1.59, "grad_norm": 4.46875, "learning_rate": 2.729757697945779e-05, "loss": 1.288, "step": 8960 }, { "epoch": 1.59, "grad_norm": 4.40625, "learning_rate": 2.726982233551997e-05, "loss": 1.2354, "step": 8965 }, { "epoch": 1.59, "grad_norm": 4.8125, "learning_rate": 2.7242070047687924e-05, "loss": 1.2786, "step": 8970 }, { "epoch": 1.59, "grad_norm": 4.53125, "learning_rate": 2.7214320139911502e-05, "loss": 1.2516, "step": 8975 }, { "epoch": 1.59, "grad_norm": 4.1875, "learning_rate": 2.7186572636138486e-05, "loss": 1.3034, "step": 8980 }, { "epoch": 1.59, "grad_norm": 4.25, "learning_rate": 2.7158827560314584e-05, "loss": 1.2262, "step": 8985 }, { "epoch": 1.59, "grad_norm": 4.65625, "learning_rate": 2.7131084936383415e-05, "loss": 1.2658, "step": 8990 }, { "epoch": 1.59, "grad_norm": 4.375, "learning_rate": 2.710334478828651e-05, "loss": 1.2092, "step": 8995 }, { "epoch": 1.59, "grad_norm": 4.59375, "learning_rate": 2.70756071399632e-05, "loss": 1.3058, "step": 9000 }, { "epoch": 1.6, "grad_norm": 5.3125, "learning_rate": 2.7047872015350716e-05, "loss": 1.2714, "step": 9005 }, { "epoch": 1.6, "grad_norm": 4.40625, "learning_rate": 2.7020139438384086e-05, "loss": 1.1851, "step": 9010 }, { "epoch": 1.6, "grad_norm": 4.78125, "learning_rate": 2.6992409432996128e-05, "loss": 1.1893, "step": 9015 }, { "epoch": 1.6, "grad_norm": 4.4375, "learning_rate": 2.6964682023117474e-05, "loss": 1.2345, "step": 9020 }, { "epoch": 1.6, "grad_norm": 5.53125, "learning_rate": 2.6936957232676483e-05, "loss": 1.2383, "step": 9025 }, { "epoch": 1.6, "grad_norm": 4.8125, "learning_rate": 2.6909235085599265e-05, "loss": 1.2523, "step": 9030 }, { "epoch": 1.6, "grad_norm": 3.953125, "learning_rate": 2.688151560580967e-05, "loss": 1.22, "step": 9035 }, { "epoch": 1.6, "grad_norm": 4.53125, "learning_rate": 2.6853798817229224e-05, "loss": 1.1901, "step": 9040 }, { "epoch": 1.6, "grad_norm": 4.5625, "learning_rate": 2.6826084743777124e-05, "loss": 1.1808, "step": 9045 }, { "epoch": 1.6, "grad_norm": 4.625, "learning_rate": 2.679837340937024e-05, "loss": 1.2417, "step": 9050 }, { "epoch": 1.6, "grad_norm": 4.25, "learning_rate": 2.6770664837923066e-05, "loss": 1.2738, "step": 9055 }, { "epoch": 1.61, "grad_norm": 5.21875, "learning_rate": 2.674295905334773e-05, "loss": 1.19, "step": 9060 }, { "epoch": 1.61, "grad_norm": 4.21875, "learning_rate": 2.6715256079553932e-05, "loss": 1.1987, "step": 9065 }, { "epoch": 1.61, "grad_norm": 4.5625, "learning_rate": 2.6687555940448957e-05, "loss": 1.1559, "step": 9070 }, { "epoch": 1.61, "grad_norm": 4.3125, "learning_rate": 2.6659858659937654e-05, "loss": 1.245, "step": 9075 }, { "epoch": 1.61, "grad_norm": 5.09375, "learning_rate": 2.6632164261922395e-05, "loss": 1.2034, "step": 9080 }, { "epoch": 1.61, "grad_norm": 3.890625, "learning_rate": 2.6604472770303054e-05, "loss": 1.2308, "step": 9085 }, { "epoch": 1.61, "grad_norm": 5.40625, "learning_rate": 2.657678420897702e-05, "loss": 1.3012, "step": 9090 }, { "epoch": 1.61, "grad_norm": 5.03125, "learning_rate": 2.6549098601839126e-05, "loss": 1.2385, "step": 9095 }, { "epoch": 1.61, "grad_norm": 5.375, "learning_rate": 2.6521415972781684e-05, "loss": 1.2172, "step": 9100 }, { "epoch": 1.61, "grad_norm": 5.1875, "learning_rate": 2.6493736345694416e-05, "loss": 1.2625, "step": 9105 }, { "epoch": 1.61, "grad_norm": 4.5625, "learning_rate": 2.6466059744464445e-05, "loss": 1.1966, "step": 9110 }, { "epoch": 1.62, "grad_norm": 4.59375, "learning_rate": 2.6438386192976317e-05, "loss": 1.2159, "step": 9115 }, { "epoch": 1.62, "grad_norm": 4.875, "learning_rate": 2.6410715715111932e-05, "loss": 1.2293, "step": 9120 }, { "epoch": 1.62, "grad_norm": 4.65625, "learning_rate": 2.6383048334750513e-05, "loss": 1.2052, "step": 9125 }, { "epoch": 1.62, "grad_norm": 4.78125, "learning_rate": 2.6355384075768643e-05, "loss": 1.2416, "step": 9130 }, { "epoch": 1.62, "grad_norm": 4.25, "learning_rate": 2.632772296204019e-05, "loss": 1.1974, "step": 9135 }, { "epoch": 1.62, "grad_norm": 4.4375, "learning_rate": 2.630006501743631e-05, "loss": 1.1715, "step": 9140 }, { "epoch": 1.62, "grad_norm": 4.8125, "learning_rate": 2.627241026582544e-05, "loss": 1.2225, "step": 9145 }, { "epoch": 1.62, "grad_norm": 5.21875, "learning_rate": 2.6244758731073237e-05, "loss": 1.2184, "step": 9150 }, { "epoch": 1.62, "grad_norm": 4.6875, "learning_rate": 2.6217110437042598e-05, "loss": 1.2246, "step": 9155 }, { "epoch": 1.62, "grad_norm": 4.28125, "learning_rate": 2.6189465407593638e-05, "loss": 1.2882, "step": 9160 }, { "epoch": 1.62, "grad_norm": 5.3125, "learning_rate": 2.616182366658362e-05, "loss": 1.209, "step": 9165 }, { "epoch": 1.62, "grad_norm": 4.1875, "learning_rate": 2.613418523786699e-05, "loss": 1.2612, "step": 9170 }, { "epoch": 1.63, "grad_norm": 4.6875, "learning_rate": 2.610655014529534e-05, "loss": 1.2252, "step": 9175 }, { "epoch": 1.63, "grad_norm": 4.5, "learning_rate": 2.6078918412717357e-05, "loss": 1.2659, "step": 9180 }, { "epoch": 1.63, "grad_norm": 4.28125, "learning_rate": 2.6051290063978867e-05, "loss": 1.216, "step": 9185 }, { "epoch": 1.63, "grad_norm": 5.34375, "learning_rate": 2.6023665122922733e-05, "loss": 1.2545, "step": 9190 }, { "epoch": 1.63, "grad_norm": 3.984375, "learning_rate": 2.5996043613388908e-05, "loss": 1.2076, "step": 9195 }, { "epoch": 1.63, "grad_norm": 5.53125, "learning_rate": 2.596842555921439e-05, "loss": 1.2028, "step": 9200 }, { "epoch": 1.63, "grad_norm": 4.75, "learning_rate": 2.5940810984233167e-05, "loss": 1.284, "step": 9205 }, { "epoch": 1.63, "grad_norm": 5.375, "learning_rate": 2.591319991227623e-05, "loss": 1.262, "step": 9210 }, { "epoch": 1.63, "grad_norm": 4.84375, "learning_rate": 2.588559236717157e-05, "loss": 1.3166, "step": 9215 }, { "epoch": 1.63, "grad_norm": 5.0625, "learning_rate": 2.5857988372744105e-05, "loss": 1.1672, "step": 9220 }, { "epoch": 1.63, "grad_norm": 4.25, "learning_rate": 2.5830387952815718e-05, "loss": 1.2457, "step": 9225 }, { "epoch": 1.64, "grad_norm": 5.625, "learning_rate": 2.580279113120517e-05, "loss": 1.2493, "step": 9230 }, { "epoch": 1.64, "grad_norm": 4.5, "learning_rate": 2.5775197931728153e-05, "loss": 1.2051, "step": 9235 }, { "epoch": 1.64, "grad_norm": 4.375, "learning_rate": 2.574760837819722e-05, "loss": 1.1574, "step": 9240 }, { "epoch": 1.64, "grad_norm": 5.40625, "learning_rate": 2.572002249442178e-05, "loss": 1.2136, "step": 9245 }, { "epoch": 1.64, "grad_norm": 4.15625, "learning_rate": 2.5692440304208052e-05, "loss": 1.2651, "step": 9250 }, { "epoch": 1.64, "grad_norm": 4.40625, "learning_rate": 2.566486183135911e-05, "loss": 1.2261, "step": 9255 }, { "epoch": 1.64, "grad_norm": 4.71875, "learning_rate": 2.5637287099674775e-05, "loss": 1.2706, "step": 9260 }, { "epoch": 1.64, "grad_norm": 4.5625, "learning_rate": 2.560971613295168e-05, "loss": 1.2091, "step": 9265 }, { "epoch": 1.64, "grad_norm": 4.25, "learning_rate": 2.558214895498318e-05, "loss": 1.283, "step": 9270 }, { "epoch": 1.64, "grad_norm": 4.28125, "learning_rate": 2.5554585589559366e-05, "loss": 1.2048, "step": 9275 }, { "epoch": 1.64, "grad_norm": 4.125, "learning_rate": 2.5527026060467052e-05, "loss": 1.2221, "step": 9280 }, { "epoch": 1.65, "grad_norm": 5.40625, "learning_rate": 2.549947039148974e-05, "loss": 1.3335, "step": 9285 }, { "epoch": 1.65, "grad_norm": 3.96875, "learning_rate": 2.5471918606407575e-05, "loss": 1.2139, "step": 9290 }, { "epoch": 1.65, "grad_norm": 4.15625, "learning_rate": 2.5444370728997383e-05, "loss": 1.2791, "step": 9295 }, { "epoch": 1.65, "grad_norm": 4.28125, "learning_rate": 2.5416826783032596e-05, "loss": 1.2211, "step": 9300 }, { "epoch": 1.65, "grad_norm": 5.09375, "learning_rate": 2.538928679228326e-05, "loss": 1.234, "step": 9305 }, { "epoch": 1.65, "grad_norm": 4.96875, "learning_rate": 2.5361750780516014e-05, "loss": 1.2598, "step": 9310 }, { "epoch": 1.65, "grad_norm": 4.8125, "learning_rate": 2.533421877149405e-05, "loss": 1.2217, "step": 9315 }, { "epoch": 1.65, "grad_norm": 4.71875, "learning_rate": 2.5306690788977108e-05, "loss": 1.2758, "step": 9320 }, { "epoch": 1.65, "grad_norm": 4.84375, "learning_rate": 2.527916685672148e-05, "loss": 1.1914, "step": 9325 }, { "epoch": 1.65, "grad_norm": 4.59375, "learning_rate": 2.5251646998479924e-05, "loss": 1.246, "step": 9330 }, { "epoch": 1.65, "grad_norm": 4.84375, "learning_rate": 2.522413123800171e-05, "loss": 1.2388, "step": 9335 }, { "epoch": 1.66, "grad_norm": 4.90625, "learning_rate": 2.519661959903255e-05, "loss": 1.2098, "step": 9340 }, { "epoch": 1.66, "grad_norm": 5.25, "learning_rate": 2.5169112105314614e-05, "loss": 1.2659, "step": 9345 }, { "epoch": 1.66, "grad_norm": 4.65625, "learning_rate": 2.5141608780586494e-05, "loss": 1.2059, "step": 9350 }, { "epoch": 1.66, "grad_norm": 4.46875, "learning_rate": 2.5114109648583172e-05, "loss": 1.2435, "step": 9355 }, { "epoch": 1.66, "grad_norm": 4.125, "learning_rate": 2.5086614733036024e-05, "loss": 1.207, "step": 9360 }, { "epoch": 1.66, "grad_norm": 4.8125, "learning_rate": 2.5059124057672797e-05, "loss": 1.2742, "step": 9365 }, { "epoch": 1.66, "grad_norm": 4.4375, "learning_rate": 2.5031637646217557e-05, "loss": 1.2421, "step": 9370 }, { "epoch": 1.66, "grad_norm": 5.0, "learning_rate": 2.500415552239069e-05, "loss": 1.2334, "step": 9375 }, { "epoch": 1.66, "grad_norm": 4.625, "learning_rate": 2.4976677709908902e-05, "loss": 1.1658, "step": 9380 }, { "epoch": 1.66, "grad_norm": 4.59375, "learning_rate": 2.4949204232485163e-05, "loss": 1.3034, "step": 9385 }, { "epoch": 1.66, "grad_norm": 3.9375, "learning_rate": 2.492173511382871e-05, "loss": 1.2718, "step": 9390 }, { "epoch": 1.66, "grad_norm": 4.4375, "learning_rate": 2.4894270377645012e-05, "loss": 1.2526, "step": 9395 }, { "epoch": 1.67, "grad_norm": 4.875, "learning_rate": 2.486681004763575e-05, "loss": 1.3093, "step": 9400 }, { "epoch": 1.67, "grad_norm": 5.03125, "learning_rate": 2.483935414749882e-05, "loss": 1.2795, "step": 9405 }, { "epoch": 1.67, "grad_norm": 4.6875, "learning_rate": 2.4811902700928293e-05, "loss": 1.254, "step": 9410 }, { "epoch": 1.67, "grad_norm": 5.34375, "learning_rate": 2.4784455731614378e-05, "loss": 1.2126, "step": 9415 }, { "epoch": 1.67, "grad_norm": 5.5625, "learning_rate": 2.4757013263243443e-05, "loss": 1.2063, "step": 9420 }, { "epoch": 1.67, "grad_norm": 4.375, "learning_rate": 2.4729575319497944e-05, "loss": 1.1925, "step": 9425 }, { "epoch": 1.67, "grad_norm": 4.21875, "learning_rate": 2.470214192405647e-05, "loss": 1.202, "step": 9430 }, { "epoch": 1.67, "grad_norm": 5.125, "learning_rate": 2.467471310059365e-05, "loss": 1.2715, "step": 9435 }, { "epoch": 1.67, "grad_norm": 4.21875, "learning_rate": 2.4647288872780175e-05, "loss": 1.2031, "step": 9440 }, { "epoch": 1.67, "grad_norm": 4.5625, "learning_rate": 2.461986926428279e-05, "loss": 1.2453, "step": 9445 }, { "epoch": 1.67, "grad_norm": 5.0, "learning_rate": 2.4592454298764244e-05, "loss": 1.184, "step": 9450 }, { "epoch": 1.68, "grad_norm": 4.59375, "learning_rate": 2.4565043999883262e-05, "loss": 1.3203, "step": 9455 }, { "epoch": 1.68, "grad_norm": 4.625, "learning_rate": 2.4537638391294565e-05, "loss": 1.182, "step": 9460 }, { "epoch": 1.68, "grad_norm": 3.96875, "learning_rate": 2.4510237496648813e-05, "loss": 1.2617, "step": 9465 }, { "epoch": 1.68, "grad_norm": 4.78125, "learning_rate": 2.4482841339592592e-05, "loss": 1.2159, "step": 9470 }, { "epoch": 1.68, "grad_norm": 5.15625, "learning_rate": 2.4455449943768427e-05, "loss": 1.2835, "step": 9475 }, { "epoch": 1.68, "grad_norm": 5.125, "learning_rate": 2.4428063332814694e-05, "loss": 1.2629, "step": 9480 }, { "epoch": 1.68, "grad_norm": 4.46875, "learning_rate": 2.440068153036567e-05, "loss": 1.1939, "step": 9485 }, { "epoch": 1.68, "grad_norm": 4.34375, "learning_rate": 2.437330456005149e-05, "loss": 1.2551, "step": 9490 }, { "epoch": 1.68, "grad_norm": 4.3125, "learning_rate": 2.4345932445498088e-05, "loss": 1.2447, "step": 9495 }, { "epoch": 1.68, "grad_norm": 5.15625, "learning_rate": 2.4318565210327217e-05, "loss": 1.23, "step": 9500 }, { "epoch": 1.68, "grad_norm": 4.34375, "learning_rate": 2.4291202878156442e-05, "loss": 1.2258, "step": 9505 }, { "epoch": 1.69, "grad_norm": 4.53125, "learning_rate": 2.4263845472599058e-05, "loss": 1.3003, "step": 9510 }, { "epoch": 1.69, "grad_norm": 4.5625, "learning_rate": 2.423649301726415e-05, "loss": 1.2588, "step": 9515 }, { "epoch": 1.69, "grad_norm": 4.5625, "learning_rate": 2.42091455357565e-05, "loss": 1.1817, "step": 9520 }, { "epoch": 1.69, "grad_norm": 4.78125, "learning_rate": 2.4181803051676597e-05, "loss": 1.2335, "step": 9525 }, { "epoch": 1.69, "grad_norm": 4.25, "learning_rate": 2.4154465588620656e-05, "loss": 1.2761, "step": 9530 }, { "epoch": 1.69, "grad_norm": 4.65625, "learning_rate": 2.412713317018052e-05, "loss": 1.2143, "step": 9535 }, { "epoch": 1.69, "grad_norm": 4.375, "learning_rate": 2.4099805819943678e-05, "loss": 1.2038, "step": 9540 }, { "epoch": 1.69, "grad_norm": 4.75, "learning_rate": 2.4072483561493276e-05, "loss": 1.2474, "step": 9545 }, { "epoch": 1.69, "grad_norm": 4.0625, "learning_rate": 2.404516641840803e-05, "loss": 1.2524, "step": 9550 }, { "epoch": 1.69, "grad_norm": 4.84375, "learning_rate": 2.4017854414262276e-05, "loss": 1.2824, "step": 9555 }, { "epoch": 1.69, "grad_norm": 4.0, "learning_rate": 2.399054757262589e-05, "loss": 1.1828, "step": 9560 }, { "epoch": 1.69, "grad_norm": 4.625, "learning_rate": 2.3963245917064284e-05, "loss": 1.1856, "step": 9565 }, { "epoch": 1.7, "grad_norm": 4.625, "learning_rate": 2.3935949471138434e-05, "loss": 1.2389, "step": 9570 }, { "epoch": 1.7, "grad_norm": 4.375, "learning_rate": 2.39086582584048e-05, "loss": 1.251, "step": 9575 }, { "epoch": 1.7, "grad_norm": 4.6875, "learning_rate": 2.3881372302415304e-05, "loss": 1.2674, "step": 9580 }, { "epoch": 1.7, "grad_norm": 4.78125, "learning_rate": 2.385409162671737e-05, "loss": 1.2149, "step": 9585 }, { "epoch": 1.7, "grad_norm": 4.28125, "learning_rate": 2.382681625485383e-05, "loss": 1.2698, "step": 9590 }, { "epoch": 1.7, "grad_norm": 4.625, "learning_rate": 2.3799546210362964e-05, "loss": 1.1687, "step": 9595 }, { "epoch": 1.7, "grad_norm": 5.25, "learning_rate": 2.3772281516778444e-05, "loss": 1.2271, "step": 9600 }, { "epoch": 1.7, "grad_norm": 5.125, "learning_rate": 2.3745022197629315e-05, "loss": 1.2536, "step": 9605 }, { "epoch": 1.7, "grad_norm": 4.25, "learning_rate": 2.3717768276440008e-05, "loss": 1.2312, "step": 9610 }, { "epoch": 1.7, "grad_norm": 3.734375, "learning_rate": 2.3690519776730285e-05, "loss": 1.2273, "step": 9615 }, { "epoch": 1.7, "grad_norm": 4.25, "learning_rate": 2.3663276722015216e-05, "loss": 1.3296, "step": 9620 }, { "epoch": 1.71, "grad_norm": 4.71875, "learning_rate": 2.3636039135805184e-05, "loss": 1.2372, "step": 9625 }, { "epoch": 1.71, "grad_norm": 4.375, "learning_rate": 2.360880704160586e-05, "loss": 1.1788, "step": 9630 }, { "epoch": 1.71, "grad_norm": 5.0625, "learning_rate": 2.3581580462918153e-05, "loss": 1.1941, "step": 9635 }, { "epoch": 1.71, "grad_norm": 5.21875, "learning_rate": 2.3554359423238237e-05, "loss": 1.234, "step": 9640 }, { "epoch": 1.71, "grad_norm": 4.0625, "learning_rate": 2.3527143946057487e-05, "loss": 1.2099, "step": 9645 }, { "epoch": 1.71, "grad_norm": 4.71875, "learning_rate": 2.3499934054862485e-05, "loss": 1.2352, "step": 9650 }, { "epoch": 1.71, "grad_norm": 5.5625, "learning_rate": 2.3472729773135007e-05, "loss": 1.2427, "step": 9655 }, { "epoch": 1.71, "grad_norm": 5.125, "learning_rate": 2.344553112435196e-05, "loss": 1.2594, "step": 9660 }, { "epoch": 1.71, "grad_norm": 4.375, "learning_rate": 2.341833813198541e-05, "loss": 1.2753, "step": 9665 }, { "epoch": 1.71, "grad_norm": 4.21875, "learning_rate": 2.3391150819502537e-05, "loss": 1.2217, "step": 9670 }, { "epoch": 1.71, "grad_norm": 4.6875, "learning_rate": 2.3363969210365603e-05, "loss": 1.2343, "step": 9675 }, { "epoch": 1.72, "grad_norm": 4.75, "learning_rate": 2.3336793328031985e-05, "loss": 1.3, "step": 9680 }, { "epoch": 1.72, "grad_norm": 4.34375, "learning_rate": 2.330962319595408e-05, "loss": 1.1879, "step": 9685 }, { "epoch": 1.72, "grad_norm": 5.625, "learning_rate": 2.3282458837579332e-05, "loss": 1.308, "step": 9690 }, { "epoch": 1.72, "grad_norm": 4.4375, "learning_rate": 2.325530027635024e-05, "loss": 1.2088, "step": 9695 }, { "epoch": 1.72, "grad_norm": 4.34375, "learning_rate": 2.322814753570424e-05, "loss": 1.2339, "step": 9700 }, { "epoch": 1.72, "grad_norm": 5.4375, "learning_rate": 2.3201000639073782e-05, "loss": 1.2529, "step": 9705 }, { "epoch": 1.72, "grad_norm": 4.84375, "learning_rate": 2.317385960988627e-05, "loss": 1.1942, "step": 9710 }, { "epoch": 1.72, "grad_norm": 4.625, "learning_rate": 2.3146724471564027e-05, "loss": 1.219, "step": 9715 }, { "epoch": 1.72, "grad_norm": 4.28125, "learning_rate": 2.3119595247524316e-05, "loss": 1.2757, "step": 9720 }, { "epoch": 1.72, "grad_norm": 4.375, "learning_rate": 2.309247196117928e-05, "loss": 1.2365, "step": 9725 }, { "epoch": 1.72, "grad_norm": 4.4375, "learning_rate": 2.306535463593593e-05, "loss": 1.2427, "step": 9730 }, { "epoch": 1.73, "grad_norm": 4.3125, "learning_rate": 2.303824329519616e-05, "loss": 1.2575, "step": 9735 }, { "epoch": 1.73, "grad_norm": 3.875, "learning_rate": 2.3011137962356688e-05, "loss": 1.2286, "step": 9740 }, { "epoch": 1.73, "grad_norm": 4.9375, "learning_rate": 2.298403866080903e-05, "loss": 1.1721, "step": 9745 }, { "epoch": 1.73, "grad_norm": 4.21875, "learning_rate": 2.2956945413939523e-05, "loss": 1.1961, "step": 9750 }, { "epoch": 1.73, "grad_norm": 5.46875, "learning_rate": 2.292985824512926e-05, "loss": 1.2077, "step": 9755 }, { "epoch": 1.73, "grad_norm": 4.78125, "learning_rate": 2.2902777177754086e-05, "loss": 1.2, "step": 9760 }, { "epoch": 1.73, "grad_norm": 4.21875, "learning_rate": 2.2875702235184604e-05, "loss": 1.1728, "step": 9765 }, { "epoch": 1.73, "grad_norm": 4.90625, "learning_rate": 2.28486334407861e-05, "loss": 1.23, "step": 9770 }, { "epoch": 1.73, "grad_norm": 4.59375, "learning_rate": 2.282157081791859e-05, "loss": 1.2252, "step": 9775 }, { "epoch": 1.73, "grad_norm": 4.53125, "learning_rate": 2.279451438993675e-05, "loss": 1.2351, "step": 9780 }, { "epoch": 1.73, "grad_norm": 4.40625, "learning_rate": 2.2767464180189878e-05, "loss": 1.2357, "step": 9785 }, { "epoch": 1.73, "grad_norm": 4.28125, "learning_rate": 2.2740420212021957e-05, "loss": 1.1844, "step": 9790 }, { "epoch": 1.74, "grad_norm": 4.28125, "learning_rate": 2.2713382508771552e-05, "loss": 1.2376, "step": 9795 }, { "epoch": 1.74, "grad_norm": 4.5, "learning_rate": 2.2686351093771822e-05, "loss": 1.207, "step": 9800 }, { "epoch": 1.74, "grad_norm": 5.09375, "learning_rate": 2.2659325990350515e-05, "loss": 1.1888, "step": 9805 }, { "epoch": 1.74, "grad_norm": 4.3125, "learning_rate": 2.2632307221829922e-05, "loss": 1.2653, "step": 9810 }, { "epoch": 1.74, "grad_norm": 5.125, "learning_rate": 2.2605294811526856e-05, "loss": 1.2371, "step": 9815 }, { "epoch": 1.74, "grad_norm": 4.90625, "learning_rate": 2.257828878275268e-05, "loss": 1.1258, "step": 9820 }, { "epoch": 1.74, "grad_norm": 4.28125, "learning_rate": 2.2551289158813215e-05, "loss": 1.2575, "step": 9825 }, { "epoch": 1.74, "grad_norm": 5.28125, "learning_rate": 2.2524295963008756e-05, "loss": 1.2713, "step": 9830 }, { "epoch": 1.74, "grad_norm": 4.21875, "learning_rate": 2.249730921863407e-05, "loss": 1.2667, "step": 9835 }, { "epoch": 1.74, "grad_norm": 5.46875, "learning_rate": 2.247032894897834e-05, "loss": 1.3168, "step": 9840 }, { "epoch": 1.74, "grad_norm": 4.25, "learning_rate": 2.2443355177325178e-05, "loss": 1.2969, "step": 9845 }, { "epoch": 1.75, "grad_norm": 4.34375, "learning_rate": 2.2416387926952558e-05, "loss": 1.2525, "step": 9850 }, { "epoch": 1.75, "grad_norm": 4.78125, "learning_rate": 2.2389427221132862e-05, "loss": 1.252, "step": 9855 }, { "epoch": 1.75, "grad_norm": 5.1875, "learning_rate": 2.2362473083132805e-05, "loss": 1.2446, "step": 9860 }, { "epoch": 1.75, "grad_norm": 4.375, "learning_rate": 2.2335525536213447e-05, "loss": 1.2247, "step": 9865 }, { "epoch": 1.75, "grad_norm": 3.984375, "learning_rate": 2.2308584603630127e-05, "loss": 1.2229, "step": 9870 }, { "epoch": 1.75, "grad_norm": 4.25, "learning_rate": 2.228165030863252e-05, "loss": 1.2547, "step": 9875 }, { "epoch": 1.75, "grad_norm": 4.84375, "learning_rate": 2.2254722674464545e-05, "loss": 1.2873, "step": 9880 }, { "epoch": 1.75, "grad_norm": 4.40625, "learning_rate": 2.222780172436438e-05, "loss": 1.1998, "step": 9885 }, { "epoch": 1.75, "grad_norm": 4.59375, "learning_rate": 2.220088748156444e-05, "loss": 1.2603, "step": 9890 }, { "epoch": 1.75, "grad_norm": 4.75, "learning_rate": 2.2173979969291325e-05, "loss": 1.2554, "step": 9895 }, { "epoch": 1.75, "grad_norm": 4.75, "learning_rate": 2.2147079210765873e-05, "loss": 1.165, "step": 9900 }, { "epoch": 1.76, "grad_norm": 4.46875, "learning_rate": 2.2120185229203064e-05, "loss": 1.1839, "step": 9905 }, { "epoch": 1.76, "grad_norm": 4.28125, "learning_rate": 2.2093298047812023e-05, "loss": 1.3033, "step": 9910 }, { "epoch": 1.76, "grad_norm": 4.34375, "learning_rate": 2.206641768979603e-05, "loss": 1.2337, "step": 9915 }, { "epoch": 1.76, "grad_norm": 4.78125, "learning_rate": 2.2039544178352465e-05, "loss": 1.2301, "step": 9920 }, { "epoch": 1.76, "grad_norm": 4.1875, "learning_rate": 2.201267753667278e-05, "loss": 1.2458, "step": 9925 }, { "epoch": 1.76, "grad_norm": 5.28125, "learning_rate": 2.1985817787942542e-05, "loss": 1.2615, "step": 9930 }, { "epoch": 1.76, "grad_norm": 4.78125, "learning_rate": 2.1958964955341318e-05, "loss": 1.216, "step": 9935 }, { "epoch": 1.76, "grad_norm": 5.28125, "learning_rate": 2.1932119062042753e-05, "loss": 1.2266, "step": 9940 }, { "epoch": 1.76, "grad_norm": 4.1875, "learning_rate": 2.1905280131214484e-05, "loss": 1.2197, "step": 9945 }, { "epoch": 1.76, "grad_norm": 4.96875, "learning_rate": 2.187844818601813e-05, "loss": 1.1808, "step": 9950 }, { "epoch": 1.76, "grad_norm": 4.375, "learning_rate": 2.18516232496093e-05, "loss": 1.2173, "step": 9955 }, { "epoch": 1.76, "grad_norm": 4.40625, "learning_rate": 2.182480534513754e-05, "loss": 1.222, "step": 9960 }, { "epoch": 1.77, "grad_norm": 3.84375, "learning_rate": 2.1797994495746325e-05, "loss": 1.2228, "step": 9965 }, { "epoch": 1.77, "grad_norm": 4.6875, "learning_rate": 2.1771190724573066e-05, "loss": 1.1286, "step": 9970 }, { "epoch": 1.77, "grad_norm": 4.40625, "learning_rate": 2.1744394054749027e-05, "loss": 1.1704, "step": 9975 }, { "epoch": 1.77, "grad_norm": 4.59375, "learning_rate": 2.171760450939938e-05, "loss": 1.2409, "step": 9980 }, { "epoch": 1.77, "grad_norm": 4.875, "learning_rate": 2.1690822111643146e-05, "loss": 1.2584, "step": 9985 }, { "epoch": 1.77, "grad_norm": 5.125, "learning_rate": 2.1664046884593147e-05, "loss": 1.2551, "step": 9990 }, { "epoch": 1.77, "grad_norm": 4.375, "learning_rate": 2.1637278851356034e-05, "loss": 1.2601, "step": 9995 }, { "epoch": 1.77, "grad_norm": 4.9375, "learning_rate": 2.1610518035032265e-05, "loss": 1.2196, "step": 10000 }, { "epoch": 1.77, "grad_norm": 4.6875, "learning_rate": 2.1583764458716046e-05, "loss": 1.1912, "step": 10005 }, { "epoch": 1.77, "grad_norm": 4.09375, "learning_rate": 2.1557018145495357e-05, "loss": 1.2636, "step": 10010 }, { "epoch": 1.77, "grad_norm": 5.28125, "learning_rate": 2.1530279118451885e-05, "loss": 1.2286, "step": 10015 }, { "epoch": 1.78, "grad_norm": 4.09375, "learning_rate": 2.1503547400661045e-05, "loss": 1.1407, "step": 10020 }, { "epoch": 1.78, "grad_norm": 4.03125, "learning_rate": 2.1476823015191943e-05, "loss": 1.3042, "step": 10025 }, { "epoch": 1.78, "grad_norm": 4.21875, "learning_rate": 2.1450105985107368e-05, "loss": 1.2152, "step": 10030 }, { "epoch": 1.78, "grad_norm": 4.03125, "learning_rate": 2.1423396333463733e-05, "loss": 1.2356, "step": 10035 }, { "epoch": 1.78, "grad_norm": 5.46875, "learning_rate": 2.1396694083311116e-05, "loss": 1.282, "step": 10040 }, { "epoch": 1.78, "grad_norm": 4.125, "learning_rate": 2.1369999257693177e-05, "loss": 1.1886, "step": 10045 }, { "epoch": 1.78, "grad_norm": 4.375, "learning_rate": 2.1343311879647188e-05, "loss": 1.166, "step": 10050 }, { "epoch": 1.78, "grad_norm": 3.90625, "learning_rate": 2.1316631972203997e-05, "loss": 1.3303, "step": 10055 }, { "epoch": 1.78, "grad_norm": 4.6875, "learning_rate": 2.1289959558387974e-05, "loss": 1.2369, "step": 10060 }, { "epoch": 1.78, "grad_norm": 4.375, "learning_rate": 2.126329466121707e-05, "loss": 1.2935, "step": 10065 }, { "epoch": 1.78, "grad_norm": 5.21875, "learning_rate": 2.1236637303702725e-05, "loss": 1.2315, "step": 10070 }, { "epoch": 1.79, "grad_norm": 4.3125, "learning_rate": 2.120998750884985e-05, "loss": 1.2752, "step": 10075 }, { "epoch": 1.79, "grad_norm": 4.4375, "learning_rate": 2.1183345299656873e-05, "loss": 1.21, "step": 10080 }, { "epoch": 1.79, "grad_norm": 4.5, "learning_rate": 2.115671069911565e-05, "loss": 1.2175, "step": 10085 }, { "epoch": 1.79, "grad_norm": 4.34375, "learning_rate": 2.1130083730211464e-05, "loss": 1.2338, "step": 10090 }, { "epoch": 1.79, "grad_norm": 4.5, "learning_rate": 2.1103464415923034e-05, "loss": 1.2426, "step": 10095 }, { "epoch": 1.79, "grad_norm": 4.34375, "learning_rate": 2.107685277922244e-05, "loss": 1.1281, "step": 10100 }, { "epoch": 1.79, "grad_norm": 4.40625, "learning_rate": 2.105024884307519e-05, "loss": 1.2764, "step": 10105 }, { "epoch": 1.79, "grad_norm": 4.90625, "learning_rate": 2.1023652630440103e-05, "loss": 1.2374, "step": 10110 }, { "epoch": 1.79, "grad_norm": 5.03125, "learning_rate": 2.0997064164269346e-05, "loss": 1.3095, "step": 10115 }, { "epoch": 1.79, "grad_norm": 4.59375, "learning_rate": 2.097048346750839e-05, "loss": 1.1059, "step": 10120 }, { "epoch": 1.79, "grad_norm": 4.71875, "learning_rate": 2.0943910563096023e-05, "loss": 1.2832, "step": 10125 }, { "epoch": 1.8, "grad_norm": 5.5625, "learning_rate": 2.0917345473964282e-05, "loss": 1.3174, "step": 10130 }, { "epoch": 1.8, "grad_norm": 4.0625, "learning_rate": 2.089078822303849e-05, "loss": 1.1703, "step": 10135 }, { "epoch": 1.8, "grad_norm": 4.4375, "learning_rate": 2.086423883323717e-05, "loss": 1.1923, "step": 10140 }, { "epoch": 1.8, "grad_norm": 4.4375, "learning_rate": 2.0837697327472093e-05, "loss": 1.2809, "step": 10145 }, { "epoch": 1.8, "grad_norm": 4.625, "learning_rate": 2.081116372864822e-05, "loss": 1.1973, "step": 10150 }, { "epoch": 1.8, "grad_norm": 5.5625, "learning_rate": 2.0784638059663678e-05, "loss": 1.1783, "step": 10155 }, { "epoch": 1.8, "grad_norm": 5.75, "learning_rate": 2.075812034340974e-05, "loss": 1.2367, "step": 10160 }, { "epoch": 1.8, "grad_norm": 4.625, "learning_rate": 2.0731610602770848e-05, "loss": 1.2496, "step": 10165 }, { "epoch": 1.8, "grad_norm": 4.4375, "learning_rate": 2.0705108860624534e-05, "loss": 1.237, "step": 10170 }, { "epoch": 1.8, "grad_norm": 4.21875, "learning_rate": 2.067861513984144e-05, "loss": 1.1803, "step": 10175 }, { "epoch": 1.8, "grad_norm": 4.34375, "learning_rate": 2.065212946328528e-05, "loss": 1.2181, "step": 10180 }, { "epoch": 1.8, "grad_norm": 5.15625, "learning_rate": 2.0625651853812823e-05, "loss": 1.2144, "step": 10185 }, { "epoch": 1.81, "grad_norm": 4.96875, "learning_rate": 2.0599182334273897e-05, "loss": 1.2775, "step": 10190 }, { "epoch": 1.81, "grad_norm": 4.9375, "learning_rate": 2.0572720927511328e-05, "loss": 1.2114, "step": 10195 }, { "epoch": 1.81, "grad_norm": 4.28125, "learning_rate": 2.0546267656360938e-05, "loss": 1.2475, "step": 10200 }, { "epoch": 1.81, "grad_norm": 5.0, "learning_rate": 2.051982254365155e-05, "loss": 1.196, "step": 10205 }, { "epoch": 1.81, "grad_norm": 5.0, "learning_rate": 2.049338561220492e-05, "loss": 1.2303, "step": 10210 }, { "epoch": 1.81, "grad_norm": 5.0, "learning_rate": 2.0466956884835765e-05, "loss": 1.2322, "step": 10215 }, { "epoch": 1.81, "grad_norm": 5.46875, "learning_rate": 2.044053638435171e-05, "loss": 1.3535, "step": 10220 }, { "epoch": 1.81, "grad_norm": 4.78125, "learning_rate": 2.0414124133553268e-05, "loss": 1.3262, "step": 10225 }, { "epoch": 1.81, "grad_norm": 4.46875, "learning_rate": 2.0387720155233867e-05, "loss": 1.2842, "step": 10230 }, { "epoch": 1.81, "grad_norm": 4.96875, "learning_rate": 2.0361324472179784e-05, "loss": 1.2659, "step": 10235 }, { "epoch": 1.81, "grad_norm": 5.15625, "learning_rate": 2.033493710717011e-05, "loss": 1.2254, "step": 10240 }, { "epoch": 1.82, "grad_norm": 4.5, "learning_rate": 2.0308558082976797e-05, "loss": 1.2179, "step": 10245 }, { "epoch": 1.82, "grad_norm": 4.625, "learning_rate": 2.0282187422364568e-05, "loss": 1.26, "step": 10250 }, { "epoch": 1.82, "grad_norm": 5.59375, "learning_rate": 2.0255825148090936e-05, "loss": 1.2971, "step": 10255 }, { "epoch": 1.82, "grad_norm": 5.25, "learning_rate": 2.0229471282906196e-05, "loss": 1.1954, "step": 10260 }, { "epoch": 1.82, "grad_norm": 5.1875, "learning_rate": 2.0203125849553353e-05, "loss": 1.2139, "step": 10265 }, { "epoch": 1.82, "grad_norm": 4.96875, "learning_rate": 2.0176788870768166e-05, "loss": 1.215, "step": 10270 }, { "epoch": 1.82, "grad_norm": 5.375, "learning_rate": 2.015046036927909e-05, "loss": 1.2333, "step": 10275 }, { "epoch": 1.82, "grad_norm": 4.8125, "learning_rate": 2.012414036780726e-05, "loss": 1.2548, "step": 10280 }, { "epoch": 1.82, "grad_norm": 4.5, "learning_rate": 2.0097828889066455e-05, "loss": 1.236, "step": 10285 }, { "epoch": 1.82, "grad_norm": 4.46875, "learning_rate": 2.0071525955763142e-05, "loss": 1.2928, "step": 10290 }, { "epoch": 1.82, "grad_norm": 4.3125, "learning_rate": 2.0045231590596376e-05, "loss": 1.214, "step": 10295 }, { "epoch": 1.83, "grad_norm": 4.40625, "learning_rate": 2.001894581625784e-05, "loss": 1.1807, "step": 10300 }, { "epoch": 1.83, "grad_norm": 4.25, "learning_rate": 1.9992668655431786e-05, "loss": 1.1957, "step": 10305 }, { "epoch": 1.83, "grad_norm": 4.4375, "learning_rate": 1.9966400130795052e-05, "loss": 1.1615, "step": 10310 }, { "epoch": 1.83, "grad_norm": 4.0625, "learning_rate": 1.9940140265017024e-05, "loss": 1.2096, "step": 10315 }, { "epoch": 1.83, "grad_norm": 4.28125, "learning_rate": 1.9913889080759586e-05, "loss": 1.2963, "step": 10320 }, { "epoch": 1.83, "grad_norm": 4.65625, "learning_rate": 1.988764660067715e-05, "loss": 1.2614, "step": 10325 }, { "epoch": 1.83, "grad_norm": 4.53125, "learning_rate": 1.986141284741663e-05, "loss": 1.2647, "step": 10330 }, { "epoch": 1.83, "grad_norm": 4.90625, "learning_rate": 1.9835187843617382e-05, "loss": 1.2544, "step": 10335 }, { "epoch": 1.83, "grad_norm": 4.6875, "learning_rate": 1.9808971611911228e-05, "loss": 1.226, "step": 10340 }, { "epoch": 1.83, "grad_norm": 4.84375, "learning_rate": 1.9782764174922415e-05, "loss": 1.1379, "step": 10345 }, { "epoch": 1.83, "grad_norm": 4.78125, "learning_rate": 1.9756565555267587e-05, "loss": 1.2248, "step": 10350 }, { "epoch": 1.83, "grad_norm": 3.984375, "learning_rate": 1.9730375775555813e-05, "loss": 1.1641, "step": 10355 }, { "epoch": 1.84, "grad_norm": 4.4375, "learning_rate": 1.970419485838851e-05, "loss": 1.2268, "step": 10360 }, { "epoch": 1.84, "grad_norm": 5.0, "learning_rate": 1.9678022826359442e-05, "loss": 1.2274, "step": 10365 }, { "epoch": 1.84, "grad_norm": 5.0625, "learning_rate": 1.9651859702054715e-05, "loss": 1.2171, "step": 10370 }, { "epoch": 1.84, "grad_norm": 4.4375, "learning_rate": 1.9625705508052748e-05, "loss": 1.1871, "step": 10375 }, { "epoch": 1.84, "grad_norm": 5.09375, "learning_rate": 1.9599560266924248e-05, "loss": 1.2849, "step": 10380 }, { "epoch": 1.84, "grad_norm": 4.9375, "learning_rate": 1.95734240012322e-05, "loss": 1.2191, "step": 10385 }, { "epoch": 1.84, "grad_norm": 4.4375, "learning_rate": 1.9547296733531824e-05, "loss": 1.2566, "step": 10390 }, { "epoch": 1.84, "grad_norm": 3.953125, "learning_rate": 1.9521178486370627e-05, "loss": 1.2119, "step": 10395 }, { "epoch": 1.84, "grad_norm": 4.71875, "learning_rate": 1.949506928228828e-05, "loss": 1.2778, "step": 10400 }, { "epoch": 1.84, "grad_norm": 4.625, "learning_rate": 1.9468969143816655e-05, "loss": 1.282, "step": 10405 }, { "epoch": 1.84, "grad_norm": 4.59375, "learning_rate": 1.9442878093479836e-05, "loss": 1.2939, "step": 10410 }, { "epoch": 1.85, "grad_norm": 3.953125, "learning_rate": 1.9416796153794027e-05, "loss": 1.2547, "step": 10415 }, { "epoch": 1.85, "grad_norm": 4.8125, "learning_rate": 1.939072334726758e-05, "loss": 1.2143, "step": 10420 }, { "epoch": 1.85, "grad_norm": 5.0, "learning_rate": 1.9364659696400978e-05, "loss": 1.2312, "step": 10425 }, { "epoch": 1.85, "grad_norm": 4.78125, "learning_rate": 1.9338605223686777e-05, "loss": 1.2821, "step": 10430 }, { "epoch": 1.85, "grad_norm": 4.46875, "learning_rate": 1.9312559951609644e-05, "loss": 1.124, "step": 10435 }, { "epoch": 1.85, "grad_norm": 4.5625, "learning_rate": 1.92865239026463e-05, "loss": 1.2027, "step": 10440 }, { "epoch": 1.85, "grad_norm": 4.0, "learning_rate": 1.926049709926548e-05, "loss": 1.2098, "step": 10445 }, { "epoch": 1.85, "grad_norm": 5.1875, "learning_rate": 1.9234479563927956e-05, "loss": 1.281, "step": 10450 }, { "epoch": 1.85, "grad_norm": 5.0625, "learning_rate": 1.9208471319086508e-05, "loss": 1.2019, "step": 10455 }, { "epoch": 1.85, "grad_norm": 4.21875, "learning_rate": 1.9182472387185897e-05, "loss": 1.1232, "step": 10460 }, { "epoch": 1.85, "grad_norm": 4.625, "learning_rate": 1.9156482790662843e-05, "loss": 1.329, "step": 10465 }, { "epoch": 1.86, "grad_norm": 4.90625, "learning_rate": 1.9130502551945996e-05, "loss": 1.2313, "step": 10470 }, { "epoch": 1.86, "grad_norm": 5.125, "learning_rate": 1.9104531693455965e-05, "loss": 1.2072, "step": 10475 }, { "epoch": 1.86, "grad_norm": 4.09375, "learning_rate": 1.907857023760524e-05, "loss": 1.2114, "step": 10480 }, { "epoch": 1.86, "grad_norm": 5.65625, "learning_rate": 1.9052618206798195e-05, "loss": 1.2417, "step": 10485 }, { "epoch": 1.86, "grad_norm": 4.40625, "learning_rate": 1.9026675623431073e-05, "loss": 1.258, "step": 10490 }, { "epoch": 1.86, "grad_norm": 4.40625, "learning_rate": 1.9000742509891978e-05, "loss": 1.1946, "step": 10495 }, { "epoch": 1.86, "grad_norm": 4.53125, "learning_rate": 1.897481888856081e-05, "loss": 1.1545, "step": 10500 }, { "epoch": 1.86, "grad_norm": 3.78125, "learning_rate": 1.8948904781809318e-05, "loss": 1.2631, "step": 10505 }, { "epoch": 1.86, "grad_norm": 4.625, "learning_rate": 1.892300021200102e-05, "loss": 1.1224, "step": 10510 }, { "epoch": 1.86, "grad_norm": 4.15625, "learning_rate": 1.8897105201491177e-05, "loss": 1.2031, "step": 10515 }, { "epoch": 1.86, "grad_norm": 4.6875, "learning_rate": 1.887121977262685e-05, "loss": 1.1983, "step": 10520 }, { "epoch": 1.87, "grad_norm": 4.21875, "learning_rate": 1.8845343947746813e-05, "loss": 1.2446, "step": 10525 }, { "epoch": 1.87, "grad_norm": 5.3125, "learning_rate": 1.8819477749181528e-05, "loss": 1.24, "step": 10530 }, { "epoch": 1.87, "grad_norm": 4.40625, "learning_rate": 1.8793621199253184e-05, "loss": 1.2421, "step": 10535 }, { "epoch": 1.87, "grad_norm": 4.375, "learning_rate": 1.8767774320275628e-05, "loss": 1.2379, "step": 10540 }, { "epoch": 1.87, "grad_norm": 4.21875, "learning_rate": 1.8741937134554347e-05, "loss": 1.2196, "step": 10545 }, { "epoch": 1.87, "grad_norm": 4.5625, "learning_rate": 1.871610966438649e-05, "loss": 1.2877, "step": 10550 }, { "epoch": 1.87, "grad_norm": 5.625, "learning_rate": 1.8690291932060786e-05, "loss": 1.1435, "step": 10555 }, { "epoch": 1.87, "grad_norm": 4.6875, "learning_rate": 1.866448395985761e-05, "loss": 1.3104, "step": 10560 }, { "epoch": 1.87, "grad_norm": 5.0625, "learning_rate": 1.863868577004889e-05, "loss": 1.2981, "step": 10565 }, { "epoch": 1.87, "grad_norm": 4.53125, "learning_rate": 1.8612897384898088e-05, "loss": 1.2738, "step": 10570 }, { "epoch": 1.87, "grad_norm": 3.9375, "learning_rate": 1.8587118826660236e-05, "loss": 1.1909, "step": 10575 }, { "epoch": 1.87, "grad_norm": 3.84375, "learning_rate": 1.8561350117581876e-05, "loss": 1.2391, "step": 10580 }, { "epoch": 1.88, "grad_norm": 5.15625, "learning_rate": 1.8535591279901043e-05, "loss": 1.2303, "step": 10585 }, { "epoch": 1.88, "grad_norm": 4.59375, "learning_rate": 1.8509842335847268e-05, "loss": 1.2008, "step": 10590 }, { "epoch": 1.88, "grad_norm": 4.5, "learning_rate": 1.848410330764152e-05, "loss": 1.2388, "step": 10595 }, { "epoch": 1.88, "grad_norm": 4.25, "learning_rate": 1.845837421749624e-05, "loss": 1.2994, "step": 10600 }, { "epoch": 1.88, "grad_norm": 4.71875, "learning_rate": 1.8432655087615286e-05, "loss": 1.2355, "step": 10605 }, { "epoch": 1.88, "grad_norm": 4.5, "learning_rate": 1.8406945940193905e-05, "loss": 1.2061, "step": 10610 }, { "epoch": 1.88, "grad_norm": 5.28125, "learning_rate": 1.838124679741873e-05, "loss": 1.354, "step": 10615 }, { "epoch": 1.88, "grad_norm": 4.5, "learning_rate": 1.8355557681467778e-05, "loss": 1.2445, "step": 10620 }, { "epoch": 1.88, "grad_norm": 4.40625, "learning_rate": 1.8329878614510394e-05, "loss": 1.2624, "step": 10625 }, { "epoch": 1.88, "grad_norm": 4.3125, "learning_rate": 1.8304209618707276e-05, "loss": 1.1915, "step": 10630 }, { "epoch": 1.88, "grad_norm": 4.03125, "learning_rate": 1.8278550716210384e-05, "loss": 1.2452, "step": 10635 }, { "epoch": 1.89, "grad_norm": 5.375, "learning_rate": 1.825290192916303e-05, "loss": 1.2235, "step": 10640 }, { "epoch": 1.89, "grad_norm": 4.84375, "learning_rate": 1.822726327969975e-05, "loss": 1.2426, "step": 10645 }, { "epoch": 1.89, "grad_norm": 4.03125, "learning_rate": 1.820163478994635e-05, "loss": 1.2017, "step": 10650 }, { "epoch": 1.89, "grad_norm": 4.78125, "learning_rate": 1.817601648201986e-05, "loss": 1.2144, "step": 10655 }, { "epoch": 1.89, "grad_norm": 4.9375, "learning_rate": 1.815040837802853e-05, "loss": 1.306, "step": 10660 }, { "epoch": 1.89, "grad_norm": 4.625, "learning_rate": 1.8124810500071796e-05, "loss": 1.189, "step": 10665 }, { "epoch": 1.89, "grad_norm": 5.25, "learning_rate": 1.809922287024029e-05, "loss": 1.1786, "step": 10670 }, { "epoch": 1.89, "grad_norm": 4.6875, "learning_rate": 1.807364551061577e-05, "loss": 1.183, "step": 10675 }, { "epoch": 1.89, "grad_norm": 4.78125, "learning_rate": 1.8048078443271136e-05, "loss": 1.2631, "step": 10680 }, { "epoch": 1.89, "grad_norm": 4.21875, "learning_rate": 1.8022521690270435e-05, "loss": 1.2272, "step": 10685 }, { "epoch": 1.89, "grad_norm": 4.84375, "learning_rate": 1.7996975273668795e-05, "loss": 1.2642, "step": 10690 }, { "epoch": 1.9, "grad_norm": 4.59375, "learning_rate": 1.7971439215512408e-05, "loss": 1.2779, "step": 10695 }, { "epoch": 1.9, "grad_norm": 5.0, "learning_rate": 1.794591353783855e-05, "loss": 1.1558, "step": 10700 }, { "epoch": 1.9, "grad_norm": 5.6875, "learning_rate": 1.7920398262675535e-05, "loss": 1.1882, "step": 10705 }, { "epoch": 1.9, "grad_norm": 4.1875, "learning_rate": 1.7894893412042675e-05, "loss": 1.2556, "step": 10710 }, { "epoch": 1.9, "grad_norm": 4.8125, "learning_rate": 1.7869399007950325e-05, "loss": 1.1912, "step": 10715 }, { "epoch": 1.9, "grad_norm": 4.15625, "learning_rate": 1.7843915072399783e-05, "loss": 1.2509, "step": 10720 }, { "epoch": 1.9, "grad_norm": 4.8125, "learning_rate": 1.781844162738336e-05, "loss": 1.2804, "step": 10725 }, { "epoch": 1.9, "grad_norm": 5.125, "learning_rate": 1.7792978694884282e-05, "loss": 1.2376, "step": 10730 }, { "epoch": 1.9, "grad_norm": 4.15625, "learning_rate": 1.7767526296876696e-05, "loss": 1.2882, "step": 10735 }, { "epoch": 1.9, "grad_norm": 4.625, "learning_rate": 1.7742084455325692e-05, "loss": 1.2569, "step": 10740 }, { "epoch": 1.9, "grad_norm": 4.0625, "learning_rate": 1.7716653192187213e-05, "loss": 1.1303, "step": 10745 }, { "epoch": 1.9, "grad_norm": 4.71875, "learning_rate": 1.7691232529408093e-05, "loss": 1.1813, "step": 10750 }, { "epoch": 1.91, "grad_norm": 4.1875, "learning_rate": 1.7665822488926018e-05, "loss": 1.2501, "step": 10755 }, { "epoch": 1.91, "grad_norm": 5.71875, "learning_rate": 1.7640423092669488e-05, "loss": 1.2762, "step": 10760 }, { "epoch": 1.91, "grad_norm": 4.3125, "learning_rate": 1.7615034362557857e-05, "loss": 1.2352, "step": 10765 }, { "epoch": 1.91, "grad_norm": 4.84375, "learning_rate": 1.7589656320501243e-05, "loss": 1.2967, "step": 10770 }, { "epoch": 1.91, "grad_norm": 4.6875, "learning_rate": 1.756428898840054e-05, "loss": 1.3101, "step": 10775 }, { "epoch": 1.91, "grad_norm": 3.78125, "learning_rate": 1.753893238814741e-05, "loss": 1.2059, "step": 10780 }, { "epoch": 1.91, "grad_norm": 4.03125, "learning_rate": 1.7513586541624254e-05, "loss": 1.2329, "step": 10785 }, { "epoch": 1.91, "grad_norm": 4.96875, "learning_rate": 1.748825147070418e-05, "loss": 1.2249, "step": 10790 }, { "epoch": 1.91, "grad_norm": 4.875, "learning_rate": 1.7462927197251008e-05, "loss": 1.2879, "step": 10795 }, { "epoch": 1.91, "grad_norm": 4.4375, "learning_rate": 1.743761374311924e-05, "loss": 1.2719, "step": 10800 }, { "epoch": 1.91, "grad_norm": 4.5625, "learning_rate": 1.7412311130154042e-05, "loss": 1.2503, "step": 10805 }, { "epoch": 1.92, "grad_norm": 5.15625, "learning_rate": 1.738701938019123e-05, "loss": 1.2268, "step": 10810 }, { "epoch": 1.92, "grad_norm": 4.6875, "learning_rate": 1.7361738515057212e-05, "loss": 1.2435, "step": 10815 }, { "epoch": 1.92, "grad_norm": 5.625, "learning_rate": 1.733646855656904e-05, "loss": 1.2645, "step": 10820 }, { "epoch": 1.92, "grad_norm": 5.125, "learning_rate": 1.731120952653434e-05, "loss": 1.1833, "step": 10825 }, { "epoch": 1.92, "grad_norm": 5.53125, "learning_rate": 1.7285961446751323e-05, "loss": 1.2146, "step": 10830 }, { "epoch": 1.92, "grad_norm": 4.5, "learning_rate": 1.7260724339008706e-05, "loss": 1.246, "step": 10835 }, { "epoch": 1.92, "grad_norm": 5.1875, "learning_rate": 1.7235498225085774e-05, "loss": 1.2344, "step": 10840 }, { "epoch": 1.92, "grad_norm": 4.875, "learning_rate": 1.7210283126752328e-05, "loss": 1.2346, "step": 10845 }, { "epoch": 1.92, "grad_norm": 4.40625, "learning_rate": 1.7185079065768637e-05, "loss": 1.2486, "step": 10850 }, { "epoch": 1.92, "grad_norm": 4.125, "learning_rate": 1.7159886063885474e-05, "loss": 1.2362, "step": 10855 }, { "epoch": 1.92, "grad_norm": 5.1875, "learning_rate": 1.7134704142844058e-05, "loss": 1.211, "step": 10860 }, { "epoch": 1.93, "grad_norm": 4.46875, "learning_rate": 1.7109533324376017e-05, "loss": 1.3006, "step": 10865 }, { "epoch": 1.93, "grad_norm": 4.65625, "learning_rate": 1.7084373630203427e-05, "loss": 1.2022, "step": 10870 }, { "epoch": 1.93, "grad_norm": 4.53125, "learning_rate": 1.705922508203878e-05, "loss": 1.2281, "step": 10875 }, { "epoch": 1.93, "grad_norm": 5.0625, "learning_rate": 1.7034087701584896e-05, "loss": 1.2491, "step": 10880 }, { "epoch": 1.93, "grad_norm": 3.953125, "learning_rate": 1.7008961510534997e-05, "loss": 1.2503, "step": 10885 }, { "epoch": 1.93, "grad_norm": 3.546875, "learning_rate": 1.6983846530572664e-05, "loss": 1.1476, "step": 10890 }, { "epoch": 1.93, "grad_norm": 4.6875, "learning_rate": 1.6958742783371756e-05, "loss": 1.1949, "step": 10895 }, { "epoch": 1.93, "grad_norm": 4.90625, "learning_rate": 1.6933650290596465e-05, "loss": 1.1943, "step": 10900 }, { "epoch": 1.93, "grad_norm": 4.6875, "learning_rate": 1.6908569073901278e-05, "loss": 1.2673, "step": 10905 }, { "epoch": 1.93, "grad_norm": 3.8125, "learning_rate": 1.6883499154930928e-05, "loss": 1.2509, "step": 10910 }, { "epoch": 1.93, "grad_norm": 5.0, "learning_rate": 1.6858440555320408e-05, "loss": 1.2243, "step": 10915 }, { "epoch": 1.94, "grad_norm": 4.84375, "learning_rate": 1.683339329669495e-05, "loss": 1.2138, "step": 10920 }, { "epoch": 1.94, "grad_norm": 4.96875, "learning_rate": 1.6808357400670002e-05, "loss": 1.1986, "step": 10925 }, { "epoch": 1.94, "grad_norm": 5.15625, "learning_rate": 1.678333288885118e-05, "loss": 1.277, "step": 10930 }, { "epoch": 1.94, "grad_norm": 4.53125, "learning_rate": 1.6758319782834325e-05, "loss": 1.2099, "step": 10935 }, { "epoch": 1.94, "grad_norm": 5.1875, "learning_rate": 1.6733318104205368e-05, "loss": 1.2928, "step": 10940 }, { "epoch": 1.94, "grad_norm": 4.8125, "learning_rate": 1.670832787454043e-05, "loss": 1.276, "step": 10945 }, { "epoch": 1.94, "grad_norm": 4.9375, "learning_rate": 1.6683349115405733e-05, "loss": 1.2391, "step": 10950 }, { "epoch": 1.94, "grad_norm": 4.5, "learning_rate": 1.6658381848357613e-05, "loss": 1.2587, "step": 10955 }, { "epoch": 1.94, "grad_norm": 4.9375, "learning_rate": 1.663342609494246e-05, "loss": 1.2599, "step": 10960 }, { "epoch": 1.94, "grad_norm": 4.65625, "learning_rate": 1.6608481876696742e-05, "loss": 1.2571, "step": 10965 }, { "epoch": 1.94, "grad_norm": 4.875, "learning_rate": 1.6583549215147014e-05, "loss": 1.3088, "step": 10970 }, { "epoch": 1.94, "grad_norm": 3.75, "learning_rate": 1.655862813180978e-05, "loss": 1.2296, "step": 10975 }, { "epoch": 1.95, "grad_norm": 4.625, "learning_rate": 1.6533718648191607e-05, "loss": 1.2204, "step": 10980 }, { "epoch": 1.95, "grad_norm": 3.96875, "learning_rate": 1.6508820785789048e-05, "loss": 1.3175, "step": 10985 }, { "epoch": 1.95, "grad_norm": 4.25, "learning_rate": 1.6483934566088598e-05, "loss": 1.2415, "step": 10990 }, { "epoch": 1.95, "grad_norm": 4.65625, "learning_rate": 1.645906001056673e-05, "loss": 1.2501, "step": 10995 }, { "epoch": 1.95, "grad_norm": 5.1875, "learning_rate": 1.643419714068985e-05, "loss": 1.2471, "step": 11000 }, { "epoch": 1.95, "grad_norm": 4.3125, "learning_rate": 1.640934597791425e-05, "loss": 1.2796, "step": 11005 }, { "epoch": 1.95, "grad_norm": 4.875, "learning_rate": 1.638450654368615e-05, "loss": 1.2118, "step": 11010 }, { "epoch": 1.95, "grad_norm": 4.28125, "learning_rate": 1.6359678859441667e-05, "loss": 1.2012, "step": 11015 }, { "epoch": 1.95, "grad_norm": 4.25, "learning_rate": 1.6334862946606726e-05, "loss": 1.2075, "step": 11020 }, { "epoch": 1.95, "grad_norm": 4.40625, "learning_rate": 1.6310058826597115e-05, "loss": 1.2756, "step": 11025 }, { "epoch": 1.95, "grad_norm": 4.8125, "learning_rate": 1.6285266520818474e-05, "loss": 1.2687, "step": 11030 }, { "epoch": 1.96, "grad_norm": 3.921875, "learning_rate": 1.626048605066619e-05, "loss": 1.1698, "step": 11035 }, { "epoch": 1.96, "grad_norm": 5.84375, "learning_rate": 1.6235717437525485e-05, "loss": 1.2954, "step": 11040 }, { "epoch": 1.96, "grad_norm": 5.21875, "learning_rate": 1.621096070277133e-05, "loss": 1.2346, "step": 11045 }, { "epoch": 1.96, "grad_norm": 4.28125, "learning_rate": 1.6186215867768445e-05, "loss": 1.2863, "step": 11050 }, { "epoch": 1.96, "grad_norm": 4.375, "learning_rate": 1.616148295387128e-05, "loss": 1.308, "step": 11055 }, { "epoch": 1.96, "grad_norm": 4.5625, "learning_rate": 1.6136761982424006e-05, "loss": 1.2754, "step": 11060 }, { "epoch": 1.96, "grad_norm": 4.4375, "learning_rate": 1.611205297476049e-05, "loss": 1.2071, "step": 11065 }, { "epoch": 1.96, "grad_norm": 4.53125, "learning_rate": 1.6087355952204242e-05, "loss": 1.2192, "step": 11070 }, { "epoch": 1.96, "grad_norm": 4.53125, "learning_rate": 1.6062670936068466e-05, "loss": 1.2669, "step": 11075 }, { "epoch": 1.96, "grad_norm": 4.6875, "learning_rate": 1.6037997947656e-05, "loss": 1.2244, "step": 11080 }, { "epoch": 1.96, "grad_norm": 4.875, "learning_rate": 1.6013337008259272e-05, "loss": 1.2916, "step": 11085 }, { "epoch": 1.97, "grad_norm": 4.65625, "learning_rate": 1.5988688139160333e-05, "loss": 1.2581, "step": 11090 }, { "epoch": 1.97, "grad_norm": 4.875, "learning_rate": 1.5964051361630855e-05, "loss": 1.279, "step": 11095 }, { "epoch": 1.97, "grad_norm": 4.4375, "learning_rate": 1.5939426696931997e-05, "loss": 1.2235, "step": 11100 }, { "epoch": 1.97, "grad_norm": 4.28125, "learning_rate": 1.591481416631453e-05, "loss": 1.2298, "step": 11105 }, { "epoch": 1.97, "grad_norm": 5.3125, "learning_rate": 1.589021379101873e-05, "loss": 1.2104, "step": 11110 }, { "epoch": 1.97, "grad_norm": 5.28125, "learning_rate": 1.5865625592274362e-05, "loss": 1.2318, "step": 11115 }, { "epoch": 1.97, "grad_norm": 3.75, "learning_rate": 1.5841049591300717e-05, "loss": 1.1771, "step": 11120 }, { "epoch": 1.97, "grad_norm": 4.4375, "learning_rate": 1.5816485809306543e-05, "loss": 1.224, "step": 11125 }, { "epoch": 1.97, "grad_norm": 4.40625, "learning_rate": 1.579193426749005e-05, "loss": 1.1915, "step": 11130 }, { "epoch": 1.97, "grad_norm": 4.09375, "learning_rate": 1.576739498703888e-05, "loss": 1.2214, "step": 11135 }, { "epoch": 1.97, "grad_norm": 3.984375, "learning_rate": 1.57428679891301e-05, "loss": 1.2232, "step": 11140 }, { "epoch": 1.97, "grad_norm": 4.34375, "learning_rate": 1.5718353294930157e-05, "loss": 1.2187, "step": 11145 }, { "epoch": 1.98, "grad_norm": 4.34375, "learning_rate": 1.56938509255949e-05, "loss": 1.2003, "step": 11150 }, { "epoch": 1.98, "grad_norm": 5.03125, "learning_rate": 1.566936090226954e-05, "loss": 1.227, "step": 11155 }, { "epoch": 1.98, "grad_norm": 4.78125, "learning_rate": 1.5644883246088645e-05, "loss": 1.254, "step": 11160 }, { "epoch": 1.98, "grad_norm": 4.71875, "learning_rate": 1.5620417978176067e-05, "loss": 1.2335, "step": 11165 }, { "epoch": 1.98, "grad_norm": 4.3125, "learning_rate": 1.559596511964501e-05, "loss": 1.234, "step": 11170 }, { "epoch": 1.98, "grad_norm": 5.0625, "learning_rate": 1.557152469159795e-05, "loss": 1.289, "step": 11175 }, { "epoch": 1.98, "grad_norm": 5.09375, "learning_rate": 1.554709671512665e-05, "loss": 1.2494, "step": 11180 }, { "epoch": 1.98, "grad_norm": 4.28125, "learning_rate": 1.552268121131211e-05, "loss": 1.2839, "step": 11185 }, { "epoch": 1.98, "grad_norm": 4.40625, "learning_rate": 1.5498278201224583e-05, "loss": 1.2122, "step": 11190 }, { "epoch": 1.98, "grad_norm": 4.03125, "learning_rate": 1.5473887705923518e-05, "loss": 1.2544, "step": 11195 }, { "epoch": 1.98, "grad_norm": 5.125, "learning_rate": 1.5449509746457583e-05, "loss": 1.2481, "step": 11200 }, { "epoch": 1.99, "grad_norm": 3.890625, "learning_rate": 1.5425144343864632e-05, "loss": 1.2549, "step": 11205 }, { "epoch": 1.99, "grad_norm": 4.28125, "learning_rate": 1.540079151917165e-05, "loss": 1.2285, "step": 11210 }, { "epoch": 1.99, "grad_norm": 4.125, "learning_rate": 1.5376451293394795e-05, "loss": 1.1655, "step": 11215 }, { "epoch": 1.99, "grad_norm": 5.03125, "learning_rate": 1.535212368753937e-05, "loss": 1.2322, "step": 11220 }, { "epoch": 1.99, "grad_norm": 4.34375, "learning_rate": 1.532780872259974e-05, "loss": 1.2208, "step": 11225 }, { "epoch": 1.99, "grad_norm": 5.15625, "learning_rate": 1.5303506419559393e-05, "loss": 1.136, "step": 11230 }, { "epoch": 1.99, "grad_norm": 4.28125, "learning_rate": 1.5279216799390896e-05, "loss": 1.2763, "step": 11235 }, { "epoch": 1.99, "grad_norm": 4.84375, "learning_rate": 1.5254939883055829e-05, "loss": 1.2371, "step": 11240 }, { "epoch": 1.99, "grad_norm": 4.5625, "learning_rate": 1.5230675691504857e-05, "loss": 1.2051, "step": 11245 }, { "epoch": 1.99, "grad_norm": 4.0625, "learning_rate": 1.5206424245677632e-05, "loss": 1.2487, "step": 11250 }, { "epoch": 1.99, "grad_norm": 4.53125, "learning_rate": 1.518218556650283e-05, "loss": 1.1531, "step": 11255 }, { "epoch": 2.0, "grad_norm": 4.71875, "learning_rate": 1.5157959674898093e-05, "loss": 1.2918, "step": 11260 }, { "epoch": 2.0, "grad_norm": 4.5625, "learning_rate": 1.5133746591770042e-05, "loss": 1.1509, "step": 11265 }, { "epoch": 2.0, "grad_norm": 5.21875, "learning_rate": 1.510954633801421e-05, "loss": 1.2398, "step": 11270 }, { "epoch": 2.0, "grad_norm": 5.15625, "learning_rate": 1.5085358934515104e-05, "loss": 1.2054, "step": 11275 }, { "epoch": 2.0, "grad_norm": 4.59375, "learning_rate": 1.5061184402146114e-05, "loss": 1.1741, "step": 11280 }, { "epoch": 2.0, "grad_norm": 4.96875, "learning_rate": 1.5037022761769532e-05, "loss": 1.2437, "step": 11285 }, { "epoch": 2.0, "grad_norm": 3.9375, "learning_rate": 1.5012874034236515e-05, "loss": 1.2112, "step": 11290 }, { "epoch": 2.0, "grad_norm": 4.75, "learning_rate": 1.4988738240387083e-05, "loss": 1.2235, "step": 11295 }, { "epoch": 2.0, "grad_norm": 4.53125, "learning_rate": 1.49646154010501e-05, "loss": 1.215, "step": 11300 }, { "epoch": 2.0, "grad_norm": 5.15625, "learning_rate": 1.4940505537043241e-05, "loss": 1.1498, "step": 11305 }, { "epoch": 2.0, "grad_norm": 4.78125, "learning_rate": 1.4916408669172988e-05, "loss": 1.2098, "step": 11310 }, { "epoch": 2.01, "grad_norm": 4.71875, "learning_rate": 1.4892324818234619e-05, "loss": 1.1875, "step": 11315 }, { "epoch": 2.01, "grad_norm": 4.625, "learning_rate": 1.4868254005012142e-05, "loss": 1.18, "step": 11320 }, { "epoch": 2.01, "grad_norm": 4.75, "learning_rate": 1.484419625027835e-05, "loss": 1.1851, "step": 11325 }, { "epoch": 2.01, "grad_norm": 5.375, "learning_rate": 1.4820151574794762e-05, "loss": 1.2454, "step": 11330 }, { "epoch": 2.01, "grad_norm": 4.8125, "learning_rate": 1.4796119999311588e-05, "loss": 1.1825, "step": 11335 }, { "epoch": 2.01, "grad_norm": 3.96875, "learning_rate": 1.4772101544567736e-05, "loss": 1.1918, "step": 11340 }, { "epoch": 2.01, "grad_norm": 3.921875, "learning_rate": 1.4748096231290848e-05, "loss": 1.1976, "step": 11345 }, { "epoch": 2.01, "grad_norm": 4.4375, "learning_rate": 1.4724104080197138e-05, "loss": 1.2413, "step": 11350 }, { "epoch": 2.01, "grad_norm": 4.28125, "learning_rate": 1.4700125111991513e-05, "loss": 1.183, "step": 11355 }, { "epoch": 2.01, "grad_norm": 5.15625, "learning_rate": 1.4676159347367507e-05, "loss": 1.2035, "step": 11360 }, { "epoch": 2.01, "grad_norm": 5.09375, "learning_rate": 1.4652206807007225e-05, "loss": 1.2674, "step": 11365 }, { "epoch": 2.01, "grad_norm": 4.75, "learning_rate": 1.4628267511581385e-05, "loss": 1.2273, "step": 11370 }, { "epoch": 2.02, "grad_norm": 3.796875, "learning_rate": 1.4604341481749274e-05, "loss": 1.2421, "step": 11375 }, { "epoch": 2.02, "grad_norm": 4.84375, "learning_rate": 1.4580428738158726e-05, "loss": 1.2685, "step": 11380 }, { "epoch": 2.02, "grad_norm": 4.78125, "learning_rate": 1.4556529301446108e-05, "loss": 1.223, "step": 11385 }, { "epoch": 2.02, "grad_norm": 5.3125, "learning_rate": 1.4532643192236318e-05, "loss": 1.1947, "step": 11390 }, { "epoch": 2.02, "grad_norm": 4.96875, "learning_rate": 1.4508770431142716e-05, "loss": 1.2627, "step": 11395 }, { "epoch": 2.02, "grad_norm": 4.625, "learning_rate": 1.4484911038767183e-05, "loss": 1.231, "step": 11400 }, { "epoch": 2.02, "grad_norm": 4.625, "learning_rate": 1.4461065035700039e-05, "loss": 1.1734, "step": 11405 }, { "epoch": 2.02, "grad_norm": 4.0, "learning_rate": 1.4437232442520072e-05, "loss": 1.163, "step": 11410 }, { "epoch": 2.02, "grad_norm": 4.40625, "learning_rate": 1.4413413279794458e-05, "loss": 1.2159, "step": 11415 }, { "epoch": 2.02, "grad_norm": 4.375, "learning_rate": 1.4389607568078809e-05, "loss": 1.2378, "step": 11420 }, { "epoch": 2.02, "grad_norm": 4.1875, "learning_rate": 1.4365815327917157e-05, "loss": 1.1857, "step": 11425 }, { "epoch": 2.03, "grad_norm": 4.4375, "learning_rate": 1.4342036579841848e-05, "loss": 1.176, "step": 11430 }, { "epoch": 2.03, "grad_norm": 4.53125, "learning_rate": 1.4318271344373625e-05, "loss": 1.2452, "step": 11435 }, { "epoch": 2.03, "grad_norm": 5.5625, "learning_rate": 1.429451964202157e-05, "loss": 1.2421, "step": 11440 }, { "epoch": 2.03, "grad_norm": 5.03125, "learning_rate": 1.4270781493283054e-05, "loss": 1.247, "step": 11445 }, { "epoch": 2.03, "grad_norm": 4.25, "learning_rate": 1.4247056918643782e-05, "loss": 1.2056, "step": 11450 }, { "epoch": 2.03, "grad_norm": 4.15625, "learning_rate": 1.4223345938577747e-05, "loss": 1.2407, "step": 11455 }, { "epoch": 2.03, "grad_norm": 4.625, "learning_rate": 1.4199648573547167e-05, "loss": 1.3027, "step": 11460 }, { "epoch": 2.03, "grad_norm": 3.84375, "learning_rate": 1.4175964844002574e-05, "loss": 1.1677, "step": 11465 }, { "epoch": 2.03, "grad_norm": 4.84375, "learning_rate": 1.4152294770382703e-05, "loss": 1.2503, "step": 11470 }, { "epoch": 2.03, "grad_norm": 4.34375, "learning_rate": 1.4128638373114476e-05, "loss": 1.1914, "step": 11475 }, { "epoch": 2.03, "grad_norm": 4.21875, "learning_rate": 1.4104995672613054e-05, "loss": 1.2181, "step": 11480 }, { "epoch": 2.04, "grad_norm": 4.65625, "learning_rate": 1.408136668928176e-05, "loss": 1.2696, "step": 11485 }, { "epoch": 2.04, "grad_norm": 4.5, "learning_rate": 1.4057751443512093e-05, "loss": 1.1018, "step": 11490 }, { "epoch": 2.04, "grad_norm": 4.875, "learning_rate": 1.403414995568366e-05, "loss": 1.2709, "step": 11495 }, { "epoch": 2.04, "grad_norm": 4.75, "learning_rate": 1.401056224616424e-05, "loss": 1.2611, "step": 11500 }, { "epoch": 2.04, "grad_norm": 4.1875, "learning_rate": 1.3986988335309692e-05, "loss": 1.3003, "step": 11505 }, { "epoch": 2.04, "grad_norm": 4.6875, "learning_rate": 1.3963428243463982e-05, "loss": 1.1981, "step": 11510 }, { "epoch": 2.04, "grad_norm": 5.90625, "learning_rate": 1.3939881990959147e-05, "loss": 1.192, "step": 11515 }, { "epoch": 2.04, "grad_norm": 4.4375, "learning_rate": 1.3916349598115287e-05, "loss": 1.2897, "step": 11520 }, { "epoch": 2.04, "grad_norm": 4.28125, "learning_rate": 1.389283108524051e-05, "loss": 1.2152, "step": 11525 }, { "epoch": 2.04, "grad_norm": 5.0, "learning_rate": 1.3869326472630983e-05, "loss": 1.2433, "step": 11530 }, { "epoch": 2.04, "grad_norm": 4.15625, "learning_rate": 1.3845835780570876e-05, "loss": 1.2339, "step": 11535 }, { "epoch": 2.04, "grad_norm": 4.34375, "learning_rate": 1.3822359029332308e-05, "loss": 1.2686, "step": 11540 }, { "epoch": 2.05, "grad_norm": 5.21875, "learning_rate": 1.379889623917539e-05, "loss": 1.2561, "step": 11545 }, { "epoch": 2.05, "grad_norm": 4.78125, "learning_rate": 1.3775447430348227e-05, "loss": 1.2147, "step": 11550 }, { "epoch": 2.05, "grad_norm": 4.625, "learning_rate": 1.375201262308678e-05, "loss": 1.2291, "step": 11555 }, { "epoch": 2.05, "grad_norm": 4.625, "learning_rate": 1.372859183761498e-05, "loss": 1.203, "step": 11560 }, { "epoch": 2.05, "grad_norm": 5.21875, "learning_rate": 1.370518509414465e-05, "loss": 1.2442, "step": 11565 }, { "epoch": 2.05, "grad_norm": 5.40625, "learning_rate": 1.368179241287547e-05, "loss": 1.1387, "step": 11570 }, { "epoch": 2.05, "grad_norm": 4.34375, "learning_rate": 1.3658413813995008e-05, "loss": 1.2217, "step": 11575 }, { "epoch": 2.05, "grad_norm": 3.8125, "learning_rate": 1.3635049317678682e-05, "loss": 1.2324, "step": 11580 }, { "epoch": 2.05, "grad_norm": 4.90625, "learning_rate": 1.3611698944089723e-05, "loss": 1.2554, "step": 11585 }, { "epoch": 2.05, "grad_norm": 4.125, "learning_rate": 1.3588362713379181e-05, "loss": 1.2289, "step": 11590 }, { "epoch": 2.05, "grad_norm": 4.78125, "learning_rate": 1.3565040645685918e-05, "loss": 1.1574, "step": 11595 }, { "epoch": 2.06, "grad_norm": 4.96875, "learning_rate": 1.3541732761136536e-05, "loss": 1.2241, "step": 11600 }, { "epoch": 2.06, "grad_norm": 5.9375, "learning_rate": 1.351843907984543e-05, "loss": 1.3095, "step": 11605 }, { "epoch": 2.06, "grad_norm": 4.6875, "learning_rate": 1.3495159621914723e-05, "loss": 1.2143, "step": 11610 }, { "epoch": 2.06, "grad_norm": 4.59375, "learning_rate": 1.3471894407434278e-05, "loss": 1.2177, "step": 11615 }, { "epoch": 2.06, "grad_norm": 4.875, "learning_rate": 1.3448643456481635e-05, "loss": 1.1781, "step": 11620 }, { "epoch": 2.06, "grad_norm": 5.0625, "learning_rate": 1.3425406789122058e-05, "loss": 1.1749, "step": 11625 }, { "epoch": 2.06, "grad_norm": 5.65625, "learning_rate": 1.3402184425408467e-05, "loss": 1.2989, "step": 11630 }, { "epoch": 2.06, "grad_norm": 4.9375, "learning_rate": 1.3378976385381449e-05, "loss": 1.2312, "step": 11635 }, { "epoch": 2.06, "grad_norm": 5.09375, "learning_rate": 1.3355782689069215e-05, "loss": 1.2329, "step": 11640 }, { "epoch": 2.06, "grad_norm": 4.59375, "learning_rate": 1.3332603356487622e-05, "loss": 1.2531, "step": 11645 }, { "epoch": 2.06, "grad_norm": 4.28125, "learning_rate": 1.3309438407640094e-05, "loss": 1.2564, "step": 11650 }, { "epoch": 2.07, "grad_norm": 4.78125, "learning_rate": 1.3286287862517671e-05, "loss": 1.2358, "step": 11655 }, { "epoch": 2.07, "grad_norm": 4.375, "learning_rate": 1.3263151741098968e-05, "loss": 1.1978, "step": 11660 }, { "epoch": 2.07, "grad_norm": 4.75, "learning_rate": 1.3240030063350117e-05, "loss": 1.2038, "step": 11665 }, { "epoch": 2.07, "grad_norm": 4.3125, "learning_rate": 1.3216922849224808e-05, "loss": 1.2005, "step": 11670 }, { "epoch": 2.07, "grad_norm": 4.59375, "learning_rate": 1.319383011866428e-05, "loss": 1.3006, "step": 11675 }, { "epoch": 2.07, "grad_norm": 3.90625, "learning_rate": 1.3170751891597216e-05, "loss": 1.211, "step": 11680 }, { "epoch": 2.07, "grad_norm": 4.875, "learning_rate": 1.314768818793981e-05, "loss": 1.1973, "step": 11685 }, { "epoch": 2.07, "grad_norm": 5.0625, "learning_rate": 1.3124639027595737e-05, "loss": 1.2116, "step": 11690 }, { "epoch": 2.07, "grad_norm": 4.1875, "learning_rate": 1.3101604430456085e-05, "loss": 1.1875, "step": 11695 }, { "epoch": 2.07, "grad_norm": 4.125, "learning_rate": 1.3078584416399403e-05, "loss": 1.2534, "step": 11700 }, { "epoch": 2.07, "grad_norm": 4.25, "learning_rate": 1.3055579005291644e-05, "loss": 1.2312, "step": 11705 }, { "epoch": 2.08, "grad_norm": 4.75, "learning_rate": 1.3032588216986167e-05, "loss": 1.1792, "step": 11710 }, { "epoch": 2.08, "grad_norm": 4.15625, "learning_rate": 1.3009612071323704e-05, "loss": 1.1538, "step": 11715 }, { "epoch": 2.08, "grad_norm": 4.71875, "learning_rate": 1.2986650588132362e-05, "loss": 1.2282, "step": 11720 }, { "epoch": 2.08, "grad_norm": 4.15625, "learning_rate": 1.2963703787227565e-05, "loss": 1.2437, "step": 11725 }, { "epoch": 2.08, "grad_norm": 4.5625, "learning_rate": 1.2940771688412095e-05, "loss": 1.2169, "step": 11730 }, { "epoch": 2.08, "grad_norm": 4.75, "learning_rate": 1.2917854311476042e-05, "loss": 1.2458, "step": 11735 }, { "epoch": 2.08, "grad_norm": 4.53125, "learning_rate": 1.2894951676196797e-05, "loss": 1.1886, "step": 11740 }, { "epoch": 2.08, "grad_norm": 4.5625, "learning_rate": 1.2872063802338993e-05, "loss": 1.2291, "step": 11745 }, { "epoch": 2.08, "grad_norm": 4.3125, "learning_rate": 1.2849190709654569e-05, "loss": 1.0974, "step": 11750 }, { "epoch": 2.08, "grad_norm": 4.6875, "learning_rate": 1.282633241788268e-05, "loss": 1.2064, "step": 11755 }, { "epoch": 2.08, "grad_norm": 4.5625, "learning_rate": 1.2803488946749727e-05, "loss": 1.2217, "step": 11760 }, { "epoch": 2.08, "grad_norm": 4.1875, "learning_rate": 1.2780660315969305e-05, "loss": 1.1908, "step": 11765 }, { "epoch": 2.09, "grad_norm": 4.9375, "learning_rate": 1.275784654524222e-05, "loss": 1.1512, "step": 11770 }, { "epoch": 2.09, "grad_norm": 4.09375, "learning_rate": 1.2735047654256423e-05, "loss": 1.2046, "step": 11775 }, { "epoch": 2.09, "grad_norm": 4.59375, "learning_rate": 1.2712263662687052e-05, "loss": 1.2452, "step": 11780 }, { "epoch": 2.09, "grad_norm": 4.78125, "learning_rate": 1.2689494590196393e-05, "loss": 1.232, "step": 11785 }, { "epoch": 2.09, "grad_norm": 4.1875, "learning_rate": 1.2666740456433805e-05, "loss": 1.1803, "step": 11790 }, { "epoch": 2.09, "grad_norm": 4.0625, "learning_rate": 1.2644001281035827e-05, "loss": 1.254, "step": 11795 }, { "epoch": 2.09, "grad_norm": 4.90625, "learning_rate": 1.2621277083626053e-05, "loss": 1.2314, "step": 11800 }, { "epoch": 2.09, "grad_norm": 3.921875, "learning_rate": 1.2598567883815138e-05, "loss": 1.2098, "step": 11805 }, { "epoch": 2.09, "grad_norm": 4.375, "learning_rate": 1.2575873701200808e-05, "loss": 1.2614, "step": 11810 }, { "epoch": 2.09, "grad_norm": 4.375, "learning_rate": 1.2553194555367852e-05, "loss": 1.2503, "step": 11815 }, { "epoch": 2.09, "grad_norm": 4.375, "learning_rate": 1.2530530465888038e-05, "loss": 1.1943, "step": 11820 }, { "epoch": 2.1, "grad_norm": 4.4375, "learning_rate": 1.2507881452320168e-05, "loss": 1.2694, "step": 11825 }, { "epoch": 2.1, "grad_norm": 4.8125, "learning_rate": 1.2485247534210033e-05, "loss": 1.235, "step": 11830 }, { "epoch": 2.1, "grad_norm": 4.34375, "learning_rate": 1.2462628731090398e-05, "loss": 1.1934, "step": 11835 }, { "epoch": 2.1, "grad_norm": 4.875, "learning_rate": 1.244002506248097e-05, "loss": 1.2025, "step": 11840 }, { "epoch": 2.1, "grad_norm": 4.59375, "learning_rate": 1.2417436547888406e-05, "loss": 1.2334, "step": 11845 }, { "epoch": 2.1, "grad_norm": 4.9375, "learning_rate": 1.2394863206806299e-05, "loss": 1.2341, "step": 11850 }, { "epoch": 2.1, "grad_norm": 4.46875, "learning_rate": 1.2372305058715108e-05, "loss": 1.1738, "step": 11855 }, { "epoch": 2.1, "grad_norm": 4.96875, "learning_rate": 1.2349762123082216e-05, "loss": 1.1949, "step": 11860 }, { "epoch": 2.1, "grad_norm": 4.40625, "learning_rate": 1.2327234419361873e-05, "loss": 1.2212, "step": 11865 }, { "epoch": 2.1, "grad_norm": 4.25, "learning_rate": 1.2304721966995162e-05, "loss": 1.2751, "step": 11870 }, { "epoch": 2.1, "grad_norm": 4.6875, "learning_rate": 1.228222478541002e-05, "loss": 1.2377, "step": 11875 }, { "epoch": 2.11, "grad_norm": 5.21875, "learning_rate": 1.2259742894021228e-05, "loss": 1.242, "step": 11880 }, { "epoch": 2.11, "grad_norm": 4.34375, "learning_rate": 1.2237276312230327e-05, "loss": 1.1998, "step": 11885 }, { "epoch": 2.11, "grad_norm": 4.59375, "learning_rate": 1.2214825059425674e-05, "loss": 1.1997, "step": 11890 }, { "epoch": 2.11, "grad_norm": 4.46875, "learning_rate": 1.2192389154982405e-05, "loss": 1.2709, "step": 11895 }, { "epoch": 2.11, "grad_norm": 4.25, "learning_rate": 1.2169968618262373e-05, "loss": 1.2318, "step": 11900 }, { "epoch": 2.11, "grad_norm": 4.25, "learning_rate": 1.2147563468614206e-05, "loss": 1.1543, "step": 11905 }, { "epoch": 2.11, "grad_norm": 3.96875, "learning_rate": 1.2125173725373247e-05, "loss": 1.22, "step": 11910 }, { "epoch": 2.11, "grad_norm": 4.15625, "learning_rate": 1.2102799407861507e-05, "loss": 1.2233, "step": 11915 }, { "epoch": 2.11, "grad_norm": 4.5, "learning_rate": 1.2080440535387745e-05, "loss": 1.2098, "step": 11920 }, { "epoch": 2.11, "grad_norm": 4.34375, "learning_rate": 1.2058097127247363e-05, "loss": 1.2235, "step": 11925 }, { "epoch": 2.11, "grad_norm": 4.0625, "learning_rate": 1.203576920272239e-05, "loss": 1.171, "step": 11930 }, { "epoch": 2.11, "grad_norm": 4.65625, "learning_rate": 1.2013456781081533e-05, "loss": 1.3391, "step": 11935 }, { "epoch": 2.12, "grad_norm": 4.46875, "learning_rate": 1.19911598815801e-05, "loss": 1.2118, "step": 11940 }, { "epoch": 2.12, "grad_norm": 4.6875, "learning_rate": 1.1968878523460024e-05, "loss": 1.2064, "step": 11945 }, { "epoch": 2.12, "grad_norm": 3.859375, "learning_rate": 1.1946612725949785e-05, "loss": 1.1682, "step": 11950 }, { "epoch": 2.12, "grad_norm": 4.4375, "learning_rate": 1.1924362508264476e-05, "loss": 1.2179, "step": 11955 }, { "epoch": 2.12, "grad_norm": 4.125, "learning_rate": 1.190212788960572e-05, "loss": 1.2331, "step": 11960 }, { "epoch": 2.12, "grad_norm": 4.46875, "learning_rate": 1.1879908889161698e-05, "loss": 1.2432, "step": 11965 }, { "epoch": 2.12, "grad_norm": 4.75, "learning_rate": 1.1857705526107094e-05, "loss": 1.2453, "step": 11970 }, { "epoch": 2.12, "grad_norm": 4.03125, "learning_rate": 1.1835517819603119e-05, "loss": 1.1749, "step": 11975 }, { "epoch": 2.12, "grad_norm": 5.375, "learning_rate": 1.1813345788797434e-05, "loss": 1.2463, "step": 11980 }, { "epoch": 2.12, "grad_norm": 4.0625, "learning_rate": 1.1791189452824213e-05, "loss": 1.2137, "step": 11985 }, { "epoch": 2.12, "grad_norm": 4.375, "learning_rate": 1.1769048830804074e-05, "loss": 1.2506, "step": 11990 }, { "epoch": 2.13, "grad_norm": 4.15625, "learning_rate": 1.174692394184404e-05, "loss": 1.2114, "step": 11995 }, { "epoch": 2.13, "grad_norm": 4.40625, "learning_rate": 1.172481480503762e-05, "loss": 1.2779, "step": 12000 }, { "epoch": 2.13, "grad_norm": 4.0625, "learning_rate": 1.170272143946469e-05, "loss": 1.3237, "step": 12005 }, { "epoch": 2.13, "grad_norm": 4.0, "learning_rate": 1.16806438641915e-05, "loss": 1.2011, "step": 12010 }, { "epoch": 2.13, "grad_norm": 4.4375, "learning_rate": 1.165858209827071e-05, "loss": 1.2506, "step": 12015 }, { "epoch": 2.13, "grad_norm": 4.875, "learning_rate": 1.1636536160741325e-05, "loss": 1.2015, "step": 12020 }, { "epoch": 2.13, "grad_norm": 3.96875, "learning_rate": 1.161450607062867e-05, "loss": 1.2242, "step": 12025 }, { "epoch": 2.13, "grad_norm": 4.875, "learning_rate": 1.1592491846944419e-05, "loss": 1.1798, "step": 12030 }, { "epoch": 2.13, "grad_norm": 4.21875, "learning_rate": 1.1570493508686539e-05, "loss": 1.1721, "step": 12035 }, { "epoch": 2.13, "grad_norm": 5.25, "learning_rate": 1.15485110748393e-05, "loss": 1.2821, "step": 12040 }, { "epoch": 2.13, "grad_norm": 5.125, "learning_rate": 1.1526544564373234e-05, "loss": 1.2259, "step": 12045 }, { "epoch": 2.14, "grad_norm": 4.0, "learning_rate": 1.1504593996245156e-05, "loss": 1.1761, "step": 12050 }, { "epoch": 2.14, "grad_norm": 4.9375, "learning_rate": 1.1482659389398079e-05, "loss": 1.2192, "step": 12055 }, { "epoch": 2.14, "grad_norm": 4.90625, "learning_rate": 1.1460740762761275e-05, "loss": 1.2518, "step": 12060 }, { "epoch": 2.14, "grad_norm": 5.40625, "learning_rate": 1.143883813525022e-05, "loss": 1.2474, "step": 12065 }, { "epoch": 2.14, "grad_norm": 4.21875, "learning_rate": 1.1416951525766588e-05, "loss": 1.1971, "step": 12070 }, { "epoch": 2.14, "grad_norm": 4.59375, "learning_rate": 1.1395080953198206e-05, "loss": 1.2473, "step": 12075 }, { "epoch": 2.14, "grad_norm": 3.90625, "learning_rate": 1.1373226436419085e-05, "loss": 1.2219, "step": 12080 }, { "epoch": 2.14, "grad_norm": 4.28125, "learning_rate": 1.1351387994289373e-05, "loss": 1.2296, "step": 12085 }, { "epoch": 2.14, "grad_norm": 5.09375, "learning_rate": 1.1329565645655343e-05, "loss": 1.2139, "step": 12090 }, { "epoch": 2.14, "grad_norm": 5.8125, "learning_rate": 1.1307759409349385e-05, "loss": 1.2684, "step": 12095 }, { "epoch": 2.14, "grad_norm": 4.53125, "learning_rate": 1.1285969304189989e-05, "loss": 1.2209, "step": 12100 }, { "epoch": 2.15, "grad_norm": 4.5, "learning_rate": 1.12641953489817e-05, "loss": 1.2325, "step": 12105 }, { "epoch": 2.15, "grad_norm": 4.3125, "learning_rate": 1.1242437562515145e-05, "loss": 1.1674, "step": 12110 }, { "epoch": 2.15, "grad_norm": 5.0, "learning_rate": 1.122069596356701e-05, "loss": 1.2634, "step": 12115 }, { "epoch": 2.15, "grad_norm": 4.4375, "learning_rate": 1.1198970570899963e-05, "loss": 1.2642, "step": 12120 }, { "epoch": 2.15, "grad_norm": 4.4375, "learning_rate": 1.1177261403262751e-05, "loss": 1.2223, "step": 12125 }, { "epoch": 2.15, "grad_norm": 4.3125, "learning_rate": 1.1155568479390083e-05, "loss": 1.2342, "step": 12130 }, { "epoch": 2.15, "grad_norm": 4.3125, "learning_rate": 1.113389181800264e-05, "loss": 1.258, "step": 12135 }, { "epoch": 2.15, "grad_norm": 4.125, "learning_rate": 1.1112231437807092e-05, "loss": 1.2185, "step": 12140 }, { "epoch": 2.15, "grad_norm": 3.984375, "learning_rate": 1.1090587357496052e-05, "loss": 1.1993, "step": 12145 }, { "epoch": 2.15, "grad_norm": 4.71875, "learning_rate": 1.106895959574805e-05, "loss": 1.2767, "step": 12150 }, { "epoch": 2.15, "grad_norm": 4.15625, "learning_rate": 1.1047348171227558e-05, "loss": 1.229, "step": 12155 }, { "epoch": 2.15, "grad_norm": 4.875, "learning_rate": 1.1025753102584933e-05, "loss": 1.15, "step": 12160 }, { "epoch": 2.16, "grad_norm": 4.5, "learning_rate": 1.1004174408456427e-05, "loss": 1.1801, "step": 12165 }, { "epoch": 2.16, "grad_norm": 4.21875, "learning_rate": 1.098261210746415e-05, "loss": 1.1574, "step": 12170 }, { "epoch": 2.16, "grad_norm": 4.8125, "learning_rate": 1.0961066218216093e-05, "loss": 1.181, "step": 12175 }, { "epoch": 2.16, "grad_norm": 4.28125, "learning_rate": 1.0939536759306027e-05, "loss": 1.1649, "step": 12180 }, { "epoch": 2.16, "grad_norm": 4.25, "learning_rate": 1.09180237493136e-05, "loss": 1.2138, "step": 12185 }, { "epoch": 2.16, "grad_norm": 4.46875, "learning_rate": 1.089652720680424e-05, "loss": 1.2191, "step": 12190 }, { "epoch": 2.16, "grad_norm": 4.9375, "learning_rate": 1.087504715032918e-05, "loss": 1.1963, "step": 12195 }, { "epoch": 2.16, "grad_norm": 4.09375, "learning_rate": 1.0853583598425386e-05, "loss": 1.2847, "step": 12200 }, { "epoch": 2.16, "grad_norm": 4.15625, "learning_rate": 1.0832136569615616e-05, "loss": 1.2158, "step": 12205 }, { "epoch": 2.16, "grad_norm": 4.3125, "learning_rate": 1.0810706082408387e-05, "loss": 1.1894, "step": 12210 }, { "epoch": 2.16, "grad_norm": 4.09375, "learning_rate": 1.0789292155297889e-05, "loss": 1.1982, "step": 12215 }, { "epoch": 2.17, "grad_norm": 4.375, "learning_rate": 1.0767894806764048e-05, "loss": 1.2524, "step": 12220 }, { "epoch": 2.17, "grad_norm": 4.28125, "learning_rate": 1.0746514055272501e-05, "loss": 1.1822, "step": 12225 }, { "epoch": 2.17, "grad_norm": 5.4375, "learning_rate": 1.0725149919274516e-05, "loss": 1.199, "step": 12230 }, { "epoch": 2.17, "grad_norm": 4.375, "learning_rate": 1.070380241720706e-05, "loss": 1.2155, "step": 12235 }, { "epoch": 2.17, "grad_norm": 4.25, "learning_rate": 1.0682471567492748e-05, "loss": 1.2446, "step": 12240 }, { "epoch": 2.17, "grad_norm": 4.6875, "learning_rate": 1.0661157388539776e-05, "loss": 1.1959, "step": 12245 }, { "epoch": 2.17, "grad_norm": 4.34375, "learning_rate": 1.0639859898742016e-05, "loss": 1.1996, "step": 12250 }, { "epoch": 2.17, "grad_norm": 4.5, "learning_rate": 1.0618579116478912e-05, "loss": 1.2542, "step": 12255 }, { "epoch": 2.17, "grad_norm": 4.09375, "learning_rate": 1.0597315060115462e-05, "loss": 1.2137, "step": 12260 }, { "epoch": 2.17, "grad_norm": 4.96875, "learning_rate": 1.057606774800227e-05, "loss": 1.3289, "step": 12265 }, { "epoch": 2.17, "grad_norm": 4.34375, "learning_rate": 1.0554837198475482e-05, "loss": 1.1656, "step": 12270 }, { "epoch": 2.18, "grad_norm": 4.28125, "learning_rate": 1.0533623429856747e-05, "loss": 1.2656, "step": 12275 }, { "epoch": 2.18, "grad_norm": 4.3125, "learning_rate": 1.0512426460453269e-05, "loss": 1.1964, "step": 12280 }, { "epoch": 2.18, "grad_norm": 4.40625, "learning_rate": 1.0491246308557744e-05, "loss": 1.2557, "step": 12285 }, { "epoch": 2.18, "grad_norm": 4.21875, "learning_rate": 1.0470082992448344e-05, "loss": 1.2265, "step": 12290 }, { "epoch": 2.18, "grad_norm": 5.15625, "learning_rate": 1.0448936530388726e-05, "loss": 1.2316, "step": 12295 }, { "epoch": 2.18, "grad_norm": 3.96875, "learning_rate": 1.0427806940627992e-05, "loss": 1.21, "step": 12300 }, { "epoch": 2.18, "grad_norm": 4.9375, "learning_rate": 1.0406694241400697e-05, "loss": 1.2146, "step": 12305 }, { "epoch": 2.18, "grad_norm": 4.75, "learning_rate": 1.0385598450926796e-05, "loss": 1.2386, "step": 12310 }, { "epoch": 2.18, "grad_norm": 4.25, "learning_rate": 1.0364519587411671e-05, "loss": 1.2426, "step": 12315 }, { "epoch": 2.18, "grad_norm": 4.21875, "learning_rate": 1.0343457669046101e-05, "loss": 1.2029, "step": 12320 }, { "epoch": 2.18, "grad_norm": 4.5, "learning_rate": 1.0322412714006203e-05, "loss": 1.207, "step": 12325 }, { "epoch": 2.18, "grad_norm": 4.375, "learning_rate": 1.0301384740453518e-05, "loss": 1.1426, "step": 12330 }, { "epoch": 2.19, "grad_norm": 4.40625, "learning_rate": 1.028037376653489e-05, "loss": 1.1714, "step": 12335 }, { "epoch": 2.19, "grad_norm": 4.28125, "learning_rate": 1.0259379810382487e-05, "loss": 1.169, "step": 12340 }, { "epoch": 2.19, "grad_norm": 5.09375, "learning_rate": 1.0238402890113813e-05, "loss": 1.3475, "step": 12345 }, { "epoch": 2.19, "grad_norm": 4.46875, "learning_rate": 1.0217443023831667e-05, "loss": 1.1872, "step": 12350 }, { "epoch": 2.19, "grad_norm": 4.96875, "learning_rate": 1.0196500229624113e-05, "loss": 1.2164, "step": 12355 }, { "epoch": 2.19, "grad_norm": 5.125, "learning_rate": 1.0175574525564498e-05, "loss": 1.2419, "step": 12360 }, { "epoch": 2.19, "grad_norm": 5.625, "learning_rate": 1.0154665929711419e-05, "loss": 1.2395, "step": 12365 }, { "epoch": 2.19, "grad_norm": 4.0625, "learning_rate": 1.013377446010871e-05, "loss": 1.1668, "step": 12370 }, { "epoch": 2.19, "grad_norm": 4.96875, "learning_rate": 1.0112900134785415e-05, "loss": 1.207, "step": 12375 }, { "epoch": 2.19, "grad_norm": 4.4375, "learning_rate": 1.0092042971755805e-05, "loss": 1.3247, "step": 12380 }, { "epoch": 2.19, "grad_norm": 4.46875, "learning_rate": 1.0071202989019303e-05, "loss": 1.2503, "step": 12385 }, { "epoch": 2.2, "grad_norm": 3.890625, "learning_rate": 1.005038020456054e-05, "loss": 1.1916, "step": 12390 }, { "epoch": 2.2, "grad_norm": 3.84375, "learning_rate": 1.0029574636349289e-05, "loss": 1.1929, "step": 12395 }, { "epoch": 2.2, "grad_norm": 4.34375, "learning_rate": 1.0008786302340479e-05, "loss": 1.2172, "step": 12400 }, { "epoch": 2.2, "grad_norm": 4.59375, "learning_rate": 9.98801522047414e-06, "loss": 1.2727, "step": 12405 }, { "epoch": 2.2, "grad_norm": 4.0625, "learning_rate": 9.967261408675439e-06, "loss": 1.1781, "step": 12410 }, { "epoch": 2.2, "grad_norm": 4.5625, "learning_rate": 9.946524884854628e-06, "loss": 1.2183, "step": 12415 }, { "epoch": 2.2, "grad_norm": 4.71875, "learning_rate": 9.925805666907042e-06, "loss": 1.2361, "step": 12420 }, { "epoch": 2.2, "grad_norm": 4.53125, "learning_rate": 9.905103772713085e-06, "loss": 1.1819, "step": 12425 }, { "epoch": 2.2, "grad_norm": 4.15625, "learning_rate": 9.884419220138212e-06, "loss": 1.2409, "step": 12430 }, { "epoch": 2.2, "grad_norm": 4.03125, "learning_rate": 9.86375202703289e-06, "loss": 1.2427, "step": 12435 }, { "epoch": 2.2, "grad_norm": 4.90625, "learning_rate": 9.84310221123263e-06, "loss": 1.2343, "step": 12440 }, { "epoch": 2.21, "grad_norm": 3.984375, "learning_rate": 9.822469790557954e-06, "loss": 1.205, "step": 12445 }, { "epoch": 2.21, "grad_norm": 4.15625, "learning_rate": 9.801854782814324e-06, "loss": 1.2038, "step": 12450 }, { "epoch": 2.21, "grad_norm": 4.0625, "learning_rate": 9.781257205792236e-06, "loss": 1.2113, "step": 12455 }, { "epoch": 2.21, "grad_norm": 5.21875, "learning_rate": 9.760677077267121e-06, "loss": 1.1965, "step": 12460 }, { "epoch": 2.21, "grad_norm": 4.96875, "learning_rate": 9.740114414999322e-06, "loss": 1.2775, "step": 12465 }, { "epoch": 2.21, "grad_norm": 4.3125, "learning_rate": 9.719569236734145e-06, "loss": 1.2827, "step": 12470 }, { "epoch": 2.21, "grad_norm": 4.46875, "learning_rate": 9.699041560201808e-06, "loss": 1.2849, "step": 12475 }, { "epoch": 2.21, "grad_norm": 4.84375, "learning_rate": 9.678531403117393e-06, "loss": 1.2862, "step": 12480 }, { "epoch": 2.21, "grad_norm": 4.3125, "learning_rate": 9.658038783180895e-06, "loss": 1.1669, "step": 12485 }, { "epoch": 2.21, "grad_norm": 4.15625, "learning_rate": 9.63756371807716e-06, "loss": 1.196, "step": 12490 }, { "epoch": 2.21, "grad_norm": 4.59375, "learning_rate": 9.617106225475886e-06, "loss": 1.1788, "step": 12495 }, { "epoch": 2.22, "grad_norm": 4.28125, "learning_rate": 9.596666323031613e-06, "loss": 1.2413, "step": 12500 }, { "epoch": 2.22, "grad_norm": 4.53125, "learning_rate": 9.576244028383702e-06, "loss": 1.2391, "step": 12505 }, { "epoch": 2.22, "grad_norm": 4.59375, "learning_rate": 9.555839359156297e-06, "loss": 1.2009, "step": 12510 }, { "epoch": 2.22, "grad_norm": 4.125, "learning_rate": 9.535452332958349e-06, "loss": 1.2493, "step": 12515 }, { "epoch": 2.22, "grad_norm": 4.5, "learning_rate": 9.515082967383589e-06, "loss": 1.2127, "step": 12520 }, { "epoch": 2.22, "grad_norm": 5.0, "learning_rate": 9.494731280010504e-06, "loss": 1.2991, "step": 12525 }, { "epoch": 2.22, "grad_norm": 4.34375, "learning_rate": 9.474397288402301e-06, "loss": 1.2015, "step": 12530 }, { "epoch": 2.22, "grad_norm": 4.1875, "learning_rate": 9.45408101010695e-06, "loss": 1.2464, "step": 12535 }, { "epoch": 2.22, "grad_norm": 4.15625, "learning_rate": 9.433782462657117e-06, "loss": 1.2177, "step": 12540 }, { "epoch": 2.22, "grad_norm": 4.03125, "learning_rate": 9.413501663570168e-06, "loss": 1.2475, "step": 12545 }, { "epoch": 2.22, "grad_norm": 4.03125, "learning_rate": 9.39323863034815e-06, "loss": 1.1492, "step": 12550 }, { "epoch": 2.22, "grad_norm": 4.5, "learning_rate": 9.3729933804778e-06, "loss": 1.2035, "step": 12555 }, { "epoch": 2.23, "grad_norm": 4.34375, "learning_rate": 9.352765931430465e-06, "loss": 1.2241, "step": 12560 }, { "epoch": 2.23, "grad_norm": 4.5625, "learning_rate": 9.332556300662167e-06, "loss": 1.3138, "step": 12565 }, { "epoch": 2.23, "grad_norm": 4.25, "learning_rate": 9.31236450561355e-06, "loss": 1.1703, "step": 12570 }, { "epoch": 2.23, "grad_norm": 5.3125, "learning_rate": 9.292190563709825e-06, "loss": 1.1405, "step": 12575 }, { "epoch": 2.23, "grad_norm": 4.28125, "learning_rate": 9.272034492360855e-06, "loss": 1.2765, "step": 12580 }, { "epoch": 2.23, "grad_norm": 4.46875, "learning_rate": 9.251896308961055e-06, "loss": 1.2769, "step": 12585 }, { "epoch": 2.23, "grad_norm": 4.25, "learning_rate": 9.231776030889379e-06, "loss": 1.2174, "step": 12590 }, { "epoch": 2.23, "grad_norm": 4.5, "learning_rate": 9.21167367550936e-06, "loss": 1.2512, "step": 12595 }, { "epoch": 2.23, "grad_norm": 4.125, "learning_rate": 9.19158926016906e-06, "loss": 1.301, "step": 12600 }, { "epoch": 2.23, "grad_norm": 5.03125, "learning_rate": 9.17152280220104e-06, "loss": 1.277, "step": 12605 }, { "epoch": 2.23, "grad_norm": 4.9375, "learning_rate": 9.151474318922383e-06, "loss": 1.1845, "step": 12610 }, { "epoch": 2.24, "grad_norm": 4.40625, "learning_rate": 9.13144382763466e-06, "loss": 1.1766, "step": 12615 }, { "epoch": 2.24, "grad_norm": 4.90625, "learning_rate": 9.111431345623902e-06, "loss": 1.1864, "step": 12620 }, { "epoch": 2.24, "grad_norm": 4.5625, "learning_rate": 9.09143689016061e-06, "loss": 1.2049, "step": 12625 }, { "epoch": 2.24, "grad_norm": 4.4375, "learning_rate": 9.071460478499725e-06, "loss": 1.1791, "step": 12630 }, { "epoch": 2.24, "grad_norm": 4.78125, "learning_rate": 9.051502127880625e-06, "loss": 1.1526, "step": 12635 }, { "epoch": 2.24, "grad_norm": 4.59375, "learning_rate": 9.031561855527065e-06, "loss": 1.2475, "step": 12640 }, { "epoch": 2.24, "grad_norm": 4.03125, "learning_rate": 9.011639678647246e-06, "loss": 1.166, "step": 12645 }, { "epoch": 2.24, "grad_norm": 4.78125, "learning_rate": 8.991735614433739e-06, "loss": 1.259, "step": 12650 }, { "epoch": 2.24, "grad_norm": 5.5625, "learning_rate": 8.971849680063446e-06, "loss": 1.2286, "step": 12655 }, { "epoch": 2.24, "grad_norm": 4.46875, "learning_rate": 8.951981892697686e-06, "loss": 1.1004, "step": 12660 }, { "epoch": 2.24, "grad_norm": 4.40625, "learning_rate": 8.932132269482087e-06, "loss": 1.2629, "step": 12665 }, { "epoch": 2.25, "grad_norm": 5.46875, "learning_rate": 8.912300827546578e-06, "loss": 1.237, "step": 12670 }, { "epoch": 2.25, "grad_norm": 4.03125, "learning_rate": 8.892487584005434e-06, "loss": 1.2382, "step": 12675 }, { "epoch": 2.25, "grad_norm": 4.40625, "learning_rate": 8.872692555957218e-06, "loss": 1.2339, "step": 12680 }, { "epoch": 2.25, "grad_norm": 4.5625, "learning_rate": 8.852915760484751e-06, "loss": 1.2276, "step": 12685 }, { "epoch": 2.25, "grad_norm": 4.6875, "learning_rate": 8.833157214655145e-06, "loss": 1.2687, "step": 12690 }, { "epoch": 2.25, "grad_norm": 4.5625, "learning_rate": 8.813416935519762e-06, "loss": 1.2516, "step": 12695 }, { "epoch": 2.25, "grad_norm": 4.6875, "learning_rate": 8.793694940114159e-06, "loss": 1.2138, "step": 12700 }, { "epoch": 2.25, "grad_norm": 3.484375, "learning_rate": 8.773991245458181e-06, "loss": 1.251, "step": 12705 }, { "epoch": 2.25, "grad_norm": 4.40625, "learning_rate": 8.754305868555845e-06, "loss": 1.2466, "step": 12710 }, { "epoch": 2.25, "grad_norm": 5.21875, "learning_rate": 8.734638826395337e-06, "loss": 1.2669, "step": 12715 }, { "epoch": 2.25, "grad_norm": 4.625, "learning_rate": 8.71499013594906e-06, "loss": 1.2774, "step": 12720 }, { "epoch": 2.25, "grad_norm": 5.65625, "learning_rate": 8.695359814173555e-06, "loss": 1.3163, "step": 12725 }, { "epoch": 2.26, "grad_norm": 4.5, "learning_rate": 8.67574787800954e-06, "loss": 1.157, "step": 12730 }, { "epoch": 2.26, "grad_norm": 4.53125, "learning_rate": 8.656154344381815e-06, "loss": 1.3017, "step": 12735 }, { "epoch": 2.26, "grad_norm": 4.1875, "learning_rate": 8.636579230199349e-06, "loss": 1.2112, "step": 12740 }, { "epoch": 2.26, "grad_norm": 4.1875, "learning_rate": 8.617022552355193e-06, "loss": 1.2632, "step": 12745 }, { "epoch": 2.26, "grad_norm": 5.3125, "learning_rate": 8.59748432772649e-06, "loss": 1.2442, "step": 12750 }, { "epoch": 2.26, "grad_norm": 4.59375, "learning_rate": 8.57796457317446e-06, "loss": 1.2142, "step": 12755 }, { "epoch": 2.26, "grad_norm": 5.21875, "learning_rate": 8.558463305544396e-06, "loss": 1.2106, "step": 12760 }, { "epoch": 2.26, "grad_norm": 4.75, "learning_rate": 8.538980541665599e-06, "loss": 1.2024, "step": 12765 }, { "epoch": 2.26, "grad_norm": 4.90625, "learning_rate": 8.519516298351436e-06, "loss": 1.2204, "step": 12770 }, { "epoch": 2.26, "grad_norm": 4.84375, "learning_rate": 8.500070592399297e-06, "loss": 1.2423, "step": 12775 }, { "epoch": 2.26, "grad_norm": 4.3125, "learning_rate": 8.480643440590524e-06, "loss": 1.1918, "step": 12780 }, { "epoch": 2.27, "grad_norm": 3.8125, "learning_rate": 8.461234859690514e-06, "loss": 1.2232, "step": 12785 }, { "epoch": 2.27, "grad_norm": 4.78125, "learning_rate": 8.4418448664486e-06, "loss": 1.2385, "step": 12790 }, { "epoch": 2.27, "grad_norm": 3.875, "learning_rate": 8.422473477598068e-06, "loss": 1.2354, "step": 12795 }, { "epoch": 2.27, "grad_norm": 4.125, "learning_rate": 8.403120709856165e-06, "loss": 1.1435, "step": 12800 }, { "epoch": 2.27, "grad_norm": 4.71875, "learning_rate": 8.383786579924075e-06, "loss": 1.2146, "step": 12805 }, { "epoch": 2.27, "grad_norm": 4.25, "learning_rate": 8.364471104486862e-06, "loss": 1.2101, "step": 12810 }, { "epoch": 2.27, "grad_norm": 3.96875, "learning_rate": 8.345174300213533e-06, "loss": 1.2591, "step": 12815 }, { "epoch": 2.27, "grad_norm": 4.8125, "learning_rate": 8.32589618375696e-06, "loss": 1.2192, "step": 12820 }, { "epoch": 2.27, "grad_norm": 4.28125, "learning_rate": 8.306636771753893e-06, "loss": 1.2827, "step": 12825 }, { "epoch": 2.27, "grad_norm": 4.5, "learning_rate": 8.287396080824941e-06, "loss": 1.1651, "step": 12830 }, { "epoch": 2.27, "grad_norm": 4.25, "learning_rate": 8.268174127574562e-06, "loss": 1.1642, "step": 12835 }, { "epoch": 2.28, "grad_norm": 4.03125, "learning_rate": 8.248970928591024e-06, "loss": 1.2455, "step": 12840 }, { "epoch": 2.28, "grad_norm": 4.75, "learning_rate": 8.229786500446427e-06, "loss": 1.1773, "step": 12845 }, { "epoch": 2.28, "grad_norm": 4.96875, "learning_rate": 8.210620859696673e-06, "loss": 1.1888, "step": 12850 }, { "epoch": 2.28, "grad_norm": 5.5625, "learning_rate": 8.191474022881452e-06, "loss": 1.2173, "step": 12855 }, { "epoch": 2.28, "grad_norm": 4.5, "learning_rate": 8.172346006524206e-06, "loss": 1.2199, "step": 12860 }, { "epoch": 2.28, "grad_norm": 4.9375, "learning_rate": 8.153236827132157e-06, "loss": 1.2301, "step": 12865 }, { "epoch": 2.28, "grad_norm": 4.4375, "learning_rate": 8.13414650119626e-06, "loss": 1.186, "step": 12870 }, { "epoch": 2.28, "grad_norm": 4.21875, "learning_rate": 8.115075045191213e-06, "loss": 1.2115, "step": 12875 }, { "epoch": 2.28, "grad_norm": 4.84375, "learning_rate": 8.096022475575415e-06, "loss": 1.3112, "step": 12880 }, { "epoch": 2.28, "grad_norm": 4.09375, "learning_rate": 8.076988808790983e-06, "loss": 1.2512, "step": 12885 }, { "epoch": 2.28, "grad_norm": 4.15625, "learning_rate": 8.057974061263686e-06, "loss": 1.2413, "step": 12890 }, { "epoch": 2.29, "grad_norm": 4.0, "learning_rate": 8.038978249403004e-06, "loss": 1.1906, "step": 12895 }, { "epoch": 2.29, "grad_norm": 4.96875, "learning_rate": 8.02000138960207e-06, "loss": 1.1623, "step": 12900 }, { "epoch": 2.29, "grad_norm": 4.625, "learning_rate": 8.001043498237623e-06, "loss": 1.1897, "step": 12905 }, { "epoch": 2.29, "grad_norm": 3.84375, "learning_rate": 7.982104591670092e-06, "loss": 1.2242, "step": 12910 }, { "epoch": 2.29, "grad_norm": 4.25, "learning_rate": 7.963184686243491e-06, "loss": 1.122, "step": 12915 }, { "epoch": 2.29, "grad_norm": 4.0625, "learning_rate": 7.94428379828542e-06, "loss": 1.2384, "step": 12920 }, { "epoch": 2.29, "grad_norm": 4.8125, "learning_rate": 7.925401944107099e-06, "loss": 1.222, "step": 12925 }, { "epoch": 2.29, "grad_norm": 4.46875, "learning_rate": 7.906539140003314e-06, "loss": 1.1871, "step": 12930 }, { "epoch": 2.29, "grad_norm": 3.890625, "learning_rate": 7.887695402252386e-06, "loss": 1.2027, "step": 12935 }, { "epoch": 2.29, "grad_norm": 4.375, "learning_rate": 7.86887074711621e-06, "loss": 1.1551, "step": 12940 }, { "epoch": 2.29, "grad_norm": 4.8125, "learning_rate": 7.850065190840212e-06, "loss": 1.1454, "step": 12945 }, { "epoch": 2.29, "grad_norm": 5.03125, "learning_rate": 7.831278749653317e-06, "loss": 1.3206, "step": 12950 }, { "epoch": 2.3, "grad_norm": 4.25, "learning_rate": 7.812511439767975e-06, "loss": 1.239, "step": 12955 }, { "epoch": 2.3, "grad_norm": 5.53125, "learning_rate": 7.79376327738012e-06, "loss": 1.2231, "step": 12960 }, { "epoch": 2.3, "grad_norm": 4.34375, "learning_rate": 7.775034278669146e-06, "loss": 1.2539, "step": 12965 }, { "epoch": 2.3, "grad_norm": 4.5625, "learning_rate": 7.756324459797925e-06, "loss": 1.1297, "step": 12970 }, { "epoch": 2.3, "grad_norm": 4.5, "learning_rate": 7.73763383691277e-06, "loss": 1.2602, "step": 12975 }, { "epoch": 2.3, "grad_norm": 4.25, "learning_rate": 7.71896242614345e-06, "loss": 1.2281, "step": 12980 }, { "epoch": 2.3, "grad_norm": 4.09375, "learning_rate": 7.700310243603103e-06, "loss": 1.2094, "step": 12985 }, { "epoch": 2.3, "grad_norm": 4.40625, "learning_rate": 7.681677305388332e-06, "loss": 1.2602, "step": 12990 }, { "epoch": 2.3, "grad_norm": 4.46875, "learning_rate": 7.663063627579111e-06, "loss": 1.2418, "step": 12995 }, { "epoch": 2.3, "grad_norm": 4.3125, "learning_rate": 7.644469226238764e-06, "loss": 1.2019, "step": 13000 }, { "epoch": 2.3, "grad_norm": 4.53125, "learning_rate": 7.62589411741402e-06, "loss": 1.3093, "step": 13005 }, { "epoch": 2.31, "grad_norm": 4.625, "learning_rate": 7.607338317134945e-06, "loss": 1.2237, "step": 13010 }, { "epoch": 2.31, "grad_norm": 4.15625, "learning_rate": 7.588801841414926e-06, "loss": 1.2198, "step": 13015 }, { "epoch": 2.31, "grad_norm": 4.78125, "learning_rate": 7.570284706250691e-06, "loss": 1.2645, "step": 13020 }, { "epoch": 2.31, "grad_norm": 4.375, "learning_rate": 7.551786927622283e-06, "loss": 1.2441, "step": 13025 }, { "epoch": 2.31, "grad_norm": 4.6875, "learning_rate": 7.53330852149301e-06, "loss": 1.1804, "step": 13030 }, { "epoch": 2.31, "grad_norm": 4.3125, "learning_rate": 7.5148495038094985e-06, "loss": 1.2938, "step": 13035 }, { "epoch": 2.31, "grad_norm": 4.4375, "learning_rate": 7.4964098905016375e-06, "loss": 1.2402, "step": 13040 }, { "epoch": 2.31, "grad_norm": 4.3125, "learning_rate": 7.477989697482533e-06, "loss": 1.2788, "step": 13045 }, { "epoch": 2.31, "grad_norm": 4.46875, "learning_rate": 7.459588940648574e-06, "loss": 1.3012, "step": 13050 }, { "epoch": 2.31, "grad_norm": 4.40625, "learning_rate": 7.441207635879367e-06, "loss": 1.2432, "step": 13055 }, { "epoch": 2.31, "grad_norm": 4.34375, "learning_rate": 7.422845799037707e-06, "loss": 1.1939, "step": 13060 }, { "epoch": 2.32, "grad_norm": 4.375, "learning_rate": 7.4045034459696135e-06, "loss": 1.2259, "step": 13065 }, { "epoch": 2.32, "grad_norm": 4.40625, "learning_rate": 7.386180592504287e-06, "loss": 1.1989, "step": 13070 }, { "epoch": 2.32, "grad_norm": 4.28125, "learning_rate": 7.367877254454097e-06, "loss": 1.1798, "step": 13075 }, { "epoch": 2.32, "grad_norm": 4.28125, "learning_rate": 7.349593447614576e-06, "loss": 1.2313, "step": 13080 }, { "epoch": 2.32, "grad_norm": 4.15625, "learning_rate": 7.331329187764396e-06, "loss": 1.232, "step": 13085 }, { "epoch": 2.32, "grad_norm": 4.53125, "learning_rate": 7.313084490665368e-06, "loss": 1.2528, "step": 13090 }, { "epoch": 2.32, "grad_norm": 4.375, "learning_rate": 7.294859372062402e-06, "loss": 1.2367, "step": 13095 }, { "epoch": 2.32, "grad_norm": 4.1875, "learning_rate": 7.276653847683528e-06, "loss": 1.1921, "step": 13100 }, { "epoch": 2.32, "grad_norm": 4.875, "learning_rate": 7.258467933239876e-06, "loss": 1.2442, "step": 13105 }, { "epoch": 2.32, "grad_norm": 4.71875, "learning_rate": 7.240301644425613e-06, "loss": 1.1734, "step": 13110 }, { "epoch": 2.32, "grad_norm": 4.75, "learning_rate": 7.222154996918021e-06, "loss": 1.2215, "step": 13115 }, { "epoch": 2.32, "grad_norm": 4.40625, "learning_rate": 7.20402800637741e-06, "loss": 1.2926, "step": 13120 }, { "epoch": 2.33, "grad_norm": 4.0625, "learning_rate": 7.185920688447106e-06, "loss": 1.1718, "step": 13125 }, { "epoch": 2.33, "grad_norm": 5.0, "learning_rate": 7.16783305875348e-06, "loss": 1.2646, "step": 13130 }, { "epoch": 2.33, "grad_norm": 4.3125, "learning_rate": 7.1497651329059185e-06, "loss": 1.2017, "step": 13135 }, { "epoch": 2.33, "grad_norm": 4.53125, "learning_rate": 7.131716926496776e-06, "loss": 1.2526, "step": 13140 }, { "epoch": 2.33, "grad_norm": 4.40625, "learning_rate": 7.113688455101417e-06, "loss": 1.2169, "step": 13145 }, { "epoch": 2.33, "grad_norm": 5.1875, "learning_rate": 7.095679734278162e-06, "loss": 1.2157, "step": 13150 }, { "epoch": 2.33, "grad_norm": 4.40625, "learning_rate": 7.07769077956829e-06, "loss": 1.2365, "step": 13155 }, { "epoch": 2.33, "grad_norm": 5.03125, "learning_rate": 7.059721606496019e-06, "loss": 1.274, "step": 13160 }, { "epoch": 2.33, "grad_norm": 4.21875, "learning_rate": 7.041772230568512e-06, "loss": 1.2208, "step": 13165 }, { "epoch": 2.33, "grad_norm": 6.0625, "learning_rate": 7.02384266727581e-06, "loss": 1.1886, "step": 13170 }, { "epoch": 2.33, "grad_norm": 4.40625, "learning_rate": 7.005932932090895e-06, "loss": 1.1618, "step": 13175 }, { "epoch": 2.34, "grad_norm": 4.03125, "learning_rate": 6.988043040469618e-06, "loss": 1.211, "step": 13180 }, { "epoch": 2.34, "grad_norm": 4.8125, "learning_rate": 6.970173007850721e-06, "loss": 1.2251, "step": 13185 }, { "epoch": 2.34, "grad_norm": 4.125, "learning_rate": 6.952322849655777e-06, "loss": 1.2558, "step": 13190 }, { "epoch": 2.34, "grad_norm": 4.65625, "learning_rate": 6.934492581289233e-06, "loss": 1.1988, "step": 13195 }, { "epoch": 2.34, "grad_norm": 4.4375, "learning_rate": 6.9166822181383695e-06, "loss": 1.2061, "step": 13200 }, { "epoch": 2.34, "grad_norm": 4.59375, "learning_rate": 6.8988917755732825e-06, "loss": 1.2491, "step": 13205 }, { "epoch": 2.34, "grad_norm": 4.3125, "learning_rate": 6.881121268946879e-06, "loss": 1.2104, "step": 13210 }, { "epoch": 2.34, "grad_norm": 4.4375, "learning_rate": 6.863370713594876e-06, "loss": 1.2402, "step": 13215 }, { "epoch": 2.34, "grad_norm": 4.65625, "learning_rate": 6.845640124835734e-06, "loss": 1.2133, "step": 13220 }, { "epoch": 2.34, "grad_norm": 5.25, "learning_rate": 6.8279295179707215e-06, "loss": 1.2277, "step": 13225 }, { "epoch": 2.34, "grad_norm": 5.625, "learning_rate": 6.810238908283852e-06, "loss": 1.2826, "step": 13230 }, { "epoch": 2.35, "grad_norm": 4.25, "learning_rate": 6.792568311041858e-06, "loss": 1.2167, "step": 13235 }, { "epoch": 2.35, "grad_norm": 4.09375, "learning_rate": 6.774917741494243e-06, "loss": 1.2804, "step": 13240 }, { "epoch": 2.35, "grad_norm": 4.28125, "learning_rate": 6.757287214873207e-06, "loss": 1.1219, "step": 13245 }, { "epoch": 2.35, "grad_norm": 4.75, "learning_rate": 6.739676746393638e-06, "loss": 1.2941, "step": 13250 }, { "epoch": 2.35, "grad_norm": 4.40625, "learning_rate": 6.722086351253134e-06, "loss": 1.1911, "step": 13255 }, { "epoch": 2.35, "grad_norm": 4.6875, "learning_rate": 6.7045160446319705e-06, "loss": 1.2473, "step": 13260 }, { "epoch": 2.35, "grad_norm": 4.9375, "learning_rate": 6.68696584169307e-06, "loss": 1.2949, "step": 13265 }, { "epoch": 2.35, "grad_norm": 4.78125, "learning_rate": 6.669435757582017e-06, "loss": 1.2424, "step": 13270 }, { "epoch": 2.35, "grad_norm": 3.875, "learning_rate": 6.651925807427038e-06, "loss": 1.1743, "step": 13275 }, { "epoch": 2.35, "grad_norm": 4.5625, "learning_rate": 6.634436006338978e-06, "loss": 1.2165, "step": 13280 }, { "epoch": 2.35, "grad_norm": 5.125, "learning_rate": 6.616966369411294e-06, "loss": 1.276, "step": 13285 }, { "epoch": 2.36, "grad_norm": 4.65625, "learning_rate": 6.59951691172005e-06, "loss": 1.242, "step": 13290 }, { "epoch": 2.36, "grad_norm": 4.40625, "learning_rate": 6.582087648323873e-06, "loss": 1.2212, "step": 13295 }, { "epoch": 2.36, "grad_norm": 4.59375, "learning_rate": 6.564678594263987e-06, "loss": 1.1428, "step": 13300 }, { "epoch": 2.36, "grad_norm": 4.5, "learning_rate": 6.547289764564165e-06, "loss": 1.1883, "step": 13305 }, { "epoch": 2.36, "grad_norm": 4.46875, "learning_rate": 6.529921174230735e-06, "loss": 1.2039, "step": 13310 }, { "epoch": 2.36, "grad_norm": 4.25, "learning_rate": 6.5125728382525285e-06, "loss": 1.2108, "step": 13315 }, { "epoch": 2.36, "grad_norm": 4.46875, "learning_rate": 6.4952447716009554e-06, "loss": 1.1812, "step": 13320 }, { "epoch": 2.36, "grad_norm": 4.53125, "learning_rate": 6.4779369892298675e-06, "loss": 1.2652, "step": 13325 }, { "epoch": 2.36, "grad_norm": 4.46875, "learning_rate": 6.460649506075657e-06, "loss": 1.2179, "step": 13330 }, { "epoch": 2.36, "grad_norm": 4.5, "learning_rate": 6.443382337057185e-06, "loss": 1.2241, "step": 13335 }, { "epoch": 2.36, "grad_norm": 5.34375, "learning_rate": 6.4261354970757846e-06, "loss": 1.23, "step": 13340 }, { "epoch": 2.36, "grad_norm": 3.921875, "learning_rate": 6.408909001015226e-06, "loss": 1.2766, "step": 13345 }, { "epoch": 2.37, "grad_norm": 4.65625, "learning_rate": 6.391702863741751e-06, "loss": 1.2437, "step": 13350 }, { "epoch": 2.37, "grad_norm": 4.21875, "learning_rate": 6.374517100104024e-06, "loss": 1.2141, "step": 13355 }, { "epoch": 2.37, "grad_norm": 4.9375, "learning_rate": 6.357351724933095e-06, "loss": 1.3076, "step": 13360 }, { "epoch": 2.37, "grad_norm": 5.34375, "learning_rate": 6.340206753042478e-06, "loss": 1.2218, "step": 13365 }, { "epoch": 2.37, "grad_norm": 4.84375, "learning_rate": 6.323082199228044e-06, "loss": 1.1901, "step": 13370 }, { "epoch": 2.37, "grad_norm": 3.734375, "learning_rate": 6.305978078268032e-06, "loss": 1.2251, "step": 13375 }, { "epoch": 2.37, "grad_norm": 4.15625, "learning_rate": 6.288894404923072e-06, "loss": 1.2667, "step": 13380 }, { "epoch": 2.37, "grad_norm": 5.125, "learning_rate": 6.271831193936145e-06, "loss": 1.226, "step": 13385 }, { "epoch": 2.37, "grad_norm": 4.34375, "learning_rate": 6.254788460032551e-06, "loss": 1.2075, "step": 13390 }, { "epoch": 2.37, "grad_norm": 5.1875, "learning_rate": 6.237766217919948e-06, "loss": 1.1999, "step": 13395 }, { "epoch": 2.37, "grad_norm": 4.875, "learning_rate": 6.220764482288291e-06, "loss": 1.1984, "step": 13400 }, { "epoch": 2.38, "grad_norm": 4.6875, "learning_rate": 6.203783267809848e-06, "loss": 1.2679, "step": 13405 }, { "epoch": 2.38, "grad_norm": 4.375, "learning_rate": 6.186822589139173e-06, "loss": 1.2431, "step": 13410 }, { "epoch": 2.38, "grad_norm": 3.75, "learning_rate": 6.1698824609131035e-06, "loss": 1.2197, "step": 13415 }, { "epoch": 2.38, "grad_norm": 4.53125, "learning_rate": 6.152962897750726e-06, "loss": 1.2248, "step": 13420 }, { "epoch": 2.38, "grad_norm": 4.125, "learning_rate": 6.136063914253394e-06, "loss": 1.1751, "step": 13425 }, { "epoch": 2.38, "grad_norm": 4.75, "learning_rate": 6.119185525004698e-06, "loss": 1.2829, "step": 13430 }, { "epoch": 2.38, "grad_norm": 4.65625, "learning_rate": 6.102327744570467e-06, "loss": 1.2259, "step": 13435 }, { "epoch": 2.38, "grad_norm": 4.53125, "learning_rate": 6.085490587498711e-06, "loss": 1.249, "step": 13440 }, { "epoch": 2.38, "grad_norm": 5.09375, "learning_rate": 6.0686740683196825e-06, "loss": 1.298, "step": 13445 }, { "epoch": 2.38, "grad_norm": 4.3125, "learning_rate": 6.051878201545814e-06, "loss": 1.1801, "step": 13450 }, { "epoch": 2.38, "grad_norm": 4.5, "learning_rate": 6.035103001671684e-06, "loss": 1.1638, "step": 13455 }, { "epoch": 2.39, "grad_norm": 4.9375, "learning_rate": 6.018348483174074e-06, "loss": 1.2189, "step": 13460 }, { "epoch": 2.39, "grad_norm": 5.0625, "learning_rate": 6.001614660511907e-06, "loss": 1.1509, "step": 13465 }, { "epoch": 2.39, "grad_norm": 4.28125, "learning_rate": 5.984901548126223e-06, "loss": 1.2472, "step": 13470 }, { "epoch": 2.39, "grad_norm": 4.625, "learning_rate": 5.9682091604402234e-06, "loss": 1.2166, "step": 13475 }, { "epoch": 2.39, "grad_norm": 4.5625, "learning_rate": 5.951537511859206e-06, "loss": 1.22, "step": 13480 }, { "epoch": 2.39, "grad_norm": 5.0, "learning_rate": 5.934886616770574e-06, "loss": 1.1693, "step": 13485 }, { "epoch": 2.39, "grad_norm": 4.59375, "learning_rate": 5.918256489543819e-06, "loss": 1.2806, "step": 13490 }, { "epoch": 2.39, "grad_norm": 6.15625, "learning_rate": 5.901647144530523e-06, "loss": 1.2205, "step": 13495 }, { "epoch": 2.39, "grad_norm": 4.8125, "learning_rate": 5.885058596064306e-06, "loss": 1.2368, "step": 13500 }, { "epoch": 2.39, "grad_norm": 3.640625, "learning_rate": 5.868490858460861e-06, "loss": 1.1197, "step": 13505 }, { "epoch": 2.39, "grad_norm": 4.9375, "learning_rate": 5.851943946017922e-06, "loss": 1.2083, "step": 13510 }, { "epoch": 2.39, "grad_norm": 4.0, "learning_rate": 5.835417873015258e-06, "loss": 1.2314, "step": 13515 }, { "epoch": 2.4, "grad_norm": 5.3125, "learning_rate": 5.818912653714611e-06, "loss": 1.229, "step": 13520 }, { "epoch": 2.4, "grad_norm": 4.28125, "learning_rate": 5.802428302359793e-06, "loss": 1.2604, "step": 13525 }, { "epoch": 2.4, "grad_norm": 3.90625, "learning_rate": 5.785964833176552e-06, "loss": 1.1857, "step": 13530 }, { "epoch": 2.4, "grad_norm": 4.9375, "learning_rate": 5.76952226037264e-06, "loss": 1.2315, "step": 13535 }, { "epoch": 2.4, "grad_norm": 5.1875, "learning_rate": 5.753100598137772e-06, "loss": 1.1852, "step": 13540 }, { "epoch": 2.4, "grad_norm": 4.40625, "learning_rate": 5.736699860643623e-06, "loss": 1.2071, "step": 13545 }, { "epoch": 2.4, "grad_norm": 3.90625, "learning_rate": 5.720320062043791e-06, "loss": 1.2514, "step": 13550 }, { "epoch": 2.4, "grad_norm": 4.75, "learning_rate": 5.703961216473824e-06, "loss": 1.226, "step": 13555 }, { "epoch": 2.4, "grad_norm": 5.15625, "learning_rate": 5.687623338051189e-06, "loss": 1.23, "step": 13560 }, { "epoch": 2.4, "grad_norm": 4.75, "learning_rate": 5.671306440875224e-06, "loss": 1.279, "step": 13565 }, { "epoch": 2.4, "grad_norm": 4.65625, "learning_rate": 5.655010539027215e-06, "loss": 1.2161, "step": 13570 }, { "epoch": 2.41, "grad_norm": 4.0625, "learning_rate": 5.638735646570295e-06, "loss": 1.1989, "step": 13575 }, { "epoch": 2.41, "grad_norm": 4.96875, "learning_rate": 5.622481777549462e-06, "loss": 1.3063, "step": 13580 }, { "epoch": 2.41, "grad_norm": 4.21875, "learning_rate": 5.6062489459915865e-06, "loss": 1.2894, "step": 13585 }, { "epoch": 2.41, "grad_norm": 3.890625, "learning_rate": 5.590037165905391e-06, "loss": 1.2357, "step": 13590 }, { "epoch": 2.41, "grad_norm": 4.0, "learning_rate": 5.573846451281396e-06, "loss": 1.1786, "step": 13595 }, { "epoch": 2.41, "grad_norm": 4.78125, "learning_rate": 5.557676816091979e-06, "loss": 1.2385, "step": 13600 }, { "epoch": 2.41, "grad_norm": 4.34375, "learning_rate": 5.541528274291311e-06, "loss": 1.16, "step": 13605 }, { "epoch": 2.41, "grad_norm": 4.09375, "learning_rate": 5.525400839815361e-06, "loss": 1.231, "step": 13610 }, { "epoch": 2.41, "grad_norm": 4.8125, "learning_rate": 5.509294526581883e-06, "loss": 1.1719, "step": 13615 }, { "epoch": 2.41, "grad_norm": 4.09375, "learning_rate": 5.493209348490411e-06, "loss": 1.2882, "step": 13620 }, { "epoch": 2.41, "grad_norm": 4.6875, "learning_rate": 5.477145319422219e-06, "loss": 1.1774, "step": 13625 }, { "epoch": 2.42, "grad_norm": 3.890625, "learning_rate": 5.461102453240349e-06, "loss": 1.2344, "step": 13630 }, { "epoch": 2.42, "grad_norm": 5.28125, "learning_rate": 5.445080763789579e-06, "loss": 1.2527, "step": 13635 }, { "epoch": 2.42, "grad_norm": 4.28125, "learning_rate": 5.429080264896408e-06, "loss": 1.2034, "step": 13640 }, { "epoch": 2.42, "grad_norm": 4.1875, "learning_rate": 5.413100970369026e-06, "loss": 1.2204, "step": 13645 }, { "epoch": 2.42, "grad_norm": 4.625, "learning_rate": 5.397142893997373e-06, "loss": 1.2548, "step": 13650 }, { "epoch": 2.42, "grad_norm": 4.0625, "learning_rate": 5.38120604955303e-06, "loss": 1.2007, "step": 13655 }, { "epoch": 2.42, "grad_norm": 4.3125, "learning_rate": 5.365290450789279e-06, "loss": 1.1728, "step": 13660 }, { "epoch": 2.42, "grad_norm": 5.53125, "learning_rate": 5.349396111441065e-06, "loss": 1.245, "step": 13665 }, { "epoch": 2.42, "grad_norm": 4.71875, "learning_rate": 5.333523045224991e-06, "loss": 1.3296, "step": 13670 }, { "epoch": 2.42, "grad_norm": 4.21875, "learning_rate": 5.317671265839278e-06, "loss": 1.1998, "step": 13675 }, { "epoch": 2.42, "grad_norm": 4.78125, "learning_rate": 5.301840786963804e-06, "loss": 1.2105, "step": 13680 }, { "epoch": 2.43, "grad_norm": 4.5, "learning_rate": 5.286031622260055e-06, "loss": 1.1911, "step": 13685 }, { "epoch": 2.43, "grad_norm": 4.46875, "learning_rate": 5.270243785371123e-06, "loss": 1.2265, "step": 13690 }, { "epoch": 2.43, "grad_norm": 4.125, "learning_rate": 5.25447728992169e-06, "loss": 1.1662, "step": 13695 }, { "epoch": 2.43, "grad_norm": 4.0, "learning_rate": 5.238732149518032e-06, "loss": 1.2045, "step": 13700 }, { "epoch": 2.43, "grad_norm": 5.4375, "learning_rate": 5.22300837774798e-06, "loss": 1.18, "step": 13705 }, { "epoch": 2.43, "grad_norm": 4.4375, "learning_rate": 5.207305988180936e-06, "loss": 1.1698, "step": 13710 }, { "epoch": 2.43, "grad_norm": 4.25, "learning_rate": 5.1916249943678505e-06, "loss": 1.1973, "step": 13715 }, { "epoch": 2.43, "grad_norm": 4.6875, "learning_rate": 5.175965409841199e-06, "loss": 1.2249, "step": 13720 }, { "epoch": 2.43, "grad_norm": 4.53125, "learning_rate": 5.160327248114987e-06, "loss": 1.272, "step": 13725 }, { "epoch": 2.43, "grad_norm": 4.625, "learning_rate": 5.144710522684738e-06, "loss": 1.2849, "step": 13730 }, { "epoch": 2.43, "grad_norm": 4.40625, "learning_rate": 5.129115247027469e-06, "loss": 1.1437, "step": 13735 }, { "epoch": 2.43, "grad_norm": 4.6875, "learning_rate": 5.1135414346016894e-06, "loss": 1.2092, "step": 13740 }, { "epoch": 2.44, "grad_norm": 5.0625, "learning_rate": 5.097989098847391e-06, "loss": 1.2192, "step": 13745 }, { "epoch": 2.44, "grad_norm": 3.90625, "learning_rate": 5.0824582531860145e-06, "loss": 1.2569, "step": 13750 }, { "epoch": 2.44, "grad_norm": 5.34375, "learning_rate": 5.066948911020468e-06, "loss": 1.2137, "step": 13755 }, { "epoch": 2.44, "grad_norm": 5.25, "learning_rate": 5.051461085735106e-06, "loss": 1.1968, "step": 13760 }, { "epoch": 2.44, "grad_norm": 4.65625, "learning_rate": 5.03599479069571e-06, "loss": 1.1551, "step": 13765 }, { "epoch": 2.44, "grad_norm": 4.0, "learning_rate": 5.020550039249464e-06, "loss": 1.1821, "step": 13770 }, { "epoch": 2.44, "grad_norm": 4.4375, "learning_rate": 5.005126844725002e-06, "loss": 1.2117, "step": 13775 }, { "epoch": 2.44, "grad_norm": 4.875, "learning_rate": 4.989725220432311e-06, "loss": 1.219, "step": 13780 }, { "epoch": 2.44, "grad_norm": 4.625, "learning_rate": 4.974345179662782e-06, "loss": 1.168, "step": 13785 }, { "epoch": 2.44, "grad_norm": 4.84375, "learning_rate": 4.958986735689181e-06, "loss": 1.3122, "step": 13790 }, { "epoch": 2.44, "grad_norm": 5.1875, "learning_rate": 4.9436499017656424e-06, "loss": 1.2454, "step": 13795 }, { "epoch": 2.45, "grad_norm": 4.5, "learning_rate": 4.928334691127626e-06, "loss": 1.1958, "step": 13800 }, { "epoch": 2.45, "grad_norm": 4.8125, "learning_rate": 4.91304111699196e-06, "loss": 1.1721, "step": 13805 }, { "epoch": 2.45, "grad_norm": 4.21875, "learning_rate": 4.897769192556783e-06, "loss": 1.1879, "step": 13810 }, { "epoch": 2.45, "grad_norm": 4.09375, "learning_rate": 4.882518931001558e-06, "loss": 1.238, "step": 13815 }, { "epoch": 2.45, "grad_norm": 4.40625, "learning_rate": 4.867290345487053e-06, "loss": 1.1882, "step": 13820 }, { "epoch": 2.45, "grad_norm": 4.8125, "learning_rate": 4.8520834491553315e-06, "loss": 1.2289, "step": 13825 }, { "epoch": 2.45, "grad_norm": 4.34375, "learning_rate": 4.836898255129725e-06, "loss": 1.2288, "step": 13830 }, { "epoch": 2.45, "grad_norm": 4.96875, "learning_rate": 4.821734776514851e-06, "loss": 1.2269, "step": 13835 }, { "epoch": 2.45, "grad_norm": 4.9375, "learning_rate": 4.806593026396593e-06, "loss": 1.2926, "step": 13840 }, { "epoch": 2.45, "grad_norm": 4.34375, "learning_rate": 4.791473017842057e-06, "loss": 1.2239, "step": 13845 }, { "epoch": 2.45, "grad_norm": 4.6875, "learning_rate": 4.776374763899604e-06, "loss": 1.2458, "step": 13850 }, { "epoch": 2.46, "grad_norm": 5.53125, "learning_rate": 4.761298277598839e-06, "loss": 1.22, "step": 13855 }, { "epoch": 2.46, "grad_norm": 4.65625, "learning_rate": 4.746243571950542e-06, "loss": 1.2482, "step": 13860 }, { "epoch": 2.46, "grad_norm": 5.1875, "learning_rate": 4.731210659946726e-06, "loss": 1.1763, "step": 13865 }, { "epoch": 2.46, "grad_norm": 4.25, "learning_rate": 4.71619955456058e-06, "loss": 1.2507, "step": 13870 }, { "epoch": 2.46, "grad_norm": 4.21875, "learning_rate": 4.7012102687464965e-06, "loss": 1.2358, "step": 13875 }, { "epoch": 2.46, "grad_norm": 5.125, "learning_rate": 4.686242815440008e-06, "loss": 1.2011, "step": 13880 }, { "epoch": 2.46, "grad_norm": 4.90625, "learning_rate": 4.671297207557823e-06, "loss": 1.3662, "step": 13885 }, { "epoch": 2.46, "grad_norm": 4.125, "learning_rate": 4.656373457997807e-06, "loss": 1.2752, "step": 13890 }, { "epoch": 2.46, "grad_norm": 4.34375, "learning_rate": 4.64147157963893e-06, "loss": 1.2324, "step": 13895 }, { "epoch": 2.46, "grad_norm": 4.78125, "learning_rate": 4.626591585341324e-06, "loss": 1.1464, "step": 13900 }, { "epoch": 2.46, "grad_norm": 5.0, "learning_rate": 4.611733487946226e-06, "loss": 1.2287, "step": 13905 }, { "epoch": 2.46, "grad_norm": 4.65625, "learning_rate": 4.596897300275949e-06, "loss": 1.2273, "step": 13910 }, { "epoch": 2.47, "grad_norm": 4.5625, "learning_rate": 4.5820830351339304e-06, "loss": 1.2192, "step": 13915 }, { "epoch": 2.47, "grad_norm": 4.3125, "learning_rate": 4.567290705304685e-06, "loss": 1.2464, "step": 13920 }, { "epoch": 2.47, "grad_norm": 4.53125, "learning_rate": 4.552520323553772e-06, "loss": 1.2819, "step": 13925 }, { "epoch": 2.47, "grad_norm": 4.40625, "learning_rate": 4.537771902627839e-06, "loss": 1.211, "step": 13930 }, { "epoch": 2.47, "grad_norm": 4.5, "learning_rate": 4.5230454552545656e-06, "loss": 1.1863, "step": 13935 }, { "epoch": 2.47, "grad_norm": 4.5625, "learning_rate": 4.508340994142679e-06, "loss": 1.198, "step": 13940 }, { "epoch": 2.47, "grad_norm": 4.46875, "learning_rate": 4.493658531981923e-06, "loss": 1.1726, "step": 13945 }, { "epoch": 2.47, "grad_norm": 3.8125, "learning_rate": 4.478998081443067e-06, "loss": 1.2258, "step": 13950 }, { "epoch": 2.47, "grad_norm": 5.1875, "learning_rate": 4.4643596551778674e-06, "loss": 1.2225, "step": 13955 }, { "epoch": 2.47, "grad_norm": 4.625, "learning_rate": 4.449743265819088e-06, "loss": 1.1837, "step": 13960 }, { "epoch": 2.47, "grad_norm": 4.0625, "learning_rate": 4.435148925980476e-06, "loss": 1.2069, "step": 13965 }, { "epoch": 2.48, "grad_norm": 4.65625, "learning_rate": 4.4205766482567514e-06, "loss": 1.2637, "step": 13970 }, { "epoch": 2.48, "grad_norm": 4.0, "learning_rate": 4.406026445223566e-06, "loss": 1.2356, "step": 13975 }, { "epoch": 2.48, "grad_norm": 4.75, "learning_rate": 4.391498329437574e-06, "loss": 1.2047, "step": 13980 }, { "epoch": 2.48, "grad_norm": 4.3125, "learning_rate": 4.376992313436322e-06, "loss": 1.1981, "step": 13985 }, { "epoch": 2.48, "grad_norm": 4.125, "learning_rate": 4.362508409738305e-06, "loss": 1.2055, "step": 13990 }, { "epoch": 2.48, "grad_norm": 4.53125, "learning_rate": 4.348046630842935e-06, "loss": 1.203, "step": 13995 }, { "epoch": 2.48, "grad_norm": 4.34375, "learning_rate": 4.333606989230534e-06, "loss": 1.1335, "step": 14000 }, { "epoch": 2.48, "grad_norm": 4.3125, "learning_rate": 4.319189497362303e-06, "loss": 1.1772, "step": 14005 }, { "epoch": 2.48, "grad_norm": 4.65625, "learning_rate": 4.304794167680341e-06, "loss": 1.2322, "step": 14010 }, { "epoch": 2.48, "grad_norm": 4.59375, "learning_rate": 4.290421012607623e-06, "loss": 1.1897, "step": 14015 }, { "epoch": 2.48, "grad_norm": 4.90625, "learning_rate": 4.2760700445479835e-06, "loss": 1.258, "step": 14020 }, { "epoch": 2.49, "grad_norm": 4.59375, "learning_rate": 4.261741275886111e-06, "loss": 1.1733, "step": 14025 }, { "epoch": 2.49, "grad_norm": 4.0625, "learning_rate": 4.247434718987543e-06, "loss": 1.1627, "step": 14030 }, { "epoch": 2.49, "grad_norm": 4.625, "learning_rate": 4.233150386198624e-06, "loss": 1.2205, "step": 14035 }, { "epoch": 2.49, "grad_norm": 4.5625, "learning_rate": 4.218888289846551e-06, "loss": 1.2929, "step": 14040 }, { "epoch": 2.49, "grad_norm": 4.75, "learning_rate": 4.204648442239317e-06, "loss": 1.2437, "step": 14045 }, { "epoch": 2.49, "grad_norm": 4.40625, "learning_rate": 4.190430855665703e-06, "loss": 1.2375, "step": 14050 }, { "epoch": 2.49, "grad_norm": 4.0625, "learning_rate": 4.176235542395298e-06, "loss": 1.2609, "step": 14055 }, { "epoch": 2.49, "grad_norm": 4.40625, "learning_rate": 4.162062514678462e-06, "loss": 1.2145, "step": 14060 }, { "epoch": 2.49, "grad_norm": 4.84375, "learning_rate": 4.147911784746324e-06, "loss": 1.2177, "step": 14065 }, { "epoch": 2.49, "grad_norm": 4.8125, "learning_rate": 4.133783364810767e-06, "loss": 1.2332, "step": 14070 }, { "epoch": 2.49, "grad_norm": 4.3125, "learning_rate": 4.119677267064433e-06, "loss": 1.1548, "step": 14075 }, { "epoch": 2.5, "grad_norm": 4.4375, "learning_rate": 4.105593503680672e-06, "loss": 1.181, "step": 14080 }, { "epoch": 2.5, "grad_norm": 3.75, "learning_rate": 4.091532086813589e-06, "loss": 1.1565, "step": 14085 }, { "epoch": 2.5, "grad_norm": 4.125, "learning_rate": 4.077493028597995e-06, "loss": 1.2753, "step": 14090 }, { "epoch": 2.5, "grad_norm": 4.34375, "learning_rate": 4.063476341149404e-06, "loss": 1.1614, "step": 14095 }, { "epoch": 2.5, "grad_norm": 4.375, "learning_rate": 4.049482036564012e-06, "loss": 1.213, "step": 14100 }, { "epoch": 2.5, "grad_norm": 5.03125, "learning_rate": 4.035510126918732e-06, "loss": 1.2405, "step": 14105 }, { "epoch": 2.5, "grad_norm": 4.71875, "learning_rate": 4.021560624271113e-06, "loss": 1.2894, "step": 14110 }, { "epoch": 2.5, "grad_norm": 5.1875, "learning_rate": 4.0076335406593886e-06, "loss": 1.1662, "step": 14115 }, { "epoch": 2.5, "grad_norm": 4.09375, "learning_rate": 3.9937288881024446e-06, "loss": 1.245, "step": 14120 }, { "epoch": 2.5, "grad_norm": 3.875, "learning_rate": 3.979846678599809e-06, "loss": 1.2372, "step": 14125 }, { "epoch": 2.5, "grad_norm": 4.3125, "learning_rate": 3.9659869241316215e-06, "loss": 1.2155, "step": 14130 }, { "epoch": 2.5, "grad_norm": 4.71875, "learning_rate": 3.952149636658672e-06, "loss": 1.1334, "step": 14135 }, { "epoch": 2.51, "grad_norm": 4.84375, "learning_rate": 3.938334828122344e-06, "loss": 1.2097, "step": 14140 }, { "epoch": 2.51, "grad_norm": 4.9375, "learning_rate": 3.9245425104446285e-06, "loss": 1.1807, "step": 14145 }, { "epoch": 2.51, "grad_norm": 4.875, "learning_rate": 3.910772695528111e-06, "loss": 1.2905, "step": 14150 }, { "epoch": 2.51, "grad_norm": 4.59375, "learning_rate": 3.897025395255952e-06, "loss": 1.1643, "step": 14155 }, { "epoch": 2.51, "grad_norm": 5.53125, "learning_rate": 3.883300621491872e-06, "loss": 1.1922, "step": 14160 }, { "epoch": 2.51, "grad_norm": 4.65625, "learning_rate": 3.86959838608017e-06, "loss": 1.2677, "step": 14165 }, { "epoch": 2.51, "grad_norm": 4.5625, "learning_rate": 3.855918700845693e-06, "loss": 1.2631, "step": 14170 }, { "epoch": 2.51, "grad_norm": 4.28125, "learning_rate": 3.842261577593805e-06, "loss": 1.2279, "step": 14175 }, { "epoch": 2.51, "grad_norm": 4.0625, "learning_rate": 3.828627028110419e-06, "loss": 1.2304, "step": 14180 }, { "epoch": 2.51, "grad_norm": 4.4375, "learning_rate": 3.815015064161984e-06, "loss": 1.2528, "step": 14185 }, { "epoch": 2.51, "grad_norm": 4.53125, "learning_rate": 3.801425697495412e-06, "loss": 1.2509, "step": 14190 }, { "epoch": 2.52, "grad_norm": 4.46875, "learning_rate": 3.7878589398381548e-06, "loss": 1.2176, "step": 14195 }, { "epoch": 2.52, "grad_norm": 4.65625, "learning_rate": 3.7743148028981367e-06, "loss": 1.1542, "step": 14200 }, { "epoch": 2.52, "grad_norm": 5.0625, "learning_rate": 3.760793298363755e-06, "loss": 1.2211, "step": 14205 }, { "epoch": 2.52, "grad_norm": 4.78125, "learning_rate": 3.747294437903882e-06, "loss": 1.1891, "step": 14210 }, { "epoch": 2.52, "grad_norm": 5.21875, "learning_rate": 3.733818233167853e-06, "loss": 1.1773, "step": 14215 }, { "epoch": 2.52, "grad_norm": 4.25, "learning_rate": 3.720364695785451e-06, "loss": 1.2188, "step": 14220 }, { "epoch": 2.52, "grad_norm": 4.84375, "learning_rate": 3.7069338373668795e-06, "loss": 1.2067, "step": 14225 }, { "epoch": 2.52, "grad_norm": 3.953125, "learning_rate": 3.6935256695027963e-06, "loss": 1.193, "step": 14230 }, { "epoch": 2.52, "grad_norm": 4.15625, "learning_rate": 3.6801402037642727e-06, "loss": 1.247, "step": 14235 }, { "epoch": 2.52, "grad_norm": 4.6875, "learning_rate": 3.6667774517027653e-06, "loss": 1.1969, "step": 14240 }, { "epoch": 2.52, "grad_norm": 4.34375, "learning_rate": 3.6534374248501524e-06, "loss": 1.1234, "step": 14245 }, { "epoch": 2.53, "grad_norm": 6.46875, "learning_rate": 3.640120134718702e-06, "loss": 1.2498, "step": 14250 }, { "epoch": 2.53, "grad_norm": 4.25, "learning_rate": 3.6268255928010342e-06, "loss": 1.1945, "step": 14255 }, { "epoch": 2.53, "grad_norm": 5.09375, "learning_rate": 3.6135538105701683e-06, "loss": 1.2459, "step": 14260 }, { "epoch": 2.53, "grad_norm": 4.125, "learning_rate": 3.6003047994794658e-06, "loss": 1.2033, "step": 14265 }, { "epoch": 2.53, "grad_norm": 4.8125, "learning_rate": 3.587078570962644e-06, "loss": 1.1887, "step": 14270 }, { "epoch": 2.53, "grad_norm": 4.28125, "learning_rate": 3.5738751364337565e-06, "loss": 1.2342, "step": 14275 }, { "epoch": 2.53, "grad_norm": 4.71875, "learning_rate": 3.5606945072871875e-06, "loss": 1.2288, "step": 14280 }, { "epoch": 2.53, "grad_norm": 4.5625, "learning_rate": 3.5475366948976294e-06, "loss": 1.2596, "step": 14285 }, { "epoch": 2.53, "grad_norm": 5.3125, "learning_rate": 3.534401710620099e-06, "loss": 1.1797, "step": 14290 }, { "epoch": 2.53, "grad_norm": 5.25, "learning_rate": 3.5212895657899114e-06, "loss": 1.2309, "step": 14295 }, { "epoch": 2.53, "grad_norm": 4.40625, "learning_rate": 3.5082002717226592e-06, "loss": 1.297, "step": 14300 }, { "epoch": 2.53, "grad_norm": 4.5, "learning_rate": 3.4951338397142197e-06, "loss": 1.2751, "step": 14305 }, { "epoch": 2.54, "grad_norm": 4.875, "learning_rate": 3.4820902810407663e-06, "loss": 1.2834, "step": 14310 }, { "epoch": 2.54, "grad_norm": 4.375, "learning_rate": 3.469069606958688e-06, "loss": 1.2813, "step": 14315 }, { "epoch": 2.54, "grad_norm": 4.0625, "learning_rate": 3.456071828704653e-06, "loss": 1.1941, "step": 14320 }, { "epoch": 2.54, "grad_norm": 5.0625, "learning_rate": 3.4430969574955693e-06, "loss": 1.2214, "step": 14325 }, { "epoch": 2.54, "grad_norm": 4.8125, "learning_rate": 3.4301450045285767e-06, "loss": 1.2623, "step": 14330 }, { "epoch": 2.54, "grad_norm": 4.5, "learning_rate": 3.41721598098102e-06, "loss": 1.2812, "step": 14335 }, { "epoch": 2.54, "grad_norm": 4.0, "learning_rate": 3.4043098980104747e-06, "loss": 1.1913, "step": 14340 }, { "epoch": 2.54, "grad_norm": 4.4375, "learning_rate": 3.3914267667547116e-06, "loss": 1.2005, "step": 14345 }, { "epoch": 2.54, "grad_norm": 4.53125, "learning_rate": 3.3785665983317005e-06, "loss": 1.1436, "step": 14350 }, { "epoch": 2.54, "grad_norm": 4.1875, "learning_rate": 3.3657294038395836e-06, "loss": 1.1792, "step": 14355 }, { "epoch": 2.54, "grad_norm": 4.40625, "learning_rate": 3.3529151943566927e-06, "loss": 1.197, "step": 14360 }, { "epoch": 2.55, "grad_norm": 4.125, "learning_rate": 3.3401239809415052e-06, "loss": 1.2112, "step": 14365 }, { "epoch": 2.55, "grad_norm": 5.15625, "learning_rate": 3.327355774632661e-06, "loss": 1.2661, "step": 14370 }, { "epoch": 2.55, "grad_norm": 4.71875, "learning_rate": 3.3146105864489597e-06, "loss": 1.1639, "step": 14375 }, { "epoch": 2.55, "grad_norm": 4.5625, "learning_rate": 3.301888427389306e-06, "loss": 1.2085, "step": 14380 }, { "epoch": 2.55, "grad_norm": 4.15625, "learning_rate": 3.289189308432757e-06, "loss": 1.1612, "step": 14385 }, { "epoch": 2.55, "grad_norm": 4.90625, "learning_rate": 3.276513240538478e-06, "loss": 1.2035, "step": 14390 }, { "epoch": 2.55, "grad_norm": 4.625, "learning_rate": 3.263860234645737e-06, "loss": 1.1527, "step": 14395 }, { "epoch": 2.55, "grad_norm": 5.15625, "learning_rate": 3.251230301673913e-06, "loss": 1.2506, "step": 14400 }, { "epoch": 2.55, "grad_norm": 4.625, "learning_rate": 3.238623452522461e-06, "loss": 1.1533, "step": 14405 }, { "epoch": 2.55, "grad_norm": 4.4375, "learning_rate": 3.226039698070912e-06, "loss": 1.2889, "step": 14410 }, { "epoch": 2.55, "grad_norm": 4.875, "learning_rate": 3.213479049178878e-06, "loss": 1.2468, "step": 14415 }, { "epoch": 2.56, "grad_norm": 4.375, "learning_rate": 3.200941516686022e-06, "loss": 1.2032, "step": 14420 }, { "epoch": 2.56, "grad_norm": 5.625, "learning_rate": 3.1884271114120765e-06, "loss": 1.2408, "step": 14425 }, { "epoch": 2.56, "grad_norm": 5.5, "learning_rate": 3.1759358441567756e-06, "loss": 1.2995, "step": 14430 }, { "epoch": 2.56, "grad_norm": 4.375, "learning_rate": 3.163467725699939e-06, "loss": 1.2982, "step": 14435 }, { "epoch": 2.56, "grad_norm": 4.59375, "learning_rate": 3.1510227668013637e-06, "loss": 1.2657, "step": 14440 }, { "epoch": 2.56, "grad_norm": 4.625, "learning_rate": 3.1386009782008818e-06, "loss": 1.1995, "step": 14445 }, { "epoch": 2.56, "grad_norm": 5.0, "learning_rate": 3.1262023706183253e-06, "loss": 1.1654, "step": 14450 }, { "epoch": 2.56, "grad_norm": 4.96875, "learning_rate": 3.1138269547535314e-06, "loss": 1.2838, "step": 14455 }, { "epoch": 2.56, "grad_norm": 4.125, "learning_rate": 3.101474741286301e-06, "loss": 1.2064, "step": 14460 }, { "epoch": 2.56, "grad_norm": 4.03125, "learning_rate": 3.089145740876427e-06, "loss": 1.2008, "step": 14465 }, { "epoch": 2.56, "grad_norm": 4.9375, "learning_rate": 3.0768399641636714e-06, "loss": 1.3106, "step": 14470 }, { "epoch": 2.57, "grad_norm": 4.6875, "learning_rate": 3.0645574217677497e-06, "loss": 1.2492, "step": 14475 }, { "epoch": 2.57, "grad_norm": 4.65625, "learning_rate": 3.052298124288323e-06, "loss": 1.2166, "step": 14480 }, { "epoch": 2.57, "grad_norm": 4.40625, "learning_rate": 3.0400620823050083e-06, "loss": 1.2448, "step": 14485 }, { "epoch": 2.57, "grad_norm": 4.125, "learning_rate": 3.027849306377326e-06, "loss": 1.2154, "step": 14490 }, { "epoch": 2.57, "grad_norm": 4.21875, "learning_rate": 3.0156598070447384e-06, "loss": 1.2059, "step": 14495 }, { "epoch": 2.57, "grad_norm": 4.25, "learning_rate": 3.0034935948266275e-06, "loss": 1.2992, "step": 14500 }, { "epoch": 2.57, "grad_norm": 4.34375, "learning_rate": 2.991350680222248e-06, "loss": 1.2047, "step": 14505 }, { "epoch": 2.57, "grad_norm": 4.40625, "learning_rate": 2.9792310737107743e-06, "loss": 1.1961, "step": 14510 }, { "epoch": 2.57, "grad_norm": 4.625, "learning_rate": 2.967134785751273e-06, "loss": 1.1616, "step": 14515 }, { "epoch": 2.57, "grad_norm": 4.21875, "learning_rate": 2.955061826782661e-06, "loss": 1.1633, "step": 14520 }, { "epoch": 2.57, "grad_norm": 4.15625, "learning_rate": 2.9430122072237373e-06, "loss": 1.2265, "step": 14525 }, { "epoch": 2.57, "grad_norm": 5.4375, "learning_rate": 2.9309859374731675e-06, "loss": 1.1996, "step": 14530 }, { "epoch": 2.58, "grad_norm": 4.59375, "learning_rate": 2.918983027909442e-06, "loss": 1.3076, "step": 14535 }, { "epoch": 2.58, "grad_norm": 4.09375, "learning_rate": 2.907003488890916e-06, "loss": 1.1145, "step": 14540 }, { "epoch": 2.58, "grad_norm": 4.5625, "learning_rate": 2.8950473307557658e-06, "loss": 1.1708, "step": 14545 }, { "epoch": 2.58, "grad_norm": 4.4375, "learning_rate": 2.8831145638219946e-06, "loss": 1.2462, "step": 14550 }, { "epoch": 2.58, "grad_norm": 4.59375, "learning_rate": 2.871205198387402e-06, "loss": 1.2302, "step": 14555 }, { "epoch": 2.58, "grad_norm": 4.125, "learning_rate": 2.85931924472963e-06, "loss": 1.2205, "step": 14560 }, { "epoch": 2.58, "grad_norm": 4.34375, "learning_rate": 2.8474567131060738e-06, "loss": 1.2079, "step": 14565 }, { "epoch": 2.58, "grad_norm": 4.25, "learning_rate": 2.83561761375394e-06, "loss": 1.2664, "step": 14570 }, { "epoch": 2.58, "grad_norm": 4.25, "learning_rate": 2.8238019568902107e-06, "loss": 1.2239, "step": 14575 }, { "epoch": 2.58, "grad_norm": 4.875, "learning_rate": 2.8120097527116383e-06, "loss": 1.2249, "step": 14580 }, { "epoch": 2.58, "grad_norm": 4.53125, "learning_rate": 2.8002410113947264e-06, "loss": 1.2104, "step": 14585 }, { "epoch": 2.59, "grad_norm": 5.0, "learning_rate": 2.788495743095737e-06, "loss": 1.2833, "step": 14590 }, { "epoch": 2.59, "grad_norm": 4.8125, "learning_rate": 2.776773957950679e-06, "loss": 1.1381, "step": 14595 }, { "epoch": 2.59, "grad_norm": 4.75, "learning_rate": 2.7650756660752873e-06, "loss": 1.1904, "step": 14600 }, { "epoch": 2.59, "grad_norm": 5.40625, "learning_rate": 2.753400877565033e-06, "loss": 1.3166, "step": 14605 }, { "epoch": 2.59, "grad_norm": 4.09375, "learning_rate": 2.7417496024951006e-06, "loss": 1.2379, "step": 14610 }, { "epoch": 2.59, "grad_norm": 4.40625, "learning_rate": 2.7301218509203697e-06, "loss": 1.1392, "step": 14615 }, { "epoch": 2.59, "grad_norm": 4.78125, "learning_rate": 2.7185176328754334e-06, "loss": 1.2676, "step": 14620 }, { "epoch": 2.59, "grad_norm": 4.78125, "learning_rate": 2.7069369583745786e-06, "loss": 1.304, "step": 14625 }, { "epoch": 2.59, "grad_norm": 4.34375, "learning_rate": 2.69537983741176e-06, "loss": 1.1828, "step": 14630 }, { "epoch": 2.59, "grad_norm": 4.5625, "learning_rate": 2.68384627996061e-06, "loss": 1.2505, "step": 14635 }, { "epoch": 2.59, "grad_norm": 4.5, "learning_rate": 2.672336295974447e-06, "loss": 1.2539, "step": 14640 }, { "epoch": 2.6, "grad_norm": 4.84375, "learning_rate": 2.660849895386218e-06, "loss": 1.1943, "step": 14645 }, { "epoch": 2.6, "grad_norm": 4.125, "learning_rate": 2.649387088108528e-06, "loss": 1.2522, "step": 14650 }, { "epoch": 2.6, "grad_norm": 4.125, "learning_rate": 2.6379478840336236e-06, "loss": 1.2709, "step": 14655 }, { "epoch": 2.6, "grad_norm": 4.1875, "learning_rate": 2.6265322930333844e-06, "loss": 1.2667, "step": 14660 }, { "epoch": 2.6, "grad_norm": 4.71875, "learning_rate": 2.615140324959304e-06, "loss": 1.2275, "step": 14665 }, { "epoch": 2.6, "grad_norm": 4.75, "learning_rate": 2.6037719896424917e-06, "loss": 1.2404, "step": 14670 }, { "epoch": 2.6, "grad_norm": 4.46875, "learning_rate": 2.592427296893669e-06, "loss": 1.2323, "step": 14675 }, { "epoch": 2.6, "grad_norm": 4.53125, "learning_rate": 2.581106256503152e-06, "loss": 1.2422, "step": 14680 }, { "epoch": 2.6, "grad_norm": 5.03125, "learning_rate": 2.5698088782408345e-06, "loss": 1.1812, "step": 14685 }, { "epoch": 2.6, "grad_norm": 4.46875, "learning_rate": 2.558535171856212e-06, "loss": 1.1797, "step": 14690 }, { "epoch": 2.6, "grad_norm": 4.6875, "learning_rate": 2.5472851470783253e-06, "loss": 1.2708, "step": 14695 }, { "epoch": 2.6, "grad_norm": 4.28125, "learning_rate": 2.536058813615795e-06, "loss": 1.2745, "step": 14700 }, { "epoch": 2.61, "grad_norm": 4.625, "learning_rate": 2.524856181156797e-06, "loss": 1.1406, "step": 14705 }, { "epoch": 2.61, "grad_norm": 4.34375, "learning_rate": 2.5136772593690395e-06, "loss": 1.2081, "step": 14710 }, { "epoch": 2.61, "grad_norm": 4.75, "learning_rate": 2.502522057899784e-06, "loss": 1.2162, "step": 14715 }, { "epoch": 2.61, "grad_norm": 4.46875, "learning_rate": 2.491390586375818e-06, "loss": 1.2293, "step": 14720 }, { "epoch": 2.61, "grad_norm": 4.4375, "learning_rate": 2.4802828544034427e-06, "loss": 1.208, "step": 14725 }, { "epoch": 2.61, "grad_norm": 4.6875, "learning_rate": 2.4691988715684843e-06, "loss": 1.2732, "step": 14730 }, { "epoch": 2.61, "grad_norm": 4.5625, "learning_rate": 2.4581386474362676e-06, "loss": 1.254, "step": 14735 }, { "epoch": 2.61, "grad_norm": 4.96875, "learning_rate": 2.447102191551608e-06, "loss": 1.2072, "step": 14740 }, { "epoch": 2.61, "grad_norm": 4.09375, "learning_rate": 2.4360895134388162e-06, "loss": 1.21, "step": 14745 }, { "epoch": 2.61, "grad_norm": 4.96875, "learning_rate": 2.425100622601687e-06, "loss": 1.2603, "step": 14750 }, { "epoch": 2.61, "grad_norm": 4.78125, "learning_rate": 2.414135528523481e-06, "loss": 1.2553, "step": 14755 }, { "epoch": 2.62, "grad_norm": 4.40625, "learning_rate": 2.4031942406669127e-06, "loss": 1.2011, "step": 14760 }, { "epoch": 2.62, "grad_norm": 4.625, "learning_rate": 2.392276768474182e-06, "loss": 1.267, "step": 14765 }, { "epoch": 2.62, "grad_norm": 4.40625, "learning_rate": 2.381383121366899e-06, "loss": 1.2457, "step": 14770 }, { "epoch": 2.62, "grad_norm": 4.1875, "learning_rate": 2.370513308746144e-06, "loss": 1.2169, "step": 14775 }, { "epoch": 2.62, "grad_norm": 4.875, "learning_rate": 2.3596673399924095e-06, "loss": 1.2065, "step": 14780 }, { "epoch": 2.62, "grad_norm": 5.1875, "learning_rate": 2.348845224465622e-06, "loss": 1.2515, "step": 14785 }, { "epoch": 2.62, "grad_norm": 3.78125, "learning_rate": 2.3380469715051147e-06, "loss": 1.2639, "step": 14790 }, { "epoch": 2.62, "grad_norm": 5.375, "learning_rate": 2.3272725904296287e-06, "loss": 1.2149, "step": 14795 }, { "epoch": 2.62, "grad_norm": 4.34375, "learning_rate": 2.3165220905373107e-06, "loss": 1.253, "step": 14800 }, { "epoch": 2.62, "grad_norm": 4.125, "learning_rate": 2.3057954811056958e-06, "loss": 1.227, "step": 14805 }, { "epoch": 2.62, "grad_norm": 4.46875, "learning_rate": 2.295092771391696e-06, "loss": 1.2242, "step": 14810 }, { "epoch": 2.63, "grad_norm": 4.5, "learning_rate": 2.2844139706316093e-06, "loss": 1.2294, "step": 14815 }, { "epoch": 2.63, "grad_norm": 5.125, "learning_rate": 2.2737590880410874e-06, "loss": 1.2813, "step": 14820 }, { "epoch": 2.63, "grad_norm": 4.5625, "learning_rate": 2.263128132815149e-06, "loss": 1.233, "step": 14825 }, { "epoch": 2.63, "grad_norm": 4.28125, "learning_rate": 2.2525211141281643e-06, "loss": 1.2906, "step": 14830 }, { "epoch": 2.63, "grad_norm": 4.96875, "learning_rate": 2.241938041133842e-06, "loss": 1.2401, "step": 14835 }, { "epoch": 2.63, "grad_norm": 4.5, "learning_rate": 2.2313789229652228e-06, "loss": 1.1943, "step": 14840 }, { "epoch": 2.63, "grad_norm": 5.21875, "learning_rate": 2.2208437687346973e-06, "loss": 1.2146, "step": 14845 }, { "epoch": 2.63, "grad_norm": 4.375, "learning_rate": 2.2103325875339463e-06, "loss": 1.2441, "step": 14850 }, { "epoch": 2.63, "grad_norm": 5.5625, "learning_rate": 2.1998453884339744e-06, "loss": 1.2039, "step": 14855 }, { "epoch": 2.63, "grad_norm": 3.859375, "learning_rate": 2.1893821804850978e-06, "loss": 1.1928, "step": 14860 }, { "epoch": 2.63, "grad_norm": 4.28125, "learning_rate": 2.178942972716911e-06, "loss": 1.2162, "step": 14865 }, { "epoch": 2.64, "grad_norm": 4.90625, "learning_rate": 2.16852777413831e-06, "loss": 1.23, "step": 14870 }, { "epoch": 2.64, "grad_norm": 5.1875, "learning_rate": 2.1581365937374733e-06, "loss": 1.1935, "step": 14875 }, { "epoch": 2.64, "grad_norm": 4.96875, "learning_rate": 2.147769440481846e-06, "loss": 1.3079, "step": 14880 }, { "epoch": 2.64, "grad_norm": 5.21875, "learning_rate": 2.1374263233181256e-06, "loss": 1.1898, "step": 14885 }, { "epoch": 2.64, "grad_norm": 4.5, "learning_rate": 2.1271072511723023e-06, "loss": 1.2012, "step": 14890 }, { "epoch": 2.64, "grad_norm": 4.90625, "learning_rate": 2.1168122329495753e-06, "loss": 1.2945, "step": 14895 }, { "epoch": 2.64, "grad_norm": 4.625, "learning_rate": 2.1065412775344085e-06, "loss": 1.2, "step": 14900 }, { "epoch": 2.64, "grad_norm": 5.5, "learning_rate": 2.0962943937904978e-06, "loss": 1.2661, "step": 14905 }, { "epoch": 2.64, "grad_norm": 4.09375, "learning_rate": 2.086071590560766e-06, "loss": 1.2286, "step": 14910 }, { "epoch": 2.64, "grad_norm": 4.65625, "learning_rate": 2.075872876667344e-06, "loss": 1.2287, "step": 14915 }, { "epoch": 2.64, "grad_norm": 4.625, "learning_rate": 2.0656982609115803e-06, "loss": 1.2689, "step": 14920 }, { "epoch": 2.64, "grad_norm": 4.15625, "learning_rate": 2.0555477520740384e-06, "loss": 1.2351, "step": 14925 }, { "epoch": 2.65, "grad_norm": 4.4375, "learning_rate": 2.0454213589144577e-06, "loss": 1.246, "step": 14930 }, { "epoch": 2.65, "grad_norm": 4.3125, "learning_rate": 2.0353190901717844e-06, "loss": 1.204, "step": 14935 }, { "epoch": 2.65, "grad_norm": 4.28125, "learning_rate": 2.025240954564138e-06, "loss": 1.2059, "step": 14940 }, { "epoch": 2.65, "grad_norm": 4.6875, "learning_rate": 2.015186960788802e-06, "loss": 1.2623, "step": 14945 }, { "epoch": 2.65, "grad_norm": 5.78125, "learning_rate": 2.0051571175222382e-06, "loss": 1.2947, "step": 14950 }, { "epoch": 2.65, "grad_norm": 3.953125, "learning_rate": 1.9951514334200706e-06, "loss": 1.2189, "step": 14955 }, { "epoch": 2.65, "grad_norm": 5.34375, "learning_rate": 1.985169917117058e-06, "loss": 1.2386, "step": 14960 }, { "epoch": 2.65, "grad_norm": 4.6875, "learning_rate": 1.9752125772271102e-06, "loss": 1.2289, "step": 14965 }, { "epoch": 2.65, "grad_norm": 4.15625, "learning_rate": 1.965279422343287e-06, "loss": 1.2092, "step": 14970 }, { "epoch": 2.65, "grad_norm": 4.875, "learning_rate": 1.955370461037751e-06, "loss": 1.2083, "step": 14975 }, { "epoch": 2.65, "grad_norm": 5.0625, "learning_rate": 1.945485701861809e-06, "loss": 1.2536, "step": 14980 }, { "epoch": 2.66, "grad_norm": 4.25, "learning_rate": 1.93562515334587e-06, "loss": 1.2724, "step": 14985 }, { "epoch": 2.66, "grad_norm": 4.15625, "learning_rate": 1.925788823999448e-06, "loss": 1.187, "step": 14990 }, { "epoch": 2.66, "grad_norm": 4.53125, "learning_rate": 1.9159767223111592e-06, "loss": 1.2194, "step": 14995 }, { "epoch": 2.66, "grad_norm": 4.53125, "learning_rate": 1.9061888567487162e-06, "loss": 1.2056, "step": 15000 }, { "epoch": 2.66, "grad_norm": 5.4375, "learning_rate": 1.8964252357589128e-06, "loss": 1.2409, "step": 15005 }, { "epoch": 2.66, "grad_norm": 4.25, "learning_rate": 1.8866858677676191e-06, "loss": 1.2587, "step": 15010 }, { "epoch": 2.66, "grad_norm": 4.875, "learning_rate": 1.8769707611797748e-06, "loss": 1.2194, "step": 15015 }, { "epoch": 2.66, "grad_norm": 4.21875, "learning_rate": 1.867279924379387e-06, "loss": 1.2043, "step": 15020 }, { "epoch": 2.66, "grad_norm": 4.90625, "learning_rate": 1.8576133657295091e-06, "loss": 1.2677, "step": 15025 }, { "epoch": 2.66, "grad_norm": 4.5625, "learning_rate": 1.8479710935722483e-06, "loss": 1.2728, "step": 15030 }, { "epoch": 2.66, "grad_norm": 5.25, "learning_rate": 1.8383531162287582e-06, "loss": 1.2767, "step": 15035 }, { "epoch": 2.67, "grad_norm": 4.25, "learning_rate": 1.8287594419992127e-06, "loss": 1.1646, "step": 15040 }, { "epoch": 2.67, "grad_norm": 4.25, "learning_rate": 1.8191900791628292e-06, "loss": 1.2444, "step": 15045 }, { "epoch": 2.67, "grad_norm": 5.28125, "learning_rate": 1.8096450359778316e-06, "loss": 1.1962, "step": 15050 }, { "epoch": 2.67, "grad_norm": 4.8125, "learning_rate": 1.8001243206814578e-06, "loss": 1.2632, "step": 15055 }, { "epoch": 2.67, "grad_norm": 4.375, "learning_rate": 1.7906279414899619e-06, "loss": 1.3105, "step": 15060 }, { "epoch": 2.67, "grad_norm": 5.0625, "learning_rate": 1.7811559065985915e-06, "loss": 1.2764, "step": 15065 }, { "epoch": 2.67, "grad_norm": 4.96875, "learning_rate": 1.7717082241815686e-06, "loss": 1.1784, "step": 15070 }, { "epoch": 2.67, "grad_norm": 4.5, "learning_rate": 1.7622849023921207e-06, "loss": 1.2218, "step": 15075 }, { "epoch": 2.67, "grad_norm": 4.1875, "learning_rate": 1.7528859493624494e-06, "loss": 1.2025, "step": 15080 }, { "epoch": 2.67, "grad_norm": 4.125, "learning_rate": 1.74351137320371e-06, "loss": 1.2513, "step": 15085 }, { "epoch": 2.67, "grad_norm": 4.0, "learning_rate": 1.734161182006041e-06, "loss": 1.1973, "step": 15090 }, { "epoch": 2.67, "grad_norm": 4.625, "learning_rate": 1.7248353838385312e-06, "loss": 1.1656, "step": 15095 }, { "epoch": 2.68, "grad_norm": 4.84375, "learning_rate": 1.7155339867492127e-06, "loss": 1.2393, "step": 15100 }, { "epoch": 2.68, "grad_norm": 5.125, "learning_rate": 1.706256998765061e-06, "loss": 1.2502, "step": 15105 }, { "epoch": 2.68, "grad_norm": 4.84375, "learning_rate": 1.6970044278919927e-06, "loss": 1.1912, "step": 15110 }, { "epoch": 2.68, "grad_norm": 4.125, "learning_rate": 1.6877762821148502e-06, "loss": 1.2212, "step": 15115 }, { "epoch": 2.68, "grad_norm": 4.65625, "learning_rate": 1.678572569397393e-06, "loss": 1.2053, "step": 15120 }, { "epoch": 2.68, "grad_norm": 5.34375, "learning_rate": 1.669393297682298e-06, "loss": 1.1483, "step": 15125 }, { "epoch": 2.68, "grad_norm": 4.6875, "learning_rate": 1.6602384748911515e-06, "loss": 1.25, "step": 15130 }, { "epoch": 2.68, "grad_norm": 4.5625, "learning_rate": 1.6511081089244407e-06, "loss": 1.2511, "step": 15135 }, { "epoch": 2.68, "grad_norm": 4.9375, "learning_rate": 1.642002207661546e-06, "loss": 1.1327, "step": 15140 }, { "epoch": 2.68, "grad_norm": 4.28125, "learning_rate": 1.632920778960738e-06, "loss": 1.2231, "step": 15145 }, { "epoch": 2.68, "grad_norm": 3.828125, "learning_rate": 1.623863830659158e-06, "loss": 1.2155, "step": 15150 }, { "epoch": 2.69, "grad_norm": 4.125, "learning_rate": 1.6148313705728302e-06, "loss": 1.2029, "step": 15155 }, { "epoch": 2.69, "grad_norm": 4.125, "learning_rate": 1.6058234064966492e-06, "loss": 1.2027, "step": 15160 }, { "epoch": 2.69, "grad_norm": 4.59375, "learning_rate": 1.5968399462043536e-06, "loss": 1.2603, "step": 15165 }, { "epoch": 2.69, "grad_norm": 4.3125, "learning_rate": 1.5878809974485487e-06, "loss": 1.2531, "step": 15170 }, { "epoch": 2.69, "grad_norm": 4.9375, "learning_rate": 1.5789465679606962e-06, "loss": 1.2575, "step": 15175 }, { "epoch": 2.69, "grad_norm": 5.1875, "learning_rate": 1.5700366654510723e-06, "loss": 1.2439, "step": 15180 }, { "epoch": 2.69, "grad_norm": 4.375, "learning_rate": 1.5611512976088026e-06, "loss": 1.2729, "step": 15185 }, { "epoch": 2.69, "grad_norm": 5.75, "learning_rate": 1.5522904721018438e-06, "loss": 1.223, "step": 15190 }, { "epoch": 2.69, "grad_norm": 4.34375, "learning_rate": 1.5434541965769622e-06, "loss": 1.2872, "step": 15195 }, { "epoch": 2.69, "grad_norm": 5.5625, "learning_rate": 1.5346424786597379e-06, "loss": 1.2887, "step": 15200 }, { "epoch": 2.69, "grad_norm": 4.125, "learning_rate": 1.5258553259545681e-06, "loss": 1.2042, "step": 15205 }, { "epoch": 2.7, "grad_norm": 4.4375, "learning_rate": 1.517092746044647e-06, "loss": 1.2175, "step": 15210 }, { "epoch": 2.7, "grad_norm": 4.3125, "learning_rate": 1.5083547464919455e-06, "loss": 1.2704, "step": 15215 }, { "epoch": 2.7, "grad_norm": 4.53125, "learning_rate": 1.4996413348372584e-06, "loss": 1.2679, "step": 15220 }, { "epoch": 2.7, "grad_norm": 4.4375, "learning_rate": 1.4909525186001272e-06, "loss": 1.2228, "step": 15225 }, { "epoch": 2.7, "grad_norm": 4.3125, "learning_rate": 1.482288305278884e-06, "loss": 1.1883, "step": 15230 }, { "epoch": 2.7, "grad_norm": 4.34375, "learning_rate": 1.4736487023506206e-06, "loss": 1.2245, "step": 15235 }, { "epoch": 2.7, "grad_norm": 4.625, "learning_rate": 1.465033717271207e-06, "loss": 1.1921, "step": 15240 }, { "epoch": 2.7, "grad_norm": 5.71875, "learning_rate": 1.4564433574752456e-06, "loss": 1.2503, "step": 15245 }, { "epoch": 2.7, "grad_norm": 3.875, "learning_rate": 1.4478776303761033e-06, "loss": 1.1965, "step": 15250 }, { "epoch": 2.7, "grad_norm": 5.0625, "learning_rate": 1.4393365433658867e-06, "loss": 1.2536, "step": 15255 }, { "epoch": 2.7, "grad_norm": 4.5, "learning_rate": 1.43082010381543e-06, "loss": 1.2525, "step": 15260 }, { "epoch": 2.71, "grad_norm": 3.890625, "learning_rate": 1.4223283190743109e-06, "loss": 1.1967, "step": 15265 }, { "epoch": 2.71, "grad_norm": 4.53125, "learning_rate": 1.4138611964708203e-06, "loss": 1.2691, "step": 15270 }, { "epoch": 2.71, "grad_norm": 4.3125, "learning_rate": 1.405418743311967e-06, "loss": 1.2037, "step": 15275 }, { "epoch": 2.71, "grad_norm": 4.96875, "learning_rate": 1.3970009668834728e-06, "loss": 1.2393, "step": 15280 }, { "epoch": 2.71, "grad_norm": 4.5625, "learning_rate": 1.3886078744497666e-06, "loss": 1.2493, "step": 15285 }, { "epoch": 2.71, "grad_norm": 4.125, "learning_rate": 1.3802394732539613e-06, "loss": 1.228, "step": 15290 }, { "epoch": 2.71, "grad_norm": 4.28125, "learning_rate": 1.371895770517877e-06, "loss": 1.2395, "step": 15295 }, { "epoch": 2.71, "grad_norm": 4.0, "learning_rate": 1.3635767734420268e-06, "loss": 1.206, "step": 15300 }, { "epoch": 2.71, "grad_norm": 5.21875, "learning_rate": 1.3552824892055716e-06, "loss": 1.2609, "step": 15305 }, { "epoch": 2.71, "grad_norm": 4.8125, "learning_rate": 1.3470129249663755e-06, "loss": 1.2489, "step": 15310 }, { "epoch": 2.71, "grad_norm": 4.4375, "learning_rate": 1.3387680878609632e-06, "loss": 1.2318, "step": 15315 }, { "epoch": 2.71, "grad_norm": 4.0625, "learning_rate": 1.3305479850045e-06, "loss": 1.2424, "step": 15320 }, { "epoch": 2.72, "grad_norm": 4.9375, "learning_rate": 1.3223526234908344e-06, "loss": 1.2121, "step": 15325 }, { "epoch": 2.72, "grad_norm": 4.5, "learning_rate": 1.3141820103924463e-06, "loss": 1.2529, "step": 15330 }, { "epoch": 2.72, "grad_norm": 4.96875, "learning_rate": 1.3060361527604614e-06, "loss": 1.153, "step": 15335 }, { "epoch": 2.72, "grad_norm": 4.3125, "learning_rate": 1.2979150576246468e-06, "loss": 1.2536, "step": 15340 }, { "epoch": 2.72, "grad_norm": 4.28125, "learning_rate": 1.2898187319933962e-06, "loss": 1.2562, "step": 15345 }, { "epoch": 2.72, "grad_norm": 4.71875, "learning_rate": 1.281747182853721e-06, "loss": 1.1804, "step": 15350 }, { "epoch": 2.72, "grad_norm": 4.25, "learning_rate": 1.2737004171712585e-06, "loss": 1.2051, "step": 15355 }, { "epoch": 2.72, "grad_norm": 5.09375, "learning_rate": 1.2656784418902611e-06, "loss": 1.2003, "step": 15360 }, { "epoch": 2.72, "grad_norm": 4.65625, "learning_rate": 1.2576812639335844e-06, "loss": 1.1739, "step": 15365 }, { "epoch": 2.72, "grad_norm": 4.53125, "learning_rate": 1.2497088902026775e-06, "loss": 1.2231, "step": 15370 }, { "epoch": 2.72, "grad_norm": 4.96875, "learning_rate": 1.2417613275775874e-06, "loss": 1.2153, "step": 15375 }, { "epoch": 2.73, "grad_norm": 4.96875, "learning_rate": 1.2338385829169607e-06, "loss": 1.1916, "step": 15380 }, { "epoch": 2.73, "grad_norm": 4.8125, "learning_rate": 1.2259406630580084e-06, "loss": 1.209, "step": 15385 }, { "epoch": 2.73, "grad_norm": 4.53125, "learning_rate": 1.2180675748165314e-06, "loss": 1.2562, "step": 15390 }, { "epoch": 2.73, "grad_norm": 4.3125, "learning_rate": 1.2102193249868987e-06, "loss": 1.177, "step": 15395 }, { "epoch": 2.73, "grad_norm": 5.46875, "learning_rate": 1.2023959203420354e-06, "loss": 1.2022, "step": 15400 }, { "epoch": 2.73, "grad_norm": 4.4375, "learning_rate": 1.1945973676334344e-06, "loss": 1.2274, "step": 15405 }, { "epoch": 2.73, "grad_norm": 4.40625, "learning_rate": 1.1868236735911454e-06, "loss": 1.1985, "step": 15410 }, { "epoch": 2.73, "grad_norm": 4.53125, "learning_rate": 1.1790748449237488e-06, "loss": 1.1587, "step": 15415 }, { "epoch": 2.73, "grad_norm": 4.875, "learning_rate": 1.1713508883183788e-06, "loss": 1.2371, "step": 15420 }, { "epoch": 2.73, "grad_norm": 3.953125, "learning_rate": 1.1636518104407168e-06, "loss": 1.2591, "step": 15425 }, { "epoch": 2.73, "grad_norm": 4.5, "learning_rate": 1.155977617934948e-06, "loss": 1.2539, "step": 15430 }, { "epoch": 2.74, "grad_norm": 4.5625, "learning_rate": 1.148328317423798e-06, "loss": 1.279, "step": 15435 }, { "epoch": 2.74, "grad_norm": 4.34375, "learning_rate": 1.1407039155085131e-06, "loss": 1.1732, "step": 15440 }, { "epoch": 2.74, "grad_norm": 3.9375, "learning_rate": 1.1331044187688399e-06, "loss": 1.2721, "step": 15445 }, { "epoch": 2.74, "grad_norm": 4.5, "learning_rate": 1.1255298337630392e-06, "loss": 1.2155, "step": 15450 }, { "epoch": 2.74, "grad_norm": 4.4375, "learning_rate": 1.1179801670278823e-06, "loss": 1.2153, "step": 15455 }, { "epoch": 2.74, "grad_norm": 4.9375, "learning_rate": 1.1104554250786203e-06, "loss": 1.29, "step": 15460 }, { "epoch": 2.74, "grad_norm": 5.125, "learning_rate": 1.1029556144090026e-06, "loss": 1.2462, "step": 15465 }, { "epoch": 2.74, "grad_norm": 4.21875, "learning_rate": 1.0954807414912682e-06, "loss": 1.182, "step": 15470 }, { "epoch": 2.74, "grad_norm": 4.28125, "learning_rate": 1.0880308127761274e-06, "loss": 1.1277, "step": 15475 }, { "epoch": 2.74, "grad_norm": 4.25, "learning_rate": 1.0806058346927606e-06, "loss": 1.1808, "step": 15480 }, { "epoch": 2.74, "grad_norm": 5.21875, "learning_rate": 1.0732058136488255e-06, "loss": 1.2594, "step": 15485 }, { "epoch": 2.74, "grad_norm": 4.4375, "learning_rate": 1.0658307560304436e-06, "loss": 1.2275, "step": 15490 }, { "epoch": 2.75, "grad_norm": 4.0, "learning_rate": 1.058480668202181e-06, "loss": 1.23, "step": 15495 }, { "epoch": 2.75, "grad_norm": 4.53125, "learning_rate": 1.0511555565070597e-06, "loss": 1.1641, "step": 15500 }, { "epoch": 2.75, "grad_norm": 3.84375, "learning_rate": 1.0438554272665635e-06, "loss": 1.1708, "step": 15505 }, { "epoch": 2.75, "grad_norm": 4.3125, "learning_rate": 1.036580286780593e-06, "loss": 1.2656, "step": 15510 }, { "epoch": 2.75, "grad_norm": 4.84375, "learning_rate": 1.0293301413274958e-06, "loss": 1.221, "step": 15515 }, { "epoch": 2.75, "grad_norm": 5.28125, "learning_rate": 1.0221049971640505e-06, "loss": 1.2158, "step": 15520 }, { "epoch": 2.75, "grad_norm": 4.3125, "learning_rate": 1.014904860525453e-06, "loss": 1.2776, "step": 15525 }, { "epoch": 2.75, "grad_norm": 4.25, "learning_rate": 1.0077297376253258e-06, "loss": 1.2624, "step": 15530 }, { "epoch": 2.75, "grad_norm": 4.46875, "learning_rate": 1.0005796346556995e-06, "loss": 1.212, "step": 15535 }, { "epoch": 2.75, "grad_norm": 5.4375, "learning_rate": 9.934545577870146e-07, "loss": 1.2415, "step": 15540 }, { "epoch": 2.75, "grad_norm": 4.0, "learning_rate": 9.863545131681162e-07, "loss": 1.2322, "step": 15545 }, { "epoch": 2.76, "grad_norm": 5.125, "learning_rate": 9.792795069262495e-07, "loss": 1.2535, "step": 15550 }, { "epoch": 2.76, "grad_norm": 4.40625, "learning_rate": 9.722295451670337e-07, "loss": 1.2222, "step": 15555 }, { "epoch": 2.76, "grad_norm": 4.34375, "learning_rate": 9.652046339745024e-07, "loss": 1.1908, "step": 15560 }, { "epoch": 2.76, "grad_norm": 4.375, "learning_rate": 9.582047794110493e-07, "loss": 1.1571, "step": 15565 }, { "epoch": 2.76, "grad_norm": 4.5625, "learning_rate": 9.51229987517459e-07, "loss": 1.1873, "step": 15570 }, { "epoch": 2.76, "grad_norm": 5.25, "learning_rate": 9.442802643128767e-07, "loss": 1.2215, "step": 15575 }, { "epoch": 2.76, "grad_norm": 4.34375, "learning_rate": 9.373556157948149e-07, "loss": 1.3061, "step": 15580 }, { "epoch": 2.76, "grad_norm": 4.4375, "learning_rate": 9.304560479391566e-07, "loss": 1.2797, "step": 15585 }, { "epoch": 2.76, "grad_norm": 4.875, "learning_rate": 9.235815667001324e-07, "loss": 1.2405, "step": 15590 }, { "epoch": 2.76, "grad_norm": 4.875, "learning_rate": 9.167321780103199e-07, "loss": 1.197, "step": 15595 }, { "epoch": 2.76, "grad_norm": 4.5625, "learning_rate": 9.099078877806577e-07, "loss": 1.2398, "step": 15600 }, { "epoch": 2.77, "grad_norm": 5.78125, "learning_rate": 9.031087019004047e-07, "loss": 1.2159, "step": 15605 }, { "epoch": 2.77, "grad_norm": 4.25, "learning_rate": 8.963346262371675e-07, "loss": 1.1645, "step": 15610 }, { "epoch": 2.77, "grad_norm": 4.125, "learning_rate": 8.895856666368862e-07, "loss": 1.1884, "step": 15615 }, { "epoch": 2.77, "grad_norm": 4.40625, "learning_rate": 8.828618289238088e-07, "loss": 1.1647, "step": 15620 }, { "epoch": 2.77, "grad_norm": 4.71875, "learning_rate": 8.761631189005171e-07, "loss": 1.2412, "step": 15625 }, { "epoch": 2.77, "grad_norm": 4.25, "learning_rate": 8.694895423479199e-07, "loss": 1.1995, "step": 15630 }, { "epoch": 2.77, "grad_norm": 4.34375, "learning_rate": 8.628411050252038e-07, "loss": 1.255, "step": 15635 }, { "epoch": 2.77, "grad_norm": 4.53125, "learning_rate": 8.562178126698894e-07, "loss": 1.2921, "step": 15640 }, { "epoch": 2.77, "grad_norm": 4.96875, "learning_rate": 8.496196709977877e-07, "loss": 1.2723, "step": 15645 }, { "epoch": 2.77, "grad_norm": 4.09375, "learning_rate": 8.430466857029973e-07, "loss": 1.1927, "step": 15650 }, { "epoch": 2.77, "grad_norm": 4.8125, "learning_rate": 8.36498862457914e-07, "loss": 1.2692, "step": 15655 }, { "epoch": 2.78, "grad_norm": 4.4375, "learning_rate": 8.299762069132244e-07, "loss": 1.264, "step": 15660 }, { "epoch": 2.78, "grad_norm": 4.0625, "learning_rate": 8.234787246978892e-07, "loss": 1.2398, "step": 15665 }, { "epoch": 2.78, "grad_norm": 4.5625, "learning_rate": 8.170064214191464e-07, "loss": 1.258, "step": 15670 }, { "epoch": 2.78, "grad_norm": 4.40625, "learning_rate": 8.10559302662508e-07, "loss": 1.2311, "step": 15675 }, { "epoch": 2.78, "grad_norm": 4.5625, "learning_rate": 8.041373739917402e-07, "loss": 1.2606, "step": 15680 }, { "epoch": 2.78, "grad_norm": 5.1875, "learning_rate": 7.97740640948883e-07, "loss": 1.1358, "step": 15685 }, { "epoch": 2.78, "grad_norm": 4.8125, "learning_rate": 7.913691090542308e-07, "loss": 1.2167, "step": 15690 }, { "epoch": 2.78, "grad_norm": 4.125, "learning_rate": 7.850227838063284e-07, "loss": 1.194, "step": 15695 }, { "epoch": 2.78, "grad_norm": 4.375, "learning_rate": 7.787016706819649e-07, "loss": 1.238, "step": 15700 }, { "epoch": 2.78, "grad_norm": 4.34375, "learning_rate": 7.724057751361735e-07, "loss": 1.2405, "step": 15705 }, { "epoch": 2.78, "grad_norm": 4.125, "learning_rate": 7.661351026022278e-07, "loss": 1.2148, "step": 15710 }, { "epoch": 2.78, "grad_norm": 4.21875, "learning_rate": 7.598896584916326e-07, "loss": 1.2898, "step": 15715 }, { "epoch": 2.79, "grad_norm": 4.34375, "learning_rate": 7.536694481941165e-07, "loss": 1.1764, "step": 15720 }, { "epoch": 2.79, "grad_norm": 3.921875, "learning_rate": 7.474744770776387e-07, "loss": 1.2397, "step": 15725 }, { "epoch": 2.79, "grad_norm": 4.75, "learning_rate": 7.41304750488373e-07, "loss": 1.2566, "step": 15730 }, { "epoch": 2.79, "grad_norm": 4.65625, "learning_rate": 7.351602737507068e-07, "loss": 1.2023, "step": 15735 }, { "epoch": 2.79, "grad_norm": 4.125, "learning_rate": 7.290410521672453e-07, "loss": 1.24, "step": 15740 }, { "epoch": 2.79, "grad_norm": 4.25, "learning_rate": 7.22947091018784e-07, "loss": 1.2243, "step": 15745 }, { "epoch": 2.79, "grad_norm": 4.59375, "learning_rate": 7.168783955643299e-07, "loss": 1.203, "step": 15750 }, { "epoch": 2.79, "grad_norm": 4.53125, "learning_rate": 7.108349710410899e-07, "loss": 1.242, "step": 15755 }, { "epoch": 2.79, "grad_norm": 4.25, "learning_rate": 7.048168226644491e-07, "loss": 1.1896, "step": 15760 }, { "epoch": 2.79, "grad_norm": 4.75, "learning_rate": 6.988239556279929e-07, "loss": 1.2296, "step": 15765 }, { "epoch": 2.79, "grad_norm": 4.6875, "learning_rate": 6.928563751034778e-07, "loss": 1.2388, "step": 15770 }, { "epoch": 2.8, "grad_norm": 4.53125, "learning_rate": 6.869140862408441e-07, "loss": 1.1976, "step": 15775 }, { "epoch": 2.8, "grad_norm": 4.46875, "learning_rate": 6.809970941682098e-07, "loss": 1.2042, "step": 15780 }, { "epoch": 2.8, "grad_norm": 4.71875, "learning_rate": 6.751054039918503e-07, "loss": 1.2093, "step": 15785 }, { "epoch": 2.8, "grad_norm": 4.78125, "learning_rate": 6.692390207962217e-07, "loss": 1.2112, "step": 15790 }, { "epoch": 2.8, "grad_norm": 4.40625, "learning_rate": 6.633979496439247e-07, "loss": 1.1161, "step": 15795 }, { "epoch": 2.8, "grad_norm": 4.1875, "learning_rate": 6.575821955757299e-07, "loss": 1.1945, "step": 15800 }, { "epoch": 2.8, "grad_norm": 4.5, "learning_rate": 6.517917636105497e-07, "loss": 1.1423, "step": 15805 }, { "epoch": 2.8, "grad_norm": 4.9375, "learning_rate": 6.460266587454433e-07, "loss": 1.208, "step": 15810 }, { "epoch": 2.8, "grad_norm": 4.75, "learning_rate": 6.402868859556244e-07, "loss": 1.1694, "step": 15815 }, { "epoch": 2.8, "grad_norm": 5.0625, "learning_rate": 6.345724501944372e-07, "loss": 1.1911, "step": 15820 }, { "epoch": 2.8, "grad_norm": 4.1875, "learning_rate": 6.288833563933538e-07, "loss": 1.2979, "step": 15825 }, { "epoch": 2.81, "grad_norm": 4.5625, "learning_rate": 6.232196094619869e-07, "loss": 1.2027, "step": 15830 }, { "epoch": 2.81, "grad_norm": 4.625, "learning_rate": 6.175812142880799e-07, "loss": 1.2395, "step": 15835 }, { "epoch": 2.81, "grad_norm": 5.28125, "learning_rate": 6.119681757374807e-07, "loss": 1.2291, "step": 15840 }, { "epoch": 2.81, "grad_norm": 4.96875, "learning_rate": 6.063804986541677e-07, "loss": 1.153, "step": 15845 }, { "epoch": 2.81, "grad_norm": 4.40625, "learning_rate": 6.008181878602337e-07, "loss": 1.2317, "step": 15850 }, { "epoch": 2.81, "grad_norm": 4.375, "learning_rate": 5.952812481558689e-07, "loss": 1.2297, "step": 15855 }, { "epoch": 2.81, "grad_norm": 4.03125, "learning_rate": 5.897696843193812e-07, "loss": 1.2349, "step": 15860 }, { "epoch": 2.81, "grad_norm": 4.8125, "learning_rate": 5.842835011071724e-07, "loss": 1.1738, "step": 15865 }, { "epoch": 2.81, "grad_norm": 5.09375, "learning_rate": 5.788227032537419e-07, "loss": 1.2393, "step": 15870 }, { "epoch": 2.81, "grad_norm": 4.75, "learning_rate": 5.733872954716835e-07, "loss": 1.2061, "step": 15875 }, { "epoch": 2.81, "grad_norm": 4.1875, "learning_rate": 5.679772824516882e-07, "loss": 1.2361, "step": 15880 }, { "epoch": 2.81, "grad_norm": 4.75, "learning_rate": 5.625926688625082e-07, "loss": 1.2011, "step": 15885 }, { "epoch": 2.82, "grad_norm": 3.71875, "learning_rate": 5.572334593509998e-07, "loss": 1.239, "step": 15890 }, { "epoch": 2.82, "grad_norm": 4.09375, "learning_rate": 5.518996585420866e-07, "loss": 1.2198, "step": 15895 }, { "epoch": 2.82, "grad_norm": 5.53125, "learning_rate": 5.465912710387633e-07, "loss": 1.2382, "step": 15900 }, { "epoch": 2.82, "grad_norm": 5.21875, "learning_rate": 5.413083014220954e-07, "loss": 1.2613, "step": 15905 }, { "epoch": 2.82, "grad_norm": 6.0, "learning_rate": 5.360507542512161e-07, "loss": 1.2447, "step": 15910 }, { "epoch": 2.82, "grad_norm": 4.65625, "learning_rate": 5.308186340633093e-07, "loss": 1.2287, "step": 15915 }, { "epoch": 2.82, "grad_norm": 5.53125, "learning_rate": 5.25611945373633e-07, "loss": 1.1844, "step": 15920 }, { "epoch": 2.82, "grad_norm": 4.5625, "learning_rate": 5.204306926754765e-07, "loss": 1.2053, "step": 15925 }, { "epoch": 2.82, "grad_norm": 5.03125, "learning_rate": 5.15274880440203e-07, "loss": 1.2335, "step": 15930 }, { "epoch": 2.82, "grad_norm": 4.4375, "learning_rate": 5.101445131171934e-07, "loss": 1.1645, "step": 15935 }, { "epoch": 2.82, "grad_norm": 3.84375, "learning_rate": 5.050395951338926e-07, "loss": 1.1898, "step": 15940 }, { "epoch": 2.83, "grad_norm": 4.25, "learning_rate": 4.999601308957768e-07, "loss": 1.1604, "step": 15945 }, { "epoch": 2.83, "grad_norm": 4.5625, "learning_rate": 4.949061247863462e-07, "loss": 1.2201, "step": 15950 }, { "epoch": 2.83, "grad_norm": 4.8125, "learning_rate": 4.898775811671385e-07, "loss": 1.2203, "step": 15955 }, { "epoch": 2.83, "grad_norm": 5.15625, "learning_rate": 4.848745043777258e-07, "loss": 1.2214, "step": 15960 }, { "epoch": 2.83, "grad_norm": 4.6875, "learning_rate": 4.798968987356911e-07, "loss": 1.2455, "step": 15965 }, { "epoch": 2.83, "grad_norm": 4.71875, "learning_rate": 4.7494476853663484e-07, "loss": 1.2371, "step": 15970 }, { "epoch": 2.83, "grad_norm": 4.75, "learning_rate": 4.7001811805418515e-07, "loss": 1.1929, "step": 15975 }, { "epoch": 2.83, "grad_norm": 4.34375, "learning_rate": 4.651169515399678e-07, "loss": 1.26, "step": 15980 }, { "epoch": 2.83, "grad_norm": 5.09375, "learning_rate": 4.6024127322362274e-07, "loss": 1.1944, "step": 15985 }, { "epoch": 2.83, "grad_norm": 4.375, "learning_rate": 4.553910873127942e-07, "loss": 1.183, "step": 15990 }, { "epoch": 2.83, "grad_norm": 4.21875, "learning_rate": 4.5056639799312405e-07, "loss": 1.2059, "step": 15995 }, { "epoch": 2.84, "grad_norm": 4.3125, "learning_rate": 4.457672094282583e-07, "loss": 1.2161, "step": 16000 }, { "epoch": 2.84, "grad_norm": 4.96875, "learning_rate": 4.409935257598274e-07, "loss": 1.2133, "step": 16005 }, { "epoch": 2.84, "grad_norm": 4.375, "learning_rate": 4.3624535110745267e-07, "loss": 1.2375, "step": 16010 }, { "epoch": 2.84, "grad_norm": 4.625, "learning_rate": 4.315226895687463e-07, "loss": 1.2277, "step": 16015 }, { "epoch": 2.84, "grad_norm": 4.8125, "learning_rate": 4.2682554521929816e-07, "loss": 1.1579, "step": 16020 }, { "epoch": 2.84, "grad_norm": 4.1875, "learning_rate": 4.221539221126824e-07, "loss": 1.2338, "step": 16025 }, { "epoch": 2.84, "grad_norm": 4.6875, "learning_rate": 4.175078242804442e-07, "loss": 1.2099, "step": 16030 }, { "epoch": 2.84, "grad_norm": 4.96875, "learning_rate": 4.128872557320995e-07, "loss": 1.2116, "step": 16035 }, { "epoch": 2.84, "grad_norm": 4.21875, "learning_rate": 4.082922204551354e-07, "loss": 1.2764, "step": 16040 }, { "epoch": 2.84, "grad_norm": 4.78125, "learning_rate": 4.037227224150097e-07, "loss": 1.2124, "step": 16045 }, { "epoch": 2.84, "grad_norm": 4.40625, "learning_rate": 3.991787655551282e-07, "loss": 1.2693, "step": 16050 }, { "epoch": 2.85, "grad_norm": 5.0, "learning_rate": 3.9466035379687406e-07, "loss": 1.1926, "step": 16055 }, { "epoch": 2.85, "grad_norm": 5.46875, "learning_rate": 3.901674910395647e-07, "loss": 1.2205, "step": 16060 }, { "epoch": 2.85, "grad_norm": 3.78125, "learning_rate": 3.857001811604788e-07, "loss": 1.2319, "step": 16065 }, { "epoch": 2.85, "grad_norm": 4.4375, "learning_rate": 3.8125842801485233e-07, "loss": 1.1998, "step": 16070 }, { "epoch": 2.85, "grad_norm": 4.375, "learning_rate": 3.7684223543584915e-07, "loss": 1.2529, "step": 16075 }, { "epoch": 2.85, "grad_norm": 4.0625, "learning_rate": 3.7245160723458406e-07, "loss": 1.2422, "step": 16080 }, { "epoch": 2.85, "grad_norm": 4.25, "learning_rate": 3.680865472001127e-07, "loss": 1.2361, "step": 16085 }, { "epoch": 2.85, "grad_norm": 3.78125, "learning_rate": 3.637470590994185e-07, "loss": 1.2167, "step": 16090 }, { "epoch": 2.85, "grad_norm": 4.53125, "learning_rate": 3.594331466774192e-07, "loss": 1.1572, "step": 16095 }, { "epoch": 2.85, "grad_norm": 5.03125, "learning_rate": 3.5514481365696683e-07, "loss": 1.2401, "step": 16100 }, { "epoch": 2.85, "grad_norm": 4.5625, "learning_rate": 3.508820637388277e-07, "loss": 1.2385, "step": 16105 }, { "epoch": 2.85, "grad_norm": 4.40625, "learning_rate": 3.4664490060169586e-07, "loss": 1.2307, "step": 16110 }, { "epoch": 2.86, "grad_norm": 4.5, "learning_rate": 3.424333279021929e-07, "loss": 1.2208, "step": 16115 }, { "epoch": 2.86, "grad_norm": 4.3125, "learning_rate": 3.3824734927484167e-07, "loss": 1.1774, "step": 16120 }, { "epoch": 2.86, "grad_norm": 4.4375, "learning_rate": 3.340869683320824e-07, "loss": 1.2198, "step": 16125 }, { "epoch": 2.86, "grad_norm": 4.375, "learning_rate": 3.2995218866427644e-07, "loss": 1.2034, "step": 16130 }, { "epoch": 2.86, "grad_norm": 4.21875, "learning_rate": 3.258430138396695e-07, "loss": 1.2151, "step": 16135 }, { "epoch": 2.86, "grad_norm": 4.34375, "learning_rate": 3.2175944740443165e-07, "loss": 1.2524, "step": 16140 }, { "epoch": 2.86, "grad_norm": 4.71875, "learning_rate": 3.177014928826205e-07, "loss": 1.272, "step": 16145 }, { "epoch": 2.86, "grad_norm": 5.03125, "learning_rate": 3.136691537761982e-07, "loss": 1.2611, "step": 16150 }, { "epoch": 2.86, "grad_norm": 4.75, "learning_rate": 3.0966243356501446e-07, "loss": 1.2921, "step": 16155 }, { "epoch": 2.86, "grad_norm": 4.34375, "learning_rate": 3.0568133570681e-07, "loss": 1.1796, "step": 16160 }, { "epoch": 2.86, "grad_norm": 5.03125, "learning_rate": 3.0172586363723e-07, "loss": 1.2796, "step": 16165 }, { "epoch": 2.87, "grad_norm": 5.0625, "learning_rate": 2.977960207697805e-07, "loss": 1.2552, "step": 16170 }, { "epoch": 2.87, "grad_norm": 4.1875, "learning_rate": 2.9389181049586875e-07, "loss": 1.1891, "step": 16175 }, { "epoch": 2.87, "grad_norm": 4.21875, "learning_rate": 2.9001323618477296e-07, "loss": 1.2664, "step": 16180 }, { "epoch": 2.87, "grad_norm": 4.1875, "learning_rate": 2.8616030118364886e-07, "loss": 1.2855, "step": 16185 }, { "epoch": 2.87, "grad_norm": 5.15625, "learning_rate": 2.8233300881752356e-07, "loss": 1.2714, "step": 16190 }, { "epoch": 2.87, "grad_norm": 5.6875, "learning_rate": 2.7853136238930154e-07, "loss": 1.1917, "step": 16195 }, { "epoch": 2.87, "grad_norm": 4.3125, "learning_rate": 2.747553651797485e-07, "loss": 1.2613, "step": 16200 }, { "epoch": 2.87, "grad_norm": 4.625, "learning_rate": 2.710050204475012e-07, "loss": 1.2138, "step": 16205 }, { "epoch": 2.87, "grad_norm": 4.96875, "learning_rate": 2.6728033142905393e-07, "loss": 1.2404, "step": 16210 }, { "epoch": 2.87, "grad_norm": 4.90625, "learning_rate": 2.6358130133876224e-07, "loss": 1.2025, "step": 16215 }, { "epoch": 2.87, "grad_norm": 4.8125, "learning_rate": 2.5990793336883925e-07, "loss": 1.2518, "step": 16220 }, { "epoch": 2.88, "grad_norm": 5.15625, "learning_rate": 2.562602306893491e-07, "loss": 1.2982, "step": 16225 }, { "epoch": 2.88, "grad_norm": 4.625, "learning_rate": 2.526381964482105e-07, "loss": 1.2893, "step": 16230 }, { "epoch": 2.88, "grad_norm": 4.4375, "learning_rate": 2.490418337711864e-07, "loss": 1.215, "step": 16235 }, { "epoch": 2.88, "grad_norm": 4.65625, "learning_rate": 2.4547114576189413e-07, "loss": 1.2429, "step": 16240 }, { "epoch": 2.88, "grad_norm": 4.21875, "learning_rate": 2.419261355017821e-07, "loss": 1.1793, "step": 16245 }, { "epoch": 2.88, "grad_norm": 5.46875, "learning_rate": 2.3840680605014653e-07, "loss": 1.1375, "step": 16250 }, { "epoch": 2.88, "grad_norm": 5.09375, "learning_rate": 2.3491316044412458e-07, "loss": 1.2798, "step": 16255 }, { "epoch": 2.88, "grad_norm": 4.03125, "learning_rate": 2.3144520169868122e-07, "loss": 1.1822, "step": 16260 }, { "epoch": 2.88, "grad_norm": 4.59375, "learning_rate": 2.280029328066191e-07, "loss": 1.1957, "step": 16265 }, { "epoch": 2.88, "grad_norm": 3.828125, "learning_rate": 2.2458635673856531e-07, "loss": 1.1985, "step": 16270 }, { "epoch": 2.88, "grad_norm": 4.09375, "learning_rate": 2.211954764429813e-07, "loss": 1.268, "step": 16275 }, { "epoch": 2.88, "grad_norm": 4.25, "learning_rate": 2.178302948461497e-07, "loss": 1.2402, "step": 16280 }, { "epoch": 2.89, "grad_norm": 4.5625, "learning_rate": 2.144908148521707e-07, "loss": 1.2392, "step": 16285 }, { "epoch": 2.89, "grad_norm": 4.40625, "learning_rate": 2.1117703934297904e-07, "loss": 1.2404, "step": 16290 }, { "epoch": 2.89, "grad_norm": 4.15625, "learning_rate": 2.078889711783105e-07, "loss": 1.2165, "step": 16295 }, { "epoch": 2.89, "grad_norm": 4.875, "learning_rate": 2.0462661319572197e-07, "loss": 1.2774, "step": 16300 }, { "epoch": 2.89, "grad_norm": 4.40625, "learning_rate": 2.013899682105913e-07, "loss": 1.2393, "step": 16305 }, { "epoch": 2.89, "grad_norm": 4.75, "learning_rate": 1.981790390160909e-07, "loss": 1.2066, "step": 16310 }, { "epoch": 2.89, "grad_norm": 4.15625, "learning_rate": 1.949938283832109e-07, "loss": 1.1683, "step": 16315 }, { "epoch": 2.89, "grad_norm": 5.34375, "learning_rate": 1.9183433906074577e-07, "loss": 1.2378, "step": 16320 }, { "epoch": 2.89, "grad_norm": 5.1875, "learning_rate": 1.8870057377528783e-07, "loss": 1.2328, "step": 16325 }, { "epoch": 2.89, "grad_norm": 4.5, "learning_rate": 1.855925352312371e-07, "loss": 1.1982, "step": 16330 }, { "epoch": 2.89, "grad_norm": 4.28125, "learning_rate": 1.8251022611078806e-07, "loss": 1.2686, "step": 16335 }, { "epoch": 2.9, "grad_norm": 5.09375, "learning_rate": 1.7945364907392624e-07, "loss": 1.2487, "step": 16340 }, { "epoch": 2.9, "grad_norm": 4.875, "learning_rate": 1.764228067584417e-07, "loss": 1.2633, "step": 16345 }, { "epoch": 2.9, "grad_norm": 4.96875, "learning_rate": 1.7341770177990545e-07, "loss": 1.1945, "step": 16350 }, { "epoch": 2.9, "grad_norm": 4.4375, "learning_rate": 1.7043833673168642e-07, "loss": 1.269, "step": 16355 }, { "epoch": 2.9, "grad_norm": 5.25, "learning_rate": 1.6748471418493116e-07, "loss": 1.3258, "step": 16360 }, { "epoch": 2.9, "grad_norm": 4.375, "learning_rate": 1.6455683668857412e-07, "loss": 1.2335, "step": 16365 }, { "epoch": 2.9, "grad_norm": 4.96875, "learning_rate": 1.6165470676933747e-07, "loss": 1.228, "step": 16370 }, { "epoch": 2.9, "grad_norm": 4.65625, "learning_rate": 1.5877832693171778e-07, "loss": 1.2766, "step": 16375 }, { "epoch": 2.9, "grad_norm": 4.125, "learning_rate": 1.5592769965798947e-07, "loss": 1.1978, "step": 16380 }, { "epoch": 2.9, "grad_norm": 4.75, "learning_rate": 1.5310282740821135e-07, "loss": 1.2357, "step": 16385 }, { "epoch": 2.9, "grad_norm": 4.0625, "learning_rate": 1.5030371262020005e-07, "loss": 1.1732, "step": 16390 }, { "epoch": 2.91, "grad_norm": 4.75, "learning_rate": 1.4753035770955992e-07, "loss": 1.2298, "step": 16395 }, { "epoch": 2.91, "grad_norm": 4.6875, "learning_rate": 1.447827650696565e-07, "loss": 1.2754, "step": 16400 }, { "epoch": 2.91, "grad_norm": 4.59375, "learning_rate": 1.420609370716197e-07, "loss": 1.1565, "step": 16405 }, { "epoch": 2.91, "grad_norm": 4.03125, "learning_rate": 1.3936487606435733e-07, "loss": 1.245, "step": 16410 }, { "epoch": 2.91, "grad_norm": 4.46875, "learning_rate": 1.3669458437452818e-07, "loss": 1.141, "step": 16415 }, { "epoch": 2.91, "grad_norm": 4.90625, "learning_rate": 1.3405006430655896e-07, "loss": 1.25, "step": 16420 }, { "epoch": 2.91, "grad_norm": 4.75, "learning_rate": 1.3143131814263076e-07, "loss": 1.1836, "step": 16425 }, { "epoch": 2.91, "grad_norm": 4.03125, "learning_rate": 1.2883834814269246e-07, "loss": 1.2555, "step": 16430 }, { "epoch": 2.91, "grad_norm": 4.1875, "learning_rate": 1.262711565444341e-07, "loss": 1.1709, "step": 16435 }, { "epoch": 2.91, "grad_norm": 5.1875, "learning_rate": 1.237297455633135e-07, "loss": 1.4025, "step": 16440 }, { "epoch": 2.91, "grad_norm": 4.6875, "learning_rate": 1.212141173925263e-07, "loss": 1.2073, "step": 16445 }, { "epoch": 2.92, "grad_norm": 4.3125, "learning_rate": 1.1872427420302923e-07, "loss": 1.2578, "step": 16450 }, { "epoch": 2.92, "grad_norm": 4.4375, "learning_rate": 1.162602181435235e-07, "loss": 1.1682, "step": 16455 }, { "epoch": 2.92, "grad_norm": 4.59375, "learning_rate": 1.1382195134045148e-07, "loss": 1.2097, "step": 16460 }, { "epoch": 2.92, "grad_norm": 4.75, "learning_rate": 1.1140947589800999e-07, "loss": 1.1716, "step": 16465 }, { "epoch": 2.92, "grad_norm": 4.46875, "learning_rate": 1.0902279389812697e-07, "loss": 1.2083, "step": 16470 }, { "epoch": 2.92, "grad_norm": 4.46875, "learning_rate": 1.0666190740047821e-07, "loss": 1.2104, "step": 16475 }, { "epoch": 2.92, "grad_norm": 4.78125, "learning_rate": 1.0432681844247727e-07, "loss": 1.2306, "step": 16480 }, { "epoch": 2.92, "grad_norm": 4.84375, "learning_rate": 1.0201752903926886e-07, "loss": 1.2515, "step": 16485 }, { "epoch": 2.92, "grad_norm": 5.09375, "learning_rate": 9.973404118374552e-08, "loss": 1.2126, "step": 16490 }, { "epoch": 2.92, "grad_norm": 4.625, "learning_rate": 9.747635684651757e-08, "loss": 1.1876, "step": 16495 }, { "epoch": 2.92, "grad_norm": 4.28125, "learning_rate": 9.524447797594316e-08, "loss": 1.2459, "step": 16500 }, { "epoch": 2.92, "grad_norm": 4.5, "learning_rate": 9.303840649809825e-08, "loss": 1.2333, "step": 16505 }, { "epoch": 2.93, "grad_norm": 4.84375, "learning_rate": 9.085814431679662e-08, "loss": 1.2255, "step": 16510 }, { "epoch": 2.93, "grad_norm": 4.78125, "learning_rate": 8.870369331356987e-08, "loss": 1.2172, "step": 16515 }, { "epoch": 2.93, "grad_norm": 5.34375, "learning_rate": 8.657505534768073e-08, "loss": 1.259, "step": 16520 }, { "epoch": 2.93, "grad_norm": 5.03125, "learning_rate": 8.447223225611645e-08, "loss": 1.2263, "step": 16525 }, { "epoch": 2.93, "grad_norm": 4.6875, "learning_rate": 8.239522585358206e-08, "loss": 1.1481, "step": 16530 }, { "epoch": 2.93, "grad_norm": 5.09375, "learning_rate": 8.03440379325071e-08, "loss": 1.2165, "step": 16535 }, { "epoch": 2.93, "grad_norm": 5.34375, "learning_rate": 7.831867026303896e-08, "loss": 1.1734, "step": 16540 }, { "epoch": 2.93, "grad_norm": 4.75, "learning_rate": 7.63191245930428e-08, "loss": 1.2312, "step": 16545 }, { "epoch": 2.93, "grad_norm": 4.1875, "learning_rate": 7.434540264809498e-08, "loss": 1.3157, "step": 16550 }, { "epoch": 2.93, "grad_norm": 3.953125, "learning_rate": 7.239750613149299e-08, "loss": 1.2029, "step": 16555 }, { "epoch": 2.93, "grad_norm": 4.46875, "learning_rate": 7.04754367242455e-08, "loss": 1.2632, "step": 16560 }, { "epoch": 2.94, "grad_norm": 5.28125, "learning_rate": 6.857919608506902e-08, "loss": 1.2953, "step": 16565 }, { "epoch": 2.94, "grad_norm": 4.53125, "learning_rate": 6.670878585039452e-08, "loss": 1.1709, "step": 16570 }, { "epoch": 2.94, "grad_norm": 4.15625, "learning_rate": 6.486420763436418e-08, "loss": 1.1543, "step": 16575 }, { "epoch": 2.94, "grad_norm": 3.90625, "learning_rate": 6.304546302881797e-08, "loss": 1.2359, "step": 16580 }, { "epoch": 2.94, "grad_norm": 4.625, "learning_rate": 6.125255360331373e-08, "loss": 1.2346, "step": 16585 }, { "epoch": 2.94, "grad_norm": 4.4375, "learning_rate": 5.948548090510042e-08, "loss": 1.3088, "step": 16590 }, { "epoch": 2.94, "grad_norm": 4.59375, "learning_rate": 5.774424645914156e-08, "loss": 1.188, "step": 16595 }, { "epoch": 2.94, "grad_norm": 4.96875, "learning_rate": 5.6028851768095135e-08, "loss": 1.3111, "step": 16600 }, { "epoch": 2.94, "grad_norm": 4.4375, "learning_rate": 5.433929831233031e-08, "loss": 1.1926, "step": 16605 }, { "epoch": 2.94, "grad_norm": 5.0, "learning_rate": 5.267558754990409e-08, "loss": 1.1958, "step": 16610 }, { "epoch": 2.94, "grad_norm": 4.90625, "learning_rate": 5.103772091657133e-08, "loss": 1.2281, "step": 16615 }, { "epoch": 2.95, "grad_norm": 4.78125, "learning_rate": 4.942569982579803e-08, "loss": 1.2216, "step": 16620 }, { "epoch": 2.95, "grad_norm": 4.59375, "learning_rate": 4.7839525668728066e-08, "loss": 1.2009, "step": 16625 }, { "epoch": 2.95, "grad_norm": 4.5, "learning_rate": 4.6279199814213135e-08, "loss": 1.2543, "step": 16630 }, { "epoch": 2.95, "grad_norm": 4.25, "learning_rate": 4.474472360878945e-08, "loss": 1.2124, "step": 16635 }, { "epoch": 2.95, "grad_norm": 4.21875, "learning_rate": 4.323609837669107e-08, "loss": 1.2459, "step": 16640 }, { "epoch": 2.95, "grad_norm": 4.5625, "learning_rate": 4.175332541983989e-08, "loss": 1.2493, "step": 16645 }, { "epoch": 2.95, "grad_norm": 4.625, "learning_rate": 4.0296406017849005e-08, "loss": 1.3077, "step": 16650 }, { "epoch": 2.95, "grad_norm": 4.5, "learning_rate": 3.886534142801934e-08, "loss": 1.3042, "step": 16655 }, { "epoch": 2.95, "grad_norm": 4.46875, "learning_rate": 3.746013288533967e-08, "loss": 1.2705, "step": 16660 }, { "epoch": 2.95, "grad_norm": 4.59375, "learning_rate": 3.6080781602486625e-08, "loss": 1.2616, "step": 16665 }, { "epoch": 2.95, "grad_norm": 4.34375, "learning_rate": 3.4727288769821344e-08, "loss": 1.1836, "step": 16670 }, { "epoch": 2.95, "grad_norm": 4.9375, "learning_rate": 3.3399655555389485e-08, "loss": 1.1835, "step": 16675 }, { "epoch": 2.96, "grad_norm": 4.1875, "learning_rate": 3.209788310492123e-08, "loss": 1.2594, "step": 16680 }, { "epoch": 2.96, "grad_norm": 5.0625, "learning_rate": 3.0821972541824617e-08, "loss": 1.2392, "step": 16685 }, { "epoch": 2.96, "grad_norm": 5.0, "learning_rate": 2.9571924967195517e-08, "loss": 1.2233, "step": 16690 }, { "epoch": 2.96, "grad_norm": 4.625, "learning_rate": 2.8347741459807673e-08, "loss": 1.2079, "step": 16695 }, { "epoch": 2.96, "grad_norm": 4.5, "learning_rate": 2.7149423076116008e-08, "loss": 1.2368, "step": 16700 }, { "epoch": 2.96, "grad_norm": 4.28125, "learning_rate": 2.597697085024997e-08, "loss": 1.1561, "step": 16705 }, { "epoch": 2.96, "grad_norm": 4.03125, "learning_rate": 2.4830385794020194e-08, "loss": 1.1925, "step": 16710 }, { "epoch": 2.96, "grad_norm": 4.3125, "learning_rate": 2.370966889691517e-08, "loss": 1.2199, "step": 16715 }, { "epoch": 2.96, "grad_norm": 4.375, "learning_rate": 2.2614821126097917e-08, "loss": 1.288, "step": 16720 }, { "epoch": 2.96, "grad_norm": 4.78125, "learning_rate": 2.1545843426405977e-08, "loss": 1.3031, "step": 16725 }, { "epoch": 2.96, "grad_norm": 4.59375, "learning_rate": 2.0502736720354743e-08, "loss": 1.2298, "step": 16730 }, { "epoch": 2.97, "grad_norm": 4.96875, "learning_rate": 1.948550190813081e-08, "loss": 1.2347, "step": 16735 }, { "epoch": 2.97, "grad_norm": 4.5, "learning_rate": 1.849413986758863e-08, "loss": 1.2037, "step": 16740 }, { "epoch": 2.97, "grad_norm": 4.53125, "learning_rate": 1.7528651454267186e-08, "loss": 1.2405, "step": 16745 }, { "epoch": 2.97, "grad_norm": 4.6875, "learning_rate": 1.6589037501363314e-08, "loss": 1.2105, "step": 16750 }, { "epoch": 2.97, "grad_norm": 4.53125, "learning_rate": 1.5675298819755047e-08, "loss": 1.1745, "step": 16755 }, { "epoch": 2.97, "grad_norm": 4.4375, "learning_rate": 1.4787436197984949e-08, "loss": 1.2091, "step": 16760 }, { "epoch": 2.97, "grad_norm": 3.796875, "learning_rate": 1.3925450402266781e-08, "loss": 1.213, "step": 16765 }, { "epoch": 2.97, "grad_norm": 4.5625, "learning_rate": 1.3089342176482167e-08, "loss": 1.227, "step": 16770 }, { "epoch": 2.97, "grad_norm": 4.21875, "learning_rate": 1.227911224218059e-08, "loss": 1.288, "step": 16775 }, { "epoch": 2.97, "grad_norm": 4.0625, "learning_rate": 1.1494761298582735e-08, "loss": 1.228, "step": 16780 }, { "epoch": 2.97, "grad_norm": 4.5625, "learning_rate": 1.0736290022563822e-08, "loss": 1.3013, "step": 16785 }, { "epoch": 2.98, "grad_norm": 5.8125, "learning_rate": 1.0003699068683592e-08, "loss": 1.2895, "step": 16790 }, { "epoch": 2.98, "grad_norm": 4.375, "learning_rate": 9.296989069156325e-09, "loss": 1.2222, "step": 16795 }, { "epoch": 2.98, "grad_norm": 4.0, "learning_rate": 8.616160633857506e-09, "loss": 1.1691, "step": 16800 }, { "epoch": 2.98, "grad_norm": 4.15625, "learning_rate": 7.961214350337143e-09, "loss": 1.2044, "step": 16805 }, { "epoch": 2.98, "grad_norm": 4.875, "learning_rate": 7.332150783803115e-09, "loss": 1.1979, "step": 16810 }, { "epoch": 2.98, "grad_norm": 4.21875, "learning_rate": 6.728970477131169e-09, "loss": 1.2296, "step": 16815 }, { "epoch": 2.98, "grad_norm": 5.125, "learning_rate": 6.151673950854919e-09, "loss": 1.2141, "step": 16820 }, { "epoch": 2.98, "grad_norm": 4.96875, "learning_rate": 5.600261703175846e-09, "loss": 1.2413, "step": 16825 }, { "epoch": 2.98, "grad_norm": 4.71875, "learning_rate": 5.074734209953303e-09, "loss": 1.2685, "step": 16830 }, { "epoch": 2.98, "grad_norm": 4.65625, "learning_rate": 4.575091924707842e-09, "loss": 1.2457, "step": 16835 }, { "epoch": 2.98, "grad_norm": 5.28125, "learning_rate": 4.101335278631213e-09, "loss": 1.2232, "step": 16840 }, { "epoch": 2.99, "grad_norm": 4.71875, "learning_rate": 3.6534646805597147e-09, "loss": 1.205, "step": 16845 }, { "epoch": 2.99, "grad_norm": 4.625, "learning_rate": 3.231480517007501e-09, "loss": 1.2043, "step": 16850 }, { "epoch": 2.99, "grad_norm": 4.28125, "learning_rate": 2.835383152136606e-09, "loss": 1.288, "step": 16855 }, { "epoch": 2.99, "grad_norm": 5.375, "learning_rate": 2.4651729277735955e-09, "loss": 1.1982, "step": 16860 }, { "epoch": 2.99, "grad_norm": 3.921875, "learning_rate": 2.1208501634062405e-09, "loss": 1.1977, "step": 16865 }, { "epoch": 2.99, "grad_norm": 4.46875, "learning_rate": 1.8024151561835123e-09, "loss": 1.2298, "step": 16870 }, { "epoch": 2.99, "grad_norm": 4.09375, "learning_rate": 1.509868180902263e-09, "loss": 1.1935, "step": 16875 }, { "epoch": 2.99, "grad_norm": 4.78125, "learning_rate": 1.2432094900338698e-09, "loss": 1.2461, "step": 16880 }, { "epoch": 2.99, "grad_norm": 3.953125, "learning_rate": 1.0024393136975895e-09, "loss": 1.2549, "step": 16885 }, { "epoch": 2.99, "grad_norm": 4.125, "learning_rate": 7.875578596772126e-10, "loss": 1.2171, "step": 16890 }, { "epoch": 2.99, "grad_norm": 4.71875, "learning_rate": 5.985653134077396e-10, "loss": 1.2405, "step": 16895 }, { "epoch": 2.99, "grad_norm": 4.40625, "learning_rate": 4.35461837992035e-10, "loss": 1.2141, "step": 16900 }, { "epoch": 3.0, "grad_norm": 4.15625, "learning_rate": 2.982475741841739e-10, "loss": 1.2289, "step": 16905 }, { "epoch": 3.0, "grad_norm": 4.09375, "learning_rate": 1.8692264039943397e-10, "loss": 1.2016, "step": 16910 }, { "epoch": 3.0, "grad_norm": 4.1875, "learning_rate": 1.0148713270763387e-10, "loss": 1.2719, "step": 16915 }, { "epoch": 3.0, "grad_norm": 4.09375, "learning_rate": 4.194112483979495e-11, "loss": 1.2198, "step": 16920 }, { "epoch": 3.0, "grad_norm": 4.59375, "learning_rate": 8.284668184810328e-12, "loss": 1.2718, "step": 16925 }, { "epoch": 3.0, "step": 16929, "total_flos": 3.449475439306998e+18, "train_loss": 1.257252904005387, "train_runtime": 50222.2429, "train_samples_per_second": 21.574, "train_steps_per_second": 0.337 } ], "logging_steps": 5, "max_steps": 16929, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 30000, "total_flos": 3.449475439306998e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }