lop_jan2025 / trainer_state.json (PEFT · Safetensors)
obulikrish · commit 6376dcc
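
The JSON below is the Hugging Face Trainer state for this run: best_metric and best_model_checkpoint record the best evaluation result, and log_history holds one entry per logging step (epoch, grad_norm, learning_rate, loss, step). A minimal sketch for inspecting it, assuming Python's standard json module and a local copy saved as trainer_state.json (the path is an assumption, not part of this listing):

import json

# Load the trainer state; Python's json parser accepts the NaN that appears
# in the first grad_norm entry and maps it to float("nan").
with open("trainer_state.json") as f:  # assumed local path
    state = json.load(f)

print(state["best_metric"], state["best_model_checkpoint"])

# Keep only entries that carry a training loss; any other log entries are skipped.
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
print(f"{len(train_logs)} logged steps; last loss {losses[-1]:.4f} at step {steps[-1]}")

The raw trainer_state.json contents follow.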
{
"best_metric": 0.5111260414123535,
"best_model_checkpoint": "saved_model/lop_jan2025/checkpoint-8506",
"epoch": 0.9999118347194874,
"eval_steps": 500,
"global_step": 8506,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 73.2096,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 11.857208251953125,
"learning_rate": 2.0000000000000003e-06,
"loss": 73.226,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 14.41334056854248,
"learning_rate": 7.000000000000001e-06,
"loss": 73.0485,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 13.21112060546875,
"learning_rate": 1.2e-05,
"loss": 71.6132,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 14.494720458984375,
"learning_rate": 1.7000000000000003e-05,
"loss": 69.3524,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 19.041685104370117,
"learning_rate": 2.2000000000000003e-05,
"loss": 64.7244,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 26.15471076965332,
"learning_rate": 2.7000000000000002e-05,
"loss": 55.6446,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 35.239906311035156,
"learning_rate": 3.2000000000000005e-05,
"loss": 38.204,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 21.781131744384766,
"learning_rate": 3.7e-05,
"loss": 14.7707,
"step": 80
},
{
"epoch": 0.01,
"grad_norm": 13.038077354431152,
"learning_rate": 4.2e-05,
"loss": 2.6304,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 9.131162643432617,
"learning_rate": 4.7e-05,
"loss": 1.5558,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 7.18823766708374,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.4542,
"step": 110
},
{
"epoch": 0.01,
"grad_norm": 8.378619194030762,
"learning_rate": 5.6999999999999996e-05,
"loss": 1.3911,
"step": 120
},
{
"epoch": 0.02,
"grad_norm": 5.996410369873047,
"learning_rate": 6.2e-05,
"loss": 1.2997,
"step": 130
},
{
"epoch": 0.02,
"grad_norm": 6.906871795654297,
"learning_rate": 6.7e-05,
"loss": 1.2202,
"step": 140
},
{
"epoch": 0.02,
"grad_norm": 4.491147994995117,
"learning_rate": 7.2e-05,
"loss": 1.2298,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 4.182204723358154,
"learning_rate": 7.7e-05,
"loss": 1.2139,
"step": 160
},
{
"epoch": 0.02,
"grad_norm": 4.557494163513184,
"learning_rate": 8.2e-05,
"loss": 1.12,
"step": 170
},
{
"epoch": 0.02,
"grad_norm": 6.10492467880249,
"learning_rate": 8.7e-05,
"loss": 1.0733,
"step": 180
},
{
"epoch": 0.02,
"grad_norm": 4.44232177734375,
"learning_rate": 9.200000000000001e-05,
"loss": 0.9548,
"step": 190
},
{
"epoch": 0.02,
"grad_norm": 4.1531524658203125,
"learning_rate": 9.7e-05,
"loss": 0.9438,
"step": 200
},
{
"epoch": 0.02,
"grad_norm": 2.814894676208496,
"learning_rate": 9.999528635399482e-05,
"loss": 0.9362,
"step": 210
},
{
"epoch": 0.03,
"grad_norm": 4.083724021911621,
"learning_rate": 9.998350223898186e-05,
"loss": 0.8759,
"step": 220
},
{
"epoch": 0.03,
"grad_norm": 2.8454160690307617,
"learning_rate": 9.99717181239689e-05,
"loss": 0.8088,
"step": 230
},
{
"epoch": 0.03,
"grad_norm": 5.060369491577148,
"learning_rate": 9.995993400895593e-05,
"loss": 0.8316,
"step": 240
},
{
"epoch": 0.03,
"grad_norm": 2.6055431365966797,
"learning_rate": 9.994814989394296e-05,
"loss": 0.7956,
"step": 250
},
{
"epoch": 0.03,
"grad_norm": 3.7556564807891846,
"learning_rate": 9.993636577893001e-05,
"loss": 0.7444,
"step": 260
},
{
"epoch": 0.03,
"grad_norm": 4.718788146972656,
"learning_rate": 9.992458166391705e-05,
"loss": 0.7605,
"step": 270
},
{
"epoch": 0.03,
"grad_norm": 3.2772555351257324,
"learning_rate": 9.991279754890408e-05,
"loss": 0.7491,
"step": 280
},
{
"epoch": 0.03,
"grad_norm": 2.8285696506500244,
"learning_rate": 9.990101343389112e-05,
"loss": 0.7012,
"step": 290
},
{
"epoch": 0.04,
"grad_norm": 3.104058265686035,
"learning_rate": 9.988922931887816e-05,
"loss": 0.7206,
"step": 300
},
{
"epoch": 0.04,
"grad_norm": 2.573132276535034,
"learning_rate": 9.98774452038652e-05,
"loss": 0.6939,
"step": 310
},
{
"epoch": 0.04,
"grad_norm": 2.5289745330810547,
"learning_rate": 9.986566108885223e-05,
"loss": 0.6694,
"step": 320
},
{
"epoch": 0.04,
"grad_norm": 1.5782195329666138,
"learning_rate": 9.985387697383927e-05,
"loss": 0.687,
"step": 330
},
{
"epoch": 0.04,
"grad_norm": 2.2761898040771484,
"learning_rate": 9.984209285882631e-05,
"loss": 0.6377,
"step": 340
},
{
"epoch": 0.04,
"grad_norm": 3.468121290206909,
"learning_rate": 9.983030874381334e-05,
"loss": 0.6669,
"step": 350
},
{
"epoch": 0.04,
"grad_norm": 1.6436119079589844,
"learning_rate": 9.981852462880038e-05,
"loss": 0.6371,
"step": 360
},
{
"epoch": 0.04,
"grad_norm": 2.254734754562378,
"learning_rate": 9.980674051378743e-05,
"loss": 0.6509,
"step": 370
},
{
"epoch": 0.04,
"grad_norm": 3.150641679763794,
"learning_rate": 9.979495639877445e-05,
"loss": 0.645,
"step": 380
},
{
"epoch": 0.05,
"grad_norm": 2.567556858062744,
"learning_rate": 9.978317228376149e-05,
"loss": 0.6414,
"step": 390
},
{
"epoch": 0.05,
"grad_norm": 2.490185260772705,
"learning_rate": 9.977138816874853e-05,
"loss": 0.6607,
"step": 400
},
{
"epoch": 0.05,
"grad_norm": 2.325997829437256,
"learning_rate": 9.975960405373557e-05,
"loss": 0.5877,
"step": 410
},
{
"epoch": 0.05,
"grad_norm": 3.3636696338653564,
"learning_rate": 9.974781993872261e-05,
"loss": 0.6222,
"step": 420
},
{
"epoch": 0.05,
"grad_norm": 2.3623595237731934,
"learning_rate": 9.973603582370964e-05,
"loss": 0.6083,
"step": 430
},
{
"epoch": 0.05,
"grad_norm": 2.102452278137207,
"learning_rate": 9.972425170869669e-05,
"loss": 0.5983,
"step": 440
},
{
"epoch": 0.05,
"grad_norm": 2.635756492614746,
"learning_rate": 9.971246759368373e-05,
"loss": 0.6338,
"step": 450
},
{
"epoch": 0.05,
"grad_norm": 2.1941983699798584,
"learning_rate": 9.970068347867075e-05,
"loss": 0.6216,
"step": 460
},
{
"epoch": 0.06,
"grad_norm": 1.1595548391342163,
"learning_rate": 9.968889936365779e-05,
"loss": 0.596,
"step": 470
},
{
"epoch": 0.06,
"grad_norm": 2.0213301181793213,
"learning_rate": 9.967711524864483e-05,
"loss": 0.6187,
"step": 480
},
{
"epoch": 0.06,
"grad_norm": 2.55887508392334,
"learning_rate": 9.966533113363187e-05,
"loss": 0.6264,
"step": 490
},
{
"epoch": 0.06,
"grad_norm": 2.3320772647857666,
"learning_rate": 9.965354701861891e-05,
"loss": 0.5847,
"step": 500
},
{
"epoch": 0.06,
"grad_norm": 1.8742616176605225,
"learning_rate": 9.964176290360593e-05,
"loss": 0.6231,
"step": 510
},
{
"epoch": 0.06,
"grad_norm": 2.782414436340332,
"learning_rate": 9.962997878859299e-05,
"loss": 0.5919,
"step": 520
},
{
"epoch": 0.06,
"grad_norm": 2.733325958251953,
"learning_rate": 9.961819467358003e-05,
"loss": 0.5951,
"step": 530
},
{
"epoch": 0.06,
"grad_norm": 1.784854769706726,
"learning_rate": 9.960641055856705e-05,
"loss": 0.5719,
"step": 540
},
{
"epoch": 0.06,
"grad_norm": 2.787519693374634,
"learning_rate": 9.959462644355409e-05,
"loss": 0.6018,
"step": 550
},
{
"epoch": 0.07,
"grad_norm": 1.7599632740020752,
"learning_rate": 9.958284232854113e-05,
"loss": 0.6124,
"step": 560
},
{
"epoch": 0.07,
"grad_norm": 3.3914382457733154,
"learning_rate": 9.957105821352817e-05,
"loss": 0.5862,
"step": 570
},
{
"epoch": 0.07,
"grad_norm": 2.6275715827941895,
"learning_rate": 9.95592740985152e-05,
"loss": 0.6412,
"step": 580
},
{
"epoch": 0.07,
"grad_norm": 3.2809360027313232,
"learning_rate": 9.954748998350225e-05,
"loss": 0.5979,
"step": 590
},
{
"epoch": 0.07,
"grad_norm": 4.346547603607178,
"learning_rate": 9.953570586848928e-05,
"loss": 0.6233,
"step": 600
},
{
"epoch": 0.07,
"grad_norm": 2.7601490020751953,
"learning_rate": 9.952392175347632e-05,
"loss": 0.5613,
"step": 610
},
{
"epoch": 0.07,
"grad_norm": 2.135434627532959,
"learning_rate": 9.951213763846335e-05,
"loss": 0.5924,
"step": 620
},
{
"epoch": 0.07,
"grad_norm": 1.0876432657241821,
"learning_rate": 9.95003535234504e-05,
"loss": 0.5469,
"step": 630
},
{
"epoch": 0.08,
"grad_norm": 3.1632611751556396,
"learning_rate": 9.948856940843743e-05,
"loss": 0.5954,
"step": 640
},
{
"epoch": 0.08,
"grad_norm": 3.198110818862915,
"learning_rate": 9.947678529342447e-05,
"loss": 0.5542,
"step": 650
},
{
"epoch": 0.08,
"grad_norm": 1.968692660331726,
"learning_rate": 9.94650011784115e-05,
"loss": 0.6,
"step": 660
},
{
"epoch": 0.08,
"grad_norm": 1.6521714925765991,
"learning_rate": 9.945321706339854e-05,
"loss": 0.5513,
"step": 670
},
{
"epoch": 0.08,
"grad_norm": 1.7843347787857056,
"learning_rate": 9.944143294838558e-05,
"loss": 0.5718,
"step": 680
},
{
"epoch": 0.08,
"grad_norm": 2.2484843730926514,
"learning_rate": 9.942964883337262e-05,
"loss": 0.5852,
"step": 690
},
{
"epoch": 0.08,
"grad_norm": 2.0752651691436768,
"learning_rate": 9.941786471835966e-05,
"loss": 0.6152,
"step": 700
},
{
"epoch": 0.08,
"grad_norm": 1.9290324449539185,
"learning_rate": 9.94060806033467e-05,
"loss": 0.5755,
"step": 710
},
{
"epoch": 0.08,
"grad_norm": 2.6076815128326416,
"learning_rate": 9.939429648833373e-05,
"loss": 0.6025,
"step": 720
},
{
"epoch": 0.09,
"grad_norm": 2.5126802921295166,
"learning_rate": 9.938251237332076e-05,
"loss": 0.5695,
"step": 730
},
{
"epoch": 0.09,
"grad_norm": 1.2637362480163574,
"learning_rate": 9.93707282583078e-05,
"loss": 0.5785,
"step": 740
},
{
"epoch": 0.09,
"grad_norm": 1.2289626598358154,
"learning_rate": 9.935894414329484e-05,
"loss": 0.5738,
"step": 750
},
{
"epoch": 0.09,
"grad_norm": 1.6401044130325317,
"learning_rate": 9.934716002828188e-05,
"loss": 0.5629,
"step": 760
},
{
"epoch": 0.09,
"grad_norm": 3.114750862121582,
"learning_rate": 9.933537591326891e-05,
"loss": 0.5856,
"step": 770
},
{
"epoch": 0.09,
"grad_norm": 2.1406428813934326,
"learning_rate": 9.932359179825596e-05,
"loss": 0.5775,
"step": 780
},
{
"epoch": 0.09,
"grad_norm": 1.9738190174102783,
"learning_rate": 9.9311807683243e-05,
"loss": 0.5724,
"step": 790
},
{
"epoch": 0.09,
"grad_norm": 1.7785269021987915,
"learning_rate": 9.930002356823002e-05,
"loss": 0.5755,
"step": 800
},
{
"epoch": 0.1,
"grad_norm": 2.815354108810425,
"learning_rate": 9.928823945321706e-05,
"loss": 0.5704,
"step": 810
},
{
"epoch": 0.1,
"grad_norm": 2.890578269958496,
"learning_rate": 9.927645533820412e-05,
"loss": 0.5678,
"step": 820
},
{
"epoch": 0.1,
"grad_norm": 1.484841227531433,
"learning_rate": 9.926467122319114e-05,
"loss": 0.5439,
"step": 830
},
{
"epoch": 0.1,
"grad_norm": 1.4272996187210083,
"learning_rate": 9.925288710817818e-05,
"loss": 0.5632,
"step": 840
},
{
"epoch": 0.1,
"grad_norm": 2.043959379196167,
"learning_rate": 9.924110299316522e-05,
"loss": 0.5539,
"step": 850
},
{
"epoch": 0.1,
"grad_norm": 2.2277722358703613,
"learning_rate": 9.922931887815226e-05,
"loss": 0.5579,
"step": 860
},
{
"epoch": 0.1,
"grad_norm": 3.0078842639923096,
"learning_rate": 9.92175347631393e-05,
"loss": 0.5601,
"step": 870
},
{
"epoch": 0.1,
"grad_norm": 3.3610494136810303,
"learning_rate": 9.920575064812632e-05,
"loss": 0.5421,
"step": 880
},
{
"epoch": 0.1,
"grad_norm": 1.7380688190460205,
"learning_rate": 9.919396653311338e-05,
"loss": 0.5443,
"step": 890
},
{
"epoch": 0.11,
"grad_norm": 1.9940533638000488,
"learning_rate": 9.91821824181004e-05,
"loss": 0.5875,
"step": 900
},
{
"epoch": 0.11,
"grad_norm": 1.2586497068405151,
"learning_rate": 9.917039830308744e-05,
"loss": 0.5336,
"step": 910
},
{
"epoch": 0.11,
"grad_norm": 1.4775793552398682,
"learning_rate": 9.915861418807448e-05,
"loss": 0.5849,
"step": 920
},
{
"epoch": 0.11,
"grad_norm": 2.0493359565734863,
"learning_rate": 9.914683007306152e-05,
"loss": 0.5304,
"step": 930
},
{
"epoch": 0.11,
"grad_norm": 1.54847252368927,
"learning_rate": 9.913504595804856e-05,
"loss": 0.5567,
"step": 940
},
{
"epoch": 0.11,
"grad_norm": 1.4110504388809204,
"learning_rate": 9.91232618430356e-05,
"loss": 0.5256,
"step": 950
},
{
"epoch": 0.11,
"grad_norm": 2.1841001510620117,
"learning_rate": 9.911147772802263e-05,
"loss": 0.5481,
"step": 960
},
{
"epoch": 0.11,
"grad_norm": 1.7482472658157349,
"learning_rate": 9.909969361300967e-05,
"loss": 0.577,
"step": 970
},
{
"epoch": 0.12,
"grad_norm": 1.8513494729995728,
"learning_rate": 9.90879094979967e-05,
"loss": 0.5776,
"step": 980
},
{
"epoch": 0.12,
"grad_norm": 1.2556830644607544,
"learning_rate": 9.907612538298374e-05,
"loss": 0.543,
"step": 990
},
{
"epoch": 0.12,
"grad_norm": 1.3181068897247314,
"learning_rate": 9.906434126797079e-05,
"loss": 0.5214,
"step": 1000
},
{
"epoch": 0.12,
"grad_norm": 1.9610787630081177,
"learning_rate": 9.905255715295782e-05,
"loss": 0.5513,
"step": 1010
},
{
"epoch": 0.12,
"grad_norm": 2.0105204582214355,
"learning_rate": 9.904077303794486e-05,
"loss": 0.53,
"step": 1020
},
{
"epoch": 0.12,
"grad_norm": 1.932958722114563,
"learning_rate": 9.90289889229319e-05,
"loss": 0.5575,
"step": 1030
},
{
"epoch": 0.12,
"grad_norm": 2.1123929023742676,
"learning_rate": 9.901720480791893e-05,
"loss": 0.5338,
"step": 1040
},
{
"epoch": 0.12,
"grad_norm": 1.497371792793274,
"learning_rate": 9.900542069290597e-05,
"loss": 0.534,
"step": 1050
},
{
"epoch": 0.12,
"grad_norm": 1.7074368000030518,
"learning_rate": 9.8993636577893e-05,
"loss": 0.5537,
"step": 1060
},
{
"epoch": 0.13,
"grad_norm": 1.0042043924331665,
"learning_rate": 9.898185246288004e-05,
"loss": 0.5492,
"step": 1070
},
{
"epoch": 0.13,
"grad_norm": 1.1898843050003052,
"learning_rate": 9.897006834786709e-05,
"loss": 0.5565,
"step": 1080
},
{
"epoch": 0.13,
"grad_norm": 2.4821114540100098,
"learning_rate": 9.895828423285411e-05,
"loss": 0.553,
"step": 1090
},
{
"epoch": 0.13,
"grad_norm": 2.453070640563965,
"learning_rate": 9.894650011784115e-05,
"loss": 0.5709,
"step": 1100
},
{
"epoch": 0.13,
"grad_norm": 1.8405230045318604,
"learning_rate": 9.893471600282819e-05,
"loss": 0.5337,
"step": 1110
},
{
"epoch": 0.13,
"grad_norm": 1.9227620363235474,
"learning_rate": 9.892293188781523e-05,
"loss": 0.5495,
"step": 1120
},
{
"epoch": 0.13,
"grad_norm": 1.3059393167495728,
"learning_rate": 9.891114777280227e-05,
"loss": 0.5573,
"step": 1130
},
{
"epoch": 0.13,
"grad_norm": 1.0983681678771973,
"learning_rate": 9.88993636577893e-05,
"loss": 0.5375,
"step": 1140
},
{
"epoch": 0.14,
"grad_norm": 1.1052865982055664,
"learning_rate": 9.888757954277635e-05,
"loss": 0.5252,
"step": 1150
},
{
"epoch": 0.14,
"grad_norm": 1.7332347631454468,
"learning_rate": 9.887579542776339e-05,
"loss": 0.5352,
"step": 1160
},
{
"epoch": 0.14,
"grad_norm": 2.7764995098114014,
"learning_rate": 9.886401131275041e-05,
"loss": 0.554,
"step": 1170
},
{
"epoch": 0.14,
"grad_norm": 2.8629496097564697,
"learning_rate": 9.885222719773745e-05,
"loss": 0.549,
"step": 1180
},
{
"epoch": 0.14,
"grad_norm": 1.0409201383590698,
"learning_rate": 9.884044308272449e-05,
"loss": 0.535,
"step": 1190
},
{
"epoch": 0.14,
"grad_norm": 1.2447623014450073,
"learning_rate": 9.882865896771153e-05,
"loss": 0.5517,
"step": 1200
},
{
"epoch": 0.14,
"grad_norm": 1.1214842796325684,
"learning_rate": 9.881687485269857e-05,
"loss": 0.5627,
"step": 1210
},
{
"epoch": 0.14,
"grad_norm": 1.171769618988037,
"learning_rate": 9.880509073768561e-05,
"loss": 0.5285,
"step": 1220
},
{
"epoch": 0.14,
"grad_norm": 1.3193728923797607,
"learning_rate": 9.879330662267265e-05,
"loss": 0.5207,
"step": 1230
},
{
"epoch": 0.15,
"grad_norm": 1.4482805728912354,
"learning_rate": 9.878152250765967e-05,
"loss": 0.5207,
"step": 1240
},
{
"epoch": 0.15,
"grad_norm": 2.4224910736083984,
"learning_rate": 9.876973839264671e-05,
"loss": 0.5561,
"step": 1250
},
{
"epoch": 0.15,
"grad_norm": 1.8356711864471436,
"learning_rate": 9.875795427763376e-05,
"loss": 0.521,
"step": 1260
},
{
"epoch": 0.15,
"grad_norm": 1.1184781789779663,
"learning_rate": 9.874617016262079e-05,
"loss": 0.5391,
"step": 1270
},
{
"epoch": 0.15,
"grad_norm": 1.017152190208435,
"learning_rate": 9.873438604760783e-05,
"loss": 0.5282,
"step": 1280
},
{
"epoch": 0.15,
"grad_norm": 1.2299338579177856,
"learning_rate": 9.872260193259487e-05,
"loss": 0.5409,
"step": 1290
},
{
"epoch": 0.15,
"grad_norm": 2.4953067302703857,
"learning_rate": 9.87108178175819e-05,
"loss": 0.5358,
"step": 1300
},
{
"epoch": 0.15,
"grad_norm": 2.0601046085357666,
"learning_rate": 9.869903370256895e-05,
"loss": 0.507,
"step": 1310
},
{
"epoch": 0.16,
"grad_norm": 1.5593374967575073,
"learning_rate": 9.868724958755597e-05,
"loss": 0.5391,
"step": 1320
},
{
"epoch": 0.16,
"grad_norm": 1.1601495742797852,
"learning_rate": 9.867546547254301e-05,
"loss": 0.5384,
"step": 1330
},
{
"epoch": 0.16,
"grad_norm": 1.0274831056594849,
"learning_rate": 9.866368135753006e-05,
"loss": 0.5286,
"step": 1340
},
{
"epoch": 0.16,
"grad_norm": 1.367141842842102,
"learning_rate": 9.865189724251709e-05,
"loss": 0.5395,
"step": 1350
},
{
"epoch": 0.16,
"grad_norm": 1.800572156906128,
"learning_rate": 9.864011312750413e-05,
"loss": 0.5343,
"step": 1360
},
{
"epoch": 0.16,
"grad_norm": 1.111892580986023,
"learning_rate": 9.862832901249117e-05,
"loss": 0.5407,
"step": 1370
},
{
"epoch": 0.16,
"grad_norm": 1.3224807977676392,
"learning_rate": 9.86165448974782e-05,
"loss": 0.5031,
"step": 1380
},
{
"epoch": 0.16,
"grad_norm": 2.254096031188965,
"learning_rate": 9.860476078246524e-05,
"loss": 0.5449,
"step": 1390
},
{
"epoch": 0.16,
"grad_norm": 1.751460313796997,
"learning_rate": 9.859297666745227e-05,
"loss": 0.533,
"step": 1400
},
{
"epoch": 0.17,
"grad_norm": 0.8202616572380066,
"learning_rate": 9.858119255243932e-05,
"loss": 0.5466,
"step": 1410
},
{
"epoch": 0.17,
"grad_norm": 1.2575254440307617,
"learning_rate": 9.856940843742636e-05,
"loss": 0.5433,
"step": 1420
},
{
"epoch": 0.17,
"grad_norm": 1.6363134384155273,
"learning_rate": 9.855762432241339e-05,
"loss": 0.5107,
"step": 1430
},
{
"epoch": 0.17,
"grad_norm": 1.349046230316162,
"learning_rate": 9.854584020740043e-05,
"loss": 0.5235,
"step": 1440
},
{
"epoch": 0.17,
"grad_norm": 1.1566694974899292,
"learning_rate": 9.853405609238746e-05,
"loss": 0.5424,
"step": 1450
},
{
"epoch": 0.17,
"grad_norm": 1.0542116165161133,
"learning_rate": 9.85222719773745e-05,
"loss": 0.5313,
"step": 1460
},
{
"epoch": 0.17,
"grad_norm": 0.9067943692207336,
"learning_rate": 9.851048786236154e-05,
"loss": 0.5318,
"step": 1470
},
{
"epoch": 0.17,
"grad_norm": 0.9999909996986389,
"learning_rate": 9.849870374734858e-05,
"loss": 0.512,
"step": 1480
},
{
"epoch": 0.18,
"grad_norm": 1.1185346841812134,
"learning_rate": 9.848691963233562e-05,
"loss": 0.5351,
"step": 1490
},
{
"epoch": 0.18,
"grad_norm": 1.613094687461853,
"learning_rate": 9.847513551732266e-05,
"loss": 0.5293,
"step": 1500
},
{
"epoch": 0.18,
"grad_norm": 0.7273901104927063,
"learning_rate": 9.846335140230968e-05,
"loss": 0.5173,
"step": 1510
},
{
"epoch": 0.18,
"grad_norm": 1.6152803897857666,
"learning_rate": 9.845156728729674e-05,
"loss": 0.5399,
"step": 1520
},
{
"epoch": 0.18,
"grad_norm": 1.870840072631836,
"learning_rate": 9.843978317228376e-05,
"loss": 0.5316,
"step": 1530
},
{
"epoch": 0.18,
"grad_norm": 1.6257565021514893,
"learning_rate": 9.84279990572708e-05,
"loss": 0.5122,
"step": 1540
},
{
"epoch": 0.18,
"grad_norm": 0.9900372624397278,
"learning_rate": 9.841621494225784e-05,
"loss": 0.5485,
"step": 1550
},
{
"epoch": 0.18,
"grad_norm": 1.493819236755371,
"learning_rate": 9.840443082724488e-05,
"loss": 0.5257,
"step": 1560
},
{
"epoch": 0.18,
"grad_norm": 1.255489468574524,
"learning_rate": 9.839264671223192e-05,
"loss": 0.5207,
"step": 1570
},
{
"epoch": 0.19,
"grad_norm": 0.7504528760910034,
"learning_rate": 9.838086259721896e-05,
"loss": 0.523,
"step": 1580
},
{
"epoch": 0.19,
"grad_norm": 1.1679805517196655,
"learning_rate": 9.836907848220598e-05,
"loss": 0.51,
"step": 1590
},
{
"epoch": 0.19,
"grad_norm": 1.8699631690979004,
"learning_rate": 9.835729436719304e-05,
"loss": 0.5127,
"step": 1600
},
{
"epoch": 0.19,
"grad_norm": 1.3806575536727905,
"learning_rate": 9.834551025218006e-05,
"loss": 0.5056,
"step": 1610
},
{
"epoch": 0.19,
"grad_norm": 2.5336737632751465,
"learning_rate": 9.83337261371671e-05,
"loss": 0.5112,
"step": 1620
},
{
"epoch": 0.19,
"grad_norm": 1.1178169250488281,
"learning_rate": 9.832194202215414e-05,
"loss": 0.5091,
"step": 1630
},
{
"epoch": 0.19,
"grad_norm": 1.2269675731658936,
"learning_rate": 9.831015790714118e-05,
"loss": 0.5227,
"step": 1640
},
{
"epoch": 0.19,
"grad_norm": 2.6915369033813477,
"learning_rate": 9.829837379212822e-05,
"loss": 0.5428,
"step": 1650
},
{
"epoch": 0.2,
"grad_norm": 1.2049098014831543,
"learning_rate": 9.828658967711524e-05,
"loss": 0.5104,
"step": 1660
},
{
"epoch": 0.2,
"grad_norm": 1.3738254308700562,
"learning_rate": 9.82748055621023e-05,
"loss": 0.5032,
"step": 1670
},
{
"epoch": 0.2,
"grad_norm": 1.2295957803726196,
"learning_rate": 9.826302144708933e-05,
"loss": 0.514,
"step": 1680
},
{
"epoch": 0.2,
"grad_norm": 1.3847508430480957,
"learning_rate": 9.825123733207636e-05,
"loss": 0.5019,
"step": 1690
},
{
"epoch": 0.2,
"grad_norm": 1.2683600187301636,
"learning_rate": 9.82394532170634e-05,
"loss": 0.5202,
"step": 1700
},
{
"epoch": 0.2,
"grad_norm": 1.1210131645202637,
"learning_rate": 9.822766910205045e-05,
"loss": 0.5306,
"step": 1710
},
{
"epoch": 0.2,
"grad_norm": 1.4907463788986206,
"learning_rate": 9.821588498703748e-05,
"loss": 0.5179,
"step": 1720
},
{
"epoch": 0.2,
"grad_norm": 1.2331088781356812,
"learning_rate": 9.820410087202452e-05,
"loss": 0.5032,
"step": 1730
},
{
"epoch": 0.2,
"grad_norm": 0.8801277279853821,
"learning_rate": 9.819231675701155e-05,
"loss": 0.5025,
"step": 1740
},
{
"epoch": 0.21,
"grad_norm": 0.7929331064224243,
"learning_rate": 9.81805326419986e-05,
"loss": 0.5138,
"step": 1750
},
{
"epoch": 0.21,
"grad_norm": 1.3621491193771362,
"learning_rate": 9.816874852698563e-05,
"loss": 0.5124,
"step": 1760
},
{
"epoch": 0.21,
"grad_norm": 0.6551066637039185,
"learning_rate": 9.815696441197266e-05,
"loss": 0.5243,
"step": 1770
},
{
"epoch": 0.21,
"grad_norm": 1.429326057434082,
"learning_rate": 9.814518029695971e-05,
"loss": 0.5044,
"step": 1780
},
{
"epoch": 0.21,
"grad_norm": 0.9943854212760925,
"learning_rate": 9.813339618194674e-05,
"loss": 0.5074,
"step": 1790
},
{
"epoch": 0.21,
"grad_norm": 1.0659611225128174,
"learning_rate": 9.812161206693378e-05,
"loss": 0.5147,
"step": 1800
},
{
"epoch": 0.21,
"grad_norm": 1.1857000589370728,
"learning_rate": 9.810982795192081e-05,
"loss": 0.5238,
"step": 1810
},
{
"epoch": 0.21,
"grad_norm": 1.0947470664978027,
"learning_rate": 9.809804383690785e-05,
"loss": 0.5128,
"step": 1820
},
{
"epoch": 0.22,
"grad_norm": 1.282160758972168,
"learning_rate": 9.808625972189489e-05,
"loss": 0.5124,
"step": 1830
},
{
"epoch": 0.22,
"grad_norm": 1.1156092882156372,
"learning_rate": 9.807447560688193e-05,
"loss": 0.5144,
"step": 1840
},
{
"epoch": 0.22,
"grad_norm": 0.7073375582695007,
"learning_rate": 9.806269149186896e-05,
"loss": 0.5084,
"step": 1850
},
{
"epoch": 0.22,
"grad_norm": 2.155949354171753,
"learning_rate": 9.805090737685601e-05,
"loss": 0.4977,
"step": 1860
},
{
"epoch": 0.22,
"grad_norm": 1.0926837921142578,
"learning_rate": 9.803912326184303e-05,
"loss": 0.5191,
"step": 1870
},
{
"epoch": 0.22,
"grad_norm": 1.2038774490356445,
"learning_rate": 9.802733914683007e-05,
"loss": 0.5054,
"step": 1880
},
{
"epoch": 0.22,
"grad_norm": 1.1242024898529053,
"learning_rate": 9.801555503181711e-05,
"loss": 0.5601,
"step": 1890
},
{
"epoch": 0.22,
"grad_norm": 1.3172681331634521,
"learning_rate": 9.800377091680415e-05,
"loss": 0.5076,
"step": 1900
},
{
"epoch": 0.22,
"grad_norm": 0.8751928806304932,
"learning_rate": 9.799198680179119e-05,
"loss": 0.5118,
"step": 1910
},
{
"epoch": 0.23,
"grad_norm": 0.8687849640846252,
"learning_rate": 9.798020268677823e-05,
"loss": 0.5008,
"step": 1920
},
{
"epoch": 0.23,
"grad_norm": 1.1417611837387085,
"learning_rate": 9.796841857176527e-05,
"loss": 0.5017,
"step": 1930
},
{
"epoch": 0.23,
"grad_norm": 2.0473146438598633,
"learning_rate": 9.795663445675231e-05,
"loss": 0.5282,
"step": 1940
},
{
"epoch": 0.23,
"grad_norm": 1.1375166177749634,
"learning_rate": 9.794485034173933e-05,
"loss": 0.5226,
"step": 1950
},
{
"epoch": 0.23,
"grad_norm": 1.4211698770523071,
"learning_rate": 9.793306622672637e-05,
"loss": 0.5103,
"step": 1960
},
{
"epoch": 0.23,
"grad_norm": 1.6234328746795654,
"learning_rate": 9.792128211171342e-05,
"loss": 0.5151,
"step": 1970
},
{
"epoch": 0.23,
"grad_norm": 1.8369724750518799,
"learning_rate": 9.790949799670045e-05,
"loss": 0.5007,
"step": 1980
},
{
"epoch": 0.23,
"grad_norm": 1.2908823490142822,
"learning_rate": 9.789771388168749e-05,
"loss": 0.5102,
"step": 1990
},
{
"epoch": 0.24,
"grad_norm": 1.2204883098602295,
"learning_rate": 9.788592976667453e-05,
"loss": 0.5088,
"step": 2000
},
{
"epoch": 0.24,
"grad_norm": 1.121698260307312,
"learning_rate": 9.787414565166157e-05,
"loss": 0.5042,
"step": 2010
},
{
"epoch": 0.24,
"grad_norm": 0.7488769888877869,
"learning_rate": 9.78623615366486e-05,
"loss": 0.4978,
"step": 2020
},
{
"epoch": 0.24,
"grad_norm": 0.8214760422706604,
"learning_rate": 9.785057742163563e-05,
"loss": 0.4889,
"step": 2030
},
{
"epoch": 0.24,
"grad_norm": 0.8698264956474304,
"learning_rate": 9.783879330662268e-05,
"loss": 0.4948,
"step": 2040
},
{
"epoch": 0.24,
"grad_norm": 1.6263267993927002,
"learning_rate": 9.782700919160972e-05,
"loss": 0.4987,
"step": 2050
},
{
"epoch": 0.24,
"grad_norm": 1.1735159158706665,
"learning_rate": 9.781522507659675e-05,
"loss": 0.5073,
"step": 2060
},
{
"epoch": 0.24,
"grad_norm": 1.4336321353912354,
"learning_rate": 9.780344096158379e-05,
"loss": 0.5338,
"step": 2070
},
{
"epoch": 0.24,
"grad_norm": 1.0912896394729614,
"learning_rate": 9.779165684657083e-05,
"loss": 0.5236,
"step": 2080
},
{
"epoch": 0.25,
"grad_norm": 1.0635606050491333,
"learning_rate": 9.777987273155787e-05,
"loss": 0.4942,
"step": 2090
},
{
"epoch": 0.25,
"grad_norm": 1.2251390218734741,
"learning_rate": 9.77680886165449e-05,
"loss": 0.5089,
"step": 2100
},
{
"epoch": 0.25,
"grad_norm": 1.194677710533142,
"learning_rate": 9.775630450153193e-05,
"loss": 0.5052,
"step": 2110
},
{
"epoch": 0.25,
"grad_norm": 1.4351847171783447,
"learning_rate": 9.774452038651898e-05,
"loss": 0.5207,
"step": 2120
},
{
"epoch": 0.25,
"grad_norm": 1.5037511587142944,
"learning_rate": 9.773273627150601e-05,
"loss": 0.4995,
"step": 2130
},
{
"epoch": 0.25,
"grad_norm": 0.8581846952438354,
"learning_rate": 9.772095215649305e-05,
"loss": 0.5015,
"step": 2140
},
{
"epoch": 0.25,
"grad_norm": 0.5671782493591309,
"learning_rate": 9.770916804148009e-05,
"loss": 0.4975,
"step": 2150
},
{
"epoch": 0.25,
"grad_norm": 1.4382553100585938,
"learning_rate": 9.769738392646713e-05,
"loss": 0.5011,
"step": 2160
},
{
"epoch": 0.26,
"grad_norm": 1.4695125818252563,
"learning_rate": 9.768559981145416e-05,
"loss": 0.5036,
"step": 2170
},
{
"epoch": 0.26,
"grad_norm": 0.7850602865219116,
"learning_rate": 9.76738156964412e-05,
"loss": 0.489,
"step": 2180
},
{
"epoch": 0.26,
"grad_norm": 1.7293654680252075,
"learning_rate": 9.766203158142824e-05,
"loss": 0.4999,
"step": 2190
},
{
"epoch": 0.26,
"grad_norm": 0.6642454862594604,
"learning_rate": 9.765024746641528e-05,
"loss": 0.4793,
"step": 2200
},
{
"epoch": 0.26,
"grad_norm": 1.2489564418792725,
"learning_rate": 9.763846335140231e-05,
"loss": 0.5221,
"step": 2210
},
{
"epoch": 0.26,
"grad_norm": 1.549101710319519,
"learning_rate": 9.762667923638935e-05,
"loss": 0.5131,
"step": 2220
},
{
"epoch": 0.26,
"grad_norm": 1.4325413703918457,
"learning_rate": 9.76148951213764e-05,
"loss": 0.4888,
"step": 2230
},
{
"epoch": 0.26,
"grad_norm": 1.263622760772705,
"learning_rate": 9.760311100636342e-05,
"loss": 0.4912,
"step": 2240
},
{
"epoch": 0.26,
"grad_norm": 1.379479169845581,
"learning_rate": 9.759132689135046e-05,
"loss": 0.4959,
"step": 2250
},
{
"epoch": 0.27,
"grad_norm": 1.4230656623840332,
"learning_rate": 9.75795427763375e-05,
"loss": 0.4936,
"step": 2260
},
{
"epoch": 0.27,
"grad_norm": 1.1852017641067505,
"learning_rate": 9.756775866132454e-05,
"loss": 0.4989,
"step": 2270
},
{
"epoch": 0.27,
"grad_norm": 1.3687397241592407,
"learning_rate": 9.755597454631158e-05,
"loss": 0.5034,
"step": 2280
},
{
"epoch": 0.27,
"grad_norm": 1.5852590799331665,
"learning_rate": 9.75441904312986e-05,
"loss": 0.4928,
"step": 2290
},
{
"epoch": 0.27,
"grad_norm": 2.14615535736084,
"learning_rate": 9.753240631628566e-05,
"loss": 0.5234,
"step": 2300
},
{
"epoch": 0.27,
"grad_norm": 1.3089383840560913,
"learning_rate": 9.75206222012727e-05,
"loss": 0.5209,
"step": 2310
},
{
"epoch": 0.27,
"grad_norm": 1.3444660902023315,
"learning_rate": 9.750883808625972e-05,
"loss": 0.513,
"step": 2320
},
{
"epoch": 0.27,
"grad_norm": 1.801767110824585,
"learning_rate": 9.749705397124676e-05,
"loss": 0.501,
"step": 2330
},
{
"epoch": 0.28,
"grad_norm": 0.8154950141906738,
"learning_rate": 9.74852698562338e-05,
"loss": 0.5055,
"step": 2340
},
{
"epoch": 0.28,
"grad_norm": 1.3127573728561401,
"learning_rate": 9.747348574122084e-05,
"loss": 0.5136,
"step": 2350
},
{
"epoch": 0.28,
"grad_norm": 0.9519993662834167,
"learning_rate": 9.746170162620788e-05,
"loss": 0.5026,
"step": 2360
},
{
"epoch": 0.28,
"grad_norm": 0.6125206351280212,
"learning_rate": 9.74499175111949e-05,
"loss": 0.4978,
"step": 2370
},
{
"epoch": 0.28,
"grad_norm": 0.7447252869606018,
"learning_rate": 9.743813339618196e-05,
"loss": 0.5013,
"step": 2380
},
{
"epoch": 0.28,
"grad_norm": 1.4316449165344238,
"learning_rate": 9.7426349281169e-05,
"loss": 0.5048,
"step": 2390
},
{
"epoch": 0.28,
"grad_norm": 1.442835807800293,
"learning_rate": 9.741456516615602e-05,
"loss": 0.4988,
"step": 2400
},
{
"epoch": 0.28,
"grad_norm": 1.016635775566101,
"learning_rate": 9.740278105114306e-05,
"loss": 0.4949,
"step": 2410
},
{
"epoch": 0.28,
"grad_norm": 1.7097216844558716,
"learning_rate": 9.73909969361301e-05,
"loss": 0.5085,
"step": 2420
},
{
"epoch": 0.29,
"grad_norm": 1.5806695222854614,
"learning_rate": 9.737921282111714e-05,
"loss": 0.4989,
"step": 2430
},
{
"epoch": 0.29,
"grad_norm": 1.047554850578308,
"learning_rate": 9.736742870610418e-05,
"loss": 0.4819,
"step": 2440
},
{
"epoch": 0.29,
"grad_norm": 1.0060478448867798,
"learning_rate": 9.735564459109122e-05,
"loss": 0.5133,
"step": 2450
},
{
"epoch": 0.29,
"grad_norm": 1.1388624906539917,
"learning_rate": 9.734386047607825e-05,
"loss": 0.5018,
"step": 2460
},
{
"epoch": 0.29,
"grad_norm": 1.3359582424163818,
"learning_rate": 9.73320763610653e-05,
"loss": 0.5026,
"step": 2470
},
{
"epoch": 0.29,
"grad_norm": 1.5101521015167236,
"learning_rate": 9.732029224605232e-05,
"loss": 0.5038,
"step": 2480
},
{
"epoch": 0.29,
"grad_norm": 0.7922062873840332,
"learning_rate": 9.730850813103937e-05,
"loss": 0.5075,
"step": 2490
},
{
"epoch": 0.29,
"grad_norm": 1.5266971588134766,
"learning_rate": 9.72967240160264e-05,
"loss": 0.5178,
"step": 2500
},
{
"epoch": 0.3,
"grad_norm": 1.322617530822754,
"learning_rate": 9.728493990101344e-05,
"loss": 0.4865,
"step": 2510
},
{
"epoch": 0.3,
"grad_norm": 1.0608652830123901,
"learning_rate": 9.727315578600048e-05,
"loss": 0.4842,
"step": 2520
},
{
"epoch": 0.3,
"grad_norm": 1.281894326210022,
"learning_rate": 9.726137167098751e-05,
"loss": 0.5153,
"step": 2530
},
{
"epoch": 0.3,
"grad_norm": 1.3256927728652954,
"learning_rate": 9.724958755597455e-05,
"loss": 0.5006,
"step": 2540
},
{
"epoch": 0.3,
"grad_norm": 0.9028598666191101,
"learning_rate": 9.723780344096158e-05,
"loss": 0.5032,
"step": 2550
},
{
"epoch": 0.3,
"grad_norm": 1.06511390209198,
"learning_rate": 9.722601932594863e-05,
"loss": 0.5058,
"step": 2560
},
{
"epoch": 0.3,
"grad_norm": 1.4232276678085327,
"learning_rate": 9.721423521093567e-05,
"loss": 0.5212,
"step": 2570
},
{
"epoch": 0.3,
"grad_norm": 1.4823582172393799,
"learning_rate": 9.72024510959227e-05,
"loss": 0.5188,
"step": 2580
},
{
"epoch": 0.3,
"grad_norm": 1.3798667192459106,
"learning_rate": 9.719066698090973e-05,
"loss": 0.5083,
"step": 2590
},
{
"epoch": 0.31,
"grad_norm": 1.2208833694458008,
"learning_rate": 9.717888286589679e-05,
"loss": 0.4985,
"step": 2600
},
{
"epoch": 0.31,
"grad_norm": 1.477569341659546,
"learning_rate": 9.716709875088381e-05,
"loss": 0.5056,
"step": 2610
},
{
"epoch": 0.31,
"grad_norm": 0.6647968888282776,
"learning_rate": 9.715531463587085e-05,
"loss": 0.4825,
"step": 2620
},
{
"epoch": 0.31,
"grad_norm": 1.6094647645950317,
"learning_rate": 9.714353052085788e-05,
"loss": 0.4939,
"step": 2630
},
{
"epoch": 0.31,
"grad_norm": 1.5791853666305542,
"learning_rate": 9.713174640584493e-05,
"loss": 0.4993,
"step": 2640
},
{
"epoch": 0.31,
"grad_norm": 1.0835442543029785,
"learning_rate": 9.711996229083197e-05,
"loss": 0.4924,
"step": 2650
},
{
"epoch": 0.31,
"grad_norm": 0.7846812605857849,
"learning_rate": 9.7108178175819e-05,
"loss": 0.4906,
"step": 2660
},
{
"epoch": 0.31,
"grad_norm": 0.8635871410369873,
"learning_rate": 9.709639406080603e-05,
"loss": 0.4958,
"step": 2670
},
{
"epoch": 0.32,
"grad_norm": 0.8005358576774597,
"learning_rate": 9.708460994579307e-05,
"loss": 0.4958,
"step": 2680
},
{
"epoch": 0.32,
"grad_norm": 1.9108798503875732,
"learning_rate": 9.707282583078011e-05,
"loss": 0.513,
"step": 2690
},
{
"epoch": 0.32,
"grad_norm": 1.824289321899414,
"learning_rate": 9.706104171576715e-05,
"loss": 0.5166,
"step": 2700
},
{
"epoch": 0.32,
"grad_norm": 0.720629870891571,
"learning_rate": 9.704925760075419e-05,
"loss": 0.4888,
"step": 2710
},
{
"epoch": 0.32,
"grad_norm": 0.9477500319480896,
"learning_rate": 9.703747348574123e-05,
"loss": 0.4978,
"step": 2720
},
{
"epoch": 0.32,
"grad_norm": 0.7759199142456055,
"learning_rate": 9.702568937072827e-05,
"loss": 0.5013,
"step": 2730
},
{
"epoch": 0.32,
"grad_norm": 0.9977040886878967,
"learning_rate": 9.701390525571529e-05,
"loss": 0.4815,
"step": 2740
},
{
"epoch": 0.32,
"grad_norm": 0.7906765937805176,
"learning_rate": 9.700212114070235e-05,
"loss": 0.5039,
"step": 2750
},
{
"epoch": 0.32,
"grad_norm": 0.9885894656181335,
"learning_rate": 9.699033702568937e-05,
"loss": 0.4892,
"step": 2760
},
{
"epoch": 0.33,
"grad_norm": 0.8168877959251404,
"learning_rate": 9.697855291067641e-05,
"loss": 0.5042,
"step": 2770
},
{
"epoch": 0.33,
"grad_norm": 1.1622967720031738,
"learning_rate": 9.696676879566345e-05,
"loss": 0.4956,
"step": 2780
},
{
"epoch": 0.33,
"grad_norm": 1.3321975469589233,
"learning_rate": 9.695498468065049e-05,
"loss": 0.5008,
"step": 2790
},
{
"epoch": 0.33,
"grad_norm": 0.9106395840644836,
"learning_rate": 9.694320056563753e-05,
"loss": 0.5008,
"step": 2800
},
{
"epoch": 0.33,
"grad_norm": 1.224526286125183,
"learning_rate": 9.693141645062457e-05,
"loss": 0.4817,
"step": 2810
},
{
"epoch": 0.33,
"grad_norm": 1.7083461284637451,
"learning_rate": 9.69196323356116e-05,
"loss": 0.5119,
"step": 2820
},
{
"epoch": 0.33,
"grad_norm": 1.2689660787582397,
"learning_rate": 9.690784822059864e-05,
"loss": 0.4959,
"step": 2830
},
{
"epoch": 0.33,
"grad_norm": 0.9503073692321777,
"learning_rate": 9.689606410558567e-05,
"loss": 0.4986,
"step": 2840
},
{
"epoch": 0.34,
"grad_norm": 0.9541449546813965,
"learning_rate": 9.688427999057271e-05,
"loss": 0.489,
"step": 2850
},
{
"epoch": 0.34,
"grad_norm": 0.6083498001098633,
"learning_rate": 9.687249587555976e-05,
"loss": 0.5077,
"step": 2860
},
{
"epoch": 0.34,
"grad_norm": 0.8455288410186768,
"learning_rate": 9.686071176054679e-05,
"loss": 0.4854,
"step": 2870
},
{
"epoch": 0.34,
"grad_norm": 1.1523447036743164,
"learning_rate": 9.684892764553383e-05,
"loss": 0.4932,
"step": 2880
},
{
"epoch": 0.34,
"grad_norm": 1.706553339958191,
"learning_rate": 9.683714353052085e-05,
"loss": 0.5079,
"step": 2890
},
{
"epoch": 0.34,
"grad_norm": 1.193648338317871,
"learning_rate": 9.68253594155079e-05,
"loss": 0.488,
"step": 2900
},
{
"epoch": 0.34,
"grad_norm": 0.6835173964500427,
"learning_rate": 9.681357530049494e-05,
"loss": 0.5023,
"step": 2910
},
{
"epoch": 0.34,
"grad_norm": 0.8280544281005859,
"learning_rate": 9.680179118548197e-05,
"loss": 0.509,
"step": 2920
},
{
"epoch": 0.34,
"grad_norm": 0.983601987361908,
"learning_rate": 9.6790007070469e-05,
"loss": 0.4941,
"step": 2930
},
{
"epoch": 0.35,
"grad_norm": 1.036715030670166,
"learning_rate": 9.677822295545606e-05,
"loss": 0.479,
"step": 2940
},
{
"epoch": 0.35,
"grad_norm": 0.8797247409820557,
"learning_rate": 9.676643884044308e-05,
"loss": 0.4827,
"step": 2950
},
{
"epoch": 0.35,
"grad_norm": 1.078275203704834,
"learning_rate": 9.675465472543012e-05,
"loss": 0.4916,
"step": 2960
},
{
"epoch": 0.35,
"grad_norm": 0.9871312975883484,
"learning_rate": 9.674287061041716e-05,
"loss": 0.4993,
"step": 2970
},
{
"epoch": 0.35,
"grad_norm": 1.6119897365570068,
"learning_rate": 9.67310864954042e-05,
"loss": 0.4896,
"step": 2980
},
{
"epoch": 0.35,
"grad_norm": 1.048764705657959,
"learning_rate": 9.671930238039124e-05,
"loss": 0.5231,
"step": 2990
},
{
"epoch": 0.35,
"grad_norm": 0.8787404894828796,
"learning_rate": 9.670751826537827e-05,
"loss": 0.4998,
"step": 3000
},
{
"epoch": 0.35,
"grad_norm": 0.8770228028297424,
"learning_rate": 9.669573415036532e-05,
"loss": 0.4946,
"step": 3010
},
{
"epoch": 0.36,
"grad_norm": 0.915125846862793,
"learning_rate": 9.668395003535234e-05,
"loss": 0.4864,
"step": 3020
},
{
"epoch": 0.36,
"grad_norm": 0.6411454677581787,
"learning_rate": 9.667216592033938e-05,
"loss": 0.4858,
"step": 3030
},
{
"epoch": 0.36,
"grad_norm": 1.2021106481552124,
"learning_rate": 9.666038180532642e-05,
"loss": 0.4852,
"step": 3040
},
{
"epoch": 0.36,
"grad_norm": 1.232035756111145,
"learning_rate": 9.664859769031346e-05,
"loss": 0.5033,
"step": 3050
},
{
"epoch": 0.36,
"grad_norm": 0.9577634334564209,
"learning_rate": 9.66368135753005e-05,
"loss": 0.5003,
"step": 3060
},
{
"epoch": 0.36,
"grad_norm": 2.0180046558380127,
"learning_rate": 9.662502946028754e-05,
"loss": 0.503,
"step": 3070
},
{
"epoch": 0.36,
"grad_norm": 0.6394404768943787,
"learning_rate": 9.661324534527458e-05,
"loss": 0.4982,
"step": 3080
},
{
"epoch": 0.36,
"grad_norm": 1.7030550241470337,
"learning_rate": 9.660146123026162e-05,
"loss": 0.5158,
"step": 3090
},
{
"epoch": 0.36,
"grad_norm": 0.8564779758453369,
"learning_rate": 9.658967711524864e-05,
"loss": 0.4854,
"step": 3100
},
{
"epoch": 0.37,
"grad_norm": 1.1143525838851929,
"learning_rate": 9.657789300023568e-05,
"loss": 0.4922,
"step": 3110
},
{
"epoch": 0.37,
"grad_norm": 1.3438676595687866,
"learning_rate": 9.656610888522273e-05,
"loss": 0.4889,
"step": 3120
},
{
"epoch": 0.37,
"grad_norm": 0.8271397352218628,
"learning_rate": 9.655432477020976e-05,
"loss": 0.466,
"step": 3130
},
{
"epoch": 0.37,
"grad_norm": 0.7543887495994568,
"learning_rate": 9.65425406551968e-05,
"loss": 0.5069,
"step": 3140
},
{
"epoch": 0.37,
"grad_norm": 0.7445496320724487,
"learning_rate": 9.653075654018384e-05,
"loss": 0.4779,
"step": 3150
},
{
"epoch": 0.37,
"grad_norm": 0.7102170586585999,
"learning_rate": 9.651897242517088e-05,
"loss": 0.4952,
"step": 3160
},
{
"epoch": 0.37,
"grad_norm": 1.2727932929992676,
"learning_rate": 9.650718831015792e-05,
"loss": 0.4898,
"step": 3170
},
{
"epoch": 0.37,
"grad_norm": 0.7808095216751099,
"learning_rate": 9.649540419514494e-05,
"loss": 0.4905,
"step": 3180
},
{
"epoch": 0.37,
"grad_norm": 1.3508325815200806,
"learning_rate": 9.648362008013198e-05,
"loss": 0.4991,
"step": 3190
},
{
"epoch": 0.38,
"grad_norm": 1.81468665599823,
"learning_rate": 9.647183596511903e-05,
"loss": 0.4904,
"step": 3200
},
{
"epoch": 0.38,
"grad_norm": 1.1315728425979614,
"learning_rate": 9.646005185010606e-05,
"loss": 0.4935,
"step": 3210
},
{
"epoch": 0.38,
"grad_norm": 0.9630333185195923,
"learning_rate": 9.64482677350931e-05,
"loss": 0.4958,
"step": 3220
},
{
"epoch": 0.38,
"grad_norm": 0.9422668814659119,
"learning_rate": 9.643648362008014e-05,
"loss": 0.4813,
"step": 3230
},
{
"epoch": 0.38,
"grad_norm": 1.2882485389709473,
"learning_rate": 9.642469950506718e-05,
"loss": 0.4847,
"step": 3240
},
{
"epoch": 0.38,
"grad_norm": 1.1148041486740112,
"learning_rate": 9.641291539005421e-05,
"loss": 0.4991,
"step": 3250
},
{
"epoch": 0.38,
"grad_norm": 1.2429006099700928,
"learning_rate": 9.640113127504124e-05,
"loss": 0.4952,
"step": 3260
},
{
"epoch": 0.38,
"grad_norm": 1.1591366529464722,
"learning_rate": 9.638934716002829e-05,
"loss": 0.4982,
"step": 3270
},
{
"epoch": 0.39,
"grad_norm": 0.547893762588501,
"learning_rate": 9.637756304501533e-05,
"loss": 0.4902,
"step": 3280
},
{
"epoch": 0.39,
"grad_norm": 0.9699311256408691,
"learning_rate": 9.636577893000236e-05,
"loss": 0.4926,
"step": 3290
},
{
"epoch": 0.39,
"grad_norm": 1.3232420682907104,
"learning_rate": 9.63539948149894e-05,
"loss": 0.5159,
"step": 3300
},
{
"epoch": 0.39,
"grad_norm": 1.5830579996109009,
"learning_rate": 9.634221069997643e-05,
"loss": 0.4893,
"step": 3310
},
{
"epoch": 0.39,
"grad_norm": 1.0730410814285278,
"learning_rate": 9.633042658496347e-05,
"loss": 0.5222,
"step": 3320
},
{
"epoch": 0.39,
"grad_norm": 1.1800957918167114,
"learning_rate": 9.631864246995051e-05,
"loss": 0.4891,
"step": 3330
},
{
"epoch": 0.39,
"grad_norm": 1.3200833797454834,
"learning_rate": 9.630685835493755e-05,
"loss": 0.4973,
"step": 3340
},
{
"epoch": 0.39,
"grad_norm": 1.4505298137664795,
"learning_rate": 9.629507423992459e-05,
"loss": 0.486,
"step": 3350
},
{
"epoch": 0.39,
"grad_norm": 0.9176786541938782,
"learning_rate": 9.628329012491163e-05,
"loss": 0.4913,
"step": 3360
},
{
"epoch": 0.4,
"grad_norm": 0.6543411612510681,
"learning_rate": 9.627150600989866e-05,
"loss": 0.4936,
"step": 3370
},
{
"epoch": 0.4,
"grad_norm": 0.8311076164245605,
"learning_rate": 9.625972189488571e-05,
"loss": 0.5141,
"step": 3380
},
{
"epoch": 0.4,
"grad_norm": 0.9839892983436584,
"learning_rate": 9.624793777987273e-05,
"loss": 0.4982,
"step": 3390
},
{
"epoch": 0.4,
"grad_norm": 0.924263596534729,
"learning_rate": 9.623615366485977e-05,
"loss": 0.5076,
"step": 3400
},
{
"epoch": 0.4,
"grad_norm": 1.0616888999938965,
"learning_rate": 9.622436954984681e-05,
"loss": 0.5062,
"step": 3410
},
{
"epoch": 0.4,
"grad_norm": 0.5893858671188354,
"learning_rate": 9.621258543483385e-05,
"loss": 0.4925,
"step": 3420
},
{
"epoch": 0.4,
"grad_norm": 0.7346591949462891,
"learning_rate": 9.620080131982089e-05,
"loss": 0.4923,
"step": 3430
},
{
"epoch": 0.4,
"grad_norm": 0.9968793392181396,
"learning_rate": 9.618901720480791e-05,
"loss": 0.4974,
"step": 3440
},
{
"epoch": 0.41,
"grad_norm": 1.5348602533340454,
"learning_rate": 9.617723308979495e-05,
"loss": 0.4941,
"step": 3450
},
{
"epoch": 0.41,
"grad_norm": 0.5470607280731201,
"learning_rate": 9.6165448974782e-05,
"loss": 0.4873,
"step": 3460
},
{
"epoch": 0.41,
"grad_norm": 0.7650502324104309,
"learning_rate": 9.615366485976903e-05,
"loss": 0.503,
"step": 3470
},
{
"epoch": 0.41,
"grad_norm": 0.8446630835533142,
"learning_rate": 9.614188074475607e-05,
"loss": 0.491,
"step": 3480
},
{
"epoch": 0.41,
"grad_norm": 1.224928855895996,
"learning_rate": 9.613009662974311e-05,
"loss": 0.4981,
"step": 3490
},
{
"epoch": 0.41,
"grad_norm": 0.7117965817451477,
"learning_rate": 9.611831251473015e-05,
"loss": 0.5013,
"step": 3500
},
{
"epoch": 0.41,
"grad_norm": 1.0274097919464111,
"learning_rate": 9.610652839971719e-05,
"loss": 0.4867,
"step": 3510
},
{
"epoch": 0.41,
"grad_norm": 0.849981427192688,
"learning_rate": 9.609474428470421e-05,
"loss": 0.4945,
"step": 3520
},
{
"epoch": 0.41,
"grad_norm": 0.9329881072044373,
"learning_rate": 9.608296016969127e-05,
"loss": 0.4926,
"step": 3530
},
{
"epoch": 0.42,
"grad_norm": 1.1163865327835083,
"learning_rate": 9.60711760546783e-05,
"loss": 0.482,
"step": 3540
},
{
"epoch": 0.42,
"grad_norm": 1.2010319232940674,
"learning_rate": 9.605939193966533e-05,
"loss": 0.4832,
"step": 3550
},
{
"epoch": 0.42,
"grad_norm": 0.6669803261756897,
"learning_rate": 9.604760782465237e-05,
"loss": 0.4929,
"step": 3560
},
{
"epoch": 0.42,
"grad_norm": 0.8228316903114319,
"learning_rate": 9.603582370963941e-05,
"loss": 0.483,
"step": 3570
},
{
"epoch": 0.42,
"grad_norm": 0.946983277797699,
"learning_rate": 9.602403959462645e-05,
"loss": 0.4965,
"step": 3580
},
{
"epoch": 0.42,
"grad_norm": 0.8362289071083069,
"learning_rate": 9.601225547961349e-05,
"loss": 0.4969,
"step": 3590
},
{
"epoch": 0.42,
"grad_norm": 1.052290439605713,
"learning_rate": 9.600047136460053e-05,
"loss": 0.4868,
"step": 3600
},
{
"epoch": 0.42,
"grad_norm": 1.4288514852523804,
"learning_rate": 9.598868724958756e-05,
"loss": 0.4743,
"step": 3610
},
{
"epoch": 0.43,
"grad_norm": 0.7788878679275513,
"learning_rate": 9.59769031345746e-05,
"loss": 0.4853,
"step": 3620
},
{
"epoch": 0.43,
"grad_norm": 0.8866305351257324,
"learning_rate": 9.596511901956163e-05,
"loss": 0.5182,
"step": 3630
},
{
"epoch": 0.43,
"grad_norm": 1.6054041385650635,
"learning_rate": 9.595333490454868e-05,
"loss": 0.488,
"step": 3640
},
{
"epoch": 0.43,
"grad_norm": 0.8780691027641296,
"learning_rate": 9.59415507895357e-05,
"loss": 0.5067,
"step": 3650
},
{
"epoch": 0.43,
"grad_norm": 0.593924880027771,
"learning_rate": 9.592976667452275e-05,
"loss": 0.4969,
"step": 3660
},
{
"epoch": 0.43,
"grad_norm": 1.2807188034057617,
"learning_rate": 9.591798255950978e-05,
"loss": 0.4964,
"step": 3670
},
{
"epoch": 0.43,
"grad_norm": 0.8862617015838623,
"learning_rate": 9.590619844449682e-05,
"loss": 0.4976,
"step": 3680
},
{
"epoch": 0.43,
"grad_norm": 0.7632378935813904,
"learning_rate": 9.589441432948386e-05,
"loss": 0.496,
"step": 3690
},
{
"epoch": 0.43,
"grad_norm": 0.8985171318054199,
"learning_rate": 9.58826302144709e-05,
"loss": 0.4949,
"step": 3700
},
{
"epoch": 0.44,
"grad_norm": 1.0251476764678955,
"learning_rate": 9.587084609945793e-05,
"loss": 0.5029,
"step": 3710
},
{
"epoch": 0.44,
"grad_norm": 1.2682667970657349,
"learning_rate": 9.585906198444498e-05,
"loss": 0.5051,
"step": 3720
},
{
"epoch": 0.44,
"grad_norm": 0.8219572305679321,
"learning_rate": 9.5847277869432e-05,
"loss": 0.5073,
"step": 3730
},
{
"epoch": 0.44,
"grad_norm": 0.5985639095306396,
"learning_rate": 9.583549375441904e-05,
"loss": 0.4769,
"step": 3740
},
{
"epoch": 0.44,
"grad_norm": 0.46747612953186035,
"learning_rate": 9.582370963940608e-05,
"loss": 0.4981,
"step": 3750
},
{
"epoch": 0.44,
"grad_norm": 1.5026284456253052,
"learning_rate": 9.581192552439312e-05,
"loss": 0.4984,
"step": 3760
},
{
"epoch": 0.44,
"grad_norm": 1.1273953914642334,
"learning_rate": 9.580014140938016e-05,
"loss": 0.4945,
"step": 3770
},
{
"epoch": 0.44,
"grad_norm": 1.2880148887634277,
"learning_rate": 9.578835729436719e-05,
"loss": 0.502,
"step": 3780
},
{
"epoch": 0.45,
"grad_norm": 0.9079731702804565,
"learning_rate": 9.577657317935424e-05,
"loss": 0.4968,
"step": 3790
},
{
"epoch": 0.45,
"grad_norm": 0.9394592642784119,
"learning_rate": 9.576478906434128e-05,
"loss": 0.4989,
"step": 3800
},
{
"epoch": 0.45,
"grad_norm": 1.0500024557113647,
"learning_rate": 9.57530049493283e-05,
"loss": 0.4915,
"step": 3810
},
{
"epoch": 0.45,
"grad_norm": 0.5405048727989197,
"learning_rate": 9.574122083431534e-05,
"loss": 0.4888,
"step": 3820
},
{
"epoch": 0.45,
"grad_norm": 0.7963844537734985,
"learning_rate": 9.57294367193024e-05,
"loss": 0.4877,
"step": 3830
},
{
"epoch": 0.45,
"grad_norm": 1.3261078596115112,
"learning_rate": 9.571765260428942e-05,
"loss": 0.4966,
"step": 3840
},
{
"epoch": 0.45,
"grad_norm": 0.6448584198951721,
"learning_rate": 9.570586848927646e-05,
"loss": 0.4965,
"step": 3850
},
{
"epoch": 0.45,
"grad_norm": 0.7990720272064209,
"learning_rate": 9.56940843742635e-05,
"loss": 0.48,
"step": 3860
},
{
"epoch": 0.45,
"grad_norm": 0.6464685797691345,
"learning_rate": 9.568230025925054e-05,
"loss": 0.4817,
"step": 3870
},
{
"epoch": 0.46,
"grad_norm": 0.9830204844474792,
"learning_rate": 9.567051614423758e-05,
"loss": 0.4842,
"step": 3880
},
{
"epoch": 0.46,
"grad_norm": 0.7194159030914307,
"learning_rate": 9.56587320292246e-05,
"loss": 0.4934,
"step": 3890
},
{
"epoch": 0.46,
"grad_norm": 0.9990166425704956,
"learning_rate": 9.564694791421165e-05,
"loss": 0.5,
"step": 3900
},
{
"epoch": 0.46,
"grad_norm": 0.6201900839805603,
"learning_rate": 9.563516379919868e-05,
"loss": 0.4837,
"step": 3910
},
{
"epoch": 0.46,
"grad_norm": 1.099794626235962,
"learning_rate": 9.562337968418572e-05,
"loss": 0.4911,
"step": 3920
},
{
"epoch": 0.46,
"grad_norm": 1.30674147605896,
"learning_rate": 9.561159556917276e-05,
"loss": 0.4959,
"step": 3930
},
{
"epoch": 0.46,
"grad_norm": 0.7809882164001465,
"learning_rate": 9.55998114541598e-05,
"loss": 0.5025,
"step": 3940
},
{
"epoch": 0.46,
"grad_norm": 1.2487872838974,
"learning_rate": 9.558802733914684e-05,
"loss": 0.4778,
"step": 3950
},
{
"epoch": 0.47,
"grad_norm": 0.514445960521698,
"learning_rate": 9.557624322413388e-05,
"loss": 0.4751,
"step": 3960
},
{
"epoch": 0.47,
"grad_norm": 1.346526861190796,
"learning_rate": 9.55644591091209e-05,
"loss": 0.4912,
"step": 3970
},
{
"epoch": 0.47,
"grad_norm": 0.5071583986282349,
"learning_rate": 9.555267499410795e-05,
"loss": 0.5029,
"step": 3980
},
{
"epoch": 0.47,
"grad_norm": 1.130698323249817,
"learning_rate": 9.554089087909498e-05,
"loss": 0.4941,
"step": 3990
},
{
"epoch": 0.47,
"grad_norm": 0.5424882173538208,
"learning_rate": 9.552910676408202e-05,
"loss": 0.4897,
"step": 4000
},
{
"epoch": 0.47,
"grad_norm": 0.7285670042037964,
"learning_rate": 9.551732264906906e-05,
"loss": 0.4956,
"step": 4010
},
{
"epoch": 0.47,
"grad_norm": 1.3889833688735962,
"learning_rate": 9.55055385340561e-05,
"loss": 0.4985,
"step": 4020
},
{
"epoch": 0.47,
"grad_norm": 1.190926432609558,
"learning_rate": 9.549375441904313e-05,
"loss": 0.4869,
"step": 4030
},
{
"epoch": 0.47,
"grad_norm": 0.8289108276367188,
"learning_rate": 9.548197030403017e-05,
"loss": 0.4838,
"step": 4040
},
{
"epoch": 0.48,
"grad_norm": 0.754700243473053,
"learning_rate": 9.547018618901721e-05,
"loss": 0.5048,
"step": 4050
},
{
"epoch": 0.48,
"grad_norm": 0.8800638318061829,
"learning_rate": 9.545840207400425e-05,
"loss": 0.4817,
"step": 4060
},
{
"epoch": 0.48,
"grad_norm": 0.6806440949440002,
"learning_rate": 9.544661795899128e-05,
"loss": 0.5038,
"step": 4070
},
{
"epoch": 0.48,
"grad_norm": 0.8915199041366577,
"learning_rate": 9.543483384397832e-05,
"loss": 0.4662,
"step": 4080
},
{
"epoch": 0.48,
"grad_norm": 0.6544657945632935,
"learning_rate": 9.542304972896537e-05,
"loss": 0.4795,
"step": 4090
},
{
"epoch": 0.48,
"grad_norm": 0.760875403881073,
"learning_rate": 9.54112656139524e-05,
"loss": 0.4948,
"step": 4100
},
{
"epoch": 0.48,
"grad_norm": 0.9429724216461182,
"learning_rate": 9.539948149893943e-05,
"loss": 0.47,
"step": 4110
},
{
"epoch": 0.48,
"grad_norm": 1.138684868812561,
"learning_rate": 9.538769738392647e-05,
"loss": 0.4815,
"step": 4120
},
{
"epoch": 0.49,
"grad_norm": 1.4824031591415405,
"learning_rate": 9.537591326891351e-05,
"loss": 0.5053,
"step": 4130
},
{
"epoch": 0.49,
"grad_norm": 1.0486384630203247,
"learning_rate": 9.536412915390055e-05,
"loss": 0.4823,
"step": 4140
},
{
"epoch": 0.49,
"grad_norm": 0.7436571717262268,
"learning_rate": 9.535234503888758e-05,
"loss": 0.492,
"step": 4150
},
{
"epoch": 0.49,
"grad_norm": 0.6755275726318359,
"learning_rate": 9.534056092387463e-05,
"loss": 0.4679,
"step": 4160
},
{
"epoch": 0.49,
"grad_norm": 0.6708274483680725,
"learning_rate": 9.532877680886167e-05,
"loss": 0.4926,
"step": 4170
},
{
"epoch": 0.49,
"grad_norm": 0.9209789037704468,
"learning_rate": 9.531699269384869e-05,
"loss": 0.4836,
"step": 4180
},
{
"epoch": 0.49,
"grad_norm": 1.1882667541503906,
"learning_rate": 9.530520857883573e-05,
"loss": 0.4807,
"step": 4190
},
{
"epoch": 0.49,
"grad_norm": 1.184501051902771,
"learning_rate": 9.529342446382277e-05,
"loss": 0.5028,
"step": 4200
},
{
"epoch": 0.49,
"grad_norm": 1.0626683235168457,
"learning_rate": 9.528164034880981e-05,
"loss": 0.4883,
"step": 4210
},
{
"epoch": 0.5,
"grad_norm": 1.046478033065796,
"learning_rate": 9.526985623379685e-05,
"loss": 0.4916,
"step": 4220
},
{
"epoch": 0.5,
"grad_norm": 0.9243775606155396,
"learning_rate": 9.525807211878389e-05,
"loss": 0.4818,
"step": 4230
},
{
"epoch": 0.5,
"grad_norm": 1.0813802480697632,
"learning_rate": 9.524628800377093e-05,
"loss": 0.4724,
"step": 4240
},
{
"epoch": 0.5,
"grad_norm": 1.133711814880371,
"learning_rate": 9.523450388875797e-05,
"loss": 0.484,
"step": 4250
},
{
"epoch": 0.5,
"grad_norm": 0.8296009302139282,
"learning_rate": 9.522271977374499e-05,
"loss": 0.4964,
"step": 4260
},
{
"epoch": 0.5,
"grad_norm": 0.7538905739784241,
"learning_rate": 9.521093565873203e-05,
"loss": 0.4816,
"step": 4270
},
{
"epoch": 0.5,
"grad_norm": 0.7560605406761169,
"learning_rate": 9.519915154371907e-05,
"loss": 0.4774,
"step": 4280
},
{
"epoch": 0.5,
"grad_norm": 0.8916319012641907,
"learning_rate": 9.518736742870611e-05,
"loss": 0.4845,
"step": 4290
},
{
"epoch": 0.51,
"grad_norm": 0.9245229959487915,
"learning_rate": 9.517558331369315e-05,
"loss": 0.4946,
"step": 4300
},
{
"epoch": 0.51,
"grad_norm": 0.7412012219429016,
"learning_rate": 9.516379919868019e-05,
"loss": 0.4877,
"step": 4310
},
{
"epoch": 0.51,
"grad_norm": 1.1689118146896362,
"learning_rate": 9.515201508366723e-05,
"loss": 0.4884,
"step": 4320
},
{
"epoch": 0.51,
"grad_norm": 0.47012460231781006,
"learning_rate": 9.514023096865425e-05,
"loss": 0.4845,
"step": 4330
},
{
"epoch": 0.51,
"grad_norm": 1.0123286247253418,
"learning_rate": 9.512844685364129e-05,
"loss": 0.4853,
"step": 4340
},
{
"epoch": 0.51,
"grad_norm": 1.1943848133087158,
"learning_rate": 9.511666273862834e-05,
"loss": 0.472,
"step": 4350
},
{
"epoch": 0.51,
"grad_norm": 0.9038557410240173,
"learning_rate": 9.510487862361537e-05,
"loss": 0.5017,
"step": 4360
},
{
"epoch": 0.51,
"grad_norm": 0.6542546153068542,
"learning_rate": 9.50930945086024e-05,
"loss": 0.4734,
"step": 4370
},
{
"epoch": 0.51,
"grad_norm": 0.42980071902275085,
"learning_rate": 9.508131039358945e-05,
"loss": 0.4811,
"step": 4380
},
{
"epoch": 0.52,
"grad_norm": 0.828368604183197,
"learning_rate": 9.506952627857648e-05,
"loss": 0.4859,
"step": 4390
},
{
"epoch": 0.52,
"grad_norm": 0.8931118845939636,
"learning_rate": 9.505774216356352e-05,
"loss": 0.4939,
"step": 4400
},
{
"epoch": 0.52,
"grad_norm": 1.2151875495910645,
"learning_rate": 9.504595804855055e-05,
"loss": 0.4868,
"step": 4410
},
{
"epoch": 0.52,
"grad_norm": 0.4697801470756531,
"learning_rate": 9.50341739335376e-05,
"loss": 0.4873,
"step": 4420
},
{
"epoch": 0.52,
"grad_norm": 1.1358791589736938,
"learning_rate": 9.502238981852464e-05,
"loss": 0.4998,
"step": 4430
},
{
"epoch": 0.52,
"grad_norm": 0.8548920154571533,
"learning_rate": 9.501060570351167e-05,
"loss": 0.4998,
"step": 4440
},
{
"epoch": 0.52,
"grad_norm": 0.6899234652519226,
"learning_rate": 9.49988215884987e-05,
"loss": 0.4741,
"step": 4450
},
{
"epoch": 0.52,
"grad_norm": 1.041538953781128,
"learning_rate": 9.498703747348574e-05,
"loss": 0.4934,
"step": 4460
},
{
"epoch": 0.53,
"grad_norm": 1.1963938474655151,
"learning_rate": 9.497525335847278e-05,
"loss": 0.4896,
"step": 4470
},
{
"epoch": 0.53,
"grad_norm": 0.9801954627037048,
"learning_rate": 9.496346924345982e-05,
"loss": 0.4782,
"step": 4480
},
{
"epoch": 0.53,
"grad_norm": 0.6222419142723083,
"learning_rate": 9.495168512844686e-05,
"loss": 0.4993,
"step": 4490
},
{
"epoch": 0.53,
"grad_norm": 0.6828306317329407,
"learning_rate": 9.49399010134339e-05,
"loss": 0.4839,
"step": 4500
},
{
"epoch": 0.53,
"grad_norm": 1.203779935836792,
"learning_rate": 9.492811689842094e-05,
"loss": 0.4839,
"step": 4510
},
{
"epoch": 0.53,
"grad_norm": 0.6782972812652588,
"learning_rate": 9.491633278340796e-05,
"loss": 0.4917,
"step": 4520
},
{
"epoch": 0.53,
"grad_norm": 0.7733616828918457,
"learning_rate": 9.4904548668395e-05,
"loss": 0.4926,
"step": 4530
},
{
"epoch": 0.53,
"grad_norm": 0.7314490675926208,
"learning_rate": 9.489276455338204e-05,
"loss": 0.4754,
"step": 4540
},
{
"epoch": 0.53,
"grad_norm": 0.8113279938697815,
"learning_rate": 9.488098043836908e-05,
"loss": 0.4907,
"step": 4550
},
{
"epoch": 0.54,
"grad_norm": 0.6463847756385803,
"learning_rate": 9.486919632335612e-05,
"loss": 0.4836,
"step": 4560
},
{
"epoch": 0.54,
"grad_norm": 0.9166641235351562,
"learning_rate": 9.485741220834316e-05,
"loss": 0.4693,
"step": 4570
},
{
"epoch": 0.54,
"grad_norm": 1.0718308687210083,
"learning_rate": 9.48456280933302e-05,
"loss": 0.4874,
"step": 4580
},
{
"epoch": 0.54,
"grad_norm": 0.9915964007377625,
"learning_rate": 9.483384397831724e-05,
"loss": 0.4857,
"step": 4590
},
{
"epoch": 0.54,
"grad_norm": 0.9420537948608398,
"learning_rate": 9.482205986330426e-05,
"loss": 0.4756,
"step": 4600
},
{
"epoch": 0.54,
"grad_norm": 0.8548345565795898,
"learning_rate": 9.481027574829132e-05,
"loss": 0.4824,
"step": 4610
},
{
"epoch": 0.54,
"grad_norm": 1.0632978677749634,
"learning_rate": 9.479849163327834e-05,
"loss": 0.5109,
"step": 4620
},
{
"epoch": 0.54,
"grad_norm": 0.6384020447731018,
"learning_rate": 9.478670751826538e-05,
"loss": 0.4797,
"step": 4630
},
{
"epoch": 0.55,
"grad_norm": 0.7735443711280823,
"learning_rate": 9.477492340325242e-05,
"loss": 0.4833,
"step": 4640
},
{
"epoch": 0.55,
"grad_norm": 0.800276517868042,
"learning_rate": 9.476313928823946e-05,
"loss": 0.4765,
"step": 4650
},
{
"epoch": 0.55,
"grad_norm": 1.1817256212234497,
"learning_rate": 9.47513551732265e-05,
"loss": 0.4767,
"step": 4660
},
{
"epoch": 0.55,
"grad_norm": 0.8723351955413818,
"learning_rate": 9.473957105821352e-05,
"loss": 0.4821,
"step": 4670
},
{
"epoch": 0.55,
"grad_norm": 0.8674776554107666,
"learning_rate": 9.472778694320058e-05,
"loss": 0.4919,
"step": 4680
},
{
"epoch": 0.55,
"grad_norm": 0.5936712026596069,
"learning_rate": 9.471600282818761e-05,
"loss": 0.4801,
"step": 4690
},
{
"epoch": 0.55,
"grad_norm": 0.9484730958938599,
"learning_rate": 9.470421871317464e-05,
"loss": 0.5059,
"step": 4700
},
{
"epoch": 0.55,
"grad_norm": 1.349593997001648,
"learning_rate": 9.469243459816168e-05,
"loss": 0.4826,
"step": 4710
},
{
"epoch": 0.55,
"grad_norm": 1.290330171585083,
"learning_rate": 9.468065048314873e-05,
"loss": 0.4824,
"step": 4720
},
{
"epoch": 0.56,
"grad_norm": 0.7647900581359863,
"learning_rate": 9.466886636813576e-05,
"loss": 0.4837,
"step": 4730
},
{
"epoch": 0.56,
"grad_norm": 1.3760255575180054,
"learning_rate": 9.46570822531228e-05,
"loss": 0.4744,
"step": 4740
},
{
"epoch": 0.56,
"grad_norm": 0.574338436126709,
"learning_rate": 9.464529813810983e-05,
"loss": 0.5001,
"step": 4750
},
{
"epoch": 0.56,
"grad_norm": 1.570899486541748,
"learning_rate": 9.463351402309687e-05,
"loss": 0.4805,
"step": 4760
},
{
"epoch": 0.56,
"grad_norm": 1.2061035633087158,
"learning_rate": 9.462172990808391e-05,
"loss": 0.4828,
"step": 4770
},
{
"epoch": 0.56,
"grad_norm": 0.9155292510986328,
"learning_rate": 9.460994579307094e-05,
"loss": 0.5038,
"step": 4780
},
{
"epoch": 0.56,
"grad_norm": 0.9685520529747009,
"learning_rate": 9.459816167805799e-05,
"loss": 0.4707,
"step": 4790
},
{
"epoch": 0.56,
"grad_norm": 0.6070892214775085,
"learning_rate": 9.458637756304502e-05,
"loss": 0.4708,
"step": 4800
},
{
"epoch": 0.57,
"grad_norm": 0.9815748929977417,
"learning_rate": 9.457459344803206e-05,
"loss": 0.4824,
"step": 4810
},
{
"epoch": 0.57,
"grad_norm": 0.8268948793411255,
"learning_rate": 9.45628093330191e-05,
"loss": 0.493,
"step": 4820
},
{
"epoch": 0.57,
"grad_norm": 0.7950807213783264,
"learning_rate": 9.455102521800613e-05,
"loss": 0.4697,
"step": 4830
},
{
"epoch": 0.57,
"grad_norm": 1.2134770154953003,
"learning_rate": 9.453924110299317e-05,
"loss": 0.4787,
"step": 4840
},
{
"epoch": 0.57,
"grad_norm": 1.0385112762451172,
"learning_rate": 9.452745698798021e-05,
"loss": 0.4855,
"step": 4850
},
{
"epoch": 0.57,
"grad_norm": 0.6822102665901184,
"learning_rate": 9.451567287296724e-05,
"loss": 0.482,
"step": 4860
},
{
"epoch": 0.57,
"grad_norm": 0.8164222240447998,
"learning_rate": 9.450388875795429e-05,
"loss": 0.4908,
"step": 4870
},
{
"epoch": 0.57,
"grad_norm": 0.6714780926704407,
"learning_rate": 9.449210464294131e-05,
"loss": 0.4707,
"step": 4880
},
{
"epoch": 0.57,
"grad_norm": 0.8351618647575378,
"learning_rate": 9.448032052792835e-05,
"loss": 0.4916,
"step": 4890
},
{
"epoch": 0.58,
"grad_norm": 1.516365885734558,
"learning_rate": 9.446853641291539e-05,
"loss": 0.4961,
"step": 4900
},
{
"epoch": 0.58,
"grad_norm": 1.229721188545227,
"learning_rate": 9.445675229790243e-05,
"loss": 0.4921,
"step": 4910
},
{
"epoch": 0.58,
"grad_norm": 1.1634761095046997,
"learning_rate": 9.444496818288947e-05,
"loss": 0.4953,
"step": 4920
},
{
"epoch": 0.58,
"grad_norm": 1.038814663887024,
"learning_rate": 9.443318406787651e-05,
"loss": 0.4881,
"step": 4930
},
{
"epoch": 0.58,
"grad_norm": 0.4522653818130493,
"learning_rate": 9.442139995286355e-05,
"loss": 0.4798,
"step": 4940
},
{
"epoch": 0.58,
"grad_norm": 0.9705685973167419,
"learning_rate": 9.440961583785059e-05,
"loss": 0.4787,
"step": 4950
},
{
"epoch": 0.58,
"grad_norm": 1.0672070980072021,
"learning_rate": 9.439783172283761e-05,
"loss": 0.4654,
"step": 4960
},
{
"epoch": 0.58,
"grad_norm": 0.7916922569274902,
"learning_rate": 9.438604760782465e-05,
"loss": 0.4737,
"step": 4970
},
{
"epoch": 0.59,
"grad_norm": 1.1799827814102173,
"learning_rate": 9.43742634928117e-05,
"loss": 0.4742,
"step": 4980
},
{
"epoch": 0.59,
"grad_norm": 0.6448288559913635,
"learning_rate": 9.436247937779873e-05,
"loss": 0.4799,
"step": 4990
},
{
"epoch": 0.59,
"grad_norm": 0.8689081072807312,
"learning_rate": 9.435069526278577e-05,
"loss": 0.4952,
"step": 5000
},
{
"epoch": 0.59,
"grad_norm": 1.4697692394256592,
"learning_rate": 9.433891114777281e-05,
"loss": 0.4774,
"step": 5010
},
{
"epoch": 0.59,
"grad_norm": 0.8025481104850769,
"learning_rate": 9.432712703275985e-05,
"loss": 0.4836,
"step": 5020
},
{
"epoch": 0.59,
"grad_norm": 0.6762499213218689,
"learning_rate": 9.431534291774689e-05,
"loss": 0.4746,
"step": 5030
},
{
"epoch": 0.59,
"grad_norm": 0.8484092950820923,
"learning_rate": 9.430355880273391e-05,
"loss": 0.4815,
"step": 5040
},
{
"epoch": 0.59,
"grad_norm": 0.8202689290046692,
"learning_rate": 9.429177468772096e-05,
"loss": 0.4744,
"step": 5050
},
{
"epoch": 0.59,
"grad_norm": 0.544952929019928,
"learning_rate": 9.4279990572708e-05,
"loss": 0.4899,
"step": 5060
},
{
"epoch": 0.6,
"grad_norm": 0.6483100056648254,
"learning_rate": 9.426820645769503e-05,
"loss": 0.4872,
"step": 5070
},
{
"epoch": 0.6,
"grad_norm": 0.5478421449661255,
"learning_rate": 9.425642234268207e-05,
"loss": 0.4807,
"step": 5080
},
{
"epoch": 0.6,
"grad_norm": 0.7820990085601807,
"learning_rate": 9.42446382276691e-05,
"loss": 0.4822,
"step": 5090
},
{
"epoch": 0.6,
"grad_norm": 0.7355049252510071,
"learning_rate": 9.423285411265615e-05,
"loss": 0.4785,
"step": 5100
},
{
"epoch": 0.6,
"grad_norm": 0.6957169771194458,
"learning_rate": 9.422106999764318e-05,
"loss": 0.4894,
"step": 5110
},
{
"epoch": 0.6,
"grad_norm": 1.4908500909805298,
"learning_rate": 9.420928588263021e-05,
"loss": 0.4884,
"step": 5120
},
{
"epoch": 0.6,
"grad_norm": 0.6293405890464783,
"learning_rate": 9.419750176761726e-05,
"loss": 0.4923,
"step": 5130
},
{
"epoch": 0.6,
"grad_norm": 0.9960302114486694,
"learning_rate": 9.41857176526043e-05,
"loss": 0.4834,
"step": 5140
},
{
"epoch": 0.61,
"grad_norm": 0.5231541395187378,
"learning_rate": 9.417393353759133e-05,
"loss": 0.4994,
"step": 5150
},
{
"epoch": 0.61,
"grad_norm": 0.9027219414710999,
"learning_rate": 9.416214942257837e-05,
"loss": 0.4771,
"step": 5160
},
{
"epoch": 0.61,
"grad_norm": 0.9002034664154053,
"learning_rate": 9.41503653075654e-05,
"loss": 0.4896,
"step": 5170
},
{
"epoch": 0.61,
"grad_norm": 0.6835697293281555,
"learning_rate": 9.413858119255244e-05,
"loss": 0.4739,
"step": 5180
},
{
"epoch": 0.61,
"grad_norm": 1.2067887783050537,
"learning_rate": 9.412679707753948e-05,
"loss": 0.4873,
"step": 5190
},
{
"epoch": 0.61,
"grad_norm": 0.6633793115615845,
"learning_rate": 9.411501296252652e-05,
"loss": 0.4876,
"step": 5200
},
{
"epoch": 0.61,
"grad_norm": 1.061316728591919,
"learning_rate": 9.410322884751356e-05,
"loss": 0.472,
"step": 5210
},
{
"epoch": 0.61,
"grad_norm": 0.5604255795478821,
"learning_rate": 9.409144473250059e-05,
"loss": 0.4805,
"step": 5220
},
{
"epoch": 0.61,
"grad_norm": 1.1520159244537354,
"learning_rate": 9.407966061748763e-05,
"loss": 0.4883,
"step": 5230
},
{
"epoch": 0.62,
"grad_norm": 0.8987627625465393,
"learning_rate": 9.406787650247468e-05,
"loss": 0.4964,
"step": 5240
},
{
"epoch": 0.62,
"grad_norm": 0.7069312334060669,
"learning_rate": 9.40560923874617e-05,
"loss": 0.4838,
"step": 5250
},
{
"epoch": 0.62,
"grad_norm": 1.124684453010559,
"learning_rate": 9.404430827244874e-05,
"loss": 0.4852,
"step": 5260
},
{
"epoch": 0.62,
"grad_norm": 1.0120004415512085,
"learning_rate": 9.403252415743578e-05,
"loss": 0.4764,
"step": 5270
},
{
"epoch": 0.62,
"grad_norm": 1.2393412590026855,
"learning_rate": 9.402074004242282e-05,
"loss": 0.476,
"step": 5280
},
{
"epoch": 0.62,
"grad_norm": 1.1735107898712158,
"learning_rate": 9.400895592740986e-05,
"loss": 0.4852,
"step": 5290
},
{
"epoch": 0.62,
"grad_norm": 0.8905333280563354,
"learning_rate": 9.399717181239688e-05,
"loss": 0.4769,
"step": 5300
},
{
"epoch": 0.62,
"grad_norm": 1.2668908834457397,
"learning_rate": 9.398538769738394e-05,
"loss": 0.4801,
"step": 5310
},
{
"epoch": 0.63,
"grad_norm": 0.6397873163223267,
"learning_rate": 9.397360358237098e-05,
"loss": 0.4784,
"step": 5320
},
{
"epoch": 0.63,
"grad_norm": 0.5698948502540588,
"learning_rate": 9.3961819467358e-05,
"loss": 0.4723,
"step": 5330
},
{
"epoch": 0.63,
"grad_norm": 0.5725936889648438,
"learning_rate": 9.395003535234504e-05,
"loss": 0.4849,
"step": 5340
},
{
"epoch": 0.63,
"grad_norm": 1.0437946319580078,
"learning_rate": 9.393825123733208e-05,
"loss": 0.4779,
"step": 5350
},
{
"epoch": 0.63,
"grad_norm": 0.8974117636680603,
"learning_rate": 9.392646712231912e-05,
"loss": 0.4837,
"step": 5360
},
{
"epoch": 0.63,
"grad_norm": 0.4877300262451172,
"learning_rate": 9.391468300730616e-05,
"loss": 0.4685,
"step": 5370
},
{
"epoch": 0.63,
"grad_norm": 0.9568804502487183,
"learning_rate": 9.390289889229318e-05,
"loss": 0.4807,
"step": 5380
},
{
"epoch": 0.63,
"grad_norm": 0.9116615653038025,
"learning_rate": 9.389111477728024e-05,
"loss": 0.4783,
"step": 5390
},
{
"epoch": 0.63,
"grad_norm": 1.0499573945999146,
"learning_rate": 9.387933066226727e-05,
"loss": 0.4825,
"step": 5400
},
{
"epoch": 0.64,
"grad_norm": 0.7615978717803955,
"learning_rate": 9.38675465472543e-05,
"loss": 0.4841,
"step": 5410
},
{
"epoch": 0.64,
"grad_norm": 0.9241262078285217,
"learning_rate": 9.385576243224134e-05,
"loss": 0.4934,
"step": 5420
},
{
"epoch": 0.64,
"grad_norm": 0.49354177713394165,
"learning_rate": 9.384397831722838e-05,
"loss": 0.4832,
"step": 5430
},
{
"epoch": 0.64,
"grad_norm": 1.5777790546417236,
"learning_rate": 9.383219420221542e-05,
"loss": 0.4756,
"step": 5440
},
{
"epoch": 0.64,
"grad_norm": 0.8923964500427246,
"learning_rate": 9.382041008720246e-05,
"loss": 0.4812,
"step": 5450
},
{
"epoch": 0.64,
"grad_norm": 2.254605293273926,
"learning_rate": 9.38086259721895e-05,
"loss": 0.5011,
"step": 5460
},
{
"epoch": 0.64,
"grad_norm": 1.110668659210205,
"learning_rate": 9.379684185717653e-05,
"loss": 0.4901,
"step": 5470
},
{
"epoch": 0.64,
"grad_norm": 0.9847891330718994,
"learning_rate": 9.378505774216357e-05,
"loss": 0.4795,
"step": 5480
},
{
"epoch": 0.65,
"grad_norm": 0.8963242769241333,
"learning_rate": 9.37732736271506e-05,
"loss": 0.4697,
"step": 5490
},
{
"epoch": 0.65,
"grad_norm": 0.8577290773391724,
"learning_rate": 9.376148951213765e-05,
"loss": 0.4839,
"step": 5500
},
{
"epoch": 0.65,
"grad_norm": 0.944857656955719,
"learning_rate": 9.374970539712468e-05,
"loss": 0.4772,
"step": 5510
},
{
"epoch": 0.65,
"grad_norm": 0.8949635028839111,
"learning_rate": 9.373792128211172e-05,
"loss": 0.4781,
"step": 5520
},
{
"epoch": 0.65,
"grad_norm": 1.073096752166748,
"learning_rate": 9.372613716709875e-05,
"loss": 0.4764,
"step": 5530
},
{
"epoch": 0.65,
"grad_norm": 0.46362000703811646,
"learning_rate": 9.37143530520858e-05,
"loss": 0.4882,
"step": 5540
},
{
"epoch": 0.65,
"grad_norm": 0.6273431777954102,
"learning_rate": 9.370256893707283e-05,
"loss": 0.4925,
"step": 5550
},
{
"epoch": 0.65,
"grad_norm": 0.7715334296226501,
"learning_rate": 9.369078482205986e-05,
"loss": 0.4806,
"step": 5560
},
{
"epoch": 0.65,
"grad_norm": 0.597259521484375,
"learning_rate": 9.367900070704691e-05,
"loss": 0.4581,
"step": 5570
},
{
"epoch": 0.66,
"grad_norm": 0.5563521385192871,
"learning_rate": 9.366721659203395e-05,
"loss": 0.4648,
"step": 5580
},
{
"epoch": 0.66,
"grad_norm": 1.0849449634552002,
"learning_rate": 9.365543247702098e-05,
"loss": 0.4663,
"step": 5590
},
{
"epoch": 0.66,
"grad_norm": 0.8659862279891968,
"learning_rate": 9.364364836200801e-05,
"loss": 0.4687,
"step": 5600
},
{
"epoch": 0.66,
"grad_norm": 0.5031567215919495,
"learning_rate": 9.363186424699507e-05,
"loss": 0.4719,
"step": 5610
},
{
"epoch": 0.66,
"grad_norm": 1.3020015954971313,
"learning_rate": 9.362008013198209e-05,
"loss": 0.4647,
"step": 5620
},
{
"epoch": 0.66,
"grad_norm": 1.0185835361480713,
"learning_rate": 9.360829601696913e-05,
"loss": 0.4675,
"step": 5630
},
{
"epoch": 0.66,
"grad_norm": 0.7432125210762024,
"learning_rate": 9.359651190195616e-05,
"loss": 0.482,
"step": 5640
},
{
"epoch": 0.66,
"grad_norm": 0.46069788932800293,
"learning_rate": 9.358472778694321e-05,
"loss": 0.4827,
"step": 5650
},
{
"epoch": 0.67,
"grad_norm": 0.6779037117958069,
"learning_rate": 9.357294367193025e-05,
"loss": 0.4584,
"step": 5660
},
{
"epoch": 0.67,
"grad_norm": 0.43060535192489624,
"learning_rate": 9.356115955691727e-05,
"loss": 0.4859,
"step": 5670
},
{
"epoch": 0.67,
"grad_norm": 0.8050900101661682,
"learning_rate": 9.354937544190431e-05,
"loss": 0.4753,
"step": 5680
},
{
"epoch": 0.67,
"grad_norm": 1.405532956123352,
"learning_rate": 9.353759132689135e-05,
"loss": 0.5,
"step": 5690
},
{
"epoch": 0.67,
"grad_norm": 0.8472504615783691,
"learning_rate": 9.352580721187839e-05,
"loss": 0.4728,
"step": 5700
},
{
"epoch": 0.67,
"grad_norm": 0.7228186130523682,
"learning_rate": 9.351402309686543e-05,
"loss": 0.4743,
"step": 5710
},
{
"epoch": 0.67,
"grad_norm": 0.6807494163513184,
"learning_rate": 9.350223898185247e-05,
"loss": 0.4943,
"step": 5720
},
{
"epoch": 0.67,
"grad_norm": 1.2535176277160645,
"learning_rate": 9.349045486683951e-05,
"loss": 0.4805,
"step": 5730
},
{
"epoch": 0.67,
"grad_norm": 0.7142338752746582,
"learning_rate": 9.347867075182655e-05,
"loss": 0.4986,
"step": 5740
},
{
"epoch": 0.68,
"grad_norm": 0.524666965007782,
"learning_rate": 9.346688663681357e-05,
"loss": 0.4668,
"step": 5750
},
{
"epoch": 0.68,
"grad_norm": 0.9502595663070679,
"learning_rate": 9.345510252180062e-05,
"loss": 0.4808,
"step": 5760
},
{
"epoch": 0.68,
"grad_norm": 0.8740987777709961,
"learning_rate": 9.344331840678765e-05,
"loss": 0.4802,
"step": 5770
},
{
"epoch": 0.68,
"grad_norm": 0.9712386727333069,
"learning_rate": 9.343153429177469e-05,
"loss": 0.4753,
"step": 5780
},
{
"epoch": 0.68,
"grad_norm": 0.49214234948158264,
"learning_rate": 9.341975017676173e-05,
"loss": 0.4724,
"step": 5790
},
{
"epoch": 0.68,
"grad_norm": 0.6717044711112976,
"learning_rate": 9.340796606174877e-05,
"loss": 0.4817,
"step": 5800
},
{
"epoch": 0.68,
"grad_norm": 0.9044239521026611,
"learning_rate": 9.339736035823709e-05,
"loss": 0.4845,
"step": 5810
},
{
"epoch": 0.68,
"grad_norm": 0.711783766746521,
"learning_rate": 9.338557624322413e-05,
"loss": 0.4922,
"step": 5820
},
{
"epoch": 0.69,
"grad_norm": 1.010398030281067,
"learning_rate": 9.337379212821118e-05,
"loss": 0.4793,
"step": 5830
},
{
"epoch": 0.69,
"grad_norm": 0.5923473834991455,
"learning_rate": 9.336200801319821e-05,
"loss": 0.4834,
"step": 5840
},
{
"epoch": 0.69,
"grad_norm": 0.9304243922233582,
"learning_rate": 9.335022389818525e-05,
"loss": 0.4583,
"step": 5850
},
{
"epoch": 0.69,
"grad_norm": 0.7781521081924438,
"learning_rate": 9.333843978317229e-05,
"loss": 0.4715,
"step": 5860
},
{
"epoch": 0.69,
"grad_norm": 0.768720805644989,
"learning_rate": 9.332665566815933e-05,
"loss": 0.4767,
"step": 5870
},
{
"epoch": 0.69,
"grad_norm": 0.8461710214614868,
"learning_rate": 9.331487155314637e-05,
"loss": 0.4809,
"step": 5880
},
{
"epoch": 0.69,
"grad_norm": 1.4430184364318848,
"learning_rate": 9.330308743813339e-05,
"loss": 0.4886,
"step": 5890
},
{
"epoch": 0.69,
"grad_norm": 0.9757821559906006,
"learning_rate": 9.329130332312044e-05,
"loss": 0.5049,
"step": 5900
},
{
"epoch": 0.69,
"grad_norm": 0.8204195499420166,
"learning_rate": 9.327951920810748e-05,
"loss": 0.4673,
"step": 5910
},
{
"epoch": 0.7,
"grad_norm": 1.2501641511917114,
"learning_rate": 9.326773509309451e-05,
"loss": 0.5006,
"step": 5920
},
{
"epoch": 0.7,
"grad_norm": 1.438923954963684,
"learning_rate": 9.325595097808155e-05,
"loss": 0.4761,
"step": 5930
},
{
"epoch": 0.7,
"grad_norm": 0.8495496511459351,
"learning_rate": 9.324416686306859e-05,
"loss": 0.4732,
"step": 5940
},
{
"epoch": 0.7,
"grad_norm": 1.1707062721252441,
"learning_rate": 9.323238274805563e-05,
"loss": 0.4796,
"step": 5950
},
{
"epoch": 0.7,
"grad_norm": 0.9148976802825928,
"learning_rate": 9.322059863304266e-05,
"loss": 0.4782,
"step": 5960
},
{
"epoch": 0.7,
"grad_norm": 0.8116015195846558,
"learning_rate": 9.32088145180297e-05,
"loss": 0.4816,
"step": 5970
},
{
"epoch": 0.7,
"grad_norm": 1.3817458152770996,
"learning_rate": 9.319703040301674e-05,
"loss": 0.4844,
"step": 5980
},
{
"epoch": 0.7,
"grad_norm": 0.891568660736084,
"learning_rate": 9.318524628800378e-05,
"loss": 0.4706,
"step": 5990
},
{
"epoch": 0.71,
"grad_norm": 0.9312018156051636,
"learning_rate": 9.317346217299081e-05,
"loss": 0.4644,
"step": 6000
},
{
"epoch": 0.71,
"grad_norm": 0.9175546169281006,
"learning_rate": 9.316167805797785e-05,
"loss": 0.4682,
"step": 6010
},
{
"epoch": 0.71,
"grad_norm": 1.396550178527832,
"learning_rate": 9.314989394296488e-05,
"loss": 0.4799,
"step": 6020
},
{
"epoch": 0.71,
"grad_norm": 0.8256319165229797,
"learning_rate": 9.313810982795192e-05,
"loss": 0.4651,
"step": 6030
},
{
"epoch": 0.71,
"grad_norm": 0.6263672113418579,
"learning_rate": 9.312632571293896e-05,
"loss": 0.4761,
"step": 6040
},
{
"epoch": 0.71,
"grad_norm": 1.064767837524414,
"learning_rate": 9.3114541597926e-05,
"loss": 0.4908,
"step": 6050
},
{
"epoch": 0.71,
"grad_norm": 0.7858134508132935,
"learning_rate": 9.310275748291304e-05,
"loss": 0.4515,
"step": 6060
},
{
"epoch": 0.71,
"grad_norm": 0.6081665754318237,
"learning_rate": 9.309097336790008e-05,
"loss": 0.4688,
"step": 6070
},
{
"epoch": 0.71,
"grad_norm": 0.6087002754211426,
"learning_rate": 9.30791892528871e-05,
"loss": 0.4704,
"step": 6080
},
{
"epoch": 0.72,
"grad_norm": 0.8753799796104431,
"learning_rate": 9.306740513787416e-05,
"loss": 0.4832,
"step": 6090
},
{
"epoch": 0.72,
"grad_norm": 0.5013492107391357,
"learning_rate": 9.305562102286118e-05,
"loss": 0.4586,
"step": 6100
},
{
"epoch": 0.72,
"grad_norm": 0.841362714767456,
"learning_rate": 9.304383690784822e-05,
"loss": 0.4814,
"step": 6110
},
{
"epoch": 0.72,
"grad_norm": 0.5504693388938904,
"learning_rate": 9.303205279283526e-05,
"loss": 0.4773,
"step": 6120
},
{
"epoch": 0.72,
"grad_norm": 0.622080385684967,
"learning_rate": 9.30202686778223e-05,
"loss": 0.4619,
"step": 6130
},
{
"epoch": 0.72,
"grad_norm": 0.44307008385658264,
"learning_rate": 9.300848456280934e-05,
"loss": 0.4672,
"step": 6140
},
{
"epoch": 0.72,
"grad_norm": 0.7248728275299072,
"learning_rate": 9.299670044779636e-05,
"loss": 0.4764,
"step": 6150
},
{
"epoch": 0.72,
"grad_norm": 0.8518944382667542,
"learning_rate": 9.298491633278342e-05,
"loss": 0.4991,
"step": 6160
},
{
"epoch": 0.73,
"grad_norm": 0.6587331295013428,
"learning_rate": 9.297313221777046e-05,
"loss": 0.504,
"step": 6170
},
{
"epoch": 0.73,
"grad_norm": 0.6488806009292603,
"learning_rate": 9.296134810275748e-05,
"loss": 0.4764,
"step": 6180
},
{
"epoch": 0.73,
"grad_norm": 0.7182736396789551,
"learning_rate": 9.294956398774452e-05,
"loss": 0.4674,
"step": 6190
},
{
"epoch": 0.73,
"grad_norm": 0.9945371747016907,
"learning_rate": 9.293777987273157e-05,
"loss": 0.4594,
"step": 6200
},
{
"epoch": 0.73,
"grad_norm": 0.8697236776351929,
"learning_rate": 9.29259957577186e-05,
"loss": 0.5021,
"step": 6210
},
{
"epoch": 0.73,
"grad_norm": 1.406794786453247,
"learning_rate": 9.291421164270564e-05,
"loss": 0.4664,
"step": 6220
},
{
"epoch": 0.73,
"grad_norm": 0.41580840945243835,
"learning_rate": 9.290242752769268e-05,
"loss": 0.4839,
"step": 6230
},
{
"epoch": 0.73,
"grad_norm": 0.7050737142562866,
"learning_rate": 9.289064341267972e-05,
"loss": 0.4772,
"step": 6240
},
{
"epoch": 0.73,
"grad_norm": 1.2518810033798218,
"learning_rate": 9.287885929766675e-05,
"loss": 0.4783,
"step": 6250
},
{
"epoch": 0.74,
"grad_norm": 0.9627326726913452,
"learning_rate": 9.286707518265378e-05,
"loss": 0.4747,
"step": 6260
},
{
"epoch": 0.74,
"grad_norm": 0.7871906161308289,
"learning_rate": 9.285529106764082e-05,
"loss": 0.4967,
"step": 6270
},
{
"epoch": 0.74,
"grad_norm": 0.8793184757232666,
"learning_rate": 9.284350695262786e-05,
"loss": 0.4837,
"step": 6280
},
{
"epoch": 0.74,
"grad_norm": 0.6042229533195496,
"learning_rate": 9.28317228376149e-05,
"loss": 0.4508,
"step": 6290
},
{
"epoch": 0.74,
"grad_norm": 0.6557303071022034,
"learning_rate": 9.281993872260194e-05,
"loss": 0.4789,
"step": 6300
},
{
"epoch": 0.74,
"grad_norm": 0.8597153425216675,
"learning_rate": 9.280815460758898e-05,
"loss": 0.4879,
"step": 6310
},
{
"epoch": 0.74,
"grad_norm": 0.6139453053474426,
"learning_rate": 9.279637049257601e-05,
"loss": 0.4719,
"step": 6320
},
{
"epoch": 0.74,
"grad_norm": 0.5312277674674988,
"learning_rate": 9.278458637756305e-05,
"loss": 0.481,
"step": 6330
},
{
"epoch": 0.75,
"grad_norm": 0.5759139060974121,
"learning_rate": 9.277280226255008e-05,
"loss": 0.4762,
"step": 6340
},
{
"epoch": 0.75,
"grad_norm": 0.6466348767280579,
"learning_rate": 9.276101814753713e-05,
"loss": 0.4741,
"step": 6350
},
{
"epoch": 0.75,
"grad_norm": 0.7243015766143799,
"learning_rate": 9.274923403252416e-05,
"loss": 0.4618,
"step": 6360
},
{
"epoch": 0.75,
"grad_norm": 0.8750311136245728,
"learning_rate": 9.27374499175112e-05,
"loss": 0.4719,
"step": 6370
},
{
"epoch": 0.75,
"grad_norm": 0.7637357115745544,
"learning_rate": 9.272566580249823e-05,
"loss": 0.474,
"step": 6380
},
{
"epoch": 0.75,
"grad_norm": 1.011893630027771,
"learning_rate": 9.271388168748527e-05,
"loss": 0.4753,
"step": 6390
},
{
"epoch": 0.75,
"grad_norm": 0.7775669097900391,
"learning_rate": 9.270209757247231e-05,
"loss": 0.4726,
"step": 6400
},
{
"epoch": 0.75,
"grad_norm": 0.5721779465675354,
"learning_rate": 9.269031345745935e-05,
"loss": 0.4771,
"step": 6410
},
{
"epoch": 0.75,
"grad_norm": 1.1552717685699463,
"learning_rate": 9.267852934244639e-05,
"loss": 0.4708,
"step": 6420
},
{
"epoch": 0.76,
"grad_norm": 1.0046576261520386,
"learning_rate": 9.266674522743343e-05,
"loss": 0.484,
"step": 6430
},
{
"epoch": 0.76,
"grad_norm": 0.5850175619125366,
"learning_rate": 9.265496111242046e-05,
"loss": 0.4738,
"step": 6440
},
{
"epoch": 0.76,
"grad_norm": 1.332746148109436,
"learning_rate": 9.26431769974075e-05,
"loss": 0.4775,
"step": 6450
},
{
"epoch": 0.76,
"grad_norm": 0.4971960484981537,
"learning_rate": 9.263139288239455e-05,
"loss": 0.4856,
"step": 6460
},
{
"epoch": 0.76,
"grad_norm": 0.7696993947029114,
"learning_rate": 9.261960876738157e-05,
"loss": 0.4832,
"step": 6470
},
{
"epoch": 0.76,
"grad_norm": 0.8447930216789246,
"learning_rate": 9.260782465236861e-05,
"loss": 0.4671,
"step": 6480
},
{
"epoch": 0.76,
"grad_norm": 0.6897150278091431,
"learning_rate": 9.259604053735565e-05,
"loss": 0.4658,
"step": 6490
},
{
"epoch": 0.76,
"grad_norm": 0.7481512427330017,
"learning_rate": 9.258425642234269e-05,
"loss": 0.4821,
"step": 6500
},
{
"epoch": 0.77,
"grad_norm": 1.0475082397460938,
"learning_rate": 9.257247230732973e-05,
"loss": 0.4816,
"step": 6510
},
{
"epoch": 0.77,
"grad_norm": 0.9807186722755432,
"learning_rate": 9.256068819231675e-05,
"loss": 0.4763,
"step": 6520
},
{
"epoch": 0.77,
"grad_norm": 0.7923306822776794,
"learning_rate": 9.254890407730379e-05,
"loss": 0.4746,
"step": 6530
},
{
"epoch": 0.77,
"grad_norm": 1.1048396825790405,
"learning_rate": 9.253711996229085e-05,
"loss": 0.4654,
"step": 6540
},
{
"epoch": 0.77,
"grad_norm": 0.622284471988678,
"learning_rate": 9.252533584727787e-05,
"loss": 0.4879,
"step": 6550
},
{
"epoch": 0.77,
"grad_norm": 0.8338263034820557,
"learning_rate": 9.251355173226491e-05,
"loss": 0.485,
"step": 6560
},
{
"epoch": 0.77,
"grad_norm": 1.4728702306747437,
"learning_rate": 9.250176761725195e-05,
"loss": 0.4993,
"step": 6570
},
{
"epoch": 0.77,
"grad_norm": 1.5744489431381226,
"learning_rate": 9.248998350223899e-05,
"loss": 0.4711,
"step": 6580
},
{
"epoch": 0.77,
"grad_norm": 1.0752863883972168,
"learning_rate": 9.247819938722603e-05,
"loss": 0.4807,
"step": 6590
},
{
"epoch": 0.78,
"grad_norm": 0.5310035347938538,
"learning_rate": 9.246641527221305e-05,
"loss": 0.467,
"step": 6600
},
{
"epoch": 0.78,
"grad_norm": 0.7580615282058716,
"learning_rate": 9.24546311572001e-05,
"loss": 0.4699,
"step": 6610
},
{
"epoch": 0.78,
"grad_norm": 1.1891237497329712,
"learning_rate": 9.244284704218713e-05,
"loss": 0.4808,
"step": 6620
},
{
"epoch": 0.78,
"grad_norm": 0.736323356628418,
"learning_rate": 9.243106292717417e-05,
"loss": 0.4759,
"step": 6630
},
{
"epoch": 0.78,
"grad_norm": 1.2166295051574707,
"learning_rate": 9.241927881216121e-05,
"loss": 0.4881,
"step": 6640
},
{
"epoch": 0.78,
"grad_norm": 0.7576156854629517,
"learning_rate": 9.240749469714825e-05,
"loss": 0.4774,
"step": 6650
},
{
"epoch": 0.78,
"grad_norm": 0.6080641150474548,
"learning_rate": 9.239571058213529e-05,
"loss": 0.4822,
"step": 6660
},
{
"epoch": 0.78,
"grad_norm": 0.8534877896308899,
"learning_rate": 9.238392646712233e-05,
"loss": 0.4685,
"step": 6670
},
{
"epoch": 0.79,
"grad_norm": 0.9101846218109131,
"learning_rate": 9.237214235210936e-05,
"loss": 0.4833,
"step": 6680
},
{
"epoch": 0.79,
"grad_norm": 1.3006272315979004,
"learning_rate": 9.23603582370964e-05,
"loss": 0.4742,
"step": 6690
},
{
"epoch": 0.79,
"grad_norm": 1.504955768585205,
"learning_rate": 9.234857412208343e-05,
"loss": 0.4876,
"step": 6700
},
{
"epoch": 0.79,
"grad_norm": 1.27055025100708,
"learning_rate": 9.233679000707047e-05,
"loss": 0.4848,
"step": 6710
},
{
"epoch": 0.79,
"grad_norm": 0.855186402797699,
"learning_rate": 9.232500589205752e-05,
"loss": 0.4597,
"step": 6720
},
{
"epoch": 0.79,
"grad_norm": 0.8392820358276367,
"learning_rate": 9.231322177704455e-05,
"loss": 0.476,
"step": 6730
},
{
"epoch": 0.79,
"grad_norm": 0.6883922815322876,
"learning_rate": 9.230143766203158e-05,
"loss": 0.4693,
"step": 6740
},
{
"epoch": 0.79,
"grad_norm": 0.707591712474823,
"learning_rate": 9.228965354701862e-05,
"loss": 0.4739,
"step": 6750
},
{
"epoch": 0.79,
"grad_norm": 1.406572699546814,
"learning_rate": 9.227786943200566e-05,
"loss": 0.4932,
"step": 6760
},
{
"epoch": 0.8,
"grad_norm": 0.8618888258934021,
"learning_rate": 9.22660853169927e-05,
"loss": 0.4887,
"step": 6770
},
{
"epoch": 0.8,
"grad_norm": 0.8809340000152588,
"learning_rate": 9.225430120197973e-05,
"loss": 0.4552,
"step": 6780
},
{
"epoch": 0.8,
"grad_norm": 0.7067540884017944,
"learning_rate": 9.224251708696678e-05,
"loss": 0.4543,
"step": 6790
},
{
"epoch": 0.8,
"grad_norm": 0.8848843574523926,
"learning_rate": 9.223073297195382e-05,
"loss": 0.4629,
"step": 6800
},
{
"epoch": 0.8,
"grad_norm": 0.8002381324768066,
"learning_rate": 9.221894885694084e-05,
"loss": 0.4582,
"step": 6810
},
{
"epoch": 0.8,
"grad_norm": 0.8329706192016602,
"learning_rate": 9.220716474192788e-05,
"loss": 0.4634,
"step": 6820
},
{
"epoch": 0.8,
"grad_norm": 0.5875262022018433,
"learning_rate": 9.219538062691492e-05,
"loss": 0.4875,
"step": 6830
},
{
"epoch": 0.8,
"grad_norm": 0.6160840392112732,
"learning_rate": 9.218359651190196e-05,
"loss": 0.4608,
"step": 6840
},
{
"epoch": 0.81,
"grad_norm": 0.7814944982528687,
"learning_rate": 9.2171812396889e-05,
"loss": 0.4776,
"step": 6850
},
{
"epoch": 0.81,
"grad_norm": 0.7762553691864014,
"learning_rate": 9.216002828187603e-05,
"loss": 0.4865,
"step": 6860
},
{
"epoch": 0.81,
"grad_norm": 0.40437743067741394,
"learning_rate": 9.214824416686308e-05,
"loss": 0.4827,
"step": 6870
},
{
"epoch": 0.81,
"grad_norm": 1.0636314153671265,
"learning_rate": 9.213646005185012e-05,
"loss": 0.4778,
"step": 6880
},
{
"epoch": 0.81,
"grad_norm": 1.2540510892868042,
"learning_rate": 9.212467593683714e-05,
"loss": 0.4781,
"step": 6890
},
{
"epoch": 0.81,
"grad_norm": 0.6767933368682861,
"learning_rate": 9.211289182182418e-05,
"loss": 0.48,
"step": 6900
},
{
"epoch": 0.81,
"grad_norm": 0.943888247013092,
"learning_rate": 9.210110770681122e-05,
"loss": 0.4673,
"step": 6910
},
{
"epoch": 0.81,
"grad_norm": 0.6125652194023132,
"learning_rate": 9.208932359179826e-05,
"loss": 0.47,
"step": 6920
},
{
"epoch": 0.81,
"grad_norm": 0.7698494791984558,
"learning_rate": 9.20775394767853e-05,
"loss": 0.4676,
"step": 6930
},
{
"epoch": 0.82,
"grad_norm": 1.0100045204162598,
"learning_rate": 9.206575536177234e-05,
"loss": 0.4679,
"step": 6940
},
{
"epoch": 0.82,
"grad_norm": 1.1571061611175537,
"learning_rate": 9.205397124675938e-05,
"loss": 0.4668,
"step": 6950
},
{
"epoch": 0.82,
"grad_norm": 0.6960354447364807,
"learning_rate": 9.204218713174642e-05,
"loss": 0.4806,
"step": 6960
},
{
"epoch": 0.82,
"grad_norm": 0.7625404596328735,
"learning_rate": 9.203040301673344e-05,
"loss": 0.4584,
"step": 6970
},
{
"epoch": 0.82,
"grad_norm": 0.8157169818878174,
"learning_rate": 9.20186189017205e-05,
"loss": 0.4835,
"step": 6980
},
{
"epoch": 0.82,
"grad_norm": 0.5967955589294434,
"learning_rate": 9.200683478670752e-05,
"loss": 0.4738,
"step": 6990
},
{
"epoch": 0.82,
"grad_norm": 0.7080566883087158,
"learning_rate": 9.199505067169456e-05,
"loss": 0.4817,
"step": 7000
},
{
"epoch": 0.82,
"grad_norm": 1.0959988832473755,
"learning_rate": 9.19832665566816e-05,
"loss": 0.4756,
"step": 7010
},
{
"epoch": 0.83,
"grad_norm": 0.6054321527481079,
"learning_rate": 9.197148244166864e-05,
"loss": 0.4669,
"step": 7020
},
{
"epoch": 0.83,
"grad_norm": 1.058050513267517,
"learning_rate": 9.195969832665568e-05,
"loss": 0.4708,
"step": 7030
},
{
"epoch": 0.83,
"grad_norm": 0.5092371106147766,
"learning_rate": 9.19479142116427e-05,
"loss": 0.4797,
"step": 7040
},
{
"epoch": 0.83,
"grad_norm": 1.0786676406860352,
"learning_rate": 9.193613009662975e-05,
"loss": 0.4767,
"step": 7050
},
{
"epoch": 0.83,
"grad_norm": 0.7446795701980591,
"learning_rate": 9.192434598161679e-05,
"loss": 0.4855,
"step": 7060
},
{
"epoch": 0.83,
"grad_norm": 0.7856457829475403,
"learning_rate": 9.191256186660382e-05,
"loss": 0.4817,
"step": 7070
},
{
"epoch": 0.83,
"grad_norm": 0.6589493155479431,
"learning_rate": 9.190077775159086e-05,
"loss": 0.4762,
"step": 7080
},
{
"epoch": 0.83,
"grad_norm": 1.3472398519515991,
"learning_rate": 9.18889936365779e-05,
"loss": 0.4831,
"step": 7090
},
{
"epoch": 0.83,
"grad_norm": 0.7599365711212158,
"learning_rate": 9.187720952156493e-05,
"loss": 0.472,
"step": 7100
},
{
"epoch": 0.84,
"grad_norm": 0.5667014122009277,
"learning_rate": 9.186542540655197e-05,
"loss": 0.4713,
"step": 7110
},
{
"epoch": 0.84,
"grad_norm": 0.6668433547019958,
"learning_rate": 9.1853641291539e-05,
"loss": 0.4728,
"step": 7120
},
{
"epoch": 0.84,
"grad_norm": 0.7601708769798279,
"learning_rate": 9.184185717652605e-05,
"loss": 0.4628,
"step": 7130
},
{
"epoch": 0.84,
"grad_norm": 0.9204805493354797,
"learning_rate": 9.183007306151309e-05,
"loss": 0.4769,
"step": 7140
},
{
"epoch": 0.84,
"grad_norm": 1.016757845878601,
"learning_rate": 9.181828894650012e-05,
"loss": 0.4774,
"step": 7150
},
{
"epoch": 0.84,
"grad_norm": 0.6940405964851379,
"learning_rate": 9.180650483148716e-05,
"loss": 0.4783,
"step": 7160
},
{
"epoch": 0.84,
"grad_norm": 0.7603360414505005,
"learning_rate": 9.17947207164742e-05,
"loss": 0.4806,
"step": 7170
},
{
"epoch": 0.84,
"grad_norm": 0.7697874307632446,
"learning_rate": 9.178293660146123e-05,
"loss": 0.4649,
"step": 7180
},
{
"epoch": 0.85,
"grad_norm": 0.8786806464195251,
"learning_rate": 9.177115248644827e-05,
"loss": 0.4599,
"step": 7190
},
{
"epoch": 0.85,
"grad_norm": 1.1152408123016357,
"learning_rate": 9.175936837143531e-05,
"loss": 0.4879,
"step": 7200
},
{
"epoch": 0.85,
"grad_norm": 0.8338118195533752,
"learning_rate": 9.174758425642235e-05,
"loss": 0.4549,
"step": 7210
},
{
"epoch": 0.85,
"grad_norm": 1.5139575004577637,
"learning_rate": 9.173580014140939e-05,
"loss": 0.4634,
"step": 7220
},
{
"epoch": 0.85,
"grad_norm": 0.994613528251648,
"learning_rate": 9.172401602639641e-05,
"loss": 0.462,
"step": 7230
},
{
"epoch": 0.85,
"grad_norm": 0.8666975498199463,
"learning_rate": 9.171223191138347e-05,
"loss": 0.4703,
"step": 7240
},
{
"epoch": 0.85,
"grad_norm": 0.5157896876335144,
"learning_rate": 9.170044779637049e-05,
"loss": 0.4763,
"step": 7250
},
{
"epoch": 0.85,
"grad_norm": 0.7530190348625183,
"learning_rate": 9.168866368135753e-05,
"loss": 0.461,
"step": 7260
},
{
"epoch": 0.85,
"grad_norm": 0.7715838551521301,
"learning_rate": 9.167687956634457e-05,
"loss": 0.467,
"step": 7270
},
{
"epoch": 0.86,
"grad_norm": 0.9403729438781738,
"learning_rate": 9.166509545133161e-05,
"loss": 0.4742,
"step": 7280
},
{
"epoch": 0.86,
"grad_norm": 0.6798827052116394,
"learning_rate": 9.165331133631865e-05,
"loss": 0.4666,
"step": 7290
},
{
"epoch": 0.86,
"grad_norm": 0.7613863348960876,
"learning_rate": 9.164152722130569e-05,
"loss": 0.4811,
"step": 7300
},
{
"epoch": 0.86,
"grad_norm": 0.7779159545898438,
"learning_rate": 9.162974310629273e-05,
"loss": 0.4554,
"step": 7310
},
{
"epoch": 0.86,
"grad_norm": 0.640932023525238,
"learning_rate": 9.161795899127977e-05,
"loss": 0.4757,
"step": 7320
},
{
"epoch": 0.86,
"grad_norm": 1.2536754608154297,
"learning_rate": 9.160617487626679e-05,
"loss": 0.4731,
"step": 7330
},
{
"epoch": 0.86,
"grad_norm": 0.857828676700592,
"learning_rate": 9.159439076125383e-05,
"loss": 0.4678,
"step": 7340
},
{
"epoch": 0.86,
"grad_norm": 0.5510068535804749,
"learning_rate": 9.158260664624087e-05,
"loss": 0.4682,
"step": 7350
},
{
"epoch": 0.87,
"grad_norm": 0.9158300161361694,
"learning_rate": 9.157082253122791e-05,
"loss": 0.4681,
"step": 7360
},
{
"epoch": 0.87,
"grad_norm": 0.8237592577934265,
"learning_rate": 9.155903841621495e-05,
"loss": 0.4736,
"step": 7370
},
{
"epoch": 0.87,
"grad_norm": 0.7829803228378296,
"learning_rate": 9.154725430120197e-05,
"loss": 0.4731,
"step": 7380
},
{
"epoch": 0.87,
"grad_norm": 0.6544725894927979,
"learning_rate": 9.153547018618903e-05,
"loss": 0.4802,
"step": 7390
},
{
"epoch": 0.87,
"grad_norm": 0.8436287641525269,
"learning_rate": 9.152368607117606e-05,
"loss": 0.4746,
"step": 7400
},
{
"epoch": 0.87,
"grad_norm": 1.2512866258621216,
"learning_rate": 9.151190195616309e-05,
"loss": 0.4697,
"step": 7410
},
{
"epoch": 0.87,
"grad_norm": 1.0702438354492188,
"learning_rate": 9.150011784115013e-05,
"loss": 0.4675,
"step": 7420
},
{
"epoch": 0.87,
"grad_norm": 0.9401954412460327,
"learning_rate": 9.148833372613718e-05,
"loss": 0.4644,
"step": 7430
},
{
"epoch": 0.87,
"grad_norm": 0.9166263341903687,
"learning_rate": 9.147654961112421e-05,
"loss": 0.4614,
"step": 7440
},
{
"epoch": 0.88,
"grad_norm": 0.7773593068122864,
"learning_rate": 9.146476549611125e-05,
"loss": 0.469,
"step": 7450
},
{
"epoch": 0.88,
"grad_norm": 1.585802435874939,
"learning_rate": 9.145298138109828e-05,
"loss": 0.4772,
"step": 7460
},
{
"epoch": 0.88,
"grad_norm": 0.7248924970626831,
"learning_rate": 9.144119726608532e-05,
"loss": 0.4594,
"step": 7470
},
{
"epoch": 0.88,
"grad_norm": 0.6239253878593445,
"learning_rate": 9.142941315107236e-05,
"loss": 0.4649,
"step": 7480
},
{
"epoch": 0.88,
"grad_norm": 1.108443260192871,
"learning_rate": 9.141762903605939e-05,
"loss": 0.4621,
"step": 7490
},
{
"epoch": 0.88,
"grad_norm": 0.8479273319244385,
"learning_rate": 9.140584492104644e-05,
"loss": 0.4754,
"step": 7500
},
{
"epoch": 0.88,
"grad_norm": 0.8700122237205505,
"learning_rate": 9.139406080603347e-05,
"loss": 0.4792,
"step": 7510
},
{
"epoch": 0.88,
"grad_norm": 0.5243459343910217,
"learning_rate": 9.13822766910205e-05,
"loss": 0.4675,
"step": 7520
},
{
"epoch": 0.89,
"grad_norm": 0.4775218665599823,
"learning_rate": 9.137049257600754e-05,
"loss": 0.4799,
"step": 7530
},
{
"epoch": 0.89,
"grad_norm": 0.7216522693634033,
"learning_rate": 9.135870846099458e-05,
"loss": 0.4673,
"step": 7540
},
{
"epoch": 0.89,
"grad_norm": 0.5191614627838135,
"learning_rate": 9.134692434598162e-05,
"loss": 0.4791,
"step": 7550
},
{
"epoch": 0.89,
"grad_norm": 0.795246422290802,
"learning_rate": 9.133514023096866e-05,
"loss": 0.4776,
"step": 7560
},
{
"epoch": 0.89,
"grad_norm": 0.9776081442832947,
"learning_rate": 9.13233561159557e-05,
"loss": 0.4822,
"step": 7570
},
{
"epoch": 0.89,
"grad_norm": 0.9107388854026794,
"learning_rate": 9.131157200094274e-05,
"loss": 0.4691,
"step": 7580
},
{
"epoch": 0.89,
"grad_norm": 0.8358667492866516,
"learning_rate": 9.129978788592976e-05,
"loss": 0.4707,
"step": 7590
},
{
"epoch": 0.89,
"grad_norm": 0.6095325946807861,
"learning_rate": 9.12880037709168e-05,
"loss": 0.4808,
"step": 7600
},
{
"epoch": 0.89,
"grad_norm": 0.9324237704277039,
"learning_rate": 9.127621965590386e-05,
"loss": 0.4704,
"step": 7610
},
{
"epoch": 0.9,
"grad_norm": 1.0240130424499512,
"learning_rate": 9.126443554089088e-05,
"loss": 0.4571,
"step": 7620
},
{
"epoch": 0.9,
"grad_norm": 0.8012809157371521,
"learning_rate": 9.125265142587792e-05,
"loss": 0.4787,
"step": 7630
},
{
"epoch": 0.9,
"grad_norm": 0.6790209412574768,
"learning_rate": 9.124086731086496e-05,
"loss": 0.4627,
"step": 7640
},
{
"epoch": 0.9,
"grad_norm": 1.284743309020996,
"learning_rate": 9.1229083195852e-05,
"loss": 0.4683,
"step": 7650
},
{
"epoch": 0.9,
"grad_norm": 0.524591863155365,
"learning_rate": 9.121729908083904e-05,
"loss": 0.476,
"step": 7660
},
{
"epoch": 0.9,
"grad_norm": 1.5539113283157349,
"learning_rate": 9.120551496582606e-05,
"loss": 0.4686,
"step": 7670
},
{
"epoch": 0.9,
"grad_norm": 0.8249083161354065,
"learning_rate": 9.11937308508131e-05,
"loss": 0.4723,
"step": 7680
},
{
"epoch": 0.9,
"grad_norm": 0.48530295491218567,
"learning_rate": 9.118194673580015e-05,
"loss": 0.4663,
"step": 7690
},
{
"epoch": 0.91,
"grad_norm": 0.8678601980209351,
"learning_rate": 9.117016262078718e-05,
"loss": 0.469,
"step": 7700
},
{
"epoch": 0.91,
"grad_norm": 0.9824271202087402,
"learning_rate": 9.115837850577422e-05,
"loss": 0.4602,
"step": 7710
},
{
"epoch": 0.91,
"grad_norm": 0.7269099950790405,
"learning_rate": 9.114659439076126e-05,
"loss": 0.4696,
"step": 7720
},
{
"epoch": 0.91,
"grad_norm": 0.6246598362922668,
"learning_rate": 9.11348102757483e-05,
"loss": 0.4682,
"step": 7730
},
{
"epoch": 0.91,
"grad_norm": 1.1035609245300293,
"learning_rate": 9.112302616073534e-05,
"loss": 0.4616,
"step": 7740
},
{
"epoch": 0.91,
"grad_norm": 0.6836515069007874,
"learning_rate": 9.111124204572236e-05,
"loss": 0.4571,
"step": 7750
},
{
"epoch": 0.91,
"grad_norm": 0.8848505020141602,
"learning_rate": 9.109945793070941e-05,
"loss": 0.4832,
"step": 7760
},
{
"epoch": 0.91,
"grad_norm": 0.4297630786895752,
"learning_rate": 9.108767381569645e-05,
"loss": 0.4644,
"step": 7770
},
{
"epoch": 0.91,
"grad_norm": 0.8360887765884399,
"learning_rate": 9.107588970068348e-05,
"loss": 0.4574,
"step": 7780
},
{
"epoch": 0.92,
"grad_norm": 0.5313239097595215,
"learning_rate": 9.106410558567052e-05,
"loss": 0.473,
"step": 7790
},
{
"epoch": 0.92,
"grad_norm": 0.9913547039031982,
"learning_rate": 9.105232147065756e-05,
"loss": 0.4704,
"step": 7800
},
{
"epoch": 0.92,
"grad_norm": 0.5140584707260132,
"learning_rate": 9.10405373556446e-05,
"loss": 0.4511,
"step": 7810
},
{
"epoch": 0.92,
"grad_norm": 0.8073404431343079,
"learning_rate": 9.102875324063163e-05,
"loss": 0.4706,
"step": 7820
},
{
"epoch": 0.92,
"grad_norm": 0.6178425550460815,
"learning_rate": 9.101696912561867e-05,
"loss": 0.4718,
"step": 7830
},
{
"epoch": 0.92,
"grad_norm": 0.8150768280029297,
"learning_rate": 9.100518501060571e-05,
"loss": 0.4809,
"step": 7840
},
{
"epoch": 0.92,
"grad_norm": 0.9359731674194336,
"learning_rate": 9.099340089559275e-05,
"loss": 0.4769,
"step": 7850
},
{
"epoch": 0.92,
"grad_norm": 1.2084877490997314,
"learning_rate": 9.098161678057978e-05,
"loss": 0.4766,
"step": 7860
},
{
"epoch": 0.93,
"grad_norm": 0.8307682871818542,
"learning_rate": 9.096983266556683e-05,
"loss": 0.466,
"step": 7870
},
{
"epoch": 0.93,
"grad_norm": 0.6864941120147705,
"learning_rate": 9.095804855055386e-05,
"loss": 0.4692,
"step": 7880
},
{
"epoch": 0.93,
"grad_norm": 0.8492443561553955,
"learning_rate": 9.09462644355409e-05,
"loss": 0.4836,
"step": 7890
},
{
"epoch": 0.93,
"grad_norm": 0.4004725217819214,
"learning_rate": 9.093448032052793e-05,
"loss": 0.4591,
"step": 7900
},
{
"epoch": 0.93,
"grad_norm": 0.8568896055221558,
"learning_rate": 9.092269620551497e-05,
"loss": 0.4782,
"step": 7910
},
{
"epoch": 0.93,
"grad_norm": 0.9604170322418213,
"learning_rate": 9.091091209050201e-05,
"loss": 0.4761,
"step": 7920
},
{
"epoch": 0.93,
"grad_norm": 0.8827893137931824,
"learning_rate": 9.089912797548904e-05,
"loss": 0.4665,
"step": 7930
},
{
"epoch": 0.93,
"grad_norm": 0.8036709427833557,
"learning_rate": 9.088734386047608e-05,
"loss": 0.4728,
"step": 7940
},
{
"epoch": 0.93,
"grad_norm": 0.5380088090896606,
"learning_rate": 9.087555974546313e-05,
"loss": 0.461,
"step": 7950
},
{
"epoch": 0.94,
"grad_norm": 1.3507716655731201,
"learning_rate": 9.086377563045015e-05,
"loss": 0.4705,
"step": 7960
},
{
"epoch": 0.94,
"grad_norm": 1.2934057712554932,
"learning_rate": 9.085199151543719e-05,
"loss": 0.4616,
"step": 7970
},
{
"epoch": 0.94,
"grad_norm": 1.3037034273147583,
"learning_rate": 9.084020740042423e-05,
"loss": 0.4768,
"step": 7980
},
{
"epoch": 0.94,
"grad_norm": 0.6422207355499268,
"learning_rate": 9.082842328541127e-05,
"loss": 0.4762,
"step": 7990
},
{
"epoch": 0.94,
"grad_norm": 0.7728084921836853,
"learning_rate": 9.081663917039831e-05,
"loss": 0.4632,
"step": 8000
},
{
"epoch": 0.94,
"grad_norm": 0.7049902081489563,
"learning_rate": 9.080485505538534e-05,
"loss": 0.4698,
"step": 8010
},
{
"epoch": 0.94,
"grad_norm": 0.7685100436210632,
"learning_rate": 9.079307094037239e-05,
"loss": 0.4661,
"step": 8020
},
{
"epoch": 0.94,
"grad_norm": 0.6383164525032043,
"learning_rate": 9.078128682535943e-05,
"loss": 0.4684,
"step": 8030
},
{
"epoch": 0.95,
"grad_norm": 0.541415810585022,
"learning_rate": 9.076950271034645e-05,
"loss": 0.4543,
"step": 8040
},
{
"epoch": 0.95,
"grad_norm": 0.737616777420044,
"learning_rate": 9.075771859533349e-05,
"loss": 0.4674,
"step": 8050
},
{
"epoch": 0.95,
"grad_norm": 0.706376850605011,
"learning_rate": 9.074593448032053e-05,
"loss": 0.4931,
"step": 8060
},
{
"epoch": 0.95,
"grad_norm": 0.7297462224960327,
"learning_rate": 9.073415036530757e-05,
"loss": 0.4545,
"step": 8070
},
{
"epoch": 0.95,
"grad_norm": 0.7163042426109314,
"learning_rate": 9.072236625029461e-05,
"loss": 0.4589,
"step": 8080
},
{
"epoch": 0.95,
"grad_norm": 0.39174124598503113,
"learning_rate": 9.071058213528165e-05,
"loss": 0.4548,
"step": 8090
},
{
"epoch": 0.95,
"grad_norm": 0.7997798919677734,
"learning_rate": 9.069879802026869e-05,
"loss": 0.4773,
"step": 8100
},
{
"epoch": 0.95,
"grad_norm": 0.845039963722229,
"learning_rate": 9.068701390525573e-05,
"loss": 0.4697,
"step": 8110
},
{
"epoch": 0.95,
"grad_norm": 0.5737539529800415,
"learning_rate": 9.067522979024275e-05,
"loss": 0.4524,
"step": 8120
},
{
"epoch": 0.96,
"grad_norm": 1.2482928037643433,
"learning_rate": 9.06634456752298e-05,
"loss": 0.4901,
"step": 8130
},
{
"epoch": 0.96,
"grad_norm": 0.6044333577156067,
"learning_rate": 9.065166156021683e-05,
"loss": 0.4784,
"step": 8140
},
{
"epoch": 0.96,
"grad_norm": 0.45608577132225037,
"learning_rate": 9.063987744520387e-05,
"loss": 0.469,
"step": 8150
},
{
"epoch": 0.96,
"grad_norm": 0.8200978636741638,
"learning_rate": 9.06280933301909e-05,
"loss": 0.4671,
"step": 8160
},
{
"epoch": 0.96,
"grad_norm": 0.6905077695846558,
"learning_rate": 9.061630921517795e-05,
"loss": 0.4938,
"step": 8170
},
{
"epoch": 0.96,
"grad_norm": 0.5382047891616821,
"learning_rate": 9.060452510016498e-05,
"loss": 0.4622,
"step": 8180
},
{
"epoch": 0.96,
"grad_norm": 1.1205216646194458,
"learning_rate": 9.059274098515202e-05,
"loss": 0.4516,
"step": 8190
},
{
"epoch": 0.96,
"grad_norm": 0.6442331075668335,
"learning_rate": 9.058095687013905e-05,
"loss": 0.4738,
"step": 8200
},
{
"epoch": 0.97,
"grad_norm": 0.5995213985443115,
"learning_rate": 9.05691727551261e-05,
"loss": 0.4658,
"step": 8210
},
{
"epoch": 0.97,
"grad_norm": 0.5977282524108887,
"learning_rate": 9.055738864011313e-05,
"loss": 0.4605,
"step": 8220
},
{
"epoch": 0.97,
"grad_norm": 0.8662328124046326,
"learning_rate": 9.054560452510017e-05,
"loss": 0.4931,
"step": 8230
},
{
"epoch": 0.97,
"grad_norm": 0.6842892169952393,
"learning_rate": 9.05338204100872e-05,
"loss": 0.4793,
"step": 8240
},
{
"epoch": 0.97,
"grad_norm": 0.8832321166992188,
"learning_rate": 9.052203629507424e-05,
"loss": 0.4922,
"step": 8250
},
{
"epoch": 0.97,
"grad_norm": 1.177495002746582,
"learning_rate": 9.051025218006128e-05,
"loss": 0.4875,
"step": 8260
},
{
"epoch": 0.97,
"grad_norm": 0.5269367694854736,
"learning_rate": 9.049846806504831e-05,
"loss": 0.4782,
"step": 8270
},
{
"epoch": 0.97,
"grad_norm": 1.1454589366912842,
"learning_rate": 9.048668395003536e-05,
"loss": 0.4657,
"step": 8280
},
{
"epoch": 0.97,
"grad_norm": 0.8788142800331116,
"learning_rate": 9.04748998350224e-05,
"loss": 0.453,
"step": 8290
},
{
"epoch": 0.98,
"grad_norm": 0.9039009213447571,
"learning_rate": 9.046311572000943e-05,
"loss": 0.4669,
"step": 8300
},
{
"epoch": 0.98,
"grad_norm": 0.4375890791416168,
"learning_rate": 9.045133160499646e-05,
"loss": 0.4591,
"step": 8310
},
{
"epoch": 0.98,
"grad_norm": 0.6917681097984314,
"learning_rate": 9.043954748998352e-05,
"loss": 0.4666,
"step": 8320
},
{
"epoch": 0.98,
"grad_norm": 0.5293903946876526,
"learning_rate": 9.042776337497054e-05,
"loss": 0.4641,
"step": 8330
},
{
"epoch": 0.98,
"grad_norm": 0.5259762406349182,
"learning_rate": 9.041597925995758e-05,
"loss": 0.4737,
"step": 8340
},
{
"epoch": 0.98,
"grad_norm": 0.5596423149108887,
"learning_rate": 9.040419514494462e-05,
"loss": 0.4634,
"step": 8350
},
{
"epoch": 0.98,
"grad_norm": 0.7936623692512512,
"learning_rate": 9.039241102993166e-05,
"loss": 0.4713,
"step": 8360
},
{
"epoch": 0.98,
"grad_norm": 0.4819786250591278,
"learning_rate": 9.03806269149187e-05,
"loss": 0.4699,
"step": 8370
},
{
"epoch": 0.99,
"grad_norm": 1.039947748184204,
"learning_rate": 9.036884279990572e-05,
"loss": 0.4811,
"step": 8380
},
{
"epoch": 0.99,
"grad_norm": 1.1255519390106201,
"learning_rate": 9.035705868489278e-05,
"loss": 0.4673,
"step": 8390
},
{
"epoch": 0.99,
"grad_norm": 0.8890841603279114,
"learning_rate": 9.03452745698798e-05,
"loss": 0.4663,
"step": 8400
},
{
"epoch": 0.99,
"grad_norm": 1.1442759037017822,
"learning_rate": 9.033349045486684e-05,
"loss": 0.4669,
"step": 8410
},
{
"epoch": 0.99,
"grad_norm": 0.8435789346694946,
"learning_rate": 9.032170633985388e-05,
"loss": 0.4546,
"step": 8420
},
{
"epoch": 0.99,
"grad_norm": 0.8128703236579895,
"learning_rate": 9.030992222484092e-05,
"loss": 0.4634,
"step": 8430
},
{
"epoch": 0.99,
"grad_norm": 0.7601869702339172,
"learning_rate": 9.029813810982796e-05,
"loss": 0.4714,
"step": 8440
},
{
"epoch": 0.99,
"grad_norm": 1.0469821691513062,
"learning_rate": 9.0286353994815e-05,
"loss": 0.4881,
"step": 8450
},
{
"epoch": 0.99,
"grad_norm": 0.8869287967681885,
"learning_rate": 9.027456987980202e-05,
"loss": 0.4611,
"step": 8460
},
{
"epoch": 1.0,
"grad_norm": 0.6261885762214661,
"learning_rate": 9.026278576478908e-05,
"loss": 0.4684,
"step": 8470
},
{
"epoch": 1.0,
"grad_norm": 0.7744522094726562,
"learning_rate": 9.02510016497761e-05,
"loss": 0.4844,
"step": 8480
},
{
"epoch": 1.0,
"grad_norm": 0.6113030910491943,
"learning_rate": 9.023921753476314e-05,
"loss": 0.4705,
"step": 8490
},
{
"epoch": 1.0,
"grad_norm": 0.6078872680664062,
"learning_rate": 9.022743341975018e-05,
"loss": 0.4596,
"step": 8500
},
{
"epoch": 1.0,
"eval_loss": 0.5111260414123535,
"eval_runtime": 1412.8929,
"eval_samples_per_second": 268.4,
"eval_steps_per_second": 4.194,
"step": 8506
}
],
"logging_steps": 10,
"max_steps": 85060,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 3.6280297423949005e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}