i2s_sep_2024 / trainer_state.json
obulikrish's picture
i2s_sep_2024
f83af24
{
"best_metric": 1.338958978652954,
"best_model_checkpoint": "saved_model/i2s_sep_2024/checkpoint-5742",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 6380,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 77.0564,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 16.588455200195312,
"learning_rate": 2.0000000000000003e-06,
"loss": 77.9293,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 14.381007194519043,
"learning_rate": 6e-06,
"loss": 76.7998,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 19.72962188720703,
"learning_rate": 1.1000000000000001e-05,
"loss": 76.0966,
"step": 30
},
{
"epoch": 0.06,
"grad_norm": 20.388410568237305,
"learning_rate": 1.55e-05,
"loss": 74.5197,
"step": 40
},
{
"epoch": 0.08,
"grad_norm": 14.634848594665527,
"learning_rate": 2.05e-05,
"loss": 72.1866,
"step": 50
},
{
"epoch": 0.09,
"grad_norm": 17.051549911499023,
"learning_rate": 2.5500000000000003e-05,
"loss": 67.0821,
"step": 60
},
{
"epoch": 0.11,
"grad_norm": 28.50428581237793,
"learning_rate": 3.05e-05,
"loss": 57.4963,
"step": 70
},
{
"epoch": 0.13,
"grad_norm": 30.780961990356445,
"learning_rate": 3.55e-05,
"loss": 36.6869,
"step": 80
},
{
"epoch": 0.14,
"grad_norm": 14.346121788024902,
"learning_rate": 4.05e-05,
"loss": 14.3617,
"step": 90
},
{
"epoch": 0.16,
"grad_norm": 10.912578582763672,
"learning_rate": 4.55e-05,
"loss": 7.2406,
"step": 100
},
{
"epoch": 0.17,
"grad_norm": 6.822045803070068,
"learning_rate": 5.05e-05,
"loss": 4.8365,
"step": 110
},
{
"epoch": 0.19,
"grad_norm": 7.5130133628845215,
"learning_rate": 5.550000000000001e-05,
"loss": 4.0174,
"step": 120
},
{
"epoch": 0.2,
"grad_norm": 6.511799335479736,
"learning_rate": 6.05e-05,
"loss": 3.8019,
"step": 130
},
{
"epoch": 0.22,
"grad_norm": 7.3074870109558105,
"learning_rate": 6.55e-05,
"loss": 3.431,
"step": 140
},
{
"epoch": 0.24,
"grad_norm": 6.095701694488525,
"learning_rate": 7.05e-05,
"loss": 3.3795,
"step": 150
},
{
"epoch": 0.25,
"grad_norm": 6.266198635101318,
"learning_rate": 7.55e-05,
"loss": 3.2918,
"step": 160
},
{
"epoch": 0.27,
"grad_norm": 5.44221830368042,
"learning_rate": 8.05e-05,
"loss": 3.2624,
"step": 170
},
{
"epoch": 0.28,
"grad_norm": 5.414107799530029,
"learning_rate": 8.55e-05,
"loss": 3.0717,
"step": 180
},
{
"epoch": 0.3,
"grad_norm": 6.234427452087402,
"learning_rate": 9.05e-05,
"loss": 3.0962,
"step": 190
},
{
"epoch": 0.31,
"grad_norm": 6.2128095626831055,
"learning_rate": 9.55e-05,
"loss": 3.031,
"step": 200
},
{
"epoch": 0.33,
"grad_norm": 6.436244964599609,
"learning_rate": 9.998381877022654e-05,
"loss": 3.0275,
"step": 210
},
{
"epoch": 0.34,
"grad_norm": 5.83070707321167,
"learning_rate": 9.982200647249192e-05,
"loss": 3.0312,
"step": 220
},
{
"epoch": 0.36,
"grad_norm": 4.7300639152526855,
"learning_rate": 9.966019417475728e-05,
"loss": 2.9896,
"step": 230
},
{
"epoch": 0.38,
"grad_norm": 5.816330909729004,
"learning_rate": 9.949838187702267e-05,
"loss": 2.8982,
"step": 240
},
{
"epoch": 0.39,
"grad_norm": 5.503503799438477,
"learning_rate": 9.933656957928803e-05,
"loss": 2.8692,
"step": 250
},
{
"epoch": 0.41,
"grad_norm": 4.587036609649658,
"learning_rate": 9.917475728155339e-05,
"loss": 2.7963,
"step": 260
},
{
"epoch": 0.42,
"grad_norm": 5.378553867340088,
"learning_rate": 9.901294498381878e-05,
"loss": 2.8548,
"step": 270
},
{
"epoch": 0.44,
"grad_norm": 4.6093549728393555,
"learning_rate": 9.885113268608414e-05,
"loss": 2.766,
"step": 280
},
{
"epoch": 0.45,
"grad_norm": 4.90356969833374,
"learning_rate": 9.868932038834952e-05,
"loss": 2.6888,
"step": 290
},
{
"epoch": 0.47,
"grad_norm": 5.321515083312988,
"learning_rate": 9.85275080906149e-05,
"loss": 2.6801,
"step": 300
},
{
"epoch": 0.49,
"grad_norm": 4.544011116027832,
"learning_rate": 9.836569579288026e-05,
"loss": 2.6365,
"step": 310
},
{
"epoch": 0.5,
"grad_norm": 5.581942558288574,
"learning_rate": 9.820388349514564e-05,
"loss": 2.5887,
"step": 320
},
{
"epoch": 0.52,
"grad_norm": 3.580773115158081,
"learning_rate": 9.804207119741101e-05,
"loss": 2.3635,
"step": 330
},
{
"epoch": 0.53,
"grad_norm": 21.596670150756836,
"learning_rate": 9.788025889967637e-05,
"loss": 2.2275,
"step": 340
},
{
"epoch": 0.55,
"grad_norm": 2.5623159408569336,
"learning_rate": 9.771844660194175e-05,
"loss": 2.1448,
"step": 350
},
{
"epoch": 0.56,
"grad_norm": 3.9460721015930176,
"learning_rate": 9.755663430420713e-05,
"loss": 2.0788,
"step": 360
},
{
"epoch": 0.58,
"grad_norm": 2.1983449459075928,
"learning_rate": 9.73948220064725e-05,
"loss": 1.9382,
"step": 370
},
{
"epoch": 0.6,
"grad_norm": 2.7000677585601807,
"learning_rate": 9.723300970873786e-05,
"loss": 1.913,
"step": 380
},
{
"epoch": 0.61,
"grad_norm": 2.016218900680542,
"learning_rate": 9.707119741100324e-05,
"loss": 1.869,
"step": 390
},
{
"epoch": 0.63,
"grad_norm": 2.706763982772827,
"learning_rate": 9.690938511326862e-05,
"loss": 1.8184,
"step": 400
},
{
"epoch": 0.64,
"grad_norm": 2.03427791595459,
"learning_rate": 9.674757281553398e-05,
"loss": 1.78,
"step": 410
},
{
"epoch": 0.66,
"grad_norm": 1.9661788940429688,
"learning_rate": 9.658576051779936e-05,
"loss": 1.799,
"step": 420
},
{
"epoch": 0.67,
"grad_norm": 2.119837760925293,
"learning_rate": 9.642394822006473e-05,
"loss": 1.7693,
"step": 430
},
{
"epoch": 0.69,
"grad_norm": 2.3113484382629395,
"learning_rate": 9.62621359223301e-05,
"loss": 1.7285,
"step": 440
},
{
"epoch": 0.71,
"grad_norm": 1.9309462308883667,
"learning_rate": 9.610032362459548e-05,
"loss": 1.7827,
"step": 450
},
{
"epoch": 0.72,
"grad_norm": 1.9387729167938232,
"learning_rate": 9.593851132686085e-05,
"loss": 1.7421,
"step": 460
},
{
"epoch": 0.74,
"grad_norm": 1.560995101928711,
"learning_rate": 9.577669902912622e-05,
"loss": 1.7373,
"step": 470
},
{
"epoch": 0.75,
"grad_norm": 1.9981017112731934,
"learning_rate": 9.56148867313916e-05,
"loss": 1.7115,
"step": 480
},
{
"epoch": 0.77,
"grad_norm": 1.8179075717926025,
"learning_rate": 9.545307443365696e-05,
"loss": 1.705,
"step": 490
},
{
"epoch": 0.78,
"grad_norm": 1.5206012725830078,
"learning_rate": 9.529126213592234e-05,
"loss": 1.6909,
"step": 500
},
{
"epoch": 0.8,
"grad_norm": 1.7136727571487427,
"learning_rate": 9.512944983818771e-05,
"loss": 1.7102,
"step": 510
},
{
"epoch": 0.82,
"grad_norm": 1.8239753246307373,
"learning_rate": 9.496763754045308e-05,
"loss": 1.6711,
"step": 520
},
{
"epoch": 0.83,
"grad_norm": 2.050579309463501,
"learning_rate": 9.480582524271845e-05,
"loss": 1.683,
"step": 530
},
{
"epoch": 0.85,
"grad_norm": 2.0592105388641357,
"learning_rate": 9.464401294498383e-05,
"loss": 1.6371,
"step": 540
},
{
"epoch": 0.86,
"grad_norm": 1.847617745399475,
"learning_rate": 9.44822006472492e-05,
"loss": 1.6274,
"step": 550
},
{
"epoch": 0.88,
"grad_norm": 1.512438178062439,
"learning_rate": 9.432038834951457e-05,
"loss": 1.6423,
"step": 560
},
{
"epoch": 0.89,
"grad_norm": 2.095834732055664,
"learning_rate": 9.415857605177993e-05,
"loss": 1.6757,
"step": 570
},
{
"epoch": 0.91,
"grad_norm": 1.4796581268310547,
"learning_rate": 9.399676375404532e-05,
"loss": 1.6821,
"step": 580
},
{
"epoch": 0.92,
"grad_norm": 1.7051208019256592,
"learning_rate": 9.383495145631068e-05,
"loss": 1.6136,
"step": 590
},
{
"epoch": 0.94,
"grad_norm": 1.624534010887146,
"learning_rate": 9.367313915857606e-05,
"loss": 1.6493,
"step": 600
},
{
"epoch": 0.96,
"grad_norm": 1.776533842086792,
"learning_rate": 9.351132686084143e-05,
"loss": 1.6263,
"step": 610
},
{
"epoch": 0.97,
"grad_norm": 1.6108460426330566,
"learning_rate": 9.33495145631068e-05,
"loss": 1.6377,
"step": 620
},
{
"epoch": 0.99,
"grad_norm": 2.364605665206909,
"learning_rate": 9.318770226537217e-05,
"loss": 1.6113,
"step": 630
},
{
"epoch": 1.0,
"eval_loss": 1.4478819370269775,
"eval_runtime": 237.862,
"eval_samples_per_second": 235.25,
"eval_steps_per_second": 3.679,
"step": 638
},
{
"epoch": 1.0,
"grad_norm": 1.5748507976531982,
"learning_rate": 9.302588996763755e-05,
"loss": 1.6196,
"step": 640
},
{
"epoch": 1.02,
"grad_norm": 1.5334014892578125,
"learning_rate": 9.286407766990291e-05,
"loss": 1.6075,
"step": 650
},
{
"epoch": 1.03,
"grad_norm": 1.5750752687454224,
"learning_rate": 9.270226537216829e-05,
"loss": 1.6183,
"step": 660
},
{
"epoch": 1.05,
"grad_norm": 1.6654421091079712,
"learning_rate": 9.254045307443366e-05,
"loss": 1.5908,
"step": 670
},
{
"epoch": 1.07,
"grad_norm": 1.5488779544830322,
"learning_rate": 9.237864077669904e-05,
"loss": 1.5969,
"step": 680
},
{
"epoch": 1.08,
"grad_norm": 1.5573183298110962,
"learning_rate": 9.22168284789644e-05,
"loss": 1.6164,
"step": 690
},
{
"epoch": 1.1,
"grad_norm": 1.7255555391311646,
"learning_rate": 9.205501618122978e-05,
"loss": 1.6363,
"step": 700
},
{
"epoch": 1.11,
"grad_norm": 1.6308395862579346,
"learning_rate": 9.189320388349515e-05,
"loss": 1.6115,
"step": 710
},
{
"epoch": 1.13,
"grad_norm": 1.4543898105621338,
"learning_rate": 9.173139158576051e-05,
"loss": 1.6245,
"step": 720
},
{
"epoch": 1.14,
"grad_norm": 1.4260578155517578,
"learning_rate": 9.156957928802589e-05,
"loss": 1.5829,
"step": 730
},
{
"epoch": 1.16,
"grad_norm": 1.7818788290023804,
"learning_rate": 9.140776699029127e-05,
"loss": 1.5877,
"step": 740
},
{
"epoch": 1.18,
"grad_norm": 2.3251705169677734,
"learning_rate": 9.124595469255663e-05,
"loss": 1.5946,
"step": 750
},
{
"epoch": 1.19,
"grad_norm": 1.694584846496582,
"learning_rate": 9.108414239482202e-05,
"loss": 1.5932,
"step": 760
},
{
"epoch": 1.21,
"grad_norm": 1.4754955768585205,
"learning_rate": 9.092233009708738e-05,
"loss": 1.5965,
"step": 770
},
{
"epoch": 1.22,
"grad_norm": 1.6428216695785522,
"learning_rate": 9.076051779935276e-05,
"loss": 1.5732,
"step": 780
},
{
"epoch": 1.24,
"grad_norm": 2.023967742919922,
"learning_rate": 9.059870550161813e-05,
"loss": 1.5751,
"step": 790
},
{
"epoch": 1.25,
"grad_norm": 1.5377651453018188,
"learning_rate": 9.04368932038835e-05,
"loss": 1.5391,
"step": 800
},
{
"epoch": 1.27,
"grad_norm": 1.5178940296173096,
"learning_rate": 9.027508090614887e-05,
"loss": 1.5374,
"step": 810
},
{
"epoch": 1.29,
"grad_norm": 1.4959659576416016,
"learning_rate": 9.011326860841425e-05,
"loss": 1.5309,
"step": 820
},
{
"epoch": 1.3,
"grad_norm": 1.662809133529663,
"learning_rate": 8.995145631067961e-05,
"loss": 1.529,
"step": 830
},
{
"epoch": 1.32,
"grad_norm": 1.3378241062164307,
"learning_rate": 8.978964401294499e-05,
"loss": 1.5824,
"step": 840
},
{
"epoch": 1.33,
"grad_norm": 1.5929758548736572,
"learning_rate": 8.962783171521036e-05,
"loss": 1.5355,
"step": 850
},
{
"epoch": 1.35,
"grad_norm": 1.2536360025405884,
"learning_rate": 8.946601941747574e-05,
"loss": 1.5581,
"step": 860
},
{
"epoch": 1.36,
"grad_norm": 1.3562827110290527,
"learning_rate": 8.93042071197411e-05,
"loss": 1.5411,
"step": 870
},
{
"epoch": 1.38,
"grad_norm": 1.5773125886917114,
"learning_rate": 8.914239482200648e-05,
"loss": 1.5453,
"step": 880
},
{
"epoch": 1.39,
"grad_norm": 1.6030040979385376,
"learning_rate": 8.898058252427185e-05,
"loss": 1.5551,
"step": 890
},
{
"epoch": 1.41,
"grad_norm": 1.471346139907837,
"learning_rate": 8.881877022653722e-05,
"loss": 1.5777,
"step": 900
},
{
"epoch": 1.43,
"grad_norm": 1.343739628791809,
"learning_rate": 8.865695792880259e-05,
"loss": 1.527,
"step": 910
},
{
"epoch": 1.44,
"grad_norm": 1.498059868812561,
"learning_rate": 8.849514563106797e-05,
"loss": 1.5739,
"step": 920
},
{
"epoch": 1.46,
"grad_norm": 1.2020891904830933,
"learning_rate": 8.833333333333333e-05,
"loss": 1.5419,
"step": 930
},
{
"epoch": 1.47,
"grad_norm": 1.4334652423858643,
"learning_rate": 8.817152103559872e-05,
"loss": 1.5496,
"step": 940
},
{
"epoch": 1.49,
"grad_norm": 2.028785467147827,
"learning_rate": 8.800970873786408e-05,
"loss": 1.5261,
"step": 950
},
{
"epoch": 1.5,
"grad_norm": 1.5516548156738281,
"learning_rate": 8.784789644012944e-05,
"loss": 1.5342,
"step": 960
},
{
"epoch": 1.52,
"grad_norm": 1.715617060661316,
"learning_rate": 8.768608414239483e-05,
"loss": 1.5576,
"step": 970
},
{
"epoch": 1.54,
"grad_norm": 1.5550382137298584,
"learning_rate": 8.75242718446602e-05,
"loss": 1.5656,
"step": 980
},
{
"epoch": 1.55,
"grad_norm": 1.8287665843963623,
"learning_rate": 8.736245954692557e-05,
"loss": 1.5102,
"step": 990
},
{
"epoch": 1.57,
"grad_norm": 1.2875758409500122,
"learning_rate": 8.720064724919095e-05,
"loss": 1.4912,
"step": 1000
},
{
"epoch": 1.58,
"grad_norm": 1.2705069780349731,
"learning_rate": 8.703883495145631e-05,
"loss": 1.538,
"step": 1010
},
{
"epoch": 1.6,
"grad_norm": 1.945082664489746,
"learning_rate": 8.687702265372169e-05,
"loss": 1.512,
"step": 1020
},
{
"epoch": 1.61,
"grad_norm": 1.4333319664001465,
"learning_rate": 8.671521035598706e-05,
"loss": 1.5592,
"step": 1030
},
{
"epoch": 1.63,
"grad_norm": 1.8623706102371216,
"learning_rate": 8.655339805825244e-05,
"loss": 1.5152,
"step": 1040
},
{
"epoch": 1.65,
"grad_norm": 1.23512601852417,
"learning_rate": 8.63915857605178e-05,
"loss": 1.5225,
"step": 1050
},
{
"epoch": 1.66,
"grad_norm": 1.358238697052002,
"learning_rate": 8.622977346278318e-05,
"loss": 1.5028,
"step": 1060
},
{
"epoch": 1.68,
"grad_norm": 1.351360559463501,
"learning_rate": 8.606796116504855e-05,
"loss": 1.522,
"step": 1070
},
{
"epoch": 1.69,
"grad_norm": 1.4016929864883423,
"learning_rate": 8.590614886731392e-05,
"loss": 1.5155,
"step": 1080
},
{
"epoch": 1.71,
"grad_norm": 1.3688647747039795,
"learning_rate": 8.574433656957929e-05,
"loss": 1.5491,
"step": 1090
},
{
"epoch": 1.72,
"grad_norm": 1.6790826320648193,
"learning_rate": 8.558252427184467e-05,
"loss": 1.5048,
"step": 1100
},
{
"epoch": 1.74,
"grad_norm": 1.184859037399292,
"learning_rate": 8.542071197411003e-05,
"loss": 1.5155,
"step": 1110
},
{
"epoch": 1.76,
"grad_norm": 1.2918730974197388,
"learning_rate": 8.525889967637541e-05,
"loss": 1.5618,
"step": 1120
},
{
"epoch": 1.77,
"grad_norm": 1.484913945198059,
"learning_rate": 8.509708737864078e-05,
"loss": 1.5435,
"step": 1130
},
{
"epoch": 1.79,
"grad_norm": 1.2351198196411133,
"learning_rate": 8.493527508090615e-05,
"loss": 1.497,
"step": 1140
},
{
"epoch": 1.8,
"grad_norm": 1.3195732831954956,
"learning_rate": 8.477346278317152e-05,
"loss": 1.4924,
"step": 1150
},
{
"epoch": 1.82,
"grad_norm": 1.1907435655593872,
"learning_rate": 8.46116504854369e-05,
"loss": 1.4678,
"step": 1160
},
{
"epoch": 1.83,
"grad_norm": 1.2588481903076172,
"learning_rate": 8.444983818770227e-05,
"loss": 1.5284,
"step": 1170
},
{
"epoch": 1.85,
"grad_norm": 1.448137879371643,
"learning_rate": 8.428802588996764e-05,
"loss": 1.5338,
"step": 1180
},
{
"epoch": 1.87,
"grad_norm": 1.2317396402359009,
"learning_rate": 8.412621359223301e-05,
"loss": 1.4911,
"step": 1190
},
{
"epoch": 1.88,
"grad_norm": 1.3239874839782715,
"learning_rate": 8.396440129449839e-05,
"loss": 1.5164,
"step": 1200
},
{
"epoch": 1.9,
"grad_norm": 1.2729260921478271,
"learning_rate": 8.380258899676375e-05,
"loss": 1.5313,
"step": 1210
},
{
"epoch": 1.91,
"grad_norm": 1.2165913581848145,
"learning_rate": 8.364077669902913e-05,
"loss": 1.5346,
"step": 1220
},
{
"epoch": 1.93,
"grad_norm": 1.2190241813659668,
"learning_rate": 8.34789644012945e-05,
"loss": 1.5028,
"step": 1230
},
{
"epoch": 1.94,
"grad_norm": 1.3130549192428589,
"learning_rate": 8.331715210355987e-05,
"loss": 1.4912,
"step": 1240
},
{
"epoch": 1.96,
"grad_norm": 1.3163666725158691,
"learning_rate": 8.315533980582526e-05,
"loss": 1.4825,
"step": 1250
},
{
"epoch": 1.97,
"grad_norm": 1.0928343534469604,
"learning_rate": 8.299352750809062e-05,
"loss": 1.4839,
"step": 1260
},
{
"epoch": 1.99,
"grad_norm": 1.2266935110092163,
"learning_rate": 8.2831715210356e-05,
"loss": 1.4707,
"step": 1270
},
{
"epoch": 2.0,
"eval_loss": 1.3973770141601562,
"eval_runtime": 238.8528,
"eval_samples_per_second": 234.274,
"eval_steps_per_second": 3.663,
"step": 1276
},
{
"epoch": 2.01,
"grad_norm": 1.376717209815979,
"learning_rate": 8.266990291262137e-05,
"loss": 1.4909,
"step": 1280
},
{
"epoch": 2.02,
"grad_norm": 1.4684240818023682,
"learning_rate": 8.250809061488673e-05,
"loss": 1.5064,
"step": 1290
},
{
"epoch": 2.04,
"grad_norm": 1.382200837135315,
"learning_rate": 8.234627831715211e-05,
"loss": 1.4718,
"step": 1300
},
{
"epoch": 2.05,
"grad_norm": 1.258672833442688,
"learning_rate": 8.218446601941748e-05,
"loss": 1.4647,
"step": 1310
},
{
"epoch": 2.07,
"grad_norm": 1.2823798656463623,
"learning_rate": 8.202265372168285e-05,
"loss": 1.5035,
"step": 1320
},
{
"epoch": 2.08,
"grad_norm": 1.2092549800872803,
"learning_rate": 8.186084142394822e-05,
"loss": 1.4678,
"step": 1330
},
{
"epoch": 2.1,
"grad_norm": 1.411786675453186,
"learning_rate": 8.16990291262136e-05,
"loss": 1.4917,
"step": 1340
},
{
"epoch": 2.12,
"grad_norm": 1.2800941467285156,
"learning_rate": 8.153721682847897e-05,
"loss": 1.4615,
"step": 1350
},
{
"epoch": 2.13,
"grad_norm": 1.140244483947754,
"learning_rate": 8.137540453074434e-05,
"loss": 1.4955,
"step": 1360
},
{
"epoch": 2.15,
"grad_norm": 1.1085867881774902,
"learning_rate": 8.121359223300971e-05,
"loss": 1.4771,
"step": 1370
},
{
"epoch": 2.16,
"grad_norm": 1.1928099393844604,
"learning_rate": 8.105177993527509e-05,
"loss": 1.4299,
"step": 1380
},
{
"epoch": 2.18,
"grad_norm": 1.2127610445022583,
"learning_rate": 8.088996763754045e-05,
"loss": 1.4889,
"step": 1390
},
{
"epoch": 2.19,
"grad_norm": 1.262681245803833,
"learning_rate": 8.072815533980583e-05,
"loss": 1.4704,
"step": 1400
},
{
"epoch": 2.21,
"grad_norm": 1.291587471961975,
"learning_rate": 8.05663430420712e-05,
"loss": 1.4705,
"step": 1410
},
{
"epoch": 2.23,
"grad_norm": 1.1992008686065674,
"learning_rate": 8.040453074433657e-05,
"loss": 1.4879,
"step": 1420
},
{
"epoch": 2.24,
"grad_norm": 1.273314356803894,
"learning_rate": 8.024271844660196e-05,
"loss": 1.5023,
"step": 1430
},
{
"epoch": 2.26,
"grad_norm": 1.2564905881881714,
"learning_rate": 8.008090614886732e-05,
"loss": 1.4731,
"step": 1440
},
{
"epoch": 2.27,
"grad_norm": 1.1837260723114014,
"learning_rate": 7.991909385113268e-05,
"loss": 1.5127,
"step": 1450
},
{
"epoch": 2.29,
"grad_norm": 1.2088736295700073,
"learning_rate": 7.975728155339807e-05,
"loss": 1.4674,
"step": 1460
},
{
"epoch": 2.3,
"grad_norm": 1.0863879919052124,
"learning_rate": 7.959546925566343e-05,
"loss": 1.4484,
"step": 1470
},
{
"epoch": 2.32,
"grad_norm": 1.2499940395355225,
"learning_rate": 7.943365695792881e-05,
"loss": 1.472,
"step": 1480
},
{
"epoch": 2.34,
"grad_norm": 1.264376163482666,
"learning_rate": 7.927184466019419e-05,
"loss": 1.4647,
"step": 1490
},
{
"epoch": 2.35,
"grad_norm": 1.2872412204742432,
"learning_rate": 7.911003236245955e-05,
"loss": 1.4985,
"step": 1500
},
{
"epoch": 2.37,
"grad_norm": 1.3327168226242065,
"learning_rate": 7.894822006472492e-05,
"loss": 1.4647,
"step": 1510
},
{
"epoch": 2.38,
"grad_norm": 1.0859509706497192,
"learning_rate": 7.87864077669903e-05,
"loss": 1.4904,
"step": 1520
},
{
"epoch": 2.4,
"grad_norm": 1.44997239112854,
"learning_rate": 7.862459546925566e-05,
"loss": 1.4611,
"step": 1530
},
{
"epoch": 2.41,
"grad_norm": 1.1412160396575928,
"learning_rate": 7.846278317152104e-05,
"loss": 1.4366,
"step": 1540
},
{
"epoch": 2.43,
"grad_norm": 1.6778419017791748,
"learning_rate": 7.830097087378641e-05,
"loss": 1.4827,
"step": 1550
},
{
"epoch": 2.45,
"grad_norm": 1.0160796642303467,
"learning_rate": 7.813915857605179e-05,
"loss": 1.4795,
"step": 1560
},
{
"epoch": 2.46,
"grad_norm": 1.2060844898223877,
"learning_rate": 7.797734627831715e-05,
"loss": 1.4209,
"step": 1570
},
{
"epoch": 2.48,
"grad_norm": 1.0862780809402466,
"learning_rate": 7.781553398058253e-05,
"loss": 1.4809,
"step": 1580
},
{
"epoch": 2.49,
"grad_norm": 1.1212877035140991,
"learning_rate": 7.76537216828479e-05,
"loss": 1.4411,
"step": 1590
},
{
"epoch": 2.51,
"grad_norm": 1.2004621028900146,
"learning_rate": 7.749190938511327e-05,
"loss": 1.4595,
"step": 1600
},
{
"epoch": 2.52,
"grad_norm": 1.2479016780853271,
"learning_rate": 7.733009708737864e-05,
"loss": 1.4501,
"step": 1610
},
{
"epoch": 2.54,
"grad_norm": 1.1785011291503906,
"learning_rate": 7.716828478964402e-05,
"loss": 1.4512,
"step": 1620
},
{
"epoch": 2.55,
"grad_norm": 1.049422264099121,
"learning_rate": 7.700647249190938e-05,
"loss": 1.4555,
"step": 1630
},
{
"epoch": 2.57,
"grad_norm": 1.135881781578064,
"learning_rate": 7.684466019417476e-05,
"loss": 1.4327,
"step": 1640
},
{
"epoch": 2.59,
"grad_norm": 1.2440606355667114,
"learning_rate": 7.668284789644013e-05,
"loss": 1.4566,
"step": 1650
},
{
"epoch": 2.6,
"grad_norm": 1.2145899534225464,
"learning_rate": 7.652103559870551e-05,
"loss": 1.4393,
"step": 1660
},
{
"epoch": 2.62,
"grad_norm": 1.1985297203063965,
"learning_rate": 7.635922330097087e-05,
"loss": 1.4631,
"step": 1670
},
{
"epoch": 2.63,
"grad_norm": 1.2036408185958862,
"learning_rate": 7.619741100323625e-05,
"loss": 1.4589,
"step": 1680
},
{
"epoch": 2.65,
"grad_norm": 1.4329497814178467,
"learning_rate": 7.603559870550162e-05,
"loss": 1.4457,
"step": 1690
},
{
"epoch": 2.66,
"grad_norm": 1.2009234428405762,
"learning_rate": 7.587378640776699e-05,
"loss": 1.4525,
"step": 1700
},
{
"epoch": 2.68,
"grad_norm": 1.1393496990203857,
"learning_rate": 7.571197411003236e-05,
"loss": 1.4641,
"step": 1710
},
{
"epoch": 2.7,
"grad_norm": 1.0305531024932861,
"learning_rate": 7.555016181229774e-05,
"loss": 1.462,
"step": 1720
},
{
"epoch": 2.71,
"grad_norm": 1.1692051887512207,
"learning_rate": 7.540453074433658e-05,
"loss": 1.4366,
"step": 1730
},
{
"epoch": 2.73,
"grad_norm": 1.0835274457931519,
"learning_rate": 7.524271844660194e-05,
"loss": 1.4322,
"step": 1740
},
{
"epoch": 2.74,
"grad_norm": 1.22800874710083,
"learning_rate": 7.508090614886732e-05,
"loss": 1.4586,
"step": 1750
},
{
"epoch": 2.76,
"grad_norm": 1.0783599615097046,
"learning_rate": 7.49190938511327e-05,
"loss": 1.4474,
"step": 1760
},
{
"epoch": 2.77,
"grad_norm": 1.2081267833709717,
"learning_rate": 7.475728155339806e-05,
"loss": 1.4849,
"step": 1770
},
{
"epoch": 2.79,
"grad_norm": 1.0928617715835571,
"learning_rate": 7.459546925566343e-05,
"loss": 1.4306,
"step": 1780
},
{
"epoch": 2.81,
"grad_norm": 1.0414780378341675,
"learning_rate": 7.443365695792881e-05,
"loss": 1.4761,
"step": 1790
},
{
"epoch": 2.82,
"grad_norm": 1.0172119140625,
"learning_rate": 7.427184466019417e-05,
"loss": 1.4416,
"step": 1800
},
{
"epoch": 2.84,
"grad_norm": 1.1381781101226807,
"learning_rate": 7.411003236245955e-05,
"loss": 1.4488,
"step": 1810
},
{
"epoch": 2.85,
"grad_norm": 1.2968194484710693,
"learning_rate": 7.394822006472492e-05,
"loss": 1.4656,
"step": 1820
},
{
"epoch": 2.87,
"grad_norm": 1.2675330638885498,
"learning_rate": 7.37864077669903e-05,
"loss": 1.4769,
"step": 1830
},
{
"epoch": 2.88,
"grad_norm": 1.2188855409622192,
"learning_rate": 7.362459546925566e-05,
"loss": 1.4567,
"step": 1840
},
{
"epoch": 2.9,
"grad_norm": 1.1572908163070679,
"learning_rate": 7.346278317152104e-05,
"loss": 1.4557,
"step": 1850
},
{
"epoch": 2.92,
"grad_norm": 1.65030837059021,
"learning_rate": 7.330097087378641e-05,
"loss": 1.4382,
"step": 1860
},
{
"epoch": 2.93,
"grad_norm": 1.100380301475525,
"learning_rate": 7.313915857605178e-05,
"loss": 1.4483,
"step": 1870
},
{
"epoch": 2.95,
"grad_norm": 1.1413873434066772,
"learning_rate": 7.297734627831717e-05,
"loss": 1.4125,
"step": 1880
},
{
"epoch": 2.96,
"grad_norm": 1.0477943420410156,
"learning_rate": 7.281553398058253e-05,
"loss": 1.4665,
"step": 1890
},
{
"epoch": 2.98,
"grad_norm": 1.630313515663147,
"learning_rate": 7.265372168284789e-05,
"loss": 1.4091,
"step": 1900
},
{
"epoch": 2.99,
"grad_norm": 1.1295515298843384,
"learning_rate": 7.249190938511328e-05,
"loss": 1.434,
"step": 1910
},
{
"epoch": 3.0,
"eval_loss": 1.3728151321411133,
"eval_runtime": 238.5107,
"eval_samples_per_second": 234.61,
"eval_steps_per_second": 3.669,
"step": 1914
},
{
"epoch": 3.01,
"grad_norm": 1.0527026653289795,
"learning_rate": 7.233009708737864e-05,
"loss": 1.443,
"step": 1920
},
{
"epoch": 3.03,
"grad_norm": 0.9968172907829285,
"learning_rate": 7.216828478964402e-05,
"loss": 1.4266,
"step": 1930
},
{
"epoch": 3.04,
"grad_norm": 1.117412805557251,
"learning_rate": 7.20064724919094e-05,
"loss": 1.4166,
"step": 1940
},
{
"epoch": 3.06,
"grad_norm": 1.1899322271347046,
"learning_rate": 7.184466019417476e-05,
"loss": 1.429,
"step": 1950
},
{
"epoch": 3.07,
"grad_norm": 1.0852290391921997,
"learning_rate": 7.168284789644013e-05,
"loss": 1.4082,
"step": 1960
},
{
"epoch": 3.09,
"grad_norm": 1.0769116878509521,
"learning_rate": 7.152103559870551e-05,
"loss": 1.4289,
"step": 1970
},
{
"epoch": 3.1,
"grad_norm": 1.1807366609573364,
"learning_rate": 7.135922330097087e-05,
"loss": 1.4151,
"step": 1980
},
{
"epoch": 3.12,
"grad_norm": 1.2398109436035156,
"learning_rate": 7.119741100323625e-05,
"loss": 1.4504,
"step": 1990
},
{
"epoch": 3.13,
"grad_norm": 1.1362110376358032,
"learning_rate": 7.103559870550163e-05,
"loss": 1.4148,
"step": 2000
},
{
"epoch": 3.15,
"grad_norm": 1.02677583694458,
"learning_rate": 7.0873786407767e-05,
"loss": 1.4133,
"step": 2010
},
{
"epoch": 3.17,
"grad_norm": 1.1590044498443604,
"learning_rate": 7.071197411003236e-05,
"loss": 1.4151,
"step": 2020
},
{
"epoch": 3.18,
"grad_norm": 1.193935751914978,
"learning_rate": 7.055016181229773e-05,
"loss": 1.4133,
"step": 2030
},
{
"epoch": 3.2,
"grad_norm": 1.2912925481796265,
"learning_rate": 7.038834951456312e-05,
"loss": 1.4321,
"step": 2040
},
{
"epoch": 3.21,
"grad_norm": 1.2993954420089722,
"learning_rate": 7.022653721682848e-05,
"loss": 1.4038,
"step": 2050
},
{
"epoch": 3.23,
"grad_norm": 1.4455443620681763,
"learning_rate": 7.006472491909385e-05,
"loss": 1.4331,
"step": 2060
},
{
"epoch": 3.24,
"grad_norm": 1.0302395820617676,
"learning_rate": 6.990291262135923e-05,
"loss": 1.4191,
"step": 2070
},
{
"epoch": 3.26,
"grad_norm": 1.0828335285186768,
"learning_rate": 6.974110032362459e-05,
"loss": 1.419,
"step": 2080
},
{
"epoch": 3.28,
"grad_norm": 1.2309439182281494,
"learning_rate": 6.957928802588997e-05,
"loss": 1.4029,
"step": 2090
},
{
"epoch": 3.29,
"grad_norm": 1.118615984916687,
"learning_rate": 6.941747572815534e-05,
"loss": 1.4336,
"step": 2100
},
{
"epoch": 3.31,
"grad_norm": 1.0558626651763916,
"learning_rate": 6.925566343042071e-05,
"loss": 1.4097,
"step": 2110
},
{
"epoch": 3.32,
"grad_norm": 1.086051344871521,
"learning_rate": 6.909385113268608e-05,
"loss": 1.4054,
"step": 2120
},
{
"epoch": 3.34,
"grad_norm": 1.2219781875610352,
"learning_rate": 6.893203883495146e-05,
"loss": 1.413,
"step": 2130
},
{
"epoch": 3.35,
"grad_norm": 0.9822403788566589,
"learning_rate": 6.877022653721684e-05,
"loss": 1.4413,
"step": 2140
},
{
"epoch": 3.37,
"grad_norm": 1.2207833528518677,
"learning_rate": 6.86084142394822e-05,
"loss": 1.4364,
"step": 2150
},
{
"epoch": 3.39,
"grad_norm": 1.1165385246276855,
"learning_rate": 6.844660194174757e-05,
"loss": 1.4202,
"step": 2160
},
{
"epoch": 3.4,
"grad_norm": 1.1908485889434814,
"learning_rate": 6.828478964401295e-05,
"loss": 1.4231,
"step": 2170
},
{
"epoch": 3.42,
"grad_norm": 0.9900442957878113,
"learning_rate": 6.812297734627831e-05,
"loss": 1.3984,
"step": 2180
},
{
"epoch": 3.43,
"grad_norm": 1.053161382675171,
"learning_rate": 6.79611650485437e-05,
"loss": 1.4027,
"step": 2190
},
{
"epoch": 3.45,
"grad_norm": 1.0249375104904175,
"learning_rate": 6.779935275080906e-05,
"loss": 1.4085,
"step": 2200
},
{
"epoch": 3.46,
"grad_norm": 1.1247555017471313,
"learning_rate": 6.763754045307443e-05,
"loss": 1.4245,
"step": 2210
},
{
"epoch": 3.48,
"grad_norm": 1.138109803199768,
"learning_rate": 6.747572815533982e-05,
"loss": 1.4129,
"step": 2220
},
{
"epoch": 3.5,
"grad_norm": 0.9391753077507019,
"learning_rate": 6.731391585760518e-05,
"loss": 1.3891,
"step": 2230
},
{
"epoch": 3.51,
"grad_norm": 1.0447841882705688,
"learning_rate": 6.715210355987056e-05,
"loss": 1.3899,
"step": 2240
},
{
"epoch": 3.53,
"grad_norm": 1.0124462842941284,
"learning_rate": 6.699029126213593e-05,
"loss": 1.4284,
"step": 2250
},
{
"epoch": 3.54,
"grad_norm": 1.0774402618408203,
"learning_rate": 6.68284789644013e-05,
"loss": 1.3935,
"step": 2260
},
{
"epoch": 3.56,
"grad_norm": 1.1104766130447388,
"learning_rate": 6.666666666666667e-05,
"loss": 1.4162,
"step": 2270
},
{
"epoch": 3.57,
"grad_norm": 1.0763837099075317,
"learning_rate": 6.650485436893205e-05,
"loss": 1.4176,
"step": 2280
},
{
"epoch": 3.59,
"grad_norm": 1.2160184383392334,
"learning_rate": 6.634304207119741e-05,
"loss": 1.3826,
"step": 2290
},
{
"epoch": 3.61,
"grad_norm": 1.1123363971710205,
"learning_rate": 6.618122977346278e-05,
"loss": 1.4101,
"step": 2300
},
{
"epoch": 3.62,
"grad_norm": 1.0691155195236206,
"learning_rate": 6.601941747572816e-05,
"loss": 1.3906,
"step": 2310
},
{
"epoch": 3.64,
"grad_norm": 1.0122454166412354,
"learning_rate": 6.585760517799354e-05,
"loss": 1.3923,
"step": 2320
},
{
"epoch": 3.65,
"grad_norm": 1.0922036170959473,
"learning_rate": 6.56957928802589e-05,
"loss": 1.41,
"step": 2330
},
{
"epoch": 3.67,
"grad_norm": 1.1541335582733154,
"learning_rate": 6.553398058252428e-05,
"loss": 1.4316,
"step": 2340
},
{
"epoch": 3.68,
"grad_norm": 1.1643292903900146,
"learning_rate": 6.537216828478965e-05,
"loss": 1.4516,
"step": 2350
},
{
"epoch": 3.7,
"grad_norm": 1.1834907531738281,
"learning_rate": 6.521035598705501e-05,
"loss": 1.4204,
"step": 2360
},
{
"epoch": 3.71,
"grad_norm": 1.0385903120040894,
"learning_rate": 6.504854368932039e-05,
"loss": 1.4191,
"step": 2370
},
{
"epoch": 3.73,
"grad_norm": 0.9123443365097046,
"learning_rate": 6.488673139158577e-05,
"loss": 1.3812,
"step": 2380
},
{
"epoch": 3.75,
"grad_norm": 1.2512091398239136,
"learning_rate": 6.472491909385113e-05,
"loss": 1.4015,
"step": 2390
},
{
"epoch": 3.76,
"grad_norm": 1.2057242393493652,
"learning_rate": 6.456310679611652e-05,
"loss": 1.4186,
"step": 2400
},
{
"epoch": 3.78,
"grad_norm": 1.0209051370620728,
"learning_rate": 6.440129449838188e-05,
"loss": 1.3958,
"step": 2410
},
{
"epoch": 3.79,
"grad_norm": 1.138934850692749,
"learning_rate": 6.423948220064726e-05,
"loss": 1.427,
"step": 2420
},
{
"epoch": 3.81,
"grad_norm": 1.0238676071166992,
"learning_rate": 6.407766990291263e-05,
"loss": 1.4007,
"step": 2430
},
{
"epoch": 3.82,
"grad_norm": 1.3099086284637451,
"learning_rate": 6.3915857605178e-05,
"loss": 1.4029,
"step": 2440
},
{
"epoch": 3.84,
"grad_norm": 1.043864130973816,
"learning_rate": 6.375404530744337e-05,
"loss": 1.4046,
"step": 2450
},
{
"epoch": 3.86,
"grad_norm": 0.9795685410499573,
"learning_rate": 6.359223300970875e-05,
"loss": 1.4191,
"step": 2460
},
{
"epoch": 3.87,
"grad_norm": 0.9932299256324768,
"learning_rate": 6.343042071197411e-05,
"loss": 1.4009,
"step": 2470
},
{
"epoch": 3.89,
"grad_norm": 1.114258050918579,
"learning_rate": 6.326860841423949e-05,
"loss": 1.4444,
"step": 2480
},
{
"epoch": 3.9,
"grad_norm": 0.9840899109840393,
"learning_rate": 6.310679611650486e-05,
"loss": 1.4138,
"step": 2490
},
{
"epoch": 3.92,
"grad_norm": 0.9501660466194153,
"learning_rate": 6.294498381877024e-05,
"loss": 1.3924,
"step": 2500
},
{
"epoch": 3.93,
"grad_norm": 1.2911392450332642,
"learning_rate": 6.27831715210356e-05,
"loss": 1.4036,
"step": 2510
},
{
"epoch": 3.95,
"grad_norm": 1.0698624849319458,
"learning_rate": 6.262135922330098e-05,
"loss": 1.3908,
"step": 2520
},
{
"epoch": 3.97,
"grad_norm": 1.0507279634475708,
"learning_rate": 6.245954692556635e-05,
"loss": 1.401,
"step": 2530
},
{
"epoch": 3.98,
"grad_norm": 1.1537911891937256,
"learning_rate": 6.229773462783171e-05,
"loss": 1.3823,
"step": 2540
},
{
"epoch": 4.0,
"grad_norm": 1.057826280593872,
"learning_rate": 6.213592233009709e-05,
"loss": 1.4156,
"step": 2550
},
{
"epoch": 4.0,
"eval_loss": 1.36408269405365,
"eval_runtime": 241.0553,
"eval_samples_per_second": 232.133,
"eval_steps_per_second": 3.63,
"step": 2552
},
{
"epoch": 4.01,
"grad_norm": 1.1321182250976562,
"learning_rate": 6.197411003236247e-05,
"loss": 1.4004,
"step": 2560
},
{
"epoch": 4.03,
"grad_norm": 1.130816102027893,
"learning_rate": 6.181229773462783e-05,
"loss": 1.3964,
"step": 2570
},
{
"epoch": 4.04,
"grad_norm": 1.1177788972854614,
"learning_rate": 6.16504854368932e-05,
"loss": 1.3924,
"step": 2580
},
{
"epoch": 4.06,
"grad_norm": 1.2621471881866455,
"learning_rate": 6.148867313915858e-05,
"loss": 1.4158,
"step": 2590
},
{
"epoch": 4.08,
"grad_norm": 1.1513010263442993,
"learning_rate": 6.132686084142394e-05,
"loss": 1.3814,
"step": 2600
},
{
"epoch": 4.09,
"grad_norm": 0.9279323816299438,
"learning_rate": 6.116504854368932e-05,
"loss": 1.4108,
"step": 2610
},
{
"epoch": 4.11,
"grad_norm": 0.9440147280693054,
"learning_rate": 6.1003236245954696e-05,
"loss": 1.3666,
"step": 2620
},
{
"epoch": 4.12,
"grad_norm": 1.111999273300171,
"learning_rate": 6.0841423948220065e-05,
"loss": 1.364,
"step": 2630
},
{
"epoch": 4.14,
"grad_norm": 1.087272047996521,
"learning_rate": 6.0679611650485434e-05,
"loss": 1.3909,
"step": 2640
},
{
"epoch": 4.15,
"grad_norm": 1.1972167491912842,
"learning_rate": 6.051779935275082e-05,
"loss": 1.3557,
"step": 2650
},
{
"epoch": 4.17,
"grad_norm": 1.0718189477920532,
"learning_rate": 6.0355987055016186e-05,
"loss": 1.3721,
"step": 2660
},
{
"epoch": 4.18,
"grad_norm": 1.120781660079956,
"learning_rate": 6.019417475728155e-05,
"loss": 1.3843,
"step": 2670
},
{
"epoch": 4.2,
"grad_norm": 1.0872063636779785,
"learning_rate": 6.003236245954693e-05,
"loss": 1.4134,
"step": 2680
},
{
"epoch": 4.22,
"grad_norm": 1.0607426166534424,
"learning_rate": 5.98705501618123e-05,
"loss": 1.3926,
"step": 2690
},
{
"epoch": 4.23,
"grad_norm": 0.9699810743331909,
"learning_rate": 5.970873786407767e-05,
"loss": 1.4078,
"step": 2700
},
{
"epoch": 4.25,
"grad_norm": 1.0885370969772339,
"learning_rate": 5.9546925566343046e-05,
"loss": 1.3803,
"step": 2710
},
{
"epoch": 4.26,
"grad_norm": 1.1456458568572998,
"learning_rate": 5.9385113268608416e-05,
"loss": 1.3975,
"step": 2720
},
{
"epoch": 4.28,
"grad_norm": 1.0021471977233887,
"learning_rate": 5.9223300970873785e-05,
"loss": 1.3884,
"step": 2730
},
{
"epoch": 4.29,
"grad_norm": 1.1158568859100342,
"learning_rate": 5.906148867313917e-05,
"loss": 1.3916,
"step": 2740
},
{
"epoch": 4.31,
"grad_norm": 1.01328444480896,
"learning_rate": 5.889967637540453e-05,
"loss": 1.3668,
"step": 2750
},
{
"epoch": 4.33,
"grad_norm": 1.0017708539962769,
"learning_rate": 5.87378640776699e-05,
"loss": 1.3908,
"step": 2760
},
{
"epoch": 4.34,
"grad_norm": 1.0248336791992188,
"learning_rate": 5.857605177993528e-05,
"loss": 1.3897,
"step": 2770
},
{
"epoch": 4.36,
"grad_norm": 1.1342253684997559,
"learning_rate": 5.841423948220065e-05,
"loss": 1.3756,
"step": 2780
},
{
"epoch": 4.37,
"grad_norm": 1.026523470878601,
"learning_rate": 5.825242718446602e-05,
"loss": 1.3615,
"step": 2790
},
{
"epoch": 4.39,
"grad_norm": 1.1998255252838135,
"learning_rate": 5.80906148867314e-05,
"loss": 1.393,
"step": 2800
},
{
"epoch": 4.4,
"grad_norm": 1.110936164855957,
"learning_rate": 5.7928802588996766e-05,
"loss": 1.372,
"step": 2810
},
{
"epoch": 4.42,
"grad_norm": 1.0857529640197754,
"learning_rate": 5.7766990291262135e-05,
"loss": 1.3888,
"step": 2820
},
{
"epoch": 4.44,
"grad_norm": 1.0588997602462769,
"learning_rate": 5.760517799352752e-05,
"loss": 1.3711,
"step": 2830
},
{
"epoch": 4.45,
"grad_norm": 1.0266647338867188,
"learning_rate": 5.744336569579288e-05,
"loss": 1.3844,
"step": 2840
},
{
"epoch": 4.47,
"grad_norm": 1.1120190620422363,
"learning_rate": 5.728155339805825e-05,
"loss": 1.3578,
"step": 2850
},
{
"epoch": 4.48,
"grad_norm": 1.085601568222046,
"learning_rate": 5.711974110032363e-05,
"loss": 1.3723,
"step": 2860
},
{
"epoch": 4.5,
"grad_norm": 1.0725315809249878,
"learning_rate": 5.6957928802589e-05,
"loss": 1.3931,
"step": 2870
},
{
"epoch": 4.51,
"grad_norm": 1.0232300758361816,
"learning_rate": 5.679611650485437e-05,
"loss": 1.3913,
"step": 2880
},
{
"epoch": 4.53,
"grad_norm": 1.0337555408477783,
"learning_rate": 5.663430420711975e-05,
"loss": 1.3917,
"step": 2890
},
{
"epoch": 4.55,
"grad_norm": 1.0149486064910889,
"learning_rate": 5.6472491909385117e-05,
"loss": 1.3687,
"step": 2900
},
{
"epoch": 4.56,
"grad_norm": 1.1411727666854858,
"learning_rate": 5.6310679611650486e-05,
"loss": 1.3829,
"step": 2910
},
{
"epoch": 4.58,
"grad_norm": 1.0212510824203491,
"learning_rate": 5.614886731391586e-05,
"loss": 1.4066,
"step": 2920
},
{
"epoch": 4.59,
"grad_norm": 0.9236648082733154,
"learning_rate": 5.598705501618123e-05,
"loss": 1.3656,
"step": 2930
},
{
"epoch": 4.61,
"grad_norm": 1.0414596796035767,
"learning_rate": 5.58252427184466e-05,
"loss": 1.377,
"step": 2940
},
{
"epoch": 4.62,
"grad_norm": 0.9774185419082642,
"learning_rate": 5.566343042071198e-05,
"loss": 1.3484,
"step": 2950
},
{
"epoch": 4.64,
"grad_norm": 1.113642930984497,
"learning_rate": 5.550161812297735e-05,
"loss": 1.3836,
"step": 2960
},
{
"epoch": 4.66,
"grad_norm": 0.9276535511016846,
"learning_rate": 5.533980582524272e-05,
"loss": 1.3647,
"step": 2970
},
{
"epoch": 4.67,
"grad_norm": 1.0644927024841309,
"learning_rate": 5.51779935275081e-05,
"loss": 1.3785,
"step": 2980
},
{
"epoch": 4.69,
"grad_norm": 0.9582048058509827,
"learning_rate": 5.501618122977347e-05,
"loss": 1.3894,
"step": 2990
},
{
"epoch": 4.7,
"grad_norm": 1.005980372428894,
"learning_rate": 5.4854368932038836e-05,
"loss": 1.3911,
"step": 3000
},
{
"epoch": 4.72,
"grad_norm": 0.924061119556427,
"learning_rate": 5.469255663430421e-05,
"loss": 1.371,
"step": 3010
},
{
"epoch": 4.73,
"grad_norm": 1.0288368463516235,
"learning_rate": 5.453074433656958e-05,
"loss": 1.3694,
"step": 3020
},
{
"epoch": 4.75,
"grad_norm": 0.9873260855674744,
"learning_rate": 5.436893203883495e-05,
"loss": 1.3833,
"step": 3030
},
{
"epoch": 4.76,
"grad_norm": 1.0119273662567139,
"learning_rate": 5.4207119741100334e-05,
"loss": 1.3829,
"step": 3040
},
{
"epoch": 4.78,
"grad_norm": 1.053831934928894,
"learning_rate": 5.40453074433657e-05,
"loss": 1.3835,
"step": 3050
},
{
"epoch": 4.8,
"grad_norm": 1.0047526359558105,
"learning_rate": 5.3883495145631065e-05,
"loss": 1.3749,
"step": 3060
},
{
"epoch": 4.81,
"grad_norm": 0.9968057870864868,
"learning_rate": 5.3721682847896435e-05,
"loss": 1.3644,
"step": 3070
},
{
"epoch": 4.83,
"grad_norm": 0.9985815286636353,
"learning_rate": 5.355987055016182e-05,
"loss": 1.355,
"step": 3080
},
{
"epoch": 4.84,
"grad_norm": 1.1190820932388306,
"learning_rate": 5.339805825242719e-05,
"loss": 1.3934,
"step": 3090
},
{
"epoch": 4.86,
"grad_norm": 0.942805290222168,
"learning_rate": 5.3236245954692556e-05,
"loss": 1.3554,
"step": 3100
},
{
"epoch": 4.87,
"grad_norm": 1.119782567024231,
"learning_rate": 5.307443365695793e-05,
"loss": 1.3874,
"step": 3110
},
{
"epoch": 4.89,
"grad_norm": 1.1315473318099976,
"learning_rate": 5.29126213592233e-05,
"loss": 1.3729,
"step": 3120
},
{
"epoch": 4.91,
"grad_norm": 1.1299010515213013,
"learning_rate": 5.275080906148867e-05,
"loss": 1.365,
"step": 3130
},
{
"epoch": 4.92,
"grad_norm": 0.9549687504768372,
"learning_rate": 5.2588996763754053e-05,
"loss": 1.4106,
"step": 3140
},
{
"epoch": 4.94,
"grad_norm": 1.180576205253601,
"learning_rate": 5.2427184466019416e-05,
"loss": 1.3603,
"step": 3150
},
{
"epoch": 4.95,
"grad_norm": 1.0854281187057495,
"learning_rate": 5.2265372168284785e-05,
"loss": 1.3839,
"step": 3160
},
{
"epoch": 4.97,
"grad_norm": 1.0713642835617065,
"learning_rate": 5.210355987055017e-05,
"loss": 1.3759,
"step": 3170
},
{
"epoch": 4.98,
"grad_norm": 1.0487116575241089,
"learning_rate": 5.194174757281554e-05,
"loss": 1.386,
"step": 3180
},
{
"epoch": 5.0,
"grad_norm": 1.1627116203308105,
"learning_rate": 5.1779935275080907e-05,
"loss": 1.3973,
"step": 3190
},
{
"epoch": 5.0,
"eval_loss": 1.356418251991272,
"eval_runtime": 235.512,
"eval_samples_per_second": 237.597,
"eval_steps_per_second": 3.715,
"step": 3190
},
{
"epoch": 5.02,
"grad_norm": 0.9876235723495483,
"learning_rate": 5.161812297734628e-05,
"loss": 1.3562,
"step": 3200
},
{
"epoch": 5.03,
"grad_norm": 1.0808262825012207,
"learning_rate": 5.145631067961165e-05,
"loss": 1.3637,
"step": 3210
},
{
"epoch": 5.05,
"grad_norm": 1.1226028203964233,
"learning_rate": 5.129449838187702e-05,
"loss": 1.3602,
"step": 3220
},
{
"epoch": 5.06,
"grad_norm": 0.9872296452522278,
"learning_rate": 5.11326860841424e-05,
"loss": 1.3488,
"step": 3230
},
{
"epoch": 5.08,
"grad_norm": 1.0114407539367676,
"learning_rate": 5.0970873786407766e-05,
"loss": 1.337,
"step": 3240
},
{
"epoch": 5.09,
"grad_norm": 1.0250917673110962,
"learning_rate": 5.0809061488673136e-05,
"loss": 1.3835,
"step": 3250
},
{
"epoch": 5.11,
"grad_norm": 1.051629900932312,
"learning_rate": 5.064724919093852e-05,
"loss": 1.3584,
"step": 3260
},
{
"epoch": 5.13,
"grad_norm": 0.9989591836929321,
"learning_rate": 5.048543689320389e-05,
"loss": 1.3632,
"step": 3270
},
{
"epoch": 5.14,
"grad_norm": 1.332314372062683,
"learning_rate": 5.032362459546926e-05,
"loss": 1.3707,
"step": 3280
},
{
"epoch": 5.16,
"grad_norm": 1.2292428016662598,
"learning_rate": 5.016181229773463e-05,
"loss": 1.391,
"step": 3290
},
{
"epoch": 5.17,
"grad_norm": 0.9948448538780212,
"learning_rate": 5e-05,
"loss": 1.3809,
"step": 3300
},
{
"epoch": 5.19,
"grad_norm": 1.100917100906372,
"learning_rate": 4.983818770226538e-05,
"loss": 1.3607,
"step": 3310
},
{
"epoch": 5.2,
"grad_norm": 1.0996888875961304,
"learning_rate": 4.967637540453075e-05,
"loss": 1.3836,
"step": 3320
},
{
"epoch": 5.22,
"grad_norm": 1.0655711889266968,
"learning_rate": 4.951456310679612e-05,
"loss": 1.4003,
"step": 3330
},
{
"epoch": 5.24,
"grad_norm": 0.96244877576828,
"learning_rate": 4.935275080906149e-05,
"loss": 1.3663,
"step": 3340
},
{
"epoch": 5.25,
"grad_norm": 1.0469475984573364,
"learning_rate": 4.919093851132686e-05,
"loss": 1.3588,
"step": 3350
},
{
"epoch": 5.27,
"grad_norm": 1.0681473016738892,
"learning_rate": 4.902912621359224e-05,
"loss": 1.3406,
"step": 3360
},
{
"epoch": 5.28,
"grad_norm": 1.0062505006790161,
"learning_rate": 4.886731391585761e-05,
"loss": 1.3579,
"step": 3370
},
{
"epoch": 5.3,
"grad_norm": 1.0351691246032715,
"learning_rate": 4.870550161812298e-05,
"loss": 1.3542,
"step": 3380
},
{
"epoch": 5.31,
"grad_norm": 1.1298000812530518,
"learning_rate": 4.854368932038835e-05,
"loss": 1.362,
"step": 3390
},
{
"epoch": 5.33,
"grad_norm": 0.9948107600212097,
"learning_rate": 4.838187702265373e-05,
"loss": 1.3368,
"step": 3400
},
{
"epoch": 5.34,
"grad_norm": 1.0648963451385498,
"learning_rate": 4.82200647249191e-05,
"loss": 1.364,
"step": 3410
},
{
"epoch": 5.36,
"grad_norm": 1.3084832429885864,
"learning_rate": 4.805825242718447e-05,
"loss": 1.3696,
"step": 3420
},
{
"epoch": 5.38,
"grad_norm": 1.305511713027954,
"learning_rate": 4.789644012944984e-05,
"loss": 1.3425,
"step": 3430
},
{
"epoch": 5.39,
"grad_norm": 1.1301295757293701,
"learning_rate": 4.773462783171521e-05,
"loss": 1.3701,
"step": 3440
},
{
"epoch": 5.41,
"grad_norm": 1.031591534614563,
"learning_rate": 4.757281553398059e-05,
"loss": 1.3758,
"step": 3450
},
{
"epoch": 5.42,
"grad_norm": 0.9542048573493958,
"learning_rate": 4.741100323624595e-05,
"loss": 1.3522,
"step": 3460
},
{
"epoch": 5.44,
"grad_norm": 1.0493799448013306,
"learning_rate": 4.724919093851133e-05,
"loss": 1.377,
"step": 3470
},
{
"epoch": 5.45,
"grad_norm": 1.000899314880371,
"learning_rate": 4.7087378640776703e-05,
"loss": 1.3633,
"step": 3480
},
{
"epoch": 5.47,
"grad_norm": 1.1479367017745972,
"learning_rate": 4.692556634304207e-05,
"loss": 1.3822,
"step": 3490
},
{
"epoch": 5.49,
"grad_norm": 1.2105580568313599,
"learning_rate": 4.676375404530744e-05,
"loss": 1.356,
"step": 3500
},
{
"epoch": 5.5,
"grad_norm": 1.0127900838851929,
"learning_rate": 4.660194174757282e-05,
"loss": 1.3627,
"step": 3510
},
{
"epoch": 5.52,
"grad_norm": 1.0151665210723877,
"learning_rate": 4.644012944983819e-05,
"loss": 1.3691,
"step": 3520
},
{
"epoch": 5.53,
"grad_norm": 1.0975522994995117,
"learning_rate": 4.627831715210356e-05,
"loss": 1.3429,
"step": 3530
},
{
"epoch": 5.55,
"grad_norm": 0.9621421098709106,
"learning_rate": 4.611650485436894e-05,
"loss": 1.3518,
"step": 3540
},
{
"epoch": 5.56,
"grad_norm": 1.1527364253997803,
"learning_rate": 4.59546925566343e-05,
"loss": 1.3715,
"step": 3550
},
{
"epoch": 5.58,
"grad_norm": 0.951931893825531,
"learning_rate": 4.579288025889968e-05,
"loss": 1.3708,
"step": 3560
},
{
"epoch": 5.6,
"grad_norm": 0.9758704900741577,
"learning_rate": 4.5631067961165054e-05,
"loss": 1.3443,
"step": 3570
},
{
"epoch": 5.61,
"grad_norm": 1.0266969203948975,
"learning_rate": 4.546925566343042e-05,
"loss": 1.3459,
"step": 3580
},
{
"epoch": 5.63,
"grad_norm": 1.087899088859558,
"learning_rate": 4.530744336569579e-05,
"loss": 1.3413,
"step": 3590
},
{
"epoch": 5.64,
"grad_norm": 1.0898716449737549,
"learning_rate": 4.514563106796117e-05,
"loss": 1.3593,
"step": 3600
},
{
"epoch": 5.66,
"grad_norm": 0.9583597183227539,
"learning_rate": 4.498381877022654e-05,
"loss": 1.3645,
"step": 3610
},
{
"epoch": 5.67,
"grad_norm": 1.0944550037384033,
"learning_rate": 4.4822006472491914e-05,
"loss": 1.354,
"step": 3620
},
{
"epoch": 5.69,
"grad_norm": 0.9865062236785889,
"learning_rate": 4.466019417475728e-05,
"loss": 1.3925,
"step": 3630
},
{
"epoch": 5.71,
"grad_norm": 0.9798828959465027,
"learning_rate": 4.449838187702265e-05,
"loss": 1.3519,
"step": 3640
},
{
"epoch": 5.72,
"grad_norm": 1.0982840061187744,
"learning_rate": 4.433656957928803e-05,
"loss": 1.3696,
"step": 3650
},
{
"epoch": 5.74,
"grad_norm": 1.067983627319336,
"learning_rate": 4.4174757281553404e-05,
"loss": 1.3789,
"step": 3660
},
{
"epoch": 5.75,
"grad_norm": 1.052541971206665,
"learning_rate": 4.4012944983818774e-05,
"loss": 1.3425,
"step": 3670
},
{
"epoch": 5.77,
"grad_norm": 1.0020917654037476,
"learning_rate": 4.385113268608414e-05,
"loss": 1.3683,
"step": 3680
},
{
"epoch": 5.78,
"grad_norm": 1.0541307926177979,
"learning_rate": 4.368932038834951e-05,
"loss": 1.3724,
"step": 3690
},
{
"epoch": 5.8,
"grad_norm": 1.0373015403747559,
"learning_rate": 4.352750809061489e-05,
"loss": 1.3517,
"step": 3700
},
{
"epoch": 5.82,
"grad_norm": 0.9015985727310181,
"learning_rate": 4.3365695792880264e-05,
"loss": 1.3718,
"step": 3710
},
{
"epoch": 5.83,
"grad_norm": 1.17056405544281,
"learning_rate": 4.3203883495145634e-05,
"loss": 1.4,
"step": 3720
},
{
"epoch": 5.85,
"grad_norm": 1.050032377243042,
"learning_rate": 4.3042071197411e-05,
"loss": 1.3746,
"step": 3730
},
{
"epoch": 5.86,
"grad_norm": 0.9545022249221802,
"learning_rate": 4.288025889967638e-05,
"loss": 1.3534,
"step": 3740
},
{
"epoch": 5.88,
"grad_norm": 0.9817554354667664,
"learning_rate": 4.271844660194175e-05,
"loss": 1.3549,
"step": 3750
},
{
"epoch": 5.89,
"grad_norm": 0.9152430891990662,
"learning_rate": 4.2556634304207124e-05,
"loss": 1.3486,
"step": 3760
},
{
"epoch": 5.91,
"grad_norm": 1.0011696815490723,
"learning_rate": 4.2394822006472493e-05,
"loss": 1.3406,
"step": 3770
},
{
"epoch": 5.92,
"grad_norm": 0.9946187138557434,
"learning_rate": 4.223300970873786e-05,
"loss": 1.3646,
"step": 3780
},
{
"epoch": 5.94,
"grad_norm": 0.9893314242362976,
"learning_rate": 4.207119741100324e-05,
"loss": 1.3873,
"step": 3790
},
{
"epoch": 5.96,
"grad_norm": 1.0371146202087402,
"learning_rate": 4.1909385113268615e-05,
"loss": 1.368,
"step": 3800
},
{
"epoch": 5.97,
"grad_norm": 0.9810591340065002,
"learning_rate": 4.1747572815533984e-05,
"loss": 1.3528,
"step": 3810
},
{
"epoch": 5.99,
"grad_norm": 1.061856985092163,
"learning_rate": 4.158576051779935e-05,
"loss": 1.3454,
"step": 3820
},
{
"epoch": 6.0,
"eval_loss": 1.3522756099700928,
"eval_runtime": 237.0881,
"eval_samples_per_second": 236.018,
"eval_steps_per_second": 3.691,
"step": 3828
},
{
"epoch": 6.0,
"grad_norm": 1.0692236423492432,
"learning_rate": 4.142394822006473e-05,
"loss": 1.3237,
"step": 3830
},
{
"epoch": 6.02,
"grad_norm": 1.01275634765625,
"learning_rate": 4.12621359223301e-05,
"loss": 1.3491,
"step": 3840
},
{
"epoch": 6.03,
"grad_norm": 1.0843064785003662,
"learning_rate": 4.1100323624595475e-05,
"loss": 1.3377,
"step": 3850
},
{
"epoch": 6.05,
"grad_norm": 1.0904685258865356,
"learning_rate": 4.0938511326860844e-05,
"loss": 1.3526,
"step": 3860
},
{
"epoch": 6.07,
"grad_norm": 1.087999939918518,
"learning_rate": 4.077669902912621e-05,
"loss": 1.3511,
"step": 3870
},
{
"epoch": 6.08,
"grad_norm": 1.025306224822998,
"learning_rate": 4.061488673139159e-05,
"loss": 1.3324,
"step": 3880
},
{
"epoch": 6.1,
"grad_norm": 1.050473690032959,
"learning_rate": 4.0453074433656965e-05,
"loss": 1.3391,
"step": 3890
},
{
"epoch": 6.11,
"grad_norm": 1.0401322841644287,
"learning_rate": 4.029126213592233e-05,
"loss": 1.3565,
"step": 3900
},
{
"epoch": 6.13,
"grad_norm": 1.0086355209350586,
"learning_rate": 4.0129449838187704e-05,
"loss": 1.3468,
"step": 3910
},
{
"epoch": 6.14,
"grad_norm": 0.9769521355628967,
"learning_rate": 3.996763754045308e-05,
"loss": 1.3502,
"step": 3920
},
{
"epoch": 6.16,
"grad_norm": 0.9940343499183655,
"learning_rate": 3.980582524271845e-05,
"loss": 1.3662,
"step": 3930
},
{
"epoch": 6.18,
"grad_norm": 1.1186522245407104,
"learning_rate": 3.964401294498382e-05,
"loss": 1.3428,
"step": 3940
},
{
"epoch": 6.19,
"grad_norm": 1.16608464717865,
"learning_rate": 3.948220064724919e-05,
"loss": 1.325,
"step": 3950
},
{
"epoch": 6.21,
"grad_norm": 1.1285912990570068,
"learning_rate": 3.9320388349514564e-05,
"loss": 1.3401,
"step": 3960
},
{
"epoch": 6.22,
"grad_norm": 1.158744215965271,
"learning_rate": 3.915857605177994e-05,
"loss": 1.3367,
"step": 3970
},
{
"epoch": 6.24,
"grad_norm": 1.0512911081314087,
"learning_rate": 3.899676375404531e-05,
"loss": 1.3575,
"step": 3980
},
{
"epoch": 6.25,
"grad_norm": 0.9593988656997681,
"learning_rate": 3.883495145631068e-05,
"loss": 1.3601,
"step": 3990
},
{
"epoch": 6.27,
"grad_norm": 1.1319143772125244,
"learning_rate": 3.8673139158576054e-05,
"loss": 1.3432,
"step": 4000
},
{
"epoch": 6.29,
"grad_norm": 1.0733578205108643,
"learning_rate": 3.8511326860841424e-05,
"loss": 1.3373,
"step": 4010
},
{
"epoch": 6.3,
"grad_norm": 1.0583279132843018,
"learning_rate": 3.83495145631068e-05,
"loss": 1.3496,
"step": 4020
},
{
"epoch": 6.32,
"grad_norm": 1.070660948753357,
"learning_rate": 3.818770226537217e-05,
"loss": 1.3345,
"step": 4030
},
{
"epoch": 6.33,
"grad_norm": 1.0403887033462524,
"learning_rate": 3.802588996763754e-05,
"loss": 1.3311,
"step": 4040
},
{
"epoch": 6.35,
"grad_norm": 1.0967310667037964,
"learning_rate": 3.7864077669902914e-05,
"loss": 1.3623,
"step": 4050
},
{
"epoch": 6.36,
"grad_norm": 0.9571351408958435,
"learning_rate": 3.770226537216829e-05,
"loss": 1.3322,
"step": 4060
},
{
"epoch": 6.38,
"grad_norm": 1.0306861400604248,
"learning_rate": 3.754045307443366e-05,
"loss": 1.384,
"step": 4070
},
{
"epoch": 6.39,
"grad_norm": 1.1071652173995972,
"learning_rate": 3.737864077669903e-05,
"loss": 1.3352,
"step": 4080
},
{
"epoch": 6.41,
"grad_norm": 1.0994199514389038,
"learning_rate": 3.7216828478964405e-05,
"loss": 1.3486,
"step": 4090
},
{
"epoch": 6.43,
"grad_norm": 1.1480964422225952,
"learning_rate": 3.7055016181229774e-05,
"loss": 1.3452,
"step": 4100
},
{
"epoch": 6.44,
"grad_norm": 1.0182585716247559,
"learning_rate": 3.689320388349515e-05,
"loss": 1.358,
"step": 4110
},
{
"epoch": 6.46,
"grad_norm": 0.9810735583305359,
"learning_rate": 3.673139158576052e-05,
"loss": 1.3208,
"step": 4120
},
{
"epoch": 6.47,
"grad_norm": 1.0320755243301392,
"learning_rate": 3.656957928802589e-05,
"loss": 1.3266,
"step": 4130
},
{
"epoch": 6.49,
"grad_norm": 1.1064990758895874,
"learning_rate": 3.6407766990291265e-05,
"loss": 1.3468,
"step": 4140
},
{
"epoch": 6.5,
"grad_norm": 1.019127368927002,
"learning_rate": 3.624595469255664e-05,
"loss": 1.3361,
"step": 4150
},
{
"epoch": 6.52,
"grad_norm": 0.9901451468467712,
"learning_rate": 3.608414239482201e-05,
"loss": 1.3402,
"step": 4160
},
{
"epoch": 6.54,
"grad_norm": 1.0479698181152344,
"learning_rate": 3.592233009708738e-05,
"loss": 1.3279,
"step": 4170
},
{
"epoch": 6.55,
"grad_norm": 1.1830583810806274,
"learning_rate": 3.5760517799352755e-05,
"loss": 1.3531,
"step": 4180
},
{
"epoch": 6.57,
"grad_norm": 1.1133328676223755,
"learning_rate": 3.5598705501618125e-05,
"loss": 1.3258,
"step": 4190
},
{
"epoch": 6.58,
"grad_norm": 1.0264846086502075,
"learning_rate": 3.54368932038835e-05,
"loss": 1.3478,
"step": 4200
},
{
"epoch": 6.6,
"grad_norm": 0.9772770404815674,
"learning_rate": 3.527508090614886e-05,
"loss": 1.3552,
"step": 4210
},
{
"epoch": 6.61,
"grad_norm": 1.060797095298767,
"learning_rate": 3.511326860841424e-05,
"loss": 1.3422,
"step": 4220
},
{
"epoch": 6.63,
"grad_norm": 0.9916195869445801,
"learning_rate": 3.4951456310679615e-05,
"loss": 1.3471,
"step": 4230
},
{
"epoch": 6.65,
"grad_norm": 1.1624395847320557,
"learning_rate": 3.4789644012944984e-05,
"loss": 1.33,
"step": 4240
},
{
"epoch": 6.66,
"grad_norm": 1.1898868083953857,
"learning_rate": 3.4627831715210354e-05,
"loss": 1.3382,
"step": 4250
},
{
"epoch": 6.68,
"grad_norm": 0.9397373795509338,
"learning_rate": 3.446601941747573e-05,
"loss": 1.3531,
"step": 4260
},
{
"epoch": 6.69,
"grad_norm": 0.9842270612716675,
"learning_rate": 3.43042071197411e-05,
"loss": 1.3241,
"step": 4270
},
{
"epoch": 6.71,
"grad_norm": 1.017988920211792,
"learning_rate": 3.4142394822006475e-05,
"loss": 1.369,
"step": 4280
},
{
"epoch": 6.72,
"grad_norm": 1.1584311723709106,
"learning_rate": 3.398058252427185e-05,
"loss": 1.347,
"step": 4290
},
{
"epoch": 6.74,
"grad_norm": 1.014439582824707,
"learning_rate": 3.3818770226537214e-05,
"loss": 1.3479,
"step": 4300
},
{
"epoch": 6.76,
"grad_norm": 1.0588716268539429,
"learning_rate": 3.365695792880259e-05,
"loss": 1.3593,
"step": 4310
},
{
"epoch": 6.77,
"grad_norm": 0.9868447780609131,
"learning_rate": 3.3495145631067966e-05,
"loss": 1.3356,
"step": 4320
},
{
"epoch": 6.79,
"grad_norm": 0.9966190457344055,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.3337,
"step": 4330
},
{
"epoch": 6.8,
"grad_norm": 1.0700786113739014,
"learning_rate": 3.3171521035598704e-05,
"loss": 1.3405,
"step": 4340
},
{
"epoch": 6.82,
"grad_norm": 1.0027292966842651,
"learning_rate": 3.300970873786408e-05,
"loss": 1.3269,
"step": 4350
},
{
"epoch": 6.83,
"grad_norm": 0.9834580421447754,
"learning_rate": 3.284789644012945e-05,
"loss": 1.3487,
"step": 4360
},
{
"epoch": 6.85,
"grad_norm": 1.108362078666687,
"learning_rate": 3.2686084142394826e-05,
"loss": 1.3345,
"step": 4370
},
{
"epoch": 6.87,
"grad_norm": 1.0489062070846558,
"learning_rate": 3.2524271844660195e-05,
"loss": 1.3233,
"step": 4380
},
{
"epoch": 6.88,
"grad_norm": 1.0966260433197021,
"learning_rate": 3.2362459546925564e-05,
"loss": 1.3519,
"step": 4390
},
{
"epoch": 6.9,
"grad_norm": 1.087006688117981,
"learning_rate": 3.220064724919094e-05,
"loss": 1.3408,
"step": 4400
},
{
"epoch": 6.91,
"grad_norm": 1.018376350402832,
"learning_rate": 3.2038834951456316e-05,
"loss": 1.345,
"step": 4410
},
{
"epoch": 6.93,
"grad_norm": 1.0844260454177856,
"learning_rate": 3.1877022653721685e-05,
"loss": 1.3658,
"step": 4420
},
{
"epoch": 6.94,
"grad_norm": 0.9972302317619324,
"learning_rate": 3.1715210355987055e-05,
"loss": 1.3581,
"step": 4430
},
{
"epoch": 6.96,
"grad_norm": 0.959899365901947,
"learning_rate": 3.155339805825243e-05,
"loss": 1.353,
"step": 4440
},
{
"epoch": 6.97,
"grad_norm": 1.0507310628890991,
"learning_rate": 3.13915857605178e-05,
"loss": 1.3517,
"step": 4450
},
{
"epoch": 6.99,
"grad_norm": 1.064765214920044,
"learning_rate": 3.1229773462783176e-05,
"loss": 1.3682,
"step": 4460
},
{
"epoch": 7.0,
"eval_loss": 1.3421807289123535,
"eval_runtime": 240.4278,
"eval_samples_per_second": 232.739,
"eval_steps_per_second": 3.639,
"step": 4466
},
{
"epoch": 7.01,
"grad_norm": 1.0591291189193726,
"learning_rate": 3.1067961165048545e-05,
"loss": 1.3159,
"step": 4470
},
{
"epoch": 7.02,
"grad_norm": 1.0589513778686523,
"learning_rate": 3.0906148867313915e-05,
"loss": 1.3231,
"step": 4480
},
{
"epoch": 7.04,
"grad_norm": 1.0672593116760254,
"learning_rate": 3.074433656957929e-05,
"loss": 1.3487,
"step": 4490
},
{
"epoch": 7.05,
"grad_norm": 0.9952087998390198,
"learning_rate": 3.058252427184466e-05,
"loss": 1.3306,
"step": 4500
},
{
"epoch": 7.07,
"grad_norm": 1.0041515827178955,
"learning_rate": 3.0420711974110033e-05,
"loss": 1.3399,
"step": 4510
},
{
"epoch": 7.08,
"grad_norm": 1.1356942653656006,
"learning_rate": 3.025889967637541e-05,
"loss": 1.3448,
"step": 4520
},
{
"epoch": 7.1,
"grad_norm": 1.1105142831802368,
"learning_rate": 3.0097087378640774e-05,
"loss": 1.3186,
"step": 4530
},
{
"epoch": 7.12,
"grad_norm": 1.022033452987671,
"learning_rate": 2.993527508090615e-05,
"loss": 1.3171,
"step": 4540
},
{
"epoch": 7.13,
"grad_norm": 0.9359177350997925,
"learning_rate": 2.9773462783171523e-05,
"loss": 1.3404,
"step": 4550
},
{
"epoch": 7.15,
"grad_norm": 0.9594640135765076,
"learning_rate": 2.9611650485436892e-05,
"loss": 1.321,
"step": 4560
},
{
"epoch": 7.16,
"grad_norm": 1.1205610036849976,
"learning_rate": 2.9449838187702265e-05,
"loss": 1.291,
"step": 4570
},
{
"epoch": 7.18,
"grad_norm": 1.1314877271652222,
"learning_rate": 2.928802588996764e-05,
"loss": 1.3321,
"step": 4580
},
{
"epoch": 7.19,
"grad_norm": 1.090805172920227,
"learning_rate": 2.912621359223301e-05,
"loss": 1.3152,
"step": 4590
},
{
"epoch": 7.21,
"grad_norm": 1.000931978225708,
"learning_rate": 2.8964401294498383e-05,
"loss": 1.3167,
"step": 4600
},
{
"epoch": 7.23,
"grad_norm": 1.0113887786865234,
"learning_rate": 2.880258899676376e-05,
"loss": 1.3248,
"step": 4610
},
{
"epoch": 7.24,
"grad_norm": 0.9959588050842285,
"learning_rate": 2.8640776699029125e-05,
"loss": 1.3652,
"step": 4620
},
{
"epoch": 7.26,
"grad_norm": 1.1672637462615967,
"learning_rate": 2.84789644012945e-05,
"loss": 1.3416,
"step": 4630
},
{
"epoch": 7.27,
"grad_norm": 1.114760398864746,
"learning_rate": 2.8317152103559874e-05,
"loss": 1.3248,
"step": 4640
},
{
"epoch": 7.29,
"grad_norm": 1.047802448272705,
"learning_rate": 2.8155339805825243e-05,
"loss": 1.3175,
"step": 4650
},
{
"epoch": 7.3,
"grad_norm": 1.1477265357971191,
"learning_rate": 2.7993527508090616e-05,
"loss": 1.3522,
"step": 4660
},
{
"epoch": 7.32,
"grad_norm": 1.0247204303741455,
"learning_rate": 2.783171521035599e-05,
"loss": 1.3245,
"step": 4670
},
{
"epoch": 7.34,
"grad_norm": 1.054656744003296,
"learning_rate": 2.766990291262136e-05,
"loss": 1.3324,
"step": 4680
},
{
"epoch": 7.35,
"grad_norm": 1.0230228900909424,
"learning_rate": 2.7508090614886734e-05,
"loss": 1.3478,
"step": 4690
},
{
"epoch": 7.37,
"grad_norm": 1.0580846071243286,
"learning_rate": 2.7346278317152106e-05,
"loss": 1.3256,
"step": 4700
},
{
"epoch": 7.38,
"grad_norm": 0.9862005114555359,
"learning_rate": 2.7184466019417475e-05,
"loss": 1.3218,
"step": 4710
},
{
"epoch": 7.4,
"grad_norm": 1.0547658205032349,
"learning_rate": 2.702265372168285e-05,
"loss": 1.3376,
"step": 4720
},
{
"epoch": 7.41,
"grad_norm": 1.1729800701141357,
"learning_rate": 2.6860841423948217e-05,
"loss": 1.3369,
"step": 4730
},
{
"epoch": 7.43,
"grad_norm": 0.9250290393829346,
"learning_rate": 2.6699029126213593e-05,
"loss": 1.32,
"step": 4740
},
{
"epoch": 7.45,
"grad_norm": 1.0951013565063477,
"learning_rate": 2.6537216828478966e-05,
"loss": 1.3338,
"step": 4750
},
{
"epoch": 7.46,
"grad_norm": 1.1006604433059692,
"learning_rate": 2.6375404530744335e-05,
"loss": 1.3379,
"step": 4760
},
{
"epoch": 7.48,
"grad_norm": 0.9685320854187012,
"learning_rate": 2.6213592233009708e-05,
"loss": 1.3214,
"step": 4770
},
{
"epoch": 7.49,
"grad_norm": 1.1178371906280518,
"learning_rate": 2.6051779935275084e-05,
"loss": 1.3391,
"step": 4780
},
{
"epoch": 7.51,
"grad_norm": 1.0559793710708618,
"learning_rate": 2.5889967637540453e-05,
"loss": 1.3149,
"step": 4790
},
{
"epoch": 7.52,
"grad_norm": 1.1109081506729126,
"learning_rate": 2.5728155339805826e-05,
"loss": 1.3298,
"step": 4800
},
{
"epoch": 7.54,
"grad_norm": 1.0514659881591797,
"learning_rate": 2.55663430420712e-05,
"loss": 1.3202,
"step": 4810
},
{
"epoch": 7.55,
"grad_norm": 0.9586611390113831,
"learning_rate": 2.5404530744336568e-05,
"loss": 1.3329,
"step": 4820
},
{
"epoch": 7.57,
"grad_norm": 1.04630708694458,
"learning_rate": 2.5242718446601944e-05,
"loss": 1.3442,
"step": 4830
},
{
"epoch": 7.59,
"grad_norm": 1.0235368013381958,
"learning_rate": 2.5080906148867317e-05,
"loss": 1.3288,
"step": 4840
},
{
"epoch": 7.6,
"grad_norm": 1.1447534561157227,
"learning_rate": 2.491909385113269e-05,
"loss": 1.3375,
"step": 4850
},
{
"epoch": 7.62,
"grad_norm": 1.0859249830245972,
"learning_rate": 2.475728155339806e-05,
"loss": 1.3345,
"step": 4860
},
{
"epoch": 7.63,
"grad_norm": 1.1479110717773438,
"learning_rate": 2.459546925566343e-05,
"loss": 1.3382,
"step": 4870
},
{
"epoch": 7.65,
"grad_norm": 1.0188355445861816,
"learning_rate": 2.4433656957928804e-05,
"loss": 1.3202,
"step": 4880
},
{
"epoch": 7.66,
"grad_norm": 1.0158920288085938,
"learning_rate": 2.4271844660194176e-05,
"loss": 1.3394,
"step": 4890
},
{
"epoch": 7.68,
"grad_norm": 1.0036348104476929,
"learning_rate": 2.411003236245955e-05,
"loss": 1.3423,
"step": 4900
},
{
"epoch": 7.7,
"grad_norm": 0.9872713088989258,
"learning_rate": 2.394822006472492e-05,
"loss": 1.3209,
"step": 4910
},
{
"epoch": 7.71,
"grad_norm": 1.2620477676391602,
"learning_rate": 2.3786407766990294e-05,
"loss": 1.3191,
"step": 4920
},
{
"epoch": 7.73,
"grad_norm": 1.143929123878479,
"learning_rate": 2.3624595469255664e-05,
"loss": 1.3256,
"step": 4930
},
{
"epoch": 7.74,
"grad_norm": 1.0806171894073486,
"learning_rate": 2.3462783171521036e-05,
"loss": 1.3293,
"step": 4940
},
{
"epoch": 7.76,
"grad_norm": 1.0597835779190063,
"learning_rate": 2.330097087378641e-05,
"loss": 1.3186,
"step": 4950
},
{
"epoch": 7.77,
"grad_norm": 1.103973388671875,
"learning_rate": 2.313915857605178e-05,
"loss": 1.3063,
"step": 4960
},
{
"epoch": 7.79,
"grad_norm": 1.0064899921417236,
"learning_rate": 2.297734627831715e-05,
"loss": 1.3137,
"step": 4970
},
{
"epoch": 7.81,
"grad_norm": 1.001996636390686,
"learning_rate": 2.2815533980582527e-05,
"loss": 1.3199,
"step": 4980
},
{
"epoch": 7.82,
"grad_norm": 0.9413354396820068,
"learning_rate": 2.2653721682847896e-05,
"loss": 1.326,
"step": 4990
},
{
"epoch": 7.84,
"grad_norm": 0.9986903667449951,
"learning_rate": 2.249190938511327e-05,
"loss": 1.3178,
"step": 5000
},
{
"epoch": 7.85,
"grad_norm": 1.072055697441101,
"learning_rate": 2.233009708737864e-05,
"loss": 1.3154,
"step": 5010
},
{
"epoch": 7.87,
"grad_norm": 1.034061074256897,
"learning_rate": 2.2168284789644014e-05,
"loss": 1.3167,
"step": 5020
},
{
"epoch": 7.88,
"grad_norm": 1.0620774030685425,
"learning_rate": 2.2006472491909387e-05,
"loss": 1.3371,
"step": 5030
},
{
"epoch": 7.9,
"grad_norm": 0.9556956887245178,
"learning_rate": 2.1844660194174756e-05,
"loss": 1.3096,
"step": 5040
},
{
"epoch": 7.92,
"grad_norm": 0.9692999124526978,
"learning_rate": 2.1682847896440132e-05,
"loss": 1.3273,
"step": 5050
},
{
"epoch": 7.93,
"grad_norm": 1.0051753520965576,
"learning_rate": 2.15210355987055e-05,
"loss": 1.355,
"step": 5060
},
{
"epoch": 7.95,
"grad_norm": 1.0744236707687378,
"learning_rate": 2.1359223300970874e-05,
"loss": 1.3258,
"step": 5070
},
{
"epoch": 7.96,
"grad_norm": 0.9954714179039001,
"learning_rate": 2.1197411003236247e-05,
"loss": 1.3396,
"step": 5080
},
{
"epoch": 7.98,
"grad_norm": 1.0861910581588745,
"learning_rate": 2.103559870550162e-05,
"loss": 1.3253,
"step": 5090
},
{
"epoch": 7.99,
"grad_norm": 1.0773459672927856,
"learning_rate": 2.0873786407766992e-05,
"loss": 1.3339,
"step": 5100
},
{
"epoch": 8.0,
"eval_loss": 1.3397225141525269,
"eval_runtime": 236.8224,
"eval_samples_per_second": 236.283,
"eval_steps_per_second": 3.695,
"step": 5104
},
{
"epoch": 8.01,
"grad_norm": 0.9482476711273193,
"learning_rate": 2.0711974110032365e-05,
"loss": 1.3272,
"step": 5110
},
{
"epoch": 8.03,
"grad_norm": 1.102388620376587,
"learning_rate": 2.0550161812297737e-05,
"loss": 1.3309,
"step": 5120
},
{
"epoch": 8.04,
"grad_norm": 1.0851417779922485,
"learning_rate": 2.0388349514563107e-05,
"loss": 1.3245,
"step": 5130
},
{
"epoch": 8.06,
"grad_norm": 0.9989822506904602,
"learning_rate": 2.0226537216828483e-05,
"loss": 1.3306,
"step": 5140
},
{
"epoch": 8.07,
"grad_norm": 1.0017298460006714,
"learning_rate": 2.0064724919093852e-05,
"loss": 1.345,
"step": 5150
},
{
"epoch": 8.09,
"grad_norm": 0.9394101500511169,
"learning_rate": 1.9902912621359225e-05,
"loss": 1.31,
"step": 5160
},
{
"epoch": 8.1,
"grad_norm": 1.193172812461853,
"learning_rate": 1.9741100323624594e-05,
"loss": 1.3158,
"step": 5170
},
{
"epoch": 8.12,
"grad_norm": 1.0058799982070923,
"learning_rate": 1.957928802588997e-05,
"loss": 1.3348,
"step": 5180
},
{
"epoch": 8.13,
"grad_norm": 0.9482255578041077,
"learning_rate": 1.941747572815534e-05,
"loss": 1.3233,
"step": 5190
},
{
"epoch": 8.15,
"grad_norm": 0.9763492345809937,
"learning_rate": 1.9255663430420712e-05,
"loss": 1.312,
"step": 5200
},
{
"epoch": 8.17,
"grad_norm": 1.0182414054870605,
"learning_rate": 1.9093851132686084e-05,
"loss": 1.3435,
"step": 5210
},
{
"epoch": 8.18,
"grad_norm": 1.0216591358184814,
"learning_rate": 1.8932038834951457e-05,
"loss": 1.3182,
"step": 5220
},
{
"epoch": 8.2,
"grad_norm": 1.0049365758895874,
"learning_rate": 1.877022653721683e-05,
"loss": 1.3248,
"step": 5230
},
{
"epoch": 8.21,
"grad_norm": 0.9713476300239563,
"learning_rate": 1.8608414239482202e-05,
"loss": 1.3212,
"step": 5240
},
{
"epoch": 8.23,
"grad_norm": 1.093015432357788,
"learning_rate": 1.8446601941747575e-05,
"loss": 1.2979,
"step": 5250
},
{
"epoch": 8.24,
"grad_norm": 0.9980152249336243,
"learning_rate": 1.8284789644012944e-05,
"loss": 1.3131,
"step": 5260
},
{
"epoch": 8.26,
"grad_norm": 1.0397632122039795,
"learning_rate": 1.812297734627832e-05,
"loss": 1.2957,
"step": 5270
},
{
"epoch": 8.28,
"grad_norm": 1.0194075107574463,
"learning_rate": 1.796116504854369e-05,
"loss": 1.317,
"step": 5280
},
{
"epoch": 8.29,
"grad_norm": 0.9956216216087341,
"learning_rate": 1.7799352750809062e-05,
"loss": 1.3329,
"step": 5290
},
{
"epoch": 8.31,
"grad_norm": 1.1138004064559937,
"learning_rate": 1.763754045307443e-05,
"loss": 1.3302,
"step": 5300
},
{
"epoch": 8.32,
"grad_norm": 1.01331627368927,
"learning_rate": 1.7475728155339808e-05,
"loss": 1.3203,
"step": 5310
},
{
"epoch": 8.34,
"grad_norm": 1.0264416933059692,
"learning_rate": 1.7313915857605177e-05,
"loss": 1.3263,
"step": 5320
},
{
"epoch": 8.35,
"grad_norm": 1.109243392944336,
"learning_rate": 1.715210355987055e-05,
"loss": 1.3027,
"step": 5330
},
{
"epoch": 8.37,
"grad_norm": 1.0397975444793701,
"learning_rate": 1.6990291262135926e-05,
"loss": 1.3171,
"step": 5340
},
{
"epoch": 8.39,
"grad_norm": 1.038338303565979,
"learning_rate": 1.6828478964401295e-05,
"loss": 1.3304,
"step": 5350
},
{
"epoch": 8.4,
"grad_norm": 1.1445987224578857,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.3367,
"step": 5360
},
{
"epoch": 8.42,
"grad_norm": 1.1104984283447266,
"learning_rate": 1.650485436893204e-05,
"loss": 1.3179,
"step": 5370
},
{
"epoch": 8.43,
"grad_norm": 1.0236207246780396,
"learning_rate": 1.6343042071197413e-05,
"loss": 1.3075,
"step": 5380
},
{
"epoch": 8.45,
"grad_norm": 0.990680456161499,
"learning_rate": 1.6181229773462782e-05,
"loss": 1.3197,
"step": 5390
},
{
"epoch": 8.46,
"grad_norm": 1.0307044982910156,
"learning_rate": 1.6019417475728158e-05,
"loss": 1.3178,
"step": 5400
},
{
"epoch": 8.48,
"grad_norm": 0.9662342667579651,
"learning_rate": 1.5857605177993527e-05,
"loss": 1.3348,
"step": 5410
},
{
"epoch": 8.5,
"grad_norm": 1.069128394126892,
"learning_rate": 1.56957928802589e-05,
"loss": 1.3213,
"step": 5420
},
{
"epoch": 8.51,
"grad_norm": 1.0478894710540771,
"learning_rate": 1.5533980582524273e-05,
"loss": 1.3296,
"step": 5430
},
{
"epoch": 8.53,
"grad_norm": 1.0435206890106201,
"learning_rate": 1.5372168284789645e-05,
"loss": 1.3401,
"step": 5440
},
{
"epoch": 8.54,
"grad_norm": 1.0635913610458374,
"learning_rate": 1.5210355987055016e-05,
"loss": 1.3345,
"step": 5450
},
{
"epoch": 8.56,
"grad_norm": 0.9900932908058167,
"learning_rate": 1.5048543689320387e-05,
"loss": 1.3002,
"step": 5460
},
{
"epoch": 8.57,
"grad_norm": 0.9892101883888245,
"learning_rate": 1.4886731391585762e-05,
"loss": 1.3387,
"step": 5470
},
{
"epoch": 8.59,
"grad_norm": 1.019913673400879,
"learning_rate": 1.4724919093851133e-05,
"loss": 1.2908,
"step": 5480
},
{
"epoch": 8.61,
"grad_norm": 0.9870728254318237,
"learning_rate": 1.4563106796116505e-05,
"loss": 1.3339,
"step": 5490
},
{
"epoch": 8.62,
"grad_norm": 0.9261495471000671,
"learning_rate": 1.440129449838188e-05,
"loss": 1.3313,
"step": 5500
},
{
"epoch": 8.64,
"grad_norm": 0.9847787022590637,
"learning_rate": 1.423948220064725e-05,
"loss": 1.3299,
"step": 5510
},
{
"epoch": 8.65,
"grad_norm": 1.0319174528121948,
"learning_rate": 1.4077669902912621e-05,
"loss": 1.3391,
"step": 5520
},
{
"epoch": 8.67,
"grad_norm": 1.0386266708374023,
"learning_rate": 1.3915857605177996e-05,
"loss": 1.2854,
"step": 5530
},
{
"epoch": 8.68,
"grad_norm": 1.0957350730895996,
"learning_rate": 1.3754045307443367e-05,
"loss": 1.3148,
"step": 5540
},
{
"epoch": 8.7,
"grad_norm": 1.0400688648223877,
"learning_rate": 1.3592233009708738e-05,
"loss": 1.3018,
"step": 5550
},
{
"epoch": 8.71,
"grad_norm": 1.0478119850158691,
"learning_rate": 1.3430420711974109e-05,
"loss": 1.3154,
"step": 5560
},
{
"epoch": 8.73,
"grad_norm": 0.9195369482040405,
"learning_rate": 1.3268608414239483e-05,
"loss": 1.329,
"step": 5570
},
{
"epoch": 8.75,
"grad_norm": 1.0205663442611694,
"learning_rate": 1.3106796116504854e-05,
"loss": 1.3075,
"step": 5580
},
{
"epoch": 8.76,
"grad_norm": 1.1660791635513306,
"learning_rate": 1.2944983818770227e-05,
"loss": 1.3004,
"step": 5590
},
{
"epoch": 8.78,
"grad_norm": 1.1753368377685547,
"learning_rate": 1.27831715210356e-05,
"loss": 1.3095,
"step": 5600
},
{
"epoch": 8.79,
"grad_norm": 1.0401411056518555,
"learning_rate": 1.2621359223300972e-05,
"loss": 1.3197,
"step": 5610
},
{
"epoch": 8.81,
"grad_norm": 1.0116024017333984,
"learning_rate": 1.2459546925566345e-05,
"loss": 1.3043,
"step": 5620
},
{
"epoch": 8.82,
"grad_norm": 1.1200822591781616,
"learning_rate": 1.2297734627831716e-05,
"loss": 1.3053,
"step": 5630
},
{
"epoch": 8.84,
"grad_norm": 1.1668050289154053,
"learning_rate": 1.2135922330097088e-05,
"loss": 1.3286,
"step": 5640
},
{
"epoch": 8.86,
"grad_norm": 1.0215119123458862,
"learning_rate": 1.197411003236246e-05,
"loss": 1.2964,
"step": 5650
},
{
"epoch": 8.87,
"grad_norm": 0.9568407535552979,
"learning_rate": 1.1812297734627832e-05,
"loss": 1.305,
"step": 5660
},
{
"epoch": 8.89,
"grad_norm": 1.0878469944000244,
"learning_rate": 1.1650485436893204e-05,
"loss": 1.3401,
"step": 5670
},
{
"epoch": 8.9,
"grad_norm": 1.1077464818954468,
"learning_rate": 1.1488673139158575e-05,
"loss": 1.321,
"step": 5680
},
{
"epoch": 8.92,
"grad_norm": 1.006237506866455,
"learning_rate": 1.1326860841423948e-05,
"loss": 1.3184,
"step": 5690
},
{
"epoch": 8.93,
"grad_norm": 1.0599100589752197,
"learning_rate": 1.116504854368932e-05,
"loss": 1.3209,
"step": 5700
},
{
"epoch": 8.95,
"grad_norm": 1.0371463298797607,
"learning_rate": 1.1003236245954693e-05,
"loss": 1.3299,
"step": 5710
},
{
"epoch": 8.97,
"grad_norm": 1.1393773555755615,
"learning_rate": 1.0841423948220066e-05,
"loss": 1.3492,
"step": 5720
},
{
"epoch": 8.98,
"grad_norm": 0.8908681869506836,
"learning_rate": 1.0679611650485437e-05,
"loss": 1.3262,
"step": 5730
},
{
"epoch": 9.0,
"grad_norm": 0.9290010333061218,
"learning_rate": 1.051779935275081e-05,
"loss": 1.3318,
"step": 5740
},
{
"epoch": 9.0,
"eval_loss": 1.338958978652954,
"eval_runtime": 239.6707,
"eval_samples_per_second": 233.475,
"eval_steps_per_second": 3.651,
"step": 5742
},
{
"epoch": 9.01,
"grad_norm": 1.0507458448410034,
"learning_rate": 1.0355987055016182e-05,
"loss": 1.2964,
"step": 5750
},
{
"epoch": 9.03,
"grad_norm": 0.9368033409118652,
"learning_rate": 1.0194174757281553e-05,
"loss": 1.3196,
"step": 5760
},
{
"epoch": 9.04,
"grad_norm": 1.1127394437789917,
"learning_rate": 1.0032362459546926e-05,
"loss": 1.3221,
"step": 5770
},
{
"epoch": 9.06,
"grad_norm": 1.028363585472107,
"learning_rate": 9.870550161812297e-06,
"loss": 1.3175,
"step": 5780
},
{
"epoch": 9.08,
"grad_norm": 1.0377564430236816,
"learning_rate": 9.70873786407767e-06,
"loss": 1.3353,
"step": 5790
},
{
"epoch": 9.09,
"grad_norm": 1.0370157957077026,
"learning_rate": 9.546925566343042e-06,
"loss": 1.3035,
"step": 5800
},
{
"epoch": 9.11,
"grad_norm": 0.9729301333427429,
"learning_rate": 9.385113268608415e-06,
"loss": 1.3262,
"step": 5810
},
{
"epoch": 9.12,
"grad_norm": 1.0480165481567383,
"learning_rate": 9.223300970873788e-06,
"loss": 1.2991,
"step": 5820
},
{
"epoch": 9.14,
"grad_norm": 1.0377051830291748,
"learning_rate": 9.06148867313916e-06,
"loss": 1.3084,
"step": 5830
},
{
"epoch": 9.15,
"grad_norm": 1.0131510496139526,
"learning_rate": 8.899676375404531e-06,
"loss": 1.337,
"step": 5840
},
{
"epoch": 9.17,
"grad_norm": 1.0025854110717773,
"learning_rate": 8.737864077669904e-06,
"loss": 1.3278,
"step": 5850
},
{
"epoch": 9.18,
"grad_norm": 1.1271703243255615,
"learning_rate": 8.576051779935275e-06,
"loss": 1.3218,
"step": 5860
},
{
"epoch": 9.2,
"grad_norm": 0.9496561884880066,
"learning_rate": 8.414239482200647e-06,
"loss": 1.3032,
"step": 5870
},
{
"epoch": 9.22,
"grad_norm": 1.0544354915618896,
"learning_rate": 8.25242718446602e-06,
"loss": 1.3203,
"step": 5880
},
{
"epoch": 9.23,
"grad_norm": 1.0525000095367432,
"learning_rate": 8.090614886731391e-06,
"loss": 1.3085,
"step": 5890
},
{
"epoch": 9.25,
"grad_norm": 1.1335513591766357,
"learning_rate": 7.928802588996764e-06,
"loss": 1.3192,
"step": 5900
},
{
"epoch": 9.26,
"grad_norm": 1.0482152700424194,
"learning_rate": 7.766990291262136e-06,
"loss": 1.3136,
"step": 5910
},
{
"epoch": 9.28,
"grad_norm": 1.0089904069900513,
"learning_rate": 7.605177993527508e-06,
"loss": 1.3091,
"step": 5920
},
{
"epoch": 9.29,
"grad_norm": 1.033751130104065,
"learning_rate": 7.443365695792881e-06,
"loss": 1.309,
"step": 5930
},
{
"epoch": 9.31,
"grad_norm": 1.0338579416275024,
"learning_rate": 7.281553398058253e-06,
"loss": 1.3172,
"step": 5940
},
{
"epoch": 9.33,
"grad_norm": 0.9827445149421692,
"learning_rate": 7.119741100323625e-06,
"loss": 1.3364,
"step": 5950
},
{
"epoch": 9.34,
"grad_norm": 1.037365436553955,
"learning_rate": 6.957928802588998e-06,
"loss": 1.3223,
"step": 5960
},
{
"epoch": 9.36,
"grad_norm": 1.000085711479187,
"learning_rate": 6.796116504854369e-06,
"loss": 1.3424,
"step": 5970
},
{
"epoch": 9.37,
"grad_norm": 1.1008888483047485,
"learning_rate": 6.6343042071197415e-06,
"loss": 1.321,
"step": 5980
},
{
"epoch": 9.39,
"grad_norm": 0.9704506397247314,
"learning_rate": 6.472491909385113e-06,
"loss": 1.3215,
"step": 5990
},
{
"epoch": 9.4,
"grad_norm": 1.0648174285888672,
"learning_rate": 6.310679611650486e-06,
"loss": 1.3106,
"step": 6000
},
{
"epoch": 9.42,
"grad_norm": 1.0990079641342163,
"learning_rate": 6.148867313915858e-06,
"loss": 1.3129,
"step": 6010
},
{
"epoch": 9.44,
"grad_norm": 1.014382243156433,
"learning_rate": 5.98705501618123e-06,
"loss": 1.2988,
"step": 6020
},
{
"epoch": 9.45,
"grad_norm": 1.0244662761688232,
"learning_rate": 5.825242718446602e-06,
"loss": 1.3062,
"step": 6030
},
{
"epoch": 9.47,
"grad_norm": 1.009265661239624,
"learning_rate": 5.663430420711974e-06,
"loss": 1.3069,
"step": 6040
},
{
"epoch": 9.48,
"grad_norm": 1.0187369585037231,
"learning_rate": 5.501618122977347e-06,
"loss": 1.3136,
"step": 6050
},
{
"epoch": 9.5,
"grad_norm": 1.0073754787445068,
"learning_rate": 5.3398058252427185e-06,
"loss": 1.2951,
"step": 6060
},
{
"epoch": 9.51,
"grad_norm": 0.9931274056434631,
"learning_rate": 5.177993527508091e-06,
"loss": 1.3112,
"step": 6070
},
{
"epoch": 9.53,
"grad_norm": 1.013115644454956,
"learning_rate": 5.016181229773463e-06,
"loss": 1.304,
"step": 6080
},
{
"epoch": 9.55,
"grad_norm": 1.0234911441802979,
"learning_rate": 4.854368932038835e-06,
"loss": 1.3331,
"step": 6090
},
{
"epoch": 9.56,
"grad_norm": 1.0556929111480713,
"learning_rate": 4.6925566343042074e-06,
"loss": 1.2966,
"step": 6100
},
{
"epoch": 9.58,
"grad_norm": 1.1237767934799194,
"learning_rate": 4.53074433656958e-06,
"loss": 1.2936,
"step": 6110
},
{
"epoch": 9.59,
"grad_norm": 1.075426697731018,
"learning_rate": 4.368932038834952e-06,
"loss": 1.3168,
"step": 6120
},
{
"epoch": 9.61,
"grad_norm": 1.0397717952728271,
"learning_rate": 4.207119741100324e-06,
"loss": 1.3018,
"step": 6130
},
{
"epoch": 9.62,
"grad_norm": 1.0505844354629517,
"learning_rate": 4.0453074433656955e-06,
"loss": 1.3145,
"step": 6140
},
{
"epoch": 9.64,
"grad_norm": 1.00873863697052,
"learning_rate": 3.883495145631068e-06,
"loss": 1.3097,
"step": 6150
},
{
"epoch": 9.66,
"grad_norm": 1.0691351890563965,
"learning_rate": 3.7216828478964404e-06,
"loss": 1.2964,
"step": 6160
},
{
"epoch": 9.67,
"grad_norm": 1.0562281608581543,
"learning_rate": 3.5598705501618126e-06,
"loss": 1.3436,
"step": 6170
},
{
"epoch": 9.69,
"grad_norm": 1.0892330408096313,
"learning_rate": 3.3980582524271844e-06,
"loss": 1.3234,
"step": 6180
},
{
"epoch": 9.7,
"grad_norm": 1.0730509757995605,
"learning_rate": 3.2362459546925567e-06,
"loss": 1.284,
"step": 6190
},
{
"epoch": 9.72,
"grad_norm": 1.146493911743164,
"learning_rate": 3.074433656957929e-06,
"loss": 1.3299,
"step": 6200
},
{
"epoch": 9.73,
"grad_norm": 1.1121958494186401,
"learning_rate": 2.912621359223301e-06,
"loss": 1.3263,
"step": 6210
},
{
"epoch": 9.75,
"grad_norm": 0.9771898984909058,
"learning_rate": 2.7508090614886734e-06,
"loss": 1.3093,
"step": 6220
},
{
"epoch": 9.76,
"grad_norm": 1.05784273147583,
"learning_rate": 2.5889967637540456e-06,
"loss": 1.3264,
"step": 6230
},
{
"epoch": 9.78,
"grad_norm": 0.9583161473274231,
"learning_rate": 2.4271844660194174e-06,
"loss": 1.2982,
"step": 6240
},
{
"epoch": 9.8,
"grad_norm": 1.1259510517120361,
"learning_rate": 2.26537216828479e-06,
"loss": 1.3116,
"step": 6250
},
{
"epoch": 9.81,
"grad_norm": 1.0041571855545044,
"learning_rate": 2.103559870550162e-06,
"loss": 1.3099,
"step": 6260
},
{
"epoch": 9.83,
"grad_norm": 0.9975118637084961,
"learning_rate": 1.941747572815534e-06,
"loss": 1.3179,
"step": 6270
},
{
"epoch": 9.84,
"grad_norm": 1.0262846946716309,
"learning_rate": 1.7799352750809063e-06,
"loss": 1.3099,
"step": 6280
},
{
"epoch": 9.86,
"grad_norm": 0.9885269999504089,
"learning_rate": 1.6181229773462783e-06,
"loss": 1.2816,
"step": 6290
},
{
"epoch": 9.87,
"grad_norm": 0.9989182353019714,
"learning_rate": 1.4563106796116506e-06,
"loss": 1.3146,
"step": 6300
},
{
"epoch": 9.89,
"grad_norm": 0.9430673718452454,
"learning_rate": 1.2944983818770228e-06,
"loss": 1.3489,
"step": 6310
},
{
"epoch": 9.91,
"grad_norm": 0.9786996245384216,
"learning_rate": 1.132686084142395e-06,
"loss": 1.3152,
"step": 6320
},
{
"epoch": 9.92,
"grad_norm": 1.1633003950119019,
"learning_rate": 9.70873786407767e-07,
"loss": 1.3357,
"step": 6330
},
{
"epoch": 9.94,
"grad_norm": 1.1029000282287598,
"learning_rate": 8.090614886731392e-07,
"loss": 1.2956,
"step": 6340
},
{
"epoch": 9.95,
"grad_norm": 1.119912028312683,
"learning_rate": 6.472491909385114e-07,
"loss": 1.3293,
"step": 6350
},
{
"epoch": 9.97,
"grad_norm": 0.998874306678772,
"learning_rate": 4.854368932038835e-07,
"loss": 1.2852,
"step": 6360
},
{
"epoch": 9.98,
"grad_norm": 1.0051956176757812,
"learning_rate": 3.236245954692557e-07,
"loss": 1.3242,
"step": 6370
},
{
"epoch": 10.0,
"grad_norm": 1.007757544517517,
"learning_rate": 1.6181229773462785e-07,
"loss": 1.298,
"step": 6380
},
{
"epoch": 10.0,
"eval_loss": 1.3409571647644043,
"eval_runtime": 237.188,
"eval_samples_per_second": 235.918,
"eval_steps_per_second": 3.689,
"step": 6380
}
],
"logging_steps": 10,
"max_steps": 6380,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 2.7210120923153695e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}