bunbo-reward / trainer_state.json
lctzz540's picture
Upload 11 files
3edb218 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.857035364936042,
"eval_steps": 500,
"global_step": 39300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 98.3414306640625,
"learning_rate": 1.4082317531978931e-05,
"loss": 1.674,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 21.889772415161133,
"learning_rate": 1.4064635063957864e-05,
"loss": 1.7321,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 78.81497955322266,
"learning_rate": 1.4046952595936794e-05,
"loss": 1.3246,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 117.79057312011719,
"learning_rate": 1.4029270127915727e-05,
"loss": 1.8399,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 89.93197631835938,
"learning_rate": 1.4011587659894659e-05,
"loss": 1.7021,
"step": 250
},
{
"epoch": 0.08,
"grad_norm": 5.327052116394043,
"learning_rate": 1.399390519187359e-05,
"loss": 1.3229,
"step": 300
},
{
"epoch": 0.09,
"grad_norm": 104.67691802978516,
"learning_rate": 1.397622272385252e-05,
"loss": 1.0449,
"step": 350
},
{
"epoch": 0.1,
"grad_norm": 62.50383377075195,
"learning_rate": 1.3958540255831453e-05,
"loss": 1.2135,
"step": 400
},
{
"epoch": 0.11,
"grad_norm": 30.1390380859375,
"learning_rate": 1.3940857787810384e-05,
"loss": 1.1312,
"step": 450
},
{
"epoch": 0.13,
"grad_norm": 172.32058715820312,
"learning_rate": 1.3923175319789316e-05,
"loss": 1.1339,
"step": 500
},
{
"epoch": 0.14,
"grad_norm": 149.6029052734375,
"learning_rate": 1.3905492851768248e-05,
"loss": 0.9226,
"step": 550
},
{
"epoch": 0.15,
"grad_norm": 104.08654022216797,
"learning_rate": 1.3887810383747179e-05,
"loss": 0.9141,
"step": 600
},
{
"epoch": 0.16,
"grad_norm": 28.90251350402832,
"learning_rate": 1.387012791572611e-05,
"loss": 0.7194,
"step": 650
},
{
"epoch": 0.18,
"grad_norm": 78.85499572753906,
"learning_rate": 1.3852445447705042e-05,
"loss": 1.051,
"step": 700
},
{
"epoch": 0.19,
"grad_norm": 59.84476089477539,
"learning_rate": 1.3834762979683973e-05,
"loss": 0.8815,
"step": 750
},
{
"epoch": 0.2,
"grad_norm": 47.683658599853516,
"learning_rate": 1.3817080511662905e-05,
"loss": 1.1052,
"step": 800
},
{
"epoch": 0.21,
"grad_norm": 73.24783325195312,
"learning_rate": 1.3799398043641836e-05,
"loss": 0.6957,
"step": 850
},
{
"epoch": 0.23,
"grad_norm": 121.98059844970703,
"learning_rate": 1.3781715575620768e-05,
"loss": 1.1512,
"step": 900
},
{
"epoch": 0.24,
"grad_norm": 115.57231140136719,
"learning_rate": 1.3764033107599699e-05,
"loss": 0.8512,
"step": 950
},
{
"epoch": 0.25,
"grad_norm": 40.25959014892578,
"learning_rate": 1.374635063957863e-05,
"loss": 0.873,
"step": 1000
},
{
"epoch": 0.26,
"grad_norm": 10.869709014892578,
"learning_rate": 1.3728668171557562e-05,
"loss": 0.7834,
"step": 1050
},
{
"epoch": 0.28,
"grad_norm": 128.24893188476562,
"learning_rate": 1.3710985703536495e-05,
"loss": 0.8042,
"step": 1100
},
{
"epoch": 0.29,
"grad_norm": 60.73322677612305,
"learning_rate": 1.3693303235515425e-05,
"loss": 1.0092,
"step": 1150
},
{
"epoch": 0.3,
"grad_norm": 19.39624786376953,
"learning_rate": 1.3675620767494358e-05,
"loss": 0.662,
"step": 1200
},
{
"epoch": 0.31,
"grad_norm": 0.13774849474430084,
"learning_rate": 1.3657938299473288e-05,
"loss": 0.98,
"step": 1250
},
{
"epoch": 0.33,
"grad_norm": 79.46333312988281,
"learning_rate": 1.3640255831452219e-05,
"loss": 0.7967,
"step": 1300
},
{
"epoch": 0.34,
"grad_norm": 13.158239364624023,
"learning_rate": 1.3622573363431151e-05,
"loss": 1.0218,
"step": 1350
},
{
"epoch": 0.35,
"grad_norm": 17.267330169677734,
"learning_rate": 1.3604890895410084e-05,
"loss": 0.8711,
"step": 1400
},
{
"epoch": 0.36,
"grad_norm": 174.72537231445312,
"learning_rate": 1.3587208427389015e-05,
"loss": 0.8711,
"step": 1450
},
{
"epoch": 0.38,
"grad_norm": 77.13172149658203,
"learning_rate": 1.3569525959367947e-05,
"loss": 1.0233,
"step": 1500
},
{
"epoch": 0.39,
"grad_norm": 48.417015075683594,
"learning_rate": 1.3551843491346878e-05,
"loss": 0.7682,
"step": 1550
},
{
"epoch": 0.4,
"grad_norm": 6.1959638595581055,
"learning_rate": 1.3534161023325808e-05,
"loss": 0.8792,
"step": 1600
},
{
"epoch": 0.41,
"grad_norm": 49.98043441772461,
"learning_rate": 1.351647855530474e-05,
"loss": 0.9868,
"step": 1650
},
{
"epoch": 0.43,
"grad_norm": 45.13309860229492,
"learning_rate": 1.3498796087283673e-05,
"loss": 0.5272,
"step": 1700
},
{
"epoch": 0.44,
"grad_norm": 8.423553466796875,
"learning_rate": 1.3481113619262604e-05,
"loss": 1.1983,
"step": 1750
},
{
"epoch": 0.45,
"grad_norm": 17.5786190032959,
"learning_rate": 1.3463431151241536e-05,
"loss": 0.7065,
"step": 1800
},
{
"epoch": 0.46,
"grad_norm": 5.939927577972412,
"learning_rate": 1.3445748683220467e-05,
"loss": 0.6674,
"step": 1850
},
{
"epoch": 0.48,
"grad_norm": 23.781694412231445,
"learning_rate": 1.3428066215199398e-05,
"loss": 0.7267,
"step": 1900
},
{
"epoch": 0.49,
"grad_norm": 0.4960607886314392,
"learning_rate": 1.341038374717833e-05,
"loss": 1.0549,
"step": 1950
},
{
"epoch": 0.5,
"grad_norm": 83.99737548828125,
"learning_rate": 1.3392701279157262e-05,
"loss": 0.786,
"step": 2000
},
{
"epoch": 0.51,
"grad_norm": 20.65607261657715,
"learning_rate": 1.3375018811136193e-05,
"loss": 0.9709,
"step": 2050
},
{
"epoch": 0.53,
"grad_norm": 1.0673532485961914,
"learning_rate": 1.3357336343115126e-05,
"loss": 0.8208,
"step": 2100
},
{
"epoch": 0.54,
"grad_norm": 10.350920677185059,
"learning_rate": 1.3339653875094056e-05,
"loss": 1.1503,
"step": 2150
},
{
"epoch": 0.55,
"grad_norm": 0.7176612019538879,
"learning_rate": 1.3321971407072987e-05,
"loss": 0.5841,
"step": 2200
},
{
"epoch": 0.56,
"grad_norm": 16.532655715942383,
"learning_rate": 1.330428893905192e-05,
"loss": 1.1618,
"step": 2250
},
{
"epoch": 0.58,
"grad_norm": 0.24398092925548553,
"learning_rate": 1.3286606471030852e-05,
"loss": 0.6052,
"step": 2300
},
{
"epoch": 0.59,
"grad_norm": 45.761695861816406,
"learning_rate": 1.3268924003009782e-05,
"loss": 1.0618,
"step": 2350
},
{
"epoch": 0.6,
"grad_norm": 0.3656911849975586,
"learning_rate": 1.3251241534988713e-05,
"loss": 0.8395,
"step": 2400
},
{
"epoch": 0.61,
"grad_norm": 56.36614227294922,
"learning_rate": 1.3233559066967646e-05,
"loss": 0.6547,
"step": 2450
},
{
"epoch": 0.63,
"grad_norm": 50.591705322265625,
"learning_rate": 1.3215876598946576e-05,
"loss": 0.9528,
"step": 2500
},
{
"epoch": 0.64,
"grad_norm": 11.290885925292969,
"learning_rate": 1.3198194130925507e-05,
"loss": 0.6811,
"step": 2550
},
{
"epoch": 0.65,
"grad_norm": 0.10668418556451797,
"learning_rate": 1.3180511662904441e-05,
"loss": 0.7421,
"step": 2600
},
{
"epoch": 0.66,
"grad_norm": 1.0529690980911255,
"learning_rate": 1.3162829194883372e-05,
"loss": 0.7665,
"step": 2650
},
{
"epoch": 0.68,
"grad_norm": 35.5570068359375,
"learning_rate": 1.3145146726862302e-05,
"loss": 0.6587,
"step": 2700
},
{
"epoch": 0.69,
"grad_norm": 47.973697662353516,
"learning_rate": 1.3127464258841235e-05,
"loss": 0.8273,
"step": 2750
},
{
"epoch": 0.7,
"grad_norm": 42.45454788208008,
"learning_rate": 1.3109781790820166e-05,
"loss": 0.8512,
"step": 2800
},
{
"epoch": 0.71,
"grad_norm": 52.255821228027344,
"learning_rate": 1.3092099322799096e-05,
"loss": 0.5748,
"step": 2850
},
{
"epoch": 0.73,
"grad_norm": 55.622413635253906,
"learning_rate": 1.307441685477803e-05,
"loss": 0.6585,
"step": 2900
},
{
"epoch": 0.74,
"grad_norm": 6.804417610168457,
"learning_rate": 1.3056734386756961e-05,
"loss": 0.9276,
"step": 2950
},
{
"epoch": 0.75,
"grad_norm": 8.9085054397583,
"learning_rate": 1.3039051918735892e-05,
"loss": 0.9573,
"step": 3000
},
{
"epoch": 0.76,
"grad_norm": 3.399890422821045,
"learning_rate": 1.3021369450714824e-05,
"loss": 0.815,
"step": 3050
},
{
"epoch": 0.78,
"grad_norm": 9.621098518371582,
"learning_rate": 1.3003686982693755e-05,
"loss": 0.6272,
"step": 3100
},
{
"epoch": 0.79,
"grad_norm": 34.52663803100586,
"learning_rate": 1.2986004514672686e-05,
"loss": 0.7548,
"step": 3150
},
{
"epoch": 0.8,
"grad_norm": 38.8935661315918,
"learning_rate": 1.296832204665162e-05,
"loss": 1.0272,
"step": 3200
},
{
"epoch": 0.82,
"grad_norm": 53.31705093383789,
"learning_rate": 1.295063957863055e-05,
"loss": 0.8594,
"step": 3250
},
{
"epoch": 0.83,
"grad_norm": 24.726455688476562,
"learning_rate": 1.2932957110609481e-05,
"loss": 0.7025,
"step": 3300
},
{
"epoch": 0.84,
"grad_norm": 35.29804992675781,
"learning_rate": 1.2915274642588413e-05,
"loss": 0.8359,
"step": 3350
},
{
"epoch": 0.85,
"grad_norm": 15.382336616516113,
"learning_rate": 1.2897592174567344e-05,
"loss": 0.7358,
"step": 3400
},
{
"epoch": 0.87,
"grad_norm": 2.9050614833831787,
"learning_rate": 1.2879909706546275e-05,
"loss": 0.8021,
"step": 3450
},
{
"epoch": 0.88,
"grad_norm": 44.734962463378906,
"learning_rate": 1.2862227238525209e-05,
"loss": 0.844,
"step": 3500
},
{
"epoch": 0.89,
"grad_norm": 14.811912536621094,
"learning_rate": 1.284454477050414e-05,
"loss": 0.7822,
"step": 3550
},
{
"epoch": 0.9,
"grad_norm": 44.70045471191406,
"learning_rate": 1.282686230248307e-05,
"loss": 1.0654,
"step": 3600
},
{
"epoch": 0.92,
"grad_norm": 48.43465805053711,
"learning_rate": 1.2809179834462003e-05,
"loss": 0.6354,
"step": 3650
},
{
"epoch": 0.93,
"grad_norm": 47.798423767089844,
"learning_rate": 1.2791497366440933e-05,
"loss": 0.8125,
"step": 3700
},
{
"epoch": 0.94,
"grad_norm": 42.33122634887695,
"learning_rate": 1.2773814898419864e-05,
"loss": 1.1325,
"step": 3750
},
{
"epoch": 0.95,
"grad_norm": 0.14906466007232666,
"learning_rate": 1.2756132430398797e-05,
"loss": 0.5325,
"step": 3800
},
{
"epoch": 0.97,
"grad_norm": 10.49329662322998,
"learning_rate": 1.2738449962377729e-05,
"loss": 0.7013,
"step": 3850
},
{
"epoch": 0.98,
"grad_norm": 21.828550338745117,
"learning_rate": 1.272076749435666e-05,
"loss": 0.5134,
"step": 3900
},
{
"epoch": 0.99,
"grad_norm": 1.0481252670288086,
"learning_rate": 1.270308502633559e-05,
"loss": 1.4255,
"step": 3950
},
{
"epoch": 1.0,
"grad_norm": 1.075194001197815,
"learning_rate": 1.2685402558314523e-05,
"loss": 0.7727,
"step": 4000
},
{
"epoch": 1.02,
"grad_norm": 17.64851188659668,
"learning_rate": 1.2667720090293453e-05,
"loss": 0.4984,
"step": 4050
},
{
"epoch": 1.03,
"grad_norm": 49.92161178588867,
"learning_rate": 1.2650037622272386e-05,
"loss": 0.9065,
"step": 4100
},
{
"epoch": 1.04,
"grad_norm": 11.019123077392578,
"learning_rate": 1.2632355154251318e-05,
"loss": 0.8184,
"step": 4150
},
{
"epoch": 1.05,
"grad_norm": 12.537881851196289,
"learning_rate": 1.2614672686230249e-05,
"loss": 0.6989,
"step": 4200
},
{
"epoch": 1.07,
"grad_norm": 0.5771467089653015,
"learning_rate": 1.259699021820918e-05,
"loss": 0.7282,
"step": 4250
},
{
"epoch": 1.08,
"grad_norm": 60.68583297729492,
"learning_rate": 1.2579307750188112e-05,
"loss": 0.695,
"step": 4300
},
{
"epoch": 1.09,
"grad_norm": 0.7341581583023071,
"learning_rate": 1.2561625282167043e-05,
"loss": 0.7021,
"step": 4350
},
{
"epoch": 1.1,
"grad_norm": 0.020291157066822052,
"learning_rate": 1.2543942814145975e-05,
"loss": 0.8563,
"step": 4400
},
{
"epoch": 1.12,
"grad_norm": 1.3924442529678345,
"learning_rate": 1.2526260346124907e-05,
"loss": 0.7378,
"step": 4450
},
{
"epoch": 1.13,
"grad_norm": 31.691173553466797,
"learning_rate": 1.2508577878103838e-05,
"loss": 0.5887,
"step": 4500
},
{
"epoch": 1.14,
"grad_norm": 1.1823307275772095,
"learning_rate": 1.2490895410082769e-05,
"loss": 0.8132,
"step": 4550
},
{
"epoch": 1.15,
"grad_norm": 0.08653511106967926,
"learning_rate": 1.2473212942061701e-05,
"loss": 0.8374,
"step": 4600
},
{
"epoch": 1.17,
"grad_norm": 2.169903039932251,
"learning_rate": 1.2455530474040632e-05,
"loss": 0.588,
"step": 4650
},
{
"epoch": 1.18,
"grad_norm": 56.76768112182617,
"learning_rate": 1.2437848006019564e-05,
"loss": 0.7869,
"step": 4700
},
{
"epoch": 1.19,
"grad_norm": 0.05390803515911102,
"learning_rate": 1.2420165537998497e-05,
"loss": 0.6243,
"step": 4750
},
{
"epoch": 1.2,
"grad_norm": 5.537655830383301,
"learning_rate": 1.2402483069977427e-05,
"loss": 0.737,
"step": 4800
},
{
"epoch": 1.22,
"grad_norm": 69.44229125976562,
"learning_rate": 1.2384800601956358e-05,
"loss": 1.0479,
"step": 4850
},
{
"epoch": 1.23,
"grad_norm": 45.22208023071289,
"learning_rate": 1.236711813393529e-05,
"loss": 0.8327,
"step": 4900
},
{
"epoch": 1.24,
"grad_norm": 22.553054809570312,
"learning_rate": 1.2349435665914221e-05,
"loss": 0.6587,
"step": 4950
},
{
"epoch": 1.25,
"grad_norm": 2.1869142055511475,
"learning_rate": 1.2331753197893154e-05,
"loss": 0.5913,
"step": 5000
},
{
"epoch": 1.27,
"grad_norm": 2.483933210372925,
"learning_rate": 1.2314070729872086e-05,
"loss": 0.8163,
"step": 5050
},
{
"epoch": 1.28,
"grad_norm": 18.768310546875,
"learning_rate": 1.2296388261851017e-05,
"loss": 0.6273,
"step": 5100
},
{
"epoch": 1.29,
"grad_norm": 56.0864372253418,
"learning_rate": 1.2278705793829947e-05,
"loss": 0.8787,
"step": 5150
},
{
"epoch": 1.3,
"grad_norm": 51.98051834106445,
"learning_rate": 1.226102332580888e-05,
"loss": 0.4302,
"step": 5200
},
{
"epoch": 1.32,
"grad_norm": 17.60165023803711,
"learning_rate": 1.224334085778781e-05,
"loss": 0.7238,
"step": 5250
},
{
"epoch": 1.33,
"grad_norm": 48.4942626953125,
"learning_rate": 1.2225658389766743e-05,
"loss": 0.8018,
"step": 5300
},
{
"epoch": 1.34,
"grad_norm": 14.206453323364258,
"learning_rate": 1.2207975921745674e-05,
"loss": 0.5428,
"step": 5350
},
{
"epoch": 1.35,
"grad_norm": 65.64610290527344,
"learning_rate": 1.2190293453724606e-05,
"loss": 0.7923,
"step": 5400
},
{
"epoch": 1.37,
"grad_norm": 9.786343574523926,
"learning_rate": 1.2172610985703537e-05,
"loss": 0.7779,
"step": 5450
},
{
"epoch": 1.38,
"grad_norm": 3.1632120609283447,
"learning_rate": 1.2154928517682467e-05,
"loss": 0.6474,
"step": 5500
},
{
"epoch": 1.39,
"grad_norm": 15.631272315979004,
"learning_rate": 1.21372460496614e-05,
"loss": 0.6736,
"step": 5550
},
{
"epoch": 1.4,
"grad_norm": 0.025490593165159225,
"learning_rate": 1.2119563581640332e-05,
"loss": 0.6371,
"step": 5600
},
{
"epoch": 1.42,
"grad_norm": 84.42486572265625,
"learning_rate": 1.2101881113619263e-05,
"loss": 0.9348,
"step": 5650
},
{
"epoch": 1.43,
"grad_norm": 0.32389989495277405,
"learning_rate": 1.2084198645598195e-05,
"loss": 0.8304,
"step": 5700
},
{
"epoch": 1.44,
"grad_norm": 49.16242599487305,
"learning_rate": 1.2066516177577126e-05,
"loss": 0.6624,
"step": 5750
},
{
"epoch": 1.45,
"grad_norm": 119.3700942993164,
"learning_rate": 1.2048833709556057e-05,
"loss": 1.1135,
"step": 5800
},
{
"epoch": 1.47,
"grad_norm": 0.15834768116474152,
"learning_rate": 1.2031151241534989e-05,
"loss": 0.6358,
"step": 5850
},
{
"epoch": 1.48,
"grad_norm": 54.722652435302734,
"learning_rate": 1.2013468773513922e-05,
"loss": 0.5639,
"step": 5900
},
{
"epoch": 1.49,
"grad_norm": 0.0872531533241272,
"learning_rate": 1.1995786305492852e-05,
"loss": 0.7912,
"step": 5950
},
{
"epoch": 1.5,
"grad_norm": 2.5009591579437256,
"learning_rate": 1.1978103837471785e-05,
"loss": 0.6478,
"step": 6000
},
{
"epoch": 1.52,
"grad_norm": 1.5101827383041382,
"learning_rate": 1.1960421369450715e-05,
"loss": 0.8577,
"step": 6050
},
{
"epoch": 1.53,
"grad_norm": 3.4737539291381836,
"learning_rate": 1.1942738901429646e-05,
"loss": 0.9474,
"step": 6100
},
{
"epoch": 1.54,
"grad_norm": 92.57341003417969,
"learning_rate": 1.1925056433408578e-05,
"loss": 0.8665,
"step": 6150
},
{
"epoch": 1.56,
"grad_norm": 38.56670379638672,
"learning_rate": 1.1907373965387509e-05,
"loss": 0.7833,
"step": 6200
},
{
"epoch": 1.57,
"grad_norm": 29.12518310546875,
"learning_rate": 1.1889691497366442e-05,
"loss": 0.7454,
"step": 6250
},
{
"epoch": 1.58,
"grad_norm": 69.91959381103516,
"learning_rate": 1.1872009029345374e-05,
"loss": 0.7843,
"step": 6300
},
{
"epoch": 1.59,
"grad_norm": 56.20566177368164,
"learning_rate": 1.1854326561324305e-05,
"loss": 0.841,
"step": 6350
},
{
"epoch": 1.61,
"grad_norm": 66.2998275756836,
"learning_rate": 1.1836644093303235e-05,
"loss": 0.723,
"step": 6400
},
{
"epoch": 1.62,
"grad_norm": 1.9407018423080444,
"learning_rate": 1.1818961625282168e-05,
"loss": 0.7235,
"step": 6450
},
{
"epoch": 1.63,
"grad_norm": 61.69858932495117,
"learning_rate": 1.1801279157261098e-05,
"loss": 0.8241,
"step": 6500
},
{
"epoch": 1.64,
"grad_norm": 8.412137985229492,
"learning_rate": 1.178359668924003e-05,
"loss": 0.564,
"step": 6550
},
{
"epoch": 1.66,
"grad_norm": 9.307317733764648,
"learning_rate": 1.1765914221218962e-05,
"loss": 0.8438,
"step": 6600
},
{
"epoch": 1.67,
"grad_norm": 41.45466995239258,
"learning_rate": 1.1748231753197894e-05,
"loss": 0.7763,
"step": 6650
},
{
"epoch": 1.68,
"grad_norm": 2.8245513439178467,
"learning_rate": 1.1730549285176825e-05,
"loss": 0.7476,
"step": 6700
},
{
"epoch": 1.69,
"grad_norm": 76.77831268310547,
"learning_rate": 1.1712866817155757e-05,
"loss": 0.9578,
"step": 6750
},
{
"epoch": 1.71,
"grad_norm": 0.004409218207001686,
"learning_rate": 1.1695184349134688e-05,
"loss": 0.8765,
"step": 6800
},
{
"epoch": 1.72,
"grad_norm": 46.58176803588867,
"learning_rate": 1.167750188111362e-05,
"loss": 0.5402,
"step": 6850
},
{
"epoch": 1.73,
"grad_norm": 5.006879806518555,
"learning_rate": 1.165981941309255e-05,
"loss": 0.4722,
"step": 6900
},
{
"epoch": 1.74,
"grad_norm": 2.194460153579712,
"learning_rate": 1.1642136945071483e-05,
"loss": 0.858,
"step": 6950
},
{
"epoch": 1.76,
"grad_norm": 0.012106262147426605,
"learning_rate": 1.1624454477050414e-05,
"loss": 0.6607,
"step": 7000
},
{
"epoch": 1.77,
"grad_norm": 6.08723258972168,
"learning_rate": 1.1606772009029345e-05,
"loss": 0.866,
"step": 7050
},
{
"epoch": 1.78,
"grad_norm": 51.338478088378906,
"learning_rate": 1.1589089541008277e-05,
"loss": 0.7508,
"step": 7100
},
{
"epoch": 1.79,
"grad_norm": 18.472858428955078,
"learning_rate": 1.157140707298721e-05,
"loss": 0.8686,
"step": 7150
},
{
"epoch": 1.81,
"grad_norm": 4.837900638580322,
"learning_rate": 1.155372460496614e-05,
"loss": 0.5302,
"step": 7200
},
{
"epoch": 1.82,
"grad_norm": 41.74524688720703,
"learning_rate": 1.1536042136945072e-05,
"loss": 0.7681,
"step": 7250
},
{
"epoch": 1.83,
"grad_norm": 30.557188034057617,
"learning_rate": 1.1518359668924003e-05,
"loss": 0.9107,
"step": 7300
},
{
"epoch": 1.84,
"grad_norm": 14.001880645751953,
"learning_rate": 1.1500677200902934e-05,
"loss": 0.5387,
"step": 7350
},
{
"epoch": 1.86,
"grad_norm": 0.1815216839313507,
"learning_rate": 1.1482994732881866e-05,
"loss": 0.8152,
"step": 7400
},
{
"epoch": 1.87,
"grad_norm": 36.915061950683594,
"learning_rate": 1.1465312264860799e-05,
"loss": 0.6313,
"step": 7450
},
{
"epoch": 1.88,
"grad_norm": 0.20334406197071075,
"learning_rate": 1.144762979683973e-05,
"loss": 0.8265,
"step": 7500
},
{
"epoch": 1.89,
"grad_norm": 0.0018741831881925464,
"learning_rate": 1.1429947328818662e-05,
"loss": 0.7202,
"step": 7550
},
{
"epoch": 1.91,
"grad_norm": 0.000707630708348006,
"learning_rate": 1.1412264860797592e-05,
"loss": 0.6488,
"step": 7600
},
{
"epoch": 1.92,
"grad_norm": 0.4616662561893463,
"learning_rate": 1.1394582392776523e-05,
"loss": 0.9402,
"step": 7650
},
{
"epoch": 1.93,
"grad_norm": 43.170814514160156,
"learning_rate": 1.1376899924755456e-05,
"loss": 0.763,
"step": 7700
},
{
"epoch": 1.94,
"grad_norm": 3.035790205001831,
"learning_rate": 1.1359217456734388e-05,
"loss": 0.5681,
"step": 7750
},
{
"epoch": 1.96,
"grad_norm": 45.11912536621094,
"learning_rate": 1.1341534988713319e-05,
"loss": 0.9795,
"step": 7800
},
{
"epoch": 1.97,
"grad_norm": 2.006427049636841,
"learning_rate": 1.1323852520692251e-05,
"loss": 0.4772,
"step": 7850
},
{
"epoch": 1.98,
"grad_norm": 69.13399505615234,
"learning_rate": 1.1306170052671182e-05,
"loss": 0.8649,
"step": 7900
},
{
"epoch": 1.99,
"grad_norm": 43.80717468261719,
"learning_rate": 1.1288487584650112e-05,
"loss": 0.6051,
"step": 7950
},
{
"epoch": 2.01,
"grad_norm": 1.3676908016204834,
"learning_rate": 1.1270805116629045e-05,
"loss": 0.4737,
"step": 8000
},
{
"epoch": 2.02,
"grad_norm": 18.533445358276367,
"learning_rate": 1.1253122648607977e-05,
"loss": 0.4353,
"step": 8050
},
{
"epoch": 2.03,
"grad_norm": 0.649580717086792,
"learning_rate": 1.1235440180586908e-05,
"loss": 0.9283,
"step": 8100
},
{
"epoch": 2.04,
"grad_norm": 37.0181999206543,
"learning_rate": 1.1217757712565839e-05,
"loss": 0.8631,
"step": 8150
},
{
"epoch": 2.06,
"grad_norm": 1.1191781759262085,
"learning_rate": 1.1200075244544771e-05,
"loss": 0.7166,
"step": 8200
},
{
"epoch": 2.07,
"grad_norm": 46.35097885131836,
"learning_rate": 1.1182392776523702e-05,
"loss": 0.6263,
"step": 8250
},
{
"epoch": 2.08,
"grad_norm": 9.393693923950195,
"learning_rate": 1.1164710308502632e-05,
"loss": 0.7146,
"step": 8300
},
{
"epoch": 2.09,
"grad_norm": 49.04343032836914,
"learning_rate": 1.1147027840481567e-05,
"loss": 0.5924,
"step": 8350
},
{
"epoch": 2.11,
"grad_norm": 2.917092800140381,
"learning_rate": 1.1129345372460497e-05,
"loss": 0.815,
"step": 8400
},
{
"epoch": 2.12,
"grad_norm": 6.2741618156433105,
"learning_rate": 1.1111662904439428e-05,
"loss": 0.8852,
"step": 8450
},
{
"epoch": 2.13,
"grad_norm": 0.026425007730722427,
"learning_rate": 1.109398043641836e-05,
"loss": 0.609,
"step": 8500
},
{
"epoch": 2.14,
"grad_norm": 8.229249954223633,
"learning_rate": 1.1076297968397291e-05,
"loss": 0.5546,
"step": 8550
},
{
"epoch": 2.16,
"grad_norm": 3.257112979888916,
"learning_rate": 1.1058615500376222e-05,
"loss": 0.6084,
"step": 8600
},
{
"epoch": 2.17,
"grad_norm": 44.147640228271484,
"learning_rate": 1.1040933032355156e-05,
"loss": 0.4687,
"step": 8650
},
{
"epoch": 2.18,
"grad_norm": 93.26548767089844,
"learning_rate": 1.1023250564334087e-05,
"loss": 0.6323,
"step": 8700
},
{
"epoch": 2.19,
"grad_norm": 83.17293548583984,
"learning_rate": 1.1005568096313017e-05,
"loss": 0.8759,
"step": 8750
},
{
"epoch": 2.21,
"grad_norm": 51.27419662475586,
"learning_rate": 1.098788562829195e-05,
"loss": 0.69,
"step": 8800
},
{
"epoch": 2.22,
"grad_norm": 0.0010558576323091984,
"learning_rate": 1.097020316027088e-05,
"loss": 0.5279,
"step": 8850
},
{
"epoch": 2.23,
"grad_norm": 73.43231201171875,
"learning_rate": 1.0952520692249811e-05,
"loss": 0.9285,
"step": 8900
},
{
"epoch": 2.24,
"grad_norm": 6.488553047180176,
"learning_rate": 1.0934838224228745e-05,
"loss": 0.6137,
"step": 8950
},
{
"epoch": 2.26,
"grad_norm": 53.465972900390625,
"learning_rate": 1.0917155756207676e-05,
"loss": 0.4718,
"step": 9000
},
{
"epoch": 2.27,
"grad_norm": 1.405421495437622,
"learning_rate": 1.0899473288186607e-05,
"loss": 0.7248,
"step": 9050
},
{
"epoch": 2.28,
"grad_norm": 58.552490234375,
"learning_rate": 1.0881790820165539e-05,
"loss": 0.6312,
"step": 9100
},
{
"epoch": 2.29,
"grad_norm": 85.75029754638672,
"learning_rate": 1.086410835214447e-05,
"loss": 1.1383,
"step": 9150
},
{
"epoch": 2.31,
"grad_norm": 1.4940392971038818,
"learning_rate": 1.08464258841234e-05,
"loss": 0.6104,
"step": 9200
},
{
"epoch": 2.32,
"grad_norm": 1.2434502840042114,
"learning_rate": 1.0828743416102334e-05,
"loss": 0.5124,
"step": 9250
},
{
"epoch": 2.33,
"grad_norm": 0.002772190608084202,
"learning_rate": 1.0811060948081265e-05,
"loss": 0.8389,
"step": 9300
},
{
"epoch": 2.35,
"grad_norm": 27.42812156677246,
"learning_rate": 1.0793378480060196e-05,
"loss": 0.6571,
"step": 9350
},
{
"epoch": 2.36,
"grad_norm": 70.63783264160156,
"learning_rate": 1.0775696012039128e-05,
"loss": 0.5234,
"step": 9400
},
{
"epoch": 2.37,
"grad_norm": 0.873970627784729,
"learning_rate": 1.0758013544018059e-05,
"loss": 0.7862,
"step": 9450
},
{
"epoch": 2.38,
"grad_norm": 0.0001105390620068647,
"learning_rate": 1.074033107599699e-05,
"loss": 0.9885,
"step": 9500
},
{
"epoch": 2.4,
"grad_norm": 2.0316097736358643,
"learning_rate": 1.0722648607975922e-05,
"loss": 0.6648,
"step": 9550
},
{
"epoch": 2.41,
"grad_norm": 33.791568756103516,
"learning_rate": 1.0704966139954854e-05,
"loss": 0.6746,
"step": 9600
},
{
"epoch": 2.42,
"grad_norm": 112.26337432861328,
"learning_rate": 1.0687283671933785e-05,
"loss": 0.787,
"step": 9650
},
{
"epoch": 2.43,
"grad_norm": 53.35863494873047,
"learning_rate": 1.0669601203912716e-05,
"loss": 0.5922,
"step": 9700
},
{
"epoch": 2.45,
"grad_norm": 0.0027942871674895287,
"learning_rate": 1.0651918735891648e-05,
"loss": 0.6236,
"step": 9750
},
{
"epoch": 2.46,
"grad_norm": 0.00036070370697416365,
"learning_rate": 1.0634236267870579e-05,
"loss": 0.6559,
"step": 9800
},
{
"epoch": 2.47,
"grad_norm": 2.5188686847686768,
"learning_rate": 1.0616553799849511e-05,
"loss": 1.002,
"step": 9850
},
{
"epoch": 2.48,
"grad_norm": 42.79086685180664,
"learning_rate": 1.0598871331828444e-05,
"loss": 1.001,
"step": 9900
},
{
"epoch": 2.5,
"grad_norm": 0.06492776423692703,
"learning_rate": 1.0581188863807374e-05,
"loss": 0.9975,
"step": 9950
},
{
"epoch": 2.51,
"grad_norm": 12.079846382141113,
"learning_rate": 1.0563506395786305e-05,
"loss": 0.6417,
"step": 10000
},
{
"epoch": 2.52,
"grad_norm": 98.72542572021484,
"learning_rate": 1.0545823927765237e-05,
"loss": 0.9242,
"step": 10050
},
{
"epoch": 2.53,
"grad_norm": 0.15632659196853638,
"learning_rate": 1.0528141459744168e-05,
"loss": 0.4118,
"step": 10100
},
{
"epoch": 2.55,
"grad_norm": 3.5314505100250244,
"learning_rate": 1.05104589917231e-05,
"loss": 0.6486,
"step": 10150
},
{
"epoch": 2.56,
"grad_norm": 0.06171553581953049,
"learning_rate": 1.0492776523702033e-05,
"loss": 0.7782,
"step": 10200
},
{
"epoch": 2.57,
"grad_norm": 69.53456115722656,
"learning_rate": 1.0475094055680964e-05,
"loss": 0.5421,
"step": 10250
},
{
"epoch": 2.58,
"grad_norm": 27.149484634399414,
"learning_rate": 1.0457411587659894e-05,
"loss": 0.7476,
"step": 10300
},
{
"epoch": 2.6,
"grad_norm": 3.7423877716064453,
"learning_rate": 1.0439729119638827e-05,
"loss": 0.7429,
"step": 10350
},
{
"epoch": 2.61,
"grad_norm": 0.6006436944007874,
"learning_rate": 1.0422046651617757e-05,
"loss": 0.4376,
"step": 10400
},
{
"epoch": 2.62,
"grad_norm": 0.2609996497631073,
"learning_rate": 1.040436418359669e-05,
"loss": 0.8938,
"step": 10450
},
{
"epoch": 2.63,
"grad_norm": 73.91007232666016,
"learning_rate": 1.0386681715575622e-05,
"loss": 0.7273,
"step": 10500
},
{
"epoch": 2.65,
"grad_norm": 0.010080622509121895,
"learning_rate": 1.0368999247554553e-05,
"loss": 0.7709,
"step": 10550
},
{
"epoch": 2.66,
"grad_norm": 5.206912994384766,
"learning_rate": 1.0351316779533484e-05,
"loss": 0.696,
"step": 10600
},
{
"epoch": 2.67,
"grad_norm": 94.36717987060547,
"learning_rate": 1.0333634311512416e-05,
"loss": 0.6964,
"step": 10650
},
{
"epoch": 2.68,
"grad_norm": 0.6438612341880798,
"learning_rate": 1.0315951843491347e-05,
"loss": 0.6461,
"step": 10700
},
{
"epoch": 2.7,
"grad_norm": 0.02532346546649933,
"learning_rate": 1.029826937547028e-05,
"loss": 0.8581,
"step": 10750
},
{
"epoch": 2.71,
"grad_norm": 1.5096291303634644,
"learning_rate": 1.0280586907449212e-05,
"loss": 0.4629,
"step": 10800
},
{
"epoch": 2.72,
"grad_norm": 81.77324676513672,
"learning_rate": 1.0262904439428142e-05,
"loss": 0.8681,
"step": 10850
},
{
"epoch": 2.73,
"grad_norm": 1.1398659944534302,
"learning_rate": 1.0245221971407073e-05,
"loss": 0.5162,
"step": 10900
},
{
"epoch": 2.75,
"grad_norm": 0.4226570725440979,
"learning_rate": 1.0227539503386005e-05,
"loss": 0.4572,
"step": 10950
},
{
"epoch": 2.76,
"grad_norm": 0.02047480270266533,
"learning_rate": 1.0209857035364936e-05,
"loss": 0.8946,
"step": 11000
},
{
"epoch": 2.77,
"grad_norm": 124.79954528808594,
"learning_rate": 1.0192174567343868e-05,
"loss": 0.8325,
"step": 11050
},
{
"epoch": 2.78,
"grad_norm": 7.112376624718308e-05,
"learning_rate": 1.01744920993228e-05,
"loss": 0.5664,
"step": 11100
},
{
"epoch": 2.8,
"grad_norm": 78.66365051269531,
"learning_rate": 1.0156809631301732e-05,
"loss": 0.9426,
"step": 11150
},
{
"epoch": 2.81,
"grad_norm": 9.567934466758743e-05,
"learning_rate": 1.0139127163280662e-05,
"loss": 0.4818,
"step": 11200
},
{
"epoch": 2.82,
"grad_norm": 0.003907013684511185,
"learning_rate": 1.0121444695259593e-05,
"loss": 0.743,
"step": 11250
},
{
"epoch": 2.83,
"grad_norm": 84.53366088867188,
"learning_rate": 1.0103762227238525e-05,
"loss": 0.8544,
"step": 11300
},
{
"epoch": 2.85,
"grad_norm": 3.4674291610717773,
"learning_rate": 1.0086079759217458e-05,
"loss": 0.5553,
"step": 11350
},
{
"epoch": 2.86,
"grad_norm": 125.62838745117188,
"learning_rate": 1.0068397291196388e-05,
"loss": 0.6168,
"step": 11400
},
{
"epoch": 2.87,
"grad_norm": 99.19140625,
"learning_rate": 1.005071482317532e-05,
"loss": 1.1238,
"step": 11450
},
{
"epoch": 2.88,
"grad_norm": 38.717559814453125,
"learning_rate": 1.0033032355154252e-05,
"loss": 1.0667,
"step": 11500
},
{
"epoch": 2.9,
"grad_norm": 28.915889739990234,
"learning_rate": 1.0015349887133182e-05,
"loss": 0.5045,
"step": 11550
},
{
"epoch": 2.91,
"grad_norm": 48.31145477294922,
"learning_rate": 9.997667419112115e-06,
"loss": 0.754,
"step": 11600
},
{
"epoch": 2.92,
"grad_norm": 0.06709738075733185,
"learning_rate": 9.979984951091047e-06,
"loss": 0.6229,
"step": 11650
},
{
"epoch": 2.93,
"grad_norm": 1.2689626216888428,
"learning_rate": 9.962302483069978e-06,
"loss": 0.7818,
"step": 11700
},
{
"epoch": 2.95,
"grad_norm": 35.311134338378906,
"learning_rate": 9.94462001504891e-06,
"loss": 1.0477,
"step": 11750
},
{
"epoch": 2.96,
"grad_norm": 88.91561889648438,
"learning_rate": 9.92693754702784e-06,
"loss": 0.6488,
"step": 11800
},
{
"epoch": 2.97,
"grad_norm": 70.55093383789062,
"learning_rate": 9.909255079006772e-06,
"loss": 0.5951,
"step": 11850
},
{
"epoch": 2.98,
"grad_norm": 89.51988983154297,
"learning_rate": 9.891572610985704e-06,
"loss": 0.6867,
"step": 11900
},
{
"epoch": 3.0,
"grad_norm": 0.40069764852523804,
"learning_rate": 9.873890142964636e-06,
"loss": 0.7094,
"step": 11950
},
{
"epoch": 3.01,
"grad_norm": 2.006258964538574,
"learning_rate": 9.856207674943567e-06,
"loss": 0.5428,
"step": 12000
},
{
"epoch": 3.02,
"grad_norm": 51.34798049926758,
"learning_rate": 9.8385252069225e-06,
"loss": 0.573,
"step": 12050
},
{
"epoch": 3.03,
"grad_norm": 95.47881317138672,
"learning_rate": 9.82084273890143e-06,
"loss": 0.4226,
"step": 12100
},
{
"epoch": 3.05,
"grad_norm": 0.07185523957014084,
"learning_rate": 9.80316027088036e-06,
"loss": 0.6424,
"step": 12150
},
{
"epoch": 3.06,
"grad_norm": 109.8128662109375,
"learning_rate": 9.785477802859293e-06,
"loss": 0.5279,
"step": 12200
},
{
"epoch": 3.07,
"grad_norm": 44.56191635131836,
"learning_rate": 9.767795334838224e-06,
"loss": 0.3463,
"step": 12250
},
{
"epoch": 3.09,
"grad_norm": 0.45552492141723633,
"learning_rate": 9.750112866817156e-06,
"loss": 0.6696,
"step": 12300
},
{
"epoch": 3.1,
"grad_norm": 0.0008902169647626579,
"learning_rate": 9.732430398796089e-06,
"loss": 0.3845,
"step": 12350
},
{
"epoch": 3.11,
"grad_norm": 134.49839782714844,
"learning_rate": 9.71474793077502e-06,
"loss": 0.8803,
"step": 12400
},
{
"epoch": 3.12,
"grad_norm": 0.21923835575580597,
"learning_rate": 9.69706546275395e-06,
"loss": 0.741,
"step": 12450
},
{
"epoch": 3.14,
"grad_norm": 0.2331884801387787,
"learning_rate": 9.679382994732883e-06,
"loss": 0.7015,
"step": 12500
},
{
"epoch": 3.15,
"grad_norm": 0.4663000702857971,
"learning_rate": 9.661700526711813e-06,
"loss": 0.7605,
"step": 12550
},
{
"epoch": 3.16,
"grad_norm": 59.55733871459961,
"learning_rate": 9.644018058690746e-06,
"loss": 0.5855,
"step": 12600
},
{
"epoch": 3.17,
"grad_norm": 0.8377301096916199,
"learning_rate": 9.626335590669676e-06,
"loss": 0.4117,
"step": 12650
},
{
"epoch": 3.19,
"grad_norm": 64.69242095947266,
"learning_rate": 9.608653122648609e-06,
"loss": 0.5216,
"step": 12700
},
{
"epoch": 3.2,
"grad_norm": 0.8485704660415649,
"learning_rate": 9.59097065462754e-06,
"loss": 0.6882,
"step": 12750
},
{
"epoch": 3.21,
"grad_norm": 143.98147583007812,
"learning_rate": 9.57328818660647e-06,
"loss": 0.6463,
"step": 12800
},
{
"epoch": 3.22,
"grad_norm": 132.84567260742188,
"learning_rate": 9.555605718585403e-06,
"loss": 0.7474,
"step": 12850
},
{
"epoch": 3.24,
"grad_norm": 8.179304122924805,
"learning_rate": 9.537923250564335e-06,
"loss": 0.375,
"step": 12900
},
{
"epoch": 3.25,
"grad_norm": 10.138591766357422,
"learning_rate": 9.520240782543266e-06,
"loss": 0.7204,
"step": 12950
},
{
"epoch": 3.26,
"grad_norm": 0.00011070028267567977,
"learning_rate": 9.502558314522198e-06,
"loss": 0.3631,
"step": 13000
},
{
"epoch": 3.27,
"grad_norm": 1.0425533056259155,
"learning_rate": 9.484875846501129e-06,
"loss": 0.6752,
"step": 13050
},
{
"epoch": 3.29,
"grad_norm": 19.544971466064453,
"learning_rate": 9.46719337848006e-06,
"loss": 0.4082,
"step": 13100
},
{
"epoch": 3.3,
"grad_norm": 3.29071121996094e-06,
"learning_rate": 9.449510910458992e-06,
"loss": 0.752,
"step": 13150
},
{
"epoch": 3.31,
"grad_norm": 1.4096872806549072,
"learning_rate": 9.431828442437924e-06,
"loss": 0.739,
"step": 13200
},
{
"epoch": 3.32,
"grad_norm": 0.1742667555809021,
"learning_rate": 9.414145974416855e-06,
"loss": 0.5783,
"step": 13250
},
{
"epoch": 3.34,
"grad_norm": 0.8604665398597717,
"learning_rate": 9.396463506395787e-06,
"loss": 0.8603,
"step": 13300
},
{
"epoch": 3.35,
"grad_norm": 6.3410016082343645e-06,
"learning_rate": 9.378781038374718e-06,
"loss": 0.4481,
"step": 13350
},
{
"epoch": 3.36,
"grad_norm": 157.0394744873047,
"learning_rate": 9.361098570353649e-06,
"loss": 0.6242,
"step": 13400
},
{
"epoch": 3.37,
"grad_norm": 0.00026235656696371734,
"learning_rate": 9.343416102332581e-06,
"loss": 0.7734,
"step": 13450
},
{
"epoch": 3.39,
"grad_norm": 0.48436620831489563,
"learning_rate": 9.325733634311513e-06,
"loss": 0.4109,
"step": 13500
},
{
"epoch": 3.4,
"grad_norm": 136.50823974609375,
"learning_rate": 9.308051166290444e-06,
"loss": 0.6214,
"step": 13550
},
{
"epoch": 3.41,
"grad_norm": 0.14412285387516022,
"learning_rate": 9.290368698269377e-06,
"loss": 0.2606,
"step": 13600
},
{
"epoch": 3.42,
"grad_norm": 11.025894165039062,
"learning_rate": 9.272686230248307e-06,
"loss": 0.7337,
"step": 13650
},
{
"epoch": 3.44,
"grad_norm": 121.1470718383789,
"learning_rate": 9.255003762227238e-06,
"loss": 0.7108,
"step": 13700
},
{
"epoch": 3.45,
"grad_norm": 0.08408990502357483,
"learning_rate": 9.23732129420617e-06,
"loss": 0.4979,
"step": 13750
},
{
"epoch": 3.46,
"grad_norm": 0.05547923222184181,
"learning_rate": 9.219638826185103e-06,
"loss": 0.299,
"step": 13800
},
{
"epoch": 3.47,
"grad_norm": 131.8295135498047,
"learning_rate": 9.201956358164033e-06,
"loss": 1.0513,
"step": 13850
},
{
"epoch": 3.49,
"grad_norm": 40.073734283447266,
"learning_rate": 9.184273890142966e-06,
"loss": 0.4599,
"step": 13900
},
{
"epoch": 3.5,
"grad_norm": 18.33232879638672,
"learning_rate": 9.166591422121897e-06,
"loss": 0.6182,
"step": 13950
},
{
"epoch": 3.51,
"grad_norm": 0.02969328872859478,
"learning_rate": 9.148908954100827e-06,
"loss": 0.4793,
"step": 14000
},
{
"epoch": 3.52,
"grad_norm": 0.36942940950393677,
"learning_rate": 9.13122648607976e-06,
"loss": 0.3778,
"step": 14050
},
{
"epoch": 3.54,
"grad_norm": 0.076649971306324,
"learning_rate": 9.113544018058692e-06,
"loss": 0.6148,
"step": 14100
},
{
"epoch": 3.55,
"grad_norm": 282.6568298339844,
"learning_rate": 9.095861550037623e-06,
"loss": 0.6784,
"step": 14150
},
{
"epoch": 3.56,
"grad_norm": 0.14636385440826416,
"learning_rate": 9.078179082016553e-06,
"loss": 0.9237,
"step": 14200
},
{
"epoch": 3.57,
"grad_norm": 0.014414280652999878,
"learning_rate": 9.060496613995486e-06,
"loss": 0.7111,
"step": 14250
},
{
"epoch": 3.59,
"grad_norm": 0.10564962774515152,
"learning_rate": 9.042814145974417e-06,
"loss": 0.4485,
"step": 14300
},
{
"epoch": 3.6,
"grad_norm": 0.10087831318378448,
"learning_rate": 9.025131677953347e-06,
"loss": 0.7537,
"step": 14350
},
{
"epoch": 3.61,
"grad_norm": 75.64422607421875,
"learning_rate": 9.007449209932281e-06,
"loss": 0.4629,
"step": 14400
},
{
"epoch": 3.62,
"grad_norm": 87.81208801269531,
"learning_rate": 8.989766741911212e-06,
"loss": 0.5313,
"step": 14450
},
{
"epoch": 3.64,
"grad_norm": 0.0018619262846186757,
"learning_rate": 8.972084273890143e-06,
"loss": 0.7642,
"step": 14500
},
{
"epoch": 3.65,
"grad_norm": 110.11195373535156,
"learning_rate": 8.954401805869075e-06,
"loss": 0.6499,
"step": 14550
},
{
"epoch": 3.66,
"grad_norm": 0.008621015585958958,
"learning_rate": 8.936719337848006e-06,
"loss": 0.3583,
"step": 14600
},
{
"epoch": 3.67,
"grad_norm": 0.022055380046367645,
"learning_rate": 8.919036869826937e-06,
"loss": 0.5497,
"step": 14650
},
{
"epoch": 3.69,
"grad_norm": 67.4389419555664,
"learning_rate": 8.90135440180587e-06,
"loss": 0.5981,
"step": 14700
},
{
"epoch": 3.7,
"grad_norm": 0.000478647300042212,
"learning_rate": 8.883671933784801e-06,
"loss": 0.259,
"step": 14750
},
{
"epoch": 3.71,
"grad_norm": 1.5297553539276123,
"learning_rate": 8.865989465763732e-06,
"loss": 0.6259,
"step": 14800
},
{
"epoch": 3.72,
"grad_norm": 36.321128845214844,
"learning_rate": 8.848306997742664e-06,
"loss": 0.6844,
"step": 14850
},
{
"epoch": 3.74,
"grad_norm": 175.9180450439453,
"learning_rate": 8.830624529721595e-06,
"loss": 0.5772,
"step": 14900
},
{
"epoch": 3.75,
"grad_norm": 178.33462524414062,
"learning_rate": 8.812942061700526e-06,
"loss": 0.5891,
"step": 14950
},
{
"epoch": 3.76,
"grad_norm": 0.00013845643843524158,
"learning_rate": 8.79525959367946e-06,
"loss": 0.464,
"step": 15000
},
{
"epoch": 3.77,
"grad_norm": 127.49348449707031,
"learning_rate": 8.77757712565839e-06,
"loss": 0.5844,
"step": 15050
},
{
"epoch": 3.79,
"grad_norm": 1.6402578353881836,
"learning_rate": 8.759894657637321e-06,
"loss": 0.7526,
"step": 15100
},
{
"epoch": 3.8,
"grad_norm": 0.008880015462636948,
"learning_rate": 8.742212189616254e-06,
"loss": 0.9234,
"step": 15150
},
{
"epoch": 3.81,
"grad_norm": 0.4811843931674957,
"learning_rate": 8.724529721595184e-06,
"loss": 0.848,
"step": 15200
},
{
"epoch": 3.82,
"grad_norm": 0.0008742750505916774,
"learning_rate": 8.706847253574115e-06,
"loss": 0.4136,
"step": 15250
},
{
"epoch": 3.84,
"grad_norm": 45.28816604614258,
"learning_rate": 8.68916478555305e-06,
"loss": 0.6978,
"step": 15300
},
{
"epoch": 3.85,
"grad_norm": 0.014465034939348698,
"learning_rate": 8.67148231753198e-06,
"loss": 0.6124,
"step": 15350
},
{
"epoch": 3.86,
"grad_norm": 0.01468442752957344,
"learning_rate": 8.65379984951091e-06,
"loss": 0.925,
"step": 15400
},
{
"epoch": 3.88,
"grad_norm": 1.076714283954061e-06,
"learning_rate": 8.636117381489843e-06,
"loss": 0.5271,
"step": 15450
},
{
"epoch": 3.89,
"grad_norm": 4.781663847097661e-07,
"learning_rate": 8.618434913468774e-06,
"loss": 0.4686,
"step": 15500
},
{
"epoch": 3.9,
"grad_norm": 1.0695022344589233,
"learning_rate": 8.600752445447704e-06,
"loss": 1.076,
"step": 15550
},
{
"epoch": 3.91,
"grad_norm": 0.3064178228378296,
"learning_rate": 8.583069977426637e-06,
"loss": 0.4409,
"step": 15600
},
{
"epoch": 3.93,
"grad_norm": 95.81256103515625,
"learning_rate": 8.56538750940557e-06,
"loss": 0.628,
"step": 15650
},
{
"epoch": 3.94,
"grad_norm": 0.011423008516430855,
"learning_rate": 8.5477050413845e-06,
"loss": 0.4738,
"step": 15700
},
{
"epoch": 3.95,
"grad_norm": 68.7823257446289,
"learning_rate": 8.53002257336343e-06,
"loss": 0.5614,
"step": 15750
},
{
"epoch": 3.96,
"grad_norm": 0.0003278045041952282,
"learning_rate": 8.512340105342363e-06,
"loss": 0.451,
"step": 15800
},
{
"epoch": 3.98,
"grad_norm": 5.685105293196102e-07,
"learning_rate": 8.494657637321294e-06,
"loss": 0.6919,
"step": 15850
},
{
"epoch": 3.99,
"grad_norm": 0.006908051203936338,
"learning_rate": 8.476975169300226e-06,
"loss": 0.7209,
"step": 15900
},
{
"epoch": 4.0,
"grad_norm": 0.14153322577476501,
"learning_rate": 8.459292701279158e-06,
"loss": 0.8544,
"step": 15950
},
{
"epoch": 4.01,
"grad_norm": 0.01233228575438261,
"learning_rate": 8.44161023325809e-06,
"loss": 0.1127,
"step": 16000
},
{
"epoch": 4.03,
"grad_norm": 0.02049972675740719,
"learning_rate": 8.42392776523702e-06,
"loss": 0.2392,
"step": 16050
},
{
"epoch": 4.04,
"grad_norm": 0.6001113653182983,
"learning_rate": 8.406245297215952e-06,
"loss": 0.2408,
"step": 16100
},
{
"epoch": 4.05,
"grad_norm": 0.7253586649894714,
"learning_rate": 8.388562829194883e-06,
"loss": 0.679,
"step": 16150
},
{
"epoch": 4.06,
"grad_norm": 0.20070885121822357,
"learning_rate": 8.370880361173815e-06,
"loss": 0.5534,
"step": 16200
},
{
"epoch": 4.08,
"grad_norm": 0.004428381100296974,
"learning_rate": 8.353197893152748e-06,
"loss": 0.3753,
"step": 16250
},
{
"epoch": 4.09,
"grad_norm": 0.1646382063627243,
"learning_rate": 8.335515425131678e-06,
"loss": 0.7136,
"step": 16300
},
{
"epoch": 4.1,
"grad_norm": 4.304123401641846,
"learning_rate": 8.31783295711061e-06,
"loss": 0.6533,
"step": 16350
},
{
"epoch": 4.11,
"grad_norm": 0.0014060864923521876,
"learning_rate": 8.300150489089542e-06,
"loss": 0.2931,
"step": 16400
},
{
"epoch": 4.13,
"grad_norm": 10.760331153869629,
"learning_rate": 8.282468021068472e-06,
"loss": 0.2996,
"step": 16450
},
{
"epoch": 4.14,
"grad_norm": 151.8526611328125,
"learning_rate": 8.264785553047405e-06,
"loss": 0.2342,
"step": 16500
},
{
"epoch": 4.15,
"grad_norm": 0.2262250781059265,
"learning_rate": 8.247103085026337e-06,
"loss": 0.2152,
"step": 16550
},
{
"epoch": 4.16,
"grad_norm": 0.028175359591841698,
"learning_rate": 8.229420617005268e-06,
"loss": 0.2108,
"step": 16600
},
{
"epoch": 4.18,
"grad_norm": 1.2244036197662354,
"learning_rate": 8.211738148984198e-06,
"loss": 0.4471,
"step": 16650
},
{
"epoch": 4.19,
"grad_norm": 0.12875045835971832,
"learning_rate": 8.194055680963131e-06,
"loss": 0.5662,
"step": 16700
},
{
"epoch": 4.2,
"grad_norm": 3.702627420425415,
"learning_rate": 8.176373212942062e-06,
"loss": 0.3945,
"step": 16750
},
{
"epoch": 4.21,
"grad_norm": 50.61404800415039,
"learning_rate": 8.158690744920994e-06,
"loss": 0.2347,
"step": 16800
},
{
"epoch": 4.23,
"grad_norm": 0.736967146396637,
"learning_rate": 8.141008276899926e-06,
"loss": 0.2615,
"step": 16850
},
{
"epoch": 4.24,
"grad_norm": 0.00011446132702985778,
"learning_rate": 8.123325808878857e-06,
"loss": 0.5149,
"step": 16900
},
{
"epoch": 4.25,
"grad_norm": 0.0010398293379694223,
"learning_rate": 8.105643340857788e-06,
"loss": 0.2957,
"step": 16950
},
{
"epoch": 4.26,
"grad_norm": 0.26418277621269226,
"learning_rate": 8.08796087283672e-06,
"loss": 0.2704,
"step": 17000
},
{
"epoch": 4.28,
"grad_norm": 0.8061837553977966,
"learning_rate": 8.070278404815651e-06,
"loss": 0.367,
"step": 17050
},
{
"epoch": 4.29,
"grad_norm": 0.010115943849086761,
"learning_rate": 8.052595936794583e-06,
"loss": 0.3768,
"step": 17100
},
{
"epoch": 4.3,
"grad_norm": 27.51811981201172,
"learning_rate": 8.034913468773514e-06,
"loss": 0.3892,
"step": 17150
},
{
"epoch": 4.31,
"grad_norm": 0.000684226572047919,
"learning_rate": 8.017231000752446e-06,
"loss": 0.1805,
"step": 17200
},
{
"epoch": 4.33,
"grad_norm": 0.08357678353786469,
"learning_rate": 7.999548532731377e-06,
"loss": 0.2773,
"step": 17250
},
{
"epoch": 4.34,
"grad_norm": 292.503662109375,
"learning_rate": 7.981866064710308e-06,
"loss": 0.6283,
"step": 17300
},
{
"epoch": 4.35,
"grad_norm": 0.1264430582523346,
"learning_rate": 7.96418359668924e-06,
"loss": 0.4072,
"step": 17350
},
{
"epoch": 4.36,
"grad_norm": 1.3433716958388686e-05,
"learning_rate": 7.946501128668173e-06,
"loss": 0.8405,
"step": 17400
},
{
"epoch": 4.38,
"grad_norm": 27.759994506835938,
"learning_rate": 7.928818660647103e-06,
"loss": 0.4456,
"step": 17450
},
{
"epoch": 4.39,
"grad_norm": 369.9099426269531,
"learning_rate": 7.911136192626036e-06,
"loss": 0.3382,
"step": 17500
},
{
"epoch": 4.4,
"grad_norm": 6.0055251121521,
"learning_rate": 7.893453724604966e-06,
"loss": 0.329,
"step": 17550
},
{
"epoch": 4.41,
"grad_norm": 0.17973710596561432,
"learning_rate": 7.875771256583897e-06,
"loss": 0.6193,
"step": 17600
},
{
"epoch": 4.43,
"grad_norm": 0.03942597284913063,
"learning_rate": 7.85808878856283e-06,
"loss": 0.3823,
"step": 17650
},
{
"epoch": 4.44,
"grad_norm": 0.0010533991735428572,
"learning_rate": 7.840406320541762e-06,
"loss": 0.6641,
"step": 17700
},
{
"epoch": 4.45,
"grad_norm": 3.6850650531050633e-07,
"learning_rate": 7.822723852520693e-06,
"loss": 0.4148,
"step": 17750
},
{
"epoch": 4.46,
"grad_norm": 5.283959399093874e-05,
"learning_rate": 7.805041384499625e-06,
"loss": 0.799,
"step": 17800
},
{
"epoch": 4.48,
"grad_norm": 0.01196613721549511,
"learning_rate": 7.787358916478556e-06,
"loss": 0.5424,
"step": 17850
},
{
"epoch": 4.49,
"grad_norm": 211.05799865722656,
"learning_rate": 7.769676448457486e-06,
"loss": 0.2341,
"step": 17900
},
{
"epoch": 4.5,
"grad_norm": 8.655371743770957e-07,
"learning_rate": 7.751993980436419e-06,
"loss": 0.5349,
"step": 17950
},
{
"epoch": 4.51,
"grad_norm": 1.5644945408621602e-11,
"learning_rate": 7.734311512415351e-06,
"loss": 0.0804,
"step": 18000
},
{
"epoch": 4.53,
"grad_norm": 0.00036508633638732135,
"learning_rate": 7.716629044394282e-06,
"loss": 0.3295,
"step": 18050
},
{
"epoch": 4.54,
"grad_norm": 1.3209816270357e-13,
"learning_rate": 7.698946576373214e-06,
"loss": 0.3606,
"step": 18100
},
{
"epoch": 4.55,
"grad_norm": 314.8194885253906,
"learning_rate": 7.681264108352145e-06,
"loss": 0.3064,
"step": 18150
},
{
"epoch": 4.56,
"grad_norm": 1.250010797093637e-07,
"learning_rate": 7.663581640331076e-06,
"loss": 0.2967,
"step": 18200
},
{
"epoch": 4.58,
"grad_norm": 1.0573174953460693,
"learning_rate": 7.645899172310008e-06,
"loss": 0.4857,
"step": 18250
},
{
"epoch": 4.59,
"grad_norm": 204.4314727783203,
"learning_rate": 7.628216704288939e-06,
"loss": 0.358,
"step": 18300
},
{
"epoch": 4.6,
"grad_norm": 0.02004345878958702,
"learning_rate": 7.610534236267871e-06,
"loss": 0.753,
"step": 18350
},
{
"epoch": 4.61,
"grad_norm": 302.43280029296875,
"learning_rate": 7.592851768246803e-06,
"loss": 0.3723,
"step": 18400
},
{
"epoch": 4.63,
"grad_norm": 0.0004978284705430269,
"learning_rate": 7.575169300225734e-06,
"loss": 0.4034,
"step": 18450
},
{
"epoch": 4.64,
"grad_norm": 93.66849517822266,
"learning_rate": 7.557486832204665e-06,
"loss": 0.4249,
"step": 18500
},
{
"epoch": 4.65,
"grad_norm": 0.001678618835285306,
"learning_rate": 7.5398043641835965e-06,
"loss": 0.7051,
"step": 18550
},
{
"epoch": 4.67,
"grad_norm": 0.37766626477241516,
"learning_rate": 7.522121896162528e-06,
"loss": 0.2375,
"step": 18600
},
{
"epoch": 4.68,
"grad_norm": 268.7151184082031,
"learning_rate": 7.50443942814146e-06,
"loss": 0.9723,
"step": 18650
},
{
"epoch": 4.69,
"grad_norm": 10.93520450592041,
"learning_rate": 7.486756960120392e-06,
"loss": 0.4446,
"step": 18700
},
{
"epoch": 4.7,
"grad_norm": 0.0002736333408392966,
"learning_rate": 7.4690744920993235e-06,
"loss": 0.5695,
"step": 18750
},
{
"epoch": 4.72,
"grad_norm": 0.006334410980343819,
"learning_rate": 7.451392024078254e-06,
"loss": 0.4816,
"step": 18800
},
{
"epoch": 4.73,
"grad_norm": 2.021748046754368e-10,
"learning_rate": 7.433709556057186e-06,
"loss": 0.6205,
"step": 18850
},
{
"epoch": 4.74,
"grad_norm": 77.61640930175781,
"learning_rate": 7.416027088036117e-06,
"loss": 0.1743,
"step": 18900
},
{
"epoch": 4.75,
"grad_norm": 0.24281173944473267,
"learning_rate": 7.39834462001505e-06,
"loss": 0.3958,
"step": 18950
},
{
"epoch": 4.77,
"grad_norm": 0.0005730040138587356,
"learning_rate": 7.380662151993981e-06,
"loss": 0.2709,
"step": 19000
},
{
"epoch": 4.78,
"grad_norm": 25.074310302734375,
"learning_rate": 7.362979683972912e-06,
"loss": 0.3811,
"step": 19050
},
{
"epoch": 4.79,
"grad_norm": 0.0002688245731405914,
"learning_rate": 7.3452972159518435e-06,
"loss": 0.2937,
"step": 19100
},
{
"epoch": 4.8,
"grad_norm": 6.246182601898909e-05,
"learning_rate": 7.327614747930775e-06,
"loss": 0.2862,
"step": 19150
},
{
"epoch": 4.82,
"grad_norm": 0.000318751554004848,
"learning_rate": 7.309932279909706e-06,
"loss": 0.132,
"step": 19200
},
{
"epoch": 4.83,
"grad_norm": 285.48297119140625,
"learning_rate": 7.292249811888639e-06,
"loss": 0.3126,
"step": 19250
},
{
"epoch": 4.84,
"grad_norm": 214.23065185546875,
"learning_rate": 7.2745673438675705e-06,
"loss": 0.6408,
"step": 19300
},
{
"epoch": 4.85,
"grad_norm": 305.9626159667969,
"learning_rate": 7.256884875846501e-06,
"loss": 0.4605,
"step": 19350
},
{
"epoch": 4.87,
"grad_norm": 4.2915186782011006e-07,
"learning_rate": 7.239202407825433e-06,
"loss": 0.2248,
"step": 19400
},
{
"epoch": 4.88,
"grad_norm": 265.24072265625,
"learning_rate": 7.221519939804364e-06,
"loss": 0.6776,
"step": 19450
},
{
"epoch": 4.89,
"grad_norm": 250.4654083251953,
"learning_rate": 7.203837471783295e-06,
"loss": 0.3709,
"step": 19500
},
{
"epoch": 4.9,
"grad_norm": 0.0005780484061688185,
"learning_rate": 7.186155003762228e-06,
"loss": 0.3521,
"step": 19550
},
{
"epoch": 4.92,
"grad_norm": 8.780172348022461,
"learning_rate": 7.168472535741159e-06,
"loss": 0.3998,
"step": 19600
},
{
"epoch": 4.93,
"grad_norm": 7.643636703491211,
"learning_rate": 7.1507900677200905e-06,
"loss": 0.5537,
"step": 19650
},
{
"epoch": 4.94,
"grad_norm": 0.0002484459837432951,
"learning_rate": 7.133107599699022e-06,
"loss": 0.4808,
"step": 19700
},
{
"epoch": 4.95,
"grad_norm": 0.2631732225418091,
"learning_rate": 7.115425131677953e-06,
"loss": 0.6781,
"step": 19750
},
{
"epoch": 4.97,
"grad_norm": 0.0346391536295414,
"learning_rate": 7.097742663656884e-06,
"loss": 0.1673,
"step": 19800
},
{
"epoch": 4.98,
"grad_norm": 0.006426098290830851,
"learning_rate": 7.0800601956358176e-06,
"loss": 0.2001,
"step": 19850
},
{
"epoch": 4.99,
"grad_norm": 0.070701465010643,
"learning_rate": 7.062377727614748e-06,
"loss": 0.6119,
"step": 19900
},
{
"epoch": 5.0,
"grad_norm": 9.641678479965776e-05,
"learning_rate": 7.04469525959368e-06,
"loss": 0.1432,
"step": 19950
},
{
"epoch": 5.02,
"grad_norm": 80.3648681640625,
"learning_rate": 7.027012791572611e-06,
"loss": 0.2837,
"step": 20000
},
{
"epoch": 5.03,
"grad_norm": 7.515856123063713e-05,
"learning_rate": 7.009330323551543e-06,
"loss": 0.0325,
"step": 20050
},
{
"epoch": 5.04,
"grad_norm": 9.76786541286856e-05,
"learning_rate": 6.9916478555304745e-06,
"loss": 0.28,
"step": 20100
},
{
"epoch": 5.05,
"grad_norm": 0.058834467083215714,
"learning_rate": 6.973965387509406e-06,
"loss": 0.119,
"step": 20150
},
{
"epoch": 5.07,
"grad_norm": 3.0734496116638184,
"learning_rate": 6.9562829194883376e-06,
"loss": 0.1121,
"step": 20200
},
{
"epoch": 5.08,
"grad_norm": 173.53060913085938,
"learning_rate": 6.938600451467269e-06,
"loss": 0.4994,
"step": 20250
},
{
"epoch": 5.09,
"grad_norm": 1.482841071265284e-06,
"learning_rate": 6.920917983446201e-06,
"loss": 0.4273,
"step": 20300
},
{
"epoch": 5.1,
"grad_norm": 0.06339254975318909,
"learning_rate": 6.903235515425132e-06,
"loss": 0.0653,
"step": 20350
},
{
"epoch": 5.12,
"grad_norm": 29.73435401916504,
"learning_rate": 6.885553047404064e-06,
"loss": 0.0064,
"step": 20400
},
{
"epoch": 5.13,
"grad_norm": 0.0535583458840847,
"learning_rate": 6.8678705793829944e-06,
"loss": 0.1328,
"step": 20450
},
{
"epoch": 5.14,
"grad_norm": 0.016700129956007004,
"learning_rate": 6.850188111361927e-06,
"loss": 0.3879,
"step": 20500
},
{
"epoch": 5.15,
"grad_norm": 3.702952017192729e-05,
"learning_rate": 6.832505643340858e-06,
"loss": 0.1604,
"step": 20550
},
{
"epoch": 5.17,
"grad_norm": 0.03472837060689926,
"learning_rate": 6.814823175319789e-06,
"loss": 0.2436,
"step": 20600
},
{
"epoch": 5.18,
"grad_norm": 3.1909748940961435e-05,
"learning_rate": 6.7971407072987215e-06,
"loss": 0.1352,
"step": 20650
},
{
"epoch": 5.19,
"grad_norm": 0.3979862630367279,
"learning_rate": 6.779458239277653e-06,
"loss": 0.1159,
"step": 20700
},
{
"epoch": 5.2,
"grad_norm": 0.0028309274930506945,
"learning_rate": 6.761775771256584e-06,
"loss": 0.2612,
"step": 20750
},
{
"epoch": 5.22,
"grad_norm": 0.7586016654968262,
"learning_rate": 6.744093303235516e-06,
"loss": 0.4589,
"step": 20800
},
{
"epoch": 5.23,
"grad_norm": 0.0062132058665156364,
"learning_rate": 6.726410835214448e-06,
"loss": 0.0843,
"step": 20850
},
{
"epoch": 5.24,
"grad_norm": 0.01292335707694292,
"learning_rate": 6.708728367193378e-06,
"loss": 0.092,
"step": 20900
},
{
"epoch": 5.25,
"grad_norm": 0.0012096440186724067,
"learning_rate": 6.691045899172311e-06,
"loss": 0.1515,
"step": 20950
},
{
"epoch": 5.27,
"grad_norm": 0.003023844677954912,
"learning_rate": 6.673363431151242e-06,
"loss": 0.3177,
"step": 21000
},
{
"epoch": 5.28,
"grad_norm": 106.2956771850586,
"learning_rate": 6.655680963130173e-06,
"loss": 0.0315,
"step": 21050
},
{
"epoch": 5.29,
"grad_norm": 0.0011365425307303667,
"learning_rate": 6.637998495109105e-06,
"loss": 0.0159,
"step": 21100
},
{
"epoch": 5.3,
"grad_norm": 39.502681732177734,
"learning_rate": 6.620316027088036e-06,
"loss": 0.3804,
"step": 21150
},
{
"epoch": 5.32,
"grad_norm": 0.017230931669473648,
"learning_rate": 6.602633559066968e-06,
"loss": 0.0453,
"step": 21200
},
{
"epoch": 5.33,
"grad_norm": 6.043082976248115e-06,
"learning_rate": 6.584951091045899e-06,
"loss": 0.3094,
"step": 21250
},
{
"epoch": 5.34,
"grad_norm": 6.83969769710302e-10,
"learning_rate": 6.567268623024831e-06,
"loss": 0.3382,
"step": 21300
},
{
"epoch": 5.35,
"grad_norm": 1.2151496714234156e-13,
"learning_rate": 6.549586155003762e-06,
"loss": 0.0469,
"step": 21350
},
{
"epoch": 5.37,
"grad_norm": 129.63966369628906,
"learning_rate": 6.531903686982694e-06,
"loss": 0.1542,
"step": 21400
},
{
"epoch": 5.38,
"grad_norm": 8.008062764019996e-07,
"learning_rate": 6.514221218961625e-06,
"loss": 0.1121,
"step": 21450
},
{
"epoch": 5.39,
"grad_norm": 195.3101043701172,
"learning_rate": 6.496538750940557e-06,
"loss": 0.134,
"step": 21500
},
{
"epoch": 5.41,
"grad_norm": 0.44227921962738037,
"learning_rate": 6.4788562829194885e-06,
"loss": 0.1614,
"step": 21550
},
{
"epoch": 5.42,
"grad_norm": 389.9450988769531,
"learning_rate": 6.46117381489842e-06,
"loss": 0.2223,
"step": 21600
},
{
"epoch": 5.43,
"grad_norm": 2.417748987681989e-07,
"learning_rate": 6.443491346877352e-06,
"loss": 0.2297,
"step": 21650
},
{
"epoch": 5.44,
"grad_norm": 0.0011466313153505325,
"learning_rate": 6.425808878856283e-06,
"loss": 0.0367,
"step": 21700
},
{
"epoch": 5.46,
"grad_norm": 0.4562750458717346,
"learning_rate": 6.408126410835215e-06,
"loss": 0.3361,
"step": 21750
},
{
"epoch": 5.47,
"grad_norm": 3.822188591584563e-05,
"learning_rate": 6.390443942814146e-06,
"loss": 0.0979,
"step": 21800
},
{
"epoch": 5.48,
"grad_norm": 100.44294738769531,
"learning_rate": 6.372761474793078e-06,
"loss": 0.0573,
"step": 21850
},
{
"epoch": 5.49,
"grad_norm": 1.8141976397600956e-05,
"learning_rate": 6.355079006772009e-06,
"loss": 0.6044,
"step": 21900
},
{
"epoch": 5.51,
"grad_norm": 2.5538651055034833e-12,
"learning_rate": 6.337396538750941e-06,
"loss": 0.2549,
"step": 21950
},
{
"epoch": 5.52,
"grad_norm": 7.968230164578927e-08,
"learning_rate": 6.319714070729872e-06,
"loss": 0.36,
"step": 22000
},
{
"epoch": 5.53,
"grad_norm": 0.001464845146983862,
"learning_rate": 6.302031602708804e-06,
"loss": 0.0043,
"step": 22050
},
{
"epoch": 5.54,
"grad_norm": 0.5217474102973938,
"learning_rate": 6.2843491346877355e-06,
"loss": 0.3358,
"step": 22100
},
{
"epoch": 5.56,
"grad_norm": 2.1627647583954968e-05,
"learning_rate": 6.266666666666666e-06,
"loss": 0.4962,
"step": 22150
},
{
"epoch": 5.57,
"grad_norm": 0.0039770700968801975,
"learning_rate": 6.248984198645599e-06,
"loss": 0.0886,
"step": 22200
},
{
"epoch": 5.58,
"grad_norm": 0.028452860191464424,
"learning_rate": 6.23130173062453e-06,
"loss": 0.2919,
"step": 22250
},
{
"epoch": 5.59,
"grad_norm": 1.0354268550872803,
"learning_rate": 6.213619262603461e-06,
"loss": 0.1583,
"step": 22300
},
{
"epoch": 5.61,
"grad_norm": 0.0001276719121960923,
"learning_rate": 6.195936794582393e-06,
"loss": 0.1062,
"step": 22350
},
{
"epoch": 5.62,
"grad_norm": 213.48941040039062,
"learning_rate": 6.178254326561325e-06,
"loss": 0.377,
"step": 22400
},
{
"epoch": 5.63,
"grad_norm": 8.587969205109403e-06,
"learning_rate": 6.1605718585402555e-06,
"loss": 0.2043,
"step": 22450
},
{
"epoch": 5.64,
"grad_norm": 0.011805477552115917,
"learning_rate": 6.142889390519188e-06,
"loss": 0.2744,
"step": 22500
},
{
"epoch": 5.66,
"grad_norm": 1.4445524776363072e-08,
"learning_rate": 6.125206922498119e-06,
"loss": 0.0145,
"step": 22550
},
{
"epoch": 5.67,
"grad_norm": 136.72720336914062,
"learning_rate": 6.10752445447705e-06,
"loss": 0.1608,
"step": 22600
},
{
"epoch": 5.68,
"grad_norm": 8.377895937883295e-06,
"learning_rate": 6.0898419864559826e-06,
"loss": 0.1146,
"step": 22650
},
{
"epoch": 5.69,
"grad_norm": 0.0005771568394266069,
"learning_rate": 6.072159518434913e-06,
"loss": 0.3716,
"step": 22700
},
{
"epoch": 5.71,
"grad_norm": 0.0033020416740328074,
"learning_rate": 6.054477050413845e-06,
"loss": 0.1609,
"step": 22750
},
{
"epoch": 5.72,
"grad_norm": 0.014289168640971184,
"learning_rate": 6.036794582392777e-06,
"loss": 0.2873,
"step": 22800
},
{
"epoch": 5.73,
"grad_norm": 433.4857482910156,
"learning_rate": 6.019112114371708e-06,
"loss": 0.2766,
"step": 22850
},
{
"epoch": 5.74,
"grad_norm": 51.506011962890625,
"learning_rate": 6.0014296463506395e-06,
"loss": 0.2557,
"step": 22900
},
{
"epoch": 5.76,
"grad_norm": 2.9865319106647803e-07,
"learning_rate": 5.983747178329572e-06,
"loss": 0.052,
"step": 22950
},
{
"epoch": 5.77,
"grad_norm": 0.0004749756189994514,
"learning_rate": 5.9660647103085026e-06,
"loss": 0.048,
"step": 23000
},
{
"epoch": 5.78,
"grad_norm": 296.063720703125,
"learning_rate": 5.948382242287434e-06,
"loss": 0.1432,
"step": 23050
},
{
"epoch": 5.79,
"grad_norm": 0.002446663100272417,
"learning_rate": 5.9306997742663665e-06,
"loss": 0.3151,
"step": 23100
},
{
"epoch": 5.81,
"grad_norm": 0.012231925502419472,
"learning_rate": 5.913017306245297e-06,
"loss": 0.0295,
"step": 23150
},
{
"epoch": 5.82,
"grad_norm": 0.006459045223891735,
"learning_rate": 5.895334838224229e-06,
"loss": 0.0319,
"step": 23200
},
{
"epoch": 5.83,
"grad_norm": 5.6175377238787405e-08,
"learning_rate": 5.87765237020316e-06,
"loss": 0.1096,
"step": 23250
},
{
"epoch": 5.84,
"grad_norm": 9.727654060043278e-07,
"learning_rate": 5.859969902182092e-06,
"loss": 0.365,
"step": 23300
},
{
"epoch": 5.86,
"grad_norm": 167.01791381835938,
"learning_rate": 5.842287434161023e-06,
"loss": 0.0494,
"step": 23350
},
{
"epoch": 5.87,
"grad_norm": 0.05854243040084839,
"learning_rate": 5.824604966139955e-06,
"loss": 0.0218,
"step": 23400
},
{
"epoch": 5.88,
"grad_norm": 2.8002886676148364e-09,
"learning_rate": 5.8069224981188865e-06,
"loss": 0.0119,
"step": 23450
},
{
"epoch": 5.89,
"grad_norm": 455.8995361328125,
"learning_rate": 5.789240030097818e-06,
"loss": 0.3402,
"step": 23500
},
{
"epoch": 5.91,
"grad_norm": 0.0034980960190296173,
"learning_rate": 5.77155756207675e-06,
"loss": 0.1623,
"step": 23550
},
{
"epoch": 5.92,
"grad_norm": 0.048077382147312164,
"learning_rate": 5.753875094055681e-06,
"loss": 0.5028,
"step": 23600
},
{
"epoch": 5.93,
"grad_norm": 1.1395950317382812,
"learning_rate": 5.736192626034613e-06,
"loss": 0.1841,
"step": 23650
},
{
"epoch": 5.94,
"grad_norm": 3.0090935979387723e-05,
"learning_rate": 5.718510158013544e-06,
"loss": 0.5312,
"step": 23700
},
{
"epoch": 5.96,
"grad_norm": 4.985315626981901e-08,
"learning_rate": 5.700827689992476e-06,
"loss": 0.0867,
"step": 23750
},
{
"epoch": 5.97,
"grad_norm": 0.7515669465065002,
"learning_rate": 5.683145221971407e-06,
"loss": 0.3645,
"step": 23800
},
{
"epoch": 5.98,
"grad_norm": 14.448786735534668,
"learning_rate": 5.665462753950339e-06,
"loss": 0.0975,
"step": 23850
},
{
"epoch": 5.99,
"grad_norm": 0.58511883020401,
"learning_rate": 5.6477802859292704e-06,
"loss": 0.0981,
"step": 23900
},
{
"epoch": 6.01,
"grad_norm": 5.8292873291065916e-05,
"learning_rate": 5.630097817908202e-06,
"loss": 0.2598,
"step": 23950
},
{
"epoch": 6.02,
"grad_norm": 0.03704287111759186,
"learning_rate": 5.6124153498871335e-06,
"loss": 0.1594,
"step": 24000
},
{
"epoch": 6.03,
"grad_norm": 0.0010854690335690975,
"learning_rate": 5.594732881866065e-06,
"loss": 0.2415,
"step": 24050
},
{
"epoch": 6.04,
"grad_norm": 381.2314147949219,
"learning_rate": 5.577050413844996e-06,
"loss": 0.0477,
"step": 24100
},
{
"epoch": 6.06,
"grad_norm": 8.66334667080082e-05,
"learning_rate": 5.559367945823928e-06,
"loss": 0.0424,
"step": 24150
},
{
"epoch": 6.07,
"grad_norm": 0.019515322521328926,
"learning_rate": 5.54168547780286e-06,
"loss": 0.3617,
"step": 24200
},
{
"epoch": 6.08,
"grad_norm": 0.00011614364484557882,
"learning_rate": 5.52400300978179e-06,
"loss": 0.1944,
"step": 24250
},
{
"epoch": 6.09,
"grad_norm": 0.00019373864051885903,
"learning_rate": 5.506320541760723e-06,
"loss": 0.0011,
"step": 24300
},
{
"epoch": 6.11,
"grad_norm": 1.0937032612901021e-08,
"learning_rate": 5.488638073739654e-06,
"loss": 0.0014,
"step": 24350
},
{
"epoch": 6.12,
"grad_norm": 2.1784097691945198e-13,
"learning_rate": 5.470955605718585e-06,
"loss": 0.0055,
"step": 24400
},
{
"epoch": 6.13,
"grad_norm": 0.01839843951165676,
"learning_rate": 5.4532731376975175e-06,
"loss": 0.0042,
"step": 24450
},
{
"epoch": 6.14,
"grad_norm": 4.981990930907898e-10,
"learning_rate": 5.435590669676449e-06,
"loss": 0.103,
"step": 24500
},
{
"epoch": 6.16,
"grad_norm": 0.0047708419151604176,
"learning_rate": 5.41790820165538e-06,
"loss": 0.0022,
"step": 24550
},
{
"epoch": 6.17,
"grad_norm": 0.003085497999563813,
"learning_rate": 5.400225733634312e-06,
"loss": 0.0021,
"step": 24600
},
{
"epoch": 6.18,
"grad_norm": 6.570710642250788e-11,
"learning_rate": 5.382543265613244e-06,
"loss": 0.2051,
"step": 24650
},
{
"epoch": 6.2,
"grad_norm": 0.0029285515192896128,
"learning_rate": 5.364860797592174e-06,
"loss": 0.0012,
"step": 24700
},
{
"epoch": 6.21,
"grad_norm": 3.4288578376617806e-07,
"learning_rate": 5.347178329571107e-06,
"loss": 0.0001,
"step": 24750
},
{
"epoch": 6.22,
"grad_norm": 0.00539399404078722,
"learning_rate": 5.3294958615500375e-06,
"loss": 0.2899,
"step": 24800
},
{
"epoch": 6.23,
"grad_norm": 2.6356909188507416e-07,
"learning_rate": 5.311813393528969e-06,
"loss": 0.0019,
"step": 24850
},
{
"epoch": 6.25,
"grad_norm": 0.019658172503113747,
"learning_rate": 5.294130925507901e-06,
"loss": 0.1133,
"step": 24900
},
{
"epoch": 6.26,
"grad_norm": 4.7282670834203344e-11,
"learning_rate": 5.276448457486832e-06,
"loss": 0.0001,
"step": 24950
},
{
"epoch": 6.27,
"grad_norm": 1.2473710739868693e-06,
"learning_rate": 5.258765989465764e-06,
"loss": 0.1143,
"step": 25000
},
{
"epoch": 6.28,
"grad_norm": 0.38085153698921204,
"learning_rate": 5.241083521444696e-06,
"loss": 0.059,
"step": 25050
},
{
"epoch": 6.3,
"grad_norm": 5.584224224090576,
"learning_rate": 5.223401053423627e-06,
"loss": 0.0833,
"step": 25100
},
{
"epoch": 6.31,
"grad_norm": 9.337106348539237e-06,
"learning_rate": 5.205718585402558e-06,
"loss": 0.0876,
"step": 25150
},
{
"epoch": 6.32,
"grad_norm": 4.118080099146937e-08,
"learning_rate": 5.188036117381491e-06,
"loss": 0.0703,
"step": 25200
},
{
"epoch": 6.33,
"grad_norm": 1.8987177554663504e-06,
"learning_rate": 5.170353649360421e-06,
"loss": 0.0625,
"step": 25250
},
{
"epoch": 6.35,
"grad_norm": 4.3221673462490173e-10,
"learning_rate": 5.152671181339353e-06,
"loss": 0.0284,
"step": 25300
},
{
"epoch": 6.36,
"grad_norm": 0.000691065622959286,
"learning_rate": 5.134988713318285e-06,
"loss": 0.0422,
"step": 25350
},
{
"epoch": 6.37,
"grad_norm": 0.00046700576785951853,
"learning_rate": 5.117306245297216e-06,
"loss": 0.0001,
"step": 25400
},
{
"epoch": 6.38,
"grad_norm": 0.008938438259065151,
"learning_rate": 5.099623777276148e-06,
"loss": 0.0141,
"step": 25450
},
{
"epoch": 6.4,
"grad_norm": 0.16503383219242096,
"learning_rate": 5.081941309255079e-06,
"loss": 0.0646,
"step": 25500
},
{
"epoch": 6.41,
"grad_norm": 8.952581993071362e-06,
"learning_rate": 5.064258841234011e-06,
"loss": 0.036,
"step": 25550
},
{
"epoch": 6.42,
"grad_norm": 0.014195716008543968,
"learning_rate": 5.046576373212942e-06,
"loss": 0.0005,
"step": 25600
},
{
"epoch": 6.43,
"grad_norm": 0.00028850819217041135,
"learning_rate": 5.028893905191874e-06,
"loss": 0.0754,
"step": 25650
},
{
"epoch": 6.45,
"grad_norm": 0.00020963407587260008,
"learning_rate": 5.011211437170805e-06,
"loss": 0.0003,
"step": 25700
},
{
"epoch": 6.46,
"grad_norm": 0.0010497659677639604,
"learning_rate": 4.993528969149737e-06,
"loss": 0.6013,
"step": 25750
},
{
"epoch": 6.47,
"grad_norm": 1.387237716699019e-06,
"learning_rate": 4.975846501128668e-06,
"loss": 0.005,
"step": 25800
},
{
"epoch": 6.48,
"grad_norm": 1.8294354958925396e-05,
"learning_rate": 4.9581640331076e-06,
"loss": 0.0,
"step": 25850
},
{
"epoch": 6.5,
"grad_norm": 4.903622539131902e-06,
"learning_rate": 4.9404815650865315e-06,
"loss": 0.0003,
"step": 25900
},
{
"epoch": 6.51,
"grad_norm": 0.000930552021600306,
"learning_rate": 4.922799097065463e-06,
"loss": 0.0464,
"step": 25950
},
{
"epoch": 6.52,
"grad_norm": 2.9821951102348976e-05,
"learning_rate": 4.905116629044395e-06,
"loss": 0.0854,
"step": 26000
},
{
"epoch": 6.53,
"grad_norm": 0.19266781210899353,
"learning_rate": 4.887434161023326e-06,
"loss": 0.1578,
"step": 26050
},
{
"epoch": 6.55,
"grad_norm": 6.610630862269318e-06,
"learning_rate": 4.869751693002258e-06,
"loss": 0.0004,
"step": 26100
},
{
"epoch": 6.56,
"grad_norm": 6.910874503773812e-07,
"learning_rate": 4.852069224981189e-06,
"loss": 0.0013,
"step": 26150
},
{
"epoch": 6.57,
"grad_norm": 0.00030907560721971095,
"learning_rate": 4.834386756960121e-06,
"loss": 0.0005,
"step": 26200
},
{
"epoch": 6.58,
"grad_norm": 1.2135699112292286e-09,
"learning_rate": 4.816704288939052e-06,
"loss": 0.1581,
"step": 26250
},
{
"epoch": 6.6,
"grad_norm": 8.979808626463637e-06,
"learning_rate": 4.799021820917984e-06,
"loss": 0.1339,
"step": 26300
},
{
"epoch": 6.61,
"grad_norm": 8.109305053949356e-05,
"learning_rate": 4.781339352896915e-06,
"loss": 0.1607,
"step": 26350
},
{
"epoch": 6.62,
"grad_norm": 0.11362000554800034,
"learning_rate": 4.763656884875847e-06,
"loss": 0.0152,
"step": 26400
},
{
"epoch": 6.63,
"grad_norm": 3.168620969518088e-05,
"learning_rate": 4.7459744168547785e-06,
"loss": 0.062,
"step": 26450
},
{
"epoch": 6.65,
"grad_norm": 2.37572979927063,
"learning_rate": 4.728291948833709e-06,
"loss": 0.0001,
"step": 26500
},
{
"epoch": 6.66,
"grad_norm": 1.1477128509795875e-06,
"learning_rate": 4.710609480812642e-06,
"loss": 0.2304,
"step": 26550
},
{
"epoch": 6.67,
"grad_norm": 3.561492079029449e-08,
"learning_rate": 4.692927012791573e-06,
"loss": 0.1046,
"step": 26600
},
{
"epoch": 6.68,
"grad_norm": 1.6958483457565308,
"learning_rate": 4.675244544770504e-06,
"loss": 0.0273,
"step": 26650
},
{
"epoch": 6.7,
"grad_norm": 6.609186675632372e-05,
"learning_rate": 4.657562076749436e-06,
"loss": 0.0504,
"step": 26700
},
{
"epoch": 6.71,
"grad_norm": 0.02066265046596527,
"learning_rate": 4.639879608728368e-06,
"loss": 0.0845,
"step": 26750
},
{
"epoch": 6.72,
"grad_norm": 0.6868598461151123,
"learning_rate": 4.6221971407072985e-06,
"loss": 0.064,
"step": 26800
},
{
"epoch": 6.73,
"grad_norm": 4.525861463378078e-09,
"learning_rate": 4.604514672686231e-06,
"loss": 0.0372,
"step": 26850
},
{
"epoch": 6.75,
"grad_norm": 0.0018904170719906688,
"learning_rate": 4.5868322046651625e-06,
"loss": 0.171,
"step": 26900
},
{
"epoch": 6.76,
"grad_norm": 0.06831281632184982,
"learning_rate": 4.569149736644093e-06,
"loss": 0.0005,
"step": 26950
},
{
"epoch": 6.77,
"grad_norm": 2.7328371288604103e-05,
"learning_rate": 4.5514672686230256e-06,
"loss": 0.0834,
"step": 27000
},
{
"epoch": 6.78,
"grad_norm": 1.312251782792373e-07,
"learning_rate": 4.533784800601956e-06,
"loss": 0.0009,
"step": 27050
},
{
"epoch": 6.8,
"grad_norm": 0.006464004050940275,
"learning_rate": 4.516102332580888e-06,
"loss": 0.1302,
"step": 27100
},
{
"epoch": 6.81,
"grad_norm": 4.0537888601477334e-09,
"learning_rate": 4.49841986455982e-06,
"loss": 0.1255,
"step": 27150
},
{
"epoch": 6.82,
"grad_norm": 0.0004817073349840939,
"learning_rate": 4.480737396538751e-06,
"loss": 0.001,
"step": 27200
},
{
"epoch": 6.83,
"grad_norm": 0.014918695203959942,
"learning_rate": 4.4630549285176825e-06,
"loss": 0.0019,
"step": 27250
},
{
"epoch": 6.85,
"grad_norm": 6.75780752420712e-17,
"learning_rate": 4.445372460496614e-06,
"loss": 0.0179,
"step": 27300
},
{
"epoch": 6.86,
"grad_norm": 382.1897888183594,
"learning_rate": 4.4276899924755456e-06,
"loss": 0.0396,
"step": 27350
},
{
"epoch": 6.87,
"grad_norm": 0.30687054991722107,
"learning_rate": 4.410007524454477e-06,
"loss": 0.0576,
"step": 27400
},
{
"epoch": 6.88,
"grad_norm": 1.2169127785455203e-06,
"learning_rate": 4.392325056433409e-06,
"loss": 0.0002,
"step": 27450
},
{
"epoch": 6.9,
"grad_norm": 6.928129077377054e-12,
"learning_rate": 4.37464258841234e-06,
"loss": 0.0989,
"step": 27500
},
{
"epoch": 6.91,
"grad_norm": 7.992535522305388e-10,
"learning_rate": 4.356960120391272e-06,
"loss": 0.0014,
"step": 27550
},
{
"epoch": 6.92,
"grad_norm": 0.001016330672428012,
"learning_rate": 4.339277652370203e-06,
"loss": 0.0796,
"step": 27600
},
{
"epoch": 6.94,
"grad_norm": 9.33817503323553e-08,
"learning_rate": 4.321595184349135e-06,
"loss": 0.0031,
"step": 27650
},
{
"epoch": 6.95,
"grad_norm": 3.0769423120524664e-10,
"learning_rate": 4.303912716328066e-06,
"loss": 0.0482,
"step": 27700
},
{
"epoch": 6.96,
"grad_norm": 5.2930868577050205e-09,
"learning_rate": 4.286230248306998e-06,
"loss": 0.0241,
"step": 27750
},
{
"epoch": 6.97,
"grad_norm": 2.738467628660146e-05,
"learning_rate": 4.2685477802859295e-06,
"loss": 0.0094,
"step": 27800
},
{
"epoch": 6.99,
"grad_norm": 1.259439272871532e-06,
"learning_rate": 4.250865312264861e-06,
"loss": 0.0011,
"step": 27850
},
{
"epoch": 7.0,
"grad_norm": 433.7135925292969,
"learning_rate": 4.233182844243792e-06,
"loss": 0.4265,
"step": 27900
},
{
"epoch": 7.01,
"grad_norm": 0.000105952778540086,
"learning_rate": 4.215500376222724e-06,
"loss": 0.0048,
"step": 27950
},
{
"epoch": 7.02,
"grad_norm": 0.2630611062049866,
"learning_rate": 4.197817908201656e-06,
"loss": 0.083,
"step": 28000
},
{
"epoch": 7.04,
"grad_norm": 1.2784289252221193e-11,
"learning_rate": 4.180135440180586e-06,
"loss": 0.0003,
"step": 28050
},
{
"epoch": 7.05,
"grad_norm": 4.8076164577137703e-11,
"learning_rate": 4.162452972159519e-06,
"loss": 0.0,
"step": 28100
},
{
"epoch": 7.06,
"grad_norm": 2.940306558230077e-07,
"learning_rate": 4.14477050413845e-06,
"loss": 0.0001,
"step": 28150
},
{
"epoch": 7.07,
"grad_norm": 4.1964653064496815e-05,
"learning_rate": 4.127088036117381e-06,
"loss": 0.0005,
"step": 28200
},
{
"epoch": 7.09,
"grad_norm": 0.005852025002241135,
"learning_rate": 4.1094055680963134e-06,
"loss": 0.0049,
"step": 28250
},
{
"epoch": 7.1,
"grad_norm": 0.05330043286085129,
"learning_rate": 4.091723100075245e-06,
"loss": 0.0,
"step": 28300
},
{
"epoch": 7.11,
"grad_norm": 2.5323606323013337e-08,
"learning_rate": 4.074040632054176e-06,
"loss": 0.0001,
"step": 28350
},
{
"epoch": 7.12,
"grad_norm": 0.004866173956543207,
"learning_rate": 4.056358164033108e-06,
"loss": 0.0002,
"step": 28400
},
{
"epoch": 7.14,
"grad_norm": 1.1348839645819453e-09,
"learning_rate": 4.038675696012039e-06,
"loss": 0.0361,
"step": 28450
},
{
"epoch": 7.15,
"grad_norm": 4.0626005102240015e-06,
"learning_rate": 4.02099322799097e-06,
"loss": 0.0006,
"step": 28500
},
{
"epoch": 7.16,
"grad_norm": 1.4158376870909706e-07,
"learning_rate": 4.003310759969903e-06,
"loss": 0.0,
"step": 28550
},
{
"epoch": 7.17,
"grad_norm": 3.5035823202633765e-06,
"learning_rate": 3.9856282919488334e-06,
"loss": 0.0,
"step": 28600
},
{
"epoch": 7.19,
"grad_norm": 7.668052421649918e-05,
"learning_rate": 3.967945823927765e-06,
"loss": 0.1484,
"step": 28650
},
{
"epoch": 7.2,
"grad_norm": 0.0006498922011815012,
"learning_rate": 3.950263355906697e-06,
"loss": 0.0,
"step": 28700
},
{
"epoch": 7.21,
"grad_norm": 1.2344708920863923e-05,
"learning_rate": 3.932580887885628e-06,
"loss": 0.0001,
"step": 28750
},
{
"epoch": 7.22,
"grad_norm": 4.231491038808599e-05,
"learning_rate": 3.91489841986456e-06,
"loss": 0.0001,
"step": 28800
},
{
"epoch": 7.24,
"grad_norm": 0.008648673072457314,
"learning_rate": 3.897215951843492e-06,
"loss": 0.0,
"step": 28850
},
{
"epoch": 7.25,
"grad_norm": 0.0010539034847170115,
"learning_rate": 3.879533483822423e-06,
"loss": 0.0,
"step": 28900
},
{
"epoch": 7.26,
"grad_norm": 5.991931902826764e-05,
"learning_rate": 3.861851015801354e-06,
"loss": 0.0001,
"step": 28950
},
{
"epoch": 7.27,
"grad_norm": 0.017336919903755188,
"learning_rate": 3.844168547780287e-06,
"loss": 0.0206,
"step": 29000
},
{
"epoch": 7.29,
"grad_norm": 0.0004083296225871891,
"learning_rate": 3.826486079759217e-06,
"loss": 0.0002,
"step": 29050
},
{
"epoch": 7.3,
"grad_norm": 9.027652740478516,
"learning_rate": 3.808803611738149e-06,
"loss": 0.0067,
"step": 29100
},
{
"epoch": 7.31,
"grad_norm": 0.0003242001694161445,
"learning_rate": 3.791121143717081e-06,
"loss": 0.0,
"step": 29150
},
{
"epoch": 7.32,
"grad_norm": 2.259884604427498e-05,
"learning_rate": 3.773438675696012e-06,
"loss": 0.0002,
"step": 29200
},
{
"epoch": 7.34,
"grad_norm": 9.495877265930176,
"learning_rate": 3.7557562076749436e-06,
"loss": 0.0002,
"step": 29250
},
{
"epoch": 7.35,
"grad_norm": 0.0059493957087397575,
"learning_rate": 3.7380737396538755e-06,
"loss": 0.0,
"step": 29300
},
{
"epoch": 7.36,
"grad_norm": 0.004485088866204023,
"learning_rate": 3.7203912716328067e-06,
"loss": 0.0,
"step": 29350
},
{
"epoch": 7.37,
"grad_norm": 8.322012309412881e-15,
"learning_rate": 3.702708803611738e-06,
"loss": 0.0018,
"step": 29400
},
{
"epoch": 7.39,
"grad_norm": 0.0009153097053058445,
"learning_rate": 3.68502633559067e-06,
"loss": 0.0001,
"step": 29450
},
{
"epoch": 7.4,
"grad_norm": 2.3616248654434457e-05,
"learning_rate": 3.6673438675696013e-06,
"loss": 0.1638,
"step": 29500
},
{
"epoch": 7.41,
"grad_norm": 0.0017722542397677898,
"learning_rate": 3.6496613995485324e-06,
"loss": 0.0123,
"step": 29550
},
{
"epoch": 7.42,
"grad_norm": 0.06969759613275528,
"learning_rate": 3.631978931527465e-06,
"loss": 0.0155,
"step": 29600
},
{
"epoch": 7.44,
"grad_norm": 1.5746809367556125e-06,
"learning_rate": 3.614296463506396e-06,
"loss": 0.0001,
"step": 29650
},
{
"epoch": 7.45,
"grad_norm": 3.0426802744010217e-10,
"learning_rate": 3.596613995485327e-06,
"loss": 0.0,
"step": 29700
},
{
"epoch": 7.46,
"grad_norm": 0.5712952017784119,
"learning_rate": 3.578931527464259e-06,
"loss": 0.1067,
"step": 29750
},
{
"epoch": 7.47,
"grad_norm": 0.766385555267334,
"learning_rate": 3.5612490594431906e-06,
"loss": 0.0107,
"step": 29800
},
{
"epoch": 7.49,
"grad_norm": 0.05696748197078705,
"learning_rate": 3.5435665914221217e-06,
"loss": 0.0013,
"step": 29850
},
{
"epoch": 7.5,
"grad_norm": 7.25884137864341e-06,
"learning_rate": 3.5258841234010537e-06,
"loss": 0.0,
"step": 29900
},
{
"epoch": 7.51,
"grad_norm": 1.7060403479263186e-05,
"learning_rate": 3.5082016553799852e-06,
"loss": 0.0002,
"step": 29950
},
{
"epoch": 7.52,
"grad_norm": 0.012671858072280884,
"learning_rate": 3.4905191873589168e-06,
"loss": 0.0,
"step": 30000
},
{
"epoch": 7.54,
"grad_norm": 2.8193007928223324e-09,
"learning_rate": 3.472836719337848e-06,
"loss": 0.0001,
"step": 30050
},
{
"epoch": 7.55,
"grad_norm": 0.019155049696564674,
"learning_rate": 3.4551542513167795e-06,
"loss": 0.0,
"step": 30100
},
{
"epoch": 7.56,
"grad_norm": 0.0020516354124993086,
"learning_rate": 3.4374717832957114e-06,
"loss": 0.0002,
"step": 30150
},
{
"epoch": 7.57,
"grad_norm": 2.4088294594548643e-05,
"learning_rate": 3.4197893152746425e-06,
"loss": 0.0492,
"step": 30200
},
{
"epoch": 7.59,
"grad_norm": 1.9164204786648043e-05,
"learning_rate": 3.402106847253574e-06,
"loss": 0.0312,
"step": 30250
},
{
"epoch": 7.6,
"grad_norm": 0.0160346832126379,
"learning_rate": 3.384424379232506e-06,
"loss": 0.0007,
"step": 30300
},
{
"epoch": 7.61,
"grad_norm": 7.57160614739405e-06,
"learning_rate": 3.366741911211437e-06,
"loss": 0.0005,
"step": 30350
},
{
"epoch": 7.62,
"grad_norm": 1.1699286504851525e-11,
"learning_rate": 3.3490594431903687e-06,
"loss": 0.0027,
"step": 30400
},
{
"epoch": 7.64,
"grad_norm": 1.0412069286758197e-06,
"learning_rate": 3.3313769751693003e-06,
"loss": 0.0011,
"step": 30450
},
{
"epoch": 7.65,
"grad_norm": 1.0678839998945477e-06,
"learning_rate": 3.313694507148232e-06,
"loss": 0.0,
"step": 30500
},
{
"epoch": 7.66,
"grad_norm": 2.537229315535683e-09,
"learning_rate": 3.2960120391271634e-06,
"loss": 0.0,
"step": 30550
},
{
"epoch": 7.67,
"grad_norm": 8.23991967990878e-07,
"learning_rate": 3.278329571106095e-06,
"loss": 0.0001,
"step": 30600
},
{
"epoch": 7.69,
"grad_norm": 0.0006322423578239977,
"learning_rate": 3.2606471030850265e-06,
"loss": 0.0001,
"step": 30650
},
{
"epoch": 7.7,
"grad_norm": 1.3688865863059618e-07,
"learning_rate": 3.242964635063958e-06,
"loss": 0.062,
"step": 30700
},
{
"epoch": 7.71,
"grad_norm": 2.41971292780363e-06,
"learning_rate": 3.2252821670428896e-06,
"loss": 0.1235,
"step": 30750
},
{
"epoch": 7.73,
"grad_norm": 2.4634087480990274e-07,
"learning_rate": 3.207599699021821e-06,
"loss": 0.0035,
"step": 30800
},
{
"epoch": 7.74,
"grad_norm": 3.1068152566149365e-06,
"learning_rate": 3.1899172310007527e-06,
"loss": 0.0205,
"step": 30850
},
{
"epoch": 7.75,
"grad_norm": 5.763430177552209e-09,
"learning_rate": 3.1722347629796842e-06,
"loss": 0.0,
"step": 30900
},
{
"epoch": 7.76,
"grad_norm": 0.008364195004105568,
"learning_rate": 3.1545522949586153e-06,
"loss": 0.0009,
"step": 30950
},
{
"epoch": 7.78,
"grad_norm": 0.00012845598394051194,
"learning_rate": 3.1368698269375473e-06,
"loss": 0.0008,
"step": 31000
},
{
"epoch": 7.79,
"grad_norm": 0.001842482597567141,
"learning_rate": 3.119187358916479e-06,
"loss": 0.0007,
"step": 31050
},
{
"epoch": 7.8,
"grad_norm": 1.2641396263113336e-10,
"learning_rate": 3.10150489089541e-06,
"loss": 0.0019,
"step": 31100
},
{
"epoch": 7.81,
"grad_norm": 0.00033131783129647374,
"learning_rate": 3.083822422874342e-06,
"loss": 0.0002,
"step": 31150
},
{
"epoch": 7.83,
"grad_norm": 1.851675369834993e-05,
"learning_rate": 3.0661399548532735e-06,
"loss": 0.0009,
"step": 31200
},
{
"epoch": 7.84,
"grad_norm": 0.00795644149184227,
"learning_rate": 3.0484574868322046e-06,
"loss": 0.077,
"step": 31250
},
{
"epoch": 7.85,
"grad_norm": 0.07745194435119629,
"learning_rate": 3.030775018811136e-06,
"loss": 0.0001,
"step": 31300
},
{
"epoch": 7.86,
"grad_norm": 0.10175588726997375,
"learning_rate": 3.013092550790068e-06,
"loss": 0.0307,
"step": 31350
},
{
"epoch": 7.88,
"grad_norm": 8.556443935958669e-05,
"learning_rate": 2.9954100827689993e-06,
"loss": 0.0,
"step": 31400
},
{
"epoch": 7.89,
"grad_norm": 0.9275371432304382,
"learning_rate": 2.977727614747931e-06,
"loss": 0.1478,
"step": 31450
},
{
"epoch": 7.9,
"grad_norm": 5.1567803360796916e-09,
"learning_rate": 2.960045146726863e-06,
"loss": 0.0598,
"step": 31500
},
{
"epoch": 7.91,
"grad_norm": 6.67710139623523e-07,
"learning_rate": 2.942362678705794e-06,
"loss": 0.0094,
"step": 31550
},
{
"epoch": 7.93,
"grad_norm": 1.458290155298414e-10,
"learning_rate": 2.9246802106847255e-06,
"loss": 0.0009,
"step": 31600
},
{
"epoch": 7.94,
"grad_norm": 5.1869348681066185e-05,
"learning_rate": 2.906997742663657e-06,
"loss": 0.0007,
"step": 31650
},
{
"epoch": 7.95,
"grad_norm": 0.00036754223401658237,
"learning_rate": 2.8893152746425886e-06,
"loss": 0.1282,
"step": 31700
},
{
"epoch": 7.96,
"grad_norm": 0.0028616636991500854,
"learning_rate": 2.87163280662152e-06,
"loss": 0.1516,
"step": 31750
},
{
"epoch": 7.98,
"grad_norm": 0.0008008142467588186,
"learning_rate": 2.8539503386004512e-06,
"loss": 0.0004,
"step": 31800
},
{
"epoch": 7.99,
"grad_norm": 1.0718519405372717e-07,
"learning_rate": 2.8362678705793832e-06,
"loss": 0.0,
"step": 31850
},
{
"epoch": 8.0,
"grad_norm": 0.0009103859774768353,
"learning_rate": 2.8185854025583148e-06,
"loss": 0.0001,
"step": 31900
},
{
"epoch": 8.01,
"grad_norm": 0.0001856798044173047,
"learning_rate": 2.800902934537246e-06,
"loss": 0.0,
"step": 31950
},
{
"epoch": 8.03,
"grad_norm": 0.00011591133807087317,
"learning_rate": 2.7832204665161774e-06,
"loss": 0.0001,
"step": 32000
},
{
"epoch": 8.04,
"grad_norm": 0.00040982267819345,
"learning_rate": 2.7655379984951094e-06,
"loss": 0.0,
"step": 32050
},
{
"epoch": 8.05,
"grad_norm": 2.265534648770995e-09,
"learning_rate": 2.7478555304740405e-06,
"loss": 0.0001,
"step": 32100
},
{
"epoch": 8.06,
"grad_norm": 5.858885425424898e-13,
"learning_rate": 2.730173062452972e-06,
"loss": 0.0001,
"step": 32150
},
{
"epoch": 8.08,
"grad_norm": 2.8236866932730136e-18,
"learning_rate": 2.712490594431904e-06,
"loss": 0.0,
"step": 32200
},
{
"epoch": 8.09,
"grad_norm": 0.0001981940004043281,
"learning_rate": 2.694808126410835e-06,
"loss": 0.0,
"step": 32250
},
{
"epoch": 8.1,
"grad_norm": 3.2661256511856696e-12,
"learning_rate": 2.6771256583897667e-06,
"loss": 0.0,
"step": 32300
},
{
"epoch": 8.11,
"grad_norm": 1.1293546776869334e-05,
"learning_rate": 2.6594431903686983e-06,
"loss": 0.0,
"step": 32350
},
{
"epoch": 8.13,
"grad_norm": 0.0003391726640984416,
"learning_rate": 2.64176072234763e-06,
"loss": 0.0,
"step": 32400
},
{
"epoch": 8.14,
"grad_norm": 6.486132042482495e-05,
"learning_rate": 2.6240782543265614e-06,
"loss": 0.0,
"step": 32450
},
{
"epoch": 8.15,
"grad_norm": 2.1309777366695926e-05,
"learning_rate": 2.606395786305493e-06,
"loss": 0.0,
"step": 32500
},
{
"epoch": 8.16,
"grad_norm": 4.4795211806558655e-07,
"learning_rate": 2.5887133182844245e-06,
"loss": 0.0007,
"step": 32550
},
{
"epoch": 8.18,
"grad_norm": 2.0528705402256264e-09,
"learning_rate": 2.571030850263356e-06,
"loss": 0.0,
"step": 32600
},
{
"epoch": 8.19,
"grad_norm": 4.783522308571264e-05,
"learning_rate": 2.5533483822422876e-06,
"loss": 0.0,
"step": 32650
},
{
"epoch": 8.2,
"grad_norm": 1.7800081408836377e-08,
"learning_rate": 2.535665914221219e-06,
"loss": 0.0,
"step": 32700
},
{
"epoch": 8.21,
"grad_norm": 0.0003143524518236518,
"learning_rate": 2.5179834462001507e-06,
"loss": 0.0003,
"step": 32750
},
{
"epoch": 8.23,
"grad_norm": 9.409014455741271e-05,
"learning_rate": 2.500300978179082e-06,
"loss": 0.0,
"step": 32800
},
{
"epoch": 8.24,
"grad_norm": 3.097814449404268e-09,
"learning_rate": 2.4826185101580133e-06,
"loss": 0.0,
"step": 32850
},
{
"epoch": 8.25,
"grad_norm": 6.660656595158798e-07,
"learning_rate": 2.4649360421369453e-06,
"loss": 0.0,
"step": 32900
},
{
"epoch": 8.26,
"grad_norm": 0.04804990068078041,
"learning_rate": 2.447253574115877e-06,
"loss": 0.0,
"step": 32950
},
{
"epoch": 8.28,
"grad_norm": 3.926641234386352e-09,
"learning_rate": 2.429571106094808e-06,
"loss": 0.0,
"step": 33000
},
{
"epoch": 8.29,
"grad_norm": 1.5834859022183257e-20,
"learning_rate": 2.4118886380737395e-06,
"loss": 0.0001,
"step": 33050
},
{
"epoch": 8.3,
"grad_norm": 0.20250500738620758,
"learning_rate": 2.3942061700526715e-06,
"loss": 0.0004,
"step": 33100
},
{
"epoch": 8.31,
"grad_norm": 5.932114959250612e-07,
"learning_rate": 2.3765237020316026e-06,
"loss": 0.0001,
"step": 33150
},
{
"epoch": 8.33,
"grad_norm": 1.5223192498248217e-11,
"learning_rate": 2.358841234010534e-06,
"loss": 0.0,
"step": 33200
},
{
"epoch": 8.34,
"grad_norm": 1.2739813826101454e-07,
"learning_rate": 2.341158765989466e-06,
"loss": 0.0,
"step": 33250
},
{
"epoch": 8.35,
"grad_norm": 1.2789546310898459e-08,
"learning_rate": 2.3234762979683973e-06,
"loss": 0.0001,
"step": 33300
},
{
"epoch": 8.36,
"grad_norm": 1.4692803233629093e-05,
"learning_rate": 2.305793829947329e-06,
"loss": 0.0,
"step": 33350
},
{
"epoch": 8.38,
"grad_norm": 0.00019242956477683038,
"learning_rate": 2.2881113619262604e-06,
"loss": 0.0403,
"step": 33400
},
{
"epoch": 8.39,
"grad_norm": 0.0,
"learning_rate": 2.270428893905192e-06,
"loss": 0.0,
"step": 33450
},
{
"epoch": 8.4,
"grad_norm": 0.002393543953076005,
"learning_rate": 2.2527464258841235e-06,
"loss": 0.0399,
"step": 33500
},
{
"epoch": 8.41,
"grad_norm": 1.7551202802223997e-07,
"learning_rate": 2.235063957863055e-06,
"loss": 0.0,
"step": 33550
},
{
"epoch": 8.43,
"grad_norm": 2.735872639547665e-11,
"learning_rate": 2.2173814898419866e-06,
"loss": 0.0004,
"step": 33600
},
{
"epoch": 8.44,
"grad_norm": 0.0003994428552687168,
"learning_rate": 2.199699021820918e-06,
"loss": 0.0,
"step": 33650
},
{
"epoch": 8.45,
"grad_norm": 2.7801218032836914,
"learning_rate": 2.1820165537998497e-06,
"loss": 0.0,
"step": 33700
},
{
"epoch": 8.47,
"grad_norm": 1.100529516406823e-06,
"learning_rate": 2.164334085778781e-06,
"loss": 0.0,
"step": 33750
},
{
"epoch": 8.48,
"grad_norm": 0.02319416031241417,
"learning_rate": 2.1466516177577128e-06,
"loss": 0.0,
"step": 33800
},
{
"epoch": 8.49,
"grad_norm": 0.0017326247179880738,
"learning_rate": 2.1289691497366443e-06,
"loss": 0.0,
"step": 33850
},
{
"epoch": 8.5,
"grad_norm": 1.8130524859216735e-10,
"learning_rate": 2.1112866817155754e-06,
"loss": 0.0001,
"step": 33900
},
{
"epoch": 8.52,
"grad_norm": 0.0017156396061182022,
"learning_rate": 2.0936042136945074e-06,
"loss": 0.0004,
"step": 33950
},
{
"epoch": 8.53,
"grad_norm": 0.0018524077022448182,
"learning_rate": 2.075921745673439e-06,
"loss": 0.0,
"step": 34000
},
{
"epoch": 8.54,
"grad_norm": 1.1194772923772689e-05,
"learning_rate": 2.05823927765237e-06,
"loss": 0.0,
"step": 34050
},
{
"epoch": 8.55,
"grad_norm": 7.453370471921517e-06,
"learning_rate": 2.040556809631302e-06,
"loss": 0.0164,
"step": 34100
},
{
"epoch": 8.57,
"grad_norm": 6.412294029090049e-10,
"learning_rate": 2.0228743416102336e-06,
"loss": 0.0,
"step": 34150
},
{
"epoch": 8.58,
"grad_norm": 1.4134855689861236e-17,
"learning_rate": 2.0051918735891647e-06,
"loss": 0.0,
"step": 34200
},
{
"epoch": 8.59,
"grad_norm": 0.00038817909080535173,
"learning_rate": 1.9875094055680963e-06,
"loss": 0.0365,
"step": 34250
},
{
"epoch": 8.6,
"grad_norm": 2.2248328605201095e-05,
"learning_rate": 1.9698269375470282e-06,
"loss": 0.0,
"step": 34300
},
{
"epoch": 8.62,
"grad_norm": 0.010381842032074928,
"learning_rate": 1.9521444695259594e-06,
"loss": 0.0,
"step": 34350
},
{
"epoch": 8.63,
"grad_norm": 0.001246288768015802,
"learning_rate": 1.934462001504891e-06,
"loss": 0.0,
"step": 34400
},
{
"epoch": 8.64,
"grad_norm": 1.8006402254104614,
"learning_rate": 1.916779533483823e-06,
"loss": 0.0007,
"step": 34450
},
{
"epoch": 8.65,
"grad_norm": 1.644072100681626e-10,
"learning_rate": 1.899097065462754e-06,
"loss": 0.0,
"step": 34500
},
{
"epoch": 8.67,
"grad_norm": 5.652666779099036e-09,
"learning_rate": 1.8814145974416856e-06,
"loss": 0.0,
"step": 34550
},
{
"epoch": 8.68,
"grad_norm": 0.003141549648717046,
"learning_rate": 1.8637321294206173e-06,
"loss": 0.0,
"step": 34600
},
{
"epoch": 8.69,
"grad_norm": 1.1486420135042863e-06,
"learning_rate": 1.8460496613995484e-06,
"loss": 0.0003,
"step": 34650
},
{
"epoch": 8.7,
"grad_norm": 1.1713603271346074e-05,
"learning_rate": 1.8283671933784802e-06,
"loss": 0.0,
"step": 34700
},
{
"epoch": 8.72,
"grad_norm": 1.2204428685436142e-06,
"learning_rate": 1.8106847253574115e-06,
"loss": 0.0,
"step": 34750
},
{
"epoch": 8.73,
"grad_norm": 0.0014657212886959314,
"learning_rate": 1.793002257336343e-06,
"loss": 0.0,
"step": 34800
},
{
"epoch": 8.74,
"grad_norm": 0.017868679016828537,
"learning_rate": 1.7753197893152748e-06,
"loss": 0.0,
"step": 34850
},
{
"epoch": 8.75,
"grad_norm": 3.3499613891763147e-06,
"learning_rate": 1.7576373212942062e-06,
"loss": 0.0001,
"step": 34900
},
{
"epoch": 8.77,
"grad_norm": 6.1278524476904295e-09,
"learning_rate": 1.7399548532731377e-06,
"loss": 0.0001,
"step": 34950
},
{
"epoch": 8.78,
"grad_norm": 1.445396605959104e-06,
"learning_rate": 1.7222723852520693e-06,
"loss": 0.0001,
"step": 35000
},
{
"epoch": 8.79,
"grad_norm": 0.0017798148328438401,
"learning_rate": 1.7045899172310008e-06,
"loss": 0.1651,
"step": 35050
},
{
"epoch": 8.8,
"grad_norm": 3.6833380789857983e-08,
"learning_rate": 1.6869074492099324e-06,
"loss": 0.0,
"step": 35100
},
{
"epoch": 8.82,
"grad_norm": 9.361156988463293e-11,
"learning_rate": 1.669224981188864e-06,
"loss": 0.0,
"step": 35150
},
{
"epoch": 8.83,
"grad_norm": 7.828115933250501e-09,
"learning_rate": 1.6515425131677955e-06,
"loss": 0.0,
"step": 35200
},
{
"epoch": 8.84,
"grad_norm": 1.020300643972405e-08,
"learning_rate": 1.6338600451467268e-06,
"loss": 0.0,
"step": 35250
},
{
"epoch": 8.85,
"grad_norm": 0.000530413759406656,
"learning_rate": 1.6161775771256586e-06,
"loss": 0.0,
"step": 35300
},
{
"epoch": 8.87,
"grad_norm": 8.391751182834639e-10,
"learning_rate": 1.59849510910459e-06,
"loss": 0.0,
"step": 35350
},
{
"epoch": 8.88,
"grad_norm": 0.003899802453815937,
"learning_rate": 1.5808126410835214e-06,
"loss": 0.0,
"step": 35400
},
{
"epoch": 8.89,
"grad_norm": 2.3727285224595107e-05,
"learning_rate": 1.5631301730624532e-06,
"loss": 0.0,
"step": 35450
},
{
"epoch": 8.9,
"grad_norm": 4.068557245773263e-06,
"learning_rate": 1.5454477050413845e-06,
"loss": 0.0,
"step": 35500
},
{
"epoch": 8.92,
"grad_norm": 0.007758264895528555,
"learning_rate": 1.527765237020316e-06,
"loss": 0.0,
"step": 35550
},
{
"epoch": 8.93,
"grad_norm": 0.00016868404054548591,
"learning_rate": 1.5100827689992474e-06,
"loss": 0.0,
"step": 35600
},
{
"epoch": 8.94,
"grad_norm": 0.00011449763405835256,
"learning_rate": 1.4924003009781792e-06,
"loss": 0.0,
"step": 35650
},
{
"epoch": 8.95,
"grad_norm": 4.054548298881855e-06,
"learning_rate": 1.4747178329571107e-06,
"loss": 0.0,
"step": 35700
},
{
"epoch": 8.97,
"grad_norm": 0.0010476693278178573,
"learning_rate": 1.457035364936042e-06,
"loss": 0.0001,
"step": 35750
},
{
"epoch": 8.98,
"grad_norm": 0.06502784043550491,
"learning_rate": 1.4393528969149738e-06,
"loss": 0.0,
"step": 35800
},
{
"epoch": 8.99,
"grad_norm": 2.2866407789479126e-07,
"learning_rate": 1.4216704288939052e-06,
"loss": 0.0,
"step": 35850
},
{
"epoch": 9.0,
"grad_norm": 0.00021389636094681919,
"learning_rate": 1.4039879608728367e-06,
"loss": 0.0002,
"step": 35900
},
{
"epoch": 9.02,
"grad_norm": 1.3870979032049036e-08,
"learning_rate": 1.3863054928517683e-06,
"loss": 0.0,
"step": 35950
},
{
"epoch": 9.03,
"grad_norm": 6.817894586674811e-07,
"learning_rate": 1.3686230248306998e-06,
"loss": 0.0,
"step": 36000
},
{
"epoch": 9.04,
"grad_norm": 6.899564031215277e-09,
"learning_rate": 1.3509405568096314e-06,
"loss": 0.0,
"step": 36050
},
{
"epoch": 9.05,
"grad_norm": 5.222953859629342e-06,
"learning_rate": 1.333258088788563e-06,
"loss": 0.0,
"step": 36100
},
{
"epoch": 9.07,
"grad_norm": 1.425233087104516e-08,
"learning_rate": 1.3155756207674945e-06,
"loss": 0.0,
"step": 36150
},
{
"epoch": 9.08,
"grad_norm": 0.001089599565602839,
"learning_rate": 1.2978931527464258e-06,
"loss": 0.0,
"step": 36200
},
{
"epoch": 9.09,
"grad_norm": 5.556981932386407e-07,
"learning_rate": 1.2802106847253576e-06,
"loss": 0.0,
"step": 36250
},
{
"epoch": 9.1,
"grad_norm": 3.3812255423981696e-05,
"learning_rate": 1.2625282167042889e-06,
"loss": 0.0,
"step": 36300
},
{
"epoch": 9.12,
"grad_norm": 0.008249111473560333,
"learning_rate": 1.2448457486832204e-06,
"loss": 0.0,
"step": 36350
},
{
"epoch": 9.13,
"grad_norm": 1.3336196388991084e-05,
"learning_rate": 1.2271632806621522e-06,
"loss": 0.0,
"step": 36400
},
{
"epoch": 9.14,
"grad_norm": 7.238022403655009e-10,
"learning_rate": 1.2094808126410835e-06,
"loss": 0.0,
"step": 36450
},
{
"epoch": 9.15,
"grad_norm": 1.9307069831775436e-11,
"learning_rate": 1.191798344620015e-06,
"loss": 0.0,
"step": 36500
},
{
"epoch": 9.17,
"grad_norm": 2.09102075932055e-11,
"learning_rate": 1.1741158765989466e-06,
"loss": 0.0,
"step": 36550
},
{
"epoch": 9.18,
"grad_norm": 0.005663714837282896,
"learning_rate": 1.1564334085778782e-06,
"loss": 0.0,
"step": 36600
},
{
"epoch": 9.19,
"grad_norm": 0.0010162381222471595,
"learning_rate": 1.1387509405568097e-06,
"loss": 0.0,
"step": 36650
},
{
"epoch": 9.2,
"grad_norm": 6.629519339185208e-05,
"learning_rate": 1.1210684725357413e-06,
"loss": 0.0,
"step": 36700
},
{
"epoch": 9.22,
"grad_norm": 3.708991016537766e-06,
"learning_rate": 1.1033860045146728e-06,
"loss": 0.0,
"step": 36750
},
{
"epoch": 9.23,
"grad_norm": 1.2199451703054365e-05,
"learning_rate": 1.0857035364936042e-06,
"loss": 0.0,
"step": 36800
},
{
"epoch": 9.24,
"grad_norm": 3.44480326930352e-07,
"learning_rate": 1.068021068472536e-06,
"loss": 0.0,
"step": 36850
},
{
"epoch": 9.26,
"grad_norm": 4.88109819229976e-09,
"learning_rate": 1.0503386004514673e-06,
"loss": 0.0,
"step": 36900
},
{
"epoch": 9.27,
"grad_norm": 1.233081690088511e-07,
"learning_rate": 1.0326561324303988e-06,
"loss": 0.0,
"step": 36950
},
{
"epoch": 9.28,
"grad_norm": 4.49614967479306e-09,
"learning_rate": 1.0149736644093304e-06,
"loss": 0.0,
"step": 37000
},
{
"epoch": 9.29,
"grad_norm": 1.2748416793328943e-06,
"learning_rate": 9.97291196388262e-07,
"loss": 0.0,
"step": 37050
},
{
"epoch": 9.31,
"grad_norm": 2.0055947869265442e-14,
"learning_rate": 9.796087283671935e-07,
"loss": 0.0,
"step": 37100
},
{
"epoch": 9.32,
"grad_norm": 1.1864563075153988e-15,
"learning_rate": 9.619262603461248e-07,
"loss": 0.0,
"step": 37150
},
{
"epoch": 9.33,
"grad_norm": 3.7789734051330015e-05,
"learning_rate": 9.442437923250566e-07,
"loss": 0.0,
"step": 37200
},
{
"epoch": 9.34,
"grad_norm": 0.2207726538181305,
"learning_rate": 9.26561324303988e-07,
"loss": 0.0,
"step": 37250
},
{
"epoch": 9.36,
"grad_norm": 0.0002287498500663787,
"learning_rate": 9.088788562829194e-07,
"loss": 0.0,
"step": 37300
},
{
"epoch": 9.37,
"grad_norm": 1.4228673350658028e-08,
"learning_rate": 8.911963882618511e-07,
"loss": 0.0,
"step": 37350
},
{
"epoch": 9.38,
"grad_norm": 2.1739325584408233e-16,
"learning_rate": 8.735139202407825e-07,
"loss": 0.0,
"step": 37400
},
{
"epoch": 9.39,
"grad_norm": 1.1177444037002715e-07,
"learning_rate": 8.558314522197141e-07,
"loss": 0.0,
"step": 37450
},
{
"epoch": 9.41,
"grad_norm": 2.286371909576701e-06,
"learning_rate": 8.381489841986456e-07,
"loss": 0.0,
"step": 37500
},
{
"epoch": 9.42,
"grad_norm": 0.0007677926332689822,
"learning_rate": 8.204665161775772e-07,
"loss": 0.0,
"step": 37550
},
{
"epoch": 9.43,
"grad_norm": 7.146362435150877e-08,
"learning_rate": 8.027840481565087e-07,
"loss": 0.0,
"step": 37600
},
{
"epoch": 9.44,
"grad_norm": 7.620369160576956e-06,
"learning_rate": 7.851015801354402e-07,
"loss": 0.0,
"step": 37650
},
{
"epoch": 9.46,
"grad_norm": 1.6175413009023032e-07,
"learning_rate": 7.674191121143717e-07,
"loss": 0.0,
"step": 37700
},
{
"epoch": 9.47,
"grad_norm": 1.4307224773801863e-06,
"learning_rate": 7.497366440933033e-07,
"loss": 0.0,
"step": 37750
},
{
"epoch": 9.48,
"grad_norm": 1.4143168414193497e-07,
"learning_rate": 7.320541760722348e-07,
"loss": 0.0,
"step": 37800
},
{
"epoch": 9.49,
"grad_norm": 5.554405676719276e-13,
"learning_rate": 7.143717080511664e-07,
"loss": 0.0,
"step": 37850
},
{
"epoch": 9.51,
"grad_norm": 3.3866279225414075e-10,
"learning_rate": 6.966892400300979e-07,
"loss": 0.0941,
"step": 37900
},
{
"epoch": 9.52,
"grad_norm": 6.048647804846041e-08,
"learning_rate": 6.790067720090294e-07,
"loss": 0.0,
"step": 37950
},
{
"epoch": 9.53,
"grad_norm": 2.0648124632316467e-07,
"learning_rate": 6.613243039879609e-07,
"loss": 0.0,
"step": 38000
},
{
"epoch": 9.54,
"grad_norm": 0.0016063437797129154,
"learning_rate": 6.436418359668924e-07,
"loss": 0.0,
"step": 38050
},
{
"epoch": 9.56,
"grad_norm": 1.247152141559127e-07,
"learning_rate": 6.259593679458239e-07,
"loss": 0.0,
"step": 38100
},
{
"epoch": 9.57,
"grad_norm": 8.07224120880079e-10,
"learning_rate": 6.082768999247555e-07,
"loss": 0.0,
"step": 38150
},
{
"epoch": 9.58,
"grad_norm": 7.635571320184498e-13,
"learning_rate": 5.90594431903687e-07,
"loss": 0.0,
"step": 38200
},
{
"epoch": 9.59,
"grad_norm": 4.792551688836966e-09,
"learning_rate": 5.729119638826185e-07,
"loss": 0.0,
"step": 38250
},
{
"epoch": 9.61,
"grad_norm": 3.3811686535045737e-06,
"learning_rate": 5.552294958615501e-07,
"loss": 0.0,
"step": 38300
},
{
"epoch": 9.62,
"grad_norm": 1.0496427338413383e-10,
"learning_rate": 5.375470278404815e-07,
"loss": 0.0,
"step": 38350
},
{
"epoch": 9.63,
"grad_norm": 0.000780309725087136,
"learning_rate": 5.198645598194131e-07,
"loss": 0.0,
"step": 38400
},
{
"epoch": 9.64,
"grad_norm": 4.170356078248005e-06,
"learning_rate": 5.021820917983446e-07,
"loss": 0.0,
"step": 38450
},
{
"epoch": 9.66,
"grad_norm": 0.004391836933791637,
"learning_rate": 4.844996237772762e-07,
"loss": 0.0,
"step": 38500
},
{
"epoch": 9.67,
"grad_norm": 6.341772859741468e-06,
"learning_rate": 4.668171557562077e-07,
"loss": 0.0,
"step": 38550
},
{
"epoch": 9.68,
"grad_norm": 0.006547071970999241,
"learning_rate": 4.4913468773513927e-07,
"loss": 0.0,
"step": 38600
},
{
"epoch": 9.69,
"grad_norm": 4.025184352940414e-06,
"learning_rate": 4.3145221971407076e-07,
"loss": 0.0,
"step": 38650
},
{
"epoch": 9.71,
"grad_norm": 0.007137050852179527,
"learning_rate": 4.1376975169300226e-07,
"loss": 0.0,
"step": 38700
},
{
"epoch": 9.72,
"grad_norm": 2.464033421745171e-10,
"learning_rate": 3.960872836719338e-07,
"loss": 0.0,
"step": 38750
},
{
"epoch": 9.73,
"grad_norm": 1.651005368330516e-05,
"learning_rate": 3.784048156508653e-07,
"loss": 0.0,
"step": 38800
},
{
"epoch": 9.74,
"grad_norm": 8.712972184021783e-12,
"learning_rate": 3.6072234762979685e-07,
"loss": 0.0,
"step": 38850
},
{
"epoch": 9.76,
"grad_norm": 0.00013669347390532494,
"learning_rate": 3.430398796087284e-07,
"loss": 0.0,
"step": 38900
},
{
"epoch": 9.77,
"grad_norm": 2.920106635428965e-05,
"learning_rate": 3.253574115876599e-07,
"loss": 0.0,
"step": 38950
},
{
"epoch": 9.78,
"grad_norm": 3.793598768453421e-09,
"learning_rate": 3.0767494356659144e-07,
"loss": 0.0,
"step": 39000
},
{
"epoch": 9.79,
"grad_norm": 1.0695604402144454e-09,
"learning_rate": 2.89992475545523e-07,
"loss": 0.0,
"step": 39050
},
{
"epoch": 9.81,
"grad_norm": 1.2106751764691062e-16,
"learning_rate": 2.723100075244545e-07,
"loss": 0.0,
"step": 39100
},
{
"epoch": 9.82,
"grad_norm": 3.6123870472692943e-07,
"learning_rate": 2.54627539503386e-07,
"loss": 0.0,
"step": 39150
},
{
"epoch": 9.83,
"grad_norm": 3.495557336918864e-07,
"learning_rate": 2.3694507148231756e-07,
"loss": 0.0,
"step": 39200
},
{
"epoch": 9.84,
"grad_norm": 4.76532950415276e-05,
"learning_rate": 2.1926260346124908e-07,
"loss": 0.0,
"step": 39250
},
{
"epoch": 9.86,
"grad_norm": 8.871047612046823e-05,
"learning_rate": 2.015801354401806e-07,
"loss": 0.0,
"step": 39300
}
],
"logging_steps": 50,
"max_steps": 39870,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 50,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}