{ "best_metric": 0.9373365167161658, "best_model_checkpoint": "vivit-surf-analytics-runpod/checkpoint-11115", "epoch": 15.001349527665317, "eval_steps": 500, "global_step": 11116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006747638326585695, "grad_norm": 32.80915451049805, "learning_rate": 3.373819163292848e-07, "loss": 1.5069, "step": 10 }, { "epoch": 0.001349527665317139, "grad_norm": 22.916248321533203, "learning_rate": 6.747638326585696e-07, "loss": 1.354, "step": 20 }, { "epoch": 0.0020242914979757085, "grad_norm": 34.89827346801758, "learning_rate": 1.0121457489878542e-06, "loss": 1.5187, "step": 30 }, { "epoch": 0.002699055330634278, "grad_norm": 22.8042049407959, "learning_rate": 1.3495276653171391e-06, "loss": 1.478, "step": 40 }, { "epoch": 0.0033738191632928477, "grad_norm": 27.662748336791992, "learning_rate": 1.6869095816464238e-06, "loss": 1.2862, "step": 50 }, { "epoch": 0.004048582995951417, "grad_norm": 24.901159286499023, "learning_rate": 2.0242914979757085e-06, "loss": 1.2586, "step": 60 }, { "epoch": 0.004723346828609987, "grad_norm": 25.327184677124023, "learning_rate": 2.3616734143049934e-06, "loss": 1.2728, "step": 70 }, { "epoch": 0.005398110661268556, "grad_norm": 18.19566535949707, "learning_rate": 2.6990553306342783e-06, "loss": 1.0159, "step": 80 }, { "epoch": 0.006072874493927126, "grad_norm": 20.370386123657227, "learning_rate": 3.0364372469635627e-06, "loss": 1.2504, "step": 90 }, { "epoch": 0.006747638326585695, "grad_norm": 12.196557998657227, "learning_rate": 3.3738191632928476e-06, "loss": 2.0246, "step": 100 }, { "epoch": 0.007422402159244264, "grad_norm": 12.822103500366211, "learning_rate": 3.711201079622133e-06, "loss": 0.8519, "step": 110 }, { "epoch": 0.008097165991902834, "grad_norm": 6.872288227081299, "learning_rate": 4.048582995951417e-06, "loss": 0.7749, "step": 120 }, { "epoch": 0.008771929824561403, "grad_norm": 40.45072937011719, "learning_rate": 4.3859649122807014e-06, "loss": 1.3494, "step": 130 }, { "epoch": 0.009446693657219974, "grad_norm": 3.996568441390991, "learning_rate": 4.723346828609987e-06, "loss": 0.9678, "step": 140 }, { "epoch": 0.010121457489878543, "grad_norm": 2.117781400680542, "learning_rate": 5.060728744939271e-06, "loss": 1.8095, "step": 150 }, { "epoch": 0.010796221322537112, "grad_norm": 1.1970853805541992, "learning_rate": 5.3981106612685565e-06, "loss": 1.3044, "step": 160 }, { "epoch": 0.011470985155195682, "grad_norm": 56.31877136230469, "learning_rate": 5.735492577597841e-06, "loss": 3.0015, "step": 170 }, { "epoch": 0.012145748987854251, "grad_norm": 1.2758257389068604, "learning_rate": 6.0728744939271254e-06, "loss": 1.7654, "step": 180 }, { "epoch": 0.01282051282051282, "grad_norm": 49.485626220703125, "learning_rate": 6.41025641025641e-06, "loss": 1.9578, "step": 190 }, { "epoch": 0.01349527665317139, "grad_norm": 1.0414538383483887, "learning_rate": 6.747638326585695e-06, "loss": 2.0202, "step": 200 }, { "epoch": 0.01417004048582996, "grad_norm": 46.221031188964844, "learning_rate": 7.0850202429149805e-06, "loss": 2.0222, "step": 210 }, { "epoch": 0.014844804318488529, "grad_norm": 5.171656131744385, "learning_rate": 7.422402159244266e-06, "loss": 2.3988, "step": 220 }, { "epoch": 0.0155195681511471, "grad_norm": 40.51677703857422, "learning_rate": 7.75978407557355e-06, "loss": 1.1011, "step": 230 }, { "epoch": 0.016194331983805668, "grad_norm": 0.3821451961994171, "learning_rate": 8.097165991902834e-06, "loss": 1.7582, "step": 240 }, { "epoch": 0.016869095816464237, "grad_norm": 56.244895935058594, "learning_rate": 8.43454790823212e-06, "loss": 1.628, "step": 250 }, { "epoch": 0.017543859649122806, "grad_norm": 2.9704697132110596, "learning_rate": 8.771929824561403e-06, "loss": 2.4795, "step": 260 }, { "epoch": 0.018218623481781375, "grad_norm": 2.420311689376831, "learning_rate": 9.109311740890689e-06, "loss": 1.2225, "step": 270 }, { "epoch": 0.018893387314439947, "grad_norm": 3.02461314201355, "learning_rate": 9.446693657219973e-06, "loss": 1.813, "step": 280 }, { "epoch": 0.019568151147098516, "grad_norm": 1.8302630186080933, "learning_rate": 9.784075573549258e-06, "loss": 1.6011, "step": 290 }, { "epoch": 0.020242914979757085, "grad_norm": 62.22663497924805, "learning_rate": 1.0121457489878542e-05, "loss": 2.1712, "step": 300 }, { "epoch": 0.020917678812415654, "grad_norm": 4.137598037719727, "learning_rate": 1.0458839406207829e-05, "loss": 1.691, "step": 310 }, { "epoch": 0.021592442645074223, "grad_norm": 1.1848357915878296, "learning_rate": 1.0796221322537113e-05, "loss": 0.8611, "step": 320 }, { "epoch": 0.022267206477732792, "grad_norm": 48.48101043701172, "learning_rate": 1.1133603238866398e-05, "loss": 2.4268, "step": 330 }, { "epoch": 0.022941970310391364, "grad_norm": 1.995662808418274, "learning_rate": 1.1470985155195682e-05, "loss": 1.6822, "step": 340 }, { "epoch": 0.023616734143049933, "grad_norm": 4.30789041519165, "learning_rate": 1.1808367071524966e-05, "loss": 1.5158, "step": 350 }, { "epoch": 0.024291497975708502, "grad_norm": 2.7685494422912598, "learning_rate": 1.2145748987854251e-05, "loss": 0.9964, "step": 360 }, { "epoch": 0.02496626180836707, "grad_norm": 47.719268798828125, "learning_rate": 1.2483130904183535e-05, "loss": 2.2256, "step": 370 }, { "epoch": 0.02564102564102564, "grad_norm": 48.7852783203125, "learning_rate": 1.282051282051282e-05, "loss": 1.7263, "step": 380 }, { "epoch": 0.02631578947368421, "grad_norm": 57.43927001953125, "learning_rate": 1.3157894736842106e-05, "loss": 1.9673, "step": 390 }, { "epoch": 0.02699055330634278, "grad_norm": 44.42695617675781, "learning_rate": 1.349527665317139e-05, "loss": 1.6821, "step": 400 }, { "epoch": 0.02766531713900135, "grad_norm": 26.7230167388916, "learning_rate": 1.3832658569500675e-05, "loss": 1.4562, "step": 410 }, { "epoch": 0.02834008097165992, "grad_norm": 39.75962448120117, "learning_rate": 1.4170040485829961e-05, "loss": 0.7446, "step": 420 }, { "epoch": 0.029014844804318488, "grad_norm": 45.954254150390625, "learning_rate": 1.4507422402159246e-05, "loss": 1.1802, "step": 430 }, { "epoch": 0.029689608636977057, "grad_norm": 25.454557418823242, "learning_rate": 1.4844804318488532e-05, "loss": 1.1458, "step": 440 }, { "epoch": 0.030364372469635626, "grad_norm": 39.98874282836914, "learning_rate": 1.5182186234817813e-05, "loss": 0.512, "step": 450 }, { "epoch": 0.0310391363022942, "grad_norm": 22.448896408081055, "learning_rate": 1.55195681511471e-05, "loss": 1.5049, "step": 460 }, { "epoch": 0.03171390013495277, "grad_norm": 26.93549346923828, "learning_rate": 1.5856950067476383e-05, "loss": 0.7132, "step": 470 }, { "epoch": 0.032388663967611336, "grad_norm": 21.29535675048828, "learning_rate": 1.6194331983805668e-05, "loss": 1.3895, "step": 480 }, { "epoch": 0.033063427800269905, "grad_norm": 0.4730716645717621, "learning_rate": 1.6531713900134956e-05, "loss": 0.1323, "step": 490 }, { "epoch": 0.033738191632928474, "grad_norm": 6.3616862297058105, "learning_rate": 1.686909581646424e-05, "loss": 0.9272, "step": 500 }, { "epoch": 0.03441295546558704, "grad_norm": 0.5018609762191772, "learning_rate": 1.720647773279352e-05, "loss": 1.3049, "step": 510 }, { "epoch": 0.03508771929824561, "grad_norm": 62.65403747558594, "learning_rate": 1.7543859649122806e-05, "loss": 2.8324, "step": 520 }, { "epoch": 0.03576248313090418, "grad_norm": 12.991227149963379, "learning_rate": 1.7881241565452094e-05, "loss": 0.5474, "step": 530 }, { "epoch": 0.03643724696356275, "grad_norm": 0.5492640733718872, "learning_rate": 1.8218623481781378e-05, "loss": 1.2093, "step": 540 }, { "epoch": 0.037112010796221326, "grad_norm": 0.07551870495080948, "learning_rate": 1.8556005398110663e-05, "loss": 1.2592, "step": 550 }, { "epoch": 0.037786774628879895, "grad_norm": 1.5378609895706177, "learning_rate": 1.8893387314439947e-05, "loss": 0.3359, "step": 560 }, { "epoch": 0.038461538461538464, "grad_norm": 0.23666121065616608, "learning_rate": 1.923076923076923e-05, "loss": 1.3151, "step": 570 }, { "epoch": 0.03913630229419703, "grad_norm": 7.869609832763672, "learning_rate": 1.9568151147098516e-05, "loss": 0.6125, "step": 580 }, { "epoch": 0.0398110661268556, "grad_norm": 13.923602104187012, "learning_rate": 1.99055330634278e-05, "loss": 1.1772, "step": 590 }, { "epoch": 0.04048582995951417, "grad_norm": 29.88282585144043, "learning_rate": 2.0242914979757085e-05, "loss": 0.1969, "step": 600 }, { "epoch": 0.04116059379217274, "grad_norm": 24.888872146606445, "learning_rate": 2.058029689608637e-05, "loss": 0.7039, "step": 610 }, { "epoch": 0.04183535762483131, "grad_norm": 0.40080440044403076, "learning_rate": 2.0917678812415657e-05, "loss": 1.008, "step": 620 }, { "epoch": 0.04251012145748988, "grad_norm": 4.866868495941162, "learning_rate": 2.125506072874494e-05, "loss": 1.0502, "step": 630 }, { "epoch": 0.043184885290148446, "grad_norm": 1.2502915859222412, "learning_rate": 2.1592442645074226e-05, "loss": 0.7422, "step": 640 }, { "epoch": 0.043859649122807015, "grad_norm": 0.6650214791297913, "learning_rate": 2.1929824561403507e-05, "loss": 0.6072, "step": 650 }, { "epoch": 0.044534412955465584, "grad_norm": 16.356952667236328, "learning_rate": 2.2267206477732795e-05, "loss": 0.8198, "step": 660 }, { "epoch": 0.04520917678812416, "grad_norm": 169.0858154296875, "learning_rate": 2.260458839406208e-05, "loss": 1.2371, "step": 670 }, { "epoch": 0.04588394062078273, "grad_norm": 98.78671264648438, "learning_rate": 2.2941970310391364e-05, "loss": 1.1303, "step": 680 }, { "epoch": 0.0465587044534413, "grad_norm": 99.31029510498047, "learning_rate": 2.327935222672065e-05, "loss": 0.7183, "step": 690 }, { "epoch": 0.04723346828609987, "grad_norm": 30.58230209350586, "learning_rate": 2.3616734143049933e-05, "loss": 1.1701, "step": 700 }, { "epoch": 0.047908232118758436, "grad_norm": 0.05228818207979202, "learning_rate": 2.395411605937922e-05, "loss": 0.1544, "step": 710 }, { "epoch": 0.048582995951417005, "grad_norm": 1.974684715270996, "learning_rate": 2.4291497975708502e-05, "loss": 1.4774, "step": 720 }, { "epoch": 0.049257759784075573, "grad_norm": 0.12068396061658859, "learning_rate": 2.4628879892037786e-05, "loss": 0.6994, "step": 730 }, { "epoch": 0.04993252361673414, "grad_norm": 76.24126434326172, "learning_rate": 2.496626180836707e-05, "loss": 1.3533, "step": 740 }, { "epoch": 0.05, "eval_accuracy": 0.8392857142857143, "eval_f1": 0.8382276099228692, "eval_loss": 0.7030884623527527, "eval_runtime": 74.2993, "eval_samples_per_second": 1.507, "eval_steps_per_second": 1.507, "step": 741 }, { "epoch": 1.0006072874493928, "grad_norm": 0.14856617152690887, "learning_rate": 2.530364372469636e-05, "loss": 0.4158, "step": 750 }, { "epoch": 1.0012820512820513, "grad_norm": 0.07458806782960892, "learning_rate": 2.564102564102564e-05, "loss": 0.0042, "step": 760 }, { "epoch": 1.0019568151147098, "grad_norm": 0.08816417306661606, "learning_rate": 2.5978407557354928e-05, "loss": 0.0175, "step": 770 }, { "epoch": 1.0026315789473683, "grad_norm": 0.07340700924396515, "learning_rate": 2.6315789473684212e-05, "loss": 0.0039, "step": 780 }, { "epoch": 1.003306342780027, "grad_norm": 0.08517087250947952, "learning_rate": 2.66531713900135e-05, "loss": 0.0075, "step": 790 }, { "epoch": 1.0039811066126856, "grad_norm": 0.07905049622058868, "learning_rate": 2.699055330634278e-05, "loss": 0.0021, "step": 800 }, { "epoch": 1.004655870445344, "grad_norm": 0.13749797642230988, "learning_rate": 2.732793522267207e-05, "loss": 0.7603, "step": 810 }, { "epoch": 1.0053306342780026, "grad_norm": 0.04107066988945007, "learning_rate": 2.766531713900135e-05, "loss": 0.0033, "step": 820 }, { "epoch": 1.0060053981106614, "grad_norm": 0.05302370712161064, "learning_rate": 2.8002699055330634e-05, "loss": 0.0487, "step": 830 }, { "epoch": 1.0066801619433199, "grad_norm": 0.050035424530506134, "learning_rate": 2.8340080971659922e-05, "loss": 0.0166, "step": 840 }, { "epoch": 1.0073549257759784, "grad_norm": 0.17594772577285767, "learning_rate": 2.8677462887989203e-05, "loss": 0.9013, "step": 850 }, { "epoch": 1.008029689608637, "grad_norm": 5.9323811531066895, "learning_rate": 2.901484480431849e-05, "loss": 0.8083, "step": 860 }, { "epoch": 1.0087044534412954, "grad_norm": 0.2871362566947937, "learning_rate": 2.9352226720647776e-05, "loss": 0.6024, "step": 870 }, { "epoch": 1.0093792172739542, "grad_norm": 0.3136674463748932, "learning_rate": 2.9689608636977063e-05, "loss": 0.5028, "step": 880 }, { "epoch": 1.0100539811066127, "grad_norm": 0.05438687652349472, "learning_rate": 3.0026990553306344e-05, "loss": 0.1368, "step": 890 }, { "epoch": 1.0107287449392712, "grad_norm": 0.05301366746425629, "learning_rate": 3.0364372469635626e-05, "loss": 0.1373, "step": 900 }, { "epoch": 1.0114035087719297, "grad_norm": 0.015999358147382736, "learning_rate": 3.0701754385964913e-05, "loss": 0.0301, "step": 910 }, { "epoch": 1.0120782726045885, "grad_norm": 0.014771537855267525, "learning_rate": 3.10391363022942e-05, "loss": 0.0195, "step": 920 }, { "epoch": 1.012753036437247, "grad_norm": 31.934608459472656, "learning_rate": 3.137651821862348e-05, "loss": 1.6569, "step": 930 }, { "epoch": 1.0134278002699055, "grad_norm": 0.031412914395332336, "learning_rate": 3.171390013495277e-05, "loss": 0.0009, "step": 940 }, { "epoch": 1.014102564102564, "grad_norm": 0.028489330783486366, "learning_rate": 3.205128205128206e-05, "loss": 0.1698, "step": 950 }, { "epoch": 1.0147773279352226, "grad_norm": 105.16389465332031, "learning_rate": 3.2388663967611336e-05, "loss": 0.1704, "step": 960 }, { "epoch": 1.0154520917678813, "grad_norm": 0.024373585358262062, "learning_rate": 3.272604588394062e-05, "loss": 0.6264, "step": 970 }, { "epoch": 1.0161268556005398, "grad_norm": 0.024133900180459023, "learning_rate": 3.306342780026991e-05, "loss": 0.8147, "step": 980 }, { "epoch": 1.0168016194331984, "grad_norm": 92.91847229003906, "learning_rate": 3.340080971659919e-05, "loss": 1.1899, "step": 990 }, { "epoch": 1.0174763832658569, "grad_norm": 0.3631739616394043, "learning_rate": 3.373819163292848e-05, "loss": 1.2713, "step": 1000 }, { "epoch": 1.0181511470985156, "grad_norm": 7.820636749267578, "learning_rate": 3.407557354925776e-05, "loss": 0.5495, "step": 1010 }, { "epoch": 1.0188259109311741, "grad_norm": 55.31717300415039, "learning_rate": 3.441295546558704e-05, "loss": 0.4036, "step": 1020 }, { "epoch": 1.0195006747638327, "grad_norm": 0.013262225314974785, "learning_rate": 3.4750337381916334e-05, "loss": 0.0089, "step": 1030 }, { "epoch": 1.0201754385964912, "grad_norm": 0.8780525922775269, "learning_rate": 3.508771929824561e-05, "loss": 0.3444, "step": 1040 }, { "epoch": 1.0208502024291497, "grad_norm": 2.9044814109802246, "learning_rate": 3.54251012145749e-05, "loss": 0.0081, "step": 1050 }, { "epoch": 1.0215249662618084, "grad_norm": 0.020421041175723076, "learning_rate": 3.576248313090419e-05, "loss": 1.0599, "step": 1060 }, { "epoch": 1.022199730094467, "grad_norm": 0.012594003230333328, "learning_rate": 3.609986504723347e-05, "loss": 0.0616, "step": 1070 }, { "epoch": 1.0228744939271255, "grad_norm": 0.018383637070655823, "learning_rate": 3.6437246963562756e-05, "loss": 0.9516, "step": 1080 }, { "epoch": 1.023549257759784, "grad_norm": 0.04205102473497391, "learning_rate": 3.6774628879892034e-05, "loss": 0.4867, "step": 1090 }, { "epoch": 1.0242240215924427, "grad_norm": 0.022214779630303383, "learning_rate": 3.7112010796221325e-05, "loss": 0.0101, "step": 1100 }, { "epoch": 1.0248987854251013, "grad_norm": 0.026110410690307617, "learning_rate": 3.744939271255061e-05, "loss": 0.3327, "step": 1110 }, { "epoch": 1.0255735492577598, "grad_norm": 0.16947214305400848, "learning_rate": 3.7786774628879894e-05, "loss": 0.6535, "step": 1120 }, { "epoch": 1.0262483130904183, "grad_norm": 0.019961325451731682, "learning_rate": 3.812415654520918e-05, "loss": 0.0014, "step": 1130 }, { "epoch": 1.0269230769230768, "grad_norm": 213.16741943359375, "learning_rate": 3.846153846153846e-05, "loss": 0.348, "step": 1140 }, { "epoch": 1.0275978407557356, "grad_norm": 0.26998648047447205, "learning_rate": 3.879892037786775e-05, "loss": 1.8138, "step": 1150 }, { "epoch": 1.028272604588394, "grad_norm": 16.201974868774414, "learning_rate": 3.913630229419703e-05, "loss": 1.1767, "step": 1160 }, { "epoch": 1.0289473684210526, "grad_norm": 0.46378159523010254, "learning_rate": 3.9473684210526316e-05, "loss": 0.3981, "step": 1170 }, { "epoch": 1.0296221322537111, "grad_norm": 0.16117815673351288, "learning_rate": 3.98110661268556e-05, "loss": 0.6503, "step": 1180 }, { "epoch": 1.0302968960863699, "grad_norm": 0.09139110147953033, "learning_rate": 4.014844804318489e-05, "loss": 0.0009, "step": 1190 }, { "epoch": 1.0309716599190284, "grad_norm": 59.96378707885742, "learning_rate": 4.048582995951417e-05, "loss": 2.7966, "step": 1200 }, { "epoch": 1.031646423751687, "grad_norm": 0.1793028563261032, "learning_rate": 4.082321187584346e-05, "loss": 0.7813, "step": 1210 }, { "epoch": 1.0323211875843454, "grad_norm": 0.04233807325363159, "learning_rate": 4.116059379217274e-05, "loss": 0.0117, "step": 1220 }, { "epoch": 1.032995951417004, "grad_norm": 0.10781926661729813, "learning_rate": 4.149797570850202e-05, "loss": 0.0029, "step": 1230 }, { "epoch": 1.0336707152496627, "grad_norm": 0.04887605831027031, "learning_rate": 4.1835357624831314e-05, "loss": 0.0023, "step": 1240 }, { "epoch": 1.0343454790823212, "grad_norm": 0.0070233517326414585, "learning_rate": 4.217273954116059e-05, "loss": 0.5274, "step": 1250 }, { "epoch": 1.0350202429149797, "grad_norm": 0.009842370636761189, "learning_rate": 4.251012145748988e-05, "loss": 1.1471, "step": 1260 }, { "epoch": 1.0356950067476383, "grad_norm": 117.02069091796875, "learning_rate": 4.284750337381917e-05, "loss": 0.0518, "step": 1270 }, { "epoch": 1.036369770580297, "grad_norm": 0.011584924533963203, "learning_rate": 4.318488529014845e-05, "loss": 0.0088, "step": 1280 }, { "epoch": 1.0370445344129555, "grad_norm": 0.04845478758215904, "learning_rate": 4.3522267206477737e-05, "loss": 0.937, "step": 1290 }, { "epoch": 1.037719298245614, "grad_norm": 12.870345115661621, "learning_rate": 4.3859649122807014e-05, "loss": 0.1796, "step": 1300 }, { "epoch": 1.0383940620782726, "grad_norm": 0.18226304650306702, "learning_rate": 4.4197031039136306e-05, "loss": 0.7725, "step": 1310 }, { "epoch": 1.039068825910931, "grad_norm": 0.038409680128097534, "learning_rate": 4.453441295546559e-05, "loss": 0.5331, "step": 1320 }, { "epoch": 1.0397435897435898, "grad_norm": 1.686890721321106, "learning_rate": 4.4871794871794874e-05, "loss": 0.6265, "step": 1330 }, { "epoch": 1.0404183535762483, "grad_norm": 0.009872148744761944, "learning_rate": 4.520917678812416e-05, "loss": 1.2371, "step": 1340 }, { "epoch": 1.0410931174089069, "grad_norm": 0.016034213826060295, "learning_rate": 4.5546558704453443e-05, "loss": 0.8008, "step": 1350 }, { "epoch": 1.0417678812415654, "grad_norm": 161.0729217529297, "learning_rate": 4.588394062078273e-05, "loss": 1.6563, "step": 1360 }, { "epoch": 1.0424426450742241, "grad_norm": 0.039535123854875565, "learning_rate": 4.622132253711201e-05, "loss": 0.698, "step": 1370 }, { "epoch": 1.0431174089068826, "grad_norm": 0.02719847857952118, "learning_rate": 4.65587044534413e-05, "loss": 0.1234, "step": 1380 }, { "epoch": 1.0437921727395412, "grad_norm": 79.83929443359375, "learning_rate": 4.689608636977058e-05, "loss": 1.275, "step": 1390 }, { "epoch": 1.0444669365721997, "grad_norm": 0.2730661928653717, "learning_rate": 4.7233468286099866e-05, "loss": 0.4828, "step": 1400 }, { "epoch": 1.0451417004048582, "grad_norm": 0.025355026125907898, "learning_rate": 4.757085020242915e-05, "loss": 0.1393, "step": 1410 }, { "epoch": 1.045816464237517, "grad_norm": 8.70992374420166, "learning_rate": 4.790823211875844e-05, "loss": 0.0709, "step": 1420 }, { "epoch": 1.0464912280701755, "grad_norm": 37.11697006225586, "learning_rate": 4.824561403508772e-05, "loss": 2.5881, "step": 1430 }, { "epoch": 1.047165991902834, "grad_norm": 52.71913528442383, "learning_rate": 4.8582995951417004e-05, "loss": 2.0305, "step": 1440 }, { "epoch": 1.0478407557354925, "grad_norm": 1.0200884342193604, "learning_rate": 4.8920377867746295e-05, "loss": 0.0119, "step": 1450 }, { "epoch": 1.0485155195681513, "grad_norm": 0.16433711349964142, "learning_rate": 4.925775978407557e-05, "loss": 1.4951, "step": 1460 }, { "epoch": 1.0491902834008098, "grad_norm": 0.04836498573422432, "learning_rate": 4.9595141700404864e-05, "loss": 0.5514, "step": 1470 }, { "epoch": 1.0498650472334683, "grad_norm": 0.021334873512387276, "learning_rate": 4.993252361673414e-05, "loss": 0.0028, "step": 1480 }, { "epoch": 1.05, "eval_accuracy": 0.8482142857142857, "eval_f1": 0.8460469703429654, "eval_loss": 0.7499637603759766, "eval_runtime": 75.3886, "eval_samples_per_second": 1.486, "eval_steps_per_second": 1.486, "step": 1482 }, { "epoch": 2.000539811066127, "grad_norm": 0.006392825860530138, "learning_rate": 4.9970010496326286e-05, "loss": 0.0052, "step": 1490 }, { "epoch": 2.0012145748987855, "grad_norm": 0.1666487753391266, "learning_rate": 4.993252361673414e-05, "loss": 0.0088, "step": 1500 }, { "epoch": 2.001889338731444, "grad_norm": 0.27092501521110535, "learning_rate": 4.9895036737142004e-05, "loss": 0.0005, "step": 1510 }, { "epoch": 2.0025641025641026, "grad_norm": 0.01244429126381874, "learning_rate": 4.985754985754986e-05, "loss": 1.2941, "step": 1520 }, { "epoch": 2.003238866396761, "grad_norm": 0.07986637949943542, "learning_rate": 4.9820062977957716e-05, "loss": 1.3347, "step": 1530 }, { "epoch": 2.0039136302294196, "grad_norm": 0.010807895101606846, "learning_rate": 4.978257609836557e-05, "loss": 0.4777, "step": 1540 }, { "epoch": 2.004588394062078, "grad_norm": 0.010884225368499756, "learning_rate": 4.9745089218773434e-05, "loss": 1.7184, "step": 1550 }, { "epoch": 2.0052631578947366, "grad_norm": 0.17375628650188446, "learning_rate": 4.970760233918128e-05, "loss": 0.0067, "step": 1560 }, { "epoch": 2.0059379217273956, "grad_norm": 0.006022674031555653, "learning_rate": 4.9670115459589145e-05, "loss": 0.0014, "step": 1570 }, { "epoch": 2.006612685560054, "grad_norm": 0.07748937606811523, "learning_rate": 4.9632628579997e-05, "loss": 0.2612, "step": 1580 }, { "epoch": 2.0072874493927126, "grad_norm": 0.2620987296104431, "learning_rate": 4.9595141700404864e-05, "loss": 0.6517, "step": 1590 }, { "epoch": 2.007962213225371, "grad_norm": 148.11007690429688, "learning_rate": 4.955765482081271e-05, "loss": 0.5783, "step": 1600 }, { "epoch": 2.0086369770580297, "grad_norm": 0.0034163114614784718, "learning_rate": 4.9520167941220575e-05, "loss": 0.0304, "step": 1610 }, { "epoch": 2.009311740890688, "grad_norm": 0.02201319858431816, "learning_rate": 4.948268106162843e-05, "loss": 0.3777, "step": 1620 }, { "epoch": 2.0099865047233467, "grad_norm": 0.01761261560022831, "learning_rate": 4.9445194182036294e-05, "loss": 0.6914, "step": 1630 }, { "epoch": 2.0106612685560052, "grad_norm": 0.02757342904806137, "learning_rate": 4.940770730244414e-05, "loss": 0.001, "step": 1640 }, { "epoch": 2.0113360323886638, "grad_norm": 0.016815010458230972, "learning_rate": 4.9370220422852005e-05, "loss": 0.0006, "step": 1650 }, { "epoch": 2.0120107962213227, "grad_norm": 0.7724957466125488, "learning_rate": 4.933273354325986e-05, "loss": 0.4395, "step": 1660 }, { "epoch": 2.0126855600539812, "grad_norm": 0.003277893178164959, "learning_rate": 4.9295246663667724e-05, "loss": 0.0023, "step": 1670 }, { "epoch": 2.0133603238866398, "grad_norm": 0.010450620204210281, "learning_rate": 4.925775978407557e-05, "loss": 0.0003, "step": 1680 }, { "epoch": 2.0140350877192983, "grad_norm": 0.008632444776594639, "learning_rate": 4.9220272904483435e-05, "loss": 0.0588, "step": 1690 }, { "epoch": 2.014709851551957, "grad_norm": 0.2135269045829773, "learning_rate": 4.918278602489129e-05, "loss": 0.8012, "step": 1700 }, { "epoch": 2.0153846153846153, "grad_norm": 0.006235187407582998, "learning_rate": 4.9145299145299147e-05, "loss": 0.0007, "step": 1710 }, { "epoch": 2.016059379217274, "grad_norm": 0.013167057186365128, "learning_rate": 4.9107812265707e-05, "loss": 0.0004, "step": 1720 }, { "epoch": 2.0167341430499324, "grad_norm": 0.008585361763834953, "learning_rate": 4.9070325386114865e-05, "loss": 0.0006, "step": 1730 }, { "epoch": 2.017408906882591, "grad_norm": 55.19523620605469, "learning_rate": 4.903283850652272e-05, "loss": 0.8423, "step": 1740 }, { "epoch": 2.01808367071525, "grad_norm": 0.005840742029249668, "learning_rate": 4.8995351626930576e-05, "loss": 0.432, "step": 1750 }, { "epoch": 2.0187584345479084, "grad_norm": 0.007270222995430231, "learning_rate": 4.895786474733843e-05, "loss": 0.5094, "step": 1760 }, { "epoch": 2.019433198380567, "grad_norm": 0.013795904815196991, "learning_rate": 4.8920377867746295e-05, "loss": 0.6975, "step": 1770 }, { "epoch": 2.0201079622132254, "grad_norm": 0.44005972146987915, "learning_rate": 4.888289098815415e-05, "loss": 0.0006, "step": 1780 }, { "epoch": 2.020782726045884, "grad_norm": 0.020803041756153107, "learning_rate": 4.8845404108562006e-05, "loss": 0.0003, "step": 1790 }, { "epoch": 2.0214574898785425, "grad_norm": 0.004395525902509689, "learning_rate": 4.880791722896986e-05, "loss": 0.0007, "step": 1800 }, { "epoch": 2.022132253711201, "grad_norm": 0.07428783923387527, "learning_rate": 4.8770430349377725e-05, "loss": 0.0006, "step": 1810 }, { "epoch": 2.0228070175438595, "grad_norm": 0.007445579394698143, "learning_rate": 4.8732943469785574e-05, "loss": 0.0002, "step": 1820 }, { "epoch": 2.023481781376518, "grad_norm": 0.02664661407470703, "learning_rate": 4.8695456590193436e-05, "loss": 0.0002, "step": 1830 }, { "epoch": 2.024156545209177, "grad_norm": 0.08112671971321106, "learning_rate": 4.865796971060129e-05, "loss": 0.0003, "step": 1840 }, { "epoch": 2.0248313090418355, "grad_norm": 0.002486151410266757, "learning_rate": 4.862048283100915e-05, "loss": 0.0018, "step": 1850 }, { "epoch": 2.025506072874494, "grad_norm": 0.00320970406755805, "learning_rate": 4.8582995951417004e-05, "loss": 0.2271, "step": 1860 }, { "epoch": 2.0261808367071525, "grad_norm": 0.1994234174489975, "learning_rate": 4.8545509071824866e-05, "loss": 0.5385, "step": 1870 }, { "epoch": 2.026855600539811, "grad_norm": 0.0024550287052989006, "learning_rate": 4.850802219223272e-05, "loss": 0.0759, "step": 1880 }, { "epoch": 2.0275303643724696, "grad_norm": 0.004535624757409096, "learning_rate": 4.847053531264058e-05, "loss": 0.0006, "step": 1890 }, { "epoch": 2.028205128205128, "grad_norm": 0.07630165666341782, "learning_rate": 4.8433048433048433e-05, "loss": 0.0002, "step": 1900 }, { "epoch": 2.0288798920377866, "grad_norm": 0.005508648231625557, "learning_rate": 4.839556155345629e-05, "loss": 0.0048, "step": 1910 }, { "epoch": 2.029554655870445, "grad_norm": 0.00268650334328413, "learning_rate": 4.835807467386415e-05, "loss": 0.021, "step": 1920 }, { "epoch": 2.030229419703104, "grad_norm": 0.6032857894897461, "learning_rate": 4.832058779427201e-05, "loss": 0.9845, "step": 1930 }, { "epoch": 2.0309041835357626, "grad_norm": 0.0025021624751389027, "learning_rate": 4.828310091467986e-05, "loss": 0.0005, "step": 1940 }, { "epoch": 2.031578947368421, "grad_norm": 0.0031213329639285803, "learning_rate": 4.824561403508772e-05, "loss": 0.1197, "step": 1950 }, { "epoch": 2.0322537112010797, "grad_norm": 0.011701357550919056, "learning_rate": 4.820812715549558e-05, "loss": 0.0004, "step": 1960 }, { "epoch": 2.032928475033738, "grad_norm": 0.002749168314039707, "learning_rate": 4.817064027590343e-05, "loss": 0.0001, "step": 1970 }, { "epoch": 2.0336032388663967, "grad_norm": 0.003767299233004451, "learning_rate": 4.813315339631129e-05, "loss": 0.0002, "step": 1980 }, { "epoch": 2.0342780026990552, "grad_norm": 0.005788211710751057, "learning_rate": 4.809566651671915e-05, "loss": 0.0012, "step": 1990 }, { "epoch": 2.0349527665317138, "grad_norm": 329.865966796875, "learning_rate": 4.805817963712701e-05, "loss": 1.4817, "step": 2000 }, { "epoch": 2.0356275303643723, "grad_norm": 0.011975220404565334, "learning_rate": 4.802069275753486e-05, "loss": 1.3289, "step": 2010 }, { "epoch": 2.0363022941970312, "grad_norm": 0.0021649515256285667, "learning_rate": 4.798320587794272e-05, "loss": 0.0055, "step": 2020 }, { "epoch": 2.0369770580296898, "grad_norm": 0.0019632915500551462, "learning_rate": 4.794571899835058e-05, "loss": 0.0002, "step": 2030 }, { "epoch": 2.0376518218623483, "grad_norm": 0.005742478650063276, "learning_rate": 4.790823211875844e-05, "loss": 0.0011, "step": 2040 }, { "epoch": 2.038326585695007, "grad_norm": 0.009554996155202389, "learning_rate": 4.787074523916629e-05, "loss": 0.8594, "step": 2050 }, { "epoch": 2.0390013495276653, "grad_norm": 0.0015004322631284595, "learning_rate": 4.783325835957415e-05, "loss": 0.0138, "step": 2060 }, { "epoch": 2.039676113360324, "grad_norm": 0.005102177150547504, "learning_rate": 4.779577147998201e-05, "loss": 0.7251, "step": 2070 }, { "epoch": 2.0403508771929824, "grad_norm": 0.0036967694759368896, "learning_rate": 4.7758284600389865e-05, "loss": 0.004, "step": 2080 }, { "epoch": 2.041025641025641, "grad_norm": 0.0025739429984241724, "learning_rate": 4.772079772079772e-05, "loss": 0.9565, "step": 2090 }, { "epoch": 2.0417004048582994, "grad_norm": 0.006292873062193394, "learning_rate": 4.768331084120558e-05, "loss": 0.0182, "step": 2100 }, { "epoch": 2.0423751686909584, "grad_norm": 0.007768746931105852, "learning_rate": 4.764582396161344e-05, "loss": 0.4385, "step": 2110 }, { "epoch": 2.043049932523617, "grad_norm": 0.005842685699462891, "learning_rate": 4.7608337082021294e-05, "loss": 0.6865, "step": 2120 }, { "epoch": 2.0437246963562754, "grad_norm": 0.003818152705207467, "learning_rate": 4.757085020242915e-05, "loss": 0.1049, "step": 2130 }, { "epoch": 2.044399460188934, "grad_norm": 0.0034294927027076483, "learning_rate": 4.753336332283701e-05, "loss": 0.0004, "step": 2140 }, { "epoch": 2.0450742240215924, "grad_norm": 0.005487513262778521, "learning_rate": 4.749587644324487e-05, "loss": 0.2808, "step": 2150 }, { "epoch": 2.045748987854251, "grad_norm": 0.004234930034726858, "learning_rate": 4.7458389563652724e-05, "loss": 0.0096, "step": 2160 }, { "epoch": 2.0464237516869095, "grad_norm": 0.004304991569370031, "learning_rate": 4.742090268406058e-05, "loss": 1.0463, "step": 2170 }, { "epoch": 2.047098515519568, "grad_norm": 0.06642390042543411, "learning_rate": 4.738341580446844e-05, "loss": 0.0003, "step": 2180 }, { "epoch": 2.0477732793522265, "grad_norm": 21.008607864379883, "learning_rate": 4.73459289248763e-05, "loss": 0.0073, "step": 2190 }, { "epoch": 2.0484480431848855, "grad_norm": 0.003075533313676715, "learning_rate": 4.7308442045284154e-05, "loss": 0.6295, "step": 2200 }, { "epoch": 2.049122807017544, "grad_norm": 0.002309370320290327, "learning_rate": 4.727095516569201e-05, "loss": 0.1005, "step": 2210 }, { "epoch": 2.0497975708502025, "grad_norm": 0.0027971486561000347, "learning_rate": 4.7233468286099866e-05, "loss": 0.0021, "step": 2220 }, { "epoch": 2.05, "eval_accuracy": 0.8839285714285714, "eval_f1": 0.882116388637625, "eval_loss": 0.5603616833686829, "eval_runtime": 74.4272, "eval_samples_per_second": 1.505, "eval_steps_per_second": 1.505, "step": 2223 }, { "epoch": 3.000472334682861, "grad_norm": 0.01166750118136406, "learning_rate": 4.719598140650772e-05, "loss": 0.0163, "step": 2230 }, { "epoch": 3.0011470985155198, "grad_norm": 96.65850067138672, "learning_rate": 4.7158494526915584e-05, "loss": 1.6819, "step": 2240 }, { "epoch": 3.0018218623481783, "grad_norm": 0.05787191540002823, "learning_rate": 4.712100764732344e-05, "loss": 0.6805, "step": 2250 }, { "epoch": 3.002496626180837, "grad_norm": 0.0013539530336856842, "learning_rate": 4.7083520767731296e-05, "loss": 0.0011, "step": 2260 }, { "epoch": 3.0031713900134953, "grad_norm": 0.010776277631521225, "learning_rate": 4.704603388813915e-05, "loss": 0.0148, "step": 2270 }, { "epoch": 3.003846153846154, "grad_norm": 0.024213161319494247, "learning_rate": 4.700854700854701e-05, "loss": 0.0008, "step": 2280 }, { "epoch": 3.0045209176788124, "grad_norm": 0.0691986232995987, "learning_rate": 4.697106012895487e-05, "loss": 0.006, "step": 2290 }, { "epoch": 3.005195681511471, "grad_norm": 0.009089670144021511, "learning_rate": 4.6933573249362725e-05, "loss": 0.6432, "step": 2300 }, { "epoch": 3.0058704453441294, "grad_norm": 0.005548300687223673, "learning_rate": 4.689608636977058e-05, "loss": 0.0018, "step": 2310 }, { "epoch": 3.006545209176788, "grad_norm": 0.006319984793663025, "learning_rate": 4.685859949017844e-05, "loss": 0.0001, "step": 2320 }, { "epoch": 3.007219973009447, "grad_norm": 0.007898062467575073, "learning_rate": 4.68211126105863e-05, "loss": 0.0008, "step": 2330 }, { "epoch": 3.0078947368421054, "grad_norm": 0.003347884165123105, "learning_rate": 4.678362573099415e-05, "loss": 0.0002, "step": 2340 }, { "epoch": 3.008569500674764, "grad_norm": 0.009431365877389908, "learning_rate": 4.674613885140201e-05, "loss": 0.0001, "step": 2350 }, { "epoch": 3.0092442645074224, "grad_norm": 0.006901255808770657, "learning_rate": 4.670865197180987e-05, "loss": 0.0001, "step": 2360 }, { "epoch": 3.009919028340081, "grad_norm": 0.00315679213963449, "learning_rate": 4.667116509221773e-05, "loss": 0.0002, "step": 2370 }, { "epoch": 3.0105937921727395, "grad_norm": 0.21266283094882965, "learning_rate": 4.663367821262558e-05, "loss": 0.0006, "step": 2380 }, { "epoch": 3.011268556005398, "grad_norm": 0.004384478088468313, "learning_rate": 4.659619133303344e-05, "loss": 0.0006, "step": 2390 }, { "epoch": 3.0119433198380565, "grad_norm": 0.013708599843084812, "learning_rate": 4.65587044534413e-05, "loss": 0.2589, "step": 2400 }, { "epoch": 3.012618083670715, "grad_norm": 308.8554992675781, "learning_rate": 4.652121757384916e-05, "loss": 1.1215, "step": 2410 }, { "epoch": 3.013292847503374, "grad_norm": 0.0031652101315557957, "learning_rate": 4.648373069425701e-05, "loss": 0.5235, "step": 2420 }, { "epoch": 3.0139676113360325, "grad_norm": 0.00223003257997334, "learning_rate": 4.644624381466487e-05, "loss": 0.0001, "step": 2430 }, { "epoch": 3.014642375168691, "grad_norm": 0.0067682513035833836, "learning_rate": 4.640875693507273e-05, "loss": 0.0001, "step": 2440 }, { "epoch": 3.0153171390013496, "grad_norm": 0.0025887356605380774, "learning_rate": 4.637127005548059e-05, "loss": 0.0003, "step": 2450 }, { "epoch": 3.015991902834008, "grad_norm": 0.0077194697223603725, "learning_rate": 4.633378317588844e-05, "loss": 0.6217, "step": 2460 }, { "epoch": 3.0166666666666666, "grad_norm": 0.03473236411809921, "learning_rate": 4.62962962962963e-05, "loss": 0.8179, "step": 2470 }, { "epoch": 3.017341430499325, "grad_norm": 0.014423678629100323, "learning_rate": 4.6258809416704157e-05, "loss": 0.2263, "step": 2480 }, { "epoch": 3.0180161943319836, "grad_norm": 0.006188780535012484, "learning_rate": 4.622132253711201e-05, "loss": 0.6701, "step": 2490 }, { "epoch": 3.018690958164642, "grad_norm": 0.35851019620895386, "learning_rate": 4.618383565751987e-05, "loss": 0.0008, "step": 2500 }, { "epoch": 3.019365721997301, "grad_norm": 0.0032032101880759, "learning_rate": 4.614634877792773e-05, "loss": 0.0137, "step": 2510 }, { "epoch": 3.0200404858299597, "grad_norm": 0.006460473407059908, "learning_rate": 4.6108861898335586e-05, "loss": 0.0016, "step": 2520 }, { "epoch": 3.020715249662618, "grad_norm": 0.0026447370182722807, "learning_rate": 4.607137501874344e-05, "loss": 0.0014, "step": 2530 }, { "epoch": 3.0213900134952767, "grad_norm": 0.0030527382623404264, "learning_rate": 4.60338881391513e-05, "loss": 0.0001, "step": 2540 }, { "epoch": 3.022064777327935, "grad_norm": 0.007262419909238815, "learning_rate": 4.599640125955916e-05, "loss": 0.0001, "step": 2550 }, { "epoch": 3.0227395411605937, "grad_norm": 0.0038091035094112158, "learning_rate": 4.5958914379967016e-05, "loss": 0.2222, "step": 2560 }, { "epoch": 3.0234143049932523, "grad_norm": 0.0035387033130973577, "learning_rate": 4.592142750037487e-05, "loss": 0.8073, "step": 2570 }, { "epoch": 3.0240890688259108, "grad_norm": 0.0033677646424621344, "learning_rate": 4.588394062078273e-05, "loss": 0.0006, "step": 2580 }, { "epoch": 3.0247638326585693, "grad_norm": 0.006484442390501499, "learning_rate": 4.5846453741190584e-05, "loss": 0.0004, "step": 2590 }, { "epoch": 3.0254385964912283, "grad_norm": 0.010489704087376595, "learning_rate": 4.580896686159844e-05, "loss": 0.0009, "step": 2600 }, { "epoch": 3.026113360323887, "grad_norm": 0.0032699282746762037, "learning_rate": 4.57714799820063e-05, "loss": 0.0039, "step": 2610 }, { "epoch": 3.0267881241565453, "grad_norm": 97.59777069091797, "learning_rate": 4.573399310241416e-05, "loss": 1.8702, "step": 2620 }, { "epoch": 3.027462887989204, "grad_norm": 0.05291153863072395, "learning_rate": 4.5696506222822014e-05, "loss": 0.9784, "step": 2630 }, { "epoch": 3.0281376518218623, "grad_norm": 0.0015122004551813006, "learning_rate": 4.565901934322987e-05, "loss": 0.0008, "step": 2640 }, { "epoch": 3.028812415654521, "grad_norm": 0.10103687644004822, "learning_rate": 4.5621532463637725e-05, "loss": 0.0013, "step": 2650 }, { "epoch": 3.0294871794871794, "grad_norm": 0.002090906724333763, "learning_rate": 4.558404558404559e-05, "loss": 0.0005, "step": 2660 }, { "epoch": 3.030161943319838, "grad_norm": 0.0011990441707894206, "learning_rate": 4.5546558704453443e-05, "loss": 0.772, "step": 2670 }, { "epoch": 3.0308367071524964, "grad_norm": 0.0113350385800004, "learning_rate": 4.55090718248613e-05, "loss": 0.0039, "step": 2680 }, { "epoch": 3.0315114709851554, "grad_norm": 379.9673156738281, "learning_rate": 4.5471584945269155e-05, "loss": 1.1636, "step": 2690 }, { "epoch": 3.032186234817814, "grad_norm": 0.03890157490968704, "learning_rate": 4.543409806567702e-05, "loss": 1.1822, "step": 2700 }, { "epoch": 3.0328609986504724, "grad_norm": 0.0033322779927402735, "learning_rate": 4.5396611186084866e-05, "loss": 0.0014, "step": 2710 }, { "epoch": 3.033535762483131, "grad_norm": 0.6530995965003967, "learning_rate": 4.535912430649273e-05, "loss": 0.6605, "step": 2720 }, { "epoch": 3.0342105263157895, "grad_norm": 0.03727166727185249, "learning_rate": 4.5321637426900585e-05, "loss": 0.9511, "step": 2730 }, { "epoch": 3.034885290148448, "grad_norm": 0.0015920967562124133, "learning_rate": 4.528415054730845e-05, "loss": 0.0008, "step": 2740 }, { "epoch": 3.0355600539811065, "grad_norm": 0.08293965458869934, "learning_rate": 4.5246663667716296e-05, "loss": 0.0007, "step": 2750 }, { "epoch": 3.036234817813765, "grad_norm": 0.04548066109418869, "learning_rate": 4.520917678812416e-05, "loss": 0.0015, "step": 2760 }, { "epoch": 3.0369095816464236, "grad_norm": 0.011057593859732151, "learning_rate": 4.5171689908532015e-05, "loss": 0.6973, "step": 2770 }, { "epoch": 3.0375843454790825, "grad_norm": 113.07095336914062, "learning_rate": 4.513420302893988e-05, "loss": 0.3667, "step": 2780 }, { "epoch": 3.038259109311741, "grad_norm": 0.0016718521947041154, "learning_rate": 4.5096716149347726e-05, "loss": 0.6746, "step": 2790 }, { "epoch": 3.0389338731443996, "grad_norm": 0.006011700723320246, "learning_rate": 4.505922926975559e-05, "loss": 1.4009, "step": 2800 }, { "epoch": 3.039608636977058, "grad_norm": 0.0032539258245378733, "learning_rate": 4.5021742390163445e-05, "loss": 0.005, "step": 2810 }, { "epoch": 3.0402834008097166, "grad_norm": 0.024762948974967003, "learning_rate": 4.498425551057131e-05, "loss": 0.0007, "step": 2820 }, { "epoch": 3.040958164642375, "grad_norm": 0.008271398954093456, "learning_rate": 4.4946768630979156e-05, "loss": 0.0004, "step": 2830 }, { "epoch": 3.0416329284750336, "grad_norm": 0.0073724472895264626, "learning_rate": 4.490928175138702e-05, "loss": 0.7153, "step": 2840 }, { "epoch": 3.042307692307692, "grad_norm": 0.01329676155000925, "learning_rate": 4.4871794871794874e-05, "loss": 0.1339, "step": 2850 }, { "epoch": 3.0429824561403507, "grad_norm": 0.00492237601429224, "learning_rate": 4.483430799220273e-05, "loss": 0.7432, "step": 2860 }, { "epoch": 3.0436572199730096, "grad_norm": 0.006463408935815096, "learning_rate": 4.4796821112610586e-05, "loss": 0.0007, "step": 2870 }, { "epoch": 3.044331983805668, "grad_norm": 0.0007826614892110229, "learning_rate": 4.475933423301845e-05, "loss": 0.5263, "step": 2880 }, { "epoch": 3.0450067476383267, "grad_norm": 0.0012907817726954818, "learning_rate": 4.4721847353426304e-05, "loss": 0.0017, "step": 2890 }, { "epoch": 3.045681511470985, "grad_norm": 0.0011142657604068518, "learning_rate": 4.468436047383416e-05, "loss": 0.0004, "step": 2900 }, { "epoch": 3.0463562753036437, "grad_norm": 0.0039123659953475, "learning_rate": 4.4646873594242016e-05, "loss": 0.025, "step": 2910 }, { "epoch": 3.0470310391363022, "grad_norm": 0.006876886822283268, "learning_rate": 4.460938671464988e-05, "loss": 0.5972, "step": 2920 }, { "epoch": 3.0477058029689608, "grad_norm": 0.0013078979682177305, "learning_rate": 4.4571899835057734e-05, "loss": 0.0216, "step": 2930 }, { "epoch": 3.0483805668016193, "grad_norm": 0.01804491877555847, "learning_rate": 4.453441295546559e-05, "loss": 0.0025, "step": 2940 }, { "epoch": 3.049055330634278, "grad_norm": 0.0017017913050949574, "learning_rate": 4.4496926075873446e-05, "loss": 0.1553, "step": 2950 }, { "epoch": 3.0497300944669368, "grad_norm": 0.004222176969051361, "learning_rate": 4.445943919628131e-05, "loss": 0.0002, "step": 2960 }, { "epoch": 3.05, "eval_accuracy": 0.9017857142857143, "eval_f1": 0.900079642364192, "eval_loss": 0.3880017399787903, "eval_runtime": 72.9967, "eval_samples_per_second": 1.534, "eval_steps_per_second": 1.534, "step": 2964 }, { "epoch": 4.0004048582995955, "grad_norm": 0.0011517743114382029, "learning_rate": 4.442195231668916e-05, "loss": 0.4772, "step": 2970 }, { "epoch": 4.001079622132254, "grad_norm": 0.0008661440806463361, "learning_rate": 4.438446543709702e-05, "loss": 0.0001, "step": 2980 }, { "epoch": 4.0017543859649125, "grad_norm": 0.005399093497544527, "learning_rate": 4.4346978557504876e-05, "loss": 0.0033, "step": 2990 }, { "epoch": 4.002429149797571, "grad_norm": 0.0038267234340310097, "learning_rate": 4.430949167791273e-05, "loss": 0.0005, "step": 3000 }, { "epoch": 4.0031039136302295, "grad_norm": 0.0029461942613124847, "learning_rate": 4.427200479832059e-05, "loss": 0.0002, "step": 3010 }, { "epoch": 4.003778677462888, "grad_norm": 0.0006391266360878944, "learning_rate": 4.423451791872845e-05, "loss": 0.0001, "step": 3020 }, { "epoch": 4.004453441295547, "grad_norm": 0.004189279396086931, "learning_rate": 4.4197031039136306e-05, "loss": 0.0001, "step": 3030 }, { "epoch": 4.005128205128205, "grad_norm": 0.0011289932299405336, "learning_rate": 4.415954415954416e-05, "loss": 0.0001, "step": 3040 }, { "epoch": 4.005802968960864, "grad_norm": 0.0023520805407315493, "learning_rate": 4.412205727995202e-05, "loss": 0.0001, "step": 3050 }, { "epoch": 4.006477732793522, "grad_norm": 0.0018153834389522672, "learning_rate": 4.408457040035987e-05, "loss": 0.8745, "step": 3060 }, { "epoch": 4.007152496626181, "grad_norm": 0.001743017346598208, "learning_rate": 4.4047083520767735e-05, "loss": 0.0003, "step": 3070 }, { "epoch": 4.007827260458839, "grad_norm": 0.002831714926287532, "learning_rate": 4.400959664117559e-05, "loss": 0.0066, "step": 3080 }, { "epoch": 4.008502024291498, "grad_norm": 0.005015307106077671, "learning_rate": 4.397210976158345e-05, "loss": 0.1127, "step": 3090 }, { "epoch": 4.009176788124156, "grad_norm": 0.0019009409006685019, "learning_rate": 4.39346228819913e-05, "loss": 0.001, "step": 3100 }, { "epoch": 4.009851551956815, "grad_norm": 0.0011994624510407448, "learning_rate": 4.3897136002399165e-05, "loss": 0.8256, "step": 3110 }, { "epoch": 4.010526315789473, "grad_norm": 0.002758684800937772, "learning_rate": 4.3859649122807014e-05, "loss": 0.0002, "step": 3120 }, { "epoch": 4.011201079622133, "grad_norm": 0.014079189859330654, "learning_rate": 4.382216224321488e-05, "loss": 0.0001, "step": 3130 }, { "epoch": 4.011875843454791, "grad_norm": 0.001694743288680911, "learning_rate": 4.378467536362273e-05, "loss": 0.0001, "step": 3140 }, { "epoch": 4.01255060728745, "grad_norm": 0.005108845420181751, "learning_rate": 4.3747188484030595e-05, "loss": 0.0001, "step": 3150 }, { "epoch": 4.013225371120108, "grad_norm": 0.0009567590313963592, "learning_rate": 4.3709701604438444e-05, "loss": 0.0003, "step": 3160 }, { "epoch": 4.013900134952767, "grad_norm": 0.005206429865211248, "learning_rate": 4.367221472484631e-05, "loss": 0.0139, "step": 3170 }, { "epoch": 4.014574898785425, "grad_norm": 0.0010895140003412962, "learning_rate": 4.363472784525416e-05, "loss": 0.0001, "step": 3180 }, { "epoch": 4.015249662618084, "grad_norm": 0.0026008691638708115, "learning_rate": 4.3597240965662025e-05, "loss": 0.0002, "step": 3190 }, { "epoch": 4.015924426450742, "grad_norm": 0.00945541262626648, "learning_rate": 4.3559754086069874e-05, "loss": 0.0001, "step": 3200 }, { "epoch": 4.016599190283401, "grad_norm": 0.002652823692187667, "learning_rate": 4.3522267206477737e-05, "loss": 0.0003, "step": 3210 }, { "epoch": 4.017273954116059, "grad_norm": 0.011731209233403206, "learning_rate": 4.348478032688559e-05, "loss": 0.0001, "step": 3220 }, { "epoch": 4.017948717948718, "grad_norm": 0.002854161197319627, "learning_rate": 4.344729344729345e-05, "loss": 0.0001, "step": 3230 }, { "epoch": 4.018623481781376, "grad_norm": 0.0006263653049245477, "learning_rate": 4.3409806567701304e-05, "loss": 0.0002, "step": 3240 }, { "epoch": 4.019298245614035, "grad_norm": 0.008615193888545036, "learning_rate": 4.3372319688109166e-05, "loss": 0.7675, "step": 3250 }, { "epoch": 4.0199730094466934, "grad_norm": 0.0012555683497339487, "learning_rate": 4.333483280851702e-05, "loss": 0.0001, "step": 3260 }, { "epoch": 4.020647773279352, "grad_norm": 0.0026209617499262094, "learning_rate": 4.329734592892488e-05, "loss": 0.0001, "step": 3270 }, { "epoch": 4.0213225371120105, "grad_norm": 0.0008131062495522201, "learning_rate": 4.3259859049332734e-05, "loss": 0.495, "step": 3280 }, { "epoch": 4.021997300944669, "grad_norm": 0.004160483367741108, "learning_rate": 4.3222372169740596e-05, "loss": 0.0001, "step": 3290 }, { "epoch": 4.0226720647773275, "grad_norm": 0.00135552987921983, "learning_rate": 4.318488529014845e-05, "loss": 0.0001, "step": 3300 }, { "epoch": 4.023346828609987, "grad_norm": 0.0020715997088700533, "learning_rate": 4.314739841055631e-05, "loss": 0.0001, "step": 3310 }, { "epoch": 4.0240215924426455, "grad_norm": 0.0006134248687885702, "learning_rate": 4.3109911530964164e-05, "loss": 0.0003, "step": 3320 }, { "epoch": 4.024696356275304, "grad_norm": 0.005337740760296583, "learning_rate": 4.3072424651372026e-05, "loss": 0.0002, "step": 3330 }, { "epoch": 4.0253711201079625, "grad_norm": 0.002447796519845724, "learning_rate": 4.303493777177988e-05, "loss": 0.0013, "step": 3340 }, { "epoch": 4.026045883940621, "grad_norm": 0.0020753496792167425, "learning_rate": 4.299745089218774e-05, "loss": 0.0001, "step": 3350 }, { "epoch": 4.0267206477732795, "grad_norm": 0.001169373164884746, "learning_rate": 4.2959964012595594e-05, "loss": 0.4363, "step": 3360 }, { "epoch": 4.027395411605938, "grad_norm": 0.0031577907502651215, "learning_rate": 4.292247713300345e-05, "loss": 0.4359, "step": 3370 }, { "epoch": 4.028070175438597, "grad_norm": 0.0011828079586848617, "learning_rate": 4.2884990253411305e-05, "loss": 0.0001, "step": 3380 }, { "epoch": 4.028744939271255, "grad_norm": 0.0016030353726819158, "learning_rate": 4.284750337381917e-05, "loss": 0.0001, "step": 3390 }, { "epoch": 4.029419703103914, "grad_norm": 0.014403590932488441, "learning_rate": 4.2810016494227023e-05, "loss": 0.7807, "step": 3400 }, { "epoch": 4.030094466936572, "grad_norm": 0.005019639153033495, "learning_rate": 4.277252961463488e-05, "loss": 0.4727, "step": 3410 }, { "epoch": 4.030769230769231, "grad_norm": 0.002246898366138339, "learning_rate": 4.2735042735042735e-05, "loss": 0.0499, "step": 3420 }, { "epoch": 4.031443994601889, "grad_norm": 0.013324781320989132, "learning_rate": 4.269755585545059e-05, "loss": 0.5992, "step": 3430 }, { "epoch": 4.032118758434548, "grad_norm": 0.0579649917781353, "learning_rate": 4.266006897585845e-05, "loss": 0.0039, "step": 3440 }, { "epoch": 4.032793522267206, "grad_norm": 1.7032642364501953, "learning_rate": 4.262258209626631e-05, "loss": 0.6145, "step": 3450 }, { "epoch": 4.033468286099865, "grad_norm": 0.013759407214820385, "learning_rate": 4.2585095216674165e-05, "loss": 0.0002, "step": 3460 }, { "epoch": 4.034143049932523, "grad_norm": 0.00753359729424119, "learning_rate": 4.254760833708202e-05, "loss": 0.0071, "step": 3470 }, { "epoch": 4.034817813765182, "grad_norm": 0.0020441561937332153, "learning_rate": 4.251012145748988e-05, "loss": 0.001, "step": 3480 }, { "epoch": 4.035492577597841, "grad_norm": 0.001379093388095498, "learning_rate": 4.247263457789773e-05, "loss": 0.0013, "step": 3490 }, { "epoch": 4.0361673414305, "grad_norm": 0.002510966034606099, "learning_rate": 4.2435147698305595e-05, "loss": 0.0003, "step": 3500 }, { "epoch": 4.036842105263158, "grad_norm": 0.0011007965076714754, "learning_rate": 4.239766081871345e-05, "loss": 1.0836, "step": 3510 }, { "epoch": 4.037516869095817, "grad_norm": 0.022373057901859283, "learning_rate": 4.236017393912131e-05, "loss": 0.0838, "step": 3520 }, { "epoch": 4.038191632928475, "grad_norm": 0.0008921432308852673, "learning_rate": 4.232268705952916e-05, "loss": 0.0001, "step": 3530 }, { "epoch": 4.038866396761134, "grad_norm": 0.0007166191353462636, "learning_rate": 4.2285200179937025e-05, "loss": 0.0076, "step": 3540 }, { "epoch": 4.039541160593792, "grad_norm": 8.101381301879883, "learning_rate": 4.224771330034488e-05, "loss": 0.0233, "step": 3550 }, { "epoch": 4.040215924426451, "grad_norm": 0.0007625047001056373, "learning_rate": 4.221022642075274e-05, "loss": 0.0003, "step": 3560 }, { "epoch": 4.040890688259109, "grad_norm": 0.00398569880053401, "learning_rate": 4.217273954116059e-05, "loss": 0.0001, "step": 3570 }, { "epoch": 4.041565452091768, "grad_norm": 0.0010361782042309642, "learning_rate": 4.2135252661568455e-05, "loss": 0.0001, "step": 3580 }, { "epoch": 4.042240215924426, "grad_norm": 0.001946108415722847, "learning_rate": 4.209776578197631e-05, "loss": 0.0004, "step": 3590 }, { "epoch": 4.042914979757085, "grad_norm": 0.003806932596489787, "learning_rate": 4.2060278902384166e-05, "loss": 0.0, "step": 3600 }, { "epoch": 4.043589743589743, "grad_norm": 0.0009996455628424883, "learning_rate": 4.202279202279202e-05, "loss": 0.0002, "step": 3610 }, { "epoch": 4.044264507422402, "grad_norm": 0.0016769858775660396, "learning_rate": 4.1985305143199884e-05, "loss": 0.0001, "step": 3620 }, { "epoch": 4.0449392712550605, "grad_norm": 0.00047590630128979683, "learning_rate": 4.194781826360774e-05, "loss": 0.0001, "step": 3630 }, { "epoch": 4.045614035087719, "grad_norm": 0.0010459835175424814, "learning_rate": 4.1910331384015596e-05, "loss": 0.3976, "step": 3640 }, { "epoch": 4.0462887989203775, "grad_norm": 0.003536689095199108, "learning_rate": 4.187284450442345e-05, "loss": 0.5592, "step": 3650 }, { "epoch": 4.046963562753036, "grad_norm": 0.004078584257513285, "learning_rate": 4.1835357624831314e-05, "loss": 0.2639, "step": 3660 }, { "epoch": 4.0476383265856954, "grad_norm": 0.01091256644576788, "learning_rate": 4.179787074523917e-05, "loss": 0.0001, "step": 3670 }, { "epoch": 4.048313090418354, "grad_norm": 0.0032140237744897604, "learning_rate": 4.1760383865647026e-05, "loss": 0.2047, "step": 3680 }, { "epoch": 4.0489878542510125, "grad_norm": 0.003986234311014414, "learning_rate": 4.172289698605488e-05, "loss": 0.0019, "step": 3690 }, { "epoch": 4.049662618083671, "grad_norm": 0.0013649433385580778, "learning_rate": 4.1685410106462744e-05, "loss": 0.0001, "step": 3700 }, { "epoch": 4.05, "eval_accuracy": 0.9285714285714286, "eval_f1": 0.9284473859473861, "eval_loss": 0.43087735772132874, "eval_runtime": 74.3247, "eval_samples_per_second": 1.507, "eval_steps_per_second": 1.507, "step": 3705 }, { "epoch": 5.00033738191633, "grad_norm": 0.0009709022124297917, "learning_rate": 4.16479232268706e-05, "loss": 0.0001, "step": 3710 }, { "epoch": 5.001012145748988, "grad_norm": 0.00450406176969409, "learning_rate": 4.1610436347278456e-05, "loss": 0.0001, "step": 3720 }, { "epoch": 5.001686909581647, "grad_norm": 490.396240234375, "learning_rate": 4.157294946768631e-05, "loss": 0.3041, "step": 3730 }, { "epoch": 5.002361673414305, "grad_norm": 0.00026446336414664984, "learning_rate": 4.153546258809417e-05, "loss": 0.0001, "step": 3740 }, { "epoch": 5.003036437246964, "grad_norm": 0.0011977544054389, "learning_rate": 4.149797570850202e-05, "loss": 0.0001, "step": 3750 }, { "epoch": 5.003711201079622, "grad_norm": 0.0008563337032683194, "learning_rate": 4.1460488828909886e-05, "loss": 0.6888, "step": 3760 }, { "epoch": 5.004385964912281, "grad_norm": 0.0008433638722635806, "learning_rate": 4.142300194931774e-05, "loss": 0.0003, "step": 3770 }, { "epoch": 5.005060728744939, "grad_norm": 0.0007336140261031687, "learning_rate": 4.13855150697256e-05, "loss": 0.5238, "step": 3780 }, { "epoch": 5.005735492577598, "grad_norm": 0.0012576148146763444, "learning_rate": 4.134802819013345e-05, "loss": 0.0023, "step": 3790 }, { "epoch": 5.006410256410256, "grad_norm": 0.0009189122938551009, "learning_rate": 4.131054131054131e-05, "loss": 0.0131, "step": 3800 }, { "epoch": 5.007085020242915, "grad_norm": 0.008739179000258446, "learning_rate": 4.127305443094917e-05, "loss": 0.0003, "step": 3810 }, { "epoch": 5.007759784075573, "grad_norm": 0.0012460118159651756, "learning_rate": 4.123556755135703e-05, "loss": 0.0001, "step": 3820 }, { "epoch": 5.008434547908232, "grad_norm": 0.002039340790361166, "learning_rate": 4.119808067176488e-05, "loss": 0.0003, "step": 3830 }, { "epoch": 5.0091093117408905, "grad_norm": 0.0009501971653662622, "learning_rate": 4.116059379217274e-05, "loss": 0.0052, "step": 3840 }, { "epoch": 5.009784075573549, "grad_norm": 0.07869889587163925, "learning_rate": 4.11231069125806e-05, "loss": 0.0002, "step": 3850 }, { "epoch": 5.0104588394062075, "grad_norm": 0.0006638221675530076, "learning_rate": 4.108562003298845e-05, "loss": 0.0005, "step": 3860 }, { "epoch": 5.011133603238866, "grad_norm": 0.0008539034170098603, "learning_rate": 4.104813315339631e-05, "loss": 0.0001, "step": 3870 }, { "epoch": 5.0118083670715246, "grad_norm": 0.0006605815142393112, "learning_rate": 4.101064627380417e-05, "loss": 0.0004, "step": 3880 }, { "epoch": 5.012483130904184, "grad_norm": 0.0008256967412307858, "learning_rate": 4.097315939421203e-05, "loss": 0.0001, "step": 3890 }, { "epoch": 5.0131578947368425, "grad_norm": 0.008075601421296597, "learning_rate": 4.093567251461988e-05, "loss": 0.0018, "step": 3900 }, { "epoch": 5.013832658569501, "grad_norm": 0.0012110425159335136, "learning_rate": 4.089818563502774e-05, "loss": 0.0011, "step": 3910 }, { "epoch": 5.0145074224021595, "grad_norm": 0.0048310281708836555, "learning_rate": 4.08606987554356e-05, "loss": 0.0001, "step": 3920 }, { "epoch": 5.015182186234818, "grad_norm": 0.0012771515175700188, "learning_rate": 4.082321187584346e-05, "loss": 0.0003, "step": 3930 }, { "epoch": 5.015856950067477, "grad_norm": 0.0013642838457599282, "learning_rate": 4.078572499625131e-05, "loss": 0.0001, "step": 3940 }, { "epoch": 5.016531713900135, "grad_norm": 311.0769348144531, "learning_rate": 4.074823811665917e-05, "loss": 0.7081, "step": 3950 }, { "epoch": 5.017206477732794, "grad_norm": 0.002835233462974429, "learning_rate": 4.071075123706703e-05, "loss": 0.0003, "step": 3960 }, { "epoch": 5.017881241565452, "grad_norm": 0.0006811009370721877, "learning_rate": 4.067326435747489e-05, "loss": 0.4166, "step": 3970 }, { "epoch": 5.018556005398111, "grad_norm": 0.0010262362193316221, "learning_rate": 4.063577747788274e-05, "loss": 0.0001, "step": 3980 }, { "epoch": 5.019230769230769, "grad_norm": 0.11619503796100616, "learning_rate": 4.05982905982906e-05, "loss": 0.0002, "step": 3990 }, { "epoch": 5.019905533063428, "grad_norm": 0.011183816939592361, "learning_rate": 4.056080371869846e-05, "loss": 0.0006, "step": 4000 }, { "epoch": 5.020580296896086, "grad_norm": 0.0007078946800902486, "learning_rate": 4.0523316839106314e-05, "loss": 0.0004, "step": 4010 }, { "epoch": 5.021255060728745, "grad_norm": 0.008296789601445198, "learning_rate": 4.048582995951417e-05, "loss": 0.0134, "step": 4020 }, { "epoch": 5.021929824561403, "grad_norm": 0.013501118868589401, "learning_rate": 4.044834307992203e-05, "loss": 0.0003, "step": 4030 }, { "epoch": 5.022604588394062, "grad_norm": 0.15977753698825836, "learning_rate": 4.041085620032989e-05, "loss": 0.0001, "step": 4040 }, { "epoch": 5.02327935222672, "grad_norm": 0.004472650587558746, "learning_rate": 4.0373369320737744e-05, "loss": 0.0032, "step": 4050 }, { "epoch": 5.023954116059379, "grad_norm": 0.0012224495876580477, "learning_rate": 4.03358824411456e-05, "loss": 0.0, "step": 4060 }, { "epoch": 5.024628879892038, "grad_norm": 0.0016181441023945808, "learning_rate": 4.029839556155346e-05, "loss": 0.7806, "step": 4070 }, { "epoch": 5.025303643724697, "grad_norm": 0.004258355125784874, "learning_rate": 4.026090868196132e-05, "loss": 0.0, "step": 4080 }, { "epoch": 5.025978407557355, "grad_norm": 0.0011408330174162984, "learning_rate": 4.0223421802369174e-05, "loss": 0.0001, "step": 4090 }, { "epoch": 5.026653171390014, "grad_norm": 0.010054398328065872, "learning_rate": 4.018593492277703e-05, "loss": 0.0001, "step": 4100 }, { "epoch": 5.027327935222672, "grad_norm": 0.0009806094458326697, "learning_rate": 4.014844804318489e-05, "loss": 0.0001, "step": 4110 }, { "epoch": 5.028002699055331, "grad_norm": 0.0007722462760284543, "learning_rate": 4.011096116359274e-05, "loss": 0.0003, "step": 4120 }, { "epoch": 5.028677462887989, "grad_norm": 0.01538068987429142, "learning_rate": 4.0073474284000604e-05, "loss": 0.6961, "step": 4130 }, { "epoch": 5.029352226720648, "grad_norm": 0.00021896508405916393, "learning_rate": 4.003598740440846e-05, "loss": 0.0001, "step": 4140 }, { "epoch": 5.030026990553306, "grad_norm": 0.0006867019692435861, "learning_rate": 3.9998500524816315e-05, "loss": 0.0, "step": 4150 }, { "epoch": 5.030701754385965, "grad_norm": 0.0021174189168959856, "learning_rate": 3.996101364522417e-05, "loss": 0.0, "step": 4160 }, { "epoch": 5.031376518218623, "grad_norm": 0.0005668731173500419, "learning_rate": 3.992352676563203e-05, "loss": 0.0, "step": 4170 }, { "epoch": 5.032051282051282, "grad_norm": 0.0007015119190327823, "learning_rate": 3.988603988603989e-05, "loss": 0.4088, "step": 4180 }, { "epoch": 5.0327260458839405, "grad_norm": 0.007248507812619209, "learning_rate": 3.9848553006447745e-05, "loss": 0.0212, "step": 4190 }, { "epoch": 5.033400809716599, "grad_norm": 0.0023328044917434454, "learning_rate": 3.98110661268556e-05, "loss": 0.0001, "step": 4200 }, { "epoch": 5.0340755735492575, "grad_norm": 0.0011781149078160524, "learning_rate": 3.9773579247263456e-05, "loss": 0.0001, "step": 4210 }, { "epoch": 5.034750337381916, "grad_norm": 0.000842131907120347, "learning_rate": 3.973609236767132e-05, "loss": 0.0001, "step": 4220 }, { "epoch": 5.0354251012145745, "grad_norm": 0.0013578764628618956, "learning_rate": 3.9698605488079175e-05, "loss": 0.0001, "step": 4230 }, { "epoch": 5.036099865047233, "grad_norm": 0.0005201473250053823, "learning_rate": 3.966111860848703e-05, "loss": 0.0001, "step": 4240 }, { "epoch": 5.0367746288798925, "grad_norm": 0.0011828228598460555, "learning_rate": 3.9623631728894886e-05, "loss": 0.0065, "step": 4250 }, { "epoch": 5.037449392712551, "grad_norm": 0.000755178218241781, "learning_rate": 3.958614484930275e-05, "loss": 0.207, "step": 4260 }, { "epoch": 5.0381241565452095, "grad_norm": 0.0009751113248057663, "learning_rate": 3.95486579697106e-05, "loss": 0.0001, "step": 4270 }, { "epoch": 5.038798920377868, "grad_norm": 0.00031620432855561376, "learning_rate": 3.951117109011846e-05, "loss": 0.337, "step": 4280 }, { "epoch": 5.0394736842105265, "grad_norm": 0.0007090018480084836, "learning_rate": 3.9473684210526316e-05, "loss": 0.0006, "step": 4290 }, { "epoch": 5.040148448043185, "grad_norm": 0.0010267384350299835, "learning_rate": 3.943619733093418e-05, "loss": 0.0, "step": 4300 }, { "epoch": 5.040823211875844, "grad_norm": 0.014587147161364555, "learning_rate": 3.939871045134203e-05, "loss": 0.0001, "step": 4310 }, { "epoch": 5.041497975708502, "grad_norm": 0.000788258679676801, "learning_rate": 3.936122357174989e-05, "loss": 0.0, "step": 4320 }, { "epoch": 5.042172739541161, "grad_norm": 0.0006495325942523777, "learning_rate": 3.9323736692157746e-05, "loss": 0.0, "step": 4330 }, { "epoch": 5.042847503373819, "grad_norm": 0.0006167737883515656, "learning_rate": 3.928624981256561e-05, "loss": 0.1018, "step": 4340 }, { "epoch": 5.043522267206478, "grad_norm": 0.0014920184621587396, "learning_rate": 3.924876293297346e-05, "loss": 0.0, "step": 4350 }, { "epoch": 5.044197031039136, "grad_norm": 0.0015535310376435518, "learning_rate": 3.921127605338132e-05, "loss": 0.0007, "step": 4360 }, { "epoch": 5.044871794871795, "grad_norm": 0.0006431335350498557, "learning_rate": 3.9173789173789176e-05, "loss": 0.0001, "step": 4370 }, { "epoch": 5.045546558704453, "grad_norm": 0.005366568453609943, "learning_rate": 3.913630229419703e-05, "loss": 0.0, "step": 4380 }, { "epoch": 5.046221322537112, "grad_norm": 0.0013297253753989935, "learning_rate": 3.909881541460489e-05, "loss": 0.0, "step": 4390 }, { "epoch": 5.04689608636977, "grad_norm": 0.0004990586312487721, "learning_rate": 3.906132853501275e-05, "loss": 0.0, "step": 4400 }, { "epoch": 5.047570850202429, "grad_norm": 0.0013985860859975219, "learning_rate": 3.9023841655420606e-05, "loss": 0.0, "step": 4410 }, { "epoch": 5.048245614035087, "grad_norm": 0.0006711781024932861, "learning_rate": 3.898635477582846e-05, "loss": 0.0, "step": 4420 }, { "epoch": 5.048920377867747, "grad_norm": 0.0006565306102856994, "learning_rate": 3.894886789623632e-05, "loss": 0.0, "step": 4430 }, { "epoch": 5.049595141700405, "grad_norm": 0.0009195157326757908, "learning_rate": 3.891138101664418e-05, "loss": 0.0001, "step": 4440 }, { "epoch": 5.05, "eval_accuracy": 0.9107142857142857, "eval_f1": 0.9105137981578073, "eval_loss": 0.7364658117294312, "eval_runtime": 73.1769, "eval_samples_per_second": 1.531, "eval_steps_per_second": 1.531, "step": 4446 }, { "epoch": 6.000269905533063, "grad_norm": 0.0008725410443730652, "learning_rate": 3.8873894137052036e-05, "loss": 0.0, "step": 4450 }, { "epoch": 6.000944669365722, "grad_norm": 0.0006686112028546631, "learning_rate": 3.883640725745989e-05, "loss": 0.0, "step": 4460 }, { "epoch": 6.001619433198381, "grad_norm": 0.000973099609836936, "learning_rate": 3.879892037786775e-05, "loss": 0.0, "step": 4470 }, { "epoch": 6.0022941970310395, "grad_norm": 0.0036273570731282234, "learning_rate": 3.876143349827561e-05, "loss": 0.0, "step": 4480 }, { "epoch": 6.002968960863698, "grad_norm": 0.0030524057801812887, "learning_rate": 3.8723946618683466e-05, "loss": 0.9891, "step": 4490 }, { "epoch": 6.0036437246963565, "grad_norm": 0.0005925680161453784, "learning_rate": 3.868645973909132e-05, "loss": 0.0001, "step": 4500 }, { "epoch": 6.004318488529015, "grad_norm": 0.0012102797627449036, "learning_rate": 3.864897285949918e-05, "loss": 0.0004, "step": 4510 }, { "epoch": 6.004993252361674, "grad_norm": 0.001870299456641078, "learning_rate": 3.861148597990703e-05, "loss": 0.0001, "step": 4520 }, { "epoch": 6.005668016194332, "grad_norm": 0.0008334846352227032, "learning_rate": 3.857399910031489e-05, "loss": 0.0, "step": 4530 }, { "epoch": 6.006342780026991, "grad_norm": 0.0909259095788002, "learning_rate": 3.853651222072275e-05, "loss": 0.0033, "step": 4540 }, { "epoch": 6.007017543859649, "grad_norm": 0.08534003794193268, "learning_rate": 3.849902534113061e-05, "loss": 0.0001, "step": 4550 }, { "epoch": 6.007692307692308, "grad_norm": 0.009015407413244247, "learning_rate": 3.846153846153846e-05, "loss": 0.0001, "step": 4560 }, { "epoch": 6.008367071524966, "grad_norm": 0.0005771831492893398, "learning_rate": 3.842405158194632e-05, "loss": 0.0, "step": 4570 }, { "epoch": 6.009041835357625, "grad_norm": 0.00015217051259241998, "learning_rate": 3.8386564702354174e-05, "loss": 0.0, "step": 4580 }, { "epoch": 6.009716599190283, "grad_norm": 0.001618007430806756, "learning_rate": 3.834907782276204e-05, "loss": 0.0001, "step": 4590 }, { "epoch": 6.010391363022942, "grad_norm": 0.0008747613755986094, "learning_rate": 3.831159094316989e-05, "loss": 0.0, "step": 4600 }, { "epoch": 6.0110661268556, "grad_norm": 0.0011886496795341372, "learning_rate": 3.827410406357775e-05, "loss": 0.0001, "step": 4610 }, { "epoch": 6.011740890688259, "grad_norm": 0.0006136572919785976, "learning_rate": 3.8236617183985604e-05, "loss": 0.0, "step": 4620 }, { "epoch": 6.012415654520917, "grad_norm": 0.0002797636261675507, "learning_rate": 3.819913030439347e-05, "loss": 0.0, "step": 4630 }, { "epoch": 6.013090418353576, "grad_norm": 0.0005924575380049646, "learning_rate": 3.8161643424801316e-05, "loss": 0.0, "step": 4640 }, { "epoch": 6.013765182186235, "grad_norm": 381.5912170410156, "learning_rate": 3.812415654520918e-05, "loss": 0.6612, "step": 4650 }, { "epoch": 6.014439946018894, "grad_norm": 0.0007501631625927985, "learning_rate": 3.8086669665617034e-05, "loss": 0.057, "step": 4660 }, { "epoch": 6.015114709851552, "grad_norm": 0.00048053194768726826, "learning_rate": 3.80491827860249e-05, "loss": 0.7472, "step": 4670 }, { "epoch": 6.015789473684211, "grad_norm": 0.0008806756814010441, "learning_rate": 3.8011695906432746e-05, "loss": 0.0, "step": 4680 }, { "epoch": 6.016464237516869, "grad_norm": 0.0007039654301479459, "learning_rate": 3.797420902684061e-05, "loss": 0.0002, "step": 4690 }, { "epoch": 6.017139001349528, "grad_norm": 0.0005677440203726292, "learning_rate": 3.7936722147248464e-05, "loss": 0.0, "step": 4700 }, { "epoch": 6.017813765182186, "grad_norm": 0.0006246105185709894, "learning_rate": 3.7899235267656327e-05, "loss": 0.0002, "step": 4710 }, { "epoch": 6.018488529014845, "grad_norm": 0.0003905866760760546, "learning_rate": 3.7861748388064176e-05, "loss": 0.0, "step": 4720 }, { "epoch": 6.019163292847503, "grad_norm": 0.0004027994582429528, "learning_rate": 3.782426150847204e-05, "loss": 0.0002, "step": 4730 }, { "epoch": 6.019838056680162, "grad_norm": 0.0017455661436542869, "learning_rate": 3.7786774628879894e-05, "loss": 0.0001, "step": 4740 }, { "epoch": 6.02051282051282, "grad_norm": 0.0022832180839031935, "learning_rate": 3.774928774928775e-05, "loss": 0.0001, "step": 4750 }, { "epoch": 6.021187584345479, "grad_norm": 295.60693359375, "learning_rate": 3.7711800869695605e-05, "loss": 0.7359, "step": 4760 }, { "epoch": 6.0218623481781375, "grad_norm": 0.0004823520721402019, "learning_rate": 3.767431399010347e-05, "loss": 0.0, "step": 4770 }, { "epoch": 6.022537112010796, "grad_norm": 0.003145309165120125, "learning_rate": 3.7636827110511324e-05, "loss": 0.0, "step": 4780 }, { "epoch": 6.0232118758434545, "grad_norm": 0.00026828868431039155, "learning_rate": 3.759934023091918e-05, "loss": 0.0, "step": 4790 }, { "epoch": 6.023886639676113, "grad_norm": 0.000310034112771973, "learning_rate": 3.7561853351327035e-05, "loss": 0.0, "step": 4800 }, { "epoch": 6.024561403508772, "grad_norm": 0.00041966387652792037, "learning_rate": 3.75243664717349e-05, "loss": 0.0, "step": 4810 }, { "epoch": 6.02523616734143, "grad_norm": 0.0011529176263138652, "learning_rate": 3.7486879592142754e-05, "loss": 0.5445, "step": 4820 }, { "epoch": 6.0259109311740895, "grad_norm": 0.02147838845849037, "learning_rate": 3.744939271255061e-05, "loss": 1.0205, "step": 4830 }, { "epoch": 6.026585695006748, "grad_norm": 0.000508416909724474, "learning_rate": 3.7411905832958465e-05, "loss": 0.001, "step": 4840 }, { "epoch": 6.0272604588394065, "grad_norm": 0.008615111000835896, "learning_rate": 3.737441895336633e-05, "loss": 0.0001, "step": 4850 }, { "epoch": 6.027935222672065, "grad_norm": 0.444153755903244, "learning_rate": 3.7336932073774184e-05, "loss": 0.9325, "step": 4860 }, { "epoch": 6.028609986504724, "grad_norm": 0.0013290736824274063, "learning_rate": 3.729944519418204e-05, "loss": 0.0001, "step": 4870 }, { "epoch": 6.029284750337382, "grad_norm": 0.000803654664196074, "learning_rate": 3.7261958314589895e-05, "loss": 0.0044, "step": 4880 }, { "epoch": 6.029959514170041, "grad_norm": 0.0021947200875729322, "learning_rate": 3.722447143499775e-05, "loss": 0.9785, "step": 4890 }, { "epoch": 6.030634278002699, "grad_norm": 0.0023971525952219963, "learning_rate": 3.718698455540561e-05, "loss": 0.0001, "step": 4900 }, { "epoch": 6.031309041835358, "grad_norm": 0.00609954446554184, "learning_rate": 3.714949767581347e-05, "loss": 0.0002, "step": 4910 }, { "epoch": 6.031983805668016, "grad_norm": 0.0020932150073349476, "learning_rate": 3.7112010796221325e-05, "loss": 0.0002, "step": 4920 }, { "epoch": 6.032658569500675, "grad_norm": 0.0034460346214473248, "learning_rate": 3.707452391662918e-05, "loss": 0.0004, "step": 4930 }, { "epoch": 6.033333333333333, "grad_norm": 0.0021088484209030867, "learning_rate": 3.7037037037037037e-05, "loss": 0.0001, "step": 4940 }, { "epoch": 6.034008097165992, "grad_norm": 0.002742623910307884, "learning_rate": 3.699955015744489e-05, "loss": 0.0006, "step": 4950 }, { "epoch": 6.03468286099865, "grad_norm": 0.002541649155318737, "learning_rate": 3.6962063277852755e-05, "loss": 0.0001, "step": 4960 }, { "epoch": 6.035357624831309, "grad_norm": 0.000678271462675184, "learning_rate": 3.692457639826061e-05, "loss": 0.0, "step": 4970 }, { "epoch": 6.036032388663967, "grad_norm": 0.0022359860595315695, "learning_rate": 3.6887089518668466e-05, "loss": 0.0002, "step": 4980 }, { "epoch": 6.036707152496626, "grad_norm": 0.003631311934441328, "learning_rate": 3.684960263907632e-05, "loss": 0.0139, "step": 4990 }, { "epoch": 6.037381916329284, "grad_norm": 408.66119384765625, "learning_rate": 3.6812115759484185e-05, "loss": 0.3617, "step": 5000 }, { "epoch": 6.038056680161944, "grad_norm": 0.001363090705126524, "learning_rate": 3.6774628879892034e-05, "loss": 0.7014, "step": 5010 }, { "epoch": 6.038731443994602, "grad_norm": 0.0028585607651621103, "learning_rate": 3.6737142000299896e-05, "loss": 0.0209, "step": 5020 }, { "epoch": 6.039406207827261, "grad_norm": 0.0029073706828057766, "learning_rate": 3.669965512070775e-05, "loss": 0.0007, "step": 5030 }, { "epoch": 6.040080971659919, "grad_norm": 0.021762054413557053, "learning_rate": 3.6662168241115615e-05, "loss": 3.0967, "step": 5040 }, { "epoch": 6.040755735492578, "grad_norm": 1.7035624980926514, "learning_rate": 3.6624681361523464e-05, "loss": 1.3983, "step": 5050 }, { "epoch": 6.041430499325236, "grad_norm": 0.07881853729486465, "learning_rate": 3.6587194481931326e-05, "loss": 0.8778, "step": 5060 }, { "epoch": 6.042105263157895, "grad_norm": 49.91697311401367, "learning_rate": 3.654970760233918e-05, "loss": 0.0293, "step": 5070 }, { "epoch": 6.042780026990553, "grad_norm": 0.01630672998726368, "learning_rate": 3.6512220722747045e-05, "loss": 1.1156, "step": 5080 }, { "epoch": 6.043454790823212, "grad_norm": 0.007935232482850552, "learning_rate": 3.6474733843154894e-05, "loss": 0.0031, "step": 5090 }, { "epoch": 6.04412955465587, "grad_norm": 299.0083923339844, "learning_rate": 3.6437246963562756e-05, "loss": 0.6953, "step": 5100 }, { "epoch": 6.044804318488529, "grad_norm": 0.014369282871484756, "learning_rate": 3.639976008397061e-05, "loss": 0.0002, "step": 5110 }, { "epoch": 6.0454790823211875, "grad_norm": 0.0033456783276051283, "learning_rate": 3.6362273204378474e-05, "loss": 0.0009, "step": 5120 }, { "epoch": 6.046153846153846, "grad_norm": 0.0012127397349104285, "learning_rate": 3.6324786324786323e-05, "loss": 0.0001, "step": 5130 }, { "epoch": 6.0468286099865045, "grad_norm": 0.003025912446901202, "learning_rate": 3.6287299445194186e-05, "loss": 0.0001, "step": 5140 }, { "epoch": 6.047503373819163, "grad_norm": 0.006771762855350971, "learning_rate": 3.624981256560204e-05, "loss": 0.0003, "step": 5150 }, { "epoch": 6.0481781376518216, "grad_norm": 0.006291988305747509, "learning_rate": 3.62123256860099e-05, "loss": 0.6232, "step": 5160 }, { "epoch": 6.04885290148448, "grad_norm": 0.010942903347313404, "learning_rate": 3.617483880641775e-05, "loss": 0.9909, "step": 5170 }, { "epoch": 6.049527665317139, "grad_norm": 0.0050459960475564, "learning_rate": 3.6137351926825616e-05, "loss": 0.8987, "step": 5180 }, { "epoch": 6.05, "eval_accuracy": 0.8392857142857143, "eval_f1": 0.8294011707968183, "eval_loss": 0.930968701839447, "eval_runtime": 74.4165, "eval_samples_per_second": 1.505, "eval_steps_per_second": 1.505, "step": 5187 }, { "epoch": 7.000202429149797, "grad_norm": 0.012029584497213364, "learning_rate": 3.609986504723347e-05, "loss": 0.0003, "step": 5190 }, { "epoch": 7.000877192982456, "grad_norm": 0.002462017349898815, "learning_rate": 3.606237816764133e-05, "loss": 0.0005, "step": 5200 }, { "epoch": 7.001551956815114, "grad_norm": 0.0375690832734108, "learning_rate": 3.602489128804918e-05, "loss": 0.1058, "step": 5210 }, { "epoch": 7.002226720647773, "grad_norm": 0.026218879967927933, "learning_rate": 3.5987404408457046e-05, "loss": 0.0083, "step": 5220 }, { "epoch": 7.002901484480432, "grad_norm": 0.0031192379537969828, "learning_rate": 3.59499175288649e-05, "loss": 0.0342, "step": 5230 }, { "epoch": 7.003576248313091, "grad_norm": 0.002261426765471697, "learning_rate": 3.591243064927276e-05, "loss": 0.8758, "step": 5240 }, { "epoch": 7.004251012145749, "grad_norm": 0.7252321839332581, "learning_rate": 3.587494376968061e-05, "loss": 0.0008, "step": 5250 }, { "epoch": 7.004925775978408, "grad_norm": 0.002154165878891945, "learning_rate": 3.583745689008847e-05, "loss": 0.0002, "step": 5260 }, { "epoch": 7.005600539811066, "grad_norm": 0.0012370734475553036, "learning_rate": 3.5799970010496325e-05, "loss": 1.0515, "step": 5270 }, { "epoch": 7.006275303643725, "grad_norm": 0.0021348996087908745, "learning_rate": 3.576248313090419e-05, "loss": 0.0001, "step": 5280 }, { "epoch": 7.006950067476383, "grad_norm": 0.006049524061381817, "learning_rate": 3.572499625131204e-05, "loss": 0.0003, "step": 5290 }, { "epoch": 7.007624831309042, "grad_norm": 0.01275632157921791, "learning_rate": 3.56875093717199e-05, "loss": 0.0021, "step": 5300 }, { "epoch": 7.0082995951417, "grad_norm": 0.0016850440297275782, "learning_rate": 3.5650022492127754e-05, "loss": 0.0001, "step": 5310 }, { "epoch": 7.008974358974359, "grad_norm": 0.0009741022950038314, "learning_rate": 3.561253561253561e-05, "loss": 0.0005, "step": 5320 }, { "epoch": 7.0096491228070175, "grad_norm": 0.000799846719019115, "learning_rate": 3.557504873294347e-05, "loss": 0.0002, "step": 5330 }, { "epoch": 7.010323886639676, "grad_norm": 0.0008095399825833738, "learning_rate": 3.553756185335133e-05, "loss": 0.0024, "step": 5340 }, { "epoch": 7.0109986504723345, "grad_norm": 0.0016390876844525337, "learning_rate": 3.5500074973759184e-05, "loss": 0.0, "step": 5350 }, { "epoch": 7.011673414304993, "grad_norm": 0.0013130076695233583, "learning_rate": 3.546258809416704e-05, "loss": 0.8843, "step": 5360 }, { "epoch": 7.0123481781376515, "grad_norm": 0.015013671480119228, "learning_rate": 3.54251012145749e-05, "loss": 0.7296, "step": 5370 }, { "epoch": 7.01302294197031, "grad_norm": 0.003729419782757759, "learning_rate": 3.538761433498276e-05, "loss": 0.004, "step": 5380 }, { "epoch": 7.013697705802969, "grad_norm": 0.007766401395201683, "learning_rate": 3.5350127455390614e-05, "loss": 0.0001, "step": 5390 }, { "epoch": 7.014372469635627, "grad_norm": 0.03760051354765892, "learning_rate": 3.531264057579847e-05, "loss": 0.0002, "step": 5400 }, { "epoch": 7.0150472334682865, "grad_norm": 0.003396588610485196, "learning_rate": 3.527515369620633e-05, "loss": 0.0001, "step": 5410 }, { "epoch": 7.015721997300945, "grad_norm": 0.005965414922684431, "learning_rate": 3.523766681661418e-05, "loss": 0.0001, "step": 5420 }, { "epoch": 7.0163967611336036, "grad_norm": 0.002591415075585246, "learning_rate": 3.5200179937022044e-05, "loss": 0.0001, "step": 5430 }, { "epoch": 7.017071524966262, "grad_norm": 0.0007187007577158511, "learning_rate": 3.51626930574299e-05, "loss": 0.6273, "step": 5440 }, { "epoch": 7.017746288798921, "grad_norm": 0.0018147805240005255, "learning_rate": 3.512520617783776e-05, "loss": 0.0001, "step": 5450 }, { "epoch": 7.018421052631579, "grad_norm": 0.0007241186103783548, "learning_rate": 3.508771929824561e-05, "loss": 0.0002, "step": 5460 }, { "epoch": 7.019095816464238, "grad_norm": 0.002352670766413212, "learning_rate": 3.5050232418653474e-05, "loss": 0.0001, "step": 5470 }, { "epoch": 7.019770580296896, "grad_norm": 0.0018704934045672417, "learning_rate": 3.501274553906133e-05, "loss": 0.0004, "step": 5480 }, { "epoch": 7.020445344129555, "grad_norm": 0.002092360518872738, "learning_rate": 3.497525865946919e-05, "loss": 0.0001, "step": 5490 }, { "epoch": 7.021120107962213, "grad_norm": 0.001126096467487514, "learning_rate": 3.493777177987704e-05, "loss": 0.6823, "step": 5500 }, { "epoch": 7.021794871794872, "grad_norm": 0.0008661505416966975, "learning_rate": 3.4900284900284904e-05, "loss": 0.0001, "step": 5510 }, { "epoch": 7.02246963562753, "grad_norm": 0.02058524824678898, "learning_rate": 3.486279802069276e-05, "loss": 0.0001, "step": 5520 }, { "epoch": 7.023144399460189, "grad_norm": 0.002387122018262744, "learning_rate": 3.4825311141100615e-05, "loss": 0.0002, "step": 5530 }, { "epoch": 7.023819163292847, "grad_norm": 0.0011330017587170005, "learning_rate": 3.478782426150847e-05, "loss": 0.0001, "step": 5540 }, { "epoch": 7.024493927125506, "grad_norm": 0.0005625615012831986, "learning_rate": 3.4750337381916334e-05, "loss": 0.0002, "step": 5550 }, { "epoch": 7.025168690958164, "grad_norm": 0.0008695796132087708, "learning_rate": 3.471285050232419e-05, "loss": 0.0, "step": 5560 }, { "epoch": 7.025843454790823, "grad_norm": 0.0016092468285933137, "learning_rate": 3.4675363622732045e-05, "loss": 0.0001, "step": 5570 }, { "epoch": 7.026518218623481, "grad_norm": 0.0011349094565957785, "learning_rate": 3.46378767431399e-05, "loss": 0.0003, "step": 5580 }, { "epoch": 7.027192982456141, "grad_norm": 0.0005459162639454007, "learning_rate": 3.4600389863547764e-05, "loss": 0.0001, "step": 5590 }, { "epoch": 7.027867746288799, "grad_norm": 0.0009417292312718928, "learning_rate": 3.456290298395562e-05, "loss": 0.0, "step": 5600 }, { "epoch": 7.028542510121458, "grad_norm": 0.0005761535139754415, "learning_rate": 3.4525416104363475e-05, "loss": 0.0, "step": 5610 }, { "epoch": 7.029217273954116, "grad_norm": 0.0007409591344185174, "learning_rate": 3.448792922477133e-05, "loss": 0.0001, "step": 5620 }, { "epoch": 7.029892037786775, "grad_norm": 0.004374117590487003, "learning_rate": 3.4450442345179194e-05, "loss": 0.0, "step": 5630 }, { "epoch": 7.030566801619433, "grad_norm": 0.017210789024829865, "learning_rate": 3.441295546558704e-05, "loss": 0.0001, "step": 5640 }, { "epoch": 7.031241565452092, "grad_norm": 0.0008836057968437672, "learning_rate": 3.4375468585994905e-05, "loss": 0.0012, "step": 5650 }, { "epoch": 7.03191632928475, "grad_norm": 0.0015315774362534285, "learning_rate": 3.433798170640276e-05, "loss": 0.0, "step": 5660 }, { "epoch": 7.032591093117409, "grad_norm": 0.0006376684177666903, "learning_rate": 3.4300494826810617e-05, "loss": 0.0, "step": 5670 }, { "epoch": 7.0332658569500675, "grad_norm": 0.0005232661496847868, "learning_rate": 3.426300794721847e-05, "loss": 0.0, "step": 5680 }, { "epoch": 7.033940620782726, "grad_norm": 0.0008468987653031945, "learning_rate": 3.4225521067626335e-05, "loss": 0.0003, "step": 5690 }, { "epoch": 7.0346153846153845, "grad_norm": 0.000993360416032374, "learning_rate": 3.418803418803419e-05, "loss": 0.0, "step": 5700 }, { "epoch": 7.035290148448043, "grad_norm": 0.0020066085271537304, "learning_rate": 3.4150547308442046e-05, "loss": 0.0001, "step": 5710 }, { "epoch": 7.0359649122807015, "grad_norm": 0.00036297430051490664, "learning_rate": 3.41130604288499e-05, "loss": 0.0, "step": 5720 }, { "epoch": 7.03663967611336, "grad_norm": 0.0009432988590560853, "learning_rate": 3.407557354925776e-05, "loss": 0.0001, "step": 5730 }, { "epoch": 7.037314439946019, "grad_norm": 0.0018047185149043798, "learning_rate": 3.403808666966562e-05, "loss": 0.8627, "step": 5740 }, { "epoch": 7.037989203778677, "grad_norm": 0.0037690841127187014, "learning_rate": 3.4000599790073476e-05, "loss": 0.0781, "step": 5750 }, { "epoch": 7.038663967611336, "grad_norm": 0.023057781159877777, "learning_rate": 3.396311291048133e-05, "loss": 0.0001, "step": 5760 }, { "epoch": 7.039338731443995, "grad_norm": 0.004012484569102526, "learning_rate": 3.392562603088919e-05, "loss": 0.0001, "step": 5770 }, { "epoch": 7.0400134952766535, "grad_norm": 0.0012608218239620328, "learning_rate": 3.388813915129705e-05, "loss": 0.0001, "step": 5780 }, { "epoch": 7.040688259109312, "grad_norm": 0.002351221162825823, "learning_rate": 3.38506522717049e-05, "loss": 0.0001, "step": 5790 }, { "epoch": 7.041363022941971, "grad_norm": 0.000716827402357012, "learning_rate": 3.381316539211276e-05, "loss": 0.0005, "step": 5800 }, { "epoch": 7.042037786774629, "grad_norm": 0.0029892646707594395, "learning_rate": 3.377567851252062e-05, "loss": 0.0, "step": 5810 }, { "epoch": 7.042712550607288, "grad_norm": 372.2917175292969, "learning_rate": 3.373819163292848e-05, "loss": 0.5735, "step": 5820 }, { "epoch": 7.043387314439946, "grad_norm": 0.0010425182990729809, "learning_rate": 3.370070475333633e-05, "loss": 0.0, "step": 5830 }, { "epoch": 7.044062078272605, "grad_norm": 43.60670852661133, "learning_rate": 3.366321787374419e-05, "loss": 1.7134, "step": 5840 }, { "epoch": 7.044736842105263, "grad_norm": 44.16180419921875, "learning_rate": 3.362573099415205e-05, "loss": 0.4081, "step": 5850 }, { "epoch": 7.045411605937922, "grad_norm": 0.002386684063822031, "learning_rate": 3.358824411455991e-05, "loss": 0.0014, "step": 5860 }, { "epoch": 7.04608636977058, "grad_norm": 0.000626052962616086, "learning_rate": 3.355075723496776e-05, "loss": 0.6338, "step": 5870 }, { "epoch": 7.046761133603239, "grad_norm": 0.0048158965073525906, "learning_rate": 3.351327035537562e-05, "loss": 0.0002, "step": 5880 }, { "epoch": 7.047435897435897, "grad_norm": 102.3766860961914, "learning_rate": 3.347578347578348e-05, "loss": 0.7, "step": 5890 }, { "epoch": 7.048110661268556, "grad_norm": 0.0005689793615601957, "learning_rate": 3.343829659619133e-05, "loss": 0.7073, "step": 5900 }, { "epoch": 7.048785425101214, "grad_norm": 0.013288695365190506, "learning_rate": 3.340080971659919e-05, "loss": 0.0572, "step": 5910 }, { "epoch": 7.049460188933873, "grad_norm": 0.0011189569486305118, "learning_rate": 3.336332283700705e-05, "loss": 0.4888, "step": 5920 }, { "epoch": 7.05, "eval_accuracy": 0.875, "eval_f1": 0.8702947845804988, "eval_loss": 0.856253445148468, "eval_runtime": 74.1698, "eval_samples_per_second": 1.51, "eval_steps_per_second": 1.51, "step": 5928 }, { "epoch": 8.000134952766532, "grad_norm": 0.0010889604454860091, "learning_rate": 3.332583595741491e-05, "loss": 0.0492, "step": 5930 }, { "epoch": 8.000809716599191, "grad_norm": 0.0005811200244352221, "learning_rate": 3.328834907782276e-05, "loss": 0.0003, "step": 5940 }, { "epoch": 8.001484480431849, "grad_norm": 0.0028562431689351797, "learning_rate": 3.325086219823062e-05, "loss": 0.0003, "step": 5950 }, { "epoch": 8.002159244264508, "grad_norm": 0.0011086298618465662, "learning_rate": 3.321337531863848e-05, "loss": 0.0001, "step": 5960 }, { "epoch": 8.002834008097166, "grad_norm": 0.0018863864243030548, "learning_rate": 3.317588843904634e-05, "loss": 0.0001, "step": 5970 }, { "epoch": 8.003508771929825, "grad_norm": 0.0009740761015564203, "learning_rate": 3.313840155945419e-05, "loss": 0.0002, "step": 5980 }, { "epoch": 8.004183535762483, "grad_norm": 0.0005378098576329648, "learning_rate": 3.310091467986205e-05, "loss": 0.0001, "step": 5990 }, { "epoch": 8.004858299595142, "grad_norm": 0.001058222958818078, "learning_rate": 3.306342780026991e-05, "loss": 0.0001, "step": 6000 }, { "epoch": 8.0055330634278, "grad_norm": 0.0010611525503918529, "learning_rate": 3.302594092067777e-05, "loss": 0.0001, "step": 6010 }, { "epoch": 8.006207827260459, "grad_norm": 0.002727857790887356, "learning_rate": 3.298845404108562e-05, "loss": 0.0001, "step": 6020 }, { "epoch": 8.006882591093117, "grad_norm": 0.0007821051403880119, "learning_rate": 3.295096716149348e-05, "loss": 0.0017, "step": 6030 }, { "epoch": 8.007557354925776, "grad_norm": 0.001169922179542482, "learning_rate": 3.2913480281901335e-05, "loss": 0.0001, "step": 6040 }, { "epoch": 8.008232118758434, "grad_norm": 0.0011363876983523369, "learning_rate": 3.287599340230919e-05, "loss": 0.0001, "step": 6050 }, { "epoch": 8.008906882591093, "grad_norm": 0.0005207853973843157, "learning_rate": 3.283850652271705e-05, "loss": 0.6813, "step": 6060 }, { "epoch": 8.00958164642375, "grad_norm": 0.0005264964420348406, "learning_rate": 3.280101964312491e-05, "loss": 0.0001, "step": 6070 }, { "epoch": 8.01025641025641, "grad_norm": 0.0005870209424756467, "learning_rate": 3.2763532763532764e-05, "loss": 0.0001, "step": 6080 }, { "epoch": 8.01093117408907, "grad_norm": 0.0016355343395844102, "learning_rate": 3.272604588394062e-05, "loss": 0.0004, "step": 6090 }, { "epoch": 8.011605937921727, "grad_norm": 0.004568756558001041, "learning_rate": 3.2688559004348476e-05, "loss": 0.0004, "step": 6100 }, { "epoch": 8.012280701754387, "grad_norm": 0.0005888245650567114, "learning_rate": 3.265107212475634e-05, "loss": 0.0001, "step": 6110 }, { "epoch": 8.012955465587044, "grad_norm": 0.0023943374399095774, "learning_rate": 3.2613585245164194e-05, "loss": 0.0, "step": 6120 }, { "epoch": 8.013630229419704, "grad_norm": 0.0004357252037152648, "learning_rate": 3.257609836557205e-05, "loss": 0.0002, "step": 6130 }, { "epoch": 8.014304993252361, "grad_norm": 0.0006332327611744404, "learning_rate": 3.2538611485979906e-05, "loss": 0.0001, "step": 6140 }, { "epoch": 8.01497975708502, "grad_norm": 0.0006531701656058431, "learning_rate": 3.250112460638777e-05, "loss": 0.0001, "step": 6150 }, { "epoch": 8.015654520917678, "grad_norm": 0.0005107235629111528, "learning_rate": 3.246363772679562e-05, "loss": 0.0, "step": 6160 }, { "epoch": 8.016329284750338, "grad_norm": 0.012723034247756004, "learning_rate": 3.242615084720348e-05, "loss": 0.0001, "step": 6170 }, { "epoch": 8.017004048582995, "grad_norm": 0.0427851527929306, "learning_rate": 3.2388663967611336e-05, "loss": 0.0002, "step": 6180 }, { "epoch": 8.017678812415655, "grad_norm": 0.001141960732638836, "learning_rate": 3.23511770880192e-05, "loss": 0.0, "step": 6190 }, { "epoch": 8.018353576248312, "grad_norm": 0.0015029623173177242, "learning_rate": 3.231369020842705e-05, "loss": 0.0001, "step": 6200 }, { "epoch": 8.019028340080972, "grad_norm": 0.0005648156511597335, "learning_rate": 3.227620332883491e-05, "loss": 0.0001, "step": 6210 }, { "epoch": 8.01970310391363, "grad_norm": 0.0006971880211494863, "learning_rate": 3.2238716449242766e-05, "loss": 0.0, "step": 6220 }, { "epoch": 8.020377867746289, "grad_norm": 0.0005205124034546316, "learning_rate": 3.220122956965063e-05, "loss": 0.0001, "step": 6230 }, { "epoch": 8.021052631578947, "grad_norm": 0.0007245125016197562, "learning_rate": 3.216374269005848e-05, "loss": 0.0001, "step": 6240 }, { "epoch": 8.021727395411606, "grad_norm": 0.0005247213994152844, "learning_rate": 3.212625581046634e-05, "loss": 0.0001, "step": 6250 }, { "epoch": 8.022402159244265, "grad_norm": 0.0005060233525000513, "learning_rate": 3.2088768930874195e-05, "loss": 0.0014, "step": 6260 }, { "epoch": 8.023076923076923, "grad_norm": 0.01399776991456747, "learning_rate": 3.205128205128206e-05, "loss": 0.0001, "step": 6270 }, { "epoch": 8.023751686909582, "grad_norm": 0.0013257871614769101, "learning_rate": 3.201379517168991e-05, "loss": 0.0, "step": 6280 }, { "epoch": 8.02442645074224, "grad_norm": 0.00038729843799956143, "learning_rate": 3.197630829209777e-05, "loss": 0.0, "step": 6290 }, { "epoch": 8.0251012145749, "grad_norm": 0.0013562028761953115, "learning_rate": 3.1938821412505625e-05, "loss": 0.0002, "step": 6300 }, { "epoch": 8.025775978407557, "grad_norm": 0.0023358569014817476, "learning_rate": 3.190133453291348e-05, "loss": 0.0, "step": 6310 }, { "epoch": 8.026450742240216, "grad_norm": 0.0007051244028843939, "learning_rate": 3.186384765332134e-05, "loss": 0.0, "step": 6320 }, { "epoch": 8.027125506072874, "grad_norm": 0.00045763421803712845, "learning_rate": 3.18263607737292e-05, "loss": 0.0, "step": 6330 }, { "epoch": 8.027800269905534, "grad_norm": 0.0003405519819352776, "learning_rate": 3.1788873894137055e-05, "loss": 0.0, "step": 6340 }, { "epoch": 8.028475033738191, "grad_norm": 0.0009031207882799208, "learning_rate": 3.175138701454491e-05, "loss": 0.3007, "step": 6350 }, { "epoch": 8.02914979757085, "grad_norm": 0.00048344547394663095, "learning_rate": 3.171390013495277e-05, "loss": 0.0, "step": 6360 }, { "epoch": 8.029824561403508, "grad_norm": 0.005110772326588631, "learning_rate": 3.167641325536063e-05, "loss": 0.0, "step": 6370 }, { "epoch": 8.030499325236168, "grad_norm": 0.0005193505785427988, "learning_rate": 3.1638926375768485e-05, "loss": 1.2393, "step": 6380 }, { "epoch": 8.031174089068825, "grad_norm": 0.001544152619317174, "learning_rate": 3.160143949617634e-05, "loss": 0.0, "step": 6390 }, { "epoch": 8.031848852901485, "grad_norm": 0.004015884827822447, "learning_rate": 3.15639526165842e-05, "loss": 0.0001, "step": 6400 }, { "epoch": 8.032523616734142, "grad_norm": 0.005030054599046707, "learning_rate": 3.152646573699205e-05, "loss": 0.0002, "step": 6410 }, { "epoch": 8.033198380566802, "grad_norm": 0.08386117219924927, "learning_rate": 3.148897885739991e-05, "loss": 0.0002, "step": 6420 }, { "epoch": 8.03387314439946, "grad_norm": 0.004819917026907206, "learning_rate": 3.145149197780777e-05, "loss": 0.1815, "step": 6430 }, { "epoch": 8.034547908232119, "grad_norm": 0.0022033504210412502, "learning_rate": 3.1414005098215627e-05, "loss": 0.0001, "step": 6440 }, { "epoch": 8.035222672064778, "grad_norm": 0.0040964060463011265, "learning_rate": 3.137651821862348e-05, "loss": 0.0001, "step": 6450 }, { "epoch": 8.035897435897436, "grad_norm": 0.004042464308440685, "learning_rate": 3.133903133903134e-05, "loss": 0.0001, "step": 6460 }, { "epoch": 8.036572199730095, "grad_norm": 0.0027346210554242134, "learning_rate": 3.1301544459439194e-05, "loss": 0.0001, "step": 6470 }, { "epoch": 8.037246963562753, "grad_norm": 0.0005888897576369345, "learning_rate": 3.1264057579847056e-05, "loss": 0.0001, "step": 6480 }, { "epoch": 8.037921727395412, "grad_norm": 0.004620389547199011, "learning_rate": 3.122657070025491e-05, "loss": 0.0001, "step": 6490 }, { "epoch": 8.03859649122807, "grad_norm": 0.0017953782808035612, "learning_rate": 3.118908382066277e-05, "loss": 0.0001, "step": 6500 }, { "epoch": 8.03927125506073, "grad_norm": 0.0019287167815491557, "learning_rate": 3.1151596941070624e-05, "loss": 0.0, "step": 6510 }, { "epoch": 8.039946018893387, "grad_norm": 0.017189156264066696, "learning_rate": 3.1114110061478486e-05, "loss": 0.0001, "step": 6520 }, { "epoch": 8.040620782726046, "grad_norm": 0.0002868880983442068, "learning_rate": 3.107662318188634e-05, "loss": 0.0001, "step": 6530 }, { "epoch": 8.041295546558704, "grad_norm": 0.003237192053347826, "learning_rate": 3.10391363022942e-05, "loss": 0.0, "step": 6540 }, { "epoch": 8.041970310391363, "grad_norm": 0.010104048997163773, "learning_rate": 3.1001649422702054e-05, "loss": 0.0001, "step": 6550 }, { "epoch": 8.042645074224021, "grad_norm": 0.0012962371110916138, "learning_rate": 3.0964162543109916e-05, "loss": 0.0001, "step": 6560 }, { "epoch": 8.04331983805668, "grad_norm": 0.0021973999682813883, "learning_rate": 3.0926675663517765e-05, "loss": 0.0001, "step": 6570 }, { "epoch": 8.043994601889338, "grad_norm": 0.004213243722915649, "learning_rate": 3.088918878392563e-05, "loss": 0.0001, "step": 6580 }, { "epoch": 8.044669365721997, "grad_norm": 0.0007371046231128275, "learning_rate": 3.0851701904333484e-05, "loss": 0.0, "step": 6590 }, { "epoch": 8.045344129554655, "grad_norm": 0.0029181931167840958, "learning_rate": 3.0814215024741346e-05, "loss": 0.0001, "step": 6600 }, { "epoch": 8.046018893387314, "grad_norm": 0.001932345563545823, "learning_rate": 3.0776728145149195e-05, "loss": 0.0001, "step": 6610 }, { "epoch": 8.046693657219974, "grad_norm": 0.0020557758398354053, "learning_rate": 3.073924126555706e-05, "loss": 1.2559, "step": 6620 }, { "epoch": 8.047368421052632, "grad_norm": 0.005378492642194033, "learning_rate": 3.0701754385964913e-05, "loss": 0.084, "step": 6630 }, { "epoch": 8.048043184885291, "grad_norm": 0.0023935220669955015, "learning_rate": 3.0664267506372776e-05, "loss": 0.2057, "step": 6640 }, { "epoch": 8.048717948717949, "grad_norm": 0.0029694943223148584, "learning_rate": 3.0626780626780625e-05, "loss": 0.0001, "step": 6650 }, { "epoch": 8.049392712550608, "grad_norm": 0.0006969795795157552, "learning_rate": 3.058929374718849e-05, "loss": 0.0001, "step": 6660 }, { "epoch": 8.05, "eval_accuracy": 0.8928571428571429, "eval_f1": 0.8894495468057416, "eval_loss": 0.6908820867538452, "eval_runtime": 70.8817, "eval_samples_per_second": 1.58, "eval_steps_per_second": 1.58, "step": 6669 }, { "epoch": 9.000067476383267, "grad_norm": 0.007031037472188473, "learning_rate": 3.055180686759634e-05, "loss": 0.0001, "step": 6670 }, { "epoch": 9.000742240215924, "grad_norm": 0.0013843988999724388, "learning_rate": 3.05143199880042e-05, "loss": 0.0001, "step": 6680 }, { "epoch": 9.001417004048584, "grad_norm": 0.07400429248809814, "learning_rate": 3.0476833108412055e-05, "loss": 0.0002, "step": 6690 }, { "epoch": 9.002091767881241, "grad_norm": 0.001845911960117519, "learning_rate": 3.0439346228819914e-05, "loss": 0.0, "step": 6700 }, { "epoch": 9.0027665317139, "grad_norm": 0.00020295576541684568, "learning_rate": 3.0401859349227773e-05, "loss": 0.0014, "step": 6710 }, { "epoch": 9.003441295546558, "grad_norm": 0.001036637695506215, "learning_rate": 3.0364372469635626e-05, "loss": 0.0003, "step": 6720 }, { "epoch": 9.004116059379218, "grad_norm": 0.001262377598322928, "learning_rate": 3.0326885590043485e-05, "loss": 0.0001, "step": 6730 }, { "epoch": 9.004790823211875, "grad_norm": 0.0012167665408924222, "learning_rate": 3.0289398710451344e-05, "loss": 0.6511, "step": 6740 }, { "epoch": 9.005465587044535, "grad_norm": 0.0019521707436069846, "learning_rate": 3.0251911830859203e-05, "loss": 0.0, "step": 6750 }, { "epoch": 9.006140350877192, "grad_norm": 0.0013618938392028213, "learning_rate": 3.0214424951267055e-05, "loss": 0.0001, "step": 6760 }, { "epoch": 9.006815114709852, "grad_norm": 0.0009306151187047362, "learning_rate": 3.0176938071674915e-05, "loss": 0.0001, "step": 6770 }, { "epoch": 9.00748987854251, "grad_norm": 0.0007624576683156192, "learning_rate": 3.0139451192082774e-05, "loss": 0.0001, "step": 6780 }, { "epoch": 9.008164642375169, "grad_norm": 0.0007957870257087052, "learning_rate": 3.0101964312490626e-05, "loss": 0.4765, "step": 6790 }, { "epoch": 9.008839406207827, "grad_norm": 0.47597193717956543, "learning_rate": 3.0064477432898485e-05, "loss": 0.0019, "step": 6800 }, { "epoch": 9.009514170040486, "grad_norm": 0.0003396008105482906, "learning_rate": 3.0026990553306344e-05, "loss": 0.0001, "step": 6810 }, { "epoch": 9.010188933873144, "grad_norm": 0.0011485237628221512, "learning_rate": 2.9989503673714204e-05, "loss": 0.0, "step": 6820 }, { "epoch": 9.010863697705803, "grad_norm": 0.0008013169863261282, "learning_rate": 2.9952016794122056e-05, "loss": 0.0001, "step": 6830 }, { "epoch": 9.011538461538462, "grad_norm": 0.00038786802906543016, "learning_rate": 2.9914529914529915e-05, "loss": 0.0, "step": 6840 }, { "epoch": 9.01221322537112, "grad_norm": 0.003582603298127651, "learning_rate": 2.9877043034937774e-05, "loss": 0.9191, "step": 6850 }, { "epoch": 9.01288798920378, "grad_norm": 0.0014808046398684382, "learning_rate": 2.9839556155345634e-05, "loss": 0.0001, "step": 6860 }, { "epoch": 9.013562753036437, "grad_norm": 0.01157829724252224, "learning_rate": 2.9802069275753486e-05, "loss": 0.0001, "step": 6870 }, { "epoch": 9.014237516869096, "grad_norm": 0.007076776586472988, "learning_rate": 2.9764582396161345e-05, "loss": 0.0012, "step": 6880 }, { "epoch": 9.014912280701754, "grad_norm": 0.003984262701123953, "learning_rate": 2.9727095516569204e-05, "loss": 0.0001, "step": 6890 }, { "epoch": 9.015587044534414, "grad_norm": 0.00039073076914064586, "learning_rate": 2.9689608636977063e-05, "loss": 0.0001, "step": 6900 }, { "epoch": 9.016261808367071, "grad_norm": 0.005625125020742416, "learning_rate": 2.9652121757384916e-05, "loss": 0.0001, "step": 6910 }, { "epoch": 9.01693657219973, "grad_norm": 0.0015515730483457446, "learning_rate": 2.9614634877792775e-05, "loss": 0.0, "step": 6920 }, { "epoch": 9.017611336032388, "grad_norm": 0.0017237714491784573, "learning_rate": 2.9577147998200634e-05, "loss": 0.0, "step": 6930 }, { "epoch": 9.018286099865048, "grad_norm": 0.008184783160686493, "learning_rate": 2.9539661118608486e-05, "loss": 0.0001, "step": 6940 }, { "epoch": 9.018960863697705, "grad_norm": 0.002028749557211995, "learning_rate": 2.9502174239016346e-05, "loss": 0.0001, "step": 6950 }, { "epoch": 9.019635627530365, "grad_norm": 0.0036216990556567907, "learning_rate": 2.9464687359424205e-05, "loss": 0.0, "step": 6960 }, { "epoch": 9.020310391363022, "grad_norm": 0.0013016269076615572, "learning_rate": 2.942720047983206e-05, "loss": 0.0001, "step": 6970 }, { "epoch": 9.020985155195682, "grad_norm": 0.00772570027038455, "learning_rate": 2.9389713600239916e-05, "loss": 0.0001, "step": 6980 }, { "epoch": 9.02165991902834, "grad_norm": 0.0003020280273631215, "learning_rate": 2.9352226720647776e-05, "loss": 0.0, "step": 6990 }, { "epoch": 9.022334682860999, "grad_norm": 0.0012822924181818962, "learning_rate": 2.931473984105563e-05, "loss": 0.0, "step": 7000 }, { "epoch": 9.023009446693656, "grad_norm": 0.0010099551873281598, "learning_rate": 2.927725296146349e-05, "loss": 0.0001, "step": 7010 }, { "epoch": 9.023684210526316, "grad_norm": 0.0024363386910408735, "learning_rate": 2.9239766081871346e-05, "loss": 0.0001, "step": 7020 }, { "epoch": 9.024358974358975, "grad_norm": 0.0023049945011734962, "learning_rate": 2.9202279202279202e-05, "loss": 0.0001, "step": 7030 }, { "epoch": 9.025033738191633, "grad_norm": 0.0029273051768541336, "learning_rate": 2.916479232268706e-05, "loss": 0.0, "step": 7040 }, { "epoch": 9.025708502024292, "grad_norm": 0.003555365838110447, "learning_rate": 2.9127305443094917e-05, "loss": 0.0001, "step": 7050 }, { "epoch": 9.02638326585695, "grad_norm": 0.0033711865544319153, "learning_rate": 2.9089818563502773e-05, "loss": 0.0, "step": 7060 }, { "epoch": 9.02705802968961, "grad_norm": 0.00046359331463463604, "learning_rate": 2.9052331683910632e-05, "loss": 0.0001, "step": 7070 }, { "epoch": 9.027732793522267, "grad_norm": 0.0003137718595098704, "learning_rate": 2.901484480431849e-05, "loss": 0.0, "step": 7080 }, { "epoch": 9.028407557354926, "grad_norm": 0.0016707087634131312, "learning_rate": 2.8977357924726343e-05, "loss": 0.0, "step": 7090 }, { "epoch": 9.029082321187584, "grad_norm": 0.0012837687972933054, "learning_rate": 2.8939871045134203e-05, "loss": 0.0, "step": 7100 }, { "epoch": 9.029757085020243, "grad_norm": 0.00030405522556975484, "learning_rate": 2.8902384165542062e-05, "loss": 0.0, "step": 7110 }, { "epoch": 9.030431848852901, "grad_norm": 0.000334856566041708, "learning_rate": 2.886489728594992e-05, "loss": 0.0, "step": 7120 }, { "epoch": 9.03110661268556, "grad_norm": 0.00024141219910234213, "learning_rate": 2.8827410406357773e-05, "loss": 0.0, "step": 7130 }, { "epoch": 9.031781376518218, "grad_norm": 0.0014251351822167635, "learning_rate": 2.8789923526765633e-05, "loss": 0.0001, "step": 7140 }, { "epoch": 9.032456140350877, "grad_norm": 0.0001798996381694451, "learning_rate": 2.875243664717349e-05, "loss": 0.0, "step": 7150 }, { "epoch": 9.033130904183535, "grad_norm": 0.00026806764071807265, "learning_rate": 2.871494976758135e-05, "loss": 0.0, "step": 7160 }, { "epoch": 9.033805668016194, "grad_norm": 0.001039984286762774, "learning_rate": 2.8677462887989203e-05, "loss": 0.0, "step": 7170 }, { "epoch": 9.034480431848852, "grad_norm": 0.00029442558297887444, "learning_rate": 2.8639976008397062e-05, "loss": 0.0, "step": 7180 }, { "epoch": 9.035155195681511, "grad_norm": 0.0010803727200254798, "learning_rate": 2.860248912880492e-05, "loss": 0.0, "step": 7190 }, { "epoch": 9.035829959514171, "grad_norm": 0.0009579784818924963, "learning_rate": 2.8565002249212774e-05, "loss": 0.0, "step": 7200 }, { "epoch": 9.036504723346829, "grad_norm": 0.00148207473102957, "learning_rate": 2.8527515369620633e-05, "loss": 0.5707, "step": 7210 }, { "epoch": 9.037179487179488, "grad_norm": 0.0010521633084863424, "learning_rate": 2.8490028490028492e-05, "loss": 0.0627, "step": 7220 }, { "epoch": 9.037854251012146, "grad_norm": 0.0016639038221910596, "learning_rate": 2.845254161043635e-05, "loss": 0.0205, "step": 7230 }, { "epoch": 9.038529014844805, "grad_norm": 0.0019760627765208483, "learning_rate": 2.8415054730844204e-05, "loss": 0.0001, "step": 7240 }, { "epoch": 9.039203778677463, "grad_norm": 0.0023020838852971792, "learning_rate": 2.8377567851252063e-05, "loss": 0.0001, "step": 7250 }, { "epoch": 9.039878542510122, "grad_norm": 0.9819605946540833, "learning_rate": 2.8340080971659922e-05, "loss": 0.0009, "step": 7260 }, { "epoch": 9.04055330634278, "grad_norm": 0.002409159205853939, "learning_rate": 2.830259409206778e-05, "loss": 0.6277, "step": 7270 }, { "epoch": 9.041228070175439, "grad_norm": 298.6535339355469, "learning_rate": 2.8265107212475634e-05, "loss": 0.947, "step": 7280 }, { "epoch": 9.041902834008097, "grad_norm": 0.034443099051713943, "learning_rate": 2.8227620332883493e-05, "loss": 0.0001, "step": 7290 }, { "epoch": 9.042577597840756, "grad_norm": 0.040302518755197525, "learning_rate": 2.8190133453291352e-05, "loss": 0.003, "step": 7300 }, { "epoch": 9.043252361673414, "grad_norm": 0.0009369853651151061, "learning_rate": 2.8152646573699204e-05, "loss": 0.0, "step": 7310 }, { "epoch": 9.043927125506073, "grad_norm": 0.0013028283137828112, "learning_rate": 2.8115159694107064e-05, "loss": 0.0, "step": 7320 }, { "epoch": 9.04460188933873, "grad_norm": 0.001541333505883813, "learning_rate": 2.8077672814514923e-05, "loss": 0.0, "step": 7330 }, { "epoch": 9.04527665317139, "grad_norm": 0.000400466175051406, "learning_rate": 2.8040185934922782e-05, "loss": 0.0006, "step": 7340 }, { "epoch": 9.045951417004048, "grad_norm": 0.001137162558734417, "learning_rate": 2.8002699055330634e-05, "loss": 0.0002, "step": 7350 }, { "epoch": 9.046626180836707, "grad_norm": 0.0009733253973536193, "learning_rate": 2.7965212175738493e-05, "loss": 0.0, "step": 7360 }, { "epoch": 9.047300944669365, "grad_norm": 0.0002777110203169286, "learning_rate": 2.792772529614635e-05, "loss": 0.0, "step": 7370 }, { "epoch": 9.047975708502024, "grad_norm": 0.0009547212393954396, "learning_rate": 2.789023841655421e-05, "loss": 0.0, "step": 7380 }, { "epoch": 9.048650472334684, "grad_norm": 0.0003457583661656827, "learning_rate": 2.7852751536962064e-05, "loss": 0.0008, "step": 7390 }, { "epoch": 9.049325236167341, "grad_norm": 0.0019107568077743053, "learning_rate": 2.781526465736992e-05, "loss": 0.0009, "step": 7400 }, { "epoch": 9.05, "grad_norm": 0.0008839426445774734, "learning_rate": 2.777777777777778e-05, "loss": 0.0018, "step": 7410 }, { "epoch": 9.05, "eval_accuracy": 0.8928571428571429, "eval_f1": 0.8916871416871418, "eval_loss": 0.9169295430183411, "eval_runtime": 70.5688, "eval_samples_per_second": 1.587, "eval_steps_per_second": 1.587, "step": 7410 }, { "epoch": 10.00067476383266, "grad_norm": 0.0002696131123229861, "learning_rate": 2.7740290898185638e-05, "loss": 0.002, "step": 7420 }, { "epoch": 10.001349527665317, "grad_norm": 0.00017847323033493012, "learning_rate": 2.770280401859349e-05, "loss": 0.0, "step": 7430 }, { "epoch": 10.002024291497976, "grad_norm": 0.0010017943568527699, "learning_rate": 2.766531713900135e-05, "loss": 0.0, "step": 7440 }, { "epoch": 10.002699055330634, "grad_norm": 0.0006036867271177471, "learning_rate": 2.762783025940921e-05, "loss": 0.0, "step": 7450 }, { "epoch": 10.003373819163293, "grad_norm": 0.00019583333050832152, "learning_rate": 2.759034337981706e-05, "loss": 0.1066, "step": 7460 }, { "epoch": 10.004048582995951, "grad_norm": 0.08632688224315643, "learning_rate": 2.755285650022492e-05, "loss": 0.0003, "step": 7470 }, { "epoch": 10.00472334682861, "grad_norm": 0.00013941490033175796, "learning_rate": 2.751536962063278e-05, "loss": 0.0, "step": 7480 }, { "epoch": 10.005398110661268, "grad_norm": 0.0003023550088983029, "learning_rate": 2.747788274104064e-05, "loss": 0.0001, "step": 7490 }, { "epoch": 10.006072874493928, "grad_norm": 0.0005739156622439623, "learning_rate": 2.744039586144849e-05, "loss": 0.3069, "step": 7500 }, { "epoch": 10.006747638326585, "grad_norm": 0.0005304102669470012, "learning_rate": 2.740290898185635e-05, "loss": 0.0, "step": 7510 }, { "epoch": 10.007422402159245, "grad_norm": 0.0009174313163384795, "learning_rate": 2.736542210226421e-05, "loss": 0.0, "step": 7520 }, { "epoch": 10.008097165991902, "grad_norm": 0.0004933126620016992, "learning_rate": 2.732793522267207e-05, "loss": 0.0, "step": 7530 }, { "epoch": 10.008771929824562, "grad_norm": 0.002700564218685031, "learning_rate": 2.729044834307992e-05, "loss": 0.0, "step": 7540 }, { "epoch": 10.00944669365722, "grad_norm": 0.0008284652722068131, "learning_rate": 2.725296146348778e-05, "loss": 0.7602, "step": 7550 }, { "epoch": 10.010121457489879, "grad_norm": 0.0005742429639212787, "learning_rate": 2.721547458389564e-05, "loss": 0.0013, "step": 7560 }, { "epoch": 10.010796221322536, "grad_norm": 0.0001865791855379939, "learning_rate": 2.7177987704303492e-05, "loss": 1.0409, "step": 7570 }, { "epoch": 10.011470985155196, "grad_norm": 0.0005401599337346852, "learning_rate": 2.714050082471135e-05, "loss": 0.0, "step": 7580 }, { "epoch": 10.012145748987853, "grad_norm": 10.862272262573242, "learning_rate": 2.710301394511921e-05, "loss": 0.6573, "step": 7590 }, { "epoch": 10.012820512820513, "grad_norm": 37.7309455871582, "learning_rate": 2.706552706552707e-05, "loss": 0.7899, "step": 7600 }, { "epoch": 10.013495276653172, "grad_norm": 0.0009414503001607955, "learning_rate": 2.7028040185934922e-05, "loss": 0.0011, "step": 7610 }, { "epoch": 10.01417004048583, "grad_norm": 0.0004630287585314363, "learning_rate": 2.699055330634278e-05, "loss": 0.0001, "step": 7620 }, { "epoch": 10.01484480431849, "grad_norm": 0.0013565809931606054, "learning_rate": 2.695306642675064e-05, "loss": 0.0001, "step": 7630 }, { "epoch": 10.015519568151147, "grad_norm": 0.0022902884520590305, "learning_rate": 2.69155795471585e-05, "loss": 0.0, "step": 7640 }, { "epoch": 10.016194331983806, "grad_norm": 0.0009432418155483902, "learning_rate": 2.687809266756635e-05, "loss": 0.0, "step": 7650 }, { "epoch": 10.016869095816464, "grad_norm": 0.0009669333812780678, "learning_rate": 2.684060578797421e-05, "loss": 0.0001, "step": 7660 }, { "epoch": 10.017543859649123, "grad_norm": 0.0011604432947933674, "learning_rate": 2.680311890838207e-05, "loss": 0.0011, "step": 7670 }, { "epoch": 10.018218623481781, "grad_norm": 0.0037133977748453617, "learning_rate": 2.6765632028789922e-05, "loss": 0.0089, "step": 7680 }, { "epoch": 10.01889338731444, "grad_norm": 0.0019840672612190247, "learning_rate": 2.672814514919778e-05, "loss": 0.0001, "step": 7690 }, { "epoch": 10.019568151147098, "grad_norm": 0.0010515927569940686, "learning_rate": 2.669065826960564e-05, "loss": 0.0, "step": 7700 }, { "epoch": 10.020242914979757, "grad_norm": 0.00031027224031277, "learning_rate": 2.66531713900135e-05, "loss": 0.0001, "step": 7710 }, { "epoch": 10.020917678812415, "grad_norm": 0.0026109693571925163, "learning_rate": 2.6615684510421352e-05, "loss": 0.0001, "step": 7720 }, { "epoch": 10.021592442645074, "grad_norm": 0.001366731128655374, "learning_rate": 2.657819763082921e-05, "loss": 0.0, "step": 7730 }, { "epoch": 10.022267206477732, "grad_norm": 0.0010099642677232623, "learning_rate": 2.654071075123707e-05, "loss": 0.0, "step": 7740 }, { "epoch": 10.022941970310391, "grad_norm": 0.0007431610720232129, "learning_rate": 2.6503223871644926e-05, "loss": 0.3974, "step": 7750 }, { "epoch": 10.023616734143049, "grad_norm": 0.0005235990975052118, "learning_rate": 2.6465736992052782e-05, "loss": 0.0001, "step": 7760 }, { "epoch": 10.024291497975709, "grad_norm": 0.002703143283724785, "learning_rate": 2.642825011246064e-05, "loss": 0.9305, "step": 7770 }, { "epoch": 10.024966261808368, "grad_norm": 0.0013169089797884226, "learning_rate": 2.6390763232868497e-05, "loss": 0.0061, "step": 7780 }, { "epoch": 10.025641025641026, "grad_norm": 0.0006970075191929936, "learning_rate": 2.6353276353276356e-05, "loss": 0.0, "step": 7790 }, { "epoch": 10.026315789473685, "grad_norm": 0.0022921450436115265, "learning_rate": 2.6315789473684212e-05, "loss": 2.1218, "step": 7800 }, { "epoch": 10.026990553306343, "grad_norm": 0.015075190924108028, "learning_rate": 2.6278302594092068e-05, "loss": 0.0001, "step": 7810 }, { "epoch": 10.027665317139002, "grad_norm": 0.0003634750028140843, "learning_rate": 2.6240815714499927e-05, "loss": 0.0003, "step": 7820 }, { "epoch": 10.02834008097166, "grad_norm": 0.005189963150769472, "learning_rate": 2.6203328834907783e-05, "loss": 0.0002, "step": 7830 }, { "epoch": 10.029014844804319, "grad_norm": 0.0013347219210118055, "learning_rate": 2.616584195531564e-05, "loss": 0.0004, "step": 7840 }, { "epoch": 10.029689608636977, "grad_norm": 0.011999278329312801, "learning_rate": 2.6128355075723498e-05, "loss": 0.0001, "step": 7850 }, { "epoch": 10.030364372469636, "grad_norm": 0.0007896720780991018, "learning_rate": 2.6090868196131357e-05, "loss": 0.0001, "step": 7860 }, { "epoch": 10.031039136302294, "grad_norm": 0.004586980678141117, "learning_rate": 2.605338131653921e-05, "loss": 0.0001, "step": 7870 }, { "epoch": 10.031713900134953, "grad_norm": 0.001417971565388143, "learning_rate": 2.601589443694707e-05, "loss": 0.0, "step": 7880 }, { "epoch": 10.03238866396761, "grad_norm": 0.0019554668106138706, "learning_rate": 2.5978407557354928e-05, "loss": 0.0001, "step": 7890 }, { "epoch": 10.03306342780027, "grad_norm": 0.028743397444486618, "learning_rate": 2.5940920677762787e-05, "loss": 0.0001, "step": 7900 }, { "epoch": 10.033738191632928, "grad_norm": 0.0008731328416615725, "learning_rate": 2.590343379817064e-05, "loss": 0.0001, "step": 7910 }, { "epoch": 10.034412955465587, "grad_norm": 0.0012366612209007144, "learning_rate": 2.5865946918578498e-05, "loss": 0.0001, "step": 7920 }, { "epoch": 10.035087719298245, "grad_norm": 0.0026165838353335857, "learning_rate": 2.5828460038986357e-05, "loss": 0.0001, "step": 7930 }, { "epoch": 10.035762483130904, "grad_norm": 0.014659812673926353, "learning_rate": 2.579097315939421e-05, "loss": 0.0002, "step": 7940 }, { "epoch": 10.036437246963562, "grad_norm": 0.00143991329241544, "learning_rate": 2.575348627980207e-05, "loss": 0.0, "step": 7950 }, { "epoch": 10.037112010796221, "grad_norm": 0.00752654206007719, "learning_rate": 2.5715999400209928e-05, "loss": 0.0001, "step": 7960 }, { "epoch": 10.03778677462888, "grad_norm": 0.0011906948639079928, "learning_rate": 2.5678512520617787e-05, "loss": 0.0001, "step": 7970 }, { "epoch": 10.038461538461538, "grad_norm": 0.004429694265127182, "learning_rate": 2.564102564102564e-05, "loss": 0.0001, "step": 7980 }, { "epoch": 10.039136302294198, "grad_norm": 0.00023650593357160687, "learning_rate": 2.56035387614335e-05, "loss": 0.0001, "step": 7990 }, { "epoch": 10.039811066126855, "grad_norm": 0.0007866804371587932, "learning_rate": 2.5566051881841358e-05, "loss": 0.0, "step": 8000 }, { "epoch": 10.040485829959515, "grad_norm": 0.0013989802682772279, "learning_rate": 2.5528565002249217e-05, "loss": 0.0001, "step": 8010 }, { "epoch": 10.041160593792172, "grad_norm": 0.0008867586147971451, "learning_rate": 2.549107812265707e-05, "loss": 0.2682, "step": 8020 }, { "epoch": 10.041835357624832, "grad_norm": 0.001083207200281322, "learning_rate": 2.545359124306493e-05, "loss": 0.0, "step": 8030 }, { "epoch": 10.04251012145749, "grad_norm": 0.0010164374252781272, "learning_rate": 2.5416104363472788e-05, "loss": 0.0014, "step": 8040 }, { "epoch": 10.043184885290149, "grad_norm": 0.0032585004810243845, "learning_rate": 2.5378617483880647e-05, "loss": 0.0001, "step": 8050 }, { "epoch": 10.043859649122806, "grad_norm": 0.0007220272673293948, "learning_rate": 2.53411306042885e-05, "loss": 0.0001, "step": 8060 }, { "epoch": 10.044534412955466, "grad_norm": 0.0010795597918331623, "learning_rate": 2.530364372469636e-05, "loss": 0.0001, "step": 8070 }, { "epoch": 10.045209176788124, "grad_norm": 0.0033428198657929897, "learning_rate": 2.5266156845104218e-05, "loss": 0.0, "step": 8080 }, { "epoch": 10.045883940620783, "grad_norm": 0.0007780479500070214, "learning_rate": 2.522866996551207e-05, "loss": 0.0003, "step": 8090 }, { "epoch": 10.04655870445344, "grad_norm": 0.002177152084186673, "learning_rate": 2.519118308591993e-05, "loss": 1.034, "step": 8100 }, { "epoch": 10.0472334682861, "grad_norm": 0.012076308950781822, "learning_rate": 2.515369620632779e-05, "loss": 0.0001, "step": 8110 }, { "epoch": 10.047908232118758, "grad_norm": 0.00882900319993496, "learning_rate": 2.5116209326735644e-05, "loss": 0.0001, "step": 8120 }, { "epoch": 10.048582995951417, "grad_norm": 0.0017163383308798075, "learning_rate": 2.50787224471435e-05, "loss": 0.0002, "step": 8130 }, { "epoch": 10.049257759784076, "grad_norm": 0.07908181846141815, "learning_rate": 2.504123556755136e-05, "loss": 0.0002, "step": 8140 }, { "epoch": 10.049932523616734, "grad_norm": 0.0007900993805378675, "learning_rate": 2.5003748687959215e-05, "loss": 0.0, "step": 8150 }, { "epoch": 10.05, "eval_accuracy": 0.8928571428571429, "eval_f1": 0.8927764491849939, "eval_loss": 0.6104062795639038, "eval_runtime": 74.468, "eval_samples_per_second": 1.504, "eval_steps_per_second": 1.504, "step": 8151 }, { "epoch": 11.000607287449393, "grad_norm": 0.0007240193081088364, "learning_rate": 2.496626180836707e-05, "loss": 0.0, "step": 8160 }, { "epoch": 11.001282051282052, "grad_norm": 0.0002468556631356478, "learning_rate": 2.492877492877493e-05, "loss": 0.11, "step": 8170 }, { "epoch": 11.00195681511471, "grad_norm": 0.0006738382508046925, "learning_rate": 2.4891288049182786e-05, "loss": 0.0009, "step": 8180 }, { "epoch": 11.00263157894737, "grad_norm": 0.0002363823732594028, "learning_rate": 2.485380116959064e-05, "loss": 0.0001, "step": 8190 }, { "epoch": 11.003306342780027, "grad_norm": 0.01611531712114811, "learning_rate": 2.48163142899985e-05, "loss": 0.0001, "step": 8200 }, { "epoch": 11.003981106612686, "grad_norm": 0.00017891006427817047, "learning_rate": 2.4778827410406356e-05, "loss": 0.0001, "step": 8210 }, { "epoch": 11.004655870445344, "grad_norm": 0.0012173757422715425, "learning_rate": 2.4741340530814216e-05, "loss": 0.0, "step": 8220 }, { "epoch": 11.005330634278003, "grad_norm": 0.00027030581259168684, "learning_rate": 2.470385365122207e-05, "loss": 0.0001, "step": 8230 }, { "epoch": 11.006005398110661, "grad_norm": 0.0007059440249577165, "learning_rate": 2.466636677162993e-05, "loss": 0.0038, "step": 8240 }, { "epoch": 11.00668016194332, "grad_norm": 0.0038354801945388317, "learning_rate": 2.4628879892037786e-05, "loss": 0.0, "step": 8250 }, { "epoch": 11.007354925775978, "grad_norm": 0.002050234004855156, "learning_rate": 2.4591393012445645e-05, "loss": 0.0001, "step": 8260 }, { "epoch": 11.008029689608637, "grad_norm": 0.0007953056483529508, "learning_rate": 2.45539061328535e-05, "loss": 0.0001, "step": 8270 }, { "epoch": 11.008704453441295, "grad_norm": 0.0005133861559443176, "learning_rate": 2.451641925326136e-05, "loss": 0.0001, "step": 8280 }, { "epoch": 11.009379217273954, "grad_norm": 0.00046163739170879126, "learning_rate": 2.4478932373669216e-05, "loss": 0.0, "step": 8290 }, { "epoch": 11.010053981106612, "grad_norm": 0.0001449552073609084, "learning_rate": 2.4441445494077075e-05, "loss": 0.3172, "step": 8300 }, { "epoch": 11.010728744939271, "grad_norm": 164.93666076660156, "learning_rate": 2.440395861448493e-05, "loss": 0.6368, "step": 8310 }, { "epoch": 11.011403508771929, "grad_norm": 0.000476795103168115, "learning_rate": 2.4366471734892787e-05, "loss": 0.0, "step": 8320 }, { "epoch": 11.012078272604588, "grad_norm": 0.0037983739748597145, "learning_rate": 2.4328984855300646e-05, "loss": 0.0002, "step": 8330 }, { "epoch": 11.012753036437246, "grad_norm": 0.000796021893620491, "learning_rate": 2.4291497975708502e-05, "loss": 0.0, "step": 8340 }, { "epoch": 11.013427800269906, "grad_norm": 0.0005037175142206252, "learning_rate": 2.425401109611636e-05, "loss": 0.0002, "step": 8350 }, { "epoch": 11.014102564102565, "grad_norm": 0.0043189083226025105, "learning_rate": 2.4216524216524217e-05, "loss": 0.0001, "step": 8360 }, { "epoch": 11.014777327935223, "grad_norm": 0.0015088323270902038, "learning_rate": 2.4179037336932076e-05, "loss": 0.0, "step": 8370 }, { "epoch": 11.015452091767882, "grad_norm": 0.009932787157595158, "learning_rate": 2.414155045733993e-05, "loss": 0.0, "step": 8380 }, { "epoch": 11.01612685560054, "grad_norm": 0.0006705676787532866, "learning_rate": 2.410406357774779e-05, "loss": 0.0, "step": 8390 }, { "epoch": 11.016801619433199, "grad_norm": 0.0004983929102309048, "learning_rate": 2.4066576698155647e-05, "loss": 0.0, "step": 8400 }, { "epoch": 11.017476383265857, "grad_norm": 0.0002321622014278546, "learning_rate": 2.4029089818563506e-05, "loss": 0.0, "step": 8410 }, { "epoch": 11.018151147098516, "grad_norm": 0.00045225844951346517, "learning_rate": 2.399160293897136e-05, "loss": 0.0001, "step": 8420 }, { "epoch": 11.018825910931174, "grad_norm": 0.0006059862207621336, "learning_rate": 2.395411605937922e-05, "loss": 0.0, "step": 8430 }, { "epoch": 11.019500674763833, "grad_norm": 0.00025944746448658407, "learning_rate": 2.3916629179787076e-05, "loss": 0.0, "step": 8440 }, { "epoch": 11.02017543859649, "grad_norm": 0.005270655732601881, "learning_rate": 2.3879142300194932e-05, "loss": 0.0, "step": 8450 }, { "epoch": 11.02085020242915, "grad_norm": 0.0001714004756649956, "learning_rate": 2.384165542060279e-05, "loss": 0.0, "step": 8460 }, { "epoch": 11.021524966261808, "grad_norm": 0.0004896153695881367, "learning_rate": 2.3804168541010647e-05, "loss": 0.0001, "step": 8470 }, { "epoch": 11.022199730094467, "grad_norm": 0.0004871699493378401, "learning_rate": 2.3766681661418506e-05, "loss": 0.0, "step": 8480 }, { "epoch": 11.022874493927125, "grad_norm": 0.00332398503087461, "learning_rate": 2.3729194781826362e-05, "loss": 0.0, "step": 8490 }, { "epoch": 11.023549257759784, "grad_norm": 0.0004967203130945563, "learning_rate": 2.369170790223422e-05, "loss": 0.0, "step": 8500 }, { "epoch": 11.024224021592442, "grad_norm": 0.0006828585756011307, "learning_rate": 2.3654221022642077e-05, "loss": 0.0, "step": 8510 }, { "epoch": 11.024898785425101, "grad_norm": 0.00026437186170369387, "learning_rate": 2.3616734143049933e-05, "loss": 0.0923, "step": 8520 }, { "epoch": 11.025573549257759, "grad_norm": 0.001157809398137033, "learning_rate": 2.3579247263457792e-05, "loss": 0.3747, "step": 8530 }, { "epoch": 11.026248313090418, "grad_norm": 0.0006130054825916886, "learning_rate": 2.3541760383865648e-05, "loss": 0.1333, "step": 8540 }, { "epoch": 11.026923076923078, "grad_norm": 0.004360508639365435, "learning_rate": 2.3504273504273504e-05, "loss": 0.0001, "step": 8550 }, { "epoch": 11.027597840755735, "grad_norm": 0.0038445070385932922, "learning_rate": 2.3466786624681363e-05, "loss": 0.0005, "step": 8560 }, { "epoch": 11.028272604588395, "grad_norm": 0.0003999462933279574, "learning_rate": 2.342929974508922e-05, "loss": 0.0, "step": 8570 }, { "epoch": 11.028947368421052, "grad_norm": 0.0013614681083709002, "learning_rate": 2.3391812865497074e-05, "loss": 0.0, "step": 8580 }, { "epoch": 11.029622132253712, "grad_norm": 8.867596625350416e-05, "learning_rate": 2.3354325985904933e-05, "loss": 0.0, "step": 8590 }, { "epoch": 11.03029689608637, "grad_norm": 0.0005633147084154189, "learning_rate": 2.331683910631279e-05, "loss": 0.0, "step": 8600 }, { "epoch": 11.030971659919029, "grad_norm": 0.0005220117163844407, "learning_rate": 2.327935222672065e-05, "loss": 0.0001, "step": 8610 }, { "epoch": 11.031646423751686, "grad_norm": 0.0004213712236378342, "learning_rate": 2.3241865347128504e-05, "loss": 0.0, "step": 8620 }, { "epoch": 11.032321187584346, "grad_norm": 0.00038689616485498846, "learning_rate": 2.3204378467536363e-05, "loss": 0.0, "step": 8630 }, { "epoch": 11.032995951417004, "grad_norm": 0.00039902018033899367, "learning_rate": 2.316689158794422e-05, "loss": 0.0, "step": 8640 }, { "epoch": 11.033670715249663, "grad_norm": 0.0026982324197888374, "learning_rate": 2.3129404708352078e-05, "loss": 0.0, "step": 8650 }, { "epoch": 11.03434547908232, "grad_norm": 0.0001991643221117556, "learning_rate": 2.3091917828759934e-05, "loss": 0.0, "step": 8660 }, { "epoch": 11.03502024291498, "grad_norm": 0.0019273010548204184, "learning_rate": 2.3054430949167793e-05, "loss": 0.0, "step": 8670 }, { "epoch": 11.035695006747638, "grad_norm": 0.000698404386639595, "learning_rate": 2.301694406957565e-05, "loss": 0.0001, "step": 8680 }, { "epoch": 11.036369770580297, "grad_norm": 0.00025344561436213553, "learning_rate": 2.2979457189983508e-05, "loss": 0.0, "step": 8690 }, { "epoch": 11.037044534412955, "grad_norm": 0.0027119882870465517, "learning_rate": 2.2941970310391364e-05, "loss": 0.4804, "step": 8700 }, { "epoch": 11.037719298245614, "grad_norm": 0.00020401214715093374, "learning_rate": 2.290448343079922e-05, "loss": 0.0, "step": 8710 }, { "epoch": 11.038394062078273, "grad_norm": 0.0004772163520101458, "learning_rate": 2.286699655120708e-05, "loss": 0.0, "step": 8720 }, { "epoch": 11.039068825910931, "grad_norm": 0.0004061859508510679, "learning_rate": 2.2829509671614935e-05, "loss": 0.0001, "step": 8730 }, { "epoch": 11.03974358974359, "grad_norm": 0.0010080209467560053, "learning_rate": 2.2792022792022794e-05, "loss": 0.0, "step": 8740 }, { "epoch": 11.040418353576248, "grad_norm": 0.00021367882436607033, "learning_rate": 2.275453591243065e-05, "loss": 0.0, "step": 8750 }, { "epoch": 11.041093117408908, "grad_norm": 0.002230451675131917, "learning_rate": 2.271704903283851e-05, "loss": 0.0, "step": 8760 }, { "epoch": 11.041767881241565, "grad_norm": 0.0003300936659798026, "learning_rate": 2.2679562153246365e-05, "loss": 0.0, "step": 8770 }, { "epoch": 11.042442645074225, "grad_norm": 0.0023498530499637127, "learning_rate": 2.2642075273654224e-05, "loss": 0.0001, "step": 8780 }, { "epoch": 11.043117408906882, "grad_norm": 0.0011958705727010965, "learning_rate": 2.260458839406208e-05, "loss": 0.0001, "step": 8790 }, { "epoch": 11.043792172739542, "grad_norm": 0.0022039199247956276, "learning_rate": 2.256710151446994e-05, "loss": 0.0, "step": 8800 }, { "epoch": 11.0444669365722, "grad_norm": 0.0003688375581987202, "learning_rate": 2.2529614634877794e-05, "loss": 0.0059, "step": 8810 }, { "epoch": 11.045141700404859, "grad_norm": 0.0007805086788721383, "learning_rate": 2.2492127755285654e-05, "loss": 0.0, "step": 8820 }, { "epoch": 11.045816464237516, "grad_norm": 0.0009934029076248407, "learning_rate": 2.245464087569351e-05, "loss": 0.0, "step": 8830 }, { "epoch": 11.046491228070176, "grad_norm": 0.001246001455001533, "learning_rate": 2.2417153996101365e-05, "loss": 0.0, "step": 8840 }, { "epoch": 11.047165991902833, "grad_norm": 9.812816279008985e-05, "learning_rate": 2.2379667116509224e-05, "loss": 0.0, "step": 8850 }, { "epoch": 11.047840755735493, "grad_norm": 0.0004926809924654663, "learning_rate": 2.234218023691708e-05, "loss": 0.0, "step": 8860 }, { "epoch": 11.04851551956815, "grad_norm": 0.0003611448628362268, "learning_rate": 2.230469335732494e-05, "loss": 0.0, "step": 8870 }, { "epoch": 11.04919028340081, "grad_norm": 0.000520729401614517, "learning_rate": 2.2267206477732795e-05, "loss": 0.0, "step": 8880 }, { "epoch": 11.049865047233467, "grad_norm": 0.00023293115373235196, "learning_rate": 2.2229719598140654e-05, "loss": 0.0, "step": 8890 }, { "epoch": 11.05, "eval_accuracy": 0.9196428571428571, "eval_f1": 0.9207212368977075, "eval_loss": 0.6125034689903259, "eval_runtime": 71.2839, "eval_samples_per_second": 1.571, "eval_steps_per_second": 1.571, "step": 8892 }, { "epoch": 12.000539811066126, "grad_norm": 0.0007233197102323174, "learning_rate": 2.219223271854851e-05, "loss": 0.4448, "step": 8900 }, { "epoch": 12.001214574898786, "grad_norm": 0.0002516189415473491, "learning_rate": 2.2154745838956366e-05, "loss": 0.0354, "step": 8910 }, { "epoch": 12.001889338731443, "grad_norm": 0.0003420355205889791, "learning_rate": 2.2117258959364225e-05, "loss": 0.0, "step": 8920 }, { "epoch": 12.002564102564103, "grad_norm": 0.0004494291788432747, "learning_rate": 2.207977207977208e-05, "loss": 0.0, "step": 8930 }, { "epoch": 12.003238866396762, "grad_norm": 0.00031234361813403666, "learning_rate": 2.2042285200179936e-05, "loss": 0.0, "step": 8940 }, { "epoch": 12.00391363022942, "grad_norm": 0.00012721461826004088, "learning_rate": 2.2004798320587796e-05, "loss": 0.0838, "step": 8950 }, { "epoch": 12.004588394062079, "grad_norm": 0.000489223632030189, "learning_rate": 2.196731144099565e-05, "loss": 0.0, "step": 8960 }, { "epoch": 12.005263157894737, "grad_norm": 0.0033521486911922693, "learning_rate": 2.1929824561403507e-05, "loss": 0.1973, "step": 8970 }, { "epoch": 12.005937921727396, "grad_norm": 0.009397713467478752, "learning_rate": 2.1892337681811366e-05, "loss": 0.0, "step": 8980 }, { "epoch": 12.006612685560054, "grad_norm": 0.006849181838333607, "learning_rate": 2.1854850802219222e-05, "loss": 0.0, "step": 8990 }, { "epoch": 12.007287449392713, "grad_norm": 0.0006626849644817412, "learning_rate": 2.181736392262708e-05, "loss": 0.0, "step": 9000 }, { "epoch": 12.00796221322537, "grad_norm": 0.000323317275615409, "learning_rate": 2.1779877043034937e-05, "loss": 0.4233, "step": 9010 }, { "epoch": 12.00863697705803, "grad_norm": 0.00013916198804508895, "learning_rate": 2.1742390163442796e-05, "loss": 0.0006, "step": 9020 }, { "epoch": 12.009311740890688, "grad_norm": 0.0004281499423086643, "learning_rate": 2.1704903283850652e-05, "loss": 0.0, "step": 9030 }, { "epoch": 12.009986504723347, "grad_norm": 0.0038120527751743793, "learning_rate": 2.166741640425851e-05, "loss": 0.2496, "step": 9040 }, { "epoch": 12.010661268556005, "grad_norm": 0.007827537134289742, "learning_rate": 2.1629929524666367e-05, "loss": 0.0, "step": 9050 }, { "epoch": 12.011336032388664, "grad_norm": 0.0004882031353190541, "learning_rate": 2.1592442645074226e-05, "loss": 0.0236, "step": 9060 }, { "epoch": 12.012010796221322, "grad_norm": 0.0006974562420509756, "learning_rate": 2.1554955765482082e-05, "loss": 0.0, "step": 9070 }, { "epoch": 12.012685560053981, "grad_norm": 0.0007927274564281106, "learning_rate": 2.151746888588994e-05, "loss": 0.0, "step": 9080 }, { "epoch": 12.013360323886639, "grad_norm": 0.0005972622311674058, "learning_rate": 2.1479982006297797e-05, "loss": 0.0, "step": 9090 }, { "epoch": 12.014035087719298, "grad_norm": 0.0020678879227489233, "learning_rate": 2.1442495126705653e-05, "loss": 0.0, "step": 9100 }, { "epoch": 12.014709851551958, "grad_norm": 0.0017037901561707258, "learning_rate": 2.1405008247113512e-05, "loss": 0.0, "step": 9110 }, { "epoch": 12.015384615384615, "grad_norm": 0.002625885419547558, "learning_rate": 2.1367521367521368e-05, "loss": 0.0, "step": 9120 }, { "epoch": 12.016059379217275, "grad_norm": 0.00016007163503672928, "learning_rate": 2.1330034487929227e-05, "loss": 0.0, "step": 9130 }, { "epoch": 12.016734143049932, "grad_norm": 8.975803211797029e-05, "learning_rate": 2.1292547608337082e-05, "loss": 0.0, "step": 9140 }, { "epoch": 12.017408906882592, "grad_norm": 0.00010270516213495284, "learning_rate": 2.125506072874494e-05, "loss": 0.0, "step": 9150 }, { "epoch": 12.01808367071525, "grad_norm": 0.0003781057021114975, "learning_rate": 2.1217573849152797e-05, "loss": 0.0, "step": 9160 }, { "epoch": 12.018758434547909, "grad_norm": 0.00045806102571077645, "learning_rate": 2.1180086969560657e-05, "loss": 0.0, "step": 9170 }, { "epoch": 12.019433198380566, "grad_norm": 0.00040667993016541004, "learning_rate": 2.1142600089968512e-05, "loss": 0.0, "step": 9180 }, { "epoch": 12.020107962213226, "grad_norm": 7.579607336083427e-05, "learning_rate": 2.110511321037637e-05, "loss": 0.0, "step": 9190 }, { "epoch": 12.020782726045883, "grad_norm": 0.0002768370322883129, "learning_rate": 2.1067626330784227e-05, "loss": 0.0, "step": 9200 }, { "epoch": 12.021457489878543, "grad_norm": 0.0010953324381262064, "learning_rate": 2.1030139451192083e-05, "loss": 0.0, "step": 9210 }, { "epoch": 12.0221322537112, "grad_norm": 0.00658809207379818, "learning_rate": 2.0992652571599942e-05, "loss": 0.0919, "step": 9220 }, { "epoch": 12.02280701754386, "grad_norm": 0.0006163925281725824, "learning_rate": 2.0955165692007798e-05, "loss": 0.0, "step": 9230 }, { "epoch": 12.023481781376518, "grad_norm": 0.000813082791864872, "learning_rate": 2.0917678812415657e-05, "loss": 0.0001, "step": 9240 }, { "epoch": 12.024156545209177, "grad_norm": 0.00046772375935688615, "learning_rate": 2.0880191932823513e-05, "loss": 0.0, "step": 9250 }, { "epoch": 12.024831309041835, "grad_norm": 0.0005937941023148596, "learning_rate": 2.0842705053231372e-05, "loss": 0.0002, "step": 9260 }, { "epoch": 12.025506072874494, "grad_norm": 0.000659748911857605, "learning_rate": 2.0805218173639228e-05, "loss": 0.0, "step": 9270 }, { "epoch": 12.026180836707152, "grad_norm": 0.0006786544108763337, "learning_rate": 2.0767731294047084e-05, "loss": 0.0, "step": 9280 }, { "epoch": 12.026855600539811, "grad_norm": 0.000225842886720784, "learning_rate": 2.0730244414454943e-05, "loss": 0.0, "step": 9290 }, { "epoch": 12.02753036437247, "grad_norm": 0.0006020697182975709, "learning_rate": 2.06927575348628e-05, "loss": 0.0, "step": 9300 }, { "epoch": 12.028205128205128, "grad_norm": 0.0005702193011529744, "learning_rate": 2.0655270655270654e-05, "loss": 0.0, "step": 9310 }, { "epoch": 12.028879892037788, "grad_norm": 0.000844390713609755, "learning_rate": 2.0617783775678514e-05, "loss": 0.0, "step": 9320 }, { "epoch": 12.029554655870445, "grad_norm": 9.666190453572199e-05, "learning_rate": 2.058029689608637e-05, "loss": 0.0, "step": 9330 }, { "epoch": 12.030229419703105, "grad_norm": 0.0001864578080130741, "learning_rate": 2.0542810016494225e-05, "loss": 0.0, "step": 9340 }, { "epoch": 12.030904183535762, "grad_norm": 0.00014394025492947549, "learning_rate": 2.0505323136902084e-05, "loss": 0.0, "step": 9350 }, { "epoch": 12.031578947368422, "grad_norm": 0.00027057836996391416, "learning_rate": 2.046783625730994e-05, "loss": 0.0, "step": 9360 }, { "epoch": 12.03225371120108, "grad_norm": 0.0004066646215505898, "learning_rate": 2.04303493777178e-05, "loss": 0.0, "step": 9370 }, { "epoch": 12.032928475033739, "grad_norm": 0.00043117342283949256, "learning_rate": 2.0392862498125655e-05, "loss": 0.0, "step": 9380 }, { "epoch": 12.033603238866396, "grad_norm": 0.00019329691713210195, "learning_rate": 2.0355375618533514e-05, "loss": 0.0001, "step": 9390 }, { "epoch": 12.034278002699056, "grad_norm": 0.00036019805702380836, "learning_rate": 2.031788873894137e-05, "loss": 0.0, "step": 9400 }, { "epoch": 12.034952766531713, "grad_norm": 0.0006936113350093365, "learning_rate": 2.028040185934923e-05, "loss": 0.0, "step": 9410 }, { "epoch": 12.035627530364373, "grad_norm": 0.00041965124546550214, "learning_rate": 2.0242914979757085e-05, "loss": 0.0, "step": 9420 }, { "epoch": 12.03630229419703, "grad_norm": 0.00011109585466329008, "learning_rate": 2.0205428100164944e-05, "loss": 0.0, "step": 9430 }, { "epoch": 12.03697705802969, "grad_norm": 0.000144297766382806, "learning_rate": 2.01679412205728e-05, "loss": 0.0281, "step": 9440 }, { "epoch": 12.037651821862347, "grad_norm": 0.0002551145735196769, "learning_rate": 2.013045434098066e-05, "loss": 0.0, "step": 9450 }, { "epoch": 12.038326585695007, "grad_norm": 0.006847582757472992, "learning_rate": 2.0092967461388515e-05, "loss": 0.0, "step": 9460 }, { "epoch": 12.039001349527666, "grad_norm": 0.00011437670036684722, "learning_rate": 2.005548058179637e-05, "loss": 0.0, "step": 9470 }, { "epoch": 12.039676113360324, "grad_norm": 0.00040303889545612037, "learning_rate": 2.001799370220423e-05, "loss": 0.0, "step": 9480 }, { "epoch": 12.040350877192983, "grad_norm": 0.00046083523193374276, "learning_rate": 1.9980506822612085e-05, "loss": 0.0, "step": 9490 }, { "epoch": 12.04102564102564, "grad_norm": 0.0006515540299005806, "learning_rate": 1.9943019943019945e-05, "loss": 0.0, "step": 9500 }, { "epoch": 12.0417004048583, "grad_norm": 0.00014752485731150955, "learning_rate": 1.99055330634278e-05, "loss": 0.0, "step": 9510 }, { "epoch": 12.042375168690958, "grad_norm": 0.0005620931042358279, "learning_rate": 1.986804618383566e-05, "loss": 0.0, "step": 9520 }, { "epoch": 12.043049932523617, "grad_norm": 0.00011923335841856897, "learning_rate": 1.9830559304243515e-05, "loss": 0.0, "step": 9530 }, { "epoch": 12.043724696356275, "grad_norm": 0.0002657576696947217, "learning_rate": 1.9793072424651374e-05, "loss": 0.0, "step": 9540 }, { "epoch": 12.044399460188934, "grad_norm": 0.0001235770614584908, "learning_rate": 1.975558554505923e-05, "loss": 0.0, "step": 9550 }, { "epoch": 12.045074224021592, "grad_norm": 0.0001751129748299718, "learning_rate": 1.971809866546709e-05, "loss": 0.4854, "step": 9560 }, { "epoch": 12.045748987854251, "grad_norm": 0.000554791884496808, "learning_rate": 1.9680611785874945e-05, "loss": 0.0, "step": 9570 }, { "epoch": 12.046423751686909, "grad_norm": 0.0003107208467554301, "learning_rate": 1.9643124906282804e-05, "loss": 0.0, "step": 9580 }, { "epoch": 12.047098515519568, "grad_norm": 0.0002857028157450259, "learning_rate": 1.960563802669066e-05, "loss": 0.0, "step": 9590 }, { "epoch": 12.047773279352226, "grad_norm": 0.0001487692934460938, "learning_rate": 1.9568151147098516e-05, "loss": 0.0, "step": 9600 }, { "epoch": 12.048448043184885, "grad_norm": 0.0004835377912968397, "learning_rate": 1.9530664267506375e-05, "loss": 0.0, "step": 9610 }, { "epoch": 12.049122807017543, "grad_norm": 0.004288305062800646, "learning_rate": 1.949317738791423e-05, "loss": 0.0, "step": 9620 }, { "epoch": 12.049797570850203, "grad_norm": 0.0002630397502798587, "learning_rate": 1.945569050832209e-05, "loss": 0.0, "step": 9630 }, { "epoch": 12.05, "eval_accuracy": 0.9285714285714286, "eval_f1": 0.9281167328042328, "eval_loss": 0.5643919110298157, "eval_runtime": 75.5753, "eval_samples_per_second": 1.482, "eval_steps_per_second": 1.482, "step": 9633 }, { "epoch": 13.000472334682861, "grad_norm": 0.00026892725145444274, "learning_rate": 1.9418203628729946e-05, "loss": 0.0, "step": 9640 }, { "epoch": 13.001147098515519, "grad_norm": 0.00012843680451624095, "learning_rate": 1.9380716749137805e-05, "loss": 0.0, "step": 9650 }, { "epoch": 13.001821862348178, "grad_norm": 0.00029701701714657247, "learning_rate": 1.934322986954566e-05, "loss": 0.0, "step": 9660 }, { "epoch": 13.002496626180836, "grad_norm": 0.00036974012618884444, "learning_rate": 1.9305742989953516e-05, "loss": 0.0, "step": 9670 }, { "epoch": 13.003171390013495, "grad_norm": 0.0001296445552725345, "learning_rate": 1.9268256110361376e-05, "loss": 0.0078, "step": 9680 }, { "epoch": 13.003846153846155, "grad_norm": 0.0002359377540415153, "learning_rate": 1.923076923076923e-05, "loss": 0.0, "step": 9690 }, { "epoch": 13.004520917678812, "grad_norm": 0.0003535948053468019, "learning_rate": 1.9193282351177087e-05, "loss": 0.0, "step": 9700 }, { "epoch": 13.005195681511472, "grad_norm": 0.00025236004148609936, "learning_rate": 1.9155795471584946e-05, "loss": 0.0, "step": 9710 }, { "epoch": 13.00587044534413, "grad_norm": 0.0002863478730432689, "learning_rate": 1.9118308591992802e-05, "loss": 0.0, "step": 9720 }, { "epoch": 13.006545209176789, "grad_norm": 0.00016143821994774044, "learning_rate": 1.9080821712400658e-05, "loss": 0.3645, "step": 9730 }, { "epoch": 13.007219973009446, "grad_norm": 0.0004113702161703259, "learning_rate": 1.9043334832808517e-05, "loss": 0.0, "step": 9740 }, { "epoch": 13.007894736842106, "grad_norm": 0.0008134804083965719, "learning_rate": 1.9005847953216373e-05, "loss": 0.0, "step": 9750 }, { "epoch": 13.008569500674763, "grad_norm": 0.00027760997181758285, "learning_rate": 1.8968361073624232e-05, "loss": 0.0, "step": 9760 }, { "epoch": 13.009244264507423, "grad_norm": 0.0016426608199253678, "learning_rate": 1.8930874194032088e-05, "loss": 0.0, "step": 9770 }, { "epoch": 13.00991902834008, "grad_norm": 0.0008006367716006935, "learning_rate": 1.8893387314439947e-05, "loss": 0.0, "step": 9780 }, { "epoch": 13.01059379217274, "grad_norm": 0.00025531640858389437, "learning_rate": 1.8855900434847803e-05, "loss": 0.0, "step": 9790 }, { "epoch": 13.011268556005398, "grad_norm": 0.0003084157651755959, "learning_rate": 1.8818413555255662e-05, "loss": 0.0, "step": 9800 }, { "epoch": 13.011943319838057, "grad_norm": 0.0007207695161923766, "learning_rate": 1.8780926675663518e-05, "loss": 0.0001, "step": 9810 }, { "epoch": 13.012618083670715, "grad_norm": 0.00012202781363157555, "learning_rate": 1.8743439796071377e-05, "loss": 0.0, "step": 9820 }, { "epoch": 13.013292847503374, "grad_norm": 0.0012473361566662788, "learning_rate": 1.8705952916479233e-05, "loss": 0.0, "step": 9830 }, { "epoch": 13.013967611336032, "grad_norm": 0.0007895145681686699, "learning_rate": 1.8668466036887092e-05, "loss": 0.0, "step": 9840 }, { "epoch": 13.014642375168691, "grad_norm": 0.0002717502065934241, "learning_rate": 1.8630979157294948e-05, "loss": 0.0, "step": 9850 }, { "epoch": 13.015317139001349, "grad_norm": 0.0002320138446521014, "learning_rate": 1.8593492277702803e-05, "loss": 0.0, "step": 9860 }, { "epoch": 13.015991902834008, "grad_norm": 0.0002716576855164021, "learning_rate": 1.8556005398110663e-05, "loss": 0.0, "step": 9870 }, { "epoch": 13.016666666666667, "grad_norm": 7.131123129511252e-05, "learning_rate": 1.8518518518518518e-05, "loss": 0.0, "step": 9880 }, { "epoch": 13.017341430499325, "grad_norm": 0.00045431696344166994, "learning_rate": 1.8481031638926377e-05, "loss": 0.0, "step": 9890 }, { "epoch": 13.018016194331985, "grad_norm": 0.00013243043213151395, "learning_rate": 1.8443544759334233e-05, "loss": 0.0, "step": 9900 }, { "epoch": 13.018690958164642, "grad_norm": 0.00031196267809718847, "learning_rate": 1.8406057879742092e-05, "loss": 0.0, "step": 9910 }, { "epoch": 13.019365721997302, "grad_norm": 0.000940505473408848, "learning_rate": 1.8368571000149948e-05, "loss": 0.0, "step": 9920 }, { "epoch": 13.02004048582996, "grad_norm": 0.0002774264430627227, "learning_rate": 1.8331084120557807e-05, "loss": 0.0, "step": 9930 }, { "epoch": 13.020715249662619, "grad_norm": 0.0002633021795190871, "learning_rate": 1.8293597240965663e-05, "loss": 0.0, "step": 9940 }, { "epoch": 13.021390013495276, "grad_norm": 7.044156518531963e-05, "learning_rate": 1.8256110361373522e-05, "loss": 0.0, "step": 9950 }, { "epoch": 13.022064777327936, "grad_norm": 0.00017661662423051894, "learning_rate": 1.8218623481781378e-05, "loss": 0.0, "step": 9960 }, { "epoch": 13.022739541160593, "grad_norm": 0.00028747491887770593, "learning_rate": 1.8181136602189237e-05, "loss": 0.0, "step": 9970 }, { "epoch": 13.023414304993253, "grad_norm": 0.00039829890010878444, "learning_rate": 1.8143649722597093e-05, "loss": 0.0, "step": 9980 }, { "epoch": 13.02408906882591, "grad_norm": 0.00022789667127653956, "learning_rate": 1.810616284300495e-05, "loss": 0.0, "step": 9990 }, { "epoch": 13.02476383265857, "grad_norm": 0.00028411843231879175, "learning_rate": 1.8068675963412808e-05, "loss": 0.0, "step": 10000 }, { "epoch": 13.025438596491227, "grad_norm": 0.0002080064732581377, "learning_rate": 1.8031189083820664e-05, "loss": 0.0, "step": 10010 }, { "epoch": 13.026113360323887, "grad_norm": 0.00023453705944120884, "learning_rate": 1.7993702204228523e-05, "loss": 0.0096, "step": 10020 }, { "epoch": 13.026788124156544, "grad_norm": 0.00010610045865178108, "learning_rate": 1.795621532463638e-05, "loss": 0.0, "step": 10030 }, { "epoch": 13.027462887989204, "grad_norm": 0.0001514716714154929, "learning_rate": 1.7918728445044234e-05, "loss": 0.0, "step": 10040 }, { "epoch": 13.028137651821863, "grad_norm": 0.00033169661764986813, "learning_rate": 1.7881241565452094e-05, "loss": 0.0, "step": 10050 }, { "epoch": 13.02881241565452, "grad_norm": 0.00013784744078293443, "learning_rate": 1.784375468585995e-05, "loss": 0.0, "step": 10060 }, { "epoch": 13.02948717948718, "grad_norm": 8.872824400896206e-05, "learning_rate": 1.7806267806267805e-05, "loss": 0.0, "step": 10070 }, { "epoch": 13.030161943319838, "grad_norm": 0.00037344591692090034, "learning_rate": 1.7768780926675664e-05, "loss": 0.0, "step": 10080 }, { "epoch": 13.030836707152497, "grad_norm": 0.0003687291464302689, "learning_rate": 1.773129404708352e-05, "loss": 0.0, "step": 10090 }, { "epoch": 13.031511470985155, "grad_norm": 0.00017588827176950872, "learning_rate": 1.769380716749138e-05, "loss": 0.0, "step": 10100 }, { "epoch": 13.032186234817814, "grad_norm": 0.00026350162806920707, "learning_rate": 1.7656320287899235e-05, "loss": 0.0, "step": 10110 }, { "epoch": 13.032860998650472, "grad_norm": 9.849424532148987e-05, "learning_rate": 1.761883340830709e-05, "loss": 0.0, "step": 10120 }, { "epoch": 13.033535762483131, "grad_norm": 0.00028973835287615657, "learning_rate": 1.758134652871495e-05, "loss": 0.0, "step": 10130 }, { "epoch": 13.034210526315789, "grad_norm": 0.00022602990793529898, "learning_rate": 1.7543859649122806e-05, "loss": 0.0, "step": 10140 }, { "epoch": 13.034885290148448, "grad_norm": 0.000543447386007756, "learning_rate": 1.7506372769530665e-05, "loss": 0.0, "step": 10150 }, { "epoch": 13.035560053981106, "grad_norm": 0.0006508603109978139, "learning_rate": 1.746888588993852e-05, "loss": 0.0, "step": 10160 }, { "epoch": 13.036234817813765, "grad_norm": 6.645211396971717e-05, "learning_rate": 1.743139901034638e-05, "loss": 0.4286, "step": 10170 }, { "epoch": 13.036909581646423, "grad_norm": 0.00017078538076020777, "learning_rate": 1.7393912130754236e-05, "loss": 0.0, "step": 10180 }, { "epoch": 13.037584345479083, "grad_norm": 0.0010123905958607793, "learning_rate": 1.7356425251162095e-05, "loss": 0.0, "step": 10190 }, { "epoch": 13.03825910931174, "grad_norm": 0.00027252710424363613, "learning_rate": 1.731893837156995e-05, "loss": 0.0, "step": 10200 }, { "epoch": 13.0389338731444, "grad_norm": 0.00013458417379297316, "learning_rate": 1.728145149197781e-05, "loss": 0.0, "step": 10210 }, { "epoch": 13.039608636977057, "grad_norm": 0.00022678014647681266, "learning_rate": 1.7243964612385665e-05, "loss": 0.0, "step": 10220 }, { "epoch": 13.040283400809717, "grad_norm": 0.00022790237562730908, "learning_rate": 1.720647773279352e-05, "loss": 0.0, "step": 10230 }, { "epoch": 13.040958164642376, "grad_norm": 0.0002460694231558591, "learning_rate": 1.716899085320138e-05, "loss": 0.0, "step": 10240 }, { "epoch": 13.041632928475034, "grad_norm": 0.00018956181884277612, "learning_rate": 1.7131503973609236e-05, "loss": 0.0, "step": 10250 }, { "epoch": 13.042307692307693, "grad_norm": 0.00017144810408353806, "learning_rate": 1.7094017094017095e-05, "loss": 0.0, "step": 10260 }, { "epoch": 13.04298245614035, "grad_norm": 0.0002925437001977116, "learning_rate": 1.705653021442495e-05, "loss": 0.0, "step": 10270 }, { "epoch": 13.04365721997301, "grad_norm": 0.0002330515708308667, "learning_rate": 1.701904333483281e-05, "loss": 0.013, "step": 10280 }, { "epoch": 13.044331983805668, "grad_norm": 0.00011631449160631746, "learning_rate": 1.6981556455240666e-05, "loss": 0.0, "step": 10290 }, { "epoch": 13.045006747638327, "grad_norm": 0.0003174786688759923, "learning_rate": 1.6944069575648525e-05, "loss": 0.0, "step": 10300 }, { "epoch": 13.045681511470985, "grad_norm": 0.0001684718154137954, "learning_rate": 1.690658269605638e-05, "loss": 0.0, "step": 10310 }, { "epoch": 13.046356275303644, "grad_norm": 0.001750526949763298, "learning_rate": 1.686909581646424e-05, "loss": 0.0, "step": 10320 }, { "epoch": 13.047031039136302, "grad_norm": 0.00024045804457273334, "learning_rate": 1.6831608936872096e-05, "loss": 0.0, "step": 10330 }, { "epoch": 13.047705802968961, "grad_norm": 0.0006596571765840054, "learning_rate": 1.6794122057279955e-05, "loss": 0.0, "step": 10340 }, { "epoch": 13.048380566801619, "grad_norm": 0.001252808142453432, "learning_rate": 1.675663517768781e-05, "loss": 0.3996, "step": 10350 }, { "epoch": 13.049055330634278, "grad_norm": 0.0002453498891554773, "learning_rate": 1.6719148298095667e-05, "loss": 0.0, "step": 10360 }, { "epoch": 13.049730094466936, "grad_norm": 0.0005040777614340186, "learning_rate": 1.6681661418503526e-05, "loss": 0.0, "step": 10370 }, { "epoch": 13.05, "eval_accuracy": 0.9285714285714286, "eval_f1": 0.9285714285714286, "eval_loss": 0.5062018632888794, "eval_runtime": 72.8565, "eval_samples_per_second": 1.537, "eval_steps_per_second": 1.537, "step": 10374 }, { "epoch": 14.000404858299595, "grad_norm": 6.942117033759132e-05, "learning_rate": 1.664417453891138e-05, "loss": 0.0, "step": 10380 }, { "epoch": 14.001079622132254, "grad_norm": 0.0004584739508572966, "learning_rate": 1.660668765931924e-05, "loss": 0.0, "step": 10390 }, { "epoch": 14.001754385964912, "grad_norm": 0.0002316083264304325, "learning_rate": 1.6569200779727097e-05, "loss": 0.2714, "step": 10400 }, { "epoch": 14.002429149797571, "grad_norm": 0.00024051779473666102, "learning_rate": 1.6531713900134956e-05, "loss": 0.0, "step": 10410 }, { "epoch": 14.003103913630229, "grad_norm": 0.0008334843441843987, "learning_rate": 1.649422702054281e-05, "loss": 0.0, "step": 10420 }, { "epoch": 14.003778677462888, "grad_norm": 0.00020968765602447093, "learning_rate": 1.6456740140950667e-05, "loss": 0.0178, "step": 10430 }, { "epoch": 14.004453441295546, "grad_norm": 0.00022330092906486243, "learning_rate": 1.6419253261358526e-05, "loss": 0.0, "step": 10440 }, { "epoch": 14.005128205128205, "grad_norm": 0.00021671153081115335, "learning_rate": 1.6381766381766382e-05, "loss": 0.009, "step": 10450 }, { "epoch": 14.005802968960865, "grad_norm": 0.00033940834691748023, "learning_rate": 1.6344279502174238e-05, "loss": 0.0, "step": 10460 }, { "epoch": 14.006477732793522, "grad_norm": 0.00048104580491781235, "learning_rate": 1.6306792622582097e-05, "loss": 0.0, "step": 10470 }, { "epoch": 14.007152496626182, "grad_norm": 0.00029779202304780483, "learning_rate": 1.6269305742989953e-05, "loss": 0.0, "step": 10480 }, { "epoch": 14.00782726045884, "grad_norm": 0.0004120915837120265, "learning_rate": 1.623181886339781e-05, "loss": 0.0, "step": 10490 }, { "epoch": 14.008502024291499, "grad_norm": 0.0003056660061702132, "learning_rate": 1.6194331983805668e-05, "loss": 0.0, "step": 10500 }, { "epoch": 14.009176788124156, "grad_norm": 0.000378406752133742, "learning_rate": 1.6156845104213524e-05, "loss": 0.0039, "step": 10510 }, { "epoch": 14.009851551956816, "grad_norm": 0.0005049049505032599, "learning_rate": 1.6119358224621383e-05, "loss": 0.0, "step": 10520 }, { "epoch": 14.010526315789473, "grad_norm": 0.00025037440354935825, "learning_rate": 1.608187134502924e-05, "loss": 0.0, "step": 10530 }, { "epoch": 14.011201079622133, "grad_norm": 0.00037562023499049246, "learning_rate": 1.6044384465437098e-05, "loss": 0.0, "step": 10540 }, { "epoch": 14.01187584345479, "grad_norm": 0.0003121852350886911, "learning_rate": 1.6006897585844954e-05, "loss": 0.0, "step": 10550 }, { "epoch": 14.01255060728745, "grad_norm": 0.0003679589426610619, "learning_rate": 1.5969410706252813e-05, "loss": 0.0, "step": 10560 }, { "epoch": 14.013225371120107, "grad_norm": 0.00028154728352092206, "learning_rate": 1.593192382666067e-05, "loss": 0.0, "step": 10570 }, { "epoch": 14.013900134952767, "grad_norm": 0.00020654525724239647, "learning_rate": 1.5894436947068528e-05, "loss": 0.0, "step": 10580 }, { "epoch": 14.014574898785424, "grad_norm": 0.00034096045419573784, "learning_rate": 1.5856950067476383e-05, "loss": 0.0, "step": 10590 }, { "epoch": 14.015249662618084, "grad_norm": 0.00026030451408587396, "learning_rate": 1.5819463187884243e-05, "loss": 0.0, "step": 10600 }, { "epoch": 14.015924426450741, "grad_norm": 8.031875040614977e-05, "learning_rate": 1.57819763082921e-05, "loss": 0.0, "step": 10610 }, { "epoch": 14.0165991902834, "grad_norm": 0.000621096114628017, "learning_rate": 1.5744489428699954e-05, "loss": 0.0, "step": 10620 }, { "epoch": 14.01727395411606, "grad_norm": 0.000524580420460552, "learning_rate": 1.5707002549107813e-05, "loss": 0.0, "step": 10630 }, { "epoch": 14.017948717948718, "grad_norm": 0.00011200064182048663, "learning_rate": 1.566951566951567e-05, "loss": 0.0, "step": 10640 }, { "epoch": 14.018623481781377, "grad_norm": 0.00032178129185922444, "learning_rate": 1.5632028789923528e-05, "loss": 0.0, "step": 10650 }, { "epoch": 14.019298245614035, "grad_norm": 0.00024140749883372337, "learning_rate": 1.5594541910331384e-05, "loss": 0.0, "step": 10660 }, { "epoch": 14.019973009446694, "grad_norm": 0.00022133818129077554, "learning_rate": 1.5557055030739243e-05, "loss": 0.0, "step": 10670 }, { "epoch": 14.020647773279352, "grad_norm": 0.0002797930792439729, "learning_rate": 1.55195681511471e-05, "loss": 0.0, "step": 10680 }, { "epoch": 14.021322537112011, "grad_norm": 0.0002334755117772147, "learning_rate": 1.5482081271554958e-05, "loss": 0.0, "step": 10690 }, { "epoch": 14.021997300944669, "grad_norm": 0.0002469551400281489, "learning_rate": 1.5444594391962814e-05, "loss": 0.0, "step": 10700 }, { "epoch": 14.022672064777328, "grad_norm": 8.5323081293609e-05, "learning_rate": 1.5407107512370673e-05, "loss": 0.0, "step": 10710 }, { "epoch": 14.023346828609986, "grad_norm": 0.00019482328207232058, "learning_rate": 1.536962063277853e-05, "loss": 0.0, "step": 10720 }, { "epoch": 14.024021592442645, "grad_norm": 0.00021449346968438476, "learning_rate": 1.5332133753186388e-05, "loss": 0.4463, "step": 10730 }, { "epoch": 14.024696356275303, "grad_norm": 0.00064310641027987, "learning_rate": 1.5294646873594244e-05, "loss": 0.0, "step": 10740 }, { "epoch": 14.025371120107962, "grad_norm": 0.00020890735322609544, "learning_rate": 1.52571599940021e-05, "loss": 0.0, "step": 10750 }, { "epoch": 14.02604588394062, "grad_norm": 0.0005201689782552421, "learning_rate": 1.5219673114409957e-05, "loss": 0.0, "step": 10760 }, { "epoch": 14.02672064777328, "grad_norm": 0.0005751597345806658, "learning_rate": 1.5182186234817813e-05, "loss": 0.0, "step": 10770 }, { "epoch": 14.027395411605937, "grad_norm": 0.0009388537146151066, "learning_rate": 1.5144699355225672e-05, "loss": 0.0, "step": 10780 }, { "epoch": 14.028070175438597, "grad_norm": 0.0005402613314799964, "learning_rate": 1.5107212475633528e-05, "loss": 0.0, "step": 10790 }, { "epoch": 14.028744939271254, "grad_norm": 0.00010339209256926551, "learning_rate": 1.5069725596041387e-05, "loss": 0.0, "step": 10800 }, { "epoch": 14.029419703103914, "grad_norm": 0.0005152708035893738, "learning_rate": 1.5032238716449243e-05, "loss": 0.0, "step": 10810 }, { "epoch": 14.030094466936573, "grad_norm": 0.0007186134462244809, "learning_rate": 1.4994751836857102e-05, "loss": 0.0, "step": 10820 }, { "epoch": 14.03076923076923, "grad_norm": 0.0005925975274294615, "learning_rate": 1.4957264957264958e-05, "loss": 0.0, "step": 10830 }, { "epoch": 14.03144399460189, "grad_norm": 0.00019110101857222617, "learning_rate": 1.4919778077672817e-05, "loss": 0.0, "step": 10840 }, { "epoch": 14.032118758434548, "grad_norm": 0.00018360813555773348, "learning_rate": 1.4882291198080673e-05, "loss": 0.0, "step": 10850 }, { "epoch": 14.032793522267207, "grad_norm": 0.00020973542996216565, "learning_rate": 1.4844804318488532e-05, "loss": 0.0, "step": 10860 }, { "epoch": 14.033468286099865, "grad_norm": 0.0007199271931312978, "learning_rate": 1.4807317438896387e-05, "loss": 0.0, "step": 10870 }, { "epoch": 14.034143049932524, "grad_norm": 9.265208791475743e-05, "learning_rate": 1.4769830559304243e-05, "loss": 0.0, "step": 10880 }, { "epoch": 14.034817813765182, "grad_norm": 8.818476635497063e-05, "learning_rate": 1.4732343679712102e-05, "loss": 0.0, "step": 10890 }, { "epoch": 14.035492577597841, "grad_norm": 0.00018744076078291982, "learning_rate": 1.4694856800119958e-05, "loss": 0.0, "step": 10900 }, { "epoch": 14.036167341430499, "grad_norm": 0.0003558373427949846, "learning_rate": 1.4657369920527816e-05, "loss": 0.0, "step": 10910 }, { "epoch": 14.036842105263158, "grad_norm": 0.00015756840002723038, "learning_rate": 1.4619883040935673e-05, "loss": 0.0, "step": 10920 }, { "epoch": 14.037516869095816, "grad_norm": 0.00011693660053424537, "learning_rate": 1.458239616134353e-05, "loss": 0.0, "step": 10930 }, { "epoch": 14.038191632928475, "grad_norm": 0.00013403450429905206, "learning_rate": 1.4544909281751386e-05, "loss": 0.0, "step": 10940 }, { "epoch": 14.038866396761133, "grad_norm": 0.00014881876995787024, "learning_rate": 1.4507422402159246e-05, "loss": 0.0, "step": 10950 }, { "epoch": 14.039541160593792, "grad_norm": 0.00014527350140269846, "learning_rate": 1.4469935522567101e-05, "loss": 0.0, "step": 10960 }, { "epoch": 14.04021592442645, "grad_norm": 0.00016278887051157653, "learning_rate": 1.443244864297496e-05, "loss": 0.0, "step": 10970 }, { "epoch": 14.04089068825911, "grad_norm": 8.402692037634552e-05, "learning_rate": 1.4394961763382816e-05, "loss": 0.0, "step": 10980 }, { "epoch": 14.041565452091769, "grad_norm": 0.00017224009206984192, "learning_rate": 1.4357474883790675e-05, "loss": 0.0, "step": 10990 }, { "epoch": 14.042240215924426, "grad_norm": 0.0005430065211839974, "learning_rate": 1.4319988004198531e-05, "loss": 0.0, "step": 11000 }, { "epoch": 14.042914979757086, "grad_norm": 0.0009919034782797098, "learning_rate": 1.4282501124606387e-05, "loss": 0.0, "step": 11010 }, { "epoch": 14.043589743589743, "grad_norm": 0.0003526155778672546, "learning_rate": 1.4245014245014246e-05, "loss": 0.0, "step": 11020 }, { "epoch": 14.044264507422403, "grad_norm": 9.54778806772083e-05, "learning_rate": 1.4207527365422102e-05, "loss": 0.0, "step": 11030 }, { "epoch": 14.04493927125506, "grad_norm": 0.0001671431091381237, "learning_rate": 1.4170040485829961e-05, "loss": 0.0, "step": 11040 }, { "epoch": 14.04561403508772, "grad_norm": 0.00022146198898553848, "learning_rate": 1.4132553606237817e-05, "loss": 0.3607, "step": 11050 }, { "epoch": 14.046288798920378, "grad_norm": 0.0001517270429758355, "learning_rate": 1.4095066726645676e-05, "loss": 0.0, "step": 11060 }, { "epoch": 14.046963562753037, "grad_norm": 0.0006123693310655653, "learning_rate": 1.4057579847053532e-05, "loss": 0.0, "step": 11070 }, { "epoch": 14.047638326585695, "grad_norm": 0.001610752660781145, "learning_rate": 1.4020092967461391e-05, "loss": 0.0, "step": 11080 }, { "epoch": 14.048313090418354, "grad_norm": 0.0001440331107005477, "learning_rate": 1.3982606087869247e-05, "loss": 0.0, "step": 11090 }, { "epoch": 14.048987854251012, "grad_norm": 0.0007454275619238615, "learning_rate": 1.3945119208277104e-05, "loss": 0.0, "step": 11100 }, { "epoch": 14.049662618083671, "grad_norm": 0.0003447613853495568, "learning_rate": 1.390763232868496e-05, "loss": 0.0, "step": 11110 }, { "epoch": 14.05, "eval_accuracy": 0.9375, "eval_f1": 0.9373365167161658, "eval_loss": 0.5185861587524414, "eval_runtime": 73.7028, "eval_samples_per_second": 1.52, "eval_steps_per_second": 1.52, "step": 11115 }, { "epoch": 15.001349527665317, "eval_accuracy": 0.9023255813953488, "eval_f1": 0.9016146713373171, "eval_loss": 0.7568970918655396, "eval_runtime": 137.2112, "eval_samples_per_second": 1.567, "eval_steps_per_second": 1.567, "step": 11116 }, { "epoch": 15.001349527665317, "step": 11116, "total_flos": 2.8480212872085897e+19, "train_loss": 5.147429101647338e-09, "train_runtime": 143.5742, "train_samples_per_second": 5.161, "train_steps_per_second": 5.161 }, { "epoch": 15.001349527665317, "eval_accuracy": 0.9375, "eval_f1": 0.9373365167161658, "eval_loss": 0.5185860991477966, "eval_runtime": 72.3734, "eval_samples_per_second": 1.548, "eval_steps_per_second": 1.548, "step": 11116 }, { "epoch": 15.001349527665317, "eval_accuracy": 0.9023255813953488, "eval_f1": 0.9016146713373171, "eval_loss": 0.756963849067688, "eval_runtime": 137.6677, "eval_samples_per_second": 1.562, "eval_steps_per_second": 1.562, "step": 11116 } ], "logging_steps": 10, "max_steps": 741, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8480212872085897e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }