diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,5 +1,5 @@ { - "best_metric": 0.35884907841682434, + "best_metric": 0.3794967830181122, "best_model_checkpoint": "videomae-large-finetuned-deepfake-subset/checkpoint-2235", "epoch": 9.1, "eval_steps": 500, @@ -10,3236 +10,3236 @@ "log_history": [ { "epoch": 0.0022371364653243847, - "grad_norm": 8.820916175842285, + "grad_norm": 8.840668678283691, "learning_rate": 1.1185682326621925e-06, - "loss": 0.731, + "loss": 0.7314, "step": 10 }, { "epoch": 0.0044742729306487695, - "grad_norm": 12.290339469909668, + "grad_norm": 14.85615348815918, "learning_rate": 2.237136465324385e-06, - "loss": 0.705, + "loss": 0.7059, "step": 20 }, { "epoch": 0.006711409395973154, - "grad_norm": 10.981163024902344, + "grad_norm": 13.392386436462402, "learning_rate": 3.3557046979865773e-06, - "loss": 0.7033, + "loss": 0.677, "step": 30 }, { "epoch": 0.008948545861297539, - "grad_norm": 9.063810348510742, + "grad_norm": 9.807293891906738, "learning_rate": 4.47427293064877e-06, - "loss": 0.6902, + "loss": 0.7136, "step": 40 }, { "epoch": 0.011185682326621925, - "grad_norm": 14.12073802947998, + "grad_norm": 7.858952045440674, "learning_rate": 5.592841163310962e-06, - "loss": 0.651, + "loss": 0.679, "step": 50 }, { "epoch": 0.013422818791946308, - "grad_norm": 11.928865432739258, + "grad_norm": 11.944358825683594, "learning_rate": 6.7114093959731546e-06, - "loss": 0.6963, + "loss": 0.6539, "step": 60 }, { "epoch": 0.015659955257270694, - "grad_norm": 11.369632720947266, + "grad_norm": 11.665265083312988, "learning_rate": 7.829977628635348e-06, - "loss": 0.6992, + "loss": 0.7147, "step": 70 }, { "epoch": 0.017897091722595078, - "grad_norm": 13.989575386047363, + "grad_norm": 17.402263641357422, "learning_rate": 8.94854586129754e-06, - "loss": 0.6777, + "loss": 0.6939, "step": 80 }, { "epoch": 0.020134228187919462, - "grad_norm": 14.570990562438965, + "grad_norm": 13.367256164550781, "learning_rate": 1.006711409395973e-05, - "loss": 0.6896, + "loss": 0.6801, "step": 90 }, { "epoch": 0.02237136465324385, - "grad_norm": 10.531535148620605, + "grad_norm": 11.276527404785156, "learning_rate": 1.1185682326621925e-05, - "loss": 0.6827, + "loss": 0.6904, "step": 100 }, { "epoch": 0.024608501118568233, - "grad_norm": 11.762115478515625, + "grad_norm": 8.762359619140625, "learning_rate": 1.2304250559284117e-05, - "loss": 0.6952, + "loss": 0.6742, "step": 110 }, { "epoch": 0.026845637583892617, - "grad_norm": 10.128353118896484, + "grad_norm": 10.632108688354492, "learning_rate": 1.3422818791946309e-05, - "loss": 0.7187, + "loss": 0.6967, "step": 120 }, { "epoch": 0.029082774049217, - "grad_norm": 5.641812324523926, + "grad_norm": 8.825544357299805, "learning_rate": 1.4541387024608501e-05, - "loss": 0.6474, + "loss": 0.631, "step": 130 }, { "epoch": 0.03131991051454139, - "grad_norm": 9.743863105773926, + "grad_norm": 10.079858779907227, "learning_rate": 1.5659955257270695e-05, - "loss": 0.6943, + "loss": 0.698, "step": 140 }, { "epoch": 0.03355704697986577, - "grad_norm": 7.398090362548828, + "grad_norm": 12.647045135498047, "learning_rate": 1.6778523489932888e-05, - "loss": 0.6404, + "loss": 0.6871, "step": 150 }, { "epoch": 0.035794183445190156, - "grad_norm": 6.879280090332031, + "grad_norm": 7.6588311195373535, "learning_rate": 1.789709172259508e-05, - "loss": 0.7416, + "loss": 0.7482, "step": 160 }, { "epoch": 0.03803131991051454, - "grad_norm": 5.413578510284424, + "grad_norm": 8.198958396911621, "learning_rate": 1.9015659955257272e-05, - "loss": 0.6926, + "loss": 0.641, "step": 170 }, { "epoch": 0.040268456375838924, - "grad_norm": 6.2413411140441895, + "grad_norm": 8.8464937210083, "learning_rate": 2.013422818791946e-05, - "loss": 0.8072, + "loss": 0.7909, "step": 180 }, { "epoch": 0.042505592841163314, - "grad_norm": 3.8624861240386963, + "grad_norm": 4.950939178466797, "learning_rate": 2.1252796420581657e-05, - "loss": 0.685, + "loss": 0.7335, "step": 190 }, { "epoch": 0.0447427293064877, - "grad_norm": 2.6995649337768555, + "grad_norm": 5.206540584564209, "learning_rate": 2.237136465324385e-05, - "loss": 0.7004, + "loss": 0.744, "step": 200 }, { "epoch": 0.04697986577181208, - "grad_norm": 6.890233993530273, + "grad_norm": 6.270852088928223, "learning_rate": 2.348993288590604e-05, - "loss": 0.7119, + "loss": 0.691, "step": 210 }, { "epoch": 0.049217002237136466, - "grad_norm": 4.524689197540283, + "grad_norm": 6.907114028930664, "learning_rate": 2.4608501118568234e-05, - "loss": 0.6801, + "loss": 0.6868, "step": 220 }, { "epoch": 0.05145413870246085, - "grad_norm": 4.578395843505859, + "grad_norm": 6.208651065826416, "learning_rate": 2.5727069351230426e-05, - "loss": 0.6926, + "loss": 0.7205, "step": 230 }, { "epoch": 0.053691275167785234, - "grad_norm": 4.124147891998291, + "grad_norm": 4.508482456207275, "learning_rate": 2.6845637583892618e-05, - "loss": 0.6538, + "loss": 0.6688, "step": 240 }, { "epoch": 0.05592841163310962, - "grad_norm": 6.061787128448486, + "grad_norm": 8.969482421875, "learning_rate": 2.796420581655481e-05, - "loss": 0.6506, + "loss": 0.6811, "step": 250 }, { "epoch": 0.058165548098434, - "grad_norm": 12.14808177947998, + "grad_norm": 6.105631351470947, "learning_rate": 2.9082774049217003e-05, - "loss": 0.7799, + "loss": 0.7777, "step": 260 }, { "epoch": 0.06040268456375839, - "grad_norm": 3.3967928886413574, + "grad_norm": 4.278919696807861, "learning_rate": 3.02013422818792e-05, - "loss": 0.687, + "loss": 0.7046, "step": 270 }, { "epoch": 0.06263982102908278, - "grad_norm": 3.4164726734161377, + "grad_norm": 5.356738567352295, "learning_rate": 3.131991051454139e-05, - "loss": 0.6865, + "loss": 0.6642, "step": 280 }, { "epoch": 0.06487695749440715, - "grad_norm": 5.354477405548096, + "grad_norm": 5.4614691734313965, "learning_rate": 3.243847874720358e-05, - "loss": 0.6474, + "loss": 0.6817, "step": 290 }, { "epoch": 0.06711409395973154, - "grad_norm": 6.427885055541992, + "grad_norm": 5.7568278312683105, "learning_rate": 3.3557046979865775e-05, - "loss": 0.6265, + "loss": 0.6382, "step": 300 }, { "epoch": 0.06935123042505593, - "grad_norm": 5.238284111022949, + "grad_norm": 3.24556565284729, "learning_rate": 3.4675615212527964e-05, - "loss": 0.6951, + "loss": 0.679, "step": 310 }, { "epoch": 0.07158836689038031, - "grad_norm": 3.8965413570404053, + "grad_norm": 3.6615562438964844, "learning_rate": 3.579418344519016e-05, - "loss": 0.702, + "loss": 0.6796, "step": 320 }, { "epoch": 0.0738255033557047, - "grad_norm": 5.09553861618042, + "grad_norm": 7.1600494384765625, "learning_rate": 3.6912751677852356e-05, - "loss": 0.6297, + "loss": 0.675, "step": 330 }, { "epoch": 0.07606263982102908, - "grad_norm": 10.964111328125, + "grad_norm": 3.104412317276001, "learning_rate": 3.8031319910514545e-05, - "loss": 0.5219, + "loss": 0.6024, "step": 340 }, { "epoch": 0.07829977628635347, - "grad_norm": 2.5747010707855225, + "grad_norm": 4.062417030334473, "learning_rate": 3.914988814317674e-05, - "loss": 0.7686, + "loss": 0.7211, "step": 350 }, { "epoch": 0.08053691275167785, - "grad_norm": 8.747536659240723, + "grad_norm": 16.75772476196289, "learning_rate": 4.026845637583892e-05, - "loss": 0.6403, + "loss": 0.6736, "step": 360 }, { "epoch": 0.08277404921700224, - "grad_norm": 6.88169527053833, + "grad_norm": 6.7340168952941895, "learning_rate": 4.138702460850112e-05, - "loss": 0.7298, + "loss": 0.6964, "step": 370 }, { "epoch": 0.08501118568232663, - "grad_norm": 3.903372049331665, + "grad_norm": 4.515945911407471, "learning_rate": 4.2505592841163314e-05, - "loss": 0.725, + "loss": 0.7276, "step": 380 }, { "epoch": 0.087248322147651, - "grad_norm": 3.645214319229126, + "grad_norm": 2.2911009788513184, "learning_rate": 4.36241610738255e-05, - "loss": 0.6768, + "loss": 0.5794, "step": 390 }, { "epoch": 0.0894854586129754, - "grad_norm": 4.003920555114746, + "grad_norm": 5.8774027824401855, "learning_rate": 4.47427293064877e-05, - "loss": 0.7701, + "loss": 0.5691, "step": 400 }, { "epoch": 0.09172259507829977, - "grad_norm": 8.795763969421387, + "grad_norm": 4.3128743171691895, "learning_rate": 4.586129753914989e-05, - "loss": 0.7623, + "loss": 0.7606, "step": 410 }, { "epoch": 0.09395973154362416, - "grad_norm": 5.5298357009887695, + "grad_norm": 9.608476638793945, "learning_rate": 4.697986577181208e-05, - "loss": 0.7405, + "loss": 0.775, "step": 420 }, { "epoch": 0.09619686800894854, - "grad_norm": 5.45560884475708, + "grad_norm": 7.786107540130615, "learning_rate": 4.809843400447427e-05, - "loss": 0.6677, + "loss": 0.6394, "step": 430 }, { "epoch": 0.09843400447427293, - "grad_norm": 5.385969161987305, + "grad_norm": 8.117825508117676, "learning_rate": 4.921700223713647e-05, - "loss": 0.587, + "loss": 0.6361, "step": 440 }, { "epoch": 0.1, - "eval_loss": 0.6548025012016296, - "eval_runtime": 935.5761, - "eval_samples_per_second": 5.284, - "eval_steps_per_second": 0.661, + "eval_loss": 0.6478366255760193, + "eval_runtime": 887.1172, + "eval_samples_per_second": 8.36, + "eval_steps_per_second": 1.045, "step": 447 }, { "epoch": 1.0006711409395974, - "grad_norm": 3.5708768367767334, + "grad_norm": 4.550715446472168, "learning_rate": 4.99627143922446e-05, - "loss": 0.6714, + "loss": 0.6042, "step": 450 }, { "epoch": 1.0029082774049216, - "grad_norm": 3.380483865737915, + "grad_norm": 6.292792797088623, "learning_rate": 4.9838429033059906e-05, - "loss": 0.6215, + "loss": 0.6412, "step": 460 }, { "epoch": 1.005145413870246, - "grad_norm": 2.324326276779175, + "grad_norm": 4.233644485473633, "learning_rate": 4.971414367387522e-05, - "loss": 0.6885, + "loss": 0.6035, "step": 470 }, { "epoch": 1.0073825503355704, - "grad_norm": 4.357543468475342, + "grad_norm": 5.84645938873291, "learning_rate": 4.958985831469053e-05, - "loss": 0.678, + "loss": 0.5547, "step": 480 }, { "epoch": 1.0096196868008949, - "grad_norm": 3.158769369125366, + "grad_norm": 5.941657066345215, "learning_rate": 4.946557295550584e-05, - "loss": 0.7105, + "loss": 0.6611, "step": 490 }, { "epoch": 1.0118568232662193, - "grad_norm": 2.6318163871765137, + "grad_norm": 2.766065835952759, "learning_rate": 4.9341287596321155e-05, - "loss": 0.6218, + "loss": 0.7369, "step": 500 }, { "epoch": 1.0140939597315437, - "grad_norm": 3.114478349685669, + "grad_norm": 7.194911003112793, "learning_rate": 4.921700223713647e-05, - "loss": 0.6693, + "loss": 0.7586, "step": 510 }, { "epoch": 1.016331096196868, - "grad_norm": 5.514845371246338, + "grad_norm": 5.610304832458496, "learning_rate": 4.909271687795178e-05, - "loss": 0.6839, + "loss": 0.6815, "step": 520 }, { "epoch": 1.0185682326621923, - "grad_norm": 4.107670307159424, + "grad_norm": 5.830178260803223, "learning_rate": 4.896843151876709e-05, - "loss": 0.6526, + "loss": 0.5949, "step": 530 }, { "epoch": 1.0208053691275167, - "grad_norm": 9.080002784729004, + "grad_norm": 7.435003280639648, "learning_rate": 4.8844146159582404e-05, - "loss": 0.5914, + "loss": 0.6054, "step": 540 }, { "epoch": 1.0230425055928412, - "grad_norm": 2.4229257106781006, + "grad_norm": 3.300534248352051, "learning_rate": 4.871986080039772e-05, - "loss": 0.6653, + "loss": 0.644, "step": 550 }, { "epoch": 1.0252796420581656, - "grad_norm": 7.172940731048584, + "grad_norm": 5.10628080368042, "learning_rate": 4.859557544121303e-05, - "loss": 0.6491, + "loss": 0.7789, "step": 560 }, { "epoch": 1.02751677852349, - "grad_norm": 3.7052698135375977, + "grad_norm": 3.5758488178253174, "learning_rate": 4.8471290082028335e-05, - "loss": 0.6932, + "loss": 0.6373, "step": 570 }, { "epoch": 1.0297539149888144, - "grad_norm": 3.1077563762664795, + "grad_norm": 2.3225715160369873, "learning_rate": 4.8347004722843654e-05, - "loss": 0.6827, + "loss": 0.6763, "step": 580 }, { "epoch": 1.0319910514541386, - "grad_norm": 3.718585968017578, + "grad_norm": 2.920625686645508, "learning_rate": 4.8222719363658966e-05, - "loss": 0.7414, + "loss": 0.6733, "step": 590 }, { "epoch": 1.034228187919463, - "grad_norm": 2.7453243732452393, + "grad_norm": 5.328765869140625, "learning_rate": 4.809843400447427e-05, - "loss": 0.6593, + "loss": 0.6051, "step": 600 }, { "epoch": 1.0364653243847874, - "grad_norm": 2.820056438446045, + "grad_norm": 5.5468878746032715, "learning_rate": 4.7974148645289584e-05, - "loss": 0.674, + "loss": 0.6439, "step": 610 }, { "epoch": 1.0387024608501119, - "grad_norm": 5.4566192626953125, + "grad_norm": 3.9731194972991943, "learning_rate": 4.78498632861049e-05, - "loss": 0.6664, + "loss": 0.6416, "step": 620 }, { "epoch": 1.0409395973154363, - "grad_norm": 4.904478549957275, + "grad_norm": 5.598540782928467, "learning_rate": 4.772557792692021e-05, - "loss": 0.6305, + "loss": 0.6182, "step": 630 }, { "epoch": 1.0431767337807607, - "grad_norm": 4.792154312133789, + "grad_norm": 3.5205001831054688, "learning_rate": 4.760129256773552e-05, - "loss": 0.6781, + "loss": 0.6307, "step": 640 }, { "epoch": 1.045413870246085, - "grad_norm": 8.574151039123535, + "grad_norm": 8.040474891662598, "learning_rate": 4.7477007208550834e-05, - "loss": 0.6938, + "loss": 0.6298, "step": 650 }, { "epoch": 1.0476510067114093, - "grad_norm": 3.2592074871063232, + "grad_norm": 7.812734127044678, "learning_rate": 4.735272184936615e-05, - "loss": 0.7068, + "loss": 0.5933, "step": 660 }, { "epoch": 1.0498881431767337, - "grad_norm": 3.684659957885742, + "grad_norm": 8.276246070861816, "learning_rate": 4.722843649018146e-05, - "loss": 0.6925, + "loss": 0.68, "step": 670 }, { "epoch": 1.0521252796420582, - "grad_norm": 3.7525546550750732, + "grad_norm": 9.207062721252441, "learning_rate": 4.710415113099677e-05, - "loss": 0.7036, + "loss": 0.6591, "step": 680 }, { "epoch": 1.0543624161073826, - "grad_norm": 2.746760129928589, + "grad_norm": 2.244326591491699, "learning_rate": 4.697986577181208e-05, - "loss": 0.6284, + "loss": 0.5858, "step": 690 }, { "epoch": 1.056599552572707, - "grad_norm": 2.086256504058838, + "grad_norm": 4.339087963104248, "learning_rate": 4.6855580412627395e-05, - "loss": 0.6403, + "loss": 0.6173, "step": 700 }, { "epoch": 1.0588366890380314, - "grad_norm": 3.4739294052124023, + "grad_norm": 4.587048053741455, "learning_rate": 4.673129505344271e-05, - "loss": 0.6529, + "loss": 0.5405, "step": 710 }, { "epoch": 1.0610738255033556, - "grad_norm": 6.01228141784668, + "grad_norm": 6.943038463592529, "learning_rate": 4.660700969425802e-05, - "loss": 0.6661, + "loss": 0.5336, "step": 720 }, { "epoch": 1.06331096196868, - "grad_norm": 3.2344679832458496, + "grad_norm": 3.565220832824707, "learning_rate": 4.648272433507333e-05, - "loss": 0.6968, + "loss": 0.6332, "step": 730 }, { "epoch": 1.0655480984340044, - "grad_norm": 2.684730052947998, + "grad_norm": 4.71682596206665, "learning_rate": 4.635843897588864e-05, - "loss": 0.618, + "loss": 0.6398, "step": 740 }, { "epoch": 1.0677852348993289, - "grad_norm": 2.324744701385498, + "grad_norm": 8.337652206420898, "learning_rate": 4.623415361670396e-05, - "loss": 0.6689, + "loss": 0.6597, "step": 750 }, { "epoch": 1.0700223713646533, - "grad_norm": 4.5876569747924805, + "grad_norm": 6.056807994842529, "learning_rate": 4.610986825751927e-05, - "loss": 0.6356, + "loss": 0.68, "step": 760 }, { "epoch": 1.0722595078299777, - "grad_norm": 4.345230579376221, + "grad_norm": 3.2448365688323975, "learning_rate": 4.5985582898334575e-05, - "loss": 0.6123, + "loss": 0.6275, "step": 770 }, { "epoch": 1.0744966442953021, - "grad_norm": 5.078761577606201, + "grad_norm": 7.669735431671143, "learning_rate": 4.586129753914989e-05, - "loss": 0.7232, + "loss": 0.6847, "step": 780 }, { "epoch": 1.0767337807606263, - "grad_norm": 5.6790266036987305, + "grad_norm": 3.858572483062744, "learning_rate": 4.57370121799652e-05, - "loss": 0.6505, + "loss": 0.5387, "step": 790 }, { "epoch": 1.0789709172259507, - "grad_norm": 4.5163445472717285, + "grad_norm": 3.0774307250976562, "learning_rate": 4.561272682078052e-05, - "loss": 0.6364, + "loss": 0.6463, "step": 800 }, { "epoch": 1.0812080536912752, - "grad_norm": 5.686500549316406, + "grad_norm": 5.94879674911499, "learning_rate": 4.5488441461595824e-05, - "loss": 0.5556, + "loss": 0.5771, "step": 810 }, { "epoch": 1.0834451901565996, - "grad_norm": 2.98331356048584, + "grad_norm": 7.283985137939453, "learning_rate": 4.5364156102411137e-05, - "loss": 0.6267, + "loss": 0.4675, "step": 820 }, { "epoch": 1.085682326621924, - "grad_norm": 6.326406478881836, + "grad_norm": 5.343263149261475, "learning_rate": 4.523987074322645e-05, - "loss": 0.5947, + "loss": 0.619, "step": 830 }, { "epoch": 1.0879194630872484, - "grad_norm": 3.6593778133392334, + "grad_norm": 2.982252597808838, "learning_rate": 4.511558538404176e-05, - "loss": 0.6253, + "loss": 0.6595, "step": 840 }, { "epoch": 1.0901565995525728, - "grad_norm": 2.3336687088012695, + "grad_norm": 3.7833807468414307, "learning_rate": 4.4991300024857074e-05, - "loss": 0.5903, + "loss": 0.4992, "step": 850 }, { "epoch": 1.092393736017897, - "grad_norm": 3.4927194118499756, + "grad_norm": 4.744262218475342, "learning_rate": 4.4867014665672386e-05, - "loss": 0.5819, + "loss": 0.5481, "step": 860 }, { "epoch": 1.0946308724832214, - "grad_norm": 3.6458919048309326, + "grad_norm": 4.21218204498291, "learning_rate": 4.47427293064877e-05, - "loss": 0.5121, + "loss": 0.4745, "step": 870 }, { "epoch": 1.0968680089485459, - "grad_norm": 5.987964630126953, + "grad_norm": 6.225035667419434, "learning_rate": 4.461844394730301e-05, - "loss": 0.6405, + "loss": 0.6044, "step": 880 }, { "epoch": 1.0991051454138703, - "grad_norm": 5.051669597625732, + "grad_norm": 6.521196365356445, "learning_rate": 4.449415858811832e-05, - "loss": 0.6739, + "loss": 0.6774, "step": 890 }, { "epoch": 1.1, - "eval_loss": 0.5807220339775085, - "eval_runtime": 928.2211, - "eval_samples_per_second": 5.326, - "eval_steps_per_second": 0.666, + "eval_loss": 0.6047455072402954, + "eval_runtime": 892.7591, + "eval_samples_per_second": 8.307, + "eval_steps_per_second": 1.038, "step": 894 }, { "epoch": 2.001342281879195, - "grad_norm": 2.9489381313323975, + "grad_norm": 4.137167930603027, "learning_rate": 4.4369873228933635e-05, - "loss": 0.5788, + "loss": 0.6044, "step": 900 }, { "epoch": 2.003579418344519, - "grad_norm": 4.1599273681640625, + "grad_norm": 5.822400093078613, "learning_rate": 4.424558786974895e-05, - "loss": 0.4658, + "loss": 0.4379, "step": 910 }, { "epoch": 2.005816554809843, - "grad_norm": 3.328503131866455, + "grad_norm": 5.85886812210083, "learning_rate": 4.412130251056425e-05, - "loss": 0.6324, + "loss": 0.6157, "step": 920 }, { "epoch": 2.0080536912751676, - "grad_norm": 4.6580681800842285, + "grad_norm": 3.880828857421875, "learning_rate": 4.399701715137957e-05, - "loss": 0.5451, + "loss": 0.457, "step": 930 }, { "epoch": 2.010290827740492, - "grad_norm": 9.446999549865723, + "grad_norm": 6.341732025146484, "learning_rate": 4.3872731792194885e-05, - "loss": 0.5531, + "loss": 0.522, "step": 940 }, { "epoch": 2.0125279642058165, - "grad_norm": 2.4541642665863037, + "grad_norm": 10.094740867614746, "learning_rate": 4.374844643301019e-05, - "loss": 0.6491, + "loss": 0.6731, "step": 950 }, { "epoch": 2.014765100671141, - "grad_norm": 3.4880592823028564, + "grad_norm": 5.733267784118652, "learning_rate": 4.36241610738255e-05, - "loss": 0.6484, + "loss": 0.4796, "step": 960 }, { "epoch": 2.0170022371364653, - "grad_norm": 1.96310293674469, + "grad_norm": 3.963778257369995, "learning_rate": 4.349987571464082e-05, - "loss": 0.557, + "loss": 0.5343, "step": 970 }, { "epoch": 2.0192393736017897, - "grad_norm": 5.5436577796936035, + "grad_norm": 4.41682243347168, "learning_rate": 4.337559035545613e-05, - "loss": 0.5996, + "loss": 0.5492, "step": 980 }, { "epoch": 2.021476510067114, - "grad_norm": 10.744361877441406, + "grad_norm": 1.9658536911010742, "learning_rate": 4.325130499627144e-05, - "loss": 0.4878, + "loss": 0.4044, "step": 990 }, { "epoch": 2.0237136465324386, - "grad_norm": 5.160739421844482, + "grad_norm": 8.907441139221191, "learning_rate": 4.312701963708675e-05, - "loss": 0.6356, + "loss": 0.486, "step": 1000 }, { "epoch": 2.025950782997763, - "grad_norm": 4.501141548156738, + "grad_norm": 4.796280860900879, "learning_rate": 4.3002734277902064e-05, - "loss": 0.6026, + "loss": 0.5449, "step": 1010 }, { "epoch": 2.0281879194630874, - "grad_norm": 10.099557876586914, + "grad_norm": 13.880756378173828, "learning_rate": 4.287844891871738e-05, - "loss": 0.5309, + "loss": 0.5926, "step": 1020 }, { "epoch": 2.030425055928412, - "grad_norm": 5.0861029624938965, + "grad_norm": 5.1638312339782715, "learning_rate": 4.275416355953269e-05, - "loss": 0.7049, + "loss": 0.7223, "step": 1030 }, { "epoch": 2.032662192393736, - "grad_norm": 2.878390073776245, + "grad_norm": 2.8141660690307617, "learning_rate": 4.2629878200348e-05, - "loss": 0.6, + "loss": 0.5354, "step": 1040 }, { "epoch": 2.03489932885906, - "grad_norm": 4.252883434295654, + "grad_norm": 5.402659893035889, "learning_rate": 4.2505592841163314e-05, - "loss": 0.6321, + "loss": 0.5388, "step": 1050 }, { "epoch": 2.0371364653243846, - "grad_norm": 4.990085124969482, + "grad_norm": 9.897317886352539, "learning_rate": 4.2381307481978626e-05, - "loss": 0.6085, + "loss": 0.5581, "step": 1060 }, { "epoch": 2.039373601789709, - "grad_norm": 3.3724958896636963, + "grad_norm": 5.52318811416626, "learning_rate": 4.225702212279394e-05, - "loss": 0.4212, + "loss": 0.4331, "step": 1070 }, { "epoch": 2.0416107382550335, - "grad_norm": 7.132399559020996, + "grad_norm": 6.323409080505371, "learning_rate": 4.213273676360925e-05, - "loss": 0.6055, + "loss": 0.6491, "step": 1080 }, { "epoch": 2.043847874720358, - "grad_norm": 6.623457908630371, + "grad_norm": 5.7435503005981445, "learning_rate": 4.2008451404424556e-05, - "loss": 0.5264, + "loss": 0.494, "step": 1090 }, { "epoch": 2.0460850111856823, - "grad_norm": 3.902120351791382, + "grad_norm": 2.975238084793091, "learning_rate": 4.1884166045239875e-05, - "loss": 0.5448, + "loss": 0.5629, "step": 1100 }, { "epoch": 2.0483221476510067, - "grad_norm": 7.523580551147461, + "grad_norm": 7.936377048492432, "learning_rate": 4.175988068605519e-05, - "loss": 0.5755, + "loss": 0.5487, "step": 1110 }, { "epoch": 2.050559284116331, - "grad_norm": 6.084871768951416, + "grad_norm": 9.979763984680176, "learning_rate": 4.16355953268705e-05, - "loss": 0.6067, + "loss": 0.6606, "step": 1120 }, { "epoch": 2.0527964205816556, - "grad_norm": 10.225180625915527, + "grad_norm": 8.900005340576172, "learning_rate": 4.1511309967685806e-05, - "loss": 0.5577, + "loss": 0.537, "step": 1130 }, { "epoch": 2.05503355704698, - "grad_norm": 6.4241790771484375, + "grad_norm": 5.619589328765869, "learning_rate": 4.138702460850112e-05, - "loss": 0.6698, + "loss": 0.5785, "step": 1140 }, { "epoch": 2.0572706935123044, - "grad_norm": 2.930966377258301, + "grad_norm": 4.075002193450928, "learning_rate": 4.126273924931644e-05, - "loss": 0.5346, + "loss": 0.3999, "step": 1150 }, { "epoch": 2.059507829977629, - "grad_norm": 7.139683723449707, + "grad_norm": 5.566099166870117, "learning_rate": 4.113845389013174e-05, - "loss": 0.6482, + "loss": 0.4934, "step": 1160 }, { "epoch": 2.0617449664429532, - "grad_norm": 8.462257385253906, + "grad_norm": 7.996336936950684, "learning_rate": 4.1014168530947055e-05, - "loss": 0.5197, + "loss": 0.3563, "step": 1170 }, { "epoch": 2.063982102908277, - "grad_norm": 6.358745574951172, + "grad_norm": 8.547967910766602, "learning_rate": 4.088988317176237e-05, - "loss": 0.5159, + "loss": 0.6231, "step": 1180 }, { "epoch": 2.0662192393736016, - "grad_norm": 4.412177562713623, + "grad_norm": 4.0213236808776855, "learning_rate": 4.076559781257768e-05, - "loss": 0.7253, + "loss": 0.7292, "step": 1190 }, { "epoch": 2.068456375838926, - "grad_norm": 2.295029640197754, + "grad_norm": 4.820833206176758, "learning_rate": 4.064131245339299e-05, - "loss": 0.5456, + "loss": 0.4553, "step": 1200 }, { "epoch": 2.0706935123042505, - "grad_norm": 6.339975357055664, + "grad_norm": 9.791057586669922, "learning_rate": 4.0517027094208304e-05, - "loss": 0.5454, + "loss": 0.5081, "step": 1210 }, { "epoch": 2.072930648769575, - "grad_norm": 3.029376745223999, + "grad_norm": 2.710472345352173, "learning_rate": 4.039274173502362e-05, - "loss": 0.5875, + "loss": 0.4875, "step": 1220 }, { "epoch": 2.0751677852348993, - "grad_norm": 5.36366081237793, + "grad_norm": 4.432290554046631, "learning_rate": 4.026845637583892e-05, - "loss": 0.5715, + "loss": 0.4762, "step": 1230 }, { "epoch": 2.0774049217002237, - "grad_norm": 8.790428161621094, + "grad_norm": 12.281310081481934, "learning_rate": 4.014417101665424e-05, - "loss": 0.6075, + "loss": 0.5381, "step": 1240 }, { "epoch": 2.079642058165548, - "grad_norm": 12.27261734008789, + "grad_norm": 12.222536087036133, "learning_rate": 4.0019885657469554e-05, - "loss": 0.4774, + "loss": 0.4633, "step": 1250 }, { "epoch": 2.0818791946308726, - "grad_norm": 9.575027465820312, + "grad_norm": 10.08840274810791, "learning_rate": 3.9895600298284866e-05, - "loss": 0.5224, + "loss": 0.4012, "step": 1260 }, { "epoch": 2.084116331096197, - "grad_norm": 4.080117702484131, + "grad_norm": 12.933877944946289, "learning_rate": 3.977131493910017e-05, - "loss": 0.5142, + "loss": 0.4828, "step": 1270 }, { "epoch": 2.0863534675615214, - "grad_norm": 9.554539680480957, + "grad_norm": 8.704557418823242, "learning_rate": 3.964702957991549e-05, - "loss": 0.5001, + "loss": 0.4824, "step": 1280 }, { "epoch": 2.088590604026846, - "grad_norm": 5.12808084487915, + "grad_norm": 5.015852451324463, "learning_rate": 3.95227442207308e-05, - "loss": 0.5425, + "loss": 0.5165, "step": 1290 }, { "epoch": 2.09082774049217, - "grad_norm": 5.651499271392822, + "grad_norm": 11.11552906036377, "learning_rate": 3.939845886154611e-05, - "loss": 0.6167, + "loss": 0.5092, "step": 1300 }, { "epoch": 2.093064876957494, - "grad_norm": 10.270825386047363, + "grad_norm": 10.659283638000488, "learning_rate": 3.927417350236142e-05, - "loss": 0.553, + "loss": 0.5005, "step": 1310 }, { "epoch": 2.0953020134228186, - "grad_norm": 3.7876462936401367, + "grad_norm": 4.979311466217041, "learning_rate": 3.914988814317674e-05, - "loss": 0.7124, + "loss": 0.7376, "step": 1320 }, { "epoch": 2.097539149888143, - "grad_norm": 2.0910379886627197, + "grad_norm": 2.4167163372039795, "learning_rate": 3.9025602783992046e-05, - "loss": 0.5093, + "loss": 0.4397, "step": 1330 }, { "epoch": 2.0997762863534675, - "grad_norm": 6.5479207038879395, + "grad_norm": 7.5874857902526855, "learning_rate": 3.890131742480736e-05, - "loss": 0.444, + "loss": 0.4168, "step": 1340 }, { "epoch": 2.1, - "eval_loss": 0.5115740895271301, - "eval_runtime": 933.3886, - "eval_samples_per_second": 5.297, - "eval_steps_per_second": 0.662, + "eval_loss": 0.48516377806663513, + "eval_runtime": 891.9205, + "eval_samples_per_second": 8.315, + "eval_steps_per_second": 1.039, "step": 1341 }, { "epoch": 3.002013422818792, - "grad_norm": 11.015625, + "grad_norm": 9.898842811584473, "learning_rate": 3.877703206562267e-05, - "loss": 0.4018, + "loss": 0.4532, "step": 1350 }, { "epoch": 3.004250559284116, - "grad_norm": 8.430967330932617, + "grad_norm": 10.05817699432373, "learning_rate": 3.865274670643798e-05, - "loss": 0.767, + "loss": 0.7569, "step": 1360 }, { "epoch": 3.0064876957494406, - "grad_norm": 1.8640131950378418, + "grad_norm": 1.2511606216430664, "learning_rate": 3.8528461347253295e-05, - "loss": 0.4937, + "loss": 0.4412, "step": 1370 }, { "epoch": 3.008724832214765, - "grad_norm": 6.5426177978515625, + "grad_norm": 11.480599403381348, "learning_rate": 3.840417598806861e-05, - "loss": 0.4556, + "loss": 0.3935, "step": 1380 }, { "epoch": 3.0109619686800895, - "grad_norm": 8.181113243103027, + "grad_norm": 11.63521671295166, "learning_rate": 3.827989062888392e-05, - "loss": 0.4801, + "loss": 0.5626, "step": 1390 }, { "epoch": 3.013199105145414, - "grad_norm": 5.536498069763184, + "grad_norm": 9.114398956298828, "learning_rate": 3.815560526969923e-05, - "loss": 0.5512, + "loss": 0.6732, "step": 1400 }, { "epoch": 3.0154362416107383, - "grad_norm": 8.874992370605469, + "grad_norm": 7.543931484222412, "learning_rate": 3.8031319910514545e-05, - "loss": 0.6437, + "loss": 0.4275, "step": 1410 }, { "epoch": 3.0176733780760627, - "grad_norm": 8.250784873962402, + "grad_norm": 10.644927024841309, "learning_rate": 3.790703455132986e-05, - "loss": 0.5753, + "loss": 0.4325, "step": 1420 }, { "epoch": 3.019910514541387, - "grad_norm": 6.462278366088867, + "grad_norm": 3.5276317596435547, "learning_rate": 3.778274919214517e-05, - "loss": 0.4336, + "loss": 0.3605, "step": 1430 }, { "epoch": 3.0221476510067116, - "grad_norm": 2.823152780532837, + "grad_norm": 5.080909729003906, "learning_rate": 3.7658463832960475e-05, - "loss": 0.6393, + "loss": 0.5921, "step": 1440 }, { "epoch": 3.024384787472036, - "grad_norm": 1.859449028968811, + "grad_norm": 1.2971785068511963, "learning_rate": 3.753417847377579e-05, - "loss": 0.3849, + "loss": 0.3565, "step": 1450 }, { "epoch": 3.02662192393736, - "grad_norm": 7.397490501403809, + "grad_norm": 3.7884743213653564, "learning_rate": 3.7409893114591106e-05, - "loss": 0.5019, + "loss": 0.287, "step": 1460 }, { "epoch": 3.0288590604026844, - "grad_norm": 3.8984179496765137, + "grad_norm": 5.833460807800293, "learning_rate": 3.728560775540642e-05, - "loss": 0.5481, + "loss": 0.4596, "step": 1470 }, { "epoch": 3.031096196868009, - "grad_norm": 9.145849227905273, + "grad_norm": 10.393218994140625, "learning_rate": 3.7161322396221724e-05, - "loss": 0.5017, + "loss": 0.3767, "step": 1480 }, { "epoch": 3.033333333333333, - "grad_norm": 9.390955924987793, + "grad_norm": 12.434408187866211, "learning_rate": 3.7037037037037037e-05, - "loss": 0.4875, + "loss": 0.5202, "step": 1490 }, { "epoch": 3.0355704697986576, - "grad_norm": 7.330883979797363, + "grad_norm": 7.507827281951904, "learning_rate": 3.6912751677852356e-05, - "loss": 0.4656, + "loss": 0.4699, "step": 1500 }, { "epoch": 3.037807606263982, - "grad_norm": 4.043966293334961, + "grad_norm": 4.108563423156738, "learning_rate": 3.678846631866766e-05, - "loss": 0.4502, + "loss": 0.4836, "step": 1510 }, { "epoch": 3.0400447427293065, - "grad_norm": 4.5185089111328125, + "grad_norm": 6.502699851989746, "learning_rate": 3.6664180959482974e-05, - "loss": 0.4251, + "loss": 0.2626, "step": 1520 }, { "epoch": 3.042281879194631, - "grad_norm": 5.390605926513672, + "grad_norm": 11.717517852783203, "learning_rate": 3.6539895600298286e-05, - "loss": 0.5748, + "loss": 0.6785, "step": 1530 }, { "epoch": 3.0445190156599553, - "grad_norm": 2.755403995513916, + "grad_norm": 9.127665519714355, "learning_rate": 3.64156102411136e-05, - "loss": 0.4585, + "loss": 0.5052, "step": 1540 }, { "epoch": 3.0467561521252797, - "grad_norm": 4.787998676300049, + "grad_norm": 6.234489917755127, "learning_rate": 3.629132488192891e-05, - "loss": 0.6129, + "loss": 0.4078, "step": 1550 }, { "epoch": 3.048993288590604, - "grad_norm": 3.5947635173797607, + "grad_norm": 13.028934478759766, "learning_rate": 3.616703952274422e-05, - "loss": 0.4497, + "loss": 0.5171, "step": 1560 }, { "epoch": 3.0512304250559286, - "grad_norm": 9.294822692871094, + "grad_norm": 9.663383483886719, "learning_rate": 3.6042754163559535e-05, - "loss": 0.5038, + "loss": 0.6043, "step": 1570 }, { "epoch": 3.053467561521253, - "grad_norm": 4.422880172729492, + "grad_norm": 3.3989367485046387, "learning_rate": 3.591846880437484e-05, - "loss": 0.4785, + "loss": 0.4231, "step": 1580 }, { "epoch": 3.0557046979865774, - "grad_norm": 8.102038383483887, + "grad_norm": 6.579158782958984, "learning_rate": 3.579418344519016e-05, - "loss": 0.6245, + "loss": 0.515, "step": 1590 }, { "epoch": 3.0579418344519014, - "grad_norm": 4.504268169403076, + "grad_norm": 5.151082515716553, "learning_rate": 3.566989808600547e-05, - "loss": 0.4765, + "loss": 0.3919, "step": 1600 }, { "epoch": 3.060178970917226, - "grad_norm": 9.747063636779785, + "grad_norm": 2.145969867706299, "learning_rate": 3.5545612726820785e-05, - "loss": 0.6108, + "loss": 0.4733, "step": 1610 }, { "epoch": 3.06241610738255, - "grad_norm": 2.142608165740967, + "grad_norm": 5.741364002227783, "learning_rate": 3.542132736763609e-05, - "loss": 0.5031, + "loss": 0.4354, "step": 1620 }, { "epoch": 3.0646532438478746, - "grad_norm": 4.941412448883057, + "grad_norm": 3.9511780738830566, "learning_rate": 3.529704200845141e-05, - "loss": 0.4812, + "loss": 0.443, "step": 1630 }, { "epoch": 3.066890380313199, - "grad_norm": 9.923797607421875, + "grad_norm": 6.973093509674072, "learning_rate": 3.517275664926672e-05, - "loss": 0.4473, + "loss": 0.4201, "step": 1640 }, { "epoch": 3.0691275167785235, - "grad_norm": 4.434045791625977, + "grad_norm": 1.0698981285095215, "learning_rate": 3.504847129008203e-05, - "loss": 0.4739, + "loss": 0.485, "step": 1650 }, { "epoch": 3.071364653243848, - "grad_norm": 4.894314765930176, + "grad_norm": 6.2486701011657715, "learning_rate": 3.492418593089734e-05, - "loss": 0.5424, + "loss": 0.4176, "step": 1660 }, { "epoch": 3.0736017897091723, - "grad_norm": 3.4307405948638916, + "grad_norm": 2.134953022003174, "learning_rate": 3.479990057171265e-05, - "loss": 0.5213, + "loss": 0.3947, "step": 1670 }, { "epoch": 3.0758389261744967, - "grad_norm": 2.12326979637146, + "grad_norm": 1.0479804277420044, "learning_rate": 3.4675615212527964e-05, - "loss": 0.4881, + "loss": 0.3886, "step": 1680 }, { "epoch": 3.078076062639821, - "grad_norm": 3.1597368717193604, + "grad_norm": 1.2134567499160767, "learning_rate": 3.455132985334328e-05, - "loss": 0.4149, + "loss": 0.3022, "step": 1690 }, { "epoch": 3.0803131991051456, - "grad_norm": 8.707585334777832, + "grad_norm": 10.898287773132324, "learning_rate": 3.442704449415859e-05, - "loss": 0.4081, + "loss": 0.3078, "step": 1700 }, { "epoch": 3.08255033557047, - "grad_norm": 14.810624122619629, + "grad_norm": 18.389766693115234, "learning_rate": 3.43027591349739e-05, - "loss": 0.569, + "loss": 0.663, "step": 1710 }, { "epoch": 3.0847874720357944, - "grad_norm": 2.331845998764038, + "grad_norm": 2.9712672233581543, "learning_rate": 3.4178473775789214e-05, - "loss": 0.4592, + "loss": 0.4882, "step": 1720 }, { "epoch": 3.0870246085011184, - "grad_norm": 0.9509165287017822, + "grad_norm": 4.190480709075928, "learning_rate": 3.4054188416604526e-05, - "loss": 0.3231, + "loss": 0.4099, "step": 1730 }, { "epoch": 3.089261744966443, - "grad_norm": 7.728116989135742, + "grad_norm": 5.036893367767334, "learning_rate": 3.392990305741984e-05, - "loss": 0.4468, + "loss": 0.3227, "step": 1740 }, { "epoch": 3.091498881431767, - "grad_norm": 5.856037616729736, + "grad_norm": 3.94989013671875, "learning_rate": 3.380561769823515e-05, - "loss": 0.4978, + "loss": 0.3345, "step": 1750 }, { "epoch": 3.0937360178970916, - "grad_norm": 5.866270065307617, + "grad_norm": 10.000751495361328, "learning_rate": 3.3681332339050456e-05, - "loss": 0.3826, + "loss": 0.3655, "step": 1760 }, { "epoch": 3.095973154362416, - "grad_norm": 4.168057918548584, + "grad_norm": 6.677926063537598, "learning_rate": 3.3557046979865775e-05, - "loss": 0.4231, + "loss": 0.2954, "step": 1770 }, { "epoch": 3.0982102908277405, - "grad_norm": 11.238313674926758, + "grad_norm": 10.632355690002441, "learning_rate": 3.343276162068109e-05, - "loss": 0.5038, + "loss": 0.4427, "step": 1780 }, { "epoch": 3.1, - "eval_loss": 0.7296048998832703, - "eval_runtime": 937.8481, - "eval_samples_per_second": 5.272, - "eval_steps_per_second": 0.659, + "eval_loss": 0.8546826243400574, + "eval_runtime": 891.5992, + "eval_samples_per_second": 8.318, + "eval_steps_per_second": 1.04, "step": 1788 }, { "epoch": 4.000447427293065, - "grad_norm": 8.892515182495117, + "grad_norm": 16.46874237060547, "learning_rate": 3.330847626149639e-05, - "loss": 0.478, + "loss": 0.6448, "step": 1790 }, { "epoch": 4.00268456375839, - "grad_norm": 5.279501914978027, + "grad_norm": 7.57173490524292, "learning_rate": 3.3184190902311706e-05, - "loss": 0.4808, + "loss": 0.4691, "step": 1800 }, { "epoch": 4.004921700223714, - "grad_norm": 1.478691577911377, + "grad_norm": 6.603734493255615, "learning_rate": 3.3059905543127025e-05, - "loss": 0.4722, + "loss": 0.4373, "step": 1810 }, { "epoch": 4.007158836689038, - "grad_norm": 9.76483154296875, + "grad_norm": 5.733815670013428, "learning_rate": 3.293562018394234e-05, - "loss": 0.4112, + "loss": 0.4219, "step": 1820 }, { "epoch": 4.009395973154362, - "grad_norm": 3.3786661624908447, + "grad_norm": 0.5362582206726074, "learning_rate": 3.281133482475764e-05, - "loss": 0.4243, + "loss": 0.2901, "step": 1830 }, { "epoch": 4.011633109619686, - "grad_norm": 4.852506637573242, + "grad_norm": 9.654644012451172, "learning_rate": 3.2687049465572955e-05, - "loss": 0.3942, + "loss": 0.4001, "step": 1840 }, { "epoch": 4.013870246085011, - "grad_norm": 7.3863301277160645, + "grad_norm": 5.657355785369873, "learning_rate": 3.2562764106388274e-05, - "loss": 0.5132, + "loss": 0.3882, "step": 1850 }, { "epoch": 4.016107382550335, - "grad_norm": 4.083529472351074, + "grad_norm": 4.895392417907715, "learning_rate": 3.243847874720358e-05, - "loss": 0.355, + "loss": 0.402, "step": 1860 }, { "epoch": 4.01834451901566, - "grad_norm": 6.427587032318115, + "grad_norm": 7.476536750793457, "learning_rate": 3.231419338801889e-05, - "loss": 0.4309, + "loss": 0.2948, "step": 1870 }, { "epoch": 4.020581655480984, - "grad_norm": 11.749137878417969, + "grad_norm": 15.446544647216797, "learning_rate": 3.2189908028834204e-05, - "loss": 0.5108, + "loss": 0.3745, "step": 1880 }, { "epoch": 4.0228187919463085, - "grad_norm": 8.5358304977417, + "grad_norm": 9.441873550415039, "learning_rate": 3.206562266964952e-05, - "loss": 0.3922, + "loss": 0.4731, "step": 1890 }, { "epoch": 4.025055928411633, - "grad_norm": 5.002346038818359, + "grad_norm": 2.744432210922241, "learning_rate": 3.194133731046483e-05, - "loss": 0.3854, + "loss": 0.5575, "step": 1900 }, { "epoch": 4.027293064876957, - "grad_norm": 4.695934295654297, + "grad_norm": 7.594290733337402, "learning_rate": 3.181705195128014e-05, - "loss": 0.3627, + "loss": 0.463, "step": 1910 }, { "epoch": 4.029530201342282, - "grad_norm": 9.114057540893555, + "grad_norm": 9.001227378845215, "learning_rate": 3.1692766592095454e-05, - "loss": 0.3789, + "loss": 0.3622, "step": 1920 }, { "epoch": 4.031767337807606, - "grad_norm": 13.6981782913208, + "grad_norm": 12.734862327575684, "learning_rate": 3.156848123291076e-05, - "loss": 0.3505, + "loss": 0.3435, "step": 1930 }, { "epoch": 4.034004474272931, - "grad_norm": 1.9973479509353638, + "grad_norm": 1.6699249744415283, "learning_rate": 3.144419587372608e-05, - "loss": 0.3911, + "loss": 0.2984, "step": 1940 }, { "epoch": 4.036241610738255, - "grad_norm": 12.077263832092285, + "grad_norm": 14.737456321716309, "learning_rate": 3.131991051454139e-05, - "loss": 0.4104, + "loss": 0.4041, "step": 1950 }, { "epoch": 4.0384787472035795, - "grad_norm": 9.706670761108398, + "grad_norm": 5.71207857131958, "learning_rate": 3.11956251553567e-05, - "loss": 0.4253, + "loss": 0.3152, "step": 1960 }, { "epoch": 4.040715883668904, - "grad_norm": 3.5470070838928223, + "grad_norm": 12.913744926452637, "learning_rate": 3.107133979617201e-05, - "loss": 0.4829, + "loss": 0.4892, "step": 1970 }, { "epoch": 4.042953020134228, - "grad_norm": 9.14087200164795, + "grad_norm": 12.614986419677734, "learning_rate": 3.094705443698732e-05, - "loss": 0.4324, + "loss": 0.4399, "step": 1980 }, { "epoch": 4.045190156599553, - "grad_norm": 3.2850630283355713, + "grad_norm": 10.273175239562988, "learning_rate": 3.082276907780264e-05, - "loss": 0.4274, + "loss": 0.5362, "step": 1990 }, { "epoch": 4.047427293064877, - "grad_norm": 6.201521873474121, + "grad_norm": 1.00068199634552, "learning_rate": 3.0698483718617946e-05, - "loss": 0.373, + "loss": 0.4019, "step": 2000 }, { "epoch": 4.049664429530202, - "grad_norm": 2.3075242042541504, + "grad_norm": 3.2468674182891846, "learning_rate": 3.057419835943326e-05, - "loss": 0.2872, + "loss": 0.2908, "step": 2010 }, { "epoch": 4.051901565995526, - "grad_norm": 10.908852577209473, + "grad_norm": 13.30661392211914, "learning_rate": 3.044991300024857e-05, - "loss": 0.3557, + "loss": 0.4924, "step": 2020 }, { "epoch": 4.05413870246085, - "grad_norm": 10.656627655029297, + "grad_norm": 7.9545063972473145, "learning_rate": 3.0325627641063886e-05, - "loss": 0.4079, + "loss": 0.3895, "step": 2030 }, { "epoch": 4.056375838926175, - "grad_norm": 8.033869743347168, + "grad_norm": 9.699666023254395, "learning_rate": 3.02013422818792e-05, - "loss": 0.2665, + "loss": 0.2904, "step": 2040 }, { "epoch": 4.058612975391499, - "grad_norm": 3.8938560485839844, + "grad_norm": 6.4541707038879395, "learning_rate": 3.0077056922694508e-05, - "loss": 0.4266, + "loss": 0.4361, "step": 2050 }, { "epoch": 4.060850111856824, - "grad_norm": 11.372118949890137, + "grad_norm": 22.470691680908203, "learning_rate": 2.995277156350982e-05, - "loss": 0.4161, + "loss": 0.3353, "step": 2060 }, { "epoch": 4.063087248322148, - "grad_norm": 7.895952224731445, + "grad_norm": 8.081043243408203, "learning_rate": 2.9828486204325136e-05, - "loss": 0.5103, + "loss": 0.492, "step": 2070 }, { "epoch": 4.065324384787472, - "grad_norm": 5.879835605621338, + "grad_norm": 1.5061018466949463, "learning_rate": 2.9704200845140445e-05, - "loss": 0.424, + "loss": 0.3086, "step": 2080 }, { "epoch": 4.067561521252796, - "grad_norm": 1.7476272583007812, + "grad_norm": 0.6852089762687683, "learning_rate": 2.9579915485955757e-05, - "loss": 0.403, + "loss": 0.389, "step": 2090 }, { "epoch": 4.06979865771812, - "grad_norm": 11.164639472961426, + "grad_norm": 12.416427612304688, "learning_rate": 2.9455630126771066e-05, - "loss": 0.3045, + "loss": 0.3055, "step": 2100 }, { "epoch": 4.072035794183445, - "grad_norm": 4.138221740722656, + "grad_norm": 7.986660957336426, "learning_rate": 2.9331344767586378e-05, - "loss": 0.3905, + "loss": 0.5869, "step": 2110 }, { "epoch": 4.074272930648769, - "grad_norm": 11.378765106201172, + "grad_norm": 14.885111808776855, "learning_rate": 2.9207059408401694e-05, - "loss": 0.6302, + "loss": 0.3224, "step": 2120 }, { "epoch": 4.076510067114094, - "grad_norm": 4.001158714294434, + "grad_norm": 10.167362213134766, "learning_rate": 2.9082774049217003e-05, - "loss": 0.3419, + "loss": 0.3044, "step": 2130 }, { "epoch": 4.078747203579418, - "grad_norm": 5.916733264923096, + "grad_norm": 6.552039623260498, "learning_rate": 2.8958488690032315e-05, - "loss": 0.3233, + "loss": 0.2917, "step": 2140 }, { "epoch": 4.0809843400447425, - "grad_norm": 9.534823417663574, + "grad_norm": 7.694530487060547, "learning_rate": 2.8834203330847624e-05, - "loss": 0.4643, + "loss": 0.4193, "step": 2150 }, { "epoch": 4.083221476510067, - "grad_norm": 8.507469177246094, + "grad_norm": 7.818331241607666, "learning_rate": 2.8709917971662943e-05, - "loss": 0.391, + "loss": 0.3355, "step": 2160 }, { "epoch": 4.085458612975391, - "grad_norm": 12.570788383483887, + "grad_norm": 18.491985321044922, "learning_rate": 2.8585632612478252e-05, - "loss": 0.3059, + "loss": 0.2317, "step": 2170 }, { "epoch": 4.087695749440716, - "grad_norm": 6.391950607299805, + "grad_norm": 6.1848602294921875, "learning_rate": 2.8461347253293565e-05, - "loss": 0.3738, + "loss": 0.2758, "step": 2180 }, { "epoch": 4.08993288590604, - "grad_norm": 14.067455291748047, + "grad_norm": 13.520557403564453, "learning_rate": 2.8337061894108874e-05, - "loss": 0.4277, + "loss": 0.439, "step": 2190 }, { "epoch": 4.092170022371365, - "grad_norm": 6.279323101043701, + "grad_norm": 9.846938133239746, "learning_rate": 2.8212776534924186e-05, - "loss": 0.4808, + "loss": 0.4711, "step": 2200 }, { "epoch": 4.094407158836689, - "grad_norm": 12.135629653930664, + "grad_norm": 5.90399694442749, "learning_rate": 2.80884911757395e-05, - "loss": 0.2029, + "loss": 0.085, "step": 2210 }, { "epoch": 4.0966442953020135, - "grad_norm": 11.846600532531738, + "grad_norm": 16.939096450805664, "learning_rate": 2.796420581655481e-05, - "loss": 0.4229, + "loss": 0.4099, "step": 2220 }, { "epoch": 4.098881431767338, - "grad_norm": 3.3600635528564453, + "grad_norm": 1.1104991436004639, "learning_rate": 2.7839920457370123e-05, - "loss": 0.3927, + "loss": 0.4496, "step": 2230 }, { "epoch": 4.1, - "eval_loss": 0.35884907841682434, - "eval_runtime": 935.0139, - "eval_samples_per_second": 5.288, - "eval_steps_per_second": 0.661, + "eval_loss": 0.3794967830181122, + "eval_runtime": 891.9924, + "eval_samples_per_second": 8.314, + "eval_steps_per_second": 1.039, "step": 2235 }, { "epoch": 5.001118568232662, - "grad_norm": 4.892012119293213, + "grad_norm": 4.833787441253662, "learning_rate": 2.7715635098185432e-05, - "loss": 0.308, + "loss": 0.2992, "step": 2240 }, { "epoch": 5.003355704697986, - "grad_norm": 3.3251285552978516, + "grad_norm": 5.847362518310547, "learning_rate": 2.7591349739000748e-05, - "loss": 0.2273, + "loss": 0.2596, "step": 2250 }, { "epoch": 5.005592841163311, - "grad_norm": 22.025094985961914, + "grad_norm": 18.474834442138672, "learning_rate": 2.746706437981606e-05, - "loss": 0.5861, + "loss": 0.4849, "step": 2260 }, { "epoch": 5.007829977628635, - "grad_norm": 16.336713790893555, + "grad_norm": 12.474526405334473, "learning_rate": 2.734277902063137e-05, - "loss": 0.4155, + "loss": 0.3275, "step": 2270 }, { "epoch": 5.010067114093959, - "grad_norm": 1.52274489402771, + "grad_norm": 1.1569851636886597, "learning_rate": 2.721849366144668e-05, - "loss": 0.3442, + "loss": 0.3104, "step": 2280 }, { "epoch": 5.012304250559284, - "grad_norm": 7.693943977355957, + "grad_norm": 10.059696197509766, "learning_rate": 2.7094208302261997e-05, - "loss": 0.1632, + "loss": 0.1573, "step": 2290 }, { "epoch": 5.014541387024608, - "grad_norm": 10.348392486572266, + "grad_norm": 4.665347099304199, "learning_rate": 2.696992294307731e-05, - "loss": 0.3964, + "loss": 0.4128, "step": 2300 }, { "epoch": 5.016778523489933, - "grad_norm": 11.288122177124023, + "grad_norm": 13.133956909179688, "learning_rate": 2.6845637583892618e-05, - "loss": 0.5684, + "loss": 0.5156, "step": 2310 }, { "epoch": 5.019015659955257, - "grad_norm": 1.0990941524505615, + "grad_norm": 1.0956755876541138, "learning_rate": 2.672135222470793e-05, - "loss": 0.3866, + "loss": 0.3676, "step": 2320 }, { "epoch": 5.0212527964205815, - "grad_norm": 7.708190441131592, + "grad_norm": 8.179460525512695, "learning_rate": 2.659706686552324e-05, - "loss": 0.3275, + "loss": 0.3082, "step": 2330 }, { "epoch": 5.023489932885906, - "grad_norm": 0.9472073912620544, + "grad_norm": 1.9563068151474, "learning_rate": 2.6472781506338555e-05, - "loss": 0.1861, + "loss": 0.2365, "step": 2340 }, { "epoch": 5.02572706935123, - "grad_norm": 13.988951683044434, + "grad_norm": 9.728827476501465, "learning_rate": 2.6348496147153868e-05, - "loss": 0.4737, + "loss": 0.6143, "step": 2350 }, { "epoch": 5.027964205816555, - "grad_norm": 18.836071014404297, + "grad_norm": 0.4499684274196625, "learning_rate": 2.6224210787969177e-05, - "loss": 0.3949, + "loss": 0.1527, "step": 2360 }, { "epoch": 5.030201342281879, - "grad_norm": 11.103631973266602, + "grad_norm": 4.692073345184326, "learning_rate": 2.609992542878449e-05, - "loss": 0.3836, + "loss": 0.4215, "step": 2370 }, { "epoch": 5.032438478747204, - "grad_norm": 13.336675643920898, + "grad_norm": 0.4127592146396637, "learning_rate": 2.5975640069599805e-05, - "loss": 0.349, + "loss": 0.3132, "step": 2380 }, { "epoch": 5.034675615212528, - "grad_norm": 10.622421264648438, + "grad_norm": 5.006928443908691, "learning_rate": 2.5851354710415117e-05, - "loss": 0.3322, + "loss": 0.243, "step": 2390 }, { "epoch": 5.0369127516778525, - "grad_norm": 4.1144843101501465, + "grad_norm": 9.491246223449707, "learning_rate": 2.5727069351230426e-05, - "loss": 0.2823, + "loss": 0.4203, "step": 2400 }, { "epoch": 5.039149888143177, - "grad_norm": 4.660728454589844, + "grad_norm": 1.1350034475326538, "learning_rate": 2.560278399204574e-05, - "loss": 0.2464, + "loss": 0.3657, "step": 2410 }, { "epoch": 5.041387024608501, - "grad_norm": 7.401895523071289, + "grad_norm": 17.347612380981445, "learning_rate": 2.5478498632861047e-05, - "loss": 0.3123, + "loss": 0.4972, "step": 2420 }, { "epoch": 5.043624161073826, - "grad_norm": 2.969099283218384, + "grad_norm": 2.2265026569366455, "learning_rate": 2.5354213273676363e-05, - "loss": 0.2394, + "loss": 0.3345, "step": 2430 }, { "epoch": 5.04586129753915, - "grad_norm": 9.645813941955566, + "grad_norm": 4.642486572265625, "learning_rate": 2.5229927914491675e-05, - "loss": 0.2217, + "loss": 0.2684, "step": 2440 }, { "epoch": 5.0480984340044746, - "grad_norm": 6.4596734046936035, + "grad_norm": 12.171128273010254, "learning_rate": 2.5105642555306984e-05, - "loss": 0.5332, + "loss": 0.4424, "step": 2450 }, { "epoch": 5.050335570469799, - "grad_norm": 5.598334312438965, + "grad_norm": 11.319775581359863, "learning_rate": 2.49813571961223e-05, - "loss": 0.3076, + "loss": 0.2878, "step": 2460 }, { "epoch": 5.052572706935123, - "grad_norm": 8.19486141204834, + "grad_norm": 7.691583156585693, "learning_rate": 2.485707183693761e-05, - "loss": 0.5466, + "loss": 0.3732, "step": 2470 }, { "epoch": 5.054809843400448, - "grad_norm": 9.856636047363281, + "grad_norm": 11.818854331970215, "learning_rate": 2.473278647775292e-05, - "loss": 0.3923, + "loss": 0.3303, "step": 2480 }, { "epoch": 5.057046979865772, - "grad_norm": 9.83061695098877, + "grad_norm": 10.7247314453125, "learning_rate": 2.4608501118568234e-05, - "loss": 0.2615, + "loss": 0.2182, "step": 2490 }, { "epoch": 5.059284116331096, - "grad_norm": 5.30068302154541, + "grad_norm": 11.010503768920898, "learning_rate": 2.4484215759383546e-05, - "loss": 0.4623, + "loss": 0.3295, "step": 2500 }, { "epoch": 5.06152125279642, - "grad_norm": 11.414236068725586, + "grad_norm": 8.999567985534668, "learning_rate": 2.435993040019886e-05, - "loss": 0.3316, + "loss": 0.2326, "step": 2510 }, { "epoch": 5.063758389261745, - "grad_norm": 3.8756637573242188, + "grad_norm": 7.073877334594727, "learning_rate": 2.4235645041014167e-05, - "loss": 0.2236, + "loss": 0.2705, "step": 2520 }, { "epoch": 5.065995525727069, - "grad_norm": 14.053914070129395, + "grad_norm": 11.623409271240234, "learning_rate": 2.4111359681829483e-05, - "loss": 0.3271, + "loss": 0.3234, "step": 2530 }, { "epoch": 5.068232662192393, - "grad_norm": 10.499263763427734, + "grad_norm": 4.587973594665527, "learning_rate": 2.3987074322644792e-05, - "loss": 0.5217, + "loss": 0.2505, "step": 2540 }, { "epoch": 5.070469798657718, - "grad_norm": 6.821963787078857, + "grad_norm": 11.883222579956055, "learning_rate": 2.3862788963460104e-05, - "loss": 0.3827, + "loss": 0.456, "step": 2550 }, { "epoch": 5.072706935123042, - "grad_norm": 7.142666339874268, + "grad_norm": 21.06523323059082, "learning_rate": 2.3738503604275417e-05, - "loss": 0.3265, + "loss": 0.2691, "step": 2560 }, { "epoch": 5.074944071588367, - "grad_norm": 3.25323486328125, + "grad_norm": 12.439352989196777, "learning_rate": 2.361421824509073e-05, - "loss": 0.2221, + "loss": 0.2216, "step": 2570 }, { "epoch": 5.077181208053691, - "grad_norm": 2.06369686126709, + "grad_norm": 0.9154367446899414, "learning_rate": 2.348993288590604e-05, - "loss": 0.2334, + "loss": 0.3547, "step": 2580 }, { "epoch": 5.0794183445190155, - "grad_norm": 4.083550930023193, + "grad_norm": 1.807629942893982, "learning_rate": 2.3365647526721354e-05, - "loss": 0.2098, + "loss": 0.2784, "step": 2590 }, { "epoch": 5.08165548098434, - "grad_norm": 22.284122467041016, + "grad_norm": 14.866148948669434, "learning_rate": 2.3241362167536666e-05, - "loss": 0.4255, + "loss": 0.4041, "step": 2600 }, { "epoch": 5.083892617449664, - "grad_norm": 7.050011157989502, + "grad_norm": 9.991622924804688, "learning_rate": 2.311707680835198e-05, - "loss": 0.4338, + "loss": 0.3884, "step": 2610 }, { "epoch": 5.086129753914989, - "grad_norm": 12.864763259887695, + "grad_norm": 11.396065711975098, "learning_rate": 2.2992791449167287e-05, - "loss": 0.217, + "loss": 0.1861, "step": 2620 }, { "epoch": 5.088366890380313, - "grad_norm": 8.292052268981934, + "grad_norm": 5.028550624847412, "learning_rate": 2.28685060899826e-05, - "loss": 0.2749, + "loss": 0.1773, "step": 2630 }, { "epoch": 5.090604026845638, - "grad_norm": 8.610766410827637, + "grad_norm": 7.7053022384643555, "learning_rate": 2.2744220730797912e-05, - "loss": 0.475, + "loss": 0.34, "step": 2640 }, { "epoch": 5.092841163310962, - "grad_norm": 6.8138933181762695, + "grad_norm": 11.335506439208984, "learning_rate": 2.2619935371613224e-05, - "loss": 0.2715, + "loss": 0.2359, "step": 2650 }, { "epoch": 5.0950782997762865, - "grad_norm": 13.458662986755371, + "grad_norm": 9.261770248413086, "learning_rate": 2.2495650012428537e-05, - "loss": 0.558, + "loss": 0.3856, "step": 2660 }, { "epoch": 5.097315436241611, - "grad_norm": 7.119832515716553, + "grad_norm": 8.961913108825684, "learning_rate": 2.237136465324385e-05, - "loss": 0.3312, + "loss": 0.4009, "step": 2670 }, { "epoch": 5.099552572706935, - "grad_norm": 10.518315315246582, + "grad_norm": 3.3359217643737793, "learning_rate": 2.224707929405916e-05, - "loss": 0.3167, + "loss": 0.3433, "step": 2680 }, { "epoch": 5.1, - "eval_loss": 0.4430215656757355, - "eval_runtime": 921.6901, - "eval_samples_per_second": 5.364, - "eval_steps_per_second": 0.671, + "eval_loss": 0.4118688702583313, + "eval_runtime": 892.4454, + "eval_samples_per_second": 8.31, + "eval_steps_per_second": 1.039, "step": 2682 }, { "epoch": 6.001789709172259, - "grad_norm": 8.830635070800781, + "grad_norm": 8.580872535705566, "learning_rate": 2.2122793934874474e-05, - "loss": 0.1378, + "loss": 0.1915, "step": 2690 }, { "epoch": 6.004026845637584, - "grad_norm": 8.985321044921875, + "grad_norm": 0.8801985383033752, "learning_rate": 2.1998508575689786e-05, - "loss": 0.2403, + "loss": 0.182, "step": 2700 }, { "epoch": 6.006263982102908, - "grad_norm": 8.969911575317383, + "grad_norm": 15.736348152160645, "learning_rate": 2.1874223216505095e-05, - "loss": 0.3703, + "loss": 0.4147, "step": 2710 }, { "epoch": 6.008501118568232, - "grad_norm": 13.08025074005127, + "grad_norm": 1.222266674041748, "learning_rate": 2.174993785732041e-05, - "loss": 0.236, + "loss": 0.2243, "step": 2720 }, { "epoch": 6.010738255033557, - "grad_norm": 10.97948169708252, + "grad_norm": 9.234151840209961, "learning_rate": 2.162565249813572e-05, - "loss": 0.2386, + "loss": 0.2065, "step": 2730 }, { "epoch": 6.012975391498881, - "grad_norm": 20.18202781677246, + "grad_norm": 17.84362030029297, "learning_rate": 2.1501367138951032e-05, - "loss": 0.3661, + "loss": 0.2676, "step": 2740 }, { "epoch": 6.015212527964206, - "grad_norm": 9.394488334655762, + "grad_norm": 7.234649658203125, "learning_rate": 2.1377081779766345e-05, - "loss": 0.3235, + "loss": 0.3512, "step": 2750 }, { "epoch": 6.01744966442953, - "grad_norm": 8.908599853515625, + "grad_norm": 5.13038969039917, "learning_rate": 2.1252796420581657e-05, - "loss": 0.4476, + "loss": 0.3029, "step": 2760 }, { "epoch": 6.0196868008948545, - "grad_norm": 10.415188789367676, + "grad_norm": 12.59158706665039, "learning_rate": 2.112851106139697e-05, - "loss": 0.3331, + "loss": 0.2794, "step": 2770 }, { "epoch": 6.021923937360179, - "grad_norm": 10.19694709777832, + "grad_norm": 14.324825286865234, "learning_rate": 2.1004225702212278e-05, - "loss": 0.2918, + "loss": 0.2144, "step": 2780 }, { "epoch": 6.024161073825503, - "grad_norm": 5.295567512512207, + "grad_norm": 6.485671043395996, "learning_rate": 2.0879940343027594e-05, - "loss": 0.2004, + "loss": 0.249, "step": 2790 }, { "epoch": 6.026398210290828, - "grad_norm": 3.8771309852600098, + "grad_norm": 1.8053562641143799, "learning_rate": 2.0755654983842903e-05, - "loss": 0.1936, + "loss": 0.2008, "step": 2800 }, { "epoch": 6.028635346756152, - "grad_norm": 10.931071281433105, + "grad_norm": 5.798049449920654, "learning_rate": 2.063136962465822e-05, - "loss": 0.2634, + "loss": 0.1985, "step": 2810 }, { "epoch": 6.030872483221477, - "grad_norm": 13.032493591308594, + "grad_norm": 11.126876831054688, "learning_rate": 2.0507084265473528e-05, - "loss": 0.4461, + "loss": 0.3294, "step": 2820 }, { "epoch": 6.033109619686801, - "grad_norm": 15.591374397277832, + "grad_norm": 20.707542419433594, "learning_rate": 2.038279890628884e-05, - "loss": 0.4873, + "loss": 0.4472, "step": 2830 }, { "epoch": 6.0353467561521255, - "grad_norm": 8.528480529785156, + "grad_norm": 7.778163909912109, "learning_rate": 2.0258513547104152e-05, - "loss": 0.2138, + "loss": 0.2675, "step": 2840 }, { "epoch": 6.03758389261745, - "grad_norm": 12.467988014221191, + "grad_norm": 8.803659439086914, "learning_rate": 2.013422818791946e-05, - "loss": 0.4351, + "loss": 0.244, "step": 2850 }, { "epoch": 6.039821029082774, - "grad_norm": 6.8514018058776855, + "grad_norm": 0.29662925004959106, "learning_rate": 2.0009942828734777e-05, - "loss": 0.2339, + "loss": 0.1816, "step": 2860 }, { "epoch": 6.042058165548099, - "grad_norm": 1.4219473600387573, + "grad_norm": 0.928829550743103, "learning_rate": 1.9885657469550086e-05, - "loss": 0.2019, + "loss": 0.3033, "step": 2870 }, { "epoch": 6.044295302013423, - "grad_norm": 1.5489323139190674, + "grad_norm": 0.4924483299255371, "learning_rate": 1.97613721103654e-05, - "loss": 0.3255, + "loss": 0.3565, "step": 2880 }, { "epoch": 6.0465324384787476, - "grad_norm": 9.738638877868652, + "grad_norm": 10.867852210998535, "learning_rate": 1.963708675118071e-05, - "loss": 0.2453, + "loss": 0.1718, "step": 2890 }, { "epoch": 6.048769574944072, - "grad_norm": 10.866888999938965, + "grad_norm": 0.17046819627285004, "learning_rate": 1.9512801391996023e-05, - "loss": 0.289, + "loss": 0.3193, "step": 2900 }, { "epoch": 6.051006711409396, - "grad_norm": 29.56283187866211, + "grad_norm": 21.9335994720459, "learning_rate": 1.9388516032811335e-05, - "loss": 0.3176, + "loss": 0.2725, "step": 2910 }, { "epoch": 6.05324384787472, - "grad_norm": 9.453397750854492, + "grad_norm": 7.8351030349731445, "learning_rate": 1.9264230673626648e-05, - "loss": 0.4457, + "loss": 0.2877, "step": 2920 }, { "epoch": 6.055480984340044, - "grad_norm": 1.6036638021469116, + "grad_norm": 1.0813746452331543, "learning_rate": 1.913994531444196e-05, - "loss": 0.2702, + "loss": 0.2619, "step": 2930 }, { "epoch": 6.057718120805369, - "grad_norm": 11.915915489196777, + "grad_norm": 4.7920241355896, "learning_rate": 1.9015659955257272e-05, - "loss": 0.3377, + "loss": 0.182, "step": 2940 }, { "epoch": 6.059955257270693, - "grad_norm": 8.017142295837402, + "grad_norm": 18.19488525390625, "learning_rate": 1.8891374596072585e-05, - "loss": 0.4199, + "loss": 0.2848, "step": 2950 }, { "epoch": 6.062192393736018, - "grad_norm": 3.0255606174468994, + "grad_norm": 2.5479190349578857, "learning_rate": 1.8767089236887894e-05, - "loss": 0.2819, + "loss": 0.3099, "step": 2960 }, { "epoch": 6.064429530201342, - "grad_norm": 12.458708763122559, + "grad_norm": 21.13016128540039, "learning_rate": 1.864280387770321e-05, - "loss": 0.3564, + "loss": 0.4713, "step": 2970 }, { "epoch": 6.066666666666666, - "grad_norm": 0.24730181694030762, + "grad_norm": 0.09484589099884033, "learning_rate": 1.8518518518518518e-05, - "loss": 0.2035, + "loss": 0.2758, "step": 2980 }, { "epoch": 6.068903803131991, - "grad_norm": 15.799874305725098, + "grad_norm": 11.212635040283203, "learning_rate": 1.839423315933383e-05, - "loss": 0.2655, + "loss": 0.2305, "step": 2990 }, { "epoch": 6.071140939597315, - "grad_norm": 15.106266021728516, + "grad_norm": 8.96382999420166, "learning_rate": 1.8269947800149143e-05, - "loss": 0.2328, + "loss": 0.2464, "step": 3000 }, { "epoch": 6.07337807606264, - "grad_norm": 9.420243263244629, + "grad_norm": 8.752419471740723, "learning_rate": 1.8145662440964455e-05, - "loss": 0.2431, + "loss": 0.1472, "step": 3010 }, { "epoch": 6.075615212527964, - "grad_norm": 6.75723123550415, + "grad_norm": 9.717801094055176, "learning_rate": 1.8021377081779768e-05, - "loss": 0.4296, + "loss": 0.3221, "step": 3020 }, { "epoch": 6.0778523489932885, - "grad_norm": 8.274847984313965, + "grad_norm": 5.287158966064453, "learning_rate": 1.789709172259508e-05, - "loss": 0.4351, + "loss": 0.2747, "step": 3030 }, { "epoch": 6.080089485458613, - "grad_norm": 4.256190299987793, + "grad_norm": 1.6286725997924805, "learning_rate": 1.7772806363410392e-05, - "loss": 0.0965, + "loss": 0.1127, "step": 3040 }, { "epoch": 6.082326621923937, - "grad_norm": 1.681492567062378, + "grad_norm": 12.31570053100586, "learning_rate": 1.7648521004225705e-05, - "loss": 0.4374, + "loss": 0.2879, "step": 3050 }, { "epoch": 6.084563758389262, - "grad_norm": 0.5820801854133606, + "grad_norm": 3.0768423080444336, "learning_rate": 1.7524235645041014e-05, - "loss": 0.3684, + "loss": 0.3264, "step": 3060 }, { "epoch": 6.086800894854586, - "grad_norm": 5.919391632080078, + "grad_norm": 6.542660713195801, "learning_rate": 1.7399950285856326e-05, - "loss": 0.2455, + "loss": 0.2626, "step": 3070 }, { "epoch": 6.089038031319911, - "grad_norm": 7.470072269439697, + "grad_norm": 22.778274536132812, "learning_rate": 1.727566492667164e-05, - "loss": 0.2138, + "loss": 0.2602, "step": 3080 }, { "epoch": 6.091275167785235, - "grad_norm": 5.051515102386475, + "grad_norm": 14.418547630310059, "learning_rate": 1.715137956748695e-05, - "loss": 0.2726, + "loss": 0.2574, "step": 3090 }, { "epoch": 6.0935123042505595, - "grad_norm": 0.34526365995407104, + "grad_norm": 1.627580165863037, "learning_rate": 1.7027094208302263e-05, - "loss": 0.178, + "loss": 0.2347, "step": 3100 }, { "epoch": 6.095749440715884, - "grad_norm": 17.414888381958008, + "grad_norm": 2.667323350906372, "learning_rate": 1.6902808849117575e-05, - "loss": 0.1793, + "loss": 0.4904, "step": 3110 }, { "epoch": 6.097986577181208, - "grad_norm": 23.270225524902344, + "grad_norm": 26.79123878479004, "learning_rate": 1.6778523489932888e-05, - "loss": 0.2943, + "loss": 0.2287, "step": 3120 }, { "epoch": 6.1, - "eval_loss": 0.4818915128707886, - "eval_runtime": 922.9566, - "eval_samples_per_second": 5.357, - "eval_steps_per_second": 0.67, + "eval_loss": 0.4823199510574341, + "eval_runtime": 898.1668, + "eval_samples_per_second": 8.257, + "eval_steps_per_second": 1.032, "step": 3129 }, { "epoch": 7.000223713646532, - "grad_norm": 2.266531229019165, + "grad_norm": 10.718249320983887, "learning_rate": 1.6654238130748197e-05, - "loss": 0.2625, + "loss": 0.2243, "step": 3130 }, { "epoch": 7.002460850111857, - "grad_norm": 24.349164962768555, + "grad_norm": 3.2877581119537354, "learning_rate": 1.6529952771563512e-05, - "loss": 0.28, + "loss": 0.3877, "step": 3140 }, { "epoch": 7.004697986577181, - "grad_norm": 0.4873502254486084, + "grad_norm": 2.2482826709747314, "learning_rate": 1.640566741237882e-05, - "loss": 0.1496, + "loss": 0.1283, "step": 3150 }, { "epoch": 7.006935123042505, - "grad_norm": 3.7403814792633057, + "grad_norm": 11.641459465026855, "learning_rate": 1.6281382053194137e-05, - "loss": 0.2493, + "loss": 0.2999, "step": 3160 }, { "epoch": 7.00917225950783, - "grad_norm": 0.49554741382598877, + "grad_norm": 6.360275745391846, "learning_rate": 1.6157096694009446e-05, - "loss": 0.3466, + "loss": 0.2999, "step": 3170 }, { "epoch": 7.011409395973154, - "grad_norm": 0.8320086598396301, + "grad_norm": 0.33911630511283875, "learning_rate": 1.603281133482476e-05, - "loss": 0.3601, + "loss": 0.3532, "step": 3180 }, { "epoch": 7.013646532438479, - "grad_norm": 2.373836040496826, + "grad_norm": 6.708487033843994, "learning_rate": 1.590852597564007e-05, - "loss": 0.4232, + "loss": 0.3293, "step": 3190 }, { "epoch": 7.015883668903803, - "grad_norm": 13.097466468811035, + "grad_norm": 7.4095025062561035, "learning_rate": 1.578424061645538e-05, - "loss": 0.2663, + "loss": 0.1617, "step": 3200 }, { "epoch": 7.0181208053691275, - "grad_norm": 2.8690104484558105, + "grad_norm": 23.493078231811523, "learning_rate": 1.5659955257270695e-05, - "loss": 0.1244, + "loss": 0.1497, "step": 3210 }, { "epoch": 7.020357941834452, - "grad_norm": 10.713217735290527, + "grad_norm": 7.098881244659424, "learning_rate": 1.5535669898086004e-05, - "loss": 0.2673, + "loss": 0.2247, "step": 3220 }, { "epoch": 7.022595078299776, - "grad_norm": 16.93208885192871, + "grad_norm": 8.431655883789062, "learning_rate": 1.541138453890132e-05, - "loss": 0.3606, + "loss": 0.2487, "step": 3230 }, { "epoch": 7.024832214765101, - "grad_norm": 10.654176712036133, + "grad_norm": 15.323188781738281, "learning_rate": 1.528709917971663e-05, - "loss": 0.1426, + "loss": 0.1421, "step": 3240 }, { "epoch": 7.027069351230425, - "grad_norm": 10.325299263000488, + "grad_norm": 7.675280570983887, "learning_rate": 1.5162813820531943e-05, - "loss": 0.4853, + "loss": 0.4883, "step": 3250 }, { "epoch": 7.02930648769575, - "grad_norm": 0.8497664332389832, + "grad_norm": 3.4720630645751953, "learning_rate": 1.5038528461347254e-05, - "loss": 0.3557, + "loss": 0.2715, "step": 3260 }, { "epoch": 7.031543624161074, - "grad_norm": 7.555183410644531, + "grad_norm": 1.7522681951522827, "learning_rate": 1.4914243102162568e-05, - "loss": 0.2883, + "loss": 0.1637, "step": 3270 }, { "epoch": 7.0337807606263985, - "grad_norm": 0.6843427419662476, + "grad_norm": 0.16172055900096893, "learning_rate": 1.4789957742977878e-05, - "loss": 0.1345, + "loss": 0.1717, "step": 3280 }, { "epoch": 7.036017897091723, - "grad_norm": 0.9340455532073975, + "grad_norm": 12.743756294250488, "learning_rate": 1.4665672383793189e-05, - "loss": 0.1728, + "loss": 0.3252, "step": 3290 }, { "epoch": 7.038255033557047, - "grad_norm": 12.420462608337402, + "grad_norm": 10.548824310302734, "learning_rate": 1.4541387024608501e-05, - "loss": 0.1637, + "loss": 0.0774, "step": 3300 }, { "epoch": 7.040492170022372, - "grad_norm": 18.193172454833984, + "grad_norm": 11.808302879333496, "learning_rate": 1.4417101665423812e-05, - "loss": 0.1866, + "loss": 0.3318, "step": 3310 }, { "epoch": 7.042729306487696, - "grad_norm": 7.021984100341797, + "grad_norm": 11.149969100952148, "learning_rate": 1.4292816306239126e-05, - "loss": 0.3043, + "loss": 0.3435, "step": 3320 }, { "epoch": 7.0449664429530205, - "grad_norm": 19.211868286132812, + "grad_norm": 16.549835205078125, "learning_rate": 1.4168530947054437e-05, - "loss": 0.1779, + "loss": 0.245, "step": 3330 }, { "epoch": 7.047203579418344, - "grad_norm": 1.4456816911697388, + "grad_norm": 0.44824355840682983, "learning_rate": 1.404424558786975e-05, - "loss": 0.2717, + "loss": 0.2654, "step": 3340 }, { "epoch": 7.0494407158836685, - "grad_norm": 5.062099456787109, + "grad_norm": 10.627558708190918, "learning_rate": 1.3919960228685061e-05, - "loss": 0.3043, + "loss": 0.2706, "step": 3350 }, { "epoch": 7.051677852348993, - "grad_norm": 1.7384883165359497, + "grad_norm": 4.539977073669434, "learning_rate": 1.3795674869500374e-05, - "loss": 0.2957, + "loss": 0.2529, "step": 3360 }, { "epoch": 7.053914988814317, - "grad_norm": 10.810503959655762, + "grad_norm": 25.88998031616211, "learning_rate": 1.3671389510315684e-05, - "loss": 0.1791, + "loss": 0.1102, "step": 3370 }, { "epoch": 7.056152125279642, - "grad_norm": 0.12086842954158783, + "grad_norm": 0.056022170931100845, "learning_rate": 1.3547104151130999e-05, - "loss": 0.0836, + "loss": 0.0937, "step": 3380 }, { "epoch": 7.058389261744966, - "grad_norm": 14.477333068847656, + "grad_norm": 27.33989715576172, "learning_rate": 1.3422818791946309e-05, - "loss": 0.4779, + "loss": 0.3445, "step": 3390 }, { "epoch": 7.060626398210291, - "grad_norm": 10.850499153137207, + "grad_norm": 27.60544204711914, "learning_rate": 1.329853343276162e-05, - "loss": 0.2134, + "loss": 0.3671, "step": 3400 }, { "epoch": 7.062863534675615, - "grad_norm": 0.5340798497200012, + "grad_norm": 0.10744742304086685, "learning_rate": 1.3174248073576934e-05, - "loss": 0.0879, + "loss": 0.2315, "step": 3410 }, { "epoch": 7.065100671140939, - "grad_norm": 20.357879638671875, + "grad_norm": 19.817075729370117, "learning_rate": 1.3049962714392244e-05, - "loss": 0.364, + "loss": 0.2397, "step": 3420 }, { "epoch": 7.067337807606264, - "grad_norm": 12.8193998336792, + "grad_norm": 20.652551651000977, "learning_rate": 1.2925677355207559e-05, - "loss": 0.1638, + "loss": 0.1723, "step": 3430 }, { "epoch": 7.069574944071588, - "grad_norm": 32.92828369140625, + "grad_norm": 3.9074671268463135, "learning_rate": 1.280139199602287e-05, - "loss": 0.438, + "loss": 0.2729, "step": 3440 }, { "epoch": 7.071812080536913, - "grad_norm": 1.9262471199035645, + "grad_norm": 0.27141252160072327, "learning_rate": 1.2677106636838182e-05, - "loss": 0.2231, + "loss": 0.3176, "step": 3450 }, { "epoch": 7.074049217002237, - "grad_norm": 13.616302490234375, + "grad_norm": 11.46578598022461, "learning_rate": 1.2552821277653492e-05, - "loss": 0.1296, + "loss": 0.2252, "step": 3460 }, { "epoch": 7.0762863534675615, - "grad_norm": 17.851036071777344, + "grad_norm": 16.85219383239746, "learning_rate": 1.2428535918468805e-05, - "loss": 0.119, + "loss": 0.1388, "step": 3470 }, { "epoch": 7.078523489932886, - "grad_norm": 9.76507568359375, + "grad_norm": 10.73118782043457, "learning_rate": 1.2304250559284117e-05, - "loss": 0.3196, + "loss": 0.2334, "step": 3480 }, { "epoch": 7.08076062639821, - "grad_norm": 3.3353357315063477, + "grad_norm": 0.460288941860199, "learning_rate": 1.217996520009943e-05, - "loss": 0.3447, + "loss": 0.0608, "step": 3490 }, { "epoch": 7.082997762863535, - "grad_norm": 19.23137664794922, + "grad_norm": 1.64030921459198, "learning_rate": 1.2055679840914742e-05, - "loss": 0.2249, + "loss": 0.1595, "step": 3500 }, { "epoch": 7.085234899328859, - "grad_norm": 19.729598999023438, + "grad_norm": 14.426569938659668, "learning_rate": 1.1931394481730052e-05, - "loss": 0.151, + "loss": 0.2432, "step": 3510 }, { "epoch": 7.087472035794184, - "grad_norm": 9.253190994262695, + "grad_norm": 13.951542854309082, "learning_rate": 1.1807109122545365e-05, - "loss": 0.2224, + "loss": 0.2007, "step": 3520 }, { "epoch": 7.089709172259508, - "grad_norm": 0.1582317054271698, + "grad_norm": 25.381717681884766, "learning_rate": 1.1682823763360677e-05, - "loss": 0.2683, + "loss": 0.2316, "step": 3530 }, { "epoch": 7.0919463087248324, - "grad_norm": 0.43652433156967163, + "grad_norm": 0.6340412497520447, "learning_rate": 1.155853840417599e-05, - "loss": 0.275, + "loss": 0.1972, "step": 3540 }, { "epoch": 7.094183445190157, - "grad_norm": 0.7390974760055542, + "grad_norm": 11.732870101928711, "learning_rate": 1.14342530449913e-05, - "loss": 0.1894, + "loss": 0.1388, "step": 3550 }, { "epoch": 7.096420581655481, - "grad_norm": 1.3036680221557617, + "grad_norm": 0.08352160453796387, "learning_rate": 1.1309967685806612e-05, - "loss": 0.2943, + "loss": 0.2276, "step": 3560 }, { "epoch": 7.098657718120806, - "grad_norm": 9.086507797241211, + "grad_norm": 0.6394329071044922, "learning_rate": 1.1185682326621925e-05, - "loss": 0.2089, + "loss": 0.1297, "step": 3570 }, { "epoch": 7.1, - "eval_loss": 0.3785615861415863, - "eval_runtime": 932.236, - "eval_samples_per_second": 5.303, - "eval_steps_per_second": 0.663, + "eval_loss": 0.4294538199901581, + "eval_runtime": 892.5544, + "eval_samples_per_second": 8.309, + "eval_steps_per_second": 1.039, "step": 3576 }, { "epoch": 8.00089485458613, - "grad_norm": 1.078548550605774, + "grad_norm": 12.145805358886719, "learning_rate": 1.1061396967437237e-05, - "loss": 0.2587, + "loss": 0.4217, "step": 3580 }, { "epoch": 8.003131991051454, - "grad_norm": 1.228070616722107, + "grad_norm": 16.92803382873535, "learning_rate": 1.0937111608252548e-05, - "loss": 0.1517, + "loss": 0.1421, "step": 3590 }, { "epoch": 8.00536912751678, - "grad_norm": 6.344273090362549, + "grad_norm": 6.773504257202148, "learning_rate": 1.081282624906786e-05, - "loss": 0.2672, + "loss": 0.1891, "step": 3600 }, { "epoch": 8.007606263982103, - "grad_norm": 10.512372970581055, + "grad_norm": 22.504379272460938, "learning_rate": 1.0688540889883172e-05, - "loss": 0.1911, + "loss": 0.1844, "step": 3610 }, { "epoch": 8.009843400447428, - "grad_norm": 9.218599319458008, + "grad_norm": 19.881702423095703, "learning_rate": 1.0564255530698485e-05, - "loss": 0.2245, + "loss": 0.2223, "step": 3620 }, { "epoch": 8.012080536912752, - "grad_norm": 4.7348952293396, + "grad_norm": 1.9907094240188599, "learning_rate": 1.0439970171513797e-05, - "loss": 0.1359, + "loss": 0.2041, "step": 3630 }, { "epoch": 8.014317673378075, - "grad_norm": 6.268092632293701, + "grad_norm": 0.3308212161064148, "learning_rate": 1.031568481232911e-05, - "loss": 0.1942, + "loss": 0.0968, "step": 3640 }, { "epoch": 8.0165548098434, - "grad_norm": 9.11316204071045, + "grad_norm": 1.7726011276245117, "learning_rate": 1.019139945314442e-05, - "loss": 0.0928, + "loss": 0.0571, "step": 3650 }, { "epoch": 8.018791946308724, - "grad_norm": 40.897552490234375, + "grad_norm": 29.911670684814453, "learning_rate": 1.006711409395973e-05, - "loss": 0.2588, + "loss": 0.1089, "step": 3660 }, { "epoch": 8.02102908277405, - "grad_norm": 0.3429583013057709, + "grad_norm": 5.603979587554932, "learning_rate": 9.942828734775043e-06, - "loss": 0.3107, + "loss": 0.0697, "step": 3670 }, { "epoch": 8.023266219239373, - "grad_norm": 0.48826009035110474, + "grad_norm": 0.07145686447620392, "learning_rate": 9.818543375590355e-06, - "loss": 0.2078, + "loss": 0.0768, "step": 3680 }, { "epoch": 8.025503355704698, - "grad_norm": 7.965569019317627, + "grad_norm": 1.745406985282898, "learning_rate": 9.694258016405668e-06, - "loss": 0.1233, + "loss": 0.0412, "step": 3690 }, { "epoch": 8.027740492170022, - "grad_norm": 0.34283313155174255, + "grad_norm": 0.02010565809905529, "learning_rate": 9.56997265722098e-06, - "loss": 0.1816, + "loss": 0.2096, "step": 3700 }, { "epoch": 8.029977628635347, - "grad_norm": 5.221133708953857, + "grad_norm": 8.72205638885498, "learning_rate": 9.445687298036292e-06, - "loss": 0.2169, + "loss": 0.1555, "step": 3710 }, { "epoch": 8.03221476510067, - "grad_norm": 0.05195072665810585, + "grad_norm": 0.01665619947016239, "learning_rate": 9.321401938851605e-06, - "loss": 0.237, + "loss": 0.2793, "step": 3720 }, { "epoch": 8.034451901565996, - "grad_norm": 0.1627640724182129, + "grad_norm": 0.004611628130078316, "learning_rate": 9.197116579666915e-06, - "loss": 0.2308, + "loss": 0.2735, "step": 3730 }, { "epoch": 8.03668903803132, - "grad_norm": 0.33864226937294006, + "grad_norm": 1.931004524230957, "learning_rate": 9.072831220482228e-06, - "loss": 0.1982, + "loss": 0.2636, "step": 3740 }, { "epoch": 8.038926174496645, - "grad_norm": 0.016831032931804657, + "grad_norm": 0.014065139926970005, "learning_rate": 8.94854586129754e-06, - "loss": 0.2632, + "loss": 0.2653, "step": 3750 }, { "epoch": 8.041163310961968, - "grad_norm": 1.7836061716079712, + "grad_norm": 0.5851568579673767, "learning_rate": 8.824260502112852e-06, - "loss": 0.0641, + "loss": 0.092, "step": 3760 }, { "epoch": 8.043400447427294, - "grad_norm": 20.223674774169922, + "grad_norm": 0.051146071404218674, "learning_rate": 8.699975142928163e-06, - "loss": 0.1735, + "loss": 0.2241, "step": 3770 }, { "epoch": 8.045637583892617, - "grad_norm": 16.769073486328125, + "grad_norm": 0.1566874086856842, "learning_rate": 8.575689783743475e-06, - "loss": 0.2878, + "loss": 0.1845, "step": 3780 }, { "epoch": 8.047874720357942, - "grad_norm": 26.629791259765625, + "grad_norm": 35.4766960144043, "learning_rate": 8.451404424558788e-06, - "loss": 0.1811, + "loss": 0.1556, "step": 3790 }, { "epoch": 8.050111856823266, - "grad_norm": 35.164878845214844, + "grad_norm": 44.98518753051758, "learning_rate": 8.327119065374098e-06, - "loss": 0.3123, + "loss": 0.2961, "step": 3800 }, { "epoch": 8.052348993288591, - "grad_norm": 1.7551312446594238, + "grad_norm": 1.2694602012634277, "learning_rate": 8.20283370618941e-06, - "loss": 0.1989, + "loss": 0.0785, "step": 3810 }, { "epoch": 8.054586129753915, - "grad_norm": 0.1438596099615097, + "grad_norm": 0.6759599447250366, "learning_rate": 8.078548347004723e-06, - "loss": 0.1427, + "loss": 0.31, "step": 3820 }, { "epoch": 8.05682326621924, - "grad_norm": 4.8719482421875, + "grad_norm": 5.972934246063232, "learning_rate": 7.954262987820035e-06, - "loss": 0.0963, + "loss": 0.0335, "step": 3830 }, { "epoch": 8.059060402684564, - "grad_norm": 2.296576499938965, + "grad_norm": 0.8728901743888855, "learning_rate": 7.829977628635348e-06, - "loss": 0.1478, + "loss": 0.2005, "step": 3840 }, { "epoch": 8.061297539149889, - "grad_norm": 7.730291366577148, + "grad_norm": 10.340746879577637, "learning_rate": 7.70569226945066e-06, - "loss": 0.1619, + "loss": 0.1724, "step": 3850 }, { "epoch": 8.063534675615212, - "grad_norm": 0.0443468801677227, + "grad_norm": 0.060672469437122345, "learning_rate": 7.5814069102659716e-06, - "loss": 0.1471, + "loss": 0.174, "step": 3860 }, { "epoch": 8.065771812080538, - "grad_norm": 1.623080849647522, + "grad_norm": 4.864527702331543, "learning_rate": 7.457121551081284e-06, - "loss": 0.2739, + "loss": 0.3543, "step": 3870 }, { "epoch": 8.068008948545861, - "grad_norm": 21.143207550048828, + "grad_norm": 4.48012638092041, "learning_rate": 7.3328361918965945e-06, - "loss": 0.2317, + "loss": 0.122, "step": 3880 }, { "epoch": 8.070246085011185, - "grad_norm": 0.43577510118484497, + "grad_norm": 0.07548043876886368, "learning_rate": 7.208550832711906e-06, - "loss": 0.3871, + "loss": 0.2838, "step": 3890 }, { "epoch": 8.07248322147651, - "grad_norm": 27.5056095123291, + "grad_norm": 18.938758850097656, "learning_rate": 7.084265473527218e-06, - "loss": 0.2158, + "loss": 0.1681, "step": 3900 }, { "epoch": 8.074720357941834, - "grad_norm": 0.9177855253219604, + "grad_norm": 1.3047031164169312, "learning_rate": 6.959980114342531e-06, - "loss": 0.1795, + "loss": 0.2001, "step": 3910 }, { "epoch": 8.076957494407159, - "grad_norm": 17.42781639099121, + "grad_norm": 12.20274829864502, "learning_rate": 6.835694755157842e-06, - "loss": 0.2471, + "loss": 0.2776, "step": 3920 }, { "epoch": 8.079194630872482, - "grad_norm": 10.758467674255371, + "grad_norm": 0.47563186287879944, "learning_rate": 6.7114093959731546e-06, - "loss": 0.0654, + "loss": 0.2005, "step": 3930 }, { "epoch": 8.081431767337808, - "grad_norm": 8.318502426147461, + "grad_norm": 5.735065460205078, "learning_rate": 6.587124036788467e-06, - "loss": 0.3004, + "loss": 0.1499, "step": 3940 }, { "epoch": 8.083668903803131, - "grad_norm": 11.846883773803711, + "grad_norm": 20.469074249267578, "learning_rate": 6.462838677603779e-06, - "loss": 0.2625, + "loss": 0.2258, "step": 3950 }, { "epoch": 8.085906040268457, - "grad_norm": 4.139098644256592, + "grad_norm": 3.2545881271362305, "learning_rate": 6.338553318419091e-06, - "loss": 0.1639, + "loss": 0.2216, "step": 3960 }, { "epoch": 8.08814317673378, - "grad_norm": 0.2406436949968338, + "grad_norm": 1.8031901121139526, "learning_rate": 6.214267959234402e-06, - "loss": 0.1516, + "loss": 0.1462, "step": 3970 }, { "epoch": 8.090380313199105, - "grad_norm": 5.129803657531738, + "grad_norm": 11.510761260986328, "learning_rate": 6.089982600049715e-06, - "loss": 0.2214, + "loss": 0.2225, "step": 3980 }, { "epoch": 8.092617449664429, - "grad_norm": 0.08740751445293427, + "grad_norm": 1.421615481376648, "learning_rate": 5.965697240865026e-06, - "loss": 0.2038, + "loss": 0.1203, "step": 3990 }, { "epoch": 8.094854586129754, - "grad_norm": 0.03226242959499359, + "grad_norm": 0.10247212648391724, "learning_rate": 5.8414118816803384e-06, - "loss": 0.1007, + "loss": 0.0633, "step": 4000 }, { "epoch": 8.097091722595078, - "grad_norm": 0.23406432569026947, + "grad_norm": 0.038117095828056335, "learning_rate": 5.71712652249565e-06, - "loss": 0.1954, + "loss": 0.1246, "step": 4010 }, { "epoch": 8.099328859060403, - "grad_norm": 14.286626815795898, + "grad_norm": 14.49740982055664, "learning_rate": 5.592841163310962e-06, - "loss": 0.2659, + "loss": 0.3104, "step": 4020 }, { "epoch": 8.1, - "eval_loss": 0.37856465578079224, - "eval_runtime": 924.4583, - "eval_samples_per_second": 5.348, - "eval_steps_per_second": 0.668, + "eval_loss": 0.40958455204963684, + "eval_runtime": 894.7196, + "eval_samples_per_second": 8.289, + "eval_steps_per_second": 1.036, "step": 4023 }, { "epoch": 9.001565995525727, - "grad_norm": 2.8623125553131104, + "grad_norm": 0.1057172641158104, "learning_rate": 5.468555804126274e-06, - "loss": 0.027, + "loss": 0.0831, "step": 4030 }, { "epoch": 9.003803131991052, - "grad_norm": 0.19636014103889465, + "grad_norm": 1.2070332765579224, "learning_rate": 5.344270444941586e-06, - "loss": 0.1193, + "loss": 0.0479, "step": 4040 }, { "epoch": 9.006040268456376, - "grad_norm": 0.22568732500076294, + "grad_norm": 0.6095995903015137, "learning_rate": 5.2199850857568985e-06, - "loss": 0.0185, + "loss": 0.2161, "step": 4050 }, { "epoch": 9.0082774049217, - "grad_norm": 16.95732307434082, + "grad_norm": 6.29830265045166, "learning_rate": 5.09569972657221e-06, - "loss": 0.1922, + "loss": 0.1455, "step": 4060 }, { "epoch": 9.010514541387025, - "grad_norm": 0.051336098462343216, + "grad_norm": 0.01811257004737854, "learning_rate": 4.9714143673875215e-06, - "loss": 0.1818, + "loss": 0.0465, "step": 4070 }, { "epoch": 9.012751677852348, - "grad_norm": 15.473747253417969, + "grad_norm": 17.725488662719727, "learning_rate": 4.847129008202834e-06, - "loss": 0.2227, + "loss": 0.3159, "step": 4080 }, { "epoch": 9.014988814317674, - "grad_norm": 1.2553575038909912, + "grad_norm": 27.672937393188477, "learning_rate": 4.722843649018146e-06, - "loss": 0.0386, + "loss": 0.0538, "step": 4090 }, { "epoch": 9.017225950782997, - "grad_norm": 15.090453147888184, + "grad_norm": 0.12884105741977692, "learning_rate": 4.598558289833458e-06, - "loss": 0.1785, + "loss": 0.2565, "step": 4100 }, { "epoch": 9.019463087248322, - "grad_norm": 3.2336063385009766, + "grad_norm": 20.479238510131836, "learning_rate": 4.47427293064877e-06, - "loss": 0.0169, + "loss": 0.1162, "step": 4110 }, { "epoch": 9.021700223713646, - "grad_norm": 0.01566535234451294, + "grad_norm": 23.51799774169922, "learning_rate": 4.3499875714640815e-06, - "loss": 0.1087, + "loss": 0.181, "step": 4120 }, { "epoch": 9.023937360178971, - "grad_norm": 40.838531494140625, + "grad_norm": 14.024354934692383, "learning_rate": 4.225702212279394e-06, - "loss": 0.1926, + "loss": 0.2026, "step": 4130 }, { "epoch": 9.026174496644295, - "grad_norm": 0.03625281900167465, + "grad_norm": 0.20324963331222534, "learning_rate": 4.101416853094705e-06, - "loss": 0.1113, + "loss": 0.0238, "step": 4140 }, { "epoch": 9.02841163310962, - "grad_norm": 0.037700217217206955, + "grad_norm": 0.17176759243011475, "learning_rate": 3.977131493910018e-06, - "loss": 0.0486, + "loss": 0.036, "step": 4150 }, { "epoch": 9.030648769574944, - "grad_norm": 28.63728141784668, + "grad_norm": 13.05589485168457, "learning_rate": 3.85284613472533e-06, - "loss": 0.2895, + "loss": 0.3049, "step": 4160 }, { "epoch": 9.032885906040269, - "grad_norm": 0.01208668015897274, + "grad_norm": 0.03032175451517105, "learning_rate": 3.728560775540642e-06, - "loss": 0.0776, + "loss": 0.043, "step": 4170 }, { "epoch": 9.035123042505592, - "grad_norm": 0.07349768280982971, + "grad_norm": 2.0046262741088867, "learning_rate": 3.604275416355953e-06, - "loss": 0.2092, + "loss": 0.2365, "step": 4180 }, { "epoch": 9.037360178970918, - "grad_norm": 0.4977450966835022, + "grad_norm": 0.10407610237598419, "learning_rate": 3.4799900571712654e-06, - "loss": 0.1928, + "loss": 0.0472, "step": 4190 }, { "epoch": 9.039597315436241, - "grad_norm": 0.0466020293533802, + "grad_norm": 0.7708104252815247, "learning_rate": 3.3557046979865773e-06, - "loss": 0.1642, + "loss": 0.2119, "step": 4200 }, { "epoch": 9.041834451901567, - "grad_norm": 0.30247071385383606, + "grad_norm": 0.13973136246204376, "learning_rate": 3.2314193388018896e-06, - "loss": 0.2029, + "loss": 0.1322, "step": 4210 }, { "epoch": 9.04407158836689, - "grad_norm": 13.724775314331055, + "grad_norm": 4.554214000701904, "learning_rate": 3.107133979617201e-06, - "loss": 0.1069, + "loss": 0.0044, "step": 4220 }, { "epoch": 9.046308724832215, - "grad_norm": 0.40144094824790955, + "grad_norm": 1.251420259475708, "learning_rate": 2.982848620432513e-06, - "loss": 0.1657, + "loss": 0.1559, "step": 4230 }, { "epoch": 9.048545861297539, - "grad_norm": 0.06872935593128204, + "grad_norm": 0.027974896132946014, "learning_rate": 2.858563261247825e-06, - "loss": 0.1699, + "loss": 0.1958, "step": 4240 }, { "epoch": 9.050782997762864, - "grad_norm": 10.61763858795166, + "grad_norm": 17.233055114746094, "learning_rate": 2.734277902063137e-06, - "loss": 0.1787, + "loss": 0.1606, "step": 4250 }, { "epoch": 9.053020134228188, - "grad_norm": 34.91901779174805, + "grad_norm": 0.4418050944805145, "learning_rate": 2.6099925428784492e-06, - "loss": 0.2871, + "loss": 0.19, "step": 4260 }, { "epoch": 9.055257270693513, - "grad_norm": 14.127365112304688, + "grad_norm": 7.108306884765625, "learning_rate": 2.4857071836937607e-06, - "loss": 0.2077, + "loss": 0.1778, "step": 4270 }, { "epoch": 9.057494407158837, - "grad_norm": 0.02785838022828102, + "grad_norm": 0.11051614582538605, "learning_rate": 2.361421824509073e-06, - "loss": 0.1807, + "loss": 0.1074, "step": 4280 }, { "epoch": 9.059731543624162, - "grad_norm": 0.04847564175724983, + "grad_norm": 0.011775967665016651, "learning_rate": 2.237136465324385e-06, - "loss": 0.0376, + "loss": 0.1294, "step": 4290 }, { "epoch": 9.061968680089485, - "grad_norm": 0.029158098623156548, + "grad_norm": 0.011663487181067467, "learning_rate": 2.112851106139697e-06, - "loss": 0.1871, + "loss": 0.0744, "step": 4300 }, { "epoch": 9.06420581655481, - "grad_norm": 23.394874572753906, + "grad_norm": 23.934040069580078, "learning_rate": 1.988565746955009e-06, - "loss": 0.1058, + "loss": 0.1064, "step": 4310 }, { "epoch": 9.066442953020134, - "grad_norm": 0.8079015016555786, + "grad_norm": 0.42981475591659546, "learning_rate": 1.864280387770321e-06, - "loss": 0.1675, + "loss": 0.0546, "step": 4320 }, { "epoch": 9.068680089485458, - "grad_norm": 24.60673713684082, + "grad_norm": 19.337921142578125, "learning_rate": 1.7399950285856327e-06, - "loss": 0.2744, + "loss": 0.2816, "step": 4330 }, { "epoch": 9.070917225950783, - "grad_norm": 0.17073792219161987, + "grad_norm": 0.4612930119037628, "learning_rate": 1.6157096694009448e-06, - "loss": 0.2612, + "loss": 0.136, "step": 4340 }, { "epoch": 9.073154362416107, - "grad_norm": 0.29150497913360596, + "grad_norm": 0.09723786264657974, "learning_rate": 1.4914243102162565e-06, - "loss": 0.2493, + "loss": 0.1669, "step": 4350 }, { "epoch": 9.075391498881432, - "grad_norm": 0.5105171203613281, + "grad_norm": 0.028177455067634583, "learning_rate": 1.3671389510315684e-06, - "loss": 0.2066, + "loss": 0.0808, "step": 4360 }, { "epoch": 9.077628635346755, - "grad_norm": 15.902856826782227, + "grad_norm": 13.670249938964844, "learning_rate": 1.2428535918468804e-06, - "loss": 0.0912, + "loss": 0.1512, "step": 4370 }, { "epoch": 9.07986577181208, - "grad_norm": 0.061424434185028076, + "grad_norm": 0.34152135252952576, "learning_rate": 1.1185682326621925e-06, - "loss": 0.1446, + "loss": 0.1886, "step": 4380 }, { "epoch": 9.082102908277404, - "grad_norm": 0.5502781271934509, + "grad_norm": 0.24342969059944153, "learning_rate": 9.942828734775044e-07, - "loss": 0.1979, + "loss": 0.2693, "step": 4390 }, { "epoch": 9.08434004474273, - "grad_norm": 0.8110822439193726, + "grad_norm": 0.17166166007518768, "learning_rate": 8.699975142928163e-07, - "loss": 0.0716, + "loss": 0.1354, "step": 4400 }, { "epoch": 9.086577181208053, - "grad_norm": 0.02159685641527176, + "grad_norm": 0.027880514040589333, "learning_rate": 7.457121551081283e-07, - "loss": 0.1669, + "loss": 0.1474, "step": 4410 }, { "epoch": 9.088814317673378, - "grad_norm": 0.6268171668052673, + "grad_norm": 0.05772515758872032, "learning_rate": 6.214267959234402e-07, - "loss": 0.0994, + "loss": 0.0538, "step": 4420 }, { "epoch": 9.091051454138702, - "grad_norm": 0.645751416683197, + "grad_norm": 0.07866553962230682, "learning_rate": 4.971414367387522e-07, - "loss": 0.1157, + "loss": 0.1434, "step": 4430 }, { "epoch": 9.093288590604027, - "grad_norm": 22.347421646118164, + "grad_norm": 16.45406723022461, "learning_rate": 3.7285607755406413e-07, - "loss": 0.1085, + "loss": 0.1083, "step": 4440 }, { "epoch": 9.09552572706935, - "grad_norm": 0.024207374081015587, + "grad_norm": 0.050580114126205444, "learning_rate": 2.485707183693761e-07, - "loss": 0.2458, + "loss": 0.3213, "step": 4450 }, { "epoch": 9.097762863534676, - "grad_norm": 28.59840965270996, + "grad_norm": 0.4725286364555359, "learning_rate": 1.2428535918468805e-07, - "loss": 0.1111, + "loss": 0.1468, "step": 4460 }, { "epoch": 9.1, - "grad_norm": 0.028079470619559288, + "grad_norm": 0.048940546810626984, "learning_rate": 0.0, - "loss": 0.0856, + "loss": 0.0525, "step": 4470 }, { "epoch": 9.1, - "eval_loss": 0.3906996250152588, - "eval_runtime": 934.7366, - "eval_samples_per_second": 5.289, - "eval_steps_per_second": 0.661, + "eval_loss": 0.41542962193489075, + "eval_runtime": 893.9438, + "eval_samples_per_second": 8.296, + "eval_steps_per_second": 1.037, "step": 4470 }, { "epoch": 9.1, "step": 4470, - "total_flos": 2.3553465155251864e+20, - "train_loss": 0.4063819212768168, - "train_runtime": 34011.8222, - "train_samples_per_second": 1.051, - "train_steps_per_second": 0.131 + "total_flos": 1.5702310103501242e+20, + "train_loss": 0.38359242084309025, + "train_runtime": 28356.0667, + "train_samples_per_second": 1.261, + "train_steps_per_second": 0.158 }, { "epoch": 9.1, - "eval_loss": 0.39387446641921997, - "eval_runtime": 485.4183, - "eval_samples_per_second": 2.266, - "eval_steps_per_second": 0.284, + "eval_loss": 0.39315128326416016, + "eval_runtime": 423.7958, + "eval_samples_per_second": 2.596, + "eval_steps_per_second": 0.326, "step": 4470 }, { "epoch": 9.1, - "eval_loss": 0.3938744366168976, - "eval_runtime": 485.0329, - "eval_samples_per_second": 2.268, - "eval_steps_per_second": 0.285, + "eval_loss": 0.39315125346183777, + "eval_runtime": 426.8234, + "eval_samples_per_second": 2.577, + "eval_steps_per_second": 0.323, "step": 4470 } ], @@ -3260,7 +3260,7 @@ "attributes": {} } }, - "total_flos": 2.3553465155251864e+20, + "total_flos": 1.5702310103501242e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null