diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9928021841648051, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.415610232722863, + "learning_rate": 4.132231404958678e-08, + "loss": 0.7865, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.7507651462908065, + "learning_rate": 8.264462809917357e-08, + "loss": 0.8337, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 5.697050383152881, + "learning_rate": 1.2396694214876034e-07, + "loss": 0.7606, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.833288022383022, + "learning_rate": 1.6528925619834713e-07, + "loss": 0.7577, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 4.6610300424792745, + "learning_rate": 2.066115702479339e-07, + "loss": 0.8072, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 4.361957249992969, + "learning_rate": 2.4793388429752067e-07, + "loss": 0.787, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 4.178361424391466, + "learning_rate": 2.892561983471075e-07, + "loss": 0.8219, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 0.7176108393289844, + "learning_rate": 3.3057851239669426e-07, + "loss": 0.4465, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 0.6557419046963692, + "learning_rate": 3.7190082644628103e-07, + "loss": 0.4455, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 7.388562834977926, + "learning_rate": 4.132231404958678e-07, + "loss": 0.7729, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 4.305265496796701, + "learning_rate": 4.5454545454545457e-07, + "loss": 0.8012, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 4.2907850336775555, + "learning_rate": 4.958677685950413e-07, + "loss": 0.8273, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 6.601527102678475, + "learning_rate": 5.371900826446281e-07, + "loss": 0.793, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 3.968476594018755, + "learning_rate": 5.78512396694215e-07, + "loss": 0.788, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 4.524793672412605, + "learning_rate": 6.198347107438018e-07, + "loss": 0.7818, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 4.443521791810294, + "learning_rate": 6.611570247933885e-07, + "loss": 0.7434, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.773132453512853, + "learning_rate": 7.024793388429753e-07, + "loss": 0.7548, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.7482263437220364, + "learning_rate": 7.438016528925621e-07, + "loss": 0.7902, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 3.836441534543556, + "learning_rate": 7.851239669421488e-07, + "loss": 0.7889, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 3.9739204476729437, + "learning_rate": 8.264462809917356e-07, + "loss": 0.7762, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 4.4474365552708, + "learning_rate": 8.677685950413224e-07, + "loss": 0.7679, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 3.4580858839397326, + "learning_rate": 9.090909090909091e-07, + "loss": 0.8085, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 3.413169939123294, + "learning_rate": 9.50413223140496e-07, + "loss": 0.6965, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 2.8731977460065075, + "learning_rate": 9.917355371900827e-07, + "loss": 0.7589, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.7087501271234804, + "learning_rate": 1.0330578512396695e-06, + "loss": 0.6862, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 3.0679975335981493, + "learning_rate": 1.0743801652892562e-06, + "loss": 0.6426, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 2.616920787492988, + "learning_rate": 1.115702479338843e-06, + "loss": 0.7291, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 2.431583405470523, + "learning_rate": 1.15702479338843e-06, + "loss": 0.6693, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.759569588989731, + "learning_rate": 1.1983471074380167e-06, + "loss": 0.6582, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 2.4037335332718492, + "learning_rate": 1.2396694214876035e-06, + "loss": 0.6811, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 2.4900842129537524, + "learning_rate": 1.28099173553719e-06, + "loss": 0.7467, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 2.028361911401686, + "learning_rate": 1.322314049586777e-06, + "loss": 0.6178, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 2.571997398821945, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.6036, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.276076194538056, + "learning_rate": 1.4049586776859506e-06, + "loss": 0.6242, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 3.2059372728177826, + "learning_rate": 1.4462809917355372e-06, + "loss": 0.6995, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.5259678899939773, + "learning_rate": 1.4876033057851241e-06, + "loss": 0.641, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.099201309749444, + "learning_rate": 1.5289256198347107e-06, + "loss": 0.6355, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 2.0199781281044813, + "learning_rate": 1.5702479338842977e-06, + "loss": 0.6039, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 5.172474850706783, + "learning_rate": 1.6115702479338842e-06, + "loss": 0.6257, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 2.051256221405437, + "learning_rate": 1.6528925619834712e-06, + "loss": 0.6485, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 1.8784132131735476, + "learning_rate": 1.694214876033058e-06, + "loss": 0.6131, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 2.3302814905160405, + "learning_rate": 1.7355371900826448e-06, + "loss": 0.6148, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 2.675021092939445, + "learning_rate": 1.7768595041322315e-06, + "loss": 0.618, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 1.9396764176262677, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.607, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 3.162903769359593, + "learning_rate": 1.859504132231405e-06, + "loss": 0.6073, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 1.8725730728879908, + "learning_rate": 1.900826446280992e-06, + "loss": 0.6039, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 2.1190143535626333, + "learning_rate": 1.9421487603305786e-06, + "loss": 0.5924, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 1.831058773283454, + "learning_rate": 1.9834710743801654e-06, + "loss": 0.6027, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 0.7497304742654913, + "learning_rate": 2.024793388429752e-06, + "loss": 0.5308, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 2.031101871069113, + "learning_rate": 2.066115702479339e-06, + "loss": 0.5773, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 2.0924155453957884, + "learning_rate": 2.1074380165289257e-06, + "loss": 0.5808, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 2.1410226694005132, + "learning_rate": 2.1487603305785124e-06, + "loss": 0.6618, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 1.9601583251234524, + "learning_rate": 2.1900826446280992e-06, + "loss": 0.5147, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 2.217739964833312, + "learning_rate": 2.231404958677686e-06, + "loss": 0.6055, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 3.4871845999980895, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.5695, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 9.728315202799985, + "learning_rate": 2.31404958677686e-06, + "loss": 0.5878, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 2.1838581200045892, + "learning_rate": 2.3553719008264463e-06, + "loss": 0.5239, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 2.2711658012889107, + "learning_rate": 2.3966942148760335e-06, + "loss": 0.595, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 1.9191083300460372, + "learning_rate": 2.43801652892562e-06, + "loss": 0.597, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 3.232113453296245, + "learning_rate": 2.479338842975207e-06, + "loss": 0.5439, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.247069820214734, + "learning_rate": 2.5206611570247934e-06, + "loss": 0.6307, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 2.1081482509346166, + "learning_rate": 2.56198347107438e-06, + "loss": 0.6352, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 2.038799318139236, + "learning_rate": 2.6033057851239673e-06, + "loss": 0.6116, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 2.2767344300217505, + "learning_rate": 2.644628099173554e-06, + "loss": 0.5667, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 2.1918624104732034, + "learning_rate": 2.6859504132231405e-06, + "loss": 0.6243, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.9263213735038611, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.5455, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 4.079115837709929, + "learning_rate": 2.7685950413223144e-06, + "loss": 0.5963, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 2.422611576288611, + "learning_rate": 2.809917355371901e-06, + "loss": 0.5641, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.6977361968463944, + "learning_rate": 2.851239669421488e-06, + "loss": 0.5915, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.6310705010720266, + "learning_rate": 2.8925619834710743e-06, + "loss": 0.5116, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 1.9813718105958915, + "learning_rate": 2.9338842975206615e-06, + "loss": 0.5831, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 4.533914016058211, + "learning_rate": 2.9752066115702483e-06, + "loss": 0.5544, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 2.749797864900883, + "learning_rate": 3.016528925619835e-06, + "loss": 0.6231, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 2.2739710963750737, + "learning_rate": 3.0578512396694214e-06, + "loss": 0.5658, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.6293489910001722, + "learning_rate": 3.0991735537190086e-06, + "loss": 0.5888, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 6.27649736387961, + "learning_rate": 3.1404958677685953e-06, + "loss": 0.566, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.971009882984369, + "learning_rate": 3.181818181818182e-06, + "loss": 0.5731, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 3.7269504201622956, + "learning_rate": 3.2231404958677685e-06, + "loss": 0.5792, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 2.398280082914536, + "learning_rate": 3.264462809917356e-06, + "loss": 0.5617, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.239264649971885, + "learning_rate": 3.3057851239669424e-06, + "loss": 0.6041, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 2.1614348411341076, + "learning_rate": 3.347107438016529e-06, + "loss": 0.6224, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.5115529692047422, + "learning_rate": 3.388429752066116e-06, + "loss": 0.5348, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 8.669544917763194, + "learning_rate": 3.429752066115703e-06, + "loss": 0.5508, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.9449410268347895, + "learning_rate": 3.4710743801652895e-06, + "loss": 0.6149, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 2.2476415793258164, + "learning_rate": 3.5123966942148763e-06, + "loss": 0.566, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 2.284326985907656, + "learning_rate": 3.553719008264463e-06, + "loss": 0.5919, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 2.329487557124964, + "learning_rate": 3.5950413223140502e-06, + "loss": 0.5753, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.6915372745929467, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.5402, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.840436747417264, + "learning_rate": 3.6776859504132234e-06, + "loss": 0.5935, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.0556986278371054, + "learning_rate": 3.71900826446281e-06, + "loss": 0.5469, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 3.216467905440008, + "learning_rate": 3.7603305785123973e-06, + "loss": 0.62, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 2.7956144664342064, + "learning_rate": 3.801652892561984e-06, + "loss": 0.5953, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 2.052474595357274, + "learning_rate": 3.842975206611571e-06, + "loss": 0.5889, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 2.6229806435376872, + "learning_rate": 3.884297520661157e-06, + "loss": 0.5417, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 2.529529144898349, + "learning_rate": 3.925619834710744e-06, + "loss": 0.462, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 2.104085886441274, + "learning_rate": 3.966942148760331e-06, + "loss": 0.5942, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.7522377211377986, + "learning_rate": 4.008264462809918e-06, + "loss": 0.5685, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 2.433678798953759, + "learning_rate": 4.049586776859504e-06, + "loss": 0.5783, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 2.2162277574010516, + "learning_rate": 4.0909090909090915e-06, + "loss": 0.5774, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 5.1328075855667015, + "learning_rate": 4.132231404958678e-06, + "loss": 0.4919, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 2.1548005092718774, + "learning_rate": 4.173553719008265e-06, + "loss": 0.5601, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 2.275791310124109, + "learning_rate": 4.214876033057851e-06, + "loss": 0.5364, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 2.729405960454034, + "learning_rate": 4.2561983471074386e-06, + "loss": 0.5553, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.8846153766860623, + "learning_rate": 4.297520661157025e-06, + "loss": 0.5103, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 2.0246132215342603, + "learning_rate": 4.338842975206612e-06, + "loss": 0.5289, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 1.7353792905572192, + "learning_rate": 4.3801652892561984e-06, + "loss": 0.5145, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 2.167202823008346, + "learning_rate": 4.421487603305786e-06, + "loss": 0.5981, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 3.2486808689824627, + "learning_rate": 4.462809917355372e-06, + "loss": 0.584, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.7923020421622617, + "learning_rate": 4.504132231404959e-06, + "loss": 0.5168, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.998942559674158, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5632, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.1861973024624484, + "learning_rate": 4.586776859504133e-06, + "loss": 0.5556, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.902236458891177, + "learning_rate": 4.62809917355372e-06, + "loss": 0.5957, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.3472123997091527, + "learning_rate": 4.669421487603306e-06, + "loss": 0.5574, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.826111048367796, + "learning_rate": 4.710743801652893e-06, + "loss": 0.5195, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 1.8812962425133646, + "learning_rate": 4.75206611570248e-06, + "loss": 0.5703, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 1.8624556818394709, + "learning_rate": 4.793388429752067e-06, + "loss": 0.5656, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 3.766670998630178, + "learning_rate": 4.834710743801653e-06, + "loss": 0.5282, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.039610136010939, + "learning_rate": 4.87603305785124e-06, + "loss": 0.5132, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 2.5079262458353866, + "learning_rate": 4.917355371900827e-06, + "loss": 0.5289, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 2.4038638893489335, + "learning_rate": 4.958677685950414e-06, + "loss": 0.5659, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 1.6960722697241173, + "learning_rate": 5e-06, + "loss": 0.5051, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 2.2161861436132058, + "learning_rate": 5.041322314049587e-06, + "loss": 0.5622, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 2.0070358384779747, + "learning_rate": 5.082644628099174e-06, + "loss": 0.569, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 2.072153444096047, + "learning_rate": 5.12396694214876e-06, + "loss": 0.5898, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 1.961455431362149, + "learning_rate": 5.165289256198347e-06, + "loss": 0.5278, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 2.7520151908460355, + "learning_rate": 5.206611570247935e-06, + "loss": 0.5762, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 0.6835662297556955, + "learning_rate": 5.247933884297521e-06, + "loss": 0.5071, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 2.702178064947002, + "learning_rate": 5.289256198347108e-06, + "loss": 0.5928, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 2.2345051265285494, + "learning_rate": 5.3305785123966946e-06, + "loss": 0.5699, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 2.0642219152001933, + "learning_rate": 5.371900826446281e-06, + "loss": 0.5713, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 2.0108826296235645, + "learning_rate": 5.413223140495868e-06, + "loss": 0.5459, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 2.1627936472454423, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.516, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 1.6647761146847948, + "learning_rate": 5.495867768595042e-06, + "loss": 0.4862, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 2.7119580663489593, + "learning_rate": 5.537190082644629e-06, + "loss": 0.5691, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 2.5852970473911814, + "learning_rate": 5.578512396694216e-06, + "loss": 0.5285, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 2.4735294501089675, + "learning_rate": 5.619834710743802e-06, + "loss": 0.5869, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 2.04141538102587, + "learning_rate": 5.661157024793389e-06, + "loss": 0.5159, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 1.724866138626163, + "learning_rate": 5.702479338842976e-06, + "loss": 0.5488, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 1.7139656285038183, + "learning_rate": 5.743801652892562e-06, + "loss": 0.5026, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 1.847574100805026, + "learning_rate": 5.785123966942149e-06, + "loss": 0.5027, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 3.230329154971066, + "learning_rate": 5.826446280991736e-06, + "loss": 0.5595, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 1.795342282430791, + "learning_rate": 5.867768595041323e-06, + "loss": 0.5263, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 1.8716269720617247, + "learning_rate": 5.90909090909091e-06, + "loss": 0.5301, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 1.9299922283974924, + "learning_rate": 5.9504132231404965e-06, + "loss": 0.5687, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 0.7136333319419209, + "learning_rate": 5.991735537190083e-06, + "loss": 0.5037, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 1.7337829913060052, + "learning_rate": 6.03305785123967e-06, + "loss": 0.4986, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 0.6919406868411878, + "learning_rate": 6.074380165289256e-06, + "loss": 0.4855, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 2.328246418923426, + "learning_rate": 6.115702479338843e-06, + "loss": 0.5565, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 5.441722885984954, + "learning_rate": 6.15702479338843e-06, + "loss": 0.5145, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 2.1257138893693006, + "learning_rate": 6.198347107438017e-06, + "loss": 0.5299, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 2.1770823656371974, + "learning_rate": 6.239669421487604e-06, + "loss": 0.5518, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 2.1037313109160927, + "learning_rate": 6.280991735537191e-06, + "loss": 0.5186, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 2.095149553016966, + "learning_rate": 6.322314049586777e-06, + "loss": 0.5334, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 0.6095163348330389, + "learning_rate": 6.363636363636364e-06, + "loss": 0.4232, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 2.260362966455831, + "learning_rate": 6.404958677685951e-06, + "loss": 0.5246, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 2.18109831500874, + "learning_rate": 6.446280991735537e-06, + "loss": 0.5689, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 1.6992471311150057, + "learning_rate": 6.487603305785124e-06, + "loss": 0.5432, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 2.1996411301077297, + "learning_rate": 6.528925619834712e-06, + "loss": 0.5471, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 6.683198647229002, + "learning_rate": 6.5702479338842985e-06, + "loss": 0.4844, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 1.9315335319997355, + "learning_rate": 6.611570247933885e-06, + "loss": 0.5114, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 4.034007691224993, + "learning_rate": 6.652892561983472e-06, + "loss": 0.5095, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 1.933206220453076, + "learning_rate": 6.694214876033058e-06, + "loss": 0.5584, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 1.7594153622700035, + "learning_rate": 6.735537190082645e-06, + "loss": 0.5267, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 2.128978895295904, + "learning_rate": 6.776859504132232e-06, + "loss": 0.4973, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 1.8752257215193229, + "learning_rate": 6.818181818181818e-06, + "loss": 0.5569, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 1.7552075283979744, + "learning_rate": 6.859504132231406e-06, + "loss": 0.5137, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 1.7116607536756927, + "learning_rate": 6.900826446280993e-06, + "loss": 0.4852, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 1.8310325583717713, + "learning_rate": 6.942148760330579e-06, + "loss": 0.512, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 2.051351087670528, + "learning_rate": 6.983471074380166e-06, + "loss": 0.5072, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 2.6691184220184736, + "learning_rate": 7.0247933884297525e-06, + "loss": 0.553, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 2.472591362280194, + "learning_rate": 7.066115702479339e-06, + "loss": 0.5742, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 1.97874128709382, + "learning_rate": 7.107438016528926e-06, + "loss": 0.5504, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 1.968673756150883, + "learning_rate": 7.1487603305785124e-06, + "loss": 0.5672, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 2.58437763050975, + "learning_rate": 7.1900826446281005e-06, + "loss": 0.5689, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 2.1221625942102254, + "learning_rate": 7.231404958677687e-06, + "loss": 0.551, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 1.7421455025898023, + "learning_rate": 7.272727272727273e-06, + "loss": 0.5774, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 1.8920356174568798, + "learning_rate": 7.31404958677686e-06, + "loss": 0.5269, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 1.9144789767737893, + "learning_rate": 7.355371900826447e-06, + "loss": 0.5161, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 5.34326962708711, + "learning_rate": 7.396694214876033e-06, + "loss": 0.5565, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 3.702683434570155, + "learning_rate": 7.43801652892562e-06, + "loss": 0.5342, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 2.4440904350216632, + "learning_rate": 7.479338842975207e-06, + "loss": 0.4631, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 2.2025744403847805, + "learning_rate": 7.520661157024795e-06, + "loss": 0.5517, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 2.3152746565304083, + "learning_rate": 7.561983471074381e-06, + "loss": 0.5385, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 1.8138121988036877, + "learning_rate": 7.603305785123968e-06, + "loss": 0.5312, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 1.9696695332798093, + "learning_rate": 7.644628099173555e-06, + "loss": 0.4975, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 2.1638549313150577, + "learning_rate": 7.685950413223142e-06, + "loss": 0.4598, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 3.679763180841708, + "learning_rate": 7.727272727272727e-06, + "loss": 0.5828, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 15.755753338347303, + "learning_rate": 7.768595041322314e-06, + "loss": 0.6424, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 2.0607035699738367, + "learning_rate": 7.809917355371902e-06, + "loss": 0.6084, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 4.563843230786939, + "learning_rate": 7.851239669421489e-06, + "loss": 0.5493, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 2.7007150791601604, + "learning_rate": 7.892561983471076e-06, + "loss": 0.5373, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.6296232588728798, + "learning_rate": 7.933884297520661e-06, + "loss": 0.5412, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 2.642761568125003, + "learning_rate": 7.975206611570249e-06, + "loss": 0.5568, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 2.062493720572721, + "learning_rate": 8.016528925619836e-06, + "loss": 0.4958, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 1.840113636335198, + "learning_rate": 8.057851239669421e-06, + "loss": 0.5443, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 2.2357413647918993, + "learning_rate": 8.099173553719009e-06, + "loss": 0.5351, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 2.220715219963027, + "learning_rate": 8.140495867768596e-06, + "loss": 0.5326, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 1.9436370507926042, + "learning_rate": 8.181818181818183e-06, + "loss": 0.4906, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 4.233288939881527, + "learning_rate": 8.22314049586777e-06, + "loss": 0.4961, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 2.064536487822645, + "learning_rate": 8.264462809917356e-06, + "loss": 0.5768, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 2.7690467508481755, + "learning_rate": 8.305785123966943e-06, + "loss": 0.55, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 2.137594915236406, + "learning_rate": 8.34710743801653e-06, + "loss": 0.4988, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 1.9534259558411042, + "learning_rate": 8.388429752066116e-06, + "loss": 0.5566, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 2.5020717347328643, + "learning_rate": 8.429752066115703e-06, + "loss": 0.5513, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 2.660014453532568, + "learning_rate": 8.47107438016529e-06, + "loss": 0.4613, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 0.6983360001377713, + "learning_rate": 8.512396694214877e-06, + "loss": 0.4163, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 4.38401204497113, + "learning_rate": 8.553719008264464e-06, + "loss": 0.5223, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 4.657695910352119, + "learning_rate": 8.59504132231405e-06, + "loss": 0.6005, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 2.1393074553357745, + "learning_rate": 8.636363636363637e-06, + "loss": 0.5133, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 3.524402023924675, + "learning_rate": 8.677685950413224e-06, + "loss": 0.5466, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 2.064634815950332, + "learning_rate": 8.71900826446281e-06, + "loss": 0.4988, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 1.96453185900645, + "learning_rate": 8.760330578512397e-06, + "loss": 0.5043, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 2.3011311824481058, + "learning_rate": 8.801652892561984e-06, + "loss": 0.5443, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 1.9545398206419267, + "learning_rate": 8.842975206611571e-06, + "loss": 0.5573, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 2.2270890144831594, + "learning_rate": 8.884297520661158e-06, + "loss": 0.5475, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 2.367048313677594, + "learning_rate": 8.925619834710744e-06, + "loss": 0.5026, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 2.3731067718776466, + "learning_rate": 8.966942148760331e-06, + "loss": 0.5645, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 1.7771313439516687, + "learning_rate": 9.008264462809918e-06, + "loss": 0.5453, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 1.8002080571572299, + "learning_rate": 9.049586776859506e-06, + "loss": 0.495, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 2.302513893194134, + "learning_rate": 9.090909090909091e-06, + "loss": 0.5282, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 4.695747565735532, + "learning_rate": 9.132231404958678e-06, + "loss": 0.5554, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 4.083286425298984, + "learning_rate": 9.173553719008265e-06, + "loss": 0.5131, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 2.330380712840821, + "learning_rate": 9.214876033057853e-06, + "loss": 0.5469, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 1.8673689557327797, + "learning_rate": 9.25619834710744e-06, + "loss": 0.5133, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 3.7630221758042226, + "learning_rate": 9.297520661157025e-06, + "loss": 0.5031, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 2.421818000908611, + "learning_rate": 9.338842975206613e-06, + "loss": 0.4737, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 2.1159305242439594, + "learning_rate": 9.3801652892562e-06, + "loss": 0.5586, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 3.542872602570653, + "learning_rate": 9.421487603305785e-06, + "loss": 0.5837, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 3.8866498424994997, + "learning_rate": 9.462809917355372e-06, + "loss": 0.5472, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 1.8632223477667107, + "learning_rate": 9.50413223140496e-06, + "loss": 0.4915, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 2.769130478817958, + "learning_rate": 9.545454545454547e-06, + "loss": 0.5311, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 1.5927783003609426, + "learning_rate": 9.586776859504134e-06, + "loss": 0.4975, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 2.346701530645649, + "learning_rate": 9.62809917355372e-06, + "loss": 0.5696, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 1.8552511574832422, + "learning_rate": 9.669421487603307e-06, + "loss": 0.5047, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 1.8612115784851542, + "learning_rate": 9.710743801652894e-06, + "loss": 0.5683, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 1.7957162557860462, + "learning_rate": 9.75206611570248e-06, + "loss": 0.5231, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 1.6128091928743173, + "learning_rate": 9.793388429752067e-06, + "loss": 0.5223, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 1.7441349517290226, + "learning_rate": 9.834710743801654e-06, + "loss": 0.5417, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 1.7518601110115872, + "learning_rate": 9.876033057851241e-06, + "loss": 0.5866, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 2.1033851691625745, + "learning_rate": 9.917355371900828e-06, + "loss": 0.5297, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 1.9910098722645073, + "learning_rate": 9.958677685950414e-06, + "loss": 0.4461, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 2.8174366976581458, + "learning_rate": 1e-05, + "loss": 0.5063, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 1.674372365245035, + "learning_rate": 9.999999596102982e-06, + "loss": 0.5256, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 1.7487857604376191, + "learning_rate": 9.999998384411993e-06, + "loss": 0.5951, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 1.6303611245059892, + "learning_rate": 9.99999636492723e-06, + "loss": 0.6027, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 2.0701765426010597, + "learning_rate": 9.999993537649018e-06, + "loss": 0.5304, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 2.1090228708257226, + "learning_rate": 9.999989902577813e-06, + "loss": 0.5487, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 2.2097391973231413, + "learning_rate": 9.999985459714203e-06, + "loss": 0.5548, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 0.6819193332040362, + "learning_rate": 9.999980209058907e-06, + "loss": 0.4655, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 1.7160171625750975, + "learning_rate": 9.999974150612773e-06, + "loss": 0.5536, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 2.044131904537645, + "learning_rate": 9.999967284376777e-06, + "loss": 0.5704, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 1.8803170341600512, + "learning_rate": 9.999959610352032e-06, + "loss": 0.5053, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 2.1350599019041843, + "learning_rate": 9.999951128539776e-06, + "loss": 0.5145, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 3.959337484545383, + "learning_rate": 9.999941838941378e-06, + "loss": 0.5465, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 2.263615164770816, + "learning_rate": 9.999931741558342e-06, + "loss": 0.5284, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 1.9392426887684562, + "learning_rate": 9.999920836392298e-06, + "loss": 0.5087, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 2.0601038827538445, + "learning_rate": 9.999909123445006e-06, + "loss": 0.5072, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 1.9443319471352343, + "learning_rate": 9.99989660271836e-06, + "loss": 0.5114, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 1.9823465986216011, + "learning_rate": 9.999883274214383e-06, + "loss": 0.5244, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 1.5894097797953115, + "learning_rate": 9.99986913793523e-06, + "loss": 0.5607, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 2.1034456786503797, + "learning_rate": 9.99985419388318e-06, + "loss": 0.5246, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 13.925708553269999, + "learning_rate": 9.999838442060652e-06, + "loss": 0.5578, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 2.031790416570794, + "learning_rate": 9.999821882470188e-06, + "loss": 0.5533, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 2.299536220707557, + "learning_rate": 9.999804515114465e-06, + "loss": 0.4861, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 3.339327687472147, + "learning_rate": 9.999786339996288e-06, + "loss": 0.5479, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 0.7242515927078733, + "learning_rate": 9.999767357118594e-06, + "loss": 0.4831, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 2.910605648741558, + "learning_rate": 9.99974756648445e-06, + "loss": 0.5574, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 2.0505930996455395, + "learning_rate": 9.99972696809705e-06, + "loss": 0.5484, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 2.3422061192597923, + "learning_rate": 9.999705561959727e-06, + "loss": 0.5685, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 0.685857192457201, + "learning_rate": 9.999683348075937e-06, + "loss": 0.4722, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 1.9628329354004277, + "learning_rate": 9.999660326449267e-06, + "loss": 0.5987, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 2.2789243043528704, + "learning_rate": 9.99963649708344e-06, + "loss": 0.5422, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 3.421312306756197, + "learning_rate": 9.999611859982304e-06, + "loss": 0.5501, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 2.0296521584425005, + "learning_rate": 9.99958641514984e-06, + "loss": 0.5765, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 1.5470558706281141, + "learning_rate": 9.999560162590157e-06, + "loss": 0.5157, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 1.4700139418770868, + "learning_rate": 9.999533102307497e-06, + "loss": 0.506, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 1.6813756337253796, + "learning_rate": 9.999505234306232e-06, + "loss": 0.513, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 1.6349444581628045, + "learning_rate": 9.999476558590865e-06, + "loss": 0.5735, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 1.9746160718526162, + "learning_rate": 9.99944707516603e-06, + "loss": 0.5774, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 4.4811608488552475, + "learning_rate": 9.999416784036488e-06, + "loss": 0.4944, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 1.5957764732299413, + "learning_rate": 9.999385685207135e-06, + "loss": 0.5006, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 1.9472233687602192, + "learning_rate": 9.999353778682992e-06, + "loss": 0.5112, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 1.9484798031195745, + "learning_rate": 9.999321064469216e-06, + "loss": 0.4938, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 2.920263709004281, + "learning_rate": 9.999287542571092e-06, + "loss": 0.4956, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 0.6468350417563766, + "learning_rate": 9.999253212994035e-06, + "loss": 0.4424, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 1.844929706278709, + "learning_rate": 9.999218075743594e-06, + "loss": 0.5706, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 4.482273640778281, + "learning_rate": 9.999182130825443e-06, + "loss": 0.5308, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 1.9083396990379513, + "learning_rate": 9.99914537824539e-06, + "loss": 0.5654, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 1.7555024837914865, + "learning_rate": 9.99910781800937e-06, + "loss": 0.5698, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 2.563462775984113, + "learning_rate": 9.999069450123458e-06, + "loss": 0.5425, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 2.537480565574436, + "learning_rate": 9.999030274593845e-06, + "loss": 0.5425, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 5.201318330504299, + "learning_rate": 9.998990291426864e-06, + "loss": 0.5718, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 2.0509238907273457, + "learning_rate": 9.998949500628977e-06, + "loss": 0.4952, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 1.6658988414947071, + "learning_rate": 9.998907902206769e-06, + "loss": 0.5382, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 2.076364793574101, + "learning_rate": 9.998865496166963e-06, + "loss": 0.5792, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 1.6504825769137341, + "learning_rate": 9.99882228251641e-06, + "loss": 0.5384, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 4.195943188299471, + "learning_rate": 9.998778261262093e-06, + "loss": 0.4845, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 3.1865330279591957, + "learning_rate": 9.99873343241112e-06, + "loss": 0.5822, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 1.583521959332929, + "learning_rate": 9.998687795970739e-06, + "loss": 0.5497, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 1.867600307176823, + "learning_rate": 9.998641351948319e-06, + "loss": 0.5321, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 1.9599824750180688, + "learning_rate": 9.998594100351362e-06, + "loss": 0.5741, + "step": 301 + }, + { + "epoch": 0.04, + "grad_norm": 0.6829182091305455, + "learning_rate": 9.998546041187507e-06, + "loss": 0.5211, + "step": 302 + }, + { + "epoch": 0.04, + "grad_norm": 3.0686970869720076, + "learning_rate": 9.998497174464517e-06, + "loss": 0.525, + "step": 303 + }, + { + "epoch": 0.04, + "grad_norm": 1.7658227546806848, + "learning_rate": 9.998447500190283e-06, + "loss": 0.5234, + "step": 304 + }, + { + "epoch": 0.04, + "grad_norm": 2.1642748578874906, + "learning_rate": 9.998397018372833e-06, + "loss": 0.5805, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 1.8098658826161074, + "learning_rate": 9.998345729020324e-06, + "loss": 0.5115, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 2.0808928845396357, + "learning_rate": 9.998293632141042e-06, + "loss": 0.5643, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 4.062037007760145, + "learning_rate": 9.998240727743401e-06, + "loss": 0.5168, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 2.1539325790326656, + "learning_rate": 9.99818701583595e-06, + "loss": 0.4943, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 7.820375964588249, + "learning_rate": 9.998132496427366e-06, + "loss": 0.5088, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 1.954096590969464, + "learning_rate": 9.998077169526459e-06, + "loss": 0.5659, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 2.285658810614788, + "learning_rate": 9.998021035142166e-06, + "loss": 0.5142, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 1.9304285977081468, + "learning_rate": 9.997964093283555e-06, + "loss": 0.5098, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 2.4164241091216825, + "learning_rate": 9.997906343959827e-06, + "loss": 0.5105, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 2.6374326074383707, + "learning_rate": 9.997847787180313e-06, + "loss": 0.5503, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 1.9372827013095961, + "learning_rate": 9.99778842295447e-06, + "loss": 0.5462, + "step": 316 + }, + { + "epoch": 0.04, + "grad_norm": 1.5169237820059158, + "learning_rate": 9.997728251291891e-06, + "loss": 0.5151, + "step": 317 + }, + { + "epoch": 0.04, + "grad_norm": 1.56778689015433, + "learning_rate": 9.997667272202297e-06, + "loss": 0.4994, + "step": 318 + }, + { + "epoch": 0.04, + "grad_norm": 0.7363415337809248, + "learning_rate": 9.99760548569554e-06, + "loss": 0.4856, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 1.6500642022053966, + "learning_rate": 9.997542891781602e-06, + "loss": 0.541, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 1.646446903693105, + "learning_rate": 9.997479490470594e-06, + "loss": 0.5681, + "step": 321 + }, + { + "epoch": 0.04, + "grad_norm": 1.6120661264680305, + "learning_rate": 9.997415281772762e-06, + "loss": 0.5224, + "step": 322 + }, + { + "epoch": 0.04, + "grad_norm": 1.6969882678190478, + "learning_rate": 9.997350265698477e-06, + "loss": 0.563, + "step": 323 + }, + { + "epoch": 0.04, + "grad_norm": 2.702569911701666, + "learning_rate": 9.997284442258244e-06, + "loss": 0.5087, + "step": 324 + }, + { + "epoch": 0.04, + "grad_norm": 1.5080891548286657, + "learning_rate": 9.997217811462698e-06, + "loss": 0.5377, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 4.292261492859581, + "learning_rate": 9.997150373322602e-06, + "loss": 0.5686, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 2.150320693707667, + "learning_rate": 9.997082127848853e-06, + "loss": 0.5537, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 1.8924546749988123, + "learning_rate": 9.997013075052476e-06, + "loss": 0.5463, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 1.654314133243649, + "learning_rate": 9.996943214944626e-06, + "loss": 0.5565, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 7.923442603059371, + "learning_rate": 9.99687254753659e-06, + "loss": 0.5881, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 1.8293811891706615, + "learning_rate": 9.996801072839789e-06, + "loss": 0.5718, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 1.480332829339136, + "learning_rate": 9.996728790865763e-06, + "loss": 0.503, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 1.6201449901428477, + "learning_rate": 9.996655701626195e-06, + "loss": 0.4973, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 2.2742138495119044, + "learning_rate": 9.99658180513289e-06, + "loss": 0.4997, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 9.120557910370504, + "learning_rate": 9.99650710139779e-06, + "loss": 0.5397, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 1.5861768219312984, + "learning_rate": 9.996431590432962e-06, + "loss": 0.543, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 1.614805525586774, + "learning_rate": 9.996355272250607e-06, + "loss": 0.55, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 2.6053554102899765, + "learning_rate": 9.996278146863054e-06, + "loss": 0.5516, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 10.796024780854161, + "learning_rate": 9.996200214282762e-06, + "loss": 0.5283, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 1.6948255023691408, + "learning_rate": 9.996121474522323e-06, + "loss": 0.5753, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 1.6740506396070718, + "learning_rate": 9.996041927594457e-06, + "loss": 0.5164, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 1.6315151778674875, + "learning_rate": 9.995961573512018e-06, + "loss": 0.5759, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 1.701863204355339, + "learning_rate": 9.995880412287985e-06, + "loss": 0.5348, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 2.103113802438708, + "learning_rate": 9.995798443935473e-06, + "loss": 0.54, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 1.8491565354564938, + "learning_rate": 9.995715668467722e-06, + "loss": 0.5071, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 2.0318284949244294, + "learning_rate": 9.995632085898106e-06, + "loss": 0.5683, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 2.1338428368044196, + "learning_rate": 9.995547696240132e-06, + "loss": 0.531, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 1.9904163800757648, + "learning_rate": 9.995462499507427e-06, + "loss": 0.5595, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 1.5269130775268465, + "learning_rate": 9.99537649571376e-06, + "loss": 0.517, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 1.6240532368533307, + "learning_rate": 9.995289684873027e-06, + "loss": 0.5192, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 1.3651319032052907, + "learning_rate": 9.99520206699925e-06, + "loss": 0.4983, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 2.342350856871222, + "learning_rate": 9.995113642106584e-06, + "loss": 0.5554, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 2.6168109185854025, + "learning_rate": 9.995024410209316e-06, + "loss": 0.5243, + "step": 353 + }, + { + "epoch": 0.04, + "grad_norm": 0.6988720397199879, + "learning_rate": 9.994934371321862e-06, + "loss": 0.5005, + "step": 354 + }, + { + "epoch": 0.04, + "grad_norm": 4.159118467801112, + "learning_rate": 9.994843525458771e-06, + "loss": 0.5256, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 1.7501348381594886, + "learning_rate": 9.994751872634717e-06, + "loss": 0.5444, + "step": 356 + }, + { + "epoch": 0.04, + "grad_norm": 2.558673168866606, + "learning_rate": 9.994659412864508e-06, + "loss": 0.5795, + "step": 357 + }, + { + "epoch": 0.04, + "grad_norm": 1.3712882222338771, + "learning_rate": 9.994566146163082e-06, + "loss": 0.4981, + "step": 358 + }, + { + "epoch": 0.04, + "grad_norm": 3.917448635976479, + "learning_rate": 9.994472072545506e-06, + "loss": 0.5797, + "step": 359 + }, + { + "epoch": 0.04, + "grad_norm": 2.5699481449625465, + "learning_rate": 9.994377192026981e-06, + "loss": 0.5329, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 2.349005454005592, + "learning_rate": 9.994281504622831e-06, + "loss": 0.5051, + "step": 361 + }, + { + "epoch": 0.04, + "grad_norm": 2.434754572817806, + "learning_rate": 9.99418501034852e-06, + "loss": 0.4765, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 1.7721504794383782, + "learning_rate": 9.994087709219637e-06, + "loss": 0.603, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 2.2223558544616, + "learning_rate": 9.993989601251899e-06, + "loss": 0.5589, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 2.1127193527497323, + "learning_rate": 9.993890686461159e-06, + "loss": 0.5474, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 2.1347486367959934, + "learning_rate": 9.993790964863394e-06, + "loss": 0.5455, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 3.7495898246789374, + "learning_rate": 9.993690436474719e-06, + "loss": 0.4887, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 1.783308092584701, + "learning_rate": 9.993589101311373e-06, + "loss": 0.5348, + "step": 368 + }, + { + "epoch": 0.05, + "grad_norm": 1.5856712387472398, + "learning_rate": 9.993486959389728e-06, + "loss": 0.5714, + "step": 369 + }, + { + "epoch": 0.05, + "grad_norm": 1.6477109895625353, + "learning_rate": 9.993384010726288e-06, + "loss": 0.5417, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 5.309065931402148, + "learning_rate": 9.99328025533768e-06, + "loss": 0.5701, + "step": 371 + }, + { + "epoch": 0.05, + "grad_norm": 0.6912380894441247, + "learning_rate": 9.993175693240673e-06, + "loss": 0.5116, + "step": 372 + }, + { + "epoch": 0.05, + "grad_norm": 3.1409674989933656, + "learning_rate": 9.993070324452154e-06, + "loss": 0.5192, + "step": 373 + }, + { + "epoch": 0.05, + "grad_norm": 1.778627892390217, + "learning_rate": 9.99296414898915e-06, + "loss": 0.5501, + "step": 374 + }, + { + "epoch": 0.05, + "grad_norm": 2.570838939085222, + "learning_rate": 9.992857166868814e-06, + "loss": 0.4959, + "step": 375 + }, + { + "epoch": 0.05, + "grad_norm": 2.0505514157390987, + "learning_rate": 9.99274937810843e-06, + "loss": 0.5285, + "step": 376 + }, + { + "epoch": 0.05, + "grad_norm": 1.6209806009721568, + "learning_rate": 9.99264078272541e-06, + "loss": 0.5205, + "step": 377 + }, + { + "epoch": 0.05, + "grad_norm": 1.561951117823356, + "learning_rate": 9.992531380737303e-06, + "loss": 0.5162, + "step": 378 + }, + { + "epoch": 0.05, + "grad_norm": 1.6842346047481311, + "learning_rate": 9.99242117216178e-06, + "loss": 0.5459, + "step": 379 + }, + { + "epoch": 0.05, + "grad_norm": 1.5206441075953183, + "learning_rate": 9.992310157016647e-06, + "loss": 0.5212, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 1.613170143739462, + "learning_rate": 9.992198335319839e-06, + "loss": 0.5296, + "step": 381 + }, + { + "epoch": 0.05, + "grad_norm": 2.7097419358184256, + "learning_rate": 9.992085707089424e-06, + "loss": 0.5358, + "step": 382 + }, + { + "epoch": 0.05, + "grad_norm": 1.8302659825726482, + "learning_rate": 9.991972272343595e-06, + "loss": 0.4922, + "step": 383 + }, + { + "epoch": 0.05, + "grad_norm": 2.3884596685854786, + "learning_rate": 9.991858031100682e-06, + "loss": 0.5421, + "step": 384 + }, + { + "epoch": 0.05, + "grad_norm": 1.8222840733689352, + "learning_rate": 9.991742983379137e-06, + "loss": 0.5477, + "step": 385 + }, + { + "epoch": 0.05, + "grad_norm": 1.8910852787106245, + "learning_rate": 9.991627129197552e-06, + "loss": 0.5638, + "step": 386 + }, + { + "epoch": 0.05, + "grad_norm": 1.5426420623594577, + "learning_rate": 9.991510468574642e-06, + "loss": 0.5331, + "step": 387 + }, + { + "epoch": 0.05, + "grad_norm": 1.6218897462067299, + "learning_rate": 9.991393001529255e-06, + "loss": 0.4998, + "step": 388 + }, + { + "epoch": 0.05, + "grad_norm": 3.4803448874576803, + "learning_rate": 9.991274728080368e-06, + "loss": 0.5687, + "step": 389 + }, + { + "epoch": 0.05, + "grad_norm": 1.817078050217724, + "learning_rate": 9.991155648247086e-06, + "loss": 0.5589, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 1.56274673389566, + "learning_rate": 9.991035762048655e-06, + "loss": 0.5402, + "step": 391 + }, + { + "epoch": 0.05, + "grad_norm": 0.6700379174795714, + "learning_rate": 9.990915069504438e-06, + "loss": 0.4653, + "step": 392 + }, + { + "epoch": 0.05, + "grad_norm": 0.6793263712894418, + "learning_rate": 9.990793570633935e-06, + "loss": 0.4721, + "step": 393 + }, + { + "epoch": 0.05, + "grad_norm": 1.6887319698367116, + "learning_rate": 9.990671265456778e-06, + "loss": 0.5377, + "step": 394 + }, + { + "epoch": 0.05, + "grad_norm": 1.4642447699000192, + "learning_rate": 9.990548153992721e-06, + "loss": 0.552, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 1.8643006750012217, + "learning_rate": 9.990424236261657e-06, + "loss": 0.5422, + "step": 396 + }, + { + "epoch": 0.05, + "grad_norm": 0.7130101747622972, + "learning_rate": 9.990299512283608e-06, + "loss": 0.4625, + "step": 397 + }, + { + "epoch": 0.05, + "grad_norm": 1.509124542655514, + "learning_rate": 9.990173982078721e-06, + "loss": 0.5449, + "step": 398 + }, + { + "epoch": 0.05, + "grad_norm": 1.7287882922623734, + "learning_rate": 9.990047645667277e-06, + "loss": 0.525, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 1.597759493266317, + "learning_rate": 9.98992050306969e-06, + "loss": 0.5109, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 1.7164045972462163, + "learning_rate": 9.989792554306496e-06, + "loss": 0.4961, + "step": 401 + }, + { + "epoch": 0.05, + "grad_norm": 1.7717464858573455, + "learning_rate": 9.98966379939837e-06, + "loss": 0.4837, + "step": 402 + }, + { + "epoch": 0.05, + "grad_norm": 2.9481512152040525, + "learning_rate": 9.989534238366113e-06, + "loss": 0.5405, + "step": 403 + }, + { + "epoch": 0.05, + "grad_norm": 1.6987543219058931, + "learning_rate": 9.989403871230654e-06, + "loss": 0.5504, + "step": 404 + }, + { + "epoch": 0.05, + "grad_norm": 1.4308349414603916, + "learning_rate": 9.989272698013058e-06, + "loss": 0.5327, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 1.6876272721147005, + "learning_rate": 9.989140718734515e-06, + "loss": 0.5607, + "step": 406 + }, + { + "epoch": 0.05, + "grad_norm": 0.729149900507196, + "learning_rate": 9.989007933416348e-06, + "loss": 0.4808, + "step": 407 + }, + { + "epoch": 0.05, + "grad_norm": 1.7700639883038296, + "learning_rate": 9.988874342080011e-06, + "loss": 0.5669, + "step": 408 + }, + { + "epoch": 0.05, + "grad_norm": 1.5781092289283492, + "learning_rate": 9.988739944747086e-06, + "loss": 0.505, + "step": 409 + }, + { + "epoch": 0.05, + "grad_norm": 1.557526972913807, + "learning_rate": 9.988604741439288e-06, + "loss": 0.5129, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 1.8408561874260345, + "learning_rate": 9.988468732178456e-06, + "loss": 0.5595, + "step": 411 + }, + { + "epoch": 0.05, + "grad_norm": 2.0413512443099053, + "learning_rate": 9.988331916986565e-06, + "loss": 0.5793, + "step": 412 + }, + { + "epoch": 0.05, + "grad_norm": 1.6768995403140314, + "learning_rate": 9.988194295885721e-06, + "loss": 0.5092, + "step": 413 + }, + { + "epoch": 0.05, + "grad_norm": 1.4892709861156581, + "learning_rate": 9.988055868898156e-06, + "loss": 0.5342, + "step": 414 + }, + { + "epoch": 0.05, + "grad_norm": 1.6539830627005252, + "learning_rate": 9.987916636046234e-06, + "loss": 0.5679, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 1.6412699630465084, + "learning_rate": 9.987776597352451e-06, + "loss": 0.531, + "step": 416 + }, + { + "epoch": 0.05, + "grad_norm": 1.5955019956242151, + "learning_rate": 9.987635752839429e-06, + "loss": 0.4732, + "step": 417 + }, + { + "epoch": 0.05, + "grad_norm": 1.4359477688178175, + "learning_rate": 9.987494102529924e-06, + "loss": 0.4767, + "step": 418 + }, + { + "epoch": 0.05, + "grad_norm": 1.6153040652848387, + "learning_rate": 9.987351646446824e-06, + "loss": 0.4845, + "step": 419 + }, + { + "epoch": 0.05, + "grad_norm": 1.592003559782074, + "learning_rate": 9.987208384613136e-06, + "loss": 0.5956, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 1.7946167267388238, + "learning_rate": 9.987064317052013e-06, + "loss": 0.6168, + "step": 421 + }, + { + "epoch": 0.05, + "grad_norm": 0.7008916654016195, + "learning_rate": 9.986919443786727e-06, + "loss": 0.5003, + "step": 422 + }, + { + "epoch": 0.05, + "grad_norm": 5.2870924806038975, + "learning_rate": 9.986773764840684e-06, + "loss": 0.5361, + "step": 423 + }, + { + "epoch": 0.05, + "grad_norm": 4.484401416940637, + "learning_rate": 9.98662728023742e-06, + "loss": 0.5721, + "step": 424 + }, + { + "epoch": 0.05, + "grad_norm": 1.547533213628365, + "learning_rate": 9.986479990000598e-06, + "loss": 0.5358, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 2.673741921515085, + "learning_rate": 9.98633189415402e-06, + "loss": 0.504, + "step": 426 + }, + { + "epoch": 0.05, + "grad_norm": 1.4089500017727525, + "learning_rate": 9.986182992721606e-06, + "loss": 0.5495, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 2.2003603241293996, + "learning_rate": 9.986033285727418e-06, + "loss": 0.5278, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 3.784641006031379, + "learning_rate": 9.985882773195638e-06, + "loss": 0.5357, + "step": 429 + }, + { + "epoch": 0.05, + "grad_norm": 1.7317785476187197, + "learning_rate": 9.985731455150584e-06, + "loss": 0.5567, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 2.648485799879626, + "learning_rate": 9.985579331616705e-06, + "loss": 0.4999, + "step": 431 + }, + { + "epoch": 0.05, + "grad_norm": 1.9685222126772437, + "learning_rate": 9.985426402618574e-06, + "loss": 0.5034, + "step": 432 + }, + { + "epoch": 0.05, + "grad_norm": 1.4689771404993912, + "learning_rate": 9.985272668180901e-06, + "loss": 0.537, + "step": 433 + }, + { + "epoch": 0.05, + "grad_norm": 1.688785913533373, + "learning_rate": 9.985118128328522e-06, + "loss": 0.5454, + "step": 434 + }, + { + "epoch": 0.05, + "grad_norm": 4.48562468774087, + "learning_rate": 9.984962783086403e-06, + "loss": 0.4896, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 1.459814886971303, + "learning_rate": 9.984806632479643e-06, + "loss": 0.4912, + "step": 436 + }, + { + "epoch": 0.05, + "grad_norm": 1.5872647895748622, + "learning_rate": 9.98464967653347e-06, + "loss": 0.5714, + "step": 437 + }, + { + "epoch": 0.05, + "grad_norm": 1.9004557876468335, + "learning_rate": 9.98449191527324e-06, + "loss": 0.494, + "step": 438 + }, + { + "epoch": 0.05, + "grad_norm": 1.985375325252717, + "learning_rate": 9.984333348724442e-06, + "loss": 0.5637, + "step": 439 + }, + { + "epoch": 0.05, + "grad_norm": 1.8027928544018976, + "learning_rate": 9.984173976912695e-06, + "loss": 0.5001, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 1.5841099007415989, + "learning_rate": 9.984013799863744e-06, + "loss": 0.5864, + "step": 441 + }, + { + "epoch": 0.05, + "grad_norm": 1.5646937500034126, + "learning_rate": 9.983852817603468e-06, + "loss": 0.574, + "step": 442 + }, + { + "epoch": 0.05, + "grad_norm": 1.97296027361365, + "learning_rate": 9.983691030157876e-06, + "loss": 0.4592, + "step": 443 + }, + { + "epoch": 0.06, + "grad_norm": 0.7128940493354311, + "learning_rate": 9.983528437553106e-06, + "loss": 0.5014, + "step": 444 + }, + { + "epoch": 0.06, + "grad_norm": 1.3553160387069387, + "learning_rate": 9.983365039815425e-06, + "loss": 0.5138, + "step": 445 + }, + { + "epoch": 0.06, + "grad_norm": 1.7551202165825732, + "learning_rate": 9.983200836971234e-06, + "loss": 0.4924, + "step": 446 + }, + { + "epoch": 0.06, + "grad_norm": 1.783065671146444, + "learning_rate": 9.983035829047058e-06, + "loss": 0.4881, + "step": 447 + }, + { + "epoch": 0.06, + "grad_norm": 1.6346481923047431, + "learning_rate": 9.982870016069557e-06, + "loss": 0.5086, + "step": 448 + }, + { + "epoch": 0.06, + "grad_norm": 2.269760431343647, + "learning_rate": 9.98270339806552e-06, + "loss": 0.4659, + "step": 449 + }, + { + "epoch": 0.06, + "grad_norm": 1.9208644340466066, + "learning_rate": 9.982535975061866e-06, + "loss": 0.4691, + "step": 450 + }, + { + "epoch": 0.06, + "grad_norm": 0.700303977995299, + "learning_rate": 9.982367747085642e-06, + "loss": 0.4813, + "step": 451 + }, + { + "epoch": 0.06, + "grad_norm": 1.4580808287839546, + "learning_rate": 9.982198714164029e-06, + "loss": 0.5482, + "step": 452 + }, + { + "epoch": 0.06, + "grad_norm": 1.6395785682570598, + "learning_rate": 9.982028876324334e-06, + "loss": 0.5674, + "step": 453 + }, + { + "epoch": 0.06, + "grad_norm": 3.089402865370069, + "learning_rate": 9.981858233593996e-06, + "loss": 0.5959, + "step": 454 + }, + { + "epoch": 0.06, + "grad_norm": 1.7647424329197148, + "learning_rate": 9.981686786000584e-06, + "loss": 0.5567, + "step": 455 + }, + { + "epoch": 0.06, + "grad_norm": 2.6426304996228964, + "learning_rate": 9.981514533571797e-06, + "loss": 0.5418, + "step": 456 + }, + { + "epoch": 0.06, + "grad_norm": 2.124650738441062, + "learning_rate": 9.981341476335464e-06, + "loss": 0.602, + "step": 457 + }, + { + "epoch": 0.06, + "grad_norm": 4.379438226743473, + "learning_rate": 9.981167614319542e-06, + "loss": 0.4779, + "step": 458 + }, + { + "epoch": 0.06, + "grad_norm": 3.0204003882969204, + "learning_rate": 9.980992947552124e-06, + "loss": 0.5301, + "step": 459 + }, + { + "epoch": 0.06, + "grad_norm": 1.6188537242497516, + "learning_rate": 9.980817476061426e-06, + "loss": 0.4723, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 1.636311850641546, + "learning_rate": 9.980641199875797e-06, + "loss": 0.5494, + "step": 461 + }, + { + "epoch": 0.06, + "grad_norm": 1.5196244609813645, + "learning_rate": 9.980464119023718e-06, + "loss": 0.5092, + "step": 462 + }, + { + "epoch": 0.06, + "grad_norm": 1.59037993007294, + "learning_rate": 9.980286233533795e-06, + "loss": 0.5209, + "step": 463 + }, + { + "epoch": 0.06, + "grad_norm": 1.4369151490083365, + "learning_rate": 9.980107543434769e-06, + "loss": 0.5809, + "step": 464 + }, + { + "epoch": 0.06, + "grad_norm": 3.0160569481227903, + "learning_rate": 9.97992804875551e-06, + "loss": 0.5392, + "step": 465 + }, + { + "epoch": 0.06, + "grad_norm": 0.7213586451768685, + "learning_rate": 9.979747749525014e-06, + "loss": 0.5294, + "step": 466 + }, + { + "epoch": 0.06, + "grad_norm": 1.5676191987425458, + "learning_rate": 9.979566645772412e-06, + "loss": 0.4995, + "step": 467 + }, + { + "epoch": 0.06, + "grad_norm": 2.2941976901857233, + "learning_rate": 9.97938473752696e-06, + "loss": 0.5625, + "step": 468 + }, + { + "epoch": 0.06, + "grad_norm": 2.3231610487610865, + "learning_rate": 9.979202024818052e-06, + "loss": 0.561, + "step": 469 + }, + { + "epoch": 0.06, + "grad_norm": 2.0242057322321005, + "learning_rate": 9.979018507675202e-06, + "loss": 0.5524, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 1.8226219382548836, + "learning_rate": 9.978834186128063e-06, + "loss": 0.512, + "step": 471 + }, + { + "epoch": 0.06, + "grad_norm": 1.4955935257507191, + "learning_rate": 9.97864906020641e-06, + "loss": 0.5431, + "step": 472 + }, + { + "epoch": 0.06, + "grad_norm": 1.680011950409537, + "learning_rate": 9.978463129940153e-06, + "loss": 0.5267, + "step": 473 + }, + { + "epoch": 0.06, + "grad_norm": 1.565469276971012, + "learning_rate": 9.978276395359332e-06, + "loss": 0.5174, + "step": 474 + }, + { + "epoch": 0.06, + "grad_norm": 0.6714958132903396, + "learning_rate": 9.978088856494115e-06, + "loss": 0.5051, + "step": 475 + }, + { + "epoch": 0.06, + "grad_norm": 1.8389831857433339, + "learning_rate": 9.977900513374799e-06, + "loss": 0.5401, + "step": 476 + }, + { + "epoch": 0.06, + "grad_norm": 1.5499532118888697, + "learning_rate": 9.977711366031813e-06, + "loss": 0.544, + "step": 477 + }, + { + "epoch": 0.06, + "grad_norm": 1.727431677360426, + "learning_rate": 9.977521414495716e-06, + "loss": 0.518, + "step": 478 + }, + { + "epoch": 0.06, + "grad_norm": 2.157775699970845, + "learning_rate": 9.977330658797199e-06, + "loss": 0.5556, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 1.5612974363773897, + "learning_rate": 9.977139098967075e-06, + "loss": 0.5161, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 1.7175998885815322, + "learning_rate": 9.976946735036296e-06, + "loss": 0.5357, + "step": 481 + }, + { + "epoch": 0.06, + "grad_norm": 1.406772227901931, + "learning_rate": 9.97675356703594e-06, + "loss": 0.5333, + "step": 482 + }, + { + "epoch": 0.06, + "grad_norm": 2.1451551555311283, + "learning_rate": 9.976559594997211e-06, + "loss": 0.5144, + "step": 483 + }, + { + "epoch": 0.06, + "grad_norm": 1.8879155418167197, + "learning_rate": 9.976364818951452e-06, + "loss": 0.5342, + "step": 484 + }, + { + "epoch": 0.06, + "grad_norm": 1.4034381097267934, + "learning_rate": 9.976169238930128e-06, + "loss": 0.5143, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 1.6253361682506393, + "learning_rate": 9.975972854964837e-06, + "loss": 0.5532, + "step": 486 + }, + { + "epoch": 0.06, + "grad_norm": 2.0486162168827273, + "learning_rate": 9.975775667087308e-06, + "loss": 0.5694, + "step": 487 + }, + { + "epoch": 0.06, + "grad_norm": 1.5977042020706673, + "learning_rate": 9.975577675329396e-06, + "loss": 0.6012, + "step": 488 + }, + { + "epoch": 0.06, + "grad_norm": 2.6581212962368017, + "learning_rate": 9.97537887972309e-06, + "loss": 0.5014, + "step": 489 + }, + { + "epoch": 0.06, + "grad_norm": 1.450762817262192, + "learning_rate": 9.975179280300507e-06, + "loss": 0.5355, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 1.496453897715201, + "learning_rate": 9.974978877093892e-06, + "loss": 0.5749, + "step": 491 + }, + { + "epoch": 0.06, + "grad_norm": 1.5955663116958974, + "learning_rate": 9.974777670135627e-06, + "loss": 0.5485, + "step": 492 + }, + { + "epoch": 0.06, + "grad_norm": 1.6511586849526274, + "learning_rate": 9.974575659458214e-06, + "loss": 0.528, + "step": 493 + }, + { + "epoch": 0.06, + "grad_norm": 1.8173706390670754, + "learning_rate": 9.97437284509429e-06, + "loss": 0.5043, + "step": 494 + }, + { + "epoch": 0.06, + "grad_norm": 1.450248038097281, + "learning_rate": 9.974169227076623e-06, + "loss": 0.56, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 2.3759328275504474, + "learning_rate": 9.97396480543811e-06, + "loss": 0.519, + "step": 496 + }, + { + "epoch": 0.06, + "grad_norm": 1.6118263272522013, + "learning_rate": 9.973759580211776e-06, + "loss": 0.4886, + "step": 497 + }, + { + "epoch": 0.06, + "grad_norm": 1.4398888457780274, + "learning_rate": 9.973553551430775e-06, + "loss": 0.5334, + "step": 498 + }, + { + "epoch": 0.06, + "grad_norm": 1.469914373917854, + "learning_rate": 9.973346719128395e-06, + "loss": 0.4947, + "step": 499 + }, + { + "epoch": 0.06, + "grad_norm": 1.570025029324263, + "learning_rate": 9.973139083338052e-06, + "loss": 0.5887, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 1.7122866183202354, + "learning_rate": 9.97293064409329e-06, + "loss": 0.5287, + "step": 501 + }, + { + "epoch": 0.06, + "grad_norm": 1.4599374691955285, + "learning_rate": 9.972721401427785e-06, + "loss": 0.5127, + "step": 502 + }, + { + "epoch": 0.06, + "grad_norm": 1.4856231479658881, + "learning_rate": 9.972511355375341e-06, + "loss": 0.5805, + "step": 503 + }, + { + "epoch": 0.06, + "grad_norm": 2.0164015385758502, + "learning_rate": 9.972300505969896e-06, + "loss": 0.5619, + "step": 504 + }, + { + "epoch": 0.06, + "grad_norm": 2.2997701887534436, + "learning_rate": 9.97208885324551e-06, + "loss": 0.4729, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 1.756468060556157, + "learning_rate": 9.97187639723638e-06, + "loss": 0.5284, + "step": 506 + }, + { + "epoch": 0.06, + "grad_norm": 1.7198355693077105, + "learning_rate": 9.971663137976829e-06, + "loss": 0.5327, + "step": 507 + }, + { + "epoch": 0.06, + "grad_norm": 1.7937945953770489, + "learning_rate": 9.971449075501313e-06, + "loss": 0.5882, + "step": 508 + }, + { + "epoch": 0.06, + "grad_norm": 1.4428157798894228, + "learning_rate": 9.971234209844413e-06, + "loss": 0.516, + "step": 509 + }, + { + "epoch": 0.06, + "grad_norm": 0.6343343346489136, + "learning_rate": 9.971018541040844e-06, + "loss": 0.4583, + "step": 510 + }, + { + "epoch": 0.06, + "grad_norm": 1.6346268222138216, + "learning_rate": 9.970802069125449e-06, + "loss": 0.5788, + "step": 511 + }, + { + "epoch": 0.06, + "grad_norm": 2.236311818205272, + "learning_rate": 9.9705847941332e-06, + "loss": 0.5473, + "step": 512 + }, + { + "epoch": 0.06, + "grad_norm": 4.186404431476861, + "learning_rate": 9.970366716099203e-06, + "loss": 0.5021, + "step": 513 + }, + { + "epoch": 0.06, + "grad_norm": 1.448273391917764, + "learning_rate": 9.970147835058686e-06, + "loss": 0.4891, + "step": 514 + }, + { + "epoch": 0.06, + "grad_norm": 1.6707135102073185, + "learning_rate": 9.969928151047012e-06, + "loss": 0.5128, + "step": 515 + }, + { + "epoch": 0.06, + "grad_norm": 1.9157604861759812, + "learning_rate": 9.969707664099677e-06, + "loss": 0.5314, + "step": 516 + }, + { + "epoch": 0.06, + "grad_norm": 1.4641731434697254, + "learning_rate": 9.969486374252297e-06, + "loss": 0.5648, + "step": 517 + }, + { + "epoch": 0.06, + "grad_norm": 0.6951811673512422, + "learning_rate": 9.969264281540627e-06, + "loss": 0.5429, + "step": 518 + }, + { + "epoch": 0.06, + "grad_norm": 1.5351361206003573, + "learning_rate": 9.969041386000547e-06, + "loss": 0.5203, + "step": 519 + }, + { + "epoch": 0.06, + "grad_norm": 1.8922924188464132, + "learning_rate": 9.968817687668067e-06, + "loss": 0.5381, + "step": 520 + }, + { + "epoch": 0.06, + "grad_norm": 1.5799451731489043, + "learning_rate": 9.96859318657933e-06, + "loss": 0.5668, + "step": 521 + }, + { + "epoch": 0.06, + "grad_norm": 2.0926399677961864, + "learning_rate": 9.968367882770601e-06, + "loss": 0.5635, + "step": 522 + }, + { + "epoch": 0.06, + "grad_norm": 1.9474775210488486, + "learning_rate": 9.968141776278287e-06, + "loss": 0.571, + "step": 523 + }, + { + "epoch": 0.07, + "grad_norm": 1.7431408889119921, + "learning_rate": 9.96791486713891e-06, + "loss": 0.5826, + "step": 524 + }, + { + "epoch": 0.07, + "grad_norm": 1.8072112193176124, + "learning_rate": 9.967687155389135e-06, + "loss": 0.6058, + "step": 525 + }, + { + "epoch": 0.07, + "grad_norm": 1.5487646831257245, + "learning_rate": 9.967458641065748e-06, + "loss": 0.5161, + "step": 526 + }, + { + "epoch": 0.07, + "grad_norm": 1.9345132321574223, + "learning_rate": 9.967229324205666e-06, + "loss": 0.5713, + "step": 527 + }, + { + "epoch": 0.07, + "grad_norm": 1.9435970449322106, + "learning_rate": 9.966999204845942e-06, + "loss": 0.5563, + "step": 528 + }, + { + "epoch": 0.07, + "grad_norm": 1.43796673259639, + "learning_rate": 9.966768283023752e-06, + "loss": 0.5661, + "step": 529 + }, + { + "epoch": 0.07, + "grad_norm": 1.6557336189733225, + "learning_rate": 9.966536558776399e-06, + "loss": 0.5597, + "step": 530 + }, + { + "epoch": 0.07, + "grad_norm": 1.9099136381706796, + "learning_rate": 9.966304032141325e-06, + "loss": 0.4683, + "step": 531 + }, + { + "epoch": 0.07, + "grad_norm": 1.5887496350127177, + "learning_rate": 9.966070703156096e-06, + "loss": 0.476, + "step": 532 + }, + { + "epoch": 0.07, + "grad_norm": 2.2832288568597026, + "learning_rate": 9.965836571858408e-06, + "loss": 0.4918, + "step": 533 + }, + { + "epoch": 0.07, + "grad_norm": 1.8915667838943808, + "learning_rate": 9.965601638286085e-06, + "loss": 0.4534, + "step": 534 + }, + { + "epoch": 0.07, + "grad_norm": 1.5838608224606279, + "learning_rate": 9.965365902477085e-06, + "loss": 0.5287, + "step": 535 + }, + { + "epoch": 0.07, + "grad_norm": 1.5069296219030366, + "learning_rate": 9.965129364469492e-06, + "loss": 0.522, + "step": 536 + }, + { + "epoch": 0.07, + "grad_norm": 1.575781289038582, + "learning_rate": 9.96489202430152e-06, + "loss": 0.5527, + "step": 537 + }, + { + "epoch": 0.07, + "grad_norm": 1.495415835304909, + "learning_rate": 9.964653882011516e-06, + "loss": 0.5092, + "step": 538 + }, + { + "epoch": 0.07, + "grad_norm": 1.6262876945226945, + "learning_rate": 9.964414937637953e-06, + "loss": 0.4981, + "step": 539 + }, + { + "epoch": 0.07, + "grad_norm": 0.6387936310785975, + "learning_rate": 9.964175191219436e-06, + "loss": 0.4891, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 1.6217650667702996, + "learning_rate": 9.963934642794694e-06, + "loss": 0.5808, + "step": 541 + }, + { + "epoch": 0.07, + "grad_norm": 1.4841632250821106, + "learning_rate": 9.963693292402592e-06, + "loss": 0.552, + "step": 542 + }, + { + "epoch": 0.07, + "grad_norm": 2.2404882976865395, + "learning_rate": 9.963451140082124e-06, + "loss": 0.5235, + "step": 543 + }, + { + "epoch": 0.07, + "grad_norm": 1.8579527183857778, + "learning_rate": 9.963208185872407e-06, + "loss": 0.5543, + "step": 544 + }, + { + "epoch": 0.07, + "grad_norm": 1.9117719830923814, + "learning_rate": 9.962964429812697e-06, + "loss": 0.5189, + "step": 545 + }, + { + "epoch": 0.07, + "grad_norm": 1.5305379419100715, + "learning_rate": 9.962719871942374e-06, + "loss": 0.5631, + "step": 546 + }, + { + "epoch": 0.07, + "grad_norm": 1.882149654586994, + "learning_rate": 9.962474512300948e-06, + "loss": 0.5624, + "step": 547 + }, + { + "epoch": 0.07, + "grad_norm": 1.6646759237474957, + "learning_rate": 9.962228350928058e-06, + "loss": 0.5551, + "step": 548 + }, + { + "epoch": 0.07, + "grad_norm": 1.5649208125553558, + "learning_rate": 9.961981387863476e-06, + "loss": 0.5388, + "step": 549 + }, + { + "epoch": 0.07, + "grad_norm": 1.4445542695684628, + "learning_rate": 9.961733623147098e-06, + "loss": 0.5359, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 1.621089252780402, + "learning_rate": 9.961485056818957e-06, + "loss": 0.5501, + "step": 551 + }, + { + "epoch": 0.07, + "grad_norm": 1.3600616481489842, + "learning_rate": 9.961235688919204e-06, + "loss": 0.5164, + "step": 552 + }, + { + "epoch": 0.07, + "grad_norm": 1.5291096429900197, + "learning_rate": 9.960985519488133e-06, + "loss": 0.5464, + "step": 553 + }, + { + "epoch": 0.07, + "grad_norm": 1.8444597766221988, + "learning_rate": 9.96073454856616e-06, + "loss": 0.5621, + "step": 554 + }, + { + "epoch": 0.07, + "grad_norm": 1.6732011691950741, + "learning_rate": 9.96048277619383e-06, + "loss": 0.5049, + "step": 555 + }, + { + "epoch": 0.07, + "grad_norm": 1.438419778694226, + "learning_rate": 9.960230202411818e-06, + "loss": 0.5538, + "step": 556 + }, + { + "epoch": 0.07, + "grad_norm": 1.6603929165553797, + "learning_rate": 9.95997682726093e-06, + "loss": 0.5469, + "step": 557 + }, + { + "epoch": 0.07, + "grad_norm": 1.7668487791259488, + "learning_rate": 9.959722650782104e-06, + "loss": 0.555, + "step": 558 + }, + { + "epoch": 0.07, + "grad_norm": 1.6218233851859314, + "learning_rate": 9.959467673016403e-06, + "loss": 0.5416, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 2.0098143485676965, + "learning_rate": 9.959211894005018e-06, + "loss": 0.588, + "step": 560 + }, + { + "epoch": 0.07, + "grad_norm": 1.5346689636597521, + "learning_rate": 9.958955313789277e-06, + "loss": 0.6114, + "step": 561 + }, + { + "epoch": 0.07, + "grad_norm": 1.3691963998562475, + "learning_rate": 9.958697932410631e-06, + "loss": 0.5647, + "step": 562 + }, + { + "epoch": 0.07, + "grad_norm": 1.5902891681020572, + "learning_rate": 9.958439749910658e-06, + "loss": 0.5241, + "step": 563 + }, + { + "epoch": 0.07, + "grad_norm": 1.4317090557464656, + "learning_rate": 9.958180766331076e-06, + "loss": 0.5344, + "step": 564 + }, + { + "epoch": 0.07, + "grad_norm": 1.6223451298186553, + "learning_rate": 9.957920981713723e-06, + "loss": 0.5354, + "step": 565 + }, + { + "epoch": 0.07, + "grad_norm": 1.4911420362299912, + "learning_rate": 9.95766039610057e-06, + "loss": 0.5261, + "step": 566 + }, + { + "epoch": 0.07, + "grad_norm": 1.459595289704798, + "learning_rate": 9.957399009533716e-06, + "loss": 0.5441, + "step": 567 + }, + { + "epoch": 0.07, + "grad_norm": 1.50536016248566, + "learning_rate": 9.957136822055392e-06, + "loss": 0.5742, + "step": 568 + }, + { + "epoch": 0.07, + "grad_norm": 3.099600133776981, + "learning_rate": 9.956873833707958e-06, + "loss": 0.5347, + "step": 569 + }, + { + "epoch": 0.07, + "grad_norm": 1.823400575954244, + "learning_rate": 9.956610044533897e-06, + "loss": 0.5679, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 2.211244433828808, + "learning_rate": 9.956345454575831e-06, + "loss": 0.5144, + "step": 571 + }, + { + "epoch": 0.07, + "grad_norm": 1.3813580483388566, + "learning_rate": 9.956080063876503e-06, + "loss": 0.5213, + "step": 572 + }, + { + "epoch": 0.07, + "grad_norm": 1.547609620866616, + "learning_rate": 9.955813872478794e-06, + "loss": 0.5107, + "step": 573 + }, + { + "epoch": 0.07, + "grad_norm": 1.9698736531779897, + "learning_rate": 9.955546880425706e-06, + "loss": 0.5011, + "step": 574 + }, + { + "epoch": 0.07, + "grad_norm": 0.7132662201669262, + "learning_rate": 9.955279087760374e-06, + "loss": 0.4852, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 2.1289820789250253, + "learning_rate": 9.955010494526062e-06, + "loss": 0.6084, + "step": 576 + }, + { + "epoch": 0.07, + "grad_norm": 0.6894981066518925, + "learning_rate": 9.954741100766167e-06, + "loss": 0.5068, + "step": 577 + }, + { + "epoch": 0.07, + "grad_norm": 1.993185449507924, + "learning_rate": 9.95447090652421e-06, + "loss": 0.5744, + "step": 578 + }, + { + "epoch": 0.07, + "grad_norm": 1.9175405662571439, + "learning_rate": 9.954199911843842e-06, + "loss": 0.5143, + "step": 579 + }, + { + "epoch": 0.07, + "grad_norm": 1.707912930910467, + "learning_rate": 9.953928116768848e-06, + "loss": 0.5874, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 2.1196759791921496, + "learning_rate": 9.953655521343132e-06, + "loss": 0.5544, + "step": 581 + }, + { + "epoch": 0.07, + "grad_norm": 1.6578798781305177, + "learning_rate": 9.953382125610742e-06, + "loss": 0.5006, + "step": 582 + }, + { + "epoch": 0.07, + "grad_norm": 1.7117964965327253, + "learning_rate": 9.953107929615843e-06, + "loss": 0.5286, + "step": 583 + }, + { + "epoch": 0.07, + "grad_norm": 1.718532476961154, + "learning_rate": 9.952832933402733e-06, + "loss": 0.5907, + "step": 584 + }, + { + "epoch": 0.07, + "grad_norm": 4.874584366704575, + "learning_rate": 9.952557137015846e-06, + "loss": 0.5456, + "step": 585 + }, + { + "epoch": 0.07, + "grad_norm": 2.091952706607619, + "learning_rate": 9.952280540499733e-06, + "loss": 0.5807, + "step": 586 + }, + { + "epoch": 0.07, + "grad_norm": 1.8340961057465826, + "learning_rate": 9.952003143899082e-06, + "loss": 0.5044, + "step": 587 + }, + { + "epoch": 0.07, + "grad_norm": 1.4142321636334534, + "learning_rate": 9.951724947258713e-06, + "loss": 0.4952, + "step": 588 + }, + { + "epoch": 0.07, + "grad_norm": 1.5498823391862544, + "learning_rate": 9.951445950623565e-06, + "loss": 0.5229, + "step": 589 + }, + { + "epoch": 0.07, + "grad_norm": 7.976631116773797, + "learning_rate": 9.951166154038716e-06, + "loss": 0.5212, + "step": 590 + }, + { + "epoch": 0.07, + "grad_norm": 1.4043970725348118, + "learning_rate": 9.950885557549369e-06, + "loss": 0.5213, + "step": 591 + }, + { + "epoch": 0.07, + "grad_norm": 1.6411492750911545, + "learning_rate": 9.950604161200855e-06, + "loss": 0.508, + "step": 592 + }, + { + "epoch": 0.07, + "grad_norm": 1.6155280198944444, + "learning_rate": 9.950321965038638e-06, + "loss": 0.5176, + "step": 593 + }, + { + "epoch": 0.07, + "grad_norm": 1.6329573178476742, + "learning_rate": 9.950038969108312e-06, + "loss": 0.5381, + "step": 594 + }, + { + "epoch": 0.07, + "grad_norm": 1.5322368189387603, + "learning_rate": 9.949755173455591e-06, + "loss": 0.4737, + "step": 595 + }, + { + "epoch": 0.07, + "grad_norm": 1.8966104227734712, + "learning_rate": 9.94947057812633e-06, + "loss": 0.5447, + "step": 596 + }, + { + "epoch": 0.07, + "grad_norm": 2.3255920186297554, + "learning_rate": 9.949185183166506e-06, + "loss": 0.4983, + "step": 597 + }, + { + "epoch": 0.07, + "grad_norm": 1.514982661457202, + "learning_rate": 9.948898988622226e-06, + "loss": 0.4703, + "step": 598 + }, + { + "epoch": 0.07, + "grad_norm": 1.5578472451579808, + "learning_rate": 9.94861199453973e-06, + "loss": 0.5355, + "step": 599 + }, + { + "epoch": 0.07, + "grad_norm": 1.8462377204745257, + "learning_rate": 9.948324200965382e-06, + "loss": 0.5217, + "step": 600 + }, + { + "epoch": 0.07, + "grad_norm": 2.0501987500355052, + "learning_rate": 9.948035607945678e-06, + "loss": 0.5216, + "step": 601 + }, + { + "epoch": 0.07, + "grad_norm": 0.7196396011246718, + "learning_rate": 9.947746215527246e-06, + "loss": 0.4568, + "step": 602 + }, + { + "epoch": 0.07, + "grad_norm": 2.2029295205716912, + "learning_rate": 9.947456023756834e-06, + "loss": 0.5113, + "step": 603 + }, + { + "epoch": 0.07, + "grad_norm": 2.4106620683922255, + "learning_rate": 9.94716503268133e-06, + "loss": 0.5382, + "step": 604 + }, + { + "epoch": 0.08, + "grad_norm": 1.4704451672909378, + "learning_rate": 9.946873242347741e-06, + "loss": 0.552, + "step": 605 + }, + { + "epoch": 0.08, + "grad_norm": 1.5073803415658742, + "learning_rate": 9.946580652803215e-06, + "loss": 0.4871, + "step": 606 + }, + { + "epoch": 0.08, + "grad_norm": 1.9073111646355374, + "learning_rate": 9.946287264095016e-06, + "loss": 0.5451, + "step": 607 + }, + { + "epoch": 0.08, + "grad_norm": 1.4481931142536788, + "learning_rate": 9.945993076270548e-06, + "loss": 0.4984, + "step": 608 + }, + { + "epoch": 0.08, + "grad_norm": 1.5698280416999764, + "learning_rate": 9.945698089377338e-06, + "loss": 0.5163, + "step": 609 + }, + { + "epoch": 0.08, + "grad_norm": 1.5623181354974682, + "learning_rate": 9.945402303463045e-06, + "loss": 0.4943, + "step": 610 + }, + { + "epoch": 0.08, + "grad_norm": 1.545427331377954, + "learning_rate": 9.945105718575452e-06, + "loss": 0.5753, + "step": 611 + }, + { + "epoch": 0.08, + "grad_norm": 2.519017117437064, + "learning_rate": 9.94480833476248e-06, + "loss": 0.4687, + "step": 612 + }, + { + "epoch": 0.08, + "grad_norm": 1.320764969254525, + "learning_rate": 9.94451015207217e-06, + "loss": 0.5192, + "step": 613 + }, + { + "epoch": 0.08, + "grad_norm": 1.585198566620101, + "learning_rate": 9.944211170552698e-06, + "loss": 0.5299, + "step": 614 + }, + { + "epoch": 0.08, + "grad_norm": 2.065016309964868, + "learning_rate": 9.943911390252368e-06, + "loss": 0.5043, + "step": 615 + }, + { + "epoch": 0.08, + "grad_norm": 1.717156629974031, + "learning_rate": 9.943610811219608e-06, + "loss": 0.5314, + "step": 616 + }, + { + "epoch": 0.08, + "grad_norm": 1.3992808758110693, + "learning_rate": 9.943309433502985e-06, + "loss": 0.4816, + "step": 617 + }, + { + "epoch": 0.08, + "grad_norm": 1.4741157871118, + "learning_rate": 9.943007257151185e-06, + "loss": 0.5328, + "step": 618 + }, + { + "epoch": 0.08, + "grad_norm": 1.6625878117192396, + "learning_rate": 9.942704282213028e-06, + "loss": 0.5106, + "step": 619 + }, + { + "epoch": 0.08, + "grad_norm": 1.5814144236989829, + "learning_rate": 9.942400508737465e-06, + "loss": 0.5308, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 1.5003023210361535, + "learning_rate": 9.942095936773568e-06, + "loss": 0.5584, + "step": 621 + }, + { + "epoch": 0.08, + "grad_norm": 1.5907340486009833, + "learning_rate": 9.941790566370547e-06, + "loss": 0.5498, + "step": 622 + }, + { + "epoch": 0.08, + "grad_norm": 1.2882804193460367, + "learning_rate": 9.941484397577737e-06, + "loss": 0.5108, + "step": 623 + }, + { + "epoch": 0.08, + "grad_norm": 1.679829152208987, + "learning_rate": 9.941177430444601e-06, + "loss": 0.5687, + "step": 624 + }, + { + "epoch": 0.08, + "grad_norm": 1.4950337923655184, + "learning_rate": 9.940869665020736e-06, + "loss": 0.583, + "step": 625 + }, + { + "epoch": 0.08, + "grad_norm": 0.692963812544074, + "learning_rate": 9.940561101355858e-06, + "loss": 0.507, + "step": 626 + }, + { + "epoch": 0.08, + "grad_norm": 1.5440986461401733, + "learning_rate": 9.940251739499824e-06, + "loss": 0.5577, + "step": 627 + }, + { + "epoch": 0.08, + "grad_norm": 1.6306303651228953, + "learning_rate": 9.93994157950261e-06, + "loss": 0.4947, + "step": 628 + }, + { + "epoch": 0.08, + "grad_norm": 0.6663936494364988, + "learning_rate": 9.939630621414328e-06, + "loss": 0.467, + "step": 629 + }, + { + "epoch": 0.08, + "grad_norm": 1.8972238621140487, + "learning_rate": 9.939318865285214e-06, + "loss": 0.5485, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 1.4247705518456582, + "learning_rate": 9.939006311165636e-06, + "loss": 0.5473, + "step": 631 + }, + { + "epoch": 0.08, + "grad_norm": 1.718273329018105, + "learning_rate": 9.938692959106089e-06, + "loss": 0.5431, + "step": 632 + }, + { + "epoch": 0.08, + "grad_norm": 0.6943576952650891, + "learning_rate": 9.9383788091572e-06, + "loss": 0.5074, + "step": 633 + }, + { + "epoch": 0.08, + "grad_norm": 0.6876591007539876, + "learning_rate": 9.938063861369718e-06, + "loss": 0.5159, + "step": 634 + }, + { + "epoch": 0.08, + "grad_norm": 1.7548691737757782, + "learning_rate": 9.93774811579453e-06, + "loss": 0.5381, + "step": 635 + }, + { + "epoch": 0.08, + "grad_norm": 1.5154806990192187, + "learning_rate": 9.937431572482646e-06, + "loss": 0.5053, + "step": 636 + }, + { + "epoch": 0.08, + "grad_norm": 1.5611688276113909, + "learning_rate": 9.937114231485207e-06, + "loss": 0.506, + "step": 637 + }, + { + "epoch": 0.08, + "grad_norm": 1.9093304638448643, + "learning_rate": 9.936796092853482e-06, + "loss": 0.482, + "step": 638 + }, + { + "epoch": 0.08, + "grad_norm": 2.097493189282801, + "learning_rate": 9.936477156638868e-06, + "loss": 0.5737, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 1.430398311443158, + "learning_rate": 9.936157422892892e-06, + "loss": 0.5013, + "step": 640 + }, + { + "epoch": 0.08, + "grad_norm": 1.8149506986026083, + "learning_rate": 9.935836891667212e-06, + "loss": 0.5898, + "step": 641 + }, + { + "epoch": 0.08, + "grad_norm": 1.6180185699234901, + "learning_rate": 9.93551556301361e-06, + "loss": 0.5698, + "step": 642 + }, + { + "epoch": 0.08, + "grad_norm": 0.7222498111719983, + "learning_rate": 9.935193436984001e-06, + "loss": 0.5386, + "step": 643 + }, + { + "epoch": 0.08, + "grad_norm": 0.7724001895597116, + "learning_rate": 9.934870513630428e-06, + "loss": 0.5139, + "step": 644 + }, + { + "epoch": 0.08, + "grad_norm": 1.5072040003212988, + "learning_rate": 9.93454679300506e-06, + "loss": 0.5524, + "step": 645 + }, + { + "epoch": 0.08, + "grad_norm": 1.5356579540974762, + "learning_rate": 9.934222275160199e-06, + "loss": 0.5048, + "step": 646 + }, + { + "epoch": 0.08, + "grad_norm": 2.108613917254826, + "learning_rate": 9.933896960148272e-06, + "loss": 0.5056, + "step": 647 + }, + { + "epoch": 0.08, + "grad_norm": 0.6624497135490821, + "learning_rate": 9.933570848021837e-06, + "loss": 0.4759, + "step": 648 + }, + { + "epoch": 0.08, + "grad_norm": 1.8727451645721778, + "learning_rate": 9.933243938833581e-06, + "loss": 0.5638, + "step": 649 + }, + { + "epoch": 0.08, + "grad_norm": 1.4408274780775827, + "learning_rate": 9.932916232636318e-06, + "loss": 0.5385, + "step": 650 + }, + { + "epoch": 0.08, + "grad_norm": 1.488341298958083, + "learning_rate": 9.932587729482992e-06, + "loss": 0.5178, + "step": 651 + }, + { + "epoch": 0.08, + "grad_norm": 1.5293715713962739, + "learning_rate": 9.932258429426678e-06, + "loss": 0.4827, + "step": 652 + }, + { + "epoch": 0.08, + "grad_norm": 1.739811601628948, + "learning_rate": 9.931928332520573e-06, + "loss": 0.5144, + "step": 653 + }, + { + "epoch": 0.08, + "grad_norm": 1.7864609906352191, + "learning_rate": 9.93159743881801e-06, + "loss": 0.5343, + "step": 654 + }, + { + "epoch": 0.08, + "grad_norm": 1.5659000901067446, + "learning_rate": 9.931265748372447e-06, + "loss": 0.4829, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 1.3856751502290265, + "learning_rate": 9.930933261237472e-06, + "loss": 0.5254, + "step": 656 + }, + { + "epoch": 0.08, + "grad_norm": 1.6639494622011914, + "learning_rate": 9.930599977466802e-06, + "loss": 0.5489, + "step": 657 + }, + { + "epoch": 0.08, + "grad_norm": 1.5503942485192046, + "learning_rate": 9.93026589711428e-06, + "loss": 0.4999, + "step": 658 + }, + { + "epoch": 0.08, + "grad_norm": 1.6374283584517775, + "learning_rate": 9.929931020233878e-06, + "loss": 0.5678, + "step": 659 + }, + { + "epoch": 0.08, + "grad_norm": 1.4451768173598474, + "learning_rate": 9.929595346879704e-06, + "loss": 0.5326, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 3.771283449714104, + "learning_rate": 9.929258877105985e-06, + "loss": 0.5726, + "step": 661 + }, + { + "epoch": 0.08, + "grad_norm": 1.5995943876705156, + "learning_rate": 9.928921610967079e-06, + "loss": 0.5317, + "step": 662 + }, + { + "epoch": 0.08, + "grad_norm": 2.0357535708739407, + "learning_rate": 9.92858354851748e-06, + "loss": 0.5355, + "step": 663 + }, + { + "epoch": 0.08, + "grad_norm": 1.6543400868553626, + "learning_rate": 9.928244689811799e-06, + "loss": 0.5311, + "step": 664 + }, + { + "epoch": 0.08, + "grad_norm": 3.5531763061855233, + "learning_rate": 9.927905034904785e-06, + "loss": 0.4885, + "step": 665 + }, + { + "epoch": 0.08, + "grad_norm": 1.6727414973141812, + "learning_rate": 9.927564583851313e-06, + "loss": 0.5728, + "step": 666 + }, + { + "epoch": 0.08, + "grad_norm": 1.5057542539112305, + "learning_rate": 9.927223336706384e-06, + "loss": 0.5613, + "step": 667 + }, + { + "epoch": 0.08, + "grad_norm": 1.3621577823437236, + "learning_rate": 9.926881293525129e-06, + "loss": 0.5133, + "step": 668 + }, + { + "epoch": 0.08, + "grad_norm": 1.3917135906447404, + "learning_rate": 9.92653845436281e-06, + "loss": 0.582, + "step": 669 + }, + { + "epoch": 0.08, + "grad_norm": 1.1837662859106874, + "learning_rate": 9.926194819274812e-06, + "loss": 0.5057, + "step": 670 + }, + { + "epoch": 0.08, + "grad_norm": 1.5927160077016809, + "learning_rate": 9.925850388316657e-06, + "loss": 0.44, + "step": 671 + }, + { + "epoch": 0.08, + "grad_norm": 1.3077815833136432, + "learning_rate": 9.925505161543988e-06, + "loss": 0.5325, + "step": 672 + }, + { + "epoch": 0.08, + "grad_norm": 1.3547793282803282, + "learning_rate": 9.925159139012581e-06, + "loss": 0.4944, + "step": 673 + }, + { + "epoch": 0.08, + "grad_norm": 3.8707164450695393, + "learning_rate": 9.924812320778338e-06, + "loss": 0.497, + "step": 674 + }, + { + "epoch": 0.08, + "grad_norm": 1.5811520204557328, + "learning_rate": 9.92446470689729e-06, + "loss": 0.5824, + "step": 675 + }, + { + "epoch": 0.08, + "grad_norm": 1.6237920019107122, + "learning_rate": 9.924116297425598e-06, + "loss": 0.546, + "step": 676 + }, + { + "epoch": 0.08, + "grad_norm": 1.300885812955191, + "learning_rate": 9.923767092419551e-06, + "loss": 0.543, + "step": 677 + }, + { + "epoch": 0.08, + "grad_norm": 1.4815799886567405, + "learning_rate": 9.923417091935564e-06, + "loss": 0.5462, + "step": 678 + }, + { + "epoch": 0.08, + "grad_norm": 0.7256114801909598, + "learning_rate": 9.923066296030185e-06, + "loss": 0.5211, + "step": 679 + }, + { + "epoch": 0.08, + "grad_norm": 1.5039129063851944, + "learning_rate": 9.922714704760088e-06, + "loss": 0.5257, + "step": 680 + }, + { + "epoch": 0.08, + "grad_norm": 1.4942655542398324, + "learning_rate": 9.922362318182075e-06, + "loss": 0.534, + "step": 681 + }, + { + "epoch": 0.08, + "grad_norm": 1.6854065304986234, + "learning_rate": 9.922009136353077e-06, + "loss": 0.5436, + "step": 682 + }, + { + "epoch": 0.08, + "grad_norm": 1.6207158877626267, + "learning_rate": 9.921655159330154e-06, + "loss": 0.5232, + "step": 683 + }, + { + "epoch": 0.08, + "grad_norm": 1.9176833918479952, + "learning_rate": 9.921300387170494e-06, + "loss": 0.5185, + "step": 684 + }, + { + "epoch": 0.09, + "grad_norm": 2.222262958976972, + "learning_rate": 9.920944819931412e-06, + "loss": 0.5406, + "step": 685 + }, + { + "epoch": 0.09, + "grad_norm": 1.5443955907847353, + "learning_rate": 9.920588457670357e-06, + "loss": 0.5517, + "step": 686 + }, + { + "epoch": 0.09, + "grad_norm": 1.4342050400083974, + "learning_rate": 9.920231300444897e-06, + "loss": 0.5497, + "step": 687 + }, + { + "epoch": 0.09, + "grad_norm": 1.311067650265911, + "learning_rate": 9.91987334831274e-06, + "loss": 0.5226, + "step": 688 + }, + { + "epoch": 0.09, + "grad_norm": 1.471479637322531, + "learning_rate": 9.919514601331712e-06, + "loss": 0.5122, + "step": 689 + }, + { + "epoch": 0.09, + "grad_norm": 1.7926523501870986, + "learning_rate": 9.919155059559772e-06, + "loss": 0.55, + "step": 690 + }, + { + "epoch": 0.09, + "grad_norm": 1.5675007462680313, + "learning_rate": 9.918794723055009e-06, + "loss": 0.5821, + "step": 691 + }, + { + "epoch": 0.09, + "grad_norm": 1.4604240106900952, + "learning_rate": 9.918433591875637e-06, + "loss": 0.564, + "step": 692 + }, + { + "epoch": 0.09, + "grad_norm": 1.4861143775734027, + "learning_rate": 9.918071666080003e-06, + "loss": 0.5813, + "step": 693 + }, + { + "epoch": 0.09, + "grad_norm": 2.143330932505597, + "learning_rate": 9.917708945726574e-06, + "loss": 0.5196, + "step": 694 + }, + { + "epoch": 0.09, + "grad_norm": 3.44234958841204, + "learning_rate": 9.917345430873955e-06, + "loss": 0.5551, + "step": 695 + }, + { + "epoch": 0.09, + "grad_norm": 1.48817999565854, + "learning_rate": 9.916981121580873e-06, + "loss": 0.521, + "step": 696 + }, + { + "epoch": 0.09, + "grad_norm": 1.774381530446406, + "learning_rate": 9.916616017906185e-06, + "loss": 0.5086, + "step": 697 + }, + { + "epoch": 0.09, + "grad_norm": 0.6229915734516575, + "learning_rate": 9.91625011990888e-06, + "loss": 0.4771, + "step": 698 + }, + { + "epoch": 0.09, + "grad_norm": 2.448594905519429, + "learning_rate": 9.915883427648066e-06, + "loss": 0.5662, + "step": 699 + }, + { + "epoch": 0.09, + "grad_norm": 1.9591531503663824, + "learning_rate": 9.915515941182992e-06, + "loss": 0.4842, + "step": 700 + }, + { + "epoch": 0.09, + "grad_norm": 1.36950835978183, + "learning_rate": 9.915147660573024e-06, + "loss": 0.4944, + "step": 701 + }, + { + "epoch": 0.09, + "grad_norm": 1.8648299110331485, + "learning_rate": 9.914778585877665e-06, + "loss": 0.5539, + "step": 702 + }, + { + "epoch": 0.09, + "grad_norm": 1.7320586877113233, + "learning_rate": 9.914408717156538e-06, + "loss": 0.5153, + "step": 703 + }, + { + "epoch": 0.09, + "grad_norm": 1.4658542485684234, + "learning_rate": 9.9140380544694e-06, + "loss": 0.572, + "step": 704 + }, + { + "epoch": 0.09, + "grad_norm": 1.4362278910057131, + "learning_rate": 9.913666597876137e-06, + "loss": 0.5114, + "step": 705 + }, + { + "epoch": 0.09, + "grad_norm": 2.0480313933332335, + "learning_rate": 9.913294347436758e-06, + "loss": 0.4573, + "step": 706 + }, + { + "epoch": 0.09, + "grad_norm": 1.3863115172008518, + "learning_rate": 9.912921303211405e-06, + "loss": 0.5416, + "step": 707 + }, + { + "epoch": 0.09, + "grad_norm": 1.756096947460397, + "learning_rate": 9.912547465260347e-06, + "loss": 0.4997, + "step": 708 + }, + { + "epoch": 0.09, + "grad_norm": 1.681996815479112, + "learning_rate": 9.91217283364398e-06, + "loss": 0.5189, + "step": 709 + }, + { + "epoch": 0.09, + "grad_norm": 1.8491451039304299, + "learning_rate": 9.91179740842283e-06, + "loss": 0.5712, + "step": 710 + }, + { + "epoch": 0.09, + "grad_norm": 0.6846675664657432, + "learning_rate": 9.911421189657548e-06, + "loss": 0.52, + "step": 711 + }, + { + "epoch": 0.09, + "grad_norm": 1.6256288323393315, + "learning_rate": 9.911044177408917e-06, + "loss": 0.5461, + "step": 712 + }, + { + "epoch": 0.09, + "grad_norm": 1.4504352891549575, + "learning_rate": 9.910666371737848e-06, + "loss": 0.5386, + "step": 713 + }, + { + "epoch": 0.09, + "grad_norm": 0.6943756408365623, + "learning_rate": 9.910287772705374e-06, + "loss": 0.51, + "step": 714 + }, + { + "epoch": 0.09, + "grad_norm": 1.6456825439428284, + "learning_rate": 9.909908380372667e-06, + "loss": 0.5576, + "step": 715 + }, + { + "epoch": 0.09, + "grad_norm": 1.3685240161884205, + "learning_rate": 9.909528194801019e-06, + "loss": 0.5561, + "step": 716 + }, + { + "epoch": 0.09, + "grad_norm": 2.6513433815250442, + "learning_rate": 9.90914721605185e-06, + "loss": 0.4893, + "step": 717 + }, + { + "epoch": 0.09, + "grad_norm": 1.2340929468815107, + "learning_rate": 9.908765444186713e-06, + "loss": 0.5007, + "step": 718 + }, + { + "epoch": 0.09, + "grad_norm": 1.3786299592454312, + "learning_rate": 9.908382879267286e-06, + "loss": 0.5752, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 1.4800630744877326, + "learning_rate": 9.907999521355377e-06, + "loss": 0.5015, + "step": 720 + }, + { + "epoch": 0.09, + "grad_norm": 2.2232871580378, + "learning_rate": 9.907615370512919e-06, + "loss": 0.497, + "step": 721 + }, + { + "epoch": 0.09, + "grad_norm": 1.2928878365401588, + "learning_rate": 9.907230426801975e-06, + "loss": 0.5447, + "step": 722 + }, + { + "epoch": 0.09, + "grad_norm": 1.4967207824812627, + "learning_rate": 9.906844690284737e-06, + "loss": 0.5291, + "step": 723 + }, + { + "epoch": 0.09, + "grad_norm": 1.816746657284892, + "learning_rate": 9.906458161023522e-06, + "loss": 0.5342, + "step": 724 + }, + { + "epoch": 0.09, + "grad_norm": 2.0861253042637484, + "learning_rate": 9.90607083908078e-06, + "loss": 0.5903, + "step": 725 + }, + { + "epoch": 0.09, + "grad_norm": 1.5926144364257493, + "learning_rate": 9.905682724519086e-06, + "loss": 0.5445, + "step": 726 + }, + { + "epoch": 0.09, + "grad_norm": 1.3870599530402037, + "learning_rate": 9.90529381740114e-06, + "loss": 0.5318, + "step": 727 + }, + { + "epoch": 0.09, + "grad_norm": 1.6319587413630097, + "learning_rate": 9.904904117789777e-06, + "loss": 0.5497, + "step": 728 + }, + { + "epoch": 0.09, + "grad_norm": 1.9513204246105857, + "learning_rate": 9.904513625747957e-06, + "loss": 0.5404, + "step": 729 + }, + { + "epoch": 0.09, + "grad_norm": 1.6492222551928712, + "learning_rate": 9.904122341338764e-06, + "loss": 0.5099, + "step": 730 + }, + { + "epoch": 0.09, + "grad_norm": 1.7330336994895605, + "learning_rate": 9.903730264625415e-06, + "loss": 0.5237, + "step": 731 + }, + { + "epoch": 0.09, + "grad_norm": 1.335780098202449, + "learning_rate": 9.903337395671255e-06, + "loss": 0.5068, + "step": 732 + }, + { + "epoch": 0.09, + "grad_norm": 0.6746985228306138, + "learning_rate": 9.902943734539752e-06, + "loss": 0.479, + "step": 733 + }, + { + "epoch": 0.09, + "grad_norm": 1.5679095009354174, + "learning_rate": 9.902549281294508e-06, + "loss": 0.5457, + "step": 734 + }, + { + "epoch": 0.09, + "grad_norm": 1.5978083443392859, + "learning_rate": 9.90215403599925e-06, + "loss": 0.5094, + "step": 735 + }, + { + "epoch": 0.09, + "grad_norm": 1.3502793900824692, + "learning_rate": 9.901757998717833e-06, + "loss": 0.5287, + "step": 736 + }, + { + "epoch": 0.09, + "grad_norm": 1.3767717373956716, + "learning_rate": 9.90136116951424e-06, + "loss": 0.5638, + "step": 737 + }, + { + "epoch": 0.09, + "grad_norm": 1.4100309801230537, + "learning_rate": 9.900963548452584e-06, + "loss": 0.5088, + "step": 738 + }, + { + "epoch": 0.09, + "grad_norm": 1.6723873034933143, + "learning_rate": 9.900565135597104e-06, + "loss": 0.5254, + "step": 739 + }, + { + "epoch": 0.09, + "grad_norm": 1.4968579496803647, + "learning_rate": 9.900165931012164e-06, + "loss": 0.5401, + "step": 740 + }, + { + "epoch": 0.09, + "grad_norm": 1.512318360157006, + "learning_rate": 9.899765934762263e-06, + "loss": 0.4433, + "step": 741 + }, + { + "epoch": 0.09, + "grad_norm": 2.5025123815418007, + "learning_rate": 9.899365146912018e-06, + "loss": 0.511, + "step": 742 + }, + { + "epoch": 0.09, + "grad_norm": 1.6683442789870047, + "learning_rate": 9.898963567526188e-06, + "loss": 0.5479, + "step": 743 + }, + { + "epoch": 0.09, + "grad_norm": 1.6151733284076952, + "learning_rate": 9.898561196669645e-06, + "loss": 0.5266, + "step": 744 + }, + { + "epoch": 0.09, + "grad_norm": 1.4827781091080732, + "learning_rate": 9.8981580344074e-06, + "loss": 0.5435, + "step": 745 + }, + { + "epoch": 0.09, + "grad_norm": 1.3361012140606874, + "learning_rate": 9.897754080804583e-06, + "loss": 0.5011, + "step": 746 + }, + { + "epoch": 0.09, + "grad_norm": 1.570149978353058, + "learning_rate": 9.89734933592646e-06, + "loss": 0.5391, + "step": 747 + }, + { + "epoch": 0.09, + "grad_norm": 1.4440488628055852, + "learning_rate": 9.896943799838419e-06, + "loss": 0.563, + "step": 748 + }, + { + "epoch": 0.09, + "grad_norm": 1.4722485313647773, + "learning_rate": 9.896537472605979e-06, + "loss": 0.5872, + "step": 749 + }, + { + "epoch": 0.09, + "grad_norm": 1.6458885186334251, + "learning_rate": 9.896130354294787e-06, + "loss": 0.4825, + "step": 750 + }, + { + "epoch": 0.09, + "grad_norm": 2.0870975798160076, + "learning_rate": 9.895722444970613e-06, + "loss": 0.5378, + "step": 751 + }, + { + "epoch": 0.09, + "grad_norm": 0.6713600482428013, + "learning_rate": 9.89531374469936e-06, + "loss": 0.5243, + "step": 752 + }, + { + "epoch": 0.09, + "grad_norm": 1.3666798769307291, + "learning_rate": 9.89490425354706e-06, + "loss": 0.537, + "step": 753 + }, + { + "epoch": 0.09, + "grad_norm": 1.6965904249779475, + "learning_rate": 9.894493971579864e-06, + "loss": 0.5512, + "step": 754 + }, + { + "epoch": 0.09, + "grad_norm": 1.3900037721001617, + "learning_rate": 9.894082898864062e-06, + "loss": 0.5467, + "step": 755 + }, + { + "epoch": 0.09, + "grad_norm": 1.524197186398035, + "learning_rate": 9.893671035466065e-06, + "loss": 0.4735, + "step": 756 + }, + { + "epoch": 0.09, + "grad_norm": 0.7339854586343567, + "learning_rate": 9.893258381452412e-06, + "loss": 0.5089, + "step": 757 + }, + { + "epoch": 0.09, + "grad_norm": 1.3923524219523842, + "learning_rate": 9.892844936889772e-06, + "loss": 0.4909, + "step": 758 + }, + { + "epoch": 0.09, + "grad_norm": 1.5124594114302838, + "learning_rate": 9.89243070184494e-06, + "loss": 0.5394, + "step": 759 + }, + { + "epoch": 0.09, + "grad_norm": 1.8034130536722808, + "learning_rate": 9.892015676384838e-06, + "loss": 0.512, + "step": 760 + }, + { + "epoch": 0.09, + "grad_norm": 1.5762898788709665, + "learning_rate": 9.891599860576519e-06, + "loss": 0.5248, + "step": 761 + }, + { + "epoch": 0.09, + "grad_norm": 1.4714297837827943, + "learning_rate": 9.891183254487162e-06, + "loss": 0.5465, + "step": 762 + }, + { + "epoch": 0.09, + "grad_norm": 1.3214866314049687, + "learning_rate": 9.890765858184073e-06, + "loss": 0.516, + "step": 763 + }, + { + "epoch": 0.09, + "grad_norm": 1.9633591420432888, + "learning_rate": 9.890347671734684e-06, + "loss": 0.5366, + "step": 764 + }, + { + "epoch": 0.09, + "grad_norm": 1.3580228817655544, + "learning_rate": 9.889928695206559e-06, + "loss": 0.5315, + "step": 765 + }, + { + "epoch": 0.1, + "grad_norm": 1.5198904764841936, + "learning_rate": 9.889508928667387e-06, + "loss": 0.5439, + "step": 766 + }, + { + "epoch": 0.1, + "grad_norm": 2.239507833632809, + "learning_rate": 9.889088372184983e-06, + "loss": 0.549, + "step": 767 + }, + { + "epoch": 0.1, + "grad_norm": 2.0090601913369452, + "learning_rate": 9.888667025827295e-06, + "loss": 0.5388, + "step": 768 + }, + { + "epoch": 0.1, + "grad_norm": 1.7600992496202628, + "learning_rate": 9.888244889662394e-06, + "loss": 0.5286, + "step": 769 + }, + { + "epoch": 0.1, + "grad_norm": 1.363528192366846, + "learning_rate": 9.887821963758478e-06, + "loss": 0.5206, + "step": 770 + }, + { + "epoch": 0.1, + "grad_norm": 1.3125021258357175, + "learning_rate": 9.887398248183876e-06, + "loss": 0.5342, + "step": 771 + }, + { + "epoch": 0.1, + "grad_norm": 2.1185739451091274, + "learning_rate": 9.886973743007042e-06, + "loss": 0.559, + "step": 772 + }, + { + "epoch": 0.1, + "grad_norm": 1.3878073704129925, + "learning_rate": 9.88654844829656e-06, + "loss": 0.5069, + "step": 773 + }, + { + "epoch": 0.1, + "grad_norm": 1.603486854404389, + "learning_rate": 9.88612236412114e-06, + "loss": 0.5484, + "step": 774 + }, + { + "epoch": 0.1, + "grad_norm": 1.5023961328735826, + "learning_rate": 9.885695490549616e-06, + "loss": 0.5537, + "step": 775 + }, + { + "epoch": 0.1, + "grad_norm": 1.345851732601472, + "learning_rate": 9.88526782765096e-06, + "loss": 0.5059, + "step": 776 + }, + { + "epoch": 0.1, + "grad_norm": 1.3519885963470128, + "learning_rate": 9.884839375494258e-06, + "loss": 0.4886, + "step": 777 + }, + { + "epoch": 0.1, + "grad_norm": 1.48289100247411, + "learning_rate": 9.884410134148733e-06, + "loss": 0.5144, + "step": 778 + }, + { + "epoch": 0.1, + "grad_norm": 1.4008779870643109, + "learning_rate": 9.883980103683735e-06, + "loss": 0.5803, + "step": 779 + }, + { + "epoch": 0.1, + "grad_norm": 1.4562131509620257, + "learning_rate": 9.883549284168736e-06, + "loss": 0.5574, + "step": 780 + }, + { + "epoch": 0.1, + "grad_norm": 1.4444072614610364, + "learning_rate": 9.883117675673339e-06, + "loss": 0.4858, + "step": 781 + }, + { + "epoch": 0.1, + "grad_norm": 1.5080506086390568, + "learning_rate": 9.882685278267277e-06, + "loss": 0.5444, + "step": 782 + }, + { + "epoch": 0.1, + "grad_norm": 1.7045840262615868, + "learning_rate": 9.882252092020403e-06, + "loss": 0.5505, + "step": 783 + }, + { + "epoch": 0.1, + "grad_norm": 1.7370813343274576, + "learning_rate": 9.881818117002706e-06, + "loss": 0.5415, + "step": 784 + }, + { + "epoch": 0.1, + "grad_norm": 2.018153134156803, + "learning_rate": 9.881383353284295e-06, + "loss": 0.4996, + "step": 785 + }, + { + "epoch": 0.1, + "grad_norm": 1.3253195899746142, + "learning_rate": 9.880947800935414e-06, + "loss": 0.5106, + "step": 786 + }, + { + "epoch": 0.1, + "grad_norm": 1.7024567402620956, + "learning_rate": 9.880511460026428e-06, + "loss": 0.5396, + "step": 787 + }, + { + "epoch": 0.1, + "grad_norm": 1.4091988414010796, + "learning_rate": 9.88007433062783e-06, + "loss": 0.555, + "step": 788 + }, + { + "epoch": 0.1, + "grad_norm": 0.7087771391854086, + "learning_rate": 9.879636412810244e-06, + "loss": 0.4847, + "step": 789 + }, + { + "epoch": 0.1, + "grad_norm": 2.7231897900894366, + "learning_rate": 9.879197706644423e-06, + "loss": 0.5219, + "step": 790 + }, + { + "epoch": 0.1, + "grad_norm": 1.7027320981466387, + "learning_rate": 9.878758212201238e-06, + "loss": 0.5014, + "step": 791 + }, + { + "epoch": 0.1, + "grad_norm": 1.3482912883508638, + "learning_rate": 9.878317929551695e-06, + "loss": 0.4895, + "step": 792 + }, + { + "epoch": 0.1, + "grad_norm": 1.3152332995824816, + "learning_rate": 9.877876858766927e-06, + "loss": 0.5465, + "step": 793 + }, + { + "epoch": 0.1, + "grad_norm": 1.4997609418658844, + "learning_rate": 9.877434999918192e-06, + "loss": 0.5262, + "step": 794 + }, + { + "epoch": 0.1, + "grad_norm": 0.6539265342268819, + "learning_rate": 9.876992353076875e-06, + "loss": 0.4821, + "step": 795 + }, + { + "epoch": 0.1, + "grad_norm": 1.6249056812162253, + "learning_rate": 9.876548918314492e-06, + "loss": 0.534, + "step": 796 + }, + { + "epoch": 0.1, + "grad_norm": 1.5004396199830836, + "learning_rate": 9.876104695702682e-06, + "loss": 0.5699, + "step": 797 + }, + { + "epoch": 0.1, + "grad_norm": 1.2826945141311428, + "learning_rate": 9.875659685313214e-06, + "loss": 0.4789, + "step": 798 + }, + { + "epoch": 0.1, + "grad_norm": 1.5469724214523863, + "learning_rate": 9.875213887217983e-06, + "loss": 0.5576, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 1.5415041213137937, + "learning_rate": 9.87476730148901e-06, + "loss": 0.5988, + "step": 800 + }, + { + "epoch": 0.1, + "grad_norm": 0.7021497983380178, + "learning_rate": 9.874319928198448e-06, + "loss": 0.5403, + "step": 801 + }, + { + "epoch": 0.1, + "grad_norm": 1.3549275899238842, + "learning_rate": 9.873871767418572e-06, + "loss": 0.455, + "step": 802 + }, + { + "epoch": 0.1, + "grad_norm": 1.2532327534233543, + "learning_rate": 9.873422819221785e-06, + "loss": 0.5064, + "step": 803 + }, + { + "epoch": 0.1, + "grad_norm": 1.7360535633059955, + "learning_rate": 9.872973083680622e-06, + "loss": 0.5882, + "step": 804 + }, + { + "epoch": 0.1, + "grad_norm": 1.4059071965483354, + "learning_rate": 9.87252256086774e-06, + "loss": 0.5658, + "step": 805 + }, + { + "epoch": 0.1, + "grad_norm": 2.530355778895807, + "learning_rate": 9.872071250855925e-06, + "loss": 0.5147, + "step": 806 + }, + { + "epoch": 0.1, + "grad_norm": 1.3847353474560062, + "learning_rate": 9.87161915371809e-06, + "loss": 0.5407, + "step": 807 + }, + { + "epoch": 0.1, + "grad_norm": 1.2893581665896208, + "learning_rate": 9.871166269527275e-06, + "loss": 0.5172, + "step": 808 + }, + { + "epoch": 0.1, + "grad_norm": 1.4749327657449687, + "learning_rate": 9.870712598356647e-06, + "loss": 0.5339, + "step": 809 + }, + { + "epoch": 0.1, + "grad_norm": 1.4358631901359513, + "learning_rate": 9.870258140279503e-06, + "loss": 0.5119, + "step": 810 + }, + { + "epoch": 0.1, + "grad_norm": 2.1722531501187503, + "learning_rate": 9.869802895369262e-06, + "loss": 0.5623, + "step": 811 + }, + { + "epoch": 0.1, + "grad_norm": 1.3154410118141262, + "learning_rate": 9.869346863699474e-06, + "loss": 0.5222, + "step": 812 + }, + { + "epoch": 0.1, + "grad_norm": 1.5619568170354916, + "learning_rate": 9.868890045343814e-06, + "loss": 0.5189, + "step": 813 + }, + { + "epoch": 0.1, + "grad_norm": 1.5104678506036293, + "learning_rate": 9.868432440376086e-06, + "loss": 0.5747, + "step": 814 + }, + { + "epoch": 0.1, + "grad_norm": 1.3470148837454545, + "learning_rate": 9.867974048870221e-06, + "loss": 0.5123, + "step": 815 + }, + { + "epoch": 0.1, + "grad_norm": 1.3544857302522781, + "learning_rate": 9.867514870900274e-06, + "loss": 0.5256, + "step": 816 + }, + { + "epoch": 0.1, + "grad_norm": 2.1417064240265287, + "learning_rate": 9.867054906540432e-06, + "loss": 0.5228, + "step": 817 + }, + { + "epoch": 0.1, + "grad_norm": 2.069195455278062, + "learning_rate": 9.866594155865004e-06, + "loss": 0.5605, + "step": 818 + }, + { + "epoch": 0.1, + "grad_norm": 0.6880607864614212, + "learning_rate": 9.866132618948428e-06, + "loss": 0.49, + "step": 819 + }, + { + "epoch": 0.1, + "grad_norm": 1.9067634496940928, + "learning_rate": 9.86567029586527e-06, + "loss": 0.5758, + "step": 820 + }, + { + "epoch": 0.1, + "grad_norm": 1.3809878109560794, + "learning_rate": 9.865207186690225e-06, + "loss": 0.4819, + "step": 821 + }, + { + "epoch": 0.1, + "grad_norm": 1.4377570369203871, + "learning_rate": 9.86474329149811e-06, + "loss": 0.5409, + "step": 822 + }, + { + "epoch": 0.1, + "grad_norm": 1.2837566075142783, + "learning_rate": 9.864278610363869e-06, + "loss": 0.5082, + "step": 823 + }, + { + "epoch": 0.1, + "grad_norm": 1.376197397673897, + "learning_rate": 9.863813143362579e-06, + "loss": 0.468, + "step": 824 + }, + { + "epoch": 0.1, + "grad_norm": 3.5091655795057206, + "learning_rate": 9.863346890569438e-06, + "loss": 0.5511, + "step": 825 + }, + { + "epoch": 0.1, + "grad_norm": 1.2654935539583412, + "learning_rate": 9.862879852059775e-06, + "loss": 0.5588, + "step": 826 + }, + { + "epoch": 0.1, + "grad_norm": 1.480429796301672, + "learning_rate": 9.862412027909045e-06, + "loss": 0.5077, + "step": 827 + }, + { + "epoch": 0.1, + "grad_norm": 1.4231689337630336, + "learning_rate": 9.861943418192825e-06, + "loss": 0.5572, + "step": 828 + }, + { + "epoch": 0.1, + "grad_norm": 1.6537058377816523, + "learning_rate": 9.861474022986828e-06, + "loss": 0.5611, + "step": 829 + }, + { + "epoch": 0.1, + "grad_norm": 1.4257922752948444, + "learning_rate": 9.861003842366886e-06, + "loss": 0.5383, + "step": 830 + }, + { + "epoch": 0.1, + "grad_norm": 1.5238556011282056, + "learning_rate": 9.86053287640896e-06, + "loss": 0.5396, + "step": 831 + }, + { + "epoch": 0.1, + "grad_norm": 1.5593477658026424, + "learning_rate": 9.860061125189143e-06, + "loss": 0.5556, + "step": 832 + }, + { + "epoch": 0.1, + "grad_norm": 1.6877101213463102, + "learning_rate": 9.859588588783644e-06, + "loss": 0.5304, + "step": 833 + }, + { + "epoch": 0.1, + "grad_norm": 1.52836473336756, + "learning_rate": 9.85911526726881e-06, + "loss": 0.5365, + "step": 834 + }, + { + "epoch": 0.1, + "grad_norm": 1.4292053696202944, + "learning_rate": 9.85864116072111e-06, + "loss": 0.524, + "step": 835 + }, + { + "epoch": 0.1, + "grad_norm": 1.3567394691546826, + "learning_rate": 9.85816626921714e-06, + "loss": 0.5385, + "step": 836 + }, + { + "epoch": 0.1, + "grad_norm": 1.813389252993085, + "learning_rate": 9.857690592833621e-06, + "loss": 0.5389, + "step": 837 + }, + { + "epoch": 0.1, + "grad_norm": 1.6373671449371334, + "learning_rate": 9.857214131647405e-06, + "loss": 0.517, + "step": 838 + }, + { + "epoch": 0.1, + "grad_norm": 1.550185315855915, + "learning_rate": 9.856736885735467e-06, + "loss": 0.5129, + "step": 839 + }, + { + "epoch": 0.1, + "grad_norm": 1.2396005024409704, + "learning_rate": 9.856258855174912e-06, + "loss": 0.5535, + "step": 840 + }, + { + "epoch": 0.1, + "grad_norm": 1.4116583979770747, + "learning_rate": 9.855780040042966e-06, + "loss": 0.4832, + "step": 841 + }, + { + "epoch": 0.1, + "grad_norm": 1.6187709239417065, + "learning_rate": 9.85530044041699e-06, + "loss": 0.5649, + "step": 842 + }, + { + "epoch": 0.1, + "grad_norm": 1.3743636030552622, + "learning_rate": 9.854820056374468e-06, + "loss": 0.548, + "step": 843 + }, + { + "epoch": 0.1, + "grad_norm": 1.4003641447914825, + "learning_rate": 9.854338887993006e-06, + "loss": 0.5033, + "step": 844 + }, + { + "epoch": 0.1, + "grad_norm": 1.6185345234958257, + "learning_rate": 9.853856935350345e-06, + "loss": 0.593, + "step": 845 + }, + { + "epoch": 0.1, + "grad_norm": 1.8383993635732563, + "learning_rate": 9.853374198524347e-06, + "loss": 0.5019, + "step": 846 + }, + { + "epoch": 0.11, + "grad_norm": 1.4429440730187004, + "learning_rate": 9.852890677593003e-06, + "loss": 0.553, + "step": 847 + }, + { + "epoch": 0.11, + "grad_norm": 1.502777317388333, + "learning_rate": 9.852406372634427e-06, + "loss": 0.5163, + "step": 848 + }, + { + "epoch": 0.11, + "grad_norm": 1.7042353791138922, + "learning_rate": 9.85192128372687e-06, + "loss": 0.5671, + "step": 849 + }, + { + "epoch": 0.11, + "grad_norm": 1.5670146983134932, + "learning_rate": 9.851435410948693e-06, + "loss": 0.5254, + "step": 850 + }, + { + "epoch": 0.11, + "grad_norm": 1.6943773775700972, + "learning_rate": 9.8509487543784e-06, + "loss": 0.5858, + "step": 851 + }, + { + "epoch": 0.11, + "grad_norm": 1.7259489878374763, + "learning_rate": 9.850461314094611e-06, + "loss": 0.5266, + "step": 852 + }, + { + "epoch": 0.11, + "grad_norm": 1.4839090775806405, + "learning_rate": 9.84997309017608e-06, + "loss": 0.4945, + "step": 853 + }, + { + "epoch": 0.11, + "grad_norm": 1.6430132038958682, + "learning_rate": 9.849484082701679e-06, + "loss": 0.5514, + "step": 854 + }, + { + "epoch": 0.11, + "grad_norm": 1.330689307629748, + "learning_rate": 9.848994291750416e-06, + "loss": 0.5068, + "step": 855 + }, + { + "epoch": 0.11, + "grad_norm": 1.4367448150793294, + "learning_rate": 9.848503717401416e-06, + "loss": 0.4849, + "step": 856 + }, + { + "epoch": 0.11, + "grad_norm": 1.5435415775983266, + "learning_rate": 9.848012359733941e-06, + "loss": 0.5299, + "step": 857 + }, + { + "epoch": 0.11, + "grad_norm": 2.0779784325734822, + "learning_rate": 9.847520218827372e-06, + "loss": 0.5144, + "step": 858 + }, + { + "epoch": 0.11, + "grad_norm": 0.7886067252096047, + "learning_rate": 9.847027294761218e-06, + "loss": 0.5121, + "step": 859 + }, + { + "epoch": 0.11, + "grad_norm": 0.6874506009278324, + "learning_rate": 9.846533587615117e-06, + "loss": 0.5396, + "step": 860 + }, + { + "epoch": 0.11, + "grad_norm": 1.6245535898960135, + "learning_rate": 9.846039097468829e-06, + "loss": 0.5138, + "step": 861 + }, + { + "epoch": 0.11, + "grad_norm": 1.5610689448816104, + "learning_rate": 9.845543824402245e-06, + "loss": 0.5456, + "step": 862 + }, + { + "epoch": 0.11, + "grad_norm": 3.867756358837439, + "learning_rate": 9.84504776849538e-06, + "loss": 0.5296, + "step": 863 + }, + { + "epoch": 0.11, + "grad_norm": 0.6572342482634592, + "learning_rate": 9.844550929828375e-06, + "loss": 0.4763, + "step": 864 + }, + { + "epoch": 0.11, + "grad_norm": 1.5801874916299146, + "learning_rate": 9.844053308481504e-06, + "loss": 0.5699, + "step": 865 + }, + { + "epoch": 0.11, + "grad_norm": 1.4583504122348614, + "learning_rate": 9.843554904535157e-06, + "loss": 0.5041, + "step": 866 + }, + { + "epoch": 0.11, + "grad_norm": 1.611938008913365, + "learning_rate": 9.843055718069856e-06, + "loss": 0.5419, + "step": 867 + }, + { + "epoch": 0.11, + "grad_norm": 1.43075395951784, + "learning_rate": 9.842555749166252e-06, + "loss": 0.5617, + "step": 868 + }, + { + "epoch": 0.11, + "grad_norm": 5.380501740350102, + "learning_rate": 9.842054997905115e-06, + "loss": 0.4923, + "step": 869 + }, + { + "epoch": 0.11, + "grad_norm": 1.4650493645359763, + "learning_rate": 9.841553464367349e-06, + "loss": 0.5058, + "step": 870 + }, + { + "epoch": 0.11, + "grad_norm": 2.314374886920701, + "learning_rate": 9.84105114863398e-06, + "loss": 0.5574, + "step": 871 + }, + { + "epoch": 0.11, + "grad_norm": 1.335405641847203, + "learning_rate": 9.840548050786162e-06, + "loss": 0.5267, + "step": 872 + }, + { + "epoch": 0.11, + "grad_norm": 2.882923799877262, + "learning_rate": 9.840044170905175e-06, + "loss": 0.5018, + "step": 873 + }, + { + "epoch": 0.11, + "grad_norm": 1.606183086211936, + "learning_rate": 9.839539509072425e-06, + "loss": 0.5532, + "step": 874 + }, + { + "epoch": 0.11, + "grad_norm": 1.4106290303040485, + "learning_rate": 9.839034065369444e-06, + "loss": 0.4926, + "step": 875 + }, + { + "epoch": 0.11, + "grad_norm": 1.8398370549040346, + "learning_rate": 9.83852783987789e-06, + "loss": 0.5319, + "step": 876 + }, + { + "epoch": 0.11, + "grad_norm": 1.8145118768900863, + "learning_rate": 9.838020832679552e-06, + "loss": 0.5236, + "step": 877 + }, + { + "epoch": 0.11, + "grad_norm": 1.1562995225815056, + "learning_rate": 9.837513043856338e-06, + "loss": 0.513, + "step": 878 + }, + { + "epoch": 0.11, + "grad_norm": 1.5446941041707583, + "learning_rate": 9.837004473490286e-06, + "loss": 0.5432, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 1.7330545831926203, + "learning_rate": 9.83649512166356e-06, + "loss": 0.5111, + "step": 880 + }, + { + "epoch": 0.11, + "grad_norm": 1.822826636212995, + "learning_rate": 9.835984988458454e-06, + "loss": 0.5, + "step": 881 + }, + { + "epoch": 0.11, + "grad_norm": 1.483699967216602, + "learning_rate": 9.835474073957379e-06, + "loss": 0.5181, + "step": 882 + }, + { + "epoch": 0.11, + "grad_norm": 1.8228914204587476, + "learning_rate": 9.834962378242882e-06, + "loss": 0.5745, + "step": 883 + }, + { + "epoch": 0.11, + "grad_norm": 1.7397105597518532, + "learning_rate": 9.834449901397628e-06, + "loss": 0.5306, + "step": 884 + }, + { + "epoch": 0.11, + "grad_norm": 1.3840024922502834, + "learning_rate": 9.833936643504417e-06, + "loss": 0.4915, + "step": 885 + }, + { + "epoch": 0.11, + "grad_norm": 1.6649130295487007, + "learning_rate": 9.833422604646165e-06, + "loss": 0.5362, + "step": 886 + }, + { + "epoch": 0.11, + "grad_norm": 1.7078837515289695, + "learning_rate": 9.832907784905924e-06, + "loss": 0.4886, + "step": 887 + }, + { + "epoch": 0.11, + "grad_norm": 1.7511691661944424, + "learning_rate": 9.832392184366867e-06, + "loss": 0.4758, + "step": 888 + }, + { + "epoch": 0.11, + "grad_norm": 1.3751504432892512, + "learning_rate": 9.831875803112291e-06, + "loss": 0.5, + "step": 889 + }, + { + "epoch": 0.11, + "grad_norm": 1.4513862307154417, + "learning_rate": 9.831358641225624e-06, + "loss": 0.5377, + "step": 890 + }, + { + "epoch": 0.11, + "grad_norm": 2.5931908846008063, + "learning_rate": 9.830840698790418e-06, + "loss": 0.5329, + "step": 891 + }, + { + "epoch": 0.11, + "grad_norm": 1.3425686954116223, + "learning_rate": 9.83032197589035e-06, + "loss": 0.4917, + "step": 892 + }, + { + "epoch": 0.11, + "grad_norm": 1.2865043128692821, + "learning_rate": 9.829802472609227e-06, + "loss": 0.4823, + "step": 893 + }, + { + "epoch": 0.11, + "grad_norm": 1.4653845961751846, + "learning_rate": 9.829282189030976e-06, + "loss": 0.5754, + "step": 894 + }, + { + "epoch": 0.11, + "grad_norm": 1.5380876117092384, + "learning_rate": 9.828761125239655e-06, + "loss": 0.5642, + "step": 895 + }, + { + "epoch": 0.11, + "grad_norm": 1.6425053205832336, + "learning_rate": 9.828239281319447e-06, + "loss": 0.5546, + "step": 896 + }, + { + "epoch": 0.11, + "grad_norm": 1.256609403421453, + "learning_rate": 9.827716657354658e-06, + "loss": 0.4601, + "step": 897 + }, + { + "epoch": 0.11, + "grad_norm": 6.390734805023612, + "learning_rate": 9.827193253429726e-06, + "loss": 0.5261, + "step": 898 + }, + { + "epoch": 0.11, + "grad_norm": 2.7799958313656696, + "learning_rate": 9.826669069629209e-06, + "loss": 0.5457, + "step": 899 + }, + { + "epoch": 0.11, + "grad_norm": 1.4733773601073992, + "learning_rate": 9.826144106037794e-06, + "loss": 0.5611, + "step": 900 + }, + { + "epoch": 0.11, + "grad_norm": 2.9129884749417165, + "learning_rate": 9.825618362740295e-06, + "loss": 0.5339, + "step": 901 + }, + { + "epoch": 0.11, + "grad_norm": 2.243230069846606, + "learning_rate": 9.825091839821648e-06, + "loss": 0.5816, + "step": 902 + }, + { + "epoch": 0.11, + "grad_norm": 2.0242936924161343, + "learning_rate": 9.82456453736692e-06, + "loss": 0.5167, + "step": 903 + }, + { + "epoch": 0.11, + "grad_norm": 1.434184689682166, + "learning_rate": 9.8240364554613e-06, + "loss": 0.4836, + "step": 904 + }, + { + "epoch": 0.11, + "grad_norm": 0.6775533855530116, + "learning_rate": 9.823507594190103e-06, + "loss": 0.4603, + "step": 905 + }, + { + "epoch": 0.11, + "grad_norm": 1.6827627319527105, + "learning_rate": 9.822977953638772e-06, + "loss": 0.5396, + "step": 906 + }, + { + "epoch": 0.11, + "grad_norm": 1.3676057637250456, + "learning_rate": 9.822447533892877e-06, + "loss": 0.5059, + "step": 907 + }, + { + "epoch": 0.11, + "grad_norm": 1.3258876354387583, + "learning_rate": 9.821916335038111e-06, + "loss": 0.5057, + "step": 908 + }, + { + "epoch": 0.11, + "grad_norm": 134.66857492128472, + "learning_rate": 9.821384357160292e-06, + "loss": 0.5451, + "step": 909 + }, + { + "epoch": 0.11, + "grad_norm": 1.2585093599662378, + "learning_rate": 9.820851600345368e-06, + "loss": 0.5558, + "step": 910 + }, + { + "epoch": 0.11, + "grad_norm": 1.5964531920502647, + "learning_rate": 9.820318064679408e-06, + "loss": 0.505, + "step": 911 + }, + { + "epoch": 0.11, + "grad_norm": 1.969482196985344, + "learning_rate": 9.819783750248612e-06, + "loss": 0.5582, + "step": 912 + }, + { + "epoch": 0.11, + "grad_norm": 1.5323419980885993, + "learning_rate": 9.819248657139304e-06, + "loss": 0.5278, + "step": 913 + }, + { + "epoch": 0.11, + "grad_norm": 7.449348072753027, + "learning_rate": 9.818712785437929e-06, + "loss": 0.5646, + "step": 914 + }, + { + "epoch": 0.11, + "grad_norm": 5.113064908635709, + "learning_rate": 9.818176135231066e-06, + "loss": 0.5043, + "step": 915 + }, + { + "epoch": 0.11, + "grad_norm": 1.46539588571356, + "learning_rate": 9.817638706605413e-06, + "loss": 0.5927, + "step": 916 + }, + { + "epoch": 0.11, + "grad_norm": 1.9591832216721343, + "learning_rate": 9.817100499647796e-06, + "loss": 0.4954, + "step": 917 + }, + { + "epoch": 0.11, + "grad_norm": 1.8904706076981188, + "learning_rate": 9.81656151444517e-06, + "loss": 0.5074, + "step": 918 + }, + { + "epoch": 0.11, + "grad_norm": 1.603397229383552, + "learning_rate": 9.81602175108461e-06, + "loss": 0.5213, + "step": 919 + }, + { + "epoch": 0.11, + "grad_norm": 1.4407858273805725, + "learning_rate": 9.815481209653318e-06, + "loss": 0.5073, + "step": 920 + }, + { + "epoch": 0.11, + "grad_norm": 1.6780133192986095, + "learning_rate": 9.814939890238629e-06, + "loss": 0.5749, + "step": 921 + }, + { + "epoch": 0.11, + "grad_norm": 1.3890417822540029, + "learning_rate": 9.814397792927993e-06, + "loss": 0.5103, + "step": 922 + }, + { + "epoch": 0.11, + "grad_norm": 1.4123428104356073, + "learning_rate": 9.813854917808993e-06, + "loss": 0.511, + "step": 923 + }, + { + "epoch": 0.11, + "grad_norm": 2.3239410306747974, + "learning_rate": 9.813311264969333e-06, + "loss": 0.5895, + "step": 924 + }, + { + "epoch": 0.11, + "grad_norm": 1.6481482416750444, + "learning_rate": 9.812766834496847e-06, + "loss": 0.5619, + "step": 925 + }, + { + "epoch": 0.11, + "grad_norm": 1.5682167709528458, + "learning_rate": 9.812221626479493e-06, + "loss": 0.5106, + "step": 926 + }, + { + "epoch": 0.12, + "grad_norm": 2.108483178988802, + "learning_rate": 9.811675641005352e-06, + "loss": 0.4574, + "step": 927 + }, + { + "epoch": 0.12, + "grad_norm": 1.272404284404087, + "learning_rate": 9.811128878162633e-06, + "loss": 0.5017, + "step": 928 + }, + { + "epoch": 0.12, + "grad_norm": 2.229753751249011, + "learning_rate": 9.81058133803967e-06, + "loss": 0.4652, + "step": 929 + }, + { + "epoch": 0.12, + "grad_norm": 3.8807442456609524, + "learning_rate": 9.810033020724927e-06, + "loss": 0.5045, + "step": 930 + }, + { + "epoch": 0.12, + "grad_norm": 1.5325223466990254, + "learning_rate": 9.809483926306984e-06, + "loss": 0.5153, + "step": 931 + }, + { + "epoch": 0.12, + "grad_norm": 1.293134868842404, + "learning_rate": 9.808934054874557e-06, + "loss": 0.4899, + "step": 932 + }, + { + "epoch": 0.12, + "grad_norm": 1.398989709606525, + "learning_rate": 9.808383406516478e-06, + "loss": 0.5187, + "step": 933 + }, + { + "epoch": 0.12, + "grad_norm": 2.071140988213683, + "learning_rate": 9.807831981321711e-06, + "loss": 0.5403, + "step": 934 + }, + { + "epoch": 0.12, + "grad_norm": 0.6607542531288317, + "learning_rate": 9.807279779379346e-06, + "loss": 0.4914, + "step": 935 + }, + { + "epoch": 0.12, + "grad_norm": 1.3850092138390073, + "learning_rate": 9.806726800778592e-06, + "loss": 0.4681, + "step": 936 + }, + { + "epoch": 0.12, + "grad_norm": 1.5181764228685495, + "learning_rate": 9.806173045608791e-06, + "loss": 0.4813, + "step": 937 + }, + { + "epoch": 0.12, + "grad_norm": 1.71479008444105, + "learning_rate": 9.805618513959405e-06, + "loss": 0.5298, + "step": 938 + }, + { + "epoch": 0.12, + "grad_norm": 1.7241969898445435, + "learning_rate": 9.805063205920024e-06, + "loss": 0.5251, + "step": 939 + }, + { + "epoch": 0.12, + "grad_norm": 1.4856505135312046, + "learning_rate": 9.804507121580363e-06, + "loss": 0.5439, + "step": 940 + }, + { + "epoch": 0.12, + "grad_norm": 1.5218457627879594, + "learning_rate": 9.80395026103026e-06, + "loss": 0.538, + "step": 941 + }, + { + "epoch": 0.12, + "grad_norm": 1.9835057866029318, + "learning_rate": 9.803392624359686e-06, + "loss": 0.5694, + "step": 942 + }, + { + "epoch": 0.12, + "grad_norm": 1.7890943425296235, + "learning_rate": 9.802834211658727e-06, + "loss": 0.556, + "step": 943 + }, + { + "epoch": 0.12, + "grad_norm": 1.823356520693511, + "learning_rate": 9.802275023017603e-06, + "loss": 0.4954, + "step": 944 + }, + { + "epoch": 0.12, + "grad_norm": 2.778959746033984, + "learning_rate": 9.801715058526654e-06, + "loss": 0.5047, + "step": 945 + }, + { + "epoch": 0.12, + "grad_norm": 1.5026611567465495, + "learning_rate": 9.801154318276346e-06, + "loss": 0.5611, + "step": 946 + }, + { + "epoch": 0.12, + "grad_norm": 1.3920378820144184, + "learning_rate": 9.800592802357274e-06, + "loss": 0.5048, + "step": 947 + }, + { + "epoch": 0.12, + "grad_norm": 1.5561386982115961, + "learning_rate": 9.800030510860154e-06, + "loss": 0.5821, + "step": 948 + }, + { + "epoch": 0.12, + "grad_norm": 5.0569047792119814, + "learning_rate": 9.799467443875832e-06, + "loss": 0.4784, + "step": 949 + }, + { + "epoch": 0.12, + "grad_norm": 1.5279500400553616, + "learning_rate": 9.798903601495274e-06, + "loss": 0.5358, + "step": 950 + }, + { + "epoch": 0.12, + "grad_norm": 1.4977870060684855, + "learning_rate": 9.798338983809574e-06, + "loss": 0.5229, + "step": 951 + }, + { + "epoch": 0.12, + "grad_norm": 1.7866661445205454, + "learning_rate": 9.797773590909951e-06, + "loss": 0.5753, + "step": 952 + }, + { + "epoch": 0.12, + "grad_norm": 1.6653644954339666, + "learning_rate": 9.797207422887749e-06, + "loss": 0.5222, + "step": 953 + }, + { + "epoch": 0.12, + "grad_norm": 1.4942644071101057, + "learning_rate": 9.796640479834439e-06, + "loss": 0.5648, + "step": 954 + }, + { + "epoch": 0.12, + "grad_norm": 2.3212009783951033, + "learning_rate": 9.796072761841614e-06, + "loss": 0.4975, + "step": 955 + }, + { + "epoch": 0.12, + "grad_norm": 2.3157046474966476, + "learning_rate": 9.795504269000993e-06, + "loss": 0.5508, + "step": 956 + }, + { + "epoch": 0.12, + "grad_norm": 1.5525174218441946, + "learning_rate": 9.794935001404424e-06, + "loss": 0.5046, + "step": 957 + }, + { + "epoch": 0.12, + "grad_norm": 1.5409578454278048, + "learning_rate": 9.794364959143876e-06, + "loss": 0.5008, + "step": 958 + }, + { + "epoch": 0.12, + "grad_norm": 2.0286529491143623, + "learning_rate": 9.793794142311442e-06, + "loss": 0.5295, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 1.735399312413163, + "learning_rate": 9.793222550999343e-06, + "loss": 0.4717, + "step": 960 + }, + { + "epoch": 0.12, + "grad_norm": 3.364301097630389, + "learning_rate": 9.79265018529993e-06, + "loss": 0.5434, + "step": 961 + }, + { + "epoch": 0.12, + "grad_norm": 0.6307910155590302, + "learning_rate": 9.792077045305667e-06, + "loss": 0.5009, + "step": 962 + }, + { + "epoch": 0.12, + "grad_norm": 2.0261251017328084, + "learning_rate": 9.791503131109153e-06, + "loss": 0.5054, + "step": 963 + }, + { + "epoch": 0.12, + "grad_norm": 1.8934827285210059, + "learning_rate": 9.790928442803109e-06, + "loss": 0.5408, + "step": 964 + }, + { + "epoch": 0.12, + "grad_norm": 1.7390284854415334, + "learning_rate": 9.79035298048038e-06, + "loss": 0.5017, + "step": 965 + }, + { + "epoch": 0.12, + "grad_norm": 1.5967519421160552, + "learning_rate": 9.789776744233937e-06, + "loss": 0.5105, + "step": 966 + }, + { + "epoch": 0.12, + "grad_norm": 1.4326649427114473, + "learning_rate": 9.789199734156876e-06, + "loss": 0.5143, + "step": 967 + }, + { + "epoch": 0.12, + "grad_norm": 2.0047895387472607, + "learning_rate": 9.788621950342419e-06, + "loss": 0.5886, + "step": 968 + }, + { + "epoch": 0.12, + "grad_norm": 2.059058151775754, + "learning_rate": 9.788043392883913e-06, + "loss": 0.5324, + "step": 969 + }, + { + "epoch": 0.12, + "grad_norm": 1.7462739610027802, + "learning_rate": 9.787464061874826e-06, + "loss": 0.5006, + "step": 970 + }, + { + "epoch": 0.12, + "grad_norm": 1.440421328798765, + "learning_rate": 9.786883957408755e-06, + "loss": 0.5404, + "step": 971 + }, + { + "epoch": 0.12, + "grad_norm": 1.4410596054166127, + "learning_rate": 9.786303079579423e-06, + "loss": 0.562, + "step": 972 + }, + { + "epoch": 0.12, + "grad_norm": 1.3845526423586314, + "learning_rate": 9.785721428480672e-06, + "loss": 0.5482, + "step": 973 + }, + { + "epoch": 0.12, + "grad_norm": 1.6455669709278569, + "learning_rate": 9.785139004206478e-06, + "loss": 0.5413, + "step": 974 + }, + { + "epoch": 0.12, + "grad_norm": 2.965987983372974, + "learning_rate": 9.784555806850932e-06, + "loss": 0.546, + "step": 975 + }, + { + "epoch": 0.12, + "grad_norm": 1.3477974890308504, + "learning_rate": 9.783971836508258e-06, + "loss": 0.5016, + "step": 976 + }, + { + "epoch": 0.12, + "grad_norm": 1.4996179118803765, + "learning_rate": 9.783387093272801e-06, + "loss": 0.603, + "step": 977 + }, + { + "epoch": 0.12, + "grad_norm": 1.6326168420110307, + "learning_rate": 9.78280157723903e-06, + "loss": 0.4822, + "step": 978 + }, + { + "epoch": 0.12, + "grad_norm": 2.1691513988958606, + "learning_rate": 9.782215288501541e-06, + "loss": 0.5877, + "step": 979 + }, + { + "epoch": 0.12, + "grad_norm": 1.3980271196527394, + "learning_rate": 9.781628227155056e-06, + "loss": 0.4766, + "step": 980 + }, + { + "epoch": 0.12, + "grad_norm": 1.5816788788667602, + "learning_rate": 9.781040393294415e-06, + "loss": 0.5204, + "step": 981 + }, + { + "epoch": 0.12, + "grad_norm": 1.4866512555478568, + "learning_rate": 9.780451787014593e-06, + "loss": 0.5519, + "step": 982 + }, + { + "epoch": 0.12, + "grad_norm": 1.500045693403318, + "learning_rate": 9.779862408410682e-06, + "loss": 0.5559, + "step": 983 + }, + { + "epoch": 0.12, + "grad_norm": 1.539271350449334, + "learning_rate": 9.779272257577901e-06, + "loss": 0.5247, + "step": 984 + }, + { + "epoch": 0.12, + "grad_norm": 1.452860261604936, + "learning_rate": 9.778681334611595e-06, + "loss": 0.5641, + "step": 985 + }, + { + "epoch": 0.12, + "grad_norm": 1.598342601973268, + "learning_rate": 9.778089639607232e-06, + "loss": 0.5046, + "step": 986 + }, + { + "epoch": 0.12, + "grad_norm": 1.5798505955932483, + "learning_rate": 9.777497172660406e-06, + "loss": 0.4981, + "step": 987 + }, + { + "epoch": 0.12, + "grad_norm": 2.4824775493039155, + "learning_rate": 9.776903933866838e-06, + "loss": 0.5306, + "step": 988 + }, + { + "epoch": 0.12, + "grad_norm": 1.7456312083003678, + "learning_rate": 9.776309923322365e-06, + "loss": 0.5612, + "step": 989 + }, + { + "epoch": 0.12, + "grad_norm": 2.0909503716589932, + "learning_rate": 9.77571514112296e-06, + "loss": 0.5093, + "step": 990 + }, + { + "epoch": 0.12, + "grad_norm": 2.043487520816119, + "learning_rate": 9.77511958736471e-06, + "loss": 0.5899, + "step": 991 + }, + { + "epoch": 0.12, + "grad_norm": 1.5009302169192755, + "learning_rate": 9.774523262143838e-06, + "loss": 0.5234, + "step": 992 + }, + { + "epoch": 0.12, + "grad_norm": 1.693164828745711, + "learning_rate": 9.773926165556682e-06, + "loss": 0.5293, + "step": 993 + }, + { + "epoch": 0.12, + "grad_norm": 1.320473825716359, + "learning_rate": 9.773328297699709e-06, + "loss": 0.4707, + "step": 994 + }, + { + "epoch": 0.12, + "grad_norm": 1.493561778782454, + "learning_rate": 9.772729658669511e-06, + "loss": 0.5299, + "step": 995 + }, + { + "epoch": 0.12, + "grad_norm": 1.3194409305794026, + "learning_rate": 9.7721302485628e-06, + "loss": 0.5391, + "step": 996 + }, + { + "epoch": 0.12, + "grad_norm": 1.5548800287869815, + "learning_rate": 9.77153006747642e-06, + "loss": 0.5221, + "step": 997 + }, + { + "epoch": 0.12, + "grad_norm": 1.3753921622169898, + "learning_rate": 9.770929115507333e-06, + "loss": 0.4613, + "step": 998 + }, + { + "epoch": 0.12, + "grad_norm": 1.4793123657960134, + "learning_rate": 9.770327392752627e-06, + "loss": 0.5219, + "step": 999 + }, + { + "epoch": 0.12, + "grad_norm": 1.6888355213538224, + "learning_rate": 9.76972489930952e-06, + "loss": 0.5256, + "step": 1000 + }, + { + "epoch": 0.12, + "grad_norm": 3.510342478308107, + "learning_rate": 9.769121635275348e-06, + "loss": 0.506, + "step": 1001 + }, + { + "epoch": 0.12, + "grad_norm": 1.3832178583611607, + "learning_rate": 9.768517600747572e-06, + "loss": 0.5308, + "step": 1002 + }, + { + "epoch": 0.12, + "grad_norm": 1.3804901712551811, + "learning_rate": 9.76791279582378e-06, + "loss": 0.5085, + "step": 1003 + }, + { + "epoch": 0.12, + "grad_norm": 1.5874172918864036, + "learning_rate": 9.767307220601682e-06, + "loss": 0.564, + "step": 1004 + }, + { + "epoch": 0.12, + "grad_norm": 1.709225786964121, + "learning_rate": 9.766700875179118e-06, + "loss": 0.5527, + "step": 1005 + }, + { + "epoch": 0.12, + "grad_norm": 1.6674120712422085, + "learning_rate": 9.766093759654047e-06, + "loss": 0.549, + "step": 1006 + }, + { + "epoch": 0.12, + "grad_norm": 1.5771344976496824, + "learning_rate": 9.765485874124551e-06, + "loss": 0.541, + "step": 1007 + }, + { + "epoch": 0.13, + "grad_norm": 2.0461275407031114, + "learning_rate": 9.764877218688844e-06, + "loss": 0.5979, + "step": 1008 + }, + { + "epoch": 0.13, + "grad_norm": 1.5508383621666664, + "learning_rate": 9.764267793445254e-06, + "loss": 0.5072, + "step": 1009 + }, + { + "epoch": 0.13, + "grad_norm": 1.5389952214975169, + "learning_rate": 9.763657598492244e-06, + "loss": 0.5267, + "step": 1010 + }, + { + "epoch": 0.13, + "grad_norm": 2.19076555655587, + "learning_rate": 9.763046633928394e-06, + "loss": 0.5695, + "step": 1011 + }, + { + "epoch": 0.13, + "grad_norm": 2.1357995975740542, + "learning_rate": 9.762434899852412e-06, + "loss": 0.5243, + "step": 1012 + }, + { + "epoch": 0.13, + "grad_norm": 10.30912398608652, + "learning_rate": 9.761822396363127e-06, + "loss": 0.5061, + "step": 1013 + }, + { + "epoch": 0.13, + "grad_norm": 1.7211057851844471, + "learning_rate": 9.761209123559497e-06, + "loss": 0.5614, + "step": 1014 + }, + { + "epoch": 0.13, + "grad_norm": 1.6430505979593055, + "learning_rate": 9.760595081540598e-06, + "loss": 0.5224, + "step": 1015 + }, + { + "epoch": 0.13, + "grad_norm": 1.7293605186403183, + "learning_rate": 9.759980270405636e-06, + "loss": 0.5426, + "step": 1016 + }, + { + "epoch": 0.13, + "grad_norm": 0.6819165374186842, + "learning_rate": 9.75936469025394e-06, + "loss": 0.5263, + "step": 1017 + }, + { + "epoch": 0.13, + "grad_norm": 3.0238097603809133, + "learning_rate": 9.758748341184963e-06, + "loss": 0.5028, + "step": 1018 + }, + { + "epoch": 0.13, + "grad_norm": 10.498056742579276, + "learning_rate": 9.758131223298277e-06, + "loss": 0.4839, + "step": 1019 + }, + { + "epoch": 0.13, + "grad_norm": 2.9442501356371493, + "learning_rate": 9.757513336693589e-06, + "loss": 0.4814, + "step": 1020 + }, + { + "epoch": 0.13, + "grad_norm": 1.5512458163562854, + "learning_rate": 9.756894681470718e-06, + "loss": 0.4852, + "step": 1021 + }, + { + "epoch": 0.13, + "grad_norm": 2.1008113097679795, + "learning_rate": 9.756275257729619e-06, + "loss": 0.5667, + "step": 1022 + }, + { + "epoch": 0.13, + "grad_norm": 1.8076424691587238, + "learning_rate": 9.755655065570363e-06, + "loss": 0.5121, + "step": 1023 + }, + { + "epoch": 0.13, + "grad_norm": 1.799319617461757, + "learning_rate": 9.755034105093143e-06, + "loss": 0.5469, + "step": 1024 + }, + { + "epoch": 0.13, + "grad_norm": 2.364799714460202, + "learning_rate": 9.754412376398288e-06, + "loss": 0.5173, + "step": 1025 + }, + { + "epoch": 0.13, + "grad_norm": 1.5452171924103493, + "learning_rate": 9.75378987958624e-06, + "loss": 0.5226, + "step": 1026 + }, + { + "epoch": 0.13, + "grad_norm": 4.548293795499255, + "learning_rate": 9.75316661475757e-06, + "loss": 0.5025, + "step": 1027 + }, + { + "epoch": 0.13, + "grad_norm": 1.5622088483551202, + "learning_rate": 9.752542582012969e-06, + "loss": 0.5667, + "step": 1028 + }, + { + "epoch": 0.13, + "grad_norm": 3.2172447624890994, + "learning_rate": 9.75191778145326e-06, + "loss": 0.5161, + "step": 1029 + }, + { + "epoch": 0.13, + "grad_norm": 1.5164628494052323, + "learning_rate": 9.75129221317938e-06, + "loss": 0.5504, + "step": 1030 + }, + { + "epoch": 0.13, + "grad_norm": 1.803976492652808, + "learning_rate": 9.750665877292399e-06, + "loss": 0.5444, + "step": 1031 + }, + { + "epoch": 0.13, + "grad_norm": 3.6078363814371985, + "learning_rate": 9.750038773893504e-06, + "loss": 0.4997, + "step": 1032 + }, + { + "epoch": 0.13, + "grad_norm": 1.5506457096195898, + "learning_rate": 9.749410903084012e-06, + "loss": 0.5513, + "step": 1033 + }, + { + "epoch": 0.13, + "grad_norm": 1.9688749331319362, + "learning_rate": 9.748782264965358e-06, + "loss": 0.5313, + "step": 1034 + }, + { + "epoch": 0.13, + "grad_norm": 3.383066967658014, + "learning_rate": 9.748152859639106e-06, + "loss": 0.5261, + "step": 1035 + }, + { + "epoch": 0.13, + "grad_norm": 1.39514071147194, + "learning_rate": 9.74752268720694e-06, + "loss": 0.4936, + "step": 1036 + }, + { + "epoch": 0.13, + "grad_norm": 3.8765304882173792, + "learning_rate": 9.746891747770674e-06, + "loss": 0.5348, + "step": 1037 + }, + { + "epoch": 0.13, + "grad_norm": 1.6546071258012456, + "learning_rate": 9.746260041432238e-06, + "loss": 0.5669, + "step": 1038 + }, + { + "epoch": 0.13, + "grad_norm": 1.8137595354780747, + "learning_rate": 9.745627568293692e-06, + "loss": 0.5026, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 1.7045762249053962, + "learning_rate": 9.744994328457216e-06, + "loss": 0.5531, + "step": 1040 + }, + { + "epoch": 0.13, + "grad_norm": 2.227903087362228, + "learning_rate": 9.744360322025116e-06, + "loss": 0.5391, + "step": 1041 + }, + { + "epoch": 0.13, + "grad_norm": 1.685844631800516, + "learning_rate": 9.743725549099821e-06, + "loss": 0.4648, + "step": 1042 + }, + { + "epoch": 0.13, + "grad_norm": 1.569940061174605, + "learning_rate": 9.743090009783884e-06, + "loss": 0.4944, + "step": 1043 + }, + { + "epoch": 0.13, + "grad_norm": 6.723142959449064, + "learning_rate": 9.742453704179984e-06, + "loss": 0.5125, + "step": 1044 + }, + { + "epoch": 0.13, + "grad_norm": 1.4912047218885114, + "learning_rate": 9.74181663239092e-06, + "loss": 0.4813, + "step": 1045 + }, + { + "epoch": 0.13, + "grad_norm": 2.5235008521313596, + "learning_rate": 9.741178794519617e-06, + "loss": 0.4845, + "step": 1046 + }, + { + "epoch": 0.13, + "grad_norm": 2.138918688090799, + "learning_rate": 9.740540190669123e-06, + "loss": 0.5212, + "step": 1047 + }, + { + "epoch": 0.13, + "grad_norm": 1.6402131120648562, + "learning_rate": 9.73990082094261e-06, + "loss": 0.5325, + "step": 1048 + }, + { + "epoch": 0.13, + "grad_norm": 2.3317651053778317, + "learning_rate": 9.739260685443373e-06, + "loss": 0.6036, + "step": 1049 + }, + { + "epoch": 0.13, + "grad_norm": 1.870170924854555, + "learning_rate": 9.738619784274833e-06, + "loss": 0.547, + "step": 1050 + }, + { + "epoch": 0.13, + "grad_norm": 1.645815534813316, + "learning_rate": 9.737978117540535e-06, + "loss": 0.5063, + "step": 1051 + }, + { + "epoch": 0.13, + "grad_norm": 4.392607147245962, + "learning_rate": 9.73733568534414e-06, + "loss": 0.5087, + "step": 1052 + }, + { + "epoch": 0.13, + "grad_norm": 3.5143165627174358, + "learning_rate": 9.736692487789445e-06, + "loss": 0.5406, + "step": 1053 + }, + { + "epoch": 0.13, + "grad_norm": 2.8633855334581346, + "learning_rate": 9.736048524980361e-06, + "loss": 0.5339, + "step": 1054 + }, + { + "epoch": 0.13, + "grad_norm": 2.4131426572227155, + "learning_rate": 9.735403797020927e-06, + "loss": 0.5163, + "step": 1055 + }, + { + "epoch": 0.13, + "grad_norm": 1.7726441057919677, + "learning_rate": 9.734758304015304e-06, + "loss": 0.5178, + "step": 1056 + }, + { + "epoch": 0.13, + "grad_norm": 3.4664637428741596, + "learning_rate": 9.734112046067776e-06, + "loss": 0.5636, + "step": 1057 + }, + { + "epoch": 0.13, + "grad_norm": 2.250829141697953, + "learning_rate": 9.733465023282752e-06, + "loss": 0.5415, + "step": 1058 + }, + { + "epoch": 0.13, + "grad_norm": 4.034243227211706, + "learning_rate": 9.732817235764766e-06, + "loss": 0.5273, + "step": 1059 + }, + { + "epoch": 0.13, + "grad_norm": 2.4069863342768443, + "learning_rate": 9.732168683618473e-06, + "loss": 0.5037, + "step": 1060 + }, + { + "epoch": 0.13, + "grad_norm": 1.9783336033315735, + "learning_rate": 9.731519366948649e-06, + "loss": 0.4826, + "step": 1061 + }, + { + "epoch": 0.13, + "grad_norm": 2.083472842664245, + "learning_rate": 9.730869285860203e-06, + "loss": 0.539, + "step": 1062 + }, + { + "epoch": 0.13, + "grad_norm": 3.1097580935012274, + "learning_rate": 9.730218440458157e-06, + "loss": 0.5468, + "step": 1063 + }, + { + "epoch": 0.13, + "grad_norm": 3.532998375527931, + "learning_rate": 9.729566830847662e-06, + "loss": 0.5339, + "step": 1064 + }, + { + "epoch": 0.13, + "grad_norm": 4.236358533233571, + "learning_rate": 9.72891445713399e-06, + "loss": 0.526, + "step": 1065 + }, + { + "epoch": 0.13, + "grad_norm": 0.6568477345833922, + "learning_rate": 9.728261319422541e-06, + "loss": 0.5168, + "step": 1066 + }, + { + "epoch": 0.13, + "grad_norm": 3.524033186735707, + "learning_rate": 9.727607417818831e-06, + "loss": 0.5217, + "step": 1067 + }, + { + "epoch": 0.13, + "grad_norm": 5.71213752658569, + "learning_rate": 9.726952752428509e-06, + "loss": 0.5378, + "step": 1068 + }, + { + "epoch": 0.13, + "grad_norm": 1.898260673307282, + "learning_rate": 9.726297323357335e-06, + "loss": 0.5476, + "step": 1069 + }, + { + "epoch": 0.13, + "grad_norm": 1.4246224608948932, + "learning_rate": 9.725641130711205e-06, + "loss": 0.4839, + "step": 1070 + }, + { + "epoch": 0.13, + "grad_norm": 1.7794987519402379, + "learning_rate": 9.724984174596129e-06, + "loss": 0.5076, + "step": 1071 + }, + { + "epoch": 0.13, + "grad_norm": 2.9061616591589945, + "learning_rate": 9.724326455118247e-06, + "loss": 0.4956, + "step": 1072 + }, + { + "epoch": 0.13, + "grad_norm": 2.173745750118815, + "learning_rate": 9.723667972383816e-06, + "loss": 0.6017, + "step": 1073 + }, + { + "epoch": 0.13, + "grad_norm": 1.9698620736533579, + "learning_rate": 9.723008726499224e-06, + "loss": 0.5057, + "step": 1074 + }, + { + "epoch": 0.13, + "grad_norm": 0.6466460434953547, + "learning_rate": 9.722348717570974e-06, + "loss": 0.4908, + "step": 1075 + }, + { + "epoch": 0.13, + "grad_norm": 1.8438028852879214, + "learning_rate": 9.7216879457057e-06, + "loss": 0.5895, + "step": 1076 + }, + { + "epoch": 0.13, + "grad_norm": 1.7790940990355575, + "learning_rate": 9.721026411010152e-06, + "loss": 0.5293, + "step": 1077 + }, + { + "epoch": 0.13, + "grad_norm": 1.5031660420904411, + "learning_rate": 9.720364113591209e-06, + "loss": 0.5265, + "step": 1078 + }, + { + "epoch": 0.13, + "grad_norm": 5.536355883265839, + "learning_rate": 9.71970105355587e-06, + "loss": 0.5356, + "step": 1079 + }, + { + "epoch": 0.13, + "grad_norm": 0.7276800472528657, + "learning_rate": 9.719037231011258e-06, + "loss": 0.5216, + "step": 1080 + }, + { + "epoch": 0.13, + "grad_norm": 1.5881462730538876, + "learning_rate": 9.71837264606462e-06, + "loss": 0.5356, + "step": 1081 + }, + { + "epoch": 0.13, + "grad_norm": 1.6318467478851473, + "learning_rate": 9.717707298823325e-06, + "loss": 0.5273, + "step": 1082 + }, + { + "epoch": 0.13, + "grad_norm": 2.9848151538178227, + "learning_rate": 9.717041189394865e-06, + "loss": 0.5267, + "step": 1083 + }, + { + "epoch": 0.13, + "grad_norm": 0.6819293422314283, + "learning_rate": 9.716374317886858e-06, + "loss": 0.5276, + "step": 1084 + }, + { + "epoch": 0.13, + "grad_norm": 1.8559724047291661, + "learning_rate": 9.715706684407042e-06, + "loss": 0.5191, + "step": 1085 + }, + { + "epoch": 0.13, + "grad_norm": 1.5339688193120449, + "learning_rate": 9.715038289063278e-06, + "loss": 0.5334, + "step": 1086 + }, + { + "epoch": 0.13, + "grad_norm": 2.5166992172797564, + "learning_rate": 9.714369131963554e-06, + "loss": 0.4737, + "step": 1087 + }, + { + "epoch": 0.14, + "grad_norm": 2.134209639993174, + "learning_rate": 9.713699213215974e-06, + "loss": 0.5355, + "step": 1088 + }, + { + "epoch": 0.14, + "grad_norm": 1.96269880054436, + "learning_rate": 9.713028532928771e-06, + "loss": 0.5655, + "step": 1089 + }, + { + "epoch": 0.14, + "grad_norm": 1.8661433241028522, + "learning_rate": 9.712357091210303e-06, + "loss": 0.5408, + "step": 1090 + }, + { + "epoch": 0.14, + "grad_norm": 1.6968495452565269, + "learning_rate": 9.711684888169043e-06, + "loss": 0.5566, + "step": 1091 + }, + { + "epoch": 0.14, + "grad_norm": 1.7839624583219784, + "learning_rate": 9.711011923913592e-06, + "loss": 0.4723, + "step": 1092 + }, + { + "epoch": 0.14, + "grad_norm": 1.9629701859367892, + "learning_rate": 9.710338198552673e-06, + "loss": 0.5142, + "step": 1093 + }, + { + "epoch": 0.14, + "grad_norm": 2.614758849948944, + "learning_rate": 9.709663712195134e-06, + "loss": 0.5048, + "step": 1094 + }, + { + "epoch": 0.14, + "grad_norm": 1.667464215660333, + "learning_rate": 9.708988464949944e-06, + "loss": 0.5172, + "step": 1095 + }, + { + "epoch": 0.14, + "grad_norm": 3.7918874492859533, + "learning_rate": 9.708312456926195e-06, + "loss": 0.5401, + "step": 1096 + }, + { + "epoch": 0.14, + "grad_norm": 1.7004882321052495, + "learning_rate": 9.707635688233098e-06, + "loss": 0.5092, + "step": 1097 + }, + { + "epoch": 0.14, + "grad_norm": 1.701452526614193, + "learning_rate": 9.706958158979997e-06, + "loss": 0.5389, + "step": 1098 + }, + { + "epoch": 0.14, + "grad_norm": 1.7636567128885436, + "learning_rate": 9.70627986927635e-06, + "loss": 0.6421, + "step": 1099 + }, + { + "epoch": 0.14, + "grad_norm": 1.8276796485736944, + "learning_rate": 9.705600819231743e-06, + "loss": 0.5333, + "step": 1100 + }, + { + "epoch": 0.14, + "grad_norm": 2.246268565950378, + "learning_rate": 9.704921008955876e-06, + "loss": 0.5561, + "step": 1101 + }, + { + "epoch": 0.14, + "grad_norm": 1.438082199230945, + "learning_rate": 9.704240438558585e-06, + "loss": 0.569, + "step": 1102 + }, + { + "epoch": 0.14, + "grad_norm": 1.9144248893253415, + "learning_rate": 9.70355910814982e-06, + "loss": 0.4746, + "step": 1103 + }, + { + "epoch": 0.14, + "grad_norm": 1.9469849927167315, + "learning_rate": 9.702877017839656e-06, + "loss": 0.5467, + "step": 1104 + }, + { + "epoch": 0.14, + "grad_norm": 1.645061673320711, + "learning_rate": 9.70219416773829e-06, + "loss": 0.5401, + "step": 1105 + }, + { + "epoch": 0.14, + "grad_norm": 1.7672589402175505, + "learning_rate": 9.701510557956041e-06, + "loss": 0.5233, + "step": 1106 + }, + { + "epoch": 0.14, + "grad_norm": 1.6562526037884509, + "learning_rate": 9.700826188603358e-06, + "loss": 0.573, + "step": 1107 + }, + { + "epoch": 0.14, + "grad_norm": 2.1079545454951174, + "learning_rate": 9.700141059790801e-06, + "loss": 0.5971, + "step": 1108 + }, + { + "epoch": 0.14, + "grad_norm": 1.484603231624597, + "learning_rate": 9.699455171629063e-06, + "loss": 0.5241, + "step": 1109 + }, + { + "epoch": 0.14, + "grad_norm": 1.5334514844206923, + "learning_rate": 9.69876852422895e-06, + "loss": 0.4631, + "step": 1110 + }, + { + "epoch": 0.14, + "grad_norm": 1.456284075939431, + "learning_rate": 9.698081117701399e-06, + "loss": 0.5309, + "step": 1111 + }, + { + "epoch": 0.14, + "grad_norm": 3.1560014571082715, + "learning_rate": 9.697392952157467e-06, + "loss": 0.4962, + "step": 1112 + }, + { + "epoch": 0.14, + "grad_norm": 1.6442324739481327, + "learning_rate": 9.696704027708332e-06, + "loss": 0.5049, + "step": 1113 + }, + { + "epoch": 0.14, + "grad_norm": 1.7439283088142261, + "learning_rate": 9.6960143444653e-06, + "loss": 0.5787, + "step": 1114 + }, + { + "epoch": 0.14, + "grad_norm": 1.8398447633635922, + "learning_rate": 9.695323902539787e-06, + "loss": 0.5191, + "step": 1115 + }, + { + "epoch": 0.14, + "grad_norm": 1.6251227622842839, + "learning_rate": 9.694632702043347e-06, + "loss": 0.5029, + "step": 1116 + }, + { + "epoch": 0.14, + "grad_norm": 5.79714755581544, + "learning_rate": 9.693940743087647e-06, + "loss": 0.5115, + "step": 1117 + }, + { + "epoch": 0.14, + "grad_norm": 2.064944948011489, + "learning_rate": 9.693248025784481e-06, + "loss": 0.5948, + "step": 1118 + }, + { + "epoch": 0.14, + "grad_norm": 2.179435891042551, + "learning_rate": 9.692554550245759e-06, + "loss": 0.55, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 4.815622421678713, + "learning_rate": 9.691860316583523e-06, + "loss": 0.4634, + "step": 1120 + }, + { + "epoch": 0.14, + "grad_norm": 1.8473286275242924, + "learning_rate": 9.69116532490993e-06, + "loss": 0.4928, + "step": 1121 + }, + { + "epoch": 0.14, + "grad_norm": 1.6666329256151406, + "learning_rate": 9.690469575337265e-06, + "loss": 0.5003, + "step": 1122 + }, + { + "epoch": 0.14, + "grad_norm": 2.5464761651711085, + "learning_rate": 9.689773067977927e-06, + "loss": 0.4924, + "step": 1123 + }, + { + "epoch": 0.14, + "grad_norm": 3.378718901137373, + "learning_rate": 9.689075802944447e-06, + "loss": 0.5337, + "step": 1124 + }, + { + "epoch": 0.14, + "grad_norm": 2.8700837581036818, + "learning_rate": 9.688377780349475e-06, + "loss": 0.5231, + "step": 1125 + }, + { + "epoch": 0.14, + "grad_norm": 1.7010701035393139, + "learning_rate": 9.687679000305779e-06, + "loss": 0.5754, + "step": 1126 + }, + { + "epoch": 0.14, + "grad_norm": 2.6688092521075686, + "learning_rate": 9.686979462926255e-06, + "loss": 0.5792, + "step": 1127 + }, + { + "epoch": 0.14, + "grad_norm": 1.8813021532141243, + "learning_rate": 9.68627916832392e-06, + "loss": 0.6095, + "step": 1128 + }, + { + "epoch": 0.14, + "grad_norm": 1.6581996476541911, + "learning_rate": 9.685578116611913e-06, + "loss": 0.5042, + "step": 1129 + }, + { + "epoch": 0.14, + "grad_norm": 1.6582970311756564, + "learning_rate": 9.684876307903495e-06, + "loss": 0.4988, + "step": 1130 + }, + { + "epoch": 0.14, + "grad_norm": 1.683884028197148, + "learning_rate": 9.684173742312047e-06, + "loss": 0.4801, + "step": 1131 + }, + { + "epoch": 0.14, + "grad_norm": 3.588948092366968, + "learning_rate": 9.683470419951076e-06, + "loss": 0.5665, + "step": 1132 + }, + { + "epoch": 0.14, + "grad_norm": 2.310523817864352, + "learning_rate": 9.682766340934212e-06, + "loss": 0.5613, + "step": 1133 + }, + { + "epoch": 0.14, + "grad_norm": 4.195517805689883, + "learning_rate": 9.682061505375203e-06, + "loss": 0.5476, + "step": 1134 + }, + { + "epoch": 0.14, + "grad_norm": 1.6244317571837932, + "learning_rate": 9.681355913387921e-06, + "loss": 0.5026, + "step": 1135 + }, + { + "epoch": 0.14, + "grad_norm": 1.8925554979075394, + "learning_rate": 9.680649565086363e-06, + "loss": 0.5229, + "step": 1136 + }, + { + "epoch": 0.14, + "grad_norm": 1.632376881931433, + "learning_rate": 9.679942460584643e-06, + "loss": 0.5724, + "step": 1137 + }, + { + "epoch": 0.14, + "grad_norm": 0.7020674525959631, + "learning_rate": 9.679234599997003e-06, + "loss": 0.4889, + "step": 1138 + }, + { + "epoch": 0.14, + "grad_norm": 1.5508086290219265, + "learning_rate": 9.6785259834378e-06, + "loss": 0.5535, + "step": 1139 + }, + { + "epoch": 0.14, + "grad_norm": 2.3720437102762606, + "learning_rate": 9.677816611021522e-06, + "loss": 0.5221, + "step": 1140 + }, + { + "epoch": 0.14, + "grad_norm": 3.6902927516019997, + "learning_rate": 9.67710648286277e-06, + "loss": 0.5167, + "step": 1141 + }, + { + "epoch": 0.14, + "grad_norm": 2.430557957008761, + "learning_rate": 9.676395599076274e-06, + "loss": 0.4993, + "step": 1142 + }, + { + "epoch": 0.14, + "grad_norm": 1.8064802832707425, + "learning_rate": 9.675683959776883e-06, + "loss": 0.4976, + "step": 1143 + }, + { + "epoch": 0.14, + "grad_norm": 1.7692889961512195, + "learning_rate": 9.67497156507957e-06, + "loss": 0.6078, + "step": 1144 + }, + { + "epoch": 0.14, + "grad_norm": 0.7279284571335646, + "learning_rate": 9.674258415099424e-06, + "loss": 0.4917, + "step": 1145 + }, + { + "epoch": 0.14, + "grad_norm": 1.5671149456296096, + "learning_rate": 9.673544509951666e-06, + "loss": 0.5279, + "step": 1146 + }, + { + "epoch": 0.14, + "grad_norm": 2.3614047258344826, + "learning_rate": 9.672829849751633e-06, + "loss": 0.5778, + "step": 1147 + }, + { + "epoch": 0.14, + "grad_norm": 1.6605177543197422, + "learning_rate": 9.67211443461478e-06, + "loss": 0.5539, + "step": 1148 + }, + { + "epoch": 0.14, + "grad_norm": 1.745652249143334, + "learning_rate": 9.671398264656693e-06, + "loss": 0.5044, + "step": 1149 + }, + { + "epoch": 0.14, + "grad_norm": 1.6403940468245404, + "learning_rate": 9.670681339993076e-06, + "loss": 0.5688, + "step": 1150 + }, + { + "epoch": 0.14, + "grad_norm": 1.3705504539528528, + "learning_rate": 9.669963660739749e-06, + "loss": 0.4915, + "step": 1151 + }, + { + "epoch": 0.14, + "grad_norm": 2.0052403405897046, + "learning_rate": 9.669245227012667e-06, + "loss": 0.5709, + "step": 1152 + }, + { + "epoch": 0.14, + "grad_norm": 1.555499863113405, + "learning_rate": 9.668526038927895e-06, + "loss": 0.4766, + "step": 1153 + }, + { + "epoch": 0.14, + "grad_norm": 1.531025831393635, + "learning_rate": 9.667806096601621e-06, + "loss": 0.4946, + "step": 1154 + }, + { + "epoch": 0.14, + "grad_norm": 2.0443285220787972, + "learning_rate": 9.667085400150167e-06, + "loss": 0.4962, + "step": 1155 + }, + { + "epoch": 0.14, + "grad_norm": 1.4424983450814588, + "learning_rate": 9.666363949689959e-06, + "loss": 0.4927, + "step": 1156 + }, + { + "epoch": 0.14, + "grad_norm": 2.059446725914697, + "learning_rate": 9.665641745337558e-06, + "loss": 0.5343, + "step": 1157 + }, + { + "epoch": 0.14, + "grad_norm": 2.0802914443470293, + "learning_rate": 9.664918787209643e-06, + "loss": 0.532, + "step": 1158 + }, + { + "epoch": 0.14, + "grad_norm": 0.6472887207129542, + "learning_rate": 9.664195075423011e-06, + "loss": 0.4588, + "step": 1159 + }, + { + "epoch": 0.14, + "grad_norm": 1.7292441767378381, + "learning_rate": 9.663470610094587e-06, + "loss": 0.5606, + "step": 1160 + }, + { + "epoch": 0.14, + "grad_norm": 1.788898862305559, + "learning_rate": 9.662745391341415e-06, + "loss": 0.513, + "step": 1161 + }, + { + "epoch": 0.14, + "grad_norm": 1.5309024684748662, + "learning_rate": 9.662019419280659e-06, + "loss": 0.5694, + "step": 1162 + }, + { + "epoch": 0.14, + "grad_norm": 1.4995683508223077, + "learning_rate": 9.661292694029605e-06, + "loss": 0.5268, + "step": 1163 + }, + { + "epoch": 0.14, + "grad_norm": 1.7653072066574216, + "learning_rate": 9.660565215705664e-06, + "loss": 0.5659, + "step": 1164 + }, + { + "epoch": 0.14, + "grad_norm": 1.963524493317451, + "learning_rate": 9.659836984426366e-06, + "loss": 0.4975, + "step": 1165 + }, + { + "epoch": 0.14, + "grad_norm": 1.6558752769707101, + "learning_rate": 9.65910800030936e-06, + "loss": 0.4612, + "step": 1166 + }, + { + "epoch": 0.14, + "grad_norm": 1.569688065552612, + "learning_rate": 9.658378263472428e-06, + "loss": 0.5306, + "step": 1167 + }, + { + "epoch": 0.14, + "grad_norm": 1.600377545926572, + "learning_rate": 9.657647774033456e-06, + "loss": 0.5281, + "step": 1168 + }, + { + "epoch": 0.15, + "grad_norm": 2.094413769681627, + "learning_rate": 9.656916532110468e-06, + "loss": 0.4827, + "step": 1169 + }, + { + "epoch": 0.15, + "grad_norm": 2.4195357712238534, + "learning_rate": 9.656184537821598e-06, + "loss": 0.4979, + "step": 1170 + }, + { + "epoch": 0.15, + "grad_norm": 1.5100123439164768, + "learning_rate": 9.655451791285108e-06, + "loss": 0.5261, + "step": 1171 + }, + { + "epoch": 0.15, + "grad_norm": 1.688940664076167, + "learning_rate": 9.65471829261938e-06, + "loss": 0.5062, + "step": 1172 + }, + { + "epoch": 0.15, + "grad_norm": 1.5548426778604, + "learning_rate": 9.653984041942917e-06, + "loss": 0.5516, + "step": 1173 + }, + { + "epoch": 0.15, + "grad_norm": 1.7100786513962827, + "learning_rate": 9.653249039374344e-06, + "loss": 0.5542, + "step": 1174 + }, + { + "epoch": 0.15, + "grad_norm": 1.6241572009144818, + "learning_rate": 9.652513285032406e-06, + "loss": 0.4834, + "step": 1175 + }, + { + "epoch": 0.15, + "grad_norm": 1.6386928401709666, + "learning_rate": 9.65177677903597e-06, + "loss": 0.5102, + "step": 1176 + }, + { + "epoch": 0.15, + "grad_norm": 1.6867664454182212, + "learning_rate": 9.651039521504026e-06, + "loss": 0.5255, + "step": 1177 + }, + { + "epoch": 0.15, + "grad_norm": 3.4139399172397975, + "learning_rate": 9.650301512555687e-06, + "loss": 0.5622, + "step": 1178 + }, + { + "epoch": 0.15, + "grad_norm": 4.094308688561519, + "learning_rate": 9.64956275231018e-06, + "loss": 0.4882, + "step": 1179 + }, + { + "epoch": 0.15, + "grad_norm": 1.865048965284112, + "learning_rate": 9.648823240886862e-06, + "loss": 0.5096, + "step": 1180 + }, + { + "epoch": 0.15, + "grad_norm": 2.494701863132983, + "learning_rate": 9.648082978405207e-06, + "loss": 0.5809, + "step": 1181 + }, + { + "epoch": 0.15, + "grad_norm": 0.6793676392905069, + "learning_rate": 9.647341964984808e-06, + "loss": 0.5012, + "step": 1182 + }, + { + "epoch": 0.15, + "grad_norm": 5.213005790580897, + "learning_rate": 9.646600200745386e-06, + "loss": 0.5641, + "step": 1183 + }, + { + "epoch": 0.15, + "grad_norm": 0.695252467817391, + "learning_rate": 9.645857685806776e-06, + "loss": 0.4946, + "step": 1184 + }, + { + "epoch": 0.15, + "grad_norm": 1.6872888667684667, + "learning_rate": 9.645114420288943e-06, + "loss": 0.5504, + "step": 1185 + }, + { + "epoch": 0.15, + "grad_norm": 2.1474607546107864, + "learning_rate": 9.644370404311962e-06, + "loss": 0.5554, + "step": 1186 + }, + { + "epoch": 0.15, + "grad_norm": 2.65613332157712, + "learning_rate": 9.64362563799604e-06, + "loss": 0.5397, + "step": 1187 + }, + { + "epoch": 0.15, + "grad_norm": 2.1984130027521513, + "learning_rate": 9.642880121461498e-06, + "loss": 0.5274, + "step": 1188 + }, + { + "epoch": 0.15, + "grad_norm": 1.9276657533703743, + "learning_rate": 9.642133854828782e-06, + "loss": 0.5502, + "step": 1189 + }, + { + "epoch": 0.15, + "grad_norm": 0.6981372675873604, + "learning_rate": 9.641386838218457e-06, + "loss": 0.471, + "step": 1190 + }, + { + "epoch": 0.15, + "grad_norm": 2.117793484644697, + "learning_rate": 9.640639071751211e-06, + "loss": 0.5068, + "step": 1191 + }, + { + "epoch": 0.15, + "grad_norm": 2.737088935356651, + "learning_rate": 9.639890555547851e-06, + "loss": 0.5434, + "step": 1192 + }, + { + "epoch": 0.15, + "grad_norm": 2.215032646390322, + "learning_rate": 9.639141289729308e-06, + "loss": 0.5475, + "step": 1193 + }, + { + "epoch": 0.15, + "grad_norm": 8.586242568006778, + "learning_rate": 9.638391274416631e-06, + "loss": 0.5123, + "step": 1194 + }, + { + "epoch": 0.15, + "grad_norm": 0.7004319954959887, + "learning_rate": 9.637640509730994e-06, + "loss": 0.5152, + "step": 1195 + }, + { + "epoch": 0.15, + "grad_norm": 5.300052946091436, + "learning_rate": 9.636888995793688e-06, + "loss": 0.516, + "step": 1196 + }, + { + "epoch": 0.15, + "grad_norm": 3.3913417939583823, + "learning_rate": 9.636136732726125e-06, + "loss": 0.5321, + "step": 1197 + }, + { + "epoch": 0.15, + "grad_norm": 2.6905813420549474, + "learning_rate": 9.635383720649842e-06, + "loss": 0.5162, + "step": 1198 + }, + { + "epoch": 0.15, + "grad_norm": 2.1986911451055486, + "learning_rate": 9.634629959686495e-06, + "loss": 0.5013, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 4.310535302411778, + "learning_rate": 9.633875449957858e-06, + "loss": 0.535, + "step": 1200 + }, + { + "epoch": 0.15, + "grad_norm": 1.8547606195531325, + "learning_rate": 9.633120191585831e-06, + "loss": 0.51, + "step": 1201 + }, + { + "epoch": 0.15, + "grad_norm": 2.613164494224369, + "learning_rate": 9.632364184692433e-06, + "loss": 0.5094, + "step": 1202 + }, + { + "epoch": 0.15, + "grad_norm": 2.85274118986579, + "learning_rate": 9.631607429399804e-06, + "loss": 0.5153, + "step": 1203 + }, + { + "epoch": 0.15, + "grad_norm": 2.5773498083449433, + "learning_rate": 9.6308499258302e-06, + "loss": 0.5393, + "step": 1204 + }, + { + "epoch": 0.15, + "grad_norm": 3.010815599427557, + "learning_rate": 9.630091674106007e-06, + "loss": 0.5093, + "step": 1205 + }, + { + "epoch": 0.15, + "grad_norm": 2.9770650666755865, + "learning_rate": 9.629332674349726e-06, + "loss": 0.4783, + "step": 1206 + }, + { + "epoch": 0.15, + "grad_norm": 3.437068958047755, + "learning_rate": 9.62857292668398e-06, + "loss": 0.5322, + "step": 1207 + }, + { + "epoch": 0.15, + "grad_norm": 4.74042908492775, + "learning_rate": 9.627812431231513e-06, + "loss": 0.5359, + "step": 1208 + }, + { + "epoch": 0.15, + "grad_norm": 2.01476278074651, + "learning_rate": 9.627051188115188e-06, + "loss": 0.5211, + "step": 1209 + }, + { + "epoch": 0.15, + "grad_norm": 4.8731844210119135, + "learning_rate": 9.626289197457994e-06, + "loss": 0.5703, + "step": 1210 + }, + { + "epoch": 0.15, + "grad_norm": 2.3127041969062683, + "learning_rate": 9.625526459383036e-06, + "loss": 0.5378, + "step": 1211 + }, + { + "epoch": 0.15, + "grad_norm": 2.2867488219449634, + "learning_rate": 9.62476297401354e-06, + "loss": 0.5546, + "step": 1212 + }, + { + "epoch": 0.15, + "grad_norm": 2.5360675967832114, + "learning_rate": 9.623998741472853e-06, + "loss": 0.4797, + "step": 1213 + }, + { + "epoch": 0.15, + "grad_norm": 2.0113735199718814, + "learning_rate": 9.623233761884445e-06, + "loss": 0.5743, + "step": 1214 + }, + { + "epoch": 0.15, + "grad_norm": 2.367006214996671, + "learning_rate": 9.622468035371905e-06, + "loss": 0.5069, + "step": 1215 + }, + { + "epoch": 0.15, + "grad_norm": 3.009616179978305, + "learning_rate": 9.621701562058945e-06, + "loss": 0.5268, + "step": 1216 + }, + { + "epoch": 0.15, + "grad_norm": 2.7213470140986056, + "learning_rate": 9.620934342069391e-06, + "loss": 0.5008, + "step": 1217 + }, + { + "epoch": 0.15, + "grad_norm": 1.8416928579180722, + "learning_rate": 9.620166375527199e-06, + "loss": 0.5389, + "step": 1218 + }, + { + "epoch": 0.15, + "grad_norm": 3.5360768675242755, + "learning_rate": 9.619397662556434e-06, + "loss": 0.5154, + "step": 1219 + }, + { + "epoch": 0.15, + "grad_norm": 2.666612843631181, + "learning_rate": 9.618628203281295e-06, + "loss": 0.5828, + "step": 1220 + }, + { + "epoch": 0.15, + "grad_norm": 2.2125291127077475, + "learning_rate": 9.617857997826093e-06, + "loss": 0.5387, + "step": 1221 + }, + { + "epoch": 0.15, + "grad_norm": 4.7546061712011785, + "learning_rate": 9.617087046315261e-06, + "loss": 0.4793, + "step": 1222 + }, + { + "epoch": 0.15, + "grad_norm": 2.464835676400265, + "learning_rate": 9.616315348873351e-06, + "loss": 0.5424, + "step": 1223 + }, + { + "epoch": 0.15, + "grad_norm": 8.164795916642207, + "learning_rate": 9.615542905625041e-06, + "loss": 0.5119, + "step": 1224 + }, + { + "epoch": 0.15, + "grad_norm": 2.6484271639304557, + "learning_rate": 9.614769716695124e-06, + "loss": 0.5667, + "step": 1225 + }, + { + "epoch": 0.15, + "grad_norm": 5.150313277541799, + "learning_rate": 9.613995782208519e-06, + "loss": 0.5262, + "step": 1226 + }, + { + "epoch": 0.15, + "grad_norm": 2.321639794941608, + "learning_rate": 9.613221102290256e-06, + "loss": 0.4829, + "step": 1227 + }, + { + "epoch": 0.15, + "grad_norm": 2.6141589182441907, + "learning_rate": 9.612445677065494e-06, + "loss": 0.5041, + "step": 1228 + }, + { + "epoch": 0.15, + "grad_norm": 1.9570658209637812, + "learning_rate": 9.611669506659512e-06, + "loss": 0.5886, + "step": 1229 + }, + { + "epoch": 0.15, + "grad_norm": 2.8923861340904082, + "learning_rate": 9.610892591197702e-06, + "loss": 0.5248, + "step": 1230 + }, + { + "epoch": 0.15, + "grad_norm": 2.8984705127022097, + "learning_rate": 9.610114930805588e-06, + "loss": 0.5049, + "step": 1231 + }, + { + "epoch": 0.15, + "grad_norm": 2.4040263275304556, + "learning_rate": 9.609336525608804e-06, + "loss": 0.5476, + "step": 1232 + }, + { + "epoch": 0.15, + "grad_norm": 2.5329760895843587, + "learning_rate": 9.608557375733108e-06, + "loss": 0.5361, + "step": 1233 + }, + { + "epoch": 0.15, + "grad_norm": 2.4091340353572606, + "learning_rate": 9.607777481304378e-06, + "loss": 0.5241, + "step": 1234 + }, + { + "epoch": 0.15, + "grad_norm": 4.083298586043196, + "learning_rate": 9.606996842448617e-06, + "loss": 0.5274, + "step": 1235 + }, + { + "epoch": 0.15, + "grad_norm": 4.131902615128241, + "learning_rate": 9.60621545929194e-06, + "loss": 0.5581, + "step": 1236 + }, + { + "epoch": 0.15, + "grad_norm": 0.7455567595837368, + "learning_rate": 9.605433331960589e-06, + "loss": 0.4963, + "step": 1237 + }, + { + "epoch": 0.15, + "grad_norm": 3.2914875837784163, + "learning_rate": 9.60465046058092e-06, + "loss": 0.4893, + "step": 1238 + }, + { + "epoch": 0.15, + "grad_norm": 3.2680055610154803, + "learning_rate": 9.603866845279416e-06, + "loss": 0.547, + "step": 1239 + }, + { + "epoch": 0.15, + "grad_norm": 2.3527127987284717, + "learning_rate": 9.603082486182677e-06, + "loss": 0.5201, + "step": 1240 + }, + { + "epoch": 0.15, + "grad_norm": 3.8932230576608493, + "learning_rate": 9.60229738341742e-06, + "loss": 0.5067, + "step": 1241 + }, + { + "epoch": 0.15, + "grad_norm": 4.521453010399267, + "learning_rate": 9.601511537110488e-06, + "loss": 0.5039, + "step": 1242 + }, + { + "epoch": 0.15, + "grad_norm": 2.902197545583407, + "learning_rate": 9.600724947388842e-06, + "loss": 0.5102, + "step": 1243 + }, + { + "epoch": 0.15, + "grad_norm": 3.8936990939314446, + "learning_rate": 9.59993761437956e-06, + "loss": 0.4766, + "step": 1244 + }, + { + "epoch": 0.15, + "grad_norm": 1.818935057937169, + "learning_rate": 9.599149538209844e-06, + "loss": 0.4479, + "step": 1245 + }, + { + "epoch": 0.15, + "grad_norm": 2.4841266281922247, + "learning_rate": 9.598360719007014e-06, + "loss": 0.5502, + "step": 1246 + }, + { + "epoch": 0.15, + "grad_norm": 0.6573299803407252, + "learning_rate": 9.597571156898512e-06, + "loss": 0.4948, + "step": 1247 + }, + { + "epoch": 0.15, + "grad_norm": 1.9694935567001643, + "learning_rate": 9.596780852011898e-06, + "loss": 0.5235, + "step": 1248 + }, + { + "epoch": 0.16, + "grad_norm": 1.7258316103612048, + "learning_rate": 9.59598980447485e-06, + "loss": 0.5048, + "step": 1249 + }, + { + "epoch": 0.16, + "grad_norm": 2.4025233679193496, + "learning_rate": 9.595198014415175e-06, + "loss": 0.4647, + "step": 1250 + }, + { + "epoch": 0.16, + "grad_norm": 1.9544947136772115, + "learning_rate": 9.594405481960788e-06, + "loss": 0.5649, + "step": 1251 + }, + { + "epoch": 0.16, + "grad_norm": 1.9777545383227084, + "learning_rate": 9.593612207239731e-06, + "loss": 0.5665, + "step": 1252 + }, + { + "epoch": 0.16, + "grad_norm": 2.117499822077094, + "learning_rate": 9.592818190380164e-06, + "loss": 0.5135, + "step": 1253 + }, + { + "epoch": 0.16, + "grad_norm": 7.076146244025416, + "learning_rate": 9.59202343151037e-06, + "loss": 0.5269, + "step": 1254 + }, + { + "epoch": 0.16, + "grad_norm": 3.223126647867816, + "learning_rate": 9.591227930758747e-06, + "loss": 0.5216, + "step": 1255 + }, + { + "epoch": 0.16, + "grad_norm": 2.0328596241189216, + "learning_rate": 9.590431688253816e-06, + "loss": 0.5427, + "step": 1256 + }, + { + "epoch": 0.16, + "grad_norm": 2.6556957352883486, + "learning_rate": 9.589634704124218e-06, + "loss": 0.5067, + "step": 1257 + }, + { + "epoch": 0.16, + "grad_norm": 2.804364284260751, + "learning_rate": 9.58883697849871e-06, + "loss": 0.5287, + "step": 1258 + }, + { + "epoch": 0.16, + "grad_norm": 1.5480925200101965, + "learning_rate": 9.588038511506174e-06, + "loss": 0.45, + "step": 1259 + }, + { + "epoch": 0.16, + "grad_norm": 2.730298843291153, + "learning_rate": 9.587239303275609e-06, + "loss": 0.5133, + "step": 1260 + }, + { + "epoch": 0.16, + "grad_norm": 4.908091031732794, + "learning_rate": 9.586439353936134e-06, + "loss": 0.4956, + "step": 1261 + }, + { + "epoch": 0.16, + "grad_norm": 2.708475563781002, + "learning_rate": 9.585638663616988e-06, + "loss": 0.5518, + "step": 1262 + }, + { + "epoch": 0.16, + "grad_norm": 1.9115489553355347, + "learning_rate": 9.584837232447528e-06, + "loss": 0.484, + "step": 1263 + }, + { + "epoch": 0.16, + "grad_norm": 2.6690112469223255, + "learning_rate": 9.584035060557232e-06, + "loss": 0.558, + "step": 1264 + }, + { + "epoch": 0.16, + "grad_norm": 1.8645618150965557, + "learning_rate": 9.583232148075704e-06, + "loss": 0.5433, + "step": 1265 + }, + { + "epoch": 0.16, + "grad_norm": 1.6006910031055772, + "learning_rate": 9.582428495132652e-06, + "loss": 0.5044, + "step": 1266 + }, + { + "epoch": 0.16, + "grad_norm": 1.643665291170957, + "learning_rate": 9.58162410185792e-06, + "loss": 0.5421, + "step": 1267 + }, + { + "epoch": 0.16, + "grad_norm": 2.0755796639475856, + "learning_rate": 9.580818968381465e-06, + "loss": 0.5329, + "step": 1268 + }, + { + "epoch": 0.16, + "grad_norm": 1.4024091543968857, + "learning_rate": 9.580013094833358e-06, + "loss": 0.5128, + "step": 1269 + }, + { + "epoch": 0.16, + "grad_norm": 1.4212575894723896, + "learning_rate": 9.579206481343802e-06, + "loss": 0.4955, + "step": 1270 + }, + { + "epoch": 0.16, + "grad_norm": 1.5678792635077088, + "learning_rate": 9.578399128043106e-06, + "loss": 0.4909, + "step": 1271 + }, + { + "epoch": 0.16, + "grad_norm": 2.2412182086972967, + "learning_rate": 9.577591035061709e-06, + "loss": 0.4752, + "step": 1272 + }, + { + "epoch": 0.16, + "grad_norm": 2.0582740833905353, + "learning_rate": 9.576782202530164e-06, + "loss": 0.5094, + "step": 1273 + }, + { + "epoch": 0.16, + "grad_norm": 2.1595951744380097, + "learning_rate": 9.575972630579147e-06, + "loss": 0.4988, + "step": 1274 + }, + { + "epoch": 0.16, + "grad_norm": 1.7575760712452722, + "learning_rate": 9.575162319339448e-06, + "loss": 0.5597, + "step": 1275 + }, + { + "epoch": 0.16, + "grad_norm": 2.1055777822631323, + "learning_rate": 9.574351268941982e-06, + "loss": 0.5209, + "step": 1276 + }, + { + "epoch": 0.16, + "grad_norm": 2.8901163453768444, + "learning_rate": 9.573539479517782e-06, + "loss": 0.5642, + "step": 1277 + }, + { + "epoch": 0.16, + "grad_norm": 1.781553074512771, + "learning_rate": 9.572726951198e-06, + "loss": 0.5262, + "step": 1278 + }, + { + "epoch": 0.16, + "grad_norm": 1.9014220500004342, + "learning_rate": 9.571913684113905e-06, + "loss": 0.5311, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 1.5394281191849384, + "learning_rate": 9.571099678396886e-06, + "loss": 0.5251, + "step": 1280 + }, + { + "epoch": 0.16, + "grad_norm": 2.749150285162678, + "learning_rate": 9.57028493417846e-06, + "loss": 0.558, + "step": 1281 + }, + { + "epoch": 0.16, + "grad_norm": 2.194358414181551, + "learning_rate": 9.569469451590248e-06, + "loss": 0.5159, + "step": 1282 + }, + { + "epoch": 0.16, + "grad_norm": 1.3419010510041824, + "learning_rate": 9.568653230764003e-06, + "loss": 0.4754, + "step": 1283 + }, + { + "epoch": 0.16, + "grad_norm": 1.60353026025585, + "learning_rate": 9.567836271831592e-06, + "loss": 0.5184, + "step": 1284 + }, + { + "epoch": 0.16, + "grad_norm": 2.08991136852093, + "learning_rate": 9.567018574925e-06, + "loss": 0.5461, + "step": 1285 + }, + { + "epoch": 0.16, + "grad_norm": 3.3027449836301668, + "learning_rate": 9.566200140176336e-06, + "loss": 0.5126, + "step": 1286 + }, + { + "epoch": 0.16, + "grad_norm": 1.675273115727022, + "learning_rate": 9.565380967717824e-06, + "loss": 0.5403, + "step": 1287 + }, + { + "epoch": 0.16, + "grad_norm": 1.5043081964292935, + "learning_rate": 9.564561057681805e-06, + "loss": 0.4648, + "step": 1288 + }, + { + "epoch": 0.16, + "grad_norm": 3.3349823760180684, + "learning_rate": 9.56374041020075e-06, + "loss": 0.5427, + "step": 1289 + }, + { + "epoch": 0.16, + "grad_norm": 2.211619642804284, + "learning_rate": 9.562919025407236e-06, + "loss": 0.5283, + "step": 1290 + }, + { + "epoch": 0.16, + "grad_norm": 1.45557876575767, + "learning_rate": 9.562096903433968e-06, + "loss": 0.5412, + "step": 1291 + }, + { + "epoch": 0.16, + "grad_norm": 2.9953449011412303, + "learning_rate": 9.561274044413764e-06, + "loss": 0.5083, + "step": 1292 + }, + { + "epoch": 0.16, + "grad_norm": 11.668342208345377, + "learning_rate": 9.560450448479567e-06, + "loss": 0.5149, + "step": 1293 + }, + { + "epoch": 0.16, + "grad_norm": 2.170669371423883, + "learning_rate": 9.559626115764437e-06, + "loss": 0.5424, + "step": 1294 + }, + { + "epoch": 0.16, + "grad_norm": 1.794732304031168, + "learning_rate": 9.558801046401547e-06, + "loss": 0.5584, + "step": 1295 + }, + { + "epoch": 0.16, + "grad_norm": 1.8864192744126171, + "learning_rate": 9.5579752405242e-06, + "loss": 0.5109, + "step": 1296 + }, + { + "epoch": 0.16, + "grad_norm": 1.9524520373710526, + "learning_rate": 9.55714869826581e-06, + "loss": 0.5622, + "step": 1297 + }, + { + "epoch": 0.16, + "grad_norm": 1.6924629447195305, + "learning_rate": 9.55632141975991e-06, + "loss": 0.5265, + "step": 1298 + }, + { + "epoch": 0.16, + "grad_norm": 1.5101536120787722, + "learning_rate": 9.555493405140158e-06, + "loss": 0.5426, + "step": 1299 + }, + { + "epoch": 0.16, + "grad_norm": 1.4822593254375265, + "learning_rate": 9.554664654540324e-06, + "loss": 0.5498, + "step": 1300 + }, + { + "epoch": 0.16, + "grad_norm": 2.1051335057769935, + "learning_rate": 9.553835168094302e-06, + "loss": 0.5347, + "step": 1301 + }, + { + "epoch": 0.16, + "grad_norm": 1.7821949833507682, + "learning_rate": 9.553004945936101e-06, + "loss": 0.5628, + "step": 1302 + }, + { + "epoch": 0.16, + "grad_norm": 0.7230312544660243, + "learning_rate": 9.552173988199854e-06, + "loss": 0.5047, + "step": 1303 + }, + { + "epoch": 0.16, + "grad_norm": 4.525803056649358, + "learning_rate": 9.551342295019805e-06, + "loss": 0.5135, + "step": 1304 + }, + { + "epoch": 0.16, + "grad_norm": 0.6930037398549997, + "learning_rate": 9.550509866530323e-06, + "loss": 0.4996, + "step": 1305 + }, + { + "epoch": 0.16, + "grad_norm": 1.5104572978125144, + "learning_rate": 9.549676702865897e-06, + "loss": 0.5371, + "step": 1306 + }, + { + "epoch": 0.16, + "grad_norm": 1.8562563426485768, + "learning_rate": 9.54884280416113e-06, + "loss": 0.5612, + "step": 1307 + }, + { + "epoch": 0.16, + "grad_norm": 1.926794889658595, + "learning_rate": 9.548008170550744e-06, + "loss": 0.4897, + "step": 1308 + }, + { + "epoch": 0.16, + "grad_norm": 1.639247567223645, + "learning_rate": 9.547172802169582e-06, + "loss": 0.5497, + "step": 1309 + }, + { + "epoch": 0.16, + "grad_norm": 2.8091300839987694, + "learning_rate": 9.546336699152608e-06, + "loss": 0.5176, + "step": 1310 + }, + { + "epoch": 0.16, + "grad_norm": 1.9440447348284993, + "learning_rate": 9.545499861634897e-06, + "loss": 0.5783, + "step": 1311 + }, + { + "epoch": 0.16, + "grad_norm": 1.8076948729636986, + "learning_rate": 9.544662289751651e-06, + "loss": 0.5092, + "step": 1312 + }, + { + "epoch": 0.16, + "grad_norm": 6.715210485185207, + "learning_rate": 9.543823983638187e-06, + "loss": 0.4853, + "step": 1313 + }, + { + "epoch": 0.16, + "grad_norm": 3.513563901986961, + "learning_rate": 9.54298494342994e-06, + "loss": 0.5144, + "step": 1314 + }, + { + "epoch": 0.16, + "grad_norm": 2.3014768871227727, + "learning_rate": 9.542145169262465e-06, + "loss": 0.5047, + "step": 1315 + }, + { + "epoch": 0.16, + "grad_norm": 2.8557169997552467, + "learning_rate": 9.541304661271433e-06, + "loss": 0.5442, + "step": 1316 + }, + { + "epoch": 0.16, + "grad_norm": 2.638193691189023, + "learning_rate": 9.540463419592638e-06, + "loss": 0.5737, + "step": 1317 + }, + { + "epoch": 0.16, + "grad_norm": 1.4331612514067824, + "learning_rate": 9.539621444361988e-06, + "loss": 0.4952, + "step": 1318 + }, + { + "epoch": 0.16, + "grad_norm": 1.814975821494965, + "learning_rate": 9.538778735715512e-06, + "loss": 0.5214, + "step": 1319 + }, + { + "epoch": 0.16, + "grad_norm": 1.6358206258929424, + "learning_rate": 9.537935293789357e-06, + "loss": 0.5852, + "step": 1320 + }, + { + "epoch": 0.16, + "grad_norm": 11.785035433589641, + "learning_rate": 9.53709111871979e-06, + "loss": 0.4918, + "step": 1321 + }, + { + "epoch": 0.16, + "grad_norm": 2.7894871619403805, + "learning_rate": 9.536246210643192e-06, + "loss": 0.4917, + "step": 1322 + }, + { + "epoch": 0.16, + "grad_norm": 0.6960296087976123, + "learning_rate": 9.535400569696068e-06, + "loss": 0.4808, + "step": 1323 + }, + { + "epoch": 0.16, + "grad_norm": 1.6707368064014259, + "learning_rate": 9.534554196015038e-06, + "loss": 0.5208, + "step": 1324 + }, + { + "epoch": 0.16, + "grad_norm": 4.418989617577566, + "learning_rate": 9.53370708973684e-06, + "loss": 0.5419, + "step": 1325 + }, + { + "epoch": 0.16, + "grad_norm": 2.9542552706872764, + "learning_rate": 9.532859250998332e-06, + "loss": 0.5268, + "step": 1326 + }, + { + "epoch": 0.16, + "grad_norm": 2.109108500431418, + "learning_rate": 9.532010679936491e-06, + "loss": 0.5321, + "step": 1327 + }, + { + "epoch": 0.16, + "grad_norm": 2.40541389320721, + "learning_rate": 9.53116137668841e-06, + "loss": 0.5123, + "step": 1328 + }, + { + "epoch": 0.16, + "grad_norm": 4.120180365123593, + "learning_rate": 9.530311341391303e-06, + "loss": 0.5523, + "step": 1329 + }, + { + "epoch": 0.17, + "grad_norm": 1.837416018100702, + "learning_rate": 9.529460574182498e-06, + "loss": 0.5293, + "step": 1330 + }, + { + "epoch": 0.17, + "grad_norm": 1.8130170198172144, + "learning_rate": 9.528609075199445e-06, + "loss": 0.5078, + "step": 1331 + }, + { + "epoch": 0.17, + "grad_norm": 3.5671538001377203, + "learning_rate": 9.527756844579711e-06, + "loss": 0.5376, + "step": 1332 + }, + { + "epoch": 0.17, + "grad_norm": 1.7851923640009364, + "learning_rate": 9.526903882460983e-06, + "loss": 0.5495, + "step": 1333 + }, + { + "epoch": 0.17, + "grad_norm": 2.058190778408725, + "learning_rate": 9.526050188981064e-06, + "loss": 0.488, + "step": 1334 + }, + { + "epoch": 0.17, + "grad_norm": 1.9634306303556182, + "learning_rate": 9.525195764277874e-06, + "loss": 0.5185, + "step": 1335 + }, + { + "epoch": 0.17, + "grad_norm": 2.032739483474685, + "learning_rate": 9.524340608489454e-06, + "loss": 0.5238, + "step": 1336 + }, + { + "epoch": 0.17, + "grad_norm": 3.412786853743459, + "learning_rate": 9.523484721753961e-06, + "loss": 0.4834, + "step": 1337 + }, + { + "epoch": 0.17, + "grad_norm": 0.7243055388004805, + "learning_rate": 9.522628104209675e-06, + "loss": 0.5075, + "step": 1338 + }, + { + "epoch": 0.17, + "grad_norm": 2.4597665228729926, + "learning_rate": 9.521770755994983e-06, + "loss": 0.5515, + "step": 1339 + }, + { + "epoch": 0.17, + "grad_norm": 1.616644720048126, + "learning_rate": 9.520912677248403e-06, + "loss": 0.5475, + "step": 1340 + }, + { + "epoch": 0.17, + "grad_norm": 1.5366952523712116, + "learning_rate": 9.520053868108566e-06, + "loss": 0.4744, + "step": 1341 + }, + { + "epoch": 0.17, + "grad_norm": 3.09729427720111, + "learning_rate": 9.519194328714214e-06, + "loss": 0.5455, + "step": 1342 + }, + { + "epoch": 0.17, + "grad_norm": 3.556155394623956, + "learning_rate": 9.518334059204218e-06, + "loss": 0.5228, + "step": 1343 + }, + { + "epoch": 0.17, + "grad_norm": 3.2820341211515673, + "learning_rate": 9.517473059717559e-06, + "loss": 0.5088, + "step": 1344 + }, + { + "epoch": 0.17, + "grad_norm": 1.7841649432175868, + "learning_rate": 9.516611330393343e-06, + "loss": 0.499, + "step": 1345 + }, + { + "epoch": 0.17, + "grad_norm": 2.3170294309282338, + "learning_rate": 9.515748871370786e-06, + "loss": 0.5223, + "step": 1346 + }, + { + "epoch": 0.17, + "grad_norm": 1.7675365545954447, + "learning_rate": 9.51488568278923e-06, + "loss": 0.5589, + "step": 1347 + }, + { + "epoch": 0.17, + "grad_norm": 2.876530940821112, + "learning_rate": 9.514021764788127e-06, + "loss": 0.5056, + "step": 1348 + }, + { + "epoch": 0.17, + "grad_norm": 1.8763300559518927, + "learning_rate": 9.513157117507053e-06, + "loss": 0.5009, + "step": 1349 + }, + { + "epoch": 0.17, + "grad_norm": 2.299660676483901, + "learning_rate": 9.512291741085696e-06, + "loss": 0.4943, + "step": 1350 + }, + { + "epoch": 0.17, + "grad_norm": 1.8836219482182772, + "learning_rate": 9.51142563566387e-06, + "loss": 0.5369, + "step": 1351 + }, + { + "epoch": 0.17, + "grad_norm": 1.7009943354735515, + "learning_rate": 9.510558801381497e-06, + "loss": 0.5177, + "step": 1352 + }, + { + "epoch": 0.17, + "grad_norm": 2.007229156877643, + "learning_rate": 9.509691238378626e-06, + "loss": 0.5089, + "step": 1353 + }, + { + "epoch": 0.17, + "grad_norm": 1.4387259770576408, + "learning_rate": 9.508822946795417e-06, + "loss": 0.4878, + "step": 1354 + }, + { + "epoch": 0.17, + "grad_norm": 3.1516983884115723, + "learning_rate": 9.507953926772152e-06, + "loss": 0.5281, + "step": 1355 + }, + { + "epoch": 0.17, + "grad_norm": 1.8865060786578052, + "learning_rate": 9.507084178449226e-06, + "loss": 0.5245, + "step": 1356 + }, + { + "epoch": 0.17, + "grad_norm": 1.8621271424152859, + "learning_rate": 9.506213701967157e-06, + "loss": 0.5745, + "step": 1357 + }, + { + "epoch": 0.17, + "grad_norm": 1.6162835398610804, + "learning_rate": 9.505342497466577e-06, + "loss": 0.5083, + "step": 1358 + }, + { + "epoch": 0.17, + "grad_norm": 1.9539800242472292, + "learning_rate": 9.504470565088237e-06, + "loss": 0.5211, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 1.8163097041032972, + "learning_rate": 9.503597904973006e-06, + "loss": 0.5399, + "step": 1360 + }, + { + "epoch": 0.17, + "grad_norm": 0.6481196635818798, + "learning_rate": 9.502724517261867e-06, + "loss": 0.4951, + "step": 1361 + }, + { + "epoch": 0.17, + "grad_norm": 1.7434840971002281, + "learning_rate": 9.50185040209593e-06, + "loss": 0.4897, + "step": 1362 + }, + { + "epoch": 0.17, + "grad_norm": 1.938999967638018, + "learning_rate": 9.500975559616407e-06, + "loss": 0.5322, + "step": 1363 + }, + { + "epoch": 0.17, + "grad_norm": 1.5166896975682522, + "learning_rate": 9.500099989964644e-06, + "loss": 0.4906, + "step": 1364 + }, + { + "epoch": 0.17, + "grad_norm": 0.6493103354228393, + "learning_rate": 9.499223693282095e-06, + "loss": 0.4824, + "step": 1365 + }, + { + "epoch": 0.17, + "grad_norm": 1.5580997421435645, + "learning_rate": 9.498346669710331e-06, + "loss": 0.5484, + "step": 1366 + }, + { + "epoch": 0.17, + "grad_norm": 1.5722533449608065, + "learning_rate": 9.497468919391046e-06, + "loss": 0.5286, + "step": 1367 + }, + { + "epoch": 0.17, + "grad_norm": 2.1379581735023523, + "learning_rate": 9.496590442466045e-06, + "loss": 0.5285, + "step": 1368 + }, + { + "epoch": 0.17, + "grad_norm": 6.004542983588922, + "learning_rate": 9.495711239077256e-06, + "loss": 0.5383, + "step": 1369 + }, + { + "epoch": 0.17, + "grad_norm": 1.723423923699892, + "learning_rate": 9.494831309366723e-06, + "loss": 0.5023, + "step": 1370 + }, + { + "epoch": 0.17, + "grad_norm": 7.294621546390402, + "learning_rate": 9.493950653476604e-06, + "loss": 0.4739, + "step": 1371 + }, + { + "epoch": 0.17, + "grad_norm": 1.8962630752434209, + "learning_rate": 9.493069271549179e-06, + "loss": 0.5278, + "step": 1372 + }, + { + "epoch": 0.17, + "grad_norm": 1.2888590584106818, + "learning_rate": 9.49218716372684e-06, + "loss": 0.4836, + "step": 1373 + }, + { + "epoch": 0.17, + "grad_norm": 1.7787949561950795, + "learning_rate": 9.491304330152102e-06, + "loss": 0.5112, + "step": 1374 + }, + { + "epoch": 0.17, + "grad_norm": 2.1784795200831315, + "learning_rate": 9.490420770967594e-06, + "loss": 0.5267, + "step": 1375 + }, + { + "epoch": 0.17, + "grad_norm": 1.6375490466954763, + "learning_rate": 9.489536486316062e-06, + "loss": 0.5242, + "step": 1376 + }, + { + "epoch": 0.17, + "grad_norm": 1.477956447346823, + "learning_rate": 9.48865147634037e-06, + "loss": 0.5442, + "step": 1377 + }, + { + "epoch": 0.17, + "grad_norm": 1.409881186281856, + "learning_rate": 9.487765741183499e-06, + "loss": 0.4991, + "step": 1378 + }, + { + "epoch": 0.17, + "grad_norm": 3.6800854486941126, + "learning_rate": 9.486879280988548e-06, + "loss": 0.5077, + "step": 1379 + }, + { + "epoch": 0.17, + "grad_norm": 1.9714553284132872, + "learning_rate": 9.485992095898734e-06, + "loss": 0.4876, + "step": 1380 + }, + { + "epoch": 0.17, + "grad_norm": 1.5844507509035346, + "learning_rate": 9.485104186057386e-06, + "loss": 0.4705, + "step": 1381 + }, + { + "epoch": 0.17, + "grad_norm": 1.4796161279871252, + "learning_rate": 9.484215551607956e-06, + "loss": 0.4945, + "step": 1382 + }, + { + "epoch": 0.17, + "grad_norm": 1.3891117518032612, + "learning_rate": 9.48332619269401e-06, + "loss": 0.5054, + "step": 1383 + }, + { + "epoch": 0.17, + "grad_norm": 1.7929379067706905, + "learning_rate": 9.482436109459231e-06, + "loss": 0.5061, + "step": 1384 + }, + { + "epoch": 0.17, + "grad_norm": 1.6640724080747038, + "learning_rate": 9.481545302047423e-06, + "loss": 0.502, + "step": 1385 + }, + { + "epoch": 0.17, + "grad_norm": 1.5469445671131699, + "learning_rate": 9.480653770602502e-06, + "loss": 0.5031, + "step": 1386 + }, + { + "epoch": 0.17, + "grad_norm": 1.356349412795799, + "learning_rate": 9.479761515268499e-06, + "loss": 0.5271, + "step": 1387 + }, + { + "epoch": 0.17, + "grad_norm": 1.44417765309099, + "learning_rate": 9.478868536189571e-06, + "loss": 0.4856, + "step": 1388 + }, + { + "epoch": 0.17, + "grad_norm": 3.332946182241321, + "learning_rate": 9.477974833509984e-06, + "loss": 0.5198, + "step": 1389 + }, + { + "epoch": 0.17, + "grad_norm": 1.7290837012935454, + "learning_rate": 9.477080407374124e-06, + "loss": 0.4916, + "step": 1390 + }, + { + "epoch": 0.17, + "grad_norm": 4.4563147780501335, + "learning_rate": 9.476185257926496e-06, + "loss": 0.5489, + "step": 1391 + }, + { + "epoch": 0.17, + "grad_norm": 1.5845936119723913, + "learning_rate": 9.475289385311714e-06, + "loss": 0.5404, + "step": 1392 + }, + { + "epoch": 0.17, + "grad_norm": 0.6851537462201612, + "learning_rate": 9.474392789674517e-06, + "loss": 0.4969, + "step": 1393 + }, + { + "epoch": 0.17, + "grad_norm": 1.2924924777343387, + "learning_rate": 9.473495471159759e-06, + "loss": 0.4973, + "step": 1394 + }, + { + "epoch": 0.17, + "grad_norm": 1.8189207932754312, + "learning_rate": 9.472597429912409e-06, + "loss": 0.5301, + "step": 1395 + }, + { + "epoch": 0.17, + "grad_norm": 3.9522121006208017, + "learning_rate": 9.471698666077554e-06, + "loss": 0.5599, + "step": 1396 + }, + { + "epoch": 0.17, + "grad_norm": 3.654001619422945, + "learning_rate": 9.470799179800393e-06, + "loss": 0.513, + "step": 1397 + }, + { + "epoch": 0.17, + "grad_norm": 1.4693946073677817, + "learning_rate": 9.469898971226251e-06, + "loss": 0.4943, + "step": 1398 + }, + { + "epoch": 0.17, + "grad_norm": 1.352280469067633, + "learning_rate": 9.468998040500563e-06, + "loss": 0.4851, + "step": 1399 + }, + { + "epoch": 0.17, + "grad_norm": 1.713410517258502, + "learning_rate": 9.468096387768882e-06, + "loss": 0.4895, + "step": 1400 + }, + { + "epoch": 0.17, + "grad_norm": 0.6359837268312648, + "learning_rate": 9.467194013176878e-06, + "loss": 0.4622, + "step": 1401 + }, + { + "epoch": 0.17, + "grad_norm": 1.9106064409196937, + "learning_rate": 9.466290916870338e-06, + "loss": 0.5316, + "step": 1402 + }, + { + "epoch": 0.17, + "grad_norm": 1.5652596983399456, + "learning_rate": 9.465387098995165e-06, + "loss": 0.524, + "step": 1403 + }, + { + "epoch": 0.17, + "grad_norm": 1.4641608430759605, + "learning_rate": 9.464482559697377e-06, + "loss": 0.5306, + "step": 1404 + }, + { + "epoch": 0.17, + "grad_norm": 2.365277486999446, + "learning_rate": 9.463577299123113e-06, + "loss": 0.4959, + "step": 1405 + }, + { + "epoch": 0.17, + "grad_norm": 1.3531053244732796, + "learning_rate": 9.462671317418625e-06, + "loss": 0.4777, + "step": 1406 + }, + { + "epoch": 0.17, + "grad_norm": 1.529796907855802, + "learning_rate": 9.461764614730282e-06, + "loss": 0.5073, + "step": 1407 + }, + { + "epoch": 0.17, + "grad_norm": 1.6037050306107454, + "learning_rate": 9.46085719120457e-06, + "loss": 0.5345, + "step": 1408 + }, + { + "epoch": 0.17, + "grad_norm": 4.184384273671617, + "learning_rate": 9.459949046988089e-06, + "loss": 0.5283, + "step": 1409 + }, + { + "epoch": 0.17, + "grad_norm": 1.971143226559445, + "learning_rate": 9.459040182227561e-06, + "loss": 0.4918, + "step": 1410 + }, + { + "epoch": 0.18, + "grad_norm": 1.6162454601784557, + "learning_rate": 9.458130597069818e-06, + "loss": 0.5123, + "step": 1411 + }, + { + "epoch": 0.18, + "grad_norm": 2.187548515299508, + "learning_rate": 9.457220291661817e-06, + "loss": 0.6018, + "step": 1412 + }, + { + "epoch": 0.18, + "grad_norm": 1.417545035108629, + "learning_rate": 9.456309266150621e-06, + "loss": 0.5043, + "step": 1413 + }, + { + "epoch": 0.18, + "grad_norm": 1.6683483655573892, + "learning_rate": 9.455397520683414e-06, + "loss": 0.5066, + "step": 1414 + }, + { + "epoch": 0.18, + "grad_norm": 1.8267071607296352, + "learning_rate": 9.454485055407498e-06, + "loss": 0.5395, + "step": 1415 + }, + { + "epoch": 0.18, + "grad_norm": 2.2769052311188753, + "learning_rate": 9.45357187047029e-06, + "loss": 0.5016, + "step": 1416 + }, + { + "epoch": 0.18, + "grad_norm": 1.5160384864380638, + "learning_rate": 9.452657966019324e-06, + "loss": 0.5127, + "step": 1417 + }, + { + "epoch": 0.18, + "grad_norm": 1.5904869300889308, + "learning_rate": 9.451743342202248e-06, + "loss": 0.5364, + "step": 1418 + }, + { + "epoch": 0.18, + "grad_norm": 1.6504050121743716, + "learning_rate": 9.450827999166825e-06, + "loss": 0.5016, + "step": 1419 + }, + { + "epoch": 0.18, + "grad_norm": 1.5705800991617183, + "learning_rate": 9.449911937060943e-06, + "loss": 0.5539, + "step": 1420 + }, + { + "epoch": 0.18, + "grad_norm": 1.4645945472912039, + "learning_rate": 9.448995156032595e-06, + "loss": 0.5034, + "step": 1421 + }, + { + "epoch": 0.18, + "grad_norm": 1.4074577541628757, + "learning_rate": 9.448077656229895e-06, + "loss": 0.5257, + "step": 1422 + }, + { + "epoch": 0.18, + "grad_norm": 1.6932730921411916, + "learning_rate": 9.447159437801074e-06, + "loss": 0.525, + "step": 1423 + }, + { + "epoch": 0.18, + "grad_norm": 1.508238112135693, + "learning_rate": 9.44624050089448e-06, + "loss": 0.5297, + "step": 1424 + }, + { + "epoch": 0.18, + "grad_norm": 1.386080118480711, + "learning_rate": 9.445320845658574e-06, + "loss": 0.4918, + "step": 1425 + }, + { + "epoch": 0.18, + "grad_norm": 1.7120372086794187, + "learning_rate": 9.444400472241934e-06, + "loss": 0.4962, + "step": 1426 + }, + { + "epoch": 0.18, + "grad_norm": 1.2643237638504257, + "learning_rate": 9.443479380793256e-06, + "loss": 0.5, + "step": 1427 + }, + { + "epoch": 0.18, + "grad_norm": 0.6849918060542648, + "learning_rate": 9.44255757146135e-06, + "loss": 0.5596, + "step": 1428 + }, + { + "epoch": 0.18, + "grad_norm": 1.7220988937001047, + "learning_rate": 9.44163504439514e-06, + "loss": 0.5398, + "step": 1429 + }, + { + "epoch": 0.18, + "grad_norm": 1.7402435053337213, + "learning_rate": 9.44071179974367e-06, + "loss": 0.5265, + "step": 1430 + }, + { + "epoch": 0.18, + "grad_norm": 1.7330985961158227, + "learning_rate": 9.4397878376561e-06, + "loss": 0.4891, + "step": 1431 + }, + { + "epoch": 0.18, + "grad_norm": 1.9559886571410015, + "learning_rate": 9.438863158281702e-06, + "loss": 0.5233, + "step": 1432 + }, + { + "epoch": 0.18, + "grad_norm": 1.6463205429913006, + "learning_rate": 9.437937761769867e-06, + "loss": 0.5349, + "step": 1433 + }, + { + "epoch": 0.18, + "grad_norm": 1.6663696669907717, + "learning_rate": 9.4370116482701e-06, + "loss": 0.5131, + "step": 1434 + }, + { + "epoch": 0.18, + "grad_norm": 0.7039126134585667, + "learning_rate": 9.436084817932023e-06, + "loss": 0.537, + "step": 1435 + }, + { + "epoch": 0.18, + "grad_norm": 1.5876792100322659, + "learning_rate": 9.435157270905375e-06, + "loss": 0.5268, + "step": 1436 + }, + { + "epoch": 0.18, + "grad_norm": 1.4258461786668768, + "learning_rate": 9.434229007340008e-06, + "loss": 0.4963, + "step": 1437 + }, + { + "epoch": 0.18, + "grad_norm": 1.899112215769487, + "learning_rate": 9.433300027385891e-06, + "loss": 0.5093, + "step": 1438 + }, + { + "epoch": 0.18, + "grad_norm": 0.5966334451115977, + "learning_rate": 9.432370331193112e-06, + "loss": 0.506, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 2.2001297668171955, + "learning_rate": 9.43143991891187e-06, + "loss": 0.5542, + "step": 1440 + }, + { + "epoch": 0.18, + "grad_norm": 1.5979829933620182, + "learning_rate": 9.430508790692477e-06, + "loss": 0.5353, + "step": 1441 + }, + { + "epoch": 0.18, + "grad_norm": 1.3938257090441009, + "learning_rate": 9.429576946685369e-06, + "loss": 0.5508, + "step": 1442 + }, + { + "epoch": 0.18, + "grad_norm": 1.887500531029625, + "learning_rate": 9.428644387041094e-06, + "loss": 0.5216, + "step": 1443 + }, + { + "epoch": 0.18, + "grad_norm": 3.830904789433637, + "learning_rate": 9.427711111910314e-06, + "loss": 0.559, + "step": 1444 + }, + { + "epoch": 0.18, + "grad_norm": 1.4963688236773471, + "learning_rate": 9.42677712144381e-06, + "loss": 0.5282, + "step": 1445 + }, + { + "epoch": 0.18, + "grad_norm": 1.4798735024063236, + "learning_rate": 9.42584241579247e-06, + "loss": 0.5549, + "step": 1446 + }, + { + "epoch": 0.18, + "grad_norm": 1.2502644942892267, + "learning_rate": 9.424906995107312e-06, + "loss": 0.541, + "step": 1447 + }, + { + "epoch": 0.18, + "grad_norm": 1.3223247871109725, + "learning_rate": 9.423970859539456e-06, + "loss": 0.4921, + "step": 1448 + }, + { + "epoch": 0.18, + "grad_norm": 1.891800694340005, + "learning_rate": 9.423034009240146e-06, + "loss": 0.4867, + "step": 1449 + }, + { + "epoch": 0.18, + "grad_norm": 1.5131892594414695, + "learning_rate": 9.422096444360736e-06, + "loss": 0.5101, + "step": 1450 + }, + { + "epoch": 0.18, + "grad_norm": 1.4027176765940803, + "learning_rate": 9.4211581650527e-06, + "loss": 0.4721, + "step": 1451 + }, + { + "epoch": 0.18, + "grad_norm": 2.05923787696074, + "learning_rate": 9.420219171467624e-06, + "loss": 0.5484, + "step": 1452 + }, + { + "epoch": 0.18, + "grad_norm": 1.6191945144015785, + "learning_rate": 9.419279463757212e-06, + "loss": 0.5314, + "step": 1453 + }, + { + "epoch": 0.18, + "grad_norm": 1.5430874657099605, + "learning_rate": 9.41833904207328e-06, + "loss": 0.5428, + "step": 1454 + }, + { + "epoch": 0.18, + "grad_norm": 1.544150626032831, + "learning_rate": 9.417397906567762e-06, + "loss": 0.5192, + "step": 1455 + }, + { + "epoch": 0.18, + "grad_norm": 1.828887008965527, + "learning_rate": 9.41645605739271e-06, + "loss": 0.5674, + "step": 1456 + }, + { + "epoch": 0.18, + "grad_norm": 1.446962550807306, + "learning_rate": 9.415513494700281e-06, + "loss": 0.5534, + "step": 1457 + }, + { + "epoch": 0.18, + "grad_norm": 1.409505263960533, + "learning_rate": 9.414570218642762e-06, + "loss": 0.5118, + "step": 1458 + }, + { + "epoch": 0.18, + "grad_norm": 1.9473100463811759, + "learning_rate": 9.413626229372543e-06, + "loss": 0.5302, + "step": 1459 + }, + { + "epoch": 0.18, + "grad_norm": 2.0115230441559584, + "learning_rate": 9.412681527042135e-06, + "loss": 0.5832, + "step": 1460 + }, + { + "epoch": 0.18, + "grad_norm": 2.504650850157876, + "learning_rate": 9.411736111804161e-06, + "loss": 0.5084, + "step": 1461 + }, + { + "epoch": 0.18, + "grad_norm": 1.5976807919316702, + "learning_rate": 9.410789983811366e-06, + "loss": 0.5124, + "step": 1462 + }, + { + "epoch": 0.18, + "grad_norm": 1.7259305630841915, + "learning_rate": 9.409843143216602e-06, + "loss": 0.5163, + "step": 1463 + }, + { + "epoch": 0.18, + "grad_norm": 1.3109812844968418, + "learning_rate": 9.408895590172837e-06, + "loss": 0.5135, + "step": 1464 + }, + { + "epoch": 0.18, + "grad_norm": 1.501449859949129, + "learning_rate": 9.407947324833161e-06, + "loss": 0.5308, + "step": 1465 + }, + { + "epoch": 0.18, + "grad_norm": 1.8953543710540945, + "learning_rate": 9.406998347350774e-06, + "loss": 0.4843, + "step": 1466 + }, + { + "epoch": 0.18, + "grad_norm": 0.7027642277487309, + "learning_rate": 9.40604865787899e-06, + "loss": 0.4771, + "step": 1467 + }, + { + "epoch": 0.18, + "grad_norm": 1.7233924413310757, + "learning_rate": 9.40509825657124e-06, + "loss": 0.5098, + "step": 1468 + }, + { + "epoch": 0.18, + "grad_norm": 0.7107652884518568, + "learning_rate": 9.40414714358107e-06, + "loss": 0.5001, + "step": 1469 + }, + { + "epoch": 0.18, + "grad_norm": 1.582631591668211, + "learning_rate": 9.403195319062142e-06, + "loss": 0.5248, + "step": 1470 + }, + { + "epoch": 0.18, + "grad_norm": 2.387342543665346, + "learning_rate": 9.402242783168228e-06, + "loss": 0.5538, + "step": 1471 + }, + { + "epoch": 0.18, + "grad_norm": 0.672928018604899, + "learning_rate": 9.401289536053223e-06, + "loss": 0.5351, + "step": 1472 + }, + { + "epoch": 0.18, + "grad_norm": 1.9915864472791334, + "learning_rate": 9.400335577871128e-06, + "loss": 0.5635, + "step": 1473 + }, + { + "epoch": 0.18, + "grad_norm": 1.6520622595751824, + "learning_rate": 9.399380908776068e-06, + "loss": 0.5298, + "step": 1474 + }, + { + "epoch": 0.18, + "grad_norm": 1.931210566761977, + "learning_rate": 9.398425528922275e-06, + "loss": 0.558, + "step": 1475 + }, + { + "epoch": 0.18, + "grad_norm": 2.7686227437021595, + "learning_rate": 9.3974694384641e-06, + "loss": 0.5278, + "step": 1476 + }, + { + "epoch": 0.18, + "grad_norm": 1.5142014550507987, + "learning_rate": 9.396512637556007e-06, + "loss": 0.4852, + "step": 1477 + }, + { + "epoch": 0.18, + "grad_norm": 2.121450635344311, + "learning_rate": 9.395555126352576e-06, + "loss": 0.5849, + "step": 1478 + }, + { + "epoch": 0.18, + "grad_norm": 1.9456554463436584, + "learning_rate": 9.394596905008504e-06, + "loss": 0.5376, + "step": 1479 + }, + { + "epoch": 0.18, + "grad_norm": 1.697792582575253, + "learning_rate": 9.393637973678595e-06, + "loss": 0.5283, + "step": 1480 + }, + { + "epoch": 0.18, + "grad_norm": 3.311532952026007, + "learning_rate": 9.392678332517778e-06, + "loss": 0.5285, + "step": 1481 + }, + { + "epoch": 0.18, + "grad_norm": 1.6887015599370656, + "learning_rate": 9.391717981681089e-06, + "loss": 0.5606, + "step": 1482 + }, + { + "epoch": 0.18, + "grad_norm": 1.8749079141161675, + "learning_rate": 9.39075692132368e-06, + "loss": 0.5518, + "step": 1483 + }, + { + "epoch": 0.18, + "grad_norm": 2.5186622420467013, + "learning_rate": 9.38979515160082e-06, + "loss": 0.5646, + "step": 1484 + }, + { + "epoch": 0.18, + "grad_norm": 1.5302094559262753, + "learning_rate": 9.388832672667893e-06, + "loss": 0.5038, + "step": 1485 + }, + { + "epoch": 0.18, + "grad_norm": 1.7591373713642573, + "learning_rate": 9.387869484680395e-06, + "loss": 0.5247, + "step": 1486 + }, + { + "epoch": 0.18, + "grad_norm": 1.9377967760499746, + "learning_rate": 9.386905587793933e-06, + "loss": 0.5463, + "step": 1487 + }, + { + "epoch": 0.18, + "grad_norm": 1.3403669028777074, + "learning_rate": 9.385940982164239e-06, + "loss": 0.5146, + "step": 1488 + }, + { + "epoch": 0.18, + "grad_norm": 1.811749023747539, + "learning_rate": 9.384975667947152e-06, + "loss": 0.5511, + "step": 1489 + }, + { + "epoch": 0.18, + "grad_norm": 4.577574886181875, + "learning_rate": 9.384009645298627e-06, + "loss": 0.5508, + "step": 1490 + }, + { + "epoch": 0.19, + "grad_norm": 5.301454623132674, + "learning_rate": 9.383042914374731e-06, + "loss": 0.5493, + "step": 1491 + }, + { + "epoch": 0.19, + "grad_norm": 2.4533170252188423, + "learning_rate": 9.382075475331652e-06, + "loss": 0.5466, + "step": 1492 + }, + { + "epoch": 0.19, + "grad_norm": 2.5421032500320355, + "learning_rate": 9.381107328325683e-06, + "loss": 0.52, + "step": 1493 + }, + { + "epoch": 0.19, + "grad_norm": 4.352474522527251, + "learning_rate": 9.380138473513241e-06, + "loss": 0.4819, + "step": 1494 + }, + { + "epoch": 0.19, + "grad_norm": 2.3336804024823565, + "learning_rate": 9.379168911050853e-06, + "loss": 0.5604, + "step": 1495 + }, + { + "epoch": 0.19, + "grad_norm": 2.450935676102443, + "learning_rate": 9.378198641095159e-06, + "loss": 0.5465, + "step": 1496 + }, + { + "epoch": 0.19, + "grad_norm": 1.8670887608442013, + "learning_rate": 9.377227663802913e-06, + "loss": 0.5355, + "step": 1497 + }, + { + "epoch": 0.19, + "grad_norm": 0.8171098999218793, + "learning_rate": 9.376255979330988e-06, + "loss": 0.5302, + "step": 1498 + }, + { + "epoch": 0.19, + "grad_norm": 2.2358050289181253, + "learning_rate": 9.375283587836368e-06, + "loss": 0.5323, + "step": 1499 + }, + { + "epoch": 0.19, + "grad_norm": 3.586997073057225, + "learning_rate": 9.374310489476149e-06, + "loss": 0.5512, + "step": 1500 + }, + { + "epoch": 0.19, + "grad_norm": 4.55579447130064, + "learning_rate": 9.373336684407545e-06, + "loss": 0.5447, + "step": 1501 + }, + { + "epoch": 0.19, + "grad_norm": 1.6129422186439102, + "learning_rate": 9.372362172787882e-06, + "loss": 0.4549, + "step": 1502 + }, + { + "epoch": 0.19, + "grad_norm": 6.055117530737087, + "learning_rate": 9.371386954774603e-06, + "loss": 0.5521, + "step": 1503 + }, + { + "epoch": 0.19, + "grad_norm": 1.6137472317864316, + "learning_rate": 9.370411030525261e-06, + "loss": 0.4941, + "step": 1504 + }, + { + "epoch": 0.19, + "grad_norm": 0.6825693446935227, + "learning_rate": 9.369434400197526e-06, + "loss": 0.4765, + "step": 1505 + }, + { + "epoch": 0.19, + "grad_norm": 3.0047785918869723, + "learning_rate": 9.36845706394918e-06, + "loss": 0.4718, + "step": 1506 + }, + { + "epoch": 0.19, + "grad_norm": 2.184404353395833, + "learning_rate": 9.367479021938123e-06, + "loss": 0.5639, + "step": 1507 + }, + { + "epoch": 0.19, + "grad_norm": 3.128493573070691, + "learning_rate": 9.366500274322365e-06, + "loss": 0.5322, + "step": 1508 + }, + { + "epoch": 0.19, + "grad_norm": 2.2077003304423544, + "learning_rate": 9.36552082126003e-06, + "loss": 0.5479, + "step": 1509 + }, + { + "epoch": 0.19, + "grad_norm": 2.5001203755032226, + "learning_rate": 9.364540662909358e-06, + "loss": 0.5705, + "step": 1510 + }, + { + "epoch": 0.19, + "grad_norm": 1.706994629835939, + "learning_rate": 9.363559799428704e-06, + "loss": 0.5147, + "step": 1511 + }, + { + "epoch": 0.19, + "grad_norm": 1.9071940375807923, + "learning_rate": 9.362578230976532e-06, + "loss": 0.5358, + "step": 1512 + }, + { + "epoch": 0.19, + "grad_norm": 2.199072784364784, + "learning_rate": 9.361595957711425e-06, + "loss": 0.5274, + "step": 1513 + }, + { + "epoch": 0.19, + "grad_norm": 3.0735830354933347, + "learning_rate": 9.360612979792078e-06, + "loss": 0.5311, + "step": 1514 + }, + { + "epoch": 0.19, + "grad_norm": 2.2066166573545036, + "learning_rate": 9.3596292973773e-06, + "loss": 0.4923, + "step": 1515 + }, + { + "epoch": 0.19, + "grad_norm": 2.4159872038811874, + "learning_rate": 9.358644910626012e-06, + "loss": 0.54, + "step": 1516 + }, + { + "epoch": 0.19, + "grad_norm": 2.479018280513656, + "learning_rate": 9.35765981969725e-06, + "loss": 0.5287, + "step": 1517 + }, + { + "epoch": 0.19, + "grad_norm": 3.089308304624241, + "learning_rate": 9.356674024750166e-06, + "loss": 0.5309, + "step": 1518 + }, + { + "epoch": 0.19, + "grad_norm": 2.2037636350371397, + "learning_rate": 9.355687525944025e-06, + "loss": 0.5754, + "step": 1519 + }, + { + "epoch": 0.19, + "grad_norm": 2.3408652508775556, + "learning_rate": 9.3547003234382e-06, + "loss": 0.4907, + "step": 1520 + }, + { + "epoch": 0.19, + "grad_norm": 1.913428552260077, + "learning_rate": 9.353712417392186e-06, + "loss": 0.5455, + "step": 1521 + }, + { + "epoch": 0.19, + "grad_norm": 3.6113559679228633, + "learning_rate": 9.352723807965586e-06, + "loss": 0.5563, + "step": 1522 + }, + { + "epoch": 0.19, + "grad_norm": 1.8314547370094012, + "learning_rate": 9.35173449531812e-06, + "loss": 0.4734, + "step": 1523 + }, + { + "epoch": 0.19, + "grad_norm": 2.0034022703098637, + "learning_rate": 9.350744479609622e-06, + "loss": 0.4954, + "step": 1524 + }, + { + "epoch": 0.19, + "grad_norm": 1.9027907885598594, + "learning_rate": 9.349753761000034e-06, + "loss": 0.5279, + "step": 1525 + }, + { + "epoch": 0.19, + "grad_norm": 1.928394262689405, + "learning_rate": 9.348762339649416e-06, + "loss": 0.5482, + "step": 1526 + }, + { + "epoch": 0.19, + "grad_norm": 2.638848020410885, + "learning_rate": 9.347770215717941e-06, + "loss": 0.6168, + "step": 1527 + }, + { + "epoch": 0.19, + "grad_norm": 1.9437404774526785, + "learning_rate": 9.346777389365896e-06, + "loss": 0.5453, + "step": 1528 + }, + { + "epoch": 0.19, + "grad_norm": 5.581529142975484, + "learning_rate": 9.345783860753681e-06, + "loss": 0.5647, + "step": 1529 + }, + { + "epoch": 0.19, + "grad_norm": 32.45158123886623, + "learning_rate": 9.344789630041811e-06, + "loss": 0.5425, + "step": 1530 + }, + { + "epoch": 0.19, + "grad_norm": 2.4210339557850866, + "learning_rate": 9.343794697390908e-06, + "loss": 0.5446, + "step": 1531 + }, + { + "epoch": 0.19, + "grad_norm": 2.0194607726636917, + "learning_rate": 9.342799062961716e-06, + "loss": 0.5179, + "step": 1532 + }, + { + "epoch": 0.19, + "grad_norm": 3.666606781187195, + "learning_rate": 9.341802726915088e-06, + "loss": 0.5297, + "step": 1533 + }, + { + "epoch": 0.19, + "grad_norm": 2.596514276782259, + "learning_rate": 9.340805689411989e-06, + "loss": 0.4748, + "step": 1534 + }, + { + "epoch": 0.19, + "grad_norm": 4.127376711598774, + "learning_rate": 9.339807950613502e-06, + "loss": 0.5213, + "step": 1535 + }, + { + "epoch": 0.19, + "grad_norm": 4.213578919065974, + "learning_rate": 9.338809510680818e-06, + "loss": 0.4784, + "step": 1536 + }, + { + "epoch": 0.19, + "grad_norm": 2.6406784786202318, + "learning_rate": 9.337810369775245e-06, + "loss": 0.5167, + "step": 1537 + }, + { + "epoch": 0.19, + "grad_norm": 4.710331055507111, + "learning_rate": 9.336810528058202e-06, + "loss": 0.513, + "step": 1538 + }, + { + "epoch": 0.19, + "grad_norm": 2.1709666221921617, + "learning_rate": 9.335809985691224e-06, + "loss": 0.5563, + "step": 1539 + }, + { + "epoch": 0.19, + "grad_norm": 2.128357963800714, + "learning_rate": 9.334808742835956e-06, + "loss": 0.5162, + "step": 1540 + }, + { + "epoch": 0.19, + "grad_norm": 2.18402549553364, + "learning_rate": 9.33380679965416e-06, + "loss": 0.5271, + "step": 1541 + }, + { + "epoch": 0.19, + "grad_norm": 3.855067283290758, + "learning_rate": 9.332804156307705e-06, + "loss": 0.4588, + "step": 1542 + }, + { + "epoch": 0.19, + "grad_norm": 1.7396959013303646, + "learning_rate": 9.331800812958576e-06, + "loss": 0.4747, + "step": 1543 + }, + { + "epoch": 0.19, + "grad_norm": 2.2383869045867417, + "learning_rate": 9.330796769768876e-06, + "loss": 0.4842, + "step": 1544 + }, + { + "epoch": 0.19, + "grad_norm": 5.501740378958404, + "learning_rate": 9.32979202690082e-06, + "loss": 0.5378, + "step": 1545 + }, + { + "epoch": 0.19, + "grad_norm": 2.5750775887331003, + "learning_rate": 9.328786584516725e-06, + "loss": 0.5284, + "step": 1546 + }, + { + "epoch": 0.19, + "grad_norm": 1.8052053950979472, + "learning_rate": 9.327780442779032e-06, + "loss": 0.4851, + "step": 1547 + }, + { + "epoch": 0.19, + "grad_norm": 2.419443490992145, + "learning_rate": 9.326773601850294e-06, + "loss": 0.4742, + "step": 1548 + }, + { + "epoch": 0.19, + "grad_norm": 1.79993293703367, + "learning_rate": 9.325766061893174e-06, + "loss": 0.5099, + "step": 1549 + }, + { + "epoch": 0.19, + "grad_norm": 1.9208065957405682, + "learning_rate": 9.324757823070448e-06, + "loss": 0.4399, + "step": 1550 + }, + { + "epoch": 0.19, + "grad_norm": 2.868678691846867, + "learning_rate": 9.323748885545006e-06, + "loss": 0.5128, + "step": 1551 + }, + { + "epoch": 0.19, + "grad_norm": 2.714265342211101, + "learning_rate": 9.322739249479853e-06, + "loss": 0.5015, + "step": 1552 + }, + { + "epoch": 0.19, + "grad_norm": 1.9555294131012604, + "learning_rate": 9.321728915038101e-06, + "loss": 0.4642, + "step": 1553 + }, + { + "epoch": 0.19, + "grad_norm": 1.956602369828446, + "learning_rate": 9.320717882382983e-06, + "loss": 0.5373, + "step": 1554 + }, + { + "epoch": 0.19, + "grad_norm": 2.0660493606510353, + "learning_rate": 9.319706151677837e-06, + "loss": 0.5311, + "step": 1555 + }, + { + "epoch": 0.19, + "grad_norm": 1.6164649314832973, + "learning_rate": 9.318693723086117e-06, + "loss": 0.5194, + "step": 1556 + }, + { + "epoch": 0.19, + "grad_norm": 2.8465365990890934, + "learning_rate": 9.317680596771389e-06, + "loss": 0.5243, + "step": 1557 + }, + { + "epoch": 0.19, + "grad_norm": 2.587440187024195, + "learning_rate": 9.316666772897336e-06, + "loss": 0.5472, + "step": 1558 + }, + { + "epoch": 0.19, + "grad_norm": 1.9684250433411536, + "learning_rate": 9.315652251627747e-06, + "loss": 0.5507, + "step": 1559 + }, + { + "epoch": 0.19, + "grad_norm": 2.7791357535604546, + "learning_rate": 9.314637033126529e-06, + "loss": 0.4917, + "step": 1560 + }, + { + "epoch": 0.19, + "grad_norm": 2.948195593969024, + "learning_rate": 9.313621117557696e-06, + "loss": 0.5052, + "step": 1561 + }, + { + "epoch": 0.19, + "grad_norm": 1.9947405314980446, + "learning_rate": 9.312604505085383e-06, + "loss": 0.4566, + "step": 1562 + }, + { + "epoch": 0.19, + "grad_norm": 2.2847089946324814, + "learning_rate": 9.311587195873828e-06, + "loss": 0.5639, + "step": 1563 + }, + { + "epoch": 0.19, + "grad_norm": 1.6519078478731801, + "learning_rate": 9.310569190087389e-06, + "loss": 0.5206, + "step": 1564 + }, + { + "epoch": 0.19, + "grad_norm": 5.990246449649651, + "learning_rate": 9.309550487890533e-06, + "loss": 0.5136, + "step": 1565 + }, + { + "epoch": 0.19, + "grad_norm": 1.839093204640025, + "learning_rate": 9.308531089447842e-06, + "loss": 0.5123, + "step": 1566 + }, + { + "epoch": 0.19, + "grad_norm": 8.90895040912295, + "learning_rate": 9.307510994924008e-06, + "loss": 0.5208, + "step": 1567 + }, + { + "epoch": 0.19, + "grad_norm": 0.6585239573059181, + "learning_rate": 9.306490204483834e-06, + "loss": 0.4448, + "step": 1568 + }, + { + "epoch": 0.19, + "grad_norm": 2.499030323961249, + "learning_rate": 9.305468718292239e-06, + "loss": 0.5165, + "step": 1569 + }, + { + "epoch": 0.19, + "grad_norm": 2.685184213028577, + "learning_rate": 9.304446536514253e-06, + "loss": 0.5815, + "step": 1570 + }, + { + "epoch": 0.19, + "grad_norm": 4.653346665510227, + "learning_rate": 9.303423659315021e-06, + "loss": 0.5341, + "step": 1571 + }, + { + "epoch": 0.2, + "grad_norm": 4.487347787640413, + "learning_rate": 9.302400086859792e-06, + "loss": 0.4511, + "step": 1572 + }, + { + "epoch": 0.2, + "grad_norm": 11.835863837095015, + "learning_rate": 9.30137581931394e-06, + "loss": 0.5304, + "step": 1573 + }, + { + "epoch": 0.2, + "grad_norm": 2.2039324524357453, + "learning_rate": 9.30035085684294e-06, + "loss": 0.4979, + "step": 1574 + }, + { + "epoch": 0.2, + "grad_norm": 1.9724914467420795, + "learning_rate": 9.299325199612387e-06, + "loss": 0.5541, + "step": 1575 + }, + { + "epoch": 0.2, + "grad_norm": 0.7016534015009618, + "learning_rate": 9.29829884778798e-06, + "loss": 0.501, + "step": 1576 + }, + { + "epoch": 0.2, + "grad_norm": 0.6458296648826666, + "learning_rate": 9.29727180153554e-06, + "loss": 0.4692, + "step": 1577 + }, + { + "epoch": 0.2, + "grad_norm": 2.2079147099479624, + "learning_rate": 9.296244061020993e-06, + "loss": 0.5151, + "step": 1578 + }, + { + "epoch": 0.2, + "grad_norm": 1.636960136634182, + "learning_rate": 9.295215626410382e-06, + "loss": 0.5485, + "step": 1579 + }, + { + "epoch": 0.2, + "grad_norm": 4.709804496192575, + "learning_rate": 9.294186497869854e-06, + "loss": 0.4918, + "step": 1580 + }, + { + "epoch": 0.2, + "grad_norm": 2.4096354356444096, + "learning_rate": 9.29315667556568e-06, + "loss": 0.4803, + "step": 1581 + }, + { + "epoch": 0.2, + "grad_norm": 3.0354798485566814, + "learning_rate": 9.292126159664231e-06, + "loss": 0.509, + "step": 1582 + }, + { + "epoch": 0.2, + "grad_norm": 2.2221754554853494, + "learning_rate": 9.291094950332002e-06, + "loss": 0.5475, + "step": 1583 + }, + { + "epoch": 0.2, + "grad_norm": 1.9423280254716395, + "learning_rate": 9.290063047735592e-06, + "loss": 0.5052, + "step": 1584 + }, + { + "epoch": 0.2, + "grad_norm": 2.1675876477187406, + "learning_rate": 9.289030452041712e-06, + "loss": 0.5406, + "step": 1585 + }, + { + "epoch": 0.2, + "grad_norm": 2.0501627862867737, + "learning_rate": 9.287997163417189e-06, + "loss": 0.5008, + "step": 1586 + }, + { + "epoch": 0.2, + "grad_norm": 1.9004854423947835, + "learning_rate": 9.286963182028956e-06, + "loss": 0.4894, + "step": 1587 + }, + { + "epoch": 0.2, + "grad_norm": 2.1099537386796596, + "learning_rate": 9.285928508044067e-06, + "loss": 0.5329, + "step": 1588 + }, + { + "epoch": 0.2, + "grad_norm": 2.8551813231815677, + "learning_rate": 9.284893141629681e-06, + "loss": 0.5531, + "step": 1589 + }, + { + "epoch": 0.2, + "grad_norm": 1.793130285133734, + "learning_rate": 9.283857082953069e-06, + "loss": 0.4937, + "step": 1590 + }, + { + "epoch": 0.2, + "grad_norm": 2.0523223005721443, + "learning_rate": 9.282820332181617e-06, + "loss": 0.5133, + "step": 1591 + }, + { + "epoch": 0.2, + "grad_norm": 3.8939020501115116, + "learning_rate": 9.281782889482819e-06, + "loss": 0.4925, + "step": 1592 + }, + { + "epoch": 0.2, + "grad_norm": 2.0010390620354106, + "learning_rate": 9.280744755024286e-06, + "loss": 0.5246, + "step": 1593 + }, + { + "epoch": 0.2, + "grad_norm": 2.0795143229091786, + "learning_rate": 9.279705928973736e-06, + "loss": 0.5206, + "step": 1594 + }, + { + "epoch": 0.2, + "grad_norm": 8.924256171448095, + "learning_rate": 9.278666411499e-06, + "loss": 0.507, + "step": 1595 + }, + { + "epoch": 0.2, + "grad_norm": 2.186584015434262, + "learning_rate": 9.277626202768024e-06, + "loss": 0.5503, + "step": 1596 + }, + { + "epoch": 0.2, + "grad_norm": 6.910497080223889, + "learning_rate": 9.276585302948861e-06, + "loss": 0.4957, + "step": 1597 + }, + { + "epoch": 0.2, + "grad_norm": 3.177423463231272, + "learning_rate": 9.275543712209675e-06, + "loss": 0.546, + "step": 1598 + }, + { + "epoch": 0.2, + "grad_norm": 0.7135803240589209, + "learning_rate": 9.27450143071875e-06, + "loss": 0.4871, + "step": 1599 + }, + { + "epoch": 0.2, + "grad_norm": 7.6993256970327915, + "learning_rate": 9.27345845864447e-06, + "loss": 0.5413, + "step": 1600 + }, + { + "epoch": 0.2, + "grad_norm": 2.760557263160197, + "learning_rate": 9.27241479615534e-06, + "loss": 0.5103, + "step": 1601 + }, + { + "epoch": 0.2, + "grad_norm": 1.91286078054797, + "learning_rate": 9.271370443419971e-06, + "loss": 0.5083, + "step": 1602 + }, + { + "epoch": 0.2, + "grad_norm": 2.2604935256819743, + "learning_rate": 9.270325400607087e-06, + "loss": 0.5342, + "step": 1603 + }, + { + "epoch": 0.2, + "grad_norm": 1.506226488554768, + "learning_rate": 9.269279667885527e-06, + "loss": 0.5538, + "step": 1604 + }, + { + "epoch": 0.2, + "grad_norm": 2.2697591097698377, + "learning_rate": 9.268233245424235e-06, + "loss": 0.5124, + "step": 1605 + }, + { + "epoch": 0.2, + "grad_norm": 1.5951640188619323, + "learning_rate": 9.267186133392272e-06, + "loss": 0.5352, + "step": 1606 + }, + { + "epoch": 0.2, + "grad_norm": 2.932511232716658, + "learning_rate": 9.266138331958805e-06, + "loss": 0.4822, + "step": 1607 + }, + { + "epoch": 0.2, + "grad_norm": 2.9386140715583693, + "learning_rate": 9.265089841293119e-06, + "loss": 0.4821, + "step": 1608 + }, + { + "epoch": 0.2, + "grad_norm": 0.6560129362849357, + "learning_rate": 9.264040661564606e-06, + "loss": 0.4677, + "step": 1609 + }, + { + "epoch": 0.2, + "grad_norm": 1.8727163082095566, + "learning_rate": 9.262990792942769e-06, + "loss": 0.5077, + "step": 1610 + }, + { + "epoch": 0.2, + "grad_norm": 0.6686106337025884, + "learning_rate": 9.261940235597225e-06, + "loss": 0.4837, + "step": 1611 + }, + { + "epoch": 0.2, + "grad_norm": 1.7054558852668906, + "learning_rate": 9.260888989697699e-06, + "loss": 0.5034, + "step": 1612 + }, + { + "epoch": 0.2, + "grad_norm": 1.5885063961864796, + "learning_rate": 9.259837055414032e-06, + "loss": 0.5449, + "step": 1613 + }, + { + "epoch": 0.2, + "grad_norm": 1.5075623137868153, + "learning_rate": 9.258784432916169e-06, + "loss": 0.5099, + "step": 1614 + }, + { + "epoch": 0.2, + "grad_norm": 2.4840673530181774, + "learning_rate": 9.257731122374175e-06, + "loss": 0.5326, + "step": 1615 + }, + { + "epoch": 0.2, + "grad_norm": 1.5251406015657, + "learning_rate": 9.256677123958218e-06, + "loss": 0.4976, + "step": 1616 + }, + { + "epoch": 0.2, + "grad_norm": 1.6575980627911382, + "learning_rate": 9.255622437838583e-06, + "loss": 0.4913, + "step": 1617 + }, + { + "epoch": 0.2, + "grad_norm": 1.5594753292416925, + "learning_rate": 9.254567064185662e-06, + "loss": 0.5068, + "step": 1618 + }, + { + "epoch": 0.2, + "grad_norm": 3.8252392196648803, + "learning_rate": 9.253511003169962e-06, + "loss": 0.5559, + "step": 1619 + }, + { + "epoch": 0.2, + "grad_norm": 6.602915905442299, + "learning_rate": 9.252454254962098e-06, + "loss": 0.5077, + "step": 1620 + }, + { + "epoch": 0.2, + "grad_norm": 2.3477427111270988, + "learning_rate": 9.251396819732796e-06, + "loss": 0.5271, + "step": 1621 + }, + { + "epoch": 0.2, + "grad_norm": 2.08775411330249, + "learning_rate": 9.250338697652894e-06, + "loss": 0.5081, + "step": 1622 + }, + { + "epoch": 0.2, + "grad_norm": 1.6607209226617103, + "learning_rate": 9.249279888893343e-06, + "loss": 0.5438, + "step": 1623 + }, + { + "epoch": 0.2, + "grad_norm": 1.6352177119328937, + "learning_rate": 9.248220393625203e-06, + "loss": 0.5325, + "step": 1624 + }, + { + "epoch": 0.2, + "grad_norm": 2.1768861690048777, + "learning_rate": 9.247160212019642e-06, + "loss": 0.5183, + "step": 1625 + }, + { + "epoch": 0.2, + "grad_norm": 2.385573126299658, + "learning_rate": 9.246099344247942e-06, + "loss": 0.5403, + "step": 1626 + }, + { + "epoch": 0.2, + "grad_norm": 3.489702228564718, + "learning_rate": 9.245037790481497e-06, + "loss": 0.5016, + "step": 1627 + }, + { + "epoch": 0.2, + "grad_norm": 3.341294944395603, + "learning_rate": 9.243975550891811e-06, + "loss": 0.4733, + "step": 1628 + }, + { + "epoch": 0.2, + "grad_norm": 3.3586603475734966, + "learning_rate": 9.242912625650497e-06, + "loss": 0.499, + "step": 1629 + }, + { + "epoch": 0.2, + "grad_norm": 2.2481656074495446, + "learning_rate": 9.24184901492928e-06, + "loss": 0.4805, + "step": 1630 + }, + { + "epoch": 0.2, + "grad_norm": 2.3536741952704943, + "learning_rate": 9.240784718899996e-06, + "loss": 0.5595, + "step": 1631 + }, + { + "epoch": 0.2, + "grad_norm": 2.099961793335952, + "learning_rate": 9.23971973773459e-06, + "loss": 0.463, + "step": 1632 + }, + { + "epoch": 0.2, + "grad_norm": 2.036367273374173, + "learning_rate": 9.238654071605122e-06, + "loss": 0.5815, + "step": 1633 + }, + { + "epoch": 0.2, + "grad_norm": 1.6846706618374232, + "learning_rate": 9.237587720683757e-06, + "loss": 0.5025, + "step": 1634 + }, + { + "epoch": 0.2, + "grad_norm": 1.7727570659719454, + "learning_rate": 9.236520685142774e-06, + "loss": 0.4953, + "step": 1635 + }, + { + "epoch": 0.2, + "grad_norm": 2.125186372058996, + "learning_rate": 9.235452965154563e-06, + "loss": 0.4817, + "step": 1636 + }, + { + "epoch": 0.2, + "grad_norm": 2.119531132765183, + "learning_rate": 9.234384560891623e-06, + "loss": 0.5453, + "step": 1637 + }, + { + "epoch": 0.2, + "grad_norm": 1.7431348160011935, + "learning_rate": 9.233315472526564e-06, + "loss": 0.5565, + "step": 1638 + }, + { + "epoch": 0.2, + "grad_norm": 2.0585035517080037, + "learning_rate": 9.232245700232106e-06, + "loss": 0.5027, + "step": 1639 + }, + { + "epoch": 0.2, + "grad_norm": 1.890438150417577, + "learning_rate": 9.231175244181081e-06, + "loss": 0.5089, + "step": 1640 + }, + { + "epoch": 0.2, + "grad_norm": 1.7057878522990162, + "learning_rate": 9.230104104546432e-06, + "loss": 0.4724, + "step": 1641 + }, + { + "epoch": 0.2, + "grad_norm": 2.412450440165766, + "learning_rate": 9.229032281501209e-06, + "loss": 0.4912, + "step": 1642 + }, + { + "epoch": 0.2, + "grad_norm": 1.886096384576177, + "learning_rate": 9.227959775218573e-06, + "loss": 0.5774, + "step": 1643 + }, + { + "epoch": 0.2, + "grad_norm": 46.04729752018606, + "learning_rate": 9.2268865858718e-06, + "loss": 0.5116, + "step": 1644 + }, + { + "epoch": 0.2, + "grad_norm": 1.643269640799219, + "learning_rate": 9.225812713634272e-06, + "loss": 0.5138, + "step": 1645 + }, + { + "epoch": 0.2, + "grad_norm": 2.3246059542850226, + "learning_rate": 9.224738158679482e-06, + "loss": 0.4874, + "step": 1646 + }, + { + "epoch": 0.2, + "grad_norm": 1.9035809681676414, + "learning_rate": 9.223662921181036e-06, + "loss": 0.5225, + "step": 1647 + }, + { + "epoch": 0.2, + "grad_norm": 4.151399192600683, + "learning_rate": 9.222587001312643e-06, + "loss": 0.532, + "step": 1648 + }, + { + "epoch": 0.2, + "grad_norm": 6.844265666331204, + "learning_rate": 9.221510399248135e-06, + "loss": 0.5036, + "step": 1649 + }, + { + "epoch": 0.2, + "grad_norm": 1.4196112703467962, + "learning_rate": 9.220433115161438e-06, + "loss": 0.4798, + "step": 1650 + }, + { + "epoch": 0.2, + "grad_norm": 2.252588897696059, + "learning_rate": 9.219355149226604e-06, + "loss": 0.5083, + "step": 1651 + }, + { + "epoch": 0.21, + "grad_norm": 0.6791610636974595, + "learning_rate": 9.218276501617784e-06, + "loss": 0.5234, + "step": 1652 + }, + { + "epoch": 0.21, + "grad_norm": 1.6705201817878617, + "learning_rate": 9.217197172509245e-06, + "loss": 0.4772, + "step": 1653 + }, + { + "epoch": 0.21, + "grad_norm": 1.5807569647746058, + "learning_rate": 9.216117162075358e-06, + "loss": 0.537, + "step": 1654 + }, + { + "epoch": 0.21, + "grad_norm": 1.9277385315437, + "learning_rate": 9.215036470490614e-06, + "loss": 0.4648, + "step": 1655 + }, + { + "epoch": 0.21, + "grad_norm": 5.4803467893276085, + "learning_rate": 9.213955097929605e-06, + "loss": 0.5672, + "step": 1656 + }, + { + "epoch": 0.21, + "grad_norm": 6.188819558803046, + "learning_rate": 9.212873044567037e-06, + "loss": 0.5753, + "step": 1657 + }, + { + "epoch": 0.21, + "grad_norm": 1.865195647290685, + "learning_rate": 9.211790310577723e-06, + "loss": 0.5022, + "step": 1658 + }, + { + "epoch": 0.21, + "grad_norm": 1.4354281045268755, + "learning_rate": 9.210706896136592e-06, + "loss": 0.4858, + "step": 1659 + }, + { + "epoch": 0.21, + "grad_norm": 3.2891107036246554, + "learning_rate": 9.209622801418676e-06, + "loss": 0.4664, + "step": 1660 + }, + { + "epoch": 0.21, + "grad_norm": 2.4323787584802106, + "learning_rate": 9.208538026599124e-06, + "loss": 0.4623, + "step": 1661 + }, + { + "epoch": 0.21, + "grad_norm": 6.08512998477563, + "learning_rate": 9.207452571853184e-06, + "loss": 0.5376, + "step": 1662 + }, + { + "epoch": 0.21, + "grad_norm": 1.5491397801481632, + "learning_rate": 9.206366437356229e-06, + "loss": 0.4774, + "step": 1663 + }, + { + "epoch": 0.21, + "grad_norm": 2.6062166151520283, + "learning_rate": 9.205279623283726e-06, + "loss": 0.5563, + "step": 1664 + }, + { + "epoch": 0.21, + "grad_norm": 4.764291375497078, + "learning_rate": 9.204192129811267e-06, + "loss": 0.5121, + "step": 1665 + }, + { + "epoch": 0.21, + "grad_norm": 1.7158970499491477, + "learning_rate": 9.203103957114542e-06, + "loss": 0.5255, + "step": 1666 + }, + { + "epoch": 0.21, + "grad_norm": 1.4593279660480194, + "learning_rate": 9.202015105369352e-06, + "loss": 0.4987, + "step": 1667 + }, + { + "epoch": 0.21, + "grad_norm": 1.8706544392705933, + "learning_rate": 9.200925574751616e-06, + "loss": 0.5272, + "step": 1668 + }, + { + "epoch": 0.21, + "grad_norm": 3.5665826039814967, + "learning_rate": 9.199835365437353e-06, + "loss": 0.4803, + "step": 1669 + }, + { + "epoch": 0.21, + "grad_norm": 1.5876414430314243, + "learning_rate": 9.1987444776027e-06, + "loss": 0.4969, + "step": 1670 + }, + { + "epoch": 0.21, + "grad_norm": 1.8052599557337923, + "learning_rate": 9.197652911423896e-06, + "loss": 0.4989, + "step": 1671 + }, + { + "epoch": 0.21, + "grad_norm": 2.1687653891029224, + "learning_rate": 9.196560667077294e-06, + "loss": 0.5182, + "step": 1672 + }, + { + "epoch": 0.21, + "grad_norm": 1.9959404100536227, + "learning_rate": 9.19546774473936e-06, + "loss": 0.5439, + "step": 1673 + }, + { + "epoch": 0.21, + "grad_norm": 1.6202007061973254, + "learning_rate": 9.194374144586657e-06, + "loss": 0.5115, + "step": 1674 + }, + { + "epoch": 0.21, + "grad_norm": 1.9857764472674335, + "learning_rate": 9.193279866795872e-06, + "loss": 0.5182, + "step": 1675 + }, + { + "epoch": 0.21, + "grad_norm": 1.9178192223929929, + "learning_rate": 9.192184911543794e-06, + "loss": 0.4937, + "step": 1676 + }, + { + "epoch": 0.21, + "grad_norm": 3.602441531524967, + "learning_rate": 9.19108927900732e-06, + "loss": 0.5622, + "step": 1677 + }, + { + "epoch": 0.21, + "grad_norm": 1.7741676838270075, + "learning_rate": 9.189992969363463e-06, + "loss": 0.4866, + "step": 1678 + }, + { + "epoch": 0.21, + "grad_norm": 1.48544166609113, + "learning_rate": 9.18889598278934e-06, + "loss": 0.5614, + "step": 1679 + }, + { + "epoch": 0.21, + "grad_norm": 1.533303252704799, + "learning_rate": 9.187798319462177e-06, + "loss": 0.5184, + "step": 1680 + }, + { + "epoch": 0.21, + "grad_norm": 1.6051159343776416, + "learning_rate": 9.186699979559314e-06, + "loss": 0.5394, + "step": 1681 + }, + { + "epoch": 0.21, + "grad_norm": 2.432811234549796, + "learning_rate": 9.185600963258194e-06, + "loss": 0.4813, + "step": 1682 + }, + { + "epoch": 0.21, + "grad_norm": 1.408143189466315, + "learning_rate": 9.184501270736378e-06, + "loss": 0.471, + "step": 1683 + }, + { + "epoch": 0.21, + "grad_norm": 0.659302765312688, + "learning_rate": 9.183400902171527e-06, + "loss": 0.5021, + "step": 1684 + }, + { + "epoch": 0.21, + "grad_norm": 1.3610849090522068, + "learning_rate": 9.182299857741415e-06, + "loss": 0.4949, + "step": 1685 + }, + { + "epoch": 0.21, + "grad_norm": 1.7850082372585498, + "learning_rate": 9.181198137623925e-06, + "loss": 0.5339, + "step": 1686 + }, + { + "epoch": 0.21, + "grad_norm": 1.7702658610128066, + "learning_rate": 9.180095741997055e-06, + "loss": 0.4974, + "step": 1687 + }, + { + "epoch": 0.21, + "grad_norm": 1.6362295093390105, + "learning_rate": 9.1789926710389e-06, + "loss": 0.6164, + "step": 1688 + }, + { + "epoch": 0.21, + "grad_norm": 1.651506207170075, + "learning_rate": 9.177888924927675e-06, + "loss": 0.5517, + "step": 1689 + }, + { + "epoch": 0.21, + "grad_norm": 1.7460872196879362, + "learning_rate": 9.176784503841699e-06, + "loss": 0.5577, + "step": 1690 + }, + { + "epoch": 0.21, + "grad_norm": 1.4073768477156903, + "learning_rate": 9.175679407959399e-06, + "loss": 0.5033, + "step": 1691 + }, + { + "epoch": 0.21, + "grad_norm": 1.9985932651418432, + "learning_rate": 9.174573637459317e-06, + "loss": 0.5495, + "step": 1692 + }, + { + "epoch": 0.21, + "grad_norm": 2.634411342635036, + "learning_rate": 9.173467192520095e-06, + "loss": 0.5501, + "step": 1693 + }, + { + "epoch": 0.21, + "grad_norm": 1.768504397733744, + "learning_rate": 9.172360073320493e-06, + "loss": 0.496, + "step": 1694 + }, + { + "epoch": 0.21, + "grad_norm": 1.5573104075423723, + "learning_rate": 9.171252280039374e-06, + "loss": 0.5204, + "step": 1695 + }, + { + "epoch": 0.21, + "grad_norm": 1.3393545628964552, + "learning_rate": 9.170143812855711e-06, + "loss": 0.5045, + "step": 1696 + }, + { + "epoch": 0.21, + "grad_norm": 1.975789021549798, + "learning_rate": 9.169034671948589e-06, + "loss": 0.5396, + "step": 1697 + }, + { + "epoch": 0.21, + "grad_norm": 1.6801972267122265, + "learning_rate": 9.167924857497197e-06, + "loss": 0.5435, + "step": 1698 + }, + { + "epoch": 0.21, + "grad_norm": 0.7246085283194033, + "learning_rate": 9.166814369680837e-06, + "loss": 0.5395, + "step": 1699 + }, + { + "epoch": 0.21, + "grad_norm": 1.7107966860149284, + "learning_rate": 9.165703208678917e-06, + "loss": 0.5077, + "step": 1700 + }, + { + "epoch": 0.21, + "grad_norm": 1.9115099585645674, + "learning_rate": 9.164591374670957e-06, + "loss": 0.5343, + "step": 1701 + }, + { + "epoch": 0.21, + "grad_norm": 1.588858967214631, + "learning_rate": 9.163478867836582e-06, + "loss": 0.58, + "step": 1702 + }, + { + "epoch": 0.21, + "grad_norm": 7.554126927044856, + "learning_rate": 9.162365688355526e-06, + "loss": 0.4872, + "step": 1703 + }, + { + "epoch": 0.21, + "grad_norm": 1.3298965104985374, + "learning_rate": 9.161251836407635e-06, + "loss": 0.5012, + "step": 1704 + }, + { + "epoch": 0.21, + "grad_norm": 1.4103330693756335, + "learning_rate": 9.16013731217286e-06, + "loss": 0.5279, + "step": 1705 + }, + { + "epoch": 0.21, + "grad_norm": 1.2852666812092637, + "learning_rate": 9.159022115831263e-06, + "loss": 0.5313, + "step": 1706 + }, + { + "epoch": 0.21, + "grad_norm": 1.3941519860589664, + "learning_rate": 9.157906247563013e-06, + "loss": 0.5133, + "step": 1707 + }, + { + "epoch": 0.21, + "grad_norm": 1.413329364028389, + "learning_rate": 9.15678970754839e-06, + "loss": 0.5126, + "step": 1708 + }, + { + "epoch": 0.21, + "grad_norm": 1.42572581665144, + "learning_rate": 9.15567249596778e-06, + "loss": 0.5391, + "step": 1709 + }, + { + "epoch": 0.21, + "grad_norm": 1.739416463421691, + "learning_rate": 9.154554613001679e-06, + "loss": 0.5283, + "step": 1710 + }, + { + "epoch": 0.21, + "grad_norm": 2.014337410080953, + "learning_rate": 9.15343605883069e-06, + "loss": 0.5153, + "step": 1711 + }, + { + "epoch": 0.21, + "grad_norm": 1.9909225909292738, + "learning_rate": 9.152316833635526e-06, + "loss": 0.5594, + "step": 1712 + }, + { + "epoch": 0.21, + "grad_norm": 2.7323915263313325, + "learning_rate": 9.151196937597006e-06, + "loss": 0.5514, + "step": 1713 + }, + { + "epoch": 0.21, + "grad_norm": 1.5930929939562986, + "learning_rate": 9.150076370896061e-06, + "loss": 0.5265, + "step": 1714 + }, + { + "epoch": 0.21, + "grad_norm": 1.561376133480682, + "learning_rate": 9.148955133713728e-06, + "loss": 0.4775, + "step": 1715 + }, + { + "epoch": 0.21, + "grad_norm": 1.357765216075096, + "learning_rate": 9.147833226231151e-06, + "loss": 0.4957, + "step": 1716 + }, + { + "epoch": 0.21, + "grad_norm": 15.316501717867558, + "learning_rate": 9.146710648629587e-06, + "loss": 0.5084, + "step": 1717 + }, + { + "epoch": 0.21, + "grad_norm": 1.622111052086209, + "learning_rate": 9.145587401090394e-06, + "loss": 0.5024, + "step": 1718 + }, + { + "epoch": 0.21, + "grad_norm": 0.6842905276591803, + "learning_rate": 9.144463483795049e-06, + "loss": 0.5024, + "step": 1719 + }, + { + "epoch": 0.21, + "grad_norm": 2.7523349635715433, + "learning_rate": 9.143338896925124e-06, + "loss": 0.5035, + "step": 1720 + }, + { + "epoch": 0.21, + "grad_norm": 1.9129601709051165, + "learning_rate": 9.142213640662312e-06, + "loss": 0.6002, + "step": 1721 + }, + { + "epoch": 0.21, + "grad_norm": 1.7903992740470418, + "learning_rate": 9.141087715188402e-06, + "loss": 0.5401, + "step": 1722 + }, + { + "epoch": 0.21, + "grad_norm": 1.9102915119427542, + "learning_rate": 9.1399611206853e-06, + "loss": 0.5402, + "step": 1723 + }, + { + "epoch": 0.21, + "grad_norm": 1.5467085531898992, + "learning_rate": 9.138833857335021e-06, + "loss": 0.5115, + "step": 1724 + }, + { + "epoch": 0.21, + "grad_norm": 3.1841285847142387, + "learning_rate": 9.137705925319677e-06, + "loss": 0.5619, + "step": 1725 + }, + { + "epoch": 0.21, + "grad_norm": 0.6965895219325391, + "learning_rate": 9.136577324821501e-06, + "loss": 0.4995, + "step": 1726 + }, + { + "epoch": 0.21, + "grad_norm": 1.6859448097443563, + "learning_rate": 9.135448056022827e-06, + "loss": 0.4778, + "step": 1727 + }, + { + "epoch": 0.21, + "grad_norm": 1.2927993289518513, + "learning_rate": 9.134318119106098e-06, + "loss": 0.4931, + "step": 1728 + }, + { + "epoch": 0.21, + "grad_norm": 1.2036250081564543, + "learning_rate": 9.133187514253862e-06, + "loss": 0.4606, + "step": 1729 + }, + { + "epoch": 0.21, + "grad_norm": 1.4265361266647725, + "learning_rate": 9.132056241648784e-06, + "loss": 0.5319, + "step": 1730 + }, + { + "epoch": 0.21, + "grad_norm": 1.3683153281725227, + "learning_rate": 9.130924301473627e-06, + "loss": 0.512, + "step": 1731 + }, + { + "epoch": 0.21, + "grad_norm": 3.3247604798264128, + "learning_rate": 9.129791693911268e-06, + "loss": 0.5108, + "step": 1732 + }, + { + "epoch": 0.22, + "grad_norm": 1.446197135116374, + "learning_rate": 9.128658419144689e-06, + "loss": 0.5134, + "step": 1733 + }, + { + "epoch": 0.22, + "grad_norm": 1.4740690486820653, + "learning_rate": 9.127524477356979e-06, + "loss": 0.4845, + "step": 1734 + }, + { + "epoch": 0.22, + "grad_norm": 1.5251618719712636, + "learning_rate": 9.12638986873134e-06, + "loss": 0.5262, + "step": 1735 + }, + { + "epoch": 0.22, + "grad_norm": 1.3988676213731237, + "learning_rate": 9.125254593451074e-06, + "loss": 0.5271, + "step": 1736 + }, + { + "epoch": 0.22, + "grad_norm": 1.4890342420503115, + "learning_rate": 9.124118651699596e-06, + "loss": 0.4984, + "step": 1737 + }, + { + "epoch": 0.22, + "grad_norm": 0.7355545059416543, + "learning_rate": 9.122982043660428e-06, + "loss": 0.5047, + "step": 1738 + }, + { + "epoch": 0.22, + "grad_norm": 1.4378053013014622, + "learning_rate": 9.121844769517201e-06, + "loss": 0.5143, + "step": 1739 + }, + { + "epoch": 0.22, + "grad_norm": 2.0339115280934568, + "learning_rate": 9.120706829453648e-06, + "loss": 0.5517, + "step": 1740 + }, + { + "epoch": 0.22, + "grad_norm": 3.1660031363051604, + "learning_rate": 9.119568223653614e-06, + "loss": 0.5358, + "step": 1741 + }, + { + "epoch": 0.22, + "grad_norm": 0.646667354488079, + "learning_rate": 9.118428952301052e-06, + "loss": 0.5022, + "step": 1742 + }, + { + "epoch": 0.22, + "grad_norm": 2.6285272562157975, + "learning_rate": 9.117289015580022e-06, + "loss": 0.5378, + "step": 1743 + }, + { + "epoch": 0.22, + "grad_norm": 1.3783401691815536, + "learning_rate": 9.116148413674688e-06, + "loss": 0.5544, + "step": 1744 + }, + { + "epoch": 0.22, + "grad_norm": 1.8822723672322517, + "learning_rate": 9.115007146769326e-06, + "loss": 0.5124, + "step": 1745 + }, + { + "epoch": 0.22, + "grad_norm": 1.791138047925575, + "learning_rate": 9.11386521504832e-06, + "loss": 0.5198, + "step": 1746 + }, + { + "epoch": 0.22, + "grad_norm": 1.7874423277192169, + "learning_rate": 9.112722618696155e-06, + "loss": 0.5609, + "step": 1747 + }, + { + "epoch": 0.22, + "grad_norm": 1.5767870564083402, + "learning_rate": 9.11157935789743e-06, + "loss": 0.5131, + "step": 1748 + }, + { + "epoch": 0.22, + "grad_norm": 1.7726451129948841, + "learning_rate": 9.110435432836847e-06, + "loss": 0.6044, + "step": 1749 + }, + { + "epoch": 0.22, + "grad_norm": 1.6653491459083822, + "learning_rate": 9.109290843699219e-06, + "loss": 0.531, + "step": 1750 + }, + { + "epoch": 0.22, + "grad_norm": 1.3072485709613986, + "learning_rate": 9.108145590669464e-06, + "loss": 0.5101, + "step": 1751 + }, + { + "epoch": 0.22, + "grad_norm": 2.094158973402322, + "learning_rate": 9.106999673932608e-06, + "loss": 0.5323, + "step": 1752 + }, + { + "epoch": 0.22, + "grad_norm": 1.620502440367058, + "learning_rate": 9.105853093673782e-06, + "loss": 0.5151, + "step": 1753 + }, + { + "epoch": 0.22, + "grad_norm": 2.028921567090642, + "learning_rate": 9.104705850078229e-06, + "loss": 0.5269, + "step": 1754 + }, + { + "epoch": 0.22, + "grad_norm": 1.8064643452958555, + "learning_rate": 9.103557943331294e-06, + "loss": 0.5825, + "step": 1755 + }, + { + "epoch": 0.22, + "grad_norm": 1.6589087033035828, + "learning_rate": 9.102409373618433e-06, + "loss": 0.5637, + "step": 1756 + }, + { + "epoch": 0.22, + "grad_norm": 1.5052128611635847, + "learning_rate": 9.101260141125207e-06, + "loss": 0.5489, + "step": 1757 + }, + { + "epoch": 0.22, + "grad_norm": 2.444200630168782, + "learning_rate": 9.100110246037284e-06, + "loss": 0.523, + "step": 1758 + }, + { + "epoch": 0.22, + "grad_norm": 1.3240725750282878, + "learning_rate": 9.09895968854044e-06, + "loss": 0.5259, + "step": 1759 + }, + { + "epoch": 0.22, + "grad_norm": 1.4390570656456079, + "learning_rate": 9.097808468820559e-06, + "loss": 0.5009, + "step": 1760 + }, + { + "epoch": 0.22, + "grad_norm": 0.7482633852588438, + "learning_rate": 9.096656587063628e-06, + "loss": 0.5182, + "step": 1761 + }, + { + "epoch": 0.22, + "grad_norm": 1.4406976180426263, + "learning_rate": 9.095504043455747e-06, + "loss": 0.5115, + "step": 1762 + }, + { + "epoch": 0.22, + "grad_norm": 1.3518685475810814, + "learning_rate": 9.094350838183117e-06, + "loss": 0.5111, + "step": 1763 + }, + { + "epoch": 0.22, + "grad_norm": 2.2916236319235064, + "learning_rate": 9.093196971432048e-06, + "loss": 0.5317, + "step": 1764 + }, + { + "epoch": 0.22, + "grad_norm": 1.8835108269892396, + "learning_rate": 9.09204244338896e-06, + "loss": 0.549, + "step": 1765 + }, + { + "epoch": 0.22, + "grad_norm": 1.6435708630601518, + "learning_rate": 9.090887254240375e-06, + "loss": 0.5201, + "step": 1766 + }, + { + "epoch": 0.22, + "grad_norm": 2.8321184478689867, + "learning_rate": 9.089731404172926e-06, + "loss": 0.5411, + "step": 1767 + }, + { + "epoch": 0.22, + "grad_norm": 1.2943460426664266, + "learning_rate": 9.08857489337335e-06, + "loss": 0.5587, + "step": 1768 + }, + { + "epoch": 0.22, + "grad_norm": 1.6125378503682504, + "learning_rate": 9.08741772202849e-06, + "loss": 0.4937, + "step": 1769 + }, + { + "epoch": 0.22, + "grad_norm": 2.2238112852588428, + "learning_rate": 9.086259890325297e-06, + "loss": 0.5269, + "step": 1770 + }, + { + "epoch": 0.22, + "grad_norm": 1.6489029783453386, + "learning_rate": 9.08510139845083e-06, + "loss": 0.5192, + "step": 1771 + }, + { + "epoch": 0.22, + "grad_norm": 1.552669094441354, + "learning_rate": 9.083942246592256e-06, + "loss": 0.5442, + "step": 1772 + }, + { + "epoch": 0.22, + "grad_norm": 1.3550158439915052, + "learning_rate": 9.082782434936844e-06, + "loss": 0.4781, + "step": 1773 + }, + { + "epoch": 0.22, + "grad_norm": 1.3058746893566382, + "learning_rate": 9.08162196367197e-06, + "loss": 0.5051, + "step": 1774 + }, + { + "epoch": 0.22, + "grad_norm": 1.5481492836508022, + "learning_rate": 9.08046083298512e-06, + "loss": 0.5488, + "step": 1775 + }, + { + "epoch": 0.22, + "grad_norm": 1.4310769766264753, + "learning_rate": 9.079299043063885e-06, + "loss": 0.5832, + "step": 1776 + }, + { + "epoch": 0.22, + "grad_norm": 1.6531261378265354, + "learning_rate": 9.078136594095964e-06, + "loss": 0.5043, + "step": 1777 + }, + { + "epoch": 0.22, + "grad_norm": 2.0159329090312497, + "learning_rate": 9.076973486269158e-06, + "loss": 0.517, + "step": 1778 + }, + { + "epoch": 0.22, + "grad_norm": 1.9511807427420154, + "learning_rate": 9.075809719771378e-06, + "loss": 0.5206, + "step": 1779 + }, + { + "epoch": 0.22, + "grad_norm": 1.5978232229037173, + "learning_rate": 9.074645294790643e-06, + "loss": 0.5324, + "step": 1780 + }, + { + "epoch": 0.22, + "grad_norm": 1.4798428851101637, + "learning_rate": 9.073480211515071e-06, + "loss": 0.5067, + "step": 1781 + }, + { + "epoch": 0.22, + "grad_norm": 2.0306639014145014, + "learning_rate": 9.0723144701329e-06, + "loss": 0.5573, + "step": 1782 + }, + { + "epoch": 0.22, + "grad_norm": 1.4728696179541005, + "learning_rate": 9.071148070832456e-06, + "loss": 0.5273, + "step": 1783 + }, + { + "epoch": 0.22, + "grad_norm": 1.627559526949772, + "learning_rate": 9.069981013802188e-06, + "loss": 0.5133, + "step": 1784 + }, + { + "epoch": 0.22, + "grad_norm": 1.4834810512434706, + "learning_rate": 9.06881329923064e-06, + "loss": 0.4869, + "step": 1785 + }, + { + "epoch": 0.22, + "grad_norm": 1.372365155180784, + "learning_rate": 9.067644927306471e-06, + "loss": 0.5169, + "step": 1786 + }, + { + "epoch": 0.22, + "grad_norm": 0.6951708523849913, + "learning_rate": 9.066475898218439e-06, + "loss": 0.4823, + "step": 1787 + }, + { + "epoch": 0.22, + "grad_norm": 1.3878873806746732, + "learning_rate": 9.06530621215541e-06, + "loss": 0.55, + "step": 1788 + }, + { + "epoch": 0.22, + "grad_norm": 1.4695670506341068, + "learning_rate": 9.064135869306359e-06, + "loss": 0.4979, + "step": 1789 + }, + { + "epoch": 0.22, + "grad_norm": 1.3469285321385347, + "learning_rate": 9.062964869860364e-06, + "loss": 0.5153, + "step": 1790 + }, + { + "epoch": 0.22, + "grad_norm": 1.3684711900202686, + "learning_rate": 9.06179321400661e-06, + "loss": 0.5329, + "step": 1791 + }, + { + "epoch": 0.22, + "grad_norm": 1.4754969731161858, + "learning_rate": 9.060620901934393e-06, + "loss": 0.5619, + "step": 1792 + }, + { + "epoch": 0.22, + "grad_norm": 1.3439809939316543, + "learning_rate": 9.059447933833103e-06, + "loss": 0.4887, + "step": 1793 + }, + { + "epoch": 0.22, + "grad_norm": 1.4620094045772167, + "learning_rate": 9.058274309892248e-06, + "loss": 0.5352, + "step": 1794 + }, + { + "epoch": 0.22, + "grad_norm": 1.336998172657835, + "learning_rate": 9.057100030301438e-06, + "loss": 0.5195, + "step": 1795 + }, + { + "epoch": 0.22, + "grad_norm": 7.740380989435937, + "learning_rate": 9.055925095250384e-06, + "loss": 0.5416, + "step": 1796 + }, + { + "epoch": 0.22, + "grad_norm": 2.654630803377701, + "learning_rate": 9.05474950492891e-06, + "loss": 0.4519, + "step": 1797 + }, + { + "epoch": 0.22, + "grad_norm": 1.9937748629540164, + "learning_rate": 9.053573259526941e-06, + "loss": 0.5766, + "step": 1798 + }, + { + "epoch": 0.22, + "grad_norm": 1.7901930378208728, + "learning_rate": 9.052396359234514e-06, + "loss": 0.4942, + "step": 1799 + }, + { + "epoch": 0.22, + "grad_norm": 0.6924941301747856, + "learning_rate": 9.051218804241764e-06, + "loss": 0.5251, + "step": 1800 + }, + { + "epoch": 0.22, + "grad_norm": 1.5309637058525534, + "learning_rate": 9.050040594738937e-06, + "loss": 0.5792, + "step": 1801 + }, + { + "epoch": 0.22, + "grad_norm": 1.355722641980439, + "learning_rate": 9.048861730916381e-06, + "loss": 0.4687, + "step": 1802 + }, + { + "epoch": 0.22, + "grad_norm": 1.845814720364885, + "learning_rate": 9.047682212964553e-06, + "loss": 0.5346, + "step": 1803 + }, + { + "epoch": 0.22, + "grad_norm": 1.3425542240340513, + "learning_rate": 9.046502041074014e-06, + "loss": 0.481, + "step": 1804 + }, + { + "epoch": 0.22, + "grad_norm": 1.617695978622734, + "learning_rate": 9.045321215435433e-06, + "loss": 0.4759, + "step": 1805 + }, + { + "epoch": 0.22, + "grad_norm": 1.5334485516373337, + "learning_rate": 9.044139736239581e-06, + "loss": 0.562, + "step": 1806 + }, + { + "epoch": 0.22, + "grad_norm": 1.5226781612458151, + "learning_rate": 9.042957603677338e-06, + "loss": 0.4891, + "step": 1807 + }, + { + "epoch": 0.22, + "grad_norm": 1.3686279475665037, + "learning_rate": 9.041774817939686e-06, + "loss": 0.5542, + "step": 1808 + }, + { + "epoch": 0.22, + "grad_norm": 1.5436064750306893, + "learning_rate": 9.040591379217718e-06, + "loss": 0.4889, + "step": 1809 + }, + { + "epoch": 0.22, + "grad_norm": 1.501701685513641, + "learning_rate": 9.039407287702622e-06, + "loss": 0.5308, + "step": 1810 + }, + { + "epoch": 0.22, + "grad_norm": 1.8016377081845487, + "learning_rate": 9.038222543585706e-06, + "loss": 0.5534, + "step": 1811 + }, + { + "epoch": 0.22, + "grad_norm": 1.4173818191469483, + "learning_rate": 9.037037147058372e-06, + "loss": 0.499, + "step": 1812 + }, + { + "epoch": 0.22, + "grad_norm": 1.4346008752805965, + "learning_rate": 9.035851098312131e-06, + "loss": 0.4631, + "step": 1813 + }, + { + "epoch": 0.23, + "grad_norm": 1.4053591169104527, + "learning_rate": 9.0346643975386e-06, + "loss": 0.5564, + "step": 1814 + }, + { + "epoch": 0.23, + "grad_norm": 3.275060750971617, + "learning_rate": 9.033477044929504e-06, + "loss": 0.5159, + "step": 1815 + }, + { + "epoch": 0.23, + "grad_norm": 4.77859597684764, + "learning_rate": 9.032289040676665e-06, + "loss": 0.5705, + "step": 1816 + }, + { + "epoch": 0.23, + "grad_norm": 1.7519356417482121, + "learning_rate": 9.03110038497202e-06, + "loss": 0.5405, + "step": 1817 + }, + { + "epoch": 0.23, + "grad_norm": 1.4251065097842237, + "learning_rate": 9.029911078007604e-06, + "loss": 0.5315, + "step": 1818 + }, + { + "epoch": 0.23, + "grad_norm": 1.2492462894322915, + "learning_rate": 9.02872111997556e-06, + "loss": 0.5089, + "step": 1819 + }, + { + "epoch": 0.23, + "grad_norm": 1.4463132073460823, + "learning_rate": 9.027530511068139e-06, + "loss": 0.4827, + "step": 1820 + }, + { + "epoch": 0.23, + "grad_norm": 3.3032911990407365, + "learning_rate": 9.026339251477692e-06, + "loss": 0.5078, + "step": 1821 + }, + { + "epoch": 0.23, + "grad_norm": 1.626706774190053, + "learning_rate": 9.025147341396678e-06, + "loss": 0.5205, + "step": 1822 + }, + { + "epoch": 0.23, + "grad_norm": 1.3509087885099418, + "learning_rate": 9.023954781017662e-06, + "loss": 0.4761, + "step": 1823 + }, + { + "epoch": 0.23, + "grad_norm": 1.8946587915817568, + "learning_rate": 9.02276157053331e-06, + "loss": 0.5425, + "step": 1824 + }, + { + "epoch": 0.23, + "grad_norm": 1.4134687843412066, + "learning_rate": 9.021567710136397e-06, + "loss": 0.4827, + "step": 1825 + }, + { + "epoch": 0.23, + "grad_norm": 1.5361484675395476, + "learning_rate": 9.020373200019802e-06, + "loss": 0.5378, + "step": 1826 + }, + { + "epoch": 0.23, + "grad_norm": 1.513940693690181, + "learning_rate": 9.019178040376509e-06, + "loss": 0.5161, + "step": 1827 + }, + { + "epoch": 0.23, + "grad_norm": 1.507508231829813, + "learning_rate": 9.017982231399604e-06, + "loss": 0.5442, + "step": 1828 + }, + { + "epoch": 0.23, + "grad_norm": 1.5256370685141893, + "learning_rate": 9.016785773282284e-06, + "loss": 0.5371, + "step": 1829 + }, + { + "epoch": 0.23, + "grad_norm": 1.497173577544496, + "learning_rate": 9.015588666217845e-06, + "loss": 0.5134, + "step": 1830 + }, + { + "epoch": 0.23, + "grad_norm": 4.0854993405691715, + "learning_rate": 9.014390910399691e-06, + "loss": 0.5161, + "step": 1831 + }, + { + "epoch": 0.23, + "grad_norm": 4.518221248548298, + "learning_rate": 9.013192506021328e-06, + "loss": 0.5122, + "step": 1832 + }, + { + "epoch": 0.23, + "grad_norm": 1.6377551021684842, + "learning_rate": 9.011993453276373e-06, + "loss": 0.5408, + "step": 1833 + }, + { + "epoch": 0.23, + "grad_norm": 1.5068880542901153, + "learning_rate": 9.01079375235854e-06, + "loss": 0.4908, + "step": 1834 + }, + { + "epoch": 0.23, + "grad_norm": 1.4292090316250703, + "learning_rate": 9.009593403461652e-06, + "loss": 0.5954, + "step": 1835 + }, + { + "epoch": 0.23, + "grad_norm": 10.733599321570736, + "learning_rate": 9.008392406779638e-06, + "loss": 0.5509, + "step": 1836 + }, + { + "epoch": 0.23, + "grad_norm": 3.8927517677665815, + "learning_rate": 9.007190762506527e-06, + "loss": 0.5052, + "step": 1837 + }, + { + "epoch": 0.23, + "grad_norm": 1.541189839125486, + "learning_rate": 9.005988470836456e-06, + "loss": 0.5135, + "step": 1838 + }, + { + "epoch": 0.23, + "grad_norm": 1.9180698533113243, + "learning_rate": 9.004785531963665e-06, + "loss": 0.5047, + "step": 1839 + }, + { + "epoch": 0.23, + "grad_norm": 1.3942542286939754, + "learning_rate": 9.003581946082503e-06, + "loss": 0.5786, + "step": 1840 + }, + { + "epoch": 0.23, + "grad_norm": 2.0924217443966975, + "learning_rate": 9.002377713387415e-06, + "loss": 0.4938, + "step": 1841 + }, + { + "epoch": 0.23, + "grad_norm": 1.8068366112672747, + "learning_rate": 9.001172834072958e-06, + "loss": 0.4915, + "step": 1842 + }, + { + "epoch": 0.23, + "grad_norm": 1.4742349524709262, + "learning_rate": 8.999967308333791e-06, + "loss": 0.4864, + "step": 1843 + }, + { + "epoch": 0.23, + "grad_norm": 1.9117290532257252, + "learning_rate": 8.998761136364675e-06, + "loss": 0.5857, + "step": 1844 + }, + { + "epoch": 0.23, + "grad_norm": 1.8116074153470343, + "learning_rate": 8.997554318360482e-06, + "loss": 0.489, + "step": 1845 + }, + { + "epoch": 0.23, + "grad_norm": 2.0987590232716706, + "learning_rate": 8.99634685451618e-06, + "loss": 0.5531, + "step": 1846 + }, + { + "epoch": 0.23, + "grad_norm": 1.5604499423857239, + "learning_rate": 8.995138745026847e-06, + "loss": 0.4864, + "step": 1847 + }, + { + "epoch": 0.23, + "grad_norm": 1.5388505102375079, + "learning_rate": 8.993929990087664e-06, + "loss": 0.5363, + "step": 1848 + }, + { + "epoch": 0.23, + "grad_norm": 1.29222554323978, + "learning_rate": 8.992720589893915e-06, + "loss": 0.5159, + "step": 1849 + }, + { + "epoch": 0.23, + "grad_norm": 1.9916577839753073, + "learning_rate": 8.991510544640992e-06, + "loss": 0.5397, + "step": 1850 + }, + { + "epoch": 0.23, + "grad_norm": 0.6948489030363554, + "learning_rate": 8.990299854524384e-06, + "loss": 0.5693, + "step": 1851 + }, + { + "epoch": 0.23, + "grad_norm": 1.9513734930911082, + "learning_rate": 8.989088519739693e-06, + "loss": 0.5161, + "step": 1852 + }, + { + "epoch": 0.23, + "grad_norm": 1.4587444301512456, + "learning_rate": 8.987876540482618e-06, + "loss": 0.4824, + "step": 1853 + }, + { + "epoch": 0.23, + "grad_norm": 1.6710499041249678, + "learning_rate": 8.986663916948965e-06, + "loss": 0.5454, + "step": 1854 + }, + { + "epoch": 0.23, + "grad_norm": 1.9152211306936873, + "learning_rate": 8.985450649334646e-06, + "loss": 0.5987, + "step": 1855 + }, + { + "epoch": 0.23, + "grad_norm": 2.6560317393676094, + "learning_rate": 8.984236737835673e-06, + "loss": 0.5399, + "step": 1856 + }, + { + "epoch": 0.23, + "grad_norm": 1.5195714881992344, + "learning_rate": 8.983022182648166e-06, + "loss": 0.597, + "step": 1857 + }, + { + "epoch": 0.23, + "grad_norm": 1.355222695904536, + "learning_rate": 8.981806983968346e-06, + "loss": 0.5864, + "step": 1858 + }, + { + "epoch": 0.23, + "grad_norm": 1.2761619323097473, + "learning_rate": 8.980591141992538e-06, + "loss": 0.5001, + "step": 1859 + }, + { + "epoch": 0.23, + "grad_norm": 1.4872401648051745, + "learning_rate": 8.979374656917174e-06, + "loss": 0.4997, + "step": 1860 + }, + { + "epoch": 0.23, + "grad_norm": 1.588819005755008, + "learning_rate": 8.978157528938786e-06, + "loss": 0.5829, + "step": 1861 + }, + { + "epoch": 0.23, + "grad_norm": 1.6542912440317863, + "learning_rate": 8.976939758254015e-06, + "loss": 0.5478, + "step": 1862 + }, + { + "epoch": 0.23, + "grad_norm": 26.682459025536943, + "learning_rate": 8.975721345059598e-06, + "loss": 0.5039, + "step": 1863 + }, + { + "epoch": 0.23, + "grad_norm": 2.0430820852157336, + "learning_rate": 8.974502289552384e-06, + "loss": 0.5205, + "step": 1864 + }, + { + "epoch": 0.23, + "grad_norm": 1.43661731754798, + "learning_rate": 8.973282591929319e-06, + "loss": 0.5473, + "step": 1865 + }, + { + "epoch": 0.23, + "grad_norm": 6.688736566388985, + "learning_rate": 8.97206225238746e-06, + "loss": 0.5611, + "step": 1866 + }, + { + "epoch": 0.23, + "grad_norm": 1.4425314258470237, + "learning_rate": 8.970841271123962e-06, + "loss": 0.5586, + "step": 1867 + }, + { + "epoch": 0.23, + "grad_norm": 1.8776702201654907, + "learning_rate": 8.969619648336082e-06, + "loss": 0.5739, + "step": 1868 + }, + { + "epoch": 0.23, + "grad_norm": 0.7012564447126629, + "learning_rate": 8.968397384221188e-06, + "loss": 0.5036, + "step": 1869 + }, + { + "epoch": 0.23, + "grad_norm": 1.3566032268734702, + "learning_rate": 8.967174478976745e-06, + "loss": 0.5345, + "step": 1870 + }, + { + "epoch": 0.23, + "grad_norm": 1.8068390422598124, + "learning_rate": 8.965950932800326e-06, + "loss": 0.5652, + "step": 1871 + }, + { + "epoch": 0.23, + "grad_norm": 1.435850750817673, + "learning_rate": 8.964726745889606e-06, + "loss": 0.5618, + "step": 1872 + }, + { + "epoch": 0.23, + "grad_norm": 1.2816893239440885, + "learning_rate": 8.963501918442359e-06, + "loss": 0.4831, + "step": 1873 + }, + { + "epoch": 0.23, + "grad_norm": 1.3104746693874945, + "learning_rate": 8.962276450656471e-06, + "loss": 0.4523, + "step": 1874 + }, + { + "epoch": 0.23, + "grad_norm": 1.4357443614868366, + "learning_rate": 8.961050342729927e-06, + "loss": 0.4957, + "step": 1875 + }, + { + "epoch": 0.23, + "grad_norm": 1.568259871694868, + "learning_rate": 8.959823594860813e-06, + "loss": 0.4929, + "step": 1876 + }, + { + "epoch": 0.23, + "grad_norm": 1.372551882835372, + "learning_rate": 8.958596207247322e-06, + "loss": 0.585, + "step": 1877 + }, + { + "epoch": 0.23, + "grad_norm": 1.3909216826293513, + "learning_rate": 8.95736818008775e-06, + "loss": 0.4942, + "step": 1878 + }, + { + "epoch": 0.23, + "grad_norm": 1.6154288710912335, + "learning_rate": 8.956139513580495e-06, + "loss": 0.4675, + "step": 1879 + }, + { + "epoch": 0.23, + "grad_norm": 1.4455950323321096, + "learning_rate": 8.95491020792406e-06, + "loss": 0.5411, + "step": 1880 + }, + { + "epoch": 0.23, + "grad_norm": 1.259868701492697, + "learning_rate": 8.953680263317048e-06, + "loss": 0.4533, + "step": 1881 + }, + { + "epoch": 0.23, + "grad_norm": 3.522655048402381, + "learning_rate": 8.952449679958168e-06, + "loss": 0.5309, + "step": 1882 + }, + { + "epoch": 0.23, + "grad_norm": 1.6655362560273492, + "learning_rate": 8.951218458046233e-06, + "loss": 0.5407, + "step": 1883 + }, + { + "epoch": 0.23, + "grad_norm": 1.3048223977489455, + "learning_rate": 8.949986597780157e-06, + "loss": 0.5039, + "step": 1884 + }, + { + "epoch": 0.23, + "grad_norm": 1.5562776861902379, + "learning_rate": 8.94875409935896e-06, + "loss": 0.5763, + "step": 1885 + }, + { + "epoch": 0.23, + "grad_norm": 1.2745792479455942, + "learning_rate": 8.947520962981758e-06, + "loss": 0.4908, + "step": 1886 + }, + { + "epoch": 0.23, + "grad_norm": 1.6672757070096937, + "learning_rate": 8.946287188847778e-06, + "loss": 0.5517, + "step": 1887 + }, + { + "epoch": 0.23, + "grad_norm": 1.3056030322769647, + "learning_rate": 8.945052777156346e-06, + "loss": 0.5022, + "step": 1888 + }, + { + "epoch": 0.23, + "grad_norm": 1.8962485527766033, + "learning_rate": 8.943817728106894e-06, + "loss": 0.506, + "step": 1889 + }, + { + "epoch": 0.23, + "grad_norm": 1.4549494573125639, + "learning_rate": 8.942582041898954e-06, + "loss": 0.4836, + "step": 1890 + }, + { + "epoch": 0.23, + "grad_norm": 1.7727010742859703, + "learning_rate": 8.941345718732162e-06, + "loss": 0.5042, + "step": 1891 + }, + { + "epoch": 0.23, + "grad_norm": 2.383471940931024, + "learning_rate": 8.940108758806258e-06, + "loss": 0.5198, + "step": 1892 + }, + { + "epoch": 0.23, + "grad_norm": 1.6303903494517051, + "learning_rate": 8.938871162321082e-06, + "loss": 0.5107, + "step": 1893 + }, + { + "epoch": 0.24, + "grad_norm": 1.783742959636969, + "learning_rate": 8.937632929476578e-06, + "loss": 0.5629, + "step": 1894 + }, + { + "epoch": 0.24, + "grad_norm": 1.4634858088918852, + "learning_rate": 8.936394060472796e-06, + "loss": 0.4866, + "step": 1895 + }, + { + "epoch": 0.24, + "grad_norm": 1.339140151238824, + "learning_rate": 8.935154555509883e-06, + "loss": 0.5651, + "step": 1896 + }, + { + "epoch": 0.24, + "grad_norm": 1.4343314700039496, + "learning_rate": 8.933914414788095e-06, + "loss": 0.5304, + "step": 1897 + }, + { + "epoch": 0.24, + "grad_norm": 1.447195399814874, + "learning_rate": 8.932673638507787e-06, + "loss": 0.5352, + "step": 1898 + }, + { + "epoch": 0.24, + "grad_norm": 1.5450789392126534, + "learning_rate": 8.931432226869416e-06, + "loss": 0.4946, + "step": 1899 + }, + { + "epoch": 0.24, + "grad_norm": 2.1174940010910475, + "learning_rate": 8.930190180073544e-06, + "loss": 0.5236, + "step": 1900 + }, + { + "epoch": 0.24, + "grad_norm": 1.6545234531448152, + "learning_rate": 8.928947498320835e-06, + "loss": 0.5477, + "step": 1901 + }, + { + "epoch": 0.24, + "grad_norm": 1.3292775696499728, + "learning_rate": 8.927704181812053e-06, + "loss": 0.4817, + "step": 1902 + }, + { + "epoch": 0.24, + "grad_norm": 1.5071985105494015, + "learning_rate": 8.92646023074807e-06, + "loss": 0.5697, + "step": 1903 + }, + { + "epoch": 0.24, + "grad_norm": 1.9818677569842058, + "learning_rate": 8.925215645329854e-06, + "loss": 0.4559, + "step": 1904 + }, + { + "epoch": 0.24, + "grad_norm": 1.580179304566661, + "learning_rate": 8.923970425758481e-06, + "loss": 0.5159, + "step": 1905 + }, + { + "epoch": 0.24, + "grad_norm": 2.458276371266938, + "learning_rate": 8.922724572235128e-06, + "loss": 0.5959, + "step": 1906 + }, + { + "epoch": 0.24, + "grad_norm": 1.7012565794755887, + "learning_rate": 8.921478084961071e-06, + "loss": 0.5144, + "step": 1907 + }, + { + "epoch": 0.24, + "grad_norm": 1.4290181998565596, + "learning_rate": 8.92023096413769e-06, + "loss": 0.5189, + "step": 1908 + }, + { + "epoch": 0.24, + "grad_norm": 1.4312359811126811, + "learning_rate": 8.918983209966475e-06, + "loss": 0.5151, + "step": 1909 + }, + { + "epoch": 0.24, + "grad_norm": 7.336001691916545, + "learning_rate": 8.917734822649002e-06, + "loss": 0.4681, + "step": 1910 + }, + { + "epoch": 0.24, + "grad_norm": 1.9200723965237108, + "learning_rate": 8.916485802386968e-06, + "loss": 0.5125, + "step": 1911 + }, + { + "epoch": 0.24, + "grad_norm": 1.5797716719857873, + "learning_rate": 8.915236149382155e-06, + "loss": 0.5416, + "step": 1912 + }, + { + "epoch": 0.24, + "grad_norm": 1.783287209569103, + "learning_rate": 8.913985863836465e-06, + "loss": 0.5443, + "step": 1913 + }, + { + "epoch": 0.24, + "grad_norm": 1.8272401889966197, + "learning_rate": 8.912734945951884e-06, + "loss": 0.5295, + "step": 1914 + }, + { + "epoch": 0.24, + "grad_norm": 1.6138889353965085, + "learning_rate": 8.911483395930514e-06, + "loss": 0.5037, + "step": 1915 + }, + { + "epoch": 0.24, + "grad_norm": 1.495452942442373, + "learning_rate": 8.910231213974549e-06, + "loss": 0.5173, + "step": 1916 + }, + { + "epoch": 0.24, + "grad_norm": 2.146077073992904, + "learning_rate": 8.908978400286297e-06, + "loss": 0.5296, + "step": 1917 + }, + { + "epoch": 0.24, + "grad_norm": 1.7738406140487761, + "learning_rate": 8.907724955068156e-06, + "loss": 0.5089, + "step": 1918 + }, + { + "epoch": 0.24, + "grad_norm": 1.4445001960204054, + "learning_rate": 8.90647087852263e-06, + "loss": 0.4729, + "step": 1919 + }, + { + "epoch": 0.24, + "grad_norm": 3.064579524062797, + "learning_rate": 8.905216170852332e-06, + "loss": 0.4936, + "step": 1920 + }, + { + "epoch": 0.24, + "grad_norm": 1.5744305050856235, + "learning_rate": 8.903960832259966e-06, + "loss": 0.4603, + "step": 1921 + }, + { + "epoch": 0.24, + "grad_norm": 1.6956951304877073, + "learning_rate": 8.902704862948344e-06, + "loss": 0.496, + "step": 1922 + }, + { + "epoch": 0.24, + "grad_norm": 2.1043226523378875, + "learning_rate": 8.901448263120379e-06, + "loss": 0.5216, + "step": 1923 + }, + { + "epoch": 0.24, + "grad_norm": 1.2288602774625417, + "learning_rate": 8.900191032979088e-06, + "loss": 0.4907, + "step": 1924 + }, + { + "epoch": 0.24, + "grad_norm": 1.6974651681887711, + "learning_rate": 8.898933172727584e-06, + "loss": 0.5437, + "step": 1925 + }, + { + "epoch": 0.24, + "grad_norm": 2.1277108296392484, + "learning_rate": 8.897674682569088e-06, + "loss": 0.4111, + "step": 1926 + }, + { + "epoch": 0.24, + "grad_norm": 1.3434009357471774, + "learning_rate": 8.896415562706919e-06, + "loss": 0.5192, + "step": 1927 + }, + { + "epoch": 0.24, + "grad_norm": 1.5168120489469041, + "learning_rate": 8.8951558133445e-06, + "loss": 0.5363, + "step": 1928 + }, + { + "epoch": 0.24, + "grad_norm": 1.372964230231597, + "learning_rate": 8.893895434685353e-06, + "loss": 0.5477, + "step": 1929 + }, + { + "epoch": 0.24, + "grad_norm": 1.3341145832969576, + "learning_rate": 8.892634426933106e-06, + "loss": 0.5334, + "step": 1930 + }, + { + "epoch": 0.24, + "grad_norm": 1.381671572338022, + "learning_rate": 8.891372790291482e-06, + "loss": 0.4387, + "step": 1931 + }, + { + "epoch": 0.24, + "grad_norm": 1.3889571726074057, + "learning_rate": 8.890110524964313e-06, + "loss": 0.5352, + "step": 1932 + }, + { + "epoch": 0.24, + "grad_norm": 1.6346808498868517, + "learning_rate": 8.888847631155525e-06, + "loss": 0.5435, + "step": 1933 + }, + { + "epoch": 0.24, + "grad_norm": 3.389766354005252, + "learning_rate": 8.887584109069157e-06, + "loss": 0.4923, + "step": 1934 + }, + { + "epoch": 0.24, + "grad_norm": 1.506276037797442, + "learning_rate": 8.886319958909334e-06, + "loss": 0.5358, + "step": 1935 + }, + { + "epoch": 0.24, + "grad_norm": 1.8617829100438767, + "learning_rate": 8.885055180880294e-06, + "loss": 0.5455, + "step": 1936 + }, + { + "epoch": 0.24, + "grad_norm": 1.4403503924568222, + "learning_rate": 8.883789775186374e-06, + "loss": 0.4609, + "step": 1937 + }, + { + "epoch": 0.24, + "grad_norm": 1.5025770716659845, + "learning_rate": 8.88252374203201e-06, + "loss": 0.5037, + "step": 1938 + }, + { + "epoch": 0.24, + "grad_norm": 1.713090061036998, + "learning_rate": 8.881257081621741e-06, + "loss": 0.4901, + "step": 1939 + }, + { + "epoch": 0.24, + "grad_norm": 0.7078369186879294, + "learning_rate": 8.879989794160208e-06, + "loss": 0.4839, + "step": 1940 + }, + { + "epoch": 0.24, + "grad_norm": 1.5524521922610566, + "learning_rate": 8.878721879852153e-06, + "loss": 0.5419, + "step": 1941 + }, + { + "epoch": 0.24, + "grad_norm": 1.7800261176459817, + "learning_rate": 8.877453338902415e-06, + "loss": 0.5274, + "step": 1942 + }, + { + "epoch": 0.24, + "grad_norm": 1.438442558784284, + "learning_rate": 8.876184171515943e-06, + "loss": 0.5588, + "step": 1943 + }, + { + "epoch": 0.24, + "grad_norm": 1.579247057716277, + "learning_rate": 8.874914377897778e-06, + "loss": 0.4872, + "step": 1944 + }, + { + "epoch": 0.24, + "grad_norm": 2.7752495014824574, + "learning_rate": 8.87364395825307e-06, + "loss": 0.5349, + "step": 1945 + }, + { + "epoch": 0.24, + "grad_norm": 1.5952965241983228, + "learning_rate": 8.872372912787061e-06, + "loss": 0.5209, + "step": 1946 + }, + { + "epoch": 0.24, + "grad_norm": 1.4540745806687716, + "learning_rate": 8.871101241705105e-06, + "loss": 0.5252, + "step": 1947 + }, + { + "epoch": 0.24, + "grad_norm": 1.5604738622505752, + "learning_rate": 8.86982894521265e-06, + "loss": 0.4982, + "step": 1948 + }, + { + "epoch": 0.24, + "grad_norm": 1.7187200851975826, + "learning_rate": 8.868556023515247e-06, + "loss": 0.5191, + "step": 1949 + }, + { + "epoch": 0.24, + "grad_norm": 1.3690982491354926, + "learning_rate": 8.867282476818546e-06, + "loss": 0.5194, + "step": 1950 + }, + { + "epoch": 0.24, + "grad_norm": 1.872490573096125, + "learning_rate": 8.866008305328303e-06, + "loss": 0.4693, + "step": 1951 + }, + { + "epoch": 0.24, + "grad_norm": 1.653900492109943, + "learning_rate": 8.864733509250367e-06, + "loss": 0.4991, + "step": 1952 + }, + { + "epoch": 0.24, + "grad_norm": 1.8198036332691097, + "learning_rate": 8.863458088790695e-06, + "loss": 0.5657, + "step": 1953 + }, + { + "epoch": 0.24, + "grad_norm": 0.7071267036238204, + "learning_rate": 8.862182044155345e-06, + "loss": 0.5261, + "step": 1954 + }, + { + "epoch": 0.24, + "grad_norm": 1.4544658968334625, + "learning_rate": 8.860905375550469e-06, + "loss": 0.5482, + "step": 1955 + }, + { + "epoch": 0.24, + "grad_norm": 1.5844352202023428, + "learning_rate": 8.859628083182326e-06, + "loss": 0.5331, + "step": 1956 + }, + { + "epoch": 0.24, + "grad_norm": 1.8270985290546589, + "learning_rate": 8.858350167257275e-06, + "loss": 0.5229, + "step": 1957 + }, + { + "epoch": 0.24, + "grad_norm": 2.2358524365219887, + "learning_rate": 8.85707162798177e-06, + "loss": 0.5315, + "step": 1958 + }, + { + "epoch": 0.24, + "grad_norm": 1.6698059723203507, + "learning_rate": 8.855792465562377e-06, + "loss": 0.5325, + "step": 1959 + }, + { + "epoch": 0.24, + "grad_norm": 0.7400217029843602, + "learning_rate": 8.854512680205748e-06, + "loss": 0.5117, + "step": 1960 + }, + { + "epoch": 0.24, + "grad_norm": 1.4293314877485304, + "learning_rate": 8.853232272118653e-06, + "loss": 0.5194, + "step": 1961 + }, + { + "epoch": 0.24, + "grad_norm": 1.3617601801455612, + "learning_rate": 8.851951241507945e-06, + "loss": 0.48, + "step": 1962 + }, + { + "epoch": 0.24, + "grad_norm": 2.5593848472649214, + "learning_rate": 8.850669588580591e-06, + "loss": 0.5879, + "step": 1963 + }, + { + "epoch": 0.24, + "grad_norm": 1.329342422082542, + "learning_rate": 8.84938731354365e-06, + "loss": 0.5234, + "step": 1964 + }, + { + "epoch": 0.24, + "grad_norm": 1.6158031021826076, + "learning_rate": 8.848104416604287e-06, + "loss": 0.5032, + "step": 1965 + }, + { + "epoch": 0.24, + "grad_norm": 1.4835217258908457, + "learning_rate": 8.846820897969763e-06, + "loss": 0.53, + "step": 1966 + }, + { + "epoch": 0.24, + "grad_norm": 0.6760264757911146, + "learning_rate": 8.845536757847444e-06, + "loss": 0.502, + "step": 1967 + }, + { + "epoch": 0.24, + "grad_norm": 1.8477646844223596, + "learning_rate": 8.844251996444792e-06, + "loss": 0.5484, + "step": 1968 + }, + { + "epoch": 0.24, + "grad_norm": 1.7376392792536866, + "learning_rate": 8.842966613969376e-06, + "loss": 0.5252, + "step": 1969 + }, + { + "epoch": 0.24, + "grad_norm": 2.165308432040499, + "learning_rate": 8.841680610628853e-06, + "loss": 0.4646, + "step": 1970 + }, + { + "epoch": 0.24, + "grad_norm": 1.4974071983421144, + "learning_rate": 8.840393986630996e-06, + "loss": 0.4945, + "step": 1971 + }, + { + "epoch": 0.24, + "grad_norm": 1.845611195832243, + "learning_rate": 8.839106742183668e-06, + "loss": 0.5535, + "step": 1972 + }, + { + "epoch": 0.24, + "grad_norm": 1.5394864096122054, + "learning_rate": 8.837818877494833e-06, + "loss": 0.5139, + "step": 1973 + }, + { + "epoch": 0.24, + "grad_norm": 1.4206296295423873, + "learning_rate": 8.836530392772555e-06, + "loss": 0.5274, + "step": 1974 + }, + { + "epoch": 0.25, + "grad_norm": 0.6776243792818417, + "learning_rate": 8.835241288225007e-06, + "loss": 0.5352, + "step": 1975 + }, + { + "epoch": 0.25, + "grad_norm": 1.3565471277974326, + "learning_rate": 8.83395156406045e-06, + "loss": 0.5327, + "step": 1976 + }, + { + "epoch": 0.25, + "grad_norm": 1.4313562691928976, + "learning_rate": 8.832661220487251e-06, + "loss": 0.5132, + "step": 1977 + }, + { + "epoch": 0.25, + "grad_norm": 1.7967161826065228, + "learning_rate": 8.831370257713877e-06, + "loss": 0.4947, + "step": 1978 + }, + { + "epoch": 0.25, + "grad_norm": 1.467448215573434, + "learning_rate": 8.830078675948894e-06, + "loss": 0.5552, + "step": 1979 + }, + { + "epoch": 0.25, + "grad_norm": 1.5835629523108656, + "learning_rate": 8.828786475400968e-06, + "loss": 0.5141, + "step": 1980 + }, + { + "epoch": 0.25, + "grad_norm": 2.0099353012114616, + "learning_rate": 8.827493656278867e-06, + "loss": 0.4699, + "step": 1981 + }, + { + "epoch": 0.25, + "grad_norm": 1.5473553695010045, + "learning_rate": 8.826200218791455e-06, + "loss": 0.5123, + "step": 1982 + }, + { + "epoch": 0.25, + "grad_norm": 1.5795956022466797, + "learning_rate": 8.8249061631477e-06, + "loss": 0.522, + "step": 1983 + }, + { + "epoch": 0.25, + "grad_norm": 2.222064880909354, + "learning_rate": 8.823611489556668e-06, + "loss": 0.5207, + "step": 1984 + }, + { + "epoch": 0.25, + "grad_norm": 1.7684840911887174, + "learning_rate": 8.822316198227525e-06, + "loss": 0.5149, + "step": 1985 + }, + { + "epoch": 0.25, + "grad_norm": 1.476508776650612, + "learning_rate": 8.821020289369535e-06, + "loss": 0.5619, + "step": 1986 + }, + { + "epoch": 0.25, + "grad_norm": 1.3676529888008777, + "learning_rate": 8.819723763192065e-06, + "loss": 0.4907, + "step": 1987 + }, + { + "epoch": 0.25, + "grad_norm": 2.8100143598963343, + "learning_rate": 8.81842661990458e-06, + "loss": 0.4757, + "step": 1988 + }, + { + "epoch": 0.25, + "grad_norm": 0.6704306315147124, + "learning_rate": 8.817128859716646e-06, + "loss": 0.5198, + "step": 1989 + }, + { + "epoch": 0.25, + "grad_norm": 1.6174741992271353, + "learning_rate": 8.815830482837925e-06, + "loss": 0.4942, + "step": 1990 + }, + { + "epoch": 0.25, + "grad_norm": 2.1235866635095344, + "learning_rate": 8.814531489478183e-06, + "loss": 0.5388, + "step": 1991 + }, + { + "epoch": 0.25, + "grad_norm": 1.749572638929686, + "learning_rate": 8.813231879847284e-06, + "loss": 0.4947, + "step": 1992 + }, + { + "epoch": 0.25, + "grad_norm": 1.8765404375963373, + "learning_rate": 8.811931654155191e-06, + "loss": 0.4857, + "step": 1993 + }, + { + "epoch": 0.25, + "grad_norm": 0.7127206859801521, + "learning_rate": 8.810630812611965e-06, + "loss": 0.5374, + "step": 1994 + }, + { + "epoch": 0.25, + "grad_norm": 1.3765373181685499, + "learning_rate": 8.80932935542777e-06, + "loss": 0.5574, + "step": 1995 + }, + { + "epoch": 0.25, + "grad_norm": 1.4220313237505227, + "learning_rate": 8.808027282812871e-06, + "loss": 0.551, + "step": 1996 + }, + { + "epoch": 0.25, + "grad_norm": 1.390487340074321, + "learning_rate": 8.806724594977625e-06, + "loss": 0.5951, + "step": 1997 + }, + { + "epoch": 0.25, + "grad_norm": 1.6304586844315077, + "learning_rate": 8.805421292132495e-06, + "loss": 0.4819, + "step": 1998 + }, + { + "epoch": 0.25, + "grad_norm": 1.548784626397272, + "learning_rate": 8.804117374488037e-06, + "loss": 0.4743, + "step": 1999 + }, + { + "epoch": 0.25, + "grad_norm": 2.7079415571467975, + "learning_rate": 8.802812842254917e-06, + "loss": 0.5411, + "step": 2000 + }, + { + "epoch": 0.25, + "grad_norm": 1.7494791245756824, + "learning_rate": 8.801507695643886e-06, + "loss": 0.552, + "step": 2001 + }, + { + "epoch": 0.25, + "grad_norm": 1.5625148747215771, + "learning_rate": 8.80020193486581e-06, + "loss": 0.5551, + "step": 2002 + }, + { + "epoch": 0.25, + "grad_norm": 1.5943239427434999, + "learning_rate": 8.798895560131642e-06, + "loss": 0.46, + "step": 2003 + }, + { + "epoch": 0.25, + "grad_norm": 1.8847274473426747, + "learning_rate": 8.797588571652439e-06, + "loss": 0.5404, + "step": 2004 + }, + { + "epoch": 0.25, + "grad_norm": 1.7756131902868117, + "learning_rate": 8.796280969639353e-06, + "loss": 0.4842, + "step": 2005 + }, + { + "epoch": 0.25, + "grad_norm": 1.852938704264008, + "learning_rate": 8.794972754303644e-06, + "loss": 0.5657, + "step": 2006 + }, + { + "epoch": 0.25, + "grad_norm": 4.412632757606416, + "learning_rate": 8.793663925856662e-06, + "loss": 0.5468, + "step": 2007 + }, + { + "epoch": 0.25, + "grad_norm": 1.6268459566919793, + "learning_rate": 8.792354484509863e-06, + "loss": 0.5665, + "step": 2008 + }, + { + "epoch": 0.25, + "grad_norm": 1.318500406755192, + "learning_rate": 8.791044430474795e-06, + "loss": 0.5019, + "step": 2009 + }, + { + "epoch": 0.25, + "grad_norm": 1.4252737383395815, + "learning_rate": 8.789733763963112e-06, + "loss": 0.5352, + "step": 2010 + }, + { + "epoch": 0.25, + "grad_norm": 0.7028911675339917, + "learning_rate": 8.788422485186561e-06, + "loss": 0.4947, + "step": 2011 + }, + { + "epoch": 0.25, + "grad_norm": 1.295162872112619, + "learning_rate": 8.787110594356993e-06, + "loss": 0.5005, + "step": 2012 + }, + { + "epoch": 0.25, + "grad_norm": 1.2920138918598154, + "learning_rate": 8.785798091686356e-06, + "loss": 0.4892, + "step": 2013 + }, + { + "epoch": 0.25, + "grad_norm": 1.5507368136823545, + "learning_rate": 8.784484977386691e-06, + "loss": 0.5525, + "step": 2014 + }, + { + "epoch": 0.25, + "grad_norm": 1.4858272518440248, + "learning_rate": 8.783171251670151e-06, + "loss": 0.5463, + "step": 2015 + }, + { + "epoch": 0.25, + "grad_norm": 0.6740133113942713, + "learning_rate": 8.781856914748974e-06, + "loss": 0.5366, + "step": 2016 + }, + { + "epoch": 0.25, + "grad_norm": 1.6130432936138699, + "learning_rate": 8.780541966835506e-06, + "loss": 0.4703, + "step": 2017 + }, + { + "epoch": 0.25, + "grad_norm": 1.4865849447276784, + "learning_rate": 8.779226408142187e-06, + "loss": 0.5175, + "step": 2018 + }, + { + "epoch": 0.25, + "grad_norm": 2.477496458338817, + "learning_rate": 8.777910238881557e-06, + "loss": 0.5737, + "step": 2019 + }, + { + "epoch": 0.25, + "grad_norm": 1.590972661806151, + "learning_rate": 8.776593459266256e-06, + "loss": 0.5884, + "step": 2020 + }, + { + "epoch": 0.25, + "grad_norm": 1.5083726906854342, + "learning_rate": 8.77527606950902e-06, + "loss": 0.5236, + "step": 2021 + }, + { + "epoch": 0.25, + "grad_norm": 1.6928660682592702, + "learning_rate": 8.773958069822684e-06, + "loss": 0.5336, + "step": 2022 + }, + { + "epoch": 0.25, + "grad_norm": 1.8285529561145588, + "learning_rate": 8.772639460420184e-06, + "loss": 0.5104, + "step": 2023 + }, + { + "epoch": 0.25, + "grad_norm": 1.4625362649817097, + "learning_rate": 8.771320241514553e-06, + "loss": 0.5071, + "step": 2024 + }, + { + "epoch": 0.25, + "grad_norm": 1.209422273835458, + "learning_rate": 8.770000413318923e-06, + "loss": 0.5072, + "step": 2025 + }, + { + "epoch": 0.25, + "grad_norm": 1.3313275590933291, + "learning_rate": 8.768679976046523e-06, + "loss": 0.4998, + "step": 2026 + }, + { + "epoch": 0.25, + "grad_norm": 1.2482005079526335, + "learning_rate": 8.767358929910681e-06, + "loss": 0.469, + "step": 2027 + }, + { + "epoch": 0.25, + "grad_norm": 1.4071157891730426, + "learning_rate": 8.766037275124824e-06, + "loss": 0.5088, + "step": 2028 + }, + { + "epoch": 0.25, + "grad_norm": 1.6073781895786337, + "learning_rate": 8.764715011902477e-06, + "loss": 0.5158, + "step": 2029 + }, + { + "epoch": 0.25, + "grad_norm": 2.084224486603882, + "learning_rate": 8.763392140457261e-06, + "loss": 0.526, + "step": 2030 + }, + { + "epoch": 0.25, + "grad_norm": 1.7510253233417739, + "learning_rate": 8.762068661002902e-06, + "loss": 0.4855, + "step": 2031 + }, + { + "epoch": 0.25, + "grad_norm": 2.3038810606284432, + "learning_rate": 8.760744573753218e-06, + "loss": 0.5514, + "step": 2032 + }, + { + "epoch": 0.25, + "grad_norm": 1.6022421653238001, + "learning_rate": 8.759419878922125e-06, + "loss": 0.5673, + "step": 2033 + }, + { + "epoch": 0.25, + "grad_norm": 1.20301441399954, + "learning_rate": 8.758094576723641e-06, + "loss": 0.539, + "step": 2034 + }, + { + "epoch": 0.25, + "grad_norm": 1.6577453684894792, + "learning_rate": 8.75676866737188e-06, + "loss": 0.5521, + "step": 2035 + }, + { + "epoch": 0.25, + "grad_norm": 0.6839437161456582, + "learning_rate": 8.755442151081054e-06, + "loss": 0.4958, + "step": 2036 + }, + { + "epoch": 0.25, + "grad_norm": 1.5834842804236429, + "learning_rate": 8.754115028065474e-06, + "loss": 0.4872, + "step": 2037 + }, + { + "epoch": 0.25, + "grad_norm": 2.2421241088167467, + "learning_rate": 8.752787298539547e-06, + "loss": 0.5161, + "step": 2038 + }, + { + "epoch": 0.25, + "grad_norm": 2.161941749490066, + "learning_rate": 8.75145896271778e-06, + "loss": 0.4987, + "step": 2039 + }, + { + "epoch": 0.25, + "grad_norm": 1.4927151108479508, + "learning_rate": 8.750130020814779e-06, + "loss": 0.4894, + "step": 2040 + }, + { + "epoch": 0.25, + "grad_norm": 1.502074237777443, + "learning_rate": 8.748800473045245e-06, + "loss": 0.5471, + "step": 2041 + }, + { + "epoch": 0.25, + "grad_norm": 1.5446193105812245, + "learning_rate": 8.747470319623976e-06, + "loss": 0.5269, + "step": 2042 + }, + { + "epoch": 0.25, + "grad_norm": 1.8144058798330323, + "learning_rate": 8.746139560765873e-06, + "loss": 0.5298, + "step": 2043 + }, + { + "epoch": 0.25, + "grad_norm": 1.3392301561466349, + "learning_rate": 8.744808196685933e-06, + "loss": 0.5376, + "step": 2044 + }, + { + "epoch": 0.25, + "grad_norm": 1.3212574679321898, + "learning_rate": 8.743476227599245e-06, + "loss": 0.5288, + "step": 2045 + }, + { + "epoch": 0.25, + "grad_norm": 1.953631424817136, + "learning_rate": 8.742143653721004e-06, + "loss": 0.5045, + "step": 2046 + }, + { + "epoch": 0.25, + "grad_norm": 1.3254459527990614, + "learning_rate": 8.740810475266497e-06, + "loss": 0.5295, + "step": 2047 + }, + { + "epoch": 0.25, + "grad_norm": 1.2471258040754365, + "learning_rate": 8.739476692451112e-06, + "loss": 0.5237, + "step": 2048 + }, + { + "epoch": 0.25, + "grad_norm": 2.2406500377543024, + "learning_rate": 8.738142305490335e-06, + "loss": 0.5653, + "step": 2049 + }, + { + "epoch": 0.25, + "grad_norm": 1.3120908521833168, + "learning_rate": 8.736807314599744e-06, + "loss": 0.4988, + "step": 2050 + }, + { + "epoch": 0.25, + "grad_norm": 1.56240581885028, + "learning_rate": 8.73547171999502e-06, + "loss": 0.5052, + "step": 2051 + }, + { + "epoch": 0.25, + "grad_norm": 1.3269004649671916, + "learning_rate": 8.734135521891941e-06, + "loss": 0.5342, + "step": 2052 + }, + { + "epoch": 0.25, + "grad_norm": 1.472749561684422, + "learning_rate": 8.732798720506381e-06, + "loss": 0.4932, + "step": 2053 + }, + { + "epoch": 0.25, + "grad_norm": 1.5456774928408388, + "learning_rate": 8.731461316054313e-06, + "loss": 0.5342, + "step": 2054 + }, + { + "epoch": 0.26, + "grad_norm": 1.2554204562914544, + "learning_rate": 8.730123308751806e-06, + "loss": 0.4592, + "step": 2055 + }, + { + "epoch": 0.26, + "grad_norm": 1.3112293500770817, + "learning_rate": 8.728784698815026e-06, + "loss": 0.5272, + "step": 2056 + }, + { + "epoch": 0.26, + "grad_norm": 1.1667461673415906, + "learning_rate": 8.727445486460236e-06, + "loss": 0.4595, + "step": 2057 + }, + { + "epoch": 0.26, + "grad_norm": 3.6367370511644874, + "learning_rate": 8.7261056719038e-06, + "loss": 0.5088, + "step": 2058 + }, + { + "epoch": 0.26, + "grad_norm": 1.50527731481556, + "learning_rate": 8.724765255362176e-06, + "loss": 0.5197, + "step": 2059 + }, + { + "epoch": 0.26, + "grad_norm": 1.5322878593937856, + "learning_rate": 8.72342423705192e-06, + "loss": 0.5218, + "step": 2060 + }, + { + "epoch": 0.26, + "grad_norm": 1.4793378737196778, + "learning_rate": 8.722082617189688e-06, + "loss": 0.5401, + "step": 2061 + }, + { + "epoch": 0.26, + "grad_norm": 1.4610037837615666, + "learning_rate": 8.720740395992225e-06, + "loss": 0.5336, + "step": 2062 + }, + { + "epoch": 0.26, + "grad_norm": 1.5622701747115364, + "learning_rate": 8.719397573676384e-06, + "loss": 0.5043, + "step": 2063 + }, + { + "epoch": 0.26, + "grad_norm": 1.4367898452648369, + "learning_rate": 8.718054150459106e-06, + "loss": 0.5192, + "step": 2064 + }, + { + "epoch": 0.26, + "grad_norm": 1.5175871163956165, + "learning_rate": 8.716710126557435e-06, + "loss": 0.5194, + "step": 2065 + }, + { + "epoch": 0.26, + "grad_norm": 1.3458435739714716, + "learning_rate": 8.715365502188508e-06, + "loss": 0.5238, + "step": 2066 + }, + { + "epoch": 0.26, + "grad_norm": 1.7803905820852695, + "learning_rate": 8.714020277569564e-06, + "loss": 0.5646, + "step": 2067 + }, + { + "epoch": 0.26, + "grad_norm": 1.7385985543427787, + "learning_rate": 8.712674452917934e-06, + "loss": 0.5495, + "step": 2068 + }, + { + "epoch": 0.26, + "grad_norm": 1.4759265210333785, + "learning_rate": 8.711328028451045e-06, + "loss": 0.4545, + "step": 2069 + }, + { + "epoch": 0.26, + "grad_norm": 1.3288442314806546, + "learning_rate": 8.709981004386429e-06, + "loss": 0.5034, + "step": 2070 + }, + { + "epoch": 0.26, + "grad_norm": 1.4758448460788367, + "learning_rate": 8.708633380941706e-06, + "loss": 0.5045, + "step": 2071 + }, + { + "epoch": 0.26, + "grad_norm": 1.3920330580189069, + "learning_rate": 8.707285158334598e-06, + "loss": 0.4962, + "step": 2072 + }, + { + "epoch": 0.26, + "grad_norm": 1.2930994944296241, + "learning_rate": 8.705936336782921e-06, + "loss": 0.5308, + "step": 2073 + }, + { + "epoch": 0.26, + "grad_norm": 1.8773196012080102, + "learning_rate": 8.704586916504592e-06, + "loss": 0.4816, + "step": 2074 + }, + { + "epoch": 0.26, + "grad_norm": 1.3165810492564216, + "learning_rate": 8.703236897717617e-06, + "loss": 0.4903, + "step": 2075 + }, + { + "epoch": 0.26, + "grad_norm": 1.5163855156336135, + "learning_rate": 8.701886280640109e-06, + "loss": 0.4882, + "step": 2076 + }, + { + "epoch": 0.26, + "grad_norm": 1.6838193810705397, + "learning_rate": 8.700535065490266e-06, + "loss": 0.5349, + "step": 2077 + }, + { + "epoch": 0.26, + "grad_norm": 1.3303115176381106, + "learning_rate": 8.699183252486395e-06, + "loss": 0.5408, + "step": 2078 + }, + { + "epoch": 0.26, + "grad_norm": 1.3360266668930008, + "learning_rate": 8.697830841846887e-06, + "loss": 0.525, + "step": 2079 + }, + { + "epoch": 0.26, + "grad_norm": 1.439059769559954, + "learning_rate": 8.69647783379024e-06, + "loss": 0.5354, + "step": 2080 + }, + { + "epoch": 0.26, + "grad_norm": 1.3942462512315814, + "learning_rate": 8.695124228535044e-06, + "loss": 0.5391, + "step": 2081 + }, + { + "epoch": 0.26, + "grad_norm": 0.626043540508294, + "learning_rate": 8.693770026299984e-06, + "loss": 0.475, + "step": 2082 + }, + { + "epoch": 0.26, + "grad_norm": 1.3662165330675327, + "learning_rate": 8.692415227303844e-06, + "loss": 0.5206, + "step": 2083 + }, + { + "epoch": 0.26, + "grad_norm": 1.4217799982549042, + "learning_rate": 8.691059831765505e-06, + "loss": 0.5648, + "step": 2084 + }, + { + "epoch": 0.26, + "grad_norm": 1.7365946002808297, + "learning_rate": 8.689703839903943e-06, + "loss": 0.5146, + "step": 2085 + }, + { + "epoch": 0.26, + "grad_norm": 1.6943207518116408, + "learning_rate": 8.688347251938229e-06, + "loss": 0.5379, + "step": 2086 + }, + { + "epoch": 0.26, + "grad_norm": 1.5989235306650624, + "learning_rate": 8.686990068087532e-06, + "loss": 0.5212, + "step": 2087 + }, + { + "epoch": 0.26, + "grad_norm": 1.5245090919991986, + "learning_rate": 8.685632288571118e-06, + "loss": 0.5261, + "step": 2088 + }, + { + "epoch": 0.26, + "grad_norm": 1.3285545188358303, + "learning_rate": 8.684273913608346e-06, + "loss": 0.4977, + "step": 2089 + }, + { + "epoch": 0.26, + "grad_norm": 2.488096499314304, + "learning_rate": 8.682914943418677e-06, + "loss": 0.5072, + "step": 2090 + }, + { + "epoch": 0.26, + "grad_norm": 1.4838765109056207, + "learning_rate": 8.681555378221661e-06, + "loss": 0.5111, + "step": 2091 + }, + { + "epoch": 0.26, + "grad_norm": 1.6550785967751374, + "learning_rate": 8.680195218236951e-06, + "loss": 0.4896, + "step": 2092 + }, + { + "epoch": 0.26, + "grad_norm": 1.4018756217200874, + "learning_rate": 8.67883446368429e-06, + "loss": 0.547, + "step": 2093 + }, + { + "epoch": 0.26, + "grad_norm": 1.3465600335196564, + "learning_rate": 8.677473114783524e-06, + "loss": 0.5501, + "step": 2094 + }, + { + "epoch": 0.26, + "grad_norm": 1.296107978657512, + "learning_rate": 8.676111171754585e-06, + "loss": 0.5285, + "step": 2095 + }, + { + "epoch": 0.26, + "grad_norm": 1.3967902124449163, + "learning_rate": 8.67474863481751e-06, + "loss": 0.5699, + "step": 2096 + }, + { + "epoch": 0.26, + "grad_norm": 1.5257350142040416, + "learning_rate": 8.673385504192428e-06, + "loss": 0.5171, + "step": 2097 + }, + { + "epoch": 0.26, + "grad_norm": 1.3449457010675756, + "learning_rate": 8.672021780099569e-06, + "loss": 0.524, + "step": 2098 + }, + { + "epoch": 0.26, + "grad_norm": 1.519683542426732, + "learning_rate": 8.670657462759248e-06, + "loss": 0.5065, + "step": 2099 + }, + { + "epoch": 0.26, + "grad_norm": 1.4787471182390501, + "learning_rate": 8.669292552391888e-06, + "loss": 0.5024, + "step": 2100 + }, + { + "epoch": 0.26, + "grad_norm": 1.6463547942760857, + "learning_rate": 8.667927049217997e-06, + "loss": 0.5639, + "step": 2101 + }, + { + "epoch": 0.26, + "grad_norm": 1.8358536422213843, + "learning_rate": 8.66656095345819e-06, + "loss": 0.4962, + "step": 2102 + }, + { + "epoch": 0.26, + "grad_norm": 1.7067416368535604, + "learning_rate": 8.665194265333167e-06, + "loss": 0.5317, + "step": 2103 + }, + { + "epoch": 0.26, + "grad_norm": 1.3435245189908687, + "learning_rate": 8.66382698506373e-06, + "loss": 0.5432, + "step": 2104 + }, + { + "epoch": 0.26, + "grad_norm": 1.5260755140956461, + "learning_rate": 8.662459112870777e-06, + "loss": 0.4877, + "step": 2105 + }, + { + "epoch": 0.26, + "grad_norm": 1.8155532123302065, + "learning_rate": 8.661090648975297e-06, + "loss": 0.5655, + "step": 2106 + }, + { + "epoch": 0.26, + "grad_norm": 1.4194970138329932, + "learning_rate": 8.659721593598379e-06, + "loss": 0.5591, + "step": 2107 + }, + { + "epoch": 0.26, + "grad_norm": 1.2344258030946587, + "learning_rate": 8.658351946961206e-06, + "loss": 0.4829, + "step": 2108 + }, + { + "epoch": 0.26, + "grad_norm": 1.5723751549266327, + "learning_rate": 8.656981709285054e-06, + "loss": 0.5254, + "step": 2109 + }, + { + "epoch": 0.26, + "grad_norm": 6.204612264343951, + "learning_rate": 8.655610880791303e-06, + "loss": 0.4789, + "step": 2110 + }, + { + "epoch": 0.26, + "grad_norm": 1.3796945954800486, + "learning_rate": 8.654239461701414e-06, + "loss": 0.587, + "step": 2111 + }, + { + "epoch": 0.26, + "grad_norm": 1.526936464451939, + "learning_rate": 8.65286745223696e-06, + "loss": 0.5601, + "step": 2112 + }, + { + "epoch": 0.26, + "grad_norm": 1.5491201410612896, + "learning_rate": 8.651494852619596e-06, + "loss": 0.5303, + "step": 2113 + }, + { + "epoch": 0.26, + "grad_norm": 1.3635548901742522, + "learning_rate": 8.650121663071079e-06, + "loss": 0.4809, + "step": 2114 + }, + { + "epoch": 0.26, + "grad_norm": 0.6497764251763823, + "learning_rate": 8.64874788381326e-06, + "loss": 0.4995, + "step": 2115 + }, + { + "epoch": 0.26, + "grad_norm": 1.4728552317424948, + "learning_rate": 8.647373515068085e-06, + "loss": 0.4796, + "step": 2116 + }, + { + "epoch": 0.26, + "grad_norm": 1.3439837298838473, + "learning_rate": 8.645998557057595e-06, + "loss": 0.5647, + "step": 2117 + }, + { + "epoch": 0.26, + "grad_norm": 0.6876273724296037, + "learning_rate": 8.644623010003928e-06, + "loss": 0.5131, + "step": 2118 + }, + { + "epoch": 0.26, + "grad_norm": 1.5868033275463422, + "learning_rate": 8.643246874129316e-06, + "loss": 0.5644, + "step": 2119 + }, + { + "epoch": 0.26, + "grad_norm": 1.511917733457017, + "learning_rate": 8.641870149656082e-06, + "loss": 0.5396, + "step": 2120 + }, + { + "epoch": 0.26, + "grad_norm": 1.3439071166591285, + "learning_rate": 8.64049283680665e-06, + "loss": 0.5074, + "step": 2121 + }, + { + "epoch": 0.26, + "grad_norm": 1.2768752235566887, + "learning_rate": 8.639114935803542e-06, + "loss": 0.5101, + "step": 2122 + }, + { + "epoch": 0.26, + "grad_norm": 1.3351149989408924, + "learning_rate": 8.637736446869362e-06, + "loss": 0.5379, + "step": 2123 + }, + { + "epoch": 0.26, + "grad_norm": 1.7684832724906134, + "learning_rate": 8.63635737022682e-06, + "loss": 0.506, + "step": 2124 + }, + { + "epoch": 0.26, + "grad_norm": 1.5063907422372096, + "learning_rate": 8.634977706098721e-06, + "loss": 0.5006, + "step": 2125 + }, + { + "epoch": 0.26, + "grad_norm": 1.3146592975273879, + "learning_rate": 8.633597454707959e-06, + "loss": 0.5306, + "step": 2126 + }, + { + "epoch": 0.26, + "grad_norm": 1.3187236273362868, + "learning_rate": 8.632216616277527e-06, + "loss": 0.4807, + "step": 2127 + }, + { + "epoch": 0.26, + "grad_norm": 2.077537638693262, + "learning_rate": 8.630835191030508e-06, + "loss": 0.5125, + "step": 2128 + }, + { + "epoch": 0.26, + "grad_norm": 1.4717538690131045, + "learning_rate": 8.62945317919009e-06, + "loss": 0.5009, + "step": 2129 + }, + { + "epoch": 0.26, + "grad_norm": 1.5072010277133576, + "learning_rate": 8.628070580979544e-06, + "loss": 0.5108, + "step": 2130 + }, + { + "epoch": 0.26, + "grad_norm": 1.7909802743135568, + "learning_rate": 8.626687396622242e-06, + "loss": 0.4817, + "step": 2131 + }, + { + "epoch": 0.26, + "grad_norm": 1.522209883270738, + "learning_rate": 8.62530362634165e-06, + "loss": 0.5689, + "step": 2132 + }, + { + "epoch": 0.26, + "grad_norm": 1.6072768318168984, + "learning_rate": 8.623919270361329e-06, + "loss": 0.4947, + "step": 2133 + }, + { + "epoch": 0.26, + "grad_norm": 1.479690164963758, + "learning_rate": 8.622534328904932e-06, + "loss": 0.5179, + "step": 2134 + }, + { + "epoch": 0.26, + "grad_norm": 1.2871577638299292, + "learning_rate": 8.621148802196211e-06, + "loss": 0.4856, + "step": 2135 + }, + { + "epoch": 0.27, + "grad_norm": 1.6382205250968358, + "learning_rate": 8.619762690459008e-06, + "loss": 0.5136, + "step": 2136 + }, + { + "epoch": 0.27, + "grad_norm": 0.6979597886359754, + "learning_rate": 8.618375993917263e-06, + "loss": 0.4986, + "step": 2137 + }, + { + "epoch": 0.27, + "grad_norm": 1.3555980447677438, + "learning_rate": 8.616988712795008e-06, + "loss": 0.4785, + "step": 2138 + }, + { + "epoch": 0.27, + "grad_norm": 4.748291939308693, + "learning_rate": 8.615600847316372e-06, + "loss": 0.5331, + "step": 2139 + }, + { + "epoch": 0.27, + "grad_norm": 2.00088888375592, + "learning_rate": 8.614212397705575e-06, + "loss": 0.5701, + "step": 2140 + }, + { + "epoch": 0.27, + "grad_norm": 1.573457827272612, + "learning_rate": 8.612823364186933e-06, + "loss": 0.4721, + "step": 2141 + }, + { + "epoch": 0.27, + "grad_norm": 1.551083989047537, + "learning_rate": 8.611433746984858e-06, + "loss": 0.5552, + "step": 2142 + }, + { + "epoch": 0.27, + "grad_norm": 1.4912064054984209, + "learning_rate": 8.610043546323855e-06, + "loss": 0.5488, + "step": 2143 + }, + { + "epoch": 0.27, + "grad_norm": 1.2760824885449171, + "learning_rate": 8.608652762428521e-06, + "loss": 0.5275, + "step": 2144 + }, + { + "epoch": 0.27, + "grad_norm": 1.5981452550912965, + "learning_rate": 8.607261395523554e-06, + "loss": 0.4953, + "step": 2145 + }, + { + "epoch": 0.27, + "grad_norm": 1.4329804648625786, + "learning_rate": 8.605869445833737e-06, + "loss": 0.5042, + "step": 2146 + }, + { + "epoch": 0.27, + "grad_norm": 1.3602541838086386, + "learning_rate": 8.604476913583955e-06, + "loss": 0.5388, + "step": 2147 + }, + { + "epoch": 0.27, + "grad_norm": 2.7925939709059873, + "learning_rate": 8.60308379899918e-06, + "loss": 0.5228, + "step": 2148 + }, + { + "epoch": 0.27, + "grad_norm": 1.2888107219051665, + "learning_rate": 8.601690102304486e-06, + "loss": 0.5206, + "step": 2149 + }, + { + "epoch": 0.27, + "grad_norm": 1.3557563294974224, + "learning_rate": 8.600295823725033e-06, + "loss": 0.5455, + "step": 2150 + }, + { + "epoch": 0.27, + "grad_norm": 1.3427580417032645, + "learning_rate": 8.598900963486083e-06, + "loss": 0.5269, + "step": 2151 + }, + { + "epoch": 0.27, + "grad_norm": 1.4120668579056157, + "learning_rate": 8.597505521812984e-06, + "loss": 0.5046, + "step": 2152 + }, + { + "epoch": 0.27, + "grad_norm": 1.3108192996952082, + "learning_rate": 8.596109498931185e-06, + "loss": 0.5103, + "step": 2153 + }, + { + "epoch": 0.27, + "grad_norm": 1.5495240706755715, + "learning_rate": 8.594712895066226e-06, + "loss": 0.4984, + "step": 2154 + }, + { + "epoch": 0.27, + "grad_norm": 1.3005502095759358, + "learning_rate": 8.593315710443739e-06, + "loss": 0.549, + "step": 2155 + }, + { + "epoch": 0.27, + "grad_norm": 1.4965928633369276, + "learning_rate": 8.59191794528945e-06, + "loss": 0.4768, + "step": 2156 + }, + { + "epoch": 0.27, + "grad_norm": 1.3650726161121465, + "learning_rate": 8.590519599829184e-06, + "loss": 0.5329, + "step": 2157 + }, + { + "epoch": 0.27, + "grad_norm": 1.4564801871656392, + "learning_rate": 8.589120674288853e-06, + "loss": 0.4863, + "step": 2158 + }, + { + "epoch": 0.27, + "grad_norm": 1.632300836612375, + "learning_rate": 8.587721168894466e-06, + "loss": 0.5334, + "step": 2159 + }, + { + "epoch": 0.27, + "grad_norm": 1.1623408678050393, + "learning_rate": 8.58632108387213e-06, + "loss": 0.4864, + "step": 2160 + }, + { + "epoch": 0.27, + "grad_norm": 2.1603521892398447, + "learning_rate": 8.584920419448035e-06, + "loss": 0.4918, + "step": 2161 + }, + { + "epoch": 0.27, + "grad_norm": 1.2579644869776925, + "learning_rate": 8.583519175848474e-06, + "loss": 0.4942, + "step": 2162 + }, + { + "epoch": 0.27, + "grad_norm": 2.129015625363322, + "learning_rate": 8.582117353299828e-06, + "loss": 0.469, + "step": 2163 + }, + { + "epoch": 0.27, + "grad_norm": 1.924953776540199, + "learning_rate": 8.580714952028575e-06, + "loss": 0.5748, + "step": 2164 + }, + { + "epoch": 0.27, + "grad_norm": 1.507743860015434, + "learning_rate": 8.579311972261286e-06, + "loss": 0.5841, + "step": 2165 + }, + { + "epoch": 0.27, + "grad_norm": 1.480253101467622, + "learning_rate": 8.577908414224625e-06, + "loss": 0.5915, + "step": 2166 + }, + { + "epoch": 0.27, + "grad_norm": 1.3203271426351713, + "learning_rate": 8.576504278145348e-06, + "loss": 0.4924, + "step": 2167 + }, + { + "epoch": 0.27, + "grad_norm": 1.2872361086433888, + "learning_rate": 8.575099564250304e-06, + "loss": 0.5593, + "step": 2168 + }, + { + "epoch": 0.27, + "grad_norm": 1.4261526727740197, + "learning_rate": 8.57369427276644e-06, + "loss": 0.5528, + "step": 2169 + }, + { + "epoch": 0.27, + "grad_norm": 1.435250926712347, + "learning_rate": 8.572288403920791e-06, + "loss": 0.5494, + "step": 2170 + }, + { + "epoch": 0.27, + "grad_norm": 1.3595312099940866, + "learning_rate": 8.570881957940491e-06, + "loss": 0.5357, + "step": 2171 + }, + { + "epoch": 0.27, + "grad_norm": 1.7425534944356988, + "learning_rate": 8.56947493505276e-06, + "loss": 0.5608, + "step": 2172 + }, + { + "epoch": 0.27, + "grad_norm": 1.5923240614789778, + "learning_rate": 8.568067335484915e-06, + "loss": 0.5255, + "step": 2173 + }, + { + "epoch": 0.27, + "grad_norm": 1.3081107286282319, + "learning_rate": 8.566659159464367e-06, + "loss": 0.5069, + "step": 2174 + }, + { + "epoch": 0.27, + "grad_norm": 1.2351708937483399, + "learning_rate": 8.565250407218622e-06, + "loss": 0.506, + "step": 2175 + }, + { + "epoch": 0.27, + "grad_norm": 1.5097611716714756, + "learning_rate": 8.563841078975273e-06, + "loss": 0.5172, + "step": 2176 + }, + { + "epoch": 0.27, + "grad_norm": 5.71048705718627, + "learning_rate": 8.562431174962009e-06, + "loss": 0.5111, + "step": 2177 + }, + { + "epoch": 0.27, + "grad_norm": 1.5677375671906304, + "learning_rate": 8.561020695406614e-06, + "loss": 0.5155, + "step": 2178 + }, + { + "epoch": 0.27, + "grad_norm": 1.5195255872079538, + "learning_rate": 8.559609640536962e-06, + "loss": 0.5085, + "step": 2179 + }, + { + "epoch": 0.27, + "grad_norm": 1.541472808973938, + "learning_rate": 8.558198010581022e-06, + "loss": 0.5067, + "step": 2180 + }, + { + "epoch": 0.27, + "grad_norm": 1.5948220379252283, + "learning_rate": 8.556785805766859e-06, + "loss": 0.4828, + "step": 2181 + }, + { + "epoch": 0.27, + "grad_norm": 1.3828488243455312, + "learning_rate": 8.55537302632262e-06, + "loss": 0.5429, + "step": 2182 + }, + { + "epoch": 0.27, + "grad_norm": 1.6313382434068164, + "learning_rate": 8.553959672476558e-06, + "loss": 0.4702, + "step": 2183 + }, + { + "epoch": 0.27, + "grad_norm": 1.9515593617267215, + "learning_rate": 8.55254574445701e-06, + "loss": 0.5574, + "step": 2184 + }, + { + "epoch": 0.27, + "grad_norm": 1.6291863643827857, + "learning_rate": 8.551131242492407e-06, + "loss": 0.5196, + "step": 2185 + }, + { + "epoch": 0.27, + "grad_norm": 1.4610097753460443, + "learning_rate": 8.54971616681128e-06, + "loss": 0.486, + "step": 2186 + }, + { + "epoch": 0.27, + "grad_norm": 1.243411626461023, + "learning_rate": 8.54830051764224e-06, + "loss": 0.4869, + "step": 2187 + }, + { + "epoch": 0.27, + "grad_norm": 1.5254988279151824, + "learning_rate": 8.546884295214002e-06, + "loss": 0.5065, + "step": 2188 + }, + { + "epoch": 0.27, + "grad_norm": 1.348325328290899, + "learning_rate": 8.545467499755366e-06, + "loss": 0.5287, + "step": 2189 + }, + { + "epoch": 0.27, + "grad_norm": 1.4484265724296916, + "learning_rate": 8.544050131495233e-06, + "loss": 0.4964, + "step": 2190 + }, + { + "epoch": 0.27, + "grad_norm": 1.3966660469184176, + "learning_rate": 8.542632190662586e-06, + "loss": 0.5599, + "step": 2191 + }, + { + "epoch": 0.27, + "grad_norm": 3.7947891690402593, + "learning_rate": 8.541213677486509e-06, + "loss": 0.5735, + "step": 2192 + }, + { + "epoch": 0.27, + "grad_norm": 1.306427889420237, + "learning_rate": 8.539794592196173e-06, + "loss": 0.5341, + "step": 2193 + }, + { + "epoch": 0.27, + "grad_norm": 2.0480279716104333, + "learning_rate": 8.538374935020846e-06, + "loss": 0.5067, + "step": 2194 + }, + { + "epoch": 0.27, + "grad_norm": 1.3681137444792715, + "learning_rate": 8.536954706189883e-06, + "loss": 0.5132, + "step": 2195 + }, + { + "epoch": 0.27, + "grad_norm": 1.7656680141879872, + "learning_rate": 8.535533905932739e-06, + "loss": 0.5815, + "step": 2196 + }, + { + "epoch": 0.27, + "grad_norm": 1.2767603247891768, + "learning_rate": 8.534112534478953e-06, + "loss": 0.4554, + "step": 2197 + }, + { + "epoch": 0.27, + "grad_norm": 1.5866330370451214, + "learning_rate": 8.532690592058161e-06, + "loss": 0.5033, + "step": 2198 + }, + { + "epoch": 0.27, + "grad_norm": 1.9089032305015174, + "learning_rate": 8.53126807890009e-06, + "loss": 0.5001, + "step": 2199 + }, + { + "epoch": 0.27, + "grad_norm": 1.474155149864476, + "learning_rate": 8.529844995234563e-06, + "loss": 0.5069, + "step": 2200 + }, + { + "epoch": 0.27, + "grad_norm": 1.4499060008784328, + "learning_rate": 8.528421341291488e-06, + "loss": 0.5084, + "step": 2201 + }, + { + "epoch": 0.27, + "grad_norm": 1.3897034097569219, + "learning_rate": 8.526997117300868e-06, + "loss": 0.5091, + "step": 2202 + }, + { + "epoch": 0.27, + "grad_norm": 1.3914446118557149, + "learning_rate": 8.525572323492803e-06, + "loss": 0.5164, + "step": 2203 + }, + { + "epoch": 0.27, + "grad_norm": 3.0644901750892553, + "learning_rate": 8.524146960097476e-06, + "loss": 0.4661, + "step": 2204 + }, + { + "epoch": 0.27, + "grad_norm": 1.5115707604600233, + "learning_rate": 8.522721027345173e-06, + "loss": 0.5254, + "step": 2205 + }, + { + "epoch": 0.27, + "grad_norm": 1.529006318561144, + "learning_rate": 8.52129452546626e-06, + "loss": 0.5786, + "step": 2206 + }, + { + "epoch": 0.27, + "grad_norm": 1.6774863769516049, + "learning_rate": 8.519867454691204e-06, + "loss": 0.505, + "step": 2207 + }, + { + "epoch": 0.27, + "grad_norm": 1.7550570033418513, + "learning_rate": 8.518439815250561e-06, + "loss": 0.5145, + "step": 2208 + }, + { + "epoch": 0.27, + "grad_norm": 1.8311153441470829, + "learning_rate": 8.517011607374978e-06, + "loss": 0.494, + "step": 2209 + }, + { + "epoch": 0.27, + "grad_norm": 1.508065531312494, + "learning_rate": 8.515582831295195e-06, + "loss": 0.5514, + "step": 2210 + }, + { + "epoch": 0.27, + "grad_norm": 3.8391102633980023, + "learning_rate": 8.514153487242042e-06, + "loss": 0.5094, + "step": 2211 + }, + { + "epoch": 0.27, + "grad_norm": 1.6333778269326167, + "learning_rate": 8.512723575446446e-06, + "loss": 0.5497, + "step": 2212 + }, + { + "epoch": 0.27, + "grad_norm": 1.5528173792460196, + "learning_rate": 8.511293096139417e-06, + "loss": 0.551, + "step": 2213 + }, + { + "epoch": 0.27, + "grad_norm": 1.5720104471169536, + "learning_rate": 8.509862049552065e-06, + "loss": 0.4421, + "step": 2214 + }, + { + "epoch": 0.27, + "grad_norm": 1.3400011066040898, + "learning_rate": 8.508430435915584e-06, + "loss": 0.4903, + "step": 2215 + }, + { + "epoch": 0.28, + "grad_norm": 1.5794600856104057, + "learning_rate": 8.506998255461269e-06, + "loss": 0.494, + "step": 2216 + }, + { + "epoch": 0.28, + "grad_norm": 1.5033180990212154, + "learning_rate": 8.505565508420498e-06, + "loss": 0.5592, + "step": 2217 + }, + { + "epoch": 0.28, + "grad_norm": 1.359279658821798, + "learning_rate": 8.504132195024747e-06, + "loss": 0.546, + "step": 2218 + }, + { + "epoch": 0.28, + "grad_norm": 1.4952159845406447, + "learning_rate": 8.502698315505573e-06, + "loss": 0.5359, + "step": 2219 + }, + { + "epoch": 0.28, + "grad_norm": 1.5247955931103605, + "learning_rate": 8.501263870094642e-06, + "loss": 0.5179, + "step": 2220 + }, + { + "epoch": 0.28, + "grad_norm": 1.7782806600137773, + "learning_rate": 8.499828859023696e-06, + "loss": 0.5474, + "step": 2221 + }, + { + "epoch": 0.28, + "grad_norm": 1.5995731206263903, + "learning_rate": 8.498393282524572e-06, + "loss": 0.4935, + "step": 2222 + }, + { + "epoch": 0.28, + "grad_norm": 1.2088287114101004, + "learning_rate": 8.496957140829203e-06, + "loss": 0.4073, + "step": 2223 + }, + { + "epoch": 0.28, + "grad_norm": 2.0638784444279548, + "learning_rate": 8.495520434169609e-06, + "loss": 0.5159, + "step": 2224 + }, + { + "epoch": 0.28, + "grad_norm": 1.497881780594768, + "learning_rate": 8.494083162777903e-06, + "loss": 0.5605, + "step": 2225 + }, + { + "epoch": 0.28, + "grad_norm": 1.413788779749229, + "learning_rate": 8.492645326886291e-06, + "loss": 0.5486, + "step": 2226 + }, + { + "epoch": 0.28, + "grad_norm": 2.654873395286464, + "learning_rate": 8.491206926727064e-06, + "loss": 0.5209, + "step": 2227 + }, + { + "epoch": 0.28, + "grad_norm": 0.6926087836331076, + "learning_rate": 8.489767962532611e-06, + "loss": 0.5036, + "step": 2228 + }, + { + "epoch": 0.28, + "grad_norm": 1.422679228906343, + "learning_rate": 8.488328434535408e-06, + "loss": 0.5435, + "step": 2229 + }, + { + "epoch": 0.28, + "grad_norm": 2.1664800474007877, + "learning_rate": 8.486888342968023e-06, + "loss": 0.4997, + "step": 2230 + }, + { + "epoch": 0.28, + "grad_norm": 1.4461426110457904, + "learning_rate": 8.485447688063117e-06, + "loss": 0.522, + "step": 2231 + }, + { + "epoch": 0.28, + "grad_norm": 2.1007331833215424, + "learning_rate": 8.484006470053441e-06, + "loss": 0.498, + "step": 2232 + }, + { + "epoch": 0.28, + "grad_norm": 1.3641751224103698, + "learning_rate": 8.482564689171834e-06, + "loss": 0.4979, + "step": 2233 + }, + { + "epoch": 0.28, + "grad_norm": 1.5884561346617245, + "learning_rate": 8.481122345651233e-06, + "loss": 0.5074, + "step": 2234 + }, + { + "epoch": 0.28, + "grad_norm": 1.4192325095037206, + "learning_rate": 8.479679439724654e-06, + "loss": 0.5471, + "step": 2235 + }, + { + "epoch": 0.28, + "grad_norm": 1.5989179484159823, + "learning_rate": 8.478235971625218e-06, + "loss": 0.5239, + "step": 2236 + }, + { + "epoch": 0.28, + "grad_norm": 1.8668321080947397, + "learning_rate": 8.476791941586126e-06, + "loss": 0.5037, + "step": 2237 + }, + { + "epoch": 0.28, + "grad_norm": 1.5158066815377742, + "learning_rate": 8.475347349840674e-06, + "loss": 0.5369, + "step": 2238 + }, + { + "epoch": 0.28, + "grad_norm": 1.3645982218539816, + "learning_rate": 8.473902196622252e-06, + "loss": 0.5108, + "step": 2239 + }, + { + "epoch": 0.28, + "grad_norm": 1.265074006713083, + "learning_rate": 8.472456482164332e-06, + "loss": 0.498, + "step": 2240 + }, + { + "epoch": 0.28, + "grad_norm": 1.2999038326538637, + "learning_rate": 8.471010206700488e-06, + "loss": 0.5254, + "step": 2241 + }, + { + "epoch": 0.28, + "grad_norm": 1.6060369052945243, + "learning_rate": 8.469563370464372e-06, + "loss": 0.5791, + "step": 2242 + }, + { + "epoch": 0.28, + "grad_norm": 1.426987421889657, + "learning_rate": 8.468115973689739e-06, + "loss": 0.5261, + "step": 2243 + }, + { + "epoch": 0.28, + "grad_norm": 1.3635787901340615, + "learning_rate": 8.466668016610423e-06, + "loss": 0.5393, + "step": 2244 + }, + { + "epoch": 0.28, + "grad_norm": 2.369209352696687, + "learning_rate": 8.46521949946036e-06, + "loss": 0.508, + "step": 2245 + }, + { + "epoch": 0.28, + "grad_norm": 3.8226736763186273, + "learning_rate": 8.463770422473566e-06, + "loss": 0.5892, + "step": 2246 + }, + { + "epoch": 0.28, + "grad_norm": 1.5394075544765462, + "learning_rate": 8.462320785884155e-06, + "loss": 0.5151, + "step": 2247 + }, + { + "epoch": 0.28, + "grad_norm": 1.815742457072043, + "learning_rate": 8.460870589926327e-06, + "loss": 0.5484, + "step": 2248 + }, + { + "epoch": 0.28, + "grad_norm": 1.4875209217946046, + "learning_rate": 8.459419834834374e-06, + "loss": 0.5414, + "step": 2249 + }, + { + "epoch": 0.28, + "grad_norm": 1.274126755614334, + "learning_rate": 8.45796852084268e-06, + "loss": 0.4801, + "step": 2250 + }, + { + "epoch": 0.28, + "grad_norm": 1.4106176029586979, + "learning_rate": 8.456516648185717e-06, + "loss": 0.4854, + "step": 2251 + }, + { + "epoch": 0.28, + "grad_norm": 1.621250166058811, + "learning_rate": 8.455064217098046e-06, + "loss": 0.5174, + "step": 2252 + }, + { + "epoch": 0.28, + "grad_norm": 0.6601874583519945, + "learning_rate": 8.453611227814322e-06, + "loss": 0.496, + "step": 2253 + }, + { + "epoch": 0.28, + "grad_norm": 1.1827510625394257, + "learning_rate": 8.452157680569287e-06, + "loss": 0.5282, + "step": 2254 + }, + { + "epoch": 0.28, + "grad_norm": 1.4471288796504076, + "learning_rate": 8.450703575597775e-06, + "loss": 0.4927, + "step": 2255 + }, + { + "epoch": 0.28, + "grad_norm": 1.569274387714447, + "learning_rate": 8.449248913134709e-06, + "loss": 0.5473, + "step": 2256 + }, + { + "epoch": 0.28, + "grad_norm": 1.221832571991313, + "learning_rate": 8.447793693415103e-06, + "loss": 0.5164, + "step": 2257 + }, + { + "epoch": 0.28, + "grad_norm": 1.4929632157250627, + "learning_rate": 8.446337916674062e-06, + "loss": 0.5397, + "step": 2258 + }, + { + "epoch": 0.28, + "grad_norm": 1.4075589123438812, + "learning_rate": 8.444881583146776e-06, + "loss": 0.5151, + "step": 2259 + }, + { + "epoch": 0.28, + "grad_norm": 2.2567410524008205, + "learning_rate": 8.44342469306853e-06, + "loss": 0.5171, + "step": 2260 + }, + { + "epoch": 0.28, + "grad_norm": 2.27510092229832, + "learning_rate": 8.441967246674698e-06, + "loss": 0.5706, + "step": 2261 + }, + { + "epoch": 0.28, + "grad_norm": 1.7994978265127097, + "learning_rate": 8.440509244200743e-06, + "loss": 0.5459, + "step": 2262 + }, + { + "epoch": 0.28, + "grad_norm": 3.6112199363869086, + "learning_rate": 8.43905068588222e-06, + "loss": 0.5131, + "step": 2263 + }, + { + "epoch": 0.28, + "grad_norm": 1.9579492063060595, + "learning_rate": 8.437591571954768e-06, + "loss": 0.4655, + "step": 2264 + }, + { + "epoch": 0.28, + "grad_norm": 1.6073308539208877, + "learning_rate": 8.436131902654123e-06, + "loss": 0.5265, + "step": 2265 + }, + { + "epoch": 0.28, + "grad_norm": 1.7586176288861322, + "learning_rate": 8.43467167821611e-06, + "loss": 0.5683, + "step": 2266 + }, + { + "epoch": 0.28, + "grad_norm": 1.3915881330613307, + "learning_rate": 8.433210898876632e-06, + "loss": 0.4483, + "step": 2267 + }, + { + "epoch": 0.28, + "grad_norm": 1.8019170267996496, + "learning_rate": 8.431749564871698e-06, + "loss": 0.5564, + "step": 2268 + }, + { + "epoch": 0.28, + "grad_norm": 3.1320048897516775, + "learning_rate": 8.430287676437399e-06, + "loss": 0.5397, + "step": 2269 + }, + { + "epoch": 0.28, + "grad_norm": 8.967697613735462, + "learning_rate": 8.428825233809914e-06, + "loss": 0.5517, + "step": 2270 + }, + { + "epoch": 0.28, + "grad_norm": 1.446804864822726, + "learning_rate": 8.427362237225513e-06, + "loss": 0.5354, + "step": 2271 + }, + { + "epoch": 0.28, + "grad_norm": 1.5128368460256831, + "learning_rate": 8.425898686920557e-06, + "loss": 0.5594, + "step": 2272 + }, + { + "epoch": 0.28, + "grad_norm": 1.3255339105369666, + "learning_rate": 8.424434583131496e-06, + "loss": 0.4988, + "step": 2273 + }, + { + "epoch": 0.28, + "grad_norm": 2.1360665417876894, + "learning_rate": 8.42296992609487e-06, + "loss": 0.5444, + "step": 2274 + }, + { + "epoch": 0.28, + "grad_norm": 1.4927309323485651, + "learning_rate": 8.421504716047305e-06, + "loss": 0.5291, + "step": 2275 + }, + { + "epoch": 0.28, + "grad_norm": 1.4284228996787336, + "learning_rate": 8.420038953225518e-06, + "loss": 0.4888, + "step": 2276 + }, + { + "epoch": 0.28, + "grad_norm": 2.98282046193224, + "learning_rate": 8.418572637866316e-06, + "loss": 0.4727, + "step": 2277 + }, + { + "epoch": 0.28, + "grad_norm": 1.3807713542293747, + "learning_rate": 8.417105770206598e-06, + "loss": 0.523, + "step": 2278 + }, + { + "epoch": 0.28, + "grad_norm": 1.8528045536909103, + "learning_rate": 8.415638350483348e-06, + "loss": 0.5091, + "step": 2279 + }, + { + "epoch": 0.28, + "grad_norm": 1.632035367705425, + "learning_rate": 8.41417037893364e-06, + "loss": 0.5403, + "step": 2280 + }, + { + "epoch": 0.28, + "grad_norm": 0.7208618427001102, + "learning_rate": 8.412701855794637e-06, + "loss": 0.5022, + "step": 2281 + }, + { + "epoch": 0.28, + "grad_norm": 1.4717548446224873, + "learning_rate": 8.411232781303593e-06, + "loss": 0.4975, + "step": 2282 + }, + { + "epoch": 0.28, + "grad_norm": 1.4415037838550437, + "learning_rate": 8.409763155697852e-06, + "loss": 0.553, + "step": 2283 + }, + { + "epoch": 0.28, + "grad_norm": 1.6165367478786659, + "learning_rate": 8.40829297921484e-06, + "loss": 0.4942, + "step": 2284 + }, + { + "epoch": 0.28, + "grad_norm": 1.4188139758484897, + "learning_rate": 8.40682225209208e-06, + "loss": 0.4914, + "step": 2285 + }, + { + "epoch": 0.28, + "grad_norm": 2.0225226153497595, + "learning_rate": 8.405350974567182e-06, + "loss": 0.4985, + "step": 2286 + }, + { + "epoch": 0.28, + "grad_norm": 1.6167432111333073, + "learning_rate": 8.403879146877841e-06, + "loss": 0.4977, + "step": 2287 + }, + { + "epoch": 0.28, + "grad_norm": 1.5411985035248754, + "learning_rate": 8.402406769261846e-06, + "loss": 0.563, + "step": 2288 + }, + { + "epoch": 0.28, + "grad_norm": 1.4103723311543415, + "learning_rate": 8.400933841957072e-06, + "loss": 0.4985, + "step": 2289 + }, + { + "epoch": 0.28, + "grad_norm": 1.4687767595342092, + "learning_rate": 8.399460365201481e-06, + "loss": 0.4976, + "step": 2290 + }, + { + "epoch": 0.28, + "grad_norm": 1.9856889826237913, + "learning_rate": 8.397986339233128e-06, + "loss": 0.5085, + "step": 2291 + }, + { + "epoch": 0.28, + "grad_norm": 1.7220986166950227, + "learning_rate": 8.396511764290158e-06, + "loss": 0.5138, + "step": 2292 + }, + { + "epoch": 0.28, + "grad_norm": 1.421132299453895, + "learning_rate": 8.395036640610796e-06, + "loss": 0.5148, + "step": 2293 + }, + { + "epoch": 0.28, + "grad_norm": 1.8056847578745472, + "learning_rate": 8.393560968433366e-06, + "loss": 0.5342, + "step": 2294 + }, + { + "epoch": 0.28, + "grad_norm": 1.5731443883103524, + "learning_rate": 8.392084747996275e-06, + "loss": 0.4951, + "step": 2295 + }, + { + "epoch": 0.28, + "grad_norm": 1.265804124392798, + "learning_rate": 8.390607979538014e-06, + "loss": 0.4872, + "step": 2296 + }, + { + "epoch": 0.29, + "grad_norm": 2.297878688851731, + "learning_rate": 8.389130663297175e-06, + "loss": 0.5321, + "step": 2297 + }, + { + "epoch": 0.29, + "grad_norm": 1.8342068119319483, + "learning_rate": 8.387652799512427e-06, + "loss": 0.5113, + "step": 2298 + }, + { + "epoch": 0.29, + "grad_norm": 1.34842173900626, + "learning_rate": 8.386174388422535e-06, + "loss": 0.531, + "step": 2299 + }, + { + "epoch": 0.29, + "grad_norm": 1.59218032004979, + "learning_rate": 8.384695430266348e-06, + "loss": 0.5145, + "step": 2300 + }, + { + "epoch": 0.29, + "grad_norm": 1.5341020416331983, + "learning_rate": 8.383215925282802e-06, + "loss": 0.5395, + "step": 2301 + }, + { + "epoch": 0.29, + "grad_norm": 1.7895960441730228, + "learning_rate": 8.381735873710928e-06, + "loss": 0.4936, + "step": 2302 + }, + { + "epoch": 0.29, + "grad_norm": 1.5468634833190629, + "learning_rate": 8.38025527578984e-06, + "loss": 0.4864, + "step": 2303 + }, + { + "epoch": 0.29, + "grad_norm": 0.7169385720661268, + "learning_rate": 8.378774131758742e-06, + "loss": 0.5151, + "step": 2304 + }, + { + "epoch": 0.29, + "grad_norm": 1.7764810966446116, + "learning_rate": 8.377292441856926e-06, + "loss": 0.5585, + "step": 2305 + }, + { + "epoch": 0.29, + "grad_norm": 1.4247792641176793, + "learning_rate": 8.37581020632377e-06, + "loss": 0.5499, + "step": 2306 + }, + { + "epoch": 0.29, + "grad_norm": 3.24826140127667, + "learning_rate": 8.374327425398744e-06, + "loss": 0.5943, + "step": 2307 + }, + { + "epoch": 0.29, + "grad_norm": 1.5190074639788063, + "learning_rate": 8.372844099321404e-06, + "loss": 0.4983, + "step": 2308 + }, + { + "epoch": 0.29, + "grad_norm": 1.3870284274978932, + "learning_rate": 8.371360228331393e-06, + "loss": 0.4672, + "step": 2309 + }, + { + "epoch": 0.29, + "grad_norm": 1.8711754548899875, + "learning_rate": 8.369875812668449e-06, + "loss": 0.5546, + "step": 2310 + }, + { + "epoch": 0.29, + "grad_norm": 1.444759030629122, + "learning_rate": 8.368390852572384e-06, + "loss": 0.5414, + "step": 2311 + }, + { + "epoch": 0.29, + "grad_norm": 1.2826081853724405, + "learning_rate": 8.366905348283114e-06, + "loss": 0.4739, + "step": 2312 + }, + { + "epoch": 0.29, + "grad_norm": 1.6215351080572908, + "learning_rate": 8.365419300040628e-06, + "loss": 0.4838, + "step": 2313 + }, + { + "epoch": 0.29, + "grad_norm": 1.368664685893942, + "learning_rate": 8.363932708085016e-06, + "loss": 0.4851, + "step": 2314 + }, + { + "epoch": 0.29, + "grad_norm": 1.268709114460342, + "learning_rate": 8.362445572656451e-06, + "loss": 0.4537, + "step": 2315 + }, + { + "epoch": 0.29, + "grad_norm": 1.2595580792264183, + "learning_rate": 8.360957893995187e-06, + "loss": 0.5065, + "step": 2316 + }, + { + "epoch": 0.29, + "grad_norm": 1.3180728188145547, + "learning_rate": 8.359469672341574e-06, + "loss": 0.449, + "step": 2317 + }, + { + "epoch": 0.29, + "grad_norm": 1.3748707275140444, + "learning_rate": 8.357980907936048e-06, + "loss": 0.5071, + "step": 2318 + }, + { + "epoch": 0.29, + "grad_norm": 2.2182849061073715, + "learning_rate": 8.356491601019135e-06, + "loss": 0.5262, + "step": 2319 + }, + { + "epoch": 0.29, + "grad_norm": 1.8497839329843278, + "learning_rate": 8.35500175183144e-06, + "loss": 0.5182, + "step": 2320 + }, + { + "epoch": 0.29, + "grad_norm": 3.7064279805028324, + "learning_rate": 8.353511360613665e-06, + "loss": 0.488, + "step": 2321 + }, + { + "epoch": 0.29, + "grad_norm": 1.696234532399231, + "learning_rate": 8.352020427606591e-06, + "loss": 0.5753, + "step": 2322 + }, + { + "epoch": 0.29, + "grad_norm": 1.6188147545472062, + "learning_rate": 8.350528953051098e-06, + "loss": 0.5407, + "step": 2323 + }, + { + "epoch": 0.29, + "grad_norm": 1.3692002533058387, + "learning_rate": 8.349036937188143e-06, + "loss": 0.5252, + "step": 2324 + }, + { + "epoch": 0.29, + "grad_norm": 1.443875711526957, + "learning_rate": 8.347544380258777e-06, + "loss": 0.5334, + "step": 2325 + }, + { + "epoch": 0.29, + "grad_norm": 1.4178971172449506, + "learning_rate": 8.34605128250413e-06, + "loss": 0.5319, + "step": 2326 + }, + { + "epoch": 0.29, + "grad_norm": 1.2259432902634666, + "learning_rate": 8.344557644165431e-06, + "loss": 0.5138, + "step": 2327 + }, + { + "epoch": 0.29, + "grad_norm": 0.6778873298316646, + "learning_rate": 8.34306346548399e-06, + "loss": 0.5238, + "step": 2328 + }, + { + "epoch": 0.29, + "grad_norm": 1.4288498248716177, + "learning_rate": 8.341568746701202e-06, + "loss": 0.5337, + "step": 2329 + }, + { + "epoch": 0.29, + "grad_norm": 1.5684048901647027, + "learning_rate": 8.340073488058552e-06, + "loss": 0.4903, + "step": 2330 + }, + { + "epoch": 0.29, + "grad_norm": 1.2599485203250966, + "learning_rate": 8.338577689797615e-06, + "loss": 0.5134, + "step": 2331 + }, + { + "epoch": 0.29, + "grad_norm": 1.4253859816398988, + "learning_rate": 8.337081352160048e-06, + "loss": 0.505, + "step": 2332 + }, + { + "epoch": 0.29, + "grad_norm": 2.1339071401428553, + "learning_rate": 8.335584475387597e-06, + "loss": 0.5273, + "step": 2333 + }, + { + "epoch": 0.29, + "grad_norm": 1.4778341026992288, + "learning_rate": 8.334087059722097e-06, + "loss": 0.4823, + "step": 2334 + }, + { + "epoch": 0.29, + "grad_norm": 1.63766176270152, + "learning_rate": 8.33258910540547e-06, + "loss": 0.4946, + "step": 2335 + }, + { + "epoch": 0.29, + "grad_norm": 1.888945724040837, + "learning_rate": 8.33109061267972e-06, + "loss": 0.5359, + "step": 2336 + }, + { + "epoch": 0.29, + "grad_norm": 0.6516161391520482, + "learning_rate": 8.329591581786946e-06, + "loss": 0.4986, + "step": 2337 + }, + { + "epoch": 0.29, + "grad_norm": 1.5261816214983497, + "learning_rate": 8.328092012969327e-06, + "loss": 0.5083, + "step": 2338 + }, + { + "epoch": 0.29, + "grad_norm": 1.4821802013711955, + "learning_rate": 8.326591906469132e-06, + "loss": 0.5237, + "step": 2339 + }, + { + "epoch": 0.29, + "grad_norm": 1.6683955615857764, + "learning_rate": 8.325091262528715e-06, + "loss": 0.5683, + "step": 2340 + }, + { + "epoch": 0.29, + "grad_norm": 1.357707619827874, + "learning_rate": 8.323590081390522e-06, + "loss": 0.5236, + "step": 2341 + }, + { + "epoch": 0.29, + "grad_norm": 1.457783527864208, + "learning_rate": 8.322088363297078e-06, + "loss": 0.4967, + "step": 2342 + }, + { + "epoch": 0.29, + "grad_norm": 1.7460405208068128, + "learning_rate": 8.320586108491002e-06, + "loss": 0.5097, + "step": 2343 + }, + { + "epoch": 0.29, + "grad_norm": 1.7744582128639608, + "learning_rate": 8.319083317214996e-06, + "loss": 0.5355, + "step": 2344 + }, + { + "epoch": 0.29, + "grad_norm": 1.5390306253936152, + "learning_rate": 8.317579989711846e-06, + "loss": 0.5013, + "step": 2345 + }, + { + "epoch": 0.29, + "grad_norm": 0.7215048565142089, + "learning_rate": 8.31607612622443e-06, + "loss": 0.4965, + "step": 2346 + }, + { + "epoch": 0.29, + "grad_norm": 1.4736527077899817, + "learning_rate": 8.314571726995711e-06, + "loss": 0.5626, + "step": 2347 + }, + { + "epoch": 0.29, + "grad_norm": 1.8277857445033219, + "learning_rate": 8.313066792268737e-06, + "loss": 0.5266, + "step": 2348 + }, + { + "epoch": 0.29, + "grad_norm": 1.6391679556371148, + "learning_rate": 8.311561322286645e-06, + "loss": 0.5405, + "step": 2349 + }, + { + "epoch": 0.29, + "grad_norm": 1.4251390271552782, + "learning_rate": 8.310055317292656e-06, + "loss": 0.4482, + "step": 2350 + }, + { + "epoch": 0.29, + "grad_norm": 1.4518548043395432, + "learning_rate": 8.308548777530077e-06, + "loss": 0.5224, + "step": 2351 + }, + { + "epoch": 0.29, + "grad_norm": 1.529537909676569, + "learning_rate": 8.307041703242305e-06, + "loss": 0.4757, + "step": 2352 + }, + { + "epoch": 0.29, + "grad_norm": 1.4641076631316765, + "learning_rate": 8.305534094672818e-06, + "loss": 0.5606, + "step": 2353 + }, + { + "epoch": 0.29, + "grad_norm": 0.6459680091281955, + "learning_rate": 8.304025952065187e-06, + "loss": 0.4939, + "step": 2354 + }, + { + "epoch": 0.29, + "grad_norm": 1.5502872604777798, + "learning_rate": 8.302517275663063e-06, + "loss": 0.5186, + "step": 2355 + }, + { + "epoch": 0.29, + "grad_norm": 1.4767675987142823, + "learning_rate": 8.301008065710188e-06, + "loss": 0.5366, + "step": 2356 + }, + { + "epoch": 0.29, + "grad_norm": 1.4597094379451114, + "learning_rate": 8.299498322450388e-06, + "loss": 0.5207, + "step": 2357 + }, + { + "epoch": 0.29, + "grad_norm": 1.5145188084629189, + "learning_rate": 8.297988046127574e-06, + "loss": 0.468, + "step": 2358 + }, + { + "epoch": 0.29, + "grad_norm": 1.5879209005951993, + "learning_rate": 8.296477236985744e-06, + "loss": 0.468, + "step": 2359 + }, + { + "epoch": 0.29, + "grad_norm": 1.389298414107315, + "learning_rate": 8.294965895268985e-06, + "loss": 0.5107, + "step": 2360 + }, + { + "epoch": 0.29, + "grad_norm": 2.367116410018487, + "learning_rate": 8.293454021221466e-06, + "loss": 0.4894, + "step": 2361 + }, + { + "epoch": 0.29, + "grad_norm": 1.4671157811272946, + "learning_rate": 8.291941615087442e-06, + "loss": 0.5283, + "step": 2362 + }, + { + "epoch": 0.29, + "grad_norm": 1.500644862850445, + "learning_rate": 8.290428677111258e-06, + "loss": 0.4954, + "step": 2363 + }, + { + "epoch": 0.29, + "grad_norm": 1.4648161938444693, + "learning_rate": 8.288915207537343e-06, + "loss": 0.4996, + "step": 2364 + }, + { + "epoch": 0.29, + "grad_norm": 1.4305139064341361, + "learning_rate": 8.28740120661021e-06, + "loss": 0.4865, + "step": 2365 + }, + { + "epoch": 0.29, + "grad_norm": 1.5603416367884653, + "learning_rate": 8.285886674574459e-06, + "loss": 0.4875, + "step": 2366 + }, + { + "epoch": 0.29, + "grad_norm": 1.2213046850748692, + "learning_rate": 8.284371611674776e-06, + "loss": 0.4688, + "step": 2367 + }, + { + "epoch": 0.29, + "grad_norm": 1.212734451608669, + "learning_rate": 8.282856018155932e-06, + "loss": 0.4468, + "step": 2368 + }, + { + "epoch": 0.29, + "grad_norm": 1.4988856306143041, + "learning_rate": 8.281339894262786e-06, + "loss": 0.5517, + "step": 2369 + }, + { + "epoch": 0.29, + "grad_norm": 1.9316901312270092, + "learning_rate": 8.279823240240282e-06, + "loss": 0.4647, + "step": 2370 + }, + { + "epoch": 0.29, + "grad_norm": 0.6550193444792216, + "learning_rate": 8.278306056333445e-06, + "loss": 0.5398, + "step": 2371 + }, + { + "epoch": 0.29, + "grad_norm": 1.6404194777907943, + "learning_rate": 8.276788342787394e-06, + "loss": 0.5123, + "step": 2372 + }, + { + "epoch": 0.29, + "grad_norm": 1.315161270026293, + "learning_rate": 8.275270099847325e-06, + "loss": 0.5449, + "step": 2373 + }, + { + "epoch": 0.29, + "grad_norm": 1.517240516199405, + "learning_rate": 8.273751327758526e-06, + "loss": 0.5153, + "step": 2374 + }, + { + "epoch": 0.29, + "grad_norm": 2.2174133228862587, + "learning_rate": 8.272232026766368e-06, + "loss": 0.491, + "step": 2375 + }, + { + "epoch": 0.29, + "grad_norm": 1.4053448725891333, + "learning_rate": 8.270712197116306e-06, + "loss": 0.5205, + "step": 2376 + }, + { + "epoch": 0.29, + "grad_norm": 1.9740569155576038, + "learning_rate": 8.269191839053884e-06, + "loss": 0.4661, + "step": 2377 + }, + { + "epoch": 0.3, + "grad_norm": 2.672953358964489, + "learning_rate": 8.267670952824726e-06, + "loss": 0.5081, + "step": 2378 + }, + { + "epoch": 0.3, + "grad_norm": 2.077306245332074, + "learning_rate": 8.266149538674548e-06, + "loss": 0.4973, + "step": 2379 + }, + { + "epoch": 0.3, + "grad_norm": 1.409380624306681, + "learning_rate": 8.264627596849146e-06, + "loss": 0.5062, + "step": 2380 + }, + { + "epoch": 0.3, + "grad_norm": 1.8358787110603083, + "learning_rate": 8.263105127594405e-06, + "loss": 0.5327, + "step": 2381 + }, + { + "epoch": 0.3, + "grad_norm": 1.4579310672276335, + "learning_rate": 8.261582131156289e-06, + "loss": 0.4562, + "step": 2382 + }, + { + "epoch": 0.3, + "grad_norm": 1.4387234091141146, + "learning_rate": 8.260058607780857e-06, + "loss": 0.5368, + "step": 2383 + }, + { + "epoch": 0.3, + "grad_norm": 1.285121771570786, + "learning_rate": 8.258534557714242e-06, + "loss": 0.5482, + "step": 2384 + }, + { + "epoch": 0.3, + "grad_norm": 1.4535365064196426, + "learning_rate": 8.257009981202673e-06, + "loss": 0.539, + "step": 2385 + }, + { + "epoch": 0.3, + "grad_norm": 1.4290733161456042, + "learning_rate": 8.255484878492454e-06, + "loss": 0.5029, + "step": 2386 + }, + { + "epoch": 0.3, + "grad_norm": 1.3993424374411836, + "learning_rate": 8.253959249829983e-06, + "loss": 0.5006, + "step": 2387 + }, + { + "epoch": 0.3, + "grad_norm": 0.6216777802783738, + "learning_rate": 8.252433095461736e-06, + "loss": 0.5374, + "step": 2388 + }, + { + "epoch": 0.3, + "grad_norm": 1.8538060165988877, + "learning_rate": 8.250906415634279e-06, + "loss": 0.493, + "step": 2389 + }, + { + "epoch": 0.3, + "grad_norm": 1.4053489404747652, + "learning_rate": 8.249379210594258e-06, + "loss": 0.5127, + "step": 2390 + }, + { + "epoch": 0.3, + "grad_norm": 15.824381664090078, + "learning_rate": 8.247851480588407e-06, + "loss": 0.4867, + "step": 2391 + }, + { + "epoch": 0.3, + "grad_norm": 1.4129574000855118, + "learning_rate": 8.246323225863545e-06, + "loss": 0.5446, + "step": 2392 + }, + { + "epoch": 0.3, + "grad_norm": 1.4106399327103274, + "learning_rate": 8.244794446666575e-06, + "loss": 0.4639, + "step": 2393 + }, + { + "epoch": 0.3, + "grad_norm": 1.449898253618827, + "learning_rate": 8.243265143244485e-06, + "loss": 0.5184, + "step": 2394 + }, + { + "epoch": 0.3, + "grad_norm": 1.4255742042291377, + "learning_rate": 8.241735315844348e-06, + "loss": 0.5316, + "step": 2395 + }, + { + "epoch": 0.3, + "grad_norm": 2.495255075502373, + "learning_rate": 8.240204964713317e-06, + "loss": 0.4794, + "step": 2396 + }, + { + "epoch": 0.3, + "grad_norm": 1.19925092790902, + "learning_rate": 8.238674090098639e-06, + "loss": 0.4861, + "step": 2397 + }, + { + "epoch": 0.3, + "grad_norm": 1.2584502352522227, + "learning_rate": 8.237142692247637e-06, + "loss": 0.5453, + "step": 2398 + }, + { + "epoch": 0.3, + "grad_norm": 1.7706569851686855, + "learning_rate": 8.235610771407725e-06, + "loss": 0.5128, + "step": 2399 + }, + { + "epoch": 0.3, + "grad_norm": 1.393286411135955, + "learning_rate": 8.234078327826394e-06, + "loss": 0.5132, + "step": 2400 + }, + { + "epoch": 0.3, + "grad_norm": 1.5911983819549134, + "learning_rate": 8.232545361751227e-06, + "loss": 0.5423, + "step": 2401 + }, + { + "epoch": 0.3, + "grad_norm": 1.2768352893732755, + "learning_rate": 8.231011873429887e-06, + "loss": 0.5027, + "step": 2402 + }, + { + "epoch": 0.3, + "grad_norm": 2.6464670337009326, + "learning_rate": 8.22947786311012e-06, + "loss": 0.5297, + "step": 2403 + }, + { + "epoch": 0.3, + "grad_norm": 2.277057973772232, + "learning_rate": 8.227943331039765e-06, + "loss": 0.53, + "step": 2404 + }, + { + "epoch": 0.3, + "grad_norm": 1.5165937159294598, + "learning_rate": 8.226408277466735e-06, + "loss": 0.5029, + "step": 2405 + }, + { + "epoch": 0.3, + "grad_norm": 1.6307182302388195, + "learning_rate": 8.22487270263903e-06, + "loss": 0.5076, + "step": 2406 + }, + { + "epoch": 0.3, + "grad_norm": 1.4016704206015314, + "learning_rate": 8.22333660680474e-06, + "loss": 0.4607, + "step": 2407 + }, + { + "epoch": 0.3, + "grad_norm": 1.5733249006556254, + "learning_rate": 8.221799990212031e-06, + "loss": 0.5086, + "step": 2408 + }, + { + "epoch": 0.3, + "grad_norm": 1.428256429738577, + "learning_rate": 8.22026285310916e-06, + "loss": 0.515, + "step": 2409 + }, + { + "epoch": 0.3, + "grad_norm": 1.974071549092424, + "learning_rate": 8.218725195744464e-06, + "loss": 0.5051, + "step": 2410 + }, + { + "epoch": 0.3, + "grad_norm": 1.3210469873226283, + "learning_rate": 8.217187018366364e-06, + "loss": 0.5219, + "step": 2411 + }, + { + "epoch": 0.3, + "grad_norm": 1.7354476740515565, + "learning_rate": 8.215648321223363e-06, + "loss": 0.5185, + "step": 2412 + }, + { + "epoch": 0.3, + "grad_norm": 1.3385225264897804, + "learning_rate": 8.21410910456406e-06, + "loss": 0.5344, + "step": 2413 + }, + { + "epoch": 0.3, + "grad_norm": 1.5655282839619544, + "learning_rate": 8.212569368637123e-06, + "loss": 0.5764, + "step": 2414 + }, + { + "epoch": 0.3, + "grad_norm": 1.838377707828001, + "learning_rate": 8.21102911369131e-06, + "loss": 0.5015, + "step": 2415 + }, + { + "epoch": 0.3, + "grad_norm": 1.3334971659877586, + "learning_rate": 8.209488339975461e-06, + "loss": 0.5422, + "step": 2416 + }, + { + "epoch": 0.3, + "grad_norm": 1.601034277173312, + "learning_rate": 8.207947047738508e-06, + "loss": 0.5079, + "step": 2417 + }, + { + "epoch": 0.3, + "grad_norm": 1.732019949409373, + "learning_rate": 8.206405237229453e-06, + "loss": 0.5537, + "step": 2418 + }, + { + "epoch": 0.3, + "grad_norm": 1.600774736304845, + "learning_rate": 8.204862908697396e-06, + "loss": 0.5462, + "step": 2419 + }, + { + "epoch": 0.3, + "grad_norm": 1.2915947660919527, + "learning_rate": 8.203320062391506e-06, + "loss": 0.4483, + "step": 2420 + }, + { + "epoch": 0.3, + "grad_norm": 1.482367313516658, + "learning_rate": 8.201776698561049e-06, + "loss": 0.5089, + "step": 2421 + }, + { + "epoch": 0.3, + "grad_norm": 1.379824868329647, + "learning_rate": 8.200232817455369e-06, + "loss": 0.522, + "step": 2422 + }, + { + "epoch": 0.3, + "grad_norm": 1.6160975125022232, + "learning_rate": 8.198688419323893e-06, + "loss": 0.5134, + "step": 2423 + }, + { + "epoch": 0.3, + "grad_norm": 1.4497050493829207, + "learning_rate": 8.197143504416127e-06, + "loss": 0.5213, + "step": 2424 + }, + { + "epoch": 0.3, + "grad_norm": 1.370280107513277, + "learning_rate": 8.195598072981674e-06, + "loss": 0.4872, + "step": 2425 + }, + { + "epoch": 0.3, + "grad_norm": 1.5170579867779488, + "learning_rate": 8.194052125270207e-06, + "loss": 0.5569, + "step": 2426 + }, + { + "epoch": 0.3, + "grad_norm": 2.0868880854014726, + "learning_rate": 8.192505661531489e-06, + "loss": 0.5531, + "step": 2427 + }, + { + "epoch": 0.3, + "grad_norm": 1.596934387560802, + "learning_rate": 8.190958682015362e-06, + "loss": 0.4936, + "step": 2428 + }, + { + "epoch": 0.3, + "grad_norm": 1.6234727453741495, + "learning_rate": 8.189411186971759e-06, + "loss": 0.4954, + "step": 2429 + }, + { + "epoch": 0.3, + "grad_norm": 1.3541932266704004, + "learning_rate": 8.187863176650688e-06, + "loss": 0.5169, + "step": 2430 + }, + { + "epoch": 0.3, + "grad_norm": 1.7951463915061205, + "learning_rate": 8.186314651302242e-06, + "loss": 0.4556, + "step": 2431 + }, + { + "epoch": 0.3, + "grad_norm": 1.3940159809171142, + "learning_rate": 8.184765611176605e-06, + "loss": 0.5454, + "step": 2432 + }, + { + "epoch": 0.3, + "grad_norm": 1.3019380098469693, + "learning_rate": 8.183216056524035e-06, + "loss": 0.522, + "step": 2433 + }, + { + "epoch": 0.3, + "grad_norm": 1.233015420492335, + "learning_rate": 8.181665987594874e-06, + "loss": 0.478, + "step": 2434 + }, + { + "epoch": 0.3, + "grad_norm": 1.347521272975239, + "learning_rate": 8.18011540463955e-06, + "loss": 0.5021, + "step": 2435 + }, + { + "epoch": 0.3, + "grad_norm": 1.4394524554290282, + "learning_rate": 8.178564307908577e-06, + "loss": 0.4458, + "step": 2436 + }, + { + "epoch": 0.3, + "grad_norm": 0.705192311131923, + "learning_rate": 8.177012697652544e-06, + "loss": 0.4793, + "step": 2437 + }, + { + "epoch": 0.3, + "grad_norm": 1.5623035789488708, + "learning_rate": 8.17546057412213e-06, + "loss": 0.5614, + "step": 2438 + }, + { + "epoch": 0.3, + "grad_norm": 1.3216329542885294, + "learning_rate": 8.173907937568093e-06, + "loss": 0.4732, + "step": 2439 + }, + { + "epoch": 0.3, + "grad_norm": 1.4405423395533385, + "learning_rate": 8.172354788241277e-06, + "loss": 0.5659, + "step": 2440 + }, + { + "epoch": 0.3, + "grad_norm": 1.4266787372028973, + "learning_rate": 8.170801126392602e-06, + "loss": 0.533, + "step": 2441 + }, + { + "epoch": 0.3, + "grad_norm": 4.544729767161987, + "learning_rate": 8.169246952273081e-06, + "loss": 0.4868, + "step": 2442 + }, + { + "epoch": 0.3, + "grad_norm": 0.7137674103325354, + "learning_rate": 8.167692266133804e-06, + "loss": 0.4834, + "step": 2443 + }, + { + "epoch": 0.3, + "grad_norm": 1.5556203466575353, + "learning_rate": 8.166137068225942e-06, + "loss": 0.5492, + "step": 2444 + }, + { + "epoch": 0.3, + "grad_norm": 1.7711514179158319, + "learning_rate": 8.164581358800749e-06, + "loss": 0.4767, + "step": 2445 + }, + { + "epoch": 0.3, + "grad_norm": 1.5301362267020098, + "learning_rate": 8.16302513810957e-06, + "loss": 0.5439, + "step": 2446 + }, + { + "epoch": 0.3, + "grad_norm": 1.6901022197817894, + "learning_rate": 8.16146840640382e-06, + "loss": 0.5311, + "step": 2447 + }, + { + "epoch": 0.3, + "grad_norm": 1.4387929530835506, + "learning_rate": 8.159911163935007e-06, + "loss": 0.5052, + "step": 2448 + }, + { + "epoch": 0.3, + "grad_norm": 1.4881162518562419, + "learning_rate": 8.158353410954715e-06, + "loss": 0.4841, + "step": 2449 + }, + { + "epoch": 0.3, + "grad_norm": 2.3232025249317436, + "learning_rate": 8.156795147714612e-06, + "loss": 0.5394, + "step": 2450 + }, + { + "epoch": 0.3, + "grad_norm": 2.1502311983887634, + "learning_rate": 8.155236374466452e-06, + "loss": 0.5267, + "step": 2451 + }, + { + "epoch": 0.3, + "grad_norm": 1.4184547623606276, + "learning_rate": 8.153677091462067e-06, + "loss": 0.4806, + "step": 2452 + }, + { + "epoch": 0.3, + "grad_norm": 1.3045598237461515, + "learning_rate": 8.15211729895337e-06, + "loss": 0.5313, + "step": 2453 + }, + { + "epoch": 0.3, + "grad_norm": 1.4262062024035553, + "learning_rate": 8.150556997192366e-06, + "loss": 0.5778, + "step": 2454 + }, + { + "epoch": 0.3, + "grad_norm": 1.9812787992571088, + "learning_rate": 8.148996186431129e-06, + "loss": 0.5107, + "step": 2455 + }, + { + "epoch": 0.3, + "grad_norm": 1.49217716763415, + "learning_rate": 8.147434866921824e-06, + "loss": 0.5229, + "step": 2456 + }, + { + "epoch": 0.3, + "grad_norm": 1.5400517482926255, + "learning_rate": 8.145873038916696e-06, + "loss": 0.5363, + "step": 2457 + }, + { + "epoch": 0.31, + "grad_norm": 1.7288336436232086, + "learning_rate": 8.144310702668072e-06, + "loss": 0.4677, + "step": 2458 + }, + { + "epoch": 0.31, + "grad_norm": 1.397832321796737, + "learning_rate": 8.142747858428364e-06, + "loss": 0.4786, + "step": 2459 + }, + { + "epoch": 0.31, + "grad_norm": 1.6129287143413962, + "learning_rate": 8.141184506450058e-06, + "loss": 0.5479, + "step": 2460 + }, + { + "epoch": 0.31, + "grad_norm": 1.4428208255741053, + "learning_rate": 8.13962064698573e-06, + "loss": 0.5064, + "step": 2461 + }, + { + "epoch": 0.31, + "grad_norm": 1.6383815783802438, + "learning_rate": 8.138056280288036e-06, + "loss": 0.5306, + "step": 2462 + }, + { + "epoch": 0.31, + "grad_norm": 1.7647730587391064, + "learning_rate": 8.136491406609712e-06, + "loss": 0.4727, + "step": 2463 + }, + { + "epoch": 0.31, + "grad_norm": 1.424228033452532, + "learning_rate": 8.134926026203578e-06, + "loss": 0.5626, + "step": 2464 + }, + { + "epoch": 0.31, + "grad_norm": 2.176560989777953, + "learning_rate": 8.133360139322533e-06, + "loss": 0.4763, + "step": 2465 + }, + { + "epoch": 0.31, + "grad_norm": 1.4747933472510153, + "learning_rate": 8.131793746219563e-06, + "loss": 0.5606, + "step": 2466 + }, + { + "epoch": 0.31, + "grad_norm": 1.5579184573495903, + "learning_rate": 8.13022684714773e-06, + "loss": 0.5483, + "step": 2467 + }, + { + "epoch": 0.31, + "grad_norm": 1.3212477231943376, + "learning_rate": 8.128659442360182e-06, + "loss": 0.5219, + "step": 2468 + }, + { + "epoch": 0.31, + "grad_norm": 1.4716718393118688, + "learning_rate": 8.127091532110147e-06, + "loss": 0.5485, + "step": 2469 + }, + { + "epoch": 0.31, + "grad_norm": 2.1543171145452837, + "learning_rate": 8.125523116650933e-06, + "loss": 0.5366, + "step": 2470 + }, + { + "epoch": 0.31, + "grad_norm": 1.9532575568968857, + "learning_rate": 8.123954196235932e-06, + "loss": 0.4726, + "step": 2471 + }, + { + "epoch": 0.31, + "grad_norm": 1.3300282366699865, + "learning_rate": 8.122384771118619e-06, + "loss": 0.45, + "step": 2472 + }, + { + "epoch": 0.31, + "grad_norm": 1.6658818267429538, + "learning_rate": 8.120814841552544e-06, + "loss": 0.5226, + "step": 2473 + }, + { + "epoch": 0.31, + "grad_norm": 2.015297622544853, + "learning_rate": 8.119244407791346e-06, + "loss": 0.5539, + "step": 2474 + }, + { + "epoch": 0.31, + "grad_norm": 1.9033582376604703, + "learning_rate": 8.117673470088745e-06, + "loss": 0.5468, + "step": 2475 + }, + { + "epoch": 0.31, + "grad_norm": 1.8411435527307176, + "learning_rate": 8.116102028698536e-06, + "loss": 0.5181, + "step": 2476 + }, + { + "epoch": 0.31, + "grad_norm": 1.4750661354518269, + "learning_rate": 8.114530083874599e-06, + "loss": 0.5282, + "step": 2477 + }, + { + "epoch": 0.31, + "grad_norm": 1.5342175409546561, + "learning_rate": 8.112957635870895e-06, + "loss": 0.5107, + "step": 2478 + }, + { + "epoch": 0.31, + "grad_norm": 1.4270612045971265, + "learning_rate": 8.111384684941471e-06, + "loss": 0.4622, + "step": 2479 + }, + { + "epoch": 0.31, + "grad_norm": 1.731355664199108, + "learning_rate": 8.109811231340448e-06, + "loss": 0.5026, + "step": 2480 + }, + { + "epoch": 0.31, + "grad_norm": 2.4987372239820895, + "learning_rate": 8.108237275322031e-06, + "loss": 0.5075, + "step": 2481 + }, + { + "epoch": 0.31, + "grad_norm": 2.195280425107586, + "learning_rate": 8.106662817140508e-06, + "loss": 0.5285, + "step": 2482 + }, + { + "epoch": 0.31, + "grad_norm": 8.236829502099297, + "learning_rate": 8.105087857050246e-06, + "loss": 0.4448, + "step": 2483 + }, + { + "epoch": 0.31, + "grad_norm": 1.552047281830834, + "learning_rate": 8.103512395305693e-06, + "loss": 0.5142, + "step": 2484 + }, + { + "epoch": 0.31, + "grad_norm": 1.6745640264545936, + "learning_rate": 8.10193643216138e-06, + "loss": 0.505, + "step": 2485 + }, + { + "epoch": 0.31, + "grad_norm": 1.3051593774586854, + "learning_rate": 8.100359967871915e-06, + "loss": 0.542, + "step": 2486 + }, + { + "epoch": 0.31, + "grad_norm": 1.6495030342810197, + "learning_rate": 8.098783002691994e-06, + "loss": 0.5699, + "step": 2487 + }, + { + "epoch": 0.31, + "grad_norm": 2.166135329822737, + "learning_rate": 8.097205536876387e-06, + "loss": 0.5654, + "step": 2488 + }, + { + "epoch": 0.31, + "grad_norm": 1.7492440243837697, + "learning_rate": 8.095627570679947e-06, + "loss": 0.5052, + "step": 2489 + }, + { + "epoch": 0.31, + "grad_norm": 1.550936281764954, + "learning_rate": 8.094049104357608e-06, + "loss": 0.553, + "step": 2490 + }, + { + "epoch": 0.31, + "grad_norm": 1.4772954354123726, + "learning_rate": 8.092470138164388e-06, + "loss": 0.5395, + "step": 2491 + }, + { + "epoch": 0.31, + "grad_norm": 2.0886376026135105, + "learning_rate": 8.09089067235538e-06, + "loss": 0.5575, + "step": 2492 + }, + { + "epoch": 0.31, + "grad_norm": 1.977419426615732, + "learning_rate": 8.089310707185763e-06, + "loss": 0.508, + "step": 2493 + }, + { + "epoch": 0.31, + "grad_norm": 1.8598515635774202, + "learning_rate": 8.087730242910792e-06, + "loss": 0.4959, + "step": 2494 + }, + { + "epoch": 0.31, + "grad_norm": 1.6130983505295127, + "learning_rate": 8.086149279785807e-06, + "loss": 0.5438, + "step": 2495 + }, + { + "epoch": 0.31, + "grad_norm": 1.6412643748883935, + "learning_rate": 8.084567818066225e-06, + "loss": 0.524, + "step": 2496 + }, + { + "epoch": 0.31, + "grad_norm": 2.796679205539921, + "learning_rate": 8.082985858007544e-06, + "loss": 0.512, + "step": 2497 + }, + { + "epoch": 0.31, + "grad_norm": 1.493724261833098, + "learning_rate": 8.081403399865347e-06, + "loss": 0.526, + "step": 2498 + }, + { + "epoch": 0.31, + "grad_norm": 1.2693373006893207, + "learning_rate": 8.079820443895292e-06, + "loss": 0.5074, + "step": 2499 + }, + { + "epoch": 0.31, + "grad_norm": 8.523194363659194, + "learning_rate": 8.07823699035312e-06, + "loss": 0.4962, + "step": 2500 + }, + { + "epoch": 0.31, + "grad_norm": 1.4165837605180032, + "learning_rate": 8.076653039494649e-06, + "loss": 0.4806, + "step": 2501 + }, + { + "epoch": 0.31, + "grad_norm": 3.5154178932893485, + "learning_rate": 8.075068591575783e-06, + "loss": 0.4951, + "step": 2502 + }, + { + "epoch": 0.31, + "grad_norm": 1.4374752979267316, + "learning_rate": 8.073483646852507e-06, + "loss": 0.4523, + "step": 2503 + }, + { + "epoch": 0.31, + "grad_norm": 1.4793590132895706, + "learning_rate": 8.071898205580877e-06, + "loss": 0.5124, + "step": 2504 + }, + { + "epoch": 0.31, + "grad_norm": 2.1797717404084764, + "learning_rate": 8.070312268017036e-06, + "loss": 0.5688, + "step": 2505 + }, + { + "epoch": 0.31, + "grad_norm": 1.3568393334677344, + "learning_rate": 8.068725834417208e-06, + "loss": 0.5011, + "step": 2506 + }, + { + "epoch": 0.31, + "grad_norm": 1.3726472919474497, + "learning_rate": 8.067138905037694e-06, + "loss": 0.549, + "step": 2507 + }, + { + "epoch": 0.31, + "grad_norm": 1.4125513596310788, + "learning_rate": 8.065551480134879e-06, + "loss": 0.4578, + "step": 2508 + }, + { + "epoch": 0.31, + "grad_norm": 1.3339617497863514, + "learning_rate": 8.063963559965221e-06, + "loss": 0.4802, + "step": 2509 + }, + { + "epoch": 0.31, + "grad_norm": 1.5081473397590144, + "learning_rate": 8.062375144785265e-06, + "loss": 0.4897, + "step": 2510 + }, + { + "epoch": 0.31, + "grad_norm": 13.303668471143709, + "learning_rate": 8.060786234851634e-06, + "loss": 0.4694, + "step": 2511 + }, + { + "epoch": 0.31, + "grad_norm": 1.573520854460718, + "learning_rate": 8.059196830421032e-06, + "loss": 0.5268, + "step": 2512 + }, + { + "epoch": 0.31, + "grad_norm": 1.3482665581028472, + "learning_rate": 8.057606931750235e-06, + "loss": 0.4916, + "step": 2513 + }, + { + "epoch": 0.31, + "grad_norm": 1.8330345165028457, + "learning_rate": 8.056016539096112e-06, + "loss": 0.4659, + "step": 2514 + }, + { + "epoch": 0.31, + "grad_norm": 1.509864150941314, + "learning_rate": 8.0544256527156e-06, + "loss": 0.5633, + "step": 2515 + }, + { + "epoch": 0.31, + "grad_norm": 1.4224595451530193, + "learning_rate": 8.052834272865724e-06, + "loss": 0.5587, + "step": 2516 + }, + { + "epoch": 0.31, + "grad_norm": 1.9882292371596657, + "learning_rate": 8.051242399803586e-06, + "loss": 0.5115, + "step": 2517 + }, + { + "epoch": 0.31, + "grad_norm": 1.443219180257739, + "learning_rate": 8.049650033786364e-06, + "loss": 0.506, + "step": 2518 + }, + { + "epoch": 0.31, + "grad_norm": 1.4703440636482108, + "learning_rate": 8.04805717507132e-06, + "loss": 0.5088, + "step": 2519 + }, + { + "epoch": 0.31, + "grad_norm": 1.938646696914128, + "learning_rate": 8.046463823915794e-06, + "loss": 0.4723, + "step": 2520 + }, + { + "epoch": 0.31, + "grad_norm": 1.8214933656484336, + "learning_rate": 8.044869980577205e-06, + "loss": 0.4937, + "step": 2521 + }, + { + "epoch": 0.31, + "grad_norm": 2.5537002581659554, + "learning_rate": 8.043275645313058e-06, + "loss": 0.5428, + "step": 2522 + }, + { + "epoch": 0.31, + "grad_norm": 1.8339999808752838, + "learning_rate": 8.041680818380924e-06, + "loss": 0.5098, + "step": 2523 + }, + { + "epoch": 0.31, + "grad_norm": 1.3599642946469352, + "learning_rate": 8.040085500038465e-06, + "loss": 0.5299, + "step": 2524 + }, + { + "epoch": 0.31, + "grad_norm": 1.2462733428982753, + "learning_rate": 8.038489690543421e-06, + "loss": 0.4943, + "step": 2525 + }, + { + "epoch": 0.31, + "grad_norm": 2.4496657853471375, + "learning_rate": 8.036893390153606e-06, + "loss": 0.4942, + "step": 2526 + }, + { + "epoch": 0.31, + "grad_norm": 1.6281553227437193, + "learning_rate": 8.035296599126917e-06, + "loss": 0.5068, + "step": 2527 + }, + { + "epoch": 0.31, + "grad_norm": 1.3263402475379906, + "learning_rate": 8.033699317721331e-06, + "loss": 0.4942, + "step": 2528 + }, + { + "epoch": 0.31, + "grad_norm": 2.5396019747741265, + "learning_rate": 8.0321015461949e-06, + "loss": 0.4934, + "step": 2529 + }, + { + "epoch": 0.31, + "grad_norm": 2.2515443414601215, + "learning_rate": 8.030503284805762e-06, + "loss": 0.5595, + "step": 2530 + }, + { + "epoch": 0.31, + "grad_norm": 0.7028155212480076, + "learning_rate": 8.028904533812125e-06, + "loss": 0.5103, + "step": 2531 + }, + { + "epoch": 0.31, + "grad_norm": 1.3470718084717475, + "learning_rate": 8.027305293472287e-06, + "loss": 0.5419, + "step": 2532 + }, + { + "epoch": 0.31, + "grad_norm": 1.3517717148809503, + "learning_rate": 8.025705564044615e-06, + "loss": 0.542, + "step": 2533 + }, + { + "epoch": 0.31, + "grad_norm": 1.6400748484848002, + "learning_rate": 8.024105345787562e-06, + "loss": 0.5487, + "step": 2534 + }, + { + "epoch": 0.31, + "grad_norm": 1.4080101276322876, + "learning_rate": 8.022504638959657e-06, + "loss": 0.5375, + "step": 2535 + }, + { + "epoch": 0.31, + "grad_norm": 1.4608135823018693, + "learning_rate": 8.020903443819507e-06, + "loss": 0.5419, + "step": 2536 + }, + { + "epoch": 0.31, + "grad_norm": 1.2070914576507898, + "learning_rate": 8.0193017606258e-06, + "loss": 0.5425, + "step": 2537 + }, + { + "epoch": 0.31, + "grad_norm": 1.2988911897831374, + "learning_rate": 8.017699589637302e-06, + "loss": 0.4764, + "step": 2538 + }, + { + "epoch": 0.32, + "grad_norm": 1.3755261690769904, + "learning_rate": 8.016096931112858e-06, + "loss": 0.5326, + "step": 2539 + }, + { + "epoch": 0.32, + "grad_norm": 1.7204618851485292, + "learning_rate": 8.014493785311391e-06, + "loss": 0.4886, + "step": 2540 + }, + { + "epoch": 0.32, + "grad_norm": 1.3003456270794864, + "learning_rate": 8.012890152491904e-06, + "loss": 0.4928, + "step": 2541 + }, + { + "epoch": 0.32, + "grad_norm": 1.3400916153759963, + "learning_rate": 8.011286032913478e-06, + "loss": 0.5215, + "step": 2542 + }, + { + "epoch": 0.32, + "grad_norm": 1.4896133467098513, + "learning_rate": 8.009681426835273e-06, + "loss": 0.5422, + "step": 2543 + }, + { + "epoch": 0.32, + "grad_norm": 1.8446002305350264, + "learning_rate": 8.008076334516523e-06, + "loss": 0.4814, + "step": 2544 + }, + { + "epoch": 0.32, + "grad_norm": 1.4655503832221062, + "learning_rate": 8.006470756216551e-06, + "loss": 0.5761, + "step": 2545 + }, + { + "epoch": 0.32, + "grad_norm": 1.332714497007832, + "learning_rate": 8.00486469219475e-06, + "loss": 0.4785, + "step": 2546 + }, + { + "epoch": 0.32, + "grad_norm": 1.772899784431288, + "learning_rate": 8.003258142710593e-06, + "loss": 0.5127, + "step": 2547 + }, + { + "epoch": 0.32, + "grad_norm": 2.18496507520994, + "learning_rate": 8.001651108023632e-06, + "loss": 0.5681, + "step": 2548 + }, + { + "epoch": 0.32, + "grad_norm": 1.5808393491809347, + "learning_rate": 8.0000435883935e-06, + "loss": 0.4923, + "step": 2549 + }, + { + "epoch": 0.32, + "grad_norm": 1.9652897069753408, + "learning_rate": 7.998435584079904e-06, + "loss": 0.5546, + "step": 2550 + }, + { + "epoch": 0.32, + "grad_norm": 1.4685113192092831, + "learning_rate": 7.99682709534263e-06, + "loss": 0.5046, + "step": 2551 + }, + { + "epoch": 0.32, + "grad_norm": 1.3755259188398998, + "learning_rate": 7.995218122441545e-06, + "loss": 0.5402, + "step": 2552 + }, + { + "epoch": 0.32, + "grad_norm": 1.6402565725216716, + "learning_rate": 7.993608665636594e-06, + "loss": 0.4868, + "step": 2553 + }, + { + "epoch": 0.32, + "grad_norm": 1.340748457030908, + "learning_rate": 7.991998725187797e-06, + "loss": 0.4743, + "step": 2554 + }, + { + "epoch": 0.32, + "grad_norm": 1.509932753295099, + "learning_rate": 7.990388301355257e-06, + "loss": 0.5071, + "step": 2555 + }, + { + "epoch": 0.32, + "grad_norm": 1.4331341944162324, + "learning_rate": 7.988777394399146e-06, + "loss": 0.5645, + "step": 2556 + }, + { + "epoch": 0.32, + "grad_norm": 1.7525247336155012, + "learning_rate": 7.987166004579727e-06, + "loss": 0.4617, + "step": 2557 + }, + { + "epoch": 0.32, + "grad_norm": 1.4084454111962381, + "learning_rate": 7.98555413215733e-06, + "loss": 0.5474, + "step": 2558 + }, + { + "epoch": 0.32, + "grad_norm": 1.303093075197306, + "learning_rate": 7.98394177739237e-06, + "loss": 0.4948, + "step": 2559 + }, + { + "epoch": 0.32, + "grad_norm": 1.2288203173720957, + "learning_rate": 7.982328940545334e-06, + "loss": 0.4921, + "step": 2560 + }, + { + "epoch": 0.32, + "grad_norm": 1.2101474981944487, + "learning_rate": 7.980715621876793e-06, + "loss": 0.4831, + "step": 2561 + }, + { + "epoch": 0.32, + "grad_norm": 1.2970614349320075, + "learning_rate": 7.97910182164739e-06, + "loss": 0.495, + "step": 2562 + }, + { + "epoch": 0.32, + "grad_norm": 0.6276571869904369, + "learning_rate": 7.977487540117852e-06, + "loss": 0.4888, + "step": 2563 + }, + { + "epoch": 0.32, + "grad_norm": 1.31307825281209, + "learning_rate": 7.975872777548977e-06, + "loss": 0.5072, + "step": 2564 + }, + { + "epoch": 0.32, + "grad_norm": 1.8758618093049964, + "learning_rate": 7.974257534201647e-06, + "loss": 0.5052, + "step": 2565 + }, + { + "epoch": 0.32, + "grad_norm": 1.4751921344767631, + "learning_rate": 7.972641810336816e-06, + "loss": 0.5237, + "step": 2566 + }, + { + "epoch": 0.32, + "grad_norm": 5.354956242965793, + "learning_rate": 7.971025606215521e-06, + "loss": 0.4831, + "step": 2567 + }, + { + "epoch": 0.32, + "grad_norm": 1.8797787633285996, + "learning_rate": 7.969408922098871e-06, + "loss": 0.4967, + "step": 2568 + }, + { + "epoch": 0.32, + "grad_norm": 1.9374064384200749, + "learning_rate": 7.96779175824806e-06, + "loss": 0.5014, + "step": 2569 + }, + { + "epoch": 0.32, + "grad_norm": 1.572450726443971, + "learning_rate": 7.966174114924352e-06, + "loss": 0.5579, + "step": 2570 + }, + { + "epoch": 0.32, + "grad_norm": 1.424117716618892, + "learning_rate": 7.964555992389092e-06, + "loss": 0.5364, + "step": 2571 + }, + { + "epoch": 0.32, + "grad_norm": 1.4695100357496795, + "learning_rate": 7.9629373909037e-06, + "loss": 0.5565, + "step": 2572 + }, + { + "epoch": 0.32, + "grad_norm": 1.5546732225028876, + "learning_rate": 7.961318310729678e-06, + "loss": 0.5246, + "step": 2573 + }, + { + "epoch": 0.32, + "grad_norm": 1.4686814158564432, + "learning_rate": 7.959698752128602e-06, + "loss": 0.5406, + "step": 2574 + }, + { + "epoch": 0.32, + "grad_norm": 1.5002993149698707, + "learning_rate": 7.958078715362127e-06, + "loss": 0.5398, + "step": 2575 + }, + { + "epoch": 0.32, + "grad_norm": 1.3502921398391599, + "learning_rate": 7.956458200691981e-06, + "loss": 0.529, + "step": 2576 + }, + { + "epoch": 0.32, + "grad_norm": 2.138050239994702, + "learning_rate": 7.954837208379978e-06, + "loss": 0.5314, + "step": 2577 + }, + { + "epoch": 0.32, + "grad_norm": 1.2974781099214527, + "learning_rate": 7.953215738687997e-06, + "loss": 0.4856, + "step": 2578 + }, + { + "epoch": 0.32, + "grad_norm": 1.4912136233611968, + "learning_rate": 7.951593791878005e-06, + "loss": 0.5309, + "step": 2579 + }, + { + "epoch": 0.32, + "grad_norm": 1.519571706645829, + "learning_rate": 7.94997136821204e-06, + "loss": 0.5126, + "step": 2580 + }, + { + "epoch": 0.32, + "grad_norm": 1.5345595769482312, + "learning_rate": 7.948348467952221e-06, + "loss": 0.4793, + "step": 2581 + }, + { + "epoch": 0.32, + "grad_norm": 1.5034221535478678, + "learning_rate": 7.946725091360738e-06, + "loss": 0.5109, + "step": 2582 + }, + { + "epoch": 0.32, + "grad_norm": 0.6353221949559826, + "learning_rate": 7.945101238699865e-06, + "loss": 0.5091, + "step": 2583 + }, + { + "epoch": 0.32, + "grad_norm": 1.6441115764867356, + "learning_rate": 7.943476910231948e-06, + "loss": 0.5287, + "step": 2584 + }, + { + "epoch": 0.32, + "grad_norm": 1.548879078222247, + "learning_rate": 7.941852106219414e-06, + "loss": 0.4787, + "step": 2585 + }, + { + "epoch": 0.32, + "grad_norm": 1.5134010020200608, + "learning_rate": 7.940226826924761e-06, + "loss": 0.464, + "step": 2586 + }, + { + "epoch": 0.32, + "grad_norm": 1.3246410761168317, + "learning_rate": 7.938601072610573e-06, + "loss": 0.5143, + "step": 2587 + }, + { + "epoch": 0.32, + "grad_norm": 0.6072185208166537, + "learning_rate": 7.936974843539496e-06, + "loss": 0.4914, + "step": 2588 + }, + { + "epoch": 0.32, + "grad_norm": 1.5873833753998925, + "learning_rate": 7.935348139974268e-06, + "loss": 0.487, + "step": 2589 + }, + { + "epoch": 0.32, + "grad_norm": 1.3803477410016485, + "learning_rate": 7.933720962177696e-06, + "loss": 0.5445, + "step": 2590 + }, + { + "epoch": 0.32, + "grad_norm": 1.479728555323523, + "learning_rate": 7.932093310412665e-06, + "loss": 0.5432, + "step": 2591 + }, + { + "epoch": 0.32, + "grad_norm": 1.461786336220308, + "learning_rate": 7.930465184942135e-06, + "loss": 0.4729, + "step": 2592 + }, + { + "epoch": 0.32, + "grad_norm": 1.372907725327091, + "learning_rate": 7.928836586029146e-06, + "loss": 0.4982, + "step": 2593 + }, + { + "epoch": 0.32, + "grad_norm": 1.3760889483532308, + "learning_rate": 7.927207513936812e-06, + "loss": 0.509, + "step": 2594 + }, + { + "epoch": 0.32, + "grad_norm": 1.3209344770538574, + "learning_rate": 7.925577968928323e-06, + "loss": 0.5233, + "step": 2595 + }, + { + "epoch": 0.32, + "grad_norm": 2.263176280923, + "learning_rate": 7.923947951266947e-06, + "loss": 0.5248, + "step": 2596 + }, + { + "epoch": 0.32, + "grad_norm": 2.869151617512003, + "learning_rate": 7.922317461216027e-06, + "loss": 0.556, + "step": 2597 + }, + { + "epoch": 0.32, + "grad_norm": 1.2799391376527303, + "learning_rate": 7.920686499038985e-06, + "loss": 0.4808, + "step": 2598 + }, + { + "epoch": 0.32, + "grad_norm": 1.3579113165318353, + "learning_rate": 7.919055064999315e-06, + "loss": 0.4971, + "step": 2599 + }, + { + "epoch": 0.32, + "grad_norm": 2.6106478321846343, + "learning_rate": 7.917423159360592e-06, + "loss": 0.5491, + "step": 2600 + }, + { + "epoch": 0.32, + "grad_norm": 5.060188368021533, + "learning_rate": 7.915790782386462e-06, + "loss": 0.4894, + "step": 2601 + }, + { + "epoch": 0.32, + "grad_norm": 1.4593670616968968, + "learning_rate": 7.91415793434065e-06, + "loss": 0.5285, + "step": 2602 + }, + { + "epoch": 0.32, + "grad_norm": 1.8946627434610617, + "learning_rate": 7.91252461548696e-06, + "loss": 0.5221, + "step": 2603 + }, + { + "epoch": 0.32, + "grad_norm": 1.486158593598876, + "learning_rate": 7.910890826089267e-06, + "loss": 0.5064, + "step": 2604 + }, + { + "epoch": 0.32, + "grad_norm": 1.5781546113023983, + "learning_rate": 7.909256566411522e-06, + "loss": 0.5123, + "step": 2605 + }, + { + "epoch": 0.32, + "grad_norm": 1.2732298072495625, + "learning_rate": 7.907621836717757e-06, + "loss": 0.4986, + "step": 2606 + }, + { + "epoch": 0.32, + "grad_norm": 1.469675681281866, + "learning_rate": 7.905986637272079e-06, + "loss": 0.5008, + "step": 2607 + }, + { + "epoch": 0.32, + "grad_norm": 1.6806249896449872, + "learning_rate": 7.904350968338663e-06, + "loss": 0.5207, + "step": 2608 + }, + { + "epoch": 0.32, + "grad_norm": 1.3999951993913762, + "learning_rate": 7.90271483018177e-06, + "loss": 0.5482, + "step": 2609 + }, + { + "epoch": 0.32, + "grad_norm": 2.3996070047305675, + "learning_rate": 7.901078223065731e-06, + "loss": 0.528, + "step": 2610 + }, + { + "epoch": 0.32, + "grad_norm": 1.3957940027695925, + "learning_rate": 7.899441147254956e-06, + "loss": 0.5362, + "step": 2611 + }, + { + "epoch": 0.32, + "grad_norm": 1.4092902102297067, + "learning_rate": 7.897803603013927e-06, + "loss": 0.5231, + "step": 2612 + }, + { + "epoch": 0.32, + "grad_norm": 1.313813586992601, + "learning_rate": 7.896165590607204e-06, + "loss": 0.4838, + "step": 2613 + }, + { + "epoch": 0.32, + "grad_norm": 1.829202367501445, + "learning_rate": 7.894527110299422e-06, + "loss": 0.5226, + "step": 2614 + }, + { + "epoch": 0.32, + "grad_norm": 1.4582804197443877, + "learning_rate": 7.892888162355293e-06, + "loss": 0.5125, + "step": 2615 + }, + { + "epoch": 0.32, + "grad_norm": 1.4644426641480262, + "learning_rate": 7.891248747039605e-06, + "loss": 0.5256, + "step": 2616 + }, + { + "epoch": 0.32, + "grad_norm": 1.5832117310103972, + "learning_rate": 7.889608864617216e-06, + "loss": 0.569, + "step": 2617 + }, + { + "epoch": 0.32, + "grad_norm": 1.349664804555716, + "learning_rate": 7.887968515353065e-06, + "loss": 0.508, + "step": 2618 + }, + { + "epoch": 0.33, + "grad_norm": 1.5097419964276189, + "learning_rate": 7.886327699512166e-06, + "loss": 0.5373, + "step": 2619 + }, + { + "epoch": 0.33, + "grad_norm": 1.314238734393822, + "learning_rate": 7.884686417359609e-06, + "loss": 0.5263, + "step": 2620 + }, + { + "epoch": 0.33, + "grad_norm": 1.3193574606716998, + "learning_rate": 7.88304466916055e-06, + "loss": 0.4542, + "step": 2621 + }, + { + "epoch": 0.33, + "grad_norm": 1.3564255854315617, + "learning_rate": 7.881402455180238e-06, + "loss": 0.4647, + "step": 2622 + }, + { + "epoch": 0.33, + "grad_norm": 1.3643754065286928, + "learning_rate": 7.87975977568398e-06, + "loss": 0.4483, + "step": 2623 + }, + { + "epoch": 0.33, + "grad_norm": 0.6474917452755293, + "learning_rate": 7.878116630937169e-06, + "loss": 0.5312, + "step": 2624 + }, + { + "epoch": 0.33, + "grad_norm": 1.3898488341013995, + "learning_rate": 7.876473021205266e-06, + "loss": 0.4786, + "step": 2625 + }, + { + "epoch": 0.33, + "grad_norm": 1.4444599189015728, + "learning_rate": 7.874828946753814e-06, + "loss": 0.5075, + "step": 2626 + }, + { + "epoch": 0.33, + "grad_norm": 1.4016382588893839, + "learning_rate": 7.873184407848428e-06, + "loss": 0.4868, + "step": 2627 + }, + { + "epoch": 0.33, + "grad_norm": 1.3055135483450153, + "learning_rate": 7.871539404754793e-06, + "loss": 0.4448, + "step": 2628 + }, + { + "epoch": 0.33, + "grad_norm": 1.4260820129750083, + "learning_rate": 7.86989393773868e-06, + "loss": 0.5093, + "step": 2629 + }, + { + "epoch": 0.33, + "grad_norm": 1.4595683756814524, + "learning_rate": 7.868248007065923e-06, + "loss": 0.528, + "step": 2630 + }, + { + "epoch": 0.33, + "grad_norm": 1.383304259086062, + "learning_rate": 7.86660161300244e-06, + "loss": 0.537, + "step": 2631 + }, + { + "epoch": 0.33, + "grad_norm": 0.7295511857203106, + "learning_rate": 7.86495475581422e-06, + "loss": 0.4833, + "step": 2632 + }, + { + "epoch": 0.33, + "grad_norm": 1.4553464257059818, + "learning_rate": 7.863307435767329e-06, + "loss": 0.5198, + "step": 2633 + }, + { + "epoch": 0.33, + "grad_norm": 1.2005384216193022, + "learning_rate": 7.861659653127899e-06, + "loss": 0.4692, + "step": 2634 + }, + { + "epoch": 0.33, + "grad_norm": 1.300754657681368, + "learning_rate": 7.860011408162153e-06, + "loss": 0.5397, + "step": 2635 + }, + { + "epoch": 0.33, + "grad_norm": 1.3431649243225865, + "learning_rate": 7.858362701136374e-06, + "loss": 0.5268, + "step": 2636 + }, + { + "epoch": 0.33, + "grad_norm": 1.4874194965826724, + "learning_rate": 7.856713532316927e-06, + "loss": 0.5276, + "step": 2637 + }, + { + "epoch": 0.33, + "grad_norm": 1.6575599415205786, + "learning_rate": 7.855063901970248e-06, + "loss": 0.5225, + "step": 2638 + }, + { + "epoch": 0.33, + "grad_norm": 1.41297561898523, + "learning_rate": 7.85341381036285e-06, + "loss": 0.5139, + "step": 2639 + }, + { + "epoch": 0.33, + "grad_norm": 1.1716924968520417, + "learning_rate": 7.851763257761322e-06, + "loss": 0.5113, + "step": 2640 + }, + { + "epoch": 0.33, + "grad_norm": 1.536281272372059, + "learning_rate": 7.850112244432322e-06, + "loss": 0.5021, + "step": 2641 + }, + { + "epoch": 0.33, + "grad_norm": 2.5685613187358625, + "learning_rate": 7.848460770642588e-06, + "loss": 0.5094, + "step": 2642 + }, + { + "epoch": 0.33, + "grad_norm": 1.5203119298235317, + "learning_rate": 7.846808836658931e-06, + "loss": 0.48, + "step": 2643 + }, + { + "epoch": 0.33, + "grad_norm": 1.5732793968340726, + "learning_rate": 7.845156442748232e-06, + "loss": 0.5238, + "step": 2644 + }, + { + "epoch": 0.33, + "grad_norm": 1.6797119267555378, + "learning_rate": 7.843503589177453e-06, + "loss": 0.5117, + "step": 2645 + }, + { + "epoch": 0.33, + "grad_norm": 1.3572741565536737, + "learning_rate": 7.841850276213626e-06, + "loss": 0.4669, + "step": 2646 + }, + { + "epoch": 0.33, + "grad_norm": 0.7178449432637385, + "learning_rate": 7.840196504123856e-06, + "loss": 0.5076, + "step": 2647 + }, + { + "epoch": 0.33, + "grad_norm": 1.6289373215406235, + "learning_rate": 7.838542273175328e-06, + "loss": 0.5302, + "step": 2648 + }, + { + "epoch": 0.33, + "grad_norm": 1.306086112198206, + "learning_rate": 7.836887583635297e-06, + "loss": 0.5639, + "step": 2649 + }, + { + "epoch": 0.33, + "grad_norm": 1.4157064488153976, + "learning_rate": 7.835232435771089e-06, + "loss": 0.524, + "step": 2650 + }, + { + "epoch": 0.33, + "grad_norm": 2.4036284659590974, + "learning_rate": 7.833576829850113e-06, + "loss": 0.5574, + "step": 2651 + }, + { + "epoch": 0.33, + "grad_norm": 1.5176353764519612, + "learning_rate": 7.831920766139844e-06, + "loss": 0.5141, + "step": 2652 + }, + { + "epoch": 0.33, + "grad_norm": 1.6075456494439913, + "learning_rate": 7.830264244907834e-06, + "loss": 0.5325, + "step": 2653 + }, + { + "epoch": 0.33, + "grad_norm": 0.6382628147633554, + "learning_rate": 7.828607266421705e-06, + "loss": 0.4846, + "step": 2654 + }, + { + "epoch": 0.33, + "grad_norm": 1.6454288930497467, + "learning_rate": 7.826949830949164e-06, + "loss": 0.5323, + "step": 2655 + }, + { + "epoch": 0.33, + "grad_norm": 1.994581273390472, + "learning_rate": 7.82529193875798e-06, + "loss": 0.4747, + "step": 2656 + }, + { + "epoch": 0.33, + "grad_norm": 2.349328328927657, + "learning_rate": 7.823633590116e-06, + "loss": 0.5051, + "step": 2657 + }, + { + "epoch": 0.33, + "grad_norm": 1.8911900323622857, + "learning_rate": 7.821974785291145e-06, + "loss": 0.5159, + "step": 2658 + }, + { + "epoch": 0.33, + "grad_norm": 1.6299950870453979, + "learning_rate": 7.82031552455141e-06, + "loss": 0.5159, + "step": 2659 + }, + { + "epoch": 0.33, + "grad_norm": 0.7036980394919018, + "learning_rate": 7.81865580816486e-06, + "loss": 0.5079, + "step": 2660 + }, + { + "epoch": 0.33, + "grad_norm": 1.2437606573047195, + "learning_rate": 7.816995636399644e-06, + "loss": 0.5237, + "step": 2661 + }, + { + "epoch": 0.33, + "grad_norm": 1.3132373861797184, + "learning_rate": 7.81533500952397e-06, + "loss": 0.5083, + "step": 2662 + }, + { + "epoch": 0.33, + "grad_norm": 1.38307268654614, + "learning_rate": 7.813673927806132e-06, + "loss": 0.5469, + "step": 2663 + }, + { + "epoch": 0.33, + "grad_norm": 1.649046553253519, + "learning_rate": 7.812012391514488e-06, + "loss": 0.528, + "step": 2664 + }, + { + "epoch": 0.33, + "grad_norm": 1.381471144918611, + "learning_rate": 7.81035040091748e-06, + "loss": 0.5184, + "step": 2665 + }, + { + "epoch": 0.33, + "grad_norm": 1.4352863734471388, + "learning_rate": 7.808687956283609e-06, + "loss": 0.5141, + "step": 2666 + }, + { + "epoch": 0.33, + "grad_norm": 1.3719121111184451, + "learning_rate": 7.807025057881463e-06, + "loss": 0.5035, + "step": 2667 + }, + { + "epoch": 0.33, + "grad_norm": 1.3920475091905369, + "learning_rate": 7.805361705979698e-06, + "loss": 0.5194, + "step": 2668 + }, + { + "epoch": 0.33, + "grad_norm": 1.5848959505401254, + "learning_rate": 7.803697900847042e-06, + "loss": 0.5866, + "step": 2669 + }, + { + "epoch": 0.33, + "grad_norm": 1.8520042447777643, + "learning_rate": 7.802033642752298e-06, + "loss": 0.5271, + "step": 2670 + }, + { + "epoch": 0.33, + "grad_norm": 1.4521535881547414, + "learning_rate": 7.80036893196434e-06, + "loss": 0.5194, + "step": 2671 + }, + { + "epoch": 0.33, + "grad_norm": 1.5181509592818552, + "learning_rate": 7.798703768752116e-06, + "loss": 0.5228, + "step": 2672 + }, + { + "epoch": 0.33, + "grad_norm": 1.5555764528978064, + "learning_rate": 7.79703815338465e-06, + "loss": 0.5209, + "step": 2673 + }, + { + "epoch": 0.33, + "grad_norm": 3.8939780601693137, + "learning_rate": 7.795372086131038e-06, + "loss": 0.5103, + "step": 2674 + }, + { + "epoch": 0.33, + "grad_norm": 1.5966979452355454, + "learning_rate": 7.793705567260445e-06, + "loss": 0.4822, + "step": 2675 + }, + { + "epoch": 0.33, + "grad_norm": 4.574764738764695, + "learning_rate": 7.792038597042113e-06, + "loss": 0.5219, + "step": 2676 + }, + { + "epoch": 0.33, + "grad_norm": 1.3781605626328648, + "learning_rate": 7.790371175745355e-06, + "loss": 0.5177, + "step": 2677 + }, + { + "epoch": 0.33, + "grad_norm": 1.31760703133334, + "learning_rate": 7.78870330363956e-06, + "loss": 0.5038, + "step": 2678 + }, + { + "epoch": 0.33, + "grad_norm": 1.4686543270678922, + "learning_rate": 7.787034980994184e-06, + "loss": 0.4935, + "step": 2679 + }, + { + "epoch": 0.33, + "grad_norm": 1.898415093232563, + "learning_rate": 7.78536620807876e-06, + "loss": 0.5388, + "step": 2680 + }, + { + "epoch": 0.33, + "grad_norm": 1.8871341002492472, + "learning_rate": 7.783696985162896e-06, + "loss": 0.5185, + "step": 2681 + }, + { + "epoch": 0.33, + "grad_norm": 1.756160964701136, + "learning_rate": 7.782027312516267e-06, + "loss": 0.4977, + "step": 2682 + }, + { + "epoch": 0.33, + "grad_norm": 1.6106092870879871, + "learning_rate": 7.780357190408622e-06, + "loss": 0.5323, + "step": 2683 + }, + { + "epoch": 0.33, + "grad_norm": 1.2645675766050717, + "learning_rate": 7.778686619109787e-06, + "loss": 0.4845, + "step": 2684 + }, + { + "epoch": 0.33, + "grad_norm": 1.7084608637158094, + "learning_rate": 7.777015598889656e-06, + "loss": 0.5252, + "step": 2685 + }, + { + "epoch": 0.33, + "grad_norm": 0.714675656281538, + "learning_rate": 7.775344130018196e-06, + "loss": 0.5275, + "step": 2686 + }, + { + "epoch": 0.33, + "grad_norm": 1.4409858609141166, + "learning_rate": 7.77367221276545e-06, + "loss": 0.4673, + "step": 2687 + }, + { + "epoch": 0.33, + "grad_norm": 1.4382057653735294, + "learning_rate": 7.77199984740153e-06, + "loss": 0.4813, + "step": 2688 + }, + { + "epoch": 0.33, + "grad_norm": 1.6903980772123022, + "learning_rate": 7.77032703419662e-06, + "loss": 0.4768, + "step": 2689 + }, + { + "epoch": 0.33, + "grad_norm": 2.7131248191772204, + "learning_rate": 7.76865377342098e-06, + "loss": 0.537, + "step": 2690 + }, + { + "epoch": 0.33, + "grad_norm": 1.5944381852953722, + "learning_rate": 7.766980065344938e-06, + "loss": 0.4751, + "step": 2691 + }, + { + "epoch": 0.33, + "grad_norm": 1.4666012126559613, + "learning_rate": 7.765305910238898e-06, + "loss": 0.5269, + "step": 2692 + }, + { + "epoch": 0.33, + "grad_norm": 3.471755000511237, + "learning_rate": 7.763631308373333e-06, + "loss": 0.5487, + "step": 2693 + }, + { + "epoch": 0.33, + "grad_norm": 1.3624088593151427, + "learning_rate": 7.76195626001879e-06, + "loss": 0.5085, + "step": 2694 + }, + { + "epoch": 0.33, + "grad_norm": 2.710395640324599, + "learning_rate": 7.760280765445888e-06, + "loss": 0.4905, + "step": 2695 + }, + { + "epoch": 0.33, + "grad_norm": 1.4268137254771691, + "learning_rate": 7.758604824925318e-06, + "loss": 0.5352, + "step": 2696 + }, + { + "epoch": 0.33, + "grad_norm": 2.3876286121831773, + "learning_rate": 7.756928438727844e-06, + "loss": 0.5045, + "step": 2697 + }, + { + "epoch": 0.33, + "grad_norm": 1.580257529292837, + "learning_rate": 7.755251607124298e-06, + "loss": 0.5337, + "step": 2698 + }, + { + "epoch": 0.33, + "grad_norm": 1.9168052951765553, + "learning_rate": 7.75357433038559e-06, + "loss": 0.5272, + "step": 2699 + }, + { + "epoch": 0.34, + "grad_norm": 1.6677726778026305, + "learning_rate": 7.751896608782696e-06, + "loss": 0.4915, + "step": 2700 + }, + { + "epoch": 0.34, + "grad_norm": 1.5021980264138404, + "learning_rate": 7.75021844258667e-06, + "loss": 0.5036, + "step": 2701 + }, + { + "epoch": 0.34, + "grad_norm": 1.6929177369758843, + "learning_rate": 7.748539832068633e-06, + "loss": 0.5464, + "step": 2702 + }, + { + "epoch": 0.34, + "grad_norm": 1.5228255867041702, + "learning_rate": 7.746860777499778e-06, + "loss": 0.5678, + "step": 2703 + }, + { + "epoch": 0.34, + "grad_norm": 1.8989886501816657, + "learning_rate": 7.745181279151373e-06, + "loss": 0.5263, + "step": 2704 + }, + { + "epoch": 0.34, + "grad_norm": 1.2883835482583696, + "learning_rate": 7.743501337294754e-06, + "loss": 0.5216, + "step": 2705 + }, + { + "epoch": 0.34, + "grad_norm": 3.960176675059009, + "learning_rate": 7.741820952201333e-06, + "loss": 0.5232, + "step": 2706 + }, + { + "epoch": 0.34, + "grad_norm": 1.7759698679308338, + "learning_rate": 7.740140124142587e-06, + "loss": 0.5104, + "step": 2707 + }, + { + "epoch": 0.34, + "grad_norm": 3.4527436454312945, + "learning_rate": 7.738458853390072e-06, + "loss": 0.5327, + "step": 2708 + }, + { + "epoch": 0.34, + "grad_norm": 1.3174291504094895, + "learning_rate": 7.736777140215412e-06, + "loss": 0.4753, + "step": 2709 + }, + { + "epoch": 0.34, + "grad_norm": 1.7018137791242018, + "learning_rate": 7.735094984890302e-06, + "loss": 0.5085, + "step": 2710 + }, + { + "epoch": 0.34, + "grad_norm": 1.5993123487899685, + "learning_rate": 7.733412387686508e-06, + "loss": 0.4816, + "step": 2711 + }, + { + "epoch": 0.34, + "grad_norm": 1.904796254721156, + "learning_rate": 7.731729348875868e-06, + "loss": 0.5451, + "step": 2712 + }, + { + "epoch": 0.34, + "grad_norm": 1.604283221653655, + "learning_rate": 7.730045868730294e-06, + "loss": 0.5185, + "step": 2713 + }, + { + "epoch": 0.34, + "grad_norm": 1.93420487671135, + "learning_rate": 7.728361947521765e-06, + "loss": 0.5288, + "step": 2714 + }, + { + "epoch": 0.34, + "grad_norm": 1.6354612809092355, + "learning_rate": 7.726677585522335e-06, + "loss": 0.466, + "step": 2715 + }, + { + "epoch": 0.34, + "grad_norm": 1.5483837104096023, + "learning_rate": 7.724992783004125e-06, + "loss": 0.545, + "step": 2716 + }, + { + "epoch": 0.34, + "grad_norm": 1.5651476019378647, + "learning_rate": 7.723307540239332e-06, + "loss": 0.5436, + "step": 2717 + }, + { + "epoch": 0.34, + "grad_norm": 2.886905512469594, + "learning_rate": 7.721621857500221e-06, + "loss": 0.4745, + "step": 2718 + }, + { + "epoch": 0.34, + "grad_norm": 2.1861427286963115, + "learning_rate": 7.719935735059131e-06, + "loss": 0.4465, + "step": 2719 + }, + { + "epoch": 0.34, + "grad_norm": 1.866549648988967, + "learning_rate": 7.718249173188465e-06, + "loss": 0.5152, + "step": 2720 + }, + { + "epoch": 0.34, + "grad_norm": 1.586552465840777, + "learning_rate": 7.716562172160706e-06, + "loss": 0.5207, + "step": 2721 + }, + { + "epoch": 0.34, + "grad_norm": 1.8513938439300908, + "learning_rate": 7.714874732248404e-06, + "loss": 0.4664, + "step": 2722 + }, + { + "epoch": 0.34, + "grad_norm": 1.4103660610251199, + "learning_rate": 7.713186853724176e-06, + "loss": 0.5427, + "step": 2723 + }, + { + "epoch": 0.34, + "grad_norm": 1.2258139433120145, + "learning_rate": 7.711498536860719e-06, + "loss": 0.5043, + "step": 2724 + }, + { + "epoch": 0.34, + "grad_norm": 1.3097524148833894, + "learning_rate": 7.709809781930791e-06, + "loss": 0.4709, + "step": 2725 + }, + { + "epoch": 0.34, + "grad_norm": 3.0590687856687837, + "learning_rate": 7.708120589207227e-06, + "loss": 0.5472, + "step": 2726 + }, + { + "epoch": 0.34, + "grad_norm": 2.0120462476542746, + "learning_rate": 7.706430958962932e-06, + "loss": 0.5511, + "step": 2727 + }, + { + "epoch": 0.34, + "grad_norm": 1.6327764346512121, + "learning_rate": 7.704740891470878e-06, + "loss": 0.4788, + "step": 2728 + }, + { + "epoch": 0.34, + "grad_norm": 1.4071899449611387, + "learning_rate": 7.703050387004111e-06, + "loss": 0.528, + "step": 2729 + }, + { + "epoch": 0.34, + "grad_norm": 1.4315602215673797, + "learning_rate": 7.70135944583575e-06, + "loss": 0.5531, + "step": 2730 + }, + { + "epoch": 0.34, + "grad_norm": 1.755214304596285, + "learning_rate": 7.69966806823898e-06, + "loss": 0.4996, + "step": 2731 + }, + { + "epoch": 0.34, + "grad_norm": 1.2982408118906106, + "learning_rate": 7.697976254487054e-06, + "loss": 0.5133, + "step": 2732 + }, + { + "epoch": 0.34, + "grad_norm": 1.3617359856408846, + "learning_rate": 7.696284004853303e-06, + "loss": 0.4639, + "step": 2733 + }, + { + "epoch": 0.34, + "grad_norm": 1.277470567398012, + "learning_rate": 7.694591319611124e-06, + "loss": 0.5406, + "step": 2734 + }, + { + "epoch": 0.34, + "grad_norm": 1.2375158989213473, + "learning_rate": 7.692898199033988e-06, + "loss": 0.4945, + "step": 2735 + }, + { + "epoch": 0.34, + "grad_norm": 1.6525320292638912, + "learning_rate": 7.691204643395426e-06, + "loss": 0.4814, + "step": 2736 + }, + { + "epoch": 0.34, + "grad_norm": 1.2916993899549458, + "learning_rate": 7.689510652969055e-06, + "loss": 0.4816, + "step": 2737 + }, + { + "epoch": 0.34, + "grad_norm": 3.391677601694305, + "learning_rate": 7.687816228028552e-06, + "loss": 0.4606, + "step": 2738 + }, + { + "epoch": 0.34, + "grad_norm": 1.4519576798931901, + "learning_rate": 7.686121368847666e-06, + "loss": 0.5472, + "step": 2739 + }, + { + "epoch": 0.34, + "grad_norm": 1.5009721768272837, + "learning_rate": 7.68442607570021e-06, + "loss": 0.5325, + "step": 2740 + }, + { + "epoch": 0.34, + "grad_norm": 1.3792196069659568, + "learning_rate": 7.682730348860085e-06, + "loss": 0.5191, + "step": 2741 + }, + { + "epoch": 0.34, + "grad_norm": 1.5022991314730445, + "learning_rate": 7.681034188601242e-06, + "loss": 0.517, + "step": 2742 + }, + { + "epoch": 0.34, + "grad_norm": 1.5302237971630834, + "learning_rate": 7.679337595197715e-06, + "loss": 0.4969, + "step": 2743 + }, + { + "epoch": 0.34, + "grad_norm": 1.4982130605022177, + "learning_rate": 7.677640568923601e-06, + "loss": 0.5161, + "step": 2744 + }, + { + "epoch": 0.34, + "grad_norm": 1.3912316865151857, + "learning_rate": 7.67594311005307e-06, + "loss": 0.4482, + "step": 2745 + }, + { + "epoch": 0.34, + "grad_norm": 1.3729878686082877, + "learning_rate": 7.674245218860362e-06, + "loss": 0.4852, + "step": 2746 + }, + { + "epoch": 0.34, + "grad_norm": 1.419155005976854, + "learning_rate": 7.672546895619786e-06, + "loss": 0.517, + "step": 2747 + }, + { + "epoch": 0.34, + "grad_norm": 1.4587619792042252, + "learning_rate": 7.670848140605723e-06, + "loss": 0.5279, + "step": 2748 + }, + { + "epoch": 0.34, + "grad_norm": 1.5269943732151012, + "learning_rate": 7.66914895409262e-06, + "loss": 0.5329, + "step": 2749 + }, + { + "epoch": 0.34, + "grad_norm": 1.975878240194581, + "learning_rate": 7.667449336354996e-06, + "loss": 0.4831, + "step": 2750 + }, + { + "epoch": 0.34, + "grad_norm": 1.4672990700567374, + "learning_rate": 7.665749287667436e-06, + "loss": 0.4792, + "step": 2751 + }, + { + "epoch": 0.34, + "grad_norm": 1.39519067576427, + "learning_rate": 7.664048808304603e-06, + "loss": 0.5704, + "step": 2752 + }, + { + "epoch": 0.34, + "grad_norm": 1.81220504073997, + "learning_rate": 7.662347898541222e-06, + "loss": 0.474, + "step": 2753 + }, + { + "epoch": 0.34, + "grad_norm": 1.454364363303493, + "learning_rate": 7.66064655865209e-06, + "loss": 0.4896, + "step": 2754 + }, + { + "epoch": 0.34, + "grad_norm": 2.345457097405705, + "learning_rate": 7.658944788912073e-06, + "loss": 0.4907, + "step": 2755 + }, + { + "epoch": 0.34, + "grad_norm": 1.6197799716830366, + "learning_rate": 7.657242589596107e-06, + "loss": 0.5136, + "step": 2756 + }, + { + "epoch": 0.34, + "grad_norm": 1.4100260413117522, + "learning_rate": 7.655539960979199e-06, + "loss": 0.4799, + "step": 2757 + }, + { + "epoch": 0.34, + "grad_norm": 1.4355816404214223, + "learning_rate": 7.653836903336423e-06, + "loss": 0.5246, + "step": 2758 + }, + { + "epoch": 0.34, + "grad_norm": 1.4093389459141474, + "learning_rate": 7.652133416942921e-06, + "loss": 0.4405, + "step": 2759 + }, + { + "epoch": 0.34, + "grad_norm": 2.1045817688671162, + "learning_rate": 7.650429502073909e-06, + "loss": 0.484, + "step": 2760 + }, + { + "epoch": 0.34, + "grad_norm": 1.4695892914070552, + "learning_rate": 7.648725159004666e-06, + "loss": 0.542, + "step": 2761 + }, + { + "epoch": 0.34, + "grad_norm": 1.3991047999982942, + "learning_rate": 7.647020388010546e-06, + "loss": 0.5196, + "step": 2762 + }, + { + "epoch": 0.34, + "grad_norm": 1.4581867933362196, + "learning_rate": 7.64531518936697e-06, + "loss": 0.5035, + "step": 2763 + }, + { + "epoch": 0.34, + "grad_norm": 1.505294682141353, + "learning_rate": 7.643609563349428e-06, + "loss": 0.5147, + "step": 2764 + }, + { + "epoch": 0.34, + "grad_norm": 1.6133177462796544, + "learning_rate": 7.641903510233478e-06, + "loss": 0.5426, + "step": 2765 + }, + { + "epoch": 0.34, + "grad_norm": 1.4806029734791937, + "learning_rate": 7.640197030294749e-06, + "loss": 0.5115, + "step": 2766 + }, + { + "epoch": 0.34, + "grad_norm": 1.616248266794655, + "learning_rate": 7.638490123808935e-06, + "loss": 0.4804, + "step": 2767 + }, + { + "epoch": 0.34, + "grad_norm": 1.3394062449465627, + "learning_rate": 7.636782791051805e-06, + "loss": 0.516, + "step": 2768 + }, + { + "epoch": 0.34, + "grad_norm": 4.177620453219728, + "learning_rate": 7.63507503229919e-06, + "loss": 0.5205, + "step": 2769 + }, + { + "epoch": 0.34, + "grad_norm": 2.5007327575946827, + "learning_rate": 7.633366847826999e-06, + "loss": 0.5107, + "step": 2770 + }, + { + "epoch": 0.34, + "grad_norm": 1.3918381642554443, + "learning_rate": 7.6316582379112e-06, + "loss": 0.5833, + "step": 2771 + }, + { + "epoch": 0.34, + "grad_norm": 1.8013497027412864, + "learning_rate": 7.629949202827835e-06, + "loss": 0.5097, + "step": 2772 + }, + { + "epoch": 0.34, + "grad_norm": 1.5598225631411284, + "learning_rate": 7.6282397428530135e-06, + "loss": 0.4993, + "step": 2773 + }, + { + "epoch": 0.34, + "grad_norm": 1.4219298971027583, + "learning_rate": 7.626529858262914e-06, + "loss": 0.5187, + "step": 2774 + }, + { + "epoch": 0.34, + "grad_norm": 1.601086179767575, + "learning_rate": 7.624819549333784e-06, + "loss": 0.4492, + "step": 2775 + }, + { + "epoch": 0.34, + "grad_norm": 1.5134415486399717, + "learning_rate": 7.62310881634194e-06, + "loss": 0.5014, + "step": 2776 + }, + { + "epoch": 0.34, + "grad_norm": 1.3155320012758154, + "learning_rate": 7.621397659563761e-06, + "loss": 0.5282, + "step": 2777 + }, + { + "epoch": 0.34, + "grad_norm": 4.026077379891773, + "learning_rate": 7.619686079275705e-06, + "loss": 0.516, + "step": 2778 + }, + { + "epoch": 0.34, + "grad_norm": 1.2985785718733407, + "learning_rate": 7.617974075754291e-06, + "loss": 0.487, + "step": 2779 + }, + { + "epoch": 0.34, + "grad_norm": 1.487272549933582, + "learning_rate": 7.616261649276107e-06, + "loss": 0.5503, + "step": 2780 + }, + { + "epoch": 0.35, + "grad_norm": 1.4482877030737968, + "learning_rate": 7.614548800117812e-06, + "loss": 0.5212, + "step": 2781 + }, + { + "epoch": 0.35, + "grad_norm": 1.264435244527073, + "learning_rate": 7.612835528556131e-06, + "loss": 0.4865, + "step": 2782 + }, + { + "epoch": 0.35, + "grad_norm": 1.428380630654264, + "learning_rate": 7.611121834867858e-06, + "loss": 0.4424, + "step": 2783 + }, + { + "epoch": 0.35, + "grad_norm": 1.4593193495946697, + "learning_rate": 7.609407719329858e-06, + "loss": 0.5258, + "step": 2784 + }, + { + "epoch": 0.35, + "grad_norm": 2.595849152307415, + "learning_rate": 7.607693182219058e-06, + "loss": 0.5197, + "step": 2785 + }, + { + "epoch": 0.35, + "grad_norm": 2.1367861478862946, + "learning_rate": 7.605978223812458e-06, + "loss": 0.5408, + "step": 2786 + }, + { + "epoch": 0.35, + "grad_norm": 1.237539684863762, + "learning_rate": 7.604262844387125e-06, + "loss": 0.5326, + "step": 2787 + }, + { + "epoch": 0.35, + "grad_norm": 1.4710863852700993, + "learning_rate": 7.602547044220192e-06, + "loss": 0.481, + "step": 2788 + }, + { + "epoch": 0.35, + "grad_norm": 1.2748156376890598, + "learning_rate": 7.600830823588864e-06, + "loss": 0.465, + "step": 2789 + }, + { + "epoch": 0.35, + "grad_norm": 1.2474025526575772, + "learning_rate": 7.5991141827704084e-06, + "loss": 0.4957, + "step": 2790 + }, + { + "epoch": 0.35, + "grad_norm": 1.361004388110375, + "learning_rate": 7.5973971220421685e-06, + "loss": 0.4817, + "step": 2791 + }, + { + "epoch": 0.35, + "grad_norm": 1.727555590942853, + "learning_rate": 7.595679641681546e-06, + "loss": 0.5122, + "step": 2792 + }, + { + "epoch": 0.35, + "grad_norm": 1.463675582477899, + "learning_rate": 7.593961741966019e-06, + "loss": 0.4723, + "step": 2793 + }, + { + "epoch": 0.35, + "grad_norm": 1.6756022980618925, + "learning_rate": 7.592243423173124e-06, + "loss": 0.5637, + "step": 2794 + }, + { + "epoch": 0.35, + "grad_norm": 1.339013495875156, + "learning_rate": 7.5905246855804735e-06, + "loss": 0.4775, + "step": 2795 + }, + { + "epoch": 0.35, + "grad_norm": 1.9885912649008708, + "learning_rate": 7.588805529465747e-06, + "loss": 0.5408, + "step": 2796 + }, + { + "epoch": 0.35, + "grad_norm": 1.3473130209888207, + "learning_rate": 7.587085955106685e-06, + "loss": 0.4883, + "step": 2797 + }, + { + "epoch": 0.35, + "grad_norm": 1.721228409362882, + "learning_rate": 7.585365962781103e-06, + "loss": 0.4898, + "step": 2798 + }, + { + "epoch": 0.35, + "grad_norm": 0.6689030328737668, + "learning_rate": 7.58364555276688e-06, + "loss": 0.5007, + "step": 2799 + }, + { + "epoch": 0.35, + "grad_norm": 1.312323699854146, + "learning_rate": 7.5819247253419624e-06, + "loss": 0.4944, + "step": 2800 + }, + { + "epoch": 0.35, + "grad_norm": 1.2682572784572763, + "learning_rate": 7.5802034807843675e-06, + "loss": 0.4607, + "step": 2801 + }, + { + "epoch": 0.35, + "grad_norm": 1.536315066547835, + "learning_rate": 7.578481819372174e-06, + "loss": 0.5442, + "step": 2802 + }, + { + "epoch": 0.35, + "grad_norm": 1.6095888676927919, + "learning_rate": 7.576759741383534e-06, + "loss": 0.5528, + "step": 2803 + }, + { + "epoch": 0.35, + "grad_norm": 1.3160002037099694, + "learning_rate": 7.575037247096664e-06, + "loss": 0.5164, + "step": 2804 + }, + { + "epoch": 0.35, + "grad_norm": 1.8566511781629418, + "learning_rate": 7.5733143367898475e-06, + "loss": 0.4993, + "step": 2805 + }, + { + "epoch": 0.35, + "grad_norm": 1.5097008377535144, + "learning_rate": 7.571591010741436e-06, + "loss": 0.5012, + "step": 2806 + }, + { + "epoch": 0.35, + "grad_norm": 1.3973734343760345, + "learning_rate": 7.569867269229849e-06, + "loss": 0.5597, + "step": 2807 + }, + { + "epoch": 0.35, + "grad_norm": 1.3440934662874864, + "learning_rate": 7.56814311253357e-06, + "loss": 0.5049, + "step": 2808 + }, + { + "epoch": 0.35, + "grad_norm": 1.7619676156979018, + "learning_rate": 7.566418540931154e-06, + "loss": 0.5333, + "step": 2809 + }, + { + "epoch": 0.35, + "grad_norm": 2.4386815776036004, + "learning_rate": 7.56469355470122e-06, + "loss": 0.4954, + "step": 2810 + }, + { + "epoch": 0.35, + "grad_norm": 1.347905562414754, + "learning_rate": 7.5629681541224544e-06, + "loss": 0.4376, + "step": 2811 + }, + { + "epoch": 0.35, + "grad_norm": 1.7566813074923715, + "learning_rate": 7.56124233947361e-06, + "loss": 0.5428, + "step": 2812 + }, + { + "epoch": 0.35, + "grad_norm": 1.4255239120541885, + "learning_rate": 7.559516111033509e-06, + "loss": 0.5485, + "step": 2813 + }, + { + "epoch": 0.35, + "grad_norm": 7.169394255097597, + "learning_rate": 7.557789469081036e-06, + "loss": 0.4957, + "step": 2814 + }, + { + "epoch": 0.35, + "grad_norm": 5.599542317523516, + "learning_rate": 7.55606241389515e-06, + "loss": 0.5238, + "step": 2815 + }, + { + "epoch": 0.35, + "grad_norm": 1.7719411350941612, + "learning_rate": 7.554334945754869e-06, + "loss": 0.5163, + "step": 2816 + }, + { + "epoch": 0.35, + "grad_norm": 2.781268122054056, + "learning_rate": 7.552607064939279e-06, + "loss": 0.5377, + "step": 2817 + }, + { + "epoch": 0.35, + "grad_norm": 1.4987431909303914, + "learning_rate": 7.550878771727537e-06, + "loss": 0.5219, + "step": 2818 + }, + { + "epoch": 0.35, + "grad_norm": 1.2623810509286617, + "learning_rate": 7.549150066398865e-06, + "loss": 0.5444, + "step": 2819 + }, + { + "epoch": 0.35, + "grad_norm": 1.5077713293075448, + "learning_rate": 7.5474209492325466e-06, + "loss": 0.5356, + "step": 2820 + }, + { + "epoch": 0.35, + "grad_norm": 1.4323981568838295, + "learning_rate": 7.5456914205079405e-06, + "loss": 0.5173, + "step": 2821 + }, + { + "epoch": 0.35, + "grad_norm": 1.4514281182699293, + "learning_rate": 7.543961480504463e-06, + "loss": 0.4911, + "step": 2822 + }, + { + "epoch": 0.35, + "grad_norm": 1.3481986067972602, + "learning_rate": 7.542231129501603e-06, + "loss": 0.5285, + "step": 2823 + }, + { + "epoch": 0.35, + "grad_norm": 3.539752312980218, + "learning_rate": 7.540500367778916e-06, + "loss": 0.4925, + "step": 2824 + }, + { + "epoch": 0.35, + "grad_norm": 1.3970147800199832, + "learning_rate": 7.538769195616018e-06, + "loss": 0.5636, + "step": 2825 + }, + { + "epoch": 0.35, + "grad_norm": 2.2279673122276384, + "learning_rate": 7.537037613292597e-06, + "loss": 0.4888, + "step": 2826 + }, + { + "epoch": 0.35, + "grad_norm": 1.7812718415678297, + "learning_rate": 7.535305621088407e-06, + "loss": 0.5011, + "step": 2827 + }, + { + "epoch": 0.35, + "grad_norm": 1.5110838443802594, + "learning_rate": 7.533573219283264e-06, + "loss": 0.5166, + "step": 2828 + }, + { + "epoch": 0.35, + "grad_norm": 1.3924394435460279, + "learning_rate": 7.531840408157054e-06, + "loss": 0.5189, + "step": 2829 + }, + { + "epoch": 0.35, + "grad_norm": 1.5926740455071986, + "learning_rate": 7.530107187989727e-06, + "loss": 0.5627, + "step": 2830 + }, + { + "epoch": 0.35, + "grad_norm": 1.679223687426762, + "learning_rate": 7.528373559061299e-06, + "loss": 0.5035, + "step": 2831 + }, + { + "epoch": 0.35, + "grad_norm": 1.4755516250039995, + "learning_rate": 7.526639521651858e-06, + "loss": 0.4423, + "step": 2832 + }, + { + "epoch": 0.35, + "grad_norm": 1.452773889809484, + "learning_rate": 7.524905076041548e-06, + "loss": 0.5392, + "step": 2833 + }, + { + "epoch": 0.35, + "grad_norm": 1.3163601597702044, + "learning_rate": 7.5231702225105855e-06, + "loss": 0.4859, + "step": 2834 + }, + { + "epoch": 0.35, + "grad_norm": 1.4811247611517149, + "learning_rate": 7.521434961339251e-06, + "loss": 0.5279, + "step": 2835 + }, + { + "epoch": 0.35, + "grad_norm": 1.3828663889486925, + "learning_rate": 7.519699292807891e-06, + "loss": 0.5661, + "step": 2836 + }, + { + "epoch": 0.35, + "grad_norm": 1.4661354083477, + "learning_rate": 7.517963217196922e-06, + "loss": 0.5295, + "step": 2837 + }, + { + "epoch": 0.35, + "grad_norm": 1.2459607024844084, + "learning_rate": 7.516226734786818e-06, + "loss": 0.4879, + "step": 2838 + }, + { + "epoch": 0.35, + "grad_norm": 0.6301408152941137, + "learning_rate": 7.514489845858122e-06, + "loss": 0.509, + "step": 2839 + }, + { + "epoch": 0.35, + "grad_norm": 3.89784173623586, + "learning_rate": 7.512752550691447e-06, + "loss": 0.51, + "step": 2840 + }, + { + "epoch": 0.35, + "grad_norm": 1.5483161180179505, + "learning_rate": 7.511014849567466e-06, + "loss": 0.4395, + "step": 2841 + }, + { + "epoch": 0.35, + "grad_norm": 1.509431583014947, + "learning_rate": 7.509276742766922e-06, + "loss": 0.5203, + "step": 2842 + }, + { + "epoch": 0.35, + "grad_norm": 1.3967413235682298, + "learning_rate": 7.50753823057062e-06, + "loss": 0.5108, + "step": 2843 + }, + { + "epoch": 0.35, + "grad_norm": 2.454115123765699, + "learning_rate": 7.505799313259433e-06, + "loss": 0.4783, + "step": 2844 + }, + { + "epoch": 0.35, + "grad_norm": 1.6608105137667641, + "learning_rate": 7.504059991114298e-06, + "loss": 0.5376, + "step": 2845 + }, + { + "epoch": 0.35, + "grad_norm": 1.3770781664755012, + "learning_rate": 7.502320264416217e-06, + "loss": 0.5124, + "step": 2846 + }, + { + "epoch": 0.35, + "grad_norm": 1.2886223833941657, + "learning_rate": 7.500580133446259e-06, + "loss": 0.4993, + "step": 2847 + }, + { + "epoch": 0.35, + "grad_norm": 1.3551384829891164, + "learning_rate": 7.498839598485557e-06, + "loss": 0.5414, + "step": 2848 + }, + { + "epoch": 0.35, + "grad_norm": 1.4519349854873853, + "learning_rate": 7.497098659815312e-06, + "loss": 0.4865, + "step": 2849 + }, + { + "epoch": 0.35, + "grad_norm": 1.5215147079444222, + "learning_rate": 7.495357317716784e-06, + "loss": 0.4933, + "step": 2850 + }, + { + "epoch": 0.35, + "grad_norm": 1.6889627685059738, + "learning_rate": 7.493615572471303e-06, + "loss": 0.5437, + "step": 2851 + }, + { + "epoch": 0.35, + "grad_norm": 1.5426468176935286, + "learning_rate": 7.491873424360267e-06, + "loss": 0.5536, + "step": 2852 + }, + { + "epoch": 0.35, + "grad_norm": 1.2934185127919198, + "learning_rate": 7.490130873665131e-06, + "loss": 0.555, + "step": 2853 + }, + { + "epoch": 0.35, + "grad_norm": 3.8929268075082235, + "learning_rate": 7.488387920667423e-06, + "loss": 0.5058, + "step": 2854 + }, + { + "epoch": 0.35, + "grad_norm": 1.5327500386068296, + "learning_rate": 7.486644565648731e-06, + "loss": 0.4923, + "step": 2855 + }, + { + "epoch": 0.35, + "grad_norm": 1.7011582348896546, + "learning_rate": 7.484900808890707e-06, + "loss": 0.5602, + "step": 2856 + }, + { + "epoch": 0.35, + "grad_norm": 1.5780262628699275, + "learning_rate": 7.483156650675073e-06, + "loss": 0.5853, + "step": 2857 + }, + { + "epoch": 0.35, + "grad_norm": 1.9623520716665723, + "learning_rate": 7.481412091283613e-06, + "loss": 0.5624, + "step": 2858 + }, + { + "epoch": 0.35, + "grad_norm": 1.4578953352943917, + "learning_rate": 7.479667130998174e-06, + "loss": 0.5114, + "step": 2859 + }, + { + "epoch": 0.35, + "grad_norm": 1.5016057014942639, + "learning_rate": 7.477921770100672e-06, + "loss": 0.5319, + "step": 2860 + }, + { + "epoch": 0.36, + "grad_norm": 0.6416102122722422, + "learning_rate": 7.476176008873084e-06, + "loss": 0.5037, + "step": 2861 + }, + { + "epoch": 0.36, + "grad_norm": 1.179152985299466, + "learning_rate": 7.474429847597454e-06, + "loss": 0.5166, + "step": 2862 + }, + { + "epoch": 0.36, + "grad_norm": 1.2704451553872496, + "learning_rate": 7.472683286555889e-06, + "loss": 0.4983, + "step": 2863 + }, + { + "epoch": 0.36, + "grad_norm": 1.4942884809928834, + "learning_rate": 7.470936326030562e-06, + "loss": 0.5133, + "step": 2864 + }, + { + "epoch": 0.36, + "grad_norm": 1.553793602713353, + "learning_rate": 7.46918896630371e-06, + "loss": 0.4992, + "step": 2865 + }, + { + "epoch": 0.36, + "grad_norm": 1.4876867607076067, + "learning_rate": 7.467441207657633e-06, + "loss": 0.5729, + "step": 2866 + }, + { + "epoch": 0.36, + "grad_norm": 1.3095012089466964, + "learning_rate": 7.465693050374698e-06, + "loss": 0.4689, + "step": 2867 + }, + { + "epoch": 0.36, + "grad_norm": 1.2044094333174193, + "learning_rate": 7.463944494737334e-06, + "loss": 0.5217, + "step": 2868 + }, + { + "epoch": 0.36, + "grad_norm": 1.8193312473869578, + "learning_rate": 7.462195541028037e-06, + "loss": 0.5285, + "step": 2869 + }, + { + "epoch": 0.36, + "grad_norm": 1.4909392607182927, + "learning_rate": 7.460446189529365e-06, + "loss": 0.5196, + "step": 2870 + }, + { + "epoch": 0.36, + "grad_norm": 1.24481673162198, + "learning_rate": 7.458696440523942e-06, + "loss": 0.4666, + "step": 2871 + }, + { + "epoch": 0.36, + "grad_norm": 1.5376375614629765, + "learning_rate": 7.4569462942944525e-06, + "loss": 0.5228, + "step": 2872 + }, + { + "epoch": 0.36, + "grad_norm": 1.4790336815690879, + "learning_rate": 7.455195751123654e-06, + "loss": 0.539, + "step": 2873 + }, + { + "epoch": 0.36, + "grad_norm": 1.3890204848477787, + "learning_rate": 7.453444811294357e-06, + "loss": 0.493, + "step": 2874 + }, + { + "epoch": 0.36, + "grad_norm": 1.9199696185437813, + "learning_rate": 7.451693475089442e-06, + "loss": 0.5167, + "step": 2875 + }, + { + "epoch": 0.36, + "grad_norm": 1.4822184521520818, + "learning_rate": 7.449941742791853e-06, + "loss": 0.5066, + "step": 2876 + }, + { + "epoch": 0.36, + "grad_norm": 1.3316945278591281, + "learning_rate": 7.448189614684599e-06, + "loss": 0.5055, + "step": 2877 + }, + { + "epoch": 0.36, + "grad_norm": 5.5407533800955155, + "learning_rate": 7.446437091050751e-06, + "loss": 0.4829, + "step": 2878 + }, + { + "epoch": 0.36, + "grad_norm": 1.7652822301013407, + "learning_rate": 7.444684172173445e-06, + "loss": 0.4929, + "step": 2879 + }, + { + "epoch": 0.36, + "grad_norm": 1.4786722254819273, + "learning_rate": 7.442930858335879e-06, + "loss": 0.4984, + "step": 2880 + }, + { + "epoch": 0.36, + "grad_norm": 1.5148769815243324, + "learning_rate": 7.44117714982132e-06, + "loss": 0.4862, + "step": 2881 + }, + { + "epoch": 0.36, + "grad_norm": 1.2571723121367273, + "learning_rate": 7.43942304691309e-06, + "loss": 0.5446, + "step": 2882 + }, + { + "epoch": 0.36, + "grad_norm": 1.4647084158513137, + "learning_rate": 7.437668549894583e-06, + "loss": 0.5334, + "step": 2883 + }, + { + "epoch": 0.36, + "grad_norm": 1.4714820556689951, + "learning_rate": 7.435913659049253e-06, + "loss": 0.4864, + "step": 2884 + }, + { + "epoch": 0.36, + "grad_norm": 1.2478116026485935, + "learning_rate": 7.434158374660617e-06, + "loss": 0.5445, + "step": 2885 + }, + { + "epoch": 0.36, + "grad_norm": 1.616471999591579, + "learning_rate": 7.432402697012258e-06, + "loss": 0.5104, + "step": 2886 + }, + { + "epoch": 0.36, + "grad_norm": 1.5126140439909073, + "learning_rate": 7.430646626387821e-06, + "loss": 0.5154, + "step": 2887 + }, + { + "epoch": 0.36, + "grad_norm": 1.4132347024921716, + "learning_rate": 7.428890163071013e-06, + "loss": 0.5495, + "step": 2888 + }, + { + "epoch": 0.36, + "grad_norm": 1.301685970559841, + "learning_rate": 7.427133307345608e-06, + "loss": 0.4776, + "step": 2889 + }, + { + "epoch": 0.36, + "grad_norm": 1.221108781930627, + "learning_rate": 7.425376059495442e-06, + "loss": 0.4889, + "step": 2890 + }, + { + "epoch": 0.36, + "grad_norm": 2.177713222796027, + "learning_rate": 7.4236184198044115e-06, + "loss": 0.4286, + "step": 2891 + }, + { + "epoch": 0.36, + "grad_norm": 1.631555248333458, + "learning_rate": 7.421860388556481e-06, + "loss": 0.5464, + "step": 2892 + }, + { + "epoch": 0.36, + "grad_norm": 1.4870772812914097, + "learning_rate": 7.4201019660356745e-06, + "loss": 0.4909, + "step": 2893 + }, + { + "epoch": 0.36, + "grad_norm": 0.7172788153908546, + "learning_rate": 7.418343152526081e-06, + "loss": 0.5042, + "step": 2894 + }, + { + "epoch": 0.36, + "grad_norm": 2.8282438880838727, + "learning_rate": 7.416583948311852e-06, + "loss": 0.5513, + "step": 2895 + }, + { + "epoch": 0.36, + "grad_norm": 1.7214347174465112, + "learning_rate": 7.414824353677202e-06, + "loss": 0.5221, + "step": 2896 + }, + { + "epoch": 0.36, + "grad_norm": 2.3148725718413026, + "learning_rate": 7.4130643689064105e-06, + "loss": 0.5182, + "step": 2897 + }, + { + "epoch": 0.36, + "grad_norm": 1.2618207897069937, + "learning_rate": 7.411303994283818e-06, + "loss": 0.4973, + "step": 2898 + }, + { + "epoch": 0.36, + "grad_norm": 1.5852620561225959, + "learning_rate": 7.4095432300938295e-06, + "loss": 0.5121, + "step": 2899 + }, + { + "epoch": 0.36, + "grad_norm": 1.3827972981621872, + "learning_rate": 7.407782076620909e-06, + "loss": 0.4795, + "step": 2900 + }, + { + "epoch": 0.36, + "grad_norm": 1.3328404352084577, + "learning_rate": 7.4060205341495895e-06, + "loss": 0.5338, + "step": 2901 + }, + { + "epoch": 0.36, + "grad_norm": 1.5203616571569938, + "learning_rate": 7.404258602964462e-06, + "loss": 0.5179, + "step": 2902 + }, + { + "epoch": 0.36, + "grad_norm": 1.4862472301200262, + "learning_rate": 7.402496283350182e-06, + "loss": 0.582, + "step": 2903 + }, + { + "epoch": 0.36, + "grad_norm": 1.5447324937281806, + "learning_rate": 7.400733575591469e-06, + "loss": 0.5216, + "step": 2904 + }, + { + "epoch": 0.36, + "grad_norm": 1.353338309003225, + "learning_rate": 7.398970479973101e-06, + "loss": 0.5196, + "step": 2905 + }, + { + "epoch": 0.36, + "grad_norm": 1.390823770991522, + "learning_rate": 7.3972069967799255e-06, + "loss": 0.5827, + "step": 2906 + }, + { + "epoch": 0.36, + "grad_norm": 1.3209525144108847, + "learning_rate": 7.395443126296846e-06, + "loss": 0.5036, + "step": 2907 + }, + { + "epoch": 0.36, + "grad_norm": 1.5146042399906572, + "learning_rate": 7.3936788688088335e-06, + "loss": 0.5087, + "step": 2908 + }, + { + "epoch": 0.36, + "grad_norm": 1.7655408507349146, + "learning_rate": 7.391914224600918e-06, + "loss": 0.5265, + "step": 2909 + }, + { + "epoch": 0.36, + "grad_norm": 1.3637002400842049, + "learning_rate": 7.390149193958192e-06, + "loss": 0.5321, + "step": 2910 + }, + { + "epoch": 0.36, + "grad_norm": 1.559416043795649, + "learning_rate": 7.388383777165815e-06, + "loss": 0.5393, + "step": 2911 + }, + { + "epoch": 0.36, + "grad_norm": 1.2247524083410941, + "learning_rate": 7.386617974509002e-06, + "loss": 0.4754, + "step": 2912 + }, + { + "epoch": 0.36, + "grad_norm": 1.4028519626964275, + "learning_rate": 7.3848517862730364e-06, + "loss": 0.5671, + "step": 2913 + }, + { + "epoch": 0.36, + "grad_norm": 1.6481816227825374, + "learning_rate": 7.38308521274326e-06, + "loss": 0.4992, + "step": 2914 + }, + { + "epoch": 0.36, + "grad_norm": 1.5708813531444694, + "learning_rate": 7.381318254205081e-06, + "loss": 0.5116, + "step": 2915 + }, + { + "epoch": 0.36, + "grad_norm": 1.6203098339157995, + "learning_rate": 7.3795509109439645e-06, + "loss": 0.5183, + "step": 2916 + }, + { + "epoch": 0.36, + "grad_norm": 1.2587625270337148, + "learning_rate": 7.377783183245442e-06, + "loss": 0.4795, + "step": 2917 + }, + { + "epoch": 0.36, + "grad_norm": 1.4369800726343376, + "learning_rate": 7.376015071395103e-06, + "loss": 0.5057, + "step": 2918 + }, + { + "epoch": 0.36, + "grad_norm": 1.3720643090525502, + "learning_rate": 7.374246575678604e-06, + "loss": 0.4933, + "step": 2919 + }, + { + "epoch": 0.36, + "grad_norm": 1.4278576722593987, + "learning_rate": 7.372477696381659e-06, + "loss": 0.5417, + "step": 2920 + }, + { + "epoch": 0.36, + "grad_norm": 2.8221467932962376, + "learning_rate": 7.370708433790048e-06, + "loss": 0.5501, + "step": 2921 + }, + { + "epoch": 0.36, + "grad_norm": 1.6611923403031745, + "learning_rate": 7.36893878818961e-06, + "loss": 0.545, + "step": 2922 + }, + { + "epoch": 0.36, + "grad_norm": 1.3367746208219013, + "learning_rate": 7.367168759866248e-06, + "loss": 0.5271, + "step": 2923 + }, + { + "epoch": 0.36, + "grad_norm": 1.3873043665769509, + "learning_rate": 7.3653983491059245e-06, + "loss": 0.5342, + "step": 2924 + }, + { + "epoch": 0.36, + "grad_norm": 1.686022468418169, + "learning_rate": 7.363627556194663e-06, + "loss": 0.5073, + "step": 2925 + }, + { + "epoch": 0.36, + "grad_norm": 3.864577609095847, + "learning_rate": 7.361856381418555e-06, + "loss": 0.5025, + "step": 2926 + }, + { + "epoch": 0.36, + "grad_norm": 1.5126834615642972, + "learning_rate": 7.360084825063748e-06, + "loss": 0.5628, + "step": 2927 + }, + { + "epoch": 0.36, + "grad_norm": 1.4677059654643878, + "learning_rate": 7.35831288741645e-06, + "loss": 0.493, + "step": 2928 + }, + { + "epoch": 0.36, + "grad_norm": 1.5816508458999734, + "learning_rate": 7.356540568762936e-06, + "loss": 0.5537, + "step": 2929 + }, + { + "epoch": 0.36, + "grad_norm": 1.5373700847214247, + "learning_rate": 7.354767869389537e-06, + "loss": 0.526, + "step": 2930 + }, + { + "epoch": 0.36, + "grad_norm": 1.407280590400603, + "learning_rate": 7.3529947895826505e-06, + "loss": 0.5235, + "step": 2931 + }, + { + "epoch": 0.36, + "grad_norm": 1.3570233205482056, + "learning_rate": 7.351221329628733e-06, + "loss": 0.485, + "step": 2932 + }, + { + "epoch": 0.36, + "grad_norm": 1.3560438717504448, + "learning_rate": 7.349447489814301e-06, + "loss": 0.539, + "step": 2933 + }, + { + "epoch": 0.36, + "grad_norm": 1.5107167346554402, + "learning_rate": 7.347673270425935e-06, + "loss": 0.5465, + "step": 2934 + }, + { + "epoch": 0.36, + "grad_norm": 1.5350279692878122, + "learning_rate": 7.345898671750277e-06, + "loss": 0.5205, + "step": 2935 + }, + { + "epoch": 0.36, + "grad_norm": 1.9087051914119872, + "learning_rate": 7.344123694074028e-06, + "loss": 0.46, + "step": 2936 + }, + { + "epoch": 0.36, + "grad_norm": 1.316312234164915, + "learning_rate": 7.342348337683949e-06, + "loss": 0.5125, + "step": 2937 + }, + { + "epoch": 0.36, + "grad_norm": 1.237503490731533, + "learning_rate": 7.340572602866868e-06, + "loss": 0.4504, + "step": 2938 + }, + { + "epoch": 0.36, + "grad_norm": 1.4873898920144497, + "learning_rate": 7.338796489909668e-06, + "loss": 0.5169, + "step": 2939 + }, + { + "epoch": 0.36, + "grad_norm": 1.4055193178497787, + "learning_rate": 7.337019999099297e-06, + "loss": 0.5279, + "step": 2940 + }, + { + "epoch": 0.36, + "grad_norm": 1.3350012468586496, + "learning_rate": 7.335243130722763e-06, + "loss": 0.5018, + "step": 2941 + }, + { + "epoch": 0.37, + "grad_norm": 1.3696508667171363, + "learning_rate": 7.333465885067133e-06, + "loss": 0.4933, + "step": 2942 + }, + { + "epoch": 0.37, + "grad_norm": 1.2081108750109073, + "learning_rate": 7.331688262419539e-06, + "loss": 0.4823, + "step": 2943 + }, + { + "epoch": 0.37, + "grad_norm": 2.126683979625058, + "learning_rate": 7.329910263067172e-06, + "loss": 0.5157, + "step": 2944 + }, + { + "epoch": 0.37, + "grad_norm": 1.6644563292123145, + "learning_rate": 7.328131887297281e-06, + "loss": 0.4806, + "step": 2945 + }, + { + "epoch": 0.37, + "grad_norm": 2.006276596512924, + "learning_rate": 7.326353135397177e-06, + "loss": 0.4804, + "step": 2946 + }, + { + "epoch": 0.37, + "grad_norm": 1.5446131874657179, + "learning_rate": 7.3245740076542385e-06, + "loss": 0.4882, + "step": 2947 + }, + { + "epoch": 0.37, + "grad_norm": 1.6923510062890936, + "learning_rate": 7.322794504355894e-06, + "loss": 0.5492, + "step": 2948 + }, + { + "epoch": 0.37, + "grad_norm": 1.3394289668212493, + "learning_rate": 7.321014625789641e-06, + "loss": 0.4962, + "step": 2949 + }, + { + "epoch": 0.37, + "grad_norm": 2.4749240895876494, + "learning_rate": 7.319234372243032e-06, + "loss": 0.5458, + "step": 2950 + }, + { + "epoch": 0.37, + "grad_norm": 1.6144924649110135, + "learning_rate": 7.317453744003686e-06, + "loss": 0.5033, + "step": 2951 + }, + { + "epoch": 0.37, + "grad_norm": 1.4906919511209462, + "learning_rate": 7.315672741359277e-06, + "loss": 0.5148, + "step": 2952 + }, + { + "epoch": 0.37, + "grad_norm": 1.4532325048454164, + "learning_rate": 7.313891364597541e-06, + "loss": 0.5407, + "step": 2953 + }, + { + "epoch": 0.37, + "grad_norm": 2.1811973839585446, + "learning_rate": 7.3121096140062776e-06, + "loss": 0.4752, + "step": 2954 + }, + { + "epoch": 0.37, + "grad_norm": 4.461913376198192, + "learning_rate": 7.310327489873341e-06, + "loss": 0.5184, + "step": 2955 + }, + { + "epoch": 0.37, + "grad_norm": 1.4023156824094034, + "learning_rate": 7.308544992486653e-06, + "loss": 0.5481, + "step": 2956 + }, + { + "epoch": 0.37, + "grad_norm": 1.3419673390743536, + "learning_rate": 7.3067621221341875e-06, + "loss": 0.4797, + "step": 2957 + }, + { + "epoch": 0.37, + "grad_norm": 2.821401888509523, + "learning_rate": 7.304978879103986e-06, + "loss": 0.5255, + "step": 2958 + }, + { + "epoch": 0.37, + "grad_norm": 1.7809055463536292, + "learning_rate": 7.303195263684146e-06, + "loss": 0.4903, + "step": 2959 + }, + { + "epoch": 0.37, + "grad_norm": 1.5198332285659275, + "learning_rate": 7.301411276162827e-06, + "loss": 0.4952, + "step": 2960 + }, + { + "epoch": 0.37, + "grad_norm": 1.4624158529664613, + "learning_rate": 7.299626916828246e-06, + "loss": 0.5132, + "step": 2961 + }, + { + "epoch": 0.37, + "grad_norm": 1.4693717947778802, + "learning_rate": 7.2978421859686845e-06, + "loss": 0.4981, + "step": 2962 + }, + { + "epoch": 0.37, + "grad_norm": 1.3892004234553528, + "learning_rate": 7.296057083872481e-06, + "loss": 0.5037, + "step": 2963 + }, + { + "epoch": 0.37, + "grad_norm": 1.6086533258064626, + "learning_rate": 7.294271610828032e-06, + "loss": 0.4843, + "step": 2964 + }, + { + "epoch": 0.37, + "grad_norm": 1.438061788483176, + "learning_rate": 7.2924857671237996e-06, + "loss": 0.4706, + "step": 2965 + }, + { + "epoch": 0.37, + "grad_norm": 1.3813683589454293, + "learning_rate": 7.2906995530482986e-06, + "loss": 0.5731, + "step": 2966 + }, + { + "epoch": 0.37, + "grad_norm": 1.4969900098101476, + "learning_rate": 7.288912968890112e-06, + "loss": 0.4994, + "step": 2967 + }, + { + "epoch": 0.37, + "grad_norm": 1.483277435038951, + "learning_rate": 7.287126014937876e-06, + "loss": 0.5449, + "step": 2968 + }, + { + "epoch": 0.37, + "grad_norm": 1.5714496524913182, + "learning_rate": 7.285338691480289e-06, + "loss": 0.5268, + "step": 2969 + }, + { + "epoch": 0.37, + "grad_norm": 1.396912755938003, + "learning_rate": 7.283550998806108e-06, + "loss": 0.5263, + "step": 2970 + }, + { + "epoch": 0.37, + "grad_norm": 1.3184850804940975, + "learning_rate": 7.2817629372041544e-06, + "loss": 0.4417, + "step": 2971 + }, + { + "epoch": 0.37, + "grad_norm": 1.5471097162982606, + "learning_rate": 7.279974506963301e-06, + "loss": 0.5393, + "step": 2972 + }, + { + "epoch": 0.37, + "grad_norm": 1.501741983849576, + "learning_rate": 7.278185708372485e-06, + "loss": 0.5418, + "step": 2973 + }, + { + "epoch": 0.37, + "grad_norm": 1.5656687536865674, + "learning_rate": 7.276396541720703e-06, + "loss": 0.5272, + "step": 2974 + }, + { + "epoch": 0.37, + "grad_norm": 1.3627469786622979, + "learning_rate": 7.274607007297011e-06, + "loss": 0.5223, + "step": 2975 + }, + { + "epoch": 0.37, + "grad_norm": 1.6736889928051844, + "learning_rate": 7.272817105390525e-06, + "loss": 0.5768, + "step": 2976 + }, + { + "epoch": 0.37, + "grad_norm": 1.284650601665037, + "learning_rate": 7.271026836290418e-06, + "loss": 0.4861, + "step": 2977 + }, + { + "epoch": 0.37, + "grad_norm": 1.1384482986339133, + "learning_rate": 7.269236200285925e-06, + "loss": 0.4921, + "step": 2978 + }, + { + "epoch": 0.37, + "grad_norm": 1.3967900521778172, + "learning_rate": 7.267445197666336e-06, + "loss": 0.446, + "step": 2979 + }, + { + "epoch": 0.37, + "grad_norm": 1.4677001082213201, + "learning_rate": 7.265653828721007e-06, + "loss": 0.4801, + "step": 2980 + }, + { + "epoch": 0.37, + "grad_norm": 1.4026183591203205, + "learning_rate": 7.263862093739349e-06, + "loss": 0.5119, + "step": 2981 + }, + { + "epoch": 0.37, + "grad_norm": 1.495942156017364, + "learning_rate": 7.2620699930108295e-06, + "loss": 0.5288, + "step": 2982 + }, + { + "epoch": 0.37, + "grad_norm": 1.5489605948938818, + "learning_rate": 7.26027752682498e-06, + "loss": 0.5008, + "step": 2983 + }, + { + "epoch": 0.37, + "grad_norm": 1.70728934022771, + "learning_rate": 7.258484695471391e-06, + "loss": 0.4936, + "step": 2984 + }, + { + "epoch": 0.37, + "grad_norm": 1.4794145127355325, + "learning_rate": 7.256691499239708e-06, + "loss": 0.5802, + "step": 2985 + }, + { + "epoch": 0.37, + "grad_norm": 1.2814675673826028, + "learning_rate": 7.254897938419637e-06, + "loss": 0.5676, + "step": 2986 + }, + { + "epoch": 0.37, + "grad_norm": 1.828637831299985, + "learning_rate": 7.253104013300944e-06, + "loss": 0.4818, + "step": 2987 + }, + { + "epoch": 0.37, + "grad_norm": 1.4575986554723903, + "learning_rate": 7.251309724173457e-06, + "loss": 0.5136, + "step": 2988 + }, + { + "epoch": 0.37, + "grad_norm": 1.4005442517007833, + "learning_rate": 7.249515071327054e-06, + "loss": 0.5347, + "step": 2989 + }, + { + "epoch": 0.37, + "grad_norm": 1.3491908553203378, + "learning_rate": 7.2477200550516805e-06, + "loss": 0.5472, + "step": 2990 + }, + { + "epoch": 0.37, + "grad_norm": 2.1381894261443333, + "learning_rate": 7.245924675637335e-06, + "loss": 0.4848, + "step": 2991 + }, + { + "epoch": 0.37, + "grad_norm": 1.3732018389100948, + "learning_rate": 7.244128933374078e-06, + "loss": 0.5319, + "step": 2992 + }, + { + "epoch": 0.37, + "grad_norm": 1.7890526708319623, + "learning_rate": 7.242332828552028e-06, + "loss": 0.4988, + "step": 2993 + }, + { + "epoch": 0.37, + "grad_norm": 3.0272958037206954, + "learning_rate": 7.240536361461361e-06, + "loss": 0.5384, + "step": 2994 + }, + { + "epoch": 0.37, + "grad_norm": 1.5727488158742704, + "learning_rate": 7.238739532392311e-06, + "loss": 0.5257, + "step": 2995 + }, + { + "epoch": 0.37, + "grad_norm": 1.6202822584631098, + "learning_rate": 7.236942341635172e-06, + "loss": 0.5044, + "step": 2996 + }, + { + "epoch": 0.37, + "grad_norm": 1.3387512396302006, + "learning_rate": 7.2351447894802975e-06, + "loss": 0.5279, + "step": 2997 + }, + { + "epoch": 0.37, + "grad_norm": 1.4411390442007235, + "learning_rate": 7.233346876218097e-06, + "loss": 0.5236, + "step": 2998 + }, + { + "epoch": 0.37, + "grad_norm": 1.5184640188144154, + "learning_rate": 7.231548602139038e-06, + "loss": 0.4996, + "step": 2999 + }, + { + "epoch": 0.37, + "grad_norm": 2.94169479448034, + "learning_rate": 7.22974996753365e-06, + "loss": 0.5685, + "step": 3000 + }, + { + "epoch": 0.37, + "grad_norm": 0.6834086594487581, + "learning_rate": 7.227950972692517e-06, + "loss": 0.4798, + "step": 3001 + }, + { + "epoch": 0.37, + "grad_norm": 0.6671153007124514, + "learning_rate": 7.226151617906281e-06, + "loss": 0.5056, + "step": 3002 + }, + { + "epoch": 0.37, + "grad_norm": 1.703830372372508, + "learning_rate": 7.224351903465644e-06, + "loss": 0.5304, + "step": 3003 + }, + { + "epoch": 0.37, + "grad_norm": 1.4897957844867715, + "learning_rate": 7.222551829661368e-06, + "loss": 0.5406, + "step": 3004 + }, + { + "epoch": 0.37, + "grad_norm": 1.2547913791026493, + "learning_rate": 7.22075139678427e-06, + "loss": 0.5353, + "step": 3005 + }, + { + "epoch": 0.37, + "grad_norm": 1.3197654635000653, + "learning_rate": 7.2189506051252255e-06, + "loss": 0.479, + "step": 3006 + }, + { + "epoch": 0.37, + "grad_norm": 1.7469509824577085, + "learning_rate": 7.217149454975168e-06, + "loss": 0.5134, + "step": 3007 + }, + { + "epoch": 0.37, + "grad_norm": 1.6336829981917924, + "learning_rate": 7.215347946625088e-06, + "loss": 0.5164, + "step": 3008 + }, + { + "epoch": 0.37, + "grad_norm": 1.3129226881612475, + "learning_rate": 7.213546080366036e-06, + "loss": 0.4935, + "step": 3009 + }, + { + "epoch": 0.37, + "grad_norm": 1.476449022913655, + "learning_rate": 7.21174385648912e-06, + "loss": 0.5313, + "step": 3010 + }, + { + "epoch": 0.37, + "grad_norm": 1.457575248183513, + "learning_rate": 7.209941275285504e-06, + "loss": 0.554, + "step": 3011 + }, + { + "epoch": 0.37, + "grad_norm": 1.4815888327448383, + "learning_rate": 7.208138337046413e-06, + "loss": 0.5028, + "step": 3012 + }, + { + "epoch": 0.37, + "grad_norm": 1.7516509103197164, + "learning_rate": 7.206335042063125e-06, + "loss": 0.5096, + "step": 3013 + }, + { + "epoch": 0.37, + "grad_norm": 1.391886475393212, + "learning_rate": 7.20453139062698e-06, + "loss": 0.5031, + "step": 3014 + }, + { + "epoch": 0.37, + "grad_norm": 1.5367984607946585, + "learning_rate": 7.202727383029372e-06, + "loss": 0.5289, + "step": 3015 + }, + { + "epoch": 0.37, + "grad_norm": 1.7267274430184791, + "learning_rate": 7.200923019561756e-06, + "loss": 0.5325, + "step": 3016 + }, + { + "epoch": 0.37, + "grad_norm": 1.538263403773552, + "learning_rate": 7.199118300515644e-06, + "loss": 0.4914, + "step": 3017 + }, + { + "epoch": 0.37, + "grad_norm": 1.5598289001903407, + "learning_rate": 7.197313226182601e-06, + "loss": 0.4706, + "step": 3018 + }, + { + "epoch": 0.37, + "grad_norm": 1.39709071983512, + "learning_rate": 7.195507796854253e-06, + "loss": 0.519, + "step": 3019 + }, + { + "epoch": 0.37, + "grad_norm": 1.97273352499986, + "learning_rate": 7.193702012822285e-06, + "loss": 0.521, + "step": 3020 + }, + { + "epoch": 0.37, + "grad_norm": 2.484045494576766, + "learning_rate": 7.191895874378436e-06, + "loss": 0.5457, + "step": 3021 + }, + { + "epoch": 0.38, + "grad_norm": 1.4923881307280848, + "learning_rate": 7.190089381814505e-06, + "loss": 0.4928, + "step": 3022 + }, + { + "epoch": 0.38, + "grad_norm": 8.044763977393002, + "learning_rate": 7.188282535422345e-06, + "loss": 0.5349, + "step": 3023 + }, + { + "epoch": 0.38, + "grad_norm": 1.6422923664857394, + "learning_rate": 7.186475335493867e-06, + "loss": 0.5185, + "step": 3024 + }, + { + "epoch": 0.38, + "grad_norm": 2.1337642672469794, + "learning_rate": 7.184667782321044e-06, + "loss": 0.568, + "step": 3025 + }, + { + "epoch": 0.38, + "grad_norm": 1.285695554350569, + "learning_rate": 7.182859876195903e-06, + "loss": 0.5264, + "step": 3026 + }, + { + "epoch": 0.38, + "grad_norm": 1.7352072076106895, + "learning_rate": 7.1810516174105195e-06, + "loss": 0.5556, + "step": 3027 + }, + { + "epoch": 0.38, + "grad_norm": 1.631523989032772, + "learning_rate": 7.179243006257038e-06, + "loss": 0.5464, + "step": 3028 + }, + { + "epoch": 0.38, + "grad_norm": 1.5491726163007693, + "learning_rate": 7.177434043027658e-06, + "loss": 0.4847, + "step": 3029 + }, + { + "epoch": 0.38, + "grad_norm": 0.7115960087940415, + "learning_rate": 7.175624728014631e-06, + "loss": 0.5297, + "step": 3030 + }, + { + "epoch": 0.38, + "grad_norm": 1.3545527937682438, + "learning_rate": 7.173815061510267e-06, + "loss": 0.45, + "step": 3031 + }, + { + "epoch": 0.38, + "grad_norm": 1.2777021474778372, + "learning_rate": 7.172005043806934e-06, + "loss": 0.5129, + "step": 3032 + }, + { + "epoch": 0.38, + "grad_norm": 1.5843455375733593, + "learning_rate": 7.170194675197059e-06, + "loss": 0.4926, + "step": 3033 + }, + { + "epoch": 0.38, + "grad_norm": 1.5214693147833276, + "learning_rate": 7.168383955973119e-06, + "loss": 0.5229, + "step": 3034 + }, + { + "epoch": 0.38, + "grad_norm": 1.261454930205327, + "learning_rate": 7.166572886427655e-06, + "loss": 0.465, + "step": 3035 + }, + { + "epoch": 0.38, + "grad_norm": 1.419593791340515, + "learning_rate": 7.164761466853258e-06, + "loss": 0.548, + "step": 3036 + }, + { + "epoch": 0.38, + "grad_norm": 1.364144842925871, + "learning_rate": 7.162949697542583e-06, + "loss": 0.4924, + "step": 3037 + }, + { + "epoch": 0.38, + "grad_norm": 1.417175570424057, + "learning_rate": 7.161137578788333e-06, + "loss": 0.5086, + "step": 3038 + }, + { + "epoch": 0.38, + "grad_norm": 1.6138840172605053, + "learning_rate": 7.159325110883274e-06, + "loss": 0.5614, + "step": 3039 + }, + { + "epoch": 0.38, + "grad_norm": 1.4548766722256878, + "learning_rate": 7.157512294120225e-06, + "loss": 0.5243, + "step": 3040 + }, + { + "epoch": 0.38, + "grad_norm": 1.6327820741598216, + "learning_rate": 7.155699128792063e-06, + "loss": 0.537, + "step": 3041 + }, + { + "epoch": 0.38, + "grad_norm": 1.6640787478353725, + "learning_rate": 7.153885615191723e-06, + "loss": 0.55, + "step": 3042 + }, + { + "epoch": 0.38, + "grad_norm": 1.526062849121019, + "learning_rate": 7.15207175361219e-06, + "loss": 0.5163, + "step": 3043 + }, + { + "epoch": 0.38, + "grad_norm": 1.488004814390481, + "learning_rate": 7.150257544346513e-06, + "loss": 0.5014, + "step": 3044 + }, + { + "epoch": 0.38, + "grad_norm": 1.458619976622426, + "learning_rate": 7.148442987687792e-06, + "loss": 0.5324, + "step": 3045 + }, + { + "epoch": 0.38, + "grad_norm": 0.6743278152473341, + "learning_rate": 7.146628083929183e-06, + "loss": 0.5597, + "step": 3046 + }, + { + "epoch": 0.38, + "grad_norm": 1.4734133235019455, + "learning_rate": 7.144812833363902e-06, + "loss": 0.4929, + "step": 3047 + }, + { + "epoch": 0.38, + "grad_norm": 1.5452831563979532, + "learning_rate": 7.142997236285217e-06, + "loss": 0.5187, + "step": 3048 + }, + { + "epoch": 0.38, + "grad_norm": 1.2664513770058123, + "learning_rate": 7.141181292986457e-06, + "loss": 0.5586, + "step": 3049 + }, + { + "epoch": 0.38, + "grad_norm": 1.3615095997700022, + "learning_rate": 7.139365003760998e-06, + "loss": 0.5051, + "step": 3050 + }, + { + "epoch": 0.38, + "grad_norm": 1.843213610546877, + "learning_rate": 7.137548368902284e-06, + "loss": 0.4999, + "step": 3051 + }, + { + "epoch": 0.38, + "grad_norm": 1.4640428774254095, + "learning_rate": 7.135731388703804e-06, + "loss": 0.4822, + "step": 3052 + }, + { + "epoch": 0.38, + "grad_norm": 1.5575020507385404, + "learning_rate": 7.133914063459108e-06, + "loss": 0.5176, + "step": 3053 + }, + { + "epoch": 0.38, + "grad_norm": 1.3698924554546537, + "learning_rate": 7.132096393461801e-06, + "loss": 0.5603, + "step": 3054 + }, + { + "epoch": 0.38, + "grad_norm": 1.3405839230187293, + "learning_rate": 7.130278379005545e-06, + "loss": 0.5195, + "step": 3055 + }, + { + "epoch": 0.38, + "grad_norm": 1.5140107590051919, + "learning_rate": 7.128460020384055e-06, + "loss": 0.4992, + "step": 3056 + }, + { + "epoch": 0.38, + "grad_norm": 1.1915114977633574, + "learning_rate": 7.126641317891101e-06, + "loss": 0.4718, + "step": 3057 + }, + { + "epoch": 0.38, + "grad_norm": 1.4336714376199327, + "learning_rate": 7.124822271820513e-06, + "loss": 0.4776, + "step": 3058 + }, + { + "epoch": 0.38, + "grad_norm": 1.388417058377369, + "learning_rate": 7.123002882466174e-06, + "loss": 0.4925, + "step": 3059 + }, + { + "epoch": 0.38, + "grad_norm": 1.538238246586762, + "learning_rate": 7.121183150122022e-06, + "loss": 0.5452, + "step": 3060 + }, + { + "epoch": 0.38, + "grad_norm": 0.7098794479468666, + "learning_rate": 7.1193630750820495e-06, + "loss": 0.5372, + "step": 3061 + }, + { + "epoch": 0.38, + "grad_norm": 1.6669472641299299, + "learning_rate": 7.117542657640307e-06, + "loss": 0.5566, + "step": 3062 + }, + { + "epoch": 0.38, + "grad_norm": 1.6543545849103167, + "learning_rate": 7.1157218980908984e-06, + "loss": 0.5145, + "step": 3063 + }, + { + "epoch": 0.38, + "grad_norm": 1.9544024341278428, + "learning_rate": 7.113900796727984e-06, + "loss": 0.5462, + "step": 3064 + }, + { + "epoch": 0.38, + "grad_norm": 1.933868451967738, + "learning_rate": 7.112079353845775e-06, + "loss": 0.5273, + "step": 3065 + }, + { + "epoch": 0.38, + "grad_norm": 1.4942206033292202, + "learning_rate": 7.110257569738549e-06, + "loss": 0.4997, + "step": 3066 + }, + { + "epoch": 0.38, + "grad_norm": 1.9699941736244426, + "learning_rate": 7.108435444700626e-06, + "loss": 0.4467, + "step": 3067 + }, + { + "epoch": 0.38, + "grad_norm": 1.5284924845003787, + "learning_rate": 7.106612979026387e-06, + "loss": 0.4825, + "step": 3068 + }, + { + "epoch": 0.38, + "grad_norm": 0.6737801292785671, + "learning_rate": 7.104790173010268e-06, + "loss": 0.5245, + "step": 3069 + }, + { + "epoch": 0.38, + "grad_norm": 1.4078834972046943, + "learning_rate": 7.102967026946758e-06, + "loss": 0.5132, + "step": 3070 + }, + { + "epoch": 0.38, + "grad_norm": 1.3875591801648743, + "learning_rate": 7.101143541130407e-06, + "loss": 0.4935, + "step": 3071 + }, + { + "epoch": 0.38, + "grad_norm": 1.2158183608208777, + "learning_rate": 7.0993197158558095e-06, + "loss": 0.491, + "step": 3072 + }, + { + "epoch": 0.38, + "grad_norm": 1.7437431691806793, + "learning_rate": 7.097495551417621e-06, + "loss": 0.5167, + "step": 3073 + }, + { + "epoch": 0.38, + "grad_norm": 1.429828259302202, + "learning_rate": 7.095671048110555e-06, + "loss": 0.5072, + "step": 3074 + }, + { + "epoch": 0.38, + "grad_norm": 1.4081079213848775, + "learning_rate": 7.093846206229373e-06, + "loss": 0.521, + "step": 3075 + }, + { + "epoch": 0.38, + "grad_norm": 1.512805807502615, + "learning_rate": 7.092021026068897e-06, + "loss": 0.5068, + "step": 3076 + }, + { + "epoch": 0.38, + "grad_norm": 1.6821786251772541, + "learning_rate": 7.090195507923998e-06, + "loss": 0.5147, + "step": 3077 + }, + { + "epoch": 0.38, + "grad_norm": 1.5016684049091409, + "learning_rate": 7.088369652089607e-06, + "loss": 0.4811, + "step": 3078 + }, + { + "epoch": 0.38, + "grad_norm": 1.3965871276845612, + "learning_rate": 7.086543458860706e-06, + "loss": 0.517, + "step": 3079 + }, + { + "epoch": 0.38, + "grad_norm": 1.9053127055549555, + "learning_rate": 7.084716928532334e-06, + "loss": 0.4807, + "step": 3080 + }, + { + "epoch": 0.38, + "grad_norm": 1.5364628357558445, + "learning_rate": 7.0828900613995775e-06, + "loss": 0.5673, + "step": 3081 + }, + { + "epoch": 0.38, + "grad_norm": 1.5726149366761402, + "learning_rate": 7.08106285775759e-06, + "loss": 0.5595, + "step": 3082 + }, + { + "epoch": 0.38, + "grad_norm": 1.4407485616166227, + "learning_rate": 7.079235317901569e-06, + "loss": 0.5062, + "step": 3083 + }, + { + "epoch": 0.38, + "grad_norm": 1.9085919826528777, + "learning_rate": 7.07740744212677e-06, + "loss": 0.5158, + "step": 3084 + }, + { + "epoch": 0.38, + "grad_norm": 1.486163869615989, + "learning_rate": 7.0755792307285024e-06, + "loss": 0.4841, + "step": 3085 + }, + { + "epoch": 0.38, + "grad_norm": 1.3083113514428966, + "learning_rate": 7.07375068400213e-06, + "loss": 0.4705, + "step": 3086 + }, + { + "epoch": 0.38, + "grad_norm": 1.831123081797392, + "learning_rate": 7.0719218022430715e-06, + "loss": 0.5052, + "step": 3087 + }, + { + "epoch": 0.38, + "grad_norm": 1.6126664823754961, + "learning_rate": 7.070092585746798e-06, + "loss": 0.5042, + "step": 3088 + }, + { + "epoch": 0.38, + "grad_norm": 1.4079867138889446, + "learning_rate": 7.0682630348088336e-06, + "loss": 0.5271, + "step": 3089 + }, + { + "epoch": 0.38, + "grad_norm": 1.4242873896765287, + "learning_rate": 7.066433149724762e-06, + "loss": 0.4858, + "step": 3090 + }, + { + "epoch": 0.38, + "grad_norm": 1.4017570824767323, + "learning_rate": 7.064602930790215e-06, + "loss": 0.5615, + "step": 3091 + }, + { + "epoch": 0.38, + "grad_norm": 0.6840962622500598, + "learning_rate": 7.062772378300882e-06, + "loss": 0.4723, + "step": 3092 + }, + { + "epoch": 0.38, + "grad_norm": 1.3857989267211437, + "learning_rate": 7.060941492552502e-06, + "loss": 0.5404, + "step": 3093 + }, + { + "epoch": 0.38, + "grad_norm": 5.490729207593928, + "learning_rate": 7.0591102738408735e-06, + "loss": 0.543, + "step": 3094 + }, + { + "epoch": 0.38, + "grad_norm": 2.5869525897166517, + "learning_rate": 7.057278722461845e-06, + "loss": 0.5254, + "step": 3095 + }, + { + "epoch": 0.38, + "grad_norm": 1.6003221119375428, + "learning_rate": 7.0554468387113214e-06, + "loss": 0.524, + "step": 3096 + }, + { + "epoch": 0.38, + "grad_norm": 1.557506866511841, + "learning_rate": 7.053614622885258e-06, + "loss": 0.4996, + "step": 3097 + }, + { + "epoch": 0.38, + "grad_norm": 1.973734265611302, + "learning_rate": 7.051782075279665e-06, + "loss": 0.4846, + "step": 3098 + }, + { + "epoch": 0.38, + "grad_norm": 1.393020111046219, + "learning_rate": 7.049949196190607e-06, + "loss": 0.5014, + "step": 3099 + }, + { + "epoch": 0.38, + "grad_norm": 1.6516241639640925, + "learning_rate": 7.048115985914204e-06, + "loss": 0.4566, + "step": 3100 + }, + { + "epoch": 0.38, + "grad_norm": 1.314792255216263, + "learning_rate": 7.046282444746624e-06, + "loss": 0.4513, + "step": 3101 + }, + { + "epoch": 0.38, + "grad_norm": 1.5707224724922517, + "learning_rate": 7.044448572984091e-06, + "loss": 0.5088, + "step": 3102 + }, + { + "epoch": 0.39, + "grad_norm": 1.4978671506491446, + "learning_rate": 7.042614370922887e-06, + "loss": 0.527, + "step": 3103 + }, + { + "epoch": 0.39, + "grad_norm": 1.7301185957841059, + "learning_rate": 7.0407798388593415e-06, + "loss": 0.4978, + "step": 3104 + }, + { + "epoch": 0.39, + "grad_norm": 1.2063768657721639, + "learning_rate": 7.03894497708984e-06, + "loss": 0.5352, + "step": 3105 + }, + { + "epoch": 0.39, + "grad_norm": 1.5425265184295172, + "learning_rate": 7.03710978591082e-06, + "loss": 0.5108, + "step": 3106 + }, + { + "epoch": 0.39, + "grad_norm": 1.646854735973331, + "learning_rate": 7.035274265618772e-06, + "loss": 0.5169, + "step": 3107 + }, + { + "epoch": 0.39, + "grad_norm": 1.5097235016060884, + "learning_rate": 7.033438416510241e-06, + "loss": 0.4808, + "step": 3108 + }, + { + "epoch": 0.39, + "grad_norm": 1.6812662874754638, + "learning_rate": 7.031602238881826e-06, + "loss": 0.5268, + "step": 3109 + }, + { + "epoch": 0.39, + "grad_norm": 1.479023250144563, + "learning_rate": 7.029765733030175e-06, + "loss": 0.523, + "step": 3110 + }, + { + "epoch": 0.39, + "grad_norm": 1.3145790277291225, + "learning_rate": 7.027928899251995e-06, + "loss": 0.5166, + "step": 3111 + }, + { + "epoch": 0.39, + "grad_norm": 1.4619991861740071, + "learning_rate": 7.02609173784404e-06, + "loss": 0.5387, + "step": 3112 + }, + { + "epoch": 0.39, + "grad_norm": 1.3539576115167475, + "learning_rate": 7.0242542491031205e-06, + "loss": 0.4792, + "step": 3113 + }, + { + "epoch": 0.39, + "grad_norm": 1.859711344496118, + "learning_rate": 7.022416433326099e-06, + "loss": 0.5065, + "step": 3114 + }, + { + "epoch": 0.39, + "grad_norm": 0.6856514624428732, + "learning_rate": 7.020578290809892e-06, + "loss": 0.4905, + "step": 3115 + }, + { + "epoch": 0.39, + "grad_norm": 2.294158902627508, + "learning_rate": 7.018739821851466e-06, + "loss": 0.5295, + "step": 3116 + }, + { + "epoch": 0.39, + "grad_norm": 1.5880413052145466, + "learning_rate": 7.016901026747842e-06, + "loss": 0.5467, + "step": 3117 + }, + { + "epoch": 0.39, + "grad_norm": 1.3439530838922578, + "learning_rate": 7.0150619057960926e-06, + "loss": 0.4675, + "step": 3118 + }, + { + "epoch": 0.39, + "grad_norm": 1.6216617436812264, + "learning_rate": 7.0132224592933464e-06, + "loss": 0.5298, + "step": 3119 + }, + { + "epoch": 0.39, + "grad_norm": 1.5317615759809857, + "learning_rate": 7.011382687536781e-06, + "loss": 0.5099, + "step": 3120 + }, + { + "epoch": 0.39, + "grad_norm": 1.8957008492341494, + "learning_rate": 7.009542590823628e-06, + "loss": 0.5106, + "step": 3121 + }, + { + "epoch": 0.39, + "grad_norm": 1.3386803487504768, + "learning_rate": 7.007702169451169e-06, + "loss": 0.4911, + "step": 3122 + }, + { + "epoch": 0.39, + "grad_norm": 1.600767614933352, + "learning_rate": 7.0058614237167445e-06, + "loss": 0.4981, + "step": 3123 + }, + { + "epoch": 0.39, + "grad_norm": 1.6010427135166088, + "learning_rate": 7.004020353917742e-06, + "loss": 0.4986, + "step": 3124 + }, + { + "epoch": 0.39, + "grad_norm": 1.6392739027265255, + "learning_rate": 7.0021789603515995e-06, + "loss": 0.5125, + "step": 3125 + }, + { + "epoch": 0.39, + "grad_norm": 1.7549088594860496, + "learning_rate": 7.000337243315812e-06, + "loss": 0.4807, + "step": 3126 + }, + { + "epoch": 0.39, + "grad_norm": 1.7399474805452981, + "learning_rate": 6.998495203107927e-06, + "loss": 0.4826, + "step": 3127 + }, + { + "epoch": 0.39, + "grad_norm": 2.0452652233489848, + "learning_rate": 6.996652840025539e-06, + "loss": 0.5083, + "step": 3128 + }, + { + "epoch": 0.39, + "grad_norm": 1.4516835604294887, + "learning_rate": 6.994810154366302e-06, + "loss": 0.481, + "step": 3129 + }, + { + "epoch": 0.39, + "grad_norm": 2.229369461643623, + "learning_rate": 6.992967146427913e-06, + "loss": 0.5527, + "step": 3130 + }, + { + "epoch": 0.39, + "grad_norm": 1.2383374837601475, + "learning_rate": 6.991123816508131e-06, + "loss": 0.4843, + "step": 3131 + }, + { + "epoch": 0.39, + "grad_norm": 1.4156075245677167, + "learning_rate": 6.989280164904759e-06, + "loss": 0.4891, + "step": 3132 + }, + { + "epoch": 0.39, + "grad_norm": 1.711299286904344, + "learning_rate": 6.987436191915658e-06, + "loss": 0.5811, + "step": 3133 + }, + { + "epoch": 0.39, + "grad_norm": 2.4285050158864885, + "learning_rate": 6.985591897838736e-06, + "loss": 0.4793, + "step": 3134 + }, + { + "epoch": 0.39, + "grad_norm": 1.6946023949726925, + "learning_rate": 6.983747282971954e-06, + "loss": 0.5321, + "step": 3135 + }, + { + "epoch": 0.39, + "grad_norm": 1.468337455984439, + "learning_rate": 6.981902347613328e-06, + "loss": 0.4785, + "step": 3136 + }, + { + "epoch": 0.39, + "grad_norm": 1.5886861702690127, + "learning_rate": 6.980057092060924e-06, + "loss": 0.4854, + "step": 3137 + }, + { + "epoch": 0.39, + "grad_norm": 1.2264486795838963, + "learning_rate": 6.9782115166128565e-06, + "loss": 0.5179, + "step": 3138 + }, + { + "epoch": 0.39, + "grad_norm": 1.5917890050453414, + "learning_rate": 6.976365621567295e-06, + "loss": 0.5331, + "step": 3139 + }, + { + "epoch": 0.39, + "grad_norm": 1.478122933457729, + "learning_rate": 6.974519407222462e-06, + "loss": 0.4715, + "step": 3140 + }, + { + "epoch": 0.39, + "grad_norm": 1.7984096108555954, + "learning_rate": 6.9726728738766295e-06, + "loss": 0.5443, + "step": 3141 + }, + { + "epoch": 0.39, + "grad_norm": 1.6560769479857425, + "learning_rate": 6.97082602182812e-06, + "loss": 0.4792, + "step": 3142 + }, + { + "epoch": 0.39, + "grad_norm": 1.7874171722080707, + "learning_rate": 6.9689788513753094e-06, + "loss": 0.5356, + "step": 3143 + }, + { + "epoch": 0.39, + "grad_norm": 0.6950227531979957, + "learning_rate": 6.967131362816623e-06, + "loss": 0.4974, + "step": 3144 + }, + { + "epoch": 0.39, + "grad_norm": 1.539464285764076, + "learning_rate": 6.965283556450542e-06, + "loss": 0.5479, + "step": 3145 + }, + { + "epoch": 0.39, + "grad_norm": 1.4146763723631428, + "learning_rate": 6.963435432575593e-06, + "loss": 0.5373, + "step": 3146 + }, + { + "epoch": 0.39, + "grad_norm": 1.4136312113724916, + "learning_rate": 6.961586991490357e-06, + "loss": 0.529, + "step": 3147 + }, + { + "epoch": 0.39, + "grad_norm": 1.9189851726240688, + "learning_rate": 6.959738233493466e-06, + "loss": 0.5202, + "step": 3148 + }, + { + "epoch": 0.39, + "grad_norm": 1.399927805798971, + "learning_rate": 6.957889158883604e-06, + "loss": 0.5258, + "step": 3149 + }, + { + "epoch": 0.39, + "grad_norm": 1.6344488410503175, + "learning_rate": 6.9560397679595044e-06, + "loss": 0.5367, + "step": 3150 + }, + { + "epoch": 0.39, + "grad_norm": 1.441472487643674, + "learning_rate": 6.954190061019954e-06, + "loss": 0.5707, + "step": 3151 + }, + { + "epoch": 0.39, + "grad_norm": 1.4277228917494604, + "learning_rate": 6.952340038363788e-06, + "loss": 0.4729, + "step": 3152 + }, + { + "epoch": 0.39, + "grad_norm": 1.3571806069444707, + "learning_rate": 6.950489700289894e-06, + "loss": 0.4925, + "step": 3153 + }, + { + "epoch": 0.39, + "grad_norm": 1.3716644997004555, + "learning_rate": 6.948639047097211e-06, + "loss": 0.433, + "step": 3154 + }, + { + "epoch": 0.39, + "grad_norm": 0.6052244929492939, + "learning_rate": 6.946788079084727e-06, + "loss": 0.4785, + "step": 3155 + }, + { + "epoch": 0.39, + "grad_norm": 1.4155172324818357, + "learning_rate": 6.944936796551482e-06, + "loss": 0.4885, + "step": 3156 + }, + { + "epoch": 0.39, + "grad_norm": 1.707369994295181, + "learning_rate": 6.943085199796571e-06, + "loss": 0.4817, + "step": 3157 + }, + { + "epoch": 0.39, + "grad_norm": 1.4534014173887673, + "learning_rate": 6.9412332891191315e-06, + "loss": 0.5137, + "step": 3158 + }, + { + "epoch": 0.39, + "grad_norm": 2.103367237771373, + "learning_rate": 6.9393810648183566e-06, + "loss": 0.5128, + "step": 3159 + }, + { + "epoch": 0.39, + "grad_norm": 1.6222332634783083, + "learning_rate": 6.937528527193491e-06, + "loss": 0.469, + "step": 3160 + }, + { + "epoch": 0.39, + "grad_norm": 3.3360922385144436, + "learning_rate": 6.935675676543827e-06, + "loss": 0.5262, + "step": 3161 + }, + { + "epoch": 0.39, + "grad_norm": 0.6389463231703143, + "learning_rate": 6.93382251316871e-06, + "loss": 0.5106, + "step": 3162 + }, + { + "epoch": 0.39, + "grad_norm": 1.4765630400181307, + "learning_rate": 6.931969037367533e-06, + "loss": 0.5152, + "step": 3163 + }, + { + "epoch": 0.39, + "grad_norm": 1.4176110067019618, + "learning_rate": 6.930115249439744e-06, + "loss": 0.4844, + "step": 3164 + }, + { + "epoch": 0.39, + "grad_norm": 1.3708693622727413, + "learning_rate": 6.928261149684837e-06, + "loss": 0.5306, + "step": 3165 + }, + { + "epoch": 0.39, + "grad_norm": 1.4359433735125964, + "learning_rate": 6.926406738402359e-06, + "loss": 0.4372, + "step": 3166 + }, + { + "epoch": 0.39, + "grad_norm": 1.567047899929472, + "learning_rate": 6.924552015891905e-06, + "loss": 0.5303, + "step": 3167 + }, + { + "epoch": 0.39, + "grad_norm": 1.490193781690818, + "learning_rate": 6.9226969824531254e-06, + "loss": 0.5776, + "step": 3168 + }, + { + "epoch": 0.39, + "grad_norm": 2.002612215630255, + "learning_rate": 6.920841638385715e-06, + "loss": 0.4715, + "step": 3169 + }, + { + "epoch": 0.39, + "grad_norm": 1.8963193464549652, + "learning_rate": 6.918985983989418e-06, + "loss": 0.5875, + "step": 3170 + }, + { + "epoch": 0.39, + "grad_norm": 2.213321963170932, + "learning_rate": 6.917130019564034e-06, + "loss": 0.5225, + "step": 3171 + }, + { + "epoch": 0.39, + "grad_norm": 1.4544021040788373, + "learning_rate": 6.915273745409413e-06, + "loss": 0.5195, + "step": 3172 + }, + { + "epoch": 0.39, + "grad_norm": 1.9278156233429231, + "learning_rate": 6.913417161825449e-06, + "loss": 0.5123, + "step": 3173 + }, + { + "epoch": 0.39, + "grad_norm": 1.6524752099528455, + "learning_rate": 6.911560269112092e-06, + "loss": 0.5542, + "step": 3174 + }, + { + "epoch": 0.39, + "grad_norm": 3.541481926746516, + "learning_rate": 6.909703067569337e-06, + "loss": 0.5246, + "step": 3175 + }, + { + "epoch": 0.39, + "grad_norm": 1.4279355948714822, + "learning_rate": 6.907845557497231e-06, + "loss": 0.4483, + "step": 3176 + }, + { + "epoch": 0.39, + "grad_norm": 1.743431193415441, + "learning_rate": 6.905987739195874e-06, + "loss": 0.5881, + "step": 3177 + }, + { + "epoch": 0.39, + "grad_norm": 2.61440211095754, + "learning_rate": 6.9041296129654125e-06, + "loss": 0.5301, + "step": 3178 + }, + { + "epoch": 0.39, + "grad_norm": 1.6702754112326066, + "learning_rate": 6.902271179106041e-06, + "loss": 0.5233, + "step": 3179 + }, + { + "epoch": 0.39, + "grad_norm": 3.920713679990864, + "learning_rate": 6.900412437918005e-06, + "loss": 0.4744, + "step": 3180 + }, + { + "epoch": 0.39, + "grad_norm": 1.3697518409491982, + "learning_rate": 6.898553389701603e-06, + "loss": 0.5108, + "step": 3181 + }, + { + "epoch": 0.39, + "grad_norm": 2.1212805977850646, + "learning_rate": 6.896694034757181e-06, + "loss": 0.5294, + "step": 3182 + }, + { + "epoch": 0.4, + "grad_norm": 1.5411551878710228, + "learning_rate": 6.894834373385132e-06, + "loss": 0.5106, + "step": 3183 + }, + { + "epoch": 0.4, + "grad_norm": 1.4368851055143177, + "learning_rate": 6.892974405885902e-06, + "loss": 0.4735, + "step": 3184 + }, + { + "epoch": 0.4, + "grad_norm": 1.9299020300383583, + "learning_rate": 6.891114132559985e-06, + "loss": 0.5244, + "step": 3185 + }, + { + "epoch": 0.4, + "grad_norm": 1.473232038624168, + "learning_rate": 6.8892535537079245e-06, + "loss": 0.5071, + "step": 3186 + }, + { + "epoch": 0.4, + "grad_norm": 1.588323591332897, + "learning_rate": 6.8873926696303135e-06, + "loss": 0.4919, + "step": 3187 + }, + { + "epoch": 0.4, + "grad_norm": 0.7334748667952768, + "learning_rate": 6.885531480627794e-06, + "loss": 0.5087, + "step": 3188 + }, + { + "epoch": 0.4, + "grad_norm": 1.4829244717106247, + "learning_rate": 6.883669987001058e-06, + "loss": 0.5095, + "step": 3189 + }, + { + "epoch": 0.4, + "grad_norm": 1.6092064733528466, + "learning_rate": 6.8818081890508456e-06, + "loss": 0.5232, + "step": 3190 + }, + { + "epoch": 0.4, + "grad_norm": 1.341271386639204, + "learning_rate": 6.8799460870779465e-06, + "loss": 0.5029, + "step": 3191 + }, + { + "epoch": 0.4, + "grad_norm": 1.4481930839888286, + "learning_rate": 6.878083681383198e-06, + "loss": 0.5057, + "step": 3192 + }, + { + "epoch": 0.4, + "grad_norm": 1.3083656791753513, + "learning_rate": 6.876220972267494e-06, + "loss": 0.4886, + "step": 3193 + }, + { + "epoch": 0.4, + "grad_norm": 11.39296038386585, + "learning_rate": 6.874357960031765e-06, + "loss": 0.5084, + "step": 3194 + }, + { + "epoch": 0.4, + "grad_norm": 1.900143200573506, + "learning_rate": 6.872494644977e-06, + "loss": 0.5262, + "step": 3195 + }, + { + "epoch": 0.4, + "grad_norm": 1.8050499601037346, + "learning_rate": 6.8706310274042345e-06, + "loss": 0.5068, + "step": 3196 + }, + { + "epoch": 0.4, + "grad_norm": 1.352345388329052, + "learning_rate": 6.868767107614552e-06, + "loss": 0.4604, + "step": 3197 + }, + { + "epoch": 0.4, + "grad_norm": 1.6545175377347725, + "learning_rate": 6.866902885909083e-06, + "loss": 0.5316, + "step": 3198 + }, + { + "epoch": 0.4, + "grad_norm": 1.2856771180257691, + "learning_rate": 6.865038362589012e-06, + "loss": 0.5097, + "step": 3199 + }, + { + "epoch": 0.4, + "grad_norm": 1.6801585779785584, + "learning_rate": 6.863173537955566e-06, + "loss": 0.5037, + "step": 3200 + }, + { + "epoch": 0.4, + "grad_norm": 1.6786852023536945, + "learning_rate": 6.861308412310026e-06, + "loss": 0.5134, + "step": 3201 + }, + { + "epoch": 0.4, + "grad_norm": 1.3999297059160014, + "learning_rate": 6.85944298595372e-06, + "loss": 0.5327, + "step": 3202 + }, + { + "epoch": 0.4, + "grad_norm": 1.3839457585180088, + "learning_rate": 6.857577259188022e-06, + "loss": 0.547, + "step": 3203 + }, + { + "epoch": 0.4, + "grad_norm": 1.4739330800383263, + "learning_rate": 6.855711232314358e-06, + "loss": 0.5638, + "step": 3204 + }, + { + "epoch": 0.4, + "grad_norm": 1.569513748632115, + "learning_rate": 6.853844905634202e-06, + "loss": 0.4931, + "step": 3205 + }, + { + "epoch": 0.4, + "grad_norm": 1.8421076393489355, + "learning_rate": 6.851978279449073e-06, + "loss": 0.5661, + "step": 3206 + }, + { + "epoch": 0.4, + "grad_norm": 1.956359431715477, + "learning_rate": 6.850111354060543e-06, + "loss": 0.4918, + "step": 3207 + }, + { + "epoch": 0.4, + "grad_norm": 1.577412127294876, + "learning_rate": 6.848244129770228e-06, + "loss": 0.5145, + "step": 3208 + }, + { + "epoch": 0.4, + "grad_norm": 1.4958117513380969, + "learning_rate": 6.8463766068797964e-06, + "loss": 0.5455, + "step": 3209 + }, + { + "epoch": 0.4, + "grad_norm": 1.4911133031146588, + "learning_rate": 6.844508785690964e-06, + "loss": 0.489, + "step": 3210 + }, + { + "epoch": 0.4, + "grad_norm": 1.4088742719955414, + "learning_rate": 6.842640666505491e-06, + "loss": 0.5215, + "step": 3211 + }, + { + "epoch": 0.4, + "grad_norm": 1.6629950094869201, + "learning_rate": 6.840772249625189e-06, + "loss": 0.5135, + "step": 3212 + }, + { + "epoch": 0.4, + "grad_norm": 1.2384801027680652, + "learning_rate": 6.838903535351921e-06, + "loss": 0.5416, + "step": 3213 + }, + { + "epoch": 0.4, + "grad_norm": 1.460819899140803, + "learning_rate": 6.837034523987589e-06, + "loss": 0.5163, + "step": 3214 + }, + { + "epoch": 0.4, + "grad_norm": 1.4184315724520054, + "learning_rate": 6.835165215834151e-06, + "loss": 0.4849, + "step": 3215 + }, + { + "epoch": 0.4, + "grad_norm": 1.3412792084883807, + "learning_rate": 6.83329561119361e-06, + "loss": 0.5011, + "step": 3216 + }, + { + "epoch": 0.4, + "grad_norm": 1.877516849937572, + "learning_rate": 6.831425710368016e-06, + "loss": 0.5177, + "step": 3217 + }, + { + "epoch": 0.4, + "grad_norm": 1.4397412895230428, + "learning_rate": 6.829555513659468e-06, + "loss": 0.4972, + "step": 3218 + }, + { + "epoch": 0.4, + "grad_norm": 1.6384697735109914, + "learning_rate": 6.827685021370115e-06, + "loss": 0.5218, + "step": 3219 + }, + { + "epoch": 0.4, + "grad_norm": 1.8141569051259778, + "learning_rate": 6.825814233802151e-06, + "loss": 0.4911, + "step": 3220 + }, + { + "epoch": 0.4, + "grad_norm": 1.306899734027489, + "learning_rate": 6.8239431512578135e-06, + "loss": 0.5356, + "step": 3221 + }, + { + "epoch": 0.4, + "grad_norm": 1.3572460136657636, + "learning_rate": 6.822071774039399e-06, + "loss": 0.5072, + "step": 3222 + }, + { + "epoch": 0.4, + "grad_norm": 1.5945829296918865, + "learning_rate": 6.820200102449243e-06, + "loss": 0.5119, + "step": 3223 + }, + { + "epoch": 0.4, + "grad_norm": 1.2301820912603123, + "learning_rate": 6.818328136789727e-06, + "loss": 0.4695, + "step": 3224 + }, + { + "epoch": 0.4, + "grad_norm": 1.379457106022792, + "learning_rate": 6.816455877363286e-06, + "loss": 0.4784, + "step": 3225 + }, + { + "epoch": 0.4, + "grad_norm": 1.3225665325357554, + "learning_rate": 6.814583324472401e-06, + "loss": 0.5207, + "step": 3226 + }, + { + "epoch": 0.4, + "grad_norm": 1.2700246527823775, + "learning_rate": 6.8127104784196e-06, + "loss": 0.461, + "step": 3227 + }, + { + "epoch": 0.4, + "grad_norm": 1.8472757730911826, + "learning_rate": 6.810837339507454e-06, + "loss": 0.5445, + "step": 3228 + }, + { + "epoch": 0.4, + "grad_norm": 0.6707847338140217, + "learning_rate": 6.808963908038589e-06, + "loss": 0.4855, + "step": 3229 + }, + { + "epoch": 0.4, + "grad_norm": 2.691319968280161, + "learning_rate": 6.807090184315671e-06, + "loss": 0.5171, + "step": 3230 + }, + { + "epoch": 0.4, + "grad_norm": 1.2302438266834328, + "learning_rate": 6.80521616864142e-06, + "loss": 0.4727, + "step": 3231 + }, + { + "epoch": 0.4, + "grad_norm": 1.3207802784267022, + "learning_rate": 6.803341861318598e-06, + "loss": 0.5208, + "step": 3232 + }, + { + "epoch": 0.4, + "grad_norm": 1.4476132703215128, + "learning_rate": 6.801467262650015e-06, + "loss": 0.476, + "step": 3233 + }, + { + "epoch": 0.4, + "grad_norm": 1.6336738979678858, + "learning_rate": 6.799592372938529e-06, + "loss": 0.5643, + "step": 3234 + }, + { + "epoch": 0.4, + "grad_norm": 1.835620022589171, + "learning_rate": 6.797717192487046e-06, + "loss": 0.5108, + "step": 3235 + }, + { + "epoch": 0.4, + "grad_norm": 1.5619037774003635, + "learning_rate": 6.79584172159852e-06, + "loss": 0.4783, + "step": 3236 + }, + { + "epoch": 0.4, + "grad_norm": 1.7819998664226975, + "learning_rate": 6.793965960575944e-06, + "loss": 0.5288, + "step": 3237 + }, + { + "epoch": 0.4, + "grad_norm": 1.4359135475085967, + "learning_rate": 6.79208990972237e-06, + "loss": 0.5227, + "step": 3238 + }, + { + "epoch": 0.4, + "grad_norm": 1.6807354645349077, + "learning_rate": 6.790213569340887e-06, + "loss": 0.5521, + "step": 3239 + }, + { + "epoch": 0.4, + "grad_norm": 1.3640185458322238, + "learning_rate": 6.788336939734634e-06, + "loss": 0.5437, + "step": 3240 + }, + { + "epoch": 0.4, + "grad_norm": 1.5519889606097421, + "learning_rate": 6.7864600212068e-06, + "loss": 0.4967, + "step": 3241 + }, + { + "epoch": 0.4, + "grad_norm": 1.3749868972314432, + "learning_rate": 6.784582814060615e-06, + "loss": 0.4629, + "step": 3242 + }, + { + "epoch": 0.4, + "grad_norm": 1.7267913886036812, + "learning_rate": 6.78270531859936e-06, + "loss": 0.5539, + "step": 3243 + }, + { + "epoch": 0.4, + "grad_norm": 1.609565352487597, + "learning_rate": 6.7808275351263595e-06, + "loss": 0.4702, + "step": 3244 + }, + { + "epoch": 0.4, + "grad_norm": 1.4380316093444492, + "learning_rate": 6.778949463944985e-06, + "loss": 0.4981, + "step": 3245 + }, + { + "epoch": 0.4, + "grad_norm": 1.515498940838077, + "learning_rate": 6.777071105358659e-06, + "loss": 0.5276, + "step": 3246 + }, + { + "epoch": 0.4, + "grad_norm": 2.2043397648266816, + "learning_rate": 6.775192459670844e-06, + "loss": 0.5164, + "step": 3247 + }, + { + "epoch": 0.4, + "grad_norm": 1.364429422355196, + "learning_rate": 6.773313527185053e-06, + "loss": 0.5203, + "step": 3248 + }, + { + "epoch": 0.4, + "grad_norm": 1.2666796513747796, + "learning_rate": 6.771434308204844e-06, + "loss": 0.48, + "step": 3249 + }, + { + "epoch": 0.4, + "grad_norm": 1.6534994415436592, + "learning_rate": 6.769554803033821e-06, + "loss": 0.4969, + "step": 3250 + }, + { + "epoch": 0.4, + "grad_norm": 1.2553049497037567, + "learning_rate": 6.767675011975634e-06, + "loss": 0.531, + "step": 3251 + }, + { + "epoch": 0.4, + "grad_norm": 1.7674560829884376, + "learning_rate": 6.765794935333981e-06, + "loss": 0.5026, + "step": 3252 + }, + { + "epoch": 0.4, + "grad_norm": 1.829940354533642, + "learning_rate": 6.763914573412604e-06, + "loss": 0.5008, + "step": 3253 + }, + { + "epoch": 0.4, + "grad_norm": 10.78692529089743, + "learning_rate": 6.762033926515293e-06, + "loss": 0.4988, + "step": 3254 + }, + { + "epoch": 0.4, + "grad_norm": 1.6657136686058656, + "learning_rate": 6.760152994945882e-06, + "loss": 0.4766, + "step": 3255 + }, + { + "epoch": 0.4, + "grad_norm": 1.3284019099181725, + "learning_rate": 6.758271779008254e-06, + "loss": 0.5209, + "step": 3256 + }, + { + "epoch": 0.4, + "grad_norm": 1.4389937674757496, + "learning_rate": 6.756390279006333e-06, + "loss": 0.5446, + "step": 3257 + }, + { + "epoch": 0.4, + "grad_norm": 1.4769530562233115, + "learning_rate": 6.754508495244096e-06, + "loss": 0.4907, + "step": 3258 + }, + { + "epoch": 0.4, + "grad_norm": 1.562785102305474, + "learning_rate": 6.752626428025557e-06, + "loss": 0.4999, + "step": 3259 + }, + { + "epoch": 0.4, + "grad_norm": 1.5520415303055872, + "learning_rate": 6.750744077654783e-06, + "loss": 0.547, + "step": 3260 + }, + { + "epoch": 0.4, + "grad_norm": 1.2761812793650475, + "learning_rate": 6.748861444435885e-06, + "loss": 0.4909, + "step": 3261 + }, + { + "epoch": 0.4, + "grad_norm": 1.5786024125723916, + "learning_rate": 6.746978528673016e-06, + "loss": 0.5128, + "step": 3262 + }, + { + "epoch": 0.4, + "grad_norm": 1.5004031067813843, + "learning_rate": 6.7450953306703815e-06, + "loss": 0.4872, + "step": 3263 + }, + { + "epoch": 0.41, + "grad_norm": 1.7776122389127431, + "learning_rate": 6.743211850732227e-06, + "loss": 0.5313, + "step": 3264 + }, + { + "epoch": 0.41, + "grad_norm": 1.5788272163994252, + "learning_rate": 6.7413280891628445e-06, + "loss": 0.5826, + "step": 3265 + }, + { + "epoch": 0.41, + "grad_norm": 1.4900859967676385, + "learning_rate": 6.739444046266572e-06, + "loss": 0.5607, + "step": 3266 + }, + { + "epoch": 0.41, + "grad_norm": 1.5222895795374038, + "learning_rate": 6.7375597223477975e-06, + "loss": 0.4946, + "step": 3267 + }, + { + "epoch": 0.41, + "grad_norm": 1.3541859908879346, + "learning_rate": 6.7356751177109435e-06, + "loss": 0.5121, + "step": 3268 + }, + { + "epoch": 0.41, + "grad_norm": 1.3777186829538355, + "learning_rate": 6.73379023266049e-06, + "loss": 0.56, + "step": 3269 + }, + { + "epoch": 0.41, + "grad_norm": 1.4314582012171, + "learning_rate": 6.731905067500952e-06, + "loss": 0.5078, + "step": 3270 + }, + { + "epoch": 0.41, + "grad_norm": 1.639823130130427, + "learning_rate": 6.730019622536899e-06, + "loss": 0.5102, + "step": 3271 + }, + { + "epoch": 0.41, + "grad_norm": 1.4987241384585417, + "learning_rate": 6.72813389807294e-06, + "loss": 0.5499, + "step": 3272 + }, + { + "epoch": 0.41, + "grad_norm": 0.6573727062661225, + "learning_rate": 6.726247894413728e-06, + "loss": 0.5103, + "step": 3273 + }, + { + "epoch": 0.41, + "grad_norm": 3.261235589777226, + "learning_rate": 6.724361611863964e-06, + "loss": 0.4597, + "step": 3274 + }, + { + "epoch": 0.41, + "grad_norm": 2.3445659210242793, + "learning_rate": 6.722475050728396e-06, + "loss": 0.55, + "step": 3275 + }, + { + "epoch": 0.41, + "grad_norm": 1.5693155050154757, + "learning_rate": 6.720588211311815e-06, + "loss": 0.5569, + "step": 3276 + }, + { + "epoch": 0.41, + "grad_norm": 0.6456907454871615, + "learning_rate": 6.7187010939190555e-06, + "loss": 0.5133, + "step": 3277 + }, + { + "epoch": 0.41, + "grad_norm": 1.6245573463874465, + "learning_rate": 6.7168136988549935e-06, + "loss": 0.4931, + "step": 3278 + }, + { + "epoch": 0.41, + "grad_norm": 1.8694239394978942, + "learning_rate": 6.714926026424561e-06, + "loss": 0.5084, + "step": 3279 + }, + { + "epoch": 0.41, + "grad_norm": 1.6244660101509918, + "learning_rate": 6.713038076932725e-06, + "loss": 0.5219, + "step": 3280 + }, + { + "epoch": 0.41, + "grad_norm": 1.5859703131172727, + "learning_rate": 6.711149850684499e-06, + "loss": 0.511, + "step": 3281 + }, + { + "epoch": 0.41, + "grad_norm": 1.6905258567661081, + "learning_rate": 6.709261347984946e-06, + "loss": 0.4687, + "step": 3282 + }, + { + "epoch": 0.41, + "grad_norm": 1.3208341183094228, + "learning_rate": 6.707372569139167e-06, + "loss": 0.4472, + "step": 3283 + }, + { + "epoch": 0.41, + "grad_norm": 1.7078454269081773, + "learning_rate": 6.705483514452314e-06, + "loss": 0.518, + "step": 3284 + }, + { + "epoch": 0.41, + "grad_norm": 2.776538177003102, + "learning_rate": 6.703594184229576e-06, + "loss": 0.5311, + "step": 3285 + }, + { + "epoch": 0.41, + "grad_norm": 1.429757999962165, + "learning_rate": 6.701704578776196e-06, + "loss": 0.4716, + "step": 3286 + }, + { + "epoch": 0.41, + "grad_norm": 1.2293398881643591, + "learning_rate": 6.699814698397454e-06, + "loss": 0.4841, + "step": 3287 + }, + { + "epoch": 0.41, + "grad_norm": 1.295501405232231, + "learning_rate": 6.697924543398675e-06, + "loss": 0.4988, + "step": 3288 + }, + { + "epoch": 0.41, + "grad_norm": 1.4495497732771263, + "learning_rate": 6.696034114085233e-06, + "loss": 0.5466, + "step": 3289 + }, + { + "epoch": 0.41, + "grad_norm": 1.684673687979301, + "learning_rate": 6.694143410762543e-06, + "loss": 0.5172, + "step": 3290 + }, + { + "epoch": 0.41, + "grad_norm": 1.4588467248806118, + "learning_rate": 6.692252433736063e-06, + "loss": 0.486, + "step": 3291 + }, + { + "epoch": 0.41, + "grad_norm": 1.454877955929744, + "learning_rate": 6.690361183311299e-06, + "loss": 0.5379, + "step": 3292 + }, + { + "epoch": 0.41, + "grad_norm": 1.4608802875863345, + "learning_rate": 6.688469659793799e-06, + "loss": 0.5089, + "step": 3293 + }, + { + "epoch": 0.41, + "grad_norm": 1.2498258415718937, + "learning_rate": 6.686577863489154e-06, + "loss": 0.5039, + "step": 3294 + }, + { + "epoch": 0.41, + "grad_norm": 1.4267227972809884, + "learning_rate": 6.684685794703003e-06, + "loss": 0.5738, + "step": 3295 + }, + { + "epoch": 0.41, + "grad_norm": 2.4259808443137425, + "learning_rate": 6.682793453741022e-06, + "loss": 0.5103, + "step": 3296 + }, + { + "epoch": 0.41, + "grad_norm": 1.506083836138939, + "learning_rate": 6.6809008409089396e-06, + "loss": 0.565, + "step": 3297 + }, + { + "epoch": 0.41, + "grad_norm": 1.5930280829423225, + "learning_rate": 6.679007956512522e-06, + "loss": 0.5352, + "step": 3298 + }, + { + "epoch": 0.41, + "grad_norm": 1.4478525083131581, + "learning_rate": 6.6771148008575805e-06, + "loss": 0.4984, + "step": 3299 + }, + { + "epoch": 0.41, + "grad_norm": 1.562424146635213, + "learning_rate": 6.675221374249972e-06, + "loss": 0.5472, + "step": 3300 + }, + { + "epoch": 0.41, + "grad_norm": 1.4015293961648203, + "learning_rate": 6.673327676995598e-06, + "loss": 0.492, + "step": 3301 + }, + { + "epoch": 0.41, + "grad_norm": 1.5461992961632107, + "learning_rate": 6.671433709400399e-06, + "loss": 0.5391, + "step": 3302 + }, + { + "epoch": 0.41, + "grad_norm": 2.0638014465505985, + "learning_rate": 6.6695394717703654e-06, + "loss": 0.5239, + "step": 3303 + }, + { + "epoch": 0.41, + "grad_norm": 1.4292878955601038, + "learning_rate": 6.6676449644115246e-06, + "loss": 0.5406, + "step": 3304 + }, + { + "epoch": 0.41, + "grad_norm": 1.8768562923306327, + "learning_rate": 6.665750187629953e-06, + "loss": 0.5434, + "step": 3305 + }, + { + "epoch": 0.41, + "grad_norm": 1.62994328696491, + "learning_rate": 6.6638551417317675e-06, + "loss": 0.547, + "step": 3306 + }, + { + "epoch": 0.41, + "grad_norm": 1.5736630956087028, + "learning_rate": 6.66195982702313e-06, + "loss": 0.5195, + "step": 3307 + }, + { + "epoch": 0.41, + "grad_norm": 1.4803308672254103, + "learning_rate": 6.6600642438102454e-06, + "loss": 0.527, + "step": 3308 + }, + { + "epoch": 0.41, + "grad_norm": 1.4342128458844394, + "learning_rate": 6.658168392399362e-06, + "loss": 0.4953, + "step": 3309 + }, + { + "epoch": 0.41, + "grad_norm": 1.412313512785876, + "learning_rate": 6.656272273096771e-06, + "loss": 0.5431, + "step": 3310 + }, + { + "epoch": 0.41, + "grad_norm": 1.6077253118836052, + "learning_rate": 6.654375886208806e-06, + "loss": 0.5168, + "step": 3311 + }, + { + "epoch": 0.41, + "grad_norm": 1.7743537280984463, + "learning_rate": 6.652479232041849e-06, + "loss": 0.4673, + "step": 3312 + }, + { + "epoch": 0.41, + "grad_norm": 1.3382429044577637, + "learning_rate": 6.650582310902316e-06, + "loss": 0.5023, + "step": 3313 + }, + { + "epoch": 0.41, + "grad_norm": 1.336504964303114, + "learning_rate": 6.648685123096674e-06, + "loss": 0.4931, + "step": 3314 + }, + { + "epoch": 0.41, + "grad_norm": 1.468379926370309, + "learning_rate": 6.646787668931429e-06, + "loss": 0.5196, + "step": 3315 + }, + { + "epoch": 0.41, + "grad_norm": 1.4020497395917302, + "learning_rate": 6.644889948713135e-06, + "loss": 0.4779, + "step": 3316 + }, + { + "epoch": 0.41, + "grad_norm": 1.416022357706084, + "learning_rate": 6.642991962748381e-06, + "loss": 0.5308, + "step": 3317 + }, + { + "epoch": 0.41, + "grad_norm": 1.4729653949046748, + "learning_rate": 6.641093711343806e-06, + "loss": 0.4772, + "step": 3318 + }, + { + "epoch": 0.41, + "grad_norm": 3.288982109275439, + "learning_rate": 6.639195194806087e-06, + "loss": 0.5182, + "step": 3319 + }, + { + "epoch": 0.41, + "grad_norm": 1.2569075774740808, + "learning_rate": 6.637296413441949e-06, + "loss": 0.4978, + "step": 3320 + }, + { + "epoch": 0.41, + "grad_norm": 1.8210600894090125, + "learning_rate": 6.635397367558156e-06, + "loss": 0.5356, + "step": 3321 + }, + { + "epoch": 0.41, + "grad_norm": 1.4408394135319371, + "learning_rate": 6.633498057461514e-06, + "loss": 0.4808, + "step": 3322 + }, + { + "epoch": 0.41, + "grad_norm": 1.3604884909269044, + "learning_rate": 6.631598483458874e-06, + "loss": 0.4612, + "step": 3323 + }, + { + "epoch": 0.41, + "grad_norm": 1.6252149096540622, + "learning_rate": 6.629698645857129e-06, + "loss": 0.5598, + "step": 3324 + }, + { + "epoch": 0.41, + "grad_norm": 1.1562231472595759, + "learning_rate": 6.6277985449632155e-06, + "loss": 0.4699, + "step": 3325 + }, + { + "epoch": 0.41, + "grad_norm": 1.9768297893227216, + "learning_rate": 6.625898181084111e-06, + "loss": 0.5483, + "step": 3326 + }, + { + "epoch": 0.41, + "grad_norm": 1.5422501208070072, + "learning_rate": 6.623997554526833e-06, + "loss": 0.4897, + "step": 3327 + }, + { + "epoch": 0.41, + "grad_norm": 3.361915446466029, + "learning_rate": 6.62209666559845e-06, + "loss": 0.5565, + "step": 3328 + }, + { + "epoch": 0.41, + "grad_norm": 1.4386159169197326, + "learning_rate": 6.620195514606063e-06, + "loss": 0.5225, + "step": 3329 + }, + { + "epoch": 0.41, + "grad_norm": 1.4925820107560432, + "learning_rate": 6.6182941018568224e-06, + "loss": 0.5247, + "step": 3330 + }, + { + "epoch": 0.41, + "grad_norm": 1.414937074964594, + "learning_rate": 6.616392427657918e-06, + "loss": 0.5447, + "step": 3331 + }, + { + "epoch": 0.41, + "grad_norm": 1.419668460668176, + "learning_rate": 6.614490492316578e-06, + "loss": 0.5357, + "step": 3332 + }, + { + "epoch": 0.41, + "grad_norm": 1.781817186987209, + "learning_rate": 6.612588296140082e-06, + "loss": 0.5121, + "step": 3333 + }, + { + "epoch": 0.41, + "grad_norm": 3.48145844537759, + "learning_rate": 6.610685839435744e-06, + "loss": 0.5703, + "step": 3334 + }, + { + "epoch": 0.41, + "grad_norm": 1.3152571504495545, + "learning_rate": 6.608783122510922e-06, + "loss": 0.515, + "step": 3335 + }, + { + "epoch": 0.41, + "grad_norm": 1.410781395131533, + "learning_rate": 6.606880145673018e-06, + "loss": 0.5163, + "step": 3336 + }, + { + "epoch": 0.41, + "grad_norm": 1.5398437941017569, + "learning_rate": 6.604976909229475e-06, + "loss": 0.5226, + "step": 3337 + }, + { + "epoch": 0.41, + "grad_norm": 1.8230133487038807, + "learning_rate": 6.603073413487777e-06, + "loss": 0.5104, + "step": 3338 + }, + { + "epoch": 0.41, + "grad_norm": 1.319234468105146, + "learning_rate": 6.6011696587554495e-06, + "loss": 0.5276, + "step": 3339 + }, + { + "epoch": 0.41, + "grad_norm": 1.8885195195601567, + "learning_rate": 6.599265645340063e-06, + "loss": 0.5423, + "step": 3340 + }, + { + "epoch": 0.41, + "grad_norm": 7.341219235782096, + "learning_rate": 6.597361373549226e-06, + "loss": 0.4838, + "step": 3341 + }, + { + "epoch": 0.41, + "grad_norm": 1.9725768133216375, + "learning_rate": 6.595456843690591e-06, + "loss": 0.5362, + "step": 3342 + }, + { + "epoch": 0.41, + "grad_norm": 1.7764264535883638, + "learning_rate": 6.59355205607185e-06, + "loss": 0.4959, + "step": 3343 + }, + { + "epoch": 0.41, + "grad_norm": 0.657915233528162, + "learning_rate": 6.59164701100074e-06, + "loss": 0.5096, + "step": 3344 + }, + { + "epoch": 0.42, + "grad_norm": 1.365255514796697, + "learning_rate": 6.589741708785038e-06, + "loss": 0.4849, + "step": 3345 + }, + { + "epoch": 0.42, + "grad_norm": 1.385547860326123, + "learning_rate": 6.587836149732562e-06, + "loss": 0.5173, + "step": 3346 + }, + { + "epoch": 0.42, + "grad_norm": 1.6822192304164865, + "learning_rate": 6.585930334151172e-06, + "loss": 0.4999, + "step": 3347 + }, + { + "epoch": 0.42, + "grad_norm": 1.401093568895059, + "learning_rate": 6.584024262348767e-06, + "loss": 0.5137, + "step": 3348 + }, + { + "epoch": 0.42, + "grad_norm": 1.5128507982800243, + "learning_rate": 6.582117934633293e-06, + "loss": 0.5186, + "step": 3349 + }, + { + "epoch": 0.42, + "grad_norm": 1.3496302725839122, + "learning_rate": 6.580211351312733e-06, + "loss": 0.5126, + "step": 3350 + }, + { + "epoch": 0.42, + "grad_norm": 1.5700861571843783, + "learning_rate": 6.5783045126951104e-06, + "loss": 0.5038, + "step": 3351 + }, + { + "epoch": 0.42, + "grad_norm": 1.332489990796265, + "learning_rate": 6.576397419088494e-06, + "loss": 0.5392, + "step": 3352 + }, + { + "epoch": 0.42, + "grad_norm": 3.3029563614078903, + "learning_rate": 6.574490070800991e-06, + "loss": 0.508, + "step": 3353 + }, + { + "epoch": 0.42, + "grad_norm": 3.434447606032874, + "learning_rate": 6.5725824681407505e-06, + "loss": 0.5029, + "step": 3354 + }, + { + "epoch": 0.42, + "grad_norm": 1.4351387854652031, + "learning_rate": 6.570674611415962e-06, + "loss": 0.5237, + "step": 3355 + }, + { + "epoch": 0.42, + "grad_norm": 0.7332743393085002, + "learning_rate": 6.5687665009348564e-06, + "loss": 0.5281, + "step": 3356 + }, + { + "epoch": 0.42, + "grad_norm": 1.3254583143473562, + "learning_rate": 6.566858137005707e-06, + "loss": 0.5138, + "step": 3357 + }, + { + "epoch": 0.42, + "grad_norm": 1.6459229735641585, + "learning_rate": 6.564949519936825e-06, + "loss": 0.5463, + "step": 3358 + }, + { + "epoch": 0.42, + "grad_norm": 1.798749927990085, + "learning_rate": 6.563040650036566e-06, + "loss": 0.5203, + "step": 3359 + }, + { + "epoch": 0.42, + "grad_norm": 1.8899694122457764, + "learning_rate": 6.5611315276133224e-06, + "loss": 0.488, + "step": 3360 + }, + { + "epoch": 0.42, + "grad_norm": 1.2222412116863846, + "learning_rate": 6.559222152975533e-06, + "loss": 0.4919, + "step": 3361 + }, + { + "epoch": 0.42, + "grad_norm": 1.6101329963661517, + "learning_rate": 6.5573125264316715e-06, + "loss": 0.5285, + "step": 3362 + }, + { + "epoch": 0.42, + "grad_norm": 1.3733533959449282, + "learning_rate": 6.555402648290256e-06, + "loss": 0.4836, + "step": 3363 + }, + { + "epoch": 0.42, + "grad_norm": 1.3305592653903977, + "learning_rate": 6.553492518859843e-06, + "loss": 0.4739, + "step": 3364 + }, + { + "epoch": 0.42, + "grad_norm": 1.5819335222378026, + "learning_rate": 6.551582138449033e-06, + "loss": 0.5243, + "step": 3365 + }, + { + "epoch": 0.42, + "grad_norm": 1.68120273290888, + "learning_rate": 6.549671507366464e-06, + "loss": 0.5258, + "step": 3366 + }, + { + "epoch": 0.42, + "grad_norm": 1.6916725965006718, + "learning_rate": 6.547760625920814e-06, + "loss": 0.5715, + "step": 3367 + }, + { + "epoch": 0.42, + "grad_norm": 1.5603014124207755, + "learning_rate": 6.545849494420802e-06, + "loss": 0.5392, + "step": 3368 + }, + { + "epoch": 0.42, + "grad_norm": 2.110977366460912, + "learning_rate": 6.543938113175191e-06, + "loss": 0.5224, + "step": 3369 + }, + { + "epoch": 0.42, + "grad_norm": 1.3684723813682567, + "learning_rate": 6.5420264824927796e-06, + "loss": 0.4657, + "step": 3370 + }, + { + "epoch": 0.42, + "grad_norm": 1.4282419027256408, + "learning_rate": 6.540114602682409e-06, + "loss": 0.527, + "step": 3371 + }, + { + "epoch": 0.42, + "grad_norm": 1.4251475318763287, + "learning_rate": 6.5382024740529605e-06, + "loss": 0.4968, + "step": 3372 + }, + { + "epoch": 0.42, + "grad_norm": 1.3420448115949397, + "learning_rate": 6.536290096913354e-06, + "loss": 0.5144, + "step": 3373 + }, + { + "epoch": 0.42, + "grad_norm": 1.295778067774979, + "learning_rate": 6.5343774715725525e-06, + "loss": 0.4714, + "step": 3374 + }, + { + "epoch": 0.42, + "grad_norm": 1.3489738242574505, + "learning_rate": 6.532464598339557e-06, + "loss": 0.5089, + "step": 3375 + }, + { + "epoch": 0.42, + "grad_norm": 1.7563087403545115, + "learning_rate": 6.530551477523411e-06, + "loss": 0.535, + "step": 3376 + }, + { + "epoch": 0.42, + "grad_norm": 1.4205894446580483, + "learning_rate": 6.528638109433191e-06, + "loss": 0.5118, + "step": 3377 + }, + { + "epoch": 0.42, + "grad_norm": 1.5426548496710313, + "learning_rate": 6.526724494378023e-06, + "loss": 0.4733, + "step": 3378 + }, + { + "epoch": 0.42, + "grad_norm": 1.4987232712876852, + "learning_rate": 6.524810632667066e-06, + "loss": 0.4909, + "step": 3379 + }, + { + "epoch": 0.42, + "grad_norm": 1.790263490504134, + "learning_rate": 6.522896524609521e-06, + "loss": 0.51, + "step": 3380 + }, + { + "epoch": 0.42, + "grad_norm": 1.8194563671942883, + "learning_rate": 6.520982170514631e-06, + "loss": 0.4846, + "step": 3381 + }, + { + "epoch": 0.42, + "grad_norm": 1.2429209424748784, + "learning_rate": 6.519067570691675e-06, + "loss": 0.4647, + "step": 3382 + }, + { + "epoch": 0.42, + "grad_norm": 1.342898948876468, + "learning_rate": 6.517152725449976e-06, + "loss": 0.5154, + "step": 3383 + }, + { + "epoch": 0.42, + "grad_norm": 1.3705724281814533, + "learning_rate": 6.515237635098891e-06, + "loss": 0.5706, + "step": 3384 + }, + { + "epoch": 0.42, + "grad_norm": 1.5642337602627236, + "learning_rate": 6.513322299947822e-06, + "loss": 0.5745, + "step": 3385 + }, + { + "epoch": 0.42, + "grad_norm": 1.4367398118514414, + "learning_rate": 6.511406720306206e-06, + "loss": 0.527, + "step": 3386 + }, + { + "epoch": 0.42, + "grad_norm": 1.2768734239191017, + "learning_rate": 6.509490896483524e-06, + "loss": 0.4542, + "step": 3387 + }, + { + "epoch": 0.42, + "grad_norm": 1.4721952171363482, + "learning_rate": 6.507574828789292e-06, + "loss": 0.4273, + "step": 3388 + }, + { + "epoch": 0.42, + "grad_norm": 1.8717370896892942, + "learning_rate": 6.50565851753307e-06, + "loss": 0.5026, + "step": 3389 + }, + { + "epoch": 0.42, + "grad_norm": 1.4527504054398452, + "learning_rate": 6.503741963024454e-06, + "loss": 0.5421, + "step": 3390 + }, + { + "epoch": 0.42, + "grad_norm": 6.178249397843699, + "learning_rate": 6.5018251655730795e-06, + "loss": 0.5536, + "step": 3391 + }, + { + "epoch": 0.42, + "grad_norm": 1.2495856813403767, + "learning_rate": 6.499908125488623e-06, + "loss": 0.5353, + "step": 3392 + }, + { + "epoch": 0.42, + "grad_norm": 2.096512316760758, + "learning_rate": 6.4979908430807995e-06, + "loss": 0.5541, + "step": 3393 + }, + { + "epoch": 0.42, + "grad_norm": 1.408989150160295, + "learning_rate": 6.4960733186593604e-06, + "loss": 0.5071, + "step": 3394 + }, + { + "epoch": 0.42, + "grad_norm": 1.6106819885568957, + "learning_rate": 6.494155552534102e-06, + "loss": 0.4794, + "step": 3395 + }, + { + "epoch": 0.42, + "grad_norm": 1.8647778316019838, + "learning_rate": 6.492237545014853e-06, + "loss": 0.5133, + "step": 3396 + }, + { + "epoch": 0.42, + "grad_norm": 1.4265607978785886, + "learning_rate": 6.490319296411487e-06, + "loss": 0.5425, + "step": 3397 + }, + { + "epoch": 0.42, + "grad_norm": 1.239661238066202, + "learning_rate": 6.488400807033913e-06, + "loss": 0.468, + "step": 3398 + }, + { + "epoch": 0.42, + "grad_norm": 1.5414791454347683, + "learning_rate": 6.486482077192081e-06, + "loss": 0.5226, + "step": 3399 + }, + { + "epoch": 0.42, + "grad_norm": 1.610310909755451, + "learning_rate": 6.484563107195977e-06, + "loss": 0.5389, + "step": 3400 + }, + { + "epoch": 0.42, + "grad_norm": 1.2558984430146658, + "learning_rate": 6.482643897355628e-06, + "loss": 0.5248, + "step": 3401 + }, + { + "epoch": 0.42, + "grad_norm": 0.6212989357041312, + "learning_rate": 6.4807244479810995e-06, + "loss": 0.5324, + "step": 3402 + }, + { + "epoch": 0.42, + "grad_norm": 2.1973737707139973, + "learning_rate": 6.478804759382495e-06, + "loss": 0.4898, + "step": 3403 + }, + { + "epoch": 0.42, + "grad_norm": 1.3683074620200943, + "learning_rate": 6.476884831869958e-06, + "loss": 0.4739, + "step": 3404 + }, + { + "epoch": 0.42, + "grad_norm": 1.4320958020667869, + "learning_rate": 6.4749646657536695e-06, + "loss": 0.4828, + "step": 3405 + }, + { + "epoch": 0.42, + "grad_norm": 4.118003030237448, + "learning_rate": 6.473044261343848e-06, + "loss": 0.5994, + "step": 3406 + }, + { + "epoch": 0.42, + "grad_norm": 1.5132374141721459, + "learning_rate": 6.4711236189507535e-06, + "loss": 0.5209, + "step": 3407 + }, + { + "epoch": 0.42, + "grad_norm": 2.9169913340489075, + "learning_rate": 6.469202738884681e-06, + "loss": 0.4791, + "step": 3408 + }, + { + "epoch": 0.42, + "grad_norm": 1.5541877617357418, + "learning_rate": 6.467281621455967e-06, + "loss": 0.5716, + "step": 3409 + }, + { + "epoch": 0.42, + "grad_norm": 1.3159596193980914, + "learning_rate": 6.465360266974984e-06, + "loss": 0.5009, + "step": 3410 + }, + { + "epoch": 0.42, + "grad_norm": 2.5372368754643895, + "learning_rate": 6.463438675752145e-06, + "loss": 0.5137, + "step": 3411 + }, + { + "epoch": 0.42, + "grad_norm": 1.8482767490950647, + "learning_rate": 6.461516848097899e-06, + "loss": 0.5033, + "step": 3412 + }, + { + "epoch": 0.42, + "grad_norm": 1.3063262071097612, + "learning_rate": 6.459594784322734e-06, + "loss": 0.5729, + "step": 3413 + }, + { + "epoch": 0.42, + "grad_norm": 1.4749651475169578, + "learning_rate": 6.457672484737177e-06, + "loss": 0.5183, + "step": 3414 + }, + { + "epoch": 0.42, + "grad_norm": 1.371908921907626, + "learning_rate": 6.455749949651791e-06, + "loss": 0.5636, + "step": 3415 + }, + { + "epoch": 0.42, + "grad_norm": 1.6074630823942877, + "learning_rate": 6.45382717937718e-06, + "loss": 0.5369, + "step": 3416 + }, + { + "epoch": 0.42, + "grad_norm": 1.4537404369016569, + "learning_rate": 6.4519041742239844e-06, + "loss": 0.4837, + "step": 3417 + }, + { + "epoch": 0.42, + "grad_norm": 2.470038846548514, + "learning_rate": 6.449980934502881e-06, + "loss": 0.5673, + "step": 3418 + }, + { + "epoch": 0.42, + "grad_norm": 1.411105238613095, + "learning_rate": 6.448057460524588e-06, + "loss": 0.5568, + "step": 3419 + }, + { + "epoch": 0.42, + "grad_norm": 1.672003949426101, + "learning_rate": 6.44613375259986e-06, + "loss": 0.4877, + "step": 3420 + }, + { + "epoch": 0.42, + "grad_norm": 2.7186118198150053, + "learning_rate": 6.444209811039488e-06, + "loss": 0.5366, + "step": 3421 + }, + { + "epoch": 0.42, + "grad_norm": 0.6707392924215835, + "learning_rate": 6.4422856361543e-06, + "loss": 0.549, + "step": 3422 + }, + { + "epoch": 0.42, + "grad_norm": 1.479892625489135, + "learning_rate": 6.440361228255165e-06, + "loss": 0.5229, + "step": 3423 + }, + { + "epoch": 0.42, + "grad_norm": 1.6666535300936052, + "learning_rate": 6.438436587652989e-06, + "loss": 0.4785, + "step": 3424 + }, + { + "epoch": 0.43, + "grad_norm": 1.6953266703542345, + "learning_rate": 6.436511714658713e-06, + "loss": 0.5738, + "step": 3425 + }, + { + "epoch": 0.43, + "grad_norm": 1.6245088591621328, + "learning_rate": 6.434586609583316e-06, + "loss": 0.4554, + "step": 3426 + }, + { + "epoch": 0.43, + "grad_norm": 1.5209450118584893, + "learning_rate": 6.43266127273782e-06, + "loss": 0.4935, + "step": 3427 + }, + { + "epoch": 0.43, + "grad_norm": 1.3368355711738351, + "learning_rate": 6.430735704433278e-06, + "loss": 0.5396, + "step": 3428 + }, + { + "epoch": 0.43, + "grad_norm": 9.899006884541238, + "learning_rate": 6.428809904980782e-06, + "loss": 0.4872, + "step": 3429 + }, + { + "epoch": 0.43, + "grad_norm": 1.5256633510601914, + "learning_rate": 6.426883874691461e-06, + "loss": 0.4879, + "step": 3430 + }, + { + "epoch": 0.43, + "grad_norm": 1.7959945788719616, + "learning_rate": 6.424957613876483e-06, + "loss": 0.4867, + "step": 3431 + }, + { + "epoch": 0.43, + "grad_norm": 2.035467778275846, + "learning_rate": 6.4230311228470535e-06, + "loss": 0.5156, + "step": 3432 + }, + { + "epoch": 0.43, + "grad_norm": 1.6409432369288743, + "learning_rate": 6.421104401914413e-06, + "loss": 0.4887, + "step": 3433 + }, + { + "epoch": 0.43, + "grad_norm": 1.316556873161966, + "learning_rate": 6.41917745138984e-06, + "loss": 0.4915, + "step": 3434 + }, + { + "epoch": 0.43, + "grad_norm": 1.4845717808016468, + "learning_rate": 6.417250271584649e-06, + "loss": 0.482, + "step": 3435 + }, + { + "epoch": 0.43, + "grad_norm": 1.3906828490733454, + "learning_rate": 6.415322862810198e-06, + "loss": 0.506, + "step": 3436 + }, + { + "epoch": 0.43, + "grad_norm": 1.4789087457973646, + "learning_rate": 6.413395225377872e-06, + "loss": 0.5158, + "step": 3437 + }, + { + "epoch": 0.43, + "grad_norm": 1.4001127768438166, + "learning_rate": 6.4114673595991e-06, + "loss": 0.5042, + "step": 3438 + }, + { + "epoch": 0.43, + "grad_norm": 0.6352396847654843, + "learning_rate": 6.409539265785344e-06, + "loss": 0.4848, + "step": 3439 + }, + { + "epoch": 0.43, + "grad_norm": 0.6458804502407448, + "learning_rate": 6.407610944248106e-06, + "loss": 0.4742, + "step": 3440 + }, + { + "epoch": 0.43, + "grad_norm": 0.6893783720661152, + "learning_rate": 6.405682395298922e-06, + "loss": 0.5247, + "step": 3441 + }, + { + "epoch": 0.43, + "grad_norm": 1.3762211902781725, + "learning_rate": 6.4037536192493665e-06, + "loss": 0.5144, + "step": 3442 + }, + { + "epoch": 0.43, + "grad_norm": 1.8440755133843545, + "learning_rate": 6.401824616411052e-06, + "loss": 0.5605, + "step": 3443 + }, + { + "epoch": 0.43, + "grad_norm": 1.502330810149086, + "learning_rate": 6.399895387095624e-06, + "loss": 0.5145, + "step": 3444 + }, + { + "epoch": 0.43, + "grad_norm": 1.310208911902816, + "learning_rate": 6.397965931614767e-06, + "loss": 0.5021, + "step": 3445 + }, + { + "epoch": 0.43, + "grad_norm": 1.4878071758351954, + "learning_rate": 6.396036250280202e-06, + "loss": 0.5111, + "step": 3446 + }, + { + "epoch": 0.43, + "grad_norm": 1.3499148482849967, + "learning_rate": 6.394106343403685e-06, + "loss": 0.4843, + "step": 3447 + }, + { + "epoch": 0.43, + "grad_norm": 1.6672272401015171, + "learning_rate": 6.392176211297011e-06, + "loss": 0.4892, + "step": 3448 + }, + { + "epoch": 0.43, + "grad_norm": 2.1963725876544813, + "learning_rate": 6.3902458542720085e-06, + "loss": 0.5252, + "step": 3449 + }, + { + "epoch": 0.43, + "grad_norm": 1.8342464367231288, + "learning_rate": 6.388315272640544e-06, + "loss": 0.4821, + "step": 3450 + }, + { + "epoch": 0.43, + "grad_norm": 1.4035352327594603, + "learning_rate": 6.386384466714518e-06, + "loss": 0.4818, + "step": 3451 + }, + { + "epoch": 0.43, + "grad_norm": 1.342198015827931, + "learning_rate": 6.384453436805873e-06, + "loss": 0.4663, + "step": 3452 + }, + { + "epoch": 0.43, + "grad_norm": 1.704755890875126, + "learning_rate": 6.382522183226583e-06, + "loss": 0.5597, + "step": 3453 + }, + { + "epoch": 0.43, + "grad_norm": 1.6219798429195122, + "learning_rate": 6.3805907062886564e-06, + "loss": 0.476, + "step": 3454 + }, + { + "epoch": 0.43, + "grad_norm": 1.7164666803756226, + "learning_rate": 6.3786590063041434e-06, + "loss": 0.5154, + "step": 3455 + }, + { + "epoch": 0.43, + "grad_norm": 1.3471849166172916, + "learning_rate": 6.376727083585126e-06, + "loss": 0.5078, + "step": 3456 + }, + { + "epoch": 0.43, + "grad_norm": 1.6146640259005276, + "learning_rate": 6.374794938443722e-06, + "loss": 0.5718, + "step": 3457 + }, + { + "epoch": 0.43, + "grad_norm": 1.6818007005864406, + "learning_rate": 6.372862571192088e-06, + "loss": 0.5091, + "step": 3458 + }, + { + "epoch": 0.43, + "grad_norm": 1.366665236720085, + "learning_rate": 6.370929982142413e-06, + "loss": 0.4912, + "step": 3459 + }, + { + "epoch": 0.43, + "grad_norm": 2.0380886234440747, + "learning_rate": 6.368997171606927e-06, + "loss": 0.5472, + "step": 3460 + }, + { + "epoch": 0.43, + "grad_norm": 1.6612318777400736, + "learning_rate": 6.367064139897891e-06, + "loss": 0.4455, + "step": 3461 + }, + { + "epoch": 0.43, + "grad_norm": 1.378939506184432, + "learning_rate": 6.365130887327603e-06, + "loss": 0.5667, + "step": 3462 + }, + { + "epoch": 0.43, + "grad_norm": 1.3967605747616427, + "learning_rate": 6.363197414208396e-06, + "loss": 0.4876, + "step": 3463 + }, + { + "epoch": 0.43, + "grad_norm": 1.8320960845441028, + "learning_rate": 6.361263720852642e-06, + "loss": 0.4931, + "step": 3464 + }, + { + "epoch": 0.43, + "grad_norm": 0.6450256223949539, + "learning_rate": 6.359329807572746e-06, + "loss": 0.4911, + "step": 3465 + }, + { + "epoch": 0.43, + "grad_norm": 1.4463229538998164, + "learning_rate": 6.357395674681146e-06, + "loss": 0.455, + "step": 3466 + }, + { + "epoch": 0.43, + "grad_norm": 1.3119855418983852, + "learning_rate": 6.355461322490319e-06, + "loss": 0.5339, + "step": 3467 + }, + { + "epoch": 0.43, + "grad_norm": 2.0993905938665463, + "learning_rate": 6.35352675131278e-06, + "loss": 0.476, + "step": 3468 + }, + { + "epoch": 0.43, + "grad_norm": 2.0789495941589324, + "learning_rate": 6.3515919614610725e-06, + "loss": 0.5042, + "step": 3469 + }, + { + "epoch": 0.43, + "grad_norm": 1.505001124357574, + "learning_rate": 6.3496569532477796e-06, + "loss": 0.5039, + "step": 3470 + }, + { + "epoch": 0.43, + "grad_norm": 1.3543726155896296, + "learning_rate": 6.347721726985518e-06, + "loss": 0.4937, + "step": 3471 + }, + { + "epoch": 0.43, + "grad_norm": 1.410733061613502, + "learning_rate": 6.345786282986944e-06, + "loss": 0.5036, + "step": 3472 + }, + { + "epoch": 0.43, + "grad_norm": 2.288134741913164, + "learning_rate": 6.343850621564742e-06, + "loss": 0.5071, + "step": 3473 + }, + { + "epoch": 0.43, + "grad_norm": 1.3889345099741446, + "learning_rate": 6.3419147430316375e-06, + "loss": 0.5137, + "step": 3474 + }, + { + "epoch": 0.43, + "grad_norm": 1.408902839713213, + "learning_rate": 6.3399786477003866e-06, + "loss": 0.5138, + "step": 3475 + }, + { + "epoch": 0.43, + "grad_norm": 1.4541724062815307, + "learning_rate": 6.338042335883784e-06, + "loss": 0.4752, + "step": 3476 + }, + { + "epoch": 0.43, + "grad_norm": 3.0315020298988067, + "learning_rate": 6.336105807894658e-06, + "loss": 0.5071, + "step": 3477 + }, + { + "epoch": 0.43, + "grad_norm": 1.479139923284239, + "learning_rate": 6.334169064045871e-06, + "loss": 0.5207, + "step": 3478 + }, + { + "epoch": 0.43, + "grad_norm": 0.6708097406058517, + "learning_rate": 6.332232104650321e-06, + "loss": 0.4893, + "step": 3479 + }, + { + "epoch": 0.43, + "grad_norm": 1.7443151935211285, + "learning_rate": 6.330294930020941e-06, + "loss": 0.5204, + "step": 3480 + }, + { + "epoch": 0.43, + "grad_norm": 1.4373772526834405, + "learning_rate": 6.3283575404706996e-06, + "loss": 0.5344, + "step": 3481 + }, + { + "epoch": 0.43, + "grad_norm": 4.2391986402563235, + "learning_rate": 6.326419936312599e-06, + "loss": 0.5516, + "step": 3482 + }, + { + "epoch": 0.43, + "grad_norm": 1.4624592865974275, + "learning_rate": 6.324482117859676e-06, + "loss": 0.5251, + "step": 3483 + }, + { + "epoch": 0.43, + "grad_norm": 1.525858161534442, + "learning_rate": 6.322544085425001e-06, + "loss": 0.5243, + "step": 3484 + }, + { + "epoch": 0.43, + "grad_norm": 3.6321469993075612, + "learning_rate": 6.320605839321681e-06, + "loss": 0.5121, + "step": 3485 + }, + { + "epoch": 0.43, + "grad_norm": 1.6245468809709254, + "learning_rate": 6.318667379862856e-06, + "loss": 0.5002, + "step": 3486 + }, + { + "epoch": 0.43, + "grad_norm": 1.3950302507184524, + "learning_rate": 6.3167287073617035e-06, + "loss": 0.4825, + "step": 3487 + }, + { + "epoch": 0.43, + "grad_norm": 1.4184155641850023, + "learning_rate": 6.31478982213143e-06, + "loss": 0.5087, + "step": 3488 + }, + { + "epoch": 0.43, + "grad_norm": 1.5052726070482894, + "learning_rate": 6.312850724485282e-06, + "loss": 0.4351, + "step": 3489 + }, + { + "epoch": 0.43, + "grad_norm": 1.2968373517440517, + "learning_rate": 6.310911414736537e-06, + "loss": 0.5013, + "step": 3490 + }, + { + "epoch": 0.43, + "grad_norm": 1.6496514532699211, + "learning_rate": 6.308971893198508e-06, + "loss": 0.5353, + "step": 3491 + }, + { + "epoch": 0.43, + "grad_norm": 1.5839632150330647, + "learning_rate": 6.307032160184541e-06, + "loss": 0.5673, + "step": 3492 + }, + { + "epoch": 0.43, + "grad_norm": 1.3248369734797494, + "learning_rate": 6.305092216008016e-06, + "loss": 0.5356, + "step": 3493 + }, + { + "epoch": 0.43, + "grad_norm": 1.4246486211657718, + "learning_rate": 6.30315206098235e-06, + "loss": 0.5004, + "step": 3494 + }, + { + "epoch": 0.43, + "grad_norm": 1.829505588718067, + "learning_rate": 6.301211695420992e-06, + "loss": 0.4931, + "step": 3495 + }, + { + "epoch": 0.43, + "grad_norm": 1.5619325285771204, + "learning_rate": 6.2992711196374236e-06, + "loss": 0.5172, + "step": 3496 + }, + { + "epoch": 0.43, + "grad_norm": 1.3228374138272259, + "learning_rate": 6.297330333945164e-06, + "loss": 0.4938, + "step": 3497 + }, + { + "epoch": 0.43, + "grad_norm": 1.3977534860361773, + "learning_rate": 6.2953893386577626e-06, + "loss": 0.4958, + "step": 3498 + }, + { + "epoch": 0.43, + "grad_norm": 1.2944479809162501, + "learning_rate": 6.293448134088805e-06, + "loss": 0.5331, + "step": 3499 + }, + { + "epoch": 0.43, + "grad_norm": 1.7182384782561086, + "learning_rate": 6.2915067205519085e-06, + "loss": 0.4769, + "step": 3500 + }, + { + "epoch": 0.43, + "grad_norm": 1.2931322581828373, + "learning_rate": 6.289565098360728e-06, + "loss": 0.4586, + "step": 3501 + }, + { + "epoch": 0.43, + "grad_norm": 2.522939179168897, + "learning_rate": 6.287623267828948e-06, + "loss": 0.4692, + "step": 3502 + }, + { + "epoch": 0.43, + "grad_norm": 2.328118856112479, + "learning_rate": 6.2856812292702884e-06, + "loss": 0.4894, + "step": 3503 + }, + { + "epoch": 0.43, + "grad_norm": 1.9947553241739746, + "learning_rate": 6.283738982998502e-06, + "loss": 0.4862, + "step": 3504 + }, + { + "epoch": 0.43, + "grad_norm": 0.6311486849974827, + "learning_rate": 6.281796529327378e-06, + "loss": 0.5108, + "step": 3505 + }, + { + "epoch": 0.44, + "grad_norm": 0.6625257516752647, + "learning_rate": 6.279853868570736e-06, + "loss": 0.5143, + "step": 3506 + }, + { + "epoch": 0.44, + "grad_norm": 1.9066907850009018, + "learning_rate": 6.27791100104243e-06, + "loss": 0.5267, + "step": 3507 + }, + { + "epoch": 0.44, + "grad_norm": 1.4781973315845938, + "learning_rate": 6.2759679270563446e-06, + "loss": 0.5299, + "step": 3508 + }, + { + "epoch": 0.44, + "grad_norm": 1.308884538474327, + "learning_rate": 6.274024646926405e-06, + "loss": 0.4653, + "step": 3509 + }, + { + "epoch": 0.44, + "grad_norm": 1.278907688874593, + "learning_rate": 6.272081160966564e-06, + "loss": 0.5022, + "step": 3510 + }, + { + "epoch": 0.44, + "grad_norm": 1.6621416069496444, + "learning_rate": 6.2701374694908045e-06, + "loss": 0.4967, + "step": 3511 + }, + { + "epoch": 0.44, + "grad_norm": 2.3440202170943447, + "learning_rate": 6.268193572813151e-06, + "loss": 0.4782, + "step": 3512 + }, + { + "epoch": 0.44, + "grad_norm": 0.7500464171050284, + "learning_rate": 6.266249471247659e-06, + "loss": 0.4943, + "step": 3513 + }, + { + "epoch": 0.44, + "grad_norm": 1.411055795264252, + "learning_rate": 6.264305165108412e-06, + "loss": 0.4958, + "step": 3514 + }, + { + "epoch": 0.44, + "grad_norm": 1.2671764977020974, + "learning_rate": 6.262360654709529e-06, + "loss": 0.4663, + "step": 3515 + }, + { + "epoch": 0.44, + "grad_norm": 1.1726670323695136, + "learning_rate": 6.260415940365165e-06, + "loss": 0.4504, + "step": 3516 + }, + { + "epoch": 0.44, + "grad_norm": 1.5489519189048802, + "learning_rate": 6.258471022389506e-06, + "loss": 0.5345, + "step": 3517 + }, + { + "epoch": 0.44, + "grad_norm": 2.9116402906622896, + "learning_rate": 6.256525901096769e-06, + "loss": 0.4731, + "step": 3518 + }, + { + "epoch": 0.44, + "grad_norm": 1.4590621142943716, + "learning_rate": 6.254580576801208e-06, + "loss": 0.5038, + "step": 3519 + }, + { + "epoch": 0.44, + "grad_norm": 1.4887237061309944, + "learning_rate": 6.252635049817104e-06, + "loss": 0.5682, + "step": 3520 + }, + { + "epoch": 0.44, + "grad_norm": 1.554091148712146, + "learning_rate": 6.250689320458775e-06, + "loss": 0.4634, + "step": 3521 + }, + { + "epoch": 0.44, + "grad_norm": 1.5543738127421725, + "learning_rate": 6.248743389040573e-06, + "loss": 0.4984, + "step": 3522 + }, + { + "epoch": 0.44, + "grad_norm": 1.7629146754869744, + "learning_rate": 6.246797255876876e-06, + "loss": 0.5209, + "step": 3523 + }, + { + "epoch": 0.44, + "grad_norm": 2.9998417318041923, + "learning_rate": 6.244850921282102e-06, + "loss": 0.5541, + "step": 3524 + }, + { + "epoch": 0.44, + "grad_norm": 1.2551611920725345, + "learning_rate": 6.242904385570699e-06, + "loss": 0.5279, + "step": 3525 + }, + { + "epoch": 0.44, + "grad_norm": 1.5336429069292903, + "learning_rate": 6.240957649057145e-06, + "loss": 0.5148, + "step": 3526 + }, + { + "epoch": 0.44, + "grad_norm": 2.044244210210011, + "learning_rate": 6.239010712055955e-06, + "loss": 0.5184, + "step": 3527 + }, + { + "epoch": 0.44, + "grad_norm": 1.4181219458895138, + "learning_rate": 6.2370635748816725e-06, + "loss": 0.4707, + "step": 3528 + }, + { + "epoch": 0.44, + "grad_norm": 1.6032922870244413, + "learning_rate": 6.235116237848872e-06, + "loss": 0.4305, + "step": 3529 + }, + { + "epoch": 0.44, + "grad_norm": 0.7304352877367216, + "learning_rate": 6.233168701272167e-06, + "loss": 0.5232, + "step": 3530 + }, + { + "epoch": 0.44, + "grad_norm": 1.3911196910792476, + "learning_rate": 6.231220965466197e-06, + "loss": 0.5021, + "step": 3531 + }, + { + "epoch": 0.44, + "grad_norm": 1.3165358719004459, + "learning_rate": 6.229273030745638e-06, + "loss": 0.5183, + "step": 3532 + }, + { + "epoch": 0.44, + "grad_norm": 1.7453145753076977, + "learning_rate": 6.227324897425191e-06, + "loss": 0.5097, + "step": 3533 + }, + { + "epoch": 0.44, + "grad_norm": 5.5380079581301915, + "learning_rate": 6.2253765658195986e-06, + "loss": 0.5064, + "step": 3534 + }, + { + "epoch": 0.44, + "grad_norm": 2.0266057416813785, + "learning_rate": 6.223428036243631e-06, + "loss": 0.5195, + "step": 3535 + }, + { + "epoch": 0.44, + "grad_norm": 1.3156969788819382, + "learning_rate": 6.2214793090120896e-06, + "loss": 0.4469, + "step": 3536 + }, + { + "epoch": 0.44, + "grad_norm": 1.5487418691974917, + "learning_rate": 6.219530384439807e-06, + "loss": 0.502, + "step": 3537 + }, + { + "epoch": 0.44, + "grad_norm": 1.4488340230656243, + "learning_rate": 6.217581262841651e-06, + "loss": 0.5008, + "step": 3538 + }, + { + "epoch": 0.44, + "grad_norm": 2.0032013299670752, + "learning_rate": 6.215631944532518e-06, + "loss": 0.5196, + "step": 3539 + }, + { + "epoch": 0.44, + "grad_norm": 1.6068169770942995, + "learning_rate": 6.213682429827338e-06, + "loss": 0.5189, + "step": 3540 + }, + { + "epoch": 0.44, + "grad_norm": 2.1926932157629393, + "learning_rate": 6.21173271904107e-06, + "loss": 0.484, + "step": 3541 + }, + { + "epoch": 0.44, + "grad_norm": 0.6671431070107134, + "learning_rate": 6.209782812488713e-06, + "loss": 0.511, + "step": 3542 + }, + { + "epoch": 0.44, + "grad_norm": 1.6212237617207985, + "learning_rate": 6.207832710485285e-06, + "loss": 0.528, + "step": 3543 + }, + { + "epoch": 0.44, + "grad_norm": 1.6305336046241468, + "learning_rate": 6.2058824133458476e-06, + "loss": 0.478, + "step": 3544 + }, + { + "epoch": 0.44, + "grad_norm": 1.822234471136306, + "learning_rate": 6.203931921385484e-06, + "loss": 0.5406, + "step": 3545 + }, + { + "epoch": 0.44, + "grad_norm": 0.6679266912093794, + "learning_rate": 6.201981234919317e-06, + "loss": 0.4911, + "step": 3546 + }, + { + "epoch": 0.44, + "grad_norm": 1.439282471232996, + "learning_rate": 6.200030354262493e-06, + "loss": 0.5135, + "step": 3547 + }, + { + "epoch": 0.44, + "grad_norm": 1.3884166014461918, + "learning_rate": 6.198079279730198e-06, + "loss": 0.4351, + "step": 3548 + }, + { + "epoch": 0.44, + "grad_norm": 1.4047748956301707, + "learning_rate": 6.196128011637642e-06, + "loss": 0.5472, + "step": 3549 + }, + { + "epoch": 0.44, + "grad_norm": 1.4808855445369755, + "learning_rate": 6.194176550300071e-06, + "loss": 0.4965, + "step": 3550 + }, + { + "epoch": 0.44, + "grad_norm": 2.0614228950515447, + "learning_rate": 6.1922248960327635e-06, + "loss": 0.5206, + "step": 3551 + }, + { + "epoch": 0.44, + "grad_norm": 1.7331701568695543, + "learning_rate": 6.190273049151022e-06, + "loss": 0.5328, + "step": 3552 + }, + { + "epoch": 0.44, + "grad_norm": 1.8941941076976965, + "learning_rate": 6.1883210099701864e-06, + "loss": 0.5372, + "step": 3553 + }, + { + "epoch": 0.44, + "grad_norm": 1.83335774832791, + "learning_rate": 6.186368778805628e-06, + "loss": 0.4962, + "step": 3554 + }, + { + "epoch": 0.44, + "grad_norm": 1.5119225106231424, + "learning_rate": 6.184416355972743e-06, + "loss": 0.5088, + "step": 3555 + }, + { + "epoch": 0.44, + "grad_norm": 1.2784469665914808, + "learning_rate": 6.182463741786965e-06, + "loss": 0.5033, + "step": 3556 + }, + { + "epoch": 0.44, + "grad_norm": 1.416495701529496, + "learning_rate": 6.180510936563754e-06, + "loss": 0.5359, + "step": 3557 + }, + { + "epoch": 0.44, + "grad_norm": 1.4934566071982733, + "learning_rate": 6.178557940618605e-06, + "loss": 0.486, + "step": 3558 + }, + { + "epoch": 0.44, + "grad_norm": 2.258013858046018, + "learning_rate": 6.17660475426704e-06, + "loss": 0.5424, + "step": 3559 + }, + { + "epoch": 0.44, + "grad_norm": 1.4267716289763825, + "learning_rate": 6.174651377824615e-06, + "loss": 0.4984, + "step": 3560 + }, + { + "epoch": 0.44, + "grad_norm": 1.5973749407245927, + "learning_rate": 6.172697811606914e-06, + "loss": 0.4614, + "step": 3561 + }, + { + "epoch": 0.44, + "grad_norm": 1.4895046489791153, + "learning_rate": 6.1707440559295554e-06, + "loss": 0.5066, + "step": 3562 + }, + { + "epoch": 0.44, + "grad_norm": 1.4827775512387729, + "learning_rate": 6.1687901111081826e-06, + "loss": 0.5524, + "step": 3563 + }, + { + "epoch": 0.44, + "grad_norm": 1.715697107372859, + "learning_rate": 6.166835977458473e-06, + "loss": 0.5476, + "step": 3564 + }, + { + "epoch": 0.44, + "grad_norm": 1.4663246145687245, + "learning_rate": 6.1648816552961355e-06, + "loss": 0.5031, + "step": 3565 + }, + { + "epoch": 0.44, + "grad_norm": 1.5067132761428363, + "learning_rate": 6.162927144936906e-06, + "loss": 0.465, + "step": 3566 + }, + { + "epoch": 0.44, + "grad_norm": 1.7021011233090093, + "learning_rate": 6.1609724466965535e-06, + "loss": 0.5044, + "step": 3567 + }, + { + "epoch": 0.44, + "grad_norm": 1.3699920347424173, + "learning_rate": 6.159017560890879e-06, + "loss": 0.4359, + "step": 3568 + }, + { + "epoch": 0.44, + "grad_norm": 1.40510218511984, + "learning_rate": 6.15706248783571e-06, + "loss": 0.4913, + "step": 3569 + }, + { + "epoch": 0.44, + "grad_norm": 0.6211969399829059, + "learning_rate": 6.155107227846904e-06, + "loss": 0.4779, + "step": 3570 + }, + { + "epoch": 0.44, + "grad_norm": 1.2953802588952739, + "learning_rate": 6.153151781240352e-06, + "loss": 0.4684, + "step": 3571 + }, + { + "epoch": 0.44, + "grad_norm": 1.3483986543487585, + "learning_rate": 6.151196148331975e-06, + "loss": 0.4908, + "step": 3572 + }, + { + "epoch": 0.44, + "grad_norm": 1.5473054117537224, + "learning_rate": 6.1492403294377225e-06, + "loss": 0.4678, + "step": 3573 + }, + { + "epoch": 0.44, + "grad_norm": 1.2787212957409102, + "learning_rate": 6.14728432487357e-06, + "loss": 0.517, + "step": 3574 + }, + { + "epoch": 0.44, + "grad_norm": 1.3290235549835527, + "learning_rate": 6.145328134955533e-06, + "loss": 0.504, + "step": 3575 + }, + { + "epoch": 0.44, + "grad_norm": 1.4494007619059526, + "learning_rate": 6.143371759999648e-06, + "loss": 0.5333, + "step": 3576 + }, + { + "epoch": 0.44, + "grad_norm": 1.5308204499267646, + "learning_rate": 6.1414152003219854e-06, + "loss": 0.5299, + "step": 3577 + }, + { + "epoch": 0.44, + "grad_norm": 1.481114838188324, + "learning_rate": 6.1394584562386425e-06, + "loss": 0.5139, + "step": 3578 + }, + { + "epoch": 0.44, + "grad_norm": 2.219964866020278, + "learning_rate": 6.137501528065752e-06, + "loss": 0.553, + "step": 3579 + }, + { + "epoch": 0.44, + "grad_norm": 1.3323902271568275, + "learning_rate": 6.1355444161194724e-06, + "loss": 0.4944, + "step": 3580 + }, + { + "epoch": 0.44, + "grad_norm": 2.275208807005506, + "learning_rate": 6.13358712071599e-06, + "loss": 0.5012, + "step": 3581 + }, + { + "epoch": 0.44, + "grad_norm": 0.7100296280459837, + "learning_rate": 6.131629642171526e-06, + "loss": 0.4876, + "step": 3582 + }, + { + "epoch": 0.44, + "grad_norm": 1.3075143321738223, + "learning_rate": 6.129671980802325e-06, + "loss": 0.4824, + "step": 3583 + }, + { + "epoch": 0.44, + "grad_norm": 1.8781822785139812, + "learning_rate": 6.127714136924667e-06, + "loss": 0.4955, + "step": 3584 + }, + { + "epoch": 0.44, + "grad_norm": 1.8573339867952048, + "learning_rate": 6.125756110854859e-06, + "loss": 0.5335, + "step": 3585 + }, + { + "epoch": 0.45, + "grad_norm": 1.3234182294746977, + "learning_rate": 6.123797902909236e-06, + "loss": 0.5012, + "step": 3586 + }, + { + "epoch": 0.45, + "grad_norm": 1.5037302731280628, + "learning_rate": 6.121839513404163e-06, + "loss": 0.5497, + "step": 3587 + }, + { + "epoch": 0.45, + "grad_norm": 1.2704736827188403, + "learning_rate": 6.119880942656038e-06, + "loss": 0.5592, + "step": 3588 + }, + { + "epoch": 0.45, + "grad_norm": 1.751140929108546, + "learning_rate": 6.117922190981282e-06, + "loss": 0.513, + "step": 3589 + }, + { + "epoch": 0.45, + "grad_norm": 1.552318521249153, + "learning_rate": 6.1159632586963524e-06, + "loss": 0.5248, + "step": 3590 + }, + { + "epoch": 0.45, + "grad_norm": 1.4367054279952804, + "learning_rate": 6.114004146117729e-06, + "loss": 0.5626, + "step": 3591 + }, + { + "epoch": 0.45, + "grad_norm": 1.2107766082355556, + "learning_rate": 6.112044853561925e-06, + "loss": 0.4458, + "step": 3592 + }, + { + "epoch": 0.45, + "grad_norm": 1.4829866654121693, + "learning_rate": 6.11008538134548e-06, + "loss": 0.5067, + "step": 3593 + }, + { + "epoch": 0.45, + "grad_norm": 1.5846189280659801, + "learning_rate": 6.108125729784964e-06, + "loss": 0.5742, + "step": 3594 + }, + { + "epoch": 0.45, + "grad_norm": 1.5089263131720982, + "learning_rate": 6.106165899196978e-06, + "loss": 0.5726, + "step": 3595 + }, + { + "epoch": 0.45, + "grad_norm": 1.4150877058823383, + "learning_rate": 6.1042058898981484e-06, + "loss": 0.5297, + "step": 3596 + }, + { + "epoch": 0.45, + "grad_norm": 1.4547063878965534, + "learning_rate": 6.1022457022051325e-06, + "loss": 0.5352, + "step": 3597 + }, + { + "epoch": 0.45, + "grad_norm": 1.3973769343405766, + "learning_rate": 6.100285336434616e-06, + "loss": 0.5057, + "step": 3598 + }, + { + "epoch": 0.45, + "grad_norm": 1.403887011962654, + "learning_rate": 6.098324792903313e-06, + "loss": 0.4837, + "step": 3599 + }, + { + "epoch": 0.45, + "grad_norm": 1.2463330225672213, + "learning_rate": 6.096364071927966e-06, + "loss": 0.4729, + "step": 3600 + }, + { + "epoch": 0.45, + "grad_norm": 2.050882947847235, + "learning_rate": 6.094403173825348e-06, + "loss": 0.4663, + "step": 3601 + }, + { + "epoch": 0.45, + "grad_norm": 1.611584775857751, + "learning_rate": 6.0924420989122565e-06, + "loss": 0.5078, + "step": 3602 + }, + { + "epoch": 0.45, + "grad_norm": 1.5437249180480355, + "learning_rate": 6.0904808475055246e-06, + "loss": 0.5367, + "step": 3603 + }, + { + "epoch": 0.45, + "grad_norm": 2.138459892149861, + "learning_rate": 6.088519419922008e-06, + "loss": 0.4932, + "step": 3604 + }, + { + "epoch": 0.45, + "grad_norm": 0.6913666855531088, + "learning_rate": 6.086557816478591e-06, + "loss": 0.5108, + "step": 3605 + }, + { + "epoch": 0.45, + "grad_norm": 1.5783740456870101, + "learning_rate": 6.08459603749219e-06, + "loss": 0.5018, + "step": 3606 + }, + { + "epoch": 0.45, + "grad_norm": 1.2359527048762666, + "learning_rate": 6.082634083279746e-06, + "loss": 0.4702, + "step": 3607 + }, + { + "epoch": 0.45, + "grad_norm": 1.4654275615242967, + "learning_rate": 6.080671954158232e-06, + "loss": 0.5437, + "step": 3608 + }, + { + "epoch": 0.45, + "grad_norm": 2.272816051175789, + "learning_rate": 6.0787096504446465e-06, + "loss": 0.4884, + "step": 3609 + }, + { + "epoch": 0.45, + "grad_norm": 1.5154753911364525, + "learning_rate": 6.076747172456016e-06, + "loss": 0.5131, + "step": 3610 + }, + { + "epoch": 0.45, + "grad_norm": 1.491605697034516, + "learning_rate": 6.074784520509395e-06, + "loss": 0.5344, + "step": 3611 + }, + { + "epoch": 0.45, + "grad_norm": 0.6704317411138851, + "learning_rate": 6.07282169492187e-06, + "loss": 0.517, + "step": 3612 + }, + { + "epoch": 0.45, + "grad_norm": 2.2193079275691656, + "learning_rate": 6.070858696010552e-06, + "loss": 0.5255, + "step": 3613 + }, + { + "epoch": 0.45, + "grad_norm": 1.442941458805938, + "learning_rate": 6.068895524092581e-06, + "loss": 0.5055, + "step": 3614 + }, + { + "epoch": 0.45, + "grad_norm": 1.4500045510329642, + "learning_rate": 6.066932179485122e-06, + "loss": 0.544, + "step": 3615 + }, + { + "epoch": 0.45, + "grad_norm": 1.243790000007731, + "learning_rate": 6.064968662505374e-06, + "loss": 0.501, + "step": 3616 + }, + { + "epoch": 0.45, + "grad_norm": 1.3412828250391715, + "learning_rate": 6.06300497347056e-06, + "loss": 0.5653, + "step": 3617 + }, + { + "epoch": 0.45, + "grad_norm": 1.2171866043917068, + "learning_rate": 6.06104111269793e-06, + "loss": 0.3964, + "step": 3618 + }, + { + "epoch": 0.45, + "grad_norm": 1.3164204240625614, + "learning_rate": 6.059077080504761e-06, + "loss": 0.4738, + "step": 3619 + }, + { + "epoch": 0.45, + "grad_norm": 1.424702659928036, + "learning_rate": 6.057112877208364e-06, + "loss": 0.5737, + "step": 3620 + }, + { + "epoch": 0.45, + "grad_norm": 1.4070668142198544, + "learning_rate": 6.0551485031260725e-06, + "loss": 0.4949, + "step": 3621 + }, + { + "epoch": 0.45, + "grad_norm": 1.3408711147549046, + "learning_rate": 6.053183958575246e-06, + "loss": 0.4988, + "step": 3622 + }, + { + "epoch": 0.45, + "grad_norm": 3.8979002790542197, + "learning_rate": 6.051219243873275e-06, + "loss": 0.4505, + "step": 3623 + }, + { + "epoch": 0.45, + "grad_norm": 1.3990509882706048, + "learning_rate": 6.049254359337578e-06, + "loss": 0.5486, + "step": 3624 + }, + { + "epoch": 0.45, + "grad_norm": 1.3622160719714407, + "learning_rate": 6.0472893052855985e-06, + "loss": 0.4876, + "step": 3625 + }, + { + "epoch": 0.45, + "grad_norm": 1.3103141054320948, + "learning_rate": 6.0453240820348066e-06, + "loss": 0.4933, + "step": 3626 + }, + { + "epoch": 0.45, + "grad_norm": 1.5448586835372746, + "learning_rate": 6.043358689902704e-06, + "loss": 0.5565, + "step": 3627 + }, + { + "epoch": 0.45, + "grad_norm": 1.740198154341534, + "learning_rate": 6.041393129206816e-06, + "loss": 0.5272, + "step": 3628 + }, + { + "epoch": 0.45, + "grad_norm": 1.4284041583081468, + "learning_rate": 6.0394274002646965e-06, + "loss": 0.4735, + "step": 3629 + }, + { + "epoch": 0.45, + "grad_norm": 1.502198490514338, + "learning_rate": 6.037461503393925e-06, + "loss": 0.5462, + "step": 3630 + }, + { + "epoch": 0.45, + "grad_norm": 1.2648890363953325, + "learning_rate": 6.03549543891211e-06, + "loss": 0.5275, + "step": 3631 + }, + { + "epoch": 0.45, + "grad_norm": 1.4609835118631087, + "learning_rate": 6.033529207136888e-06, + "loss": 0.5122, + "step": 3632 + }, + { + "epoch": 0.45, + "grad_norm": 1.5404894439925987, + "learning_rate": 6.03156280838592e-06, + "loss": 0.5405, + "step": 3633 + }, + { + "epoch": 0.45, + "grad_norm": 0.681033615151205, + "learning_rate": 6.029596242976895e-06, + "loss": 0.5145, + "step": 3634 + }, + { + "epoch": 0.45, + "grad_norm": 1.2585368863622328, + "learning_rate": 6.02762951122753e-06, + "loss": 0.5044, + "step": 3635 + }, + { + "epoch": 0.45, + "grad_norm": 1.6716525797681374, + "learning_rate": 6.025662613455566e-06, + "loss": 0.5353, + "step": 3636 + }, + { + "epoch": 0.45, + "grad_norm": 2.690724018178695, + "learning_rate": 6.023695549978774e-06, + "loss": 0.5276, + "step": 3637 + }, + { + "epoch": 0.45, + "grad_norm": 0.6397336826076158, + "learning_rate": 6.021728321114949e-06, + "loss": 0.4837, + "step": 3638 + }, + { + "epoch": 0.45, + "grad_norm": 1.333082952090204, + "learning_rate": 6.0197609271819145e-06, + "loss": 0.5052, + "step": 3639 + }, + { + "epoch": 0.45, + "grad_norm": 1.638944018469146, + "learning_rate": 6.017793368497523e-06, + "loss": 0.548, + "step": 3640 + }, + { + "epoch": 0.45, + "grad_norm": 1.3767723047356812, + "learning_rate": 6.0158256453796485e-06, + "loss": 0.492, + "step": 3641 + }, + { + "epoch": 0.45, + "grad_norm": 1.9370306650773428, + "learning_rate": 6.013857758146193e-06, + "loss": 0.538, + "step": 3642 + }, + { + "epoch": 0.45, + "grad_norm": 1.3766171654282706, + "learning_rate": 6.01188970711509e-06, + "loss": 0.47, + "step": 3643 + }, + { + "epoch": 0.45, + "grad_norm": 1.555917116487695, + "learning_rate": 6.0099214926042905e-06, + "loss": 0.5003, + "step": 3644 + }, + { + "epoch": 0.45, + "grad_norm": 3.9513733336041112, + "learning_rate": 6.00795311493178e-06, + "loss": 0.5294, + "step": 3645 + }, + { + "epoch": 0.45, + "grad_norm": 1.4435089295034635, + "learning_rate": 6.005984574415568e-06, + "loss": 0.5174, + "step": 3646 + }, + { + "epoch": 0.45, + "grad_norm": 1.3801760935816803, + "learning_rate": 6.0040158713736865e-06, + "loss": 0.5404, + "step": 3647 + }, + { + "epoch": 0.45, + "grad_norm": 1.5837482763871387, + "learning_rate": 6.002047006124198e-06, + "loss": 0.5395, + "step": 3648 + }, + { + "epoch": 0.45, + "grad_norm": 1.7192576032754558, + "learning_rate": 6.000077978985191e-06, + "loss": 0.4661, + "step": 3649 + }, + { + "epoch": 0.45, + "grad_norm": 1.3025945963336203, + "learning_rate": 5.9981087902747785e-06, + "loss": 0.5132, + "step": 3650 + }, + { + "epoch": 0.45, + "grad_norm": 1.477148566837164, + "learning_rate": 5.996139440311099e-06, + "loss": 0.5258, + "step": 3651 + }, + { + "epoch": 0.45, + "grad_norm": 1.2595738614799532, + "learning_rate": 5.994169929412323e-06, + "loss": 0.4461, + "step": 3652 + }, + { + "epoch": 0.45, + "grad_norm": 1.2587053512253195, + "learning_rate": 5.9922002578966364e-06, + "loss": 0.5135, + "step": 3653 + }, + { + "epoch": 0.45, + "grad_norm": 1.4674379833778877, + "learning_rate": 5.990230426082261e-06, + "loss": 0.552, + "step": 3654 + }, + { + "epoch": 0.45, + "grad_norm": 1.5625476515475012, + "learning_rate": 5.988260434287438e-06, + "loss": 0.5393, + "step": 3655 + }, + { + "epoch": 0.45, + "grad_norm": 1.2436803191403425, + "learning_rate": 5.986290282830438e-06, + "loss": 0.5203, + "step": 3656 + }, + { + "epoch": 0.45, + "grad_norm": 1.4690916234746247, + "learning_rate": 5.984319972029558e-06, + "loss": 0.5051, + "step": 3657 + }, + { + "epoch": 0.45, + "grad_norm": 1.4158183487114746, + "learning_rate": 5.982349502203116e-06, + "loss": 0.537, + "step": 3658 + }, + { + "epoch": 0.45, + "grad_norm": 0.6914721822117508, + "learning_rate": 5.98037887366946e-06, + "loss": 0.5228, + "step": 3659 + }, + { + "epoch": 0.45, + "grad_norm": 1.3885647159507815, + "learning_rate": 5.978408086746962e-06, + "loss": 0.4751, + "step": 3660 + }, + { + "epoch": 0.45, + "grad_norm": 3.0146078697076937, + "learning_rate": 5.976437141754021e-06, + "loss": 0.4981, + "step": 3661 + }, + { + "epoch": 0.45, + "grad_norm": 0.613300758549818, + "learning_rate": 5.974466039009063e-06, + "loss": 0.4567, + "step": 3662 + }, + { + "epoch": 0.45, + "grad_norm": 2.2454938124351362, + "learning_rate": 5.972494778830531e-06, + "loss": 0.4523, + "step": 3663 + }, + { + "epoch": 0.45, + "grad_norm": 1.391952792088452, + "learning_rate": 5.970523361536903e-06, + "loss": 0.4386, + "step": 3664 + }, + { + "epoch": 0.45, + "grad_norm": 1.470221657599158, + "learning_rate": 5.968551787446678e-06, + "loss": 0.5198, + "step": 3665 + }, + { + "epoch": 0.45, + "grad_norm": 1.5239604080432068, + "learning_rate": 5.966580056878382e-06, + "loss": 0.5177, + "step": 3666 + }, + { + "epoch": 0.46, + "grad_norm": 1.3258176562134478, + "learning_rate": 5.9646081701505654e-06, + "loss": 0.4909, + "step": 3667 + }, + { + "epoch": 0.46, + "grad_norm": 1.7142755784727002, + "learning_rate": 5.962636127581802e-06, + "loss": 0.4909, + "step": 3668 + }, + { + "epoch": 0.46, + "grad_norm": 1.484667936557781, + "learning_rate": 5.960663929490696e-06, + "loss": 0.5158, + "step": 3669 + }, + { + "epoch": 0.46, + "grad_norm": 2.0552574632383447, + "learning_rate": 5.95869157619587e-06, + "loss": 0.4578, + "step": 3670 + }, + { + "epoch": 0.46, + "grad_norm": 1.592695773175072, + "learning_rate": 5.956719068015977e-06, + "loss": 0.5533, + "step": 3671 + }, + { + "epoch": 0.46, + "grad_norm": 2.2148265940903142, + "learning_rate": 5.954746405269692e-06, + "loss": 0.493, + "step": 3672 + }, + { + "epoch": 0.46, + "grad_norm": 1.479906503336225, + "learning_rate": 5.952773588275718e-06, + "loss": 0.5627, + "step": 3673 + }, + { + "epoch": 0.46, + "grad_norm": 1.4811317212396697, + "learning_rate": 5.950800617352778e-06, + "loss": 0.5211, + "step": 3674 + }, + { + "epoch": 0.46, + "grad_norm": 1.4535882719954414, + "learning_rate": 5.9488274928196245e-06, + "loss": 0.4846, + "step": 3675 + }, + { + "epoch": 0.46, + "grad_norm": 1.8321506613837863, + "learning_rate": 5.946854214995032e-06, + "loss": 0.4663, + "step": 3676 + }, + { + "epoch": 0.46, + "grad_norm": 1.6928171693718244, + "learning_rate": 5.9448807841978044e-06, + "loss": 0.5501, + "step": 3677 + }, + { + "epoch": 0.46, + "grad_norm": 1.3754667256719582, + "learning_rate": 5.942907200746762e-06, + "loss": 0.4814, + "step": 3678 + }, + { + "epoch": 0.46, + "grad_norm": 1.6116269924251991, + "learning_rate": 5.940933464960759e-06, + "loss": 0.5573, + "step": 3679 + }, + { + "epoch": 0.46, + "grad_norm": 1.3579375286635698, + "learning_rate": 5.9389595771586675e-06, + "loss": 0.4808, + "step": 3680 + }, + { + "epoch": 0.46, + "grad_norm": 2.3432840262996693, + "learning_rate": 5.936985537659385e-06, + "loss": 0.49, + "step": 3681 + }, + { + "epoch": 0.46, + "grad_norm": 1.9449076892590997, + "learning_rate": 5.93501134678184e-06, + "loss": 0.4858, + "step": 3682 + }, + { + "epoch": 0.46, + "grad_norm": 1.712462555171365, + "learning_rate": 5.9330370048449736e-06, + "loss": 0.5005, + "step": 3683 + }, + { + "epoch": 0.46, + "grad_norm": 1.4118247291797974, + "learning_rate": 5.9310625121677636e-06, + "loss": 0.5104, + "step": 3684 + }, + { + "epoch": 0.46, + "grad_norm": 1.324781398126362, + "learning_rate": 5.929087869069202e-06, + "loss": 0.4799, + "step": 3685 + }, + { + "epoch": 0.46, + "grad_norm": 1.2617013365478964, + "learning_rate": 5.927113075868315e-06, + "loss": 0.5406, + "step": 3686 + }, + { + "epoch": 0.46, + "grad_norm": 1.5523795998609873, + "learning_rate": 5.925138132884145e-06, + "loss": 0.5128, + "step": 3687 + }, + { + "epoch": 0.46, + "grad_norm": 1.4891053513027392, + "learning_rate": 5.923163040435762e-06, + "loss": 0.5259, + "step": 3688 + }, + { + "epoch": 0.46, + "grad_norm": 1.459970627139374, + "learning_rate": 5.921187798842258e-06, + "loss": 0.5568, + "step": 3689 + }, + { + "epoch": 0.46, + "grad_norm": 1.4923420303638777, + "learning_rate": 5.9192124084227534e-06, + "loss": 0.538, + "step": 3690 + }, + { + "epoch": 0.46, + "grad_norm": 1.4986361120402343, + "learning_rate": 5.917236869496388e-06, + "loss": 0.5481, + "step": 3691 + }, + { + "epoch": 0.46, + "grad_norm": 2.3799188836426888, + "learning_rate": 5.915261182382328e-06, + "loss": 0.5267, + "step": 3692 + }, + { + "epoch": 0.46, + "grad_norm": 1.4245940989096637, + "learning_rate": 5.913285347399762e-06, + "loss": 0.5152, + "step": 3693 + }, + { + "epoch": 0.46, + "grad_norm": 1.752625008923162, + "learning_rate": 5.9113093648679065e-06, + "loss": 0.5141, + "step": 3694 + }, + { + "epoch": 0.46, + "grad_norm": 1.6379456852940069, + "learning_rate": 5.909333235105996e-06, + "loss": 0.5097, + "step": 3695 + }, + { + "epoch": 0.46, + "grad_norm": 1.4584429688128662, + "learning_rate": 5.907356958433292e-06, + "loss": 0.4911, + "step": 3696 + }, + { + "epoch": 0.46, + "grad_norm": 1.441952787689109, + "learning_rate": 5.905380535169082e-06, + "loss": 0.5118, + "step": 3697 + }, + { + "epoch": 0.46, + "grad_norm": 1.5176438222118591, + "learning_rate": 5.903403965632672e-06, + "loss": 0.4929, + "step": 3698 + }, + { + "epoch": 0.46, + "grad_norm": 1.2751446770162105, + "learning_rate": 5.9014272501433945e-06, + "loss": 0.4591, + "step": 3699 + }, + { + "epoch": 0.46, + "grad_norm": 1.404506261471304, + "learning_rate": 5.899450389020605e-06, + "loss": 0.516, + "step": 3700 + }, + { + "epoch": 0.46, + "grad_norm": 1.3889948229171092, + "learning_rate": 5.897473382583684e-06, + "loss": 0.4677, + "step": 3701 + }, + { + "epoch": 0.46, + "grad_norm": 2.2631815193287492, + "learning_rate": 5.895496231152033e-06, + "loss": 0.5102, + "step": 3702 + }, + { + "epoch": 0.46, + "grad_norm": 1.5626518136918286, + "learning_rate": 5.893518935045081e-06, + "loss": 0.5141, + "step": 3703 + }, + { + "epoch": 0.46, + "grad_norm": 1.4104502177968783, + "learning_rate": 5.891541494582274e-06, + "loss": 0.5233, + "step": 3704 + }, + { + "epoch": 0.46, + "grad_norm": 1.6254946366913998, + "learning_rate": 5.889563910083087e-06, + "loss": 0.5165, + "step": 3705 + }, + { + "epoch": 0.46, + "grad_norm": 1.3610756542467268, + "learning_rate": 5.887586181867015e-06, + "loss": 0.5031, + "step": 3706 + }, + { + "epoch": 0.46, + "grad_norm": 1.4098997632065402, + "learning_rate": 5.885608310253581e-06, + "loss": 0.5532, + "step": 3707 + }, + { + "epoch": 0.46, + "grad_norm": 1.5357593910677314, + "learning_rate": 5.8836302955623225e-06, + "loss": 0.5059, + "step": 3708 + }, + { + "epoch": 0.46, + "grad_norm": 1.822301409849621, + "learning_rate": 5.881652138112808e-06, + "loss": 0.5389, + "step": 3709 + }, + { + "epoch": 0.46, + "grad_norm": 1.4528299534532378, + "learning_rate": 5.879673838224625e-06, + "loss": 0.5259, + "step": 3710 + }, + { + "epoch": 0.46, + "grad_norm": 0.6502222787439851, + "learning_rate": 5.877695396217386e-06, + "loss": 0.4961, + "step": 3711 + }, + { + "epoch": 0.46, + "grad_norm": 1.5887911511047892, + "learning_rate": 5.875716812410727e-06, + "loss": 0.5187, + "step": 3712 + }, + { + "epoch": 0.46, + "grad_norm": 1.6127291879447092, + "learning_rate": 5.873738087124302e-06, + "loss": 0.5522, + "step": 3713 + }, + { + "epoch": 0.46, + "grad_norm": 2.3110231020898824, + "learning_rate": 5.871759220677795e-06, + "loss": 0.4711, + "step": 3714 + }, + { + "epoch": 0.46, + "grad_norm": 1.2923517567560525, + "learning_rate": 5.8697802133909085e-06, + "loss": 0.494, + "step": 3715 + }, + { + "epoch": 0.46, + "grad_norm": 2.0171625532417607, + "learning_rate": 5.867801065583369e-06, + "loss": 0.5045, + "step": 3716 + }, + { + "epoch": 0.46, + "grad_norm": 1.459700446720111, + "learning_rate": 5.865821777574922e-06, + "loss": 0.4705, + "step": 3717 + }, + { + "epoch": 0.46, + "grad_norm": 2.238435685941135, + "learning_rate": 5.863842349685344e-06, + "loss": 0.5557, + "step": 3718 + }, + { + "epoch": 0.46, + "grad_norm": 1.3170644542756293, + "learning_rate": 5.861862782234425e-06, + "loss": 0.5146, + "step": 3719 + }, + { + "epoch": 0.46, + "grad_norm": 1.77112402196552, + "learning_rate": 5.8598830755419835e-06, + "loss": 0.5185, + "step": 3720 + }, + { + "epoch": 0.46, + "grad_norm": 1.4227671148027017, + "learning_rate": 5.8579032299278585e-06, + "loss": 0.547, + "step": 3721 + }, + { + "epoch": 0.46, + "grad_norm": 4.8600426263452965, + "learning_rate": 5.855923245711909e-06, + "loss": 0.4657, + "step": 3722 + }, + { + "epoch": 0.46, + "grad_norm": 1.8078423840950666, + "learning_rate": 5.853943123214022e-06, + "loss": 0.4963, + "step": 3723 + }, + { + "epoch": 0.46, + "grad_norm": 1.3140178472040205, + "learning_rate": 5.851962862754103e-06, + "loss": 0.4336, + "step": 3724 + }, + { + "epoch": 0.46, + "grad_norm": 1.3518973598901456, + "learning_rate": 5.84998246465208e-06, + "loss": 0.4447, + "step": 3725 + }, + { + "epoch": 0.46, + "grad_norm": 2.6492949452638737, + "learning_rate": 5.848001929227902e-06, + "loss": 0.5199, + "step": 3726 + }, + { + "epoch": 0.46, + "grad_norm": 1.7079034101337343, + "learning_rate": 5.846021256801546e-06, + "loss": 0.4598, + "step": 3727 + }, + { + "epoch": 0.46, + "grad_norm": 1.3625776176116, + "learning_rate": 5.844040447693004e-06, + "loss": 0.4952, + "step": 3728 + }, + { + "epoch": 0.46, + "grad_norm": 1.5736507105134774, + "learning_rate": 5.842059502222295e-06, + "loss": 0.5335, + "step": 3729 + }, + { + "epoch": 0.46, + "grad_norm": 1.3900575889574893, + "learning_rate": 5.840078420709456e-06, + "loss": 0.4571, + "step": 3730 + }, + { + "epoch": 0.46, + "grad_norm": 1.3081077443249847, + "learning_rate": 5.838097203474549e-06, + "loss": 0.5325, + "step": 3731 + }, + { + "epoch": 0.46, + "grad_norm": 1.4149913690397518, + "learning_rate": 5.8361158508376584e-06, + "loss": 0.5263, + "step": 3732 + }, + { + "epoch": 0.46, + "grad_norm": 1.4227367199239105, + "learning_rate": 5.834134363118889e-06, + "loss": 0.554, + "step": 3733 + }, + { + "epoch": 0.46, + "grad_norm": 1.3659653418941065, + "learning_rate": 5.832152740638366e-06, + "loss": 0.5594, + "step": 3734 + }, + { + "epoch": 0.46, + "grad_norm": 1.4498815236131395, + "learning_rate": 5.8301709837162375e-06, + "loss": 0.4839, + "step": 3735 + }, + { + "epoch": 0.46, + "grad_norm": 0.6379826138444108, + "learning_rate": 5.828189092672677e-06, + "loss": 0.4814, + "step": 3736 + }, + { + "epoch": 0.46, + "grad_norm": 1.1874404120184479, + "learning_rate": 5.826207067827874e-06, + "loss": 0.4355, + "step": 3737 + }, + { + "epoch": 0.46, + "grad_norm": 1.1896968083972823, + "learning_rate": 5.824224909502042e-06, + "loss": 0.5023, + "step": 3738 + }, + { + "epoch": 0.46, + "grad_norm": 1.28469351928837, + "learning_rate": 5.822242618015417e-06, + "loss": 0.4747, + "step": 3739 + }, + { + "epoch": 0.46, + "grad_norm": 1.3405004682609176, + "learning_rate": 5.8202601936882565e-06, + "loss": 0.4912, + "step": 3740 + }, + { + "epoch": 0.46, + "grad_norm": 1.521509508415757, + "learning_rate": 5.8182776368408365e-06, + "loss": 0.5042, + "step": 3741 + }, + { + "epoch": 0.46, + "grad_norm": 1.4304713086594372, + "learning_rate": 5.816294947793457e-06, + "loss": 0.4857, + "step": 3742 + }, + { + "epoch": 0.46, + "grad_norm": 1.407516635409624, + "learning_rate": 5.814312126866441e-06, + "loss": 0.4815, + "step": 3743 + }, + { + "epoch": 0.46, + "grad_norm": 3.374491776202101, + "learning_rate": 5.812329174380128e-06, + "loss": 0.4992, + "step": 3744 + }, + { + "epoch": 0.46, + "grad_norm": 1.2658042191199907, + "learning_rate": 5.810346090654883e-06, + "loss": 0.5197, + "step": 3745 + }, + { + "epoch": 0.46, + "grad_norm": 1.589948736983883, + "learning_rate": 5.808362876011089e-06, + "loss": 0.5338, + "step": 3746 + }, + { + "epoch": 0.47, + "grad_norm": 1.5472017582413746, + "learning_rate": 5.806379530769154e-06, + "loss": 0.5497, + "step": 3747 + }, + { + "epoch": 0.47, + "grad_norm": 1.4941205729620681, + "learning_rate": 5.804396055249504e-06, + "loss": 0.4912, + "step": 3748 + }, + { + "epoch": 0.47, + "grad_norm": 1.2873758547493115, + "learning_rate": 5.802412449772585e-06, + "loss": 0.4737, + "step": 3749 + }, + { + "epoch": 0.47, + "grad_norm": 1.9862356925132492, + "learning_rate": 5.800428714658869e-06, + "loss": 0.5114, + "step": 3750 + }, + { + "epoch": 0.47, + "grad_norm": 1.2015546976438813, + "learning_rate": 5.7984448502288455e-06, + "loss": 0.5073, + "step": 3751 + }, + { + "epoch": 0.47, + "grad_norm": 1.2466761880916173, + "learning_rate": 5.796460856803024e-06, + "loss": 0.4497, + "step": 3752 + }, + { + "epoch": 0.47, + "grad_norm": 1.4910622887566962, + "learning_rate": 5.794476734701936e-06, + "loss": 0.5127, + "step": 3753 + }, + { + "epoch": 0.47, + "grad_norm": 1.290204703858359, + "learning_rate": 5.792492484246134e-06, + "loss": 0.4756, + "step": 3754 + }, + { + "epoch": 0.47, + "grad_norm": 1.4159973156842203, + "learning_rate": 5.790508105756192e-06, + "loss": 0.4534, + "step": 3755 + }, + { + "epoch": 0.47, + "grad_norm": 1.4655772184973017, + "learning_rate": 5.788523599552704e-06, + "loss": 0.4617, + "step": 3756 + }, + { + "epoch": 0.47, + "grad_norm": 1.63522079548207, + "learning_rate": 5.7865389659562835e-06, + "loss": 0.5522, + "step": 3757 + }, + { + "epoch": 0.47, + "grad_norm": 1.342507328459723, + "learning_rate": 5.784554205287564e-06, + "loss": 0.5328, + "step": 3758 + }, + { + "epoch": 0.47, + "grad_norm": 1.3710636892525927, + "learning_rate": 5.782569317867204e-06, + "loss": 0.4799, + "step": 3759 + }, + { + "epoch": 0.47, + "grad_norm": 1.238963629696016, + "learning_rate": 5.780584304015879e-06, + "loss": 0.4872, + "step": 3760 + }, + { + "epoch": 0.47, + "grad_norm": 5.516859888821994, + "learning_rate": 5.778599164054286e-06, + "loss": 0.5175, + "step": 3761 + }, + { + "epoch": 0.47, + "grad_norm": 1.5242376957130266, + "learning_rate": 5.77661389830314e-06, + "loss": 0.5162, + "step": 3762 + }, + { + "epoch": 0.47, + "grad_norm": 1.5959688209328888, + "learning_rate": 5.774628507083178e-06, + "loss": 0.478, + "step": 3763 + }, + { + "epoch": 0.47, + "grad_norm": 1.5390937579591093, + "learning_rate": 5.772642990715158e-06, + "loss": 0.549, + "step": 3764 + }, + { + "epoch": 0.47, + "grad_norm": 1.5459986321471937, + "learning_rate": 5.7706573495198595e-06, + "loss": 0.4824, + "step": 3765 + }, + { + "epoch": 0.47, + "grad_norm": 1.7485615328310886, + "learning_rate": 5.768671583818077e-06, + "loss": 0.5419, + "step": 3766 + }, + { + "epoch": 0.47, + "grad_norm": 1.6617800056061585, + "learning_rate": 5.7666856939306316e-06, + "loss": 0.5149, + "step": 3767 + }, + { + "epoch": 0.47, + "grad_norm": 2.035412095338443, + "learning_rate": 5.7646996801783595e-06, + "loss": 0.5039, + "step": 3768 + }, + { + "epoch": 0.47, + "grad_norm": 1.2570451162662535, + "learning_rate": 5.762713542882119e-06, + "loss": 0.514, + "step": 3769 + }, + { + "epoch": 0.47, + "grad_norm": 1.5096625637139065, + "learning_rate": 5.76072728236279e-06, + "loss": 0.4711, + "step": 3770 + }, + { + "epoch": 0.47, + "grad_norm": 1.4769654769939338, + "learning_rate": 5.7587408989412666e-06, + "loss": 0.4925, + "step": 3771 + }, + { + "epoch": 0.47, + "grad_norm": 2.7859026735626404, + "learning_rate": 5.7567543929384696e-06, + "loss": 0.4986, + "step": 3772 + }, + { + "epoch": 0.47, + "grad_norm": 1.7966181544995699, + "learning_rate": 5.754767764675335e-06, + "loss": 0.4997, + "step": 3773 + }, + { + "epoch": 0.47, + "grad_norm": 1.4071480749895844, + "learning_rate": 5.752781014472822e-06, + "loss": 0.482, + "step": 3774 + }, + { + "epoch": 0.47, + "grad_norm": 1.3596223913000691, + "learning_rate": 5.750794142651904e-06, + "loss": 0.5223, + "step": 3775 + }, + { + "epoch": 0.47, + "grad_norm": 1.4904250217919546, + "learning_rate": 5.74880714953358e-06, + "loss": 0.517, + "step": 3776 + }, + { + "epoch": 0.47, + "grad_norm": 1.9235479357932799, + "learning_rate": 5.746820035438868e-06, + "loss": 0.5683, + "step": 3777 + }, + { + "epoch": 0.47, + "grad_norm": 1.8101897413589039, + "learning_rate": 5.744832800688801e-06, + "loss": 0.4943, + "step": 3778 + }, + { + "epoch": 0.47, + "grad_norm": 2.255858221208562, + "learning_rate": 5.742845445604436e-06, + "loss": 0.5104, + "step": 3779 + }, + { + "epoch": 0.47, + "grad_norm": 0.7438680744054342, + "learning_rate": 5.740857970506846e-06, + "loss": 0.5296, + "step": 3780 + }, + { + "epoch": 0.47, + "grad_norm": 1.9525014070396836, + "learning_rate": 5.738870375717125e-06, + "loss": 0.5288, + "step": 3781 + }, + { + "epoch": 0.47, + "grad_norm": 1.5688073783490026, + "learning_rate": 5.736882661556389e-06, + "loss": 0.5331, + "step": 3782 + }, + { + "epoch": 0.47, + "grad_norm": 1.4595580168795548, + "learning_rate": 5.7348948283457675e-06, + "loss": 0.5149, + "step": 3783 + }, + { + "epoch": 0.47, + "grad_norm": 2.3998571453059667, + "learning_rate": 5.7329068764064155e-06, + "loss": 0.4991, + "step": 3784 + }, + { + "epoch": 0.47, + "grad_norm": 1.538385560945485, + "learning_rate": 5.730918806059502e-06, + "loss": 0.5069, + "step": 3785 + }, + { + "epoch": 0.47, + "grad_norm": 1.5397070432129656, + "learning_rate": 5.728930617626218e-06, + "loss": 0.5278, + "step": 3786 + }, + { + "epoch": 0.47, + "grad_norm": 0.64968775068539, + "learning_rate": 5.7269423114277745e-06, + "loss": 0.5205, + "step": 3787 + }, + { + "epoch": 0.47, + "grad_norm": 1.783297417690423, + "learning_rate": 5.724953887785396e-06, + "loss": 0.4983, + "step": 3788 + }, + { + "epoch": 0.47, + "grad_norm": 3.8262445860846808, + "learning_rate": 5.722965347020334e-06, + "loss": 0.4443, + "step": 3789 + }, + { + "epoch": 0.47, + "grad_norm": 1.673647777908011, + "learning_rate": 5.7209766894538525e-06, + "loss": 0.5117, + "step": 3790 + }, + { + "epoch": 0.47, + "grad_norm": 1.5446791654566836, + "learning_rate": 5.718987915407235e-06, + "loss": 0.4558, + "step": 3791 + }, + { + "epoch": 0.47, + "grad_norm": 1.7108875475636238, + "learning_rate": 5.716999025201789e-06, + "loss": 0.518, + "step": 3792 + }, + { + "epoch": 0.47, + "grad_norm": 1.830804067213509, + "learning_rate": 5.715010019158835e-06, + "loss": 0.5068, + "step": 3793 + }, + { + "epoch": 0.47, + "grad_norm": 1.4892383342831381, + "learning_rate": 5.713020897599717e-06, + "loss": 0.5711, + "step": 3794 + }, + { + "epoch": 0.47, + "grad_norm": 1.6181834703601836, + "learning_rate": 5.7110316608457905e-06, + "loss": 0.492, + "step": 3795 + }, + { + "epoch": 0.47, + "grad_norm": 10.807414076229394, + "learning_rate": 5.70904230921844e-06, + "loss": 0.4895, + "step": 3796 + }, + { + "epoch": 0.47, + "grad_norm": 1.3334993049574357, + "learning_rate": 5.7070528430390585e-06, + "loss": 0.4797, + "step": 3797 + }, + { + "epoch": 0.47, + "grad_norm": 1.481416961209663, + "learning_rate": 5.705063262629062e-06, + "loss": 0.521, + "step": 3798 + }, + { + "epoch": 0.47, + "grad_norm": 1.5103932214495144, + "learning_rate": 5.703073568309888e-06, + "loss": 0.4994, + "step": 3799 + }, + { + "epoch": 0.47, + "grad_norm": 1.697328962339054, + "learning_rate": 5.701083760402984e-06, + "loss": 0.552, + "step": 3800 + }, + { + "epoch": 0.47, + "grad_norm": 1.4739255860006288, + "learning_rate": 5.699093839229826e-06, + "loss": 0.4767, + "step": 3801 + }, + { + "epoch": 0.47, + "grad_norm": 1.634062539340076, + "learning_rate": 5.697103805111901e-06, + "loss": 0.4903, + "step": 3802 + }, + { + "epoch": 0.47, + "grad_norm": 1.3965792045845735, + "learning_rate": 5.695113658370717e-06, + "loss": 0.5166, + "step": 3803 + }, + { + "epoch": 0.47, + "grad_norm": 1.3024963936312466, + "learning_rate": 5.693123399327798e-06, + "loss": 0.5122, + "step": 3804 + }, + { + "epoch": 0.47, + "grad_norm": 1.2964010800289418, + "learning_rate": 5.691133028304691e-06, + "loss": 0.4602, + "step": 3805 + }, + { + "epoch": 0.47, + "grad_norm": 1.4052206110404424, + "learning_rate": 5.689142545622955e-06, + "loss": 0.5785, + "step": 3806 + }, + { + "epoch": 0.47, + "grad_norm": 1.5247851003929445, + "learning_rate": 5.687151951604173e-06, + "loss": 0.5741, + "step": 3807 + }, + { + "epoch": 0.47, + "grad_norm": 1.3450074532349305, + "learning_rate": 5.685161246569939e-06, + "loss": 0.4638, + "step": 3808 + }, + { + "epoch": 0.47, + "grad_norm": 1.5339496052297188, + "learning_rate": 5.6831704308418726e-06, + "loss": 0.4924, + "step": 3809 + }, + { + "epoch": 0.47, + "grad_norm": 1.3515665945195043, + "learning_rate": 5.681179504741606e-06, + "loss": 0.4642, + "step": 3810 + }, + { + "epoch": 0.47, + "grad_norm": 1.5465504956664637, + "learning_rate": 5.679188468590792e-06, + "loss": 0.5482, + "step": 3811 + }, + { + "epoch": 0.47, + "grad_norm": 1.4018583974536207, + "learning_rate": 5.6771973227110976e-06, + "loss": 0.526, + "step": 3812 + }, + { + "epoch": 0.47, + "grad_norm": 1.2934414393005258, + "learning_rate": 5.675206067424212e-06, + "loss": 0.459, + "step": 3813 + }, + { + "epoch": 0.47, + "grad_norm": 1.315842288116888, + "learning_rate": 5.67321470305184e-06, + "loss": 0.4843, + "step": 3814 + }, + { + "epoch": 0.47, + "grad_norm": 1.3248801202211313, + "learning_rate": 5.671223229915705e-06, + "loss": 0.5124, + "step": 3815 + }, + { + "epoch": 0.47, + "grad_norm": 1.2614354452804377, + "learning_rate": 5.669231648337543e-06, + "loss": 0.511, + "step": 3816 + }, + { + "epoch": 0.47, + "grad_norm": 1.4200889081262933, + "learning_rate": 5.667239958639114e-06, + "loss": 0.4547, + "step": 3817 + }, + { + "epoch": 0.47, + "grad_norm": 1.4868397770474286, + "learning_rate": 5.665248161142196e-06, + "loss": 0.4711, + "step": 3818 + }, + { + "epoch": 0.47, + "grad_norm": 1.4873342914262366, + "learning_rate": 5.663256256168577e-06, + "loss": 0.5622, + "step": 3819 + }, + { + "epoch": 0.47, + "grad_norm": 1.5527944075539648, + "learning_rate": 5.661264244040067e-06, + "loss": 0.5585, + "step": 3820 + }, + { + "epoch": 0.47, + "grad_norm": 1.455195628107584, + "learning_rate": 5.659272125078495e-06, + "loss": 0.5281, + "step": 3821 + }, + { + "epoch": 0.47, + "grad_norm": 1.4272424919821816, + "learning_rate": 5.6572798996057065e-06, + "loss": 0.4699, + "step": 3822 + }, + { + "epoch": 0.47, + "grad_norm": 1.4537711024353266, + "learning_rate": 5.65528756794356e-06, + "loss": 0.526, + "step": 3823 + }, + { + "epoch": 0.47, + "grad_norm": 1.7239883379658023, + "learning_rate": 5.653295130413937e-06, + "loss": 0.501, + "step": 3824 + }, + { + "epoch": 0.47, + "grad_norm": 1.388762025067527, + "learning_rate": 5.651302587338732e-06, + "loss": 0.465, + "step": 3825 + }, + { + "epoch": 0.47, + "grad_norm": 1.605357518243063, + "learning_rate": 5.649309939039856e-06, + "loss": 0.5448, + "step": 3826 + }, + { + "epoch": 0.47, + "grad_norm": 1.3004710250305824, + "learning_rate": 5.647317185839243e-06, + "loss": 0.5147, + "step": 3827 + }, + { + "epoch": 0.48, + "grad_norm": 1.2453765452625825, + "learning_rate": 5.645324328058834e-06, + "loss": 0.4939, + "step": 3828 + }, + { + "epoch": 0.48, + "grad_norm": 1.6630221600508244, + "learning_rate": 5.643331366020599e-06, + "loss": 0.5313, + "step": 3829 + }, + { + "epoch": 0.48, + "grad_norm": 2.1403267643695534, + "learning_rate": 5.641338300046516e-06, + "loss": 0.5347, + "step": 3830 + }, + { + "epoch": 0.48, + "grad_norm": 1.6645397725384086, + "learning_rate": 5.639345130458582e-06, + "loss": 0.4903, + "step": 3831 + }, + { + "epoch": 0.48, + "grad_norm": 1.30182281674491, + "learning_rate": 5.637351857578811e-06, + "loss": 0.4917, + "step": 3832 + }, + { + "epoch": 0.48, + "grad_norm": 1.4362963547064533, + "learning_rate": 5.635358481729234e-06, + "loss": 0.5219, + "step": 3833 + }, + { + "epoch": 0.48, + "grad_norm": 1.4382650532418015, + "learning_rate": 5.6333650032318985e-06, + "loss": 0.5203, + "step": 3834 + }, + { + "epoch": 0.48, + "grad_norm": 1.3442530910752974, + "learning_rate": 5.631371422408869e-06, + "loss": 0.4943, + "step": 3835 + }, + { + "epoch": 0.48, + "grad_norm": 1.4265060064893866, + "learning_rate": 5.629377739582225e-06, + "loss": 0.4498, + "step": 3836 + }, + { + "epoch": 0.48, + "grad_norm": 4.216269417602553, + "learning_rate": 5.627383955074064e-06, + "loss": 0.477, + "step": 3837 + }, + { + "epoch": 0.48, + "grad_norm": 1.8816201018509997, + "learning_rate": 5.6253900692065e-06, + "loss": 0.5216, + "step": 3838 + }, + { + "epoch": 0.48, + "grad_norm": 1.5067962273330215, + "learning_rate": 5.623396082301662e-06, + "loss": 0.4927, + "step": 3839 + }, + { + "epoch": 0.48, + "grad_norm": 1.553380864316869, + "learning_rate": 5.6214019946816975e-06, + "loss": 0.5067, + "step": 3840 + }, + { + "epoch": 0.48, + "grad_norm": 2.2467239685806337, + "learning_rate": 5.619407806668768e-06, + "loss": 0.5472, + "step": 3841 + }, + { + "epoch": 0.48, + "grad_norm": 1.3959033963800893, + "learning_rate": 5.617413518585051e-06, + "loss": 0.5428, + "step": 3842 + }, + { + "epoch": 0.48, + "grad_norm": 1.833831320835665, + "learning_rate": 5.615419130752743e-06, + "loss": 0.5274, + "step": 3843 + }, + { + "epoch": 0.48, + "grad_norm": 1.447765424375853, + "learning_rate": 5.613424643494054e-06, + "loss": 0.552, + "step": 3844 + }, + { + "epoch": 0.48, + "grad_norm": 1.423418358339179, + "learning_rate": 5.611430057131211e-06, + "loss": 0.5416, + "step": 3845 + }, + { + "epoch": 0.48, + "grad_norm": 1.9508924456607224, + "learning_rate": 5.609435371986457e-06, + "loss": 0.5314, + "step": 3846 + }, + { + "epoch": 0.48, + "grad_norm": 1.1955084497773902, + "learning_rate": 5.607440588382052e-06, + "loss": 0.4636, + "step": 3847 + }, + { + "epoch": 0.48, + "grad_norm": 1.8439394899829424, + "learning_rate": 5.60544570664027e-06, + "loss": 0.5107, + "step": 3848 + }, + { + "epoch": 0.48, + "grad_norm": 1.515882843463731, + "learning_rate": 5.6034507270834e-06, + "loss": 0.5218, + "step": 3849 + }, + { + "epoch": 0.48, + "grad_norm": 1.5241593853007185, + "learning_rate": 5.601455650033754e-06, + "loss": 0.5216, + "step": 3850 + }, + { + "epoch": 0.48, + "grad_norm": 12.162683050587866, + "learning_rate": 5.599460475813648e-06, + "loss": 0.4684, + "step": 3851 + }, + { + "epoch": 0.48, + "grad_norm": 2.2701973747754316, + "learning_rate": 5.5974652047454235e-06, + "loss": 0.5149, + "step": 3852 + }, + { + "epoch": 0.48, + "grad_norm": 1.9791516248661194, + "learning_rate": 5.595469837151432e-06, + "loss": 0.531, + "step": 3853 + }, + { + "epoch": 0.48, + "grad_norm": 1.2696017553188235, + "learning_rate": 5.593474373354045e-06, + "loss": 0.5118, + "step": 3854 + }, + { + "epoch": 0.48, + "grad_norm": 1.3986543128280633, + "learning_rate": 5.591478813675646e-06, + "loss": 0.53, + "step": 3855 + }, + { + "epoch": 0.48, + "grad_norm": 1.2703344434000077, + "learning_rate": 5.589483158438636e-06, + "loss": 0.5266, + "step": 3856 + }, + { + "epoch": 0.48, + "grad_norm": 1.4011913209638538, + "learning_rate": 5.587487407965429e-06, + "loss": 0.5057, + "step": 3857 + }, + { + "epoch": 0.48, + "grad_norm": 1.313243610761376, + "learning_rate": 5.585491562578456e-06, + "loss": 0.497, + "step": 3858 + }, + { + "epoch": 0.48, + "grad_norm": 1.4951327213403844, + "learning_rate": 5.583495622600168e-06, + "loss": 0.5133, + "step": 3859 + }, + { + "epoch": 0.48, + "grad_norm": 1.7132730355293824, + "learning_rate": 5.581499588353021e-06, + "loss": 0.4932, + "step": 3860 + }, + { + "epoch": 0.48, + "grad_norm": 1.479779515901214, + "learning_rate": 5.579503460159493e-06, + "loss": 0.5218, + "step": 3861 + }, + { + "epoch": 0.48, + "grad_norm": 1.3103752607086714, + "learning_rate": 5.577507238342078e-06, + "loss": 0.5094, + "step": 3862 + }, + { + "epoch": 0.48, + "grad_norm": 1.5486202310935997, + "learning_rate": 5.575510923223284e-06, + "loss": 0.4926, + "step": 3863 + }, + { + "epoch": 0.48, + "grad_norm": 8.033733047007289, + "learning_rate": 5.57351451512563e-06, + "loss": 0.4873, + "step": 3864 + }, + { + "epoch": 0.48, + "grad_norm": 1.3273892616667802, + "learning_rate": 5.5715180143716555e-06, + "loss": 0.5261, + "step": 3865 + }, + { + "epoch": 0.48, + "grad_norm": 1.2925855023656074, + "learning_rate": 5.569521421283912e-06, + "loss": 0.5083, + "step": 3866 + }, + { + "epoch": 0.48, + "grad_norm": 1.1539965994659505, + "learning_rate": 5.567524736184967e-06, + "loss": 0.4574, + "step": 3867 + }, + { + "epoch": 0.48, + "grad_norm": 1.5021009839892818, + "learning_rate": 5.565527959397403e-06, + "loss": 0.5179, + "step": 3868 + }, + { + "epoch": 0.48, + "grad_norm": 1.2228970177505332, + "learning_rate": 5.563531091243817e-06, + "loss": 0.5128, + "step": 3869 + }, + { + "epoch": 0.48, + "grad_norm": 1.7249298140042144, + "learning_rate": 5.5615341320468195e-06, + "loss": 0.5216, + "step": 3870 + }, + { + "epoch": 0.48, + "grad_norm": 1.342039726006231, + "learning_rate": 5.559537082129037e-06, + "loss": 0.5276, + "step": 3871 + }, + { + "epoch": 0.48, + "grad_norm": 1.660683024715625, + "learning_rate": 5.5575399418131115e-06, + "loss": 0.5323, + "step": 3872 + }, + { + "epoch": 0.48, + "grad_norm": 1.7599077618364973, + "learning_rate": 5.555542711421696e-06, + "loss": 0.5335, + "step": 3873 + }, + { + "epoch": 0.48, + "grad_norm": 1.4276717562059684, + "learning_rate": 5.553545391277465e-06, + "loss": 0.4936, + "step": 3874 + }, + { + "epoch": 0.48, + "grad_norm": 3.0651420932368425, + "learning_rate": 5.5515479817031005e-06, + "loss": 0.5023, + "step": 3875 + }, + { + "epoch": 0.48, + "grad_norm": 1.2854772402377153, + "learning_rate": 5.549550483021302e-06, + "loss": 0.4946, + "step": 3876 + }, + { + "epoch": 0.48, + "grad_norm": 2.999653481852163, + "learning_rate": 5.547552895554783e-06, + "loss": 0.4761, + "step": 3877 + }, + { + "epoch": 0.48, + "grad_norm": 2.355792789812382, + "learning_rate": 5.545555219626271e-06, + "loss": 0.5174, + "step": 3878 + }, + { + "epoch": 0.48, + "grad_norm": 1.4044057073752125, + "learning_rate": 5.543557455558509e-06, + "loss": 0.528, + "step": 3879 + }, + { + "epoch": 0.48, + "grad_norm": 1.4615035364180013, + "learning_rate": 5.541559603674252e-06, + "loss": 0.5201, + "step": 3880 + }, + { + "epoch": 0.48, + "grad_norm": 2.052410206473402, + "learning_rate": 5.539561664296273e-06, + "loss": 0.519, + "step": 3881 + }, + { + "epoch": 0.48, + "grad_norm": 1.7101247179212373, + "learning_rate": 5.537563637747352e-06, + "loss": 0.4933, + "step": 3882 + }, + { + "epoch": 0.48, + "grad_norm": 1.4553625355001882, + "learning_rate": 5.535565524350293e-06, + "loss": 0.515, + "step": 3883 + }, + { + "epoch": 0.48, + "grad_norm": 1.5823929048265364, + "learning_rate": 5.533567324427906e-06, + "loss": 0.5251, + "step": 3884 + }, + { + "epoch": 0.48, + "grad_norm": 1.5125343261415436, + "learning_rate": 5.5315690383030195e-06, + "loss": 0.5091, + "step": 3885 + }, + { + "epoch": 0.48, + "grad_norm": 0.6701063803891216, + "learning_rate": 5.529570666298473e-06, + "loss": 0.4574, + "step": 3886 + }, + { + "epoch": 0.48, + "grad_norm": 1.7559956685205522, + "learning_rate": 5.5275722087371205e-06, + "loss": 0.5381, + "step": 3887 + }, + { + "epoch": 0.48, + "grad_norm": 1.9654385967049164, + "learning_rate": 5.525573665941833e-06, + "loss": 0.5253, + "step": 3888 + }, + { + "epoch": 0.48, + "grad_norm": 1.92878214412045, + "learning_rate": 5.523575038235489e-06, + "loss": 0.4617, + "step": 3889 + }, + { + "epoch": 0.48, + "grad_norm": 2.9839273447811436, + "learning_rate": 5.521576325940986e-06, + "loss": 0.5056, + "step": 3890 + }, + { + "epoch": 0.48, + "grad_norm": 1.448419304382558, + "learning_rate": 5.519577529381235e-06, + "loss": 0.509, + "step": 3891 + }, + { + "epoch": 0.48, + "grad_norm": 1.6441958788761752, + "learning_rate": 5.5175786488791575e-06, + "loss": 0.5632, + "step": 3892 + }, + { + "epoch": 0.48, + "grad_norm": 2.1335798112114635, + "learning_rate": 5.515579684757691e-06, + "loss": 0.5622, + "step": 3893 + }, + { + "epoch": 0.48, + "grad_norm": 1.362047097744073, + "learning_rate": 5.513580637339786e-06, + "loss": 0.556, + "step": 3894 + }, + { + "epoch": 0.48, + "grad_norm": 1.4222148143798927, + "learning_rate": 5.511581506948407e-06, + "loss": 0.4875, + "step": 3895 + }, + { + "epoch": 0.48, + "grad_norm": 1.4232338129775202, + "learning_rate": 5.509582293906528e-06, + "loss": 0.4922, + "step": 3896 + }, + { + "epoch": 0.48, + "grad_norm": 1.479210921980862, + "learning_rate": 5.507582998537142e-06, + "loss": 0.5159, + "step": 3897 + }, + { + "epoch": 0.48, + "grad_norm": 1.3022543152571098, + "learning_rate": 5.505583621163252e-06, + "loss": 0.5361, + "step": 3898 + }, + { + "epoch": 0.48, + "grad_norm": 1.3123499942506818, + "learning_rate": 5.503584162107876e-06, + "loss": 0.5051, + "step": 3899 + }, + { + "epoch": 0.48, + "grad_norm": 1.434330700741348, + "learning_rate": 5.501584621694043e-06, + "loss": 0.5379, + "step": 3900 + }, + { + "epoch": 0.48, + "grad_norm": 1.9198939466196938, + "learning_rate": 5.4995850002447955e-06, + "loss": 0.5165, + "step": 3901 + }, + { + "epoch": 0.48, + "grad_norm": 1.5452717097371367, + "learning_rate": 5.4975852980831925e-06, + "loss": 0.4853, + "step": 3902 + }, + { + "epoch": 0.48, + "grad_norm": 1.4825630920396455, + "learning_rate": 5.495585515532302e-06, + "loss": 0.526, + "step": 3903 + }, + { + "epoch": 0.48, + "grad_norm": 1.3945136410814463, + "learning_rate": 5.4935856529152075e-06, + "loss": 0.477, + "step": 3904 + }, + { + "epoch": 0.48, + "grad_norm": 1.8283627173286865, + "learning_rate": 5.491585710555004e-06, + "loss": 0.5037, + "step": 3905 + }, + { + "epoch": 0.48, + "grad_norm": 4.863992368418988, + "learning_rate": 5.489585688774798e-06, + "loss": 0.5524, + "step": 3906 + }, + { + "epoch": 0.48, + "grad_norm": 1.5047987926389876, + "learning_rate": 5.487585587897713e-06, + "loss": 0.4737, + "step": 3907 + }, + { + "epoch": 0.48, + "grad_norm": 2.051631341269757, + "learning_rate": 5.4855854082468805e-06, + "loss": 0.5492, + "step": 3908 + }, + { + "epoch": 0.49, + "grad_norm": 1.411897797407055, + "learning_rate": 5.483585150145451e-06, + "loss": 0.4651, + "step": 3909 + }, + { + "epoch": 0.49, + "grad_norm": 1.5153953133448943, + "learning_rate": 5.481584813916579e-06, + "loss": 0.5339, + "step": 3910 + }, + { + "epoch": 0.49, + "grad_norm": 1.5773017112958216, + "learning_rate": 5.47958439988344e-06, + "loss": 0.5031, + "step": 3911 + }, + { + "epoch": 0.49, + "grad_norm": 1.4551035961762118, + "learning_rate": 5.477583908369219e-06, + "loss": 0.5042, + "step": 3912 + }, + { + "epoch": 0.49, + "grad_norm": 1.6593828724311739, + "learning_rate": 5.47558333969711e-06, + "loss": 0.5674, + "step": 3913 + }, + { + "epoch": 0.49, + "grad_norm": 1.52905879375427, + "learning_rate": 5.473582694190323e-06, + "loss": 0.4978, + "step": 3914 + }, + { + "epoch": 0.49, + "grad_norm": 2.068302465264425, + "learning_rate": 5.471581972172082e-06, + "loss": 0.4969, + "step": 3915 + }, + { + "epoch": 0.49, + "grad_norm": 1.9660457837978043, + "learning_rate": 5.4695811739656204e-06, + "loss": 0.5076, + "step": 3916 + }, + { + "epoch": 0.49, + "grad_norm": 1.45853978070074, + "learning_rate": 5.467580299894183e-06, + "loss": 0.5606, + "step": 3917 + }, + { + "epoch": 0.49, + "grad_norm": 0.664872886856364, + "learning_rate": 5.465579350281032e-06, + "loss": 0.513, + "step": 3918 + }, + { + "epoch": 0.49, + "grad_norm": 1.6934631719056283, + "learning_rate": 5.463578325449434e-06, + "loss": 0.5506, + "step": 3919 + }, + { + "epoch": 0.49, + "grad_norm": 3.0296482449817947, + "learning_rate": 5.461577225722676e-06, + "loss": 0.5128, + "step": 3920 + }, + { + "epoch": 0.49, + "grad_norm": 0.6661246497699034, + "learning_rate": 5.4595760514240525e-06, + "loss": 0.4678, + "step": 3921 + }, + { + "epoch": 0.49, + "grad_norm": 1.3550930215234538, + "learning_rate": 5.45757480287687e-06, + "loss": 0.5224, + "step": 3922 + }, + { + "epoch": 0.49, + "grad_norm": 1.4528187165438429, + "learning_rate": 5.455573480404448e-06, + "loss": 0.512, + "step": 3923 + }, + { + "epoch": 0.49, + "grad_norm": 1.4543519260418145, + "learning_rate": 5.453572084330117e-06, + "loss": 0.4873, + "step": 3924 + }, + { + "epoch": 0.49, + "grad_norm": 1.4530409679251879, + "learning_rate": 5.451570614977223e-06, + "loss": 0.5308, + "step": 3925 + }, + { + "epoch": 0.49, + "grad_norm": 1.4345990352945623, + "learning_rate": 5.449569072669119e-06, + "loss": 0.4699, + "step": 3926 + }, + { + "epoch": 0.49, + "grad_norm": 1.3333642766386664, + "learning_rate": 5.447567457729169e-06, + "loss": 0.4703, + "step": 3927 + }, + { + "epoch": 0.49, + "grad_norm": 1.9541630497777558, + "learning_rate": 5.4455657704807566e-06, + "loss": 0.5087, + "step": 3928 + }, + { + "epoch": 0.49, + "grad_norm": 1.5383476148156443, + "learning_rate": 5.44356401124727e-06, + "loss": 0.5101, + "step": 3929 + }, + { + "epoch": 0.49, + "grad_norm": 1.3848178623750136, + "learning_rate": 5.44156218035211e-06, + "loss": 0.5336, + "step": 3930 + }, + { + "epoch": 0.49, + "grad_norm": 1.2823093070822174, + "learning_rate": 5.439560278118692e-06, + "loss": 0.4547, + "step": 3931 + }, + { + "epoch": 0.49, + "grad_norm": 1.7596416413390763, + "learning_rate": 5.437558304870438e-06, + "loss": 0.5239, + "step": 3932 + }, + { + "epoch": 0.49, + "grad_norm": 1.6311320117875125, + "learning_rate": 5.435556260930788e-06, + "loss": 0.4999, + "step": 3933 + }, + { + "epoch": 0.49, + "grad_norm": 2.1424810969438024, + "learning_rate": 5.433554146623187e-06, + "loss": 0.4954, + "step": 3934 + }, + { + "epoch": 0.49, + "grad_norm": 1.2357186619718303, + "learning_rate": 5.431551962271094e-06, + "loss": 0.5075, + "step": 3935 + }, + { + "epoch": 0.49, + "grad_norm": 1.5599698797491244, + "learning_rate": 5.429549708197982e-06, + "loss": 0.4683, + "step": 3936 + }, + { + "epoch": 0.49, + "grad_norm": 1.531008438280879, + "learning_rate": 5.427547384727332e-06, + "loss": 0.5502, + "step": 3937 + }, + { + "epoch": 0.49, + "grad_norm": 1.4741978722495628, + "learning_rate": 5.425544992182636e-06, + "loss": 0.5169, + "step": 3938 + }, + { + "epoch": 0.49, + "grad_norm": 1.8507114164050131, + "learning_rate": 5.423542530887399e-06, + "loss": 0.4601, + "step": 3939 + }, + { + "epoch": 0.49, + "grad_norm": 1.3637993319871706, + "learning_rate": 5.421540001165135e-06, + "loss": 0.5606, + "step": 3940 + }, + { + "epoch": 0.49, + "grad_norm": 1.7277813484271007, + "learning_rate": 5.419537403339372e-06, + "loss": 0.4859, + "step": 3941 + }, + { + "epoch": 0.49, + "grad_norm": 2.2992120001099763, + "learning_rate": 5.417534737733648e-06, + "loss": 0.5243, + "step": 3942 + }, + { + "epoch": 0.49, + "grad_norm": 1.3523696755445922, + "learning_rate": 5.415532004671506e-06, + "loss": 0.5216, + "step": 3943 + }, + { + "epoch": 0.49, + "grad_norm": 1.4589926049928805, + "learning_rate": 5.413529204476512e-06, + "loss": 0.4862, + "step": 3944 + }, + { + "epoch": 0.49, + "grad_norm": 1.334457517749518, + "learning_rate": 5.411526337472232e-06, + "loss": 0.5187, + "step": 3945 + }, + { + "epoch": 0.49, + "grad_norm": 1.3357269722684226, + "learning_rate": 5.409523403982247e-06, + "loss": 0.5084, + "step": 3946 + }, + { + "epoch": 0.49, + "grad_norm": 1.596655906918947, + "learning_rate": 5.40752040433015e-06, + "loss": 0.4783, + "step": 3947 + }, + { + "epoch": 0.49, + "grad_norm": 1.386972195789598, + "learning_rate": 5.4055173388395445e-06, + "loss": 0.5405, + "step": 3948 + }, + { + "epoch": 0.49, + "grad_norm": 1.3719441891820827, + "learning_rate": 5.40351420783404e-06, + "loss": 0.4992, + "step": 3949 + }, + { + "epoch": 0.49, + "grad_norm": 1.5172485941704512, + "learning_rate": 5.40151101163726e-06, + "loss": 0.5282, + "step": 3950 + }, + { + "epoch": 0.49, + "grad_norm": 1.4264798256383389, + "learning_rate": 5.399507750572841e-06, + "loss": 0.5071, + "step": 3951 + }, + { + "epoch": 0.49, + "grad_norm": 1.87510756362639, + "learning_rate": 5.397504424964426e-06, + "loss": 0.5177, + "step": 3952 + }, + { + "epoch": 0.49, + "grad_norm": 1.3567163748524484, + "learning_rate": 5.395501035135671e-06, + "loss": 0.4396, + "step": 3953 + }, + { + "epoch": 0.49, + "grad_norm": 1.3409907238416823, + "learning_rate": 5.39349758141024e-06, + "loss": 0.4629, + "step": 3954 + }, + { + "epoch": 0.49, + "grad_norm": 1.982873474793292, + "learning_rate": 5.391494064111809e-06, + "loss": 0.4955, + "step": 3955 + }, + { + "epoch": 0.49, + "grad_norm": 1.508085533195063, + "learning_rate": 5.389490483564064e-06, + "loss": 0.5198, + "step": 3956 + }, + { + "epoch": 0.49, + "grad_norm": 1.2250701542250673, + "learning_rate": 5.387486840090701e-06, + "loss": 0.4803, + "step": 3957 + }, + { + "epoch": 0.49, + "grad_norm": 1.4641385971210494, + "learning_rate": 5.3854831340154265e-06, + "loss": 0.5253, + "step": 3958 + }, + { + "epoch": 0.49, + "grad_norm": 1.3998754036190544, + "learning_rate": 5.383479365661958e-06, + "loss": 0.4961, + "step": 3959 + }, + { + "epoch": 0.49, + "grad_norm": 1.4065888899634549, + "learning_rate": 5.381475535354018e-06, + "loss": 0.5066, + "step": 3960 + }, + { + "epoch": 0.49, + "grad_norm": 1.4691889688853568, + "learning_rate": 5.379471643415347e-06, + "loss": 0.5421, + "step": 3961 + }, + { + "epoch": 0.49, + "grad_norm": 1.39716223039857, + "learning_rate": 5.37746769016969e-06, + "loss": 0.5345, + "step": 3962 + }, + { + "epoch": 0.49, + "grad_norm": 1.2562632841531238, + "learning_rate": 5.375463675940803e-06, + "loss": 0.4499, + "step": 3963 + }, + { + "epoch": 0.49, + "grad_norm": 1.8008475312845498, + "learning_rate": 5.373459601052451e-06, + "loss": 0.5108, + "step": 3964 + }, + { + "epoch": 0.49, + "grad_norm": 1.483453578025183, + "learning_rate": 5.3714554658284125e-06, + "loss": 0.565, + "step": 3965 + }, + { + "epoch": 0.49, + "grad_norm": 1.4975414533192573, + "learning_rate": 5.369451270592472e-06, + "loss": 0.5173, + "step": 3966 + }, + { + "epoch": 0.49, + "grad_norm": 2.123985067004447, + "learning_rate": 5.3674470156684255e-06, + "loss": 0.522, + "step": 3967 + }, + { + "epoch": 0.49, + "grad_norm": 0.6500700336932811, + "learning_rate": 5.365442701380077e-06, + "loss": 0.4349, + "step": 3968 + }, + { + "epoch": 0.49, + "grad_norm": 1.6029152251625887, + "learning_rate": 5.3634383280512416e-06, + "loss": 0.5308, + "step": 3969 + }, + { + "epoch": 0.49, + "grad_norm": 1.3769653475610781, + "learning_rate": 5.361433896005743e-06, + "loss": 0.5098, + "step": 3970 + }, + { + "epoch": 0.49, + "grad_norm": 7.020545630737324, + "learning_rate": 5.359429405567415e-06, + "loss": 0.5218, + "step": 3971 + }, + { + "epoch": 0.49, + "grad_norm": 1.4426438371728305, + "learning_rate": 5.357424857060102e-06, + "loss": 0.5149, + "step": 3972 + }, + { + "epoch": 0.49, + "grad_norm": 1.498625679032598, + "learning_rate": 5.355420250807654e-06, + "loss": 0.5784, + "step": 3973 + }, + { + "epoch": 0.49, + "grad_norm": 1.337563009204282, + "learning_rate": 5.353415587133936e-06, + "loss": 0.5322, + "step": 3974 + }, + { + "epoch": 0.49, + "grad_norm": 2.1258963333162684, + "learning_rate": 5.351410866362816e-06, + "loss": 0.5215, + "step": 3975 + }, + { + "epoch": 0.49, + "grad_norm": 2.1845941268352767, + "learning_rate": 5.349406088818176e-06, + "loss": 0.5404, + "step": 3976 + }, + { + "epoch": 0.49, + "grad_norm": 1.5751923241999841, + "learning_rate": 5.347401254823906e-06, + "loss": 0.5373, + "step": 3977 + }, + { + "epoch": 0.49, + "grad_norm": 1.646247402251166, + "learning_rate": 5.3453963647039035e-06, + "loss": 0.5345, + "step": 3978 + }, + { + "epoch": 0.49, + "grad_norm": 1.58443216877823, + "learning_rate": 5.343391418782076e-06, + "loss": 0.4974, + "step": 3979 + }, + { + "epoch": 0.49, + "grad_norm": 4.9367408343514425, + "learning_rate": 5.341386417382338e-06, + "loss": 0.4804, + "step": 3980 + }, + { + "epoch": 0.49, + "grad_norm": 1.3965703455304954, + "learning_rate": 5.33938136082862e-06, + "loss": 0.5394, + "step": 3981 + }, + { + "epoch": 0.49, + "grad_norm": 1.590559268954731, + "learning_rate": 5.3373762494448546e-06, + "loss": 0.5371, + "step": 3982 + }, + { + "epoch": 0.49, + "grad_norm": 1.320429476244967, + "learning_rate": 5.335371083554984e-06, + "loss": 0.53, + "step": 3983 + }, + { + "epoch": 0.49, + "grad_norm": 1.421899153361156, + "learning_rate": 5.333365863482961e-06, + "loss": 0.4955, + "step": 3984 + }, + { + "epoch": 0.49, + "grad_norm": 1.5542947739027118, + "learning_rate": 5.331360589552746e-06, + "loss": 0.4968, + "step": 3985 + }, + { + "epoch": 0.49, + "grad_norm": 0.7457943000103398, + "learning_rate": 5.3293552620883115e-06, + "loss": 0.4566, + "step": 3986 + }, + { + "epoch": 0.49, + "grad_norm": 4.2672149801902926, + "learning_rate": 5.327349881413632e-06, + "loss": 0.5328, + "step": 3987 + }, + { + "epoch": 0.49, + "grad_norm": 1.6192367900745737, + "learning_rate": 5.325344447852696e-06, + "loss": 0.508, + "step": 3988 + }, + { + "epoch": 0.5, + "grad_norm": 1.478967517170313, + "learning_rate": 5.323338961729499e-06, + "loss": 0.5139, + "step": 3989 + }, + { + "epoch": 0.5, + "grad_norm": 1.5728731128199156, + "learning_rate": 5.321333423368047e-06, + "loss": 0.5522, + "step": 3990 + }, + { + "epoch": 0.5, + "grad_norm": 1.3371364747051662, + "learning_rate": 5.319327833092348e-06, + "loss": 0.4853, + "step": 3991 + }, + { + "epoch": 0.5, + "grad_norm": 0.6996233064142633, + "learning_rate": 5.317322191226426e-06, + "loss": 0.4729, + "step": 3992 + }, + { + "epoch": 0.5, + "grad_norm": 1.4012552294803955, + "learning_rate": 5.3153164980943104e-06, + "loss": 0.5021, + "step": 3993 + }, + { + "epoch": 0.5, + "grad_norm": 1.4430297337479316, + "learning_rate": 5.313310754020037e-06, + "loss": 0.4965, + "step": 3994 + }, + { + "epoch": 0.5, + "grad_norm": 1.2621347231021145, + "learning_rate": 5.311304959327651e-06, + "loss": 0.4459, + "step": 3995 + }, + { + "epoch": 0.5, + "grad_norm": 1.2817572185185382, + "learning_rate": 5.309299114341209e-06, + "loss": 0.5423, + "step": 3996 + }, + { + "epoch": 0.5, + "grad_norm": 1.5992598858188514, + "learning_rate": 5.307293219384768e-06, + "loss": 0.4952, + "step": 3997 + }, + { + "epoch": 0.5, + "grad_norm": 1.3738706690239122, + "learning_rate": 5.305287274782403e-06, + "loss": 0.5252, + "step": 3998 + }, + { + "epoch": 0.5, + "grad_norm": 1.422877770604774, + "learning_rate": 5.303281280858189e-06, + "loss": 0.5104, + "step": 3999 + }, + { + "epoch": 0.5, + "grad_norm": 1.4768927119527948, + "learning_rate": 5.301275237936214e-06, + "loss": 0.5426, + "step": 4000 + }, + { + "epoch": 0.5, + "grad_norm": 1.3443790195812775, + "learning_rate": 5.29926914634057e-06, + "loss": 0.511, + "step": 4001 + }, + { + "epoch": 0.5, + "grad_norm": 0.7210003573811843, + "learning_rate": 5.29726300639536e-06, + "loss": 0.5112, + "step": 4002 + }, + { + "epoch": 0.5, + "grad_norm": 1.4934549647405275, + "learning_rate": 5.295256818424695e-06, + "loss": 0.5028, + "step": 4003 + }, + { + "epoch": 0.5, + "grad_norm": 1.5346083778331399, + "learning_rate": 5.293250582752689e-06, + "loss": 0.5074, + "step": 4004 + }, + { + "epoch": 0.5, + "grad_norm": 1.51676023911406, + "learning_rate": 5.291244299703469e-06, + "loss": 0.4786, + "step": 4005 + }, + { + "epoch": 0.5, + "grad_norm": 1.2407408253081538, + "learning_rate": 5.2892379696011665e-06, + "loss": 0.549, + "step": 4006 + }, + { + "epoch": 0.5, + "grad_norm": 1.482395526869878, + "learning_rate": 5.2872315927699235e-06, + "loss": 0.5327, + "step": 4007 + }, + { + "epoch": 0.5, + "grad_norm": 1.9826848283440506, + "learning_rate": 5.2852251695338865e-06, + "loss": 0.5153, + "step": 4008 + }, + { + "epoch": 0.5, + "grad_norm": 2.1060888524002794, + "learning_rate": 5.283218700217211e-06, + "loss": 0.5266, + "step": 4009 + }, + { + "epoch": 0.5, + "grad_norm": 1.4164346549674205, + "learning_rate": 5.28121218514406e-06, + "loss": 0.4965, + "step": 4010 + }, + { + "epoch": 0.5, + "grad_norm": 1.4669936305099687, + "learning_rate": 5.279205624638605e-06, + "loss": 0.4723, + "step": 4011 + }, + { + "epoch": 0.5, + "grad_norm": 1.4057306748118408, + "learning_rate": 5.277199019025022e-06, + "loss": 0.4843, + "step": 4012 + }, + { + "epoch": 0.5, + "grad_norm": 1.4158292097211684, + "learning_rate": 5.275192368627495e-06, + "loss": 0.5183, + "step": 4013 + }, + { + "epoch": 0.5, + "grad_norm": 1.347733607048643, + "learning_rate": 5.2731856737702195e-06, + "loss": 0.5201, + "step": 4014 + }, + { + "epoch": 0.5, + "grad_norm": 1.4507919841534922, + "learning_rate": 5.27117893477739e-06, + "loss": 0.4904, + "step": 4015 + }, + { + "epoch": 0.5, + "grad_norm": 1.351191256523609, + "learning_rate": 5.269172151973216e-06, + "loss": 0.5062, + "step": 4016 + }, + { + "epoch": 0.5, + "grad_norm": 1.8170541743140898, + "learning_rate": 5.26716532568191e-06, + "loss": 0.5185, + "step": 4017 + }, + { + "epoch": 0.5, + "grad_norm": 1.8948850199277016, + "learning_rate": 5.2651584562276935e-06, + "loss": 0.5088, + "step": 4018 + }, + { + "epoch": 0.5, + "grad_norm": 1.4895022409198124, + "learning_rate": 5.263151543934792e-06, + "loss": 0.4905, + "step": 4019 + }, + { + "epoch": 0.5, + "grad_norm": 1.2034136078269118, + "learning_rate": 5.261144589127441e-06, + "loss": 0.4706, + "step": 4020 + }, + { + "epoch": 0.5, + "grad_norm": 1.5148654943676552, + "learning_rate": 5.259137592129883e-06, + "loss": 0.5068, + "step": 4021 + }, + { + "epoch": 0.5, + "grad_norm": 1.7539595585635952, + "learning_rate": 5.257130553266364e-06, + "loss": 0.4596, + "step": 4022 + }, + { + "epoch": 0.5, + "grad_norm": 2.1276164530176103, + "learning_rate": 5.25512347286114e-06, + "loss": 0.4614, + "step": 4023 + }, + { + "epoch": 0.5, + "grad_norm": 1.293833851355961, + "learning_rate": 5.253116351238472e-06, + "loss": 0.4858, + "step": 4024 + }, + { + "epoch": 0.5, + "grad_norm": 1.488875806713043, + "learning_rate": 5.251109188722626e-06, + "loss": 0.5054, + "step": 4025 + }, + { + "epoch": 0.5, + "grad_norm": 1.436624795404598, + "learning_rate": 5.249101985637881e-06, + "loss": 0.5032, + "step": 4026 + }, + { + "epoch": 0.5, + "grad_norm": 1.621443769321251, + "learning_rate": 5.247094742308516e-06, + "loss": 0.4775, + "step": 4027 + }, + { + "epoch": 0.5, + "grad_norm": 1.9501526582620479, + "learning_rate": 5.24508745905882e-06, + "loss": 0.5201, + "step": 4028 + }, + { + "epoch": 0.5, + "grad_norm": 2.042610189384628, + "learning_rate": 5.243080136213085e-06, + "loss": 0.544, + "step": 4029 + }, + { + "epoch": 0.5, + "grad_norm": 1.5994079979309168, + "learning_rate": 5.241072774095615e-06, + "loss": 0.5709, + "step": 4030 + }, + { + "epoch": 0.5, + "grad_norm": 1.251606242450616, + "learning_rate": 5.239065373030713e-06, + "loss": 0.4817, + "step": 4031 + }, + { + "epoch": 0.5, + "grad_norm": 1.4523263770339998, + "learning_rate": 5.237057933342696e-06, + "loss": 0.4666, + "step": 4032 + }, + { + "epoch": 0.5, + "grad_norm": 1.3870858370349122, + "learning_rate": 5.235050455355881e-06, + "loss": 0.5148, + "step": 4033 + }, + { + "epoch": 0.5, + "grad_norm": 1.2956128600540535, + "learning_rate": 5.233042939394595e-06, + "loss": 0.5353, + "step": 4034 + }, + { + "epoch": 0.5, + "grad_norm": 1.5130642043818001, + "learning_rate": 5.2310353857831695e-06, + "loss": 0.5045, + "step": 4035 + }, + { + "epoch": 0.5, + "grad_norm": 1.4965739798426347, + "learning_rate": 5.229027794845944e-06, + "loss": 0.5177, + "step": 4036 + }, + { + "epoch": 0.5, + "grad_norm": 2.3940798964222623, + "learning_rate": 5.227020166907259e-06, + "loss": 0.5138, + "step": 4037 + }, + { + "epoch": 0.5, + "grad_norm": 1.216930430982062, + "learning_rate": 5.225012502291469e-06, + "loss": 0.4939, + "step": 4038 + }, + { + "epoch": 0.5, + "grad_norm": 1.4608752793636535, + "learning_rate": 5.223004801322926e-06, + "loss": 0.5409, + "step": 4039 + }, + { + "epoch": 0.5, + "grad_norm": 1.4213725178723116, + "learning_rate": 5.220997064325994e-06, + "loss": 0.5562, + "step": 4040 + }, + { + "epoch": 0.5, + "grad_norm": 1.3870879937572156, + "learning_rate": 5.21898929162504e-06, + "loss": 0.5173, + "step": 4041 + }, + { + "epoch": 0.5, + "grad_norm": 2.782760226940039, + "learning_rate": 5.2169814835444356e-06, + "loss": 0.4981, + "step": 4042 + }, + { + "epoch": 0.5, + "grad_norm": 0.6988563115846317, + "learning_rate": 5.214973640408563e-06, + "loss": 0.4882, + "step": 4043 + }, + { + "epoch": 0.5, + "grad_norm": 1.3902236234519256, + "learning_rate": 5.2129657625418055e-06, + "loss": 0.458, + "step": 4044 + }, + { + "epoch": 0.5, + "grad_norm": 1.3850148995382587, + "learning_rate": 5.2109578502685534e-06, + "loss": 0.5024, + "step": 4045 + }, + { + "epoch": 0.5, + "grad_norm": 1.3471538316223168, + "learning_rate": 5.208949903913201e-06, + "loss": 0.4689, + "step": 4046 + }, + { + "epoch": 0.5, + "grad_norm": 1.6714409613819254, + "learning_rate": 5.206941923800154e-06, + "loss": 0.5532, + "step": 4047 + }, + { + "epoch": 0.5, + "grad_norm": 1.2658631922999917, + "learning_rate": 5.2049339102538154e-06, + "loss": 0.4786, + "step": 4048 + }, + { + "epoch": 0.5, + "grad_norm": 1.511130438261548, + "learning_rate": 5.202925863598599e-06, + "loss": 0.5137, + "step": 4049 + }, + { + "epoch": 0.5, + "grad_norm": 1.3184918589934416, + "learning_rate": 5.200917784158921e-06, + "loss": 0.4591, + "step": 4050 + }, + { + "epoch": 0.5, + "grad_norm": 2.237884065088822, + "learning_rate": 5.1989096722592055e-06, + "loss": 0.4556, + "step": 4051 + }, + { + "epoch": 0.5, + "grad_norm": 1.2726858470097968, + "learning_rate": 5.1969015282238824e-06, + "loss": 0.4671, + "step": 4052 + }, + { + "epoch": 0.5, + "grad_norm": 1.4979136717841783, + "learning_rate": 5.1948933523773824e-06, + "loss": 0.5146, + "step": 4053 + }, + { + "epoch": 0.5, + "grad_norm": 1.503751302350897, + "learning_rate": 5.192885145044143e-06, + "loss": 0.449, + "step": 4054 + }, + { + "epoch": 0.5, + "grad_norm": 2.3001077377892787, + "learning_rate": 5.190876906548612e-06, + "loss": 0.4992, + "step": 4055 + }, + { + "epoch": 0.5, + "grad_norm": 1.35780055003118, + "learning_rate": 5.188868637215235e-06, + "loss": 0.5042, + "step": 4056 + }, + { + "epoch": 0.5, + "grad_norm": 1.8096393235457786, + "learning_rate": 5.186860337368468e-06, + "loss": 0.4997, + "step": 4057 + }, + { + "epoch": 0.5, + "grad_norm": 1.7976803429353938, + "learning_rate": 5.184852007332765e-06, + "loss": 0.5438, + "step": 4058 + }, + { + "epoch": 0.5, + "grad_norm": 1.4295847395483166, + "learning_rate": 5.182843647432593e-06, + "loss": 0.5195, + "step": 4059 + }, + { + "epoch": 0.5, + "grad_norm": 1.329781113500144, + "learning_rate": 5.180835257992419e-06, + "loss": 0.4692, + "step": 4060 + }, + { + "epoch": 0.5, + "grad_norm": 1.4198247587720287, + "learning_rate": 5.178826839336718e-06, + "loss": 0.5149, + "step": 4061 + }, + { + "epoch": 0.5, + "grad_norm": 1.45463475933709, + "learning_rate": 5.176818391789964e-06, + "loss": 0.4831, + "step": 4062 + }, + { + "epoch": 0.5, + "grad_norm": 1.405498695171607, + "learning_rate": 5.174809915676643e-06, + "loss": 0.5151, + "step": 4063 + }, + { + "epoch": 0.5, + "grad_norm": 1.8636479793019347, + "learning_rate": 5.17280141132124e-06, + "loss": 0.5218, + "step": 4064 + }, + { + "epoch": 0.5, + "grad_norm": 1.3187889761676121, + "learning_rate": 5.170792879048248e-06, + "loss": 0.5031, + "step": 4065 + }, + { + "epoch": 0.5, + "grad_norm": 1.2851291534308047, + "learning_rate": 5.168784319182161e-06, + "loss": 0.4685, + "step": 4066 + }, + { + "epoch": 0.5, + "grad_norm": 3.901671464065162, + "learning_rate": 5.166775732047481e-06, + "loss": 0.4776, + "step": 4067 + }, + { + "epoch": 0.5, + "grad_norm": 1.577965546141581, + "learning_rate": 5.164767117968713e-06, + "loss": 0.504, + "step": 4068 + }, + { + "epoch": 0.5, + "grad_norm": 2.611583306216855, + "learning_rate": 5.162758477270366e-06, + "loss": 0.5139, + "step": 4069 + }, + { + "epoch": 0.51, + "grad_norm": 1.4722771473445617, + "learning_rate": 5.160749810276952e-06, + "loss": 0.5049, + "step": 4070 + }, + { + "epoch": 0.51, + "grad_norm": 0.6841236873282883, + "learning_rate": 5.158741117312992e-06, + "loss": 0.4823, + "step": 4071 + }, + { + "epoch": 0.51, + "grad_norm": 1.7283913002786766, + "learning_rate": 5.156732398703007e-06, + "loss": 0.486, + "step": 4072 + }, + { + "epoch": 0.51, + "grad_norm": 1.3644169274208862, + "learning_rate": 5.154723654771522e-06, + "loss": 0.5316, + "step": 4073 + }, + { + "epoch": 0.51, + "grad_norm": 1.412978580938532, + "learning_rate": 5.1527148858430675e-06, + "loss": 0.5205, + "step": 4074 + }, + { + "epoch": 0.51, + "grad_norm": 1.304308989910714, + "learning_rate": 5.150706092242178e-06, + "loss": 0.5112, + "step": 4075 + }, + { + "epoch": 0.51, + "grad_norm": 1.5096581064631402, + "learning_rate": 5.148697274293392e-06, + "loss": 0.5137, + "step": 4076 + }, + { + "epoch": 0.51, + "grad_norm": 1.9507912798218543, + "learning_rate": 5.146688432321253e-06, + "loss": 0.5665, + "step": 4077 + }, + { + "epoch": 0.51, + "grad_norm": 1.3974637208424106, + "learning_rate": 5.144679566650306e-06, + "loss": 0.5266, + "step": 4078 + }, + { + "epoch": 0.51, + "grad_norm": 1.381186236440111, + "learning_rate": 5.1426706776050985e-06, + "loss": 0.4926, + "step": 4079 + }, + { + "epoch": 0.51, + "grad_norm": 1.2908982111875704, + "learning_rate": 5.140661765510187e-06, + "loss": 0.5299, + "step": 4080 + }, + { + "epoch": 0.51, + "grad_norm": 1.517278250782335, + "learning_rate": 5.138652830690129e-06, + "loss": 0.5097, + "step": 4081 + }, + { + "epoch": 0.51, + "grad_norm": 1.7679776693034879, + "learning_rate": 5.136643873469487e-06, + "loss": 0.536, + "step": 4082 + }, + { + "epoch": 0.51, + "grad_norm": 1.3584650382939063, + "learning_rate": 5.1346348941728215e-06, + "loss": 0.4941, + "step": 4083 + }, + { + "epoch": 0.51, + "grad_norm": 1.260066066177889, + "learning_rate": 5.132625893124704e-06, + "loss": 0.5343, + "step": 4084 + }, + { + "epoch": 0.51, + "grad_norm": 1.2417882558433795, + "learning_rate": 5.130616870649705e-06, + "loss": 0.5519, + "step": 4085 + }, + { + "epoch": 0.51, + "grad_norm": 1.4348244112899506, + "learning_rate": 5.1286078270724e-06, + "loss": 0.5182, + "step": 4086 + }, + { + "epoch": 0.51, + "grad_norm": 1.3761732902076147, + "learning_rate": 5.126598762717367e-06, + "loss": 0.5643, + "step": 4087 + }, + { + "epoch": 0.51, + "grad_norm": 7.093279684038697, + "learning_rate": 5.1245896779091905e-06, + "loss": 0.544, + "step": 4088 + }, + { + "epoch": 0.51, + "grad_norm": 1.1953773828090084, + "learning_rate": 5.122580572972453e-06, + "loss": 0.4644, + "step": 4089 + }, + { + "epoch": 0.51, + "grad_norm": 1.335699984289526, + "learning_rate": 5.120571448231746e-06, + "loss": 0.481, + "step": 4090 + }, + { + "epoch": 0.51, + "grad_norm": 1.539519553974143, + "learning_rate": 5.118562304011657e-06, + "loss": 0.5513, + "step": 4091 + }, + { + "epoch": 0.51, + "grad_norm": 1.3014018277586958, + "learning_rate": 5.116553140636788e-06, + "loss": 0.5649, + "step": 4092 + }, + { + "epoch": 0.51, + "grad_norm": 1.2963592808820954, + "learning_rate": 5.114543958431729e-06, + "loss": 0.5241, + "step": 4093 + }, + { + "epoch": 0.51, + "grad_norm": 1.3751884600265256, + "learning_rate": 5.112534757721086e-06, + "loss": 0.4718, + "step": 4094 + }, + { + "epoch": 0.51, + "grad_norm": 0.6805985594689253, + "learning_rate": 5.110525538829461e-06, + "loss": 0.5219, + "step": 4095 + }, + { + "epoch": 0.51, + "grad_norm": 0.7427122972421231, + "learning_rate": 5.108516302081461e-06, + "loss": 0.5227, + "step": 4096 + }, + { + "epoch": 0.51, + "grad_norm": 1.4777276883977413, + "learning_rate": 5.106507047801699e-06, + "loss": 0.4717, + "step": 4097 + }, + { + "epoch": 0.51, + "grad_norm": 1.4404767805534364, + "learning_rate": 5.104497776314784e-06, + "loss": 0.4985, + "step": 4098 + }, + { + "epoch": 0.51, + "grad_norm": 1.32514183192607, + "learning_rate": 5.102488487945332e-06, + "loss": 0.4796, + "step": 4099 + }, + { + "epoch": 0.51, + "grad_norm": 1.6300526573029417, + "learning_rate": 5.100479183017963e-06, + "loss": 0.5484, + "step": 4100 + }, + { + "epoch": 0.51, + "grad_norm": 1.506233040859698, + "learning_rate": 5.098469861857299e-06, + "loss": 0.4938, + "step": 4101 + }, + { + "epoch": 0.51, + "grad_norm": 1.5854909157924568, + "learning_rate": 5.09646052478796e-06, + "loss": 0.5012, + "step": 4102 + }, + { + "epoch": 0.51, + "grad_norm": 2.7334239667693274, + "learning_rate": 5.094451172134573e-06, + "loss": 0.5402, + "step": 4103 + }, + { + "epoch": 0.51, + "grad_norm": 1.421782694911038, + "learning_rate": 5.092441804221767e-06, + "loss": 0.5402, + "step": 4104 + }, + { + "epoch": 0.51, + "grad_norm": 1.2380410240615574, + "learning_rate": 5.090432421374175e-06, + "loss": 0.4782, + "step": 4105 + }, + { + "epoch": 0.51, + "grad_norm": 1.8233363976081143, + "learning_rate": 5.0884230239164274e-06, + "loss": 0.5275, + "step": 4106 + }, + { + "epoch": 0.51, + "grad_norm": 1.2157977078482316, + "learning_rate": 5.0864136121731614e-06, + "loss": 0.5032, + "step": 4107 + }, + { + "epoch": 0.51, + "grad_norm": 2.0628077114535106, + "learning_rate": 5.084404186469016e-06, + "loss": 0.529, + "step": 4108 + }, + { + "epoch": 0.51, + "grad_norm": 1.4673681464236095, + "learning_rate": 5.082394747128632e-06, + "loss": 0.5267, + "step": 4109 + }, + { + "epoch": 0.51, + "grad_norm": 1.405043504400975, + "learning_rate": 5.08038529447665e-06, + "loss": 0.4953, + "step": 4110 + }, + { + "epoch": 0.51, + "grad_norm": 1.426360350391599, + "learning_rate": 5.078375828837716e-06, + "loss": 0.5208, + "step": 4111 + }, + { + "epoch": 0.51, + "grad_norm": 1.8072437193277209, + "learning_rate": 5.0763663505364754e-06, + "loss": 0.5057, + "step": 4112 + }, + { + "epoch": 0.51, + "grad_norm": 1.3660490562593417, + "learning_rate": 5.07435685989758e-06, + "loss": 0.462, + "step": 4113 + }, + { + "epoch": 0.51, + "grad_norm": 1.5486600415029512, + "learning_rate": 5.072347357245678e-06, + "loss": 0.5372, + "step": 4114 + }, + { + "epoch": 0.51, + "grad_norm": 1.616974280163713, + "learning_rate": 5.0703378429054226e-06, + "loss": 0.5241, + "step": 4115 + }, + { + "epoch": 0.51, + "grad_norm": 1.4809366060080675, + "learning_rate": 5.06832831720147e-06, + "loss": 0.542, + "step": 4116 + }, + { + "epoch": 0.51, + "grad_norm": 2.8635109413540314, + "learning_rate": 5.066318780458476e-06, + "loss": 0.5301, + "step": 4117 + }, + { + "epoch": 0.51, + "grad_norm": 0.6926137434244817, + "learning_rate": 5.064309233001099e-06, + "loss": 0.4694, + "step": 4118 + }, + { + "epoch": 0.51, + "grad_norm": 1.1567850084651718, + "learning_rate": 5.062299675153999e-06, + "loss": 0.4616, + "step": 4119 + }, + { + "epoch": 0.51, + "grad_norm": 1.600980417226766, + "learning_rate": 5.0602901072418375e-06, + "loss": 0.4872, + "step": 4120 + }, + { + "epoch": 0.51, + "grad_norm": 1.6067968795720464, + "learning_rate": 5.058280529589279e-06, + "loss": 0.4697, + "step": 4121 + }, + { + "epoch": 0.51, + "grad_norm": 1.3858276131809577, + "learning_rate": 5.056270942520986e-06, + "loss": 0.5071, + "step": 4122 + }, + { + "epoch": 0.51, + "grad_norm": 1.5759322132167428, + "learning_rate": 5.054261346361628e-06, + "loss": 0.5134, + "step": 4123 + }, + { + "epoch": 0.51, + "grad_norm": 1.7641683615692467, + "learning_rate": 5.0522517414358705e-06, + "loss": 0.5418, + "step": 4124 + }, + { + "epoch": 0.51, + "grad_norm": 1.5814241502874156, + "learning_rate": 5.050242128068386e-06, + "loss": 0.453, + "step": 4125 + }, + { + "epoch": 0.51, + "grad_norm": 1.2912461088706648, + "learning_rate": 5.048232506583841e-06, + "loss": 0.4845, + "step": 4126 + }, + { + "epoch": 0.51, + "grad_norm": 1.2778937846886074, + "learning_rate": 5.046222877306911e-06, + "loss": 0.4403, + "step": 4127 + }, + { + "epoch": 0.51, + "grad_norm": 2.1456675490207737, + "learning_rate": 5.044213240562268e-06, + "loss": 0.5408, + "step": 4128 + }, + { + "epoch": 0.51, + "grad_norm": 1.6392616148549624, + "learning_rate": 5.042203596674586e-06, + "loss": 0.4769, + "step": 4129 + }, + { + "epoch": 0.51, + "grad_norm": 1.4748949968945793, + "learning_rate": 5.040193945968542e-06, + "loss": 0.507, + "step": 4130 + }, + { + "epoch": 0.51, + "grad_norm": 1.385296056790979, + "learning_rate": 5.038184288768813e-06, + "loss": 0.5147, + "step": 4131 + }, + { + "epoch": 0.51, + "grad_norm": 1.812264408884099, + "learning_rate": 5.036174625400073e-06, + "loss": 0.5232, + "step": 4132 + }, + { + "epoch": 0.51, + "grad_norm": 1.2807690983806912, + "learning_rate": 5.034164956187006e-06, + "loss": 0.53, + "step": 4133 + }, + { + "epoch": 0.51, + "grad_norm": 1.4308465971303546, + "learning_rate": 5.032155281454288e-06, + "loss": 0.5092, + "step": 4134 + }, + { + "epoch": 0.51, + "grad_norm": 1.2571579009281904, + "learning_rate": 5.030145601526603e-06, + "loss": 0.5141, + "step": 4135 + }, + { + "epoch": 0.51, + "grad_norm": 3.4038444686476588, + "learning_rate": 5.028135916728628e-06, + "loss": 0.4968, + "step": 4136 + }, + { + "epoch": 0.51, + "grad_norm": 1.5252270252095017, + "learning_rate": 5.02612622738505e-06, + "loss": 0.4903, + "step": 4137 + }, + { + "epoch": 0.51, + "grad_norm": 1.8263454315055303, + "learning_rate": 5.024116533820549e-06, + "loss": 0.5004, + "step": 4138 + }, + { + "epoch": 0.51, + "grad_norm": 2.1238635399425574, + "learning_rate": 5.02210683635981e-06, + "loss": 0.516, + "step": 4139 + }, + { + "epoch": 0.51, + "grad_norm": 1.491279320066013, + "learning_rate": 5.020097135327515e-06, + "loss": 0.4862, + "step": 4140 + }, + { + "epoch": 0.51, + "grad_norm": 1.3561697321845458, + "learning_rate": 5.018087431048353e-06, + "loss": 0.4839, + "step": 4141 + }, + { + "epoch": 0.51, + "grad_norm": 1.375828041356734, + "learning_rate": 5.016077723847006e-06, + "loss": 0.5209, + "step": 4142 + }, + { + "epoch": 0.51, + "grad_norm": 1.5082826103778804, + "learning_rate": 5.0140680140481625e-06, + "loss": 0.546, + "step": 4143 + }, + { + "epoch": 0.51, + "grad_norm": 1.4036655538213862, + "learning_rate": 5.012058301976505e-06, + "loss": 0.5226, + "step": 4144 + }, + { + "epoch": 0.51, + "grad_norm": 1.3933326710765965, + "learning_rate": 5.010048587956724e-06, + "loss": 0.4945, + "step": 4145 + }, + { + "epoch": 0.51, + "grad_norm": 1.506118935176337, + "learning_rate": 5.008038872313506e-06, + "loss": 0.5511, + "step": 4146 + }, + { + "epoch": 0.51, + "grad_norm": 1.5381169525014873, + "learning_rate": 5.006029155371538e-06, + "loss": 0.5581, + "step": 4147 + }, + { + "epoch": 0.51, + "grad_norm": 1.2389131059981193, + "learning_rate": 5.004019437455504e-06, + "loss": 0.4614, + "step": 4148 + }, + { + "epoch": 0.51, + "grad_norm": 1.3076621463048856, + "learning_rate": 5.0020097188900965e-06, + "loss": 0.5189, + "step": 4149 + }, + { + "epoch": 0.52, + "grad_norm": 1.5725310274284423, + "learning_rate": 5e-06, + "loss": 0.469, + "step": 4150 + }, + { + "epoch": 0.52, + "grad_norm": 1.4551892770544779, + "learning_rate": 4.997990281109905e-06, + "loss": 0.5506, + "step": 4151 + }, + { + "epoch": 0.52, + "grad_norm": 1.5266131165054042, + "learning_rate": 4.995980562544497e-06, + "loss": 0.5118, + "step": 4152 + }, + { + "epoch": 0.52, + "grad_norm": 1.4304922054207752, + "learning_rate": 4.993970844628464e-06, + "loss": 0.5216, + "step": 4153 + }, + { + "epoch": 0.52, + "grad_norm": 1.3661929147010907, + "learning_rate": 4.9919611276864956e-06, + "loss": 0.4755, + "step": 4154 + }, + { + "epoch": 0.52, + "grad_norm": 1.5063251153366095, + "learning_rate": 4.989951412043276e-06, + "loss": 0.4918, + "step": 4155 + }, + { + "epoch": 0.52, + "grad_norm": 1.3266955956864537, + "learning_rate": 4.987941698023495e-06, + "loss": 0.5007, + "step": 4156 + }, + { + "epoch": 0.52, + "grad_norm": 1.594738069327744, + "learning_rate": 4.985931985951839e-06, + "loss": 0.5322, + "step": 4157 + }, + { + "epoch": 0.52, + "grad_norm": 1.3599941087299574, + "learning_rate": 4.983922276152995e-06, + "loss": 0.5543, + "step": 4158 + }, + { + "epoch": 0.52, + "grad_norm": 1.6960652624546686, + "learning_rate": 4.981912568951649e-06, + "loss": 0.4587, + "step": 4159 + }, + { + "epoch": 0.52, + "grad_norm": 1.6547482077030211, + "learning_rate": 4.979902864672486e-06, + "loss": 0.5651, + "step": 4160 + }, + { + "epoch": 0.52, + "grad_norm": 1.4706748716930576, + "learning_rate": 4.977893163640193e-06, + "loss": 0.4898, + "step": 4161 + }, + { + "epoch": 0.52, + "grad_norm": 1.4434440513618652, + "learning_rate": 4.975883466179453e-06, + "loss": 0.4831, + "step": 4162 + }, + { + "epoch": 0.52, + "grad_norm": 1.9645279976236119, + "learning_rate": 4.973873772614952e-06, + "loss": 0.4389, + "step": 4163 + }, + { + "epoch": 0.52, + "grad_norm": 0.7249626792549597, + "learning_rate": 4.9718640832713725e-06, + "loss": 0.5114, + "step": 4164 + }, + { + "epoch": 0.52, + "grad_norm": 1.7902153666586336, + "learning_rate": 4.9698543984733995e-06, + "loss": 0.5163, + "step": 4165 + }, + { + "epoch": 0.52, + "grad_norm": 1.784462551091202, + "learning_rate": 4.967844718545713e-06, + "loss": 0.4856, + "step": 4166 + }, + { + "epoch": 0.52, + "grad_norm": 5.968567016437116, + "learning_rate": 4.965835043812996e-06, + "loss": 0.5301, + "step": 4167 + }, + { + "epoch": 0.52, + "grad_norm": 1.415022168086972, + "learning_rate": 4.963825374599929e-06, + "loss": 0.495, + "step": 4168 + }, + { + "epoch": 0.52, + "grad_norm": 1.6551021929136456, + "learning_rate": 4.96181571123119e-06, + "loss": 0.4626, + "step": 4169 + }, + { + "epoch": 0.52, + "grad_norm": 1.2770436177440663, + "learning_rate": 4.959806054031459e-06, + "loss": 0.4402, + "step": 4170 + }, + { + "epoch": 0.52, + "grad_norm": 1.4890483786874007, + "learning_rate": 4.957796403325415e-06, + "loss": 0.535, + "step": 4171 + }, + { + "epoch": 0.52, + "grad_norm": 0.6292700008320747, + "learning_rate": 4.955786759437733e-06, + "loss": 0.4561, + "step": 4172 + }, + { + "epoch": 0.52, + "grad_norm": 1.3794138507763596, + "learning_rate": 4.9537771226930895e-06, + "loss": 0.5358, + "step": 4173 + }, + { + "epoch": 0.52, + "grad_norm": 1.2242660546252222, + "learning_rate": 4.9517674934161595e-06, + "loss": 0.4633, + "step": 4174 + }, + { + "epoch": 0.52, + "grad_norm": 1.644565449493744, + "learning_rate": 4.949757871931616e-06, + "loss": 0.4738, + "step": 4175 + }, + { + "epoch": 0.52, + "grad_norm": 1.5253593098849336, + "learning_rate": 4.94774825856413e-06, + "loss": 0.5339, + "step": 4176 + }, + { + "epoch": 0.52, + "grad_norm": 1.691103468736044, + "learning_rate": 4.945738653638374e-06, + "loss": 0.4447, + "step": 4177 + }, + { + "epoch": 0.52, + "grad_norm": 1.3440088062239994, + "learning_rate": 4.943729057479016e-06, + "loss": 0.5049, + "step": 4178 + }, + { + "epoch": 0.52, + "grad_norm": 1.55358748969524, + "learning_rate": 4.941719470410722e-06, + "loss": 0.5487, + "step": 4179 + }, + { + "epoch": 0.52, + "grad_norm": 1.6662063178996205, + "learning_rate": 4.9397098927581625e-06, + "loss": 0.5347, + "step": 4180 + }, + { + "epoch": 0.52, + "grad_norm": 2.1629719586070864, + "learning_rate": 4.937700324846002e-06, + "loss": 0.5606, + "step": 4181 + }, + { + "epoch": 0.52, + "grad_norm": 1.7344846885416572, + "learning_rate": 4.935690766998902e-06, + "loss": 0.5023, + "step": 4182 + }, + { + "epoch": 0.52, + "grad_norm": 1.9817220234093216, + "learning_rate": 4.9336812195415256e-06, + "loss": 0.452, + "step": 4183 + }, + { + "epoch": 0.52, + "grad_norm": 1.3397533477646433, + "learning_rate": 4.931671682798532e-06, + "loss": 0.4803, + "step": 4184 + }, + { + "epoch": 0.52, + "grad_norm": 1.7432222684048106, + "learning_rate": 4.929662157094579e-06, + "loss": 0.5058, + "step": 4185 + }, + { + "epoch": 0.52, + "grad_norm": 1.7210622686065686, + "learning_rate": 4.9276526427543246e-06, + "loss": 0.506, + "step": 4186 + }, + { + "epoch": 0.52, + "grad_norm": 1.4401480942162428, + "learning_rate": 4.925643140102421e-06, + "loss": 0.4744, + "step": 4187 + }, + { + "epoch": 0.52, + "grad_norm": 1.3887530249461608, + "learning_rate": 4.9236336494635245e-06, + "loss": 0.4719, + "step": 4188 + }, + { + "epoch": 0.52, + "grad_norm": 1.6669286955765157, + "learning_rate": 4.921624171162285e-06, + "loss": 0.5595, + "step": 4189 + }, + { + "epoch": 0.52, + "grad_norm": 1.3289537902911408, + "learning_rate": 4.919614705523352e-06, + "loss": 0.5222, + "step": 4190 + }, + { + "epoch": 0.52, + "grad_norm": 0.6561483477939754, + "learning_rate": 4.91760525287137e-06, + "loss": 0.4604, + "step": 4191 + }, + { + "epoch": 0.52, + "grad_norm": 1.2549264933188877, + "learning_rate": 4.915595813530985e-06, + "loss": 0.4916, + "step": 4192 + }, + { + "epoch": 0.52, + "grad_norm": 1.337661073623808, + "learning_rate": 4.913586387826839e-06, + "loss": 0.5058, + "step": 4193 + }, + { + "epoch": 0.52, + "grad_norm": 1.7660930060464695, + "learning_rate": 4.911576976083574e-06, + "loss": 0.4937, + "step": 4194 + }, + { + "epoch": 0.52, + "grad_norm": 3.4433537947099384, + "learning_rate": 4.909567578625828e-06, + "loss": 0.5052, + "step": 4195 + }, + { + "epoch": 0.52, + "grad_norm": 1.5505560786292858, + "learning_rate": 4.907558195778233e-06, + "loss": 0.4929, + "step": 4196 + }, + { + "epoch": 0.52, + "grad_norm": 3.0204960000285306, + "learning_rate": 4.905548827865428e-06, + "loss": 0.4991, + "step": 4197 + }, + { + "epoch": 0.52, + "grad_norm": 1.2185001831453082, + "learning_rate": 4.903539475212042e-06, + "loss": 0.4375, + "step": 4198 + }, + { + "epoch": 0.52, + "grad_norm": 1.3643582538915, + "learning_rate": 4.9015301381427024e-06, + "loss": 0.4703, + "step": 4199 + }, + { + "epoch": 0.52, + "grad_norm": 1.8839576911566032, + "learning_rate": 4.899520816982038e-06, + "loss": 0.5241, + "step": 4200 + }, + { + "epoch": 0.52, + "grad_norm": 1.3994381417308601, + "learning_rate": 4.8975115120546696e-06, + "loss": 0.4646, + "step": 4201 + }, + { + "epoch": 0.52, + "grad_norm": 1.495212091403681, + "learning_rate": 4.895502223685219e-06, + "loss": 0.5456, + "step": 4202 + }, + { + "epoch": 0.52, + "grad_norm": 1.6878448599306082, + "learning_rate": 4.8934929521983045e-06, + "loss": 0.5805, + "step": 4203 + }, + { + "epoch": 0.52, + "grad_norm": 1.4437728881272724, + "learning_rate": 4.891483697918539e-06, + "loss": 0.479, + "step": 4204 + }, + { + "epoch": 0.52, + "grad_norm": 1.7700415243388306, + "learning_rate": 4.88947446117054e-06, + "loss": 0.4935, + "step": 4205 + }, + { + "epoch": 0.52, + "grad_norm": 2.5026598746843285, + "learning_rate": 4.887465242278915e-06, + "loss": 0.5104, + "step": 4206 + }, + { + "epoch": 0.52, + "grad_norm": 1.2791913508237092, + "learning_rate": 4.885456041568272e-06, + "loss": 0.4952, + "step": 4207 + }, + { + "epoch": 0.52, + "grad_norm": 1.3321160626968813, + "learning_rate": 4.883446859363215e-06, + "loss": 0.5013, + "step": 4208 + }, + { + "epoch": 0.52, + "grad_norm": 0.6531483540609091, + "learning_rate": 4.881437695988344e-06, + "loss": 0.5052, + "step": 4209 + }, + { + "epoch": 0.52, + "grad_norm": 1.8734086791695874, + "learning_rate": 4.8794285517682565e-06, + "loss": 0.5431, + "step": 4210 + }, + { + "epoch": 0.52, + "grad_norm": 1.6352629459090195, + "learning_rate": 4.877419427027548e-06, + "loss": 0.4828, + "step": 4211 + }, + { + "epoch": 0.52, + "grad_norm": 1.498449057211703, + "learning_rate": 4.87541032209081e-06, + "loss": 0.5139, + "step": 4212 + }, + { + "epoch": 0.52, + "grad_norm": 1.4920133056165215, + "learning_rate": 4.873401237282634e-06, + "loss": 0.5033, + "step": 4213 + }, + { + "epoch": 0.52, + "grad_norm": 1.4502112098248143, + "learning_rate": 4.8713921729276015e-06, + "loss": 0.5103, + "step": 4214 + }, + { + "epoch": 0.52, + "grad_norm": 0.6614611334638816, + "learning_rate": 4.869383129350297e-06, + "loss": 0.4647, + "step": 4215 + }, + { + "epoch": 0.52, + "grad_norm": 0.7022236557980238, + "learning_rate": 4.867374106875298e-06, + "loss": 0.4935, + "step": 4216 + }, + { + "epoch": 0.52, + "grad_norm": 1.7900496633745702, + "learning_rate": 4.86536510582718e-06, + "loss": 0.5193, + "step": 4217 + }, + { + "epoch": 0.52, + "grad_norm": 1.3757970981568506, + "learning_rate": 4.8633561265305156e-06, + "loss": 0.4933, + "step": 4218 + }, + { + "epoch": 0.52, + "grad_norm": 1.868217808906103, + "learning_rate": 4.8613471693098724e-06, + "loss": 0.5326, + "step": 4219 + }, + { + "epoch": 0.52, + "grad_norm": 1.2677455892833875, + "learning_rate": 4.859338234489813e-06, + "loss": 0.5134, + "step": 4220 + }, + { + "epoch": 0.52, + "grad_norm": 1.3549139939818315, + "learning_rate": 4.857329322394902e-06, + "loss": 0.5593, + "step": 4221 + }, + { + "epoch": 0.52, + "grad_norm": 1.7574557695669282, + "learning_rate": 4.8553204333496965e-06, + "loss": 0.5168, + "step": 4222 + }, + { + "epoch": 0.52, + "grad_norm": 1.4457527589125292, + "learning_rate": 4.853311567678748e-06, + "loss": 0.5191, + "step": 4223 + }, + { + "epoch": 0.52, + "grad_norm": 1.4256473720212115, + "learning_rate": 4.8513027257066085e-06, + "loss": 0.4653, + "step": 4224 + }, + { + "epoch": 0.52, + "grad_norm": 1.6287755213141875, + "learning_rate": 4.849293907757823e-06, + "loss": 0.5237, + "step": 4225 + }, + { + "epoch": 0.52, + "grad_norm": 1.655830538154988, + "learning_rate": 4.847285114156934e-06, + "loss": 0.5446, + "step": 4226 + }, + { + "epoch": 0.52, + "grad_norm": 2.4664096866038583, + "learning_rate": 4.84527634522848e-06, + "loss": 0.5061, + "step": 4227 + }, + { + "epoch": 0.52, + "grad_norm": 1.6320403680583409, + "learning_rate": 4.843267601296994e-06, + "loss": 0.5454, + "step": 4228 + }, + { + "epoch": 0.52, + "grad_norm": 2.450134674129634, + "learning_rate": 4.8412588826870075e-06, + "loss": 0.5739, + "step": 4229 + }, + { + "epoch": 0.52, + "grad_norm": 0.6707231135387525, + "learning_rate": 4.839250189723048e-06, + "loss": 0.5417, + "step": 4230 + }, + { + "epoch": 0.53, + "grad_norm": 2.1014948435593066, + "learning_rate": 4.8372415227296355e-06, + "loss": 0.4824, + "step": 4231 + }, + { + "epoch": 0.53, + "grad_norm": 1.5850577118025315, + "learning_rate": 4.835232882031288e-06, + "loss": 0.5486, + "step": 4232 + }, + { + "epoch": 0.53, + "grad_norm": 1.363920654716586, + "learning_rate": 4.83322426795252e-06, + "loss": 0.5245, + "step": 4233 + }, + { + "epoch": 0.53, + "grad_norm": 1.2817202214537684, + "learning_rate": 4.8312156808178405e-06, + "loss": 0.5268, + "step": 4234 + }, + { + "epoch": 0.53, + "grad_norm": 1.5467511661175979, + "learning_rate": 4.829207120951754e-06, + "loss": 0.4896, + "step": 4235 + }, + { + "epoch": 0.53, + "grad_norm": 1.147514907824803, + "learning_rate": 4.827198588678761e-06, + "loss": 0.4676, + "step": 4236 + }, + { + "epoch": 0.53, + "grad_norm": 1.349527074343018, + "learning_rate": 4.825190084323358e-06, + "loss": 0.5278, + "step": 4237 + }, + { + "epoch": 0.53, + "grad_norm": 1.2386225759661265, + "learning_rate": 4.823181608210036e-06, + "loss": 0.4553, + "step": 4238 + }, + { + "epoch": 0.53, + "grad_norm": 1.3773870018897933, + "learning_rate": 4.821173160663284e-06, + "loss": 0.4915, + "step": 4239 + }, + { + "epoch": 0.53, + "grad_norm": 1.337943712069457, + "learning_rate": 4.819164742007582e-06, + "loss": 0.4422, + "step": 4240 + }, + { + "epoch": 0.53, + "grad_norm": 2.50843833816639, + "learning_rate": 4.817156352567409e-06, + "loss": 0.4631, + "step": 4241 + }, + { + "epoch": 0.53, + "grad_norm": 1.888360839786836, + "learning_rate": 4.815147992667237e-06, + "loss": 0.4285, + "step": 4242 + }, + { + "epoch": 0.53, + "grad_norm": 1.4492756322046154, + "learning_rate": 4.813139662631535e-06, + "loss": 0.5241, + "step": 4243 + }, + { + "epoch": 0.53, + "grad_norm": 1.597220166897354, + "learning_rate": 4.811131362784766e-06, + "loss": 0.4924, + "step": 4244 + }, + { + "epoch": 0.53, + "grad_norm": 1.6341422036164526, + "learning_rate": 4.809123093451388e-06, + "loss": 0.4793, + "step": 4245 + }, + { + "epoch": 0.53, + "grad_norm": 1.381380178994785, + "learning_rate": 4.807114854955856e-06, + "loss": 0.5316, + "step": 4246 + }, + { + "epoch": 0.53, + "grad_norm": 0.6357430017650092, + "learning_rate": 4.805106647622619e-06, + "loss": 0.5142, + "step": 4247 + }, + { + "epoch": 0.53, + "grad_norm": 1.323250740425993, + "learning_rate": 4.803098471776119e-06, + "loss": 0.5756, + "step": 4248 + }, + { + "epoch": 0.53, + "grad_norm": 1.2960946149258648, + "learning_rate": 4.801090327740795e-06, + "loss": 0.517, + "step": 4249 + }, + { + "epoch": 0.53, + "grad_norm": 1.3783769541175654, + "learning_rate": 4.799082215841081e-06, + "loss": 0.4731, + "step": 4250 + }, + { + "epoch": 0.53, + "grad_norm": 1.4238922609555158, + "learning_rate": 4.797074136401403e-06, + "loss": 0.5305, + "step": 4251 + }, + { + "epoch": 0.53, + "grad_norm": 2.3187213899984527, + "learning_rate": 4.795066089746187e-06, + "loss": 0.5133, + "step": 4252 + }, + { + "epoch": 0.53, + "grad_norm": 1.5319986153504062, + "learning_rate": 4.793058076199847e-06, + "loss": 0.4949, + "step": 4253 + }, + { + "epoch": 0.53, + "grad_norm": 1.3313211380770655, + "learning_rate": 4.791050096086799e-06, + "loss": 0.5175, + "step": 4254 + }, + { + "epoch": 0.53, + "grad_norm": 1.661121794113625, + "learning_rate": 4.789042149731448e-06, + "loss": 0.5314, + "step": 4255 + }, + { + "epoch": 0.53, + "grad_norm": 1.8520098290199518, + "learning_rate": 4.787034237458195e-06, + "loss": 0.5665, + "step": 4256 + }, + { + "epoch": 0.53, + "grad_norm": 1.4383615798536171, + "learning_rate": 4.785026359591438e-06, + "loss": 0.4781, + "step": 4257 + }, + { + "epoch": 0.53, + "grad_norm": 2.36015549004612, + "learning_rate": 4.783018516455565e-06, + "loss": 0.5016, + "step": 4258 + }, + { + "epoch": 0.53, + "grad_norm": 2.8240998590538453, + "learning_rate": 4.781010708374963e-06, + "loss": 0.5126, + "step": 4259 + }, + { + "epoch": 0.53, + "grad_norm": 1.4355718796301644, + "learning_rate": 4.779002935674008e-06, + "loss": 0.5586, + "step": 4260 + }, + { + "epoch": 0.53, + "grad_norm": 0.7266807431996253, + "learning_rate": 4.776995198677075e-06, + "loss": 0.5162, + "step": 4261 + }, + { + "epoch": 0.53, + "grad_norm": 1.2775623472876219, + "learning_rate": 4.774987497708533e-06, + "loss": 0.4685, + "step": 4262 + }, + { + "epoch": 0.53, + "grad_norm": 2.307364784532959, + "learning_rate": 4.7729798330927415e-06, + "loss": 0.4875, + "step": 4263 + }, + { + "epoch": 0.53, + "grad_norm": 1.393824074495462, + "learning_rate": 4.770972205154058e-06, + "loss": 0.4782, + "step": 4264 + }, + { + "epoch": 0.53, + "grad_norm": 1.7554410138205054, + "learning_rate": 4.768964614216831e-06, + "loss": 0.4668, + "step": 4265 + }, + { + "epoch": 0.53, + "grad_norm": 2.3652602730204615, + "learning_rate": 4.7669570606054066e-06, + "loss": 0.5051, + "step": 4266 + }, + { + "epoch": 0.53, + "grad_norm": 1.4637991891015811, + "learning_rate": 4.764949544644121e-06, + "loss": 0.5407, + "step": 4267 + }, + { + "epoch": 0.53, + "grad_norm": 1.2055380948074947, + "learning_rate": 4.7629420666573065e-06, + "loss": 0.4802, + "step": 4268 + }, + { + "epoch": 0.53, + "grad_norm": 1.3646970283412756, + "learning_rate": 4.760934626969289e-06, + "loss": 0.464, + "step": 4269 + }, + { + "epoch": 0.53, + "grad_norm": 1.2475512160334674, + "learning_rate": 4.7589272259043875e-06, + "loss": 0.4824, + "step": 4270 + }, + { + "epoch": 0.53, + "grad_norm": 1.382321088190782, + "learning_rate": 4.756919863786916e-06, + "loss": 0.4819, + "step": 4271 + }, + { + "epoch": 0.53, + "grad_norm": 1.2062445168984832, + "learning_rate": 4.754912540941182e-06, + "loss": 0.5103, + "step": 4272 + }, + { + "epoch": 0.53, + "grad_norm": 1.3685375269870788, + "learning_rate": 4.752905257691485e-06, + "loss": 0.4959, + "step": 4273 + }, + { + "epoch": 0.53, + "grad_norm": 0.6638309922683546, + "learning_rate": 4.7508980143621205e-06, + "loss": 0.4928, + "step": 4274 + }, + { + "epoch": 0.53, + "grad_norm": 1.4235012573773838, + "learning_rate": 4.7488908112773755e-06, + "loss": 0.4925, + "step": 4275 + }, + { + "epoch": 0.53, + "grad_norm": 1.6268177268925723, + "learning_rate": 4.746883648761531e-06, + "loss": 0.4982, + "step": 4276 + }, + { + "epoch": 0.53, + "grad_norm": 1.4566520518954882, + "learning_rate": 4.744876527138863e-06, + "loss": 0.5318, + "step": 4277 + }, + { + "epoch": 0.53, + "grad_norm": 1.4686807392611485, + "learning_rate": 4.742869446733636e-06, + "loss": 0.4866, + "step": 4278 + }, + { + "epoch": 0.53, + "grad_norm": 1.3052234583127929, + "learning_rate": 4.740862407870118e-06, + "loss": 0.5233, + "step": 4279 + }, + { + "epoch": 0.53, + "grad_norm": 1.4928138119168188, + "learning_rate": 4.7388554108725594e-06, + "loss": 0.4965, + "step": 4280 + }, + { + "epoch": 0.53, + "grad_norm": 1.3119229364651537, + "learning_rate": 4.7368484560652085e-06, + "loss": 0.4925, + "step": 4281 + }, + { + "epoch": 0.53, + "grad_norm": 1.8819516435959767, + "learning_rate": 4.734841543772308e-06, + "loss": 0.5072, + "step": 4282 + }, + { + "epoch": 0.53, + "grad_norm": 1.3552684872325136, + "learning_rate": 4.732834674318091e-06, + "loss": 0.4941, + "step": 4283 + }, + { + "epoch": 0.53, + "grad_norm": 1.502677241539126, + "learning_rate": 4.7308278480267865e-06, + "loss": 0.4369, + "step": 4284 + }, + { + "epoch": 0.53, + "grad_norm": 1.28365550736728, + "learning_rate": 4.728821065222612e-06, + "loss": 0.4956, + "step": 4285 + }, + { + "epoch": 0.53, + "grad_norm": 1.4078836243131156, + "learning_rate": 4.726814326229781e-06, + "loss": 0.4885, + "step": 4286 + }, + { + "epoch": 0.53, + "grad_norm": 1.3401141295793377, + "learning_rate": 4.724807631372505e-06, + "loss": 0.4562, + "step": 4287 + }, + { + "epoch": 0.53, + "grad_norm": 1.6390815902227274, + "learning_rate": 4.722800980974979e-06, + "loss": 0.4953, + "step": 4288 + }, + { + "epoch": 0.53, + "grad_norm": 1.6719051916157315, + "learning_rate": 4.720794375361397e-06, + "loss": 0.4831, + "step": 4289 + }, + { + "epoch": 0.53, + "grad_norm": 1.3668822128904665, + "learning_rate": 4.718787814855942e-06, + "loss": 0.509, + "step": 4290 + }, + { + "epoch": 0.53, + "grad_norm": 1.2587487927128795, + "learning_rate": 4.716781299782791e-06, + "loss": 0.4483, + "step": 4291 + }, + { + "epoch": 0.53, + "grad_norm": 1.4839484448291642, + "learning_rate": 4.714774830466116e-06, + "loss": 0.5056, + "step": 4292 + }, + { + "epoch": 0.53, + "grad_norm": 0.6584679793549012, + "learning_rate": 4.71276840723008e-06, + "loss": 0.5071, + "step": 4293 + }, + { + "epoch": 0.53, + "grad_norm": 1.4562233356371959, + "learning_rate": 4.7107620303988335e-06, + "loss": 0.5153, + "step": 4294 + }, + { + "epoch": 0.53, + "grad_norm": 1.2737569846521757, + "learning_rate": 4.708755700296532e-06, + "loss": 0.4427, + "step": 4295 + }, + { + "epoch": 0.53, + "grad_norm": 1.3727841747162295, + "learning_rate": 4.706749417247312e-06, + "loss": 0.4871, + "step": 4296 + }, + { + "epoch": 0.53, + "grad_norm": 1.5004984388484188, + "learning_rate": 4.704743181575306e-06, + "loss": 0.4623, + "step": 4297 + }, + { + "epoch": 0.53, + "grad_norm": 1.4457080763319423, + "learning_rate": 4.7027369936046415e-06, + "loss": 0.4888, + "step": 4298 + }, + { + "epoch": 0.53, + "grad_norm": 1.476081348114596, + "learning_rate": 4.700730853659432e-06, + "loss": 0.4998, + "step": 4299 + }, + { + "epoch": 0.53, + "grad_norm": 1.3457600490722774, + "learning_rate": 4.698724762063789e-06, + "loss": 0.4999, + "step": 4300 + }, + { + "epoch": 0.53, + "grad_norm": 1.5710943313782562, + "learning_rate": 4.696718719141813e-06, + "loss": 0.5094, + "step": 4301 + }, + { + "epoch": 0.53, + "grad_norm": 2.023508681986042, + "learning_rate": 4.694712725217598e-06, + "loss": 0.5668, + "step": 4302 + }, + { + "epoch": 0.53, + "grad_norm": 1.6785659078411073, + "learning_rate": 4.692706780615232e-06, + "loss": 0.5215, + "step": 4303 + }, + { + "epoch": 0.53, + "grad_norm": 1.3301372259571647, + "learning_rate": 4.690700885658793e-06, + "loss": 0.4892, + "step": 4304 + }, + { + "epoch": 0.53, + "grad_norm": 1.942794004902333, + "learning_rate": 4.68869504067235e-06, + "loss": 0.5059, + "step": 4305 + }, + { + "epoch": 0.53, + "grad_norm": 1.3480045867939938, + "learning_rate": 4.686689245979965e-06, + "loss": 0.5091, + "step": 4306 + }, + { + "epoch": 0.53, + "grad_norm": 1.4420165452073062, + "learning_rate": 4.68468350190569e-06, + "loss": 0.48, + "step": 4307 + }, + { + "epoch": 0.53, + "grad_norm": 1.4199701993205618, + "learning_rate": 4.682677808773576e-06, + "loss": 0.5201, + "step": 4308 + }, + { + "epoch": 0.53, + "grad_norm": 1.2548107192351894, + "learning_rate": 4.680672166907654e-06, + "loss": 0.4661, + "step": 4309 + }, + { + "epoch": 0.53, + "grad_norm": 1.5109225918516362, + "learning_rate": 4.678666576631956e-06, + "loss": 0.5373, + "step": 4310 + }, + { + "epoch": 0.53, + "grad_norm": 1.2714281680666961, + "learning_rate": 4.676661038270501e-06, + "loss": 0.5001, + "step": 4311 + }, + { + "epoch": 0.54, + "grad_norm": 1.3277386522117836, + "learning_rate": 4.674655552147305e-06, + "loss": 0.5494, + "step": 4312 + }, + { + "epoch": 0.54, + "grad_norm": 1.4401191668265765, + "learning_rate": 4.6726501185863694e-06, + "loss": 0.4942, + "step": 4313 + }, + { + "epoch": 0.54, + "grad_norm": 1.3129932472587458, + "learning_rate": 4.67064473791169e-06, + "loss": 0.5189, + "step": 4314 + }, + { + "epoch": 0.54, + "grad_norm": 1.5058725829686856, + "learning_rate": 4.668639410447255e-06, + "loss": 0.4832, + "step": 4315 + }, + { + "epoch": 0.54, + "grad_norm": 1.5243590945691707, + "learning_rate": 4.666634136517041e-06, + "loss": 0.4935, + "step": 4316 + }, + { + "epoch": 0.54, + "grad_norm": 2.614027731511778, + "learning_rate": 4.664628916445018e-06, + "loss": 0.547, + "step": 4317 + }, + { + "epoch": 0.54, + "grad_norm": 1.7247810144381126, + "learning_rate": 4.662623750555149e-06, + "loss": 0.5399, + "step": 4318 + }, + { + "epoch": 0.54, + "grad_norm": 1.5402121533125563, + "learning_rate": 4.6606186391713805e-06, + "loss": 0.4774, + "step": 4319 + }, + { + "epoch": 0.54, + "grad_norm": 1.282588310149704, + "learning_rate": 4.6586135826176625e-06, + "loss": 0.533, + "step": 4320 + }, + { + "epoch": 0.54, + "grad_norm": 2.6803085872164045, + "learning_rate": 4.6566085812179265e-06, + "loss": 0.5164, + "step": 4321 + }, + { + "epoch": 0.54, + "grad_norm": 1.3154255919996705, + "learning_rate": 4.654603635296098e-06, + "loss": 0.5056, + "step": 4322 + }, + { + "epoch": 0.54, + "grad_norm": 1.266041772416684, + "learning_rate": 4.652598745176095e-06, + "loss": 0.4811, + "step": 4323 + }, + { + "epoch": 0.54, + "grad_norm": 1.3636206626837686, + "learning_rate": 4.6505939111818246e-06, + "loss": 0.4925, + "step": 4324 + }, + { + "epoch": 0.54, + "grad_norm": 1.478486601519347, + "learning_rate": 4.648589133637185e-06, + "loss": 0.4737, + "step": 4325 + }, + { + "epoch": 0.54, + "grad_norm": 2.5515659815698486, + "learning_rate": 4.646584412866065e-06, + "loss": 0.4896, + "step": 4326 + }, + { + "epoch": 0.54, + "grad_norm": 1.2139131098187366, + "learning_rate": 4.644579749192346e-06, + "loss": 0.4533, + "step": 4327 + }, + { + "epoch": 0.54, + "grad_norm": 1.3647906625218897, + "learning_rate": 4.642575142939898e-06, + "loss": 0.5035, + "step": 4328 + }, + { + "epoch": 0.54, + "grad_norm": 1.2970981149774912, + "learning_rate": 4.640570594432586e-06, + "loss": 0.5153, + "step": 4329 + }, + { + "epoch": 0.54, + "grad_norm": 1.4040798471253462, + "learning_rate": 4.638566103994258e-06, + "loss": 0.552, + "step": 4330 + }, + { + "epoch": 0.54, + "grad_norm": 2.6276214042065345, + "learning_rate": 4.63656167194876e-06, + "loss": 0.501, + "step": 4331 + }, + { + "epoch": 0.54, + "grad_norm": 1.5530490248697577, + "learning_rate": 4.634557298619924e-06, + "loss": 0.5374, + "step": 4332 + }, + { + "epoch": 0.54, + "grad_norm": 1.3407778756048003, + "learning_rate": 4.632552984331576e-06, + "loss": 0.4578, + "step": 4333 + }, + { + "epoch": 0.54, + "grad_norm": 1.4791583074513865, + "learning_rate": 4.630548729407529e-06, + "loss": 0.5266, + "step": 4334 + }, + { + "epoch": 0.54, + "grad_norm": 1.4249788214778287, + "learning_rate": 4.6285445341715875e-06, + "loss": 0.4824, + "step": 4335 + }, + { + "epoch": 0.54, + "grad_norm": 1.2562567504953168, + "learning_rate": 4.626540398947549e-06, + "loss": 0.4949, + "step": 4336 + }, + { + "epoch": 0.54, + "grad_norm": 1.321673506920769, + "learning_rate": 4.624536324059199e-06, + "loss": 0.51, + "step": 4337 + }, + { + "epoch": 0.54, + "grad_norm": 1.7009338899251323, + "learning_rate": 4.622532309830312e-06, + "loss": 0.5161, + "step": 4338 + }, + { + "epoch": 0.54, + "grad_norm": 1.3804748517386487, + "learning_rate": 4.620528356584655e-06, + "loss": 0.5386, + "step": 4339 + }, + { + "epoch": 0.54, + "grad_norm": 1.2710254221528257, + "learning_rate": 4.6185244646459835e-06, + "loss": 0.4611, + "step": 4340 + }, + { + "epoch": 0.54, + "grad_norm": 1.8608849766501319, + "learning_rate": 4.616520634338045e-06, + "loss": 0.471, + "step": 4341 + }, + { + "epoch": 0.54, + "grad_norm": 0.6798704097529361, + "learning_rate": 4.614516865984575e-06, + "loss": 0.4846, + "step": 4342 + }, + { + "epoch": 0.54, + "grad_norm": 2.1922263872791192, + "learning_rate": 4.6125131599092995e-06, + "loss": 0.4922, + "step": 4343 + }, + { + "epoch": 0.54, + "grad_norm": 1.409966446941975, + "learning_rate": 4.610509516435937e-06, + "loss": 0.542, + "step": 4344 + }, + { + "epoch": 0.54, + "grad_norm": 1.2967390492106146, + "learning_rate": 4.608505935888192e-06, + "loss": 0.5277, + "step": 4345 + }, + { + "epoch": 0.54, + "grad_norm": 1.4520281467958223, + "learning_rate": 4.606502418589762e-06, + "loss": 0.4888, + "step": 4346 + }, + { + "epoch": 0.54, + "grad_norm": 1.5444683940931594, + "learning_rate": 4.604498964864331e-06, + "loss": 0.5162, + "step": 4347 + }, + { + "epoch": 0.54, + "grad_norm": 1.4585220753140886, + "learning_rate": 4.6024955750355755e-06, + "loss": 0.4734, + "step": 4348 + }, + { + "epoch": 0.54, + "grad_norm": 2.054117529489168, + "learning_rate": 4.600492249427161e-06, + "loss": 0.5057, + "step": 4349 + }, + { + "epoch": 0.54, + "grad_norm": 1.7080318860903294, + "learning_rate": 4.598488988362742e-06, + "loss": 0.4853, + "step": 4350 + }, + { + "epoch": 0.54, + "grad_norm": 1.5706453071719417, + "learning_rate": 4.5964857921659635e-06, + "loss": 0.5186, + "step": 4351 + }, + { + "epoch": 0.54, + "grad_norm": 1.3140820055921787, + "learning_rate": 4.594482661160458e-06, + "loss": 0.5169, + "step": 4352 + }, + { + "epoch": 0.54, + "grad_norm": 1.2680956481661165, + "learning_rate": 4.59247959566985e-06, + "loss": 0.4936, + "step": 4353 + }, + { + "epoch": 0.54, + "grad_norm": 1.3409682132801277, + "learning_rate": 4.5904765960177535e-06, + "loss": 0.5418, + "step": 4354 + }, + { + "epoch": 0.54, + "grad_norm": 1.310893595658717, + "learning_rate": 4.58847366252777e-06, + "loss": 0.495, + "step": 4355 + }, + { + "epoch": 0.54, + "grad_norm": 1.5131214786662126, + "learning_rate": 4.58647079552349e-06, + "loss": 0.4563, + "step": 4356 + }, + { + "epoch": 0.54, + "grad_norm": 1.826228307832778, + "learning_rate": 4.5844679953284946e-06, + "loss": 0.5028, + "step": 4357 + }, + { + "epoch": 0.54, + "grad_norm": 1.320033672212789, + "learning_rate": 4.582465262266355e-06, + "loss": 0.4299, + "step": 4358 + }, + { + "epoch": 0.54, + "grad_norm": 1.5183627776360842, + "learning_rate": 4.58046259666063e-06, + "loss": 0.4833, + "step": 4359 + }, + { + "epoch": 0.54, + "grad_norm": 1.4938905628289831, + "learning_rate": 4.5784599988348656e-06, + "loss": 0.5108, + "step": 4360 + }, + { + "epoch": 0.54, + "grad_norm": 1.6296818643660147, + "learning_rate": 4.576457469112602e-06, + "loss": 0.5559, + "step": 4361 + }, + { + "epoch": 0.54, + "grad_norm": 1.7693617181056263, + "learning_rate": 4.574455007817365e-06, + "loss": 0.5516, + "step": 4362 + }, + { + "epoch": 0.54, + "grad_norm": 1.3295911623522392, + "learning_rate": 4.57245261527267e-06, + "loss": 0.522, + "step": 4363 + }, + { + "epoch": 0.54, + "grad_norm": 1.5954330240640078, + "learning_rate": 4.570450291802019e-06, + "loss": 0.5147, + "step": 4364 + }, + { + "epoch": 0.54, + "grad_norm": 1.7853840557710536, + "learning_rate": 4.568448037728907e-06, + "loss": 0.5434, + "step": 4365 + }, + { + "epoch": 0.54, + "grad_norm": 1.4879695205979435, + "learning_rate": 4.5664458533768155e-06, + "loss": 0.4781, + "step": 4366 + }, + { + "epoch": 0.54, + "grad_norm": 1.4936270775797755, + "learning_rate": 4.564443739069215e-06, + "loss": 0.5157, + "step": 4367 + }, + { + "epoch": 0.54, + "grad_norm": 1.710118172072937, + "learning_rate": 4.562441695129563e-06, + "loss": 0.4956, + "step": 4368 + }, + { + "epoch": 0.54, + "grad_norm": 2.479920114691241, + "learning_rate": 4.56043972188131e-06, + "loss": 0.5305, + "step": 4369 + }, + { + "epoch": 0.54, + "grad_norm": 1.4216813600328682, + "learning_rate": 4.558437819647892e-06, + "loss": 0.519, + "step": 4370 + }, + { + "epoch": 0.54, + "grad_norm": 0.6207893049711012, + "learning_rate": 4.556435988752732e-06, + "loss": 0.4515, + "step": 4371 + }, + { + "epoch": 0.54, + "grad_norm": 1.7293017674348017, + "learning_rate": 4.554434229519244e-06, + "loss": 0.5257, + "step": 4372 + }, + { + "epoch": 0.54, + "grad_norm": 1.4846186532957242, + "learning_rate": 4.552432542270832e-06, + "loss": 0.4847, + "step": 4373 + }, + { + "epoch": 0.54, + "grad_norm": 1.4686377152136705, + "learning_rate": 4.550430927330885e-06, + "loss": 0.4727, + "step": 4374 + }, + { + "epoch": 0.54, + "grad_norm": 2.0718575292030823, + "learning_rate": 4.54842938502278e-06, + "loss": 0.5046, + "step": 4375 + }, + { + "epoch": 0.54, + "grad_norm": 1.5944676215700275, + "learning_rate": 4.546427915669882e-06, + "loss": 0.4735, + "step": 4376 + }, + { + "epoch": 0.54, + "grad_norm": 1.4081743207529693, + "learning_rate": 4.5444265195955525e-06, + "loss": 0.5041, + "step": 4377 + }, + { + "epoch": 0.54, + "grad_norm": 1.484577775781484, + "learning_rate": 4.542425197123131e-06, + "loss": 0.539, + "step": 4378 + }, + { + "epoch": 0.54, + "grad_norm": 1.6249081818741813, + "learning_rate": 4.540423948575949e-06, + "loss": 0.4749, + "step": 4379 + }, + { + "epoch": 0.54, + "grad_norm": 1.2982887830474357, + "learning_rate": 4.538422774277325e-06, + "loss": 0.5252, + "step": 4380 + }, + { + "epoch": 0.54, + "grad_norm": 1.3766822741656382, + "learning_rate": 4.536421674550567e-06, + "loss": 0.5047, + "step": 4381 + }, + { + "epoch": 0.54, + "grad_norm": 1.406652765077386, + "learning_rate": 4.534420649718972e-06, + "loss": 0.4652, + "step": 4382 + }, + { + "epoch": 0.54, + "grad_norm": 0.6791428428602214, + "learning_rate": 4.532419700105819e-06, + "loss": 0.5047, + "step": 4383 + }, + { + "epoch": 0.54, + "grad_norm": 1.9436755327984607, + "learning_rate": 4.53041882603438e-06, + "loss": 0.4829, + "step": 4384 + }, + { + "epoch": 0.54, + "grad_norm": 1.5698938749165816, + "learning_rate": 4.528418027827918e-06, + "loss": 0.4655, + "step": 4385 + }, + { + "epoch": 0.54, + "grad_norm": 1.7356508093213097, + "learning_rate": 4.526417305809677e-06, + "loss": 0.5052, + "step": 4386 + }, + { + "epoch": 0.54, + "grad_norm": 1.174600104406179, + "learning_rate": 4.5244166603028915e-06, + "loss": 0.4722, + "step": 4387 + }, + { + "epoch": 0.54, + "grad_norm": 1.5507075320836625, + "learning_rate": 4.522416091630784e-06, + "loss": 0.4298, + "step": 4388 + }, + { + "epoch": 0.54, + "grad_norm": 1.771951957318884, + "learning_rate": 4.520415600116561e-06, + "loss": 0.4666, + "step": 4389 + }, + { + "epoch": 0.54, + "grad_norm": 1.4399449863102074, + "learning_rate": 4.518415186083422e-06, + "loss": 0.4913, + "step": 4390 + }, + { + "epoch": 0.54, + "grad_norm": 1.3278433017347138, + "learning_rate": 4.516414849854552e-06, + "loss": 0.5082, + "step": 4391 + }, + { + "epoch": 0.55, + "grad_norm": 1.4678291151658778, + "learning_rate": 4.514414591753121e-06, + "loss": 0.5698, + "step": 4392 + }, + { + "epoch": 0.55, + "grad_norm": 1.5435085395494517, + "learning_rate": 4.512414412102288e-06, + "loss": 0.5142, + "step": 4393 + }, + { + "epoch": 0.55, + "grad_norm": 1.556647391808432, + "learning_rate": 4.510414311225203e-06, + "loss": 0.4913, + "step": 4394 + }, + { + "epoch": 0.55, + "grad_norm": 1.6916292073049486, + "learning_rate": 4.508414289444998e-06, + "loss": 0.4798, + "step": 4395 + }, + { + "epoch": 0.55, + "grad_norm": 1.6692120312743866, + "learning_rate": 4.506414347084793e-06, + "loss": 0.5023, + "step": 4396 + }, + { + "epoch": 0.55, + "grad_norm": 1.604330736759279, + "learning_rate": 4.5044144844676995e-06, + "loss": 0.4997, + "step": 4397 + }, + { + "epoch": 0.55, + "grad_norm": 2.118799793257604, + "learning_rate": 4.50241470191681e-06, + "loss": 0.5001, + "step": 4398 + }, + { + "epoch": 0.55, + "grad_norm": 1.3528728363724072, + "learning_rate": 4.500414999755207e-06, + "loss": 0.4631, + "step": 4399 + }, + { + "epoch": 0.55, + "grad_norm": 1.5128477685762356, + "learning_rate": 4.498415378305961e-06, + "loss": 0.483, + "step": 4400 + }, + { + "epoch": 0.55, + "grad_norm": 1.454443965715015, + "learning_rate": 4.496415837892125e-06, + "loss": 0.4791, + "step": 4401 + }, + { + "epoch": 0.55, + "grad_norm": 1.3334002112817895, + "learning_rate": 4.494416378836749e-06, + "loss": 0.4398, + "step": 4402 + }, + { + "epoch": 0.55, + "grad_norm": 1.7095351293567198, + "learning_rate": 4.492417001462859e-06, + "loss": 0.5341, + "step": 4403 + }, + { + "epoch": 0.55, + "grad_norm": 1.562988755978761, + "learning_rate": 4.490417706093473e-06, + "loss": 0.5465, + "step": 4404 + }, + { + "epoch": 0.55, + "grad_norm": 1.4290886758069221, + "learning_rate": 4.4884184930515955e-06, + "loss": 0.4642, + "step": 4405 + }, + { + "epoch": 0.55, + "grad_norm": 0.65549340328514, + "learning_rate": 4.486419362660214e-06, + "loss": 0.4921, + "step": 4406 + }, + { + "epoch": 0.55, + "grad_norm": 1.6983782171234239, + "learning_rate": 4.484420315242311e-06, + "loss": 0.48, + "step": 4407 + }, + { + "epoch": 0.55, + "grad_norm": 1.6234618539161976, + "learning_rate": 4.482421351120845e-06, + "loss": 0.5449, + "step": 4408 + }, + { + "epoch": 0.55, + "grad_norm": 1.6357978088500176, + "learning_rate": 4.480422470618766e-06, + "loss": 0.5332, + "step": 4409 + }, + { + "epoch": 0.55, + "grad_norm": 1.9663873905647158, + "learning_rate": 4.478423674059015e-06, + "loss": 0.5126, + "step": 4410 + }, + { + "epoch": 0.55, + "grad_norm": 1.4502008959206971, + "learning_rate": 4.476424961764513e-06, + "loss": 0.5099, + "step": 4411 + }, + { + "epoch": 0.55, + "grad_norm": 1.4117212277671094, + "learning_rate": 4.47442633405817e-06, + "loss": 0.4995, + "step": 4412 + }, + { + "epoch": 0.55, + "grad_norm": 1.387171017364631, + "learning_rate": 4.472427791262881e-06, + "loss": 0.4811, + "step": 4413 + }, + { + "epoch": 0.55, + "grad_norm": 1.722836592836977, + "learning_rate": 4.470429333701529e-06, + "loss": 0.4993, + "step": 4414 + }, + { + "epoch": 0.55, + "grad_norm": 2.046809463793669, + "learning_rate": 4.468430961696982e-06, + "loss": 0.5041, + "step": 4415 + }, + { + "epoch": 0.55, + "grad_norm": 1.4247780429245809, + "learning_rate": 4.466432675572096e-06, + "loss": 0.4771, + "step": 4416 + }, + { + "epoch": 0.55, + "grad_norm": 1.4185856211803107, + "learning_rate": 4.464434475649708e-06, + "loss": 0.5237, + "step": 4417 + }, + { + "epoch": 0.55, + "grad_norm": 11.468863603545287, + "learning_rate": 4.462436362252648e-06, + "loss": 0.5028, + "step": 4418 + }, + { + "epoch": 0.55, + "grad_norm": 1.5714487313977783, + "learning_rate": 4.46043833570373e-06, + "loss": 0.5409, + "step": 4419 + }, + { + "epoch": 0.55, + "grad_norm": 1.5369611189212409, + "learning_rate": 4.4584403963257485e-06, + "loss": 0.4821, + "step": 4420 + }, + { + "epoch": 0.55, + "grad_norm": 1.5062598221616934, + "learning_rate": 4.456442544441493e-06, + "loss": 0.4762, + "step": 4421 + }, + { + "epoch": 0.55, + "grad_norm": 1.8761573257549888, + "learning_rate": 4.45444478037373e-06, + "loss": 0.5218, + "step": 4422 + }, + { + "epoch": 0.55, + "grad_norm": 1.780866183894427, + "learning_rate": 4.452447104445218e-06, + "loss": 0.4763, + "step": 4423 + }, + { + "epoch": 0.55, + "grad_norm": 2.0530249281707036, + "learning_rate": 4.450449516978699e-06, + "loss": 0.507, + "step": 4424 + }, + { + "epoch": 0.55, + "grad_norm": 1.357116791630275, + "learning_rate": 4.4484520182969e-06, + "loss": 0.5084, + "step": 4425 + }, + { + "epoch": 0.55, + "grad_norm": 2.4447708016997076, + "learning_rate": 4.4464546087225346e-06, + "loss": 0.4825, + "step": 4426 + }, + { + "epoch": 0.55, + "grad_norm": 1.4769381978965075, + "learning_rate": 4.444457288578303e-06, + "loss": 0.4954, + "step": 4427 + }, + { + "epoch": 0.55, + "grad_norm": 1.6761864909530553, + "learning_rate": 4.44246005818689e-06, + "loss": 0.4857, + "step": 4428 + }, + { + "epoch": 0.55, + "grad_norm": 1.4748459109991388, + "learning_rate": 4.440462917870964e-06, + "loss": 0.5239, + "step": 4429 + }, + { + "epoch": 0.55, + "grad_norm": 1.3350226162503687, + "learning_rate": 4.438465867953182e-06, + "loss": 0.5043, + "step": 4430 + }, + { + "epoch": 0.55, + "grad_norm": 1.80295600244093, + "learning_rate": 4.4364689087561845e-06, + "loss": 0.5169, + "step": 4431 + }, + { + "epoch": 0.55, + "grad_norm": 1.334114055897355, + "learning_rate": 4.434472040602599e-06, + "loss": 0.5331, + "step": 4432 + }, + { + "epoch": 0.55, + "grad_norm": 1.5164473111659773, + "learning_rate": 4.432475263815035e-06, + "loss": 0.4775, + "step": 4433 + }, + { + "epoch": 0.55, + "grad_norm": 1.3005993439552876, + "learning_rate": 4.430478578716089e-06, + "loss": 0.4822, + "step": 4434 + }, + { + "epoch": 0.55, + "grad_norm": 1.474955584683926, + "learning_rate": 4.428481985628345e-06, + "loss": 0.5167, + "step": 4435 + }, + { + "epoch": 0.55, + "grad_norm": 1.4789894800140106, + "learning_rate": 4.426485484874371e-06, + "loss": 0.4987, + "step": 4436 + }, + { + "epoch": 0.55, + "grad_norm": 1.3903653813140513, + "learning_rate": 4.424489076776718e-06, + "loss": 0.5015, + "step": 4437 + }, + { + "epoch": 0.55, + "grad_norm": 1.3625637879056767, + "learning_rate": 4.422492761657923e-06, + "loss": 0.4851, + "step": 4438 + }, + { + "epoch": 0.55, + "grad_norm": 1.4422805656413178, + "learning_rate": 4.420496539840509e-06, + "loss": 0.4535, + "step": 4439 + }, + { + "epoch": 0.55, + "grad_norm": 1.505004277895124, + "learning_rate": 4.4185004116469824e-06, + "loss": 0.4876, + "step": 4440 + }, + { + "epoch": 0.55, + "grad_norm": 1.293316550344579, + "learning_rate": 4.416504377399835e-06, + "loss": 0.4517, + "step": 4441 + }, + { + "epoch": 0.55, + "grad_norm": 1.4825354853263424, + "learning_rate": 4.414508437421544e-06, + "loss": 0.5174, + "step": 4442 + }, + { + "epoch": 0.55, + "grad_norm": 1.3531475040207626, + "learning_rate": 4.412512592034572e-06, + "loss": 0.5488, + "step": 4443 + }, + { + "epoch": 0.55, + "grad_norm": 1.3345359208586365, + "learning_rate": 4.410516841561366e-06, + "loss": 0.5341, + "step": 4444 + }, + { + "epoch": 0.55, + "grad_norm": 1.7683739150605549, + "learning_rate": 4.408521186324356e-06, + "loss": 0.4505, + "step": 4445 + }, + { + "epoch": 0.55, + "grad_norm": 1.4109196527564847, + "learning_rate": 4.406525626645956e-06, + "loss": 0.4472, + "step": 4446 + }, + { + "epoch": 0.55, + "grad_norm": 1.4761501607577767, + "learning_rate": 4.404530162848569e-06, + "loss": 0.4795, + "step": 4447 + }, + { + "epoch": 0.55, + "grad_norm": 1.2703248406744538, + "learning_rate": 4.402534795254578e-06, + "loss": 0.4719, + "step": 4448 + }, + { + "epoch": 0.55, + "grad_norm": 2.1843683187084255, + "learning_rate": 4.4005395241863535e-06, + "loss": 0.502, + "step": 4449 + }, + { + "epoch": 0.55, + "grad_norm": 2.0136952093233424, + "learning_rate": 4.398544349966247e-06, + "loss": 0.5151, + "step": 4450 + }, + { + "epoch": 0.55, + "grad_norm": 3.106552861840512, + "learning_rate": 4.3965492729166e-06, + "loss": 0.4904, + "step": 4451 + }, + { + "epoch": 0.55, + "grad_norm": 1.6491909140414045, + "learning_rate": 4.394554293359731e-06, + "loss": 0.554, + "step": 4452 + }, + { + "epoch": 0.55, + "grad_norm": 1.4630506011879885, + "learning_rate": 4.392559411617949e-06, + "loss": 0.5344, + "step": 4453 + }, + { + "epoch": 0.55, + "grad_norm": 1.55165485319063, + "learning_rate": 4.390564628013545e-06, + "loss": 0.5136, + "step": 4454 + }, + { + "epoch": 0.55, + "grad_norm": 3.094936177696235, + "learning_rate": 4.388569942868791e-06, + "loss": 0.4817, + "step": 4455 + }, + { + "epoch": 0.55, + "grad_norm": 1.9932684063707253, + "learning_rate": 4.3865753565059485e-06, + "loss": 0.5695, + "step": 4456 + }, + { + "epoch": 0.55, + "grad_norm": 1.8901580466559014, + "learning_rate": 4.384580869247259e-06, + "loss": 0.4978, + "step": 4457 + }, + { + "epoch": 0.55, + "grad_norm": 1.505734659711322, + "learning_rate": 4.38258648141495e-06, + "loss": 0.4256, + "step": 4458 + }, + { + "epoch": 0.55, + "grad_norm": 1.6879853623833825, + "learning_rate": 4.380592193331234e-06, + "loss": 0.6263, + "step": 4459 + }, + { + "epoch": 0.55, + "grad_norm": 1.3953623635669, + "learning_rate": 4.378598005318304e-06, + "loss": 0.4724, + "step": 4460 + }, + { + "epoch": 0.55, + "grad_norm": 1.7850285873847695, + "learning_rate": 4.376603917698339e-06, + "loss": 0.4961, + "step": 4461 + }, + { + "epoch": 0.55, + "grad_norm": 1.5945344825899566, + "learning_rate": 4.374609930793501e-06, + "loss": 0.4908, + "step": 4462 + }, + { + "epoch": 0.55, + "grad_norm": 1.5711748763666404, + "learning_rate": 4.372616044925938e-06, + "loss": 0.4778, + "step": 4463 + }, + { + "epoch": 0.55, + "grad_norm": 1.362564324612832, + "learning_rate": 4.370622260417777e-06, + "loss": 0.4729, + "step": 4464 + }, + { + "epoch": 0.55, + "grad_norm": 1.5883608424991777, + "learning_rate": 4.368628577591134e-06, + "loss": 0.5014, + "step": 4465 + }, + { + "epoch": 0.55, + "grad_norm": 1.4573401950173126, + "learning_rate": 4.366634996768104e-06, + "loss": 0.491, + "step": 4466 + }, + { + "epoch": 0.55, + "grad_norm": 1.3802325792037728, + "learning_rate": 4.364641518270767e-06, + "loss": 0.5029, + "step": 4467 + }, + { + "epoch": 0.55, + "grad_norm": 2.1344719427915315, + "learning_rate": 4.362648142421191e-06, + "loss": 0.4574, + "step": 4468 + }, + { + "epoch": 0.55, + "grad_norm": 1.369050190285605, + "learning_rate": 4.360654869541419e-06, + "loss": 0.5084, + "step": 4469 + }, + { + "epoch": 0.55, + "grad_norm": 1.909973556369964, + "learning_rate": 4.358661699953486e-06, + "loss": 0.4853, + "step": 4470 + }, + { + "epoch": 0.55, + "grad_norm": 1.4003381183520367, + "learning_rate": 4.356668633979402e-06, + "loss": 0.4647, + "step": 4471 + }, + { + "epoch": 0.55, + "grad_norm": 1.606974333963992, + "learning_rate": 4.354675671941167e-06, + "loss": 0.4842, + "step": 4472 + }, + { + "epoch": 0.56, + "grad_norm": 1.2865352907440146, + "learning_rate": 4.3526828141607605e-06, + "loss": 0.4676, + "step": 4473 + }, + { + "epoch": 0.56, + "grad_norm": 1.239797259529583, + "learning_rate": 4.350690060960146e-06, + "loss": 0.4793, + "step": 4474 + }, + { + "epoch": 0.56, + "grad_norm": 1.4018255932426165, + "learning_rate": 4.348697412661269e-06, + "loss": 0.455, + "step": 4475 + }, + { + "epoch": 0.56, + "grad_norm": 1.2995263446919354, + "learning_rate": 4.346704869586064e-06, + "loss": 0.4692, + "step": 4476 + }, + { + "epoch": 0.56, + "grad_norm": 1.7508953414723027, + "learning_rate": 4.344712432056441e-06, + "loss": 0.5329, + "step": 4477 + }, + { + "epoch": 0.56, + "grad_norm": 1.4401002791730046, + "learning_rate": 4.342720100394295e-06, + "loss": 0.5505, + "step": 4478 + }, + { + "epoch": 0.56, + "grad_norm": 1.581965730475437, + "learning_rate": 4.340727874921506e-06, + "loss": 0.534, + "step": 4479 + }, + { + "epoch": 0.56, + "grad_norm": 1.2937840348727583, + "learning_rate": 4.338735755959935e-06, + "loss": 0.4743, + "step": 4480 + }, + { + "epoch": 0.56, + "grad_norm": 1.4277893209041128, + "learning_rate": 4.336743743831426e-06, + "loss": 0.4859, + "step": 4481 + }, + { + "epoch": 0.56, + "grad_norm": 0.727417677604167, + "learning_rate": 4.334751838857807e-06, + "loss": 0.4857, + "step": 4482 + }, + { + "epoch": 0.56, + "grad_norm": 1.3905935748983391, + "learning_rate": 4.332760041360885e-06, + "loss": 0.5166, + "step": 4483 + }, + { + "epoch": 0.56, + "grad_norm": 2.4155875374503135, + "learning_rate": 4.330768351662458e-06, + "loss": 0.505, + "step": 4484 + }, + { + "epoch": 0.56, + "grad_norm": 1.4320288646888266, + "learning_rate": 4.328776770084296e-06, + "loss": 0.5258, + "step": 4485 + }, + { + "epoch": 0.56, + "grad_norm": 1.2614713949066259, + "learning_rate": 4.326785296948162e-06, + "loss": 0.46, + "step": 4486 + }, + { + "epoch": 0.56, + "grad_norm": 1.462031081338204, + "learning_rate": 4.324793932575789e-06, + "loss": 0.5246, + "step": 4487 + }, + { + "epoch": 0.56, + "grad_norm": 1.4480727488848604, + "learning_rate": 4.322802677288904e-06, + "loss": 0.5121, + "step": 4488 + }, + { + "epoch": 0.56, + "grad_norm": 1.5817956181131727, + "learning_rate": 4.320811531409211e-06, + "loss": 0.5003, + "step": 4489 + }, + { + "epoch": 0.56, + "grad_norm": 1.362465636960399, + "learning_rate": 4.318820495258396e-06, + "loss": 0.4872, + "step": 4490 + }, + { + "epoch": 0.56, + "grad_norm": 1.4532442998980175, + "learning_rate": 4.316829569158127e-06, + "loss": 0.5363, + "step": 4491 + }, + { + "epoch": 0.56, + "grad_norm": 0.6472721593024459, + "learning_rate": 4.3148387534300615e-06, + "loss": 0.5004, + "step": 4492 + }, + { + "epoch": 0.56, + "grad_norm": 1.560388920149118, + "learning_rate": 4.312848048395828e-06, + "loss": 0.5634, + "step": 4493 + }, + { + "epoch": 0.56, + "grad_norm": 1.6795898740165551, + "learning_rate": 4.310857454377045e-06, + "loss": 0.4687, + "step": 4494 + }, + { + "epoch": 0.56, + "grad_norm": 2.7042855561623385, + "learning_rate": 4.30886697169531e-06, + "loss": 0.4863, + "step": 4495 + }, + { + "epoch": 0.56, + "grad_norm": 0.6835008969281285, + "learning_rate": 4.306876600672204e-06, + "loss": 0.4789, + "step": 4496 + }, + { + "epoch": 0.56, + "grad_norm": 1.5439431277188411, + "learning_rate": 4.3048863416292866e-06, + "loss": 0.5109, + "step": 4497 + }, + { + "epoch": 0.56, + "grad_norm": 1.2984059818710183, + "learning_rate": 4.302896194888102e-06, + "loss": 0.4721, + "step": 4498 + }, + { + "epoch": 0.56, + "grad_norm": 2.9009395166991934, + "learning_rate": 4.300906160770174e-06, + "loss": 0.5337, + "step": 4499 + }, + { + "epoch": 0.56, + "grad_norm": 1.5366204918981408, + "learning_rate": 4.298916239597016e-06, + "loss": 0.5198, + "step": 4500 + }, + { + "epoch": 0.56, + "grad_norm": 1.4234969050909656, + "learning_rate": 4.2969264316901135e-06, + "loss": 0.5181, + "step": 4501 + }, + { + "epoch": 0.56, + "grad_norm": 1.8208805018296175, + "learning_rate": 4.2949367373709385e-06, + "loss": 0.5468, + "step": 4502 + }, + { + "epoch": 0.56, + "grad_norm": 1.4042996491593032, + "learning_rate": 4.292947156960942e-06, + "loss": 0.5222, + "step": 4503 + }, + { + "epoch": 0.56, + "grad_norm": 1.763208800475825, + "learning_rate": 4.290957690781561e-06, + "loss": 0.5531, + "step": 4504 + }, + { + "epoch": 0.56, + "grad_norm": 1.7865389229207818, + "learning_rate": 4.28896833915421e-06, + "loss": 0.4983, + "step": 4505 + }, + { + "epoch": 0.56, + "grad_norm": 2.170165864220015, + "learning_rate": 4.286979102400286e-06, + "loss": 0.4926, + "step": 4506 + }, + { + "epoch": 0.56, + "grad_norm": 1.4323082470024437, + "learning_rate": 4.2849899808411665e-06, + "loss": 0.5024, + "step": 4507 + }, + { + "epoch": 0.56, + "grad_norm": 1.4889225319510406, + "learning_rate": 4.2830009747982115e-06, + "loss": 0.4891, + "step": 4508 + }, + { + "epoch": 0.56, + "grad_norm": 1.3382345132730324, + "learning_rate": 4.281012084592766e-06, + "loss": 0.5222, + "step": 4509 + }, + { + "epoch": 0.56, + "grad_norm": 0.675762563925517, + "learning_rate": 4.27902331054615e-06, + "loss": 0.5036, + "step": 4510 + }, + { + "epoch": 0.56, + "grad_norm": 1.6067386368305003, + "learning_rate": 4.277034652979668e-06, + "loss": 0.4876, + "step": 4511 + }, + { + "epoch": 0.56, + "grad_norm": 1.611708514685027, + "learning_rate": 4.275046112214604e-06, + "loss": 0.5042, + "step": 4512 + }, + { + "epoch": 0.56, + "grad_norm": 1.494644895380749, + "learning_rate": 4.273057688572227e-06, + "loss": 0.5389, + "step": 4513 + }, + { + "epoch": 0.56, + "grad_norm": 1.5774346058796291, + "learning_rate": 4.271069382373783e-06, + "loss": 0.4572, + "step": 4514 + }, + { + "epoch": 0.56, + "grad_norm": 1.4256215511273256, + "learning_rate": 4.2690811939405e-06, + "loss": 0.523, + "step": 4515 + }, + { + "epoch": 0.56, + "grad_norm": 1.3297159375719316, + "learning_rate": 4.267093123593585e-06, + "loss": 0.4912, + "step": 4516 + }, + { + "epoch": 0.56, + "grad_norm": 1.4035373991816045, + "learning_rate": 4.265105171654233e-06, + "loss": 0.4572, + "step": 4517 + }, + { + "epoch": 0.56, + "grad_norm": 1.3611163373689483, + "learning_rate": 4.263117338443612e-06, + "loss": 0.5006, + "step": 4518 + }, + { + "epoch": 0.56, + "grad_norm": 1.4072504029722148, + "learning_rate": 4.261129624282876e-06, + "loss": 0.4834, + "step": 4519 + }, + { + "epoch": 0.56, + "grad_norm": 1.5658032422045698, + "learning_rate": 4.2591420294931565e-06, + "loss": 0.5035, + "step": 4520 + }, + { + "epoch": 0.56, + "grad_norm": 1.823794325382853, + "learning_rate": 4.257154554395566e-06, + "loss": 0.516, + "step": 4521 + }, + { + "epoch": 0.56, + "grad_norm": 1.6114850530253642, + "learning_rate": 4.2551671993112e-06, + "loss": 0.483, + "step": 4522 + }, + { + "epoch": 0.56, + "grad_norm": 1.4416077224034098, + "learning_rate": 4.253179964561133e-06, + "loss": 0.5327, + "step": 4523 + }, + { + "epoch": 0.56, + "grad_norm": 2.233556266526752, + "learning_rate": 4.25119285046642e-06, + "loss": 0.5014, + "step": 4524 + }, + { + "epoch": 0.56, + "grad_norm": 1.4926199002802785, + "learning_rate": 4.249205857348097e-06, + "loss": 0.509, + "step": 4525 + }, + { + "epoch": 0.56, + "grad_norm": 1.5610452174421434, + "learning_rate": 4.247218985527179e-06, + "loss": 0.5521, + "step": 4526 + }, + { + "epoch": 0.56, + "grad_norm": 1.7556944477904484, + "learning_rate": 4.245232235324666e-06, + "loss": 0.4896, + "step": 4527 + }, + { + "epoch": 0.56, + "grad_norm": 1.4150439393082945, + "learning_rate": 4.243245607061531e-06, + "loss": 0.4713, + "step": 4528 + }, + { + "epoch": 0.56, + "grad_norm": 1.3719288884987528, + "learning_rate": 4.241259101058734e-06, + "loss": 0.4368, + "step": 4529 + }, + { + "epoch": 0.56, + "grad_norm": 1.4296319411129048, + "learning_rate": 4.239272717637212e-06, + "loss": 0.532, + "step": 4530 + }, + { + "epoch": 0.56, + "grad_norm": 1.6677207912248433, + "learning_rate": 4.237286457117882e-06, + "loss": 0.4865, + "step": 4531 + }, + { + "epoch": 0.56, + "grad_norm": 1.4600157709673596, + "learning_rate": 4.2353003198216405e-06, + "loss": 0.5526, + "step": 4532 + }, + { + "epoch": 0.56, + "grad_norm": 3.1718828387548506, + "learning_rate": 4.233314306069369e-06, + "loss": 0.4848, + "step": 4533 + }, + { + "epoch": 0.56, + "grad_norm": 1.2922949432077968, + "learning_rate": 4.231328416181923e-06, + "loss": 0.4936, + "step": 4534 + }, + { + "epoch": 0.56, + "grad_norm": 1.6398879189413544, + "learning_rate": 4.229342650480143e-06, + "loss": 0.5041, + "step": 4535 + }, + { + "epoch": 0.56, + "grad_norm": 3.175714480238288, + "learning_rate": 4.227357009284843e-06, + "loss": 0.4687, + "step": 4536 + }, + { + "epoch": 0.56, + "grad_norm": 1.7546675858146281, + "learning_rate": 4.225371492916824e-06, + "loss": 0.4849, + "step": 4537 + }, + { + "epoch": 0.56, + "grad_norm": 1.4694551238357199, + "learning_rate": 4.223386101696863e-06, + "loss": 0.538, + "step": 4538 + }, + { + "epoch": 0.56, + "grad_norm": 1.5947169512599413, + "learning_rate": 4.221400835945716e-06, + "loss": 0.5196, + "step": 4539 + }, + { + "epoch": 0.56, + "grad_norm": 1.7358708080151768, + "learning_rate": 4.2194156959841215e-06, + "loss": 0.4918, + "step": 4540 + }, + { + "epoch": 0.56, + "grad_norm": 1.3877507453717872, + "learning_rate": 4.217430682132796e-06, + "loss": 0.4993, + "step": 4541 + }, + { + "epoch": 0.56, + "grad_norm": 1.2591813594630503, + "learning_rate": 4.215445794712436e-06, + "loss": 0.5215, + "step": 4542 + }, + { + "epoch": 0.56, + "grad_norm": 1.4205477304663412, + "learning_rate": 4.213461034043719e-06, + "loss": 0.5416, + "step": 4543 + }, + { + "epoch": 0.56, + "grad_norm": 3.6272911494152797, + "learning_rate": 4.211476400447298e-06, + "loss": 0.5314, + "step": 4544 + }, + { + "epoch": 0.56, + "grad_norm": 1.4961401717993632, + "learning_rate": 4.20949189424381e-06, + "loss": 0.527, + "step": 4545 + }, + { + "epoch": 0.56, + "grad_norm": 2.3452851390092455, + "learning_rate": 4.207507515753867e-06, + "loss": 0.5442, + "step": 4546 + }, + { + "epoch": 0.56, + "grad_norm": 1.654315036033042, + "learning_rate": 4.205523265298066e-06, + "loss": 0.5496, + "step": 4547 + }, + { + "epoch": 0.56, + "grad_norm": 1.346401650183757, + "learning_rate": 4.203539143196978e-06, + "loss": 0.5013, + "step": 4548 + }, + { + "epoch": 0.56, + "grad_norm": 1.3989470161936295, + "learning_rate": 4.201555149771155e-06, + "loss": 0.4853, + "step": 4549 + }, + { + "epoch": 0.56, + "grad_norm": 1.3946104740281104, + "learning_rate": 4.199571285341131e-06, + "loss": 0.4594, + "step": 4550 + }, + { + "epoch": 0.56, + "grad_norm": 1.6578275793558712, + "learning_rate": 4.197587550227416e-06, + "loss": 0.5044, + "step": 4551 + }, + { + "epoch": 0.56, + "grad_norm": 1.2876289276965982, + "learning_rate": 4.195603944750498e-06, + "loss": 0.4791, + "step": 4552 + }, + { + "epoch": 0.57, + "grad_norm": 1.5370400838336578, + "learning_rate": 4.193620469230848e-06, + "loss": 0.537, + "step": 4553 + }, + { + "epoch": 0.57, + "grad_norm": 1.3196531138427288, + "learning_rate": 4.191637123988913e-06, + "loss": 0.4833, + "step": 4554 + }, + { + "epoch": 0.57, + "grad_norm": 1.2417498417268902, + "learning_rate": 4.18965390934512e-06, + "loss": 0.4718, + "step": 4555 + }, + { + "epoch": 0.57, + "grad_norm": 1.6394348825494676, + "learning_rate": 4.187670825619875e-06, + "loss": 0.495, + "step": 4556 + }, + { + "epoch": 0.57, + "grad_norm": 1.3141667309696983, + "learning_rate": 4.185687873133561e-06, + "loss": 0.5267, + "step": 4557 + }, + { + "epoch": 0.57, + "grad_norm": 1.690376529517557, + "learning_rate": 4.1837050522065434e-06, + "loss": 0.5207, + "step": 4558 + }, + { + "epoch": 0.57, + "grad_norm": 1.3277323468444318, + "learning_rate": 4.181722363159165e-06, + "loss": 0.5052, + "step": 4559 + }, + { + "epoch": 0.57, + "grad_norm": 1.4801879039268384, + "learning_rate": 4.179739806311746e-06, + "loss": 0.4945, + "step": 4560 + }, + { + "epoch": 0.57, + "grad_norm": 2.042987090195881, + "learning_rate": 4.177757381984584e-06, + "loss": 0.4719, + "step": 4561 + }, + { + "epoch": 0.57, + "grad_norm": 1.4214907749637817, + "learning_rate": 4.17577509049796e-06, + "loss": 0.5609, + "step": 4562 + }, + { + "epoch": 0.57, + "grad_norm": 1.3860726873778075, + "learning_rate": 4.173792932172128e-06, + "loss": 0.4199, + "step": 4563 + }, + { + "epoch": 0.57, + "grad_norm": 1.6571916732434602, + "learning_rate": 4.171810907327325e-06, + "loss": 0.5922, + "step": 4564 + }, + { + "epoch": 0.57, + "grad_norm": 1.51516415102186, + "learning_rate": 4.169829016283762e-06, + "loss": 0.5021, + "step": 4565 + }, + { + "epoch": 0.57, + "grad_norm": 1.4737232507571463, + "learning_rate": 4.167847259361636e-06, + "loss": 0.4847, + "step": 4566 + }, + { + "epoch": 0.57, + "grad_norm": 1.555560655088034, + "learning_rate": 4.165865636881113e-06, + "loss": 0.5149, + "step": 4567 + }, + { + "epoch": 0.57, + "grad_norm": 1.6523285718797909, + "learning_rate": 4.163884149162342e-06, + "loss": 0.4794, + "step": 4568 + }, + { + "epoch": 0.57, + "grad_norm": 3.2591279740887935, + "learning_rate": 4.161902796525452e-06, + "loss": 0.4776, + "step": 4569 + }, + { + "epoch": 0.57, + "grad_norm": 1.627216635272726, + "learning_rate": 4.159921579290546e-06, + "loss": 0.5305, + "step": 4570 + }, + { + "epoch": 0.57, + "grad_norm": 1.4736015643294451, + "learning_rate": 4.157940497777708e-06, + "loss": 0.5281, + "step": 4571 + }, + { + "epoch": 0.57, + "grad_norm": 2.0493794792054776, + "learning_rate": 4.155959552306998e-06, + "loss": 0.5319, + "step": 4572 + }, + { + "epoch": 0.57, + "grad_norm": 2.7710477417592, + "learning_rate": 4.153978743198454e-06, + "loss": 0.5209, + "step": 4573 + }, + { + "epoch": 0.57, + "grad_norm": 2.4132708828952816, + "learning_rate": 4.151998070772098e-06, + "loss": 0.5137, + "step": 4574 + }, + { + "epoch": 0.57, + "grad_norm": 3.3970788930640707, + "learning_rate": 4.150017535347922e-06, + "loss": 0.5331, + "step": 4575 + }, + { + "epoch": 0.57, + "grad_norm": 1.6491027358822592, + "learning_rate": 4.148037137245899e-06, + "loss": 0.5045, + "step": 4576 + }, + { + "epoch": 0.57, + "grad_norm": 1.19176300713811, + "learning_rate": 4.1460568767859795e-06, + "loss": 0.4623, + "step": 4577 + }, + { + "epoch": 0.57, + "grad_norm": 0.6787985574674565, + "learning_rate": 4.144076754288093e-06, + "loss": 0.4907, + "step": 4578 + }, + { + "epoch": 0.57, + "grad_norm": 1.327884011769675, + "learning_rate": 4.142096770072144e-06, + "loss": 0.525, + "step": 4579 + }, + { + "epoch": 0.57, + "grad_norm": 1.6676423532750408, + "learning_rate": 4.140116924458018e-06, + "loss": 0.5443, + "step": 4580 + }, + { + "epoch": 0.57, + "grad_norm": 1.441471086599204, + "learning_rate": 4.138137217765577e-06, + "loss": 0.5027, + "step": 4581 + }, + { + "epoch": 0.57, + "grad_norm": 1.5039303844416383, + "learning_rate": 4.1361576503146564e-06, + "loss": 0.5751, + "step": 4582 + }, + { + "epoch": 0.57, + "grad_norm": 1.394640112008406, + "learning_rate": 4.134178222425077e-06, + "loss": 0.5296, + "step": 4583 + }, + { + "epoch": 0.57, + "grad_norm": 1.3152535316930878, + "learning_rate": 4.1321989344166315e-06, + "loss": 0.5119, + "step": 4584 + }, + { + "epoch": 0.57, + "grad_norm": 2.2997952010293106, + "learning_rate": 4.130219786609092e-06, + "loss": 0.5026, + "step": 4585 + }, + { + "epoch": 0.57, + "grad_norm": 1.9590631327681363, + "learning_rate": 4.128240779322206e-06, + "loss": 0.5002, + "step": 4586 + }, + { + "epoch": 0.57, + "grad_norm": 1.8870920809940253, + "learning_rate": 4.1262619128757e-06, + "loss": 0.4644, + "step": 4587 + }, + { + "epoch": 0.57, + "grad_norm": 1.5428964670246743, + "learning_rate": 4.1242831875892755e-06, + "loss": 0.5423, + "step": 4588 + }, + { + "epoch": 0.57, + "grad_norm": 1.3085757589141793, + "learning_rate": 4.122304603782616e-06, + "loss": 0.4827, + "step": 4589 + }, + { + "epoch": 0.57, + "grad_norm": 1.4333341177657333, + "learning_rate": 4.120326161775375e-06, + "loss": 0.4813, + "step": 4590 + }, + { + "epoch": 0.57, + "grad_norm": 0.6827500733524555, + "learning_rate": 4.118347861887193e-06, + "loss": 0.4964, + "step": 4591 + }, + { + "epoch": 0.57, + "grad_norm": 1.6543162660363646, + "learning_rate": 4.116369704437678e-06, + "loss": 0.5188, + "step": 4592 + }, + { + "epoch": 0.57, + "grad_norm": 2.6541812074781275, + "learning_rate": 4.1143916897464204e-06, + "loss": 0.5051, + "step": 4593 + }, + { + "epoch": 0.57, + "grad_norm": 1.3791053313562698, + "learning_rate": 4.112413818132986e-06, + "loss": 0.5294, + "step": 4594 + }, + { + "epoch": 0.57, + "grad_norm": 1.3454682491868881, + "learning_rate": 4.110436089916915e-06, + "loss": 0.5059, + "step": 4595 + }, + { + "epoch": 0.57, + "grad_norm": 1.3218337421022976, + "learning_rate": 4.108458505417728e-06, + "loss": 0.4583, + "step": 4596 + }, + { + "epoch": 0.57, + "grad_norm": 1.5307112265445209, + "learning_rate": 4.1064810649549216e-06, + "loss": 0.5283, + "step": 4597 + }, + { + "epoch": 0.57, + "grad_norm": 1.8299476585591976, + "learning_rate": 4.104503768847967e-06, + "loss": 0.5274, + "step": 4598 + }, + { + "epoch": 0.57, + "grad_norm": 1.364327794991199, + "learning_rate": 4.102526617416317e-06, + "loss": 0.4876, + "step": 4599 + }, + { + "epoch": 0.57, + "grad_norm": 0.6838317119053259, + "learning_rate": 4.100549610979396e-06, + "loss": 0.4948, + "step": 4600 + }, + { + "epoch": 0.57, + "grad_norm": 1.5079001057864923, + "learning_rate": 4.098572749856607e-06, + "loss": 0.489, + "step": 4601 + }, + { + "epoch": 0.57, + "grad_norm": 1.440960720058185, + "learning_rate": 4.09659603436733e-06, + "loss": 0.5237, + "step": 4602 + }, + { + "epoch": 0.57, + "grad_norm": 1.6452758498878746, + "learning_rate": 4.09461946483092e-06, + "loss": 0.4759, + "step": 4603 + }, + { + "epoch": 0.57, + "grad_norm": 1.4792873873986112, + "learning_rate": 4.092643041566709e-06, + "loss": 0.4775, + "step": 4604 + }, + { + "epoch": 0.57, + "grad_norm": 2.0103496990842897, + "learning_rate": 4.090666764894007e-06, + "loss": 0.5322, + "step": 4605 + }, + { + "epoch": 0.57, + "grad_norm": 1.3916054186675135, + "learning_rate": 4.088690635132094e-06, + "loss": 0.5386, + "step": 4606 + }, + { + "epoch": 0.57, + "grad_norm": 1.225499772634962, + "learning_rate": 4.0867146526002384e-06, + "loss": 0.5163, + "step": 4607 + }, + { + "epoch": 0.57, + "grad_norm": 1.7005086325318728, + "learning_rate": 4.084738817617673e-06, + "loss": 0.5397, + "step": 4608 + }, + { + "epoch": 0.57, + "grad_norm": 1.330519772925582, + "learning_rate": 4.082763130503613e-06, + "loss": 0.4989, + "step": 4609 + }, + { + "epoch": 0.57, + "grad_norm": 1.6195287767062285, + "learning_rate": 4.080787591577247e-06, + "loss": 0.5027, + "step": 4610 + }, + { + "epoch": 0.57, + "grad_norm": 0.6955980481374432, + "learning_rate": 4.078812201157743e-06, + "loss": 0.468, + "step": 4611 + }, + { + "epoch": 0.57, + "grad_norm": 1.6819958162274402, + "learning_rate": 4.0768369595642396e-06, + "loss": 0.5542, + "step": 4612 + }, + { + "epoch": 0.57, + "grad_norm": 1.3882961570525039, + "learning_rate": 4.074861867115856e-06, + "loss": 0.4712, + "step": 4613 + }, + { + "epoch": 0.57, + "grad_norm": 1.418499308360564, + "learning_rate": 4.072886924131685e-06, + "loss": 0.5266, + "step": 4614 + }, + { + "epoch": 0.57, + "grad_norm": 1.3503995058949858, + "learning_rate": 4.070912130930798e-06, + "loss": 0.4793, + "step": 4615 + }, + { + "epoch": 0.57, + "grad_norm": 1.6011790859547999, + "learning_rate": 4.068937487832239e-06, + "loss": 0.4503, + "step": 4616 + }, + { + "epoch": 0.57, + "grad_norm": 0.7033286082288611, + "learning_rate": 4.066962995155028e-06, + "loss": 0.5097, + "step": 4617 + }, + { + "epoch": 0.57, + "grad_norm": 1.6431022028485887, + "learning_rate": 4.064988653218163e-06, + "loss": 0.52, + "step": 4618 + }, + { + "epoch": 0.57, + "grad_norm": 1.4719628028092255, + "learning_rate": 4.063014462340616e-06, + "loss": 0.5023, + "step": 4619 + }, + { + "epoch": 0.57, + "grad_norm": 1.9883852076554343, + "learning_rate": 4.061040422841334e-06, + "loss": 0.4708, + "step": 4620 + }, + { + "epoch": 0.57, + "grad_norm": 1.7468912437684312, + "learning_rate": 4.059066535039242e-06, + "loss": 0.5219, + "step": 4621 + }, + { + "epoch": 0.57, + "grad_norm": 1.5013487110806898, + "learning_rate": 4.057092799253239e-06, + "loss": 0.4259, + "step": 4622 + }, + { + "epoch": 0.57, + "grad_norm": 1.549834919534906, + "learning_rate": 4.055119215802196e-06, + "loss": 0.6012, + "step": 4623 + }, + { + "epoch": 0.57, + "grad_norm": 2.181190433647081, + "learning_rate": 4.053145785004968e-06, + "loss": 0.5504, + "step": 4624 + }, + { + "epoch": 0.57, + "grad_norm": 0.6775103648742493, + "learning_rate": 4.051172507180376e-06, + "loss": 0.4888, + "step": 4625 + }, + { + "epoch": 0.57, + "grad_norm": 1.1712565831079453, + "learning_rate": 4.049199382647224e-06, + "loss": 0.4742, + "step": 4626 + }, + { + "epoch": 0.57, + "grad_norm": 1.3516211442014003, + "learning_rate": 4.0472264117242845e-06, + "loss": 0.5361, + "step": 4627 + }, + { + "epoch": 0.57, + "grad_norm": 1.4519730976420815, + "learning_rate": 4.045253594730309e-06, + "loss": 0.4768, + "step": 4628 + }, + { + "epoch": 0.57, + "grad_norm": 1.4421565763159003, + "learning_rate": 4.043280931984025e-06, + "loss": 0.4832, + "step": 4629 + }, + { + "epoch": 0.57, + "grad_norm": 1.5819635692994547, + "learning_rate": 4.041308423804132e-06, + "loss": 0.5221, + "step": 4630 + }, + { + "epoch": 0.57, + "grad_norm": 1.366048129453399, + "learning_rate": 4.039336070509305e-06, + "loss": 0.4943, + "step": 4631 + }, + { + "epoch": 0.57, + "grad_norm": 1.4460322616649448, + "learning_rate": 4.037363872418199e-06, + "loss": 0.4793, + "step": 4632 + }, + { + "epoch": 0.57, + "grad_norm": 1.9828714102023244, + "learning_rate": 4.035391829849436e-06, + "loss": 0.5577, + "step": 4633 + }, + { + "epoch": 0.58, + "grad_norm": 0.7198579219425816, + "learning_rate": 4.033419943121619e-06, + "loss": 0.5233, + "step": 4634 + }, + { + "epoch": 0.58, + "grad_norm": 1.427904785615395, + "learning_rate": 4.0314482125533235e-06, + "loss": 0.5267, + "step": 4635 + }, + { + "epoch": 0.58, + "grad_norm": 1.4270553507828687, + "learning_rate": 4.029476638463099e-06, + "loss": 0.5129, + "step": 4636 + }, + { + "epoch": 0.58, + "grad_norm": 1.3353797022534557, + "learning_rate": 4.027505221169471e-06, + "loss": 0.4807, + "step": 4637 + }, + { + "epoch": 0.58, + "grad_norm": 1.9051587551980136, + "learning_rate": 4.02553396099094e-06, + "loss": 0.4763, + "step": 4638 + }, + { + "epoch": 0.58, + "grad_norm": 1.3909718159931581, + "learning_rate": 4.023562858245979e-06, + "loss": 0.5409, + "step": 4639 + }, + { + "epoch": 0.58, + "grad_norm": 1.4703475909795112, + "learning_rate": 4.021591913253039e-06, + "loss": 0.4978, + "step": 4640 + }, + { + "epoch": 0.58, + "grad_norm": 1.4012920248011926, + "learning_rate": 4.019621126330541e-06, + "loss": 0.4711, + "step": 4641 + }, + { + "epoch": 0.58, + "grad_norm": 1.6960206540230156, + "learning_rate": 4.017650497796886e-06, + "loss": 0.5333, + "step": 4642 + }, + { + "epoch": 0.58, + "grad_norm": 3.567076240933028, + "learning_rate": 4.015680027970445e-06, + "loss": 0.4917, + "step": 4643 + }, + { + "epoch": 0.58, + "grad_norm": 1.9646219421988003, + "learning_rate": 4.013709717169563e-06, + "loss": 0.5112, + "step": 4644 + }, + { + "epoch": 0.58, + "grad_norm": 1.3626883688448634, + "learning_rate": 4.011739565712564e-06, + "loss": 0.4741, + "step": 4645 + }, + { + "epoch": 0.58, + "grad_norm": 1.4192552372878515, + "learning_rate": 4.009769573917741e-06, + "loss": 0.5132, + "step": 4646 + }, + { + "epoch": 0.58, + "grad_norm": 1.5294985071234206, + "learning_rate": 4.007799742103365e-06, + "loss": 0.4852, + "step": 4647 + }, + { + "epoch": 0.58, + "grad_norm": 3.103961404045815, + "learning_rate": 4.005830070587679e-06, + "loss": 0.4851, + "step": 4648 + }, + { + "epoch": 0.58, + "grad_norm": 1.3305211645597752, + "learning_rate": 4.003860559688902e-06, + "loss": 0.4964, + "step": 4649 + }, + { + "epoch": 0.58, + "grad_norm": 1.4043067447743847, + "learning_rate": 4.001891209725224e-06, + "loss": 0.5026, + "step": 4650 + }, + { + "epoch": 0.58, + "grad_norm": 1.9045569050008961, + "learning_rate": 3.999922021014812e-06, + "loss": 0.4974, + "step": 4651 + }, + { + "epoch": 0.58, + "grad_norm": 1.513471288790594, + "learning_rate": 3.997952993875805e-06, + "loss": 0.4903, + "step": 4652 + }, + { + "epoch": 0.58, + "grad_norm": 3.3190837653397103, + "learning_rate": 3.995984128626317e-06, + "loss": 0.5218, + "step": 4653 + }, + { + "epoch": 0.58, + "grad_norm": 1.2803737183980128, + "learning_rate": 3.9940154255844355e-06, + "loss": 0.4901, + "step": 4654 + }, + { + "epoch": 0.58, + "grad_norm": 2.324017478162515, + "learning_rate": 3.992046885068221e-06, + "loss": 0.5455, + "step": 4655 + }, + { + "epoch": 0.58, + "grad_norm": 1.405213306853622, + "learning_rate": 3.99007850739571e-06, + "loss": 0.5239, + "step": 4656 + }, + { + "epoch": 0.58, + "grad_norm": 1.4332496852812633, + "learning_rate": 3.988110292884912e-06, + "loss": 0.5049, + "step": 4657 + }, + { + "epoch": 0.58, + "grad_norm": 1.5801465841808873, + "learning_rate": 3.986142241853808e-06, + "loss": 0.5177, + "step": 4658 + }, + { + "epoch": 0.58, + "grad_norm": 1.5017776976421946, + "learning_rate": 3.984174354620353e-06, + "loss": 0.5474, + "step": 4659 + }, + { + "epoch": 0.58, + "grad_norm": 1.264765259932856, + "learning_rate": 3.982206631502478e-06, + "loss": 0.4865, + "step": 4660 + }, + { + "epoch": 0.58, + "grad_norm": 1.8198470905922306, + "learning_rate": 3.980239072818086e-06, + "loss": 0.4662, + "step": 4661 + }, + { + "epoch": 0.58, + "grad_norm": 1.4756018404030682, + "learning_rate": 3.9782716788850525e-06, + "loss": 0.5083, + "step": 4662 + }, + { + "epoch": 0.58, + "grad_norm": 1.4214216069678782, + "learning_rate": 3.9763044500212285e-06, + "loss": 0.5023, + "step": 4663 + }, + { + "epoch": 0.58, + "grad_norm": 1.4574395662824027, + "learning_rate": 3.974337386544436e-06, + "loss": 0.4644, + "step": 4664 + }, + { + "epoch": 0.58, + "grad_norm": 1.4535700532434277, + "learning_rate": 3.972370488772472e-06, + "loss": 0.4926, + "step": 4665 + }, + { + "epoch": 0.58, + "grad_norm": 1.5898246354626455, + "learning_rate": 3.9704037570231055e-06, + "loss": 0.5456, + "step": 4666 + }, + { + "epoch": 0.58, + "grad_norm": 0.6957198744230172, + "learning_rate": 3.968437191614081e-06, + "loss": 0.5038, + "step": 4667 + }, + { + "epoch": 0.58, + "grad_norm": 1.535459803714931, + "learning_rate": 3.966470792863113e-06, + "loss": 0.5132, + "step": 4668 + }, + { + "epoch": 0.58, + "grad_norm": 1.6608193469359567, + "learning_rate": 3.964504561087891e-06, + "loss": 0.4485, + "step": 4669 + }, + { + "epoch": 0.58, + "grad_norm": 2.349331946559182, + "learning_rate": 3.962538496606077e-06, + "loss": 0.5023, + "step": 4670 + }, + { + "epoch": 0.58, + "grad_norm": 1.580207427022666, + "learning_rate": 3.960572599735306e-06, + "loss": 0.4777, + "step": 4671 + }, + { + "epoch": 0.58, + "grad_norm": 1.6782527319642873, + "learning_rate": 3.958606870793184e-06, + "loss": 0.488, + "step": 4672 + }, + { + "epoch": 0.58, + "grad_norm": 1.3316424558043596, + "learning_rate": 3.956641310097296e-06, + "loss": 0.5741, + "step": 4673 + }, + { + "epoch": 0.58, + "grad_norm": 1.3524018346110955, + "learning_rate": 3.954675917965194e-06, + "loss": 0.5005, + "step": 4674 + }, + { + "epoch": 0.58, + "grad_norm": 0.684657285155044, + "learning_rate": 3.952710694714403e-06, + "loss": 0.4864, + "step": 4675 + }, + { + "epoch": 0.58, + "grad_norm": 1.510587123243163, + "learning_rate": 3.9507456406624235e-06, + "loss": 0.5308, + "step": 4676 + }, + { + "epoch": 0.58, + "grad_norm": 1.4383162007434367, + "learning_rate": 3.948780756126726e-06, + "loss": 0.5088, + "step": 4677 + }, + { + "epoch": 0.58, + "grad_norm": 1.2909856888220486, + "learning_rate": 3.946816041424756e-06, + "loss": 0.5403, + "step": 4678 + }, + { + "epoch": 0.58, + "grad_norm": 1.522311388231281, + "learning_rate": 3.94485149687393e-06, + "loss": 0.4866, + "step": 4679 + }, + { + "epoch": 0.58, + "grad_norm": 1.5130355478873698, + "learning_rate": 3.942887122791636e-06, + "loss": 0.5284, + "step": 4680 + }, + { + "epoch": 0.58, + "grad_norm": 1.3370427571409305, + "learning_rate": 3.940922919495239e-06, + "loss": 0.481, + "step": 4681 + }, + { + "epoch": 0.58, + "grad_norm": 1.2204372209939194, + "learning_rate": 3.938958887302072e-06, + "loss": 0.5164, + "step": 4682 + }, + { + "epoch": 0.58, + "grad_norm": 1.8622732708112701, + "learning_rate": 3.9369950265294415e-06, + "loss": 0.4943, + "step": 4683 + }, + { + "epoch": 0.58, + "grad_norm": 0.6782135724024395, + "learning_rate": 3.9350313374946275e-06, + "loss": 0.4423, + "step": 4684 + }, + { + "epoch": 0.58, + "grad_norm": 2.0674398268960297, + "learning_rate": 3.933067820514879e-06, + "loss": 0.5081, + "step": 4685 + }, + { + "epoch": 0.58, + "grad_norm": 1.5423215799903884, + "learning_rate": 3.931104475907423e-06, + "loss": 0.4722, + "step": 4686 + }, + { + "epoch": 0.58, + "grad_norm": 1.455909664098324, + "learning_rate": 3.92914130398945e-06, + "loss": 0.5115, + "step": 4687 + }, + { + "epoch": 0.58, + "grad_norm": 1.4171266629705266, + "learning_rate": 3.92717830507813e-06, + "loss": 0.4569, + "step": 4688 + }, + { + "epoch": 0.58, + "grad_norm": 1.5722967736044233, + "learning_rate": 3.925215479490605e-06, + "loss": 0.519, + "step": 4689 + }, + { + "epoch": 0.58, + "grad_norm": 1.8243100670739596, + "learning_rate": 3.923252827543986e-06, + "loss": 0.4902, + "step": 4690 + }, + { + "epoch": 0.58, + "grad_norm": 1.4910512694948186, + "learning_rate": 3.921290349555355e-06, + "loss": 0.5111, + "step": 4691 + }, + { + "epoch": 0.58, + "grad_norm": 0.6551753415818495, + "learning_rate": 3.9193280458417685e-06, + "loss": 0.527, + "step": 4692 + }, + { + "epoch": 0.58, + "grad_norm": 1.660451709064471, + "learning_rate": 3.917365916720255e-06, + "loss": 0.5305, + "step": 4693 + }, + { + "epoch": 0.58, + "grad_norm": 1.64326306596274, + "learning_rate": 3.915403962507812e-06, + "loss": 0.5228, + "step": 4694 + }, + { + "epoch": 0.58, + "grad_norm": 1.601187680203445, + "learning_rate": 3.9134421835214105e-06, + "loss": 0.5226, + "step": 4695 + }, + { + "epoch": 0.58, + "grad_norm": 1.490661965402863, + "learning_rate": 3.911480580077992e-06, + "loss": 0.5178, + "step": 4696 + }, + { + "epoch": 0.58, + "grad_norm": 1.683396309148199, + "learning_rate": 3.909519152494475e-06, + "loss": 0.5278, + "step": 4697 + }, + { + "epoch": 0.58, + "grad_norm": 1.812167758657147, + "learning_rate": 3.907557901087743e-06, + "loss": 0.5433, + "step": 4698 + }, + { + "epoch": 0.58, + "grad_norm": 1.466698533298413, + "learning_rate": 3.905596826174654e-06, + "loss": 0.518, + "step": 4699 + }, + { + "epoch": 0.58, + "grad_norm": 1.7310256508734216, + "learning_rate": 3.903635928072035e-06, + "loss": 0.4678, + "step": 4700 + }, + { + "epoch": 0.58, + "grad_norm": 1.4647284525075164, + "learning_rate": 3.901675207096689e-06, + "loss": 0.5054, + "step": 4701 + }, + { + "epoch": 0.58, + "grad_norm": 1.4310062514790178, + "learning_rate": 3.899714663565386e-06, + "loss": 0.4812, + "step": 4702 + }, + { + "epoch": 0.58, + "grad_norm": 1.4360374783010557, + "learning_rate": 3.897754297794869e-06, + "loss": 0.5054, + "step": 4703 + }, + { + "epoch": 0.58, + "grad_norm": 1.4011598640375142, + "learning_rate": 3.895794110101854e-06, + "loss": 0.4953, + "step": 4704 + }, + { + "epoch": 0.58, + "grad_norm": 1.3910790082629851, + "learning_rate": 3.893834100803023e-06, + "loss": 0.4697, + "step": 4705 + }, + { + "epoch": 0.58, + "grad_norm": 1.384821909994724, + "learning_rate": 3.891874270215037e-06, + "loss": 0.5501, + "step": 4706 + }, + { + "epoch": 0.58, + "grad_norm": 1.358041590073802, + "learning_rate": 3.889914618654522e-06, + "loss": 0.5326, + "step": 4707 + }, + { + "epoch": 0.58, + "grad_norm": 1.7752222465176617, + "learning_rate": 3.887955146438077e-06, + "loss": 0.5044, + "step": 4708 + }, + { + "epoch": 0.58, + "grad_norm": 1.5484824637392638, + "learning_rate": 3.885995853882273e-06, + "loss": 0.4953, + "step": 4709 + }, + { + "epoch": 0.58, + "grad_norm": 1.1610373215701917, + "learning_rate": 3.884036741303649e-06, + "loss": 0.45, + "step": 4710 + }, + { + "epoch": 0.58, + "grad_norm": 1.276864536888107, + "learning_rate": 3.8820778090187185e-06, + "loss": 0.5035, + "step": 4711 + }, + { + "epoch": 0.58, + "grad_norm": 1.9254796816869655, + "learning_rate": 3.880119057343965e-06, + "loss": 0.5279, + "step": 4712 + }, + { + "epoch": 0.58, + "grad_norm": 1.6475310297625902, + "learning_rate": 3.878160486595837e-06, + "loss": 0.5642, + "step": 4713 + }, + { + "epoch": 0.59, + "grad_norm": 1.4280491895781326, + "learning_rate": 3.876202097090765e-06, + "loss": 0.4728, + "step": 4714 + }, + { + "epoch": 0.59, + "grad_norm": 2.2032458161147614, + "learning_rate": 3.874243889145142e-06, + "loss": 0.528, + "step": 4715 + }, + { + "epoch": 0.59, + "grad_norm": 1.4660037572297808, + "learning_rate": 3.872285863075334e-06, + "loss": 0.5171, + "step": 4716 + }, + { + "epoch": 0.59, + "grad_norm": 1.445124971634497, + "learning_rate": 3.8703280191976764e-06, + "loss": 0.4733, + "step": 4717 + }, + { + "epoch": 0.59, + "grad_norm": 0.6264348500908541, + "learning_rate": 3.868370357828476e-06, + "loss": 0.513, + "step": 4718 + }, + { + "epoch": 0.59, + "grad_norm": 1.7280898543065133, + "learning_rate": 3.866412879284011e-06, + "loss": 0.5092, + "step": 4719 + }, + { + "epoch": 0.59, + "grad_norm": 1.7655472224257922, + "learning_rate": 3.864455583880529e-06, + "loss": 0.5303, + "step": 4720 + }, + { + "epoch": 0.59, + "grad_norm": 1.3950462536660542, + "learning_rate": 3.862498471934248e-06, + "loss": 0.454, + "step": 4721 + }, + { + "epoch": 0.59, + "grad_norm": 1.5446451372213836, + "learning_rate": 3.860541543761358e-06, + "loss": 0.4389, + "step": 4722 + }, + { + "epoch": 0.59, + "grad_norm": 2.202339874550893, + "learning_rate": 3.858584799678017e-06, + "loss": 0.5332, + "step": 4723 + }, + { + "epoch": 0.59, + "grad_norm": 1.7557477548315092, + "learning_rate": 3.8566282400003545e-06, + "loss": 0.5418, + "step": 4724 + }, + { + "epoch": 0.59, + "grad_norm": 1.2698707672176435, + "learning_rate": 3.854671865044469e-06, + "loss": 0.4851, + "step": 4725 + }, + { + "epoch": 0.59, + "grad_norm": 1.6644076653329196, + "learning_rate": 3.852715675126431e-06, + "loss": 0.4325, + "step": 4726 + }, + { + "epoch": 0.59, + "grad_norm": 1.2646724952132085, + "learning_rate": 3.85075967056228e-06, + "loss": 0.5054, + "step": 4727 + }, + { + "epoch": 0.59, + "grad_norm": 1.5026733833240709, + "learning_rate": 3.848803851668026e-06, + "loss": 0.4743, + "step": 4728 + }, + { + "epoch": 0.59, + "grad_norm": 1.4012847881227701, + "learning_rate": 3.8468482187596475e-06, + "loss": 0.4744, + "step": 4729 + }, + { + "epoch": 0.59, + "grad_norm": 1.5112355270821844, + "learning_rate": 3.844892772153097e-06, + "loss": 0.5427, + "step": 4730 + }, + { + "epoch": 0.59, + "grad_norm": 0.7444916837957993, + "learning_rate": 3.842937512164292e-06, + "loss": 0.5219, + "step": 4731 + }, + { + "epoch": 0.59, + "grad_norm": 2.3893452549099528, + "learning_rate": 3.840982439109122e-06, + "loss": 0.4558, + "step": 4732 + }, + { + "epoch": 0.59, + "grad_norm": 1.7086719823044167, + "learning_rate": 3.839027553303447e-06, + "loss": 0.5013, + "step": 4733 + }, + { + "epoch": 0.59, + "grad_norm": 1.4469872927187486, + "learning_rate": 3.837072855063097e-06, + "loss": 0.5551, + "step": 4734 + }, + { + "epoch": 0.59, + "grad_norm": 1.2457288599780056, + "learning_rate": 3.835118344703868e-06, + "loss": 0.4889, + "step": 4735 + }, + { + "epoch": 0.59, + "grad_norm": 1.2896620308751967, + "learning_rate": 3.833164022541529e-06, + "loss": 0.4871, + "step": 4736 + }, + { + "epoch": 0.59, + "grad_norm": 2.03442397818743, + "learning_rate": 3.83120988889182e-06, + "loss": 0.5277, + "step": 4737 + }, + { + "epoch": 0.59, + "grad_norm": 1.3622425574642558, + "learning_rate": 3.829255944070445e-06, + "loss": 0.4914, + "step": 4738 + }, + { + "epoch": 0.59, + "grad_norm": 2.9296216351758595, + "learning_rate": 3.8273021883930865e-06, + "loss": 0.4908, + "step": 4739 + }, + { + "epoch": 0.59, + "grad_norm": 1.5253787207780585, + "learning_rate": 3.825348622175386e-06, + "loss": 0.4846, + "step": 4740 + }, + { + "epoch": 0.59, + "grad_norm": 2.187766035709169, + "learning_rate": 3.823395245732961e-06, + "loss": 0.4719, + "step": 4741 + }, + { + "epoch": 0.59, + "grad_norm": 1.4975496186557569, + "learning_rate": 3.8214420593813975e-06, + "loss": 0.5083, + "step": 4742 + }, + { + "epoch": 0.59, + "grad_norm": 0.7297899522931308, + "learning_rate": 3.819489063436248e-06, + "loss": 0.5002, + "step": 4743 + }, + { + "epoch": 0.59, + "grad_norm": 1.484072419508185, + "learning_rate": 3.817536258213038e-06, + "loss": 0.4853, + "step": 4744 + }, + { + "epoch": 0.59, + "grad_norm": 1.5021104486952073, + "learning_rate": 3.81558364402726e-06, + "loss": 0.4509, + "step": 4745 + }, + { + "epoch": 0.59, + "grad_norm": 1.6685918184482609, + "learning_rate": 3.8136312211943736e-06, + "loss": 0.5322, + "step": 4746 + }, + { + "epoch": 0.59, + "grad_norm": 1.6368358729433512, + "learning_rate": 3.8116789900298135e-06, + "loss": 0.4764, + "step": 4747 + }, + { + "epoch": 0.59, + "grad_norm": 1.43761500492689, + "learning_rate": 3.809726950848979e-06, + "loss": 0.4895, + "step": 4748 + }, + { + "epoch": 0.59, + "grad_norm": 1.5978788025111923, + "learning_rate": 3.8077751039672377e-06, + "loss": 0.5373, + "step": 4749 + }, + { + "epoch": 0.59, + "grad_norm": 1.2847629859841037, + "learning_rate": 3.805823449699929e-06, + "loss": 0.4763, + "step": 4750 + }, + { + "epoch": 0.59, + "grad_norm": 1.7553604371536287, + "learning_rate": 3.80387198836236e-06, + "loss": 0.477, + "step": 4751 + }, + { + "epoch": 0.59, + "grad_norm": 1.8653801195128799, + "learning_rate": 3.801920720269805e-06, + "loss": 0.4816, + "step": 4752 + }, + { + "epoch": 0.59, + "grad_norm": 1.6215137768031846, + "learning_rate": 3.7999696457375094e-06, + "loss": 0.4689, + "step": 4753 + }, + { + "epoch": 0.59, + "grad_norm": 1.5537199741329855, + "learning_rate": 3.7980187650806855e-06, + "loss": 0.497, + "step": 4754 + }, + { + "epoch": 0.59, + "grad_norm": 1.3683582861339472, + "learning_rate": 3.7960680786145177e-06, + "loss": 0.4956, + "step": 4755 + }, + { + "epoch": 0.59, + "grad_norm": 1.3744261784431637, + "learning_rate": 3.7941175866541545e-06, + "loss": 0.4658, + "step": 4756 + }, + { + "epoch": 0.59, + "grad_norm": 1.834936259971978, + "learning_rate": 3.7921672895147154e-06, + "loss": 0.4878, + "step": 4757 + }, + { + "epoch": 0.59, + "grad_norm": 0.6618545833319962, + "learning_rate": 3.7902171875112893e-06, + "loss": 0.4895, + "step": 4758 + }, + { + "epoch": 0.59, + "grad_norm": 3.5550754129562883, + "learning_rate": 3.7882672809589303e-06, + "loss": 0.5166, + "step": 4759 + }, + { + "epoch": 0.59, + "grad_norm": 1.6752689455613092, + "learning_rate": 3.786317570172665e-06, + "loss": 0.4803, + "step": 4760 + }, + { + "epoch": 0.59, + "grad_norm": 1.817067473755128, + "learning_rate": 3.784368055467485e-06, + "loss": 0.4799, + "step": 4761 + }, + { + "epoch": 0.59, + "grad_norm": 1.796483415824333, + "learning_rate": 3.78241873715835e-06, + "loss": 0.4372, + "step": 4762 + }, + { + "epoch": 0.59, + "grad_norm": 1.4581358695876308, + "learning_rate": 3.7804696155601946e-06, + "loss": 0.5114, + "step": 4763 + }, + { + "epoch": 0.59, + "grad_norm": 1.5162840470971275, + "learning_rate": 3.7785206909879125e-06, + "loss": 0.5593, + "step": 4764 + }, + { + "epoch": 0.59, + "grad_norm": 1.7687354735721073, + "learning_rate": 3.7765719637563704e-06, + "loss": 0.4865, + "step": 4765 + }, + { + "epoch": 0.59, + "grad_norm": 0.6016449273847941, + "learning_rate": 3.7746234341804023e-06, + "loss": 0.4856, + "step": 4766 + }, + { + "epoch": 0.59, + "grad_norm": 1.5925792667624954, + "learning_rate": 3.772675102574811e-06, + "loss": 0.4904, + "step": 4767 + }, + { + "epoch": 0.59, + "grad_norm": 1.5491375986862115, + "learning_rate": 3.7707269692543657e-06, + "loss": 0.4987, + "step": 4768 + }, + { + "epoch": 0.59, + "grad_norm": 4.024481064899184, + "learning_rate": 3.7687790345338054e-06, + "loss": 0.5366, + "step": 4769 + }, + { + "epoch": 0.59, + "grad_norm": 1.5008907912251712, + "learning_rate": 3.7668312987278333e-06, + "loss": 0.4803, + "step": 4770 + }, + { + "epoch": 0.59, + "grad_norm": 1.4008267317117975, + "learning_rate": 3.764883762151128e-06, + "loss": 0.4885, + "step": 4771 + }, + { + "epoch": 0.59, + "grad_norm": 1.3949664117379335, + "learning_rate": 3.762936425118329e-06, + "loss": 0.5142, + "step": 4772 + }, + { + "epoch": 0.59, + "grad_norm": 1.3049393678095358, + "learning_rate": 3.760989287944047e-06, + "loss": 0.5097, + "step": 4773 + }, + { + "epoch": 0.59, + "grad_norm": 1.5641320129466818, + "learning_rate": 3.7590423509428557e-06, + "loss": 0.5167, + "step": 4774 + }, + { + "epoch": 0.59, + "grad_norm": 1.4951590978042526, + "learning_rate": 3.7570956144293025e-06, + "loss": 0.5374, + "step": 4775 + }, + { + "epoch": 0.59, + "grad_norm": 4.330005347816297, + "learning_rate": 3.7551490787178996e-06, + "loss": 0.5422, + "step": 4776 + }, + { + "epoch": 0.59, + "grad_norm": 1.3316052030847458, + "learning_rate": 3.753202744123126e-06, + "loss": 0.453, + "step": 4777 + }, + { + "epoch": 0.59, + "grad_norm": 1.6324208318735371, + "learning_rate": 3.7512566109594308e-06, + "loss": 0.5317, + "step": 4778 + }, + { + "epoch": 0.59, + "grad_norm": 1.5633976503408085, + "learning_rate": 3.7493106795412254e-06, + "loss": 0.4704, + "step": 4779 + }, + { + "epoch": 0.59, + "grad_norm": 1.4535976872534775, + "learning_rate": 3.747364950182897e-06, + "loss": 0.5258, + "step": 4780 + }, + { + "epoch": 0.59, + "grad_norm": 5.231741944089487, + "learning_rate": 3.7454194231987927e-06, + "loss": 0.4684, + "step": 4781 + }, + { + "epoch": 0.59, + "grad_norm": 1.2879488975320648, + "learning_rate": 3.7434740989032316e-06, + "loss": 0.5012, + "step": 4782 + }, + { + "epoch": 0.59, + "grad_norm": 0.7339082361721024, + "learning_rate": 3.7415289776104953e-06, + "loss": 0.4991, + "step": 4783 + }, + { + "epoch": 0.59, + "grad_norm": 1.8207277057420377, + "learning_rate": 3.739584059634836e-06, + "loss": 0.4733, + "step": 4784 + }, + { + "epoch": 0.59, + "grad_norm": 1.816680471066737, + "learning_rate": 3.7376393452904725e-06, + "loss": 0.5114, + "step": 4785 + }, + { + "epoch": 0.59, + "grad_norm": 1.4300159454723032, + "learning_rate": 3.7356948348915913e-06, + "loss": 0.4716, + "step": 4786 + }, + { + "epoch": 0.59, + "grad_norm": 1.466517194302703, + "learning_rate": 3.7337505287523413e-06, + "loss": 0.5039, + "step": 4787 + }, + { + "epoch": 0.59, + "grad_norm": 1.6332080984609707, + "learning_rate": 3.731806427186848e-06, + "loss": 0.4736, + "step": 4788 + }, + { + "epoch": 0.59, + "grad_norm": 1.521013037464149, + "learning_rate": 3.7298625305091963e-06, + "loss": 0.4883, + "step": 4789 + }, + { + "epoch": 0.59, + "grad_norm": 1.5662771829569915, + "learning_rate": 3.7279188390334385e-06, + "loss": 0.4823, + "step": 4790 + }, + { + "epoch": 0.59, + "grad_norm": 1.4664761110857254, + "learning_rate": 3.7259753530735964e-06, + "loss": 0.5037, + "step": 4791 + }, + { + "epoch": 0.59, + "grad_norm": 1.406015218659607, + "learning_rate": 3.7240320729436575e-06, + "loss": 0.5087, + "step": 4792 + }, + { + "epoch": 0.59, + "grad_norm": 1.4644074732784458, + "learning_rate": 3.7220889989575737e-06, + "loss": 0.4667, + "step": 4793 + }, + { + "epoch": 0.59, + "grad_norm": 1.6162804324068345, + "learning_rate": 3.7201461314292665e-06, + "loss": 0.5539, + "step": 4794 + }, + { + "epoch": 0.6, + "grad_norm": 1.9052845018302897, + "learning_rate": 3.7182034706726224e-06, + "loss": 0.4905, + "step": 4795 + }, + { + "epoch": 0.6, + "grad_norm": 1.3987498526053814, + "learning_rate": 3.716261017001498e-06, + "loss": 0.4913, + "step": 4796 + }, + { + "epoch": 0.6, + "grad_norm": 1.653132447704072, + "learning_rate": 3.714318770729713e-06, + "loss": 0.5096, + "step": 4797 + }, + { + "epoch": 0.6, + "grad_norm": 1.4485827282911432, + "learning_rate": 3.7123767321710538e-06, + "loss": 0.5258, + "step": 4798 + }, + { + "epoch": 0.6, + "grad_norm": 1.3441229709454283, + "learning_rate": 3.710434901639274e-06, + "loss": 0.5023, + "step": 4799 + }, + { + "epoch": 0.6, + "grad_norm": 1.5096540584449896, + "learning_rate": 3.708493279448093e-06, + "loss": 0.4974, + "step": 4800 + }, + { + "epoch": 0.6, + "grad_norm": 1.3348271010756676, + "learning_rate": 3.7065518659111982e-06, + "loss": 0.5133, + "step": 4801 + }, + { + "epoch": 0.6, + "grad_norm": 1.3362769274646589, + "learning_rate": 3.7046106613422404e-06, + "loss": 0.4978, + "step": 4802 + }, + { + "epoch": 0.6, + "grad_norm": 0.6586552427143136, + "learning_rate": 3.7026696660548365e-06, + "loss": 0.5204, + "step": 4803 + }, + { + "epoch": 0.6, + "grad_norm": 1.2970674817252872, + "learning_rate": 3.700728880362577e-06, + "loss": 0.5126, + "step": 4804 + }, + { + "epoch": 0.6, + "grad_norm": 1.2692411052824943, + "learning_rate": 3.6987883045790093e-06, + "loss": 0.4919, + "step": 4805 + }, + { + "epoch": 0.6, + "grad_norm": 7.544857137361854, + "learning_rate": 3.6968479390176503e-06, + "loss": 0.4943, + "step": 4806 + }, + { + "epoch": 0.6, + "grad_norm": 1.6111067019105714, + "learning_rate": 3.6949077839919852e-06, + "loss": 0.5665, + "step": 4807 + }, + { + "epoch": 0.6, + "grad_norm": 1.5148276199786361, + "learning_rate": 3.692967839815461e-06, + "loss": 0.5128, + "step": 4808 + }, + { + "epoch": 0.6, + "grad_norm": 1.9289750603078077, + "learning_rate": 3.6910281068014935e-06, + "loss": 0.5046, + "step": 4809 + }, + { + "epoch": 0.6, + "grad_norm": 1.3571727913310128, + "learning_rate": 3.689088585263464e-06, + "loss": 0.544, + "step": 4810 + }, + { + "epoch": 0.6, + "grad_norm": 1.514263873049238, + "learning_rate": 3.687149275514718e-06, + "loss": 0.4944, + "step": 4811 + }, + { + "epoch": 0.6, + "grad_norm": 1.320448446471603, + "learning_rate": 3.68521017786857e-06, + "loss": 0.4634, + "step": 4812 + }, + { + "epoch": 0.6, + "grad_norm": 1.4253421646515063, + "learning_rate": 3.6832712926382978e-06, + "loss": 0.4323, + "step": 4813 + }, + { + "epoch": 0.6, + "grad_norm": 1.6248156405188252, + "learning_rate": 3.6813326201371448e-06, + "loss": 0.4576, + "step": 4814 + }, + { + "epoch": 0.6, + "grad_norm": 1.5491421709460802, + "learning_rate": 3.679394160678321e-06, + "loss": 0.4942, + "step": 4815 + }, + { + "epoch": 0.6, + "grad_norm": 1.5293882527430553, + "learning_rate": 3.677455914575001e-06, + "loss": 0.5424, + "step": 4816 + }, + { + "epoch": 0.6, + "grad_norm": 2.6610870035965135, + "learning_rate": 3.675517882140326e-06, + "loss": 0.491, + "step": 4817 + }, + { + "epoch": 0.6, + "grad_norm": 1.5876662699587831, + "learning_rate": 3.673580063687402e-06, + "loss": 0.5051, + "step": 4818 + }, + { + "epoch": 0.6, + "grad_norm": 0.750905294107381, + "learning_rate": 3.671642459529301e-06, + "loss": 0.4937, + "step": 4819 + }, + { + "epoch": 0.6, + "grad_norm": 1.7565409567351697, + "learning_rate": 3.6697050699790586e-06, + "loss": 0.5273, + "step": 4820 + }, + { + "epoch": 0.6, + "grad_norm": 1.5630185373429069, + "learning_rate": 3.6677678953496797e-06, + "loss": 0.536, + "step": 4821 + }, + { + "epoch": 0.6, + "grad_norm": 1.5729595543148676, + "learning_rate": 3.66583093595413e-06, + "loss": 0.5653, + "step": 4822 + }, + { + "epoch": 0.6, + "grad_norm": 1.3837247868854905, + "learning_rate": 3.663894192105344e-06, + "loss": 0.5577, + "step": 4823 + }, + { + "epoch": 0.6, + "grad_norm": 1.7157111912006533, + "learning_rate": 3.6619576641162176e-06, + "loss": 0.5539, + "step": 4824 + }, + { + "epoch": 0.6, + "grad_norm": 0.6563532770825058, + "learning_rate": 3.660021352299615e-06, + "loss": 0.5398, + "step": 4825 + }, + { + "epoch": 0.6, + "grad_norm": 1.4097431835544845, + "learning_rate": 3.658085256968365e-06, + "loss": 0.5385, + "step": 4826 + }, + { + "epoch": 0.6, + "grad_norm": 1.360389593574117, + "learning_rate": 3.65614937843526e-06, + "loss": 0.5052, + "step": 4827 + }, + { + "epoch": 0.6, + "grad_norm": 1.353844494224524, + "learning_rate": 3.6542137170130576e-06, + "loss": 0.4953, + "step": 4828 + }, + { + "epoch": 0.6, + "grad_norm": 1.422660488634164, + "learning_rate": 3.652278273014482e-06, + "loss": 0.5504, + "step": 4829 + }, + { + "epoch": 0.6, + "grad_norm": 1.5049937847673278, + "learning_rate": 3.650343046752222e-06, + "loss": 0.4411, + "step": 4830 + }, + { + "epoch": 0.6, + "grad_norm": 2.8728735408056143, + "learning_rate": 3.6484080385389296e-06, + "loss": 0.4558, + "step": 4831 + }, + { + "epoch": 0.6, + "grad_norm": 1.3355117575474822, + "learning_rate": 3.6464732486872216e-06, + "loss": 0.4604, + "step": 4832 + }, + { + "epoch": 0.6, + "grad_norm": 1.2747248934156539, + "learning_rate": 3.6445386775096813e-06, + "loss": 0.4732, + "step": 4833 + }, + { + "epoch": 0.6, + "grad_norm": 1.3518302410791765, + "learning_rate": 3.642604325318856e-06, + "loss": 0.4815, + "step": 4834 + }, + { + "epoch": 0.6, + "grad_norm": 1.4051502217298926, + "learning_rate": 3.640670192427257e-06, + "loss": 0.4661, + "step": 4835 + }, + { + "epoch": 0.6, + "grad_norm": 1.4809715490321091, + "learning_rate": 3.6387362791473583e-06, + "loss": 0.5039, + "step": 4836 + }, + { + "epoch": 0.6, + "grad_norm": 1.396861806595747, + "learning_rate": 3.6368025857916044e-06, + "loss": 0.4719, + "step": 4837 + }, + { + "epoch": 0.6, + "grad_norm": 1.4157885612242203, + "learning_rate": 3.6348691126723984e-06, + "loss": 0.5259, + "step": 4838 + }, + { + "epoch": 0.6, + "grad_norm": 1.7313886903818687, + "learning_rate": 3.6329358601021103e-06, + "loss": 0.5609, + "step": 4839 + }, + { + "epoch": 0.6, + "grad_norm": 1.4253943377103053, + "learning_rate": 3.6310028283930743e-06, + "loss": 0.5139, + "step": 4840 + }, + { + "epoch": 0.6, + "grad_norm": 2.849074190948096, + "learning_rate": 3.629070017857588e-06, + "loss": 0.4811, + "step": 4841 + }, + { + "epoch": 0.6, + "grad_norm": 1.3268550582958893, + "learning_rate": 3.6271374288079142e-06, + "loss": 0.4985, + "step": 4842 + }, + { + "epoch": 0.6, + "grad_norm": 1.3070111115301872, + "learning_rate": 3.6252050615562805e-06, + "loss": 0.4931, + "step": 4843 + }, + { + "epoch": 0.6, + "grad_norm": 1.3391127564835532, + "learning_rate": 3.623272916414876e-06, + "loss": 0.5093, + "step": 4844 + }, + { + "epoch": 0.6, + "grad_norm": 1.3718861741774446, + "learning_rate": 3.621340993695858e-06, + "loss": 0.5138, + "step": 4845 + }, + { + "epoch": 0.6, + "grad_norm": 1.4276519023118137, + "learning_rate": 3.6194092937113444e-06, + "loss": 0.5519, + "step": 4846 + }, + { + "epoch": 0.6, + "grad_norm": 1.672717431707128, + "learning_rate": 3.617477816773419e-06, + "loss": 0.4756, + "step": 4847 + }, + { + "epoch": 0.6, + "grad_norm": 1.588454212554117, + "learning_rate": 3.6155465631941276e-06, + "loss": 0.5124, + "step": 4848 + }, + { + "epoch": 0.6, + "grad_norm": 0.7098411838422147, + "learning_rate": 3.6136155332854835e-06, + "loss": 0.5085, + "step": 4849 + }, + { + "epoch": 0.6, + "grad_norm": 1.4768604915329149, + "learning_rate": 3.611684727359459e-06, + "loss": 0.513, + "step": 4850 + }, + { + "epoch": 0.6, + "grad_norm": 1.6237177436087704, + "learning_rate": 3.6097541457279945e-06, + "loss": 0.5036, + "step": 4851 + }, + { + "epoch": 0.6, + "grad_norm": 2.008599890001384, + "learning_rate": 3.607823788702991e-06, + "loss": 0.5041, + "step": 4852 + }, + { + "epoch": 0.6, + "grad_norm": 1.33530351725401, + "learning_rate": 3.6058936565963158e-06, + "loss": 0.5178, + "step": 4853 + }, + { + "epoch": 0.6, + "grad_norm": 1.5829084651240015, + "learning_rate": 3.6039637497197995e-06, + "loss": 0.4938, + "step": 4854 + }, + { + "epoch": 0.6, + "grad_norm": 3.9056804696592824, + "learning_rate": 3.602034068385234e-06, + "loss": 0.5461, + "step": 4855 + }, + { + "epoch": 0.6, + "grad_norm": 1.9163992244200692, + "learning_rate": 3.6001046129043778e-06, + "loss": 0.4962, + "step": 4856 + }, + { + "epoch": 0.6, + "grad_norm": 0.7212350091995289, + "learning_rate": 3.59817538358895e-06, + "loss": 0.5098, + "step": 4857 + }, + { + "epoch": 0.6, + "grad_norm": 1.9415046450085558, + "learning_rate": 3.596246380750635e-06, + "loss": 0.5487, + "step": 4858 + }, + { + "epoch": 0.6, + "grad_norm": 1.5949102498616452, + "learning_rate": 3.5943176047010807e-06, + "loss": 0.6143, + "step": 4859 + }, + { + "epoch": 0.6, + "grad_norm": 1.630007389484059, + "learning_rate": 3.592389055751897e-06, + "loss": 0.5065, + "step": 4860 + }, + { + "epoch": 0.6, + "grad_norm": 1.476053795816333, + "learning_rate": 3.590460734214657e-06, + "loss": 0.4923, + "step": 4861 + }, + { + "epoch": 0.6, + "grad_norm": 1.8988176215501187, + "learning_rate": 3.5885326404009022e-06, + "loss": 0.5095, + "step": 4862 + }, + { + "epoch": 0.6, + "grad_norm": 1.7075130802141478, + "learning_rate": 3.5866047746221294e-06, + "loss": 0.5574, + "step": 4863 + }, + { + "epoch": 0.6, + "grad_norm": 1.96164473093287, + "learning_rate": 3.5846771371898037e-06, + "loss": 0.5252, + "step": 4864 + }, + { + "epoch": 0.6, + "grad_norm": 1.7917743538361852, + "learning_rate": 3.582749728415351e-06, + "loss": 0.5252, + "step": 4865 + }, + { + "epoch": 0.6, + "grad_norm": 0.645792885373291, + "learning_rate": 3.580822548610162e-06, + "loss": 0.4615, + "step": 4866 + }, + { + "epoch": 0.6, + "grad_norm": 1.4344210864773772, + "learning_rate": 3.5788955980855894e-06, + "loss": 0.4666, + "step": 4867 + }, + { + "epoch": 0.6, + "grad_norm": 1.3952139809581494, + "learning_rate": 3.5769688771529486e-06, + "loss": 0.4653, + "step": 4868 + }, + { + "epoch": 0.6, + "grad_norm": 0.6897052423443377, + "learning_rate": 3.575042386123517e-06, + "loss": 0.5299, + "step": 4869 + }, + { + "epoch": 0.6, + "grad_norm": 1.5339288039723356, + "learning_rate": 3.57311612530854e-06, + "loss": 0.4769, + "step": 4870 + }, + { + "epoch": 0.6, + "grad_norm": 3.188281057473945, + "learning_rate": 3.5711900950192204e-06, + "loss": 0.4984, + "step": 4871 + }, + { + "epoch": 0.6, + "grad_norm": 1.511980292628914, + "learning_rate": 3.5692642955667235e-06, + "loss": 0.48, + "step": 4872 + }, + { + "epoch": 0.6, + "grad_norm": 1.362985578204832, + "learning_rate": 3.5673387272621805e-06, + "loss": 0.415, + "step": 4873 + }, + { + "epoch": 0.6, + "grad_norm": 1.7036428859732629, + "learning_rate": 3.565413390416684e-06, + "loss": 0.5269, + "step": 4874 + }, + { + "epoch": 0.6, + "grad_norm": 2.012695203425002, + "learning_rate": 3.56348828534129e-06, + "loss": 0.4847, + "step": 4875 + }, + { + "epoch": 0.61, + "grad_norm": 1.4546222976760228, + "learning_rate": 3.5615634123470143e-06, + "loss": 0.4663, + "step": 4876 + }, + { + "epoch": 0.61, + "grad_norm": 1.5513891298669515, + "learning_rate": 3.5596387717448354e-06, + "loss": 0.5032, + "step": 4877 + }, + { + "epoch": 0.61, + "grad_norm": 2.0380830917483737, + "learning_rate": 3.5577143638457014e-06, + "loss": 0.4912, + "step": 4878 + }, + { + "epoch": 0.61, + "grad_norm": 1.600213084538027, + "learning_rate": 3.555790188960514e-06, + "loss": 0.5027, + "step": 4879 + }, + { + "epoch": 0.61, + "grad_norm": 3.7253184447834027, + "learning_rate": 3.5538662474001414e-06, + "loss": 0.4657, + "step": 4880 + }, + { + "epoch": 0.61, + "grad_norm": 1.390200812178653, + "learning_rate": 3.551942539475414e-06, + "loss": 0.5211, + "step": 4881 + }, + { + "epoch": 0.61, + "grad_norm": 1.3614362906935606, + "learning_rate": 3.550019065497121e-06, + "loss": 0.5094, + "step": 4882 + }, + { + "epoch": 0.61, + "grad_norm": 1.4460947953267989, + "learning_rate": 3.5480958257760185e-06, + "loss": 0.4452, + "step": 4883 + }, + { + "epoch": 0.61, + "grad_norm": 1.7139647753932892, + "learning_rate": 3.546172820622823e-06, + "loss": 0.5304, + "step": 4884 + }, + { + "epoch": 0.61, + "grad_norm": 1.6558235733830828, + "learning_rate": 3.54425005034821e-06, + "loss": 0.442, + "step": 4885 + }, + { + "epoch": 0.61, + "grad_norm": 1.5570124837515966, + "learning_rate": 3.5423275152628245e-06, + "loss": 0.5011, + "step": 4886 + }, + { + "epoch": 0.61, + "grad_norm": 1.4332172878692637, + "learning_rate": 3.540405215677267e-06, + "loss": 0.5, + "step": 4887 + }, + { + "epoch": 0.61, + "grad_norm": 1.6731989318783418, + "learning_rate": 3.5384831519021024e-06, + "loss": 0.5592, + "step": 4888 + }, + { + "epoch": 0.61, + "grad_norm": 1.3510517557984159, + "learning_rate": 3.536561324247856e-06, + "loss": 0.527, + "step": 4889 + }, + { + "epoch": 0.61, + "grad_norm": 1.6372062487205699, + "learning_rate": 3.5346397330250176e-06, + "loss": 0.505, + "step": 4890 + }, + { + "epoch": 0.61, + "grad_norm": 1.4613237336873037, + "learning_rate": 3.532718378544035e-06, + "loss": 0.5313, + "step": 4891 + }, + { + "epoch": 0.61, + "grad_norm": 1.5685237927072886, + "learning_rate": 3.530797261115321e-06, + "loss": 0.5362, + "step": 4892 + }, + { + "epoch": 0.61, + "grad_norm": 1.340615457625864, + "learning_rate": 3.5288763810492486e-06, + "loss": 0.5179, + "step": 4893 + }, + { + "epoch": 0.61, + "grad_norm": 1.48299723682518, + "learning_rate": 3.5269557386561524e-06, + "loss": 0.4949, + "step": 4894 + }, + { + "epoch": 0.61, + "grad_norm": 1.1982134842143222, + "learning_rate": 3.5250353342463318e-06, + "loss": 0.4733, + "step": 4895 + }, + { + "epoch": 0.61, + "grad_norm": 1.3323531179160906, + "learning_rate": 3.5231151681300426e-06, + "loss": 0.4817, + "step": 4896 + }, + { + "epoch": 0.61, + "grad_norm": 2.1072391564702007, + "learning_rate": 3.5211952406175056e-06, + "loss": 0.5162, + "step": 4897 + }, + { + "epoch": 0.61, + "grad_norm": 1.9360651071059258, + "learning_rate": 3.5192755520189013e-06, + "loss": 0.4815, + "step": 4898 + }, + { + "epoch": 0.61, + "grad_norm": 1.316168543249597, + "learning_rate": 3.5173561026443737e-06, + "loss": 0.4946, + "step": 4899 + }, + { + "epoch": 0.61, + "grad_norm": 1.5539248601419484, + "learning_rate": 3.5154368928040255e-06, + "loss": 0.5328, + "step": 4900 + }, + { + "epoch": 0.61, + "grad_norm": 1.452887193496379, + "learning_rate": 3.513517922807922e-06, + "loss": 0.4714, + "step": 4901 + }, + { + "epoch": 0.61, + "grad_norm": 1.2772497753613659, + "learning_rate": 3.511599192966087e-06, + "loss": 0.5088, + "step": 4902 + }, + { + "epoch": 0.61, + "grad_norm": 1.6841954429457333, + "learning_rate": 3.5096807035885134e-06, + "loss": 0.5319, + "step": 4903 + }, + { + "epoch": 0.61, + "grad_norm": 0.6568211084906097, + "learning_rate": 3.5077624549851472e-06, + "loss": 0.4766, + "step": 4904 + }, + { + "epoch": 0.61, + "grad_norm": 1.251958650680514, + "learning_rate": 3.5058444474659e-06, + "loss": 0.4197, + "step": 4905 + }, + { + "epoch": 0.61, + "grad_norm": 1.3744636066787468, + "learning_rate": 3.503926681340641e-06, + "loss": 0.4747, + "step": 4906 + }, + { + "epoch": 0.61, + "grad_norm": 1.7970179104924786, + "learning_rate": 3.5020091569192025e-06, + "loss": 0.468, + "step": 4907 + }, + { + "epoch": 0.61, + "grad_norm": 2.1002716500361447, + "learning_rate": 3.500091874511379e-06, + "loss": 0.4674, + "step": 4908 + }, + { + "epoch": 0.61, + "grad_norm": 1.7344236020353803, + "learning_rate": 3.4981748344269218e-06, + "loss": 0.5158, + "step": 4909 + }, + { + "epoch": 0.61, + "grad_norm": 1.33071499329348, + "learning_rate": 3.496258036975547e-06, + "loss": 0.5131, + "step": 4910 + }, + { + "epoch": 0.61, + "grad_norm": 1.5623268444562706, + "learning_rate": 3.494341482466931e-06, + "loss": 0.5587, + "step": 4911 + }, + { + "epoch": 0.61, + "grad_norm": 1.2844120808059343, + "learning_rate": 3.492425171210708e-06, + "loss": 0.5187, + "step": 4912 + }, + { + "epoch": 0.61, + "grad_norm": 2.0085068307255383, + "learning_rate": 3.4905091035164775e-06, + "loss": 0.5232, + "step": 4913 + }, + { + "epoch": 0.61, + "grad_norm": 1.3519355908870683, + "learning_rate": 3.4885932796937948e-06, + "loss": 0.5681, + "step": 4914 + }, + { + "epoch": 0.61, + "grad_norm": 1.623680657057162, + "learning_rate": 3.48667770005218e-06, + "loss": 0.4876, + "step": 4915 + }, + { + "epoch": 0.61, + "grad_norm": 1.3509431507059002, + "learning_rate": 3.4847623649011104e-06, + "loss": 0.5221, + "step": 4916 + }, + { + "epoch": 0.61, + "grad_norm": 1.7033636091227293, + "learning_rate": 3.4828472745500256e-06, + "loss": 0.5763, + "step": 4917 + }, + { + "epoch": 0.61, + "grad_norm": 1.4902431059393642, + "learning_rate": 3.4809324293083244e-06, + "loss": 0.4778, + "step": 4918 + }, + { + "epoch": 0.61, + "grad_norm": 1.663639956345997, + "learning_rate": 3.479017829485369e-06, + "loss": 0.4622, + "step": 4919 + }, + { + "epoch": 0.61, + "grad_norm": 1.2922645060372164, + "learning_rate": 3.4771034753904798e-06, + "loss": 0.4629, + "step": 4920 + }, + { + "epoch": 0.61, + "grad_norm": 1.7922928399840536, + "learning_rate": 3.4751893673329363e-06, + "loss": 0.4866, + "step": 4921 + }, + { + "epoch": 0.61, + "grad_norm": 1.5676006214673102, + "learning_rate": 3.473275505621979e-06, + "loss": 0.5136, + "step": 4922 + }, + { + "epoch": 0.61, + "grad_norm": 1.3092971392638122, + "learning_rate": 3.4713618905668103e-06, + "loss": 0.5068, + "step": 4923 + }, + { + "epoch": 0.61, + "grad_norm": 1.4406852634601057, + "learning_rate": 3.469448522476592e-06, + "loss": 0.5243, + "step": 4924 + }, + { + "epoch": 0.61, + "grad_norm": 1.5603319740314827, + "learning_rate": 3.4675354016604433e-06, + "loss": 0.5204, + "step": 4925 + }, + { + "epoch": 0.61, + "grad_norm": 0.663970478353552, + "learning_rate": 3.465622528427447e-06, + "loss": 0.4964, + "step": 4926 + }, + { + "epoch": 0.61, + "grad_norm": 1.3333166367010005, + "learning_rate": 3.463709903086646e-06, + "loss": 0.5025, + "step": 4927 + }, + { + "epoch": 0.61, + "grad_norm": 1.271444599665555, + "learning_rate": 3.4617975259470403e-06, + "loss": 0.4348, + "step": 4928 + }, + { + "epoch": 0.61, + "grad_norm": 1.3694773642972735, + "learning_rate": 3.459885397317592e-06, + "loss": 0.4865, + "step": 4929 + }, + { + "epoch": 0.61, + "grad_norm": 1.4734405859367252, + "learning_rate": 3.4579735175072217e-06, + "loss": 0.4868, + "step": 4930 + }, + { + "epoch": 0.61, + "grad_norm": 1.5059492843366027, + "learning_rate": 3.456061886824811e-06, + "loss": 0.5086, + "step": 4931 + }, + { + "epoch": 0.61, + "grad_norm": 1.5004990367175046, + "learning_rate": 3.4541505055791993e-06, + "loss": 0.46, + "step": 4932 + }, + { + "epoch": 0.61, + "grad_norm": 1.6902157867585337, + "learning_rate": 3.4522393740791887e-06, + "loss": 0.4743, + "step": 4933 + }, + { + "epoch": 0.61, + "grad_norm": 1.3336540061732831, + "learning_rate": 3.4503284926335385e-06, + "loss": 0.4962, + "step": 4934 + }, + { + "epoch": 0.61, + "grad_norm": 1.4185962631748819, + "learning_rate": 3.4484178615509676e-06, + "loss": 0.4756, + "step": 4935 + }, + { + "epoch": 0.61, + "grad_norm": 1.443001312748826, + "learning_rate": 3.446507481140157e-06, + "loss": 0.4472, + "step": 4936 + }, + { + "epoch": 0.61, + "grad_norm": 0.6777553610363183, + "learning_rate": 3.4445973517097453e-06, + "loss": 0.5026, + "step": 4937 + }, + { + "epoch": 0.61, + "grad_norm": 1.4188816330877883, + "learning_rate": 3.44268747356833e-06, + "loss": 0.4978, + "step": 4938 + }, + { + "epoch": 0.61, + "grad_norm": 1.5110459009189268, + "learning_rate": 3.440777847024469e-06, + "loss": 0.5127, + "step": 4939 + }, + { + "epoch": 0.61, + "grad_norm": 1.6410841022146765, + "learning_rate": 3.438868472386679e-06, + "loss": 0.4696, + "step": 4940 + }, + { + "epoch": 0.61, + "grad_norm": 1.4175618287038916, + "learning_rate": 3.436959349963437e-06, + "loss": 0.5367, + "step": 4941 + }, + { + "epoch": 0.61, + "grad_norm": 0.6666608715251021, + "learning_rate": 3.4350504800631775e-06, + "loss": 0.5037, + "step": 4942 + }, + { + "epoch": 0.61, + "grad_norm": 1.542889725936689, + "learning_rate": 3.433141862994295e-06, + "loss": 0.5429, + "step": 4943 + }, + { + "epoch": 0.61, + "grad_norm": 0.7142075551377389, + "learning_rate": 3.431233499065145e-06, + "loss": 0.4819, + "step": 4944 + }, + { + "epoch": 0.61, + "grad_norm": 1.5725999569088291, + "learning_rate": 3.4293253885840395e-06, + "loss": 0.5112, + "step": 4945 + }, + { + "epoch": 0.61, + "grad_norm": 1.3023802101493411, + "learning_rate": 3.427417531859251e-06, + "loss": 0.4196, + "step": 4946 + }, + { + "epoch": 0.61, + "grad_norm": 0.6768819589739452, + "learning_rate": 3.42550992919901e-06, + "loss": 0.5163, + "step": 4947 + }, + { + "epoch": 0.61, + "grad_norm": 0.7029471252807711, + "learning_rate": 3.4236025809115076e-06, + "loss": 0.4865, + "step": 4948 + }, + { + "epoch": 0.61, + "grad_norm": 1.3768694181253318, + "learning_rate": 3.4216954873048912e-06, + "loss": 0.5202, + "step": 4949 + }, + { + "epoch": 0.61, + "grad_norm": 1.9456364823131251, + "learning_rate": 3.41978864868727e-06, + "loss": 0.5039, + "step": 4950 + }, + { + "epoch": 0.61, + "grad_norm": 1.7185901326118083, + "learning_rate": 3.4178820653667085e-06, + "loss": 0.4309, + "step": 4951 + }, + { + "epoch": 0.61, + "grad_norm": 1.518680620525008, + "learning_rate": 3.415975737651234e-06, + "loss": 0.5213, + "step": 4952 + }, + { + "epoch": 0.61, + "grad_norm": 1.153241301832152, + "learning_rate": 3.4140696658488304e-06, + "loss": 0.4299, + "step": 4953 + }, + { + "epoch": 0.61, + "grad_norm": 1.5080971432334338, + "learning_rate": 3.4121638502674397e-06, + "loss": 0.552, + "step": 4954 + }, + { + "epoch": 0.61, + "grad_norm": 1.3662961579017965, + "learning_rate": 3.4102582912149636e-06, + "loss": 0.5008, + "step": 4955 + }, + { + "epoch": 0.62, + "grad_norm": 1.5083284681008178, + "learning_rate": 3.4083529889992617e-06, + "loss": 0.4877, + "step": 4956 + }, + { + "epoch": 0.62, + "grad_norm": 1.6526344945720335, + "learning_rate": 3.406447943928152e-06, + "loss": 0.4756, + "step": 4957 + }, + { + "epoch": 0.62, + "grad_norm": 0.6386548943776154, + "learning_rate": 3.4045431563094123e-06, + "loss": 0.4902, + "step": 4958 + }, + { + "epoch": 0.62, + "grad_norm": 1.4213765901889195, + "learning_rate": 3.402638626450775e-06, + "loss": 0.486, + "step": 4959 + }, + { + "epoch": 0.62, + "grad_norm": 4.848591393086902, + "learning_rate": 3.4007343546599384e-06, + "loss": 0.4606, + "step": 4960 + }, + { + "epoch": 0.62, + "grad_norm": 1.4880124777484838, + "learning_rate": 3.3988303412445518e-06, + "loss": 0.4948, + "step": 4961 + }, + { + "epoch": 0.62, + "grad_norm": 1.277388551560406, + "learning_rate": 3.3969265865122247e-06, + "loss": 0.468, + "step": 4962 + }, + { + "epoch": 0.62, + "grad_norm": 1.2436123249223645, + "learning_rate": 3.395023090770526e-06, + "loss": 0.4636, + "step": 4963 + }, + { + "epoch": 0.62, + "grad_norm": 1.4514812397300938, + "learning_rate": 3.3931198543269835e-06, + "loss": 0.5198, + "step": 4964 + }, + { + "epoch": 0.62, + "grad_norm": 1.6008351074207805, + "learning_rate": 3.3912168774890797e-06, + "loss": 0.513, + "step": 4965 + }, + { + "epoch": 0.62, + "grad_norm": 1.6545897409823502, + "learning_rate": 3.3893141605642586e-06, + "loss": 0.5582, + "step": 4966 + }, + { + "epoch": 0.62, + "grad_norm": 1.4886116131312268, + "learning_rate": 3.3874117038599186e-06, + "loss": 0.516, + "step": 4967 + }, + { + "epoch": 0.62, + "grad_norm": 1.60481546255909, + "learning_rate": 3.3855095076834216e-06, + "loss": 0.4751, + "step": 4968 + }, + { + "epoch": 0.62, + "grad_norm": 1.5747293988329985, + "learning_rate": 3.3836075723420836e-06, + "loss": 0.544, + "step": 4969 + }, + { + "epoch": 0.62, + "grad_norm": 1.5765305986762645, + "learning_rate": 3.3817058981431784e-06, + "loss": 0.5053, + "step": 4970 + }, + { + "epoch": 0.62, + "grad_norm": 1.5653479677087112, + "learning_rate": 3.3798044853939375e-06, + "loss": 0.5167, + "step": 4971 + }, + { + "epoch": 0.62, + "grad_norm": 1.6885810336216072, + "learning_rate": 3.3779033344015515e-06, + "loss": 0.5074, + "step": 4972 + }, + { + "epoch": 0.62, + "grad_norm": 1.565508621884468, + "learning_rate": 3.3760024454731677e-06, + "loss": 0.5037, + "step": 4973 + }, + { + "epoch": 0.62, + "grad_norm": 1.588896209724684, + "learning_rate": 3.374101818915892e-06, + "loss": 0.5485, + "step": 4974 + }, + { + "epoch": 0.62, + "grad_norm": 1.5535713099568533, + "learning_rate": 3.372201455036787e-06, + "loss": 0.4844, + "step": 4975 + }, + { + "epoch": 0.62, + "grad_norm": 1.6389578924823094, + "learning_rate": 3.370301354142871e-06, + "loss": 0.5083, + "step": 4976 + }, + { + "epoch": 0.62, + "grad_norm": 1.416294243738421, + "learning_rate": 3.3684015165411264e-06, + "loss": 0.4844, + "step": 4977 + }, + { + "epoch": 0.62, + "grad_norm": 1.1980295333717508, + "learning_rate": 3.366501942538487e-06, + "loss": 0.5276, + "step": 4978 + }, + { + "epoch": 0.62, + "grad_norm": 1.2092080604491835, + "learning_rate": 3.3646026324418456e-06, + "loss": 0.4384, + "step": 4979 + }, + { + "epoch": 0.62, + "grad_norm": 2.073980580745277, + "learning_rate": 3.3627035865580525e-06, + "loss": 0.5405, + "step": 4980 + }, + { + "epoch": 0.62, + "grad_norm": 3.6017686291826134, + "learning_rate": 3.3608048051939147e-06, + "loss": 0.5, + "step": 4981 + }, + { + "epoch": 0.62, + "grad_norm": 1.3082842776433188, + "learning_rate": 3.358906288656197e-06, + "loss": 0.4998, + "step": 4982 + }, + { + "epoch": 0.62, + "grad_norm": 1.4383088949447156, + "learning_rate": 3.3570080372516213e-06, + "loss": 0.5247, + "step": 4983 + }, + { + "epoch": 0.62, + "grad_norm": 1.3887107654466078, + "learning_rate": 3.3551100512868663e-06, + "loss": 0.5017, + "step": 4984 + }, + { + "epoch": 0.62, + "grad_norm": 1.38804570646994, + "learning_rate": 3.3532123310685706e-06, + "loss": 0.5464, + "step": 4985 + }, + { + "epoch": 0.62, + "grad_norm": 1.4133582163794047, + "learning_rate": 3.3513148769033265e-06, + "loss": 0.4914, + "step": 4986 + }, + { + "epoch": 0.62, + "grad_norm": 1.5795226928379709, + "learning_rate": 3.3494176890976847e-06, + "loss": 0.4805, + "step": 4987 + }, + { + "epoch": 0.62, + "grad_norm": 1.415716935901781, + "learning_rate": 3.3475207679581524e-06, + "loss": 0.5058, + "step": 4988 + }, + { + "epoch": 0.62, + "grad_norm": 1.7829808944908172, + "learning_rate": 3.3456241137911947e-06, + "loss": 0.4917, + "step": 4989 + }, + { + "epoch": 0.62, + "grad_norm": 2.5579286305124613, + "learning_rate": 3.343727726903231e-06, + "loss": 0.4855, + "step": 4990 + }, + { + "epoch": 0.62, + "grad_norm": 1.9304590658531278, + "learning_rate": 3.3418316076006394e-06, + "loss": 0.4925, + "step": 4991 + }, + { + "epoch": 0.62, + "grad_norm": 1.6227599018995744, + "learning_rate": 3.3399357561897545e-06, + "loss": 0.5281, + "step": 4992 + }, + { + "epoch": 0.62, + "grad_norm": 1.677743562278396, + "learning_rate": 3.33804017297687e-06, + "loss": 0.4952, + "step": 4993 + }, + { + "epoch": 0.62, + "grad_norm": 1.4807846872178518, + "learning_rate": 3.3361448582682333e-06, + "loss": 0.4668, + "step": 4994 + }, + { + "epoch": 0.62, + "grad_norm": 1.526223315842072, + "learning_rate": 3.3342498123700484e-06, + "loss": 0.4895, + "step": 4995 + }, + { + "epoch": 0.62, + "grad_norm": 2.0357225736417877, + "learning_rate": 3.3323550355884767e-06, + "loss": 0.5361, + "step": 4996 + }, + { + "epoch": 0.62, + "grad_norm": 1.5860862144050736, + "learning_rate": 3.3304605282296367e-06, + "loss": 0.4871, + "step": 4997 + }, + { + "epoch": 0.62, + "grad_norm": 1.6424354224213873, + "learning_rate": 3.328566290599602e-06, + "loss": 0.5181, + "step": 4998 + }, + { + "epoch": 0.62, + "grad_norm": 1.648090373847227, + "learning_rate": 3.326672323004405e-06, + "loss": 0.5208, + "step": 4999 + }, + { + "epoch": 0.62, + "grad_norm": 2.248674939834039, + "learning_rate": 3.324778625750028e-06, + "loss": 0.5155, + "step": 5000 + }, + { + "epoch": 0.62, + "grad_norm": 1.2753266896871043, + "learning_rate": 3.3228851991424203e-06, + "loss": 0.4403, + "step": 5001 + }, + { + "epoch": 0.62, + "grad_norm": 1.464593760439206, + "learning_rate": 3.32099204348748e-06, + "loss": 0.4549, + "step": 5002 + }, + { + "epoch": 0.62, + "grad_norm": 2.000264604751063, + "learning_rate": 3.319099159091062e-06, + "loss": 0.4919, + "step": 5003 + }, + { + "epoch": 0.62, + "grad_norm": 2.8468860351733025, + "learning_rate": 3.3172065462589786e-06, + "loss": 0.5091, + "step": 5004 + }, + { + "epoch": 0.62, + "grad_norm": 1.5417003575190913, + "learning_rate": 3.315314205296999e-06, + "loss": 0.5152, + "step": 5005 + }, + { + "epoch": 0.62, + "grad_norm": 1.4809068825565626, + "learning_rate": 3.3134221365108466e-06, + "loss": 0.509, + "step": 5006 + }, + { + "epoch": 0.62, + "grad_norm": 1.3551581168063283, + "learning_rate": 3.311530340206202e-06, + "loss": 0.5268, + "step": 5007 + }, + { + "epoch": 0.62, + "grad_norm": 1.4109766464130555, + "learning_rate": 3.3096388166887007e-06, + "loss": 0.5166, + "step": 5008 + }, + { + "epoch": 0.62, + "grad_norm": 1.3582712933874206, + "learning_rate": 3.307747566263937e-06, + "loss": 0.5598, + "step": 5009 + }, + { + "epoch": 0.62, + "grad_norm": 1.4505331602252791, + "learning_rate": 3.3058565892374584e-06, + "loss": 0.5152, + "step": 5010 + }, + { + "epoch": 0.62, + "grad_norm": 1.819511271452722, + "learning_rate": 3.3039658859147683e-06, + "loss": 0.4617, + "step": 5011 + }, + { + "epoch": 0.62, + "grad_norm": 1.710710478621651, + "learning_rate": 3.3020754566013256e-06, + "loss": 0.521, + "step": 5012 + }, + { + "epoch": 0.62, + "grad_norm": 1.821480222707406, + "learning_rate": 3.300185301602549e-06, + "loss": 0.5599, + "step": 5013 + }, + { + "epoch": 0.62, + "grad_norm": 2.206968709126485, + "learning_rate": 3.2982954212238056e-06, + "loss": 0.5515, + "step": 5014 + }, + { + "epoch": 0.62, + "grad_norm": 1.397308898229944, + "learning_rate": 3.2964058157704247e-06, + "loss": 0.4342, + "step": 5015 + }, + { + "epoch": 0.62, + "grad_norm": 1.333446236835325, + "learning_rate": 3.2945164855476885e-06, + "loss": 0.4645, + "step": 5016 + }, + { + "epoch": 0.62, + "grad_norm": 1.8032287899771318, + "learning_rate": 3.292627430860833e-06, + "loss": 0.4875, + "step": 5017 + }, + { + "epoch": 0.62, + "grad_norm": 1.5778051329605842, + "learning_rate": 3.2907386520150553e-06, + "loss": 0.4935, + "step": 5018 + }, + { + "epoch": 0.62, + "grad_norm": 1.4134204464925468, + "learning_rate": 3.288850149315501e-06, + "loss": 0.5004, + "step": 5019 + }, + { + "epoch": 0.62, + "grad_norm": 1.3615187047308002, + "learning_rate": 3.2869619230672765e-06, + "loss": 0.5062, + "step": 5020 + }, + { + "epoch": 0.62, + "grad_norm": 1.4682549324756706, + "learning_rate": 3.28507397357544e-06, + "loss": 0.4583, + "step": 5021 + }, + { + "epoch": 0.62, + "grad_norm": 1.5559248236760226, + "learning_rate": 3.2831863011450073e-06, + "loss": 0.4816, + "step": 5022 + }, + { + "epoch": 0.62, + "grad_norm": 2.475983413954976, + "learning_rate": 3.2812989060809474e-06, + "loss": 0.5502, + "step": 5023 + }, + { + "epoch": 0.62, + "grad_norm": 1.3817847332999171, + "learning_rate": 3.279411788688186e-06, + "loss": 0.4537, + "step": 5024 + }, + { + "epoch": 0.62, + "grad_norm": 1.4282415662444998, + "learning_rate": 3.2775249492716034e-06, + "loss": 0.4906, + "step": 5025 + }, + { + "epoch": 0.62, + "grad_norm": 1.6435460317787725, + "learning_rate": 3.275638388136036e-06, + "loss": 0.5215, + "step": 5026 + }, + { + "epoch": 0.62, + "grad_norm": 1.6332556687334305, + "learning_rate": 3.2737521055862744e-06, + "loss": 0.501, + "step": 5027 + }, + { + "epoch": 0.62, + "grad_norm": 1.655276725205797, + "learning_rate": 3.2718661019270624e-06, + "loss": 0.5005, + "step": 5028 + }, + { + "epoch": 0.62, + "grad_norm": 1.3042666406087373, + "learning_rate": 3.269980377463103e-06, + "loss": 0.472, + "step": 5029 + }, + { + "epoch": 0.62, + "grad_norm": 1.918025369784106, + "learning_rate": 3.2680949324990497e-06, + "loss": 0.5144, + "step": 5030 + }, + { + "epoch": 0.62, + "grad_norm": 1.4987614591058473, + "learning_rate": 3.2662097673395134e-06, + "loss": 0.4655, + "step": 5031 + }, + { + "epoch": 0.62, + "grad_norm": 0.6840112024608185, + "learning_rate": 3.264324882289058e-06, + "loss": 0.484, + "step": 5032 + }, + { + "epoch": 0.62, + "grad_norm": 1.361880385408784, + "learning_rate": 3.2624402776522046e-06, + "loss": 0.5415, + "step": 5033 + }, + { + "epoch": 0.62, + "grad_norm": 1.2987058792137038, + "learning_rate": 3.260555953733428e-06, + "loss": 0.4534, + "step": 5034 + }, + { + "epoch": 0.62, + "grad_norm": 1.6933632604516184, + "learning_rate": 3.2586719108371567e-06, + "loss": 0.4754, + "step": 5035 + }, + { + "epoch": 0.62, + "grad_norm": 1.621284804628328, + "learning_rate": 3.2567881492677746e-06, + "loss": 0.4625, + "step": 5036 + }, + { + "epoch": 0.63, + "grad_norm": 1.4867385124248171, + "learning_rate": 3.2549046693296198e-06, + "loss": 0.4758, + "step": 5037 + }, + { + "epoch": 0.63, + "grad_norm": 1.4309626921575582, + "learning_rate": 3.2530214713269853e-06, + "loss": 0.463, + "step": 5038 + }, + { + "epoch": 0.63, + "grad_norm": 1.3982991375504024, + "learning_rate": 3.251138555564118e-06, + "loss": 0.5258, + "step": 5039 + }, + { + "epoch": 0.63, + "grad_norm": 2.8473583948807533, + "learning_rate": 3.2492559223452192e-06, + "loss": 0.5246, + "step": 5040 + }, + { + "epoch": 0.63, + "grad_norm": 1.6653642314561725, + "learning_rate": 3.247373571974445e-06, + "loss": 0.4932, + "step": 5041 + }, + { + "epoch": 0.63, + "grad_norm": 1.430380196755407, + "learning_rate": 3.2454915047559064e-06, + "loss": 0.5089, + "step": 5042 + }, + { + "epoch": 0.63, + "grad_norm": 1.314125842643514, + "learning_rate": 3.2436097209936678e-06, + "loss": 0.4556, + "step": 5043 + }, + { + "epoch": 0.63, + "grad_norm": 1.8527143155929457, + "learning_rate": 3.241728220991748e-06, + "loss": 0.4847, + "step": 5044 + }, + { + "epoch": 0.63, + "grad_norm": 1.7999529352552075, + "learning_rate": 3.2398470050541187e-06, + "loss": 0.5358, + "step": 5045 + }, + { + "epoch": 0.63, + "grad_norm": 1.7267699225161, + "learning_rate": 3.2379660734847085e-06, + "loss": 0.5211, + "step": 5046 + }, + { + "epoch": 0.63, + "grad_norm": 1.6039494417327065, + "learning_rate": 3.2360854265873975e-06, + "loss": 0.4832, + "step": 5047 + }, + { + "epoch": 0.63, + "grad_norm": 1.6272571061404455, + "learning_rate": 3.234205064666021e-06, + "loss": 0.5201, + "step": 5048 + }, + { + "epoch": 0.63, + "grad_norm": 1.5485993016422799, + "learning_rate": 3.232324988024368e-06, + "loss": 0.4914, + "step": 5049 + }, + { + "epoch": 0.63, + "grad_norm": 1.6654638703390399, + "learning_rate": 3.230445196966181e-06, + "loss": 0.5096, + "step": 5050 + }, + { + "epoch": 0.63, + "grad_norm": 0.7206764273073127, + "learning_rate": 3.228565691795158e-06, + "loss": 0.4857, + "step": 5051 + }, + { + "epoch": 0.63, + "grad_norm": 1.6704633349447144, + "learning_rate": 3.226686472814948e-06, + "loss": 0.5206, + "step": 5052 + }, + { + "epoch": 0.63, + "grad_norm": 1.6874928554487323, + "learning_rate": 3.2248075403291573e-06, + "loss": 0.4941, + "step": 5053 + }, + { + "epoch": 0.63, + "grad_norm": 1.448843695999027, + "learning_rate": 3.222928894641343e-06, + "loss": 0.4932, + "step": 5054 + }, + { + "epoch": 0.63, + "grad_norm": 1.9136320642533393, + "learning_rate": 3.2210505360550157e-06, + "loss": 0.5233, + "step": 5055 + }, + { + "epoch": 0.63, + "grad_norm": 1.4908347517455087, + "learning_rate": 3.2191724648736434e-06, + "loss": 0.4807, + "step": 5056 + }, + { + "epoch": 0.63, + "grad_norm": 1.2934989746182866, + "learning_rate": 3.217294681400643e-06, + "loss": 0.4955, + "step": 5057 + }, + { + "epoch": 0.63, + "grad_norm": 1.580504193882281, + "learning_rate": 3.2154171859393847e-06, + "loss": 0.4419, + "step": 5058 + }, + { + "epoch": 0.63, + "grad_norm": 1.6964366519154959, + "learning_rate": 3.213539978793201e-06, + "loss": 0.4858, + "step": 5059 + }, + { + "epoch": 0.63, + "grad_norm": 1.5524515714090368, + "learning_rate": 3.2116630602653665e-06, + "loss": 0.475, + "step": 5060 + }, + { + "epoch": 0.63, + "grad_norm": 1.167181040290641, + "learning_rate": 3.2097864306591143e-06, + "loss": 0.4805, + "step": 5061 + }, + { + "epoch": 0.63, + "grad_norm": 1.5429939849232062, + "learning_rate": 3.2079100902776318e-06, + "loss": 0.5109, + "step": 5062 + }, + { + "epoch": 0.63, + "grad_norm": 1.4068352002330462, + "learning_rate": 3.2060340394240567e-06, + "loss": 0.4986, + "step": 5063 + }, + { + "epoch": 0.63, + "grad_norm": 1.4641161058301413, + "learning_rate": 3.204158278401483e-06, + "loss": 0.4867, + "step": 5064 + }, + { + "epoch": 0.63, + "grad_norm": 1.5267975713122197, + "learning_rate": 3.2022828075129553e-06, + "loss": 0.5088, + "step": 5065 + }, + { + "epoch": 0.63, + "grad_norm": 1.4808147883075506, + "learning_rate": 3.2004076270614714e-06, + "loss": 0.4332, + "step": 5066 + }, + { + "epoch": 0.63, + "grad_norm": 1.6800443743499096, + "learning_rate": 3.1985327373499864e-06, + "loss": 0.4791, + "step": 5067 + }, + { + "epoch": 0.63, + "grad_norm": 2.136655646383344, + "learning_rate": 3.196658138681404e-06, + "loss": 0.5061, + "step": 5068 + }, + { + "epoch": 0.63, + "grad_norm": 1.450993950615321, + "learning_rate": 3.1947838313585823e-06, + "loss": 0.5267, + "step": 5069 + }, + { + "epoch": 0.63, + "grad_norm": 1.5025753880323134, + "learning_rate": 3.1929098156843307e-06, + "loss": 0.5506, + "step": 5070 + }, + { + "epoch": 0.63, + "grad_norm": 1.525847759768212, + "learning_rate": 3.1910360919614135e-06, + "loss": 0.4512, + "step": 5071 + }, + { + "epoch": 0.63, + "grad_norm": 1.4025262095606097, + "learning_rate": 3.189162660492548e-06, + "loss": 0.4771, + "step": 5072 + }, + { + "epoch": 0.63, + "grad_norm": 1.5297814691298464, + "learning_rate": 3.1872895215804035e-06, + "loss": 0.5194, + "step": 5073 + }, + { + "epoch": 0.63, + "grad_norm": 0.647810789148842, + "learning_rate": 3.1854166755275982e-06, + "loss": 0.4734, + "step": 5074 + }, + { + "epoch": 0.63, + "grad_norm": 2.0943659345513552, + "learning_rate": 3.1835441226367137e-06, + "loss": 0.5102, + "step": 5075 + }, + { + "epoch": 0.63, + "grad_norm": 1.4940861348841228, + "learning_rate": 3.181671863210274e-06, + "loss": 0.5145, + "step": 5076 + }, + { + "epoch": 0.63, + "grad_norm": 1.642438898262156, + "learning_rate": 3.1797998975507594e-06, + "loss": 0.5052, + "step": 5077 + }, + { + "epoch": 0.63, + "grad_norm": 0.7155625181979648, + "learning_rate": 3.1779282259606026e-06, + "loss": 0.5192, + "step": 5078 + }, + { + "epoch": 0.63, + "grad_norm": 1.4464801924254893, + "learning_rate": 3.1760568487421873e-06, + "loss": 0.5187, + "step": 5079 + }, + { + "epoch": 0.63, + "grad_norm": 1.2066580365905597, + "learning_rate": 3.1741857661978528e-06, + "loss": 0.4903, + "step": 5080 + }, + { + "epoch": 0.63, + "grad_norm": 1.528292843093238, + "learning_rate": 3.1723149786298867e-06, + "loss": 0.5126, + "step": 5081 + }, + { + "epoch": 0.63, + "grad_norm": 1.5730967438515793, + "learning_rate": 3.1704444863405314e-06, + "loss": 0.4854, + "step": 5082 + }, + { + "epoch": 0.63, + "grad_norm": 1.4127354692919383, + "learning_rate": 3.1685742896319847e-06, + "loss": 0.5259, + "step": 5083 + }, + { + "epoch": 0.63, + "grad_norm": 1.5901344549251768, + "learning_rate": 3.1667043888063914e-06, + "loss": 0.5187, + "step": 5084 + }, + { + "epoch": 0.63, + "grad_norm": 1.510968311326373, + "learning_rate": 3.16483478416585e-06, + "loss": 0.5035, + "step": 5085 + }, + { + "epoch": 0.63, + "grad_norm": 2.103291137686788, + "learning_rate": 3.1629654760124117e-06, + "loss": 0.4973, + "step": 5086 + }, + { + "epoch": 0.63, + "grad_norm": 1.362060770332344, + "learning_rate": 3.1610964646480806e-06, + "loss": 0.4657, + "step": 5087 + }, + { + "epoch": 0.63, + "grad_norm": 1.3181780493491968, + "learning_rate": 3.159227750374812e-06, + "loss": 0.4677, + "step": 5088 + }, + { + "epoch": 0.63, + "grad_norm": 1.6546426254608904, + "learning_rate": 3.157359333494511e-06, + "loss": 0.5134, + "step": 5089 + }, + { + "epoch": 0.63, + "grad_norm": 1.43936413406887, + "learning_rate": 3.155491214309039e-06, + "loss": 0.4698, + "step": 5090 + }, + { + "epoch": 0.63, + "grad_norm": 1.2386937886476648, + "learning_rate": 3.153623393120203e-06, + "loss": 0.462, + "step": 5091 + }, + { + "epoch": 0.63, + "grad_norm": 1.4513888189237607, + "learning_rate": 3.1517558702297724e-06, + "loss": 0.4814, + "step": 5092 + }, + { + "epoch": 0.63, + "grad_norm": 1.3730832063641922, + "learning_rate": 3.1498886459394585e-06, + "loss": 0.4829, + "step": 5093 + }, + { + "epoch": 0.63, + "grad_norm": 1.4175167888787246, + "learning_rate": 3.1480217205509282e-06, + "loss": 0.5328, + "step": 5094 + }, + { + "epoch": 0.63, + "grad_norm": 1.8484807418961515, + "learning_rate": 3.1461550943657996e-06, + "loss": 0.4671, + "step": 5095 + }, + { + "epoch": 0.63, + "grad_norm": 1.7497109505535595, + "learning_rate": 3.144288767685643e-06, + "loss": 0.5283, + "step": 5096 + }, + { + "epoch": 0.63, + "grad_norm": 1.3495861137244243, + "learning_rate": 3.1424227408119803e-06, + "loss": 0.5009, + "step": 5097 + }, + { + "epoch": 0.63, + "grad_norm": 1.373311925624968, + "learning_rate": 3.1405570140462833e-06, + "loss": 0.4901, + "step": 5098 + }, + { + "epoch": 0.63, + "grad_norm": 1.2854736848654358, + "learning_rate": 3.138691587689975e-06, + "loss": 0.4316, + "step": 5099 + }, + { + "epoch": 0.63, + "grad_norm": 1.506737083470091, + "learning_rate": 3.1368264620444356e-06, + "loss": 0.4619, + "step": 5100 + }, + { + "epoch": 0.63, + "grad_norm": 2.885061084921074, + "learning_rate": 3.1349616374109903e-06, + "loss": 0.5372, + "step": 5101 + }, + { + "epoch": 0.63, + "grad_norm": 1.4289873958250099, + "learning_rate": 3.1330971140909184e-06, + "loss": 0.4549, + "step": 5102 + }, + { + "epoch": 0.63, + "grad_norm": 3.3060382248016653, + "learning_rate": 3.131232892385451e-06, + "loss": 0.5197, + "step": 5103 + }, + { + "epoch": 0.63, + "grad_norm": 2.0688664902976086, + "learning_rate": 3.1293689725957667e-06, + "loss": 0.5058, + "step": 5104 + }, + { + "epoch": 0.63, + "grad_norm": 1.829403295529987, + "learning_rate": 3.1275053550230005e-06, + "loss": 0.4975, + "step": 5105 + }, + { + "epoch": 0.63, + "grad_norm": 2.430231383079677, + "learning_rate": 3.1256420399682365e-06, + "loss": 0.4455, + "step": 5106 + }, + { + "epoch": 0.63, + "grad_norm": 1.9110115547985471, + "learning_rate": 3.1237790277325076e-06, + "loss": 0.4466, + "step": 5107 + }, + { + "epoch": 0.63, + "grad_norm": 1.3324536100635709, + "learning_rate": 3.121916318616801e-06, + "loss": 0.5126, + "step": 5108 + }, + { + "epoch": 0.63, + "grad_norm": 1.3753593719641783, + "learning_rate": 3.1200539129220548e-06, + "loss": 0.5434, + "step": 5109 + }, + { + "epoch": 0.63, + "grad_norm": 1.3814478368048255, + "learning_rate": 3.118191810949156e-06, + "loss": 0.5164, + "step": 5110 + }, + { + "epoch": 0.63, + "grad_norm": 2.9334061670531075, + "learning_rate": 3.1163300129989434e-06, + "loss": 0.5299, + "step": 5111 + }, + { + "epoch": 0.63, + "grad_norm": 1.300266009222235, + "learning_rate": 3.114468519372207e-06, + "loss": 0.5049, + "step": 5112 + }, + { + "epoch": 0.63, + "grad_norm": 1.3880272247312915, + "learning_rate": 3.1126073303696873e-06, + "loss": 0.4898, + "step": 5113 + }, + { + "epoch": 0.63, + "grad_norm": 2.3431655578387383, + "learning_rate": 3.1107464462920767e-06, + "loss": 0.5187, + "step": 5114 + }, + { + "epoch": 0.63, + "grad_norm": 1.3546211679493592, + "learning_rate": 3.108885867440015e-06, + "loss": 0.531, + "step": 5115 + }, + { + "epoch": 0.63, + "grad_norm": 1.371488891276787, + "learning_rate": 3.1070255941140987e-06, + "loss": 0.4707, + "step": 5116 + }, + { + "epoch": 0.64, + "grad_norm": 1.3519675577207597, + "learning_rate": 3.105165626614869e-06, + "loss": 0.5278, + "step": 5117 + }, + { + "epoch": 0.64, + "grad_norm": 1.4497383499852987, + "learning_rate": 3.103305965242821e-06, + "loss": 0.461, + "step": 5118 + }, + { + "epoch": 0.64, + "grad_norm": 1.6053194736104737, + "learning_rate": 3.1014466102983986e-06, + "loss": 0.4781, + "step": 5119 + }, + { + "epoch": 0.64, + "grad_norm": 0.649216465541498, + "learning_rate": 3.099587562081997e-06, + "loss": 0.4484, + "step": 5120 + }, + { + "epoch": 0.64, + "grad_norm": 1.7489882471719203, + "learning_rate": 3.0977288208939627e-06, + "loss": 0.4445, + "step": 5121 + }, + { + "epoch": 0.64, + "grad_norm": 1.7986665377692794, + "learning_rate": 3.09587038703459e-06, + "loss": 0.4882, + "step": 5122 + }, + { + "epoch": 0.64, + "grad_norm": 1.3269475764699166, + "learning_rate": 3.094012260804127e-06, + "loss": 0.4894, + "step": 5123 + }, + { + "epoch": 0.64, + "grad_norm": 1.3517778780960439, + "learning_rate": 3.0921544425027695e-06, + "loss": 0.5173, + "step": 5124 + }, + { + "epoch": 0.64, + "grad_norm": 1.334966564160536, + "learning_rate": 3.0902969324306643e-06, + "loss": 0.4906, + "step": 5125 + }, + { + "epoch": 0.64, + "grad_norm": 1.6757784490718937, + "learning_rate": 3.0884397308879098e-06, + "loss": 0.5075, + "step": 5126 + }, + { + "epoch": 0.64, + "grad_norm": 1.6295839395714828, + "learning_rate": 3.0865828381745515e-06, + "loss": 0.4894, + "step": 5127 + }, + { + "epoch": 0.64, + "grad_norm": 1.5048313436145033, + "learning_rate": 3.0847262545905882e-06, + "loss": 0.5114, + "step": 5128 + }, + { + "epoch": 0.64, + "grad_norm": 1.4335249322231058, + "learning_rate": 3.0828699804359663e-06, + "loss": 0.4923, + "step": 5129 + }, + { + "epoch": 0.64, + "grad_norm": 1.3866191188244976, + "learning_rate": 3.081014016010584e-06, + "loss": 0.5375, + "step": 5130 + }, + { + "epoch": 0.64, + "grad_norm": 1.4240045872604474, + "learning_rate": 3.0791583616142883e-06, + "loss": 0.4682, + "step": 5131 + }, + { + "epoch": 0.64, + "grad_norm": 1.161237880252194, + "learning_rate": 3.0773030175468754e-06, + "loss": 0.4879, + "step": 5132 + }, + { + "epoch": 0.64, + "grad_norm": 0.7629290611916298, + "learning_rate": 3.0754479841080943e-06, + "loss": 0.4959, + "step": 5133 + }, + { + "epoch": 0.64, + "grad_norm": 1.4955674586673207, + "learning_rate": 3.0735932615976416e-06, + "loss": 0.5465, + "step": 5134 + }, + { + "epoch": 0.64, + "grad_norm": 1.3894155976672427, + "learning_rate": 3.071738850315164e-06, + "loss": 0.4957, + "step": 5135 + }, + { + "epoch": 0.64, + "grad_norm": 1.442652357430899, + "learning_rate": 3.0698847505602576e-06, + "loss": 0.5341, + "step": 5136 + }, + { + "epoch": 0.64, + "grad_norm": 1.7858544192469936, + "learning_rate": 3.0680309626324685e-06, + "loss": 0.5084, + "step": 5137 + }, + { + "epoch": 0.64, + "grad_norm": 1.2466849264870778, + "learning_rate": 3.0661774868312928e-06, + "loss": 0.5389, + "step": 5138 + }, + { + "epoch": 0.64, + "grad_norm": 1.6570292869455046, + "learning_rate": 3.064324323456176e-06, + "loss": 0.5112, + "step": 5139 + }, + { + "epoch": 0.64, + "grad_norm": 1.3190113984903318, + "learning_rate": 3.0624714728065106e-06, + "loss": 0.4613, + "step": 5140 + }, + { + "epoch": 0.64, + "grad_norm": 1.4251908409537135, + "learning_rate": 3.060618935181645e-06, + "loss": 0.4501, + "step": 5141 + }, + { + "epoch": 0.64, + "grad_norm": 1.4261908907765468, + "learning_rate": 3.0587667108808706e-06, + "loss": 0.4794, + "step": 5142 + }, + { + "epoch": 0.64, + "grad_norm": 1.536541083783513, + "learning_rate": 3.056914800203431e-06, + "loss": 0.516, + "step": 5143 + }, + { + "epoch": 0.64, + "grad_norm": 2.2948676856612353, + "learning_rate": 3.0550632034485186e-06, + "loss": 0.495, + "step": 5144 + }, + { + "epoch": 0.64, + "grad_norm": 1.949593745483904, + "learning_rate": 3.053211920915275e-06, + "loss": 0.4635, + "step": 5145 + }, + { + "epoch": 0.64, + "grad_norm": 1.4042463519036232, + "learning_rate": 3.0513609529027914e-06, + "loss": 0.4489, + "step": 5146 + }, + { + "epoch": 0.64, + "grad_norm": 1.5353044882980715, + "learning_rate": 3.049510299710108e-06, + "loss": 0.5417, + "step": 5147 + }, + { + "epoch": 0.64, + "grad_norm": 1.368567437939403, + "learning_rate": 3.0476599616362136e-06, + "loss": 0.5106, + "step": 5148 + }, + { + "epoch": 0.64, + "grad_norm": 1.602691605831889, + "learning_rate": 3.045809938980047e-06, + "loss": 0.4857, + "step": 5149 + }, + { + "epoch": 0.64, + "grad_norm": 1.4870178731861237, + "learning_rate": 3.0439602320404964e-06, + "loss": 0.4919, + "step": 5150 + }, + { + "epoch": 0.64, + "grad_norm": 1.5277486373103906, + "learning_rate": 3.0421108411163975e-06, + "loss": 0.4446, + "step": 5151 + }, + { + "epoch": 0.64, + "grad_norm": 1.7414214443891542, + "learning_rate": 3.040261766506536e-06, + "loss": 0.507, + "step": 5152 + }, + { + "epoch": 0.64, + "grad_norm": 1.7320551130735706, + "learning_rate": 3.038413008509645e-06, + "loss": 0.491, + "step": 5153 + }, + { + "epoch": 0.64, + "grad_norm": 1.5798003564239524, + "learning_rate": 3.0365645674244094e-06, + "loss": 0.4709, + "step": 5154 + }, + { + "epoch": 0.64, + "grad_norm": 1.6953870389910313, + "learning_rate": 3.03471644354946e-06, + "loss": 0.5186, + "step": 5155 + }, + { + "epoch": 0.64, + "grad_norm": 1.2829814597360416, + "learning_rate": 3.0328686371833765e-06, + "loss": 0.4842, + "step": 5156 + }, + { + "epoch": 0.64, + "grad_norm": 1.520157858202863, + "learning_rate": 3.031021148624691e-06, + "loss": 0.4392, + "step": 5157 + }, + { + "epoch": 0.64, + "grad_norm": 1.584303013751112, + "learning_rate": 3.0291739781718808e-06, + "loss": 0.5274, + "step": 5158 + }, + { + "epoch": 0.64, + "grad_norm": 1.4803637139074282, + "learning_rate": 3.0273271261233718e-06, + "loss": 0.4807, + "step": 5159 + }, + { + "epoch": 0.64, + "grad_norm": 1.5198555339306656, + "learning_rate": 3.025480592777539e-06, + "loss": 0.5198, + "step": 5160 + }, + { + "epoch": 0.64, + "grad_norm": 0.7245927757697032, + "learning_rate": 3.023634378432706e-06, + "loss": 0.4865, + "step": 5161 + }, + { + "epoch": 0.64, + "grad_norm": 1.5536795950544744, + "learning_rate": 3.021788483387146e-06, + "loss": 0.5101, + "step": 5162 + }, + { + "epoch": 0.64, + "grad_norm": 5.739620434569134, + "learning_rate": 3.019942907939079e-06, + "loss": 0.5367, + "step": 5163 + }, + { + "epoch": 0.64, + "grad_norm": 2.3241117537551723, + "learning_rate": 3.0180976523866717e-06, + "loss": 0.4565, + "step": 5164 + }, + { + "epoch": 0.64, + "grad_norm": 1.3465344224268267, + "learning_rate": 3.016252717028046e-06, + "loss": 0.4703, + "step": 5165 + }, + { + "epoch": 0.64, + "grad_norm": 1.3999767438476154, + "learning_rate": 3.0144081021612648e-06, + "loss": 0.4976, + "step": 5166 + }, + { + "epoch": 0.64, + "grad_norm": 1.3766289113380032, + "learning_rate": 3.0125638080843435e-06, + "loss": 0.4953, + "step": 5167 + }, + { + "epoch": 0.64, + "grad_norm": 1.6140554152088646, + "learning_rate": 3.0107198350952415e-06, + "loss": 0.5032, + "step": 5168 + }, + { + "epoch": 0.64, + "grad_norm": 0.6590837178206382, + "learning_rate": 3.0088761834918706e-06, + "loss": 0.5041, + "step": 5169 + }, + { + "epoch": 0.64, + "grad_norm": 2.092104546557681, + "learning_rate": 3.0070328535720884e-06, + "loss": 0.5199, + "step": 5170 + }, + { + "epoch": 0.64, + "grad_norm": 1.4407452582905922, + "learning_rate": 3.0051898456337013e-06, + "loss": 0.5068, + "step": 5171 + }, + { + "epoch": 0.64, + "grad_norm": 1.9815869758106484, + "learning_rate": 3.003347159974463e-06, + "loss": 0.5009, + "step": 5172 + }, + { + "epoch": 0.64, + "grad_norm": 1.2769554752225565, + "learning_rate": 3.001504796892074e-06, + "loss": 0.4757, + "step": 5173 + }, + { + "epoch": 0.64, + "grad_norm": 1.483988926560004, + "learning_rate": 2.9996627566841886e-06, + "loss": 0.4817, + "step": 5174 + }, + { + "epoch": 0.64, + "grad_norm": 1.3562102230857813, + "learning_rate": 2.9978210396484013e-06, + "loss": 0.4965, + "step": 5175 + }, + { + "epoch": 0.64, + "grad_norm": 1.7509090416933895, + "learning_rate": 2.99597964608226e-06, + "loss": 0.4859, + "step": 5176 + }, + { + "epoch": 0.64, + "grad_norm": 1.3193288728949342, + "learning_rate": 2.9941385762832563e-06, + "loss": 0.4898, + "step": 5177 + }, + { + "epoch": 0.64, + "grad_norm": 1.4786685504426724, + "learning_rate": 2.9922978305488317e-06, + "loss": 0.5607, + "step": 5178 + }, + { + "epoch": 0.64, + "grad_norm": 1.4479667494823674, + "learning_rate": 2.990457409176375e-06, + "loss": 0.5253, + "step": 5179 + }, + { + "epoch": 0.64, + "grad_norm": 1.4814821436756966, + "learning_rate": 2.9886173124632213e-06, + "loss": 0.4418, + "step": 5180 + }, + { + "epoch": 0.64, + "grad_norm": 0.674004839372321, + "learning_rate": 2.986777540706654e-06, + "loss": 0.4746, + "step": 5181 + }, + { + "epoch": 0.64, + "grad_norm": 1.3409942229031568, + "learning_rate": 2.984938094203908e-06, + "loss": 0.492, + "step": 5182 + }, + { + "epoch": 0.64, + "grad_norm": 1.3204528480802598, + "learning_rate": 2.98309897325216e-06, + "loss": 0.482, + "step": 5183 + }, + { + "epoch": 0.64, + "grad_norm": 2.2290584335361103, + "learning_rate": 2.9812601781485356e-06, + "loss": 0.5398, + "step": 5184 + }, + { + "epoch": 0.64, + "grad_norm": 1.4015321794285387, + "learning_rate": 2.9794217091901094e-06, + "loss": 0.5118, + "step": 5185 + }, + { + "epoch": 0.64, + "grad_norm": 2.1052379988251193, + "learning_rate": 2.9775835666739028e-06, + "loss": 0.4802, + "step": 5186 + }, + { + "epoch": 0.64, + "grad_norm": 1.4781251630529064, + "learning_rate": 2.975745750896881e-06, + "loss": 0.4244, + "step": 5187 + }, + { + "epoch": 0.64, + "grad_norm": 1.5160119248805892, + "learning_rate": 2.973908262155962e-06, + "loss": 0.4722, + "step": 5188 + }, + { + "epoch": 0.64, + "grad_norm": 2.0726917074692084, + "learning_rate": 2.9720711007480056e-06, + "loss": 0.5402, + "step": 5189 + }, + { + "epoch": 0.64, + "grad_norm": 1.8211951470508632, + "learning_rate": 2.9702342669698247e-06, + "loss": 0.4835, + "step": 5190 + }, + { + "epoch": 0.64, + "grad_norm": 3.5279731366187965, + "learning_rate": 2.968397761118175e-06, + "loss": 0.4654, + "step": 5191 + }, + { + "epoch": 0.64, + "grad_norm": 1.4769202027950719, + "learning_rate": 2.9665615834897597e-06, + "loss": 0.4981, + "step": 5192 + }, + { + "epoch": 0.64, + "grad_norm": 1.429049044873586, + "learning_rate": 2.9647257343812298e-06, + "loss": 0.4863, + "step": 5193 + }, + { + "epoch": 0.64, + "grad_norm": 1.712926248039558, + "learning_rate": 2.9628902140891823e-06, + "loss": 0.5002, + "step": 5194 + }, + { + "epoch": 0.64, + "grad_norm": 1.4039040905005618, + "learning_rate": 2.961055022910162e-06, + "loss": 0.4909, + "step": 5195 + }, + { + "epoch": 0.64, + "grad_norm": 1.5767933463615202, + "learning_rate": 2.9592201611406606e-06, + "loss": 0.5375, + "step": 5196 + }, + { + "epoch": 0.64, + "grad_norm": 1.450513560065177, + "learning_rate": 2.957385629077113e-06, + "loss": 0.4729, + "step": 5197 + }, + { + "epoch": 0.65, + "grad_norm": 1.3789791613221432, + "learning_rate": 2.955551427015909e-06, + "loss": 0.5128, + "step": 5198 + }, + { + "epoch": 0.65, + "grad_norm": 2.0296334637657703, + "learning_rate": 2.953717555253378e-06, + "loss": 0.5196, + "step": 5199 + }, + { + "epoch": 0.65, + "grad_norm": 1.4936191092661981, + "learning_rate": 2.951884014085798e-06, + "loss": 0.4448, + "step": 5200 + }, + { + "epoch": 0.65, + "grad_norm": 1.4903974201120964, + "learning_rate": 2.9500508038093932e-06, + "loss": 0.5354, + "step": 5201 + }, + { + "epoch": 0.65, + "grad_norm": 0.674069417016388, + "learning_rate": 2.9482179247203357e-06, + "loss": 0.5179, + "step": 5202 + }, + { + "epoch": 0.65, + "grad_norm": 1.3382165010372378, + "learning_rate": 2.9463853771147434e-06, + "loss": 0.5242, + "step": 5203 + }, + { + "epoch": 0.65, + "grad_norm": 1.4771001395371512, + "learning_rate": 2.94455316128868e-06, + "loss": 0.4246, + "step": 5204 + }, + { + "epoch": 0.65, + "grad_norm": 1.437699263205684, + "learning_rate": 2.942721277538154e-06, + "loss": 0.4561, + "step": 5205 + }, + { + "epoch": 0.65, + "grad_norm": 1.5707752606080958, + "learning_rate": 2.940889726159127e-06, + "loss": 0.5656, + "step": 5206 + }, + { + "epoch": 0.65, + "grad_norm": 1.3425068014198047, + "learning_rate": 2.939058507447499e-06, + "loss": 0.5571, + "step": 5207 + }, + { + "epoch": 0.65, + "grad_norm": 1.6390851225460445, + "learning_rate": 2.9372276216991204e-06, + "loss": 0.509, + "step": 5208 + }, + { + "epoch": 0.65, + "grad_norm": 1.4889246574510457, + "learning_rate": 2.9353970692097865e-06, + "loss": 0.4664, + "step": 5209 + }, + { + "epoch": 0.65, + "grad_norm": 1.421107639733984, + "learning_rate": 2.9335668502752395e-06, + "loss": 0.5479, + "step": 5210 + }, + { + "epoch": 0.65, + "grad_norm": 1.5940583579838103, + "learning_rate": 2.9317369651911677e-06, + "loss": 0.465, + "step": 5211 + }, + { + "epoch": 0.65, + "grad_norm": 1.5987081821051372, + "learning_rate": 2.9299074142532045e-06, + "loss": 0.4642, + "step": 5212 + }, + { + "epoch": 0.65, + "grad_norm": 2.2117746474994493, + "learning_rate": 2.9280781977569306e-06, + "loss": 0.4773, + "step": 5213 + }, + { + "epoch": 0.65, + "grad_norm": 2.8658518759658125, + "learning_rate": 2.9262493159978703e-06, + "loss": 0.468, + "step": 5214 + }, + { + "epoch": 0.65, + "grad_norm": 1.3225637881539518, + "learning_rate": 2.924420769271499e-06, + "loss": 0.469, + "step": 5215 + }, + { + "epoch": 0.65, + "grad_norm": 1.3203399947443093, + "learning_rate": 2.922592557873231e-06, + "loss": 0.4908, + "step": 5216 + }, + { + "epoch": 0.65, + "grad_norm": 2.328462249497007, + "learning_rate": 2.9207646820984325e-06, + "loss": 0.5081, + "step": 5217 + }, + { + "epoch": 0.65, + "grad_norm": 1.6711124618967272, + "learning_rate": 2.9189371422424123e-06, + "loss": 0.5171, + "step": 5218 + }, + { + "epoch": 0.65, + "grad_norm": 1.6230161561337815, + "learning_rate": 2.917109938600423e-06, + "loss": 0.4466, + "step": 5219 + }, + { + "epoch": 0.65, + "grad_norm": 1.3219255341465752, + "learning_rate": 2.9152830714676706e-06, + "loss": 0.4642, + "step": 5220 + }, + { + "epoch": 0.65, + "grad_norm": 1.4107900141016096, + "learning_rate": 2.9134565411392958e-06, + "loss": 0.5056, + "step": 5221 + }, + { + "epoch": 0.65, + "grad_norm": 1.4295451490426954, + "learning_rate": 2.9116303479103934e-06, + "loss": 0.5116, + "step": 5222 + }, + { + "epoch": 0.65, + "grad_norm": 1.856583146682241, + "learning_rate": 2.909804492076001e-06, + "loss": 0.493, + "step": 5223 + }, + { + "epoch": 0.65, + "grad_norm": 3.729174337478149, + "learning_rate": 2.9079789739311037e-06, + "loss": 0.5561, + "step": 5224 + }, + { + "epoch": 0.65, + "grad_norm": 0.6528515005726849, + "learning_rate": 2.906153793770626e-06, + "loss": 0.5052, + "step": 5225 + }, + { + "epoch": 0.65, + "grad_norm": 1.393222031708734, + "learning_rate": 2.904328951889447e-06, + "loss": 0.4537, + "step": 5226 + }, + { + "epoch": 0.65, + "grad_norm": 1.3073884655399801, + "learning_rate": 2.9025044485823815e-06, + "loss": 0.5334, + "step": 5227 + }, + { + "epoch": 0.65, + "grad_norm": 1.284878231455661, + "learning_rate": 2.900680284144194e-06, + "loss": 0.5229, + "step": 5228 + }, + { + "epoch": 0.65, + "grad_norm": 2.0539141190578754, + "learning_rate": 2.898856458869597e-06, + "loss": 0.4971, + "step": 5229 + }, + { + "epoch": 0.65, + "grad_norm": 1.4554782786293619, + "learning_rate": 2.897032973053241e-06, + "loss": 0.487, + "step": 5230 + }, + { + "epoch": 0.65, + "grad_norm": 1.5009432100606122, + "learning_rate": 2.895209826989733e-06, + "loss": 0.5017, + "step": 5231 + }, + { + "epoch": 0.65, + "grad_norm": 1.3348701171976673, + "learning_rate": 2.8933870209736136e-06, + "loss": 0.4668, + "step": 5232 + }, + { + "epoch": 0.65, + "grad_norm": 1.4133037167718843, + "learning_rate": 2.8915645552993756e-06, + "loss": 0.484, + "step": 5233 + }, + { + "epoch": 0.65, + "grad_norm": 1.2891795152863093, + "learning_rate": 2.889742430261452e-06, + "loss": 0.4565, + "step": 5234 + }, + { + "epoch": 0.65, + "grad_norm": 1.5537923383669643, + "learning_rate": 2.8879206461542253e-06, + "loss": 0.4766, + "step": 5235 + }, + { + "epoch": 0.65, + "grad_norm": 1.482406989146018, + "learning_rate": 2.8860992032720204e-06, + "loss": 0.479, + "step": 5236 + }, + { + "epoch": 0.65, + "grad_norm": 1.3677960555410495, + "learning_rate": 2.8842781019091037e-06, + "loss": 0.5023, + "step": 5237 + }, + { + "epoch": 0.65, + "grad_norm": 1.4134634784188358, + "learning_rate": 2.8824573423596946e-06, + "loss": 0.4854, + "step": 5238 + }, + { + "epoch": 0.65, + "grad_norm": 1.7260629179279658, + "learning_rate": 2.8806369249179513e-06, + "loss": 0.5403, + "step": 5239 + }, + { + "epoch": 0.65, + "grad_norm": 1.5421852186392826, + "learning_rate": 2.87881684987798e-06, + "loss": 0.5302, + "step": 5240 + }, + { + "epoch": 0.65, + "grad_norm": 1.7102444314625498, + "learning_rate": 2.876997117533826e-06, + "loss": 0.508, + "step": 5241 + }, + { + "epoch": 0.65, + "grad_norm": 1.350860727657689, + "learning_rate": 2.875177728179488e-06, + "loss": 0.5409, + "step": 5242 + }, + { + "epoch": 0.65, + "grad_norm": 0.6972621262407982, + "learning_rate": 2.8733586821088998e-06, + "loss": 0.5073, + "step": 5243 + }, + { + "epoch": 0.65, + "grad_norm": 1.4389568481054311, + "learning_rate": 2.871539979615948e-06, + "loss": 0.475, + "step": 5244 + }, + { + "epoch": 0.65, + "grad_norm": 1.44297013306258, + "learning_rate": 2.8697216209944585e-06, + "loss": 0.5036, + "step": 5245 + }, + { + "epoch": 0.65, + "grad_norm": 1.6421364177474709, + "learning_rate": 2.8679036065382003e-06, + "loss": 0.4985, + "step": 5246 + }, + { + "epoch": 0.65, + "grad_norm": 1.3111731981965287, + "learning_rate": 2.8660859365408934e-06, + "loss": 0.5307, + "step": 5247 + }, + { + "epoch": 0.65, + "grad_norm": 1.370112143441277, + "learning_rate": 2.8642686112961964e-06, + "loss": 0.5473, + "step": 5248 + }, + { + "epoch": 0.65, + "grad_norm": 0.6811937781641748, + "learning_rate": 2.8624516310977172e-06, + "loss": 0.4813, + "step": 5249 + }, + { + "epoch": 0.65, + "grad_norm": 1.5384833532133326, + "learning_rate": 2.860634996239001e-06, + "loss": 0.5112, + "step": 5250 + }, + { + "epoch": 0.65, + "grad_norm": 1.3948737627803693, + "learning_rate": 2.858818707013545e-06, + "loss": 0.4642, + "step": 5251 + }, + { + "epoch": 0.65, + "grad_norm": 1.2792735675727973, + "learning_rate": 2.8570027637147835e-06, + "loss": 0.5108, + "step": 5252 + }, + { + "epoch": 0.65, + "grad_norm": 1.366292951276149, + "learning_rate": 2.8551871666361e-06, + "loss": 0.5163, + "step": 5253 + }, + { + "epoch": 0.65, + "grad_norm": 1.4864720122990032, + "learning_rate": 2.8533719160708186e-06, + "loss": 0.5041, + "step": 5254 + }, + { + "epoch": 0.65, + "grad_norm": 1.570123238810715, + "learning_rate": 2.8515570123122094e-06, + "loss": 0.5027, + "step": 5255 + }, + { + "epoch": 0.65, + "grad_norm": 4.351478202009039, + "learning_rate": 2.8497424556534893e-06, + "loss": 0.4334, + "step": 5256 + }, + { + "epoch": 0.65, + "grad_norm": 1.3568254180690646, + "learning_rate": 2.84792824638781e-06, + "loss": 0.526, + "step": 5257 + }, + { + "epoch": 0.65, + "grad_norm": 1.3958890681102574, + "learning_rate": 2.8461143848082793e-06, + "loss": 0.4824, + "step": 5258 + }, + { + "epoch": 0.65, + "grad_norm": 1.56032635116221, + "learning_rate": 2.844300871207937e-06, + "loss": 0.5152, + "step": 5259 + }, + { + "epoch": 0.65, + "grad_norm": 1.4501231717596466, + "learning_rate": 2.842487705879777e-06, + "loss": 0.4875, + "step": 5260 + }, + { + "epoch": 0.65, + "grad_norm": 1.4516955521205268, + "learning_rate": 2.840674889116728e-06, + "loss": 0.492, + "step": 5261 + }, + { + "epoch": 0.65, + "grad_norm": 1.1679179628063483, + "learning_rate": 2.83886242121167e-06, + "loss": 0.4231, + "step": 5262 + }, + { + "epoch": 0.65, + "grad_norm": 1.391145838108834, + "learning_rate": 2.8370503024574192e-06, + "loss": 0.5274, + "step": 5263 + }, + { + "epoch": 0.65, + "grad_norm": 1.432913453073426, + "learning_rate": 2.835238533146741e-06, + "loss": 0.5189, + "step": 5264 + }, + { + "epoch": 0.65, + "grad_norm": 1.306301674370133, + "learning_rate": 2.8334271135723468e-06, + "loss": 0.5035, + "step": 5265 + }, + { + "epoch": 0.65, + "grad_norm": 1.6885766126397033, + "learning_rate": 2.8316160440268813e-06, + "loss": 0.5766, + "step": 5266 + }, + { + "epoch": 0.65, + "grad_norm": 1.407825610945716, + "learning_rate": 2.8298053248029434e-06, + "loss": 0.4842, + "step": 5267 + }, + { + "epoch": 0.65, + "grad_norm": 1.5167359860431884, + "learning_rate": 2.8279949561930665e-06, + "loss": 0.5149, + "step": 5268 + }, + { + "epoch": 0.65, + "grad_norm": 1.384774347864669, + "learning_rate": 2.8261849384897353e-06, + "loss": 0.4946, + "step": 5269 + }, + { + "epoch": 0.65, + "grad_norm": 1.7555210761499582, + "learning_rate": 2.8243752719853714e-06, + "loss": 0.4819, + "step": 5270 + }, + { + "epoch": 0.65, + "grad_norm": 1.27797541813699, + "learning_rate": 2.822565956972342e-06, + "loss": 0.4937, + "step": 5271 + }, + { + "epoch": 0.65, + "grad_norm": 1.917424442556268, + "learning_rate": 2.8207569937429626e-06, + "loss": 0.4998, + "step": 5272 + }, + { + "epoch": 0.65, + "grad_norm": 1.6865994988327913, + "learning_rate": 2.8189483825894813e-06, + "loss": 0.4762, + "step": 5273 + }, + { + "epoch": 0.65, + "grad_norm": 1.480227158958637, + "learning_rate": 2.8171401238041007e-06, + "loss": 0.5159, + "step": 5274 + }, + { + "epoch": 0.65, + "grad_norm": 1.3659359585215833, + "learning_rate": 2.8153322176789556e-06, + "loss": 0.4841, + "step": 5275 + }, + { + "epoch": 0.65, + "grad_norm": 1.8040659390676708, + "learning_rate": 2.813524664506133e-06, + "loss": 0.4936, + "step": 5276 + }, + { + "epoch": 0.65, + "grad_norm": 1.2706477624521206, + "learning_rate": 2.811717464577657e-06, + "loss": 0.4966, + "step": 5277 + }, + { + "epoch": 0.66, + "grad_norm": 1.493513927680801, + "learning_rate": 2.8099106181854974e-06, + "loss": 0.5047, + "step": 5278 + }, + { + "epoch": 0.66, + "grad_norm": 1.8631817433842472, + "learning_rate": 2.8081041256215654e-06, + "loss": 0.489, + "step": 5279 + }, + { + "epoch": 0.66, + "grad_norm": 1.1856851549371594, + "learning_rate": 2.8062979871777157e-06, + "loss": 0.4861, + "step": 5280 + }, + { + "epoch": 0.66, + "grad_norm": 2.471930383855249, + "learning_rate": 2.8044922031457487e-06, + "loss": 0.5283, + "step": 5281 + }, + { + "epoch": 0.66, + "grad_norm": 5.025917538098957, + "learning_rate": 2.8026867738174013e-06, + "loss": 0.5313, + "step": 5282 + }, + { + "epoch": 0.66, + "grad_norm": 1.358502126189759, + "learning_rate": 2.8008816994843592e-06, + "loss": 0.5362, + "step": 5283 + }, + { + "epoch": 0.66, + "grad_norm": 1.526702975365077, + "learning_rate": 2.7990769804382446e-06, + "loss": 0.5274, + "step": 5284 + }, + { + "epoch": 0.66, + "grad_norm": 5.7768441835174045, + "learning_rate": 2.79727261697063e-06, + "loss": 0.4959, + "step": 5285 + }, + { + "epoch": 0.66, + "grad_norm": 1.5861308351808623, + "learning_rate": 2.7954686093730216e-06, + "loss": 0.4983, + "step": 5286 + }, + { + "epoch": 0.66, + "grad_norm": 1.2829002137593568, + "learning_rate": 2.7936649579368776e-06, + "loss": 0.4543, + "step": 5287 + }, + { + "epoch": 0.66, + "grad_norm": 1.3304196406298983, + "learning_rate": 2.791861662953589e-06, + "loss": 0.4866, + "step": 5288 + }, + { + "epoch": 0.66, + "grad_norm": 1.5286558611134053, + "learning_rate": 2.790058724714496e-06, + "loss": 0.4656, + "step": 5289 + }, + { + "epoch": 0.66, + "grad_norm": 1.3558140277966517, + "learning_rate": 2.7882561435108823e-06, + "loss": 0.4728, + "step": 5290 + }, + { + "epoch": 0.66, + "grad_norm": 1.453425915868669, + "learning_rate": 2.7864539196339658e-06, + "loss": 0.4677, + "step": 5291 + }, + { + "epoch": 0.66, + "grad_norm": 1.3025199072127507, + "learning_rate": 2.784652053374915e-06, + "loss": 0.478, + "step": 5292 + }, + { + "epoch": 0.66, + "grad_norm": 1.35504074531167, + "learning_rate": 2.7828505450248343e-06, + "loss": 0.4876, + "step": 5293 + }, + { + "epoch": 0.66, + "grad_norm": 1.4178035232568393, + "learning_rate": 2.7810493948747775e-06, + "loss": 0.4972, + "step": 5294 + }, + { + "epoch": 0.66, + "grad_norm": 1.4695984310786603, + "learning_rate": 2.779248603215731e-06, + "loss": 0.5301, + "step": 5295 + }, + { + "epoch": 0.66, + "grad_norm": 1.4901468728111962, + "learning_rate": 2.777448170338632e-06, + "loss": 0.4498, + "step": 5296 + }, + { + "epoch": 0.66, + "grad_norm": 1.3709843878126562, + "learning_rate": 2.775648096534357e-06, + "loss": 0.4982, + "step": 5297 + }, + { + "epoch": 0.66, + "grad_norm": 1.6853526249118413, + "learning_rate": 2.7738483820937208e-06, + "loss": 0.4892, + "step": 5298 + }, + { + "epoch": 0.66, + "grad_norm": 1.5074452665631897, + "learning_rate": 2.7720490273074865e-06, + "loss": 0.5364, + "step": 5299 + }, + { + "epoch": 0.66, + "grad_norm": 1.555266495159016, + "learning_rate": 2.7702500324663518e-06, + "loss": 0.54, + "step": 5300 + }, + { + "epoch": 0.66, + "grad_norm": 7.48752313122983, + "learning_rate": 2.768451397860964e-06, + "loss": 0.5542, + "step": 5301 + }, + { + "epoch": 0.66, + "grad_norm": 1.351707167167519, + "learning_rate": 2.766653123781905e-06, + "loss": 0.502, + "step": 5302 + }, + { + "epoch": 0.66, + "grad_norm": 2.080333246439908, + "learning_rate": 2.7648552105197046e-06, + "loss": 0.5655, + "step": 5303 + }, + { + "epoch": 0.66, + "grad_norm": 1.567104012954525, + "learning_rate": 2.763057658364827e-06, + "loss": 0.4765, + "step": 5304 + }, + { + "epoch": 0.66, + "grad_norm": 1.3716735973388143, + "learning_rate": 2.7612604676076902e-06, + "loss": 0.5053, + "step": 5305 + }, + { + "epoch": 0.66, + "grad_norm": 1.367634803659491, + "learning_rate": 2.759463638538642e-06, + "loss": 0.4825, + "step": 5306 + }, + { + "epoch": 0.66, + "grad_norm": 1.3321793751367244, + "learning_rate": 2.757667171447973e-06, + "loss": 0.4742, + "step": 5307 + }, + { + "epoch": 0.66, + "grad_norm": 1.4559183914666705, + "learning_rate": 2.7558710666259235e-06, + "loss": 0.5147, + "step": 5308 + }, + { + "epoch": 0.66, + "grad_norm": 1.6637424420725844, + "learning_rate": 2.754075324362666e-06, + "loss": 0.5307, + "step": 5309 + }, + { + "epoch": 0.66, + "grad_norm": 1.7309577384722221, + "learning_rate": 2.7522799449483224e-06, + "loss": 0.5454, + "step": 5310 + }, + { + "epoch": 0.66, + "grad_norm": 1.7461041363532903, + "learning_rate": 2.7504849286729475e-06, + "loss": 0.5696, + "step": 5311 + }, + { + "epoch": 0.66, + "grad_norm": 1.360186930190867, + "learning_rate": 2.7486902758265445e-06, + "loss": 0.4063, + "step": 5312 + }, + { + "epoch": 0.66, + "grad_norm": 1.4725073849409913, + "learning_rate": 2.7468959866990554e-06, + "loss": 0.5528, + "step": 5313 + }, + { + "epoch": 0.66, + "grad_norm": 1.83191227764115, + "learning_rate": 2.745102061580365e-06, + "loss": 0.4732, + "step": 5314 + }, + { + "epoch": 0.66, + "grad_norm": 1.5576463111550587, + "learning_rate": 2.7433085007602955e-06, + "loss": 0.4874, + "step": 5315 + }, + { + "epoch": 0.66, + "grad_norm": 2.615574991409325, + "learning_rate": 2.7415153045286108e-06, + "loss": 0.5217, + "step": 5316 + }, + { + "epoch": 0.66, + "grad_norm": 1.7472158798068373, + "learning_rate": 2.7397224731750215e-06, + "loss": 0.5164, + "step": 5317 + }, + { + "epoch": 0.66, + "grad_norm": 1.564034329369173, + "learning_rate": 2.737930006989172e-06, + "loss": 0.5332, + "step": 5318 + }, + { + "epoch": 0.66, + "grad_norm": 1.3528228639339923, + "learning_rate": 2.7361379062606545e-06, + "loss": 0.4847, + "step": 5319 + }, + { + "epoch": 0.66, + "grad_norm": 1.3151718626427842, + "learning_rate": 2.734346171278992e-06, + "loss": 0.5129, + "step": 5320 + }, + { + "epoch": 0.66, + "grad_norm": 0.6482216627008905, + "learning_rate": 2.7325548023336645e-06, + "loss": 0.4735, + "step": 5321 + }, + { + "epoch": 0.66, + "grad_norm": 1.3814757439240166, + "learning_rate": 2.7307637997140757e-06, + "loss": 0.5166, + "step": 5322 + }, + { + "epoch": 0.66, + "grad_norm": 1.4763492860259062, + "learning_rate": 2.728973163709583e-06, + "loss": 0.4862, + "step": 5323 + }, + { + "epoch": 0.66, + "grad_norm": 2.74270202227617, + "learning_rate": 2.7271828946094753e-06, + "loss": 0.4746, + "step": 5324 + }, + { + "epoch": 0.66, + "grad_norm": 1.4641364925937064, + "learning_rate": 2.7253929927029897e-06, + "loss": 0.4816, + "step": 5325 + }, + { + "epoch": 0.66, + "grad_norm": 1.5149325381015646, + "learning_rate": 2.7236034582793e-06, + "loss": 0.448, + "step": 5326 + }, + { + "epoch": 0.66, + "grad_norm": 1.5913256590745417, + "learning_rate": 2.7218142916275174e-06, + "loss": 0.5024, + "step": 5327 + }, + { + "epoch": 0.66, + "grad_norm": 2.0427625447444466, + "learning_rate": 2.720025493036703e-06, + "loss": 0.5191, + "step": 5328 + }, + { + "epoch": 0.66, + "grad_norm": 1.3152223721438832, + "learning_rate": 2.718237062795846e-06, + "loss": 0.5157, + "step": 5329 + }, + { + "epoch": 0.66, + "grad_norm": 1.5793215264008136, + "learning_rate": 2.7164490011938915e-06, + "loss": 0.4871, + "step": 5330 + }, + { + "epoch": 0.66, + "grad_norm": 1.41395011217096, + "learning_rate": 2.714661308519711e-06, + "loss": 0.5235, + "step": 5331 + }, + { + "epoch": 0.66, + "grad_norm": 1.5978172239163189, + "learning_rate": 2.7128739850621255e-06, + "loss": 0.4837, + "step": 5332 + }, + { + "epoch": 0.66, + "grad_norm": 9.564577961403666, + "learning_rate": 2.7110870311098884e-06, + "loss": 0.527, + "step": 5333 + }, + { + "epoch": 0.66, + "grad_norm": 1.6697258115680298, + "learning_rate": 2.7093004469517027e-06, + "loss": 0.5105, + "step": 5334 + }, + { + "epoch": 0.66, + "grad_norm": 1.6120972779403429, + "learning_rate": 2.707514232876204e-06, + "loss": 0.4547, + "step": 5335 + }, + { + "epoch": 0.66, + "grad_norm": 1.7313556878552248, + "learning_rate": 2.7057283891719703e-06, + "loss": 0.5047, + "step": 5336 + }, + { + "epoch": 0.66, + "grad_norm": 1.7503742181212245, + "learning_rate": 2.703942916127521e-06, + "loss": 0.4938, + "step": 5337 + }, + { + "epoch": 0.66, + "grad_norm": 1.8591629343416418, + "learning_rate": 2.7021578140313155e-06, + "loss": 0.4805, + "step": 5338 + }, + { + "epoch": 0.66, + "grad_norm": 1.280015653078731, + "learning_rate": 2.7003730831717545e-06, + "loss": 0.4685, + "step": 5339 + }, + { + "epoch": 0.66, + "grad_norm": 1.3874985784665126, + "learning_rate": 2.6985887238371736e-06, + "loss": 0.4851, + "step": 5340 + }, + { + "epoch": 0.66, + "grad_norm": 2.4078872115531422, + "learning_rate": 2.6968047363158556e-06, + "loss": 0.4947, + "step": 5341 + }, + { + "epoch": 0.66, + "grad_norm": 1.5915183243210775, + "learning_rate": 2.6950211208960147e-06, + "loss": 0.5, + "step": 5342 + }, + { + "epoch": 0.66, + "grad_norm": 1.597352790740755, + "learning_rate": 2.693237877865814e-06, + "loss": 0.4887, + "step": 5343 + }, + { + "epoch": 0.66, + "grad_norm": 1.9562463930653387, + "learning_rate": 2.6914550075133506e-06, + "loss": 0.4693, + "step": 5344 + }, + { + "epoch": 0.66, + "grad_norm": 2.407055832699335, + "learning_rate": 2.6896725101266584e-06, + "loss": 0.5051, + "step": 5345 + }, + { + "epoch": 0.66, + "grad_norm": 1.657836405659061, + "learning_rate": 2.6878903859937245e-06, + "loss": 0.4861, + "step": 5346 + }, + { + "epoch": 0.66, + "grad_norm": 1.4575434472330062, + "learning_rate": 2.686108635402459e-06, + "loss": 0.5044, + "step": 5347 + }, + { + "epoch": 0.66, + "grad_norm": 1.2441405862275843, + "learning_rate": 2.684327258640725e-06, + "loss": 0.4904, + "step": 5348 + }, + { + "epoch": 0.66, + "grad_norm": 1.3436498811887572, + "learning_rate": 2.6825462559963144e-06, + "loss": 0.4471, + "step": 5349 + }, + { + "epoch": 0.66, + "grad_norm": 1.5582742686000106, + "learning_rate": 2.6807656277569694e-06, + "loss": 0.5168, + "step": 5350 + }, + { + "epoch": 0.66, + "grad_norm": 1.6881226866019212, + "learning_rate": 2.67898537421036e-06, + "loss": 0.501, + "step": 5351 + }, + { + "epoch": 0.66, + "grad_norm": 1.2901602306826498, + "learning_rate": 2.677205495644108e-06, + "loss": 0.4999, + "step": 5352 + }, + { + "epoch": 0.66, + "grad_norm": 1.4483734768467285, + "learning_rate": 2.675425992345763e-06, + "loss": 0.4864, + "step": 5353 + }, + { + "epoch": 0.66, + "grad_norm": 1.358456513957977, + "learning_rate": 2.673646864602822e-06, + "loss": 0.4564, + "step": 5354 + }, + { + "epoch": 0.66, + "grad_norm": 1.3999051743052522, + "learning_rate": 2.671868112702721e-06, + "loss": 0.5713, + "step": 5355 + }, + { + "epoch": 0.66, + "grad_norm": 1.7831927687168119, + "learning_rate": 2.6700897369328286e-06, + "loss": 0.5226, + "step": 5356 + }, + { + "epoch": 0.66, + "grad_norm": 1.4520519300332069, + "learning_rate": 2.668311737580461e-06, + "loss": 0.5093, + "step": 5357 + }, + { + "epoch": 0.66, + "grad_norm": 1.5009878577871714, + "learning_rate": 2.6665341149328667e-06, + "loss": 0.5286, + "step": 5358 + }, + { + "epoch": 0.67, + "grad_norm": 0.7095770926619317, + "learning_rate": 2.6647568692772386e-06, + "loss": 0.4946, + "step": 5359 + }, + { + "epoch": 0.67, + "grad_norm": 1.5466040717092826, + "learning_rate": 2.662980000900704e-06, + "loss": 0.5143, + "step": 5360 + }, + { + "epoch": 0.67, + "grad_norm": 1.3673672944178537, + "learning_rate": 2.661203510090332e-06, + "loss": 0.5227, + "step": 5361 + }, + { + "epoch": 0.67, + "grad_norm": 1.5160648696253634, + "learning_rate": 2.659427397133134e-06, + "loss": 0.4959, + "step": 5362 + }, + { + "epoch": 0.67, + "grad_norm": 1.5195269244707366, + "learning_rate": 2.6576516623160515e-06, + "loss": 0.4796, + "step": 5363 + }, + { + "epoch": 0.67, + "grad_norm": 2.0608837465904233, + "learning_rate": 2.6558763059259745e-06, + "loss": 0.4623, + "step": 5364 + }, + { + "epoch": 0.67, + "grad_norm": 1.4251657068323844, + "learning_rate": 2.6541013282497234e-06, + "loss": 0.52, + "step": 5365 + }, + { + "epoch": 0.67, + "grad_norm": 1.582165778558871, + "learning_rate": 2.6523267295740663e-06, + "loss": 0.4503, + "step": 5366 + }, + { + "epoch": 0.67, + "grad_norm": 1.7127631256028564, + "learning_rate": 2.6505525101857e-06, + "loss": 0.5577, + "step": 5367 + }, + { + "epoch": 0.67, + "grad_norm": 1.6587222980531775, + "learning_rate": 2.6487786703712692e-06, + "loss": 0.4739, + "step": 5368 + }, + { + "epoch": 0.67, + "grad_norm": 1.4720907931346792, + "learning_rate": 2.6470052104173504e-06, + "loss": 0.5673, + "step": 5369 + }, + { + "epoch": 0.67, + "grad_norm": 1.7165887666323725, + "learning_rate": 2.6452321306104634e-06, + "loss": 0.4357, + "step": 5370 + }, + { + "epoch": 0.67, + "grad_norm": 1.6261364104457807, + "learning_rate": 2.6434594312370664e-06, + "loss": 0.5145, + "step": 5371 + }, + { + "epoch": 0.67, + "grad_norm": 1.8610267248885162, + "learning_rate": 2.641687112583551e-06, + "loss": 0.4855, + "step": 5372 + }, + { + "epoch": 0.67, + "grad_norm": 1.6204122173348687, + "learning_rate": 2.639915174936254e-06, + "loss": 0.5469, + "step": 5373 + }, + { + "epoch": 0.67, + "grad_norm": 1.3873663278041979, + "learning_rate": 2.638143618581445e-06, + "loss": 0.5024, + "step": 5374 + }, + { + "epoch": 0.67, + "grad_norm": 1.7635110022929765, + "learning_rate": 2.6363724438053377e-06, + "loss": 0.4929, + "step": 5375 + }, + { + "epoch": 0.67, + "grad_norm": 1.5521889396540993, + "learning_rate": 2.6346016508940776e-06, + "loss": 0.4955, + "step": 5376 + }, + { + "epoch": 0.67, + "grad_norm": 1.6675794203169498, + "learning_rate": 2.632831240133754e-06, + "loss": 0.5452, + "step": 5377 + }, + { + "epoch": 0.67, + "grad_norm": 1.7268314997834946, + "learning_rate": 2.631061211810391e-06, + "loss": 0.4706, + "step": 5378 + }, + { + "epoch": 0.67, + "grad_norm": 2.1173737686340006, + "learning_rate": 2.629291566209952e-06, + "loss": 0.4931, + "step": 5379 + }, + { + "epoch": 0.67, + "grad_norm": 1.7066487562954822, + "learning_rate": 2.627522303618343e-06, + "loss": 0.4422, + "step": 5380 + }, + { + "epoch": 0.67, + "grad_norm": 1.496890984897217, + "learning_rate": 2.6257534243213977e-06, + "loss": 0.4978, + "step": 5381 + }, + { + "epoch": 0.67, + "grad_norm": 1.2373340535763007, + "learning_rate": 2.623984928604899e-06, + "loss": 0.4543, + "step": 5382 + }, + { + "epoch": 0.67, + "grad_norm": 1.3134546338998108, + "learning_rate": 2.6222168167545603e-06, + "loss": 0.4877, + "step": 5383 + }, + { + "epoch": 0.67, + "grad_norm": 1.4630151280361863, + "learning_rate": 2.6204490890560376e-06, + "loss": 0.5555, + "step": 5384 + }, + { + "epoch": 0.67, + "grad_norm": 1.451411796522502, + "learning_rate": 2.6186817457949203e-06, + "loss": 0.5066, + "step": 5385 + }, + { + "epoch": 0.67, + "grad_norm": 1.3195842991720612, + "learning_rate": 2.6169147872567398e-06, + "loss": 0.5071, + "step": 5386 + }, + { + "epoch": 0.67, + "grad_norm": 1.3759847548738064, + "learning_rate": 2.6151482137269652e-06, + "loss": 0.4938, + "step": 5387 + }, + { + "epoch": 0.67, + "grad_norm": 1.3174398231895585, + "learning_rate": 2.6133820254909993e-06, + "loss": 0.5007, + "step": 5388 + }, + { + "epoch": 0.67, + "grad_norm": 1.9484527254681077, + "learning_rate": 2.611616222834188e-06, + "loss": 0.4696, + "step": 5389 + }, + { + "epoch": 0.67, + "grad_norm": 1.3472055337152538, + "learning_rate": 2.6098508060418094e-06, + "loss": 0.5497, + "step": 5390 + }, + { + "epoch": 0.67, + "grad_norm": 1.3918179486748368, + "learning_rate": 2.6080857753990853e-06, + "loss": 0.4979, + "step": 5391 + }, + { + "epoch": 0.67, + "grad_norm": 1.3937102693000716, + "learning_rate": 2.6063211311911677e-06, + "loss": 0.5036, + "step": 5392 + }, + { + "epoch": 0.67, + "grad_norm": 1.479782076219447, + "learning_rate": 2.6045568737031557e-06, + "loss": 0.4686, + "step": 5393 + }, + { + "epoch": 0.67, + "grad_norm": 1.3786810743205558, + "learning_rate": 2.6027930032200744e-06, + "loss": 0.4774, + "step": 5394 + }, + { + "epoch": 0.67, + "grad_norm": 1.6375943044791605, + "learning_rate": 2.6010295200268993e-06, + "loss": 0.4739, + "step": 5395 + }, + { + "epoch": 0.67, + "grad_norm": 1.2752704209922996, + "learning_rate": 2.5992664244085337e-06, + "loss": 0.4696, + "step": 5396 + }, + { + "epoch": 0.67, + "grad_norm": 1.6687567800175958, + "learning_rate": 2.597503716649819e-06, + "loss": 0.5181, + "step": 5397 + }, + { + "epoch": 0.67, + "grad_norm": 2.0429923657351186, + "learning_rate": 2.5957413970355404e-06, + "loss": 0.5373, + "step": 5398 + }, + { + "epoch": 0.67, + "grad_norm": 1.5646069102263396, + "learning_rate": 2.5939794658504113e-06, + "loss": 0.5409, + "step": 5399 + }, + { + "epoch": 0.67, + "grad_norm": 1.6190220992841864, + "learning_rate": 2.592217923379093e-06, + "loss": 0.4874, + "step": 5400 + }, + { + "epoch": 0.67, + "grad_norm": 1.4753599318825783, + "learning_rate": 2.590456769906172e-06, + "loss": 0.4838, + "step": 5401 + }, + { + "epoch": 0.67, + "grad_norm": 2.4759908583800505, + "learning_rate": 2.588696005716184e-06, + "loss": 0.5297, + "step": 5402 + }, + { + "epoch": 0.67, + "grad_norm": 1.4459072953716825, + "learning_rate": 2.586935631093588e-06, + "loss": 0.5382, + "step": 5403 + }, + { + "epoch": 0.67, + "grad_norm": 1.5083852254491767, + "learning_rate": 2.5851756463227985e-06, + "loss": 0.5376, + "step": 5404 + }, + { + "epoch": 0.67, + "grad_norm": 1.4886043444239714, + "learning_rate": 2.5834160516881503e-06, + "loss": 0.5065, + "step": 5405 + }, + { + "epoch": 0.67, + "grad_norm": 1.4808882691319507, + "learning_rate": 2.5816568474739205e-06, + "loss": 0.4836, + "step": 5406 + }, + { + "epoch": 0.67, + "grad_norm": 1.8579835845587833, + "learning_rate": 2.579898033964328e-06, + "loss": 0.4827, + "step": 5407 + }, + { + "epoch": 0.67, + "grad_norm": 1.3889553938886416, + "learning_rate": 2.578139611443521e-06, + "loss": 0.474, + "step": 5408 + }, + { + "epoch": 0.67, + "grad_norm": 1.3946539677304366, + "learning_rate": 2.5763815801955906e-06, + "loss": 0.4721, + "step": 5409 + }, + { + "epoch": 0.67, + "grad_norm": 1.3500902016545895, + "learning_rate": 2.5746239405045592e-06, + "loss": 0.4449, + "step": 5410 + }, + { + "epoch": 0.67, + "grad_norm": 1.1094960897936739, + "learning_rate": 2.572866692654392e-06, + "loss": 0.4604, + "step": 5411 + }, + { + "epoch": 0.67, + "grad_norm": 1.7659196268114692, + "learning_rate": 2.5711098369289867e-06, + "loss": 0.4616, + "step": 5412 + }, + { + "epoch": 0.67, + "grad_norm": 1.494765942335093, + "learning_rate": 2.56935337361218e-06, + "loss": 0.5224, + "step": 5413 + }, + { + "epoch": 0.67, + "grad_norm": 1.4371359529712553, + "learning_rate": 2.5675973029877437e-06, + "loss": 0.4767, + "step": 5414 + }, + { + "epoch": 0.67, + "grad_norm": 1.2718173194398794, + "learning_rate": 2.565841625339384e-06, + "loss": 0.4592, + "step": 5415 + }, + { + "epoch": 0.67, + "grad_norm": 1.5232337238041327, + "learning_rate": 2.5640863409507497e-06, + "loss": 0.4844, + "step": 5416 + }, + { + "epoch": 0.67, + "grad_norm": 1.6899511433971866, + "learning_rate": 2.5623314501054187e-06, + "loss": 0.5109, + "step": 5417 + }, + { + "epoch": 0.67, + "grad_norm": 1.3912369568606449, + "learning_rate": 2.560576953086913e-06, + "loss": 0.4751, + "step": 5418 + }, + { + "epoch": 0.67, + "grad_norm": 0.6970311944964789, + "learning_rate": 2.5588228501786804e-06, + "loss": 0.4954, + "step": 5419 + }, + { + "epoch": 0.67, + "grad_norm": 1.728631154719974, + "learning_rate": 2.5570691416641215e-06, + "loss": 0.5504, + "step": 5420 + }, + { + "epoch": 0.67, + "grad_norm": 1.588536882862975, + "learning_rate": 2.5553158278265553e-06, + "loss": 0.5101, + "step": 5421 + }, + { + "epoch": 0.67, + "grad_norm": 2.2162522648632335, + "learning_rate": 2.5535629089492496e-06, + "loss": 0.5346, + "step": 5422 + }, + { + "epoch": 0.67, + "grad_norm": 1.3282845832866839, + "learning_rate": 2.551810385315403e-06, + "loss": 0.4933, + "step": 5423 + }, + { + "epoch": 0.67, + "grad_norm": 1.8115742362225473, + "learning_rate": 2.550058257208149e-06, + "loss": 0.4784, + "step": 5424 + }, + { + "epoch": 0.67, + "grad_norm": 1.4077049409848308, + "learning_rate": 2.5483065249105614e-06, + "loss": 0.4772, + "step": 5425 + }, + { + "epoch": 0.67, + "grad_norm": 1.3679985824357763, + "learning_rate": 2.546555188705646e-06, + "loss": 0.5134, + "step": 5426 + }, + { + "epoch": 0.67, + "grad_norm": 0.6740862704181385, + "learning_rate": 2.544804248876348e-06, + "loss": 0.4975, + "step": 5427 + }, + { + "epoch": 0.67, + "grad_norm": 1.3944543755158425, + "learning_rate": 2.5430537057055466e-06, + "loss": 0.4992, + "step": 5428 + }, + { + "epoch": 0.67, + "grad_norm": 1.4906121548513718, + "learning_rate": 2.54130355947606e-06, + "loss": 0.535, + "step": 5429 + }, + { + "epoch": 0.67, + "grad_norm": 1.3062691441609326, + "learning_rate": 2.539553810470636e-06, + "loss": 0.4755, + "step": 5430 + }, + { + "epoch": 0.67, + "grad_norm": 2.2265555064473412, + "learning_rate": 2.537804458971965e-06, + "loss": 0.4526, + "step": 5431 + }, + { + "epoch": 0.67, + "grad_norm": 1.6179227016248734, + "learning_rate": 2.5360555052626666e-06, + "loss": 0.4724, + "step": 5432 + }, + { + "epoch": 0.67, + "grad_norm": 1.7545604440470863, + "learning_rate": 2.534306949625305e-06, + "loss": 0.5003, + "step": 5433 + }, + { + "epoch": 0.67, + "grad_norm": 1.4398909017521693, + "learning_rate": 2.53255879234237e-06, + "loss": 0.482, + "step": 5434 + }, + { + "epoch": 0.67, + "grad_norm": 1.5461741379825082, + "learning_rate": 2.5308110336962904e-06, + "loss": 0.4763, + "step": 5435 + }, + { + "epoch": 0.67, + "grad_norm": 1.481190626860878, + "learning_rate": 2.5290636739694384e-06, + "loss": 0.5068, + "step": 5436 + }, + { + "epoch": 0.67, + "grad_norm": 1.7330072487063952, + "learning_rate": 2.5273167134441107e-06, + "loss": 0.4936, + "step": 5437 + }, + { + "epoch": 0.67, + "grad_norm": 1.7135444486315234, + "learning_rate": 2.5255701524025466e-06, + "loss": 0.5632, + "step": 5438 + }, + { + "epoch": 0.67, + "grad_norm": 1.3555355930905109, + "learning_rate": 2.523823991126916e-06, + "loss": 0.4596, + "step": 5439 + }, + { + "epoch": 0.68, + "grad_norm": 1.8351747571846526, + "learning_rate": 2.5220782298993297e-06, + "loss": 0.4841, + "step": 5440 + }, + { + "epoch": 0.68, + "grad_norm": 1.3244901585305506, + "learning_rate": 2.5203328690018266e-06, + "loss": 0.5499, + "step": 5441 + }, + { + "epoch": 0.68, + "grad_norm": 1.2873125222445203, + "learning_rate": 2.5185879087163896e-06, + "loss": 0.522, + "step": 5442 + }, + { + "epoch": 0.68, + "grad_norm": 1.4651537968091042, + "learning_rate": 2.51684334932493e-06, + "loss": 0.5391, + "step": 5443 + }, + { + "epoch": 0.68, + "grad_norm": 1.4053443389505933, + "learning_rate": 2.5150991911092935e-06, + "loss": 0.4871, + "step": 5444 + }, + { + "epoch": 0.68, + "grad_norm": 1.492951892417051, + "learning_rate": 2.513355434351271e-06, + "loss": 0.5062, + "step": 5445 + }, + { + "epoch": 0.68, + "grad_norm": 1.943793634105126, + "learning_rate": 2.511612079332577e-06, + "loss": 0.4549, + "step": 5446 + }, + { + "epoch": 0.68, + "grad_norm": 1.3936898928307528, + "learning_rate": 2.5098691263348697e-06, + "loss": 0.4575, + "step": 5447 + }, + { + "epoch": 0.68, + "grad_norm": 2.26250521984885, + "learning_rate": 2.508126575639733e-06, + "loss": 0.4571, + "step": 5448 + }, + { + "epoch": 0.68, + "grad_norm": 1.4848240903706404, + "learning_rate": 2.5063844275286974e-06, + "loss": 0.5088, + "step": 5449 + }, + { + "epoch": 0.68, + "grad_norm": 1.5001599313859406, + "learning_rate": 2.5046426822832175e-06, + "loss": 0.463, + "step": 5450 + }, + { + "epoch": 0.68, + "grad_norm": 1.5160790102059807, + "learning_rate": 2.5029013401846913e-06, + "loss": 0.4656, + "step": 5451 + }, + { + "epoch": 0.68, + "grad_norm": 2.123264081371732, + "learning_rate": 2.5011604015144435e-06, + "loss": 0.4472, + "step": 5452 + }, + { + "epoch": 0.68, + "grad_norm": 1.3789951391327997, + "learning_rate": 2.499419866553741e-06, + "loss": 0.5015, + "step": 5453 + }, + { + "epoch": 0.68, + "grad_norm": 1.4603123809144725, + "learning_rate": 2.4976797355837845e-06, + "loss": 0.5077, + "step": 5454 + }, + { + "epoch": 0.68, + "grad_norm": 1.7650029454656442, + "learning_rate": 2.495940008885703e-06, + "loss": 0.5056, + "step": 5455 + }, + { + "epoch": 0.68, + "grad_norm": 3.4849664616453735, + "learning_rate": 2.4942006867405685e-06, + "loss": 0.531, + "step": 5456 + }, + { + "epoch": 0.68, + "grad_norm": 1.493518897193527, + "learning_rate": 2.492461769429381e-06, + "loss": 0.5234, + "step": 5457 + }, + { + "epoch": 0.68, + "grad_norm": 1.3869502079430214, + "learning_rate": 2.49072325723308e-06, + "loss": 0.5068, + "step": 5458 + }, + { + "epoch": 0.68, + "grad_norm": 0.681702737575638, + "learning_rate": 2.4889851504325348e-06, + "loss": 0.5342, + "step": 5459 + }, + { + "epoch": 0.68, + "grad_norm": 0.6611963051545626, + "learning_rate": 2.487247449308554e-06, + "loss": 0.5225, + "step": 5460 + }, + { + "epoch": 0.68, + "grad_norm": 1.4589571409635018, + "learning_rate": 2.4855101541418797e-06, + "loss": 0.5014, + "step": 5461 + }, + { + "epoch": 0.68, + "grad_norm": 1.2537043292301941, + "learning_rate": 2.483773265213184e-06, + "loss": 0.4961, + "step": 5462 + }, + { + "epoch": 0.68, + "grad_norm": 1.32920342648201, + "learning_rate": 2.48203678280308e-06, + "loss": 0.4458, + "step": 5463 + }, + { + "epoch": 0.68, + "grad_norm": 1.3681830900025551, + "learning_rate": 2.4803007071921083e-06, + "loss": 0.4675, + "step": 5464 + }, + { + "epoch": 0.68, + "grad_norm": 1.4756488701641188, + "learning_rate": 2.478565038660751e-06, + "loss": 0.452, + "step": 5465 + }, + { + "epoch": 0.68, + "grad_norm": 1.6817738940482172, + "learning_rate": 2.4768297774894157e-06, + "loss": 0.5072, + "step": 5466 + }, + { + "epoch": 0.68, + "grad_norm": 1.5245969438197655, + "learning_rate": 2.4750949239584543e-06, + "loss": 0.475, + "step": 5467 + }, + { + "epoch": 0.68, + "grad_norm": 1.280761509374414, + "learning_rate": 2.4733604783481436e-06, + "loss": 0.4787, + "step": 5468 + }, + { + "epoch": 0.68, + "grad_norm": 1.4899473963878174, + "learning_rate": 2.4716264409387005e-06, + "loss": 0.496, + "step": 5469 + }, + { + "epoch": 0.68, + "grad_norm": 1.7960664090989007, + "learning_rate": 2.469892812010275e-06, + "loss": 0.5228, + "step": 5470 + }, + { + "epoch": 0.68, + "grad_norm": 1.2990665854475318, + "learning_rate": 2.4681595918429473e-06, + "loss": 0.4516, + "step": 5471 + }, + { + "epoch": 0.68, + "grad_norm": 1.717509107476117, + "learning_rate": 2.466426780716738e-06, + "loss": 0.5279, + "step": 5472 + }, + { + "epoch": 0.68, + "grad_norm": 3.820181283099114, + "learning_rate": 2.4646943789115947e-06, + "loss": 0.5585, + "step": 5473 + }, + { + "epoch": 0.68, + "grad_norm": 1.4540625568618581, + "learning_rate": 2.4629623867074043e-06, + "loss": 0.5351, + "step": 5474 + }, + { + "epoch": 0.68, + "grad_norm": 1.3742096801051715, + "learning_rate": 2.4612308043839835e-06, + "loss": 0.5146, + "step": 5475 + }, + { + "epoch": 0.68, + "grad_norm": 1.5797660549317376, + "learning_rate": 2.459499632221085e-06, + "loss": 0.5133, + "step": 5476 + }, + { + "epoch": 0.68, + "grad_norm": 1.776131095900743, + "learning_rate": 2.4577688704983984e-06, + "loss": 0.4713, + "step": 5477 + }, + { + "epoch": 0.68, + "grad_norm": 1.483890797314977, + "learning_rate": 2.456038519495538e-06, + "loss": 0.4944, + "step": 5478 + }, + { + "epoch": 0.68, + "grad_norm": 1.958313627579842, + "learning_rate": 2.4543085794920616e-06, + "loss": 0.5002, + "step": 5479 + }, + { + "epoch": 0.68, + "grad_norm": 1.6271441054448663, + "learning_rate": 2.4525790507674543e-06, + "loss": 0.5027, + "step": 5480 + }, + { + "epoch": 0.68, + "grad_norm": 0.6516835480074524, + "learning_rate": 2.4508499336011377e-06, + "loss": 0.4594, + "step": 5481 + }, + { + "epoch": 0.68, + "grad_norm": 1.5440243026978204, + "learning_rate": 2.4491212282724637e-06, + "loss": 0.5465, + "step": 5482 + }, + { + "epoch": 0.68, + "grad_norm": 1.4122249610320332, + "learning_rate": 2.447392935060723e-06, + "loss": 0.4976, + "step": 5483 + }, + { + "epoch": 0.68, + "grad_norm": 1.4013850774891232, + "learning_rate": 2.4456650542451333e-06, + "loss": 0.4864, + "step": 5484 + }, + { + "epoch": 0.68, + "grad_norm": 1.77467430479846, + "learning_rate": 2.443937586104851e-06, + "loss": 0.5007, + "step": 5485 + }, + { + "epoch": 0.68, + "grad_norm": 1.3686948184301855, + "learning_rate": 2.4422105309189646e-06, + "loss": 0.4656, + "step": 5486 + }, + { + "epoch": 0.68, + "grad_norm": 1.6813398955724435, + "learning_rate": 2.4404838889664923e-06, + "loss": 0.51, + "step": 5487 + }, + { + "epoch": 0.68, + "grad_norm": 1.3466499318485836, + "learning_rate": 2.4387576605263924e-06, + "loss": 0.4295, + "step": 5488 + }, + { + "epoch": 0.68, + "grad_norm": 1.312175726859582, + "learning_rate": 2.4370318458775472e-06, + "loss": 0.4509, + "step": 5489 + }, + { + "epoch": 0.68, + "grad_norm": 1.4006040680258878, + "learning_rate": 2.435306445298782e-06, + "loss": 0.4833, + "step": 5490 + }, + { + "epoch": 0.68, + "grad_norm": 1.960510313343304, + "learning_rate": 2.4335814590688465e-06, + "loss": 0.4776, + "step": 5491 + }, + { + "epoch": 0.68, + "grad_norm": 1.3678018833794279, + "learning_rate": 2.4318568874664318e-06, + "loss": 0.4486, + "step": 5492 + }, + { + "epoch": 0.68, + "grad_norm": 1.786463696861167, + "learning_rate": 2.4301327307701526e-06, + "loss": 0.4443, + "step": 5493 + }, + { + "epoch": 0.68, + "grad_norm": 1.9480246804073973, + "learning_rate": 2.4284089892585642e-06, + "loss": 0.4685, + "step": 5494 + }, + { + "epoch": 0.68, + "grad_norm": 1.3265526795294944, + "learning_rate": 2.426685663210154e-06, + "loss": 0.4253, + "step": 5495 + }, + { + "epoch": 0.68, + "grad_norm": 1.341929758596841, + "learning_rate": 2.424962752903337e-06, + "loss": 0.5203, + "step": 5496 + }, + { + "epoch": 0.68, + "grad_norm": 1.4182009629873498, + "learning_rate": 2.4232402586164677e-06, + "loss": 0.4407, + "step": 5497 + }, + { + "epoch": 0.68, + "grad_norm": 1.742253522033818, + "learning_rate": 2.421518180627827e-06, + "loss": 0.5481, + "step": 5498 + }, + { + "epoch": 0.68, + "grad_norm": 0.6532235894146765, + "learning_rate": 2.4197965192156354e-06, + "loss": 0.4956, + "step": 5499 + }, + { + "epoch": 0.68, + "grad_norm": 1.3161751128320978, + "learning_rate": 2.418075274658039e-06, + "loss": 0.4593, + "step": 5500 + }, + { + "epoch": 0.68, + "grad_norm": 0.6334983346371813, + "learning_rate": 2.4163544472331207e-06, + "loss": 0.4933, + "step": 5501 + }, + { + "epoch": 0.68, + "grad_norm": 1.6149656215565829, + "learning_rate": 2.4146340372188964e-06, + "loss": 0.4689, + "step": 5502 + }, + { + "epoch": 0.68, + "grad_norm": 1.7919360703700247, + "learning_rate": 2.412914044893316e-06, + "loss": 0.5363, + "step": 5503 + }, + { + "epoch": 0.68, + "grad_norm": 2.457696564614905, + "learning_rate": 2.4111944705342554e-06, + "loss": 0.5046, + "step": 5504 + }, + { + "epoch": 0.68, + "grad_norm": 1.8186669373877837, + "learning_rate": 2.409475314419527e-06, + "loss": 0.4993, + "step": 5505 + }, + { + "epoch": 0.68, + "grad_norm": 1.3221148999267518, + "learning_rate": 2.407756576826879e-06, + "loss": 0.4626, + "step": 5506 + }, + { + "epoch": 0.68, + "grad_norm": 1.6052732752272765, + "learning_rate": 2.4060382580339842e-06, + "loss": 0.4899, + "step": 5507 + }, + { + "epoch": 0.68, + "grad_norm": 2.5767022145696568, + "learning_rate": 2.404320358318456e-06, + "loss": 0.5222, + "step": 5508 + }, + { + "epoch": 0.68, + "grad_norm": 2.5095809579755186, + "learning_rate": 2.4026028779578315e-06, + "loss": 0.4617, + "step": 5509 + }, + { + "epoch": 0.68, + "grad_norm": 1.97793047083496, + "learning_rate": 2.400885817229591e-06, + "loss": 0.4826, + "step": 5510 + }, + { + "epoch": 0.68, + "grad_norm": 1.3588500491345321, + "learning_rate": 2.3991691764111365e-06, + "loss": 0.5548, + "step": 5511 + }, + { + "epoch": 0.68, + "grad_norm": 2.6518198586561694, + "learning_rate": 2.3974529557798095e-06, + "loss": 0.4769, + "step": 5512 + }, + { + "epoch": 0.68, + "grad_norm": 1.390464564482471, + "learning_rate": 2.395737155612878e-06, + "loss": 0.5309, + "step": 5513 + }, + { + "epoch": 0.68, + "grad_norm": 1.3918943127026036, + "learning_rate": 2.3940217761875433e-06, + "loss": 0.51, + "step": 5514 + }, + { + "epoch": 0.68, + "grad_norm": 1.6977411274786531, + "learning_rate": 2.392306817780945e-06, + "loss": 0.4702, + "step": 5515 + }, + { + "epoch": 0.68, + "grad_norm": 1.6922389974526388, + "learning_rate": 2.390592280670144e-06, + "loss": 0.4979, + "step": 5516 + }, + { + "epoch": 0.68, + "grad_norm": 1.3787793285566037, + "learning_rate": 2.388878165132142e-06, + "loss": 0.4635, + "step": 5517 + }, + { + "epoch": 0.68, + "grad_norm": 1.646674192788396, + "learning_rate": 2.387164471443869e-06, + "loss": 0.4554, + "step": 5518 + }, + { + "epoch": 0.68, + "grad_norm": 1.3207958483783915, + "learning_rate": 2.3854511998821894e-06, + "loss": 0.5156, + "step": 5519 + }, + { + "epoch": 0.69, + "grad_norm": 2.227695167477801, + "learning_rate": 2.3837383507238936e-06, + "loss": 0.5333, + "step": 5520 + }, + { + "epoch": 0.69, + "grad_norm": 1.4267150504517874, + "learning_rate": 2.382025924245711e-06, + "loss": 0.5071, + "step": 5521 + }, + { + "epoch": 0.69, + "grad_norm": 1.3844833274592145, + "learning_rate": 2.3803139207242974e-06, + "loss": 0.4785, + "step": 5522 + }, + { + "epoch": 0.69, + "grad_norm": 1.4789844668096037, + "learning_rate": 2.37860234043624e-06, + "loss": 0.5884, + "step": 5523 + }, + { + "epoch": 0.69, + "grad_norm": 1.4224685107437975, + "learning_rate": 2.3768911836580645e-06, + "loss": 0.5148, + "step": 5524 + }, + { + "epoch": 0.69, + "grad_norm": 1.9225409466991767, + "learning_rate": 2.3751804506662174e-06, + "loss": 0.5083, + "step": 5525 + }, + { + "epoch": 0.69, + "grad_norm": 1.5887749841143064, + "learning_rate": 2.3734701417370866e-06, + "loss": 0.4917, + "step": 5526 + }, + { + "epoch": 0.69, + "grad_norm": 1.7569083490679074, + "learning_rate": 2.3717602571469865e-06, + "loss": 0.4632, + "step": 5527 + }, + { + "epoch": 0.69, + "grad_norm": 1.4140716874174504, + "learning_rate": 2.3700507971721663e-06, + "loss": 0.427, + "step": 5528 + }, + { + "epoch": 0.69, + "grad_norm": 1.609909140872164, + "learning_rate": 2.3683417620888003e-06, + "loss": 0.5611, + "step": 5529 + }, + { + "epoch": 0.69, + "grad_norm": 1.5878396327956295, + "learning_rate": 2.3666331521730026e-06, + "loss": 0.4592, + "step": 5530 + }, + { + "epoch": 0.69, + "grad_norm": 0.7151379897062956, + "learning_rate": 2.3649249677008097e-06, + "loss": 0.5009, + "step": 5531 + }, + { + "epoch": 0.69, + "grad_norm": 1.2996372953750224, + "learning_rate": 2.3632172089481973e-06, + "loss": 0.546, + "step": 5532 + }, + { + "epoch": 0.69, + "grad_norm": 2.077678869251373, + "learning_rate": 2.361509876191068e-06, + "loss": 0.4523, + "step": 5533 + }, + { + "epoch": 0.69, + "grad_norm": 1.747621552902312, + "learning_rate": 2.3598029697052522e-06, + "loss": 0.4888, + "step": 5534 + }, + { + "epoch": 0.69, + "grad_norm": 1.3678679837061298, + "learning_rate": 2.358096489766523e-06, + "loss": 0.5107, + "step": 5535 + }, + { + "epoch": 0.69, + "grad_norm": 1.6739888453600988, + "learning_rate": 2.3563904366505717e-06, + "loss": 0.4862, + "step": 5536 + }, + { + "epoch": 0.69, + "grad_norm": 1.8955093820392193, + "learning_rate": 2.3546848106330308e-06, + "loss": 0.5101, + "step": 5537 + }, + { + "epoch": 0.69, + "grad_norm": 1.6366006897451797, + "learning_rate": 2.352979611989454e-06, + "loss": 0.5044, + "step": 5538 + }, + { + "epoch": 0.69, + "grad_norm": 1.5750665629516203, + "learning_rate": 2.3512748409953358e-06, + "loss": 0.489, + "step": 5539 + }, + { + "epoch": 0.69, + "grad_norm": 1.4189184004954971, + "learning_rate": 2.3495704979260924e-06, + "loss": 0.4985, + "step": 5540 + }, + { + "epoch": 0.69, + "grad_norm": 1.666993666611936, + "learning_rate": 2.3478665830570807e-06, + "loss": 0.4446, + "step": 5541 + }, + { + "epoch": 0.69, + "grad_norm": 1.4494929286590599, + "learning_rate": 2.3461630966635784e-06, + "loss": 0.497, + "step": 5542 + }, + { + "epoch": 0.69, + "grad_norm": 1.487393842420435, + "learning_rate": 2.3444600390208007e-06, + "loss": 0.4965, + "step": 5543 + }, + { + "epoch": 0.69, + "grad_norm": 1.5902444904379578, + "learning_rate": 2.3427574104038934e-06, + "loss": 0.4581, + "step": 5544 + }, + { + "epoch": 0.69, + "grad_norm": 1.161066446508982, + "learning_rate": 2.3410552110879277e-06, + "loss": 0.4993, + "step": 5545 + }, + { + "epoch": 0.69, + "grad_norm": 1.3929795053155725, + "learning_rate": 2.339353441347912e-06, + "loss": 0.5134, + "step": 5546 + }, + { + "epoch": 0.69, + "grad_norm": 1.37935134864808, + "learning_rate": 2.3376521014587787e-06, + "loss": 0.504, + "step": 5547 + }, + { + "epoch": 0.69, + "grad_norm": 2.0874526415093935, + "learning_rate": 2.335951191695399e-06, + "loss": 0.53, + "step": 5548 + }, + { + "epoch": 0.69, + "grad_norm": 1.7303232882596309, + "learning_rate": 2.334250712332565e-06, + "loss": 0.4948, + "step": 5549 + }, + { + "epoch": 0.69, + "grad_norm": 1.8220759392032708, + "learning_rate": 2.3325506636450056e-06, + "loss": 0.5296, + "step": 5550 + }, + { + "epoch": 0.69, + "grad_norm": 1.4777829873197916, + "learning_rate": 2.3308510459073817e-06, + "loss": 0.4775, + "step": 5551 + }, + { + "epoch": 0.69, + "grad_norm": 1.5027960581625106, + "learning_rate": 2.3291518593942774e-06, + "loss": 0.5385, + "step": 5552 + }, + { + "epoch": 0.69, + "grad_norm": 1.5217717603564733, + "learning_rate": 2.3274531043802148e-06, + "loss": 0.4594, + "step": 5553 + }, + { + "epoch": 0.69, + "grad_norm": 1.5906557921951225, + "learning_rate": 2.325754781139638e-06, + "loss": 0.4724, + "step": 5554 + }, + { + "epoch": 0.69, + "grad_norm": 1.329096145339709, + "learning_rate": 2.3240568899469317e-06, + "loss": 0.4716, + "step": 5555 + }, + { + "epoch": 0.69, + "grad_norm": 1.4012838734313477, + "learning_rate": 2.322359431076401e-06, + "loss": 0.4911, + "step": 5556 + }, + { + "epoch": 0.69, + "grad_norm": 1.5384021925882139, + "learning_rate": 2.3206624048022872e-06, + "loss": 0.506, + "step": 5557 + }, + { + "epoch": 0.69, + "grad_norm": 0.655449448220534, + "learning_rate": 2.318965811398759e-06, + "loss": 0.5081, + "step": 5558 + }, + { + "epoch": 0.69, + "grad_norm": 1.5825164260224847, + "learning_rate": 2.317269651139916e-06, + "loss": 0.5677, + "step": 5559 + }, + { + "epoch": 0.69, + "grad_norm": 1.6821233375146059, + "learning_rate": 2.3155739242997893e-06, + "loss": 0.5366, + "step": 5560 + }, + { + "epoch": 0.69, + "grad_norm": 1.5594929350983364, + "learning_rate": 2.3138786311523364e-06, + "loss": 0.4455, + "step": 5561 + }, + { + "epoch": 0.69, + "grad_norm": 1.4382628232628942, + "learning_rate": 2.3121837719714496e-06, + "loss": 0.4402, + "step": 5562 + }, + { + "epoch": 0.69, + "grad_norm": 1.4816789194218292, + "learning_rate": 2.310489347030945e-06, + "loss": 0.4789, + "step": 5563 + }, + { + "epoch": 0.69, + "grad_norm": 1.5188438068192134, + "learning_rate": 2.3087953566045743e-06, + "loss": 0.4484, + "step": 5564 + }, + { + "epoch": 0.69, + "grad_norm": 1.5098238352487316, + "learning_rate": 2.307101800966015e-06, + "loss": 0.4725, + "step": 5565 + }, + { + "epoch": 0.69, + "grad_norm": 1.7038045623006983, + "learning_rate": 2.305408680388878e-06, + "loss": 0.4944, + "step": 5566 + }, + { + "epoch": 0.69, + "grad_norm": 1.2637120328181233, + "learning_rate": 2.303715995146699e-06, + "loss": 0.5007, + "step": 5567 + }, + { + "epoch": 0.69, + "grad_norm": 1.4973349132088374, + "learning_rate": 2.3020237455129473e-06, + "loss": 0.5026, + "step": 5568 + }, + { + "epoch": 0.69, + "grad_norm": 2.5771957423178784, + "learning_rate": 2.3003319317610232e-06, + "loss": 0.441, + "step": 5569 + }, + { + "epoch": 0.69, + "grad_norm": 1.9968666654182445, + "learning_rate": 2.298640554164251e-06, + "loss": 0.5056, + "step": 5570 + }, + { + "epoch": 0.69, + "grad_norm": 1.5088225959683377, + "learning_rate": 2.2969496129958896e-06, + "loss": 0.4609, + "step": 5571 + }, + { + "epoch": 0.69, + "grad_norm": 1.5688355879342355, + "learning_rate": 2.295259108529123e-06, + "loss": 0.5082, + "step": 5572 + }, + { + "epoch": 0.69, + "grad_norm": 2.1218690144924017, + "learning_rate": 2.29356904103707e-06, + "loss": 0.477, + "step": 5573 + }, + { + "epoch": 0.69, + "grad_norm": 1.3317460113932156, + "learning_rate": 2.291879410792774e-06, + "loss": 0.5049, + "step": 5574 + }, + { + "epoch": 0.69, + "grad_norm": 1.6361394777722302, + "learning_rate": 2.2901902180692094e-06, + "loss": 0.4927, + "step": 5575 + }, + { + "epoch": 0.69, + "grad_norm": 1.2600474098118004, + "learning_rate": 2.288501463139282e-06, + "loss": 0.4862, + "step": 5576 + }, + { + "epoch": 0.69, + "grad_norm": 1.8835019069260908, + "learning_rate": 2.286813146275823e-06, + "loss": 0.465, + "step": 5577 + }, + { + "epoch": 0.69, + "grad_norm": 0.8127391882820307, + "learning_rate": 2.285125267751598e-06, + "loss": 0.5114, + "step": 5578 + }, + { + "epoch": 0.69, + "grad_norm": 1.8926521832120369, + "learning_rate": 2.2834378278392937e-06, + "loss": 0.5105, + "step": 5579 + }, + { + "epoch": 0.69, + "grad_norm": 1.4873921582045961, + "learning_rate": 2.2817508268115364e-06, + "loss": 0.4717, + "step": 5580 + }, + { + "epoch": 0.69, + "grad_norm": 2.819933821860014, + "learning_rate": 2.280064264940871e-06, + "loss": 0.5442, + "step": 5581 + }, + { + "epoch": 0.69, + "grad_norm": 1.683485669972065, + "learning_rate": 2.27837814249978e-06, + "loss": 0.5305, + "step": 5582 + }, + { + "epoch": 0.69, + "grad_norm": 0.6440117666442702, + "learning_rate": 2.2766924597606686e-06, + "loss": 0.499, + "step": 5583 + }, + { + "epoch": 0.69, + "grad_norm": 1.6879618779299213, + "learning_rate": 2.2750072169958754e-06, + "loss": 0.5379, + "step": 5584 + }, + { + "epoch": 0.69, + "grad_norm": 1.9671847158512492, + "learning_rate": 2.273322414477668e-06, + "loss": 0.4351, + "step": 5585 + }, + { + "epoch": 0.69, + "grad_norm": 1.466482216219935, + "learning_rate": 2.271638052478236e-06, + "loss": 0.4478, + "step": 5586 + }, + { + "epoch": 0.69, + "grad_norm": 1.9616512482384683, + "learning_rate": 2.269954131269708e-06, + "loss": 0.4975, + "step": 5587 + }, + { + "epoch": 0.69, + "grad_norm": 1.3639134192599855, + "learning_rate": 2.268270651124133e-06, + "loss": 0.4843, + "step": 5588 + }, + { + "epoch": 0.69, + "grad_norm": 1.3553964640402418, + "learning_rate": 2.266587612313495e-06, + "loss": 0.492, + "step": 5589 + }, + { + "epoch": 0.69, + "grad_norm": 1.5577828659517323, + "learning_rate": 2.2649050151096994e-06, + "loss": 0.5129, + "step": 5590 + }, + { + "epoch": 0.69, + "grad_norm": 1.6522603447358237, + "learning_rate": 2.2632228597845878e-06, + "loss": 0.4852, + "step": 5591 + }, + { + "epoch": 0.69, + "grad_norm": 1.6798186942226545, + "learning_rate": 2.2615411466099283e-06, + "loss": 0.5052, + "step": 5592 + }, + { + "epoch": 0.69, + "grad_norm": 1.7162936489412086, + "learning_rate": 2.259859875857413e-06, + "loss": 0.4911, + "step": 5593 + }, + { + "epoch": 0.69, + "grad_norm": 1.4025456726402974, + "learning_rate": 2.2581790477986692e-06, + "loss": 0.5039, + "step": 5594 + }, + { + "epoch": 0.69, + "grad_norm": 1.5845470815346903, + "learning_rate": 2.2564986627052463e-06, + "loss": 0.5274, + "step": 5595 + }, + { + "epoch": 0.69, + "grad_norm": 1.6416412378921217, + "learning_rate": 2.2548187208486293e-06, + "loss": 0.5115, + "step": 5596 + }, + { + "epoch": 0.69, + "grad_norm": 1.3451856385540713, + "learning_rate": 2.2531392225002236e-06, + "loss": 0.4623, + "step": 5597 + }, + { + "epoch": 0.69, + "grad_norm": 1.3698888882150644, + "learning_rate": 2.25146016793137e-06, + "loss": 0.4535, + "step": 5598 + }, + { + "epoch": 0.69, + "grad_norm": 0.6976438584873728, + "learning_rate": 2.2497815574133313e-06, + "loss": 0.5107, + "step": 5599 + }, + { + "epoch": 0.69, + "grad_norm": 2.1391226535547676, + "learning_rate": 2.2481033912173044e-06, + "loss": 0.5465, + "step": 5600 + }, + { + "epoch": 0.7, + "grad_norm": 1.4528519800705884, + "learning_rate": 2.2464256696144106e-06, + "loss": 0.5174, + "step": 5601 + }, + { + "epoch": 0.7, + "grad_norm": 1.4471062589320642, + "learning_rate": 2.2447483928757034e-06, + "loss": 0.5343, + "step": 5602 + }, + { + "epoch": 0.7, + "grad_norm": 1.7013573683309002, + "learning_rate": 2.243071561272159e-06, + "loss": 0.51, + "step": 5603 + }, + { + "epoch": 0.7, + "grad_norm": 1.6929219375700657, + "learning_rate": 2.241395175074683e-06, + "loss": 0.5223, + "step": 5604 + }, + { + "epoch": 0.7, + "grad_norm": 1.582600692096603, + "learning_rate": 2.2397192345541146e-06, + "loss": 0.5067, + "step": 5605 + }, + { + "epoch": 0.7, + "grad_norm": 1.3480484291333905, + "learning_rate": 2.238043739981212e-06, + "loss": 0.4497, + "step": 5606 + }, + { + "epoch": 0.7, + "grad_norm": 1.346904838587814, + "learning_rate": 2.2363686916266696e-06, + "loss": 0.5279, + "step": 5607 + }, + { + "epoch": 0.7, + "grad_norm": 1.4764789978750652, + "learning_rate": 2.234694089761102e-06, + "loss": 0.5216, + "step": 5608 + }, + { + "epoch": 0.7, + "grad_norm": 0.7128029989775837, + "learning_rate": 2.2330199346550624e-06, + "loss": 0.5023, + "step": 5609 + }, + { + "epoch": 0.7, + "grad_norm": 1.4965838514525496, + "learning_rate": 2.2313462265790198e-06, + "loss": 0.5115, + "step": 5610 + }, + { + "epoch": 0.7, + "grad_norm": 1.6073821385339875, + "learning_rate": 2.22967296580338e-06, + "loss": 0.4895, + "step": 5611 + }, + { + "epoch": 0.7, + "grad_norm": 1.479216078712145, + "learning_rate": 2.2280001525984718e-06, + "loss": 0.4812, + "step": 5612 + }, + { + "epoch": 0.7, + "grad_norm": 1.412021542856434, + "learning_rate": 2.2263277872345505e-06, + "loss": 0.482, + "step": 5613 + }, + { + "epoch": 0.7, + "grad_norm": 1.4877331023293925, + "learning_rate": 2.2246558699818056e-06, + "loss": 0.5364, + "step": 5614 + }, + { + "epoch": 0.7, + "grad_norm": 1.3517184707148822, + "learning_rate": 2.222984401110346e-06, + "loss": 0.467, + "step": 5615 + }, + { + "epoch": 0.7, + "grad_norm": 1.4341668244370513, + "learning_rate": 2.2213133808902143e-06, + "loss": 0.5515, + "step": 5616 + }, + { + "epoch": 0.7, + "grad_norm": 1.2868126565065974, + "learning_rate": 2.219642809591378e-06, + "loss": 0.5173, + "step": 5617 + }, + { + "epoch": 0.7, + "grad_norm": 1.3828616571993415, + "learning_rate": 2.2179726874837353e-06, + "loss": 0.4831, + "step": 5618 + }, + { + "epoch": 0.7, + "grad_norm": 1.6489534862379591, + "learning_rate": 2.2163030148371044e-06, + "loss": 0.4489, + "step": 5619 + }, + { + "epoch": 0.7, + "grad_norm": 3.1143036312350367, + "learning_rate": 2.214633791921241e-06, + "loss": 0.4652, + "step": 5620 + }, + { + "epoch": 0.7, + "grad_norm": 0.6425173069258526, + "learning_rate": 2.2129650190058188e-06, + "loss": 0.5185, + "step": 5621 + }, + { + "epoch": 0.7, + "grad_norm": 0.7896536582636093, + "learning_rate": 2.211296696360442e-06, + "loss": 0.4961, + "step": 5622 + }, + { + "epoch": 0.7, + "grad_norm": 1.363767251022316, + "learning_rate": 2.2096288242546464e-06, + "loss": 0.5514, + "step": 5623 + }, + { + "epoch": 0.7, + "grad_norm": 1.6085365905066162, + "learning_rate": 2.2079614029578865e-06, + "loss": 0.5116, + "step": 5624 + }, + { + "epoch": 0.7, + "grad_norm": 1.6616446613161735, + "learning_rate": 2.206294432739556e-06, + "loss": 0.4866, + "step": 5625 + }, + { + "epoch": 0.7, + "grad_norm": 2.405353254193536, + "learning_rate": 2.2046279138689617e-06, + "loss": 0.5026, + "step": 5626 + }, + { + "epoch": 0.7, + "grad_norm": 1.6128351079408425, + "learning_rate": 2.20296184661535e-06, + "loss": 0.4926, + "step": 5627 + }, + { + "epoch": 0.7, + "grad_norm": 1.4587630016659268, + "learning_rate": 2.201296231247884e-06, + "loss": 0.4461, + "step": 5628 + }, + { + "epoch": 0.7, + "grad_norm": 1.3835200685248408, + "learning_rate": 2.1996310680356623e-06, + "loss": 0.4562, + "step": 5629 + }, + { + "epoch": 0.7, + "grad_norm": 1.3805367109816795, + "learning_rate": 2.1979663572477057e-06, + "loss": 0.4576, + "step": 5630 + }, + { + "epoch": 0.7, + "grad_norm": 0.6463950616780922, + "learning_rate": 2.19630209915296e-06, + "loss": 0.4814, + "step": 5631 + }, + { + "epoch": 0.7, + "grad_norm": 0.6488872376356858, + "learning_rate": 2.1946382940203024e-06, + "loss": 0.4789, + "step": 5632 + }, + { + "epoch": 0.7, + "grad_norm": 1.3765869540183546, + "learning_rate": 2.1929749421185363e-06, + "loss": 0.4682, + "step": 5633 + }, + { + "epoch": 0.7, + "grad_norm": 1.7079525292081255, + "learning_rate": 2.191312043716392e-06, + "loss": 0.5344, + "step": 5634 + }, + { + "epoch": 0.7, + "grad_norm": 1.5976843490539296, + "learning_rate": 2.1896495990825224e-06, + "loss": 0.5024, + "step": 5635 + }, + { + "epoch": 0.7, + "grad_norm": 1.9632934935601438, + "learning_rate": 2.187987608485513e-06, + "loss": 0.461, + "step": 5636 + }, + { + "epoch": 0.7, + "grad_norm": 1.2722730035586236, + "learning_rate": 2.1863260721938696e-06, + "loss": 0.4721, + "step": 5637 + }, + { + "epoch": 0.7, + "grad_norm": 1.4855676434755076, + "learning_rate": 2.1846649904760315e-06, + "loss": 0.4987, + "step": 5638 + }, + { + "epoch": 0.7, + "grad_norm": 1.4833522957894028, + "learning_rate": 2.1830043636003574e-06, + "loss": 0.4885, + "step": 5639 + }, + { + "epoch": 0.7, + "grad_norm": 1.3258031890663018, + "learning_rate": 2.1813441918351407e-06, + "loss": 0.5451, + "step": 5640 + }, + { + "epoch": 0.7, + "grad_norm": 1.644381239071562, + "learning_rate": 2.179684475448592e-06, + "loss": 0.5008, + "step": 5641 + }, + { + "epoch": 0.7, + "grad_norm": 1.4396401967283894, + "learning_rate": 2.1780252147088555e-06, + "loss": 0.5547, + "step": 5642 + }, + { + "epoch": 0.7, + "grad_norm": 3.235056697104862, + "learning_rate": 2.1763664098840013e-06, + "loss": 0.4742, + "step": 5643 + }, + { + "epoch": 0.7, + "grad_norm": 2.538756193902995, + "learning_rate": 2.1747080612420202e-06, + "loss": 0.479, + "step": 5644 + }, + { + "epoch": 0.7, + "grad_norm": 1.489100563897908, + "learning_rate": 2.1730501690508363e-06, + "loss": 0.4465, + "step": 5645 + }, + { + "epoch": 0.7, + "grad_norm": 1.4644523867767538, + "learning_rate": 2.1713927335782934e-06, + "loss": 0.4719, + "step": 5646 + }, + { + "epoch": 0.7, + "grad_norm": 1.821065940683249, + "learning_rate": 2.169735755092168e-06, + "loss": 0.5072, + "step": 5647 + }, + { + "epoch": 0.7, + "grad_norm": 1.5885001554236768, + "learning_rate": 2.168079233860157e-06, + "loss": 0.5026, + "step": 5648 + }, + { + "epoch": 0.7, + "grad_norm": 1.7164720858256994, + "learning_rate": 2.166423170149887e-06, + "loss": 0.5201, + "step": 5649 + }, + { + "epoch": 0.7, + "grad_norm": 1.7594213840845911, + "learning_rate": 2.164767564228911e-06, + "loss": 0.5021, + "step": 5650 + }, + { + "epoch": 0.7, + "grad_norm": 3.028797252998077, + "learning_rate": 2.1631124163647043e-06, + "loss": 0.4976, + "step": 5651 + }, + { + "epoch": 0.7, + "grad_norm": 1.5397052115546976, + "learning_rate": 2.1614577268246735e-06, + "loss": 0.4835, + "step": 5652 + }, + { + "epoch": 0.7, + "grad_norm": 2.2110349025128313, + "learning_rate": 2.1598034958761448e-06, + "loss": 0.4917, + "step": 5653 + }, + { + "epoch": 0.7, + "grad_norm": 1.5698386403094702, + "learning_rate": 2.1581497237863767e-06, + "loss": 0.4759, + "step": 5654 + }, + { + "epoch": 0.7, + "grad_norm": 1.4109353451334927, + "learning_rate": 2.1564964108225485e-06, + "loss": 0.488, + "step": 5655 + }, + { + "epoch": 0.7, + "grad_norm": 0.6598736568588375, + "learning_rate": 2.15484355725177e-06, + "loss": 0.5005, + "step": 5656 + }, + { + "epoch": 0.7, + "grad_norm": 1.4919745602635004, + "learning_rate": 2.153191163341071e-06, + "loss": 0.486, + "step": 5657 + }, + { + "epoch": 0.7, + "grad_norm": 1.2253434355089252, + "learning_rate": 2.151539229357412e-06, + "loss": 0.4571, + "step": 5658 + }, + { + "epoch": 0.7, + "grad_norm": 1.5071311879513096, + "learning_rate": 2.149887755567679e-06, + "loss": 0.5155, + "step": 5659 + }, + { + "epoch": 0.7, + "grad_norm": 1.4334540922292345, + "learning_rate": 2.148236742238679e-06, + "loss": 0.4797, + "step": 5660 + }, + { + "epoch": 0.7, + "grad_norm": 1.6044020149090967, + "learning_rate": 2.1465861896371514e-06, + "loss": 0.5131, + "step": 5661 + }, + { + "epoch": 0.7, + "grad_norm": 1.3693514507790747, + "learning_rate": 2.1449360980297536e-06, + "loss": 0.4986, + "step": 5662 + }, + { + "epoch": 0.7, + "grad_norm": 1.230734930065685, + "learning_rate": 2.143286467683076e-06, + "loss": 0.4817, + "step": 5663 + }, + { + "epoch": 0.7, + "grad_norm": 1.3873301908337472, + "learning_rate": 2.1416372988636275e-06, + "loss": 0.4873, + "step": 5664 + }, + { + "epoch": 0.7, + "grad_norm": 0.6559315099206287, + "learning_rate": 2.1399885918378478e-06, + "loss": 0.4808, + "step": 5665 + }, + { + "epoch": 0.7, + "grad_norm": 2.19002529984588, + "learning_rate": 2.1383403468721013e-06, + "loss": 0.5296, + "step": 5666 + }, + { + "epoch": 0.7, + "grad_norm": 1.4688445935033496, + "learning_rate": 2.1366925642326735e-06, + "loss": 0.5354, + "step": 5667 + }, + { + "epoch": 0.7, + "grad_norm": 2.079621728241784, + "learning_rate": 2.135045244185781e-06, + "loss": 0.5183, + "step": 5668 + }, + { + "epoch": 0.7, + "grad_norm": 1.413368987131197, + "learning_rate": 2.13339838699756e-06, + "loss": 0.4898, + "step": 5669 + }, + { + "epoch": 0.7, + "grad_norm": 1.3210859609376642, + "learning_rate": 2.1317519929340787e-06, + "loss": 0.4783, + "step": 5670 + }, + { + "epoch": 0.7, + "grad_norm": 1.32159694606616, + "learning_rate": 2.130106062261322e-06, + "loss": 0.5103, + "step": 5671 + }, + { + "epoch": 0.7, + "grad_norm": 1.7155454747055365, + "learning_rate": 2.128460595245208e-06, + "loss": 0.4686, + "step": 5672 + }, + { + "epoch": 0.7, + "grad_norm": 1.4536064375277995, + "learning_rate": 2.126815592151574e-06, + "loss": 0.4817, + "step": 5673 + }, + { + "epoch": 0.7, + "grad_norm": 1.7002556576948114, + "learning_rate": 2.1251710532461854e-06, + "loss": 0.4708, + "step": 5674 + }, + { + "epoch": 0.7, + "grad_norm": 1.3823878012680884, + "learning_rate": 2.1235269787947345e-06, + "loss": 0.505, + "step": 5675 + }, + { + "epoch": 0.7, + "grad_norm": 1.3071705033544256, + "learning_rate": 2.121883369062832e-06, + "loss": 0.5239, + "step": 5676 + }, + { + "epoch": 0.7, + "grad_norm": 1.6916273684756746, + "learning_rate": 2.1202402243160215e-06, + "loss": 0.4552, + "step": 5677 + }, + { + "epoch": 0.7, + "grad_norm": 1.8044470154991188, + "learning_rate": 2.118597544819763e-06, + "loss": 0.4711, + "step": 5678 + }, + { + "epoch": 0.7, + "grad_norm": 1.4612633988316264, + "learning_rate": 2.11695533083945e-06, + "loss": 0.4819, + "step": 5679 + }, + { + "epoch": 0.7, + "grad_norm": 2.4045791920997903, + "learning_rate": 2.1153135826403936e-06, + "loss": 0.5078, + "step": 5680 + }, + { + "epoch": 0.71, + "grad_norm": 1.4976793521690002, + "learning_rate": 2.1136723004878356e-06, + "loss": 0.4956, + "step": 5681 + }, + { + "epoch": 0.71, + "grad_norm": 1.7046915634044535, + "learning_rate": 2.1120314846469364e-06, + "loss": 0.5065, + "step": 5682 + }, + { + "epoch": 0.71, + "grad_norm": 1.4735756528319723, + "learning_rate": 2.1103911353827855e-06, + "loss": 0.5042, + "step": 5683 + }, + { + "epoch": 0.71, + "grad_norm": 1.6003853911474244, + "learning_rate": 2.1087512529603984e-06, + "loss": 0.546, + "step": 5684 + }, + { + "epoch": 0.71, + "grad_norm": 1.6980308517680083, + "learning_rate": 2.1071118376447074e-06, + "loss": 0.5027, + "step": 5685 + }, + { + "epoch": 0.71, + "grad_norm": 1.6285847762124912, + "learning_rate": 2.10547288970058e-06, + "loss": 0.504, + "step": 5686 + }, + { + "epoch": 0.71, + "grad_norm": 1.7985510618826166, + "learning_rate": 2.1038344093927983e-06, + "loss": 0.5049, + "step": 5687 + }, + { + "epoch": 0.71, + "grad_norm": 1.4584743770881174, + "learning_rate": 2.102196396986076e-06, + "loss": 0.4688, + "step": 5688 + }, + { + "epoch": 0.71, + "grad_norm": 2.062882331433079, + "learning_rate": 2.100558852745046e-06, + "loss": 0.5158, + "step": 5689 + }, + { + "epoch": 0.71, + "grad_norm": 1.3176991940677245, + "learning_rate": 2.098921776934269e-06, + "loss": 0.5022, + "step": 5690 + }, + { + "epoch": 0.71, + "grad_norm": 1.6950672852350026, + "learning_rate": 2.097285169818232e-06, + "loss": 0.4937, + "step": 5691 + }, + { + "epoch": 0.71, + "grad_norm": 1.4397600009557168, + "learning_rate": 2.0956490316613375e-06, + "loss": 0.4694, + "step": 5692 + }, + { + "epoch": 0.71, + "grad_norm": 1.4476077867770372, + "learning_rate": 2.094013362727924e-06, + "loss": 0.514, + "step": 5693 + }, + { + "epoch": 0.71, + "grad_norm": 1.4207692375720573, + "learning_rate": 2.0923781632822434e-06, + "loss": 0.4678, + "step": 5694 + }, + { + "epoch": 0.71, + "grad_norm": 2.2564321877149407, + "learning_rate": 2.09074343358848e-06, + "loss": 0.5346, + "step": 5695 + }, + { + "epoch": 0.71, + "grad_norm": 1.416804884151017, + "learning_rate": 2.0891091739107355e-06, + "loss": 0.494, + "step": 5696 + }, + { + "epoch": 0.71, + "grad_norm": 1.6357332631911095, + "learning_rate": 2.087475384513043e-06, + "loss": 0.515, + "step": 5697 + }, + { + "epoch": 0.71, + "grad_norm": 2.5157776993178462, + "learning_rate": 2.08584206565935e-06, + "loss": 0.4975, + "step": 5698 + }, + { + "epoch": 0.71, + "grad_norm": 1.6552230500936123, + "learning_rate": 2.0842092176135396e-06, + "loss": 0.4694, + "step": 5699 + }, + { + "epoch": 0.71, + "grad_norm": 2.0776540845584193, + "learning_rate": 2.082576840639411e-06, + "loss": 0.4866, + "step": 5700 + }, + { + "epoch": 0.71, + "grad_norm": 1.4391172169598452, + "learning_rate": 2.080944935000686e-06, + "loss": 0.4897, + "step": 5701 + }, + { + "epoch": 0.71, + "grad_norm": 1.508862587297614, + "learning_rate": 2.0793135009610173e-06, + "loss": 0.5091, + "step": 5702 + }, + { + "epoch": 0.71, + "grad_norm": 1.4673804733296882, + "learning_rate": 2.077682538783974e-06, + "loss": 0.4757, + "step": 5703 + }, + { + "epoch": 0.71, + "grad_norm": 1.345254379836162, + "learning_rate": 2.0760520487330554e-06, + "loss": 0.5242, + "step": 5704 + }, + { + "epoch": 0.71, + "grad_norm": 1.5063600999543156, + "learning_rate": 2.074422031071679e-06, + "loss": 0.5018, + "step": 5705 + }, + { + "epoch": 0.71, + "grad_norm": 1.4460384916405886, + "learning_rate": 2.0727924860631886e-06, + "loss": 0.4711, + "step": 5706 + }, + { + "epoch": 0.71, + "grad_norm": 1.1349399254738972, + "learning_rate": 2.071163413970854e-06, + "loss": 0.4204, + "step": 5707 + }, + { + "epoch": 0.71, + "grad_norm": 2.421480116647311, + "learning_rate": 2.0695348150578655e-06, + "loss": 0.4944, + "step": 5708 + }, + { + "epoch": 0.71, + "grad_norm": 3.32601532841179, + "learning_rate": 2.0679066895873358e-06, + "loss": 0.4766, + "step": 5709 + }, + { + "epoch": 0.71, + "grad_norm": 1.5325685134493685, + "learning_rate": 2.066279037822305e-06, + "loss": 0.5301, + "step": 5710 + }, + { + "epoch": 0.71, + "grad_norm": 1.3833534069309887, + "learning_rate": 2.0646518600257343e-06, + "loss": 0.495, + "step": 5711 + }, + { + "epoch": 0.71, + "grad_norm": 1.4562439694652227, + "learning_rate": 2.0630251564605053e-06, + "loss": 0.4554, + "step": 5712 + }, + { + "epoch": 0.71, + "grad_norm": 1.8753303528850167, + "learning_rate": 2.0613989273894313e-06, + "loss": 0.5322, + "step": 5713 + }, + { + "epoch": 0.71, + "grad_norm": 1.4374425915890892, + "learning_rate": 2.059773173075239e-06, + "loss": 0.4604, + "step": 5714 + }, + { + "epoch": 0.71, + "grad_norm": 1.435434545196034, + "learning_rate": 2.0581478937805864e-06, + "loss": 0.5105, + "step": 5715 + }, + { + "epoch": 0.71, + "grad_norm": 2.781414285188784, + "learning_rate": 2.056523089768051e-06, + "loss": 0.5118, + "step": 5716 + }, + { + "epoch": 0.71, + "grad_norm": 1.4164073183621666, + "learning_rate": 2.054898761300136e-06, + "loss": 0.4928, + "step": 5717 + }, + { + "epoch": 0.71, + "grad_norm": 1.8871129395490855, + "learning_rate": 2.0532749086392625e-06, + "loss": 0.5301, + "step": 5718 + }, + { + "epoch": 0.71, + "grad_norm": 1.7594277639683007, + "learning_rate": 2.0516515320477813e-06, + "loss": 0.4697, + "step": 5719 + }, + { + "epoch": 0.71, + "grad_norm": 26.873164838615025, + "learning_rate": 2.0500286317879626e-06, + "loss": 0.4826, + "step": 5720 + }, + { + "epoch": 0.71, + "grad_norm": 1.5551879147629808, + "learning_rate": 2.0484062081219967e-06, + "loss": 0.5253, + "step": 5721 + }, + { + "epoch": 0.71, + "grad_norm": 1.5665900419493022, + "learning_rate": 2.046784261312006e-06, + "loss": 0.5023, + "step": 5722 + }, + { + "epoch": 0.71, + "grad_norm": 1.5347720498202977, + "learning_rate": 2.0451627916200236e-06, + "loss": 0.5354, + "step": 5723 + }, + { + "epoch": 0.71, + "grad_norm": 3.4266227402970557, + "learning_rate": 2.0435417993080194e-06, + "loss": 0.5164, + "step": 5724 + }, + { + "epoch": 0.71, + "grad_norm": 1.2459870240656372, + "learning_rate": 2.041921284637874e-06, + "loss": 0.5072, + "step": 5725 + }, + { + "epoch": 0.71, + "grad_norm": 1.4351773132797392, + "learning_rate": 2.040301247871399e-06, + "loss": 0.4542, + "step": 5726 + }, + { + "epoch": 0.71, + "grad_norm": 1.5153000504442997, + "learning_rate": 2.0386816892703225e-06, + "loss": 0.4969, + "step": 5727 + }, + { + "epoch": 0.71, + "grad_norm": 1.994793007552151, + "learning_rate": 2.037062609096302e-06, + "loss": 0.4812, + "step": 5728 + }, + { + "epoch": 0.71, + "grad_norm": 1.3214414084860961, + "learning_rate": 2.035444007610912e-06, + "loss": 0.4687, + "step": 5729 + }, + { + "epoch": 0.71, + "grad_norm": 1.6187642823487383, + "learning_rate": 2.0338258850756507e-06, + "loss": 0.4927, + "step": 5730 + }, + { + "epoch": 0.71, + "grad_norm": 3.4220341032570714, + "learning_rate": 2.032208241751941e-06, + "loss": 0.4744, + "step": 5731 + }, + { + "epoch": 0.71, + "grad_norm": 1.3246981718775324, + "learning_rate": 2.0305910779011277e-06, + "loss": 0.5096, + "step": 5732 + }, + { + "epoch": 0.71, + "grad_norm": 1.2892581949710045, + "learning_rate": 2.02897439378448e-06, + "loss": 0.5107, + "step": 5733 + }, + { + "epoch": 0.71, + "grad_norm": 1.5615295073884667, + "learning_rate": 2.0273581896631837e-06, + "loss": 0.4812, + "step": 5734 + }, + { + "epoch": 0.71, + "grad_norm": 1.4455183398778615, + "learning_rate": 2.025742465798355e-06, + "loss": 0.502, + "step": 5735 + }, + { + "epoch": 0.71, + "grad_norm": 2.557923520928341, + "learning_rate": 2.0241272224510235e-06, + "loss": 0.5193, + "step": 5736 + }, + { + "epoch": 0.71, + "grad_norm": 1.3621564814195775, + "learning_rate": 2.0225124598821498e-06, + "loss": 0.4869, + "step": 5737 + }, + { + "epoch": 0.71, + "grad_norm": 2.45337552752045, + "learning_rate": 2.02089817835261e-06, + "loss": 0.5248, + "step": 5738 + }, + { + "epoch": 0.71, + "grad_norm": 2.233319918986319, + "learning_rate": 2.019284378123207e-06, + "loss": 0.4867, + "step": 5739 + }, + { + "epoch": 0.71, + "grad_norm": 1.3556952200964876, + "learning_rate": 2.017671059454667e-06, + "loss": 0.471, + "step": 5740 + }, + { + "epoch": 0.71, + "grad_norm": 1.4902769625106627, + "learning_rate": 2.0160582226076304e-06, + "loss": 0.4932, + "step": 5741 + }, + { + "epoch": 0.71, + "grad_norm": 1.7142701712089856, + "learning_rate": 2.0144458678426705e-06, + "loss": 0.4717, + "step": 5742 + }, + { + "epoch": 0.71, + "grad_norm": 1.3939041873730047, + "learning_rate": 2.0128339954202734e-06, + "loss": 0.5065, + "step": 5743 + }, + { + "epoch": 0.71, + "grad_norm": 2.8410248810440564, + "learning_rate": 2.0112226056008547e-06, + "loss": 0.4957, + "step": 5744 + }, + { + "epoch": 0.71, + "grad_norm": 1.6987436213955667, + "learning_rate": 2.009611698644745e-06, + "loss": 0.4998, + "step": 5745 + }, + { + "epoch": 0.71, + "grad_norm": 1.3997597838397018, + "learning_rate": 2.008001274812204e-06, + "loss": 0.4884, + "step": 5746 + }, + { + "epoch": 0.71, + "grad_norm": 1.8261750841737288, + "learning_rate": 2.006391334363407e-06, + "loss": 0.5535, + "step": 5747 + }, + { + "epoch": 0.71, + "grad_norm": 1.6321622352936611, + "learning_rate": 2.004781877558455e-06, + "loss": 0.4615, + "step": 5748 + }, + { + "epoch": 0.71, + "grad_norm": 2.914268686757844, + "learning_rate": 2.003172904657372e-06, + "loss": 0.4293, + "step": 5749 + }, + { + "epoch": 0.71, + "grad_norm": 1.3570114851918123, + "learning_rate": 2.0015644159200974e-06, + "loss": 0.5056, + "step": 5750 + }, + { + "epoch": 0.71, + "grad_norm": 2.0706697522578237, + "learning_rate": 1.9999564116065017e-06, + "loss": 0.4964, + "step": 5751 + }, + { + "epoch": 0.71, + "grad_norm": 2.0556824675288645, + "learning_rate": 1.998348891976368e-06, + "loss": 0.4605, + "step": 5752 + }, + { + "epoch": 0.71, + "grad_norm": 1.9574956421434813, + "learning_rate": 1.9967418572894087e-06, + "loss": 0.4795, + "step": 5753 + }, + { + "epoch": 0.71, + "grad_norm": 0.6449227143739888, + "learning_rate": 1.995135307805251e-06, + "loss": 0.5001, + "step": 5754 + }, + { + "epoch": 0.71, + "grad_norm": 1.46664255915646, + "learning_rate": 1.9935292437834508e-06, + "loss": 0.5121, + "step": 5755 + }, + { + "epoch": 0.71, + "grad_norm": 1.3813895466955008, + "learning_rate": 1.9919236654834776e-06, + "loss": 0.5109, + "step": 5756 + }, + { + "epoch": 0.71, + "grad_norm": 2.0464129272443827, + "learning_rate": 1.9903185731647294e-06, + "loss": 0.4965, + "step": 5757 + }, + { + "epoch": 0.71, + "grad_norm": 1.575954407433671, + "learning_rate": 1.988713967086524e-06, + "loss": 0.5024, + "step": 5758 + }, + { + "epoch": 0.71, + "grad_norm": 1.5791710315234764, + "learning_rate": 1.9871098475080968e-06, + "loss": 0.5293, + "step": 5759 + }, + { + "epoch": 0.71, + "grad_norm": 1.5824864416558846, + "learning_rate": 1.9855062146886104e-06, + "loss": 0.526, + "step": 5760 + }, + { + "epoch": 0.71, + "grad_norm": 1.7775269855188194, + "learning_rate": 1.9839030688871432e-06, + "loss": 0.5091, + "step": 5761 + }, + { + "epoch": 0.72, + "grad_norm": 1.5349204131009164, + "learning_rate": 1.9823004103626996e-06, + "loss": 0.478, + "step": 5762 + }, + { + "epoch": 0.72, + "grad_norm": 1.3719854482227383, + "learning_rate": 1.980698239374201e-06, + "loss": 0.4685, + "step": 5763 + }, + { + "epoch": 0.72, + "grad_norm": 1.6852175119165915, + "learning_rate": 1.979096556180493e-06, + "loss": 0.4881, + "step": 5764 + }, + { + "epoch": 0.72, + "grad_norm": 1.8625313877636154, + "learning_rate": 1.9774953610403443e-06, + "loss": 0.5216, + "step": 5765 + }, + { + "epoch": 0.72, + "grad_norm": 1.450107885202208, + "learning_rate": 1.975894654212438e-06, + "loss": 0.4513, + "step": 5766 + }, + { + "epoch": 0.72, + "grad_norm": 1.5230450906221689, + "learning_rate": 1.9742944359553855e-06, + "loss": 0.5149, + "step": 5767 + }, + { + "epoch": 0.72, + "grad_norm": 2.873593720362626, + "learning_rate": 1.972694706527714e-06, + "loss": 0.4503, + "step": 5768 + }, + { + "epoch": 0.72, + "grad_norm": 3.623455636704811, + "learning_rate": 1.971095466187876e-06, + "loss": 0.538, + "step": 5769 + }, + { + "epoch": 0.72, + "grad_norm": 1.696139524516132, + "learning_rate": 1.9694967151942403e-06, + "loss": 0.471, + "step": 5770 + }, + { + "epoch": 0.72, + "grad_norm": 1.9568653647405692, + "learning_rate": 1.9678984538051015e-06, + "loss": 0.5261, + "step": 5771 + }, + { + "epoch": 0.72, + "grad_norm": 1.5443916493642844, + "learning_rate": 1.966300682278671e-06, + "loss": 0.4776, + "step": 5772 + }, + { + "epoch": 0.72, + "grad_norm": 1.6470232629154051, + "learning_rate": 1.964703400873083e-06, + "loss": 0.465, + "step": 5773 + }, + { + "epoch": 0.72, + "grad_norm": 1.4860505755950013, + "learning_rate": 1.963106609846395e-06, + "loss": 0.4593, + "step": 5774 + }, + { + "epoch": 0.72, + "grad_norm": 0.6271448336615271, + "learning_rate": 1.9615103094565798e-06, + "loss": 0.5128, + "step": 5775 + }, + { + "epoch": 0.72, + "grad_norm": 1.6066932286908366, + "learning_rate": 1.9599144999615355e-06, + "loss": 0.513, + "step": 5776 + }, + { + "epoch": 0.72, + "grad_norm": 2.1186709048927224, + "learning_rate": 1.9583191816190773e-06, + "loss": 0.5115, + "step": 5777 + }, + { + "epoch": 0.72, + "grad_norm": 1.4370821816965451, + "learning_rate": 1.9567243546869453e-06, + "loss": 0.4406, + "step": 5778 + }, + { + "epoch": 0.72, + "grad_norm": 1.352541970965906, + "learning_rate": 1.955130019422795e-06, + "loss": 0.5274, + "step": 5779 + }, + { + "epoch": 0.72, + "grad_norm": 1.3274741708035742, + "learning_rate": 1.953536176084207e-06, + "loss": 0.4426, + "step": 5780 + }, + { + "epoch": 0.72, + "grad_norm": 1.4631876821754177, + "learning_rate": 1.9519428249286825e-06, + "loss": 0.5083, + "step": 5781 + }, + { + "epoch": 0.72, + "grad_norm": 1.4116956697135918, + "learning_rate": 1.9503499662136378e-06, + "loss": 0.5186, + "step": 5782 + }, + { + "epoch": 0.72, + "grad_norm": 1.684331510368117, + "learning_rate": 1.9487576001964166e-06, + "loss": 0.4865, + "step": 5783 + }, + { + "epoch": 0.72, + "grad_norm": 1.4742398214878512, + "learning_rate": 1.947165727134276e-06, + "loss": 0.5243, + "step": 5784 + }, + { + "epoch": 0.72, + "grad_norm": 0.6588770030376725, + "learning_rate": 1.945574347284401e-06, + "loss": 0.5375, + "step": 5785 + }, + { + "epoch": 0.72, + "grad_norm": 1.4088544461510655, + "learning_rate": 1.9439834609038893e-06, + "loss": 0.4529, + "step": 5786 + }, + { + "epoch": 0.72, + "grad_norm": 1.4820327420256838, + "learning_rate": 1.9423930682497664e-06, + "loss": 0.4909, + "step": 5787 + }, + { + "epoch": 0.72, + "grad_norm": 1.4534489856721242, + "learning_rate": 1.940803169578969e-06, + "loss": 0.5388, + "step": 5788 + }, + { + "epoch": 0.72, + "grad_norm": 1.4831040727804403, + "learning_rate": 1.939213765148366e-06, + "loss": 0.5078, + "step": 5789 + }, + { + "epoch": 0.72, + "grad_norm": 1.3836192912332803, + "learning_rate": 1.937624855214736e-06, + "loss": 0.5005, + "step": 5790 + }, + { + "epoch": 0.72, + "grad_norm": 1.8974608544636409, + "learning_rate": 1.9360364400347803e-06, + "loss": 0.5136, + "step": 5791 + }, + { + "epoch": 0.72, + "grad_norm": 1.59025346888609, + "learning_rate": 1.9344485198651243e-06, + "loss": 0.5284, + "step": 5792 + }, + { + "epoch": 0.72, + "grad_norm": 1.767797328488242, + "learning_rate": 1.9328610949623068e-06, + "loss": 0.5235, + "step": 5793 + }, + { + "epoch": 0.72, + "grad_norm": 1.7296551897243873, + "learning_rate": 1.9312741655827945e-06, + "loss": 0.5297, + "step": 5794 + }, + { + "epoch": 0.72, + "grad_norm": 1.8958100276090986, + "learning_rate": 1.9296877319829656e-06, + "loss": 0.4773, + "step": 5795 + }, + { + "epoch": 0.72, + "grad_norm": 1.8361969548943964, + "learning_rate": 1.928101794419126e-06, + "loss": 0.4559, + "step": 5796 + }, + { + "epoch": 0.72, + "grad_norm": 3.215608924820692, + "learning_rate": 1.9265163531474935e-06, + "loss": 0.5468, + "step": 5797 + }, + { + "epoch": 0.72, + "grad_norm": 1.1295612581503611, + "learning_rate": 1.924931408424216e-06, + "loss": 0.4339, + "step": 5798 + }, + { + "epoch": 0.72, + "grad_norm": 1.4019818248835516, + "learning_rate": 1.923346960505353e-06, + "loss": 0.5145, + "step": 5799 + }, + { + "epoch": 0.72, + "grad_norm": 1.6831507214857904, + "learning_rate": 1.9217630096468824e-06, + "loss": 0.5159, + "step": 5800 + }, + { + "epoch": 0.72, + "grad_norm": 1.4923805376132564, + "learning_rate": 1.920179556104711e-06, + "loss": 0.5629, + "step": 5801 + }, + { + "epoch": 0.72, + "grad_norm": 1.2287299430523901, + "learning_rate": 1.9185966001346546e-06, + "loss": 0.4693, + "step": 5802 + }, + { + "epoch": 0.72, + "grad_norm": 0.6705904758601706, + "learning_rate": 1.9170141419924583e-06, + "loss": 0.4705, + "step": 5803 + }, + { + "epoch": 0.72, + "grad_norm": 1.447868152017363, + "learning_rate": 1.915432181933778e-06, + "loss": 0.4798, + "step": 5804 + }, + { + "epoch": 0.72, + "grad_norm": 0.7060735277920716, + "learning_rate": 1.9138507202141947e-06, + "loss": 0.4894, + "step": 5805 + }, + { + "epoch": 0.72, + "grad_norm": 1.4861249493666067, + "learning_rate": 1.912269757089208e-06, + "loss": 0.5246, + "step": 5806 + }, + { + "epoch": 0.72, + "grad_norm": 1.505205873541527, + "learning_rate": 1.9106892928142383e-06, + "loss": 0.4855, + "step": 5807 + }, + { + "epoch": 0.72, + "grad_norm": 2.0736622759742143, + "learning_rate": 1.9091093276446197e-06, + "loss": 0.4664, + "step": 5808 + }, + { + "epoch": 0.72, + "grad_norm": 1.5728866068387426, + "learning_rate": 1.9075298618356134e-06, + "loss": 0.4832, + "step": 5809 + }, + { + "epoch": 0.72, + "grad_norm": 1.567346546602158, + "learning_rate": 1.905950895642394e-06, + "loss": 0.523, + "step": 5810 + }, + { + "epoch": 0.72, + "grad_norm": 2.4303099435053412, + "learning_rate": 1.9043724293200556e-06, + "loss": 0.5129, + "step": 5811 + }, + { + "epoch": 0.72, + "grad_norm": 2.0313109936019274, + "learning_rate": 1.9027944631236161e-06, + "loss": 0.533, + "step": 5812 + }, + { + "epoch": 0.72, + "grad_norm": 1.6289956972165225, + "learning_rate": 1.9012169973080064e-06, + "loss": 0.4587, + "step": 5813 + }, + { + "epoch": 0.72, + "grad_norm": 1.4720872598727337, + "learning_rate": 1.899640032128085e-06, + "loss": 0.4889, + "step": 5814 + }, + { + "epoch": 0.72, + "grad_norm": 1.3338535876848339, + "learning_rate": 1.8980635678386206e-06, + "loss": 0.4974, + "step": 5815 + }, + { + "epoch": 0.72, + "grad_norm": 1.4441314017374718, + "learning_rate": 1.8964876046943081e-06, + "loss": 0.484, + "step": 5816 + }, + { + "epoch": 0.72, + "grad_norm": 1.5841626988615176, + "learning_rate": 1.8949121429497546e-06, + "loss": 0.5346, + "step": 5817 + }, + { + "epoch": 0.72, + "grad_norm": 1.51894023016548, + "learning_rate": 1.8933371828594932e-06, + "loss": 0.4579, + "step": 5818 + }, + { + "epoch": 0.72, + "grad_norm": 1.3708186159993732, + "learning_rate": 1.8917627246779708e-06, + "loss": 0.4752, + "step": 5819 + }, + { + "epoch": 0.72, + "grad_norm": 1.351325739538545, + "learning_rate": 1.890188768659554e-06, + "loss": 0.5004, + "step": 5820 + }, + { + "epoch": 0.72, + "grad_norm": 1.4460543738772533, + "learning_rate": 1.8886153150585295e-06, + "loss": 0.5045, + "step": 5821 + }, + { + "epoch": 0.72, + "grad_norm": 1.4696479913824958, + "learning_rate": 1.8870423641291042e-06, + "loss": 0.4601, + "step": 5822 + }, + { + "epoch": 0.72, + "grad_norm": 1.5593078416590118, + "learning_rate": 1.8854699161254031e-06, + "loss": 0.4386, + "step": 5823 + }, + { + "epoch": 0.72, + "grad_norm": 1.6144109272623572, + "learning_rate": 1.8838979713014654e-06, + "loss": 0.4931, + "step": 5824 + }, + { + "epoch": 0.72, + "grad_norm": 7.021351446475987, + "learning_rate": 1.8823265299112564e-06, + "loss": 0.4625, + "step": 5825 + }, + { + "epoch": 0.72, + "grad_norm": 1.5024912284123797, + "learning_rate": 1.880755592208653e-06, + "loss": 0.4997, + "step": 5826 + }, + { + "epoch": 0.72, + "grad_norm": 1.7213862970566716, + "learning_rate": 1.879185158447457e-06, + "loss": 0.4717, + "step": 5827 + }, + { + "epoch": 0.72, + "grad_norm": 1.6218774973615702, + "learning_rate": 1.8776152288813842e-06, + "loss": 0.474, + "step": 5828 + }, + { + "epoch": 0.72, + "grad_norm": 1.9492154048572368, + "learning_rate": 1.8760458037640677e-06, + "loss": 0.4841, + "step": 5829 + }, + { + "epoch": 0.72, + "grad_norm": 2.374140362550844, + "learning_rate": 1.874476883349068e-06, + "loss": 0.4599, + "step": 5830 + }, + { + "epoch": 0.72, + "grad_norm": 1.8033689611937722, + "learning_rate": 1.8729084678898534e-06, + "loss": 0.4844, + "step": 5831 + }, + { + "epoch": 0.72, + "grad_norm": 1.441908053408819, + "learning_rate": 1.8713405576398187e-06, + "loss": 0.4918, + "step": 5832 + }, + { + "epoch": 0.72, + "grad_norm": 1.6312171403721705, + "learning_rate": 1.8697731528522694e-06, + "loss": 0.5077, + "step": 5833 + }, + { + "epoch": 0.72, + "grad_norm": 6.715085014776812, + "learning_rate": 1.868206253780438e-06, + "loss": 0.4836, + "step": 5834 + }, + { + "epoch": 0.72, + "grad_norm": 1.3763809397355025, + "learning_rate": 1.8666398606774667e-06, + "loss": 0.4653, + "step": 5835 + }, + { + "epoch": 0.72, + "grad_norm": 1.4817881047709291, + "learning_rate": 1.865073973796424e-06, + "loss": 0.5401, + "step": 5836 + }, + { + "epoch": 0.72, + "grad_norm": 1.8522709107388002, + "learning_rate": 1.8635085933902907e-06, + "loss": 0.5189, + "step": 5837 + }, + { + "epoch": 0.72, + "grad_norm": 0.6687465006836861, + "learning_rate": 1.8619437197119644e-06, + "loss": 0.4893, + "step": 5838 + }, + { + "epoch": 0.72, + "grad_norm": 1.8708791015591064, + "learning_rate": 1.860379353014271e-06, + "loss": 0.5368, + "step": 5839 + }, + { + "epoch": 0.72, + "grad_norm": 1.7487248472707533, + "learning_rate": 1.858815493549943e-06, + "loss": 0.5026, + "step": 5840 + }, + { + "epoch": 0.72, + "grad_norm": 17.919683857302203, + "learning_rate": 1.8572521415716387e-06, + "loss": 0.4393, + "step": 5841 + }, + { + "epoch": 0.72, + "grad_norm": 1.462150126869578, + "learning_rate": 1.8556892973319284e-06, + "loss": 0.4983, + "step": 5842 + }, + { + "epoch": 0.73, + "grad_norm": 2.334792323329871, + "learning_rate": 1.8541269610833061e-06, + "loss": 0.5266, + "step": 5843 + }, + { + "epoch": 0.73, + "grad_norm": 2.2295138381557114, + "learning_rate": 1.852565133078178e-06, + "loss": 0.5002, + "step": 5844 + }, + { + "epoch": 0.73, + "grad_norm": 1.3606810914725282, + "learning_rate": 1.851003813568874e-06, + "loss": 0.5448, + "step": 5845 + }, + { + "epoch": 0.73, + "grad_norm": 1.3410900367471983, + "learning_rate": 1.8494430028076372e-06, + "loss": 0.4918, + "step": 5846 + }, + { + "epoch": 0.73, + "grad_norm": 1.353687681132741, + "learning_rate": 1.84788270104663e-06, + "loss": 0.4591, + "step": 5847 + }, + { + "epoch": 0.73, + "grad_norm": 2.2867728993341863, + "learning_rate": 1.846322908537936e-06, + "loss": 0.4626, + "step": 5848 + }, + { + "epoch": 0.73, + "grad_norm": 1.5021326759171463, + "learning_rate": 1.8447636255335488e-06, + "loss": 0.5241, + "step": 5849 + }, + { + "epoch": 0.73, + "grad_norm": 1.5503247553351363, + "learning_rate": 1.8432048522853891e-06, + "loss": 0.5121, + "step": 5850 + }, + { + "epoch": 0.73, + "grad_norm": 1.851781475388872, + "learning_rate": 1.8416465890452862e-06, + "loss": 0.4923, + "step": 5851 + }, + { + "epoch": 0.73, + "grad_norm": 1.7091706381910228, + "learning_rate": 1.8400888360649949e-06, + "loss": 0.477, + "step": 5852 + }, + { + "epoch": 0.73, + "grad_norm": 1.4134026379502893, + "learning_rate": 1.8385315935961805e-06, + "loss": 0.5358, + "step": 5853 + }, + { + "epoch": 0.73, + "grad_norm": 2.0054461199669316, + "learning_rate": 1.836974861890431e-06, + "loss": 0.5161, + "step": 5854 + }, + { + "epoch": 0.73, + "grad_norm": 1.3766422435819043, + "learning_rate": 1.8354186411992514e-06, + "loss": 0.4829, + "step": 5855 + }, + { + "epoch": 0.73, + "grad_norm": 1.3201172847650677, + "learning_rate": 1.8338629317740598e-06, + "loss": 0.4482, + "step": 5856 + }, + { + "epoch": 0.73, + "grad_norm": 2.0163699674700495, + "learning_rate": 1.8323077338661981e-06, + "loss": 0.5215, + "step": 5857 + }, + { + "epoch": 0.73, + "grad_norm": 1.6190726538042708, + "learning_rate": 1.8307530477269192e-06, + "loss": 0.4652, + "step": 5858 + }, + { + "epoch": 0.73, + "grad_norm": 1.4391573430666145, + "learning_rate": 1.829198873607399e-06, + "loss": 0.5407, + "step": 5859 + }, + { + "epoch": 0.73, + "grad_norm": 2.5196588134185403, + "learning_rate": 1.8276452117587252e-06, + "loss": 0.5364, + "step": 5860 + }, + { + "epoch": 0.73, + "grad_norm": 1.4095757168875038, + "learning_rate": 1.8260920624319084e-06, + "loss": 0.5028, + "step": 5861 + }, + { + "epoch": 0.73, + "grad_norm": 1.915322471299682, + "learning_rate": 1.824539425877871e-06, + "loss": 0.4948, + "step": 5862 + }, + { + "epoch": 0.73, + "grad_norm": 1.3294204484252052, + "learning_rate": 1.822987302347456e-06, + "loss": 0.5076, + "step": 5863 + }, + { + "epoch": 0.73, + "grad_norm": 1.472479235952562, + "learning_rate": 1.8214356920914244e-06, + "loss": 0.4979, + "step": 5864 + }, + { + "epoch": 0.73, + "grad_norm": 1.3967686022768886, + "learning_rate": 1.8198845953604494e-06, + "loss": 0.4981, + "step": 5865 + }, + { + "epoch": 0.73, + "grad_norm": 2.250885405548494, + "learning_rate": 1.818334012405128e-06, + "loss": 0.4818, + "step": 5866 + }, + { + "epoch": 0.73, + "grad_norm": 1.5312106093910633, + "learning_rate": 1.8167839434759665e-06, + "loss": 0.4943, + "step": 5867 + }, + { + "epoch": 0.73, + "grad_norm": 1.5033972291446327, + "learning_rate": 1.8152343888233965e-06, + "loss": 0.5091, + "step": 5868 + }, + { + "epoch": 0.73, + "grad_norm": 1.4241722148533027, + "learning_rate": 1.8136853486977575e-06, + "loss": 0.4618, + "step": 5869 + }, + { + "epoch": 0.73, + "grad_norm": 1.837726102143834, + "learning_rate": 1.8121368233493154e-06, + "loss": 0.4202, + "step": 5870 + }, + { + "epoch": 0.73, + "grad_norm": 1.3763583291820833, + "learning_rate": 1.8105888130282433e-06, + "loss": 0.4719, + "step": 5871 + }, + { + "epoch": 0.73, + "grad_norm": 1.3135692863116784, + "learning_rate": 1.8090413179846383e-06, + "loss": 0.4808, + "step": 5872 + }, + { + "epoch": 0.73, + "grad_norm": 0.6601278979307443, + "learning_rate": 1.8074943384685139e-06, + "loss": 0.4719, + "step": 5873 + }, + { + "epoch": 0.73, + "grad_norm": 1.533500102581177, + "learning_rate": 1.8059478747297942e-06, + "loss": 0.5315, + "step": 5874 + }, + { + "epoch": 0.73, + "grad_norm": 1.9325656771854214, + "learning_rate": 1.8044019270183278e-06, + "loss": 0.4732, + "step": 5875 + }, + { + "epoch": 0.73, + "grad_norm": 1.6384871820813307, + "learning_rate": 1.802856495583873e-06, + "loss": 0.4881, + "step": 5876 + }, + { + "epoch": 0.73, + "grad_norm": 1.3817610947695655, + "learning_rate": 1.8013115806761105e-06, + "loss": 0.5039, + "step": 5877 + }, + { + "epoch": 0.73, + "grad_norm": 1.9646188909105635, + "learning_rate": 1.7997671825446323e-06, + "loss": 0.5361, + "step": 5878 + }, + { + "epoch": 0.73, + "grad_norm": 1.658577168296523, + "learning_rate": 1.798223301438951e-06, + "loss": 0.5399, + "step": 5879 + }, + { + "epoch": 0.73, + "grad_norm": 0.6561815290671748, + "learning_rate": 1.7966799376084954e-06, + "loss": 0.5165, + "step": 5880 + }, + { + "epoch": 0.73, + "grad_norm": 1.698367245602605, + "learning_rate": 1.7951370913026067e-06, + "loss": 0.5214, + "step": 5881 + }, + { + "epoch": 0.73, + "grad_norm": 1.4701773401872964, + "learning_rate": 1.7935947627705485e-06, + "loss": 0.4431, + "step": 5882 + }, + { + "epoch": 0.73, + "grad_norm": 1.4422387143040183, + "learning_rate": 1.7920529522614943e-06, + "loss": 0.5262, + "step": 5883 + }, + { + "epoch": 0.73, + "grad_norm": 1.3581629909265374, + "learning_rate": 1.7905116600245404e-06, + "loss": 0.5287, + "step": 5884 + }, + { + "epoch": 0.73, + "grad_norm": 1.5816641254198855, + "learning_rate": 1.788970886308693e-06, + "loss": 0.5171, + "step": 5885 + }, + { + "epoch": 0.73, + "grad_norm": 1.6127891341786535, + "learning_rate": 1.7874306313628802e-06, + "loss": 0.4907, + "step": 5886 + }, + { + "epoch": 0.73, + "grad_norm": 1.3761269959792422, + "learning_rate": 1.78589089543594e-06, + "loss": 0.4984, + "step": 5887 + }, + { + "epoch": 0.73, + "grad_norm": 1.6734571305492423, + "learning_rate": 1.7843516787766357e-06, + "loss": 0.5091, + "step": 5888 + }, + { + "epoch": 0.73, + "grad_norm": 1.4534246916626015, + "learning_rate": 1.7828129816336387e-06, + "loss": 0.5099, + "step": 5889 + }, + { + "epoch": 0.73, + "grad_norm": 1.4742219251314783, + "learning_rate": 1.7812748042555378e-06, + "loss": 0.4914, + "step": 5890 + }, + { + "epoch": 0.73, + "grad_norm": 1.357326060264499, + "learning_rate": 1.7797371468908414e-06, + "loss": 0.461, + "step": 5891 + }, + { + "epoch": 0.73, + "grad_norm": 1.6500132546473334, + "learning_rate": 1.7782000097879692e-06, + "loss": 0.4542, + "step": 5892 + }, + { + "epoch": 0.73, + "grad_norm": 1.4501522402373412, + "learning_rate": 1.776663393195262e-06, + "loss": 0.4509, + "step": 5893 + }, + { + "epoch": 0.73, + "grad_norm": 1.3054125980508295, + "learning_rate": 1.7751272973609707e-06, + "loss": 0.4766, + "step": 5894 + }, + { + "epoch": 0.73, + "grad_norm": 5.982710616249453, + "learning_rate": 1.7735917225332666e-06, + "loss": 0.5887, + "step": 5895 + }, + { + "epoch": 0.73, + "grad_norm": 1.888839005725094, + "learning_rate": 1.7720566689602354e-06, + "loss": 0.4827, + "step": 5896 + }, + { + "epoch": 0.73, + "grad_norm": 1.4611341133548106, + "learning_rate": 1.77052213688988e-06, + "loss": 0.4747, + "step": 5897 + }, + { + "epoch": 0.73, + "grad_norm": 1.894171439291106, + "learning_rate": 1.768988126570116e-06, + "loss": 0.5112, + "step": 5898 + }, + { + "epoch": 0.73, + "grad_norm": 2.2186601395328758, + "learning_rate": 1.767454638248775e-06, + "loss": 0.4769, + "step": 5899 + }, + { + "epoch": 0.73, + "grad_norm": 1.2891341736086237, + "learning_rate": 1.7659216721736082e-06, + "loss": 0.4466, + "step": 5900 + }, + { + "epoch": 0.73, + "grad_norm": 1.4263591816159855, + "learning_rate": 1.764389228592277e-06, + "loss": 0.4862, + "step": 5901 + }, + { + "epoch": 0.73, + "grad_norm": 1.997366258609366, + "learning_rate": 1.7628573077523647e-06, + "loss": 0.4851, + "step": 5902 + }, + { + "epoch": 0.73, + "grad_norm": 0.6842837922339248, + "learning_rate": 1.7613259099013608e-06, + "loss": 0.4807, + "step": 5903 + }, + { + "epoch": 0.73, + "grad_norm": 1.4564574019636156, + "learning_rate": 1.7597950352866833e-06, + "loss": 0.4421, + "step": 5904 + }, + { + "epoch": 0.73, + "grad_norm": 1.783210696583975, + "learning_rate": 1.7582646841556533e-06, + "loss": 0.5436, + "step": 5905 + }, + { + "epoch": 0.73, + "grad_norm": 1.3190447863888473, + "learning_rate": 1.756734856755516e-06, + "loss": 0.4655, + "step": 5906 + }, + { + "epoch": 0.73, + "grad_norm": 1.8206358433489251, + "learning_rate": 1.7552055533334268e-06, + "loss": 0.4928, + "step": 5907 + }, + { + "epoch": 0.73, + "grad_norm": 1.4403153492787868, + "learning_rate": 1.7536767741364558e-06, + "loss": 0.4993, + "step": 5908 + }, + { + "epoch": 0.73, + "grad_norm": 1.3428632704917827, + "learning_rate": 1.752148519411595e-06, + "loss": 0.4975, + "step": 5909 + }, + { + "epoch": 0.73, + "grad_norm": 1.3080469443346712, + "learning_rate": 1.7506207894057442e-06, + "loss": 0.4692, + "step": 5910 + }, + { + "epoch": 0.73, + "grad_norm": 1.2920041190174922, + "learning_rate": 1.7490935843657242e-06, + "loss": 0.5237, + "step": 5911 + }, + { + "epoch": 0.73, + "grad_norm": 2.4014102888616162, + "learning_rate": 1.7475669045382636e-06, + "loss": 0.4767, + "step": 5912 + }, + { + "epoch": 0.73, + "grad_norm": 1.4549220607061533, + "learning_rate": 1.7460407501700178e-06, + "loss": 0.5206, + "step": 5913 + }, + { + "epoch": 0.73, + "grad_norm": 2.503657221600372, + "learning_rate": 1.7445151215075456e-06, + "loss": 0.4472, + "step": 5914 + }, + { + "epoch": 0.73, + "grad_norm": 0.6978670048145843, + "learning_rate": 1.7429900187973287e-06, + "loss": 0.496, + "step": 5915 + }, + { + "epoch": 0.73, + "grad_norm": 1.9133555141324627, + "learning_rate": 1.741465442285758e-06, + "loss": 0.5304, + "step": 5916 + }, + { + "epoch": 0.73, + "grad_norm": 2.3521218724701396, + "learning_rate": 1.7399413922191455e-06, + "loss": 0.4711, + "step": 5917 + }, + { + "epoch": 0.73, + "grad_norm": 1.2834337970618197, + "learning_rate": 1.7384178688437132e-06, + "loss": 0.4776, + "step": 5918 + }, + { + "epoch": 0.73, + "grad_norm": 1.4133634953894676, + "learning_rate": 1.7368948724055974e-06, + "loss": 0.5234, + "step": 5919 + }, + { + "epoch": 0.73, + "grad_norm": 1.7352465408962612, + "learning_rate": 1.735372403150854e-06, + "loss": 0.499, + "step": 5920 + }, + { + "epoch": 0.73, + "grad_norm": 1.5830249891711863, + "learning_rate": 1.7338504613254515e-06, + "loss": 0.5055, + "step": 5921 + }, + { + "epoch": 0.73, + "grad_norm": 1.6650457763157955, + "learning_rate": 1.7323290471752741e-06, + "loss": 0.4851, + "step": 5922 + }, + { + "epoch": 0.74, + "grad_norm": 1.181933619004897, + "learning_rate": 1.7308081609461163e-06, + "loss": 0.4511, + "step": 5923 + }, + { + "epoch": 0.74, + "grad_norm": 1.7000840553200878, + "learning_rate": 1.7292878028836946e-06, + "loss": 0.4752, + "step": 5924 + }, + { + "epoch": 0.74, + "grad_norm": 2.3554970293049347, + "learning_rate": 1.7277679732336328e-06, + "loss": 0.474, + "step": 5925 + }, + { + "epoch": 0.74, + "grad_norm": 1.67754838277534, + "learning_rate": 1.7262486722414752e-06, + "loss": 0.5077, + "step": 5926 + }, + { + "epoch": 0.74, + "grad_norm": 1.3496912569635924, + "learning_rate": 1.7247299001526773e-06, + "loss": 0.4886, + "step": 5927 + }, + { + "epoch": 0.74, + "grad_norm": 5.27870766288199, + "learning_rate": 1.7232116572126067e-06, + "loss": 0.423, + "step": 5928 + }, + { + "epoch": 0.74, + "grad_norm": 1.8641894032739894, + "learning_rate": 1.7216939436665558e-06, + "loss": 0.5036, + "step": 5929 + }, + { + "epoch": 0.74, + "grad_norm": 1.6300684737499331, + "learning_rate": 1.7201767597597197e-06, + "loss": 0.4689, + "step": 5930 + }, + { + "epoch": 0.74, + "grad_norm": 1.2278131681948699, + "learning_rate": 1.7186601057372155e-06, + "loss": 0.4723, + "step": 5931 + }, + { + "epoch": 0.74, + "grad_norm": 2.031259017202795, + "learning_rate": 1.7171439818440688e-06, + "loss": 0.4803, + "step": 5932 + }, + { + "epoch": 0.74, + "grad_norm": 0.6538675916367489, + "learning_rate": 1.7156283883252268e-06, + "loss": 0.4797, + "step": 5933 + }, + { + "epoch": 0.74, + "grad_norm": 1.6663573116080352, + "learning_rate": 1.7141133254255426e-06, + "loss": 0.4742, + "step": 5934 + }, + { + "epoch": 0.74, + "grad_norm": 1.3804574862462933, + "learning_rate": 1.712598793389792e-06, + "loss": 0.4476, + "step": 5935 + }, + { + "epoch": 0.74, + "grad_norm": 0.7024565514569163, + "learning_rate": 1.7110847924626578e-06, + "loss": 0.5029, + "step": 5936 + }, + { + "epoch": 0.74, + "grad_norm": 1.4488627553590303, + "learning_rate": 1.7095713228887411e-06, + "loss": 0.4519, + "step": 5937 + }, + { + "epoch": 0.74, + "grad_norm": 1.5775295336664255, + "learning_rate": 1.7080583849125588e-06, + "loss": 0.511, + "step": 5938 + }, + { + "epoch": 0.74, + "grad_norm": 1.6119925589353812, + "learning_rate": 1.7065459787785355e-06, + "loss": 0.4613, + "step": 5939 + }, + { + "epoch": 0.74, + "grad_norm": 1.6115711077895967, + "learning_rate": 1.705034104731017e-06, + "loss": 0.5089, + "step": 5940 + }, + { + "epoch": 0.74, + "grad_norm": 1.7297349527982928, + "learning_rate": 1.703522763014257e-06, + "loss": 0.5235, + "step": 5941 + }, + { + "epoch": 0.74, + "grad_norm": 1.353036605970803, + "learning_rate": 1.702011953872429e-06, + "loss": 0.462, + "step": 5942 + }, + { + "epoch": 0.74, + "grad_norm": 2.9813637490464453, + "learning_rate": 1.7005016775496135e-06, + "loss": 0.5045, + "step": 5943 + }, + { + "epoch": 0.74, + "grad_norm": 1.5793417303571762, + "learning_rate": 1.6989919342898127e-06, + "loss": 0.4784, + "step": 5944 + }, + { + "epoch": 0.74, + "grad_norm": 1.50614086925251, + "learning_rate": 1.697482724336938e-06, + "loss": 0.4773, + "step": 5945 + }, + { + "epoch": 0.74, + "grad_norm": 1.7080841760436045, + "learning_rate": 1.695974047934814e-06, + "loss": 0.5019, + "step": 5946 + }, + { + "epoch": 0.74, + "grad_norm": 3.0029589857718344, + "learning_rate": 1.694465905327184e-06, + "loss": 0.5011, + "step": 5947 + }, + { + "epoch": 0.74, + "grad_norm": 4.684124017120539, + "learning_rate": 1.6929582967576975e-06, + "loss": 0.5296, + "step": 5948 + }, + { + "epoch": 0.74, + "grad_norm": 2.2969706559048375, + "learning_rate": 1.6914512224699253e-06, + "loss": 0.514, + "step": 5949 + }, + { + "epoch": 0.74, + "grad_norm": 1.5848640384886945, + "learning_rate": 1.6899446827073458e-06, + "loss": 0.5012, + "step": 5950 + }, + { + "epoch": 0.74, + "grad_norm": 1.9123582479570977, + "learning_rate": 1.6884386777133566e-06, + "loss": 0.5222, + "step": 5951 + }, + { + "epoch": 0.74, + "grad_norm": 0.6449177798951945, + "learning_rate": 1.6869332077312634e-06, + "loss": 0.4658, + "step": 5952 + }, + { + "epoch": 0.74, + "grad_norm": 1.4782830397628994, + "learning_rate": 1.6854282730042893e-06, + "loss": 0.4986, + "step": 5953 + }, + { + "epoch": 0.74, + "grad_norm": 1.667017827478795, + "learning_rate": 1.6839238737755715e-06, + "loss": 0.4877, + "step": 5954 + }, + { + "epoch": 0.74, + "grad_norm": 1.203265992788667, + "learning_rate": 1.682420010288155e-06, + "loss": 0.4829, + "step": 5955 + }, + { + "epoch": 0.74, + "grad_norm": 1.5407269389663272, + "learning_rate": 1.680916682785007e-06, + "loss": 0.5296, + "step": 5956 + }, + { + "epoch": 0.74, + "grad_norm": 8.962889161814955, + "learning_rate": 1.6794138915089987e-06, + "loss": 0.4829, + "step": 5957 + }, + { + "epoch": 0.74, + "grad_norm": 1.6374891209770737, + "learning_rate": 1.677911636702923e-06, + "loss": 0.5128, + "step": 5958 + }, + { + "epoch": 0.74, + "grad_norm": 1.3436117385055986, + "learning_rate": 1.676409918609479e-06, + "loss": 0.4986, + "step": 5959 + }, + { + "epoch": 0.74, + "grad_norm": 1.5132495078968142, + "learning_rate": 1.6749087374712858e-06, + "loss": 0.5377, + "step": 5960 + }, + { + "epoch": 0.74, + "grad_norm": 2.465158233052855, + "learning_rate": 1.6734080935308694e-06, + "loss": 0.512, + "step": 5961 + }, + { + "epoch": 0.74, + "grad_norm": 1.704589541225872, + "learning_rate": 1.6719079870306737e-06, + "loss": 0.4977, + "step": 5962 + }, + { + "epoch": 0.74, + "grad_norm": 1.3285426383237866, + "learning_rate": 1.6704084182130552e-06, + "loss": 0.4882, + "step": 5963 + }, + { + "epoch": 0.74, + "grad_norm": 1.767399204910095, + "learning_rate": 1.66890938732028e-06, + "loss": 0.4877, + "step": 5964 + }, + { + "epoch": 0.74, + "grad_norm": 5.677426192641951, + "learning_rate": 1.6674108945945323e-06, + "loss": 0.4759, + "step": 5965 + }, + { + "epoch": 0.74, + "grad_norm": 1.3255144212986685, + "learning_rate": 1.6659129402779034e-06, + "loss": 0.4964, + "step": 5966 + }, + { + "epoch": 0.74, + "grad_norm": 1.4742160373416446, + "learning_rate": 1.664415524612405e-06, + "loss": 0.4928, + "step": 5967 + }, + { + "epoch": 0.74, + "grad_norm": 2.1391153981085873, + "learning_rate": 1.6629186478399538e-06, + "loss": 0.508, + "step": 5968 + }, + { + "epoch": 0.74, + "grad_norm": 1.3940825832138908, + "learning_rate": 1.6614223102023857e-06, + "loss": 0.4659, + "step": 5969 + }, + { + "epoch": 0.74, + "grad_norm": 1.777478443620191, + "learning_rate": 1.6599265119414487e-06, + "loss": 0.4779, + "step": 5970 + }, + { + "epoch": 0.74, + "grad_norm": 1.4064111179425418, + "learning_rate": 1.658431253298799e-06, + "loss": 0.4892, + "step": 5971 + }, + { + "epoch": 0.74, + "grad_norm": 1.3686681256777833, + "learning_rate": 1.6569365345160116e-06, + "loss": 0.5192, + "step": 5972 + }, + { + "epoch": 0.74, + "grad_norm": 1.6184845033964539, + "learning_rate": 1.6554423558345683e-06, + "loss": 0.4287, + "step": 5973 + }, + { + "epoch": 0.74, + "grad_norm": 1.9290552096867895, + "learning_rate": 1.6539487174958706e-06, + "loss": 0.4356, + "step": 5974 + }, + { + "epoch": 0.74, + "grad_norm": 1.3287246157659554, + "learning_rate": 1.652455619741225e-06, + "loss": 0.4179, + "step": 5975 + }, + { + "epoch": 0.74, + "grad_norm": 1.3576869075341544, + "learning_rate": 1.6509630628118584e-06, + "loss": 0.4915, + "step": 5976 + }, + { + "epoch": 0.74, + "grad_norm": 1.7120315215609672, + "learning_rate": 1.6494710469489033e-06, + "loss": 0.5053, + "step": 5977 + }, + { + "epoch": 0.74, + "grad_norm": 1.845271579003526, + "learning_rate": 1.6479795723934088e-06, + "loss": 0.5596, + "step": 5978 + }, + { + "epoch": 0.74, + "grad_norm": 1.5981560815448819, + "learning_rate": 1.646488639386339e-06, + "loss": 0.49, + "step": 5979 + }, + { + "epoch": 0.74, + "grad_norm": 1.3663767878155546, + "learning_rate": 1.6449982481685616e-06, + "loss": 0.4987, + "step": 5980 + }, + { + "epoch": 0.74, + "grad_norm": 1.3410677975477996, + "learning_rate": 1.6435083989808682e-06, + "loss": 0.5274, + "step": 5981 + }, + { + "epoch": 0.74, + "grad_norm": 1.4945763766755613, + "learning_rate": 1.6420190920639522e-06, + "loss": 0.544, + "step": 5982 + }, + { + "epoch": 0.74, + "grad_norm": 2.3177394972577257, + "learning_rate": 1.6405303276584277e-06, + "loss": 0.482, + "step": 5983 + }, + { + "epoch": 0.74, + "grad_norm": 2.7405380423742085, + "learning_rate": 1.6390421060048151e-06, + "loss": 0.4612, + "step": 5984 + }, + { + "epoch": 0.74, + "grad_norm": 1.3509356339041592, + "learning_rate": 1.6375544273435506e-06, + "loss": 0.5186, + "step": 5985 + }, + { + "epoch": 0.74, + "grad_norm": 1.3877589841195326, + "learning_rate": 1.636067291914983e-06, + "loss": 0.4668, + "step": 5986 + }, + { + "epoch": 0.74, + "grad_norm": 1.9029029071359749, + "learning_rate": 1.6345806999593717e-06, + "loss": 0.5034, + "step": 5987 + }, + { + "epoch": 0.74, + "grad_norm": 1.6081825938648133, + "learning_rate": 1.633094651716889e-06, + "loss": 0.4844, + "step": 5988 + }, + { + "epoch": 0.74, + "grad_norm": 1.4427022899923019, + "learning_rate": 1.6316091474276163e-06, + "loss": 0.4864, + "step": 5989 + }, + { + "epoch": 0.74, + "grad_norm": 2.4622955643128948, + "learning_rate": 1.6301241873315544e-06, + "loss": 0.5059, + "step": 5990 + }, + { + "epoch": 0.74, + "grad_norm": 2.2968792017603343, + "learning_rate": 1.628639771668607e-06, + "loss": 0.5585, + "step": 5991 + }, + { + "epoch": 0.74, + "grad_norm": 1.873706603490066, + "learning_rate": 1.627155900678598e-06, + "loss": 0.4722, + "step": 5992 + }, + { + "epoch": 0.74, + "grad_norm": 7.139979943714902, + "learning_rate": 1.625672574601258e-06, + "loss": 0.4448, + "step": 5993 + }, + { + "epoch": 0.74, + "grad_norm": 1.6664026791604412, + "learning_rate": 1.6241897936762313e-06, + "loss": 0.5281, + "step": 5994 + }, + { + "epoch": 0.74, + "grad_norm": 1.3131089978810235, + "learning_rate": 1.6227075581430751e-06, + "loss": 0.5188, + "step": 5995 + }, + { + "epoch": 0.74, + "grad_norm": 1.287306397039178, + "learning_rate": 1.621225868241259e-06, + "loss": 0.4818, + "step": 5996 + }, + { + "epoch": 0.74, + "grad_norm": 1.4776254966103886, + "learning_rate": 1.6197447242101615e-06, + "loss": 0.4644, + "step": 5997 + }, + { + "epoch": 0.74, + "grad_norm": 0.6944930673044044, + "learning_rate": 1.618264126289073e-06, + "loss": 0.5063, + "step": 5998 + }, + { + "epoch": 0.74, + "grad_norm": 1.554341842980303, + "learning_rate": 1.6167840747171997e-06, + "loss": 0.4816, + "step": 5999 + }, + { + "epoch": 0.74, + "grad_norm": 1.4297897583485726, + "learning_rate": 1.6153045697336544e-06, + "loss": 0.4987, + "step": 6000 + }, + { + "epoch": 0.74, + "grad_norm": 2.810195614493212, + "learning_rate": 1.6138256115774676e-06, + "loss": 0.5221, + "step": 6001 + }, + { + "epoch": 0.74, + "grad_norm": 2.166741697516714, + "learning_rate": 1.6123472004875724e-06, + "loss": 0.4781, + "step": 6002 + }, + { + "epoch": 0.74, + "grad_norm": 1.3194032375621023, + "learning_rate": 1.610869336702826e-06, + "loss": 0.459, + "step": 6003 + }, + { + "epoch": 0.75, + "grad_norm": 1.3722221268424892, + "learning_rate": 1.6093920204619856e-06, + "loss": 0.4974, + "step": 6004 + }, + { + "epoch": 0.75, + "grad_norm": 8.340858799507641, + "learning_rate": 1.6079152520037277e-06, + "loss": 0.5013, + "step": 6005 + }, + { + "epoch": 0.75, + "grad_norm": 2.1658509794232037, + "learning_rate": 1.6064390315666356e-06, + "loss": 0.4842, + "step": 6006 + }, + { + "epoch": 0.75, + "grad_norm": 4.843094029528777, + "learning_rate": 1.6049633593892045e-06, + "loss": 0.4917, + "step": 6007 + }, + { + "epoch": 0.75, + "grad_norm": 1.6668625189375703, + "learning_rate": 1.6034882357098447e-06, + "loss": 0.5162, + "step": 6008 + }, + { + "epoch": 0.75, + "grad_norm": 0.6536351282188947, + "learning_rate": 1.6020136607668724e-06, + "loss": 0.4686, + "step": 6009 + }, + { + "epoch": 0.75, + "grad_norm": 1.7594181925466228, + "learning_rate": 1.6005396347985204e-06, + "loss": 0.5251, + "step": 6010 + }, + { + "epoch": 0.75, + "grad_norm": 1.3645245329180153, + "learning_rate": 1.59906615804293e-06, + "loss": 0.4973, + "step": 6011 + }, + { + "epoch": 0.75, + "grad_norm": 13.455150978415842, + "learning_rate": 1.5975932307381564e-06, + "loss": 0.5131, + "step": 6012 + }, + { + "epoch": 0.75, + "grad_norm": 1.7661889663932724, + "learning_rate": 1.59612085312216e-06, + "loss": 0.5674, + "step": 6013 + }, + { + "epoch": 0.75, + "grad_norm": 1.5823722119837638, + "learning_rate": 1.5946490254328207e-06, + "loss": 0.517, + "step": 6014 + }, + { + "epoch": 0.75, + "grad_norm": 1.2825435915322208, + "learning_rate": 1.5931777479079224e-06, + "loss": 0.4547, + "step": 6015 + }, + { + "epoch": 0.75, + "grad_norm": 1.4780593108877558, + "learning_rate": 1.591707020785162e-06, + "loss": 0.4743, + "step": 6016 + }, + { + "epoch": 0.75, + "grad_norm": 1.5317792626925832, + "learning_rate": 1.5902368443021515e-06, + "loss": 0.4931, + "step": 6017 + }, + { + "epoch": 0.75, + "grad_norm": 1.5205937863915808, + "learning_rate": 1.5887672186964066e-06, + "loss": 0.5418, + "step": 6018 + }, + { + "epoch": 0.75, + "grad_norm": 1.5859984688186561, + "learning_rate": 1.5872981442053643e-06, + "loss": 0.4967, + "step": 6019 + }, + { + "epoch": 0.75, + "grad_norm": 2.2044996149466667, + "learning_rate": 1.585829621066361e-06, + "loss": 0.5056, + "step": 6020 + }, + { + "epoch": 0.75, + "grad_norm": 2.523560594149025, + "learning_rate": 1.5843616495166536e-06, + "loss": 0.4778, + "step": 6021 + }, + { + "epoch": 0.75, + "grad_norm": 1.3660556794601857, + "learning_rate": 1.5828942297934018e-06, + "loss": 0.4743, + "step": 6022 + }, + { + "epoch": 0.75, + "grad_norm": 1.5846245967103545, + "learning_rate": 1.581427362133685e-06, + "loss": 0.4882, + "step": 6023 + }, + { + "epoch": 0.75, + "grad_norm": 1.292360003145945, + "learning_rate": 1.5799610467744836e-06, + "loss": 0.4528, + "step": 6024 + }, + { + "epoch": 0.75, + "grad_norm": 2.007002426929482, + "learning_rate": 1.578495283952698e-06, + "loss": 0.4652, + "step": 6025 + }, + { + "epoch": 0.75, + "grad_norm": 1.7468838114684746, + "learning_rate": 1.577030073905133e-06, + "loss": 0.4918, + "step": 6026 + }, + { + "epoch": 0.75, + "grad_norm": 0.7271485836999365, + "learning_rate": 1.5755654168685037e-06, + "loss": 0.4931, + "step": 6027 + }, + { + "epoch": 0.75, + "grad_norm": 1.3111986543695473, + "learning_rate": 1.5741013130794435e-06, + "loss": 0.4633, + "step": 6028 + }, + { + "epoch": 0.75, + "grad_norm": 0.6461414512441257, + "learning_rate": 1.5726377627744877e-06, + "loss": 0.4677, + "step": 6029 + }, + { + "epoch": 0.75, + "grad_norm": 2.3608209292052993, + "learning_rate": 1.5711747661900884e-06, + "loss": 0.5287, + "step": 6030 + }, + { + "epoch": 0.75, + "grad_norm": 1.47618381807499, + "learning_rate": 1.5697123235626021e-06, + "loss": 0.51, + "step": 6031 + }, + { + "epoch": 0.75, + "grad_norm": 1.524575288440333, + "learning_rate": 1.5682504351283034e-06, + "loss": 0.5236, + "step": 6032 + }, + { + "epoch": 0.75, + "grad_norm": 1.9620674437293155, + "learning_rate": 1.5667891011233688e-06, + "loss": 0.5136, + "step": 6033 + }, + { + "epoch": 0.75, + "grad_norm": 1.429051606799475, + "learning_rate": 1.5653283217838938e-06, + "loss": 0.48, + "step": 6034 + }, + { + "epoch": 0.75, + "grad_norm": 1.9913598932383123, + "learning_rate": 1.563868097345877e-06, + "loss": 0.4949, + "step": 6035 + }, + { + "epoch": 0.75, + "grad_norm": 1.724628914180807, + "learning_rate": 1.5624084280452313e-06, + "loss": 0.5289, + "step": 6036 + }, + { + "epoch": 0.75, + "grad_norm": 1.3544155289688808, + "learning_rate": 1.5609493141177817e-06, + "loss": 0.4698, + "step": 6037 + }, + { + "epoch": 0.75, + "grad_norm": 1.4219588068053217, + "learning_rate": 1.5594907557992568e-06, + "loss": 0.5146, + "step": 6038 + }, + { + "epoch": 0.75, + "grad_norm": 1.409410630606562, + "learning_rate": 1.5580327533253037e-06, + "loss": 0.487, + "step": 6039 + }, + { + "epoch": 0.75, + "grad_norm": 1.5799718286887938, + "learning_rate": 1.5565753069314716e-06, + "loss": 0.4632, + "step": 6040 + }, + { + "epoch": 0.75, + "grad_norm": 1.6423879889232786, + "learning_rate": 1.555118416853227e-06, + "loss": 0.5132, + "step": 6041 + }, + { + "epoch": 0.75, + "grad_norm": 1.6570357501497963, + "learning_rate": 1.5536620833259409e-06, + "loss": 0.5067, + "step": 6042 + }, + { + "epoch": 0.75, + "grad_norm": 1.6347678528254872, + "learning_rate": 1.5522063065848974e-06, + "loss": 0.4755, + "step": 6043 + }, + { + "epoch": 0.75, + "grad_norm": 1.9050361222724341, + "learning_rate": 1.5507510868652919e-06, + "loss": 0.5277, + "step": 6044 + }, + { + "epoch": 0.75, + "grad_norm": 1.4023645235573734, + "learning_rate": 1.5492964244022258e-06, + "loss": 0.5248, + "step": 6045 + }, + { + "epoch": 0.75, + "grad_norm": 1.5803261516406482, + "learning_rate": 1.5478423194307147e-06, + "loss": 0.5049, + "step": 6046 + }, + { + "epoch": 0.75, + "grad_norm": 1.440458581359572, + "learning_rate": 1.546388772185679e-06, + "loss": 0.4499, + "step": 6047 + }, + { + "epoch": 0.75, + "grad_norm": 1.5016619536805895, + "learning_rate": 1.5449357829019556e-06, + "loss": 0.4498, + "step": 6048 + }, + { + "epoch": 0.75, + "grad_norm": 1.6320705758129939, + "learning_rate": 1.5434833518142839e-06, + "loss": 0.4782, + "step": 6049 + }, + { + "epoch": 0.75, + "grad_norm": 5.790727339102416, + "learning_rate": 1.542031479157321e-06, + "loss": 0.4854, + "step": 6050 + }, + { + "epoch": 0.75, + "grad_norm": 1.7130602317036618, + "learning_rate": 1.5405801651656266e-06, + "loss": 0.5126, + "step": 6051 + }, + { + "epoch": 0.75, + "grad_norm": 1.4593958980420332, + "learning_rate": 1.5391294100736736e-06, + "loss": 0.4715, + "step": 6052 + }, + { + "epoch": 0.75, + "grad_norm": 2.6284291543273373, + "learning_rate": 1.5376792141158469e-06, + "loss": 0.4736, + "step": 6053 + }, + { + "epoch": 0.75, + "grad_norm": 1.8665726653903791, + "learning_rate": 1.5362295775264353e-06, + "loss": 0.5013, + "step": 6054 + }, + { + "epoch": 0.75, + "grad_norm": 1.9314327501677002, + "learning_rate": 1.5347805005396427e-06, + "loss": 0.5416, + "step": 6055 + }, + { + "epoch": 0.75, + "grad_norm": 1.5284492197838069, + "learning_rate": 1.5333319833895776e-06, + "loss": 0.4701, + "step": 6056 + }, + { + "epoch": 0.75, + "grad_norm": 1.4593419373204979, + "learning_rate": 1.5318840263102641e-06, + "loss": 0.5075, + "step": 6057 + }, + { + "epoch": 0.75, + "grad_norm": 1.7455863857710499, + "learning_rate": 1.5304366295356287e-06, + "loss": 0.5364, + "step": 6058 + }, + { + "epoch": 0.75, + "grad_norm": 1.464678343219715, + "learning_rate": 1.5289897932995134e-06, + "loss": 0.4746, + "step": 6059 + }, + { + "epoch": 0.75, + "grad_norm": 1.8595107075944286, + "learning_rate": 1.5275435178356685e-06, + "loss": 0.5084, + "step": 6060 + }, + { + "epoch": 0.75, + "grad_norm": 1.9740787321561957, + "learning_rate": 1.526097803377749e-06, + "loss": 0.46, + "step": 6061 + }, + { + "epoch": 0.75, + "grad_norm": 0.6755881265433279, + "learning_rate": 1.5246526501593262e-06, + "loss": 0.4791, + "step": 6062 + }, + { + "epoch": 0.75, + "grad_norm": 1.5588452764686183, + "learning_rate": 1.5232080584138748e-06, + "loss": 0.5, + "step": 6063 + }, + { + "epoch": 0.75, + "grad_norm": 1.4090921609069993, + "learning_rate": 1.5217640283747836e-06, + "loss": 0.4794, + "step": 6064 + }, + { + "epoch": 0.75, + "grad_norm": 1.4383211702238097, + "learning_rate": 1.5203205602753462e-06, + "loss": 0.4716, + "step": 6065 + }, + { + "epoch": 0.75, + "grad_norm": 1.6150559905537978, + "learning_rate": 1.5188776543487699e-06, + "loss": 0.5223, + "step": 6066 + }, + { + "epoch": 0.75, + "grad_norm": 0.6791885235010544, + "learning_rate": 1.5174353108281653e-06, + "loss": 0.4273, + "step": 6067 + }, + { + "epoch": 0.75, + "grad_norm": 0.6686708077913581, + "learning_rate": 1.515993529946559e-06, + "loss": 0.5173, + "step": 6068 + }, + { + "epoch": 0.75, + "grad_norm": 1.4998571863692616, + "learning_rate": 1.5145523119368832e-06, + "loss": 0.4399, + "step": 6069 + }, + { + "epoch": 0.75, + "grad_norm": 1.3938713925748085, + "learning_rate": 1.513111657031977e-06, + "loss": 0.477, + "step": 6070 + }, + { + "epoch": 0.75, + "grad_norm": 1.639811660627296, + "learning_rate": 1.5116715654645941e-06, + "loss": 0.5163, + "step": 6071 + }, + { + "epoch": 0.75, + "grad_norm": 1.8654325248173858, + "learning_rate": 1.5102320374673907e-06, + "loss": 0.4395, + "step": 6072 + }, + { + "epoch": 0.75, + "grad_norm": 1.502720874368011, + "learning_rate": 1.5087930732729378e-06, + "loss": 0.5218, + "step": 6073 + }, + { + "epoch": 0.75, + "grad_norm": 1.716755769956245, + "learning_rate": 1.5073546731137105e-06, + "loss": 0.5174, + "step": 6074 + }, + { + "epoch": 0.75, + "grad_norm": 1.2313240862812058, + "learning_rate": 1.5059168372220984e-06, + "loss": 0.452, + "step": 6075 + }, + { + "epoch": 0.75, + "grad_norm": 1.4806778543728099, + "learning_rate": 1.5044795658303924e-06, + "loss": 0.5089, + "step": 6076 + }, + { + "epoch": 0.75, + "grad_norm": 1.9318546882145895, + "learning_rate": 1.503042859170798e-06, + "loss": 0.5295, + "step": 6077 + }, + { + "epoch": 0.75, + "grad_norm": 1.3992136102391561, + "learning_rate": 1.5016067174754301e-06, + "loss": 0.5428, + "step": 6078 + }, + { + "epoch": 0.75, + "grad_norm": 1.5946679371637276, + "learning_rate": 1.5001711409763065e-06, + "loss": 0.5025, + "step": 6079 + }, + { + "epoch": 0.75, + "grad_norm": 1.3330513773776378, + "learning_rate": 1.4987361299053599e-06, + "loss": 0.4952, + "step": 6080 + }, + { + "epoch": 0.75, + "grad_norm": 3.763017399981389, + "learning_rate": 1.4973016844944265e-06, + "loss": 0.4714, + "step": 6081 + }, + { + "epoch": 0.75, + "grad_norm": 1.412498599014573, + "learning_rate": 1.4958678049752568e-06, + "loss": 0.4751, + "step": 6082 + }, + { + "epoch": 0.75, + "grad_norm": 2.077612354528001, + "learning_rate": 1.4944344915795034e-06, + "loss": 0.5186, + "step": 6083 + }, + { + "epoch": 0.76, + "grad_norm": 2.636665581614744, + "learning_rate": 1.4930017445387317e-06, + "loss": 0.5439, + "step": 6084 + }, + { + "epoch": 0.76, + "grad_norm": 1.3895732747424572, + "learning_rate": 1.4915695640844174e-06, + "loss": 0.4998, + "step": 6085 + }, + { + "epoch": 0.76, + "grad_norm": 1.8684005695710397, + "learning_rate": 1.4901379504479374e-06, + "loss": 0.5444, + "step": 6086 + }, + { + "epoch": 0.76, + "grad_norm": 1.5776228687137166, + "learning_rate": 1.488706903860585e-06, + "loss": 0.4587, + "step": 6087 + }, + { + "epoch": 0.76, + "grad_norm": 1.5765276916924604, + "learning_rate": 1.487276424553556e-06, + "loss": 0.4779, + "step": 6088 + }, + { + "epoch": 0.76, + "grad_norm": 1.5607512892815054, + "learning_rate": 1.4858465127579585e-06, + "loss": 0.5154, + "step": 6089 + }, + { + "epoch": 0.76, + "grad_norm": 1.5630673033403901, + "learning_rate": 1.484417168704806e-06, + "loss": 0.459, + "step": 6090 + }, + { + "epoch": 0.76, + "grad_norm": 1.608404412381052, + "learning_rate": 1.4829883926250238e-06, + "loss": 0.5575, + "step": 6091 + }, + { + "epoch": 0.76, + "grad_norm": 4.239238780180317, + "learning_rate": 1.4815601847494388e-06, + "loss": 0.5174, + "step": 6092 + }, + { + "epoch": 0.76, + "grad_norm": 1.386493163183553, + "learning_rate": 1.4801325453087967e-06, + "loss": 0.4844, + "step": 6093 + }, + { + "epoch": 0.76, + "grad_norm": 1.2527730800500003, + "learning_rate": 1.4787054745337403e-06, + "loss": 0.5151, + "step": 6094 + }, + { + "epoch": 0.76, + "grad_norm": 1.4224906006832736, + "learning_rate": 1.4772789726548293e-06, + "loss": 0.47, + "step": 6095 + }, + { + "epoch": 0.76, + "grad_norm": 1.4452147059422642, + "learning_rate": 1.475853039902525e-06, + "loss": 0.4718, + "step": 6096 + }, + { + "epoch": 0.76, + "grad_norm": 1.3464263529106104, + "learning_rate": 1.4744276765071991e-06, + "loss": 0.4993, + "step": 6097 + }, + { + "epoch": 0.76, + "grad_norm": 1.4470631571011383, + "learning_rate": 1.4730028826991338e-06, + "loss": 0.5509, + "step": 6098 + }, + { + "epoch": 0.76, + "grad_norm": 2.2049191188781294, + "learning_rate": 1.471578658708514e-06, + "loss": 0.4637, + "step": 6099 + }, + { + "epoch": 0.76, + "grad_norm": 2.5432552984631496, + "learning_rate": 1.4701550047654378e-06, + "loss": 0.5719, + "step": 6100 + }, + { + "epoch": 0.76, + "grad_norm": 1.7224810211178514, + "learning_rate": 1.4687319210999085e-06, + "loss": 0.5144, + "step": 6101 + }, + { + "epoch": 0.76, + "grad_norm": 1.3750536849035844, + "learning_rate": 1.46730940794184e-06, + "loss": 0.4745, + "step": 6102 + }, + { + "epoch": 0.76, + "grad_norm": 3.2787319080534196, + "learning_rate": 1.4658874655210476e-06, + "loss": 0.5074, + "step": 6103 + }, + { + "epoch": 0.76, + "grad_norm": 1.8074307407464616, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.5201, + "step": 6104 + }, + { + "epoch": 0.76, + "grad_norm": 1.3816429645259922, + "learning_rate": 1.4630452938101187e-06, + "loss": 0.4676, + "step": 6105 + }, + { + "epoch": 0.76, + "grad_norm": 1.4917974592944019, + "learning_rate": 1.4616250649791563e-06, + "loss": 0.4782, + "step": 6106 + }, + { + "epoch": 0.76, + "grad_norm": 1.7272759649744858, + "learning_rate": 1.4602054078038296e-06, + "loss": 0.5241, + "step": 6107 + }, + { + "epoch": 0.76, + "grad_norm": 0.7922423149902295, + "learning_rate": 1.4587863225134934e-06, + "loss": 0.5275, + "step": 6108 + }, + { + "epoch": 0.76, + "grad_norm": 3.0917670324044777, + "learning_rate": 1.457367809337415e-06, + "loss": 0.491, + "step": 6109 + }, + { + "epoch": 0.76, + "grad_norm": 1.545252583770274, + "learning_rate": 1.4559498685047679e-06, + "loss": 0.5203, + "step": 6110 + }, + { + "epoch": 0.76, + "grad_norm": 1.549050685649776, + "learning_rate": 1.4545325002446337e-06, + "loss": 0.5277, + "step": 6111 + }, + { + "epoch": 0.76, + "grad_norm": 1.4392223393324777, + "learning_rate": 1.453115704785999e-06, + "loss": 0.485, + "step": 6112 + }, + { + "epoch": 0.76, + "grad_norm": 1.4131394903061087, + "learning_rate": 1.4516994823577613e-06, + "loss": 0.479, + "step": 6113 + }, + { + "epoch": 0.76, + "grad_norm": 1.386963970835647, + "learning_rate": 1.4502838331887232e-06, + "loss": 0.5144, + "step": 6114 + }, + { + "epoch": 0.76, + "grad_norm": 1.5075390793671843, + "learning_rate": 1.4488687575075932e-06, + "loss": 0.4816, + "step": 6115 + }, + { + "epoch": 0.76, + "grad_norm": 2.2656131827967814, + "learning_rate": 1.4474542555429927e-06, + "loss": 0.4684, + "step": 6116 + }, + { + "epoch": 0.76, + "grad_norm": 1.6712536801227746, + "learning_rate": 1.4460403275234425e-06, + "loss": 0.4901, + "step": 6117 + }, + { + "epoch": 0.76, + "grad_norm": 1.6216114085232909, + "learning_rate": 1.4446269736773805e-06, + "loss": 0.4703, + "step": 6118 + }, + { + "epoch": 0.76, + "grad_norm": 1.4842233786242254, + "learning_rate": 1.4432141942331423e-06, + "loss": 0.5212, + "step": 6119 + }, + { + "epoch": 0.76, + "grad_norm": 1.319943915507911, + "learning_rate": 1.4418019894189783e-06, + "loss": 0.4533, + "step": 6120 + }, + { + "epoch": 0.76, + "grad_norm": 1.5602930700038722, + "learning_rate": 1.4403903594630387e-06, + "loss": 0.495, + "step": 6121 + }, + { + "epoch": 0.76, + "grad_norm": 1.3866372003821805, + "learning_rate": 1.4389793045933887e-06, + "loss": 0.542, + "step": 6122 + }, + { + "epoch": 0.76, + "grad_norm": 1.6214768264161543, + "learning_rate": 1.4375688250379927e-06, + "loss": 0.4438, + "step": 6123 + }, + { + "epoch": 0.76, + "grad_norm": 1.5641216385268033, + "learning_rate": 1.43615892102473e-06, + "loss": 0.5375, + "step": 6124 + }, + { + "epoch": 0.76, + "grad_norm": 1.5477655317021919, + "learning_rate": 1.4347495927813797e-06, + "loss": 0.5442, + "step": 6125 + }, + { + "epoch": 0.76, + "grad_norm": 1.7259272691075274, + "learning_rate": 1.4333408405356325e-06, + "loss": 0.5016, + "step": 6126 + }, + { + "epoch": 0.76, + "grad_norm": 1.5600227007704104, + "learning_rate": 1.4319326645150861e-06, + "loss": 0.5079, + "step": 6127 + }, + { + "epoch": 0.76, + "grad_norm": 1.7365409956549633, + "learning_rate": 1.4305250649472414e-06, + "loss": 0.5232, + "step": 6128 + }, + { + "epoch": 0.76, + "grad_norm": 1.392625333117977, + "learning_rate": 1.4291180420595109e-06, + "loss": 0.5264, + "step": 6129 + }, + { + "epoch": 0.76, + "grad_norm": 1.5697261875302033, + "learning_rate": 1.4277115960792082e-06, + "loss": 0.487, + "step": 6130 + }, + { + "epoch": 0.76, + "grad_norm": 1.5277808999870366, + "learning_rate": 1.4263057272335612e-06, + "loss": 0.4424, + "step": 6131 + }, + { + "epoch": 0.76, + "grad_norm": 1.4738957764427278, + "learning_rate": 1.4249004357496965e-06, + "loss": 0.5131, + "step": 6132 + }, + { + "epoch": 0.76, + "grad_norm": 1.5456403435516437, + "learning_rate": 1.4234957218546531e-06, + "loss": 0.4809, + "step": 6133 + }, + { + "epoch": 0.76, + "grad_norm": 1.6002757304383604, + "learning_rate": 1.4220915857753765e-06, + "loss": 0.4778, + "step": 6134 + }, + { + "epoch": 0.76, + "grad_norm": 1.9512042216256624, + "learning_rate": 1.4206880277387143e-06, + "loss": 0.5432, + "step": 6135 + }, + { + "epoch": 0.76, + "grad_norm": 2.780164308223862, + "learning_rate": 1.419285047971426e-06, + "loss": 0.501, + "step": 6136 + }, + { + "epoch": 0.76, + "grad_norm": 1.4504743857264533, + "learning_rate": 1.4178826467001733e-06, + "loss": 0.524, + "step": 6137 + }, + { + "epoch": 0.76, + "grad_norm": 2.146544480364636, + "learning_rate": 1.4164808241515287e-06, + "loss": 0.5136, + "step": 6138 + }, + { + "epoch": 0.76, + "grad_norm": 1.6272967301645165, + "learning_rate": 1.4150795805519663e-06, + "loss": 0.4752, + "step": 6139 + }, + { + "epoch": 0.76, + "grad_norm": 1.5723827414808167, + "learning_rate": 1.4136789161278724e-06, + "loss": 0.4918, + "step": 6140 + }, + { + "epoch": 0.76, + "grad_norm": 1.3826988338638708, + "learning_rate": 1.4122788311055335e-06, + "loss": 0.4696, + "step": 6141 + }, + { + "epoch": 0.76, + "grad_norm": 1.3597633810300407, + "learning_rate": 1.410879325711147e-06, + "loss": 0.5011, + "step": 6142 + }, + { + "epoch": 0.76, + "grad_norm": 1.6124829679414978, + "learning_rate": 1.4094804001708174e-06, + "loss": 0.5958, + "step": 6143 + }, + { + "epoch": 0.76, + "grad_norm": 1.542078020146645, + "learning_rate": 1.4080820547105501e-06, + "loss": 0.531, + "step": 6144 + }, + { + "epoch": 0.76, + "grad_norm": 1.2953216401574426, + "learning_rate": 1.4066842895562631e-06, + "loss": 0.4607, + "step": 6145 + }, + { + "epoch": 0.76, + "grad_norm": 1.7238163480996251, + "learning_rate": 1.4052871049337752e-06, + "loss": 0.4902, + "step": 6146 + }, + { + "epoch": 0.76, + "grad_norm": 1.6268366494492497, + "learning_rate": 1.4038905010688159e-06, + "loss": 0.476, + "step": 6147 + }, + { + "epoch": 0.76, + "grad_norm": 1.4296291021875145, + "learning_rate": 1.4024944781870164e-06, + "loss": 0.4979, + "step": 6148 + }, + { + "epoch": 0.76, + "grad_norm": 1.3735107930351835, + "learning_rate": 1.4010990365139194e-06, + "loss": 0.474, + "step": 6149 + }, + { + "epoch": 0.76, + "grad_norm": 1.5927504311548404, + "learning_rate": 1.399704176274968e-06, + "loss": 0.4923, + "step": 6150 + }, + { + "epoch": 0.76, + "grad_norm": 1.4353602620740302, + "learning_rate": 1.3983098976955157e-06, + "loss": 0.4654, + "step": 6151 + }, + { + "epoch": 0.76, + "grad_norm": 1.3544221085197357, + "learning_rate": 1.3969162010008213e-06, + "loss": 0.4948, + "step": 6152 + }, + { + "epoch": 0.76, + "grad_norm": 0.7144003386631187, + "learning_rate": 1.3955230864160462e-06, + "loss": 0.5, + "step": 6153 + }, + { + "epoch": 0.76, + "grad_norm": 1.350318088129962, + "learning_rate": 1.394130554166264e-06, + "loss": 0.4938, + "step": 6154 + }, + { + "epoch": 0.76, + "grad_norm": 1.629170310706614, + "learning_rate": 1.3927386044764468e-06, + "loss": 0.5517, + "step": 6155 + }, + { + "epoch": 0.76, + "grad_norm": 1.7072373672078287, + "learning_rate": 1.3913472375714792e-06, + "loss": 0.4441, + "step": 6156 + }, + { + "epoch": 0.76, + "grad_norm": 1.9292365455123768, + "learning_rate": 1.389956453676146e-06, + "loss": 0.4999, + "step": 6157 + }, + { + "epoch": 0.76, + "grad_norm": 1.9975739951652836, + "learning_rate": 1.3885662530151422e-06, + "loss": 0.5228, + "step": 6158 + }, + { + "epoch": 0.76, + "grad_norm": 1.4946002774292722, + "learning_rate": 1.3871766358130683e-06, + "loss": 0.5346, + "step": 6159 + }, + { + "epoch": 0.76, + "grad_norm": 1.3701032802565098, + "learning_rate": 1.3857876022944266e-06, + "loss": 0.5035, + "step": 6160 + }, + { + "epoch": 0.76, + "grad_norm": 1.7139531071014826, + "learning_rate": 1.3843991526836303e-06, + "loss": 0.5413, + "step": 6161 + }, + { + "epoch": 0.76, + "grad_norm": 2.266923031675677, + "learning_rate": 1.3830112872049929e-06, + "loss": 0.4469, + "step": 6162 + }, + { + "epoch": 0.76, + "grad_norm": 1.5573888558700415, + "learning_rate": 1.3816240060827385e-06, + "loss": 0.4741, + "step": 6163 + }, + { + "epoch": 0.76, + "grad_norm": 1.7676773716867706, + "learning_rate": 1.3802373095409926e-06, + "loss": 0.5104, + "step": 6164 + }, + { + "epoch": 0.77, + "grad_norm": 1.5647130625334107, + "learning_rate": 1.378851197803791e-06, + "loss": 0.4796, + "step": 6165 + }, + { + "epoch": 0.77, + "grad_norm": 1.2424931016563316, + "learning_rate": 1.377465671095069e-06, + "loss": 0.4755, + "step": 6166 + }, + { + "epoch": 0.77, + "grad_norm": 1.3848258055425164, + "learning_rate": 1.3760807296386725e-06, + "loss": 0.4646, + "step": 6167 + }, + { + "epoch": 0.77, + "grad_norm": 1.527042221518362, + "learning_rate": 1.374696373658352e-06, + "loss": 0.4795, + "step": 6168 + }, + { + "epoch": 0.77, + "grad_norm": 1.2861442316512977, + "learning_rate": 1.3733126033777599e-06, + "loss": 0.488, + "step": 6169 + }, + { + "epoch": 0.77, + "grad_norm": 0.724289675612865, + "learning_rate": 1.371929419020459e-06, + "loss": 0.5291, + "step": 6170 + }, + { + "epoch": 0.77, + "grad_norm": 0.7075573869726784, + "learning_rate": 1.370546820809912e-06, + "loss": 0.4787, + "step": 6171 + }, + { + "epoch": 0.77, + "grad_norm": 1.4051071751574435, + "learning_rate": 1.3691648089694931e-06, + "loss": 0.4678, + "step": 6172 + }, + { + "epoch": 0.77, + "grad_norm": 1.4999698282828469, + "learning_rate": 1.3677833837224757e-06, + "loss": 0.4626, + "step": 6173 + }, + { + "epoch": 0.77, + "grad_norm": 1.407283713252255, + "learning_rate": 1.3664025452920421e-06, + "loss": 0.4922, + "step": 6174 + }, + { + "epoch": 0.77, + "grad_norm": 1.5992010864639077, + "learning_rate": 1.3650222939012809e-06, + "loss": 0.4771, + "step": 6175 + }, + { + "epoch": 0.77, + "grad_norm": 1.6736440131195913, + "learning_rate": 1.3636426297731803e-06, + "loss": 0.5068, + "step": 6176 + }, + { + "epoch": 0.77, + "grad_norm": 2.4355258158852493, + "learning_rate": 1.3622635531306405e-06, + "loss": 0.545, + "step": 6177 + }, + { + "epoch": 0.77, + "grad_norm": 1.8904563872315112, + "learning_rate": 1.3608850641964604e-06, + "loss": 0.4989, + "step": 6178 + }, + { + "epoch": 0.77, + "grad_norm": 1.3131385472915835, + "learning_rate": 1.3595071631933504e-06, + "loss": 0.4593, + "step": 6179 + }, + { + "epoch": 0.77, + "grad_norm": 1.6578571419253867, + "learning_rate": 1.3581298503439193e-06, + "loss": 0.4936, + "step": 6180 + }, + { + "epoch": 0.77, + "grad_norm": 1.517459438778973, + "learning_rate": 1.3567531258706874e-06, + "loss": 0.4905, + "step": 6181 + }, + { + "epoch": 0.77, + "grad_norm": 1.281799319717632, + "learning_rate": 1.3553769899960716e-06, + "loss": 0.5137, + "step": 6182 + }, + { + "epoch": 0.77, + "grad_norm": 1.5282590270708574, + "learning_rate": 1.3540014429424049e-06, + "loss": 0.542, + "step": 6183 + }, + { + "epoch": 0.77, + "grad_norm": 1.4806309356958096, + "learning_rate": 1.3526264849319166e-06, + "loss": 0.413, + "step": 6184 + }, + { + "epoch": 0.77, + "grad_norm": 1.4836012581304867, + "learning_rate": 1.351252116186741e-06, + "loss": 0.4292, + "step": 6185 + }, + { + "epoch": 0.77, + "grad_norm": 1.3555241248525332, + "learning_rate": 1.3498783369289226e-06, + "loss": 0.4885, + "step": 6186 + }, + { + "epoch": 0.77, + "grad_norm": 1.2483446342138798, + "learning_rate": 1.348505147380405e-06, + "loss": 0.4379, + "step": 6187 + }, + { + "epoch": 0.77, + "grad_norm": 1.493588619382274, + "learning_rate": 1.347132547763042e-06, + "loss": 0.4947, + "step": 6188 + }, + { + "epoch": 0.77, + "grad_norm": 2.088968788701897, + "learning_rate": 1.3457605382985862e-06, + "loss": 0.4351, + "step": 6189 + }, + { + "epoch": 0.77, + "grad_norm": 1.3924279918426934, + "learning_rate": 1.3443891192087e-06, + "loss": 0.4355, + "step": 6190 + }, + { + "epoch": 0.77, + "grad_norm": 1.3716290693646196, + "learning_rate": 1.3430182907149447e-06, + "loss": 0.4726, + "step": 6191 + }, + { + "epoch": 0.77, + "grad_norm": 1.3336070112616283, + "learning_rate": 1.3416480530387955e-06, + "loss": 0.4395, + "step": 6192 + }, + { + "epoch": 0.77, + "grad_norm": 1.6298967280551317, + "learning_rate": 1.340278406401621e-06, + "loss": 0.5065, + "step": 6193 + }, + { + "epoch": 0.77, + "grad_norm": 1.4516402621866342, + "learning_rate": 1.3389093510247043e-06, + "loss": 0.4969, + "step": 6194 + }, + { + "epoch": 0.77, + "grad_norm": 1.3431009790179682, + "learning_rate": 1.3375408871292256e-06, + "loss": 0.4955, + "step": 6195 + }, + { + "epoch": 0.77, + "grad_norm": 1.5731051836007435, + "learning_rate": 1.3361730149362706e-06, + "loss": 0.4964, + "step": 6196 + }, + { + "epoch": 0.77, + "grad_norm": 1.331718298876939, + "learning_rate": 1.3348057346668353e-06, + "loss": 0.5137, + "step": 6197 + }, + { + "epoch": 0.77, + "grad_norm": 0.6625395922594366, + "learning_rate": 1.3334390465418122e-06, + "loss": 0.4958, + "step": 6198 + }, + { + "epoch": 0.77, + "grad_norm": 1.4321764259365748, + "learning_rate": 1.3320729507820029e-06, + "loss": 0.5206, + "step": 6199 + }, + { + "epoch": 0.77, + "grad_norm": 1.4436428193547688, + "learning_rate": 1.3307074476081127e-06, + "loss": 0.4634, + "step": 6200 + }, + { + "epoch": 0.77, + "grad_norm": 1.4385882583497545, + "learning_rate": 1.3293425372407526e-06, + "loss": 0.4517, + "step": 6201 + }, + { + "epoch": 0.77, + "grad_norm": 1.9622624537904163, + "learning_rate": 1.3279782199004321e-06, + "loss": 0.5084, + "step": 6202 + }, + { + "epoch": 0.77, + "grad_norm": 1.2241283119219653, + "learning_rate": 1.3266144958075717e-06, + "loss": 0.4693, + "step": 6203 + }, + { + "epoch": 0.77, + "grad_norm": 1.5993684273325757, + "learning_rate": 1.325251365182492e-06, + "loss": 0.4913, + "step": 6204 + }, + { + "epoch": 0.77, + "grad_norm": 1.4483264471948116, + "learning_rate": 1.323888828245417e-06, + "loss": 0.5085, + "step": 6205 + }, + { + "epoch": 0.77, + "grad_norm": 1.916745646680996, + "learning_rate": 1.3225268852164797e-06, + "loss": 0.5189, + "step": 6206 + }, + { + "epoch": 0.77, + "grad_norm": 1.4769884347476094, + "learning_rate": 1.3211655363157094e-06, + "loss": 0.4837, + "step": 6207 + }, + { + "epoch": 0.77, + "grad_norm": 1.5140704961515885, + "learning_rate": 1.31980478176305e-06, + "loss": 0.522, + "step": 6208 + }, + { + "epoch": 0.77, + "grad_norm": 1.6809489514739075, + "learning_rate": 1.3184446217783387e-06, + "loss": 0.4498, + "step": 6209 + }, + { + "epoch": 0.77, + "grad_norm": 1.4330904014880177, + "learning_rate": 1.3170850565813243e-06, + "loss": 0.5133, + "step": 6210 + }, + { + "epoch": 0.77, + "grad_norm": 1.4340701334442918, + "learning_rate": 1.3157260863916544e-06, + "loss": 0.5312, + "step": 6211 + }, + { + "epoch": 0.77, + "grad_norm": 1.4960968171210913, + "learning_rate": 1.3143677114288845e-06, + "loss": 0.5202, + "step": 6212 + }, + { + "epoch": 0.77, + "grad_norm": 1.5065093070867135, + "learning_rate": 1.3130099319124706e-06, + "loss": 0.531, + "step": 6213 + }, + { + "epoch": 0.77, + "grad_norm": 1.6062720817845197, + "learning_rate": 1.3116527480617735e-06, + "loss": 0.4971, + "step": 6214 + }, + { + "epoch": 0.77, + "grad_norm": 1.4525219409854628, + "learning_rate": 1.3102961600960584e-06, + "loss": 0.5125, + "step": 6215 + }, + { + "epoch": 0.77, + "grad_norm": 1.333691087311118, + "learning_rate": 1.3089401682344955e-06, + "loss": 0.5467, + "step": 6216 + }, + { + "epoch": 0.77, + "grad_norm": 1.738900694292726, + "learning_rate": 1.3075847726961571e-06, + "loss": 0.52, + "step": 6217 + }, + { + "epoch": 0.77, + "grad_norm": 3.2815307166243293, + "learning_rate": 1.3062299737000173e-06, + "loss": 0.5316, + "step": 6218 + }, + { + "epoch": 0.77, + "grad_norm": 1.543516672086808, + "learning_rate": 1.3048757714649585e-06, + "loss": 0.5369, + "step": 6219 + }, + { + "epoch": 0.77, + "grad_norm": 1.6412206549282686, + "learning_rate": 1.3035221662097614e-06, + "loss": 0.4903, + "step": 6220 + }, + { + "epoch": 0.77, + "grad_norm": 1.4833664255463928, + "learning_rate": 1.302169158153115e-06, + "loss": 0.5045, + "step": 6221 + }, + { + "epoch": 0.77, + "grad_norm": 1.5423653118353517, + "learning_rate": 1.300816747513609e-06, + "loss": 0.5146, + "step": 6222 + }, + { + "epoch": 0.77, + "grad_norm": 1.5360441497197643, + "learning_rate": 1.2994649345097354e-06, + "loss": 0.4826, + "step": 6223 + }, + { + "epoch": 0.77, + "grad_norm": 1.9787375634564193, + "learning_rate": 1.298113719359893e-06, + "loss": 0.5337, + "step": 6224 + }, + { + "epoch": 0.77, + "grad_norm": 1.4540743401928375, + "learning_rate": 1.2967631022823824e-06, + "loss": 0.4575, + "step": 6225 + }, + { + "epoch": 0.77, + "grad_norm": 2.262005236895857, + "learning_rate": 1.29541308349541e-06, + "loss": 0.5949, + "step": 6226 + }, + { + "epoch": 0.77, + "grad_norm": 1.6664454512775195, + "learning_rate": 1.294063663217079e-06, + "loss": 0.4938, + "step": 6227 + }, + { + "epoch": 0.77, + "grad_norm": 1.6280218423673491, + "learning_rate": 1.2927148416654033e-06, + "loss": 0.5394, + "step": 6228 + }, + { + "epoch": 0.77, + "grad_norm": 1.430640150277463, + "learning_rate": 1.2913666190582947e-06, + "loss": 0.492, + "step": 6229 + }, + { + "epoch": 0.77, + "grad_norm": 1.607236994045575, + "learning_rate": 1.2900189956135728e-06, + "loss": 0.4961, + "step": 6230 + }, + { + "epoch": 0.77, + "grad_norm": 0.6616581269513682, + "learning_rate": 1.288671971548956e-06, + "loss": 0.4807, + "step": 6231 + }, + { + "epoch": 0.77, + "grad_norm": 2.1011305393950583, + "learning_rate": 1.2873255470820677e-06, + "loss": 0.5271, + "step": 6232 + }, + { + "epoch": 0.77, + "grad_norm": 1.3148633012922666, + "learning_rate": 1.2859797224304378e-06, + "loss": 0.4674, + "step": 6233 + }, + { + "epoch": 0.77, + "grad_norm": 1.5799807696563415, + "learning_rate": 1.2846344978114921e-06, + "loss": 0.4335, + "step": 6234 + }, + { + "epoch": 0.77, + "grad_norm": 1.2282627622228512, + "learning_rate": 1.283289873442567e-06, + "loss": 0.4612, + "step": 6235 + }, + { + "epoch": 0.77, + "grad_norm": 1.5334044812396002, + "learning_rate": 1.2819458495408948e-06, + "loss": 0.4714, + "step": 6236 + }, + { + "epoch": 0.77, + "grad_norm": 1.3003968312361376, + "learning_rate": 1.2806024263236178e-06, + "loss": 0.4699, + "step": 6237 + }, + { + "epoch": 0.77, + "grad_norm": 2.214653471241372, + "learning_rate": 1.2792596040077753e-06, + "loss": 0.4541, + "step": 6238 + }, + { + "epoch": 0.77, + "grad_norm": 1.4826821480376464, + "learning_rate": 1.277917382810314e-06, + "loss": 0.5076, + "step": 6239 + }, + { + "epoch": 0.77, + "grad_norm": 3.823356149131492, + "learning_rate": 1.27657576294808e-06, + "loss": 0.505, + "step": 6240 + }, + { + "epoch": 0.77, + "grad_norm": 1.6382532688362617, + "learning_rate": 1.2752347446378238e-06, + "loss": 0.5678, + "step": 6241 + }, + { + "epoch": 0.77, + "grad_norm": 1.4384221892090772, + "learning_rate": 1.2738943280962013e-06, + "loss": 0.4839, + "step": 6242 + }, + { + "epoch": 0.77, + "grad_norm": 1.342711899294759, + "learning_rate": 1.272554513539765e-06, + "loss": 0.481, + "step": 6243 + }, + { + "epoch": 0.77, + "grad_norm": 2.044459208891881, + "learning_rate": 1.271215301184977e-06, + "loss": 0.4773, + "step": 6244 + }, + { + "epoch": 0.78, + "grad_norm": 1.3545482257587682, + "learning_rate": 1.2698766912481958e-06, + "loss": 0.491, + "step": 6245 + }, + { + "epoch": 0.78, + "grad_norm": 1.3167855784532667, + "learning_rate": 1.2685386839456886e-06, + "loss": 0.5057, + "step": 6246 + }, + { + "epoch": 0.78, + "grad_norm": 1.8376158472318225, + "learning_rate": 1.2672012794936195e-06, + "loss": 0.4646, + "step": 6247 + }, + { + "epoch": 0.78, + "grad_norm": 2.4815672785751532, + "learning_rate": 1.2658644781080593e-06, + "loss": 0.4576, + "step": 6248 + }, + { + "epoch": 0.78, + "grad_norm": 1.8421058516258766, + "learning_rate": 1.2645282800049812e-06, + "loss": 0.5177, + "step": 6249 + }, + { + "epoch": 0.78, + "grad_norm": 1.8411185131379155, + "learning_rate": 1.2631926854002574e-06, + "loss": 0.5084, + "step": 6250 + }, + { + "epoch": 0.78, + "grad_norm": 1.3816045998659126, + "learning_rate": 1.2618576945096671e-06, + "loss": 0.5492, + "step": 6251 + }, + { + "epoch": 0.78, + "grad_norm": 1.3650484318670382, + "learning_rate": 1.2605233075488877e-06, + "loss": 0.4511, + "step": 6252 + }, + { + "epoch": 0.78, + "grad_norm": 1.4297217792305226, + "learning_rate": 1.259189524733504e-06, + "loss": 0.5025, + "step": 6253 + }, + { + "epoch": 0.78, + "grad_norm": 1.7866400137312053, + "learning_rate": 1.2578563462789967e-06, + "loss": 0.5479, + "step": 6254 + }, + { + "epoch": 0.78, + "grad_norm": 1.288055058437218, + "learning_rate": 1.2565237724007563e-06, + "loss": 0.5066, + "step": 6255 + }, + { + "epoch": 0.78, + "grad_norm": 1.5601250969802751, + "learning_rate": 1.2551918033140687e-06, + "loss": 0.525, + "step": 6256 + }, + { + "epoch": 0.78, + "grad_norm": 1.2428756455250791, + "learning_rate": 1.2538604392341264e-06, + "loss": 0.4523, + "step": 6257 + }, + { + "epoch": 0.78, + "grad_norm": 1.306311357857756, + "learning_rate": 1.2525296803760246e-06, + "loss": 0.4819, + "step": 6258 + }, + { + "epoch": 0.78, + "grad_norm": 1.5382898567298926, + "learning_rate": 1.2511995269547566e-06, + "loss": 0.5187, + "step": 6259 + }, + { + "epoch": 0.78, + "grad_norm": 1.528938983892805, + "learning_rate": 1.2498699791852225e-06, + "loss": 0.4815, + "step": 6260 + }, + { + "epoch": 0.78, + "grad_norm": 1.4201500893708132, + "learning_rate": 1.2485410372822205e-06, + "loss": 0.4682, + "step": 6261 + }, + { + "epoch": 0.78, + "grad_norm": 1.4320467050540584, + "learning_rate": 1.247212701460455e-06, + "loss": 0.5452, + "step": 6262 + }, + { + "epoch": 0.78, + "grad_norm": 1.5668199288766043, + "learning_rate": 1.2458849719345279e-06, + "loss": 0.515, + "step": 6263 + }, + { + "epoch": 0.78, + "grad_norm": 1.504159923138868, + "learning_rate": 1.244557848918948e-06, + "loss": 0.5079, + "step": 6264 + }, + { + "epoch": 0.78, + "grad_norm": 1.3188449069712342, + "learning_rate": 1.2432313326281215e-06, + "loss": 0.4766, + "step": 6265 + }, + { + "epoch": 0.78, + "grad_norm": 1.44142880046114, + "learning_rate": 1.2419054232763595e-06, + "loss": 0.4971, + "step": 6266 + }, + { + "epoch": 0.78, + "grad_norm": 1.5180567972225074, + "learning_rate": 1.2405801210778762e-06, + "loss": 0.4639, + "step": 6267 + }, + { + "epoch": 0.78, + "grad_norm": 1.3471618262889407, + "learning_rate": 1.2392554262467833e-06, + "loss": 0.4822, + "step": 6268 + }, + { + "epoch": 0.78, + "grad_norm": 1.7091520425341729, + "learning_rate": 1.2379313389970992e-06, + "loss": 0.5229, + "step": 6269 + }, + { + "epoch": 0.78, + "grad_norm": 1.570282113403896, + "learning_rate": 1.2366078595427389e-06, + "loss": 0.5271, + "step": 6270 + }, + { + "epoch": 0.78, + "grad_norm": 1.3949975771134056, + "learning_rate": 1.235284988097526e-06, + "loss": 0.4658, + "step": 6271 + }, + { + "epoch": 0.78, + "grad_norm": 1.605511267248311, + "learning_rate": 1.2339627248751778e-06, + "loss": 0.5136, + "step": 6272 + }, + { + "epoch": 0.78, + "grad_norm": 1.8807963700316133, + "learning_rate": 1.2326410700893198e-06, + "loss": 0.4902, + "step": 6273 + }, + { + "epoch": 0.78, + "grad_norm": 1.465254290911377, + "learning_rate": 1.231320023953479e-06, + "loss": 0.4896, + "step": 6274 + }, + { + "epoch": 0.78, + "grad_norm": 2.324978655566404, + "learning_rate": 1.2299995866810777e-06, + "loss": 0.4975, + "step": 6275 + }, + { + "epoch": 0.78, + "grad_norm": 1.428055739224206, + "learning_rate": 1.2286797584854482e-06, + "loss": 0.5418, + "step": 6276 + }, + { + "epoch": 0.78, + "grad_norm": 1.4792979849123264, + "learning_rate": 1.2273605395798165e-06, + "loss": 0.5015, + "step": 6277 + }, + { + "epoch": 0.78, + "grad_norm": 1.3062914972392994, + "learning_rate": 1.226041930177318e-06, + "loss": 0.5003, + "step": 6278 + }, + { + "epoch": 0.78, + "grad_norm": 1.3227903716457101, + "learning_rate": 1.2247239304909825e-06, + "loss": 0.4546, + "step": 6279 + }, + { + "epoch": 0.78, + "grad_norm": 1.5132406211568603, + "learning_rate": 1.223406540733747e-06, + "loss": 0.512, + "step": 6280 + }, + { + "epoch": 0.78, + "grad_norm": 1.5284383420872536, + "learning_rate": 1.2220897611184429e-06, + "loss": 0.4794, + "step": 6281 + }, + { + "epoch": 0.78, + "grad_norm": 1.6839892207048355, + "learning_rate": 1.220773591857814e-06, + "loss": 0.5089, + "step": 6282 + }, + { + "epoch": 0.78, + "grad_norm": 1.652141145541304, + "learning_rate": 1.2194580331644955e-06, + "loss": 0.4869, + "step": 6283 + }, + { + "epoch": 0.78, + "grad_norm": 2.323465813497087, + "learning_rate": 1.2181430852510268e-06, + "loss": 0.5281, + "step": 6284 + }, + { + "epoch": 0.78, + "grad_norm": 1.5468279291555211, + "learning_rate": 1.2168287483298514e-06, + "loss": 0.4227, + "step": 6285 + }, + { + "epoch": 0.78, + "grad_norm": 1.5243814666158966, + "learning_rate": 1.2155150226133094e-06, + "loss": 0.5128, + "step": 6286 + }, + { + "epoch": 0.78, + "grad_norm": 1.6629313928964435, + "learning_rate": 1.2142019083136475e-06, + "loss": 0.4846, + "step": 6287 + }, + { + "epoch": 0.78, + "grad_norm": 1.453438332306153, + "learning_rate": 1.2128894056430084e-06, + "loss": 0.5031, + "step": 6288 + }, + { + "epoch": 0.78, + "grad_norm": 2.169209631478575, + "learning_rate": 1.2115775148134402e-06, + "loss": 0.5078, + "step": 6289 + }, + { + "epoch": 0.78, + "grad_norm": 1.7260170535617319, + "learning_rate": 1.2102662360368893e-06, + "loss": 0.5449, + "step": 6290 + }, + { + "epoch": 0.78, + "grad_norm": 1.8277114237596315, + "learning_rate": 1.2089555695252064e-06, + "loss": 0.513, + "step": 6291 + }, + { + "epoch": 0.78, + "grad_norm": 1.5156474325060718, + "learning_rate": 1.20764551549014e-06, + "loss": 0.5189, + "step": 6292 + }, + { + "epoch": 0.78, + "grad_norm": 9.402604227104481, + "learning_rate": 1.2063360741433393e-06, + "loss": 0.5027, + "step": 6293 + }, + { + "epoch": 0.78, + "grad_norm": 4.506607537413577, + "learning_rate": 1.205027245696359e-06, + "loss": 0.4435, + "step": 6294 + }, + { + "epoch": 0.78, + "grad_norm": 1.2969275416545543, + "learning_rate": 1.2037190303606489e-06, + "loss": 0.4377, + "step": 6295 + }, + { + "epoch": 0.78, + "grad_norm": 2.7527021815831167, + "learning_rate": 1.2024114283475652e-06, + "loss": 0.5807, + "step": 6296 + }, + { + "epoch": 0.78, + "grad_norm": 1.3670555930535397, + "learning_rate": 1.2011044398683586e-06, + "loss": 0.4939, + "step": 6297 + }, + { + "epoch": 0.78, + "grad_norm": 1.2207355669928845, + "learning_rate": 1.1997980651341901e-06, + "loss": 0.4313, + "step": 6298 + }, + { + "epoch": 0.78, + "grad_norm": 1.6068755034967181, + "learning_rate": 1.1984923043561124e-06, + "loss": 0.4666, + "step": 6299 + }, + { + "epoch": 0.78, + "grad_norm": 1.3700754190571853, + "learning_rate": 1.197187157745085e-06, + "loss": 0.4371, + "step": 6300 + }, + { + "epoch": 0.78, + "grad_norm": 1.5182100119914927, + "learning_rate": 1.1958826255119626e-06, + "loss": 0.4806, + "step": 6301 + }, + { + "epoch": 0.78, + "grad_norm": 1.6868763537558658, + "learning_rate": 1.1945787078675075e-06, + "loss": 0.4545, + "step": 6302 + }, + { + "epoch": 0.78, + "grad_norm": 1.2726361777698496, + "learning_rate": 1.1932754050223772e-06, + "loss": 0.437, + "step": 6303 + }, + { + "epoch": 0.78, + "grad_norm": 1.7997900027014602, + "learning_rate": 1.1919727171871303e-06, + "loss": 0.4812, + "step": 6304 + }, + { + "epoch": 0.78, + "grad_norm": 1.7049836031903862, + "learning_rate": 1.1906706445722304e-06, + "loss": 0.5468, + "step": 6305 + }, + { + "epoch": 0.78, + "grad_norm": 1.482623305676932, + "learning_rate": 1.1893691873880348e-06, + "loss": 0.4679, + "step": 6306 + }, + { + "epoch": 0.78, + "grad_norm": 1.7546459793761726, + "learning_rate": 1.1880683458448111e-06, + "loss": 0.5173, + "step": 6307 + }, + { + "epoch": 0.78, + "grad_norm": 2.0376418787164963, + "learning_rate": 1.186768120152717e-06, + "loss": 0.4762, + "step": 6308 + }, + { + "epoch": 0.78, + "grad_norm": 1.5897112053677294, + "learning_rate": 1.1854685105218183e-06, + "loss": 0.5106, + "step": 6309 + }, + { + "epoch": 0.78, + "grad_norm": 1.554638495563267, + "learning_rate": 1.184169517162076e-06, + "loss": 0.4453, + "step": 6310 + }, + { + "epoch": 0.78, + "grad_norm": 1.4527954328226296, + "learning_rate": 1.1828711402833559e-06, + "loss": 0.4581, + "step": 6311 + }, + { + "epoch": 0.78, + "grad_norm": 1.324097258318579, + "learning_rate": 1.1815733800954221e-06, + "loss": 0.4468, + "step": 6312 + }, + { + "epoch": 0.78, + "grad_norm": 0.6615006974970603, + "learning_rate": 1.1802762368079368e-06, + "loss": 0.488, + "step": 6313 + }, + { + "epoch": 0.78, + "grad_norm": 1.1942935989812935, + "learning_rate": 1.1789797106304663e-06, + "loss": 0.4155, + "step": 6314 + }, + { + "epoch": 0.78, + "grad_norm": 3.6700048498627162, + "learning_rate": 1.1776838017724762e-06, + "loss": 0.4806, + "step": 6315 + }, + { + "epoch": 0.78, + "grad_norm": 1.426500155071954, + "learning_rate": 1.1763885104433331e-06, + "loss": 0.5418, + "step": 6316 + }, + { + "epoch": 0.78, + "grad_norm": 1.5091385876369006, + "learning_rate": 1.1750938368523e-06, + "loss": 0.5207, + "step": 6317 + }, + { + "epoch": 0.78, + "grad_norm": 1.3085262428892623, + "learning_rate": 1.1737997812085468e-06, + "loss": 0.4497, + "step": 6318 + }, + { + "epoch": 0.78, + "grad_norm": 1.4515944463730204, + "learning_rate": 1.1725063437211349e-06, + "loss": 0.4652, + "step": 6319 + }, + { + "epoch": 0.78, + "grad_norm": 1.2357108620539061, + "learning_rate": 1.1712135245990342e-06, + "loss": 0.465, + "step": 6320 + }, + { + "epoch": 0.78, + "grad_norm": 1.774785089411395, + "learning_rate": 1.1699213240511092e-06, + "loss": 0.4741, + "step": 6321 + }, + { + "epoch": 0.78, + "grad_norm": 3.101966765503848, + "learning_rate": 1.1686297422861241e-06, + "loss": 0.4625, + "step": 6322 + }, + { + "epoch": 0.78, + "grad_norm": 13.892565752711171, + "learning_rate": 1.1673387795127505e-06, + "loss": 0.467, + "step": 6323 + }, + { + "epoch": 0.78, + "grad_norm": 1.6317216482292651, + "learning_rate": 1.1660484359395514e-06, + "loss": 0.563, + "step": 6324 + }, + { + "epoch": 0.78, + "grad_norm": 1.440182272936019, + "learning_rate": 1.1647587117749942e-06, + "loss": 0.5386, + "step": 6325 + }, + { + "epoch": 0.79, + "grad_norm": 1.6785539736597066, + "learning_rate": 1.1634696072274444e-06, + "loss": 0.5317, + "step": 6326 + }, + { + "epoch": 0.79, + "grad_norm": 1.5445312568704768, + "learning_rate": 1.1621811225051698e-06, + "loss": 0.5147, + "step": 6327 + }, + { + "epoch": 0.79, + "grad_norm": 1.7406342594313629, + "learning_rate": 1.160893257816334e-06, + "loss": 0.4757, + "step": 6328 + }, + { + "epoch": 0.79, + "grad_norm": 1.3839958819337508, + "learning_rate": 1.1596060133690057e-06, + "loss": 0.5226, + "step": 6329 + }, + { + "epoch": 0.79, + "grad_norm": 1.3961518978650618, + "learning_rate": 1.1583193893711475e-06, + "loss": 0.5169, + "step": 6330 + }, + { + "epoch": 0.79, + "grad_norm": 1.6338408284297716, + "learning_rate": 1.1570333860306265e-06, + "loss": 0.5181, + "step": 6331 + }, + { + "epoch": 0.79, + "grad_norm": 1.9004636131185335, + "learning_rate": 1.1557480035552093e-06, + "loss": 0.5224, + "step": 6332 + }, + { + "epoch": 0.79, + "grad_norm": 1.8167801921999878, + "learning_rate": 1.1544632421525576e-06, + "loss": 0.5135, + "step": 6333 + }, + { + "epoch": 0.79, + "grad_norm": 1.5028563153474552, + "learning_rate": 1.1531791020302391e-06, + "loss": 0.4808, + "step": 6334 + }, + { + "epoch": 0.79, + "grad_norm": 1.4995582428022929, + "learning_rate": 1.1518955833957153e-06, + "loss": 0.5058, + "step": 6335 + }, + { + "epoch": 0.79, + "grad_norm": 1.564743351446164, + "learning_rate": 1.1506126864563522e-06, + "loss": 0.5488, + "step": 6336 + }, + { + "epoch": 0.79, + "grad_norm": 1.5702469015325229, + "learning_rate": 1.149330411419411e-06, + "loss": 0.5287, + "step": 6337 + }, + { + "epoch": 0.79, + "grad_norm": 1.4508956069268264, + "learning_rate": 1.1480487584920553e-06, + "loss": 0.4943, + "step": 6338 + }, + { + "epoch": 0.79, + "grad_norm": 1.4344574161100998, + "learning_rate": 1.146767727881349e-06, + "loss": 0.4971, + "step": 6339 + }, + { + "epoch": 0.79, + "grad_norm": 1.7641317544981567, + "learning_rate": 1.1454873197942507e-06, + "loss": 0.5236, + "step": 6340 + }, + { + "epoch": 0.79, + "grad_norm": 1.693249372026301, + "learning_rate": 1.1442075344376253e-06, + "loss": 0.535, + "step": 6341 + }, + { + "epoch": 0.79, + "grad_norm": 1.5097494619684362, + "learning_rate": 1.14292837201823e-06, + "loss": 0.5031, + "step": 6342 + }, + { + "epoch": 0.79, + "grad_norm": 1.6040740006328453, + "learning_rate": 1.1416498327427278e-06, + "loss": 0.5009, + "step": 6343 + }, + { + "epoch": 0.79, + "grad_norm": 1.5067558032158819, + "learning_rate": 1.1403719168176747e-06, + "loss": 0.4709, + "step": 6344 + }, + { + "epoch": 0.79, + "grad_norm": 2.2037937435097756, + "learning_rate": 1.1390946244495327e-06, + "loss": 0.5293, + "step": 6345 + }, + { + "epoch": 0.79, + "grad_norm": 1.4940316941859748, + "learning_rate": 1.1378179558446561e-06, + "loss": 0.481, + "step": 6346 + }, + { + "epoch": 0.79, + "grad_norm": 1.4547452020022014, + "learning_rate": 1.136541911209304e-06, + "loss": 0.5031, + "step": 6347 + }, + { + "epoch": 0.79, + "grad_norm": 1.5577078497772556, + "learning_rate": 1.135266490749634e-06, + "loss": 0.5071, + "step": 6348 + }, + { + "epoch": 0.79, + "grad_norm": 1.4071597547018102, + "learning_rate": 1.1339916946716984e-06, + "loss": 0.5052, + "step": 6349 + }, + { + "epoch": 0.79, + "grad_norm": 1.237185372105183, + "learning_rate": 1.1327175231814547e-06, + "loss": 0.474, + "step": 6350 + }, + { + "epoch": 0.79, + "grad_norm": 2.5312058336458136, + "learning_rate": 1.131443976484754e-06, + "loss": 0.4986, + "step": 6351 + }, + { + "epoch": 0.79, + "grad_norm": 1.278808877803244, + "learning_rate": 1.1301710547873512e-06, + "loss": 0.4917, + "step": 6352 + }, + { + "epoch": 0.79, + "grad_norm": 1.471052130513398, + "learning_rate": 1.1288987582948956e-06, + "loss": 0.4279, + "step": 6353 + }, + { + "epoch": 0.79, + "grad_norm": 1.9214340906088334, + "learning_rate": 1.1276270872129408e-06, + "loss": 0.4989, + "step": 6354 + }, + { + "epoch": 0.79, + "grad_norm": 1.6966020038555492, + "learning_rate": 1.1263560417469332e-06, + "loss": 0.4959, + "step": 6355 + }, + { + "epoch": 0.79, + "grad_norm": 1.9936299392743482, + "learning_rate": 1.1250856221022233e-06, + "loss": 0.4889, + "step": 6356 + }, + { + "epoch": 0.79, + "grad_norm": 1.2150697051618198, + "learning_rate": 1.1238158284840594e-06, + "loss": 0.4787, + "step": 6357 + }, + { + "epoch": 0.79, + "grad_norm": 1.376086418479404, + "learning_rate": 1.1225466610975854e-06, + "loss": 0.4775, + "step": 6358 + }, + { + "epoch": 0.79, + "grad_norm": 1.6770030416826525, + "learning_rate": 1.1212781201478496e-06, + "loss": 0.4983, + "step": 6359 + }, + { + "epoch": 0.79, + "grad_norm": 2.3716829551919227, + "learning_rate": 1.1200102058397927e-06, + "loss": 0.5119, + "step": 6360 + }, + { + "epoch": 0.79, + "grad_norm": 1.7293039440124984, + "learning_rate": 1.11874291837826e-06, + "loss": 0.4778, + "step": 6361 + }, + { + "epoch": 0.79, + "grad_norm": 1.2486910175729624, + "learning_rate": 1.117476257967991e-06, + "loss": 0.4769, + "step": 6362 + }, + { + "epoch": 0.79, + "grad_norm": 1.9053404412571056, + "learning_rate": 1.1162102248136264e-06, + "loss": 0.5044, + "step": 6363 + }, + { + "epoch": 0.79, + "grad_norm": 1.6287425902785133, + "learning_rate": 1.114944819119707e-06, + "loss": 0.4668, + "step": 6364 + }, + { + "epoch": 0.79, + "grad_norm": 1.7416565194171552, + "learning_rate": 1.1136800410906672e-06, + "loss": 0.4914, + "step": 6365 + }, + { + "epoch": 0.79, + "grad_norm": 4.085037618074328, + "learning_rate": 1.1124158909308458e-06, + "loss": 0.4845, + "step": 6366 + }, + { + "epoch": 0.79, + "grad_norm": 1.7980273651122092, + "learning_rate": 1.1111523688444741e-06, + "loss": 0.4781, + "step": 6367 + }, + { + "epoch": 0.79, + "grad_norm": 1.883979465274335, + "learning_rate": 1.1098894750356893e-06, + "loss": 0.5242, + "step": 6368 + }, + { + "epoch": 0.79, + "grad_norm": 1.313537169348473, + "learning_rate": 1.1086272097085187e-06, + "loss": 0.4718, + "step": 6369 + }, + { + "epoch": 0.79, + "grad_norm": 1.9156618835053345, + "learning_rate": 1.1073655730668965e-06, + "loss": 0.5015, + "step": 6370 + }, + { + "epoch": 0.79, + "grad_norm": 9.284402050371876, + "learning_rate": 1.1061045653146463e-06, + "loss": 0.4395, + "step": 6371 + }, + { + "epoch": 0.79, + "grad_norm": 1.4106454134612665, + "learning_rate": 1.1048441866555004e-06, + "loss": 0.4857, + "step": 6372 + }, + { + "epoch": 0.79, + "grad_norm": 1.622936999846198, + "learning_rate": 1.103584437293082e-06, + "loss": 0.4752, + "step": 6373 + }, + { + "epoch": 0.79, + "grad_norm": 1.5432383878844, + "learning_rate": 1.1023253174309128e-06, + "loss": 0.4941, + "step": 6374 + }, + { + "epoch": 0.79, + "grad_norm": 1.4826265580141185, + "learning_rate": 1.1010668272724178e-06, + "loss": 0.5047, + "step": 6375 + }, + { + "epoch": 0.79, + "grad_norm": 1.4461541616248892, + "learning_rate": 1.099808967020914e-06, + "loss": 0.5171, + "step": 6376 + }, + { + "epoch": 0.79, + "grad_norm": 1.6063058861931088, + "learning_rate": 1.0985517368796227e-06, + "loss": 0.5259, + "step": 6377 + }, + { + "epoch": 0.79, + "grad_norm": 1.6309292102124595, + "learning_rate": 1.0972951370516577e-06, + "loss": 0.565, + "step": 6378 + }, + { + "epoch": 0.79, + "grad_norm": 2.349710798461493, + "learning_rate": 1.0960391677400373e-06, + "loss": 0.4981, + "step": 6379 + }, + { + "epoch": 0.79, + "grad_norm": 1.4939126100808078, + "learning_rate": 1.0947838291476687e-06, + "loss": 0.5147, + "step": 6380 + }, + { + "epoch": 0.79, + "grad_norm": 1.443005036475542, + "learning_rate": 1.0935291214773703e-06, + "loss": 0.4891, + "step": 6381 + }, + { + "epoch": 0.79, + "grad_norm": 1.5200846600311144, + "learning_rate": 1.0922750449318464e-06, + "loss": 0.4843, + "step": 6382 + }, + { + "epoch": 0.79, + "grad_norm": 1.9827824644081335, + "learning_rate": 1.0910215997137048e-06, + "loss": 0.4668, + "step": 6383 + }, + { + "epoch": 0.79, + "grad_norm": 2.380065844859165, + "learning_rate": 1.0897687860254514e-06, + "loss": 0.4843, + "step": 6384 + }, + { + "epoch": 0.79, + "grad_norm": 1.6967284179344584, + "learning_rate": 1.0885166040694884e-06, + "loss": 0.5127, + "step": 6385 + }, + { + "epoch": 0.79, + "grad_norm": 1.6695641296913672, + "learning_rate": 1.0872650540481178e-06, + "loss": 0.4706, + "step": 6386 + }, + { + "epoch": 0.79, + "grad_norm": 1.5456696537393462, + "learning_rate": 1.0860141361635374e-06, + "loss": 0.4878, + "step": 6387 + }, + { + "epoch": 0.79, + "grad_norm": 1.2956173483980842, + "learning_rate": 1.0847638506178444e-06, + "loss": 0.5433, + "step": 6388 + }, + { + "epoch": 0.79, + "grad_norm": 1.3262687673658258, + "learning_rate": 1.0835141976130331e-06, + "loss": 0.4642, + "step": 6389 + }, + { + "epoch": 0.79, + "grad_norm": 1.64537801718758, + "learning_rate": 1.0822651773509984e-06, + "loss": 0.5271, + "step": 6390 + }, + { + "epoch": 0.79, + "grad_norm": 0.6997776708532146, + "learning_rate": 1.0810167900335283e-06, + "loss": 0.4831, + "step": 6391 + }, + { + "epoch": 0.79, + "grad_norm": 1.6522170913115182, + "learning_rate": 1.0797690358623103e-06, + "loss": 0.4778, + "step": 6392 + }, + { + "epoch": 0.79, + "grad_norm": 1.3692356724278743, + "learning_rate": 1.078521915038932e-06, + "loss": 0.4815, + "step": 6393 + }, + { + "epoch": 0.79, + "grad_norm": 1.3408947630589219, + "learning_rate": 1.0772754277648739e-06, + "loss": 0.4349, + "step": 6394 + }, + { + "epoch": 0.79, + "grad_norm": 1.8953971727722827, + "learning_rate": 1.0760295742415205e-06, + "loss": 0.4876, + "step": 6395 + }, + { + "epoch": 0.79, + "grad_norm": 1.53510002475653, + "learning_rate": 1.0747843546701452e-06, + "loss": 0.4715, + "step": 6396 + }, + { + "epoch": 0.79, + "grad_norm": 0.6980999378389511, + "learning_rate": 1.0735397692519312e-06, + "loss": 0.5138, + "step": 6397 + }, + { + "epoch": 0.79, + "grad_norm": 2.558202819615058, + "learning_rate": 1.0722958181879466e-06, + "loss": 0.4924, + "step": 6398 + }, + { + "epoch": 0.79, + "grad_norm": 1.7068775428592562, + "learning_rate": 1.0710525016791667e-06, + "loss": 0.5051, + "step": 6399 + }, + { + "epoch": 0.79, + "grad_norm": 1.6023047898555305, + "learning_rate": 1.0698098199264578e-06, + "loss": 0.5152, + "step": 6400 + }, + { + "epoch": 0.79, + "grad_norm": 1.4369425786969896, + "learning_rate": 1.0685677731305855e-06, + "loss": 0.4318, + "step": 6401 + }, + { + "epoch": 0.79, + "grad_norm": 1.5998174512047107, + "learning_rate": 1.0673263614922152e-06, + "loss": 0.558, + "step": 6402 + }, + { + "epoch": 0.79, + "grad_norm": 1.7877368017433597, + "learning_rate": 1.0660855852119062e-06, + "loss": 0.4984, + "step": 6403 + }, + { + "epoch": 0.79, + "grad_norm": 1.9796036482449146, + "learning_rate": 1.0648454444901179e-06, + "loss": 0.4578, + "step": 6404 + }, + { + "epoch": 0.79, + "grad_norm": 2.882827935795179, + "learning_rate": 1.0636059395272053e-06, + "loss": 0.4866, + "step": 6405 + }, + { + "epoch": 0.79, + "grad_norm": 9.42440315165274, + "learning_rate": 1.0623670705234235e-06, + "loss": 0.515, + "step": 6406 + }, + { + "epoch": 0.8, + "grad_norm": 2.3774620416132493, + "learning_rate": 1.0611288376789198e-06, + "loss": 0.4862, + "step": 6407 + }, + { + "epoch": 0.8, + "grad_norm": 1.3592757543233827, + "learning_rate": 1.059891241193744e-06, + "loss": 0.5142, + "step": 6408 + }, + { + "epoch": 0.8, + "grad_norm": 1.4592048124895594, + "learning_rate": 1.058654281267838e-06, + "loss": 0.5171, + "step": 6409 + }, + { + "epoch": 0.8, + "grad_norm": 1.4238846191505734, + "learning_rate": 1.057417958101047e-06, + "loss": 0.5033, + "step": 6410 + }, + { + "epoch": 0.8, + "grad_norm": 2.4464240773430705, + "learning_rate": 1.0561822718931074e-06, + "loss": 0.5016, + "step": 6411 + }, + { + "epoch": 0.8, + "grad_norm": 1.6974068949415977, + "learning_rate": 1.0549472228436535e-06, + "loss": 0.4979, + "step": 6412 + }, + { + "epoch": 0.8, + "grad_norm": 1.3602079975816623, + "learning_rate": 1.0537128111522232e-06, + "loss": 0.4518, + "step": 6413 + }, + { + "epoch": 0.8, + "grad_norm": 0.6865326422197805, + "learning_rate": 1.0524790370182431e-06, + "loss": 0.4919, + "step": 6414 + }, + { + "epoch": 0.8, + "grad_norm": 1.3161318499696908, + "learning_rate": 1.0512459006410424e-06, + "loss": 0.4716, + "step": 6415 + }, + { + "epoch": 0.8, + "grad_norm": 1.3809273299426368, + "learning_rate": 1.0500134022198421e-06, + "loss": 0.4707, + "step": 6416 + }, + { + "epoch": 0.8, + "grad_norm": 1.358364784189713, + "learning_rate": 1.0487815419537672e-06, + "loss": 0.4555, + "step": 6417 + }, + { + "epoch": 0.8, + "grad_norm": 2.2188671693165967, + "learning_rate": 1.047550320041832e-06, + "loss": 0.4906, + "step": 6418 + }, + { + "epoch": 0.8, + "grad_norm": 1.8062966823296596, + "learning_rate": 1.0463197366829536e-06, + "loss": 0.4798, + "step": 6419 + }, + { + "epoch": 0.8, + "grad_norm": 1.5575137348150088, + "learning_rate": 1.0450897920759422e-06, + "loss": 0.515, + "step": 6420 + }, + { + "epoch": 0.8, + "grad_norm": 1.4850541577111969, + "learning_rate": 1.043860486419505e-06, + "loss": 0.4679, + "step": 6421 + }, + { + "epoch": 0.8, + "grad_norm": 1.4781014051994306, + "learning_rate": 1.042631819912251e-06, + "loss": 0.4556, + "step": 6422 + }, + { + "epoch": 0.8, + "grad_norm": 3.3726640581035268, + "learning_rate": 1.0414037927526782e-06, + "loss": 0.4647, + "step": 6423 + }, + { + "epoch": 0.8, + "grad_norm": 1.714250634087497, + "learning_rate": 1.0401764051391888e-06, + "loss": 0.4789, + "step": 6424 + }, + { + "epoch": 0.8, + "grad_norm": 1.4200836700258241, + "learning_rate": 1.0389496572700747e-06, + "loss": 0.4973, + "step": 6425 + }, + { + "epoch": 0.8, + "grad_norm": 1.3985361590853174, + "learning_rate": 1.0377235493435301e-06, + "loss": 0.4872, + "step": 6426 + }, + { + "epoch": 0.8, + "grad_norm": 1.414545373279879, + "learning_rate": 1.0364980815576419e-06, + "loss": 0.4704, + "step": 6427 + }, + { + "epoch": 0.8, + "grad_norm": 1.3847624550964073, + "learning_rate": 1.0352732541103972e-06, + "loss": 0.5241, + "step": 6428 + }, + { + "epoch": 0.8, + "grad_norm": 1.5204777902527213, + "learning_rate": 1.034049067199675e-06, + "loss": 0.4473, + "step": 6429 + }, + { + "epoch": 0.8, + "grad_norm": 1.3824734739922342, + "learning_rate": 1.032825521023255e-06, + "loss": 0.4806, + "step": 6430 + }, + { + "epoch": 0.8, + "grad_norm": 1.35143148368396, + "learning_rate": 1.031602615778814e-06, + "loss": 0.4947, + "step": 6431 + }, + { + "epoch": 0.8, + "grad_norm": 1.3017480279577303, + "learning_rate": 1.0303803516639187e-06, + "loss": 0.5221, + "step": 6432 + }, + { + "epoch": 0.8, + "grad_norm": 1.314222059951463, + "learning_rate": 1.0291587288760403e-06, + "loss": 0.5007, + "step": 6433 + }, + { + "epoch": 0.8, + "grad_norm": 1.7591906723145145, + "learning_rate": 1.02793774761254e-06, + "loss": 0.4792, + "step": 6434 + }, + { + "epoch": 0.8, + "grad_norm": 1.495983252455482, + "learning_rate": 1.026717408070681e-06, + "loss": 0.4376, + "step": 6435 + }, + { + "epoch": 0.8, + "grad_norm": 1.862959961522283, + "learning_rate": 1.0254977104476166e-06, + "loss": 0.4925, + "step": 6436 + }, + { + "epoch": 0.8, + "grad_norm": 1.4445140615487064, + "learning_rate": 1.0242786549404015e-06, + "loss": 0.515, + "step": 6437 + }, + { + "epoch": 0.8, + "grad_norm": 1.5709457175445063, + "learning_rate": 1.0230602417459862e-06, + "loss": 0.5253, + "step": 6438 + }, + { + "epoch": 0.8, + "grad_norm": 1.4846376046602925, + "learning_rate": 1.0218424710612135e-06, + "loss": 0.539, + "step": 6439 + }, + { + "epoch": 0.8, + "grad_norm": 1.805011782517603, + "learning_rate": 1.0206253430828266e-06, + "loss": 0.4773, + "step": 6440 + }, + { + "epoch": 0.8, + "grad_norm": 1.4557146952088784, + "learning_rate": 1.0194088580074618e-06, + "loss": 0.465, + "step": 6441 + }, + { + "epoch": 0.8, + "grad_norm": 1.380442482912851, + "learning_rate": 1.0181930160316554e-06, + "loss": 0.5045, + "step": 6442 + }, + { + "epoch": 0.8, + "grad_norm": 1.321381721792749, + "learning_rate": 1.0169778173518347e-06, + "loss": 0.4964, + "step": 6443 + }, + { + "epoch": 0.8, + "grad_norm": 4.081757579088301, + "learning_rate": 1.0157632621643282e-06, + "loss": 0.514, + "step": 6444 + }, + { + "epoch": 0.8, + "grad_norm": 1.6201478985872608, + "learning_rate": 1.0145493506653548e-06, + "loss": 0.5069, + "step": 6445 + }, + { + "epoch": 0.8, + "grad_norm": 1.6145747097101975, + "learning_rate": 1.0133360830510352e-06, + "loss": 0.4738, + "step": 6446 + }, + { + "epoch": 0.8, + "grad_norm": 1.4517395862733522, + "learning_rate": 1.0121234595173835e-06, + "loss": 0.5372, + "step": 6447 + }, + { + "epoch": 0.8, + "grad_norm": 2.3009646288536616, + "learning_rate": 1.0109114802603081e-06, + "loss": 0.4628, + "step": 6448 + }, + { + "epoch": 0.8, + "grad_norm": 1.5314688288647906, + "learning_rate": 1.0097001454756173e-06, + "loss": 0.5247, + "step": 6449 + }, + { + "epoch": 0.8, + "grad_norm": 1.3562688096522537, + "learning_rate": 1.0084894553590098e-06, + "loss": 0.4466, + "step": 6450 + }, + { + "epoch": 0.8, + "grad_norm": 1.3705066231074705, + "learning_rate": 1.0072794101060861e-06, + "loss": 0.4579, + "step": 6451 + }, + { + "epoch": 0.8, + "grad_norm": 1.5832753927850254, + "learning_rate": 1.0060700099123372e-06, + "loss": 0.4806, + "step": 6452 + }, + { + "epoch": 0.8, + "grad_norm": 1.8103783315737578, + "learning_rate": 1.004861254973154e-06, + "loss": 0.4862, + "step": 6453 + }, + { + "epoch": 0.8, + "grad_norm": 2.247061170385736, + "learning_rate": 1.0036531454838216e-06, + "loss": 0.4913, + "step": 6454 + }, + { + "epoch": 0.8, + "grad_norm": 2.405826263478046, + "learning_rate": 1.0024456816395195e-06, + "loss": 0.511, + "step": 6455 + }, + { + "epoch": 0.8, + "grad_norm": 1.800506386193242, + "learning_rate": 1.001238863635326e-06, + "loss": 0.4625, + "step": 6456 + }, + { + "epoch": 0.8, + "grad_norm": 1.4412549678615536, + "learning_rate": 1.0000326916662112e-06, + "loss": 0.519, + "step": 6457 + }, + { + "epoch": 0.8, + "grad_norm": 1.3940917749864985, + "learning_rate": 9.988271659270444e-07, + "loss": 0.4812, + "step": 6458 + }, + { + "epoch": 0.8, + "grad_norm": 1.331626678614534, + "learning_rate": 9.976222866125867e-07, + "loss": 0.4947, + "step": 6459 + }, + { + "epoch": 0.8, + "grad_norm": 1.7731658384307778, + "learning_rate": 9.964180539175e-07, + "loss": 0.5341, + "step": 6460 + }, + { + "epoch": 0.8, + "grad_norm": 0.6299854711262632, + "learning_rate": 9.952144680363358e-07, + "loss": 0.5018, + "step": 6461 + }, + { + "epoch": 0.8, + "grad_norm": 1.7760926328349766, + "learning_rate": 9.940115291635448e-07, + "loss": 0.4784, + "step": 6462 + }, + { + "epoch": 0.8, + "grad_norm": 1.8962216803491019, + "learning_rate": 9.92809237493475e-07, + "loss": 0.47, + "step": 6463 + }, + { + "epoch": 0.8, + "grad_norm": 1.4401955974492697, + "learning_rate": 9.916075932203633e-07, + "loss": 0.4428, + "step": 6464 + }, + { + "epoch": 0.8, + "grad_norm": 1.4001781733936711, + "learning_rate": 9.904065965383492e-07, + "loss": 0.5315, + "step": 6465 + }, + { + "epoch": 0.8, + "grad_norm": 1.7370364220018046, + "learning_rate": 9.89206247641461e-07, + "loss": 0.566, + "step": 6466 + }, + { + "epoch": 0.8, + "grad_norm": 1.3805628292137204, + "learning_rate": 9.880065467236289e-07, + "loss": 0.4446, + "step": 6467 + }, + { + "epoch": 0.8, + "grad_norm": 1.4475562347146755, + "learning_rate": 9.868074939786727e-07, + "loss": 0.5286, + "step": 6468 + }, + { + "epoch": 0.8, + "grad_norm": 1.884552940235352, + "learning_rate": 9.856090896003117e-07, + "loss": 0.5077, + "step": 6469 + }, + { + "epoch": 0.8, + "grad_norm": 1.2709276223524975, + "learning_rate": 9.84411333782157e-07, + "loss": 0.4425, + "step": 6470 + }, + { + "epoch": 0.8, + "grad_norm": 1.7191717752920916, + "learning_rate": 9.83214226717717e-07, + "loss": 0.5699, + "step": 6471 + }, + { + "epoch": 0.8, + "grad_norm": 1.426700264224924, + "learning_rate": 9.820177686003972e-07, + "loss": 0.505, + "step": 6472 + }, + { + "epoch": 0.8, + "grad_norm": 1.6161588921373857, + "learning_rate": 9.808219596234924e-07, + "loss": 0.5044, + "step": 6473 + }, + { + "epoch": 0.8, + "grad_norm": 1.565981060733389, + "learning_rate": 9.796267999801995e-07, + "loss": 0.5106, + "step": 6474 + }, + { + "epoch": 0.8, + "grad_norm": 1.5211022442715696, + "learning_rate": 9.784322898636034e-07, + "loss": 0.5166, + "step": 6475 + }, + { + "epoch": 0.8, + "grad_norm": 1.4123861555634079, + "learning_rate": 9.772384294666915e-07, + "loss": 0.5146, + "step": 6476 + }, + { + "epoch": 0.8, + "grad_norm": 1.796078323450227, + "learning_rate": 9.760452189823394e-07, + "loss": 0.5041, + "step": 6477 + }, + { + "epoch": 0.8, + "grad_norm": 0.655518718867122, + "learning_rate": 9.748526586033219e-07, + "loss": 0.465, + "step": 6478 + }, + { + "epoch": 0.8, + "grad_norm": 1.321255408001864, + "learning_rate": 9.736607485223076e-07, + "loss": 0.4534, + "step": 6479 + }, + { + "epoch": 0.8, + "grad_norm": 0.5973847957315321, + "learning_rate": 9.724694889318615e-07, + "loss": 0.469, + "step": 6480 + }, + { + "epoch": 0.8, + "grad_norm": 1.7206749394132033, + "learning_rate": 9.71278880024441e-07, + "loss": 0.4653, + "step": 6481 + }, + { + "epoch": 0.8, + "grad_norm": 1.456188547690438, + "learning_rate": 9.700889219923975e-07, + "loss": 0.4847, + "step": 6482 + }, + { + "epoch": 0.8, + "grad_norm": 1.361231917322826, + "learning_rate": 9.688996150279827e-07, + "loss": 0.4994, + "step": 6483 + }, + { + "epoch": 0.8, + "grad_norm": 2.0411586611106918, + "learning_rate": 9.677109593233363e-07, + "loss": 0.4838, + "step": 6484 + }, + { + "epoch": 0.8, + "grad_norm": 1.6873790949681748, + "learning_rate": 9.665229550704991e-07, + "loss": 0.5037, + "step": 6485 + }, + { + "epoch": 0.8, + "grad_norm": 1.6506240442314934, + "learning_rate": 9.653356024613997e-07, + "loss": 0.4929, + "step": 6486 + }, + { + "epoch": 0.81, + "grad_norm": 0.6794439760619745, + "learning_rate": 9.641489016878702e-07, + "loss": 0.5285, + "step": 6487 + }, + { + "epoch": 0.81, + "grad_norm": 1.4150040353312494, + "learning_rate": 9.62962852941629e-07, + "loss": 0.5077, + "step": 6488 + }, + { + "epoch": 0.81, + "grad_norm": 1.390843234714611, + "learning_rate": 9.61777456414295e-07, + "loss": 0.4565, + "step": 6489 + }, + { + "epoch": 0.81, + "grad_norm": 2.2235657869784267, + "learning_rate": 9.60592712297379e-07, + "loss": 0.4837, + "step": 6490 + }, + { + "epoch": 0.81, + "grad_norm": 1.4018516966954073, + "learning_rate": 9.594086207822845e-07, + "loss": 0.4996, + "step": 6491 + }, + { + "epoch": 0.81, + "grad_norm": 1.8108744655621944, + "learning_rate": 9.58225182060315e-07, + "loss": 0.4742, + "step": 6492 + }, + { + "epoch": 0.81, + "grad_norm": 1.6366771711601844, + "learning_rate": 9.570423963226632e-07, + "loss": 0.4805, + "step": 6493 + }, + { + "epoch": 0.81, + "grad_norm": 0.6650749303310582, + "learning_rate": 9.558602637604192e-07, + "loss": 0.5174, + "step": 6494 + }, + { + "epoch": 0.81, + "grad_norm": 2.208185061384778, + "learning_rate": 9.546787845645672e-07, + "loss": 0.554, + "step": 6495 + }, + { + "epoch": 0.81, + "grad_norm": 1.4893339674896775, + "learning_rate": 9.534979589259869e-07, + "loss": 0.4519, + "step": 6496 + }, + { + "epoch": 0.81, + "grad_norm": 0.6165958466542163, + "learning_rate": 9.523177870354483e-07, + "loss": 0.4916, + "step": 6497 + }, + { + "epoch": 0.81, + "grad_norm": 1.7710197037064592, + "learning_rate": 9.511382690836213e-07, + "loss": 0.5109, + "step": 6498 + }, + { + "epoch": 0.81, + "grad_norm": 1.3793051055716132, + "learning_rate": 9.499594052610661e-07, + "loss": 0.4939, + "step": 6499 + }, + { + "epoch": 0.81, + "grad_norm": 2.0578189131865265, + "learning_rate": 9.487811957582376e-07, + "loss": 0.4535, + "step": 6500 + }, + { + "epoch": 0.81, + "grad_norm": 1.5081857196219226, + "learning_rate": 9.476036407654881e-07, + "loss": 0.4491, + "step": 6501 + }, + { + "epoch": 0.81, + "grad_norm": 1.456257458370949, + "learning_rate": 9.464267404730593e-07, + "loss": 0.4809, + "step": 6502 + }, + { + "epoch": 0.81, + "grad_norm": 8.474834198528999, + "learning_rate": 9.452504950710911e-07, + "loss": 0.4986, + "step": 6503 + }, + { + "epoch": 0.81, + "grad_norm": 0.7138505129091549, + "learning_rate": 9.440749047496168e-07, + "loss": 0.5048, + "step": 6504 + }, + { + "epoch": 0.81, + "grad_norm": 1.4093392810821976, + "learning_rate": 9.428999696985642e-07, + "loss": 0.4755, + "step": 6505 + }, + { + "epoch": 0.81, + "grad_norm": 1.6833102214769327, + "learning_rate": 9.417256901077515e-07, + "loss": 0.487, + "step": 6506 + }, + { + "epoch": 0.81, + "grad_norm": 1.3875066026251863, + "learning_rate": 9.405520661668977e-07, + "loss": 0.5175, + "step": 6507 + }, + { + "epoch": 0.81, + "grad_norm": 1.4386284006670491, + "learning_rate": 9.393790980656081e-07, + "loss": 0.4967, + "step": 6508 + }, + { + "epoch": 0.81, + "grad_norm": 1.477341431126602, + "learning_rate": 9.382067859933897e-07, + "loss": 0.4585, + "step": 6509 + }, + { + "epoch": 0.81, + "grad_norm": 1.3587078333246, + "learning_rate": 9.37035130139638e-07, + "loss": 0.4865, + "step": 6510 + }, + { + "epoch": 0.81, + "grad_norm": 1.4243793950309338, + "learning_rate": 9.358641306936416e-07, + "loss": 0.513, + "step": 6511 + }, + { + "epoch": 0.81, + "grad_norm": 1.4195418236296353, + "learning_rate": 9.346937878445916e-07, + "loss": 0.4805, + "step": 6512 + }, + { + "epoch": 0.81, + "grad_norm": 1.6071698001178198, + "learning_rate": 9.335241017815627e-07, + "loss": 0.5024, + "step": 6513 + }, + { + "epoch": 0.81, + "grad_norm": 2.5025394272531454, + "learning_rate": 9.323550726935304e-07, + "loss": 0.562, + "step": 6514 + }, + { + "epoch": 0.81, + "grad_norm": 1.531377625714681, + "learning_rate": 9.311867007693598e-07, + "loss": 0.4915, + "step": 6515 + }, + { + "epoch": 0.81, + "grad_norm": 1.7240203212532423, + "learning_rate": 9.300189861978143e-07, + "loss": 0.4966, + "step": 6516 + }, + { + "epoch": 0.81, + "grad_norm": 1.6216464797284622, + "learning_rate": 9.288519291675451e-07, + "loss": 0.4925, + "step": 6517 + }, + { + "epoch": 0.81, + "grad_norm": 1.5411812159660658, + "learning_rate": 9.276855298671033e-07, + "loss": 0.5281, + "step": 6518 + }, + { + "epoch": 0.81, + "grad_norm": 1.580795284714528, + "learning_rate": 9.265197884849286e-07, + "loss": 0.4922, + "step": 6519 + }, + { + "epoch": 0.81, + "grad_norm": 1.5446521341749373, + "learning_rate": 9.253547052093587e-07, + "loss": 0.4868, + "step": 6520 + }, + { + "epoch": 0.81, + "grad_norm": 1.3983084243758115, + "learning_rate": 9.241902802286229e-07, + "loss": 0.4888, + "step": 6521 + }, + { + "epoch": 0.81, + "grad_norm": 1.308543603312496, + "learning_rate": 9.23026513730843e-07, + "loss": 0.4892, + "step": 6522 + }, + { + "epoch": 0.81, + "grad_norm": 1.7423299968073849, + "learning_rate": 9.218634059040377e-07, + "loss": 0.493, + "step": 6523 + }, + { + "epoch": 0.81, + "grad_norm": 1.3112906377656586, + "learning_rate": 9.207009569361153e-07, + "loss": 0.5326, + "step": 6524 + }, + { + "epoch": 0.81, + "grad_norm": 1.4610564692928665, + "learning_rate": 9.195391670148812e-07, + "loss": 0.4936, + "step": 6525 + }, + { + "epoch": 0.81, + "grad_norm": 1.890123244074039, + "learning_rate": 9.183780363280314e-07, + "loss": 0.4905, + "step": 6526 + }, + { + "epoch": 0.81, + "grad_norm": 1.3160123399401755, + "learning_rate": 9.172175650631576e-07, + "loss": 0.4919, + "step": 6527 + }, + { + "epoch": 0.81, + "grad_norm": 0.6255882144586874, + "learning_rate": 9.160577534077453e-07, + "loss": 0.445, + "step": 6528 + }, + { + "epoch": 0.81, + "grad_norm": 1.5051745110594048, + "learning_rate": 9.148986015491695e-07, + "loss": 0.4883, + "step": 6529 + }, + { + "epoch": 0.81, + "grad_norm": 0.6865982859562735, + "learning_rate": 9.137401096747045e-07, + "loss": 0.501, + "step": 6530 + }, + { + "epoch": 0.81, + "grad_norm": 1.9954453456068366, + "learning_rate": 9.125822779715121e-07, + "loss": 0.4812, + "step": 6531 + }, + { + "epoch": 0.81, + "grad_norm": 1.683593194230567, + "learning_rate": 9.114251066266527e-07, + "loss": 0.5226, + "step": 6532 + }, + { + "epoch": 0.81, + "grad_norm": 2.165224842620645, + "learning_rate": 9.102685958270746e-07, + "loss": 0.5188, + "step": 6533 + }, + { + "epoch": 0.81, + "grad_norm": 1.8715140158538006, + "learning_rate": 9.09112745759626e-07, + "loss": 0.4523, + "step": 6534 + }, + { + "epoch": 0.81, + "grad_norm": 0.578535773480896, + "learning_rate": 9.07957556611041e-07, + "loss": 0.4604, + "step": 6535 + }, + { + "epoch": 0.81, + "grad_norm": 1.1796850042322746, + "learning_rate": 9.068030285679519e-07, + "loss": 0.4398, + "step": 6536 + }, + { + "epoch": 0.81, + "grad_norm": 0.7277912894351544, + "learning_rate": 9.05649161816885e-07, + "loss": 0.4712, + "step": 6537 + }, + { + "epoch": 0.81, + "grad_norm": 1.6811486351535663, + "learning_rate": 9.044959565442541e-07, + "loss": 0.5187, + "step": 6538 + }, + { + "epoch": 0.81, + "grad_norm": 1.3938299493600599, + "learning_rate": 9.033434129363727e-07, + "loss": 0.5106, + "step": 6539 + }, + { + "epoch": 0.81, + "grad_norm": 1.865073353995794, + "learning_rate": 9.021915311794421e-07, + "loss": 0.4647, + "step": 6540 + }, + { + "epoch": 0.81, + "grad_norm": 1.3106511091520945, + "learning_rate": 9.010403114595612e-07, + "loss": 0.4894, + "step": 6541 + }, + { + "epoch": 0.81, + "grad_norm": 1.6391450455268382, + "learning_rate": 8.998897539627172e-07, + "loss": 0.4969, + "step": 6542 + }, + { + "epoch": 0.81, + "grad_norm": 1.4456021413174263, + "learning_rate": 8.987398588747948e-07, + "loss": 0.4843, + "step": 6543 + }, + { + "epoch": 0.81, + "grad_norm": 2.5590898039625034, + "learning_rate": 8.975906263815681e-07, + "loss": 0.5073, + "step": 6544 + }, + { + "epoch": 0.81, + "grad_norm": 1.665956667537492, + "learning_rate": 8.96442056668706e-07, + "loss": 0.5101, + "step": 6545 + }, + { + "epoch": 0.81, + "grad_norm": 1.8921706826375544, + "learning_rate": 8.952941499217722e-07, + "loss": 0.4772, + "step": 6546 + }, + { + "epoch": 0.81, + "grad_norm": 3.2156733101385413, + "learning_rate": 8.941469063262182e-07, + "loss": 0.4468, + "step": 6547 + }, + { + "epoch": 0.81, + "grad_norm": 1.5766155722354083, + "learning_rate": 8.930003260673936e-07, + "loss": 0.5071, + "step": 6548 + }, + { + "epoch": 0.81, + "grad_norm": 1.545352202694005, + "learning_rate": 8.91854409330537e-07, + "loss": 0.4885, + "step": 6549 + }, + { + "epoch": 0.81, + "grad_norm": 1.31182920256894, + "learning_rate": 8.907091563007824e-07, + "loss": 0.4635, + "step": 6550 + }, + { + "epoch": 0.81, + "grad_norm": 1.6185349718289779, + "learning_rate": 8.895645671631542e-07, + "loss": 0.5057, + "step": 6551 + }, + { + "epoch": 0.81, + "grad_norm": 1.540854118339132, + "learning_rate": 8.884206421025715e-07, + "loss": 0.4671, + "step": 6552 + }, + { + "epoch": 0.81, + "grad_norm": 1.4774845635395661, + "learning_rate": 8.872773813038466e-07, + "loss": 0.5321, + "step": 6553 + }, + { + "epoch": 0.81, + "grad_norm": 1.5020064700053972, + "learning_rate": 8.861347849516816e-07, + "loss": 0.5313, + "step": 6554 + }, + { + "epoch": 0.81, + "grad_norm": 0.7142941992414023, + "learning_rate": 8.849928532306745e-07, + "loss": 0.5107, + "step": 6555 + }, + { + "epoch": 0.81, + "grad_norm": 1.91307148790412, + "learning_rate": 8.838515863253128e-07, + "loss": 0.5381, + "step": 6556 + }, + { + "epoch": 0.81, + "grad_norm": 2.6351845953656223, + "learning_rate": 8.827109844199805e-07, + "loss": 0.4757, + "step": 6557 + }, + { + "epoch": 0.81, + "grad_norm": 1.6147448240438045, + "learning_rate": 8.815710476989487e-07, + "loss": 0.5138, + "step": 6558 + }, + { + "epoch": 0.81, + "grad_norm": 2.1570591241507078, + "learning_rate": 8.804317763463877e-07, + "loss": 0.4723, + "step": 6559 + }, + { + "epoch": 0.81, + "grad_norm": 2.0196638013532677, + "learning_rate": 8.792931705463542e-07, + "loss": 0.5151, + "step": 6560 + }, + { + "epoch": 0.81, + "grad_norm": 1.358548606885228, + "learning_rate": 8.781552304828006e-07, + "loss": 0.4373, + "step": 6561 + }, + { + "epoch": 0.81, + "grad_norm": 1.5555876320839892, + "learning_rate": 8.770179563395725e-07, + "loss": 0.5272, + "step": 6562 + }, + { + "epoch": 0.81, + "grad_norm": 1.873411723591231, + "learning_rate": 8.758813483004047e-07, + "loss": 0.558, + "step": 6563 + }, + { + "epoch": 0.81, + "grad_norm": 2.5335119418462217, + "learning_rate": 8.74745406548928e-07, + "loss": 0.4761, + "step": 6564 + }, + { + "epoch": 0.81, + "grad_norm": 1.4249307697946578, + "learning_rate": 8.736101312686618e-07, + "loss": 0.525, + "step": 6565 + }, + { + "epoch": 0.81, + "grad_norm": 1.7083767116323012, + "learning_rate": 8.72475522643022e-07, + "loss": 0.4557, + "step": 6566 + }, + { + "epoch": 0.81, + "grad_norm": 1.5754117600788526, + "learning_rate": 8.713415808553121e-07, + "loss": 0.4816, + "step": 6567 + }, + { + "epoch": 0.82, + "grad_norm": 1.6529075246510267, + "learning_rate": 8.702083060887323e-07, + "loss": 0.4781, + "step": 6568 + }, + { + "epoch": 0.82, + "grad_norm": 1.5804308238440943, + "learning_rate": 8.690756985263737e-07, + "loss": 0.4972, + "step": 6569 + }, + { + "epoch": 0.82, + "grad_norm": 1.7295768902466224, + "learning_rate": 8.679437583512168e-07, + "loss": 0.5174, + "step": 6570 + }, + { + "epoch": 0.82, + "grad_norm": 0.6384249650212698, + "learning_rate": 8.668124857461385e-07, + "loss": 0.5158, + "step": 6571 + }, + { + "epoch": 0.82, + "grad_norm": 3.3816051165201575, + "learning_rate": 8.656818808939038e-07, + "loss": 0.4673, + "step": 6572 + }, + { + "epoch": 0.82, + "grad_norm": 1.4216938442872074, + "learning_rate": 8.645519439771744e-07, + "loss": 0.4542, + "step": 6573 + }, + { + "epoch": 0.82, + "grad_norm": 1.4833293871723272, + "learning_rate": 8.634226751784991e-07, + "loss": 0.5507, + "step": 6574 + }, + { + "epoch": 0.82, + "grad_norm": 1.606769771979235, + "learning_rate": 8.622940746803238e-07, + "loss": 0.5216, + "step": 6575 + }, + { + "epoch": 0.82, + "grad_norm": 1.348401256388603, + "learning_rate": 8.611661426649809e-07, + "loss": 0.4688, + "step": 6576 + }, + { + "epoch": 0.82, + "grad_norm": 1.6077809246600447, + "learning_rate": 8.600388793146991e-07, + "loss": 0.5163, + "step": 6577 + }, + { + "epoch": 0.82, + "grad_norm": 2.4424136275117476, + "learning_rate": 8.589122848115977e-07, + "loss": 0.5012, + "step": 6578 + }, + { + "epoch": 0.82, + "grad_norm": 2.6050305815590833, + "learning_rate": 8.5778635933769e-07, + "loss": 0.4884, + "step": 6579 + }, + { + "epoch": 0.82, + "grad_norm": 1.4093501853324268, + "learning_rate": 8.566611030748767e-07, + "loss": 0.4972, + "step": 6580 + }, + { + "epoch": 0.82, + "grad_norm": 1.4112177810139739, + "learning_rate": 8.555365162049529e-07, + "loss": 0.5028, + "step": 6581 + }, + { + "epoch": 0.82, + "grad_norm": 2.0270135079817404, + "learning_rate": 8.544125989096063e-07, + "loss": 0.5033, + "step": 6582 + }, + { + "epoch": 0.82, + "grad_norm": 1.2998776012858089, + "learning_rate": 8.532893513704149e-07, + "loss": 0.4833, + "step": 6583 + }, + { + "epoch": 0.82, + "grad_norm": 1.483841216926302, + "learning_rate": 8.52166773768851e-07, + "loss": 0.472, + "step": 6584 + }, + { + "epoch": 0.82, + "grad_norm": 2.3508207620418844, + "learning_rate": 8.51044866286273e-07, + "loss": 0.46, + "step": 6585 + }, + { + "epoch": 0.82, + "grad_norm": 1.653697562371862, + "learning_rate": 8.499236291039403e-07, + "loss": 0.5096, + "step": 6586 + }, + { + "epoch": 0.82, + "grad_norm": 1.5744504468038782, + "learning_rate": 8.488030624029947e-07, + "loss": 0.5106, + "step": 6587 + }, + { + "epoch": 0.82, + "grad_norm": 1.463803355129119, + "learning_rate": 8.476831663644758e-07, + "loss": 0.4511, + "step": 6588 + }, + { + "epoch": 0.82, + "grad_norm": 1.6581237351175009, + "learning_rate": 8.465639411693116e-07, + "loss": 0.5617, + "step": 6589 + }, + { + "epoch": 0.82, + "grad_norm": 1.5768383400051669, + "learning_rate": 8.45445386998322e-07, + "loss": 0.5398, + "step": 6590 + }, + { + "epoch": 0.82, + "grad_norm": 1.6111736629103783, + "learning_rate": 8.443275040322213e-07, + "loss": 0.4301, + "step": 6591 + }, + { + "epoch": 0.82, + "grad_norm": 1.5076339706999673, + "learning_rate": 8.432102924516112e-07, + "loss": 0.4785, + "step": 6592 + }, + { + "epoch": 0.82, + "grad_norm": 2.8147989208223385, + "learning_rate": 8.420937524369876e-07, + "loss": 0.5381, + "step": 6593 + }, + { + "epoch": 0.82, + "grad_norm": 1.5861599435098752, + "learning_rate": 8.409778841687383e-07, + "loss": 0.4792, + "step": 6594 + }, + { + "epoch": 0.82, + "grad_norm": 1.7391419145258895, + "learning_rate": 8.398626878271421e-07, + "loss": 0.5066, + "step": 6595 + }, + { + "epoch": 0.82, + "grad_norm": 1.3562454526494723, + "learning_rate": 8.387481635923667e-07, + "loss": 0.4684, + "step": 6596 + }, + { + "epoch": 0.82, + "grad_norm": 1.680088091115047, + "learning_rate": 8.376343116444763e-07, + "loss": 0.5065, + "step": 6597 + }, + { + "epoch": 0.82, + "grad_norm": 0.6406248070930465, + "learning_rate": 8.365211321634209e-07, + "loss": 0.5093, + "step": 6598 + }, + { + "epoch": 0.82, + "grad_norm": 1.4648655817523175, + "learning_rate": 8.354086253290445e-07, + "loss": 0.4623, + "step": 6599 + }, + { + "epoch": 0.82, + "grad_norm": 0.7348006256424257, + "learning_rate": 8.342967913210843e-07, + "loss": 0.5108, + "step": 6600 + }, + { + "epoch": 0.82, + "grad_norm": 1.6226349665887883, + "learning_rate": 8.331856303191632e-07, + "loss": 0.4747, + "step": 6601 + }, + { + "epoch": 0.82, + "grad_norm": 1.3900268223214782, + "learning_rate": 8.320751425028039e-07, + "loss": 0.5242, + "step": 6602 + }, + { + "epoch": 0.82, + "grad_norm": 1.5022996561742294, + "learning_rate": 8.309653280514118e-07, + "loss": 0.4604, + "step": 6603 + }, + { + "epoch": 0.82, + "grad_norm": 0.6514066294972094, + "learning_rate": 8.2985618714429e-07, + "loss": 0.4741, + "step": 6604 + }, + { + "epoch": 0.82, + "grad_norm": 1.3691633878283713, + "learning_rate": 8.287477199606276e-07, + "loss": 0.4871, + "step": 6605 + }, + { + "epoch": 0.82, + "grad_norm": 1.5466999097901013, + "learning_rate": 8.276399266795088e-07, + "loss": 0.5516, + "step": 6606 + }, + { + "epoch": 0.82, + "grad_norm": 1.4721951482256834, + "learning_rate": 8.265328074799067e-07, + "loss": 0.5031, + "step": 6607 + }, + { + "epoch": 0.82, + "grad_norm": 0.6451556978401359, + "learning_rate": 8.254263625406856e-07, + "loss": 0.5001, + "step": 6608 + }, + { + "epoch": 0.82, + "grad_norm": 1.4413842515562194, + "learning_rate": 8.243205920406016e-07, + "loss": 0.5116, + "step": 6609 + }, + { + "epoch": 0.82, + "grad_norm": 1.287070423719794, + "learning_rate": 8.232154961583017e-07, + "loss": 0.4357, + "step": 6610 + }, + { + "epoch": 0.82, + "grad_norm": 1.578682068297509, + "learning_rate": 8.22111075072326e-07, + "loss": 0.4841, + "step": 6611 + }, + { + "epoch": 0.82, + "grad_norm": 1.4041042897849592, + "learning_rate": 8.210073289611004e-07, + "loss": 0.486, + "step": 6612 + }, + { + "epoch": 0.82, + "grad_norm": 1.4408155137435312, + "learning_rate": 8.19904258002947e-07, + "loss": 0.4741, + "step": 6613 + }, + { + "epoch": 0.82, + "grad_norm": 1.8826684892795102, + "learning_rate": 8.188018623760747e-07, + "loss": 0.4876, + "step": 6614 + }, + { + "epoch": 0.82, + "grad_norm": 1.4705849420242674, + "learning_rate": 8.177001422585873e-07, + "loss": 0.4982, + "step": 6615 + }, + { + "epoch": 0.82, + "grad_norm": 1.6166224577611759, + "learning_rate": 8.165990978284749e-07, + "loss": 0.483, + "step": 6616 + }, + { + "epoch": 0.82, + "grad_norm": 0.659097390392933, + "learning_rate": 8.154987292636241e-07, + "loss": 0.471, + "step": 6617 + }, + { + "epoch": 0.82, + "grad_norm": 1.5422971791203492, + "learning_rate": 8.143990367418059e-07, + "loss": 0.5401, + "step": 6618 + }, + { + "epoch": 0.82, + "grad_norm": 3.252984211458063, + "learning_rate": 8.133000204406871e-07, + "loss": 0.4784, + "step": 6619 + }, + { + "epoch": 0.82, + "grad_norm": 1.4384644065425785, + "learning_rate": 8.122016805378241e-07, + "loss": 0.5327, + "step": 6620 + }, + { + "epoch": 0.82, + "grad_norm": 1.8644675432583973, + "learning_rate": 8.11104017210661e-07, + "loss": 0.5353, + "step": 6621 + }, + { + "epoch": 0.82, + "grad_norm": 1.6273030259451302, + "learning_rate": 8.100070306365382e-07, + "loss": 0.4845, + "step": 6622 + }, + { + "epoch": 0.82, + "grad_norm": 1.5151125444202667, + "learning_rate": 8.0891072099268e-07, + "loss": 0.4817, + "step": 6623 + }, + { + "epoch": 0.82, + "grad_norm": 1.6395495200122552, + "learning_rate": 8.07815088456208e-07, + "loss": 0.4903, + "step": 6624 + }, + { + "epoch": 0.82, + "grad_norm": 1.8833985121582997, + "learning_rate": 8.067201332041286e-07, + "loss": 0.5114, + "step": 6625 + }, + { + "epoch": 0.82, + "grad_norm": 1.2573482842800296, + "learning_rate": 8.056258554133433e-07, + "loss": 0.4473, + "step": 6626 + }, + { + "epoch": 0.82, + "grad_norm": 4.841027142125862, + "learning_rate": 8.045322552606427e-07, + "loss": 0.491, + "step": 6627 + }, + { + "epoch": 0.82, + "grad_norm": 1.4684161021406357, + "learning_rate": 8.034393329227052e-07, + "loss": 0.5239, + "step": 6628 + }, + { + "epoch": 0.82, + "grad_norm": 1.4732512569164948, + "learning_rate": 8.023470885761053e-07, + "loss": 0.4898, + "step": 6629 + }, + { + "epoch": 0.82, + "grad_norm": 3.2874992148845803, + "learning_rate": 8.012555223973012e-07, + "loss": 0.4937, + "step": 6630 + }, + { + "epoch": 0.82, + "grad_norm": 1.3111270367689432, + "learning_rate": 8.001646345626479e-07, + "loss": 0.4869, + "step": 6631 + }, + { + "epoch": 0.82, + "grad_norm": 1.5229838431069287, + "learning_rate": 7.990744252483856e-07, + "loss": 0.512, + "step": 6632 + }, + { + "epoch": 0.82, + "grad_norm": 1.2912528249724036, + "learning_rate": 7.979848946306496e-07, + "loss": 0.4723, + "step": 6633 + }, + { + "epoch": 0.82, + "grad_norm": 1.5275938148321992, + "learning_rate": 7.968960428854605e-07, + "loss": 0.5204, + "step": 6634 + }, + { + "epoch": 0.82, + "grad_norm": 1.5890097981606934, + "learning_rate": 7.958078701887334e-07, + "loss": 0.4763, + "step": 6635 + }, + { + "epoch": 0.82, + "grad_norm": 1.2712850160856939, + "learning_rate": 7.947203767162737e-07, + "loss": 0.4592, + "step": 6636 + }, + { + "epoch": 0.82, + "grad_norm": 1.6645130002721502, + "learning_rate": 7.936335626437725e-07, + "loss": 0.5086, + "step": 6637 + }, + { + "epoch": 0.82, + "grad_norm": 1.5569392743648256, + "learning_rate": 7.925474281468165e-07, + "loss": 0.479, + "step": 6638 + }, + { + "epoch": 0.82, + "grad_norm": 1.492305078322501, + "learning_rate": 7.914619734008782e-07, + "loss": 0.512, + "step": 6639 + }, + { + "epoch": 0.82, + "grad_norm": 1.4290671588367234, + "learning_rate": 7.903771985813252e-07, + "loss": 0.4731, + "step": 6640 + }, + { + "epoch": 0.82, + "grad_norm": 1.48286399941432, + "learning_rate": 7.892931038634094e-07, + "loss": 0.4881, + "step": 6641 + }, + { + "epoch": 0.82, + "grad_norm": 2.04536664575143, + "learning_rate": 7.882096894222774e-07, + "loss": 0.5289, + "step": 6642 + }, + { + "epoch": 0.82, + "grad_norm": 1.758295482436992, + "learning_rate": 7.871269554329652e-07, + "loss": 0.5026, + "step": 6643 + }, + { + "epoch": 0.82, + "grad_norm": 1.3460918611676007, + "learning_rate": 7.860449020703964e-07, + "loss": 0.4616, + "step": 6644 + }, + { + "epoch": 0.82, + "grad_norm": 2.689594433035852, + "learning_rate": 7.849635295093877e-07, + "loss": 0.4857, + "step": 6645 + }, + { + "epoch": 0.82, + "grad_norm": 0.6720809223765922, + "learning_rate": 7.838828379246422e-07, + "loss": 0.4977, + "step": 6646 + }, + { + "epoch": 0.82, + "grad_norm": 1.47095461092173, + "learning_rate": 7.828028274907579e-07, + "loss": 0.5237, + "step": 6647 + }, + { + "epoch": 0.83, + "grad_norm": 1.5393842391301888, + "learning_rate": 7.817234983822169e-07, + "loss": 0.4689, + "step": 6648 + }, + { + "epoch": 0.83, + "grad_norm": 1.4527967916406928, + "learning_rate": 7.806448507733977e-07, + "loss": 0.4854, + "step": 6649 + }, + { + "epoch": 0.83, + "grad_norm": 2.9815323734000887, + "learning_rate": 7.795668848385624e-07, + "loss": 0.5219, + "step": 6650 + }, + { + "epoch": 0.83, + "grad_norm": 1.7985118940600708, + "learning_rate": 7.784896007518667e-07, + "loss": 0.4939, + "step": 6651 + }, + { + "epoch": 0.83, + "grad_norm": 2.2185676476628795, + "learning_rate": 7.774129986873574e-07, + "loss": 0.5087, + "step": 6652 + }, + { + "epoch": 0.83, + "grad_norm": 1.635521286764626, + "learning_rate": 7.763370788189656e-07, + "loss": 0.5354, + "step": 6653 + }, + { + "epoch": 0.83, + "grad_norm": 1.3406571460882213, + "learning_rate": 7.752618413205193e-07, + "loss": 0.4728, + "step": 6654 + }, + { + "epoch": 0.83, + "grad_norm": 1.8504903138524444, + "learning_rate": 7.74187286365729e-07, + "loss": 0.5104, + "step": 6655 + }, + { + "epoch": 0.83, + "grad_norm": 1.389096639498019, + "learning_rate": 7.731134141282015e-07, + "loss": 0.5099, + "step": 6656 + }, + { + "epoch": 0.83, + "grad_norm": 5.5366339284376185, + "learning_rate": 7.720402247814279e-07, + "loss": 0.4948, + "step": 6657 + }, + { + "epoch": 0.83, + "grad_norm": 4.842372319506879, + "learning_rate": 7.70967718498794e-07, + "loss": 0.4862, + "step": 6658 + }, + { + "epoch": 0.83, + "grad_norm": 1.4324724171615184, + "learning_rate": 7.698958954535696e-07, + "loss": 0.5086, + "step": 6659 + }, + { + "epoch": 0.83, + "grad_norm": 1.522130642732948, + "learning_rate": 7.688247558189194e-07, + "loss": 0.5072, + "step": 6660 + }, + { + "epoch": 0.83, + "grad_norm": 1.4951917839977171, + "learning_rate": 7.67754299767895e-07, + "loss": 0.4972, + "step": 6661 + }, + { + "epoch": 0.83, + "grad_norm": 1.5651674502519535, + "learning_rate": 7.666845274734375e-07, + "loss": 0.4754, + "step": 6662 + }, + { + "epoch": 0.83, + "grad_norm": 1.5627366549655106, + "learning_rate": 7.656154391083786e-07, + "loss": 0.4705, + "step": 6663 + }, + { + "epoch": 0.83, + "grad_norm": 0.6720391307867858, + "learning_rate": 7.645470348454381e-07, + "loss": 0.5589, + "step": 6664 + }, + { + "epoch": 0.83, + "grad_norm": 3.376154003760944, + "learning_rate": 7.634793148572273e-07, + "loss": 0.4723, + "step": 6665 + }, + { + "epoch": 0.83, + "grad_norm": 1.8004953958144452, + "learning_rate": 7.624122793162442e-07, + "loss": 0.5408, + "step": 6666 + }, + { + "epoch": 0.83, + "grad_norm": 1.5105450022050773, + "learning_rate": 7.613459283948788e-07, + "loss": 0.5245, + "step": 6667 + }, + { + "epoch": 0.83, + "grad_norm": 1.5470366339253507, + "learning_rate": 7.602802622654109e-07, + "loss": 0.479, + "step": 6668 + }, + { + "epoch": 0.83, + "grad_norm": 1.4887463762505064, + "learning_rate": 7.592152811000053e-07, + "loss": 0.4652, + "step": 6669 + }, + { + "epoch": 0.83, + "grad_norm": 0.6424167995495772, + "learning_rate": 7.581509850707214e-07, + "loss": 0.4651, + "step": 6670 + }, + { + "epoch": 0.83, + "grad_norm": 1.493513154419008, + "learning_rate": 7.570873743495039e-07, + "loss": 0.5321, + "step": 6671 + }, + { + "epoch": 0.83, + "grad_norm": 1.4814326981200936, + "learning_rate": 7.560244491081903e-07, + "loss": 0.4731, + "step": 6672 + }, + { + "epoch": 0.83, + "grad_norm": 1.4213082522322267, + "learning_rate": 7.549622095185033e-07, + "loss": 0.5278, + "step": 6673 + }, + { + "epoch": 0.83, + "grad_norm": 1.290850849983848, + "learning_rate": 7.539006557520595e-07, + "loss": 0.4822, + "step": 6674 + }, + { + "epoch": 0.83, + "grad_norm": 1.4676546201481493, + "learning_rate": 7.528397879803589e-07, + "loss": 0.494, + "step": 6675 + }, + { + "epoch": 0.83, + "grad_norm": 1.9356018002672504, + "learning_rate": 7.517796063747984e-07, + "loss": 0.4418, + "step": 6676 + }, + { + "epoch": 0.83, + "grad_norm": 2.3003381438959716, + "learning_rate": 7.507201111066581e-07, + "loss": 0.5061, + "step": 6677 + }, + { + "epoch": 0.83, + "grad_norm": 0.9853811495656869, + "learning_rate": 7.496613023471061e-07, + "loss": 0.468, + "step": 6678 + }, + { + "epoch": 0.83, + "grad_norm": 1.9867056108884016, + "learning_rate": 7.486031802672061e-07, + "loss": 0.4867, + "step": 6679 + }, + { + "epoch": 0.83, + "grad_norm": 2.2765450132478833, + "learning_rate": 7.475457450379036e-07, + "loss": 0.4914, + "step": 6680 + }, + { + "epoch": 0.83, + "grad_norm": 1.230696818398601, + "learning_rate": 7.464889968300398e-07, + "loss": 0.4248, + "step": 6681 + }, + { + "epoch": 0.83, + "grad_norm": 1.4779685084729597, + "learning_rate": 7.454329358143392e-07, + "loss": 0.5187, + "step": 6682 + }, + { + "epoch": 0.83, + "grad_norm": 1.6978034202504741, + "learning_rate": 7.44377562161418e-07, + "loss": 0.525, + "step": 6683 + }, + { + "epoch": 0.83, + "grad_norm": 1.902718193616752, + "learning_rate": 7.433228760417827e-07, + "loss": 0.4744, + "step": 6684 + }, + { + "epoch": 0.83, + "grad_norm": 1.5683391925887717, + "learning_rate": 7.422688776258268e-07, + "loss": 0.4921, + "step": 6685 + }, + { + "epoch": 0.83, + "grad_norm": 1.7998659135311224, + "learning_rate": 7.412155670838311e-07, + "loss": 0.5272, + "step": 6686 + }, + { + "epoch": 0.83, + "grad_norm": 1.400954649684802, + "learning_rate": 7.401629445859704e-07, + "loss": 0.4615, + "step": 6687 + }, + { + "epoch": 0.83, + "grad_norm": 1.7090806767697804, + "learning_rate": 7.391110103023031e-07, + "loss": 0.5015, + "step": 6688 + }, + { + "epoch": 0.83, + "grad_norm": 1.3007966513175657, + "learning_rate": 7.38059764402777e-07, + "loss": 0.4713, + "step": 6689 + }, + { + "epoch": 0.83, + "grad_norm": 1.4708094230997422, + "learning_rate": 7.370092070572332e-07, + "loss": 0.4907, + "step": 6690 + }, + { + "epoch": 0.83, + "grad_norm": 2.5038268737583214, + "learning_rate": 7.359593384353958e-07, + "loss": 0.5051, + "step": 6691 + }, + { + "epoch": 0.83, + "grad_norm": 2.524279014891403, + "learning_rate": 7.349101587068819e-07, + "loss": 0.5052, + "step": 6692 + }, + { + "epoch": 0.83, + "grad_norm": 1.7172922101698749, + "learning_rate": 7.338616680411953e-07, + "loss": 0.4597, + "step": 6693 + }, + { + "epoch": 0.83, + "grad_norm": 1.5930470211813232, + "learning_rate": 7.3281386660773e-07, + "loss": 0.515, + "step": 6694 + }, + { + "epoch": 0.83, + "grad_norm": 1.8029058761861165, + "learning_rate": 7.317667545757656e-07, + "loss": 0.4888, + "step": 6695 + }, + { + "epoch": 0.83, + "grad_norm": 1.4923651663363493, + "learning_rate": 7.307203321144746e-07, + "loss": 0.4664, + "step": 6696 + }, + { + "epoch": 0.83, + "grad_norm": 1.6185986604616538, + "learning_rate": 7.296745993929144e-07, + "loss": 0.4886, + "step": 6697 + }, + { + "epoch": 0.83, + "grad_norm": 1.2183265656631697, + "learning_rate": 7.286295565800305e-07, + "loss": 0.5012, + "step": 6698 + }, + { + "epoch": 0.83, + "grad_norm": 3.0722141533527365, + "learning_rate": 7.275852038446623e-07, + "loss": 0.4462, + "step": 6699 + }, + { + "epoch": 0.83, + "grad_norm": 1.5310571180094792, + "learning_rate": 7.265415413555304e-07, + "loss": 0.5104, + "step": 6700 + }, + { + "epoch": 0.83, + "grad_norm": 1.5599434286674574, + "learning_rate": 7.254985692812517e-07, + "loss": 0.5282, + "step": 6701 + }, + { + "epoch": 0.83, + "grad_norm": 4.125125729176223, + "learning_rate": 7.244562877903244e-07, + "loss": 0.5671, + "step": 6702 + }, + { + "epoch": 0.83, + "grad_norm": 0.6534221527354256, + "learning_rate": 7.234146970511408e-07, + "loss": 0.4942, + "step": 6703 + }, + { + "epoch": 0.83, + "grad_norm": 1.4397086788418985, + "learning_rate": 7.22373797231976e-07, + "loss": 0.488, + "step": 6704 + }, + { + "epoch": 0.83, + "grad_norm": 1.4557116794569993, + "learning_rate": 7.213335885010004e-07, + "loss": 0.5201, + "step": 6705 + }, + { + "epoch": 0.83, + "grad_norm": 1.5797733791302986, + "learning_rate": 7.202940710262657e-07, + "loss": 0.5304, + "step": 6706 + }, + { + "epoch": 0.83, + "grad_norm": 0.6977736195425923, + "learning_rate": 7.192552449757157e-07, + "loss": 0.4691, + "step": 6707 + }, + { + "epoch": 0.83, + "grad_norm": 1.5101471426826862, + "learning_rate": 7.18217110517182e-07, + "loss": 0.5608, + "step": 6708 + }, + { + "epoch": 0.83, + "grad_norm": 1.5421036528060617, + "learning_rate": 7.171796678183845e-07, + "loss": 0.4554, + "step": 6709 + }, + { + "epoch": 0.83, + "grad_norm": 1.8703537498179146, + "learning_rate": 7.161429170469325e-07, + "loss": 0.4687, + "step": 6710 + }, + { + "epoch": 0.83, + "grad_norm": 3.48883273820731, + "learning_rate": 7.151068583703202e-07, + "loss": 0.4986, + "step": 6711 + }, + { + "epoch": 0.83, + "grad_norm": 1.6717400136691807, + "learning_rate": 7.140714919559339e-07, + "loss": 0.5116, + "step": 6712 + }, + { + "epoch": 0.83, + "grad_norm": 1.4106732687778272, + "learning_rate": 7.13036817971044e-07, + "loss": 0.5113, + "step": 6713 + }, + { + "epoch": 0.83, + "grad_norm": 1.5991966251618026, + "learning_rate": 7.120028365828136e-07, + "loss": 0.5148, + "step": 6714 + }, + { + "epoch": 0.83, + "grad_norm": 2.17589346803186, + "learning_rate": 7.109695479582889e-07, + "loss": 0.4669, + "step": 6715 + }, + { + "epoch": 0.83, + "grad_norm": 2.206969652600979, + "learning_rate": 7.099369522644084e-07, + "loss": 0.4686, + "step": 6716 + }, + { + "epoch": 0.83, + "grad_norm": 1.8786885049448072, + "learning_rate": 7.089050496679983e-07, + "loss": 0.4683, + "step": 6717 + }, + { + "epoch": 0.83, + "grad_norm": 1.7313156690433527, + "learning_rate": 7.078738403357682e-07, + "loss": 0.4429, + "step": 6718 + }, + { + "epoch": 0.83, + "grad_norm": 1.9177512900258369, + "learning_rate": 7.068433244343221e-07, + "loss": 0.4881, + "step": 6719 + }, + { + "epoch": 0.83, + "grad_norm": 1.477028490334779, + "learning_rate": 7.058135021301465e-07, + "loss": 0.4805, + "step": 6720 + }, + { + "epoch": 0.83, + "grad_norm": 1.5125368951161189, + "learning_rate": 7.04784373589621e-07, + "loss": 0.4786, + "step": 6721 + }, + { + "epoch": 0.83, + "grad_norm": 1.4642545983208444, + "learning_rate": 7.037559389790078e-07, + "loss": 0.4947, + "step": 6722 + }, + { + "epoch": 0.83, + "grad_norm": 1.3479241995411038, + "learning_rate": 7.027281984644613e-07, + "loss": 0.4791, + "step": 6723 + }, + { + "epoch": 0.83, + "grad_norm": 2.4424745723584182, + "learning_rate": 7.017011522120198e-07, + "loss": 0.5482, + "step": 6724 + }, + { + "epoch": 0.83, + "grad_norm": 1.6080376385555766, + "learning_rate": 7.00674800387614e-07, + "loss": 0.4837, + "step": 6725 + }, + { + "epoch": 0.83, + "grad_norm": 1.548812317353501, + "learning_rate": 6.996491431570601e-07, + "loss": 0.478, + "step": 6726 + }, + { + "epoch": 0.83, + "grad_norm": 2.0592868543349034, + "learning_rate": 6.9862418068606e-07, + "loss": 0.4935, + "step": 6727 + }, + { + "epoch": 0.83, + "grad_norm": 1.4010202204985316, + "learning_rate": 6.975999131402078e-07, + "loss": 0.4488, + "step": 6728 + }, + { + "epoch": 0.84, + "grad_norm": 1.3449621662643738, + "learning_rate": 6.965763406849807e-07, + "loss": 0.5375, + "step": 6729 + }, + { + "epoch": 0.84, + "grad_norm": 1.4492464573692114, + "learning_rate": 6.95553463485748e-07, + "loss": 0.5089, + "step": 6730 + }, + { + "epoch": 0.84, + "grad_norm": 2.11499981877359, + "learning_rate": 6.945312817077627e-07, + "loss": 0.5208, + "step": 6731 + }, + { + "epoch": 0.84, + "grad_norm": 2.3102363888371937, + "learning_rate": 6.935097955161685e-07, + "loss": 0.4738, + "step": 6732 + }, + { + "epoch": 0.84, + "grad_norm": 1.6401365279072655, + "learning_rate": 6.924890050759942e-07, + "loss": 0.4658, + "step": 6733 + }, + { + "epoch": 0.84, + "grad_norm": 0.7261704136767818, + "learning_rate": 6.914689105521583e-07, + "loss": 0.5132, + "step": 6734 + }, + { + "epoch": 0.84, + "grad_norm": 1.5893186137882933, + "learning_rate": 6.904495121094673e-07, + "loss": 0.5327, + "step": 6735 + }, + { + "epoch": 0.84, + "grad_norm": 7.970506769573906, + "learning_rate": 6.894308099126118e-07, + "loss": 0.5151, + "step": 6736 + }, + { + "epoch": 0.84, + "grad_norm": 1.5212801434657697, + "learning_rate": 6.884128041261739e-07, + "loss": 0.4688, + "step": 6737 + }, + { + "epoch": 0.84, + "grad_norm": 1.6441088709954745, + "learning_rate": 6.873954949146194e-07, + "loss": 0.4809, + "step": 6738 + }, + { + "epoch": 0.84, + "grad_norm": 1.5754933441117371, + "learning_rate": 6.863788824423056e-07, + "loss": 0.5092, + "step": 6739 + }, + { + "epoch": 0.84, + "grad_norm": 1.5357523625887821, + "learning_rate": 6.853629668734735e-07, + "loss": 0.4985, + "step": 6740 + }, + { + "epoch": 0.84, + "grad_norm": 1.7619076256572428, + "learning_rate": 6.843477483722544e-07, + "loss": 0.4898, + "step": 6741 + }, + { + "epoch": 0.84, + "grad_norm": 1.5544198655012471, + "learning_rate": 6.83333227102666e-07, + "loss": 0.5098, + "step": 6742 + }, + { + "epoch": 0.84, + "grad_norm": 1.5978962889844126, + "learning_rate": 6.823194032286117e-07, + "loss": 0.5124, + "step": 6743 + }, + { + "epoch": 0.84, + "grad_norm": 1.71567831075207, + "learning_rate": 6.813062769138856e-07, + "loss": 0.4856, + "step": 6744 + }, + { + "epoch": 0.84, + "grad_norm": 1.7832381321843704, + "learning_rate": 6.80293848322165e-07, + "loss": 0.5148, + "step": 6745 + }, + { + "epoch": 0.84, + "grad_norm": 1.4160018873091047, + "learning_rate": 6.792821176170189e-07, + "loss": 0.4683, + "step": 6746 + }, + { + "epoch": 0.84, + "grad_norm": 0.6406893819086323, + "learning_rate": 6.782710849618989e-07, + "loss": 0.4772, + "step": 6747 + }, + { + "epoch": 0.84, + "grad_norm": 1.4935104545738345, + "learning_rate": 6.772607505201489e-07, + "loss": 0.5658, + "step": 6748 + }, + { + "epoch": 0.84, + "grad_norm": 3.196383469215414, + "learning_rate": 6.762511144549949e-07, + "loss": 0.4915, + "step": 6749 + }, + { + "epoch": 0.84, + "grad_norm": 1.548129351775618, + "learning_rate": 6.75242176929553e-07, + "loss": 0.5091, + "step": 6750 + }, + { + "epoch": 0.84, + "grad_norm": 11.626900511568277, + "learning_rate": 6.74233938106828e-07, + "loss": 0.4861, + "step": 6751 + }, + { + "epoch": 0.84, + "grad_norm": 2.3609349319146933, + "learning_rate": 6.732263981497073e-07, + "loss": 0.5313, + "step": 6752 + }, + { + "epoch": 0.84, + "grad_norm": 1.650097378062769, + "learning_rate": 6.722195572209694e-07, + "loss": 0.4674, + "step": 6753 + }, + { + "epoch": 0.84, + "grad_norm": 19.547528360647437, + "learning_rate": 6.712134154832773e-07, + "loss": 0.4418, + "step": 6754 + }, + { + "epoch": 0.84, + "grad_norm": 1.4892496792692191, + "learning_rate": 6.702079730991829e-07, + "loss": 0.5574, + "step": 6755 + }, + { + "epoch": 0.84, + "grad_norm": 1.3719467735091257, + "learning_rate": 6.692032302311229e-07, + "loss": 0.5561, + "step": 6756 + }, + { + "epoch": 0.84, + "grad_norm": 10.027162632150969, + "learning_rate": 6.68199187041424e-07, + "loss": 0.4945, + "step": 6757 + }, + { + "epoch": 0.84, + "grad_norm": 2.517854761362138, + "learning_rate": 6.671958436922976e-07, + "loss": 0.4923, + "step": 6758 + }, + { + "epoch": 0.84, + "grad_norm": 1.982350398814244, + "learning_rate": 6.661932003458421e-07, + "loss": 0.4545, + "step": 6759 + }, + { + "epoch": 0.84, + "grad_norm": 1.7313162513888793, + "learning_rate": 6.651912571640451e-07, + "loss": 0.5244, + "step": 6760 + }, + { + "epoch": 0.84, + "grad_norm": 1.9088298322723904, + "learning_rate": 6.641900143087765e-07, + "loss": 0.4495, + "step": 6761 + }, + { + "epoch": 0.84, + "grad_norm": 1.2008124010151733, + "learning_rate": 6.631894719417987e-07, + "loss": 0.4727, + "step": 6762 + }, + { + "epoch": 0.84, + "grad_norm": 1.4947199189202311, + "learning_rate": 6.62189630224756e-07, + "loss": 0.484, + "step": 6763 + }, + { + "epoch": 0.84, + "grad_norm": 1.3880490751486438, + "learning_rate": 6.611904893191834e-07, + "loss": 0.4469, + "step": 6764 + }, + { + "epoch": 0.84, + "grad_norm": 1.5934160169118272, + "learning_rate": 6.601920493864983e-07, + "loss": 0.5014, + "step": 6765 + }, + { + "epoch": 0.84, + "grad_norm": 1.5999476697876904, + "learning_rate": 6.591943105880111e-07, + "loss": 0.4419, + "step": 6766 + }, + { + "epoch": 0.84, + "grad_norm": 4.5642275673683885, + "learning_rate": 6.581972730849134e-07, + "loss": 0.4993, + "step": 6767 + }, + { + "epoch": 0.84, + "grad_norm": 2.8449583643704757, + "learning_rate": 6.572009370382842e-07, + "loss": 0.4553, + "step": 6768 + }, + { + "epoch": 0.84, + "grad_norm": 1.3906538181873989, + "learning_rate": 6.562053026090931e-07, + "loss": 0.4385, + "step": 6769 + }, + { + "epoch": 0.84, + "grad_norm": 1.352069179403521, + "learning_rate": 6.552103699581908e-07, + "loss": 0.46, + "step": 6770 + }, + { + "epoch": 0.84, + "grad_norm": 1.3583931022423161, + "learning_rate": 6.542161392463197e-07, + "loss": 0.5101, + "step": 6771 + }, + { + "epoch": 0.84, + "grad_norm": 1.297035235675263, + "learning_rate": 6.53222610634105e-07, + "loss": 0.4222, + "step": 6772 + }, + { + "epoch": 0.84, + "grad_norm": 1.5077869483962805, + "learning_rate": 6.522297842820613e-07, + "loss": 0.5018, + "step": 6773 + }, + { + "epoch": 0.84, + "grad_norm": 0.6769521844531408, + "learning_rate": 6.512376603505855e-07, + "loss": 0.5211, + "step": 6774 + }, + { + "epoch": 0.84, + "grad_norm": 0.700422140158777, + "learning_rate": 6.502462389999681e-07, + "loss": 0.502, + "step": 6775 + }, + { + "epoch": 0.84, + "grad_norm": 1.69152040435353, + "learning_rate": 6.4925552039038e-07, + "loss": 0.4762, + "step": 6776 + }, + { + "epoch": 0.84, + "grad_norm": 1.3741472861278572, + "learning_rate": 6.482655046818798e-07, + "loss": 0.4722, + "step": 6777 + }, + { + "epoch": 0.84, + "grad_norm": 1.3143366070549236, + "learning_rate": 6.472761920344145e-07, + "loss": 0.4673, + "step": 6778 + }, + { + "epoch": 0.84, + "grad_norm": 1.7907775755887394, + "learning_rate": 6.462875826078152e-07, + "loss": 0.4486, + "step": 6779 + }, + { + "epoch": 0.84, + "grad_norm": 2.1548052117194203, + "learning_rate": 6.452996765618019e-07, + "loss": 0.4873, + "step": 6780 + }, + { + "epoch": 0.84, + "grad_norm": 1.4313653704255374, + "learning_rate": 6.443124740559775e-07, + "loss": 0.5491, + "step": 6781 + }, + { + "epoch": 0.84, + "grad_norm": 1.4906237832229412, + "learning_rate": 6.433259752498344e-07, + "loss": 0.5144, + "step": 6782 + }, + { + "epoch": 0.84, + "grad_norm": 2.533754994591377, + "learning_rate": 6.423401803027502e-07, + "loss": 0.4494, + "step": 6783 + }, + { + "epoch": 0.84, + "grad_norm": 0.7584702113360715, + "learning_rate": 6.413550893739901e-07, + "loss": 0.4887, + "step": 6784 + }, + { + "epoch": 0.84, + "grad_norm": 1.887154015113772, + "learning_rate": 6.403707026227013e-07, + "loss": 0.4916, + "step": 6785 + }, + { + "epoch": 0.84, + "grad_norm": 1.3250076607832437, + "learning_rate": 6.393870202079228e-07, + "loss": 0.5126, + "step": 6786 + }, + { + "epoch": 0.84, + "grad_norm": 1.7756467996112335, + "learning_rate": 6.384040422885762e-07, + "loss": 0.5473, + "step": 6787 + }, + { + "epoch": 0.84, + "grad_norm": 1.3992838988740335, + "learning_rate": 6.374217690234685e-07, + "loss": 0.487, + "step": 6788 + }, + { + "epoch": 0.84, + "grad_norm": 1.5766772797693478, + "learning_rate": 6.364402005712978e-07, + "loss": 0.4455, + "step": 6789 + }, + { + "epoch": 0.84, + "grad_norm": 1.7894202623754922, + "learning_rate": 6.354593370906414e-07, + "loss": 0.452, + "step": 6790 + }, + { + "epoch": 0.84, + "grad_norm": 1.3385213206944417, + "learning_rate": 6.344791787399707e-07, + "loss": 0.449, + "step": 6791 + }, + { + "epoch": 0.84, + "grad_norm": 2.5542387817707777, + "learning_rate": 6.334997256776354e-07, + "loss": 0.5214, + "step": 6792 + }, + { + "epoch": 0.84, + "grad_norm": 1.3368617749906255, + "learning_rate": 6.325209780618769e-07, + "loss": 0.4635, + "step": 6793 + }, + { + "epoch": 0.84, + "grad_norm": 1.5667271151416986, + "learning_rate": 6.31542936050819e-07, + "loss": 0.4699, + "step": 6794 + }, + { + "epoch": 0.84, + "grad_norm": 1.590647491310967, + "learning_rate": 6.305655998024751e-07, + "loss": 0.4396, + "step": 6795 + }, + { + "epoch": 0.84, + "grad_norm": 1.4654616436837038, + "learning_rate": 6.295889694747409e-07, + "loss": 0.5049, + "step": 6796 + }, + { + "epoch": 0.84, + "grad_norm": 1.2355365348432805, + "learning_rate": 6.286130452253985e-07, + "loss": 0.4376, + "step": 6797 + }, + { + "epoch": 0.84, + "grad_norm": 2.384137486464471, + "learning_rate": 6.276378272121187e-07, + "loss": 0.4944, + "step": 6798 + }, + { + "epoch": 0.84, + "grad_norm": 1.4240883171336052, + "learning_rate": 6.266633155924556e-07, + "loss": 0.5043, + "step": 6799 + }, + { + "epoch": 0.84, + "grad_norm": 1.4914504853613988, + "learning_rate": 6.256895105238526e-07, + "loss": 0.4874, + "step": 6800 + }, + { + "epoch": 0.84, + "grad_norm": 1.6128088096293538, + "learning_rate": 6.247164121636335e-07, + "loss": 0.4839, + "step": 6801 + }, + { + "epoch": 0.84, + "grad_norm": 2.716479416737869, + "learning_rate": 6.237440206690126e-07, + "loss": 0.4588, + "step": 6802 + }, + { + "epoch": 0.84, + "grad_norm": 1.5119129085354808, + "learning_rate": 6.227723361970872e-07, + "loss": 0.5006, + "step": 6803 + }, + { + "epoch": 0.84, + "grad_norm": 2.338104109990617, + "learning_rate": 6.218013589048427e-07, + "loss": 0.4623, + "step": 6804 + }, + { + "epoch": 0.84, + "grad_norm": 2.765034388598217, + "learning_rate": 6.208310889491492e-07, + "loss": 0.4845, + "step": 6805 + }, + { + "epoch": 0.84, + "grad_norm": 1.43352133288454, + "learning_rate": 6.198615264867586e-07, + "loss": 0.5209, + "step": 6806 + }, + { + "epoch": 0.84, + "grad_norm": 1.9436874397350934, + "learning_rate": 6.188926716743176e-07, + "loss": 0.5162, + "step": 6807 + }, + { + "epoch": 0.84, + "grad_norm": 1.781493429515346, + "learning_rate": 6.179245246683501e-07, + "loss": 0.4996, + "step": 6808 + }, + { + "epoch": 0.84, + "grad_norm": 1.4249171534997158, + "learning_rate": 6.169570856252705e-07, + "loss": 0.4656, + "step": 6809 + }, + { + "epoch": 0.85, + "grad_norm": 0.6613674725633242, + "learning_rate": 6.159903547013746e-07, + "loss": 0.5147, + "step": 6810 + }, + { + "epoch": 0.85, + "grad_norm": 2.0691800130247353, + "learning_rate": 6.150243320528492e-07, + "loss": 0.563, + "step": 6811 + }, + { + "epoch": 0.85, + "grad_norm": 1.8332421182034946, + "learning_rate": 6.140590178357613e-07, + "loss": 0.4808, + "step": 6812 + }, + { + "epoch": 0.85, + "grad_norm": 1.9932584487443314, + "learning_rate": 6.130944122060678e-07, + "loss": 0.4274, + "step": 6813 + }, + { + "epoch": 0.85, + "grad_norm": 2.389824362923636, + "learning_rate": 6.121305153196089e-07, + "loss": 0.4536, + "step": 6814 + }, + { + "epoch": 0.85, + "grad_norm": 1.6346590063909652, + "learning_rate": 6.111673273321078e-07, + "loss": 0.4885, + "step": 6815 + }, + { + "epoch": 0.85, + "grad_norm": 1.9257813859222959, + "learning_rate": 6.102048483991802e-07, + "loss": 0.4537, + "step": 6816 + }, + { + "epoch": 0.85, + "grad_norm": 2.2158152973201872, + "learning_rate": 6.092430786763204e-07, + "loss": 0.4991, + "step": 6817 + }, + { + "epoch": 0.85, + "grad_norm": 1.7135611233357684, + "learning_rate": 6.082820183189125e-07, + "loss": 0.495, + "step": 6818 + }, + { + "epoch": 0.85, + "grad_norm": 1.550687378435678, + "learning_rate": 6.073216674822224e-07, + "loss": 0.4899, + "step": 6819 + }, + { + "epoch": 0.85, + "grad_norm": 0.6664152226717598, + "learning_rate": 6.063620263214054e-07, + "loss": 0.4895, + "step": 6820 + }, + { + "epoch": 0.85, + "grad_norm": 0.6455128240796204, + "learning_rate": 6.054030949914974e-07, + "loss": 0.4846, + "step": 6821 + }, + { + "epoch": 0.85, + "grad_norm": 1.371850460382941, + "learning_rate": 6.044448736474246e-07, + "loss": 0.4772, + "step": 6822 + }, + { + "epoch": 0.85, + "grad_norm": 1.4973161988390176, + "learning_rate": 6.03487362443994e-07, + "loss": 0.5066, + "step": 6823 + }, + { + "epoch": 0.85, + "grad_norm": 1.3825259689837013, + "learning_rate": 6.025305615359012e-07, + "loss": 0.4931, + "step": 6824 + }, + { + "epoch": 0.85, + "grad_norm": 1.5546522556411053, + "learning_rate": 6.015744710777272e-07, + "loss": 0.4854, + "step": 6825 + }, + { + "epoch": 0.85, + "grad_norm": 1.3147495445183237, + "learning_rate": 6.006190912239335e-07, + "loss": 0.502, + "step": 6826 + }, + { + "epoch": 0.85, + "grad_norm": 1.4508606786369576, + "learning_rate": 5.996644221288727e-07, + "loss": 0.5091, + "step": 6827 + }, + { + "epoch": 0.85, + "grad_norm": 1.4484167775380636, + "learning_rate": 5.987104639467789e-07, + "loss": 0.5087, + "step": 6828 + }, + { + "epoch": 0.85, + "grad_norm": 1.4993726181242262, + "learning_rate": 5.977572168317736e-07, + "loss": 0.47, + "step": 6829 + }, + { + "epoch": 0.85, + "grad_norm": 1.4413689595288772, + "learning_rate": 5.968046809378603e-07, + "loss": 0.4919, + "step": 6830 + }, + { + "epoch": 0.85, + "grad_norm": 1.5899499033265325, + "learning_rate": 5.958528564189303e-07, + "loss": 0.4734, + "step": 6831 + }, + { + "epoch": 0.85, + "grad_norm": 2.389840082506576, + "learning_rate": 5.949017434287613e-07, + "loss": 0.5051, + "step": 6832 + }, + { + "epoch": 0.85, + "grad_norm": 1.3909594420089038, + "learning_rate": 5.939513421210108e-07, + "loss": 0.4938, + "step": 6833 + }, + { + "epoch": 0.85, + "grad_norm": 1.5849512907541565, + "learning_rate": 5.930016526492272e-07, + "loss": 0.5064, + "step": 6834 + }, + { + "epoch": 0.85, + "grad_norm": 1.492599074910955, + "learning_rate": 5.920526751668388e-07, + "loss": 0.4408, + "step": 6835 + }, + { + "epoch": 0.85, + "grad_norm": 1.5042860804074434, + "learning_rate": 5.911044098271634e-07, + "loss": 0.5245, + "step": 6836 + }, + { + "epoch": 0.85, + "grad_norm": 1.6754168298874146, + "learning_rate": 5.901568567833999e-07, + "loss": 0.4731, + "step": 6837 + }, + { + "epoch": 0.85, + "grad_norm": 1.409445270913483, + "learning_rate": 5.892100161886355e-07, + "loss": 0.5193, + "step": 6838 + }, + { + "epoch": 0.85, + "grad_norm": 1.6385356392034915, + "learning_rate": 5.882638881958391e-07, + "loss": 0.4756, + "step": 6839 + }, + { + "epoch": 0.85, + "grad_norm": 1.9955726024647689, + "learning_rate": 5.873184729578662e-07, + "loss": 0.4707, + "step": 6840 + }, + { + "epoch": 0.85, + "grad_norm": 1.4777711350788643, + "learning_rate": 5.863737706274585e-07, + "loss": 0.5064, + "step": 6841 + }, + { + "epoch": 0.85, + "grad_norm": 1.323637595905769, + "learning_rate": 5.854297813572391e-07, + "loss": 0.463, + "step": 6842 + }, + { + "epoch": 0.85, + "grad_norm": 2.2142497034160975, + "learning_rate": 5.844865052997195e-07, + "loss": 0.5102, + "step": 6843 + }, + { + "epoch": 0.85, + "grad_norm": 0.6423488316310543, + "learning_rate": 5.835439426072925e-07, + "loss": 0.4905, + "step": 6844 + }, + { + "epoch": 0.85, + "grad_norm": 2.0563604548077445, + "learning_rate": 5.82602093432239e-07, + "loss": 0.4863, + "step": 6845 + }, + { + "epoch": 0.85, + "grad_norm": 2.095485821395087, + "learning_rate": 5.816609579267213e-07, + "loss": 0.522, + "step": 6846 + }, + { + "epoch": 0.85, + "grad_norm": 1.4831526662475276, + "learning_rate": 5.807205362427903e-07, + "loss": 0.4434, + "step": 6847 + }, + { + "epoch": 0.85, + "grad_norm": 1.3373394692004903, + "learning_rate": 5.797808285323769e-07, + "loss": 0.4971, + "step": 6848 + }, + { + "epoch": 0.85, + "grad_norm": 1.644148456235197, + "learning_rate": 5.788418349473007e-07, + "loss": 0.4669, + "step": 6849 + }, + { + "epoch": 0.85, + "grad_norm": 1.5073342737537523, + "learning_rate": 5.779035556392649e-07, + "loss": 0.5087, + "step": 6850 + }, + { + "epoch": 0.85, + "grad_norm": 1.379628713431779, + "learning_rate": 5.769659907598552e-07, + "loss": 0.4415, + "step": 6851 + }, + { + "epoch": 0.85, + "grad_norm": 2.564518382684953, + "learning_rate": 5.760291404605456e-07, + "loss": 0.5007, + "step": 6852 + }, + { + "epoch": 0.85, + "grad_norm": 1.4022145113727098, + "learning_rate": 5.750930048926894e-07, + "loss": 0.4262, + "step": 6853 + }, + { + "epoch": 0.85, + "grad_norm": 1.304688007201386, + "learning_rate": 5.741575842075315e-07, + "loss": 0.4666, + "step": 6854 + }, + { + "epoch": 0.85, + "grad_norm": 1.545692701820653, + "learning_rate": 5.732228785561933e-07, + "loss": 0.5042, + "step": 6855 + }, + { + "epoch": 0.85, + "grad_norm": 1.8449247267045616, + "learning_rate": 5.722888880896871e-07, + "loss": 0.4493, + "step": 6856 + }, + { + "epoch": 0.85, + "grad_norm": 1.8883732598933922, + "learning_rate": 5.713556129589076e-07, + "loss": 0.477, + "step": 6857 + }, + { + "epoch": 0.85, + "grad_norm": 1.386474045155772, + "learning_rate": 5.704230533146321e-07, + "loss": 0.4596, + "step": 6858 + }, + { + "epoch": 0.85, + "grad_norm": 1.6987912717293612, + "learning_rate": 5.69491209307525e-07, + "loss": 0.4987, + "step": 6859 + }, + { + "epoch": 0.85, + "grad_norm": 2.060497679451871, + "learning_rate": 5.685600810881331e-07, + "loss": 0.505, + "step": 6860 + }, + { + "epoch": 0.85, + "grad_norm": 1.3193315097978286, + "learning_rate": 5.676296688068894e-07, + "loss": 0.5181, + "step": 6861 + }, + { + "epoch": 0.85, + "grad_norm": 1.485638345755509, + "learning_rate": 5.666999726141086e-07, + "loss": 0.5405, + "step": 6862 + }, + { + "epoch": 0.85, + "grad_norm": 2.3331619343085594, + "learning_rate": 5.65770992659993e-07, + "loss": 0.4544, + "step": 6863 + }, + { + "epoch": 0.85, + "grad_norm": 0.6910977225802527, + "learning_rate": 5.648427290946251e-07, + "loss": 0.4943, + "step": 6864 + }, + { + "epoch": 0.85, + "grad_norm": 1.5988001915255896, + "learning_rate": 5.639151820679778e-07, + "loss": 0.5161, + "step": 6865 + }, + { + "epoch": 0.85, + "grad_norm": 0.5984809540074646, + "learning_rate": 5.629883517299023e-07, + "loss": 0.4703, + "step": 6866 + }, + { + "epoch": 0.85, + "grad_norm": 2.0140295213578163, + "learning_rate": 5.620622382301349e-07, + "loss": 0.4887, + "step": 6867 + }, + { + "epoch": 0.85, + "grad_norm": 1.6634956971310924, + "learning_rate": 5.611368417183e-07, + "loss": 0.4703, + "step": 6868 + }, + { + "epoch": 0.85, + "grad_norm": 1.2212580229409449, + "learning_rate": 5.602121623439016e-07, + "loss": 0.5075, + "step": 6869 + }, + { + "epoch": 0.85, + "grad_norm": 1.4546267165288047, + "learning_rate": 5.592882002563316e-07, + "loss": 0.5057, + "step": 6870 + }, + { + "epoch": 0.85, + "grad_norm": 1.7950585081558756, + "learning_rate": 5.583649556048615e-07, + "loss": 0.4638, + "step": 6871 + }, + { + "epoch": 0.85, + "grad_norm": 1.5665975330120265, + "learning_rate": 5.574424285386515e-07, + "loss": 0.5191, + "step": 6872 + }, + { + "epoch": 0.85, + "grad_norm": 1.4222467915785417, + "learning_rate": 5.56520619206744e-07, + "loss": 0.543, + "step": 6873 + }, + { + "epoch": 0.85, + "grad_norm": 1.5842470350852345, + "learning_rate": 5.555995277580661e-07, + "loss": 0.4898, + "step": 6874 + }, + { + "epoch": 0.85, + "grad_norm": 1.5901026069408062, + "learning_rate": 5.546791543414271e-07, + "loss": 0.496, + "step": 6875 + }, + { + "epoch": 0.85, + "grad_norm": 1.5356187517929056, + "learning_rate": 5.537594991055207e-07, + "loss": 0.4643, + "step": 6876 + }, + { + "epoch": 0.85, + "grad_norm": 1.5491125842300477, + "learning_rate": 5.52840562198927e-07, + "loss": 0.4627, + "step": 6877 + }, + { + "epoch": 0.85, + "grad_norm": 1.6161860408966715, + "learning_rate": 5.519223437701066e-07, + "loss": 0.5372, + "step": 6878 + }, + { + "epoch": 0.85, + "grad_norm": 2.671789743114391, + "learning_rate": 5.510048439674082e-07, + "loss": 0.4862, + "step": 6879 + }, + { + "epoch": 0.85, + "grad_norm": 1.320335868944061, + "learning_rate": 5.500880629390581e-07, + "loss": 0.4466, + "step": 6880 + }, + { + "epoch": 0.85, + "grad_norm": 1.9753531052116389, + "learning_rate": 5.491720008331747e-07, + "loss": 0.479, + "step": 6881 + }, + { + "epoch": 0.85, + "grad_norm": 1.4582939912346808, + "learning_rate": 5.482566577977533e-07, + "loss": 0.4891, + "step": 6882 + }, + { + "epoch": 0.85, + "grad_norm": 1.5970603233940475, + "learning_rate": 5.473420339806768e-07, + "loss": 0.527, + "step": 6883 + }, + { + "epoch": 0.85, + "grad_norm": 1.4712123078288173, + "learning_rate": 5.464281295297108e-07, + "loss": 0.5004, + "step": 6884 + }, + { + "epoch": 0.85, + "grad_norm": 1.4977850469823213, + "learning_rate": 5.455149445925023e-07, + "loss": 0.51, + "step": 6885 + }, + { + "epoch": 0.85, + "grad_norm": 1.5388104560848637, + "learning_rate": 5.446024793165877e-07, + "loss": 0.5139, + "step": 6886 + }, + { + "epoch": 0.85, + "grad_norm": 1.7299515295271184, + "learning_rate": 5.436907338493813e-07, + "loss": 0.5657, + "step": 6887 + }, + { + "epoch": 0.85, + "grad_norm": 1.5580953548348517, + "learning_rate": 5.427797083381853e-07, + "loss": 0.4934, + "step": 6888 + }, + { + "epoch": 0.85, + "grad_norm": 1.3881940205970094, + "learning_rate": 5.418694029301808e-07, + "loss": 0.4988, + "step": 6889 + }, + { + "epoch": 0.86, + "grad_norm": 1.5982128731435383, + "learning_rate": 5.409598177724401e-07, + "loss": 0.505, + "step": 6890 + }, + { + "epoch": 0.86, + "grad_norm": 1.482857028697596, + "learning_rate": 5.400509530119119e-07, + "loss": 0.4682, + "step": 6891 + }, + { + "epoch": 0.86, + "grad_norm": 1.502884208589032, + "learning_rate": 5.391428087954326e-07, + "loss": 0.4585, + "step": 6892 + }, + { + "epoch": 0.86, + "grad_norm": 1.390095861718777, + "learning_rate": 5.382353852697198e-07, + "loss": 0.5184, + "step": 6893 + }, + { + "epoch": 0.86, + "grad_norm": 1.380414240486957, + "learning_rate": 5.373286825813767e-07, + "loss": 0.4599, + "step": 6894 + }, + { + "epoch": 0.86, + "grad_norm": 1.3751438932439002, + "learning_rate": 5.364227008768886e-07, + "loss": 0.4775, + "step": 6895 + }, + { + "epoch": 0.86, + "grad_norm": 1.8372806556281678, + "learning_rate": 5.355174403026242e-07, + "loss": 0.477, + "step": 6896 + }, + { + "epoch": 0.86, + "grad_norm": 1.6275339354203016, + "learning_rate": 5.346129010048368e-07, + "loss": 0.4783, + "step": 6897 + }, + { + "epoch": 0.86, + "grad_norm": 0.6915073517649066, + "learning_rate": 5.337090831296626e-07, + "loss": 0.4772, + "step": 6898 + }, + { + "epoch": 0.86, + "grad_norm": 1.722069587274079, + "learning_rate": 5.328059868231228e-07, + "loss": 0.4988, + "step": 6899 + }, + { + "epoch": 0.86, + "grad_norm": 1.7647107186011264, + "learning_rate": 5.319036122311184e-07, + "loss": 0.501, + "step": 6900 + }, + { + "epoch": 0.86, + "grad_norm": 1.5874068943608002, + "learning_rate": 5.310019594994381e-07, + "loss": 0.446, + "step": 6901 + }, + { + "epoch": 0.86, + "grad_norm": 1.7929932108513464, + "learning_rate": 5.301010287737496e-07, + "loss": 0.5228, + "step": 6902 + }, + { + "epoch": 0.86, + "grad_norm": 1.5965039152926832, + "learning_rate": 5.292008201996079e-07, + "loss": 0.5783, + "step": 6903 + }, + { + "epoch": 0.86, + "grad_norm": 1.427082342840929, + "learning_rate": 5.283013339224491e-07, + "loss": 0.4827, + "step": 6904 + }, + { + "epoch": 0.86, + "grad_norm": 1.6876122832076539, + "learning_rate": 5.274025700875912e-07, + "loss": 0.4663, + "step": 6905 + }, + { + "epoch": 0.86, + "grad_norm": 1.398426594294747, + "learning_rate": 5.265045288402415e-07, + "loss": 0.449, + "step": 6906 + }, + { + "epoch": 0.86, + "grad_norm": 1.3728899110215704, + "learning_rate": 5.256072103254828e-07, + "loss": 0.4561, + "step": 6907 + }, + { + "epoch": 0.86, + "grad_norm": 3.2854946046937044, + "learning_rate": 5.247106146882874e-07, + "loss": 0.478, + "step": 6908 + }, + { + "epoch": 0.86, + "grad_norm": 1.279134504546732, + "learning_rate": 5.238147420735057e-07, + "loss": 0.4776, + "step": 6909 + }, + { + "epoch": 0.86, + "grad_norm": 1.8530388685628911, + "learning_rate": 5.229195926258762e-07, + "loss": 0.5059, + "step": 6910 + }, + { + "epoch": 0.86, + "grad_norm": 1.436969838582139, + "learning_rate": 5.220251664900161e-07, + "loss": 0.4998, + "step": 6911 + }, + { + "epoch": 0.86, + "grad_norm": 1.5032036608395283, + "learning_rate": 5.211314638104303e-07, + "loss": 0.5714, + "step": 6912 + }, + { + "epoch": 0.86, + "grad_norm": 2.0512575600594394, + "learning_rate": 5.202384847315017e-07, + "loss": 0.4843, + "step": 6913 + }, + { + "epoch": 0.86, + "grad_norm": 1.4050897740726445, + "learning_rate": 5.193462293974999e-07, + "loss": 0.4815, + "step": 6914 + }, + { + "epoch": 0.86, + "grad_norm": 1.2195798154838986, + "learning_rate": 5.184546979525779e-07, + "loss": 0.4516, + "step": 6915 + }, + { + "epoch": 0.86, + "grad_norm": 1.4815168906852805, + "learning_rate": 5.175638905407682e-07, + "loss": 0.4266, + "step": 6916 + }, + { + "epoch": 0.86, + "grad_norm": 1.602255012128706, + "learning_rate": 5.166738073059907e-07, + "loss": 0.5068, + "step": 6917 + }, + { + "epoch": 0.86, + "grad_norm": 2.0514319780060712, + "learning_rate": 5.157844483920449e-07, + "loss": 0.4871, + "step": 6918 + }, + { + "epoch": 0.86, + "grad_norm": 1.3562324046688408, + "learning_rate": 5.148958139426152e-07, + "loss": 0.5221, + "step": 6919 + }, + { + "epoch": 0.86, + "grad_norm": 1.3925440441575618, + "learning_rate": 5.140079041012674e-07, + "loss": 0.5112, + "step": 6920 + }, + { + "epoch": 0.86, + "grad_norm": 1.463072794109092, + "learning_rate": 5.131207190114518e-07, + "loss": 0.436, + "step": 6921 + }, + { + "epoch": 0.86, + "grad_norm": 1.7851821317438705, + "learning_rate": 5.122342588165013e-07, + "loss": 0.4625, + "step": 6922 + }, + { + "epoch": 0.86, + "grad_norm": 1.4061491256502077, + "learning_rate": 5.113485236596305e-07, + "loss": 0.4975, + "step": 6923 + }, + { + "epoch": 0.86, + "grad_norm": 1.7953328392218353, + "learning_rate": 5.104635136839392e-07, + "loss": 0.5293, + "step": 6924 + }, + { + "epoch": 0.86, + "grad_norm": 4.883648584810455, + "learning_rate": 5.09579229032407e-07, + "loss": 0.4316, + "step": 6925 + }, + { + "epoch": 0.86, + "grad_norm": 1.4734215631821337, + "learning_rate": 5.08695669847899e-07, + "loss": 0.5, + "step": 6926 + }, + { + "epoch": 0.86, + "grad_norm": 1.390752070784834, + "learning_rate": 5.078128362731605e-07, + "loss": 0.4571, + "step": 6927 + }, + { + "epoch": 0.86, + "grad_norm": 1.3760124274026402, + "learning_rate": 5.06930728450823e-07, + "loss": 0.4068, + "step": 6928 + }, + { + "epoch": 0.86, + "grad_norm": 1.4446249003683183, + "learning_rate": 5.060493465233962e-07, + "loss": 0.5094, + "step": 6929 + }, + { + "epoch": 0.86, + "grad_norm": 1.636254259184282, + "learning_rate": 5.051686906332776e-07, + "loss": 0.5108, + "step": 6930 + }, + { + "epoch": 0.86, + "grad_norm": 1.267435946951365, + "learning_rate": 5.042887609227442e-07, + "loss": 0.4685, + "step": 6931 + }, + { + "epoch": 0.86, + "grad_norm": 1.754610151389348, + "learning_rate": 5.034095575339553e-07, + "loss": 0.5163, + "step": 6932 + }, + { + "epoch": 0.86, + "grad_norm": 1.463126296444401, + "learning_rate": 5.025310806089562e-07, + "loss": 0.4913, + "step": 6933 + }, + { + "epoch": 0.86, + "grad_norm": 2.013428114686188, + "learning_rate": 5.016533302896698e-07, + "loss": 0.5233, + "step": 6934 + }, + { + "epoch": 0.86, + "grad_norm": 1.3545099717129814, + "learning_rate": 5.007763067179066e-07, + "loss": 0.4909, + "step": 6935 + }, + { + "epoch": 0.86, + "grad_norm": 0.733885022215113, + "learning_rate": 4.999000100353562e-07, + "loss": 0.5019, + "step": 6936 + }, + { + "epoch": 0.86, + "grad_norm": 1.8120183817577111, + "learning_rate": 4.990244403835936e-07, + "loss": 0.4918, + "step": 6937 + }, + { + "epoch": 0.86, + "grad_norm": 1.6096501007633839, + "learning_rate": 4.981495979040723e-07, + "loss": 0.4859, + "step": 6938 + }, + { + "epoch": 0.86, + "grad_norm": 1.3699844039744509, + "learning_rate": 4.972754827381327e-07, + "loss": 0.4719, + "step": 6939 + }, + { + "epoch": 0.86, + "grad_norm": 2.6133878006430424, + "learning_rate": 4.964020950269955e-07, + "loss": 0.4776, + "step": 6940 + }, + { + "epoch": 0.86, + "grad_norm": 1.5580353495575774, + "learning_rate": 4.955294349117634e-07, + "loss": 0.4682, + "step": 6941 + }, + { + "epoch": 0.86, + "grad_norm": 1.9898604663077117, + "learning_rate": 4.946575025334244e-07, + "loss": 0.4954, + "step": 6942 + }, + { + "epoch": 0.86, + "grad_norm": 1.6918769235942603, + "learning_rate": 4.937862980328439e-07, + "loss": 0.4625, + "step": 6943 + }, + { + "epoch": 0.86, + "grad_norm": 1.5010276257602275, + "learning_rate": 4.929158215507751e-07, + "loss": 0.4708, + "step": 6944 + }, + { + "epoch": 0.86, + "grad_norm": 1.4238385712397128, + "learning_rate": 4.920460732278493e-07, + "loss": 0.4965, + "step": 6945 + }, + { + "epoch": 0.86, + "grad_norm": 1.8676252330232028, + "learning_rate": 4.911770532045828e-07, + "loss": 0.425, + "step": 6946 + }, + { + "epoch": 0.86, + "grad_norm": 1.8991603311100667, + "learning_rate": 4.903087616213748e-07, + "loss": 0.4908, + "step": 6947 + }, + { + "epoch": 0.86, + "grad_norm": 1.8943841158665158, + "learning_rate": 4.894411986185027e-07, + "loss": 0.5346, + "step": 6948 + }, + { + "epoch": 0.86, + "grad_norm": 1.7537682284072407, + "learning_rate": 4.885743643361318e-07, + "loss": 0.4971, + "step": 6949 + }, + { + "epoch": 0.86, + "grad_norm": 1.5885510591798264, + "learning_rate": 4.877082589143045e-07, + "loss": 0.4881, + "step": 6950 + }, + { + "epoch": 0.86, + "grad_norm": 1.647460623372295, + "learning_rate": 4.868428824929494e-07, + "loss": 0.5323, + "step": 6951 + }, + { + "epoch": 0.86, + "grad_norm": 1.3393433538659767, + "learning_rate": 4.859782352118742e-07, + "loss": 0.4605, + "step": 6952 + }, + { + "epoch": 0.86, + "grad_norm": 1.583819537463703, + "learning_rate": 4.851143172107719e-07, + "loss": 0.5271, + "step": 6953 + }, + { + "epoch": 0.86, + "grad_norm": 1.594390898281684, + "learning_rate": 4.84251128629214e-07, + "loss": 0.4707, + "step": 6954 + }, + { + "epoch": 0.86, + "grad_norm": 1.8417235223966422, + "learning_rate": 4.833886696066581e-07, + "loss": 0.4842, + "step": 6955 + }, + { + "epoch": 0.86, + "grad_norm": 1.5608893660067085, + "learning_rate": 4.825269402824417e-07, + "loss": 0.5323, + "step": 6956 + }, + { + "epoch": 0.86, + "grad_norm": 1.6182598019281986, + "learning_rate": 4.816659407957835e-07, + "loss": 0.4844, + "step": 6957 + }, + { + "epoch": 0.86, + "grad_norm": 1.7888155570652113, + "learning_rate": 4.808056712857878e-07, + "loss": 0.4965, + "step": 6958 + }, + { + "epoch": 0.86, + "grad_norm": 1.470841132601057, + "learning_rate": 4.799461318914366e-07, + "loss": 0.4736, + "step": 6959 + }, + { + "epoch": 0.86, + "grad_norm": 1.5958096796885273, + "learning_rate": 4.790873227515974e-07, + "loss": 0.4258, + "step": 6960 + }, + { + "epoch": 0.86, + "grad_norm": 1.4668180431159055, + "learning_rate": 4.782292440050173e-07, + "loss": 0.4788, + "step": 6961 + }, + { + "epoch": 0.86, + "grad_norm": 1.5940426778726535, + "learning_rate": 4.773718957903267e-07, + "loss": 0.484, + "step": 6962 + }, + { + "epoch": 0.86, + "grad_norm": 2.2904653920667863, + "learning_rate": 4.7651527824603804e-07, + "loss": 0.4734, + "step": 6963 + }, + { + "epoch": 0.86, + "grad_norm": 1.552833442468006, + "learning_rate": 4.7565939151054675e-07, + "loss": 0.5049, + "step": 6964 + }, + { + "epoch": 0.86, + "grad_norm": 1.7028529981961882, + "learning_rate": 4.748042357221278e-07, + "loss": 0.482, + "step": 6965 + }, + { + "epoch": 0.86, + "grad_norm": 4.118028428478353, + "learning_rate": 4.739498110189372e-07, + "loss": 0.5504, + "step": 6966 + }, + { + "epoch": 0.86, + "grad_norm": 1.5657740026232285, + "learning_rate": 4.7309611753901806e-07, + "loss": 0.4814, + "step": 6967 + }, + { + "epoch": 0.86, + "grad_norm": 1.3267340843161755, + "learning_rate": 4.7224315542028945e-07, + "loss": 0.4931, + "step": 6968 + }, + { + "epoch": 0.86, + "grad_norm": 1.3367205730794152, + "learning_rate": 4.7139092480055657e-07, + "loss": 0.4812, + "step": 6969 + }, + { + "epoch": 0.86, + "grad_norm": 1.4363411485262998, + "learning_rate": 4.705394258175039e-07, + "loss": 0.5002, + "step": 6970 + }, + { + "epoch": 0.87, + "grad_norm": 5.3872480517546, + "learning_rate": 4.696886586086985e-07, + "loss": 0.5091, + "step": 6971 + }, + { + "epoch": 0.87, + "grad_norm": 1.2398113701487647, + "learning_rate": 4.688386233115899e-07, + "loss": 0.476, + "step": 6972 + }, + { + "epoch": 0.87, + "grad_norm": 1.4385066929159718, + "learning_rate": 4.679893200635094e-07, + "loss": 0.4617, + "step": 6973 + }, + { + "epoch": 0.87, + "grad_norm": 1.5019290438655046, + "learning_rate": 4.671407490016688e-07, + "loss": 0.4902, + "step": 6974 + }, + { + "epoch": 0.87, + "grad_norm": 1.4173222510567882, + "learning_rate": 4.662929102631614e-07, + "loss": 0.4585, + "step": 6975 + }, + { + "epoch": 0.87, + "grad_norm": 5.913621336040645, + "learning_rate": 4.654458039849641e-07, + "loss": 0.4994, + "step": 6976 + }, + { + "epoch": 0.87, + "grad_norm": 1.5367385729264298, + "learning_rate": 4.6459943030393307e-07, + "loss": 0.5567, + "step": 6977 + }, + { + "epoch": 0.87, + "grad_norm": 2.1268769033496473, + "learning_rate": 4.637537893568095e-07, + "loss": 0.4387, + "step": 6978 + }, + { + "epoch": 0.87, + "grad_norm": 1.5296982464759212, + "learning_rate": 4.629088812802113e-07, + "loss": 0.4426, + "step": 6979 + }, + { + "epoch": 0.87, + "grad_norm": 2.507568436514036, + "learning_rate": 4.620647062106437e-07, + "loss": 0.4643, + "step": 6980 + }, + { + "epoch": 0.87, + "grad_norm": 1.5170270026171093, + "learning_rate": 4.6122126428448875e-07, + "loss": 0.4917, + "step": 6981 + }, + { + "epoch": 0.87, + "grad_norm": 1.3505376511882239, + "learning_rate": 4.603785556380136e-07, + "loss": 0.4725, + "step": 6982 + }, + { + "epoch": 0.87, + "grad_norm": 1.6328794550198478, + "learning_rate": 4.595365804073637e-07, + "loss": 0.5333, + "step": 6983 + }, + { + "epoch": 0.87, + "grad_norm": 1.3863656552023294, + "learning_rate": 4.5869533872856753e-07, + "loss": 0.4819, + "step": 6984 + }, + { + "epoch": 0.87, + "grad_norm": 1.4091310542928852, + "learning_rate": 4.578548307375369e-07, + "loss": 0.4823, + "step": 6985 + }, + { + "epoch": 0.87, + "grad_norm": 1.3905944890206892, + "learning_rate": 4.570150565700604e-07, + "loss": 0.4707, + "step": 6986 + }, + { + "epoch": 0.87, + "grad_norm": 1.3268258977962248, + "learning_rate": 4.561760163618129e-07, + "loss": 0.4803, + "step": 6987 + }, + { + "epoch": 0.87, + "grad_norm": 0.6313590022212238, + "learning_rate": 4.553377102483486e-07, + "loss": 0.482, + "step": 6988 + }, + { + "epoch": 0.87, + "grad_norm": 1.3878481573742947, + "learning_rate": 4.545001383651032e-07, + "loss": 0.4823, + "step": 6989 + }, + { + "epoch": 0.87, + "grad_norm": 1.5216919725001328, + "learning_rate": 4.536633008473934e-07, + "loss": 0.486, + "step": 6990 + }, + { + "epoch": 0.87, + "grad_norm": 1.6149779706162697, + "learning_rate": 4.5282719783041883e-07, + "loss": 0.518, + "step": 6991 + }, + { + "epoch": 0.87, + "grad_norm": 1.3652599863085746, + "learning_rate": 4.519918294492581e-07, + "loss": 0.4898, + "step": 6992 + }, + { + "epoch": 0.87, + "grad_norm": 1.4485925323053548, + "learning_rate": 4.5115719583887164e-07, + "loss": 0.4877, + "step": 6993 + }, + { + "epoch": 0.87, + "grad_norm": 1.5871411096296153, + "learning_rate": 4.503232971341037e-07, + "loss": 0.4946, + "step": 6994 + }, + { + "epoch": 0.87, + "grad_norm": 2.1794323859003204, + "learning_rate": 4.494901334696755e-07, + "loss": 0.4527, + "step": 6995 + }, + { + "epoch": 0.87, + "grad_norm": 0.6742379324367876, + "learning_rate": 4.4865770498019545e-07, + "loss": 0.4836, + "step": 6996 + }, + { + "epoch": 0.87, + "grad_norm": 1.4569596184809448, + "learning_rate": 4.478260118001465e-07, + "loss": 0.4605, + "step": 6997 + }, + { + "epoch": 0.87, + "grad_norm": 2.4222433310926212, + "learning_rate": 4.4699505406389844e-07, + "loss": 0.5305, + "step": 6998 + }, + { + "epoch": 0.87, + "grad_norm": 1.9163666236933992, + "learning_rate": 4.4616483190569773e-07, + "loss": 0.5253, + "step": 6999 + }, + { + "epoch": 0.87, + "grad_norm": 1.4860749254374586, + "learning_rate": 4.4533534545967593e-07, + "loss": 0.4816, + "step": 7000 + }, + { + "epoch": 0.87, + "grad_norm": 2.977852488541996, + "learning_rate": 4.445065948598426e-07, + "loss": 0.5104, + "step": 7001 + }, + { + "epoch": 0.87, + "grad_norm": 1.4291979735578966, + "learning_rate": 4.436785802400906e-07, + "loss": 0.4516, + "step": 7002 + }, + { + "epoch": 0.87, + "grad_norm": 1.6554197655711902, + "learning_rate": 4.428513017341923e-07, + "loss": 0.496, + "step": 7003 + }, + { + "epoch": 0.87, + "grad_norm": 1.4514424167573279, + "learning_rate": 4.4202475947579984e-07, + "loss": 0.495, + "step": 7004 + }, + { + "epoch": 0.87, + "grad_norm": 1.3707185593181825, + "learning_rate": 4.411989535984529e-07, + "loss": 0.4912, + "step": 7005 + }, + { + "epoch": 0.87, + "grad_norm": 1.5072176077192196, + "learning_rate": 4.403738842355643e-07, + "loss": 0.4895, + "step": 7006 + }, + { + "epoch": 0.87, + "grad_norm": 1.4085864240181591, + "learning_rate": 4.3954955152043346e-07, + "loss": 0.4813, + "step": 7007 + }, + { + "epoch": 0.87, + "grad_norm": 1.8764392388366993, + "learning_rate": 4.387259555862361e-07, + "loss": 0.5289, + "step": 7008 + }, + { + "epoch": 0.87, + "grad_norm": 1.3581652441850534, + "learning_rate": 4.3790309656603356e-07, + "loss": 0.4587, + "step": 7009 + }, + { + "epoch": 0.87, + "grad_norm": 1.314257733383638, + "learning_rate": 4.3708097459276454e-07, + "loss": 0.4924, + "step": 7010 + }, + { + "epoch": 0.87, + "grad_norm": 1.4795930320292494, + "learning_rate": 4.3625958979925167e-07, + "loss": 0.5266, + "step": 7011 + }, + { + "epoch": 0.87, + "grad_norm": 1.6737719701590443, + "learning_rate": 4.354389423181948e-07, + "loss": 0.5088, + "step": 7012 + }, + { + "epoch": 0.87, + "grad_norm": 2.0460952816878635, + "learning_rate": 4.3461903228217806e-07, + "loss": 0.4388, + "step": 7013 + }, + { + "epoch": 0.87, + "grad_norm": 1.4735752787958016, + "learning_rate": 4.3379985982366537e-07, + "loss": 0.4596, + "step": 7014 + }, + { + "epoch": 0.87, + "grad_norm": 1.811600709993341, + "learning_rate": 4.329814250750003e-07, + "loss": 0.4417, + "step": 7015 + }, + { + "epoch": 0.87, + "grad_norm": 1.59423739643667, + "learning_rate": 4.321637281684099e-07, + "loss": 0.5272, + "step": 7016 + }, + { + "epoch": 0.87, + "grad_norm": 1.5071981707243922, + "learning_rate": 4.313467692359974e-07, + "loss": 0.5227, + "step": 7017 + }, + { + "epoch": 0.87, + "grad_norm": 1.3654563630416694, + "learning_rate": 4.3053054840975325e-07, + "loss": 0.4817, + "step": 7018 + }, + { + "epoch": 0.87, + "grad_norm": 2.694570195541426, + "learning_rate": 4.2971506582154155e-07, + "loss": 0.4642, + "step": 7019 + }, + { + "epoch": 0.87, + "grad_norm": 1.3821178205066278, + "learning_rate": 4.289003216031129e-07, + "loss": 0.5009, + "step": 7020 + }, + { + "epoch": 0.87, + "grad_norm": 0.6802113063898786, + "learning_rate": 4.2808631588609704e-07, + "loss": 0.4776, + "step": 7021 + }, + { + "epoch": 0.87, + "grad_norm": 1.5209079592364636, + "learning_rate": 4.272730488020016e-07, + "loss": 0.4953, + "step": 7022 + }, + { + "epoch": 0.87, + "grad_norm": 1.4538938283349732, + "learning_rate": 4.2646052048221866e-07, + "loss": 0.5018, + "step": 7023 + }, + { + "epoch": 0.87, + "grad_norm": 1.4972582845594824, + "learning_rate": 4.2564873105801816e-07, + "loss": 0.5248, + "step": 7024 + }, + { + "epoch": 0.87, + "grad_norm": 1.6705282520765294, + "learning_rate": 4.2483768066055365e-07, + "loss": 0.5378, + "step": 7025 + }, + { + "epoch": 0.87, + "grad_norm": 1.5286983823682547, + "learning_rate": 4.240273694208552e-07, + "loss": 0.476, + "step": 7026 + }, + { + "epoch": 0.87, + "grad_norm": 1.5329349887772938, + "learning_rate": 4.2321779746983706e-07, + "loss": 0.5087, + "step": 7027 + }, + { + "epoch": 0.87, + "grad_norm": 2.142670079847223, + "learning_rate": 4.224089649382923e-07, + "loss": 0.5523, + "step": 7028 + }, + { + "epoch": 0.87, + "grad_norm": 4.099696836371022, + "learning_rate": 4.216008719568948e-07, + "loss": 0.4539, + "step": 7029 + }, + { + "epoch": 0.87, + "grad_norm": 1.3233495269215116, + "learning_rate": 4.2079351865620013e-07, + "loss": 0.4979, + "step": 7030 + }, + { + "epoch": 0.87, + "grad_norm": 1.369596831259944, + "learning_rate": 4.1998690516664165e-07, + "loss": 0.4557, + "step": 7031 + }, + { + "epoch": 0.87, + "grad_norm": 1.6306367826899542, + "learning_rate": 4.191810316185374e-07, + "loss": 0.5057, + "step": 7032 + }, + { + "epoch": 0.87, + "grad_norm": 1.498312849387136, + "learning_rate": 4.183758981420799e-07, + "loss": 0.5031, + "step": 7033 + }, + { + "epoch": 0.87, + "grad_norm": 2.390904829889811, + "learning_rate": 4.17571504867349e-07, + "loss": 0.5169, + "step": 7034 + }, + { + "epoch": 0.87, + "grad_norm": 1.6042617395300853, + "learning_rate": 4.167678519242985e-07, + "loss": 0.4576, + "step": 7035 + }, + { + "epoch": 0.87, + "grad_norm": 5.941251307886222, + "learning_rate": 4.159649394427673e-07, + "loss": 0.5456, + "step": 7036 + }, + { + "epoch": 0.87, + "grad_norm": 1.5270596546982536, + "learning_rate": 4.151627675524733e-07, + "loss": 0.4076, + "step": 7037 + }, + { + "epoch": 0.87, + "grad_norm": 1.4227414585009857, + "learning_rate": 4.1436133638301346e-07, + "loss": 0.4517, + "step": 7038 + }, + { + "epoch": 0.87, + "grad_norm": 0.6963363973200976, + "learning_rate": 4.1356064606386694e-07, + "loss": 0.5013, + "step": 7039 + }, + { + "epoch": 0.87, + "grad_norm": 1.3894778049158558, + "learning_rate": 4.1276069672439135e-07, + "loss": 0.4925, + "step": 7040 + }, + { + "epoch": 0.87, + "grad_norm": 1.6235082248499837, + "learning_rate": 4.119614884938261e-07, + "loss": 0.4537, + "step": 7041 + }, + { + "epoch": 0.87, + "grad_norm": 1.6274598684100545, + "learning_rate": 4.111630215012902e-07, + "loss": 0.4917, + "step": 7042 + }, + { + "epoch": 0.87, + "grad_norm": 1.6183380914641294, + "learning_rate": 4.103652958757831e-07, + "loss": 0.505, + "step": 7043 + }, + { + "epoch": 0.87, + "grad_norm": 1.5196493987413147, + "learning_rate": 4.0956831174618403e-07, + "loss": 0.4673, + "step": 7044 + }, + { + "epoch": 0.87, + "grad_norm": 1.383596502398442, + "learning_rate": 4.0877206924125337e-07, + "loss": 0.4677, + "step": 7045 + }, + { + "epoch": 0.87, + "grad_norm": 1.8790719779796432, + "learning_rate": 4.0797656848963086e-07, + "loss": 0.5108, + "step": 7046 + }, + { + "epoch": 0.87, + "grad_norm": 1.4487511887778446, + "learning_rate": 4.0718180961983655e-07, + "loss": 0.472, + "step": 7047 + }, + { + "epoch": 0.87, + "grad_norm": 1.6373521545515435, + "learning_rate": 4.0638779276027106e-07, + "loss": 0.5152, + "step": 7048 + }, + { + "epoch": 0.87, + "grad_norm": 0.6613826241580701, + "learning_rate": 4.05594518039214e-07, + "loss": 0.4638, + "step": 7049 + }, + { + "epoch": 0.87, + "grad_norm": 2.698074527625873, + "learning_rate": 4.048019855848273e-07, + "loss": 0.4555, + "step": 7050 + }, + { + "epoch": 0.88, + "grad_norm": 0.6727169009641928, + "learning_rate": 4.040101955251502e-07, + "loss": 0.4903, + "step": 7051 + }, + { + "epoch": 0.88, + "grad_norm": 1.5725392983596518, + "learning_rate": 4.0321914798810423e-07, + "loss": 0.4994, + "step": 7052 + }, + { + "epoch": 0.88, + "grad_norm": 1.4342289383946316, + "learning_rate": 4.0242884310148945e-07, + "loss": 0.4319, + "step": 7053 + }, + { + "epoch": 0.88, + "grad_norm": 1.5702375613899564, + "learning_rate": 4.0163928099298644e-07, + "loss": 0.4696, + "step": 7054 + }, + { + "epoch": 0.88, + "grad_norm": 1.3036488520632732, + "learning_rate": 4.008504617901571e-07, + "loss": 0.4305, + "step": 7055 + }, + { + "epoch": 0.88, + "grad_norm": 0.6565508220144567, + "learning_rate": 4.0006238562044054e-07, + "loss": 0.476, + "step": 7056 + }, + { + "epoch": 0.88, + "grad_norm": 1.8813873177790947, + "learning_rate": 3.992750526111594e-07, + "loss": 0.5009, + "step": 7057 + }, + { + "epoch": 0.88, + "grad_norm": 1.313124075124981, + "learning_rate": 3.984884628895119e-07, + "loss": 0.4919, + "step": 7058 + }, + { + "epoch": 0.88, + "grad_norm": 2.3541457299851034, + "learning_rate": 3.9770261658258093e-07, + "loss": 0.5033, + "step": 7059 + }, + { + "epoch": 0.88, + "grad_norm": 1.4366559915321013, + "learning_rate": 3.9691751381732423e-07, + "loss": 0.4939, + "step": 7060 + }, + { + "epoch": 0.88, + "grad_norm": 1.5378230470666459, + "learning_rate": 3.961331547205838e-07, + "loss": 0.4741, + "step": 7061 + }, + { + "epoch": 0.88, + "grad_norm": 2.6909535505851894, + "learning_rate": 3.953495394190804e-07, + "loss": 0.5063, + "step": 7062 + }, + { + "epoch": 0.88, + "grad_norm": 1.3864880882834227, + "learning_rate": 3.9456666803941223e-07, + "loss": 0.4779, + "step": 7063 + }, + { + "epoch": 0.88, + "grad_norm": 1.451029212441224, + "learning_rate": 3.937845407080604e-07, + "loss": 0.4789, + "step": 7064 + }, + { + "epoch": 0.88, + "grad_norm": 1.5687085727703403, + "learning_rate": 3.930031575513837e-07, + "loss": 0.5399, + "step": 7065 + }, + { + "epoch": 0.88, + "grad_norm": 1.2924570237505366, + "learning_rate": 3.9222251869562187e-07, + "loss": 0.5184, + "step": 7066 + }, + { + "epoch": 0.88, + "grad_norm": 1.500961660790129, + "learning_rate": 3.9144262426689336e-07, + "loss": 0.4618, + "step": 7067 + }, + { + "epoch": 0.88, + "grad_norm": 1.7030292271341143, + "learning_rate": 3.90663474391198e-07, + "loss": 0.5014, + "step": 7068 + }, + { + "epoch": 0.88, + "grad_norm": 1.2696125199636845, + "learning_rate": 3.898850691944123e-07, + "loss": 0.4243, + "step": 7069 + }, + { + "epoch": 0.88, + "grad_norm": 1.5740029906704345, + "learning_rate": 3.891074088022978e-07, + "loss": 0.5329, + "step": 7070 + }, + { + "epoch": 0.88, + "grad_norm": 1.6799717738407427, + "learning_rate": 3.8833049334048964e-07, + "loss": 0.4688, + "step": 7071 + }, + { + "epoch": 0.88, + "grad_norm": 2.0888448681941014, + "learning_rate": 3.875543229345069e-07, + "loss": 0.473, + "step": 7072 + }, + { + "epoch": 0.88, + "grad_norm": 2.220257275918847, + "learning_rate": 3.8677889770974584e-07, + "loss": 0.4829, + "step": 7073 + }, + { + "epoch": 0.88, + "grad_norm": 1.9022525076310015, + "learning_rate": 3.8600421779148303e-07, + "loss": 0.5442, + "step": 7074 + }, + { + "epoch": 0.88, + "grad_norm": 1.4515938616789295, + "learning_rate": 3.85230283304876e-07, + "loss": 0.5364, + "step": 7075 + }, + { + "epoch": 0.88, + "grad_norm": 1.5089744744651061, + "learning_rate": 3.844570943749593e-07, + "loss": 0.4802, + "step": 7076 + }, + { + "epoch": 0.88, + "grad_norm": 1.5771912659647935, + "learning_rate": 3.83684651126649e-07, + "loss": 0.506, + "step": 7077 + }, + { + "epoch": 0.88, + "grad_norm": 1.5971997865078935, + "learning_rate": 3.829129536847398e-07, + "loss": 0.5213, + "step": 7078 + }, + { + "epoch": 0.88, + "grad_norm": 1.8669592100775223, + "learning_rate": 3.8214200217390806e-07, + "loss": 0.5021, + "step": 7079 + }, + { + "epoch": 0.88, + "grad_norm": 1.520345410466755, + "learning_rate": 3.8137179671870527e-07, + "loss": 0.5044, + "step": 7080 + }, + { + "epoch": 0.88, + "grad_norm": 1.3843402750451081, + "learning_rate": 3.8060233744356634e-07, + "loss": 0.484, + "step": 7081 + }, + { + "epoch": 0.88, + "grad_norm": 1.3083414725989773, + "learning_rate": 3.798336244728046e-07, + "loss": 0.4468, + "step": 7082 + }, + { + "epoch": 0.88, + "grad_norm": 1.2366682560363815, + "learning_rate": 3.790656579306101e-07, + "loss": 0.512, + "step": 7083 + }, + { + "epoch": 0.88, + "grad_norm": 2.3774229109773617, + "learning_rate": 3.7829843794105703e-07, + "loss": 0.4919, + "step": 7084 + }, + { + "epoch": 0.88, + "grad_norm": 2.2780907545876765, + "learning_rate": 3.7753196462809504e-07, + "loss": 0.4593, + "step": 7085 + }, + { + "epoch": 0.88, + "grad_norm": 0.6647475571312081, + "learning_rate": 3.767662381155551e-07, + "loss": 0.4928, + "step": 7086 + }, + { + "epoch": 0.88, + "grad_norm": 1.4554371973142786, + "learning_rate": 3.760012585271472e-07, + "loss": 0.5152, + "step": 7087 + }, + { + "epoch": 0.88, + "grad_norm": 1.2374825063762878, + "learning_rate": 3.7523702598646185e-07, + "loss": 0.4613, + "step": 7088 + }, + { + "epoch": 0.88, + "grad_norm": 1.246405451093617, + "learning_rate": 3.7447354061696474e-07, + "loss": 0.4897, + "step": 7089 + }, + { + "epoch": 0.88, + "grad_norm": 1.497491611425484, + "learning_rate": 3.737108025420061e-07, + "loss": 0.5095, + "step": 7090 + }, + { + "epoch": 0.88, + "grad_norm": 2.448953188662499, + "learning_rate": 3.7294881188481235e-07, + "loss": 0.5111, + "step": 7091 + }, + { + "epoch": 0.88, + "grad_norm": 1.3265262982691868, + "learning_rate": 3.721875687684884e-07, + "loss": 0.4747, + "step": 7092 + }, + { + "epoch": 0.88, + "grad_norm": 1.4006392357014956, + "learning_rate": 3.714270733160219e-07, + "loss": 0.4648, + "step": 7093 + }, + { + "epoch": 0.88, + "grad_norm": 1.4159650763549114, + "learning_rate": 3.706673256502746e-07, + "loss": 0.5373, + "step": 7094 + }, + { + "epoch": 0.88, + "grad_norm": 1.71334454708964, + "learning_rate": 3.6990832589399396e-07, + "loss": 0.4858, + "step": 7095 + }, + { + "epoch": 0.88, + "grad_norm": 1.4613986540202215, + "learning_rate": 3.6915007416980067e-07, + "loss": 0.4787, + "step": 7096 + }, + { + "epoch": 0.88, + "grad_norm": 1.439666219997754, + "learning_rate": 3.6839257060019895e-07, + "loss": 0.5414, + "step": 7097 + }, + { + "epoch": 0.88, + "grad_norm": 1.7148078587442241, + "learning_rate": 3.6763581530756766e-07, + "loss": 0.4941, + "step": 7098 + }, + { + "epoch": 0.88, + "grad_norm": 1.7614900950085237, + "learning_rate": 3.6687980841417005e-07, + "loss": 0.5396, + "step": 7099 + }, + { + "epoch": 0.88, + "grad_norm": 1.4900408278461268, + "learning_rate": 3.661245500421429e-07, + "loss": 0.5185, + "step": 7100 + }, + { + "epoch": 0.88, + "grad_norm": 3.410188054615284, + "learning_rate": 3.653700403135074e-07, + "loss": 0.478, + "step": 7101 + }, + { + "epoch": 0.88, + "grad_norm": 1.3467691276511204, + "learning_rate": 3.6461627935015897e-07, + "loss": 0.4917, + "step": 7102 + }, + { + "epoch": 0.88, + "grad_norm": 1.476580030239518, + "learning_rate": 3.638632672738757e-07, + "loss": 0.459, + "step": 7103 + }, + { + "epoch": 0.88, + "grad_norm": 1.6438546283641577, + "learning_rate": 3.6311100420631405e-07, + "loss": 0.4631, + "step": 7104 + }, + { + "epoch": 0.88, + "grad_norm": 1.9040075114903101, + "learning_rate": 3.623594902690064e-07, + "loss": 0.4624, + "step": 7105 + }, + { + "epoch": 0.88, + "grad_norm": 2.77764161446545, + "learning_rate": 3.616087255833689e-07, + "loss": 0.4732, + "step": 7106 + }, + { + "epoch": 0.88, + "grad_norm": 1.5382156221576366, + "learning_rate": 3.608587102706923e-07, + "loss": 0.4758, + "step": 7107 + }, + { + "epoch": 0.88, + "grad_norm": 1.880523836838473, + "learning_rate": 3.601094444521497e-07, + "loss": 0.5175, + "step": 7108 + }, + { + "epoch": 0.88, + "grad_norm": 1.319183728535753, + "learning_rate": 3.5936092824878976e-07, + "loss": 0.4914, + "step": 7109 + }, + { + "epoch": 0.88, + "grad_norm": 1.4221183759049574, + "learning_rate": 3.586131617815436e-07, + "loss": 0.527, + "step": 7110 + }, + { + "epoch": 0.88, + "grad_norm": 1.7200757512072127, + "learning_rate": 3.57866145171219e-07, + "loss": 0.4504, + "step": 7111 + }, + { + "epoch": 0.88, + "grad_norm": 1.5298123462961473, + "learning_rate": 3.571198785385027e-07, + "loss": 0.4757, + "step": 7112 + }, + { + "epoch": 0.88, + "grad_norm": 1.4777977381633165, + "learning_rate": 3.5637436200396115e-07, + "loss": 0.5434, + "step": 7113 + }, + { + "epoch": 0.88, + "grad_norm": 1.6301079688826285, + "learning_rate": 3.5562959568803843e-07, + "loss": 0.4849, + "step": 7114 + }, + { + "epoch": 0.88, + "grad_norm": 1.3511275125402105, + "learning_rate": 3.54885579711059e-07, + "loss": 0.5205, + "step": 7115 + }, + { + "epoch": 0.88, + "grad_norm": 1.4338245476889193, + "learning_rate": 3.541423141932238e-07, + "loss": 0.4767, + "step": 7116 + }, + { + "epoch": 0.88, + "grad_norm": 1.5251989790197307, + "learning_rate": 3.5339979925461566e-07, + "loss": 0.4685, + "step": 7117 + }, + { + "epoch": 0.88, + "grad_norm": 1.368497415552152, + "learning_rate": 3.526580350151931e-07, + "loss": 0.5087, + "step": 7118 + }, + { + "epoch": 0.88, + "grad_norm": 1.4030800124288458, + "learning_rate": 3.519170215947948e-07, + "loss": 0.5299, + "step": 7119 + }, + { + "epoch": 0.88, + "grad_norm": 1.8438357371279437, + "learning_rate": 3.511767591131393e-07, + "loss": 0.5338, + "step": 7120 + }, + { + "epoch": 0.88, + "grad_norm": 2.089390945220485, + "learning_rate": 3.504372476898205e-07, + "loss": 0.5054, + "step": 7121 + }, + { + "epoch": 0.88, + "grad_norm": 1.6357190621813416, + "learning_rate": 3.4969848744431487e-07, + "loss": 0.5339, + "step": 7122 + }, + { + "epoch": 0.88, + "grad_norm": 1.5665757084370637, + "learning_rate": 3.489604784959744e-07, + "loss": 0.4957, + "step": 7123 + }, + { + "epoch": 0.88, + "grad_norm": 1.4908392013334355, + "learning_rate": 3.482232209640318e-07, + "loss": 0.469, + "step": 7124 + }, + { + "epoch": 0.88, + "grad_norm": 1.5897939270204158, + "learning_rate": 3.474867149675959e-07, + "loss": 0.5276, + "step": 7125 + }, + { + "epoch": 0.88, + "grad_norm": 3.0001121398745645, + "learning_rate": 3.467509606256581e-07, + "loss": 0.519, + "step": 7126 + }, + { + "epoch": 0.88, + "grad_norm": 1.7415972576660335, + "learning_rate": 3.4601595805708375e-07, + "loss": 0.4467, + "step": 7127 + }, + { + "epoch": 0.88, + "grad_norm": 1.807146073626419, + "learning_rate": 3.4528170738062027e-07, + "loss": 0.5191, + "step": 7128 + }, + { + "epoch": 0.88, + "grad_norm": 0.6450389504343121, + "learning_rate": 3.445482087148927e-07, + "loss": 0.4805, + "step": 7129 + }, + { + "epoch": 0.88, + "grad_norm": 1.4979170319680737, + "learning_rate": 3.438154621784029e-07, + "loss": 0.4916, + "step": 7130 + }, + { + "epoch": 0.88, + "grad_norm": 1.6597403892980278, + "learning_rate": 3.4308346788953394e-07, + "loss": 0.4517, + "step": 7131 + }, + { + "epoch": 0.89, + "grad_norm": 1.4360278235152508, + "learning_rate": 3.423522259665446e-07, + "loss": 0.4837, + "step": 7132 + }, + { + "epoch": 0.89, + "grad_norm": 2.78841914191125, + "learning_rate": 3.4162173652757427e-07, + "loss": 0.5079, + "step": 7133 + }, + { + "epoch": 0.89, + "grad_norm": 1.7577408413013773, + "learning_rate": 3.4089199969063957e-07, + "loss": 0.5455, + "step": 7134 + }, + { + "epoch": 0.89, + "grad_norm": 1.8621486529156506, + "learning_rate": 3.4016301557363573e-07, + "loss": 0.5004, + "step": 7135 + }, + { + "epoch": 0.89, + "grad_norm": 3.3443240630108315, + "learning_rate": 3.3943478429433797e-07, + "loss": 0.4592, + "step": 7136 + }, + { + "epoch": 0.89, + "grad_norm": 1.5636160522143439, + "learning_rate": 3.38707305970396e-07, + "loss": 0.4867, + "step": 7137 + }, + { + "epoch": 0.89, + "grad_norm": 1.4797636371493548, + "learning_rate": 3.3798058071934315e-07, + "loss": 0.4948, + "step": 7138 + }, + { + "epoch": 0.89, + "grad_norm": 1.7344713320689402, + "learning_rate": 3.372546086585859e-07, + "loss": 0.4734, + "step": 7139 + }, + { + "epoch": 0.89, + "grad_norm": 1.8399894492094186, + "learning_rate": 3.365293899054134e-07, + "loss": 0.487, + "step": 7140 + }, + { + "epoch": 0.89, + "grad_norm": 1.6768159138284706, + "learning_rate": 3.35804924576989e-07, + "loss": 0.5342, + "step": 7141 + }, + { + "epoch": 0.89, + "grad_norm": 1.492573985054435, + "learning_rate": 3.3508121279035856e-07, + "loss": 0.4673, + "step": 7142 + }, + { + "epoch": 0.89, + "grad_norm": 1.8692692734840493, + "learning_rate": 3.3435825466244245e-07, + "loss": 0.5184, + "step": 7143 + }, + { + "epoch": 0.89, + "grad_norm": 10.846394080663176, + "learning_rate": 3.3363605031004167e-07, + "loss": 0.5093, + "step": 7144 + }, + { + "epoch": 0.89, + "grad_norm": 2.8655230755218826, + "learning_rate": 3.329145998498351e-07, + "loss": 0.529, + "step": 7145 + }, + { + "epoch": 0.89, + "grad_norm": 0.7549149164049279, + "learning_rate": 3.321939033983784e-07, + "loss": 0.5134, + "step": 7146 + }, + { + "epoch": 0.89, + "grad_norm": 1.7333913256290412, + "learning_rate": 3.314739610721074e-07, + "loss": 0.4907, + "step": 7147 + }, + { + "epoch": 0.89, + "grad_norm": 1.9064714456136993, + "learning_rate": 3.307547729873339e-07, + "loss": 0.4643, + "step": 7148 + }, + { + "epoch": 0.89, + "grad_norm": 1.3797398845123694, + "learning_rate": 3.3003633926025126e-07, + "loss": 0.4881, + "step": 7149 + }, + { + "epoch": 0.89, + "grad_norm": 1.465083071290942, + "learning_rate": 3.293186600069259e-07, + "loss": 0.4659, + "step": 7150 + }, + { + "epoch": 0.89, + "grad_norm": 1.6122896595208838, + "learning_rate": 3.2860173534330744e-07, + "loss": 0.5195, + "step": 7151 + }, + { + "epoch": 0.89, + "grad_norm": 1.5037231912700562, + "learning_rate": 3.2788556538522086e-07, + "loss": 0.4601, + "step": 7152 + }, + { + "epoch": 0.89, + "grad_norm": 1.7022555579379326, + "learning_rate": 3.2717015024836875e-07, + "loss": 0.453, + "step": 7153 + }, + { + "epoch": 0.89, + "grad_norm": 1.341508810355981, + "learning_rate": 3.2645549004833464e-07, + "loss": 0.5044, + "step": 7154 + }, + { + "epoch": 0.89, + "grad_norm": 1.378762619010031, + "learning_rate": 3.2574158490057573e-07, + "loss": 0.4611, + "step": 7155 + }, + { + "epoch": 0.89, + "grad_norm": 1.694921735627754, + "learning_rate": 3.2502843492043246e-07, + "loss": 0.5117, + "step": 7156 + }, + { + "epoch": 0.89, + "grad_norm": 1.6336333360011273, + "learning_rate": 3.243160402231177e-07, + "loss": 0.4878, + "step": 7157 + }, + { + "epoch": 0.89, + "grad_norm": 1.8195915630189345, + "learning_rate": 3.2360440092372716e-07, + "loss": 0.491, + "step": 7158 + }, + { + "epoch": 0.89, + "grad_norm": 1.472846216891738, + "learning_rate": 3.228935171372299e-07, + "loss": 0.5043, + "step": 7159 + }, + { + "epoch": 0.89, + "grad_norm": 1.5073396580443326, + "learning_rate": 3.221833889784792e-07, + "loss": 0.4583, + "step": 7160 + }, + { + "epoch": 0.89, + "grad_norm": 1.628352258905142, + "learning_rate": 3.214740165622005e-07, + "loss": 0.5192, + "step": 7161 + }, + { + "epoch": 0.89, + "grad_norm": 1.4070354758414292, + "learning_rate": 3.2076540000299804e-07, + "loss": 0.4868, + "step": 7162 + }, + { + "epoch": 0.89, + "grad_norm": 1.8743589664505378, + "learning_rate": 3.200575394153577e-07, + "loss": 0.4941, + "step": 7163 + }, + { + "epoch": 0.89, + "grad_norm": 1.6957401814278172, + "learning_rate": 3.1935043491363784e-07, + "loss": 0.458, + "step": 7164 + }, + { + "epoch": 0.89, + "grad_norm": 2.5618551792774675, + "learning_rate": 3.1864408661207935e-07, + "loss": 0.4657, + "step": 7165 + }, + { + "epoch": 0.89, + "grad_norm": 1.3676928671793762, + "learning_rate": 3.1793849462479817e-07, + "loss": 0.4656, + "step": 7166 + }, + { + "epoch": 0.89, + "grad_norm": 1.3381120850497739, + "learning_rate": 3.172336590657893e-07, + "loss": 0.5007, + "step": 7167 + }, + { + "epoch": 0.89, + "grad_norm": 1.7082831621344305, + "learning_rate": 3.165295800489238e-07, + "loss": 0.4443, + "step": 7168 + }, + { + "epoch": 0.89, + "grad_norm": 2.805985383532289, + "learning_rate": 3.158262576879545e-07, + "loss": 0.4938, + "step": 7169 + }, + { + "epoch": 0.89, + "grad_norm": 1.4085538252156595, + "learning_rate": 3.1512369209650673e-07, + "loss": 0.5054, + "step": 7170 + }, + { + "epoch": 0.89, + "grad_norm": 1.3710216311892396, + "learning_rate": 3.14421883388088e-07, + "loss": 0.423, + "step": 7171 + }, + { + "epoch": 0.89, + "grad_norm": 1.4943971761522419, + "learning_rate": 3.137208316760809e-07, + "loss": 0.4773, + "step": 7172 + }, + { + "epoch": 0.89, + "grad_norm": 1.3146847075274068, + "learning_rate": 3.1302053707374605e-07, + "loss": 0.5013, + "step": 7173 + }, + { + "epoch": 0.89, + "grad_norm": 1.5673236063433522, + "learning_rate": 3.123209996942228e-07, + "loss": 0.4954, + "step": 7174 + }, + { + "epoch": 0.89, + "grad_norm": 1.5592195204413026, + "learning_rate": 3.1162221965052753e-07, + "loss": 0.4778, + "step": 7175 + }, + { + "epoch": 0.89, + "grad_norm": 0.6967523625595746, + "learning_rate": 3.109241970555538e-07, + "loss": 0.4947, + "step": 7176 + }, + { + "epoch": 0.89, + "grad_norm": 1.4297009420714675, + "learning_rate": 3.1022693202207355e-07, + "loss": 0.4709, + "step": 7177 + }, + { + "epoch": 0.89, + "grad_norm": 1.7542963468229498, + "learning_rate": 3.095304246627373e-07, + "loss": 0.4753, + "step": 7178 + }, + { + "epoch": 0.89, + "grad_norm": 1.5299932504585612, + "learning_rate": 3.0883467509007005e-07, + "loss": 0.4826, + "step": 7179 + }, + { + "epoch": 0.89, + "grad_norm": 1.199621225915617, + "learning_rate": 3.081396834164774e-07, + "loss": 0.4789, + "step": 7180 + }, + { + "epoch": 0.89, + "grad_norm": 1.539053220866323, + "learning_rate": 3.0744544975424173e-07, + "loss": 0.467, + "step": 7181 + }, + { + "epoch": 0.89, + "grad_norm": 1.5034629040386387, + "learning_rate": 3.0675197421552116e-07, + "loss": 0.4767, + "step": 7182 + }, + { + "epoch": 0.89, + "grad_norm": 1.7777352958898378, + "learning_rate": 3.060592569123544e-07, + "loss": 0.4573, + "step": 7183 + }, + { + "epoch": 0.89, + "grad_norm": 1.5106190466274196, + "learning_rate": 3.0536729795665355e-07, + "loss": 0.5401, + "step": 7184 + }, + { + "epoch": 0.89, + "grad_norm": 2.4379789088763033, + "learning_rate": 3.046760974602136e-07, + "loss": 0.446, + "step": 7185 + }, + { + "epoch": 0.89, + "grad_norm": 1.2860703238892677, + "learning_rate": 3.039856555347026e-07, + "loss": 0.4643, + "step": 7186 + }, + { + "epoch": 0.89, + "grad_norm": 1.3835890592762499, + "learning_rate": 3.032959722916684e-07, + "loss": 0.4692, + "step": 7187 + }, + { + "epoch": 0.89, + "grad_norm": 0.6878989264050003, + "learning_rate": 3.026070478425336e-07, + "loss": 0.4983, + "step": 7188 + }, + { + "epoch": 0.89, + "grad_norm": 1.6771846553556036, + "learning_rate": 3.019188822986024e-07, + "loss": 0.4646, + "step": 7189 + }, + { + "epoch": 0.89, + "grad_norm": 1.6440426570746323, + "learning_rate": 3.012314757710527e-07, + "loss": 0.5425, + "step": 7190 + }, + { + "epoch": 0.89, + "grad_norm": 2.074279077892209, + "learning_rate": 3.0054482837094003e-07, + "loss": 0.4386, + "step": 7191 + }, + { + "epoch": 0.89, + "grad_norm": 1.5173362425262247, + "learning_rate": 2.9985894020919946e-07, + "loss": 0.504, + "step": 7192 + }, + { + "epoch": 0.89, + "grad_norm": 1.4264982703753875, + "learning_rate": 2.9917381139664245e-07, + "loss": 0.5233, + "step": 7193 + }, + { + "epoch": 0.89, + "grad_norm": 26.549343377170732, + "learning_rate": 2.9848944204395827e-07, + "loss": 0.5357, + "step": 7194 + }, + { + "epoch": 0.89, + "grad_norm": 1.4586810658158686, + "learning_rate": 2.978058322617111e-07, + "loss": 0.4955, + "step": 7195 + }, + { + "epoch": 0.89, + "grad_norm": 1.3883893237445497, + "learning_rate": 2.9712298216034553e-07, + "loss": 0.4755, + "step": 7196 + }, + { + "epoch": 0.89, + "grad_norm": 1.4159920892476956, + "learning_rate": 2.96440891850181e-07, + "loss": 0.5324, + "step": 7197 + }, + { + "epoch": 0.89, + "grad_norm": 1.415269551961386, + "learning_rate": 2.95759561441416e-07, + "loss": 0.5041, + "step": 7198 + }, + { + "epoch": 0.89, + "grad_norm": 1.5364140965549926, + "learning_rate": 2.950789910441254e-07, + "loss": 0.5059, + "step": 7199 + }, + { + "epoch": 0.89, + "grad_norm": 1.4547641326655592, + "learning_rate": 2.9439918076826003e-07, + "loss": 0.5275, + "step": 7200 + }, + { + "epoch": 0.89, + "grad_norm": 0.6340603974161589, + "learning_rate": 2.937201307236504e-07, + "loss": 0.4743, + "step": 7201 + }, + { + "epoch": 0.89, + "grad_norm": 1.6838431804403837, + "learning_rate": 2.930418410200031e-07, + "loss": 0.4303, + "step": 7202 + }, + { + "epoch": 0.89, + "grad_norm": 2.59902466849343, + "learning_rate": 2.923643117669023e-07, + "loss": 0.4981, + "step": 7203 + }, + { + "epoch": 0.89, + "grad_norm": 1.588337220386599, + "learning_rate": 2.91687543073807e-07, + "loss": 0.459, + "step": 7204 + }, + { + "epoch": 0.89, + "grad_norm": 1.8065695379788074, + "learning_rate": 2.9101153505005697e-07, + "loss": 0.4769, + "step": 7205 + }, + { + "epoch": 0.89, + "grad_norm": 1.2622077395268985, + "learning_rate": 2.9033628780486603e-07, + "loss": 0.4758, + "step": 7206 + }, + { + "epoch": 0.89, + "grad_norm": 1.2822251089559757, + "learning_rate": 2.8966180144732735e-07, + "loss": 0.4679, + "step": 7207 + }, + { + "epoch": 0.89, + "grad_norm": 2.7942440218571605, + "learning_rate": 2.889880760864089e-07, + "loss": 0.504, + "step": 7208 + }, + { + "epoch": 0.89, + "grad_norm": 1.5395160273565343, + "learning_rate": 2.883151118309574e-07, + "loss": 0.4623, + "step": 7209 + }, + { + "epoch": 0.89, + "grad_norm": 1.5010348696910691, + "learning_rate": 2.8764290878969757e-07, + "loss": 0.4999, + "step": 7210 + }, + { + "epoch": 0.89, + "grad_norm": 1.6289662605286688, + "learning_rate": 2.869714670712276e-07, + "loss": 0.4778, + "step": 7211 + }, + { + "epoch": 0.9, + "grad_norm": 0.6921986203870252, + "learning_rate": 2.8630078678402673e-07, + "loss": 0.4852, + "step": 7212 + }, + { + "epoch": 0.9, + "grad_norm": 1.584016313678421, + "learning_rate": 2.856308680364472e-07, + "loss": 0.4692, + "step": 7213 + }, + { + "epoch": 0.9, + "grad_norm": 1.4918658623049326, + "learning_rate": 2.8496171093672255e-07, + "loss": 0.5201, + "step": 7214 + }, + { + "epoch": 0.9, + "grad_norm": 1.5396443968278153, + "learning_rate": 2.842933155929589e-07, + "loss": 0.4347, + "step": 7215 + }, + { + "epoch": 0.9, + "grad_norm": 1.7675236400198393, + "learning_rate": 2.8362568211314334e-07, + "loss": 0.4514, + "step": 7216 + }, + { + "epoch": 0.9, + "grad_norm": 1.568592089505673, + "learning_rate": 2.8295881060513565e-07, + "loss": 0.5031, + "step": 7217 + }, + { + "epoch": 0.9, + "grad_norm": 1.4069262109055722, + "learning_rate": 2.8229270117667686e-07, + "loss": 0.5155, + "step": 7218 + }, + { + "epoch": 0.9, + "grad_norm": 1.39470701848657, + "learning_rate": 2.8162735393538197e-07, + "loss": 0.5019, + "step": 7219 + }, + { + "epoch": 0.9, + "grad_norm": 1.4321517994755029, + "learning_rate": 2.8096276898874344e-07, + "loss": 0.4615, + "step": 7220 + }, + { + "epoch": 0.9, + "grad_norm": 1.6475657395982446, + "learning_rate": 2.802989464441319e-07, + "loss": 0.4951, + "step": 7221 + }, + { + "epoch": 0.9, + "grad_norm": 1.3483897836284766, + "learning_rate": 2.796358864087922e-07, + "loss": 0.4781, + "step": 7222 + }, + { + "epoch": 0.9, + "grad_norm": 2.2149006601586705, + "learning_rate": 2.789735889898493e-07, + "loss": 0.501, + "step": 7223 + }, + { + "epoch": 0.9, + "grad_norm": 1.5858220803089245, + "learning_rate": 2.7831205429430133e-07, + "loss": 0.5271, + "step": 7224 + }, + { + "epoch": 0.9, + "grad_norm": 1.7325956961464044, + "learning_rate": 2.776512824290256e-07, + "loss": 0.5268, + "step": 7225 + }, + { + "epoch": 0.9, + "grad_norm": 1.7756493090951462, + "learning_rate": 2.769912735007768e-07, + "loss": 0.5266, + "step": 7226 + }, + { + "epoch": 0.9, + "grad_norm": 1.9930583301822953, + "learning_rate": 2.763320276161835e-07, + "loss": 0.495, + "step": 7227 + }, + { + "epoch": 0.9, + "grad_norm": 2.8473930690073925, + "learning_rate": 2.7567354488175433e-07, + "loss": 0.5277, + "step": 7228 + }, + { + "epoch": 0.9, + "grad_norm": 1.692660262075829, + "learning_rate": 2.750158254038715e-07, + "loss": 0.525, + "step": 7229 + }, + { + "epoch": 0.9, + "grad_norm": 1.3647834288816196, + "learning_rate": 2.7435886928879663e-07, + "loss": 0.4567, + "step": 7230 + }, + { + "epoch": 0.9, + "grad_norm": 2.1593877264595376, + "learning_rate": 2.737026766426654e-07, + "loss": 0.5127, + "step": 7231 + }, + { + "epoch": 0.9, + "grad_norm": 1.3555113213082137, + "learning_rate": 2.730472475714929e-07, + "loss": 0.4757, + "step": 7232 + }, + { + "epoch": 0.9, + "grad_norm": 1.4278182115249216, + "learning_rate": 2.723925821811685e-07, + "loss": 0.489, + "step": 7233 + }, + { + "epoch": 0.9, + "grad_norm": 1.8923024576982832, + "learning_rate": 2.717386805774591e-07, + "loss": 0.5259, + "step": 7234 + }, + { + "epoch": 0.9, + "grad_norm": 1.3922482113704302, + "learning_rate": 2.710855428660097e-07, + "loss": 0.4448, + "step": 7235 + }, + { + "epoch": 0.9, + "grad_norm": 1.50956630158752, + "learning_rate": 2.7043316915233874e-07, + "loss": 0.497, + "step": 7236 + }, + { + "epoch": 0.9, + "grad_norm": 2.295771368551603, + "learning_rate": 2.6978155954184403e-07, + "loss": 0.4751, + "step": 7237 + }, + { + "epoch": 0.9, + "grad_norm": 1.3927473634808414, + "learning_rate": 2.691307141397975e-07, + "loss": 0.4655, + "step": 7238 + }, + { + "epoch": 0.9, + "grad_norm": 1.4048674764987086, + "learning_rate": 2.684806330513512e-07, + "loss": 0.4892, + "step": 7239 + }, + { + "epoch": 0.9, + "grad_norm": 3.1409357943365657, + "learning_rate": 2.6783131638152883e-07, + "loss": 0.5432, + "step": 7240 + }, + { + "epoch": 0.9, + "grad_norm": 1.547539150362807, + "learning_rate": 2.6718276423523593e-07, + "loss": 0.489, + "step": 7241 + }, + { + "epoch": 0.9, + "grad_norm": 1.3301631048062446, + "learning_rate": 2.665349767172493e-07, + "loss": 0.5291, + "step": 7242 + }, + { + "epoch": 0.9, + "grad_norm": 2.391096724620809, + "learning_rate": 2.6588795393222566e-07, + "loss": 0.4868, + "step": 7243 + }, + { + "epoch": 0.9, + "grad_norm": 1.7189858035690682, + "learning_rate": 2.6524169598469816e-07, + "loss": 0.5221, + "step": 7244 + }, + { + "epoch": 0.9, + "grad_norm": 1.4065853979337697, + "learning_rate": 2.6459620297907386e-07, + "loss": 0.5053, + "step": 7245 + }, + { + "epoch": 0.9, + "grad_norm": 1.589793540733917, + "learning_rate": 2.639514750196398e-07, + "loss": 0.5439, + "step": 7246 + }, + { + "epoch": 0.9, + "grad_norm": 1.694368040776559, + "learning_rate": 2.633075122105555e-07, + "loss": 0.503, + "step": 7247 + }, + { + "epoch": 0.9, + "grad_norm": 1.3267658279966987, + "learning_rate": 2.626643146558605e-07, + "loss": 0.5074, + "step": 7248 + }, + { + "epoch": 0.9, + "grad_norm": 1.586372075700058, + "learning_rate": 2.6202188245946726e-07, + "loss": 0.4962, + "step": 7249 + }, + { + "epoch": 0.9, + "grad_norm": 1.3828261644604938, + "learning_rate": 2.6138021572516715e-07, + "loss": 0.4942, + "step": 7250 + }, + { + "epoch": 0.9, + "grad_norm": 3.4093827303195634, + "learning_rate": 2.607393145566284e-07, + "loss": 0.4875, + "step": 7251 + }, + { + "epoch": 0.9, + "grad_norm": 1.9936764361454529, + "learning_rate": 2.6009917905739203e-07, + "loss": 0.5106, + "step": 7252 + }, + { + "epoch": 0.9, + "grad_norm": 1.422134201103214, + "learning_rate": 2.594598093308792e-07, + "loss": 0.4947, + "step": 7253 + }, + { + "epoch": 0.9, + "grad_norm": 1.2892958431167894, + "learning_rate": 2.588212054803846e-07, + "loss": 0.4645, + "step": 7254 + }, + { + "epoch": 0.9, + "grad_norm": 3.471846303409366, + "learning_rate": 2.5818336760908115e-07, + "loss": 0.5207, + "step": 7255 + }, + { + "epoch": 0.9, + "grad_norm": 1.3972002061205235, + "learning_rate": 2.5754629582001644e-07, + "loss": 0.5183, + "step": 7256 + }, + { + "epoch": 0.9, + "grad_norm": 1.4677880478954894, + "learning_rate": 2.569099902161165e-07, + "loss": 0.4941, + "step": 7257 + }, + { + "epoch": 0.9, + "grad_norm": 1.985567571670727, + "learning_rate": 2.5627445090017913e-07, + "loss": 0.4583, + "step": 7258 + }, + { + "epoch": 0.9, + "grad_norm": 1.5235852187263925, + "learning_rate": 2.5563967797488487e-07, + "loss": 0.4655, + "step": 7259 + }, + { + "epoch": 0.9, + "grad_norm": 1.677128208654452, + "learning_rate": 2.5500567154278567e-07, + "loss": 0.5045, + "step": 7260 + }, + { + "epoch": 0.9, + "grad_norm": 1.3204359303408124, + "learning_rate": 2.54372431706309e-07, + "loss": 0.5068, + "step": 7261 + }, + { + "epoch": 0.9, + "grad_norm": 1.5599365732510986, + "learning_rate": 2.537399585677625e-07, + "loss": 0.5113, + "step": 7262 + }, + { + "epoch": 0.9, + "grad_norm": 1.4635723228362618, + "learning_rate": 2.531082522293266e-07, + "loss": 0.5076, + "step": 7263 + }, + { + "epoch": 0.9, + "grad_norm": 1.4179714784792024, + "learning_rate": 2.524773127930602e-07, + "loss": 0.4573, + "step": 7264 + }, + { + "epoch": 0.9, + "grad_norm": 1.6028209650618992, + "learning_rate": 2.518471403608952e-07, + "loss": 0.5354, + "step": 7265 + }, + { + "epoch": 0.9, + "grad_norm": 0.6732471951768726, + "learning_rate": 2.512177350346434e-07, + "loss": 0.5216, + "step": 7266 + }, + { + "epoch": 0.9, + "grad_norm": 2.705772586239429, + "learning_rate": 2.5058909691598966e-07, + "loss": 0.5521, + "step": 7267 + }, + { + "epoch": 0.9, + "grad_norm": 1.3672127403570247, + "learning_rate": 2.499612261064971e-07, + "loss": 0.4545, + "step": 7268 + }, + { + "epoch": 0.9, + "grad_norm": 1.7575170328414065, + "learning_rate": 2.4933412270760295e-07, + "loss": 0.5077, + "step": 7269 + }, + { + "epoch": 0.9, + "grad_norm": 1.9858809443827865, + "learning_rate": 2.487077868206206e-07, + "loss": 0.5322, + "step": 7270 + }, + { + "epoch": 0.9, + "grad_norm": 1.4890330148958661, + "learning_rate": 2.4808221854674185e-07, + "loss": 0.493, + "step": 7271 + }, + { + "epoch": 0.9, + "grad_norm": 1.3164375874859087, + "learning_rate": 2.4745741798703094e-07, + "loss": 0.4836, + "step": 7272 + }, + { + "epoch": 0.9, + "grad_norm": 1.4383171376568136, + "learning_rate": 2.4683338524243204e-07, + "loss": 0.4913, + "step": 7273 + }, + { + "epoch": 0.9, + "grad_norm": 1.578046977346187, + "learning_rate": 2.4621012041376013e-07, + "loss": 0.4647, + "step": 7274 + }, + { + "epoch": 0.9, + "grad_norm": 1.6640790823621678, + "learning_rate": 2.455876236017124e-07, + "loss": 0.4695, + "step": 7275 + }, + { + "epoch": 0.9, + "grad_norm": 1.5343616831889717, + "learning_rate": 2.4496589490685617e-07, + "loss": 0.4824, + "step": 7276 + }, + { + "epoch": 0.9, + "grad_norm": 1.712511496470845, + "learning_rate": 2.4434493442963944e-07, + "loss": 0.5031, + "step": 7277 + }, + { + "epoch": 0.9, + "grad_norm": 1.2402210784873, + "learning_rate": 2.4372474227038145e-07, + "loss": 0.4437, + "step": 7278 + }, + { + "epoch": 0.9, + "grad_norm": 1.4835344448500538, + "learning_rate": 2.43105318529282e-07, + "loss": 0.4807, + "step": 7279 + }, + { + "epoch": 0.9, + "grad_norm": 1.6450460043573967, + "learning_rate": 2.424866633064127e-07, + "loss": 0.4643, + "step": 7280 + }, + { + "epoch": 0.9, + "grad_norm": 1.4435211933583696, + "learning_rate": 2.418687767017236e-07, + "loss": 0.5171, + "step": 7281 + }, + { + "epoch": 0.9, + "grad_norm": 2.4803283735920534, + "learning_rate": 2.412516588150393e-07, + "loss": 0.4445, + "step": 7282 + }, + { + "epoch": 0.9, + "grad_norm": 1.6756255272826075, + "learning_rate": 2.406353097460601e-07, + "loss": 0.5409, + "step": 7283 + }, + { + "epoch": 0.9, + "grad_norm": 1.4945053637205326, + "learning_rate": 2.400197295943646e-07, + "loss": 0.4947, + "step": 7284 + }, + { + "epoch": 0.9, + "grad_norm": 1.5668521984285715, + "learning_rate": 2.394049184594027e-07, + "loss": 0.4485, + "step": 7285 + }, + { + "epoch": 0.9, + "grad_norm": 0.6935285110259156, + "learning_rate": 2.3879087644050503e-07, + "loss": 0.5001, + "step": 7286 + }, + { + "epoch": 0.9, + "grad_norm": 1.478287517350354, + "learning_rate": 2.3817760363687382e-07, + "loss": 0.5318, + "step": 7287 + }, + { + "epoch": 0.9, + "grad_norm": 1.5620851531817914, + "learning_rate": 2.3756510014758926e-07, + "loss": 0.5353, + "step": 7288 + }, + { + "epoch": 0.9, + "grad_norm": 1.5637832675029486, + "learning_rate": 2.3695336607160668e-07, + "loss": 0.4819, + "step": 7289 + }, + { + "epoch": 0.9, + "grad_norm": 1.6615999181013263, + "learning_rate": 2.3634240150775645e-07, + "loss": 0.5141, + "step": 7290 + }, + { + "epoch": 0.9, + "grad_norm": 2.0650508896497906, + "learning_rate": 2.3573220655474572e-07, + "loss": 0.5006, + "step": 7291 + }, + { + "epoch": 0.9, + "grad_norm": 1.5867828142828393, + "learning_rate": 2.351227813111573e-07, + "loss": 0.4795, + "step": 7292 + }, + { + "epoch": 0.91, + "grad_norm": 0.677807090950925, + "learning_rate": 2.3451412587544908e-07, + "loss": 0.4779, + "step": 7293 + }, + { + "epoch": 0.91, + "grad_norm": 1.3721886541775619, + "learning_rate": 2.3390624034595356e-07, + "loss": 0.5185, + "step": 7294 + }, + { + "epoch": 0.91, + "grad_norm": 1.4681451485285608, + "learning_rate": 2.3329912482088213e-07, + "loss": 0.5293, + "step": 7295 + }, + { + "epoch": 0.91, + "grad_norm": 1.4934042079383747, + "learning_rate": 2.3269277939831748e-07, + "loss": 0.4729, + "step": 7296 + }, + { + "epoch": 0.91, + "grad_norm": 4.146107040808323, + "learning_rate": 2.3208720417622177e-07, + "loss": 0.4677, + "step": 7297 + }, + { + "epoch": 0.91, + "grad_norm": 1.306195733810811, + "learning_rate": 2.3148239925243067e-07, + "loss": 0.4468, + "step": 7298 + }, + { + "epoch": 0.91, + "grad_norm": 1.4923955412993348, + "learning_rate": 2.3087836472465319e-07, + "loss": 0.4738, + "step": 7299 + }, + { + "epoch": 0.91, + "grad_norm": 1.7773578519524955, + "learning_rate": 2.302751006904802e-07, + "loss": 0.5329, + "step": 7300 + }, + { + "epoch": 0.91, + "grad_norm": 1.4199512023896965, + "learning_rate": 2.296726072473726e-07, + "loss": 0.4481, + "step": 7301 + }, + { + "epoch": 0.91, + "grad_norm": 1.4258423801499063, + "learning_rate": 2.2907088449266867e-07, + "loss": 0.5177, + "step": 7302 + }, + { + "epoch": 0.91, + "grad_norm": 1.5231668326365035, + "learning_rate": 2.2846993252358119e-07, + "loss": 0.5138, + "step": 7303 + }, + { + "epoch": 0.91, + "grad_norm": 1.4672551656917383, + "learning_rate": 2.278697514372008e-07, + "loss": 0.4154, + "step": 7304 + }, + { + "epoch": 0.91, + "grad_norm": 1.6189360069251224, + "learning_rate": 2.2727034133049054e-07, + "loss": 0.5066, + "step": 7305 + }, + { + "epoch": 0.91, + "grad_norm": 2.1915601216281146, + "learning_rate": 2.2667170230029177e-07, + "loss": 0.532, + "step": 7306 + }, + { + "epoch": 0.91, + "grad_norm": 1.593834724413796, + "learning_rate": 2.2607383444331888e-07, + "loss": 0.5509, + "step": 7307 + }, + { + "epoch": 0.91, + "grad_norm": 1.7054554094812038, + "learning_rate": 2.2547673785616285e-07, + "loss": 0.4572, + "step": 7308 + }, + { + "epoch": 0.91, + "grad_norm": 1.8211515336626856, + "learning_rate": 2.2488041263529048e-07, + "loss": 0.4482, + "step": 7309 + }, + { + "epoch": 0.91, + "grad_norm": 1.648506058182298, + "learning_rate": 2.2428485887704244e-07, + "loss": 0.4783, + "step": 7310 + }, + { + "epoch": 0.91, + "grad_norm": 1.7043440299302586, + "learning_rate": 2.2369007667763676e-07, + "loss": 0.5503, + "step": 7311 + }, + { + "epoch": 0.91, + "grad_norm": 1.493921056568484, + "learning_rate": 2.2309606613316438e-07, + "loss": 0.4892, + "step": 7312 + }, + { + "epoch": 0.91, + "grad_norm": 1.5789653486264432, + "learning_rate": 2.2250282733959462e-07, + "loss": 0.4903, + "step": 7313 + }, + { + "epoch": 0.91, + "grad_norm": 1.2578843844561818, + "learning_rate": 2.2191036039276914e-07, + "loss": 0.4967, + "step": 7314 + }, + { + "epoch": 0.91, + "grad_norm": 1.4787303637879343, + "learning_rate": 2.213186653884064e-07, + "loss": 0.554, + "step": 7315 + }, + { + "epoch": 0.91, + "grad_norm": 1.4311610016698448, + "learning_rate": 2.2072774242210048e-07, + "loss": 0.4968, + "step": 7316 + }, + { + "epoch": 0.91, + "grad_norm": 1.8183962667671558, + "learning_rate": 2.2013759158931947e-07, + "loss": 0.4779, + "step": 7317 + }, + { + "epoch": 0.91, + "grad_norm": 2.241204665605121, + "learning_rate": 2.1954821298540873e-07, + "loss": 0.4723, + "step": 7318 + }, + { + "epoch": 0.91, + "grad_norm": 2.584852717960534, + "learning_rate": 2.1895960670558548e-07, + "loss": 0.5204, + "step": 7319 + }, + { + "epoch": 0.91, + "grad_norm": 2.254694152059909, + "learning_rate": 2.1837177284494637e-07, + "loss": 0.5354, + "step": 7320 + }, + { + "epoch": 0.91, + "grad_norm": 1.5623354567276635, + "learning_rate": 2.1778471149845938e-07, + "loss": 0.5041, + "step": 7321 + }, + { + "epoch": 0.91, + "grad_norm": 2.440466493552607, + "learning_rate": 2.171984227609708e-07, + "loss": 0.4777, + "step": 7322 + }, + { + "epoch": 0.91, + "grad_norm": 1.4591778128934314, + "learning_rate": 2.1661290672719993e-07, + "loss": 0.4227, + "step": 7323 + }, + { + "epoch": 0.91, + "grad_norm": 1.335574212901139, + "learning_rate": 2.160281634917416e-07, + "loss": 0.452, + "step": 7324 + }, + { + "epoch": 0.91, + "grad_norm": 1.5346708718374296, + "learning_rate": 2.1544419314906805e-07, + "loss": 0.4654, + "step": 7325 + }, + { + "epoch": 0.91, + "grad_norm": 2.117331882237975, + "learning_rate": 2.1486099579352327e-07, + "loss": 0.5138, + "step": 7326 + }, + { + "epoch": 0.91, + "grad_norm": 1.476418156466487, + "learning_rate": 2.1427857151932851e-07, + "loss": 0.4655, + "step": 7327 + }, + { + "epoch": 0.91, + "grad_norm": 2.0906591898100957, + "learning_rate": 2.136969204205791e-07, + "loss": 0.5024, + "step": 7328 + }, + { + "epoch": 0.91, + "grad_norm": 1.491630283610432, + "learning_rate": 2.1311604259124653e-07, + "loss": 0.4872, + "step": 7329 + }, + { + "epoch": 0.91, + "grad_norm": 1.5464114719601534, + "learning_rate": 2.125359381251757e-07, + "loss": 0.5307, + "step": 7330 + }, + { + "epoch": 0.91, + "grad_norm": 1.408479255570862, + "learning_rate": 2.1195660711608944e-07, + "loss": 0.512, + "step": 7331 + }, + { + "epoch": 0.91, + "grad_norm": 2.005446125069675, + "learning_rate": 2.113780496575818e-07, + "loss": 0.4561, + "step": 7332 + }, + { + "epoch": 0.91, + "grad_norm": 1.7105871509101782, + "learning_rate": 2.1080026584312407e-07, + "loss": 0.4906, + "step": 7333 + }, + { + "epoch": 0.91, + "grad_norm": 1.6551695201751928, + "learning_rate": 2.102232557660644e-07, + "loss": 0.4628, + "step": 7334 + }, + { + "epoch": 0.91, + "grad_norm": 1.515246530641419, + "learning_rate": 2.0964701951962095e-07, + "loss": 0.4869, + "step": 7335 + }, + { + "epoch": 0.91, + "grad_norm": 1.3945580750786308, + "learning_rate": 2.0907155719689208e-07, + "loss": 0.4854, + "step": 7336 + }, + { + "epoch": 0.91, + "grad_norm": 1.4034244421481015, + "learning_rate": 2.0849686889084786e-07, + "loss": 0.459, + "step": 7337 + }, + { + "epoch": 0.91, + "grad_norm": 1.5868486583384376, + "learning_rate": 2.07922954694334e-07, + "loss": 0.4551, + "step": 7338 + }, + { + "epoch": 0.91, + "grad_norm": 1.6710749272297594, + "learning_rate": 2.0734981470007133e-07, + "loss": 0.5157, + "step": 7339 + }, + { + "epoch": 0.91, + "grad_norm": 1.874129184801048, + "learning_rate": 2.0677744900065633e-07, + "loss": 0.5156, + "step": 7340 + }, + { + "epoch": 0.91, + "grad_norm": 1.8234472645431714, + "learning_rate": 2.0620585768855949e-07, + "loss": 0.4908, + "step": 7341 + }, + { + "epoch": 0.91, + "grad_norm": 1.3063947819347423, + "learning_rate": 2.0563504085612583e-07, + "loss": 0.4893, + "step": 7342 + }, + { + "epoch": 0.91, + "grad_norm": 2.418876188696213, + "learning_rate": 2.0506499859557717e-07, + "loss": 0.449, + "step": 7343 + }, + { + "epoch": 0.91, + "grad_norm": 1.4762691073800227, + "learning_rate": 2.04495730999007e-07, + "loss": 0.493, + "step": 7344 + }, + { + "epoch": 0.91, + "grad_norm": 1.554007050981706, + "learning_rate": 2.0392723815838734e-07, + "loss": 0.499, + "step": 7345 + }, + { + "epoch": 0.91, + "grad_norm": 1.3702325490919527, + "learning_rate": 2.0335952016556193e-07, + "loss": 0.4782, + "step": 7346 + }, + { + "epoch": 0.91, + "grad_norm": 1.6724464755354924, + "learning_rate": 2.0279257711225186e-07, + "loss": 0.4829, + "step": 7347 + }, + { + "epoch": 0.91, + "grad_norm": 1.5667356213060892, + "learning_rate": 2.0222640909004942e-07, + "loss": 0.4719, + "step": 7348 + }, + { + "epoch": 0.91, + "grad_norm": 1.3072445321244563, + "learning_rate": 2.0166101619042643e-07, + "loss": 0.4947, + "step": 7349 + }, + { + "epoch": 0.91, + "grad_norm": 1.2520941586893453, + "learning_rate": 2.0109639850472706e-07, + "loss": 0.4779, + "step": 7350 + }, + { + "epoch": 0.91, + "grad_norm": 2.4133855463711558, + "learning_rate": 2.0053255612416832e-07, + "loss": 0.4619, + "step": 7351 + }, + { + "epoch": 0.91, + "grad_norm": 1.4728280235748008, + "learning_rate": 1.9996948913984626e-07, + "loss": 0.4631, + "step": 7352 + }, + { + "epoch": 0.91, + "grad_norm": 0.6765788312000822, + "learning_rate": 1.99407197642727e-07, + "loss": 0.4636, + "step": 7353 + }, + { + "epoch": 0.91, + "grad_norm": 1.9494597592574943, + "learning_rate": 1.988456817236556e-07, + "loss": 0.4529, + "step": 7354 + }, + { + "epoch": 0.91, + "grad_norm": 1.443993759050577, + "learning_rate": 1.9828494147334843e-07, + "loss": 0.4866, + "step": 7355 + }, + { + "epoch": 0.91, + "grad_norm": 1.5748048125426386, + "learning_rate": 1.977249769823991e-07, + "loss": 0.4593, + "step": 7356 + }, + { + "epoch": 0.91, + "grad_norm": 1.3756291053600451, + "learning_rate": 1.9716578834127366e-07, + "loss": 0.5024, + "step": 7357 + }, + { + "epoch": 0.91, + "grad_norm": 1.639060653594591, + "learning_rate": 1.966073756403153e-07, + "loss": 0.4482, + "step": 7358 + }, + { + "epoch": 0.91, + "grad_norm": 1.3952555902533088, + "learning_rate": 1.9604973896974022e-07, + "loss": 0.4986, + "step": 7359 + }, + { + "epoch": 0.91, + "grad_norm": 1.8896722139396491, + "learning_rate": 1.9549287841963915e-07, + "loss": 0.502, + "step": 7360 + }, + { + "epoch": 0.91, + "grad_norm": 1.5602669134106195, + "learning_rate": 1.949367940799779e-07, + "loss": 0.5039, + "step": 7361 + }, + { + "epoch": 0.91, + "grad_norm": 2.7246978242341773, + "learning_rate": 1.9438148604059627e-07, + "loss": 0.4685, + "step": 7362 + }, + { + "epoch": 0.91, + "grad_norm": 2.2521584103521244, + "learning_rate": 1.9382695439121024e-07, + "loss": 0.4646, + "step": 7363 + }, + { + "epoch": 0.91, + "grad_norm": 1.9572783096995119, + "learning_rate": 1.9327319922140819e-07, + "loss": 0.5122, + "step": 7364 + }, + { + "epoch": 0.91, + "grad_norm": 1.3215317375129358, + "learning_rate": 1.9272022062065466e-07, + "loss": 0.4642, + "step": 7365 + }, + { + "epoch": 0.91, + "grad_norm": 1.3334449214336, + "learning_rate": 1.9216801867828817e-07, + "loss": 0.4742, + "step": 7366 + }, + { + "epoch": 0.91, + "grad_norm": 3.0968260435500023, + "learning_rate": 1.916165934835229e-07, + "loss": 0.456, + "step": 7367 + }, + { + "epoch": 0.91, + "grad_norm": 1.390221874520958, + "learning_rate": 1.9106594512544485e-07, + "loss": 0.472, + "step": 7368 + }, + { + "epoch": 0.91, + "grad_norm": 1.3826685396985134, + "learning_rate": 1.9051607369301616e-07, + "loss": 0.495, + "step": 7369 + }, + { + "epoch": 0.91, + "grad_norm": 1.6508771302483614, + "learning_rate": 1.899669792750747e-07, + "loss": 0.5044, + "step": 7370 + }, + { + "epoch": 0.91, + "grad_norm": 1.5330385791261132, + "learning_rate": 1.8941866196032998e-07, + "loss": 0.4936, + "step": 7371 + }, + { + "epoch": 0.91, + "grad_norm": 1.7707144138711233, + "learning_rate": 1.88871121837369e-07, + "loss": 0.486, + "step": 7372 + }, + { + "epoch": 0.91, + "grad_norm": 1.262247496461452, + "learning_rate": 1.8832435899464984e-07, + "loss": 0.4518, + "step": 7373 + }, + { + "epoch": 0.92, + "grad_norm": 1.3873141734746803, + "learning_rate": 1.8777837352050908e-07, + "loss": 0.47, + "step": 7374 + }, + { + "epoch": 0.92, + "grad_norm": 1.3693940975255388, + "learning_rate": 1.872331655031534e-07, + "loss": 0.5038, + "step": 7375 + }, + { + "epoch": 0.92, + "grad_norm": 2.0349987280215167, + "learning_rate": 1.8668873503066786e-07, + "loss": 0.4869, + "step": 7376 + }, + { + "epoch": 0.92, + "grad_norm": 1.6231415884536113, + "learning_rate": 1.861450821910088e-07, + "loss": 0.4838, + "step": 7377 + }, + { + "epoch": 0.92, + "grad_norm": 1.6188759308500793, + "learning_rate": 1.856022070720076e-07, + "loss": 0.51, + "step": 7378 + }, + { + "epoch": 0.92, + "grad_norm": 1.2550870322710346, + "learning_rate": 1.8506010976137244e-07, + "loss": 0.4649, + "step": 7379 + }, + { + "epoch": 0.92, + "grad_norm": 1.3361841719564518, + "learning_rate": 1.8451879034668163e-07, + "loss": 0.4599, + "step": 7380 + }, + { + "epoch": 0.92, + "grad_norm": 2.7603972626571474, + "learning_rate": 1.839782489153913e-07, + "loss": 0.5608, + "step": 7381 + }, + { + "epoch": 0.92, + "grad_norm": 1.786862186824463, + "learning_rate": 1.83438485554831e-07, + "loss": 0.5361, + "step": 7382 + }, + { + "epoch": 0.92, + "grad_norm": 1.3992880337173184, + "learning_rate": 1.828995003522044e-07, + "loss": 0.5047, + "step": 7383 + }, + { + "epoch": 0.92, + "grad_norm": 1.3833373236184248, + "learning_rate": 1.8236129339458787e-07, + "loss": 0.4584, + "step": 7384 + }, + { + "epoch": 0.92, + "grad_norm": 1.6915427558700735, + "learning_rate": 1.8182386476893467e-07, + "loss": 0.4778, + "step": 7385 + }, + { + "epoch": 0.92, + "grad_norm": 1.5895090237630853, + "learning_rate": 1.8128721456207088e-07, + "loss": 0.4951, + "step": 7386 + }, + { + "epoch": 0.92, + "grad_norm": 1.7293079354326406, + "learning_rate": 1.8075134286069718e-07, + "loss": 0.4858, + "step": 7387 + }, + { + "epoch": 0.92, + "grad_norm": 1.465841554108314, + "learning_rate": 1.802162497513882e-07, + "loss": 0.4851, + "step": 7388 + }, + { + "epoch": 0.92, + "grad_norm": 1.6913645552862504, + "learning_rate": 1.7968193532059197e-07, + "loss": 0.4761, + "step": 7389 + }, + { + "epoch": 0.92, + "grad_norm": 1.3774108786510824, + "learning_rate": 1.7914839965463339e-07, + "loss": 0.4674, + "step": 7390 + }, + { + "epoch": 0.92, + "grad_norm": 1.4307532756136354, + "learning_rate": 1.78615642839709e-07, + "loss": 0.4621, + "step": 7391 + }, + { + "epoch": 0.92, + "grad_norm": 1.4096128289356364, + "learning_rate": 1.780836649618911e-07, + "loss": 0.5172, + "step": 7392 + }, + { + "epoch": 0.92, + "grad_norm": 1.7973737870861728, + "learning_rate": 1.7755246610712372e-07, + "loss": 0.4674, + "step": 7393 + }, + { + "epoch": 0.92, + "grad_norm": 1.388887248954646, + "learning_rate": 1.7702204636122878e-07, + "loss": 0.4694, + "step": 7394 + }, + { + "epoch": 0.92, + "grad_norm": 1.4194576297864852, + "learning_rate": 1.764924058098988e-07, + "loss": 0.5377, + "step": 7395 + }, + { + "epoch": 0.92, + "grad_norm": 1.466166965935639, + "learning_rate": 1.7596354453870257e-07, + "loss": 0.5266, + "step": 7396 + }, + { + "epoch": 0.92, + "grad_norm": 1.4755667902189378, + "learning_rate": 1.7543546263308176e-07, + "loss": 0.4729, + "step": 7397 + }, + { + "epoch": 0.92, + "grad_norm": 2.2663679978132127, + "learning_rate": 1.74908160178352e-07, + "loss": 0.5259, + "step": 7398 + }, + { + "epoch": 0.92, + "grad_norm": 1.3924679571475973, + "learning_rate": 1.7438163725970624e-07, + "loss": 0.5001, + "step": 7399 + }, + { + "epoch": 0.92, + "grad_norm": 1.8073122795850416, + "learning_rate": 1.7385589396220592e-07, + "loss": 0.4704, + "step": 7400 + }, + { + "epoch": 0.92, + "grad_norm": 1.3579610533590791, + "learning_rate": 1.7333093037079197e-07, + "loss": 0.4827, + "step": 7401 + }, + { + "epoch": 0.92, + "grad_norm": 1.4829654561982808, + "learning_rate": 1.7280674657027486e-07, + "loss": 0.4699, + "step": 7402 + }, + { + "epoch": 0.92, + "grad_norm": 1.4002345695803489, + "learning_rate": 1.7228334264534242e-07, + "loss": 0.4516, + "step": 7403 + }, + { + "epoch": 0.92, + "grad_norm": 1.440620235561422, + "learning_rate": 1.7176071868055421e-07, + "loss": 0.4559, + "step": 7404 + }, + { + "epoch": 0.92, + "grad_norm": 2.1864763417305415, + "learning_rate": 1.7123887476034607e-07, + "loss": 0.5007, + "step": 7405 + }, + { + "epoch": 0.92, + "grad_norm": 1.5826582958870667, + "learning_rate": 1.7071781096902497e-07, + "loss": 0.4817, + "step": 7406 + }, + { + "epoch": 0.92, + "grad_norm": 1.6371850257202158, + "learning_rate": 1.701975273907741e-07, + "loss": 0.5102, + "step": 7407 + }, + { + "epoch": 0.92, + "grad_norm": 0.6444450230778677, + "learning_rate": 1.6967802410965016e-07, + "loss": 0.4651, + "step": 7408 + }, + { + "epoch": 0.92, + "grad_norm": 1.6004728101518253, + "learning_rate": 1.6915930120958268e-07, + "loss": 0.4958, + "step": 7409 + }, + { + "epoch": 0.92, + "grad_norm": 1.7822773798269116, + "learning_rate": 1.6864135877437683e-07, + "loss": 0.4633, + "step": 7410 + }, + { + "epoch": 0.92, + "grad_norm": 1.435767134768869, + "learning_rate": 1.6812419688770953e-07, + "loss": 0.467, + "step": 7411 + }, + { + "epoch": 0.92, + "grad_norm": 1.372733854208186, + "learning_rate": 1.6760781563313399e-07, + "loss": 0.4703, + "step": 7412 + }, + { + "epoch": 0.92, + "grad_norm": 1.526168994526638, + "learning_rate": 1.670922150940757e-07, + "loss": 0.4873, + "step": 7413 + }, + { + "epoch": 0.92, + "grad_norm": 1.426658070398611, + "learning_rate": 1.665773953538341e-07, + "loss": 0.5046, + "step": 7414 + }, + { + "epoch": 0.92, + "grad_norm": 1.7451714554121183, + "learning_rate": 1.6606335649558436e-07, + "loss": 0.4596, + "step": 7415 + }, + { + "epoch": 0.92, + "grad_norm": 1.4002667303381664, + "learning_rate": 1.655500986023717e-07, + "loss": 0.4861, + "step": 7416 + }, + { + "epoch": 0.92, + "grad_norm": 2.103839952829796, + "learning_rate": 1.650376217571198e-07, + "loss": 0.5538, + "step": 7417 + }, + { + "epoch": 0.92, + "grad_norm": 1.8123136742602968, + "learning_rate": 1.6452592604262185e-07, + "loss": 0.4742, + "step": 7418 + }, + { + "epoch": 0.92, + "grad_norm": 1.532392411596531, + "learning_rate": 1.6401501154154786e-07, + "loss": 0.4765, + "step": 7419 + }, + { + "epoch": 0.92, + "grad_norm": 2.232248589830049, + "learning_rate": 1.6350487833644012e-07, + "loss": 0.5166, + "step": 7420 + }, + { + "epoch": 0.92, + "grad_norm": 1.6485974896546554, + "learning_rate": 1.629955265097155e-07, + "loss": 0.4424, + "step": 7421 + }, + { + "epoch": 0.92, + "grad_norm": 1.4977480346655385, + "learning_rate": 1.6248695614366427e-07, + "loss": 0.5384, + "step": 7422 + }, + { + "epoch": 0.92, + "grad_norm": 1.5735385964767112, + "learning_rate": 1.619791673204496e-07, + "loss": 0.5221, + "step": 7423 + }, + { + "epoch": 0.92, + "grad_norm": 1.5112925459666093, + "learning_rate": 1.6147216012211087e-07, + "loss": 0.5002, + "step": 7424 + }, + { + "epoch": 0.92, + "grad_norm": 3.9905644059573038, + "learning_rate": 1.6096593463055754e-07, + "loss": 0.4811, + "step": 7425 + }, + { + "epoch": 0.92, + "grad_norm": 1.881686217710671, + "learning_rate": 1.60460490927577e-07, + "loss": 0.5035, + "step": 7426 + }, + { + "epoch": 0.92, + "grad_norm": 1.314687441900597, + "learning_rate": 1.599558290948261e-07, + "loss": 0.4807, + "step": 7427 + }, + { + "epoch": 0.92, + "grad_norm": 0.6786577910900888, + "learning_rate": 1.594519492138391e-07, + "loss": 0.459, + "step": 7428 + }, + { + "epoch": 0.92, + "grad_norm": 1.398277622031906, + "learning_rate": 1.589488513660209e-07, + "loss": 0.5031, + "step": 7429 + }, + { + "epoch": 0.92, + "grad_norm": 1.373663322700556, + "learning_rate": 1.58446535632652e-07, + "loss": 0.4752, + "step": 7430 + }, + { + "epoch": 0.92, + "grad_norm": 1.5268937260525122, + "learning_rate": 1.579450020948864e-07, + "loss": 0.5015, + "step": 7431 + }, + { + "epoch": 0.92, + "grad_norm": 2.7816541926644356, + "learning_rate": 1.574442508337498e-07, + "loss": 0.4621, + "step": 7432 + }, + { + "epoch": 0.92, + "grad_norm": 1.3089321631636746, + "learning_rate": 1.5694428193014477e-07, + "loss": 0.4985, + "step": 7433 + }, + { + "epoch": 0.92, + "grad_norm": 1.4269437872264588, + "learning_rate": 1.5644509546484387e-07, + "loss": 0.5111, + "step": 7434 + }, + { + "epoch": 0.92, + "grad_norm": 0.6211926111031364, + "learning_rate": 1.5594669151849706e-07, + "loss": 0.52, + "step": 7435 + }, + { + "epoch": 0.92, + "grad_norm": 1.9188201433192622, + "learning_rate": 1.5544907017162435e-07, + "loss": 0.5104, + "step": 7436 + }, + { + "epoch": 0.92, + "grad_norm": 1.4296650370385822, + "learning_rate": 1.5495223150462145e-07, + "loss": 0.5223, + "step": 7437 + }, + { + "epoch": 0.92, + "grad_norm": 1.379650631045068, + "learning_rate": 1.5445617559775694e-07, + "loss": 0.5103, + "step": 7438 + }, + { + "epoch": 0.92, + "grad_norm": 2.5477816134993425, + "learning_rate": 1.5396090253117224e-07, + "loss": 0.4758, + "step": 7439 + }, + { + "epoch": 0.92, + "grad_norm": 1.3501455563275457, + "learning_rate": 1.53466412384885e-07, + "loss": 0.4398, + "step": 7440 + }, + { + "epoch": 0.92, + "grad_norm": 1.4149457953921925, + "learning_rate": 1.529727052387825e-07, + "loss": 0.5128, + "step": 7441 + }, + { + "epoch": 0.92, + "grad_norm": 1.3173573220357797, + "learning_rate": 1.5247978117262918e-07, + "loss": 0.4253, + "step": 7442 + }, + { + "epoch": 0.92, + "grad_norm": 1.574707130020846, + "learning_rate": 1.5198764026605918e-07, + "loss": 0.5004, + "step": 7443 + }, + { + "epoch": 0.92, + "grad_norm": 1.4846028133844213, + "learning_rate": 1.514962825985844e-07, + "loss": 0.5118, + "step": 7444 + }, + { + "epoch": 0.92, + "grad_norm": 2.147310520687052, + "learning_rate": 1.5100570824958638e-07, + "loss": 0.498, + "step": 7445 + }, + { + "epoch": 0.92, + "grad_norm": 1.3950785800094783, + "learning_rate": 1.5051591729832282e-07, + "loss": 0.4847, + "step": 7446 + }, + { + "epoch": 0.92, + "grad_norm": 1.367432471953318, + "learning_rate": 1.5002690982392264e-07, + "loss": 0.5167, + "step": 7447 + }, + { + "epoch": 0.92, + "grad_norm": 2.1658022233578893, + "learning_rate": 1.4953868590538989e-07, + "loss": 0.4746, + "step": 7448 + }, + { + "epoch": 0.92, + "grad_norm": 1.7341710818649676, + "learning_rate": 1.49051245621602e-07, + "loss": 0.5009, + "step": 7449 + }, + { + "epoch": 0.92, + "grad_norm": 1.9017135541191659, + "learning_rate": 1.4856458905130823e-07, + "loss": 0.5082, + "step": 7450 + }, + { + "epoch": 0.92, + "grad_norm": 1.4883425706331486, + "learning_rate": 1.480787162731334e-07, + "loss": 0.5104, + "step": 7451 + }, + { + "epoch": 0.92, + "grad_norm": 1.7310640824314047, + "learning_rate": 1.4759362736557313e-07, + "loss": 0.4518, + "step": 7452 + }, + { + "epoch": 0.92, + "grad_norm": 1.6820489174655378, + "learning_rate": 1.4710932240699915e-07, + "loss": 0.4706, + "step": 7453 + }, + { + "epoch": 0.93, + "grad_norm": 5.835461807837606, + "learning_rate": 1.4662580147565386e-07, + "loss": 0.5171, + "step": 7454 + }, + { + "epoch": 0.93, + "grad_norm": 1.5063706588096515, + "learning_rate": 1.4614306464965533e-07, + "loss": 0.4654, + "step": 7455 + }, + { + "epoch": 0.93, + "grad_norm": 0.6698220323564079, + "learning_rate": 1.4566111200699396e-07, + "loss": 0.5236, + "step": 7456 + }, + { + "epoch": 0.93, + "grad_norm": 1.3900997845556018, + "learning_rate": 1.45179943625533e-07, + "loss": 0.5373, + "step": 7457 + }, + { + "epoch": 0.93, + "grad_norm": 1.5143660815460784, + "learning_rate": 1.4469955958300974e-07, + "loss": 0.4765, + "step": 7458 + }, + { + "epoch": 0.93, + "grad_norm": 2.626916081300669, + "learning_rate": 1.442199599570343e-07, + "loss": 0.5145, + "step": 7459 + }, + { + "epoch": 0.93, + "grad_norm": 1.4632897847436472, + "learning_rate": 1.4374114482509028e-07, + "loss": 0.5348, + "step": 7460 + }, + { + "epoch": 0.93, + "grad_norm": 1.4279017685403526, + "learning_rate": 1.432631142645341e-07, + "loss": 0.4946, + "step": 7461 + }, + { + "epoch": 0.93, + "grad_norm": 2.2083577425437353, + "learning_rate": 1.4278586835259622e-07, + "loss": 0.458, + "step": 7462 + }, + { + "epoch": 0.93, + "grad_norm": 1.4784130586185524, + "learning_rate": 1.4230940716637943e-07, + "loss": 0.5431, + "step": 7463 + }, + { + "epoch": 0.93, + "grad_norm": 1.6445800312409866, + "learning_rate": 1.41833730782861e-07, + "loss": 0.539, + "step": 7464 + }, + { + "epoch": 0.93, + "grad_norm": 1.2848076635613273, + "learning_rate": 1.4135883927889006e-07, + "loss": 0.5047, + "step": 7465 + }, + { + "epoch": 0.93, + "grad_norm": 1.4212789763956692, + "learning_rate": 1.408847327311902e-07, + "loss": 0.5072, + "step": 7466 + }, + { + "epoch": 0.93, + "grad_norm": 1.5426409834820622, + "learning_rate": 1.4041141121635737e-07, + "loss": 0.5697, + "step": 7467 + }, + { + "epoch": 0.93, + "grad_norm": 1.507637441455359, + "learning_rate": 1.399388748108599e-07, + "loss": 0.4793, + "step": 7468 + }, + { + "epoch": 0.93, + "grad_norm": 1.4319365540845996, + "learning_rate": 1.394671235910411e-07, + "loss": 0.5337, + "step": 7469 + }, + { + "epoch": 0.93, + "grad_norm": 1.5313856313039929, + "learning_rate": 1.389961576331156e-07, + "loss": 0.492, + "step": 7470 + }, + { + "epoch": 0.93, + "grad_norm": 1.505239142027003, + "learning_rate": 1.3852597701317306e-07, + "loss": 0.4844, + "step": 7471 + }, + { + "epoch": 0.93, + "grad_norm": 2.2618369771812814, + "learning_rate": 1.380565818071744e-07, + "loss": 0.4765, + "step": 7472 + }, + { + "epoch": 0.93, + "grad_norm": 4.497229469316913, + "learning_rate": 1.375879720909562e-07, + "loss": 0.4455, + "step": 7473 + }, + { + "epoch": 0.93, + "grad_norm": 1.5833775904859986, + "learning_rate": 1.3712014794022455e-07, + "loss": 0.4677, + "step": 7474 + }, + { + "epoch": 0.93, + "grad_norm": 1.567828763742299, + "learning_rate": 1.366531094305623e-07, + "loss": 0.4709, + "step": 7475 + }, + { + "epoch": 0.93, + "grad_norm": 1.514977473626188, + "learning_rate": 1.3618685663742248e-07, + "loss": 0.5325, + "step": 7476 + }, + { + "epoch": 0.93, + "grad_norm": 1.5067250409224675, + "learning_rate": 1.3572138963613258e-07, + "loss": 0.5067, + "step": 7477 + }, + { + "epoch": 0.93, + "grad_norm": 2.087781436589823, + "learning_rate": 1.35256708501893e-07, + "loss": 0.4266, + "step": 7478 + }, + { + "epoch": 0.93, + "grad_norm": 3.345971248358072, + "learning_rate": 1.3479281330977646e-07, + "loss": 0.4553, + "step": 7479 + }, + { + "epoch": 0.93, + "grad_norm": 2.553935766243598, + "learning_rate": 1.3432970413472967e-07, + "loss": 0.5117, + "step": 7480 + }, + { + "epoch": 0.93, + "grad_norm": 1.414846608514627, + "learning_rate": 1.3386738105157281e-07, + "loss": 0.5022, + "step": 7481 + }, + { + "epoch": 0.93, + "grad_norm": 1.5417676396175475, + "learning_rate": 1.334058441349978e-07, + "loss": 0.4737, + "step": 7482 + }, + { + "epoch": 0.93, + "grad_norm": 1.5825303379028746, + "learning_rate": 1.3294509345956884e-07, + "loss": 0.4209, + "step": 7483 + }, + { + "epoch": 0.93, + "grad_norm": 1.4591388912703152, + "learning_rate": 1.3248512909972643e-07, + "loss": 0.4908, + "step": 7484 + }, + { + "epoch": 0.93, + "grad_norm": 1.8270115930764523, + "learning_rate": 1.3202595112977945e-07, + "loss": 0.4857, + "step": 7485 + }, + { + "epoch": 0.93, + "grad_norm": 1.430368141396785, + "learning_rate": 1.3156755962391464e-07, + "loss": 0.5005, + "step": 7486 + }, + { + "epoch": 0.93, + "grad_norm": 1.3706232110255132, + "learning_rate": 1.3110995465618725e-07, + "loss": 0.4999, + "step": 7487 + }, + { + "epoch": 0.93, + "grad_norm": 7.2975987019146755, + "learning_rate": 1.3065313630052757e-07, + "loss": 0.5456, + "step": 7488 + }, + { + "epoch": 0.93, + "grad_norm": 1.532224608280657, + "learning_rate": 1.3019710463073987e-07, + "loss": 0.4886, + "step": 7489 + }, + { + "epoch": 0.93, + "grad_norm": 1.5089578742075092, + "learning_rate": 1.2974185972049858e-07, + "loss": 0.4434, + "step": 7490 + }, + { + "epoch": 0.93, + "grad_norm": 1.7338452161486364, + "learning_rate": 1.2928740164335375e-07, + "loss": 0.5407, + "step": 7491 + }, + { + "epoch": 0.93, + "grad_norm": 1.8783834907040973, + "learning_rate": 1.2883373047272663e-07, + "loss": 0.49, + "step": 7492 + }, + { + "epoch": 0.93, + "grad_norm": 1.6643538611499085, + "learning_rate": 1.2838084628191195e-07, + "loss": 0.4932, + "step": 7493 + }, + { + "epoch": 0.93, + "grad_norm": 2.998442612275223, + "learning_rate": 1.2792874914407617e-07, + "loss": 0.466, + "step": 7494 + }, + { + "epoch": 0.93, + "grad_norm": 1.3993279711964997, + "learning_rate": 1.2747743913226086e-07, + "loss": 0.481, + "step": 7495 + }, + { + "epoch": 0.93, + "grad_norm": 1.4127255593346921, + "learning_rate": 1.270269163193788e-07, + "loss": 0.4884, + "step": 7496 + }, + { + "epoch": 0.93, + "grad_norm": 2.1344092010920592, + "learning_rate": 1.2657718077821512e-07, + "loss": 0.5471, + "step": 7497 + }, + { + "epoch": 0.93, + "grad_norm": 1.6925433386356652, + "learning_rate": 1.2612823258142949e-07, + "loss": 0.4819, + "step": 7498 + }, + { + "epoch": 0.93, + "grad_norm": 1.444345067251979, + "learning_rate": 1.2568007180155328e-07, + "loss": 0.4577, + "step": 7499 + }, + { + "epoch": 0.93, + "grad_norm": 1.8671287144526687, + "learning_rate": 1.2523269851099085e-07, + "loss": 0.51, + "step": 7500 + }, + { + "epoch": 0.93, + "grad_norm": 2.5835192422472604, + "learning_rate": 1.2478611278201824e-07, + "loss": 0.4785, + "step": 7501 + }, + { + "epoch": 0.93, + "grad_norm": 0.6868950034259951, + "learning_rate": 1.243403146867872e-07, + "loss": 0.4806, + "step": 7502 + }, + { + "epoch": 0.93, + "grad_norm": 1.5403639562764544, + "learning_rate": 1.2389530429731844e-07, + "loss": 0.515, + "step": 7503 + }, + { + "epoch": 0.93, + "grad_norm": 1.434244700079374, + "learning_rate": 1.2345108168550836e-07, + "loss": 0.4656, + "step": 7504 + }, + { + "epoch": 0.93, + "grad_norm": 1.662107659471043, + "learning_rate": 1.2300764692312507e-07, + "loss": 0.5934, + "step": 7505 + }, + { + "epoch": 0.93, + "grad_norm": 1.3295395274177795, + "learning_rate": 1.225650000818085e-07, + "loss": 0.4394, + "step": 7506 + }, + { + "epoch": 0.93, + "grad_norm": 1.6382934586730635, + "learning_rate": 1.2212314123307368e-07, + "loss": 0.5056, + "step": 7507 + }, + { + "epoch": 0.93, + "grad_norm": 1.2936263880044172, + "learning_rate": 1.2168207044830572e-07, + "loss": 0.4899, + "step": 7508 + }, + { + "epoch": 0.93, + "grad_norm": 3.0376476751674737, + "learning_rate": 1.2124178779876373e-07, + "loss": 0.501, + "step": 7509 + }, + { + "epoch": 0.93, + "grad_norm": 0.6932854553594543, + "learning_rate": 1.2080229335557858e-07, + "loss": 0.5038, + "step": 7510 + }, + { + "epoch": 0.93, + "grad_norm": 1.3352372815688656, + "learning_rate": 1.2036358718975572e-07, + "loss": 0.4864, + "step": 7511 + }, + { + "epoch": 0.93, + "grad_norm": 1.5051657873763646, + "learning_rate": 1.1992566937217066e-07, + "loss": 0.4696, + "step": 7512 + }, + { + "epoch": 0.93, + "grad_norm": 1.3730238116729956, + "learning_rate": 1.1948853997357402e-07, + "loss": 0.5046, + "step": 7513 + }, + { + "epoch": 0.93, + "grad_norm": 1.8058338907155638, + "learning_rate": 1.1905219906458765e-07, + "loss": 0.4973, + "step": 7514 + }, + { + "epoch": 0.93, + "grad_norm": 1.5547767592050927, + "learning_rate": 1.1861664671570517e-07, + "loss": 0.4965, + "step": 7515 + }, + { + "epoch": 0.93, + "grad_norm": 1.6235911597108474, + "learning_rate": 1.1818188299729583e-07, + "loss": 0.4996, + "step": 7516 + }, + { + "epoch": 0.93, + "grad_norm": 1.452063347559451, + "learning_rate": 1.177479079795979e-07, + "loss": 0.4445, + "step": 7517 + }, + { + "epoch": 0.93, + "grad_norm": 1.7283130334264376, + "learning_rate": 1.1731472173272529e-07, + "loss": 0.5076, + "step": 7518 + }, + { + "epoch": 0.93, + "grad_norm": 1.700430020011993, + "learning_rate": 1.1688232432666147e-07, + "loss": 0.5086, + "step": 7519 + }, + { + "epoch": 0.93, + "grad_norm": 2.1730840757222, + "learning_rate": 1.1645071583126499e-07, + "loss": 0.4975, + "step": 7520 + }, + { + "epoch": 0.93, + "grad_norm": 1.403025967928819, + "learning_rate": 1.1601989631626565e-07, + "loss": 0.4639, + "step": 7521 + }, + { + "epoch": 0.93, + "grad_norm": 0.6772365589825002, + "learning_rate": 1.1558986585126608e-07, + "loss": 0.4967, + "step": 7522 + }, + { + "epoch": 0.93, + "grad_norm": 1.4835936516919237, + "learning_rate": 1.1516062450574239e-07, + "loss": 0.4588, + "step": 7523 + }, + { + "epoch": 0.93, + "grad_norm": 0.6819121439101569, + "learning_rate": 1.1473217234904133e-07, + "loss": 0.4629, + "step": 7524 + }, + { + "epoch": 0.93, + "grad_norm": 1.4333015642586124, + "learning_rate": 1.1430450945038363e-07, + "loss": 0.4882, + "step": 7525 + }, + { + "epoch": 0.93, + "grad_norm": 1.8930763894354676, + "learning_rate": 1.1387763587886181e-07, + "loss": 0.493, + "step": 7526 + }, + { + "epoch": 0.93, + "grad_norm": 1.8009081752597993, + "learning_rate": 1.1345155170344124e-07, + "loss": 0.5328, + "step": 7527 + }, + { + "epoch": 0.93, + "grad_norm": 1.3986294432791426, + "learning_rate": 1.1302625699295855e-07, + "loss": 0.4201, + "step": 7528 + }, + { + "epoch": 0.93, + "grad_norm": 1.4873467528400124, + "learning_rate": 1.1260175181612488e-07, + "loss": 0.5264, + "step": 7529 + }, + { + "epoch": 0.93, + "grad_norm": 1.5485362346370584, + "learning_rate": 1.1217803624152312e-07, + "loss": 0.4878, + "step": 7530 + }, + { + "epoch": 0.93, + "grad_norm": 1.542461414453805, + "learning_rate": 1.1175511033760688e-07, + "loss": 0.4484, + "step": 7531 + }, + { + "epoch": 0.93, + "grad_norm": 1.4454517583014335, + "learning_rate": 1.1133297417270539e-07, + "loss": 0.5089, + "step": 7532 + }, + { + "epoch": 0.93, + "grad_norm": 1.6930035810814124, + "learning_rate": 1.1091162781501685e-07, + "loss": 0.5337, + "step": 7533 + }, + { + "epoch": 0.93, + "grad_norm": 1.248049538526929, + "learning_rate": 1.104910713326146e-07, + "loss": 0.4951, + "step": 7534 + }, + { + "epoch": 0.94, + "grad_norm": 1.8676919633357776, + "learning_rate": 1.1007130479344208e-07, + "loss": 0.4885, + "step": 7535 + }, + { + "epoch": 0.94, + "grad_norm": 1.5895304874985916, + "learning_rate": 1.0965232826531725e-07, + "loss": 0.5174, + "step": 7536 + }, + { + "epoch": 0.94, + "grad_norm": 1.7402560764830721, + "learning_rate": 1.0923414181592873e-07, + "loss": 0.498, + "step": 7537 + }, + { + "epoch": 0.94, + "grad_norm": 1.4436156231144772, + "learning_rate": 1.088167455128386e-07, + "loss": 0.4194, + "step": 7538 + }, + { + "epoch": 0.94, + "grad_norm": 1.5020695344067279, + "learning_rate": 1.0840013942348182e-07, + "loss": 0.4603, + "step": 7539 + }, + { + "epoch": 0.94, + "grad_norm": 1.905688093155993, + "learning_rate": 1.0798432361516287e-07, + "loss": 0.4891, + "step": 7540 + }, + { + "epoch": 0.94, + "grad_norm": 1.4016492508769964, + "learning_rate": 1.075692981550619e-07, + "loss": 0.4704, + "step": 7541 + }, + { + "epoch": 0.94, + "grad_norm": 2.3829597401522613, + "learning_rate": 1.0715506311022972e-07, + "loss": 0.486, + "step": 7542 + }, + { + "epoch": 0.94, + "grad_norm": 1.5601454546027733, + "learning_rate": 1.0674161854758947e-07, + "loss": 0.5033, + "step": 7543 + }, + { + "epoch": 0.94, + "grad_norm": 1.342026448883406, + "learning_rate": 1.0632896453393605e-07, + "loss": 0.481, + "step": 7544 + }, + { + "epoch": 0.94, + "grad_norm": 2.4630019635397487, + "learning_rate": 1.0591710113593834e-07, + "loss": 0.4689, + "step": 7545 + }, + { + "epoch": 0.94, + "grad_norm": 0.6749586920383936, + "learning_rate": 1.0550602842013647e-07, + "loss": 0.4868, + "step": 7546 + }, + { + "epoch": 0.94, + "grad_norm": 1.4416026970217426, + "learning_rate": 1.0509574645294173e-07, + "loss": 0.5184, + "step": 7547 + }, + { + "epoch": 0.94, + "grad_norm": 1.9095201673801285, + "learning_rate": 1.0468625530064058e-07, + "loss": 0.5482, + "step": 7548 + }, + { + "epoch": 0.94, + "grad_norm": 1.4417897196789546, + "learning_rate": 1.042775550293884e-07, + "loss": 0.5059, + "step": 7549 + }, + { + "epoch": 0.94, + "grad_norm": 2.012208951779018, + "learning_rate": 1.0386964570521574e-07, + "loss": 0.5351, + "step": 7550 + }, + { + "epoch": 0.94, + "grad_norm": 1.37239360342099, + "learning_rate": 1.0346252739402207e-07, + "loss": 0.4915, + "step": 7551 + }, + { + "epoch": 0.94, + "grad_norm": 2.1832857772124155, + "learning_rate": 1.0305620016158258e-07, + "loss": 0.469, + "step": 7552 + }, + { + "epoch": 0.94, + "grad_norm": 1.4342619421717173, + "learning_rate": 1.0265066407354196e-07, + "loss": 0.4912, + "step": 7553 + }, + { + "epoch": 0.94, + "grad_norm": 1.9992131575406364, + "learning_rate": 1.0224591919541837e-07, + "loss": 0.5002, + "step": 7554 + }, + { + "epoch": 0.94, + "grad_norm": 1.3660580834026919, + "learning_rate": 1.0184196559260229e-07, + "loss": 0.4791, + "step": 7555 + }, + { + "epoch": 0.94, + "grad_norm": 1.567805960522422, + "learning_rate": 1.0143880333035594e-07, + "loss": 0.4781, + "step": 7556 + }, + { + "epoch": 0.94, + "grad_norm": 1.2868250415977684, + "learning_rate": 1.010364324738139e-07, + "loss": 0.456, + "step": 7557 + }, + { + "epoch": 0.94, + "grad_norm": 1.4103965262983116, + "learning_rate": 1.0063485308798193e-07, + "loss": 0.4559, + "step": 7558 + }, + { + "epoch": 0.94, + "grad_norm": 1.629549103806848, + "learning_rate": 1.0023406523773981e-07, + "loss": 0.4718, + "step": 7559 + }, + { + "epoch": 0.94, + "grad_norm": 1.5189480072759118, + "learning_rate": 9.983406898783688e-08, + "loss": 0.4718, + "step": 7560 + }, + { + "epoch": 0.94, + "grad_norm": 1.394972304534396, + "learning_rate": 9.943486440289751e-08, + "loss": 0.4669, + "step": 7561 + }, + { + "epoch": 0.94, + "grad_norm": 1.5152927943746088, + "learning_rate": 9.90364515474157e-08, + "loss": 0.4931, + "step": 7562 + }, + { + "epoch": 0.94, + "grad_norm": 1.4183916894876742, + "learning_rate": 9.86388304857594e-08, + "loss": 0.4588, + "step": 7563 + }, + { + "epoch": 0.94, + "grad_norm": 1.6619140841376372, + "learning_rate": 9.824200128216665e-08, + "loss": 0.4911, + "step": 7564 + }, + { + "epoch": 0.94, + "grad_norm": 1.4373377755286527, + "learning_rate": 9.784596400075063e-08, + "loss": 0.4961, + "step": 7565 + }, + { + "epoch": 0.94, + "grad_norm": 1.3645511041619751, + "learning_rate": 9.745071870549295e-08, + "loss": 0.489, + "step": 7566 + }, + { + "epoch": 0.94, + "grad_norm": 1.3053146652099252, + "learning_rate": 9.705626546024916e-08, + "loss": 0.4641, + "step": 7567 + }, + { + "epoch": 0.94, + "grad_norm": 1.6799087105795374, + "learning_rate": 9.666260432874719e-08, + "loss": 0.4837, + "step": 7568 + }, + { + "epoch": 0.94, + "grad_norm": 1.3840190159719987, + "learning_rate": 9.626973537458561e-08, + "loss": 0.4886, + "step": 7569 + }, + { + "epoch": 0.94, + "grad_norm": 1.7803774263684327, + "learning_rate": 9.5877658661237e-08, + "loss": 0.5038, + "step": 7570 + }, + { + "epoch": 0.94, + "grad_norm": 1.5466790610753376, + "learning_rate": 9.548637425204399e-08, + "loss": 0.4751, + "step": 7571 + }, + { + "epoch": 0.94, + "grad_norm": 1.207045566540985, + "learning_rate": 9.50958822102227e-08, + "loss": 0.5139, + "step": 7572 + }, + { + "epoch": 0.94, + "grad_norm": 2.0388938387697997, + "learning_rate": 9.470618259885989e-08, + "loss": 0.5066, + "step": 7573 + }, + { + "epoch": 0.94, + "grad_norm": 0.6693846347721168, + "learning_rate": 9.431727548091574e-08, + "loss": 0.494, + "step": 7574 + }, + { + "epoch": 0.94, + "grad_norm": 1.5269826421316284, + "learning_rate": 9.392916091922111e-08, + "loss": 0.5288, + "step": 7575 + }, + { + "epoch": 0.94, + "grad_norm": 2.011778871310886, + "learning_rate": 9.354183897647917e-08, + "loss": 0.4874, + "step": 7576 + }, + { + "epoch": 0.94, + "grad_norm": 1.5042801955481406, + "learning_rate": 9.315530971526543e-08, + "loss": 0.5289, + "step": 7577 + }, + { + "epoch": 0.94, + "grad_norm": 0.6520762442159772, + "learning_rate": 9.276957319802604e-08, + "loss": 0.4724, + "step": 7578 + }, + { + "epoch": 0.94, + "grad_norm": 1.7630841557824317, + "learning_rate": 9.238462948708227e-08, + "loss": 0.5152, + "step": 7579 + }, + { + "epoch": 0.94, + "grad_norm": 1.3418160264322265, + "learning_rate": 9.20004786446238e-08, + "loss": 0.4603, + "step": 7580 + }, + { + "epoch": 0.94, + "grad_norm": 1.6097567059166966, + "learning_rate": 9.161712073271379e-08, + "loss": 0.4397, + "step": 7581 + }, + { + "epoch": 0.94, + "grad_norm": 1.4953389541248439, + "learning_rate": 9.123455581328711e-08, + "loss": 0.5223, + "step": 7582 + }, + { + "epoch": 0.94, + "grad_norm": 1.2689736383401649, + "learning_rate": 9.085278394815045e-08, + "loss": 0.441, + "step": 7583 + }, + { + "epoch": 0.94, + "grad_norm": 1.752001103494819, + "learning_rate": 9.04718051989828e-08, + "loss": 0.4691, + "step": 7584 + }, + { + "epoch": 0.94, + "grad_norm": 17.642591161965804, + "learning_rate": 9.009161962733327e-08, + "loss": 0.4844, + "step": 7585 + }, + { + "epoch": 0.94, + "grad_norm": 1.482388634847006, + "learning_rate": 8.971222729462603e-08, + "loss": 0.4791, + "step": 7586 + }, + { + "epoch": 0.94, + "grad_norm": 1.3991761059629115, + "learning_rate": 8.933362826215374e-08, + "loss": 0.4641, + "step": 7587 + }, + { + "epoch": 0.94, + "grad_norm": 1.344337928641865, + "learning_rate": 8.895582259108415e-08, + "loss": 0.4699, + "step": 7588 + }, + { + "epoch": 0.94, + "grad_norm": 1.6337283293116154, + "learning_rate": 8.857881034245285e-08, + "loss": 0.5037, + "step": 7589 + }, + { + "epoch": 0.94, + "grad_norm": 1.5404316087887595, + "learning_rate": 8.82025915771717e-08, + "loss": 0.5291, + "step": 7590 + }, + { + "epoch": 0.94, + "grad_norm": 1.4641869888308263, + "learning_rate": 8.78271663560204e-08, + "loss": 0.4885, + "step": 7591 + }, + { + "epoch": 0.94, + "grad_norm": 1.9766064882640204, + "learning_rate": 8.745253473965376e-08, + "loss": 0.5273, + "step": 7592 + }, + { + "epoch": 0.94, + "grad_norm": 1.3515860790010006, + "learning_rate": 8.707869678859504e-08, + "loss": 0.4739, + "step": 7593 + }, + { + "epoch": 0.94, + "grad_norm": 1.7362237175457764, + "learning_rate": 8.67056525632426e-08, + "loss": 0.5172, + "step": 7594 + }, + { + "epoch": 0.94, + "grad_norm": 1.3652520626139257, + "learning_rate": 8.633340212386431e-08, + "loss": 0.4888, + "step": 7595 + }, + { + "epoch": 0.94, + "grad_norm": 2.562635416592457, + "learning_rate": 8.596194553060044e-08, + "loss": 0.4362, + "step": 7596 + }, + { + "epoch": 0.94, + "grad_norm": 0.6489051034073675, + "learning_rate": 8.559128284346352e-08, + "loss": 0.4773, + "step": 7597 + }, + { + "epoch": 0.94, + "grad_norm": 1.3462427241427373, + "learning_rate": 8.522141412233676e-08, + "loss": 0.4857, + "step": 7598 + }, + { + "epoch": 0.94, + "grad_norm": 1.4627023788707003, + "learning_rate": 8.485233942697624e-08, + "loss": 0.4767, + "step": 7599 + }, + { + "epoch": 0.94, + "grad_norm": 2.6271714374623194, + "learning_rate": 8.448405881700927e-08, + "loss": 0.4635, + "step": 7600 + }, + { + "epoch": 0.94, + "grad_norm": 1.2335539334688235, + "learning_rate": 8.411657235193438e-08, + "loss": 0.4921, + "step": 7601 + }, + { + "epoch": 0.94, + "grad_norm": 1.406101675022231, + "learning_rate": 8.37498800911224e-08, + "loss": 0.4775, + "step": 7602 + }, + { + "epoch": 0.94, + "grad_norm": 1.4206446049445998, + "learning_rate": 8.338398209381537e-08, + "loss": 0.5124, + "step": 7603 + }, + { + "epoch": 0.94, + "grad_norm": 1.4878336030354353, + "learning_rate": 8.301887841912881e-08, + "loss": 0.4836, + "step": 7604 + }, + { + "epoch": 0.94, + "grad_norm": 1.389099926530956, + "learning_rate": 8.265456912604608e-08, + "loss": 0.535, + "step": 7605 + }, + { + "epoch": 0.94, + "grad_norm": 0.6773820458682531, + "learning_rate": 8.229105427342676e-08, + "loss": 0.4777, + "step": 7606 + }, + { + "epoch": 0.94, + "grad_norm": 1.663011926443581, + "learning_rate": 8.192833391999833e-08, + "loss": 0.497, + "step": 7607 + }, + { + "epoch": 0.94, + "grad_norm": 1.4798884001272063, + "learning_rate": 8.156640812436278e-08, + "loss": 0.5235, + "step": 7608 + }, + { + "epoch": 0.94, + "grad_norm": 4.610926579403922, + "learning_rate": 8.120527694499114e-08, + "loss": 0.4755, + "step": 7609 + }, + { + "epoch": 0.94, + "grad_norm": 2.4878269166594444, + "learning_rate": 8.08449404402284e-08, + "loss": 0.4835, + "step": 7610 + }, + { + "epoch": 0.94, + "grad_norm": 1.5222386394029344, + "learning_rate": 8.048539866828909e-08, + "loss": 0.5179, + "step": 7611 + }, + { + "epoch": 0.94, + "grad_norm": 0.6431193155061984, + "learning_rate": 8.012665168726063e-08, + "loss": 0.4471, + "step": 7612 + }, + { + "epoch": 0.94, + "grad_norm": 1.4935441485253522, + "learning_rate": 7.976869955510225e-08, + "loss": 0.5175, + "step": 7613 + }, + { + "epoch": 0.94, + "grad_norm": 1.3801497191756107, + "learning_rate": 7.941154232964376e-08, + "loss": 0.4915, + "step": 7614 + }, + { + "epoch": 0.95, + "grad_norm": 0.6613682876564198, + "learning_rate": 7.905518006858792e-08, + "loss": 0.5026, + "step": 7615 + }, + { + "epoch": 0.95, + "grad_norm": 1.811683475249906, + "learning_rate": 7.8699612829507e-08, + "loss": 0.4692, + "step": 7616 + }, + { + "epoch": 0.95, + "grad_norm": 1.3027553272100894, + "learning_rate": 7.834484066984727e-08, + "loss": 0.5004, + "step": 7617 + }, + { + "epoch": 0.95, + "grad_norm": 3.9372547346821705, + "learning_rate": 7.799086364692343e-08, + "loss": 0.4583, + "step": 7618 + }, + { + "epoch": 0.95, + "grad_norm": 1.4098518044530997, + "learning_rate": 7.76376818179253e-08, + "loss": 0.4473, + "step": 7619 + }, + { + "epoch": 0.95, + "grad_norm": 1.3656186635555496, + "learning_rate": 7.728529523991224e-08, + "loss": 0.4934, + "step": 7620 + }, + { + "epoch": 0.95, + "grad_norm": 1.2652488484325881, + "learning_rate": 7.693370396981481e-08, + "loss": 0.4483, + "step": 7621 + }, + { + "epoch": 0.95, + "grad_norm": 1.5197178165179743, + "learning_rate": 7.658290806443647e-08, + "loss": 0.4688, + "step": 7622 + }, + { + "epoch": 0.95, + "grad_norm": 1.7503132976742892, + "learning_rate": 7.623290758045021e-08, + "loss": 0.5441, + "step": 7623 + }, + { + "epoch": 0.95, + "grad_norm": 1.4192647679144745, + "learning_rate": 7.588370257440303e-08, + "loss": 0.4673, + "step": 7624 + }, + { + "epoch": 0.95, + "grad_norm": 1.931190050772453, + "learning_rate": 7.553529310271147e-08, + "loss": 0.4878, + "step": 7625 + }, + { + "epoch": 0.95, + "grad_norm": 1.440480842373573, + "learning_rate": 7.518767922166381e-08, + "loss": 0.5096, + "step": 7626 + }, + { + "epoch": 0.95, + "grad_norm": 1.4898492224534166, + "learning_rate": 7.484086098742016e-08, + "loss": 0.4761, + "step": 7627 + }, + { + "epoch": 0.95, + "grad_norm": 1.6981654852220975, + "learning_rate": 7.449483845601291e-08, + "loss": 0.5055, + "step": 7628 + }, + { + "epoch": 0.95, + "grad_norm": 1.2724001278107187, + "learning_rate": 7.4149611683344e-08, + "loss": 0.4587, + "step": 7629 + }, + { + "epoch": 0.95, + "grad_norm": 1.5613833744236691, + "learning_rate": 7.380518072518883e-08, + "loss": 0.4533, + "step": 7630 + }, + { + "epoch": 0.95, + "grad_norm": 1.6580808902507445, + "learning_rate": 7.346154563719232e-08, + "loss": 0.4535, + "step": 7631 + }, + { + "epoch": 0.95, + "grad_norm": 0.6613063750238234, + "learning_rate": 7.311870647487229e-08, + "loss": 0.4727, + "step": 7632 + }, + { + "epoch": 0.95, + "grad_norm": 3.014698388336979, + "learning_rate": 7.277666329361776e-08, + "loss": 0.5082, + "step": 7633 + }, + { + "epoch": 0.95, + "grad_norm": 1.2342182090689713, + "learning_rate": 7.243541614868787e-08, + "loss": 0.4175, + "step": 7634 + }, + { + "epoch": 0.95, + "grad_norm": 1.8184782840077198, + "learning_rate": 7.209496509521519e-08, + "loss": 0.5313, + "step": 7635 + }, + { + "epoch": 0.95, + "grad_norm": 1.4133824416213492, + "learning_rate": 7.175531018820125e-08, + "loss": 0.5306, + "step": 7636 + }, + { + "epoch": 0.95, + "grad_norm": 1.6655108699828332, + "learning_rate": 7.141645148252107e-08, + "loss": 0.4792, + "step": 7637 + }, + { + "epoch": 0.95, + "grad_norm": 1.8164807101058542, + "learning_rate": 7.107838903292085e-08, + "loss": 0.4482, + "step": 7638 + }, + { + "epoch": 0.95, + "grad_norm": 1.1941787148976293, + "learning_rate": 7.074112289401636e-08, + "loss": 0.4537, + "step": 7639 + }, + { + "epoch": 0.95, + "grad_norm": 1.378549116427674, + "learning_rate": 7.040465312029731e-08, + "loss": 0.4766, + "step": 7640 + }, + { + "epoch": 0.95, + "grad_norm": 1.1018978404214064, + "learning_rate": 7.006897976612192e-08, + "loss": 0.4628, + "step": 7641 + }, + { + "epoch": 0.95, + "grad_norm": 1.44923619105166, + "learning_rate": 6.973410288572179e-08, + "loss": 0.4854, + "step": 7642 + }, + { + "epoch": 0.95, + "grad_norm": 2.9691991774920328, + "learning_rate": 6.940002253319978e-08, + "loss": 0.5281, + "step": 7643 + }, + { + "epoch": 0.95, + "grad_norm": 1.537616321063936, + "learning_rate": 6.906673876252822e-08, + "loss": 0.5314, + "step": 7644 + }, + { + "epoch": 0.95, + "grad_norm": 1.7123009207082152, + "learning_rate": 6.873425162755354e-08, + "loss": 0.5079, + "step": 7645 + }, + { + "epoch": 0.95, + "grad_norm": 2.118658183151667, + "learning_rate": 6.840256118199051e-08, + "loss": 0.4906, + "step": 7646 + }, + { + "epoch": 0.95, + "grad_norm": 1.6863783866001274, + "learning_rate": 6.807166747942795e-08, + "loss": 0.4469, + "step": 7647 + }, + { + "epoch": 0.95, + "grad_norm": 1.524207423173094, + "learning_rate": 6.774157057332365e-08, + "loss": 0.488, + "step": 7648 + }, + { + "epoch": 0.95, + "grad_norm": 1.3968831983784278, + "learning_rate": 6.741227051700827e-08, + "loss": 0.4683, + "step": 7649 + }, + { + "epoch": 0.95, + "grad_norm": 0.632796678065021, + "learning_rate": 6.70837673636826e-08, + "loss": 0.4381, + "step": 7650 + }, + { + "epoch": 0.95, + "grad_norm": 1.2330369250788322, + "learning_rate": 6.675606116642031e-08, + "loss": 0.4289, + "step": 7651 + }, + { + "epoch": 0.95, + "grad_norm": 1.5967975832790167, + "learning_rate": 6.642915197816347e-08, + "loss": 0.4778, + "step": 7652 + }, + { + "epoch": 0.95, + "grad_norm": 1.3934013360644446, + "learning_rate": 6.610303985172873e-08, + "loss": 0.4686, + "step": 7653 + }, + { + "epoch": 0.95, + "grad_norm": 1.1985070265059699, + "learning_rate": 6.577772483980228e-08, + "loss": 0.4303, + "step": 7654 + }, + { + "epoch": 0.95, + "grad_norm": 1.4211373045289915, + "learning_rate": 6.54532069949404e-08, + "loss": 0.4834, + "step": 7655 + }, + { + "epoch": 0.95, + "grad_norm": 1.6044525557085323, + "learning_rate": 6.512948636957284e-08, + "loss": 0.5273, + "step": 7656 + }, + { + "epoch": 0.95, + "grad_norm": 1.2829303434544845, + "learning_rate": 6.480656301599886e-08, + "loss": 0.4323, + "step": 7657 + }, + { + "epoch": 0.95, + "grad_norm": 1.6705878337304934, + "learning_rate": 6.448443698639062e-08, + "loss": 0.4911, + "step": 7658 + }, + { + "epoch": 0.95, + "grad_norm": 1.310649686550002, + "learning_rate": 6.416310833278872e-08, + "loss": 0.4296, + "step": 7659 + }, + { + "epoch": 0.95, + "grad_norm": 1.4713096666729257, + "learning_rate": 6.384257710710828e-08, + "loss": 0.4754, + "step": 7660 + }, + { + "epoch": 0.95, + "grad_norm": 3.0267556929527535, + "learning_rate": 6.35228433611329e-08, + "loss": 0.5011, + "step": 7661 + }, + { + "epoch": 0.95, + "grad_norm": 0.7116531090517827, + "learning_rate": 6.320390714651958e-08, + "loss": 0.4999, + "step": 7662 + }, + { + "epoch": 0.95, + "grad_norm": 2.0886268269934094, + "learning_rate": 6.288576851479378e-08, + "loss": 0.506, + "step": 7663 + }, + { + "epoch": 0.95, + "grad_norm": 1.2864415516373995, + "learning_rate": 6.256842751735492e-08, + "loss": 0.4921, + "step": 7664 + }, + { + "epoch": 0.95, + "grad_norm": 1.6822313096347454, + "learning_rate": 6.22518842054709e-08, + "loss": 0.471, + "step": 7665 + }, + { + "epoch": 0.95, + "grad_norm": 1.5822354547473338, + "learning_rate": 6.193613863028303e-08, + "loss": 0.5187, + "step": 7666 + }, + { + "epoch": 0.95, + "grad_norm": 1.2247180484116207, + "learning_rate": 6.162119084280271e-08, + "loss": 0.4228, + "step": 7667 + }, + { + "epoch": 0.95, + "grad_norm": 1.932817199646369, + "learning_rate": 6.130704089391259e-08, + "loss": 0.535, + "step": 7668 + }, + { + "epoch": 0.95, + "grad_norm": 1.4551478562969469, + "learning_rate": 6.09936888343654e-08, + "loss": 0.5047, + "step": 7669 + }, + { + "epoch": 0.95, + "grad_norm": 1.5915916781050325, + "learning_rate": 6.068113471478676e-08, + "loss": 0.5171, + "step": 7670 + }, + { + "epoch": 0.95, + "grad_norm": 1.898586512014985, + "learning_rate": 6.036937858567294e-08, + "loss": 0.4807, + "step": 7671 + }, + { + "epoch": 0.95, + "grad_norm": 1.796050658097426, + "learning_rate": 6.00584204973903e-08, + "loss": 0.513, + "step": 7672 + }, + { + "epoch": 0.95, + "grad_norm": 1.5586630964181434, + "learning_rate": 5.974826050017701e-08, + "loss": 0.4912, + "step": 7673 + }, + { + "epoch": 0.95, + "grad_norm": 1.6303241842222034, + "learning_rate": 5.943889864414243e-08, + "loss": 0.508, + "step": 7674 + }, + { + "epoch": 0.95, + "grad_norm": 1.3696136170171038, + "learning_rate": 5.913033497926546e-08, + "loss": 0.4956, + "step": 7675 + }, + { + "epoch": 0.95, + "grad_norm": 1.1958120852784795, + "learning_rate": 5.8822569555399e-08, + "loss": 0.4919, + "step": 7676 + }, + { + "epoch": 0.95, + "grad_norm": 1.3777355475873272, + "learning_rate": 5.8515602422263287e-08, + "loss": 0.495, + "step": 7677 + }, + { + "epoch": 0.95, + "grad_norm": 1.391424721471707, + "learning_rate": 5.820943362945364e-08, + "loss": 0.4778, + "step": 7678 + }, + { + "epoch": 0.95, + "grad_norm": 1.3233746300787803, + "learning_rate": 5.790406322643327e-08, + "loss": 0.4705, + "step": 7679 + }, + { + "epoch": 0.95, + "grad_norm": 1.7454553079953834, + "learning_rate": 5.759949126253772e-08, + "loss": 0.559, + "step": 7680 + }, + { + "epoch": 0.95, + "grad_norm": 1.8206013531681495, + "learning_rate": 5.7295717786972625e-08, + "loss": 0.4776, + "step": 7681 + }, + { + "epoch": 0.95, + "grad_norm": 0.6651423869229104, + "learning_rate": 5.6992742848815955e-08, + "loss": 0.5239, + "step": 7682 + }, + { + "epoch": 0.95, + "grad_norm": 1.5163882063428316, + "learning_rate": 5.669056649701632e-08, + "loss": 0.4811, + "step": 7683 + }, + { + "epoch": 0.95, + "grad_norm": 1.6129136935060087, + "learning_rate": 5.63891887803919e-08, + "loss": 0.5292, + "step": 7684 + }, + { + "epoch": 0.95, + "grad_norm": 1.4545287345230198, + "learning_rate": 5.608860974763319e-08, + "loss": 0.46, + "step": 7685 + }, + { + "epoch": 0.95, + "grad_norm": 1.765077600660373, + "learning_rate": 5.5788829447301906e-08, + "loss": 0.5191, + "step": 7686 + }, + { + "epoch": 0.95, + "grad_norm": 1.4372217535237481, + "learning_rate": 5.548984792783041e-08, + "loss": 0.4949, + "step": 7687 + }, + { + "epoch": 0.95, + "grad_norm": 1.3253862390810809, + "learning_rate": 5.519166523752117e-08, + "loss": 0.5215, + "step": 7688 + }, + { + "epoch": 0.95, + "grad_norm": 1.7964409105865975, + "learning_rate": 5.4894281424548446e-08, + "loss": 0.4757, + "step": 7689 + }, + { + "epoch": 0.95, + "grad_norm": 1.3087364335845009, + "learning_rate": 5.459769653695657e-08, + "loss": 0.4752, + "step": 7690 + }, + { + "epoch": 0.95, + "grad_norm": 1.627844249158136, + "learning_rate": 5.430191062266277e-08, + "loss": 0.4828, + "step": 7691 + }, + { + "epoch": 0.95, + "grad_norm": 1.9735770293227999, + "learning_rate": 5.400692372945271e-08, + "loss": 0.5109, + "step": 7692 + }, + { + "epoch": 0.95, + "grad_norm": 1.4459533226148706, + "learning_rate": 5.371273590498438e-08, + "loss": 0.4588, + "step": 7693 + }, + { + "epoch": 0.95, + "grad_norm": 2.704100185893509, + "learning_rate": 5.341934719678699e-08, + "loss": 0.5431, + "step": 7694 + }, + { + "epoch": 0.95, + "grad_norm": 1.7613191834669115, + "learning_rate": 5.312675765225928e-08, + "loss": 0.4894, + "step": 7695 + }, + { + "epoch": 0.96, + "grad_norm": 4.259121277853792, + "learning_rate": 5.283496731867288e-08, + "loss": 0.5277, + "step": 7696 + }, + { + "epoch": 0.96, + "grad_norm": 4.344515438831487, + "learning_rate": 5.254397624316731e-08, + "loss": 0.4692, + "step": 7697 + }, + { + "epoch": 0.96, + "grad_norm": 1.458748155261817, + "learning_rate": 5.2253784472756084e-08, + "loss": 0.5377, + "step": 7698 + }, + { + "epoch": 0.96, + "grad_norm": 1.480855482106529, + "learning_rate": 5.1964392054321686e-08, + "loss": 0.4587, + "step": 7699 + }, + { + "epoch": 0.96, + "grad_norm": 1.33662859192495, + "learning_rate": 5.167579903461839e-08, + "loss": 0.5017, + "step": 7700 + }, + { + "epoch": 0.96, + "grad_norm": 1.5791003334365201, + "learning_rate": 5.1388005460270565e-08, + "loss": 0.5071, + "step": 7701 + }, + { + "epoch": 0.96, + "grad_norm": 1.4395511057076358, + "learning_rate": 5.110101137777379e-08, + "loss": 0.5164, + "step": 7702 + }, + { + "epoch": 0.96, + "grad_norm": 1.2375084850602827, + "learning_rate": 5.0814816833494876e-08, + "loss": 0.4812, + "step": 7703 + }, + { + "epoch": 0.96, + "grad_norm": 1.3507590154257363, + "learning_rate": 5.052942187367016e-08, + "loss": 0.4899, + "step": 7704 + }, + { + "epoch": 0.96, + "grad_norm": 1.315754043115813, + "learning_rate": 5.024482654440943e-08, + "loss": 0.4567, + "step": 7705 + }, + { + "epoch": 0.96, + "grad_norm": 1.495124890904158, + "learning_rate": 4.99610308916898e-08, + "loss": 0.4682, + "step": 7706 + }, + { + "epoch": 0.96, + "grad_norm": 1.3499923101809124, + "learning_rate": 4.967803496136181e-08, + "loss": 0.4835, + "step": 7707 + }, + { + "epoch": 0.96, + "grad_norm": 1.4715506582456879, + "learning_rate": 4.9395838799146114e-08, + "loss": 0.4866, + "step": 7708 + }, + { + "epoch": 0.96, + "grad_norm": 1.5842712418136582, + "learning_rate": 4.911444245063346e-08, + "loss": 0.5489, + "step": 7709 + }, + { + "epoch": 0.96, + "grad_norm": 1.8163171475763948, + "learning_rate": 4.8833845961286375e-08, + "loss": 0.4716, + "step": 7710 + }, + { + "epoch": 0.96, + "grad_norm": 1.3644745215747869, + "learning_rate": 4.855404937643693e-08, + "loss": 0.4827, + "step": 7711 + }, + { + "epoch": 0.96, + "grad_norm": 1.4313206438200663, + "learning_rate": 4.827505274128952e-08, + "loss": 0.4813, + "step": 7712 + }, + { + "epoch": 0.96, + "grad_norm": 1.315013732174951, + "learning_rate": 4.799685610091809e-08, + "loss": 0.5051, + "step": 7713 + }, + { + "epoch": 0.96, + "grad_norm": 1.4328290676809463, + "learning_rate": 4.771945950026835e-08, + "loss": 0.4569, + "step": 7714 + }, + { + "epoch": 0.96, + "grad_norm": 2.6288798079160522, + "learning_rate": 4.744286298415557e-08, + "loss": 0.4907, + "step": 7715 + }, + { + "epoch": 0.96, + "grad_norm": 1.7069136478926474, + "learning_rate": 4.7167066597266776e-08, + "loss": 0.4865, + "step": 7716 + }, + { + "epoch": 0.96, + "grad_norm": 1.308945493190629, + "learning_rate": 4.689207038415799e-08, + "loss": 0.4076, + "step": 7717 + }, + { + "epoch": 0.96, + "grad_norm": 1.4611682125753729, + "learning_rate": 4.6617874389259246e-08, + "loss": 0.5038, + "step": 7718 + }, + { + "epoch": 0.96, + "grad_norm": 1.724751515685758, + "learning_rate": 4.634447865686842e-08, + "loss": 0.4707, + "step": 7719 + }, + { + "epoch": 0.96, + "grad_norm": 1.3998749280496665, + "learning_rate": 4.6071883231154077e-08, + "loss": 0.4931, + "step": 7720 + }, + { + "epoch": 0.96, + "grad_norm": 1.3858879631903585, + "learning_rate": 4.5800088156158215e-08, + "loss": 0.4985, + "step": 7721 + }, + { + "epoch": 0.96, + "grad_norm": 1.467848420192936, + "learning_rate": 4.5529093475790156e-08, + "loss": 0.5039, + "step": 7722 + }, + { + "epoch": 0.96, + "grad_norm": 1.5766696103475495, + "learning_rate": 4.525889923383264e-08, + "loss": 0.4582, + "step": 7723 + }, + { + "epoch": 0.96, + "grad_norm": 1.330634533354273, + "learning_rate": 4.498950547393743e-08, + "loss": 0.4371, + "step": 7724 + }, + { + "epoch": 0.96, + "grad_norm": 2.841870302597123, + "learning_rate": 4.472091223962749e-08, + "loss": 0.469, + "step": 7725 + }, + { + "epoch": 0.96, + "grad_norm": 2.0793156583564767, + "learning_rate": 4.445311957429588e-08, + "loss": 0.4845, + "step": 7726 + }, + { + "epoch": 0.96, + "grad_norm": 1.3752340510642325, + "learning_rate": 4.418612752120743e-08, + "loss": 0.5401, + "step": 7727 + }, + { + "epoch": 0.96, + "grad_norm": 1.6006130720471534, + "learning_rate": 4.3919936123497654e-08, + "loss": 0.4821, + "step": 7728 + }, + { + "epoch": 0.96, + "grad_norm": 1.3840103206152756, + "learning_rate": 4.365454542417047e-08, + "loss": 0.5033, + "step": 7729 + }, + { + "epoch": 0.96, + "grad_norm": 1.4145112943608362, + "learning_rate": 4.3389955466103804e-08, + "loss": 0.4687, + "step": 7730 + }, + { + "epoch": 0.96, + "grad_norm": 1.4189671569908613, + "learning_rate": 4.312616629204347e-08, + "loss": 0.478, + "step": 7731 + }, + { + "epoch": 0.96, + "grad_norm": 1.503259140881166, + "learning_rate": 4.2863177944607594e-08, + "loss": 0.4842, + "step": 7732 + }, + { + "epoch": 0.96, + "grad_norm": 1.2896560391253253, + "learning_rate": 4.26009904662833e-08, + "loss": 0.4583, + "step": 7733 + }, + { + "epoch": 0.96, + "grad_norm": 1.4580249324439911, + "learning_rate": 4.233960389943004e-08, + "loss": 0.4627, + "step": 7734 + }, + { + "epoch": 0.96, + "grad_norm": 1.3594047511001994, + "learning_rate": 4.2079018286277365e-08, + "loss": 0.4613, + "step": 7735 + }, + { + "epoch": 0.96, + "grad_norm": 3.0801452303061256, + "learning_rate": 4.1819233668924375e-08, + "loss": 0.4604, + "step": 7736 + }, + { + "epoch": 0.96, + "grad_norm": 1.2787454071154456, + "learning_rate": 4.156025008934195e-08, + "loss": 0.4782, + "step": 7737 + }, + { + "epoch": 0.96, + "grad_norm": 0.6044864170581877, + "learning_rate": 4.130206758937105e-08, + "loss": 0.4766, + "step": 7738 + }, + { + "epoch": 0.96, + "grad_norm": 1.5826720176373226, + "learning_rate": 4.1044686210723884e-08, + "loss": 0.4529, + "step": 7739 + }, + { + "epoch": 0.96, + "grad_norm": 2.5454458656320416, + "learning_rate": 4.078810599498162e-08, + "loss": 0.4545, + "step": 7740 + }, + { + "epoch": 0.96, + "grad_norm": 1.8520856327740807, + "learning_rate": 4.053232698359832e-08, + "loss": 0.5048, + "step": 7741 + }, + { + "epoch": 0.96, + "grad_norm": 1.8044521412502397, + "learning_rate": 4.027734921789594e-08, + "loss": 0.5048, + "step": 7742 + }, + { + "epoch": 0.96, + "grad_norm": 1.3926948369663805, + "learning_rate": 4.002317273906986e-08, + "loss": 0.4797, + "step": 7743 + }, + { + "epoch": 0.96, + "grad_norm": 1.5154880391926469, + "learning_rate": 3.976979758818389e-08, + "loss": 0.4929, + "step": 7744 + }, + { + "epoch": 0.96, + "grad_norm": 1.227339551797787, + "learning_rate": 3.9517223806171956e-08, + "loss": 0.4407, + "step": 7745 + }, + { + "epoch": 0.96, + "grad_norm": 1.5725278008193808, + "learning_rate": 3.926545143384142e-08, + "loss": 0.4647, + "step": 7746 + }, + { + "epoch": 0.96, + "grad_norm": 1.8648068300491099, + "learning_rate": 3.901448051186696e-08, + "loss": 0.5158, + "step": 7747 + }, + { + "epoch": 0.96, + "grad_norm": 2.203877062516585, + "learning_rate": 3.876431108079615e-08, + "loss": 0.4707, + "step": 7748 + }, + { + "epoch": 0.96, + "grad_norm": 1.6137307611030258, + "learning_rate": 3.8514943181044984e-08, + "loss": 0.5176, + "step": 7749 + }, + { + "epoch": 0.96, + "grad_norm": 1.3832513856819502, + "learning_rate": 3.826637685290236e-08, + "loss": 0.48, + "step": 7750 + }, + { + "epoch": 0.96, + "grad_norm": 1.7953339027244035, + "learning_rate": 3.8018612136524466e-08, + "loss": 0.5226, + "step": 7751 + }, + { + "epoch": 0.96, + "grad_norm": 1.4591121678175754, + "learning_rate": 3.777164907194209e-08, + "loss": 0.4381, + "step": 7752 + }, + { + "epoch": 0.96, + "grad_norm": 0.6930226256798032, + "learning_rate": 3.752548769905273e-08, + "loss": 0.4754, + "step": 7753 + }, + { + "epoch": 0.96, + "grad_norm": 2.077763466139759, + "learning_rate": 3.728012805762627e-08, + "loss": 0.4937, + "step": 7754 + }, + { + "epoch": 0.96, + "grad_norm": 1.2946149909988884, + "learning_rate": 3.7035570187303195e-08, + "loss": 0.4801, + "step": 7755 + }, + { + "epoch": 0.96, + "grad_norm": 1.9916719865590282, + "learning_rate": 3.6791814127593585e-08, + "loss": 0.5081, + "step": 7756 + }, + { + "epoch": 0.96, + "grad_norm": 1.458255870646246, + "learning_rate": 3.654885991787816e-08, + "loss": 0.4952, + "step": 7757 + }, + { + "epoch": 0.96, + "grad_norm": 1.5908084020469206, + "learning_rate": 3.630670759740884e-08, + "loss": 0.4759, + "step": 7758 + }, + { + "epoch": 0.96, + "grad_norm": 1.8546868327967143, + "learning_rate": 3.6065357205307125e-08, + "loss": 0.4787, + "step": 7759 + }, + { + "epoch": 0.96, + "grad_norm": 1.5645776854853, + "learning_rate": 3.582480878056516e-08, + "loss": 0.5225, + "step": 7760 + }, + { + "epoch": 0.96, + "grad_norm": 1.602317185925536, + "learning_rate": 3.5585062362046284e-08, + "loss": 0.5119, + "step": 7761 + }, + { + "epoch": 0.96, + "grad_norm": 2.407875740303682, + "learning_rate": 3.534611798848286e-08, + "loss": 0.4688, + "step": 7762 + }, + { + "epoch": 0.96, + "grad_norm": 1.5088178343160268, + "learning_rate": 3.510797569847957e-08, + "loss": 0.5119, + "step": 7763 + }, + { + "epoch": 0.96, + "grad_norm": 1.6905489646928964, + "learning_rate": 3.487063553050896e-08, + "loss": 0.4833, + "step": 7764 + }, + { + "epoch": 0.96, + "grad_norm": 1.3799093539466991, + "learning_rate": 3.4634097522916464e-08, + "loss": 0.5413, + "step": 7765 + }, + { + "epoch": 0.96, + "grad_norm": 1.3819950617851147, + "learning_rate": 3.4398361713916526e-08, + "loss": 0.4909, + "step": 7766 + }, + { + "epoch": 0.96, + "grad_norm": 1.3018371723847642, + "learning_rate": 3.416342814159423e-08, + "loss": 0.4656, + "step": 7767 + }, + { + "epoch": 0.96, + "grad_norm": 1.29448541490877, + "learning_rate": 3.392929684390533e-08, + "loss": 0.4505, + "step": 7768 + }, + { + "epoch": 0.96, + "grad_norm": 1.5527951190707798, + "learning_rate": 3.3695967858675685e-08, + "loss": 0.4814, + "step": 7769 + }, + { + "epoch": 0.96, + "grad_norm": 1.9497720926334228, + "learning_rate": 3.346344122360179e-08, + "loss": 0.5077, + "step": 7770 + }, + { + "epoch": 0.96, + "grad_norm": 1.3985591816774912, + "learning_rate": 3.323171697625027e-08, + "loss": 0.4473, + "step": 7771 + }, + { + "epoch": 0.96, + "grad_norm": 1.413311741709775, + "learning_rate": 3.300079515405841e-08, + "loss": 0.5118, + "step": 7772 + }, + { + "epoch": 0.96, + "grad_norm": 1.8534187425790147, + "learning_rate": 3.2770675794334126e-08, + "loss": 0.4712, + "step": 7773 + }, + { + "epoch": 0.96, + "grad_norm": 0.6653584338759195, + "learning_rate": 3.254135893425381e-08, + "loss": 0.4437, + "step": 7774 + }, + { + "epoch": 0.96, + "grad_norm": 1.7163977797998067, + "learning_rate": 3.231284461086615e-08, + "loss": 0.5157, + "step": 7775 + }, + { + "epoch": 0.97, + "grad_norm": 0.6106701632470143, + "learning_rate": 3.208513286109049e-08, + "loss": 0.4645, + "step": 7776 + }, + { + "epoch": 0.97, + "grad_norm": 1.7946249031013422, + "learning_rate": 3.185822372171521e-08, + "loss": 0.5111, + "step": 7777 + }, + { + "epoch": 0.97, + "grad_norm": 1.5144254795001317, + "learning_rate": 3.163211722939874e-08, + "loss": 0.4857, + "step": 7778 + }, + { + "epoch": 0.97, + "grad_norm": 1.4056587916463201, + "learning_rate": 3.140681342067187e-08, + "loss": 0.5091, + "step": 7779 + }, + { + "epoch": 0.97, + "grad_norm": 0.6749938480844588, + "learning_rate": 3.1182312331933254e-08, + "loss": 0.4518, + "step": 7780 + }, + { + "epoch": 0.97, + "grad_norm": 2.024631804677709, + "learning_rate": 3.095861399945388e-08, + "loss": 0.4645, + "step": 7781 + }, + { + "epoch": 0.97, + "grad_norm": 1.5604921011767916, + "learning_rate": 3.073571845937373e-08, + "loss": 0.4301, + "step": 7782 + }, + { + "epoch": 0.97, + "grad_norm": 1.3798606656257772, + "learning_rate": 3.0513625747703446e-08, + "loss": 0.4492, + "step": 7783 + }, + { + "epoch": 0.97, + "grad_norm": 1.8501868863665396, + "learning_rate": 3.029233590032432e-08, + "loss": 0.4584, + "step": 7784 + }, + { + "epoch": 0.97, + "grad_norm": 2.6542302064268632, + "learning_rate": 3.007184895298776e-08, + "loss": 0.4523, + "step": 7785 + }, + { + "epoch": 0.97, + "grad_norm": 3.515767367030657, + "learning_rate": 2.9852164941315266e-08, + "loss": 0.5063, + "step": 7786 + }, + { + "epoch": 0.97, + "grad_norm": 1.492474367639044, + "learning_rate": 2.963328390079845e-08, + "loss": 0.517, + "step": 7787 + }, + { + "epoch": 0.97, + "grad_norm": 1.7311554006788261, + "learning_rate": 2.9415205866800133e-08, + "loss": 0.5292, + "step": 7788 + }, + { + "epoch": 0.97, + "grad_norm": 1.6439229672503788, + "learning_rate": 2.9197930874551584e-08, + "loss": 0.4827, + "step": 7789 + }, + { + "epoch": 0.97, + "grad_norm": 1.3395622287881843, + "learning_rate": 2.898145895915694e-08, + "loss": 0.4924, + "step": 7790 + }, + { + "epoch": 0.97, + "grad_norm": 1.9080209785459703, + "learning_rate": 2.8765790155588223e-08, + "loss": 0.5357, + "step": 7791 + }, + { + "epoch": 0.97, + "grad_norm": 2.2865527623095567, + "learning_rate": 2.8550924498688127e-08, + "loss": 0.4766, + "step": 7792 + }, + { + "epoch": 0.97, + "grad_norm": 0.6958372990417212, + "learning_rate": 2.833686202317165e-08, + "loss": 0.5184, + "step": 7793 + }, + { + "epoch": 0.97, + "grad_norm": 1.5048457616257143, + "learning_rate": 2.812360276362114e-08, + "loss": 0.5324, + "step": 7794 + }, + { + "epoch": 0.97, + "grad_norm": 1.3447727190681364, + "learning_rate": 2.7911146754491247e-08, + "loss": 0.4206, + "step": 7795 + }, + { + "epoch": 0.97, + "grad_norm": 2.355496696654266, + "learning_rate": 2.7699494030105633e-08, + "loss": 0.4686, + "step": 7796 + }, + { + "epoch": 0.97, + "grad_norm": 1.7543877273172854, + "learning_rate": 2.7488644624659165e-08, + "loss": 0.5243, + "step": 7797 + }, + { + "epoch": 0.97, + "grad_norm": 1.3222072531274944, + "learning_rate": 2.7278598572216264e-08, + "loss": 0.4562, + "step": 7798 + }, + { + "epoch": 0.97, + "grad_norm": 1.3822661254883901, + "learning_rate": 2.7069355906711447e-08, + "loss": 0.4683, + "step": 7799 + }, + { + "epoch": 0.97, + "grad_norm": 1.2333982581816603, + "learning_rate": 2.6860916661949344e-08, + "loss": 0.425, + "step": 7800 + }, + { + "epoch": 0.97, + "grad_norm": 2.4989416221823086, + "learning_rate": 2.665328087160579e-08, + "loss": 0.5049, + "step": 7801 + }, + { + "epoch": 0.97, + "grad_norm": 1.1683052174068702, + "learning_rate": 2.6446448569226735e-08, + "loss": 0.4297, + "step": 7802 + }, + { + "epoch": 0.97, + "grad_norm": 14.415304359454165, + "learning_rate": 2.6240419788226557e-08, + "loss": 0.5134, + "step": 7803 + }, + { + "epoch": 0.97, + "grad_norm": 1.4887720785871432, + "learning_rate": 2.603519456189141e-08, + "loss": 0.5064, + "step": 7804 + }, + { + "epoch": 0.97, + "grad_norm": 2.269943717078514, + "learning_rate": 2.5830772923377544e-08, + "loss": 0.5161, + "step": 7805 + }, + { + "epoch": 0.97, + "grad_norm": 1.5539420147292864, + "learning_rate": 2.5627154905710772e-08, + "loss": 0.5005, + "step": 7806 + }, + { + "epoch": 0.97, + "grad_norm": 3.1212492048854474, + "learning_rate": 2.542434054178755e-08, + "loss": 0.5048, + "step": 7807 + }, + { + "epoch": 0.97, + "grad_norm": 1.3866045054200524, + "learning_rate": 2.5222329864373897e-08, + "loss": 0.4657, + "step": 7808 + }, + { + "epoch": 0.97, + "grad_norm": 1.7650098436343118, + "learning_rate": 2.5021122906107588e-08, + "loss": 0.4732, + "step": 7809 + }, + { + "epoch": 0.97, + "grad_norm": 1.4154524445769976, + "learning_rate": 2.4820719699493735e-08, + "loss": 0.4984, + "step": 7810 + }, + { + "epoch": 0.97, + "grad_norm": 1.3844456717672102, + "learning_rate": 2.462112027691088e-08, + "loss": 0.5104, + "step": 7811 + }, + { + "epoch": 0.97, + "grad_norm": 1.729652869759931, + "learning_rate": 2.44223246706049e-08, + "loss": 0.4732, + "step": 7812 + }, + { + "epoch": 0.97, + "grad_norm": 1.4065463489571486, + "learning_rate": 2.4224332912693438e-08, + "loss": 0.497, + "step": 7813 + }, + { + "epoch": 0.97, + "grad_norm": 1.647805148838936, + "learning_rate": 2.4027145035163124e-08, + "loss": 0.492, + "step": 7814 + }, + { + "epoch": 0.97, + "grad_norm": 1.541283830966554, + "learning_rate": 2.3830761069872922e-08, + "loss": 0.5116, + "step": 7815 + }, + { + "epoch": 0.97, + "grad_norm": 1.5878840426569956, + "learning_rate": 2.363518104854856e-08, + "loss": 0.5096, + "step": 7816 + }, + { + "epoch": 0.97, + "grad_norm": 1.367993865190107, + "learning_rate": 2.3440405002789214e-08, + "loss": 0.4724, + "step": 7817 + }, + { + "epoch": 0.97, + "grad_norm": 1.4697283027569419, + "learning_rate": 2.3246432964061928e-08, + "loss": 0.5135, + "step": 7818 + }, + { + "epoch": 0.97, + "grad_norm": 2.5397683114755623, + "learning_rate": 2.3053264963704413e-08, + "loss": 0.5403, + "step": 7819 + }, + { + "epoch": 0.97, + "grad_norm": 2.1536563276716563, + "learning_rate": 2.2860901032925597e-08, + "loss": 0.4726, + "step": 7820 + }, + { + "epoch": 0.97, + "grad_norm": 1.4454911065646863, + "learning_rate": 2.2669341202802285e-08, + "loss": 0.503, + "step": 7821 + }, + { + "epoch": 0.97, + "grad_norm": 1.3626319884830418, + "learning_rate": 2.2478585504284168e-08, + "loss": 0.481, + "step": 7822 + }, + { + "epoch": 0.97, + "grad_norm": 1.322301916185995, + "learning_rate": 2.22886339681877e-08, + "loss": 0.4919, + "step": 7823 + }, + { + "epoch": 0.97, + "grad_norm": 1.2743290032865122, + "learning_rate": 2.209948662520278e-08, + "loss": 0.4753, + "step": 7824 + }, + { + "epoch": 0.97, + "grad_norm": 1.7346529047342756, + "learning_rate": 2.1911143505887188e-08, + "loss": 0.4928, + "step": 7825 + }, + { + "epoch": 0.97, + "grad_norm": 1.4341911057238383, + "learning_rate": 2.1723604640668804e-08, + "loss": 0.4799, + "step": 7826 + }, + { + "epoch": 0.97, + "grad_norm": 1.6480767999457715, + "learning_rate": 2.1536870059847835e-08, + "loss": 0.4848, + "step": 7827 + }, + { + "epoch": 0.97, + "grad_norm": 1.5460470829974238, + "learning_rate": 2.1350939793591264e-08, + "loss": 0.5232, + "step": 7828 + }, + { + "epoch": 0.97, + "grad_norm": 1.5786795687722468, + "learning_rate": 2.116581387193839e-08, + "loss": 0.5959, + "step": 7829 + }, + { + "epoch": 0.97, + "grad_norm": 1.3439945497910952, + "learning_rate": 2.0981492324798068e-08, + "loss": 0.512, + "step": 7830 + }, + { + "epoch": 0.97, + "grad_norm": 22.31225001066693, + "learning_rate": 2.079797518194926e-08, + "loss": 0.4968, + "step": 7831 + }, + { + "epoch": 0.97, + "grad_norm": 2.2889659992198874, + "learning_rate": 2.061526247303991e-08, + "loss": 0.4042, + "step": 7832 + }, + { + "epoch": 0.97, + "grad_norm": 1.571354792462103, + "learning_rate": 2.043335422758974e-08, + "loss": 0.5359, + "step": 7833 + }, + { + "epoch": 0.97, + "grad_norm": 1.4696440018354946, + "learning_rate": 2.0252250474987468e-08, + "loss": 0.5617, + "step": 7834 + }, + { + "epoch": 0.97, + "grad_norm": 1.7384848762491998, + "learning_rate": 2.007195124449135e-08, + "loss": 0.4767, + "step": 7835 + }, + { + "epoch": 0.97, + "grad_norm": 1.5219894565268821, + "learning_rate": 1.9892456565230866e-08, + "loss": 0.4939, + "step": 7836 + }, + { + "epoch": 0.97, + "grad_norm": 1.516993884796863, + "learning_rate": 1.9713766466205043e-08, + "loss": 0.4976, + "step": 7837 + }, + { + "epoch": 0.97, + "grad_norm": 1.6486390321432192, + "learning_rate": 1.9535880976283007e-08, + "loss": 0.5048, + "step": 7838 + }, + { + "epoch": 0.97, + "grad_norm": 1.6886966252760525, + "learning_rate": 1.935880012420288e-08, + "loss": 0.5321, + "step": 7839 + }, + { + "epoch": 0.97, + "grad_norm": 1.881517861414415, + "learning_rate": 1.918252393857456e-08, + "loss": 0.496, + "step": 7840 + }, + { + "epoch": 0.97, + "grad_norm": 2.227334661334372, + "learning_rate": 1.9007052447876373e-08, + "loss": 0.4894, + "step": 7841 + }, + { + "epoch": 0.97, + "grad_norm": 1.5709242214792047, + "learning_rate": 1.8832385680457866e-08, + "loss": 0.5064, + "step": 7842 + }, + { + "epoch": 0.97, + "grad_norm": 0.6058301693578824, + "learning_rate": 1.865852366453702e-08, + "loss": 0.4816, + "step": 7843 + }, + { + "epoch": 0.97, + "grad_norm": 1.4782888331040613, + "learning_rate": 1.8485466428204145e-08, + "loss": 0.4699, + "step": 7844 + }, + { + "epoch": 0.97, + "grad_norm": 1.2843569767793035, + "learning_rate": 1.831321399941688e-08, + "loss": 0.4522, + "step": 7845 + }, + { + "epoch": 0.97, + "grad_norm": 1.820698828941528, + "learning_rate": 1.814176640600518e-08, + "loss": 0.4923, + "step": 7846 + }, + { + "epoch": 0.97, + "grad_norm": 1.3360093042205863, + "learning_rate": 1.797112367566689e-08, + "loss": 0.4754, + "step": 7847 + }, + { + "epoch": 0.97, + "grad_norm": 1.3404701278536049, + "learning_rate": 1.7801285835971627e-08, + "loss": 0.4751, + "step": 7848 + }, + { + "epoch": 0.97, + "grad_norm": 1.645967978646397, + "learning_rate": 1.7632252914357994e-08, + "loss": 0.4745, + "step": 7849 + }, + { + "epoch": 0.97, + "grad_norm": 1.816582910025013, + "learning_rate": 1.746402493813415e-08, + "loss": 0.4759, + "step": 7850 + }, + { + "epoch": 0.97, + "grad_norm": 1.4984141287718775, + "learning_rate": 1.7296601934480018e-08, + "loss": 0.5054, + "step": 7851 + }, + { + "epoch": 0.97, + "grad_norm": 1.2956987510462552, + "learning_rate": 1.7129983930443405e-08, + "loss": 0.5119, + "step": 7852 + }, + { + "epoch": 0.97, + "grad_norm": 1.2386501615892918, + "learning_rate": 1.696417095294278e-08, + "loss": 0.4774, + "step": 7853 + }, + { + "epoch": 0.97, + "grad_norm": 1.624533253640688, + "learning_rate": 1.6799163028767273e-08, + "loss": 0.4785, + "step": 7854 + }, + { + "epoch": 0.97, + "grad_norm": 1.3569411519411598, + "learning_rate": 1.6634960184575e-08, + "loss": 0.4949, + "step": 7855 + }, + { + "epoch": 0.97, + "grad_norm": 3.2807250837591946, + "learning_rate": 1.647156244689474e-08, + "loss": 0.4969, + "step": 7856 + }, + { + "epoch": 0.98, + "grad_norm": 0.7328035968027572, + "learning_rate": 1.6308969842124266e-08, + "loss": 0.4832, + "step": 7857 + }, + { + "epoch": 0.98, + "grad_norm": 1.702462953664373, + "learning_rate": 1.6147182396532013e-08, + "loss": 0.5399, + "step": 7858 + }, + { + "epoch": 0.98, + "grad_norm": 1.5917224380993937, + "learning_rate": 1.5986200136256512e-08, + "loss": 0.4511, + "step": 7859 + }, + { + "epoch": 0.98, + "grad_norm": 1.7284270235794241, + "learning_rate": 1.5826023087305853e-08, + "loss": 0.4766, + "step": 7860 + }, + { + "epoch": 0.98, + "grad_norm": 1.5717015680984, + "learning_rate": 1.5666651275557664e-08, + "loss": 0.5071, + "step": 7861 + }, + { + "epoch": 0.98, + "grad_norm": 3.5548574313851318, + "learning_rate": 1.550808472676024e-08, + "loss": 0.5269, + "step": 7862 + }, + { + "epoch": 0.98, + "grad_norm": 1.540841191632159, + "learning_rate": 1.535032346653087e-08, + "loss": 0.4903, + "step": 7863 + }, + { + "epoch": 0.98, + "grad_norm": 0.7457093299726929, + "learning_rate": 1.5193367520357493e-08, + "loss": 0.4557, + "step": 7864 + }, + { + "epoch": 0.98, + "grad_norm": 1.7486686969238991, + "learning_rate": 1.5037216913598163e-08, + "loss": 0.4794, + "step": 7865 + }, + { + "epoch": 0.98, + "grad_norm": 1.3896525495364775, + "learning_rate": 1.488187167147992e-08, + "loss": 0.4907, + "step": 7866 + }, + { + "epoch": 0.98, + "grad_norm": 2.3235501419649687, + "learning_rate": 1.4727331819100466e-08, + "loss": 0.4744, + "step": 7867 + }, + { + "epoch": 0.98, + "grad_norm": 1.567961452961845, + "learning_rate": 1.4573597381427052e-08, + "loss": 0.4887, + "step": 7868 + }, + { + "epoch": 0.98, + "grad_norm": 1.3819936970432891, + "learning_rate": 1.4420668383296476e-08, + "loss": 0.5158, + "step": 7869 + }, + { + "epoch": 0.98, + "grad_norm": 1.4597986889667054, + "learning_rate": 1.4268544849416199e-08, + "loss": 0.5161, + "step": 7870 + }, + { + "epoch": 0.98, + "grad_norm": 1.7559433201397077, + "learning_rate": 1.411722680436267e-08, + "loss": 0.4797, + "step": 7871 + }, + { + "epoch": 0.98, + "grad_norm": 1.7793292300008385, + "learning_rate": 1.3966714272583004e-08, + "loss": 0.5119, + "step": 7872 + }, + { + "epoch": 0.98, + "grad_norm": 0.6959078402252198, + "learning_rate": 1.3817007278393302e-08, + "loss": 0.4507, + "step": 7873 + }, + { + "epoch": 0.98, + "grad_norm": 1.2366883966054931, + "learning_rate": 1.3668105845980883e-08, + "loss": 0.4014, + "step": 7874 + }, + { + "epoch": 0.98, + "grad_norm": 1.3055286853521844, + "learning_rate": 1.3520009999401506e-08, + "loss": 0.5215, + "step": 7875 + }, + { + "epoch": 0.98, + "grad_norm": 1.5049883730512816, + "learning_rate": 1.3372719762581587e-08, + "loss": 0.4989, + "step": 7876 + }, + { + "epoch": 0.98, + "grad_norm": 1.2385683699552819, + "learning_rate": 1.3226235159317091e-08, + "loss": 0.497, + "step": 7877 + }, + { + "epoch": 0.98, + "grad_norm": 0.6753540862673787, + "learning_rate": 1.3080556213274087e-08, + "loss": 0.4658, + "step": 7878 + }, + { + "epoch": 0.98, + "grad_norm": 1.3777811871545333, + "learning_rate": 1.2935682947987638e-08, + "loss": 0.4743, + "step": 7879 + }, + { + "epoch": 0.98, + "grad_norm": 1.7929465749642053, + "learning_rate": 1.279161538686402e-08, + "loss": 0.4985, + "step": 7880 + }, + { + "epoch": 0.98, + "grad_norm": 1.5867881570055768, + "learning_rate": 1.2648353553178506e-08, + "loss": 0.5077, + "step": 7881 + }, + { + "epoch": 0.98, + "grad_norm": 1.496988232167877, + "learning_rate": 1.2505897470075357e-08, + "loss": 0.4619, + "step": 7882 + }, + { + "epoch": 0.98, + "grad_norm": 1.388582596738789, + "learning_rate": 1.2364247160571164e-08, + "loss": 0.4528, + "step": 7883 + }, + { + "epoch": 0.98, + "grad_norm": 1.3925588795777673, + "learning_rate": 1.2223402647549843e-08, + "loss": 0.4751, + "step": 7884 + }, + { + "epoch": 0.98, + "grad_norm": 1.315233321300705, + "learning_rate": 1.2083363953766525e-08, + "loss": 0.5084, + "step": 7885 + }, + { + "epoch": 0.98, + "grad_norm": 1.4819177522796911, + "learning_rate": 1.194413110184478e-08, + "loss": 0.5214, + "step": 7886 + }, + { + "epoch": 0.98, + "grad_norm": 1.2908476184298494, + "learning_rate": 1.1805704114279948e-08, + "loss": 0.4593, + "step": 7887 + }, + { + "epoch": 0.98, + "grad_norm": 1.3019806390033144, + "learning_rate": 1.1668083013435804e-08, + "loss": 0.5032, + "step": 7888 + }, + { + "epoch": 0.98, + "grad_norm": 0.7250265062600212, + "learning_rate": 1.1531267821545678e-08, + "loss": 0.4961, + "step": 7889 + }, + { + "epoch": 0.98, + "grad_norm": 1.506622513165299, + "learning_rate": 1.1395258560714106e-08, + "loss": 0.5137, + "step": 7890 + }, + { + "epoch": 0.98, + "grad_norm": 1.388515573850204, + "learning_rate": 1.1260055252914071e-08, + "loss": 0.5433, + "step": 7891 + }, + { + "epoch": 0.98, + "grad_norm": 1.648138461522593, + "learning_rate": 1.1125657919989208e-08, + "loss": 0.5046, + "step": 7892 + }, + { + "epoch": 0.98, + "grad_norm": 1.8268093577316844, + "learning_rate": 1.099206658365215e-08, + "loss": 0.474, + "step": 7893 + }, + { + "epoch": 0.98, + "grad_norm": 1.462649421981926, + "learning_rate": 1.0859281265486189e-08, + "loss": 0.4886, + "step": 7894 + }, + { + "epoch": 0.98, + "grad_norm": 1.8737725933823501, + "learning_rate": 1.0727301986943605e-08, + "loss": 0.4958, + "step": 7895 + }, + { + "epoch": 0.98, + "grad_norm": 1.4911950734488788, + "learning_rate": 1.0596128769347347e-08, + "loss": 0.4844, + "step": 7896 + }, + { + "epoch": 0.98, + "grad_norm": 1.638525328685385, + "learning_rate": 1.0465761633888793e-08, + "loss": 0.5159, + "step": 7897 + }, + { + "epoch": 0.98, + "grad_norm": 1.3760722132055998, + "learning_rate": 1.0336200601630542e-08, + "loss": 0.504, + "step": 7898 + }, + { + "epoch": 0.98, + "grad_norm": 1.525924868180361, + "learning_rate": 1.020744569350418e-08, + "loss": 0.531, + "step": 7899 + }, + { + "epoch": 0.98, + "grad_norm": 1.6230687050084631, + "learning_rate": 1.007949693031085e-08, + "loss": 0.4862, + "step": 7900 + }, + { + "epoch": 0.98, + "grad_norm": 1.759421855350358, + "learning_rate": 9.952354332722348e-09, + "loss": 0.4731, + "step": 7901 + }, + { + "epoch": 0.98, + "grad_norm": 3.7801379899361045, + "learning_rate": 9.826017921279463e-09, + "loss": 0.4566, + "step": 7902 + }, + { + "epoch": 0.98, + "grad_norm": 1.52962165080147, + "learning_rate": 9.700487716392537e-09, + "loss": 0.5379, + "step": 7903 + }, + { + "epoch": 0.98, + "grad_norm": 1.963892250825473, + "learning_rate": 9.57576373834257e-09, + "loss": 0.4842, + "step": 7904 + }, + { + "epoch": 0.98, + "grad_norm": 1.373974698728333, + "learning_rate": 9.451846007280107e-09, + "loss": 0.5251, + "step": 7905 + }, + { + "epoch": 0.98, + "grad_norm": 0.716667262902133, + "learning_rate": 9.328734543224138e-09, + "loss": 0.4694, + "step": 7906 + }, + { + "epoch": 0.98, + "grad_norm": 2.6714175262182387, + "learning_rate": 9.206429366065418e-09, + "loss": 0.4735, + "step": 7907 + }, + { + "epoch": 0.98, + "grad_norm": 1.833029537151372, + "learning_rate": 9.084930495563138e-09, + "loss": 0.5255, + "step": 7908 + }, + { + "epoch": 0.98, + "grad_norm": 3.5869767827391525, + "learning_rate": 8.964237951346044e-09, + "loss": 0.4631, + "step": 7909 + }, + { + "epoch": 0.98, + "grad_norm": 1.4241008230767385, + "learning_rate": 8.844351752913539e-09, + "loss": 0.5049, + "step": 7910 + }, + { + "epoch": 0.98, + "grad_norm": 1.5467926959639, + "learning_rate": 8.72527191963457e-09, + "loss": 0.5108, + "step": 7911 + }, + { + "epoch": 0.98, + "grad_norm": 1.5800485283358539, + "learning_rate": 8.606998470746531e-09, + "loss": 0.5012, + "step": 7912 + }, + { + "epoch": 0.98, + "grad_norm": 1.2881792953126658, + "learning_rate": 8.489531425358576e-09, + "loss": 0.4238, + "step": 7913 + }, + { + "epoch": 0.98, + "grad_norm": 1.5260835129310093, + "learning_rate": 8.372870802447752e-09, + "loss": 0.4783, + "step": 7914 + }, + { + "epoch": 0.98, + "grad_norm": 1.3749626334965497, + "learning_rate": 8.257016620862313e-09, + "loss": 0.5112, + "step": 7915 + }, + { + "epoch": 0.98, + "grad_norm": 1.3489776684036383, + "learning_rate": 8.141968899318953e-09, + "loss": 0.5131, + "step": 7916 + }, + { + "epoch": 0.98, + "grad_norm": 1.5186970614708848, + "learning_rate": 8.027727656405027e-09, + "loss": 0.5219, + "step": 7917 + }, + { + "epoch": 0.98, + "grad_norm": 1.5606982472763145, + "learning_rate": 7.914292910576882e-09, + "loss": 0.5179, + "step": 7918 + }, + { + "epoch": 0.98, + "grad_norm": 1.1687304109309202, + "learning_rate": 7.801664680161526e-09, + "loss": 0.4706, + "step": 7919 + }, + { + "epoch": 0.98, + "grad_norm": 2.9162988368769898, + "learning_rate": 7.6898429833544e-09, + "loss": 0.4693, + "step": 7920 + }, + { + "epoch": 0.98, + "grad_norm": 1.9704069261789112, + "learning_rate": 7.578827838221059e-09, + "loss": 0.5034, + "step": 7921 + }, + { + "epoch": 0.98, + "grad_norm": 1.9394228000122264, + "learning_rate": 7.468619262697708e-09, + "loss": 0.5163, + "step": 7922 + }, + { + "epoch": 0.98, + "grad_norm": 1.7271604647231455, + "learning_rate": 7.359217274589547e-09, + "loss": 0.4904, + "step": 7923 + }, + { + "epoch": 0.98, + "grad_norm": 1.4181225934413368, + "learning_rate": 7.250621891570775e-09, + "loss": 0.5419, + "step": 7924 + }, + { + "epoch": 0.98, + "grad_norm": 1.3000477016700693, + "learning_rate": 7.142833131186244e-09, + "loss": 0.473, + "step": 7925 + }, + { + "epoch": 0.98, + "grad_norm": 1.2807710850926293, + "learning_rate": 7.035851010850359e-09, + "loss": 0.4878, + "step": 7926 + }, + { + "epoch": 0.98, + "grad_norm": 1.4349642139062453, + "learning_rate": 6.929675547846515e-09, + "loss": 0.4818, + "step": 7927 + }, + { + "epoch": 0.98, + "grad_norm": 1.4312178958680224, + "learning_rate": 6.824306759328769e-09, + "loss": 0.5231, + "step": 7928 + }, + { + "epoch": 0.98, + "grad_norm": 3.5932973980498493, + "learning_rate": 6.719744662320172e-09, + "loss": 0.5194, + "step": 7929 + }, + { + "epoch": 0.98, + "grad_norm": 1.7986015396528459, + "learning_rate": 6.6159892737138746e-09, + "loss": 0.4934, + "step": 7930 + }, + { + "epoch": 0.98, + "grad_norm": 2.5386905898599568, + "learning_rate": 6.5130406102720255e-09, + "loss": 0.4808, + "step": 7931 + }, + { + "epoch": 0.98, + "grad_norm": 1.596510821692502, + "learning_rate": 6.4108986886274315e-09, + "loss": 0.4887, + "step": 7932 + }, + { + "epoch": 0.98, + "grad_norm": 1.44232300560664, + "learning_rate": 6.309563525281337e-09, + "loss": 0.4932, + "step": 7933 + }, + { + "epoch": 0.98, + "grad_norm": 1.8056468590688406, + "learning_rate": 6.2090351366062005e-09, + "loss": 0.5026, + "step": 7934 + }, + { + "epoch": 0.98, + "grad_norm": 1.4217766938218368, + "learning_rate": 6.10931353884292e-09, + "loss": 0.4779, + "step": 7935 + }, + { + "epoch": 0.98, + "grad_norm": 1.289948288498926, + "learning_rate": 6.0103987481019425e-09, + "loss": 0.4627, + "step": 7936 + }, + { + "epoch": 0.98, + "grad_norm": 1.5237895784464568, + "learning_rate": 5.912290780364371e-09, + "loss": 0.4737, + "step": 7937 + }, + { + "epoch": 0.99, + "grad_norm": 1.2976296732878239, + "learning_rate": 5.814989651480307e-09, + "loss": 0.4797, + "step": 7938 + }, + { + "epoch": 0.99, + "grad_norm": 1.4923362635589985, + "learning_rate": 5.718495377169398e-09, + "loss": 0.5031, + "step": 7939 + }, + { + "epoch": 0.99, + "grad_norm": 1.6018338958446563, + "learning_rate": 5.6228079730208386e-09, + "loss": 0.5023, + "step": 7940 + }, + { + "epoch": 0.99, + "grad_norm": 1.453318636362527, + "learning_rate": 5.527927454495041e-09, + "loss": 0.5132, + "step": 7941 + }, + { + "epoch": 0.99, + "grad_norm": 0.6526477392008091, + "learning_rate": 5.43385383691919e-09, + "loss": 0.5141, + "step": 7942 + }, + { + "epoch": 0.99, + "grad_norm": 1.527193581772258, + "learning_rate": 5.340587135492792e-09, + "loss": 0.4674, + "step": 7943 + }, + { + "epoch": 0.99, + "grad_norm": 1.5351659484527673, + "learning_rate": 5.248127365283795e-09, + "loss": 0.5062, + "step": 7944 + }, + { + "epoch": 0.99, + "grad_norm": 1.5048077567795504, + "learning_rate": 5.156474541229694e-09, + "loss": 0.4739, + "step": 7945 + }, + { + "epoch": 0.99, + "grad_norm": 1.5923104159215158, + "learning_rate": 5.065628678137535e-09, + "loss": 0.442, + "step": 7946 + }, + { + "epoch": 0.99, + "grad_norm": 1.5367669913165014, + "learning_rate": 4.975589790684465e-09, + "loss": 0.538, + "step": 7947 + }, + { + "epoch": 0.99, + "grad_norm": 1.710977541997585, + "learning_rate": 4.8863578934171815e-09, + "loss": 0.5158, + "step": 7948 + }, + { + "epoch": 0.99, + "grad_norm": 1.408633133018979, + "learning_rate": 4.797933000751376e-09, + "loss": 0.5119, + "step": 7949 + }, + { + "epoch": 0.99, + "grad_norm": 1.5436028172282805, + "learning_rate": 4.710315126973952e-09, + "loss": 0.4909, + "step": 7950 + }, + { + "epoch": 0.99, + "grad_norm": 3.0541233013668996, + "learning_rate": 4.6235042862397e-09, + "loss": 0.4652, + "step": 7951 + }, + { + "epoch": 0.99, + "grad_norm": 3.019705736632635, + "learning_rate": 4.53750049257351e-09, + "loss": 0.5356, + "step": 7952 + }, + { + "epoch": 0.99, + "grad_norm": 1.4453039044862797, + "learning_rate": 4.452303759869825e-09, + "loss": 0.4711, + "step": 7953 + }, + { + "epoch": 0.99, + "grad_norm": 1.3822195378285782, + "learning_rate": 4.3679141018937445e-09, + "loss": 0.4825, + "step": 7954 + }, + { + "epoch": 0.99, + "grad_norm": 1.4037807224753245, + "learning_rate": 4.284331532278807e-09, + "loss": 0.4862, + "step": 7955 + }, + { + "epoch": 0.99, + "grad_norm": 1.5031811880438142, + "learning_rate": 4.201556064528101e-09, + "loss": 0.5291, + "step": 7956 + }, + { + "epoch": 0.99, + "grad_norm": 1.307167431767514, + "learning_rate": 4.1195877120153716e-09, + "loss": 0.4643, + "step": 7957 + }, + { + "epoch": 0.99, + "grad_norm": 1.5174307919187913, + "learning_rate": 4.038426487982805e-09, + "loss": 0.4864, + "step": 7958 + }, + { + "epoch": 0.99, + "grad_norm": 1.5627982372472973, + "learning_rate": 3.9580724055432455e-09, + "loss": 0.5064, + "step": 7959 + }, + { + "epoch": 0.99, + "grad_norm": 5.398204333643108, + "learning_rate": 3.878525477677975e-09, + "loss": 0.4993, + "step": 7960 + }, + { + "epoch": 0.99, + "grad_norm": 1.2810352158757143, + "learning_rate": 3.799785717238935e-09, + "loss": 0.4969, + "step": 7961 + }, + { + "epoch": 0.99, + "grad_norm": 1.709476098049419, + "learning_rate": 3.7218531369476174e-09, + "loss": 0.4781, + "step": 7962 + }, + { + "epoch": 0.99, + "grad_norm": 1.5636347778347486, + "learning_rate": 3.644727749393395e-09, + "loss": 0.4709, + "step": 7963 + }, + { + "epoch": 0.99, + "grad_norm": 0.63189574021935, + "learning_rate": 3.568409567037967e-09, + "loss": 0.521, + "step": 7964 + }, + { + "epoch": 0.99, + "grad_norm": 1.6745069970963564, + "learning_rate": 3.4928986022103595e-09, + "loss": 0.4898, + "step": 7965 + }, + { + "epoch": 0.99, + "grad_norm": 1.2734109830619809, + "learning_rate": 3.4181948671102583e-09, + "loss": 0.5043, + "step": 7966 + }, + { + "epoch": 0.99, + "grad_norm": 1.7824674763979274, + "learning_rate": 3.344298373806898e-09, + "loss": 0.4987, + "step": 7967 + }, + { + "epoch": 0.99, + "grad_norm": 1.4767364501336402, + "learning_rate": 3.2712091342385065e-09, + "loss": 0.5063, + "step": 7968 + }, + { + "epoch": 0.99, + "grad_norm": 1.4764672841746604, + "learning_rate": 3.1989271602134163e-09, + "loss": 0.4688, + "step": 7969 + }, + { + "epoch": 0.99, + "grad_norm": 1.4718649596668767, + "learning_rate": 3.1274524634100634e-09, + "loss": 0.4937, + "step": 7970 + }, + { + "epoch": 0.99, + "grad_norm": 1.9146013021572743, + "learning_rate": 3.0567850553747667e-09, + "loss": 0.4767, + "step": 7971 + }, + { + "epoch": 0.99, + "grad_norm": 1.5376106330790187, + "learning_rate": 2.9869249475256155e-09, + "loss": 0.4888, + "step": 7972 + }, + { + "epoch": 0.99, + "grad_norm": 1.6629488262473051, + "learning_rate": 2.917872151148027e-09, + "loss": 0.4718, + "step": 7973 + }, + { + "epoch": 0.99, + "grad_norm": 1.5242519408275426, + "learning_rate": 2.8496266773986314e-09, + "loss": 0.5081, + "step": 7974 + }, + { + "epoch": 0.99, + "grad_norm": 1.3755774278666677, + "learning_rate": 2.7821885373030545e-09, + "loss": 0.5311, + "step": 7975 + }, + { + "epoch": 0.99, + "grad_norm": 1.385990845021792, + "learning_rate": 2.715557741756469e-09, + "loss": 0.478, + "step": 7976 + }, + { + "epoch": 0.99, + "grad_norm": 1.2967126233409623, + "learning_rate": 2.6497343015235987e-09, + "loss": 0.4459, + "step": 7977 + }, + { + "epoch": 0.99, + "grad_norm": 1.4684511791320787, + "learning_rate": 2.584718227239269e-09, + "loss": 0.4752, + "step": 7978 + }, + { + "epoch": 0.99, + "grad_norm": 1.519683551519481, + "learning_rate": 2.520509529406745e-09, + "loss": 0.4846, + "step": 7979 + }, + { + "epoch": 0.99, + "grad_norm": 1.5136123778915815, + "learning_rate": 2.4571082183993955e-09, + "loss": 0.4975, + "step": 7980 + }, + { + "epoch": 0.99, + "grad_norm": 1.440141895316327, + "learning_rate": 2.3945143044612485e-09, + "loss": 0.4474, + "step": 7981 + }, + { + "epoch": 0.99, + "grad_norm": 1.7085897376426566, + "learning_rate": 2.33272779770366e-09, + "loss": 0.4575, + "step": 7982 + }, + { + "epoch": 0.99, + "grad_norm": 1.4363383949547228, + "learning_rate": 2.271748708110311e-09, + "loss": 0.4879, + "step": 7983 + }, + { + "epoch": 0.99, + "grad_norm": 1.5093556934051071, + "learning_rate": 2.211577045531099e-09, + "loss": 0.5451, + "step": 7984 + }, + { + "epoch": 0.99, + "grad_norm": 1.4565013995684928, + "learning_rate": 2.1522128196888036e-09, + "loss": 0.5185, + "step": 7985 + }, + { + "epoch": 0.99, + "grad_norm": 1.4807416392206982, + "learning_rate": 2.0936560401735306e-09, + "loss": 0.4876, + "step": 7986 + }, + { + "epoch": 0.99, + "grad_norm": 1.7294427571785986, + "learning_rate": 2.0359067164454905e-09, + "loss": 0.4929, + "step": 7987 + }, + { + "epoch": 0.99, + "grad_norm": 1.646187430172883, + "learning_rate": 1.9789648578349975e-09, + "loss": 0.492, + "step": 7988 + }, + { + "epoch": 0.99, + "grad_norm": 1.5770653317502463, + "learning_rate": 1.9228304735413594e-09, + "loss": 0.5405, + "step": 7989 + }, + { + "epoch": 0.99, + "grad_norm": 1.5488248308485255, + "learning_rate": 1.8675035726339883e-09, + "loss": 0.4976, + "step": 7990 + }, + { + "epoch": 0.99, + "grad_norm": 1.5937102313405058, + "learning_rate": 1.812984164050735e-09, + "loss": 0.544, + "step": 7991 + }, + { + "epoch": 0.99, + "grad_norm": 1.5565998451857792, + "learning_rate": 1.7592722565995536e-09, + "loss": 0.4508, + "step": 7992 + }, + { + "epoch": 0.99, + "grad_norm": 4.208208978134417, + "learning_rate": 1.706367858959057e-09, + "loss": 0.5158, + "step": 7993 + }, + { + "epoch": 0.99, + "grad_norm": 0.6943100273181849, + "learning_rate": 1.6542709796757429e-09, + "loss": 0.4905, + "step": 7994 + }, + { + "epoch": 0.99, + "grad_norm": 1.4754337734216012, + "learning_rate": 1.6029816271667664e-09, + "loss": 0.4433, + "step": 7995 + }, + { + "epoch": 0.99, + "grad_norm": 1.7784248749244436, + "learning_rate": 1.5524998097177225e-09, + "loss": 0.5054, + "step": 7996 + }, + { + "epoch": 0.99, + "grad_norm": 1.6584613863369149, + "learning_rate": 1.502825535484309e-09, + "loss": 0.4693, + "step": 7997 + }, + { + "epoch": 0.99, + "grad_norm": 1.3627989829776788, + "learning_rate": 1.4539588124928838e-09, + "loss": 0.4536, + "step": 7998 + }, + { + "epoch": 0.99, + "grad_norm": 1.6144518696085406, + "learning_rate": 1.4058996486376875e-09, + "loss": 0.5055, + "step": 7999 + }, + { + "epoch": 0.99, + "grad_norm": 3.069795091114158, + "learning_rate": 1.3586480516825096e-09, + "loss": 0.4743, + "step": 8000 + } + ], + "logging_steps": 1.0, + "max_steps": 8058, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 4101554005508096.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}