diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,72161 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.999878419452887, + "eval_steps": 514, + "global_step": 10280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00048632218844984804, + "grad_norm": 0.31342887076158477, + "learning_rate": 5.000000000000001e-07, + "loss": 0.7161, + "step": 1 + }, + { + "epoch": 0.00048632218844984804, + "eval_loss": 0.7662228345870972, + "eval_runtime": 104.4915, + "eval_samples_per_second": 290.483, + "eval_steps_per_second": 36.319, + "step": 1 + }, + { + "epoch": 0.0009726443768996961, + "grad_norm": 0.334158515730528, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.7407, + "step": 2 + }, + { + "epoch": 0.001458966565349544, + "grad_norm": 0.31676767269315387, + "learning_rate": 1.5e-06, + "loss": 0.7369, + "step": 3 + }, + { + "epoch": 0.0019452887537993921, + "grad_norm": 0.326261473546936, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7904, + "step": 4 + }, + { + "epoch": 0.0024316109422492403, + "grad_norm": 0.3264110392881449, + "learning_rate": 2.5e-06, + "loss": 0.8081, + "step": 5 + }, + { + "epoch": 0.002917933130699088, + "grad_norm": 0.29499153993503074, + "learning_rate": 3e-06, + "loss": 0.7963, + "step": 6 + }, + { + "epoch": 0.003404255319148936, + "grad_norm": 0.3391905332281365, + "learning_rate": 3.5e-06, + "loss": 0.7669, + "step": 7 + }, + { + "epoch": 0.0038905775075987843, + "grad_norm": 0.29078806642007365, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7713, + "step": 8 + }, + { + "epoch": 0.004376899696048632, + "grad_norm": 0.28565706294072274, + "learning_rate": 4.5e-06, + "loss": 0.7614, + "step": 9 + }, + { + "epoch": 0.004863221884498481, + "grad_norm": 0.1905001372976004, + "learning_rate": 5e-06, + "loss": 0.7764, + "step": 10 + }, + { + "epoch": 0.005349544072948328, + "grad_norm": 0.19436409340543703, + "learning_rate": 5.500000000000001e-06, + "loss": 0.7218, + "step": 11 + }, + { + "epoch": 0.005835866261398176, + "grad_norm": 0.18034219716480748, + "learning_rate": 6e-06, + "loss": 0.6978, + "step": 12 + }, + { + "epoch": 0.006322188449848025, + "grad_norm": 0.16176764190446236, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.7673, + "step": 13 + }, + { + "epoch": 0.006808510638297872, + "grad_norm": 0.1840359774691012, + "learning_rate": 7e-06, + "loss": 0.7289, + "step": 14 + }, + { + "epoch": 0.00729483282674772, + "grad_norm": 0.19452307183227896, + "learning_rate": 7.500000000000001e-06, + "loss": 0.6935, + "step": 15 + }, + { + "epoch": 0.007781155015197569, + "grad_norm": 0.18147032815544542, + "learning_rate": 8.000000000000001e-06, + "loss": 0.7184, + "step": 16 + }, + { + "epoch": 0.008267477203647417, + "grad_norm": 0.14921988529574418, + "learning_rate": 8.5e-06, + "loss": 0.7208, + "step": 17 + }, + { + "epoch": 0.008753799392097264, + "grad_norm": 0.13370296313161623, + "learning_rate": 9e-06, + "loss": 0.6778, + "step": 18 + }, + { + "epoch": 0.009240121580547113, + "grad_norm": 0.14257941486401873, + "learning_rate": 9.5e-06, + "loss": 0.6935, + "step": 19 + }, + { + "epoch": 0.009726443768996961, + "grad_norm": 0.13681987045190647, + "learning_rate": 1e-05, + "loss": 0.641, + "step": 20 + }, + { + "epoch": 0.010212765957446808, + "grad_norm": 0.14011042403475713, + "learning_rate": 9.99999963340339e-06, + "loss": 0.6986, + "step": 21 + }, + { + "epoch": 0.010699088145896657, + "grad_norm": 0.12723168311084196, + "learning_rate": 9.999998533613611e-06, + "loss": 0.6693, + "step": 22 + }, + { + "epoch": 0.011185410334346505, + "grad_norm": 0.13487668537848563, + "learning_rate": 9.999996700630827e-06, + "loss": 0.7116, + "step": 23 + }, + { + "epoch": 0.011671732522796352, + "grad_norm": 0.14091846193535895, + "learning_rate": 9.999994134455306e-06, + "loss": 0.7058, + "step": 24 + }, + { + "epoch": 0.0121580547112462, + "grad_norm": 0.14196587593318416, + "learning_rate": 9.999990835087423e-06, + "loss": 0.6628, + "step": 25 + }, + { + "epoch": 0.01264437689969605, + "grad_norm": 0.1303053296835844, + "learning_rate": 9.999986802527664e-06, + "loss": 0.654, + "step": 26 + }, + { + "epoch": 0.013130699088145896, + "grad_norm": 0.12532311300629753, + "learning_rate": 9.999982036776617e-06, + "loss": 0.6745, + "step": 27 + }, + { + "epoch": 0.013617021276595745, + "grad_norm": 0.12361891832951762, + "learning_rate": 9.999976537834983e-06, + "loss": 0.7084, + "step": 28 + }, + { + "epoch": 0.014103343465045593, + "grad_norm": 0.12688676168891824, + "learning_rate": 9.99997030570357e-06, + "loss": 0.7141, + "step": 29 + }, + { + "epoch": 0.01458966565349544, + "grad_norm": 0.1225998310166146, + "learning_rate": 9.999963340383288e-06, + "loss": 0.6663, + "step": 30 + }, + { + "epoch": 0.015075987841945289, + "grad_norm": 0.12170075854441824, + "learning_rate": 9.999955641875162e-06, + "loss": 0.6965, + "step": 31 + }, + { + "epoch": 0.015562310030395137, + "grad_norm": 0.1221017771612701, + "learning_rate": 9.999947210180319e-06, + "loss": 0.7278, + "step": 32 + }, + { + "epoch": 0.016048632218844984, + "grad_norm": 0.1235116174205995, + "learning_rate": 9.999938045299996e-06, + "loss": 0.7131, + "step": 33 + }, + { + "epoch": 0.016534954407294834, + "grad_norm": 0.12123547248834239, + "learning_rate": 9.999928147235536e-06, + "loss": 0.6628, + "step": 34 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 0.1209956452505472, + "learning_rate": 9.99991751598839e-06, + "loss": 0.6924, + "step": 35 + }, + { + "epoch": 0.017507598784194528, + "grad_norm": 0.12174816723148756, + "learning_rate": 9.999906151560122e-06, + "loss": 0.6386, + "step": 36 + }, + { + "epoch": 0.01799392097264438, + "grad_norm": 0.12194164456847331, + "learning_rate": 9.999894053952391e-06, + "loss": 0.6598, + "step": 37 + }, + { + "epoch": 0.018480243161094225, + "grad_norm": 0.12109765845671047, + "learning_rate": 9.999881223166976e-06, + "loss": 0.6792, + "step": 38 + }, + { + "epoch": 0.018966565349544072, + "grad_norm": 0.11665211787806205, + "learning_rate": 9.999867659205758e-06, + "loss": 0.7378, + "step": 39 + }, + { + "epoch": 0.019452887537993922, + "grad_norm": 0.12184656948093012, + "learning_rate": 9.999853362070724e-06, + "loss": 0.7125, + "step": 40 + }, + { + "epoch": 0.01993920972644377, + "grad_norm": 0.1234263613036688, + "learning_rate": 9.99983833176397e-06, + "loss": 0.6814, + "step": 41 + }, + { + "epoch": 0.020425531914893616, + "grad_norm": 0.11468002752463782, + "learning_rate": 9.999822568287703e-06, + "loss": 0.6636, + "step": 42 + }, + { + "epoch": 0.020911854103343466, + "grad_norm": 0.11842038191755005, + "learning_rate": 9.999806071644234e-06, + "loss": 0.6381, + "step": 43 + }, + { + "epoch": 0.021398176291793313, + "grad_norm": 0.1144864666549832, + "learning_rate": 9.999788841835981e-06, + "loss": 0.6846, + "step": 44 + }, + { + "epoch": 0.02188449848024316, + "grad_norm": 0.11904181909619377, + "learning_rate": 9.999770878865469e-06, + "loss": 0.6435, + "step": 45 + }, + { + "epoch": 0.02237082066869301, + "grad_norm": 0.12234067261973151, + "learning_rate": 9.999752182735335e-06, + "loss": 0.7094, + "step": 46 + }, + { + "epoch": 0.022857142857142857, + "grad_norm": 0.11794427564135766, + "learning_rate": 9.999732753448318e-06, + "loss": 0.6986, + "step": 47 + }, + { + "epoch": 0.023343465045592704, + "grad_norm": 0.12072167177150002, + "learning_rate": 9.99971259100727e-06, + "loss": 0.6604, + "step": 48 + }, + { + "epoch": 0.023829787234042554, + "grad_norm": 0.11996908580390374, + "learning_rate": 9.999691695415146e-06, + "loss": 0.6842, + "step": 49 + }, + { + "epoch": 0.0243161094224924, + "grad_norm": 0.12186788928863322, + "learning_rate": 9.99967006667501e-06, + "loss": 0.6848, + "step": 50 + }, + { + "epoch": 0.024802431610942248, + "grad_norm": 0.1179648173315135, + "learning_rate": 9.999647704790032e-06, + "loss": 0.6662, + "step": 51 + }, + { + "epoch": 0.0252887537993921, + "grad_norm": 0.12178429099697184, + "learning_rate": 9.999624609763495e-06, + "loss": 0.7101, + "step": 52 + }, + { + "epoch": 0.025775075987841945, + "grad_norm": 0.12246757168040684, + "learning_rate": 9.999600781598783e-06, + "loss": 0.7063, + "step": 53 + }, + { + "epoch": 0.026261398176291792, + "grad_norm": 0.1229181946656624, + "learning_rate": 9.99957622029939e-06, + "loss": 0.6585, + "step": 54 + }, + { + "epoch": 0.026747720364741642, + "grad_norm": 0.12636155475725114, + "learning_rate": 9.999550925868919e-06, + "loss": 0.6953, + "step": 55 + }, + { + "epoch": 0.02723404255319149, + "grad_norm": 0.12015041575765832, + "learning_rate": 9.999524898311077e-06, + "loss": 0.6971, + "step": 56 + }, + { + "epoch": 0.027720364741641336, + "grad_norm": 0.11512943579786336, + "learning_rate": 9.999498137629684e-06, + "loss": 0.6552, + "step": 57 + }, + { + "epoch": 0.028206686930091186, + "grad_norm": 0.1213401086498426, + "learning_rate": 9.999470643828662e-06, + "loss": 0.6479, + "step": 58 + }, + { + "epoch": 0.028693009118541033, + "grad_norm": 0.12091003596412728, + "learning_rate": 9.99944241691204e-06, + "loss": 0.6516, + "step": 59 + }, + { + "epoch": 0.02917933130699088, + "grad_norm": 0.12373118381179596, + "learning_rate": 9.999413456883963e-06, + "loss": 0.6657, + "step": 60 + }, + { + "epoch": 0.02966565349544073, + "grad_norm": 0.12080391465640891, + "learning_rate": 9.999383763748673e-06, + "loss": 0.6375, + "step": 61 + }, + { + "epoch": 0.030151975683890577, + "grad_norm": 0.11849737888049058, + "learning_rate": 9.999353337510526e-06, + "loss": 0.6786, + "step": 62 + }, + { + "epoch": 0.030638297872340424, + "grad_norm": 0.11963326720084462, + "learning_rate": 9.999322178173985e-06, + "loss": 0.6483, + "step": 63 + }, + { + "epoch": 0.031124620060790274, + "grad_norm": 0.11698726914446085, + "learning_rate": 9.999290285743617e-06, + "loss": 0.6765, + "step": 64 + }, + { + "epoch": 0.031610942249240125, + "grad_norm": 0.11522309731346195, + "learning_rate": 9.999257660224098e-06, + "loss": 0.6514, + "step": 65 + }, + { + "epoch": 0.03209726443768997, + "grad_norm": 0.12142778040603465, + "learning_rate": 9.999224301620214e-06, + "loss": 0.6607, + "step": 66 + }, + { + "epoch": 0.03258358662613982, + "grad_norm": 0.11833572736427499, + "learning_rate": 9.999190209936857e-06, + "loss": 0.659, + "step": 67 + }, + { + "epoch": 0.03306990881458967, + "grad_norm": 0.12175472926875927, + "learning_rate": 9.999155385179025e-06, + "loss": 0.6543, + "step": 68 + }, + { + "epoch": 0.03355623100303951, + "grad_norm": 0.11724310675083763, + "learning_rate": 9.999119827351824e-06, + "loss": 0.69, + "step": 69 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 0.11899996574595607, + "learning_rate": 9.99908353646047e-06, + "loss": 0.6891, + "step": 70 + }, + { + "epoch": 0.03452887537993921, + "grad_norm": 0.11978458478743302, + "learning_rate": 9.999046512510284e-06, + "loss": 0.6669, + "step": 71 + }, + { + "epoch": 0.035015197568389056, + "grad_norm": 0.12777701402284913, + "learning_rate": 9.999008755506694e-06, + "loss": 0.6619, + "step": 72 + }, + { + "epoch": 0.035501519756838906, + "grad_norm": 0.12196178571987971, + "learning_rate": 9.998970265455238e-06, + "loss": 0.645, + "step": 73 + }, + { + "epoch": 0.03598784194528876, + "grad_norm": 0.11813820328725896, + "learning_rate": 9.99893104236156e-06, + "loss": 0.6387, + "step": 74 + }, + { + "epoch": 0.0364741641337386, + "grad_norm": 0.12029825188155972, + "learning_rate": 9.99889108623141e-06, + "loss": 0.7012, + "step": 75 + }, + { + "epoch": 0.03696048632218845, + "grad_norm": 0.11227692399158924, + "learning_rate": 9.99885039707065e-06, + "loss": 0.6643, + "step": 76 + }, + { + "epoch": 0.0374468085106383, + "grad_norm": 0.11308607110289605, + "learning_rate": 9.998808974885244e-06, + "loss": 0.6366, + "step": 77 + }, + { + "epoch": 0.037933130699088144, + "grad_norm": 0.11722293546518353, + "learning_rate": 9.998766819681268e-06, + "loss": 0.6785, + "step": 78 + }, + { + "epoch": 0.038419452887537994, + "grad_norm": 0.11778874372386479, + "learning_rate": 9.9987239314649e-06, + "loss": 0.6631, + "step": 79 + }, + { + "epoch": 0.038905775075987845, + "grad_norm": 0.11393109271304705, + "learning_rate": 9.998680310242434e-06, + "loss": 0.6447, + "step": 80 + }, + { + "epoch": 0.03939209726443769, + "grad_norm": 0.11352105585660828, + "learning_rate": 9.998635956020263e-06, + "loss": 0.6953, + "step": 81 + }, + { + "epoch": 0.03987841945288754, + "grad_norm": 0.11492422904092232, + "learning_rate": 9.998590868804895e-06, + "loss": 0.6497, + "step": 82 + }, + { + "epoch": 0.04036474164133739, + "grad_norm": 0.11216849249930264, + "learning_rate": 9.998545048602938e-06, + "loss": 0.5993, + "step": 83 + }, + { + "epoch": 0.04085106382978723, + "grad_norm": 0.11208333477392295, + "learning_rate": 9.99849849542111e-06, + "loss": 0.6423, + "step": 84 + }, + { + "epoch": 0.04133738601823708, + "grad_norm": 0.11004452394150985, + "learning_rate": 9.99845120926624e-06, + "loss": 0.6814, + "step": 85 + }, + { + "epoch": 0.04182370820668693, + "grad_norm": 0.1093933685979544, + "learning_rate": 9.99840319014526e-06, + "loss": 0.6342, + "step": 86 + }, + { + "epoch": 0.042310030395136776, + "grad_norm": 0.10475010595542357, + "learning_rate": 9.998354438065215e-06, + "loss": 0.6252, + "step": 87 + }, + { + "epoch": 0.042796352583586626, + "grad_norm": 0.10849079308705853, + "learning_rate": 9.99830495303325e-06, + "loss": 0.6482, + "step": 88 + }, + { + "epoch": 0.04328267477203648, + "grad_norm": 0.1109051843507213, + "learning_rate": 9.998254735056624e-06, + "loss": 0.6181, + "step": 89 + }, + { + "epoch": 0.04376899696048632, + "grad_norm": 0.10347472557390267, + "learning_rate": 9.998203784142701e-06, + "loss": 0.6496, + "step": 90 + }, + { + "epoch": 0.04425531914893617, + "grad_norm": 0.1023494893362925, + "learning_rate": 9.998152100298952e-06, + "loss": 0.6889, + "step": 91 + }, + { + "epoch": 0.04474164133738602, + "grad_norm": 0.10128090895450541, + "learning_rate": 9.998099683532953e-06, + "loss": 0.6375, + "step": 92 + }, + { + "epoch": 0.045227963525835864, + "grad_norm": 0.1010149046358367, + "learning_rate": 9.998046533852395e-06, + "loss": 0.666, + "step": 93 + }, + { + "epoch": 0.045714285714285714, + "grad_norm": 0.09733547430095113, + "learning_rate": 9.997992651265067e-06, + "loss": 0.6296, + "step": 94 + }, + { + "epoch": 0.046200607902735565, + "grad_norm": 0.09621884123455722, + "learning_rate": 9.997938035778874e-06, + "loss": 0.638, + "step": 95 + }, + { + "epoch": 0.04668693009118541, + "grad_norm": 0.09502640263983121, + "learning_rate": 9.997882687401823e-06, + "loss": 0.6465, + "step": 96 + }, + { + "epoch": 0.04717325227963526, + "grad_norm": 0.09792477904150157, + "learning_rate": 9.997826606142031e-06, + "loss": 0.6196, + "step": 97 + }, + { + "epoch": 0.04765957446808511, + "grad_norm": 0.09964761338523928, + "learning_rate": 9.997769792007721e-06, + "loss": 0.6903, + "step": 98 + }, + { + "epoch": 0.04814589665653495, + "grad_norm": 0.09468446698771453, + "learning_rate": 9.997712245007225e-06, + "loss": 0.6601, + "step": 99 + }, + { + "epoch": 0.0486322188449848, + "grad_norm": 0.09135140026417463, + "learning_rate": 9.997653965148978e-06, + "loss": 0.6785, + "step": 100 + }, + { + "epoch": 0.04911854103343465, + "grad_norm": 0.09305247116557831, + "learning_rate": 9.997594952441533e-06, + "loss": 0.6497, + "step": 101 + }, + { + "epoch": 0.049604863221884496, + "grad_norm": 0.08954055344921939, + "learning_rate": 9.997535206893538e-06, + "loss": 0.6684, + "step": 102 + }, + { + "epoch": 0.050091185410334346, + "grad_norm": 0.09406151851787839, + "learning_rate": 9.997474728513757e-06, + "loss": 0.6902, + "step": 103 + }, + { + "epoch": 0.0505775075987842, + "grad_norm": 0.08570960033414718, + "learning_rate": 9.997413517311055e-06, + "loss": 0.6376, + "step": 104 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 0.0838484001210435, + "learning_rate": 9.997351573294412e-06, + "loss": 0.631, + "step": 105 + }, + { + "epoch": 0.05155015197568389, + "grad_norm": 0.0842468989312265, + "learning_rate": 9.997288896472907e-06, + "loss": 0.619, + "step": 106 + }, + { + "epoch": 0.05203647416413374, + "grad_norm": 0.09010940006399906, + "learning_rate": 9.997225486855735e-06, + "loss": 0.6707, + "step": 107 + }, + { + "epoch": 0.052522796352583584, + "grad_norm": 0.0808816638089893, + "learning_rate": 9.997161344452194e-06, + "loss": 0.648, + "step": 108 + }, + { + "epoch": 0.053009118541033434, + "grad_norm": 0.08337249166573397, + "learning_rate": 9.997096469271686e-06, + "loss": 0.6594, + "step": 109 + }, + { + "epoch": 0.053495440729483285, + "grad_norm": 0.08747506870175674, + "learning_rate": 9.997030861323728e-06, + "loss": 0.6706, + "step": 110 + }, + { + "epoch": 0.05398176291793313, + "grad_norm": 0.0798971427458053, + "learning_rate": 9.996964520617938e-06, + "loss": 0.6515, + "step": 111 + }, + { + "epoch": 0.05446808510638298, + "grad_norm": 0.09064111340606204, + "learning_rate": 9.996897447164047e-06, + "loss": 0.6413, + "step": 112 + }, + { + "epoch": 0.05495440729483283, + "grad_norm": 0.08853314144771497, + "learning_rate": 9.996829640971888e-06, + "loss": 0.6524, + "step": 113 + }, + { + "epoch": 0.05544072948328267, + "grad_norm": 0.07849763044568041, + "learning_rate": 9.996761102051404e-06, + "loss": 0.6399, + "step": 114 + }, + { + "epoch": 0.05592705167173252, + "grad_norm": 0.07796705560689386, + "learning_rate": 9.996691830412649e-06, + "loss": 0.6382, + "step": 115 + }, + { + "epoch": 0.05641337386018237, + "grad_norm": 0.0798110601596513, + "learning_rate": 9.996621826065776e-06, + "loss": 0.6282, + "step": 116 + }, + { + "epoch": 0.056899696048632216, + "grad_norm": 0.08267540446214865, + "learning_rate": 9.996551089021051e-06, + "loss": 0.6714, + "step": 117 + }, + { + "epoch": 0.057386018237082066, + "grad_norm": 0.07829609873617312, + "learning_rate": 9.996479619288853e-06, + "loss": 0.6554, + "step": 118 + }, + { + "epoch": 0.05787234042553192, + "grad_norm": 0.08075387885281121, + "learning_rate": 9.996407416879654e-06, + "loss": 0.6747, + "step": 119 + }, + { + "epoch": 0.05835866261398176, + "grad_norm": 0.08397011690681794, + "learning_rate": 9.996334481804047e-06, + "loss": 0.6902, + "step": 120 + }, + { + "epoch": 0.05884498480243161, + "grad_norm": 0.08116164351180236, + "learning_rate": 9.996260814072725e-06, + "loss": 0.6744, + "step": 121 + }, + { + "epoch": 0.05933130699088146, + "grad_norm": 0.07428881827032656, + "learning_rate": 9.99618641369649e-06, + "loss": 0.6376, + "step": 122 + }, + { + "epoch": 0.059817629179331304, + "grad_norm": 0.07660023165080271, + "learning_rate": 9.996111280686254e-06, + "loss": 0.651, + "step": 123 + }, + { + "epoch": 0.060303951367781154, + "grad_norm": 0.07500605550465393, + "learning_rate": 9.996035415053032e-06, + "loss": 0.6416, + "step": 124 + }, + { + "epoch": 0.060790273556231005, + "grad_norm": 0.07986814893372031, + "learning_rate": 9.995958816807951e-06, + "loss": 0.6687, + "step": 125 + }, + { + "epoch": 0.06127659574468085, + "grad_norm": 0.0804707259260744, + "learning_rate": 9.995881485962243e-06, + "loss": 0.6622, + "step": 126 + }, + { + "epoch": 0.0617629179331307, + "grad_norm": 0.07626838879851239, + "learning_rate": 9.995803422527246e-06, + "loss": 0.6189, + "step": 127 + }, + { + "epoch": 0.06224924012158055, + "grad_norm": 0.07676272450717668, + "learning_rate": 9.99572462651441e-06, + "loss": 0.6209, + "step": 128 + }, + { + "epoch": 0.0627355623100304, + "grad_norm": 0.0795701961265009, + "learning_rate": 9.995645097935285e-06, + "loss": 0.6701, + "step": 129 + }, + { + "epoch": 0.06322188449848025, + "grad_norm": 0.07338760681445435, + "learning_rate": 9.995564836801538e-06, + "loss": 0.6359, + "step": 130 + }, + { + "epoch": 0.06370820668693009, + "grad_norm": 0.07615608704119091, + "learning_rate": 9.995483843124933e-06, + "loss": 0.6762, + "step": 131 + }, + { + "epoch": 0.06419452887537994, + "grad_norm": 0.0774115676139232, + "learning_rate": 9.995402116917353e-06, + "loss": 0.6565, + "step": 132 + }, + { + "epoch": 0.06468085106382979, + "grad_norm": 0.0728271064207136, + "learning_rate": 9.995319658190778e-06, + "loss": 0.6381, + "step": 133 + }, + { + "epoch": 0.06516717325227964, + "grad_norm": 0.07473810238831956, + "learning_rate": 9.995236466957301e-06, + "loss": 0.6685, + "step": 134 + }, + { + "epoch": 0.06565349544072949, + "grad_norm": 0.07778393318867559, + "learning_rate": 9.995152543229122e-06, + "loss": 0.7078, + "step": 135 + }, + { + "epoch": 0.06613981762917934, + "grad_norm": 0.07654598128919217, + "learning_rate": 9.995067887018544e-06, + "loss": 0.6695, + "step": 136 + }, + { + "epoch": 0.06662613981762917, + "grad_norm": 0.07368534573201836, + "learning_rate": 9.994982498337985e-06, + "loss": 0.6264, + "step": 137 + }, + { + "epoch": 0.06711246200607902, + "grad_norm": 0.07755422239785308, + "learning_rate": 9.994896377199962e-06, + "loss": 0.6673, + "step": 138 + }, + { + "epoch": 0.06759878419452887, + "grad_norm": 0.0739152449458426, + "learning_rate": 9.994809523617109e-06, + "loss": 0.617, + "step": 139 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 0.07061573569583886, + "learning_rate": 9.994721937602157e-06, + "loss": 0.6225, + "step": 140 + }, + { + "epoch": 0.06857142857142857, + "grad_norm": 0.07457515709097928, + "learning_rate": 9.994633619167953e-06, + "loss": 0.6056, + "step": 141 + }, + { + "epoch": 0.06905775075987843, + "grad_norm": 0.07792013158490566, + "learning_rate": 9.994544568327445e-06, + "loss": 0.649, + "step": 142 + }, + { + "epoch": 0.06954407294832826, + "grad_norm": 0.07351487856534614, + "learning_rate": 9.994454785093695e-06, + "loss": 0.6279, + "step": 143 + }, + { + "epoch": 0.07003039513677811, + "grad_norm": 0.07577260988839267, + "learning_rate": 9.994364269479863e-06, + "loss": 0.6703, + "step": 144 + }, + { + "epoch": 0.07051671732522796, + "grad_norm": 0.08086276565885057, + "learning_rate": 9.99427302149923e-06, + "loss": 0.6939, + "step": 145 + }, + { + "epoch": 0.07100303951367781, + "grad_norm": 0.07932628326597181, + "learning_rate": 9.994181041165169e-06, + "loss": 0.6415, + "step": 146 + }, + { + "epoch": 0.07148936170212766, + "grad_norm": 0.0800659998687647, + "learning_rate": 9.994088328491173e-06, + "loss": 0.6587, + "step": 147 + }, + { + "epoch": 0.07197568389057751, + "grad_norm": 0.07676164938191389, + "learning_rate": 9.993994883490834e-06, + "loss": 0.699, + "step": 148 + }, + { + "epoch": 0.07246200607902735, + "grad_norm": 0.07354122008105174, + "learning_rate": 9.993900706177857e-06, + "loss": 0.6276, + "step": 149 + }, + { + "epoch": 0.0729483282674772, + "grad_norm": 0.07430454020811418, + "learning_rate": 9.99380579656605e-06, + "loss": 0.6437, + "step": 150 + }, + { + "epoch": 0.07343465045592705, + "grad_norm": 0.07562772046850341, + "learning_rate": 9.993710154669332e-06, + "loss": 0.6438, + "step": 151 + }, + { + "epoch": 0.0739209726443769, + "grad_norm": 0.07644640073684972, + "learning_rate": 9.993613780501727e-06, + "loss": 0.6343, + "step": 152 + }, + { + "epoch": 0.07440729483282675, + "grad_norm": 0.07181629428203033, + "learning_rate": 9.993516674077367e-06, + "loss": 0.6513, + "step": 153 + }, + { + "epoch": 0.0748936170212766, + "grad_norm": 0.07509996315055295, + "learning_rate": 9.99341883541049e-06, + "loss": 0.6626, + "step": 154 + }, + { + "epoch": 0.07537993920972644, + "grad_norm": 0.08578615670234167, + "learning_rate": 9.993320264515448e-06, + "loss": 0.6339, + "step": 155 + }, + { + "epoch": 0.07586626139817629, + "grad_norm": 0.06955973492988606, + "learning_rate": 9.99322096140669e-06, + "loss": 0.6177, + "step": 156 + }, + { + "epoch": 0.07635258358662614, + "grad_norm": 0.07267545309129311, + "learning_rate": 9.993120926098781e-06, + "loss": 0.6342, + "step": 157 + }, + { + "epoch": 0.07683890577507599, + "grad_norm": 0.07832105979852015, + "learning_rate": 9.99302015860639e-06, + "loss": 0.6736, + "step": 158 + }, + { + "epoch": 0.07732522796352584, + "grad_norm": 0.07777012583112805, + "learning_rate": 9.99291865894429e-06, + "loss": 0.6419, + "step": 159 + }, + { + "epoch": 0.07781155015197569, + "grad_norm": 0.0765011036786609, + "learning_rate": 9.992816427127367e-06, + "loss": 0.6309, + "step": 160 + }, + { + "epoch": 0.07829787234042553, + "grad_norm": 0.07604964078473068, + "learning_rate": 9.992713463170613e-06, + "loss": 0.6362, + "step": 161 + }, + { + "epoch": 0.07878419452887538, + "grad_norm": 0.07603084698259067, + "learning_rate": 9.992609767089127e-06, + "loss": 0.6715, + "step": 162 + }, + { + "epoch": 0.07927051671732523, + "grad_norm": 0.07222366232326545, + "learning_rate": 9.992505338898113e-06, + "loss": 0.6433, + "step": 163 + }, + { + "epoch": 0.07975683890577508, + "grad_norm": 0.07294061717851048, + "learning_rate": 9.992400178612882e-06, + "loss": 0.6329, + "step": 164 + }, + { + "epoch": 0.08024316109422493, + "grad_norm": 0.07461766568616675, + "learning_rate": 9.99229428624886e-06, + "loss": 0.6437, + "step": 165 + }, + { + "epoch": 0.08072948328267478, + "grad_norm": 0.07066059778730192, + "learning_rate": 9.99218766182157e-06, + "loss": 0.6429, + "step": 166 + }, + { + "epoch": 0.08121580547112461, + "grad_norm": 0.07932064552397436, + "learning_rate": 9.992080305346652e-06, + "loss": 0.6924, + "step": 167 + }, + { + "epoch": 0.08170212765957446, + "grad_norm": 0.07387041791715146, + "learning_rate": 9.991972216839845e-06, + "loss": 0.6281, + "step": 168 + }, + { + "epoch": 0.08218844984802431, + "grad_norm": 0.07464034222960271, + "learning_rate": 9.991863396317e-06, + "loss": 0.6719, + "step": 169 + }, + { + "epoch": 0.08267477203647416, + "grad_norm": 0.07629524145920062, + "learning_rate": 9.991753843794072e-06, + "loss": 0.6822, + "step": 170 + }, + { + "epoch": 0.08316109422492401, + "grad_norm": 0.0717343177682337, + "learning_rate": 9.991643559287131e-06, + "loss": 0.6253, + "step": 171 + }, + { + "epoch": 0.08364741641337387, + "grad_norm": 0.074456963337064, + "learning_rate": 9.991532542812345e-06, + "loss": 0.6728, + "step": 172 + }, + { + "epoch": 0.0841337386018237, + "grad_norm": 0.08032867027111115, + "learning_rate": 9.991420794385994e-06, + "loss": 0.6283, + "step": 173 + }, + { + "epoch": 0.08462006079027355, + "grad_norm": 0.07605022839286113, + "learning_rate": 9.991308314024466e-06, + "loss": 0.6265, + "step": 174 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 0.07803743225470429, + "learning_rate": 9.99119510174425e-06, + "loss": 0.6516, + "step": 175 + }, + { + "epoch": 0.08559270516717325, + "grad_norm": 0.07566298812696846, + "learning_rate": 9.991081157561955e-06, + "loss": 0.6559, + "step": 176 + }, + { + "epoch": 0.0860790273556231, + "grad_norm": 0.07953039318586384, + "learning_rate": 9.990966481494285e-06, + "loss": 0.6199, + "step": 177 + }, + { + "epoch": 0.08656534954407295, + "grad_norm": 0.0798743101352713, + "learning_rate": 9.990851073558056e-06, + "loss": 0.6279, + "step": 178 + }, + { + "epoch": 0.08705167173252279, + "grad_norm": 0.07549964902318917, + "learning_rate": 9.990734933770192e-06, + "loss": 0.6357, + "step": 179 + }, + { + "epoch": 0.08753799392097264, + "grad_norm": 0.08140632061403627, + "learning_rate": 9.990618062147724e-06, + "loss": 0.6924, + "step": 180 + }, + { + "epoch": 0.08802431610942249, + "grad_norm": 0.07373636899236395, + "learning_rate": 9.99050045870779e-06, + "loss": 0.638, + "step": 181 + }, + { + "epoch": 0.08851063829787234, + "grad_norm": 0.0738017611641229, + "learning_rate": 9.990382123467633e-06, + "loss": 0.6613, + "step": 182 + }, + { + "epoch": 0.08899696048632219, + "grad_norm": 0.071911012138807, + "learning_rate": 9.990263056444607e-06, + "loss": 0.6337, + "step": 183 + }, + { + "epoch": 0.08948328267477204, + "grad_norm": 0.07865641677176818, + "learning_rate": 9.990143257656173e-06, + "loss": 0.6321, + "step": 184 + }, + { + "epoch": 0.08996960486322189, + "grad_norm": 0.07330689404202993, + "learning_rate": 9.990022727119897e-06, + "loss": 0.6563, + "step": 185 + }, + { + "epoch": 0.09045592705167173, + "grad_norm": 0.07041441409005485, + "learning_rate": 9.989901464853454e-06, + "loss": 0.6084, + "step": 186 + }, + { + "epoch": 0.09094224924012158, + "grad_norm": 0.07330065136647741, + "learning_rate": 9.989779470874626e-06, + "loss": 0.6136, + "step": 187 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.0759547086678673, + "learning_rate": 9.9896567452013e-06, + "loss": 0.6564, + "step": 188 + }, + { + "epoch": 0.09191489361702128, + "grad_norm": 0.07164189294623767, + "learning_rate": 9.989533287851472e-06, + "loss": 0.6468, + "step": 189 + }, + { + "epoch": 0.09240121580547113, + "grad_norm": 0.07629127905525687, + "learning_rate": 9.989409098843249e-06, + "loss": 0.6488, + "step": 190 + }, + { + "epoch": 0.09288753799392098, + "grad_norm": 0.07471202058897447, + "learning_rate": 9.98928417819484e-06, + "loss": 0.6249, + "step": 191 + }, + { + "epoch": 0.09337386018237082, + "grad_norm": 0.07296776823910131, + "learning_rate": 9.989158525924562e-06, + "loss": 0.6313, + "step": 192 + }, + { + "epoch": 0.09386018237082067, + "grad_norm": 0.07251456227592262, + "learning_rate": 9.989032142050845e-06, + "loss": 0.6364, + "step": 193 + }, + { + "epoch": 0.09434650455927052, + "grad_norm": 0.07930573859948695, + "learning_rate": 9.988905026592217e-06, + "loss": 0.6566, + "step": 194 + }, + { + "epoch": 0.09483282674772037, + "grad_norm": 0.0726289212935777, + "learning_rate": 9.98877717956732e-06, + "loss": 0.6428, + "step": 195 + }, + { + "epoch": 0.09531914893617022, + "grad_norm": 0.07236931893551049, + "learning_rate": 9.988648600994898e-06, + "loss": 0.6315, + "step": 196 + }, + { + "epoch": 0.09580547112462007, + "grad_norm": 0.07250340091652571, + "learning_rate": 9.988519290893813e-06, + "loss": 0.6322, + "step": 197 + }, + { + "epoch": 0.0962917933130699, + "grad_norm": 0.07269441037382905, + "learning_rate": 9.988389249283019e-06, + "loss": 0.6359, + "step": 198 + }, + { + "epoch": 0.09677811550151975, + "grad_norm": 0.07297038121770004, + "learning_rate": 9.98825847618159e-06, + "loss": 0.6189, + "step": 199 + }, + { + "epoch": 0.0972644376899696, + "grad_norm": 0.07552803910007803, + "learning_rate": 9.9881269716087e-06, + "loss": 0.6287, + "step": 200 + }, + { + "epoch": 0.09775075987841945, + "grad_norm": 0.074487954110049, + "learning_rate": 9.987994735583635e-06, + "loss": 0.6409, + "step": 201 + }, + { + "epoch": 0.0982370820668693, + "grad_norm": 0.07236572318666377, + "learning_rate": 9.987861768125783e-06, + "loss": 0.6327, + "step": 202 + }, + { + "epoch": 0.09872340425531916, + "grad_norm": 0.07322032012079707, + "learning_rate": 9.987728069254645e-06, + "loss": 0.6251, + "step": 203 + }, + { + "epoch": 0.09920972644376899, + "grad_norm": 0.07840698977706183, + "learning_rate": 9.987593638989824e-06, + "loss": 0.6162, + "step": 204 + }, + { + "epoch": 0.09969604863221884, + "grad_norm": 0.0703988397218052, + "learning_rate": 9.987458477351034e-06, + "loss": 0.5898, + "step": 205 + }, + { + "epoch": 0.10018237082066869, + "grad_norm": 0.07562550442182135, + "learning_rate": 9.987322584358095e-06, + "loss": 0.6618, + "step": 206 + }, + { + "epoch": 0.10066869300911854, + "grad_norm": 0.07466455915688629, + "learning_rate": 9.987185960030933e-06, + "loss": 0.6193, + "step": 207 + }, + { + "epoch": 0.1011550151975684, + "grad_norm": 0.07223918433037317, + "learning_rate": 9.987048604389584e-06, + "loss": 0.6221, + "step": 208 + }, + { + "epoch": 0.10164133738601824, + "grad_norm": 0.08110884305727797, + "learning_rate": 9.986910517454188e-06, + "loss": 0.6141, + "step": 209 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 0.07312053516960584, + "learning_rate": 9.986771699244995e-06, + "loss": 0.6015, + "step": 210 + }, + { + "epoch": 0.10261398176291793, + "grad_norm": 0.07426585754655243, + "learning_rate": 9.986632149782362e-06, + "loss": 0.6064, + "step": 211 + }, + { + "epoch": 0.10310030395136778, + "grad_norm": 0.07524094590731174, + "learning_rate": 9.98649186908675e-06, + "loss": 0.6475, + "step": 212 + }, + { + "epoch": 0.10358662613981763, + "grad_norm": 0.07189953499669959, + "learning_rate": 9.98635085717873e-06, + "loss": 0.6157, + "step": 213 + }, + { + "epoch": 0.10407294832826748, + "grad_norm": 0.07289407133976893, + "learning_rate": 9.986209114078982e-06, + "loss": 0.6148, + "step": 214 + }, + { + "epoch": 0.10455927051671733, + "grad_norm": 0.08151355057726764, + "learning_rate": 9.98606663980829e-06, + "loss": 0.6475, + "step": 215 + }, + { + "epoch": 0.10504559270516717, + "grad_norm": 0.07638129409651782, + "learning_rate": 9.985923434387545e-06, + "loss": 0.6439, + "step": 216 + }, + { + "epoch": 0.10553191489361702, + "grad_norm": 0.07633020661462125, + "learning_rate": 9.985779497837748e-06, + "loss": 0.67, + "step": 217 + }, + { + "epoch": 0.10601823708206687, + "grad_norm": 0.07429493171857507, + "learning_rate": 9.985634830180005e-06, + "loss": 0.6345, + "step": 218 + }, + { + "epoch": 0.10650455927051672, + "grad_norm": 0.07294904642499844, + "learning_rate": 9.985489431435528e-06, + "loss": 0.6303, + "step": 219 + }, + { + "epoch": 0.10699088145896657, + "grad_norm": 0.07922924783596189, + "learning_rate": 9.98534330162564e-06, + "loss": 0.6402, + "step": 220 + }, + { + "epoch": 0.10747720364741642, + "grad_norm": 0.0792066395615164, + "learning_rate": 9.985196440771771e-06, + "loss": 0.6673, + "step": 221 + }, + { + "epoch": 0.10796352583586626, + "grad_norm": 0.07417640799704432, + "learning_rate": 9.985048848895454e-06, + "loss": 0.6355, + "step": 222 + }, + { + "epoch": 0.1084498480243161, + "grad_norm": 0.07534901438232722, + "learning_rate": 9.984900526018331e-06, + "loss": 0.6339, + "step": 223 + }, + { + "epoch": 0.10893617021276596, + "grad_norm": 0.07525590620754326, + "learning_rate": 9.984751472162154e-06, + "loss": 0.6754, + "step": 224 + }, + { + "epoch": 0.1094224924012158, + "grad_norm": 0.07585142840936479, + "learning_rate": 9.98460168734878e-06, + "loss": 0.6423, + "step": 225 + }, + { + "epoch": 0.10990881458966566, + "grad_norm": 0.07509447600300756, + "learning_rate": 9.984451171600171e-06, + "loss": 0.6108, + "step": 226 + }, + { + "epoch": 0.11039513677811551, + "grad_norm": 0.07379294315072249, + "learning_rate": 9.9842999249384e-06, + "loss": 0.649, + "step": 227 + }, + { + "epoch": 0.11088145896656534, + "grad_norm": 0.08109799647676486, + "learning_rate": 9.984147947385647e-06, + "loss": 0.6824, + "step": 228 + }, + { + "epoch": 0.1113677811550152, + "grad_norm": 0.08385890406611128, + "learning_rate": 9.983995238964194e-06, + "loss": 0.6624, + "step": 229 + }, + { + "epoch": 0.11185410334346504, + "grad_norm": 0.07371528502111811, + "learning_rate": 9.98384179969644e-06, + "loss": 0.6019, + "step": 230 + }, + { + "epoch": 0.1123404255319149, + "grad_norm": 0.07326480823488515, + "learning_rate": 9.983687629604879e-06, + "loss": 0.6314, + "step": 231 + }, + { + "epoch": 0.11282674772036475, + "grad_norm": 0.07698464930942882, + "learning_rate": 9.98353272871212e-06, + "loss": 0.6957, + "step": 232 + }, + { + "epoch": 0.1133130699088146, + "grad_norm": 0.08105359686436309, + "learning_rate": 9.983377097040879e-06, + "loss": 0.6538, + "step": 233 + }, + { + "epoch": 0.11379939209726443, + "grad_norm": 0.07594073018857872, + "learning_rate": 9.983220734613975e-06, + "loss": 0.6422, + "step": 234 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 0.07639549994262636, + "learning_rate": 9.98306364145434e-06, + "loss": 0.6732, + "step": 235 + }, + { + "epoch": 0.11477203647416413, + "grad_norm": 0.07361286876813115, + "learning_rate": 9.98290581758501e-06, + "loss": 0.6194, + "step": 236 + }, + { + "epoch": 0.11525835866261398, + "grad_norm": 0.0772227742611177, + "learning_rate": 9.982747263029123e-06, + "loss": 0.6436, + "step": 237 + }, + { + "epoch": 0.11574468085106383, + "grad_norm": 0.07973545295674096, + "learning_rate": 9.982587977809934e-06, + "loss": 0.6783, + "step": 238 + }, + { + "epoch": 0.11623100303951368, + "grad_norm": 0.07299524102693468, + "learning_rate": 9.9824279619508e-06, + "loss": 0.608, + "step": 239 + }, + { + "epoch": 0.11671732522796352, + "grad_norm": 0.07447952526538501, + "learning_rate": 9.982267215475186e-06, + "loss": 0.6467, + "step": 240 + }, + { + "epoch": 0.11720364741641337, + "grad_norm": 0.07367224054974836, + "learning_rate": 9.98210573840666e-06, + "loss": 0.6259, + "step": 241 + }, + { + "epoch": 0.11768996960486322, + "grad_norm": 0.08029766455887737, + "learning_rate": 9.981943530768903e-06, + "loss": 0.6468, + "step": 242 + }, + { + "epoch": 0.11817629179331307, + "grad_norm": 0.08044818330872944, + "learning_rate": 9.981780592585702e-06, + "loss": 0.6432, + "step": 243 + }, + { + "epoch": 0.11866261398176292, + "grad_norm": 0.0778126874748524, + "learning_rate": 9.981616923880948e-06, + "loss": 0.6783, + "step": 244 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 0.07142908435344075, + "learning_rate": 9.981452524678641e-06, + "loss": 0.6235, + "step": 245 + }, + { + "epoch": 0.11963525835866261, + "grad_norm": 0.07267064743613215, + "learning_rate": 9.981287395002892e-06, + "loss": 0.5984, + "step": 246 + }, + { + "epoch": 0.12012158054711246, + "grad_norm": 0.07577047506107071, + "learning_rate": 9.981121534877912e-06, + "loss": 0.656, + "step": 247 + }, + { + "epoch": 0.12060790273556231, + "grad_norm": 0.07448927533837468, + "learning_rate": 9.980954944328023e-06, + "loss": 0.614, + "step": 248 + }, + { + "epoch": 0.12109422492401216, + "grad_norm": 0.0739277611575981, + "learning_rate": 9.980787623377654e-06, + "loss": 0.6265, + "step": 249 + }, + { + "epoch": 0.12158054711246201, + "grad_norm": 0.07265875812751259, + "learning_rate": 9.98061957205134e-06, + "loss": 0.6118, + "step": 250 + }, + { + "epoch": 0.12206686930091186, + "grad_norm": 0.07429152601952896, + "learning_rate": 9.980450790373724e-06, + "loss": 0.645, + "step": 251 + }, + { + "epoch": 0.1225531914893617, + "grad_norm": 0.07254255502966724, + "learning_rate": 9.980281278369558e-06, + "loss": 0.6448, + "step": 252 + }, + { + "epoch": 0.12303951367781155, + "grad_norm": 0.0752170871851315, + "learning_rate": 9.980111036063696e-06, + "loss": 0.6514, + "step": 253 + }, + { + "epoch": 0.1235258358662614, + "grad_norm": 0.07374713989231758, + "learning_rate": 9.979940063481105e-06, + "loss": 0.6283, + "step": 254 + }, + { + "epoch": 0.12401215805471125, + "grad_norm": 0.07797996972352954, + "learning_rate": 9.979768360646854e-06, + "loss": 0.687, + "step": 255 + }, + { + "epoch": 0.1244984802431611, + "grad_norm": 0.07532383141165228, + "learning_rate": 9.97959592758612e-06, + "loss": 0.6211, + "step": 256 + }, + { + "epoch": 0.12498480243161095, + "grad_norm": 0.07408087865565535, + "learning_rate": 9.979422764324193e-06, + "loss": 0.635, + "step": 257 + }, + { + "epoch": 0.1254711246200608, + "grad_norm": 0.0751721437373112, + "learning_rate": 9.979248870886463e-06, + "loss": 0.6539, + "step": 258 + }, + { + "epoch": 0.12595744680851065, + "grad_norm": 0.0764488732522613, + "learning_rate": 9.979074247298428e-06, + "loss": 0.6647, + "step": 259 + }, + { + "epoch": 0.1264437689969605, + "grad_norm": 0.07531640440287267, + "learning_rate": 9.978898893585695e-06, + "loss": 0.6734, + "step": 260 + }, + { + "epoch": 0.12693009118541032, + "grad_norm": 0.08489543512024499, + "learning_rate": 9.978722809773979e-06, + "loss": 0.6832, + "step": 261 + }, + { + "epoch": 0.12741641337386017, + "grad_norm": 0.07450921950140398, + "learning_rate": 9.9785459958891e-06, + "loss": 0.5894, + "step": 262 + }, + { + "epoch": 0.12790273556231002, + "grad_norm": 0.07672978323296226, + "learning_rate": 9.978368451956986e-06, + "loss": 0.6666, + "step": 263 + }, + { + "epoch": 0.12838905775075987, + "grad_norm": 0.07009524571111454, + "learning_rate": 9.978190178003672e-06, + "loss": 0.613, + "step": 264 + }, + { + "epoch": 0.12887537993920972, + "grad_norm": 0.07762736841300429, + "learning_rate": 9.9780111740553e-06, + "loss": 0.6976, + "step": 265 + }, + { + "epoch": 0.12936170212765957, + "grad_norm": 0.07306396494022326, + "learning_rate": 9.977831440138117e-06, + "loss": 0.6047, + "step": 266 + }, + { + "epoch": 0.12984802431610942, + "grad_norm": 0.07401168336824537, + "learning_rate": 9.97765097627848e-06, + "loss": 0.6101, + "step": 267 + }, + { + "epoch": 0.13033434650455927, + "grad_norm": 0.071910194215463, + "learning_rate": 9.977469782502853e-06, + "loss": 0.5705, + "step": 268 + }, + { + "epoch": 0.13082066869300912, + "grad_norm": 0.07846985374167931, + "learning_rate": 9.977287858837804e-06, + "loss": 0.6543, + "step": 269 + }, + { + "epoch": 0.13130699088145897, + "grad_norm": 0.07686257706733314, + "learning_rate": 9.977105205310016e-06, + "loss": 0.644, + "step": 270 + }, + { + "epoch": 0.13179331306990882, + "grad_norm": 0.07512913995712098, + "learning_rate": 9.976921821946264e-06, + "loss": 0.6017, + "step": 271 + }, + { + "epoch": 0.13227963525835867, + "grad_norm": 0.07289863724485444, + "learning_rate": 9.976737708773445e-06, + "loss": 0.6178, + "step": 272 + }, + { + "epoch": 0.1327659574468085, + "grad_norm": 0.07965974283291472, + "learning_rate": 9.976552865818555e-06, + "loss": 0.6484, + "step": 273 + }, + { + "epoch": 0.13325227963525835, + "grad_norm": 0.07738280827994552, + "learning_rate": 9.9763672931087e-06, + "loss": 0.6514, + "step": 274 + }, + { + "epoch": 0.1337386018237082, + "grad_norm": 0.0730594725117199, + "learning_rate": 9.976180990671092e-06, + "loss": 0.6142, + "step": 275 + }, + { + "epoch": 0.13422492401215805, + "grad_norm": 0.07474972138959583, + "learning_rate": 9.97599395853305e-06, + "loss": 0.6216, + "step": 276 + }, + { + "epoch": 0.1347112462006079, + "grad_norm": 0.0778031292425587, + "learning_rate": 9.975806196722e-06, + "loss": 0.625, + "step": 277 + }, + { + "epoch": 0.13519756838905775, + "grad_norm": 0.07245982646641651, + "learning_rate": 9.975617705265475e-06, + "loss": 0.6103, + "step": 278 + }, + { + "epoch": 0.1356838905775076, + "grad_norm": 0.07575059026747623, + "learning_rate": 9.975428484191117e-06, + "loss": 0.6164, + "step": 279 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 0.07356708561545493, + "learning_rate": 9.97523853352667e-06, + "loss": 0.6361, + "step": 280 + }, + { + "epoch": 0.1366565349544073, + "grad_norm": 0.07593902953055177, + "learning_rate": 9.97504785329999e-06, + "loss": 0.6294, + "step": 281 + }, + { + "epoch": 0.13714285714285715, + "grad_norm": 0.07033779192369982, + "learning_rate": 9.974856443539036e-06, + "loss": 0.5876, + "step": 282 + }, + { + "epoch": 0.137629179331307, + "grad_norm": 0.07453039463703805, + "learning_rate": 9.974664304271881e-06, + "loss": 0.6153, + "step": 283 + }, + { + "epoch": 0.13811550151975685, + "grad_norm": 0.07486083974906238, + "learning_rate": 9.974471435526694e-06, + "loss": 0.6429, + "step": 284 + }, + { + "epoch": 0.1386018237082067, + "grad_norm": 0.07200964246522952, + "learning_rate": 9.974277837331761e-06, + "loss": 0.6183, + "step": 285 + }, + { + "epoch": 0.13908814589665652, + "grad_norm": 0.07562642361751189, + "learning_rate": 9.974083509715471e-06, + "loss": 0.6278, + "step": 286 + }, + { + "epoch": 0.13957446808510637, + "grad_norm": 0.07867559069806047, + "learning_rate": 9.973888452706317e-06, + "loss": 0.6866, + "step": 287 + }, + { + "epoch": 0.14006079027355622, + "grad_norm": 0.07827332015368069, + "learning_rate": 9.973692666332905e-06, + "loss": 0.6592, + "step": 288 + }, + { + "epoch": 0.14054711246200607, + "grad_norm": 0.07484953929573433, + "learning_rate": 9.973496150623943e-06, + "loss": 0.6283, + "step": 289 + }, + { + "epoch": 0.14103343465045592, + "grad_norm": 0.07330948170161683, + "learning_rate": 9.973298905608248e-06, + "loss": 0.6256, + "step": 290 + }, + { + "epoch": 0.14151975683890577, + "grad_norm": 0.0883851588244503, + "learning_rate": 9.973100931314743e-06, + "loss": 0.665, + "step": 291 + }, + { + "epoch": 0.14200607902735563, + "grad_norm": 0.0729883437724811, + "learning_rate": 9.972902227772461e-06, + "loss": 0.6588, + "step": 292 + }, + { + "epoch": 0.14249240121580548, + "grad_norm": 0.07442876412386121, + "learning_rate": 9.972702795010539e-06, + "loss": 0.6616, + "step": 293 + }, + { + "epoch": 0.14297872340425533, + "grad_norm": 0.07251376051202359, + "learning_rate": 9.97250263305822e-06, + "loss": 0.5975, + "step": 294 + }, + { + "epoch": 0.14346504559270518, + "grad_norm": 0.07431377251626112, + "learning_rate": 9.972301741944856e-06, + "loss": 0.6019, + "step": 295 + }, + { + "epoch": 0.14395136778115503, + "grad_norm": 0.07618992064548367, + "learning_rate": 9.972100121699907e-06, + "loss": 0.6404, + "step": 296 + }, + { + "epoch": 0.14443768996960488, + "grad_norm": 0.07338435809055148, + "learning_rate": 9.971897772352936e-06, + "loss": 0.64, + "step": 297 + }, + { + "epoch": 0.1449240121580547, + "grad_norm": 0.0777590300690969, + "learning_rate": 9.971694693933617e-06, + "loss": 0.6863, + "step": 298 + }, + { + "epoch": 0.14541033434650455, + "grad_norm": 0.07564757932934363, + "learning_rate": 9.971490886471728e-06, + "loss": 0.6282, + "step": 299 + }, + { + "epoch": 0.1458966565349544, + "grad_norm": 0.07584579832665737, + "learning_rate": 9.971286349997155e-06, + "loss": 0.6513, + "step": 300 + }, + { + "epoch": 0.14638297872340425, + "grad_norm": 0.07165551524960252, + "learning_rate": 9.971081084539893e-06, + "loss": 0.6231, + "step": 301 + }, + { + "epoch": 0.1468693009118541, + "grad_norm": 0.07759548163032844, + "learning_rate": 9.97087509013004e-06, + "loss": 0.6762, + "step": 302 + }, + { + "epoch": 0.14735562310030395, + "grad_norm": 0.07583941154315121, + "learning_rate": 9.970668366797802e-06, + "loss": 0.6395, + "step": 303 + }, + { + "epoch": 0.1478419452887538, + "grad_norm": 0.07250091980070991, + "learning_rate": 9.970460914573494e-06, + "loss": 0.6364, + "step": 304 + }, + { + "epoch": 0.14832826747720365, + "grad_norm": 0.07318863031884103, + "learning_rate": 9.970252733487537e-06, + "loss": 0.5936, + "step": 305 + }, + { + "epoch": 0.1488145896656535, + "grad_norm": 0.07446466389801451, + "learning_rate": 9.970043823570457e-06, + "loss": 0.5781, + "step": 306 + }, + { + "epoch": 0.14930091185410335, + "grad_norm": 0.07172664341644777, + "learning_rate": 9.96983418485289e-06, + "loss": 0.5729, + "step": 307 + }, + { + "epoch": 0.1497872340425532, + "grad_norm": 0.0736030378888872, + "learning_rate": 9.969623817365574e-06, + "loss": 0.6508, + "step": 308 + }, + { + "epoch": 0.15027355623100305, + "grad_norm": 0.07465281130369202, + "learning_rate": 9.96941272113936e-06, + "loss": 0.6472, + "step": 309 + }, + { + "epoch": 0.15075987841945288, + "grad_norm": 0.07306605030090156, + "learning_rate": 9.969200896205201e-06, + "loss": 0.6191, + "step": 310 + }, + { + "epoch": 0.15124620060790273, + "grad_norm": 0.08723717562871942, + "learning_rate": 9.96898834259416e-06, + "loss": 0.6658, + "step": 311 + }, + { + "epoch": 0.15173252279635258, + "grad_norm": 0.0791621089206097, + "learning_rate": 9.968775060337406e-06, + "loss": 0.6556, + "step": 312 + }, + { + "epoch": 0.15221884498480243, + "grad_norm": 0.07902589971775417, + "learning_rate": 9.968561049466214e-06, + "loss": 0.6413, + "step": 313 + }, + { + "epoch": 0.15270516717325228, + "grad_norm": 0.07978880316551544, + "learning_rate": 9.968346310011965e-06, + "loss": 0.6541, + "step": 314 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 0.07676081352777822, + "learning_rate": 9.968130842006148e-06, + "loss": 0.6567, + "step": 315 + }, + { + "epoch": 0.15367781155015198, + "grad_norm": 0.07876049793989714, + "learning_rate": 9.967914645480361e-06, + "loss": 0.6701, + "step": 316 + }, + { + "epoch": 0.15416413373860183, + "grad_norm": 0.07272295966553359, + "learning_rate": 9.967697720466306e-06, + "loss": 0.6166, + "step": 317 + }, + { + "epoch": 0.15465045592705168, + "grad_norm": 0.07734962766032809, + "learning_rate": 9.967480066995792e-06, + "loss": 0.6468, + "step": 318 + }, + { + "epoch": 0.15513677811550153, + "grad_norm": 0.07380252667197663, + "learning_rate": 9.967261685100736e-06, + "loss": 0.6317, + "step": 319 + }, + { + "epoch": 0.15562310030395138, + "grad_norm": 0.07379445209764204, + "learning_rate": 9.96704257481316e-06, + "loss": 0.634, + "step": 320 + }, + { + "epoch": 0.15610942249240123, + "grad_norm": 0.07178504237687208, + "learning_rate": 9.966822736165194e-06, + "loss": 0.6452, + "step": 321 + }, + { + "epoch": 0.15659574468085105, + "grad_norm": 0.0752094396672398, + "learning_rate": 9.966602169189077e-06, + "loss": 0.6317, + "step": 322 + }, + { + "epoch": 0.1570820668693009, + "grad_norm": 0.07704411452781729, + "learning_rate": 9.966380873917152e-06, + "loss": 0.6329, + "step": 323 + }, + { + "epoch": 0.15756838905775075, + "grad_norm": 0.07331666896990816, + "learning_rate": 9.966158850381868e-06, + "loss": 0.6213, + "step": 324 + }, + { + "epoch": 0.1580547112462006, + "grad_norm": 0.07323834901564434, + "learning_rate": 9.965936098615783e-06, + "loss": 0.6463, + "step": 325 + }, + { + "epoch": 0.15854103343465045, + "grad_norm": 0.07473530428529629, + "learning_rate": 9.965712618651561e-06, + "loss": 0.6077, + "step": 326 + }, + { + "epoch": 0.1590273556231003, + "grad_norm": 0.07017372721610983, + "learning_rate": 9.965488410521974e-06, + "loss": 0.5694, + "step": 327 + }, + { + "epoch": 0.15951367781155015, + "grad_norm": 0.07326794013189401, + "learning_rate": 9.965263474259896e-06, + "loss": 0.6197, + "step": 328 + }, + { + "epoch": 0.16, + "grad_norm": 0.07924480238840713, + "learning_rate": 9.965037809898316e-06, + "loss": 0.6612, + "step": 329 + }, + { + "epoch": 0.16048632218844985, + "grad_norm": 0.0724181569720144, + "learning_rate": 9.964811417470322e-06, + "loss": 0.6315, + "step": 330 + }, + { + "epoch": 0.1609726443768997, + "grad_norm": 0.07592111290791996, + "learning_rate": 9.964584297009112e-06, + "loss": 0.6799, + "step": 331 + }, + { + "epoch": 0.16145896656534955, + "grad_norm": 0.07594391968708164, + "learning_rate": 9.964356448547993e-06, + "loss": 0.6398, + "step": 332 + }, + { + "epoch": 0.1619452887537994, + "grad_norm": 0.07364414977888648, + "learning_rate": 9.964127872120375e-06, + "loss": 0.6375, + "step": 333 + }, + { + "epoch": 0.16243161094224923, + "grad_norm": 0.07421290629411958, + "learning_rate": 9.963898567759775e-06, + "loss": 0.6112, + "step": 334 + }, + { + "epoch": 0.16291793313069908, + "grad_norm": 0.07357497539724447, + "learning_rate": 9.96366853549982e-06, + "loss": 0.5918, + "step": 335 + }, + { + "epoch": 0.16340425531914893, + "grad_norm": 0.07243767024457398, + "learning_rate": 9.96343777537424e-06, + "loss": 0.622, + "step": 336 + }, + { + "epoch": 0.16389057750759878, + "grad_norm": 0.07453095932083066, + "learning_rate": 9.963206287416873e-06, + "loss": 0.6059, + "step": 337 + }, + { + "epoch": 0.16437689969604863, + "grad_norm": 0.07504766468119894, + "learning_rate": 9.962974071661664e-06, + "loss": 0.6742, + "step": 338 + }, + { + "epoch": 0.16486322188449848, + "grad_norm": 0.07040383572997289, + "learning_rate": 9.962741128142667e-06, + "loss": 0.6272, + "step": 339 + }, + { + "epoch": 0.16534954407294833, + "grad_norm": 0.07413113117809093, + "learning_rate": 9.96250745689404e-06, + "loss": 0.6242, + "step": 340 + }, + { + "epoch": 0.16583586626139818, + "grad_norm": 0.07334220952792778, + "learning_rate": 9.962273057950048e-06, + "loss": 0.6274, + "step": 341 + }, + { + "epoch": 0.16632218844984803, + "grad_norm": 0.0764970593500182, + "learning_rate": 9.962037931345058e-06, + "loss": 0.6349, + "step": 342 + }, + { + "epoch": 0.16680851063829788, + "grad_norm": 0.07483289748593701, + "learning_rate": 9.961802077113558e-06, + "loss": 0.6278, + "step": 343 + }, + { + "epoch": 0.16729483282674773, + "grad_norm": 0.0740205554768208, + "learning_rate": 9.961565495290126e-06, + "loss": 0.6234, + "step": 344 + }, + { + "epoch": 0.16778115501519758, + "grad_norm": 0.07378161086081035, + "learning_rate": 9.961328185909457e-06, + "loss": 0.6375, + "step": 345 + }, + { + "epoch": 0.1682674772036474, + "grad_norm": 0.07309496134712835, + "learning_rate": 9.96109014900635e-06, + "loss": 0.6371, + "step": 346 + }, + { + "epoch": 0.16875379939209725, + "grad_norm": 0.07340947211478033, + "learning_rate": 9.960851384615709e-06, + "loss": 0.6111, + "step": 347 + }, + { + "epoch": 0.1692401215805471, + "grad_norm": 0.07139157389599032, + "learning_rate": 9.960611892772544e-06, + "loss": 0.5984, + "step": 348 + }, + { + "epoch": 0.16972644376899695, + "grad_norm": 0.07516349399841989, + "learning_rate": 9.96037167351198e-06, + "loss": 0.5988, + "step": 349 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 0.07508077902165791, + "learning_rate": 9.960130726869237e-06, + "loss": 0.6306, + "step": 350 + }, + { + "epoch": 0.17069908814589665, + "grad_norm": 0.07321463915401381, + "learning_rate": 9.95988905287965e-06, + "loss": 0.6211, + "step": 351 + }, + { + "epoch": 0.1711854103343465, + "grad_norm": 0.07091922946685138, + "learning_rate": 9.959646651578656e-06, + "loss": 0.586, + "step": 352 + }, + { + "epoch": 0.17167173252279636, + "grad_norm": 0.07584891592007169, + "learning_rate": 9.959403523001801e-06, + "loss": 0.6152, + "step": 353 + }, + { + "epoch": 0.1721580547112462, + "grad_norm": 0.07788514013608414, + "learning_rate": 9.959159667184736e-06, + "loss": 0.6885, + "step": 354 + }, + { + "epoch": 0.17264437689969606, + "grad_norm": 0.08209778358702638, + "learning_rate": 9.958915084163223e-06, + "loss": 0.653, + "step": 355 + }, + { + "epoch": 0.1731306990881459, + "grad_norm": 0.07362893687536431, + "learning_rate": 9.958669773973124e-06, + "loss": 0.6366, + "step": 356 + }, + { + "epoch": 0.17361702127659576, + "grad_norm": 0.07938618815926102, + "learning_rate": 9.958423736650413e-06, + "loss": 0.6793, + "step": 357 + }, + { + "epoch": 0.17410334346504558, + "grad_norm": 0.07580898641050113, + "learning_rate": 9.958176972231166e-06, + "loss": 0.6328, + "step": 358 + }, + { + "epoch": 0.17458966565349543, + "grad_norm": 0.08175037924395943, + "learning_rate": 9.957929480751572e-06, + "loss": 0.6227, + "step": 359 + }, + { + "epoch": 0.17507598784194528, + "grad_norm": 0.0745173085056933, + "learning_rate": 9.957681262247918e-06, + "loss": 0.6038, + "step": 360 + }, + { + "epoch": 0.17556231003039513, + "grad_norm": 0.07749405203334765, + "learning_rate": 9.957432316756608e-06, + "loss": 0.6364, + "step": 361 + }, + { + "epoch": 0.17604863221884498, + "grad_norm": 0.0752943992721238, + "learning_rate": 9.957182644314144e-06, + "loss": 0.6234, + "step": 362 + }, + { + "epoch": 0.17653495440729483, + "grad_norm": 0.07489723293994147, + "learning_rate": 9.956932244957135e-06, + "loss": 0.6365, + "step": 363 + }, + { + "epoch": 0.17702127659574468, + "grad_norm": 0.07175150599048309, + "learning_rate": 9.956681118722302e-06, + "loss": 0.5957, + "step": 364 + }, + { + "epoch": 0.17750759878419453, + "grad_norm": 0.0832717686492948, + "learning_rate": 9.956429265646472e-06, + "loss": 0.6561, + "step": 365 + }, + { + "epoch": 0.17799392097264438, + "grad_norm": 0.07631890515577754, + "learning_rate": 9.956176685766574e-06, + "loss": 0.6732, + "step": 366 + }, + { + "epoch": 0.17848024316109423, + "grad_norm": 0.07452110620795337, + "learning_rate": 9.955923379119645e-06, + "loss": 0.6213, + "step": 367 + }, + { + "epoch": 0.17896656534954408, + "grad_norm": 0.080836270562088, + "learning_rate": 9.95566934574283e-06, + "loss": 0.6714, + "step": 368 + }, + { + "epoch": 0.17945288753799393, + "grad_norm": 0.07765305202486368, + "learning_rate": 9.955414585673384e-06, + "loss": 0.6251, + "step": 369 + }, + { + "epoch": 0.17993920972644378, + "grad_norm": 0.07681719880195695, + "learning_rate": 9.95515909894866e-06, + "loss": 0.6231, + "step": 370 + }, + { + "epoch": 0.1804255319148936, + "grad_norm": 0.07595508436229585, + "learning_rate": 9.954902885606122e-06, + "loss": 0.5958, + "step": 371 + }, + { + "epoch": 0.18091185410334346, + "grad_norm": 0.07572195314562842, + "learning_rate": 9.954645945683343e-06, + "loss": 0.6324, + "step": 372 + }, + { + "epoch": 0.1813981762917933, + "grad_norm": 0.07564765453982031, + "learning_rate": 9.954388279218002e-06, + "loss": 0.6233, + "step": 373 + }, + { + "epoch": 0.18188449848024316, + "grad_norm": 0.07647251819232564, + "learning_rate": 9.954129886247879e-06, + "loss": 0.6288, + "step": 374 + }, + { + "epoch": 0.182370820668693, + "grad_norm": 0.07815265755808583, + "learning_rate": 9.953870766810864e-06, + "loss": 0.6542, + "step": 375 + }, + { + "epoch": 0.18285714285714286, + "grad_norm": 0.07550430163230877, + "learning_rate": 9.953610920944959e-06, + "loss": 0.5907, + "step": 376 + }, + { + "epoch": 0.1833434650455927, + "grad_norm": 0.07752316334718226, + "learning_rate": 9.953350348688264e-06, + "loss": 0.6129, + "step": 377 + }, + { + "epoch": 0.18382978723404256, + "grad_norm": 0.08002741566262664, + "learning_rate": 9.953089050078988e-06, + "loss": 0.6672, + "step": 378 + }, + { + "epoch": 0.1843161094224924, + "grad_norm": 0.08118664207341818, + "learning_rate": 9.95282702515545e-06, + "loss": 0.6599, + "step": 379 + }, + { + "epoch": 0.18480243161094226, + "grad_norm": 0.07268067047244009, + "learning_rate": 9.952564273956071e-06, + "loss": 0.5924, + "step": 380 + }, + { + "epoch": 0.1852887537993921, + "grad_norm": 0.07525568137843315, + "learning_rate": 9.952300796519383e-06, + "loss": 0.6691, + "step": 381 + }, + { + "epoch": 0.18577507598784196, + "grad_norm": 0.07736204276080864, + "learning_rate": 9.952036592884019e-06, + "loss": 0.6351, + "step": 382 + }, + { + "epoch": 0.18626139817629178, + "grad_norm": 0.07469426185229484, + "learning_rate": 9.951771663088724e-06, + "loss": 0.6133, + "step": 383 + }, + { + "epoch": 0.18674772036474163, + "grad_norm": 0.07725666077306236, + "learning_rate": 9.951506007172344e-06, + "loss": 0.6022, + "step": 384 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 0.07980306064310377, + "learning_rate": 9.951239625173836e-06, + "loss": 0.6085, + "step": 385 + }, + { + "epoch": 0.18772036474164133, + "grad_norm": 0.07177225906938704, + "learning_rate": 9.950972517132263e-06, + "loss": 0.5814, + "step": 386 + }, + { + "epoch": 0.18820668693009118, + "grad_norm": 0.07503775177335276, + "learning_rate": 9.950704683086793e-06, + "loss": 0.6442, + "step": 387 + }, + { + "epoch": 0.18869300911854103, + "grad_norm": 0.07822645307237987, + "learning_rate": 9.950436123076698e-06, + "loss": 0.6291, + "step": 388 + }, + { + "epoch": 0.18917933130699088, + "grad_norm": 0.07701312265398665, + "learning_rate": 9.950166837141365e-06, + "loss": 0.6184, + "step": 389 + }, + { + "epoch": 0.18966565349544073, + "grad_norm": 0.07381968422799055, + "learning_rate": 9.949896825320276e-06, + "loss": 0.5961, + "step": 390 + }, + { + "epoch": 0.19015197568389058, + "grad_norm": 0.0774285794317031, + "learning_rate": 9.949626087653026e-06, + "loss": 0.6119, + "step": 391 + }, + { + "epoch": 0.19063829787234043, + "grad_norm": 0.07533357959360332, + "learning_rate": 9.94935462417932e-06, + "loss": 0.6065, + "step": 392 + }, + { + "epoch": 0.19112462006079028, + "grad_norm": 0.07592544655627194, + "learning_rate": 9.949082434938959e-06, + "loss": 0.6157, + "step": 393 + }, + { + "epoch": 0.19161094224924013, + "grad_norm": 0.0773325236029274, + "learning_rate": 9.948809519971861e-06, + "loss": 0.6163, + "step": 394 + }, + { + "epoch": 0.19209726443768996, + "grad_norm": 0.07457669854173453, + "learning_rate": 9.948535879318044e-06, + "loss": 0.619, + "step": 395 + }, + { + "epoch": 0.1925835866261398, + "grad_norm": 0.07455267687335572, + "learning_rate": 9.948261513017637e-06, + "loss": 0.6436, + "step": 396 + }, + { + "epoch": 0.19306990881458966, + "grad_norm": 0.07720189485645515, + "learning_rate": 9.947986421110867e-06, + "loss": 0.619, + "step": 397 + }, + { + "epoch": 0.1935562310030395, + "grad_norm": 0.07787967933710195, + "learning_rate": 9.947710603638078e-06, + "loss": 0.6344, + "step": 398 + }, + { + "epoch": 0.19404255319148936, + "grad_norm": 0.07666831965447489, + "learning_rate": 9.947434060639714e-06, + "loss": 0.6248, + "step": 399 + }, + { + "epoch": 0.1945288753799392, + "grad_norm": 0.07360172238376061, + "learning_rate": 9.947156792156325e-06, + "loss": 0.5773, + "step": 400 + }, + { + "epoch": 0.19501519756838906, + "grad_norm": 0.07847475862029958, + "learning_rate": 9.946878798228573e-06, + "loss": 0.6313, + "step": 401 + }, + { + "epoch": 0.1955015197568389, + "grad_norm": 0.07431338152085773, + "learning_rate": 9.94660007889722e-06, + "loss": 0.6185, + "step": 402 + }, + { + "epoch": 0.19598784194528876, + "grad_norm": 0.07909828376252435, + "learning_rate": 9.946320634203139e-06, + "loss": 0.6773, + "step": 403 + }, + { + "epoch": 0.1964741641337386, + "grad_norm": 0.07695983804208197, + "learning_rate": 9.946040464187305e-06, + "loss": 0.5862, + "step": 404 + }, + { + "epoch": 0.19696048632218846, + "grad_norm": 0.07552860793015936, + "learning_rate": 9.945759568890804e-06, + "loss": 0.6257, + "step": 405 + }, + { + "epoch": 0.1974468085106383, + "grad_norm": 0.07263370651060398, + "learning_rate": 9.945477948354825e-06, + "loss": 0.5863, + "step": 406 + }, + { + "epoch": 0.19793313069908813, + "grad_norm": 0.07281159174481305, + "learning_rate": 9.945195602620663e-06, + "loss": 0.6087, + "step": 407 + }, + { + "epoch": 0.19841945288753798, + "grad_norm": 0.0736120933522763, + "learning_rate": 9.944912531729723e-06, + "loss": 0.6471, + "step": 408 + }, + { + "epoch": 0.19890577507598783, + "grad_norm": 0.07740214011476758, + "learning_rate": 9.944628735723514e-06, + "loss": 0.6542, + "step": 409 + }, + { + "epoch": 0.19939209726443768, + "grad_norm": 0.10666995775093237, + "learning_rate": 9.94434421464365e-06, + "loss": 0.6314, + "step": 410 + }, + { + "epoch": 0.19987841945288753, + "grad_norm": 0.0772072886483773, + "learning_rate": 9.944058968531855e-06, + "loss": 0.6079, + "step": 411 + }, + { + "epoch": 0.20036474164133738, + "grad_norm": 0.07130391118148917, + "learning_rate": 9.943772997429955e-06, + "loss": 0.5873, + "step": 412 + }, + { + "epoch": 0.20085106382978724, + "grad_norm": 0.07450831706219276, + "learning_rate": 9.943486301379885e-06, + "loss": 0.6337, + "step": 413 + }, + { + "epoch": 0.20133738601823709, + "grad_norm": 0.07702137581975181, + "learning_rate": 9.943198880423685e-06, + "loss": 0.6352, + "step": 414 + }, + { + "epoch": 0.20182370820668694, + "grad_norm": 0.0758668538802447, + "learning_rate": 9.942910734603505e-06, + "loss": 0.6167, + "step": 415 + }, + { + "epoch": 0.2023100303951368, + "grad_norm": 0.0756601791447037, + "learning_rate": 9.942621863961595e-06, + "loss": 0.642, + "step": 416 + }, + { + "epoch": 0.20279635258358664, + "grad_norm": 0.0754402938917895, + "learning_rate": 9.942332268540316e-06, + "loss": 0.6528, + "step": 417 + }, + { + "epoch": 0.2032826747720365, + "grad_norm": 0.07689886002839368, + "learning_rate": 9.942041948382133e-06, + "loss": 0.6201, + "step": 418 + }, + { + "epoch": 0.2037689969604863, + "grad_norm": 0.07646160106063425, + "learning_rate": 9.94175090352962e-06, + "loss": 0.6412, + "step": 419 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 0.07369601649884883, + "learning_rate": 9.941459134025455e-06, + "loss": 0.6255, + "step": 420 + }, + { + "epoch": 0.204741641337386, + "grad_norm": 0.0849247857802085, + "learning_rate": 9.94116663991242e-06, + "loss": 0.5797, + "step": 421 + }, + { + "epoch": 0.20522796352583586, + "grad_norm": 0.07169202414610462, + "learning_rate": 9.94087342123341e-06, + "loss": 0.5798, + "step": 422 + }, + { + "epoch": 0.2057142857142857, + "grad_norm": 0.07568098279764568, + "learning_rate": 9.940579478031418e-06, + "loss": 0.6395, + "step": 423 + }, + { + "epoch": 0.20620060790273556, + "grad_norm": 0.07518919387235251, + "learning_rate": 9.94028481034955e-06, + "loss": 0.594, + "step": 424 + }, + { + "epoch": 0.2066869300911854, + "grad_norm": 0.07397919814811302, + "learning_rate": 9.939989418231015e-06, + "loss": 0.6176, + "step": 425 + }, + { + "epoch": 0.20717325227963526, + "grad_norm": 0.07639873863361582, + "learning_rate": 9.939693301719131e-06, + "loss": 0.6406, + "step": 426 + }, + { + "epoch": 0.2076595744680851, + "grad_norm": 0.0782517394883419, + "learning_rate": 9.939396460857317e-06, + "loss": 0.6164, + "step": 427 + }, + { + "epoch": 0.20814589665653496, + "grad_norm": 0.07326346144393303, + "learning_rate": 9.939098895689104e-06, + "loss": 0.615, + "step": 428 + }, + { + "epoch": 0.2086322188449848, + "grad_norm": 0.07073338269850502, + "learning_rate": 9.938800606258122e-06, + "loss": 0.5874, + "step": 429 + }, + { + "epoch": 0.20911854103343466, + "grad_norm": 0.07994201273381059, + "learning_rate": 9.938501592608117e-06, + "loss": 0.6279, + "step": 430 + }, + { + "epoch": 0.20960486322188449, + "grad_norm": 0.07591241484669652, + "learning_rate": 9.938201854782935e-06, + "loss": 0.6592, + "step": 431 + }, + { + "epoch": 0.21009118541033434, + "grad_norm": 0.07391522831462577, + "learning_rate": 9.937901392826525e-06, + "loss": 0.6484, + "step": 432 + }, + { + "epoch": 0.21057750759878419, + "grad_norm": 0.07396763630117018, + "learning_rate": 9.937600206782951e-06, + "loss": 0.5993, + "step": 433 + }, + { + "epoch": 0.21106382978723404, + "grad_norm": 0.07971136014753553, + "learning_rate": 9.937298296696377e-06, + "loss": 0.6065, + "step": 434 + }, + { + "epoch": 0.2115501519756839, + "grad_norm": 0.07469907654962667, + "learning_rate": 9.936995662611074e-06, + "loss": 0.6189, + "step": 435 + }, + { + "epoch": 0.21203647416413374, + "grad_norm": 0.07345585801523219, + "learning_rate": 9.93669230457142e-06, + "loss": 0.5837, + "step": 436 + }, + { + "epoch": 0.2125227963525836, + "grad_norm": 0.07446872719945817, + "learning_rate": 9.9363882226219e-06, + "loss": 0.6417, + "step": 437 + }, + { + "epoch": 0.21300911854103344, + "grad_norm": 0.07651414373055855, + "learning_rate": 9.936083416807103e-06, + "loss": 0.6093, + "step": 438 + }, + { + "epoch": 0.2134954407294833, + "grad_norm": 0.0756813213374065, + "learning_rate": 9.935777887171727e-06, + "loss": 0.6256, + "step": 439 + }, + { + "epoch": 0.21398176291793314, + "grad_norm": 0.07227490250178426, + "learning_rate": 9.935471633760572e-06, + "loss": 0.631, + "step": 440 + }, + { + "epoch": 0.214468085106383, + "grad_norm": 0.07389776343485273, + "learning_rate": 9.93516465661855e-06, + "loss": 0.631, + "step": 441 + }, + { + "epoch": 0.21495440729483284, + "grad_norm": 0.07734100943928443, + "learning_rate": 9.934856955790672e-06, + "loss": 0.6372, + "step": 442 + }, + { + "epoch": 0.21544072948328266, + "grad_norm": 0.07752921853682646, + "learning_rate": 9.934548531322061e-06, + "loss": 0.5906, + "step": 443 + }, + { + "epoch": 0.2159270516717325, + "grad_norm": 0.07817168205190837, + "learning_rate": 9.934239383257942e-06, + "loss": 0.6299, + "step": 444 + }, + { + "epoch": 0.21641337386018236, + "grad_norm": 0.07389085118763335, + "learning_rate": 9.933929511643651e-06, + "loss": 0.5723, + "step": 445 + }, + { + "epoch": 0.2168996960486322, + "grad_norm": 0.08895902667132614, + "learning_rate": 9.933618916524625e-06, + "loss": 0.6295, + "step": 446 + }, + { + "epoch": 0.21738601823708206, + "grad_norm": 0.07838298322165756, + "learning_rate": 9.93330759794641e-06, + "loss": 0.6487, + "step": 447 + }, + { + "epoch": 0.2178723404255319, + "grad_norm": 0.07474744779378172, + "learning_rate": 9.932995555954657e-06, + "loss": 0.6241, + "step": 448 + }, + { + "epoch": 0.21835866261398176, + "grad_norm": 0.07604580436929075, + "learning_rate": 9.932682790595123e-06, + "loss": 0.6357, + "step": 449 + }, + { + "epoch": 0.2188449848024316, + "grad_norm": 0.0733846681325185, + "learning_rate": 9.932369301913673e-06, + "loss": 0.6372, + "step": 450 + }, + { + "epoch": 0.21933130699088146, + "grad_norm": 0.07248628370045519, + "learning_rate": 9.932055089956276e-06, + "loss": 0.6373, + "step": 451 + }, + { + "epoch": 0.21981762917933131, + "grad_norm": 0.07143990946931077, + "learning_rate": 9.931740154769008e-06, + "loss": 0.6073, + "step": 452 + }, + { + "epoch": 0.22030395136778116, + "grad_norm": 0.07684842487595032, + "learning_rate": 9.931424496398048e-06, + "loss": 0.6538, + "step": 453 + }, + { + "epoch": 0.22079027355623101, + "grad_norm": 0.07721539704589185, + "learning_rate": 9.931108114889685e-06, + "loss": 0.649, + "step": 454 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 0.07074242481837646, + "learning_rate": 9.930791010290316e-06, + "loss": 0.5966, + "step": 455 + }, + { + "epoch": 0.2217629179331307, + "grad_norm": 0.07372620377435624, + "learning_rate": 9.930473182646436e-06, + "loss": 0.6437, + "step": 456 + }, + { + "epoch": 0.22224924012158054, + "grad_norm": 0.0743048136038808, + "learning_rate": 9.930154632004654e-06, + "loss": 0.6296, + "step": 457 + }, + { + "epoch": 0.2227355623100304, + "grad_norm": 0.07792617987197321, + "learning_rate": 9.929835358411682e-06, + "loss": 0.6355, + "step": 458 + }, + { + "epoch": 0.22322188449848024, + "grad_norm": 0.0708828431490397, + "learning_rate": 9.929515361914335e-06, + "loss": 0.6078, + "step": 459 + }, + { + "epoch": 0.2237082066869301, + "grad_norm": 0.07737049787688959, + "learning_rate": 9.929194642559538e-06, + "loss": 0.6528, + "step": 460 + }, + { + "epoch": 0.22419452887537994, + "grad_norm": 0.07349583464643533, + "learning_rate": 9.928873200394323e-06, + "loss": 0.6118, + "step": 461 + }, + { + "epoch": 0.2246808510638298, + "grad_norm": 0.11868280258548222, + "learning_rate": 9.928551035465823e-06, + "loss": 0.6588, + "step": 462 + }, + { + "epoch": 0.22516717325227964, + "grad_norm": 0.07721198493465202, + "learning_rate": 9.928228147821282e-06, + "loss": 0.6343, + "step": 463 + }, + { + "epoch": 0.2256534954407295, + "grad_norm": 0.07615577084672769, + "learning_rate": 9.927904537508046e-06, + "loss": 0.6219, + "step": 464 + }, + { + "epoch": 0.22613981762917934, + "grad_norm": 0.071620193767772, + "learning_rate": 9.927580204573571e-06, + "loss": 0.5889, + "step": 465 + }, + { + "epoch": 0.2266261398176292, + "grad_norm": 0.07235223427194966, + "learning_rate": 9.927255149065413e-06, + "loss": 0.6038, + "step": 466 + }, + { + "epoch": 0.22711246200607904, + "grad_norm": 0.07742966226846906, + "learning_rate": 9.926929371031242e-06, + "loss": 0.6021, + "step": 467 + }, + { + "epoch": 0.22759878419452886, + "grad_norm": 0.08084185006551577, + "learning_rate": 9.926602870518826e-06, + "loss": 0.6356, + "step": 468 + }, + { + "epoch": 0.22808510638297871, + "grad_norm": 0.07687582330497636, + "learning_rate": 9.926275647576046e-06, + "loss": 0.6036, + "step": 469 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.07280657764875939, + "learning_rate": 9.925947702250884e-06, + "loss": 0.5918, + "step": 470 + }, + { + "epoch": 0.22905775075987841, + "grad_norm": 0.07516592158582709, + "learning_rate": 9.925619034591429e-06, + "loss": 0.6178, + "step": 471 + }, + { + "epoch": 0.22954407294832826, + "grad_norm": 0.07421193750711921, + "learning_rate": 9.925289644645876e-06, + "loss": 0.6076, + "step": 472 + }, + { + "epoch": 0.23003039513677812, + "grad_norm": 0.07901459575606913, + "learning_rate": 9.924959532462527e-06, + "loss": 0.6179, + "step": 473 + }, + { + "epoch": 0.23051671732522797, + "grad_norm": 0.07984168188458464, + "learning_rate": 9.92462869808979e-06, + "loss": 0.6374, + "step": 474 + }, + { + "epoch": 0.23100303951367782, + "grad_norm": 0.07129044878012378, + "learning_rate": 9.924297141576176e-06, + "loss": 0.5536, + "step": 475 + }, + { + "epoch": 0.23148936170212767, + "grad_norm": 0.07598139936236821, + "learning_rate": 9.923964862970306e-06, + "loss": 0.5784, + "step": 476 + }, + { + "epoch": 0.23197568389057752, + "grad_norm": 0.07613946279554822, + "learning_rate": 9.923631862320907e-06, + "loss": 0.5868, + "step": 477 + }, + { + "epoch": 0.23246200607902737, + "grad_norm": 0.07115161589929749, + "learning_rate": 9.923298139676802e-06, + "loss": 0.6012, + "step": 478 + }, + { + "epoch": 0.23294832826747722, + "grad_norm": 0.0791874289284463, + "learning_rate": 9.922963695086936e-06, + "loss": 0.6136, + "step": 479 + }, + { + "epoch": 0.23343465045592704, + "grad_norm": 0.07507642632721559, + "learning_rate": 9.922628528600347e-06, + "loss": 0.6279, + "step": 480 + }, + { + "epoch": 0.2339209726443769, + "grad_norm": 0.07748937754994555, + "learning_rate": 9.922292640266184e-06, + "loss": 0.6444, + "step": 481 + }, + { + "epoch": 0.23440729483282674, + "grad_norm": 0.07759085664135958, + "learning_rate": 9.9219560301337e-06, + "loss": 0.6458, + "step": 482 + }, + { + "epoch": 0.2348936170212766, + "grad_norm": 0.07377761390411647, + "learning_rate": 9.92161869825226e-06, + "loss": 0.6138, + "step": 483 + }, + { + "epoch": 0.23537993920972644, + "grad_norm": 0.07420455297957712, + "learning_rate": 9.921280644671324e-06, + "loss": 0.6211, + "step": 484 + }, + { + "epoch": 0.2358662613981763, + "grad_norm": 0.07991178658536623, + "learning_rate": 9.92094186944047e-06, + "loss": 0.6124, + "step": 485 + }, + { + "epoch": 0.23635258358662614, + "grad_norm": 0.07470449885186195, + "learning_rate": 9.92060237260937e-06, + "loss": 0.6157, + "step": 486 + }, + { + "epoch": 0.236838905775076, + "grad_norm": 0.07626616445129225, + "learning_rate": 9.920262154227806e-06, + "loss": 0.5957, + "step": 487 + }, + { + "epoch": 0.23732522796352584, + "grad_norm": 0.07678478362923138, + "learning_rate": 9.919921214345674e-06, + "loss": 0.587, + "step": 488 + }, + { + "epoch": 0.2378115501519757, + "grad_norm": 0.07462321190667279, + "learning_rate": 9.919579553012964e-06, + "loss": 0.6327, + "step": 489 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 0.07309292679627498, + "learning_rate": 9.919237170279778e-06, + "loss": 0.5835, + "step": 490 + }, + { + "epoch": 0.2387841945288754, + "grad_norm": 0.07457806463730678, + "learning_rate": 9.918894066196322e-06, + "loss": 0.611, + "step": 491 + }, + { + "epoch": 0.23927051671732522, + "grad_norm": 0.07576722141103294, + "learning_rate": 9.918550240812912e-06, + "loss": 0.6072, + "step": 492 + }, + { + "epoch": 0.23975683890577507, + "grad_norm": 0.07661148430761854, + "learning_rate": 9.918205694179961e-06, + "loss": 0.6413, + "step": 493 + }, + { + "epoch": 0.24024316109422492, + "grad_norm": 0.07283782084317245, + "learning_rate": 9.917860426347994e-06, + "loss": 0.6072, + "step": 494 + }, + { + "epoch": 0.24072948328267477, + "grad_norm": 0.07429240232374179, + "learning_rate": 9.917514437367644e-06, + "loss": 0.641, + "step": 495 + }, + { + "epoch": 0.24121580547112462, + "grad_norm": 0.07120995408021623, + "learning_rate": 9.917167727289641e-06, + "loss": 0.5972, + "step": 496 + }, + { + "epoch": 0.24170212765957447, + "grad_norm": 0.07517749617028277, + "learning_rate": 9.91682029616483e-06, + "loss": 0.6137, + "step": 497 + }, + { + "epoch": 0.24218844984802432, + "grad_norm": 0.07923127533130168, + "learning_rate": 9.916472144044157e-06, + "loss": 0.644, + "step": 498 + }, + { + "epoch": 0.24267477203647417, + "grad_norm": 0.07387849925847506, + "learning_rate": 9.916123270978673e-06, + "loss": 0.6565, + "step": 499 + }, + { + "epoch": 0.24316109422492402, + "grad_norm": 0.07292179430812065, + "learning_rate": 9.91577367701954e-06, + "loss": 0.611, + "step": 500 + }, + { + "epoch": 0.24364741641337387, + "grad_norm": 0.0772507722278035, + "learning_rate": 9.915423362218017e-06, + "loss": 0.6446, + "step": 501 + }, + { + "epoch": 0.24413373860182372, + "grad_norm": 0.07710857117115859, + "learning_rate": 9.915072326625479e-06, + "loss": 0.6379, + "step": 502 + }, + { + "epoch": 0.24462006079027357, + "grad_norm": 0.07285698280642044, + "learning_rate": 9.914720570293397e-06, + "loss": 0.6026, + "step": 503 + }, + { + "epoch": 0.2451063829787234, + "grad_norm": 0.0733862563471539, + "learning_rate": 9.914368093273354e-06, + "loss": 0.6331, + "step": 504 + }, + { + "epoch": 0.24559270516717324, + "grad_norm": 0.07420637988829044, + "learning_rate": 9.914014895617036e-06, + "loss": 0.5941, + "step": 505 + }, + { + "epoch": 0.2460790273556231, + "grad_norm": 0.07923092784020519, + "learning_rate": 9.913660977376236e-06, + "loss": 0.6114, + "step": 506 + }, + { + "epoch": 0.24656534954407294, + "grad_norm": 0.07548113044308191, + "learning_rate": 9.913306338602852e-06, + "loss": 0.6332, + "step": 507 + }, + { + "epoch": 0.2470516717325228, + "grad_norm": 0.07568186968690875, + "learning_rate": 9.912950979348889e-06, + "loss": 0.6095, + "step": 508 + }, + { + "epoch": 0.24753799392097264, + "grad_norm": 0.073329676390455, + "learning_rate": 9.912594899666454e-06, + "loss": 0.5964, + "step": 509 + }, + { + "epoch": 0.2480243161094225, + "grad_norm": 0.07530844381021684, + "learning_rate": 9.912238099607763e-06, + "loss": 0.6348, + "step": 510 + }, + { + "epoch": 0.24851063829787234, + "grad_norm": 0.0735368360110718, + "learning_rate": 9.911880579225137e-06, + "loss": 0.6123, + "step": 511 + }, + { + "epoch": 0.2489969604863222, + "grad_norm": 0.07401497675772126, + "learning_rate": 9.911522338571002e-06, + "loss": 0.6319, + "step": 512 + }, + { + "epoch": 0.24948328267477204, + "grad_norm": 0.07598525902317663, + "learning_rate": 9.911163377697891e-06, + "loss": 0.5908, + "step": 513 + }, + { + "epoch": 0.2499696048632219, + "grad_norm": 0.07988537951661231, + "learning_rate": 9.91080369665844e-06, + "loss": 0.6227, + "step": 514 + }, + { + "epoch": 0.2499696048632219, + "eval_loss": 0.6214485168457031, + "eval_runtime": 105.2182, + "eval_samples_per_second": 288.477, + "eval_steps_per_second": 36.068, + "step": 514 + }, + { + "epoch": 0.25045592705167175, + "grad_norm": 0.07589060902724236, + "learning_rate": 9.910443295505392e-06, + "loss": 0.6145, + "step": 515 + }, + { + "epoch": 0.2509422492401216, + "grad_norm": 0.07288991412090616, + "learning_rate": 9.910082174291597e-06, + "loss": 0.6072, + "step": 516 + }, + { + "epoch": 0.25142857142857145, + "grad_norm": 0.07414715332393881, + "learning_rate": 9.90972033307001e-06, + "loss": 0.6459, + "step": 517 + }, + { + "epoch": 0.2519148936170213, + "grad_norm": 0.0761473577886796, + "learning_rate": 9.909357771893689e-06, + "loss": 0.6707, + "step": 518 + }, + { + "epoch": 0.25240121580547115, + "grad_norm": 0.08455929630962569, + "learning_rate": 9.908994490815799e-06, + "loss": 0.6258, + "step": 519 + }, + { + "epoch": 0.252887537993921, + "grad_norm": 0.07407118772256081, + "learning_rate": 9.908630489889615e-06, + "loss": 0.5906, + "step": 520 + }, + { + "epoch": 0.25337386018237085, + "grad_norm": 0.07521990216742436, + "learning_rate": 9.908265769168507e-06, + "loss": 0.5908, + "step": 521 + }, + { + "epoch": 0.25386018237082064, + "grad_norm": 0.07679672666813263, + "learning_rate": 9.907900328705965e-06, + "loss": 0.5779, + "step": 522 + }, + { + "epoch": 0.2543465045592705, + "grad_norm": 0.07671000448785356, + "learning_rate": 9.90753416855557e-06, + "loss": 0.6177, + "step": 523 + }, + { + "epoch": 0.25483282674772034, + "grad_norm": 0.07626573478005942, + "learning_rate": 9.90716728877102e-06, + "loss": 0.6124, + "step": 524 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 0.07958550152849972, + "learning_rate": 9.90679968940611e-06, + "loss": 0.6455, + "step": 525 + }, + { + "epoch": 0.25580547112462004, + "grad_norm": 0.07869148910618602, + "learning_rate": 9.906431370514746e-06, + "loss": 0.6503, + "step": 526 + }, + { + "epoch": 0.2562917933130699, + "grad_norm": 0.07442998191091328, + "learning_rate": 9.906062332150939e-06, + "loss": 0.6244, + "step": 527 + }, + { + "epoch": 0.25677811550151974, + "grad_norm": 0.07322069741610936, + "learning_rate": 9.905692574368802e-06, + "loss": 0.6143, + "step": 528 + }, + { + "epoch": 0.2572644376899696, + "grad_norm": 0.07243230323425069, + "learning_rate": 9.905322097222557e-06, + "loss": 0.577, + "step": 529 + }, + { + "epoch": 0.25775075987841944, + "grad_norm": 0.07392644886548709, + "learning_rate": 9.90495090076653e-06, + "loss": 0.6173, + "step": 530 + }, + { + "epoch": 0.2582370820668693, + "grad_norm": 0.07326043331226169, + "learning_rate": 9.904578985055151e-06, + "loss": 0.593, + "step": 531 + }, + { + "epoch": 0.25872340425531914, + "grad_norm": 0.0761866844421115, + "learning_rate": 9.904206350142962e-06, + "loss": 0.6146, + "step": 532 + }, + { + "epoch": 0.259209726443769, + "grad_norm": 0.07466453648142758, + "learning_rate": 9.9038329960846e-06, + "loss": 0.615, + "step": 533 + }, + { + "epoch": 0.25969604863221885, + "grad_norm": 0.07758069788648553, + "learning_rate": 9.903458922934819e-06, + "loss": 0.6165, + "step": 534 + }, + { + "epoch": 0.2601823708206687, + "grad_norm": 0.08130805265534356, + "learning_rate": 9.903084130748468e-06, + "loss": 0.6514, + "step": 535 + }, + { + "epoch": 0.26066869300911855, + "grad_norm": 0.07513827656136587, + "learning_rate": 9.902708619580507e-06, + "loss": 0.6419, + "step": 536 + }, + { + "epoch": 0.2611550151975684, + "grad_norm": 0.07554053453656473, + "learning_rate": 9.902332389486001e-06, + "loss": 0.5757, + "step": 537 + }, + { + "epoch": 0.26164133738601825, + "grad_norm": 0.07682878866688235, + "learning_rate": 9.901955440520121e-06, + "loss": 0.6296, + "step": 538 + }, + { + "epoch": 0.2621276595744681, + "grad_norm": 0.08162414706107891, + "learning_rate": 9.90157777273814e-06, + "loss": 0.6256, + "step": 539 + }, + { + "epoch": 0.26261398176291795, + "grad_norm": 0.07775975078261581, + "learning_rate": 9.90119938619544e-06, + "loss": 0.61, + "step": 540 + }, + { + "epoch": 0.2631003039513678, + "grad_norm": 0.07824306209995144, + "learning_rate": 9.900820280947505e-06, + "loss": 0.5721, + "step": 541 + }, + { + "epoch": 0.26358662613981765, + "grad_norm": 0.07368803248818379, + "learning_rate": 9.90044045704993e-06, + "loss": 0.6295, + "step": 542 + }, + { + "epoch": 0.2640729483282675, + "grad_norm": 0.07725198361916966, + "learning_rate": 9.90005991455841e-06, + "loss": 0.6373, + "step": 543 + }, + { + "epoch": 0.26455927051671735, + "grad_norm": 0.07847348883834761, + "learning_rate": 9.899678653528747e-06, + "loss": 0.6027, + "step": 544 + }, + { + "epoch": 0.2650455927051672, + "grad_norm": 0.07229624668472394, + "learning_rate": 9.89929667401685e-06, + "loss": 0.611, + "step": 545 + }, + { + "epoch": 0.265531914893617, + "grad_norm": 0.08032354201405056, + "learning_rate": 9.89891397607873e-06, + "loss": 0.6233, + "step": 546 + }, + { + "epoch": 0.26601823708206684, + "grad_norm": 0.07699928199935681, + "learning_rate": 9.898530559770508e-06, + "loss": 0.6367, + "step": 547 + }, + { + "epoch": 0.2665045592705167, + "grad_norm": 0.07244592165765863, + "learning_rate": 9.898146425148403e-06, + "loss": 0.6087, + "step": 548 + }, + { + "epoch": 0.26699088145896654, + "grad_norm": 0.07303833129817308, + "learning_rate": 9.897761572268748e-06, + "loss": 0.6248, + "step": 549 + }, + { + "epoch": 0.2674772036474164, + "grad_norm": 0.07648930853409155, + "learning_rate": 9.897376001187978e-06, + "loss": 0.6348, + "step": 550 + }, + { + "epoch": 0.26796352583586625, + "grad_norm": 0.07250421486915064, + "learning_rate": 9.896989711962627e-06, + "loss": 0.5822, + "step": 551 + }, + { + "epoch": 0.2684498480243161, + "grad_norm": 0.07669573361463654, + "learning_rate": 9.896602704649348e-06, + "loss": 0.6578, + "step": 552 + }, + { + "epoch": 0.26893617021276595, + "grad_norm": 0.0803366526048726, + "learning_rate": 9.896214979304884e-06, + "loss": 0.6492, + "step": 553 + }, + { + "epoch": 0.2694224924012158, + "grad_norm": 0.07700448398210802, + "learning_rate": 9.895826535986095e-06, + "loss": 0.607, + "step": 554 + }, + { + "epoch": 0.26990881458966565, + "grad_norm": 0.0776266991713559, + "learning_rate": 9.89543737474994e-06, + "loss": 0.5945, + "step": 555 + }, + { + "epoch": 0.2703951367781155, + "grad_norm": 0.07522103980996976, + "learning_rate": 9.895047495653485e-06, + "loss": 0.6153, + "step": 556 + }, + { + "epoch": 0.27088145896656535, + "grad_norm": 0.07743620531989164, + "learning_rate": 9.894656898753902e-06, + "loss": 0.574, + "step": 557 + }, + { + "epoch": 0.2713677811550152, + "grad_norm": 0.07522427661033675, + "learning_rate": 9.894265584108466e-06, + "loss": 0.616, + "step": 558 + }, + { + "epoch": 0.27185410334346505, + "grad_norm": 0.13632008039363205, + "learning_rate": 9.893873551774561e-06, + "loss": 0.6109, + "step": 559 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 0.08010155018404212, + "learning_rate": 9.893480801809675e-06, + "loss": 0.6318, + "step": 560 + }, + { + "epoch": 0.27282674772036475, + "grad_norm": 0.0749363948780222, + "learning_rate": 9.893087334271398e-06, + "loss": 0.5767, + "step": 561 + }, + { + "epoch": 0.2733130699088146, + "grad_norm": 0.07482152043920853, + "learning_rate": 9.892693149217427e-06, + "loss": 0.6156, + "step": 562 + }, + { + "epoch": 0.27379939209726445, + "grad_norm": 0.07919591176340275, + "learning_rate": 9.892298246705566e-06, + "loss": 0.6151, + "step": 563 + }, + { + "epoch": 0.2742857142857143, + "grad_norm": 0.07795060723844054, + "learning_rate": 9.891902626793723e-06, + "loss": 0.609, + "step": 564 + }, + { + "epoch": 0.27477203647416415, + "grad_norm": 0.07210354659172886, + "learning_rate": 9.891506289539912e-06, + "loss": 0.6128, + "step": 565 + }, + { + "epoch": 0.275258358662614, + "grad_norm": 0.07715300121517131, + "learning_rate": 9.891109235002248e-06, + "loss": 0.646, + "step": 566 + }, + { + "epoch": 0.27574468085106385, + "grad_norm": 0.07794141151981211, + "learning_rate": 9.89071146323896e-06, + "loss": 0.6232, + "step": 567 + }, + { + "epoch": 0.2762310030395137, + "grad_norm": 0.081645702794136, + "learning_rate": 9.89031297430837e-06, + "loss": 0.6232, + "step": 568 + }, + { + "epoch": 0.27671732522796355, + "grad_norm": 0.08178305842758182, + "learning_rate": 9.889913768268918e-06, + "loss": 0.6722, + "step": 569 + }, + { + "epoch": 0.2772036474164134, + "grad_norm": 0.07605924407860792, + "learning_rate": 9.88951384517914e-06, + "loss": 0.6155, + "step": 570 + }, + { + "epoch": 0.2776899696048632, + "grad_norm": 0.07480987856310256, + "learning_rate": 9.889113205097682e-06, + "loss": 0.5984, + "step": 571 + }, + { + "epoch": 0.27817629179331305, + "grad_norm": 0.07844896251101492, + "learning_rate": 9.88871184808329e-06, + "loss": 0.5923, + "step": 572 + }, + { + "epoch": 0.2786626139817629, + "grad_norm": 0.0763243386187881, + "learning_rate": 9.888309774194822e-06, + "loss": 0.6339, + "step": 573 + }, + { + "epoch": 0.27914893617021275, + "grad_norm": 0.07146612633011887, + "learning_rate": 9.887906983491236e-06, + "loss": 0.5898, + "step": 574 + }, + { + "epoch": 0.2796352583586626, + "grad_norm": 0.07463219701665877, + "learning_rate": 9.887503476031594e-06, + "loss": 0.6014, + "step": 575 + }, + { + "epoch": 0.28012158054711245, + "grad_norm": 0.0774337690825949, + "learning_rate": 9.887099251875072e-06, + "loss": 0.6336, + "step": 576 + }, + { + "epoch": 0.2806079027355623, + "grad_norm": 0.07622175835329276, + "learning_rate": 9.88669431108094e-06, + "loss": 0.5896, + "step": 577 + }, + { + "epoch": 0.28109422492401215, + "grad_norm": 0.07417235299022149, + "learning_rate": 9.886288653708578e-06, + "loss": 0.5898, + "step": 578 + }, + { + "epoch": 0.281580547112462, + "grad_norm": 0.0750556864122582, + "learning_rate": 9.885882279817473e-06, + "loss": 0.5994, + "step": 579 + }, + { + "epoch": 0.28206686930091185, + "grad_norm": 0.07259439228436972, + "learning_rate": 9.885475189467217e-06, + "loss": 0.6015, + "step": 580 + }, + { + "epoch": 0.2825531914893617, + "grad_norm": 0.08096810389461838, + "learning_rate": 9.885067382717501e-06, + "loss": 0.6303, + "step": 581 + }, + { + "epoch": 0.28303951367781155, + "grad_norm": 0.07656747826915329, + "learning_rate": 9.884658859628126e-06, + "loss": 0.6235, + "step": 582 + }, + { + "epoch": 0.2835258358662614, + "grad_norm": 0.07407093804321452, + "learning_rate": 9.884249620259e-06, + "loss": 0.6306, + "step": 583 + }, + { + "epoch": 0.28401215805471125, + "grad_norm": 0.07678458194671911, + "learning_rate": 9.88383966467013e-06, + "loss": 0.6341, + "step": 584 + }, + { + "epoch": 0.2844984802431611, + "grad_norm": 0.07517273995768671, + "learning_rate": 9.883428992921634e-06, + "loss": 0.6475, + "step": 585 + }, + { + "epoch": 0.28498480243161095, + "grad_norm": 0.07516108205639849, + "learning_rate": 9.88301760507373e-06, + "loss": 0.6351, + "step": 586 + }, + { + "epoch": 0.2854711246200608, + "grad_norm": 0.0741107173160987, + "learning_rate": 9.882605501186747e-06, + "loss": 0.6257, + "step": 587 + }, + { + "epoch": 0.28595744680851065, + "grad_norm": 0.07873028174145927, + "learning_rate": 9.88219268132111e-06, + "loss": 0.5696, + "step": 588 + }, + { + "epoch": 0.2864437689969605, + "grad_norm": 0.07537660815824522, + "learning_rate": 9.881779145537359e-06, + "loss": 0.5996, + "step": 589 + }, + { + "epoch": 0.28693009118541035, + "grad_norm": 0.07513490085084712, + "learning_rate": 9.88136489389613e-06, + "loss": 0.6275, + "step": 590 + }, + { + "epoch": 0.2874164133738602, + "grad_norm": 0.08066569161946668, + "learning_rate": 9.880949926458174e-06, + "loss": 0.6684, + "step": 591 + }, + { + "epoch": 0.28790273556231005, + "grad_norm": 0.07709164494735121, + "learning_rate": 9.880534243284338e-06, + "loss": 0.6149, + "step": 592 + }, + { + "epoch": 0.2883890577507599, + "grad_norm": 0.07594899490206673, + "learning_rate": 9.880117844435575e-06, + "loss": 0.6177, + "step": 593 + }, + { + "epoch": 0.28887537993920975, + "grad_norm": 0.07299231382190172, + "learning_rate": 9.87970072997295e-06, + "loss": 0.6134, + "step": 594 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 0.07763823662793731, + "learning_rate": 9.879282899957625e-06, + "loss": 0.5842, + "step": 595 + }, + { + "epoch": 0.2898480243161094, + "grad_norm": 0.0766311164247948, + "learning_rate": 9.87886435445087e-06, + "loss": 0.6314, + "step": 596 + }, + { + "epoch": 0.29033434650455925, + "grad_norm": 0.07984107461223952, + "learning_rate": 9.87844509351406e-06, + "loss": 0.6881, + "step": 597 + }, + { + "epoch": 0.2908206686930091, + "grad_norm": 0.07558194427065122, + "learning_rate": 9.878025117208676e-06, + "loss": 0.6352, + "step": 598 + }, + { + "epoch": 0.29130699088145895, + "grad_norm": 0.07728297115017416, + "learning_rate": 9.877604425596303e-06, + "loss": 0.6013, + "step": 599 + }, + { + "epoch": 0.2917933130699088, + "grad_norm": 0.07840027810999112, + "learning_rate": 9.87718301873863e-06, + "loss": 0.5728, + "step": 600 + }, + { + "epoch": 0.29227963525835865, + "grad_norm": 0.07722122568644645, + "learning_rate": 9.87676089669745e-06, + "loss": 0.6368, + "step": 601 + }, + { + "epoch": 0.2927659574468085, + "grad_norm": 0.07632554633533033, + "learning_rate": 9.876338059534664e-06, + "loss": 0.607, + "step": 602 + }, + { + "epoch": 0.29325227963525835, + "grad_norm": 0.07529935937597487, + "learning_rate": 9.875914507312277e-06, + "loss": 0.6576, + "step": 603 + }, + { + "epoch": 0.2937386018237082, + "grad_norm": 0.07460223062666418, + "learning_rate": 9.875490240092397e-06, + "loss": 0.5874, + "step": 604 + }, + { + "epoch": 0.29422492401215805, + "grad_norm": 0.07879593078852981, + "learning_rate": 9.875065257937237e-06, + "loss": 0.5934, + "step": 605 + }, + { + "epoch": 0.2947112462006079, + "grad_norm": 0.0715437718862713, + "learning_rate": 9.874639560909118e-06, + "loss": 0.5779, + "step": 606 + }, + { + "epoch": 0.29519756838905775, + "grad_norm": 0.07273058756189188, + "learning_rate": 9.874213149070463e-06, + "loss": 0.6027, + "step": 607 + }, + { + "epoch": 0.2956838905775076, + "grad_norm": 0.07662064449090256, + "learning_rate": 9.8737860224838e-06, + "loss": 0.593, + "step": 608 + }, + { + "epoch": 0.29617021276595745, + "grad_norm": 0.07741820641354102, + "learning_rate": 9.873358181211762e-06, + "loss": 0.6126, + "step": 609 + }, + { + "epoch": 0.2966565349544073, + "grad_norm": 0.07765359047523991, + "learning_rate": 9.872929625317087e-06, + "loss": 0.6162, + "step": 610 + }, + { + "epoch": 0.29714285714285715, + "grad_norm": 0.0833328990770357, + "learning_rate": 9.872500354862618e-06, + "loss": 0.6631, + "step": 611 + }, + { + "epoch": 0.297629179331307, + "grad_norm": 0.07808821812865127, + "learning_rate": 9.872070369911304e-06, + "loss": 0.6232, + "step": 612 + }, + { + "epoch": 0.29811550151975685, + "grad_norm": 0.07697593909903068, + "learning_rate": 9.871639670526194e-06, + "loss": 0.6565, + "step": 613 + }, + { + "epoch": 0.2986018237082067, + "grad_norm": 0.07370264019066085, + "learning_rate": 9.87120825677045e-06, + "loss": 0.6321, + "step": 614 + }, + { + "epoch": 0.29908814589665655, + "grad_norm": 0.0727886652645723, + "learning_rate": 9.87077612870733e-06, + "loss": 0.6015, + "step": 615 + }, + { + "epoch": 0.2995744680851064, + "grad_norm": 0.07542069863314463, + "learning_rate": 9.870343286400202e-06, + "loss": 0.6391, + "step": 616 + }, + { + "epoch": 0.30006079027355625, + "grad_norm": 0.07684140526385554, + "learning_rate": 9.86990972991254e-06, + "loss": 0.6068, + "step": 617 + }, + { + "epoch": 0.3005471124620061, + "grad_norm": 0.07621891194931185, + "learning_rate": 9.869475459307913e-06, + "loss": 0.5929, + "step": 618 + }, + { + "epoch": 0.3010334346504559, + "grad_norm": 0.07179766415423608, + "learning_rate": 9.86904047465001e-06, + "loss": 0.573, + "step": 619 + }, + { + "epoch": 0.30151975683890575, + "grad_norm": 0.0792872907161523, + "learning_rate": 9.868604776002612e-06, + "loss": 0.6523, + "step": 620 + }, + { + "epoch": 0.3020060790273556, + "grad_norm": 0.07495051908455809, + "learning_rate": 9.86816836342961e-06, + "loss": 0.6315, + "step": 621 + }, + { + "epoch": 0.30249240121580545, + "grad_norm": 0.07820825446959971, + "learning_rate": 9.867731236995e-06, + "loss": 0.5941, + "step": 622 + }, + { + "epoch": 0.3029787234042553, + "grad_norm": 0.07796673011558322, + "learning_rate": 9.86729339676288e-06, + "loss": 0.6144, + "step": 623 + }, + { + "epoch": 0.30346504559270515, + "grad_norm": 0.07361026745986436, + "learning_rate": 9.866854842797455e-06, + "loss": 0.5673, + "step": 624 + }, + { + "epoch": 0.303951367781155, + "grad_norm": 0.08289100643421991, + "learning_rate": 9.866415575163036e-06, + "loss": 0.6789, + "step": 625 + }, + { + "epoch": 0.30443768996960485, + "grad_norm": 0.07471444334785458, + "learning_rate": 9.865975593924032e-06, + "loss": 0.6125, + "step": 626 + }, + { + "epoch": 0.3049240121580547, + "grad_norm": 0.07592829255837753, + "learning_rate": 9.865534899144966e-06, + "loss": 0.5848, + "step": 627 + }, + { + "epoch": 0.30541033434650455, + "grad_norm": 0.07500904571242861, + "learning_rate": 9.865093490890457e-06, + "loss": 0.5963, + "step": 628 + }, + { + "epoch": 0.3058966565349544, + "grad_norm": 0.0789002146024155, + "learning_rate": 9.864651369225236e-06, + "loss": 0.6528, + "step": 629 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 0.07392833124910866, + "learning_rate": 9.864208534214132e-06, + "loss": 0.6176, + "step": 630 + }, + { + "epoch": 0.3068693009118541, + "grad_norm": 0.0766961339559058, + "learning_rate": 9.863764985922083e-06, + "loss": 0.6003, + "step": 631 + }, + { + "epoch": 0.30735562310030395, + "grad_norm": 0.07549114504172938, + "learning_rate": 9.863320724414134e-06, + "loss": 0.5885, + "step": 632 + }, + { + "epoch": 0.3078419452887538, + "grad_norm": 0.0771367870397442, + "learning_rate": 9.862875749755425e-06, + "loss": 0.629, + "step": 633 + }, + { + "epoch": 0.30832826747720365, + "grad_norm": 0.07757058569579986, + "learning_rate": 9.862430062011209e-06, + "loss": 0.6253, + "step": 634 + }, + { + "epoch": 0.3088145896656535, + "grad_norm": 0.07791342697788971, + "learning_rate": 9.861983661246841e-06, + "loss": 0.6214, + "step": 635 + }, + { + "epoch": 0.30930091185410336, + "grad_norm": 0.07927633000808611, + "learning_rate": 9.86153654752778e-06, + "loss": 0.6678, + "step": 636 + }, + { + "epoch": 0.3097872340425532, + "grad_norm": 0.07715047905975243, + "learning_rate": 9.861088720919592e-06, + "loss": 0.6275, + "step": 637 + }, + { + "epoch": 0.31027355623100306, + "grad_norm": 0.07598115195008412, + "learning_rate": 9.860640181487942e-06, + "loss": 0.6028, + "step": 638 + }, + { + "epoch": 0.3107598784194529, + "grad_norm": 0.07706302997100106, + "learning_rate": 9.860190929298607e-06, + "loss": 0.6185, + "step": 639 + }, + { + "epoch": 0.31124620060790276, + "grad_norm": 0.07784783744635577, + "learning_rate": 9.859740964417464e-06, + "loss": 0.6024, + "step": 640 + }, + { + "epoch": 0.3117325227963526, + "grad_norm": 0.07591354484928346, + "learning_rate": 9.859290286910495e-06, + "loss": 0.6042, + "step": 641 + }, + { + "epoch": 0.31221884498480246, + "grad_norm": 0.07446271844355028, + "learning_rate": 9.858838896843785e-06, + "loss": 0.5915, + "step": 642 + }, + { + "epoch": 0.31270516717325225, + "grad_norm": 0.07481824884425073, + "learning_rate": 9.858386794283527e-06, + "loss": 0.6225, + "step": 643 + }, + { + "epoch": 0.3131914893617021, + "grad_norm": 0.07856629947318088, + "learning_rate": 9.857933979296017e-06, + "loss": 0.6073, + "step": 644 + }, + { + "epoch": 0.31367781155015195, + "grad_norm": 0.07753303860073824, + "learning_rate": 9.857480451947653e-06, + "loss": 0.65, + "step": 645 + }, + { + "epoch": 0.3141641337386018, + "grad_norm": 0.07819017808697808, + "learning_rate": 9.857026212304942e-06, + "loss": 0.6123, + "step": 646 + }, + { + "epoch": 0.31465045592705165, + "grad_norm": 0.07495690460091109, + "learning_rate": 9.856571260434492e-06, + "loss": 0.6027, + "step": 647 + }, + { + "epoch": 0.3151367781155015, + "grad_norm": 0.07460249544332014, + "learning_rate": 9.856115596403016e-06, + "loss": 0.586, + "step": 648 + }, + { + "epoch": 0.31562310030395135, + "grad_norm": 0.07784276518518962, + "learning_rate": 9.855659220277334e-06, + "loss": 0.5718, + "step": 649 + }, + { + "epoch": 0.3161094224924012, + "grad_norm": 0.07485943767475445, + "learning_rate": 9.855202132124367e-06, + "loss": 0.6109, + "step": 650 + }, + { + "epoch": 0.31659574468085105, + "grad_norm": 0.08020793741394526, + "learning_rate": 9.85474433201114e-06, + "loss": 0.6094, + "step": 651 + }, + { + "epoch": 0.3170820668693009, + "grad_norm": 0.0774710633397956, + "learning_rate": 9.854285820004787e-06, + "loss": 0.5885, + "step": 652 + }, + { + "epoch": 0.31756838905775076, + "grad_norm": 0.08517051225467598, + "learning_rate": 9.853826596172542e-06, + "loss": 0.6504, + "step": 653 + }, + { + "epoch": 0.3180547112462006, + "grad_norm": 0.07496279642075002, + "learning_rate": 9.853366660581747e-06, + "loss": 0.629, + "step": 654 + }, + { + "epoch": 0.31854103343465046, + "grad_norm": 0.07733450380865818, + "learning_rate": 9.852906013299844e-06, + "loss": 0.6442, + "step": 655 + }, + { + "epoch": 0.3190273556231003, + "grad_norm": 0.07960430891285124, + "learning_rate": 9.852444654394381e-06, + "loss": 0.5794, + "step": 656 + }, + { + "epoch": 0.31951367781155016, + "grad_norm": 0.08247442292541365, + "learning_rate": 9.851982583933015e-06, + "loss": 0.6755, + "step": 657 + }, + { + "epoch": 0.32, + "grad_norm": 0.07906928316752099, + "learning_rate": 9.8515198019835e-06, + "loss": 0.6016, + "step": 658 + }, + { + "epoch": 0.32048632218844986, + "grad_norm": 0.07279415448761366, + "learning_rate": 9.851056308613699e-06, + "loss": 0.6201, + "step": 659 + }, + { + "epoch": 0.3209726443768997, + "grad_norm": 0.07821441013586179, + "learning_rate": 9.850592103891578e-06, + "loss": 0.6593, + "step": 660 + }, + { + "epoch": 0.32145896656534956, + "grad_norm": 0.07651420351781575, + "learning_rate": 9.850127187885206e-06, + "loss": 0.6042, + "step": 661 + }, + { + "epoch": 0.3219452887537994, + "grad_norm": 0.07898612474588815, + "learning_rate": 9.84966156066276e-06, + "loss": 0.6313, + "step": 662 + }, + { + "epoch": 0.32243161094224926, + "grad_norm": 0.07681441100755516, + "learning_rate": 9.849195222292516e-06, + "loss": 0.6382, + "step": 663 + }, + { + "epoch": 0.3229179331306991, + "grad_norm": 0.08195586140848041, + "learning_rate": 9.84872817284286e-06, + "loss": 0.6158, + "step": 664 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 0.07864083344934501, + "learning_rate": 9.848260412382279e-06, + "loss": 0.6507, + "step": 665 + }, + { + "epoch": 0.3238905775075988, + "grad_norm": 0.07889338788677215, + "learning_rate": 9.847791940979363e-06, + "loss": 0.6016, + "step": 666 + }, + { + "epoch": 0.32437689969604866, + "grad_norm": 0.0796539823708227, + "learning_rate": 9.847322758702812e-06, + "loss": 0.6211, + "step": 667 + }, + { + "epoch": 0.32486322188449845, + "grad_norm": 0.07486164930445696, + "learning_rate": 9.846852865621418e-06, + "loss": 0.6195, + "step": 668 + }, + { + "epoch": 0.3253495440729483, + "grad_norm": 0.07344230603374521, + "learning_rate": 9.846382261804095e-06, + "loss": 0.5837, + "step": 669 + }, + { + "epoch": 0.32583586626139815, + "grad_norm": 0.0761890307252086, + "learning_rate": 9.845910947319848e-06, + "loss": 0.5642, + "step": 670 + }, + { + "epoch": 0.326322188449848, + "grad_norm": 0.07368529390910186, + "learning_rate": 9.845438922237787e-06, + "loss": 0.6246, + "step": 671 + }, + { + "epoch": 0.32680851063829786, + "grad_norm": 0.07474690893701941, + "learning_rate": 9.844966186627134e-06, + "loss": 0.6071, + "step": 672 + }, + { + "epoch": 0.3272948328267477, + "grad_norm": 0.07488169201570247, + "learning_rate": 9.844492740557206e-06, + "loss": 0.5959, + "step": 673 + }, + { + "epoch": 0.32778115501519756, + "grad_norm": 0.07734039064837983, + "learning_rate": 9.84401858409743e-06, + "loss": 0.6404, + "step": 674 + }, + { + "epoch": 0.3282674772036474, + "grad_norm": 0.07457934442579399, + "learning_rate": 9.843543717317338e-06, + "loss": 0.571, + "step": 675 + }, + { + "epoch": 0.32875379939209726, + "grad_norm": 0.07382180274743218, + "learning_rate": 9.843068140286562e-06, + "loss": 0.5834, + "step": 676 + }, + { + "epoch": 0.3292401215805471, + "grad_norm": 0.07242396945111616, + "learning_rate": 9.842591853074838e-06, + "loss": 0.597, + "step": 677 + }, + { + "epoch": 0.32972644376899696, + "grad_norm": 0.07702264661601613, + "learning_rate": 9.842114855752013e-06, + "loss": 0.6258, + "step": 678 + }, + { + "epoch": 0.3302127659574468, + "grad_norm": 0.08038052018472745, + "learning_rate": 9.841637148388028e-06, + "loss": 0.6348, + "step": 679 + }, + { + "epoch": 0.33069908814589666, + "grad_norm": 0.07527820360896588, + "learning_rate": 9.841158731052937e-06, + "loss": 0.6303, + "step": 680 + }, + { + "epoch": 0.3311854103343465, + "grad_norm": 0.07944841316419674, + "learning_rate": 9.840679603816892e-06, + "loss": 0.6924, + "step": 681 + }, + { + "epoch": 0.33167173252279636, + "grad_norm": 0.07541956585873169, + "learning_rate": 9.840199766750153e-06, + "loss": 0.6139, + "step": 682 + }, + { + "epoch": 0.3321580547112462, + "grad_norm": 0.08391678283781674, + "learning_rate": 9.839719219923082e-06, + "loss": 0.6003, + "step": 683 + }, + { + "epoch": 0.33264437689969606, + "grad_norm": 0.0759802269594239, + "learning_rate": 9.839237963406147e-06, + "loss": 0.6052, + "step": 684 + }, + { + "epoch": 0.3331306990881459, + "grad_norm": 0.07769340011449526, + "learning_rate": 9.838755997269917e-06, + "loss": 0.6029, + "step": 685 + }, + { + "epoch": 0.33361702127659576, + "grad_norm": 0.07699460805415395, + "learning_rate": 9.838273321585067e-06, + "loss": 0.6133, + "step": 686 + }, + { + "epoch": 0.3341033434650456, + "grad_norm": 0.08324795617970843, + "learning_rate": 9.837789936422378e-06, + "loss": 0.6769, + "step": 687 + }, + { + "epoch": 0.33458966565349546, + "grad_norm": 0.08194637481266749, + "learning_rate": 9.837305841852731e-06, + "loss": 0.5749, + "step": 688 + }, + { + "epoch": 0.3350759878419453, + "grad_norm": 0.07811188647141626, + "learning_rate": 9.836821037947113e-06, + "loss": 0.6044, + "step": 689 + }, + { + "epoch": 0.33556231003039516, + "grad_norm": 0.08997965828782913, + "learning_rate": 9.836335524776616e-06, + "loss": 0.7448, + "step": 690 + }, + { + "epoch": 0.336048632218845, + "grad_norm": 0.08292868046021122, + "learning_rate": 9.835849302412435e-06, + "loss": 0.5858, + "step": 691 + }, + { + "epoch": 0.3365349544072948, + "grad_norm": 0.08069143975469402, + "learning_rate": 9.835362370925868e-06, + "loss": 0.6057, + "step": 692 + }, + { + "epoch": 0.33702127659574466, + "grad_norm": 0.07584607394712571, + "learning_rate": 9.83487473038832e-06, + "loss": 0.557, + "step": 693 + }, + { + "epoch": 0.3375075987841945, + "grad_norm": 0.07567055955041796, + "learning_rate": 9.834386380871294e-06, + "loss": 0.5572, + "step": 694 + }, + { + "epoch": 0.33799392097264436, + "grad_norm": 0.07297629979505112, + "learning_rate": 9.833897322446404e-06, + "loss": 0.6044, + "step": 695 + }, + { + "epoch": 0.3384802431610942, + "grad_norm": 0.07891151794248634, + "learning_rate": 9.833407555185366e-06, + "loss": 0.6468, + "step": 696 + }, + { + "epoch": 0.33896656534954406, + "grad_norm": 0.07795895679859437, + "learning_rate": 9.832917079159994e-06, + "loss": 0.6162, + "step": 697 + }, + { + "epoch": 0.3394528875379939, + "grad_norm": 0.07137876301000494, + "learning_rate": 9.832425894442217e-06, + "loss": 0.5521, + "step": 698 + }, + { + "epoch": 0.33993920972644376, + "grad_norm": 0.07294508322977147, + "learning_rate": 9.831934001104056e-06, + "loss": 0.577, + "step": 699 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 0.07696648748694457, + "learning_rate": 9.831441399217645e-06, + "loss": 0.5845, + "step": 700 + }, + { + "epoch": 0.34091185410334346, + "grad_norm": 0.07619005166154548, + "learning_rate": 9.830948088855217e-06, + "loss": 0.6073, + "step": 701 + }, + { + "epoch": 0.3413981762917933, + "grad_norm": 0.07745192730719962, + "learning_rate": 9.830454070089111e-06, + "loss": 0.6034, + "step": 702 + }, + { + "epoch": 0.34188449848024316, + "grad_norm": 0.07605660388505125, + "learning_rate": 9.829959342991769e-06, + "loss": 0.6019, + "step": 703 + }, + { + "epoch": 0.342370820668693, + "grad_norm": 0.08351355346840139, + "learning_rate": 9.829463907635737e-06, + "loss": 0.6215, + "step": 704 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 0.07559345496782574, + "learning_rate": 9.828967764093666e-06, + "loss": 0.6155, + "step": 705 + }, + { + "epoch": 0.3433434650455927, + "grad_norm": 0.0746566912957668, + "learning_rate": 9.828470912438308e-06, + "loss": 0.6551, + "step": 706 + }, + { + "epoch": 0.34382978723404256, + "grad_norm": 0.0879140204566672, + "learning_rate": 9.827973352742523e-06, + "loss": 0.5951, + "step": 707 + }, + { + "epoch": 0.3443161094224924, + "grad_norm": 0.08011827301576804, + "learning_rate": 9.82747508507927e-06, + "loss": 0.63, + "step": 708 + }, + { + "epoch": 0.34480243161094226, + "grad_norm": 0.08785356758693239, + "learning_rate": 9.826976109521616e-06, + "loss": 0.6128, + "step": 709 + }, + { + "epoch": 0.3452887537993921, + "grad_norm": 0.07691999715793084, + "learning_rate": 9.826476426142729e-06, + "loss": 0.6105, + "step": 710 + }, + { + "epoch": 0.34577507598784196, + "grad_norm": 0.07464313027315647, + "learning_rate": 9.825976035015881e-06, + "loss": 0.5893, + "step": 711 + }, + { + "epoch": 0.3462613981762918, + "grad_norm": 0.08167798190056258, + "learning_rate": 9.825474936214453e-06, + "loss": 0.6157, + "step": 712 + }, + { + "epoch": 0.34674772036474166, + "grad_norm": 0.0799450167567303, + "learning_rate": 9.824973129811919e-06, + "loss": 0.5729, + "step": 713 + }, + { + "epoch": 0.3472340425531915, + "grad_norm": 0.07631565362495049, + "learning_rate": 9.82447061588187e-06, + "loss": 0.6165, + "step": 714 + }, + { + "epoch": 0.34772036474164136, + "grad_norm": 0.07452953158131076, + "learning_rate": 9.823967394497988e-06, + "loss": 0.594, + "step": 715 + }, + { + "epoch": 0.34820668693009116, + "grad_norm": 0.07602715809828217, + "learning_rate": 9.823463465734068e-06, + "loss": 0.6164, + "step": 716 + }, + { + "epoch": 0.348693009118541, + "grad_norm": 0.07579853378576255, + "learning_rate": 9.822958829664007e-06, + "loss": 0.5924, + "step": 717 + }, + { + "epoch": 0.34917933130699086, + "grad_norm": 0.07337284322477278, + "learning_rate": 9.822453486361801e-06, + "loss": 0.5654, + "step": 718 + }, + { + "epoch": 0.3496656534954407, + "grad_norm": 0.0780096594422364, + "learning_rate": 9.821947435901552e-06, + "loss": 0.5959, + "step": 719 + }, + { + "epoch": 0.35015197568389056, + "grad_norm": 0.07657189795860213, + "learning_rate": 9.82144067835747e-06, + "loss": 0.5908, + "step": 720 + }, + { + "epoch": 0.3506382978723404, + "grad_norm": 0.07451495597472056, + "learning_rate": 9.820933213803863e-06, + "loss": 0.6152, + "step": 721 + }, + { + "epoch": 0.35112462006079026, + "grad_norm": 0.07596477483710605, + "learning_rate": 9.820425042315145e-06, + "loss": 0.6124, + "step": 722 + }, + { + "epoch": 0.3516109422492401, + "grad_norm": 0.0756182429112107, + "learning_rate": 9.819916163965835e-06, + "loss": 0.5901, + "step": 723 + }, + { + "epoch": 0.35209726443768996, + "grad_norm": 0.0783777772990784, + "learning_rate": 9.819406578830553e-06, + "loss": 0.5966, + "step": 724 + }, + { + "epoch": 0.3525835866261398, + "grad_norm": 0.08137822089637715, + "learning_rate": 9.818896286984025e-06, + "loss": 0.6366, + "step": 725 + }, + { + "epoch": 0.35306990881458966, + "grad_norm": 0.07975805026259344, + "learning_rate": 9.818385288501078e-06, + "loss": 0.6224, + "step": 726 + }, + { + "epoch": 0.3535562310030395, + "grad_norm": 0.08130133299735791, + "learning_rate": 9.817873583456646e-06, + "loss": 0.661, + "step": 727 + }, + { + "epoch": 0.35404255319148936, + "grad_norm": 0.07622972893811689, + "learning_rate": 9.81736117192576e-06, + "loss": 0.6402, + "step": 728 + }, + { + "epoch": 0.3545288753799392, + "grad_norm": 0.07570450408498365, + "learning_rate": 9.816848053983568e-06, + "loss": 0.6268, + "step": 729 + }, + { + "epoch": 0.35501519756838906, + "grad_norm": 0.07833182563314217, + "learning_rate": 9.816334229705304e-06, + "loss": 0.5937, + "step": 730 + }, + { + "epoch": 0.3555015197568389, + "grad_norm": 0.07597390230572401, + "learning_rate": 9.81581969916632e-06, + "loss": 0.609, + "step": 731 + }, + { + "epoch": 0.35598784194528876, + "grad_norm": 0.07329152255471012, + "learning_rate": 9.815304462442064e-06, + "loss": 0.5949, + "step": 732 + }, + { + "epoch": 0.3564741641337386, + "grad_norm": 0.07240498323302307, + "learning_rate": 9.81478851960809e-06, + "loss": 0.5638, + "step": 733 + }, + { + "epoch": 0.35696048632218846, + "grad_norm": 0.07369913230703103, + "learning_rate": 9.814271870740054e-06, + "loss": 0.6111, + "step": 734 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 0.07674314521953614, + "learning_rate": 9.81375451591372e-06, + "loss": 0.5791, + "step": 735 + }, + { + "epoch": 0.35793313069908816, + "grad_norm": 0.07945307747873935, + "learning_rate": 9.813236455204948e-06, + "loss": 0.6099, + "step": 736 + }, + { + "epoch": 0.358419452887538, + "grad_norm": 0.09091806479730845, + "learning_rate": 9.81271768868971e-06, + "loss": 0.6528, + "step": 737 + }, + { + "epoch": 0.35890577507598787, + "grad_norm": 0.07151096378399975, + "learning_rate": 9.812198216444072e-06, + "loss": 0.5806, + "step": 738 + }, + { + "epoch": 0.3593920972644377, + "grad_norm": 0.07746056417723758, + "learning_rate": 9.811678038544215e-06, + "loss": 0.639, + "step": 739 + }, + { + "epoch": 0.35987841945288757, + "grad_norm": 0.07804611191692483, + "learning_rate": 9.81115715506641e-06, + "loss": 0.6532, + "step": 740 + }, + { + "epoch": 0.36036474164133736, + "grad_norm": 0.07394008459322403, + "learning_rate": 9.810635566087046e-06, + "loss": 0.6404, + "step": 741 + }, + { + "epoch": 0.3608510638297872, + "grad_norm": 0.07744506355928404, + "learning_rate": 9.810113271682603e-06, + "loss": 0.6231, + "step": 742 + }, + { + "epoch": 0.36133738601823706, + "grad_norm": 0.07564954071125862, + "learning_rate": 9.809590271929673e-06, + "loss": 0.6113, + "step": 743 + }, + { + "epoch": 0.3618237082066869, + "grad_norm": 0.07453273918898405, + "learning_rate": 9.809066566904943e-06, + "loss": 0.5955, + "step": 744 + }, + { + "epoch": 0.36231003039513676, + "grad_norm": 0.07479149946864226, + "learning_rate": 9.808542156685214e-06, + "loss": 0.5769, + "step": 745 + }, + { + "epoch": 0.3627963525835866, + "grad_norm": 0.082402355714264, + "learning_rate": 9.808017041347381e-06, + "loss": 0.6218, + "step": 746 + }, + { + "epoch": 0.36328267477203646, + "grad_norm": 0.07873225068391046, + "learning_rate": 9.807491220968449e-06, + "loss": 0.6112, + "step": 747 + }, + { + "epoch": 0.3637689969604863, + "grad_norm": 0.07455467176287123, + "learning_rate": 9.806964695625521e-06, + "loss": 0.5794, + "step": 748 + }, + { + "epoch": 0.36425531914893616, + "grad_norm": 0.07669381174528357, + "learning_rate": 9.806437465395806e-06, + "loss": 0.5928, + "step": 749 + }, + { + "epoch": 0.364741641337386, + "grad_norm": 0.07708746212665712, + "learning_rate": 9.805909530356619e-06, + "loss": 0.5792, + "step": 750 + }, + { + "epoch": 0.36522796352583586, + "grad_norm": 0.0782021814908094, + "learning_rate": 9.805380890585374e-06, + "loss": 0.5986, + "step": 751 + }, + { + "epoch": 0.3657142857142857, + "grad_norm": 0.07580465311028298, + "learning_rate": 9.804851546159591e-06, + "loss": 0.6016, + "step": 752 + }, + { + "epoch": 0.36620060790273556, + "grad_norm": 0.07425982247576306, + "learning_rate": 9.804321497156889e-06, + "loss": 0.5865, + "step": 753 + }, + { + "epoch": 0.3666869300911854, + "grad_norm": 0.07343054880813348, + "learning_rate": 9.803790743654997e-06, + "loss": 0.5846, + "step": 754 + }, + { + "epoch": 0.36717325227963526, + "grad_norm": 0.07602241035777223, + "learning_rate": 9.803259285731744e-06, + "loss": 0.6245, + "step": 755 + }, + { + "epoch": 0.3676595744680851, + "grad_norm": 0.07560893349405994, + "learning_rate": 9.802727123465061e-06, + "loss": 0.5705, + "step": 756 + }, + { + "epoch": 0.36814589665653497, + "grad_norm": 0.0781592045474348, + "learning_rate": 9.802194256932985e-06, + "loss": 0.6035, + "step": 757 + }, + { + "epoch": 0.3686322188449848, + "grad_norm": 0.5479549618834126, + "learning_rate": 9.801660686213653e-06, + "loss": 0.6121, + "step": 758 + }, + { + "epoch": 0.36911854103343467, + "grad_norm": 0.07972933601606225, + "learning_rate": 9.801126411385306e-06, + "loss": 0.5695, + "step": 759 + }, + { + "epoch": 0.3696048632218845, + "grad_norm": 0.08009475079810593, + "learning_rate": 9.800591432526291e-06, + "loss": 0.6005, + "step": 760 + }, + { + "epoch": 0.37009118541033437, + "grad_norm": 0.07582135814627514, + "learning_rate": 9.80005574971506e-06, + "loss": 0.551, + "step": 761 + }, + { + "epoch": 0.3705775075987842, + "grad_norm": 0.07473388860056311, + "learning_rate": 9.79951936303016e-06, + "loss": 0.5768, + "step": 762 + }, + { + "epoch": 0.37106382978723407, + "grad_norm": 0.08367807189980862, + "learning_rate": 9.798982272550248e-06, + "loss": 0.6329, + "step": 763 + }, + { + "epoch": 0.3715501519756839, + "grad_norm": 0.07874208648761716, + "learning_rate": 9.79844447835408e-06, + "loss": 0.6011, + "step": 764 + }, + { + "epoch": 0.3720364741641337, + "grad_norm": 0.07706266515535616, + "learning_rate": 9.797905980520522e-06, + "loss": 0.6013, + "step": 765 + }, + { + "epoch": 0.37252279635258356, + "grad_norm": 0.0958171568155027, + "learning_rate": 9.797366779128532e-06, + "loss": 0.6558, + "step": 766 + }, + { + "epoch": 0.3730091185410334, + "grad_norm": 0.07697272843657556, + "learning_rate": 9.796826874257186e-06, + "loss": 0.6125, + "step": 767 + }, + { + "epoch": 0.37349544072948326, + "grad_norm": 0.08186349415790875, + "learning_rate": 9.796286265985648e-06, + "loss": 0.6011, + "step": 768 + }, + { + "epoch": 0.3739817629179331, + "grad_norm": 0.0883592718059818, + "learning_rate": 9.795744954393193e-06, + "loss": 0.5679, + "step": 769 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 0.08167812442101358, + "learning_rate": 9.795202939559202e-06, + "loss": 0.6299, + "step": 770 + }, + { + "epoch": 0.3749544072948328, + "grad_norm": 0.0792685871068706, + "learning_rate": 9.794660221563153e-06, + "loss": 0.5999, + "step": 771 + }, + { + "epoch": 0.37544072948328266, + "grad_norm": 0.08532744694283427, + "learning_rate": 9.79411680048463e-06, + "loss": 0.6254, + "step": 772 + }, + { + "epoch": 0.3759270516717325, + "grad_norm": 0.07806256718078687, + "learning_rate": 9.793572676403317e-06, + "loss": 0.619, + "step": 773 + }, + { + "epoch": 0.37641337386018237, + "grad_norm": 0.0793179847038666, + "learning_rate": 9.793027849399007e-06, + "loss": 0.6606, + "step": 774 + }, + { + "epoch": 0.3768996960486322, + "grad_norm": 0.07888310670534646, + "learning_rate": 9.792482319551591e-06, + "loss": 0.6185, + "step": 775 + }, + { + "epoch": 0.37738601823708207, + "grad_norm": 0.07647686653808983, + "learning_rate": 9.791936086941065e-06, + "loss": 0.6145, + "step": 776 + }, + { + "epoch": 0.3778723404255319, + "grad_norm": 0.07602720666265521, + "learning_rate": 9.791389151647528e-06, + "loss": 0.5883, + "step": 777 + }, + { + "epoch": 0.37835866261398177, + "grad_norm": 0.0801984661142592, + "learning_rate": 9.790841513751183e-06, + "loss": 0.5713, + "step": 778 + }, + { + "epoch": 0.3788449848024316, + "grad_norm": 0.07716436222336652, + "learning_rate": 9.790293173332332e-06, + "loss": 0.6222, + "step": 779 + }, + { + "epoch": 0.37933130699088147, + "grad_norm": 0.0756425996143619, + "learning_rate": 9.789744130471384e-06, + "loss": 0.5851, + "step": 780 + }, + { + "epoch": 0.3798176291793313, + "grad_norm": 0.1257844678410659, + "learning_rate": 9.789194385248853e-06, + "loss": 0.6561, + "step": 781 + }, + { + "epoch": 0.38030395136778117, + "grad_norm": 0.13855605654022526, + "learning_rate": 9.788643937745349e-06, + "loss": 0.6287, + "step": 782 + }, + { + "epoch": 0.380790273556231, + "grad_norm": 0.07487811345641193, + "learning_rate": 9.788092788041589e-06, + "loss": 0.5895, + "step": 783 + }, + { + "epoch": 0.38127659574468087, + "grad_norm": 0.07377100047352117, + "learning_rate": 9.787540936218393e-06, + "loss": 0.5368, + "step": 784 + }, + { + "epoch": 0.3817629179331307, + "grad_norm": 0.07584459135647661, + "learning_rate": 9.786988382356688e-06, + "loss": 0.5947, + "step": 785 + }, + { + "epoch": 0.38224924012158057, + "grad_norm": 0.07479041400561619, + "learning_rate": 9.786435126537494e-06, + "loss": 0.5972, + "step": 786 + }, + { + "epoch": 0.3827355623100304, + "grad_norm": 0.07404379534937033, + "learning_rate": 9.785881168841944e-06, + "loss": 0.614, + "step": 787 + }, + { + "epoch": 0.38322188449848027, + "grad_norm": 0.07399166012541243, + "learning_rate": 9.785326509351268e-06, + "loss": 0.6004, + "step": 788 + }, + { + "epoch": 0.38370820668693006, + "grad_norm": 0.08121249684545276, + "learning_rate": 9.7847711481468e-06, + "loss": 0.6078, + "step": 789 + }, + { + "epoch": 0.3841945288753799, + "grad_norm": 0.07987169556666426, + "learning_rate": 9.784215085309977e-06, + "loss": 0.5872, + "step": 790 + }, + { + "epoch": 0.38468085106382977, + "grad_norm": 0.07695724165931447, + "learning_rate": 9.783658320922341e-06, + "loss": 0.5858, + "step": 791 + }, + { + "epoch": 0.3851671732522796, + "grad_norm": 0.08018413963144698, + "learning_rate": 9.783100855065533e-06, + "loss": 0.601, + "step": 792 + }, + { + "epoch": 0.38565349544072947, + "grad_norm": 0.0762652933877143, + "learning_rate": 9.782542687821302e-06, + "loss": 0.6222, + "step": 793 + }, + { + "epoch": 0.3861398176291793, + "grad_norm": 0.07851378702374802, + "learning_rate": 9.781983819271494e-06, + "loss": 0.5988, + "step": 794 + }, + { + "epoch": 0.38662613981762917, + "grad_norm": 0.07160612568255702, + "learning_rate": 9.781424249498064e-06, + "loss": 0.5586, + "step": 795 + }, + { + "epoch": 0.387112462006079, + "grad_norm": 0.07633586461842592, + "learning_rate": 9.780863978583061e-06, + "loss": 0.622, + "step": 796 + }, + { + "epoch": 0.38759878419452887, + "grad_norm": 0.08553591241183264, + "learning_rate": 9.78030300660865e-06, + "loss": 0.6428, + "step": 797 + }, + { + "epoch": 0.3880851063829787, + "grad_norm": 0.07603119624398466, + "learning_rate": 9.779741333657084e-06, + "loss": 0.596, + "step": 798 + }, + { + "epoch": 0.38857142857142857, + "grad_norm": 0.07346753616473085, + "learning_rate": 9.779178959810728e-06, + "loss": 0.5701, + "step": 799 + }, + { + "epoch": 0.3890577507598784, + "grad_norm": 0.07552954801862151, + "learning_rate": 9.778615885152052e-06, + "loss": 0.6303, + "step": 800 + }, + { + "epoch": 0.38954407294832827, + "grad_norm": 0.07781764530797763, + "learning_rate": 9.778052109763619e-06, + "loss": 0.5965, + "step": 801 + }, + { + "epoch": 0.3900303951367781, + "grad_norm": 0.081946823974092, + "learning_rate": 9.777487633728103e-06, + "loss": 0.5732, + "step": 802 + }, + { + "epoch": 0.39051671732522797, + "grad_norm": 0.07662313874272146, + "learning_rate": 9.776922457128277e-06, + "loss": 0.6052, + "step": 803 + }, + { + "epoch": 0.3910030395136778, + "grad_norm": 0.07587320821089638, + "learning_rate": 9.77635658004702e-06, + "loss": 0.6246, + "step": 804 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 0.07369205805954429, + "learning_rate": 9.77579000256731e-06, + "loss": 0.602, + "step": 805 + }, + { + "epoch": 0.3919756838905775, + "grad_norm": 0.07457476463638249, + "learning_rate": 9.775222724772226e-06, + "loss": 0.5692, + "step": 806 + }, + { + "epoch": 0.39246200607902737, + "grad_norm": 0.07812562033668508, + "learning_rate": 9.774654746744957e-06, + "loss": 0.6353, + "step": 807 + }, + { + "epoch": 0.3929483282674772, + "grad_norm": 0.07605717651897465, + "learning_rate": 9.77408606856879e-06, + "loss": 0.5628, + "step": 808 + }, + { + "epoch": 0.39343465045592707, + "grad_norm": 0.07584249979144048, + "learning_rate": 9.773516690327111e-06, + "loss": 0.5825, + "step": 809 + }, + { + "epoch": 0.3939209726443769, + "grad_norm": 0.07921338181632771, + "learning_rate": 9.77294661210342e-06, + "loss": 0.6414, + "step": 810 + }, + { + "epoch": 0.39440729483282677, + "grad_norm": 0.07717726232374068, + "learning_rate": 9.772375833981306e-06, + "loss": 0.602, + "step": 811 + }, + { + "epoch": 0.3948936170212766, + "grad_norm": 0.07342179279640387, + "learning_rate": 9.771804356044473e-06, + "loss": 0.5587, + "step": 812 + }, + { + "epoch": 0.3953799392097264, + "grad_norm": 0.0752072699002813, + "learning_rate": 9.771232178376717e-06, + "loss": 0.6419, + "step": 813 + }, + { + "epoch": 0.39586626139817627, + "grad_norm": 0.07353312399126578, + "learning_rate": 9.770659301061943e-06, + "loss": 0.5743, + "step": 814 + }, + { + "epoch": 0.3963525835866261, + "grad_norm": 0.07518407178014146, + "learning_rate": 9.770085724184158e-06, + "loss": 0.5839, + "step": 815 + }, + { + "epoch": 0.39683890577507597, + "grad_norm": 0.07512845684968103, + "learning_rate": 9.769511447827466e-06, + "loss": 0.5983, + "step": 816 + }, + { + "epoch": 0.3973252279635258, + "grad_norm": 0.07483337228430045, + "learning_rate": 9.768936472076086e-06, + "loss": 0.5643, + "step": 817 + }, + { + "epoch": 0.39781155015197567, + "grad_norm": 0.07826536390308092, + "learning_rate": 9.768360797014325e-06, + "loss": 0.5902, + "step": 818 + }, + { + "epoch": 0.3982978723404255, + "grad_norm": 0.07823860074212571, + "learning_rate": 9.767784422726601e-06, + "loss": 0.6034, + "step": 819 + }, + { + "epoch": 0.39878419452887537, + "grad_norm": 0.07928236098172678, + "learning_rate": 9.767207349297434e-06, + "loss": 0.6056, + "step": 820 + }, + { + "epoch": 0.3992705167173252, + "grad_norm": 0.07502602439184818, + "learning_rate": 9.766629576811444e-06, + "loss": 0.5634, + "step": 821 + }, + { + "epoch": 0.39975683890577507, + "grad_norm": 0.07440247125134211, + "learning_rate": 9.766051105353355e-06, + "loss": 0.5997, + "step": 822 + }, + { + "epoch": 0.4002431610942249, + "grad_norm": 0.07432716463247864, + "learning_rate": 9.765471935007995e-06, + "loss": 0.647, + "step": 823 + }, + { + "epoch": 0.40072948328267477, + "grad_norm": 0.0732308644469105, + "learning_rate": 9.76489206586029e-06, + "loss": 0.578, + "step": 824 + }, + { + "epoch": 0.4012158054711246, + "grad_norm": 0.07893899133784575, + "learning_rate": 9.764311497995272e-06, + "loss": 0.6225, + "step": 825 + }, + { + "epoch": 0.40170212765957447, + "grad_norm": 0.07296501503582166, + "learning_rate": 9.763730231498077e-06, + "loss": 0.5958, + "step": 826 + }, + { + "epoch": 0.4021884498480243, + "grad_norm": 0.0764385710082842, + "learning_rate": 9.763148266453937e-06, + "loss": 0.6243, + "step": 827 + }, + { + "epoch": 0.40267477203647417, + "grad_norm": 0.07583379733812651, + "learning_rate": 9.762565602948194e-06, + "loss": 0.6402, + "step": 828 + }, + { + "epoch": 0.403161094224924, + "grad_norm": 0.07634036599173366, + "learning_rate": 9.761982241066288e-06, + "loss": 0.6329, + "step": 829 + }, + { + "epoch": 0.40364741641337387, + "grad_norm": 0.08103214345275933, + "learning_rate": 9.761398180893761e-06, + "loss": 0.585, + "step": 830 + }, + { + "epoch": 0.4041337386018237, + "grad_norm": 0.07620272595765024, + "learning_rate": 9.760813422516262e-06, + "loss": 0.6248, + "step": 831 + }, + { + "epoch": 0.4046200607902736, + "grad_norm": 0.0796826241754394, + "learning_rate": 9.760227966019537e-06, + "loss": 0.6052, + "step": 832 + }, + { + "epoch": 0.4051063829787234, + "grad_norm": 0.07632403049958285, + "learning_rate": 9.759641811489435e-06, + "loss": 0.5757, + "step": 833 + }, + { + "epoch": 0.4055927051671733, + "grad_norm": 0.07616229687040604, + "learning_rate": 9.759054959011913e-06, + "loss": 0.6068, + "step": 834 + }, + { + "epoch": 0.4060790273556231, + "grad_norm": 0.07226407706388134, + "learning_rate": 9.758467408673022e-06, + "loss": 0.5968, + "step": 835 + }, + { + "epoch": 0.406565349544073, + "grad_norm": 0.0808888319207876, + "learning_rate": 9.757879160558923e-06, + "loss": 0.6091, + "step": 836 + }, + { + "epoch": 0.4070516717325228, + "grad_norm": 0.07666755480788576, + "learning_rate": 9.757290214755873e-06, + "loss": 0.5954, + "step": 837 + }, + { + "epoch": 0.4075379939209726, + "grad_norm": 0.0785550634517315, + "learning_rate": 9.756700571350234e-06, + "loss": 0.5964, + "step": 838 + }, + { + "epoch": 0.40802431610942247, + "grad_norm": 0.0756310976979388, + "learning_rate": 9.756110230428476e-06, + "loss": 0.6078, + "step": 839 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 0.07712320689679607, + "learning_rate": 9.75551919207716e-06, + "loss": 0.5821, + "step": 840 + }, + { + "epoch": 0.40899696048632217, + "grad_norm": 0.07383297779896966, + "learning_rate": 9.754927456382957e-06, + "loss": 0.5553, + "step": 841 + }, + { + "epoch": 0.409483282674772, + "grad_norm": 0.07829601350018571, + "learning_rate": 9.75433502343264e-06, + "loss": 0.5716, + "step": 842 + }, + { + "epoch": 0.40996960486322187, + "grad_norm": 0.08032894608305195, + "learning_rate": 9.753741893313077e-06, + "loss": 0.6025, + "step": 843 + }, + { + "epoch": 0.4104559270516717, + "grad_norm": 0.07736144033521516, + "learning_rate": 9.753148066111251e-06, + "loss": 0.6401, + "step": 844 + }, + { + "epoch": 0.41094224924012157, + "grad_norm": 0.07223111267633972, + "learning_rate": 9.752553541914236e-06, + "loss": 0.6098, + "step": 845 + }, + { + "epoch": 0.4114285714285714, + "grad_norm": 0.08321710030729894, + "learning_rate": 9.751958320809213e-06, + "loss": 0.5994, + "step": 846 + }, + { + "epoch": 0.41191489361702127, + "grad_norm": 0.08000958815494474, + "learning_rate": 9.751362402883465e-06, + "loss": 0.6199, + "step": 847 + }, + { + "epoch": 0.4124012158054711, + "grad_norm": 0.07387542505957467, + "learning_rate": 9.750765788224374e-06, + "loss": 0.5773, + "step": 848 + }, + { + "epoch": 0.41288753799392097, + "grad_norm": 0.08077551136540044, + "learning_rate": 9.750168476919429e-06, + "loss": 0.608, + "step": 849 + }, + { + "epoch": 0.4133738601823708, + "grad_norm": 0.07915987309003285, + "learning_rate": 9.74957046905622e-06, + "loss": 0.601, + "step": 850 + }, + { + "epoch": 0.4138601823708207, + "grad_norm": 0.07569127118214747, + "learning_rate": 9.748971764722434e-06, + "loss": 0.6116, + "step": 851 + }, + { + "epoch": 0.4143465045592705, + "grad_norm": 0.07674797213372184, + "learning_rate": 9.74837236400587e-06, + "loss": 0.5906, + "step": 852 + }, + { + "epoch": 0.4148328267477204, + "grad_norm": 0.07856468005675736, + "learning_rate": 9.747772266994418e-06, + "loss": 0.6059, + "step": 853 + }, + { + "epoch": 0.4153191489361702, + "grad_norm": 0.07738863175280847, + "learning_rate": 9.747171473776078e-06, + "loss": 0.5887, + "step": 854 + }, + { + "epoch": 0.4158054711246201, + "grad_norm": 0.07652290342538677, + "learning_rate": 9.74656998443895e-06, + "loss": 0.6342, + "step": 855 + }, + { + "epoch": 0.4162917933130699, + "grad_norm": 0.0786624751612501, + "learning_rate": 9.745967799071234e-06, + "loss": 0.65, + "step": 856 + }, + { + "epoch": 0.4167781155015198, + "grad_norm": 0.07803366601690188, + "learning_rate": 9.745364917761235e-06, + "loss": 0.564, + "step": 857 + }, + { + "epoch": 0.4172644376899696, + "grad_norm": 0.0785850851671536, + "learning_rate": 9.744761340597356e-06, + "loss": 0.6004, + "step": 858 + }, + { + "epoch": 0.4177507598784195, + "grad_norm": 0.07700850294747096, + "learning_rate": 9.744157067668108e-06, + "loss": 0.5712, + "step": 859 + }, + { + "epoch": 0.4182370820668693, + "grad_norm": 0.07553743520602545, + "learning_rate": 9.7435520990621e-06, + "loss": 0.596, + "step": 860 + }, + { + "epoch": 0.4187234042553192, + "grad_norm": 0.07547797004118767, + "learning_rate": 9.742946434868044e-06, + "loss": 0.6543, + "step": 861 + }, + { + "epoch": 0.41920972644376897, + "grad_norm": 0.07754776062169427, + "learning_rate": 9.742340075174751e-06, + "loss": 0.6027, + "step": 862 + }, + { + "epoch": 0.4196960486322188, + "grad_norm": 0.0766152608824993, + "learning_rate": 9.74173302007114e-06, + "loss": 0.5901, + "step": 863 + }, + { + "epoch": 0.42018237082066867, + "grad_norm": 0.1914583263147464, + "learning_rate": 9.741125269646228e-06, + "loss": 0.6266, + "step": 864 + }, + { + "epoch": 0.4206686930091185, + "grad_norm": 0.07538623756665357, + "learning_rate": 9.740516823989133e-06, + "loss": 0.5612, + "step": 865 + }, + { + "epoch": 0.42115501519756837, + "grad_norm": 0.07417571500644635, + "learning_rate": 9.739907683189078e-06, + "loss": 0.5562, + "step": 866 + }, + { + "epoch": 0.4216413373860182, + "grad_norm": 0.07442235181744117, + "learning_rate": 9.739297847335387e-06, + "loss": 0.619, + "step": 867 + }, + { + "epoch": 0.4221276595744681, + "grad_norm": 0.07910113288806607, + "learning_rate": 9.738687316517486e-06, + "loss": 0.6059, + "step": 868 + }, + { + "epoch": 0.4226139817629179, + "grad_norm": 0.08559545367039657, + "learning_rate": 9.7380760908249e-06, + "loss": 0.6919, + "step": 869 + }, + { + "epoch": 0.4231003039513678, + "grad_norm": 0.08695687320206436, + "learning_rate": 9.73746417034726e-06, + "loss": 0.6031, + "step": 870 + }, + { + "epoch": 0.4235866261398176, + "grad_norm": 0.07725936774072596, + "learning_rate": 9.736851555174295e-06, + "loss": 0.5728, + "step": 871 + }, + { + "epoch": 0.4240729483282675, + "grad_norm": 0.0778958283361194, + "learning_rate": 9.736238245395842e-06, + "loss": 0.6, + "step": 872 + }, + { + "epoch": 0.4245592705167173, + "grad_norm": 0.07442993196817138, + "learning_rate": 9.735624241101836e-06, + "loss": 0.5682, + "step": 873 + }, + { + "epoch": 0.4250455927051672, + "grad_norm": 0.07601852794136615, + "learning_rate": 9.735009542382308e-06, + "loss": 0.5736, + "step": 874 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 0.07273129906169155, + "learning_rate": 9.734394149327402e-06, + "loss": 0.5669, + "step": 875 + }, + { + "epoch": 0.4260182370820669, + "grad_norm": 0.08654044254014771, + "learning_rate": 9.733778062027355e-06, + "loss": 0.6199, + "step": 876 + }, + { + "epoch": 0.4265045592705167, + "grad_norm": 0.07827821768608124, + "learning_rate": 9.733161280572512e-06, + "loss": 0.6123, + "step": 877 + }, + { + "epoch": 0.4269908814589666, + "grad_norm": 0.07420143017419538, + "learning_rate": 9.732543805053316e-06, + "loss": 0.5998, + "step": 878 + }, + { + "epoch": 0.4274772036474164, + "grad_norm": 0.07849074664698075, + "learning_rate": 9.731925635560314e-06, + "loss": 0.5939, + "step": 879 + }, + { + "epoch": 0.4279635258358663, + "grad_norm": 0.07323072801587062, + "learning_rate": 9.73130677218415e-06, + "loss": 0.5935, + "step": 880 + }, + { + "epoch": 0.4284498480243161, + "grad_norm": 0.07321945312865086, + "learning_rate": 9.730687215015576e-06, + "loss": 0.5851, + "step": 881 + }, + { + "epoch": 0.428936170212766, + "grad_norm": 0.075591448859421, + "learning_rate": 9.730066964145441e-06, + "loss": 0.5823, + "step": 882 + }, + { + "epoch": 0.4294224924012158, + "grad_norm": 0.07441155830176882, + "learning_rate": 9.729446019664701e-06, + "loss": 0.6085, + "step": 883 + }, + { + "epoch": 0.4299088145896657, + "grad_norm": 0.07271166988434982, + "learning_rate": 9.728824381664408e-06, + "loss": 0.575, + "step": 884 + }, + { + "epoch": 0.43039513677811553, + "grad_norm": 0.07346486482966536, + "learning_rate": 9.728202050235718e-06, + "loss": 0.5881, + "step": 885 + }, + { + "epoch": 0.4308814589665653, + "grad_norm": 0.07489452556645379, + "learning_rate": 9.72757902546989e-06, + "loss": 0.6044, + "step": 886 + }, + { + "epoch": 0.4313677811550152, + "grad_norm": 0.07632507019752027, + "learning_rate": 9.726955307458286e-06, + "loss": 0.6231, + "step": 887 + }, + { + "epoch": 0.431854103343465, + "grad_norm": 0.08578252253728172, + "learning_rate": 9.72633089629236e-06, + "loss": 0.6285, + "step": 888 + }, + { + "epoch": 0.4323404255319149, + "grad_norm": 0.07284350237287862, + "learning_rate": 9.725705792063681e-06, + "loss": 0.5657, + "step": 889 + }, + { + "epoch": 0.4328267477203647, + "grad_norm": 0.07759959988539591, + "learning_rate": 9.725079994863914e-06, + "loss": 0.6165, + "step": 890 + }, + { + "epoch": 0.4333130699088146, + "grad_norm": 0.07381930553688103, + "learning_rate": 9.724453504784819e-06, + "loss": 0.5513, + "step": 891 + }, + { + "epoch": 0.4337993920972644, + "grad_norm": 0.07617855274750135, + "learning_rate": 9.723826321918268e-06, + "loss": 0.5861, + "step": 892 + }, + { + "epoch": 0.4342857142857143, + "grad_norm": 0.07127956105247536, + "learning_rate": 9.72319844635623e-06, + "loss": 0.5439, + "step": 893 + }, + { + "epoch": 0.4347720364741641, + "grad_norm": 0.07448495855434166, + "learning_rate": 9.722569878190776e-06, + "loss": 0.6072, + "step": 894 + }, + { + "epoch": 0.435258358662614, + "grad_norm": 0.07440876596547552, + "learning_rate": 9.721940617514076e-06, + "loss": 0.5854, + "step": 895 + }, + { + "epoch": 0.4357446808510638, + "grad_norm": 0.0744417889991682, + "learning_rate": 9.721310664418406e-06, + "loss": 0.6157, + "step": 896 + }, + { + "epoch": 0.4362310030395137, + "grad_norm": 0.15080017407035806, + "learning_rate": 9.720680018996142e-06, + "loss": 0.6143, + "step": 897 + }, + { + "epoch": 0.4367173252279635, + "grad_norm": 0.07117656694073243, + "learning_rate": 9.72004868133976e-06, + "loss": 0.5783, + "step": 898 + }, + { + "epoch": 0.4372036474164134, + "grad_norm": 0.08124580916688322, + "learning_rate": 9.719416651541839e-06, + "loss": 0.6025, + "step": 899 + }, + { + "epoch": 0.4376899696048632, + "grad_norm": 0.07972345776023025, + "learning_rate": 9.718783929695056e-06, + "loss": 0.5895, + "step": 900 + }, + { + "epoch": 0.4381762917933131, + "grad_norm": 0.07641182226859868, + "learning_rate": 9.718150515892199e-06, + "loss": 0.5851, + "step": 901 + }, + { + "epoch": 0.4386626139817629, + "grad_norm": 0.07953936083977978, + "learning_rate": 9.717516410226144e-06, + "loss": 0.62, + "step": 902 + }, + { + "epoch": 0.4391489361702128, + "grad_norm": 0.08455624384394385, + "learning_rate": 9.716881612789878e-06, + "loss": 0.6444, + "step": 903 + }, + { + "epoch": 0.43963525835866263, + "grad_norm": 0.07756115270524952, + "learning_rate": 9.716246123676491e-06, + "loss": 0.6123, + "step": 904 + }, + { + "epoch": 0.4401215805471125, + "grad_norm": 0.07732160915673524, + "learning_rate": 9.715609942979163e-06, + "loss": 0.603, + "step": 905 + }, + { + "epoch": 0.44060790273556233, + "grad_norm": 0.07766174992231115, + "learning_rate": 9.714973070791187e-06, + "loss": 0.6185, + "step": 906 + }, + { + "epoch": 0.4410942249240122, + "grad_norm": 0.07937360530387118, + "learning_rate": 9.714335507205953e-06, + "loss": 0.5601, + "step": 907 + }, + { + "epoch": 0.44158054711246203, + "grad_norm": 0.07686143431141876, + "learning_rate": 9.713697252316951e-06, + "loss": 0.6079, + "step": 908 + }, + { + "epoch": 0.4420668693009119, + "grad_norm": 0.07288172634716461, + "learning_rate": 9.713058306217776e-06, + "loss": 0.5616, + "step": 909 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 0.0840387537815393, + "learning_rate": 9.712418669002119e-06, + "loss": 0.6173, + "step": 910 + }, + { + "epoch": 0.4430395136778115, + "grad_norm": 0.0791516823690533, + "learning_rate": 9.711778340763778e-06, + "loss": 0.6151, + "step": 911 + }, + { + "epoch": 0.4435258358662614, + "grad_norm": 0.081325706302791, + "learning_rate": 9.711137321596649e-06, + "loss": 0.6754, + "step": 912 + }, + { + "epoch": 0.4440121580547112, + "grad_norm": 0.07313916628395718, + "learning_rate": 9.71049561159473e-06, + "loss": 0.5868, + "step": 913 + }, + { + "epoch": 0.4444984802431611, + "grad_norm": 0.07855701100463838, + "learning_rate": 9.70985321085212e-06, + "loss": 0.5954, + "step": 914 + }, + { + "epoch": 0.4449848024316109, + "grad_norm": 0.08335727397651717, + "learning_rate": 9.709210119463022e-06, + "loss": 0.6362, + "step": 915 + }, + { + "epoch": 0.4454711246200608, + "grad_norm": 0.07602292881303029, + "learning_rate": 9.708566337521736e-06, + "loss": 0.6011, + "step": 916 + }, + { + "epoch": 0.4459574468085106, + "grad_norm": 0.07595063819769632, + "learning_rate": 9.707921865122665e-06, + "loss": 0.6069, + "step": 917 + }, + { + "epoch": 0.4464437689969605, + "grad_norm": 0.07746393755940535, + "learning_rate": 9.707276702360315e-06, + "loss": 0.6204, + "step": 918 + }, + { + "epoch": 0.4469300911854103, + "grad_norm": 0.07765430935306938, + "learning_rate": 9.706630849329292e-06, + "loss": 0.6032, + "step": 919 + }, + { + "epoch": 0.4474164133738602, + "grad_norm": 0.13254051685910764, + "learning_rate": 9.705984306124302e-06, + "loss": 0.5959, + "step": 920 + }, + { + "epoch": 0.44790273556231003, + "grad_norm": 0.0736482439739256, + "learning_rate": 9.705337072840152e-06, + "loss": 0.5957, + "step": 921 + }, + { + "epoch": 0.4483890577507599, + "grad_norm": 0.07161912283251916, + "learning_rate": 9.704689149571755e-06, + "loss": 0.6029, + "step": 922 + }, + { + "epoch": 0.44887537993920973, + "grad_norm": 0.07632947843340931, + "learning_rate": 9.70404053641412e-06, + "loss": 0.6094, + "step": 923 + }, + { + "epoch": 0.4493617021276596, + "grad_norm": 0.07187594235372388, + "learning_rate": 9.703391233462356e-06, + "loss": 0.5928, + "step": 924 + }, + { + "epoch": 0.44984802431610943, + "grad_norm": 0.07522064767838982, + "learning_rate": 9.70274124081168e-06, + "loss": 0.6075, + "step": 925 + }, + { + "epoch": 0.4503343465045593, + "grad_norm": 0.08512216752391312, + "learning_rate": 9.702090558557404e-06, + "loss": 0.6582, + "step": 926 + }, + { + "epoch": 0.45082066869300913, + "grad_norm": 0.0768733313224146, + "learning_rate": 9.701439186794943e-06, + "loss": 0.5934, + "step": 927 + }, + { + "epoch": 0.451306990881459, + "grad_norm": 0.07544172707600616, + "learning_rate": 9.700787125619812e-06, + "loss": 0.5961, + "step": 928 + }, + { + "epoch": 0.45179331306990883, + "grad_norm": 0.08177673987106983, + "learning_rate": 9.700134375127633e-06, + "loss": 0.6159, + "step": 929 + }, + { + "epoch": 0.4522796352583587, + "grad_norm": 0.07107017092833946, + "learning_rate": 9.69948093541412e-06, + "loss": 0.561, + "step": 930 + }, + { + "epoch": 0.45276595744680853, + "grad_norm": 0.08007185519302115, + "learning_rate": 9.698826806575093e-06, + "loss": 0.6439, + "step": 931 + }, + { + "epoch": 0.4532522796352584, + "grad_norm": 0.07487054762146209, + "learning_rate": 9.698171988706476e-06, + "loss": 0.574, + "step": 932 + }, + { + "epoch": 0.45373860182370823, + "grad_norm": 0.07978953515184978, + "learning_rate": 9.697516481904286e-06, + "loss": 0.5847, + "step": 933 + }, + { + "epoch": 0.4542249240121581, + "grad_norm": 0.08028804220821169, + "learning_rate": 9.696860286264648e-06, + "loss": 0.6054, + "step": 934 + }, + { + "epoch": 0.4547112462006079, + "grad_norm": 0.07803227031905113, + "learning_rate": 9.696203401883786e-06, + "loss": 0.6331, + "step": 935 + }, + { + "epoch": 0.4551975683890577, + "grad_norm": 0.07670678948077339, + "learning_rate": 9.695545828858024e-06, + "loss": 0.6871, + "step": 936 + }, + { + "epoch": 0.4556838905775076, + "grad_norm": 0.07743577914984455, + "learning_rate": 9.694887567283786e-06, + "loss": 0.6515, + "step": 937 + }, + { + "epoch": 0.45617021276595743, + "grad_norm": 0.07861986358754945, + "learning_rate": 9.694228617257602e-06, + "loss": 0.5849, + "step": 938 + }, + { + "epoch": 0.4566565349544073, + "grad_norm": 0.07455616986614651, + "learning_rate": 9.693568978876098e-06, + "loss": 0.6069, + "step": 939 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.07513334112457971, + "learning_rate": 9.692908652236002e-06, + "loss": 0.5738, + "step": 940 + }, + { + "epoch": 0.457629179331307, + "grad_norm": 0.08441089807460127, + "learning_rate": 9.692247637434142e-06, + "loss": 0.6307, + "step": 941 + }, + { + "epoch": 0.45811550151975683, + "grad_norm": 0.07570139872137768, + "learning_rate": 9.691585934567452e-06, + "loss": 0.5746, + "step": 942 + }, + { + "epoch": 0.4586018237082067, + "grad_norm": 0.07390655051148683, + "learning_rate": 9.690923543732962e-06, + "loss": 0.6427, + "step": 943 + }, + { + "epoch": 0.45908814589665653, + "grad_norm": 0.07322564550559024, + "learning_rate": 9.690260465027802e-06, + "loss": 0.5742, + "step": 944 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 0.07980290492363408, + "learning_rate": 9.689596698549203e-06, + "loss": 0.6152, + "step": 945 + }, + { + "epoch": 0.46006079027355623, + "grad_norm": 0.07472707478190149, + "learning_rate": 9.688932244394507e-06, + "loss": 0.581, + "step": 946 + }, + { + "epoch": 0.4605471124620061, + "grad_norm": 0.07570147061772793, + "learning_rate": 9.688267102661142e-06, + "loss": 0.6201, + "step": 947 + }, + { + "epoch": 0.46103343465045593, + "grad_norm": 0.0777773138389416, + "learning_rate": 9.687601273446645e-06, + "loss": 0.6202, + "step": 948 + }, + { + "epoch": 0.4615197568389058, + "grad_norm": 0.08220264826870818, + "learning_rate": 9.686934756848651e-06, + "loss": 0.5952, + "step": 949 + }, + { + "epoch": 0.46200607902735563, + "grad_norm": 0.07929919144080227, + "learning_rate": 9.686267552964901e-06, + "loss": 0.636, + "step": 950 + }, + { + "epoch": 0.4624924012158055, + "grad_norm": 0.07685286867352173, + "learning_rate": 9.68559966189323e-06, + "loss": 0.6212, + "step": 951 + }, + { + "epoch": 0.46297872340425533, + "grad_norm": 0.07679017248783176, + "learning_rate": 9.684931083731578e-06, + "loss": 0.6039, + "step": 952 + }, + { + "epoch": 0.4634650455927052, + "grad_norm": 0.07042817668451752, + "learning_rate": 9.68426181857798e-06, + "loss": 0.5599, + "step": 953 + }, + { + "epoch": 0.46395136778115503, + "grad_norm": 0.07582605049687227, + "learning_rate": 9.683591866530582e-06, + "loss": 0.577, + "step": 954 + }, + { + "epoch": 0.4644376899696049, + "grad_norm": 0.07607125580240615, + "learning_rate": 9.682921227687622e-06, + "loss": 0.6175, + "step": 955 + }, + { + "epoch": 0.46492401215805473, + "grad_norm": 0.07679547238371429, + "learning_rate": 9.682249902147442e-06, + "loss": 0.6199, + "step": 956 + }, + { + "epoch": 0.4654103343465046, + "grad_norm": 0.0696181884733738, + "learning_rate": 9.681577890008485e-06, + "loss": 0.5577, + "step": 957 + }, + { + "epoch": 0.46589665653495443, + "grad_norm": 0.07824392162082655, + "learning_rate": 9.680905191369293e-06, + "loss": 0.6027, + "step": 958 + }, + { + "epoch": 0.46638297872340423, + "grad_norm": 0.07504234092024205, + "learning_rate": 9.680231806328509e-06, + "loss": 0.5968, + "step": 959 + }, + { + "epoch": 0.4668693009118541, + "grad_norm": 0.07245260973260609, + "learning_rate": 9.67955773498488e-06, + "loss": 0.5817, + "step": 960 + }, + { + "epoch": 0.46735562310030393, + "grad_norm": 0.07389774613836207, + "learning_rate": 9.678882977437248e-06, + "loss": 0.5845, + "step": 961 + }, + { + "epoch": 0.4678419452887538, + "grad_norm": 0.07849571359931412, + "learning_rate": 9.678207533784558e-06, + "loss": 0.6428, + "step": 962 + }, + { + "epoch": 0.46832826747720363, + "grad_norm": 0.07368316570430054, + "learning_rate": 9.67753140412586e-06, + "loss": 0.6082, + "step": 963 + }, + { + "epoch": 0.4688145896656535, + "grad_norm": 0.07588381016100772, + "learning_rate": 9.676854588560298e-06, + "loss": 0.6272, + "step": 964 + }, + { + "epoch": 0.46930091185410333, + "grad_norm": 0.0750664224771575, + "learning_rate": 9.67617708718712e-06, + "loss": 0.6318, + "step": 965 + }, + { + "epoch": 0.4697872340425532, + "grad_norm": 0.07174390421484508, + "learning_rate": 9.675498900105674e-06, + "loss": 0.5834, + "step": 966 + }, + { + "epoch": 0.47027355623100303, + "grad_norm": 0.07457658652906511, + "learning_rate": 9.674820027415406e-06, + "loss": 0.5752, + "step": 967 + }, + { + "epoch": 0.4707598784194529, + "grad_norm": 0.07643389304674403, + "learning_rate": 9.674140469215868e-06, + "loss": 0.5857, + "step": 968 + }, + { + "epoch": 0.47124620060790273, + "grad_norm": 0.07647902583321699, + "learning_rate": 9.673460225606711e-06, + "loss": 0.623, + "step": 969 + }, + { + "epoch": 0.4717325227963526, + "grad_norm": 0.07451087076680062, + "learning_rate": 9.672779296687678e-06, + "loss": 0.5752, + "step": 970 + }, + { + "epoch": 0.47221884498480243, + "grad_norm": 0.07683147292688484, + "learning_rate": 9.672097682558628e-06, + "loss": 0.6324, + "step": 971 + }, + { + "epoch": 0.4727051671732523, + "grad_norm": 0.07772949335564111, + "learning_rate": 9.671415383319507e-06, + "loss": 0.5916, + "step": 972 + }, + { + "epoch": 0.47319148936170213, + "grad_norm": 0.07308603579574793, + "learning_rate": 9.670732399070365e-06, + "loss": 0.6207, + "step": 973 + }, + { + "epoch": 0.473677811550152, + "grad_norm": 0.07629877966214045, + "learning_rate": 9.67004872991136e-06, + "loss": 0.5829, + "step": 974 + }, + { + "epoch": 0.47416413373860183, + "grad_norm": 0.07183852049069808, + "learning_rate": 9.669364375942739e-06, + "loss": 0.5644, + "step": 975 + }, + { + "epoch": 0.4746504559270517, + "grad_norm": 0.07019255651184401, + "learning_rate": 9.668679337264857e-06, + "loss": 0.5468, + "step": 976 + }, + { + "epoch": 0.47513677811550153, + "grad_norm": 0.07488267975270439, + "learning_rate": 9.667993613978166e-06, + "loss": 0.6012, + "step": 977 + }, + { + "epoch": 0.4756231003039514, + "grad_norm": 0.07423245735285204, + "learning_rate": 9.66730720618322e-06, + "loss": 0.5769, + "step": 978 + }, + { + "epoch": 0.47610942249240124, + "grad_norm": 0.07970041513583959, + "learning_rate": 9.666620113980673e-06, + "loss": 0.6254, + "step": 979 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 0.07116566520277325, + "learning_rate": 9.66593233747128e-06, + "loss": 0.5661, + "step": 980 + }, + { + "epoch": 0.47708206686930094, + "grad_norm": 0.07815806025417396, + "learning_rate": 9.665243876755894e-06, + "loss": 0.6078, + "step": 981 + }, + { + "epoch": 0.4775683890577508, + "grad_norm": 0.08098149359036909, + "learning_rate": 9.66455473193547e-06, + "loss": 0.6158, + "step": 982 + }, + { + "epoch": 0.4780547112462006, + "grad_norm": 0.07332806249391385, + "learning_rate": 9.663864903111066e-06, + "loss": 0.5837, + "step": 983 + }, + { + "epoch": 0.47854103343465043, + "grad_norm": 0.07488025281445045, + "learning_rate": 9.663174390383836e-06, + "loss": 0.606, + "step": 984 + }, + { + "epoch": 0.4790273556231003, + "grad_norm": 0.07572123285091018, + "learning_rate": 9.662483193855035e-06, + "loss": 0.6334, + "step": 985 + }, + { + "epoch": 0.47951367781155013, + "grad_norm": 0.07754112262676686, + "learning_rate": 9.661791313626019e-06, + "loss": 0.6101, + "step": 986 + }, + { + "epoch": 0.48, + "grad_norm": 0.07368653207471317, + "learning_rate": 9.661098749798243e-06, + "loss": 0.5957, + "step": 987 + }, + { + "epoch": 0.48048632218844983, + "grad_norm": 0.07534261258971515, + "learning_rate": 9.660405502473268e-06, + "loss": 0.6143, + "step": 988 + }, + { + "epoch": 0.4809726443768997, + "grad_norm": 0.07416568150076407, + "learning_rate": 9.659711571752749e-06, + "loss": 0.5545, + "step": 989 + }, + { + "epoch": 0.48145896656534953, + "grad_norm": 0.08096641649729179, + "learning_rate": 9.659016957738441e-06, + "loss": 0.5826, + "step": 990 + }, + { + "epoch": 0.4819452887537994, + "grad_norm": 0.07448803186437745, + "learning_rate": 9.658321660532204e-06, + "loss": 0.637, + "step": 991 + }, + { + "epoch": 0.48243161094224923, + "grad_norm": 0.07693468224925519, + "learning_rate": 9.657625680235994e-06, + "loss": 0.6109, + "step": 992 + }, + { + "epoch": 0.4829179331306991, + "grad_norm": 0.07642236003341642, + "learning_rate": 9.656929016951869e-06, + "loss": 0.5642, + "step": 993 + }, + { + "epoch": 0.48340425531914893, + "grad_norm": 0.08302559808727765, + "learning_rate": 9.656231670781987e-06, + "loss": 0.6498, + "step": 994 + }, + { + "epoch": 0.4838905775075988, + "grad_norm": 0.09619351084023765, + "learning_rate": 9.655533641828602e-06, + "loss": 0.6415, + "step": 995 + }, + { + "epoch": 0.48437689969604864, + "grad_norm": 0.08491930525223579, + "learning_rate": 9.654834930194079e-06, + "loss": 0.7193, + "step": 996 + }, + { + "epoch": 0.4848632218844985, + "grad_norm": 0.07929948822646465, + "learning_rate": 9.654135535980874e-06, + "loss": 0.602, + "step": 997 + }, + { + "epoch": 0.48534954407294834, + "grad_norm": 0.0738595479696469, + "learning_rate": 9.653435459291541e-06, + "loss": 0.5978, + "step": 998 + }, + { + "epoch": 0.4858358662613982, + "grad_norm": 0.07368254032340832, + "learning_rate": 9.65273470022874e-06, + "loss": 0.6041, + "step": 999 + }, + { + "epoch": 0.48632218844984804, + "grad_norm": 0.07542858510769612, + "learning_rate": 9.652033258895233e-06, + "loss": 0.5893, + "step": 1000 + }, + { + "epoch": 0.4868085106382979, + "grad_norm": 0.07435466428592213, + "learning_rate": 9.651331135393875e-06, + "loss": 0.5937, + "step": 1001 + }, + { + "epoch": 0.48729483282674774, + "grad_norm": 0.08410787611001522, + "learning_rate": 9.650628329827627e-06, + "loss": 0.589, + "step": 1002 + }, + { + "epoch": 0.4877811550151976, + "grad_norm": 0.07650370376747663, + "learning_rate": 9.649924842299544e-06, + "loss": 0.6184, + "step": 1003 + }, + { + "epoch": 0.48826747720364744, + "grad_norm": 0.07582375275769716, + "learning_rate": 9.649220672912788e-06, + "loss": 0.5942, + "step": 1004 + }, + { + "epoch": 0.4887537993920973, + "grad_norm": 0.07783386547488437, + "learning_rate": 9.648515821770612e-06, + "loss": 0.6128, + "step": 1005 + }, + { + "epoch": 0.48924012158054714, + "grad_norm": 0.07848464060947229, + "learning_rate": 9.647810288976381e-06, + "loss": 0.613, + "step": 1006 + }, + { + "epoch": 0.48972644376899693, + "grad_norm": 0.07508332012991602, + "learning_rate": 9.64710407463355e-06, + "loss": 0.6154, + "step": 1007 + }, + { + "epoch": 0.4902127659574468, + "grad_norm": 0.07382064657896553, + "learning_rate": 9.646397178845679e-06, + "loss": 0.5961, + "step": 1008 + }, + { + "epoch": 0.49069908814589663, + "grad_norm": 0.0830034984028087, + "learning_rate": 9.645689601716424e-06, + "loss": 0.5727, + "step": 1009 + }, + { + "epoch": 0.4911854103343465, + "grad_norm": 0.07249779314047998, + "learning_rate": 9.644981343349545e-06, + "loss": 0.6099, + "step": 1010 + }, + { + "epoch": 0.49167173252279633, + "grad_norm": 0.08514650264082513, + "learning_rate": 9.644272403848897e-06, + "loss": 0.6283, + "step": 1011 + }, + { + "epoch": 0.4921580547112462, + "grad_norm": 0.08050869620416247, + "learning_rate": 9.64356278331844e-06, + "loss": 0.6077, + "step": 1012 + }, + { + "epoch": 0.49264437689969603, + "grad_norm": 0.07509472703104218, + "learning_rate": 9.642852481862235e-06, + "loss": 0.5929, + "step": 1013 + }, + { + "epoch": 0.4931306990881459, + "grad_norm": 0.07324237738835222, + "learning_rate": 9.642141499584436e-06, + "loss": 0.5893, + "step": 1014 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 0.07673034617567834, + "learning_rate": 9.6414298365893e-06, + "loss": 0.634, + "step": 1015 + }, + { + "epoch": 0.4941033434650456, + "grad_norm": 0.0726904256785006, + "learning_rate": 9.640717492981185e-06, + "loss": 0.5807, + "step": 1016 + }, + { + "epoch": 0.49458966565349544, + "grad_norm": 0.07479533320906948, + "learning_rate": 9.64000446886455e-06, + "loss": 0.6042, + "step": 1017 + }, + { + "epoch": 0.4950759878419453, + "grad_norm": 0.07960652206021393, + "learning_rate": 9.63929076434395e-06, + "loss": 0.6331, + "step": 1018 + }, + { + "epoch": 0.49556231003039514, + "grad_norm": 0.07046297273903293, + "learning_rate": 9.638576379524041e-06, + "loss": 0.546, + "step": 1019 + }, + { + "epoch": 0.496048632218845, + "grad_norm": 0.0801776950275796, + "learning_rate": 9.63786131450958e-06, + "loss": 0.6363, + "step": 1020 + }, + { + "epoch": 0.49653495440729484, + "grad_norm": 0.07445033127789338, + "learning_rate": 9.637145569405426e-06, + "loss": 0.5629, + "step": 1021 + }, + { + "epoch": 0.4970212765957447, + "grad_norm": 0.07526862940422334, + "learning_rate": 9.63642914431653e-06, + "loss": 0.6108, + "step": 1022 + }, + { + "epoch": 0.49750759878419454, + "grad_norm": 0.07254346330912441, + "learning_rate": 9.635712039347953e-06, + "loss": 0.594, + "step": 1023 + }, + { + "epoch": 0.4979939209726444, + "grad_norm": 0.0780571845691102, + "learning_rate": 9.634994254604845e-06, + "loss": 0.595, + "step": 1024 + }, + { + "epoch": 0.49848024316109424, + "grad_norm": 0.0783126025407137, + "learning_rate": 9.634275790192464e-06, + "loss": 0.615, + "step": 1025 + }, + { + "epoch": 0.4989665653495441, + "grad_norm": 0.07302505638797673, + "learning_rate": 9.633556646216164e-06, + "loss": 0.5855, + "step": 1026 + }, + { + "epoch": 0.49945288753799394, + "grad_norm": 0.07082429330689427, + "learning_rate": 9.6328368227814e-06, + "loss": 0.5242, + "step": 1027 + }, + { + "epoch": 0.4999392097264438, + "grad_norm": 0.07407361719321554, + "learning_rate": 9.632116319993726e-06, + "loss": 0.5895, + "step": 1028 + }, + { + "epoch": 0.4999392097264438, + "eval_loss": 0.6022451519966125, + "eval_runtime": 105.1713, + "eval_samples_per_second": 288.605, + "eval_steps_per_second": 36.084, + "step": 1028 + }, + { + "epoch": 0.5004255319148936, + "grad_norm": 0.07491363414159202, + "learning_rate": 9.631395137958792e-06, + "loss": 0.6184, + "step": 1029 + }, + { + "epoch": 0.5009118541033435, + "grad_norm": 0.07497897535023801, + "learning_rate": 9.630673276782356e-06, + "loss": 0.6243, + "step": 1030 + }, + { + "epoch": 0.5013981762917933, + "grad_norm": 0.07384236285385469, + "learning_rate": 9.629950736570268e-06, + "loss": 0.6156, + "step": 1031 + }, + { + "epoch": 0.5018844984802432, + "grad_norm": 0.07873756699993263, + "learning_rate": 9.629227517428482e-06, + "loss": 0.5716, + "step": 1032 + }, + { + "epoch": 0.502370820668693, + "grad_norm": 0.07576613933547476, + "learning_rate": 9.628503619463049e-06, + "loss": 0.6326, + "step": 1033 + }, + { + "epoch": 0.5028571428571429, + "grad_norm": 0.07500439174705274, + "learning_rate": 9.62777904278012e-06, + "loss": 0.6272, + "step": 1034 + }, + { + "epoch": 0.5033434650455927, + "grad_norm": 0.07537447974397528, + "learning_rate": 9.627053787485944e-06, + "loss": 0.5918, + "step": 1035 + }, + { + "epoch": 0.5038297872340426, + "grad_norm": 0.074503245299955, + "learning_rate": 9.626327853686877e-06, + "loss": 0.5583, + "step": 1036 + }, + { + "epoch": 0.5043161094224924, + "grad_norm": 0.075726082212813, + "learning_rate": 9.625601241489365e-06, + "loss": 0.6056, + "step": 1037 + }, + { + "epoch": 0.5048024316109423, + "grad_norm": 0.07059161411427989, + "learning_rate": 9.624873950999958e-06, + "loss": 0.5654, + "step": 1038 + }, + { + "epoch": 0.5052887537993921, + "grad_norm": 0.07767977093951244, + "learning_rate": 9.624145982325303e-06, + "loss": 0.6159, + "step": 1039 + }, + { + "epoch": 0.505775075987842, + "grad_norm": 0.07830933778804226, + "learning_rate": 9.623417335572155e-06, + "loss": 0.6052, + "step": 1040 + }, + { + "epoch": 0.5062613981762918, + "grad_norm": 0.07804381159534474, + "learning_rate": 9.622688010847352e-06, + "loss": 0.5782, + "step": 1041 + }, + { + "epoch": 0.5067477203647417, + "grad_norm": 0.07235693447421539, + "learning_rate": 9.621958008257848e-06, + "loss": 0.5685, + "step": 1042 + }, + { + "epoch": 0.5072340425531915, + "grad_norm": 0.07672947712687692, + "learning_rate": 9.62122732791069e-06, + "loss": 0.6142, + "step": 1043 + }, + { + "epoch": 0.5077203647416413, + "grad_norm": 0.07598179177937309, + "learning_rate": 9.62049596991302e-06, + "loss": 0.601, + "step": 1044 + }, + { + "epoch": 0.5082066869300912, + "grad_norm": 0.07957812258745778, + "learning_rate": 9.619763934372084e-06, + "loss": 0.6012, + "step": 1045 + }, + { + "epoch": 0.508693009118541, + "grad_norm": 0.07179777474901587, + "learning_rate": 9.61903122139523e-06, + "loss": 0.5676, + "step": 1046 + }, + { + "epoch": 0.5091793313069909, + "grad_norm": 0.07119213189596682, + "learning_rate": 9.6182978310899e-06, + "loss": 0.5481, + "step": 1047 + }, + { + "epoch": 0.5096656534954407, + "grad_norm": 0.07484075027205032, + "learning_rate": 9.617563763563635e-06, + "loss": 0.6257, + "step": 1048 + }, + { + "epoch": 0.5101519756838906, + "grad_norm": 0.07484625229774869, + "learning_rate": 9.616829018924083e-06, + "loss": 0.599, + "step": 1049 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 0.07479653876160268, + "learning_rate": 9.616093597278981e-06, + "loss": 0.5815, + "step": 1050 + }, + { + "epoch": 0.5111246200607903, + "grad_norm": 0.07920682715652025, + "learning_rate": 9.615357498736172e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.5116109422492401, + "grad_norm": 0.079108272703273, + "learning_rate": 9.614620723403599e-06, + "loss": 0.6223, + "step": 1052 + }, + { + "epoch": 0.51209726443769, + "grad_norm": 0.07666487314988865, + "learning_rate": 9.613883271389297e-06, + "loss": 0.5949, + "step": 1053 + }, + { + "epoch": 0.5125835866261398, + "grad_norm": 0.07796212649776288, + "learning_rate": 9.613145142801407e-06, + "loss": 0.6169, + "step": 1054 + }, + { + "epoch": 0.5130699088145897, + "grad_norm": 0.07727794160332548, + "learning_rate": 9.61240633774817e-06, + "loss": 0.5608, + "step": 1055 + }, + { + "epoch": 0.5135562310030395, + "grad_norm": 0.23440049337137667, + "learning_rate": 9.61166685633792e-06, + "loss": 0.6003, + "step": 1056 + }, + { + "epoch": 0.5140425531914894, + "grad_norm": 0.07302507192319002, + "learning_rate": 9.610926698679093e-06, + "loss": 0.5901, + "step": 1057 + }, + { + "epoch": 0.5145288753799392, + "grad_norm": 0.0793574387041743, + "learning_rate": 9.610185864880228e-06, + "loss": 0.6337, + "step": 1058 + }, + { + "epoch": 0.5150151975683891, + "grad_norm": 0.07760275379212223, + "learning_rate": 9.609444355049957e-06, + "loss": 0.6091, + "step": 1059 + }, + { + "epoch": 0.5155015197568389, + "grad_norm": 0.07644658434262741, + "learning_rate": 9.608702169297014e-06, + "loss": 0.6041, + "step": 1060 + }, + { + "epoch": 0.5159878419452888, + "grad_norm": 0.07579633137358423, + "learning_rate": 9.607959307730237e-06, + "loss": 0.6002, + "step": 1061 + }, + { + "epoch": 0.5164741641337386, + "grad_norm": 0.07601248234149546, + "learning_rate": 9.607215770458551e-06, + "loss": 0.6262, + "step": 1062 + }, + { + "epoch": 0.5169604863221885, + "grad_norm": 0.07614345620960081, + "learning_rate": 9.606471557590992e-06, + "loss": 0.5828, + "step": 1063 + }, + { + "epoch": 0.5174468085106383, + "grad_norm": 0.0771995257204074, + "learning_rate": 9.605726669236688e-06, + "loss": 0.6175, + "step": 1064 + }, + { + "epoch": 0.5179331306990882, + "grad_norm": 0.07387346768464613, + "learning_rate": 9.60498110550487e-06, + "loss": 0.6017, + "step": 1065 + }, + { + "epoch": 0.518419452887538, + "grad_norm": 0.08075466368475279, + "learning_rate": 9.604234866504868e-06, + "loss": 0.6301, + "step": 1066 + }, + { + "epoch": 0.5189057750759879, + "grad_norm": 0.07266668357878729, + "learning_rate": 9.603487952346104e-06, + "loss": 0.5699, + "step": 1067 + }, + { + "epoch": 0.5193920972644377, + "grad_norm": 0.07294684771036698, + "learning_rate": 9.602740363138108e-06, + "loss": 0.5854, + "step": 1068 + }, + { + "epoch": 0.5198784194528875, + "grad_norm": 0.07431612814625059, + "learning_rate": 9.601992098990506e-06, + "loss": 0.615, + "step": 1069 + }, + { + "epoch": 0.5203647416413374, + "grad_norm": 0.07444677404238248, + "learning_rate": 9.601243160013023e-06, + "loss": 0.5647, + "step": 1070 + }, + { + "epoch": 0.5208510638297872, + "grad_norm": 0.0785028987815514, + "learning_rate": 9.600493546315482e-06, + "loss": 0.5966, + "step": 1071 + }, + { + "epoch": 0.5213373860182371, + "grad_norm": 0.07459638381862556, + "learning_rate": 9.599743258007803e-06, + "loss": 0.5662, + "step": 1072 + }, + { + "epoch": 0.5218237082066869, + "grad_norm": 0.07701982569012482, + "learning_rate": 9.598992295200007e-06, + "loss": 0.6135, + "step": 1073 + }, + { + "epoch": 0.5223100303951368, + "grad_norm": 0.07516017019795646, + "learning_rate": 9.598240658002217e-06, + "loss": 0.5883, + "step": 1074 + }, + { + "epoch": 0.5227963525835866, + "grad_norm": 0.07210430860226744, + "learning_rate": 9.597488346524653e-06, + "loss": 0.57, + "step": 1075 + }, + { + "epoch": 0.5232826747720365, + "grad_norm": 0.07697882335037791, + "learning_rate": 9.59673536087763e-06, + "loss": 0.6177, + "step": 1076 + }, + { + "epoch": 0.5237689969604863, + "grad_norm": 0.08038171405169131, + "learning_rate": 9.595981701171564e-06, + "loss": 0.6211, + "step": 1077 + }, + { + "epoch": 0.5242553191489362, + "grad_norm": 0.07202005407284515, + "learning_rate": 9.595227367516974e-06, + "loss": 0.5517, + "step": 1078 + }, + { + "epoch": 0.524741641337386, + "grad_norm": 0.07598168784271613, + "learning_rate": 9.594472360024472e-06, + "loss": 0.6156, + "step": 1079 + }, + { + "epoch": 0.5252279635258359, + "grad_norm": 0.07380647747687255, + "learning_rate": 9.593716678804772e-06, + "loss": 0.5999, + "step": 1080 + }, + { + "epoch": 0.5257142857142857, + "grad_norm": 0.07510898117414629, + "learning_rate": 9.592960323968688e-06, + "loss": 0.6014, + "step": 1081 + }, + { + "epoch": 0.5262006079027356, + "grad_norm": 0.07877687247884968, + "learning_rate": 9.592203295627127e-06, + "loss": 0.6071, + "step": 1082 + }, + { + "epoch": 0.5266869300911854, + "grad_norm": 0.07301529882321994, + "learning_rate": 9.591445593891102e-06, + "loss": 0.5824, + "step": 1083 + }, + { + "epoch": 0.5271732522796353, + "grad_norm": 0.073996777203989, + "learning_rate": 9.59068721887172e-06, + "loss": 0.5965, + "step": 1084 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 0.07614222372100486, + "learning_rate": 9.589928170680186e-06, + "loss": 0.6096, + "step": 1085 + }, + { + "epoch": 0.528145896656535, + "grad_norm": 0.07907477045381418, + "learning_rate": 9.58916844942781e-06, + "loss": 0.5949, + "step": 1086 + }, + { + "epoch": 0.5286322188449848, + "grad_norm": 0.0742931640338888, + "learning_rate": 9.588408055225992e-06, + "loss": 0.6046, + "step": 1087 + }, + { + "epoch": 0.5291185410334347, + "grad_norm": 0.07792562293019752, + "learning_rate": 9.58764698818624e-06, + "loss": 0.5945, + "step": 1088 + }, + { + "epoch": 0.5296048632218845, + "grad_norm": 0.07301008726052358, + "learning_rate": 9.586885248420152e-06, + "loss": 0.5662, + "step": 1089 + }, + { + "epoch": 0.5300911854103344, + "grad_norm": 0.0755314524155536, + "learning_rate": 9.586122836039432e-06, + "loss": 0.5783, + "step": 1090 + }, + { + "epoch": 0.5305775075987842, + "grad_norm": 0.07434203408018285, + "learning_rate": 9.585359751155874e-06, + "loss": 0.5477, + "step": 1091 + }, + { + "epoch": 0.531063829787234, + "grad_norm": 0.07538682718890699, + "learning_rate": 9.58459599388138e-06, + "loss": 0.6004, + "step": 1092 + }, + { + "epoch": 0.5315501519756839, + "grad_norm": 0.07136586709394917, + "learning_rate": 9.583831564327945e-06, + "loss": 0.5912, + "step": 1093 + }, + { + "epoch": 0.5320364741641337, + "grad_norm": 0.0767772025676226, + "learning_rate": 9.583066462607664e-06, + "loss": 0.6308, + "step": 1094 + }, + { + "epoch": 0.5325227963525836, + "grad_norm": 0.0726085661295553, + "learning_rate": 9.58230068883273e-06, + "loss": 0.6073, + "step": 1095 + }, + { + "epoch": 0.5330091185410334, + "grad_norm": 0.07517222027021538, + "learning_rate": 9.581534243115437e-06, + "loss": 0.5522, + "step": 1096 + }, + { + "epoch": 0.5334954407294833, + "grad_norm": 0.07273686570952088, + "learning_rate": 9.580767125568172e-06, + "loss": 0.6008, + "step": 1097 + }, + { + "epoch": 0.5339817629179331, + "grad_norm": 0.07447422341141843, + "learning_rate": 9.579999336303427e-06, + "loss": 0.6049, + "step": 1098 + }, + { + "epoch": 0.534468085106383, + "grad_norm": 0.07004702097814727, + "learning_rate": 9.579230875433788e-06, + "loss": 0.5823, + "step": 1099 + }, + { + "epoch": 0.5349544072948328, + "grad_norm": 0.07856976194826039, + "learning_rate": 9.578461743071943e-06, + "loss": 0.6126, + "step": 1100 + }, + { + "epoch": 0.5354407294832827, + "grad_norm": 0.07457931409633366, + "learning_rate": 9.577691939330675e-06, + "loss": 0.5633, + "step": 1101 + }, + { + "epoch": 0.5359270516717325, + "grad_norm": 0.07180677894012881, + "learning_rate": 9.576921464322866e-06, + "loss": 0.577, + "step": 1102 + }, + { + "epoch": 0.5364133738601824, + "grad_norm": 0.07720956030452346, + "learning_rate": 9.576150318161499e-06, + "loss": 0.587, + "step": 1103 + }, + { + "epoch": 0.5368996960486322, + "grad_norm": 0.07543257359756422, + "learning_rate": 9.575378500959654e-06, + "loss": 0.6025, + "step": 1104 + }, + { + "epoch": 0.5373860182370821, + "grad_norm": 0.07639144543849306, + "learning_rate": 9.574606012830509e-06, + "loss": 0.5813, + "step": 1105 + }, + { + "epoch": 0.5378723404255319, + "grad_norm": 0.07846367381718024, + "learning_rate": 9.57383285388734e-06, + "loss": 0.6197, + "step": 1106 + }, + { + "epoch": 0.5383586626139818, + "grad_norm": 0.07551930929835175, + "learning_rate": 9.573059024243522e-06, + "loss": 0.6154, + "step": 1107 + }, + { + "epoch": 0.5388449848024316, + "grad_norm": 0.074725189306958, + "learning_rate": 9.57228452401253e-06, + "loss": 0.5637, + "step": 1108 + }, + { + "epoch": 0.5393313069908815, + "grad_norm": 0.07981735973049686, + "learning_rate": 9.571509353307933e-06, + "loss": 0.6231, + "step": 1109 + }, + { + "epoch": 0.5398176291793313, + "grad_norm": 0.08027613534457816, + "learning_rate": 9.570733512243402e-06, + "loss": 0.6377, + "step": 1110 + }, + { + "epoch": 0.5403039513677812, + "grad_norm": 0.07335214025548115, + "learning_rate": 9.569957000932706e-06, + "loss": 0.5974, + "step": 1111 + }, + { + "epoch": 0.540790273556231, + "grad_norm": 0.07366471459190997, + "learning_rate": 9.569179819489712e-06, + "loss": 0.5717, + "step": 1112 + }, + { + "epoch": 0.5412765957446809, + "grad_norm": 0.07460379681865435, + "learning_rate": 9.568401968028382e-06, + "loss": 0.6122, + "step": 1113 + }, + { + "epoch": 0.5417629179331307, + "grad_norm": 0.07035806810923242, + "learning_rate": 9.567623446662781e-06, + "loss": 0.6119, + "step": 1114 + }, + { + "epoch": 0.5422492401215806, + "grad_norm": 0.07634389129306696, + "learning_rate": 9.566844255507073e-06, + "loss": 0.6078, + "step": 1115 + }, + { + "epoch": 0.5427355623100304, + "grad_norm": 0.07066396119916037, + "learning_rate": 9.566064394675511e-06, + "loss": 0.5897, + "step": 1116 + }, + { + "epoch": 0.5432218844984802, + "grad_norm": 0.07917244664151035, + "learning_rate": 9.56528386428246e-06, + "loss": 0.5941, + "step": 1117 + }, + { + "epoch": 0.5437082066869301, + "grad_norm": 0.07560950730130118, + "learning_rate": 9.564502664442371e-06, + "loss": 0.5868, + "step": 1118 + }, + { + "epoch": 0.5441945288753799, + "grad_norm": 0.07493054510390476, + "learning_rate": 9.563720795269801e-06, + "loss": 0.6076, + "step": 1119 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 0.07613734596151789, + "learning_rate": 9.5629382568794e-06, + "loss": 0.5915, + "step": 1120 + }, + { + "epoch": 0.5451671732522796, + "grad_norm": 0.0727097956776757, + "learning_rate": 9.562155049385919e-06, + "loss": 0.5966, + "step": 1121 + }, + { + "epoch": 0.5456534954407295, + "grad_norm": 0.07573881937442128, + "learning_rate": 9.561371172904207e-06, + "loss": 0.6126, + "step": 1122 + }, + { + "epoch": 0.5461398176291793, + "grad_norm": 0.0763737905532064, + "learning_rate": 9.56058662754921e-06, + "loss": 0.6469, + "step": 1123 + }, + { + "epoch": 0.5466261398176292, + "grad_norm": 0.07512554259787733, + "learning_rate": 9.559801413435972e-06, + "loss": 0.6035, + "step": 1124 + }, + { + "epoch": 0.547112462006079, + "grad_norm": 0.07171943476778324, + "learning_rate": 9.559015530679639e-06, + "loss": 0.577, + "step": 1125 + }, + { + "epoch": 0.5475987841945289, + "grad_norm": 0.07556762598612062, + "learning_rate": 9.558228979395448e-06, + "loss": 0.5912, + "step": 1126 + }, + { + "epoch": 0.5480851063829787, + "grad_norm": 0.07845005559698841, + "learning_rate": 9.557441759698741e-06, + "loss": 0.6058, + "step": 1127 + }, + { + "epoch": 0.5485714285714286, + "grad_norm": 0.07541322310557426, + "learning_rate": 9.556653871704951e-06, + "loss": 0.62, + "step": 1128 + }, + { + "epoch": 0.5490577507598784, + "grad_norm": 0.07964467770841274, + "learning_rate": 9.555865315529616e-06, + "loss": 0.5998, + "step": 1129 + }, + { + "epoch": 0.5495440729483283, + "grad_norm": 0.07708081006027927, + "learning_rate": 9.555076091288366e-06, + "loss": 0.6094, + "step": 1130 + }, + { + "epoch": 0.5500303951367781, + "grad_norm": 0.07388688075123805, + "learning_rate": 9.554286199096937e-06, + "loss": 0.6088, + "step": 1131 + }, + { + "epoch": 0.550516717325228, + "grad_norm": 0.07820699280148219, + "learning_rate": 9.553495639071152e-06, + "loss": 0.6175, + "step": 1132 + }, + { + "epoch": 0.5510030395136778, + "grad_norm": 0.07895001899338633, + "learning_rate": 9.552704411326938e-06, + "loss": 0.5891, + "step": 1133 + }, + { + "epoch": 0.5514893617021277, + "grad_norm": 0.07382050601093598, + "learning_rate": 9.551912515980323e-06, + "loss": 0.5627, + "step": 1134 + }, + { + "epoch": 0.5519756838905775, + "grad_norm": 0.07499578199643642, + "learning_rate": 9.55111995314743e-06, + "loss": 0.5776, + "step": 1135 + }, + { + "epoch": 0.5524620060790274, + "grad_norm": 0.07965034804950924, + "learning_rate": 9.550326722944476e-06, + "loss": 0.6498, + "step": 1136 + }, + { + "epoch": 0.5529483282674772, + "grad_norm": 0.07527774360450135, + "learning_rate": 9.54953282548778e-06, + "loss": 0.6389, + "step": 1137 + }, + { + "epoch": 0.5534346504559271, + "grad_norm": 0.07606955485113887, + "learning_rate": 9.548738260893759e-06, + "loss": 0.6579, + "step": 1138 + }, + { + "epoch": 0.5539209726443769, + "grad_norm": 0.07382363701618233, + "learning_rate": 9.547943029278925e-06, + "loss": 0.5796, + "step": 1139 + }, + { + "epoch": 0.5544072948328268, + "grad_norm": 0.0694551534818775, + "learning_rate": 9.547147130759894e-06, + "loss": 0.5739, + "step": 1140 + }, + { + "epoch": 0.5548936170212766, + "grad_norm": 0.07373619111569436, + "learning_rate": 9.546350565453368e-06, + "loss": 0.5837, + "step": 1141 + }, + { + "epoch": 0.5553799392097264, + "grad_norm": 0.07491697143764887, + "learning_rate": 9.545553333476164e-06, + "loss": 0.6072, + "step": 1142 + }, + { + "epoch": 0.5558662613981763, + "grad_norm": 0.07300765249514717, + "learning_rate": 9.544755434945178e-06, + "loss": 0.5794, + "step": 1143 + }, + { + "epoch": 0.5563525835866261, + "grad_norm": 0.07219044733634956, + "learning_rate": 9.543956869977418e-06, + "loss": 0.5789, + "step": 1144 + }, + { + "epoch": 0.556838905775076, + "grad_norm": 0.07708903089414154, + "learning_rate": 9.543157638689982e-06, + "loss": 0.6116, + "step": 1145 + }, + { + "epoch": 0.5573252279635258, + "grad_norm": 0.07814561564178836, + "learning_rate": 9.542357741200071e-06, + "loss": 0.6113, + "step": 1146 + }, + { + "epoch": 0.5578115501519757, + "grad_norm": 0.07727051772150526, + "learning_rate": 9.541557177624978e-06, + "loss": 0.5911, + "step": 1147 + }, + { + "epoch": 0.5582978723404255, + "grad_norm": 0.10617057932196375, + "learning_rate": 9.5407559480821e-06, + "loss": 0.5659, + "step": 1148 + }, + { + "epoch": 0.5587841945288754, + "grad_norm": 0.0751891554302743, + "learning_rate": 9.539954052688921e-06, + "loss": 0.5825, + "step": 1149 + }, + { + "epoch": 0.5592705167173252, + "grad_norm": 0.0774385166597363, + "learning_rate": 9.53915149156304e-06, + "loss": 0.5873, + "step": 1150 + }, + { + "epoch": 0.5597568389057751, + "grad_norm": 0.0692018140462834, + "learning_rate": 9.538348264822135e-06, + "loss": 0.5958, + "step": 1151 + }, + { + "epoch": 0.5602431610942249, + "grad_norm": 0.0728596158882538, + "learning_rate": 9.537544372583996e-06, + "loss": 0.5913, + "step": 1152 + }, + { + "epoch": 0.5607294832826748, + "grad_norm": 0.0716324183449931, + "learning_rate": 9.536739814966499e-06, + "loss": 0.567, + "step": 1153 + }, + { + "epoch": 0.5612158054711246, + "grad_norm": 0.07715957661852073, + "learning_rate": 9.535934592087627e-06, + "loss": 0.6288, + "step": 1154 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 0.07297922165750517, + "learning_rate": 9.535128704065457e-06, + "loss": 0.643, + "step": 1155 + }, + { + "epoch": 0.5621884498480243, + "grad_norm": 0.09017847485980017, + "learning_rate": 9.534322151018163e-06, + "loss": 0.6377, + "step": 1156 + }, + { + "epoch": 0.5626747720364742, + "grad_norm": 0.07478377162164523, + "learning_rate": 9.533514933064015e-06, + "loss": 0.5626, + "step": 1157 + }, + { + "epoch": 0.563161094224924, + "grad_norm": 0.07239324297731042, + "learning_rate": 9.532707050321384e-06, + "loss": 0.5809, + "step": 1158 + }, + { + "epoch": 0.5636474164133739, + "grad_norm": 0.0746734820682129, + "learning_rate": 9.531898502908735e-06, + "loss": 0.6449, + "step": 1159 + }, + { + "epoch": 0.5641337386018237, + "grad_norm": 0.08138913592038577, + "learning_rate": 9.531089290944636e-06, + "loss": 0.5791, + "step": 1160 + }, + { + "epoch": 0.5646200607902736, + "grad_norm": 0.07793773324359105, + "learning_rate": 9.530279414547743e-06, + "loss": 0.6197, + "step": 1161 + }, + { + "epoch": 0.5651063829787234, + "grad_norm": 0.07178282678689916, + "learning_rate": 9.529468873836822e-06, + "loss": 0.5766, + "step": 1162 + }, + { + "epoch": 0.5655927051671733, + "grad_norm": 0.07242130866068215, + "learning_rate": 9.528657668930724e-06, + "loss": 0.5622, + "step": 1163 + }, + { + "epoch": 0.5660790273556231, + "grad_norm": 0.07590187583915445, + "learning_rate": 9.527845799948407e-06, + "loss": 0.5872, + "step": 1164 + }, + { + "epoch": 0.5665653495440729, + "grad_norm": 0.07302200706948855, + "learning_rate": 9.52703326700892e-06, + "loss": 0.6151, + "step": 1165 + }, + { + "epoch": 0.5670516717325228, + "grad_norm": 0.07078885700845401, + "learning_rate": 9.526220070231412e-06, + "loss": 0.5586, + "step": 1166 + }, + { + "epoch": 0.5675379939209726, + "grad_norm": 0.07663254444990945, + "learning_rate": 9.52540620973513e-06, + "loss": 0.5818, + "step": 1167 + }, + { + "epoch": 0.5680243161094225, + "grad_norm": 0.07387885600875368, + "learning_rate": 9.524591685639414e-06, + "loss": 0.5652, + "step": 1168 + }, + { + "epoch": 0.5685106382978723, + "grad_norm": 0.07383673904850574, + "learning_rate": 9.523776498063709e-06, + "loss": 0.599, + "step": 1169 + }, + { + "epoch": 0.5689969604863222, + "grad_norm": 0.07660471356355565, + "learning_rate": 9.522960647127553e-06, + "loss": 0.5769, + "step": 1170 + }, + { + "epoch": 0.569483282674772, + "grad_norm": 0.07693800724854444, + "learning_rate": 9.522144132950576e-06, + "loss": 0.6037, + "step": 1171 + }, + { + "epoch": 0.5699696048632219, + "grad_norm": 0.07342133761920683, + "learning_rate": 9.52132695565252e-06, + "loss": 0.5784, + "step": 1172 + }, + { + "epoch": 0.5704559270516717, + "grad_norm": 0.07239702949908601, + "learning_rate": 9.520509115353205e-06, + "loss": 0.594, + "step": 1173 + }, + { + "epoch": 0.5709422492401216, + "grad_norm": 0.07318922399266402, + "learning_rate": 9.519690612172563e-06, + "loss": 0.5676, + "step": 1174 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.08462636739837555, + "learning_rate": 9.518871446230616e-06, + "loss": 0.6666, + "step": 1175 + }, + { + "epoch": 0.5719148936170213, + "grad_norm": 0.07147104118893316, + "learning_rate": 9.518051617647488e-06, + "loss": 0.5904, + "step": 1176 + }, + { + "epoch": 0.5724012158054711, + "grad_norm": 0.07543685467803073, + "learning_rate": 9.517231126543396e-06, + "loss": 0.6038, + "step": 1177 + }, + { + "epoch": 0.572887537993921, + "grad_norm": 0.07442965675655525, + "learning_rate": 9.516409973038655e-06, + "loss": 0.5654, + "step": 1178 + }, + { + "epoch": 0.5733738601823708, + "grad_norm": 0.07422810108869447, + "learning_rate": 9.515588157253679e-06, + "loss": 0.6028, + "step": 1179 + }, + { + "epoch": 0.5738601823708207, + "grad_norm": 0.07281447082147334, + "learning_rate": 9.514765679308979e-06, + "loss": 0.5872, + "step": 1180 + }, + { + "epoch": 0.5743465045592705, + "grad_norm": 0.07343713574963837, + "learning_rate": 9.513942539325158e-06, + "loss": 0.5569, + "step": 1181 + }, + { + "epoch": 0.5748328267477204, + "grad_norm": 0.07524771751860296, + "learning_rate": 9.513118737422926e-06, + "loss": 0.6022, + "step": 1182 + }, + { + "epoch": 0.5753191489361702, + "grad_norm": 0.0759502988036261, + "learning_rate": 9.51229427372308e-06, + "loss": 0.5824, + "step": 1183 + }, + { + "epoch": 0.5758054711246201, + "grad_norm": 0.07450311723224759, + "learning_rate": 9.511469148346517e-06, + "loss": 0.5869, + "step": 1184 + }, + { + "epoch": 0.5762917933130699, + "grad_norm": 0.07372768985360179, + "learning_rate": 9.510643361414236e-06, + "loss": 0.6092, + "step": 1185 + }, + { + "epoch": 0.5767781155015198, + "grad_norm": 0.07135395518528469, + "learning_rate": 9.50981691304733e-06, + "loss": 0.5766, + "step": 1186 + }, + { + "epoch": 0.5772644376899696, + "grad_norm": 0.07703190611762453, + "learning_rate": 9.508989803366984e-06, + "loss": 0.5964, + "step": 1187 + }, + { + "epoch": 0.5777507598784195, + "grad_norm": 0.07712962057222558, + "learning_rate": 9.508162032494485e-06, + "loss": 0.6346, + "step": 1188 + }, + { + "epoch": 0.5782370820668693, + "grad_norm": 0.07449114456144172, + "learning_rate": 9.50733360055122e-06, + "loss": 0.6099, + "step": 1189 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 0.07904267706379085, + "learning_rate": 9.506504507658665e-06, + "loss": 0.6109, + "step": 1190 + }, + { + "epoch": 0.579209726443769, + "grad_norm": 0.07811448853572688, + "learning_rate": 9.5056747539384e-06, + "loss": 0.6281, + "step": 1191 + }, + { + "epoch": 0.5796960486322188, + "grad_norm": 0.07482062202862808, + "learning_rate": 9.504844339512096e-06, + "loss": 0.564, + "step": 1192 + }, + { + "epoch": 0.5801823708206687, + "grad_norm": 0.07577843012573345, + "learning_rate": 9.504013264501526e-06, + "loss": 0.6126, + "step": 1193 + }, + { + "epoch": 0.5806686930091185, + "grad_norm": 0.07522558537462215, + "learning_rate": 9.503181529028558e-06, + "loss": 0.5979, + "step": 1194 + }, + { + "epoch": 0.5811550151975684, + "grad_norm": 0.08146507705850126, + "learning_rate": 9.502349133215156e-06, + "loss": 0.6421, + "step": 1195 + }, + { + "epoch": 0.5816413373860182, + "grad_norm": 0.07371125397635524, + "learning_rate": 9.501516077183381e-06, + "loss": 0.5843, + "step": 1196 + }, + { + "epoch": 0.5821276595744681, + "grad_norm": 0.07504681158171868, + "learning_rate": 9.500682361055391e-06, + "loss": 0.6335, + "step": 1197 + }, + { + "epoch": 0.5826139817629179, + "grad_norm": 0.07397798706558126, + "learning_rate": 9.49984798495344e-06, + "loss": 0.5437, + "step": 1198 + }, + { + "epoch": 0.5831003039513678, + "grad_norm": 0.07154544956037139, + "learning_rate": 9.499012948999884e-06, + "loss": 0.536, + "step": 1199 + }, + { + "epoch": 0.5835866261398176, + "grad_norm": 0.07402888598281882, + "learning_rate": 9.498177253317167e-06, + "loss": 0.619, + "step": 1200 + }, + { + "epoch": 0.5840729483282675, + "grad_norm": 0.07332466029099505, + "learning_rate": 9.497340898027836e-06, + "loss": 0.6025, + "step": 1201 + }, + { + "epoch": 0.5845592705167173, + "grad_norm": 0.07512862470619239, + "learning_rate": 9.496503883254534e-06, + "loss": 0.6272, + "step": 1202 + }, + { + "epoch": 0.5850455927051672, + "grad_norm": 0.07337634551222842, + "learning_rate": 9.495666209119998e-06, + "loss": 0.5668, + "step": 1203 + }, + { + "epoch": 0.585531914893617, + "grad_norm": 0.07505241723685735, + "learning_rate": 9.494827875747064e-06, + "loss": 0.5754, + "step": 1204 + }, + { + "epoch": 0.5860182370820669, + "grad_norm": 0.0730663492813903, + "learning_rate": 9.493988883258664e-06, + "loss": 0.5829, + "step": 1205 + }, + { + "epoch": 0.5865045592705167, + "grad_norm": 0.07621175142706714, + "learning_rate": 9.493149231777828e-06, + "loss": 0.5577, + "step": 1206 + }, + { + "epoch": 0.5869908814589666, + "grad_norm": 0.07830307485344025, + "learning_rate": 9.492308921427677e-06, + "loss": 0.6024, + "step": 1207 + }, + { + "epoch": 0.5874772036474164, + "grad_norm": 0.07586187829377121, + "learning_rate": 9.49146795233144e-06, + "loss": 0.6169, + "step": 1208 + }, + { + "epoch": 0.5879635258358663, + "grad_norm": 0.07554861577529298, + "learning_rate": 9.49062632461243e-06, + "loss": 0.5898, + "step": 1209 + }, + { + "epoch": 0.5884498480243161, + "grad_norm": 0.0759674005850759, + "learning_rate": 9.489784038394065e-06, + "loss": 0.5928, + "step": 1210 + }, + { + "epoch": 0.588936170212766, + "grad_norm": 0.07780786350709513, + "learning_rate": 9.488941093799855e-06, + "loss": 0.5946, + "step": 1211 + }, + { + "epoch": 0.5894224924012158, + "grad_norm": 0.07361167720487613, + "learning_rate": 9.488097490953408e-06, + "loss": 0.5771, + "step": 1212 + }, + { + "epoch": 0.5899088145896657, + "grad_norm": 0.07261713939570166, + "learning_rate": 9.48725322997843e-06, + "loss": 0.5811, + "step": 1213 + }, + { + "epoch": 0.5903951367781155, + "grad_norm": 0.07469408824707235, + "learning_rate": 9.486408310998724e-06, + "loss": 0.5911, + "step": 1214 + }, + { + "epoch": 0.5908814589665653, + "grad_norm": 0.07547460066996871, + "learning_rate": 9.485562734138184e-06, + "loss": 0.5779, + "step": 1215 + }, + { + "epoch": 0.5913677811550152, + "grad_norm": 0.07492236184990318, + "learning_rate": 9.484716499520806e-06, + "loss": 0.5993, + "step": 1216 + }, + { + "epoch": 0.591854103343465, + "grad_norm": 0.07598081017418767, + "learning_rate": 9.48386960727068e-06, + "loss": 0.5993, + "step": 1217 + }, + { + "epoch": 0.5923404255319149, + "grad_norm": 0.07257975256839314, + "learning_rate": 9.483022057511996e-06, + "loss": 0.6299, + "step": 1218 + }, + { + "epoch": 0.5928267477203647, + "grad_norm": 0.0707006246123199, + "learning_rate": 9.482173850369034e-06, + "loss": 0.5813, + "step": 1219 + }, + { + "epoch": 0.5933130699088146, + "grad_norm": 0.07754682435623171, + "learning_rate": 9.481324985966175e-06, + "loss": 0.6043, + "step": 1220 + }, + { + "epoch": 0.5937993920972644, + "grad_norm": 0.07430417411565539, + "learning_rate": 9.480475464427896e-06, + "loss": 0.6132, + "step": 1221 + }, + { + "epoch": 0.5942857142857143, + "grad_norm": 0.07287647783414317, + "learning_rate": 9.47962528587877e-06, + "loss": 0.5796, + "step": 1222 + }, + { + "epoch": 0.5947720364741641, + "grad_norm": 0.07541448093323741, + "learning_rate": 9.478774450443465e-06, + "loss": 0.5969, + "step": 1223 + }, + { + "epoch": 0.595258358662614, + "grad_norm": 0.07667670668646333, + "learning_rate": 9.477922958246747e-06, + "loss": 0.5899, + "step": 1224 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 0.07234539868105472, + "learning_rate": 9.477070809413475e-06, + "loss": 0.569, + "step": 1225 + }, + { + "epoch": 0.5962310030395137, + "grad_norm": 0.07578751071641912, + "learning_rate": 9.476218004068611e-06, + "loss": 0.5988, + "step": 1226 + }, + { + "epoch": 0.5967173252279635, + "grad_norm": 0.07478184066147164, + "learning_rate": 9.475364542337207e-06, + "loss": 0.5893, + "step": 1227 + }, + { + "epoch": 0.5972036474164134, + "grad_norm": 0.07387041018992169, + "learning_rate": 9.474510424344416e-06, + "loss": 0.6116, + "step": 1228 + }, + { + "epoch": 0.5976899696048632, + "grad_norm": 0.07592645788685992, + "learning_rate": 9.473655650215481e-06, + "loss": 0.598, + "step": 1229 + }, + { + "epoch": 0.5981762917933131, + "grad_norm": 0.0778384138669128, + "learning_rate": 9.472800220075746e-06, + "loss": 0.632, + "step": 1230 + }, + { + "epoch": 0.5986626139817629, + "grad_norm": 0.08315915323835953, + "learning_rate": 9.471944134050652e-06, + "loss": 0.6338, + "step": 1231 + }, + { + "epoch": 0.5991489361702128, + "grad_norm": 0.07406289139040127, + "learning_rate": 9.471087392265733e-06, + "loss": 0.5984, + "step": 1232 + }, + { + "epoch": 0.5996352583586626, + "grad_norm": 0.07485174259119663, + "learning_rate": 9.470229994846621e-06, + "loss": 0.5914, + "step": 1233 + }, + { + "epoch": 0.6001215805471125, + "grad_norm": 0.0695961091240149, + "learning_rate": 9.469371941919042e-06, + "loss": 0.6005, + "step": 1234 + }, + { + "epoch": 0.6006079027355623, + "grad_norm": 0.07190793080085171, + "learning_rate": 9.46851323360882e-06, + "loss": 0.5654, + "step": 1235 + }, + { + "epoch": 0.6010942249240122, + "grad_norm": 0.08626371378581066, + "learning_rate": 9.467653870041876e-06, + "loss": 0.6234, + "step": 1236 + }, + { + "epoch": 0.601580547112462, + "grad_norm": 0.07753810543470328, + "learning_rate": 9.466793851344228e-06, + "loss": 0.5937, + "step": 1237 + }, + { + "epoch": 0.6020668693009118, + "grad_norm": 0.07275612325989092, + "learning_rate": 9.465933177641981e-06, + "loss": 0.6063, + "step": 1238 + }, + { + "epoch": 0.6025531914893617, + "grad_norm": 0.07439257696987374, + "learning_rate": 9.465071849061352e-06, + "loss": 0.6177, + "step": 1239 + }, + { + "epoch": 0.6030395136778115, + "grad_norm": 0.08869141664038398, + "learning_rate": 9.464209865728638e-06, + "loss": 0.6026, + "step": 1240 + }, + { + "epoch": 0.6035258358662614, + "grad_norm": 0.07918603182980609, + "learning_rate": 9.463347227770243e-06, + "loss": 0.5977, + "step": 1241 + }, + { + "epoch": 0.6040121580547112, + "grad_norm": 0.07729756677106756, + "learning_rate": 9.46248393531266e-06, + "loss": 0.5935, + "step": 1242 + }, + { + "epoch": 0.6044984802431611, + "grad_norm": 0.08014931140903206, + "learning_rate": 9.461619988482484e-06, + "loss": 0.6237, + "step": 1243 + }, + { + "epoch": 0.6049848024316109, + "grad_norm": 0.07227259063371626, + "learning_rate": 9.460755387406402e-06, + "loss": 0.5402, + "step": 1244 + }, + { + "epoch": 0.6054711246200608, + "grad_norm": 0.07554439089112634, + "learning_rate": 9.459890132211198e-06, + "loss": 0.6012, + "step": 1245 + }, + { + "epoch": 0.6059574468085106, + "grad_norm": 0.07168730785396009, + "learning_rate": 9.45902422302375e-06, + "loss": 0.5548, + "step": 1246 + }, + { + "epoch": 0.6064437689969605, + "grad_norm": 0.07622369288726513, + "learning_rate": 9.458157659971036e-06, + "loss": 0.6531, + "step": 1247 + }, + { + "epoch": 0.6069300911854103, + "grad_norm": 0.07599697389522288, + "learning_rate": 9.457290443180128e-06, + "loss": 0.6062, + "step": 1248 + }, + { + "epoch": 0.6074164133738602, + "grad_norm": 0.07858094420082529, + "learning_rate": 9.45642257277819e-06, + "loss": 0.6099, + "step": 1249 + }, + { + "epoch": 0.60790273556231, + "grad_norm": 0.07785243173717052, + "learning_rate": 9.45555404889249e-06, + "loss": 0.6732, + "step": 1250 + }, + { + "epoch": 0.6083890577507599, + "grad_norm": 0.07574158313953933, + "learning_rate": 9.454684871650383e-06, + "loss": 0.574, + "step": 1251 + }, + { + "epoch": 0.6088753799392097, + "grad_norm": 0.07679608406517179, + "learning_rate": 9.453815041179329e-06, + "loss": 0.5931, + "step": 1252 + }, + { + "epoch": 0.6093617021276596, + "grad_norm": 0.0766464199097489, + "learning_rate": 9.452944557606872e-06, + "loss": 0.5984, + "step": 1253 + }, + { + "epoch": 0.6098480243161094, + "grad_norm": 0.07530702461792167, + "learning_rate": 9.452073421060664e-06, + "loss": 0.576, + "step": 1254 + }, + { + "epoch": 0.6103343465045593, + "grad_norm": 0.07159417467375702, + "learning_rate": 9.451201631668445e-06, + "loss": 0.5732, + "step": 1255 + }, + { + "epoch": 0.6108206686930091, + "grad_norm": 0.08246673631446423, + "learning_rate": 9.450329189558055e-06, + "loss": 0.5734, + "step": 1256 + }, + { + "epoch": 0.611306990881459, + "grad_norm": 0.0788529325647114, + "learning_rate": 9.449456094857424e-06, + "loss": 0.5859, + "step": 1257 + }, + { + "epoch": 0.6117933130699088, + "grad_norm": 0.07147612723333374, + "learning_rate": 9.448582347694584e-06, + "loss": 0.5908, + "step": 1258 + }, + { + "epoch": 0.6122796352583587, + "grad_norm": 0.08668808967495738, + "learning_rate": 9.44770794819766e-06, + "loss": 0.6401, + "step": 1259 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 0.07864084941176788, + "learning_rate": 9.446832896494874e-06, + "loss": 0.6307, + "step": 1260 + }, + { + "epoch": 0.6132522796352584, + "grad_norm": 0.07369508141475373, + "learning_rate": 9.445957192714539e-06, + "loss": 0.5811, + "step": 1261 + }, + { + "epoch": 0.6137386018237082, + "grad_norm": 0.07836061362948436, + "learning_rate": 9.445080836985067e-06, + "loss": 0.6167, + "step": 1262 + }, + { + "epoch": 0.614224924012158, + "grad_norm": 0.0749987054036263, + "learning_rate": 9.444203829434972e-06, + "loss": 0.5782, + "step": 1263 + }, + { + "epoch": 0.6147112462006079, + "grad_norm": 0.0732620064153045, + "learning_rate": 9.44332617019285e-06, + "loss": 0.5954, + "step": 1264 + }, + { + "epoch": 0.6151975683890577, + "grad_norm": 0.07711692559992857, + "learning_rate": 9.442447859387402e-06, + "loss": 0.5656, + "step": 1265 + }, + { + "epoch": 0.6156838905775076, + "grad_norm": 0.07112162984434953, + "learning_rate": 9.441568897147423e-06, + "loss": 0.5458, + "step": 1266 + }, + { + "epoch": 0.6161702127659574, + "grad_norm": 0.07518981577083829, + "learning_rate": 9.440689283601805e-06, + "loss": 0.6413, + "step": 1267 + }, + { + "epoch": 0.6166565349544073, + "grad_norm": 0.07618361826932424, + "learning_rate": 9.43980901887953e-06, + "loss": 0.6183, + "step": 1268 + }, + { + "epoch": 0.6171428571428571, + "grad_norm": 0.07970576693265521, + "learning_rate": 9.438928103109678e-06, + "loss": 0.5605, + "step": 1269 + }, + { + "epoch": 0.617629179331307, + "grad_norm": 0.07340073066761459, + "learning_rate": 9.438046536421428e-06, + "loss": 0.6091, + "step": 1270 + }, + { + "epoch": 0.6181155015197568, + "grad_norm": 0.07694405716043108, + "learning_rate": 9.43716431894405e-06, + "loss": 0.6068, + "step": 1271 + }, + { + "epoch": 0.6186018237082067, + "grad_norm": 0.0839716058733985, + "learning_rate": 9.436281450806914e-06, + "loss": 0.6246, + "step": 1272 + }, + { + "epoch": 0.6190881458966565, + "grad_norm": 0.07374800482169956, + "learning_rate": 9.435397932139478e-06, + "loss": 0.5798, + "step": 1273 + }, + { + "epoch": 0.6195744680851064, + "grad_norm": 0.07463121031358598, + "learning_rate": 9.434513763071304e-06, + "loss": 0.5859, + "step": 1274 + }, + { + "epoch": 0.6200607902735562, + "grad_norm": 0.07200095190924559, + "learning_rate": 9.433628943732045e-06, + "loss": 0.5342, + "step": 1275 + }, + { + "epoch": 0.6205471124620061, + "grad_norm": 0.07536715163612707, + "learning_rate": 9.432743474251446e-06, + "loss": 0.5978, + "step": 1276 + }, + { + "epoch": 0.6210334346504559, + "grad_norm": 0.0753631077103014, + "learning_rate": 9.431857354759354e-06, + "loss": 0.5737, + "step": 1277 + }, + { + "epoch": 0.6215197568389058, + "grad_norm": 0.07951888275567472, + "learning_rate": 9.43097058538571e-06, + "loss": 0.5864, + "step": 1278 + }, + { + "epoch": 0.6220060790273556, + "grad_norm": 0.07552921428479899, + "learning_rate": 9.430083166260546e-06, + "loss": 0.5717, + "step": 1279 + }, + { + "epoch": 0.6224924012158055, + "grad_norm": 0.07930337438500187, + "learning_rate": 9.429195097513993e-06, + "loss": 0.5976, + "step": 1280 + }, + { + "epoch": 0.6229787234042553, + "grad_norm": 0.07354276103923192, + "learning_rate": 9.428306379276275e-06, + "loss": 0.5533, + "step": 1281 + }, + { + "epoch": 0.6234650455927052, + "grad_norm": 0.07786090673902678, + "learning_rate": 9.427417011677713e-06, + "loss": 0.6011, + "step": 1282 + }, + { + "epoch": 0.623951367781155, + "grad_norm": 0.07781255651216264, + "learning_rate": 9.426526994848724e-06, + "loss": 0.5869, + "step": 1283 + }, + { + "epoch": 0.6244376899696049, + "grad_norm": 0.07724423362135842, + "learning_rate": 9.425636328919816e-06, + "loss": 0.6124, + "step": 1284 + }, + { + "epoch": 0.6249240121580547, + "grad_norm": 0.0768883950631787, + "learning_rate": 9.424745014021598e-06, + "loss": 0.599, + "step": 1285 + }, + { + "epoch": 0.6254103343465045, + "grad_norm": 0.07255756431144399, + "learning_rate": 9.423853050284771e-06, + "loss": 0.5507, + "step": 1286 + }, + { + "epoch": 0.6258966565349544, + "grad_norm": 0.07963698391002398, + "learning_rate": 9.422960437840128e-06, + "loss": 0.6186, + "step": 1287 + }, + { + "epoch": 0.6263829787234042, + "grad_norm": 0.0752600077048511, + "learning_rate": 9.422067176818564e-06, + "loss": 0.596, + "step": 1288 + }, + { + "epoch": 0.6268693009118541, + "grad_norm": 0.07138938654122967, + "learning_rate": 9.421173267351064e-06, + "loss": 0.5658, + "step": 1289 + }, + { + "epoch": 0.6273556231003039, + "grad_norm": 0.07739640773278825, + "learning_rate": 9.42027870956871e-06, + "loss": 0.6144, + "step": 1290 + }, + { + "epoch": 0.6278419452887538, + "grad_norm": 0.07617241888870808, + "learning_rate": 9.41938350360268e-06, + "loss": 0.591, + "step": 1291 + }, + { + "epoch": 0.6283282674772036, + "grad_norm": 0.0755635925663801, + "learning_rate": 9.418487649584242e-06, + "loss": 0.5898, + "step": 1292 + }, + { + "epoch": 0.6288145896656535, + "grad_norm": 0.07579957617527425, + "learning_rate": 9.41759114764477e-06, + "loss": 0.6098, + "step": 1293 + }, + { + "epoch": 0.6293009118541033, + "grad_norm": 0.07751294289185374, + "learning_rate": 9.416693997915717e-06, + "loss": 0.6006, + "step": 1294 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 0.07595048115012434, + "learning_rate": 9.415796200528646e-06, + "loss": 0.6268, + "step": 1295 + }, + { + "epoch": 0.630273556231003, + "grad_norm": 0.07616064150377345, + "learning_rate": 9.414897755615206e-06, + "loss": 0.6164, + "step": 1296 + }, + { + "epoch": 0.6307598784194529, + "grad_norm": 0.07424701903459012, + "learning_rate": 9.413998663307145e-06, + "loss": 0.5807, + "step": 1297 + }, + { + "epoch": 0.6312462006079027, + "grad_norm": 0.08021849567048744, + "learning_rate": 9.413098923736305e-06, + "loss": 0.5615, + "step": 1298 + }, + { + "epoch": 0.6317325227963526, + "grad_norm": 0.07796190104640652, + "learning_rate": 9.412198537034622e-06, + "loss": 0.6282, + "step": 1299 + }, + { + "epoch": 0.6322188449848024, + "grad_norm": 0.0713647537014503, + "learning_rate": 9.411297503334126e-06, + "loss": 0.572, + "step": 1300 + }, + { + "epoch": 0.6327051671732523, + "grad_norm": 0.07691961588478133, + "learning_rate": 9.410395822766946e-06, + "loss": 0.6033, + "step": 1301 + }, + { + "epoch": 0.6331914893617021, + "grad_norm": 0.07181511775233088, + "learning_rate": 9.4094934954653e-06, + "loss": 0.5846, + "step": 1302 + }, + { + "epoch": 0.633677811550152, + "grad_norm": 0.07280482498453499, + "learning_rate": 9.408590521561509e-06, + "loss": 0.5776, + "step": 1303 + }, + { + "epoch": 0.6341641337386018, + "grad_norm": 0.07547264112790869, + "learning_rate": 9.407686901187978e-06, + "loss": 0.6287, + "step": 1304 + }, + { + "epoch": 0.6346504559270517, + "grad_norm": 0.07467375211345316, + "learning_rate": 9.406782634477219e-06, + "loss": 0.6164, + "step": 1305 + }, + { + "epoch": 0.6351367781155015, + "grad_norm": 0.07298493095458604, + "learning_rate": 9.405877721561826e-06, + "loss": 0.5519, + "step": 1306 + }, + { + "epoch": 0.6356231003039514, + "grad_norm": 0.07369224380228381, + "learning_rate": 9.404972162574497e-06, + "loss": 0.5951, + "step": 1307 + }, + { + "epoch": 0.6361094224924012, + "grad_norm": 0.07455567363611043, + "learning_rate": 9.404065957648023e-06, + "loss": 0.6038, + "step": 1308 + }, + { + "epoch": 0.6365957446808511, + "grad_norm": 0.06949987297984836, + "learning_rate": 9.40315910691529e-06, + "loss": 0.5354, + "step": 1309 + }, + { + "epoch": 0.6370820668693009, + "grad_norm": 0.07576107588905442, + "learning_rate": 9.402251610509272e-06, + "loss": 0.6082, + "step": 1310 + }, + { + "epoch": 0.6375683890577507, + "grad_norm": 0.07499662795318578, + "learning_rate": 9.401343468563046e-06, + "loss": 0.5933, + "step": 1311 + }, + { + "epoch": 0.6380547112462006, + "grad_norm": 0.07949361276484732, + "learning_rate": 9.400434681209782e-06, + "loss": 0.5956, + "step": 1312 + }, + { + "epoch": 0.6385410334346504, + "grad_norm": 0.07140019093868136, + "learning_rate": 9.399525248582744e-06, + "loss": 0.5921, + "step": 1313 + }, + { + "epoch": 0.6390273556231003, + "grad_norm": 0.07156033904262038, + "learning_rate": 9.398615170815286e-06, + "loss": 0.579, + "step": 1314 + }, + { + "epoch": 0.6395136778115501, + "grad_norm": 0.07320530784039472, + "learning_rate": 9.397704448040865e-06, + "loss": 0.5678, + "step": 1315 + }, + { + "epoch": 0.64, + "grad_norm": 0.0802553142080068, + "learning_rate": 9.396793080393022e-06, + "loss": 0.6226, + "step": 1316 + }, + { + "epoch": 0.6404863221884498, + "grad_norm": 0.0759220028641158, + "learning_rate": 9.395881068005406e-06, + "loss": 0.6251, + "step": 1317 + }, + { + "epoch": 0.6409726443768997, + "grad_norm": 0.07403911679268774, + "learning_rate": 9.39496841101175e-06, + "loss": 0.5755, + "step": 1318 + }, + { + "epoch": 0.6414589665653495, + "grad_norm": 0.07542447324190077, + "learning_rate": 9.394055109545884e-06, + "loss": 0.587, + "step": 1319 + }, + { + "epoch": 0.6419452887537994, + "grad_norm": 0.07147402975334692, + "learning_rate": 9.393141163741732e-06, + "loss": 0.5944, + "step": 1320 + }, + { + "epoch": 0.6424316109422492, + "grad_norm": 0.07154490670142041, + "learning_rate": 9.392226573733319e-06, + "loss": 0.5299, + "step": 1321 + }, + { + "epoch": 0.6429179331306991, + "grad_norm": 0.07441657808752647, + "learning_rate": 9.391311339654755e-06, + "loss": 0.5667, + "step": 1322 + }, + { + "epoch": 0.6434042553191489, + "grad_norm": 0.0746968318126394, + "learning_rate": 9.390395461640246e-06, + "loss": 0.6213, + "step": 1323 + }, + { + "epoch": 0.6438905775075988, + "grad_norm": 0.07366961550448232, + "learning_rate": 9.389478939824104e-06, + "loss": 0.583, + "step": 1324 + }, + { + "epoch": 0.6443768996960486, + "grad_norm": 0.07473921421602571, + "learning_rate": 9.388561774340719e-06, + "loss": 0.5706, + "step": 1325 + }, + { + "epoch": 0.6448632218844985, + "grad_norm": 0.0704060647745574, + "learning_rate": 9.387643965324584e-06, + "loss": 0.5679, + "step": 1326 + }, + { + "epoch": 0.6453495440729483, + "grad_norm": 0.07896972142778609, + "learning_rate": 9.386725512910289e-06, + "loss": 0.6135, + "step": 1327 + }, + { + "epoch": 0.6458358662613982, + "grad_norm": 0.07380151563569691, + "learning_rate": 9.385806417232511e-06, + "loss": 0.6039, + "step": 1328 + }, + { + "epoch": 0.646322188449848, + "grad_norm": 0.07790959362018812, + "learning_rate": 9.384886678426027e-06, + "loss": 0.6603, + "step": 1329 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 0.06950009595065189, + "learning_rate": 9.383966296625704e-06, + "loss": 0.5441, + "step": 1330 + }, + { + "epoch": 0.6472948328267477, + "grad_norm": 0.07004069179969032, + "learning_rate": 9.383045271966507e-06, + "loss": 0.5518, + "step": 1331 + }, + { + "epoch": 0.6477811550151976, + "grad_norm": 0.07078421296750745, + "learning_rate": 9.382123604583492e-06, + "loss": 0.6194, + "step": 1332 + }, + { + "epoch": 0.6482674772036474, + "grad_norm": 0.07680435549364414, + "learning_rate": 9.381201294611815e-06, + "loss": 0.5827, + "step": 1333 + }, + { + "epoch": 0.6487537993920973, + "grad_norm": 0.07073906693618659, + "learning_rate": 9.38027834218672e-06, + "loss": 0.5849, + "step": 1334 + }, + { + "epoch": 0.6492401215805471, + "grad_norm": 0.07342404060838689, + "learning_rate": 9.379354747443548e-06, + "loss": 0.601, + "step": 1335 + }, + { + "epoch": 0.6497264437689969, + "grad_norm": 0.08563051196626331, + "learning_rate": 9.378430510517732e-06, + "loss": 0.6084, + "step": 1336 + }, + { + "epoch": 0.6502127659574468, + "grad_norm": 0.07317972487372107, + "learning_rate": 9.3775056315448e-06, + "loss": 0.5682, + "step": 1337 + }, + { + "epoch": 0.6506990881458966, + "grad_norm": 0.0719539746980849, + "learning_rate": 9.37658011066038e-06, + "loss": 0.5699, + "step": 1338 + }, + { + "epoch": 0.6511854103343465, + "grad_norm": 0.07566972402368674, + "learning_rate": 9.375653948000186e-06, + "loss": 0.6213, + "step": 1339 + }, + { + "epoch": 0.6516717325227963, + "grad_norm": 0.07380417213702649, + "learning_rate": 9.374727143700028e-06, + "loss": 0.5904, + "step": 1340 + }, + { + "epoch": 0.6521580547112462, + "grad_norm": 0.07271264780998443, + "learning_rate": 9.373799697895813e-06, + "loss": 0.6051, + "step": 1341 + }, + { + "epoch": 0.652644376899696, + "grad_norm": 0.07838171715161318, + "learning_rate": 9.372871610723542e-06, + "loss": 0.6013, + "step": 1342 + }, + { + "epoch": 0.6531306990881459, + "grad_norm": 0.14165576564958637, + "learning_rate": 9.371942882319306e-06, + "loss": 0.6204, + "step": 1343 + }, + { + "epoch": 0.6536170212765957, + "grad_norm": 0.07348756709956493, + "learning_rate": 9.37101351281929e-06, + "loss": 0.614, + "step": 1344 + }, + { + "epoch": 0.6541033434650456, + "grad_norm": 0.07401053713580107, + "learning_rate": 9.370083502359781e-06, + "loss": 0.5747, + "step": 1345 + }, + { + "epoch": 0.6545896656534954, + "grad_norm": 0.08453390034625481, + "learning_rate": 9.36915285107715e-06, + "loss": 0.587, + "step": 1346 + }, + { + "epoch": 0.6550759878419453, + "grad_norm": 0.07227759634646228, + "learning_rate": 9.368221559107872e-06, + "loss": 0.5649, + "step": 1347 + }, + { + "epoch": 0.6555623100303951, + "grad_norm": 0.0728266159654345, + "learning_rate": 9.367289626588504e-06, + "loss": 0.5729, + "step": 1348 + }, + { + "epoch": 0.656048632218845, + "grad_norm": 0.07427873409535371, + "learning_rate": 9.366357053655707e-06, + "loss": 0.6213, + "step": 1349 + }, + { + "epoch": 0.6565349544072948, + "grad_norm": 0.072299499982864, + "learning_rate": 9.36542384044623e-06, + "loss": 0.5836, + "step": 1350 + }, + { + "epoch": 0.6570212765957447, + "grad_norm": 0.07801311903207866, + "learning_rate": 9.364489987096921e-06, + "loss": 0.5937, + "step": 1351 + }, + { + "epoch": 0.6575075987841945, + "grad_norm": 0.07462615537520143, + "learning_rate": 9.363555493744719e-06, + "loss": 0.5673, + "step": 1352 + }, + { + "epoch": 0.6579939209726444, + "grad_norm": 0.07760785514022325, + "learning_rate": 9.362620360526652e-06, + "loss": 0.618, + "step": 1353 + }, + { + "epoch": 0.6584802431610942, + "grad_norm": 0.08192318385525697, + "learning_rate": 9.36168458757985e-06, + "loss": 0.6401, + "step": 1354 + }, + { + "epoch": 0.6589665653495441, + "grad_norm": 0.07440302843810591, + "learning_rate": 9.360748175041537e-06, + "loss": 0.5455, + "step": 1355 + }, + { + "epoch": 0.6594528875379939, + "grad_norm": 0.07687441229260948, + "learning_rate": 9.359811123049022e-06, + "loss": 0.592, + "step": 1356 + }, + { + "epoch": 0.6599392097264438, + "grad_norm": 0.07370265794485174, + "learning_rate": 9.358873431739712e-06, + "loss": 0.5588, + "step": 1357 + }, + { + "epoch": 0.6604255319148936, + "grad_norm": 0.0721400445080261, + "learning_rate": 9.357935101251115e-06, + "loss": 0.5903, + "step": 1358 + }, + { + "epoch": 0.6609118541033434, + "grad_norm": 0.07182714890651236, + "learning_rate": 9.35699613172082e-06, + "loss": 0.5748, + "step": 1359 + }, + { + "epoch": 0.6613981762917933, + "grad_norm": 0.07518118467550815, + "learning_rate": 9.356056523286522e-06, + "loss": 0.6496, + "step": 1360 + }, + { + "epoch": 0.6618844984802431, + "grad_norm": 0.07468012041776521, + "learning_rate": 9.355116276086e-06, + "loss": 0.6238, + "step": 1361 + }, + { + "epoch": 0.662370820668693, + "grad_norm": 0.07325718409085749, + "learning_rate": 9.354175390257131e-06, + "loss": 0.5733, + "step": 1362 + }, + { + "epoch": 0.6628571428571428, + "grad_norm": 0.07171755391535022, + "learning_rate": 9.353233865937888e-06, + "loss": 0.565, + "step": 1363 + }, + { + "epoch": 0.6633434650455927, + "grad_norm": 0.0731184986212707, + "learning_rate": 9.352291703266332e-06, + "loss": 0.5839, + "step": 1364 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 0.07396741175939628, + "learning_rate": 9.351348902380622e-06, + "loss": 0.5738, + "step": 1365 + }, + { + "epoch": 0.6643161094224924, + "grad_norm": 0.07343427635834941, + "learning_rate": 9.350405463419006e-06, + "loss": 0.5635, + "step": 1366 + }, + { + "epoch": 0.6648024316109422, + "grad_norm": 0.07598390137699437, + "learning_rate": 9.349461386519832e-06, + "loss": 0.5626, + "step": 1367 + }, + { + "epoch": 0.6652887537993921, + "grad_norm": 0.07539746214781103, + "learning_rate": 9.348516671821537e-06, + "loss": 0.6235, + "step": 1368 + }, + { + "epoch": 0.6657750759878419, + "grad_norm": 0.07471727742270907, + "learning_rate": 9.347571319462654e-06, + "loss": 0.5894, + "step": 1369 + }, + { + "epoch": 0.6662613981762918, + "grad_norm": 0.06957919467162495, + "learning_rate": 9.346625329581805e-06, + "loss": 0.5693, + "step": 1370 + }, + { + "epoch": 0.6667477203647416, + "grad_norm": 0.07199540687343707, + "learning_rate": 9.345678702317711e-06, + "loss": 0.5708, + "step": 1371 + }, + { + "epoch": 0.6672340425531915, + "grad_norm": 0.07639686103443452, + "learning_rate": 9.344731437809184e-06, + "loss": 0.6139, + "step": 1372 + }, + { + "epoch": 0.6677203647416413, + "grad_norm": 0.07024923997307037, + "learning_rate": 9.34378353619513e-06, + "loss": 0.5854, + "step": 1373 + }, + { + "epoch": 0.6682066869300912, + "grad_norm": 0.07617353295964674, + "learning_rate": 9.342834997614547e-06, + "loss": 0.6467, + "step": 1374 + }, + { + "epoch": 0.668693009118541, + "grad_norm": 0.07461421852183255, + "learning_rate": 9.341885822206529e-06, + "loss": 0.5927, + "step": 1375 + }, + { + "epoch": 0.6691793313069909, + "grad_norm": 0.07416874639241813, + "learning_rate": 9.340936010110259e-06, + "loss": 0.5669, + "step": 1376 + }, + { + "epoch": 0.6696656534954407, + "grad_norm": 0.07430935160163336, + "learning_rate": 9.339985561465018e-06, + "loss": 0.5926, + "step": 1377 + }, + { + "epoch": 0.6701519756838906, + "grad_norm": 0.07428555762948726, + "learning_rate": 9.339034476410177e-06, + "loss": 0.5855, + "step": 1378 + }, + { + "epoch": 0.6706382978723404, + "grad_norm": 0.07725419552262293, + "learning_rate": 9.338082755085205e-06, + "loss": 0.6111, + "step": 1379 + }, + { + "epoch": 0.6711246200607903, + "grad_norm": 0.07045166755952159, + "learning_rate": 9.337130397629659e-06, + "loss": 0.5337, + "step": 1380 + }, + { + "epoch": 0.6716109422492401, + "grad_norm": 0.0741447422026241, + "learning_rate": 9.336177404183191e-06, + "loss": 0.5781, + "step": 1381 + }, + { + "epoch": 0.67209726443769, + "grad_norm": 0.08752395837855446, + "learning_rate": 9.335223774885547e-06, + "loss": 0.6158, + "step": 1382 + }, + { + "epoch": 0.6725835866261398, + "grad_norm": 0.07797832166761781, + "learning_rate": 9.334269509876566e-06, + "loss": 0.6512, + "step": 1383 + }, + { + "epoch": 0.6730699088145896, + "grad_norm": 0.07329881022345053, + "learning_rate": 9.333314609296182e-06, + "loss": 0.5876, + "step": 1384 + }, + { + "epoch": 0.6735562310030395, + "grad_norm": 0.07543180645199737, + "learning_rate": 9.332359073284417e-06, + "loss": 0.6284, + "step": 1385 + }, + { + "epoch": 0.6740425531914893, + "grad_norm": 0.07872529346912259, + "learning_rate": 9.33140290198139e-06, + "loss": 0.5843, + "step": 1386 + }, + { + "epoch": 0.6745288753799392, + "grad_norm": 0.07220715364940554, + "learning_rate": 9.330446095527316e-06, + "loss": 0.5695, + "step": 1387 + }, + { + "epoch": 0.675015197568389, + "grad_norm": 0.07165501643998996, + "learning_rate": 9.329488654062496e-06, + "loss": 0.5928, + "step": 1388 + }, + { + "epoch": 0.6755015197568389, + "grad_norm": 0.07621881974718972, + "learning_rate": 9.32853057772733e-06, + "loss": 0.5872, + "step": 1389 + }, + { + "epoch": 0.6759878419452887, + "grad_norm": 0.07428394665142012, + "learning_rate": 9.32757186666231e-06, + "loss": 0.5706, + "step": 1390 + }, + { + "epoch": 0.6764741641337386, + "grad_norm": 0.07395745268521768, + "learning_rate": 9.326612521008015e-06, + "loss": 0.5883, + "step": 1391 + }, + { + "epoch": 0.6769604863221884, + "grad_norm": 0.07074106010201726, + "learning_rate": 9.32565254090513e-06, + "loss": 0.5713, + "step": 1392 + }, + { + "epoch": 0.6774468085106383, + "grad_norm": 0.0727276700834868, + "learning_rate": 9.324691926494419e-06, + "loss": 0.5982, + "step": 1393 + }, + { + "epoch": 0.6779331306990881, + "grad_norm": 0.07364483108630815, + "learning_rate": 9.323730677916747e-06, + "loss": 0.5986, + "step": 1394 + }, + { + "epoch": 0.678419452887538, + "grad_norm": 0.07459201813263207, + "learning_rate": 9.32276879531307e-06, + "loss": 0.5743, + "step": 1395 + }, + { + "epoch": 0.6789057750759878, + "grad_norm": 0.0759748185806464, + "learning_rate": 9.321806278824436e-06, + "loss": 0.615, + "step": 1396 + }, + { + "epoch": 0.6793920972644377, + "grad_norm": 0.07002473464917006, + "learning_rate": 9.320843128591992e-06, + "loss": 0.535, + "step": 1397 + }, + { + "epoch": 0.6798784194528875, + "grad_norm": 0.0731356078991561, + "learning_rate": 9.319879344756968e-06, + "loss": 0.582, + "step": 1398 + }, + { + "epoch": 0.6803647416413374, + "grad_norm": 0.07723671011700303, + "learning_rate": 9.318914927460694e-06, + "loss": 0.5624, + "step": 1399 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 0.07724769484034885, + "learning_rate": 9.31794987684459e-06, + "loss": 0.6218, + "step": 1400 + }, + { + "epoch": 0.6813373860182371, + "grad_norm": 0.07400788891102064, + "learning_rate": 9.31698419305017e-06, + "loss": 0.5958, + "step": 1401 + }, + { + "epoch": 0.6818237082066869, + "grad_norm": 0.07757563601577189, + "learning_rate": 9.31601787621904e-06, + "loss": 0.6182, + "step": 1402 + }, + { + "epoch": 0.6823100303951368, + "grad_norm": 0.07511121562407494, + "learning_rate": 9.315050926492901e-06, + "loss": 0.6065, + "step": 1403 + }, + { + "epoch": 0.6827963525835866, + "grad_norm": 0.07931631630368464, + "learning_rate": 9.314083344013544e-06, + "loss": 0.5592, + "step": 1404 + }, + { + "epoch": 0.6832826747720365, + "grad_norm": 0.0741642610955171, + "learning_rate": 9.313115128922853e-06, + "loss": 0.6155, + "step": 1405 + }, + { + "epoch": 0.6837689969604863, + "grad_norm": 0.08036336873400678, + "learning_rate": 9.312146281362811e-06, + "loss": 0.6576, + "step": 1406 + }, + { + "epoch": 0.6842553191489362, + "grad_norm": 0.07491409078338893, + "learning_rate": 9.311176801475481e-06, + "loss": 0.5878, + "step": 1407 + }, + { + "epoch": 0.684741641337386, + "grad_norm": 0.07208127938249424, + "learning_rate": 9.31020668940303e-06, + "loss": 0.5876, + "step": 1408 + }, + { + "epoch": 0.6852279635258358, + "grad_norm": 0.07351932513541455, + "learning_rate": 9.309235945287715e-06, + "loss": 0.5753, + "step": 1409 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.07886115028555457, + "learning_rate": 9.308264569271882e-06, + "loss": 0.6074, + "step": 1410 + }, + { + "epoch": 0.6862006079027355, + "grad_norm": 0.07488771684118997, + "learning_rate": 9.307292561497974e-06, + "loss": 0.6166, + "step": 1411 + }, + { + "epoch": 0.6866869300911854, + "grad_norm": 0.07445563565056222, + "learning_rate": 9.306319922108525e-06, + "loss": 0.5779, + "step": 1412 + }, + { + "epoch": 0.6871732522796352, + "grad_norm": 0.07895209662723902, + "learning_rate": 9.30534665124616e-06, + "loss": 0.6297, + "step": 1413 + }, + { + "epoch": 0.6876595744680851, + "grad_norm": 0.0717508744330046, + "learning_rate": 9.304372749053599e-06, + "loss": 0.5608, + "step": 1414 + }, + { + "epoch": 0.6881458966565349, + "grad_norm": 0.10111506966422479, + "learning_rate": 9.303398215673654e-06, + "loss": 0.5969, + "step": 1415 + }, + { + "epoch": 0.6886322188449848, + "grad_norm": 0.07264927316471913, + "learning_rate": 9.30242305124923e-06, + "loss": 0.5566, + "step": 1416 + }, + { + "epoch": 0.6891185410334346, + "grad_norm": 0.08338970011909767, + "learning_rate": 9.301447255923321e-06, + "loss": 0.6405, + "step": 1417 + }, + { + "epoch": 0.6896048632218845, + "grad_norm": 0.07554742175539556, + "learning_rate": 9.300470829839018e-06, + "loss": 0.6081, + "step": 1418 + }, + { + "epoch": 0.6900911854103343, + "grad_norm": 0.07231073949415669, + "learning_rate": 9.299493773139504e-06, + "loss": 0.5545, + "step": 1419 + }, + { + "epoch": 0.6905775075987842, + "grad_norm": 0.0740590781974643, + "learning_rate": 9.298516085968052e-06, + "loss": 0.6112, + "step": 1420 + }, + { + "epoch": 0.691063829787234, + "grad_norm": 0.07493620812179583, + "learning_rate": 9.29753776846803e-06, + "loss": 0.5885, + "step": 1421 + }, + { + "epoch": 0.6915501519756839, + "grad_norm": 0.07029894830845575, + "learning_rate": 9.296558820782895e-06, + "loss": 0.5488, + "step": 1422 + }, + { + "epoch": 0.6920364741641337, + "grad_norm": 0.07530266395754613, + "learning_rate": 9.2955792430562e-06, + "loss": 0.6113, + "step": 1423 + }, + { + "epoch": 0.6925227963525836, + "grad_norm": 0.07160850113556726, + "learning_rate": 9.294599035431588e-06, + "loss": 0.5801, + "step": 1424 + }, + { + "epoch": 0.6930091185410334, + "grad_norm": 0.07401711312717284, + "learning_rate": 9.293618198052796e-06, + "loss": 0.5944, + "step": 1425 + }, + { + "epoch": 0.6934954407294833, + "grad_norm": 0.07612252518364429, + "learning_rate": 9.29263673106365e-06, + "loss": 0.5902, + "step": 1426 + }, + { + "epoch": 0.6939817629179331, + "grad_norm": 0.07427845038687236, + "learning_rate": 9.291654634608079e-06, + "loss": 0.6033, + "step": 1427 + }, + { + "epoch": 0.694468085106383, + "grad_norm": 0.07323589939350156, + "learning_rate": 9.290671908830087e-06, + "loss": 0.5827, + "step": 1428 + }, + { + "epoch": 0.6949544072948328, + "grad_norm": 0.0712966875300514, + "learning_rate": 9.289688553873783e-06, + "loss": 0.5741, + "step": 1429 + }, + { + "epoch": 0.6954407294832827, + "grad_norm": 0.07130510436210435, + "learning_rate": 9.288704569883366e-06, + "loss": 0.5688, + "step": 1430 + }, + { + "epoch": 0.6959270516717325, + "grad_norm": 0.07660663484948632, + "learning_rate": 9.287719957003128e-06, + "loss": 0.6295, + "step": 1431 + }, + { + "epoch": 0.6964133738601823, + "grad_norm": 0.07277229698979291, + "learning_rate": 9.286734715377446e-06, + "loss": 0.5997, + "step": 1432 + }, + { + "epoch": 0.6968996960486322, + "grad_norm": 0.07280402845683927, + "learning_rate": 9.285748845150797e-06, + "loss": 0.5987, + "step": 1433 + }, + { + "epoch": 0.697386018237082, + "grad_norm": 0.07711021088789537, + "learning_rate": 9.284762346467749e-06, + "loss": 0.5807, + "step": 1434 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 0.07800455939095077, + "learning_rate": 9.283775219472958e-06, + "loss": 0.601, + "step": 1435 + }, + { + "epoch": 0.6983586626139817, + "grad_norm": 0.07280568878207604, + "learning_rate": 9.282787464311176e-06, + "loss": 0.5458, + "step": 1436 + }, + { + "epoch": 0.6988449848024316, + "grad_norm": 0.07529816212765439, + "learning_rate": 9.281799081127249e-06, + "loss": 0.6013, + "step": 1437 + }, + { + "epoch": 0.6993313069908814, + "grad_norm": 0.0770675973094629, + "learning_rate": 9.280810070066108e-06, + "loss": 0.6156, + "step": 1438 + }, + { + "epoch": 0.6998176291793313, + "grad_norm": 0.0777478441442148, + "learning_rate": 9.279820431272783e-06, + "loss": 0.6144, + "step": 1439 + }, + { + "epoch": 0.7003039513677811, + "grad_norm": 0.07343110528732574, + "learning_rate": 9.278830164892392e-06, + "loss": 0.558, + "step": 1440 + }, + { + "epoch": 0.700790273556231, + "grad_norm": 0.0739284962130104, + "learning_rate": 9.277839271070146e-06, + "loss": 0.6098, + "step": 1441 + }, + { + "epoch": 0.7012765957446808, + "grad_norm": 0.06990607854779657, + "learning_rate": 9.27684774995135e-06, + "loss": 0.559, + "step": 1442 + }, + { + "epoch": 0.7017629179331307, + "grad_norm": 0.07621166065463421, + "learning_rate": 9.275855601681398e-06, + "loss": 0.5934, + "step": 1443 + }, + { + "epoch": 0.7022492401215805, + "grad_norm": 0.07445005797019247, + "learning_rate": 9.274862826405777e-06, + "loss": 0.6009, + "step": 1444 + }, + { + "epoch": 0.7027355623100304, + "grad_norm": 0.0761006018967215, + "learning_rate": 9.273869424270068e-06, + "loss": 0.5847, + "step": 1445 + }, + { + "epoch": 0.7032218844984802, + "grad_norm": 0.08103538466046839, + "learning_rate": 9.27287539541994e-06, + "loss": 0.5937, + "step": 1446 + }, + { + "epoch": 0.7037082066869301, + "grad_norm": 0.07678231053956183, + "learning_rate": 9.271880740001158e-06, + "loss": 0.597, + "step": 1447 + }, + { + "epoch": 0.7041945288753799, + "grad_norm": 0.07488548883099819, + "learning_rate": 9.270885458159576e-06, + "loss": 0.5999, + "step": 1448 + }, + { + "epoch": 0.7046808510638298, + "grad_norm": 0.07303580166136743, + "learning_rate": 9.269889550041138e-06, + "loss": 0.5674, + "step": 1449 + }, + { + "epoch": 0.7051671732522796, + "grad_norm": 0.07183746472331608, + "learning_rate": 9.268893015791889e-06, + "loss": 0.5702, + "step": 1450 + }, + { + "epoch": 0.7056534954407295, + "grad_norm": 0.07349428775246662, + "learning_rate": 9.267895855557954e-06, + "loss": 0.5828, + "step": 1451 + }, + { + "epoch": 0.7061398176291793, + "grad_norm": 0.0774485123484772, + "learning_rate": 9.266898069485556e-06, + "loss": 0.5873, + "step": 1452 + }, + { + "epoch": 0.7066261398176292, + "grad_norm": 0.07686840919494571, + "learning_rate": 9.26589965772101e-06, + "loss": 0.6104, + "step": 1453 + }, + { + "epoch": 0.707112462006079, + "grad_norm": 0.07506950804808296, + "learning_rate": 9.264900620410722e-06, + "loss": 0.6014, + "step": 1454 + }, + { + "epoch": 0.7075987841945289, + "grad_norm": 0.07259189951541917, + "learning_rate": 9.263900957701191e-06, + "loss": 0.5499, + "step": 1455 + }, + { + "epoch": 0.7080851063829787, + "grad_norm": 0.07660514904475572, + "learning_rate": 9.262900669739003e-06, + "loss": 0.5951, + "step": 1456 + }, + { + "epoch": 0.7085714285714285, + "grad_norm": 0.07388875540870823, + "learning_rate": 9.26189975667084e-06, + "loss": 0.5852, + "step": 1457 + }, + { + "epoch": 0.7090577507598784, + "grad_norm": 0.07142243387914578, + "learning_rate": 9.260898218643475e-06, + "loss": 0.5663, + "step": 1458 + }, + { + "epoch": 0.7095440729483282, + "grad_norm": 0.0765504082170502, + "learning_rate": 9.259896055803772e-06, + "loss": 0.5872, + "step": 1459 + }, + { + "epoch": 0.7100303951367781, + "grad_norm": 0.07653204930742795, + "learning_rate": 9.258893268298685e-06, + "loss": 0.6205, + "step": 1460 + }, + { + "epoch": 0.7105167173252279, + "grad_norm": 0.06978368133165619, + "learning_rate": 9.257889856275266e-06, + "loss": 0.5601, + "step": 1461 + }, + { + "epoch": 0.7110030395136778, + "grad_norm": 0.07527087752028105, + "learning_rate": 9.25688581988065e-06, + "loss": 0.5855, + "step": 1462 + }, + { + "epoch": 0.7114893617021276, + "grad_norm": 0.07230819844467026, + "learning_rate": 9.255881159262067e-06, + "loss": 0.6071, + "step": 1463 + }, + { + "epoch": 0.7119756838905775, + "grad_norm": 0.07905349253913133, + "learning_rate": 9.254875874566844e-06, + "loss": 0.6097, + "step": 1464 + }, + { + "epoch": 0.7124620060790273, + "grad_norm": 0.07981563554797644, + "learning_rate": 9.25386996594239e-06, + "loss": 0.5821, + "step": 1465 + }, + { + "epoch": 0.7129483282674772, + "grad_norm": 0.08896425541342512, + "learning_rate": 9.25286343353621e-06, + "loss": 0.5818, + "step": 1466 + }, + { + "epoch": 0.713434650455927, + "grad_norm": 0.07943597993980359, + "learning_rate": 9.251856277495903e-06, + "loss": 0.5834, + "step": 1467 + }, + { + "epoch": 0.7139209726443769, + "grad_norm": 0.07754984722940073, + "learning_rate": 9.250848497969156e-06, + "loss": 0.6082, + "step": 1468 + }, + { + "epoch": 0.7144072948328267, + "grad_norm": 0.07254018381941152, + "learning_rate": 9.249840095103748e-06, + "loss": 0.603, + "step": 1469 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 0.0733075420079743, + "learning_rate": 9.248831069047551e-06, + "loss": 0.559, + "step": 1470 + }, + { + "epoch": 0.7153799392097264, + "grad_norm": 0.07266444657951242, + "learning_rate": 9.247821419948526e-06, + "loss": 0.5907, + "step": 1471 + }, + { + "epoch": 0.7158662613981763, + "grad_norm": 0.07705924353116533, + "learning_rate": 9.246811147954726e-06, + "loss": 0.5999, + "step": 1472 + }, + { + "epoch": 0.7163525835866261, + "grad_norm": 0.07533047665866212, + "learning_rate": 9.245800253214298e-06, + "loss": 0.5869, + "step": 1473 + }, + { + "epoch": 0.716838905775076, + "grad_norm": 0.0744974944358589, + "learning_rate": 9.244788735875477e-06, + "loss": 0.5708, + "step": 1474 + }, + { + "epoch": 0.7173252279635258, + "grad_norm": 0.07178464311652623, + "learning_rate": 9.243776596086591e-06, + "loss": 0.5975, + "step": 1475 + }, + { + "epoch": 0.7178115501519757, + "grad_norm": 0.07236143152879067, + "learning_rate": 9.242763833996058e-06, + "loss": 0.5904, + "step": 1476 + }, + { + "epoch": 0.7182978723404255, + "grad_norm": 0.07337170812927998, + "learning_rate": 9.241750449752388e-06, + "loss": 0.5879, + "step": 1477 + }, + { + "epoch": 0.7187841945288754, + "grad_norm": 0.07635888197606563, + "learning_rate": 9.240736443504184e-06, + "loss": 0.6021, + "step": 1478 + }, + { + "epoch": 0.7192705167173252, + "grad_norm": 0.07378321260219771, + "learning_rate": 9.239721815400136e-06, + "loss": 0.5678, + "step": 1479 + }, + { + "epoch": 0.7197568389057751, + "grad_norm": 0.07611041717504945, + "learning_rate": 9.238706565589029e-06, + "loss": 0.6214, + "step": 1480 + }, + { + "epoch": 0.7202431610942249, + "grad_norm": 0.0759542049940672, + "learning_rate": 9.237690694219739e-06, + "loss": 0.6274, + "step": 1481 + }, + { + "epoch": 0.7207294832826747, + "grad_norm": 0.08270812030201027, + "learning_rate": 9.23667420144123e-06, + "loss": 0.5587, + "step": 1482 + }, + { + "epoch": 0.7212158054711246, + "grad_norm": 0.07548350171759112, + "learning_rate": 9.235657087402561e-06, + "loss": 0.5403, + "step": 1483 + }, + { + "epoch": 0.7217021276595744, + "grad_norm": 0.07307205409205421, + "learning_rate": 9.234639352252878e-06, + "loss": 0.5763, + "step": 1484 + }, + { + "epoch": 0.7221884498480243, + "grad_norm": 0.07899770694583005, + "learning_rate": 9.233620996141421e-06, + "loss": 0.5796, + "step": 1485 + }, + { + "epoch": 0.7226747720364741, + "grad_norm": 0.0722503208501567, + "learning_rate": 9.232602019217523e-06, + "loss": 0.555, + "step": 1486 + }, + { + "epoch": 0.723161094224924, + "grad_norm": 0.0777770475318072, + "learning_rate": 9.231582421630601e-06, + "loss": 0.6055, + "step": 1487 + }, + { + "epoch": 0.7236474164133738, + "grad_norm": 0.07009950208335403, + "learning_rate": 9.230562203530171e-06, + "loss": 0.5407, + "step": 1488 + }, + { + "epoch": 0.7241337386018237, + "grad_norm": 0.07477081720472192, + "learning_rate": 9.229541365065834e-06, + "loss": 0.5421, + "step": 1489 + }, + { + "epoch": 0.7246200607902735, + "grad_norm": 0.0759906895921592, + "learning_rate": 9.228519906387287e-06, + "loss": 0.6108, + "step": 1490 + }, + { + "epoch": 0.7251063829787234, + "grad_norm": 0.0733455529361756, + "learning_rate": 9.227497827644313e-06, + "loss": 0.5619, + "step": 1491 + }, + { + "epoch": 0.7255927051671732, + "grad_norm": 0.08263305822583293, + "learning_rate": 9.22647512898679e-06, + "loss": 0.5921, + "step": 1492 + }, + { + "epoch": 0.7260790273556231, + "grad_norm": 0.07574909281930131, + "learning_rate": 9.225451810564683e-06, + "loss": 0.5839, + "step": 1493 + }, + { + "epoch": 0.7265653495440729, + "grad_norm": 0.07165287942370305, + "learning_rate": 9.224427872528051e-06, + "loss": 0.5818, + "step": 1494 + }, + { + "epoch": 0.7270516717325228, + "grad_norm": 0.07230920519373132, + "learning_rate": 9.223403315027044e-06, + "loss": 0.5493, + "step": 1495 + }, + { + "epoch": 0.7275379939209726, + "grad_norm": 0.07825297185910812, + "learning_rate": 9.2223781382119e-06, + "loss": 0.5845, + "step": 1496 + }, + { + "epoch": 0.7280243161094225, + "grad_norm": 0.10248125073170866, + "learning_rate": 9.22135234223295e-06, + "loss": 0.5989, + "step": 1497 + }, + { + "epoch": 0.7285106382978723, + "grad_norm": 0.07484499954746404, + "learning_rate": 9.220325927240617e-06, + "loss": 0.5904, + "step": 1498 + }, + { + "epoch": 0.7289969604863222, + "grad_norm": 0.07437512859206304, + "learning_rate": 9.21929889338541e-06, + "loss": 0.5816, + "step": 1499 + }, + { + "epoch": 0.729483282674772, + "grad_norm": 0.0732815561616446, + "learning_rate": 9.218271240817935e-06, + "loss": 0.5893, + "step": 1500 + }, + { + "epoch": 0.7299696048632219, + "grad_norm": 0.08304059906781426, + "learning_rate": 9.217242969688883e-06, + "loss": 0.5904, + "step": 1501 + }, + { + "epoch": 0.7304559270516717, + "grad_norm": 0.07563609614571871, + "learning_rate": 9.216214080149039e-06, + "loss": 0.5929, + "step": 1502 + }, + { + "epoch": 0.7309422492401216, + "grad_norm": 0.0674900711119595, + "learning_rate": 9.21518457234928e-06, + "loss": 0.5246, + "step": 1503 + }, + { + "epoch": 0.7314285714285714, + "grad_norm": 0.17283452029261448, + "learning_rate": 9.214154446440571e-06, + "loss": 0.616, + "step": 1504 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 0.07309989389637253, + "learning_rate": 9.213123702573964e-06, + "loss": 0.5547, + "step": 1505 + }, + { + "epoch": 0.7324012158054711, + "grad_norm": 0.07344874038511445, + "learning_rate": 9.212092340900613e-06, + "loss": 0.5959, + "step": 1506 + }, + { + "epoch": 0.7328875379939209, + "grad_norm": 0.07147587530509987, + "learning_rate": 9.21106036157175e-06, + "loss": 0.5607, + "step": 1507 + }, + { + "epoch": 0.7333738601823708, + "grad_norm": 0.07980585878521716, + "learning_rate": 9.210027764738704e-06, + "loss": 0.6112, + "step": 1508 + }, + { + "epoch": 0.7338601823708206, + "grad_norm": 0.07498692429958936, + "learning_rate": 9.208994550552894e-06, + "loss": 0.5861, + "step": 1509 + }, + { + "epoch": 0.7343465045592705, + "grad_norm": 0.07347339263737054, + "learning_rate": 9.207960719165832e-06, + "loss": 0.5493, + "step": 1510 + }, + { + "epoch": 0.7348328267477203, + "grad_norm": 0.0759092211741073, + "learning_rate": 9.206926270729112e-06, + "loss": 0.6228, + "step": 1511 + }, + { + "epoch": 0.7353191489361702, + "grad_norm": 0.07368383507571005, + "learning_rate": 9.205891205394429e-06, + "loss": 0.5753, + "step": 1512 + }, + { + "epoch": 0.73580547112462, + "grad_norm": 0.10975233474738065, + "learning_rate": 9.204855523313561e-06, + "loss": 0.6039, + "step": 1513 + }, + { + "epoch": 0.7362917933130699, + "grad_norm": 0.07828013689371244, + "learning_rate": 9.203819224638381e-06, + "loss": 0.6182, + "step": 1514 + }, + { + "epoch": 0.7367781155015197, + "grad_norm": 0.07398729106921355, + "learning_rate": 9.202782309520848e-06, + "loss": 0.6034, + "step": 1515 + }, + { + "epoch": 0.7372644376899696, + "grad_norm": 0.07464730833887104, + "learning_rate": 9.201744778113016e-06, + "loss": 0.5877, + "step": 1516 + }, + { + "epoch": 0.7377507598784194, + "grad_norm": 0.0759031077190833, + "learning_rate": 9.200706630567026e-06, + "loss": 0.6258, + "step": 1517 + }, + { + "epoch": 0.7382370820668693, + "grad_norm": 0.07669451336585603, + "learning_rate": 9.199667867035111e-06, + "loss": 0.6061, + "step": 1518 + }, + { + "epoch": 0.7387234042553191, + "grad_norm": 0.07328423972939507, + "learning_rate": 9.198628487669592e-06, + "loss": 0.5765, + "step": 1519 + }, + { + "epoch": 0.739209726443769, + "grad_norm": 0.07677780278507479, + "learning_rate": 9.197588492622887e-06, + "loss": 0.5501, + "step": 1520 + }, + { + "epoch": 0.7396960486322188, + "grad_norm": 0.07562906841082154, + "learning_rate": 9.196547882047493e-06, + "loss": 0.6453, + "step": 1521 + }, + { + "epoch": 0.7401823708206687, + "grad_norm": 0.07719961962985727, + "learning_rate": 9.195506656096009e-06, + "loss": 0.5944, + "step": 1522 + }, + { + "epoch": 0.7406686930091185, + "grad_norm": 0.07365637597557352, + "learning_rate": 9.194464814921116e-06, + "loss": 0.5952, + "step": 1523 + }, + { + "epoch": 0.7411550151975684, + "grad_norm": 0.07537542482795369, + "learning_rate": 9.19342235867559e-06, + "loss": 0.6007, + "step": 1524 + }, + { + "epoch": 0.7416413373860182, + "grad_norm": 0.07442924149828235, + "learning_rate": 9.192379287512294e-06, + "loss": 0.6462, + "step": 1525 + }, + { + "epoch": 0.7421276595744681, + "grad_norm": 0.07243963193138703, + "learning_rate": 9.191335601584184e-06, + "loss": 0.5503, + "step": 1526 + }, + { + "epoch": 0.7426139817629179, + "grad_norm": 0.07705782469231785, + "learning_rate": 9.190291301044303e-06, + "loss": 0.603, + "step": 1527 + }, + { + "epoch": 0.7431003039513678, + "grad_norm": 0.07215074604849654, + "learning_rate": 9.189246386045787e-06, + "loss": 0.5577, + "step": 1528 + }, + { + "epoch": 0.7435866261398176, + "grad_norm": 0.08000326095818983, + "learning_rate": 9.18820085674186e-06, + "loss": 0.5661, + "step": 1529 + }, + { + "epoch": 0.7440729483282674, + "grad_norm": 0.0749021802919331, + "learning_rate": 9.187154713285838e-06, + "loss": 0.5809, + "step": 1530 + }, + { + "epoch": 0.7445592705167173, + "grad_norm": 0.0736784956044312, + "learning_rate": 9.186107955831127e-06, + "loss": 0.5751, + "step": 1531 + }, + { + "epoch": 0.7450455927051671, + "grad_norm": 0.07678452495549294, + "learning_rate": 9.185060584531218e-06, + "loss": 0.6519, + "step": 1532 + }, + { + "epoch": 0.745531914893617, + "grad_norm": 0.07297220390267127, + "learning_rate": 9.1840125995397e-06, + "loss": 0.5475, + "step": 1533 + }, + { + "epoch": 0.7460182370820668, + "grad_norm": 0.07344202552366175, + "learning_rate": 9.182964001010248e-06, + "loss": 0.5823, + "step": 1534 + }, + { + "epoch": 0.7465045592705167, + "grad_norm": 0.07550904525865863, + "learning_rate": 9.181914789096625e-06, + "loss": 0.563, + "step": 1535 + }, + { + "epoch": 0.7469908814589665, + "grad_norm": 0.0761919034104001, + "learning_rate": 9.180864963952686e-06, + "loss": 0.6179, + "step": 1536 + }, + { + "epoch": 0.7474772036474164, + "grad_norm": 0.0707945206580333, + "learning_rate": 9.179814525732378e-06, + "loss": 0.5708, + "step": 1537 + }, + { + "epoch": 0.7479635258358662, + "grad_norm": 0.07331153646422277, + "learning_rate": 9.178763474589734e-06, + "loss": 0.6065, + "step": 1538 + }, + { + "epoch": 0.7484498480243161, + "grad_norm": 0.07577450992313728, + "learning_rate": 9.17771181067888e-06, + "loss": 0.6273, + "step": 1539 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 0.07710105110365151, + "learning_rate": 9.17665953415403e-06, + "loss": 0.5965, + "step": 1540 + }, + { + "epoch": 0.7494224924012158, + "grad_norm": 0.07191974951275368, + "learning_rate": 9.175606645169489e-06, + "loss": 0.5571, + "step": 1541 + }, + { + "epoch": 0.7499088145896656, + "grad_norm": 0.06956148797674722, + "learning_rate": 9.174553143879649e-06, + "loss": 0.5568, + "step": 1542 + }, + { + "epoch": 0.7499088145896656, + "eval_loss": 0.592545211315155, + "eval_runtime": 105.2769, + "eval_samples_per_second": 288.316, + "eval_steps_per_second": 36.048, + "step": 1542 + }, + { + "epoch": 0.7503951367781155, + "grad_norm": 0.07118021983808168, + "learning_rate": 9.173499030438996e-06, + "loss": 0.5947, + "step": 1543 + }, + { + "epoch": 0.7508814589665653, + "grad_norm": 0.07394858052720103, + "learning_rate": 9.172444305002105e-06, + "loss": 0.5583, + "step": 1544 + }, + { + "epoch": 0.7513677811550152, + "grad_norm": 0.07357842008611473, + "learning_rate": 9.171388967723638e-06, + "loss": 0.605, + "step": 1545 + }, + { + "epoch": 0.751854103343465, + "grad_norm": 0.07210513399010314, + "learning_rate": 9.170333018758345e-06, + "loss": 0.5773, + "step": 1546 + }, + { + "epoch": 0.7523404255319149, + "grad_norm": 0.07282937669966806, + "learning_rate": 9.169276458261075e-06, + "loss": 0.5857, + "step": 1547 + }, + { + "epoch": 0.7528267477203647, + "grad_norm": 0.07604656523395356, + "learning_rate": 9.168219286386757e-06, + "loss": 0.5669, + "step": 1548 + }, + { + "epoch": 0.7533130699088146, + "grad_norm": 0.07091346756396157, + "learning_rate": 9.167161503290414e-06, + "loss": 0.5809, + "step": 1549 + }, + { + "epoch": 0.7537993920972644, + "grad_norm": 0.0708038357037518, + "learning_rate": 9.166103109127158e-06, + "loss": 0.5677, + "step": 1550 + }, + { + "epoch": 0.7542857142857143, + "grad_norm": 0.07396265084406707, + "learning_rate": 9.16504410405219e-06, + "loss": 0.6011, + "step": 1551 + }, + { + "epoch": 0.7547720364741641, + "grad_norm": 0.07753680322894918, + "learning_rate": 9.1639844882208e-06, + "loss": 0.5704, + "step": 1552 + }, + { + "epoch": 0.7552583586626139, + "grad_norm": 0.07402147944062856, + "learning_rate": 9.162924261788372e-06, + "loss": 0.5799, + "step": 1553 + }, + { + "epoch": 0.7557446808510638, + "grad_norm": 0.0726343876310961, + "learning_rate": 9.161863424910373e-06, + "loss": 0.5552, + "step": 1554 + }, + { + "epoch": 0.7562310030395136, + "grad_norm": 0.07709517455087571, + "learning_rate": 9.160801977742364e-06, + "loss": 0.6229, + "step": 1555 + }, + { + "epoch": 0.7567173252279635, + "grad_norm": 0.07321368609324581, + "learning_rate": 9.159739920439994e-06, + "loss": 0.5615, + "step": 1556 + }, + { + "epoch": 0.7572036474164133, + "grad_norm": 0.07403176107096356, + "learning_rate": 9.158677253159003e-06, + "loss": 0.5952, + "step": 1557 + }, + { + "epoch": 0.7576899696048632, + "grad_norm": 0.07811411439216485, + "learning_rate": 9.157613976055216e-06, + "loss": 0.5982, + "step": 1558 + }, + { + "epoch": 0.758176291793313, + "grad_norm": 0.07055719503424134, + "learning_rate": 9.156550089284553e-06, + "loss": 0.5715, + "step": 1559 + }, + { + "epoch": 0.7586626139817629, + "grad_norm": 0.0713130750447445, + "learning_rate": 9.15548559300302e-06, + "loss": 0.5458, + "step": 1560 + }, + { + "epoch": 0.7591489361702127, + "grad_norm": 0.0735660614070684, + "learning_rate": 9.154420487366713e-06, + "loss": 0.5719, + "step": 1561 + }, + { + "epoch": 0.7596352583586626, + "grad_norm": 0.07046788817877819, + "learning_rate": 9.153354772531819e-06, + "loss": 0.5639, + "step": 1562 + }, + { + "epoch": 0.7601215805471124, + "grad_norm": 0.07052317679863633, + "learning_rate": 9.152288448654612e-06, + "loss": 0.5895, + "step": 1563 + }, + { + "epoch": 0.7606079027355623, + "grad_norm": 0.07196556264515612, + "learning_rate": 9.151221515891455e-06, + "loss": 0.6189, + "step": 1564 + }, + { + "epoch": 0.7610942249240121, + "grad_norm": 0.07100226559825275, + "learning_rate": 9.150153974398804e-06, + "loss": 0.573, + "step": 1565 + }, + { + "epoch": 0.761580547112462, + "grad_norm": 0.07656318474867924, + "learning_rate": 9.1490858243332e-06, + "loss": 0.5786, + "step": 1566 + }, + { + "epoch": 0.7620668693009118, + "grad_norm": 0.07185257662473404, + "learning_rate": 9.148017065851276e-06, + "loss": 0.5807, + "step": 1567 + }, + { + "epoch": 0.7625531914893617, + "grad_norm": 0.07661574876741803, + "learning_rate": 9.146947699109753e-06, + "loss": 0.6625, + "step": 1568 + }, + { + "epoch": 0.7630395136778115, + "grad_norm": 0.0706785318579316, + "learning_rate": 9.145877724265444e-06, + "loss": 0.5808, + "step": 1569 + }, + { + "epoch": 0.7635258358662614, + "grad_norm": 0.07030773844311922, + "learning_rate": 9.144807141475244e-06, + "loss": 0.5835, + "step": 1570 + }, + { + "epoch": 0.7640121580547112, + "grad_norm": 0.07597128889382836, + "learning_rate": 9.143735950896143e-06, + "loss": 0.5653, + "step": 1571 + }, + { + "epoch": 0.7644984802431611, + "grad_norm": 0.07157429771445756, + "learning_rate": 9.142664152685224e-06, + "loss": 0.5802, + "step": 1572 + }, + { + "epoch": 0.7649848024316109, + "grad_norm": 0.07562191397123438, + "learning_rate": 9.141591746999648e-06, + "loss": 0.6153, + "step": 1573 + }, + { + "epoch": 0.7654711246200608, + "grad_norm": 0.06858514426249837, + "learning_rate": 9.140518733996672e-06, + "loss": 0.5499, + "step": 1574 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 0.07216713056338284, + "learning_rate": 9.139445113833644e-06, + "loss": 0.5926, + "step": 1575 + }, + { + "epoch": 0.7664437689969605, + "grad_norm": 0.07359980233813823, + "learning_rate": 9.138370886667996e-06, + "loss": 0.6309, + "step": 1576 + }, + { + "epoch": 0.7669300911854103, + "grad_norm": 0.07164340392473688, + "learning_rate": 9.137296052657252e-06, + "loss": 0.5884, + "step": 1577 + }, + { + "epoch": 0.7674164133738601, + "grad_norm": 0.07284243924280691, + "learning_rate": 9.136220611959023e-06, + "loss": 0.5624, + "step": 1578 + }, + { + "epoch": 0.76790273556231, + "grad_norm": 0.07335630182395572, + "learning_rate": 9.135144564731012e-06, + "loss": 0.5797, + "step": 1579 + }, + { + "epoch": 0.7683890577507598, + "grad_norm": 0.07427718215377827, + "learning_rate": 9.134067911131008e-06, + "loss": 0.5773, + "step": 1580 + }, + { + "epoch": 0.7688753799392097, + "grad_norm": 0.07631953773449411, + "learning_rate": 9.13299065131689e-06, + "loss": 0.5912, + "step": 1581 + }, + { + "epoch": 0.7693617021276595, + "grad_norm": 0.07475723621154033, + "learning_rate": 9.131912785446628e-06, + "loss": 0.5787, + "step": 1582 + }, + { + "epoch": 0.7698480243161094, + "grad_norm": 0.06871487386066841, + "learning_rate": 9.130834313678275e-06, + "loss": 0.572, + "step": 1583 + }, + { + "epoch": 0.7703343465045592, + "grad_norm": 0.07118428741912614, + "learning_rate": 9.12975523616998e-06, + "loss": 0.5493, + "step": 1584 + }, + { + "epoch": 0.7708206686930091, + "grad_norm": 0.08795264332600707, + "learning_rate": 9.128675553079974e-06, + "loss": 0.592, + "step": 1585 + }, + { + "epoch": 0.7713069908814589, + "grad_norm": 0.07580482493684017, + "learning_rate": 9.127595264566584e-06, + "loss": 0.5726, + "step": 1586 + }, + { + "epoch": 0.7717933130699088, + "grad_norm": 0.07397115078670591, + "learning_rate": 9.12651437078822e-06, + "loss": 0.5885, + "step": 1587 + }, + { + "epoch": 0.7722796352583586, + "grad_norm": 0.0736965848383469, + "learning_rate": 9.125432871903383e-06, + "loss": 0.5564, + "step": 1588 + }, + { + "epoch": 0.7727659574468085, + "grad_norm": 0.07556312629489341, + "learning_rate": 9.124350768070664e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.7732522796352583, + "grad_norm": 0.0723278569344617, + "learning_rate": 9.123268059448738e-06, + "loss": 0.5773, + "step": 1590 + }, + { + "epoch": 0.7737386018237082, + "grad_norm": 0.07111236027265931, + "learning_rate": 9.122184746196375e-06, + "loss": 0.5606, + "step": 1591 + }, + { + "epoch": 0.774224924012158, + "grad_norm": 0.07314998368055502, + "learning_rate": 9.12110082847243e-06, + "loss": 0.5652, + "step": 1592 + }, + { + "epoch": 0.7747112462006079, + "grad_norm": 0.08077715198771841, + "learning_rate": 9.120016306435845e-06, + "loss": 0.5869, + "step": 1593 + }, + { + "epoch": 0.7751975683890577, + "grad_norm": 0.07637817839652904, + "learning_rate": 9.118931180245657e-06, + "loss": 0.584, + "step": 1594 + }, + { + "epoch": 0.7756838905775076, + "grad_norm": 0.09806405502936687, + "learning_rate": 9.117845450060983e-06, + "loss": 0.6143, + "step": 1595 + }, + { + "epoch": 0.7761702127659574, + "grad_norm": 0.0762660796563828, + "learning_rate": 9.116759116041037e-06, + "loss": 0.6288, + "step": 1596 + }, + { + "epoch": 0.7766565349544073, + "grad_norm": 0.07261781541959192, + "learning_rate": 9.115672178345111e-06, + "loss": 0.5465, + "step": 1597 + }, + { + "epoch": 0.7771428571428571, + "grad_norm": 0.0775499486740439, + "learning_rate": 9.114584637132601e-06, + "loss": 0.5818, + "step": 1598 + }, + { + "epoch": 0.777629179331307, + "grad_norm": 0.07576487990215618, + "learning_rate": 9.113496492562977e-06, + "loss": 0.5936, + "step": 1599 + }, + { + "epoch": 0.7781155015197568, + "grad_norm": 0.07689785922442594, + "learning_rate": 9.112407744795803e-06, + "loss": 0.6061, + "step": 1600 + }, + { + "epoch": 0.7786018237082067, + "grad_norm": 0.07581730234825507, + "learning_rate": 9.111318393990736e-06, + "loss": 0.5767, + "step": 1601 + }, + { + "epoch": 0.7790881458966565, + "grad_norm": 0.07666301558140577, + "learning_rate": 9.11022844030751e-06, + "loss": 0.5961, + "step": 1602 + }, + { + "epoch": 0.7795744680851063, + "grad_norm": 0.07670102786083799, + "learning_rate": 9.10913788390596e-06, + "loss": 0.6211, + "step": 1603 + }, + { + "epoch": 0.7800607902735562, + "grad_norm": 0.07367474284677104, + "learning_rate": 9.108046724946e-06, + "loss": 0.5916, + "step": 1604 + }, + { + "epoch": 0.780547112462006, + "grad_norm": 0.07180226798472734, + "learning_rate": 9.10695496358764e-06, + "loss": 0.5394, + "step": 1605 + }, + { + "epoch": 0.7810334346504559, + "grad_norm": 0.07318518812704777, + "learning_rate": 9.105862599990972e-06, + "loss": 0.5515, + "step": 1606 + }, + { + "epoch": 0.7815197568389057, + "grad_norm": 0.07806489299053469, + "learning_rate": 9.104769634316177e-06, + "loss": 0.6249, + "step": 1607 + }, + { + "epoch": 0.7820060790273556, + "grad_norm": 0.07190729959180565, + "learning_rate": 9.103676066723528e-06, + "loss": 0.5847, + "step": 1608 + }, + { + "epoch": 0.7824924012158054, + "grad_norm": 0.07142981952775516, + "learning_rate": 9.102581897373385e-06, + "loss": 0.5603, + "step": 1609 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 0.07191075091806294, + "learning_rate": 9.101487126426193e-06, + "loss": 0.5567, + "step": 1610 + }, + { + "epoch": 0.7834650455927051, + "grad_norm": 0.07345364503929963, + "learning_rate": 9.100391754042493e-06, + "loss": 0.5812, + "step": 1611 + }, + { + "epoch": 0.783951367781155, + "grad_norm": 0.07743056775911689, + "learning_rate": 9.099295780382904e-06, + "loss": 0.6485, + "step": 1612 + }, + { + "epoch": 0.7844376899696048, + "grad_norm": 0.07973017596861867, + "learning_rate": 9.098199205608138e-06, + "loss": 0.6669, + "step": 1613 + }, + { + "epoch": 0.7849240121580547, + "grad_norm": 0.0738459429073468, + "learning_rate": 9.097102029878998e-06, + "loss": 0.5714, + "step": 1614 + }, + { + "epoch": 0.7854103343465045, + "grad_norm": 0.07355050608545767, + "learning_rate": 9.096004253356369e-06, + "loss": 0.6277, + "step": 1615 + }, + { + "epoch": 0.7858966565349544, + "grad_norm": 0.07575738283900403, + "learning_rate": 9.09490587620123e-06, + "loss": 0.5462, + "step": 1616 + }, + { + "epoch": 0.7863829787234042, + "grad_norm": 0.07472663205739084, + "learning_rate": 9.093806898574647e-06, + "loss": 0.5594, + "step": 1617 + }, + { + "epoch": 0.7868693009118541, + "grad_norm": 0.07565495488852164, + "learning_rate": 9.092707320637769e-06, + "loss": 0.6028, + "step": 1618 + }, + { + "epoch": 0.7873556231003039, + "grad_norm": 0.07374015368399801, + "learning_rate": 9.091607142551839e-06, + "loss": 0.5813, + "step": 1619 + }, + { + "epoch": 0.7878419452887538, + "grad_norm": 0.07705942762753146, + "learning_rate": 9.090506364478183e-06, + "loss": 0.5623, + "step": 1620 + }, + { + "epoch": 0.7883282674772036, + "grad_norm": 0.07275002427389189, + "learning_rate": 9.089404986578221e-06, + "loss": 0.5622, + "step": 1621 + }, + { + "epoch": 0.7888145896656535, + "grad_norm": 0.07581372267510471, + "learning_rate": 9.088303009013454e-06, + "loss": 0.6029, + "step": 1622 + }, + { + "epoch": 0.7893009118541033, + "grad_norm": 0.073760722989789, + "learning_rate": 9.08720043194548e-06, + "loss": 0.5783, + "step": 1623 + }, + { + "epoch": 0.7897872340425532, + "grad_norm": 0.07138247123984222, + "learning_rate": 9.086097255535974e-06, + "loss": 0.5845, + "step": 1624 + }, + { + "epoch": 0.790273556231003, + "grad_norm": 0.07239373294843275, + "learning_rate": 9.084993479946706e-06, + "loss": 0.6144, + "step": 1625 + }, + { + "epoch": 0.7907598784194528, + "grad_norm": 0.0734432758313814, + "learning_rate": 9.083889105339532e-06, + "loss": 0.5636, + "step": 1626 + }, + { + "epoch": 0.7912462006079027, + "grad_norm": 0.07684721680070418, + "learning_rate": 9.082784131876398e-06, + "loss": 0.5955, + "step": 1627 + }, + { + "epoch": 0.7917325227963525, + "grad_norm": 0.0734344558953088, + "learning_rate": 9.081678559719334e-06, + "loss": 0.5764, + "step": 1628 + }, + { + "epoch": 0.7922188449848024, + "grad_norm": 0.07336420163099079, + "learning_rate": 9.080572389030458e-06, + "loss": 0.5714, + "step": 1629 + }, + { + "epoch": 0.7927051671732522, + "grad_norm": 0.07518446872847741, + "learning_rate": 9.079465619971979e-06, + "loss": 0.5727, + "step": 1630 + }, + { + "epoch": 0.7931914893617021, + "grad_norm": 0.07557529988370244, + "learning_rate": 9.078358252706194e-06, + "loss": 0.5866, + "step": 1631 + }, + { + "epoch": 0.7936778115501519, + "grad_norm": 0.07924937651398163, + "learning_rate": 9.077250287395482e-06, + "loss": 0.5831, + "step": 1632 + }, + { + "epoch": 0.7941641337386018, + "grad_norm": 0.07111948359091794, + "learning_rate": 9.07614172420232e-06, + "loss": 0.5594, + "step": 1633 + }, + { + "epoch": 0.7946504559270516, + "grad_norm": 0.07511669959189178, + "learning_rate": 9.075032563289256e-06, + "loss": 0.5751, + "step": 1634 + }, + { + "epoch": 0.7951367781155015, + "grad_norm": 0.07439036106895336, + "learning_rate": 9.073922804818944e-06, + "loss": 0.5805, + "step": 1635 + }, + { + "epoch": 0.7956231003039513, + "grad_norm": 0.07181786016281136, + "learning_rate": 9.072812448954117e-06, + "loss": 0.5591, + "step": 1636 + }, + { + "epoch": 0.7961094224924012, + "grad_norm": 0.07878442540853178, + "learning_rate": 9.071701495857593e-06, + "loss": 0.5561, + "step": 1637 + }, + { + "epoch": 0.796595744680851, + "grad_norm": 0.07369341255058924, + "learning_rate": 9.070589945692281e-06, + "loss": 0.5723, + "step": 1638 + }, + { + "epoch": 0.7970820668693009, + "grad_norm": 0.07196748777154602, + "learning_rate": 9.069477798621178e-06, + "loss": 0.5796, + "step": 1639 + }, + { + "epoch": 0.7975683890577507, + "grad_norm": 0.07863094262229525, + "learning_rate": 9.068365054807369e-06, + "loss": 0.6029, + "step": 1640 + }, + { + "epoch": 0.7980547112462006, + "grad_norm": 0.07236803516073441, + "learning_rate": 9.067251714414023e-06, + "loss": 0.6115, + "step": 1641 + }, + { + "epoch": 0.7985410334346504, + "grad_norm": 0.07651305023013237, + "learning_rate": 9.0661377776044e-06, + "loss": 0.6132, + "step": 1642 + }, + { + "epoch": 0.7990273556231003, + "grad_norm": 0.07015096359229805, + "learning_rate": 9.065023244541846e-06, + "loss": 0.5745, + "step": 1643 + }, + { + "epoch": 0.7995136778115501, + "grad_norm": 0.07287519876258483, + "learning_rate": 9.063908115389794e-06, + "loss": 0.5588, + "step": 1644 + }, + { + "epoch": 0.8, + "grad_norm": 0.07328509906921751, + "learning_rate": 9.062792390311768e-06, + "loss": 0.5725, + "step": 1645 + }, + { + "epoch": 0.8004863221884498, + "grad_norm": 0.07523232831435116, + "learning_rate": 9.061676069471372e-06, + "loss": 0.6283, + "step": 1646 + }, + { + "epoch": 0.8009726443768997, + "grad_norm": 0.07248796754762984, + "learning_rate": 9.060559153032305e-06, + "loss": 0.5724, + "step": 1647 + }, + { + "epoch": 0.8014589665653495, + "grad_norm": 0.07389920446623921, + "learning_rate": 9.059441641158348e-06, + "loss": 0.5958, + "step": 1648 + }, + { + "epoch": 0.8019452887537994, + "grad_norm": 0.0691834517848783, + "learning_rate": 9.05832353401337e-06, + "loss": 0.5635, + "step": 1649 + }, + { + "epoch": 0.8024316109422492, + "grad_norm": 0.087411497760545, + "learning_rate": 9.057204831761334e-06, + "loss": 0.5707, + "step": 1650 + }, + { + "epoch": 0.802917933130699, + "grad_norm": 0.07525238297349253, + "learning_rate": 9.056085534566283e-06, + "loss": 0.5705, + "step": 1651 + }, + { + "epoch": 0.8034042553191489, + "grad_norm": 0.07262995236326107, + "learning_rate": 9.054965642592346e-06, + "loss": 0.5645, + "step": 1652 + }, + { + "epoch": 0.8038905775075987, + "grad_norm": 0.07417723618128876, + "learning_rate": 9.053845156003746e-06, + "loss": 0.5725, + "step": 1653 + }, + { + "epoch": 0.8043768996960486, + "grad_norm": 0.07106478714913289, + "learning_rate": 9.052724074964789e-06, + "loss": 0.5464, + "step": 1654 + }, + { + "epoch": 0.8048632218844984, + "grad_norm": 0.07773557497453094, + "learning_rate": 9.051602399639867e-06, + "loss": 0.613, + "step": 1655 + }, + { + "epoch": 0.8053495440729483, + "grad_norm": 0.07166948375181545, + "learning_rate": 9.050480130193461e-06, + "loss": 0.5715, + "step": 1656 + }, + { + "epoch": 0.8058358662613981, + "grad_norm": 0.07029399058062628, + "learning_rate": 9.049357266790143e-06, + "loss": 0.5864, + "step": 1657 + }, + { + "epoch": 0.806322188449848, + "grad_norm": 0.07442190646574257, + "learning_rate": 9.048233809594561e-06, + "loss": 0.6446, + "step": 1658 + }, + { + "epoch": 0.8068085106382978, + "grad_norm": 0.07440493159559092, + "learning_rate": 9.047109758771467e-06, + "loss": 0.5952, + "step": 1659 + }, + { + "epoch": 0.8072948328267477, + "grad_norm": 0.07314544206796571, + "learning_rate": 9.04598511448568e-06, + "loss": 0.5859, + "step": 1660 + }, + { + "epoch": 0.8077811550151975, + "grad_norm": 0.06861782040904005, + "learning_rate": 9.044859876902124e-06, + "loss": 0.5673, + "step": 1661 + }, + { + "epoch": 0.8082674772036474, + "grad_norm": 0.07227060176254103, + "learning_rate": 9.043734046185799e-06, + "loss": 0.5728, + "step": 1662 + }, + { + "epoch": 0.8087537993920972, + "grad_norm": 0.07117328740849665, + "learning_rate": 9.042607622501794e-06, + "loss": 0.5513, + "step": 1663 + }, + { + "epoch": 0.8092401215805471, + "grad_norm": 0.07466451582311796, + "learning_rate": 9.04148060601529e-06, + "loss": 0.5771, + "step": 1664 + }, + { + "epoch": 0.8097264437689969, + "grad_norm": 0.07409395764568921, + "learning_rate": 9.040352996891549e-06, + "loss": 0.5923, + "step": 1665 + }, + { + "epoch": 0.8102127659574468, + "grad_norm": 0.0727421232905243, + "learning_rate": 9.039224795295923e-06, + "loss": 0.5967, + "step": 1666 + }, + { + "epoch": 0.8106990881458966, + "grad_norm": 0.07328631324290474, + "learning_rate": 9.038096001393847e-06, + "loss": 0.6252, + "step": 1667 + }, + { + "epoch": 0.8111854103343465, + "grad_norm": 0.07561124672077546, + "learning_rate": 9.036966615350848e-06, + "loss": 0.5784, + "step": 1668 + }, + { + "epoch": 0.8116717325227963, + "grad_norm": 0.07346062434985712, + "learning_rate": 9.03583663733254e-06, + "loss": 0.584, + "step": 1669 + }, + { + "epoch": 0.8121580547112462, + "grad_norm": 0.07940070889841651, + "learning_rate": 9.034706067504618e-06, + "loss": 0.5788, + "step": 1670 + }, + { + "epoch": 0.812644376899696, + "grad_norm": 0.07187568885659784, + "learning_rate": 9.033574906032866e-06, + "loss": 0.5919, + "step": 1671 + }, + { + "epoch": 0.813130699088146, + "grad_norm": 0.06908340343975836, + "learning_rate": 9.032443153083163e-06, + "loss": 0.5803, + "step": 1672 + }, + { + "epoch": 0.8136170212765957, + "grad_norm": 0.07355584516555978, + "learning_rate": 9.03131080882146e-06, + "loss": 0.583, + "step": 1673 + }, + { + "epoch": 0.8141033434650456, + "grad_norm": 0.07423856338194974, + "learning_rate": 9.030177873413806e-06, + "loss": 0.5371, + "step": 1674 + }, + { + "epoch": 0.8145896656534954, + "grad_norm": 0.0731960037760341, + "learning_rate": 9.029044347026332e-06, + "loss": 0.5673, + "step": 1675 + }, + { + "epoch": 0.8150759878419452, + "grad_norm": 0.08038420495259692, + "learning_rate": 9.02791022982526e-06, + "loss": 0.5931, + "step": 1676 + }, + { + "epoch": 0.8155623100303951, + "grad_norm": 0.07341009886182812, + "learning_rate": 9.02677552197689e-06, + "loss": 0.6173, + "step": 1677 + }, + { + "epoch": 0.8160486322188449, + "grad_norm": 0.07212165853233529, + "learning_rate": 9.025640223647616e-06, + "loss": 0.5804, + "step": 1678 + }, + { + "epoch": 0.8165349544072948, + "grad_norm": 0.07438445712530184, + "learning_rate": 9.024504335003918e-06, + "loss": 0.568, + "step": 1679 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 0.07745216994949357, + "learning_rate": 9.023367856212362e-06, + "loss": 0.5893, + "step": 1680 + }, + { + "epoch": 0.8175075987841945, + "grad_norm": 0.07782732242653724, + "learning_rate": 9.022230787439597e-06, + "loss": 0.5706, + "step": 1681 + }, + { + "epoch": 0.8179939209726443, + "grad_norm": 0.07399645125005298, + "learning_rate": 9.021093128852363e-06, + "loss": 0.5641, + "step": 1682 + }, + { + "epoch": 0.8184802431610942, + "grad_norm": 0.07774936464533533, + "learning_rate": 9.019954880617486e-06, + "loss": 0.5965, + "step": 1683 + }, + { + "epoch": 0.818966565349544, + "grad_norm": 0.07350171443380829, + "learning_rate": 9.018816042901873e-06, + "loss": 0.5888, + "step": 1684 + }, + { + "epoch": 0.819452887537994, + "grad_norm": 0.07296282203618629, + "learning_rate": 9.017676615872524e-06, + "loss": 0.578, + "step": 1685 + }, + { + "epoch": 0.8199392097264437, + "grad_norm": 0.07458383558568267, + "learning_rate": 9.016536599696524e-06, + "loss": 0.5937, + "step": 1686 + }, + { + "epoch": 0.8204255319148936, + "grad_norm": 0.07516328495212585, + "learning_rate": 9.015395994541041e-06, + "loss": 0.6027, + "step": 1687 + }, + { + "epoch": 0.8209118541033434, + "grad_norm": 0.07760485343942312, + "learning_rate": 9.014254800573334e-06, + "loss": 0.5547, + "step": 1688 + }, + { + "epoch": 0.8213981762917933, + "grad_norm": 0.07485184169756832, + "learning_rate": 9.013113017960747e-06, + "loss": 0.5717, + "step": 1689 + }, + { + "epoch": 0.8218844984802431, + "grad_norm": 0.07375602770450967, + "learning_rate": 9.011970646870706e-06, + "loss": 0.5789, + "step": 1690 + }, + { + "epoch": 0.822370820668693, + "grad_norm": 0.07592702782230341, + "learning_rate": 9.01082768747073e-06, + "loss": 0.5876, + "step": 1691 + }, + { + "epoch": 0.8228571428571428, + "grad_norm": 0.07709204242085249, + "learning_rate": 9.009684139928419e-06, + "loss": 0.5919, + "step": 1692 + }, + { + "epoch": 0.8233434650455927, + "grad_norm": 0.07344620360493821, + "learning_rate": 9.00854000441146e-06, + "loss": 0.5738, + "step": 1693 + }, + { + "epoch": 0.8238297872340425, + "grad_norm": 0.07839384699549824, + "learning_rate": 9.007395281087632e-06, + "loss": 0.5888, + "step": 1694 + }, + { + "epoch": 0.8243161094224924, + "grad_norm": 0.07553135863620744, + "learning_rate": 9.006249970124793e-06, + "loss": 0.6127, + "step": 1695 + }, + { + "epoch": 0.8248024316109422, + "grad_norm": 0.07187484456694787, + "learning_rate": 9.005104071690887e-06, + "loss": 0.5475, + "step": 1696 + }, + { + "epoch": 0.8252887537993921, + "grad_norm": 0.07390033022275086, + "learning_rate": 9.00395758595395e-06, + "loss": 0.5847, + "step": 1697 + }, + { + "epoch": 0.8257750759878419, + "grad_norm": 0.07247525563929644, + "learning_rate": 9.002810513082104e-06, + "loss": 0.5889, + "step": 1698 + }, + { + "epoch": 0.8262613981762917, + "grad_norm": 0.0772085413640557, + "learning_rate": 9.00166285324355e-06, + "loss": 0.6176, + "step": 1699 + }, + { + "epoch": 0.8267477203647416, + "grad_norm": 0.08567194868303393, + "learning_rate": 9.00051460660658e-06, + "loss": 0.6075, + "step": 1700 + }, + { + "epoch": 0.8272340425531914, + "grad_norm": 0.07746542664192362, + "learning_rate": 8.999365773339573e-06, + "loss": 0.5719, + "step": 1701 + }, + { + "epoch": 0.8277203647416413, + "grad_norm": 0.07266215749314871, + "learning_rate": 8.998216353610989e-06, + "loss": 0.5798, + "step": 1702 + }, + { + "epoch": 0.8282066869300911, + "grad_norm": 0.07253835215721129, + "learning_rate": 8.99706634758938e-06, + "loss": 0.5878, + "step": 1703 + }, + { + "epoch": 0.828693009118541, + "grad_norm": 0.07824890088111018, + "learning_rate": 8.995915755443382e-06, + "loss": 0.6092, + "step": 1704 + }, + { + "epoch": 0.8291793313069908, + "grad_norm": 0.0760099245256956, + "learning_rate": 8.994764577341715e-06, + "loss": 0.6274, + "step": 1705 + }, + { + "epoch": 0.8296656534954407, + "grad_norm": 0.07046188661348363, + "learning_rate": 8.993612813453186e-06, + "loss": 0.5615, + "step": 1706 + }, + { + "epoch": 0.8301519756838905, + "grad_norm": 0.07280733043950458, + "learning_rate": 8.992460463946689e-06, + "loss": 0.617, + "step": 1707 + }, + { + "epoch": 0.8306382978723404, + "grad_norm": 0.07386435769644766, + "learning_rate": 8.9913075289912e-06, + "loss": 0.5753, + "step": 1708 + }, + { + "epoch": 0.8311246200607902, + "grad_norm": 0.07236056195150549, + "learning_rate": 8.99015400875579e-06, + "loss": 0.5633, + "step": 1709 + }, + { + "epoch": 0.8316109422492401, + "grad_norm": 0.07542819798329134, + "learning_rate": 8.988999903409604e-06, + "loss": 0.5836, + "step": 1710 + }, + { + "epoch": 0.8320972644376899, + "grad_norm": 0.07747168818735184, + "learning_rate": 8.987845213121879e-06, + "loss": 0.6039, + "step": 1711 + }, + { + "epoch": 0.8325835866261398, + "grad_norm": 0.077752119336714, + "learning_rate": 8.986689938061938e-06, + "loss": 0.6165, + "step": 1712 + }, + { + "epoch": 0.8330699088145896, + "grad_norm": 0.07806058042030259, + "learning_rate": 8.985534078399191e-06, + "loss": 0.6548, + "step": 1713 + }, + { + "epoch": 0.8335562310030395, + "grad_norm": 0.07553643279218694, + "learning_rate": 8.98437763430313e-06, + "loss": 0.6244, + "step": 1714 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 0.07671067908323834, + "learning_rate": 8.983220605943335e-06, + "loss": 0.6183, + "step": 1715 + }, + { + "epoch": 0.8345288753799393, + "grad_norm": 0.07741727677689454, + "learning_rate": 8.98206299348947e-06, + "loss": 0.6228, + "step": 1716 + }, + { + "epoch": 0.835015197568389, + "grad_norm": 0.07216911966672036, + "learning_rate": 8.980904797111287e-06, + "loss": 0.5604, + "step": 1717 + }, + { + "epoch": 0.835501519756839, + "grad_norm": 0.07440058651270015, + "learning_rate": 8.97974601697862e-06, + "loss": 0.5811, + "step": 1718 + }, + { + "epoch": 0.8359878419452887, + "grad_norm": 0.0746064900492746, + "learning_rate": 8.978586653261395e-06, + "loss": 0.589, + "step": 1719 + }, + { + "epoch": 0.8364741641337387, + "grad_norm": 0.07235270763835797, + "learning_rate": 8.977426706129615e-06, + "loss": 0.5899, + "step": 1720 + }, + { + "epoch": 0.8369604863221884, + "grad_norm": 0.0752546348466607, + "learning_rate": 8.976266175753376e-06, + "loss": 0.6022, + "step": 1721 + }, + { + "epoch": 0.8374468085106384, + "grad_norm": 0.0748912661462555, + "learning_rate": 8.975105062302856e-06, + "loss": 0.6094, + "step": 1722 + }, + { + "epoch": 0.8379331306990881, + "grad_norm": 0.0733073753146529, + "learning_rate": 8.973943365948318e-06, + "loss": 0.5615, + "step": 1723 + }, + { + "epoch": 0.8384194528875379, + "grad_norm": 0.0734476230232227, + "learning_rate": 8.972781086860115e-06, + "loss": 0.5753, + "step": 1724 + }, + { + "epoch": 0.8389057750759878, + "grad_norm": 0.07633658867437207, + "learning_rate": 8.971618225208678e-06, + "loss": 0.6667, + "step": 1725 + }, + { + "epoch": 0.8393920972644376, + "grad_norm": 0.07541740130586612, + "learning_rate": 8.970454781164529e-06, + "loss": 0.5845, + "step": 1726 + }, + { + "epoch": 0.8398784194528875, + "grad_norm": 0.07414936120986003, + "learning_rate": 8.969290754898272e-06, + "loss": 0.5696, + "step": 1727 + }, + { + "epoch": 0.8403647416413373, + "grad_norm": 0.07226168570528396, + "learning_rate": 8.968126146580602e-06, + "loss": 0.5913, + "step": 1728 + }, + { + "epoch": 0.8408510638297872, + "grad_norm": 0.07042620815515654, + "learning_rate": 8.966960956382293e-06, + "loss": 0.5856, + "step": 1729 + }, + { + "epoch": 0.841337386018237, + "grad_norm": 0.07296803356505525, + "learning_rate": 8.965795184474209e-06, + "loss": 0.6134, + "step": 1730 + }, + { + "epoch": 0.841823708206687, + "grad_norm": 0.07427100045337967, + "learning_rate": 8.964628831027296e-06, + "loss": 0.5921, + "step": 1731 + }, + { + "epoch": 0.8423100303951367, + "grad_norm": 0.07245783357819416, + "learning_rate": 8.963461896212585e-06, + "loss": 0.5937, + "step": 1732 + }, + { + "epoch": 0.8427963525835866, + "grad_norm": 0.0761264904307955, + "learning_rate": 8.962294380201195e-06, + "loss": 0.5871, + "step": 1733 + }, + { + "epoch": 0.8432826747720364, + "grad_norm": 0.07544877652570996, + "learning_rate": 8.961126283164328e-06, + "loss": 0.5959, + "step": 1734 + }, + { + "epoch": 0.8437689969604864, + "grad_norm": 0.07599806425823115, + "learning_rate": 8.959957605273274e-06, + "loss": 0.5895, + "step": 1735 + }, + { + "epoch": 0.8442553191489361, + "grad_norm": 0.0770339060887551, + "learning_rate": 8.958788346699405e-06, + "loss": 0.5711, + "step": 1736 + }, + { + "epoch": 0.844741641337386, + "grad_norm": 0.07250882644105865, + "learning_rate": 8.957618507614182e-06, + "loss": 0.5773, + "step": 1737 + }, + { + "epoch": 0.8452279635258358, + "grad_norm": 0.07002626608661978, + "learning_rate": 8.956448088189144e-06, + "loss": 0.5506, + "step": 1738 + }, + { + "epoch": 0.8457142857142858, + "grad_norm": 0.07628888894734462, + "learning_rate": 8.955277088595924e-06, + "loss": 0.605, + "step": 1739 + }, + { + "epoch": 0.8462006079027355, + "grad_norm": 0.07702051631590402, + "learning_rate": 8.954105509006235e-06, + "loss": 0.6031, + "step": 1740 + }, + { + "epoch": 0.8466869300911855, + "grad_norm": 0.07260300281865754, + "learning_rate": 8.952933349591872e-06, + "loss": 0.5501, + "step": 1741 + }, + { + "epoch": 0.8471732522796352, + "grad_norm": 0.07389498738809527, + "learning_rate": 8.951760610524725e-06, + "loss": 0.6181, + "step": 1742 + }, + { + "epoch": 0.8476595744680852, + "grad_norm": 0.07062573382571884, + "learning_rate": 8.950587291976758e-06, + "loss": 0.5402, + "step": 1743 + }, + { + "epoch": 0.848145896656535, + "grad_norm": 0.07350470206207332, + "learning_rate": 8.949413394120026e-06, + "loss": 0.5595, + "step": 1744 + }, + { + "epoch": 0.8486322188449849, + "grad_norm": 0.07699941052844284, + "learning_rate": 8.94823891712667e-06, + "loss": 0.5917, + "step": 1745 + }, + { + "epoch": 0.8491185410334346, + "grad_norm": 0.0741770418789494, + "learning_rate": 8.94706386116891e-06, + "loss": 0.5941, + "step": 1746 + }, + { + "epoch": 0.8496048632218844, + "grad_norm": 0.0759476218643951, + "learning_rate": 8.945888226419056e-06, + "loss": 0.6261, + "step": 1747 + }, + { + "epoch": 0.8500911854103343, + "grad_norm": 0.07298008462158603, + "learning_rate": 8.944712013049505e-06, + "loss": 0.5632, + "step": 1748 + }, + { + "epoch": 0.8505775075987841, + "grad_norm": 0.07334761932590854, + "learning_rate": 8.943535221232731e-06, + "loss": 0.5938, + "step": 1749 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 0.08062886925564591, + "learning_rate": 8.9423578511413e-06, + "loss": 0.6011, + "step": 1750 + }, + { + "epoch": 0.8515501519756838, + "grad_norm": 0.07387128105610562, + "learning_rate": 8.941179902947856e-06, + "loss": 0.5856, + "step": 1751 + }, + { + "epoch": 0.8520364741641338, + "grad_norm": 0.07063102885367174, + "learning_rate": 8.940001376825136e-06, + "loss": 0.567, + "step": 1752 + }, + { + "epoch": 0.8525227963525835, + "grad_norm": 0.07809396720464015, + "learning_rate": 8.938822272945956e-06, + "loss": 0.647, + "step": 1753 + }, + { + "epoch": 0.8530091185410335, + "grad_norm": 0.07265612064070369, + "learning_rate": 8.937642591483218e-06, + "loss": 0.5822, + "step": 1754 + }, + { + "epoch": 0.8534954407294832, + "grad_norm": 0.07511471917960437, + "learning_rate": 8.936462332609907e-06, + "loss": 0.6235, + "step": 1755 + }, + { + "epoch": 0.8539817629179332, + "grad_norm": 0.07179712358551885, + "learning_rate": 8.935281496499098e-06, + "loss": 0.6184, + "step": 1756 + }, + { + "epoch": 0.854468085106383, + "grad_norm": 0.07384253094153809, + "learning_rate": 8.934100083323945e-06, + "loss": 0.6146, + "step": 1757 + }, + { + "epoch": 0.8549544072948329, + "grad_norm": 0.07414507661436152, + "learning_rate": 8.93291809325769e-06, + "loss": 0.6134, + "step": 1758 + }, + { + "epoch": 0.8554407294832826, + "grad_norm": 0.07291158653408171, + "learning_rate": 8.931735526473657e-06, + "loss": 0.6224, + "step": 1759 + }, + { + "epoch": 0.8559270516717326, + "grad_norm": 0.07800586892031393, + "learning_rate": 8.93055238314526e-06, + "loss": 0.577, + "step": 1760 + }, + { + "epoch": 0.8564133738601823, + "grad_norm": 0.07381345121072054, + "learning_rate": 8.929368663445985e-06, + "loss": 0.597, + "step": 1761 + }, + { + "epoch": 0.8568996960486323, + "grad_norm": 0.07313500176625996, + "learning_rate": 8.92818436754942e-06, + "loss": 0.5374, + "step": 1762 + }, + { + "epoch": 0.857386018237082, + "grad_norm": 0.07754940049868893, + "learning_rate": 8.926999495629225e-06, + "loss": 0.5767, + "step": 1763 + }, + { + "epoch": 0.857872340425532, + "grad_norm": 0.07685640896181924, + "learning_rate": 8.925814047859147e-06, + "loss": 0.6029, + "step": 1764 + }, + { + "epoch": 0.8583586626139817, + "grad_norm": 0.08123464359078096, + "learning_rate": 8.92462802441302e-06, + "loss": 0.615, + "step": 1765 + }, + { + "epoch": 0.8588449848024317, + "grad_norm": 0.07521549285135744, + "learning_rate": 8.92344142546476e-06, + "loss": 0.5603, + "step": 1766 + }, + { + "epoch": 0.8593313069908814, + "grad_norm": 0.07407625619133144, + "learning_rate": 8.92225425118837e-06, + "loss": 0.5616, + "step": 1767 + }, + { + "epoch": 0.8598176291793314, + "grad_norm": 0.07479406319718562, + "learning_rate": 8.92106650175793e-06, + "loss": 0.5815, + "step": 1768 + }, + { + "epoch": 0.8603039513677812, + "grad_norm": 0.0775809123259529, + "learning_rate": 8.919878177347619e-06, + "loss": 0.5831, + "step": 1769 + }, + { + "epoch": 0.8607902735562311, + "grad_norm": 0.07290792413533646, + "learning_rate": 8.918689278131684e-06, + "loss": 0.5787, + "step": 1770 + }, + { + "epoch": 0.8612765957446809, + "grad_norm": 0.07430533803119772, + "learning_rate": 8.917499804284466e-06, + "loss": 0.607, + "step": 1771 + }, + { + "epoch": 0.8617629179331306, + "grad_norm": 0.07137457198774201, + "learning_rate": 8.91630975598039e-06, + "loss": 0.616, + "step": 1772 + }, + { + "epoch": 0.8622492401215806, + "grad_norm": 0.0732224722333857, + "learning_rate": 8.91511913339396e-06, + "loss": 0.5953, + "step": 1773 + }, + { + "epoch": 0.8627355623100303, + "grad_norm": 0.07713203812817963, + "learning_rate": 8.913927936699765e-06, + "loss": 0.5919, + "step": 1774 + }, + { + "epoch": 0.8632218844984803, + "grad_norm": 0.07125787615846159, + "learning_rate": 8.912736166072487e-06, + "loss": 0.5611, + "step": 1775 + }, + { + "epoch": 0.86370820668693, + "grad_norm": 0.0790858738192093, + "learning_rate": 8.91154382168688e-06, + "loss": 0.6164, + "step": 1776 + }, + { + "epoch": 0.86419452887538, + "grad_norm": 0.07457881168675248, + "learning_rate": 8.910350903717793e-06, + "loss": 0.5762, + "step": 1777 + }, + { + "epoch": 0.8646808510638297, + "grad_norm": 0.07382250428158418, + "learning_rate": 8.90915741234015e-06, + "loss": 0.5919, + "step": 1778 + }, + { + "epoch": 0.8651671732522797, + "grad_norm": 0.07145584394044313, + "learning_rate": 8.907963347728964e-06, + "loss": 0.5835, + "step": 1779 + }, + { + "epoch": 0.8656534954407294, + "grad_norm": 0.07250899102574372, + "learning_rate": 8.90676871005933e-06, + "loss": 0.5662, + "step": 1780 + }, + { + "epoch": 0.8661398176291794, + "grad_norm": 0.07389342039749239, + "learning_rate": 8.90557349950643e-06, + "loss": 0.5811, + "step": 1781 + }, + { + "epoch": 0.8666261398176291, + "grad_norm": 0.0755531278875473, + "learning_rate": 8.904377716245525e-06, + "loss": 0.5741, + "step": 1782 + }, + { + "epoch": 0.867112462006079, + "grad_norm": 0.07300211455686316, + "learning_rate": 8.903181360451966e-06, + "loss": 0.5679, + "step": 1783 + }, + { + "epoch": 0.8675987841945288, + "grad_norm": 0.07375435548275153, + "learning_rate": 8.901984432301185e-06, + "loss": 0.5664, + "step": 1784 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 0.07192587016348345, + "learning_rate": 8.900786931968696e-06, + "loss": 0.594, + "step": 1785 + }, + { + "epoch": 0.8685714285714285, + "grad_norm": 0.07245213201990867, + "learning_rate": 8.899588859630102e-06, + "loss": 0.553, + "step": 1786 + }, + { + "epoch": 0.8690577507598785, + "grad_norm": 0.0794118055625682, + "learning_rate": 8.89839021546108e-06, + "loss": 0.5478, + "step": 1787 + }, + { + "epoch": 0.8695440729483283, + "grad_norm": 0.07104283997778915, + "learning_rate": 8.897190999637406e-06, + "loss": 0.5407, + "step": 1788 + }, + { + "epoch": 0.8700303951367782, + "grad_norm": 0.07308588559191222, + "learning_rate": 8.895991212334927e-06, + "loss": 0.594, + "step": 1789 + }, + { + "epoch": 0.870516717325228, + "grad_norm": 0.07813308134028296, + "learning_rate": 8.894790853729577e-06, + "loss": 0.555, + "step": 1790 + }, + { + "epoch": 0.8710030395136779, + "grad_norm": 0.07810939367623708, + "learning_rate": 8.893589923997379e-06, + "loss": 0.5732, + "step": 1791 + }, + { + "epoch": 0.8714893617021277, + "grad_norm": 0.07322235690887234, + "learning_rate": 8.892388423314431e-06, + "loss": 0.5763, + "step": 1792 + }, + { + "epoch": 0.8719756838905776, + "grad_norm": 0.07304298460722047, + "learning_rate": 8.891186351856923e-06, + "loss": 0.57, + "step": 1793 + }, + { + "epoch": 0.8724620060790274, + "grad_norm": 0.07747438261036639, + "learning_rate": 8.889983709801123e-06, + "loss": 0.6369, + "step": 1794 + }, + { + "epoch": 0.8729483282674773, + "grad_norm": 0.07512815424485461, + "learning_rate": 8.888780497323386e-06, + "loss": 0.57, + "step": 1795 + }, + { + "epoch": 0.873434650455927, + "grad_norm": 0.07214961704738479, + "learning_rate": 8.88757671460015e-06, + "loss": 0.5652, + "step": 1796 + }, + { + "epoch": 0.8739209726443768, + "grad_norm": 0.07157004904071518, + "learning_rate": 8.886372361807933e-06, + "loss": 0.5685, + "step": 1797 + }, + { + "epoch": 0.8744072948328268, + "grad_norm": 0.07291528381801188, + "learning_rate": 8.885167439123343e-06, + "loss": 0.5735, + "step": 1798 + }, + { + "epoch": 0.8748936170212765, + "grad_norm": 0.07064590011520856, + "learning_rate": 8.883961946723067e-06, + "loss": 0.5771, + "step": 1799 + }, + { + "epoch": 0.8753799392097265, + "grad_norm": 0.07454442816844008, + "learning_rate": 8.882755884783877e-06, + "loss": 0.6097, + "step": 1800 + }, + { + "epoch": 0.8758662613981762, + "grad_norm": 0.08336435374338261, + "learning_rate": 8.88154925348263e-06, + "loss": 0.6007, + "step": 1801 + }, + { + "epoch": 0.8763525835866262, + "grad_norm": 0.07194424430808544, + "learning_rate": 8.88034205299626e-06, + "loss": 0.5881, + "step": 1802 + }, + { + "epoch": 0.876838905775076, + "grad_norm": 0.07217333243591798, + "learning_rate": 8.879134283501791e-06, + "loss": 0.5849, + "step": 1803 + }, + { + "epoch": 0.8773252279635259, + "grad_norm": 0.07407238318024906, + "learning_rate": 8.877925945176333e-06, + "loss": 0.5794, + "step": 1804 + }, + { + "epoch": 0.8778115501519757, + "grad_norm": 0.07417279552340213, + "learning_rate": 8.876717038197072e-06, + "loss": 0.5495, + "step": 1805 + }, + { + "epoch": 0.8782978723404256, + "grad_norm": 0.07373951036618491, + "learning_rate": 8.875507562741278e-06, + "loss": 0.6046, + "step": 1806 + }, + { + "epoch": 0.8787841945288754, + "grad_norm": 0.07475959022208804, + "learning_rate": 8.87429751898631e-06, + "loss": 0.604, + "step": 1807 + }, + { + "epoch": 0.8792705167173253, + "grad_norm": 0.07086238419697516, + "learning_rate": 8.873086907109608e-06, + "loss": 0.5601, + "step": 1808 + }, + { + "epoch": 0.879756838905775, + "grad_norm": 0.07290959201460409, + "learning_rate": 8.87187572728869e-06, + "loss": 0.588, + "step": 1809 + }, + { + "epoch": 0.880243161094225, + "grad_norm": 0.07330452794101329, + "learning_rate": 8.870663979701167e-06, + "loss": 0.6058, + "step": 1810 + }, + { + "epoch": 0.8807294832826748, + "grad_norm": 0.0734147200310214, + "learning_rate": 8.869451664524725e-06, + "loss": 0.5935, + "step": 1811 + }, + { + "epoch": 0.8812158054711247, + "grad_norm": 0.07043841596878565, + "learning_rate": 8.868238781937137e-06, + "loss": 0.5311, + "step": 1812 + }, + { + "epoch": 0.8817021276595745, + "grad_norm": 0.07232977526210412, + "learning_rate": 8.867025332116259e-06, + "loss": 0.5639, + "step": 1813 + }, + { + "epoch": 0.8821884498480244, + "grad_norm": 0.07112890099021922, + "learning_rate": 8.865811315240027e-06, + "loss": 0.5751, + "step": 1814 + }, + { + "epoch": 0.8826747720364742, + "grad_norm": 0.07345759793344649, + "learning_rate": 8.864596731486466e-06, + "loss": 0.5852, + "step": 1815 + }, + { + "epoch": 0.8831610942249241, + "grad_norm": 0.07661329800703762, + "learning_rate": 8.86338158103368e-06, + "loss": 0.6206, + "step": 1816 + }, + { + "epoch": 0.8836474164133739, + "grad_norm": 0.08308756505901159, + "learning_rate": 8.862165864059857e-06, + "loss": 0.6262, + "step": 1817 + }, + { + "epoch": 0.8841337386018238, + "grad_norm": 0.07392341845122752, + "learning_rate": 8.860949580743267e-06, + "loss": 0.588, + "step": 1818 + }, + { + "epoch": 0.8846200607902736, + "grad_norm": 0.07144143023133331, + "learning_rate": 8.859732731262268e-06, + "loss": 0.5795, + "step": 1819 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 0.07396038022740108, + "learning_rate": 8.85851531579529e-06, + "loss": 0.5772, + "step": 1820 + }, + { + "epoch": 0.8855927051671733, + "grad_norm": 0.07688288033822276, + "learning_rate": 8.857297334520859e-06, + "loss": 0.5916, + "step": 1821 + }, + { + "epoch": 0.886079027355623, + "grad_norm": 0.078087355099266, + "learning_rate": 8.856078787617577e-06, + "loss": 0.6095, + "step": 1822 + }, + { + "epoch": 0.886565349544073, + "grad_norm": 0.07155526097783196, + "learning_rate": 8.854859675264129e-06, + "loss": 0.6092, + "step": 1823 + }, + { + "epoch": 0.8870516717325228, + "grad_norm": 0.07213763377542855, + "learning_rate": 8.853639997639282e-06, + "loss": 0.5855, + "step": 1824 + }, + { + "epoch": 0.8875379939209727, + "grad_norm": 0.07371499925541963, + "learning_rate": 8.852419754921894e-06, + "loss": 0.5883, + "step": 1825 + }, + { + "epoch": 0.8880243161094225, + "grad_norm": 0.0741751389661036, + "learning_rate": 8.851198947290895e-06, + "loss": 0.5931, + "step": 1826 + }, + { + "epoch": 0.8885106382978724, + "grad_norm": 0.07535910264684668, + "learning_rate": 8.849977574925302e-06, + "loss": 0.5968, + "step": 1827 + }, + { + "epoch": 0.8889969604863222, + "grad_norm": 0.07431349323625044, + "learning_rate": 8.848755638004217e-06, + "loss": 0.6184, + "step": 1828 + }, + { + "epoch": 0.8894832826747721, + "grad_norm": 0.07177952381151019, + "learning_rate": 8.847533136706826e-06, + "loss": 0.5686, + "step": 1829 + }, + { + "epoch": 0.8899696048632219, + "grad_norm": 0.072367975354459, + "learning_rate": 8.846310071212392e-06, + "loss": 0.6207, + "step": 1830 + }, + { + "epoch": 0.8904559270516718, + "grad_norm": 0.07935903680649259, + "learning_rate": 8.845086441700261e-06, + "loss": 0.5528, + "step": 1831 + }, + { + "epoch": 0.8909422492401216, + "grad_norm": 0.07482355604954846, + "learning_rate": 8.843862248349868e-06, + "loss": 0.5717, + "step": 1832 + }, + { + "epoch": 0.8914285714285715, + "grad_norm": 0.07275913321947461, + "learning_rate": 8.842637491340728e-06, + "loss": 0.5623, + "step": 1833 + }, + { + "epoch": 0.8919148936170213, + "grad_norm": 0.0757380783011119, + "learning_rate": 8.841412170852435e-06, + "loss": 0.6116, + "step": 1834 + }, + { + "epoch": 0.8924012158054712, + "grad_norm": 0.07625820826312853, + "learning_rate": 8.840186287064669e-06, + "loss": 0.6419, + "step": 1835 + }, + { + "epoch": 0.892887537993921, + "grad_norm": 0.07272042793767222, + "learning_rate": 8.838959840157192e-06, + "loss": 0.5481, + "step": 1836 + }, + { + "epoch": 0.8933738601823709, + "grad_norm": 0.08169984702086527, + "learning_rate": 8.837732830309848e-06, + "loss": 0.6204, + "step": 1837 + }, + { + "epoch": 0.8938601823708207, + "grad_norm": 0.07427025854834735, + "learning_rate": 8.836505257702565e-06, + "loss": 0.6099, + "step": 1838 + }, + { + "epoch": 0.8943465045592706, + "grad_norm": 0.07550707716624953, + "learning_rate": 8.835277122515354e-06, + "loss": 0.6247, + "step": 1839 + }, + { + "epoch": 0.8948328267477204, + "grad_norm": 0.07315107499038113, + "learning_rate": 8.834048424928305e-06, + "loss": 0.6008, + "step": 1840 + }, + { + "epoch": 0.8953191489361703, + "grad_norm": 0.07233291013245507, + "learning_rate": 8.832819165121594e-06, + "loss": 0.5487, + "step": 1841 + }, + { + "epoch": 0.8958054711246201, + "grad_norm": 0.07163137146747516, + "learning_rate": 8.831589343275474e-06, + "loss": 0.5546, + "step": 1842 + }, + { + "epoch": 0.89629179331307, + "grad_norm": 0.07416546849099498, + "learning_rate": 8.83035895957029e-06, + "loss": 0.5731, + "step": 1843 + }, + { + "epoch": 0.8967781155015198, + "grad_norm": 0.07687206384413069, + "learning_rate": 8.829128014186458e-06, + "loss": 0.6126, + "step": 1844 + }, + { + "epoch": 0.8972644376899696, + "grad_norm": 0.07482452758160005, + "learning_rate": 8.827896507304488e-06, + "loss": 0.6352, + "step": 1845 + }, + { + "epoch": 0.8977507598784195, + "grad_norm": 0.075647536473654, + "learning_rate": 8.826664439104964e-06, + "loss": 0.5759, + "step": 1846 + }, + { + "epoch": 0.8982370820668693, + "grad_norm": 0.07240884129541364, + "learning_rate": 8.825431809768554e-06, + "loss": 0.5748, + "step": 1847 + }, + { + "epoch": 0.8987234042553192, + "grad_norm": 0.07389696423901282, + "learning_rate": 8.82419861947601e-06, + "loss": 0.5713, + "step": 1848 + }, + { + "epoch": 0.899209726443769, + "grad_norm": 0.07428749474448496, + "learning_rate": 8.822964868408164e-06, + "loss": 0.611, + "step": 1849 + }, + { + "epoch": 0.8996960486322189, + "grad_norm": 0.07690624039589869, + "learning_rate": 8.821730556745933e-06, + "loss": 0.5923, + "step": 1850 + }, + { + "epoch": 0.9001823708206687, + "grad_norm": 0.07270364688051707, + "learning_rate": 8.820495684670315e-06, + "loss": 0.5916, + "step": 1851 + }, + { + "epoch": 0.9006686930091186, + "grad_norm": 0.07312127487499655, + "learning_rate": 8.81926025236239e-06, + "loss": 0.6308, + "step": 1852 + }, + { + "epoch": 0.9011550151975684, + "grad_norm": 0.07146416378388792, + "learning_rate": 8.818024260003319e-06, + "loss": 0.5711, + "step": 1853 + }, + { + "epoch": 0.9016413373860183, + "grad_norm": 0.07130166608385094, + "learning_rate": 8.816787707774347e-06, + "loss": 0.5655, + "step": 1854 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 0.07316680537191218, + "learning_rate": 8.8155505958568e-06, + "loss": 0.5887, + "step": 1855 + }, + { + "epoch": 0.902613981762918, + "grad_norm": 0.07480706657946969, + "learning_rate": 8.814312924432086e-06, + "loss": 0.5341, + "step": 1856 + }, + { + "epoch": 0.9031003039513678, + "grad_norm": 0.07755230340098143, + "learning_rate": 8.813074693681697e-06, + "loss": 0.6117, + "step": 1857 + }, + { + "epoch": 0.9035866261398177, + "grad_norm": 0.07541342156250899, + "learning_rate": 8.811835903787204e-06, + "loss": 0.604, + "step": 1858 + }, + { + "epoch": 0.9040729483282675, + "grad_norm": 0.07906610477805168, + "learning_rate": 8.810596554930262e-06, + "loss": 0.6164, + "step": 1859 + }, + { + "epoch": 0.9045592705167174, + "grad_norm": 0.07350433657886936, + "learning_rate": 8.809356647292609e-06, + "loss": 0.6049, + "step": 1860 + }, + { + "epoch": 0.9050455927051672, + "grad_norm": 0.07512495990679414, + "learning_rate": 8.808116181056059e-06, + "loss": 0.6135, + "step": 1861 + }, + { + "epoch": 0.9055319148936171, + "grad_norm": 0.07308124586475026, + "learning_rate": 8.806875156402516e-06, + "loss": 0.6053, + "step": 1862 + }, + { + "epoch": 0.9060182370820669, + "grad_norm": 0.07131909029016202, + "learning_rate": 8.805633573513962e-06, + "loss": 0.5879, + "step": 1863 + }, + { + "epoch": 0.9065045592705168, + "grad_norm": 0.07422709663903333, + "learning_rate": 8.804391432572459e-06, + "loss": 0.5604, + "step": 1864 + }, + { + "epoch": 0.9069908814589666, + "grad_norm": 0.08205489241017076, + "learning_rate": 8.803148733760155e-06, + "loss": 0.6801, + "step": 1865 + }, + { + "epoch": 0.9074772036474165, + "grad_norm": 0.07391652391123428, + "learning_rate": 8.801905477259276e-06, + "loss": 0.6153, + "step": 1866 + }, + { + "epoch": 0.9079635258358663, + "grad_norm": 0.07077010357824341, + "learning_rate": 8.800661663252134e-06, + "loss": 0.553, + "step": 1867 + }, + { + "epoch": 0.9084498480243162, + "grad_norm": 0.08035645758146163, + "learning_rate": 8.799417291921117e-06, + "loss": 0.5557, + "step": 1868 + }, + { + "epoch": 0.908936170212766, + "grad_norm": 0.06992597126663248, + "learning_rate": 8.7981723634487e-06, + "loss": 0.5294, + "step": 1869 + }, + { + "epoch": 0.9094224924012158, + "grad_norm": 0.07058614228000314, + "learning_rate": 8.796926878017438e-06, + "loss": 0.5879, + "step": 1870 + }, + { + "epoch": 0.9099088145896657, + "grad_norm": 0.07192381102857433, + "learning_rate": 8.795680835809964e-06, + "loss": 0.5711, + "step": 1871 + }, + { + "epoch": 0.9103951367781155, + "grad_norm": 0.07045862120870869, + "learning_rate": 8.794434237009e-06, + "loss": 0.6138, + "step": 1872 + }, + { + "epoch": 0.9108814589665654, + "grad_norm": 0.0736511925101877, + "learning_rate": 8.793187081797343e-06, + "loss": 0.5503, + "step": 1873 + }, + { + "epoch": 0.9113677811550152, + "grad_norm": 0.07689011225703497, + "learning_rate": 8.791939370357876e-06, + "loss": 0.5961, + "step": 1874 + }, + { + "epoch": 0.9118541033434651, + "grad_norm": 0.07242971371312248, + "learning_rate": 8.790691102873558e-06, + "loss": 0.5748, + "step": 1875 + }, + { + "epoch": 0.9123404255319149, + "grad_norm": 0.07310194234505171, + "learning_rate": 8.789442279527438e-06, + "loss": 0.5921, + "step": 1876 + }, + { + "epoch": 0.9128267477203648, + "grad_norm": 0.07194543576831185, + "learning_rate": 8.78819290050264e-06, + "loss": 0.5741, + "step": 1877 + }, + { + "epoch": 0.9133130699088146, + "grad_norm": 0.07332708608921876, + "learning_rate": 8.78694296598237e-06, + "loss": 0.6284, + "step": 1878 + }, + { + "epoch": 0.9137993920972645, + "grad_norm": 0.08254463870559857, + "learning_rate": 8.785692476149918e-06, + "loss": 0.6006, + "step": 1879 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.07031572348799323, + "learning_rate": 8.784441431188653e-06, + "loss": 0.5916, + "step": 1880 + }, + { + "epoch": 0.9147720364741642, + "grad_norm": 0.06844342078713918, + "learning_rate": 8.783189831282028e-06, + "loss": 0.5525, + "step": 1881 + }, + { + "epoch": 0.915258358662614, + "grad_norm": 0.07299935616393388, + "learning_rate": 8.781937676613577e-06, + "loss": 0.5618, + "step": 1882 + }, + { + "epoch": 0.9157446808510639, + "grad_norm": 0.07430506819954745, + "learning_rate": 8.78068496736691e-06, + "loss": 0.6192, + "step": 1883 + }, + { + "epoch": 0.9162310030395137, + "grad_norm": 0.07721377115803853, + "learning_rate": 8.779431703725726e-06, + "loss": 0.6015, + "step": 1884 + }, + { + "epoch": 0.9167173252279636, + "grad_norm": 0.07548748738965665, + "learning_rate": 8.7781778858738e-06, + "loss": 0.5937, + "step": 1885 + }, + { + "epoch": 0.9172036474164134, + "grad_norm": 0.07204255191539881, + "learning_rate": 8.776923513994993e-06, + "loss": 0.5508, + "step": 1886 + }, + { + "epoch": 0.9176899696048633, + "grad_norm": 0.08560770198995425, + "learning_rate": 8.77566858827324e-06, + "loss": 0.6626, + "step": 1887 + }, + { + "epoch": 0.9181762917933131, + "grad_norm": 0.07371661573832375, + "learning_rate": 8.774413108892566e-06, + "loss": 0.5572, + "step": 1888 + }, + { + "epoch": 0.918662613981763, + "grad_norm": 0.07416826120355921, + "learning_rate": 8.77315707603707e-06, + "loss": 0.5573, + "step": 1889 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 0.07105055457362772, + "learning_rate": 8.771900489890936e-06, + "loss": 0.5506, + "step": 1890 + }, + { + "epoch": 0.9196352583586627, + "grad_norm": 0.08067359575114463, + "learning_rate": 8.770643350638428e-06, + "loss": 0.5687, + "step": 1891 + }, + { + "epoch": 0.9201215805471125, + "grad_norm": 0.07377436823297279, + "learning_rate": 8.76938565846389e-06, + "loss": 0.6047, + "step": 1892 + }, + { + "epoch": 0.9206079027355623, + "grad_norm": 0.0806401297058479, + "learning_rate": 8.768127413551753e-06, + "loss": 0.6379, + "step": 1893 + }, + { + "epoch": 0.9210942249240122, + "grad_norm": 0.0738382309456317, + "learning_rate": 8.766868616086517e-06, + "loss": 0.599, + "step": 1894 + }, + { + "epoch": 0.921580547112462, + "grad_norm": 0.07237935229265387, + "learning_rate": 8.765609266252775e-06, + "loss": 0.5709, + "step": 1895 + }, + { + "epoch": 0.9220668693009119, + "grad_norm": 0.07745783967681805, + "learning_rate": 8.764349364235197e-06, + "loss": 0.5959, + "step": 1896 + }, + { + "epoch": 0.9225531914893617, + "grad_norm": 0.07219069912436764, + "learning_rate": 8.763088910218528e-06, + "loss": 0.6246, + "step": 1897 + }, + { + "epoch": 0.9230395136778116, + "grad_norm": 0.07165332905833564, + "learning_rate": 8.761827904387608e-06, + "loss": 0.5744, + "step": 1898 + }, + { + "epoch": 0.9235258358662614, + "grad_norm": 0.07390910334111164, + "learning_rate": 8.76056634692734e-06, + "loss": 0.5627, + "step": 1899 + }, + { + "epoch": 0.9240121580547113, + "grad_norm": 0.07575407455272574, + "learning_rate": 8.759304238022723e-06, + "loss": 0.5885, + "step": 1900 + }, + { + "epoch": 0.9244984802431611, + "grad_norm": 0.07666566543189379, + "learning_rate": 8.75804157785883e-06, + "loss": 0.5774, + "step": 1901 + }, + { + "epoch": 0.924984802431611, + "grad_norm": 0.07653634702640007, + "learning_rate": 8.756778366620814e-06, + "loss": 0.6036, + "step": 1902 + }, + { + "epoch": 0.9254711246200608, + "grad_norm": 0.07172263865615873, + "learning_rate": 8.755514604493912e-06, + "loss": 0.5956, + "step": 1903 + }, + { + "epoch": 0.9259574468085107, + "grad_norm": 0.0837717654353363, + "learning_rate": 8.754250291663439e-06, + "loss": 0.5936, + "step": 1904 + }, + { + "epoch": 0.9264437689969605, + "grad_norm": 0.0787432784764435, + "learning_rate": 8.752985428314795e-06, + "loss": 0.5944, + "step": 1905 + }, + { + "epoch": 0.9269300911854104, + "grad_norm": 0.0733548556385432, + "learning_rate": 8.751720014633454e-06, + "loss": 0.598, + "step": 1906 + }, + { + "epoch": 0.9274164133738602, + "grad_norm": 0.0739171245461002, + "learning_rate": 8.750454050804978e-06, + "loss": 0.6018, + "step": 1907 + }, + { + "epoch": 0.9279027355623101, + "grad_norm": 0.07877341419849072, + "learning_rate": 8.749187537015003e-06, + "loss": 0.552, + "step": 1908 + }, + { + "epoch": 0.9283890577507599, + "grad_norm": 0.07783256628409974, + "learning_rate": 8.747920473449252e-06, + "loss": 0.5797, + "step": 1909 + }, + { + "epoch": 0.9288753799392098, + "grad_norm": 0.07360927254570707, + "learning_rate": 8.746652860293523e-06, + "loss": 0.5827, + "step": 1910 + }, + { + "epoch": 0.9293617021276596, + "grad_norm": 0.07427300754756623, + "learning_rate": 8.745384697733699e-06, + "loss": 0.5693, + "step": 1911 + }, + { + "epoch": 0.9298480243161095, + "grad_norm": 0.0769733477434556, + "learning_rate": 8.744115985955738e-06, + "loss": 0.6133, + "step": 1912 + }, + { + "epoch": 0.9303343465045593, + "grad_norm": 0.07271782174438762, + "learning_rate": 8.74284672514569e-06, + "loss": 0.5804, + "step": 1913 + }, + { + "epoch": 0.9308206686930092, + "grad_norm": 0.07846413106062895, + "learning_rate": 8.74157691548967e-06, + "loss": 0.6565, + "step": 1914 + }, + { + "epoch": 0.931306990881459, + "grad_norm": 0.07377507142002467, + "learning_rate": 8.740306557173881e-06, + "loss": 0.5529, + "step": 1915 + }, + { + "epoch": 0.9317933130699089, + "grad_norm": 0.0742597999791996, + "learning_rate": 8.739035650384612e-06, + "loss": 0.5608, + "step": 1916 + }, + { + "epoch": 0.9322796352583587, + "grad_norm": 0.0751723870957562, + "learning_rate": 8.737764195308226e-06, + "loss": 0.5805, + "step": 1917 + }, + { + "epoch": 0.9327659574468085, + "grad_norm": 0.0790512521332783, + "learning_rate": 8.736492192131164e-06, + "loss": 0.6095, + "step": 1918 + }, + { + "epoch": 0.9332522796352584, + "grad_norm": 0.07406014315950632, + "learning_rate": 8.735219641039953e-06, + "loss": 0.5523, + "step": 1919 + }, + { + "epoch": 0.9337386018237082, + "grad_norm": 0.07226407226743525, + "learning_rate": 8.733946542221198e-06, + "loss": 0.5418, + "step": 1920 + }, + { + "epoch": 0.9342249240121581, + "grad_norm": 0.07279557859791015, + "learning_rate": 8.732672895861585e-06, + "loss": 0.5927, + "step": 1921 + }, + { + "epoch": 0.9347112462006079, + "grad_norm": 0.07253148743378941, + "learning_rate": 8.731398702147877e-06, + "loss": 0.5738, + "step": 1922 + }, + { + "epoch": 0.9351975683890578, + "grad_norm": 0.07304467265066063, + "learning_rate": 8.730123961266923e-06, + "loss": 0.5803, + "step": 1923 + }, + { + "epoch": 0.9356838905775076, + "grad_norm": 0.07322965206078319, + "learning_rate": 8.72884867340565e-06, + "loss": 0.5611, + "step": 1924 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 0.07608127855139726, + "learning_rate": 8.727572838751062e-06, + "loss": 0.5797, + "step": 1925 + }, + { + "epoch": 0.9366565349544073, + "grad_norm": 0.07303585133740899, + "learning_rate": 8.726296457490246e-06, + "loss": 0.6106, + "step": 1926 + }, + { + "epoch": 0.9371428571428572, + "grad_norm": 0.07045183741301447, + "learning_rate": 8.72501952981037e-06, + "loss": 0.5349, + "step": 1927 + }, + { + "epoch": 0.937629179331307, + "grad_norm": 0.07306198098723882, + "learning_rate": 8.723742055898681e-06, + "loss": 0.5947, + "step": 1928 + }, + { + "epoch": 0.9381155015197569, + "grad_norm": 0.0740225934736366, + "learning_rate": 8.722464035942505e-06, + "loss": 0.5521, + "step": 1929 + }, + { + "epoch": 0.9386018237082067, + "grad_norm": 0.07067908994654054, + "learning_rate": 8.721185470129248e-06, + "loss": 0.5606, + "step": 1930 + }, + { + "epoch": 0.9390881458966566, + "grad_norm": 0.07385786456444564, + "learning_rate": 8.7199063586464e-06, + "loss": 0.5872, + "step": 1931 + }, + { + "epoch": 0.9395744680851064, + "grad_norm": 0.0842922501458967, + "learning_rate": 8.718626701681527e-06, + "loss": 0.5742, + "step": 1932 + }, + { + "epoch": 0.9400607902735563, + "grad_norm": 0.06953351399551343, + "learning_rate": 8.717346499422275e-06, + "loss": 0.5383, + "step": 1933 + }, + { + "epoch": 0.9405471124620061, + "grad_norm": 0.074435973311784, + "learning_rate": 8.716065752056373e-06, + "loss": 0.5868, + "step": 1934 + }, + { + "epoch": 0.941033434650456, + "grad_norm": 0.07414631919745308, + "learning_rate": 8.714784459771626e-06, + "loss": 0.6038, + "step": 1935 + }, + { + "epoch": 0.9415197568389058, + "grad_norm": 0.07477347706541111, + "learning_rate": 8.713502622755924e-06, + "loss": 0.6034, + "step": 1936 + }, + { + "epoch": 0.9420060790273557, + "grad_norm": 0.0767264112357078, + "learning_rate": 8.712220241197232e-06, + "loss": 0.5866, + "step": 1937 + }, + { + "epoch": 0.9424924012158055, + "grad_norm": 0.07238657065157811, + "learning_rate": 8.710937315283594e-06, + "loss": 0.5953, + "step": 1938 + }, + { + "epoch": 0.9429787234042554, + "grad_norm": 0.07567420200802548, + "learning_rate": 8.709653845203141e-06, + "loss": 0.5919, + "step": 1939 + }, + { + "epoch": 0.9434650455927052, + "grad_norm": 0.07368377062051222, + "learning_rate": 8.708369831144078e-06, + "loss": 0.6119, + "step": 1940 + }, + { + "epoch": 0.9439513677811551, + "grad_norm": 0.07455304205447773, + "learning_rate": 8.707085273294692e-06, + "loss": 0.5768, + "step": 1941 + }, + { + "epoch": 0.9444376899696049, + "grad_norm": 0.07557124587294992, + "learning_rate": 8.705800171843345e-06, + "loss": 0.6057, + "step": 1942 + }, + { + "epoch": 0.9449240121580547, + "grad_norm": 0.07335490326331268, + "learning_rate": 8.704514526978485e-06, + "loss": 0.594, + "step": 1943 + }, + { + "epoch": 0.9454103343465046, + "grad_norm": 0.07603034862894496, + "learning_rate": 8.703228338888636e-06, + "loss": 0.5993, + "step": 1944 + }, + { + "epoch": 0.9458966565349544, + "grad_norm": 0.07707660781225507, + "learning_rate": 8.701941607762407e-06, + "loss": 0.6245, + "step": 1945 + }, + { + "epoch": 0.9463829787234043, + "grad_norm": 0.07372636881111767, + "learning_rate": 8.700654333788478e-06, + "loss": 0.5867, + "step": 1946 + }, + { + "epoch": 0.9468693009118541, + "grad_norm": 0.07414267455408871, + "learning_rate": 8.699366517155614e-06, + "loss": 0.5551, + "step": 1947 + }, + { + "epoch": 0.947355623100304, + "grad_norm": 0.07513833224044704, + "learning_rate": 8.69807815805266e-06, + "loss": 0.566, + "step": 1948 + }, + { + "epoch": 0.9478419452887538, + "grad_norm": 0.07186551438153543, + "learning_rate": 8.696789256668538e-06, + "loss": 0.567, + "step": 1949 + }, + { + "epoch": 0.9483282674772037, + "grad_norm": 0.07553769945065232, + "learning_rate": 8.695499813192254e-06, + "loss": 0.6565, + "step": 1950 + }, + { + "epoch": 0.9488145896656535, + "grad_norm": 0.07654798759273028, + "learning_rate": 8.694209827812886e-06, + "loss": 0.6238, + "step": 1951 + }, + { + "epoch": 0.9493009118541034, + "grad_norm": 0.07265853801793293, + "learning_rate": 8.692919300719596e-06, + "loss": 0.5808, + "step": 1952 + }, + { + "epoch": 0.9497872340425532, + "grad_norm": 0.07065949865374788, + "learning_rate": 8.691628232101627e-06, + "loss": 0.5573, + "step": 1953 + }, + { + "epoch": 0.9502735562310031, + "grad_norm": 0.07273958322712094, + "learning_rate": 8.690336622148299e-06, + "loss": 0.6058, + "step": 1954 + }, + { + "epoch": 0.9507598784194529, + "grad_norm": 0.07281979887883913, + "learning_rate": 8.689044471049013e-06, + "loss": 0.6115, + "step": 1955 + }, + { + "epoch": 0.9512462006079028, + "grad_norm": 0.06989490203333762, + "learning_rate": 8.687751778993246e-06, + "loss": 0.5197, + "step": 1956 + }, + { + "epoch": 0.9517325227963526, + "grad_norm": 0.07206230968129262, + "learning_rate": 8.686458546170558e-06, + "loss": 0.6135, + "step": 1957 + }, + { + "epoch": 0.9522188449848025, + "grad_norm": 0.07388518202418963, + "learning_rate": 8.685164772770588e-06, + "loss": 0.6241, + "step": 1958 + }, + { + "epoch": 0.9527051671732523, + "grad_norm": 0.07680439843553713, + "learning_rate": 8.68387045898305e-06, + "loss": 0.5814, + "step": 1959 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 0.07100583350362148, + "learning_rate": 8.682575604997744e-06, + "loss": 0.5718, + "step": 1960 + }, + { + "epoch": 0.953677811550152, + "grad_norm": 0.07817881497642581, + "learning_rate": 8.681280211004543e-06, + "loss": 0.5757, + "step": 1961 + }, + { + "epoch": 0.9541641337386019, + "grad_norm": 0.07330469912104214, + "learning_rate": 8.679984277193403e-06, + "loss": 0.6138, + "step": 1962 + }, + { + "epoch": 0.9546504559270517, + "grad_norm": 0.07277157621205767, + "learning_rate": 8.678687803754358e-06, + "loss": 0.5792, + "step": 1963 + }, + { + "epoch": 0.9551367781155016, + "grad_norm": 0.07152158692855926, + "learning_rate": 8.67739079087752e-06, + "loss": 0.5612, + "step": 1964 + }, + { + "epoch": 0.9556231003039514, + "grad_norm": 0.07144118075126639, + "learning_rate": 8.676093238753083e-06, + "loss": 0.5777, + "step": 1965 + }, + { + "epoch": 0.9561094224924012, + "grad_norm": 0.07219566881100747, + "learning_rate": 8.674795147571318e-06, + "loss": 0.6013, + "step": 1966 + }, + { + "epoch": 0.9565957446808511, + "grad_norm": 0.07484808961086889, + "learning_rate": 8.673496517522572e-06, + "loss": 0.591, + "step": 1967 + }, + { + "epoch": 0.9570820668693009, + "grad_norm": 0.07256855041644918, + "learning_rate": 8.672197348797278e-06, + "loss": 0.6032, + "step": 1968 + }, + { + "epoch": 0.9575683890577508, + "grad_norm": 0.06871632178820912, + "learning_rate": 8.670897641585945e-06, + "loss": 0.5813, + "step": 1969 + }, + { + "epoch": 0.9580547112462006, + "grad_norm": 0.07448842817312101, + "learning_rate": 8.669597396079156e-06, + "loss": 0.6148, + "step": 1970 + }, + { + "epoch": 0.9585410334346505, + "grad_norm": 0.0760953890895604, + "learning_rate": 8.668296612467583e-06, + "loss": 0.6205, + "step": 1971 + }, + { + "epoch": 0.9590273556231003, + "grad_norm": 0.072864671447709, + "learning_rate": 8.666995290941967e-06, + "loss": 0.5785, + "step": 1972 + }, + { + "epoch": 0.9595136778115502, + "grad_norm": 0.07181234183052794, + "learning_rate": 8.665693431693132e-06, + "loss": 0.5795, + "step": 1973 + }, + { + "epoch": 0.96, + "grad_norm": 0.07046195108688545, + "learning_rate": 8.664391034911982e-06, + "loss": 0.5857, + "step": 1974 + }, + { + "epoch": 0.9604863221884499, + "grad_norm": 0.07627922595473915, + "learning_rate": 8.663088100789501e-06, + "loss": 0.6225, + "step": 1975 + }, + { + "epoch": 0.9609726443768997, + "grad_norm": 0.07411225875795115, + "learning_rate": 8.661784629516745e-06, + "loss": 0.6005, + "step": 1976 + }, + { + "epoch": 0.9614589665653496, + "grad_norm": 0.07187964684316119, + "learning_rate": 8.660480621284855e-06, + "loss": 0.5628, + "step": 1977 + }, + { + "epoch": 0.9619452887537994, + "grad_norm": 0.07711000051669091, + "learning_rate": 8.65917607628505e-06, + "loss": 0.563, + "step": 1978 + }, + { + "epoch": 0.9624316109422493, + "grad_norm": 0.074930074772771, + "learning_rate": 8.657870994708627e-06, + "loss": 0.6273, + "step": 1979 + }, + { + "epoch": 0.9629179331306991, + "grad_norm": 0.0745730780384544, + "learning_rate": 8.656565376746959e-06, + "loss": 0.5784, + "step": 1980 + }, + { + "epoch": 0.963404255319149, + "grad_norm": 0.06961842260503341, + "learning_rate": 8.655259222591503e-06, + "loss": 0.5631, + "step": 1981 + }, + { + "epoch": 0.9638905775075988, + "grad_norm": 0.07463124924712256, + "learning_rate": 8.65395253243379e-06, + "loss": 0.5494, + "step": 1982 + }, + { + "epoch": 0.9643768996960487, + "grad_norm": 0.07289374732622579, + "learning_rate": 8.65264530646543e-06, + "loss": 0.5931, + "step": 1983 + }, + { + "epoch": 0.9648632218844985, + "grad_norm": 0.0695810224455607, + "learning_rate": 8.651337544878115e-06, + "loss": 0.5695, + "step": 1984 + }, + { + "epoch": 0.9653495440729484, + "grad_norm": 0.0718534767257931, + "learning_rate": 8.650029247863615e-06, + "loss": 0.5545, + "step": 1985 + }, + { + "epoch": 0.9658358662613982, + "grad_norm": 0.07294577914846184, + "learning_rate": 8.648720415613774e-06, + "loss": 0.5699, + "step": 1986 + }, + { + "epoch": 0.9663221884498481, + "grad_norm": 0.07270140052961915, + "learning_rate": 8.647411048320515e-06, + "loss": 0.5562, + "step": 1987 + }, + { + "epoch": 0.9668085106382979, + "grad_norm": 0.07238928658794108, + "learning_rate": 8.646101146175846e-06, + "loss": 0.5668, + "step": 1988 + }, + { + "epoch": 0.9672948328267478, + "grad_norm": 0.07164837616909217, + "learning_rate": 8.64479070937185e-06, + "loss": 0.6128, + "step": 1989 + }, + { + "epoch": 0.9677811550151976, + "grad_norm": 0.07367725008368022, + "learning_rate": 8.643479738100684e-06, + "loss": 0.5774, + "step": 1990 + }, + { + "epoch": 0.9682674772036474, + "grad_norm": 0.07848211598679297, + "learning_rate": 8.642168232554589e-06, + "loss": 0.595, + "step": 1991 + }, + { + "epoch": 0.9687537993920973, + "grad_norm": 0.06763999496876924, + "learning_rate": 8.640856192925884e-06, + "loss": 0.5556, + "step": 1992 + }, + { + "epoch": 0.9692401215805471, + "grad_norm": 0.07027828631001344, + "learning_rate": 8.639543619406961e-06, + "loss": 0.5289, + "step": 1993 + }, + { + "epoch": 0.969726443768997, + "grad_norm": 0.07367010261794604, + "learning_rate": 8.638230512190298e-06, + "loss": 0.5922, + "step": 1994 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 0.07014210837887246, + "learning_rate": 8.636916871468442e-06, + "loss": 0.5229, + "step": 1995 + }, + { + "epoch": 0.9706990881458967, + "grad_norm": 0.07195483549925343, + "learning_rate": 8.63560269743403e-06, + "loss": 0.5868, + "step": 1996 + }, + { + "epoch": 0.9711854103343465, + "grad_norm": 0.08163082582222239, + "learning_rate": 8.634287990279767e-06, + "loss": 0.5573, + "step": 1997 + }, + { + "epoch": 0.9716717325227964, + "grad_norm": 0.07638161736844712, + "learning_rate": 8.632972750198438e-06, + "loss": 0.5735, + "step": 1998 + }, + { + "epoch": 0.9721580547112462, + "grad_norm": 0.07276505191514222, + "learning_rate": 8.631656977382912e-06, + "loss": 0.5893, + "step": 1999 + }, + { + "epoch": 0.9726443768996961, + "grad_norm": 0.07002002728625449, + "learning_rate": 8.630340672026129e-06, + "loss": 0.5758, + "step": 2000 + }, + { + "epoch": 0.9731306990881459, + "grad_norm": 0.07129980243856032, + "learning_rate": 8.629023834321113e-06, + "loss": 0.5733, + "step": 2001 + }, + { + "epoch": 0.9736170212765958, + "grad_norm": 0.07649137111336735, + "learning_rate": 8.627706464460964e-06, + "loss": 0.6199, + "step": 2002 + }, + { + "epoch": 0.9741033434650456, + "grad_norm": 0.07561432784319275, + "learning_rate": 8.626388562638853e-06, + "loss": 0.5458, + "step": 2003 + }, + { + "epoch": 0.9745896656534955, + "grad_norm": 0.07550392686570884, + "learning_rate": 8.625070129048042e-06, + "loss": 0.5969, + "step": 2004 + }, + { + "epoch": 0.9750759878419453, + "grad_norm": 0.07632023476484963, + "learning_rate": 8.623751163881862e-06, + "loss": 0.5831, + "step": 2005 + }, + { + "epoch": 0.9755623100303952, + "grad_norm": 0.0772423447099558, + "learning_rate": 8.622431667333724e-06, + "loss": 0.592, + "step": 2006 + }, + { + "epoch": 0.976048632218845, + "grad_norm": 0.07227388275743091, + "learning_rate": 8.621111639597117e-06, + "loss": 0.5945, + "step": 2007 + }, + { + "epoch": 0.9765349544072949, + "grad_norm": 0.07175435857817608, + "learning_rate": 8.619791080865609e-06, + "loss": 0.5616, + "step": 2008 + }, + { + "epoch": 0.9770212765957447, + "grad_norm": 0.09577836305853851, + "learning_rate": 8.618469991332846e-06, + "loss": 0.5759, + "step": 2009 + }, + { + "epoch": 0.9775075987841946, + "grad_norm": 0.07421635889520364, + "learning_rate": 8.617148371192547e-06, + "loss": 0.5556, + "step": 2010 + }, + { + "epoch": 0.9779939209726444, + "grad_norm": 0.07543784169961848, + "learning_rate": 8.615826220638514e-06, + "loss": 0.5539, + "step": 2011 + }, + { + "epoch": 0.9784802431610943, + "grad_norm": 0.07404003153084862, + "learning_rate": 8.61450353986463e-06, + "loss": 0.5864, + "step": 2012 + }, + { + "epoch": 0.9789665653495441, + "grad_norm": 0.07343042903390791, + "learning_rate": 8.613180329064844e-06, + "loss": 0.5616, + "step": 2013 + }, + { + "epoch": 0.9794528875379939, + "grad_norm": 0.072255986061141, + "learning_rate": 8.611856588433193e-06, + "loss": 0.5804, + "step": 2014 + }, + { + "epoch": 0.9799392097264438, + "grad_norm": 0.07213135134031219, + "learning_rate": 8.61053231816379e-06, + "loss": 0.5844, + "step": 2015 + }, + { + "epoch": 0.9804255319148936, + "grad_norm": 0.0708814169189653, + "learning_rate": 8.609207518450823e-06, + "loss": 0.573, + "step": 2016 + }, + { + "epoch": 0.9809118541033435, + "grad_norm": 0.07768338090487058, + "learning_rate": 8.607882189488558e-06, + "loss": 0.6023, + "step": 2017 + }, + { + "epoch": 0.9813981762917933, + "grad_norm": 0.07552085324272174, + "learning_rate": 8.60655633147134e-06, + "loss": 0.5846, + "step": 2018 + }, + { + "epoch": 0.9818844984802432, + "grad_norm": 0.07810155964317222, + "learning_rate": 8.605229944593592e-06, + "loss": 0.5678, + "step": 2019 + }, + { + "epoch": 0.982370820668693, + "grad_norm": 0.0736400652851956, + "learning_rate": 8.603903029049812e-06, + "loss": 0.5972, + "step": 2020 + }, + { + "epoch": 0.9828571428571429, + "grad_norm": 0.0706211241308663, + "learning_rate": 8.602575585034579e-06, + "loss": 0.5698, + "step": 2021 + }, + { + "epoch": 0.9833434650455927, + "grad_norm": 0.07145541857657702, + "learning_rate": 8.601247612742545e-06, + "loss": 0.5962, + "step": 2022 + }, + { + "epoch": 0.9838297872340426, + "grad_norm": 0.07440499802663506, + "learning_rate": 8.599919112368444e-06, + "loss": 0.5867, + "step": 2023 + }, + { + "epoch": 0.9843161094224924, + "grad_norm": 0.07111096838989284, + "learning_rate": 8.598590084107085e-06, + "loss": 0.5728, + "step": 2024 + }, + { + "epoch": 0.9848024316109423, + "grad_norm": 0.0737273788900959, + "learning_rate": 8.597260528153354e-06, + "loss": 0.5776, + "step": 2025 + }, + { + "epoch": 0.9852887537993921, + "grad_norm": 0.07220304504764649, + "learning_rate": 8.595930444702217e-06, + "loss": 0.5851, + "step": 2026 + }, + { + "epoch": 0.985775075987842, + "grad_norm": 0.07540167851680561, + "learning_rate": 8.594599833948715e-06, + "loss": 0.6414, + "step": 2027 + }, + { + "epoch": 0.9862613981762918, + "grad_norm": 0.07264709450883156, + "learning_rate": 8.593268696087966e-06, + "loss": 0.5927, + "step": 2028 + }, + { + "epoch": 0.9867477203647417, + "grad_norm": 0.07135532712612644, + "learning_rate": 8.591937031315167e-06, + "loss": 0.5568, + "step": 2029 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 0.07044681021365043, + "learning_rate": 8.590604839825593e-06, + "loss": 0.5769, + "step": 2030 + }, + { + "epoch": 0.9877203647416414, + "grad_norm": 0.07524956835825793, + "learning_rate": 8.58927212181459e-06, + "loss": 0.557, + "step": 2031 + }, + { + "epoch": 0.9882066869300912, + "grad_norm": 0.07272223393951906, + "learning_rate": 8.587938877477593e-06, + "loss": 0.5941, + "step": 2032 + }, + { + "epoch": 0.9886930091185411, + "grad_norm": 0.0749076817410383, + "learning_rate": 8.586605107010103e-06, + "loss": 0.5769, + "step": 2033 + }, + { + "epoch": 0.9891793313069909, + "grad_norm": 0.09352000564960601, + "learning_rate": 8.5852708106077e-06, + "loss": 0.6562, + "step": 2034 + }, + { + "epoch": 0.9896656534954408, + "grad_norm": 0.07210073897981624, + "learning_rate": 8.583935988466048e-06, + "loss": 0.6037, + "step": 2035 + }, + { + "epoch": 0.9901519756838906, + "grad_norm": 0.07592592637947772, + "learning_rate": 8.58260064078088e-06, + "loss": 0.5658, + "step": 2036 + }, + { + "epoch": 0.9906382978723405, + "grad_norm": 0.07686089700617904, + "learning_rate": 8.581264767748012e-06, + "loss": 0.5712, + "step": 2037 + }, + { + "epoch": 0.9911246200607903, + "grad_norm": 0.07355065392333741, + "learning_rate": 8.579928369563335e-06, + "loss": 0.6046, + "step": 2038 + }, + { + "epoch": 0.9916109422492401, + "grad_norm": 0.07260917104148394, + "learning_rate": 8.578591446422814e-06, + "loss": 0.6062, + "step": 2039 + }, + { + "epoch": 0.99209726443769, + "grad_norm": 0.0763273623411675, + "learning_rate": 8.577253998522496e-06, + "loss": 0.5613, + "step": 2040 + }, + { + "epoch": 0.9925835866261398, + "grad_norm": 0.0728063601092115, + "learning_rate": 8.5759160260585e-06, + "loss": 0.5608, + "step": 2041 + }, + { + "epoch": 0.9930699088145897, + "grad_norm": 0.07310299316281559, + "learning_rate": 8.574577529227027e-06, + "loss": 0.5772, + "step": 2042 + }, + { + "epoch": 0.9935562310030395, + "grad_norm": 0.07394776255131645, + "learning_rate": 8.573238508224351e-06, + "loss": 0.5998, + "step": 2043 + }, + { + "epoch": 0.9940425531914894, + "grad_norm": 0.07777617226060557, + "learning_rate": 8.571898963246826e-06, + "loss": 0.5949, + "step": 2044 + }, + { + "epoch": 0.9945288753799392, + "grad_norm": 0.0687563166434955, + "learning_rate": 8.570558894490878e-06, + "loss": 0.5315, + "step": 2045 + }, + { + "epoch": 0.9950151975683891, + "grad_norm": 0.07260669177965728, + "learning_rate": 8.569218302153015e-06, + "loss": 0.5727, + "step": 2046 + }, + { + "epoch": 0.9955015197568389, + "grad_norm": 0.07082971629937435, + "learning_rate": 8.567877186429819e-06, + "loss": 0.5615, + "step": 2047 + }, + { + "epoch": 0.9959878419452888, + "grad_norm": 0.0763588646851888, + "learning_rate": 8.566535547517949e-06, + "loss": 0.6399, + "step": 2048 + }, + { + "epoch": 0.9964741641337386, + "grad_norm": 0.07574455989652469, + "learning_rate": 8.565193385614143e-06, + "loss": 0.5699, + "step": 2049 + }, + { + "epoch": 0.9969604863221885, + "grad_norm": 0.07630762610502481, + "learning_rate": 8.563850700915211e-06, + "loss": 0.5885, + "step": 2050 + }, + { + "epoch": 0.9974468085106383, + "grad_norm": 0.07295687274868297, + "learning_rate": 8.562507493618046e-06, + "loss": 0.5879, + "step": 2051 + }, + { + "epoch": 0.9979331306990882, + "grad_norm": 0.07396831755057373, + "learning_rate": 8.56116376391961e-06, + "loss": 0.5413, + "step": 2052 + }, + { + "epoch": 0.998419452887538, + "grad_norm": 0.07391435511617277, + "learning_rate": 8.559819512016949e-06, + "loss": 0.5988, + "step": 2053 + }, + { + "epoch": 0.9989057750759879, + "grad_norm": 0.07007833288228817, + "learning_rate": 8.55847473810718e-06, + "loss": 0.5526, + "step": 2054 + }, + { + "epoch": 0.9993920972644377, + "grad_norm": 0.07568621827235816, + "learning_rate": 8.5571294423875e-06, + "loss": 0.6245, + "step": 2055 + }, + { + "epoch": 0.9998784194528876, + "grad_norm": 0.08584046270551529, + "learning_rate": 8.55578362505518e-06, + "loss": 0.6564, + "step": 2056 + }, + { + "epoch": 0.9998784194528876, + "eval_loss": 0.585534930229187, + "eval_runtime": 105.1103, + "eval_samples_per_second": 288.773, + "eval_steps_per_second": 36.105, + "step": 2056 + }, + { + "epoch": 1.0, + "grad_norm": 0.08584046270551529, + "learning_rate": 8.554437286307573e-06, + "loss": 0.1579, + "step": 2057 + }, + { + "epoch": 1.0003647416413375, + "grad_norm": 0.07623802936811282, + "learning_rate": 8.553090426342098e-06, + "loss": 0.448, + "step": 2058 + }, + { + "epoch": 1.00048632218845, + "grad_norm": 0.08080827269924548, + "learning_rate": 8.551743045356262e-06, + "loss": 0.5614, + "step": 2059 + }, + { + "epoch": 1.0009726443768996, + "grad_norm": 0.08034094867182404, + "learning_rate": 8.550395143547641e-06, + "loss": 0.5627, + "step": 2060 + }, + { + "epoch": 1.0014589665653495, + "grad_norm": 0.07109104639314932, + "learning_rate": 8.54904672111389e-06, + "loss": 0.5245, + "step": 2061 + }, + { + "epoch": 1.0019452887537994, + "grad_norm": 0.07510940006349107, + "learning_rate": 8.54769777825274e-06, + "loss": 0.5409, + "step": 2062 + }, + { + "epoch": 1.0024316109422493, + "grad_norm": 0.07989040607054634, + "learning_rate": 8.546348315161994e-06, + "loss": 0.583, + "step": 2063 + }, + { + "epoch": 1.002917933130699, + "grad_norm": 0.07398209704917016, + "learning_rate": 8.544998332039543e-06, + "loss": 0.5252, + "step": 2064 + }, + { + "epoch": 1.003404255319149, + "grad_norm": 0.07474911859311478, + "learning_rate": 8.54364782908334e-06, + "loss": 0.5578, + "step": 2065 + }, + { + "epoch": 1.0038905775075988, + "grad_norm": 0.08226936885030092, + "learning_rate": 8.542296806491426e-06, + "loss": 0.564, + "step": 2066 + }, + { + "epoch": 1.0043768996960487, + "grad_norm": 0.08250439967948725, + "learning_rate": 8.540945264461909e-06, + "loss": 0.6038, + "step": 2067 + }, + { + "epoch": 1.0048632218844984, + "grad_norm": 0.07329354696050824, + "learning_rate": 8.53959320319298e-06, + "loss": 0.5521, + "step": 2068 + }, + { + "epoch": 1.0053495440729483, + "grad_norm": 0.07334265848797816, + "learning_rate": 8.538240622882901e-06, + "loss": 0.5549, + "step": 2069 + }, + { + "epoch": 1.0058358662613982, + "grad_norm": 0.0756716614044278, + "learning_rate": 8.536887523730015e-06, + "loss": 0.551, + "step": 2070 + }, + { + "epoch": 1.006322188449848, + "grad_norm": 0.07275129250413012, + "learning_rate": 8.535533905932739e-06, + "loss": 0.54, + "step": 2071 + }, + { + "epoch": 1.0068085106382978, + "grad_norm": 0.07406226979470239, + "learning_rate": 8.534179769689562e-06, + "loss": 0.5369, + "step": 2072 + }, + { + "epoch": 1.0072948328267477, + "grad_norm": 0.07280953912415816, + "learning_rate": 8.532825115199057e-06, + "loss": 0.5748, + "step": 2073 + }, + { + "epoch": 1.0077811550151976, + "grad_norm": 0.07186372545295325, + "learning_rate": 8.531469942659867e-06, + "loss": 0.537, + "step": 2074 + }, + { + "epoch": 1.0082674772036475, + "grad_norm": 0.07580037131962247, + "learning_rate": 8.53011425227071e-06, + "loss": 0.5776, + "step": 2075 + }, + { + "epoch": 1.0087537993920972, + "grad_norm": 0.0745244875313004, + "learning_rate": 8.528758044230386e-06, + "loss": 0.5592, + "step": 2076 + }, + { + "epoch": 1.009240121580547, + "grad_norm": 0.07577338322201924, + "learning_rate": 8.527401318737766e-06, + "loss": 0.5617, + "step": 2077 + }, + { + "epoch": 1.009726443768997, + "grad_norm": 0.07154629608876042, + "learning_rate": 8.526044075991801e-06, + "loss": 0.5374, + "step": 2078 + }, + { + "epoch": 1.010212765957447, + "grad_norm": 0.07541158295455186, + "learning_rate": 8.524686316191512e-06, + "loss": 0.5806, + "step": 2079 + }, + { + "epoch": 1.0106990881458966, + "grad_norm": 0.07306716776333762, + "learning_rate": 8.523328039536002e-06, + "loss": 0.5312, + "step": 2080 + }, + { + "epoch": 1.0111854103343465, + "grad_norm": 0.0777828615718599, + "learning_rate": 8.521969246224442e-06, + "loss": 0.5678, + "step": 2081 + }, + { + "epoch": 1.0116717325227964, + "grad_norm": 0.07118144851681076, + "learning_rate": 8.520609936456088e-06, + "loss": 0.5409, + "step": 2082 + }, + { + "epoch": 1.012158054711246, + "grad_norm": 0.08142507329757849, + "learning_rate": 8.519250110430265e-06, + "loss": 0.5735, + "step": 2083 + }, + { + "epoch": 1.012644376899696, + "grad_norm": 0.07249102398365048, + "learning_rate": 8.517889768346378e-06, + "loss": 0.581, + "step": 2084 + }, + { + "epoch": 1.013130699088146, + "grad_norm": 0.07310912041905886, + "learning_rate": 8.516528910403906e-06, + "loss": 0.5436, + "step": 2085 + }, + { + "epoch": 1.0136170212765958, + "grad_norm": 0.07055727155659802, + "learning_rate": 8.5151675368024e-06, + "loss": 0.5759, + "step": 2086 + }, + { + "epoch": 1.0141033434650455, + "grad_norm": 0.07485275721702012, + "learning_rate": 8.51380564774149e-06, + "loss": 0.5401, + "step": 2087 + }, + { + "epoch": 1.0145896656534954, + "grad_norm": 0.07354969772707608, + "learning_rate": 8.512443243420888e-06, + "loss": 0.5208, + "step": 2088 + }, + { + "epoch": 1.0150759878419453, + "grad_norm": 0.07059929263115745, + "learning_rate": 8.511080324040371e-06, + "loss": 0.5206, + "step": 2089 + }, + { + "epoch": 1.0155623100303952, + "grad_norm": 0.07432317457166472, + "learning_rate": 8.509716889799793e-06, + "loss": 0.5244, + "step": 2090 + }, + { + "epoch": 1.016048632218845, + "grad_norm": 0.06894732267377408, + "learning_rate": 8.508352940899089e-06, + "loss": 0.5322, + "step": 2091 + }, + { + "epoch": 1.0165349544072948, + "grad_norm": 0.08592273618989658, + "learning_rate": 8.506988477538267e-06, + "loss": 0.5739, + "step": 2092 + }, + { + "epoch": 1.0170212765957447, + "grad_norm": 0.07160035061418382, + "learning_rate": 8.505623499917409e-06, + "loss": 0.5209, + "step": 2093 + }, + { + "epoch": 1.0175075987841946, + "grad_norm": 0.08393077406675499, + "learning_rate": 8.504258008236671e-06, + "loss": 0.5321, + "step": 2094 + }, + { + "epoch": 1.0179939209726443, + "grad_norm": 0.07397506669026443, + "learning_rate": 8.502892002696293e-06, + "loss": 0.566, + "step": 2095 + }, + { + "epoch": 1.0184802431610942, + "grad_norm": 0.07513818241637829, + "learning_rate": 8.50152548349658e-06, + "loss": 0.5579, + "step": 2096 + }, + { + "epoch": 1.018966565349544, + "grad_norm": 0.07300586844794889, + "learning_rate": 8.500158450837918e-06, + "loss": 0.5654, + "step": 2097 + }, + { + "epoch": 1.019452887537994, + "grad_norm": 0.07187787986600684, + "learning_rate": 8.498790904920765e-06, + "loss": 0.5697, + "step": 2098 + }, + { + "epoch": 1.0199392097264437, + "grad_norm": 0.0752395730314187, + "learning_rate": 8.497422845945658e-06, + "loss": 0.5969, + "step": 2099 + }, + { + "epoch": 1.0204255319148936, + "grad_norm": 0.07678577926050456, + "learning_rate": 8.496054274113205e-06, + "loss": 0.5428, + "step": 2100 + }, + { + "epoch": 1.0209118541033435, + "grad_norm": 0.07508445266084898, + "learning_rate": 8.494685189624094e-06, + "loss": 0.5585, + "step": 2101 + }, + { + "epoch": 1.0213981762917934, + "grad_norm": 0.07275078821546245, + "learning_rate": 8.493315592679085e-06, + "loss": 0.5519, + "step": 2102 + }, + { + "epoch": 1.021884498480243, + "grad_norm": 0.07103263076957497, + "learning_rate": 8.491945483479014e-06, + "loss": 0.5366, + "step": 2103 + }, + { + "epoch": 1.022370820668693, + "grad_norm": 0.07400272214735697, + "learning_rate": 8.49057486222479e-06, + "loss": 0.5263, + "step": 2104 + }, + { + "epoch": 1.022857142857143, + "grad_norm": 0.07447294390255364, + "learning_rate": 8.4892037291174e-06, + "loss": 0.5371, + "step": 2105 + }, + { + "epoch": 1.0233434650455928, + "grad_norm": 0.07703843168113399, + "learning_rate": 8.487832084357908e-06, + "loss": 0.5806, + "step": 2106 + }, + { + "epoch": 1.0238297872340425, + "grad_norm": 0.07112136477701764, + "learning_rate": 8.486459928147448e-06, + "loss": 0.5482, + "step": 2107 + }, + { + "epoch": 1.0243161094224924, + "grad_norm": 0.07402568629395213, + "learning_rate": 8.485087260687231e-06, + "loss": 0.6188, + "step": 2108 + }, + { + "epoch": 1.0248024316109423, + "grad_norm": 0.0711352325681102, + "learning_rate": 8.48371408217854e-06, + "loss": 0.5478, + "step": 2109 + }, + { + "epoch": 1.025288753799392, + "grad_norm": 0.07342794180455385, + "learning_rate": 8.482340392822742e-06, + "loss": 0.5695, + "step": 2110 + }, + { + "epoch": 1.025775075987842, + "grad_norm": 0.07383221788704233, + "learning_rate": 8.480966192821268e-06, + "loss": 0.5609, + "step": 2111 + }, + { + "epoch": 1.0262613981762918, + "grad_norm": 0.0712678743489981, + "learning_rate": 8.47959148237563e-06, + "loss": 0.532, + "step": 2112 + }, + { + "epoch": 1.0267477203647417, + "grad_norm": 0.07281036787112662, + "learning_rate": 8.478216261687417e-06, + "loss": 0.5355, + "step": 2113 + }, + { + "epoch": 1.0272340425531914, + "grad_norm": 0.07642695711546861, + "learning_rate": 8.476840530958286e-06, + "loss": 0.5571, + "step": 2114 + }, + { + "epoch": 1.0277203647416413, + "grad_norm": 0.07681613148925025, + "learning_rate": 8.475464290389974e-06, + "loss": 0.5727, + "step": 2115 + }, + { + "epoch": 1.0282066869300912, + "grad_norm": 0.07484996838733651, + "learning_rate": 8.47408754018429e-06, + "loss": 0.5595, + "step": 2116 + }, + { + "epoch": 1.0286930091185411, + "grad_norm": 0.07543542146274154, + "learning_rate": 8.472710280543118e-06, + "loss": 0.5603, + "step": 2117 + }, + { + "epoch": 1.0291793313069908, + "grad_norm": 0.07285779612769512, + "learning_rate": 8.47133251166842e-06, + "loss": 0.5596, + "step": 2118 + }, + { + "epoch": 1.0296656534954407, + "grad_norm": 0.09095318745851422, + "learning_rate": 8.469954233762228e-06, + "loss": 0.5624, + "step": 2119 + }, + { + "epoch": 1.0301519756838906, + "grad_norm": 0.07142018944954683, + "learning_rate": 8.468575447026653e-06, + "loss": 0.5547, + "step": 2120 + }, + { + "epoch": 1.0306382978723405, + "grad_norm": 0.07484497432204437, + "learning_rate": 8.467196151663873e-06, + "loss": 0.6306, + "step": 2121 + }, + { + "epoch": 1.0311246200607902, + "grad_norm": 0.07861051691327721, + "learning_rate": 8.465816347876154e-06, + "loss": 0.5509, + "step": 2122 + }, + { + "epoch": 1.03161094224924, + "grad_norm": 0.07127396654233782, + "learning_rate": 8.464436035865823e-06, + "loss": 0.5271, + "step": 2123 + }, + { + "epoch": 1.03209726443769, + "grad_norm": 0.07442358549394897, + "learning_rate": 8.463055215835288e-06, + "loss": 0.5632, + "step": 2124 + }, + { + "epoch": 1.03258358662614, + "grad_norm": 0.07635231185619852, + "learning_rate": 8.461673887987033e-06, + "loss": 0.5719, + "step": 2125 + }, + { + "epoch": 1.0330699088145896, + "grad_norm": 0.07785198446309144, + "learning_rate": 8.460292052523611e-06, + "loss": 0.5438, + "step": 2126 + }, + { + "epoch": 1.0335562310030395, + "grad_norm": 0.07380077539325311, + "learning_rate": 8.458909709647653e-06, + "loss": 0.5651, + "step": 2127 + }, + { + "epoch": 1.0340425531914894, + "grad_norm": 0.07545329165626526, + "learning_rate": 8.457526859561867e-06, + "loss": 0.5802, + "step": 2128 + }, + { + "epoch": 1.0345288753799393, + "grad_norm": 0.08091225237505012, + "learning_rate": 8.456143502469027e-06, + "loss": 0.6121, + "step": 2129 + }, + { + "epoch": 1.035015197568389, + "grad_norm": 0.07516124663988459, + "learning_rate": 8.454759638571991e-06, + "loss": 0.5382, + "step": 2130 + }, + { + "epoch": 1.035501519756839, + "grad_norm": 0.0715082909148488, + "learning_rate": 8.453375268073686e-06, + "loss": 0.534, + "step": 2131 + }, + { + "epoch": 1.0359878419452888, + "grad_norm": 0.07411000226810031, + "learning_rate": 8.451990391177112e-06, + "loss": 0.5945, + "step": 2132 + }, + { + "epoch": 1.0364741641337385, + "grad_norm": 0.07672861486745983, + "learning_rate": 8.450605008085348e-06, + "loss": 0.5598, + "step": 2133 + }, + { + "epoch": 1.0369604863221884, + "grad_norm": 0.07747130495077176, + "learning_rate": 8.449219119001543e-06, + "loss": 0.5802, + "step": 2134 + }, + { + "epoch": 1.0374468085106383, + "grad_norm": 0.07564503208515647, + "learning_rate": 8.447832724128926e-06, + "loss": 0.5796, + "step": 2135 + }, + { + "epoch": 1.0379331306990882, + "grad_norm": 0.07258007971835202, + "learning_rate": 8.44644582367079e-06, + "loss": 0.5461, + "step": 2136 + }, + { + "epoch": 1.038419452887538, + "grad_norm": 0.07401389323829763, + "learning_rate": 8.44505841783051e-06, + "loss": 0.5335, + "step": 2137 + }, + { + "epoch": 1.0389057750759878, + "grad_norm": 0.07667259321025276, + "learning_rate": 8.443670506811537e-06, + "loss": 0.5991, + "step": 2138 + }, + { + "epoch": 1.0393920972644377, + "grad_norm": 0.06974108948664576, + "learning_rate": 8.442282090817388e-06, + "loss": 0.5229, + "step": 2139 + }, + { + "epoch": 1.0398784194528876, + "grad_norm": 0.07294166682055703, + "learning_rate": 8.440893170051658e-06, + "loss": 0.5171, + "step": 2140 + }, + { + "epoch": 1.0403647416413373, + "grad_norm": 0.07493317720976654, + "learning_rate": 8.43950374471802e-06, + "loss": 0.5671, + "step": 2141 + }, + { + "epoch": 1.0408510638297872, + "grad_norm": 0.07229591864002279, + "learning_rate": 8.43811381502022e-06, + "loss": 0.5619, + "step": 2142 + }, + { + "epoch": 1.041337386018237, + "grad_norm": 0.07110021039536353, + "learning_rate": 8.436723381162066e-06, + "loss": 0.5596, + "step": 2143 + }, + { + "epoch": 1.041823708206687, + "grad_norm": 0.07186193090485854, + "learning_rate": 8.435332443347458e-06, + "loss": 0.5365, + "step": 2144 + }, + { + "epoch": 1.0423100303951367, + "grad_norm": 0.07477354732914351, + "learning_rate": 8.433941001780356e-06, + "loss": 0.5409, + "step": 2145 + }, + { + "epoch": 1.0427963525835866, + "grad_norm": 0.07244688674350228, + "learning_rate": 8.432549056664802e-06, + "loss": 0.5658, + "step": 2146 + }, + { + "epoch": 1.0432826747720365, + "grad_norm": 0.07313689703662463, + "learning_rate": 8.431156608204907e-06, + "loss": 0.5382, + "step": 2147 + }, + { + "epoch": 1.0437689969604864, + "grad_norm": 0.07367009641217478, + "learning_rate": 8.42976365660486e-06, + "loss": 0.6123, + "step": 2148 + }, + { + "epoch": 1.044255319148936, + "grad_norm": 0.07780235394359179, + "learning_rate": 8.42837020206892e-06, + "loss": 0.5969, + "step": 2149 + }, + { + "epoch": 1.044741641337386, + "grad_norm": 0.07472651715480962, + "learning_rate": 8.42697624480142e-06, + "loss": 0.5605, + "step": 2150 + }, + { + "epoch": 1.045227963525836, + "grad_norm": 0.07354993535807795, + "learning_rate": 8.425581785006773e-06, + "loss": 0.5178, + "step": 2151 + }, + { + "epoch": 1.0457142857142858, + "grad_norm": 0.07233547389029334, + "learning_rate": 8.424186822889455e-06, + "loss": 0.5358, + "step": 2152 + }, + { + "epoch": 1.0462006079027355, + "grad_norm": 0.07433317637025985, + "learning_rate": 8.422791358654023e-06, + "loss": 0.5811, + "step": 2153 + }, + { + "epoch": 1.0466869300911854, + "grad_norm": 0.07421772465514842, + "learning_rate": 8.42139539250511e-06, + "loss": 0.5462, + "step": 2154 + }, + { + "epoch": 1.0471732522796353, + "grad_norm": 0.07434454678202498, + "learning_rate": 8.419998924647412e-06, + "loss": 0.5775, + "step": 2155 + }, + { + "epoch": 1.047659574468085, + "grad_norm": 0.07469403535236148, + "learning_rate": 8.418601955285708e-06, + "loss": 0.577, + "step": 2156 + }, + { + "epoch": 1.048145896656535, + "grad_norm": 0.07892199011396608, + "learning_rate": 8.41720448462485e-06, + "loss": 0.5913, + "step": 2157 + }, + { + "epoch": 1.0486322188449848, + "grad_norm": 0.07357476284509384, + "learning_rate": 8.415806512869759e-06, + "loss": 0.5882, + "step": 2158 + }, + { + "epoch": 1.0491185410334347, + "grad_norm": 0.07513409151134295, + "learning_rate": 8.41440804022543e-06, + "loss": 0.5598, + "step": 2159 + }, + { + "epoch": 1.0496048632218844, + "grad_norm": 0.07461118076178293, + "learning_rate": 8.413009066896938e-06, + "loss": 0.5285, + "step": 2160 + }, + { + "epoch": 1.0500911854103343, + "grad_norm": 0.07570819271043552, + "learning_rate": 8.411609593089423e-06, + "loss": 0.5576, + "step": 2161 + }, + { + "epoch": 1.0505775075987842, + "grad_norm": 0.07453660945407443, + "learning_rate": 8.4102096190081e-06, + "loss": 0.5546, + "step": 2162 + }, + { + "epoch": 1.0510638297872341, + "grad_norm": 0.07257821944566752, + "learning_rate": 8.408809144858265e-06, + "loss": 0.5455, + "step": 2163 + }, + { + "epoch": 1.0515501519756838, + "grad_norm": 0.07159868333635545, + "learning_rate": 8.407408170845277e-06, + "loss": 0.552, + "step": 2164 + }, + { + "epoch": 1.0520364741641337, + "grad_norm": 0.07289110625977005, + "learning_rate": 8.406006697174574e-06, + "loss": 0.56, + "step": 2165 + }, + { + "epoch": 1.0525227963525836, + "grad_norm": 0.07220522122717925, + "learning_rate": 8.404604724051668e-06, + "loss": 0.5176, + "step": 2166 + }, + { + "epoch": 1.0530091185410335, + "grad_norm": 0.07469138263726466, + "learning_rate": 8.403202251682139e-06, + "loss": 0.6139, + "step": 2167 + }, + { + "epoch": 1.0534954407294832, + "grad_norm": 0.07256256065081844, + "learning_rate": 8.401799280271647e-06, + "loss": 0.557, + "step": 2168 + }, + { + "epoch": 1.053981762917933, + "grad_norm": 0.07267374607151465, + "learning_rate": 8.400395810025922e-06, + "loss": 0.5475, + "step": 2169 + }, + { + "epoch": 1.054468085106383, + "grad_norm": 0.07561963705071116, + "learning_rate": 8.398991841150763e-06, + "loss": 0.5433, + "step": 2170 + }, + { + "epoch": 1.054954407294833, + "grad_norm": 0.07282401954668276, + "learning_rate": 8.39758737385205e-06, + "loss": 0.5537, + "step": 2171 + }, + { + "epoch": 1.0554407294832826, + "grad_norm": 0.07353765326332937, + "learning_rate": 8.396182408335729e-06, + "loss": 0.5431, + "step": 2172 + }, + { + "epoch": 1.0559270516717325, + "grad_norm": 0.0732847949635867, + "learning_rate": 8.394776944807826e-06, + "loss": 0.554, + "step": 2173 + }, + { + "epoch": 1.0564133738601824, + "grad_norm": 0.07565713616358952, + "learning_rate": 8.393370983474434e-06, + "loss": 0.5731, + "step": 2174 + }, + { + "epoch": 1.0568996960486323, + "grad_norm": 0.07234871606732343, + "learning_rate": 8.39196452454172e-06, + "loss": 0.5524, + "step": 2175 + }, + { + "epoch": 1.057386018237082, + "grad_norm": 0.07668876339335134, + "learning_rate": 8.39055756821593e-06, + "loss": 0.5747, + "step": 2176 + }, + { + "epoch": 1.057872340425532, + "grad_norm": 0.07517489810268597, + "learning_rate": 8.389150114703373e-06, + "loss": 0.587, + "step": 2177 + }, + { + "epoch": 1.0583586626139818, + "grad_norm": 0.07301766554156319, + "learning_rate": 8.387742164210438e-06, + "loss": 0.566, + "step": 2178 + }, + { + "epoch": 1.0588449848024317, + "grad_norm": 0.07843038786552695, + "learning_rate": 8.386333716943584e-06, + "loss": 0.5849, + "step": 2179 + }, + { + "epoch": 1.0593313069908814, + "grad_norm": 0.07232294751002794, + "learning_rate": 8.384924773109347e-06, + "loss": 0.5413, + "step": 2180 + }, + { + "epoch": 1.0598176291793313, + "grad_norm": 0.07343113044675709, + "learning_rate": 8.38351533291433e-06, + "loss": 0.5651, + "step": 2181 + }, + { + "epoch": 1.0603039513677812, + "grad_norm": 0.07535488548512996, + "learning_rate": 8.38210539656521e-06, + "loss": 0.5435, + "step": 2182 + }, + { + "epoch": 1.060790273556231, + "grad_norm": 0.07228023912142174, + "learning_rate": 8.38069496426874e-06, + "loss": 0.5426, + "step": 2183 + }, + { + "epoch": 1.0612765957446808, + "grad_norm": 0.07474386443343015, + "learning_rate": 8.379284036231745e-06, + "loss": 0.5128, + "step": 2184 + }, + { + "epoch": 1.0617629179331307, + "grad_norm": 0.07440189406953827, + "learning_rate": 8.37787261266112e-06, + "loss": 0.5551, + "step": 2185 + }, + { + "epoch": 1.0622492401215806, + "grad_norm": 0.07466431745253764, + "learning_rate": 8.376460693763835e-06, + "loss": 0.5608, + "step": 2186 + }, + { + "epoch": 1.0627355623100303, + "grad_norm": 0.07525622176944022, + "learning_rate": 8.375048279746932e-06, + "loss": 0.5623, + "step": 2187 + }, + { + "epoch": 1.0632218844984802, + "grad_norm": 0.07562724741081166, + "learning_rate": 8.373635370817524e-06, + "loss": 0.5992, + "step": 2188 + }, + { + "epoch": 1.0637082066869301, + "grad_norm": 0.07497321672440468, + "learning_rate": 8.372221967182799e-06, + "loss": 0.5698, + "step": 2189 + }, + { + "epoch": 1.06419452887538, + "grad_norm": 0.0744499252787531, + "learning_rate": 8.370808069050016e-06, + "loss": 0.527, + "step": 2190 + }, + { + "epoch": 1.0646808510638297, + "grad_norm": 0.07570711282991258, + "learning_rate": 8.369393676626509e-06, + "loss": 0.5876, + "step": 2191 + }, + { + "epoch": 1.0651671732522796, + "grad_norm": 0.0757940071069943, + "learning_rate": 8.367978790119682e-06, + "loss": 0.5664, + "step": 2192 + }, + { + "epoch": 1.0656534954407295, + "grad_norm": 0.07737614745719217, + "learning_rate": 8.36656340973701e-06, + "loss": 0.5793, + "step": 2193 + }, + { + "epoch": 1.0661398176291794, + "grad_norm": 0.07437499874331091, + "learning_rate": 8.365147535686044e-06, + "loss": 0.5586, + "step": 2194 + }, + { + "epoch": 1.066626139817629, + "grad_norm": 0.07097712926586076, + "learning_rate": 8.363731168174406e-06, + "loss": 0.5383, + "step": 2195 + }, + { + "epoch": 1.067112462006079, + "grad_norm": 0.07620958529288846, + "learning_rate": 8.36231430740979e-06, + "loss": 0.593, + "step": 2196 + }, + { + "epoch": 1.067598784194529, + "grad_norm": 0.07654302835048336, + "learning_rate": 8.360896953599962e-06, + "loss": 0.5657, + "step": 2197 + }, + { + "epoch": 1.0680851063829788, + "grad_norm": 0.07313120306302988, + "learning_rate": 8.359479106952761e-06, + "loss": 0.5526, + "step": 2198 + }, + { + "epoch": 1.0685714285714285, + "grad_norm": 0.07309142908696792, + "learning_rate": 8.3580607676761e-06, + "loss": 0.5856, + "step": 2199 + }, + { + "epoch": 1.0690577507598784, + "grad_norm": 0.07030504162244383, + "learning_rate": 8.356641935977959e-06, + "loss": 0.5146, + "step": 2200 + }, + { + "epoch": 1.0695440729483283, + "grad_norm": 0.07316451380685493, + "learning_rate": 8.355222612066397e-06, + "loss": 0.5773, + "step": 2201 + }, + { + "epoch": 1.070030395136778, + "grad_norm": 0.07308678809183129, + "learning_rate": 8.353802796149537e-06, + "loss": 0.5366, + "step": 2202 + }, + { + "epoch": 1.070516717325228, + "grad_norm": 0.07459788396881113, + "learning_rate": 8.352382488435585e-06, + "loss": 0.5592, + "step": 2203 + }, + { + "epoch": 1.0710030395136778, + "grad_norm": 0.0726918593860298, + "learning_rate": 8.350961689132808e-06, + "loss": 0.5355, + "step": 2204 + }, + { + "epoch": 1.0714893617021277, + "grad_norm": 0.07542305808351824, + "learning_rate": 8.349540398449551e-06, + "loss": 0.5486, + "step": 2205 + }, + { + "epoch": 1.0719756838905776, + "grad_norm": 0.07345811923448287, + "learning_rate": 8.348118616594234e-06, + "loss": 0.5632, + "step": 2206 + }, + { + "epoch": 1.0724620060790273, + "grad_norm": 0.07618050686783405, + "learning_rate": 8.346696343775342e-06, + "loss": 0.5728, + "step": 2207 + }, + { + "epoch": 1.0729483282674772, + "grad_norm": 0.07286824931810211, + "learning_rate": 8.345273580201434e-06, + "loss": 0.509, + "step": 2208 + }, + { + "epoch": 1.0734346504559271, + "grad_norm": 0.07321153833555988, + "learning_rate": 8.343850326081144e-06, + "loss": 0.5399, + "step": 2209 + }, + { + "epoch": 1.0739209726443768, + "grad_norm": 0.07178865134780009, + "learning_rate": 8.342426581623175e-06, + "loss": 0.5618, + "step": 2210 + }, + { + "epoch": 1.0744072948328267, + "grad_norm": 0.08005915401669642, + "learning_rate": 8.341002347036304e-06, + "loss": 0.6187, + "step": 2211 + }, + { + "epoch": 1.0748936170212766, + "grad_norm": 0.07106382847881387, + "learning_rate": 8.33957762252938e-06, + "loss": 0.5496, + "step": 2212 + }, + { + "epoch": 1.0753799392097265, + "grad_norm": 0.07067027929768914, + "learning_rate": 8.338152408311319e-06, + "loss": 0.553, + "step": 2213 + }, + { + "epoch": 1.0758662613981762, + "grad_norm": 0.0734579768255136, + "learning_rate": 8.336726704591115e-06, + "loss": 0.5899, + "step": 2214 + }, + { + "epoch": 1.076352583586626, + "grad_norm": 0.07765415774627402, + "learning_rate": 8.33530051157783e-06, + "loss": 0.5516, + "step": 2215 + }, + { + "epoch": 1.076838905775076, + "grad_norm": 0.07842763607023047, + "learning_rate": 8.333873829480603e-06, + "loss": 0.5858, + "step": 2216 + }, + { + "epoch": 1.077325227963526, + "grad_norm": 0.07280806452633432, + "learning_rate": 8.332446658508635e-06, + "loss": 0.5336, + "step": 2217 + }, + { + "epoch": 1.0778115501519756, + "grad_norm": 0.07328412399923136, + "learning_rate": 8.331018998871207e-06, + "loss": 0.5515, + "step": 2218 + }, + { + "epoch": 1.0782978723404255, + "grad_norm": 0.07004925866365012, + "learning_rate": 8.32959085077767e-06, + "loss": 0.5217, + "step": 2219 + }, + { + "epoch": 1.0787841945288754, + "grad_norm": 0.07724885337452692, + "learning_rate": 8.328162214437445e-06, + "loss": 0.5509, + "step": 2220 + }, + { + "epoch": 1.0792705167173253, + "grad_norm": 0.07403290060850776, + "learning_rate": 8.326733090060022e-06, + "loss": 0.5188, + "step": 2221 + }, + { + "epoch": 1.079756838905775, + "grad_norm": 0.07350000907247445, + "learning_rate": 8.325303477854972e-06, + "loss": 0.5425, + "step": 2222 + }, + { + "epoch": 1.080243161094225, + "grad_norm": 0.0724690874642837, + "learning_rate": 8.323873378031929e-06, + "loss": 0.5446, + "step": 2223 + }, + { + "epoch": 1.0807294832826748, + "grad_norm": 0.07519148328232632, + "learning_rate": 8.322442790800597e-06, + "loss": 0.5325, + "step": 2224 + }, + { + "epoch": 1.0812158054711247, + "grad_norm": 0.07463718551271542, + "learning_rate": 8.32101171637076e-06, + "loss": 0.5579, + "step": 2225 + }, + { + "epoch": 1.0817021276595744, + "grad_norm": 0.07443179556519772, + "learning_rate": 8.319580154952266e-06, + "loss": 0.5528, + "step": 2226 + }, + { + "epoch": 1.0821884498480243, + "grad_norm": 0.07014470338470903, + "learning_rate": 8.318148106755042e-06, + "loss": 0.5706, + "step": 2227 + }, + { + "epoch": 1.0826747720364742, + "grad_norm": 0.07177509490248288, + "learning_rate": 8.316715571989076e-06, + "loss": 0.5604, + "step": 2228 + }, + { + "epoch": 1.083161094224924, + "grad_norm": 0.07217672580997961, + "learning_rate": 8.315282550864437e-06, + "loss": 0.5295, + "step": 2229 + }, + { + "epoch": 1.0836474164133738, + "grad_norm": 0.07944460307309677, + "learning_rate": 8.313849043591257e-06, + "loss": 0.6133, + "step": 2230 + }, + { + "epoch": 1.0841337386018237, + "grad_norm": 0.07032441954062435, + "learning_rate": 8.312415050379747e-06, + "loss": 0.5335, + "step": 2231 + }, + { + "epoch": 1.0846200607902736, + "grad_norm": 0.07353808812680972, + "learning_rate": 8.310980571440184e-06, + "loss": 0.5421, + "step": 2232 + }, + { + "epoch": 1.0851063829787233, + "grad_norm": 0.06888454283672282, + "learning_rate": 8.309545606982921e-06, + "loss": 0.5053, + "step": 2233 + }, + { + "epoch": 1.0855927051671732, + "grad_norm": 0.07420964223007172, + "learning_rate": 8.308110157218375e-06, + "loss": 0.553, + "step": 2234 + }, + { + "epoch": 1.0860790273556231, + "grad_norm": 0.0740488375265095, + "learning_rate": 8.306674222357042e-06, + "loss": 0.5475, + "step": 2235 + }, + { + "epoch": 1.086565349544073, + "grad_norm": 0.07710966058515396, + "learning_rate": 8.305237802609482e-06, + "loss": 0.6266, + "step": 2236 + }, + { + "epoch": 1.0870516717325227, + "grad_norm": 0.0713823660595672, + "learning_rate": 8.303800898186334e-06, + "loss": 0.5446, + "step": 2237 + }, + { + "epoch": 1.0875379939209726, + "grad_norm": 0.07380580777069042, + "learning_rate": 8.302363509298301e-06, + "loss": 0.5906, + "step": 2238 + }, + { + "epoch": 1.0880243161094225, + "grad_norm": 0.0729766720411248, + "learning_rate": 8.300925636156159e-06, + "loss": 0.5316, + "step": 2239 + }, + { + "epoch": 1.0885106382978724, + "grad_norm": 0.07237347169294653, + "learning_rate": 8.299487278970759e-06, + "loss": 0.5441, + "step": 2240 + }, + { + "epoch": 1.088996960486322, + "grad_norm": 0.07251047475207656, + "learning_rate": 8.298048437953016e-06, + "loss": 0.5296, + "step": 2241 + }, + { + "epoch": 1.089483282674772, + "grad_norm": 0.06924194952532757, + "learning_rate": 8.296609113313922e-06, + "loss": 0.5259, + "step": 2242 + }, + { + "epoch": 1.089969604863222, + "grad_norm": 0.07397881628415039, + "learning_rate": 8.295169305264537e-06, + "loss": 0.5775, + "step": 2243 + }, + { + "epoch": 1.0904559270516718, + "grad_norm": 0.07407185351497901, + "learning_rate": 8.293729014015992e-06, + "loss": 0.566, + "step": 2244 + }, + { + "epoch": 1.0909422492401215, + "grad_norm": 0.07206987544427718, + "learning_rate": 8.292288239779488e-06, + "loss": 0.5652, + "step": 2245 + }, + { + "epoch": 1.0914285714285714, + "grad_norm": 0.07477198142835094, + "learning_rate": 8.290846982766305e-06, + "loss": 0.5506, + "step": 2246 + }, + { + "epoch": 1.0919148936170213, + "grad_norm": 0.07233292901579166, + "learning_rate": 8.289405243187778e-06, + "loss": 0.5605, + "step": 2247 + }, + { + "epoch": 1.0924012158054712, + "grad_norm": 0.07297230216106336, + "learning_rate": 8.287963021255328e-06, + "loss": 0.5256, + "step": 2248 + }, + { + "epoch": 1.092887537993921, + "grad_norm": 0.07555470857502421, + "learning_rate": 8.286520317180436e-06, + "loss": 0.5665, + "step": 2249 + }, + { + "epoch": 1.0933738601823708, + "grad_norm": 0.07414197887063483, + "learning_rate": 8.285077131174661e-06, + "loss": 0.5274, + "step": 2250 + }, + { + "epoch": 1.0938601823708207, + "grad_norm": 0.0708868582550793, + "learning_rate": 8.283633463449632e-06, + "loss": 0.5679, + "step": 2251 + }, + { + "epoch": 1.0943465045592706, + "grad_norm": 0.07733633582481737, + "learning_rate": 8.282189314217041e-06, + "loss": 0.5828, + "step": 2252 + }, + { + "epoch": 1.0948328267477203, + "grad_norm": 0.07256848277708165, + "learning_rate": 8.28074468368866e-06, + "loss": 0.5363, + "step": 2253 + }, + { + "epoch": 1.0953191489361702, + "grad_norm": 0.08374072509978993, + "learning_rate": 8.279299572076325e-06, + "loss": 0.598, + "step": 2254 + }, + { + "epoch": 1.0958054711246201, + "grad_norm": 0.07690293949297158, + "learning_rate": 8.277853979591947e-06, + "loss": 0.5707, + "step": 2255 + }, + { + "epoch": 1.0962917933130698, + "grad_norm": 0.07592035379760226, + "learning_rate": 8.276407906447506e-06, + "loss": 0.5752, + "step": 2256 + }, + { + "epoch": 1.0967781155015197, + "grad_norm": 0.0738589124225445, + "learning_rate": 8.274961352855052e-06, + "loss": 0.5505, + "step": 2257 + }, + { + "epoch": 1.0972644376899696, + "grad_norm": 0.07401731823831176, + "learning_rate": 8.273514319026704e-06, + "loss": 0.5775, + "step": 2258 + }, + { + "epoch": 1.0977507598784195, + "grad_norm": 0.07492825265528798, + "learning_rate": 8.272066805174656e-06, + "loss": 0.5767, + "step": 2259 + }, + { + "epoch": 1.0982370820668692, + "grad_norm": 0.07443394569593802, + "learning_rate": 8.270618811511166e-06, + "loss": 0.5761, + "step": 2260 + }, + { + "epoch": 1.0987234042553191, + "grad_norm": 0.0747289140034974, + "learning_rate": 8.269170338248569e-06, + "loss": 0.5767, + "step": 2261 + }, + { + "epoch": 1.099209726443769, + "grad_norm": 0.07274394752087299, + "learning_rate": 8.267721385599265e-06, + "loss": 0.5221, + "step": 2262 + }, + { + "epoch": 1.099696048632219, + "grad_norm": 0.07658406691689554, + "learning_rate": 8.266271953775729e-06, + "loss": 0.5909, + "step": 2263 + }, + { + "epoch": 1.1001823708206686, + "grad_norm": 0.07421433969434833, + "learning_rate": 8.2648220429905e-06, + "loss": 0.5568, + "step": 2264 + }, + { + "epoch": 1.1006686930091185, + "grad_norm": 0.072874222443183, + "learning_rate": 8.263371653456193e-06, + "loss": 0.5493, + "step": 2265 + }, + { + "epoch": 1.1011550151975684, + "grad_norm": 0.06875869349577292, + "learning_rate": 8.26192078538549e-06, + "loss": 0.5013, + "step": 2266 + }, + { + "epoch": 1.1016413373860183, + "grad_norm": 0.07111814410994231, + "learning_rate": 8.260469438991147e-06, + "loss": 0.5305, + "step": 2267 + }, + { + "epoch": 1.102127659574468, + "grad_norm": 0.07018516477532395, + "learning_rate": 8.259017614485987e-06, + "loss": 0.5533, + "step": 2268 + }, + { + "epoch": 1.102613981762918, + "grad_norm": 0.07599691791673922, + "learning_rate": 8.2575653120829e-06, + "loss": 0.5559, + "step": 2269 + }, + { + "epoch": 1.1031003039513678, + "grad_norm": 0.07667748011043014, + "learning_rate": 8.256112531994855e-06, + "loss": 0.5499, + "step": 2270 + }, + { + "epoch": 1.1035866261398177, + "grad_norm": 0.07102692839201065, + "learning_rate": 8.25465927443488e-06, + "loss": 0.5686, + "step": 2271 + }, + { + "epoch": 1.1040729483282674, + "grad_norm": 0.0713289531214979, + "learning_rate": 8.253205539616083e-06, + "loss": 0.5698, + "step": 2272 + }, + { + "epoch": 1.1045592705167173, + "grad_norm": 0.07311010381473355, + "learning_rate": 8.251751327751636e-06, + "loss": 0.5533, + "step": 2273 + }, + { + "epoch": 1.1050455927051672, + "grad_norm": 0.07378183356170921, + "learning_rate": 8.250296639054782e-06, + "loss": 0.5738, + "step": 2274 + }, + { + "epoch": 1.105531914893617, + "grad_norm": 0.06968460024663366, + "learning_rate": 8.248841473738836e-06, + "loss": 0.54, + "step": 2275 + }, + { + "epoch": 1.1060182370820668, + "grad_norm": 0.07682691308786616, + "learning_rate": 8.247385832017182e-06, + "loss": 0.5986, + "step": 2276 + }, + { + "epoch": 1.1065045592705167, + "grad_norm": 0.0707884963401753, + "learning_rate": 8.24592971410327e-06, + "loss": 0.5275, + "step": 2277 + }, + { + "epoch": 1.1069908814589666, + "grad_norm": 0.07164579476861906, + "learning_rate": 8.244473120210628e-06, + "loss": 0.5513, + "step": 2278 + }, + { + "epoch": 1.1074772036474165, + "grad_norm": 0.07290309222367687, + "learning_rate": 8.243016050552843e-06, + "loss": 0.5723, + "step": 2279 + }, + { + "epoch": 1.1079635258358662, + "grad_norm": 0.07797427227668575, + "learning_rate": 8.241558505343584e-06, + "loss": 0.6045, + "step": 2280 + }, + { + "epoch": 1.1084498480243161, + "grad_norm": 0.07626168056439447, + "learning_rate": 8.240100484796581e-06, + "loss": 0.5946, + "step": 2281 + }, + { + "epoch": 1.108936170212766, + "grad_norm": 0.07078858084019193, + "learning_rate": 8.238641989125633e-06, + "loss": 0.5175, + "step": 2282 + }, + { + "epoch": 1.1094224924012157, + "grad_norm": 0.07316825569301089, + "learning_rate": 8.237183018544617e-06, + "loss": 0.5412, + "step": 2283 + }, + { + "epoch": 1.1099088145896656, + "grad_norm": 0.0735351716808662, + "learning_rate": 8.23572357326747e-06, + "loss": 0.5534, + "step": 2284 + }, + { + "epoch": 1.1103951367781155, + "grad_norm": 0.07391375176435087, + "learning_rate": 8.234263653508205e-06, + "loss": 0.566, + "step": 2285 + }, + { + "epoch": 1.1108814589665654, + "grad_norm": 0.07628396721234228, + "learning_rate": 8.232803259480903e-06, + "loss": 0.5795, + "step": 2286 + }, + { + "epoch": 1.111367781155015, + "grad_norm": 0.07346587884926059, + "learning_rate": 8.231342391399715e-06, + "loss": 0.5421, + "step": 2287 + }, + { + "epoch": 1.111854103343465, + "grad_norm": 0.07084422266219942, + "learning_rate": 8.229881049478859e-06, + "loss": 0.5736, + "step": 2288 + }, + { + "epoch": 1.112340425531915, + "grad_norm": 0.07085885676595888, + "learning_rate": 8.228419233932625e-06, + "loss": 0.557, + "step": 2289 + }, + { + "epoch": 1.1128267477203648, + "grad_norm": 0.07310875931804299, + "learning_rate": 8.226956944975371e-06, + "loss": 0.5325, + "step": 2290 + }, + { + "epoch": 1.1133130699088145, + "grad_norm": 0.0713775887788025, + "learning_rate": 8.225494182821526e-06, + "loss": 0.5739, + "step": 2291 + }, + { + "epoch": 1.1137993920972644, + "grad_norm": 0.07465749052223648, + "learning_rate": 8.224030947685588e-06, + "loss": 0.5519, + "step": 2292 + }, + { + "epoch": 1.1142857142857143, + "grad_norm": 0.07363670927009504, + "learning_rate": 8.222567239782122e-06, + "loss": 0.5485, + "step": 2293 + }, + { + "epoch": 1.1147720364741642, + "grad_norm": 0.07347708471026415, + "learning_rate": 8.221103059325764e-06, + "loss": 0.572, + "step": 2294 + }, + { + "epoch": 1.115258358662614, + "grad_norm": 0.07382179111154043, + "learning_rate": 8.21963840653122e-06, + "loss": 0.5408, + "step": 2295 + }, + { + "epoch": 1.1157446808510638, + "grad_norm": 0.0725446340206082, + "learning_rate": 8.218173281613266e-06, + "loss": 0.5648, + "step": 2296 + }, + { + "epoch": 1.1162310030395137, + "grad_norm": 0.07297901421589112, + "learning_rate": 8.216707684786747e-06, + "loss": 0.5075, + "step": 2297 + }, + { + "epoch": 1.1167173252279636, + "grad_norm": 0.07355275080077124, + "learning_rate": 8.215241616266572e-06, + "loss": 0.5676, + "step": 2298 + }, + { + "epoch": 1.1172036474164133, + "grad_norm": 0.0780838701470014, + "learning_rate": 8.213775076267725e-06, + "loss": 0.5872, + "step": 2299 + }, + { + "epoch": 1.1176899696048632, + "grad_norm": 0.07437635816784668, + "learning_rate": 8.212308065005258e-06, + "loss": 0.5528, + "step": 2300 + }, + { + "epoch": 1.1181762917933131, + "grad_norm": 0.07255265887762759, + "learning_rate": 8.210840582694292e-06, + "loss": 0.5425, + "step": 2301 + }, + { + "epoch": 1.1186626139817628, + "grad_norm": 0.07222145230841452, + "learning_rate": 8.209372629550018e-06, + "loss": 0.5325, + "step": 2302 + }, + { + "epoch": 1.1191489361702127, + "grad_norm": 0.0756938581315832, + "learning_rate": 8.20790420578769e-06, + "loss": 0.5757, + "step": 2303 + }, + { + "epoch": 1.1196352583586626, + "grad_norm": 0.07315454133058819, + "learning_rate": 8.206435311622641e-06, + "loss": 0.5478, + "step": 2304 + }, + { + "epoch": 1.1201215805471125, + "grad_norm": 0.07108540784682003, + "learning_rate": 8.204965947270263e-06, + "loss": 0.5295, + "step": 2305 + }, + { + "epoch": 1.1206079027355622, + "grad_norm": 0.07104173942728903, + "learning_rate": 8.203496112946024e-06, + "loss": 0.5281, + "step": 2306 + }, + { + "epoch": 1.1210942249240121, + "grad_norm": 0.07256337848384752, + "learning_rate": 8.202025808865457e-06, + "loss": 0.5636, + "step": 2307 + }, + { + "epoch": 1.121580547112462, + "grad_norm": 0.07464557496901748, + "learning_rate": 8.20055503524417e-06, + "loss": 0.5596, + "step": 2308 + }, + { + "epoch": 1.122066869300912, + "grad_norm": 0.06911534907101134, + "learning_rate": 8.199083792297828e-06, + "loss": 0.5198, + "step": 2309 + }, + { + "epoch": 1.1225531914893616, + "grad_norm": 0.07364494487904293, + "learning_rate": 8.197612080242176e-06, + "loss": 0.5522, + "step": 2310 + }, + { + "epoch": 1.1230395136778115, + "grad_norm": 0.0776368366618566, + "learning_rate": 8.196139899293026e-06, + "loss": 0.605, + "step": 2311 + }, + { + "epoch": 1.1235258358662614, + "grad_norm": 0.07438549464229299, + "learning_rate": 8.194667249666252e-06, + "loss": 0.5465, + "step": 2312 + }, + { + "epoch": 1.1240121580547113, + "grad_norm": 0.07548215385303386, + "learning_rate": 8.193194131577807e-06, + "loss": 0.5587, + "step": 2313 + }, + { + "epoch": 1.124498480243161, + "grad_norm": 0.09125662242222138, + "learning_rate": 8.191720545243702e-06, + "loss": 0.5984, + "step": 2314 + }, + { + "epoch": 1.124984802431611, + "grad_norm": 0.07428471493036362, + "learning_rate": 8.190246490880022e-06, + "loss": 0.5864, + "step": 2315 + }, + { + "epoch": 1.1254711246200608, + "grad_norm": 0.07233519563341763, + "learning_rate": 8.188771968702924e-06, + "loss": 0.5337, + "step": 2316 + }, + { + "epoch": 1.1259574468085107, + "grad_norm": 0.07468656713441423, + "learning_rate": 8.187296978928626e-06, + "loss": 0.5479, + "step": 2317 + }, + { + "epoch": 1.1264437689969604, + "grad_norm": 0.07203886426181973, + "learning_rate": 8.18582152177342e-06, + "loss": 0.5825, + "step": 2318 + }, + { + "epoch": 1.1269300911854103, + "grad_norm": 0.07114598694440448, + "learning_rate": 8.184345597453668e-06, + "loss": 0.5387, + "step": 2319 + }, + { + "epoch": 1.1274164133738602, + "grad_norm": 0.07372910166719628, + "learning_rate": 8.182869206185793e-06, + "loss": 0.5398, + "step": 2320 + }, + { + "epoch": 1.12790273556231, + "grad_norm": 0.07590526077415068, + "learning_rate": 8.181392348186292e-06, + "loss": 0.5646, + "step": 2321 + }, + { + "epoch": 1.1283890577507598, + "grad_norm": 0.07348618862183959, + "learning_rate": 8.17991502367173e-06, + "loss": 0.5337, + "step": 2322 + }, + { + "epoch": 1.1288753799392097, + "grad_norm": 0.07230980408532811, + "learning_rate": 8.178437232858743e-06, + "loss": 0.5361, + "step": 2323 + }, + { + "epoch": 1.1293617021276596, + "grad_norm": 0.07326929881544383, + "learning_rate": 8.176958975964027e-06, + "loss": 0.5531, + "step": 2324 + }, + { + "epoch": 1.1298480243161095, + "grad_norm": 0.07568903508588357, + "learning_rate": 8.175480253204354e-06, + "loss": 0.5344, + "step": 2325 + }, + { + "epoch": 1.1303343465045592, + "grad_norm": 0.07817467765739233, + "learning_rate": 8.174001064796561e-06, + "loss": 0.6008, + "step": 2326 + }, + { + "epoch": 1.1308206686930091, + "grad_norm": 0.07577221864945845, + "learning_rate": 8.172521410957556e-06, + "loss": 0.5538, + "step": 2327 + }, + { + "epoch": 1.131306990881459, + "grad_norm": 0.08501073684948966, + "learning_rate": 8.171041291904314e-06, + "loss": 0.5968, + "step": 2328 + }, + { + "epoch": 1.1317933130699087, + "grad_norm": 0.07583377031660356, + "learning_rate": 8.169560707853875e-06, + "loss": 0.5612, + "step": 2329 + }, + { + "epoch": 1.1322796352583586, + "grad_norm": 0.07264208084143413, + "learning_rate": 8.168079659023349e-06, + "loss": 0.5479, + "step": 2330 + }, + { + "epoch": 1.1327659574468085, + "grad_norm": 0.07409268626914883, + "learning_rate": 8.16659814562992e-06, + "loss": 0.547, + "step": 2331 + }, + { + "epoch": 1.1332522796352584, + "grad_norm": 0.07250890432028849, + "learning_rate": 8.16511616789083e-06, + "loss": 0.5613, + "step": 2332 + }, + { + "epoch": 1.1337386018237081, + "grad_norm": 0.07470000985304677, + "learning_rate": 8.163633726023397e-06, + "loss": 0.5722, + "step": 2333 + }, + { + "epoch": 1.134224924012158, + "grad_norm": 0.07379213284018293, + "learning_rate": 8.162150820245005e-06, + "loss": 0.5662, + "step": 2334 + }, + { + "epoch": 1.134711246200608, + "grad_norm": 0.07258929104067394, + "learning_rate": 8.1606674507731e-06, + "loss": 0.5402, + "step": 2335 + }, + { + "epoch": 1.1351975683890578, + "grad_norm": 0.07584517661540127, + "learning_rate": 8.159183617825208e-06, + "loss": 0.5801, + "step": 2336 + }, + { + "epoch": 1.1356838905775075, + "grad_norm": 0.07251319222361043, + "learning_rate": 8.157699321618912e-06, + "loss": 0.5039, + "step": 2337 + }, + { + "epoch": 1.1361702127659574, + "grad_norm": 0.07237410467070883, + "learning_rate": 8.156214562371872e-06, + "loss": 0.5528, + "step": 2338 + }, + { + "epoch": 1.1366565349544073, + "grad_norm": 0.07391677045271353, + "learning_rate": 8.154729340301803e-06, + "loss": 0.5409, + "step": 2339 + }, + { + "epoch": 1.1371428571428572, + "grad_norm": 0.07314975802655224, + "learning_rate": 8.153243655626501e-06, + "loss": 0.5556, + "step": 2340 + }, + { + "epoch": 1.137629179331307, + "grad_norm": 0.07315068503315438, + "learning_rate": 8.151757508563828e-06, + "loss": 0.5752, + "step": 2341 + }, + { + "epoch": 1.1381155015197568, + "grad_norm": 0.07444581458029567, + "learning_rate": 8.150270899331704e-06, + "loss": 0.5481, + "step": 2342 + }, + { + "epoch": 1.1386018237082067, + "grad_norm": 0.08269614720845281, + "learning_rate": 8.148783828148127e-06, + "loss": 0.5925, + "step": 2343 + }, + { + "epoch": 1.1390881458966566, + "grad_norm": 0.07279040538769634, + "learning_rate": 8.147296295231158e-06, + "loss": 0.6029, + "step": 2344 + }, + { + "epoch": 1.1395744680851063, + "grad_norm": 0.07399682151655745, + "learning_rate": 8.145808300798929e-06, + "loss": 0.5589, + "step": 2345 + }, + { + "epoch": 1.1400607902735562, + "grad_norm": 0.07046144666284118, + "learning_rate": 8.144319845069635e-06, + "loss": 0.5361, + "step": 2346 + }, + { + "epoch": 1.1405471124620061, + "grad_norm": 0.07476559736693897, + "learning_rate": 8.14283092826154e-06, + "loss": 0.5619, + "step": 2347 + }, + { + "epoch": 1.1410334346504558, + "grad_norm": 0.07259863378385552, + "learning_rate": 8.14134155059298e-06, + "loss": 0.5454, + "step": 2348 + }, + { + "epoch": 1.1415197568389057, + "grad_norm": 0.07290388885427557, + "learning_rate": 8.139851712282354e-06, + "loss": 0.5437, + "step": 2349 + }, + { + "epoch": 1.1420060790273556, + "grad_norm": 0.07247692548828968, + "learning_rate": 8.138361413548129e-06, + "loss": 0.5889, + "step": 2350 + }, + { + "epoch": 1.1424924012158055, + "grad_norm": 0.07299489110850316, + "learning_rate": 8.136870654608842e-06, + "loss": 0.5922, + "step": 2351 + }, + { + "epoch": 1.1429787234042554, + "grad_norm": 0.07369963670994291, + "learning_rate": 8.135379435683093e-06, + "loss": 0.583, + "step": 2352 + }, + { + "epoch": 1.1434650455927051, + "grad_norm": 0.07816873752694214, + "learning_rate": 8.133887756989558e-06, + "loss": 0.5877, + "step": 2353 + }, + { + "epoch": 1.143951367781155, + "grad_norm": 0.07520663338779614, + "learning_rate": 8.132395618746968e-06, + "loss": 0.5627, + "step": 2354 + }, + { + "epoch": 1.144437689969605, + "grad_norm": 0.07272354248253457, + "learning_rate": 8.130903021174133e-06, + "loss": 0.5721, + "step": 2355 + }, + { + "epoch": 1.1449240121580546, + "grad_norm": 0.0721093954128523, + "learning_rate": 8.129409964489922e-06, + "loss": 0.5434, + "step": 2356 + }, + { + "epoch": 1.1454103343465045, + "grad_norm": 0.07081119583169738, + "learning_rate": 8.127916448913279e-06, + "loss": 0.5178, + "step": 2357 + }, + { + "epoch": 1.1458966565349544, + "grad_norm": 0.07449602768115966, + "learning_rate": 8.126422474663205e-06, + "loss": 0.5669, + "step": 2358 + }, + { + "epoch": 1.1463829787234043, + "grad_norm": 0.07772632051068967, + "learning_rate": 8.124928041958782e-06, + "loss": 0.622, + "step": 2359 + }, + { + "epoch": 1.146869300911854, + "grad_norm": 0.07740272001670627, + "learning_rate": 8.123433151019145e-06, + "loss": 0.5525, + "step": 2360 + }, + { + "epoch": 1.147355623100304, + "grad_norm": 0.07279554553202595, + "learning_rate": 8.121937802063506e-06, + "loss": 0.5496, + "step": 2361 + }, + { + "epoch": 1.1478419452887538, + "grad_norm": 0.0743936126173316, + "learning_rate": 8.120441995311142e-06, + "loss": 0.5526, + "step": 2362 + }, + { + "epoch": 1.1483282674772037, + "grad_norm": 0.07477752989667144, + "learning_rate": 8.118945730981391e-06, + "loss": 0.5545, + "step": 2363 + }, + { + "epoch": 1.1488145896656534, + "grad_norm": 0.07226408996123988, + "learning_rate": 8.117449009293668e-06, + "loss": 0.5303, + "step": 2364 + }, + { + "epoch": 1.1493009118541033, + "grad_norm": 0.0744115269384535, + "learning_rate": 8.11595183046745e-06, + "loss": 0.5725, + "step": 2365 + }, + { + "epoch": 1.1497872340425532, + "grad_norm": 0.07881001847565439, + "learning_rate": 8.114454194722277e-06, + "loss": 0.558, + "step": 2366 + }, + { + "epoch": 1.1502735562310031, + "grad_norm": 0.07023724203490371, + "learning_rate": 8.112956102277768e-06, + "loss": 0.5424, + "step": 2367 + }, + { + "epoch": 1.1507598784194528, + "grad_norm": 0.07304456908262345, + "learning_rate": 8.111457553353593e-06, + "loss": 0.5441, + "step": 2368 + }, + { + "epoch": 1.1512462006079027, + "grad_norm": 0.07323779317396077, + "learning_rate": 8.109958548169502e-06, + "loss": 0.538, + "step": 2369 + }, + { + "epoch": 1.1517325227963526, + "grad_norm": 0.07918191480121578, + "learning_rate": 8.108459086945304e-06, + "loss": 0.5225, + "step": 2370 + }, + { + "epoch": 1.1522188449848025, + "grad_norm": 0.07288681865481134, + "learning_rate": 8.10695916990088e-06, + "loss": 0.5817, + "step": 2371 + }, + { + "epoch": 1.1527051671732522, + "grad_norm": 0.07480566253435578, + "learning_rate": 8.105458797256178e-06, + "loss": 0.5153, + "step": 2372 + }, + { + "epoch": 1.1531914893617021, + "grad_norm": 0.07325030843769754, + "learning_rate": 8.103957969231209e-06, + "loss": 0.5772, + "step": 2373 + }, + { + "epoch": 1.153677811550152, + "grad_norm": 0.07258208535140075, + "learning_rate": 8.102456686046049e-06, + "loss": 0.576, + "step": 2374 + }, + { + "epoch": 1.1541641337386017, + "grad_norm": 0.07551636128247924, + "learning_rate": 8.100954947920848e-06, + "loss": 0.5704, + "step": 2375 + }, + { + "epoch": 1.1546504559270516, + "grad_norm": 0.07388579042972913, + "learning_rate": 8.099452755075816e-06, + "loss": 0.5568, + "step": 2376 + }, + { + "epoch": 1.1551367781155015, + "grad_norm": 0.07007502527740003, + "learning_rate": 8.097950107731233e-06, + "loss": 0.5261, + "step": 2377 + }, + { + "epoch": 1.1556231003039514, + "grad_norm": 0.0710994571298559, + "learning_rate": 8.09644700610745e-06, + "loss": 0.5547, + "step": 2378 + }, + { + "epoch": 1.1561094224924013, + "grad_norm": 0.0725557276806488, + "learning_rate": 8.094943450424874e-06, + "loss": 0.5456, + "step": 2379 + }, + { + "epoch": 1.156595744680851, + "grad_norm": 0.07050151057099496, + "learning_rate": 8.093439440903988e-06, + "loss": 0.5467, + "step": 2380 + }, + { + "epoch": 1.157082066869301, + "grad_norm": 0.07348609928936829, + "learning_rate": 8.091934977765335e-06, + "loss": 0.5249, + "step": 2381 + }, + { + "epoch": 1.1575683890577508, + "grad_norm": 0.07275185521050123, + "learning_rate": 8.090430061229528e-06, + "loss": 0.5669, + "step": 2382 + }, + { + "epoch": 1.1580547112462005, + "grad_norm": 0.07142429747405558, + "learning_rate": 8.088924691517246e-06, + "loss": 0.5243, + "step": 2383 + }, + { + "epoch": 1.1585410334346504, + "grad_norm": 0.0694540819435593, + "learning_rate": 8.087418868849239e-06, + "loss": 0.5458, + "step": 2384 + }, + { + "epoch": 1.1590273556231003, + "grad_norm": 0.07172422515760193, + "learning_rate": 8.08591259344631e-06, + "loss": 0.5192, + "step": 2385 + }, + { + "epoch": 1.1595136778115502, + "grad_norm": 0.07140406545204077, + "learning_rate": 8.084405865529345e-06, + "loss": 0.5989, + "step": 2386 + }, + { + "epoch": 1.16, + "grad_norm": 0.06955197600711828, + "learning_rate": 8.082898685319285e-06, + "loss": 0.5427, + "step": 2387 + }, + { + "epoch": 1.1604863221884498, + "grad_norm": 0.07217385558971261, + "learning_rate": 8.081391053037141e-06, + "loss": 0.5604, + "step": 2388 + }, + { + "epoch": 1.1609726443768997, + "grad_norm": 0.0727474916000304, + "learning_rate": 8.079882968903991e-06, + "loss": 0.5709, + "step": 2389 + }, + { + "epoch": 1.1614589665653496, + "grad_norm": 0.07219831244740385, + "learning_rate": 8.078374433140978e-06, + "loss": 0.5509, + "step": 2390 + }, + { + "epoch": 1.1619452887537993, + "grad_norm": 0.07554260473766503, + "learning_rate": 8.076865445969313e-06, + "loss": 0.5894, + "step": 2391 + }, + { + "epoch": 1.1624316109422492, + "grad_norm": 0.07242202238091798, + "learning_rate": 8.07535600761027e-06, + "loss": 0.5359, + "step": 2392 + }, + { + "epoch": 1.1629179331306991, + "grad_norm": 0.0713802542785267, + "learning_rate": 8.07384611828519e-06, + "loss": 0.5151, + "step": 2393 + }, + { + "epoch": 1.1634042553191488, + "grad_norm": 0.07036634028632871, + "learning_rate": 8.072335778215482e-06, + "loss": 0.5202, + "step": 2394 + }, + { + "epoch": 1.1638905775075987, + "grad_norm": 0.07384883135264404, + "learning_rate": 8.070824987622622e-06, + "loss": 0.5426, + "step": 2395 + }, + { + "epoch": 1.1643768996960486, + "grad_norm": 0.07494430281618236, + "learning_rate": 8.069313746728149e-06, + "loss": 0.537, + "step": 2396 + }, + { + "epoch": 1.1648632218844985, + "grad_norm": 0.0708101239525627, + "learning_rate": 8.067802055753668e-06, + "loss": 0.5805, + "step": 2397 + }, + { + "epoch": 1.1653495440729484, + "grad_norm": 0.07304393953637715, + "learning_rate": 8.066289914920855e-06, + "loss": 0.5805, + "step": 2398 + }, + { + "epoch": 1.1658358662613981, + "grad_norm": 0.07248427654023368, + "learning_rate": 8.064777324451445e-06, + "loss": 0.5456, + "step": 2399 + }, + { + "epoch": 1.166322188449848, + "grad_norm": 0.07168278920822603, + "learning_rate": 8.063264284567245e-06, + "loss": 0.5325, + "step": 2400 + }, + { + "epoch": 1.166808510638298, + "grad_norm": 0.07260304773273424, + "learning_rate": 8.061750795490121e-06, + "loss": 0.5421, + "step": 2401 + }, + { + "epoch": 1.1672948328267476, + "grad_norm": 0.07073240793528326, + "learning_rate": 8.060236857442013e-06, + "loss": 0.5444, + "step": 2402 + }, + { + "epoch": 1.1677811550151975, + "grad_norm": 0.07124751379094929, + "learning_rate": 8.058722470644919e-06, + "loss": 0.5538, + "step": 2403 + }, + { + "epoch": 1.1682674772036474, + "grad_norm": 0.07255706674488437, + "learning_rate": 8.05720763532091e-06, + "loss": 0.5956, + "step": 2404 + }, + { + "epoch": 1.1687537993920973, + "grad_norm": 0.07514028642949143, + "learning_rate": 8.055692351692118e-06, + "loss": 0.5598, + "step": 2405 + }, + { + "epoch": 1.169240121580547, + "grad_norm": 0.07435480683259586, + "learning_rate": 8.054176619980742e-06, + "loss": 0.575, + "step": 2406 + }, + { + "epoch": 1.169726443768997, + "grad_norm": 0.06917363009187513, + "learning_rate": 8.052660440409049e-06, + "loss": 0.5447, + "step": 2407 + }, + { + "epoch": 1.1702127659574468, + "grad_norm": 0.07660782953715432, + "learning_rate": 8.051143813199366e-06, + "loss": 0.5436, + "step": 2408 + }, + { + "epoch": 1.1706990881458967, + "grad_norm": 0.07096661355467314, + "learning_rate": 8.049626738574091e-06, + "loss": 0.5357, + "step": 2409 + }, + { + "epoch": 1.1711854103343464, + "grad_norm": 0.0723918015905883, + "learning_rate": 8.048109216755687e-06, + "loss": 0.564, + "step": 2410 + }, + { + "epoch": 1.1716717325227963, + "grad_norm": 0.07240156024247478, + "learning_rate": 8.046591247966677e-06, + "loss": 0.5579, + "step": 2411 + }, + { + "epoch": 1.1721580547112462, + "grad_norm": 0.07499967279942138, + "learning_rate": 8.045072832429659e-06, + "loss": 0.5298, + "step": 2412 + }, + { + "epoch": 1.1726443768996961, + "grad_norm": 0.07206703218637224, + "learning_rate": 8.043553970367289e-06, + "loss": 0.5558, + "step": 2413 + }, + { + "epoch": 1.1731306990881458, + "grad_norm": 0.07382353090754513, + "learning_rate": 8.042034662002291e-06, + "loss": 0.559, + "step": 2414 + }, + { + "epoch": 1.1736170212765957, + "grad_norm": 0.07221139146654051, + "learning_rate": 8.040514907557453e-06, + "loss": 0.5544, + "step": 2415 + }, + { + "epoch": 1.1741033434650456, + "grad_norm": 0.07118992274955663, + "learning_rate": 8.038994707255634e-06, + "loss": 0.5592, + "step": 2416 + }, + { + "epoch": 1.1745896656534955, + "grad_norm": 0.07545508631132494, + "learning_rate": 8.037474061319749e-06, + "loss": 0.598, + "step": 2417 + }, + { + "epoch": 1.1750759878419452, + "grad_norm": 0.07437658853441415, + "learning_rate": 8.035952969972787e-06, + "loss": 0.5671, + "step": 2418 + }, + { + "epoch": 1.1755623100303951, + "grad_norm": 0.07134353693043526, + "learning_rate": 8.034431433437796e-06, + "loss": 0.5057, + "step": 2419 + }, + { + "epoch": 1.176048632218845, + "grad_norm": 0.07567010649198576, + "learning_rate": 8.032909451937894e-06, + "loss": 0.5648, + "step": 2420 + }, + { + "epoch": 1.1765349544072947, + "grad_norm": 0.07445581833328138, + "learning_rate": 8.031387025696262e-06, + "loss": 0.5437, + "step": 2421 + }, + { + "epoch": 1.1770212765957446, + "grad_norm": 0.0724348871635926, + "learning_rate": 8.029864154936147e-06, + "loss": 0.5698, + "step": 2422 + }, + { + "epoch": 1.1775075987841945, + "grad_norm": 0.07251554067984517, + "learning_rate": 8.028340839880859e-06, + "loss": 0.5499, + "step": 2423 + }, + { + "epoch": 1.1779939209726444, + "grad_norm": 0.07270364518484779, + "learning_rate": 8.026817080753777e-06, + "loss": 0.523, + "step": 2424 + }, + { + "epoch": 1.1784802431610943, + "grad_norm": 0.07547479780283077, + "learning_rate": 8.025292877778341e-06, + "loss": 0.6004, + "step": 2425 + }, + { + "epoch": 1.178966565349544, + "grad_norm": 0.07493385206939647, + "learning_rate": 8.02376823117806e-06, + "loss": 0.6001, + "step": 2426 + }, + { + "epoch": 1.179452887537994, + "grad_norm": 0.07283915499977123, + "learning_rate": 8.022243141176504e-06, + "loss": 0.5441, + "step": 2427 + }, + { + "epoch": 1.1799392097264438, + "grad_norm": 0.07832562541333396, + "learning_rate": 8.020717607997311e-06, + "loss": 0.5782, + "step": 2428 + }, + { + "epoch": 1.1804255319148935, + "grad_norm": 0.07315487304790864, + "learning_rate": 8.019191631864185e-06, + "loss": 0.5598, + "step": 2429 + }, + { + "epoch": 1.1809118541033434, + "grad_norm": 0.07260609245093201, + "learning_rate": 8.017665213000889e-06, + "loss": 0.5425, + "step": 2430 + }, + { + "epoch": 1.1813981762917933, + "grad_norm": 0.07519989191703304, + "learning_rate": 8.016138351631259e-06, + "loss": 0.5805, + "step": 2431 + }, + { + "epoch": 1.1818844984802432, + "grad_norm": 0.07355054047266626, + "learning_rate": 8.01461104797919e-06, + "loss": 0.5557, + "step": 2432 + }, + { + "epoch": 1.182370820668693, + "grad_norm": 0.07191380812111182, + "learning_rate": 8.013083302268645e-06, + "loss": 0.5486, + "step": 2433 + }, + { + "epoch": 1.1828571428571428, + "grad_norm": 0.07376908579355418, + "learning_rate": 8.011555114723648e-06, + "loss": 0.567, + "step": 2434 + }, + { + "epoch": 1.1833434650455927, + "grad_norm": 0.07125255458262628, + "learning_rate": 8.010026485568292e-06, + "loss": 0.5566, + "step": 2435 + }, + { + "epoch": 1.1838297872340426, + "grad_norm": 0.07301185433898116, + "learning_rate": 8.008497415026733e-06, + "loss": 0.5267, + "step": 2436 + }, + { + "epoch": 1.1843161094224923, + "grad_norm": 0.07171045719796212, + "learning_rate": 8.006967903323192e-06, + "loss": 0.5473, + "step": 2437 + }, + { + "epoch": 1.1848024316109422, + "grad_norm": 0.07321562721300487, + "learning_rate": 8.005437950681956e-06, + "loss": 0.5404, + "step": 2438 + }, + { + "epoch": 1.1852887537993921, + "grad_norm": 0.07683406798361234, + "learning_rate": 8.003907557327371e-06, + "loss": 0.5453, + "step": 2439 + }, + { + "epoch": 1.185775075987842, + "grad_norm": 0.07503300807536474, + "learning_rate": 8.002376723483855e-06, + "loss": 0.5278, + "step": 2440 + }, + { + "epoch": 1.1862613981762917, + "grad_norm": 0.07652952388338584, + "learning_rate": 8.000845449375888e-06, + "loss": 0.557, + "step": 2441 + }, + { + "epoch": 1.1867477203647416, + "grad_norm": 0.07450488448116986, + "learning_rate": 7.999313735228012e-06, + "loss": 0.5365, + "step": 2442 + }, + { + "epoch": 1.1872340425531915, + "grad_norm": 0.07159190223343588, + "learning_rate": 7.997781581264837e-06, + "loss": 0.5294, + "step": 2443 + }, + { + "epoch": 1.1877203647416414, + "grad_norm": 0.06893698833420311, + "learning_rate": 7.996248987711033e-06, + "loss": 0.5199, + "step": 2444 + }, + { + "epoch": 1.1882066869300911, + "grad_norm": 0.07420866832536976, + "learning_rate": 7.994715954791341e-06, + "loss": 0.5658, + "step": 2445 + }, + { + "epoch": 1.188693009118541, + "grad_norm": 0.07652733654565654, + "learning_rate": 7.993182482730562e-06, + "loss": 0.5338, + "step": 2446 + }, + { + "epoch": 1.189179331306991, + "grad_norm": 0.07666678117168962, + "learning_rate": 7.991648571753561e-06, + "loss": 0.5366, + "step": 2447 + }, + { + "epoch": 1.1896656534954406, + "grad_norm": 0.07298374530670172, + "learning_rate": 7.99011422208527e-06, + "loss": 0.534, + "step": 2448 + }, + { + "epoch": 1.1901519756838905, + "grad_norm": 0.07050531308264864, + "learning_rate": 7.988579433950682e-06, + "loss": 0.5688, + "step": 2449 + }, + { + "epoch": 1.1906382978723404, + "grad_norm": 0.07497532154511899, + "learning_rate": 7.987044207574858e-06, + "loss": 0.5545, + "step": 2450 + }, + { + "epoch": 1.1911246200607903, + "grad_norm": 0.07384194932322664, + "learning_rate": 7.985508543182922e-06, + "loss": 0.5231, + "step": 2451 + }, + { + "epoch": 1.1916109422492402, + "grad_norm": 0.0727618724432849, + "learning_rate": 7.98397244100006e-06, + "loss": 0.5358, + "step": 2452 + }, + { + "epoch": 1.19209726443769, + "grad_norm": 0.075666823132178, + "learning_rate": 7.982435901251527e-06, + "loss": 0.5779, + "step": 2453 + }, + { + "epoch": 1.1925835866261398, + "grad_norm": 0.07296200200723844, + "learning_rate": 7.980898924162634e-06, + "loss": 0.5466, + "step": 2454 + }, + { + "epoch": 1.1930699088145897, + "grad_norm": 0.07121593121910853, + "learning_rate": 7.979361509958764e-06, + "loss": 0.5544, + "step": 2455 + }, + { + "epoch": 1.1935562310030394, + "grad_norm": 0.07400329717840662, + "learning_rate": 7.977823658865364e-06, + "loss": 0.541, + "step": 2456 + }, + { + "epoch": 1.1940425531914893, + "grad_norm": 0.0724387865765923, + "learning_rate": 7.976285371107937e-06, + "loss": 0.569, + "step": 2457 + }, + { + "epoch": 1.1945288753799392, + "grad_norm": 0.07045253743964261, + "learning_rate": 7.97474664691206e-06, + "loss": 0.5806, + "step": 2458 + }, + { + "epoch": 1.1950151975683891, + "grad_norm": 0.07472049799434143, + "learning_rate": 7.973207486503368e-06, + "loss": 0.5542, + "step": 2459 + }, + { + "epoch": 1.1955015197568388, + "grad_norm": 0.07230265384847155, + "learning_rate": 7.971667890107561e-06, + "loss": 0.5308, + "step": 2460 + }, + { + "epoch": 1.1959878419452887, + "grad_norm": 0.07165969078477409, + "learning_rate": 7.970127857950403e-06, + "loss": 0.5442, + "step": 2461 + }, + { + "epoch": 1.1964741641337386, + "grad_norm": 0.07141666260760997, + "learning_rate": 7.968587390257723e-06, + "loss": 0.5392, + "step": 2462 + }, + { + "epoch": 1.1969604863221885, + "grad_norm": 0.073726461087926, + "learning_rate": 7.967046487255412e-06, + "loss": 0.5591, + "step": 2463 + }, + { + "epoch": 1.1974468085106382, + "grad_norm": 0.07365654959312602, + "learning_rate": 7.965505149169428e-06, + "loss": 0.5534, + "step": 2464 + }, + { + "epoch": 1.1979331306990881, + "grad_norm": 0.07256725216797784, + "learning_rate": 7.963963376225788e-06, + "loss": 0.5184, + "step": 2465 + }, + { + "epoch": 1.198419452887538, + "grad_norm": 0.0722980832197576, + "learning_rate": 7.962421168650576e-06, + "loss": 0.5258, + "step": 2466 + }, + { + "epoch": 1.1989057750759877, + "grad_norm": 0.07002800580779552, + "learning_rate": 7.960878526669942e-06, + "loss": 0.533, + "step": 2467 + }, + { + "epoch": 1.1993920972644376, + "grad_norm": 0.07528203964508505, + "learning_rate": 7.959335450510095e-06, + "loss": 0.5561, + "step": 2468 + }, + { + "epoch": 1.1998784194528875, + "grad_norm": 0.07609456104563551, + "learning_rate": 7.957791940397309e-06, + "loss": 0.5808, + "step": 2469 + }, + { + "epoch": 1.2003647416413374, + "grad_norm": 0.06919684922827463, + "learning_rate": 7.956247996557924e-06, + "loss": 0.5209, + "step": 2470 + }, + { + "epoch": 1.2008510638297873, + "grad_norm": 0.0732215846563845, + "learning_rate": 7.95470361921834e-06, + "loss": 0.5601, + "step": 2471 + }, + { + "epoch": 1.201337386018237, + "grad_norm": 0.07671494773527357, + "learning_rate": 7.953158808605023e-06, + "loss": 0.6011, + "step": 2472 + }, + { + "epoch": 1.201823708206687, + "grad_norm": 0.07531487021854413, + "learning_rate": 7.951613564944502e-06, + "loss": 0.599, + "step": 2473 + }, + { + "epoch": 1.2023100303951368, + "grad_norm": 0.0718495963689583, + "learning_rate": 7.95006788846337e-06, + "loss": 0.5438, + "step": 2474 + }, + { + "epoch": 1.2027963525835865, + "grad_norm": 0.0705459930335682, + "learning_rate": 7.94852177938828e-06, + "loss": 0.5155, + "step": 2475 + }, + { + "epoch": 1.2032826747720364, + "grad_norm": 0.07212110106373414, + "learning_rate": 7.946975237945958e-06, + "loss": 0.5541, + "step": 2476 + }, + { + "epoch": 1.2037689969604863, + "grad_norm": 0.07152013637987481, + "learning_rate": 7.94542826436318e-06, + "loss": 0.5142, + "step": 2477 + }, + { + "epoch": 1.2042553191489362, + "grad_norm": 0.07505027355903858, + "learning_rate": 7.943880858866794e-06, + "loss": 0.5644, + "step": 2478 + }, + { + "epoch": 1.204741641337386, + "grad_norm": 0.07345249967963459, + "learning_rate": 7.942333021683712e-06, + "loss": 0.5288, + "step": 2479 + }, + { + "epoch": 1.2052279635258358, + "grad_norm": 0.0710178381330521, + "learning_rate": 7.940784753040903e-06, + "loss": 0.5588, + "step": 2480 + }, + { + "epoch": 1.2057142857142857, + "grad_norm": 0.07389132534313793, + "learning_rate": 7.939236053165404e-06, + "loss": 0.5495, + "step": 2481 + }, + { + "epoch": 1.2062006079027356, + "grad_norm": 0.07367613047760858, + "learning_rate": 7.937686922284319e-06, + "loss": 0.528, + "step": 2482 + }, + { + "epoch": 1.2066869300911853, + "grad_norm": 0.0704449840835992, + "learning_rate": 7.936137360624802e-06, + "loss": 0.536, + "step": 2483 + }, + { + "epoch": 1.2071732522796352, + "grad_norm": 0.07098458523649463, + "learning_rate": 7.934587368414085e-06, + "loss": 0.5548, + "step": 2484 + }, + { + "epoch": 1.2076595744680851, + "grad_norm": 0.07141953262171628, + "learning_rate": 7.933036945879455e-06, + "loss": 0.5372, + "step": 2485 + }, + { + "epoch": 1.208145896656535, + "grad_norm": 0.07441054373557604, + "learning_rate": 7.931486093248263e-06, + "loss": 0.5808, + "step": 2486 + }, + { + "epoch": 1.2086322188449847, + "grad_norm": 0.07615638168102483, + "learning_rate": 7.929934810747926e-06, + "loss": 0.5867, + "step": 2487 + }, + { + "epoch": 1.2091185410334346, + "grad_norm": 0.07317579164594856, + "learning_rate": 7.928383098605921e-06, + "loss": 0.5749, + "step": 2488 + }, + { + "epoch": 1.2096048632218845, + "grad_norm": 0.07023029276349807, + "learning_rate": 7.926830957049787e-06, + "loss": 0.5434, + "step": 2489 + }, + { + "epoch": 1.2100911854103344, + "grad_norm": 0.07346701164137073, + "learning_rate": 7.92527838630713e-06, + "loss": 0.588, + "step": 2490 + }, + { + "epoch": 1.2105775075987841, + "grad_norm": 0.0720930157034673, + "learning_rate": 7.923725386605617e-06, + "loss": 0.5407, + "step": 2491 + }, + { + "epoch": 1.211063829787234, + "grad_norm": 0.07560890263630829, + "learning_rate": 7.922171958172976e-06, + "loss": 0.5613, + "step": 2492 + }, + { + "epoch": 1.211550151975684, + "grad_norm": 0.07103108081036084, + "learning_rate": 7.920618101237001e-06, + "loss": 0.5719, + "step": 2493 + }, + { + "epoch": 1.2120364741641336, + "grad_norm": 0.07036796985177463, + "learning_rate": 7.919063816025547e-06, + "loss": 0.5442, + "step": 2494 + }, + { + "epoch": 1.2125227963525835, + "grad_norm": 0.07682791215301267, + "learning_rate": 7.917509102766535e-06, + "loss": 0.6024, + "step": 2495 + }, + { + "epoch": 1.2130091185410334, + "grad_norm": 0.0698638951979365, + "learning_rate": 7.915953961687942e-06, + "loss": 0.5412, + "step": 2496 + }, + { + "epoch": 1.2134954407294833, + "grad_norm": 0.07205491035653111, + "learning_rate": 7.914398393017812e-06, + "loss": 0.5612, + "step": 2497 + }, + { + "epoch": 1.2139817629179332, + "grad_norm": 0.07381386928626457, + "learning_rate": 7.912842396984256e-06, + "loss": 0.568, + "step": 2498 + }, + { + "epoch": 1.214468085106383, + "grad_norm": 0.07378946035796669, + "learning_rate": 7.911285973815437e-06, + "loss": 0.5599, + "step": 2499 + }, + { + "epoch": 1.2149544072948328, + "grad_norm": 0.07209503485870092, + "learning_rate": 7.90972912373959e-06, + "loss": 0.567, + "step": 2500 + }, + { + "epoch": 1.2154407294832827, + "grad_norm": 0.07304474231827386, + "learning_rate": 7.90817184698501e-06, + "loss": 0.5433, + "step": 2501 + }, + { + "epoch": 1.2159270516717324, + "grad_norm": 0.0737088694280214, + "learning_rate": 7.906614143780053e-06, + "loss": 0.5726, + "step": 2502 + }, + { + "epoch": 1.2164133738601823, + "grad_norm": 0.0726806891844952, + "learning_rate": 7.905056014353139e-06, + "loss": 0.5425, + "step": 2503 + }, + { + "epoch": 1.2168996960486322, + "grad_norm": 0.07055461388876123, + "learning_rate": 7.903497458932749e-06, + "loss": 0.5309, + "step": 2504 + }, + { + "epoch": 1.2173860182370821, + "grad_norm": 0.07089727983976464, + "learning_rate": 7.901938477747428e-06, + "loss": 0.541, + "step": 2505 + }, + { + "epoch": 1.2178723404255318, + "grad_norm": 0.07313922186526196, + "learning_rate": 7.900379071025783e-06, + "loss": 0.5774, + "step": 2506 + }, + { + "epoch": 1.2183586626139817, + "grad_norm": 0.07319577197309318, + "learning_rate": 7.898819238996484e-06, + "loss": 0.5606, + "step": 2507 + }, + { + "epoch": 1.2188449848024316, + "grad_norm": 0.07321982527566595, + "learning_rate": 7.897258981888261e-06, + "loss": 0.5421, + "step": 2508 + }, + { + "epoch": 1.2193313069908815, + "grad_norm": 0.07783215847254792, + "learning_rate": 7.895698299929909e-06, + "loss": 0.5763, + "step": 2509 + }, + { + "epoch": 1.2198176291793312, + "grad_norm": 0.07221243246103402, + "learning_rate": 7.894137193350284e-06, + "loss": 0.5462, + "step": 2510 + }, + { + "epoch": 1.2203039513677811, + "grad_norm": 0.07006969259138601, + "learning_rate": 7.892575662378306e-06, + "loss": 0.5438, + "step": 2511 + }, + { + "epoch": 1.220790273556231, + "grad_norm": 0.07150545694044028, + "learning_rate": 7.891013707242953e-06, + "loss": 0.517, + "step": 2512 + }, + { + "epoch": 1.2212765957446807, + "grad_norm": 0.07218882606700625, + "learning_rate": 7.88945132817327e-06, + "loss": 0.5365, + "step": 2513 + }, + { + "epoch": 1.2217629179331306, + "grad_norm": 0.07476785491518514, + "learning_rate": 7.887888525398362e-06, + "loss": 0.5429, + "step": 2514 + }, + { + "epoch": 1.2222492401215805, + "grad_norm": 0.07353948350864925, + "learning_rate": 7.886325299147394e-06, + "loss": 0.5549, + "step": 2515 + }, + { + "epoch": 1.2227355623100304, + "grad_norm": 0.07417135655598107, + "learning_rate": 7.8847616496496e-06, + "loss": 0.5999, + "step": 2516 + }, + { + "epoch": 1.2232218844984803, + "grad_norm": 0.07233066218840994, + "learning_rate": 7.883197577134267e-06, + "loss": 0.5698, + "step": 2517 + }, + { + "epoch": 1.22370820668693, + "grad_norm": 0.07085273845507489, + "learning_rate": 7.881633081830751e-06, + "loss": 0.5273, + "step": 2518 + }, + { + "epoch": 1.22419452887538, + "grad_norm": 0.07097689357999455, + "learning_rate": 7.880068163968467e-06, + "loss": 0.5643, + "step": 2519 + }, + { + "epoch": 1.2246808510638298, + "grad_norm": 0.07339634432025, + "learning_rate": 7.878502823776892e-06, + "loss": 0.5712, + "step": 2520 + }, + { + "epoch": 1.2251671732522795, + "grad_norm": 0.07539123662166866, + "learning_rate": 7.876937061485563e-06, + "loss": 0.5967, + "step": 2521 + }, + { + "epoch": 1.2256534954407294, + "grad_norm": 0.07139334714072847, + "learning_rate": 7.875370877324086e-06, + "loss": 0.5605, + "step": 2522 + }, + { + "epoch": 1.2261398176291793, + "grad_norm": 0.07813376555309574, + "learning_rate": 7.873804271522122e-06, + "loss": 0.5481, + "step": 2523 + }, + { + "epoch": 1.2266261398176292, + "grad_norm": 0.07152899008994364, + "learning_rate": 7.872237244309395e-06, + "loss": 0.568, + "step": 2524 + }, + { + "epoch": 1.2271124620060792, + "grad_norm": 0.07419754228040835, + "learning_rate": 7.870669795915692e-06, + "loss": 0.5456, + "step": 2525 + }, + { + "epoch": 1.2275987841945288, + "grad_norm": 0.07102799209861953, + "learning_rate": 7.869101926570864e-06, + "loss": 0.5572, + "step": 2526 + }, + { + "epoch": 1.2280851063829787, + "grad_norm": 0.0719424005075752, + "learning_rate": 7.867533636504818e-06, + "loss": 0.5326, + "step": 2527 + }, + { + "epoch": 1.2285714285714286, + "grad_norm": 0.07174277234768475, + "learning_rate": 7.865964925947526e-06, + "loss": 0.5407, + "step": 2528 + }, + { + "epoch": 1.2290577507598783, + "grad_norm": 0.07079182926445766, + "learning_rate": 7.864395795129025e-06, + "loss": 0.5221, + "step": 2529 + }, + { + "epoch": 1.2295440729483282, + "grad_norm": 0.07329877496339461, + "learning_rate": 7.862826244279406e-06, + "loss": 0.5573, + "step": 2530 + }, + { + "epoch": 1.2300303951367781, + "grad_norm": 0.07312771116259933, + "learning_rate": 7.86125627362883e-06, + "loss": 0.5633, + "step": 2531 + }, + { + "epoch": 1.230516717325228, + "grad_norm": 0.07131109345688764, + "learning_rate": 7.859685883407513e-06, + "loss": 0.5306, + "step": 2532 + }, + { + "epoch": 1.2310030395136777, + "grad_norm": 0.07531301797415688, + "learning_rate": 7.858115073845733e-06, + "loss": 0.5573, + "step": 2533 + }, + { + "epoch": 1.2314893617021276, + "grad_norm": 0.07141081234675177, + "learning_rate": 7.856543845173836e-06, + "loss": 0.5641, + "step": 2534 + }, + { + "epoch": 1.2319756838905775, + "grad_norm": 0.07323072910414083, + "learning_rate": 7.854972197622221e-06, + "loss": 0.5543, + "step": 2535 + }, + { + "epoch": 1.2324620060790275, + "grad_norm": 0.06860959496601561, + "learning_rate": 7.853400131421353e-06, + "loss": 0.5372, + "step": 2536 + }, + { + "epoch": 1.2329483282674771, + "grad_norm": 0.07285360241160728, + "learning_rate": 7.85182764680176e-06, + "loss": 0.5862, + "step": 2537 + }, + { + "epoch": 1.233434650455927, + "grad_norm": 0.07160352449303746, + "learning_rate": 7.850254743994026e-06, + "loss": 0.5564, + "step": 2538 + }, + { + "epoch": 1.233920972644377, + "grad_norm": 0.07488460756657045, + "learning_rate": 7.848681423228799e-06, + "loss": 0.5826, + "step": 2539 + }, + { + "epoch": 1.2344072948328266, + "grad_norm": 0.07029839485982914, + "learning_rate": 7.847107684736792e-06, + "loss": 0.5283, + "step": 2540 + }, + { + "epoch": 1.2348936170212765, + "grad_norm": 0.07618170457925849, + "learning_rate": 7.845533528748774e-06, + "loss": 0.545, + "step": 2541 + }, + { + "epoch": 1.2353799392097264, + "grad_norm": 0.07119841948715123, + "learning_rate": 7.843958955495579e-06, + "loss": 0.5221, + "step": 2542 + }, + { + "epoch": 1.2358662613981763, + "grad_norm": 0.07457691116419964, + "learning_rate": 7.842383965208095e-06, + "loss": 0.5499, + "step": 2543 + }, + { + "epoch": 1.2363525835866263, + "grad_norm": 0.07475554035386363, + "learning_rate": 7.840808558117281e-06, + "loss": 0.5988, + "step": 2544 + }, + { + "epoch": 1.236838905775076, + "grad_norm": 0.07600818264050219, + "learning_rate": 7.839232734454154e-06, + "loss": 0.623, + "step": 2545 + }, + { + "epoch": 1.2373252279635258, + "grad_norm": 0.07066056329461261, + "learning_rate": 7.837656494449785e-06, + "loss": 0.5365, + "step": 2546 + }, + { + "epoch": 1.2378115501519757, + "grad_norm": 0.06929162205657466, + "learning_rate": 7.836079838335317e-06, + "loss": 0.5378, + "step": 2547 + }, + { + "epoch": 1.2382978723404254, + "grad_norm": 0.07160701142561782, + "learning_rate": 7.834502766341944e-06, + "loss": 0.5392, + "step": 2548 + }, + { + "epoch": 1.2387841945288753, + "grad_norm": 0.07760038183075246, + "learning_rate": 7.83292527870093e-06, + "loss": 0.5454, + "step": 2549 + }, + { + "epoch": 1.2392705167173252, + "grad_norm": 0.07262611121677474, + "learning_rate": 7.831347375643594e-06, + "loss": 0.5338, + "step": 2550 + }, + { + "epoch": 1.2397568389057751, + "grad_norm": 0.07438800345414766, + "learning_rate": 7.829769057401316e-06, + "loss": 0.5276, + "step": 2551 + }, + { + "epoch": 1.2402431610942248, + "grad_norm": 0.07997838302733981, + "learning_rate": 7.828190324205542e-06, + "loss": 0.6197, + "step": 2552 + }, + { + "epoch": 1.2407294832826747, + "grad_norm": 0.09832363003269558, + "learning_rate": 7.826611176287772e-06, + "loss": 0.5797, + "step": 2553 + }, + { + "epoch": 1.2412158054711246, + "grad_norm": 0.07171182557266415, + "learning_rate": 7.825031613879572e-06, + "loss": 0.5561, + "step": 2554 + }, + { + "epoch": 1.2417021276595746, + "grad_norm": 0.0738645441554879, + "learning_rate": 7.823451637212564e-06, + "loss": 0.5824, + "step": 2555 + }, + { + "epoch": 1.2421884498480242, + "grad_norm": 0.0725554935230709, + "learning_rate": 7.821871246518437e-06, + "loss": 0.5284, + "step": 2556 + }, + { + "epoch": 1.2426747720364741, + "grad_norm": 0.07570249684648245, + "learning_rate": 7.820290442028937e-06, + "loss": 0.5723, + "step": 2557 + }, + { + "epoch": 1.243161094224924, + "grad_norm": 0.07630549253280923, + "learning_rate": 7.81870922397587e-06, + "loss": 0.5837, + "step": 2558 + }, + { + "epoch": 1.243647416413374, + "grad_norm": 0.08104742521616841, + "learning_rate": 7.817127592591105e-06, + "loss": 0.6158, + "step": 2559 + }, + { + "epoch": 1.2441337386018236, + "grad_norm": 0.07669906480136184, + "learning_rate": 7.815545548106567e-06, + "loss": 0.573, + "step": 2560 + }, + { + "epoch": 1.2446200607902735, + "grad_norm": 0.07180462704135149, + "learning_rate": 7.813963090754248e-06, + "loss": 0.5387, + "step": 2561 + }, + { + "epoch": 1.2451063829787234, + "grad_norm": 0.07095743048486752, + "learning_rate": 7.812380220766195e-06, + "loss": 0.5467, + "step": 2562 + }, + { + "epoch": 1.2455927051671734, + "grad_norm": 0.06935717459609982, + "learning_rate": 7.810796938374521e-06, + "loss": 0.5322, + "step": 2563 + }, + { + "epoch": 1.246079027355623, + "grad_norm": 0.07418734013166545, + "learning_rate": 7.809213243811394e-06, + "loss": 0.5531, + "step": 2564 + }, + { + "epoch": 1.246565349544073, + "grad_norm": 0.07526756646521683, + "learning_rate": 7.807629137309046e-06, + "loss": 0.5553, + "step": 2565 + }, + { + "epoch": 1.2470516717325228, + "grad_norm": 0.07588947083105696, + "learning_rate": 7.806044619099767e-06, + "loss": 0.5537, + "step": 2566 + }, + { + "epoch": 1.2475379939209725, + "grad_norm": 0.0776090176055357, + "learning_rate": 7.80445968941591e-06, + "loss": 0.5816, + "step": 2567 + }, + { + "epoch": 1.2480243161094224, + "grad_norm": 0.07796578492508335, + "learning_rate": 7.802874348489887e-06, + "loss": 0.5485, + "step": 2568 + }, + { + "epoch": 1.2485106382978723, + "grad_norm": 0.07499229875326857, + "learning_rate": 7.801288596554168e-06, + "loss": 0.5727, + "step": 2569 + }, + { + "epoch": 1.2489969604863222, + "grad_norm": 0.0703022327767209, + "learning_rate": 7.799702433841288e-06, + "loss": 0.5224, + "step": 2570 + }, + { + "epoch": 1.2489969604863222, + "eval_loss": 0.5824215412139893, + "eval_runtime": 105.1433, + "eval_samples_per_second": 288.682, + "eval_steps_per_second": 36.094, + "step": 2570 + }, + { + "epoch": 1.2494832826747722, + "grad_norm": 0.07358938619390225, + "learning_rate": 7.79811586058384e-06, + "loss": 0.5753, + "step": 2571 + }, + { + "epoch": 1.2499696048632218, + "grad_norm": 0.07051388390271865, + "learning_rate": 7.796528877014474e-06, + "loss": 0.4974, + "step": 2572 + }, + { + "epoch": 1.2504559270516717, + "grad_norm": 0.07452573103011828, + "learning_rate": 7.794941483365903e-06, + "loss": 0.5566, + "step": 2573 + }, + { + "epoch": 1.2509422492401217, + "grad_norm": 0.07810445928003949, + "learning_rate": 7.793353679870906e-06, + "loss": 0.5483, + "step": 2574 + }, + { + "epoch": 1.2514285714285713, + "grad_norm": 0.07174413784254638, + "learning_rate": 7.791765466762308e-06, + "loss": 0.5169, + "step": 2575 + }, + { + "epoch": 1.2519148936170212, + "grad_norm": 0.07524117931119266, + "learning_rate": 7.79017684427301e-06, + "loss": 0.5762, + "step": 2576 + }, + { + "epoch": 1.2524012158054711, + "grad_norm": 0.07356261552811819, + "learning_rate": 7.788587812635964e-06, + "loss": 0.5639, + "step": 2577 + }, + { + "epoch": 1.252887537993921, + "grad_norm": 0.07277492668062818, + "learning_rate": 7.786998372084179e-06, + "loss": 0.5437, + "step": 2578 + }, + { + "epoch": 1.253373860182371, + "grad_norm": 0.07728796632913545, + "learning_rate": 7.785408522850733e-06, + "loss": 0.5447, + "step": 2579 + }, + { + "epoch": 1.2538601823708206, + "grad_norm": 0.07466761671733702, + "learning_rate": 7.783818265168756e-06, + "loss": 0.562, + "step": 2580 + }, + { + "epoch": 1.2543465045592705, + "grad_norm": 0.07274026823554142, + "learning_rate": 7.782227599271443e-06, + "loss": 0.5604, + "step": 2581 + }, + { + "epoch": 1.2548328267477205, + "grad_norm": 0.07830210020324926, + "learning_rate": 7.780636525392047e-06, + "loss": 0.6249, + "step": 2582 + }, + { + "epoch": 1.2553191489361701, + "grad_norm": 0.08748456334804489, + "learning_rate": 7.779045043763883e-06, + "loss": 0.5472, + "step": 2583 + }, + { + "epoch": 1.25580547112462, + "grad_norm": 0.07567169382500258, + "learning_rate": 7.777453154620318e-06, + "loss": 0.5863, + "step": 2584 + }, + { + "epoch": 1.25629179331307, + "grad_norm": 0.07086785292878028, + "learning_rate": 7.775860858194788e-06, + "loss": 0.5273, + "step": 2585 + }, + { + "epoch": 1.2567781155015196, + "grad_norm": 0.07470599770095356, + "learning_rate": 7.774268154720788e-06, + "loss": 0.5204, + "step": 2586 + }, + { + "epoch": 1.2572644376899695, + "grad_norm": 0.07509453124864622, + "learning_rate": 7.772675044431865e-06, + "loss": 0.5657, + "step": 2587 + }, + { + "epoch": 1.2577507598784194, + "grad_norm": 0.07293588515574081, + "learning_rate": 7.771081527561632e-06, + "loss": 0.5569, + "step": 2588 + }, + { + "epoch": 1.2582370820668694, + "grad_norm": 0.0743274443805202, + "learning_rate": 7.769487604343761e-06, + "loss": 0.5694, + "step": 2589 + }, + { + "epoch": 1.2587234042553193, + "grad_norm": 0.07370388435728399, + "learning_rate": 7.767893275011986e-06, + "loss": 0.5552, + "step": 2590 + }, + { + "epoch": 1.259209726443769, + "grad_norm": 0.0717242308642081, + "learning_rate": 7.76629853980009e-06, + "loss": 0.5369, + "step": 2591 + }, + { + "epoch": 1.2596960486322188, + "grad_norm": 0.07726035737295897, + "learning_rate": 7.764703398941927e-06, + "loss": 0.6139, + "step": 2592 + }, + { + "epoch": 1.2601823708206688, + "grad_norm": 0.073874805303104, + "learning_rate": 7.763107852671406e-06, + "loss": 0.5318, + "step": 2593 + }, + { + "epoch": 1.2606686930091184, + "grad_norm": 0.07517378400057814, + "learning_rate": 7.761511901222495e-06, + "loss": 0.5554, + "step": 2594 + }, + { + "epoch": 1.2611550151975683, + "grad_norm": 0.0734981507620573, + "learning_rate": 7.759915544829225e-06, + "loss": 0.5767, + "step": 2595 + }, + { + "epoch": 1.2616413373860182, + "grad_norm": 0.07223905718186543, + "learning_rate": 7.758318783725678e-06, + "loss": 0.5605, + "step": 2596 + }, + { + "epoch": 1.2621276595744682, + "grad_norm": 0.07318658729171917, + "learning_rate": 7.756721618146007e-06, + "loss": 0.534, + "step": 2597 + }, + { + "epoch": 1.262613981762918, + "grad_norm": 0.07123303100648597, + "learning_rate": 7.755124048324416e-06, + "loss": 0.5681, + "step": 2598 + }, + { + "epoch": 1.2631003039513677, + "grad_norm": 0.07476412223009882, + "learning_rate": 7.753526074495168e-06, + "loss": 0.5959, + "step": 2599 + }, + { + "epoch": 1.2635866261398176, + "grad_norm": 0.07797201064958541, + "learning_rate": 7.75192769689259e-06, + "loss": 0.6223, + "step": 2600 + }, + { + "epoch": 1.2640729483282676, + "grad_norm": 0.07484586517357113, + "learning_rate": 7.750328915751064e-06, + "loss": 0.5424, + "step": 2601 + }, + { + "epoch": 1.2645592705167172, + "grad_norm": 0.07171490308238153, + "learning_rate": 7.748729731305036e-06, + "loss": 0.5351, + "step": 2602 + }, + { + "epoch": 1.2650455927051671, + "grad_norm": 0.07388897417089899, + "learning_rate": 7.747130143789006e-06, + "loss": 0.5539, + "step": 2603 + }, + { + "epoch": 1.265531914893617, + "grad_norm": 0.0727703100841655, + "learning_rate": 7.745530153437538e-06, + "loss": 0.5779, + "step": 2604 + }, + { + "epoch": 1.2660182370820667, + "grad_norm": 0.0725721800207721, + "learning_rate": 7.743929760485248e-06, + "loss": 0.5572, + "step": 2605 + }, + { + "epoch": 1.2665045592705166, + "grad_norm": 0.07560644933635846, + "learning_rate": 7.742328965166818e-06, + "loss": 0.5774, + "step": 2606 + }, + { + "epoch": 1.2669908814589665, + "grad_norm": 0.07170362081087035, + "learning_rate": 7.74072776771699e-06, + "loss": 0.5197, + "step": 2607 + }, + { + "epoch": 1.2674772036474165, + "grad_norm": 0.07697544950202906, + "learning_rate": 7.739126168370554e-06, + "loss": 0.551, + "step": 2608 + }, + { + "epoch": 1.2679635258358664, + "grad_norm": 0.07172692702726284, + "learning_rate": 7.737524167362373e-06, + "loss": 0.5228, + "step": 2609 + }, + { + "epoch": 1.268449848024316, + "grad_norm": 0.07120482949202118, + "learning_rate": 7.73592176492736e-06, + "loss": 0.5699, + "step": 2610 + }, + { + "epoch": 1.268936170212766, + "grad_norm": 0.07344462909185348, + "learning_rate": 7.734318961300484e-06, + "loss": 0.531, + "step": 2611 + }, + { + "epoch": 1.2694224924012159, + "grad_norm": 0.07326625012114829, + "learning_rate": 7.732715756716786e-06, + "loss": 0.539, + "step": 2612 + }, + { + "epoch": 1.2699088145896655, + "grad_norm": 0.07256093941774487, + "learning_rate": 7.731112151411355e-06, + "loss": 0.537, + "step": 2613 + }, + { + "epoch": 1.2703951367781154, + "grad_norm": 0.0774795879946063, + "learning_rate": 7.729508145619339e-06, + "loss": 0.5101, + "step": 2614 + }, + { + "epoch": 1.2708814589665653, + "grad_norm": 0.07533263430429454, + "learning_rate": 7.72790373957595e-06, + "loss": 0.6075, + "step": 2615 + }, + { + "epoch": 1.2713677811550153, + "grad_norm": 0.07272391406574882, + "learning_rate": 7.726298933516453e-06, + "loss": 0.5445, + "step": 2616 + }, + { + "epoch": 1.2718541033434652, + "grad_norm": 0.07405978547986122, + "learning_rate": 7.724693727676181e-06, + "loss": 0.5636, + "step": 2617 + }, + { + "epoch": 1.2723404255319148, + "grad_norm": 0.07398454840100682, + "learning_rate": 7.72308812229051e-06, + "loss": 0.573, + "step": 2618 + }, + { + "epoch": 1.2728267477203647, + "grad_norm": 0.07407253394234505, + "learning_rate": 7.721482117594891e-06, + "loss": 0.5389, + "step": 2619 + }, + { + "epoch": 1.2733130699088147, + "grad_norm": 0.07386724668611457, + "learning_rate": 7.719875713824824e-06, + "loss": 0.5318, + "step": 2620 + }, + { + "epoch": 1.2737993920972643, + "grad_norm": 0.07387320747550771, + "learning_rate": 7.718268911215869e-06, + "loss": 0.5367, + "step": 2621 + }, + { + "epoch": 1.2742857142857142, + "grad_norm": 0.06914403426406225, + "learning_rate": 7.716661710003647e-06, + "loss": 0.5415, + "step": 2622 + }, + { + "epoch": 1.2747720364741641, + "grad_norm": 0.07069904237227867, + "learning_rate": 7.715054110423834e-06, + "loss": 0.5448, + "step": 2623 + }, + { + "epoch": 1.275258358662614, + "grad_norm": 0.0798494750730378, + "learning_rate": 7.71344611271217e-06, + "loss": 0.5648, + "step": 2624 + }, + { + "epoch": 1.275744680851064, + "grad_norm": 0.07324639351911368, + "learning_rate": 7.711837717104442e-06, + "loss": 0.5543, + "step": 2625 + }, + { + "epoch": 1.2762310030395136, + "grad_norm": 0.07240246960603337, + "learning_rate": 7.71022892383651e-06, + "loss": 0.5852, + "step": 2626 + }, + { + "epoch": 1.2767173252279636, + "grad_norm": 0.07512491596878752, + "learning_rate": 7.708619733144285e-06, + "loss": 0.5796, + "step": 2627 + }, + { + "epoch": 1.2772036474164135, + "grad_norm": 0.07294552336685806, + "learning_rate": 7.707010145263733e-06, + "loss": 0.5607, + "step": 2628 + }, + { + "epoch": 1.2776899696048631, + "grad_norm": 0.07344589203686383, + "learning_rate": 7.705400160430882e-06, + "loss": 0.567, + "step": 2629 + }, + { + "epoch": 1.278176291793313, + "grad_norm": 0.07631165533278293, + "learning_rate": 7.70378977888182e-06, + "loss": 0.5903, + "step": 2630 + }, + { + "epoch": 1.278662613981763, + "grad_norm": 0.0701379290634204, + "learning_rate": 7.702179000852693e-06, + "loss": 0.5155, + "step": 2631 + }, + { + "epoch": 1.2791489361702126, + "grad_norm": 0.07329540652056298, + "learning_rate": 7.700567826579697e-06, + "loss": 0.5636, + "step": 2632 + }, + { + "epoch": 1.2796352583586625, + "grad_norm": 0.07321077775536616, + "learning_rate": 7.698956256299098e-06, + "loss": 0.5407, + "step": 2633 + }, + { + "epoch": 1.2801215805471124, + "grad_norm": 0.07325217104749837, + "learning_rate": 7.697344290247214e-06, + "loss": 0.5537, + "step": 2634 + }, + { + "epoch": 1.2806079027355624, + "grad_norm": 0.07416569074271362, + "learning_rate": 7.69573192866042e-06, + "loss": 0.5929, + "step": 2635 + }, + { + "epoch": 1.2810942249240123, + "grad_norm": 0.07557838205853717, + "learning_rate": 7.694119171775148e-06, + "loss": 0.5947, + "step": 2636 + }, + { + "epoch": 1.281580547112462, + "grad_norm": 0.07584617624367863, + "learning_rate": 7.692506019827894e-06, + "loss": 0.5836, + "step": 2637 + }, + { + "epoch": 1.2820668693009118, + "grad_norm": 0.07428891374469863, + "learning_rate": 7.69089247305521e-06, + "loss": 0.5725, + "step": 2638 + }, + { + "epoch": 1.2825531914893618, + "grad_norm": 0.06932411279762006, + "learning_rate": 7.689278531693698e-06, + "loss": 0.5279, + "step": 2639 + }, + { + "epoch": 1.2830395136778114, + "grad_norm": 0.07207574358356089, + "learning_rate": 7.687664195980031e-06, + "loss": 0.5331, + "step": 2640 + }, + { + "epoch": 1.2835258358662613, + "grad_norm": 0.07266605096940389, + "learning_rate": 7.686049466150931e-06, + "loss": 0.5453, + "step": 2641 + }, + { + "epoch": 1.2840121580547113, + "grad_norm": 0.07152417217234301, + "learning_rate": 7.684434342443176e-06, + "loss": 0.5281, + "step": 2642 + }, + { + "epoch": 1.2844984802431612, + "grad_norm": 0.07088887399366368, + "learning_rate": 7.682818825093613e-06, + "loss": 0.5648, + "step": 2643 + }, + { + "epoch": 1.284984802431611, + "grad_norm": 0.07492941461371523, + "learning_rate": 7.68120291433913e-06, + "loss": 0.586, + "step": 2644 + }, + { + "epoch": 1.2854711246200607, + "grad_norm": 0.07400338823111081, + "learning_rate": 7.679586610416689e-06, + "loss": 0.5428, + "step": 2645 + }, + { + "epoch": 1.2859574468085107, + "grad_norm": 0.07067097572968692, + "learning_rate": 7.6779699135633e-06, + "loss": 0.5302, + "step": 2646 + }, + { + "epoch": 1.2864437689969606, + "grad_norm": 0.07073207345566752, + "learning_rate": 7.676352824016032e-06, + "loss": 0.5587, + "step": 2647 + }, + { + "epoch": 1.2869300911854102, + "grad_norm": 0.06920525666732837, + "learning_rate": 7.674735342012014e-06, + "loss": 0.512, + "step": 2648 + }, + { + "epoch": 1.2874164133738601, + "grad_norm": 0.07382544918151622, + "learning_rate": 7.673117467788435e-06, + "loss": 0.553, + "step": 2649 + }, + { + "epoch": 1.28790273556231, + "grad_norm": 0.07316121970231027, + "learning_rate": 7.671499201582533e-06, + "loss": 0.5777, + "step": 2650 + }, + { + "epoch": 1.28838905775076, + "grad_norm": 0.07180229251100594, + "learning_rate": 7.66988054363161e-06, + "loss": 0.5309, + "step": 2651 + }, + { + "epoch": 1.2888753799392099, + "grad_norm": 0.07453064513266427, + "learning_rate": 7.668261494173024e-06, + "loss": 0.5544, + "step": 2652 + }, + { + "epoch": 1.2893617021276595, + "grad_norm": 0.07314819667687368, + "learning_rate": 7.66664205344419e-06, + "loss": 0.5471, + "step": 2653 + }, + { + "epoch": 1.2898480243161095, + "grad_norm": 0.06843732806395814, + "learning_rate": 7.665022221682578e-06, + "loss": 0.52, + "step": 2654 + }, + { + "epoch": 1.2903343465045594, + "grad_norm": 0.07496065238433991, + "learning_rate": 7.663401999125724e-06, + "loss": 0.5642, + "step": 2655 + }, + { + "epoch": 1.290820668693009, + "grad_norm": 0.07207211095392176, + "learning_rate": 7.661781386011211e-06, + "loss": 0.5649, + "step": 2656 + }, + { + "epoch": 1.291306990881459, + "grad_norm": 0.07444921741384546, + "learning_rate": 7.660160382576683e-06, + "loss": 0.5417, + "step": 2657 + }, + { + "epoch": 1.2917933130699089, + "grad_norm": 0.07403637105177087, + "learning_rate": 7.658538989059846e-06, + "loss": 0.5698, + "step": 2658 + }, + { + "epoch": 1.2922796352583585, + "grad_norm": 0.07318493121602063, + "learning_rate": 7.656917205698452e-06, + "loss": 0.5437, + "step": 2659 + }, + { + "epoch": 1.2927659574468084, + "grad_norm": 0.0726427717659996, + "learning_rate": 7.655295032730323e-06, + "loss": 0.6179, + "step": 2660 + }, + { + "epoch": 1.2932522796352584, + "grad_norm": 0.07307210350273614, + "learning_rate": 7.65367247039333e-06, + "loss": 0.5628, + "step": 2661 + }, + { + "epoch": 1.2937386018237083, + "grad_norm": 0.073894520521864, + "learning_rate": 7.652049518925404e-06, + "loss": 0.5646, + "step": 2662 + }, + { + "epoch": 1.2942249240121582, + "grad_norm": 0.07086921025174431, + "learning_rate": 7.650426178564532e-06, + "loss": 0.5149, + "step": 2663 + }, + { + "epoch": 1.2947112462006078, + "grad_norm": 0.06973234170335904, + "learning_rate": 7.648802449548758e-06, + "loss": 0.5202, + "step": 2664 + }, + { + "epoch": 1.2951975683890578, + "grad_norm": 0.07339648183617803, + "learning_rate": 7.647178332116186e-06, + "loss": 0.5464, + "step": 2665 + }, + { + "epoch": 1.2956838905775077, + "grad_norm": 0.074870667285401, + "learning_rate": 7.64555382650497e-06, + "loss": 0.5813, + "step": 2666 + }, + { + "epoch": 1.2961702127659573, + "grad_norm": 0.07267891042846718, + "learning_rate": 7.643928932953328e-06, + "loss": 0.5467, + "step": 2667 + }, + { + "epoch": 1.2966565349544072, + "grad_norm": 0.07040101068996828, + "learning_rate": 7.642303651699533e-06, + "loss": 0.5588, + "step": 2668 + }, + { + "epoch": 1.2971428571428572, + "grad_norm": 0.07369027735012644, + "learning_rate": 7.64067798298191e-06, + "loss": 0.5715, + "step": 2669 + }, + { + "epoch": 1.297629179331307, + "grad_norm": 0.07648028830088888, + "learning_rate": 7.63905192703885e-06, + "loss": 0.575, + "step": 2670 + }, + { + "epoch": 1.298115501519757, + "grad_norm": 0.07169219565123473, + "learning_rate": 7.637425484108793e-06, + "loss": 0.563, + "step": 2671 + }, + { + "epoch": 1.2986018237082066, + "grad_norm": 0.07685159389997796, + "learning_rate": 7.635798654430237e-06, + "loss": 0.5745, + "step": 2672 + }, + { + "epoch": 1.2990881458966566, + "grad_norm": 0.07328231737117846, + "learning_rate": 7.634171438241745e-06, + "loss": 0.5446, + "step": 2673 + }, + { + "epoch": 1.2995744680851065, + "grad_norm": 0.07154640234028622, + "learning_rate": 7.63254383578192e-06, + "loss": 0.5483, + "step": 2674 + }, + { + "epoch": 1.3000607902735561, + "grad_norm": 0.0744325457954457, + "learning_rate": 7.630915847289435e-06, + "loss": 0.5324, + "step": 2675 + }, + { + "epoch": 1.300547112462006, + "grad_norm": 0.0738984084935473, + "learning_rate": 7.629287473003019e-06, + "loss": 0.5716, + "step": 2676 + }, + { + "epoch": 1.301033434650456, + "grad_norm": 0.06985192839078329, + "learning_rate": 7.627658713161453e-06, + "loss": 0.5419, + "step": 2677 + }, + { + "epoch": 1.3015197568389056, + "grad_norm": 0.07328820265870761, + "learning_rate": 7.626029568003575e-06, + "loss": 0.5348, + "step": 2678 + }, + { + "epoch": 1.3020060790273555, + "grad_norm": 0.07498915909201755, + "learning_rate": 7.624400037768283e-06, + "loss": 0.5123, + "step": 2679 + }, + { + "epoch": 1.3024924012158055, + "grad_norm": 0.07561418122831266, + "learning_rate": 7.622770122694526e-06, + "loss": 0.5862, + "step": 2680 + }, + { + "epoch": 1.3029787234042554, + "grad_norm": 0.07017352174027704, + "learning_rate": 7.6211398230213155e-06, + "loss": 0.5585, + "step": 2681 + }, + { + "epoch": 1.3034650455927053, + "grad_norm": 0.07066548657233461, + "learning_rate": 7.619509138987713e-06, + "loss": 0.5586, + "step": 2682 + }, + { + "epoch": 1.303951367781155, + "grad_norm": 0.07537297276629902, + "learning_rate": 7.617878070832842e-06, + "loss": 0.5355, + "step": 2683 + }, + { + "epoch": 1.3044376899696049, + "grad_norm": 0.07105726117785882, + "learning_rate": 7.616246618795879e-06, + "loss": 0.5416, + "step": 2684 + }, + { + "epoch": 1.3049240121580548, + "grad_norm": 0.06872990101322848, + "learning_rate": 7.614614783116061e-06, + "loss": 0.5456, + "step": 2685 + }, + { + "epoch": 1.3054103343465044, + "grad_norm": 0.0729629812521218, + "learning_rate": 7.612982564032675e-06, + "loss": 0.5789, + "step": 2686 + }, + { + "epoch": 1.3058966565349543, + "grad_norm": 0.07083687337857913, + "learning_rate": 7.61134996178507e-06, + "loss": 0.5395, + "step": 2687 + }, + { + "epoch": 1.3063829787234043, + "grad_norm": 0.0720236302006423, + "learning_rate": 7.6097169766126445e-06, + "loss": 0.5708, + "step": 2688 + }, + { + "epoch": 1.3068693009118542, + "grad_norm": 0.074979362657179, + "learning_rate": 7.608083608754861e-06, + "loss": 0.5485, + "step": 2689 + }, + { + "epoch": 1.307355623100304, + "grad_norm": 0.07442099562708793, + "learning_rate": 7.606449858451232e-06, + "loss": 0.5539, + "step": 2690 + }, + { + "epoch": 1.3078419452887537, + "grad_norm": 0.0730207405677716, + "learning_rate": 7.60481572594133e-06, + "loss": 0.5502, + "step": 2691 + }, + { + "epoch": 1.3083282674772037, + "grad_norm": 0.07272502430633573, + "learning_rate": 7.603181211464783e-06, + "loss": 0.5454, + "step": 2692 + }, + { + "epoch": 1.3088145896656536, + "grad_norm": 0.07197664749840792, + "learning_rate": 7.60154631526127e-06, + "loss": 0.5387, + "step": 2693 + }, + { + "epoch": 1.3093009118541032, + "grad_norm": 0.07432887890012968, + "learning_rate": 7.599911037570533e-06, + "loss": 0.5526, + "step": 2694 + }, + { + "epoch": 1.3097872340425532, + "grad_norm": 0.07169348023536669, + "learning_rate": 7.598275378632367e-06, + "loss": 0.5329, + "step": 2695 + }, + { + "epoch": 1.310273556231003, + "grad_norm": 0.07174243799509789, + "learning_rate": 7.596639338686622e-06, + "loss": 0.5537, + "step": 2696 + }, + { + "epoch": 1.310759878419453, + "grad_norm": 0.07353367460780375, + "learning_rate": 7.595002917973204e-06, + "loss": 0.5635, + "step": 2697 + }, + { + "epoch": 1.3112462006079029, + "grad_norm": 0.07512562945371454, + "learning_rate": 7.593366116732077e-06, + "loss": 0.5717, + "step": 2698 + }, + { + "epoch": 1.3117325227963526, + "grad_norm": 0.07244172271725031, + "learning_rate": 7.59172893520326e-06, + "loss": 0.5296, + "step": 2699 + }, + { + "epoch": 1.3122188449848025, + "grad_norm": 0.06860621533762223, + "learning_rate": 7.590091373626823e-06, + "loss": 0.5024, + "step": 2700 + }, + { + "epoch": 1.3127051671732524, + "grad_norm": 0.0754041820927164, + "learning_rate": 7.588453432242899e-06, + "loss": 0.5705, + "step": 2701 + }, + { + "epoch": 1.313191489361702, + "grad_norm": 0.07584182472745714, + "learning_rate": 7.586815111291674e-06, + "loss": 0.5532, + "step": 2702 + }, + { + "epoch": 1.313677811550152, + "grad_norm": 0.07811875447392391, + "learning_rate": 7.585176411013389e-06, + "loss": 0.5636, + "step": 2703 + }, + { + "epoch": 1.3141641337386019, + "grad_norm": 0.07434695285341286, + "learning_rate": 7.583537331648339e-06, + "loss": 0.5621, + "step": 2704 + }, + { + "epoch": 1.3146504559270515, + "grad_norm": 0.07161432299560656, + "learning_rate": 7.581897873436876e-06, + "loss": 0.5546, + "step": 2705 + }, + { + "epoch": 1.3151367781155014, + "grad_norm": 0.07083763721926453, + "learning_rate": 7.58025803661941e-06, + "loss": 0.5352, + "step": 2706 + }, + { + "epoch": 1.3156231003039514, + "grad_norm": 0.07263295184757475, + "learning_rate": 7.578617821436405e-06, + "loss": 0.5769, + "step": 2707 + }, + { + "epoch": 1.3161094224924013, + "grad_norm": 0.07402261218645703, + "learning_rate": 7.576977228128377e-06, + "loss": 0.5368, + "step": 2708 + }, + { + "epoch": 1.3165957446808512, + "grad_norm": 0.07496631978430902, + "learning_rate": 7.575336256935902e-06, + "loss": 0.5596, + "step": 2709 + }, + { + "epoch": 1.3170820668693008, + "grad_norm": 0.07125406556075664, + "learning_rate": 7.573694908099612e-06, + "loss": 0.5617, + "step": 2710 + }, + { + "epoch": 1.3175683890577508, + "grad_norm": 0.07081462316054976, + "learning_rate": 7.5720531818601876e-06, + "loss": 0.5797, + "step": 2711 + }, + { + "epoch": 1.3180547112462007, + "grad_norm": 0.07141198018795063, + "learning_rate": 7.570411078458373e-06, + "loss": 0.5678, + "step": 2712 + }, + { + "epoch": 1.3185410334346503, + "grad_norm": 0.07134062044168017, + "learning_rate": 7.568768598134961e-06, + "loss": 0.5382, + "step": 2713 + }, + { + "epoch": 1.3190273556231003, + "grad_norm": 0.07744804283935018, + "learning_rate": 7.567125741130806e-06, + "loss": 0.5657, + "step": 2714 + }, + { + "epoch": 1.3195136778115502, + "grad_norm": 0.07560759785152554, + "learning_rate": 7.5654825076868124e-06, + "loss": 0.575, + "step": 2715 + }, + { + "epoch": 1.32, + "grad_norm": 0.0723428554303582, + "learning_rate": 7.563838898043942e-06, + "loss": 0.5621, + "step": 2716 + }, + { + "epoch": 1.32048632218845, + "grad_norm": 0.07282727632474047, + "learning_rate": 7.56219491244321e-06, + "loss": 0.5428, + "step": 2717 + }, + { + "epoch": 1.3209726443768997, + "grad_norm": 0.0766516296440535, + "learning_rate": 7.560550551125691e-06, + "loss": 0.5596, + "step": 2718 + }, + { + "epoch": 1.3214589665653496, + "grad_norm": 0.07223833061979915, + "learning_rate": 7.558905814332514e-06, + "loss": 0.5348, + "step": 2719 + }, + { + "epoch": 1.3219452887537995, + "grad_norm": 0.07229167302652503, + "learning_rate": 7.557260702304853e-06, + "loss": 0.5507, + "step": 2720 + }, + { + "epoch": 1.3224316109422491, + "grad_norm": 0.07201860164448105, + "learning_rate": 7.555615215283952e-06, + "loss": 0.5723, + "step": 2721 + }, + { + "epoch": 1.322917933130699, + "grad_norm": 0.07294388551249625, + "learning_rate": 7.553969353511099e-06, + "loss": 0.5507, + "step": 2722 + }, + { + "epoch": 1.323404255319149, + "grad_norm": 0.0748079880187394, + "learning_rate": 7.552323117227642e-06, + "loss": 0.5641, + "step": 2723 + }, + { + "epoch": 1.3238905775075989, + "grad_norm": 0.07573816637056172, + "learning_rate": 7.550676506674986e-06, + "loss": 0.5767, + "step": 2724 + }, + { + "epoch": 1.3243768996960488, + "grad_norm": 0.07506100403603923, + "learning_rate": 7.549029522094583e-06, + "loss": 0.5746, + "step": 2725 + }, + { + "epoch": 1.3248632218844985, + "grad_norm": 0.07100833882128814, + "learning_rate": 7.547382163727949e-06, + "loss": 0.53, + "step": 2726 + }, + { + "epoch": 1.3253495440729484, + "grad_norm": 0.071307822896735, + "learning_rate": 7.545734431816647e-06, + "loss": 0.5378, + "step": 2727 + }, + { + "epoch": 1.3258358662613983, + "grad_norm": 0.07369445843652978, + "learning_rate": 7.544086326602298e-06, + "loss": 0.5447, + "step": 2728 + }, + { + "epoch": 1.326322188449848, + "grad_norm": 0.06875727544382673, + "learning_rate": 7.5424378483265795e-06, + "loss": 0.5262, + "step": 2729 + }, + { + "epoch": 1.3268085106382979, + "grad_norm": 0.06877588729134668, + "learning_rate": 7.5407889972312236e-06, + "loss": 0.5268, + "step": 2730 + }, + { + "epoch": 1.3272948328267478, + "grad_norm": 0.07596685692231431, + "learning_rate": 7.5391397735580115e-06, + "loss": 0.5296, + "step": 2731 + }, + { + "epoch": 1.3277811550151974, + "grad_norm": 0.07100340791035951, + "learning_rate": 7.537490177548787e-06, + "loss": 0.5425, + "step": 2732 + }, + { + "epoch": 1.3282674772036474, + "grad_norm": 0.07347219706544224, + "learning_rate": 7.535840209445444e-06, + "loss": 0.5817, + "step": 2733 + }, + { + "epoch": 1.3287537993920973, + "grad_norm": 0.07269969932284565, + "learning_rate": 7.53418986948993e-06, + "loss": 0.5657, + "step": 2734 + }, + { + "epoch": 1.3292401215805472, + "grad_norm": 0.07108917353295441, + "learning_rate": 7.5325391579242476e-06, + "loss": 0.5809, + "step": 2735 + }, + { + "epoch": 1.329726443768997, + "grad_norm": 0.0756803482113215, + "learning_rate": 7.5308880749904576e-06, + "loss": 0.5183, + "step": 2736 + }, + { + "epoch": 1.3302127659574468, + "grad_norm": 0.0713285105015546, + "learning_rate": 7.529236620930671e-06, + "loss": 0.5425, + "step": 2737 + }, + { + "epoch": 1.3306990881458967, + "grad_norm": 0.07373404240829051, + "learning_rate": 7.527584795987057e-06, + "loss": 0.5569, + "step": 2738 + }, + { + "epoch": 1.3311854103343466, + "grad_norm": 0.0767723705422561, + "learning_rate": 7.525932600401833e-06, + "loss": 0.5716, + "step": 2739 + }, + { + "epoch": 1.3316717325227962, + "grad_norm": 0.07091844335228077, + "learning_rate": 7.524280034417278e-06, + "loss": 0.5115, + "step": 2740 + }, + { + "epoch": 1.3321580547112462, + "grad_norm": 0.07273466135943404, + "learning_rate": 7.522627098275723e-06, + "loss": 0.5515, + "step": 2741 + }, + { + "epoch": 1.332644376899696, + "grad_norm": 0.07499768447130718, + "learning_rate": 7.520973792219548e-06, + "loss": 0.5206, + "step": 2742 + }, + { + "epoch": 1.333130699088146, + "grad_norm": 0.07515786948003962, + "learning_rate": 7.519320116491195e-06, + "loss": 0.5816, + "step": 2743 + }, + { + "epoch": 1.3336170212765959, + "grad_norm": 0.07270801398997014, + "learning_rate": 7.517666071333155e-06, + "loss": 0.5637, + "step": 2744 + }, + { + "epoch": 1.3341033434650456, + "grad_norm": 0.07474638177335434, + "learning_rate": 7.516011656987976e-06, + "loss": 0.5458, + "step": 2745 + }, + { + "epoch": 1.3345896656534955, + "grad_norm": 0.07479027885327674, + "learning_rate": 7.5143568736982585e-06, + "loss": 0.5724, + "step": 2746 + }, + { + "epoch": 1.3350759878419454, + "grad_norm": 0.07024389883300447, + "learning_rate": 7.512701721706659e-06, + "loss": 0.5381, + "step": 2747 + }, + { + "epoch": 1.335562310030395, + "grad_norm": 0.07414204692333647, + "learning_rate": 7.5110462012558835e-06, + "loss": 0.5745, + "step": 2748 + }, + { + "epoch": 1.336048632218845, + "grad_norm": 0.07695660208068462, + "learning_rate": 7.509390312588699e-06, + "loss": 0.5749, + "step": 2749 + }, + { + "epoch": 1.3365349544072949, + "grad_norm": 0.0732252332053761, + "learning_rate": 7.50773405594792e-06, + "loss": 0.5229, + "step": 2750 + }, + { + "epoch": 1.3370212765957445, + "grad_norm": 0.07068423805726977, + "learning_rate": 7.5060774315764195e-06, + "loss": 0.5703, + "step": 2751 + }, + { + "epoch": 1.3375075987841945, + "grad_norm": 0.07380370967009903, + "learning_rate": 7.5044204397171225e-06, + "loss": 0.5284, + "step": 2752 + }, + { + "epoch": 1.3379939209726444, + "grad_norm": 0.07702725696772723, + "learning_rate": 7.502763080613008e-06, + "loss": 0.6185, + "step": 2753 + }, + { + "epoch": 1.3384802431610943, + "grad_norm": 0.07265422851999226, + "learning_rate": 7.501105354507107e-06, + "loss": 0.5503, + "step": 2754 + }, + { + "epoch": 1.3389665653495442, + "grad_norm": 0.07361567032933555, + "learning_rate": 7.499447261642509e-06, + "loss": 0.5178, + "step": 2755 + }, + { + "epoch": 1.3394528875379939, + "grad_norm": 0.07129462515465361, + "learning_rate": 7.497788802262353e-06, + "loss": 0.5452, + "step": 2756 + }, + { + "epoch": 1.3399392097264438, + "grad_norm": 0.07064482130612999, + "learning_rate": 7.496129976609833e-06, + "loss": 0.5532, + "step": 2757 + }, + { + "epoch": 1.3404255319148937, + "grad_norm": 0.07292646659664338, + "learning_rate": 7.494470784928197e-06, + "loss": 0.5813, + "step": 2758 + }, + { + "epoch": 1.3409118541033433, + "grad_norm": 0.0733806862580877, + "learning_rate": 7.492811227460748e-06, + "loss": 0.5639, + "step": 2759 + }, + { + "epoch": 1.3413981762917933, + "grad_norm": 0.07500971335263869, + "learning_rate": 7.491151304450839e-06, + "loss": 0.5391, + "step": 2760 + }, + { + "epoch": 1.3418844984802432, + "grad_norm": 0.07343777062528506, + "learning_rate": 7.489491016141881e-06, + "loss": 0.5908, + "step": 2761 + }, + { + "epoch": 1.342370820668693, + "grad_norm": 0.07680005433648988, + "learning_rate": 7.487830362777335e-06, + "loss": 0.5437, + "step": 2762 + }, + { + "epoch": 1.342857142857143, + "grad_norm": 0.07642924520386632, + "learning_rate": 7.486169344600718e-06, + "loss": 0.5691, + "step": 2763 + }, + { + "epoch": 1.3433434650455927, + "grad_norm": 0.07235966130217936, + "learning_rate": 7.484507961855599e-06, + "loss": 0.5683, + "step": 2764 + }, + { + "epoch": 1.3438297872340426, + "grad_norm": 0.07154928798615308, + "learning_rate": 7.482846214785602e-06, + "loss": 0.5549, + "step": 2765 + }, + { + "epoch": 1.3443161094224925, + "grad_norm": 0.0746256748865418, + "learning_rate": 7.481184103634399e-06, + "loss": 0.5874, + "step": 2766 + }, + { + "epoch": 1.3448024316109422, + "grad_norm": 0.07636614904951292, + "learning_rate": 7.479521628645725e-06, + "loss": 0.5746, + "step": 2767 + }, + { + "epoch": 1.345288753799392, + "grad_norm": 0.06935711963650168, + "learning_rate": 7.47785879006336e-06, + "loss": 0.5186, + "step": 2768 + }, + { + "epoch": 1.345775075987842, + "grad_norm": 0.06918929230542553, + "learning_rate": 7.476195588131142e-06, + "loss": 0.5128, + "step": 2769 + }, + { + "epoch": 1.3462613981762919, + "grad_norm": 0.07343596859446479, + "learning_rate": 7.474532023092961e-06, + "loss": 0.5576, + "step": 2770 + }, + { + "epoch": 1.3467477203647418, + "grad_norm": 0.07621140587989529, + "learning_rate": 7.472868095192758e-06, + "loss": 0.566, + "step": 2771 + }, + { + "epoch": 1.3472340425531915, + "grad_norm": 0.07480282113210489, + "learning_rate": 7.471203804674531e-06, + "loss": 0.5404, + "step": 2772 + }, + { + "epoch": 1.3477203647416414, + "grad_norm": 0.0767269627779973, + "learning_rate": 7.469539151782328e-06, + "loss": 0.5625, + "step": 2773 + }, + { + "epoch": 1.3482066869300913, + "grad_norm": 0.07135737087821863, + "learning_rate": 7.467874136760251e-06, + "loss": 0.5096, + "step": 2774 + }, + { + "epoch": 1.348693009118541, + "grad_norm": 0.07067221291446979, + "learning_rate": 7.4662087598524555e-06, + "loss": 0.5166, + "step": 2775 + }, + { + "epoch": 1.3491793313069909, + "grad_norm": 0.07194599554732935, + "learning_rate": 7.464543021303153e-06, + "loss": 0.5348, + "step": 2776 + }, + { + "epoch": 1.3496656534954408, + "grad_norm": 0.11722754992133265, + "learning_rate": 7.462876921356602e-06, + "loss": 0.5661, + "step": 2777 + }, + { + "epoch": 1.3501519756838904, + "grad_norm": 0.07924318696350909, + "learning_rate": 7.46121046025712e-06, + "loss": 0.5812, + "step": 2778 + }, + { + "epoch": 1.3506382978723404, + "grad_norm": 0.072894839745393, + "learning_rate": 7.459543638249071e-06, + "loss": 0.5802, + "step": 2779 + }, + { + "epoch": 1.3511246200607903, + "grad_norm": 0.07079268796791323, + "learning_rate": 7.457876455576879e-06, + "loss": 0.5204, + "step": 2780 + }, + { + "epoch": 1.3516109422492402, + "grad_norm": 0.06954499029261359, + "learning_rate": 7.456208912485015e-06, + "loss": 0.5533, + "step": 2781 + }, + { + "epoch": 1.35209726443769, + "grad_norm": 0.07180234178730577, + "learning_rate": 7.454541009218006e-06, + "loss": 0.5444, + "step": 2782 + }, + { + "epoch": 1.3525835866261398, + "grad_norm": 0.0750973886651206, + "learning_rate": 7.4528727460204316e-06, + "loss": 0.5503, + "step": 2783 + }, + { + "epoch": 1.3530699088145897, + "grad_norm": 0.07052402629173818, + "learning_rate": 7.451204123136923e-06, + "loss": 0.5269, + "step": 2784 + }, + { + "epoch": 1.3535562310030396, + "grad_norm": 0.07312009233445739, + "learning_rate": 7.449535140812164e-06, + "loss": 0.5661, + "step": 2785 + }, + { + "epoch": 1.3540425531914893, + "grad_norm": 0.07308544634827979, + "learning_rate": 7.447865799290894e-06, + "loss": 0.5615, + "step": 2786 + }, + { + "epoch": 1.3545288753799392, + "grad_norm": 0.07383957112236195, + "learning_rate": 7.446196098817903e-06, + "loss": 0.5618, + "step": 2787 + }, + { + "epoch": 1.355015197568389, + "grad_norm": 0.06910668368195767, + "learning_rate": 7.4445260396380315e-06, + "loss": 0.5271, + "step": 2788 + }, + { + "epoch": 1.355501519756839, + "grad_norm": 0.08520558447022207, + "learning_rate": 7.4428556219961745e-06, + "loss": 0.5478, + "step": 2789 + }, + { + "epoch": 1.3559878419452889, + "grad_norm": 0.0728728692581348, + "learning_rate": 7.441184846137282e-06, + "loss": 0.5286, + "step": 2790 + }, + { + "epoch": 1.3564741641337386, + "grad_norm": 0.0746774473855491, + "learning_rate": 7.4395137123063535e-06, + "loss": 0.5528, + "step": 2791 + }, + { + "epoch": 1.3569604863221885, + "grad_norm": 0.07441602427893382, + "learning_rate": 7.437842220748441e-06, + "loss": 0.5376, + "step": 2792 + }, + { + "epoch": 1.3574468085106384, + "grad_norm": 0.07789502817645118, + "learning_rate": 7.43617037170865e-06, + "loss": 0.5438, + "step": 2793 + }, + { + "epoch": 1.357933130699088, + "grad_norm": 0.07085350568100886, + "learning_rate": 7.43449816543214e-06, + "loss": 0.5563, + "step": 2794 + }, + { + "epoch": 1.358419452887538, + "grad_norm": 0.07124389214173889, + "learning_rate": 7.43282560216412e-06, + "loss": 0.5573, + "step": 2795 + }, + { + "epoch": 1.3589057750759879, + "grad_norm": 0.07284524070744001, + "learning_rate": 7.4311526821498505e-06, + "loss": 0.5443, + "step": 2796 + }, + { + "epoch": 1.3593920972644378, + "grad_norm": 0.07601028754945906, + "learning_rate": 7.429479405634647e-06, + "loss": 0.5762, + "step": 2797 + }, + { + "epoch": 1.3598784194528877, + "grad_norm": 0.07748444578281123, + "learning_rate": 7.427805772863878e-06, + "loss": 0.5623, + "step": 2798 + }, + { + "epoch": 1.3603647416413374, + "grad_norm": 0.07310537205843276, + "learning_rate": 7.4261317840829635e-06, + "loss": 0.5716, + "step": 2799 + }, + { + "epoch": 1.3608510638297873, + "grad_norm": 0.07168809957318174, + "learning_rate": 7.424457439537371e-06, + "loss": 0.5427, + "step": 2800 + }, + { + "epoch": 1.3613373860182372, + "grad_norm": 0.07202764836618479, + "learning_rate": 7.42278273947263e-06, + "loss": 0.5379, + "step": 2801 + }, + { + "epoch": 1.3618237082066869, + "grad_norm": 0.07386640909667373, + "learning_rate": 7.42110768413431e-06, + "loss": 0.5526, + "step": 2802 + }, + { + "epoch": 1.3623100303951368, + "grad_norm": 0.0736861769756912, + "learning_rate": 7.419432273768041e-06, + "loss": 0.5527, + "step": 2803 + }, + { + "epoch": 1.3627963525835867, + "grad_norm": 0.07183365272393766, + "learning_rate": 7.417756508619504e-06, + "loss": 0.54, + "step": 2804 + }, + { + "epoch": 1.3632826747720364, + "grad_norm": 0.07437558078885173, + "learning_rate": 7.416080388934433e-06, + "loss": 0.5467, + "step": 2805 + }, + { + "epoch": 1.3637689969604863, + "grad_norm": 0.07475835623248893, + "learning_rate": 7.414403914958607e-06, + "loss": 0.5718, + "step": 2806 + }, + { + "epoch": 1.3642553191489362, + "grad_norm": 0.07149494868687824, + "learning_rate": 7.412727086937864e-06, + "loss": 0.556, + "step": 2807 + }, + { + "epoch": 1.364741641337386, + "grad_norm": 0.07148101038637412, + "learning_rate": 7.411049905118093e-06, + "loss": 0.5192, + "step": 2808 + }, + { + "epoch": 1.365227963525836, + "grad_norm": 0.07241062392752333, + "learning_rate": 7.409372369745232e-06, + "loss": 0.5675, + "step": 2809 + }, + { + "epoch": 1.3657142857142857, + "grad_norm": 0.07631166170340024, + "learning_rate": 7.407694481065274e-06, + "loss": 0.5724, + "step": 2810 + }, + { + "epoch": 1.3662006079027356, + "grad_norm": 0.06952784825593136, + "learning_rate": 7.406016239324262e-06, + "loss": 0.5257, + "step": 2811 + }, + { + "epoch": 1.3666869300911855, + "grad_norm": 0.07355501824184059, + "learning_rate": 7.404337644768289e-06, + "loss": 0.5273, + "step": 2812 + }, + { + "epoch": 1.3671732522796352, + "grad_norm": 0.07154587221104756, + "learning_rate": 7.402658697643504e-06, + "loss": 0.5826, + "step": 2813 + }, + { + "epoch": 1.367659574468085, + "grad_norm": 0.07236149323762646, + "learning_rate": 7.400979398196107e-06, + "loss": 0.5461, + "step": 2814 + }, + { + "epoch": 1.368145896656535, + "grad_norm": 0.07317092435020892, + "learning_rate": 7.399299746672344e-06, + "loss": 0.5572, + "step": 2815 + }, + { + "epoch": 1.3686322188449849, + "grad_norm": 0.08558772311693255, + "learning_rate": 7.397619743318519e-06, + "loss": 0.5861, + "step": 2816 + }, + { + "epoch": 1.3691185410334348, + "grad_norm": 0.07169593551956592, + "learning_rate": 7.395939388380986e-06, + "loss": 0.5531, + "step": 2817 + }, + { + "epoch": 1.3696048632218845, + "grad_norm": 0.07048911954452661, + "learning_rate": 7.3942586821061505e-06, + "loss": 0.5456, + "step": 2818 + }, + { + "epoch": 1.3700911854103344, + "grad_norm": 0.07470367769238317, + "learning_rate": 7.392577624740467e-06, + "loss": 0.5842, + "step": 2819 + }, + { + "epoch": 1.3705775075987843, + "grad_norm": 0.18957076990199326, + "learning_rate": 7.390896216530442e-06, + "loss": 0.6094, + "step": 2820 + }, + { + "epoch": 1.371063829787234, + "grad_norm": 0.07181529660807395, + "learning_rate": 7.38921445772264e-06, + "loss": 0.5357, + "step": 2821 + }, + { + "epoch": 1.3715501519756839, + "grad_norm": 0.07517854521329402, + "learning_rate": 7.387532348563668e-06, + "loss": 0.5977, + "step": 2822 + }, + { + "epoch": 1.3720364741641338, + "grad_norm": 0.07339891946059386, + "learning_rate": 7.38584988930019e-06, + "loss": 0.5354, + "step": 2823 + }, + { + "epoch": 1.3725227963525835, + "grad_norm": 0.0749372211065316, + "learning_rate": 7.3841670801789175e-06, + "loss": 0.5832, + "step": 2824 + }, + { + "epoch": 1.3730091185410334, + "grad_norm": 0.07333183161587552, + "learning_rate": 7.382483921446619e-06, + "loss": 0.5865, + "step": 2825 + }, + { + "epoch": 1.3734954407294833, + "grad_norm": 0.07131610559246881, + "learning_rate": 7.380800413350108e-06, + "loss": 0.5596, + "step": 2826 + }, + { + "epoch": 1.3739817629179332, + "grad_norm": 0.07585132827891361, + "learning_rate": 7.379116556136251e-06, + "loss": 0.5634, + "step": 2827 + }, + { + "epoch": 1.374468085106383, + "grad_norm": 0.07086433646522601, + "learning_rate": 7.377432350051968e-06, + "loss": 0.5701, + "step": 2828 + }, + { + "epoch": 1.3749544072948328, + "grad_norm": 0.07815009712652388, + "learning_rate": 7.375747795344227e-06, + "loss": 0.5596, + "step": 2829 + }, + { + "epoch": 1.3754407294832827, + "grad_norm": 0.07533885199256991, + "learning_rate": 7.374062892260052e-06, + "loss": 0.5458, + "step": 2830 + }, + { + "epoch": 1.3759270516717326, + "grad_norm": 0.07409574900520231, + "learning_rate": 7.372377641046512e-06, + "loss": 0.504, + "step": 2831 + }, + { + "epoch": 1.3764133738601823, + "grad_norm": 0.07587331618184268, + "learning_rate": 7.3706920419507325e-06, + "loss": 0.534, + "step": 2832 + }, + { + "epoch": 1.3768996960486322, + "grad_norm": 0.0720185972198476, + "learning_rate": 7.369006095219886e-06, + "loss": 0.5191, + "step": 2833 + }, + { + "epoch": 1.377386018237082, + "grad_norm": 0.07082787061550921, + "learning_rate": 7.367319801101196e-06, + "loss": 0.5503, + "step": 2834 + }, + { + "epoch": 1.377872340425532, + "grad_norm": 0.07465233848387376, + "learning_rate": 7.3656331598419405e-06, + "loss": 0.5214, + "step": 2835 + }, + { + "epoch": 1.3783586626139819, + "grad_norm": 0.07384824657892668, + "learning_rate": 7.3639461716894465e-06, + "loss": 0.547, + "step": 2836 + }, + { + "epoch": 1.3788449848024316, + "grad_norm": 0.07098669004602412, + "learning_rate": 7.36225883689109e-06, + "loss": 0.5197, + "step": 2837 + }, + { + "epoch": 1.3793313069908815, + "grad_norm": 0.06927078737252892, + "learning_rate": 7.360571155694299e-06, + "loss": 0.5398, + "step": 2838 + }, + { + "epoch": 1.3798176291793314, + "grad_norm": 0.07129588884877884, + "learning_rate": 7.358883128346556e-06, + "loss": 0.5557, + "step": 2839 + }, + { + "epoch": 1.380303951367781, + "grad_norm": 0.07586852058302193, + "learning_rate": 7.35719475509539e-06, + "loss": 0.5662, + "step": 2840 + }, + { + "epoch": 1.380790273556231, + "grad_norm": 0.0718329178142727, + "learning_rate": 7.355506036188379e-06, + "loss": 0.54, + "step": 2841 + }, + { + "epoch": 1.3812765957446809, + "grad_norm": 0.07233453430135549, + "learning_rate": 7.353816971873157e-06, + "loss": 0.5711, + "step": 2842 + }, + { + "epoch": 1.3817629179331308, + "grad_norm": 0.07483219855572677, + "learning_rate": 7.352127562397405e-06, + "loss": 0.5425, + "step": 2843 + }, + { + "epoch": 1.3822492401215807, + "grad_norm": 0.07378567330802537, + "learning_rate": 7.3504378080088565e-06, + "loss": 0.5661, + "step": 2844 + }, + { + "epoch": 1.3827355623100304, + "grad_norm": 0.0788120437038464, + "learning_rate": 7.348747708955295e-06, + "loss": 0.5166, + "step": 2845 + }, + { + "epoch": 1.3832218844984803, + "grad_norm": 0.07173608627648453, + "learning_rate": 7.347057265484553e-06, + "loss": 0.5479, + "step": 2846 + }, + { + "epoch": 1.3837082066869302, + "grad_norm": 0.07248099415059925, + "learning_rate": 7.345366477844516e-06, + "loss": 0.6016, + "step": 2847 + }, + { + "epoch": 1.3841945288753799, + "grad_norm": 0.07054823618129706, + "learning_rate": 7.343675346283118e-06, + "loss": 0.5623, + "step": 2848 + }, + { + "epoch": 1.3846808510638298, + "grad_norm": 0.07109370612873508, + "learning_rate": 7.341983871048343e-06, + "loss": 0.5275, + "step": 2849 + }, + { + "epoch": 1.3851671732522797, + "grad_norm": 0.07323870167369544, + "learning_rate": 7.340292052388232e-06, + "loss": 0.5659, + "step": 2850 + }, + { + "epoch": 1.3856534954407294, + "grad_norm": 0.07004095477558245, + "learning_rate": 7.338599890550865e-06, + "loss": 0.5122, + "step": 2851 + }, + { + "epoch": 1.3861398176291793, + "grad_norm": 0.07112006983727202, + "learning_rate": 7.3369073857843805e-06, + "loss": 0.5138, + "step": 2852 + }, + { + "epoch": 1.3866261398176292, + "grad_norm": 0.07785461494481812, + "learning_rate": 7.3352145383369655e-06, + "loss": 0.6049, + "step": 2853 + }, + { + "epoch": 1.387112462006079, + "grad_norm": 0.07382196200333008, + "learning_rate": 7.333521348456858e-06, + "loss": 0.5865, + "step": 2854 + }, + { + "epoch": 1.387598784194529, + "grad_norm": 0.07256705503860443, + "learning_rate": 7.331827816392341e-06, + "loss": 0.5387, + "step": 2855 + }, + { + "epoch": 1.3880851063829787, + "grad_norm": 0.07219826758052535, + "learning_rate": 7.330133942391757e-06, + "loss": 0.5449, + "step": 2856 + }, + { + "epoch": 1.3885714285714286, + "grad_norm": 0.0733839603950585, + "learning_rate": 7.328439726703489e-06, + "loss": 0.5903, + "step": 2857 + }, + { + "epoch": 1.3890577507598785, + "grad_norm": 0.07241773622070892, + "learning_rate": 7.326745169575978e-06, + "loss": 0.5543, + "step": 2858 + }, + { + "epoch": 1.3895440729483282, + "grad_norm": 0.07097945461680506, + "learning_rate": 7.325050271257707e-06, + "loss": 0.5159, + "step": 2859 + }, + { + "epoch": 1.390030395136778, + "grad_norm": 0.07060664798701335, + "learning_rate": 7.323355031997219e-06, + "loss": 0.5167, + "step": 2860 + }, + { + "epoch": 1.390516717325228, + "grad_norm": 0.06969922407444792, + "learning_rate": 7.321659452043098e-06, + "loss": 0.5243, + "step": 2861 + }, + { + "epoch": 1.3910030395136779, + "grad_norm": 0.07144180799115653, + "learning_rate": 7.319963531643983e-06, + "loss": 0.5387, + "step": 2862 + }, + { + "epoch": 1.3914893617021278, + "grad_norm": 0.07215642533462507, + "learning_rate": 7.318267271048561e-06, + "loss": 0.5542, + "step": 2863 + }, + { + "epoch": 1.3919756838905775, + "grad_norm": 0.07275601306170419, + "learning_rate": 7.3165706705055695e-06, + "loss": 0.5487, + "step": 2864 + }, + { + "epoch": 1.3924620060790274, + "grad_norm": 0.07508419904523045, + "learning_rate": 7.314873730263795e-06, + "loss": 0.5754, + "step": 2865 + }, + { + "epoch": 1.3929483282674773, + "grad_norm": 0.07151443273915334, + "learning_rate": 7.313176450572075e-06, + "loss": 0.5342, + "step": 2866 + }, + { + "epoch": 1.393434650455927, + "grad_norm": 0.07278666922289213, + "learning_rate": 7.311478831679296e-06, + "loss": 0.5581, + "step": 2867 + }, + { + "epoch": 1.3939209726443769, + "grad_norm": 0.07160797595436258, + "learning_rate": 7.3097808738343955e-06, + "loss": 0.531, + "step": 2868 + }, + { + "epoch": 1.3944072948328268, + "grad_norm": 0.07474752287177243, + "learning_rate": 7.308082577286359e-06, + "loss": 0.5968, + "step": 2869 + }, + { + "epoch": 1.3948936170212767, + "grad_norm": 0.07071587047456368, + "learning_rate": 7.306383942284223e-06, + "loss": 0.5377, + "step": 2870 + }, + { + "epoch": 1.3953799392097264, + "grad_norm": 0.07273908938660241, + "learning_rate": 7.304684969077074e-06, + "loss": 0.5186, + "step": 2871 + }, + { + "epoch": 1.3958662613981763, + "grad_norm": 0.07411038431648295, + "learning_rate": 7.302985657914044e-06, + "loss": 0.5527, + "step": 2872 + }, + { + "epoch": 1.3963525835866262, + "grad_norm": 0.07646412926021802, + "learning_rate": 7.3012860090443196e-06, + "loss": 0.5736, + "step": 2873 + }, + { + "epoch": 1.396838905775076, + "grad_norm": 0.07393976364464108, + "learning_rate": 7.299586022717134e-06, + "loss": 0.5997, + "step": 2874 + }, + { + "epoch": 1.3973252279635258, + "grad_norm": 0.07415618502150105, + "learning_rate": 7.2978856991817715e-06, + "loss": 0.564, + "step": 2875 + }, + { + "epoch": 1.3978115501519757, + "grad_norm": 0.07532202269238364, + "learning_rate": 7.296185038687566e-06, + "loss": 0.5509, + "step": 2876 + }, + { + "epoch": 1.3982978723404256, + "grad_norm": 0.06939612130541992, + "learning_rate": 7.2944840414839e-06, + "loss": 0.5397, + "step": 2877 + }, + { + "epoch": 1.3987841945288753, + "grad_norm": 0.0740348888212191, + "learning_rate": 7.292782707820205e-06, + "loss": 0.5591, + "step": 2878 + }, + { + "epoch": 1.3992705167173252, + "grad_norm": 0.07390150825340479, + "learning_rate": 7.291081037945963e-06, + "loss": 0.5575, + "step": 2879 + }, + { + "epoch": 1.399756838905775, + "grad_norm": 0.07231549687431432, + "learning_rate": 7.2893790321107015e-06, + "loss": 0.5584, + "step": 2880 + }, + { + "epoch": 1.400243161094225, + "grad_norm": 0.07733413463169406, + "learning_rate": 7.287676690564005e-06, + "loss": 0.5865, + "step": 2881 + }, + { + "epoch": 1.4007294832826749, + "grad_norm": 0.0724220270588826, + "learning_rate": 7.285974013555498e-06, + "loss": 0.57, + "step": 2882 + }, + { + "epoch": 1.4012158054711246, + "grad_norm": 0.07338037031835948, + "learning_rate": 7.284271001334862e-06, + "loss": 0.543, + "step": 2883 + }, + { + "epoch": 1.4017021276595745, + "grad_norm": 0.07560763262282227, + "learning_rate": 7.282567654151822e-06, + "loss": 0.5828, + "step": 2884 + }, + { + "epoch": 1.4021884498480244, + "grad_norm": 0.07615789911052342, + "learning_rate": 7.280863972256156e-06, + "loss": 0.601, + "step": 2885 + }, + { + "epoch": 1.402674772036474, + "grad_norm": 0.07020233800855825, + "learning_rate": 7.2791599558976925e-06, + "loss": 0.522, + "step": 2886 + }, + { + "epoch": 1.403161094224924, + "grad_norm": 0.07156443938040494, + "learning_rate": 7.2774556053263e-06, + "loss": 0.5396, + "step": 2887 + }, + { + "epoch": 1.4036474164133739, + "grad_norm": 0.07303153564149846, + "learning_rate": 7.275750920791905e-06, + "loss": 0.5696, + "step": 2888 + }, + { + "epoch": 1.4041337386018238, + "grad_norm": 0.076449654503809, + "learning_rate": 7.274045902544482e-06, + "loss": 0.5677, + "step": 2889 + }, + { + "epoch": 1.4046200607902737, + "grad_norm": 0.07254520642196749, + "learning_rate": 7.272340550834049e-06, + "loss": 0.5781, + "step": 2890 + }, + { + "epoch": 1.4051063829787234, + "grad_norm": 0.07090249053401353, + "learning_rate": 7.27063486591068e-06, + "loss": 0.5416, + "step": 2891 + }, + { + "epoch": 1.4055927051671733, + "grad_norm": 0.07068832992042283, + "learning_rate": 7.268928848024492e-06, + "loss": 0.5502, + "step": 2892 + }, + { + "epoch": 1.4060790273556232, + "grad_norm": 0.07102742964508688, + "learning_rate": 7.267222497425653e-06, + "loss": 0.522, + "step": 2893 + }, + { + "epoch": 1.4065653495440729, + "grad_norm": 0.07072343476739507, + "learning_rate": 7.2655158143643835e-06, + "loss": 0.5644, + "step": 2894 + }, + { + "epoch": 1.4070516717325228, + "grad_norm": 0.07407702907327725, + "learning_rate": 7.263808799090944e-06, + "loss": 0.5737, + "step": 2895 + }, + { + "epoch": 1.4075379939209727, + "grad_norm": 0.0739752075966807, + "learning_rate": 7.262101451855652e-06, + "loss": 0.5802, + "step": 2896 + }, + { + "epoch": 1.4080243161094224, + "grad_norm": 0.07034633406691732, + "learning_rate": 7.26039377290887e-06, + "loss": 0.5063, + "step": 2897 + }, + { + "epoch": 1.4085106382978723, + "grad_norm": 0.07258208292619515, + "learning_rate": 7.25868576250101e-06, + "loss": 0.5289, + "step": 2898 + }, + { + "epoch": 1.4089969604863222, + "grad_norm": 0.07395196085382696, + "learning_rate": 7.256977420882532e-06, + "loss": 0.5664, + "step": 2899 + }, + { + "epoch": 1.409483282674772, + "grad_norm": 0.07466260790183507, + "learning_rate": 7.255268748303944e-06, + "loss": 0.5548, + "step": 2900 + }, + { + "epoch": 1.409969604863222, + "grad_norm": 0.07091725641249104, + "learning_rate": 7.253559745015804e-06, + "loss": 0.5174, + "step": 2901 + }, + { + "epoch": 1.4104559270516717, + "grad_norm": 0.07343794089575462, + "learning_rate": 7.25185041126872e-06, + "loss": 0.5619, + "step": 2902 + }, + { + "epoch": 1.4109422492401216, + "grad_norm": 0.07304034701522473, + "learning_rate": 7.250140747313344e-06, + "loss": 0.5555, + "step": 2903 + }, + { + "epoch": 1.4114285714285715, + "grad_norm": 0.07043003883584882, + "learning_rate": 7.24843075340038e-06, + "loss": 0.536, + "step": 2904 + }, + { + "epoch": 1.4119148936170212, + "grad_norm": 0.0731204391784189, + "learning_rate": 7.246720429780577e-06, + "loss": 0.5358, + "step": 2905 + }, + { + "epoch": 1.412401215805471, + "grad_norm": 0.07553089619187109, + "learning_rate": 7.2450097767047365e-06, + "loss": 0.5554, + "step": 2906 + }, + { + "epoch": 1.412887537993921, + "grad_norm": 0.07178286341781658, + "learning_rate": 7.243298794423707e-06, + "loss": 0.5219, + "step": 2907 + }, + { + "epoch": 1.4133738601823709, + "grad_norm": 0.075458150666953, + "learning_rate": 7.241587483188383e-06, + "loss": 0.5843, + "step": 2908 + }, + { + "epoch": 1.4138601823708208, + "grad_norm": 0.07264898605466318, + "learning_rate": 7.239875843249711e-06, + "loss": 0.56, + "step": 2909 + }, + { + "epoch": 1.4143465045592705, + "grad_norm": 0.07324078665502846, + "learning_rate": 7.238163874858681e-06, + "loss": 0.5523, + "step": 2910 + }, + { + "epoch": 1.4148328267477204, + "grad_norm": 0.07304737618554122, + "learning_rate": 7.236451578266334e-06, + "loss": 0.524, + "step": 2911 + }, + { + "epoch": 1.4153191489361703, + "grad_norm": 0.07681772766413196, + "learning_rate": 7.23473895372376e-06, + "loss": 0.5623, + "step": 2912 + }, + { + "epoch": 1.41580547112462, + "grad_norm": 0.09232250440068646, + "learning_rate": 7.233026001482095e-06, + "loss": 0.5442, + "step": 2913 + }, + { + "epoch": 1.4162917933130699, + "grad_norm": 0.07158200143802625, + "learning_rate": 7.231312721792526e-06, + "loss": 0.5142, + "step": 2914 + }, + { + "epoch": 1.4167781155015198, + "grad_norm": 0.0752229809809883, + "learning_rate": 7.229599114906284e-06, + "loss": 0.546, + "step": 2915 + }, + { + "epoch": 1.4172644376899697, + "grad_norm": 0.07332506419653727, + "learning_rate": 7.227885181074651e-06, + "loss": 0.5495, + "step": 2916 + }, + { + "epoch": 1.4177507598784196, + "grad_norm": 0.07655510596221113, + "learning_rate": 7.226170920548955e-06, + "loss": 0.5632, + "step": 2917 + }, + { + "epoch": 1.4182370820668693, + "grad_norm": 0.07161543018038535, + "learning_rate": 7.224456333580574e-06, + "loss": 0.5718, + "step": 2918 + }, + { + "epoch": 1.4187234042553192, + "grad_norm": 0.07220675071247758, + "learning_rate": 7.2227414204209316e-06, + "loss": 0.5363, + "step": 2919 + }, + { + "epoch": 1.419209726443769, + "grad_norm": 0.07253803096202562, + "learning_rate": 7.2210261813215e-06, + "loss": 0.5485, + "step": 2920 + }, + { + "epoch": 1.4196960486322188, + "grad_norm": 0.0692362395534776, + "learning_rate": 7.2193106165338e-06, + "loss": 0.5363, + "step": 2921 + }, + { + "epoch": 1.4201823708206687, + "grad_norm": 0.07172650959798435, + "learning_rate": 7.2175947263094015e-06, + "loss": 0.5429, + "step": 2922 + }, + { + "epoch": 1.4206686930091186, + "grad_norm": 0.07552111125767548, + "learning_rate": 7.215878510899918e-06, + "loss": 0.5865, + "step": 2923 + }, + { + "epoch": 1.4211550151975683, + "grad_norm": 0.07690821478542738, + "learning_rate": 7.214161970557014e-06, + "loss": 0.5676, + "step": 2924 + }, + { + "epoch": 1.4216413373860182, + "grad_norm": 0.07494485129939144, + "learning_rate": 7.212445105532402e-06, + "loss": 0.5698, + "step": 2925 + }, + { + "epoch": 1.422127659574468, + "grad_norm": 0.07281961081163435, + "learning_rate": 7.2107279160778376e-06, + "loss": 0.563, + "step": 2926 + }, + { + "epoch": 1.422613981762918, + "grad_norm": 0.07153299855121928, + "learning_rate": 7.209010402445129e-06, + "loss": 0.5389, + "step": 2927 + }, + { + "epoch": 1.4231003039513679, + "grad_norm": 0.07205024151133409, + "learning_rate": 7.2072925648861304e-06, + "loss": 0.5416, + "step": 2928 + }, + { + "epoch": 1.4235866261398176, + "grad_norm": 0.0723651105714062, + "learning_rate": 7.205574403652742e-06, + "loss": 0.5149, + "step": 2929 + }, + { + "epoch": 1.4240729483282675, + "grad_norm": 0.06961775243006621, + "learning_rate": 7.203855918996912e-06, + "loss": 0.5352, + "step": 2930 + }, + { + "epoch": 1.4245592705167174, + "grad_norm": 0.07295568722111719, + "learning_rate": 7.20213711117064e-06, + "loss": 0.5607, + "step": 2931 + }, + { + "epoch": 1.425045592705167, + "grad_norm": 0.07233304147311813, + "learning_rate": 7.200417980425969e-06, + "loss": 0.5364, + "step": 2932 + }, + { + "epoch": 1.425531914893617, + "grad_norm": 0.07244381251575628, + "learning_rate": 7.198698527014985e-06, + "loss": 0.5534, + "step": 2933 + }, + { + "epoch": 1.4260182370820669, + "grad_norm": 0.07184447123285677, + "learning_rate": 7.1969787511898315e-06, + "loss": 0.5388, + "step": 2934 + }, + { + "epoch": 1.4265045592705168, + "grad_norm": 0.07207958771349697, + "learning_rate": 7.195258653202693e-06, + "loss": 0.5269, + "step": 2935 + }, + { + "epoch": 1.4269908814589667, + "grad_norm": 0.07235465963887146, + "learning_rate": 7.193538233305801e-06, + "loss": 0.569, + "step": 2936 + }, + { + "epoch": 1.4274772036474164, + "grad_norm": 0.07523043195217095, + "learning_rate": 7.191817491751437e-06, + "loss": 0.5643, + "step": 2937 + }, + { + "epoch": 1.4279635258358663, + "grad_norm": 0.06973791267895596, + "learning_rate": 7.190096428791926e-06, + "loss": 0.5504, + "step": 2938 + }, + { + "epoch": 1.4284498480243162, + "grad_norm": 0.07336296054204193, + "learning_rate": 7.188375044679645e-06, + "loss": 0.572, + "step": 2939 + }, + { + "epoch": 1.4289361702127659, + "grad_norm": 0.07416592836274838, + "learning_rate": 7.186653339667016e-06, + "loss": 0.5374, + "step": 2940 + }, + { + "epoch": 1.4294224924012158, + "grad_norm": 0.07167443545562771, + "learning_rate": 7.184931314006504e-06, + "loss": 0.5398, + "step": 2941 + }, + { + "epoch": 1.4299088145896657, + "grad_norm": 0.07193340663506219, + "learning_rate": 7.183208967950627e-06, + "loss": 0.5519, + "step": 2942 + }, + { + "epoch": 1.4303951367781156, + "grad_norm": 0.07637659783285151, + "learning_rate": 7.181486301751945e-06, + "loss": 0.5423, + "step": 2943 + }, + { + "epoch": 1.4308814589665653, + "grad_norm": 0.07460956940701864, + "learning_rate": 7.179763315663071e-06, + "loss": 0.5791, + "step": 2944 + }, + { + "epoch": 1.4313677811550152, + "grad_norm": 0.07178759314396749, + "learning_rate": 7.1780400099366595e-06, + "loss": 0.5472, + "step": 2945 + }, + { + "epoch": 1.431854103343465, + "grad_norm": 0.0732634138785685, + "learning_rate": 7.176316384825414e-06, + "loss": 0.5664, + "step": 2946 + }, + { + "epoch": 1.432340425531915, + "grad_norm": 0.07101649384704461, + "learning_rate": 7.174592440582084e-06, + "loss": 0.5468, + "step": 2947 + }, + { + "epoch": 1.4328267477203647, + "grad_norm": 0.07311283871274517, + "learning_rate": 7.172868177459467e-06, + "loss": 0.551, + "step": 2948 + }, + { + "epoch": 1.4333130699088146, + "grad_norm": 0.0748691707711707, + "learning_rate": 7.171143595710406e-06, + "loss": 0.5256, + "step": 2949 + }, + { + "epoch": 1.4337993920972645, + "grad_norm": 0.07156771620031142, + "learning_rate": 7.169418695587791e-06, + "loss": 0.5065, + "step": 2950 + }, + { + "epoch": 1.4342857142857142, + "grad_norm": 0.07525715626809713, + "learning_rate": 7.16769347734456e-06, + "loss": 0.5747, + "step": 2951 + }, + { + "epoch": 1.434772036474164, + "grad_norm": 0.07142270349168325, + "learning_rate": 7.165967941233698e-06, + "loss": 0.5199, + "step": 2952 + }, + { + "epoch": 1.435258358662614, + "grad_norm": 0.07509813779295409, + "learning_rate": 7.164242087508232e-06, + "loss": 0.5773, + "step": 2953 + }, + { + "epoch": 1.4357446808510639, + "grad_norm": 0.07195585471837591, + "learning_rate": 7.162515916421241e-06, + "loss": 0.511, + "step": 2954 + }, + { + "epoch": 1.4362310030395138, + "grad_norm": 0.07118912309486448, + "learning_rate": 7.160789428225847e-06, + "loss": 0.5375, + "step": 2955 + }, + { + "epoch": 1.4367173252279635, + "grad_norm": 0.07240982057746294, + "learning_rate": 7.159062623175222e-06, + "loss": 0.5504, + "step": 2956 + }, + { + "epoch": 1.4372036474164134, + "grad_norm": 0.07412775758703977, + "learning_rate": 7.1573355015225795e-06, + "loss": 0.5524, + "step": 2957 + }, + { + "epoch": 1.4376899696048633, + "grad_norm": 0.07026734906933059, + "learning_rate": 7.155608063521185e-06, + "loss": 0.5429, + "step": 2958 + }, + { + "epoch": 1.438176291793313, + "grad_norm": 0.07533367397354152, + "learning_rate": 7.153880309424347e-06, + "loss": 0.5765, + "step": 2959 + }, + { + "epoch": 1.4386626139817629, + "grad_norm": 0.07183972165978872, + "learning_rate": 7.152152239485419e-06, + "loss": 0.5655, + "step": 2960 + }, + { + "epoch": 1.4391489361702128, + "grad_norm": 0.07248919534916828, + "learning_rate": 7.1504238539578064e-06, + "loss": 0.5757, + "step": 2961 + }, + { + "epoch": 1.4396352583586627, + "grad_norm": 0.0746259132503898, + "learning_rate": 7.148695153094954e-06, + "loss": 0.5314, + "step": 2962 + }, + { + "epoch": 1.4401215805471126, + "grad_norm": 0.07048273194028765, + "learning_rate": 7.1469661371503575e-06, + "loss": 0.5302, + "step": 2963 + }, + { + "epoch": 1.4406079027355623, + "grad_norm": 0.07455320830588442, + "learning_rate": 7.145236806377559e-06, + "loss": 0.5487, + "step": 2964 + }, + { + "epoch": 1.4410942249240122, + "grad_norm": 0.0751360432228851, + "learning_rate": 7.143507161030141e-06, + "loss": 0.5633, + "step": 2965 + }, + { + "epoch": 1.441580547112462, + "grad_norm": 0.07267197128379294, + "learning_rate": 7.14177720136174e-06, + "loss": 0.5162, + "step": 2966 + }, + { + "epoch": 1.4420668693009118, + "grad_norm": 0.07125145902664135, + "learning_rate": 7.140046927626034e-06, + "loss": 0.5289, + "step": 2967 + }, + { + "epoch": 1.4425531914893617, + "grad_norm": 0.07310039768164478, + "learning_rate": 7.138316340076748e-06, + "loss": 0.5461, + "step": 2968 + }, + { + "epoch": 1.4430395136778116, + "grad_norm": 0.07306132053552425, + "learning_rate": 7.136585438967653e-06, + "loss": 0.5814, + "step": 2969 + }, + { + "epoch": 1.4435258358662613, + "grad_norm": 0.07221209602036564, + "learning_rate": 7.134854224552565e-06, + "loss": 0.5439, + "step": 2970 + }, + { + "epoch": 1.4440121580547112, + "grad_norm": 0.07284288447008033, + "learning_rate": 7.1331226970853504e-06, + "loss": 0.5808, + "step": 2971 + }, + { + "epoch": 1.444498480243161, + "grad_norm": 0.07558224654508197, + "learning_rate": 7.131390856819914e-06, + "loss": 0.6128, + "step": 2972 + }, + { + "epoch": 1.444984802431611, + "grad_norm": 0.07048089675894902, + "learning_rate": 7.129658704010212e-06, + "loss": 0.5206, + "step": 2973 + }, + { + "epoch": 1.4454711246200609, + "grad_norm": 0.07082341456388493, + "learning_rate": 7.127926238910243e-06, + "loss": 0.5691, + "step": 2974 + }, + { + "epoch": 1.4459574468085106, + "grad_norm": 0.07522646736361067, + "learning_rate": 7.126193461774058e-06, + "loss": 0.6011, + "step": 2975 + }, + { + "epoch": 1.4464437689969605, + "grad_norm": 0.07120922249737295, + "learning_rate": 7.124460372855745e-06, + "loss": 0.5659, + "step": 2976 + }, + { + "epoch": 1.4469300911854104, + "grad_norm": 0.07259620255231417, + "learning_rate": 7.122726972409443e-06, + "loss": 0.5896, + "step": 2977 + }, + { + "epoch": 1.44741641337386, + "grad_norm": 0.07359223836184006, + "learning_rate": 7.120993260689337e-06, + "loss": 0.5377, + "step": 2978 + }, + { + "epoch": 1.44790273556231, + "grad_norm": 0.0718845196665278, + "learning_rate": 7.1192592379496535e-06, + "loss": 0.513, + "step": 2979 + }, + { + "epoch": 1.4483890577507599, + "grad_norm": 0.07138327331800925, + "learning_rate": 7.11752490444467e-06, + "loss": 0.5782, + "step": 2980 + }, + { + "epoch": 1.4488753799392098, + "grad_norm": 0.07205155524477207, + "learning_rate": 7.115790260428704e-06, + "loss": 0.5259, + "step": 2981 + }, + { + "epoch": 1.4493617021276597, + "grad_norm": 0.07407600788318844, + "learning_rate": 7.114055306156122e-06, + "loss": 0.5693, + "step": 2982 + }, + { + "epoch": 1.4498480243161094, + "grad_norm": 0.07065327395196716, + "learning_rate": 7.112320041881338e-06, + "loss": 0.5548, + "step": 2983 + }, + { + "epoch": 1.4503343465045593, + "grad_norm": 0.07492596056103878, + "learning_rate": 7.110584467858806e-06, + "loss": 0.5734, + "step": 2984 + }, + { + "epoch": 1.4508206686930092, + "grad_norm": 0.07248858280579723, + "learning_rate": 7.108848584343028e-06, + "loss": 0.5511, + "step": 2985 + }, + { + "epoch": 1.4513069908814589, + "grad_norm": 0.07332417704501748, + "learning_rate": 7.107112391588554e-06, + "loss": 0.5697, + "step": 2986 + }, + { + "epoch": 1.4517933130699088, + "grad_norm": 0.07079207263051158, + "learning_rate": 7.105375889849976e-06, + "loss": 0.5444, + "step": 2987 + }, + { + "epoch": 1.4522796352583587, + "grad_norm": 0.07527411800683574, + "learning_rate": 7.103639079381931e-06, + "loss": 0.5795, + "step": 2988 + }, + { + "epoch": 1.4527659574468086, + "grad_norm": 0.0730518598455749, + "learning_rate": 7.101901960439104e-06, + "loss": 0.5271, + "step": 2989 + }, + { + "epoch": 1.4532522796352585, + "grad_norm": 0.07515882259900461, + "learning_rate": 7.100164533276223e-06, + "loss": 0.5462, + "step": 2990 + }, + { + "epoch": 1.4537386018237082, + "grad_norm": 0.0719848490921908, + "learning_rate": 7.098426798148061e-06, + "loss": 0.563, + "step": 2991 + }, + { + "epoch": 1.454224924012158, + "grad_norm": 0.07067161697045983, + "learning_rate": 7.09668875530944e-06, + "loss": 0.5432, + "step": 2992 + }, + { + "epoch": 1.454711246200608, + "grad_norm": 0.07217341124494471, + "learning_rate": 7.0949504050152206e-06, + "loss": 0.5497, + "step": 2993 + }, + { + "epoch": 1.4551975683890577, + "grad_norm": 0.07401443780624582, + "learning_rate": 7.093211747520316e-06, + "loss": 0.554, + "step": 2994 + }, + { + "epoch": 1.4556838905775076, + "grad_norm": 0.07282930442079073, + "learning_rate": 7.091472783079677e-06, + "loss": 0.5539, + "step": 2995 + }, + { + "epoch": 1.4561702127659575, + "grad_norm": 0.07355281223700758, + "learning_rate": 7.089733511948306e-06, + "loss": 0.586, + "step": 2996 + }, + { + "epoch": 1.4566565349544072, + "grad_norm": 0.07266678283458142, + "learning_rate": 7.087993934381245e-06, + "loss": 0.5955, + "step": 2997 + }, + { + "epoch": 1.457142857142857, + "grad_norm": 0.07243351259384075, + "learning_rate": 7.086254050633584e-06, + "loss": 0.5573, + "step": 2998 + }, + { + "epoch": 1.457629179331307, + "grad_norm": 0.07447622621153978, + "learning_rate": 7.084513860960458e-06, + "loss": 0.6065, + "step": 2999 + }, + { + "epoch": 1.4581155015197569, + "grad_norm": 0.07315971291215784, + "learning_rate": 7.082773365617046e-06, + "loss": 0.5669, + "step": 3000 + }, + { + "epoch": 1.4586018237082068, + "grad_norm": 0.07684035492440253, + "learning_rate": 7.081032564858571e-06, + "loss": 0.5571, + "step": 3001 + }, + { + "epoch": 1.4590881458966565, + "grad_norm": 0.072572804482754, + "learning_rate": 7.079291458940302e-06, + "loss": 0.5736, + "step": 3002 + }, + { + "epoch": 1.4595744680851064, + "grad_norm": 0.07099953911953658, + "learning_rate": 7.077550048117552e-06, + "loss": 0.5406, + "step": 3003 + }, + { + "epoch": 1.4600607902735563, + "grad_norm": 0.07435277917034713, + "learning_rate": 7.075808332645681e-06, + "loss": 0.5647, + "step": 3004 + }, + { + "epoch": 1.460547112462006, + "grad_norm": 0.07113457425602679, + "learning_rate": 7.074066312780088e-06, + "loss": 0.5502, + "step": 3005 + }, + { + "epoch": 1.4610334346504559, + "grad_norm": 0.07374900945887536, + "learning_rate": 7.0723239887762255e-06, + "loss": 0.5808, + "step": 3006 + }, + { + "epoch": 1.4615197568389058, + "grad_norm": 0.07469148464397955, + "learning_rate": 7.070581360889581e-06, + "loss": 0.5763, + "step": 3007 + }, + { + "epoch": 1.4620060790273557, + "grad_norm": 0.0726295731428625, + "learning_rate": 7.0688384293756925e-06, + "loss": 0.5658, + "step": 3008 + }, + { + "epoch": 1.4624924012158056, + "grad_norm": 0.07129793794780138, + "learning_rate": 7.067095194490143e-06, + "loss": 0.5284, + "step": 3009 + }, + { + "epoch": 1.4629787234042553, + "grad_norm": 0.07859046644428544, + "learning_rate": 7.065351656488557e-06, + "loss": 0.6022, + "step": 3010 + }, + { + "epoch": 1.4634650455927052, + "grad_norm": 0.07128031613683354, + "learning_rate": 7.063607815626603e-06, + "loss": 0.5362, + "step": 3011 + }, + { + "epoch": 1.463951367781155, + "grad_norm": 0.07167413239697759, + "learning_rate": 7.0618636721599965e-06, + "loss": 0.527, + "step": 3012 + }, + { + "epoch": 1.4644376899696048, + "grad_norm": 0.0721553867960776, + "learning_rate": 7.060119226344497e-06, + "loss": 0.5565, + "step": 3013 + }, + { + "epoch": 1.4649240121580547, + "grad_norm": 0.07124441621907927, + "learning_rate": 7.058374478435908e-06, + "loss": 0.5437, + "step": 3014 + }, + { + "epoch": 1.4654103343465046, + "grad_norm": 0.07407227290221569, + "learning_rate": 7.056629428690075e-06, + "loss": 0.5578, + "step": 3015 + }, + { + "epoch": 1.4658966565349545, + "grad_norm": 0.0743308615949895, + "learning_rate": 7.0548840773628915e-06, + "loss": 0.578, + "step": 3016 + }, + { + "epoch": 1.4663829787234042, + "grad_norm": 0.07107825327066132, + "learning_rate": 7.053138424710293e-06, + "loss": 0.5542, + "step": 3017 + }, + { + "epoch": 1.466869300911854, + "grad_norm": 0.07125440918150784, + "learning_rate": 7.0513924709882595e-06, + "loss": 0.5175, + "step": 3018 + }, + { + "epoch": 1.467355623100304, + "grad_norm": 0.07055319986563977, + "learning_rate": 7.049646216452815e-06, + "loss": 0.5405, + "step": 3019 + }, + { + "epoch": 1.467841945288754, + "grad_norm": 0.07018330260522715, + "learning_rate": 7.047899661360027e-06, + "loss": 0.5087, + "step": 3020 + }, + { + "epoch": 1.4683282674772036, + "grad_norm": 0.07550792592405127, + "learning_rate": 7.046152805966009e-06, + "loss": 0.5935, + "step": 3021 + }, + { + "epoch": 1.4688145896656535, + "grad_norm": 0.0751320774378785, + "learning_rate": 7.044405650526919e-06, + "loss": 0.5371, + "step": 3022 + }, + { + "epoch": 1.4693009118541034, + "grad_norm": 0.07543667946173886, + "learning_rate": 7.042658195298956e-06, + "loss": 0.5808, + "step": 3023 + }, + { + "epoch": 1.469787234042553, + "grad_norm": 0.07168438950940491, + "learning_rate": 7.040910440538364e-06, + "loss": 0.5445, + "step": 3024 + }, + { + "epoch": 1.470273556231003, + "grad_norm": 0.0719175441601274, + "learning_rate": 7.0391623865014325e-06, + "loss": 0.5433, + "step": 3025 + }, + { + "epoch": 1.4707598784194529, + "grad_norm": 0.0722164607459636, + "learning_rate": 7.037414033444494e-06, + "loss": 0.5631, + "step": 3026 + }, + { + "epoch": 1.4712462006079028, + "grad_norm": 0.07523894214520926, + "learning_rate": 7.035665381623922e-06, + "loss": 0.5354, + "step": 3027 + }, + { + "epoch": 1.4717325227963527, + "grad_norm": 0.073140997379711, + "learning_rate": 7.033916431296139e-06, + "loss": 0.5675, + "step": 3028 + }, + { + "epoch": 1.4722188449848024, + "grad_norm": 0.07318593829045134, + "learning_rate": 7.032167182717607e-06, + "loss": 0.5542, + "step": 3029 + }, + { + "epoch": 1.4727051671732523, + "grad_norm": 0.07014501567334279, + "learning_rate": 7.030417636144836e-06, + "loss": 0.5187, + "step": 3030 + }, + { + "epoch": 1.4731914893617022, + "grad_norm": 0.07165359940136974, + "learning_rate": 7.028667791834375e-06, + "loss": 0.5526, + "step": 3031 + }, + { + "epoch": 1.4736778115501519, + "grad_norm": 0.07833469378227913, + "learning_rate": 7.026917650042821e-06, + "loss": 0.6136, + "step": 3032 + }, + { + "epoch": 1.4741641337386018, + "grad_norm": 0.0736553271457589, + "learning_rate": 7.0251672110268084e-06, + "loss": 0.5759, + "step": 3033 + }, + { + "epoch": 1.4746504559270517, + "grad_norm": 0.07188472691743732, + "learning_rate": 7.0234164750430235e-06, + "loss": 0.5304, + "step": 3034 + }, + { + "epoch": 1.4751367781155016, + "grad_norm": 0.07790420205160582, + "learning_rate": 7.021665442348189e-06, + "loss": 0.5581, + "step": 3035 + }, + { + "epoch": 1.4756231003039515, + "grad_norm": 0.07197532584910898, + "learning_rate": 7.019914113199074e-06, + "loss": 0.5473, + "step": 3036 + }, + { + "epoch": 1.4761094224924012, + "grad_norm": 0.07208350701203245, + "learning_rate": 7.018162487852494e-06, + "loss": 0.5618, + "step": 3037 + }, + { + "epoch": 1.476595744680851, + "grad_norm": 0.07799693452450422, + "learning_rate": 7.0164105665652995e-06, + "loss": 0.5553, + "step": 3038 + }, + { + "epoch": 1.477082066869301, + "grad_norm": 0.07053946525652079, + "learning_rate": 7.014658349594396e-06, + "loss": 0.5156, + "step": 3039 + }, + { + "epoch": 1.4775683890577507, + "grad_norm": 0.06900659441096428, + "learning_rate": 7.012905837196724e-06, + "loss": 0.5323, + "step": 3040 + }, + { + "epoch": 1.4780547112462006, + "grad_norm": 0.08067139962297624, + "learning_rate": 7.011153029629267e-06, + "loss": 0.5531, + "step": 3041 + }, + { + "epoch": 1.4785410334346505, + "grad_norm": 0.07078492767699299, + "learning_rate": 7.009399927149059e-06, + "loss": 0.5659, + "step": 3042 + }, + { + "epoch": 1.4790273556231002, + "grad_norm": 0.07305158196589566, + "learning_rate": 7.007646530013168e-06, + "loss": 0.5633, + "step": 3043 + }, + { + "epoch": 1.47951367781155, + "grad_norm": 0.07546726610475094, + "learning_rate": 7.0058928384787115e-06, + "loss": 0.5584, + "step": 3044 + }, + { + "epoch": 1.48, + "grad_norm": 0.06983494205533083, + "learning_rate": 7.004138852802849e-06, + "loss": 0.5551, + "step": 3045 + }, + { + "epoch": 1.4804863221884499, + "grad_norm": 0.07294696941571022, + "learning_rate": 7.002384573242782e-06, + "loss": 0.5516, + "step": 3046 + }, + { + "epoch": 1.4809726443768998, + "grad_norm": 0.07063355929854213, + "learning_rate": 7.000630000055757e-06, + "loss": 0.5211, + "step": 3047 + }, + { + "epoch": 1.4814589665653495, + "grad_norm": 0.07126431143715, + "learning_rate": 6.99887513349906e-06, + "loss": 0.5431, + "step": 3048 + }, + { + "epoch": 1.4819452887537994, + "grad_norm": 0.07428791534868628, + "learning_rate": 6.997119973830024e-06, + "loss": 0.5686, + "step": 3049 + }, + { + "epoch": 1.4824316109422493, + "grad_norm": 0.07309933810283628, + "learning_rate": 6.995364521306023e-06, + "loss": 0.5474, + "step": 3050 + }, + { + "epoch": 1.482917933130699, + "grad_norm": 0.07212724445739932, + "learning_rate": 6.993608776184473e-06, + "loss": 0.5148, + "step": 3051 + }, + { + "epoch": 1.4834042553191489, + "grad_norm": 0.07318674268465943, + "learning_rate": 6.991852738722835e-06, + "loss": 0.5631, + "step": 3052 + }, + { + "epoch": 1.4838905775075988, + "grad_norm": 0.07380906062330862, + "learning_rate": 6.990096409178612e-06, + "loss": 0.5633, + "step": 3053 + }, + { + "epoch": 1.4843768996960487, + "grad_norm": 0.07175019954692677, + "learning_rate": 6.98833978780935e-06, + "loss": 0.5695, + "step": 3054 + }, + { + "epoch": 1.4848632218844986, + "grad_norm": 0.07514840922383961, + "learning_rate": 6.9865828748726376e-06, + "loss": 0.5776, + "step": 3055 + }, + { + "epoch": 1.4853495440729483, + "grad_norm": 0.07259754236959098, + "learning_rate": 6.984825670626105e-06, + "loss": 0.5442, + "step": 3056 + }, + { + "epoch": 1.4858358662613982, + "grad_norm": 0.07477696437719421, + "learning_rate": 6.983068175327427e-06, + "loss": 0.554, + "step": 3057 + }, + { + "epoch": 1.486322188449848, + "grad_norm": 0.07136033655996425, + "learning_rate": 6.9813103892343205e-06, + "loss": 0.5594, + "step": 3058 + }, + { + "epoch": 1.4868085106382978, + "grad_norm": 0.0695046548188177, + "learning_rate": 6.979552312604545e-06, + "loss": 0.5404, + "step": 3059 + }, + { + "epoch": 1.4872948328267477, + "grad_norm": 0.07550839218908577, + "learning_rate": 6.977793945695901e-06, + "loss": 0.5704, + "step": 3060 + }, + { + "epoch": 1.4877811550151976, + "grad_norm": 0.07104429896611426, + "learning_rate": 6.976035288766235e-06, + "loss": 0.5369, + "step": 3061 + }, + { + "epoch": 1.4882674772036475, + "grad_norm": 0.07619924556653611, + "learning_rate": 6.974276342073434e-06, + "loss": 0.5708, + "step": 3062 + }, + { + "epoch": 1.4887537993920974, + "grad_norm": 0.07110498030569501, + "learning_rate": 6.9725171058754275e-06, + "loss": 0.5352, + "step": 3063 + }, + { + "epoch": 1.489240121580547, + "grad_norm": 0.07329850463671592, + "learning_rate": 6.970757580430184e-06, + "loss": 0.5583, + "step": 3064 + }, + { + "epoch": 1.489726443768997, + "grad_norm": 0.07063919179366497, + "learning_rate": 6.968997765995722e-06, + "loss": 0.5486, + "step": 3065 + }, + { + "epoch": 1.490212765957447, + "grad_norm": 0.07503426553670628, + "learning_rate": 6.967237662830096e-06, + "loss": 0.564, + "step": 3066 + }, + { + "epoch": 1.4906990881458966, + "grad_norm": 0.07148177717612322, + "learning_rate": 6.965477271191407e-06, + "loss": 0.5537, + "step": 3067 + }, + { + "epoch": 1.4911854103343465, + "grad_norm": 0.07348425497748244, + "learning_rate": 6.963716591337797e-06, + "loss": 0.5736, + "step": 3068 + }, + { + "epoch": 1.4916717325227964, + "grad_norm": 0.07497767912472064, + "learning_rate": 6.9619556235274475e-06, + "loss": 0.636, + "step": 3069 + }, + { + "epoch": 1.492158054711246, + "grad_norm": 0.07411834084612781, + "learning_rate": 6.960194368018587e-06, + "loss": 0.5776, + "step": 3070 + }, + { + "epoch": 1.492644376899696, + "grad_norm": 0.07324794197358897, + "learning_rate": 6.95843282506948e-06, + "loss": 0.562, + "step": 3071 + }, + { + "epoch": 1.4931306990881459, + "grad_norm": 0.0727415948537489, + "learning_rate": 6.956670994938438e-06, + "loss": 0.5885, + "step": 3072 + }, + { + "epoch": 1.4936170212765958, + "grad_norm": 0.07151424124766156, + "learning_rate": 6.9549088778838145e-06, + "loss": 0.531, + "step": 3073 + }, + { + "epoch": 1.4941033434650457, + "grad_norm": 0.07411308962781803, + "learning_rate": 6.953146474164003e-06, + "loss": 0.5346, + "step": 3074 + }, + { + "epoch": 1.4945896656534954, + "grad_norm": 0.07235227352132019, + "learning_rate": 6.951383784037442e-06, + "loss": 0.5425, + "step": 3075 + }, + { + "epoch": 1.4950759878419453, + "grad_norm": 0.07495502558692727, + "learning_rate": 6.9496208077626084e-06, + "loss": 0.5905, + "step": 3076 + }, + { + "epoch": 1.4955623100303952, + "grad_norm": 0.0703968726060793, + "learning_rate": 6.947857545598023e-06, + "loss": 0.5071, + "step": 3077 + }, + { + "epoch": 1.4960486322188449, + "grad_norm": 0.0720641865876843, + "learning_rate": 6.946093997802248e-06, + "loss": 0.5505, + "step": 3078 + }, + { + "epoch": 1.4965349544072948, + "grad_norm": 0.07325115038844345, + "learning_rate": 6.944330164633886e-06, + "loss": 0.5397, + "step": 3079 + }, + { + "epoch": 1.4970212765957447, + "grad_norm": 0.07412819439542268, + "learning_rate": 6.942566046351586e-06, + "loss": 0.5561, + "step": 3080 + }, + { + "epoch": 1.4975075987841946, + "grad_norm": 0.0726852683677845, + "learning_rate": 6.940801643214033e-06, + "loss": 0.5404, + "step": 3081 + }, + { + "epoch": 1.4979939209726445, + "grad_norm": 0.07318285267210774, + "learning_rate": 6.93903695547996e-06, + "loss": 0.5568, + "step": 3082 + }, + { + "epoch": 1.4984802431610942, + "grad_norm": 0.07479248574935365, + "learning_rate": 6.9372719834081345e-06, + "loss": 0.5628, + "step": 3083 + }, + { + "epoch": 1.498966565349544, + "grad_norm": 0.07041983721972382, + "learning_rate": 6.935506727257374e-06, + "loss": 0.5459, + "step": 3084 + }, + { + "epoch": 1.498966565349544, + "eval_loss": 0.5784030556678772, + "eval_runtime": 105.0371, + "eval_samples_per_second": 288.974, + "eval_steps_per_second": 36.13, + "step": 3084 + }, + { + "epoch": 1.499452887537994, + "grad_norm": 0.0724537077619785, + "learning_rate": 6.9337411872865316e-06, + "loss": 0.5684, + "step": 3085 + }, + { + "epoch": 1.4999392097264437, + "grad_norm": 0.0727640790978791, + "learning_rate": 6.931975363754502e-06, + "loss": 0.5349, + "step": 3086 + }, + { + "epoch": 1.5004255319148936, + "grad_norm": 0.06949866650222498, + "learning_rate": 6.930209256920224e-06, + "loss": 0.5249, + "step": 3087 + }, + { + "epoch": 1.5009118541033435, + "grad_norm": 0.08516437144935896, + "learning_rate": 6.928442867042679e-06, + "loss": 0.5693, + "step": 3088 + }, + { + "epoch": 1.5013981762917932, + "grad_norm": 0.09787544489121727, + "learning_rate": 6.926676194380884e-06, + "loss": 0.5678, + "step": 3089 + }, + { + "epoch": 1.5018844984802433, + "grad_norm": 0.0739586349387754, + "learning_rate": 6.924909239193905e-06, + "loss": 0.5712, + "step": 3090 + }, + { + "epoch": 1.502370820668693, + "grad_norm": 0.07316698456215946, + "learning_rate": 6.9231420017408456e-06, + "loss": 0.5736, + "step": 3091 + }, + { + "epoch": 1.502857142857143, + "grad_norm": 0.07256762125561368, + "learning_rate": 6.921374482280851e-06, + "loss": 0.5512, + "step": 3092 + }, + { + "epoch": 1.5033434650455928, + "grad_norm": 0.07538508659684552, + "learning_rate": 6.9196066810731055e-06, + "loss": 0.5792, + "step": 3093 + }, + { + "epoch": 1.5038297872340425, + "grad_norm": 0.07278086333803305, + "learning_rate": 6.9178385983768396e-06, + "loss": 0.556, + "step": 3094 + }, + { + "epoch": 1.5043161094224924, + "grad_norm": 0.07541265418038101, + "learning_rate": 6.916070234451321e-06, + "loss": 0.5657, + "step": 3095 + }, + { + "epoch": 1.5048024316109423, + "grad_norm": 0.06879095829094561, + "learning_rate": 6.914301589555862e-06, + "loss": 0.5386, + "step": 3096 + }, + { + "epoch": 1.505288753799392, + "grad_norm": 0.0713171273585956, + "learning_rate": 6.912532663949813e-06, + "loss": 0.5435, + "step": 3097 + }, + { + "epoch": 1.505775075987842, + "grad_norm": 0.07375715098468083, + "learning_rate": 6.910763457892567e-06, + "loss": 0.5394, + "step": 3098 + }, + { + "epoch": 1.5062613981762918, + "grad_norm": 0.0734911066106034, + "learning_rate": 6.9089939716435575e-06, + "loss": 0.5979, + "step": 3099 + }, + { + "epoch": 1.5067477203647417, + "grad_norm": 0.0715276213150515, + "learning_rate": 6.90722420546226e-06, + "loss": 0.5367, + "step": 3100 + }, + { + "epoch": 1.5072340425531916, + "grad_norm": 0.07347299469010009, + "learning_rate": 6.905454159608191e-06, + "loss": 0.5526, + "step": 3101 + }, + { + "epoch": 1.5077203647416413, + "grad_norm": 0.07181239218242261, + "learning_rate": 6.903683834340909e-06, + "loss": 0.52, + "step": 3102 + }, + { + "epoch": 1.5082066869300912, + "grad_norm": 0.07594578773348291, + "learning_rate": 6.901913229920008e-06, + "loss": 0.5461, + "step": 3103 + }, + { + "epoch": 1.508693009118541, + "grad_norm": 0.07081633161367618, + "learning_rate": 6.90014234660513e-06, + "loss": 0.5589, + "step": 3104 + }, + { + "epoch": 1.5091793313069908, + "grad_norm": 0.07092879956122423, + "learning_rate": 6.898371184655955e-06, + "loss": 0.5416, + "step": 3105 + }, + { + "epoch": 1.5096656534954407, + "grad_norm": 0.07389180320722963, + "learning_rate": 6.896599744332204e-06, + "loss": 0.5288, + "step": 3106 + }, + { + "epoch": 1.5101519756838906, + "grad_norm": 0.07153559585772981, + "learning_rate": 6.894828025893636e-06, + "loss": 0.5365, + "step": 3107 + }, + { + "epoch": 1.5106382978723403, + "grad_norm": 0.07476698559294594, + "learning_rate": 6.893056029600056e-06, + "loss": 0.5884, + "step": 3108 + }, + { + "epoch": 1.5111246200607904, + "grad_norm": 0.07561354566118965, + "learning_rate": 6.891283755711309e-06, + "loss": 0.5616, + "step": 3109 + }, + { + "epoch": 1.51161094224924, + "grad_norm": 0.07696656598183628, + "learning_rate": 6.889511204487273e-06, + "loss": 0.5609, + "step": 3110 + }, + { + "epoch": 1.51209726443769, + "grad_norm": 0.07709441751978834, + "learning_rate": 6.887738376187876e-06, + "loss": 0.5664, + "step": 3111 + }, + { + "epoch": 1.51258358662614, + "grad_norm": 0.07560704909968341, + "learning_rate": 6.8859652710730826e-06, + "loss": 0.5808, + "step": 3112 + }, + { + "epoch": 1.5130699088145896, + "grad_norm": 0.07165194474765134, + "learning_rate": 6.8841918894028995e-06, + "loss": 0.5459, + "step": 3113 + }, + { + "epoch": 1.5135562310030395, + "grad_norm": 0.07183554804472318, + "learning_rate": 6.882418231437371e-06, + "loss": 0.526, + "step": 3114 + }, + { + "epoch": 1.5140425531914894, + "grad_norm": 0.0707048188502243, + "learning_rate": 6.880644297436587e-06, + "loss": 0.5478, + "step": 3115 + }, + { + "epoch": 1.514528875379939, + "grad_norm": 0.07352916573160088, + "learning_rate": 6.878870087660673e-06, + "loss": 0.5512, + "step": 3116 + }, + { + "epoch": 1.5150151975683892, + "grad_norm": 0.07437583341490736, + "learning_rate": 6.877095602369796e-06, + "loss": 0.5584, + "step": 3117 + }, + { + "epoch": 1.5155015197568389, + "grad_norm": 0.07493668972429506, + "learning_rate": 6.8753208418241645e-06, + "loss": 0.5848, + "step": 3118 + }, + { + "epoch": 1.5159878419452888, + "grad_norm": 0.0743196163226166, + "learning_rate": 6.873545806284027e-06, + "loss": 0.5635, + "step": 3119 + }, + { + "epoch": 1.5164741641337387, + "grad_norm": 0.07128069207444965, + "learning_rate": 6.871770496009671e-06, + "loss": 0.5334, + "step": 3120 + }, + { + "epoch": 1.5169604863221884, + "grad_norm": 0.07079144509858189, + "learning_rate": 6.869994911261429e-06, + "loss": 0.5389, + "step": 3121 + }, + { + "epoch": 1.5174468085106383, + "grad_norm": 0.06928529093219288, + "learning_rate": 6.868219052299669e-06, + "loss": 0.5367, + "step": 3122 + }, + { + "epoch": 1.5179331306990882, + "grad_norm": 0.07305972708173213, + "learning_rate": 6.866442919384799e-06, + "loss": 0.5635, + "step": 3123 + }, + { + "epoch": 1.5184194528875379, + "grad_norm": 0.07548168694245962, + "learning_rate": 6.8646665127772715e-06, + "loss": 0.5544, + "step": 3124 + }, + { + "epoch": 1.518905775075988, + "grad_norm": 0.07208763548114153, + "learning_rate": 6.862889832737573e-06, + "loss": 0.5153, + "step": 3125 + }, + { + "epoch": 1.5193920972644377, + "grad_norm": 0.07318620215535641, + "learning_rate": 6.8611128795262345e-06, + "loss": 0.563, + "step": 3126 + }, + { + "epoch": 1.5198784194528874, + "grad_norm": 0.07170851941567723, + "learning_rate": 6.859335653403828e-06, + "loss": 0.5543, + "step": 3127 + }, + { + "epoch": 1.5203647416413375, + "grad_norm": 0.07152631565596852, + "learning_rate": 6.8575581546309614e-06, + "loss": 0.5534, + "step": 3128 + }, + { + "epoch": 1.5208510638297872, + "grad_norm": 0.07341010178694772, + "learning_rate": 6.855780383468285e-06, + "loss": 0.548, + "step": 3129 + }, + { + "epoch": 1.521337386018237, + "grad_norm": 0.06967523281033652, + "learning_rate": 6.854002340176489e-06, + "loss": 0.5118, + "step": 3130 + }, + { + "epoch": 1.521823708206687, + "grad_norm": 0.07071033144460957, + "learning_rate": 6.852224025016304e-06, + "loss": 0.5324, + "step": 3131 + }, + { + "epoch": 1.5223100303951367, + "grad_norm": 0.07468146926454766, + "learning_rate": 6.8504454382484995e-06, + "loss": 0.5541, + "step": 3132 + }, + { + "epoch": 1.5227963525835866, + "grad_norm": 0.07320351831902266, + "learning_rate": 6.848666580133885e-06, + "loss": 0.5815, + "step": 3133 + }, + { + "epoch": 1.5232826747720365, + "grad_norm": 0.06970478739345275, + "learning_rate": 6.846887450933308e-06, + "loss": 0.5386, + "step": 3134 + }, + { + "epoch": 1.5237689969604862, + "grad_norm": 0.07484435534881968, + "learning_rate": 6.8451080509076594e-06, + "loss": 0.5607, + "step": 3135 + }, + { + "epoch": 1.5242553191489363, + "grad_norm": 0.07175184017360012, + "learning_rate": 6.843328380317869e-06, + "loss": 0.5144, + "step": 3136 + }, + { + "epoch": 1.524741641337386, + "grad_norm": 0.07500237812607986, + "learning_rate": 6.841548439424904e-06, + "loss": 0.5487, + "step": 3137 + }, + { + "epoch": 1.525227963525836, + "grad_norm": 0.0710474445588129, + "learning_rate": 6.83976822848977e-06, + "loss": 0.5205, + "step": 3138 + }, + { + "epoch": 1.5257142857142858, + "grad_norm": 0.07277617539552242, + "learning_rate": 6.83798774777352e-06, + "loss": 0.5417, + "step": 3139 + }, + { + "epoch": 1.5262006079027355, + "grad_norm": 0.0719445412108135, + "learning_rate": 6.836206997537237e-06, + "loss": 0.5196, + "step": 3140 + }, + { + "epoch": 1.5266869300911854, + "grad_norm": 0.07054146362597816, + "learning_rate": 6.834425978042049e-06, + "loss": 0.5617, + "step": 3141 + }, + { + "epoch": 1.5271732522796353, + "grad_norm": 0.0725585821349836, + "learning_rate": 6.832644689549124e-06, + "loss": 0.5749, + "step": 3142 + }, + { + "epoch": 1.527659574468085, + "grad_norm": 0.07509877595405921, + "learning_rate": 6.830863132319666e-06, + "loss": 0.5929, + "step": 3143 + }, + { + "epoch": 1.528145896656535, + "grad_norm": 0.07183585737634014, + "learning_rate": 6.82908130661492e-06, + "loss": 0.5405, + "step": 3144 + }, + { + "epoch": 1.5286322188449848, + "grad_norm": 0.07344294571363798, + "learning_rate": 6.827299212696171e-06, + "loss": 0.4908, + "step": 3145 + }, + { + "epoch": 1.5291185410334347, + "grad_norm": 0.07495323196219186, + "learning_rate": 6.8255168508247425e-06, + "loss": 0.5787, + "step": 3146 + }, + { + "epoch": 1.5296048632218846, + "grad_norm": 0.07376405243380828, + "learning_rate": 6.823734221261999e-06, + "loss": 0.5659, + "step": 3147 + }, + { + "epoch": 1.5300911854103343, + "grad_norm": 0.07002837051559817, + "learning_rate": 6.821951324269341e-06, + "loss": 0.513, + "step": 3148 + }, + { + "epoch": 1.5305775075987842, + "grad_norm": 0.07194280447730006, + "learning_rate": 6.820168160108211e-06, + "loss": 0.5421, + "step": 3149 + }, + { + "epoch": 1.531063829787234, + "grad_norm": 0.07343311766494003, + "learning_rate": 6.818384729040091e-06, + "loss": 0.5432, + "step": 3150 + }, + { + "epoch": 1.5315501519756838, + "grad_norm": 0.07414031098421327, + "learning_rate": 6.816601031326498e-06, + "loss": 0.5741, + "step": 3151 + }, + { + "epoch": 1.5320364741641337, + "grad_norm": 0.07439420780960881, + "learning_rate": 6.814817067228993e-06, + "loss": 0.5485, + "step": 3152 + }, + { + "epoch": 1.5325227963525836, + "grad_norm": 0.07390182381954416, + "learning_rate": 6.8130328370091745e-06, + "loss": 0.5258, + "step": 3153 + }, + { + "epoch": 1.5330091185410333, + "grad_norm": 0.0718458946892044, + "learning_rate": 6.811248340928678e-06, + "loss": 0.5466, + "step": 3154 + }, + { + "epoch": 1.5334954407294834, + "grad_norm": 0.07234075836847709, + "learning_rate": 6.809463579249182e-06, + "loss": 0.5166, + "step": 3155 + }, + { + "epoch": 1.533981762917933, + "grad_norm": 0.07406731431262802, + "learning_rate": 6.807678552232397e-06, + "loss": 0.5703, + "step": 3156 + }, + { + "epoch": 1.534468085106383, + "grad_norm": 0.07266559850895499, + "learning_rate": 6.8058932601400815e-06, + "loss": 0.5746, + "step": 3157 + }, + { + "epoch": 1.534954407294833, + "grad_norm": 0.07229886526051414, + "learning_rate": 6.804107703234026e-06, + "loss": 0.5408, + "step": 3158 + }, + { + "epoch": 1.5354407294832826, + "grad_norm": 0.06722723172174702, + "learning_rate": 6.802321881776064e-06, + "loss": 0.4872, + "step": 3159 + }, + { + "epoch": 1.5359270516717325, + "grad_norm": 0.07166558743849152, + "learning_rate": 6.800535796028064e-06, + "loss": 0.4936, + "step": 3160 + }, + { + "epoch": 1.5364133738601824, + "grad_norm": 0.07055004239131063, + "learning_rate": 6.798749446251935e-06, + "loss": 0.5706, + "step": 3161 + }, + { + "epoch": 1.536899696048632, + "grad_norm": 0.07395142364101165, + "learning_rate": 6.796962832709628e-06, + "loss": 0.5743, + "step": 3162 + }, + { + "epoch": 1.5373860182370822, + "grad_norm": 0.07579883200079188, + "learning_rate": 6.795175955663127e-06, + "loss": 0.5919, + "step": 3163 + }, + { + "epoch": 1.537872340425532, + "grad_norm": 0.07104115584521561, + "learning_rate": 6.793388815374458e-06, + "loss": 0.5344, + "step": 3164 + }, + { + "epoch": 1.5383586626139818, + "grad_norm": 0.07004090427406262, + "learning_rate": 6.791601412105682e-06, + "loss": 0.5046, + "step": 3165 + }, + { + "epoch": 1.5388449848024317, + "grad_norm": 0.07711177709162328, + "learning_rate": 6.789813746118905e-06, + "loss": 0.5985, + "step": 3166 + }, + { + "epoch": 1.5393313069908814, + "grad_norm": 0.07618121127762313, + "learning_rate": 6.788025817676267e-06, + "loss": 0.5577, + "step": 3167 + }, + { + "epoch": 1.5398176291793313, + "grad_norm": 0.07833061904585184, + "learning_rate": 6.7862376270399475e-06, + "loss": 0.5759, + "step": 3168 + }, + { + "epoch": 1.5403039513677812, + "grad_norm": 0.07148994344193592, + "learning_rate": 6.784449174472164e-06, + "loss": 0.5399, + "step": 3169 + }, + { + "epoch": 1.5407902735562309, + "grad_norm": 0.07220829942206815, + "learning_rate": 6.782660460235174e-06, + "loss": 0.5419, + "step": 3170 + }, + { + "epoch": 1.541276595744681, + "grad_norm": 0.07271922423562557, + "learning_rate": 6.78087148459127e-06, + "loss": 0.5735, + "step": 3171 + }, + { + "epoch": 1.5417629179331307, + "grad_norm": 0.0723293628145058, + "learning_rate": 6.779082247802785e-06, + "loss": 0.5329, + "step": 3172 + }, + { + "epoch": 1.5422492401215806, + "grad_norm": 0.07432241856381423, + "learning_rate": 6.777292750132092e-06, + "loss": 0.5704, + "step": 3173 + }, + { + "epoch": 1.5427355623100305, + "grad_norm": 0.07702546353732548, + "learning_rate": 6.775502991841599e-06, + "loss": 0.5194, + "step": 3174 + }, + { + "epoch": 1.5432218844984802, + "grad_norm": 0.07564757115329124, + "learning_rate": 6.773712973193756e-06, + "loss": 0.5905, + "step": 3175 + }, + { + "epoch": 1.54370820668693, + "grad_norm": 0.07058217865105539, + "learning_rate": 6.771922694451045e-06, + "loss": 0.5488, + "step": 3176 + }, + { + "epoch": 1.54419452887538, + "grad_norm": 0.07177721713578614, + "learning_rate": 6.770132155875994e-06, + "loss": 0.5229, + "step": 3177 + }, + { + "epoch": 1.5446808510638297, + "grad_norm": 0.07540765684642103, + "learning_rate": 6.768341357731164e-06, + "loss": 0.5512, + "step": 3178 + }, + { + "epoch": 1.5451671732522796, + "grad_norm": 0.07355782345854359, + "learning_rate": 6.766550300279154e-06, + "loss": 0.5418, + "step": 3179 + }, + { + "epoch": 1.5456534954407295, + "grad_norm": 0.07247766030644205, + "learning_rate": 6.764758983782603e-06, + "loss": 0.564, + "step": 3180 + }, + { + "epoch": 1.5461398176291792, + "grad_norm": 0.07150159358993446, + "learning_rate": 6.762967408504188e-06, + "loss": 0.5636, + "step": 3181 + }, + { + "epoch": 1.5466261398176293, + "grad_norm": 0.07301610043205627, + "learning_rate": 6.761175574706621e-06, + "loss": 0.5299, + "step": 3182 + }, + { + "epoch": 1.547112462006079, + "grad_norm": 0.07866225329389155, + "learning_rate": 6.759383482652655e-06, + "loss": 0.6068, + "step": 3183 + }, + { + "epoch": 1.547598784194529, + "grad_norm": 0.07474911744879517, + "learning_rate": 6.757591132605082e-06, + "loss": 0.5613, + "step": 3184 + }, + { + "epoch": 1.5480851063829788, + "grad_norm": 0.07609140762383507, + "learning_rate": 6.755798524826728e-06, + "loss": 0.5458, + "step": 3185 + }, + { + "epoch": 1.5485714285714285, + "grad_norm": 0.0711891370539839, + "learning_rate": 6.7540056595804585e-06, + "loss": 0.5663, + "step": 3186 + }, + { + "epoch": 1.5490577507598784, + "grad_norm": 0.07343599011323172, + "learning_rate": 6.752212537129177e-06, + "loss": 0.5389, + "step": 3187 + }, + { + "epoch": 1.5495440729483283, + "grad_norm": 0.07158832676612963, + "learning_rate": 6.750419157735823e-06, + "loss": 0.5403, + "step": 3188 + }, + { + "epoch": 1.550030395136778, + "grad_norm": 0.07117522223174699, + "learning_rate": 6.748625521663379e-06, + "loss": 0.5468, + "step": 3189 + }, + { + "epoch": 1.550516717325228, + "grad_norm": 0.07567464696082797, + "learning_rate": 6.7468316291748596e-06, + "loss": 0.6244, + "step": 3190 + }, + { + "epoch": 1.5510030395136778, + "grad_norm": 0.07163995518174511, + "learning_rate": 6.745037480533316e-06, + "loss": 0.5111, + "step": 3191 + }, + { + "epoch": 1.5514893617021277, + "grad_norm": 0.07346488860301928, + "learning_rate": 6.743243076001844e-06, + "loss": 0.583, + "step": 3192 + }, + { + "epoch": 1.5519756838905776, + "grad_norm": 0.07203699435665942, + "learning_rate": 6.74144841584357e-06, + "loss": 0.5643, + "step": 3193 + }, + { + "epoch": 1.5524620060790273, + "grad_norm": 0.07025097264787714, + "learning_rate": 6.739653500321661e-06, + "loss": 0.5587, + "step": 3194 + }, + { + "epoch": 1.5529483282674772, + "grad_norm": 0.0732974312736898, + "learning_rate": 6.737858329699322e-06, + "loss": 0.5742, + "step": 3195 + }, + { + "epoch": 1.553434650455927, + "grad_norm": 0.07328227920673683, + "learning_rate": 6.736062904239793e-06, + "loss": 0.5443, + "step": 3196 + }, + { + "epoch": 1.5539209726443768, + "grad_norm": 0.07330600466366281, + "learning_rate": 6.734267224206355e-06, + "loss": 0.53, + "step": 3197 + }, + { + "epoch": 1.554407294832827, + "grad_norm": 0.08022963931480914, + "learning_rate": 6.73247128986232e-06, + "loss": 0.5316, + "step": 3198 + }, + { + "epoch": 1.5548936170212766, + "grad_norm": 0.07497996039847053, + "learning_rate": 6.730675101471044e-06, + "loss": 0.5856, + "step": 3199 + }, + { + "epoch": 1.5553799392097263, + "grad_norm": 0.07142146001295316, + "learning_rate": 6.72887865929592e-06, + "loss": 0.5703, + "step": 3200 + }, + { + "epoch": 1.5558662613981764, + "grad_norm": 0.0727213804568006, + "learning_rate": 6.727081963600371e-06, + "loss": 0.5601, + "step": 3201 + }, + { + "epoch": 1.556352583586626, + "grad_norm": 0.07280318451316951, + "learning_rate": 6.725285014647866e-06, + "loss": 0.5552, + "step": 3202 + }, + { + "epoch": 1.556838905775076, + "grad_norm": 0.07444714144621148, + "learning_rate": 6.723487812701904e-06, + "loss": 0.5708, + "step": 3203 + }, + { + "epoch": 1.557325227963526, + "grad_norm": 0.07232298868451444, + "learning_rate": 6.721690358026027e-06, + "loss": 0.5745, + "step": 3204 + }, + { + "epoch": 1.5578115501519756, + "grad_norm": 0.07116376868026475, + "learning_rate": 6.7198926508838095e-06, + "loss": 0.5391, + "step": 3205 + }, + { + "epoch": 1.5582978723404255, + "grad_norm": 0.07234229019563575, + "learning_rate": 6.718094691538866e-06, + "loss": 0.532, + "step": 3206 + }, + { + "epoch": 1.5587841945288754, + "grad_norm": 0.07016951902865325, + "learning_rate": 6.716296480254845e-06, + "loss": 0.5298, + "step": 3207 + }, + { + "epoch": 1.559270516717325, + "grad_norm": 0.07235586125803008, + "learning_rate": 6.714498017295436e-06, + "loss": 0.5425, + "step": 3208 + }, + { + "epoch": 1.5597568389057752, + "grad_norm": 0.31434103642050903, + "learning_rate": 6.712699302924362e-06, + "loss": 0.5588, + "step": 3209 + }, + { + "epoch": 1.560243161094225, + "grad_norm": 0.07314316819547519, + "learning_rate": 6.7109003374053834e-06, + "loss": 0.5258, + "step": 3210 + }, + { + "epoch": 1.5607294832826748, + "grad_norm": 0.07171419228159252, + "learning_rate": 6.7091011210023e-06, + "loss": 0.5983, + "step": 3211 + }, + { + "epoch": 1.5612158054711247, + "grad_norm": 0.07730277799525664, + "learning_rate": 6.707301653978945e-06, + "loss": 0.5976, + "step": 3212 + }, + { + "epoch": 1.5617021276595744, + "grad_norm": 0.07173323902425, + "learning_rate": 6.70550193659919e-06, + "loss": 0.5391, + "step": 3213 + }, + { + "epoch": 1.5621884498480243, + "grad_norm": 0.07333917368499077, + "learning_rate": 6.703701969126944e-06, + "loss": 0.5664, + "step": 3214 + }, + { + "epoch": 1.5626747720364742, + "grad_norm": 0.07576083610894023, + "learning_rate": 6.70190175182615e-06, + "loss": 0.5993, + "step": 3215 + }, + { + "epoch": 1.5631610942249239, + "grad_norm": 0.07348457237728495, + "learning_rate": 6.700101284960792e-06, + "loss": 0.5528, + "step": 3216 + }, + { + "epoch": 1.563647416413374, + "grad_norm": 0.06827242260955661, + "learning_rate": 6.698300568794884e-06, + "loss": 0.502, + "step": 3217 + }, + { + "epoch": 1.5641337386018237, + "grad_norm": 0.07933834881970789, + "learning_rate": 6.696499603592486e-06, + "loss": 0.5412, + "step": 3218 + }, + { + "epoch": 1.5646200607902736, + "grad_norm": 0.07488163723208957, + "learning_rate": 6.694698389617684e-06, + "loss": 0.5373, + "step": 3219 + }, + { + "epoch": 1.5651063829787235, + "grad_norm": 0.0717208267990933, + "learning_rate": 6.6928969271346065e-06, + "loss": 0.5076, + "step": 3220 + }, + { + "epoch": 1.5655927051671732, + "grad_norm": 0.07476304552491932, + "learning_rate": 6.691095216407422e-06, + "loss": 0.5445, + "step": 3221 + }, + { + "epoch": 1.566079027355623, + "grad_norm": 0.0719274754530095, + "learning_rate": 6.689293257700325e-06, + "loss": 0.5937, + "step": 3222 + }, + { + "epoch": 1.566565349544073, + "grad_norm": 0.07090253390507247, + "learning_rate": 6.687491051277557e-06, + "loss": 0.5417, + "step": 3223 + }, + { + "epoch": 1.5670516717325227, + "grad_norm": 0.07085954313742286, + "learning_rate": 6.6856885974033895e-06, + "loss": 0.5494, + "step": 3224 + }, + { + "epoch": 1.5675379939209726, + "grad_norm": 0.07442683356199686, + "learning_rate": 6.6838858963421295e-06, + "loss": 0.5335, + "step": 3225 + }, + { + "epoch": 1.5680243161094225, + "grad_norm": 0.07174191055132459, + "learning_rate": 6.682082948358125e-06, + "loss": 0.5689, + "step": 3226 + }, + { + "epoch": 1.5685106382978722, + "grad_norm": 0.07060495703511777, + "learning_rate": 6.680279753715758e-06, + "loss": 0.5277, + "step": 3227 + }, + { + "epoch": 1.5689969604863223, + "grad_norm": 0.076415025813712, + "learning_rate": 6.678476312679446e-06, + "loss": 0.5711, + "step": 3228 + }, + { + "epoch": 1.569483282674772, + "grad_norm": 0.0714782210933602, + "learning_rate": 6.676672625513642e-06, + "loss": 0.5532, + "step": 3229 + }, + { + "epoch": 1.569969604863222, + "grad_norm": 0.07130813186999481, + "learning_rate": 6.674868692482839e-06, + "loss": 0.5495, + "step": 3230 + }, + { + "epoch": 1.5704559270516718, + "grad_norm": 0.0706254640829068, + "learning_rate": 6.67306451385156e-06, + "loss": 0.5257, + "step": 3231 + }, + { + "epoch": 1.5709422492401215, + "grad_norm": 0.07553240429440361, + "learning_rate": 6.6712600898843705e-06, + "loss": 0.5776, + "step": 3232 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.07146520155715774, + "learning_rate": 6.6694554208458665e-06, + "loss": 0.5509, + "step": 3233 + }, + { + "epoch": 1.5719148936170213, + "grad_norm": 0.07410855691473235, + "learning_rate": 6.6676505070006826e-06, + "loss": 0.5337, + "step": 3234 + }, + { + "epoch": 1.572401215805471, + "grad_norm": 0.07384514727552748, + "learning_rate": 6.6658453486134885e-06, + "loss": 0.5374, + "step": 3235 + }, + { + "epoch": 1.5728875379939211, + "grad_norm": 0.070967292667365, + "learning_rate": 6.6640399459489924e-06, + "loss": 0.5598, + "step": 3236 + }, + { + "epoch": 1.5733738601823708, + "grad_norm": 0.07181216459562484, + "learning_rate": 6.662234299271934e-06, + "loss": 0.531, + "step": 3237 + }, + { + "epoch": 1.5738601823708207, + "grad_norm": 0.07029494882217778, + "learning_rate": 6.660428408847093e-06, + "loss": 0.553, + "step": 3238 + }, + { + "epoch": 1.5743465045592706, + "grad_norm": 0.07012420579724925, + "learning_rate": 6.658622274939279e-06, + "loss": 0.5685, + "step": 3239 + }, + { + "epoch": 1.5748328267477203, + "grad_norm": 0.07660643213081383, + "learning_rate": 6.6568158978133455e-06, + "loss": 0.5467, + "step": 3240 + }, + { + "epoch": 1.5753191489361702, + "grad_norm": 0.07149805914337362, + "learning_rate": 6.655009277734174e-06, + "loss": 0.5662, + "step": 3241 + }, + { + "epoch": 1.57580547112462, + "grad_norm": 0.07336476678608676, + "learning_rate": 6.653202414966685e-06, + "loss": 0.5254, + "step": 3242 + }, + { + "epoch": 1.5762917933130698, + "grad_norm": 0.07460358075613517, + "learning_rate": 6.651395309775837e-06, + "loss": 0.5563, + "step": 3243 + }, + { + "epoch": 1.57677811550152, + "grad_norm": 0.0729305948739106, + "learning_rate": 6.649587962426618e-06, + "loss": 0.5279, + "step": 3244 + }, + { + "epoch": 1.5772644376899696, + "grad_norm": 0.07018140427685124, + "learning_rate": 6.647780373184056e-06, + "loss": 0.5273, + "step": 3245 + }, + { + "epoch": 1.5777507598784195, + "grad_norm": 0.07181111863298231, + "learning_rate": 6.645972542313216e-06, + "loss": 0.5563, + "step": 3246 + }, + { + "epoch": 1.5782370820668694, + "grad_norm": 0.07327155633461549, + "learning_rate": 6.644164470079193e-06, + "loss": 0.5683, + "step": 3247 + }, + { + "epoch": 1.578723404255319, + "grad_norm": 0.07065978850029275, + "learning_rate": 6.642356156747122e-06, + "loss": 0.5258, + "step": 3248 + }, + { + "epoch": 1.579209726443769, + "grad_norm": 0.07304613705470726, + "learning_rate": 6.64054760258217e-06, + "loss": 0.5811, + "step": 3249 + }, + { + "epoch": 1.579696048632219, + "grad_norm": 0.0724926883521955, + "learning_rate": 6.6387388078495405e-06, + "loss": 0.5512, + "step": 3250 + }, + { + "epoch": 1.5801823708206686, + "grad_norm": 0.07417419833082137, + "learning_rate": 6.636929772814476e-06, + "loss": 0.5658, + "step": 3251 + }, + { + "epoch": 1.5806686930091185, + "grad_norm": 0.0774684680523497, + "learning_rate": 6.635120497742249e-06, + "loss": 0.5548, + "step": 3252 + }, + { + "epoch": 1.5811550151975684, + "grad_norm": 0.07144822690310161, + "learning_rate": 6.633310982898168e-06, + "loss": 0.5408, + "step": 3253 + }, + { + "epoch": 1.581641337386018, + "grad_norm": 0.07423440047920843, + "learning_rate": 6.63150122854758e-06, + "loss": 0.5372, + "step": 3254 + }, + { + "epoch": 1.5821276595744682, + "grad_norm": 0.07335562427886222, + "learning_rate": 6.629691234955863e-06, + "loss": 0.5467, + "step": 3255 + }, + { + "epoch": 1.582613981762918, + "grad_norm": 0.07156578625257565, + "learning_rate": 6.627881002388431e-06, + "loss": 0.5274, + "step": 3256 + }, + { + "epoch": 1.5831003039513678, + "grad_norm": 0.07217948467383893, + "learning_rate": 6.626070531110738e-06, + "loss": 0.5512, + "step": 3257 + }, + { + "epoch": 1.5835866261398177, + "grad_norm": 0.07127305873656875, + "learning_rate": 6.624259821388266e-06, + "loss": 0.5648, + "step": 3258 + }, + { + "epoch": 1.5840729483282674, + "grad_norm": 0.07814092976868031, + "learning_rate": 6.622448873486536e-06, + "loss": 0.5462, + "step": 3259 + }, + { + "epoch": 1.5845592705167173, + "grad_norm": 0.081895010949251, + "learning_rate": 6.620637687671103e-06, + "loss": 0.609, + "step": 3260 + }, + { + "epoch": 1.5850455927051672, + "grad_norm": 0.07393576618543267, + "learning_rate": 6.6188262642075566e-06, + "loss": 0.5778, + "step": 3261 + }, + { + "epoch": 1.585531914893617, + "grad_norm": 0.07093869832903571, + "learning_rate": 6.617014603361522e-06, + "loss": 0.5517, + "step": 3262 + }, + { + "epoch": 1.586018237082067, + "grad_norm": 0.07267524584484943, + "learning_rate": 6.6152027053986575e-06, + "loss": 0.5894, + "step": 3263 + }, + { + "epoch": 1.5865045592705167, + "grad_norm": 0.07247215403176348, + "learning_rate": 6.613390570584659e-06, + "loss": 0.5243, + "step": 3264 + }, + { + "epoch": 1.5869908814589666, + "grad_norm": 0.07539930335319005, + "learning_rate": 6.6115781991852535e-06, + "loss": 0.5645, + "step": 3265 + }, + { + "epoch": 1.5874772036474165, + "grad_norm": 0.07676592524900043, + "learning_rate": 6.609765591466206e-06, + "loss": 0.5319, + "step": 3266 + }, + { + "epoch": 1.5879635258358662, + "grad_norm": 0.07200588528709602, + "learning_rate": 6.607952747693315e-06, + "loss": 0.5022, + "step": 3267 + }, + { + "epoch": 1.588449848024316, + "grad_norm": 0.07289569684017909, + "learning_rate": 6.606139668132412e-06, + "loss": 0.5377, + "step": 3268 + }, + { + "epoch": 1.588936170212766, + "grad_norm": 0.0737518333531295, + "learning_rate": 6.604326353049368e-06, + "loss": 0.5526, + "step": 3269 + }, + { + "epoch": 1.5894224924012157, + "grad_norm": 0.07384929929083588, + "learning_rate": 6.602512802710082e-06, + "loss": 0.5914, + "step": 3270 + }, + { + "epoch": 1.5899088145896658, + "grad_norm": 0.07307112002147667, + "learning_rate": 6.60069901738049e-06, + "loss": 0.5229, + "step": 3271 + }, + { + "epoch": 1.5903951367781155, + "grad_norm": 0.07293252687403938, + "learning_rate": 6.598884997326564e-06, + "loss": 0.546, + "step": 3272 + }, + { + "epoch": 1.5908814589665652, + "grad_norm": 0.07904077204692532, + "learning_rate": 6.597070742814311e-06, + "loss": 0.6021, + "step": 3273 + }, + { + "epoch": 1.5913677811550153, + "grad_norm": 0.07341496526115474, + "learning_rate": 6.595256254109768e-06, + "loss": 0.5608, + "step": 3274 + }, + { + "epoch": 1.591854103343465, + "grad_norm": 0.07015346872022103, + "learning_rate": 6.593441531479011e-06, + "loss": 0.5338, + "step": 3275 + }, + { + "epoch": 1.592340425531915, + "grad_norm": 0.0765460423403433, + "learning_rate": 6.591626575188149e-06, + "loss": 0.571, + "step": 3276 + }, + { + "epoch": 1.5928267477203648, + "grad_norm": 0.07489506422806602, + "learning_rate": 6.589811385503324e-06, + "loss": 0.5581, + "step": 3277 + }, + { + "epoch": 1.5933130699088145, + "grad_norm": 0.07201722047185763, + "learning_rate": 6.587995962690712e-06, + "loss": 0.5585, + "step": 3278 + }, + { + "epoch": 1.5937993920972644, + "grad_norm": 0.07405049988261514, + "learning_rate": 6.586180307016525e-06, + "loss": 0.5652, + "step": 3279 + }, + { + "epoch": 1.5942857142857143, + "grad_norm": 0.07134103679162339, + "learning_rate": 6.584364418747009e-06, + "loss": 0.5796, + "step": 3280 + }, + { + "epoch": 1.594772036474164, + "grad_norm": 0.07238724185584482, + "learning_rate": 6.582548298148442e-06, + "loss": 0.5033, + "step": 3281 + }, + { + "epoch": 1.5952583586626141, + "grad_norm": 0.07164794837473736, + "learning_rate": 6.5807319454871385e-06, + "loss": 0.5508, + "step": 3282 + }, + { + "epoch": 1.5957446808510638, + "grad_norm": 0.07293965281400856, + "learning_rate": 6.5789153610294445e-06, + "loss": 0.5572, + "step": 3283 + }, + { + "epoch": 1.5962310030395137, + "grad_norm": 0.0727976292071456, + "learning_rate": 6.5770985450417445e-06, + "loss": 0.5664, + "step": 3284 + }, + { + "epoch": 1.5967173252279636, + "grad_norm": 0.07186216370513868, + "learning_rate": 6.575281497790451e-06, + "loss": 0.5192, + "step": 3285 + }, + { + "epoch": 1.5972036474164133, + "grad_norm": 0.07395938092943025, + "learning_rate": 6.5734642195420136e-06, + "loss": 0.5772, + "step": 3286 + }, + { + "epoch": 1.5976899696048632, + "grad_norm": 0.0699872818752595, + "learning_rate": 6.571646710562918e-06, + "loss": 0.537, + "step": 3287 + }, + { + "epoch": 1.598176291793313, + "grad_norm": 0.0739518065036882, + "learning_rate": 6.5698289711196785e-06, + "loss": 0.5919, + "step": 3288 + }, + { + "epoch": 1.5986626139817628, + "grad_norm": 0.07191872355669547, + "learning_rate": 6.568011001478846e-06, + "loss": 0.5667, + "step": 3289 + }, + { + "epoch": 1.599148936170213, + "grad_norm": 0.07340443237635351, + "learning_rate": 6.5661928019070075e-06, + "loss": 0.5136, + "step": 3290 + }, + { + "epoch": 1.5996352583586626, + "grad_norm": 0.07368969597025045, + "learning_rate": 6.56437437267078e-06, + "loss": 0.5676, + "step": 3291 + }, + { + "epoch": 1.6001215805471125, + "grad_norm": 0.07088516554493693, + "learning_rate": 6.562555714036814e-06, + "loss": 0.5558, + "step": 3292 + }, + { + "epoch": 1.6006079027355624, + "grad_norm": 0.07121955410425318, + "learning_rate": 6.560736826271799e-06, + "loss": 0.546, + "step": 3293 + }, + { + "epoch": 1.601094224924012, + "grad_norm": 0.07146220070247321, + "learning_rate": 6.55891770964245e-06, + "loss": 0.5479, + "step": 3294 + }, + { + "epoch": 1.601580547112462, + "grad_norm": 0.07163782715059633, + "learning_rate": 6.55709836441552e-06, + "loss": 0.5306, + "step": 3295 + }, + { + "epoch": 1.602066869300912, + "grad_norm": 0.07112124390682836, + "learning_rate": 6.5552787908578e-06, + "loss": 0.5402, + "step": 3296 + }, + { + "epoch": 1.6025531914893616, + "grad_norm": 0.0729935016764105, + "learning_rate": 6.553458989236105e-06, + "loss": 0.5344, + "step": 3297 + }, + { + "epoch": 1.6030395136778115, + "grad_norm": 0.07308374443027824, + "learning_rate": 6.55163895981729e-06, + "loss": 0.5714, + "step": 3298 + }, + { + "epoch": 1.6035258358662614, + "grad_norm": 0.07555733970186158, + "learning_rate": 6.5498187028682425e-06, + "loss": 0.5486, + "step": 3299 + }, + { + "epoch": 1.604012158054711, + "grad_norm": 0.07199022580017078, + "learning_rate": 6.547998218655881e-06, + "loss": 0.5852, + "step": 3300 + }, + { + "epoch": 1.6044984802431612, + "grad_norm": 0.07227522564501239, + "learning_rate": 6.546177507447158e-06, + "loss": 0.5227, + "step": 3301 + }, + { + "epoch": 1.604984802431611, + "grad_norm": 0.07219391640105048, + "learning_rate": 6.5443565695090624e-06, + "loss": 0.5713, + "step": 3302 + }, + { + "epoch": 1.6054711246200608, + "grad_norm": 0.06953391874443515, + "learning_rate": 6.542535405108614e-06, + "loss": 0.5447, + "step": 3303 + }, + { + "epoch": 1.6059574468085107, + "grad_norm": 0.07107518834888034, + "learning_rate": 6.540714014512866e-06, + "loss": 0.5448, + "step": 3304 + }, + { + "epoch": 1.6064437689969604, + "grad_norm": 0.07101284290220428, + "learning_rate": 6.538892397988902e-06, + "loss": 0.5342, + "step": 3305 + }, + { + "epoch": 1.6069300911854103, + "grad_norm": 0.07226955782389821, + "learning_rate": 6.537070555803844e-06, + "loss": 0.5584, + "step": 3306 + }, + { + "epoch": 1.6074164133738602, + "grad_norm": 0.07274351835337117, + "learning_rate": 6.535248488224843e-06, + "loss": 0.5744, + "step": 3307 + }, + { + "epoch": 1.60790273556231, + "grad_norm": 0.07459823992393674, + "learning_rate": 6.533426195519086e-06, + "loss": 0.553, + "step": 3308 + }, + { + "epoch": 1.60838905775076, + "grad_norm": 0.07456248119729599, + "learning_rate": 6.5316036779537896e-06, + "loss": 0.5932, + "step": 3309 + }, + { + "epoch": 1.6088753799392097, + "grad_norm": 0.07316986771263397, + "learning_rate": 6.5297809357962064e-06, + "loss": 0.5552, + "step": 3310 + }, + { + "epoch": 1.6093617021276596, + "grad_norm": 0.0700021588345707, + "learning_rate": 6.527957969313621e-06, + "loss": 0.5063, + "step": 3311 + }, + { + "epoch": 1.6098480243161095, + "grad_norm": 0.07229185641738506, + "learning_rate": 6.526134778773349e-06, + "loss": 0.5271, + "step": 3312 + }, + { + "epoch": 1.6103343465045592, + "grad_norm": 0.07716759762977039, + "learning_rate": 6.524311364442745e-06, + "loss": 0.5506, + "step": 3313 + }, + { + "epoch": 1.610820668693009, + "grad_norm": 0.0742000874502674, + "learning_rate": 6.522487726589187e-06, + "loss": 0.5379, + "step": 3314 + }, + { + "epoch": 1.611306990881459, + "grad_norm": 0.072679282896985, + "learning_rate": 6.520663865480095e-06, + "loss": 0.5514, + "step": 3315 + }, + { + "epoch": 1.6117933130699087, + "grad_norm": 0.0723466796728807, + "learning_rate": 6.518839781382914e-06, + "loss": 0.5115, + "step": 3316 + }, + { + "epoch": 1.6122796352583588, + "grad_norm": 0.07482409978841498, + "learning_rate": 6.517015474565127e-06, + "loss": 0.5782, + "step": 3317 + }, + { + "epoch": 1.6127659574468085, + "grad_norm": 0.07091564284890334, + "learning_rate": 6.515190945294248e-06, + "loss": 0.5348, + "step": 3318 + }, + { + "epoch": 1.6132522796352584, + "grad_norm": 0.07133014992093543, + "learning_rate": 6.5133661938378205e-06, + "loss": 0.584, + "step": 3319 + }, + { + "epoch": 1.6137386018237083, + "grad_norm": 0.0749693946980126, + "learning_rate": 6.511541220463427e-06, + "loss": 0.5569, + "step": 3320 + }, + { + "epoch": 1.614224924012158, + "grad_norm": 0.07179411847546166, + "learning_rate": 6.509716025438679e-06, + "loss": 0.5585, + "step": 3321 + }, + { + "epoch": 1.614711246200608, + "grad_norm": 0.07185434387601454, + "learning_rate": 6.50789060903122e-06, + "loss": 0.5275, + "step": 3322 + }, + { + "epoch": 1.6151975683890578, + "grad_norm": 0.07813640536644906, + "learning_rate": 6.5060649715087275e-06, + "loss": 0.5945, + "step": 3323 + }, + { + "epoch": 1.6156838905775075, + "grad_norm": 0.07333886719594797, + "learning_rate": 6.5042391131389086e-06, + "loss": 0.5438, + "step": 3324 + }, + { + "epoch": 1.6161702127659574, + "grad_norm": 0.07202840727115539, + "learning_rate": 6.502413034189505e-06, + "loss": 0.538, + "step": 3325 + }, + { + "epoch": 1.6166565349544073, + "grad_norm": 0.0714070481420102, + "learning_rate": 6.500586734928292e-06, + "loss": 0.547, + "step": 3326 + }, + { + "epoch": 1.617142857142857, + "grad_norm": 0.1107496293830457, + "learning_rate": 6.498760215623072e-06, + "loss": 0.559, + "step": 3327 + }, + { + "epoch": 1.6176291793313071, + "grad_norm": 0.07610325435589102, + "learning_rate": 6.496933476541687e-06, + "loss": 0.6172, + "step": 3328 + }, + { + "epoch": 1.6181155015197568, + "grad_norm": 0.07218988589616845, + "learning_rate": 6.495106517952007e-06, + "loss": 0.5442, + "step": 3329 + }, + { + "epoch": 1.6186018237082067, + "grad_norm": 0.07306265130124329, + "learning_rate": 6.493279340121935e-06, + "loss": 0.5241, + "step": 3330 + }, + { + "epoch": 1.6190881458966566, + "grad_norm": 0.0726919210048757, + "learning_rate": 6.4914519433194046e-06, + "loss": 0.611, + "step": 3331 + }, + { + "epoch": 1.6195744680851063, + "grad_norm": 0.06984768128625651, + "learning_rate": 6.489624327812383e-06, + "loss": 0.5292, + "step": 3332 + }, + { + "epoch": 1.6200607902735562, + "grad_norm": 0.07176070271103654, + "learning_rate": 6.48779649386887e-06, + "loss": 0.5788, + "step": 3333 + }, + { + "epoch": 1.6205471124620061, + "grad_norm": 0.07370358775992361, + "learning_rate": 6.4859684417568955e-06, + "loss": 0.5739, + "step": 3334 + }, + { + "epoch": 1.6210334346504558, + "grad_norm": 0.07463639342421202, + "learning_rate": 6.484140171744524e-06, + "loss": 0.5594, + "step": 3335 + }, + { + "epoch": 1.621519756838906, + "grad_norm": 0.07521656953566101, + "learning_rate": 6.482311684099849e-06, + "loss": 0.5647, + "step": 3336 + }, + { + "epoch": 1.6220060790273556, + "grad_norm": 0.07673724562491101, + "learning_rate": 6.480482979090999e-06, + "loss": 0.5682, + "step": 3337 + }, + { + "epoch": 1.6224924012158055, + "grad_norm": 0.06892917284174166, + "learning_rate": 6.4786540569861315e-06, + "loss": 0.545, + "step": 3338 + }, + { + "epoch": 1.6229787234042554, + "grad_norm": 0.07140027295216198, + "learning_rate": 6.476824918053438e-06, + "loss": 0.5113, + "step": 3339 + }, + { + "epoch": 1.623465045592705, + "grad_norm": 0.07659421781724557, + "learning_rate": 6.474995562561142e-06, + "loss": 0.5756, + "step": 3340 + }, + { + "epoch": 1.623951367781155, + "grad_norm": 0.07423572415022596, + "learning_rate": 6.473165990777495e-06, + "loss": 0.5542, + "step": 3341 + }, + { + "epoch": 1.624437689969605, + "grad_norm": 0.07091832573032526, + "learning_rate": 6.471336202970784e-06, + "loss": 0.5079, + "step": 3342 + }, + { + "epoch": 1.6249240121580546, + "grad_norm": 0.0744546931374889, + "learning_rate": 6.469506199409328e-06, + "loss": 0.5657, + "step": 3343 + }, + { + "epoch": 1.6254103343465045, + "grad_norm": 0.07119571369192718, + "learning_rate": 6.467675980361474e-06, + "loss": 0.5083, + "step": 3344 + }, + { + "epoch": 1.6258966565349544, + "grad_norm": 0.07028604422875337, + "learning_rate": 6.465845546095605e-06, + "loss": 0.5312, + "step": 3345 + }, + { + "epoch": 1.626382978723404, + "grad_norm": 0.07479678802136304, + "learning_rate": 6.464014896880133e-06, + "loss": 0.565, + "step": 3346 + }, + { + "epoch": 1.6268693009118542, + "grad_norm": 0.06915116777199172, + "learning_rate": 6.4621840329835e-06, + "loss": 0.5135, + "step": 3347 + }, + { + "epoch": 1.627355623100304, + "grad_norm": 0.0718886127527123, + "learning_rate": 6.460352954674184e-06, + "loss": 0.5106, + "step": 3348 + }, + { + "epoch": 1.6278419452887538, + "grad_norm": 0.07222227018942742, + "learning_rate": 6.4585216622206895e-06, + "loss": 0.5363, + "step": 3349 + }, + { + "epoch": 1.6283282674772037, + "grad_norm": 0.07359034532628995, + "learning_rate": 6.456690155891556e-06, + "loss": 0.5765, + "step": 3350 + }, + { + "epoch": 1.6288145896656534, + "grad_norm": 0.07365975369712202, + "learning_rate": 6.454858435955353e-06, + "loss": 0.5617, + "step": 3351 + }, + { + "epoch": 1.6293009118541033, + "grad_norm": 0.07425469323802146, + "learning_rate": 6.453026502680683e-06, + "loss": 0.5529, + "step": 3352 + }, + { + "epoch": 1.6297872340425532, + "grad_norm": 0.07585176691977917, + "learning_rate": 6.451194356336174e-06, + "loss": 0.5573, + "step": 3353 + }, + { + "epoch": 1.630273556231003, + "grad_norm": 0.07361855253982075, + "learning_rate": 6.449361997190495e-06, + "loss": 0.5441, + "step": 3354 + }, + { + "epoch": 1.630759878419453, + "grad_norm": 0.07415644033566286, + "learning_rate": 6.4475294255123355e-06, + "loss": 0.5631, + "step": 3355 + }, + { + "epoch": 1.6312462006079027, + "grad_norm": 0.06964319946642412, + "learning_rate": 6.445696641570423e-06, + "loss": 0.53, + "step": 3356 + }, + { + "epoch": 1.6317325227963526, + "grad_norm": 0.07106091866374166, + "learning_rate": 6.443863645633517e-06, + "loss": 0.5401, + "step": 3357 + }, + { + "epoch": 1.6322188449848025, + "grad_norm": 0.07555702494703029, + "learning_rate": 6.442030437970402e-06, + "loss": 0.5874, + "step": 3358 + }, + { + "epoch": 1.6327051671732522, + "grad_norm": 0.07664921378519898, + "learning_rate": 6.4401970188499e-06, + "loss": 0.5929, + "step": 3359 + }, + { + "epoch": 1.633191489361702, + "grad_norm": 0.06963611351982496, + "learning_rate": 6.438363388540858e-06, + "loss": 0.5413, + "step": 3360 + }, + { + "epoch": 1.633677811550152, + "grad_norm": 0.07366895035817969, + "learning_rate": 6.436529547312161e-06, + "loss": 0.5711, + "step": 3361 + }, + { + "epoch": 1.6341641337386017, + "grad_norm": 0.0724475753197608, + "learning_rate": 6.434695495432718e-06, + "loss": 0.57, + "step": 3362 + }, + { + "epoch": 1.6346504559270518, + "grad_norm": 0.06817296255819978, + "learning_rate": 6.432861233171473e-06, + "loss": 0.4896, + "step": 3363 + }, + { + "epoch": 1.6351367781155015, + "grad_norm": 0.07131217234375203, + "learning_rate": 6.431026760797397e-06, + "loss": 0.5627, + "step": 3364 + }, + { + "epoch": 1.6356231003039514, + "grad_norm": 0.06995061696203149, + "learning_rate": 6.429192078579498e-06, + "loss": 0.5267, + "step": 3365 + }, + { + "epoch": 1.6361094224924013, + "grad_norm": 0.07077290492741074, + "learning_rate": 6.42735718678681e-06, + "loss": 0.5589, + "step": 3366 + }, + { + "epoch": 1.636595744680851, + "grad_norm": 0.07154034910375451, + "learning_rate": 6.425522085688401e-06, + "loss": 0.5791, + "step": 3367 + }, + { + "epoch": 1.637082066869301, + "grad_norm": 0.0731832489640663, + "learning_rate": 6.423686775553364e-06, + "loss": 0.5694, + "step": 3368 + }, + { + "epoch": 1.6375683890577508, + "grad_norm": 0.07459119368468636, + "learning_rate": 6.421851256650831e-06, + "loss": 0.543, + "step": 3369 + }, + { + "epoch": 1.6380547112462005, + "grad_norm": 0.07276161805255993, + "learning_rate": 6.420015529249955e-06, + "loss": 0.5469, + "step": 3370 + }, + { + "epoch": 1.6385410334346504, + "grad_norm": 0.0687617035605552, + "learning_rate": 6.418179593619928e-06, + "loss": 0.5522, + "step": 3371 + }, + { + "epoch": 1.6390273556231003, + "grad_norm": 0.07023826343869467, + "learning_rate": 6.416343450029967e-06, + "loss": 0.53, + "step": 3372 + }, + { + "epoch": 1.63951367781155, + "grad_norm": 0.07153498347034304, + "learning_rate": 6.414507098749324e-06, + "loss": 0.5448, + "step": 3373 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.07031049952680485, + "learning_rate": 6.412670540047275e-06, + "loss": 0.5372, + "step": 3374 + }, + { + "epoch": 1.6404863221884498, + "grad_norm": 0.07642733811048066, + "learning_rate": 6.410833774193137e-06, + "loss": 0.5967, + "step": 3375 + }, + { + "epoch": 1.6409726443768997, + "grad_norm": 0.06967841381479503, + "learning_rate": 6.408996801456246e-06, + "loss": 0.516, + "step": 3376 + }, + { + "epoch": 1.6414589665653496, + "grad_norm": 0.07275926964865533, + "learning_rate": 6.407159622105974e-06, + "loss": 0.5476, + "step": 3377 + }, + { + "epoch": 1.6419452887537993, + "grad_norm": 0.06911037198168787, + "learning_rate": 6.405322236411722e-06, + "loss": 0.5076, + "step": 3378 + }, + { + "epoch": 1.6424316109422492, + "grad_norm": 0.06993404724053431, + "learning_rate": 6.403484644642923e-06, + "loss": 0.5197, + "step": 3379 + }, + { + "epoch": 1.6429179331306991, + "grad_norm": 0.07286448383096875, + "learning_rate": 6.401646847069038e-06, + "loss": 0.57, + "step": 3380 + }, + { + "epoch": 1.6434042553191488, + "grad_norm": 0.07435546696321146, + "learning_rate": 6.3998088439595605e-06, + "loss": 0.521, + "step": 3381 + }, + { + "epoch": 1.643890577507599, + "grad_norm": 0.06869542579443716, + "learning_rate": 6.397970635584012e-06, + "loss": 0.5233, + "step": 3382 + }, + { + "epoch": 1.6443768996960486, + "grad_norm": 0.07623847245338596, + "learning_rate": 6.396132222211945e-06, + "loss": 0.5842, + "step": 3383 + }, + { + "epoch": 1.6448632218844985, + "grad_norm": 0.069444079369739, + "learning_rate": 6.394293604112941e-06, + "loss": 0.5584, + "step": 3384 + }, + { + "epoch": 1.6453495440729484, + "grad_norm": 0.06920549963577673, + "learning_rate": 6.392454781556614e-06, + "loss": 0.527, + "step": 3385 + }, + { + "epoch": 1.645835866261398, + "grad_norm": 0.07150253604511128, + "learning_rate": 6.390615754812605e-06, + "loss": 0.5422, + "step": 3386 + }, + { + "epoch": 1.646322188449848, + "grad_norm": 0.07380647396593863, + "learning_rate": 6.388776524150586e-06, + "loss": 0.6001, + "step": 3387 + }, + { + "epoch": 1.646808510638298, + "grad_norm": 0.07534998231994375, + "learning_rate": 6.386937089840262e-06, + "loss": 0.5726, + "step": 3388 + }, + { + "epoch": 1.6472948328267476, + "grad_norm": 0.0703812333326199, + "learning_rate": 6.385097452151363e-06, + "loss": 0.559, + "step": 3389 + }, + { + "epoch": 1.6477811550151977, + "grad_norm": 0.07579382270670242, + "learning_rate": 6.3832576113536515e-06, + "loss": 0.5931, + "step": 3390 + }, + { + "epoch": 1.6482674772036474, + "grad_norm": 0.0730380978934539, + "learning_rate": 6.381417567716919e-06, + "loss": 0.5484, + "step": 3391 + }, + { + "epoch": 1.6487537993920973, + "grad_norm": 0.080873039534468, + "learning_rate": 6.379577321510988e-06, + "loss": 0.5263, + "step": 3392 + }, + { + "epoch": 1.6492401215805472, + "grad_norm": 0.07442847785577929, + "learning_rate": 6.3777368730057075e-06, + "loss": 0.557, + "step": 3393 + }, + { + "epoch": 1.649726443768997, + "grad_norm": 0.06712855488192049, + "learning_rate": 6.375896222470961e-06, + "loss": 0.4888, + "step": 3394 + }, + { + "epoch": 1.6502127659574468, + "grad_norm": 0.0743225166886425, + "learning_rate": 6.374055370176657e-06, + "loss": 0.5373, + "step": 3395 + }, + { + "epoch": 1.6506990881458967, + "grad_norm": 0.07154325162816429, + "learning_rate": 6.372214316392737e-06, + "loss": 0.5552, + "step": 3396 + }, + { + "epoch": 1.6511854103343464, + "grad_norm": 0.07191603652286814, + "learning_rate": 6.37037306138917e-06, + "loss": 0.5282, + "step": 3397 + }, + { + "epoch": 1.6516717325227963, + "grad_norm": 0.06939362956965285, + "learning_rate": 6.368531605435955e-06, + "loss": 0.5465, + "step": 3398 + }, + { + "epoch": 1.6521580547112462, + "grad_norm": 0.07437388186012737, + "learning_rate": 6.366689948803121e-06, + "loss": 0.625, + "step": 3399 + }, + { + "epoch": 1.652644376899696, + "grad_norm": 0.07492458554891947, + "learning_rate": 6.364848091760727e-06, + "loss": 0.5401, + "step": 3400 + }, + { + "epoch": 1.653130699088146, + "grad_norm": 0.07604797113839827, + "learning_rate": 6.363006034578856e-06, + "loss": 0.5564, + "step": 3401 + }, + { + "epoch": 1.6536170212765957, + "grad_norm": 0.07419551766677225, + "learning_rate": 6.36116377752763e-06, + "loss": 0.5271, + "step": 3402 + }, + { + "epoch": 1.6541033434650456, + "grad_norm": 0.07504553771179602, + "learning_rate": 6.359321320877193e-06, + "loss": 0.5532, + "step": 3403 + }, + { + "epoch": 1.6545896656534955, + "grad_norm": 0.0725165387268915, + "learning_rate": 6.3574786648977205e-06, + "loss": 0.5956, + "step": 3404 + }, + { + "epoch": 1.6550759878419452, + "grad_norm": 0.0700954308787515, + "learning_rate": 6.355635809859416e-06, + "loss": 0.557, + "step": 3405 + }, + { + "epoch": 1.6555623100303951, + "grad_norm": 0.07022157789133485, + "learning_rate": 6.3537927560325155e-06, + "loss": 0.5624, + "step": 3406 + }, + { + "epoch": 1.656048632218845, + "grad_norm": 0.07220842322534013, + "learning_rate": 6.3519495036872815e-06, + "loss": 0.5464, + "step": 3407 + }, + { + "epoch": 1.6565349544072947, + "grad_norm": 0.07020232744591141, + "learning_rate": 6.350106053094004e-06, + "loss": 0.5616, + "step": 3408 + }, + { + "epoch": 1.6570212765957448, + "grad_norm": 0.07342677126572464, + "learning_rate": 6.348262404523005e-06, + "loss": 0.5553, + "step": 3409 + }, + { + "epoch": 1.6575075987841945, + "grad_norm": 0.0735559234379739, + "learning_rate": 6.346418558244634e-06, + "loss": 0.5617, + "step": 3410 + }, + { + "epoch": 1.6579939209726444, + "grad_norm": 0.07410723554599913, + "learning_rate": 6.344574514529272e-06, + "loss": 0.5457, + "step": 3411 + }, + { + "epoch": 1.6584802431610943, + "grad_norm": 0.06764480738125728, + "learning_rate": 6.342730273647327e-06, + "loss": 0.5073, + "step": 3412 + }, + { + "epoch": 1.658966565349544, + "grad_norm": 0.07155817038410059, + "learning_rate": 6.340885835869233e-06, + "loss": 0.5644, + "step": 3413 + }, + { + "epoch": 1.659452887537994, + "grad_norm": 0.0695935896652981, + "learning_rate": 6.339041201465459e-06, + "loss": 0.5168, + "step": 3414 + }, + { + "epoch": 1.6599392097264438, + "grad_norm": 0.07077491789727446, + "learning_rate": 6.3371963707065e-06, + "loss": 0.5543, + "step": 3415 + }, + { + "epoch": 1.6604255319148935, + "grad_norm": 0.08015293257927245, + "learning_rate": 6.3353513438628764e-06, + "loss": 0.5585, + "step": 3416 + }, + { + "epoch": 1.6609118541033434, + "grad_norm": 0.07357720324580157, + "learning_rate": 6.333506121205144e-06, + "loss": 0.6082, + "step": 3417 + }, + { + "epoch": 1.6613981762917933, + "grad_norm": 0.07326962524988391, + "learning_rate": 6.33166070300388e-06, + "loss": 0.5378, + "step": 3418 + }, + { + "epoch": 1.661884498480243, + "grad_norm": 0.06750085921534843, + "learning_rate": 6.329815089529696e-06, + "loss": 0.5132, + "step": 3419 + }, + { + "epoch": 1.6623708206686931, + "grad_norm": 0.07327890231180548, + "learning_rate": 6.32796928105323e-06, + "loss": 0.6039, + "step": 3420 + }, + { + "epoch": 1.6628571428571428, + "grad_norm": 0.07301937848254036, + "learning_rate": 6.32612327784515e-06, + "loss": 0.5667, + "step": 3421 + }, + { + "epoch": 1.6633434650455927, + "grad_norm": 0.06924845171060541, + "learning_rate": 6.324277080176151e-06, + "loss": 0.494, + "step": 3422 + }, + { + "epoch": 1.6638297872340426, + "grad_norm": 0.07149583581069985, + "learning_rate": 6.3224306883169565e-06, + "loss": 0.535, + "step": 3423 + }, + { + "epoch": 1.6643161094224923, + "grad_norm": 0.0737453113581752, + "learning_rate": 6.320584102538316e-06, + "loss": 0.5391, + "step": 3424 + }, + { + "epoch": 1.6648024316109422, + "grad_norm": 0.07237495506610687, + "learning_rate": 6.318737323111015e-06, + "loss": 0.5289, + "step": 3425 + }, + { + "epoch": 1.6652887537993921, + "grad_norm": 0.07308418536483928, + "learning_rate": 6.316890350305861e-06, + "loss": 0.582, + "step": 3426 + }, + { + "epoch": 1.6657750759878418, + "grad_norm": 0.07039007862105334, + "learning_rate": 6.315043184393691e-06, + "loss": 0.5181, + "step": 3427 + }, + { + "epoch": 1.666261398176292, + "grad_norm": 0.07294068981271111, + "learning_rate": 6.313195825645371e-06, + "loss": 0.5558, + "step": 3428 + }, + { + "epoch": 1.6667477203647416, + "grad_norm": 0.07440584718609539, + "learning_rate": 6.311348274331797e-06, + "loss": 0.5299, + "step": 3429 + }, + { + "epoch": 1.6672340425531915, + "grad_norm": 0.07220354336316229, + "learning_rate": 6.309500530723889e-06, + "loss": 0.552, + "step": 3430 + }, + { + "epoch": 1.6677203647416414, + "grad_norm": 0.07408564378247474, + "learning_rate": 6.3076525950925975e-06, + "loss": 0.5546, + "step": 3431 + }, + { + "epoch": 1.668206686930091, + "grad_norm": 0.07199997484962213, + "learning_rate": 6.305804467708902e-06, + "loss": 0.5346, + "step": 3432 + }, + { + "epoch": 1.668693009118541, + "grad_norm": 0.07044592574990392, + "learning_rate": 6.3039561488438115e-06, + "loss": 0.5364, + "step": 3433 + }, + { + "epoch": 1.669179331306991, + "grad_norm": 0.07016152799974806, + "learning_rate": 6.302107638768359e-06, + "loss": 0.5316, + "step": 3434 + }, + { + "epoch": 1.6696656534954406, + "grad_norm": 0.07206425324719826, + "learning_rate": 6.300258937753607e-06, + "loss": 0.55, + "step": 3435 + }, + { + "epoch": 1.6701519756838907, + "grad_norm": 0.07031404784080043, + "learning_rate": 6.2984100460706476e-06, + "loss": 0.5162, + "step": 3436 + }, + { + "epoch": 1.6706382978723404, + "grad_norm": 0.07132698925175493, + "learning_rate": 6.296560963990599e-06, + "loss": 0.5278, + "step": 3437 + }, + { + "epoch": 1.6711246200607903, + "grad_norm": 0.07315548152365, + "learning_rate": 6.2947116917846085e-06, + "loss": 0.5506, + "step": 3438 + }, + { + "epoch": 1.6716109422492402, + "grad_norm": 0.07320352950730759, + "learning_rate": 6.29286222972385e-06, + "loss": 0.5472, + "step": 3439 + }, + { + "epoch": 1.67209726443769, + "grad_norm": 0.07584456827317385, + "learning_rate": 6.291012578079528e-06, + "loss": 0.5883, + "step": 3440 + }, + { + "epoch": 1.6725835866261398, + "grad_norm": 0.07060915074816591, + "learning_rate": 6.289162737122873e-06, + "loss": 0.5338, + "step": 3441 + }, + { + "epoch": 1.6730699088145897, + "grad_norm": 0.0700103844965376, + "learning_rate": 6.287312707125139e-06, + "loss": 0.549, + "step": 3442 + }, + { + "epoch": 1.6735562310030394, + "grad_norm": 0.07987773004371675, + "learning_rate": 6.285462488357618e-06, + "loss": 0.5463, + "step": 3443 + }, + { + "epoch": 1.6740425531914893, + "grad_norm": 0.07065521169091803, + "learning_rate": 6.283612081091619e-06, + "loss": 0.5256, + "step": 3444 + }, + { + "epoch": 1.6745288753799392, + "grad_norm": 0.07185225363803119, + "learning_rate": 6.281761485598484e-06, + "loss": 0.5422, + "step": 3445 + }, + { + "epoch": 1.675015197568389, + "grad_norm": 0.07331266687140567, + "learning_rate": 6.279910702149584e-06, + "loss": 0.5876, + "step": 3446 + }, + { + "epoch": 1.675501519756839, + "grad_norm": 0.07461911269210564, + "learning_rate": 6.278059731016313e-06, + "loss": 0.5459, + "step": 3447 + }, + { + "epoch": 1.6759878419452887, + "grad_norm": 0.06956593015645747, + "learning_rate": 6.276208572470096e-06, + "loss": 0.4949, + "step": 3448 + }, + { + "epoch": 1.6764741641337386, + "grad_norm": 0.0702961430246337, + "learning_rate": 6.274357226782384e-06, + "loss": 0.5523, + "step": 3449 + }, + { + "epoch": 1.6769604863221885, + "grad_norm": 0.07147229026179, + "learning_rate": 6.272505694224655e-06, + "loss": 0.5496, + "step": 3450 + }, + { + "epoch": 1.6774468085106382, + "grad_norm": 0.07587812402997444, + "learning_rate": 6.270653975068418e-06, + "loss": 0.5539, + "step": 3451 + }, + { + "epoch": 1.6779331306990881, + "grad_norm": 0.0734741665495876, + "learning_rate": 6.268802069585205e-06, + "loss": 0.56, + "step": 3452 + }, + { + "epoch": 1.678419452887538, + "grad_norm": 0.07506979605329234, + "learning_rate": 6.266949978046576e-06, + "loss": 0.5889, + "step": 3453 + }, + { + "epoch": 1.6789057750759877, + "grad_norm": 0.07300090763970113, + "learning_rate": 6.26509770072412e-06, + "loss": 0.5637, + "step": 3454 + }, + { + "epoch": 1.6793920972644378, + "grad_norm": 0.06914753026846533, + "learning_rate": 6.263245237889451e-06, + "loss": 0.5544, + "step": 3455 + }, + { + "epoch": 1.6798784194528875, + "grad_norm": 0.0722890714909905, + "learning_rate": 6.261392589814214e-06, + "loss": 0.563, + "step": 3456 + }, + { + "epoch": 1.6803647416413374, + "grad_norm": 0.07269298755097635, + "learning_rate": 6.259539756770078e-06, + "loss": 0.5573, + "step": 3457 + }, + { + "epoch": 1.6808510638297873, + "grad_norm": 0.07055887355195972, + "learning_rate": 6.257686739028739e-06, + "loss": 0.5486, + "step": 3458 + }, + { + "epoch": 1.681337386018237, + "grad_norm": 0.07042043459620466, + "learning_rate": 6.255833536861921e-06, + "loss": 0.5337, + "step": 3459 + }, + { + "epoch": 1.681823708206687, + "grad_norm": 0.0729035793138229, + "learning_rate": 6.253980150541378e-06, + "loss": 0.5639, + "step": 3460 + }, + { + "epoch": 1.6823100303951368, + "grad_norm": 0.07131871959759666, + "learning_rate": 6.252126580338885e-06, + "loss": 0.5166, + "step": 3461 + }, + { + "epoch": 1.6827963525835865, + "grad_norm": 0.07104127141305687, + "learning_rate": 6.250272826526248e-06, + "loss": 0.545, + "step": 3462 + }, + { + "epoch": 1.6832826747720366, + "grad_norm": 0.07028033697890941, + "learning_rate": 6.248418889375299e-06, + "loss": 0.5334, + "step": 3463 + }, + { + "epoch": 1.6837689969604863, + "grad_norm": 0.06858869063379364, + "learning_rate": 6.246564769157895e-06, + "loss": 0.5299, + "step": 3464 + }, + { + "epoch": 1.6842553191489362, + "grad_norm": 0.07466509908792074, + "learning_rate": 6.244710466145924e-06, + "loss": 0.5858, + "step": 3465 + }, + { + "epoch": 1.6847416413373861, + "grad_norm": 0.07497535752935497, + "learning_rate": 6.242855980611298e-06, + "loss": 0.6044, + "step": 3466 + }, + { + "epoch": 1.6852279635258358, + "grad_norm": 0.07746186650363425, + "learning_rate": 6.241001312825955e-06, + "loss": 0.5756, + "step": 3467 + }, + { + "epoch": 1.6857142857142857, + "grad_norm": 0.07220985737960124, + "learning_rate": 6.239146463061864e-06, + "loss": 0.5848, + "step": 3468 + }, + { + "epoch": 1.6862006079027356, + "grad_norm": 0.07390719815103404, + "learning_rate": 6.237291431591015e-06, + "loss": 0.5594, + "step": 3469 + }, + { + "epoch": 1.6866869300911853, + "grad_norm": 0.07291685913457813, + "learning_rate": 6.235436218685427e-06, + "loss": 0.566, + "step": 3470 + }, + { + "epoch": 1.6871732522796352, + "grad_norm": 0.0722062240015657, + "learning_rate": 6.233580824617147e-06, + "loss": 0.5494, + "step": 3471 + }, + { + "epoch": 1.6876595744680851, + "grad_norm": 0.07355731543139962, + "learning_rate": 6.231725249658248e-06, + "loss": 0.5471, + "step": 3472 + }, + { + "epoch": 1.6881458966565348, + "grad_norm": 0.07015747437220496, + "learning_rate": 6.229869494080828e-06, + "loss": 0.5166, + "step": 3473 + }, + { + "epoch": 1.688632218844985, + "grad_norm": 0.07267653450607674, + "learning_rate": 6.228013558157011e-06, + "loss": 0.5676, + "step": 3474 + }, + { + "epoch": 1.6891185410334346, + "grad_norm": 0.07061020483229881, + "learning_rate": 6.226157442158954e-06, + "loss": 0.5297, + "step": 3475 + }, + { + "epoch": 1.6896048632218845, + "grad_norm": 0.07740873367140404, + "learning_rate": 6.224301146358831e-06, + "loss": 0.5487, + "step": 3476 + }, + { + "epoch": 1.6900911854103344, + "grad_norm": 0.07262189988070133, + "learning_rate": 6.222444671028846e-06, + "loss": 0.5494, + "step": 3477 + }, + { + "epoch": 1.6905775075987841, + "grad_norm": 0.06967127517489614, + "learning_rate": 6.220588016441234e-06, + "loss": 0.5145, + "step": 3478 + }, + { + "epoch": 1.691063829787234, + "grad_norm": 0.06966666670930907, + "learning_rate": 6.218731182868249e-06, + "loss": 0.5221, + "step": 3479 + }, + { + "epoch": 1.691550151975684, + "grad_norm": 0.06860028726467582, + "learning_rate": 6.216874170582176e-06, + "loss": 0.5237, + "step": 3480 + }, + { + "epoch": 1.6920364741641336, + "grad_norm": 0.07334406390886508, + "learning_rate": 6.215016979855324e-06, + "loss": 0.5751, + "step": 3481 + }, + { + "epoch": 1.6925227963525837, + "grad_norm": 0.07015295794360239, + "learning_rate": 6.213159610960029e-06, + "loss": 0.5388, + "step": 3482 + }, + { + "epoch": 1.6930091185410334, + "grad_norm": 0.07038569302208326, + "learning_rate": 6.211302064168654e-06, + "loss": 0.5262, + "step": 3483 + }, + { + "epoch": 1.6934954407294833, + "grad_norm": 0.0687806760408564, + "learning_rate": 6.209444339753587e-06, + "loss": 0.5344, + "step": 3484 + }, + { + "epoch": 1.6939817629179332, + "grad_norm": 0.07150568590958441, + "learning_rate": 6.207586437987241e-06, + "loss": 0.5303, + "step": 3485 + }, + { + "epoch": 1.694468085106383, + "grad_norm": 0.07267683292631394, + "learning_rate": 6.205728359142056e-06, + "loss": 0.5769, + "step": 3486 + }, + { + "epoch": 1.6949544072948328, + "grad_norm": 0.07025379501761682, + "learning_rate": 6.2038701034905e-06, + "loss": 0.5618, + "step": 3487 + }, + { + "epoch": 1.6954407294832827, + "grad_norm": 0.07193721091828442, + "learning_rate": 6.202011671305065e-06, + "loss": 0.5203, + "step": 3488 + }, + { + "epoch": 1.6959270516717324, + "grad_norm": 0.07103752414158168, + "learning_rate": 6.200153062858268e-06, + "loss": 0.5454, + "step": 3489 + }, + { + "epoch": 1.6964133738601823, + "grad_norm": 0.07274057275608531, + "learning_rate": 6.198294278422652e-06, + "loss": 0.5702, + "step": 3490 + }, + { + "epoch": 1.6968996960486322, + "grad_norm": 0.07127870652316795, + "learning_rate": 6.196435318270788e-06, + "loss": 0.5242, + "step": 3491 + }, + { + "epoch": 1.697386018237082, + "grad_norm": 0.07270868914238528, + "learning_rate": 6.19457618267527e-06, + "loss": 0.5578, + "step": 3492 + }, + { + "epoch": 1.697872340425532, + "grad_norm": 0.0703077530354761, + "learning_rate": 6.192716871908721e-06, + "loss": 0.5365, + "step": 3493 + }, + { + "epoch": 1.6983586626139817, + "grad_norm": 0.07204047106001092, + "learning_rate": 6.1908573862437885e-06, + "loss": 0.5797, + "step": 3494 + }, + { + "epoch": 1.6988449848024316, + "grad_norm": 0.07504411096514044, + "learning_rate": 6.188997725953141e-06, + "loss": 0.585, + "step": 3495 + }, + { + "epoch": 1.6993313069908815, + "grad_norm": 0.07112888675894341, + "learning_rate": 6.18713789130948e-06, + "loss": 0.5444, + "step": 3496 + }, + { + "epoch": 1.6998176291793312, + "grad_norm": 0.07132450544712032, + "learning_rate": 6.185277882585528e-06, + "loss": 0.5232, + "step": 3497 + }, + { + "epoch": 1.7003039513677811, + "grad_norm": 0.07291371092849813, + "learning_rate": 6.183417700054035e-06, + "loss": 0.5565, + "step": 3498 + }, + { + "epoch": 1.700790273556231, + "grad_norm": 0.07314998467558513, + "learning_rate": 6.181557343987775e-06, + "loss": 0.5541, + "step": 3499 + }, + { + "epoch": 1.7012765957446807, + "grad_norm": 0.07155905366858664, + "learning_rate": 6.179696814659547e-06, + "loss": 0.5322, + "step": 3500 + }, + { + "epoch": 1.7017629179331308, + "grad_norm": 0.07287449463409627, + "learning_rate": 6.177836112342176e-06, + "loss": 0.5887, + "step": 3501 + }, + { + "epoch": 1.7022492401215805, + "grad_norm": 0.07261494504931845, + "learning_rate": 6.175975237308516e-06, + "loss": 0.5319, + "step": 3502 + }, + { + "epoch": 1.7027355623100304, + "grad_norm": 0.073142233700149, + "learning_rate": 6.174114189831441e-06, + "loss": 0.5568, + "step": 3503 + }, + { + "epoch": 1.7032218844984803, + "grad_norm": 0.07275484890670135, + "learning_rate": 6.172252970183854e-06, + "loss": 0.6009, + "step": 3504 + }, + { + "epoch": 1.70370820668693, + "grad_norm": 0.07237350334886138, + "learning_rate": 6.170391578638681e-06, + "loss": 0.5483, + "step": 3505 + }, + { + "epoch": 1.70419452887538, + "grad_norm": 0.07517900788602927, + "learning_rate": 6.168530015468872e-06, + "loss": 0.5733, + "step": 3506 + }, + { + "epoch": 1.7046808510638298, + "grad_norm": 0.07352048336663716, + "learning_rate": 6.166668280947408e-06, + "loss": 0.5409, + "step": 3507 + }, + { + "epoch": 1.7051671732522795, + "grad_norm": 0.0725547704029076, + "learning_rate": 6.1648063753472875e-06, + "loss": 0.5552, + "step": 3508 + }, + { + "epoch": 1.7056534954407296, + "grad_norm": 0.07220301147051507, + "learning_rate": 6.16294429894154e-06, + "loss": 0.5464, + "step": 3509 + }, + { + "epoch": 1.7061398176291793, + "grad_norm": 0.0710666461142256, + "learning_rate": 6.161082052003215e-06, + "loss": 0.5346, + "step": 3510 + }, + { + "epoch": 1.7066261398176292, + "grad_norm": 0.07565430366698148, + "learning_rate": 6.159219634805394e-06, + "loss": 0.587, + "step": 3511 + }, + { + "epoch": 1.7071124620060791, + "grad_norm": 0.07128599994758103, + "learning_rate": 6.157357047621176e-06, + "loss": 0.5748, + "step": 3512 + }, + { + "epoch": 1.7075987841945288, + "grad_norm": 0.07297765244833161, + "learning_rate": 6.155494290723691e-06, + "loss": 0.5375, + "step": 3513 + }, + { + "epoch": 1.7080851063829787, + "grad_norm": 0.0792437135387949, + "learning_rate": 6.153631364386091e-06, + "loss": 0.5373, + "step": 3514 + }, + { + "epoch": 1.7085714285714286, + "grad_norm": 0.07240105274594913, + "learning_rate": 6.15176826888155e-06, + "loss": 0.5485, + "step": 3515 + }, + { + "epoch": 1.7090577507598783, + "grad_norm": 0.07393454324383125, + "learning_rate": 6.149905004483272e-06, + "loss": 0.5716, + "step": 3516 + }, + { + "epoch": 1.7095440729483282, + "grad_norm": 0.07475190267630555, + "learning_rate": 6.148041571464483e-06, + "loss": 0.5601, + "step": 3517 + }, + { + "epoch": 1.7100303951367781, + "grad_norm": 0.07326956142275377, + "learning_rate": 6.146177970098434e-06, + "loss": 0.5234, + "step": 3518 + }, + { + "epoch": 1.7105167173252278, + "grad_norm": 0.07439871870321202, + "learning_rate": 6.144314200658401e-06, + "loss": 0.5407, + "step": 3519 + }, + { + "epoch": 1.711003039513678, + "grad_norm": 0.07607020859262323, + "learning_rate": 6.142450263417685e-06, + "loss": 0.5172, + "step": 3520 + }, + { + "epoch": 1.7114893617021276, + "grad_norm": 0.07121399993155196, + "learning_rate": 6.1405861586496125e-06, + "loss": 0.517, + "step": 3521 + }, + { + "epoch": 1.7119756838905775, + "grad_norm": 0.07059184271014422, + "learning_rate": 6.138721886627532e-06, + "loss": 0.5648, + "step": 3522 + }, + { + "epoch": 1.7124620060790274, + "grad_norm": 0.07140527318071622, + "learning_rate": 6.136857447624818e-06, + "loss": 0.5272, + "step": 3523 + }, + { + "epoch": 1.7129483282674771, + "grad_norm": 0.0730752244195604, + "learning_rate": 6.134992841914869e-06, + "loss": 0.5668, + "step": 3524 + }, + { + "epoch": 1.713434650455927, + "grad_norm": 0.07565606813572627, + "learning_rate": 6.133128069771107e-06, + "loss": 0.5858, + "step": 3525 + }, + { + "epoch": 1.713920972644377, + "grad_norm": 0.07203715162008086, + "learning_rate": 6.131263131466982e-06, + "loss": 0.5507, + "step": 3526 + }, + { + "epoch": 1.7144072948328266, + "grad_norm": 0.0775887937031711, + "learning_rate": 6.129398027275966e-06, + "loss": 0.6, + "step": 3527 + }, + { + "epoch": 1.7148936170212767, + "grad_norm": 0.07064216132886232, + "learning_rate": 6.127532757471553e-06, + "loss": 0.5539, + "step": 3528 + }, + { + "epoch": 1.7153799392097264, + "grad_norm": 0.0726866206458179, + "learning_rate": 6.125667322327266e-06, + "loss": 0.5664, + "step": 3529 + }, + { + "epoch": 1.7158662613981763, + "grad_norm": 0.06967467383360446, + "learning_rate": 6.123801722116649e-06, + "loss": 0.525, + "step": 3530 + }, + { + "epoch": 1.7163525835866262, + "grad_norm": 0.07081201980771898, + "learning_rate": 6.121935957113271e-06, + "loss": 0.5242, + "step": 3531 + }, + { + "epoch": 1.716838905775076, + "grad_norm": 0.07351651568219589, + "learning_rate": 6.120070027590724e-06, + "loss": 0.5426, + "step": 3532 + }, + { + "epoch": 1.7173252279635258, + "grad_norm": 0.07222617797151484, + "learning_rate": 6.118203933822628e-06, + "loss": 0.5381, + "step": 3533 + }, + { + "epoch": 1.7178115501519757, + "grad_norm": 0.06917247862721279, + "learning_rate": 6.116337676082623e-06, + "loss": 0.511, + "step": 3534 + }, + { + "epoch": 1.7182978723404254, + "grad_norm": 0.0754651715338366, + "learning_rate": 6.114471254644375e-06, + "loss": 0.6034, + "step": 3535 + }, + { + "epoch": 1.7187841945288755, + "grad_norm": 0.07205487316612337, + "learning_rate": 6.112604669781572e-06, + "loss": 0.5721, + "step": 3536 + }, + { + "epoch": 1.7192705167173252, + "grad_norm": 0.07856583507993582, + "learning_rate": 6.110737921767931e-06, + "loss": 0.535, + "step": 3537 + }, + { + "epoch": 1.7197568389057751, + "grad_norm": 0.07115444337024836, + "learning_rate": 6.1088710108771845e-06, + "loss": 0.5448, + "step": 3538 + }, + { + "epoch": 1.720243161094225, + "grad_norm": 0.07305108996799066, + "learning_rate": 6.107003937383098e-06, + "loss": 0.5533, + "step": 3539 + }, + { + "epoch": 1.7207294832826747, + "grad_norm": 0.07550895883040964, + "learning_rate": 6.105136701559453e-06, + "loss": 0.5831, + "step": 3540 + }, + { + "epoch": 1.7212158054711246, + "grad_norm": 0.07507836898136312, + "learning_rate": 6.103269303680063e-06, + "loss": 0.5661, + "step": 3541 + }, + { + "epoch": 1.7217021276595745, + "grad_norm": 0.07008914370481566, + "learning_rate": 6.101401744018756e-06, + "loss": 0.5029, + "step": 3542 + }, + { + "epoch": 1.7221884498480242, + "grad_norm": 0.07651101941723358, + "learning_rate": 6.099534022849392e-06, + "loss": 0.5167, + "step": 3543 + }, + { + "epoch": 1.7226747720364741, + "grad_norm": 0.07278891169391895, + "learning_rate": 6.097666140445848e-06, + "loss": 0.574, + "step": 3544 + }, + { + "epoch": 1.723161094224924, + "grad_norm": 0.07419667475638432, + "learning_rate": 6.09579809708203e-06, + "loss": 0.5675, + "step": 3545 + }, + { + "epoch": 1.7236474164133737, + "grad_norm": 0.07132183658485121, + "learning_rate": 6.093929893031865e-06, + "loss": 0.5284, + "step": 3546 + }, + { + "epoch": 1.7241337386018238, + "grad_norm": 0.07247309940970907, + "learning_rate": 6.092061528569303e-06, + "loss": 0.5155, + "step": 3547 + }, + { + "epoch": 1.7246200607902735, + "grad_norm": 0.0705966682854827, + "learning_rate": 6.090193003968319e-06, + "loss": 0.5375, + "step": 3548 + }, + { + "epoch": 1.7251063829787234, + "grad_norm": 0.07441859204161902, + "learning_rate": 6.088324319502912e-06, + "loss": 0.5765, + "step": 3549 + }, + { + "epoch": 1.7255927051671733, + "grad_norm": 0.0733133846506137, + "learning_rate": 6.086455475447102e-06, + "loss": 0.555, + "step": 3550 + }, + { + "epoch": 1.726079027355623, + "grad_norm": 0.07415090356658685, + "learning_rate": 6.084586472074933e-06, + "loss": 0.5407, + "step": 3551 + }, + { + "epoch": 1.726565349544073, + "grad_norm": 0.07198872510394184, + "learning_rate": 6.082717309660474e-06, + "loss": 0.5274, + "step": 3552 + }, + { + "epoch": 1.7270516717325228, + "grad_norm": 0.07286615459382179, + "learning_rate": 6.080847988477819e-06, + "loss": 0.5242, + "step": 3553 + }, + { + "epoch": 1.7275379939209725, + "grad_norm": 0.07538009981745537, + "learning_rate": 6.078978508801079e-06, + "loss": 0.5684, + "step": 3554 + }, + { + "epoch": 1.7280243161094226, + "grad_norm": 0.07185413538904672, + "learning_rate": 6.0771088709043915e-06, + "loss": 0.5303, + "step": 3555 + }, + { + "epoch": 1.7285106382978723, + "grad_norm": 0.07264790164048276, + "learning_rate": 6.075239075061921e-06, + "loss": 0.5706, + "step": 3556 + }, + { + "epoch": 1.7289969604863222, + "grad_norm": 0.07266353472118756, + "learning_rate": 6.073369121547851e-06, + "loss": 0.56, + "step": 3557 + }, + { + "epoch": 1.7294832826747721, + "grad_norm": 0.07423555281045208, + "learning_rate": 6.071499010636387e-06, + "loss": 0.5666, + "step": 3558 + }, + { + "epoch": 1.7299696048632218, + "grad_norm": 0.07525381662140035, + "learning_rate": 6.069628742601761e-06, + "loss": 0.5676, + "step": 3559 + }, + { + "epoch": 1.7304559270516717, + "grad_norm": 0.07443860364609997, + "learning_rate": 6.067758317718227e-06, + "loss": 0.5661, + "step": 3560 + }, + { + "epoch": 1.7309422492401216, + "grad_norm": 0.06969686395927308, + "learning_rate": 6.065887736260061e-06, + "loss": 0.5405, + "step": 3561 + }, + { + "epoch": 1.7314285714285713, + "grad_norm": 0.0718479392508208, + "learning_rate": 6.064016998501563e-06, + "loss": 0.5646, + "step": 3562 + }, + { + "epoch": 1.7319148936170212, + "grad_norm": 0.07514505651563702, + "learning_rate": 6.062146104717053e-06, + "loss": 0.5582, + "step": 3563 + }, + { + "epoch": 1.7324012158054711, + "grad_norm": 0.07103669937924188, + "learning_rate": 6.060275055180877e-06, + "loss": 0.5687, + "step": 3564 + }, + { + "epoch": 1.7328875379939208, + "grad_norm": 0.0710078358345454, + "learning_rate": 6.058403850167407e-06, + "loss": 0.559, + "step": 3565 + }, + { + "epoch": 1.733373860182371, + "grad_norm": 0.06997667824929588, + "learning_rate": 6.056532489951032e-06, + "loss": 0.536, + "step": 3566 + }, + { + "epoch": 1.7338601823708206, + "grad_norm": 0.0706188721861992, + "learning_rate": 6.054660974806164e-06, + "loss": 0.5742, + "step": 3567 + }, + { + "epoch": 1.7343465045592705, + "grad_norm": 0.07204773947379925, + "learning_rate": 6.052789305007241e-06, + "loss": 0.5917, + "step": 3568 + }, + { + "epoch": 1.7348328267477204, + "grad_norm": 0.06936993031005802, + "learning_rate": 6.050917480828721e-06, + "loss": 0.5496, + "step": 3569 + }, + { + "epoch": 1.7353191489361701, + "grad_norm": 0.07148965795942203, + "learning_rate": 6.049045502545085e-06, + "loss": 0.5003, + "step": 3570 + }, + { + "epoch": 1.73580547112462, + "grad_norm": 0.0737125659556872, + "learning_rate": 6.047173370430841e-06, + "loss": 0.5539, + "step": 3571 + }, + { + "epoch": 1.73629179331307, + "grad_norm": 0.07185820600545809, + "learning_rate": 6.045301084760513e-06, + "loss": 0.5477, + "step": 3572 + }, + { + "epoch": 1.7367781155015196, + "grad_norm": 0.07671154228843335, + "learning_rate": 6.04342864580865e-06, + "loss": 0.5778, + "step": 3573 + }, + { + "epoch": 1.7372644376899697, + "grad_norm": 0.07876861276744633, + "learning_rate": 6.041556053849825e-06, + "loss": 0.5683, + "step": 3574 + }, + { + "epoch": 1.7377507598784194, + "grad_norm": 0.07621898739787598, + "learning_rate": 6.039683309158635e-06, + "loss": 0.5414, + "step": 3575 + }, + { + "epoch": 1.7382370820668693, + "grad_norm": 0.07062539025763707, + "learning_rate": 6.037810412009693e-06, + "loss": 0.5307, + "step": 3576 + }, + { + "epoch": 1.7387234042553192, + "grad_norm": 0.08218588023469914, + "learning_rate": 6.035937362677637e-06, + "loss": 0.6059, + "step": 3577 + }, + { + "epoch": 1.739209726443769, + "grad_norm": 0.07219396593997263, + "learning_rate": 6.034064161437133e-06, + "loss": 0.5285, + "step": 3578 + }, + { + "epoch": 1.7396960486322188, + "grad_norm": 0.073513164585894, + "learning_rate": 6.032190808562861e-06, + "loss": 0.5773, + "step": 3579 + }, + { + "epoch": 1.7401823708206687, + "grad_norm": 0.07320664527212553, + "learning_rate": 6.0303173043295295e-06, + "loss": 0.5578, + "step": 3580 + }, + { + "epoch": 1.7406686930091184, + "grad_norm": 0.07710056816365062, + "learning_rate": 6.028443649011864e-06, + "loss": 0.5649, + "step": 3581 + }, + { + "epoch": 1.7411550151975685, + "grad_norm": 0.07326470935035503, + "learning_rate": 6.026569842884617e-06, + "loss": 0.5534, + "step": 3582 + }, + { + "epoch": 1.7416413373860182, + "grad_norm": 0.07197865271632267, + "learning_rate": 6.02469588622256e-06, + "loss": 0.5579, + "step": 3583 + }, + { + "epoch": 1.7421276595744681, + "grad_norm": 0.07416629554633317, + "learning_rate": 6.022821779300487e-06, + "loss": 0.5405, + "step": 3584 + }, + { + "epoch": 1.742613981762918, + "grad_norm": 0.0739177085627786, + "learning_rate": 6.020947522393214e-06, + "loss": 0.5448, + "step": 3585 + }, + { + "epoch": 1.7431003039513677, + "grad_norm": 0.07383647293083587, + "learning_rate": 6.019073115775582e-06, + "loss": 0.5629, + "step": 3586 + }, + { + "epoch": 1.7435866261398176, + "grad_norm": 0.07400571979424464, + "learning_rate": 6.017198559722451e-06, + "loss": 0.5545, + "step": 3587 + }, + { + "epoch": 1.7440729483282675, + "grad_norm": 0.07582019354540598, + "learning_rate": 6.0153238545087e-06, + "loss": 0.5519, + "step": 3588 + }, + { + "epoch": 1.7445592705167172, + "grad_norm": 0.07241105769853494, + "learning_rate": 6.013449000409236e-06, + "loss": 0.5362, + "step": 3589 + }, + { + "epoch": 1.7450455927051671, + "grad_norm": 0.07854879356795515, + "learning_rate": 6.011573997698985e-06, + "loss": 0.5927, + "step": 3590 + }, + { + "epoch": 1.745531914893617, + "grad_norm": 0.07245268974875702, + "learning_rate": 6.009698846652896e-06, + "loss": 0.5163, + "step": 3591 + }, + { + "epoch": 1.7460182370820667, + "grad_norm": 0.07329472333519695, + "learning_rate": 6.007823547545933e-06, + "loss": 0.5712, + "step": 3592 + }, + { + "epoch": 1.7465045592705168, + "grad_norm": 0.07178601716777852, + "learning_rate": 6.005948100653094e-06, + "loss": 0.5265, + "step": 3593 + }, + { + "epoch": 1.7469908814589665, + "grad_norm": 0.08049572723234834, + "learning_rate": 6.00407250624939e-06, + "loss": 0.5437, + "step": 3594 + }, + { + "epoch": 1.7474772036474164, + "grad_norm": 0.07338112673150728, + "learning_rate": 6.002196764609853e-06, + "loss": 0.5679, + "step": 3595 + }, + { + "epoch": 1.7479635258358663, + "grad_norm": 0.07032839623261679, + "learning_rate": 6.0003208760095426e-06, + "loss": 0.5332, + "step": 3596 + }, + { + "epoch": 1.748449848024316, + "grad_norm": 0.0755904426093377, + "learning_rate": 5.998444840723534e-06, + "loss": 0.5629, + "step": 3597 + }, + { + "epoch": 1.748936170212766, + "grad_norm": 0.06932283426829247, + "learning_rate": 5.996568659026929e-06, + "loss": 0.535, + "step": 3598 + }, + { + "epoch": 1.748936170212766, + "eval_loss": 0.5749828219413757, + "eval_runtime": 105.195, + "eval_samples_per_second": 288.54, + "eval_steps_per_second": 36.076, + "step": 3598 + }, + { + "epoch": 1.7494224924012158, + "grad_norm": 0.07192976175379051, + "learning_rate": 5.994692331194847e-06, + "loss": 0.5592, + "step": 3599 + }, + { + "epoch": 1.7499088145896655, + "grad_norm": 0.073295720203715, + "learning_rate": 5.99281585750243e-06, + "loss": 0.544, + "step": 3600 + }, + { + "epoch": 1.7503951367781156, + "grad_norm": 0.07101517052015995, + "learning_rate": 5.99093923822484e-06, + "loss": 0.5377, + "step": 3601 + }, + { + "epoch": 1.7508814589665653, + "grad_norm": 0.07510091411794076, + "learning_rate": 5.989062473637264e-06, + "loss": 0.5365, + "step": 3602 + }, + { + "epoch": 1.7513677811550152, + "grad_norm": 0.07245106002976794, + "learning_rate": 5.9871855640149075e-06, + "loss": 0.5404, + "step": 3603 + }, + { + "epoch": 1.7518541033434651, + "grad_norm": 0.07367707966079796, + "learning_rate": 5.985308509633e-06, + "loss": 0.5853, + "step": 3604 + }, + { + "epoch": 1.7523404255319148, + "grad_norm": 0.06952865724495874, + "learning_rate": 5.983431310766787e-06, + "loss": 0.5034, + "step": 3605 + }, + { + "epoch": 1.7528267477203647, + "grad_norm": 0.07349873704457437, + "learning_rate": 5.981553967691542e-06, + "loss": 0.5555, + "step": 3606 + }, + { + "epoch": 1.7533130699088146, + "grad_norm": 0.07096240195801282, + "learning_rate": 5.979676480682553e-06, + "loss": 0.5272, + "step": 3607 + }, + { + "epoch": 1.7537993920972643, + "grad_norm": 0.07111505280060815, + "learning_rate": 5.977798850015132e-06, + "loss": 0.5303, + "step": 3608 + }, + { + "epoch": 1.7542857142857144, + "grad_norm": 0.07267172831798531, + "learning_rate": 5.975921075964614e-06, + "loss": 0.5414, + "step": 3609 + }, + { + "epoch": 1.7547720364741641, + "grad_norm": 0.07375392374034453, + "learning_rate": 5.974043158806351e-06, + "loss": 0.5707, + "step": 3610 + }, + { + "epoch": 1.7552583586626138, + "grad_norm": 0.06945078244517708, + "learning_rate": 5.972165098815721e-06, + "loss": 0.5453, + "step": 3611 + }, + { + "epoch": 1.755744680851064, + "grad_norm": 0.07373462664172362, + "learning_rate": 5.970286896268118e-06, + "loss": 0.5334, + "step": 3612 + }, + { + "epoch": 1.7562310030395136, + "grad_norm": 0.07532147086303878, + "learning_rate": 5.968408551438963e-06, + "loss": 0.5516, + "step": 3613 + }, + { + "epoch": 1.7567173252279635, + "grad_norm": 0.07462807868060534, + "learning_rate": 5.966530064603688e-06, + "loss": 0.5612, + "step": 3614 + }, + { + "epoch": 1.7572036474164134, + "grad_norm": 0.07291028674466045, + "learning_rate": 5.964651436037756e-06, + "loss": 0.5494, + "step": 3615 + }, + { + "epoch": 1.7576899696048631, + "grad_norm": 0.07612560089523582, + "learning_rate": 5.9627726660166455e-06, + "loss": 0.5431, + "step": 3616 + }, + { + "epoch": 1.758176291793313, + "grad_norm": 0.06951500235641493, + "learning_rate": 5.960893754815855e-06, + "loss": 0.5219, + "step": 3617 + }, + { + "epoch": 1.758662613981763, + "grad_norm": 0.07313582547903942, + "learning_rate": 5.959014702710908e-06, + "loss": 0.5576, + "step": 3618 + }, + { + "epoch": 1.7591489361702126, + "grad_norm": 0.08916398621923113, + "learning_rate": 5.957135509977344e-06, + "loss": 0.5697, + "step": 3619 + }, + { + "epoch": 1.7596352583586627, + "grad_norm": 0.074255810644125, + "learning_rate": 5.955256176890728e-06, + "loss": 0.5021, + "step": 3620 + }, + { + "epoch": 1.7601215805471124, + "grad_norm": 0.07999992375481285, + "learning_rate": 5.953376703726642e-06, + "loss": 0.5713, + "step": 3621 + }, + { + "epoch": 1.7606079027355623, + "grad_norm": 0.07732857159098362, + "learning_rate": 5.951497090760687e-06, + "loss": 0.5928, + "step": 3622 + }, + { + "epoch": 1.7610942249240122, + "grad_norm": 0.0752080245612592, + "learning_rate": 5.94961733826849e-06, + "loss": 0.5581, + "step": 3623 + }, + { + "epoch": 1.761580547112462, + "grad_norm": 0.07118654165041885, + "learning_rate": 5.9477374465256936e-06, + "loss": 0.524, + "step": 3624 + }, + { + "epoch": 1.7620668693009118, + "grad_norm": 0.07787560729031419, + "learning_rate": 5.945857415807962e-06, + "loss": 0.5601, + "step": 3625 + }, + { + "epoch": 1.7625531914893617, + "grad_norm": 0.07311346398756595, + "learning_rate": 5.943977246390982e-06, + "loss": 0.5851, + "step": 3626 + }, + { + "epoch": 1.7630395136778114, + "grad_norm": 0.07133042578063493, + "learning_rate": 5.942096938550458e-06, + "loss": 0.5328, + "step": 3627 + }, + { + "epoch": 1.7635258358662615, + "grad_norm": 0.07896292882967725, + "learning_rate": 5.940216492562116e-06, + "loss": 0.6395, + "step": 3628 + }, + { + "epoch": 1.7640121580547112, + "grad_norm": 0.06927315267834584, + "learning_rate": 5.938335908701702e-06, + "loss": 0.5235, + "step": 3629 + }, + { + "epoch": 1.7644984802431611, + "grad_norm": 0.07628064400730479, + "learning_rate": 5.936455187244984e-06, + "loss": 0.5718, + "step": 3630 + }, + { + "epoch": 1.764984802431611, + "grad_norm": 0.07036770818404987, + "learning_rate": 5.934574328467746e-06, + "loss": 0.5473, + "step": 3631 + }, + { + "epoch": 1.7654711246200607, + "grad_norm": 0.07267823799718665, + "learning_rate": 5.932693332645796e-06, + "loss": 0.5633, + "step": 3632 + }, + { + "epoch": 1.7659574468085106, + "grad_norm": 0.0711508112093232, + "learning_rate": 5.930812200054959e-06, + "loss": 0.5407, + "step": 3633 + }, + { + "epoch": 1.7664437689969605, + "grad_norm": 0.08003617253679567, + "learning_rate": 5.928930930971084e-06, + "loss": 0.4988, + "step": 3634 + }, + { + "epoch": 1.7669300911854102, + "grad_norm": 0.07557166099442586, + "learning_rate": 5.927049525670036e-06, + "loss": 0.5529, + "step": 3635 + }, + { + "epoch": 1.7674164133738601, + "grad_norm": 0.07102324358656804, + "learning_rate": 5.925167984427703e-06, + "loss": 0.5742, + "step": 3636 + }, + { + "epoch": 1.76790273556231, + "grad_norm": 0.06986729522281891, + "learning_rate": 5.923286307519991e-06, + "loss": 0.5393, + "step": 3637 + }, + { + "epoch": 1.7683890577507597, + "grad_norm": 0.07094448030699813, + "learning_rate": 5.921404495222827e-06, + "loss": 0.5697, + "step": 3638 + }, + { + "epoch": 1.7688753799392098, + "grad_norm": 0.08123346878547236, + "learning_rate": 5.919522547812155e-06, + "loss": 0.5935, + "step": 3639 + }, + { + "epoch": 1.7693617021276595, + "grad_norm": 0.07338052107632764, + "learning_rate": 5.917640465563945e-06, + "loss": 0.5718, + "step": 3640 + }, + { + "epoch": 1.7698480243161094, + "grad_norm": 0.07217313260325932, + "learning_rate": 5.915758248754181e-06, + "loss": 0.5666, + "step": 3641 + }, + { + "epoch": 1.7703343465045593, + "grad_norm": 0.06905138734972223, + "learning_rate": 5.913875897658869e-06, + "loss": 0.5498, + "step": 3642 + }, + { + "epoch": 1.770820668693009, + "grad_norm": 0.06966606160607425, + "learning_rate": 5.911993412554035e-06, + "loss": 0.5329, + "step": 3643 + }, + { + "epoch": 1.771306990881459, + "grad_norm": 0.07182203077965714, + "learning_rate": 5.910110793715722e-06, + "loss": 0.5792, + "step": 3644 + }, + { + "epoch": 1.7717933130699088, + "grad_norm": 0.07238359817385392, + "learning_rate": 5.908228041419998e-06, + "loss": 0.554, + "step": 3645 + }, + { + "epoch": 1.7722796352583585, + "grad_norm": 0.0699897897107677, + "learning_rate": 5.906345155942943e-06, + "loss": 0.5559, + "step": 3646 + }, + { + "epoch": 1.7727659574468086, + "grad_norm": 0.0731887618484756, + "learning_rate": 5.904462137560664e-06, + "loss": 0.5555, + "step": 3647 + }, + { + "epoch": 1.7732522796352583, + "grad_norm": 0.07163249058896368, + "learning_rate": 5.902578986549283e-06, + "loss": 0.5366, + "step": 3648 + }, + { + "epoch": 1.7737386018237082, + "grad_norm": 0.0761682820692465, + "learning_rate": 5.900695703184944e-06, + "loss": 0.5729, + "step": 3649 + }, + { + "epoch": 1.7742249240121581, + "grad_norm": 0.07793529449567257, + "learning_rate": 5.898812287743808e-06, + "loss": 0.6167, + "step": 3650 + }, + { + "epoch": 1.7747112462006078, + "grad_norm": 0.07441850369094558, + "learning_rate": 5.896928740502057e-06, + "loss": 0.5681, + "step": 3651 + }, + { + "epoch": 1.7751975683890577, + "grad_norm": 0.07578780790005148, + "learning_rate": 5.895045061735891e-06, + "loss": 0.5767, + "step": 3652 + }, + { + "epoch": 1.7756838905775076, + "grad_norm": 0.06891701086798034, + "learning_rate": 5.8931612517215305e-06, + "loss": 0.5176, + "step": 3653 + }, + { + "epoch": 1.7761702127659573, + "grad_norm": 0.07473015452158757, + "learning_rate": 5.891277310735216e-06, + "loss": 0.5415, + "step": 3654 + }, + { + "epoch": 1.7766565349544075, + "grad_norm": 0.07022847829312584, + "learning_rate": 5.889393239053203e-06, + "loss": 0.5188, + "step": 3655 + }, + { + "epoch": 1.7771428571428571, + "grad_norm": 0.07053240306281798, + "learning_rate": 5.887509036951773e-06, + "loss": 0.545, + "step": 3656 + }, + { + "epoch": 1.777629179331307, + "grad_norm": 0.07320632766375113, + "learning_rate": 5.88562470470722e-06, + "loss": 0.5638, + "step": 3657 + }, + { + "epoch": 1.778115501519757, + "grad_norm": 0.07024946691265871, + "learning_rate": 5.883740242595862e-06, + "loss": 0.5695, + "step": 3658 + }, + { + "epoch": 1.7786018237082066, + "grad_norm": 0.07346540996701748, + "learning_rate": 5.8818556508940325e-06, + "loss": 0.5649, + "step": 3659 + }, + { + "epoch": 1.7790881458966565, + "grad_norm": 0.07255370698813807, + "learning_rate": 5.879970929878086e-06, + "loss": 0.5398, + "step": 3660 + }, + { + "epoch": 1.7795744680851064, + "grad_norm": 0.07249017333713408, + "learning_rate": 5.878086079824394e-06, + "loss": 0.5519, + "step": 3661 + }, + { + "epoch": 1.7800607902735561, + "grad_norm": 0.07320838547526598, + "learning_rate": 5.876201101009352e-06, + "loss": 0.5336, + "step": 3662 + }, + { + "epoch": 1.780547112462006, + "grad_norm": 0.07665163618450926, + "learning_rate": 5.874315993709368e-06, + "loss": 0.5972, + "step": 3663 + }, + { + "epoch": 1.781033434650456, + "grad_norm": 0.07098204771026999, + "learning_rate": 5.872430758200869e-06, + "loss": 0.5502, + "step": 3664 + }, + { + "epoch": 1.7815197568389056, + "grad_norm": 0.07232170640321002, + "learning_rate": 5.8705453947603096e-06, + "loss": 0.5335, + "step": 3665 + }, + { + "epoch": 1.7820060790273557, + "grad_norm": 0.07407242812484605, + "learning_rate": 5.868659903664152e-06, + "loss": 0.5461, + "step": 3666 + }, + { + "epoch": 1.7824924012158054, + "grad_norm": 0.07081051683619383, + "learning_rate": 5.866774285188887e-06, + "loss": 0.5322, + "step": 3667 + }, + { + "epoch": 1.7829787234042553, + "grad_norm": 0.07256050589283215, + "learning_rate": 5.8648885396110136e-06, + "loss": 0.5751, + "step": 3668 + }, + { + "epoch": 1.7834650455927052, + "grad_norm": 0.07345485356488476, + "learning_rate": 5.863002667207057e-06, + "loss": 0.5334, + "step": 3669 + }, + { + "epoch": 1.783951367781155, + "grad_norm": 0.07311652224392305, + "learning_rate": 5.861116668253559e-06, + "loss": 0.5545, + "step": 3670 + }, + { + "epoch": 1.7844376899696048, + "grad_norm": 0.07035308188288608, + "learning_rate": 5.8592305430270814e-06, + "loss": 0.5326, + "step": 3671 + }, + { + "epoch": 1.7849240121580547, + "grad_norm": 0.07337421208906444, + "learning_rate": 5.8573442918042015e-06, + "loss": 0.5399, + "step": 3672 + }, + { + "epoch": 1.7854103343465044, + "grad_norm": 0.07339199974139411, + "learning_rate": 5.855457914861515e-06, + "loss": 0.5672, + "step": 3673 + }, + { + "epoch": 1.7858966565349546, + "grad_norm": 0.07263076362552391, + "learning_rate": 5.853571412475644e-06, + "loss": 0.5894, + "step": 3674 + }, + { + "epoch": 1.7863829787234042, + "grad_norm": 0.06975837140108927, + "learning_rate": 5.851684784923215e-06, + "loss": 0.5456, + "step": 3675 + }, + { + "epoch": 1.7868693009118541, + "grad_norm": 0.07510960014348426, + "learning_rate": 5.849798032480886e-06, + "loss": 0.5819, + "step": 3676 + }, + { + "epoch": 1.787355623100304, + "grad_norm": 0.07087019549529482, + "learning_rate": 5.8479111554253235e-06, + "loss": 0.5189, + "step": 3677 + }, + { + "epoch": 1.7878419452887537, + "grad_norm": 0.07108053470474533, + "learning_rate": 5.8460241540332195e-06, + "loss": 0.541, + "step": 3678 + }, + { + "epoch": 1.7883282674772036, + "grad_norm": 0.06875864765708137, + "learning_rate": 5.84413702858128e-06, + "loss": 0.5463, + "step": 3679 + }, + { + "epoch": 1.7888145896656535, + "grad_norm": 0.07062519581399442, + "learning_rate": 5.8422497793462315e-06, + "loss": 0.5519, + "step": 3680 + }, + { + "epoch": 1.7893009118541032, + "grad_norm": 0.07193395736085764, + "learning_rate": 5.840362406604818e-06, + "loss": 0.5102, + "step": 3681 + }, + { + "epoch": 1.7897872340425534, + "grad_norm": 0.06948915889974423, + "learning_rate": 5.8384749106338e-06, + "loss": 0.542, + "step": 3682 + }, + { + "epoch": 1.790273556231003, + "grad_norm": 0.07321434746943749, + "learning_rate": 5.836587291709958e-06, + "loss": 0.5682, + "step": 3683 + }, + { + "epoch": 1.7907598784194527, + "grad_norm": 0.07430036328949104, + "learning_rate": 5.83469955011009e-06, + "loss": 0.5942, + "step": 3684 + }, + { + "epoch": 1.7912462006079028, + "grad_norm": 0.07059708000115801, + "learning_rate": 5.832811686111011e-06, + "loss": 0.5472, + "step": 3685 + }, + { + "epoch": 1.7917325227963525, + "grad_norm": 0.07171486128807007, + "learning_rate": 5.830923699989556e-06, + "loss": 0.5416, + "step": 3686 + }, + { + "epoch": 1.7922188449848024, + "grad_norm": 0.07159611497207326, + "learning_rate": 5.829035592022575e-06, + "loss": 0.564, + "step": 3687 + }, + { + "epoch": 1.7927051671732523, + "grad_norm": 0.07368568359696098, + "learning_rate": 5.82714736248694e-06, + "loss": 0.5643, + "step": 3688 + }, + { + "epoch": 1.793191489361702, + "grad_norm": 0.07437127960393947, + "learning_rate": 5.825259011659537e-06, + "loss": 0.5557, + "step": 3689 + }, + { + "epoch": 1.793677811550152, + "grad_norm": 0.07771526757703247, + "learning_rate": 5.82337053981727e-06, + "loss": 0.5891, + "step": 3690 + }, + { + "epoch": 1.7941641337386018, + "grad_norm": 0.07346763115777125, + "learning_rate": 5.821481947237066e-06, + "loss": 0.6043, + "step": 3691 + }, + { + "epoch": 1.7946504559270515, + "grad_norm": 0.07097919982578436, + "learning_rate": 5.81959323419586e-06, + "loss": 0.5244, + "step": 3692 + }, + { + "epoch": 1.7951367781155017, + "grad_norm": 0.07744988259151567, + "learning_rate": 5.817704400970615e-06, + "loss": 0.5422, + "step": 3693 + }, + { + "epoch": 1.7956231003039513, + "grad_norm": 0.07188470449476515, + "learning_rate": 5.815815447838304e-06, + "loss": 0.567, + "step": 3694 + }, + { + "epoch": 1.7961094224924012, + "grad_norm": 0.07121923239208007, + "learning_rate": 5.813926375075924e-06, + "loss": 0.5311, + "step": 3695 + }, + { + "epoch": 1.7965957446808511, + "grad_norm": 0.07422271611608093, + "learning_rate": 5.812037182960483e-06, + "loss": 0.595, + "step": 3696 + }, + { + "epoch": 1.7970820668693008, + "grad_norm": 0.06910502049231106, + "learning_rate": 5.8101478717690095e-06, + "loss": 0.5113, + "step": 3697 + }, + { + "epoch": 1.7975683890577507, + "grad_norm": 0.07607890949252384, + "learning_rate": 5.8082584417785515e-06, + "loss": 0.5644, + "step": 3698 + }, + { + "epoch": 1.7980547112462006, + "grad_norm": 0.07612833556606277, + "learning_rate": 5.806368893266171e-06, + "loss": 0.5559, + "step": 3699 + }, + { + "epoch": 1.7985410334346503, + "grad_norm": 0.07302375718789052, + "learning_rate": 5.804479226508949e-06, + "loss": 0.5568, + "step": 3700 + }, + { + "epoch": 1.7990273556231005, + "grad_norm": 0.07367495043226692, + "learning_rate": 5.8025894417839835e-06, + "loss": 0.5936, + "step": 3701 + }, + { + "epoch": 1.7995136778115501, + "grad_norm": 0.06990224779710322, + "learning_rate": 5.800699539368391e-06, + "loss": 0.5375, + "step": 3702 + }, + { + "epoch": 1.8, + "grad_norm": 0.06947119655247352, + "learning_rate": 5.798809519539302e-06, + "loss": 0.5797, + "step": 3703 + }, + { + "epoch": 1.80048632218845, + "grad_norm": 0.07224670458000126, + "learning_rate": 5.7969193825738705e-06, + "loss": 0.5622, + "step": 3704 + }, + { + "epoch": 1.8009726443768996, + "grad_norm": 0.0718322950102316, + "learning_rate": 5.795029128749261e-06, + "loss": 0.5381, + "step": 3705 + }, + { + "epoch": 1.8014589665653495, + "grad_norm": 0.07103691465981724, + "learning_rate": 5.793138758342657e-06, + "loss": 0.5354, + "step": 3706 + }, + { + "epoch": 1.8019452887537994, + "grad_norm": 0.07085149059648509, + "learning_rate": 5.79124827163126e-06, + "loss": 0.5507, + "step": 3707 + }, + { + "epoch": 1.8024316109422491, + "grad_norm": 0.0699389836394544, + "learning_rate": 5.78935766889229e-06, + "loss": 0.5547, + "step": 3708 + }, + { + "epoch": 1.802917933130699, + "grad_norm": 0.07145536821188114, + "learning_rate": 5.7874669504029825e-06, + "loss": 0.555, + "step": 3709 + }, + { + "epoch": 1.803404255319149, + "grad_norm": 0.0733717470584871, + "learning_rate": 5.785576116440586e-06, + "loss": 0.5703, + "step": 3710 + }, + { + "epoch": 1.8038905775075986, + "grad_norm": 0.06988065584928621, + "learning_rate": 5.783685167282376e-06, + "loss": 0.5628, + "step": 3711 + }, + { + "epoch": 1.8043768996960488, + "grad_norm": 0.07332804898486768, + "learning_rate": 5.781794103205633e-06, + "loss": 0.5527, + "step": 3712 + }, + { + "epoch": 1.8048632218844984, + "grad_norm": 0.07534552990260916, + "learning_rate": 5.779902924487666e-06, + "loss": 0.572, + "step": 3713 + }, + { + "epoch": 1.8053495440729483, + "grad_norm": 0.1380323192966416, + "learning_rate": 5.77801163140579e-06, + "loss": 0.5342, + "step": 3714 + }, + { + "epoch": 1.8058358662613982, + "grad_norm": 0.07387432136816943, + "learning_rate": 5.776120224237343e-06, + "loss": 0.584, + "step": 3715 + }, + { + "epoch": 1.806322188449848, + "grad_norm": 0.07457195188960578, + "learning_rate": 5.774228703259678e-06, + "loss": 0.5578, + "step": 3716 + }, + { + "epoch": 1.8068085106382978, + "grad_norm": 0.07214637870674079, + "learning_rate": 5.772337068750165e-06, + "loss": 0.5613, + "step": 3717 + }, + { + "epoch": 1.8072948328267477, + "grad_norm": 0.07317412182570897, + "learning_rate": 5.770445320986194e-06, + "loss": 0.5901, + "step": 3718 + }, + { + "epoch": 1.8077811550151974, + "grad_norm": 0.07140099143671426, + "learning_rate": 5.768553460245162e-06, + "loss": 0.5413, + "step": 3719 + }, + { + "epoch": 1.8082674772036476, + "grad_norm": 0.07249968798442306, + "learning_rate": 5.766661486804495e-06, + "loss": 0.5444, + "step": 3720 + }, + { + "epoch": 1.8087537993920972, + "grad_norm": 0.07363267939840673, + "learning_rate": 5.7647694009416264e-06, + "loss": 0.5377, + "step": 3721 + }, + { + "epoch": 1.8092401215805471, + "grad_norm": 0.07498413465427893, + "learning_rate": 5.762877202934011e-06, + "loss": 0.5548, + "step": 3722 + }, + { + "epoch": 1.809726443768997, + "grad_norm": 0.06991658062181326, + "learning_rate": 5.760984893059115e-06, + "loss": 0.5399, + "step": 3723 + }, + { + "epoch": 1.8102127659574467, + "grad_norm": 0.06972887924205033, + "learning_rate": 5.7590924715944265e-06, + "loss": 0.5588, + "step": 3724 + }, + { + "epoch": 1.8106990881458966, + "grad_norm": 0.0740470483094124, + "learning_rate": 5.757199938817447e-06, + "loss": 0.5509, + "step": 3725 + }, + { + "epoch": 1.8111854103343465, + "grad_norm": 0.07400547939704079, + "learning_rate": 5.755307295005695e-06, + "loss": 0.5589, + "step": 3726 + }, + { + "epoch": 1.8116717325227962, + "grad_norm": 0.07134953028119086, + "learning_rate": 5.753414540436706e-06, + "loss": 0.5381, + "step": 3727 + }, + { + "epoch": 1.8121580547112464, + "grad_norm": 0.07369445984870336, + "learning_rate": 5.75152167538803e-06, + "loss": 0.5627, + "step": 3728 + }, + { + "epoch": 1.812644376899696, + "grad_norm": 0.07171313402278903, + "learning_rate": 5.749628700137234e-06, + "loss": 0.5558, + "step": 3729 + }, + { + "epoch": 1.813130699088146, + "grad_norm": 0.06949271634278949, + "learning_rate": 5.747735614961902e-06, + "loss": 0.5295, + "step": 3730 + }, + { + "epoch": 1.8136170212765959, + "grad_norm": 0.07149059892565086, + "learning_rate": 5.745842420139632e-06, + "loss": 0.5508, + "step": 3731 + }, + { + "epoch": 1.8141033434650455, + "grad_norm": 0.0726401694256946, + "learning_rate": 5.743949115948042e-06, + "loss": 0.579, + "step": 3732 + }, + { + "epoch": 1.8145896656534954, + "grad_norm": 0.07788523135051495, + "learning_rate": 5.7420557026647625e-06, + "loss": 0.6107, + "step": 3733 + }, + { + "epoch": 1.8150759878419453, + "grad_norm": 0.07157641394166475, + "learning_rate": 5.74016218056744e-06, + "loss": 0.5161, + "step": 3734 + }, + { + "epoch": 1.815562310030395, + "grad_norm": 0.06893289553292144, + "learning_rate": 5.7382685499337385e-06, + "loss": 0.5069, + "step": 3735 + }, + { + "epoch": 1.816048632218845, + "grad_norm": 0.07109600112504161, + "learning_rate": 5.736374811041339e-06, + "loss": 0.5006, + "step": 3736 + }, + { + "epoch": 1.8165349544072948, + "grad_norm": 0.07124248629780326, + "learning_rate": 5.734480964167935e-06, + "loss": 0.5433, + "step": 3737 + }, + { + "epoch": 1.8170212765957445, + "grad_norm": 0.07473931891488424, + "learning_rate": 5.732587009591238e-06, + "loss": 0.5227, + "step": 3738 + }, + { + "epoch": 1.8175075987841947, + "grad_norm": 0.07204180645130044, + "learning_rate": 5.730692947588975e-06, + "loss": 0.5328, + "step": 3739 + }, + { + "epoch": 1.8179939209726443, + "grad_norm": 0.07376308697607588, + "learning_rate": 5.728798778438889e-06, + "loss": 0.5457, + "step": 3740 + }, + { + "epoch": 1.8184802431610942, + "grad_norm": 0.07001674613205891, + "learning_rate": 5.726904502418739e-06, + "loss": 0.5567, + "step": 3741 + }, + { + "epoch": 1.8189665653495442, + "grad_norm": 0.07234234515741456, + "learning_rate": 5.725010119806297e-06, + "loss": 0.5477, + "step": 3742 + }, + { + "epoch": 1.8194528875379938, + "grad_norm": 0.07624441123895309, + "learning_rate": 5.7231156308793545e-06, + "loss": 0.5701, + "step": 3743 + }, + { + "epoch": 1.8199392097264437, + "grad_norm": 0.07352657802771531, + "learning_rate": 5.721221035915717e-06, + "loss": 0.5587, + "step": 3744 + }, + { + "epoch": 1.8204255319148936, + "grad_norm": 0.07124527335189594, + "learning_rate": 5.719326335193204e-06, + "loss": 0.5657, + "step": 3745 + }, + { + "epoch": 1.8209118541033433, + "grad_norm": 0.07074566160162261, + "learning_rate": 5.717431528989651e-06, + "loss": 0.5451, + "step": 3746 + }, + { + "epoch": 1.8213981762917935, + "grad_norm": 0.07446677937679787, + "learning_rate": 5.715536617582913e-06, + "loss": 0.594, + "step": 3747 + }, + { + "epoch": 1.8218844984802431, + "grad_norm": 0.07478715029140917, + "learning_rate": 5.713641601250854e-06, + "loss": 0.5097, + "step": 3748 + }, + { + "epoch": 1.822370820668693, + "grad_norm": 0.06976310538342256, + "learning_rate": 5.71174648027136e-06, + "loss": 0.4994, + "step": 3749 + }, + { + "epoch": 1.822857142857143, + "grad_norm": 0.07248474608070116, + "learning_rate": 5.709851254922326e-06, + "loss": 0.536, + "step": 3750 + }, + { + "epoch": 1.8233434650455926, + "grad_norm": 0.07085216066460785, + "learning_rate": 5.7079559254816665e-06, + "loss": 0.5359, + "step": 3751 + }, + { + "epoch": 1.8238297872340425, + "grad_norm": 0.07000847837824252, + "learning_rate": 5.706060492227311e-06, + "loss": 0.5324, + "step": 3752 + }, + { + "epoch": 1.8243161094224924, + "grad_norm": 0.07080978614683664, + "learning_rate": 5.7041649554372015e-06, + "loss": 0.5622, + "step": 3753 + }, + { + "epoch": 1.8248024316109421, + "grad_norm": 0.07212048374871464, + "learning_rate": 5.702269315389296e-06, + "loss": 0.5169, + "step": 3754 + }, + { + "epoch": 1.8252887537993923, + "grad_norm": 0.07374951233787296, + "learning_rate": 5.70037357236157e-06, + "loss": 0.5767, + "step": 3755 + }, + { + "epoch": 1.825775075987842, + "grad_norm": 0.06737529434869072, + "learning_rate": 5.698477726632015e-06, + "loss": 0.5066, + "step": 3756 + }, + { + "epoch": 1.8262613981762916, + "grad_norm": 0.06982984660697597, + "learning_rate": 5.6965817784786325e-06, + "loss": 0.5388, + "step": 3757 + }, + { + "epoch": 1.8267477203647418, + "grad_norm": 0.0712839524633681, + "learning_rate": 5.694685728179442e-06, + "loss": 0.5457, + "step": 3758 + }, + { + "epoch": 1.8272340425531914, + "grad_norm": 0.07475472581807995, + "learning_rate": 5.69278957601248e-06, + "loss": 0.557, + "step": 3759 + }, + { + "epoch": 1.8277203647416413, + "grad_norm": 0.07384993449774267, + "learning_rate": 5.690893322255791e-06, + "loss": 0.5712, + "step": 3760 + }, + { + "epoch": 1.8282066869300913, + "grad_norm": 0.07239955692087521, + "learning_rate": 5.688996967187445e-06, + "loss": 0.5363, + "step": 3761 + }, + { + "epoch": 1.828693009118541, + "grad_norm": 0.06774496828657123, + "learning_rate": 5.687100511085515e-06, + "loss": 0.488, + "step": 3762 + }, + { + "epoch": 1.8291793313069908, + "grad_norm": 0.0725479493409849, + "learning_rate": 5.685203954228099e-06, + "loss": 0.5591, + "step": 3763 + }, + { + "epoch": 1.8296656534954407, + "grad_norm": 0.07129933692150062, + "learning_rate": 5.683307296893303e-06, + "loss": 0.525, + "step": 3764 + }, + { + "epoch": 1.8301519756838904, + "grad_norm": 0.0740819613134729, + "learning_rate": 5.681410539359251e-06, + "loss": 0.5974, + "step": 3765 + }, + { + "epoch": 1.8306382978723406, + "grad_norm": 0.06792062074025114, + "learning_rate": 5.679513681904084e-06, + "loss": 0.5468, + "step": 3766 + }, + { + "epoch": 1.8311246200607902, + "grad_norm": 0.07084457779575046, + "learning_rate": 5.67761672480595e-06, + "loss": 0.5432, + "step": 3767 + }, + { + "epoch": 1.8316109422492401, + "grad_norm": 0.07157840213834797, + "learning_rate": 5.675719668343019e-06, + "loss": 0.5447, + "step": 3768 + }, + { + "epoch": 1.83209726443769, + "grad_norm": 0.07330750788051905, + "learning_rate": 5.673822512793471e-06, + "loss": 0.5874, + "step": 3769 + }, + { + "epoch": 1.8325835866261397, + "grad_norm": 0.07112420469894742, + "learning_rate": 5.671925258435504e-06, + "loss": 0.4942, + "step": 3770 + }, + { + "epoch": 1.8330699088145896, + "grad_norm": 0.06948498376476693, + "learning_rate": 5.670027905547329e-06, + "loss": 0.5788, + "step": 3771 + }, + { + "epoch": 1.8335562310030395, + "grad_norm": 0.0717781120787052, + "learning_rate": 5.668130454407168e-06, + "loss": 0.5593, + "step": 3772 + }, + { + "epoch": 1.8340425531914892, + "grad_norm": 0.07215707676776603, + "learning_rate": 5.666232905293263e-06, + "loss": 0.5677, + "step": 3773 + }, + { + "epoch": 1.8345288753799394, + "grad_norm": 0.06992043349783852, + "learning_rate": 5.664335258483871e-06, + "loss": 0.5693, + "step": 3774 + }, + { + "epoch": 1.835015197568389, + "grad_norm": 0.06998043413851437, + "learning_rate": 5.6624375142572555e-06, + "loss": 0.5422, + "step": 3775 + }, + { + "epoch": 1.835501519756839, + "grad_norm": 0.07438282662720953, + "learning_rate": 5.6605396728917006e-06, + "loss": 0.5562, + "step": 3776 + }, + { + "epoch": 1.8359878419452889, + "grad_norm": 0.06952341201837078, + "learning_rate": 5.658641734665503e-06, + "loss": 0.5517, + "step": 3777 + }, + { + "epoch": 1.8364741641337385, + "grad_norm": 0.07031362325325231, + "learning_rate": 5.656743699856976e-06, + "loss": 0.5401, + "step": 3778 + }, + { + "epoch": 1.8369604863221884, + "grad_norm": 0.06846355226164075, + "learning_rate": 5.654845568744443e-06, + "loss": 0.5158, + "step": 3779 + }, + { + "epoch": 1.8374468085106384, + "grad_norm": 0.069317791282025, + "learning_rate": 5.652947341606243e-06, + "loss": 0.5426, + "step": 3780 + }, + { + "epoch": 1.837933130699088, + "grad_norm": 0.07102378330650222, + "learning_rate": 5.65104901872073e-06, + "loss": 0.5823, + "step": 3781 + }, + { + "epoch": 1.838419452887538, + "grad_norm": 0.06903938789067822, + "learning_rate": 5.649150600366272e-06, + "loss": 0.5286, + "step": 3782 + }, + { + "epoch": 1.8389057750759878, + "grad_norm": 0.07252347180331538, + "learning_rate": 5.64725208682125e-06, + "loss": 0.5455, + "step": 3783 + }, + { + "epoch": 1.8393920972644375, + "grad_norm": 0.07207477062229378, + "learning_rate": 5.645353478364059e-06, + "loss": 0.5556, + "step": 3784 + }, + { + "epoch": 1.8398784194528877, + "grad_norm": 0.0710066071493234, + "learning_rate": 5.64345477527311e-06, + "loss": 0.5733, + "step": 3785 + }, + { + "epoch": 1.8403647416413373, + "grad_norm": 0.07240032700807762, + "learning_rate": 5.641555977826824e-06, + "loss": 0.5895, + "step": 3786 + }, + { + "epoch": 1.8408510638297872, + "grad_norm": 0.07330265381642612, + "learning_rate": 5.639657086303639e-06, + "loss": 0.5504, + "step": 3787 + }, + { + "epoch": 1.8413373860182372, + "grad_norm": 0.07176866909404722, + "learning_rate": 5.637758100982007e-06, + "loss": 0.5532, + "step": 3788 + }, + { + "epoch": 1.8418237082066868, + "grad_norm": 0.07343934482235806, + "learning_rate": 5.635859022140391e-06, + "loss": 0.55, + "step": 3789 + }, + { + "epoch": 1.8423100303951367, + "grad_norm": 0.07001562238906066, + "learning_rate": 5.633959850057271e-06, + "loss": 0.5451, + "step": 3790 + }, + { + "epoch": 1.8427963525835866, + "grad_norm": 0.07117624050553056, + "learning_rate": 5.632060585011138e-06, + "loss": 0.5419, + "step": 3791 + }, + { + "epoch": 1.8432826747720363, + "grad_norm": 0.07420003163647222, + "learning_rate": 5.630161227280496e-06, + "loss": 0.6141, + "step": 3792 + }, + { + "epoch": 1.8437689969604865, + "grad_norm": 0.07046916735273435, + "learning_rate": 5.628261777143867e-06, + "loss": 0.5335, + "step": 3793 + }, + { + "epoch": 1.8442553191489361, + "grad_norm": 0.06949081978361181, + "learning_rate": 5.626362234879783e-06, + "loss": 0.5251, + "step": 3794 + }, + { + "epoch": 1.844741641337386, + "grad_norm": 0.06894200071003406, + "learning_rate": 5.62446260076679e-06, + "loss": 0.5451, + "step": 3795 + }, + { + "epoch": 1.845227963525836, + "grad_norm": 0.07145977770879011, + "learning_rate": 5.622562875083448e-06, + "loss": 0.5753, + "step": 3796 + }, + { + "epoch": 1.8457142857142856, + "grad_norm": 0.07112706181136323, + "learning_rate": 5.620663058108331e-06, + "loss": 0.5631, + "step": 3797 + }, + { + "epoch": 1.8462006079027355, + "grad_norm": 0.06995478302103418, + "learning_rate": 5.618763150120024e-06, + "loss": 0.5471, + "step": 3798 + }, + { + "epoch": 1.8466869300911855, + "grad_norm": 0.06914015464339193, + "learning_rate": 5.616863151397127e-06, + "loss": 0.5406, + "step": 3799 + }, + { + "epoch": 1.8471732522796351, + "grad_norm": 0.06943456983708265, + "learning_rate": 5.614963062218253e-06, + "loss": 0.5267, + "step": 3800 + }, + { + "epoch": 1.8476595744680853, + "grad_norm": 0.07191017173048178, + "learning_rate": 5.61306288286203e-06, + "loss": 0.5529, + "step": 3801 + }, + { + "epoch": 1.848145896656535, + "grad_norm": 0.07494122014165715, + "learning_rate": 5.611162613607098e-06, + "loss": 0.5526, + "step": 3802 + }, + { + "epoch": 1.8486322188449849, + "grad_norm": 0.07197727781320475, + "learning_rate": 5.609262254732107e-06, + "loss": 0.5362, + "step": 3803 + }, + { + "epoch": 1.8491185410334348, + "grad_norm": 0.07479025691542991, + "learning_rate": 5.607361806515727e-06, + "loss": 0.5332, + "step": 3804 + }, + { + "epoch": 1.8496048632218844, + "grad_norm": 0.07023181417435177, + "learning_rate": 5.605461269236635e-06, + "loss": 0.5834, + "step": 3805 + }, + { + "epoch": 1.8500911854103343, + "grad_norm": 0.06995680724069142, + "learning_rate": 5.603560643173522e-06, + "loss": 0.538, + "step": 3806 + }, + { + "epoch": 1.8505775075987843, + "grad_norm": 0.07053583060796019, + "learning_rate": 5.601659928605095e-06, + "loss": 0.52, + "step": 3807 + }, + { + "epoch": 1.851063829787234, + "grad_norm": 0.07227349280471497, + "learning_rate": 5.599759125810073e-06, + "loss": 0.5759, + "step": 3808 + }, + { + "epoch": 1.8515501519756838, + "grad_norm": 0.07273850793386738, + "learning_rate": 5.597858235067184e-06, + "loss": 0.5327, + "step": 3809 + }, + { + "epoch": 1.8520364741641338, + "grad_norm": 0.07037922265494392, + "learning_rate": 5.595957256655174e-06, + "loss": 0.5488, + "step": 3810 + }, + { + "epoch": 1.8525227963525834, + "grad_norm": 0.06965936993655833, + "learning_rate": 5.594056190852801e-06, + "loss": 0.5164, + "step": 3811 + }, + { + "epoch": 1.8530091185410336, + "grad_norm": 0.07030335031873611, + "learning_rate": 5.592155037938834e-06, + "loss": 0.5114, + "step": 3812 + }, + { + "epoch": 1.8534954407294832, + "grad_norm": 0.07190786810614103, + "learning_rate": 5.5902537981920545e-06, + "loss": 0.5821, + "step": 3813 + }, + { + "epoch": 1.8539817629179332, + "grad_norm": 0.07535660040517028, + "learning_rate": 5.588352471891259e-06, + "loss": 0.613, + "step": 3814 + }, + { + "epoch": 1.854468085106383, + "grad_norm": 0.07413350038775467, + "learning_rate": 5.586451059315253e-06, + "loss": 0.5684, + "step": 3815 + }, + { + "epoch": 1.8549544072948327, + "grad_norm": 0.07270456231481043, + "learning_rate": 5.584549560742859e-06, + "loss": 0.5974, + "step": 3816 + }, + { + "epoch": 1.8554407294832826, + "grad_norm": 0.07168856657316058, + "learning_rate": 5.58264797645291e-06, + "loss": 0.54, + "step": 3817 + }, + { + "epoch": 1.8559270516717326, + "grad_norm": 0.06948602461810775, + "learning_rate": 5.580746306724252e-06, + "loss": 0.5237, + "step": 3818 + }, + { + "epoch": 1.8564133738601822, + "grad_norm": 0.0697832717925343, + "learning_rate": 5.578844551835742e-06, + "loss": 0.5509, + "step": 3819 + }, + { + "epoch": 1.8568996960486324, + "grad_norm": 0.0711256778625363, + "learning_rate": 5.576942712066255e-06, + "loss": 0.5429, + "step": 3820 + }, + { + "epoch": 1.857386018237082, + "grad_norm": 0.07462287816350456, + "learning_rate": 5.575040787694668e-06, + "loss": 0.5684, + "step": 3821 + }, + { + "epoch": 1.857872340425532, + "grad_norm": 0.07207751331406778, + "learning_rate": 5.57313877899988e-06, + "loss": 0.5492, + "step": 3822 + }, + { + "epoch": 1.8583586626139819, + "grad_norm": 0.07470260184550374, + "learning_rate": 5.571236686260798e-06, + "loss": 0.5506, + "step": 3823 + }, + { + "epoch": 1.8588449848024315, + "grad_norm": 0.07352014608781392, + "learning_rate": 5.569334509756344e-06, + "loss": 0.5639, + "step": 3824 + }, + { + "epoch": 1.8593313069908814, + "grad_norm": 0.0706346220437522, + "learning_rate": 5.567432249765449e-06, + "loss": 0.5165, + "step": 3825 + }, + { + "epoch": 1.8598176291793314, + "grad_norm": 0.07151119129191524, + "learning_rate": 5.565529906567057e-06, + "loss": 0.5969, + "step": 3826 + }, + { + "epoch": 1.860303951367781, + "grad_norm": 0.06985574004158678, + "learning_rate": 5.563627480440127e-06, + "loss": 0.5326, + "step": 3827 + }, + { + "epoch": 1.8607902735562312, + "grad_norm": 0.07580449354122563, + "learning_rate": 5.561724971663628e-06, + "loss": 0.5668, + "step": 3828 + }, + { + "epoch": 1.8612765957446809, + "grad_norm": 0.07302002817660541, + "learning_rate": 5.559822380516539e-06, + "loss": 0.5452, + "step": 3829 + }, + { + "epoch": 1.8617629179331305, + "grad_norm": 0.0723112034166102, + "learning_rate": 5.557919707277857e-06, + "loss": 0.554, + "step": 3830 + }, + { + "epoch": 1.8622492401215807, + "grad_norm": 0.07250965581229067, + "learning_rate": 5.556016952226585e-06, + "loss": 0.5478, + "step": 3831 + }, + { + "epoch": 1.8627355623100303, + "grad_norm": 0.0712328543900115, + "learning_rate": 5.554114115641741e-06, + "loss": 0.5669, + "step": 3832 + }, + { + "epoch": 1.8632218844984803, + "grad_norm": 0.07334487483474895, + "learning_rate": 5.552211197802354e-06, + "loss": 0.5403, + "step": 3833 + }, + { + "epoch": 1.8637082066869302, + "grad_norm": 0.07367820153807855, + "learning_rate": 5.550308198987466e-06, + "loss": 0.5349, + "step": 3834 + }, + { + "epoch": 1.8641945288753798, + "grad_norm": 0.0703755107906746, + "learning_rate": 5.548405119476129e-06, + "loss": 0.5579, + "step": 3835 + }, + { + "epoch": 1.8646808510638297, + "grad_norm": 0.07094734124585628, + "learning_rate": 5.546501959547411e-06, + "loss": 0.5341, + "step": 3836 + }, + { + "epoch": 1.8651671732522797, + "grad_norm": 0.07282036499949628, + "learning_rate": 5.544598719480383e-06, + "loss": 0.5382, + "step": 3837 + }, + { + "epoch": 1.8656534954407293, + "grad_norm": 0.07447166786091301, + "learning_rate": 5.54269539955414e-06, + "loss": 0.5417, + "step": 3838 + }, + { + "epoch": 1.8661398176291795, + "grad_norm": 0.0751788995888351, + "learning_rate": 5.540792000047778e-06, + "loss": 0.5502, + "step": 3839 + }, + { + "epoch": 1.8666261398176291, + "grad_norm": 0.06961499050080971, + "learning_rate": 5.538888521240411e-06, + "loss": 0.5282, + "step": 3840 + }, + { + "epoch": 1.867112462006079, + "grad_norm": 0.073412480596751, + "learning_rate": 5.53698496341116e-06, + "loss": 0.5475, + "step": 3841 + }, + { + "epoch": 1.867598784194529, + "grad_norm": 0.06985806140808806, + "learning_rate": 5.535081326839165e-06, + "loss": 0.527, + "step": 3842 + }, + { + "epoch": 1.8680851063829786, + "grad_norm": 0.07483264512773195, + "learning_rate": 5.5331776118035675e-06, + "loss": 0.5569, + "step": 3843 + }, + { + "epoch": 1.8685714285714285, + "grad_norm": 0.06884638658368249, + "learning_rate": 5.53127381858353e-06, + "loss": 0.5407, + "step": 3844 + }, + { + "epoch": 1.8690577507598785, + "grad_norm": 0.07308863350731348, + "learning_rate": 5.529369947458219e-06, + "loss": 0.5305, + "step": 3845 + }, + { + "epoch": 1.8695440729483281, + "grad_norm": 0.07345115465989771, + "learning_rate": 5.527465998706815e-06, + "loss": 0.527, + "step": 3846 + }, + { + "epoch": 1.8700303951367783, + "grad_norm": 0.07044998523833643, + "learning_rate": 5.525561972608513e-06, + "loss": 0.5322, + "step": 3847 + }, + { + "epoch": 1.870516717325228, + "grad_norm": 0.07408426800749078, + "learning_rate": 5.523657869442516e-06, + "loss": 0.5883, + "step": 3848 + }, + { + "epoch": 1.8710030395136779, + "grad_norm": 0.07356293128805845, + "learning_rate": 5.521753689488039e-06, + "loss": 0.5475, + "step": 3849 + }, + { + "epoch": 1.8714893617021278, + "grad_norm": 0.08080229590294455, + "learning_rate": 5.519849433024308e-06, + "loss": 0.584, + "step": 3850 + }, + { + "epoch": 1.8719756838905774, + "grad_norm": 0.07829341159558895, + "learning_rate": 5.517945100330563e-06, + "loss": 0.5597, + "step": 3851 + }, + { + "epoch": 1.8724620060790274, + "grad_norm": 0.07113114856750816, + "learning_rate": 5.516040691686049e-06, + "loss": 0.5604, + "step": 3852 + }, + { + "epoch": 1.8729483282674773, + "grad_norm": 0.07140712557824944, + "learning_rate": 5.514136207370026e-06, + "loss": 0.5548, + "step": 3853 + }, + { + "epoch": 1.873434650455927, + "grad_norm": 0.06989501222149999, + "learning_rate": 5.512231647661769e-06, + "loss": 0.5362, + "step": 3854 + }, + { + "epoch": 1.8739209726443768, + "grad_norm": 0.07006989477905902, + "learning_rate": 5.510327012840556e-06, + "loss": 0.5465, + "step": 3855 + }, + { + "epoch": 1.8744072948328268, + "grad_norm": 0.07101153343172435, + "learning_rate": 5.508422303185682e-06, + "loss": 0.5601, + "step": 3856 + }, + { + "epoch": 1.8748936170212764, + "grad_norm": 0.07072906848524038, + "learning_rate": 5.506517518976452e-06, + "loss": 0.5176, + "step": 3857 + }, + { + "epoch": 1.8753799392097266, + "grad_norm": 0.07042125266705732, + "learning_rate": 5.50461266049218e-06, + "loss": 0.56, + "step": 3858 + }, + { + "epoch": 1.8758662613981762, + "grad_norm": 0.0791804688202137, + "learning_rate": 5.502707728012191e-06, + "loss": 0.5806, + "step": 3859 + }, + { + "epoch": 1.8763525835866262, + "grad_norm": 0.0739118879721059, + "learning_rate": 5.500802721815821e-06, + "loss": 0.5812, + "step": 3860 + }, + { + "epoch": 1.876838905775076, + "grad_norm": 0.07219821284102616, + "learning_rate": 5.49889764218242e-06, + "loss": 0.5675, + "step": 3861 + }, + { + "epoch": 1.8773252279635257, + "grad_norm": 0.07220254355889123, + "learning_rate": 5.496992489391345e-06, + "loss": 0.5562, + "step": 3862 + }, + { + "epoch": 1.8778115501519757, + "grad_norm": 0.07156920075813895, + "learning_rate": 5.495087263721965e-06, + "loss": 0.5627, + "step": 3863 + }, + { + "epoch": 1.8782978723404256, + "grad_norm": 0.07315547986518335, + "learning_rate": 5.493181965453659e-06, + "loss": 0.5747, + "step": 3864 + }, + { + "epoch": 1.8787841945288752, + "grad_norm": 0.07303839541450867, + "learning_rate": 5.491276594865818e-06, + "loss": 0.5846, + "step": 3865 + }, + { + "epoch": 1.8792705167173254, + "grad_norm": 0.07185358493666952, + "learning_rate": 5.489371152237847e-06, + "loss": 0.5485, + "step": 3866 + }, + { + "epoch": 1.879756838905775, + "grad_norm": 0.07236494540245907, + "learning_rate": 5.487465637849151e-06, + "loss": 0.5835, + "step": 3867 + }, + { + "epoch": 1.880243161094225, + "grad_norm": 0.07536297807903382, + "learning_rate": 5.4855600519791545e-06, + "loss": 0.5499, + "step": 3868 + }, + { + "epoch": 1.8807294832826749, + "grad_norm": 0.07329774304125254, + "learning_rate": 5.483654394907291e-06, + "loss": 0.5462, + "step": 3869 + }, + { + "epoch": 1.8812158054711245, + "grad_norm": 0.06997923564090663, + "learning_rate": 5.481748666913001e-06, + "loss": 0.5306, + "step": 3870 + }, + { + "epoch": 1.8817021276595745, + "grad_norm": 0.07844534972760336, + "learning_rate": 5.479842868275742e-06, + "loss": 0.5799, + "step": 3871 + }, + { + "epoch": 1.8821884498480244, + "grad_norm": 0.07270346625623395, + "learning_rate": 5.477936999274975e-06, + "loss": 0.5848, + "step": 3872 + }, + { + "epoch": 1.882674772036474, + "grad_norm": 0.07006362676995413, + "learning_rate": 5.476031060190173e-06, + "loss": 0.5212, + "step": 3873 + }, + { + "epoch": 1.8831610942249242, + "grad_norm": 0.07100012373998063, + "learning_rate": 5.474125051300821e-06, + "loss": 0.5357, + "step": 3874 + }, + { + "epoch": 1.8836474164133739, + "grad_norm": 0.07592269995110867, + "learning_rate": 5.472218972886416e-06, + "loss": 0.5273, + "step": 3875 + }, + { + "epoch": 1.8841337386018238, + "grad_norm": 0.06928587250916836, + "learning_rate": 5.470312825226461e-06, + "loss": 0.559, + "step": 3876 + }, + { + "epoch": 1.8846200607902737, + "grad_norm": 0.07456925756944746, + "learning_rate": 5.46840660860047e-06, + "loss": 0.5562, + "step": 3877 + }, + { + "epoch": 1.8851063829787233, + "grad_norm": 0.07021559134295732, + "learning_rate": 5.46650032328797e-06, + "loss": 0.5351, + "step": 3878 + }, + { + "epoch": 1.8855927051671733, + "grad_norm": 0.07018384447802946, + "learning_rate": 5.464593969568494e-06, + "loss": 0.5189, + "step": 3879 + }, + { + "epoch": 1.8860790273556232, + "grad_norm": 0.06838130255211024, + "learning_rate": 5.46268754772159e-06, + "loss": 0.5484, + "step": 3880 + }, + { + "epoch": 1.8865653495440728, + "grad_norm": 0.07267231207550441, + "learning_rate": 5.4607810580268094e-06, + "loss": 0.5723, + "step": 3881 + }, + { + "epoch": 1.8870516717325228, + "grad_norm": 0.07126094801636974, + "learning_rate": 5.45887450076372e-06, + "loss": 0.568, + "step": 3882 + }, + { + "epoch": 1.8875379939209727, + "grad_norm": 0.0704742186439123, + "learning_rate": 5.456967876211896e-06, + "loss": 0.5557, + "step": 3883 + }, + { + "epoch": 1.8880243161094223, + "grad_norm": 0.07126629149481976, + "learning_rate": 5.455061184650921e-06, + "loss": 0.5805, + "step": 3884 + }, + { + "epoch": 1.8885106382978725, + "grad_norm": 0.07299989314675018, + "learning_rate": 5.453154426360393e-06, + "loss": 0.5858, + "step": 3885 + }, + { + "epoch": 1.8889969604863222, + "grad_norm": 0.07049872375213509, + "learning_rate": 5.451247601619913e-06, + "loss": 0.5403, + "step": 3886 + }, + { + "epoch": 1.889483282674772, + "grad_norm": 0.07591691887747315, + "learning_rate": 5.449340710709097e-06, + "loss": 0.5481, + "step": 3887 + }, + { + "epoch": 1.889969604863222, + "grad_norm": 0.07091248843228551, + "learning_rate": 5.4474337539075675e-06, + "loss": 0.5561, + "step": 3888 + }, + { + "epoch": 1.8904559270516716, + "grad_norm": 0.07358419989721389, + "learning_rate": 5.445526731494959e-06, + "loss": 0.5815, + "step": 3889 + }, + { + "epoch": 1.8909422492401216, + "grad_norm": 0.07069980987976046, + "learning_rate": 5.443619643750916e-06, + "loss": 0.5442, + "step": 3890 + }, + { + "epoch": 1.8914285714285715, + "grad_norm": 0.07208274573910803, + "learning_rate": 5.441712490955088e-06, + "loss": 0.593, + "step": 3891 + }, + { + "epoch": 1.8919148936170211, + "grad_norm": 0.06985344054806343, + "learning_rate": 5.43980527338714e-06, + "loss": 0.5956, + "step": 3892 + }, + { + "epoch": 1.8924012158054713, + "grad_norm": 0.07245029698248719, + "learning_rate": 5.437897991326743e-06, + "loss": 0.5517, + "step": 3893 + }, + { + "epoch": 1.892887537993921, + "grad_norm": 0.07317693205661047, + "learning_rate": 5.435990645053578e-06, + "loss": 0.5419, + "step": 3894 + }, + { + "epoch": 1.8933738601823709, + "grad_norm": 0.07464253102269856, + "learning_rate": 5.434083234847336e-06, + "loss": 0.5417, + "step": 3895 + }, + { + "epoch": 1.8938601823708208, + "grad_norm": 0.07301445302319524, + "learning_rate": 5.432175760987717e-06, + "loss": 0.5015, + "step": 3896 + }, + { + "epoch": 1.8943465045592704, + "grad_norm": 0.07070000351914066, + "learning_rate": 5.430268223754431e-06, + "loss": 0.5716, + "step": 3897 + }, + { + "epoch": 1.8948328267477204, + "grad_norm": 0.07216905273639297, + "learning_rate": 5.4283606234271955e-06, + "loss": 0.542, + "step": 3898 + }, + { + "epoch": 1.8953191489361703, + "grad_norm": 0.07173351593003321, + "learning_rate": 5.42645296028574e-06, + "loss": 0.5351, + "step": 3899 + }, + { + "epoch": 1.89580547112462, + "grad_norm": 0.07149455964894896, + "learning_rate": 5.424545234609798e-06, + "loss": 0.5778, + "step": 3900 + }, + { + "epoch": 1.89629179331307, + "grad_norm": 0.07425694531189812, + "learning_rate": 5.42263744667912e-06, + "loss": 0.5402, + "step": 3901 + }, + { + "epoch": 1.8967781155015198, + "grad_norm": 0.06991460034781667, + "learning_rate": 5.4207295967734595e-06, + "loss": 0.5071, + "step": 3902 + }, + { + "epoch": 1.8972644376899694, + "grad_norm": 0.06679630885145077, + "learning_rate": 5.418821685172582e-06, + "loss": 0.5101, + "step": 3903 + }, + { + "epoch": 1.8977507598784196, + "grad_norm": 0.07648029531820426, + "learning_rate": 5.41691371215626e-06, + "loss": 0.5527, + "step": 3904 + }, + { + "epoch": 1.8982370820668693, + "grad_norm": 0.07441453165877433, + "learning_rate": 5.415005678004277e-06, + "loss": 0.5512, + "step": 3905 + }, + { + "epoch": 1.8987234042553192, + "grad_norm": 0.07054987103941525, + "learning_rate": 5.413097582996423e-06, + "loss": 0.5491, + "step": 3906 + }, + { + "epoch": 1.899209726443769, + "grad_norm": 0.07307538508563054, + "learning_rate": 5.4111894274125e-06, + "loss": 0.5502, + "step": 3907 + }, + { + "epoch": 1.8996960486322187, + "grad_norm": 0.07268277319538835, + "learning_rate": 5.409281211532317e-06, + "loss": 0.5476, + "step": 3908 + }, + { + "epoch": 1.9001823708206687, + "grad_norm": 0.07254551181776563, + "learning_rate": 5.40737293563569e-06, + "loss": 0.5432, + "step": 3909 + }, + { + "epoch": 1.9006686930091186, + "grad_norm": 0.0697224203216681, + "learning_rate": 5.40546460000245e-06, + "loss": 0.5245, + "step": 3910 + }, + { + "epoch": 1.9011550151975682, + "grad_norm": 0.07385901022020659, + "learning_rate": 5.40355620491243e-06, + "loss": 0.5308, + "step": 3911 + }, + { + "epoch": 1.9016413373860184, + "grad_norm": 0.0705972347693607, + "learning_rate": 5.401647750645477e-06, + "loss": 0.5738, + "step": 3912 + }, + { + "epoch": 1.902127659574468, + "grad_norm": 0.07139404096088262, + "learning_rate": 5.399739237481441e-06, + "loss": 0.5444, + "step": 3913 + }, + { + "epoch": 1.902613981762918, + "grad_norm": 0.07443937105047072, + "learning_rate": 5.397830665700185e-06, + "loss": 0.5382, + "step": 3914 + }, + { + "epoch": 1.9031003039513679, + "grad_norm": 0.07133547255546843, + "learning_rate": 5.39592203558158e-06, + "loss": 0.5643, + "step": 3915 + }, + { + "epoch": 1.9035866261398176, + "grad_norm": 0.07117385062756147, + "learning_rate": 5.394013347405505e-06, + "loss": 0.5329, + "step": 3916 + }, + { + "epoch": 1.9040729483282675, + "grad_norm": 0.07247909116453287, + "learning_rate": 5.392104601451845e-06, + "loss": 0.4999, + "step": 3917 + }, + { + "epoch": 1.9045592705167174, + "grad_norm": 0.06985442759353504, + "learning_rate": 5.390195798000498e-06, + "loss": 0.5547, + "step": 3918 + }, + { + "epoch": 1.905045592705167, + "grad_norm": 0.06996615553375128, + "learning_rate": 5.38828693733137e-06, + "loss": 0.5353, + "step": 3919 + }, + { + "epoch": 1.9055319148936172, + "grad_norm": 0.06883564717525122, + "learning_rate": 5.386378019724372e-06, + "loss": 0.5374, + "step": 3920 + }, + { + "epoch": 1.9060182370820669, + "grad_norm": 0.07193159873531106, + "learning_rate": 5.384469045459424e-06, + "loss": 0.5637, + "step": 3921 + }, + { + "epoch": 1.9065045592705168, + "grad_norm": 0.07196204445970461, + "learning_rate": 5.382560014816457e-06, + "loss": 0.5492, + "step": 3922 + }, + { + "epoch": 1.9069908814589667, + "grad_norm": 0.07020407782598055, + "learning_rate": 5.380650928075407e-06, + "loss": 0.5276, + "step": 3923 + }, + { + "epoch": 1.9074772036474164, + "grad_norm": 0.07268836847250665, + "learning_rate": 5.378741785516222e-06, + "loss": 0.5335, + "step": 3924 + }, + { + "epoch": 1.9079635258358663, + "grad_norm": 0.0728378345883877, + "learning_rate": 5.376832587418854e-06, + "loss": 0.5742, + "step": 3925 + }, + { + "epoch": 1.9084498480243162, + "grad_norm": 0.07528133437836587, + "learning_rate": 5.3749233340632676e-06, + "loss": 0.5641, + "step": 3926 + }, + { + "epoch": 1.9089361702127658, + "grad_norm": 0.06986530945962423, + "learning_rate": 5.373014025729431e-06, + "loss": 0.5189, + "step": 3927 + }, + { + "epoch": 1.9094224924012158, + "grad_norm": 0.07137578923183152, + "learning_rate": 5.371104662697324e-06, + "loss": 0.5762, + "step": 3928 + }, + { + "epoch": 1.9099088145896657, + "grad_norm": 0.07180724242060038, + "learning_rate": 5.369195245246932e-06, + "loss": 0.5611, + "step": 3929 + }, + { + "epoch": 1.9103951367781153, + "grad_norm": 0.07406527169169644, + "learning_rate": 5.36728577365825e-06, + "loss": 0.5107, + "step": 3930 + }, + { + "epoch": 1.9108814589665655, + "grad_norm": 0.07820647798443633, + "learning_rate": 5.365376248211279e-06, + "loss": 0.5442, + "step": 3931 + }, + { + "epoch": 1.9113677811550152, + "grad_norm": 0.07074490016606895, + "learning_rate": 5.363466669186032e-06, + "loss": 0.5376, + "step": 3932 + }, + { + "epoch": 1.911854103343465, + "grad_norm": 0.07314800802056005, + "learning_rate": 5.3615570368625235e-06, + "loss": 0.5575, + "step": 3933 + }, + { + "epoch": 1.912340425531915, + "grad_norm": 0.07102998973258125, + "learning_rate": 5.359647351520783e-06, + "loss": 0.5461, + "step": 3934 + }, + { + "epoch": 1.9128267477203647, + "grad_norm": 0.07318178917179745, + "learning_rate": 5.357737613440842e-06, + "loss": 0.5822, + "step": 3935 + }, + { + "epoch": 1.9133130699088146, + "grad_norm": 0.07373341681033826, + "learning_rate": 5.355827822902741e-06, + "loss": 0.5479, + "step": 3936 + }, + { + "epoch": 1.9137993920972645, + "grad_norm": 0.07285353873902074, + "learning_rate": 5.353917980186533e-06, + "loss": 0.5967, + "step": 3937 + }, + { + "epoch": 1.9142857142857141, + "grad_norm": 0.06897671793099461, + "learning_rate": 5.35200808557227e-06, + "loss": 0.5096, + "step": 3938 + }, + { + "epoch": 1.9147720364741643, + "grad_norm": 0.07331452707960859, + "learning_rate": 5.35009813934002e-06, + "loss": 0.5756, + "step": 3939 + }, + { + "epoch": 1.915258358662614, + "grad_norm": 0.07424482210725596, + "learning_rate": 5.348188141769852e-06, + "loss": 0.5494, + "step": 3940 + }, + { + "epoch": 1.9157446808510639, + "grad_norm": 0.0695504987734764, + "learning_rate": 5.3462780931418475e-06, + "loss": 0.5287, + "step": 3941 + }, + { + "epoch": 1.9162310030395138, + "grad_norm": 0.07204821742874425, + "learning_rate": 5.344367993736094e-06, + "loss": 0.5336, + "step": 3942 + }, + { + "epoch": 1.9167173252279635, + "grad_norm": 0.07224236062750922, + "learning_rate": 5.342457843832686e-06, + "loss": 0.5313, + "step": 3943 + }, + { + "epoch": 1.9172036474164134, + "grad_norm": 0.07063997584455262, + "learning_rate": 5.340547643711721e-06, + "loss": 0.5206, + "step": 3944 + }, + { + "epoch": 1.9176899696048633, + "grad_norm": 0.06907095390317514, + "learning_rate": 5.338637393653313e-06, + "loss": 0.5443, + "step": 3945 + }, + { + "epoch": 1.918176291793313, + "grad_norm": 0.0714142566364829, + "learning_rate": 5.336727093937575e-06, + "loss": 0.5391, + "step": 3946 + }, + { + "epoch": 1.918662613981763, + "grad_norm": 0.0709911708720824, + "learning_rate": 5.334816744844633e-06, + "loss": 0.5596, + "step": 3947 + }, + { + "epoch": 1.9191489361702128, + "grad_norm": 0.07182097343162858, + "learning_rate": 5.3329063466546186e-06, + "loss": 0.56, + "step": 3948 + }, + { + "epoch": 1.9196352583586627, + "grad_norm": 0.0723170487931, + "learning_rate": 5.3309958996476676e-06, + "loss": 0.5539, + "step": 3949 + }, + { + "epoch": 1.9201215805471126, + "grad_norm": 0.07111847281377336, + "learning_rate": 5.329085404103929e-06, + "loss": 0.5907, + "step": 3950 + }, + { + "epoch": 1.9206079027355623, + "grad_norm": 0.0710246141206249, + "learning_rate": 5.32717486030355e-06, + "loss": 0.5571, + "step": 3951 + }, + { + "epoch": 1.9210942249240122, + "grad_norm": 0.07033797682194054, + "learning_rate": 5.3252642685266945e-06, + "loss": 0.5434, + "step": 3952 + }, + { + "epoch": 1.921580547112462, + "grad_norm": 0.07385120540880619, + "learning_rate": 5.323353629053527e-06, + "loss": 0.5858, + "step": 3953 + }, + { + "epoch": 1.9220668693009118, + "grad_norm": 0.0701213705216543, + "learning_rate": 5.3214429421642224e-06, + "loss": 0.5506, + "step": 3954 + }, + { + "epoch": 1.9225531914893617, + "grad_norm": 0.07527342395918077, + "learning_rate": 5.319532208138959e-06, + "loss": 0.5708, + "step": 3955 + }, + { + "epoch": 1.9230395136778116, + "grad_norm": 0.07239596068120352, + "learning_rate": 5.317621427257927e-06, + "loss": 0.5227, + "step": 3956 + }, + { + "epoch": 1.9235258358662612, + "grad_norm": 0.06778220679439734, + "learning_rate": 5.31571059980132e-06, + "loss": 0.5293, + "step": 3957 + }, + { + "epoch": 1.9240121580547114, + "grad_norm": 0.07155222314869827, + "learning_rate": 5.313799726049339e-06, + "loss": 0.5439, + "step": 3958 + }, + { + "epoch": 1.924498480243161, + "grad_norm": 0.07235966322343111, + "learning_rate": 5.311888806282191e-06, + "loss": 0.5325, + "step": 3959 + }, + { + "epoch": 1.924984802431611, + "grad_norm": 0.0715066938139115, + "learning_rate": 5.30997784078009e-06, + "loss": 0.5599, + "step": 3960 + }, + { + "epoch": 1.9254711246200609, + "grad_norm": 0.07530726114029725, + "learning_rate": 5.308066829823261e-06, + "loss": 0.572, + "step": 3961 + }, + { + "epoch": 1.9259574468085106, + "grad_norm": 0.07401814104456632, + "learning_rate": 5.306155773691928e-06, + "loss": 0.5813, + "step": 3962 + }, + { + "epoch": 1.9264437689969605, + "grad_norm": 0.07332258443827892, + "learning_rate": 5.304244672666328e-06, + "loss": 0.5431, + "step": 3963 + }, + { + "epoch": 1.9269300911854104, + "grad_norm": 0.07460145163668583, + "learning_rate": 5.3023335270267e-06, + "loss": 0.6378, + "step": 3964 + }, + { + "epoch": 1.92741641337386, + "grad_norm": 0.07069127075668984, + "learning_rate": 5.300422337053297e-06, + "loss": 0.564, + "step": 3965 + }, + { + "epoch": 1.9279027355623102, + "grad_norm": 0.07069580705921646, + "learning_rate": 5.2985111030263685e-06, + "loss": 0.524, + "step": 3966 + }, + { + "epoch": 1.9283890577507599, + "grad_norm": 0.0727467711525965, + "learning_rate": 5.2965998252261755e-06, + "loss": 0.5416, + "step": 3967 + }, + { + "epoch": 1.9288753799392098, + "grad_norm": 0.0730161945732158, + "learning_rate": 5.294688503932986e-06, + "loss": 0.5358, + "step": 3968 + }, + { + "epoch": 1.9293617021276597, + "grad_norm": 0.06934197918501088, + "learning_rate": 5.2927771394270754e-06, + "loss": 0.5642, + "step": 3969 + }, + { + "epoch": 1.9298480243161094, + "grad_norm": 0.07728581534430642, + "learning_rate": 5.290865731988721e-06, + "loss": 0.5871, + "step": 3970 + }, + { + "epoch": 1.9303343465045593, + "grad_norm": 0.07420400809000982, + "learning_rate": 5.28895428189821e-06, + "loss": 0.5354, + "step": 3971 + }, + { + "epoch": 1.9308206686930092, + "grad_norm": 0.07210480311510799, + "learning_rate": 5.2870427894358345e-06, + "loss": 0.5481, + "step": 3972 + }, + { + "epoch": 1.9313069908814589, + "grad_norm": 0.069850947800103, + "learning_rate": 5.285131254881895e-06, + "loss": 0.5385, + "step": 3973 + }, + { + "epoch": 1.931793313069909, + "grad_norm": 0.06939765769104236, + "learning_rate": 5.283219678516694e-06, + "loss": 0.5252, + "step": 3974 + }, + { + "epoch": 1.9322796352583587, + "grad_norm": 0.0725840974144373, + "learning_rate": 5.281308060620543e-06, + "loss": 0.4922, + "step": 3975 + }, + { + "epoch": 1.9327659574468083, + "grad_norm": 0.0756057412616692, + "learning_rate": 5.279396401473759e-06, + "loss": 0.5511, + "step": 3976 + }, + { + "epoch": 1.9332522796352585, + "grad_norm": 0.07052894851960988, + "learning_rate": 5.277484701356665e-06, + "loss": 0.5241, + "step": 3977 + }, + { + "epoch": 1.9337386018237082, + "grad_norm": 0.07145290958592206, + "learning_rate": 5.275572960549592e-06, + "loss": 0.5484, + "step": 3978 + }, + { + "epoch": 1.934224924012158, + "grad_norm": 0.06914131431117099, + "learning_rate": 5.273661179332874e-06, + "loss": 0.5068, + "step": 3979 + }, + { + "epoch": 1.934711246200608, + "grad_norm": 0.072621996981385, + "learning_rate": 5.27174935798685e-06, + "loss": 0.5381, + "step": 3980 + }, + { + "epoch": 1.9351975683890577, + "grad_norm": 0.07168212583920859, + "learning_rate": 5.269837496791871e-06, + "loss": 0.5487, + "step": 3981 + }, + { + "epoch": 1.9356838905775076, + "grad_norm": 0.07226193494444172, + "learning_rate": 5.267925596028285e-06, + "loss": 0.5668, + "step": 3982 + }, + { + "epoch": 1.9361702127659575, + "grad_norm": 0.06999206951575934, + "learning_rate": 5.266013655976454e-06, + "loss": 0.527, + "step": 3983 + }, + { + "epoch": 1.9366565349544071, + "grad_norm": 0.06888583022490859, + "learning_rate": 5.264101676916741e-06, + "loss": 0.5054, + "step": 3984 + }, + { + "epoch": 1.9371428571428573, + "grad_norm": 0.06930221184078372, + "learning_rate": 5.262189659129515e-06, + "loss": 0.5458, + "step": 3985 + }, + { + "epoch": 1.937629179331307, + "grad_norm": 0.07149689576266506, + "learning_rate": 5.260277602895154e-06, + "loss": 0.5364, + "step": 3986 + }, + { + "epoch": 1.9381155015197569, + "grad_norm": 0.0723772926861595, + "learning_rate": 5.258365508494039e-06, + "loss": 0.5719, + "step": 3987 + }, + { + "epoch": 1.9386018237082068, + "grad_norm": 0.07061068362122885, + "learning_rate": 5.256453376206555e-06, + "loss": 0.5411, + "step": 3988 + }, + { + "epoch": 1.9390881458966565, + "grad_norm": 0.07096073363336144, + "learning_rate": 5.2545412063130964e-06, + "loss": 0.5095, + "step": 3989 + }, + { + "epoch": 1.9395744680851064, + "grad_norm": 0.07126984711713157, + "learning_rate": 5.252628999094059e-06, + "loss": 0.5326, + "step": 3990 + }, + { + "epoch": 1.9400607902735563, + "grad_norm": 0.07002943984588834, + "learning_rate": 5.2507167548298475e-06, + "loss": 0.5303, + "step": 3991 + }, + { + "epoch": 1.940547112462006, + "grad_norm": 0.07303616960770704, + "learning_rate": 5.248804473800872e-06, + "loss": 0.5388, + "step": 3992 + }, + { + "epoch": 1.941033434650456, + "grad_norm": 0.06987933879758945, + "learning_rate": 5.246892156287546e-06, + "loss": 0.5644, + "step": 3993 + }, + { + "epoch": 1.9415197568389058, + "grad_norm": 0.07092382913678416, + "learning_rate": 5.244979802570288e-06, + "loss": 0.4876, + "step": 3994 + }, + { + "epoch": 1.9420060790273557, + "grad_norm": 0.07183594237566124, + "learning_rate": 5.243067412929524e-06, + "loss": 0.5376, + "step": 3995 + }, + { + "epoch": 1.9424924012158056, + "grad_norm": 0.07218317665616278, + "learning_rate": 5.241154987645687e-06, + "loss": 0.5494, + "step": 3996 + }, + { + "epoch": 1.9429787234042553, + "grad_norm": 0.0738576927670373, + "learning_rate": 5.239242526999207e-06, + "loss": 0.5898, + "step": 3997 + }, + { + "epoch": 1.9434650455927052, + "grad_norm": 0.07209239934543615, + "learning_rate": 5.237330031270526e-06, + "loss": 0.5806, + "step": 3998 + }, + { + "epoch": 1.943951367781155, + "grad_norm": 0.07188300097353648, + "learning_rate": 5.235417500740093e-06, + "loss": 0.5655, + "step": 3999 + }, + { + "epoch": 1.9444376899696048, + "grad_norm": 0.07541195031417097, + "learning_rate": 5.233504935688355e-06, + "loss": 0.6531, + "step": 4000 + }, + { + "epoch": 1.9449240121580547, + "grad_norm": 0.07168950322346442, + "learning_rate": 5.231592336395771e-06, + "loss": 0.5679, + "step": 4001 + }, + { + "epoch": 1.9454103343465046, + "grad_norm": 0.07131777859880024, + "learning_rate": 5.229679703142801e-06, + "loss": 0.5313, + "step": 4002 + }, + { + "epoch": 1.9458966565349542, + "grad_norm": 0.0725064508896351, + "learning_rate": 5.227767036209911e-06, + "loss": 0.5589, + "step": 4003 + }, + { + "epoch": 1.9463829787234044, + "grad_norm": 0.07317081821397187, + "learning_rate": 5.225854335877571e-06, + "loss": 0.5578, + "step": 4004 + }, + { + "epoch": 1.946869300911854, + "grad_norm": 0.06909282131971625, + "learning_rate": 5.223941602426258e-06, + "loss": 0.536, + "step": 4005 + }, + { + "epoch": 1.947355623100304, + "grad_norm": 0.07251713147657754, + "learning_rate": 5.222028836136451e-06, + "loss": 0.5372, + "step": 4006 + }, + { + "epoch": 1.9478419452887539, + "grad_norm": 0.07342534467884852, + "learning_rate": 5.220116037288637e-06, + "loss": 0.534, + "step": 4007 + }, + { + "epoch": 1.9483282674772036, + "grad_norm": 0.07086290939044369, + "learning_rate": 5.218203206163306e-06, + "loss": 0.5362, + "step": 4008 + }, + { + "epoch": 1.9488145896656535, + "grad_norm": 0.0729895180234574, + "learning_rate": 5.216290343040952e-06, + "loss": 0.5345, + "step": 4009 + }, + { + "epoch": 1.9493009118541034, + "grad_norm": 0.07390850352288804, + "learning_rate": 5.214377448202075e-06, + "loss": 0.5469, + "step": 4010 + }, + { + "epoch": 1.949787234042553, + "grad_norm": 0.07469818163452135, + "learning_rate": 5.212464521927182e-06, + "loss": 0.5923, + "step": 4011 + }, + { + "epoch": 1.9502735562310032, + "grad_norm": 0.07385365805873335, + "learning_rate": 5.210551564496778e-06, + "loss": 0.5726, + "step": 4012 + }, + { + "epoch": 1.9507598784194529, + "grad_norm": 0.06869536584519903, + "learning_rate": 5.2086385761913775e-06, + "loss": 0.5292, + "step": 4013 + }, + { + "epoch": 1.9512462006079028, + "grad_norm": 0.07268517784103212, + "learning_rate": 5.2067255572914995e-06, + "loss": 0.585, + "step": 4014 + }, + { + "epoch": 1.9517325227963527, + "grad_norm": 0.07237858910815546, + "learning_rate": 5.204812508077666e-06, + "loss": 0.5477, + "step": 4015 + }, + { + "epoch": 1.9522188449848024, + "grad_norm": 0.07232510702682564, + "learning_rate": 5.202899428830404e-06, + "loss": 0.5708, + "step": 4016 + }, + { + "epoch": 1.9527051671732523, + "grad_norm": 0.07168769172266656, + "learning_rate": 5.200986319830245e-06, + "loss": 0.5362, + "step": 4017 + }, + { + "epoch": 1.9531914893617022, + "grad_norm": 0.06986399501839158, + "learning_rate": 5.199073181357725e-06, + "loss": 0.5196, + "step": 4018 + }, + { + "epoch": 1.9536778115501519, + "grad_norm": 0.0721377619109054, + "learning_rate": 5.197160013693382e-06, + "loss": 0.5711, + "step": 4019 + }, + { + "epoch": 1.954164133738602, + "grad_norm": 0.07280342068128169, + "learning_rate": 5.195246817117763e-06, + "loss": 0.5483, + "step": 4020 + }, + { + "epoch": 1.9546504559270517, + "grad_norm": 0.07057144849216664, + "learning_rate": 5.193333591911416e-06, + "loss": 0.5296, + "step": 4021 + }, + { + "epoch": 1.9551367781155016, + "grad_norm": 0.06970470710427557, + "learning_rate": 5.191420338354892e-06, + "loss": 0.5063, + "step": 4022 + }, + { + "epoch": 1.9556231003039515, + "grad_norm": 0.07083213803572287, + "learning_rate": 5.18950705672875e-06, + "loss": 0.5294, + "step": 4023 + }, + { + "epoch": 1.9561094224924012, + "grad_norm": 0.07133845440081246, + "learning_rate": 5.18759374731355e-06, + "loss": 0.5076, + "step": 4024 + }, + { + "epoch": 1.956595744680851, + "grad_norm": 0.06995335914854336, + "learning_rate": 5.185680410389856e-06, + "loss": 0.5165, + "step": 4025 + }, + { + "epoch": 1.957082066869301, + "grad_norm": 0.07226385668066908, + "learning_rate": 5.183767046238239e-06, + "loss": 0.5179, + "step": 4026 + }, + { + "epoch": 1.9575683890577507, + "grad_norm": 0.07405507194945109, + "learning_rate": 5.181853655139272e-06, + "loss": 0.5317, + "step": 4027 + }, + { + "epoch": 1.9580547112462006, + "grad_norm": 0.07216105571906228, + "learning_rate": 5.179940237373532e-06, + "loss": 0.5537, + "step": 4028 + }, + { + "epoch": 1.9585410334346505, + "grad_norm": 0.07353059038918697, + "learning_rate": 5.1780267932215985e-06, + "loss": 0.5453, + "step": 4029 + }, + { + "epoch": 1.9590273556231002, + "grad_norm": 0.0737631617555941, + "learning_rate": 5.176113322964058e-06, + "loss": 0.5927, + "step": 4030 + }, + { + "epoch": 1.9595136778115503, + "grad_norm": 0.0715725098077897, + "learning_rate": 5.174199826881498e-06, + "loss": 0.5455, + "step": 4031 + }, + { + "epoch": 1.96, + "grad_norm": 0.07020638664063605, + "learning_rate": 5.1722863052545124e-06, + "loss": 0.53, + "step": 4032 + }, + { + "epoch": 1.9604863221884499, + "grad_norm": 0.0726420771490652, + "learning_rate": 5.170372758363695e-06, + "loss": 0.5275, + "step": 4033 + }, + { + "epoch": 1.9609726443768998, + "grad_norm": 0.07075944796673647, + "learning_rate": 5.168459186489649e-06, + "loss": 0.5603, + "step": 4034 + }, + { + "epoch": 1.9614589665653495, + "grad_norm": 0.07192249888606185, + "learning_rate": 5.166545589912977e-06, + "loss": 0.5395, + "step": 4035 + }, + { + "epoch": 1.9619452887537994, + "grad_norm": 0.06991776406339933, + "learning_rate": 5.1646319689142835e-06, + "loss": 0.5219, + "step": 4036 + }, + { + "epoch": 1.9624316109422493, + "grad_norm": 0.07066816999990898, + "learning_rate": 5.1627183237741816e-06, + "loss": 0.543, + "step": 4037 + }, + { + "epoch": 1.962917933130699, + "grad_norm": 0.07379540972212562, + "learning_rate": 5.160804654773286e-06, + "loss": 0.5481, + "step": 4038 + }, + { + "epoch": 1.963404255319149, + "grad_norm": 0.07201823454747135, + "learning_rate": 5.158890962192214e-06, + "loss": 0.5395, + "step": 4039 + }, + { + "epoch": 1.9638905775075988, + "grad_norm": 0.07317804299750949, + "learning_rate": 5.156977246311585e-06, + "loss": 0.5846, + "step": 4040 + }, + { + "epoch": 1.9643768996960487, + "grad_norm": 0.07088796227807322, + "learning_rate": 5.155063507412027e-06, + "loss": 0.5358, + "step": 4041 + }, + { + "epoch": 1.9648632218844986, + "grad_norm": 0.07095566646975195, + "learning_rate": 5.153149745774167e-06, + "loss": 0.5322, + "step": 4042 + }, + { + "epoch": 1.9653495440729483, + "grad_norm": 0.07415211984680001, + "learning_rate": 5.151235961678635e-06, + "loss": 0.5465, + "step": 4043 + }, + { + "epoch": 1.9658358662613982, + "grad_norm": 0.07032419654654286, + "learning_rate": 5.149322155406067e-06, + "loss": 0.5598, + "step": 4044 + }, + { + "epoch": 1.966322188449848, + "grad_norm": 0.0721532815547123, + "learning_rate": 5.147408327237099e-06, + "loss": 0.5493, + "step": 4045 + }, + { + "epoch": 1.9668085106382978, + "grad_norm": 0.06997445146897438, + "learning_rate": 5.145494477452375e-06, + "loss": 0.5347, + "step": 4046 + }, + { + "epoch": 1.9672948328267479, + "grad_norm": 0.0744699556037541, + "learning_rate": 5.143580606332539e-06, + "loss": 0.5726, + "step": 4047 + }, + { + "epoch": 1.9677811550151976, + "grad_norm": 0.07257275928752864, + "learning_rate": 5.1416667141582355e-06, + "loss": 0.5473, + "step": 4048 + }, + { + "epoch": 1.9682674772036473, + "grad_norm": 0.07353724562471495, + "learning_rate": 5.139752801210118e-06, + "loss": 0.5621, + "step": 4049 + }, + { + "epoch": 1.9687537993920974, + "grad_norm": 0.07708911764038119, + "learning_rate": 5.1378388677688415e-06, + "loss": 0.5778, + "step": 4050 + }, + { + "epoch": 1.969240121580547, + "grad_norm": 0.07422356627523158, + "learning_rate": 5.135924914115058e-06, + "loss": 0.553, + "step": 4051 + }, + { + "epoch": 1.969726443768997, + "grad_norm": 0.07368977492635634, + "learning_rate": 5.134010940529429e-06, + "loss": 0.5409, + "step": 4052 + }, + { + "epoch": 1.9702127659574469, + "grad_norm": 0.07218959240008989, + "learning_rate": 5.132096947292618e-06, + "loss": 0.5343, + "step": 4053 + }, + { + "epoch": 1.9706990881458966, + "grad_norm": 0.06817959793877926, + "learning_rate": 5.130182934685289e-06, + "loss": 0.533, + "step": 4054 + }, + { + "epoch": 1.9711854103343465, + "grad_norm": 0.07050214275020256, + "learning_rate": 5.128268902988112e-06, + "loss": 0.5542, + "step": 4055 + }, + { + "epoch": 1.9716717325227964, + "grad_norm": 0.06909868526207856, + "learning_rate": 5.126354852481757e-06, + "loss": 0.5244, + "step": 4056 + }, + { + "epoch": 1.972158054711246, + "grad_norm": 0.07509301931401295, + "learning_rate": 5.124440783446898e-06, + "loss": 0.5497, + "step": 4057 + }, + { + "epoch": 1.9726443768996962, + "grad_norm": 0.07196289454389032, + "learning_rate": 5.122526696164211e-06, + "loss": 0.5522, + "step": 4058 + }, + { + "epoch": 1.9731306990881459, + "grad_norm": 0.07429985104790021, + "learning_rate": 5.1206125909143745e-06, + "loss": 0.5358, + "step": 4059 + }, + { + "epoch": 1.9736170212765958, + "grad_norm": 0.07345988567106172, + "learning_rate": 5.118698467978072e-06, + "loss": 0.5175, + "step": 4060 + }, + { + "epoch": 1.9741033434650457, + "grad_norm": 0.07507778545792528, + "learning_rate": 5.1167843276359865e-06, + "loss": 0.5669, + "step": 4061 + }, + { + "epoch": 1.9745896656534954, + "grad_norm": 0.06873489290267816, + "learning_rate": 5.114870170168806e-06, + "loss": 0.525, + "step": 4062 + }, + { + "epoch": 1.9750759878419453, + "grad_norm": 0.07326567348593864, + "learning_rate": 5.112955995857219e-06, + "loss": 0.5597, + "step": 4063 + }, + { + "epoch": 1.9755623100303952, + "grad_norm": 0.07402028313087448, + "learning_rate": 5.111041804981919e-06, + "loss": 0.5443, + "step": 4064 + }, + { + "epoch": 1.9760486322188449, + "grad_norm": 0.07002293728443178, + "learning_rate": 5.109127597823598e-06, + "loss": 0.5243, + "step": 4065 + }, + { + "epoch": 1.976534954407295, + "grad_norm": 0.07333863778960509, + "learning_rate": 5.107213374662954e-06, + "loss": 0.5506, + "step": 4066 + }, + { + "epoch": 1.9770212765957447, + "grad_norm": 0.07135685685852484, + "learning_rate": 5.1052991357806865e-06, + "loss": 0.5594, + "step": 4067 + }, + { + "epoch": 1.9775075987841946, + "grad_norm": 0.07330575880332194, + "learning_rate": 5.103384881457497e-06, + "loss": 0.5445, + "step": 4068 + }, + { + "epoch": 1.9779939209726445, + "grad_norm": 0.06983365004135339, + "learning_rate": 5.1014706119740875e-06, + "loss": 0.5399, + "step": 4069 + }, + { + "epoch": 1.9784802431610942, + "grad_norm": 0.07358784483555246, + "learning_rate": 5.0995563276111655e-06, + "loss": 0.5571, + "step": 4070 + }, + { + "epoch": 1.978966565349544, + "grad_norm": 0.0714844601554554, + "learning_rate": 5.09764202864944e-06, + "loss": 0.5307, + "step": 4071 + }, + { + "epoch": 1.979452887537994, + "grad_norm": 0.0711046234990911, + "learning_rate": 5.095727715369618e-06, + "loss": 0.5671, + "step": 4072 + }, + { + "epoch": 1.9799392097264437, + "grad_norm": 0.07429043037525987, + "learning_rate": 5.0938133880524145e-06, + "loss": 0.5926, + "step": 4073 + }, + { + "epoch": 1.9804255319148936, + "grad_norm": 0.06911949445868028, + "learning_rate": 5.091899046978542e-06, + "loss": 0.5193, + "step": 4074 + }, + { + "epoch": 1.9809118541033435, + "grad_norm": 0.07112322230377795, + "learning_rate": 5.0899846924287184e-06, + "loss": 0.5365, + "step": 4075 + }, + { + "epoch": 1.9813981762917932, + "grad_norm": 0.0719120839126588, + "learning_rate": 5.0880703246836614e-06, + "loss": 0.5501, + "step": 4076 + }, + { + "epoch": 1.9818844984802433, + "grad_norm": 0.07339119495313026, + "learning_rate": 5.086155944024093e-06, + "loss": 0.5627, + "step": 4077 + }, + { + "epoch": 1.982370820668693, + "grad_norm": 0.07503823576953976, + "learning_rate": 5.084241550730732e-06, + "loss": 0.5843, + "step": 4078 + }, + { + "epoch": 1.9828571428571429, + "grad_norm": 0.07195738578023082, + "learning_rate": 5.0823271450843045e-06, + "loss": 0.5629, + "step": 4079 + }, + { + "epoch": 1.9833434650455928, + "grad_norm": 0.07506331827880443, + "learning_rate": 5.080412727365536e-06, + "loss": 0.5742, + "step": 4080 + }, + { + "epoch": 1.9838297872340425, + "grad_norm": 0.07080692046771095, + "learning_rate": 5.078498297855156e-06, + "loss": 0.556, + "step": 4081 + }, + { + "epoch": 1.9843161094224924, + "grad_norm": 0.07462004660722861, + "learning_rate": 5.076583856833888e-06, + "loss": 0.5687, + "step": 4082 + }, + { + "epoch": 1.9848024316109423, + "grad_norm": 0.07714499905054355, + "learning_rate": 5.074669404582469e-06, + "loss": 0.5561, + "step": 4083 + }, + { + "epoch": 1.985288753799392, + "grad_norm": 0.06959446373368061, + "learning_rate": 5.072754941381631e-06, + "loss": 0.5263, + "step": 4084 + }, + { + "epoch": 1.985775075987842, + "grad_norm": 0.07378311642764318, + "learning_rate": 5.070840467512106e-06, + "loss": 0.5458, + "step": 4085 + }, + { + "epoch": 1.9862613981762918, + "grad_norm": 0.06885446916429626, + "learning_rate": 5.0689259832546314e-06, + "loss": 0.5346, + "step": 4086 + }, + { + "epoch": 1.9867477203647417, + "grad_norm": 0.0705163584020101, + "learning_rate": 5.067011488889944e-06, + "loss": 0.5493, + "step": 4087 + }, + { + "epoch": 1.9872340425531916, + "grad_norm": 0.07303617589070245, + "learning_rate": 5.065096984698783e-06, + "loss": 0.5583, + "step": 4088 + }, + { + "epoch": 1.9877203647416413, + "grad_norm": 0.07515882598679031, + "learning_rate": 5.063182470961888e-06, + "loss": 0.5944, + "step": 4089 + }, + { + "epoch": 1.9882066869300912, + "grad_norm": 0.07209010037220351, + "learning_rate": 5.061267947960001e-06, + "loss": 0.5619, + "step": 4090 + }, + { + "epoch": 1.988693009118541, + "grad_norm": 0.07138280513586763, + "learning_rate": 5.059353415973865e-06, + "loss": 0.5216, + "step": 4091 + }, + { + "epoch": 1.9891793313069908, + "grad_norm": 0.07235689246934927, + "learning_rate": 5.057438875284224e-06, + "loss": 0.5539, + "step": 4092 + }, + { + "epoch": 1.989665653495441, + "grad_norm": 0.07043325640792983, + "learning_rate": 5.0555243261718245e-06, + "loss": 0.522, + "step": 4093 + }, + { + "epoch": 1.9901519756838906, + "grad_norm": 0.06843926096249822, + "learning_rate": 5.053609768917414e-06, + "loss": 0.5254, + "step": 4094 + }, + { + "epoch": 1.9906382978723405, + "grad_norm": 0.07374768776751374, + "learning_rate": 5.051695203801739e-06, + "loss": 0.5654, + "step": 4095 + }, + { + "epoch": 1.9911246200607904, + "grad_norm": 0.07364024588806227, + "learning_rate": 5.0497806311055505e-06, + "loss": 0.5459, + "step": 4096 + }, + { + "epoch": 1.99161094224924, + "grad_norm": 0.06944235861576799, + "learning_rate": 5.047866051109597e-06, + "loss": 0.5113, + "step": 4097 + }, + { + "epoch": 1.99209726443769, + "grad_norm": 0.0733226774324953, + "learning_rate": 5.04595146409463e-06, + "loss": 0.5662, + "step": 4098 + }, + { + "epoch": 1.9925835866261399, + "grad_norm": 0.06968827268978985, + "learning_rate": 5.044036870341403e-06, + "loss": 0.5288, + "step": 4099 + }, + { + "epoch": 1.9930699088145896, + "grad_norm": 0.07132226729205574, + "learning_rate": 5.0421222701306685e-06, + "loss": 0.5377, + "step": 4100 + }, + { + "epoch": 1.9935562310030395, + "grad_norm": 0.07136921612932069, + "learning_rate": 5.040207663743182e-06, + "loss": 0.5458, + "step": 4101 + }, + { + "epoch": 1.9940425531914894, + "grad_norm": 0.07261022128192629, + "learning_rate": 5.038293051459698e-06, + "loss": 0.5601, + "step": 4102 + }, + { + "epoch": 1.994528875379939, + "grad_norm": 0.07029858773182517, + "learning_rate": 5.0363784335609744e-06, + "loss": 0.5371, + "step": 4103 + }, + { + "epoch": 1.9950151975683892, + "grad_norm": 0.07160378990780952, + "learning_rate": 5.034463810327766e-06, + "loss": 0.5643, + "step": 4104 + }, + { + "epoch": 1.9955015197568389, + "grad_norm": 0.07237286811587941, + "learning_rate": 5.03254918204083e-06, + "loss": 0.5254, + "step": 4105 + }, + { + "epoch": 1.9959878419452888, + "grad_norm": 0.06890026400947108, + "learning_rate": 5.030634548980926e-06, + "loss": 0.5074, + "step": 4106 + }, + { + "epoch": 1.9964741641337387, + "grad_norm": 0.07340888261976052, + "learning_rate": 5.028719911428814e-06, + "loss": 0.5684, + "step": 4107 + }, + { + "epoch": 1.9969604863221884, + "grad_norm": 0.07154652513498307, + "learning_rate": 5.026805269665254e-06, + "loss": 0.5613, + "step": 4108 + }, + { + "epoch": 1.9974468085106383, + "grad_norm": 0.06863125066672594, + "learning_rate": 5.0248906239710025e-06, + "loss": 0.5495, + "step": 4109 + }, + { + "epoch": 1.9979331306990882, + "grad_norm": 0.06959777212822249, + "learning_rate": 5.022975974626827e-06, + "loss": 0.5575, + "step": 4110 + }, + { + "epoch": 1.9984194528875379, + "grad_norm": 0.07228218266351222, + "learning_rate": 5.021061321913484e-06, + "loss": 0.5278, + "step": 4111 + }, + { + "epoch": 1.998905775075988, + "grad_norm": 0.07318956743749344, + "learning_rate": 5.0191466661117385e-06, + "loss": 0.576, + "step": 4112 + }, + { + "epoch": 1.998905775075988, + "eval_loss": 0.5719841718673706, + "eval_runtime": 105.2602, + "eval_samples_per_second": 288.362, + "eval_steps_per_second": 36.054, + "step": 4112 + }, + { + "epoch": 1.9993920972644377, + "grad_norm": 0.06899319062796848, + "learning_rate": 5.0172320075023504e-06, + "loss": 0.521, + "step": 4113 + }, + { + "epoch": 1.9998784194528876, + "grad_norm": 0.0724600612638861, + "learning_rate": 5.015317346366085e-06, + "loss": 0.544, + "step": 4114 + }, + { + "epoch": 2.0, + "grad_norm": 0.0724600612638861, + "learning_rate": 5.013402682983705e-06, + "loss": 0.1305, + "step": 4115 + }, + { + "epoch": 2.0003647416413375, + "grad_norm": 0.07090566340907718, + "learning_rate": 5.011488017635973e-06, + "loss": 0.3857, + "step": 4116 + }, + { + "epoch": 2.0004863221884497, + "grad_norm": 0.07560565895633789, + "learning_rate": 5.009573350603654e-06, + "loss": 0.5416, + "step": 4117 + }, + { + "epoch": 2.0009726443769, + "grad_norm": 0.07463160319833827, + "learning_rate": 5.007658682167511e-06, + "loss": 0.5216, + "step": 4118 + }, + { + "epoch": 2.0014589665653495, + "grad_norm": 0.0704286164735406, + "learning_rate": 5.0057440126083105e-06, + "loss": 0.5226, + "step": 4119 + }, + { + "epoch": 2.001945288753799, + "grad_norm": 0.0756680374343442, + "learning_rate": 5.003829342206815e-06, + "loss": 0.513, + "step": 4120 + }, + { + "epoch": 2.0024316109422493, + "grad_norm": 0.07200504172525779, + "learning_rate": 5.00191467124379e-06, + "loss": 0.5403, + "step": 4121 + }, + { + "epoch": 2.002917933130699, + "grad_norm": 0.07970405331927172, + "learning_rate": 5e-06, + "loss": 0.5563, + "step": 4122 + }, + { + "epoch": 2.003404255319149, + "grad_norm": 0.07336267398523405, + "learning_rate": 4.998085328756211e-06, + "loss": 0.5194, + "step": 4123 + }, + { + "epoch": 2.003890577507599, + "grad_norm": 0.07732463909571638, + "learning_rate": 4.9961706577931865e-06, + "loss": 0.5164, + "step": 4124 + }, + { + "epoch": 2.0043768996960485, + "grad_norm": 0.07538394478171248, + "learning_rate": 4.99425598739169e-06, + "loss": 0.5233, + "step": 4125 + }, + { + "epoch": 2.0048632218844986, + "grad_norm": 0.07178068742791732, + "learning_rate": 4.99234131783249e-06, + "loss": 0.4967, + "step": 4126 + }, + { + "epoch": 2.0053495440729483, + "grad_norm": 0.07280447920895801, + "learning_rate": 4.990426649396349e-06, + "loss": 0.5426, + "step": 4127 + }, + { + "epoch": 2.005835866261398, + "grad_norm": 0.07895528535780773, + "learning_rate": 4.98851198236403e-06, + "loss": 0.5518, + "step": 4128 + }, + { + "epoch": 2.006322188449848, + "grad_norm": 0.07581014746674569, + "learning_rate": 4.986597317016298e-06, + "loss": 0.5256, + "step": 4129 + }, + { + "epoch": 2.006808510638298, + "grad_norm": 0.07120238586983466, + "learning_rate": 4.984682653633917e-06, + "loss": 0.5177, + "step": 4130 + }, + { + "epoch": 2.007294832826748, + "grad_norm": 0.07119804097902376, + "learning_rate": 4.982767992497652e-06, + "loss": 0.5203, + "step": 4131 + }, + { + "epoch": 2.0077811550151976, + "grad_norm": 0.07559586109173598, + "learning_rate": 4.980853333888262e-06, + "loss": 0.5398, + "step": 4132 + }, + { + "epoch": 2.0082674772036473, + "grad_norm": 0.07350223373534821, + "learning_rate": 4.978938678086517e-06, + "loss": 0.5405, + "step": 4133 + }, + { + "epoch": 2.0087537993920974, + "grad_norm": 0.07208264639169884, + "learning_rate": 4.977024025373174e-06, + "loss": 0.5432, + "step": 4134 + }, + { + "epoch": 2.009240121580547, + "grad_norm": 0.07131172276718777, + "learning_rate": 4.9751093760289975e-06, + "loss": 0.5057, + "step": 4135 + }, + { + "epoch": 2.009726443768997, + "grad_norm": 0.07025931825106611, + "learning_rate": 4.9731947303347485e-06, + "loss": 0.5219, + "step": 4136 + }, + { + "epoch": 2.010212765957447, + "grad_norm": 0.07234591131423174, + "learning_rate": 4.971280088571187e-06, + "loss": 0.5022, + "step": 4137 + }, + { + "epoch": 2.0106990881458966, + "grad_norm": 0.07184852077216447, + "learning_rate": 4.969365451019075e-06, + "loss": 0.5143, + "step": 4138 + }, + { + "epoch": 2.0111854103343467, + "grad_norm": 0.07427629112549243, + "learning_rate": 4.967450817959171e-06, + "loss": 0.5432, + "step": 4139 + }, + { + "epoch": 2.0116717325227964, + "grad_norm": 0.07088431735887683, + "learning_rate": 4.965536189672236e-06, + "loss": 0.5, + "step": 4140 + }, + { + "epoch": 2.012158054711246, + "grad_norm": 0.07118584943300031, + "learning_rate": 4.963621566439027e-06, + "loss": 0.5232, + "step": 4141 + }, + { + "epoch": 2.012644376899696, + "grad_norm": 0.07206765143102957, + "learning_rate": 4.961706948540303e-06, + "loss": 0.5285, + "step": 4142 + }, + { + "epoch": 2.013130699088146, + "grad_norm": 0.07256591203748938, + "learning_rate": 4.959792336256819e-06, + "loss": 0.5482, + "step": 4143 + }, + { + "epoch": 2.0136170212765956, + "grad_norm": 0.06995835914694061, + "learning_rate": 4.957877729869332e-06, + "loss": 0.5209, + "step": 4144 + }, + { + "epoch": 2.0141033434650457, + "grad_norm": 0.0740994737041949, + "learning_rate": 4.955963129658599e-06, + "loss": 0.5261, + "step": 4145 + }, + { + "epoch": 2.0145896656534954, + "grad_norm": 0.07456378202106874, + "learning_rate": 4.954048535905372e-06, + "loss": 0.5189, + "step": 4146 + }, + { + "epoch": 2.015075987841945, + "grad_norm": 0.07511338240986537, + "learning_rate": 4.952133948890406e-06, + "loss": 0.5562, + "step": 4147 + }, + { + "epoch": 2.015562310030395, + "grad_norm": 0.0742620935041202, + "learning_rate": 4.950219368894452e-06, + "loss": 0.5299, + "step": 4148 + }, + { + "epoch": 2.016048632218845, + "grad_norm": 0.07290041090130435, + "learning_rate": 4.948304796198262e-06, + "loss": 0.4918, + "step": 4149 + }, + { + "epoch": 2.016534954407295, + "grad_norm": 0.07032882448577277, + "learning_rate": 4.946390231082586e-06, + "loss": 0.4971, + "step": 4150 + }, + { + "epoch": 2.0170212765957447, + "grad_norm": 0.07241552735232347, + "learning_rate": 4.9444756738281755e-06, + "loss": 0.5386, + "step": 4151 + }, + { + "epoch": 2.0175075987841944, + "grad_norm": 0.07000695319231048, + "learning_rate": 4.942561124715776e-06, + "loss": 0.5048, + "step": 4152 + }, + { + "epoch": 2.0179939209726445, + "grad_norm": 0.07320006631497436, + "learning_rate": 4.940646584026136e-06, + "loss": 0.5178, + "step": 4153 + }, + { + "epoch": 2.018480243161094, + "grad_norm": 0.07268784042118799, + "learning_rate": 4.93873205204e-06, + "loss": 0.5521, + "step": 4154 + }, + { + "epoch": 2.018966565349544, + "grad_norm": 0.07309063892742822, + "learning_rate": 4.936817529038113e-06, + "loss": 0.5432, + "step": 4155 + }, + { + "epoch": 2.019452887537994, + "grad_norm": 0.0751944181988103, + "learning_rate": 4.934903015301218e-06, + "loss": 0.5415, + "step": 4156 + }, + { + "epoch": 2.0199392097264437, + "grad_norm": 0.0717854491309982, + "learning_rate": 4.932988511110058e-06, + "loss": 0.5705, + "step": 4157 + }, + { + "epoch": 2.020425531914894, + "grad_norm": 0.06762626397281103, + "learning_rate": 4.93107401674537e-06, + "loss": 0.4698, + "step": 4158 + }, + { + "epoch": 2.0209118541033435, + "grad_norm": 0.07382780195075954, + "learning_rate": 4.929159532487895e-06, + "loss": 0.5526, + "step": 4159 + }, + { + "epoch": 2.021398176291793, + "grad_norm": 0.07380518136830846, + "learning_rate": 4.92724505861837e-06, + "loss": 0.5456, + "step": 4160 + }, + { + "epoch": 2.0218844984802433, + "grad_norm": 0.07498551761636121, + "learning_rate": 4.9253305954175316e-06, + "loss": 0.5339, + "step": 4161 + }, + { + "epoch": 2.022370820668693, + "grad_norm": 0.07476218196616753, + "learning_rate": 4.9234161431661124e-06, + "loss": 0.5407, + "step": 4162 + }, + { + "epoch": 2.0228571428571427, + "grad_norm": 0.07188806504372429, + "learning_rate": 4.9215017021448476e-06, + "loss": 0.5189, + "step": 4163 + }, + { + "epoch": 2.023343465045593, + "grad_norm": 0.07642579388800144, + "learning_rate": 4.919587272634466e-06, + "loss": 0.5595, + "step": 4164 + }, + { + "epoch": 2.0238297872340425, + "grad_norm": 0.07056888290520946, + "learning_rate": 4.917672854915697e-06, + "loss": 0.5149, + "step": 4165 + }, + { + "epoch": 2.024316109422492, + "grad_norm": 0.07207158328439678, + "learning_rate": 4.915758449269271e-06, + "loss": 0.5433, + "step": 4166 + }, + { + "epoch": 2.0248024316109423, + "grad_norm": 0.0712268953352973, + "learning_rate": 4.91384405597591e-06, + "loss": 0.5552, + "step": 4167 + }, + { + "epoch": 2.025288753799392, + "grad_norm": 0.07160884103680992, + "learning_rate": 4.9119296753163385e-06, + "loss": 0.552, + "step": 4168 + }, + { + "epoch": 2.025775075987842, + "grad_norm": 0.0705533289785172, + "learning_rate": 4.9100153075712815e-06, + "loss": 0.5297, + "step": 4169 + }, + { + "epoch": 2.026261398176292, + "grad_norm": 0.07214722507916452, + "learning_rate": 4.908100953021458e-06, + "loss": 0.5241, + "step": 4170 + }, + { + "epoch": 2.0267477203647415, + "grad_norm": 0.07379176885362691, + "learning_rate": 4.906186611947587e-06, + "loss": 0.5793, + "step": 4171 + }, + { + "epoch": 2.0272340425531916, + "grad_norm": 0.0704623777822778, + "learning_rate": 4.9042722846303836e-06, + "loss": 0.4898, + "step": 4172 + }, + { + "epoch": 2.0277203647416413, + "grad_norm": 0.07360585873779849, + "learning_rate": 4.902357971350562e-06, + "loss": 0.5257, + "step": 4173 + }, + { + "epoch": 2.028206686930091, + "grad_norm": 0.07277010233759118, + "learning_rate": 4.900443672388835e-06, + "loss": 0.5212, + "step": 4174 + }, + { + "epoch": 2.028693009118541, + "grad_norm": 0.0721973073846585, + "learning_rate": 4.898529388025913e-06, + "loss": 0.5276, + "step": 4175 + }, + { + "epoch": 2.029179331306991, + "grad_norm": 0.07716581794527941, + "learning_rate": 4.896615118542505e-06, + "loss": 0.5153, + "step": 4176 + }, + { + "epoch": 2.029665653495441, + "grad_norm": 0.07735463118096297, + "learning_rate": 4.894700864219314e-06, + "loss": 0.5267, + "step": 4177 + }, + { + "epoch": 2.0301519756838906, + "grad_norm": 0.07002988041194366, + "learning_rate": 4.892786625337047e-06, + "loss": 0.5047, + "step": 4178 + }, + { + "epoch": 2.0306382978723403, + "grad_norm": 0.06980995310400111, + "learning_rate": 4.890872402176404e-06, + "loss": 0.507, + "step": 4179 + }, + { + "epoch": 2.0311246200607904, + "grad_norm": 0.07303792142582727, + "learning_rate": 4.8889581950180835e-06, + "loss": 0.5325, + "step": 4180 + }, + { + "epoch": 2.03161094224924, + "grad_norm": 0.0703341485201467, + "learning_rate": 4.887044004142783e-06, + "loss": 0.529, + "step": 4181 + }, + { + "epoch": 2.03209726443769, + "grad_norm": 0.07221863594394819, + "learning_rate": 4.8851298298311965e-06, + "loss": 0.5263, + "step": 4182 + }, + { + "epoch": 2.03258358662614, + "grad_norm": 0.07138545399327458, + "learning_rate": 4.883215672364016e-06, + "loss": 0.5425, + "step": 4183 + }, + { + "epoch": 2.0330699088145896, + "grad_norm": 0.06970260205901674, + "learning_rate": 4.881301532021931e-06, + "loss": 0.4924, + "step": 4184 + }, + { + "epoch": 2.0335562310030397, + "grad_norm": 0.06925962166465548, + "learning_rate": 4.879387409085628e-06, + "loss": 0.5089, + "step": 4185 + }, + { + "epoch": 2.0340425531914894, + "grad_norm": 0.07320339747292594, + "learning_rate": 4.877473303835791e-06, + "loss": 0.555, + "step": 4186 + }, + { + "epoch": 2.034528875379939, + "grad_norm": 0.07052211739307121, + "learning_rate": 4.875559216553104e-06, + "loss": 0.5212, + "step": 4187 + }, + { + "epoch": 2.0350151975683892, + "grad_norm": 0.07173699493673336, + "learning_rate": 4.873645147518244e-06, + "loss": 0.5494, + "step": 4188 + }, + { + "epoch": 2.035501519756839, + "grad_norm": 0.07123179946314978, + "learning_rate": 4.871731097011889e-06, + "loss": 0.5288, + "step": 4189 + }, + { + "epoch": 2.0359878419452886, + "grad_norm": 0.07295432467900945, + "learning_rate": 4.869817065314711e-06, + "loss": 0.5469, + "step": 4190 + }, + { + "epoch": 2.0364741641337387, + "grad_norm": 0.06949617993069249, + "learning_rate": 4.867903052707383e-06, + "loss": 0.4878, + "step": 4191 + }, + { + "epoch": 2.0369604863221884, + "grad_norm": 0.07072975679994874, + "learning_rate": 4.865989059470572e-06, + "loss": 0.5129, + "step": 4192 + }, + { + "epoch": 2.037446808510638, + "grad_norm": 0.07139665836685158, + "learning_rate": 4.8640750858849435e-06, + "loss": 0.5437, + "step": 4193 + }, + { + "epoch": 2.037933130699088, + "grad_norm": 0.07180968210847434, + "learning_rate": 4.86216113223116e-06, + "loss": 0.5027, + "step": 4194 + }, + { + "epoch": 2.038419452887538, + "grad_norm": 0.07397630404840642, + "learning_rate": 4.860247198789883e-06, + "loss": 0.5335, + "step": 4195 + }, + { + "epoch": 2.038905775075988, + "grad_norm": 0.07140117973666019, + "learning_rate": 4.858333285841765e-06, + "loss": 0.526, + "step": 4196 + }, + { + "epoch": 2.0393920972644377, + "grad_norm": 0.07070515223426743, + "learning_rate": 4.856419393667463e-06, + "loss": 0.511, + "step": 4197 + }, + { + "epoch": 2.0398784194528874, + "grad_norm": 0.07291706561477407, + "learning_rate": 4.8545055225476265e-06, + "loss": 0.5319, + "step": 4198 + }, + { + "epoch": 2.0403647416413375, + "grad_norm": 0.07030076437250854, + "learning_rate": 4.8525916727629025e-06, + "loss": 0.5169, + "step": 4199 + }, + { + "epoch": 2.040851063829787, + "grad_norm": 0.07393845704355517, + "learning_rate": 4.850677844593936e-06, + "loss": 0.5252, + "step": 4200 + }, + { + "epoch": 2.041337386018237, + "grad_norm": 0.07027881943979325, + "learning_rate": 4.848764038321367e-06, + "loss": 0.5152, + "step": 4201 + }, + { + "epoch": 2.041823708206687, + "grad_norm": 0.0730431578843535, + "learning_rate": 4.846850254225835e-06, + "loss": 0.5429, + "step": 4202 + }, + { + "epoch": 2.0423100303951367, + "grad_norm": 0.06987754199598122, + "learning_rate": 4.8449364925879745e-06, + "loss": 0.5667, + "step": 4203 + }, + { + "epoch": 2.042796352583587, + "grad_norm": 0.07165875716009988, + "learning_rate": 4.843022753688415e-06, + "loss": 0.5004, + "step": 4204 + }, + { + "epoch": 2.0432826747720365, + "grad_norm": 0.07216895464315917, + "learning_rate": 4.841109037807787e-06, + "loss": 0.5568, + "step": 4205 + }, + { + "epoch": 2.043768996960486, + "grad_norm": 0.0714623513986144, + "learning_rate": 4.839195345226715e-06, + "loss": 0.5071, + "step": 4206 + }, + { + "epoch": 2.0442553191489363, + "grad_norm": 0.07240143533521515, + "learning_rate": 4.837281676225819e-06, + "loss": 0.5349, + "step": 4207 + }, + { + "epoch": 2.044741641337386, + "grad_norm": 0.07244361076855506, + "learning_rate": 4.835368031085717e-06, + "loss": 0.4984, + "step": 4208 + }, + { + "epoch": 2.0452279635258357, + "grad_norm": 0.07202071236904925, + "learning_rate": 4.833454410087024e-06, + "loss": 0.4868, + "step": 4209 + }, + { + "epoch": 2.045714285714286, + "grad_norm": 0.06925764718537741, + "learning_rate": 4.831540813510352e-06, + "loss": 0.5199, + "step": 4210 + }, + { + "epoch": 2.0462006079027355, + "grad_norm": 0.07533749803028636, + "learning_rate": 4.829627241636306e-06, + "loss": 0.5519, + "step": 4211 + }, + { + "epoch": 2.0466869300911856, + "grad_norm": 0.07465566970295337, + "learning_rate": 4.827713694745489e-06, + "loss": 0.5335, + "step": 4212 + }, + { + "epoch": 2.0471732522796353, + "grad_norm": 0.07170669847746954, + "learning_rate": 4.825800173118503e-06, + "loss": 0.5285, + "step": 4213 + }, + { + "epoch": 2.047659574468085, + "grad_norm": 0.07260824290566995, + "learning_rate": 4.823886677035944e-06, + "loss": 0.5343, + "step": 4214 + }, + { + "epoch": 2.048145896656535, + "grad_norm": 0.07965887746294288, + "learning_rate": 4.821973206778403e-06, + "loss": 0.4994, + "step": 4215 + }, + { + "epoch": 2.048632218844985, + "grad_norm": 0.07177784973470001, + "learning_rate": 4.82005976262647e-06, + "loss": 0.507, + "step": 4216 + }, + { + "epoch": 2.0491185410334345, + "grad_norm": 0.08499717899919779, + "learning_rate": 4.818146344860729e-06, + "loss": 0.5485, + "step": 4217 + }, + { + "epoch": 2.0496048632218846, + "grad_norm": 0.07176714252214604, + "learning_rate": 4.816232953761762e-06, + "loss": 0.5384, + "step": 4218 + }, + { + "epoch": 2.0500911854103343, + "grad_norm": 0.07289023465593142, + "learning_rate": 4.814319589610146e-06, + "loss": 0.5575, + "step": 4219 + }, + { + "epoch": 2.050577507598784, + "grad_norm": 0.07162841827502749, + "learning_rate": 4.812406252686453e-06, + "loss": 0.5087, + "step": 4220 + }, + { + "epoch": 2.051063829787234, + "grad_norm": 0.06999808794476338, + "learning_rate": 4.810492943271253e-06, + "loss": 0.5381, + "step": 4221 + }, + { + "epoch": 2.051550151975684, + "grad_norm": 0.07362939865519282, + "learning_rate": 4.8085796616451086e-06, + "loss": 0.5467, + "step": 4222 + }, + { + "epoch": 2.052036474164134, + "grad_norm": 0.07235852498278311, + "learning_rate": 4.806666408088585e-06, + "loss": 0.5181, + "step": 4223 + }, + { + "epoch": 2.0525227963525836, + "grad_norm": 0.0717383388555045, + "learning_rate": 4.804753182882237e-06, + "loss": 0.548, + "step": 4224 + }, + { + "epoch": 2.0530091185410333, + "grad_norm": 0.07089317780872104, + "learning_rate": 4.802839986306619e-06, + "loss": 0.5145, + "step": 4225 + }, + { + "epoch": 2.0534954407294834, + "grad_norm": 0.06934855459471415, + "learning_rate": 4.800926818642278e-06, + "loss": 0.4913, + "step": 4226 + }, + { + "epoch": 2.053981762917933, + "grad_norm": 0.07144099789625005, + "learning_rate": 4.799013680169757e-06, + "loss": 0.5542, + "step": 4227 + }, + { + "epoch": 2.054468085106383, + "grad_norm": 0.07252433831201918, + "learning_rate": 4.797100571169597e-06, + "loss": 0.5007, + "step": 4228 + }, + { + "epoch": 2.054954407294833, + "grad_norm": 0.0749424627498245, + "learning_rate": 4.795187491922336e-06, + "loss": 0.5374, + "step": 4229 + }, + { + "epoch": 2.0554407294832826, + "grad_norm": 0.07256734461702409, + "learning_rate": 4.793274442708502e-06, + "loss": 0.5005, + "step": 4230 + }, + { + "epoch": 2.0559270516717327, + "grad_norm": 0.07206862839449416, + "learning_rate": 4.791361423808623e-06, + "loss": 0.555, + "step": 4231 + }, + { + "epoch": 2.0564133738601824, + "grad_norm": 0.07178024549561958, + "learning_rate": 4.789448435503224e-06, + "loss": 0.5347, + "step": 4232 + }, + { + "epoch": 2.056899696048632, + "grad_norm": 0.07063748081507971, + "learning_rate": 4.78753547807282e-06, + "loss": 0.5148, + "step": 4233 + }, + { + "epoch": 2.0573860182370822, + "grad_norm": 0.06988911857142277, + "learning_rate": 4.785622551797926e-06, + "loss": 0.4973, + "step": 4234 + }, + { + "epoch": 2.057872340425532, + "grad_norm": 0.07065133985763723, + "learning_rate": 4.78370965695905e-06, + "loss": 0.5148, + "step": 4235 + }, + { + "epoch": 2.0583586626139816, + "grad_norm": 0.0732950012079987, + "learning_rate": 4.781796793836696e-06, + "loss": 0.4937, + "step": 4236 + }, + { + "epoch": 2.0588449848024317, + "grad_norm": 0.0694929566050334, + "learning_rate": 4.779883962711364e-06, + "loss": 0.5067, + "step": 4237 + }, + { + "epoch": 2.0593313069908814, + "grad_norm": 0.0716832955681354, + "learning_rate": 4.7779711638635504e-06, + "loss": 0.5417, + "step": 4238 + }, + { + "epoch": 2.059817629179331, + "grad_norm": 0.07233257038104912, + "learning_rate": 4.776058397573744e-06, + "loss": 0.5001, + "step": 4239 + }, + { + "epoch": 2.060303951367781, + "grad_norm": 0.07224215030014126, + "learning_rate": 4.7741456641224295e-06, + "loss": 0.5291, + "step": 4240 + }, + { + "epoch": 2.060790273556231, + "grad_norm": 0.07081977650099833, + "learning_rate": 4.7722329637900895e-06, + "loss": 0.488, + "step": 4241 + }, + { + "epoch": 2.061276595744681, + "grad_norm": 0.07122991027392214, + "learning_rate": 4.7703202968572e-06, + "loss": 0.5148, + "step": 4242 + }, + { + "epoch": 2.0617629179331307, + "grad_norm": 0.0715727784395469, + "learning_rate": 4.768407663604229e-06, + "loss": 0.5322, + "step": 4243 + }, + { + "epoch": 2.0622492401215804, + "grad_norm": 0.07028995464871736, + "learning_rate": 4.7664950643116445e-06, + "loss": 0.5064, + "step": 4244 + }, + { + "epoch": 2.0627355623100305, + "grad_norm": 0.07034899044321367, + "learning_rate": 4.764582499259908e-06, + "loss": 0.5243, + "step": 4245 + }, + { + "epoch": 2.06322188449848, + "grad_norm": 0.07235862275941812, + "learning_rate": 4.7626699687294746e-06, + "loss": 0.5326, + "step": 4246 + }, + { + "epoch": 2.06370820668693, + "grad_norm": 0.07509184317130671, + "learning_rate": 4.760757473000794e-06, + "loss": 0.5324, + "step": 4247 + }, + { + "epoch": 2.06419452887538, + "grad_norm": 0.07254159181567657, + "learning_rate": 4.758845012354314e-06, + "loss": 0.5137, + "step": 4248 + }, + { + "epoch": 2.0646808510638297, + "grad_norm": 0.07267173493401578, + "learning_rate": 4.756932587070476e-06, + "loss": 0.5284, + "step": 4249 + }, + { + "epoch": 2.06516717325228, + "grad_norm": 0.07311315318936584, + "learning_rate": 4.755020197429713e-06, + "loss": 0.543, + "step": 4250 + }, + { + "epoch": 2.0656534954407295, + "grad_norm": 0.0734409520378827, + "learning_rate": 4.7531078437124555e-06, + "loss": 0.5171, + "step": 4251 + }, + { + "epoch": 2.066139817629179, + "grad_norm": 0.07187278949456045, + "learning_rate": 4.751195526199129e-06, + "loss": 0.5286, + "step": 4252 + }, + { + "epoch": 2.0666261398176293, + "grad_norm": 0.07164257355329291, + "learning_rate": 4.749283245170153e-06, + "loss": 0.5113, + "step": 4253 + }, + { + "epoch": 2.067112462006079, + "grad_norm": 0.07227591181386367, + "learning_rate": 4.747371000905943e-06, + "loss": 0.5251, + "step": 4254 + }, + { + "epoch": 2.0675987841945287, + "grad_norm": 0.07252537478858759, + "learning_rate": 4.745458793686906e-06, + "loss": 0.5247, + "step": 4255 + }, + { + "epoch": 2.068085106382979, + "grad_norm": 0.07474147359887902, + "learning_rate": 4.743546623793447e-06, + "loss": 0.5475, + "step": 4256 + }, + { + "epoch": 2.0685714285714285, + "grad_norm": 0.06968974901182258, + "learning_rate": 4.741634491505963e-06, + "loss": 0.482, + "step": 4257 + }, + { + "epoch": 2.0690577507598786, + "grad_norm": 0.07364892356782594, + "learning_rate": 4.739722397104849e-06, + "loss": 0.5244, + "step": 4258 + }, + { + "epoch": 2.0695440729483283, + "grad_norm": 0.07361437024359314, + "learning_rate": 4.737810340870484e-06, + "loss": 0.549, + "step": 4259 + }, + { + "epoch": 2.070030395136778, + "grad_norm": 0.07175054061207829, + "learning_rate": 4.73589832308326e-06, + "loss": 0.5396, + "step": 4260 + }, + { + "epoch": 2.070516717325228, + "grad_norm": 0.07522201111796856, + "learning_rate": 4.733986344023547e-06, + "loss": 0.5461, + "step": 4261 + }, + { + "epoch": 2.071003039513678, + "grad_norm": 0.07293163098546677, + "learning_rate": 4.732074403971716e-06, + "loss": 0.5522, + "step": 4262 + }, + { + "epoch": 2.0714893617021275, + "grad_norm": 0.07235982880906476, + "learning_rate": 4.730162503208131e-06, + "loss": 0.5387, + "step": 4263 + }, + { + "epoch": 2.0719756838905776, + "grad_norm": 0.07058410497227291, + "learning_rate": 4.728250642013151e-06, + "loss": 0.5388, + "step": 4264 + }, + { + "epoch": 2.0724620060790273, + "grad_norm": 0.0727881467734768, + "learning_rate": 4.726338820667128e-06, + "loss": 0.5213, + "step": 4265 + }, + { + "epoch": 2.072948328267477, + "grad_norm": 0.0711038161087961, + "learning_rate": 4.7244270394504085e-06, + "loss": 0.554, + "step": 4266 + }, + { + "epoch": 2.073434650455927, + "grad_norm": 0.07323877583410632, + "learning_rate": 4.722515298643335e-06, + "loss": 0.5151, + "step": 4267 + }, + { + "epoch": 2.073920972644377, + "grad_norm": 0.0736735373071005, + "learning_rate": 4.720603598526243e-06, + "loss": 0.527, + "step": 4268 + }, + { + "epoch": 2.074407294832827, + "grad_norm": 0.07490647231505362, + "learning_rate": 4.718691939379459e-06, + "loss": 0.5319, + "step": 4269 + }, + { + "epoch": 2.0748936170212766, + "grad_norm": 0.0726703140165146, + "learning_rate": 4.716780321483308e-06, + "loss": 0.5215, + "step": 4270 + }, + { + "epoch": 2.0753799392097263, + "grad_norm": 0.07039672869273815, + "learning_rate": 4.714868745118107e-06, + "loss": 0.5142, + "step": 4271 + }, + { + "epoch": 2.0758662613981764, + "grad_norm": 0.07150052442021539, + "learning_rate": 4.712957210564166e-06, + "loss": 0.5239, + "step": 4272 + }, + { + "epoch": 2.076352583586626, + "grad_norm": 0.07266818345949104, + "learning_rate": 4.7110457181017925e-06, + "loss": 0.5631, + "step": 4273 + }, + { + "epoch": 2.076838905775076, + "grad_norm": 0.07405293793860726, + "learning_rate": 4.709134268011281e-06, + "loss": 0.5305, + "step": 4274 + }, + { + "epoch": 2.077325227963526, + "grad_norm": 0.07175009658076159, + "learning_rate": 4.707222860572928e-06, + "loss": 0.5136, + "step": 4275 + }, + { + "epoch": 2.0778115501519756, + "grad_norm": 0.07114418055672524, + "learning_rate": 4.705311496067016e-06, + "loss": 0.5141, + "step": 4276 + }, + { + "epoch": 2.0782978723404257, + "grad_norm": 0.07065371945223853, + "learning_rate": 4.703400174773825e-06, + "loss": 0.5085, + "step": 4277 + }, + { + "epoch": 2.0787841945288754, + "grad_norm": 0.07275091785706649, + "learning_rate": 4.701488896973633e-06, + "loss": 0.5033, + "step": 4278 + }, + { + "epoch": 2.079270516717325, + "grad_norm": 0.07149974274334023, + "learning_rate": 4.6995776629467045e-06, + "loss": 0.5219, + "step": 4279 + }, + { + "epoch": 2.0797568389057752, + "grad_norm": 0.06835347572736793, + "learning_rate": 4.6976664729733e-06, + "loss": 0.5029, + "step": 4280 + }, + { + "epoch": 2.080243161094225, + "grad_norm": 0.06915904097928988, + "learning_rate": 4.695755327333673e-06, + "loss": 0.5139, + "step": 4281 + }, + { + "epoch": 2.0807294832826746, + "grad_norm": 0.07463886689914738, + "learning_rate": 4.693844226308073e-06, + "loss": 0.5522, + "step": 4282 + }, + { + "epoch": 2.0812158054711247, + "grad_norm": 0.0722963202740917, + "learning_rate": 4.691933170176741e-06, + "loss": 0.5187, + "step": 4283 + }, + { + "epoch": 2.0817021276595744, + "grad_norm": 0.07214746268838576, + "learning_rate": 4.6900221592199105e-06, + "loss": 0.5343, + "step": 4284 + }, + { + "epoch": 2.082188449848024, + "grad_norm": 0.07260814640148158, + "learning_rate": 4.68811119371781e-06, + "loss": 0.5274, + "step": 4285 + }, + { + "epoch": 2.082674772036474, + "grad_norm": 0.07157456990337037, + "learning_rate": 4.686200273950662e-06, + "loss": 0.5252, + "step": 4286 + }, + { + "epoch": 2.083161094224924, + "grad_norm": 0.07055184941556789, + "learning_rate": 4.684289400198682e-06, + "loss": 0.5179, + "step": 4287 + }, + { + "epoch": 2.083647416413374, + "grad_norm": 0.07009853751425962, + "learning_rate": 4.682378572742074e-06, + "loss": 0.5215, + "step": 4288 + }, + { + "epoch": 2.0841337386018237, + "grad_norm": 0.07473808557838861, + "learning_rate": 4.680467791861042e-06, + "loss": 0.5105, + "step": 4289 + }, + { + "epoch": 2.0846200607902734, + "grad_norm": 0.07159394012230107, + "learning_rate": 4.67855705783578e-06, + "loss": 0.5113, + "step": 4290 + }, + { + "epoch": 2.0851063829787235, + "grad_norm": 0.07003680550608568, + "learning_rate": 4.676646370946475e-06, + "loss": 0.4907, + "step": 4291 + }, + { + "epoch": 2.085592705167173, + "grad_norm": 0.07090815229239068, + "learning_rate": 4.674735731473308e-06, + "loss": 0.5267, + "step": 4292 + }, + { + "epoch": 2.086079027355623, + "grad_norm": 0.07244282893226998, + "learning_rate": 4.672825139696452e-06, + "loss": 0.514, + "step": 4293 + }, + { + "epoch": 2.086565349544073, + "grad_norm": 0.07012379184722464, + "learning_rate": 4.670914595896075e-06, + "loss": 0.5109, + "step": 4294 + }, + { + "epoch": 2.0870516717325227, + "grad_norm": 0.07360448994801068, + "learning_rate": 4.669004100352333e-06, + "loss": 0.4967, + "step": 4295 + }, + { + "epoch": 2.087537993920973, + "grad_norm": 0.07345894847607636, + "learning_rate": 4.667093653345382e-06, + "loss": 0.568, + "step": 4296 + }, + { + "epoch": 2.0880243161094225, + "grad_norm": 0.0719953055546164, + "learning_rate": 4.665183255155367e-06, + "loss": 0.5517, + "step": 4297 + }, + { + "epoch": 2.088510638297872, + "grad_norm": 0.07145027950921647, + "learning_rate": 4.663272906062426e-06, + "loss": 0.519, + "step": 4298 + }, + { + "epoch": 2.0889969604863223, + "grad_norm": 0.07292007262090149, + "learning_rate": 4.661362606346689e-06, + "loss": 0.5412, + "step": 4299 + }, + { + "epoch": 2.089483282674772, + "grad_norm": 0.06976454334601297, + "learning_rate": 4.65945235628828e-06, + "loss": 0.4879, + "step": 4300 + }, + { + "epoch": 2.0899696048632217, + "grad_norm": 0.0708872059691606, + "learning_rate": 4.657542156167316e-06, + "loss": 0.5036, + "step": 4301 + }, + { + "epoch": 2.090455927051672, + "grad_norm": 0.07277940726493234, + "learning_rate": 4.655632006263907e-06, + "loss": 0.5109, + "step": 4302 + }, + { + "epoch": 2.0909422492401215, + "grad_norm": 0.07262977668497943, + "learning_rate": 4.653721906858153e-06, + "loss": 0.5727, + "step": 4303 + }, + { + "epoch": 2.0914285714285716, + "grad_norm": 0.07464151898875335, + "learning_rate": 4.651811858230149e-06, + "loss": 0.5068, + "step": 4304 + }, + { + "epoch": 2.0919148936170213, + "grad_norm": 0.07239360841205382, + "learning_rate": 4.6499018606599815e-06, + "loss": 0.547, + "step": 4305 + }, + { + "epoch": 2.092401215805471, + "grad_norm": 0.07078991567607104, + "learning_rate": 4.647991914427732e-06, + "loss": 0.5422, + "step": 4306 + }, + { + "epoch": 2.092887537993921, + "grad_norm": 0.07056129806041661, + "learning_rate": 4.64608201981347e-06, + "loss": 0.498, + "step": 4307 + }, + { + "epoch": 2.093373860182371, + "grad_norm": 0.06966870092423305, + "learning_rate": 4.644172177097259e-06, + "loss": 0.5165, + "step": 4308 + }, + { + "epoch": 2.0938601823708205, + "grad_norm": 0.07103894963685853, + "learning_rate": 4.64226238655916e-06, + "loss": 0.4946, + "step": 4309 + }, + { + "epoch": 2.0943465045592706, + "grad_norm": 0.07365719921672839, + "learning_rate": 4.640352648479219e-06, + "loss": 0.564, + "step": 4310 + }, + { + "epoch": 2.0948328267477203, + "grad_norm": 0.07441239044739215, + "learning_rate": 4.638442963137478e-06, + "loss": 0.5748, + "step": 4311 + }, + { + "epoch": 2.09531914893617, + "grad_norm": 0.07244824047157294, + "learning_rate": 4.636533330813971e-06, + "loss": 0.5222, + "step": 4312 + }, + { + "epoch": 2.09580547112462, + "grad_norm": 0.07166380134159551, + "learning_rate": 4.6346237517887214e-06, + "loss": 0.5098, + "step": 4313 + }, + { + "epoch": 2.09629179331307, + "grad_norm": 0.07032576929905841, + "learning_rate": 4.632714226341751e-06, + "loss": 0.4956, + "step": 4314 + }, + { + "epoch": 2.09677811550152, + "grad_norm": 0.07244751402274383, + "learning_rate": 4.630804754753069e-06, + "loss": 0.5207, + "step": 4315 + }, + { + "epoch": 2.0972644376899696, + "grad_norm": 0.06921197183750415, + "learning_rate": 4.628895337302676e-06, + "loss": 0.4966, + "step": 4316 + }, + { + "epoch": 2.0977507598784193, + "grad_norm": 0.07455758201185998, + "learning_rate": 4.62698597427057e-06, + "loss": 0.5158, + "step": 4317 + }, + { + "epoch": 2.0982370820668694, + "grad_norm": 0.07075546064268393, + "learning_rate": 4.625076665936733e-06, + "loss": 0.4856, + "step": 4318 + }, + { + "epoch": 2.098723404255319, + "grad_norm": 0.07475995164046163, + "learning_rate": 4.623167412581147e-06, + "loss": 0.5166, + "step": 4319 + }, + { + "epoch": 2.099209726443769, + "grad_norm": 0.07230168705872128, + "learning_rate": 4.621258214483779e-06, + "loss": 0.5416, + "step": 4320 + }, + { + "epoch": 2.099696048632219, + "grad_norm": 0.07280495227536604, + "learning_rate": 4.619349071924594e-06, + "loss": 0.5269, + "step": 4321 + }, + { + "epoch": 2.1001823708206686, + "grad_norm": 0.07272142886384839, + "learning_rate": 4.617439985183545e-06, + "loss": 0.5566, + "step": 4322 + }, + { + "epoch": 2.1006686930091187, + "grad_norm": 0.06986298285529687, + "learning_rate": 4.615530954540578e-06, + "loss": 0.515, + "step": 4323 + }, + { + "epoch": 2.1011550151975684, + "grad_norm": 0.06904839372569721, + "learning_rate": 4.6136219802756295e-06, + "loss": 0.483, + "step": 4324 + }, + { + "epoch": 2.101641337386018, + "grad_norm": 0.07023845723050942, + "learning_rate": 4.6117130626686304e-06, + "loss": 0.493, + "step": 4325 + }, + { + "epoch": 2.1021276595744682, + "grad_norm": 0.07063847117540045, + "learning_rate": 4.609804201999503e-06, + "loss": 0.522, + "step": 4326 + }, + { + "epoch": 2.102613981762918, + "grad_norm": 0.0714599395002521, + "learning_rate": 4.6078953985481565e-06, + "loss": 0.5031, + "step": 4327 + }, + { + "epoch": 2.1031003039513676, + "grad_norm": 0.07290365972396484, + "learning_rate": 4.6059866525944984e-06, + "loss": 0.5264, + "step": 4328 + }, + { + "epoch": 2.1035866261398177, + "grad_norm": 0.07368246307381719, + "learning_rate": 4.604077964418422e-06, + "loss": 0.5907, + "step": 4329 + }, + { + "epoch": 2.1040729483282674, + "grad_norm": 0.06990178817338745, + "learning_rate": 4.602169334299817e-06, + "loss": 0.5231, + "step": 4330 + }, + { + "epoch": 2.1045592705167175, + "grad_norm": 0.07145163700036854, + "learning_rate": 4.60026076251856e-06, + "loss": 0.5304, + "step": 4331 + }, + { + "epoch": 2.1050455927051672, + "grad_norm": 0.07204756646226827, + "learning_rate": 4.5983522493545246e-06, + "loss": 0.5552, + "step": 4332 + }, + { + "epoch": 2.105531914893617, + "grad_norm": 0.0705789472994367, + "learning_rate": 4.59644379508757e-06, + "loss": 0.5288, + "step": 4333 + }, + { + "epoch": 2.106018237082067, + "grad_norm": 0.07242253113802473, + "learning_rate": 4.594535399997551e-06, + "loss": 0.5434, + "step": 4334 + }, + { + "epoch": 2.1065045592705167, + "grad_norm": 0.07047178928556341, + "learning_rate": 4.59262706436431e-06, + "loss": 0.5093, + "step": 4335 + }, + { + "epoch": 2.1069908814589664, + "grad_norm": 0.07459318432108243, + "learning_rate": 4.590718788467685e-06, + "loss": 0.5662, + "step": 4336 + }, + { + "epoch": 2.1074772036474165, + "grad_norm": 0.07259322741165329, + "learning_rate": 4.588810572587502e-06, + "loss": 0.5233, + "step": 4337 + }, + { + "epoch": 2.107963525835866, + "grad_norm": 0.07034984759649421, + "learning_rate": 4.5869024170035786e-06, + "loss": 0.523, + "step": 4338 + }, + { + "epoch": 2.108449848024316, + "grad_norm": 0.0723900388291076, + "learning_rate": 4.584994321995725e-06, + "loss": 0.5572, + "step": 4339 + }, + { + "epoch": 2.108936170212766, + "grad_norm": 0.07247297420638485, + "learning_rate": 4.583086287843741e-06, + "loss": 0.5627, + "step": 4340 + }, + { + "epoch": 2.1094224924012157, + "grad_norm": 0.07349196046813768, + "learning_rate": 4.58117831482742e-06, + "loss": 0.5491, + "step": 4341 + }, + { + "epoch": 2.109908814589666, + "grad_norm": 0.07250925681538714, + "learning_rate": 4.579270403226542e-06, + "loss": 0.521, + "step": 4342 + }, + { + "epoch": 2.1103951367781155, + "grad_norm": 0.06974951948719156, + "learning_rate": 4.577362553320882e-06, + "loss": 0.482, + "step": 4343 + }, + { + "epoch": 2.110881458966565, + "grad_norm": 0.07289419016946141, + "learning_rate": 4.575454765390204e-06, + "loss": 0.545, + "step": 4344 + }, + { + "epoch": 2.1113677811550153, + "grad_norm": 0.07283152144900658, + "learning_rate": 4.573547039714263e-06, + "loss": 0.4963, + "step": 4345 + }, + { + "epoch": 2.111854103343465, + "grad_norm": 0.07459158269856914, + "learning_rate": 4.571639376572806e-06, + "loss": 0.5551, + "step": 4346 + }, + { + "epoch": 2.1123404255319147, + "grad_norm": 0.07127804473628366, + "learning_rate": 4.569731776245571e-06, + "loss": 0.5118, + "step": 4347 + }, + { + "epoch": 2.112826747720365, + "grad_norm": 0.07176982435222327, + "learning_rate": 4.567824239012284e-06, + "loss": 0.4822, + "step": 4348 + }, + { + "epoch": 2.1133130699088145, + "grad_norm": 0.07094660000979022, + "learning_rate": 4.5659167651526645e-06, + "loss": 0.5492, + "step": 4349 + }, + { + "epoch": 2.1137993920972646, + "grad_norm": 0.07468500504931763, + "learning_rate": 4.564009354946422e-06, + "loss": 0.4999, + "step": 4350 + }, + { + "epoch": 2.1142857142857143, + "grad_norm": 0.07096007843599139, + "learning_rate": 4.562102008673258e-06, + "loss": 0.4932, + "step": 4351 + }, + { + "epoch": 2.114772036474164, + "grad_norm": 0.07472469352671478, + "learning_rate": 4.56019472661286e-06, + "loss": 0.5505, + "step": 4352 + }, + { + "epoch": 2.115258358662614, + "grad_norm": 0.07229753615196546, + "learning_rate": 4.558287509044913e-06, + "loss": 0.5381, + "step": 4353 + }, + { + "epoch": 2.115744680851064, + "grad_norm": 0.07050239474417577, + "learning_rate": 4.556380356249086e-06, + "loss": 0.5205, + "step": 4354 + }, + { + "epoch": 2.1162310030395135, + "grad_norm": 0.07117464513021124, + "learning_rate": 4.554473268505043e-06, + "loss": 0.5029, + "step": 4355 + }, + { + "epoch": 2.1167173252279636, + "grad_norm": 0.07075203879334808, + "learning_rate": 4.552566246092434e-06, + "loss": 0.5271, + "step": 4356 + }, + { + "epoch": 2.1172036474164133, + "grad_norm": 0.07555866623262578, + "learning_rate": 4.550659289290905e-06, + "loss": 0.5174, + "step": 4357 + }, + { + "epoch": 2.1176899696048634, + "grad_norm": 0.0723577624300876, + "learning_rate": 4.548752398380088e-06, + "loss": 0.5277, + "step": 4358 + }, + { + "epoch": 2.118176291793313, + "grad_norm": 0.07390188067949988, + "learning_rate": 4.546845573639609e-06, + "loss": 0.5274, + "step": 4359 + }, + { + "epoch": 2.118662613981763, + "grad_norm": 0.07405879124155919, + "learning_rate": 4.544938815349079e-06, + "loss": 0.5452, + "step": 4360 + }, + { + "epoch": 2.119148936170213, + "grad_norm": 0.07467607547896579, + "learning_rate": 4.543032123788105e-06, + "loss": 0.5272, + "step": 4361 + }, + { + "epoch": 2.1196352583586626, + "grad_norm": 0.073900252927743, + "learning_rate": 4.541125499236281e-06, + "loss": 0.5637, + "step": 4362 + }, + { + "epoch": 2.1201215805471123, + "grad_norm": 0.07222066525962098, + "learning_rate": 4.539218941973191e-06, + "loss": 0.5379, + "step": 4363 + }, + { + "epoch": 2.1206079027355624, + "grad_norm": 0.0714327005797944, + "learning_rate": 4.537312452278412e-06, + "loss": 0.523, + "step": 4364 + }, + { + "epoch": 2.121094224924012, + "grad_norm": 0.07518860546404318, + "learning_rate": 4.535406030431507e-06, + "loss": 0.5221, + "step": 4365 + }, + { + "epoch": 2.121580547112462, + "grad_norm": 0.07293100530194213, + "learning_rate": 4.533499676712032e-06, + "loss": 0.5008, + "step": 4366 + }, + { + "epoch": 2.122066869300912, + "grad_norm": 0.07103974494309062, + "learning_rate": 4.531593391399532e-06, + "loss": 0.4809, + "step": 4367 + }, + { + "epoch": 2.1225531914893616, + "grad_norm": 0.07299123816732794, + "learning_rate": 4.5296871747735396e-06, + "loss": 0.5408, + "step": 4368 + }, + { + "epoch": 2.1230395136778117, + "grad_norm": 0.07312515277720558, + "learning_rate": 4.527781027113584e-06, + "loss": 0.5308, + "step": 4369 + }, + { + "epoch": 2.1235258358662614, + "grad_norm": 0.07131281916667226, + "learning_rate": 4.5258749486991794e-06, + "loss": 0.5264, + "step": 4370 + }, + { + "epoch": 2.124012158054711, + "grad_norm": 0.07351564221942003, + "learning_rate": 4.523968939809829e-06, + "loss": 0.562, + "step": 4371 + }, + { + "epoch": 2.1244984802431612, + "grad_norm": 0.0711517874828819, + "learning_rate": 4.522063000725028e-06, + "loss": 0.4869, + "step": 4372 + }, + { + "epoch": 2.124984802431611, + "grad_norm": 0.07363116551111525, + "learning_rate": 4.52015713172426e-06, + "loss": 0.5528, + "step": 4373 + }, + { + "epoch": 2.1254711246200606, + "grad_norm": 0.07261106618957541, + "learning_rate": 4.5182513330869996e-06, + "loss": 0.582, + "step": 4374 + }, + { + "epoch": 2.1259574468085107, + "grad_norm": 0.07012232020803523, + "learning_rate": 4.516345605092712e-06, + "loss": 0.503, + "step": 4375 + }, + { + "epoch": 2.1264437689969604, + "grad_norm": 0.07423309762826079, + "learning_rate": 4.514439948020847e-06, + "loss": 0.5412, + "step": 4376 + }, + { + "epoch": 2.12693009118541, + "grad_norm": 0.06996702153876222, + "learning_rate": 4.512534362150851e-06, + "loss": 0.5273, + "step": 4377 + }, + { + "epoch": 2.1274164133738602, + "grad_norm": 0.07004293397103051, + "learning_rate": 4.510628847762155e-06, + "loss": 0.4796, + "step": 4378 + }, + { + "epoch": 2.12790273556231, + "grad_norm": 0.10364937852536732, + "learning_rate": 4.5087234051341825e-06, + "loss": 0.5239, + "step": 4379 + }, + { + "epoch": 2.12838905775076, + "grad_norm": 0.07153957074665195, + "learning_rate": 4.506818034546343e-06, + "loss": 0.5366, + "step": 4380 + }, + { + "epoch": 2.1288753799392097, + "grad_norm": 0.07267839149239437, + "learning_rate": 4.504912736278038e-06, + "loss": 0.5012, + "step": 4381 + }, + { + "epoch": 2.1293617021276594, + "grad_norm": 0.07203663993755269, + "learning_rate": 4.503007510608657e-06, + "loss": 0.5335, + "step": 4382 + }, + { + "epoch": 2.1298480243161095, + "grad_norm": 0.0715465433743639, + "learning_rate": 4.501102357817582e-06, + "loss": 0.5115, + "step": 4383 + }, + { + "epoch": 2.130334346504559, + "grad_norm": 0.07195440043584993, + "learning_rate": 4.499197278184181e-06, + "loss": 0.5112, + "step": 4384 + }, + { + "epoch": 2.1308206686930093, + "grad_norm": 0.07513808040327515, + "learning_rate": 4.497292271987812e-06, + "loss": 0.505, + "step": 4385 + }, + { + "epoch": 2.131306990881459, + "grad_norm": 0.07168007140474772, + "learning_rate": 4.495387339507822e-06, + "loss": 0.5376, + "step": 4386 + }, + { + "epoch": 2.1317933130699087, + "grad_norm": 0.07358464449910782, + "learning_rate": 4.493482481023549e-06, + "loss": 0.5986, + "step": 4387 + }, + { + "epoch": 2.132279635258359, + "grad_norm": 0.07045364757161546, + "learning_rate": 4.491577696814318e-06, + "loss": 0.5577, + "step": 4388 + }, + { + "epoch": 2.1327659574468085, + "grad_norm": 0.07397463312377561, + "learning_rate": 4.4896729871594446e-06, + "loss": 0.5283, + "step": 4389 + }, + { + "epoch": 2.133252279635258, + "grad_norm": 0.07356866875885495, + "learning_rate": 4.487768352338232e-06, + "loss": 0.5568, + "step": 4390 + }, + { + "epoch": 2.1337386018237083, + "grad_norm": 0.0732072980689226, + "learning_rate": 4.4858637926299745e-06, + "loss": 0.5648, + "step": 4391 + }, + { + "epoch": 2.134224924012158, + "grad_norm": 0.07065829230946877, + "learning_rate": 4.4839593083139536e-06, + "loss": 0.4997, + "step": 4392 + }, + { + "epoch": 2.1347112462006077, + "grad_norm": 0.0705261761238876, + "learning_rate": 4.482054899669439e-06, + "loss": 0.5296, + "step": 4393 + }, + { + "epoch": 2.135197568389058, + "grad_norm": 0.07140251558208224, + "learning_rate": 4.480150566975693e-06, + "loss": 0.536, + "step": 4394 + }, + { + "epoch": 2.1356838905775075, + "grad_norm": 0.07393775448589415, + "learning_rate": 4.478246310511963e-06, + "loss": 0.5339, + "step": 4395 + }, + { + "epoch": 2.1361702127659576, + "grad_norm": 0.07167531424173072, + "learning_rate": 4.476342130557486e-06, + "loss": 0.5686, + "step": 4396 + }, + { + "epoch": 2.1366565349544073, + "grad_norm": 0.07158235561773914, + "learning_rate": 4.474438027391489e-06, + "loss": 0.5183, + "step": 4397 + }, + { + "epoch": 2.137142857142857, + "grad_norm": 0.0706203738598817, + "learning_rate": 4.472534001293187e-06, + "loss": 0.5242, + "step": 4398 + }, + { + "epoch": 2.137629179331307, + "grad_norm": 0.07585642372597831, + "learning_rate": 4.4706300525417845e-06, + "loss": 0.5496, + "step": 4399 + }, + { + "epoch": 2.138115501519757, + "grad_norm": 0.07214989660051936, + "learning_rate": 4.468726181416473e-06, + "loss": 0.5296, + "step": 4400 + }, + { + "epoch": 2.1386018237082065, + "grad_norm": 0.07326627443925492, + "learning_rate": 4.466822388196434e-06, + "loss": 0.5733, + "step": 4401 + }, + { + "epoch": 2.1390881458966566, + "grad_norm": 0.07068439489348208, + "learning_rate": 4.464918673160837e-06, + "loss": 0.5364, + "step": 4402 + }, + { + "epoch": 2.1395744680851063, + "grad_norm": 0.07027562850145501, + "learning_rate": 4.463015036588841e-06, + "loss": 0.522, + "step": 4403 + }, + { + "epoch": 2.140060790273556, + "grad_norm": 0.07164970636775356, + "learning_rate": 4.46111147875959e-06, + "loss": 0.5092, + "step": 4404 + }, + { + "epoch": 2.140547112462006, + "grad_norm": 0.07040889695873687, + "learning_rate": 4.459207999952223e-06, + "loss": 0.4772, + "step": 4405 + }, + { + "epoch": 2.141033434650456, + "grad_norm": 0.07402437125040097, + "learning_rate": 4.457304600445861e-06, + "loss": 0.5818, + "step": 4406 + }, + { + "epoch": 2.141519756838906, + "grad_norm": 0.07055123213084917, + "learning_rate": 4.455401280519617e-06, + "loss": 0.5225, + "step": 4407 + }, + { + "epoch": 2.1420060790273556, + "grad_norm": 0.07274622696245434, + "learning_rate": 4.45349804045259e-06, + "loss": 0.5227, + "step": 4408 + }, + { + "epoch": 2.1424924012158053, + "grad_norm": 0.07233454570922326, + "learning_rate": 4.451594880523872e-06, + "loss": 0.5482, + "step": 4409 + }, + { + "epoch": 2.1429787234042554, + "grad_norm": 0.07079664235469232, + "learning_rate": 4.449691801012535e-06, + "loss": 0.487, + "step": 4410 + }, + { + "epoch": 2.143465045592705, + "grad_norm": 0.07145382426144663, + "learning_rate": 4.447788802197647e-06, + "loss": 0.5194, + "step": 4411 + }, + { + "epoch": 2.1439513677811552, + "grad_norm": 0.070013756924666, + "learning_rate": 4.44588588435826e-06, + "loss": 0.5288, + "step": 4412 + }, + { + "epoch": 2.144437689969605, + "grad_norm": 0.07459799994464224, + "learning_rate": 4.443983047773417e-06, + "loss": 0.5999, + "step": 4413 + }, + { + "epoch": 2.1449240121580546, + "grad_norm": 0.07096886542848622, + "learning_rate": 4.442080292722144e-06, + "loss": 0.5415, + "step": 4414 + }, + { + "epoch": 2.1454103343465047, + "grad_norm": 0.07258522929978131, + "learning_rate": 4.4401776194834615e-06, + "loss": 0.5443, + "step": 4415 + }, + { + "epoch": 2.1458966565349544, + "grad_norm": 0.07216420978459806, + "learning_rate": 4.438275028336374e-06, + "loss": 0.5068, + "step": 4416 + }, + { + "epoch": 2.146382978723404, + "grad_norm": 0.07258998264618445, + "learning_rate": 4.436372519559874e-06, + "loss": 0.5189, + "step": 4417 + }, + { + "epoch": 2.1468693009118542, + "grad_norm": 0.07184244385273168, + "learning_rate": 4.434470093432945e-06, + "loss": 0.5464, + "step": 4418 + }, + { + "epoch": 2.147355623100304, + "grad_norm": 0.07152660167371257, + "learning_rate": 4.432567750234554e-06, + "loss": 0.5316, + "step": 4419 + }, + { + "epoch": 2.1478419452887536, + "grad_norm": 0.07649460446870177, + "learning_rate": 4.430665490243659e-06, + "loss": 0.5276, + "step": 4420 + }, + { + "epoch": 2.1483282674772037, + "grad_norm": 0.07127005041148703, + "learning_rate": 4.428763313739204e-06, + "loss": 0.5354, + "step": 4421 + }, + { + "epoch": 2.1488145896656534, + "grad_norm": 0.07296418973582286, + "learning_rate": 4.426861221000121e-06, + "loss": 0.5471, + "step": 4422 + }, + { + "epoch": 2.1493009118541035, + "grad_norm": 0.07097311914711622, + "learning_rate": 4.424959212305334e-06, + "loss": 0.4923, + "step": 4423 + }, + { + "epoch": 2.1497872340425532, + "grad_norm": 0.07275208094053433, + "learning_rate": 4.423057287933748e-06, + "loss": 0.5256, + "step": 4424 + }, + { + "epoch": 2.150273556231003, + "grad_norm": 0.07289688301809912, + "learning_rate": 4.421155448164258e-06, + "loss": 0.5334, + "step": 4425 + }, + { + "epoch": 2.150759878419453, + "grad_norm": 0.07219912078647304, + "learning_rate": 4.419253693275749e-06, + "loss": 0.5093, + "step": 4426 + }, + { + "epoch": 2.1512462006079027, + "grad_norm": 0.0725635644775168, + "learning_rate": 4.4173520235470905e-06, + "loss": 0.5058, + "step": 4427 + }, + { + "epoch": 2.1517325227963524, + "grad_norm": 0.0753176353347214, + "learning_rate": 4.415450439257142e-06, + "loss": 0.5717, + "step": 4428 + }, + { + "epoch": 2.1522188449848025, + "grad_norm": 0.06971112714686117, + "learning_rate": 4.4135489406847485e-06, + "loss": 0.505, + "step": 4429 + }, + { + "epoch": 2.152705167173252, + "grad_norm": 0.07338960859625769, + "learning_rate": 4.411647528108744e-06, + "loss": 0.5157, + "step": 4430 + }, + { + "epoch": 2.153191489361702, + "grad_norm": 0.07650207286913285, + "learning_rate": 4.409746201807947e-06, + "loss": 0.5286, + "step": 4431 + }, + { + "epoch": 2.153677811550152, + "grad_norm": 0.06991520097871859, + "learning_rate": 4.4078449620611674e-06, + "loss": 0.5185, + "step": 4432 + }, + { + "epoch": 2.1541641337386017, + "grad_norm": 0.07123275782083067, + "learning_rate": 4.4059438091472e-06, + "loss": 0.5328, + "step": 4433 + }, + { + "epoch": 2.154650455927052, + "grad_norm": 0.07502707815545193, + "learning_rate": 4.404042743344827e-06, + "loss": 0.5355, + "step": 4434 + }, + { + "epoch": 2.1551367781155015, + "grad_norm": 0.0717210084395386, + "learning_rate": 4.402141764932818e-06, + "loss": 0.5377, + "step": 4435 + }, + { + "epoch": 2.155623100303951, + "grad_norm": 0.0699130628287047, + "learning_rate": 4.40024087418993e-06, + "loss": 0.4834, + "step": 4436 + }, + { + "epoch": 2.1561094224924013, + "grad_norm": 0.07032858642481528, + "learning_rate": 4.398340071394906e-06, + "loss": 0.4989, + "step": 4437 + }, + { + "epoch": 2.156595744680851, + "grad_norm": 0.07335052042640616, + "learning_rate": 4.39643935682648e-06, + "loss": 0.5305, + "step": 4438 + }, + { + "epoch": 2.1570820668693007, + "grad_norm": 0.07612970735097073, + "learning_rate": 4.394538730763368e-06, + "loss": 0.5405, + "step": 4439 + }, + { + "epoch": 2.157568389057751, + "grad_norm": 0.07165852630690416, + "learning_rate": 4.392638193484274e-06, + "loss": 0.5501, + "step": 4440 + }, + { + "epoch": 2.1580547112462005, + "grad_norm": 0.07224930588412967, + "learning_rate": 4.390737745267893e-06, + "loss": 0.5234, + "step": 4441 + }, + { + "epoch": 2.1585410334346506, + "grad_norm": 0.07496450242407453, + "learning_rate": 4.388837386392903e-06, + "loss": 0.5161, + "step": 4442 + }, + { + "epoch": 2.1590273556231003, + "grad_norm": 0.07262170312440885, + "learning_rate": 4.38693711713797e-06, + "loss": 0.5339, + "step": 4443 + }, + { + "epoch": 2.15951367781155, + "grad_norm": 0.07470624822842568, + "learning_rate": 4.385036937781747e-06, + "loss": 0.5471, + "step": 4444 + }, + { + "epoch": 2.16, + "grad_norm": 0.0737252417065575, + "learning_rate": 4.383136848602874e-06, + "loss": 0.5548, + "step": 4445 + }, + { + "epoch": 2.16048632218845, + "grad_norm": 0.07493011443252882, + "learning_rate": 4.381236849879977e-06, + "loss": 0.5608, + "step": 4446 + }, + { + "epoch": 2.1609726443768995, + "grad_norm": 0.07173763277960096, + "learning_rate": 4.3793369418916705e-06, + "loss": 0.5073, + "step": 4447 + }, + { + "epoch": 2.1614589665653496, + "grad_norm": 0.0725532176251541, + "learning_rate": 4.3774371249165525e-06, + "loss": 0.5494, + "step": 4448 + }, + { + "epoch": 2.1619452887537993, + "grad_norm": 0.06817887560450904, + "learning_rate": 4.375537399233211e-06, + "loss": 0.5148, + "step": 4449 + }, + { + "epoch": 2.1624316109422494, + "grad_norm": 0.07149176284578583, + "learning_rate": 4.373637765120218e-06, + "loss": 0.538, + "step": 4450 + }, + { + "epoch": 2.162917933130699, + "grad_norm": 0.06933298478705206, + "learning_rate": 4.371738222856134e-06, + "loss": 0.4891, + "step": 4451 + }, + { + "epoch": 2.163404255319149, + "grad_norm": 0.07537965341211422, + "learning_rate": 4.369838772719505e-06, + "loss": 0.5396, + "step": 4452 + }, + { + "epoch": 2.163890577507599, + "grad_norm": 0.07489509692036718, + "learning_rate": 4.3679394149888646e-06, + "loss": 0.5409, + "step": 4453 + }, + { + "epoch": 2.1643768996960486, + "grad_norm": 0.07225314716427281, + "learning_rate": 4.366040149942731e-06, + "loss": 0.5256, + "step": 4454 + }, + { + "epoch": 2.1648632218844983, + "grad_norm": 0.06921989961719661, + "learning_rate": 4.36414097785961e-06, + "loss": 0.5053, + "step": 4455 + }, + { + "epoch": 2.1653495440729484, + "grad_norm": 0.06957765254772537, + "learning_rate": 4.362241899017995e-06, + "loss": 0.5076, + "step": 4456 + }, + { + "epoch": 2.165835866261398, + "grad_norm": 0.07317471277420289, + "learning_rate": 4.360342913696363e-06, + "loss": 0.5389, + "step": 4457 + }, + { + "epoch": 2.166322188449848, + "grad_norm": 0.07111617999675747, + "learning_rate": 4.358444022173177e-06, + "loss": 0.5108, + "step": 4458 + }, + { + "epoch": 2.166808510638298, + "grad_norm": 0.0705933684205117, + "learning_rate": 4.356545224726891e-06, + "loss": 0.5002, + "step": 4459 + }, + { + "epoch": 2.1672948328267476, + "grad_norm": 0.07283998229835192, + "learning_rate": 4.354646521635942e-06, + "loss": 0.5008, + "step": 4460 + }, + { + "epoch": 2.1677811550151977, + "grad_norm": 0.07281584095044312, + "learning_rate": 4.3527479131787505e-06, + "loss": 0.5423, + "step": 4461 + }, + { + "epoch": 2.1682674772036474, + "grad_norm": 0.07098384695177379, + "learning_rate": 4.35084939963373e-06, + "loss": 0.5157, + "step": 4462 + }, + { + "epoch": 2.168753799392097, + "grad_norm": 0.07229381706846445, + "learning_rate": 4.348950981279271e-06, + "loss": 0.5205, + "step": 4463 + }, + { + "epoch": 2.1692401215805472, + "grad_norm": 0.0727448953839507, + "learning_rate": 4.347052658393759e-06, + "loss": 0.5266, + "step": 4464 + }, + { + "epoch": 2.169726443768997, + "grad_norm": 0.0747763508564579, + "learning_rate": 4.345154431255559e-06, + "loss": 0.5466, + "step": 4465 + }, + { + "epoch": 2.1702127659574466, + "grad_norm": 0.06954406846660843, + "learning_rate": 4.343256300143026e-06, + "loss": 0.5201, + "step": 4466 + }, + { + "epoch": 2.1706990881458967, + "grad_norm": 0.0756097044811774, + "learning_rate": 4.341358265334498e-06, + "loss": 0.5266, + "step": 4467 + }, + { + "epoch": 2.1711854103343464, + "grad_norm": 0.0712351094386963, + "learning_rate": 4.339460327108301e-06, + "loss": 0.5066, + "step": 4468 + }, + { + "epoch": 2.1716717325227965, + "grad_norm": 0.07048703296532557, + "learning_rate": 4.337562485742747e-06, + "loss": 0.5175, + "step": 4469 + }, + { + "epoch": 2.1721580547112462, + "grad_norm": 0.07233271147362716, + "learning_rate": 4.335664741516132e-06, + "loss": 0.5179, + "step": 4470 + }, + { + "epoch": 2.172644376899696, + "grad_norm": 0.07672380531710715, + "learning_rate": 4.333767094706738e-06, + "loss": 0.5466, + "step": 4471 + }, + { + "epoch": 2.173130699088146, + "grad_norm": 0.07852345841998083, + "learning_rate": 4.331869545592834e-06, + "loss": 0.5534, + "step": 4472 + }, + { + "epoch": 2.1736170212765957, + "grad_norm": 0.0722184845292258, + "learning_rate": 4.3299720944526746e-06, + "loss": 0.5549, + "step": 4473 + }, + { + "epoch": 2.1741033434650454, + "grad_norm": 0.07130650521018668, + "learning_rate": 4.328074741564498e-06, + "loss": 0.51, + "step": 4474 + }, + { + "epoch": 2.1745896656534955, + "grad_norm": 0.07387767633058749, + "learning_rate": 4.326177487206531e-06, + "loss": 0.5224, + "step": 4475 + }, + { + "epoch": 2.1750759878419452, + "grad_norm": 0.07595354066387881, + "learning_rate": 4.324280331656982e-06, + "loss": 0.5559, + "step": 4476 + }, + { + "epoch": 2.1755623100303954, + "grad_norm": 0.0732706271764856, + "learning_rate": 4.322383275194051e-06, + "loss": 0.5161, + "step": 4477 + }, + { + "epoch": 2.176048632218845, + "grad_norm": 0.07536805327615512, + "learning_rate": 4.320486318095917e-06, + "loss": 0.5464, + "step": 4478 + }, + { + "epoch": 2.1765349544072947, + "grad_norm": 0.06948896974377608, + "learning_rate": 4.318589460640748e-06, + "loss": 0.4822, + "step": 4479 + }, + { + "epoch": 2.177021276595745, + "grad_norm": 0.0736711355302051, + "learning_rate": 4.316692703106698e-06, + "loss": 0.5455, + "step": 4480 + }, + { + "epoch": 2.1775075987841945, + "grad_norm": 0.07227677761420534, + "learning_rate": 4.3147960457719025e-06, + "loss": 0.5263, + "step": 4481 + }, + { + "epoch": 2.177993920972644, + "grad_norm": 0.0676000024639692, + "learning_rate": 4.312899488914486e-06, + "loss": 0.4812, + "step": 4482 + }, + { + "epoch": 2.1784802431610943, + "grad_norm": 0.07171891389812027, + "learning_rate": 4.311003032812558e-06, + "loss": 0.5489, + "step": 4483 + }, + { + "epoch": 2.178966565349544, + "grad_norm": 0.07074825308105669, + "learning_rate": 4.3091066777442094e-06, + "loss": 0.5247, + "step": 4484 + }, + { + "epoch": 2.1794528875379937, + "grad_norm": 0.07216165209108397, + "learning_rate": 4.307210423987522e-06, + "loss": 0.5038, + "step": 4485 + }, + { + "epoch": 2.179939209726444, + "grad_norm": 0.07405024135841418, + "learning_rate": 4.30531427182056e-06, + "loss": 0.5374, + "step": 4486 + }, + { + "epoch": 2.1804255319148935, + "grad_norm": 0.07175393728509953, + "learning_rate": 4.303418221521369e-06, + "loss": 0.5396, + "step": 4487 + }, + { + "epoch": 2.1809118541033437, + "grad_norm": 0.0746652818866144, + "learning_rate": 4.301522273367986e-06, + "loss": 0.558, + "step": 4488 + }, + { + "epoch": 2.1813981762917933, + "grad_norm": 0.0730468957005626, + "learning_rate": 4.2996264276384305e-06, + "loss": 0.5148, + "step": 4489 + }, + { + "epoch": 2.181884498480243, + "grad_norm": 0.07543224255611654, + "learning_rate": 4.297730684610706e-06, + "loss": 0.532, + "step": 4490 + }, + { + "epoch": 2.182370820668693, + "grad_norm": 0.06991979796767724, + "learning_rate": 4.295835044562802e-06, + "loss": 0.5055, + "step": 4491 + }, + { + "epoch": 2.182857142857143, + "grad_norm": 0.07445756278160635, + "learning_rate": 4.293939507772692e-06, + "loss": 0.5532, + "step": 4492 + }, + { + "epoch": 2.1833434650455925, + "grad_norm": 0.07412206940131677, + "learning_rate": 4.292044074518335e-06, + "loss": 0.5067, + "step": 4493 + }, + { + "epoch": 2.1838297872340426, + "grad_norm": 0.07235603166218797, + "learning_rate": 4.290148745077675e-06, + "loss": 0.524, + "step": 4494 + }, + { + "epoch": 2.1843161094224923, + "grad_norm": 0.07092968594100339, + "learning_rate": 4.28825351972864e-06, + "loss": 0.5041, + "step": 4495 + }, + { + "epoch": 2.1848024316109425, + "grad_norm": 0.0749341346831391, + "learning_rate": 4.286358398749146e-06, + "loss": 0.5407, + "step": 4496 + }, + { + "epoch": 2.185288753799392, + "grad_norm": 0.07214373683180679, + "learning_rate": 4.284463382417088e-06, + "loss": 0.5546, + "step": 4497 + }, + { + "epoch": 2.185775075987842, + "grad_norm": 0.07444150224827252, + "learning_rate": 4.282568471010349e-06, + "loss": 0.5702, + "step": 4498 + }, + { + "epoch": 2.186261398176292, + "grad_norm": 0.07066189503164857, + "learning_rate": 4.280673664806798e-06, + "loss": 0.4866, + "step": 4499 + }, + { + "epoch": 2.1867477203647416, + "grad_norm": 0.07028448033069667, + "learning_rate": 4.278778964084284e-06, + "loss": 0.5137, + "step": 4500 + }, + { + "epoch": 2.1872340425531913, + "grad_norm": 0.06927337705382537, + "learning_rate": 4.276884369120647e-06, + "loss": 0.4922, + "step": 4501 + }, + { + "epoch": 2.1877203647416414, + "grad_norm": 0.07389348298428064, + "learning_rate": 4.274989880193705e-06, + "loss": 0.5232, + "step": 4502 + }, + { + "epoch": 2.188206686930091, + "grad_norm": 0.07310086878402257, + "learning_rate": 4.273095497581263e-06, + "loss": 0.519, + "step": 4503 + }, + { + "epoch": 2.1886930091185413, + "grad_norm": 0.07009569188352663, + "learning_rate": 4.271201221561112e-06, + "loss": 0.5057, + "step": 4504 + }, + { + "epoch": 2.189179331306991, + "grad_norm": 0.07508730736469998, + "learning_rate": 4.269307052411026e-06, + "loss": 0.5522, + "step": 4505 + }, + { + "epoch": 2.1896656534954406, + "grad_norm": 0.0725965306677151, + "learning_rate": 4.267412990408764e-06, + "loss": 0.526, + "step": 4506 + }, + { + "epoch": 2.1901519756838908, + "grad_norm": 0.06951893872984645, + "learning_rate": 4.2655190358320665e-06, + "loss": 0.5245, + "step": 4507 + }, + { + "epoch": 2.1906382978723404, + "grad_norm": 0.0700318474514735, + "learning_rate": 4.263625188958662e-06, + "loss": 0.538, + "step": 4508 + }, + { + "epoch": 2.19112462006079, + "grad_norm": 0.0712692185200809, + "learning_rate": 4.261731450066262e-06, + "loss": 0.5149, + "step": 4509 + }, + { + "epoch": 2.1916109422492402, + "grad_norm": 0.07205495500666896, + "learning_rate": 4.259837819432562e-06, + "loss": 0.5125, + "step": 4510 + }, + { + "epoch": 2.19209726443769, + "grad_norm": 0.07417334975079336, + "learning_rate": 4.25794429733524e-06, + "loss": 0.5368, + "step": 4511 + }, + { + "epoch": 2.1925835866261396, + "grad_norm": 0.07284175709859952, + "learning_rate": 4.2560508840519595e-06, + "loss": 0.5086, + "step": 4512 + }, + { + "epoch": 2.1930699088145897, + "grad_norm": 0.07272175185268577, + "learning_rate": 4.254157579860367e-06, + "loss": 0.5743, + "step": 4513 + }, + { + "epoch": 2.1935562310030394, + "grad_norm": 0.07210804493097787, + "learning_rate": 4.2522643850380985e-06, + "loss": 0.5117, + "step": 4514 + }, + { + "epoch": 2.1940425531914896, + "grad_norm": 0.07198219782177015, + "learning_rate": 4.250371299862768e-06, + "loss": 0.5131, + "step": 4515 + }, + { + "epoch": 2.1945288753799392, + "grad_norm": 0.07306219573299305, + "learning_rate": 4.248478324611972e-06, + "loss": 0.5521, + "step": 4516 + }, + { + "epoch": 2.195015197568389, + "grad_norm": 0.07190504043703015, + "learning_rate": 4.2465854595632956e-06, + "loss": 0.5295, + "step": 4517 + }, + { + "epoch": 2.195501519756839, + "grad_norm": 0.06900639720632466, + "learning_rate": 4.244692704994306e-06, + "loss": 0.5215, + "step": 4518 + }, + { + "epoch": 2.1959878419452887, + "grad_norm": 0.07320857810557652, + "learning_rate": 4.242800061182555e-06, + "loss": 0.5362, + "step": 4519 + }, + { + "epoch": 2.1964741641337384, + "grad_norm": 0.07590987873967169, + "learning_rate": 4.240907528405574e-06, + "loss": 0.5649, + "step": 4520 + }, + { + "epoch": 2.1969604863221885, + "grad_norm": 0.07043953599485209, + "learning_rate": 4.239015106940887e-06, + "loss": 0.5162, + "step": 4521 + }, + { + "epoch": 2.1974468085106382, + "grad_norm": 0.07603876241019514, + "learning_rate": 4.2371227970659916e-06, + "loss": 0.5217, + "step": 4522 + }, + { + "epoch": 2.197933130699088, + "grad_norm": 0.07226567672526107, + "learning_rate": 4.235230599058374e-06, + "loss": 0.5135, + "step": 4523 + }, + { + "epoch": 2.198419452887538, + "grad_norm": 0.07378502313590074, + "learning_rate": 4.233338513195505e-06, + "loss": 0.5559, + "step": 4524 + }, + { + "epoch": 2.1989057750759877, + "grad_norm": 0.07001480243661323, + "learning_rate": 4.2314465397548395e-06, + "loss": 0.5454, + "step": 4525 + }, + { + "epoch": 2.199392097264438, + "grad_norm": 0.07261870597075992, + "learning_rate": 4.229554679013809e-06, + "loss": 0.5103, + "step": 4526 + }, + { + "epoch": 2.1998784194528875, + "grad_norm": 0.07381774804999802, + "learning_rate": 4.227662931249837e-06, + "loss": 0.5452, + "step": 4527 + }, + { + "epoch": 2.200364741641337, + "grad_norm": 0.07242801873466624, + "learning_rate": 4.225771296740325e-06, + "loss": 0.5081, + "step": 4528 + }, + { + "epoch": 2.2008510638297873, + "grad_norm": 0.07200057722862975, + "learning_rate": 4.2238797757626595e-06, + "loss": 0.531, + "step": 4529 + }, + { + "epoch": 2.201337386018237, + "grad_norm": 0.07131311868598234, + "learning_rate": 4.221988368594213e-06, + "loss": 0.5305, + "step": 4530 + }, + { + "epoch": 2.201823708206687, + "grad_norm": 0.07014780105050691, + "learning_rate": 4.220097075512335e-06, + "loss": 0.525, + "step": 4531 + }, + { + "epoch": 2.202310030395137, + "grad_norm": 0.07362033174690961, + "learning_rate": 4.218205896794366e-06, + "loss": 0.5449, + "step": 4532 + }, + { + "epoch": 2.2027963525835865, + "grad_norm": 0.077389770039334, + "learning_rate": 4.216314832717625e-06, + "loss": 0.5447, + "step": 4533 + }, + { + "epoch": 2.2032826747720367, + "grad_norm": 0.07485067164681249, + "learning_rate": 4.214423883559414e-06, + "loss": 0.5689, + "step": 4534 + }, + { + "epoch": 2.2037689969604863, + "grad_norm": 0.06938596053126789, + "learning_rate": 4.21253304959702e-06, + "loss": 0.5205, + "step": 4535 + }, + { + "epoch": 2.204255319148936, + "grad_norm": 0.0725739562019719, + "learning_rate": 4.210642331107711e-06, + "loss": 0.5353, + "step": 4536 + }, + { + "epoch": 2.204741641337386, + "grad_norm": 0.07115611396586252, + "learning_rate": 4.208751728368741e-06, + "loss": 0.4929, + "step": 4537 + }, + { + "epoch": 2.205227963525836, + "grad_norm": 0.0735740345041347, + "learning_rate": 4.206861241657345e-06, + "loss": 0.5124, + "step": 4538 + }, + { + "epoch": 2.2057142857142855, + "grad_norm": 0.07352300880891982, + "learning_rate": 4.204970871250741e-06, + "loss": 0.5252, + "step": 4539 + }, + { + "epoch": 2.2062006079027356, + "grad_norm": 0.07403477627573103, + "learning_rate": 4.203080617426131e-06, + "loss": 0.5639, + "step": 4540 + }, + { + "epoch": 2.2066869300911853, + "grad_norm": 0.07080002310921088, + "learning_rate": 4.201190480460699e-06, + "loss": 0.5282, + "step": 4541 + }, + { + "epoch": 2.2071732522796355, + "grad_norm": 0.07274715488876342, + "learning_rate": 4.1993004606316114e-06, + "loss": 0.5215, + "step": 4542 + }, + { + "epoch": 2.207659574468085, + "grad_norm": 0.07349389107660528, + "learning_rate": 4.197410558216018e-06, + "loss": 0.5517, + "step": 4543 + }, + { + "epoch": 2.208145896656535, + "grad_norm": 0.07512923526697164, + "learning_rate": 4.1955207734910536e-06, + "loss": 0.532, + "step": 4544 + }, + { + "epoch": 2.208632218844985, + "grad_norm": 0.07047559321781026, + "learning_rate": 4.193631106733831e-06, + "loss": 0.4959, + "step": 4545 + }, + { + "epoch": 2.2091185410334346, + "grad_norm": 0.0716291272792849, + "learning_rate": 4.191741558221451e-06, + "loss": 0.5242, + "step": 4546 + }, + { + "epoch": 2.2096048632218843, + "grad_norm": 0.0749361291341572, + "learning_rate": 4.189852128230992e-06, + "loss": 0.5773, + "step": 4547 + }, + { + "epoch": 2.2100911854103344, + "grad_norm": 0.07238070330465997, + "learning_rate": 4.187962817039519e-06, + "loss": 0.5127, + "step": 4548 + }, + { + "epoch": 2.210577507598784, + "grad_norm": 0.07097999202201388, + "learning_rate": 4.186073624924077e-06, + "loss": 0.56, + "step": 4549 + }, + { + "epoch": 2.211063829787234, + "grad_norm": 0.07368278475409375, + "learning_rate": 4.184184552161696e-06, + "loss": 0.5271, + "step": 4550 + }, + { + "epoch": 2.211550151975684, + "grad_norm": 0.07285182586110941, + "learning_rate": 4.182295599029386e-06, + "loss": 0.545, + "step": 4551 + }, + { + "epoch": 2.2120364741641336, + "grad_norm": 0.07230305585935967, + "learning_rate": 4.180406765804141e-06, + "loss": 0.5506, + "step": 4552 + }, + { + "epoch": 2.2125227963525838, + "grad_norm": 0.07287189587695644, + "learning_rate": 4.178518052762935e-06, + "loss": 0.5409, + "step": 4553 + }, + { + "epoch": 2.2130091185410334, + "grad_norm": 0.07359170055027614, + "learning_rate": 4.176629460182731e-06, + "loss": 0.5092, + "step": 4554 + }, + { + "epoch": 2.213495440729483, + "grad_norm": 0.0786894117608461, + "learning_rate": 4.174740988340465e-06, + "loss": 0.522, + "step": 4555 + }, + { + "epoch": 2.2139817629179332, + "grad_norm": 0.07303618465465561, + "learning_rate": 4.172852637513062e-06, + "loss": 0.5318, + "step": 4556 + }, + { + "epoch": 2.214468085106383, + "grad_norm": 0.0712247643033999, + "learning_rate": 4.170964407977426e-06, + "loss": 0.5217, + "step": 4557 + }, + { + "epoch": 2.214954407294833, + "grad_norm": 0.06723044216202748, + "learning_rate": 4.169076300010446e-06, + "loss": 0.4926, + "step": 4558 + }, + { + "epoch": 2.2154407294832827, + "grad_norm": 0.0711516408446843, + "learning_rate": 4.167188313888991e-06, + "loss": 0.4711, + "step": 4559 + }, + { + "epoch": 2.2159270516717324, + "grad_norm": 0.07169142713748168, + "learning_rate": 4.165300449889912e-06, + "loss": 0.5289, + "step": 4560 + }, + { + "epoch": 2.2164133738601826, + "grad_norm": 0.07209325629063798, + "learning_rate": 4.163412708290043e-06, + "loss": 0.5193, + "step": 4561 + }, + { + "epoch": 2.2168996960486322, + "grad_norm": 0.07691351297728855, + "learning_rate": 4.161525089366201e-06, + "loss": 0.5346, + "step": 4562 + }, + { + "epoch": 2.217386018237082, + "grad_norm": 0.07129482418946885, + "learning_rate": 4.1596375933951835e-06, + "loss": 0.5065, + "step": 4563 + }, + { + "epoch": 2.217872340425532, + "grad_norm": 0.07133699367081238, + "learning_rate": 4.15775022065377e-06, + "loss": 0.5284, + "step": 4564 + }, + { + "epoch": 2.2183586626139817, + "grad_norm": 0.07915723191085236, + "learning_rate": 4.155862971418721e-06, + "loss": 0.5387, + "step": 4565 + }, + { + "epoch": 2.2188449848024314, + "grad_norm": 0.07392118929781695, + "learning_rate": 4.153975845966783e-06, + "loss": 0.5827, + "step": 4566 + }, + { + "epoch": 2.2193313069908815, + "grad_norm": 0.07292366089529126, + "learning_rate": 4.1520888445746765e-06, + "loss": 0.5229, + "step": 4567 + }, + { + "epoch": 2.2198176291793312, + "grad_norm": 0.07253272769929474, + "learning_rate": 4.150201967519115e-06, + "loss": 0.533, + "step": 4568 + }, + { + "epoch": 2.2203039513677814, + "grad_norm": 0.07275933252685185, + "learning_rate": 4.148315215076786e-06, + "loss": 0.5241, + "step": 4569 + }, + { + "epoch": 2.220790273556231, + "grad_norm": 0.07183404921358959, + "learning_rate": 4.146428587524358e-06, + "loss": 0.5422, + "step": 4570 + }, + { + "epoch": 2.2212765957446807, + "grad_norm": 0.07065214255551733, + "learning_rate": 4.144542085138484e-06, + "loss": 0.5158, + "step": 4571 + }, + { + "epoch": 2.221762917933131, + "grad_norm": 0.07374647611575025, + "learning_rate": 4.142655708195799e-06, + "loss": 0.5002, + "step": 4572 + }, + { + "epoch": 2.2222492401215805, + "grad_norm": 0.07055733762349352, + "learning_rate": 4.140769456972919e-06, + "loss": 0.5021, + "step": 4573 + }, + { + "epoch": 2.22273556231003, + "grad_norm": 0.07314155481825418, + "learning_rate": 4.138883331746442e-06, + "loss": 0.5453, + "step": 4574 + }, + { + "epoch": 2.2232218844984803, + "grad_norm": 0.06835525333484847, + "learning_rate": 4.136997332792944e-06, + "loss": 0.4826, + "step": 4575 + }, + { + "epoch": 2.22370820668693, + "grad_norm": 0.07444367651981872, + "learning_rate": 4.135111460388989e-06, + "loss": 0.5235, + "step": 4576 + }, + { + "epoch": 2.2241945288753797, + "grad_norm": 0.07637947583636884, + "learning_rate": 4.133225714811115e-06, + "loss": 0.534, + "step": 4577 + }, + { + "epoch": 2.22468085106383, + "grad_norm": 0.07035935307584323, + "learning_rate": 4.131340096335849e-06, + "loss": 0.5161, + "step": 4578 + }, + { + "epoch": 2.2251671732522795, + "grad_norm": 0.06844843561417313, + "learning_rate": 4.129454605239692e-06, + "loss": 0.4676, + "step": 4579 + }, + { + "epoch": 2.2256534954407297, + "grad_norm": 0.07005001194486436, + "learning_rate": 4.127569241799132e-06, + "loss": 0.4827, + "step": 4580 + }, + { + "epoch": 2.2261398176291793, + "grad_norm": 0.07256398001827741, + "learning_rate": 4.125684006290636e-06, + "loss": 0.5268, + "step": 4581 + }, + { + "epoch": 2.226626139817629, + "grad_norm": 0.07429866901508127, + "learning_rate": 4.123798898990651e-06, + "loss": 0.5419, + "step": 4582 + }, + { + "epoch": 2.227112462006079, + "grad_norm": 0.07200029499266601, + "learning_rate": 4.121913920175608e-06, + "loss": 0.5304, + "step": 4583 + }, + { + "epoch": 2.227598784194529, + "grad_norm": 0.07255472672805988, + "learning_rate": 4.120029070121917e-06, + "loss": 0.5179, + "step": 4584 + }, + { + "epoch": 2.2280851063829785, + "grad_norm": 0.0732353870750939, + "learning_rate": 4.118144349105969e-06, + "loss": 0.5733, + "step": 4585 + }, + { + "epoch": 2.2285714285714286, + "grad_norm": 0.07421249353976915, + "learning_rate": 4.116259757404139e-06, + "loss": 0.5467, + "step": 4586 + }, + { + "epoch": 2.2290577507598783, + "grad_norm": 0.07077620611601619, + "learning_rate": 4.114375295292781e-06, + "loss": 0.5197, + "step": 4587 + }, + { + "epoch": 2.2295440729483285, + "grad_norm": 0.07129469226114649, + "learning_rate": 4.112490963048228e-06, + "loss": 0.5269, + "step": 4588 + }, + { + "epoch": 2.230030395136778, + "grad_norm": 0.07378638969970018, + "learning_rate": 4.110606760946797e-06, + "loss": 0.5615, + "step": 4589 + }, + { + "epoch": 2.230516717325228, + "grad_norm": 0.07015004010154713, + "learning_rate": 4.108722689264786e-06, + "loss": 0.5233, + "step": 4590 + }, + { + "epoch": 2.231003039513678, + "grad_norm": 0.07102246082714694, + "learning_rate": 4.10683874827847e-06, + "loss": 0.5367, + "step": 4591 + }, + { + "epoch": 2.2314893617021276, + "grad_norm": 0.06864479112331835, + "learning_rate": 4.104954938264109e-06, + "loss": 0.5003, + "step": 4592 + }, + { + "epoch": 2.2319756838905773, + "grad_norm": 0.0723011955543065, + "learning_rate": 4.103071259497945e-06, + "loss": 0.5284, + "step": 4593 + }, + { + "epoch": 2.2324620060790275, + "grad_norm": 0.07224698434066429, + "learning_rate": 4.101187712256193e-06, + "loss": 0.5304, + "step": 4594 + }, + { + "epoch": 2.232948328267477, + "grad_norm": 0.07046994643372083, + "learning_rate": 4.099304296815058e-06, + "loss": 0.5196, + "step": 4595 + }, + { + "epoch": 2.2334346504559273, + "grad_norm": 0.07476715069820349, + "learning_rate": 4.097421013450718e-06, + "loss": 0.5415, + "step": 4596 + }, + { + "epoch": 2.233920972644377, + "grad_norm": 0.07031622644758091, + "learning_rate": 4.095537862439338e-06, + "loss": 0.5028, + "step": 4597 + }, + { + "epoch": 2.2344072948328266, + "grad_norm": 0.07243301251094518, + "learning_rate": 4.093654844057059e-06, + "loss": 0.5382, + "step": 4598 + }, + { + "epoch": 2.2348936170212768, + "grad_norm": 0.07357768193996367, + "learning_rate": 4.091771958580005e-06, + "loss": 0.5525, + "step": 4599 + }, + { + "epoch": 2.2353799392097264, + "grad_norm": 0.06907282011657624, + "learning_rate": 4.089889206284279e-06, + "loss": 0.4794, + "step": 4600 + }, + { + "epoch": 2.235866261398176, + "grad_norm": 0.0713803728663108, + "learning_rate": 4.088006587445967e-06, + "loss": 0.5079, + "step": 4601 + }, + { + "epoch": 2.2363525835866263, + "grad_norm": 0.07066356129219667, + "learning_rate": 4.086124102341133e-06, + "loss": 0.5417, + "step": 4602 + }, + { + "epoch": 2.236838905775076, + "grad_norm": 0.07160730071846899, + "learning_rate": 4.0842417512458184e-06, + "loss": 0.5287, + "step": 4603 + }, + { + "epoch": 2.2373252279635256, + "grad_norm": 0.07067886150922982, + "learning_rate": 4.082359534436055e-06, + "loss": 0.5363, + "step": 4604 + }, + { + "epoch": 2.2378115501519757, + "grad_norm": 0.07179142221792519, + "learning_rate": 4.080477452187845e-06, + "loss": 0.5161, + "step": 4605 + }, + { + "epoch": 2.2382978723404254, + "grad_norm": 0.0743988081950859, + "learning_rate": 4.078595504777174e-06, + "loss": 0.5364, + "step": 4606 + }, + { + "epoch": 2.2387841945288756, + "grad_norm": 0.07147876117880697, + "learning_rate": 4.07671369248001e-06, + "loss": 0.4912, + "step": 4607 + }, + { + "epoch": 2.2392705167173252, + "grad_norm": 0.07683126158721, + "learning_rate": 4.074832015572299e-06, + "loss": 0.5517, + "step": 4608 + }, + { + "epoch": 2.239756838905775, + "grad_norm": 0.07643915789894339, + "learning_rate": 4.072950474329965e-06, + "loss": 0.5218, + "step": 4609 + }, + { + "epoch": 2.240243161094225, + "grad_norm": 0.07204330812356448, + "learning_rate": 4.071069069028918e-06, + "loss": 0.5159, + "step": 4610 + }, + { + "epoch": 2.2407294832826747, + "grad_norm": 0.07480233157113961, + "learning_rate": 4.0691877999450425e-06, + "loss": 0.534, + "step": 4611 + }, + { + "epoch": 2.2412158054711244, + "grad_norm": 0.06969247871258243, + "learning_rate": 4.067306667354206e-06, + "loss": 0.4986, + "step": 4612 + }, + { + "epoch": 2.2417021276595746, + "grad_norm": 0.0729209483183523, + "learning_rate": 4.065425671532256e-06, + "loss": 0.5654, + "step": 4613 + }, + { + "epoch": 2.2421884498480242, + "grad_norm": 0.07287429914765206, + "learning_rate": 4.063544812755018e-06, + "loss": 0.5296, + "step": 4614 + }, + { + "epoch": 2.2426747720364744, + "grad_norm": 0.06970692610968177, + "learning_rate": 4.061664091298299e-06, + "loss": 0.47, + "step": 4615 + }, + { + "epoch": 2.243161094224924, + "grad_norm": 0.07151609676590803, + "learning_rate": 4.059783507437886e-06, + "loss": 0.5249, + "step": 4616 + }, + { + "epoch": 2.2436474164133737, + "grad_norm": 0.06959640989329831, + "learning_rate": 4.057903061449545e-06, + "loss": 0.5031, + "step": 4617 + }, + { + "epoch": 2.244133738601824, + "grad_norm": 0.07062961902988361, + "learning_rate": 4.0560227536090206e-06, + "loss": 0.5109, + "step": 4618 + }, + { + "epoch": 2.2446200607902735, + "grad_norm": 0.0763826527208737, + "learning_rate": 4.05414258419204e-06, + "loss": 0.5526, + "step": 4619 + }, + { + "epoch": 2.2451063829787232, + "grad_norm": 0.07449247268750961, + "learning_rate": 4.05226255347431e-06, + "loss": 0.5297, + "step": 4620 + }, + { + "epoch": 2.2455927051671734, + "grad_norm": 0.07481854868251386, + "learning_rate": 4.050382661731513e-06, + "loss": 0.5562, + "step": 4621 + }, + { + "epoch": 2.246079027355623, + "grad_norm": 0.07600959564524962, + "learning_rate": 4.048502909239314e-06, + "loss": 0.5612, + "step": 4622 + }, + { + "epoch": 2.246565349544073, + "grad_norm": 0.07154286450044224, + "learning_rate": 4.046623296273359e-06, + "loss": 0.5441, + "step": 4623 + }, + { + "epoch": 2.247051671732523, + "grad_norm": 0.0732323366918422, + "learning_rate": 4.044743823109272e-06, + "loss": 0.5358, + "step": 4624 + }, + { + "epoch": 2.2475379939209725, + "grad_norm": 0.07030235463227916, + "learning_rate": 4.042864490022656e-06, + "loss": 0.5035, + "step": 4625 + }, + { + "epoch": 2.2480243161094227, + "grad_norm": 0.07219052717636693, + "learning_rate": 4.040985297289093e-06, + "loss": 0.5231, + "step": 4626 + }, + { + "epoch": 2.2480243161094227, + "eval_loss": 0.5724824666976929, + "eval_runtime": 105.2478, + "eval_samples_per_second": 288.396, + "eval_steps_per_second": 36.058, + "step": 4626 + }, + { + "epoch": 2.2485106382978723, + "grad_norm": 0.06894684561606178, + "learning_rate": 4.0391062451841455e-06, + "loss": 0.5131, + "step": 4627 + }, + { + "epoch": 2.248996960486322, + "grad_norm": 0.0726377000345425, + "learning_rate": 4.037227333983356e-06, + "loss": 0.5379, + "step": 4628 + }, + { + "epoch": 2.249483282674772, + "grad_norm": 0.07495720921777778, + "learning_rate": 4.035348563962245e-06, + "loss": 0.5513, + "step": 4629 + }, + { + "epoch": 2.249969604863222, + "grad_norm": 0.07434418403216476, + "learning_rate": 4.033469935396313e-06, + "loss": 0.5665, + "step": 4630 + }, + { + "epoch": 2.2504559270516715, + "grad_norm": 0.0721622807479631, + "learning_rate": 4.031591448561038e-06, + "loss": 0.5036, + "step": 4631 + }, + { + "epoch": 2.2509422492401217, + "grad_norm": 0.07101519806430857, + "learning_rate": 4.0297131037318826e-06, + "loss": 0.5292, + "step": 4632 + }, + { + "epoch": 2.2514285714285713, + "grad_norm": 0.07130386691688971, + "learning_rate": 4.0278349011842806e-06, + "loss": 0.5513, + "step": 4633 + }, + { + "epoch": 2.2519148936170215, + "grad_norm": 0.07154600826096935, + "learning_rate": 4.025956841193651e-06, + "loss": 0.546, + "step": 4634 + }, + { + "epoch": 2.252401215805471, + "grad_norm": 0.06985275787801024, + "learning_rate": 4.0240789240353885e-06, + "loss": 0.5144, + "step": 4635 + }, + { + "epoch": 2.252887537993921, + "grad_norm": 0.06890198681421501, + "learning_rate": 4.022201149984871e-06, + "loss": 0.4974, + "step": 4636 + }, + { + "epoch": 2.253373860182371, + "grad_norm": 0.07104841733003331, + "learning_rate": 4.02032351931745e-06, + "loss": 0.5111, + "step": 4637 + }, + { + "epoch": 2.2538601823708206, + "grad_norm": 0.07476571695331928, + "learning_rate": 4.01844603230846e-06, + "loss": 0.5467, + "step": 4638 + }, + { + "epoch": 2.2543465045592703, + "grad_norm": 0.07383025791394142, + "learning_rate": 4.016568689233214e-06, + "loss": 0.5372, + "step": 4639 + }, + { + "epoch": 2.2548328267477205, + "grad_norm": 0.0714222417481429, + "learning_rate": 4.014691490367e-06, + "loss": 0.5211, + "step": 4640 + }, + { + "epoch": 2.25531914893617, + "grad_norm": 0.07231688856038394, + "learning_rate": 4.012814435985092e-06, + "loss": 0.4955, + "step": 4641 + }, + { + "epoch": 2.25580547112462, + "grad_norm": 0.07432097278521449, + "learning_rate": 4.010937526362737e-06, + "loss": 0.5577, + "step": 4642 + }, + { + "epoch": 2.25629179331307, + "grad_norm": 0.07361659734610657, + "learning_rate": 4.009060761775161e-06, + "loss": 0.5448, + "step": 4643 + }, + { + "epoch": 2.2567781155015196, + "grad_norm": 0.07263906957251715, + "learning_rate": 4.007184142497572e-06, + "loss": 0.5581, + "step": 4644 + }, + { + "epoch": 2.2572644376899698, + "grad_norm": 0.07150350702796775, + "learning_rate": 4.005307668805154e-06, + "loss": 0.5385, + "step": 4645 + }, + { + "epoch": 2.2577507598784194, + "grad_norm": 0.07248082133327993, + "learning_rate": 4.0034313409730726e-06, + "loss": 0.5385, + "step": 4646 + }, + { + "epoch": 2.258237082066869, + "grad_norm": 0.07637048032760417, + "learning_rate": 4.001555159276467e-06, + "loss": 0.58, + "step": 4647 + }, + { + "epoch": 2.2587234042553193, + "grad_norm": 0.0709570721204946, + "learning_rate": 3.999679123990458e-06, + "loss": 0.5205, + "step": 4648 + }, + { + "epoch": 2.259209726443769, + "grad_norm": 0.07119445196361225, + "learning_rate": 3.997803235390148e-06, + "loss": 0.5056, + "step": 4649 + }, + { + "epoch": 2.259696048632219, + "grad_norm": 0.07226177551198266, + "learning_rate": 3.9959274937506125e-06, + "loss": 0.5164, + "step": 4650 + }, + { + "epoch": 2.2601823708206688, + "grad_norm": 0.07259049106799932, + "learning_rate": 3.994051899346907e-06, + "loss": 0.505, + "step": 4651 + }, + { + "epoch": 2.2606686930091184, + "grad_norm": 0.07165842963067946, + "learning_rate": 3.9921764524540675e-06, + "loss": 0.5155, + "step": 4652 + }, + { + "epoch": 2.2611550151975686, + "grad_norm": 0.07275979455306059, + "learning_rate": 3.990301153347107e-06, + "loss": 0.5387, + "step": 4653 + }, + { + "epoch": 2.2616413373860182, + "grad_norm": 0.07375990794048601, + "learning_rate": 3.988426002301016e-06, + "loss": 0.5488, + "step": 4654 + }, + { + "epoch": 2.262127659574468, + "grad_norm": 0.07060908895057215, + "learning_rate": 3.9865509995907656e-06, + "loss": 0.5137, + "step": 4655 + }, + { + "epoch": 2.262613981762918, + "grad_norm": 0.07263896287183237, + "learning_rate": 3.984676145491302e-06, + "loss": 0.5424, + "step": 4656 + }, + { + "epoch": 2.2631003039513677, + "grad_norm": 0.0704289138414631, + "learning_rate": 3.982801440277552e-06, + "loss": 0.5284, + "step": 4657 + }, + { + "epoch": 2.2635866261398174, + "grad_norm": 0.07567836995960589, + "learning_rate": 3.980926884224417e-06, + "loss": 0.5314, + "step": 4658 + }, + { + "epoch": 2.2640729483282676, + "grad_norm": 0.07207726148750707, + "learning_rate": 3.979052477606785e-06, + "loss": 0.5102, + "step": 4659 + }, + { + "epoch": 2.2645592705167172, + "grad_norm": 0.06963018684727802, + "learning_rate": 3.977178220699514e-06, + "loss": 0.5642, + "step": 4660 + }, + { + "epoch": 2.2650455927051674, + "grad_norm": 0.06972161012251381, + "learning_rate": 3.9753041137774414e-06, + "loss": 0.4987, + "step": 4661 + }, + { + "epoch": 2.265531914893617, + "grad_norm": 0.07375655082562882, + "learning_rate": 3.9734301571153845e-06, + "loss": 0.5676, + "step": 4662 + }, + { + "epoch": 2.2660182370820667, + "grad_norm": 0.07115099880046266, + "learning_rate": 3.971556350988137e-06, + "loss": 0.5307, + "step": 4663 + }, + { + "epoch": 2.266504559270517, + "grad_norm": 0.07041156696485094, + "learning_rate": 3.969682695670472e-06, + "loss": 0.493, + "step": 4664 + }, + { + "epoch": 2.2669908814589665, + "grad_norm": 0.07670636771867809, + "learning_rate": 3.96780919143714e-06, + "loss": 0.5489, + "step": 4665 + }, + { + "epoch": 2.2674772036474162, + "grad_norm": 0.07414283445717815, + "learning_rate": 3.965935838562868e-06, + "loss": 0.5069, + "step": 4666 + }, + { + "epoch": 2.2679635258358664, + "grad_norm": 0.07353006904596122, + "learning_rate": 3.9640626373223636e-06, + "loss": 0.524, + "step": 4667 + }, + { + "epoch": 2.268449848024316, + "grad_norm": 0.07001901504172404, + "learning_rate": 3.96218958799031e-06, + "loss": 0.4888, + "step": 4668 + }, + { + "epoch": 2.2689361702127657, + "grad_norm": 0.07387519149202325, + "learning_rate": 3.9603166908413665e-06, + "loss": 0.5397, + "step": 4669 + }, + { + "epoch": 2.269422492401216, + "grad_norm": 0.07462909382918813, + "learning_rate": 3.958443946150176e-06, + "loss": 0.5267, + "step": 4670 + }, + { + "epoch": 2.2699088145896655, + "grad_norm": 0.07581939008792042, + "learning_rate": 3.956571354191352e-06, + "loss": 0.5518, + "step": 4671 + }, + { + "epoch": 2.2703951367781157, + "grad_norm": 0.07367720647418047, + "learning_rate": 3.95469891523949e-06, + "loss": 0.5449, + "step": 4672 + }, + { + "epoch": 2.2708814589665653, + "grad_norm": 0.06898445831740564, + "learning_rate": 3.952826629569162e-06, + "loss": 0.5034, + "step": 4673 + }, + { + "epoch": 2.271367781155015, + "grad_norm": 0.07124674172456269, + "learning_rate": 3.950954497454916e-06, + "loss": 0.5213, + "step": 4674 + }, + { + "epoch": 2.271854103343465, + "grad_norm": 0.07317665317565439, + "learning_rate": 3.949082519171282e-06, + "loss": 0.527, + "step": 4675 + }, + { + "epoch": 2.272340425531915, + "grad_norm": 0.07177760821923058, + "learning_rate": 3.947210694992761e-06, + "loss": 0.5182, + "step": 4676 + }, + { + "epoch": 2.272826747720365, + "grad_norm": 0.07299501756526486, + "learning_rate": 3.945339025193837e-06, + "loss": 0.5334, + "step": 4677 + }, + { + "epoch": 2.2733130699088147, + "grad_norm": 0.07265700949450907, + "learning_rate": 3.943467510048969e-06, + "loss": 0.499, + "step": 4678 + }, + { + "epoch": 2.2737993920972643, + "grad_norm": 0.07297855579993766, + "learning_rate": 3.941596149832593e-06, + "loss": 0.5396, + "step": 4679 + }, + { + "epoch": 2.2742857142857145, + "grad_norm": 0.07111614061774357, + "learning_rate": 3.939724944819122e-06, + "loss": 0.521, + "step": 4680 + }, + { + "epoch": 2.274772036474164, + "grad_norm": 0.07058081484806333, + "learning_rate": 3.937853895282948e-06, + "loss": 0.514, + "step": 4681 + }, + { + "epoch": 2.275258358662614, + "grad_norm": 0.07133281626498338, + "learning_rate": 3.935983001498439e-06, + "loss": 0.5256, + "step": 4682 + }, + { + "epoch": 2.275744680851064, + "grad_norm": 0.079577019991552, + "learning_rate": 3.9341122637399395e-06, + "loss": 0.5137, + "step": 4683 + }, + { + "epoch": 2.2762310030395136, + "grad_norm": 0.07245763195009111, + "learning_rate": 3.932241682281774e-06, + "loss": 0.5395, + "step": 4684 + }, + { + "epoch": 2.2767173252279633, + "grad_norm": 0.0711027919037749, + "learning_rate": 3.93037125739824e-06, + "loss": 0.5041, + "step": 4685 + }, + { + "epoch": 2.2772036474164135, + "grad_norm": 0.07090235344559005, + "learning_rate": 3.928500989363614e-06, + "loss": 0.535, + "step": 4686 + }, + { + "epoch": 2.277689969604863, + "grad_norm": 0.07326328738846202, + "learning_rate": 3.9266308784521515e-06, + "loss": 0.5145, + "step": 4687 + }, + { + "epoch": 2.2781762917933133, + "grad_norm": 0.07415270979860497, + "learning_rate": 3.92476092493808e-06, + "loss": 0.5587, + "step": 4688 + }, + { + "epoch": 2.278662613981763, + "grad_norm": 0.07531879257606712, + "learning_rate": 3.922891129095609e-06, + "loss": 0.5633, + "step": 4689 + }, + { + "epoch": 2.2791489361702126, + "grad_norm": 0.07452808453864476, + "learning_rate": 3.9210214911989235e-06, + "loss": 0.5794, + "step": 4690 + }, + { + "epoch": 2.2796352583586628, + "grad_norm": 0.06826857272640785, + "learning_rate": 3.919152011522183e-06, + "loss": 0.4823, + "step": 4691 + }, + { + "epoch": 2.2801215805471124, + "grad_norm": 0.07137685960301225, + "learning_rate": 3.917282690339527e-06, + "loss": 0.4975, + "step": 4692 + }, + { + "epoch": 2.280607902735562, + "grad_norm": 0.07439128760967693, + "learning_rate": 3.915413527925069e-06, + "loss": 0.5278, + "step": 4693 + }, + { + "epoch": 2.2810942249240123, + "grad_norm": 0.07618049696879843, + "learning_rate": 3.913544524552899e-06, + "loss": 0.5567, + "step": 4694 + }, + { + "epoch": 2.281580547112462, + "grad_norm": 0.07384424073008243, + "learning_rate": 3.911675680497089e-06, + "loss": 0.5265, + "step": 4695 + }, + { + "epoch": 2.2820668693009116, + "grad_norm": 0.07291462394707528, + "learning_rate": 3.9098069960316805e-06, + "loss": 0.5036, + "step": 4696 + }, + { + "epoch": 2.2825531914893618, + "grad_norm": 0.06924792038609237, + "learning_rate": 3.907938471430697e-06, + "loss": 0.4844, + "step": 4697 + }, + { + "epoch": 2.2830395136778114, + "grad_norm": 0.07073242890386755, + "learning_rate": 3.906070106968135e-06, + "loss": 0.527, + "step": 4698 + }, + { + "epoch": 2.2835258358662616, + "grad_norm": 0.07398613169879711, + "learning_rate": 3.90420190291797e-06, + "loss": 0.5402, + "step": 4699 + }, + { + "epoch": 2.2840121580547113, + "grad_norm": 0.07156168035173004, + "learning_rate": 3.9023338595541535e-06, + "loss": 0.5294, + "step": 4700 + }, + { + "epoch": 2.284498480243161, + "grad_norm": 0.06993466863618257, + "learning_rate": 3.90046597715061e-06, + "loss": 0.545, + "step": 4701 + }, + { + "epoch": 2.284984802431611, + "grad_norm": 0.07086382449912602, + "learning_rate": 3.898598255981245e-06, + "loss": 0.5151, + "step": 4702 + }, + { + "epoch": 2.2854711246200607, + "grad_norm": 0.07101506174003709, + "learning_rate": 3.8967306963199394e-06, + "loss": 0.4801, + "step": 4703 + }, + { + "epoch": 2.285957446808511, + "grad_norm": 0.07147039245482908, + "learning_rate": 3.894863298440548e-06, + "loss": 0.5235, + "step": 4704 + }, + { + "epoch": 2.2864437689969606, + "grad_norm": 0.06988310483689339, + "learning_rate": 3.8929960626169036e-06, + "loss": 0.5272, + "step": 4705 + }, + { + "epoch": 2.2869300911854102, + "grad_norm": 0.07048340393795582, + "learning_rate": 3.891128989122816e-06, + "loss": 0.5355, + "step": 4706 + }, + { + "epoch": 2.2874164133738604, + "grad_norm": 0.0743465997284842, + "learning_rate": 3.889262078232071e-06, + "loss": 0.5662, + "step": 4707 + }, + { + "epoch": 2.28790273556231, + "grad_norm": 0.07107155213536039, + "learning_rate": 3.887395330218429e-06, + "loss": 0.5308, + "step": 4708 + }, + { + "epoch": 2.2883890577507597, + "grad_norm": 0.07308034590818346, + "learning_rate": 3.8855287453556275e-06, + "loss": 0.5128, + "step": 4709 + }, + { + "epoch": 2.28887537993921, + "grad_norm": 0.07297240475753164, + "learning_rate": 3.8836623239173794e-06, + "loss": 0.5137, + "step": 4710 + }, + { + "epoch": 2.2893617021276595, + "grad_norm": 0.07148740551001319, + "learning_rate": 3.881796066177374e-06, + "loss": 0.4975, + "step": 4711 + }, + { + "epoch": 2.2898480243161092, + "grad_norm": 0.07199130545353712, + "learning_rate": 3.879929972409276e-06, + "loss": 0.5352, + "step": 4712 + }, + { + "epoch": 2.2903343465045594, + "grad_norm": 0.07472586487009215, + "learning_rate": 3.87806404288673e-06, + "loss": 0.5494, + "step": 4713 + }, + { + "epoch": 2.290820668693009, + "grad_norm": 0.07351698648123663, + "learning_rate": 3.876198277883353e-06, + "loss": 0.5177, + "step": 4714 + }, + { + "epoch": 2.291306990881459, + "grad_norm": 0.07444764856092664, + "learning_rate": 3.874332677672735e-06, + "loss": 0.5549, + "step": 4715 + }, + { + "epoch": 2.291793313069909, + "grad_norm": 0.07391139936003299, + "learning_rate": 3.872467242528448e-06, + "loss": 0.5604, + "step": 4716 + }, + { + "epoch": 2.2922796352583585, + "grad_norm": 0.06864485214114208, + "learning_rate": 3.870601972724036e-06, + "loss": 0.5119, + "step": 4717 + }, + { + "epoch": 2.2927659574468087, + "grad_norm": 0.0727952036298192, + "learning_rate": 3.868736868533019e-06, + "loss": 0.5045, + "step": 4718 + }, + { + "epoch": 2.2932522796352584, + "grad_norm": 0.07105980123026319, + "learning_rate": 3.866871930228894e-06, + "loss": 0.5121, + "step": 4719 + }, + { + "epoch": 2.293738601823708, + "grad_norm": 0.07278632325963608, + "learning_rate": 3.865007158085134e-06, + "loss": 0.5251, + "step": 4720 + }, + { + "epoch": 2.294224924012158, + "grad_norm": 0.07036386733435124, + "learning_rate": 3.863142552375184e-06, + "loss": 0.5125, + "step": 4721 + }, + { + "epoch": 2.294711246200608, + "grad_norm": 0.07184550273936362, + "learning_rate": 3.8612781133724695e-06, + "loss": 0.5339, + "step": 4722 + }, + { + "epoch": 2.2951975683890575, + "grad_norm": 0.07296058221154193, + "learning_rate": 3.859413841350388e-06, + "loss": 0.5394, + "step": 4723 + }, + { + "epoch": 2.2956838905775077, + "grad_norm": 0.07301576172435108, + "learning_rate": 3.8575497365823164e-06, + "loss": 0.547, + "step": 4724 + }, + { + "epoch": 2.2961702127659573, + "grad_norm": 0.07148596667984473, + "learning_rate": 3.855685799341601e-06, + "loss": 0.5085, + "step": 4725 + }, + { + "epoch": 2.2966565349544075, + "grad_norm": 0.07141941471796145, + "learning_rate": 3.853822029901568e-06, + "loss": 0.5258, + "step": 4726 + }, + { + "epoch": 2.297142857142857, + "grad_norm": 0.07187324843725157, + "learning_rate": 3.85195842853552e-06, + "loss": 0.5476, + "step": 4727 + }, + { + "epoch": 2.297629179331307, + "grad_norm": 0.07151277663867313, + "learning_rate": 3.85009499551673e-06, + "loss": 0.5243, + "step": 4728 + }, + { + "epoch": 2.298115501519757, + "grad_norm": 0.06959874803827659, + "learning_rate": 3.848231731118452e-06, + "loss": 0.479, + "step": 4729 + }, + { + "epoch": 2.2986018237082066, + "grad_norm": 0.07133049646175173, + "learning_rate": 3.846368635613912e-06, + "loss": 0.5044, + "step": 4730 + }, + { + "epoch": 2.2990881458966568, + "grad_norm": 0.07249016020552276, + "learning_rate": 3.8445057092763086e-06, + "loss": 0.5206, + "step": 4731 + }, + { + "epoch": 2.2995744680851065, + "grad_norm": 0.07414595690934293, + "learning_rate": 3.842642952378823e-06, + "loss": 0.4939, + "step": 4732 + }, + { + "epoch": 2.300060790273556, + "grad_norm": 0.0732216936531094, + "learning_rate": 3.840780365194606e-06, + "loss": 0.5117, + "step": 4733 + }, + { + "epoch": 2.3005471124620063, + "grad_norm": 0.07005468712117852, + "learning_rate": 3.838917947996786e-06, + "loss": 0.5215, + "step": 4734 + }, + { + "epoch": 2.301033434650456, + "grad_norm": 0.07158626863991081, + "learning_rate": 3.837055701058462e-06, + "loss": 0.5067, + "step": 4735 + }, + { + "epoch": 2.3015197568389056, + "grad_norm": 0.07049661636196115, + "learning_rate": 3.835193624652714e-06, + "loss": 0.4754, + "step": 4736 + }, + { + "epoch": 2.3020060790273558, + "grad_norm": 0.06993799380420346, + "learning_rate": 3.833331719052593e-06, + "loss": 0.5046, + "step": 4737 + }, + { + "epoch": 2.3024924012158055, + "grad_norm": 0.07070982916552473, + "learning_rate": 3.8314699845311295e-06, + "loss": 0.5014, + "step": 4738 + }, + { + "epoch": 2.302978723404255, + "grad_norm": 0.07131370952569205, + "learning_rate": 3.829608421361321e-06, + "loss": 0.5261, + "step": 4739 + }, + { + "epoch": 2.3034650455927053, + "grad_norm": 0.07281328662297258, + "learning_rate": 3.827747029816148e-06, + "loss": 0.5169, + "step": 4740 + }, + { + "epoch": 2.303951367781155, + "grad_norm": 0.0730018194450882, + "learning_rate": 3.82588581016856e-06, + "loss": 0.5345, + "step": 4741 + }, + { + "epoch": 2.304437689969605, + "grad_norm": 0.07176971230555247, + "learning_rate": 3.824024762691485e-06, + "loss": 0.5591, + "step": 4742 + }, + { + "epoch": 2.3049240121580548, + "grad_norm": 0.0730575520211184, + "learning_rate": 3.822163887657825e-06, + "loss": 0.5016, + "step": 4743 + }, + { + "epoch": 2.3054103343465044, + "grad_norm": 0.07111386060619199, + "learning_rate": 3.820303185340456e-06, + "loss": 0.5157, + "step": 4744 + }, + { + "epoch": 2.3058966565349546, + "grad_norm": 0.0719805286930231, + "learning_rate": 3.818442656012228e-06, + "loss": 0.5297, + "step": 4745 + }, + { + "epoch": 2.3063829787234043, + "grad_norm": 0.07345378781834982, + "learning_rate": 3.816582299945967e-06, + "loss": 0.5341, + "step": 4746 + }, + { + "epoch": 2.306869300911854, + "grad_norm": 0.07203398851803429, + "learning_rate": 3.814722117414473e-06, + "loss": 0.4971, + "step": 4747 + }, + { + "epoch": 2.307355623100304, + "grad_norm": 0.07108983146392801, + "learning_rate": 3.812862108690522e-06, + "loss": 0.5231, + "step": 4748 + }, + { + "epoch": 2.3078419452887537, + "grad_norm": 0.07077642296207613, + "learning_rate": 3.8110022740468587e-06, + "loss": 0.5099, + "step": 4749 + }, + { + "epoch": 2.3083282674772034, + "grad_norm": 0.0718048590818655, + "learning_rate": 3.8091426137562128e-06, + "loss": 0.5031, + "step": 4750 + }, + { + "epoch": 2.3088145896656536, + "grad_norm": 0.07155037591587134, + "learning_rate": 3.8072831280912785e-06, + "loss": 0.5534, + "step": 4751 + }, + { + "epoch": 2.3093009118541032, + "grad_norm": 0.07114749677425357, + "learning_rate": 3.8054238173247295e-06, + "loss": 0.5054, + "step": 4752 + }, + { + "epoch": 2.3097872340425534, + "grad_norm": 0.0701140022782335, + "learning_rate": 3.8035646817292136e-06, + "loss": 0.5116, + "step": 4753 + }, + { + "epoch": 2.310273556231003, + "grad_norm": 0.07063718472295341, + "learning_rate": 3.8017057215773502e-06, + "loss": 0.5194, + "step": 4754 + }, + { + "epoch": 2.3107598784194527, + "grad_norm": 0.07179254125621902, + "learning_rate": 3.799846937141734e-06, + "loss": 0.5287, + "step": 4755 + }, + { + "epoch": 2.311246200607903, + "grad_norm": 0.07255922385885435, + "learning_rate": 3.7979883286949366e-06, + "loss": 0.5177, + "step": 4756 + }, + { + "epoch": 2.3117325227963526, + "grad_norm": 0.06930819228017146, + "learning_rate": 3.7961298965095005e-06, + "loss": 0.5051, + "step": 4757 + }, + { + "epoch": 2.3122188449848027, + "grad_norm": 0.07497506082260916, + "learning_rate": 3.794271640857945e-06, + "loss": 0.5585, + "step": 4758 + }, + { + "epoch": 2.3127051671732524, + "grad_norm": 0.0699955603041079, + "learning_rate": 3.792413562012761e-06, + "loss": 0.5191, + "step": 4759 + }, + { + "epoch": 2.313191489361702, + "grad_norm": 0.07171338950978387, + "learning_rate": 3.790555660246415e-06, + "loss": 0.5253, + "step": 4760 + }, + { + "epoch": 2.3136778115501517, + "grad_norm": 0.07228107672574575, + "learning_rate": 3.7886979358313477e-06, + "loss": 0.5367, + "step": 4761 + }, + { + "epoch": 2.314164133738602, + "grad_norm": 0.07395584018772629, + "learning_rate": 3.7868403890399734e-06, + "loss": 0.5715, + "step": 4762 + }, + { + "epoch": 2.3146504559270515, + "grad_norm": 0.07755690422327892, + "learning_rate": 3.784983020144679e-06, + "loss": 0.5908, + "step": 4763 + }, + { + "epoch": 2.3151367781155017, + "grad_norm": 0.07157445152422266, + "learning_rate": 3.7831258294178268e-06, + "loss": 0.5094, + "step": 4764 + }, + { + "epoch": 2.3156231003039514, + "grad_norm": 0.06955610533387216, + "learning_rate": 3.7812688171317534e-06, + "loss": 0.4898, + "step": 4765 + }, + { + "epoch": 2.316109422492401, + "grad_norm": 0.07203231578730333, + "learning_rate": 3.7794119835587687e-06, + "loss": 0.5197, + "step": 4766 + }, + { + "epoch": 2.316595744680851, + "grad_norm": 0.07098383689277571, + "learning_rate": 3.7775553289711536e-06, + "loss": 0.4942, + "step": 4767 + }, + { + "epoch": 2.317082066869301, + "grad_norm": 0.06945193951382612, + "learning_rate": 3.775698853641171e-06, + "loss": 0.5191, + "step": 4768 + }, + { + "epoch": 2.317568389057751, + "grad_norm": 0.0724118194764258, + "learning_rate": 3.7738425578410477e-06, + "loss": 0.5253, + "step": 4769 + }, + { + "epoch": 2.3180547112462007, + "grad_norm": 0.07415442115060311, + "learning_rate": 3.7719864418429887e-06, + "loss": 0.5425, + "step": 4770 + }, + { + "epoch": 2.3185410334346503, + "grad_norm": 0.07288948164699163, + "learning_rate": 3.7701305059191736e-06, + "loss": 0.5358, + "step": 4771 + }, + { + "epoch": 2.3190273556231005, + "grad_norm": 0.07314320607063393, + "learning_rate": 3.7682747503417537e-06, + "loss": 0.5181, + "step": 4772 + }, + { + "epoch": 2.31951367781155, + "grad_norm": 0.07141040186456821, + "learning_rate": 3.7664191753828536e-06, + "loss": 0.5355, + "step": 4773 + }, + { + "epoch": 2.32, + "grad_norm": 0.07141149119131202, + "learning_rate": 3.764563781314574e-06, + "loss": 0.5388, + "step": 4774 + }, + { + "epoch": 2.32048632218845, + "grad_norm": 0.07050257665061001, + "learning_rate": 3.762708568408987e-06, + "loss": 0.5055, + "step": 4775 + }, + { + "epoch": 2.3209726443768997, + "grad_norm": 0.07036539776286459, + "learning_rate": 3.760853536938137e-06, + "loss": 0.5025, + "step": 4776 + }, + { + "epoch": 2.3214589665653493, + "grad_norm": 0.07005550035762793, + "learning_rate": 3.7589986871740466e-06, + "loss": 0.4919, + "step": 4777 + }, + { + "epoch": 2.3219452887537995, + "grad_norm": 0.07613550958473317, + "learning_rate": 3.7571440193887044e-06, + "loss": 0.5536, + "step": 4778 + }, + { + "epoch": 2.322431610942249, + "grad_norm": 0.07806552850604012, + "learning_rate": 3.7552895338540785e-06, + "loss": 0.5183, + "step": 4779 + }, + { + "epoch": 2.3229179331306993, + "grad_norm": 0.07288733633932881, + "learning_rate": 3.7534352308421075e-06, + "loss": 0.5026, + "step": 4780 + }, + { + "epoch": 2.323404255319149, + "grad_norm": 0.07202825418931809, + "learning_rate": 3.7515811106247047e-06, + "loss": 0.5293, + "step": 4781 + }, + { + "epoch": 2.3238905775075986, + "grad_norm": 0.0708108492366933, + "learning_rate": 3.7497271734737545e-06, + "loss": 0.5065, + "step": 4782 + }, + { + "epoch": 2.3243768996960488, + "grad_norm": 0.0723607877647087, + "learning_rate": 3.7478734196611172e-06, + "loss": 0.5546, + "step": 4783 + }, + { + "epoch": 2.3248632218844985, + "grad_norm": 0.07195535147468135, + "learning_rate": 3.7460198494586236e-06, + "loss": 0.5342, + "step": 4784 + }, + { + "epoch": 2.325349544072948, + "grad_norm": 0.06962206754001322, + "learning_rate": 3.7441664631380787e-06, + "loss": 0.5238, + "step": 4785 + }, + { + "epoch": 2.3258358662613983, + "grad_norm": 0.06909603792505634, + "learning_rate": 3.7423132609712613e-06, + "loss": 0.512, + "step": 4786 + }, + { + "epoch": 2.326322188449848, + "grad_norm": 0.07044052794358648, + "learning_rate": 3.740460243229923e-06, + "loss": 0.5187, + "step": 4787 + }, + { + "epoch": 2.3268085106382976, + "grad_norm": 0.07491907293544683, + "learning_rate": 3.7386074101857866e-06, + "loss": 0.5827, + "step": 4788 + }, + { + "epoch": 2.3272948328267478, + "grad_norm": 0.07142568653057813, + "learning_rate": 3.736754762110549e-06, + "loss": 0.5395, + "step": 4789 + }, + { + "epoch": 2.3277811550151974, + "grad_norm": 0.07071150138241877, + "learning_rate": 3.7349022992758816e-06, + "loss": 0.5343, + "step": 4790 + }, + { + "epoch": 2.3282674772036476, + "grad_norm": 0.07158552480920521, + "learning_rate": 3.733050021953425e-06, + "loss": 0.5355, + "step": 4791 + }, + { + "epoch": 2.3287537993920973, + "grad_norm": 0.07190579571713522, + "learning_rate": 3.731197930414797e-06, + "loss": 0.509, + "step": 4792 + }, + { + "epoch": 2.329240121580547, + "grad_norm": 0.0719205145585961, + "learning_rate": 3.7293460249315826e-06, + "loss": 0.5204, + "step": 4793 + }, + { + "epoch": 2.329726443768997, + "grad_norm": 0.07390191912726275, + "learning_rate": 3.7274943057753455e-06, + "loss": 0.5365, + "step": 4794 + }, + { + "epoch": 2.3302127659574468, + "grad_norm": 0.07421952266945099, + "learning_rate": 3.725642773217617e-06, + "loss": 0.5496, + "step": 4795 + }, + { + "epoch": 2.330699088145897, + "grad_norm": 0.07368880630778797, + "learning_rate": 3.7237914275299057e-06, + "loss": 0.543, + "step": 4796 + }, + { + "epoch": 2.3311854103343466, + "grad_norm": 0.07089745500331597, + "learning_rate": 3.721940268983688e-06, + "loss": 0.5415, + "step": 4797 + }, + { + "epoch": 2.3316717325227962, + "grad_norm": 0.06981533793513522, + "learning_rate": 3.720089297850418e-06, + "loss": 0.506, + "step": 4798 + }, + { + "epoch": 2.3321580547112464, + "grad_norm": 0.07037159633706239, + "learning_rate": 3.7182385144015165e-06, + "loss": 0.5255, + "step": 4799 + }, + { + "epoch": 2.332644376899696, + "grad_norm": 0.06824072749249188, + "learning_rate": 3.716387918908383e-06, + "loss": 0.511, + "step": 4800 + }, + { + "epoch": 2.3331306990881457, + "grad_norm": 0.07148729152702869, + "learning_rate": 3.7145375116423847e-06, + "loss": 0.5298, + "step": 4801 + }, + { + "epoch": 2.333617021276596, + "grad_norm": 0.07303649687938313, + "learning_rate": 3.7126872928748623e-06, + "loss": 0.5027, + "step": 4802 + }, + { + "epoch": 2.3341033434650456, + "grad_norm": 0.071207249943337, + "learning_rate": 3.7108372628771284e-06, + "loss": 0.5055, + "step": 4803 + }, + { + "epoch": 2.3345896656534952, + "grad_norm": 0.07197906235010862, + "learning_rate": 3.7089874219204715e-06, + "loss": 0.5383, + "step": 4804 + }, + { + "epoch": 2.3350759878419454, + "grad_norm": 0.07362050908452387, + "learning_rate": 3.707137770276149e-06, + "loss": 0.5428, + "step": 4805 + }, + { + "epoch": 2.335562310030395, + "grad_norm": 0.07671725407732392, + "learning_rate": 3.7052883082153927e-06, + "loss": 0.5407, + "step": 4806 + }, + { + "epoch": 2.336048632218845, + "grad_norm": 0.07427949562763843, + "learning_rate": 3.7034390360094026e-06, + "loss": 0.5472, + "step": 4807 + }, + { + "epoch": 2.336534954407295, + "grad_norm": 0.07145509561176622, + "learning_rate": 3.701589953929354e-06, + "loss": 0.5186, + "step": 4808 + }, + { + "epoch": 2.3370212765957445, + "grad_norm": 0.07342895170374958, + "learning_rate": 3.6997410622463947e-06, + "loss": 0.554, + "step": 4809 + }, + { + "epoch": 2.3375075987841947, + "grad_norm": 0.07333972793510114, + "learning_rate": 3.6978923612316427e-06, + "loss": 0.5575, + "step": 4810 + }, + { + "epoch": 2.3379939209726444, + "grad_norm": 0.07116944443519507, + "learning_rate": 3.6960438511561897e-06, + "loss": 0.5343, + "step": 4811 + }, + { + "epoch": 2.338480243161094, + "grad_norm": 0.07030477272928837, + "learning_rate": 3.694195532291098e-06, + "loss": 0.5166, + "step": 4812 + }, + { + "epoch": 2.338966565349544, + "grad_norm": 0.07121734054318392, + "learning_rate": 3.6923474049074037e-06, + "loss": 0.491, + "step": 4813 + }, + { + "epoch": 2.339452887537994, + "grad_norm": 0.07407484792987686, + "learning_rate": 3.690499469276113e-06, + "loss": 0.5565, + "step": 4814 + }, + { + "epoch": 2.3399392097264435, + "grad_norm": 0.07365003602827443, + "learning_rate": 3.6886517256682053e-06, + "loss": 0.5295, + "step": 4815 + }, + { + "epoch": 2.3404255319148937, + "grad_norm": 0.07009545778284128, + "learning_rate": 3.686804174354631e-06, + "loss": 0.5008, + "step": 4816 + }, + { + "epoch": 2.3409118541033433, + "grad_norm": 0.07093665843271561, + "learning_rate": 3.684956815606311e-06, + "loss": 0.5033, + "step": 4817 + }, + { + "epoch": 2.3413981762917935, + "grad_norm": 0.07405490483646458, + "learning_rate": 3.683109649694141e-06, + "loss": 0.5465, + "step": 4818 + }, + { + "epoch": 2.341884498480243, + "grad_norm": 0.0702377861505547, + "learning_rate": 3.681262676888987e-06, + "loss": 0.5185, + "step": 4819 + }, + { + "epoch": 2.342370820668693, + "grad_norm": 0.07231639145354879, + "learning_rate": 3.6794158974616857e-06, + "loss": 0.5201, + "step": 4820 + }, + { + "epoch": 2.342857142857143, + "grad_norm": 0.07176915036529734, + "learning_rate": 3.6775693116830456e-06, + "loss": 0.5498, + "step": 4821 + }, + { + "epoch": 2.3433434650455927, + "grad_norm": 0.07051901451212278, + "learning_rate": 3.67572291982385e-06, + "loss": 0.4991, + "step": 4822 + }, + { + "epoch": 2.343829787234043, + "grad_norm": 0.06992144811152087, + "learning_rate": 3.6738767221548505e-06, + "loss": 0.4726, + "step": 4823 + }, + { + "epoch": 2.3443161094224925, + "grad_norm": 0.07274848353394854, + "learning_rate": 3.6720307189467702e-06, + "loss": 0.5203, + "step": 4824 + }, + { + "epoch": 2.344802431610942, + "grad_norm": 0.06974851133236662, + "learning_rate": 3.6701849104703046e-06, + "loss": 0.5117, + "step": 4825 + }, + { + "epoch": 2.3452887537993923, + "grad_norm": 0.07503067916975177, + "learning_rate": 3.6683392969961213e-06, + "loss": 0.5434, + "step": 4826 + }, + { + "epoch": 2.345775075987842, + "grad_norm": 0.07194162855671007, + "learning_rate": 3.666493878794858e-06, + "loss": 0.5428, + "step": 4827 + }, + { + "epoch": 2.3462613981762916, + "grad_norm": 0.07654877556882764, + "learning_rate": 3.664648656137124e-06, + "loss": 0.5215, + "step": 4828 + }, + { + "epoch": 2.3467477203647418, + "grad_norm": 0.06877015547464911, + "learning_rate": 3.662803629293501e-06, + "loss": 0.4833, + "step": 4829 + }, + { + "epoch": 2.3472340425531915, + "grad_norm": 0.07081742857510276, + "learning_rate": 3.6609587985345418e-06, + "loss": 0.4967, + "step": 4830 + }, + { + "epoch": 2.347720364741641, + "grad_norm": 0.07099546314230694, + "learning_rate": 3.6591141641307683e-06, + "loss": 0.526, + "step": 4831 + }, + { + "epoch": 2.3482066869300913, + "grad_norm": 0.07009258986916114, + "learning_rate": 3.657269726352676e-06, + "loss": 0.5141, + "step": 4832 + }, + { + "epoch": 2.348693009118541, + "grad_norm": 0.07150192698685824, + "learning_rate": 3.6554254854707294e-06, + "loss": 0.5088, + "step": 4833 + }, + { + "epoch": 2.349179331306991, + "grad_norm": 0.07117969510673795, + "learning_rate": 3.6535814417553674e-06, + "loss": 0.5034, + "step": 4834 + }, + { + "epoch": 2.3496656534954408, + "grad_norm": 0.07634931192122975, + "learning_rate": 3.6517375954769975e-06, + "loss": 0.5341, + "step": 4835 + }, + { + "epoch": 2.3501519756838904, + "grad_norm": 0.07494425914526873, + "learning_rate": 3.649893946905999e-06, + "loss": 0.5745, + "step": 4836 + }, + { + "epoch": 2.3506382978723406, + "grad_norm": 0.07001160047298115, + "learning_rate": 3.648050496312721e-06, + "loss": 0.487, + "step": 4837 + }, + { + "epoch": 2.3511246200607903, + "grad_norm": 0.06904929264094169, + "learning_rate": 3.6462072439674857e-06, + "loss": 0.484, + "step": 4838 + }, + { + "epoch": 2.35161094224924, + "grad_norm": 0.07184589864888712, + "learning_rate": 3.6443641901405834e-06, + "loss": 0.5626, + "step": 4839 + }, + { + "epoch": 2.35209726443769, + "grad_norm": 0.07413227159000095, + "learning_rate": 3.6425213351022803e-06, + "loss": 0.5473, + "step": 4840 + }, + { + "epoch": 2.3525835866261398, + "grad_norm": 0.07070517806821587, + "learning_rate": 3.640678679122808e-06, + "loss": 0.5197, + "step": 4841 + }, + { + "epoch": 2.3530699088145894, + "grad_norm": 0.0739435464504075, + "learning_rate": 3.6388362224723705e-06, + "loss": 0.5512, + "step": 4842 + }, + { + "epoch": 2.3535562310030396, + "grad_norm": 0.06968604104799363, + "learning_rate": 3.636993965421144e-06, + "loss": 0.514, + "step": 4843 + }, + { + "epoch": 2.3540425531914893, + "grad_norm": 0.07414563041898994, + "learning_rate": 3.635151908239275e-06, + "loss": 0.5619, + "step": 4844 + }, + { + "epoch": 2.3545288753799394, + "grad_norm": 0.0763558038856102, + "learning_rate": 3.6333100511968807e-06, + "loss": 0.5319, + "step": 4845 + }, + { + "epoch": 2.355015197568389, + "grad_norm": 0.0713713703377589, + "learning_rate": 3.6314683945640462e-06, + "loss": 0.4953, + "step": 4846 + }, + { + "epoch": 2.3555015197568387, + "grad_norm": 0.07386739311345307, + "learning_rate": 3.629626938610831e-06, + "loss": 0.5098, + "step": 4847 + }, + { + "epoch": 2.355987841945289, + "grad_norm": 0.07077049155481384, + "learning_rate": 3.6277856836072647e-06, + "loss": 0.5093, + "step": 4848 + }, + { + "epoch": 2.3564741641337386, + "grad_norm": 0.07648380639768826, + "learning_rate": 3.6259446298233434e-06, + "loss": 0.5799, + "step": 4849 + }, + { + "epoch": 2.3569604863221887, + "grad_norm": 0.07352679893735549, + "learning_rate": 3.62410377752904e-06, + "loss": 0.5052, + "step": 4850 + }, + { + "epoch": 2.3574468085106384, + "grad_norm": 0.07355841291565811, + "learning_rate": 3.6222631269942933e-06, + "loss": 0.5203, + "step": 4851 + }, + { + "epoch": 2.357933130699088, + "grad_norm": 0.07470438722217965, + "learning_rate": 3.620422678489014e-06, + "loss": 0.5285, + "step": 4852 + }, + { + "epoch": 2.358419452887538, + "grad_norm": 0.07189338853898246, + "learning_rate": 3.618582432283082e-06, + "loss": 0.5432, + "step": 4853 + }, + { + "epoch": 2.358905775075988, + "grad_norm": 0.07311963763751952, + "learning_rate": 3.616742388646351e-06, + "loss": 0.5386, + "step": 4854 + }, + { + "epoch": 2.3593920972644375, + "grad_norm": 0.07099606579083245, + "learning_rate": 3.6149025478486393e-06, + "loss": 0.4954, + "step": 4855 + }, + { + "epoch": 2.3598784194528877, + "grad_norm": 0.0750258337291273, + "learning_rate": 3.6130629101597404e-06, + "loss": 0.5387, + "step": 4856 + }, + { + "epoch": 2.3603647416413374, + "grad_norm": 0.07569689187905691, + "learning_rate": 3.6112234758494156e-06, + "loss": 0.5514, + "step": 4857 + }, + { + "epoch": 2.360851063829787, + "grad_norm": 0.07319743555953466, + "learning_rate": 3.6093842451873955e-06, + "loss": 0.5027, + "step": 4858 + }, + { + "epoch": 2.361337386018237, + "grad_norm": 0.07370897480139857, + "learning_rate": 3.6075452184433867e-06, + "loss": 0.5318, + "step": 4859 + }, + { + "epoch": 2.361823708206687, + "grad_norm": 0.07117204860689534, + "learning_rate": 3.6057063958870604e-06, + "loss": 0.5186, + "step": 4860 + }, + { + "epoch": 2.362310030395137, + "grad_norm": 0.07099939836852176, + "learning_rate": 3.6038677777880564e-06, + "loss": 0.5298, + "step": 4861 + }, + { + "epoch": 2.3627963525835867, + "grad_norm": 0.07109218678986207, + "learning_rate": 3.6020293644159887e-06, + "loss": 0.5181, + "step": 4862 + }, + { + "epoch": 2.3632826747720364, + "grad_norm": 0.0728788035204336, + "learning_rate": 3.6001911560404403e-06, + "loss": 0.5518, + "step": 4863 + }, + { + "epoch": 2.3637689969604865, + "grad_norm": 0.06970638306718911, + "learning_rate": 3.5983531529309625e-06, + "loss": 0.5256, + "step": 4864 + }, + { + "epoch": 2.364255319148936, + "grad_norm": 0.0708632735719275, + "learning_rate": 3.5965153553570774e-06, + "loss": 0.5184, + "step": 4865 + }, + { + "epoch": 2.364741641337386, + "grad_norm": 0.07267390571201542, + "learning_rate": 3.594677763588279e-06, + "loss": 0.5231, + "step": 4866 + }, + { + "epoch": 2.365227963525836, + "grad_norm": 0.07136855787566969, + "learning_rate": 3.592840377894028e-06, + "loss": 0.5176, + "step": 4867 + }, + { + "epoch": 2.3657142857142857, + "grad_norm": 0.07032806839767311, + "learning_rate": 3.5910031985437553e-06, + "loss": 0.5312, + "step": 4868 + }, + { + "epoch": 2.3662006079027353, + "grad_norm": 0.07234474630538439, + "learning_rate": 3.589166225806865e-06, + "loss": 0.5298, + "step": 4869 + }, + { + "epoch": 2.3666869300911855, + "grad_norm": 0.07240200475705828, + "learning_rate": 3.5873294599527255e-06, + "loss": 0.5472, + "step": 4870 + }, + { + "epoch": 2.367173252279635, + "grad_norm": 0.07406665008836129, + "learning_rate": 3.5854929012506788e-06, + "loss": 0.5362, + "step": 4871 + }, + { + "epoch": 2.3676595744680853, + "grad_norm": 0.06955507412384188, + "learning_rate": 3.5836565499700348e-06, + "loss": 0.494, + "step": 4872 + }, + { + "epoch": 2.368145896656535, + "grad_norm": 0.07437143351671988, + "learning_rate": 3.581820406380075e-06, + "loss": 0.5387, + "step": 4873 + }, + { + "epoch": 2.3686322188449846, + "grad_norm": 0.07197375685166531, + "learning_rate": 3.5799844707500475e-06, + "loss": 0.5355, + "step": 4874 + }, + { + "epoch": 2.3691185410334348, + "grad_norm": 0.07343538810466457, + "learning_rate": 3.5781487433491724e-06, + "loss": 0.501, + "step": 4875 + }, + { + "epoch": 2.3696048632218845, + "grad_norm": 0.07493330334459582, + "learning_rate": 3.5763132244466363e-06, + "loss": 0.5111, + "step": 4876 + }, + { + "epoch": 2.3700911854103346, + "grad_norm": 0.07184650306558257, + "learning_rate": 3.5744779143116005e-06, + "loss": 0.5146, + "step": 4877 + }, + { + "epoch": 2.3705775075987843, + "grad_norm": 0.07148059921025327, + "learning_rate": 3.5726428132131902e-06, + "loss": 0.5367, + "step": 4878 + }, + { + "epoch": 2.371063829787234, + "grad_norm": 0.07157251056385625, + "learning_rate": 3.5708079214205027e-06, + "loss": 0.5399, + "step": 4879 + }, + { + "epoch": 2.371550151975684, + "grad_norm": 0.07086216464861735, + "learning_rate": 3.5689732392026044e-06, + "loss": 0.5176, + "step": 4880 + }, + { + "epoch": 2.3720364741641338, + "grad_norm": 0.07354005094734528, + "learning_rate": 3.5671387668285294e-06, + "loss": 0.5609, + "step": 4881 + }, + { + "epoch": 2.3725227963525835, + "grad_norm": 0.07361620052596375, + "learning_rate": 3.565304504567284e-06, + "loss": 0.5239, + "step": 4882 + }, + { + "epoch": 2.3730091185410336, + "grad_norm": 0.07340444972490866, + "learning_rate": 3.5634704526878405e-06, + "loss": 0.5837, + "step": 4883 + }, + { + "epoch": 2.3734954407294833, + "grad_norm": 0.071420407144292, + "learning_rate": 3.561636611459143e-06, + "loss": 0.5227, + "step": 4884 + }, + { + "epoch": 2.373981762917933, + "grad_norm": 0.07335219183256791, + "learning_rate": 3.559802981150102e-06, + "loss": 0.5083, + "step": 4885 + }, + { + "epoch": 2.374468085106383, + "grad_norm": 0.07195634767795334, + "learning_rate": 3.557969562029599e-06, + "loss": 0.5199, + "step": 4886 + }, + { + "epoch": 2.3749544072948328, + "grad_norm": 0.07223007961779847, + "learning_rate": 3.5561363543664846e-06, + "loss": 0.5431, + "step": 4887 + }, + { + "epoch": 2.375440729483283, + "grad_norm": 0.07330084517468019, + "learning_rate": 3.5543033584295775e-06, + "loss": 0.5466, + "step": 4888 + }, + { + "epoch": 2.3759270516717326, + "grad_norm": 0.0713798829929337, + "learning_rate": 3.5524705744876666e-06, + "loss": 0.5222, + "step": 4889 + }, + { + "epoch": 2.3764133738601823, + "grad_norm": 0.07173041515811288, + "learning_rate": 3.550638002809507e-06, + "loss": 0.5019, + "step": 4890 + }, + { + "epoch": 2.3768996960486324, + "grad_norm": 0.070172140030036, + "learning_rate": 3.548805643663826e-06, + "loss": 0.4846, + "step": 4891 + }, + { + "epoch": 2.377386018237082, + "grad_norm": 0.0737277339039337, + "learning_rate": 3.546973497319319e-06, + "loss": 0.5242, + "step": 4892 + }, + { + "epoch": 2.3778723404255317, + "grad_norm": 0.07381191572837867, + "learning_rate": 3.5451415640446485e-06, + "loss": 0.5796, + "step": 4893 + }, + { + "epoch": 2.378358662613982, + "grad_norm": 0.07046316987745402, + "learning_rate": 3.543309844108444e-06, + "loss": 0.5197, + "step": 4894 + }, + { + "epoch": 2.3788449848024316, + "grad_norm": 0.07114823390414793, + "learning_rate": 3.5414783377793105e-06, + "loss": 0.5134, + "step": 4895 + }, + { + "epoch": 2.3793313069908812, + "grad_norm": 0.07696036127901158, + "learning_rate": 3.539647045325817e-06, + "loss": 0.5797, + "step": 4896 + }, + { + "epoch": 2.3798176291793314, + "grad_norm": 0.07314560125293358, + "learning_rate": 3.5378159670165e-06, + "loss": 0.5219, + "step": 4897 + }, + { + "epoch": 2.380303951367781, + "grad_norm": 0.072809667254612, + "learning_rate": 3.5359851031198687e-06, + "loss": 0.5237, + "step": 4898 + }, + { + "epoch": 2.380790273556231, + "grad_norm": 0.0718829561423349, + "learning_rate": 3.534154453904396e-06, + "loss": 0.5233, + "step": 4899 + }, + { + "epoch": 2.381276595744681, + "grad_norm": 0.0725935395427643, + "learning_rate": 3.5323240196385265e-06, + "loss": 0.5416, + "step": 4900 + }, + { + "epoch": 2.3817629179331306, + "grad_norm": 0.0703349844653874, + "learning_rate": 3.530493800590674e-06, + "loss": 0.5255, + "step": 4901 + }, + { + "epoch": 2.3822492401215807, + "grad_norm": 0.07275114629535887, + "learning_rate": 3.5286637970292176e-06, + "loss": 0.5193, + "step": 4902 + }, + { + "epoch": 2.3827355623100304, + "grad_norm": 0.07205386414748965, + "learning_rate": 3.5268340092225074e-06, + "loss": 0.5229, + "step": 4903 + }, + { + "epoch": 2.3832218844984805, + "grad_norm": 0.07009430414343244, + "learning_rate": 3.5250044374388605e-06, + "loss": 0.4847, + "step": 4904 + }, + { + "epoch": 2.38370820668693, + "grad_norm": 0.0718443944175079, + "learning_rate": 3.5231750819465633e-06, + "loss": 0.4914, + "step": 4905 + }, + { + "epoch": 2.38419452887538, + "grad_norm": 0.07108323576989391, + "learning_rate": 3.5213459430138697e-06, + "loss": 0.523, + "step": 4906 + }, + { + "epoch": 2.3846808510638295, + "grad_norm": 0.07208729365718768, + "learning_rate": 3.5195170209090026e-06, + "loss": 0.5257, + "step": 4907 + }, + { + "epoch": 2.3851671732522797, + "grad_norm": 0.07269320932049202, + "learning_rate": 3.5176883159001536e-06, + "loss": 0.5287, + "step": 4908 + }, + { + "epoch": 2.3856534954407294, + "grad_norm": 0.07244691989592308, + "learning_rate": 3.515859828255479e-06, + "loss": 0.5737, + "step": 4909 + }, + { + "epoch": 2.3861398176291795, + "grad_norm": 0.07276605958379954, + "learning_rate": 3.5140315582431074e-06, + "loss": 0.4934, + "step": 4910 + }, + { + "epoch": 2.386626139817629, + "grad_norm": 0.06961523820488281, + "learning_rate": 3.512203506131133e-06, + "loss": 0.5142, + "step": 4911 + }, + { + "epoch": 2.387112462006079, + "grad_norm": 0.07414438917471289, + "learning_rate": 3.510375672187617e-06, + "loss": 0.5415, + "step": 4912 + }, + { + "epoch": 2.387598784194529, + "grad_norm": 0.07097672058797641, + "learning_rate": 3.5085480566805963e-06, + "loss": 0.5366, + "step": 4913 + }, + { + "epoch": 2.3880851063829787, + "grad_norm": 0.07018587035995108, + "learning_rate": 3.5067206598780656e-06, + "loss": 0.4792, + "step": 4914 + }, + { + "epoch": 2.388571428571429, + "grad_norm": 0.07188431897401468, + "learning_rate": 3.504893482047993e-06, + "loss": 0.5199, + "step": 4915 + }, + { + "epoch": 2.3890577507598785, + "grad_norm": 0.07393089822267285, + "learning_rate": 3.503066523458313e-06, + "loss": 0.5482, + "step": 4916 + }, + { + "epoch": 2.389544072948328, + "grad_norm": 0.0788267512243813, + "learning_rate": 3.5012397843769287e-06, + "loss": 0.5528, + "step": 4917 + }, + { + "epoch": 2.3900303951367783, + "grad_norm": 0.07172982882729341, + "learning_rate": 3.4994132650717107e-06, + "loss": 0.52, + "step": 4918 + }, + { + "epoch": 2.390516717325228, + "grad_norm": 0.07119375132893335, + "learning_rate": 3.4975869658104964e-06, + "loss": 0.5413, + "step": 4919 + }, + { + "epoch": 2.3910030395136777, + "grad_norm": 0.07489557955345133, + "learning_rate": 3.495760886861093e-06, + "loss": 0.5593, + "step": 4920 + }, + { + "epoch": 2.391489361702128, + "grad_norm": 0.07352003216737586, + "learning_rate": 3.4939350284912737e-06, + "loss": 0.5026, + "step": 4921 + }, + { + "epoch": 2.3919756838905775, + "grad_norm": 0.07409139572085124, + "learning_rate": 3.4921093909687808e-06, + "loss": 0.5302, + "step": 4922 + }, + { + "epoch": 2.392462006079027, + "grad_norm": 0.07052214217858034, + "learning_rate": 3.490283974561322e-06, + "loss": 0.5108, + "step": 4923 + }, + { + "epoch": 2.3929483282674773, + "grad_norm": 0.08132424318787977, + "learning_rate": 3.4884587795365744e-06, + "loss": 0.5927, + "step": 4924 + }, + { + "epoch": 2.393434650455927, + "grad_norm": 0.07116737077954373, + "learning_rate": 3.486633806162181e-06, + "loss": 0.4948, + "step": 4925 + }, + { + "epoch": 2.393920972644377, + "grad_norm": 0.07142369209504221, + "learning_rate": 3.4848090547057556e-06, + "loss": 0.5379, + "step": 4926 + }, + { + "epoch": 2.3944072948328268, + "grad_norm": 0.0719390472744797, + "learning_rate": 3.482984525434876e-06, + "loss": 0.4995, + "step": 4927 + }, + { + "epoch": 2.3948936170212765, + "grad_norm": 0.07081053458427171, + "learning_rate": 3.4811602186170886e-06, + "loss": 0.5295, + "step": 4928 + }, + { + "epoch": 2.3953799392097266, + "grad_norm": 0.07064728634122945, + "learning_rate": 3.4793361345199074e-06, + "loss": 0.518, + "step": 4929 + }, + { + "epoch": 2.3958662613981763, + "grad_norm": 0.06928785824615319, + "learning_rate": 3.4775122734108128e-06, + "loss": 0.5024, + "step": 4930 + }, + { + "epoch": 2.396352583586626, + "grad_norm": 0.07401725184601242, + "learning_rate": 3.475688635557256e-06, + "loss": 0.5738, + "step": 4931 + }, + { + "epoch": 2.396838905775076, + "grad_norm": 0.07304477812217713, + "learning_rate": 3.4738652212266506e-06, + "loss": 0.5237, + "step": 4932 + }, + { + "epoch": 2.3973252279635258, + "grad_norm": 0.06998979315027283, + "learning_rate": 3.47204203068638e-06, + "loss": 0.5012, + "step": 4933 + }, + { + "epoch": 2.3978115501519754, + "grad_norm": 0.07150652298146956, + "learning_rate": 3.470219064203795e-06, + "loss": 0.5106, + "step": 4934 + }, + { + "epoch": 2.3982978723404256, + "grad_norm": 0.07192059209215543, + "learning_rate": 3.4683963220462113e-06, + "loss": 0.524, + "step": 4935 + }, + { + "epoch": 2.3987841945288753, + "grad_norm": 0.07068771005942975, + "learning_rate": 3.4665738044809155e-06, + "loss": 0.5188, + "step": 4936 + }, + { + "epoch": 2.3992705167173254, + "grad_norm": 0.07204779497860687, + "learning_rate": 3.4647515117751586e-06, + "loss": 0.556, + "step": 4937 + }, + { + "epoch": 2.399756838905775, + "grad_norm": 0.06956834440228282, + "learning_rate": 3.462929444196158e-06, + "loss": 0.5242, + "step": 4938 + }, + { + "epoch": 2.4002431610942248, + "grad_norm": 0.07463770657647964, + "learning_rate": 3.4611076020110996e-06, + "loss": 0.5506, + "step": 4939 + }, + { + "epoch": 2.400729483282675, + "grad_norm": 0.07148630675309851, + "learning_rate": 3.4592859854871362e-06, + "loss": 0.5102, + "step": 4940 + }, + { + "epoch": 2.4012158054711246, + "grad_norm": 0.07150810813784506, + "learning_rate": 3.4574645948913866e-06, + "loss": 0.5262, + "step": 4941 + }, + { + "epoch": 2.4017021276595747, + "grad_norm": 0.07196031898359004, + "learning_rate": 3.455643430490938e-06, + "loss": 0.5232, + "step": 4942 + }, + { + "epoch": 2.4021884498480244, + "grad_norm": 0.07150618340785458, + "learning_rate": 3.453822492552843e-06, + "loss": 0.5271, + "step": 4943 + }, + { + "epoch": 2.402674772036474, + "grad_norm": 0.06959454167437171, + "learning_rate": 3.452001781344121e-06, + "loss": 0.5274, + "step": 4944 + }, + { + "epoch": 2.403161094224924, + "grad_norm": 0.07749890209793345, + "learning_rate": 3.4501812971317596e-06, + "loss": 0.5579, + "step": 4945 + }, + { + "epoch": 2.403647416413374, + "grad_norm": 0.06986649260752789, + "learning_rate": 3.448361040182712e-06, + "loss": 0.5274, + "step": 4946 + }, + { + "epoch": 2.4041337386018236, + "grad_norm": 0.07553775535189526, + "learning_rate": 3.4465410107638974e-06, + "loss": 0.5021, + "step": 4947 + }, + { + "epoch": 2.4046200607902737, + "grad_norm": 0.07298953312805401, + "learning_rate": 3.444721209142201e-06, + "loss": 0.5056, + "step": 4948 + }, + { + "epoch": 2.4051063829787234, + "grad_norm": 0.07184657536314888, + "learning_rate": 3.442901635584479e-06, + "loss": 0.4956, + "step": 4949 + }, + { + "epoch": 2.405592705167173, + "grad_norm": 0.0704641027560034, + "learning_rate": 3.4410822903575516e-06, + "loss": 0.4979, + "step": 4950 + }, + { + "epoch": 2.406079027355623, + "grad_norm": 0.07006671870344676, + "learning_rate": 3.4392631737282022e-06, + "loss": 0.4962, + "step": 4951 + }, + { + "epoch": 2.406565349544073, + "grad_norm": 0.07273848194328271, + "learning_rate": 3.437444285963187e-06, + "loss": 0.5265, + "step": 4952 + }, + { + "epoch": 2.407051671732523, + "grad_norm": 0.07130091640710873, + "learning_rate": 3.4356256273292215e-06, + "loss": 0.5506, + "step": 4953 + }, + { + "epoch": 2.4075379939209727, + "grad_norm": 0.07309498937330876, + "learning_rate": 3.4338071980929933e-06, + "loss": 0.5436, + "step": 4954 + }, + { + "epoch": 2.4080243161094224, + "grad_norm": 0.07631879586998074, + "learning_rate": 3.431988998521155e-06, + "loss": 0.5264, + "step": 4955 + }, + { + "epoch": 2.4085106382978725, + "grad_norm": 0.07013709319346881, + "learning_rate": 3.430171028880323e-06, + "loss": 0.4985, + "step": 4956 + }, + { + "epoch": 2.408996960486322, + "grad_norm": 0.06959931044153714, + "learning_rate": 3.428353289437084e-06, + "loss": 0.5184, + "step": 4957 + }, + { + "epoch": 2.409483282674772, + "grad_norm": 0.0719716497037636, + "learning_rate": 3.426535780457987e-06, + "loss": 0.5339, + "step": 4958 + }, + { + "epoch": 2.409969604863222, + "grad_norm": 0.0695659600629915, + "learning_rate": 3.424718502209551e-06, + "loss": 0.4859, + "step": 4959 + }, + { + "epoch": 2.4104559270516717, + "grad_norm": 0.07241069862462253, + "learning_rate": 3.4229014549582567e-06, + "loss": 0.5625, + "step": 4960 + }, + { + "epoch": 2.4109422492401213, + "grad_norm": 0.07440275495385786, + "learning_rate": 3.4210846389705567e-06, + "loss": 0.5551, + "step": 4961 + }, + { + "epoch": 2.4114285714285715, + "grad_norm": 0.07512386055939603, + "learning_rate": 3.4192680545128636e-06, + "loss": 0.5225, + "step": 4962 + }, + { + "epoch": 2.411914893617021, + "grad_norm": 0.07366556776583193, + "learning_rate": 3.4174517018515603e-06, + "loss": 0.523, + "step": 4963 + }, + { + "epoch": 2.4124012158054713, + "grad_norm": 0.07229718128615438, + "learning_rate": 3.415635581252993e-06, + "loss": 0.5142, + "step": 4964 + }, + { + "epoch": 2.412887537993921, + "grad_norm": 0.07078196288655379, + "learning_rate": 3.4138196929834765e-06, + "loss": 0.5106, + "step": 4965 + }, + { + "epoch": 2.4133738601823707, + "grad_norm": 0.07281030570025167, + "learning_rate": 3.4120040373092876e-06, + "loss": 0.5323, + "step": 4966 + }, + { + "epoch": 2.413860182370821, + "grad_norm": 0.0727869988770298, + "learning_rate": 3.4101886144966772e-06, + "loss": 0.5096, + "step": 4967 + }, + { + "epoch": 2.4143465045592705, + "grad_norm": 0.07109123264772012, + "learning_rate": 3.4083734248118514e-06, + "loss": 0.5496, + "step": 4968 + }, + { + "epoch": 2.4148328267477206, + "grad_norm": 0.07372158280934651, + "learning_rate": 3.4065584685209895e-06, + "loss": 0.5315, + "step": 4969 + }, + { + "epoch": 2.4153191489361703, + "grad_norm": 0.07052926961665368, + "learning_rate": 3.4047437458902333e-06, + "loss": 0.5259, + "step": 4970 + }, + { + "epoch": 2.41580547112462, + "grad_norm": 0.07303967664621815, + "learning_rate": 3.402929257185691e-06, + "loss": 0.5662, + "step": 4971 + }, + { + "epoch": 2.41629179331307, + "grad_norm": 0.0705263464176145, + "learning_rate": 3.4011150026734373e-06, + "loss": 0.5378, + "step": 4972 + }, + { + "epoch": 2.4167781155015198, + "grad_norm": 0.07115516616977438, + "learning_rate": 3.3993009826195116e-06, + "loss": 0.5499, + "step": 4973 + }, + { + "epoch": 2.4172644376899695, + "grad_norm": 0.073584321003851, + "learning_rate": 3.3974871972899204e-06, + "loss": 0.5768, + "step": 4974 + }, + { + "epoch": 2.4177507598784196, + "grad_norm": 0.07267100038340704, + "learning_rate": 3.3956736469506334e-06, + "loss": 0.5467, + "step": 4975 + }, + { + "epoch": 2.4182370820668693, + "grad_norm": 0.06841099298058952, + "learning_rate": 3.3938603318675888e-06, + "loss": 0.4699, + "step": 4976 + }, + { + "epoch": 2.418723404255319, + "grad_norm": 0.07093807119191846, + "learning_rate": 3.392047252306687e-06, + "loss": 0.5135, + "step": 4977 + }, + { + "epoch": 2.419209726443769, + "grad_norm": 0.07208415886449852, + "learning_rate": 3.3902344085337956e-06, + "loss": 0.5242, + "step": 4978 + }, + { + "epoch": 2.4196960486322188, + "grad_norm": 0.07117623067278844, + "learning_rate": 3.3884218008147486e-06, + "loss": 0.5066, + "step": 4979 + }, + { + "epoch": 2.420182370820669, + "grad_norm": 0.07269038638145053, + "learning_rate": 3.3866094294153436e-06, + "loss": 0.5248, + "step": 4980 + }, + { + "epoch": 2.4206686930091186, + "grad_norm": 0.07631746466793166, + "learning_rate": 3.384797294601344e-06, + "loss": 0.5348, + "step": 4981 + }, + { + "epoch": 2.4211550151975683, + "grad_norm": 0.07027250038154988, + "learning_rate": 3.3829853966384803e-06, + "loss": 0.5232, + "step": 4982 + }, + { + "epoch": 2.4216413373860184, + "grad_norm": 0.06996966005597284, + "learning_rate": 3.381173735792445e-06, + "loss": 0.5241, + "step": 4983 + }, + { + "epoch": 2.422127659574468, + "grad_norm": 0.07012278863799731, + "learning_rate": 3.379362312328899e-06, + "loss": 0.5093, + "step": 4984 + }, + { + "epoch": 2.4226139817629178, + "grad_norm": 0.06935591945197173, + "learning_rate": 3.3775511265134646e-06, + "loss": 0.4929, + "step": 4985 + }, + { + "epoch": 2.423100303951368, + "grad_norm": 0.07399464231282409, + "learning_rate": 3.375740178611735e-06, + "loss": 0.5702, + "step": 4986 + }, + { + "epoch": 2.4235866261398176, + "grad_norm": 0.07498828428271959, + "learning_rate": 3.3739294688892632e-06, + "loss": 0.5202, + "step": 4987 + }, + { + "epoch": 2.4240729483282673, + "grad_norm": 0.072819413162072, + "learning_rate": 3.3721189976115693e-06, + "loss": 0.5192, + "step": 4988 + }, + { + "epoch": 2.4245592705167174, + "grad_norm": 0.07150380805813271, + "learning_rate": 3.370308765044139e-06, + "loss": 0.5245, + "step": 4989 + }, + { + "epoch": 2.425045592705167, + "grad_norm": 0.07105783789227622, + "learning_rate": 3.368498771452422e-06, + "loss": 0.5448, + "step": 4990 + }, + { + "epoch": 2.425531914893617, + "grad_norm": 0.07133550615854353, + "learning_rate": 3.366689017101834e-06, + "loss": 0.5342, + "step": 4991 + }, + { + "epoch": 2.426018237082067, + "grad_norm": 0.07275842003869201, + "learning_rate": 3.364879502257753e-06, + "loss": 0.5638, + "step": 4992 + }, + { + "epoch": 2.4265045592705166, + "grad_norm": 0.072649123807271, + "learning_rate": 3.3630702271855253e-06, + "loss": 0.5288, + "step": 4993 + }, + { + "epoch": 2.4269908814589667, + "grad_norm": 0.07027449399870422, + "learning_rate": 3.36126119215046e-06, + "loss": 0.5316, + "step": 4994 + }, + { + "epoch": 2.4274772036474164, + "grad_norm": 0.07371908724534923, + "learning_rate": 3.359452397417832e-06, + "loss": 0.525, + "step": 4995 + }, + { + "epoch": 2.4279635258358665, + "grad_norm": 0.07297046365843625, + "learning_rate": 3.35764384325288e-06, + "loss": 0.5412, + "step": 4996 + }, + { + "epoch": 2.428449848024316, + "grad_norm": 0.07071125744170881, + "learning_rate": 3.355835529920808e-06, + "loss": 0.5394, + "step": 4997 + }, + { + "epoch": 2.428936170212766, + "grad_norm": 0.07198964939283464, + "learning_rate": 3.3540274576867853e-06, + "loss": 0.5462, + "step": 4998 + }, + { + "epoch": 2.429422492401216, + "grad_norm": 0.07044944977639393, + "learning_rate": 3.3522196268159444e-06, + "loss": 0.5286, + "step": 4999 + }, + { + "epoch": 2.4299088145896657, + "grad_norm": 0.07203293701028296, + "learning_rate": 3.350412037573385e-06, + "loss": 0.5473, + "step": 5000 + }, + { + "epoch": 2.4303951367781154, + "grad_norm": 0.07302044938806027, + "learning_rate": 3.3486046902241663e-06, + "loss": 0.5592, + "step": 5001 + }, + { + "epoch": 2.4308814589665655, + "grad_norm": 0.07182251391475623, + "learning_rate": 3.3467975850333167e-06, + "loss": 0.5367, + "step": 5002 + }, + { + "epoch": 2.431367781155015, + "grad_norm": 0.07281607973738147, + "learning_rate": 3.3449907222658266e-06, + "loss": 0.5404, + "step": 5003 + }, + { + "epoch": 2.431854103343465, + "grad_norm": 0.07243085781205838, + "learning_rate": 3.3431841021866553e-06, + "loss": 0.5515, + "step": 5004 + }, + { + "epoch": 2.432340425531915, + "grad_norm": 0.07008303138680833, + "learning_rate": 3.3413777250607215e-06, + "loss": 0.5287, + "step": 5005 + }, + { + "epoch": 2.4328267477203647, + "grad_norm": 0.07354579843332798, + "learning_rate": 3.3395715911529087e-06, + "loss": 0.5625, + "step": 5006 + }, + { + "epoch": 2.433313069908815, + "grad_norm": 0.07262924032431435, + "learning_rate": 3.337765700728066e-06, + "loss": 0.5351, + "step": 5007 + }, + { + "epoch": 2.4337993920972645, + "grad_norm": 0.0706884573910302, + "learning_rate": 3.3359600540510084e-06, + "loss": 0.5038, + "step": 5008 + }, + { + "epoch": 2.434285714285714, + "grad_norm": 0.07335802698683956, + "learning_rate": 3.334154651386512e-06, + "loss": 0.5349, + "step": 5009 + }, + { + "epoch": 2.4347720364741643, + "grad_norm": 0.0736114455498874, + "learning_rate": 3.3323494929993187e-06, + "loss": 0.5226, + "step": 5010 + }, + { + "epoch": 2.435258358662614, + "grad_norm": 0.07106527596001265, + "learning_rate": 3.330544579154135e-06, + "loss": 0.5295, + "step": 5011 + }, + { + "epoch": 2.4357446808510637, + "grad_norm": 0.07007541160356795, + "learning_rate": 3.3287399101156316e-06, + "loss": 0.5093, + "step": 5012 + }, + { + "epoch": 2.436231003039514, + "grad_norm": 0.071585998365469, + "learning_rate": 3.326935486148441e-06, + "loss": 0.5345, + "step": 5013 + }, + { + "epoch": 2.4367173252279635, + "grad_norm": 0.07211180696172437, + "learning_rate": 3.325131307517163e-06, + "loss": 0.4913, + "step": 5014 + }, + { + "epoch": 2.437203647416413, + "grad_norm": 0.07239889854364302, + "learning_rate": 3.3233273744863604e-06, + "loss": 0.5672, + "step": 5015 + }, + { + "epoch": 2.4376899696048633, + "grad_norm": 0.0743708532124169, + "learning_rate": 3.321523687320557e-06, + "loss": 0.5348, + "step": 5016 + }, + { + "epoch": 2.438176291793313, + "grad_norm": 0.06834174755171417, + "learning_rate": 3.319720246284245e-06, + "loss": 0.5057, + "step": 5017 + }, + { + "epoch": 2.438662613981763, + "grad_norm": 0.07168753302618132, + "learning_rate": 3.3179170516418766e-06, + "loss": 0.5023, + "step": 5018 + }, + { + "epoch": 2.439148936170213, + "grad_norm": 0.07021648883197688, + "learning_rate": 3.316114103657873e-06, + "loss": 0.5042, + "step": 5019 + }, + { + "epoch": 2.4396352583586625, + "grad_norm": 0.07298538780128841, + "learning_rate": 3.314311402596614e-06, + "loss": 0.5258, + "step": 5020 + }, + { + "epoch": 2.4401215805471126, + "grad_norm": 0.07083119018124148, + "learning_rate": 3.3125089487224436e-06, + "loss": 0.4902, + "step": 5021 + }, + { + "epoch": 2.4406079027355623, + "grad_norm": 0.073575205324048, + "learning_rate": 3.310706742299675e-06, + "loss": 0.5171, + "step": 5022 + }, + { + "epoch": 2.4410942249240124, + "grad_norm": 0.07271595891577688, + "learning_rate": 3.308904783592579e-06, + "loss": 0.5509, + "step": 5023 + }, + { + "epoch": 2.441580547112462, + "grad_norm": 0.07343510789440008, + "learning_rate": 3.307103072865393e-06, + "loss": 0.5367, + "step": 5024 + }, + { + "epoch": 2.4420668693009118, + "grad_norm": 0.06847101097932697, + "learning_rate": 3.3053016103823177e-06, + "loss": 0.5095, + "step": 5025 + }, + { + "epoch": 2.4425531914893615, + "grad_norm": 0.0714349311736113, + "learning_rate": 3.3035003964075164e-06, + "loss": 0.5518, + "step": 5026 + }, + { + "epoch": 2.4430395136778116, + "grad_norm": 0.07292549049261239, + "learning_rate": 3.3016994312051165e-06, + "loss": 0.5505, + "step": 5027 + }, + { + "epoch": 2.4435258358662613, + "grad_norm": 0.0715769313065459, + "learning_rate": 3.2998987150392105e-06, + "loss": 0.5145, + "step": 5028 + }, + { + "epoch": 2.4440121580547114, + "grad_norm": 0.07106215780346531, + "learning_rate": 3.298098248173852e-06, + "loss": 0.4977, + "step": 5029 + }, + { + "epoch": 2.444498480243161, + "grad_norm": 0.0719648408348036, + "learning_rate": 3.2962980308730584e-06, + "loss": 0.5354, + "step": 5030 + }, + { + "epoch": 2.4449848024316108, + "grad_norm": 0.07383354532637519, + "learning_rate": 3.2944980634008116e-06, + "loss": 0.5278, + "step": 5031 + }, + { + "epoch": 2.445471124620061, + "grad_norm": 0.0681992386879797, + "learning_rate": 3.2926983460210564e-06, + "loss": 0.4934, + "step": 5032 + }, + { + "epoch": 2.4459574468085106, + "grad_norm": 0.07072753816425954, + "learning_rate": 3.2908988789977015e-06, + "loss": 0.4894, + "step": 5033 + }, + { + "epoch": 2.4464437689969607, + "grad_norm": 0.07487692419696054, + "learning_rate": 3.2890996625946182e-06, + "loss": 0.5401, + "step": 5034 + }, + { + "epoch": 2.4469300911854104, + "grad_norm": 0.07705752697122643, + "learning_rate": 3.2873006970756398e-06, + "loss": 0.5424, + "step": 5035 + }, + { + "epoch": 2.44741641337386, + "grad_norm": 0.07552299366795687, + "learning_rate": 3.2855019827045657e-06, + "loss": 0.5661, + "step": 5036 + }, + { + "epoch": 2.44790273556231, + "grad_norm": 0.07102483254120338, + "learning_rate": 3.2837035197451562e-06, + "loss": 0.5162, + "step": 5037 + }, + { + "epoch": 2.44838905775076, + "grad_norm": 0.07024705567037107, + "learning_rate": 3.2819053084611362e-06, + "loss": 0.5253, + "step": 5038 + }, + { + "epoch": 2.4488753799392096, + "grad_norm": 0.07306052835846437, + "learning_rate": 3.280107349116191e-06, + "loss": 0.5271, + "step": 5039 + }, + { + "epoch": 2.4493617021276597, + "grad_norm": 0.07307510661801636, + "learning_rate": 3.2783096419739737e-06, + "loss": 0.5195, + "step": 5040 + }, + { + "epoch": 2.4498480243161094, + "grad_norm": 0.0752984637950309, + "learning_rate": 3.2765121872980965e-06, + "loss": 0.568, + "step": 5041 + }, + { + "epoch": 2.450334346504559, + "grad_norm": 0.07346277918700476, + "learning_rate": 3.2747149853521347e-06, + "loss": 0.5533, + "step": 5042 + }, + { + "epoch": 2.450820668693009, + "grad_norm": 0.07267036618745515, + "learning_rate": 3.2729180363996295e-06, + "loss": 0.5318, + "step": 5043 + }, + { + "epoch": 2.451306990881459, + "grad_norm": 0.0710627941839871, + "learning_rate": 3.271121340704082e-06, + "loss": 0.515, + "step": 5044 + }, + { + "epoch": 2.451793313069909, + "grad_norm": 0.07276987587388119, + "learning_rate": 3.269324898528956e-06, + "loss": 0.552, + "step": 5045 + }, + { + "epoch": 2.4522796352583587, + "grad_norm": 0.06922275148956468, + "learning_rate": 3.2675287101376816e-06, + "loss": 0.5152, + "step": 5046 + }, + { + "epoch": 2.4527659574468084, + "grad_norm": 0.07359596710512181, + "learning_rate": 3.2657327757936473e-06, + "loss": 0.5498, + "step": 5047 + }, + { + "epoch": 2.4532522796352585, + "grad_norm": 0.07195999137370987, + "learning_rate": 3.263937095760208e-06, + "loss": 0.565, + "step": 5048 + }, + { + "epoch": 2.453738601823708, + "grad_norm": 0.07642688500893463, + "learning_rate": 3.262141670300679e-06, + "loss": 0.5306, + "step": 5049 + }, + { + "epoch": 2.4542249240121583, + "grad_norm": 0.0726630718418881, + "learning_rate": 3.26034649967834e-06, + "loss": 0.5133, + "step": 5050 + }, + { + "epoch": 2.454711246200608, + "grad_norm": 0.07331577238739899, + "learning_rate": 3.258551584156432e-06, + "loss": 0.5264, + "step": 5051 + }, + { + "epoch": 2.4551975683890577, + "grad_norm": 0.06953850791873235, + "learning_rate": 3.2567569239981576e-06, + "loss": 0.5223, + "step": 5052 + }, + { + "epoch": 2.4556838905775074, + "grad_norm": 0.06972153226455745, + "learning_rate": 3.254962519466686e-06, + "loss": 0.5019, + "step": 5053 + }, + { + "epoch": 2.4561702127659575, + "grad_norm": 0.0724929681390814, + "learning_rate": 3.2531683708251438e-06, + "loss": 0.5096, + "step": 5054 + }, + { + "epoch": 2.456656534954407, + "grad_norm": 0.07122869359278075, + "learning_rate": 3.251374478336623e-06, + "loss": 0.5232, + "step": 5055 + }, + { + "epoch": 2.4571428571428573, + "grad_norm": 0.0729856920840314, + "learning_rate": 3.2495808422641785e-06, + "loss": 0.5525, + "step": 5056 + }, + { + "epoch": 2.457629179331307, + "grad_norm": 0.07464012492086165, + "learning_rate": 3.247787462870824e-06, + "loss": 0.494, + "step": 5057 + }, + { + "epoch": 2.4581155015197567, + "grad_norm": 0.07161393532190671, + "learning_rate": 3.2459943404195428e-06, + "loss": 0.5134, + "step": 5058 + }, + { + "epoch": 2.458601823708207, + "grad_norm": 0.07764407501003323, + "learning_rate": 3.2442014751732735e-06, + "loss": 0.6107, + "step": 5059 + }, + { + "epoch": 2.4590881458966565, + "grad_norm": 0.07226747955452442, + "learning_rate": 3.2424088673949195e-06, + "loss": 0.529, + "step": 5060 + }, + { + "epoch": 2.4595744680851066, + "grad_norm": 0.07234355836589856, + "learning_rate": 3.240616517347346e-06, + "loss": 0.5185, + "step": 5061 + }, + { + "epoch": 2.4600607902735563, + "grad_norm": 0.07122928197953035, + "learning_rate": 3.2388244252933802e-06, + "loss": 0.5174, + "step": 5062 + }, + { + "epoch": 2.460547112462006, + "grad_norm": 0.07105231715013642, + "learning_rate": 3.237032591495814e-06, + "loss": 0.5169, + "step": 5063 + }, + { + "epoch": 2.461033434650456, + "grad_norm": 0.07324041315179579, + "learning_rate": 3.235241016217398e-06, + "loss": 0.536, + "step": 5064 + }, + { + "epoch": 2.461519756838906, + "grad_norm": 0.07262187301769511, + "learning_rate": 3.233449699720847e-06, + "loss": 0.5509, + "step": 5065 + }, + { + "epoch": 2.4620060790273555, + "grad_norm": 0.0718986128087145, + "learning_rate": 3.231658642268837e-06, + "loss": 0.5369, + "step": 5066 + }, + { + "epoch": 2.4624924012158056, + "grad_norm": 0.07194898261899282, + "learning_rate": 3.229867844124006e-06, + "loss": 0.546, + "step": 5067 + }, + { + "epoch": 2.4629787234042553, + "grad_norm": 0.0714987692882125, + "learning_rate": 3.2280773055489563e-06, + "loss": 0.5336, + "step": 5068 + }, + { + "epoch": 2.463465045592705, + "grad_norm": 0.07049700257498381, + "learning_rate": 3.2262870268062463e-06, + "loss": 0.4797, + "step": 5069 + }, + { + "epoch": 2.463951367781155, + "grad_norm": 0.07366350993223539, + "learning_rate": 3.2244970081584027e-06, + "loss": 0.5218, + "step": 5070 + }, + { + "epoch": 2.4644376899696048, + "grad_norm": 0.07430582458980749, + "learning_rate": 3.22270724986791e-06, + "loss": 0.5368, + "step": 5071 + }, + { + "epoch": 2.464924012158055, + "grad_norm": 0.07194807492022116, + "learning_rate": 3.2209177521972168e-06, + "loss": 0.528, + "step": 5072 + }, + { + "epoch": 2.4654103343465046, + "grad_norm": 0.07572255353920243, + "learning_rate": 3.219128515408733e-06, + "loss": 0.5637, + "step": 5073 + }, + { + "epoch": 2.4658966565349543, + "grad_norm": 0.07160160451857686, + "learning_rate": 3.217339539764829e-06, + "loss": 0.5682, + "step": 5074 + }, + { + "epoch": 2.4663829787234044, + "grad_norm": 0.07560677368806716, + "learning_rate": 3.215550825527836e-06, + "loss": 0.5623, + "step": 5075 + }, + { + "epoch": 2.466869300911854, + "grad_norm": 0.0695743038537317, + "learning_rate": 3.2137623729600533e-06, + "loss": 0.5156, + "step": 5076 + }, + { + "epoch": 2.4673556231003038, + "grad_norm": 0.07234252817302601, + "learning_rate": 3.211974182323733e-06, + "loss": 0.5134, + "step": 5077 + }, + { + "epoch": 2.467841945288754, + "grad_norm": 0.0698089297650444, + "learning_rate": 3.2101862538810957e-06, + "loss": 0.4844, + "step": 5078 + }, + { + "epoch": 2.4683282674772036, + "grad_norm": 0.07153891775507754, + "learning_rate": 3.208398587894319e-06, + "loss": 0.5109, + "step": 5079 + }, + { + "epoch": 2.4688145896656533, + "grad_norm": 0.07403346441706771, + "learning_rate": 3.2066111846255443e-06, + "loss": 0.5449, + "step": 5080 + }, + { + "epoch": 2.4693009118541034, + "grad_norm": 0.07548148533572237, + "learning_rate": 3.2048240443368745e-06, + "loss": 0.5715, + "step": 5081 + }, + { + "epoch": 2.469787234042553, + "grad_norm": 0.07185172430860484, + "learning_rate": 3.2030371672903725e-06, + "loss": 0.5198, + "step": 5082 + }, + { + "epoch": 2.470273556231003, + "grad_norm": 0.07409698546697403, + "learning_rate": 3.2012505537480655e-06, + "loss": 0.5346, + "step": 5083 + }, + { + "epoch": 2.470759878419453, + "grad_norm": 0.07106574003718143, + "learning_rate": 3.199464203971938e-06, + "loss": 0.5071, + "step": 5084 + }, + { + "epoch": 2.4712462006079026, + "grad_norm": 0.07129559196503259, + "learning_rate": 3.197678118223938e-06, + "loss": 0.5567, + "step": 5085 + }, + { + "epoch": 2.4717325227963527, + "grad_norm": 0.07257593060183752, + "learning_rate": 3.1958922967659755e-06, + "loss": 0.5357, + "step": 5086 + }, + { + "epoch": 2.4722188449848024, + "grad_norm": 0.0709990155023218, + "learning_rate": 3.19410673985992e-06, + "loss": 0.5234, + "step": 5087 + }, + { + "epoch": 2.4727051671732525, + "grad_norm": 0.07290586660681272, + "learning_rate": 3.1923214477676044e-06, + "loss": 0.5155, + "step": 5088 + }, + { + "epoch": 2.473191489361702, + "grad_norm": 0.06971042862182966, + "learning_rate": 3.190536420750821e-06, + "loss": 0.4969, + "step": 5089 + }, + { + "epoch": 2.473677811550152, + "grad_norm": 0.07188004896681896, + "learning_rate": 3.1887516590713235e-06, + "loss": 0.5263, + "step": 5090 + }, + { + "epoch": 2.474164133738602, + "grad_norm": 0.07224013653999033, + "learning_rate": 3.186967162990827e-06, + "loss": 0.5217, + "step": 5091 + }, + { + "epoch": 2.4746504559270517, + "grad_norm": 0.07357100791819154, + "learning_rate": 3.185182932771009e-06, + "loss": 0.5399, + "step": 5092 + }, + { + "epoch": 2.4751367781155014, + "grad_norm": 0.06976210807632803, + "learning_rate": 3.1833989686735046e-06, + "loss": 0.5169, + "step": 5093 + }, + { + "epoch": 2.4756231003039515, + "grad_norm": 0.07201308193067005, + "learning_rate": 3.1816152709599097e-06, + "loss": 0.5583, + "step": 5094 + }, + { + "epoch": 2.476109422492401, + "grad_norm": 0.07089292305447724, + "learning_rate": 3.179831839891788e-06, + "loss": 0.5247, + "step": 5095 + }, + { + "epoch": 2.476595744680851, + "grad_norm": 0.07262174947901902, + "learning_rate": 3.178048675730659e-06, + "loss": 0.5586, + "step": 5096 + }, + { + "epoch": 2.477082066869301, + "grad_norm": 0.07037491773073425, + "learning_rate": 3.1762657787380026e-06, + "loss": 0.4807, + "step": 5097 + }, + { + "epoch": 2.4775683890577507, + "grad_norm": 0.0678355792009076, + "learning_rate": 3.1744831491752583e-06, + "loss": 0.4794, + "step": 5098 + }, + { + "epoch": 2.478054711246201, + "grad_norm": 0.0697889555240722, + "learning_rate": 3.17270078730383e-06, + "loss": 0.4906, + "step": 5099 + }, + { + "epoch": 2.4785410334346505, + "grad_norm": 0.07174063352807762, + "learning_rate": 3.170918693385081e-06, + "loss": 0.5244, + "step": 5100 + }, + { + "epoch": 2.4790273556231, + "grad_norm": 0.0751389700518392, + "learning_rate": 3.169136867680336e-06, + "loss": 0.5608, + "step": 5101 + }, + { + "epoch": 2.4795136778115503, + "grad_norm": 0.07063891123769281, + "learning_rate": 3.167355310450877e-06, + "loss": 0.5075, + "step": 5102 + }, + { + "epoch": 2.48, + "grad_norm": 0.07095625005602661, + "learning_rate": 3.165574021957952e-06, + "loss": 0.5066, + "step": 5103 + }, + { + "epoch": 2.4804863221884497, + "grad_norm": 0.07270491841607786, + "learning_rate": 3.1637930024627645e-06, + "loss": 0.549, + "step": 5104 + }, + { + "epoch": 2.4809726443769, + "grad_norm": 0.07046670502839882, + "learning_rate": 3.1620122522264817e-06, + "loss": 0.5219, + "step": 5105 + }, + { + "epoch": 2.4814589665653495, + "grad_norm": 0.07050616523016418, + "learning_rate": 3.160231771510231e-06, + "loss": 0.5171, + "step": 5106 + }, + { + "epoch": 2.481945288753799, + "grad_norm": 0.0775774417380877, + "learning_rate": 3.1584515605750998e-06, + "loss": 0.529, + "step": 5107 + }, + { + "epoch": 2.4824316109422493, + "grad_norm": 0.07238201009234521, + "learning_rate": 3.1566716196821333e-06, + "loss": 0.5343, + "step": 5108 + }, + { + "epoch": 2.482917933130699, + "grad_norm": 0.07084878986794341, + "learning_rate": 3.1548919490923422e-06, + "loss": 0.5585, + "step": 5109 + }, + { + "epoch": 2.483404255319149, + "grad_norm": 0.07032651813525949, + "learning_rate": 3.1531125490666946e-06, + "loss": 0.5074, + "step": 5110 + }, + { + "epoch": 2.483890577507599, + "grad_norm": 0.07263629183764513, + "learning_rate": 3.1513334198661183e-06, + "loss": 0.5074, + "step": 5111 + }, + { + "epoch": 2.4843768996960485, + "grad_norm": 0.07250651862525727, + "learning_rate": 3.149554561751502e-06, + "loss": 0.5153, + "step": 5112 + }, + { + "epoch": 2.4848632218844986, + "grad_norm": 0.07009165594868517, + "learning_rate": 3.1477759749836967e-06, + "loss": 0.5261, + "step": 5113 + }, + { + "epoch": 2.4853495440729483, + "grad_norm": 0.07289543070295175, + "learning_rate": 3.145997659823512e-06, + "loss": 0.5382, + "step": 5114 + }, + { + "epoch": 2.4858358662613984, + "grad_norm": 0.07196283219115769, + "learning_rate": 3.1442196165317164e-06, + "loss": 0.5255, + "step": 5115 + }, + { + "epoch": 2.486322188449848, + "grad_norm": 0.07583836895376056, + "learning_rate": 3.1424418453690402e-06, + "loss": 0.5672, + "step": 5116 + }, + { + "epoch": 2.4868085106382978, + "grad_norm": 0.07547283790207375, + "learning_rate": 3.140664346596174e-06, + "loss": 0.571, + "step": 5117 + }, + { + "epoch": 2.487294832826748, + "grad_norm": 0.07158727011845846, + "learning_rate": 3.1388871204737663e-06, + "loss": 0.5162, + "step": 5118 + }, + { + "epoch": 2.4877811550151976, + "grad_norm": 0.07313306887019055, + "learning_rate": 3.1371101672624283e-06, + "loss": 0.5417, + "step": 5119 + }, + { + "epoch": 2.4882674772036473, + "grad_norm": 0.07831223238599944, + "learning_rate": 3.13533348722273e-06, + "loss": 0.5313, + "step": 5120 + }, + { + "epoch": 2.4887537993920974, + "grad_norm": 0.07192867093833415, + "learning_rate": 3.1335570806152027e-06, + "loss": 0.4988, + "step": 5121 + }, + { + "epoch": 2.489240121580547, + "grad_norm": 0.0705930535697104, + "learning_rate": 3.1317809477003326e-06, + "loss": 0.531, + "step": 5122 + }, + { + "epoch": 2.4897264437689968, + "grad_norm": 0.07185750539686564, + "learning_rate": 3.130005088738572e-06, + "loss": 0.5175, + "step": 5123 + }, + { + "epoch": 2.490212765957447, + "grad_norm": 0.0711568250872684, + "learning_rate": 3.1282295039903297e-06, + "loss": 0.5242, + "step": 5124 + }, + { + "epoch": 2.4906990881458966, + "grad_norm": 0.07012317643905235, + "learning_rate": 3.126454193715975e-06, + "loss": 0.5317, + "step": 5125 + }, + { + "epoch": 2.4911854103343467, + "grad_norm": 0.07077478128219651, + "learning_rate": 3.1246791581758384e-06, + "loss": 0.5095, + "step": 5126 + }, + { + "epoch": 2.4916717325227964, + "grad_norm": 0.07340973295579808, + "learning_rate": 3.1229043976302064e-06, + "loss": 0.5363, + "step": 5127 + }, + { + "epoch": 2.492158054711246, + "grad_norm": 0.07394294213601862, + "learning_rate": 3.1211299123393296e-06, + "loss": 0.5164, + "step": 5128 + }, + { + "epoch": 2.492644376899696, + "grad_norm": 0.07337476372527484, + "learning_rate": 3.1193557025634147e-06, + "loss": 0.554, + "step": 5129 + }, + { + "epoch": 2.493130699088146, + "grad_norm": 0.23507972355249732, + "learning_rate": 3.1175817685626285e-06, + "loss": 0.5848, + "step": 5130 + }, + { + "epoch": 2.4936170212765956, + "grad_norm": 0.07376429044732093, + "learning_rate": 3.1158081105971018e-06, + "loss": 0.5432, + "step": 5131 + }, + { + "epoch": 2.4941033434650457, + "grad_norm": 0.0699875429290197, + "learning_rate": 3.114034728926918e-06, + "loss": 0.5211, + "step": 5132 + }, + { + "epoch": 2.4945896656534954, + "grad_norm": 0.06980976826865469, + "learning_rate": 3.112261623812125e-06, + "loss": 0.5296, + "step": 5133 + }, + { + "epoch": 2.495075987841945, + "grad_norm": 0.07405956990481313, + "learning_rate": 3.1104887955127283e-06, + "loss": 0.5402, + "step": 5134 + }, + { + "epoch": 2.495562310030395, + "grad_norm": 0.07112345591580038, + "learning_rate": 3.108716244288693e-06, + "loss": 0.5428, + "step": 5135 + }, + { + "epoch": 2.496048632218845, + "grad_norm": 0.07000413692993583, + "learning_rate": 3.1069439703999447e-06, + "loss": 0.5325, + "step": 5136 + }, + { + "epoch": 2.496534954407295, + "grad_norm": 0.0736007009386916, + "learning_rate": 3.1051719741063646e-06, + "loss": 0.5433, + "step": 5137 + }, + { + "epoch": 2.4970212765957447, + "grad_norm": 0.07117726808041258, + "learning_rate": 3.103400255667798e-06, + "loss": 0.5111, + "step": 5138 + }, + { + "epoch": 2.4975075987841944, + "grad_norm": 0.07191517886552011, + "learning_rate": 3.101628815344046e-06, + "loss": 0.5238, + "step": 5139 + }, + { + "epoch": 2.4979939209726445, + "grad_norm": 0.07020578696988963, + "learning_rate": 3.099857653394871e-06, + "loss": 0.4881, + "step": 5140 + }, + { + "epoch": 2.4979939209726445, + "eval_loss": 0.5709736347198486, + "eval_runtime": 104.9642, + "eval_samples_per_second": 289.175, + "eval_steps_per_second": 36.155, + "step": 5140 + }, + { + "epoch": 2.498480243161094, + "grad_norm": 0.07165086577672265, + "learning_rate": 3.098086770079993e-06, + "loss": 0.5572, + "step": 5141 + }, + { + "epoch": 2.4989665653495443, + "grad_norm": 0.07483984142746496, + "learning_rate": 3.0963161656590933e-06, + "loss": 0.5512, + "step": 5142 + }, + { + "epoch": 2.499452887537994, + "grad_norm": 0.07300335321382202, + "learning_rate": 3.0945458403918104e-06, + "loss": 0.5444, + "step": 5143 + }, + { + "epoch": 2.4999392097264437, + "grad_norm": 0.07364864241562977, + "learning_rate": 3.0927757945377413e-06, + "loss": 0.5603, + "step": 5144 + }, + { + "epoch": 2.5004255319148934, + "grad_norm": 0.06895337202780867, + "learning_rate": 3.0910060283564454e-06, + "loss": 0.5116, + "step": 5145 + }, + { + "epoch": 2.5009118541033435, + "grad_norm": 0.07236063519921059, + "learning_rate": 3.0892365421074366e-06, + "loss": 0.5501, + "step": 5146 + }, + { + "epoch": 2.501398176291793, + "grad_norm": 0.07020073702375772, + "learning_rate": 3.08746733605019e-06, + "loss": 0.5065, + "step": 5147 + }, + { + "epoch": 2.5018844984802433, + "grad_norm": 0.06964144865939198, + "learning_rate": 3.085698410444139e-06, + "loss": 0.514, + "step": 5148 + }, + { + "epoch": 2.502370820668693, + "grad_norm": 0.07336464063880291, + "learning_rate": 3.083929765548679e-06, + "loss": 0.5401, + "step": 5149 + }, + { + "epoch": 2.5028571428571427, + "grad_norm": 0.0711731885608878, + "learning_rate": 3.0821614016231617e-06, + "loss": 0.5333, + "step": 5150 + }, + { + "epoch": 2.503343465045593, + "grad_norm": 0.07567293270713694, + "learning_rate": 3.0803933189268966e-06, + "loss": 0.5526, + "step": 5151 + }, + { + "epoch": 2.5038297872340425, + "grad_norm": 0.07151002183351457, + "learning_rate": 3.0786255177191515e-06, + "loss": 0.5313, + "step": 5152 + }, + { + "epoch": 2.5043161094224926, + "grad_norm": 0.0723448603993983, + "learning_rate": 3.0768579982591557e-06, + "loss": 0.5107, + "step": 5153 + }, + { + "epoch": 2.5048024316109423, + "grad_norm": 0.07565431318969268, + "learning_rate": 3.0750907608060954e-06, + "loss": 0.5619, + "step": 5154 + }, + { + "epoch": 2.505288753799392, + "grad_norm": 0.0691905260316451, + "learning_rate": 3.0733238056191173e-06, + "loss": 0.5236, + "step": 5155 + }, + { + "epoch": 2.505775075987842, + "grad_norm": 0.07368013747856297, + "learning_rate": 3.0715571329573233e-06, + "loss": 0.5408, + "step": 5156 + }, + { + "epoch": 2.506261398176292, + "grad_norm": 0.07311793676604153, + "learning_rate": 3.0697907430797767e-06, + "loss": 0.5229, + "step": 5157 + }, + { + "epoch": 2.506747720364742, + "grad_norm": 0.0714406976803571, + "learning_rate": 3.068024636245499e-06, + "loss": 0.5179, + "step": 5158 + }, + { + "epoch": 2.5072340425531916, + "grad_norm": 0.0741667379421227, + "learning_rate": 3.0662588127134697e-06, + "loss": 0.5521, + "step": 5159 + }, + { + "epoch": 2.5077203647416413, + "grad_norm": 0.07061477567556528, + "learning_rate": 3.0644932727426275e-06, + "loss": 0.5081, + "step": 5160 + }, + { + "epoch": 2.508206686930091, + "grad_norm": 0.0676538916570248, + "learning_rate": 3.062728016591866e-06, + "loss": 0.4511, + "step": 5161 + }, + { + "epoch": 2.508693009118541, + "grad_norm": 0.06965509800177648, + "learning_rate": 3.0609630445200424e-06, + "loss": 0.5185, + "step": 5162 + }, + { + "epoch": 2.509179331306991, + "grad_norm": 0.0736500319452885, + "learning_rate": 3.0591983567859685e-06, + "loss": 0.5337, + "step": 5163 + }, + { + "epoch": 2.509665653495441, + "grad_norm": 0.073219446833232, + "learning_rate": 3.0574339536484164e-06, + "loss": 0.5016, + "step": 5164 + }, + { + "epoch": 2.5101519756838906, + "grad_norm": 0.07152702686461844, + "learning_rate": 3.055669835366116e-06, + "loss": 0.5276, + "step": 5165 + }, + { + "epoch": 2.5106382978723403, + "grad_norm": 0.07110473566386229, + "learning_rate": 3.053906002197754e-06, + "loss": 0.5638, + "step": 5166 + }, + { + "epoch": 2.5111246200607904, + "grad_norm": 0.0762557165318456, + "learning_rate": 3.0521424544019786e-06, + "loss": 0.544, + "step": 5167 + }, + { + "epoch": 2.51161094224924, + "grad_norm": 0.07305990167429371, + "learning_rate": 3.050379192237393e-06, + "loss": 0.5127, + "step": 5168 + }, + { + "epoch": 2.51209726443769, + "grad_norm": 0.07305388570305144, + "learning_rate": 3.048616215962558e-06, + "loss": 0.5057, + "step": 5169 + }, + { + "epoch": 2.51258358662614, + "grad_norm": 0.07425831814350997, + "learning_rate": 3.0468535258359964e-06, + "loss": 0.5756, + "step": 5170 + }, + { + "epoch": 2.5130699088145896, + "grad_norm": 0.07036315899760799, + "learning_rate": 3.045091122116186e-06, + "loss": 0.5124, + "step": 5171 + }, + { + "epoch": 2.5135562310030393, + "grad_norm": 0.07222052317843929, + "learning_rate": 3.0433290050615626e-06, + "loss": 0.5526, + "step": 5172 + }, + { + "epoch": 2.5140425531914894, + "grad_norm": 0.07202971612489384, + "learning_rate": 3.041567174930522e-06, + "loss": 0.5125, + "step": 5173 + }, + { + "epoch": 2.514528875379939, + "grad_norm": 0.07181682251503373, + "learning_rate": 3.039805631981415e-06, + "loss": 0.5183, + "step": 5174 + }, + { + "epoch": 2.515015197568389, + "grad_norm": 0.0720441762663447, + "learning_rate": 3.0380443764725538e-06, + "loss": 0.5261, + "step": 5175 + }, + { + "epoch": 2.515501519756839, + "grad_norm": 0.0717310784091408, + "learning_rate": 3.036283408662204e-06, + "loss": 0.5286, + "step": 5176 + }, + { + "epoch": 2.5159878419452886, + "grad_norm": 0.0701282753128611, + "learning_rate": 3.034522728808593e-06, + "loss": 0.4995, + "step": 5177 + }, + { + "epoch": 2.5164741641337387, + "grad_norm": 0.07066424607797053, + "learning_rate": 3.0327623371699043e-06, + "loss": 0.4916, + "step": 5178 + }, + { + "epoch": 2.5169604863221884, + "grad_norm": 0.06821429406072016, + "learning_rate": 3.0310022340042798e-06, + "loss": 0.4884, + "step": 5179 + }, + { + "epoch": 2.5174468085106385, + "grad_norm": 0.07183190513578791, + "learning_rate": 3.0292424195698177e-06, + "loss": 0.5251, + "step": 5180 + }, + { + "epoch": 2.517933130699088, + "grad_norm": 0.07047535260710414, + "learning_rate": 3.027482894124576e-06, + "loss": 0.5182, + "step": 5181 + }, + { + "epoch": 2.518419452887538, + "grad_norm": 0.07231648296593661, + "learning_rate": 3.025723657926568e-06, + "loss": 0.5106, + "step": 5182 + }, + { + "epoch": 2.518905775075988, + "grad_norm": 0.07022519935915474, + "learning_rate": 3.023964711233767e-06, + "loss": 0.5275, + "step": 5183 + }, + { + "epoch": 2.5193920972644377, + "grad_norm": 0.07116710643697498, + "learning_rate": 3.0222060543040994e-06, + "loss": 0.5058, + "step": 5184 + }, + { + "epoch": 2.5198784194528874, + "grad_norm": 0.07033377823009221, + "learning_rate": 3.0204476873954558e-06, + "loss": 0.5085, + "step": 5185 + }, + { + "epoch": 2.5203647416413375, + "grad_norm": 0.07219927495954807, + "learning_rate": 3.0186896107656803e-06, + "loss": 0.5413, + "step": 5186 + }, + { + "epoch": 2.520851063829787, + "grad_norm": 0.06940063883794943, + "learning_rate": 3.016931824672573e-06, + "loss": 0.486, + "step": 5187 + }, + { + "epoch": 2.521337386018237, + "grad_norm": 0.06906707175649249, + "learning_rate": 3.0151743293738955e-06, + "loss": 0.4855, + "step": 5188 + }, + { + "epoch": 2.521823708206687, + "grad_norm": 0.07206245306632703, + "learning_rate": 3.013417125127364e-06, + "loss": 0.5128, + "step": 5189 + }, + { + "epoch": 2.5223100303951367, + "grad_norm": 0.06889094833978955, + "learning_rate": 3.0116602121906514e-06, + "loss": 0.4675, + "step": 5190 + }, + { + "epoch": 2.522796352583587, + "grad_norm": 0.07087814856013125, + "learning_rate": 3.0099035908213893e-06, + "loss": 0.5221, + "step": 5191 + }, + { + "epoch": 2.5232826747720365, + "grad_norm": 0.07008707338481325, + "learning_rate": 3.0081472612771656e-06, + "loss": 0.51, + "step": 5192 + }, + { + "epoch": 2.523768996960486, + "grad_norm": 0.07174946418666132, + "learning_rate": 3.006391223815528e-06, + "loss": 0.5337, + "step": 5193 + }, + { + "epoch": 2.5242553191489363, + "grad_norm": 0.07031243831566994, + "learning_rate": 3.0046354786939785e-06, + "loss": 0.4829, + "step": 5194 + }, + { + "epoch": 2.524741641337386, + "grad_norm": 0.0729406523193348, + "learning_rate": 3.002880026169977e-06, + "loss": 0.5347, + "step": 5195 + }, + { + "epoch": 2.525227963525836, + "grad_norm": 0.07059223581186867, + "learning_rate": 3.0011248665009405e-06, + "loss": 0.532, + "step": 5196 + }, + { + "epoch": 2.525714285714286, + "grad_norm": 0.07048365431127444, + "learning_rate": 2.9993699999442445e-06, + "loss": 0.5502, + "step": 5197 + }, + { + "epoch": 2.5262006079027355, + "grad_norm": 0.06910190222350247, + "learning_rate": 2.997615426757219e-06, + "loss": 0.4988, + "step": 5198 + }, + { + "epoch": 2.526686930091185, + "grad_norm": 0.0699236297396845, + "learning_rate": 2.9958611471971534e-06, + "loss": 0.5095, + "step": 5199 + }, + { + "epoch": 2.5271732522796353, + "grad_norm": 0.0739257683961832, + "learning_rate": 2.9941071615212906e-06, + "loss": 0.5694, + "step": 5200 + }, + { + "epoch": 2.527659574468085, + "grad_norm": 0.07325283319655512, + "learning_rate": 2.992353469986835e-06, + "loss": 0.5028, + "step": 5201 + }, + { + "epoch": 2.528145896656535, + "grad_norm": 0.07148874722633088, + "learning_rate": 2.990600072850942e-06, + "loss": 0.5436, + "step": 5202 + }, + { + "epoch": 2.528632218844985, + "grad_norm": 0.07341045758166169, + "learning_rate": 2.9888469703707323e-06, + "loss": 0.52, + "step": 5203 + }, + { + "epoch": 2.5291185410334345, + "grad_norm": 0.07119609016675617, + "learning_rate": 2.9870941628032777e-06, + "loss": 0.5314, + "step": 5204 + }, + { + "epoch": 2.5296048632218846, + "grad_norm": 0.07177544344758144, + "learning_rate": 2.9853416504056044e-06, + "loss": 0.5239, + "step": 5205 + }, + { + "epoch": 2.5300911854103343, + "grad_norm": 0.06994400548437793, + "learning_rate": 2.9835894334347005e-06, + "loss": 0.4928, + "step": 5206 + }, + { + "epoch": 2.5305775075987844, + "grad_norm": 0.06966879737529871, + "learning_rate": 2.9818375121475084e-06, + "loss": 0.4945, + "step": 5207 + }, + { + "epoch": 2.531063829787234, + "grad_norm": 0.07279910295191842, + "learning_rate": 2.9800858868009276e-06, + "loss": 0.53, + "step": 5208 + }, + { + "epoch": 2.531550151975684, + "grad_norm": 0.06929579028085348, + "learning_rate": 2.978334557651813e-06, + "loss": 0.484, + "step": 5209 + }, + { + "epoch": 2.5320364741641335, + "grad_norm": 0.06713729823254085, + "learning_rate": 2.9765835249569786e-06, + "loss": 0.462, + "step": 5210 + }, + { + "epoch": 2.5325227963525836, + "grad_norm": 0.07271017018802592, + "learning_rate": 2.974832788973193e-06, + "loss": 0.5683, + "step": 5211 + }, + { + "epoch": 2.5330091185410333, + "grad_norm": 0.0731602019961289, + "learning_rate": 2.973082349957181e-06, + "loss": 0.5321, + "step": 5212 + }, + { + "epoch": 2.5334954407294834, + "grad_norm": 0.07075667268472696, + "learning_rate": 2.971332208165626e-06, + "loss": 0.5406, + "step": 5213 + }, + { + "epoch": 2.533981762917933, + "grad_norm": 0.06889726630509321, + "learning_rate": 2.9695823638551657e-06, + "loss": 0.4914, + "step": 5214 + }, + { + "epoch": 2.5344680851063828, + "grad_norm": 0.0691887393959937, + "learning_rate": 2.9678328172823937e-06, + "loss": 0.4851, + "step": 5215 + }, + { + "epoch": 2.534954407294833, + "grad_norm": 0.07046926930632237, + "learning_rate": 2.966083568703863e-06, + "loss": 0.519, + "step": 5216 + }, + { + "epoch": 2.5354407294832826, + "grad_norm": 0.075046384901123, + "learning_rate": 2.9643346183760802e-06, + "loss": 0.5183, + "step": 5217 + }, + { + "epoch": 2.5359270516717327, + "grad_norm": 0.0720481248942936, + "learning_rate": 2.962585966555509e-06, + "loss": 0.5493, + "step": 5218 + }, + { + "epoch": 2.5364133738601824, + "grad_norm": 0.07397849605279953, + "learning_rate": 2.9608376134985696e-06, + "loss": 0.5349, + "step": 5219 + }, + { + "epoch": 2.536899696048632, + "grad_norm": 0.07083021017899752, + "learning_rate": 2.9590895594616377e-06, + "loss": 0.5215, + "step": 5220 + }, + { + "epoch": 2.537386018237082, + "grad_norm": 0.07066373777118727, + "learning_rate": 2.9573418047010448e-06, + "loss": 0.5084, + "step": 5221 + }, + { + "epoch": 2.537872340425532, + "grad_norm": 0.07405362335264465, + "learning_rate": 2.9555943494730817e-06, + "loss": 0.5297, + "step": 5222 + }, + { + "epoch": 2.538358662613982, + "grad_norm": 0.0723235318908062, + "learning_rate": 2.953847194033991e-06, + "loss": 0.5335, + "step": 5223 + }, + { + "epoch": 2.5388449848024317, + "grad_norm": 0.07300873760833729, + "learning_rate": 2.952100338639974e-06, + "loss": 0.5366, + "step": 5224 + }, + { + "epoch": 2.5393313069908814, + "grad_norm": 0.07227121490875922, + "learning_rate": 2.950353783547187e-06, + "loss": 0.5174, + "step": 5225 + }, + { + "epoch": 2.539817629179331, + "grad_norm": 0.07166463649943024, + "learning_rate": 2.948607529011742e-06, + "loss": 0.5534, + "step": 5226 + }, + { + "epoch": 2.540303951367781, + "grad_norm": 0.07041677367153859, + "learning_rate": 2.946861575289708e-06, + "loss": 0.5483, + "step": 5227 + }, + { + "epoch": 2.540790273556231, + "grad_norm": 0.07382337984952471, + "learning_rate": 2.9451159226371097e-06, + "loss": 0.5383, + "step": 5228 + }, + { + "epoch": 2.541276595744681, + "grad_norm": 0.07095447433106677, + "learning_rate": 2.9433705713099257e-06, + "loss": 0.4841, + "step": 5229 + }, + { + "epoch": 2.5417629179331307, + "grad_norm": 0.070941641566057, + "learning_rate": 2.941625521564093e-06, + "loss": 0.516, + "step": 5230 + }, + { + "epoch": 2.5422492401215804, + "grad_norm": 0.07234703568990339, + "learning_rate": 2.9398807736555036e-06, + "loss": 0.5462, + "step": 5231 + }, + { + "epoch": 2.5427355623100305, + "grad_norm": 0.0718035031427715, + "learning_rate": 2.9381363278400043e-06, + "loss": 0.5041, + "step": 5232 + }, + { + "epoch": 2.54322188449848, + "grad_norm": 0.07209680344069905, + "learning_rate": 2.9363921843733984e-06, + "loss": 0.5167, + "step": 5233 + }, + { + "epoch": 2.5437082066869303, + "grad_norm": 0.07222340455173297, + "learning_rate": 2.934648343511445e-06, + "loss": 0.5595, + "step": 5234 + }, + { + "epoch": 2.54419452887538, + "grad_norm": 0.06949006995013941, + "learning_rate": 2.9329048055098582e-06, + "loss": 0.4957, + "step": 5235 + }, + { + "epoch": 2.5446808510638297, + "grad_norm": 0.07077812057498424, + "learning_rate": 2.931161570624308e-06, + "loss": 0.5455, + "step": 5236 + }, + { + "epoch": 2.5451671732522794, + "grad_norm": 0.07297797733435306, + "learning_rate": 2.929418639110422e-06, + "loss": 0.5055, + "step": 5237 + }, + { + "epoch": 2.5456534954407295, + "grad_norm": 0.07235705936738485, + "learning_rate": 2.927676011223778e-06, + "loss": 0.5184, + "step": 5238 + }, + { + "epoch": 2.546139817629179, + "grad_norm": 0.07115646941972145, + "learning_rate": 2.925933687219912e-06, + "loss": 0.5192, + "step": 5239 + }, + { + "epoch": 2.5466261398176293, + "grad_norm": 0.07222373189759254, + "learning_rate": 2.92419166735432e-06, + "loss": 0.5253, + "step": 5240 + }, + { + "epoch": 2.547112462006079, + "grad_norm": 0.06844920517682934, + "learning_rate": 2.922449951882448e-06, + "loss": 0.4976, + "step": 5241 + }, + { + "epoch": 2.5475987841945287, + "grad_norm": 0.07150832419015081, + "learning_rate": 2.9207085410596987e-06, + "loss": 0.5224, + "step": 5242 + }, + { + "epoch": 2.548085106382979, + "grad_norm": 0.07065039976242223, + "learning_rate": 2.91896743514143e-06, + "loss": 0.5242, + "step": 5243 + }, + { + "epoch": 2.5485714285714285, + "grad_norm": 0.07036292963992544, + "learning_rate": 2.9172266343829547e-06, + "loss": 0.4951, + "step": 5244 + }, + { + "epoch": 2.5490577507598786, + "grad_norm": 0.07280445217666232, + "learning_rate": 2.9154861390395417e-06, + "loss": 0.5499, + "step": 5245 + }, + { + "epoch": 2.5495440729483283, + "grad_norm": 0.0697516359103554, + "learning_rate": 2.913745949366416e-06, + "loss": 0.516, + "step": 5246 + }, + { + "epoch": 2.550030395136778, + "grad_norm": 0.07252689662567713, + "learning_rate": 2.9120060656187577e-06, + "loss": 0.5202, + "step": 5247 + }, + { + "epoch": 2.550516717325228, + "grad_norm": 0.0739677980914118, + "learning_rate": 2.9102664880516973e-06, + "loss": 0.5857, + "step": 5248 + }, + { + "epoch": 2.551003039513678, + "grad_norm": 0.07195073968025892, + "learning_rate": 2.908527216920325e-06, + "loss": 0.5605, + "step": 5249 + }, + { + "epoch": 2.551489361702128, + "grad_norm": 0.0709461915170774, + "learning_rate": 2.906788252479687e-06, + "loss": 0.5137, + "step": 5250 + }, + { + "epoch": 2.5519756838905776, + "grad_norm": 0.07016900503196073, + "learning_rate": 2.905049594984781e-06, + "loss": 0.4832, + "step": 5251 + }, + { + "epoch": 2.5524620060790273, + "grad_norm": 0.07461585214894023, + "learning_rate": 2.903311244690563e-06, + "loss": 0.5603, + "step": 5252 + }, + { + "epoch": 2.552948328267477, + "grad_norm": 0.07006371556845854, + "learning_rate": 2.9015732018519415e-06, + "loss": 0.5042, + "step": 5253 + }, + { + "epoch": 2.553434650455927, + "grad_norm": 0.07304062182928074, + "learning_rate": 2.8998354667237806e-06, + "loss": 0.5161, + "step": 5254 + }, + { + "epoch": 2.553920972644377, + "grad_norm": 0.07115914863402885, + "learning_rate": 2.898098039560899e-06, + "loss": 0.508, + "step": 5255 + }, + { + "epoch": 2.554407294832827, + "grad_norm": 0.07209164351203949, + "learning_rate": 2.8963609206180715e-06, + "loss": 0.4868, + "step": 5256 + }, + { + "epoch": 2.5548936170212766, + "grad_norm": 0.07273737158722844, + "learning_rate": 2.8946241101500246e-06, + "loss": 0.5673, + "step": 5257 + }, + { + "epoch": 2.5553799392097263, + "grad_norm": 0.07217837398100864, + "learning_rate": 2.8928876084114456e-06, + "loss": 0.5278, + "step": 5258 + }, + { + "epoch": 2.5558662613981764, + "grad_norm": 0.07300754126510027, + "learning_rate": 2.8911514156569715e-06, + "loss": 0.5299, + "step": 5259 + }, + { + "epoch": 2.556352583586626, + "grad_norm": 0.0729385912459572, + "learning_rate": 2.8894155321411943e-06, + "loss": 0.5378, + "step": 5260 + }, + { + "epoch": 2.556838905775076, + "grad_norm": 0.06840634153916052, + "learning_rate": 2.887679958118662e-06, + "loss": 0.4847, + "step": 5261 + }, + { + "epoch": 2.557325227963526, + "grad_norm": 0.0706326013417714, + "learning_rate": 2.8859446938438794e-06, + "loss": 0.5105, + "step": 5262 + }, + { + "epoch": 2.5578115501519756, + "grad_norm": 0.07392191466607143, + "learning_rate": 2.884209739571299e-06, + "loss": 0.5421, + "step": 5263 + }, + { + "epoch": 2.5582978723404253, + "grad_norm": 0.07040961638293448, + "learning_rate": 2.8824750955553325e-06, + "loss": 0.5472, + "step": 5264 + }, + { + "epoch": 2.5587841945288754, + "grad_norm": 0.06971496028015046, + "learning_rate": 2.880740762050348e-06, + "loss": 0.5005, + "step": 5265 + }, + { + "epoch": 2.559270516717325, + "grad_norm": 0.07072607532764644, + "learning_rate": 2.8790067393106653e-06, + "loss": 0.515, + "step": 5266 + }, + { + "epoch": 2.559756838905775, + "grad_norm": 0.06917005729041993, + "learning_rate": 2.877273027590558e-06, + "loss": 0.5145, + "step": 5267 + }, + { + "epoch": 2.560243161094225, + "grad_norm": 0.07141443837313326, + "learning_rate": 2.875539627144257e-06, + "loss": 0.5204, + "step": 5268 + }, + { + "epoch": 2.5607294832826746, + "grad_norm": 0.06899865083882975, + "learning_rate": 2.873806538225944e-06, + "loss": 0.4919, + "step": 5269 + }, + { + "epoch": 2.5612158054711247, + "grad_norm": 0.06975767819440655, + "learning_rate": 2.8720737610897575e-06, + "loss": 0.5132, + "step": 5270 + }, + { + "epoch": 2.5617021276595744, + "grad_norm": 0.07172518728964794, + "learning_rate": 2.8703412959897904e-06, + "loss": 0.5255, + "step": 5271 + }, + { + "epoch": 2.5621884498480245, + "grad_norm": 0.07076484580877153, + "learning_rate": 2.8686091431800883e-06, + "loss": 0.538, + "step": 5272 + }, + { + "epoch": 2.562674772036474, + "grad_norm": 0.07069483007438665, + "learning_rate": 2.8668773029146517e-06, + "loss": 0.487, + "step": 5273 + }, + { + "epoch": 2.563161094224924, + "grad_norm": 0.07152830371593032, + "learning_rate": 2.8651457754474354e-06, + "loss": 0.5172, + "step": 5274 + }, + { + "epoch": 2.563647416413374, + "grad_norm": 0.07231745130793858, + "learning_rate": 2.8634145610323462e-06, + "loss": 0.5349, + "step": 5275 + }, + { + "epoch": 2.5641337386018237, + "grad_norm": 0.06942002369005398, + "learning_rate": 2.8616836599232513e-06, + "loss": 0.5, + "step": 5276 + }, + { + "epoch": 2.564620060790274, + "grad_norm": 0.07133104590296707, + "learning_rate": 2.8599530723739673e-06, + "loss": 0.5166, + "step": 5277 + }, + { + "epoch": 2.5651063829787235, + "grad_norm": 0.07220081410227014, + "learning_rate": 2.8582227986382617e-06, + "loss": 0.5149, + "step": 5278 + }, + { + "epoch": 2.565592705167173, + "grad_norm": 0.06859177438327475, + "learning_rate": 2.8564928389698605e-06, + "loss": 0.4893, + "step": 5279 + }, + { + "epoch": 2.566079027355623, + "grad_norm": 0.06935779233292914, + "learning_rate": 2.854763193622444e-06, + "loss": 0.5109, + "step": 5280 + }, + { + "epoch": 2.566565349544073, + "grad_norm": 0.07230111515732772, + "learning_rate": 2.8530338628496433e-06, + "loss": 0.5445, + "step": 5281 + }, + { + "epoch": 2.5670516717325227, + "grad_norm": 0.07597840175779053, + "learning_rate": 2.8513048469050476e-06, + "loss": 0.5588, + "step": 5282 + }, + { + "epoch": 2.567537993920973, + "grad_norm": 0.07151983171688847, + "learning_rate": 2.8495761460421957e-06, + "loss": 0.5148, + "step": 5283 + }, + { + "epoch": 2.5680243161094225, + "grad_norm": 0.07337071758599584, + "learning_rate": 2.8478477605145815e-06, + "loss": 0.5389, + "step": 5284 + }, + { + "epoch": 2.568510638297872, + "grad_norm": 0.07144312497699838, + "learning_rate": 2.8461196905756544e-06, + "loss": 0.5352, + "step": 5285 + }, + { + "epoch": 2.5689969604863223, + "grad_norm": 0.07082174842796415, + "learning_rate": 2.8443919364788157e-06, + "loss": 0.4933, + "step": 5286 + }, + { + "epoch": 2.569483282674772, + "grad_norm": 0.07244737093567413, + "learning_rate": 2.842664498477421e-06, + "loss": 0.5257, + "step": 5287 + }, + { + "epoch": 2.569969604863222, + "grad_norm": 0.07415387531548687, + "learning_rate": 2.8409373768247795e-06, + "loss": 0.5698, + "step": 5288 + }, + { + "epoch": 2.570455927051672, + "grad_norm": 0.07388906751027818, + "learning_rate": 2.839210571774154e-06, + "loss": 0.5524, + "step": 5289 + }, + { + "epoch": 2.5709422492401215, + "grad_norm": 0.07262094743517403, + "learning_rate": 2.837484083578761e-06, + "loss": 0.5499, + "step": 5290 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.06943069103189393, + "learning_rate": 2.8357579124917694e-06, + "loss": 0.5061, + "step": 5291 + }, + { + "epoch": 2.5719148936170213, + "grad_norm": 0.07203734129483275, + "learning_rate": 2.834032058766304e-06, + "loss": 0.5393, + "step": 5292 + }, + { + "epoch": 2.572401215805471, + "grad_norm": 0.07310077793872213, + "learning_rate": 2.83230652265544e-06, + "loss": 0.5474, + "step": 5293 + }, + { + "epoch": 2.572887537993921, + "grad_norm": 0.08540313442429173, + "learning_rate": 2.83058130441221e-06, + "loss": 0.5856, + "step": 5294 + }, + { + "epoch": 2.573373860182371, + "grad_norm": 0.06925801772830338, + "learning_rate": 2.828856404289596e-06, + "loss": 0.4806, + "step": 5295 + }, + { + "epoch": 2.5738601823708205, + "grad_norm": 0.07285313916417645, + "learning_rate": 2.827131822540535e-06, + "loss": 0.5301, + "step": 5296 + }, + { + "epoch": 2.5743465045592706, + "grad_norm": 0.07005282769407138, + "learning_rate": 2.8254075594179177e-06, + "loss": 0.5214, + "step": 5297 + }, + { + "epoch": 2.5748328267477203, + "grad_norm": 0.06760403731737047, + "learning_rate": 2.823683615174587e-06, + "loss": 0.4743, + "step": 5298 + }, + { + "epoch": 2.5753191489361704, + "grad_norm": 0.07155123614917548, + "learning_rate": 2.8219599900633417e-06, + "loss": 0.5253, + "step": 5299 + }, + { + "epoch": 2.57580547112462, + "grad_norm": 0.07164208280364782, + "learning_rate": 2.82023668433693e-06, + "loss": 0.5091, + "step": 5300 + }, + { + "epoch": 2.57629179331307, + "grad_norm": 0.07173328473577424, + "learning_rate": 2.8185136982480554e-06, + "loss": 0.5339, + "step": 5301 + }, + { + "epoch": 2.57677811550152, + "grad_norm": 0.07232906126540413, + "learning_rate": 2.816791032049375e-06, + "loss": 0.5495, + "step": 5302 + }, + { + "epoch": 2.5772644376899696, + "grad_norm": 0.07271566293105103, + "learning_rate": 2.8150686859934974e-06, + "loss": 0.5271, + "step": 5303 + }, + { + "epoch": 2.5777507598784197, + "grad_norm": 0.07032940175152132, + "learning_rate": 2.813346660332986e-06, + "loss": 0.5094, + "step": 5304 + }, + { + "epoch": 2.5782370820668694, + "grad_norm": 0.0699621605588178, + "learning_rate": 2.811624955320356e-06, + "loss": 0.5269, + "step": 5305 + }, + { + "epoch": 2.578723404255319, + "grad_norm": 0.07205969727227979, + "learning_rate": 2.809903571208075e-06, + "loss": 0.5302, + "step": 5306 + }, + { + "epoch": 2.579209726443769, + "grad_norm": 0.07002055560445936, + "learning_rate": 2.808182508248565e-06, + "loss": 0.503, + "step": 5307 + }, + { + "epoch": 2.579696048632219, + "grad_norm": 0.0685787232898974, + "learning_rate": 2.8064617666942e-06, + "loss": 0.4965, + "step": 5308 + }, + { + "epoch": 2.5801823708206686, + "grad_norm": 0.07603117430086737, + "learning_rate": 2.804741346797308e-06, + "loss": 0.5432, + "step": 5309 + }, + { + "epoch": 2.5806686930091187, + "grad_norm": 0.07207282273647803, + "learning_rate": 2.8030212488101714e-06, + "loss": 0.5292, + "step": 5310 + }, + { + "epoch": 2.5811550151975684, + "grad_norm": 0.07230275450271861, + "learning_rate": 2.801301472985016e-06, + "loss": 0.5378, + "step": 5311 + }, + { + "epoch": 2.581641337386018, + "grad_norm": 0.07234208356600152, + "learning_rate": 2.799582019574033e-06, + "loss": 0.5063, + "step": 5312 + }, + { + "epoch": 2.582127659574468, + "grad_norm": 0.07248370684931718, + "learning_rate": 2.79786288882936e-06, + "loss": 0.5065, + "step": 5313 + }, + { + "epoch": 2.582613981762918, + "grad_norm": 0.07022688737732607, + "learning_rate": 2.7961440810030878e-06, + "loss": 0.5139, + "step": 5314 + }, + { + "epoch": 2.583100303951368, + "grad_norm": 0.07024845533111988, + "learning_rate": 2.794425596347259e-06, + "loss": 0.503, + "step": 5315 + }, + { + "epoch": 2.5835866261398177, + "grad_norm": 0.07150716858810083, + "learning_rate": 2.7927074351138704e-06, + "loss": 0.5109, + "step": 5316 + }, + { + "epoch": 2.5840729483282674, + "grad_norm": 0.06849105936526505, + "learning_rate": 2.7909895975548717e-06, + "loss": 0.4921, + "step": 5317 + }, + { + "epoch": 2.584559270516717, + "grad_norm": 0.07044883715059527, + "learning_rate": 2.7892720839221633e-06, + "loss": 0.5272, + "step": 5318 + }, + { + "epoch": 2.585045592705167, + "grad_norm": 0.07394074859092659, + "learning_rate": 2.787554894467599e-06, + "loss": 0.5485, + "step": 5319 + }, + { + "epoch": 2.585531914893617, + "grad_norm": 0.07346051836976804, + "learning_rate": 2.785838029442986e-06, + "loss": 0.5562, + "step": 5320 + }, + { + "epoch": 2.586018237082067, + "grad_norm": 0.07017259779383905, + "learning_rate": 2.784121489100082e-06, + "loss": 0.5067, + "step": 5321 + }, + { + "epoch": 2.5865045592705167, + "grad_norm": 0.07326868226652182, + "learning_rate": 2.7824052736905993e-06, + "loss": 0.5386, + "step": 5322 + }, + { + "epoch": 2.5869908814589664, + "grad_norm": 0.07263318556183915, + "learning_rate": 2.7806893834661998e-06, + "loss": 0.5389, + "step": 5323 + }, + { + "epoch": 2.5874772036474165, + "grad_norm": 0.07280897133010984, + "learning_rate": 2.778973818678501e-06, + "loss": 0.5296, + "step": 5324 + }, + { + "epoch": 2.587963525835866, + "grad_norm": 0.06900944653569056, + "learning_rate": 2.777258579579072e-06, + "loss": 0.4905, + "step": 5325 + }, + { + "epoch": 2.5884498480243163, + "grad_norm": 0.07420931195821912, + "learning_rate": 2.7755436664194293e-06, + "loss": 0.5208, + "step": 5326 + }, + { + "epoch": 2.588936170212766, + "grad_norm": 0.07046102140257733, + "learning_rate": 2.773829079451048e-06, + "loss": 0.4951, + "step": 5327 + }, + { + "epoch": 2.5894224924012157, + "grad_norm": 0.07500427753109747, + "learning_rate": 2.772114818925352e-06, + "loss": 0.5778, + "step": 5328 + }, + { + "epoch": 2.589908814589666, + "grad_norm": 0.07207267607008341, + "learning_rate": 2.770400885093718e-06, + "loss": 0.5379, + "step": 5329 + }, + { + "epoch": 2.5903951367781155, + "grad_norm": 0.0708115651005762, + "learning_rate": 2.768687278207475e-06, + "loss": 0.4859, + "step": 5330 + }, + { + "epoch": 2.590881458966565, + "grad_norm": 0.07504072594663244, + "learning_rate": 2.7669739985179046e-06, + "loss": 0.5626, + "step": 5331 + }, + { + "epoch": 2.5913677811550153, + "grad_norm": 0.07052117077560555, + "learning_rate": 2.7652610462762407e-06, + "loss": 0.5192, + "step": 5332 + }, + { + "epoch": 2.591854103343465, + "grad_norm": 0.0719427334073481, + "learning_rate": 2.7635484217336666e-06, + "loss": 0.5228, + "step": 5333 + }, + { + "epoch": 2.5923404255319147, + "grad_norm": 0.07087242421254078, + "learning_rate": 2.7618361251413207e-06, + "loss": 0.5131, + "step": 5334 + }, + { + "epoch": 2.592826747720365, + "grad_norm": 0.07081540882432405, + "learning_rate": 2.76012415675029e-06, + "loss": 0.5211, + "step": 5335 + }, + { + "epoch": 2.5933130699088145, + "grad_norm": 0.07282737956103896, + "learning_rate": 2.758412516811617e-06, + "loss": 0.5384, + "step": 5336 + }, + { + "epoch": 2.5937993920972646, + "grad_norm": 0.07116256741985431, + "learning_rate": 2.756701205576293e-06, + "loss": 0.5026, + "step": 5337 + }, + { + "epoch": 2.5942857142857143, + "grad_norm": 0.07056525705508711, + "learning_rate": 2.754990223295263e-06, + "loss": 0.4989, + "step": 5338 + }, + { + "epoch": 2.594772036474164, + "grad_norm": 0.07131474025342094, + "learning_rate": 2.7532795702194253e-06, + "loss": 0.5218, + "step": 5339 + }, + { + "epoch": 2.595258358662614, + "grad_norm": 0.07077640561004091, + "learning_rate": 2.7515692465996236e-06, + "loss": 0.54, + "step": 5340 + }, + { + "epoch": 2.595744680851064, + "grad_norm": 0.0717768894162335, + "learning_rate": 2.7498592526866584e-06, + "loss": 0.5061, + "step": 5341 + }, + { + "epoch": 2.596231003039514, + "grad_norm": 0.06968033830886546, + "learning_rate": 2.7481495887312824e-06, + "loss": 0.4864, + "step": 5342 + }, + { + "epoch": 2.5967173252279636, + "grad_norm": 0.07050832828051792, + "learning_rate": 2.7464402549841974e-06, + "loss": 0.5076, + "step": 5343 + }, + { + "epoch": 2.5972036474164133, + "grad_norm": 0.07476535279675123, + "learning_rate": 2.7447312516960584e-06, + "loss": 0.5386, + "step": 5344 + }, + { + "epoch": 2.597689969604863, + "grad_norm": 0.0739501455654807, + "learning_rate": 2.743022579117471e-06, + "loss": 0.5491, + "step": 5345 + }, + { + "epoch": 2.598176291793313, + "grad_norm": 0.07169171462833135, + "learning_rate": 2.741314237498993e-06, + "loss": 0.509, + "step": 5346 + }, + { + "epoch": 2.598662613981763, + "grad_norm": 0.07345756835469414, + "learning_rate": 2.739606227091132e-06, + "loss": 0.5157, + "step": 5347 + }, + { + "epoch": 2.599148936170213, + "grad_norm": 0.07140180546973116, + "learning_rate": 2.7378985481443483e-06, + "loss": 0.5127, + "step": 5348 + }, + { + "epoch": 2.5996352583586626, + "grad_norm": 0.07365691975968618, + "learning_rate": 2.7361912009090565e-06, + "loss": 0.5287, + "step": 5349 + }, + { + "epoch": 2.6001215805471123, + "grad_norm": 0.0707045686004956, + "learning_rate": 2.7344841856356173e-06, + "loss": 0.5332, + "step": 5350 + }, + { + "epoch": 2.6006079027355624, + "grad_norm": 0.0728315060365903, + "learning_rate": 2.732777502574346e-06, + "loss": 0.5419, + "step": 5351 + }, + { + "epoch": 2.601094224924012, + "grad_norm": 0.07228432394782106, + "learning_rate": 2.7310711519755084e-06, + "loss": 0.5265, + "step": 5352 + }, + { + "epoch": 2.6015805471124622, + "grad_norm": 0.07560565480418585, + "learning_rate": 2.72936513408932e-06, + "loss": 0.5176, + "step": 5353 + }, + { + "epoch": 2.602066869300912, + "grad_norm": 0.073950056714824, + "learning_rate": 2.7276594491659523e-06, + "loss": 0.5267, + "step": 5354 + }, + { + "epoch": 2.6025531914893616, + "grad_norm": 0.0747150400588746, + "learning_rate": 2.725954097455521e-06, + "loss": 0.5384, + "step": 5355 + }, + { + "epoch": 2.6030395136778113, + "grad_norm": 0.0719801009949029, + "learning_rate": 2.7242490792080965e-06, + "loss": 0.5219, + "step": 5356 + }, + { + "epoch": 2.6035258358662614, + "grad_norm": 0.07548736066206169, + "learning_rate": 2.722544394673703e-06, + "loss": 0.5249, + "step": 5357 + }, + { + "epoch": 2.604012158054711, + "grad_norm": 0.07212949782593735, + "learning_rate": 2.720840044102311e-06, + "loss": 0.5429, + "step": 5358 + }, + { + "epoch": 2.604498480243161, + "grad_norm": 0.07220165538862498, + "learning_rate": 2.719136027743845e-06, + "loss": 0.5178, + "step": 5359 + }, + { + "epoch": 2.604984802431611, + "grad_norm": 0.07076923339172092, + "learning_rate": 2.7174323458481798e-06, + "loss": 0.5234, + "step": 5360 + }, + { + "epoch": 2.6054711246200606, + "grad_norm": 0.07207414681785498, + "learning_rate": 2.7157289986651403e-06, + "loss": 0.4976, + "step": 5361 + }, + { + "epoch": 2.6059574468085107, + "grad_norm": 0.07534019862275049, + "learning_rate": 2.714025986444504e-06, + "loss": 0.525, + "step": 5362 + }, + { + "epoch": 2.6064437689969604, + "grad_norm": 0.07104113436064638, + "learning_rate": 2.712323309435998e-06, + "loss": 0.5063, + "step": 5363 + }, + { + "epoch": 2.6069300911854105, + "grad_norm": 0.07338672309848925, + "learning_rate": 2.7106209678893e-06, + "loss": 0.5371, + "step": 5364 + }, + { + "epoch": 2.60741641337386, + "grad_norm": 0.06958449635019943, + "learning_rate": 2.7089189620540394e-06, + "loss": 0.4697, + "step": 5365 + }, + { + "epoch": 2.60790273556231, + "grad_norm": 0.07152435385673764, + "learning_rate": 2.7072172921797947e-06, + "loss": 0.5601, + "step": 5366 + }, + { + "epoch": 2.60838905775076, + "grad_norm": 0.06982943350477738, + "learning_rate": 2.7055159585160996e-06, + "loss": 0.4824, + "step": 5367 + }, + { + "epoch": 2.6088753799392097, + "grad_norm": 0.0704847983076069, + "learning_rate": 2.703814961312433e-06, + "loss": 0.539, + "step": 5368 + }, + { + "epoch": 2.60936170212766, + "grad_norm": 0.07357250143935838, + "learning_rate": 2.7021143008182297e-06, + "loss": 0.4924, + "step": 5369 + }, + { + "epoch": 2.6098480243161095, + "grad_norm": 0.0695244855160726, + "learning_rate": 2.700413977282868e-06, + "loss": 0.4915, + "step": 5370 + }, + { + "epoch": 2.610334346504559, + "grad_norm": 0.07046223635539373, + "learning_rate": 2.698713990955683e-06, + "loss": 0.5105, + "step": 5371 + }, + { + "epoch": 2.610820668693009, + "grad_norm": 0.06989228866744805, + "learning_rate": 2.6970143420859585e-06, + "loss": 0.5168, + "step": 5372 + }, + { + "epoch": 2.611306990881459, + "grad_norm": 0.07379507159177955, + "learning_rate": 2.6953150309229287e-06, + "loss": 0.5174, + "step": 5373 + }, + { + "epoch": 2.6117933130699087, + "grad_norm": 0.07042036716669867, + "learning_rate": 2.6936160577157776e-06, + "loss": 0.5239, + "step": 5374 + }, + { + "epoch": 2.612279635258359, + "grad_norm": 0.0686762000938755, + "learning_rate": 2.6919174227136417e-06, + "loss": 0.499, + "step": 5375 + }, + { + "epoch": 2.6127659574468085, + "grad_norm": 0.07253511335109725, + "learning_rate": 2.6902191261656053e-06, + "loss": 0.5363, + "step": 5376 + }, + { + "epoch": 2.613252279635258, + "grad_norm": 0.07026774500298956, + "learning_rate": 2.6885211683207048e-06, + "loss": 0.4961, + "step": 5377 + }, + { + "epoch": 2.6137386018237083, + "grad_norm": 0.0732181670954931, + "learning_rate": 2.6868235494279266e-06, + "loss": 0.5504, + "step": 5378 + }, + { + "epoch": 2.614224924012158, + "grad_norm": 0.07052222124938673, + "learning_rate": 2.685126269736207e-06, + "loss": 0.5052, + "step": 5379 + }, + { + "epoch": 2.614711246200608, + "grad_norm": 0.07320576478941102, + "learning_rate": 2.6834293294944326e-06, + "loss": 0.5112, + "step": 5380 + }, + { + "epoch": 2.615197568389058, + "grad_norm": 0.07155115078112458, + "learning_rate": 2.6817327289514406e-06, + "loss": 0.5095, + "step": 5381 + }, + { + "epoch": 2.6156838905775075, + "grad_norm": 0.06935843349081532, + "learning_rate": 2.680036468356018e-06, + "loss": 0.4904, + "step": 5382 + }, + { + "epoch": 2.616170212765957, + "grad_norm": 0.06882477273475883, + "learning_rate": 2.678340547956903e-06, + "loss": 0.4941, + "step": 5383 + }, + { + "epoch": 2.6166565349544073, + "grad_norm": 0.07176851914682074, + "learning_rate": 2.6766449680027816e-06, + "loss": 0.5149, + "step": 5384 + }, + { + "epoch": 2.617142857142857, + "grad_norm": 0.07072959811037098, + "learning_rate": 2.674949728742293e-06, + "loss": 0.5183, + "step": 5385 + }, + { + "epoch": 2.617629179331307, + "grad_norm": 0.07152883595682842, + "learning_rate": 2.673254830424024e-06, + "loss": 0.4906, + "step": 5386 + }, + { + "epoch": 2.618115501519757, + "grad_norm": 0.07173668116143077, + "learning_rate": 2.6715602732965117e-06, + "loss": 0.5207, + "step": 5387 + }, + { + "epoch": 2.6186018237082065, + "grad_norm": 0.07203431301024624, + "learning_rate": 2.6698660576082447e-06, + "loss": 0.5014, + "step": 5388 + }, + { + "epoch": 2.6190881458966566, + "grad_norm": 0.07291059057218043, + "learning_rate": 2.668172183607659e-06, + "loss": 0.5261, + "step": 5389 + }, + { + "epoch": 2.6195744680851063, + "grad_norm": 0.07336530900987731, + "learning_rate": 2.666478651543144e-06, + "loss": 0.535, + "step": 5390 + }, + { + "epoch": 2.6200607902735564, + "grad_norm": 0.07047428040808709, + "learning_rate": 2.6647854616630353e-06, + "loss": 0.546, + "step": 5391 + }, + { + "epoch": 2.620547112462006, + "grad_norm": 0.06957885197784568, + "learning_rate": 2.6630926142156203e-06, + "loss": 0.5222, + "step": 5392 + }, + { + "epoch": 2.621033434650456, + "grad_norm": 0.06941410059153282, + "learning_rate": 2.6614001094491366e-06, + "loss": 0.4757, + "step": 5393 + }, + { + "epoch": 2.621519756838906, + "grad_norm": 0.07354842673337593, + "learning_rate": 2.65970794761177e-06, + "loss": 0.5625, + "step": 5394 + }, + { + "epoch": 2.6220060790273556, + "grad_norm": 0.0719906916771628, + "learning_rate": 2.658016128951657e-06, + "loss": 0.5172, + "step": 5395 + }, + { + "epoch": 2.6224924012158057, + "grad_norm": 0.06972531460506548, + "learning_rate": 2.656324653716884e-06, + "loss": 0.5204, + "step": 5396 + }, + { + "epoch": 2.6229787234042554, + "grad_norm": 0.07046115123659691, + "learning_rate": 2.6546335221554863e-06, + "loss": 0.5086, + "step": 5397 + }, + { + "epoch": 2.623465045592705, + "grad_norm": 0.07077336423198709, + "learning_rate": 2.652942734515449e-06, + "loss": 0.4948, + "step": 5398 + }, + { + "epoch": 2.623951367781155, + "grad_norm": 0.07262509628601738, + "learning_rate": 2.651252291044707e-06, + "loss": 0.5186, + "step": 5399 + }, + { + "epoch": 2.624437689969605, + "grad_norm": 0.07364182001915393, + "learning_rate": 2.649562191991145e-06, + "loss": 0.55, + "step": 5400 + }, + { + "epoch": 2.6249240121580546, + "grad_norm": 0.07237908006245841, + "learning_rate": 2.6478724376025966e-06, + "loss": 0.5364, + "step": 5401 + }, + { + "epoch": 2.6254103343465047, + "grad_norm": 0.06829752929436642, + "learning_rate": 2.646183028126844e-06, + "loss": 0.5237, + "step": 5402 + }, + { + "epoch": 2.6258966565349544, + "grad_norm": 0.07043563725870507, + "learning_rate": 2.6444939638116224e-06, + "loss": 0.5129, + "step": 5403 + }, + { + "epoch": 2.626382978723404, + "grad_norm": 0.06991877719724306, + "learning_rate": 2.6428052449046116e-06, + "loss": 0.4868, + "step": 5404 + }, + { + "epoch": 2.626869300911854, + "grad_norm": 0.07086121907189452, + "learning_rate": 2.641116871653444e-06, + "loss": 0.5315, + "step": 5405 + }, + { + "epoch": 2.627355623100304, + "grad_norm": 0.07255566790931886, + "learning_rate": 2.639428844305701e-06, + "loss": 0.5185, + "step": 5406 + }, + { + "epoch": 2.627841945288754, + "grad_norm": 0.07277298799470891, + "learning_rate": 2.637741163108911e-06, + "loss": 0.5282, + "step": 5407 + }, + { + "epoch": 2.6283282674772037, + "grad_norm": 0.071298737147626, + "learning_rate": 2.636053828310555e-06, + "loss": 0.5438, + "step": 5408 + }, + { + "epoch": 2.6288145896656534, + "grad_norm": 0.07078792644565439, + "learning_rate": 2.6343668401580603e-06, + "loss": 0.5082, + "step": 5409 + }, + { + "epoch": 2.629300911854103, + "grad_norm": 0.07208895976637411, + "learning_rate": 2.632680198898805e-06, + "loss": 0.5299, + "step": 5410 + }, + { + "epoch": 2.629787234042553, + "grad_norm": 0.07064011601316954, + "learning_rate": 2.630993904780116e-06, + "loss": 0.5073, + "step": 5411 + }, + { + "epoch": 2.630273556231003, + "grad_norm": 0.0696893706421984, + "learning_rate": 2.6293079580492688e-06, + "loss": 0.4821, + "step": 5412 + }, + { + "epoch": 2.630759878419453, + "grad_norm": 0.07064719280396976, + "learning_rate": 2.6276223589534877e-06, + "loss": 0.5085, + "step": 5413 + }, + { + "epoch": 2.6312462006079027, + "grad_norm": 0.06985828675677272, + "learning_rate": 2.6259371077399487e-06, + "loss": 0.5073, + "step": 5414 + }, + { + "epoch": 2.6317325227963524, + "grad_norm": 0.07440549595560729, + "learning_rate": 2.624252204655773e-06, + "loss": 0.5545, + "step": 5415 + }, + { + "epoch": 2.6322188449848025, + "grad_norm": 0.0727442963447437, + "learning_rate": 2.6225676499480335e-06, + "loss": 0.5462, + "step": 5416 + }, + { + "epoch": 2.632705167173252, + "grad_norm": 0.07367430195550313, + "learning_rate": 2.6208834438637525e-06, + "loss": 0.5359, + "step": 5417 + }, + { + "epoch": 2.6331914893617023, + "grad_norm": 0.07179279324843299, + "learning_rate": 2.619199586649895e-06, + "loss": 0.509, + "step": 5418 + }, + { + "epoch": 2.633677811550152, + "grad_norm": 0.07323904450174652, + "learning_rate": 2.6175160785533836e-06, + "loss": 0.5517, + "step": 5419 + }, + { + "epoch": 2.6341641337386017, + "grad_norm": 0.07185192751608897, + "learning_rate": 2.615832919821082e-06, + "loss": 0.5458, + "step": 5420 + }, + { + "epoch": 2.634650455927052, + "grad_norm": 0.07039799585439428, + "learning_rate": 2.6141501106998105e-06, + "loss": 0.5026, + "step": 5421 + }, + { + "epoch": 2.6351367781155015, + "grad_norm": 0.07083730090579023, + "learning_rate": 2.612467651436332e-06, + "loss": 0.5256, + "step": 5422 + }, + { + "epoch": 2.6356231003039516, + "grad_norm": 0.06911830742166492, + "learning_rate": 2.610785542277361e-06, + "loss": 0.4997, + "step": 5423 + }, + { + "epoch": 2.6361094224924013, + "grad_norm": 0.07146195336802952, + "learning_rate": 2.6091037834695582e-06, + "loss": 0.5253, + "step": 5424 + }, + { + "epoch": 2.636595744680851, + "grad_norm": 0.07177669466974806, + "learning_rate": 2.6074223752595353e-06, + "loss": 0.5089, + "step": 5425 + }, + { + "epoch": 2.6370820668693007, + "grad_norm": 0.07514201973940192, + "learning_rate": 2.605741317893851e-06, + "loss": 0.5304, + "step": 5426 + }, + { + "epoch": 2.637568389057751, + "grad_norm": 0.07522626997007498, + "learning_rate": 2.6040606116190148e-06, + "loss": 0.5677, + "step": 5427 + }, + { + "epoch": 2.6380547112462005, + "grad_norm": 0.07182090106266803, + "learning_rate": 2.6023802566814814e-06, + "loss": 0.5287, + "step": 5428 + }, + { + "epoch": 2.6385410334346506, + "grad_norm": 0.07139477720137276, + "learning_rate": 2.6007002533276572e-06, + "loss": 0.4881, + "step": 5429 + }, + { + "epoch": 2.6390273556231003, + "grad_norm": 0.07050693859886928, + "learning_rate": 2.5990206018038945e-06, + "loss": 0.5054, + "step": 5430 + }, + { + "epoch": 2.63951367781155, + "grad_norm": 0.0760384804349611, + "learning_rate": 2.597341302356495e-06, + "loss": 0.5592, + "step": 5431 + }, + { + "epoch": 2.64, + "grad_norm": 0.07123783936348688, + "learning_rate": 2.595662355231713e-06, + "loss": 0.5193, + "step": 5432 + }, + { + "epoch": 2.64048632218845, + "grad_norm": 0.07185462538174979, + "learning_rate": 2.5939837606757413e-06, + "loss": 0.517, + "step": 5433 + }, + { + "epoch": 2.6409726443769, + "grad_norm": 0.07559142914504793, + "learning_rate": 2.592305518934728e-06, + "loss": 0.5368, + "step": 5434 + }, + { + "epoch": 2.6414589665653496, + "grad_norm": 0.07019601347029748, + "learning_rate": 2.5906276302547696e-06, + "loss": 0.4925, + "step": 5435 + }, + { + "epoch": 2.6419452887537993, + "grad_norm": 0.07127606451702115, + "learning_rate": 2.5889500948819092e-06, + "loss": 0.5101, + "step": 5436 + }, + { + "epoch": 2.642431610942249, + "grad_norm": 0.07141823128198688, + "learning_rate": 2.5872729130621376e-06, + "loss": 0.5397, + "step": 5437 + }, + { + "epoch": 2.642917933130699, + "grad_norm": 0.072262958458875, + "learning_rate": 2.5855960850413936e-06, + "loss": 0.5041, + "step": 5438 + }, + { + "epoch": 2.643404255319149, + "grad_norm": 0.07105915717983775, + "learning_rate": 2.5839196110655684e-06, + "loss": 0.5182, + "step": 5439 + }, + { + "epoch": 2.643890577507599, + "grad_norm": 0.07256900934576364, + "learning_rate": 2.582243491380495e-06, + "loss": 0.4918, + "step": 5440 + }, + { + "epoch": 2.6443768996960486, + "grad_norm": 0.07011692996937904, + "learning_rate": 2.580567726231959e-06, + "loss": 0.5059, + "step": 5441 + }, + { + "epoch": 2.6448632218844983, + "grad_norm": 0.06970504967531409, + "learning_rate": 2.5788923158656907e-06, + "loss": 0.5266, + "step": 5442 + }, + { + "epoch": 2.6453495440729484, + "grad_norm": 0.06860414306069924, + "learning_rate": 2.5772172605273716e-06, + "loss": 0.4819, + "step": 5443 + }, + { + "epoch": 2.645835866261398, + "grad_norm": 0.07261839124610621, + "learning_rate": 2.575542560462628e-06, + "loss": 0.538, + "step": 5444 + }, + { + "epoch": 2.6463221884498482, + "grad_norm": 0.07030170337939613, + "learning_rate": 2.573868215917037e-06, + "loss": 0.4965, + "step": 5445 + }, + { + "epoch": 2.646808510638298, + "grad_norm": 0.07472859611553342, + "learning_rate": 2.5721942271361233e-06, + "loss": 0.5321, + "step": 5446 + }, + { + "epoch": 2.6472948328267476, + "grad_norm": 0.0702225524867287, + "learning_rate": 2.5705205943653543e-06, + "loss": 0.5105, + "step": 5447 + }, + { + "epoch": 2.6477811550151977, + "grad_norm": 0.06931388998874215, + "learning_rate": 2.568847317850152e-06, + "loss": 0.503, + "step": 5448 + }, + { + "epoch": 2.6482674772036474, + "grad_norm": 0.07238755783628201, + "learning_rate": 2.567174397835883e-06, + "loss": 0.5181, + "step": 5449 + }, + { + "epoch": 2.6487537993920975, + "grad_norm": 0.07221063094098708, + "learning_rate": 2.565501834567862e-06, + "loss": 0.5097, + "step": 5450 + }, + { + "epoch": 2.6492401215805472, + "grad_norm": 0.07146203195646807, + "learning_rate": 2.563829628291351e-06, + "loss": 0.5292, + "step": 5451 + }, + { + "epoch": 2.649726443768997, + "grad_norm": 0.07107832846243915, + "learning_rate": 2.562157779251561e-06, + "loss": 0.5083, + "step": 5452 + }, + { + "epoch": 2.6502127659574466, + "grad_norm": 0.0730459562903449, + "learning_rate": 2.5604862876936486e-06, + "loss": 0.5319, + "step": 5453 + }, + { + "epoch": 2.6506990881458967, + "grad_norm": 0.07067950687746566, + "learning_rate": 2.55881515386272e-06, + "loss": 0.5211, + "step": 5454 + }, + { + "epoch": 2.6511854103343464, + "grad_norm": 0.07456650374475696, + "learning_rate": 2.5571443780038276e-06, + "loss": 0.5193, + "step": 5455 + }, + { + "epoch": 2.6516717325227965, + "grad_norm": 0.07264119265392369, + "learning_rate": 2.5554739603619714e-06, + "loss": 0.5082, + "step": 5456 + }, + { + "epoch": 2.652158054711246, + "grad_norm": 0.07209841873963313, + "learning_rate": 2.553803901182098e-06, + "loss": 0.4962, + "step": 5457 + }, + { + "epoch": 2.652644376899696, + "grad_norm": 0.07421098993375227, + "learning_rate": 2.5521342007091056e-06, + "loss": 0.5106, + "step": 5458 + }, + { + "epoch": 2.653130699088146, + "grad_norm": 0.07232980647533674, + "learning_rate": 2.5504648591878356e-06, + "loss": 0.4968, + "step": 5459 + }, + { + "epoch": 2.6536170212765957, + "grad_norm": 0.07091403602817326, + "learning_rate": 2.5487958768630774e-06, + "loss": 0.5476, + "step": 5460 + }, + { + "epoch": 2.654103343465046, + "grad_norm": 0.07293520639187848, + "learning_rate": 2.5471272539795705e-06, + "loss": 0.5236, + "step": 5461 + }, + { + "epoch": 2.6545896656534955, + "grad_norm": 0.06984339439743953, + "learning_rate": 2.545458990781996e-06, + "loss": 0.5164, + "step": 5462 + }, + { + "epoch": 2.655075987841945, + "grad_norm": 0.0712103953582825, + "learning_rate": 2.5437910875149868e-06, + "loss": 0.5378, + "step": 5463 + }, + { + "epoch": 2.655562310030395, + "grad_norm": 0.07147882450311213, + "learning_rate": 2.542123544423123e-06, + "loss": 0.5337, + "step": 5464 + }, + { + "epoch": 2.656048632218845, + "grad_norm": 0.07240067887458716, + "learning_rate": 2.5404563617509303e-06, + "loss": 0.5353, + "step": 5465 + }, + { + "epoch": 2.6565349544072947, + "grad_norm": 0.07154365135057479, + "learning_rate": 2.5387895397428818e-06, + "loss": 0.5513, + "step": 5466 + }, + { + "epoch": 2.657021276595745, + "grad_norm": 0.07014841861279734, + "learning_rate": 2.5371230786433985e-06, + "loss": 0.4875, + "step": 5467 + }, + { + "epoch": 2.6575075987841945, + "grad_norm": 0.0724819852173919, + "learning_rate": 2.5354569786968486e-06, + "loss": 0.5523, + "step": 5468 + }, + { + "epoch": 2.657993920972644, + "grad_norm": 0.07177628213291166, + "learning_rate": 2.5337912401475453e-06, + "loss": 0.5144, + "step": 5469 + }, + { + "epoch": 2.6584802431610943, + "grad_norm": 0.06993684346586565, + "learning_rate": 2.5321258632397516e-06, + "loss": 0.5211, + "step": 5470 + }, + { + "epoch": 2.658966565349544, + "grad_norm": 0.07071627977417842, + "learning_rate": 2.530460848217675e-06, + "loss": 0.5609, + "step": 5471 + }, + { + "epoch": 2.659452887537994, + "grad_norm": 0.07050137630629093, + "learning_rate": 2.5287961953254712e-06, + "loss": 0.4928, + "step": 5472 + }, + { + "epoch": 2.659939209726444, + "grad_norm": 0.07293696855713029, + "learning_rate": 2.527131904807244e-06, + "loss": 0.5288, + "step": 5473 + }, + { + "epoch": 2.6604255319148935, + "grad_norm": 0.07419547537809282, + "learning_rate": 2.525467976907041e-06, + "loss": 0.5529, + "step": 5474 + }, + { + "epoch": 2.660911854103343, + "grad_norm": 0.07102323721474527, + "learning_rate": 2.523804411868857e-06, + "loss": 0.5315, + "step": 5475 + }, + { + "epoch": 2.6613981762917933, + "grad_norm": 0.07163897353856712, + "learning_rate": 2.522141209936641e-06, + "loss": 0.5191, + "step": 5476 + }, + { + "epoch": 2.661884498480243, + "grad_norm": 0.06956818849518469, + "learning_rate": 2.520478371354277e-06, + "loss": 0.5173, + "step": 5477 + }, + { + "epoch": 2.662370820668693, + "grad_norm": 0.07268633543083317, + "learning_rate": 2.5188158963656023e-06, + "loss": 0.514, + "step": 5478 + }, + { + "epoch": 2.662857142857143, + "grad_norm": 0.07384958300308643, + "learning_rate": 2.517153785214401e-06, + "loss": 0.5431, + "step": 5479 + }, + { + "epoch": 2.6633434650455925, + "grad_norm": 0.07235268669890155, + "learning_rate": 2.5154920381444026e-06, + "loss": 0.5252, + "step": 5480 + }, + { + "epoch": 2.6638297872340426, + "grad_norm": 0.07043298014929625, + "learning_rate": 2.513830655399283e-06, + "loss": 0.5168, + "step": 5481 + }, + { + "epoch": 2.6643161094224923, + "grad_norm": 0.07134391552701407, + "learning_rate": 2.512169637222666e-06, + "loss": 0.5306, + "step": 5482 + }, + { + "epoch": 2.6648024316109424, + "grad_norm": 0.0717307847963953, + "learning_rate": 2.51050898385812e-06, + "loss": 0.4981, + "step": 5483 + }, + { + "epoch": 2.665288753799392, + "grad_norm": 0.07252979963221534, + "learning_rate": 2.508848695549162e-06, + "loss": 0.5212, + "step": 5484 + }, + { + "epoch": 2.665775075987842, + "grad_norm": 0.07355436401453107, + "learning_rate": 2.507188772539254e-06, + "loss": 0.5423, + "step": 5485 + }, + { + "epoch": 2.666261398176292, + "grad_norm": 0.07526361711365165, + "learning_rate": 2.505529215071804e-06, + "loss": 0.5479, + "step": 5486 + }, + { + "epoch": 2.6667477203647416, + "grad_norm": 0.07165395763290587, + "learning_rate": 2.5038700233901684e-06, + "loss": 0.5363, + "step": 5487 + }, + { + "epoch": 2.6672340425531917, + "grad_norm": 0.07059084631046932, + "learning_rate": 2.5022111977376486e-06, + "loss": 0.5209, + "step": 5488 + }, + { + "epoch": 2.6677203647416414, + "grad_norm": 0.07234672794170492, + "learning_rate": 2.5005527383574925e-06, + "loss": 0.5256, + "step": 5489 + }, + { + "epoch": 2.668206686930091, + "grad_norm": 0.06837843636370457, + "learning_rate": 2.4988946454928934e-06, + "loss": 0.4836, + "step": 5490 + }, + { + "epoch": 2.668693009118541, + "grad_norm": 0.07390564917350201, + "learning_rate": 2.4972369193869935e-06, + "loss": 0.5437, + "step": 5491 + }, + { + "epoch": 2.669179331306991, + "grad_norm": 0.07348253210685227, + "learning_rate": 2.495579560282878e-06, + "loss": 0.5324, + "step": 5492 + }, + { + "epoch": 2.6696656534954406, + "grad_norm": 0.07141996806576167, + "learning_rate": 2.4939225684235814e-06, + "loss": 0.5332, + "step": 5493 + }, + { + "epoch": 2.6701519756838907, + "grad_norm": 0.0723361192948505, + "learning_rate": 2.4922659440520806e-06, + "loss": 0.5355, + "step": 5494 + }, + { + "epoch": 2.6706382978723404, + "grad_norm": 0.07345653446246092, + "learning_rate": 2.4906096874113023e-06, + "loss": 0.5086, + "step": 5495 + }, + { + "epoch": 2.67112462006079, + "grad_norm": 0.07135224528207111, + "learning_rate": 2.4889537987441177e-06, + "loss": 0.5265, + "step": 5496 + }, + { + "epoch": 2.6716109422492402, + "grad_norm": 0.07182902592095845, + "learning_rate": 2.487298278293343e-06, + "loss": 0.5255, + "step": 5497 + }, + { + "epoch": 2.67209726443769, + "grad_norm": 0.07494158870178803, + "learning_rate": 2.4856431263017427e-06, + "loss": 0.5333, + "step": 5498 + }, + { + "epoch": 2.67258358662614, + "grad_norm": 0.06840428343481617, + "learning_rate": 2.4839883430120253e-06, + "loss": 0.4895, + "step": 5499 + }, + { + "epoch": 2.6730699088145897, + "grad_norm": 0.06960352607855569, + "learning_rate": 2.4823339286668464e-06, + "loss": 0.5479, + "step": 5500 + }, + { + "epoch": 2.6735562310030394, + "grad_norm": 0.07030391637082783, + "learning_rate": 2.4806798835088066e-06, + "loss": 0.4812, + "step": 5501 + }, + { + "epoch": 2.674042553191489, + "grad_norm": 0.07171035050184746, + "learning_rate": 2.4790262077804534e-06, + "loss": 0.4921, + "step": 5502 + }, + { + "epoch": 2.674528875379939, + "grad_norm": 0.07211281686168014, + "learning_rate": 2.477372901724279e-06, + "loss": 0.5282, + "step": 5503 + }, + { + "epoch": 2.675015197568389, + "grad_norm": 0.07268259051770798, + "learning_rate": 2.475719965582722e-06, + "loss": 0.5542, + "step": 5504 + }, + { + "epoch": 2.675501519756839, + "grad_norm": 0.08048652171992439, + "learning_rate": 2.4740673995981672e-06, + "loss": 0.5848, + "step": 5505 + }, + { + "epoch": 2.6759878419452887, + "grad_norm": 0.07379128152650583, + "learning_rate": 2.4724152040129447e-06, + "loss": 0.5815, + "step": 5506 + }, + { + "epoch": 2.6764741641337384, + "grad_norm": 0.07148522054284791, + "learning_rate": 2.4707633790693296e-06, + "loss": 0.5676, + "step": 5507 + }, + { + "epoch": 2.6769604863221885, + "grad_norm": 0.07224678269640722, + "learning_rate": 2.4691119250095437e-06, + "loss": 0.4953, + "step": 5508 + }, + { + "epoch": 2.677446808510638, + "grad_norm": 0.07148156681707943, + "learning_rate": 2.467460842075756e-06, + "loss": 0.5086, + "step": 5509 + }, + { + "epoch": 2.6779331306990883, + "grad_norm": 0.07002738198610287, + "learning_rate": 2.4658101305100746e-06, + "loss": 0.5315, + "step": 5510 + }, + { + "epoch": 2.678419452887538, + "grad_norm": 0.07274020135726435, + "learning_rate": 2.4641597905545576e-06, + "loss": 0.5205, + "step": 5511 + }, + { + "epoch": 2.6789057750759877, + "grad_norm": 0.07094755233200126, + "learning_rate": 2.4625098224512136e-06, + "loss": 0.5457, + "step": 5512 + }, + { + "epoch": 2.679392097264438, + "grad_norm": 0.07162468971909133, + "learning_rate": 2.460860226441989e-06, + "loss": 0.5342, + "step": 5513 + }, + { + "epoch": 2.6798784194528875, + "grad_norm": 0.06958401149156632, + "learning_rate": 2.4592110027687777e-06, + "loss": 0.498, + "step": 5514 + }, + { + "epoch": 2.6803647416413376, + "grad_norm": 0.07136401504593012, + "learning_rate": 2.457562151673421e-06, + "loss": 0.519, + "step": 5515 + }, + { + "epoch": 2.6808510638297873, + "grad_norm": 0.07196541431784219, + "learning_rate": 2.4559136733977027e-06, + "loss": 0.5058, + "step": 5516 + }, + { + "epoch": 2.681337386018237, + "grad_norm": 0.07040794256120098, + "learning_rate": 2.454265568183355e-06, + "loss": 0.5307, + "step": 5517 + }, + { + "epoch": 2.6818237082066867, + "grad_norm": 0.0709257024571426, + "learning_rate": 2.4526178362720525e-06, + "loss": 0.5352, + "step": 5518 + }, + { + "epoch": 2.682310030395137, + "grad_norm": 0.07101514949193398, + "learning_rate": 2.450970477905417e-06, + "loss": 0.5268, + "step": 5519 + }, + { + "epoch": 2.6827963525835865, + "grad_norm": 0.07108833033756663, + "learning_rate": 2.449323493325015e-06, + "loss": 0.5531, + "step": 5520 + }, + { + "epoch": 2.6832826747720366, + "grad_norm": 0.06936878392460284, + "learning_rate": 2.4476768827723578e-06, + "loss": 0.5178, + "step": 5521 + }, + { + "epoch": 2.6837689969604863, + "grad_norm": 0.0716604086673739, + "learning_rate": 2.4460306464889023e-06, + "loss": 0.532, + "step": 5522 + }, + { + "epoch": 2.684255319148936, + "grad_norm": 0.07463544926494957, + "learning_rate": 2.4443847847160496e-06, + "loss": 0.5441, + "step": 5523 + }, + { + "epoch": 2.684741641337386, + "grad_norm": 0.07198906494769135, + "learning_rate": 2.44273929769515e-06, + "loss": 0.5186, + "step": 5524 + }, + { + "epoch": 2.685227963525836, + "grad_norm": 0.07025053527235166, + "learning_rate": 2.44109418566749e-06, + "loss": 0.5071, + "step": 5525 + }, + { + "epoch": 2.685714285714286, + "grad_norm": 0.07293297745366367, + "learning_rate": 2.4394494488743096e-06, + "loss": 0.5158, + "step": 5526 + }, + { + "epoch": 2.6862006079027356, + "grad_norm": 0.0691133111848275, + "learning_rate": 2.437805087556791e-06, + "loss": 0.4969, + "step": 5527 + }, + { + "epoch": 2.6866869300911853, + "grad_norm": 0.07390547638546005, + "learning_rate": 2.4361611019560604e-06, + "loss": 0.5474, + "step": 5528 + }, + { + "epoch": 2.687173252279635, + "grad_norm": 0.07174916433543181, + "learning_rate": 2.434517492313188e-06, + "loss": 0.5097, + "step": 5529 + }, + { + "epoch": 2.687659574468085, + "grad_norm": 0.07005905846385797, + "learning_rate": 2.4328742588691943e-06, + "loss": 0.5319, + "step": 5530 + }, + { + "epoch": 2.688145896656535, + "grad_norm": 0.07161187006550968, + "learning_rate": 2.431231401865039e-06, + "loss": 0.5531, + "step": 5531 + }, + { + "epoch": 2.688632218844985, + "grad_norm": 0.07093107406094733, + "learning_rate": 2.429588921541628e-06, + "loss": 0.5302, + "step": 5532 + }, + { + "epoch": 2.6891185410334346, + "grad_norm": 0.07327274548228252, + "learning_rate": 2.427946818139813e-06, + "loss": 0.5181, + "step": 5533 + }, + { + "epoch": 2.6896048632218843, + "grad_norm": 0.07110966723406568, + "learning_rate": 2.4263050919003896e-06, + "loss": 0.5293, + "step": 5534 + }, + { + "epoch": 2.6900911854103344, + "grad_norm": 0.0736781505637428, + "learning_rate": 2.424663743064098e-06, + "loss": 0.5439, + "step": 5535 + }, + { + "epoch": 2.690577507598784, + "grad_norm": 0.07541627188970834, + "learning_rate": 2.4230227718716236e-06, + "loss": 0.5403, + "step": 5536 + }, + { + "epoch": 2.6910638297872342, + "grad_norm": 0.06986709238970748, + "learning_rate": 2.421382178563596e-06, + "loss": 0.5453, + "step": 5537 + }, + { + "epoch": 2.691550151975684, + "grad_norm": 0.07320275588309096, + "learning_rate": 2.419741963380592e-06, + "loss": 0.5262, + "step": 5538 + }, + { + "epoch": 2.6920364741641336, + "grad_norm": 0.06986324858812706, + "learning_rate": 2.4181021265631266e-06, + "loss": 0.5027, + "step": 5539 + }, + { + "epoch": 2.6925227963525837, + "grad_norm": 0.07304783069393933, + "learning_rate": 2.4164626683516645e-06, + "loss": 0.5143, + "step": 5540 + }, + { + "epoch": 2.6930091185410334, + "grad_norm": 0.07433487188953387, + "learning_rate": 2.414823588986614e-06, + "loss": 0.5423, + "step": 5541 + }, + { + "epoch": 2.6934954407294835, + "grad_norm": 0.06984550945744551, + "learning_rate": 2.413184888708328e-06, + "loss": 0.5209, + "step": 5542 + }, + { + "epoch": 2.6939817629179332, + "grad_norm": 0.07082889998695244, + "learning_rate": 2.4115465677571028e-06, + "loss": 0.4847, + "step": 5543 + }, + { + "epoch": 2.694468085106383, + "grad_norm": 0.07075816786007702, + "learning_rate": 2.409908626373179e-06, + "loss": 0.5254, + "step": 5544 + }, + { + "epoch": 2.6949544072948326, + "grad_norm": 0.06965501234774592, + "learning_rate": 2.4082710647967433e-06, + "loss": 0.5289, + "step": 5545 + }, + { + "epoch": 2.6954407294832827, + "grad_norm": 0.06882764416383183, + "learning_rate": 2.4066338832679247e-06, + "loss": 0.4929, + "step": 5546 + }, + { + "epoch": 2.6959270516717324, + "grad_norm": 0.06989379449315315, + "learning_rate": 2.4049970820267955e-06, + "loss": 0.5305, + "step": 5547 + }, + { + "epoch": 2.6964133738601825, + "grad_norm": 0.07065726726124981, + "learning_rate": 2.403360661313378e-06, + "loss": 0.5114, + "step": 5548 + }, + { + "epoch": 2.6968996960486322, + "grad_norm": 0.07201786486434365, + "learning_rate": 2.4017246213676327e-06, + "loss": 0.5113, + "step": 5549 + }, + { + "epoch": 2.697386018237082, + "grad_norm": 0.07164575621587971, + "learning_rate": 2.4000889624294665e-06, + "loss": 0.5153, + "step": 5550 + }, + { + "epoch": 2.697872340425532, + "grad_norm": 0.07729564282718766, + "learning_rate": 2.3984536847387297e-06, + "loss": 0.5508, + "step": 5551 + }, + { + "epoch": 2.6983586626139817, + "grad_norm": 0.07260408882671729, + "learning_rate": 2.3968187885352177e-06, + "loss": 0.5447, + "step": 5552 + }, + { + "epoch": 2.698844984802432, + "grad_norm": 0.06809081848991765, + "learning_rate": 2.3951842740586713e-06, + "loss": 0.4852, + "step": 5553 + }, + { + "epoch": 2.6993313069908815, + "grad_norm": 0.07183952533858455, + "learning_rate": 2.3935501415487695e-06, + "loss": 0.5079, + "step": 5554 + }, + { + "epoch": 2.699817629179331, + "grad_norm": 0.07128041658867738, + "learning_rate": 2.391916391245141e-06, + "loss": 0.537, + "step": 5555 + }, + { + "epoch": 2.700303951367781, + "grad_norm": 0.07102924837917349, + "learning_rate": 2.3902830233873576e-06, + "loss": 0.5233, + "step": 5556 + }, + { + "epoch": 2.700790273556231, + "grad_norm": 0.07198285070901485, + "learning_rate": 2.388650038214933e-06, + "loss": 0.5465, + "step": 5557 + }, + { + "epoch": 2.7012765957446807, + "grad_norm": 0.07059293976643284, + "learning_rate": 2.3870174359673265e-06, + "loss": 0.5278, + "step": 5558 + }, + { + "epoch": 2.701762917933131, + "grad_norm": 0.0704532376320318, + "learning_rate": 2.3853852168839405e-06, + "loss": 0.5377, + "step": 5559 + }, + { + "epoch": 2.7022492401215805, + "grad_norm": 0.07405614468932602, + "learning_rate": 2.3837533812041215e-06, + "loss": 0.5281, + "step": 5560 + }, + { + "epoch": 2.70273556231003, + "grad_norm": 0.06904732661446518, + "learning_rate": 2.38212192916716e-06, + "loss": 0.5085, + "step": 5561 + }, + { + "epoch": 2.7032218844984803, + "grad_norm": 0.07158806723325084, + "learning_rate": 2.3804908610122897e-06, + "loss": 0.494, + "step": 5562 + }, + { + "epoch": 2.70370820668693, + "grad_norm": 0.07248825127993813, + "learning_rate": 2.378860176978688e-06, + "loss": 0.5232, + "step": 5563 + }, + { + "epoch": 2.70419452887538, + "grad_norm": 0.07019742536519977, + "learning_rate": 2.377229877305476e-06, + "loss": 0.4804, + "step": 5564 + }, + { + "epoch": 2.70468085106383, + "grad_norm": 0.0743684617059333, + "learning_rate": 2.375599962231717e-06, + "loss": 0.5729, + "step": 5565 + }, + { + "epoch": 2.7051671732522795, + "grad_norm": 0.07020128103353578, + "learning_rate": 2.373970431996424e-06, + "loss": 0.5437, + "step": 5566 + }, + { + "epoch": 2.7056534954407296, + "grad_norm": 0.06930938199853524, + "learning_rate": 2.3723412868385463e-06, + "loss": 0.5031, + "step": 5567 + }, + { + "epoch": 2.7061398176291793, + "grad_norm": 0.07223579879131732, + "learning_rate": 2.3707125269969814e-06, + "loss": 0.5095, + "step": 5568 + }, + { + "epoch": 2.7066261398176295, + "grad_norm": 0.0719653038090186, + "learning_rate": 2.3690841527105658e-06, + "loss": 0.5364, + "step": 5569 + }, + { + "epoch": 2.707112462006079, + "grad_norm": 0.06755525438650527, + "learning_rate": 2.3674561642180826e-06, + "loss": 0.4909, + "step": 5570 + }, + { + "epoch": 2.707598784194529, + "grad_norm": 0.07029513463711251, + "learning_rate": 2.365828561758259e-06, + "loss": 0.5249, + "step": 5571 + }, + { + "epoch": 2.7080851063829785, + "grad_norm": 0.07209432010500018, + "learning_rate": 2.3642013455697633e-06, + "loss": 0.5425, + "step": 5572 + }, + { + "epoch": 2.7085714285714286, + "grad_norm": 0.07448050151918781, + "learning_rate": 2.3625745158912083e-06, + "loss": 0.5562, + "step": 5573 + }, + { + "epoch": 2.7090577507598783, + "grad_norm": 0.07373010631886454, + "learning_rate": 2.360948072961151e-06, + "loss": 0.5313, + "step": 5574 + }, + { + "epoch": 2.7095440729483284, + "grad_norm": 0.06972371224639265, + "learning_rate": 2.3593220170180907e-06, + "loss": 0.5047, + "step": 5575 + }, + { + "epoch": 2.710030395136778, + "grad_norm": 0.0723323149363478, + "learning_rate": 2.3576963483004695e-06, + "loss": 0.5209, + "step": 5576 + }, + { + "epoch": 2.710516717325228, + "grad_norm": 0.07224878616671035, + "learning_rate": 2.3560710670466736e-06, + "loss": 0.5091, + "step": 5577 + }, + { + "epoch": 2.711003039513678, + "grad_norm": 0.06951460608906329, + "learning_rate": 2.354446173495032e-06, + "loss": 0.4877, + "step": 5578 + }, + { + "epoch": 2.7114893617021276, + "grad_norm": 0.06902839979461814, + "learning_rate": 2.3528216678838167e-06, + "loss": 0.5088, + "step": 5579 + }, + { + "epoch": 2.7119756838905777, + "grad_norm": 0.07326065318949865, + "learning_rate": 2.351197550451243e-06, + "loss": 0.5157, + "step": 5580 + }, + { + "epoch": 2.7124620060790274, + "grad_norm": 0.07070154790644714, + "learning_rate": 2.349573821435469e-06, + "loss": 0.5108, + "step": 5581 + }, + { + "epoch": 2.712948328267477, + "grad_norm": 0.0716585021239851, + "learning_rate": 2.3479504810745974e-06, + "loss": 0.5149, + "step": 5582 + }, + { + "epoch": 2.713434650455927, + "grad_norm": 0.07281089782751654, + "learning_rate": 2.3463275296066714e-06, + "loss": 0.5311, + "step": 5583 + }, + { + "epoch": 2.713920972644377, + "grad_norm": 0.06873241475266974, + "learning_rate": 2.344704967269678e-06, + "loss": 0.5035, + "step": 5584 + }, + { + "epoch": 2.7144072948328266, + "grad_norm": 0.07011966025948382, + "learning_rate": 2.3430827943015494e-06, + "loss": 0.5286, + "step": 5585 + }, + { + "epoch": 2.7148936170212767, + "grad_norm": 0.07632790413093576, + "learning_rate": 2.341461010940157e-06, + "loss": 0.5298, + "step": 5586 + }, + { + "epoch": 2.7153799392097264, + "grad_norm": 0.07078491865452298, + "learning_rate": 2.339839617423318e-06, + "loss": 0.5321, + "step": 5587 + }, + { + "epoch": 2.715866261398176, + "grad_norm": 0.06985087531831614, + "learning_rate": 2.3382186139887907e-06, + "loss": 0.5166, + "step": 5588 + }, + { + "epoch": 2.7163525835866262, + "grad_norm": 0.07268927934417703, + "learning_rate": 2.336598000874277e-06, + "loss": 0.5353, + "step": 5589 + }, + { + "epoch": 2.716838905775076, + "grad_norm": 0.06954367265126185, + "learning_rate": 2.3349777783174215e-06, + "loss": 0.5081, + "step": 5590 + }, + { + "epoch": 2.717325227963526, + "grad_norm": 0.07109153459356275, + "learning_rate": 2.333357946555812e-06, + "loss": 0.5051, + "step": 5591 + }, + { + "epoch": 2.7178115501519757, + "grad_norm": 0.07053609850390538, + "learning_rate": 2.3317385058269776e-06, + "loss": 0.5118, + "step": 5592 + }, + { + "epoch": 2.7182978723404254, + "grad_norm": 0.07361033838033207, + "learning_rate": 2.3301194563683914e-06, + "loss": 0.568, + "step": 5593 + }, + { + "epoch": 2.7187841945288755, + "grad_norm": 0.07481171847106032, + "learning_rate": 2.3285007984174686e-06, + "loss": 0.5372, + "step": 5594 + }, + { + "epoch": 2.7192705167173252, + "grad_norm": 0.07365341846223933, + "learning_rate": 2.3268825322115662e-06, + "loss": 0.5754, + "step": 5595 + }, + { + "epoch": 2.7197568389057754, + "grad_norm": 0.07022134671005106, + "learning_rate": 2.3252646579879856e-06, + "loss": 0.5142, + "step": 5596 + }, + { + "epoch": 2.720243161094225, + "grad_norm": 0.07003090411619539, + "learning_rate": 2.323647175983969e-06, + "loss": 0.5103, + "step": 5597 + }, + { + "epoch": 2.7207294832826747, + "grad_norm": 0.07080753856763596, + "learning_rate": 2.3220300864367023e-06, + "loss": 0.5007, + "step": 5598 + }, + { + "epoch": 2.7212158054711244, + "grad_norm": 0.07017405179048752, + "learning_rate": 2.320413389583313e-06, + "loss": 0.5339, + "step": 5599 + }, + { + "epoch": 2.7217021276595745, + "grad_norm": 0.070600487568738, + "learning_rate": 2.318797085660871e-06, + "loss": 0.5135, + "step": 5600 + }, + { + "epoch": 2.722188449848024, + "grad_norm": 0.07090042371628778, + "learning_rate": 2.3171811749063915e-06, + "loss": 0.5456, + "step": 5601 + }, + { + "epoch": 2.7226747720364743, + "grad_norm": 0.06950781471709777, + "learning_rate": 2.3155656575568235e-06, + "loss": 0.4862, + "step": 5602 + }, + { + "epoch": 2.723161094224924, + "grad_norm": 0.07287528728892682, + "learning_rate": 2.3139505338490703e-06, + "loss": 0.5338, + "step": 5603 + }, + { + "epoch": 2.7236474164133737, + "grad_norm": 0.06975011693382906, + "learning_rate": 2.312335804019969e-06, + "loss": 0.4974, + "step": 5604 + }, + { + "epoch": 2.724133738601824, + "grad_norm": 0.0713308710116638, + "learning_rate": 2.3107214683063016e-06, + "loss": 0.5381, + "step": 5605 + }, + { + "epoch": 2.7246200607902735, + "grad_norm": 0.07097525191371563, + "learning_rate": 2.309107526944792e-06, + "loss": 0.5391, + "step": 5606 + }, + { + "epoch": 2.7251063829787237, + "grad_norm": 0.07264474672484772, + "learning_rate": 2.307493980172106e-06, + "loss": 0.5286, + "step": 5607 + }, + { + "epoch": 2.7255927051671733, + "grad_norm": 0.07195173921207297, + "learning_rate": 2.305880828224853e-06, + "loss": 0.523, + "step": 5608 + }, + { + "epoch": 2.726079027355623, + "grad_norm": 0.07126396038957458, + "learning_rate": 2.3042680713395827e-06, + "loss": 0.5382, + "step": 5609 + }, + { + "epoch": 2.7265653495440727, + "grad_norm": 0.06793950166223107, + "learning_rate": 2.3026557097527876e-06, + "loss": 0.4953, + "step": 5610 + }, + { + "epoch": 2.727051671732523, + "grad_norm": 0.06919454588343514, + "learning_rate": 2.3010437437009024e-06, + "loss": 0.5049, + "step": 5611 + }, + { + "epoch": 2.7275379939209725, + "grad_norm": 0.07001554335212819, + "learning_rate": 2.2994321734203033e-06, + "loss": 0.527, + "step": 5612 + }, + { + "epoch": 2.7280243161094226, + "grad_norm": 0.06837297417544108, + "learning_rate": 2.2978209991473087e-06, + "loss": 0.4849, + "step": 5613 + }, + { + "epoch": 2.7285106382978723, + "grad_norm": 0.07098178413468569, + "learning_rate": 2.2962102211181804e-06, + "loss": 0.5339, + "step": 5614 + }, + { + "epoch": 2.728996960486322, + "grad_norm": 0.07066720128928897, + "learning_rate": 2.2945998395691184e-06, + "loss": 0.5244, + "step": 5615 + }, + { + "epoch": 2.729483282674772, + "grad_norm": 0.07113858815212676, + "learning_rate": 2.2929898547362704e-06, + "loss": 0.5362, + "step": 5616 + }, + { + "epoch": 2.729969604863222, + "grad_norm": 0.07209782526530836, + "learning_rate": 2.2913802668557184e-06, + "loss": 0.5336, + "step": 5617 + }, + { + "epoch": 2.730455927051672, + "grad_norm": 0.07330370384198436, + "learning_rate": 2.2897710761634915e-06, + "loss": 0.5103, + "step": 5618 + }, + { + "epoch": 2.7309422492401216, + "grad_norm": 0.07166970053568655, + "learning_rate": 2.2881622828955596e-06, + "loss": 0.5067, + "step": 5619 + }, + { + "epoch": 2.7314285714285713, + "grad_norm": 0.07989330839393682, + "learning_rate": 2.2865538872878323e-06, + "loss": 0.5246, + "step": 5620 + }, + { + "epoch": 2.731914893617021, + "grad_norm": 0.06971311989823643, + "learning_rate": 2.284945889576166e-06, + "loss": 0.5022, + "step": 5621 + }, + { + "epoch": 2.732401215805471, + "grad_norm": 0.07238271284131334, + "learning_rate": 2.2833382899963535e-06, + "loss": 0.5173, + "step": 5622 + }, + { + "epoch": 2.732887537993921, + "grad_norm": 0.07277622360808055, + "learning_rate": 2.2817310887841317e-06, + "loss": 0.5278, + "step": 5623 + }, + { + "epoch": 2.733373860182371, + "grad_norm": 0.07306060202304211, + "learning_rate": 2.2801242861751764e-06, + "loss": 0.5362, + "step": 5624 + }, + { + "epoch": 2.7338601823708206, + "grad_norm": 0.07142415733875977, + "learning_rate": 2.278517882405109e-06, + "loss": 0.5305, + "step": 5625 + }, + { + "epoch": 2.7343465045592703, + "grad_norm": 0.07339428420585087, + "learning_rate": 2.27691187770949e-06, + "loss": 0.5098, + "step": 5626 + }, + { + "epoch": 2.7348328267477204, + "grad_norm": 0.07122928974435357, + "learning_rate": 2.275306272323821e-06, + "loss": 0.554, + "step": 5627 + }, + { + "epoch": 2.73531914893617, + "grad_norm": 0.07150665122920959, + "learning_rate": 2.2737010664835463e-06, + "loss": 0.5146, + "step": 5628 + }, + { + "epoch": 2.7358054711246202, + "grad_norm": 0.08413665437670209, + "learning_rate": 2.2720962604240507e-06, + "loss": 0.5171, + "step": 5629 + }, + { + "epoch": 2.73629179331307, + "grad_norm": 0.0695353502480389, + "learning_rate": 2.270491854380664e-06, + "loss": 0.4989, + "step": 5630 + }, + { + "epoch": 2.7367781155015196, + "grad_norm": 0.07134401407134096, + "learning_rate": 2.2688878485886485e-06, + "loss": 0.5136, + "step": 5631 + }, + { + "epoch": 2.7372644376899697, + "grad_norm": 0.07457788954707786, + "learning_rate": 2.267284243283216e-06, + "loss": 0.5565, + "step": 5632 + }, + { + "epoch": 2.7377507598784194, + "grad_norm": 0.07236904847217807, + "learning_rate": 2.2656810386995177e-06, + "loss": 0.5319, + "step": 5633 + }, + { + "epoch": 2.7382370820668696, + "grad_norm": 0.07060571139008473, + "learning_rate": 2.264078235072645e-06, + "loss": 0.5054, + "step": 5634 + }, + { + "epoch": 2.7387234042553192, + "grad_norm": 0.07177170374485324, + "learning_rate": 2.2624758326376302e-06, + "loss": 0.5148, + "step": 5635 + }, + { + "epoch": 2.739209726443769, + "grad_norm": 0.07045058052696496, + "learning_rate": 2.260873831629448e-06, + "loss": 0.4928, + "step": 5636 + }, + { + "epoch": 2.7396960486322186, + "grad_norm": 0.0712138974195529, + "learning_rate": 2.2592722322830134e-06, + "loss": 0.5102, + "step": 5637 + }, + { + "epoch": 2.7401823708206687, + "grad_norm": 0.07170392369198372, + "learning_rate": 2.257671034833181e-06, + "loss": 0.5146, + "step": 5638 + }, + { + "epoch": 2.7406686930091184, + "grad_norm": 0.07018899215736679, + "learning_rate": 2.2560702395147525e-06, + "loss": 0.5195, + "step": 5639 + }, + { + "epoch": 2.7411550151975685, + "grad_norm": 0.07570360447055016, + "learning_rate": 2.2544698465624636e-06, + "loss": 0.5309, + "step": 5640 + }, + { + "epoch": 2.7416413373860182, + "grad_norm": 0.07191305688364757, + "learning_rate": 2.252869856210994e-06, + "loss": 0.5303, + "step": 5641 + }, + { + "epoch": 2.742127659574468, + "grad_norm": 0.07138258218868866, + "learning_rate": 2.251270268694965e-06, + "loss": 0.5163, + "step": 5642 + }, + { + "epoch": 2.742613981762918, + "grad_norm": 0.07240392461843162, + "learning_rate": 2.2496710842489366e-06, + "loss": 0.5344, + "step": 5643 + }, + { + "epoch": 2.7431003039513677, + "grad_norm": 0.0726514924909096, + "learning_rate": 2.2480723031074115e-06, + "loss": 0.5244, + "step": 5644 + }, + { + "epoch": 2.743586626139818, + "grad_norm": 0.07269021595662892, + "learning_rate": 2.246473925504835e-06, + "loss": 0.5656, + "step": 5645 + }, + { + "epoch": 2.7440729483282675, + "grad_norm": 0.07214932848795096, + "learning_rate": 2.2448759516755875e-06, + "loss": 0.5682, + "step": 5646 + }, + { + "epoch": 2.744559270516717, + "grad_norm": 0.07191148846468028, + "learning_rate": 2.2432783818539943e-06, + "loss": 0.5371, + "step": 5647 + }, + { + "epoch": 2.745045592705167, + "grad_norm": 0.07381359296925709, + "learning_rate": 2.2416812162743223e-06, + "loss": 0.5602, + "step": 5648 + }, + { + "epoch": 2.745531914893617, + "grad_norm": 0.07175112283130808, + "learning_rate": 2.2400844551707775e-06, + "loss": 0.5234, + "step": 5649 + }, + { + "epoch": 2.7460182370820667, + "grad_norm": 0.07346820404330406, + "learning_rate": 2.238488098777506e-06, + "loss": 0.5373, + "step": 5650 + }, + { + "epoch": 2.746504559270517, + "grad_norm": 0.07105938320022662, + "learning_rate": 2.236892147328596e-06, + "loss": 0.5204, + "step": 5651 + }, + { + "epoch": 2.7469908814589665, + "grad_norm": 0.07105792234366043, + "learning_rate": 2.235296601058075e-06, + "loss": 0.4975, + "step": 5652 + }, + { + "epoch": 2.747477203647416, + "grad_norm": 0.06846726381934992, + "learning_rate": 2.2337014601999126e-06, + "loss": 0.489, + "step": 5653 + }, + { + "epoch": 2.7479635258358663, + "grad_norm": 0.07091550834247812, + "learning_rate": 2.2321067249880174e-06, + "loss": 0.5415, + "step": 5654 + }, + { + "epoch": 2.7479635258358663, + "eval_loss": 0.5694078803062439, + "eval_runtime": 105.092, + "eval_samples_per_second": 288.823, + "eval_steps_per_second": 36.111, + "step": 5654 + }, + { + "epoch": 2.748449848024316, + "grad_norm": 0.07141290534589537, + "learning_rate": 2.23051239565624e-06, + "loss": 0.513, + "step": 5655 + }, + { + "epoch": 2.748936170212766, + "grad_norm": 0.07638192578767015, + "learning_rate": 2.228918472438367e-06, + "loss": 0.5621, + "step": 5656 + }, + { + "epoch": 2.749422492401216, + "grad_norm": 0.07102926799877714, + "learning_rate": 2.2273249555681353e-06, + "loss": 0.5069, + "step": 5657 + }, + { + "epoch": 2.7499088145896655, + "grad_norm": 0.07203318290741409, + "learning_rate": 2.2257318452792125e-06, + "loss": 0.5403, + "step": 5658 + }, + { + "epoch": 2.7503951367781156, + "grad_norm": 0.0723852810533683, + "learning_rate": 2.224139141805211e-06, + "loss": 0.5342, + "step": 5659 + }, + { + "epoch": 2.7508814589665653, + "grad_norm": 0.07043355093162192, + "learning_rate": 2.2225468453796845e-06, + "loss": 0.523, + "step": 5660 + }, + { + "epoch": 2.7513677811550155, + "grad_norm": 0.0722684863195179, + "learning_rate": 2.220954956236121e-06, + "loss": 0.5052, + "step": 5661 + }, + { + "epoch": 2.751854103343465, + "grad_norm": 0.06666211191068558, + "learning_rate": 2.2193634746079547e-06, + "loss": 0.4631, + "step": 5662 + }, + { + "epoch": 2.752340425531915, + "grad_norm": 0.06917323312668554, + "learning_rate": 2.217772400728559e-06, + "loss": 0.4783, + "step": 5663 + }, + { + "epoch": 2.7528267477203645, + "grad_norm": 0.07073418099752314, + "learning_rate": 2.216181734831246e-06, + "loss": 0.5492, + "step": 5664 + }, + { + "epoch": 2.7533130699088146, + "grad_norm": 0.07056968473260043, + "learning_rate": 2.2145914771492695e-06, + "loss": 0.5046, + "step": 5665 + }, + { + "epoch": 2.7537993920972643, + "grad_norm": 0.06975736567621126, + "learning_rate": 2.213001627915823e-06, + "loss": 0.4997, + "step": 5666 + }, + { + "epoch": 2.7542857142857144, + "grad_norm": 0.07185033001928948, + "learning_rate": 2.211412187364038e-06, + "loss": 0.5482, + "step": 5667 + }, + { + "epoch": 2.754772036474164, + "grad_norm": 0.07018843701639142, + "learning_rate": 2.2098231557269904e-06, + "loss": 0.5228, + "step": 5668 + }, + { + "epoch": 2.755258358662614, + "grad_norm": 0.07489600556079372, + "learning_rate": 2.208234533237692e-06, + "loss": 0.5334, + "step": 5669 + }, + { + "epoch": 2.755744680851064, + "grad_norm": 0.07143307120942906, + "learning_rate": 2.206646320129097e-06, + "loss": 0.5555, + "step": 5670 + }, + { + "epoch": 2.7562310030395136, + "grad_norm": 0.07121134469861118, + "learning_rate": 2.2050585166340983e-06, + "loss": 0.5005, + "step": 5671 + }, + { + "epoch": 2.7567173252279638, + "grad_norm": 0.06980461310639734, + "learning_rate": 2.2034711229855294e-06, + "loss": 0.5037, + "step": 5672 + }, + { + "epoch": 2.7572036474164134, + "grad_norm": 0.07513261524121989, + "learning_rate": 2.201884139416163e-06, + "loss": 0.5456, + "step": 5673 + }, + { + "epoch": 2.757689969604863, + "grad_norm": 0.06958029805555384, + "learning_rate": 2.200297566158714e-06, + "loss": 0.4997, + "step": 5674 + }, + { + "epoch": 2.758176291793313, + "grad_norm": 0.07100101880619507, + "learning_rate": 2.1987114034458334e-06, + "loss": 0.5096, + "step": 5675 + }, + { + "epoch": 2.758662613981763, + "grad_norm": 0.07142993279008687, + "learning_rate": 2.197125651510115e-06, + "loss": 0.5402, + "step": 5676 + }, + { + "epoch": 2.7591489361702126, + "grad_norm": 0.07355139146509079, + "learning_rate": 2.195540310584091e-06, + "loss": 0.5544, + "step": 5677 + }, + { + "epoch": 2.7596352583586627, + "grad_norm": 0.07604200028932977, + "learning_rate": 2.193955380900234e-06, + "loss": 0.5734, + "step": 5678 + }, + { + "epoch": 2.7601215805471124, + "grad_norm": 0.07162264768455495, + "learning_rate": 2.1923708626909556e-06, + "loss": 0.5379, + "step": 5679 + }, + { + "epoch": 2.760607902735562, + "grad_norm": 0.07280155808718544, + "learning_rate": 2.1907867561886072e-06, + "loss": 0.488, + "step": 5680 + }, + { + "epoch": 2.7610942249240122, + "grad_norm": 0.07179248641820297, + "learning_rate": 2.1892030616254806e-06, + "loss": 0.5219, + "step": 5681 + }, + { + "epoch": 2.761580547112462, + "grad_norm": 0.07289671255835094, + "learning_rate": 2.187619779233806e-06, + "loss": 0.5288, + "step": 5682 + }, + { + "epoch": 2.762066869300912, + "grad_norm": 0.07128508152861099, + "learning_rate": 2.1860369092457538e-06, + "loss": 0.5193, + "step": 5683 + }, + { + "epoch": 2.7625531914893617, + "grad_norm": 0.07621602651666413, + "learning_rate": 2.1844544518934347e-06, + "loss": 0.5894, + "step": 5684 + }, + { + "epoch": 2.7630395136778114, + "grad_norm": 0.07324615745984062, + "learning_rate": 2.1828724074088974e-06, + "loss": 0.5413, + "step": 5685 + }, + { + "epoch": 2.7635258358662615, + "grad_norm": 0.0687093950885111, + "learning_rate": 2.181290776024131e-06, + "loss": 0.4925, + "step": 5686 + }, + { + "epoch": 2.7640121580547112, + "grad_norm": 0.07157969474799021, + "learning_rate": 2.1797095579710635e-06, + "loss": 0.5287, + "step": 5687 + }, + { + "epoch": 2.7644984802431614, + "grad_norm": 0.0709592813701329, + "learning_rate": 2.178128753481563e-06, + "loss": 0.5144, + "step": 5688 + }, + { + "epoch": 2.764984802431611, + "grad_norm": 0.07059597497068189, + "learning_rate": 2.1765483627874367e-06, + "loss": 0.5217, + "step": 5689 + }, + { + "epoch": 2.7654711246200607, + "grad_norm": 0.06934498745626758, + "learning_rate": 2.17496838612043e-06, + "loss": 0.5057, + "step": 5690 + }, + { + "epoch": 2.7659574468085104, + "grad_norm": 0.07260755429842425, + "learning_rate": 2.17338882371223e-06, + "loss": 0.542, + "step": 5691 + }, + { + "epoch": 2.7664437689969605, + "grad_norm": 0.07227348678611166, + "learning_rate": 2.1718096757944595e-06, + "loss": 0.5525, + "step": 5692 + }, + { + "epoch": 2.7669300911854102, + "grad_norm": 0.07359647639091586, + "learning_rate": 2.1702309425986844e-06, + "loss": 0.5761, + "step": 5693 + }, + { + "epoch": 2.7674164133738604, + "grad_norm": 0.06963198455221663, + "learning_rate": 2.168652624356407e-06, + "loss": 0.5067, + "step": 5694 + }, + { + "epoch": 2.76790273556231, + "grad_norm": 0.07089314811044034, + "learning_rate": 2.1670747212990713e-06, + "loss": 0.5509, + "step": 5695 + }, + { + "epoch": 2.7683890577507597, + "grad_norm": 0.07457690545924978, + "learning_rate": 2.1654972336580564e-06, + "loss": 0.592, + "step": 5696 + }, + { + "epoch": 2.76887537993921, + "grad_norm": 0.07034505457662908, + "learning_rate": 2.163920161664685e-06, + "loss": 0.4911, + "step": 5697 + }, + { + "epoch": 2.7693617021276595, + "grad_norm": 0.07317829353500106, + "learning_rate": 2.162343505550216e-06, + "loss": 0.5376, + "step": 5698 + }, + { + "epoch": 2.7698480243161097, + "grad_norm": 0.07272812315437173, + "learning_rate": 2.160767265545848e-06, + "loss": 0.5767, + "step": 5699 + }, + { + "epoch": 2.7703343465045593, + "grad_norm": 0.07097499352682997, + "learning_rate": 2.1591914418827186e-06, + "loss": 0.5181, + "step": 5700 + }, + { + "epoch": 2.770820668693009, + "grad_norm": 0.07338895054972039, + "learning_rate": 2.1576160347919057e-06, + "loss": 0.5079, + "step": 5701 + }, + { + "epoch": 2.7713069908814587, + "grad_norm": 0.07340714535159648, + "learning_rate": 2.156041044504423e-06, + "loss": 0.5499, + "step": 5702 + }, + { + "epoch": 2.771793313069909, + "grad_norm": 0.07228642072474788, + "learning_rate": 2.154466471251226e-06, + "loss": 0.5442, + "step": 5703 + }, + { + "epoch": 2.7722796352583585, + "grad_norm": 0.07261309153864731, + "learning_rate": 2.1528923152632082e-06, + "loss": 0.5504, + "step": 5704 + }, + { + "epoch": 2.7727659574468086, + "grad_norm": 0.07080196355380952, + "learning_rate": 2.1513185767712007e-06, + "loss": 0.4914, + "step": 5705 + }, + { + "epoch": 2.7732522796352583, + "grad_norm": 0.07004859884970265, + "learning_rate": 2.1497452560059756e-06, + "loss": 0.5034, + "step": 5706 + }, + { + "epoch": 2.773738601823708, + "grad_norm": 0.07091009165202827, + "learning_rate": 2.1481723531982417e-06, + "loss": 0.549, + "step": 5707 + }, + { + "epoch": 2.774224924012158, + "grad_norm": 0.07295709553380648, + "learning_rate": 2.146599868578649e-06, + "loss": 0.5187, + "step": 5708 + }, + { + "epoch": 2.774711246200608, + "grad_norm": 0.0709791895901247, + "learning_rate": 2.1450278023777823e-06, + "loss": 0.5067, + "step": 5709 + }, + { + "epoch": 2.775197568389058, + "grad_norm": 0.0714091929739096, + "learning_rate": 2.1434561548261666e-06, + "loss": 0.5226, + "step": 5710 + }, + { + "epoch": 2.7756838905775076, + "grad_norm": 0.06989281216328769, + "learning_rate": 2.1418849261542667e-06, + "loss": 0.5234, + "step": 5711 + }, + { + "epoch": 2.7761702127659573, + "grad_norm": 0.06963062486462816, + "learning_rate": 2.1403141165924877e-06, + "loss": 0.5403, + "step": 5712 + }, + { + "epoch": 2.7766565349544075, + "grad_norm": 0.07047935292102145, + "learning_rate": 2.1387437263711702e-06, + "loss": 0.5158, + "step": 5713 + }, + { + "epoch": 2.777142857142857, + "grad_norm": 0.07346388177391716, + "learning_rate": 2.1371737557205928e-06, + "loss": 0.5828, + "step": 5714 + }, + { + "epoch": 2.7776291793313073, + "grad_norm": 0.06995173229130977, + "learning_rate": 2.135604204870975e-06, + "loss": 0.5251, + "step": 5715 + }, + { + "epoch": 2.778115501519757, + "grad_norm": 0.07026410882101986, + "learning_rate": 2.1340350740524735e-06, + "loss": 0.5198, + "step": 5716 + }, + { + "epoch": 2.7786018237082066, + "grad_norm": 0.07176639094885762, + "learning_rate": 2.1324663634951826e-06, + "loss": 0.5547, + "step": 5717 + }, + { + "epoch": 2.7790881458966563, + "grad_norm": 0.0701402835214612, + "learning_rate": 2.130898073429137e-06, + "loss": 0.5136, + "step": 5718 + }, + { + "epoch": 2.7795744680851064, + "grad_norm": 0.07055392906838966, + "learning_rate": 2.1293302040843073e-06, + "loss": 0.5223, + "step": 5719 + }, + { + "epoch": 2.780060790273556, + "grad_norm": 0.07153922747116245, + "learning_rate": 2.1277627556906057e-06, + "loss": 0.5371, + "step": 5720 + }, + { + "epoch": 2.7805471124620063, + "grad_norm": 0.07265716916318489, + "learning_rate": 2.1261957284778784e-06, + "loss": 0.5192, + "step": 5721 + }, + { + "epoch": 2.781033434650456, + "grad_norm": 0.07098168504086935, + "learning_rate": 2.1246291226759157e-06, + "loss": 0.5142, + "step": 5722 + }, + { + "epoch": 2.7815197568389056, + "grad_norm": 0.07136917271299496, + "learning_rate": 2.1230629385144388e-06, + "loss": 0.5155, + "step": 5723 + }, + { + "epoch": 2.7820060790273557, + "grad_norm": 0.07084188270269456, + "learning_rate": 2.1214971762231113e-06, + "loss": 0.5048, + "step": 5724 + }, + { + "epoch": 2.7824924012158054, + "grad_norm": 0.07054557304479277, + "learning_rate": 2.1199318360315356e-06, + "loss": 0.516, + "step": 5725 + }, + { + "epoch": 2.7829787234042556, + "grad_norm": 0.07242674971400906, + "learning_rate": 2.118366918169251e-06, + "loss": 0.5614, + "step": 5726 + }, + { + "epoch": 2.7834650455927052, + "grad_norm": 0.07352127403953698, + "learning_rate": 2.1168024228657345e-06, + "loss": 0.5716, + "step": 5727 + }, + { + "epoch": 2.783951367781155, + "grad_norm": 0.06993833048638261, + "learning_rate": 2.115238350350402e-06, + "loss": 0.5034, + "step": 5728 + }, + { + "epoch": 2.7844376899696046, + "grad_norm": 0.07044432038973897, + "learning_rate": 2.1136747008526055e-06, + "loss": 0.5012, + "step": 5729 + }, + { + "epoch": 2.7849240121580547, + "grad_norm": 0.07217061610508575, + "learning_rate": 2.1121114746016386e-06, + "loss": 0.4989, + "step": 5730 + }, + { + "epoch": 2.7854103343465044, + "grad_norm": 0.07452163843632588, + "learning_rate": 2.1105486718267304e-06, + "loss": 0.5715, + "step": 5731 + }, + { + "epoch": 2.7858966565349546, + "grad_norm": 0.0694893833016534, + "learning_rate": 2.1089862927570474e-06, + "loss": 0.5043, + "step": 5732 + }, + { + "epoch": 2.7863829787234042, + "grad_norm": 0.07303068781298948, + "learning_rate": 2.1074243376216947e-06, + "loss": 0.5159, + "step": 5733 + }, + { + "epoch": 2.786869300911854, + "grad_norm": 0.06736055148091624, + "learning_rate": 2.105862806649716e-06, + "loss": 0.4786, + "step": 5734 + }, + { + "epoch": 2.787355623100304, + "grad_norm": 0.07166611537333546, + "learning_rate": 2.104301700070091e-06, + "loss": 0.5217, + "step": 5735 + }, + { + "epoch": 2.7878419452887537, + "grad_norm": 0.06934871248787595, + "learning_rate": 2.102741018111739e-06, + "loss": 0.5031, + "step": 5736 + }, + { + "epoch": 2.788328267477204, + "grad_norm": 0.07132465895570106, + "learning_rate": 2.1011807610035184e-06, + "loss": 0.5303, + "step": 5737 + }, + { + "epoch": 2.7888145896656535, + "grad_norm": 0.0726711361821812, + "learning_rate": 2.099620928974219e-06, + "loss": 0.5462, + "step": 5738 + }, + { + "epoch": 2.7893009118541032, + "grad_norm": 0.07451730972315569, + "learning_rate": 2.098061522252574e-06, + "loss": 0.5347, + "step": 5739 + }, + { + "epoch": 2.7897872340425534, + "grad_norm": 0.07305846242527281, + "learning_rate": 2.0965025410672535e-06, + "loss": 0.5397, + "step": 5740 + }, + { + "epoch": 2.790273556231003, + "grad_norm": 0.07096620474136943, + "learning_rate": 2.094943985646864e-06, + "loss": 0.5057, + "step": 5741 + }, + { + "epoch": 2.7907598784194527, + "grad_norm": 0.07197848039927461, + "learning_rate": 2.0933858562199496e-06, + "loss": 0.515, + "step": 5742 + }, + { + "epoch": 2.791246200607903, + "grad_norm": 0.07162385688008034, + "learning_rate": 2.0918281530149925e-06, + "loss": 0.5598, + "step": 5743 + }, + { + "epoch": 2.7917325227963525, + "grad_norm": 0.07225857096500149, + "learning_rate": 2.090270876260412e-06, + "loss": 0.5538, + "step": 5744 + }, + { + "epoch": 2.792218844984802, + "grad_norm": 0.0710079335104998, + "learning_rate": 2.0887140261845662e-06, + "loss": 0.5129, + "step": 5745 + }, + { + "epoch": 2.7927051671732523, + "grad_norm": 0.06897347717466744, + "learning_rate": 2.087157603015748e-06, + "loss": 0.4972, + "step": 5746 + }, + { + "epoch": 2.793191489361702, + "grad_norm": 0.07158877477721493, + "learning_rate": 2.085601606982188e-06, + "loss": 0.5473, + "step": 5747 + }, + { + "epoch": 2.793677811550152, + "grad_norm": 0.07226705053894875, + "learning_rate": 2.084046038312059e-06, + "loss": 0.5454, + "step": 5748 + }, + { + "epoch": 2.794164133738602, + "grad_norm": 0.06884637258851432, + "learning_rate": 2.0824908972334663e-06, + "loss": 0.4886, + "step": 5749 + }, + { + "epoch": 2.7946504559270515, + "grad_norm": 0.06711383641550824, + "learning_rate": 2.0809361839744525e-06, + "loss": 0.4852, + "step": 5750 + }, + { + "epoch": 2.7951367781155017, + "grad_norm": 0.06991482797358153, + "learning_rate": 2.079381898762999e-06, + "loss": 0.5301, + "step": 5751 + }, + { + "epoch": 2.7956231003039513, + "grad_norm": 0.07046779476426093, + "learning_rate": 2.077828041827026e-06, + "loss": 0.4931, + "step": 5752 + }, + { + "epoch": 2.7961094224924015, + "grad_norm": 0.07044017005746363, + "learning_rate": 2.076274613394386e-06, + "loss": 0.5224, + "step": 5753 + }, + { + "epoch": 2.796595744680851, + "grad_norm": 0.06963419833518798, + "learning_rate": 2.0747216136928723e-06, + "loss": 0.5132, + "step": 5754 + }, + { + "epoch": 2.797082066869301, + "grad_norm": 0.07118776107136607, + "learning_rate": 2.0731690429502147e-06, + "loss": 0.5121, + "step": 5755 + }, + { + "epoch": 2.7975683890577505, + "grad_norm": 0.0724347149903344, + "learning_rate": 2.0716169013940812e-06, + "loss": 0.5385, + "step": 5756 + }, + { + "epoch": 2.7980547112462006, + "grad_norm": 0.06874723162493693, + "learning_rate": 2.070065189252075e-06, + "loss": 0.489, + "step": 5757 + }, + { + "epoch": 2.7985410334346503, + "grad_norm": 0.07044914805777469, + "learning_rate": 2.068513906751738e-06, + "loss": 0.5126, + "step": 5758 + }, + { + "epoch": 2.7990273556231005, + "grad_norm": 0.06807560767207664, + "learning_rate": 2.0669630541205466e-06, + "loss": 0.4912, + "step": 5759 + }, + { + "epoch": 2.79951367781155, + "grad_norm": 0.07073039019185584, + "learning_rate": 2.0654126315859163e-06, + "loss": 0.5083, + "step": 5760 + }, + { + "epoch": 2.8, + "grad_norm": 0.06932443322958762, + "learning_rate": 2.063862639375199e-06, + "loss": 0.5178, + "step": 5761 + }, + { + "epoch": 2.80048632218845, + "grad_norm": 0.07414322991678517, + "learning_rate": 2.062313077715684e-06, + "loss": 0.5711, + "step": 5762 + }, + { + "epoch": 2.8009726443768996, + "grad_norm": 0.07084694860474966, + "learning_rate": 2.0607639468345965e-06, + "loss": 0.5, + "step": 5763 + }, + { + "epoch": 2.8014589665653498, + "grad_norm": 0.07101310025760442, + "learning_rate": 2.0592152469590994e-06, + "loss": 0.494, + "step": 5764 + }, + { + "epoch": 2.8019452887537994, + "grad_norm": 0.0723072157347182, + "learning_rate": 2.057666978316289e-06, + "loss": 0.5021, + "step": 5765 + }, + { + "epoch": 2.802431610942249, + "grad_norm": 0.07195987893412906, + "learning_rate": 2.0561191411332052e-06, + "loss": 0.5051, + "step": 5766 + }, + { + "epoch": 2.802917933130699, + "grad_norm": 0.07177955323971101, + "learning_rate": 2.054571735636822e-06, + "loss": 0.5323, + "step": 5767 + }, + { + "epoch": 2.803404255319149, + "grad_norm": 0.07045972107988707, + "learning_rate": 2.0530247620540444e-06, + "loss": 0.5117, + "step": 5768 + }, + { + "epoch": 2.8038905775075986, + "grad_norm": 0.0743218681275549, + "learning_rate": 2.05147822061172e-06, + "loss": 0.5097, + "step": 5769 + }, + { + "epoch": 2.8043768996960488, + "grad_norm": 0.07315073997702788, + "learning_rate": 2.049932111536632e-06, + "loss": 0.5018, + "step": 5770 + }, + { + "epoch": 2.8048632218844984, + "grad_norm": 0.0721786973598316, + "learning_rate": 2.0483864350555e-06, + "loss": 0.5167, + "step": 5771 + }, + { + "epoch": 2.805349544072948, + "grad_norm": 0.07163341001088543, + "learning_rate": 2.0468411913949787e-06, + "loss": 0.5106, + "step": 5772 + }, + { + "epoch": 2.8058358662613982, + "grad_norm": 0.07239868251111778, + "learning_rate": 2.0452963807816616e-06, + "loss": 0.5396, + "step": 5773 + }, + { + "epoch": 2.806322188449848, + "grad_norm": 0.0693450570944712, + "learning_rate": 2.043752003442078e-06, + "loss": 0.5128, + "step": 5774 + }, + { + "epoch": 2.806808510638298, + "grad_norm": 0.07379031693412846, + "learning_rate": 2.042208059602692e-06, + "loss": 0.5533, + "step": 5775 + }, + { + "epoch": 2.8072948328267477, + "grad_norm": 0.07184461643654642, + "learning_rate": 2.0406645494899063e-06, + "loss": 0.5332, + "step": 5776 + }, + { + "epoch": 2.8077811550151974, + "grad_norm": 0.07341888226834573, + "learning_rate": 2.039121473330059e-06, + "loss": 0.5326, + "step": 5777 + }, + { + "epoch": 2.8082674772036476, + "grad_norm": 0.0701332332262917, + "learning_rate": 2.0375788313494245e-06, + "loss": 0.5216, + "step": 5778 + }, + { + "epoch": 2.8087537993920972, + "grad_norm": 0.07137725859230615, + "learning_rate": 2.036036623774214e-06, + "loss": 0.5662, + "step": 5779 + }, + { + "epoch": 2.8092401215805474, + "grad_norm": 0.07060620140196677, + "learning_rate": 2.0344948508305746e-06, + "loss": 0.496, + "step": 5780 + }, + { + "epoch": 2.809726443768997, + "grad_norm": 0.07221948308646506, + "learning_rate": 2.03295351274459e-06, + "loss": 0.5372, + "step": 5781 + }, + { + "epoch": 2.8102127659574467, + "grad_norm": 0.070956310554444, + "learning_rate": 2.031412609742279e-06, + "loss": 0.5431, + "step": 5782 + }, + { + "epoch": 2.8106990881458964, + "grad_norm": 0.06935941415008046, + "learning_rate": 2.0298721420495986e-06, + "loss": 0.5152, + "step": 5783 + }, + { + "epoch": 2.8111854103343465, + "grad_norm": 0.07089598014718915, + "learning_rate": 2.0283321098924407e-06, + "loss": 0.5154, + "step": 5784 + }, + { + "epoch": 2.8116717325227962, + "grad_norm": 0.07065291169604153, + "learning_rate": 2.0267925134966333e-06, + "loss": 0.5222, + "step": 5785 + }, + { + "epoch": 2.8121580547112464, + "grad_norm": 0.07248031597923572, + "learning_rate": 2.025253353087941e-06, + "loss": 0.5289, + "step": 5786 + }, + { + "epoch": 2.812644376899696, + "grad_norm": 0.07020067258837767, + "learning_rate": 2.0237146288920632e-06, + "loss": 0.4923, + "step": 5787 + }, + { + "epoch": 2.8131306990881457, + "grad_norm": 0.07117474561102627, + "learning_rate": 2.022176341134638e-06, + "loss": 0.5274, + "step": 5788 + }, + { + "epoch": 2.813617021276596, + "grad_norm": 0.07560309408683605, + "learning_rate": 2.0206384900412364e-06, + "loss": 0.5157, + "step": 5789 + }, + { + "epoch": 2.8141033434650455, + "grad_norm": 0.0687342718106197, + "learning_rate": 2.0191010758373675e-06, + "loss": 0.5428, + "step": 5790 + }, + { + "epoch": 2.8145896656534957, + "grad_norm": 0.07198071207499912, + "learning_rate": 2.0175640987484755e-06, + "loss": 0.538, + "step": 5791 + }, + { + "epoch": 2.8150759878419453, + "grad_norm": 0.07066103065228692, + "learning_rate": 2.0160275589999407e-06, + "loss": 0.5595, + "step": 5792 + }, + { + "epoch": 2.815562310030395, + "grad_norm": 0.07295026560041401, + "learning_rate": 2.014491456817079e-06, + "loss": 0.5116, + "step": 5793 + }, + { + "epoch": 2.8160486322188447, + "grad_norm": 0.06917307368191179, + "learning_rate": 2.0129557924251425e-06, + "loss": 0.5249, + "step": 5794 + }, + { + "epoch": 2.816534954407295, + "grad_norm": 0.06905962678506213, + "learning_rate": 2.011420566049319e-06, + "loss": 0.5124, + "step": 5795 + }, + { + "epoch": 2.8170212765957445, + "grad_norm": 0.07397970018237573, + "learning_rate": 2.0098857779147316e-06, + "loss": 0.5273, + "step": 5796 + }, + { + "epoch": 2.8175075987841947, + "grad_norm": 0.07157959264698292, + "learning_rate": 2.00835142824644e-06, + "loss": 0.5224, + "step": 5797 + }, + { + "epoch": 2.8179939209726443, + "grad_norm": 0.07038787458285181, + "learning_rate": 2.0068175172694394e-06, + "loss": 0.5032, + "step": 5798 + }, + { + "epoch": 2.818480243161094, + "grad_norm": 0.07188025256279675, + "learning_rate": 2.0052840452086595e-06, + "loss": 0.5242, + "step": 5799 + }, + { + "epoch": 2.818966565349544, + "grad_norm": 0.07096406154939713, + "learning_rate": 2.003751012288969e-06, + "loss": 0.5073, + "step": 5800 + }, + { + "epoch": 2.819452887537994, + "grad_norm": 0.07115547759235426, + "learning_rate": 2.002218418735165e-06, + "loss": 0.5042, + "step": 5801 + }, + { + "epoch": 2.819939209726444, + "grad_norm": 0.07002606997167922, + "learning_rate": 2.0006862647719887e-06, + "loss": 0.527, + "step": 5802 + }, + { + "epoch": 2.8204255319148936, + "grad_norm": 0.0718461093582595, + "learning_rate": 1.999154550624113e-06, + "loss": 0.5505, + "step": 5803 + }, + { + "epoch": 2.8209118541033433, + "grad_norm": 0.06956317750838951, + "learning_rate": 1.9976232765161453e-06, + "loss": 0.523, + "step": 5804 + }, + { + "epoch": 2.8213981762917935, + "grad_norm": 0.07007071552060323, + "learning_rate": 1.99609244267263e-06, + "loss": 0.5179, + "step": 5805 + }, + { + "epoch": 2.821884498480243, + "grad_norm": 0.0719422450559224, + "learning_rate": 1.994562049318046e-06, + "loss": 0.5147, + "step": 5806 + }, + { + "epoch": 2.8223708206686933, + "grad_norm": 0.07038272683119702, + "learning_rate": 1.993032096676808e-06, + "loss": 0.5769, + "step": 5807 + }, + { + "epoch": 2.822857142857143, + "grad_norm": 0.07260561605391949, + "learning_rate": 1.991502584973267e-06, + "loss": 0.5128, + "step": 5808 + }, + { + "epoch": 2.8233434650455926, + "grad_norm": 0.07200563112090586, + "learning_rate": 1.989973514431709e-06, + "loss": 0.5681, + "step": 5809 + }, + { + "epoch": 2.8238297872340423, + "grad_norm": 0.070468852376796, + "learning_rate": 1.9884448852763534e-06, + "loss": 0.5061, + "step": 5810 + }, + { + "epoch": 2.8243161094224924, + "grad_norm": 0.07277574241372085, + "learning_rate": 1.9869166977313565e-06, + "loss": 0.5544, + "step": 5811 + }, + { + "epoch": 2.824802431610942, + "grad_norm": 0.07275952137937516, + "learning_rate": 1.98538895202081e-06, + "loss": 0.524, + "step": 5812 + }, + { + "epoch": 2.8252887537993923, + "grad_norm": 0.07128718743871466, + "learning_rate": 1.9838616483687414e-06, + "loss": 0.4881, + "step": 5813 + }, + { + "epoch": 2.825775075987842, + "grad_norm": 0.06921129214403458, + "learning_rate": 1.982334786999111e-06, + "loss": 0.4786, + "step": 5814 + }, + { + "epoch": 2.8262613981762916, + "grad_norm": 0.07151066581385927, + "learning_rate": 1.980808368135818e-06, + "loss": 0.5232, + "step": 5815 + }, + { + "epoch": 2.8267477203647418, + "grad_norm": 0.07207762732264693, + "learning_rate": 1.979282392002691e-06, + "loss": 0.5467, + "step": 5816 + }, + { + "epoch": 2.8272340425531914, + "grad_norm": 0.07098923965832625, + "learning_rate": 1.9777568588234985e-06, + "loss": 0.5535, + "step": 5817 + }, + { + "epoch": 2.8277203647416416, + "grad_norm": 0.07013401460573276, + "learning_rate": 1.976231768821943e-06, + "loss": 0.5142, + "step": 5818 + }, + { + "epoch": 2.8282066869300913, + "grad_norm": 0.0732227452899353, + "learning_rate": 1.9747071222216614e-06, + "loss": 0.5528, + "step": 5819 + }, + { + "epoch": 2.828693009118541, + "grad_norm": 0.0697618981827064, + "learning_rate": 1.9731829192462236e-06, + "loss": 0.5272, + "step": 5820 + }, + { + "epoch": 2.8291793313069906, + "grad_norm": 0.07169420302433707, + "learning_rate": 1.9716591601191413e-06, + "loss": 0.4844, + "step": 5821 + }, + { + "epoch": 2.8296656534954407, + "grad_norm": 0.07174393539631475, + "learning_rate": 1.9701358450638543e-06, + "loss": 0.5355, + "step": 5822 + }, + { + "epoch": 2.8301519756838904, + "grad_norm": 0.07114986162707497, + "learning_rate": 1.9686129743037387e-06, + "loss": 0.4957, + "step": 5823 + }, + { + "epoch": 2.8306382978723406, + "grad_norm": 0.07038983638335994, + "learning_rate": 1.9670905480621068e-06, + "loss": 0.5284, + "step": 5824 + }, + { + "epoch": 2.8311246200607902, + "grad_norm": 0.07220166255257733, + "learning_rate": 1.965568566562205e-06, + "loss": 0.5183, + "step": 5825 + }, + { + "epoch": 2.83161094224924, + "grad_norm": 0.068669207063441, + "learning_rate": 1.9640470300272146e-06, + "loss": 0.48, + "step": 5826 + }, + { + "epoch": 2.83209726443769, + "grad_norm": 0.07061723105411294, + "learning_rate": 1.962525938680252e-06, + "loss": 0.5204, + "step": 5827 + }, + { + "epoch": 2.8325835866261397, + "grad_norm": 0.07275773790562813, + "learning_rate": 1.961005292744368e-06, + "loss": 0.5324, + "step": 5828 + }, + { + "epoch": 2.83306990881459, + "grad_norm": 0.07148353604041843, + "learning_rate": 1.9594850924425486e-06, + "loss": 0.5146, + "step": 5829 + }, + { + "epoch": 2.8335562310030395, + "grad_norm": 0.07102619578790878, + "learning_rate": 1.957965337997712e-06, + "loss": 0.5195, + "step": 5830 + }, + { + "epoch": 2.8340425531914892, + "grad_norm": 0.07194909091768985, + "learning_rate": 1.9564460296327137e-06, + "loss": 0.5421, + "step": 5831 + }, + { + "epoch": 2.8345288753799394, + "grad_norm": 0.07034046436333947, + "learning_rate": 1.9549271675703434e-06, + "loss": 0.5525, + "step": 5832 + }, + { + "epoch": 2.835015197568389, + "grad_norm": 0.072395113559526, + "learning_rate": 1.953408752033325e-06, + "loss": 0.5291, + "step": 5833 + }, + { + "epoch": 2.835501519756839, + "grad_norm": 0.07123389602895357, + "learning_rate": 1.951890783244316e-06, + "loss": 0.5132, + "step": 5834 + }, + { + "epoch": 2.835987841945289, + "grad_norm": 0.06929022928358465, + "learning_rate": 1.9503732614259113e-06, + "loss": 0.4982, + "step": 5835 + }, + { + "epoch": 2.8364741641337385, + "grad_norm": 0.0715648512994881, + "learning_rate": 1.948856186800636e-06, + "loss": 0.5598, + "step": 5836 + }, + { + "epoch": 2.8369604863221882, + "grad_norm": 0.07182233045283128, + "learning_rate": 1.9473395595909533e-06, + "loss": 0.5135, + "step": 5837 + }, + { + "epoch": 2.8374468085106384, + "grad_norm": 0.07378443404345784, + "learning_rate": 1.945823380019257e-06, + "loss": 0.5535, + "step": 5838 + }, + { + "epoch": 2.837933130699088, + "grad_norm": 0.0699727216242389, + "learning_rate": 1.944307648307882e-06, + "loss": 0.5181, + "step": 5839 + }, + { + "epoch": 2.838419452887538, + "grad_norm": 0.07384742178422674, + "learning_rate": 1.94279236467909e-06, + "loss": 0.5804, + "step": 5840 + }, + { + "epoch": 2.838905775075988, + "grad_norm": 0.06967814521185865, + "learning_rate": 1.9412775293550814e-06, + "loss": 0.4913, + "step": 5841 + }, + { + "epoch": 2.8393920972644375, + "grad_norm": 0.07668445566534125, + "learning_rate": 1.9397631425579884e-06, + "loss": 0.5283, + "step": 5842 + }, + { + "epoch": 2.8398784194528877, + "grad_norm": 0.06996898714384027, + "learning_rate": 1.9382492045098792e-06, + "loss": 0.5263, + "step": 5843 + }, + { + "epoch": 2.8403647416413373, + "grad_norm": 0.07105381155641805, + "learning_rate": 1.9367357154327577e-06, + "loss": 0.4945, + "step": 5844 + }, + { + "epoch": 2.8408510638297875, + "grad_norm": 0.07109255895461253, + "learning_rate": 1.935222675548556e-06, + "loss": 0.5154, + "step": 5845 + }, + { + "epoch": 2.841337386018237, + "grad_norm": 0.07181578825833233, + "learning_rate": 1.933710085079146e-06, + "loss": 0.5159, + "step": 5846 + }, + { + "epoch": 2.841823708206687, + "grad_norm": 0.07241450417865103, + "learning_rate": 1.9321979442463325e-06, + "loss": 0.5174, + "step": 5847 + }, + { + "epoch": 2.8423100303951365, + "grad_norm": 0.07379668212785787, + "learning_rate": 1.9306862532718527e-06, + "loss": 0.5296, + "step": 5848 + }, + { + "epoch": 2.8427963525835866, + "grad_norm": 0.07232859649421171, + "learning_rate": 1.92917501237738e-06, + "loss": 0.5534, + "step": 5849 + }, + { + "epoch": 2.8432826747720363, + "grad_norm": 0.07007512173931056, + "learning_rate": 1.9276642217845197e-06, + "loss": 0.5322, + "step": 5850 + }, + { + "epoch": 2.8437689969604865, + "grad_norm": 0.0722997015260819, + "learning_rate": 1.926153881714813e-06, + "loss": 0.5801, + "step": 5851 + }, + { + "epoch": 2.844255319148936, + "grad_norm": 0.06906461006082809, + "learning_rate": 1.9246439923897335e-06, + "loss": 0.4783, + "step": 5852 + }, + { + "epoch": 2.844741641337386, + "grad_norm": 0.07137444718583476, + "learning_rate": 1.9231345540306893e-06, + "loss": 0.4999, + "step": 5853 + }, + { + "epoch": 2.845227963525836, + "grad_norm": 0.06893563741233799, + "learning_rate": 1.9216255668590233e-06, + "loss": 0.5044, + "step": 5854 + }, + { + "epoch": 2.8457142857142856, + "grad_norm": 0.07055570255494763, + "learning_rate": 1.92011703109601e-06, + "loss": 0.5061, + "step": 5855 + }, + { + "epoch": 2.8462006079027358, + "grad_norm": 0.07170500706941783, + "learning_rate": 1.918608946962858e-06, + "loss": 0.5391, + "step": 5856 + }, + { + "epoch": 2.8466869300911855, + "grad_norm": 0.0736865699433173, + "learning_rate": 1.9171013146807148e-06, + "loss": 0.5374, + "step": 5857 + }, + { + "epoch": 2.847173252279635, + "grad_norm": 0.06915405862816253, + "learning_rate": 1.9155941344706547e-06, + "loss": 0.4745, + "step": 5858 + }, + { + "epoch": 2.8476595744680853, + "grad_norm": 0.07017117745507662, + "learning_rate": 1.914087406553691e-06, + "loss": 0.5254, + "step": 5859 + }, + { + "epoch": 2.848145896656535, + "grad_norm": 0.07153193780324923, + "learning_rate": 1.912581131150764e-06, + "loss": 0.496, + "step": 5860 + }, + { + "epoch": 2.848632218844985, + "grad_norm": 0.06907821202483959, + "learning_rate": 1.911075308482754e-06, + "loss": 0.4957, + "step": 5861 + }, + { + "epoch": 2.8491185410334348, + "grad_norm": 0.07038768803173205, + "learning_rate": 1.909569938770474e-06, + "loss": 0.5387, + "step": 5862 + }, + { + "epoch": 2.8496048632218844, + "grad_norm": 0.0713273212298614, + "learning_rate": 1.908065022234668e-06, + "loss": 0.5229, + "step": 5863 + }, + { + "epoch": 2.850091185410334, + "grad_norm": 0.07059827157015541, + "learning_rate": 1.9065605590960146e-06, + "loss": 0.5057, + "step": 5864 + }, + { + "epoch": 2.8505775075987843, + "grad_norm": 0.07140515753841477, + "learning_rate": 1.9050565495751271e-06, + "loss": 0.5229, + "step": 5865 + }, + { + "epoch": 2.851063829787234, + "grad_norm": 0.07347178816517402, + "learning_rate": 1.9035529938925518e-06, + "loss": 0.5131, + "step": 5866 + }, + { + "epoch": 2.851550151975684, + "grad_norm": 0.07189753191769561, + "learning_rate": 1.9020498922687668e-06, + "loss": 0.5217, + "step": 5867 + }, + { + "epoch": 2.8520364741641338, + "grad_norm": 0.07624390139991714, + "learning_rate": 1.9005472449241857e-06, + "loss": 0.5327, + "step": 5868 + }, + { + "epoch": 2.8525227963525834, + "grad_norm": 0.06986334710139973, + "learning_rate": 1.8990450520791547e-06, + "loss": 0.4916, + "step": 5869 + }, + { + "epoch": 2.8530091185410336, + "grad_norm": 0.0720841881751233, + "learning_rate": 1.8975433139539534e-06, + "loss": 0.5041, + "step": 5870 + }, + { + "epoch": 2.8534954407294832, + "grad_norm": 0.07233949366611366, + "learning_rate": 1.8960420307687937e-06, + "loss": 0.5452, + "step": 5871 + }, + { + "epoch": 2.8539817629179334, + "grad_norm": 0.0741325589165113, + "learning_rate": 1.8945412027438226e-06, + "loss": 0.5202, + "step": 5872 + }, + { + "epoch": 2.854468085106383, + "grad_norm": 0.06993626746771713, + "learning_rate": 1.8930408300991194e-06, + "loss": 0.5067, + "step": 5873 + }, + { + "epoch": 2.8549544072948327, + "grad_norm": 0.06993304776029446, + "learning_rate": 1.8915409130546968e-06, + "loss": 0.5039, + "step": 5874 + }, + { + "epoch": 2.8554407294832824, + "grad_norm": 0.07288562071089878, + "learning_rate": 1.8900414518305004e-06, + "loss": 0.5344, + "step": 5875 + }, + { + "epoch": 2.8559270516717326, + "grad_norm": 0.0720526868085092, + "learning_rate": 1.8885424466464086e-06, + "loss": 0.5334, + "step": 5876 + }, + { + "epoch": 2.8564133738601822, + "grad_norm": 0.07360343886864168, + "learning_rate": 1.8870438977222345e-06, + "loss": 0.5405, + "step": 5877 + }, + { + "epoch": 2.8568996960486324, + "grad_norm": 0.07383537183725086, + "learning_rate": 1.885545805277723e-06, + "loss": 0.5839, + "step": 5878 + }, + { + "epoch": 2.857386018237082, + "grad_norm": 0.06836280121655228, + "learning_rate": 1.8840481695325519e-06, + "loss": 0.504, + "step": 5879 + }, + { + "epoch": 2.8578723404255317, + "grad_norm": 0.07503690367359185, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.5568, + "step": 5880 + }, + { + "epoch": 2.858358662613982, + "grad_norm": 0.07101128036324922, + "learning_rate": 1.88105426901861e-06, + "loss": 0.5388, + "step": 5881 + }, + { + "epoch": 2.8588449848024315, + "grad_norm": 0.07018221159276668, + "learning_rate": 1.8795580046888607e-06, + "loss": 0.4987, + "step": 5882 + }, + { + "epoch": 2.8593313069908817, + "grad_norm": 0.07351941044608798, + "learning_rate": 1.878062197936495e-06, + "loss": 0.548, + "step": 5883 + }, + { + "epoch": 2.8598176291793314, + "grad_norm": 0.0713989646309417, + "learning_rate": 1.8765668489808559e-06, + "loss": 0.5045, + "step": 5884 + }, + { + "epoch": 2.860303951367781, + "grad_norm": 0.07032523093089568, + "learning_rate": 1.8750719580412196e-06, + "loss": 0.5002, + "step": 5885 + }, + { + "epoch": 2.860790273556231, + "grad_norm": 0.07441826580913863, + "learning_rate": 1.873577525336795e-06, + "loss": 0.5564, + "step": 5886 + }, + { + "epoch": 2.861276595744681, + "grad_norm": 0.07111767656647187, + "learning_rate": 1.872083551086723e-06, + "loss": 0.5343, + "step": 5887 + }, + { + "epoch": 2.8617629179331305, + "grad_norm": 0.06989844336756533, + "learning_rate": 1.8705900355100787e-06, + "loss": 0.5444, + "step": 5888 + }, + { + "epoch": 2.8622492401215807, + "grad_norm": 0.07148322306161893, + "learning_rate": 1.8690969788258684e-06, + "loss": 0.5458, + "step": 5889 + }, + { + "epoch": 2.8627355623100303, + "grad_norm": 0.07115519202624379, + "learning_rate": 1.8676043812530325e-06, + "loss": 0.5322, + "step": 5890 + }, + { + "epoch": 2.86322188449848, + "grad_norm": 0.07196279675006961, + "learning_rate": 1.866112243010444e-06, + "loss": 0.5227, + "step": 5891 + }, + { + "epoch": 2.86370820668693, + "grad_norm": 0.07119518886513132, + "learning_rate": 1.864620564316907e-06, + "loss": 0.5339, + "step": 5892 + }, + { + "epoch": 2.86419452887538, + "grad_norm": 0.07051931878186678, + "learning_rate": 1.8631293453911596e-06, + "loss": 0.5114, + "step": 5893 + }, + { + "epoch": 2.86468085106383, + "grad_norm": 0.07135782068445463, + "learning_rate": 1.861638586451872e-06, + "loss": 0.5159, + "step": 5894 + }, + { + "epoch": 2.8651671732522797, + "grad_norm": 0.06988752907224728, + "learning_rate": 1.8601482877176475e-06, + "loss": 0.5226, + "step": 5895 + }, + { + "epoch": 2.8656534954407293, + "grad_norm": 0.07216267415702485, + "learning_rate": 1.8586584494070214e-06, + "loss": 0.5414, + "step": 5896 + }, + { + "epoch": 2.8661398176291795, + "grad_norm": 0.0707290624063378, + "learning_rate": 1.857169071738461e-06, + "loss": 0.5349, + "step": 5897 + }, + { + "epoch": 2.866626139817629, + "grad_norm": 0.0720293644487387, + "learning_rate": 1.855680154930367e-06, + "loss": 0.4974, + "step": 5898 + }, + { + "epoch": 2.8671124620060793, + "grad_norm": 0.07282997624292858, + "learning_rate": 1.8541916992010727e-06, + "loss": 0.5331, + "step": 5899 + }, + { + "epoch": 2.867598784194529, + "grad_norm": 0.06872523260553294, + "learning_rate": 1.8527037047688422e-06, + "loss": 0.5416, + "step": 5900 + }, + { + "epoch": 2.8680851063829786, + "grad_norm": 0.06985026635125152, + "learning_rate": 1.851216171851874e-06, + "loss": 0.5221, + "step": 5901 + }, + { + "epoch": 2.8685714285714283, + "grad_norm": 0.06916399231211036, + "learning_rate": 1.8497291006682967e-06, + "loss": 0.4832, + "step": 5902 + }, + { + "epoch": 2.8690577507598785, + "grad_norm": 0.06914632836635576, + "learning_rate": 1.8482424914361735e-06, + "loss": 0.4996, + "step": 5903 + }, + { + "epoch": 2.869544072948328, + "grad_norm": 0.07194505543985094, + "learning_rate": 1.8467563443734982e-06, + "loss": 0.5488, + "step": 5904 + }, + { + "epoch": 2.8700303951367783, + "grad_norm": 0.07193927603522775, + "learning_rate": 1.845270659698198e-06, + "loss": 0.5038, + "step": 5905 + }, + { + "epoch": 2.870516717325228, + "grad_norm": 0.06977363500973308, + "learning_rate": 1.8437854376281307e-06, + "loss": 0.5064, + "step": 5906 + }, + { + "epoch": 2.8710030395136776, + "grad_norm": 0.07336662309007039, + "learning_rate": 1.8423006783810893e-06, + "loss": 0.5445, + "step": 5907 + }, + { + "epoch": 2.8714893617021278, + "grad_norm": 0.07349780066206979, + "learning_rate": 1.8408163821747943e-06, + "loss": 0.525, + "step": 5908 + }, + { + "epoch": 2.8719756838905774, + "grad_norm": 0.07358272136557588, + "learning_rate": 1.8393325492269016e-06, + "loss": 0.527, + "step": 5909 + }, + { + "epoch": 2.8724620060790276, + "grad_norm": 0.0705264759202915, + "learning_rate": 1.8378491797549969e-06, + "loss": 0.4959, + "step": 5910 + }, + { + "epoch": 2.8729483282674773, + "grad_norm": 0.07045440984823527, + "learning_rate": 1.8363662739766036e-06, + "loss": 0.5175, + "step": 5911 + }, + { + "epoch": 2.873434650455927, + "grad_norm": 0.07027576530858748, + "learning_rate": 1.8348838321091705e-06, + "loss": 0.5218, + "step": 5912 + }, + { + "epoch": 2.8739209726443766, + "grad_norm": 0.07077195358920799, + "learning_rate": 1.833401854370081e-06, + "loss": 0.5094, + "step": 5913 + }, + { + "epoch": 2.8744072948328268, + "grad_norm": 0.06933642971308226, + "learning_rate": 1.8319203409766507e-06, + "loss": 0.4972, + "step": 5914 + }, + { + "epoch": 2.8748936170212764, + "grad_norm": 0.06977561700168815, + "learning_rate": 1.8304392921461262e-06, + "loss": 0.4994, + "step": 5915 + }, + { + "epoch": 2.8753799392097266, + "grad_norm": 0.0727938020413016, + "learning_rate": 1.8289587080956873e-06, + "loss": 0.5443, + "step": 5916 + }, + { + "epoch": 2.8758662613981762, + "grad_norm": 0.06737963729317442, + "learning_rate": 1.8274785890424434e-06, + "loss": 0.4869, + "step": 5917 + }, + { + "epoch": 2.876352583586626, + "grad_norm": 0.06835079165428672, + "learning_rate": 1.8259989352034385e-06, + "loss": 0.4793, + "step": 5918 + }, + { + "epoch": 2.876838905775076, + "grad_norm": 0.07107406207500278, + "learning_rate": 1.8245197467956472e-06, + "loss": 0.5054, + "step": 5919 + }, + { + "epoch": 2.8773252279635257, + "grad_norm": 0.07368407128404685, + "learning_rate": 1.8230410240359742e-06, + "loss": 0.5451, + "step": 5920 + }, + { + "epoch": 2.877811550151976, + "grad_norm": 0.06893833745367822, + "learning_rate": 1.8215627671412605e-06, + "loss": 0.5022, + "step": 5921 + }, + { + "epoch": 2.8782978723404256, + "grad_norm": 0.07037184250553943, + "learning_rate": 1.8200849763282713e-06, + "loss": 0.5251, + "step": 5922 + }, + { + "epoch": 2.8787841945288752, + "grad_norm": 0.07365039573898581, + "learning_rate": 1.8186076518137102e-06, + "loss": 0.5596, + "step": 5923 + }, + { + "epoch": 2.8792705167173254, + "grad_norm": 0.07277497393431025, + "learning_rate": 1.8171307938142101e-06, + "loss": 0.5396, + "step": 5924 + }, + { + "epoch": 2.879756838905775, + "grad_norm": 0.06994217079860676, + "learning_rate": 1.8156544025463346e-06, + "loss": 0.5085, + "step": 5925 + }, + { + "epoch": 2.880243161094225, + "grad_norm": 0.07100016575212124, + "learning_rate": 1.8141784782265809e-06, + "loss": 0.5039, + "step": 5926 + }, + { + "epoch": 2.880729483282675, + "grad_norm": 0.07197386046214273, + "learning_rate": 1.812703021071376e-06, + "loss": 0.5636, + "step": 5927 + }, + { + "epoch": 2.8812158054711245, + "grad_norm": 0.06906040139240434, + "learning_rate": 1.811228031297077e-06, + "loss": 0.5171, + "step": 5928 + }, + { + "epoch": 2.8817021276595742, + "grad_norm": 0.07248768415258881, + "learning_rate": 1.809753509119978e-06, + "loss": 0.5218, + "step": 5929 + }, + { + "epoch": 2.8821884498480244, + "grad_norm": 0.06940242852647203, + "learning_rate": 1.8082794547562993e-06, + "loss": 0.5116, + "step": 5930 + }, + { + "epoch": 2.882674772036474, + "grad_norm": 0.070882906781668, + "learning_rate": 1.806805868422194e-06, + "loss": 0.4932, + "step": 5931 + }, + { + "epoch": 2.883161094224924, + "grad_norm": 0.07297468126613556, + "learning_rate": 1.805332750333747e-06, + "loss": 0.5619, + "step": 5932 + }, + { + "epoch": 2.883647416413374, + "grad_norm": 0.06884612396778514, + "learning_rate": 1.8038601007069745e-06, + "loss": 0.4903, + "step": 5933 + }, + { + "epoch": 2.8841337386018235, + "grad_norm": 0.07274918156943112, + "learning_rate": 1.8023879197578237e-06, + "loss": 0.5818, + "step": 5934 + }, + { + "epoch": 2.8846200607902737, + "grad_norm": 0.07041133158857056, + "learning_rate": 1.800916207702173e-06, + "loss": 0.499, + "step": 5935 + }, + { + "epoch": 2.8851063829787233, + "grad_norm": 0.07027261923576446, + "learning_rate": 1.7994449647558337e-06, + "loss": 0.5209, + "step": 5936 + }, + { + "epoch": 2.8855927051671735, + "grad_norm": 0.07249666418503105, + "learning_rate": 1.7979741911345445e-06, + "loss": 0.5424, + "step": 5937 + }, + { + "epoch": 2.886079027355623, + "grad_norm": 0.06978072084012901, + "learning_rate": 1.7965038870539785e-06, + "loss": 0.4935, + "step": 5938 + }, + { + "epoch": 2.886565349544073, + "grad_norm": 0.07842199313111962, + "learning_rate": 1.7950340527297399e-06, + "loss": 0.55, + "step": 5939 + }, + { + "epoch": 2.8870516717325225, + "grad_norm": 0.07213701620607908, + "learning_rate": 1.7935646883773622e-06, + "loss": 0.5458, + "step": 5940 + }, + { + "epoch": 2.8875379939209727, + "grad_norm": 0.07050509495685729, + "learning_rate": 1.7920957942123113e-06, + "loss": 0.5278, + "step": 5941 + }, + { + "epoch": 2.8880243161094223, + "grad_norm": 0.07227954260715562, + "learning_rate": 1.7906273704499844e-06, + "loss": 0.5136, + "step": 5942 + }, + { + "epoch": 2.8885106382978725, + "grad_norm": 0.07280864929648496, + "learning_rate": 1.7891594173057086e-06, + "loss": 0.5305, + "step": 5943 + }, + { + "epoch": 2.888996960486322, + "grad_norm": 0.07080388388532134, + "learning_rate": 1.787691934994743e-06, + "loss": 0.5224, + "step": 5944 + }, + { + "epoch": 2.889483282674772, + "grad_norm": 0.07191046458895495, + "learning_rate": 1.7862249237322765e-06, + "loss": 0.4826, + "step": 5945 + }, + { + "epoch": 2.889969604863222, + "grad_norm": 0.07374816464037266, + "learning_rate": 1.7847583837334303e-06, + "loss": 0.5477, + "step": 5946 + }, + { + "epoch": 2.8904559270516716, + "grad_norm": 0.07200066641878645, + "learning_rate": 1.7832923152132542e-06, + "loss": 0.5323, + "step": 5947 + }, + { + "epoch": 2.8909422492401218, + "grad_norm": 0.07304184845500321, + "learning_rate": 1.7818267183867332e-06, + "loss": 0.5604, + "step": 5948 + }, + { + "epoch": 2.8914285714285715, + "grad_norm": 0.07037607372791684, + "learning_rate": 1.7803615934687796e-06, + "loss": 0.5137, + "step": 5949 + }, + { + "epoch": 2.891914893617021, + "grad_norm": 0.07425937143966992, + "learning_rate": 1.7788969406742363e-06, + "loss": 0.5419, + "step": 5950 + }, + { + "epoch": 2.8924012158054713, + "grad_norm": 0.07848712966988176, + "learning_rate": 1.777432760217881e-06, + "loss": 0.5634, + "step": 5951 + }, + { + "epoch": 2.892887537993921, + "grad_norm": 0.07547995739528522, + "learning_rate": 1.7759690523144146e-06, + "loss": 0.6216, + "step": 5952 + }, + { + "epoch": 2.893373860182371, + "grad_norm": 0.07167419191835152, + "learning_rate": 1.774505817178475e-06, + "loss": 0.5389, + "step": 5953 + }, + { + "epoch": 2.8938601823708208, + "grad_norm": 0.06763457554316366, + "learning_rate": 1.7730430550246303e-06, + "loss": 0.469, + "step": 5954 + }, + { + "epoch": 2.8943465045592704, + "grad_norm": 0.07020301150716754, + "learning_rate": 1.7715807660673768e-06, + "loss": 0.5174, + "step": 5955 + }, + { + "epoch": 2.89483282674772, + "grad_norm": 0.07313243217168361, + "learning_rate": 1.7701189505211424e-06, + "loss": 0.5341, + "step": 5956 + }, + { + "epoch": 2.8953191489361703, + "grad_norm": 0.07191171511544767, + "learning_rate": 1.7686576086002866e-06, + "loss": 0.5042, + "step": 5957 + }, + { + "epoch": 2.89580547112462, + "grad_norm": 0.0701450628664206, + "learning_rate": 1.7671967405190976e-06, + "loss": 0.5239, + "step": 5958 + }, + { + "epoch": 2.89629179331307, + "grad_norm": 0.07212256973710782, + "learning_rate": 1.7657363464917964e-06, + "loss": 0.5539, + "step": 5959 + }, + { + "epoch": 2.8967781155015198, + "grad_norm": 0.07146378804551277, + "learning_rate": 1.7642764267325323e-06, + "loss": 0.537, + "step": 5960 + }, + { + "epoch": 2.8972644376899694, + "grad_norm": 0.07155844731220108, + "learning_rate": 1.7628169814553858e-06, + "loss": 0.5222, + "step": 5961 + }, + { + "epoch": 2.8977507598784196, + "grad_norm": 0.07022504830497889, + "learning_rate": 1.761358010874369e-06, + "loss": 0.4966, + "step": 5962 + }, + { + "epoch": 2.8982370820668693, + "grad_norm": 0.0697817587268933, + "learning_rate": 1.759899515203422e-06, + "loss": 0.4913, + "step": 5963 + }, + { + "epoch": 2.8987234042553194, + "grad_norm": 0.06827936377073991, + "learning_rate": 1.7584414946564176e-06, + "loss": 0.5013, + "step": 5964 + }, + { + "epoch": 2.899209726443769, + "grad_norm": 0.07096588902736498, + "learning_rate": 1.7569839494471574e-06, + "loss": 0.5001, + "step": 5965 + }, + { + "epoch": 2.8996960486322187, + "grad_norm": 0.07422903737339483, + "learning_rate": 1.7555268797893743e-06, + "loss": 0.5337, + "step": 5966 + }, + { + "epoch": 2.9001823708206684, + "grad_norm": 0.07100258758927531, + "learning_rate": 1.7540702858967313e-06, + "loss": 0.5107, + "step": 5967 + }, + { + "epoch": 2.9006686930091186, + "grad_norm": 0.07322721442600164, + "learning_rate": 1.7526141679828202e-06, + "loss": 0.5296, + "step": 5968 + }, + { + "epoch": 2.9011550151975682, + "grad_norm": 0.07087597321835215, + "learning_rate": 1.7511585262611652e-06, + "loss": 0.4965, + "step": 5969 + }, + { + "epoch": 2.9016413373860184, + "grad_norm": 0.07189283734367957, + "learning_rate": 1.7497033609452192e-06, + "loss": 0.5437, + "step": 5970 + }, + { + "epoch": 2.902127659574468, + "grad_norm": 0.07343283569382482, + "learning_rate": 1.748248672248366e-06, + "loss": 0.5144, + "step": 5971 + }, + { + "epoch": 2.9026139817629177, + "grad_norm": 0.06887918035627322, + "learning_rate": 1.7467944603839187e-06, + "loss": 0.5165, + "step": 5972 + }, + { + "epoch": 2.903100303951368, + "grad_norm": 0.06991157318745035, + "learning_rate": 1.7453407255651212e-06, + "loss": 0.5389, + "step": 5973 + }, + { + "epoch": 2.9035866261398176, + "grad_norm": 0.07250723012097712, + "learning_rate": 1.743887468005147e-06, + "loss": 0.5291, + "step": 5974 + }, + { + "epoch": 2.9040729483282677, + "grad_norm": 0.06847955452770746, + "learning_rate": 1.7424346879171001e-06, + "loss": 0.482, + "step": 5975 + }, + { + "epoch": 2.9045592705167174, + "grad_norm": 0.07117190239765178, + "learning_rate": 1.7409823855140146e-06, + "loss": 0.5143, + "step": 5976 + }, + { + "epoch": 2.905045592705167, + "grad_norm": 0.07397924116146913, + "learning_rate": 1.739530561008853e-06, + "loss": 0.5334, + "step": 5977 + }, + { + "epoch": 2.905531914893617, + "grad_norm": 0.06902003861202617, + "learning_rate": 1.7380792146145098e-06, + "loss": 0.5171, + "step": 5978 + }, + { + "epoch": 2.906018237082067, + "grad_norm": 0.06956050885778228, + "learning_rate": 1.7366283465438082e-06, + "loss": 0.5218, + "step": 5979 + }, + { + "epoch": 2.906504559270517, + "grad_norm": 0.07180694213311371, + "learning_rate": 1.7351779570095017e-06, + "loss": 0.5215, + "step": 5980 + }, + { + "epoch": 2.9069908814589667, + "grad_norm": 0.0703823518152641, + "learning_rate": 1.7337280462242735e-06, + "loss": 0.5107, + "step": 5981 + }, + { + "epoch": 2.9074772036474164, + "grad_norm": 0.0690398187429159, + "learning_rate": 1.7322786144007358e-06, + "loss": 0.5093, + "step": 5982 + }, + { + "epoch": 2.907963525835866, + "grad_norm": 0.07071541522557791, + "learning_rate": 1.7308296617514319e-06, + "loss": 0.4974, + "step": 5983 + }, + { + "epoch": 2.908449848024316, + "grad_norm": 0.07062205160482402, + "learning_rate": 1.7293811884888344e-06, + "loss": 0.4917, + "step": 5984 + }, + { + "epoch": 2.908936170212766, + "grad_norm": 0.07498167025936278, + "learning_rate": 1.7279331948253452e-06, + "loss": 0.5366, + "step": 5985 + }, + { + "epoch": 2.909422492401216, + "grad_norm": 0.07200212818994361, + "learning_rate": 1.7264856809732966e-06, + "loss": 0.5256, + "step": 5986 + }, + { + "epoch": 2.9099088145896657, + "grad_norm": 0.07481312959203508, + "learning_rate": 1.7250386471449493e-06, + "loss": 0.5388, + "step": 5987 + }, + { + "epoch": 2.9103951367781153, + "grad_norm": 0.07477589243709247, + "learning_rate": 1.7235920935524947e-06, + "loss": 0.567, + "step": 5988 + }, + { + "epoch": 2.9108814589665655, + "grad_norm": 0.06825899619487304, + "learning_rate": 1.7221460204080537e-06, + "loss": 0.4513, + "step": 5989 + }, + { + "epoch": 2.911367781155015, + "grad_norm": 0.07225644321129099, + "learning_rate": 1.7207004279236762e-06, + "loss": 0.5246, + "step": 5990 + }, + { + "epoch": 2.9118541033434653, + "grad_norm": 0.07484221681855527, + "learning_rate": 1.719255316311342e-06, + "loss": 0.5591, + "step": 5991 + }, + { + "epoch": 2.912340425531915, + "grad_norm": 0.06939055296710232, + "learning_rate": 1.7178106857829602e-06, + "loss": 0.5016, + "step": 5992 + }, + { + "epoch": 2.9128267477203647, + "grad_norm": 0.06974699141258886, + "learning_rate": 1.7163665365503702e-06, + "loss": 0.5067, + "step": 5993 + }, + { + "epoch": 2.9133130699088143, + "grad_norm": 0.07488770736024253, + "learning_rate": 1.7149228688253388e-06, + "loss": 0.5461, + "step": 5994 + }, + { + "epoch": 2.9137993920972645, + "grad_norm": 0.07055136412381614, + "learning_rate": 1.7134796828195643e-06, + "loss": 0.5156, + "step": 5995 + }, + { + "epoch": 2.914285714285714, + "grad_norm": 0.07104380273782636, + "learning_rate": 1.7120369787446734e-06, + "loss": 0.519, + "step": 5996 + }, + { + "epoch": 2.9147720364741643, + "grad_norm": 0.07327946385579932, + "learning_rate": 1.7105947568122227e-06, + "loss": 0.5518, + "step": 5997 + }, + { + "epoch": 2.915258358662614, + "grad_norm": 0.07168190690447519, + "learning_rate": 1.7091530172336968e-06, + "loss": 0.4871, + "step": 5998 + }, + { + "epoch": 2.9157446808510636, + "grad_norm": 0.0676031914797481, + "learning_rate": 1.7077117602205128e-06, + "loss": 0.5023, + "step": 5999 + }, + { + "epoch": 2.9162310030395138, + "grad_norm": 0.07008615685094173, + "learning_rate": 1.706270985984011e-06, + "loss": 0.5088, + "step": 6000 + }, + { + "epoch": 2.9167173252279635, + "grad_norm": 0.06867686135284184, + "learning_rate": 1.7048306947354642e-06, + "loss": 0.4736, + "step": 6001 + }, + { + "epoch": 2.9172036474164136, + "grad_norm": 0.07005083320905237, + "learning_rate": 1.7033908866860794e-06, + "loss": 0.5128, + "step": 6002 + }, + { + "epoch": 2.9176899696048633, + "grad_norm": 0.06933621532800117, + "learning_rate": 1.7019515620469851e-06, + "loss": 0.5362, + "step": 6003 + }, + { + "epoch": 2.918176291793313, + "grad_norm": 0.0703131563761392, + "learning_rate": 1.700512721029242e-06, + "loss": 0.5366, + "step": 6004 + }, + { + "epoch": 2.918662613981763, + "grad_norm": 0.0717712736315408, + "learning_rate": 1.6990743638438411e-06, + "loss": 0.5527, + "step": 6005 + }, + { + "epoch": 2.9191489361702128, + "grad_norm": 0.07361356277841921, + "learning_rate": 1.6976364907016995e-06, + "loss": 0.5074, + "step": 6006 + }, + { + "epoch": 2.919635258358663, + "grad_norm": 0.0676281383765581, + "learning_rate": 1.6961991018136664e-06, + "loss": 0.4929, + "step": 6007 + }, + { + "epoch": 2.9201215805471126, + "grad_norm": 0.07227412116950283, + "learning_rate": 1.6947621973905176e-06, + "loss": 0.5763, + "step": 6008 + }, + { + "epoch": 2.9206079027355623, + "grad_norm": 0.06893859678991712, + "learning_rate": 1.693325777642959e-06, + "loss": 0.5147, + "step": 6009 + }, + { + "epoch": 2.921094224924012, + "grad_norm": 0.0714765583781806, + "learning_rate": 1.6918898427816255e-06, + "loss": 0.5168, + "step": 6010 + }, + { + "epoch": 2.921580547112462, + "grad_norm": 0.06789973577550529, + "learning_rate": 1.6904543930170802e-06, + "loss": 0.504, + "step": 6011 + }, + { + "epoch": 2.9220668693009118, + "grad_norm": 0.0722466904533531, + "learning_rate": 1.689019428559816e-06, + "loss": 0.5232, + "step": 6012 + }, + { + "epoch": 2.922553191489362, + "grad_norm": 0.06838086464874564, + "learning_rate": 1.687584949620255e-06, + "loss": 0.5147, + "step": 6013 + }, + { + "epoch": 2.9230395136778116, + "grad_norm": 0.07365026698934893, + "learning_rate": 1.6861509564087453e-06, + "loss": 0.5334, + "step": 6014 + }, + { + "epoch": 2.9235258358662612, + "grad_norm": 0.07015160335133704, + "learning_rate": 1.6847174491355662e-06, + "loss": 0.5365, + "step": 6015 + }, + { + "epoch": 2.9240121580547114, + "grad_norm": 0.07146543331738837, + "learning_rate": 1.6832844280109256e-06, + "loss": 0.5278, + "step": 6016 + }, + { + "epoch": 2.924498480243161, + "grad_norm": 0.06789744125428117, + "learning_rate": 1.68185189324496e-06, + "loss": 0.4657, + "step": 6017 + }, + { + "epoch": 2.924984802431611, + "grad_norm": 0.07122907863059538, + "learning_rate": 1.6804198450477345e-06, + "loss": 0.5122, + "step": 6018 + }, + { + "epoch": 2.925471124620061, + "grad_norm": 0.07112499999109342, + "learning_rate": 1.6789882836292403e-06, + "loss": 0.5488, + "step": 6019 + }, + { + "epoch": 2.9259574468085106, + "grad_norm": 0.07130916145644813, + "learning_rate": 1.6775572091994036e-06, + "loss": 0.5318, + "step": 6020 + }, + { + "epoch": 2.9264437689969602, + "grad_norm": 0.07092088569827337, + "learning_rate": 1.6761266219680734e-06, + "loss": 0.5332, + "step": 6021 + }, + { + "epoch": 2.9269300911854104, + "grad_norm": 0.07078000135641391, + "learning_rate": 1.6746965221450285e-06, + "loss": 0.5419, + "step": 6022 + }, + { + "epoch": 2.92741641337386, + "grad_norm": 0.06816657452952198, + "learning_rate": 1.673266909939978e-06, + "loss": 0.5108, + "step": 6023 + }, + { + "epoch": 2.92790273556231, + "grad_norm": 0.07281214106359819, + "learning_rate": 1.6718377855625567e-06, + "loss": 0.5309, + "step": 6024 + }, + { + "epoch": 2.92838905775076, + "grad_norm": 0.07141867442896573, + "learning_rate": 1.6704091492223313e-06, + "loss": 0.5222, + "step": 6025 + }, + { + "epoch": 2.9288753799392095, + "grad_norm": 0.0718099258009478, + "learning_rate": 1.6689810011287933e-06, + "loss": 0.5436, + "step": 6026 + }, + { + "epoch": 2.9293617021276597, + "grad_norm": 0.07223062760344223, + "learning_rate": 1.667553341491366e-06, + "loss": 0.5431, + "step": 6027 + }, + { + "epoch": 2.9298480243161094, + "grad_norm": 0.07097535127157406, + "learning_rate": 1.6661261705193998e-06, + "loss": 0.5103, + "step": 6028 + }, + { + "epoch": 2.9303343465045595, + "grad_norm": 0.07280884368679566, + "learning_rate": 1.6646994884221707e-06, + "loss": 0.5239, + "step": 6029 + }, + { + "epoch": 2.930820668693009, + "grad_norm": 0.07156559806022524, + "learning_rate": 1.663273295408887e-06, + "loss": 0.5515, + "step": 6030 + }, + { + "epoch": 2.931306990881459, + "grad_norm": 0.07046549374071892, + "learning_rate": 1.6618475916886834e-06, + "loss": 0.5105, + "step": 6031 + }, + { + "epoch": 2.931793313069909, + "grad_norm": 0.07100912615035251, + "learning_rate": 1.660422377470623e-06, + "loss": 0.5061, + "step": 6032 + }, + { + "epoch": 2.9322796352583587, + "grad_norm": 0.07177730183676781, + "learning_rate": 1.6589976529636976e-06, + "loss": 0.5296, + "step": 6033 + }, + { + "epoch": 2.9327659574468083, + "grad_norm": 0.06996226536673274, + "learning_rate": 1.6575734183768267e-06, + "loss": 0.4991, + "step": 6034 + }, + { + "epoch": 2.9332522796352585, + "grad_norm": 0.07065636754306481, + "learning_rate": 1.6561496739188582e-06, + "loss": 0.5621, + "step": 6035 + }, + { + "epoch": 2.933738601823708, + "grad_norm": 0.06868773838186035, + "learning_rate": 1.6547264197985685e-06, + "loss": 0.5006, + "step": 6036 + }, + { + "epoch": 2.934224924012158, + "grad_norm": 0.06856724646626858, + "learning_rate": 1.653303656224659e-06, + "loss": 0.4835, + "step": 6037 + }, + { + "epoch": 2.934711246200608, + "grad_norm": 0.06977618962396141, + "learning_rate": 1.6518813834057662e-06, + "loss": 0.5222, + "step": 6038 + }, + { + "epoch": 2.9351975683890577, + "grad_norm": 0.073265559420687, + "learning_rate": 1.6504596015504482e-06, + "loss": 0.5312, + "step": 6039 + }, + { + "epoch": 2.935683890577508, + "grad_norm": 0.0719482414605524, + "learning_rate": 1.6490383108671926e-06, + "loss": 0.5398, + "step": 6040 + }, + { + "epoch": 2.9361702127659575, + "grad_norm": 0.07281183254750259, + "learning_rate": 1.6476175115644162e-06, + "loss": 0.5675, + "step": 6041 + }, + { + "epoch": 2.936656534954407, + "grad_norm": 0.07049732432187852, + "learning_rate": 1.6461972038504631e-06, + "loss": 0.4935, + "step": 6042 + }, + { + "epoch": 2.9371428571428573, + "grad_norm": 0.07162588331611189, + "learning_rate": 1.6447773879336064e-06, + "loss": 0.5386, + "step": 6043 + }, + { + "epoch": 2.937629179331307, + "grad_norm": 0.07056773183133058, + "learning_rate": 1.6433580640220431e-06, + "loss": 0.5036, + "step": 6044 + }, + { + "epoch": 2.938115501519757, + "grad_norm": 0.07132677753981184, + "learning_rate": 1.6419392323239026e-06, + "loss": 0.5334, + "step": 6045 + }, + { + "epoch": 2.9386018237082068, + "grad_norm": 0.07468836253526807, + "learning_rate": 1.6405208930472404e-06, + "loss": 0.5505, + "step": 6046 + }, + { + "epoch": 2.9390881458966565, + "grad_norm": 0.07214054683430728, + "learning_rate": 1.63910304640004e-06, + "loss": 0.4762, + "step": 6047 + }, + { + "epoch": 2.939574468085106, + "grad_norm": 0.07475753543519399, + "learning_rate": 1.6376856925902123e-06, + "loss": 0.5363, + "step": 6048 + }, + { + "epoch": 2.9400607902735563, + "grad_norm": 0.07106541720468036, + "learning_rate": 1.6362688318255958e-06, + "loss": 0.526, + "step": 6049 + }, + { + "epoch": 2.940547112462006, + "grad_norm": 0.06920072059423771, + "learning_rate": 1.634852464313958e-06, + "loss": 0.5095, + "step": 6050 + }, + { + "epoch": 2.941033434650456, + "grad_norm": 0.0704480594708247, + "learning_rate": 1.6334365902629917e-06, + "loss": 0.5274, + "step": 6051 + }, + { + "epoch": 2.9415197568389058, + "grad_norm": 0.07097498997412845, + "learning_rate": 1.63202120988032e-06, + "loss": 0.4945, + "step": 6052 + }, + { + "epoch": 2.9420060790273554, + "grad_norm": 0.0738334596683305, + "learning_rate": 1.630606323373492e-06, + "loss": 0.516, + "step": 6053 + }, + { + "epoch": 2.9424924012158056, + "grad_norm": 0.07141998040096903, + "learning_rate": 1.6291919309499849e-06, + "loss": 0.5254, + "step": 6054 + }, + { + "epoch": 2.9429787234042553, + "grad_norm": 0.07119822669664502, + "learning_rate": 1.6277780328172026e-06, + "loss": 0.5118, + "step": 6055 + }, + { + "epoch": 2.9434650455927054, + "grad_norm": 0.07230331754380472, + "learning_rate": 1.6263646291824764e-06, + "loss": 0.5261, + "step": 6056 + }, + { + "epoch": 2.943951367781155, + "grad_norm": 0.07406292921022503, + "learning_rate": 1.6249517202530707e-06, + "loss": 0.5199, + "step": 6057 + }, + { + "epoch": 2.9444376899696048, + "grad_norm": 0.07392152844374228, + "learning_rate": 1.6235393062361666e-06, + "loss": 0.5795, + "step": 6058 + }, + { + "epoch": 2.9449240121580544, + "grad_norm": 0.07398911777580285, + "learning_rate": 1.6221273873388816e-06, + "loss": 0.5473, + "step": 6059 + }, + { + "epoch": 2.9454103343465046, + "grad_norm": 0.07002314362490732, + "learning_rate": 1.6207159637682568e-06, + "loss": 0.5028, + "step": 6060 + }, + { + "epoch": 2.9458966565349542, + "grad_norm": 0.06939334846963183, + "learning_rate": 1.6193050357312612e-06, + "loss": 0.5189, + "step": 6061 + }, + { + "epoch": 2.9463829787234044, + "grad_norm": 0.06948157078915279, + "learning_rate": 1.617894603434792e-06, + "loss": 0.4878, + "step": 6062 + }, + { + "epoch": 2.946869300911854, + "grad_norm": 0.07042747510794506, + "learning_rate": 1.6164846670856732e-06, + "loss": 0.5241, + "step": 6063 + }, + { + "epoch": 2.9473556231003037, + "grad_norm": 0.07468897432673499, + "learning_rate": 1.6150752268906555e-06, + "loss": 0.5496, + "step": 6064 + }, + { + "epoch": 2.947841945288754, + "grad_norm": 0.07318349839112621, + "learning_rate": 1.613666283056417e-06, + "loss": 0.5105, + "step": 6065 + }, + { + "epoch": 2.9483282674772036, + "grad_norm": 0.06990394364557215, + "learning_rate": 1.6122578357895641e-06, + "loss": 0.5099, + "step": 6066 + }, + { + "epoch": 2.9488145896656537, + "grad_norm": 0.07407482681614101, + "learning_rate": 1.6108498852966291e-06, + "loss": 0.5797, + "step": 6067 + }, + { + "epoch": 2.9493009118541034, + "grad_norm": 0.07191676067587008, + "learning_rate": 1.6094424317840724e-06, + "loss": 0.5391, + "step": 6068 + }, + { + "epoch": 2.949787234042553, + "grad_norm": 0.07010958232081264, + "learning_rate": 1.60803547545828e-06, + "loss": 0.5339, + "step": 6069 + }, + { + "epoch": 2.950273556231003, + "grad_norm": 0.0699173064606949, + "learning_rate": 1.6066290165255676e-06, + "loss": 0.5147, + "step": 6070 + }, + { + "epoch": 2.950759878419453, + "grad_norm": 0.06962652414736897, + "learning_rate": 1.6052230551921748e-06, + "loss": 0.5221, + "step": 6071 + }, + { + "epoch": 2.951246200607903, + "grad_norm": 0.06985566938253028, + "learning_rate": 1.6038175916642718e-06, + "loss": 0.4967, + "step": 6072 + }, + { + "epoch": 2.9517325227963527, + "grad_norm": 0.07258963703840368, + "learning_rate": 1.6024126261479516e-06, + "loss": 0.5263, + "step": 6073 + }, + { + "epoch": 2.9522188449848024, + "grad_norm": 0.07314877709463435, + "learning_rate": 1.6010081588492381e-06, + "loss": 0.5115, + "step": 6074 + }, + { + "epoch": 2.952705167173252, + "grad_norm": 0.0748136834455607, + "learning_rate": 1.5996041899740804e-06, + "loss": 0.5298, + "step": 6075 + }, + { + "epoch": 2.953191489361702, + "grad_norm": 0.07469751670967531, + "learning_rate": 1.5982007197283539e-06, + "loss": 0.5805, + "step": 6076 + }, + { + "epoch": 2.953677811550152, + "grad_norm": 0.07206073194033645, + "learning_rate": 1.596797748317862e-06, + "loss": 0.5387, + "step": 6077 + }, + { + "epoch": 2.954164133738602, + "grad_norm": 0.07002819675982361, + "learning_rate": 1.5953952759483344e-06, + "loss": 0.5165, + "step": 6078 + }, + { + "epoch": 2.9546504559270517, + "grad_norm": 0.07158275399443301, + "learning_rate": 1.5939933028254272e-06, + "loss": 0.5121, + "step": 6079 + }, + { + "epoch": 2.9551367781155014, + "grad_norm": 0.07258915114443422, + "learning_rate": 1.5925918291547249e-06, + "loss": 0.5081, + "step": 6080 + }, + { + "epoch": 2.9556231003039515, + "grad_norm": 0.07123377533054698, + "learning_rate": 1.591190855141737e-06, + "loss": 0.5352, + "step": 6081 + }, + { + "epoch": 2.956109422492401, + "grad_norm": 0.07193813824280655, + "learning_rate": 1.5897903809919008e-06, + "loss": 0.5376, + "step": 6082 + }, + { + "epoch": 2.9565957446808513, + "grad_norm": 0.0729967474531232, + "learning_rate": 1.5883904069105793e-06, + "loss": 0.5722, + "step": 6083 + }, + { + "epoch": 2.957082066869301, + "grad_norm": 0.07019001229182958, + "learning_rate": 1.5869909331030636e-06, + "loss": 0.5009, + "step": 6084 + }, + { + "epoch": 2.9575683890577507, + "grad_norm": 0.07088399060126647, + "learning_rate": 1.58559195977457e-06, + "loss": 0.5249, + "step": 6085 + }, + { + "epoch": 2.9580547112462003, + "grad_norm": 0.07176912479884007, + "learning_rate": 1.5841934871302423e-06, + "loss": 0.5416, + "step": 6086 + }, + { + "epoch": 2.9585410334346505, + "grad_norm": 0.07280805550982031, + "learning_rate": 1.5827955153751507e-06, + "loss": 0.5323, + "step": 6087 + }, + { + "epoch": 2.9590273556231, + "grad_norm": 0.07106150728197004, + "learning_rate": 1.5813980447142924e-06, + "loss": 0.5062, + "step": 6088 + }, + { + "epoch": 2.9595136778115503, + "grad_norm": 0.07253740645729541, + "learning_rate": 1.5800010753525896e-06, + "loss": 0.542, + "step": 6089 + }, + { + "epoch": 2.96, + "grad_norm": 0.07217280197606152, + "learning_rate": 1.5786046074948924e-06, + "loss": 0.5513, + "step": 6090 + }, + { + "epoch": 2.9604863221884496, + "grad_norm": 0.06928289448280522, + "learning_rate": 1.5772086413459787e-06, + "loss": 0.5073, + "step": 6091 + }, + { + "epoch": 2.9609726443768998, + "grad_norm": 0.06960362723218959, + "learning_rate": 1.5758131771105457e-06, + "loss": 0.5012, + "step": 6092 + }, + { + "epoch": 2.9614589665653495, + "grad_norm": 0.06825358917997426, + "learning_rate": 1.574418214993228e-06, + "loss": 0.4727, + "step": 6093 + }, + { + "epoch": 2.9619452887537996, + "grad_norm": 0.06889632566465823, + "learning_rate": 1.5730237551985794e-06, + "loss": 0.5153, + "step": 6094 + }, + { + "epoch": 2.9624316109422493, + "grad_norm": 0.07279289320975993, + "learning_rate": 1.5716297979310807e-06, + "loss": 0.5228, + "step": 6095 + }, + { + "epoch": 2.962917933130699, + "grad_norm": 0.07129552296789354, + "learning_rate": 1.5702363433951407e-06, + "loss": 0.5254, + "step": 6096 + }, + { + "epoch": 2.963404255319149, + "grad_norm": 0.07213254444269546, + "learning_rate": 1.5688433917950934e-06, + "loss": 0.5276, + "step": 6097 + }, + { + "epoch": 2.9638905775075988, + "grad_norm": 0.07307685030409322, + "learning_rate": 1.5674509433351992e-06, + "loss": 0.5613, + "step": 6098 + }, + { + "epoch": 2.964376899696049, + "grad_norm": 0.07021795372797525, + "learning_rate": 1.566058998219645e-06, + "loss": 0.5225, + "step": 6099 + }, + { + "epoch": 2.9648632218844986, + "grad_norm": 0.07051656671820612, + "learning_rate": 1.5646675566525437e-06, + "loss": 0.5046, + "step": 6100 + }, + { + "epoch": 2.9653495440729483, + "grad_norm": 0.07163921484084286, + "learning_rate": 1.5632766188379346e-06, + "loss": 0.5243, + "step": 6101 + }, + { + "epoch": 2.965835866261398, + "grad_norm": 0.06946789441899145, + "learning_rate": 1.5618861849797824e-06, + "loss": 0.5206, + "step": 6102 + }, + { + "epoch": 2.966322188449848, + "grad_norm": 0.06940574585310512, + "learning_rate": 1.5604962552819792e-06, + "loss": 0.4944, + "step": 6103 + }, + { + "epoch": 2.9668085106382978, + "grad_norm": 0.06913005264797906, + "learning_rate": 1.559106829948342e-06, + "loss": 0.4807, + "step": 6104 + }, + { + "epoch": 2.967294832826748, + "grad_norm": 0.07269691611788394, + "learning_rate": 1.5577179091826156e-06, + "loss": 0.5571, + "step": 6105 + }, + { + "epoch": 2.9677811550151976, + "grad_norm": 0.07254529560632424, + "learning_rate": 1.5563294931884665e-06, + "loss": 0.549, + "step": 6106 + }, + { + "epoch": 2.9682674772036473, + "grad_norm": 0.07168650248203734, + "learning_rate": 1.554941582169492e-06, + "loss": 0.5294, + "step": 6107 + }, + { + "epoch": 2.9687537993920974, + "grad_norm": 0.07075477280000363, + "learning_rate": 1.5535541763292127e-06, + "loss": 0.5072, + "step": 6108 + }, + { + "epoch": 2.969240121580547, + "grad_norm": 0.07165454533166851, + "learning_rate": 1.5521672758710772e-06, + "loss": 0.5289, + "step": 6109 + }, + { + "epoch": 2.969726443768997, + "grad_norm": 0.07219331729512117, + "learning_rate": 1.550780880998456e-06, + "loss": 0.5215, + "step": 6110 + }, + { + "epoch": 2.970212765957447, + "grad_norm": 0.07119116092956648, + "learning_rate": 1.5493949919146517e-06, + "loss": 0.5284, + "step": 6111 + }, + { + "epoch": 2.9706990881458966, + "grad_norm": 0.0700695460048129, + "learning_rate": 1.5480096088228874e-06, + "loss": 0.5112, + "step": 6112 + }, + { + "epoch": 2.9711854103343462, + "grad_norm": 0.0716373132061361, + "learning_rate": 1.5466247319263144e-06, + "loss": 0.5306, + "step": 6113 + }, + { + "epoch": 2.9716717325227964, + "grad_norm": 0.07144441769307608, + "learning_rate": 1.5452403614280087e-06, + "loss": 0.5128, + "step": 6114 + }, + { + "epoch": 2.972158054711246, + "grad_norm": 0.07155792877241401, + "learning_rate": 1.5438564975309728e-06, + "loss": 0.5122, + "step": 6115 + }, + { + "epoch": 2.972644376899696, + "grad_norm": 0.07361865969549194, + "learning_rate": 1.5424731404381344e-06, + "loss": 0.5554, + "step": 6116 + }, + { + "epoch": 2.973130699088146, + "grad_norm": 0.07128556580327253, + "learning_rate": 1.5410902903523467e-06, + "loss": 0.528, + "step": 6117 + }, + { + "epoch": 2.9736170212765956, + "grad_norm": 0.07478992479503081, + "learning_rate": 1.53970794747639e-06, + "loss": 0.5402, + "step": 6118 + }, + { + "epoch": 2.9741033434650457, + "grad_norm": 0.07097175238564427, + "learning_rate": 1.5383261120129679e-06, + "loss": 0.5117, + "step": 6119 + }, + { + "epoch": 2.9745896656534954, + "grad_norm": 0.07217017031113146, + "learning_rate": 1.5369447841647133e-06, + "loss": 0.5174, + "step": 6120 + }, + { + "epoch": 2.9750759878419455, + "grad_norm": 0.07153177765734692, + "learning_rate": 1.535563964134179e-06, + "loss": 0.5089, + "step": 6121 + }, + { + "epoch": 2.975562310030395, + "grad_norm": 0.07278554468685437, + "learning_rate": 1.5341836521238486e-06, + "loss": 0.5647, + "step": 6122 + }, + { + "epoch": 2.976048632218845, + "grad_norm": 0.07018225675099196, + "learning_rate": 1.532803848336128e-06, + "loss": 0.4829, + "step": 6123 + }, + { + "epoch": 2.976534954407295, + "grad_norm": 0.07046158785296074, + "learning_rate": 1.5314245529733507e-06, + "loss": 0.5269, + "step": 6124 + }, + { + "epoch": 2.9770212765957447, + "grad_norm": 0.07155523015072264, + "learning_rate": 1.5300457662377744e-06, + "loss": 0.5183, + "step": 6125 + }, + { + "epoch": 2.977507598784195, + "grad_norm": 0.07030725503171567, + "learning_rate": 1.5286674883315828e-06, + "loss": 0.4965, + "step": 6126 + }, + { + "epoch": 2.9779939209726445, + "grad_norm": 0.07015393569927068, + "learning_rate": 1.5272897194568837e-06, + "loss": 0.5502, + "step": 6127 + }, + { + "epoch": 2.978480243161094, + "grad_norm": 0.06920298317638908, + "learning_rate": 1.525912459815711e-06, + "loss": 0.5304, + "step": 6128 + }, + { + "epoch": 2.978966565349544, + "grad_norm": 0.06948782463006692, + "learning_rate": 1.5245357096100266e-06, + "loss": 0.4959, + "step": 6129 + }, + { + "epoch": 2.979452887537994, + "grad_norm": 0.069537593099965, + "learning_rate": 1.523159469041714e-06, + "loss": 0.5258, + "step": 6130 + }, + { + "epoch": 2.9799392097264437, + "grad_norm": 0.07111963157495997, + "learning_rate": 1.5217837383125828e-06, + "loss": 0.5206, + "step": 6131 + }, + { + "epoch": 2.980425531914894, + "grad_norm": 0.07089125224482085, + "learning_rate": 1.520408517624369e-06, + "loss": 0.4981, + "step": 6132 + }, + { + "epoch": 2.9809118541033435, + "grad_norm": 0.07104372785165855, + "learning_rate": 1.5190338071787325e-06, + "loss": 0.5695, + "step": 6133 + }, + { + "epoch": 2.981398176291793, + "grad_norm": 0.07222586036350302, + "learning_rate": 1.5176596071772592e-06, + "loss": 0.5036, + "step": 6134 + }, + { + "epoch": 2.9818844984802433, + "grad_norm": 0.07251231875788276, + "learning_rate": 1.5162859178214617e-06, + "loss": 0.5456, + "step": 6135 + }, + { + "epoch": 2.982370820668693, + "grad_norm": 0.0684742697918209, + "learning_rate": 1.5149127393127727e-06, + "loss": 0.4976, + "step": 6136 + }, + { + "epoch": 2.982857142857143, + "grad_norm": 0.07190973330983985, + "learning_rate": 1.5135400718525545e-06, + "loss": 0.5695, + "step": 6137 + }, + { + "epoch": 2.983343465045593, + "grad_norm": 0.07176590213796266, + "learning_rate": 1.5121679156420932e-06, + "loss": 0.5241, + "step": 6138 + }, + { + "epoch": 2.9838297872340425, + "grad_norm": 0.07181166336761126, + "learning_rate": 1.5107962708826e-06, + "loss": 0.5402, + "step": 6139 + }, + { + "epoch": 2.984316109422492, + "grad_norm": 0.06958430694648206, + "learning_rate": 1.5094251377752112e-06, + "loss": 0.5181, + "step": 6140 + }, + { + "epoch": 2.9848024316109423, + "grad_norm": 0.07000975881574777, + "learning_rate": 1.5080545165209881e-06, + "loss": 0.5105, + "step": 6141 + }, + { + "epoch": 2.985288753799392, + "grad_norm": 0.06921138290213592, + "learning_rate": 1.5066844073209164e-06, + "loss": 0.5157, + "step": 6142 + }, + { + "epoch": 2.985775075987842, + "grad_norm": 0.07036673448578677, + "learning_rate": 1.5053148103759075e-06, + "loss": 0.5278, + "step": 6143 + }, + { + "epoch": 2.9862613981762918, + "grad_norm": 0.07128475918845432, + "learning_rate": 1.5039457258867961e-06, + "loss": 0.5205, + "step": 6144 + }, + { + "epoch": 2.9867477203647415, + "grad_norm": 0.07272772419222057, + "learning_rate": 1.5025771540543443e-06, + "loss": 0.5113, + "step": 6145 + }, + { + "epoch": 2.9872340425531916, + "grad_norm": 0.07086674942672036, + "learning_rate": 1.5012090950792353e-06, + "loss": 0.5338, + "step": 6146 + }, + { + "epoch": 2.9877203647416413, + "grad_norm": 0.06983750488366851, + "learning_rate": 1.4998415491620822e-06, + "loss": 0.4985, + "step": 6147 + }, + { + "epoch": 2.9882066869300914, + "grad_norm": 0.07163101506992368, + "learning_rate": 1.4984745165034192e-06, + "loss": 0.5401, + "step": 6148 + }, + { + "epoch": 2.988693009118541, + "grad_norm": 0.07263895277449675, + "learning_rate": 1.4971079973037078e-06, + "loss": 0.5349, + "step": 6149 + }, + { + "epoch": 2.9891793313069908, + "grad_norm": 0.07714082810051419, + "learning_rate": 1.4957419917633293e-06, + "loss": 0.4933, + "step": 6150 + }, + { + "epoch": 2.989665653495441, + "grad_norm": 0.07236696119733044, + "learning_rate": 1.4943765000825933e-06, + "loss": 0.5105, + "step": 6151 + }, + { + "epoch": 2.9901519756838906, + "grad_norm": 0.07102042208541053, + "learning_rate": 1.4930115224617353e-06, + "loss": 0.5238, + "step": 6152 + }, + { + "epoch": 2.9906382978723407, + "grad_norm": 0.07049020098172762, + "learning_rate": 1.491647059100913e-06, + "loss": 0.5236, + "step": 6153 + }, + { + "epoch": 2.9911246200607904, + "grad_norm": 0.06999746809100563, + "learning_rate": 1.490283110200209e-06, + "loss": 0.4926, + "step": 6154 + }, + { + "epoch": 2.99161094224924, + "grad_norm": 0.0732270009364142, + "learning_rate": 1.488919675959632e-06, + "loss": 0.5493, + "step": 6155 + }, + { + "epoch": 2.9920972644376898, + "grad_norm": 0.07085013259654209, + "learning_rate": 1.4875567565791132e-06, + "loss": 0.5481, + "step": 6156 + }, + { + "epoch": 2.99258358662614, + "grad_norm": 0.06999918428777495, + "learning_rate": 1.4861943522585093e-06, + "loss": 0.505, + "step": 6157 + }, + { + "epoch": 2.9930699088145896, + "grad_norm": 0.06963902933179202, + "learning_rate": 1.4848324631976025e-06, + "loss": 0.5259, + "step": 6158 + }, + { + "epoch": 2.9935562310030397, + "grad_norm": 0.07001521704660149, + "learning_rate": 1.4834710895960968e-06, + "loss": 0.4737, + "step": 6159 + }, + { + "epoch": 2.9940425531914894, + "grad_norm": 0.07287657562554004, + "learning_rate": 1.4821102316536235e-06, + "loss": 0.5212, + "step": 6160 + }, + { + "epoch": 2.994528875379939, + "grad_norm": 0.07061187642821795, + "learning_rate": 1.4807498895697365e-06, + "loss": 0.507, + "step": 6161 + }, + { + "epoch": 2.995015197568389, + "grad_norm": 0.07173662217190524, + "learning_rate": 1.479390063543914e-06, + "loss": 0.5391, + "step": 6162 + }, + { + "epoch": 2.995501519756839, + "grad_norm": 0.06943580885088292, + "learning_rate": 1.47803075377556e-06, + "loss": 0.4807, + "step": 6163 + }, + { + "epoch": 2.995987841945289, + "grad_norm": 0.07244212099807654, + "learning_rate": 1.4766719604640012e-06, + "loss": 0.5509, + "step": 6164 + }, + { + "epoch": 2.9964741641337387, + "grad_norm": 0.07057960905286166, + "learning_rate": 1.4753136838084892e-06, + "loss": 0.523, + "step": 6165 + }, + { + "epoch": 2.9969604863221884, + "grad_norm": 0.06959391140760514, + "learning_rate": 1.4739559240082001e-06, + "loss": 0.5007, + "step": 6166 + }, + { + "epoch": 2.997446808510638, + "grad_norm": 0.07490875829482416, + "learning_rate": 1.4725986812622339e-06, + "loss": 0.5632, + "step": 6167 + }, + { + "epoch": 2.997933130699088, + "grad_norm": 0.06918319147220206, + "learning_rate": 1.471241955769615e-06, + "loss": 0.4932, + "step": 6168 + }, + { + "epoch": 2.997933130699088, + "eval_loss": 0.5682429075241089, + "eval_runtime": 105.2179, + "eval_samples_per_second": 288.478, + "eval_steps_per_second": 36.068, + "step": 6168 + }, + { + "epoch": 2.998419452887538, + "grad_norm": 0.07322987594972527, + "learning_rate": 1.469885747729291e-06, + "loss": 0.5123, + "step": 6169 + }, + { + "epoch": 2.998905775075988, + "grad_norm": 0.06951847275323363, + "learning_rate": 1.4685300573401357e-06, + "loss": 0.4923, + "step": 6170 + }, + { + "epoch": 2.9993920972644377, + "grad_norm": 0.07291889866397412, + "learning_rate": 1.4671748848009443e-06, + "loss": 0.5287, + "step": 6171 + }, + { + "epoch": 2.9998784194528874, + "grad_norm": 0.07112669098830986, + "learning_rate": 1.4658202303104385e-06, + "loss": 0.529, + "step": 6172 + }, + { + "epoch": 3.0, + "grad_norm": 0.07112669098830986, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.1511, + "step": 6173 + }, + { + "epoch": 3.0003647416413375, + "grad_norm": 0.06955238737644102, + "learning_rate": 1.4631124762699856e-06, + "loss": 0.3596, + "step": 6174 + }, + { + "epoch": 3.0004863221884497, + "grad_norm": 0.07051223448095613, + "learning_rate": 1.4617593771170996e-06, + "loss": 0.4762, + "step": 6175 + }, + { + "epoch": 3.0009726443769, + "grad_norm": 0.07204726616358026, + "learning_rate": 1.4604067968070218e-06, + "loss": 0.5139, + "step": 6176 + }, + { + "epoch": 3.0014589665653495, + "grad_norm": 0.0721694540504832, + "learning_rate": 1.4590547355380925e-06, + "loss": 0.477, + "step": 6177 + }, + { + "epoch": 3.001945288753799, + "grad_norm": 0.07155953490441903, + "learning_rate": 1.4577031935085762e-06, + "loss": 0.5355, + "step": 6178 + }, + { + "epoch": 3.0024316109422493, + "grad_norm": 0.06948066977378234, + "learning_rate": 1.4563521709166606e-06, + "loss": 0.4806, + "step": 6179 + }, + { + "epoch": 3.002917933130699, + "grad_norm": 0.07219113710944954, + "learning_rate": 1.455001667960459e-06, + "loss": 0.5102, + "step": 6180 + }, + { + "epoch": 3.003404255319149, + "grad_norm": 0.07069258204358841, + "learning_rate": 1.4536516848380061e-06, + "loss": 0.4911, + "step": 6181 + }, + { + "epoch": 3.003890577507599, + "grad_norm": 0.07250456230563963, + "learning_rate": 1.4523022217472626e-06, + "loss": 0.4937, + "step": 6182 + }, + { + "epoch": 3.0043768996960485, + "grad_norm": 0.07210395667743953, + "learning_rate": 1.4509532788861113e-06, + "loss": 0.5389, + "step": 6183 + }, + { + "epoch": 3.0048632218844986, + "grad_norm": 0.07200277108613336, + "learning_rate": 1.4496048564523595e-06, + "loss": 0.5277, + "step": 6184 + }, + { + "epoch": 3.0053495440729483, + "grad_norm": 0.06992059476819104, + "learning_rate": 1.4482569546437386e-06, + "loss": 0.4959, + "step": 6185 + }, + { + "epoch": 3.005835866261398, + "grad_norm": 0.07223332263166926, + "learning_rate": 1.4469095736579019e-06, + "loss": 0.5138, + "step": 6186 + }, + { + "epoch": 3.006322188449848, + "grad_norm": 0.0706874370686038, + "learning_rate": 1.4455627136924282e-06, + "loss": 0.5124, + "step": 6187 + }, + { + "epoch": 3.006808510638298, + "grad_norm": 0.07267794332800777, + "learning_rate": 1.4442163749448201e-06, + "loss": 0.5732, + "step": 6188 + }, + { + "epoch": 3.007294832826748, + "grad_norm": 0.0727296703807697, + "learning_rate": 1.4428705576125012e-06, + "loss": 0.5033, + "step": 6189 + }, + { + "epoch": 3.0077811550151976, + "grad_norm": 0.071193769441196, + "learning_rate": 1.4415252618928216e-06, + "loss": 0.5097, + "step": 6190 + }, + { + "epoch": 3.0082674772036473, + "grad_norm": 0.07004230354354819, + "learning_rate": 1.4401804879830527e-06, + "loss": 0.4983, + "step": 6191 + }, + { + "epoch": 3.0087537993920974, + "grad_norm": 0.07075477419294558, + "learning_rate": 1.4388362360803909e-06, + "loss": 0.5062, + "step": 6192 + }, + { + "epoch": 3.009240121580547, + "grad_norm": 0.07435942836111588, + "learning_rate": 1.4374925063819557e-06, + "loss": 0.5568, + "step": 6193 + }, + { + "epoch": 3.009726443768997, + "grad_norm": 0.074419386899034, + "learning_rate": 1.4361492990847892e-06, + "loss": 0.5102, + "step": 6194 + }, + { + "epoch": 3.010212765957447, + "grad_norm": 0.07130657221589082, + "learning_rate": 1.434806614385858e-06, + "loss": 0.5209, + "step": 6195 + }, + { + "epoch": 3.0106990881458966, + "grad_norm": 0.07139139930266573, + "learning_rate": 1.4334644524820512e-06, + "loss": 0.494, + "step": 6196 + }, + { + "epoch": 3.0111854103343467, + "grad_norm": 0.0701504719476038, + "learning_rate": 1.432122813570182e-06, + "loss": 0.5285, + "step": 6197 + }, + { + "epoch": 3.0116717325227964, + "grad_norm": 0.0690511543570714, + "learning_rate": 1.430781697846988e-06, + "loss": 0.4964, + "step": 6198 + }, + { + "epoch": 3.012158054711246, + "grad_norm": 0.07207671846249566, + "learning_rate": 1.4294411055091246e-06, + "loss": 0.5094, + "step": 6199 + }, + { + "epoch": 3.012644376899696, + "grad_norm": 0.06929879249846607, + "learning_rate": 1.4281010367531773e-06, + "loss": 0.5136, + "step": 6200 + }, + { + "epoch": 3.013130699088146, + "grad_norm": 0.06924668780601602, + "learning_rate": 1.4267614917756495e-06, + "loss": 0.519, + "step": 6201 + }, + { + "epoch": 3.0136170212765956, + "grad_norm": 0.07050709081625034, + "learning_rate": 1.4254224707729736e-06, + "loss": 0.4733, + "step": 6202 + }, + { + "epoch": 3.0141033434650457, + "grad_norm": 0.072794380308219, + "learning_rate": 1.4240839739415002e-06, + "loss": 0.5298, + "step": 6203 + }, + { + "epoch": 3.0145896656534954, + "grad_norm": 0.0697028306395541, + "learning_rate": 1.4227460014775051e-06, + "loss": 0.4784, + "step": 6204 + }, + { + "epoch": 3.015075987841945, + "grad_norm": 0.07291993555025729, + "learning_rate": 1.4214085535771865e-06, + "loss": 0.5438, + "step": 6205 + }, + { + "epoch": 3.015562310030395, + "grad_norm": 0.06960749927708733, + "learning_rate": 1.4200716304366658e-06, + "loss": 0.4893, + "step": 6206 + }, + { + "epoch": 3.016048632218845, + "grad_norm": 0.07162484393079868, + "learning_rate": 1.4187352322519876e-06, + "loss": 0.5018, + "step": 6207 + }, + { + "epoch": 3.016534954407295, + "grad_norm": 0.07149464072525191, + "learning_rate": 1.4173993592191199e-06, + "loss": 0.5174, + "step": 6208 + }, + { + "epoch": 3.0170212765957447, + "grad_norm": 0.0716207880463414, + "learning_rate": 1.416064011533953e-06, + "loss": 0.5189, + "step": 6209 + }, + { + "epoch": 3.0175075987841944, + "grad_norm": 0.06972230036932861, + "learning_rate": 1.4147291893923004e-06, + "loss": 0.5191, + "step": 6210 + }, + { + "epoch": 3.0179939209726445, + "grad_norm": 0.07318914066741655, + "learning_rate": 1.4133948929898988e-06, + "loss": 0.5074, + "step": 6211 + }, + { + "epoch": 3.018480243161094, + "grad_norm": 0.07063476281997885, + "learning_rate": 1.412061122522409e-06, + "loss": 0.5547, + "step": 6212 + }, + { + "epoch": 3.018966565349544, + "grad_norm": 0.07304224120202682, + "learning_rate": 1.4107278781854107e-06, + "loss": 0.534, + "step": 6213 + }, + { + "epoch": 3.019452887537994, + "grad_norm": 0.06895765283435634, + "learning_rate": 1.4093951601744098e-06, + "loss": 0.4958, + "step": 6214 + }, + { + "epoch": 3.0199392097264437, + "grad_norm": 0.07217781718503266, + "learning_rate": 1.4080629686848347e-06, + "loss": 0.5055, + "step": 6215 + }, + { + "epoch": 3.020425531914894, + "grad_norm": 0.07276528482364031, + "learning_rate": 1.4067313039120361e-06, + "loss": 0.5214, + "step": 6216 + }, + { + "epoch": 3.0209118541033435, + "grad_norm": 0.07160486140565049, + "learning_rate": 1.4054001660512873e-06, + "loss": 0.4875, + "step": 6217 + }, + { + "epoch": 3.021398176291793, + "grad_norm": 0.06866248613764944, + "learning_rate": 1.404069555297785e-06, + "loss": 0.4975, + "step": 6218 + }, + { + "epoch": 3.0218844984802433, + "grad_norm": 0.07032880112944945, + "learning_rate": 1.4027394718466463e-06, + "loss": 0.5038, + "step": 6219 + }, + { + "epoch": 3.022370820668693, + "grad_norm": 0.07077522392031475, + "learning_rate": 1.4014099158929162e-06, + "loss": 0.5105, + "step": 6220 + }, + { + "epoch": 3.0228571428571427, + "grad_norm": 0.06927426904119653, + "learning_rate": 1.4000808876315568e-06, + "loss": 0.519, + "step": 6221 + }, + { + "epoch": 3.023343465045593, + "grad_norm": 0.07363887692539962, + "learning_rate": 1.398752387257456e-06, + "loss": 0.5779, + "step": 6222 + }, + { + "epoch": 3.0238297872340425, + "grad_norm": 0.06902544366548685, + "learning_rate": 1.3974244149654221e-06, + "loss": 0.4847, + "step": 6223 + }, + { + "epoch": 3.024316109422492, + "grad_norm": 0.07075728502671924, + "learning_rate": 1.396096970950188e-06, + "loss": 0.4873, + "step": 6224 + }, + { + "epoch": 3.0248024316109423, + "grad_norm": 0.07237784655488048, + "learning_rate": 1.3947700554064086e-06, + "loss": 0.5298, + "step": 6225 + }, + { + "epoch": 3.025288753799392, + "grad_norm": 0.06925651254264355, + "learning_rate": 1.39344366852866e-06, + "loss": 0.489, + "step": 6226 + }, + { + "epoch": 3.025775075987842, + "grad_norm": 0.06945003347846412, + "learning_rate": 1.3921178105114436e-06, + "loss": 0.5039, + "step": 6227 + }, + { + "epoch": 3.026261398176292, + "grad_norm": 0.07107567334362128, + "learning_rate": 1.3907924815491791e-06, + "loss": 0.5178, + "step": 6228 + }, + { + "epoch": 3.0267477203647415, + "grad_norm": 0.07470872355468923, + "learning_rate": 1.3894676818362112e-06, + "loss": 0.5208, + "step": 6229 + }, + { + "epoch": 3.0272340425531916, + "grad_norm": 0.0689356325817015, + "learning_rate": 1.388143411566808e-06, + "loss": 0.4893, + "step": 6230 + }, + { + "epoch": 3.0277203647416413, + "grad_norm": 0.07016919821332085, + "learning_rate": 1.3868196709351582e-06, + "loss": 0.505, + "step": 6231 + }, + { + "epoch": 3.028206686930091, + "grad_norm": 0.07241493526076244, + "learning_rate": 1.3854964601353732e-06, + "loss": 0.5002, + "step": 6232 + }, + { + "epoch": 3.028693009118541, + "grad_norm": 0.07239961017815634, + "learning_rate": 1.3841737793614869e-06, + "loss": 0.5332, + "step": 6233 + }, + { + "epoch": 3.029179331306991, + "grad_norm": 0.0741605478453966, + "learning_rate": 1.3828516288074551e-06, + "loss": 0.5113, + "step": 6234 + }, + { + "epoch": 3.029665653495441, + "grad_norm": 0.0699171037833093, + "learning_rate": 1.3815300086671569e-06, + "loss": 0.5013, + "step": 6235 + }, + { + "epoch": 3.0301519756838906, + "grad_norm": 0.06880322183982368, + "learning_rate": 1.380208919134392e-06, + "loss": 0.4955, + "step": 6236 + }, + { + "epoch": 3.0306382978723403, + "grad_norm": 0.07159963920616899, + "learning_rate": 1.3788883604028825e-06, + "loss": 0.4988, + "step": 6237 + }, + { + "epoch": 3.0311246200607904, + "grad_norm": 0.07051842277578378, + "learning_rate": 1.377568332666276e-06, + "loss": 0.504, + "step": 6238 + }, + { + "epoch": 3.03161094224924, + "grad_norm": 0.06947650910451554, + "learning_rate": 1.3762488361181382e-06, + "loss": 0.4606, + "step": 6239 + }, + { + "epoch": 3.03209726443769, + "grad_norm": 0.07284046010469804, + "learning_rate": 1.3749298709519576e-06, + "loss": 0.5367, + "step": 6240 + }, + { + "epoch": 3.03258358662614, + "grad_norm": 0.06930677666814176, + "learning_rate": 1.3736114373611464e-06, + "loss": 0.493, + "step": 6241 + }, + { + "epoch": 3.0330699088145896, + "grad_norm": 0.07178891453088389, + "learning_rate": 1.3722935355390394e-06, + "loss": 0.5118, + "step": 6242 + }, + { + "epoch": 3.0335562310030397, + "grad_norm": 0.07255145843100853, + "learning_rate": 1.3709761656788884e-06, + "loss": 0.5364, + "step": 6243 + }, + { + "epoch": 3.0340425531914894, + "grad_norm": 0.06934723784036224, + "learning_rate": 1.3696593279738718e-06, + "loss": 0.5046, + "step": 6244 + }, + { + "epoch": 3.034528875379939, + "grad_norm": 0.07427312441588829, + "learning_rate": 1.3683430226170903e-06, + "loss": 0.5037, + "step": 6245 + }, + { + "epoch": 3.0350151975683892, + "grad_norm": 0.06928410477212978, + "learning_rate": 1.3670272498015636e-06, + "loss": 0.4838, + "step": 6246 + }, + { + "epoch": 3.035501519756839, + "grad_norm": 0.07128968017872989, + "learning_rate": 1.3657120097202359e-06, + "loss": 0.4791, + "step": 6247 + }, + { + "epoch": 3.0359878419452886, + "grad_norm": 0.07173212178439788, + "learning_rate": 1.3643973025659723e-06, + "loss": 0.5385, + "step": 6248 + }, + { + "epoch": 3.0364741641337387, + "grad_norm": 0.07035323168537334, + "learning_rate": 1.3630831285315588e-06, + "loss": 0.5042, + "step": 6249 + }, + { + "epoch": 3.0369604863221884, + "grad_norm": 0.07251501452065133, + "learning_rate": 1.3617694878097048e-06, + "loss": 0.5322, + "step": 6250 + }, + { + "epoch": 3.037446808510638, + "grad_norm": 0.07251463708211572, + "learning_rate": 1.3604563805930405e-06, + "loss": 0.4886, + "step": 6251 + }, + { + "epoch": 3.037933130699088, + "grad_norm": 0.06907004330044605, + "learning_rate": 1.3591438070741182e-06, + "loss": 0.5016, + "step": 6252 + }, + { + "epoch": 3.038419452887538, + "grad_norm": 0.07170928985389605, + "learning_rate": 1.3578317674454117e-06, + "loss": 0.501, + "step": 6253 + }, + { + "epoch": 3.038905775075988, + "grad_norm": 0.0714334508847959, + "learning_rate": 1.3565202618993173e-06, + "loss": 0.498, + "step": 6254 + }, + { + "epoch": 3.0393920972644377, + "grad_norm": 0.07113356755831104, + "learning_rate": 1.3552092906281505e-06, + "loss": 0.4976, + "step": 6255 + }, + { + "epoch": 3.0398784194528874, + "grad_norm": 0.07338302872459229, + "learning_rate": 1.3538988538241548e-06, + "loss": 0.4945, + "step": 6256 + }, + { + "epoch": 3.0403647416413375, + "grad_norm": 0.07321136354776898, + "learning_rate": 1.3525889516794865e-06, + "loss": 0.518, + "step": 6257 + }, + { + "epoch": 3.040851063829787, + "grad_norm": 0.07159510679921081, + "learning_rate": 1.3512795843862292e-06, + "loss": 0.509, + "step": 6258 + }, + { + "epoch": 3.041337386018237, + "grad_norm": 0.07433397630829636, + "learning_rate": 1.349970752136387e-06, + "loss": 0.5349, + "step": 6259 + }, + { + "epoch": 3.041823708206687, + "grad_norm": 0.06958759396400542, + "learning_rate": 1.3486624551218853e-06, + "loss": 0.502, + "step": 6260 + }, + { + "epoch": 3.0423100303951367, + "grad_norm": 0.0704650932052657, + "learning_rate": 1.3473546935345704e-06, + "loss": 0.5116, + "step": 6261 + }, + { + "epoch": 3.042796352583587, + "grad_norm": 0.07027851137018615, + "learning_rate": 1.3460474675662117e-06, + "loss": 0.4975, + "step": 6262 + }, + { + "epoch": 3.0432826747720365, + "grad_norm": 0.06795607521033717, + "learning_rate": 1.344740777408498e-06, + "loss": 0.5008, + "step": 6263 + }, + { + "epoch": 3.043768996960486, + "grad_norm": 0.07057546231753216, + "learning_rate": 1.3434346232530416e-06, + "loss": 0.4678, + "step": 6264 + }, + { + "epoch": 3.0442553191489363, + "grad_norm": 0.07178224314994901, + "learning_rate": 1.3421290052913744e-06, + "loss": 0.4973, + "step": 6265 + }, + { + "epoch": 3.044741641337386, + "grad_norm": 0.06819376763651104, + "learning_rate": 1.3408239237149507e-06, + "loss": 0.4837, + "step": 6266 + }, + { + "epoch": 3.0452279635258357, + "grad_norm": 0.07052840370706334, + "learning_rate": 1.3395193787151455e-06, + "loss": 0.4882, + "step": 6267 + }, + { + "epoch": 3.045714285714286, + "grad_norm": 0.07029072818881148, + "learning_rate": 1.3382153704832569e-06, + "loss": 0.4989, + "step": 6268 + }, + { + "epoch": 3.0462006079027355, + "grad_norm": 0.07275798787716026, + "learning_rate": 1.3369118992105012e-06, + "loss": 0.55, + "step": 6269 + }, + { + "epoch": 3.0466869300911856, + "grad_norm": 0.07083812317306203, + "learning_rate": 1.3356089650880184e-06, + "loss": 0.5089, + "step": 6270 + }, + { + "epoch": 3.0471732522796353, + "grad_norm": 0.07249749636383808, + "learning_rate": 1.334306568306869e-06, + "loss": 0.5494, + "step": 6271 + }, + { + "epoch": 3.047659574468085, + "grad_norm": 0.07215260095398096, + "learning_rate": 1.3330047090580345e-06, + "loss": 0.488, + "step": 6272 + }, + { + "epoch": 3.048145896656535, + "grad_norm": 0.07110178094004686, + "learning_rate": 1.3317033875324182e-06, + "loss": 0.5076, + "step": 6273 + }, + { + "epoch": 3.048632218844985, + "grad_norm": 0.07175936217987068, + "learning_rate": 1.3304026039208434e-06, + "loss": 0.5021, + "step": 6274 + }, + { + "epoch": 3.0491185410334345, + "grad_norm": 0.06935499105944194, + "learning_rate": 1.3291023584140562e-06, + "loss": 0.5146, + "step": 6275 + }, + { + "epoch": 3.0496048632218846, + "grad_norm": 0.07102140988657896, + "learning_rate": 1.327802651202722e-06, + "loss": 0.5101, + "step": 6276 + }, + { + "epoch": 3.0500911854103343, + "grad_norm": 0.0709765143690515, + "learning_rate": 1.3265034824774287e-06, + "loss": 0.529, + "step": 6277 + }, + { + "epoch": 3.050577507598784, + "grad_norm": 0.0735611020926711, + "learning_rate": 1.3252048524286843e-06, + "loss": 0.5232, + "step": 6278 + }, + { + "epoch": 3.051063829787234, + "grad_norm": 0.06836202171404077, + "learning_rate": 1.3239067612469182e-06, + "loss": 0.4472, + "step": 6279 + }, + { + "epoch": 3.051550151975684, + "grad_norm": 0.07090143864412517, + "learning_rate": 1.3226092091224806e-06, + "loss": 0.4929, + "step": 6280 + }, + { + "epoch": 3.052036474164134, + "grad_norm": 0.06984041427868111, + "learning_rate": 1.3213121962456433e-06, + "loss": 0.4953, + "step": 6281 + }, + { + "epoch": 3.0525227963525836, + "grad_norm": 0.0730628285266339, + "learning_rate": 1.320015722806598e-06, + "loss": 0.5035, + "step": 6282 + }, + { + "epoch": 3.0530091185410333, + "grad_norm": 0.07417655743507183, + "learning_rate": 1.3187197889954579e-06, + "loss": 0.5184, + "step": 6283 + }, + { + "epoch": 3.0534954407294834, + "grad_norm": 0.07164698341009024, + "learning_rate": 1.3174243950022569e-06, + "loss": 0.505, + "step": 6284 + }, + { + "epoch": 3.053981762917933, + "grad_norm": 0.06742437210564374, + "learning_rate": 1.31612954101695e-06, + "loss": 0.4722, + "step": 6285 + }, + { + "epoch": 3.054468085106383, + "grad_norm": 0.07048552927687309, + "learning_rate": 1.3148352272294128e-06, + "loss": 0.4846, + "step": 6286 + }, + { + "epoch": 3.054954407294833, + "grad_norm": 0.07223294823135329, + "learning_rate": 1.3135414538294421e-06, + "loss": 0.5241, + "step": 6287 + }, + { + "epoch": 3.0554407294832826, + "grad_norm": 0.06816249635962288, + "learning_rate": 1.3122482210067545e-06, + "loss": 0.4932, + "step": 6288 + }, + { + "epoch": 3.0559270516717327, + "grad_norm": 0.07257120033790468, + "learning_rate": 1.3109555289509879e-06, + "loss": 0.5173, + "step": 6289 + }, + { + "epoch": 3.0564133738601824, + "grad_norm": 0.07446069467480337, + "learning_rate": 1.3096633778517026e-06, + "loss": 0.5135, + "step": 6290 + }, + { + "epoch": 3.056899696048632, + "grad_norm": 0.07154583301404861, + "learning_rate": 1.3083717678983737e-06, + "loss": 0.5186, + "step": 6291 + }, + { + "epoch": 3.0573860182370822, + "grad_norm": 0.07168192956656737, + "learning_rate": 1.3070806992804047e-06, + "loss": 0.5379, + "step": 6292 + }, + { + "epoch": 3.057872340425532, + "grad_norm": 0.06962513931621467, + "learning_rate": 1.3057901721871157e-06, + "loss": 0.4855, + "step": 6293 + }, + { + "epoch": 3.0583586626139816, + "grad_norm": 0.06947283885309945, + "learning_rate": 1.3045001868077478e-06, + "loss": 0.4867, + "step": 6294 + }, + { + "epoch": 3.0588449848024317, + "grad_norm": 0.07353521692109127, + "learning_rate": 1.3032107433314618e-06, + "loss": 0.499, + "step": 6295 + }, + { + "epoch": 3.0593313069908814, + "grad_norm": 0.07148062990889317, + "learning_rate": 1.3019218419473406e-06, + "loss": 0.4968, + "step": 6296 + }, + { + "epoch": 3.059817629179331, + "grad_norm": 0.07256044760721751, + "learning_rate": 1.3006334828443868e-06, + "loss": 0.5405, + "step": 6297 + }, + { + "epoch": 3.060303951367781, + "grad_norm": 0.07041057907092613, + "learning_rate": 1.2993456662115234e-06, + "loss": 0.5224, + "step": 6298 + }, + { + "epoch": 3.060790273556231, + "grad_norm": 0.07219259347943267, + "learning_rate": 1.298058392237595e-06, + "loss": 0.529, + "step": 6299 + }, + { + "epoch": 3.061276595744681, + "grad_norm": 0.07055471295668753, + "learning_rate": 1.2967716611113645e-06, + "loss": 0.4971, + "step": 6300 + }, + { + "epoch": 3.0617629179331307, + "grad_norm": 0.07227026389293165, + "learning_rate": 1.2954854730215172e-06, + "loss": 0.5034, + "step": 6301 + }, + { + "epoch": 3.0622492401215804, + "grad_norm": 0.07076437334651081, + "learning_rate": 1.2941998281566575e-06, + "loss": 0.5088, + "step": 6302 + }, + { + "epoch": 3.0627355623100305, + "grad_norm": 0.07099447501307093, + "learning_rate": 1.292914726705311e-06, + "loss": 0.5127, + "step": 6303 + }, + { + "epoch": 3.06322188449848, + "grad_norm": 0.07276744147582066, + "learning_rate": 1.291630168855924e-06, + "loss": 0.5458, + "step": 6304 + }, + { + "epoch": 3.06370820668693, + "grad_norm": 0.06956717760551456, + "learning_rate": 1.290346154796861e-06, + "loss": 0.4726, + "step": 6305 + }, + { + "epoch": 3.06419452887538, + "grad_norm": 0.07276407136808401, + "learning_rate": 1.2890626847164078e-06, + "loss": 0.5148, + "step": 6306 + }, + { + "epoch": 3.0646808510638297, + "grad_norm": 0.07206947415467035, + "learning_rate": 1.2877797588027713e-06, + "loss": 0.5399, + "step": 6307 + }, + { + "epoch": 3.06516717325228, + "grad_norm": 0.07449496778010822, + "learning_rate": 1.2864973772440787e-06, + "loss": 0.5116, + "step": 6308 + }, + { + "epoch": 3.0656534954407295, + "grad_norm": 0.06991173475730521, + "learning_rate": 1.2852155402283756e-06, + "loss": 0.4741, + "step": 6309 + }, + { + "epoch": 3.066139817629179, + "grad_norm": 0.0691589602206517, + "learning_rate": 1.2839342479436279e-06, + "loss": 0.5114, + "step": 6310 + }, + { + "epoch": 3.0666261398176293, + "grad_norm": 0.07486175626612472, + "learning_rate": 1.2826535005777257e-06, + "loss": 0.5432, + "step": 6311 + }, + { + "epoch": 3.067112462006079, + "grad_norm": 0.06931029321733267, + "learning_rate": 1.2813732983184745e-06, + "loss": 0.4911, + "step": 6312 + }, + { + "epoch": 3.0675987841945287, + "grad_norm": 0.07156429182458042, + "learning_rate": 1.2800936413536008e-06, + "loss": 0.5327, + "step": 6313 + }, + { + "epoch": 3.068085106382979, + "grad_norm": 0.07182725231193507, + "learning_rate": 1.2788145298707526e-06, + "loss": 0.5379, + "step": 6314 + }, + { + "epoch": 3.0685714285714285, + "grad_norm": 0.07123078441726857, + "learning_rate": 1.2775359640574969e-06, + "loss": 0.5117, + "step": 6315 + }, + { + "epoch": 3.0690577507598786, + "grad_norm": 0.07521351926623729, + "learning_rate": 1.2762579441013207e-06, + "loss": 0.5241, + "step": 6316 + }, + { + "epoch": 3.0695440729483283, + "grad_norm": 0.07345094914579875, + "learning_rate": 1.2749804701896307e-06, + "loss": 0.5321, + "step": 6317 + }, + { + "epoch": 3.070030395136778, + "grad_norm": 0.07080752055646776, + "learning_rate": 1.2737035425097543e-06, + "loss": 0.5428, + "step": 6318 + }, + { + "epoch": 3.070516717325228, + "grad_norm": 0.07054922317713008, + "learning_rate": 1.2724271612489403e-06, + "loss": 0.4962, + "step": 6319 + }, + { + "epoch": 3.071003039513678, + "grad_norm": 0.0696894246487351, + "learning_rate": 1.271151326594352e-06, + "loss": 0.5079, + "step": 6320 + }, + { + "epoch": 3.0714893617021275, + "grad_norm": 0.06858804681381224, + "learning_rate": 1.2698760387330782e-06, + "loss": 0.5081, + "step": 6321 + }, + { + "epoch": 3.0719756838905776, + "grad_norm": 0.07292821371054758, + "learning_rate": 1.2686012978521244e-06, + "loss": 0.5062, + "step": 6322 + }, + { + "epoch": 3.0724620060790273, + "grad_norm": 0.06872869023772299, + "learning_rate": 1.2673271041384177e-06, + "loss": 0.487, + "step": 6323 + }, + { + "epoch": 3.072948328267477, + "grad_norm": 0.07140688053844178, + "learning_rate": 1.266053457778804e-06, + "loss": 0.5078, + "step": 6324 + }, + { + "epoch": 3.073434650455927, + "grad_norm": 0.07061360252060708, + "learning_rate": 1.2647803589600488e-06, + "loss": 0.5158, + "step": 6325 + }, + { + "epoch": 3.073920972644377, + "grad_norm": 0.07179162073953198, + "learning_rate": 1.2635078078688378e-06, + "loss": 0.5075, + "step": 6326 + }, + { + "epoch": 3.074407294832827, + "grad_norm": 0.07045132338503757, + "learning_rate": 1.262235804691776e-06, + "loss": 0.5068, + "step": 6327 + }, + { + "epoch": 3.0748936170212766, + "grad_norm": 0.07118685947785651, + "learning_rate": 1.2609643496153866e-06, + "loss": 0.5073, + "step": 6328 + }, + { + "epoch": 3.0753799392097263, + "grad_norm": 0.07184116769388826, + "learning_rate": 1.2596934428261181e-06, + "loss": 0.5118, + "step": 6329 + }, + { + "epoch": 3.0758662613981764, + "grad_norm": 0.07113752423084277, + "learning_rate": 1.2584230845103312e-06, + "loss": 0.5219, + "step": 6330 + }, + { + "epoch": 3.076352583586626, + "grad_norm": 0.07097254703792906, + "learning_rate": 1.2571532748543114e-06, + "loss": 0.5001, + "step": 6331 + }, + { + "epoch": 3.076838905775076, + "grad_norm": 0.06938086836569492, + "learning_rate": 1.2558840140442602e-06, + "loss": 0.5206, + "step": 6332 + }, + { + "epoch": 3.077325227963526, + "grad_norm": 0.0717412607583114, + "learning_rate": 1.2546153022663015e-06, + "loss": 0.5232, + "step": 6333 + }, + { + "epoch": 3.0778115501519756, + "grad_norm": 0.07134253539864335, + "learning_rate": 1.2533471397064783e-06, + "loss": 0.5096, + "step": 6334 + }, + { + "epoch": 3.0782978723404257, + "grad_norm": 0.06845991587156157, + "learning_rate": 1.2520795265507502e-06, + "loss": 0.4801, + "step": 6335 + }, + { + "epoch": 3.0787841945288754, + "grad_norm": 0.07210549772526677, + "learning_rate": 1.2508124629849981e-06, + "loss": 0.5239, + "step": 6336 + }, + { + "epoch": 3.079270516717325, + "grad_norm": 0.06973865953789864, + "learning_rate": 1.249545949195024e-06, + "loss": 0.4897, + "step": 6337 + }, + { + "epoch": 3.0797568389057752, + "grad_norm": 0.0722478556980012, + "learning_rate": 1.2482799853665473e-06, + "loss": 0.5239, + "step": 6338 + }, + { + "epoch": 3.080243161094225, + "grad_norm": 0.07359308056235384, + "learning_rate": 1.2470145716852072e-06, + "loss": 0.5164, + "step": 6339 + }, + { + "epoch": 3.0807294832826746, + "grad_norm": 0.07131621061382196, + "learning_rate": 1.245749708336562e-06, + "loss": 0.4762, + "step": 6340 + }, + { + "epoch": 3.0812158054711247, + "grad_norm": 0.07263226576185644, + "learning_rate": 1.2444853955060899e-06, + "loss": 0.5286, + "step": 6341 + }, + { + "epoch": 3.0817021276595744, + "grad_norm": 0.07299253584561631, + "learning_rate": 1.2432216333791875e-06, + "loss": 0.5177, + "step": 6342 + }, + { + "epoch": 3.082188449848024, + "grad_norm": 0.07091960840101029, + "learning_rate": 1.2419584221411719e-06, + "loss": 0.4905, + "step": 6343 + }, + { + "epoch": 3.082674772036474, + "grad_norm": 0.07007660780025235, + "learning_rate": 1.240695761977278e-06, + "loss": 0.511, + "step": 6344 + }, + { + "epoch": 3.083161094224924, + "grad_norm": 0.07013552941360761, + "learning_rate": 1.2394336530726608e-06, + "loss": 0.4929, + "step": 6345 + }, + { + "epoch": 3.083647416413374, + "grad_norm": 0.06951857278051657, + "learning_rate": 1.2381720956123933e-06, + "loss": 0.5273, + "step": 6346 + }, + { + "epoch": 3.0841337386018237, + "grad_norm": 0.07373984506989155, + "learning_rate": 1.2369110897814708e-06, + "loss": 0.5187, + "step": 6347 + }, + { + "epoch": 3.0846200607902734, + "grad_norm": 0.06876828815974513, + "learning_rate": 1.2356506357648058e-06, + "loss": 0.474, + "step": 6348 + }, + { + "epoch": 3.0851063829787235, + "grad_norm": 0.07113609268148895, + "learning_rate": 1.2343907337472261e-06, + "loss": 0.5393, + "step": 6349 + }, + { + "epoch": 3.085592705167173, + "grad_norm": 0.07008265151912244, + "learning_rate": 1.2331313839134845e-06, + "loss": 0.4674, + "step": 6350 + }, + { + "epoch": 3.086079027355623, + "grad_norm": 0.07362148196322874, + "learning_rate": 1.23187258644825e-06, + "loss": 0.5221, + "step": 6351 + }, + { + "epoch": 3.086565349544073, + "grad_norm": 0.07217852200413702, + "learning_rate": 1.2306143415361104e-06, + "loss": 0.5286, + "step": 6352 + }, + { + "epoch": 3.0870516717325227, + "grad_norm": 0.07084309184955308, + "learning_rate": 1.2293566493615734e-06, + "loss": 0.51, + "step": 6353 + }, + { + "epoch": 3.087537993920973, + "grad_norm": 0.07106327145762124, + "learning_rate": 1.2280995101090653e-06, + "loss": 0.5211, + "step": 6354 + }, + { + "epoch": 3.0880243161094225, + "grad_norm": 0.07100153619531661, + "learning_rate": 1.2268429239629314e-06, + "loss": 0.4956, + "step": 6355 + }, + { + "epoch": 3.088510638297872, + "grad_norm": 0.06737100715668831, + "learning_rate": 1.225586891107436e-06, + "loss": 0.459, + "step": 6356 + }, + { + "epoch": 3.0889969604863223, + "grad_norm": 0.06917788612588545, + "learning_rate": 1.2243314117267608e-06, + "loss": 0.483, + "step": 6357 + }, + { + "epoch": 3.089483282674772, + "grad_norm": 0.07090212462023035, + "learning_rate": 1.2230764860050094e-06, + "loss": 0.5065, + "step": 6358 + }, + { + "epoch": 3.0899696048632217, + "grad_norm": 0.07255885515473502, + "learning_rate": 1.221822114126201e-06, + "loss": 0.5186, + "step": 6359 + }, + { + "epoch": 3.090455927051672, + "grad_norm": 0.0716228877422812, + "learning_rate": 1.2205682962742754e-06, + "loss": 0.5048, + "step": 6360 + }, + { + "epoch": 3.0909422492401215, + "grad_norm": 0.06983538010363304, + "learning_rate": 1.2193150326330915e-06, + "loss": 0.4788, + "step": 6361 + }, + { + "epoch": 3.0914285714285716, + "grad_norm": 0.06962692046284083, + "learning_rate": 1.2180623233864254e-06, + "loss": 0.5057, + "step": 6362 + }, + { + "epoch": 3.0919148936170213, + "grad_norm": 0.06861106518135233, + "learning_rate": 1.2168101687179722e-06, + "loss": 0.4685, + "step": 6363 + }, + { + "epoch": 3.092401215805471, + "grad_norm": 0.07138466663715992, + "learning_rate": 1.2155585688113476e-06, + "loss": 0.4958, + "step": 6364 + }, + { + "epoch": 3.092887537993921, + "grad_norm": 0.06940887925919648, + "learning_rate": 1.214307523850083e-06, + "loss": 0.5019, + "step": 6365 + }, + { + "epoch": 3.093373860182371, + "grad_norm": 0.07142201589147261, + "learning_rate": 1.2130570340176306e-06, + "loss": 0.5035, + "step": 6366 + }, + { + "epoch": 3.0938601823708205, + "grad_norm": 0.07149166942094624, + "learning_rate": 1.2118070994973612e-06, + "loss": 0.5472, + "step": 6367 + }, + { + "epoch": 3.0943465045592706, + "grad_norm": 0.07370689619712235, + "learning_rate": 1.2105577204725627e-06, + "loss": 0.5077, + "step": 6368 + }, + { + "epoch": 3.0948328267477203, + "grad_norm": 0.06954060524989816, + "learning_rate": 1.209308897126442e-06, + "loss": 0.5229, + "step": 6369 + }, + { + "epoch": 3.09531914893617, + "grad_norm": 0.07152600209309178, + "learning_rate": 1.208060629642126e-06, + "loss": 0.4883, + "step": 6370 + }, + { + "epoch": 3.09580547112462, + "grad_norm": 0.07024148787034443, + "learning_rate": 1.2068129182026582e-06, + "loss": 0.5164, + "step": 6371 + }, + { + "epoch": 3.09629179331307, + "grad_norm": 0.06939886621298567, + "learning_rate": 1.205565762991001e-06, + "loss": 0.4999, + "step": 6372 + }, + { + "epoch": 3.09677811550152, + "grad_norm": 0.07192795308272598, + "learning_rate": 1.204319164190037e-06, + "loss": 0.5114, + "step": 6373 + }, + { + "epoch": 3.0972644376899696, + "grad_norm": 0.07122076974685827, + "learning_rate": 1.2030731219825637e-06, + "loss": 0.4736, + "step": 6374 + }, + { + "epoch": 3.0977507598784193, + "grad_norm": 0.06931748807231478, + "learning_rate": 1.2018276365513009e-06, + "loss": 0.4853, + "step": 6375 + }, + { + "epoch": 3.0982370820668694, + "grad_norm": 0.07190532623733153, + "learning_rate": 1.2005827080788835e-06, + "loss": 0.5382, + "step": 6376 + }, + { + "epoch": 3.098723404255319, + "grad_norm": 0.07557272310804244, + "learning_rate": 1.1993383367478672e-06, + "loss": 0.5493, + "step": 6377 + }, + { + "epoch": 3.099209726443769, + "grad_norm": 0.07028894199463619, + "learning_rate": 1.1980945227407242e-06, + "loss": 0.5331, + "step": 6378 + }, + { + "epoch": 3.099696048632219, + "grad_norm": 0.06953908103001373, + "learning_rate": 1.1968512662398458e-06, + "loss": 0.5069, + "step": 6379 + }, + { + "epoch": 3.1001823708206686, + "grad_norm": 0.07070552459581703, + "learning_rate": 1.1956085674275419e-06, + "loss": 0.5164, + "step": 6380 + }, + { + "epoch": 3.1006686930091187, + "grad_norm": 0.07278151402145511, + "learning_rate": 1.1943664264860395e-06, + "loss": 0.5325, + "step": 6381 + }, + { + "epoch": 3.1011550151975684, + "grad_norm": 0.07048772064653154, + "learning_rate": 1.193124843597485e-06, + "loss": 0.4709, + "step": 6382 + }, + { + "epoch": 3.101641337386018, + "grad_norm": 0.07323368836055087, + "learning_rate": 1.1918838189439426e-06, + "loss": 0.5399, + "step": 6383 + }, + { + "epoch": 3.1021276595744682, + "grad_norm": 0.06980025203650327, + "learning_rate": 1.1906433527073934e-06, + "loss": 0.5133, + "step": 6384 + }, + { + "epoch": 3.102613981762918, + "grad_norm": 0.07171990081161943, + "learning_rate": 1.1894034450697389e-06, + "loss": 0.5008, + "step": 6385 + }, + { + "epoch": 3.1031003039513676, + "grad_norm": 0.0773046940284695, + "learning_rate": 1.1881640962127972e-06, + "loss": 0.543, + "step": 6386 + }, + { + "epoch": 3.1035866261398177, + "grad_norm": 0.07622947195450273, + "learning_rate": 1.1869253063183039e-06, + "loss": 0.5906, + "step": 6387 + }, + { + "epoch": 3.1040729483282674, + "grad_norm": 0.07202062857229782, + "learning_rate": 1.1856870755679146e-06, + "loss": 0.5243, + "step": 6388 + }, + { + "epoch": 3.1045592705167175, + "grad_norm": 0.07124144193174368, + "learning_rate": 1.1844494041432008e-06, + "loss": 0.5145, + "step": 6389 + }, + { + "epoch": 3.1050455927051672, + "grad_norm": 0.07050302024638118, + "learning_rate": 1.1832122922256539e-06, + "loss": 0.5067, + "step": 6390 + }, + { + "epoch": 3.105531914893617, + "grad_norm": 0.07209536677435852, + "learning_rate": 1.181975739996682e-06, + "loss": 0.5537, + "step": 6391 + }, + { + "epoch": 3.106018237082067, + "grad_norm": 0.06929175378286714, + "learning_rate": 1.1807397476376109e-06, + "loss": 0.4975, + "step": 6392 + }, + { + "epoch": 3.1065045592705167, + "grad_norm": 0.0696040713117208, + "learning_rate": 1.1795043153296849e-06, + "loss": 0.4853, + "step": 6393 + }, + { + "epoch": 3.1069908814589664, + "grad_norm": 0.07267699530695315, + "learning_rate": 1.178269443254067e-06, + "loss": 0.5128, + "step": 6394 + }, + { + "epoch": 3.1074772036474165, + "grad_norm": 0.06980701990610522, + "learning_rate": 1.1770351315918365e-06, + "loss": 0.4856, + "step": 6395 + }, + { + "epoch": 3.107963525835866, + "grad_norm": 0.07085465994523285, + "learning_rate": 1.1758013805239925e-06, + "loss": 0.4977, + "step": 6396 + }, + { + "epoch": 3.108449848024316, + "grad_norm": 0.07118085670496584, + "learning_rate": 1.1745681902314481e-06, + "loss": 0.4935, + "step": 6397 + }, + { + "epoch": 3.108936170212766, + "grad_norm": 0.07054901413909052, + "learning_rate": 1.173335560895038e-06, + "loss": 0.4983, + "step": 6398 + }, + { + "epoch": 3.1094224924012157, + "grad_norm": 0.07073906424636892, + "learning_rate": 1.172103492695514e-06, + "loss": 0.5189, + "step": 6399 + }, + { + "epoch": 3.109908814589666, + "grad_norm": 0.07306412765885605, + "learning_rate": 1.1708719858135415e-06, + "loss": 0.5508, + "step": 6400 + }, + { + "epoch": 3.1103951367781155, + "grad_norm": 0.07092420869708216, + "learning_rate": 1.1696410404297115e-06, + "loss": 0.484, + "step": 6401 + }, + { + "epoch": 3.110881458966565, + "grad_norm": 0.07087973722298993, + "learning_rate": 1.1684106567245268e-06, + "loss": 0.5114, + "step": 6402 + }, + { + "epoch": 3.1113677811550153, + "grad_norm": 0.0694453810190983, + "learning_rate": 1.167180834878408e-06, + "loss": 0.4701, + "step": 6403 + }, + { + "epoch": 3.111854103343465, + "grad_norm": 0.07005915801860707, + "learning_rate": 1.1659515750716953e-06, + "loss": 0.5013, + "step": 6404 + }, + { + "epoch": 3.1123404255319147, + "grad_norm": 0.0743821835995421, + "learning_rate": 1.164722877484646e-06, + "loss": 0.5161, + "step": 6405 + }, + { + "epoch": 3.112826747720365, + "grad_norm": 0.07115091590642147, + "learning_rate": 1.163494742297434e-06, + "loss": 0.512, + "step": 6406 + }, + { + "epoch": 3.1133130699088145, + "grad_norm": 0.07017697233296112, + "learning_rate": 1.1622671696901515e-06, + "loss": 0.5243, + "step": 6407 + }, + { + "epoch": 3.1137993920972646, + "grad_norm": 0.06874474726754566, + "learning_rate": 1.1610401598428089e-06, + "loss": 0.4991, + "step": 6408 + }, + { + "epoch": 3.1142857142857143, + "grad_norm": 0.0698505587286747, + "learning_rate": 1.159813712935332e-06, + "loss": 0.5015, + "step": 6409 + }, + { + "epoch": 3.114772036474164, + "grad_norm": 0.07130561064962943, + "learning_rate": 1.158587829147566e-06, + "loss": 0.5212, + "step": 6410 + }, + { + "epoch": 3.115258358662614, + "grad_norm": 0.0696803114674483, + "learning_rate": 1.1573625086592744e-06, + "loss": 0.4951, + "step": 6411 + }, + { + "epoch": 3.115744680851064, + "grad_norm": 0.06952126592570107, + "learning_rate": 1.1561377516501332e-06, + "loss": 0.4865, + "step": 6412 + }, + { + "epoch": 3.1162310030395135, + "grad_norm": 0.07310200272286681, + "learning_rate": 1.1549135582997406e-06, + "loss": 0.5278, + "step": 6413 + }, + { + "epoch": 3.1167173252279636, + "grad_norm": 0.06995403451589693, + "learning_rate": 1.1536899287876108e-06, + "loss": 0.5073, + "step": 6414 + }, + { + "epoch": 3.1172036474164133, + "grad_norm": 0.06980246919365295, + "learning_rate": 1.1524668632931756e-06, + "loss": 0.4972, + "step": 6415 + }, + { + "epoch": 3.1176899696048634, + "grad_norm": 0.07089394701964344, + "learning_rate": 1.1512443619957831e-06, + "loss": 0.5101, + "step": 6416 + }, + { + "epoch": 3.118176291793313, + "grad_norm": 0.06896366804844528, + "learning_rate": 1.1500224250746993e-06, + "loss": 0.4978, + "step": 6417 + }, + { + "epoch": 3.118662613981763, + "grad_norm": 0.07044518015495504, + "learning_rate": 1.1488010527091075e-06, + "loss": 0.5143, + "step": 6418 + }, + { + "epoch": 3.119148936170213, + "grad_norm": 0.0694168420707088, + "learning_rate": 1.1475802450781064e-06, + "loss": 0.4627, + "step": 6419 + }, + { + "epoch": 3.1196352583586626, + "grad_norm": 0.07108325359298967, + "learning_rate": 1.1463600023607174e-06, + "loss": 0.5264, + "step": 6420 + }, + { + "epoch": 3.1201215805471123, + "grad_norm": 0.0741012993303335, + "learning_rate": 1.1451403247358728e-06, + "loss": 0.5305, + "step": 6421 + }, + { + "epoch": 3.1206079027355624, + "grad_norm": 0.0730793398426267, + "learning_rate": 1.1439212123824244e-06, + "loss": 0.5305, + "step": 6422 + }, + { + "epoch": 3.121094224924012, + "grad_norm": 0.07158082043287103, + "learning_rate": 1.1427026654791417e-06, + "loss": 0.5116, + "step": 6423 + }, + { + "epoch": 3.121580547112462, + "grad_norm": 0.07215578268810226, + "learning_rate": 1.1414846842047106e-06, + "loss": 0.5093, + "step": 6424 + }, + { + "epoch": 3.122066869300912, + "grad_norm": 0.07130340174874215, + "learning_rate": 1.1402672687377341e-06, + "loss": 0.5163, + "step": 6425 + }, + { + "epoch": 3.1225531914893616, + "grad_norm": 0.07147530505630274, + "learning_rate": 1.1390504192567336e-06, + "loss": 0.5125, + "step": 6426 + }, + { + "epoch": 3.1230395136778117, + "grad_norm": 0.07205014957433295, + "learning_rate": 1.1378341359401445e-06, + "loss": 0.535, + "step": 6427 + }, + { + "epoch": 3.1235258358662614, + "grad_norm": 0.07057800732615672, + "learning_rate": 1.136618418966321e-06, + "loss": 0.4911, + "step": 6428 + }, + { + "epoch": 3.124012158054711, + "grad_norm": 0.07182188469510198, + "learning_rate": 1.1354032685135346e-06, + "loss": 0.5079, + "step": 6429 + }, + { + "epoch": 3.1244984802431612, + "grad_norm": 0.07093446407995187, + "learning_rate": 1.1341886847599742e-06, + "loss": 0.5225, + "step": 6430 + }, + { + "epoch": 3.124984802431611, + "grad_norm": 0.06971455401202456, + "learning_rate": 1.1329746678837433e-06, + "loss": 0.4892, + "step": 6431 + }, + { + "epoch": 3.1254711246200606, + "grad_norm": 0.0724929505064268, + "learning_rate": 1.1317612180628645e-06, + "loss": 0.5121, + "step": 6432 + }, + { + "epoch": 3.1259574468085107, + "grad_norm": 0.07267449665125537, + "learning_rate": 1.1305483354752767e-06, + "loss": 0.5109, + "step": 6433 + }, + { + "epoch": 3.1264437689969604, + "grad_norm": 0.07050355890136843, + "learning_rate": 1.1293360202988346e-06, + "loss": 0.5272, + "step": 6434 + }, + { + "epoch": 3.12693009118541, + "grad_norm": 0.07160387234521386, + "learning_rate": 1.1281242727113112e-06, + "loss": 0.4799, + "step": 6435 + }, + { + "epoch": 3.1274164133738602, + "grad_norm": 0.07253525937791627, + "learning_rate": 1.126913092890395e-06, + "loss": 0.5211, + "step": 6436 + }, + { + "epoch": 3.12790273556231, + "grad_norm": 0.07453943214394856, + "learning_rate": 1.1257024810136903e-06, + "loss": 0.5386, + "step": 6437 + }, + { + "epoch": 3.12838905775076, + "grad_norm": 0.07111230448726456, + "learning_rate": 1.1244924372587224e-06, + "loss": 0.5057, + "step": 6438 + }, + { + "epoch": 3.1288753799392097, + "grad_norm": 0.06883369956196185, + "learning_rate": 1.1232829618029295e-06, + "loss": 0.4554, + "step": 6439 + }, + { + "epoch": 3.1293617021276594, + "grad_norm": 0.06990761360628253, + "learning_rate": 1.1220740548236685e-06, + "loss": 0.501, + "step": 6440 + }, + { + "epoch": 3.1298480243161095, + "grad_norm": 0.07248503063531504, + "learning_rate": 1.1208657164982096e-06, + "loss": 0.4931, + "step": 6441 + }, + { + "epoch": 3.130334346504559, + "grad_norm": 0.07135472037332573, + "learning_rate": 1.1196579470037427e-06, + "loss": 0.5106, + "step": 6442 + }, + { + "epoch": 3.1308206686930093, + "grad_norm": 0.07307392616254636, + "learning_rate": 1.1184507465173732e-06, + "loss": 0.5238, + "step": 6443 + }, + { + "epoch": 3.131306990881459, + "grad_norm": 0.06803548474807124, + "learning_rate": 1.1172441152161246e-06, + "loss": 0.4867, + "step": 6444 + }, + { + "epoch": 3.1317933130699087, + "grad_norm": 0.07050409202969812, + "learning_rate": 1.1160380532769343e-06, + "loss": 0.5006, + "step": 6445 + }, + { + "epoch": 3.132279635258359, + "grad_norm": 0.0701212453951125, + "learning_rate": 1.1148325608766586e-06, + "loss": 0.4865, + "step": 6446 + }, + { + "epoch": 3.1327659574468085, + "grad_norm": 0.0728671539359153, + "learning_rate": 1.1136276381920684e-06, + "loss": 0.5396, + "step": 6447 + }, + { + "epoch": 3.133252279635258, + "grad_norm": 0.07154492601195753, + "learning_rate": 1.112423285399853e-06, + "loss": 0.516, + "step": 6448 + }, + { + "epoch": 3.1337386018237083, + "grad_norm": 0.07028965389300273, + "learning_rate": 1.111219502676616e-06, + "loss": 0.4756, + "step": 6449 + }, + { + "epoch": 3.134224924012158, + "grad_norm": 0.07010990940901243, + "learning_rate": 1.1100162901988786e-06, + "loss": 0.5071, + "step": 6450 + }, + { + "epoch": 3.1347112462006077, + "grad_norm": 0.07263811393972959, + "learning_rate": 1.108813648143079e-06, + "loss": 0.5358, + "step": 6451 + }, + { + "epoch": 3.135197568389058, + "grad_norm": 0.0697698055740256, + "learning_rate": 1.1076115766855705e-06, + "loss": 0.4968, + "step": 6452 + }, + { + "epoch": 3.1356838905775075, + "grad_norm": 0.07328748919823568, + "learning_rate": 1.106410076002623e-06, + "loss": 0.5, + "step": 6453 + }, + { + "epoch": 3.1361702127659576, + "grad_norm": 0.07295765849631576, + "learning_rate": 1.1052091462704235e-06, + "loss": 0.5289, + "step": 6454 + }, + { + "epoch": 3.1366565349544073, + "grad_norm": 0.0721159537936726, + "learning_rate": 1.1040087876650745e-06, + "loss": 0.5185, + "step": 6455 + }, + { + "epoch": 3.137142857142857, + "grad_norm": 0.07252959834112245, + "learning_rate": 1.1028090003625946e-06, + "loss": 0.5123, + "step": 6456 + }, + { + "epoch": 3.137629179331307, + "grad_norm": 0.07400730843125519, + "learning_rate": 1.1016097845389195e-06, + "loss": 0.5023, + "step": 6457 + }, + { + "epoch": 3.138115501519757, + "grad_norm": 0.07037077636612832, + "learning_rate": 1.1004111403699002e-06, + "loss": 0.515, + "step": 6458 + }, + { + "epoch": 3.1386018237082065, + "grad_norm": 0.06949047008004194, + "learning_rate": 1.0992130680313046e-06, + "loss": 0.5014, + "step": 6459 + }, + { + "epoch": 3.1390881458966566, + "grad_norm": 0.0723124746550498, + "learning_rate": 1.0980155676988159e-06, + "loss": 0.4957, + "step": 6460 + }, + { + "epoch": 3.1395744680851063, + "grad_norm": 0.0733040070150769, + "learning_rate": 1.0968186395480345e-06, + "loss": 0.5137, + "step": 6461 + }, + { + "epoch": 3.140060790273556, + "grad_norm": 0.0693618606067608, + "learning_rate": 1.0956222837544762e-06, + "loss": 0.5084, + "step": 6462 + }, + { + "epoch": 3.140547112462006, + "grad_norm": 0.07147189848760056, + "learning_rate": 1.0944265004935723e-06, + "loss": 0.5543, + "step": 6463 + }, + { + "epoch": 3.141033434650456, + "grad_norm": 0.07092690296158626, + "learning_rate": 1.0932312899406717e-06, + "loss": 0.5092, + "step": 6464 + }, + { + "epoch": 3.141519756838906, + "grad_norm": 0.06904957285530479, + "learning_rate": 1.092036652271038e-06, + "loss": 0.4779, + "step": 6465 + }, + { + "epoch": 3.1420060790273556, + "grad_norm": 0.07027316271985638, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.5164, + "step": 6466 + }, + { + "epoch": 3.1424924012158053, + "grad_norm": 0.07462036631002521, + "learning_rate": 1.0896490962822082e-06, + "loss": 0.5364, + "step": 6467 + }, + { + "epoch": 3.1429787234042554, + "grad_norm": 0.07178073401915917, + "learning_rate": 1.0884561783131192e-06, + "loss": 0.4985, + "step": 6468 + }, + { + "epoch": 3.143465045592705, + "grad_norm": 0.07040681264010286, + "learning_rate": 1.0872638339275137e-06, + "loss": 0.5043, + "step": 6469 + }, + { + "epoch": 3.1439513677811552, + "grad_norm": 0.06799596736398962, + "learning_rate": 1.0860720633002353e-06, + "loss": 0.455, + "step": 6470 + }, + { + "epoch": 3.144437689969605, + "grad_norm": 0.07266262274751878, + "learning_rate": 1.0848808666060428e-06, + "loss": 0.4814, + "step": 6471 + }, + { + "epoch": 3.1449240121580546, + "grad_norm": 0.0710889334460567, + "learning_rate": 1.0836902440196123e-06, + "loss": 0.4888, + "step": 6472 + }, + { + "epoch": 3.1454103343465047, + "grad_norm": 0.06953159349693616, + "learning_rate": 1.0825001957155344e-06, + "loss": 0.5311, + "step": 6473 + }, + { + "epoch": 3.1458966565349544, + "grad_norm": 0.07087006109263393, + "learning_rate": 1.0813107218683171e-06, + "loss": 0.5157, + "step": 6474 + }, + { + "epoch": 3.146382978723404, + "grad_norm": 0.0682967817638768, + "learning_rate": 1.0801218226523825e-06, + "loss": 0.4834, + "step": 6475 + }, + { + "epoch": 3.1468693009118542, + "grad_norm": 0.07416131855160117, + "learning_rate": 1.0789334982420697e-06, + "loss": 0.5521, + "step": 6476 + }, + { + "epoch": 3.147355623100304, + "grad_norm": 0.07184293517622208, + "learning_rate": 1.0777457488116323e-06, + "loss": 0.513, + "step": 6477 + }, + { + "epoch": 3.1478419452887536, + "grad_norm": 0.07113617237868496, + "learning_rate": 1.0765585745352408e-06, + "loss": 0.4873, + "step": 6478 + }, + { + "epoch": 3.1483282674772037, + "grad_norm": 0.06857685330472764, + "learning_rate": 1.0753719755869813e-06, + "loss": 0.487, + "step": 6479 + }, + { + "epoch": 3.1488145896656534, + "grad_norm": 0.07131933428519391, + "learning_rate": 1.0741859521408538e-06, + "loss": 0.5288, + "step": 6480 + }, + { + "epoch": 3.1493009118541035, + "grad_norm": 0.0692138770368766, + "learning_rate": 1.0730005043707765e-06, + "loss": 0.504, + "step": 6481 + }, + { + "epoch": 3.1497872340425532, + "grad_norm": 0.07141054056291997, + "learning_rate": 1.0718156324505802e-06, + "loss": 0.4879, + "step": 6482 + }, + { + "epoch": 3.150273556231003, + "grad_norm": 0.07220328983530064, + "learning_rate": 1.070631336554015e-06, + "loss": 0.513, + "step": 6483 + }, + { + "epoch": 3.150759878419453, + "grad_norm": 0.07037762286764798, + "learning_rate": 1.0694476168547424e-06, + "loss": 0.4942, + "step": 6484 + }, + { + "epoch": 3.1512462006079027, + "grad_norm": 0.07149601343198705, + "learning_rate": 1.068264473526343e-06, + "loss": 0.5213, + "step": 6485 + }, + { + "epoch": 3.1517325227963524, + "grad_norm": 0.0689444757096791, + "learning_rate": 1.0670819067423106e-06, + "loss": 0.4762, + "step": 6486 + }, + { + "epoch": 3.1522188449848025, + "grad_norm": 0.07197825902523278, + "learning_rate": 1.0658999166760553e-06, + "loss": 0.5037, + "step": 6487 + }, + { + "epoch": 3.152705167173252, + "grad_norm": 0.07145908179824197, + "learning_rate": 1.064718503500904e-06, + "loss": 0.5184, + "step": 6488 + }, + { + "epoch": 3.153191489361702, + "grad_norm": 0.07296960258045442, + "learning_rate": 1.063537667390095e-06, + "loss": 0.5449, + "step": 6489 + }, + { + "epoch": 3.153677811550152, + "grad_norm": 0.07053824556853927, + "learning_rate": 1.0623574085167848e-06, + "loss": 0.5033, + "step": 6490 + }, + { + "epoch": 3.1541641337386017, + "grad_norm": 0.06925212017012625, + "learning_rate": 1.0611777270540452e-06, + "loss": 0.475, + "step": 6491 + }, + { + "epoch": 3.154650455927052, + "grad_norm": 0.06900113915807202, + "learning_rate": 1.0599986231748644e-06, + "loss": 0.524, + "step": 6492 + }, + { + "epoch": 3.1551367781155015, + "grad_norm": 0.07098961588124957, + "learning_rate": 1.0588200970521439e-06, + "loss": 0.5085, + "step": 6493 + }, + { + "epoch": 3.155623100303951, + "grad_norm": 0.07255999292924366, + "learning_rate": 1.0576421488587013e-06, + "loss": 0.5089, + "step": 6494 + }, + { + "epoch": 3.1561094224924013, + "grad_norm": 0.07195155002584919, + "learning_rate": 1.0564647787672694e-06, + "loss": 0.5297, + "step": 6495 + }, + { + "epoch": 3.156595744680851, + "grad_norm": 0.07095972315666883, + "learning_rate": 1.0552879869504956e-06, + "loss": 0.5321, + "step": 6496 + }, + { + "epoch": 3.1570820668693007, + "grad_norm": 0.07080137678492887, + "learning_rate": 1.054111773580943e-06, + "loss": 0.5102, + "step": 6497 + }, + { + "epoch": 3.157568389057751, + "grad_norm": 0.07347241061884391, + "learning_rate": 1.052936138831091e-06, + "loss": 0.538, + "step": 6498 + }, + { + "epoch": 3.1580547112462005, + "grad_norm": 0.0698909188455296, + "learning_rate": 1.0517610828733322e-06, + "loss": 0.4934, + "step": 6499 + }, + { + "epoch": 3.1585410334346506, + "grad_norm": 0.06991964254084676, + "learning_rate": 1.0505866058799746e-06, + "loss": 0.461, + "step": 6500 + }, + { + "epoch": 3.1590273556231003, + "grad_norm": 0.06971650722292501, + "learning_rate": 1.0494127080232436e-06, + "loss": 0.4767, + "step": 6501 + }, + { + "epoch": 3.15951367781155, + "grad_norm": 0.07066489578198713, + "learning_rate": 1.0482393894752764e-06, + "loss": 0.5191, + "step": 6502 + }, + { + "epoch": 3.16, + "grad_norm": 0.07247830060940354, + "learning_rate": 1.0470666504081295e-06, + "loss": 0.5605, + "step": 6503 + }, + { + "epoch": 3.16048632218845, + "grad_norm": 0.0721443024330125, + "learning_rate": 1.045894490993768e-06, + "loss": 0.4867, + "step": 6504 + }, + { + "epoch": 3.1609726443768995, + "grad_norm": 0.07147873334806096, + "learning_rate": 1.0447229114040774e-06, + "loss": 0.5182, + "step": 6505 + }, + { + "epoch": 3.1614589665653496, + "grad_norm": 0.07250603851923879, + "learning_rate": 1.0435519118108572e-06, + "loss": 0.5238, + "step": 6506 + }, + { + "epoch": 3.1619452887537993, + "grad_norm": 0.06886656634411971, + "learning_rate": 1.0423814923858205e-06, + "loss": 0.4658, + "step": 6507 + }, + { + "epoch": 3.1624316109422494, + "grad_norm": 0.07004182950749595, + "learning_rate": 1.0412116533005962e-06, + "loss": 0.4726, + "step": 6508 + }, + { + "epoch": 3.162917933130699, + "grad_norm": 0.07118430137392404, + "learning_rate": 1.0400423947267264e-06, + "loss": 0.5102, + "step": 6509 + }, + { + "epoch": 3.163404255319149, + "grad_norm": 0.07179710744773422, + "learning_rate": 1.0388737168356728e-06, + "loss": 0.522, + "step": 6510 + }, + { + "epoch": 3.163890577507599, + "grad_norm": 0.07372243185053347, + "learning_rate": 1.0377056197988067e-06, + "loss": 0.5422, + "step": 6511 + }, + { + "epoch": 3.1643768996960486, + "grad_norm": 0.07238017122553848, + "learning_rate": 1.0365381037874166e-06, + "loss": 0.531, + "step": 6512 + }, + { + "epoch": 3.1648632218844983, + "grad_norm": 0.07021007864470094, + "learning_rate": 1.0353711689727058e-06, + "loss": 0.4992, + "step": 6513 + }, + { + "epoch": 3.1653495440729484, + "grad_norm": 0.06887908772773807, + "learning_rate": 1.0342048155257917e-06, + "loss": 0.4782, + "step": 6514 + }, + { + "epoch": 3.165835866261398, + "grad_norm": 0.07463052992192676, + "learning_rate": 1.0330390436177061e-06, + "loss": 0.5396, + "step": 6515 + }, + { + "epoch": 3.166322188449848, + "grad_norm": 0.07009162139754778, + "learning_rate": 1.031873853419398e-06, + "loss": 0.5119, + "step": 6516 + }, + { + "epoch": 3.166808510638298, + "grad_norm": 0.07224230827980828, + "learning_rate": 1.0307092451017275e-06, + "loss": 0.5168, + "step": 6517 + }, + { + "epoch": 3.1672948328267476, + "grad_norm": 0.07334670292074728, + "learning_rate": 1.0295452188354737e-06, + "loss": 0.5223, + "step": 6518 + }, + { + "epoch": 3.1677811550151977, + "grad_norm": 0.0695372476547809, + "learning_rate": 1.0283817747913244e-06, + "loss": 0.5007, + "step": 6519 + }, + { + "epoch": 3.1682674772036474, + "grad_norm": 0.0722402283297588, + "learning_rate": 1.0272189131398875e-06, + "loss": 0.5007, + "step": 6520 + }, + { + "epoch": 3.168753799392097, + "grad_norm": 0.07066322122298833, + "learning_rate": 1.0260566340516826e-06, + "loss": 0.4933, + "step": 6521 + }, + { + "epoch": 3.1692401215805472, + "grad_norm": 0.07467043023409528, + "learning_rate": 1.0248949376971457e-06, + "loss": 0.5552, + "step": 6522 + }, + { + "epoch": 3.169726443768997, + "grad_norm": 0.06981953434443888, + "learning_rate": 1.0237338242466254e-06, + "loss": 0.5153, + "step": 6523 + }, + { + "epoch": 3.1702127659574466, + "grad_norm": 0.0744979639933302, + "learning_rate": 1.0225732938703865e-06, + "loss": 0.5124, + "step": 6524 + }, + { + "epoch": 3.1706990881458967, + "grad_norm": 0.07143937671122483, + "learning_rate": 1.0214133467386072e-06, + "loss": 0.5245, + "step": 6525 + }, + { + "epoch": 3.1711854103343464, + "grad_norm": 0.07518448974694031, + "learning_rate": 1.0202539830213808e-06, + "loss": 0.5922, + "step": 6526 + }, + { + "epoch": 3.1716717325227965, + "grad_norm": 0.0694974279600695, + "learning_rate": 1.0190952028887136e-06, + "loss": 0.5026, + "step": 6527 + }, + { + "epoch": 3.1721580547112462, + "grad_norm": 0.07345074555256155, + "learning_rate": 1.0179370065105299e-06, + "loss": 0.564, + "step": 6528 + }, + { + "epoch": 3.172644376899696, + "grad_norm": 0.0726248518607165, + "learning_rate": 1.016779394056665e-06, + "loss": 0.5175, + "step": 6529 + }, + { + "epoch": 3.173130699088146, + "grad_norm": 0.07237804506656892, + "learning_rate": 1.0156223656968695e-06, + "loss": 0.5524, + "step": 6530 + }, + { + "epoch": 3.1736170212765957, + "grad_norm": 0.06939657595428865, + "learning_rate": 1.0144659216008084e-06, + "loss": 0.4636, + "step": 6531 + }, + { + "epoch": 3.1741033434650454, + "grad_norm": 0.07038133202224696, + "learning_rate": 1.0133100619380626e-06, + "loss": 0.5054, + "step": 6532 + }, + { + "epoch": 3.1745896656534955, + "grad_norm": 0.07139104462041464, + "learning_rate": 1.0121547868781228e-06, + "loss": 0.493, + "step": 6533 + }, + { + "epoch": 3.1750759878419452, + "grad_norm": 0.07018597721813158, + "learning_rate": 1.0110000965903988e-06, + "loss": 0.5018, + "step": 6534 + }, + { + "epoch": 3.1755623100303954, + "grad_norm": 0.07173715312699312, + "learning_rate": 1.0098459912442126e-06, + "loss": 0.5059, + "step": 6535 + }, + { + "epoch": 3.176048632218845, + "grad_norm": 0.07079841394075786, + "learning_rate": 1.0086924710088003e-06, + "loss": 0.5078, + "step": 6536 + }, + { + "epoch": 3.1765349544072947, + "grad_norm": 0.07284862135037284, + "learning_rate": 1.007539536053313e-06, + "loss": 0.5438, + "step": 6537 + }, + { + "epoch": 3.177021276595745, + "grad_norm": 0.07044494982880632, + "learning_rate": 1.0063871865468156e-06, + "loss": 0.4973, + "step": 6538 + }, + { + "epoch": 3.1775075987841945, + "grad_norm": 0.07106102566423396, + "learning_rate": 1.0052354226582861e-06, + "loss": 0.4717, + "step": 6539 + }, + { + "epoch": 3.177993920972644, + "grad_norm": 0.07188334847371916, + "learning_rate": 1.004084244556619e-06, + "loss": 0.5053, + "step": 6540 + }, + { + "epoch": 3.1784802431610943, + "grad_norm": 0.07053686110847719, + "learning_rate": 1.0029336524106202e-06, + "loss": 0.473, + "step": 6541 + }, + { + "epoch": 3.178966565349544, + "grad_norm": 0.07175167836259107, + "learning_rate": 1.0017836463890118e-06, + "loss": 0.4972, + "step": 6542 + }, + { + "epoch": 3.1794528875379937, + "grad_norm": 0.07044035821433776, + "learning_rate": 1.0006342266604291e-06, + "loss": 0.4512, + "step": 6543 + }, + { + "epoch": 3.179939209726444, + "grad_norm": 0.07167506835808471, + "learning_rate": 9.994853933934212e-07, + "loss": 0.4959, + "step": 6544 + }, + { + "epoch": 3.1804255319148935, + "grad_norm": 0.07288030164798386, + "learning_rate": 9.983371467564511e-07, + "loss": 0.5233, + "step": 6545 + }, + { + "epoch": 3.1809118541033437, + "grad_norm": 0.07296513728946342, + "learning_rate": 9.97189486917896e-07, + "loss": 0.4849, + "step": 6546 + }, + { + "epoch": 3.1813981762917933, + "grad_norm": 0.07375066155458689, + "learning_rate": 9.960424140460496e-07, + "loss": 0.5481, + "step": 6547 + }, + { + "epoch": 3.181884498480243, + "grad_norm": 0.07188220544116591, + "learning_rate": 9.948959283091141e-07, + "loss": 0.5636, + "step": 6548 + }, + { + "epoch": 3.182370820668693, + "grad_norm": 0.07252764198511936, + "learning_rate": 9.937500298752101e-07, + "loss": 0.5214, + "step": 6549 + }, + { + "epoch": 3.182857142857143, + "grad_norm": 0.07061568805737316, + "learning_rate": 9.926047189123699e-07, + "loss": 0.4787, + "step": 6550 + }, + { + "epoch": 3.1833434650455925, + "grad_norm": 0.07278077678552378, + "learning_rate": 9.914599955885407e-07, + "loss": 0.5207, + "step": 6551 + }, + { + "epoch": 3.1838297872340426, + "grad_norm": 0.07033369968208668, + "learning_rate": 9.903158600715834e-07, + "loss": 0.4972, + "step": 6552 + }, + { + "epoch": 3.1843161094224923, + "grad_norm": 0.07234286522060195, + "learning_rate": 9.891723125292723e-07, + "loss": 0.5256, + "step": 6553 + }, + { + "epoch": 3.1848024316109425, + "grad_norm": 0.07102207940232765, + "learning_rate": 9.88029353129295e-07, + "loss": 0.5016, + "step": 6554 + }, + { + "epoch": 3.185288753799392, + "grad_norm": 0.07316197025954028, + "learning_rate": 9.868869820392545e-07, + "loss": 0.5188, + "step": 6555 + }, + { + "epoch": 3.185775075987842, + "grad_norm": 0.06800174953747572, + "learning_rate": 9.857451994266665e-07, + "loss": 0.4631, + "step": 6556 + }, + { + "epoch": 3.186261398176292, + "grad_norm": 0.07114460446832876, + "learning_rate": 9.846040054589596e-07, + "loss": 0.5113, + "step": 6557 + }, + { + "epoch": 3.1867477203647416, + "grad_norm": 0.07016854668068076, + "learning_rate": 9.834634003034777e-07, + "loss": 0.4835, + "step": 6558 + }, + { + "epoch": 3.1872340425531913, + "grad_norm": 0.07204058729789382, + "learning_rate": 9.82323384127477e-07, + "loss": 0.5333, + "step": 6559 + }, + { + "epoch": 3.1877203647416414, + "grad_norm": 0.07370539109717562, + "learning_rate": 9.811839570981291e-07, + "loss": 0.5435, + "step": 6560 + }, + { + "epoch": 3.188206686930091, + "grad_norm": 0.07242073036108321, + "learning_rate": 9.800451193825167e-07, + "loss": 0.5241, + "step": 6561 + }, + { + "epoch": 3.1886930091185413, + "grad_norm": 0.07101656429489084, + "learning_rate": 9.78906871147638e-07, + "loss": 0.4929, + "step": 6562 + }, + { + "epoch": 3.189179331306991, + "grad_norm": 0.07616065146271954, + "learning_rate": 9.777692125604039e-07, + "loss": 0.5322, + "step": 6563 + }, + { + "epoch": 3.1896656534954406, + "grad_norm": 0.07323442124348449, + "learning_rate": 9.766321437876391e-07, + "loss": 0.5198, + "step": 6564 + }, + { + "epoch": 3.1901519756838908, + "grad_norm": 0.07132179297828939, + "learning_rate": 9.754956649960823e-07, + "loss": 0.5516, + "step": 6565 + }, + { + "epoch": 3.1906382978723404, + "grad_norm": 0.07057315349992284, + "learning_rate": 9.743597763523855e-07, + "loss": 0.485, + "step": 6566 + }, + { + "epoch": 3.19112462006079, + "grad_norm": 0.0712612554963249, + "learning_rate": 9.732244780231127e-07, + "loss": 0.5193, + "step": 6567 + }, + { + "epoch": 3.1916109422492402, + "grad_norm": 0.07187691360138726, + "learning_rate": 9.720897701747435e-07, + "loss": 0.5336, + "step": 6568 + }, + { + "epoch": 3.19209726443769, + "grad_norm": 0.06805895008877791, + "learning_rate": 9.709556529736692e-07, + "loss": 0.4727, + "step": 6569 + }, + { + "epoch": 3.1925835866261396, + "grad_norm": 0.07177273244968645, + "learning_rate": 9.698221265861957e-07, + "loss": 0.5302, + "step": 6570 + }, + { + "epoch": 3.1930699088145897, + "grad_norm": 0.07254163064575793, + "learning_rate": 9.686891911785418e-07, + "loss": 0.5386, + "step": 6571 + }, + { + "epoch": 3.1935562310030394, + "grad_norm": 0.07035196610090834, + "learning_rate": 9.675568469168388e-07, + "loss": 0.4986, + "step": 6572 + }, + { + "epoch": 3.1940425531914896, + "grad_norm": 0.0695053897796987, + "learning_rate": 9.664250939671332e-07, + "loss": 0.489, + "step": 6573 + }, + { + "epoch": 3.1945288753799392, + "grad_norm": 0.08581518037914929, + "learning_rate": 9.652939324953835e-07, + "loss": 0.5726, + "step": 6574 + }, + { + "epoch": 3.195015197568389, + "grad_norm": 0.06990004453786167, + "learning_rate": 9.641633626674612e-07, + "loss": 0.4919, + "step": 6575 + }, + { + "epoch": 3.195501519756839, + "grad_norm": 0.06954543980025382, + "learning_rate": 9.630333846491518e-07, + "loss": 0.4882, + "step": 6576 + }, + { + "epoch": 3.1959878419452887, + "grad_norm": 0.07087106464588994, + "learning_rate": 9.61903998606154e-07, + "loss": 0.542, + "step": 6577 + }, + { + "epoch": 3.1964741641337384, + "grad_norm": 0.07319970567781901, + "learning_rate": 9.607752047040792e-07, + "loss": 0.5162, + "step": 6578 + }, + { + "epoch": 3.1969604863221885, + "grad_norm": 0.07132779625985851, + "learning_rate": 9.59647003108452e-07, + "loss": 0.5181, + "step": 6579 + }, + { + "epoch": 3.1974468085106382, + "grad_norm": 0.07110947027204667, + "learning_rate": 9.58519393984712e-07, + "loss": 0.5163, + "step": 6580 + }, + { + "epoch": 3.197933130699088, + "grad_norm": 0.07313446791100611, + "learning_rate": 9.573923774982075e-07, + "loss": 0.5642, + "step": 6581 + }, + { + "epoch": 3.198419452887538, + "grad_norm": 0.06999437368133128, + "learning_rate": 9.562659538142027e-07, + "loss": 0.5109, + "step": 6582 + }, + { + "epoch": 3.1989057750759877, + "grad_norm": 0.07058124518391623, + "learning_rate": 9.551401230978773e-07, + "loss": 0.5212, + "step": 6583 + }, + { + "epoch": 3.199392097264438, + "grad_norm": 0.07144869186374213, + "learning_rate": 9.540148855143205e-07, + "loss": 0.5023, + "step": 6584 + }, + { + "epoch": 3.1998784194528875, + "grad_norm": 0.0710627424540113, + "learning_rate": 9.528902412285351e-07, + "loss": 0.5053, + "step": 6585 + }, + { + "epoch": 3.200364741641337, + "grad_norm": 0.07064290630221878, + "learning_rate": 9.517661904054387e-07, + "loss": 0.5165, + "step": 6586 + }, + { + "epoch": 3.2008510638297873, + "grad_norm": 0.07243029054189776, + "learning_rate": 9.506427332098589e-07, + "loss": 0.5128, + "step": 6587 + }, + { + "epoch": 3.201337386018237, + "grad_norm": 0.07127970828142006, + "learning_rate": 9.495198698065394e-07, + "loss": 0.5428, + "step": 6588 + }, + { + "epoch": 3.201823708206687, + "grad_norm": 0.07300225237254729, + "learning_rate": 9.483976003601341e-07, + "loss": 0.5024, + "step": 6589 + }, + { + "epoch": 3.202310030395137, + "grad_norm": 0.07096281524340389, + "learning_rate": 9.472759250352126e-07, + "loss": 0.507, + "step": 6590 + }, + { + "epoch": 3.2027963525835865, + "grad_norm": 0.07075964114599965, + "learning_rate": 9.461548439962542e-07, + "loss": 0.555, + "step": 6591 + }, + { + "epoch": 3.2032826747720367, + "grad_norm": 0.07560427051894147, + "learning_rate": 9.450343574076537e-07, + "loss": 0.5466, + "step": 6592 + }, + { + "epoch": 3.2037689969604863, + "grad_norm": 0.07379002330362418, + "learning_rate": 9.439144654337179e-07, + "loss": 0.5292, + "step": 6593 + }, + { + "epoch": 3.204255319148936, + "grad_norm": 0.0715001658634348, + "learning_rate": 9.427951682386654e-07, + "loss": 0.514, + "step": 6594 + }, + { + "epoch": 3.204741641337386, + "grad_norm": 0.06931942506780431, + "learning_rate": 9.416764659866301e-07, + "loss": 0.4881, + "step": 6595 + }, + { + "epoch": 3.205227963525836, + "grad_norm": 0.07138063141402301, + "learning_rate": 9.405583588416545e-07, + "loss": 0.5011, + "step": 6596 + }, + { + "epoch": 3.2057142857142855, + "grad_norm": 0.07312667271218534, + "learning_rate": 9.394408469676974e-07, + "loss": 0.5232, + "step": 6597 + }, + { + "epoch": 3.2062006079027356, + "grad_norm": 0.07307923454377412, + "learning_rate": 9.383239305286302e-07, + "loss": 0.5227, + "step": 6598 + }, + { + "epoch": 3.2066869300911853, + "grad_norm": 0.0710776103712388, + "learning_rate": 9.372076096882344e-07, + "loss": 0.5221, + "step": 6599 + }, + { + "epoch": 3.2071732522796355, + "grad_norm": 0.07509157716515864, + "learning_rate": 9.360918846102057e-07, + "loss": 0.5274, + "step": 6600 + }, + { + "epoch": 3.207659574468085, + "grad_norm": 0.06899005816163993, + "learning_rate": 9.34976755458154e-07, + "loss": 0.4544, + "step": 6601 + }, + { + "epoch": 3.208145896656535, + "grad_norm": 0.0708896240031363, + "learning_rate": 9.338622223956006e-07, + "loss": 0.5183, + "step": 6602 + }, + { + "epoch": 3.208632218844985, + "grad_norm": 0.06917118232357904, + "learning_rate": 9.327482855859776e-07, + "loss": 0.4722, + "step": 6603 + }, + { + "epoch": 3.2091185410334346, + "grad_norm": 0.07060890582885317, + "learning_rate": 9.31634945192632e-07, + "loss": 0.5056, + "step": 6604 + }, + { + "epoch": 3.2096048632218843, + "grad_norm": 0.07285005633966471, + "learning_rate": 9.305222013788223e-07, + "loss": 0.535, + "step": 6605 + }, + { + "epoch": 3.2100911854103344, + "grad_norm": 0.07419051531572524, + "learning_rate": 9.294100543077201e-07, + "loss": 0.5671, + "step": 6606 + }, + { + "epoch": 3.210577507598784, + "grad_norm": 0.07082598871396202, + "learning_rate": 9.282985041424086e-07, + "loss": 0.4965, + "step": 6607 + }, + { + "epoch": 3.211063829787234, + "grad_norm": 0.07188920613846218, + "learning_rate": 9.271875510458845e-07, + "loss": 0.5251, + "step": 6608 + }, + { + "epoch": 3.211550151975684, + "grad_norm": 0.06979772983714502, + "learning_rate": 9.26077195181056e-07, + "loss": 0.4803, + "step": 6609 + }, + { + "epoch": 3.2120364741641336, + "grad_norm": 0.0714712665373541, + "learning_rate": 9.249674367107453e-07, + "loss": 0.5041, + "step": 6610 + }, + { + "epoch": 3.2125227963525838, + "grad_norm": 0.07338503338950801, + "learning_rate": 9.238582757976839e-07, + "loss": 0.5635, + "step": 6611 + }, + { + "epoch": 3.2130091185410334, + "grad_norm": 0.06895964928351896, + "learning_rate": 9.227497126045187e-07, + "loss": 0.4963, + "step": 6612 + }, + { + "epoch": 3.213495440729483, + "grad_norm": 0.07286438208122141, + "learning_rate": 9.216417472938083e-07, + "loss": 0.4933, + "step": 6613 + }, + { + "epoch": 3.2139817629179332, + "grad_norm": 0.07292777456855977, + "learning_rate": 9.20534380028022e-07, + "loss": 0.5485, + "step": 6614 + }, + { + "epoch": 3.214468085106383, + "grad_norm": 0.06952089188154065, + "learning_rate": 9.194276109695443e-07, + "loss": 0.4985, + "step": 6615 + }, + { + "epoch": 3.214954407294833, + "grad_norm": 0.07202678853959618, + "learning_rate": 9.183214402806689e-07, + "loss": 0.5055, + "step": 6616 + }, + { + "epoch": 3.2154407294832827, + "grad_norm": 0.07176545814135184, + "learning_rate": 9.172158681236043e-07, + "loss": 0.5412, + "step": 6617 + }, + { + "epoch": 3.2159270516717324, + "grad_norm": 0.07091850643730259, + "learning_rate": 9.161108946604674e-07, + "loss": 0.5167, + "step": 6618 + }, + { + "epoch": 3.2164133738601826, + "grad_norm": 0.07242664674208017, + "learning_rate": 9.150065200532942e-07, + "loss": 0.4939, + "step": 6619 + }, + { + "epoch": 3.2168996960486322, + "grad_norm": 0.07159815503445581, + "learning_rate": 9.139027444640264e-07, + "loss": 0.5114, + "step": 6620 + }, + { + "epoch": 3.217386018237082, + "grad_norm": 0.07181949478413042, + "learning_rate": 9.127995680545204e-07, + "loss": 0.5444, + "step": 6621 + }, + { + "epoch": 3.217872340425532, + "grad_norm": 0.07241375375847589, + "learning_rate": 9.116969909865448e-07, + "loss": 0.5096, + "step": 6622 + }, + { + "epoch": 3.2183586626139817, + "grad_norm": 0.07027026120224286, + "learning_rate": 9.105950134217795e-07, + "loss": 0.5005, + "step": 6623 + }, + { + "epoch": 3.2188449848024314, + "grad_norm": 0.0705956631807819, + "learning_rate": 9.09493635521817e-07, + "loss": 0.5228, + "step": 6624 + }, + { + "epoch": 3.2193313069908815, + "grad_norm": 0.07202787207458479, + "learning_rate": 9.083928574481637e-07, + "loss": 0.5103, + "step": 6625 + }, + { + "epoch": 3.2198176291793312, + "grad_norm": 0.07143516564887394, + "learning_rate": 9.072926793622333e-07, + "loss": 0.5213, + "step": 6626 + }, + { + "epoch": 3.2203039513677814, + "grad_norm": 0.07354103714973544, + "learning_rate": 9.061931014253556e-07, + "loss": 0.5354, + "step": 6627 + }, + { + "epoch": 3.220790273556231, + "grad_norm": 0.07298242203084525, + "learning_rate": 9.050941237987709e-07, + "loss": 0.5837, + "step": 6628 + }, + { + "epoch": 3.2212765957446807, + "grad_norm": 0.07068375541369609, + "learning_rate": 9.039957466436328e-07, + "loss": 0.4972, + "step": 6629 + }, + { + "epoch": 3.221762917933131, + "grad_norm": 0.07128749994619307, + "learning_rate": 9.02897970121005e-07, + "loss": 0.4883, + "step": 6630 + }, + { + "epoch": 3.2222492401215805, + "grad_norm": 0.07135012464820764, + "learning_rate": 9.018007943918645e-07, + "loss": 0.5203, + "step": 6631 + }, + { + "epoch": 3.22273556231003, + "grad_norm": 0.0710371230491351, + "learning_rate": 9.007042196170989e-07, + "loss": 0.508, + "step": 6632 + }, + { + "epoch": 3.2232218844984803, + "grad_norm": 0.07261249038470223, + "learning_rate": 8.99608245957509e-07, + "loss": 0.5074, + "step": 6633 + }, + { + "epoch": 3.22370820668693, + "grad_norm": 0.0685338730412036, + "learning_rate": 8.985128735738069e-07, + "loss": 0.4747, + "step": 6634 + }, + { + "epoch": 3.2241945288753797, + "grad_norm": 0.07353307586944226, + "learning_rate": 8.974181026266165e-07, + "loss": 0.4988, + "step": 6635 + }, + { + "epoch": 3.22468085106383, + "grad_norm": 0.06928889568150072, + "learning_rate": 8.963239332764718e-07, + "loss": 0.4996, + "step": 6636 + }, + { + "epoch": 3.2251671732522795, + "grad_norm": 0.07699051618424035, + "learning_rate": 8.952303656838235e-07, + "loss": 0.5494, + "step": 6637 + }, + { + "epoch": 3.2256534954407297, + "grad_norm": 0.07118625366267374, + "learning_rate": 8.941374000090297e-07, + "loss": 0.5159, + "step": 6638 + }, + { + "epoch": 3.2261398176291793, + "grad_norm": 0.07148611563861293, + "learning_rate": 8.930450364123616e-07, + "loss": 0.5383, + "step": 6639 + }, + { + "epoch": 3.226626139817629, + "grad_norm": 0.0703852031534301, + "learning_rate": 8.919532750540006e-07, + "loss": 0.5076, + "step": 6640 + }, + { + "epoch": 3.227112462006079, + "grad_norm": 0.07182033856521591, + "learning_rate": 8.908621160940418e-07, + "loss": 0.4783, + "step": 6641 + }, + { + "epoch": 3.227598784194529, + "grad_norm": 0.07021041457609291, + "learning_rate": 8.89771559692491e-07, + "loss": 0.5109, + "step": 6642 + }, + { + "epoch": 3.2280851063829785, + "grad_norm": 0.07201072462342475, + "learning_rate": 8.886816060092663e-07, + "loss": 0.513, + "step": 6643 + }, + { + "epoch": 3.2285714285714286, + "grad_norm": 0.06837650667282466, + "learning_rate": 8.875922552041971e-07, + "loss": 0.4593, + "step": 6644 + }, + { + "epoch": 3.2290577507598783, + "grad_norm": 0.07115129558286391, + "learning_rate": 8.865035074370243e-07, + "loss": 0.4773, + "step": 6645 + }, + { + "epoch": 3.2295440729483285, + "grad_norm": 0.06993739552887139, + "learning_rate": 8.854153628674e-07, + "loss": 0.5203, + "step": 6646 + }, + { + "epoch": 3.230030395136778, + "grad_norm": 0.07052411287303395, + "learning_rate": 8.84327821654889e-07, + "loss": 0.4717, + "step": 6647 + }, + { + "epoch": 3.230516717325228, + "grad_norm": 0.07182067644441656, + "learning_rate": 8.832408839589656e-07, + "loss": 0.5043, + "step": 6648 + }, + { + "epoch": 3.231003039513678, + "grad_norm": 0.07136228080162373, + "learning_rate": 8.821545499390183e-07, + "loss": 0.536, + "step": 6649 + }, + { + "epoch": 3.2314893617021276, + "grad_norm": 0.07022756578578787, + "learning_rate": 8.810688197543449e-07, + "loss": 0.5142, + "step": 6650 + }, + { + "epoch": 3.2319756838905773, + "grad_norm": 0.07025163375485158, + "learning_rate": 8.799836935641559e-07, + "loss": 0.5254, + "step": 6651 + }, + { + "epoch": 3.2324620060790275, + "grad_norm": 0.07096770568648499, + "learning_rate": 8.788991715275718e-07, + "loss": 0.5187, + "step": 6652 + }, + { + "epoch": 3.232948328267477, + "grad_norm": 0.07241760452396179, + "learning_rate": 8.77815253803626e-07, + "loss": 0.515, + "step": 6653 + }, + { + "epoch": 3.2334346504559273, + "grad_norm": 0.07219397152470888, + "learning_rate": 8.767319405512631e-07, + "loss": 0.522, + "step": 6654 + }, + { + "epoch": 3.233920972644377, + "grad_norm": 0.07157240958150009, + "learning_rate": 8.756492319293381e-07, + "loss": 0.5269, + "step": 6655 + }, + { + "epoch": 3.2344072948328266, + "grad_norm": 0.06960412720172376, + "learning_rate": 8.745671280966178e-07, + "loss": 0.4881, + "step": 6656 + }, + { + "epoch": 3.2348936170212768, + "grad_norm": 0.07017221096742848, + "learning_rate": 8.73485629211781e-07, + "loss": 0.4671, + "step": 6657 + }, + { + "epoch": 3.2353799392097264, + "grad_norm": 0.07045450849325116, + "learning_rate": 8.724047354334169e-07, + "loss": 0.5082, + "step": 6658 + }, + { + "epoch": 3.235866261398176, + "grad_norm": 0.07106206450127038, + "learning_rate": 8.713244469200272e-07, + "loss": 0.4787, + "step": 6659 + }, + { + "epoch": 3.2363525835866263, + "grad_norm": 0.06890903072031643, + "learning_rate": 8.702447638300221e-07, + "loss": 0.4938, + "step": 6660 + }, + { + "epoch": 3.236838905775076, + "grad_norm": 0.0704515274818252, + "learning_rate": 8.691656863217263e-07, + "loss": 0.4992, + "step": 6661 + }, + { + "epoch": 3.2373252279635256, + "grad_norm": 0.07346651287263162, + "learning_rate": 8.680872145533742e-07, + "loss": 0.5394, + "step": 6662 + }, + { + "epoch": 3.2378115501519757, + "grad_norm": 0.07127118651222632, + "learning_rate": 8.670093486831105e-07, + "loss": 0.4934, + "step": 6663 + }, + { + "epoch": 3.2382978723404254, + "grad_norm": 0.07268534199866672, + "learning_rate": 8.659320888689932e-07, + "loss": 0.5511, + "step": 6664 + }, + { + "epoch": 3.2387841945288756, + "grad_norm": 0.07068047988483725, + "learning_rate": 8.648554352689892e-07, + "loss": 0.5291, + "step": 6665 + }, + { + "epoch": 3.2392705167173252, + "grad_norm": 0.07293907280902406, + "learning_rate": 8.637793880409778e-07, + "loss": 0.5238, + "step": 6666 + }, + { + "epoch": 3.239756838905775, + "grad_norm": 0.07019097943184698, + "learning_rate": 8.627039473427495e-07, + "loss": 0.4628, + "step": 6667 + }, + { + "epoch": 3.240243161094225, + "grad_norm": 0.06937920215943859, + "learning_rate": 8.616291133320053e-07, + "loss": 0.4743, + "step": 6668 + }, + { + "epoch": 3.2407294832826747, + "grad_norm": 0.07198544230257038, + "learning_rate": 8.605548861663571e-07, + "loss": 0.5234, + "step": 6669 + }, + { + "epoch": 3.2412158054711244, + "grad_norm": 0.07081791779059504, + "learning_rate": 8.594812660033286e-07, + "loss": 0.4964, + "step": 6670 + }, + { + "epoch": 3.2417021276595746, + "grad_norm": 0.07122570568972306, + "learning_rate": 8.584082530003535e-07, + "loss": 0.5076, + "step": 6671 + }, + { + "epoch": 3.2421884498480242, + "grad_norm": 0.07109610901748635, + "learning_rate": 8.573358473147775e-07, + "loss": 0.5098, + "step": 6672 + }, + { + "epoch": 3.2426747720364744, + "grad_norm": 0.07052747008516987, + "learning_rate": 8.56264049103856e-07, + "loss": 0.5418, + "step": 6673 + }, + { + "epoch": 3.243161094224924, + "grad_norm": 0.07097981311497588, + "learning_rate": 8.551928585247565e-07, + "loss": 0.4875, + "step": 6674 + }, + { + "epoch": 3.2436474164133737, + "grad_norm": 0.07077987147567684, + "learning_rate": 8.541222757345574e-07, + "loss": 0.4915, + "step": 6675 + }, + { + "epoch": 3.244133738601824, + "grad_norm": 0.07039884421441911, + "learning_rate": 8.530523008902464e-07, + "loss": 0.5176, + "step": 6676 + }, + { + "epoch": 3.2446200607902735, + "grad_norm": 0.07089924937189282, + "learning_rate": 8.51982934148724e-07, + "loss": 0.4903, + "step": 6677 + }, + { + "epoch": 3.2451063829787232, + "grad_norm": 0.07264228774406058, + "learning_rate": 8.50914175666801e-07, + "loss": 0.5673, + "step": 6678 + }, + { + "epoch": 3.2455927051671734, + "grad_norm": 0.07108938563226617, + "learning_rate": 8.498460256011976e-07, + "loss": 0.499, + "step": 6679 + }, + { + "epoch": 3.246079027355623, + "grad_norm": 0.06956281785561204, + "learning_rate": 8.487784841085461e-07, + "loss": 0.5014, + "step": 6680 + }, + { + "epoch": 3.246565349544073, + "grad_norm": 0.07188876324488672, + "learning_rate": 8.477115513453904e-07, + "loss": 0.5286, + "step": 6681 + }, + { + "epoch": 3.247051671732523, + "grad_norm": 0.07232988044828668, + "learning_rate": 8.466452274681825e-07, + "loss": 0.5353, + "step": 6682 + }, + { + "epoch": 3.247051671732523, + "eval_loss": 0.5697982907295227, + "eval_runtime": 105.1823, + "eval_samples_per_second": 288.575, + "eval_steps_per_second": 36.08, + "step": 6682 + }, + { + "epoch": 3.2475379939209725, + "grad_norm": 0.07050943371541134, + "learning_rate": 8.455795126332883e-07, + "loss": 0.5272, + "step": 6683 + }, + { + "epoch": 3.2480243161094227, + "grad_norm": 0.06906538143042283, + "learning_rate": 8.445144069969813e-07, + "loss": 0.4706, + "step": 6684 + }, + { + "epoch": 3.2485106382978723, + "grad_norm": 0.06959869590037358, + "learning_rate": 8.434499107154486e-07, + "loss": 0.5078, + "step": 6685 + }, + { + "epoch": 3.248996960486322, + "grad_norm": 0.06833386951533407, + "learning_rate": 8.423860239447851e-07, + "loss": 0.4784, + "step": 6686 + }, + { + "epoch": 3.249483282674772, + "grad_norm": 0.06957653617225527, + "learning_rate": 8.413227468410001e-07, + "loss": 0.5003, + "step": 6687 + }, + { + "epoch": 3.249969604863222, + "grad_norm": 0.07190594943453445, + "learning_rate": 8.40260079560008e-07, + "loss": 0.5167, + "step": 6688 + }, + { + "epoch": 3.2504559270516715, + "grad_norm": 0.07120868946868153, + "learning_rate": 8.39198022257638e-07, + "loss": 0.5215, + "step": 6689 + }, + { + "epoch": 3.2509422492401217, + "grad_norm": 0.07338432145314944, + "learning_rate": 8.381365750896292e-07, + "loss": 0.5233, + "step": 6690 + }, + { + "epoch": 3.2514285714285713, + "grad_norm": 0.06982103780120874, + "learning_rate": 8.37075738211629e-07, + "loss": 0.4729, + "step": 6691 + }, + { + "epoch": 3.2519148936170215, + "grad_norm": 0.07012801317526553, + "learning_rate": 8.360155117792002e-07, + "loss": 0.4961, + "step": 6692 + }, + { + "epoch": 3.252401215805471, + "grad_norm": 0.07458212688986891, + "learning_rate": 8.349558959478116e-07, + "loss": 0.5428, + "step": 6693 + }, + { + "epoch": 3.252887537993921, + "grad_norm": 0.07126116694589527, + "learning_rate": 8.338968908728434e-07, + "loss": 0.5119, + "step": 6694 + }, + { + "epoch": 3.253373860182371, + "grad_norm": 0.07004356300358536, + "learning_rate": 8.32838496709587e-07, + "loss": 0.5136, + "step": 6695 + }, + { + "epoch": 3.2538601823708206, + "grad_norm": 0.07200875228844517, + "learning_rate": 8.317807136132439e-07, + "loss": 0.5508, + "step": 6696 + }, + { + "epoch": 3.2543465045592703, + "grad_norm": 0.06993379002200782, + "learning_rate": 8.307235417389253e-07, + "loss": 0.4862, + "step": 6697 + }, + { + "epoch": 3.2548328267477205, + "grad_norm": 0.07357284131512169, + "learning_rate": 8.296669812416546e-07, + "loss": 0.5091, + "step": 6698 + }, + { + "epoch": 3.25531914893617, + "grad_norm": 0.06981934265510063, + "learning_rate": 8.286110322763635e-07, + "loss": 0.4885, + "step": 6699 + }, + { + "epoch": 3.25580547112462, + "grad_norm": 0.06853191052680513, + "learning_rate": 8.275556949978958e-07, + "loss": 0.4661, + "step": 6700 + }, + { + "epoch": 3.25629179331307, + "grad_norm": 0.07160968738055536, + "learning_rate": 8.265009695610038e-07, + "loss": 0.5139, + "step": 6701 + }, + { + "epoch": 3.2567781155015196, + "grad_norm": 0.06970425937040438, + "learning_rate": 8.254468561203527e-07, + "loss": 0.5184, + "step": 6702 + }, + { + "epoch": 3.2572644376899698, + "grad_norm": 0.06988621532912487, + "learning_rate": 8.243933548305133e-07, + "loss": 0.5002, + "step": 6703 + }, + { + "epoch": 3.2577507598784194, + "grad_norm": 0.07045706268925343, + "learning_rate": 8.233404658459721e-07, + "loss": 0.4973, + "step": 6704 + }, + { + "epoch": 3.258237082066869, + "grad_norm": 0.07087213933843038, + "learning_rate": 8.222881893211221e-07, + "loss": 0.514, + "step": 6705 + }, + { + "epoch": 3.2587234042553193, + "grad_norm": 0.07146602335709688, + "learning_rate": 8.212365254102677e-07, + "loss": 0.5292, + "step": 6706 + }, + { + "epoch": 3.259209726443769, + "grad_norm": 0.07260648628114531, + "learning_rate": 8.201854742676241e-07, + "loss": 0.5212, + "step": 6707 + }, + { + "epoch": 3.259696048632219, + "grad_norm": 0.07302207479121936, + "learning_rate": 8.191350360473161e-07, + "loss": 0.5495, + "step": 6708 + }, + { + "epoch": 3.2601823708206688, + "grad_norm": 0.07044224517567534, + "learning_rate": 8.180852109033766e-07, + "loss": 0.529, + "step": 6709 + }, + { + "epoch": 3.2606686930091184, + "grad_norm": 0.06983207440788895, + "learning_rate": 8.17035998989753e-07, + "loss": 0.4896, + "step": 6710 + }, + { + "epoch": 3.2611550151975686, + "grad_norm": 0.07134130972511271, + "learning_rate": 8.159874004603002e-07, + "loss": 0.537, + "step": 6711 + }, + { + "epoch": 3.2616413373860182, + "grad_norm": 0.0729634608031717, + "learning_rate": 8.149394154687823e-07, + "loss": 0.5375, + "step": 6712 + }, + { + "epoch": 3.262127659574468, + "grad_norm": 0.0702147847350036, + "learning_rate": 8.138920441688741e-07, + "loss": 0.474, + "step": 6713 + }, + { + "epoch": 3.262613981762918, + "grad_norm": 0.07555019420391294, + "learning_rate": 8.128452867141618e-07, + "loss": 0.5241, + "step": 6714 + }, + { + "epoch": 3.2631003039513677, + "grad_norm": 0.07237785593610427, + "learning_rate": 8.117991432581396e-07, + "loss": 0.5357, + "step": 6715 + }, + { + "epoch": 3.2635866261398174, + "grad_norm": 0.06965607494193911, + "learning_rate": 8.107536139542132e-07, + "loss": 0.5194, + "step": 6716 + }, + { + "epoch": 3.2640729483282676, + "grad_norm": 0.07128960605514549, + "learning_rate": 8.097086989556979e-07, + "loss": 0.5139, + "step": 6717 + }, + { + "epoch": 3.2645592705167172, + "grad_norm": 0.07038015499999553, + "learning_rate": 8.086643984158177e-07, + "loss": 0.5469, + "step": 6718 + }, + { + "epoch": 3.2650455927051674, + "grad_norm": 0.0720534862310441, + "learning_rate": 8.076207124877067e-07, + "loss": 0.5016, + "step": 6719 + }, + { + "epoch": 3.265531914893617, + "grad_norm": 0.07420249347257879, + "learning_rate": 8.065776413244114e-07, + "loss": 0.5269, + "step": 6720 + }, + { + "epoch": 3.2660182370820667, + "grad_norm": 0.07270066923272671, + "learning_rate": 8.05535185078885e-07, + "loss": 0.5323, + "step": 6721 + }, + { + "epoch": 3.266504559270517, + "grad_norm": 0.07420155667146282, + "learning_rate": 8.044933439039926e-07, + "loss": 0.5333, + "step": 6722 + }, + { + "epoch": 3.2669908814589665, + "grad_norm": 0.06909639448934451, + "learning_rate": 8.034521179525079e-07, + "loss": 0.5057, + "step": 6723 + }, + { + "epoch": 3.2674772036474162, + "grad_norm": 0.07106245916394575, + "learning_rate": 8.024115073771154e-07, + "loss": 0.5204, + "step": 6724 + }, + { + "epoch": 3.2679635258358664, + "grad_norm": 0.06897116055661569, + "learning_rate": 8.013715123304089e-07, + "loss": 0.5234, + "step": 6725 + }, + { + "epoch": 3.268449848024316, + "grad_norm": 0.07031939633422496, + "learning_rate": 8.003321329648911e-07, + "loss": 0.5019, + "step": 6726 + }, + { + "epoch": 3.2689361702127657, + "grad_norm": 0.07288176899274526, + "learning_rate": 7.992933694329747e-07, + "loss": 0.5566, + "step": 6727 + }, + { + "epoch": 3.269422492401216, + "grad_norm": 0.07443121690921613, + "learning_rate": 7.982552218869843e-07, + "loss": 0.5381, + "step": 6728 + }, + { + "epoch": 3.2699088145896655, + "grad_norm": 0.06991596429238804, + "learning_rate": 7.972176904791518e-07, + "loss": 0.494, + "step": 6729 + }, + { + "epoch": 3.2703951367781157, + "grad_norm": 0.07274997577093707, + "learning_rate": 7.96180775361619e-07, + "loss": 0.5255, + "step": 6730 + }, + { + "epoch": 3.2708814589665653, + "grad_norm": 0.07267445539400239, + "learning_rate": 7.951444766864397e-07, + "loss": 0.5084, + "step": 6731 + }, + { + "epoch": 3.271367781155015, + "grad_norm": 0.06945763463521465, + "learning_rate": 7.94108794605572e-07, + "loss": 0.4999, + "step": 6732 + }, + { + "epoch": 3.271854103343465, + "grad_norm": 0.07044988524813009, + "learning_rate": 7.930737292708889e-07, + "loss": 0.4983, + "step": 6733 + }, + { + "epoch": 3.272340425531915, + "grad_norm": 0.07277831958200251, + "learning_rate": 7.920392808341704e-07, + "loss": 0.5308, + "step": 6734 + }, + { + "epoch": 3.272826747720365, + "grad_norm": 0.07243967490419853, + "learning_rate": 7.910054494471064e-07, + "loss": 0.5136, + "step": 6735 + }, + { + "epoch": 3.2733130699088147, + "grad_norm": 0.07103700203490818, + "learning_rate": 7.899722352612976e-07, + "loss": 0.5209, + "step": 6736 + }, + { + "epoch": 3.2737993920972643, + "grad_norm": 0.07173752856364275, + "learning_rate": 7.889396384282522e-07, + "loss": 0.4937, + "step": 6737 + }, + { + "epoch": 3.2742857142857145, + "grad_norm": 0.07223241974934827, + "learning_rate": 7.879076590993889e-07, + "loss": 0.5064, + "step": 6738 + }, + { + "epoch": 3.274772036474164, + "grad_norm": 0.07187526073243612, + "learning_rate": 7.868762974260358e-07, + "loss": 0.5364, + "step": 6739 + }, + { + "epoch": 3.275258358662614, + "grad_norm": 0.07355205971680061, + "learning_rate": 7.858455535594306e-07, + "loss": 0.5042, + "step": 6740 + }, + { + "epoch": 3.275744680851064, + "grad_norm": 0.07302633220465582, + "learning_rate": 7.848154276507203e-07, + "loss": 0.5266, + "step": 6741 + }, + { + "epoch": 3.2762310030395136, + "grad_norm": 0.07185108454555761, + "learning_rate": 7.837859198509612e-07, + "loss": 0.5352, + "step": 6742 + }, + { + "epoch": 3.2767173252279633, + "grad_norm": 0.06970707735731069, + "learning_rate": 7.827570303111182e-07, + "loss": 0.5098, + "step": 6743 + }, + { + "epoch": 3.2772036474164135, + "grad_norm": 0.06983103064121515, + "learning_rate": 7.817287591820666e-07, + "loss": 0.4939, + "step": 6744 + }, + { + "epoch": 3.277689969604863, + "grad_norm": 0.07063865905214017, + "learning_rate": 7.807011066145897e-07, + "loss": 0.5167, + "step": 6745 + }, + { + "epoch": 3.2781762917933133, + "grad_norm": 0.07052672816456025, + "learning_rate": 7.796740727593849e-07, + "loss": 0.5206, + "step": 6746 + }, + { + "epoch": 3.278662613981763, + "grad_norm": 0.0710467610618152, + "learning_rate": 7.786476577670509e-07, + "loss": 0.4939, + "step": 6747 + }, + { + "epoch": 3.2791489361702126, + "grad_norm": 0.0724447999760171, + "learning_rate": 7.776218617881016e-07, + "loss": 0.5164, + "step": 6748 + }, + { + "epoch": 3.2796352583586628, + "grad_norm": 0.07445658665825779, + "learning_rate": 7.765966849729578e-07, + "loss": 0.5472, + "step": 6749 + }, + { + "epoch": 3.2801215805471124, + "grad_norm": 0.06952869171566352, + "learning_rate": 7.755721274719502e-07, + "loss": 0.4791, + "step": 6750 + }, + { + "epoch": 3.280607902735562, + "grad_norm": 0.0712442624772608, + "learning_rate": 7.745481894353186e-07, + "loss": 0.4885, + "step": 6751 + }, + { + "epoch": 3.2810942249240123, + "grad_norm": 0.06954541584441366, + "learning_rate": 7.735248710132115e-07, + "loss": 0.4988, + "step": 6752 + }, + { + "epoch": 3.281580547112462, + "grad_norm": 0.07137156468614886, + "learning_rate": 7.725021723556875e-07, + "loss": 0.4884, + "step": 6753 + }, + { + "epoch": 3.2820668693009116, + "grad_norm": 0.07121825188777446, + "learning_rate": 7.714800936127137e-07, + "loss": 0.5203, + "step": 6754 + }, + { + "epoch": 3.2825531914893618, + "grad_norm": 0.07259237781028875, + "learning_rate": 7.704586349341658e-07, + "loss": 0.528, + "step": 6755 + }, + { + "epoch": 3.2830395136778114, + "grad_norm": 0.07181558435771633, + "learning_rate": 7.694377964698297e-07, + "loss": 0.493, + "step": 6756 + }, + { + "epoch": 3.2835258358662616, + "grad_norm": 0.06973374411306754, + "learning_rate": 7.684175783693998e-07, + "loss": 0.4952, + "step": 6757 + }, + { + "epoch": 3.2840121580547113, + "grad_norm": 0.07179561183796015, + "learning_rate": 7.673979807824788e-07, + "loss": 0.5216, + "step": 6758 + }, + { + "epoch": 3.284498480243161, + "grad_norm": 0.06984942550101632, + "learning_rate": 7.663790038585794e-07, + "loss": 0.4961, + "step": 6759 + }, + { + "epoch": 3.284984802431611, + "grad_norm": 0.07264213214800169, + "learning_rate": 7.653606477471237e-07, + "loss": 0.4816, + "step": 6760 + }, + { + "epoch": 3.2854711246200607, + "grad_norm": 0.07155529148916669, + "learning_rate": 7.643429125974411e-07, + "loss": 0.5261, + "step": 6761 + }, + { + "epoch": 3.285957446808511, + "grad_norm": 0.0704857524744365, + "learning_rate": 7.633257985587711e-07, + "loss": 0.4903, + "step": 6762 + }, + { + "epoch": 3.2864437689969606, + "grad_norm": 0.06888892461398638, + "learning_rate": 7.623093057802622e-07, + "loss": 0.4863, + "step": 6763 + }, + { + "epoch": 3.2869300911854102, + "grad_norm": 0.06930161900158482, + "learning_rate": 7.612934344109718e-07, + "loss": 0.488, + "step": 6764 + }, + { + "epoch": 3.2874164133738604, + "grad_norm": 0.06957023485748803, + "learning_rate": 7.602781845998652e-07, + "loss": 0.4884, + "step": 6765 + }, + { + "epoch": 3.28790273556231, + "grad_norm": 0.07232513784655341, + "learning_rate": 7.592635564958178e-07, + "loss": 0.5248, + "step": 6766 + }, + { + "epoch": 3.2883890577507597, + "grad_norm": 0.07253776676777018, + "learning_rate": 7.582495502476134e-07, + "loss": 0.54, + "step": 6767 + }, + { + "epoch": 3.28887537993921, + "grad_norm": 0.07055165233972628, + "learning_rate": 7.572361660039434e-07, + "loss": 0.4807, + "step": 6768 + }, + { + "epoch": 3.2893617021276595, + "grad_norm": 0.07430399192847663, + "learning_rate": 7.562234039134103e-07, + "loss": 0.5165, + "step": 6769 + }, + { + "epoch": 3.2898480243161092, + "grad_norm": 0.07031466168770636, + "learning_rate": 7.552112641245241e-07, + "loss": 0.503, + "step": 6770 + }, + { + "epoch": 3.2903343465045594, + "grad_norm": 0.07115246543581653, + "learning_rate": 7.541997467857026e-07, + "loss": 0.5163, + "step": 6771 + }, + { + "epoch": 3.290820668693009, + "grad_norm": 0.07174269848071549, + "learning_rate": 7.531888520452746e-07, + "loss": 0.5042, + "step": 6772 + }, + { + "epoch": 3.291306990881459, + "grad_norm": 0.07064398846981618, + "learning_rate": 7.521785800514752e-07, + "loss": 0.5128, + "step": 6773 + }, + { + "epoch": 3.291793313069909, + "grad_norm": 0.07069848162737531, + "learning_rate": 7.511689309524501e-07, + "loss": 0.5075, + "step": 6774 + }, + { + "epoch": 3.2922796352583585, + "grad_norm": 0.07167712052467, + "learning_rate": 7.501599048962527e-07, + "loss": 0.5276, + "step": 6775 + }, + { + "epoch": 3.2927659574468087, + "grad_norm": 0.07100369101453206, + "learning_rate": 7.491515020308448e-07, + "loss": 0.5, + "step": 6776 + }, + { + "epoch": 3.2932522796352584, + "grad_norm": 0.07238004051221887, + "learning_rate": 7.481437225040978e-07, + "loss": 0.5408, + "step": 6777 + }, + { + "epoch": 3.293738601823708, + "grad_norm": 0.07119703583328676, + "learning_rate": 7.471365664637903e-07, + "loss": 0.5271, + "step": 6778 + }, + { + "epoch": 3.294224924012158, + "grad_norm": 0.0733725231937714, + "learning_rate": 7.461300340576128e-07, + "loss": 0.5519, + "step": 6779 + }, + { + "epoch": 3.294711246200608, + "grad_norm": 0.07132850809023118, + "learning_rate": 7.451241254331582e-07, + "loss": 0.5221, + "step": 6780 + }, + { + "epoch": 3.2951975683890575, + "grad_norm": 0.06837321276176714, + "learning_rate": 7.441188407379335e-07, + "loss": 0.4874, + "step": 6781 + }, + { + "epoch": 3.2956838905775077, + "grad_norm": 0.07240454055555363, + "learning_rate": 7.431141801193509e-07, + "loss": 0.5198, + "step": 6782 + }, + { + "epoch": 3.2961702127659573, + "grad_norm": 0.07196043095564707, + "learning_rate": 7.421101437247346e-07, + "loss": 0.5332, + "step": 6783 + }, + { + "epoch": 3.2966565349544075, + "grad_norm": 0.07009129817913741, + "learning_rate": 7.411067317013148e-07, + "loss": 0.4809, + "step": 6784 + }, + { + "epoch": 3.297142857142857, + "grad_norm": 0.07073461793953208, + "learning_rate": 7.401039441962293e-07, + "loss": 0.5092, + "step": 6785 + }, + { + "epoch": 3.297629179331307, + "grad_norm": 0.06732821441053059, + "learning_rate": 7.39101781356526e-07, + "loss": 0.4807, + "step": 6786 + }, + { + "epoch": 3.298115501519757, + "grad_norm": 0.07038959061073322, + "learning_rate": 7.381002433291612e-07, + "loss": 0.5157, + "step": 6787 + }, + { + "epoch": 3.2986018237082066, + "grad_norm": 0.07188046113026367, + "learning_rate": 7.370993302609986e-07, + "loss": 0.5143, + "step": 6788 + }, + { + "epoch": 3.2990881458966568, + "grad_norm": 0.0715737240485823, + "learning_rate": 7.360990422988101e-07, + "loss": 0.5059, + "step": 6789 + }, + { + "epoch": 3.2995744680851065, + "grad_norm": 0.07322841119648478, + "learning_rate": 7.35099379589278e-07, + "loss": 0.5228, + "step": 6790 + }, + { + "epoch": 3.300060790273556, + "grad_norm": 0.07221420468839182, + "learning_rate": 7.341003422789905e-07, + "loss": 0.4891, + "step": 6791 + }, + { + "epoch": 3.3005471124620063, + "grad_norm": 0.07151167475645931, + "learning_rate": 7.331019305144455e-07, + "loss": 0.5068, + "step": 6792 + }, + { + "epoch": 3.301033434650456, + "grad_norm": 0.07080602881298616, + "learning_rate": 7.321041444420479e-07, + "loss": 0.479, + "step": 6793 + }, + { + "epoch": 3.3015197568389056, + "grad_norm": 0.07479582354571418, + "learning_rate": 7.311069842081142e-07, + "loss": 0.5757, + "step": 6794 + }, + { + "epoch": 3.3020060790273558, + "grad_norm": 0.0716130319413144, + "learning_rate": 7.301104499588629e-07, + "loss": 0.5307, + "step": 6795 + }, + { + "epoch": 3.3024924012158055, + "grad_norm": 0.07347438582822559, + "learning_rate": 7.291145418404272e-07, + "loss": 0.525, + "step": 6796 + }, + { + "epoch": 3.302978723404255, + "grad_norm": 0.07230789927798162, + "learning_rate": 7.281192599988441e-07, + "loss": 0.5216, + "step": 6797 + }, + { + "epoch": 3.3034650455927053, + "grad_norm": 0.07220062514410652, + "learning_rate": 7.271246045800612e-07, + "loss": 0.5264, + "step": 6798 + }, + { + "epoch": 3.303951367781155, + "grad_norm": 0.07206019733268727, + "learning_rate": 7.261305757299336e-07, + "loss": 0.5165, + "step": 6799 + }, + { + "epoch": 3.304437689969605, + "grad_norm": 0.06917531688270125, + "learning_rate": 7.251371735942231e-07, + "loss": 0.5255, + "step": 6800 + }, + { + "epoch": 3.3049240121580548, + "grad_norm": 0.0702745406461957, + "learning_rate": 7.241443983186025e-07, + "loss": 0.5088, + "step": 6801 + }, + { + "epoch": 3.3054103343465044, + "grad_norm": 0.07011622230132807, + "learning_rate": 7.231522500486504e-07, + "loss": 0.5155, + "step": 6802 + }, + { + "epoch": 3.3058966565349546, + "grad_norm": 0.07116590907538135, + "learning_rate": 7.221607289298538e-07, + "loss": 0.5107, + "step": 6803 + }, + { + "epoch": 3.3063829787234043, + "grad_norm": 0.07221529877806088, + "learning_rate": 7.211698351076085e-07, + "loss": 0.5513, + "step": 6804 + }, + { + "epoch": 3.306869300911854, + "grad_norm": 0.07107072814013866, + "learning_rate": 7.201795687272178e-07, + "loss": 0.5244, + "step": 6805 + }, + { + "epoch": 3.307355623100304, + "grad_norm": 0.07233933774627571, + "learning_rate": 7.191899299338923e-07, + "loss": 0.4831, + "step": 6806 + }, + { + "epoch": 3.3078419452887537, + "grad_norm": 0.07173336054889858, + "learning_rate": 7.182009188727524e-07, + "loss": 0.4924, + "step": 6807 + }, + { + "epoch": 3.3083282674772034, + "grad_norm": 0.07273530116143037, + "learning_rate": 7.172125356888237e-07, + "loss": 0.5248, + "step": 6808 + }, + { + "epoch": 3.3088145896656536, + "grad_norm": 0.07074318123582363, + "learning_rate": 7.162247805270445e-07, + "loss": 0.5402, + "step": 6809 + }, + { + "epoch": 3.3093009118541032, + "grad_norm": 0.07047086884404011, + "learning_rate": 7.152376535322542e-07, + "loss": 0.5084, + "step": 6810 + }, + { + "epoch": 3.3097872340425534, + "grad_norm": 0.07055593819345828, + "learning_rate": 7.142511548492054e-07, + "loss": 0.5086, + "step": 6811 + }, + { + "epoch": 3.310273556231003, + "grad_norm": 0.07049889519653926, + "learning_rate": 7.132652846225563e-07, + "loss": 0.52, + "step": 6812 + }, + { + "epoch": 3.3107598784194527, + "grad_norm": 0.07112052853074773, + "learning_rate": 7.122800429968746e-07, + "loss": 0.4907, + "step": 6813 + }, + { + "epoch": 3.311246200607903, + "grad_norm": 0.07011189736779255, + "learning_rate": 7.112954301166341e-07, + "loss": 0.518, + "step": 6814 + }, + { + "epoch": 3.3117325227963526, + "grad_norm": 0.06903750164058604, + "learning_rate": 7.103114461262179e-07, + "loss": 0.4889, + "step": 6815 + }, + { + "epoch": 3.3122188449848027, + "grad_norm": 0.06806022432356529, + "learning_rate": 7.093280911699147e-07, + "loss": 0.4577, + "step": 6816 + }, + { + "epoch": 3.3127051671732524, + "grad_norm": 0.07044795974662547, + "learning_rate": 7.083453653919237e-07, + "loss": 0.5079, + "step": 6817 + }, + { + "epoch": 3.313191489361702, + "grad_norm": 0.07063160311300756, + "learning_rate": 7.073632689363485e-07, + "loss": 0.5069, + "step": 6818 + }, + { + "epoch": 3.3136778115501517, + "grad_norm": 0.0695024603128343, + "learning_rate": 7.063818019472046e-07, + "loss": 0.4834, + "step": 6819 + }, + { + "epoch": 3.314164133738602, + "grad_norm": 0.07205120122841974, + "learning_rate": 7.054009645684128e-07, + "loss": 0.5065, + "step": 6820 + }, + { + "epoch": 3.3146504559270515, + "grad_norm": 0.07092022520080976, + "learning_rate": 7.044207569438011e-07, + "loss": 0.5137, + "step": 6821 + }, + { + "epoch": 3.3151367781155017, + "grad_norm": 0.0684122819517937, + "learning_rate": 7.034411792171053e-07, + "loss": 0.4988, + "step": 6822 + }, + { + "epoch": 3.3156231003039514, + "grad_norm": 0.07116750965108525, + "learning_rate": 7.024622315319713e-07, + "loss": 0.5461, + "step": 6823 + }, + { + "epoch": 3.316109422492401, + "grad_norm": 0.0721253908927146, + "learning_rate": 7.014839140319485e-07, + "loss": 0.5113, + "step": 6824 + }, + { + "epoch": 3.316595744680851, + "grad_norm": 0.07034657572765021, + "learning_rate": 7.005062268604962e-07, + "loss": 0.4876, + "step": 6825 + }, + { + "epoch": 3.317082066869301, + "grad_norm": 0.07213556525075031, + "learning_rate": 6.995291701609824e-07, + "loss": 0.5047, + "step": 6826 + }, + { + "epoch": 3.317568389057751, + "grad_norm": 0.07033754126346546, + "learning_rate": 6.985527440766804e-07, + "loss": 0.5073, + "step": 6827 + }, + { + "epoch": 3.3180547112462007, + "grad_norm": 0.07055221284024986, + "learning_rate": 6.975769487507722e-07, + "loss": 0.5127, + "step": 6828 + }, + { + "epoch": 3.3185410334346503, + "grad_norm": 0.07329364576154067, + "learning_rate": 6.966017843263473e-07, + "loss": 0.479, + "step": 6829 + }, + { + "epoch": 3.3190273556231005, + "grad_norm": 0.07287549121511681, + "learning_rate": 6.956272509464024e-07, + "loss": 0.4868, + "step": 6830 + }, + { + "epoch": 3.31951367781155, + "grad_norm": 0.07079689226198173, + "learning_rate": 6.946533487538415e-07, + "loss": 0.4973, + "step": 6831 + }, + { + "epoch": 3.32, + "grad_norm": 0.07105703135005628, + "learning_rate": 6.93680077891477e-07, + "loss": 0.5193, + "step": 6832 + }, + { + "epoch": 3.32048632218845, + "grad_norm": 0.07442941352069911, + "learning_rate": 6.927074385020271e-07, + "loss": 0.5471, + "step": 6833 + }, + { + "epoch": 3.3209726443768997, + "grad_norm": 0.07201007580230237, + "learning_rate": 6.917354307281193e-07, + "loss": 0.5261, + "step": 6834 + }, + { + "epoch": 3.3214589665653493, + "grad_norm": 0.07306767546324147, + "learning_rate": 6.907640547122868e-07, + "loss": 0.5106, + "step": 6835 + }, + { + "epoch": 3.3219452887537995, + "grad_norm": 0.07297965829683148, + "learning_rate": 6.897933105969701e-07, + "loss": 0.5401, + "step": 6836 + }, + { + "epoch": 3.322431610942249, + "grad_norm": 0.0703237212977566, + "learning_rate": 6.888231985245197e-07, + "loss": 0.5119, + "step": 6837 + }, + { + "epoch": 3.3229179331306993, + "grad_norm": 0.06833607035733347, + "learning_rate": 6.878537186371914e-07, + "loss": 0.4886, + "step": 6838 + }, + { + "epoch": 3.323404255319149, + "grad_norm": 0.06941673468252307, + "learning_rate": 6.868848710771469e-07, + "loss": 0.5058, + "step": 6839 + }, + { + "epoch": 3.3238905775075986, + "grad_norm": 0.06976322127009543, + "learning_rate": 6.859166559864571e-07, + "loss": 0.516, + "step": 6840 + }, + { + "epoch": 3.3243768996960488, + "grad_norm": 0.07247747356576587, + "learning_rate": 6.849490735071008e-07, + "loss": 0.5437, + "step": 6841 + }, + { + "epoch": 3.3248632218844985, + "grad_norm": 0.07020835253166453, + "learning_rate": 6.839821237809613e-07, + "loss": 0.5111, + "step": 6842 + }, + { + "epoch": 3.325349544072948, + "grad_norm": 0.0727483312302522, + "learning_rate": 6.830158069498322e-07, + "loss": 0.538, + "step": 6843 + }, + { + "epoch": 3.3258358662613983, + "grad_norm": 0.07098719836907831, + "learning_rate": 6.820501231554121e-07, + "loss": 0.5023, + "step": 6844 + }, + { + "epoch": 3.326322188449848, + "grad_norm": 0.07215865030941816, + "learning_rate": 6.810850725393081e-07, + "loss": 0.5043, + "step": 6845 + }, + { + "epoch": 3.3268085106382976, + "grad_norm": 0.07129774083883823, + "learning_rate": 6.801206552430334e-07, + "loss": 0.4993, + "step": 6846 + }, + { + "epoch": 3.3272948328267478, + "grad_norm": 0.07098179715158444, + "learning_rate": 6.791568714080093e-07, + "loss": 0.5025, + "step": 6847 + }, + { + "epoch": 3.3277811550151974, + "grad_norm": 0.07039916869682948, + "learning_rate": 6.78193721175564e-07, + "loss": 0.4878, + "step": 6848 + }, + { + "epoch": 3.3282674772036476, + "grad_norm": 0.07004807825599091, + "learning_rate": 6.772312046869317e-07, + "loss": 0.5012, + "step": 6849 + }, + { + "epoch": 3.3287537993920973, + "grad_norm": 0.07385155510415911, + "learning_rate": 6.762693220832551e-07, + "loss": 0.5397, + "step": 6850 + }, + { + "epoch": 3.329240121580547, + "grad_norm": 0.06900376933849205, + "learning_rate": 6.753080735055828e-07, + "loss": 0.524, + "step": 6851 + }, + { + "epoch": 3.329726443768997, + "grad_norm": 0.07223504908694708, + "learning_rate": 6.743474590948718e-07, + "loss": 0.5044, + "step": 6852 + }, + { + "epoch": 3.3302127659574468, + "grad_norm": 0.0725330101992085, + "learning_rate": 6.733874789919847e-07, + "loss": 0.5302, + "step": 6853 + }, + { + "epoch": 3.330699088145897, + "grad_norm": 0.07151248169613576, + "learning_rate": 6.724281333376919e-07, + "loss": 0.4921, + "step": 6854 + }, + { + "epoch": 3.3311854103343466, + "grad_norm": 0.07296942855063826, + "learning_rate": 6.714694222726703e-07, + "loss": 0.526, + "step": 6855 + }, + { + "epoch": 3.3316717325227962, + "grad_norm": 0.06901844245211027, + "learning_rate": 6.705113459375046e-07, + "loss": 0.4962, + "step": 6856 + }, + { + "epoch": 3.3321580547112464, + "grad_norm": 0.07019486686507174, + "learning_rate": 6.695539044726851e-07, + "loss": 0.4893, + "step": 6857 + }, + { + "epoch": 3.332644376899696, + "grad_norm": 0.06924531756890973, + "learning_rate": 6.685970980186107e-07, + "loss": 0.4823, + "step": 6858 + }, + { + "epoch": 3.3331306990881457, + "grad_norm": 0.07405629083660108, + "learning_rate": 6.676409267155847e-07, + "loss": 0.5141, + "step": 6859 + }, + { + "epoch": 3.333617021276596, + "grad_norm": 0.07135188527271055, + "learning_rate": 6.666853907038201e-07, + "loss": 0.5186, + "step": 6860 + }, + { + "epoch": 3.3341033434650456, + "grad_norm": 0.07553585945008054, + "learning_rate": 6.657304901234346e-07, + "loss": 0.5827, + "step": 6861 + }, + { + "epoch": 3.3345896656534952, + "grad_norm": 0.07176905115831651, + "learning_rate": 6.647762251144541e-07, + "loss": 0.5327, + "step": 6862 + }, + { + "epoch": 3.3350759878419454, + "grad_norm": 0.07161461562606672, + "learning_rate": 6.638225958168104e-07, + "loss": 0.501, + "step": 6863 + }, + { + "epoch": 3.335562310030395, + "grad_norm": 0.0679051608513659, + "learning_rate": 6.628696023703424e-07, + "loss": 0.471, + "step": 6864 + }, + { + "epoch": 3.336048632218845, + "grad_norm": 0.07047009248565354, + "learning_rate": 6.619172449147953e-07, + "loss": 0.5208, + "step": 6865 + }, + { + "epoch": 3.336534954407295, + "grad_norm": 0.07325616084016405, + "learning_rate": 6.609655235898227e-07, + "loss": 0.5145, + "step": 6866 + }, + { + "epoch": 3.3370212765957445, + "grad_norm": 0.07306992308508012, + "learning_rate": 6.600144385349833e-07, + "loss": 0.5265, + "step": 6867 + }, + { + "epoch": 3.3375075987841947, + "grad_norm": 0.06890257583881561, + "learning_rate": 6.590639898897421e-07, + "loss": 0.5007, + "step": 6868 + }, + { + "epoch": 3.3379939209726444, + "grad_norm": 0.07052569019779599, + "learning_rate": 6.581141777934724e-07, + "loss": 0.5223, + "step": 6869 + }, + { + "epoch": 3.338480243161094, + "grad_norm": 0.07156611360513149, + "learning_rate": 6.571650023854531e-07, + "loss": 0.5173, + "step": 6870 + }, + { + "epoch": 3.338966565349544, + "grad_norm": 0.07390972790522173, + "learning_rate": 6.562164638048712e-07, + "loss": 0.5271, + "step": 6871 + }, + { + "epoch": 3.339452887537994, + "grad_norm": 0.07022824836649157, + "learning_rate": 6.552685621908155e-07, + "loss": 0.4915, + "step": 6872 + }, + { + "epoch": 3.3399392097264435, + "grad_norm": 0.06812410937204921, + "learning_rate": 6.543212976822894e-07, + "loss": 0.4856, + "step": 6873 + }, + { + "epoch": 3.3404255319148937, + "grad_norm": 0.06763439434837915, + "learning_rate": 6.533746704181959e-07, + "loss": 0.4865, + "step": 6874 + }, + { + "epoch": 3.3409118541033433, + "grad_norm": 0.06985526175797392, + "learning_rate": 6.524286805373475e-07, + "loss": 0.5106, + "step": 6875 + }, + { + "epoch": 3.3413981762917935, + "grad_norm": 0.07301500226376727, + "learning_rate": 6.514833281784638e-07, + "loss": 0.5249, + "step": 6876 + }, + { + "epoch": 3.341884498480243, + "grad_norm": 0.07312317989562328, + "learning_rate": 6.505386134801688e-07, + "loss": 0.4944, + "step": 6877 + }, + { + "epoch": 3.342370820668693, + "grad_norm": 0.06991911870995818, + "learning_rate": 6.495945365809947e-07, + "loss": 0.4948, + "step": 6878 + }, + { + "epoch": 3.342857142857143, + "grad_norm": 0.07354675339985596, + "learning_rate": 6.486510976193799e-07, + "loss": 0.5039, + "step": 6879 + }, + { + "epoch": 3.3433434650455927, + "grad_norm": 0.07229525841049106, + "learning_rate": 6.47708296733669e-07, + "loss": 0.5039, + "step": 6880 + }, + { + "epoch": 3.343829787234043, + "grad_norm": 0.07199884472693327, + "learning_rate": 6.467661340621129e-07, + "loss": 0.5033, + "step": 6881 + }, + { + "epoch": 3.3443161094224925, + "grad_norm": 0.07170618104931153, + "learning_rate": 6.458246097428689e-07, + "loss": 0.5162, + "step": 6882 + }, + { + "epoch": 3.344802431610942, + "grad_norm": 0.07172488898901912, + "learning_rate": 6.448837239140004e-07, + "loss": 0.5381, + "step": 6883 + }, + { + "epoch": 3.3452887537993923, + "grad_norm": 0.07340970993169409, + "learning_rate": 6.439434767134789e-07, + "loss": 0.4978, + "step": 6884 + }, + { + "epoch": 3.345775075987842, + "grad_norm": 0.07489428138614118, + "learning_rate": 6.430038682791795e-07, + "loss": 0.536, + "step": 6885 + }, + { + "epoch": 3.3462613981762916, + "grad_norm": 0.06890991108619181, + "learning_rate": 6.420648987488876e-07, + "loss": 0.469, + "step": 6886 + }, + { + "epoch": 3.3467477203647418, + "grad_norm": 0.07159045268877802, + "learning_rate": 6.411265682602891e-07, + "loss": 0.5299, + "step": 6887 + }, + { + "epoch": 3.3472340425531915, + "grad_norm": 0.06987965818335329, + "learning_rate": 6.401888769509812e-07, + "loss": 0.4956, + "step": 6888 + }, + { + "epoch": 3.347720364741641, + "grad_norm": 0.07122874921424016, + "learning_rate": 6.392518249584656e-07, + "loss": 0.4741, + "step": 6889 + }, + { + "epoch": 3.3482066869300913, + "grad_norm": 0.07095585158552611, + "learning_rate": 6.383154124201496e-07, + "loss": 0.5256, + "step": 6890 + }, + { + "epoch": 3.348693009118541, + "grad_norm": 0.07465843087894065, + "learning_rate": 6.373796394733489e-07, + "loss": 0.5357, + "step": 6891 + }, + { + "epoch": 3.349179331306991, + "grad_norm": 0.07424499235580542, + "learning_rate": 6.364445062552832e-07, + "loss": 0.5394, + "step": 6892 + }, + { + "epoch": 3.3496656534954408, + "grad_norm": 0.07305714454200418, + "learning_rate": 6.355100129030794e-07, + "loss": 0.5015, + "step": 6893 + }, + { + "epoch": 3.3501519756838904, + "grad_norm": 0.07193012438733509, + "learning_rate": 6.345761595537698e-07, + "loss": 0.5187, + "step": 6894 + }, + { + "epoch": 3.3506382978723406, + "grad_norm": 0.0696762064376855, + "learning_rate": 6.336429463442939e-07, + "loss": 0.4847, + "step": 6895 + }, + { + "epoch": 3.3511246200607903, + "grad_norm": 0.06980664385411242, + "learning_rate": 6.327103734114965e-07, + "loss": 0.5123, + "step": 6896 + }, + { + "epoch": 3.35161094224924, + "grad_norm": 0.06987357360157116, + "learning_rate": 6.31778440892129e-07, + "loss": 0.4915, + "step": 6897 + }, + { + "epoch": 3.35209726443769, + "grad_norm": 0.07077963651500906, + "learning_rate": 6.308471489228491e-07, + "loss": 0.5145, + "step": 6898 + }, + { + "epoch": 3.3525835866261398, + "grad_norm": 0.06990134387597335, + "learning_rate": 6.299164976402195e-07, + "loss": 0.5102, + "step": 6899 + }, + { + "epoch": 3.3530699088145894, + "grad_norm": 0.07007511425795404, + "learning_rate": 6.2898648718071e-07, + "loss": 0.4967, + "step": 6900 + }, + { + "epoch": 3.3535562310030396, + "grad_norm": 0.07039322230102411, + "learning_rate": 6.280571176806971e-07, + "loss": 0.5179, + "step": 6901 + }, + { + "epoch": 3.3540425531914893, + "grad_norm": 0.07088766077186194, + "learning_rate": 6.271283892764602e-07, + "loss": 0.5125, + "step": 6902 + }, + { + "epoch": 3.3545288753799394, + "grad_norm": 0.07012555991426875, + "learning_rate": 6.262003021041873e-07, + "loss": 0.5011, + "step": 6903 + }, + { + "epoch": 3.355015197568389, + "grad_norm": 0.07259479134508723, + "learning_rate": 6.252728562999727e-07, + "loss": 0.5299, + "step": 6904 + }, + { + "epoch": 3.3555015197568387, + "grad_norm": 0.07233588008061326, + "learning_rate": 6.243460519998156e-07, + "loss": 0.5288, + "step": 6905 + }, + { + "epoch": 3.355987841945289, + "grad_norm": 0.07197398806366222, + "learning_rate": 6.234198893396209e-07, + "loss": 0.5156, + "step": 6906 + }, + { + "epoch": 3.3564741641337386, + "grad_norm": 0.07352762733286107, + "learning_rate": 6.224943684551998e-07, + "loss": 0.5266, + "step": 6907 + }, + { + "epoch": 3.3569604863221887, + "grad_norm": 0.07493186335683674, + "learning_rate": 6.215694894822699e-07, + "loss": 0.523, + "step": 6908 + }, + { + "epoch": 3.3574468085106384, + "grad_norm": 0.07445561573799665, + "learning_rate": 6.206452525564533e-07, + "loss": 0.4994, + "step": 6909 + }, + { + "epoch": 3.357933130699088, + "grad_norm": 0.07052407039126818, + "learning_rate": 6.197216578132803e-07, + "loss": 0.498, + "step": 6910 + }, + { + "epoch": 3.358419452887538, + "grad_norm": 0.07012363060295893, + "learning_rate": 6.187987053881845e-07, + "loss": 0.4722, + "step": 6911 + }, + { + "epoch": 3.358905775075988, + "grad_norm": 0.07261266834039559, + "learning_rate": 6.178763954165068e-07, + "loss": 0.5213, + "step": 6912 + }, + { + "epoch": 3.3593920972644375, + "grad_norm": 0.07232269041700885, + "learning_rate": 6.169547280334937e-07, + "loss": 0.5275, + "step": 6913 + }, + { + "epoch": 3.3598784194528877, + "grad_norm": 0.07164236571973215, + "learning_rate": 6.16033703374297e-07, + "loss": 0.5135, + "step": 6914 + }, + { + "epoch": 3.3603647416413374, + "grad_norm": 0.07025702522424446, + "learning_rate": 6.151133215739752e-07, + "loss": 0.5064, + "step": 6915 + }, + { + "epoch": 3.360851063829787, + "grad_norm": 0.0749076829503178, + "learning_rate": 6.141935827674905e-07, + "loss": 0.5305, + "step": 6916 + }, + { + "epoch": 3.361337386018237, + "grad_norm": 0.07331282874334093, + "learning_rate": 6.132744870897122e-07, + "loss": 0.546, + "step": 6917 + }, + { + "epoch": 3.361823708206687, + "grad_norm": 0.07315131009487047, + "learning_rate": 6.123560346754165e-07, + "loss": 0.5186, + "step": 6918 + }, + { + "epoch": 3.362310030395137, + "grad_norm": 0.07163492403134779, + "learning_rate": 6.114382256592826e-07, + "loss": 0.4884, + "step": 6919 + }, + { + "epoch": 3.3627963525835867, + "grad_norm": 0.07188716509215164, + "learning_rate": 6.105210601758982e-07, + "loss": 0.5243, + "step": 6920 + }, + { + "epoch": 3.3632826747720364, + "grad_norm": 0.07093292399676515, + "learning_rate": 6.096045383597537e-07, + "loss": 0.4748, + "step": 6921 + }, + { + "epoch": 3.3637689969604865, + "grad_norm": 0.07095474269585418, + "learning_rate": 6.08688660345248e-07, + "loss": 0.5043, + "step": 6922 + }, + { + "epoch": 3.364255319148936, + "grad_norm": 0.07485676589207554, + "learning_rate": 6.077734262666834e-07, + "loss": 0.5328, + "step": 6923 + }, + { + "epoch": 3.364741641337386, + "grad_norm": 0.06989839092264896, + "learning_rate": 6.06858836258269e-07, + "loss": 0.4727, + "step": 6924 + }, + { + "epoch": 3.365227963525836, + "grad_norm": 0.07345294628496163, + "learning_rate": 6.059448904541182e-07, + "loss": 0.5133, + "step": 6925 + }, + { + "epoch": 3.3657142857142857, + "grad_norm": 0.07248750660668703, + "learning_rate": 6.050315889882519e-07, + "loss": 0.509, + "step": 6926 + }, + { + "epoch": 3.3662006079027353, + "grad_norm": 0.07255466582114804, + "learning_rate": 6.04118931994594e-07, + "loss": 0.4967, + "step": 6927 + }, + { + "epoch": 3.3666869300911855, + "grad_norm": 0.06982519439786458, + "learning_rate": 6.032069196069773e-07, + "loss": 0.5053, + "step": 6928 + }, + { + "epoch": 3.367173252279635, + "grad_norm": 0.07180227655168682, + "learning_rate": 6.022955519591367e-07, + "loss": 0.4845, + "step": 6929 + }, + { + "epoch": 3.3676595744680853, + "grad_norm": 0.06945925797893547, + "learning_rate": 6.013848291847152e-07, + "loss": 0.505, + "step": 6930 + }, + { + "epoch": 3.368145896656535, + "grad_norm": 0.07040127217085423, + "learning_rate": 6.004747514172576e-07, + "loss": 0.5031, + "step": 6931 + }, + { + "epoch": 3.3686322188449846, + "grad_norm": 0.0716278952251427, + "learning_rate": 5.995653187902178e-07, + "loss": 0.5159, + "step": 6932 + }, + { + "epoch": 3.3691185410334348, + "grad_norm": 0.07078209420303555, + "learning_rate": 5.986565314369541e-07, + "loss": 0.5199, + "step": 6933 + }, + { + "epoch": 3.3696048632218845, + "grad_norm": 0.07047849202980369, + "learning_rate": 5.977483894907294e-07, + "loss": 0.5216, + "step": 6934 + }, + { + "epoch": 3.3700911854103346, + "grad_norm": 0.07026331066333459, + "learning_rate": 5.968408930847125e-07, + "loss": 0.4904, + "step": 6935 + }, + { + "epoch": 3.3705775075987843, + "grad_norm": 0.07584883007700144, + "learning_rate": 5.959340423519777e-07, + "loss": 0.5275, + "step": 6936 + }, + { + "epoch": 3.371063829787234, + "grad_norm": 0.07145854236577773, + "learning_rate": 5.950278374255036e-07, + "loss": 0.5122, + "step": 6937 + }, + { + "epoch": 3.371550151975684, + "grad_norm": 0.06994631048614783, + "learning_rate": 5.941222784381756e-07, + "loss": 0.4609, + "step": 6938 + }, + { + "epoch": 3.3720364741641338, + "grad_norm": 0.07289834743011642, + "learning_rate": 5.932173655227835e-07, + "loss": 0.5201, + "step": 6939 + }, + { + "epoch": 3.3725227963525835, + "grad_norm": 0.07009972446600306, + "learning_rate": 5.923130988120223e-07, + "loss": 0.5027, + "step": 6940 + }, + { + "epoch": 3.3730091185410336, + "grad_norm": 0.07182732171826438, + "learning_rate": 5.914094784384927e-07, + "loss": 0.5036, + "step": 6941 + }, + { + "epoch": 3.3734954407294833, + "grad_norm": 0.06978282640318427, + "learning_rate": 5.905065045347002e-07, + "loss": 0.4857, + "step": 6942 + }, + { + "epoch": 3.373981762917933, + "grad_norm": 0.07292716115234207, + "learning_rate": 5.896041772330558e-07, + "loss": 0.5062, + "step": 6943 + }, + { + "epoch": 3.374468085106383, + "grad_norm": 0.0694177295780197, + "learning_rate": 5.88702496665875e-07, + "loss": 0.5034, + "step": 6944 + }, + { + "epoch": 3.3749544072948328, + "grad_norm": 0.07184645671495853, + "learning_rate": 5.8780146296538e-07, + "loss": 0.5274, + "step": 6945 + }, + { + "epoch": 3.375440729483283, + "grad_norm": 0.07204979439826342, + "learning_rate": 5.869010762636962e-07, + "loss": 0.502, + "step": 6946 + }, + { + "epoch": 3.3759270516717326, + "grad_norm": 0.07137765799661595, + "learning_rate": 5.860013366928558e-07, + "loss": 0.4835, + "step": 6947 + }, + { + "epoch": 3.3764133738601823, + "grad_norm": 0.07113765494398433, + "learning_rate": 5.851022443847948e-07, + "loss": 0.4734, + "step": 6948 + }, + { + "epoch": 3.3768996960486324, + "grad_norm": 0.07076144058410078, + "learning_rate": 5.842037994713551e-07, + "loss": 0.5181, + "step": 6949 + }, + { + "epoch": 3.377386018237082, + "grad_norm": 0.07213018338053362, + "learning_rate": 5.83306002084284e-07, + "loss": 0.4971, + "step": 6950 + }, + { + "epoch": 3.3778723404255317, + "grad_norm": 0.06849450134607875, + "learning_rate": 5.824088523552323e-07, + "loss": 0.4902, + "step": 6951 + }, + { + "epoch": 3.378358662613982, + "grad_norm": 0.07235153708470951, + "learning_rate": 5.815123504157577e-07, + "loss": 0.5262, + "step": 6952 + }, + { + "epoch": 3.3788449848024316, + "grad_norm": 0.06997504362258379, + "learning_rate": 5.806164963973216e-07, + "loss": 0.5187, + "step": 6953 + }, + { + "epoch": 3.3793313069908812, + "grad_norm": 0.07116715910091094, + "learning_rate": 5.79721290431291e-07, + "loss": 0.5121, + "step": 6954 + }, + { + "epoch": 3.3798176291793314, + "grad_norm": 0.0693753037815743, + "learning_rate": 5.788267326489372e-07, + "loss": 0.4834, + "step": 6955 + }, + { + "epoch": 3.380303951367781, + "grad_norm": 0.07463094711201416, + "learning_rate": 5.779328231814374e-07, + "loss": 0.5291, + "step": 6956 + }, + { + "epoch": 3.380790273556231, + "grad_norm": 0.07289396407672954, + "learning_rate": 5.770395621598734e-07, + "loss": 0.526, + "step": 6957 + }, + { + "epoch": 3.381276595744681, + "grad_norm": 0.07195755756508643, + "learning_rate": 5.761469497152317e-07, + "loss": 0.5137, + "step": 6958 + }, + { + "epoch": 3.3817629179331306, + "grad_norm": 0.071978917504858, + "learning_rate": 5.752549859784034e-07, + "loss": 0.5335, + "step": 6959 + }, + { + "epoch": 3.3822492401215807, + "grad_norm": 0.0703328971028879, + "learning_rate": 5.743636710801848e-07, + "loss": 0.5032, + "step": 6960 + }, + { + "epoch": 3.3827355623100304, + "grad_norm": 0.06820713659481933, + "learning_rate": 5.734730051512777e-07, + "loss": 0.4686, + "step": 6961 + }, + { + "epoch": 3.3832218844984805, + "grad_norm": 0.06944420384106066, + "learning_rate": 5.725829883222877e-07, + "loss": 0.4972, + "step": 6962 + }, + { + "epoch": 3.38370820668693, + "grad_norm": 0.0707031295099363, + "learning_rate": 5.716936207237261e-07, + "loss": 0.5148, + "step": 6963 + }, + { + "epoch": 3.38419452887538, + "grad_norm": 0.0699244552159447, + "learning_rate": 5.708049024860085e-07, + "loss": 0.4831, + "step": 6964 + }, + { + "epoch": 3.3846808510638295, + "grad_norm": 0.0723919186975563, + "learning_rate": 5.699168337394545e-07, + "loss": 0.5136, + "step": 6965 + }, + { + "epoch": 3.3851671732522797, + "grad_norm": 0.07104080541784422, + "learning_rate": 5.690294146142899e-07, + "loss": 0.4973, + "step": 6966 + }, + { + "epoch": 3.3856534954407294, + "grad_norm": 0.07396129154565106, + "learning_rate": 5.681426452406453e-07, + "loss": 0.534, + "step": 6967 + }, + { + "epoch": 3.3861398176291795, + "grad_norm": 0.07233664955810466, + "learning_rate": 5.67256525748554e-07, + "loss": 0.4943, + "step": 6968 + }, + { + "epoch": 3.386626139817629, + "grad_norm": 0.0709888910446805, + "learning_rate": 5.663710562679564e-07, + "loss": 0.5103, + "step": 6969 + }, + { + "epoch": 3.387112462006079, + "grad_norm": 0.07344730493274089, + "learning_rate": 5.654862369286962e-07, + "loss": 0.5157, + "step": 6970 + }, + { + "epoch": 3.387598784194529, + "grad_norm": 0.07036358846668035, + "learning_rate": 5.646020678605219e-07, + "loss": 0.5054, + "step": 6971 + }, + { + "epoch": 3.3880851063829787, + "grad_norm": 0.07557695558944763, + "learning_rate": 5.637185491930875e-07, + "loss": 0.5619, + "step": 6972 + }, + { + "epoch": 3.388571428571429, + "grad_norm": 0.07301760348337281, + "learning_rate": 5.628356810559499e-07, + "loss": 0.5083, + "step": 6973 + }, + { + "epoch": 3.3890577507598785, + "grad_norm": 0.06970113006259425, + "learning_rate": 5.619534635785729e-07, + "loss": 0.4926, + "step": 6974 + }, + { + "epoch": 3.389544072948328, + "grad_norm": 0.07082295446846913, + "learning_rate": 5.610718968903228e-07, + "loss": 0.5258, + "step": 6975 + }, + { + "epoch": 3.3900303951367783, + "grad_norm": 0.07610425384451482, + "learning_rate": 5.60190981120472e-07, + "loss": 0.5703, + "step": 6976 + }, + { + "epoch": 3.390516717325228, + "grad_norm": 0.06964598441979405, + "learning_rate": 5.593107163981959e-07, + "loss": 0.509, + "step": 6977 + }, + { + "epoch": 3.3910030395136777, + "grad_norm": 0.07538918402249042, + "learning_rate": 5.584311028525774e-07, + "loss": 0.5478, + "step": 6978 + }, + { + "epoch": 3.391489361702128, + "grad_norm": 0.07284115855131425, + "learning_rate": 5.575521406125989e-07, + "loss": 0.5361, + "step": 6979 + }, + { + "epoch": 3.3919756838905775, + "grad_norm": 0.07071726931141209, + "learning_rate": 5.566738298071522e-07, + "loss": 0.4765, + "step": 6980 + }, + { + "epoch": 3.392462006079027, + "grad_norm": 0.0725478087966237, + "learning_rate": 5.557961705650294e-07, + "loss": 0.5194, + "step": 6981 + }, + { + "epoch": 3.3929483282674773, + "grad_norm": 0.07129352237661303, + "learning_rate": 5.549191630149326e-07, + "loss": 0.4878, + "step": 6982 + }, + { + "epoch": 3.393434650455927, + "grad_norm": 0.07253398573983671, + "learning_rate": 5.540428072854626e-07, + "loss": 0.5306, + "step": 6983 + }, + { + "epoch": 3.393920972644377, + "grad_norm": 0.07224592433579474, + "learning_rate": 5.531671035051278e-07, + "loss": 0.5263, + "step": 6984 + }, + { + "epoch": 3.3944072948328268, + "grad_norm": 0.07048070394112615, + "learning_rate": 5.522920518023406e-07, + "loss": 0.5177, + "step": 6985 + }, + { + "epoch": 3.3948936170212765, + "grad_norm": 0.06932192918130829, + "learning_rate": 5.514176523054166e-07, + "loss": 0.4835, + "step": 6986 + }, + { + "epoch": 3.3953799392097266, + "grad_norm": 0.07168886718069899, + "learning_rate": 5.50543905142577e-07, + "loss": 0.5108, + "step": 6987 + }, + { + "epoch": 3.3958662613981763, + "grad_norm": 0.07121659000177742, + "learning_rate": 5.496708104419468e-07, + "loss": 0.5295, + "step": 6988 + }, + { + "epoch": 3.396352583586626, + "grad_norm": 0.06974675909586693, + "learning_rate": 5.487983683315556e-07, + "loss": 0.4906, + "step": 6989 + }, + { + "epoch": 3.396838905775076, + "grad_norm": 0.07027974009588528, + "learning_rate": 5.479265789393368e-07, + "loss": 0.4915, + "step": 6990 + }, + { + "epoch": 3.3973252279635258, + "grad_norm": 0.07050969042971653, + "learning_rate": 5.470554423931285e-07, + "loss": 0.505, + "step": 6991 + }, + { + "epoch": 3.3978115501519754, + "grad_norm": 0.07264894599366457, + "learning_rate": 5.461849588206725e-07, + "loss": 0.5111, + "step": 6992 + }, + { + "epoch": 3.3982978723404256, + "grad_norm": 0.07288463132939543, + "learning_rate": 5.453151283496177e-07, + "loss": 0.537, + "step": 6993 + }, + { + "epoch": 3.3987841945288753, + "grad_norm": 0.07090308068330071, + "learning_rate": 5.444459511075117e-07, + "loss": 0.5371, + "step": 6994 + }, + { + "epoch": 3.3992705167173254, + "grad_norm": 0.07039122724116818, + "learning_rate": 5.435774272218109e-07, + "loss": 0.4694, + "step": 6995 + }, + { + "epoch": 3.399756838905775, + "grad_norm": 0.07111096784485435, + "learning_rate": 5.427095568198743e-07, + "loss": 0.5052, + "step": 6996 + }, + { + "epoch": 3.4002431610942248, + "grad_norm": 0.07019330839702344, + "learning_rate": 5.418423400289651e-07, + "loss": 0.4978, + "step": 6997 + }, + { + "epoch": 3.400729483282675, + "grad_norm": 0.07133190991310967, + "learning_rate": 5.409757769762514e-07, + "loss": 0.5128, + "step": 6998 + }, + { + "epoch": 3.4012158054711246, + "grad_norm": 0.07080585057474179, + "learning_rate": 5.401098677888029e-07, + "loss": 0.5158, + "step": 6999 + }, + { + "epoch": 3.4017021276595747, + "grad_norm": 0.07491017341842574, + "learning_rate": 5.392446125935985e-07, + "loss": 0.5764, + "step": 7000 + }, + { + "epoch": 3.4021884498480244, + "grad_norm": 0.0709200181091196, + "learning_rate": 5.383800115175159e-07, + "loss": 0.5142, + "step": 7001 + }, + { + "epoch": 3.402674772036474, + "grad_norm": 0.07251055782223502, + "learning_rate": 5.375160646873395e-07, + "loss": 0.5414, + "step": 7002 + }, + { + "epoch": 3.403161094224924, + "grad_norm": 0.06818878883827324, + "learning_rate": 5.366527722297577e-07, + "loss": 0.4578, + "step": 7003 + }, + { + "epoch": 3.403647416413374, + "grad_norm": 0.07359690497362702, + "learning_rate": 5.357901342713623e-07, + "loss": 0.5461, + "step": 7004 + }, + { + "epoch": 3.4041337386018236, + "grad_norm": 0.06994053916105168, + "learning_rate": 5.349281509386489e-07, + "loss": 0.4954, + "step": 7005 + }, + { + "epoch": 3.4046200607902737, + "grad_norm": 0.0761199202333254, + "learning_rate": 5.340668223580181e-07, + "loss": 0.5306, + "step": 7006 + }, + { + "epoch": 3.4051063829787234, + "grad_norm": 0.06982552015233777, + "learning_rate": 5.332061486557738e-07, + "loss": 0.4884, + "step": 7007 + }, + { + "epoch": 3.405592705167173, + "grad_norm": 0.07176320925514097, + "learning_rate": 5.32346129958125e-07, + "loss": 0.5174, + "step": 7008 + }, + { + "epoch": 3.406079027355623, + "grad_norm": 0.07107531987896348, + "learning_rate": 5.314867663911816e-07, + "loss": 0.5325, + "step": 7009 + }, + { + "epoch": 3.406565349544073, + "grad_norm": 0.07292283770107981, + "learning_rate": 5.306280580809609e-07, + "loss": 0.5568, + "step": 7010 + }, + { + "epoch": 3.407051671732523, + "grad_norm": 0.0712032282326734, + "learning_rate": 5.297700051533816e-07, + "loss": 0.5369, + "step": 7011 + }, + { + "epoch": 3.4075379939209727, + "grad_norm": 0.07087962842437506, + "learning_rate": 5.289126077342687e-07, + "loss": 0.4855, + "step": 7012 + }, + { + "epoch": 3.4080243161094224, + "grad_norm": 0.07078355715841247, + "learning_rate": 5.280558659493495e-07, + "loss": 0.4856, + "step": 7013 + }, + { + "epoch": 3.4085106382978725, + "grad_norm": 0.07036367780252621, + "learning_rate": 5.271997799242551e-07, + "loss": 0.4898, + "step": 7014 + }, + { + "epoch": 3.408996960486322, + "grad_norm": 0.07348879404029077, + "learning_rate": 5.263443497845211e-07, + "loss": 0.5336, + "step": 7015 + }, + { + "epoch": 3.409483282674772, + "grad_norm": 0.07236918066119626, + "learning_rate": 5.254895756555861e-07, + "loss": 0.5347, + "step": 7016 + }, + { + "epoch": 3.409969604863222, + "grad_norm": 0.07193884345986701, + "learning_rate": 5.246354576627927e-07, + "loss": 0.5326, + "step": 7017 + }, + { + "epoch": 3.4104559270516717, + "grad_norm": 0.0710068962169839, + "learning_rate": 5.237819959313895e-07, + "loss": 0.4693, + "step": 7018 + }, + { + "epoch": 3.4109422492401213, + "grad_norm": 0.07259084261867696, + "learning_rate": 5.229291905865252e-07, + "loss": 0.5452, + "step": 7019 + }, + { + "epoch": 3.4114285714285715, + "grad_norm": 0.07168642138661631, + "learning_rate": 5.220770417532551e-07, + "loss": 0.5031, + "step": 7020 + }, + { + "epoch": 3.411914893617021, + "grad_norm": 0.07363813709197267, + "learning_rate": 5.21225549556536e-07, + "loss": 0.5068, + "step": 7021 + }, + { + "epoch": 3.4124012158054713, + "grad_norm": 0.07271955634676451, + "learning_rate": 5.203747141212318e-07, + "loss": 0.5212, + "step": 7022 + }, + { + "epoch": 3.412887537993921, + "grad_norm": 0.07234699811117531, + "learning_rate": 5.195245355721051e-07, + "loss": 0.5188, + "step": 7023 + }, + { + "epoch": 3.4133738601823707, + "grad_norm": 0.07044012315922774, + "learning_rate": 5.186750140338265e-07, + "loss": 0.5, + "step": 7024 + }, + { + "epoch": 3.413860182370821, + "grad_norm": 0.06927884975579712, + "learning_rate": 5.178261496309678e-07, + "loss": 0.491, + "step": 7025 + }, + { + "epoch": 3.4143465045592705, + "grad_norm": 0.07385042443044354, + "learning_rate": 5.169779424880056e-07, + "loss": 0.5075, + "step": 7026 + }, + { + "epoch": 3.4148328267477206, + "grad_norm": 0.06872762242420284, + "learning_rate": 5.161303927293204e-07, + "loss": 0.4499, + "step": 7027 + }, + { + "epoch": 3.4153191489361703, + "grad_norm": 0.07087068488454867, + "learning_rate": 5.152835004791951e-07, + "loss": 0.4861, + "step": 7028 + }, + { + "epoch": 3.41580547112462, + "grad_norm": 0.07211812711599211, + "learning_rate": 5.144372658618175e-07, + "loss": 0.5175, + "step": 7029 + }, + { + "epoch": 3.41629179331307, + "grad_norm": 0.07242957922778535, + "learning_rate": 5.135916890012776e-07, + "loss": 0.5278, + "step": 7030 + }, + { + "epoch": 3.4167781155015198, + "grad_norm": 0.0681669124635261, + "learning_rate": 5.127467700215705e-07, + "loss": 0.5018, + "step": 7031 + }, + { + "epoch": 3.4172644376899695, + "grad_norm": 0.07093977691217154, + "learning_rate": 5.119025090465929e-07, + "loss": 0.5003, + "step": 7032 + }, + { + "epoch": 3.4177507598784196, + "grad_norm": 0.07215714001791727, + "learning_rate": 5.110589062001464e-07, + "loss": 0.5069, + "step": 7033 + }, + { + "epoch": 3.4182370820668693, + "grad_norm": 0.07218972844316805, + "learning_rate": 5.102159616059365e-07, + "loss": 0.5189, + "step": 7034 + }, + { + "epoch": 3.418723404255319, + "grad_norm": 0.06916201744900763, + "learning_rate": 5.093736753875711e-07, + "loss": 0.5108, + "step": 7035 + }, + { + "epoch": 3.419209726443769, + "grad_norm": 0.06901520081784702, + "learning_rate": 5.085320476685601e-07, + "loss": 0.506, + "step": 7036 + }, + { + "epoch": 3.4196960486322188, + "grad_norm": 0.07030731603597559, + "learning_rate": 5.076910785723226e-07, + "loss": 0.4928, + "step": 7037 + }, + { + "epoch": 3.420182370820669, + "grad_norm": 0.0678502030957812, + "learning_rate": 5.068507682221741e-07, + "loss": 0.4508, + "step": 7038 + }, + { + "epoch": 3.4206686930091186, + "grad_norm": 0.07121966076896366, + "learning_rate": 5.060111167413373e-07, + "loss": 0.5355, + "step": 7039 + }, + { + "epoch": 3.4211550151975683, + "grad_norm": 0.07092152492180169, + "learning_rate": 5.051721242529378e-07, + "loss": 0.5018, + "step": 7040 + }, + { + "epoch": 3.4216413373860184, + "grad_norm": 0.07301453923717974, + "learning_rate": 5.043337908800039e-07, + "loss": 0.5236, + "step": 7041 + }, + { + "epoch": 3.422127659574468, + "grad_norm": 0.07220762767794195, + "learning_rate": 5.034961167454677e-07, + "loss": 0.4998, + "step": 7042 + }, + { + "epoch": 3.4226139817629178, + "grad_norm": 0.07027668722284915, + "learning_rate": 5.02659101972165e-07, + "loss": 0.4808, + "step": 7043 + }, + { + "epoch": 3.423100303951368, + "grad_norm": 0.07123364200597594, + "learning_rate": 5.018227466828341e-07, + "loss": 0.5286, + "step": 7044 + }, + { + "epoch": 3.4235866261398176, + "grad_norm": 0.06898968630479926, + "learning_rate": 5.009870510001175e-07, + "loss": 0.4512, + "step": 7045 + }, + { + "epoch": 3.4240729483282673, + "grad_norm": 0.07309872312298674, + "learning_rate": 5.0015201504656e-07, + "loss": 0.5126, + "step": 7046 + }, + { + "epoch": 3.4245592705167174, + "grad_norm": 0.06961150293663415, + "learning_rate": 4.993176389446103e-07, + "loss": 0.4945, + "step": 7047 + }, + { + "epoch": 3.425045592705167, + "grad_norm": 0.0718692549643475, + "learning_rate": 4.984839228166205e-07, + "loss": 0.4987, + "step": 7048 + }, + { + "epoch": 3.425531914893617, + "grad_norm": 0.06981846342767056, + "learning_rate": 4.97650866784845e-07, + "loss": 0.5103, + "step": 7049 + }, + { + "epoch": 3.426018237082067, + "grad_norm": 0.071170215354394, + "learning_rate": 4.968184709714424e-07, + "loss": 0.5005, + "step": 7050 + }, + { + "epoch": 3.4265045592705166, + "grad_norm": 0.07067289717994367, + "learning_rate": 4.959867354984743e-07, + "loss": 0.5275, + "step": 7051 + }, + { + "epoch": 3.4269908814589667, + "grad_norm": 0.07218932808030425, + "learning_rate": 4.951556604879049e-07, + "loss": 0.5139, + "step": 7052 + }, + { + "epoch": 3.4274772036474164, + "grad_norm": 0.07135808243745731, + "learning_rate": 4.943252460616016e-07, + "loss": 0.5249, + "step": 7053 + }, + { + "epoch": 3.4279635258358665, + "grad_norm": 0.06967460094210229, + "learning_rate": 4.934954923413359e-07, + "loss": 0.5056, + "step": 7054 + }, + { + "epoch": 3.428449848024316, + "grad_norm": 0.07398164664889825, + "learning_rate": 4.926663994487813e-07, + "loss": 0.5343, + "step": 7055 + }, + { + "epoch": 3.428936170212766, + "grad_norm": 0.07061688764260694, + "learning_rate": 4.918379675055152e-07, + "loss": 0.5019, + "step": 7056 + }, + { + "epoch": 3.429422492401216, + "grad_norm": 0.0693050127553063, + "learning_rate": 4.910101966330178e-07, + "loss": 0.4885, + "step": 7057 + }, + { + "epoch": 3.4299088145896657, + "grad_norm": 0.07164036328868739, + "learning_rate": 4.90183086952672e-07, + "loss": 0.5474, + "step": 7058 + }, + { + "epoch": 3.4303951367781154, + "grad_norm": 0.07326086395011282, + "learning_rate": 4.89356638585764e-07, + "loss": 0.5103, + "step": 7059 + }, + { + "epoch": 3.4308814589665655, + "grad_norm": 0.06937296923055017, + "learning_rate": 4.885308516534831e-07, + "loss": 0.5121, + "step": 7060 + }, + { + "epoch": 3.431367781155015, + "grad_norm": 0.07241584210722997, + "learning_rate": 4.877057262769219e-07, + "loss": 0.5309, + "step": 7061 + }, + { + "epoch": 3.431854103343465, + "grad_norm": 0.07102410199391818, + "learning_rate": 4.868812625770752e-07, + "loss": 0.5148, + "step": 7062 + }, + { + "epoch": 3.432340425531915, + "grad_norm": 0.07120101665824168, + "learning_rate": 4.860574606748419e-07, + "loss": 0.5209, + "step": 7063 + }, + { + "epoch": 3.4328267477203647, + "grad_norm": 0.07250009451665422, + "learning_rate": 4.852343206910226e-07, + "loss": 0.5154, + "step": 7064 + }, + { + "epoch": 3.433313069908815, + "grad_norm": 0.0693803939931791, + "learning_rate": 4.844118427463212e-07, + "loss": 0.4883, + "step": 7065 + }, + { + "epoch": 3.4337993920972645, + "grad_norm": 0.07548868797724302, + "learning_rate": 4.835900269613458e-07, + "loss": 0.5632, + "step": 7066 + }, + { + "epoch": 3.434285714285714, + "grad_norm": 0.07123149948943107, + "learning_rate": 4.827688734566055e-07, + "loss": 0.5183, + "step": 7067 + }, + { + "epoch": 3.4347720364741643, + "grad_norm": 0.07185169163908411, + "learning_rate": 4.819483823525128e-07, + "loss": 0.5058, + "step": 7068 + }, + { + "epoch": 3.435258358662614, + "grad_norm": 0.07160810322752237, + "learning_rate": 4.81128553769385e-07, + "loss": 0.5022, + "step": 7069 + }, + { + "epoch": 3.4357446808510637, + "grad_norm": 0.07056010585649415, + "learning_rate": 4.803093878274395e-07, + "loss": 0.4992, + "step": 7070 + }, + { + "epoch": 3.436231003039514, + "grad_norm": 0.07026237044656844, + "learning_rate": 4.794908846467977e-07, + "loss": 0.526, + "step": 7071 + }, + { + "epoch": 3.4367173252279635, + "grad_norm": 0.07056044928922275, + "learning_rate": 4.786730443474824e-07, + "loss": 0.5158, + "step": 7072 + }, + { + "epoch": 3.437203647416413, + "grad_norm": 0.07304663594739744, + "learning_rate": 4.778558670494232e-07, + "loss": 0.536, + "step": 7073 + }, + { + "epoch": 3.4376899696048633, + "grad_norm": 0.06901423513347338, + "learning_rate": 4.770393528724488e-07, + "loss": 0.4869, + "step": 7074 + }, + { + "epoch": 3.438176291793313, + "grad_norm": 0.07032568562380762, + "learning_rate": 4.7622350193629154e-07, + "loss": 0.4967, + "step": 7075 + }, + { + "epoch": 3.438662613981763, + "grad_norm": 0.06976932633573946, + "learning_rate": 4.7540831436058697e-07, + "loss": 0.4925, + "step": 7076 + }, + { + "epoch": 3.439148936170213, + "grad_norm": 0.06943396048677875, + "learning_rate": 4.7459379026487287e-07, + "loss": 0.4759, + "step": 7077 + }, + { + "epoch": 3.4396352583586625, + "grad_norm": 0.07187575923267354, + "learning_rate": 4.7377992976858965e-07, + "loss": 0.5189, + "step": 7078 + }, + { + "epoch": 3.4401215805471126, + "grad_norm": 0.07243133155628674, + "learning_rate": 4.72966732991082e-07, + "loss": 0.5248, + "step": 7079 + }, + { + "epoch": 3.4406079027355623, + "grad_norm": 0.06917242258743568, + "learning_rate": 4.721542000515944e-07, + "loss": 0.4812, + "step": 7080 + }, + { + "epoch": 3.4410942249240124, + "grad_norm": 0.07165330712505101, + "learning_rate": 4.713423310692761e-07, + "loss": 0.5365, + "step": 7081 + }, + { + "epoch": 3.441580547112462, + "grad_norm": 0.07300047992753031, + "learning_rate": 4.7053112616317897e-07, + "loss": 0.551, + "step": 7082 + }, + { + "epoch": 3.4420668693009118, + "grad_norm": 0.06996620557840325, + "learning_rate": 4.6972058545225684e-07, + "loss": 0.5222, + "step": 7083 + }, + { + "epoch": 3.4425531914893615, + "grad_norm": 0.06926246347018855, + "learning_rate": 4.6891070905536574e-07, + "loss": 0.5037, + "step": 7084 + }, + { + "epoch": 3.4430395136778116, + "grad_norm": 0.07139518563847702, + "learning_rate": 4.6810149709126673e-07, + "loss": 0.5067, + "step": 7085 + }, + { + "epoch": 3.4435258358662613, + "grad_norm": 0.07307768421863547, + "learning_rate": 4.672929496786188e-07, + "loss": 0.5431, + "step": 7086 + }, + { + "epoch": 3.4440121580547114, + "grad_norm": 0.07133626947826453, + "learning_rate": 4.6648506693598717e-07, + "loss": 0.5053, + "step": 7087 + }, + { + "epoch": 3.444498480243161, + "grad_norm": 0.07266562294426104, + "learning_rate": 4.656778489818392e-07, + "loss": 0.5209, + "step": 7088 + }, + { + "epoch": 3.4449848024316108, + "grad_norm": 0.0717737756173884, + "learning_rate": 4.6487129593454415e-07, + "loss": 0.5372, + "step": 7089 + }, + { + "epoch": 3.445471124620061, + "grad_norm": 0.06998815530908782, + "learning_rate": 4.64065407912373e-07, + "loss": 0.5212, + "step": 7090 + }, + { + "epoch": 3.4459574468085106, + "grad_norm": 0.06968397885693899, + "learning_rate": 4.6326018503350165e-07, + "loss": 0.5005, + "step": 7091 + }, + { + "epoch": 3.4464437689969607, + "grad_norm": 0.07104505938033598, + "learning_rate": 4.624556274160058e-07, + "loss": 0.5275, + "step": 7092 + }, + { + "epoch": 3.4469300911854104, + "grad_norm": 0.0712905520734446, + "learning_rate": 4.6165173517786543e-07, + "loss": 0.5355, + "step": 7093 + }, + { + "epoch": 3.44741641337386, + "grad_norm": 0.07301836762255351, + "learning_rate": 4.6084850843696126e-07, + "loss": 0.5305, + "step": 7094 + }, + { + "epoch": 3.44790273556231, + "grad_norm": 0.07631065334930298, + "learning_rate": 4.60045947311078e-07, + "loss": 0.5296, + "step": 7095 + }, + { + "epoch": 3.44838905775076, + "grad_norm": 0.07242896729776276, + "learning_rate": 4.59244051917902e-07, + "loss": 0.5438, + "step": 7096 + }, + { + "epoch": 3.4488753799392096, + "grad_norm": 0.07160705541580994, + "learning_rate": 4.58442822375022e-07, + "loss": 0.5025, + "step": 7097 + }, + { + "epoch": 3.4493617021276597, + "grad_norm": 0.072060190992709, + "learning_rate": 4.576422587999296e-07, + "loss": 0.5598, + "step": 7098 + }, + { + "epoch": 3.4498480243161094, + "grad_norm": 0.07223336011269477, + "learning_rate": 4.568423613100176e-07, + "loss": 0.5504, + "step": 7099 + }, + { + "epoch": 3.450334346504559, + "grad_norm": 0.07056766944609026, + "learning_rate": 4.560431300225837e-07, + "loss": 0.4883, + "step": 7100 + }, + { + "epoch": 3.450820668693009, + "grad_norm": 0.07168714970698431, + "learning_rate": 4.552445650548237e-07, + "loss": 0.5216, + "step": 7101 + }, + { + "epoch": 3.451306990881459, + "grad_norm": 0.07278388720838608, + "learning_rate": 4.54446666523839e-07, + "loss": 0.5016, + "step": 7102 + }, + { + "epoch": 3.451793313069909, + "grad_norm": 0.07287780960020424, + "learning_rate": 4.5364943454663245e-07, + "loss": 0.4878, + "step": 7103 + }, + { + "epoch": 3.4522796352583587, + "grad_norm": 0.06834876349142859, + "learning_rate": 4.528528692401091e-07, + "loss": 0.459, + "step": 7104 + }, + { + "epoch": 3.4527659574468084, + "grad_norm": 0.0699751033364438, + "learning_rate": 4.5205697072107645e-07, + "loss": 0.516, + "step": 7105 + }, + { + "epoch": 3.4532522796352585, + "grad_norm": 0.0718489100081899, + "learning_rate": 4.512617391062435e-07, + "loss": 0.5034, + "step": 7106 + }, + { + "epoch": 3.453738601823708, + "grad_norm": 0.07224462156306567, + "learning_rate": 4.504671745122219e-07, + "loss": 0.5149, + "step": 7107 + }, + { + "epoch": 3.4542249240121583, + "grad_norm": 0.07302976300880235, + "learning_rate": 4.496732770555251e-07, + "loss": 0.524, + "step": 7108 + }, + { + "epoch": 3.454711246200608, + "grad_norm": 0.07022343940747477, + "learning_rate": 4.4888004685257115e-07, + "loss": 0.4994, + "step": 7109 + }, + { + "epoch": 3.4551975683890577, + "grad_norm": 0.07179982711260838, + "learning_rate": 4.480874840196764e-07, + "loss": 0.5137, + "step": 7110 + }, + { + "epoch": 3.4556838905775074, + "grad_norm": 0.0714773289119303, + "learning_rate": 4.472955886730618e-07, + "loss": 0.51, + "step": 7111 + }, + { + "epoch": 3.4561702127659575, + "grad_norm": 0.07253153438209857, + "learning_rate": 4.4650436092884995e-07, + "loss": 0.5076, + "step": 7112 + }, + { + "epoch": 3.456656534954407, + "grad_norm": 0.07163446422150273, + "learning_rate": 4.457138009030654e-07, + "loss": 0.5384, + "step": 7113 + }, + { + "epoch": 3.4571428571428573, + "grad_norm": 0.0732425096355872, + "learning_rate": 4.449239087116353e-07, + "loss": 0.5602, + "step": 7114 + }, + { + "epoch": 3.457629179331307, + "grad_norm": 0.070089514892091, + "learning_rate": 4.4413468447038645e-07, + "loss": 0.4952, + "step": 7115 + }, + { + "epoch": 3.4581155015197567, + "grad_norm": 0.06946903703312084, + "learning_rate": 4.433461282950513e-07, + "loss": 0.4779, + "step": 7116 + }, + { + "epoch": 3.458601823708207, + "grad_norm": 0.07162757805626836, + "learning_rate": 4.425582403012618e-07, + "loss": 0.4773, + "step": 7117 + }, + { + "epoch": 3.4590881458966565, + "grad_norm": 0.07459478433037206, + "learning_rate": 4.4177102060455337e-07, + "loss": 0.5424, + "step": 7118 + }, + { + "epoch": 3.4595744680851066, + "grad_norm": 0.07047360814388473, + "learning_rate": 4.4098446932036245e-07, + "loss": 0.5208, + "step": 7119 + }, + { + "epoch": 3.4600607902735563, + "grad_norm": 0.0715903661063747, + "learning_rate": 4.401985865640285e-07, + "loss": 0.5153, + "step": 7120 + }, + { + "epoch": 3.460547112462006, + "grad_norm": 0.0725549678723102, + "learning_rate": 4.394133724507915e-07, + "loss": 0.5404, + "step": 7121 + }, + { + "epoch": 3.461033434650456, + "grad_norm": 0.07160869326361206, + "learning_rate": 4.386288270957945e-07, + "loss": 0.5229, + "step": 7122 + }, + { + "epoch": 3.461519756838906, + "grad_norm": 0.07246517266938815, + "learning_rate": 4.378449506140825e-07, + "loss": 0.5311, + "step": 7123 + }, + { + "epoch": 3.4620060790273555, + "grad_norm": 0.07048331845904107, + "learning_rate": 4.3706174312060144e-07, + "loss": 0.4768, + "step": 7124 + }, + { + "epoch": 3.4624924012158056, + "grad_norm": 0.07085176001446238, + "learning_rate": 4.362792047302006e-07, + "loss": 0.5045, + "step": 7125 + }, + { + "epoch": 3.4629787234042553, + "grad_norm": 0.06929619781996207, + "learning_rate": 4.3549733555762865e-07, + "loss": 0.4709, + "step": 7126 + }, + { + "epoch": 3.463465045592705, + "grad_norm": 0.06929348444624528, + "learning_rate": 4.3471613571754e-07, + "loss": 0.5107, + "step": 7127 + }, + { + "epoch": 3.463951367781155, + "grad_norm": 0.07065988592529936, + "learning_rate": 4.339356053244881e-07, + "loss": 0.5002, + "step": 7128 + }, + { + "epoch": 3.4644376899696048, + "grad_norm": 0.0730719880299531, + "learning_rate": 4.331557444929291e-07, + "loss": 0.5204, + "step": 7129 + }, + { + "epoch": 3.464924012158055, + "grad_norm": 0.07100835518529446, + "learning_rate": 4.323765533372193e-07, + "loss": 0.502, + "step": 7130 + }, + { + "epoch": 3.4654103343465046, + "grad_norm": 0.071465428028525, + "learning_rate": 4.3159803197161956e-07, + "loss": 0.4954, + "step": 7131 + }, + { + "epoch": 3.4658966565349543, + "grad_norm": 0.07015204160106535, + "learning_rate": 4.308201805102907e-07, + "loss": 0.5164, + "step": 7132 + }, + { + "epoch": 3.4663829787234044, + "grad_norm": 0.07125852329743032, + "learning_rate": 4.3004299906729553e-07, + "loss": 0.5345, + "step": 7133 + }, + { + "epoch": 3.466869300911854, + "grad_norm": 0.07147334100257322, + "learning_rate": 4.292664877565994e-07, + "loss": 0.5302, + "step": 7134 + }, + { + "epoch": 3.4673556231003038, + "grad_norm": 0.06994256639555531, + "learning_rate": 4.2849064669206907e-07, + "loss": 0.5232, + "step": 7135 + }, + { + "epoch": 3.467841945288754, + "grad_norm": 0.0690320813027386, + "learning_rate": 4.277154759874719e-07, + "loss": 0.5092, + "step": 7136 + }, + { + "epoch": 3.4683282674772036, + "grad_norm": 0.07234131221540303, + "learning_rate": 4.2694097575647906e-07, + "loss": 0.5211, + "step": 7137 + }, + { + "epoch": 3.4688145896656533, + "grad_norm": 0.06711167917497439, + "learning_rate": 4.26167146112661e-07, + "loss": 0.4939, + "step": 7138 + }, + { + "epoch": 3.4693009118541034, + "grad_norm": 0.07143250277257142, + "learning_rate": 4.2539398716949233e-07, + "loss": 0.5179, + "step": 7139 + }, + { + "epoch": 3.469787234042553, + "grad_norm": 0.07176964242052265, + "learning_rate": 4.2462149904034686e-07, + "loss": 0.4986, + "step": 7140 + }, + { + "epoch": 3.470273556231003, + "grad_norm": 0.06903382031338814, + "learning_rate": 4.238496818385018e-07, + "loss": 0.495, + "step": 7141 + }, + { + "epoch": 3.470759878419453, + "grad_norm": 0.07223861402373928, + "learning_rate": 4.2307853567713495e-07, + "loss": 0.5513, + "step": 7142 + }, + { + "epoch": 3.4712462006079026, + "grad_norm": 0.07446774725994262, + "learning_rate": 4.2230806066932695e-07, + "loss": 0.5224, + "step": 7143 + }, + { + "epoch": 3.4717325227963527, + "grad_norm": 0.07076187304073288, + "learning_rate": 4.215382569280585e-07, + "loss": 0.4988, + "step": 7144 + }, + { + "epoch": 3.4722188449848024, + "grad_norm": 0.0723513833382034, + "learning_rate": 4.2076912456621265e-07, + "loss": 0.5322, + "step": 7145 + }, + { + "epoch": 3.4727051671732525, + "grad_norm": 0.06886845431872449, + "learning_rate": 4.200006636965742e-07, + "loss": 0.4924, + "step": 7146 + }, + { + "epoch": 3.473191489361702, + "grad_norm": 0.0729523350576455, + "learning_rate": 4.192328744318291e-07, + "loss": 0.4722, + "step": 7147 + }, + { + "epoch": 3.473677811550152, + "grad_norm": 0.06954424549529944, + "learning_rate": 4.1846575688456516e-07, + "loss": 0.5167, + "step": 7148 + }, + { + "epoch": 3.474164133738602, + "grad_norm": 0.0713360878554759, + "learning_rate": 4.1769931116727114e-07, + "loss": 0.5204, + "step": 7149 + }, + { + "epoch": 3.4746504559270517, + "grad_norm": 0.07207305888129763, + "learning_rate": 4.169335373923372e-07, + "loss": 0.505, + "step": 7150 + }, + { + "epoch": 3.4751367781155014, + "grad_norm": 0.06919764667063984, + "learning_rate": 4.1616843567205636e-07, + "loss": 0.4661, + "step": 7151 + }, + { + "epoch": 3.4756231003039515, + "grad_norm": 0.07080383094334816, + "learning_rate": 4.154040061186215e-07, + "loss": 0.5224, + "step": 7152 + }, + { + "epoch": 3.476109422492401, + "grad_norm": 0.0684666964684904, + "learning_rate": 4.14640248844127e-07, + "loss": 0.4823, + "step": 7153 + }, + { + "epoch": 3.476595744680851, + "grad_norm": 0.0710639276185835, + "learning_rate": 4.1387716396057044e-07, + "loss": 0.5157, + "step": 7154 + }, + { + "epoch": 3.477082066869301, + "grad_norm": 0.07250362419442305, + "learning_rate": 4.1311475157984895e-07, + "loss": 0.4864, + "step": 7155 + }, + { + "epoch": 3.4775683890577507, + "grad_norm": 0.07525166273514451, + "learning_rate": 4.123530118137609e-07, + "loss": 0.5422, + "step": 7156 + }, + { + "epoch": 3.478054711246201, + "grad_norm": 0.06877918690737896, + "learning_rate": 4.1159194477400797e-07, + "loss": 0.5068, + "step": 7157 + }, + { + "epoch": 3.4785410334346505, + "grad_norm": 0.06844393142806099, + "learning_rate": 4.108315505721916e-07, + "loss": 0.4806, + "step": 7158 + }, + { + "epoch": 3.4790273556231, + "grad_norm": 0.07071825519917485, + "learning_rate": 4.100718293198147e-07, + "loss": 0.5156, + "step": 7159 + }, + { + "epoch": 3.4795136778115503, + "grad_norm": 0.06990493781272573, + "learning_rate": 4.093127811282821e-07, + "loss": 0.4915, + "step": 7160 + }, + { + "epoch": 3.48, + "grad_norm": 0.07077455058971847, + "learning_rate": 4.085544061088992e-07, + "loss": 0.4807, + "step": 7161 + }, + { + "epoch": 3.4804863221884497, + "grad_norm": 0.07137638945796826, + "learning_rate": 4.0779670437287475e-07, + "loss": 0.5388, + "step": 7162 + }, + { + "epoch": 3.4809726443769, + "grad_norm": 0.07077230534039745, + "learning_rate": 4.0703967603131334e-07, + "loss": 0.497, + "step": 7163 + }, + { + "epoch": 3.4814589665653495, + "grad_norm": 0.07088931557799548, + "learning_rate": 4.0628332119522827e-07, + "loss": 0.5151, + "step": 7164 + }, + { + "epoch": 3.481945288753799, + "grad_norm": 0.07069425624386806, + "learning_rate": 4.055276399755287e-07, + "loss": 0.5009, + "step": 7165 + }, + { + "epoch": 3.4824316109422493, + "grad_norm": 0.07440464444834212, + "learning_rate": 4.04772632483027e-07, + "loss": 0.4936, + "step": 7166 + }, + { + "epoch": 3.482917933130699, + "grad_norm": 0.07064961563693023, + "learning_rate": 4.0401829882843635e-07, + "loss": 0.517, + "step": 7167 + }, + { + "epoch": 3.483404255319149, + "grad_norm": 0.07173284264035261, + "learning_rate": 4.0326463912237156e-07, + "loss": 0.5181, + "step": 7168 + }, + { + "epoch": 3.483890577507599, + "grad_norm": 0.07045352367077301, + "learning_rate": 4.0251165347534815e-07, + "loss": 0.48, + "step": 7169 + }, + { + "epoch": 3.4843768996960485, + "grad_norm": 0.07216283147051752, + "learning_rate": 4.0175934199778275e-07, + "loss": 0.4871, + "step": 7170 + }, + { + "epoch": 3.4848632218844986, + "grad_norm": 0.06990082659203009, + "learning_rate": 4.010077047999933e-07, + "loss": 0.5175, + "step": 7171 + }, + { + "epoch": 3.4853495440729483, + "grad_norm": 0.07302629595603717, + "learning_rate": 4.0025674199219877e-07, + "loss": 0.5255, + "step": 7172 + }, + { + "epoch": 3.4858358662613984, + "grad_norm": 0.06939565664702761, + "learning_rate": 3.9950645368452e-07, + "loss": 0.5, + "step": 7173 + }, + { + "epoch": 3.486322188449848, + "grad_norm": 0.07109383901805268, + "learning_rate": 3.987568399869773e-07, + "loss": 0.4967, + "step": 7174 + }, + { + "epoch": 3.4868085106382978, + "grad_norm": 0.07230052445640836, + "learning_rate": 3.980079010094934e-07, + "loss": 0.5249, + "step": 7175 + }, + { + "epoch": 3.487294832826748, + "grad_norm": 0.07087001627414656, + "learning_rate": 3.9725963686189197e-07, + "loss": 0.5186, + "step": 7176 + }, + { + "epoch": 3.4877811550151976, + "grad_norm": 0.07214567955124625, + "learning_rate": 3.9651204765389806e-07, + "loss": 0.5153, + "step": 7177 + }, + { + "epoch": 3.4882674772036473, + "grad_norm": 0.06926102858760724, + "learning_rate": 3.957651334951357e-07, + "loss": 0.4896, + "step": 7178 + }, + { + "epoch": 3.4887537993920974, + "grad_norm": 0.07232712025398089, + "learning_rate": 3.950188944951311e-07, + "loss": 0.501, + "step": 7179 + }, + { + "epoch": 3.489240121580547, + "grad_norm": 0.06916223835487106, + "learning_rate": 3.9427333076331343e-07, + "loss": 0.4751, + "step": 7180 + }, + { + "epoch": 3.4897264437689968, + "grad_norm": 0.07116770262447458, + "learning_rate": 3.935284424090091e-07, + "loss": 0.4947, + "step": 7181 + }, + { + "epoch": 3.490212765957447, + "grad_norm": 0.06987756140168783, + "learning_rate": 3.9278422954144965e-07, + "loss": 0.4973, + "step": 7182 + }, + { + "epoch": 3.4906990881458966, + "grad_norm": 0.06947174725678804, + "learning_rate": 3.920406922697645e-07, + "loss": 0.4817, + "step": 7183 + }, + { + "epoch": 3.4911854103343467, + "grad_norm": 0.07122649105872585, + "learning_rate": 3.9129783070298523e-07, + "loss": 0.4957, + "step": 7184 + }, + { + "epoch": 3.4916717325227964, + "grad_norm": 0.07352646944774852, + "learning_rate": 3.9055564495004306e-07, + "loss": 0.499, + "step": 7185 + }, + { + "epoch": 3.492158054711246, + "grad_norm": 0.06801638145018832, + "learning_rate": 3.898141351197726e-07, + "loss": 0.4684, + "step": 7186 + }, + { + "epoch": 3.492644376899696, + "grad_norm": 0.06917360868985718, + "learning_rate": 3.8907330132090694e-07, + "loss": 0.4778, + "step": 7187 + }, + { + "epoch": 3.493130699088146, + "grad_norm": 0.06850481430670212, + "learning_rate": 3.8833314366208077e-07, + "loss": 0.5036, + "step": 7188 + }, + { + "epoch": 3.4936170212765956, + "grad_norm": 0.07086381384672834, + "learning_rate": 3.875936622518306e-07, + "loss": 0.5007, + "step": 7189 + }, + { + "epoch": 3.4941033434650457, + "grad_norm": 0.07178627609022711, + "learning_rate": 3.8685485719859253e-07, + "loss": 0.5087, + "step": 7190 + }, + { + "epoch": 3.4945896656534954, + "grad_norm": 0.07445536827016301, + "learning_rate": 3.861167286107037e-07, + "loss": 0.5219, + "step": 7191 + }, + { + "epoch": 3.495075987841945, + "grad_norm": 0.06917347367775835, + "learning_rate": 3.853792765964032e-07, + "loss": 0.5007, + "step": 7192 + }, + { + "epoch": 3.495562310030395, + "grad_norm": 0.07141310309123573, + "learning_rate": 3.846425012638283e-07, + "loss": 0.5113, + "step": 7193 + }, + { + "epoch": 3.496048632218845, + "grad_norm": 0.07118210431123688, + "learning_rate": 3.839064027210204e-07, + "loss": 0.5089, + "step": 7194 + }, + { + "epoch": 3.496534954407295, + "grad_norm": 0.06976991006173139, + "learning_rate": 3.831709810759188e-07, + "loss": 0.4918, + "step": 7195 + }, + { + "epoch": 3.4970212765957447, + "grad_norm": 0.0695214866054242, + "learning_rate": 3.824362364363654e-07, + "loss": 0.5112, + "step": 7196 + }, + { + "epoch": 3.4970212765957447, + "eval_loss": 0.5695327520370483, + "eval_runtime": 105.0873, + "eval_samples_per_second": 288.836, + "eval_steps_per_second": 36.113, + "step": 7196 + }, + { + "epoch": 3.4975075987841944, + "grad_norm": 0.07127541092472973, + "learning_rate": 3.817021689101019e-07, + "loss": 0.4985, + "step": 7197 + }, + { + "epoch": 3.4979939209726445, + "grad_norm": 0.06933232372142334, + "learning_rate": 3.809687786047711e-07, + "loss": 0.4956, + "step": 7198 + }, + { + "epoch": 3.498480243161094, + "grad_norm": 0.06976287854341931, + "learning_rate": 3.8023606562791584e-07, + "loss": 0.507, + "step": 7199 + }, + { + "epoch": 3.4989665653495443, + "grad_norm": 0.07417999133780669, + "learning_rate": 3.795040300869812e-07, + "loss": 0.5355, + "step": 7200 + }, + { + "epoch": 3.499452887537994, + "grad_norm": 0.06957336907711105, + "learning_rate": 3.7877267208931147e-07, + "loss": 0.4784, + "step": 7201 + }, + { + "epoch": 3.4999392097264437, + "grad_norm": 0.06993007321014524, + "learning_rate": 3.7804199174215183e-07, + "loss": 0.4783, + "step": 7202 + }, + { + "epoch": 3.5004255319148934, + "grad_norm": 0.07071303000219951, + "learning_rate": 3.773119891526483e-07, + "loss": 0.5265, + "step": 7203 + }, + { + "epoch": 3.5009118541033435, + "grad_norm": 0.0700252671593182, + "learning_rate": 3.7658266442784754e-07, + "loss": 0.4978, + "step": 7204 + }, + { + "epoch": 3.501398176291793, + "grad_norm": 0.07275378205179954, + "learning_rate": 3.758540176746961e-07, + "loss": 0.4984, + "step": 7205 + }, + { + "epoch": 3.5018844984802433, + "grad_norm": 0.07103846666310974, + "learning_rate": 3.751260490000436e-07, + "loss": 0.4924, + "step": 7206 + }, + { + "epoch": 3.502370820668693, + "grad_norm": 0.06964549269570205, + "learning_rate": 3.743987585106362e-07, + "loss": 0.4747, + "step": 7207 + }, + { + "epoch": 3.5028571428571427, + "grad_norm": 0.07325844307206314, + "learning_rate": 3.7367214631312377e-07, + "loss": 0.5368, + "step": 7208 + }, + { + "epoch": 3.503343465045593, + "grad_norm": 0.07428724785785978, + "learning_rate": 3.729462125140559e-07, + "loss": 0.5176, + "step": 7209 + }, + { + "epoch": 3.5038297872340425, + "grad_norm": 0.07274666171203868, + "learning_rate": 3.7222095721988204e-07, + "loss": 0.5359, + "step": 7210 + }, + { + "epoch": 3.5043161094224926, + "grad_norm": 0.07000711141871847, + "learning_rate": 3.7149638053695256e-07, + "loss": 0.4958, + "step": 7211 + }, + { + "epoch": 3.5048024316109423, + "grad_norm": 0.06953932306519946, + "learning_rate": 3.707724825715192e-07, + "loss": 0.5076, + "step": 7212 + }, + { + "epoch": 3.505288753799392, + "grad_norm": 0.06981113823480962, + "learning_rate": 3.7004926342973257e-07, + "loss": 0.5096, + "step": 7213 + }, + { + "epoch": 3.505775075987842, + "grad_norm": 0.07100583897763792, + "learning_rate": 3.6932672321764507e-07, + "loss": 0.5064, + "step": 7214 + }, + { + "epoch": 3.506261398176292, + "grad_norm": 0.07209787004705707, + "learning_rate": 3.686048620412086e-07, + "loss": 0.5264, + "step": 7215 + }, + { + "epoch": 3.506747720364742, + "grad_norm": 0.06983077455674916, + "learning_rate": 3.678836800062763e-07, + "loss": 0.5243, + "step": 7216 + }, + { + "epoch": 3.5072340425531916, + "grad_norm": 0.07071790008837835, + "learning_rate": 3.671631772186007e-07, + "loss": 0.5154, + "step": 7217 + }, + { + "epoch": 3.5077203647416413, + "grad_norm": 0.07099082095666753, + "learning_rate": 3.664433537838363e-07, + "loss": 0.5013, + "step": 7218 + }, + { + "epoch": 3.508206686930091, + "grad_norm": 0.06786772715700722, + "learning_rate": 3.6572420980753643e-07, + "loss": 0.4771, + "step": 7219 + }, + { + "epoch": 3.508693009118541, + "grad_norm": 0.07187268531739176, + "learning_rate": 3.6500574539515557e-07, + "loss": 0.4585, + "step": 7220 + }, + { + "epoch": 3.509179331306991, + "grad_norm": 0.07140916415400446, + "learning_rate": 3.642879606520494e-07, + "loss": 0.5282, + "step": 7221 + }, + { + "epoch": 3.509665653495441, + "grad_norm": 0.06872218671094972, + "learning_rate": 3.635708556834705e-07, + "loss": 0.5035, + "step": 7222 + }, + { + "epoch": 3.5101519756838906, + "grad_norm": 0.07046853781545757, + "learning_rate": 3.628544305945758e-07, + "loss": 0.4945, + "step": 7223 + }, + { + "epoch": 3.5106382978723403, + "grad_norm": 0.07031160272234435, + "learning_rate": 3.6213868549042073e-07, + "loss": 0.504, + "step": 7224 + }, + { + "epoch": 3.5111246200607904, + "grad_norm": 0.06806264170788083, + "learning_rate": 3.614236204759608e-07, + "loss": 0.4876, + "step": 7225 + }, + { + "epoch": 3.51161094224924, + "grad_norm": 0.06855526913994629, + "learning_rate": 3.607092356560521e-07, + "loss": 0.4654, + "step": 7226 + }, + { + "epoch": 3.51209726443769, + "grad_norm": 0.0744999005612875, + "learning_rate": 3.5999553113545193e-07, + "loss": 0.5299, + "step": 7227 + }, + { + "epoch": 3.51258358662614, + "grad_norm": 0.07141921773306568, + "learning_rate": 3.5928250701881606e-07, + "loss": 0.5109, + "step": 7228 + }, + { + "epoch": 3.5130699088145896, + "grad_norm": 0.07180466879930848, + "learning_rate": 3.5857016341070136e-07, + "loss": 0.5072, + "step": 7229 + }, + { + "epoch": 3.5135562310030393, + "grad_norm": 0.07056873086957717, + "learning_rate": 3.57858500415566e-07, + "loss": 0.5182, + "step": 7230 + }, + { + "epoch": 3.5140425531914894, + "grad_norm": 0.06932979467409334, + "learning_rate": 3.5714751813776593e-07, + "loss": 0.4707, + "step": 7231 + }, + { + "epoch": 3.514528875379939, + "grad_norm": 0.07354808645815163, + "learning_rate": 3.564372166815594e-07, + "loss": 0.5364, + "step": 7232 + }, + { + "epoch": 3.515015197568389, + "grad_norm": 0.07024151848785647, + "learning_rate": 3.557275961511042e-07, + "loss": 0.5036, + "step": 7233 + }, + { + "epoch": 3.515501519756839, + "grad_norm": 0.06963388563562779, + "learning_rate": 3.550186566504576e-07, + "loss": 0.4909, + "step": 7234 + }, + { + "epoch": 3.5159878419452886, + "grad_norm": 0.07524825802159825, + "learning_rate": 3.5431039828357717e-07, + "loss": 0.5306, + "step": 7235 + }, + { + "epoch": 3.5164741641337387, + "grad_norm": 0.07174939743415036, + "learning_rate": 3.53602821154323e-07, + "loss": 0.5209, + "step": 7236 + }, + { + "epoch": 3.5169604863221884, + "grad_norm": 0.07235563433780665, + "learning_rate": 3.5289592536645047e-07, + "loss": 0.5002, + "step": 7237 + }, + { + "epoch": 3.5174468085106385, + "grad_norm": 0.07295740900489246, + "learning_rate": 3.5218971102361945e-07, + "loss": 0.5156, + "step": 7238 + }, + { + "epoch": 3.517933130699088, + "grad_norm": 0.07051145344528857, + "learning_rate": 3.514841782293882e-07, + "loss": 0.5348, + "step": 7239 + }, + { + "epoch": 3.518419452887538, + "grad_norm": 0.06978549215038118, + "learning_rate": 3.507793270872145e-07, + "loss": 0.4936, + "step": 7240 + }, + { + "epoch": 3.518905775075988, + "grad_norm": 0.0725950057773154, + "learning_rate": 3.500751577004574e-07, + "loss": 0.5079, + "step": 7241 + }, + { + "epoch": 3.5193920972644377, + "grad_norm": 0.07063254131635323, + "learning_rate": 3.4937167017237484e-07, + "loss": 0.5006, + "step": 7242 + }, + { + "epoch": 3.5198784194528874, + "grad_norm": 0.07208035373266007, + "learning_rate": 3.4866886460612536e-07, + "loss": 0.537, + "step": 7243 + }, + { + "epoch": 3.5203647416413375, + "grad_norm": 0.07053910416583992, + "learning_rate": 3.479667411047677e-07, + "loss": 0.5043, + "step": 7244 + }, + { + "epoch": 3.520851063829787, + "grad_norm": 0.07020601152892847, + "learning_rate": 3.4726529977126e-07, + "loss": 0.483, + "step": 7245 + }, + { + "epoch": 3.521337386018237, + "grad_norm": 0.07195262181511256, + "learning_rate": 3.465645407084611e-07, + "loss": 0.5521, + "step": 7246 + }, + { + "epoch": 3.521823708206687, + "grad_norm": 0.07078835399859162, + "learning_rate": 3.4586446401912833e-07, + "loss": 0.4946, + "step": 7247 + }, + { + "epoch": 3.5223100303951367, + "grad_norm": 0.07257120723260403, + "learning_rate": 3.451650698059211e-07, + "loss": 0.5104, + "step": 7248 + }, + { + "epoch": 3.522796352583587, + "grad_norm": 0.07289859046708881, + "learning_rate": 3.444663581713975e-07, + "loss": 0.5166, + "step": 7249 + }, + { + "epoch": 3.5232826747720365, + "grad_norm": 0.07032179912809126, + "learning_rate": 3.4376832921801494e-07, + "loss": 0.4812, + "step": 7250 + }, + { + "epoch": 3.523768996960486, + "grad_norm": 0.06983409897908303, + "learning_rate": 3.4307098304813215e-07, + "loss": 0.4849, + "step": 7251 + }, + { + "epoch": 3.5242553191489363, + "grad_norm": 0.0700615456848194, + "learning_rate": 3.423743197640067e-07, + "loss": 0.5053, + "step": 7252 + }, + { + "epoch": 3.524741641337386, + "grad_norm": 0.06886085254109796, + "learning_rate": 3.4167833946779696e-07, + "loss": 0.4834, + "step": 7253 + }, + { + "epoch": 3.525227963525836, + "grad_norm": 0.06909140227938304, + "learning_rate": 3.40983042261559e-07, + "loss": 0.5098, + "step": 7254 + }, + { + "epoch": 3.525714285714286, + "grad_norm": 0.07250910437189453, + "learning_rate": 3.4028842824725183e-07, + "loss": 0.5175, + "step": 7255 + }, + { + "epoch": 3.5262006079027355, + "grad_norm": 0.07269852744018593, + "learning_rate": 3.3959449752673235e-07, + "loss": 0.5163, + "step": 7256 + }, + { + "epoch": 3.526686930091185, + "grad_norm": 0.07032829211257909, + "learning_rate": 3.3890125020175693e-07, + "loss": 0.5105, + "step": 7257 + }, + { + "epoch": 3.5271732522796353, + "grad_norm": 0.07452891215519673, + "learning_rate": 3.3820868637398305e-07, + "loss": 0.5469, + "step": 7258 + }, + { + "epoch": 3.527659574468085, + "grad_norm": 0.07154076504998388, + "learning_rate": 3.3751680614496686e-07, + "loss": 0.5103, + "step": 7259 + }, + { + "epoch": 3.528145896656535, + "grad_norm": 0.06965758296055197, + "learning_rate": 3.3682560961616537e-07, + "loss": 0.5097, + "step": 7260 + }, + { + "epoch": 3.528632218844985, + "grad_norm": 0.06982160264205557, + "learning_rate": 3.3613509688893433e-07, + "loss": 0.4888, + "step": 7261 + }, + { + "epoch": 3.5291185410334345, + "grad_norm": 0.06985765804356635, + "learning_rate": 3.354452680645298e-07, + "loss": 0.5221, + "step": 7262 + }, + { + "epoch": 3.5296048632218846, + "grad_norm": 0.0702127760826987, + "learning_rate": 3.3475612324410656e-07, + "loss": 0.5015, + "step": 7263 + }, + { + "epoch": 3.5300911854103343, + "grad_norm": 0.07270437601568784, + "learning_rate": 3.340676625287209e-07, + "loss": 0.5217, + "step": 7264 + }, + { + "epoch": 3.5305775075987844, + "grad_norm": 0.07272582472340099, + "learning_rate": 3.333798860193277e-07, + "loss": 0.5135, + "step": 7265 + }, + { + "epoch": 3.531063829787234, + "grad_norm": 0.07086140237375115, + "learning_rate": 3.3269279381678065e-07, + "loss": 0.5148, + "step": 7266 + }, + { + "epoch": 3.531550151975684, + "grad_norm": 0.07013359650019135, + "learning_rate": 3.3200638602183533e-07, + "loss": 0.5103, + "step": 7267 + }, + { + "epoch": 3.5320364741641335, + "grad_norm": 0.06846799013128262, + "learning_rate": 3.3132066273514397e-07, + "loss": 0.474, + "step": 7268 + }, + { + "epoch": 3.5325227963525836, + "grad_norm": 0.07381697955492518, + "learning_rate": 3.3063562405726277e-07, + "loss": 0.5211, + "step": 7269 + }, + { + "epoch": 3.5330091185410333, + "grad_norm": 0.07164947525743896, + "learning_rate": 3.299512700886415e-07, + "loss": 0.5184, + "step": 7270 + }, + { + "epoch": 3.5334954407294834, + "grad_norm": 0.07025197422814637, + "learning_rate": 3.292676009296353e-07, + "loss": 0.5163, + "step": 7271 + }, + { + "epoch": 3.533981762917933, + "grad_norm": 0.07518078518688896, + "learning_rate": 3.285846166804946e-07, + "loss": 0.5232, + "step": 7272 + }, + { + "epoch": 3.5344680851063828, + "grad_norm": 0.07443116926836839, + "learning_rate": 3.2790231744137315e-07, + "loss": 0.5277, + "step": 7273 + }, + { + "epoch": 3.534954407294833, + "grad_norm": 0.07140071642719877, + "learning_rate": 3.27220703312322e-07, + "loss": 0.5168, + "step": 7274 + }, + { + "epoch": 3.5354407294832826, + "grad_norm": 0.06914373463769209, + "learning_rate": 3.265397743932913e-07, + "loss": 0.5159, + "step": 7275 + }, + { + "epoch": 3.5359270516717327, + "grad_norm": 0.07184274483734404, + "learning_rate": 3.2585953078413225e-07, + "loss": 0.5113, + "step": 7276 + }, + { + "epoch": 3.5364133738601824, + "grad_norm": 0.06965237554484427, + "learning_rate": 3.25179972584595e-07, + "loss": 0.4758, + "step": 7277 + }, + { + "epoch": 3.536899696048632, + "grad_norm": 0.07108640543151509, + "learning_rate": 3.245010998943282e-07, + "loss": 0.4964, + "step": 7278 + }, + { + "epoch": 3.537386018237082, + "grad_norm": 0.07048754731021302, + "learning_rate": 3.2382291281288113e-07, + "loss": 0.5292, + "step": 7279 + }, + { + "epoch": 3.537872340425532, + "grad_norm": 0.07127731989257474, + "learning_rate": 3.231454114397026e-07, + "loss": 0.4912, + "step": 7280 + }, + { + "epoch": 3.538358662613982, + "grad_norm": 0.07204628855333067, + "learning_rate": 3.224685958741408e-07, + "loss": 0.5294, + "step": 7281 + }, + { + "epoch": 3.5388449848024317, + "grad_norm": 0.07064881150887918, + "learning_rate": 3.21792466215442e-07, + "loss": 0.5127, + "step": 7282 + }, + { + "epoch": 3.5393313069908814, + "grad_norm": 0.07036704136607158, + "learning_rate": 3.2111702256275355e-07, + "loss": 0.5008, + "step": 7283 + }, + { + "epoch": 3.539817629179331, + "grad_norm": 0.07207000377795311, + "learning_rate": 3.2044226501512233e-07, + "loss": 0.5365, + "step": 7284 + }, + { + "epoch": 3.540303951367781, + "grad_norm": 0.0704674492089507, + "learning_rate": 3.197681936714919e-07, + "loss": 0.5217, + "step": 7285 + }, + { + "epoch": 3.540790273556231, + "grad_norm": 0.07244311802596472, + "learning_rate": 3.1909480863070884e-07, + "loss": 0.5238, + "step": 7286 + }, + { + "epoch": 3.541276595744681, + "grad_norm": 0.0730980193009039, + "learning_rate": 3.184221099915163e-07, + "loss": 0.5163, + "step": 7287 + }, + { + "epoch": 3.5417629179331307, + "grad_norm": 0.07489262923338012, + "learning_rate": 3.177500978525594e-07, + "loss": 0.5678, + "step": 7288 + }, + { + "epoch": 3.5422492401215804, + "grad_norm": 0.07119119841417239, + "learning_rate": 3.1707877231237916e-07, + "loss": 0.5199, + "step": 7289 + }, + { + "epoch": 3.5427355623100305, + "grad_norm": 0.07298154464650503, + "learning_rate": 3.164081334694186e-07, + "loss": 0.5816, + "step": 7290 + }, + { + "epoch": 3.54322188449848, + "grad_norm": 0.07122694185732, + "learning_rate": 3.157381814220206e-07, + "loss": 0.5424, + "step": 7291 + }, + { + "epoch": 3.5437082066869303, + "grad_norm": 0.07261416258715955, + "learning_rate": 3.150689162684245e-07, + "loss": 0.5518, + "step": 7292 + }, + { + "epoch": 3.54419452887538, + "grad_norm": 0.06949925293479649, + "learning_rate": 3.1440033810677117e-07, + "loss": 0.5205, + "step": 7293 + }, + { + "epoch": 3.5446808510638297, + "grad_norm": 0.0696420813181653, + "learning_rate": 3.1373244703509996e-07, + "loss": 0.497, + "step": 7294 + }, + { + "epoch": 3.5451671732522794, + "grad_norm": 0.07148950250450589, + "learning_rate": 3.130652431513487e-07, + "loss": 0.5304, + "step": 7295 + }, + { + "epoch": 3.5456534954407295, + "grad_norm": 0.07056600888232763, + "learning_rate": 3.1239872655335625e-07, + "loss": 0.4983, + "step": 7296 + }, + { + "epoch": 3.546139817629179, + "grad_norm": 0.07080671410213533, + "learning_rate": 3.117328973388595e-07, + "loss": 0.501, + "step": 7297 + }, + { + "epoch": 3.5466261398176293, + "grad_norm": 0.06958819278284702, + "learning_rate": 3.1106775560549473e-07, + "loss": 0.4902, + "step": 7298 + }, + { + "epoch": 3.547112462006079, + "grad_norm": 0.07069892602944006, + "learning_rate": 3.104033014507968e-07, + "loss": 0.495, + "step": 7299 + }, + { + "epoch": 3.5475987841945287, + "grad_norm": 0.07165124410756647, + "learning_rate": 3.09739534972201e-07, + "loss": 0.5044, + "step": 7300 + }, + { + "epoch": 3.548085106382979, + "grad_norm": 0.07255700556377429, + "learning_rate": 3.0907645626704066e-07, + "loss": 0.5294, + "step": 7301 + }, + { + "epoch": 3.5485714285714285, + "grad_norm": 0.07089350357252436, + "learning_rate": 3.0841406543254904e-07, + "loss": 0.535, + "step": 7302 + }, + { + "epoch": 3.5490577507598786, + "grad_norm": 0.07056033227481667, + "learning_rate": 3.077523625658585e-07, + "loss": 0.4949, + "step": 7303 + }, + { + "epoch": 3.5495440729483283, + "grad_norm": 0.06987344071988595, + "learning_rate": 3.0709134776399973e-07, + "loss": 0.4992, + "step": 7304 + }, + { + "epoch": 3.550030395136778, + "grad_norm": 0.07139241586942209, + "learning_rate": 3.064310211239035e-07, + "loss": 0.5283, + "step": 7305 + }, + { + "epoch": 3.550516717325228, + "grad_norm": 0.06995996907834645, + "learning_rate": 3.0577138274239913e-07, + "loss": 0.5302, + "step": 7306 + }, + { + "epoch": 3.551003039513678, + "grad_norm": 0.07210397608334039, + "learning_rate": 3.0511243271621474e-07, + "loss": 0.498, + "step": 7307 + }, + { + "epoch": 3.551489361702128, + "grad_norm": 0.07764076830224335, + "learning_rate": 3.044541711419774e-07, + "loss": 0.5631, + "step": 7308 + }, + { + "epoch": 3.5519756838905776, + "grad_norm": 0.07048568394689735, + "learning_rate": 3.03796598116215e-07, + "loss": 0.4868, + "step": 7309 + }, + { + "epoch": 3.5524620060790273, + "grad_norm": 0.07073990252842037, + "learning_rate": 3.0313971373535257e-07, + "loss": 0.5133, + "step": 7310 + }, + { + "epoch": 3.552948328267477, + "grad_norm": 0.06957175542958917, + "learning_rate": 3.024835180957153e-07, + "loss": 0.4561, + "step": 7311 + }, + { + "epoch": 3.553434650455927, + "grad_norm": 0.07158292685283037, + "learning_rate": 3.018280112935257e-07, + "loss": 0.5156, + "step": 7312 + }, + { + "epoch": 3.553920972644377, + "grad_norm": 0.07408736551579814, + "learning_rate": 3.011731934249079e-07, + "loss": 0.5567, + "step": 7313 + }, + { + "epoch": 3.554407294832827, + "grad_norm": 0.07355718618666156, + "learning_rate": 3.005190645858819e-07, + "loss": 0.5476, + "step": 7314 + }, + { + "epoch": 3.5548936170212766, + "grad_norm": 0.06999316579318449, + "learning_rate": 2.998656248723686e-07, + "loss": 0.4811, + "step": 7315 + }, + { + "epoch": 3.5553799392097263, + "grad_norm": 0.07356577301068572, + "learning_rate": 2.992128743801881e-07, + "loss": 0.5477, + "step": 7316 + }, + { + "epoch": 3.5558662613981764, + "grad_norm": 0.0709066460044624, + "learning_rate": 2.985608132050588e-07, + "loss": 0.4957, + "step": 7317 + }, + { + "epoch": 3.556352583586626, + "grad_norm": 0.0737971307981053, + "learning_rate": 2.9790944144259757e-07, + "loss": 0.538, + "step": 7318 + }, + { + "epoch": 3.556838905775076, + "grad_norm": 0.0704499061960886, + "learning_rate": 2.9725875918832084e-07, + "loss": 0.5193, + "step": 7319 + }, + { + "epoch": 3.557325227963526, + "grad_norm": 0.07020270123634124, + "learning_rate": 2.9660876653764435e-07, + "loss": 0.4957, + "step": 7320 + }, + { + "epoch": 3.5578115501519756, + "grad_norm": 0.07162127256981395, + "learning_rate": 2.9595946358588144e-07, + "loss": 0.5485, + "step": 7321 + }, + { + "epoch": 3.5582978723404253, + "grad_norm": 0.07157321804593564, + "learning_rate": 2.953108504282454e-07, + "loss": 0.5151, + "step": 7322 + }, + { + "epoch": 3.5587841945288754, + "grad_norm": 0.07160379601788354, + "learning_rate": 2.9466292715984724e-07, + "loss": 0.5226, + "step": 7323 + }, + { + "epoch": 3.559270516717325, + "grad_norm": 0.06904822263240454, + "learning_rate": 2.9401569387569885e-07, + "loss": 0.4852, + "step": 7324 + }, + { + "epoch": 3.559756838905775, + "grad_norm": 0.07243466734547406, + "learning_rate": 2.933691506707087e-07, + "loss": 0.531, + "step": 7325 + }, + { + "epoch": 3.560243161094225, + "grad_norm": 0.06975049768054856, + "learning_rate": 2.927232976396849e-07, + "loss": 0.4899, + "step": 7326 + }, + { + "epoch": 3.5607294832826746, + "grad_norm": 0.0716366338504042, + "learning_rate": 2.9207813487733493e-07, + "loss": 0.5255, + "step": 7327 + }, + { + "epoch": 3.5612158054711247, + "grad_norm": 0.07091388921273582, + "learning_rate": 2.91433662478266e-07, + "loss": 0.4954, + "step": 7328 + }, + { + "epoch": 3.5617021276595744, + "grad_norm": 0.07352913450742303, + "learning_rate": 2.907898805369797e-07, + "loss": 0.521, + "step": 7329 + }, + { + "epoch": 3.5621884498480245, + "grad_norm": 0.07051134063364362, + "learning_rate": 2.901467891478815e-07, + "loss": 0.4929, + "step": 7330 + }, + { + "epoch": 3.562674772036474, + "grad_norm": 0.06977906695245811, + "learning_rate": 2.895043884052723e-07, + "loss": 0.5167, + "step": 7331 + }, + { + "epoch": 3.563161094224924, + "grad_norm": 0.07052924754278243, + "learning_rate": 2.8886267840335326e-07, + "loss": 0.4674, + "step": 7332 + }, + { + "epoch": 3.563647416413374, + "grad_norm": 0.07141125812518652, + "learning_rate": 2.8822165923622415e-07, + "loss": 0.5193, + "step": 7333 + }, + { + "epoch": 3.5641337386018237, + "grad_norm": 0.07141514804061745, + "learning_rate": 2.8758133099788257e-07, + "loss": 0.4861, + "step": 7334 + }, + { + "epoch": 3.564620060790274, + "grad_norm": 0.0707225607741812, + "learning_rate": 2.8694169378222614e-07, + "loss": 0.5111, + "step": 7335 + }, + { + "epoch": 3.5651063829787235, + "grad_norm": 0.07074623550470321, + "learning_rate": 2.863027476830499e-07, + "loss": 0.5257, + "step": 7336 + }, + { + "epoch": 3.565592705167173, + "grad_norm": 0.07049540091389571, + "learning_rate": 2.856644927940477e-07, + "loss": 0.5098, + "step": 7337 + }, + { + "epoch": 3.566079027355623, + "grad_norm": 0.07357890612930548, + "learning_rate": 2.8502692920881314e-07, + "loss": 0.5293, + "step": 7338 + }, + { + "epoch": 3.566565349544073, + "grad_norm": 0.06851401088382048, + "learning_rate": 2.8439005702083745e-07, + "loss": 0.4753, + "step": 7339 + }, + { + "epoch": 3.5670516717325227, + "grad_norm": 0.06934347719824543, + "learning_rate": 2.837538763235104e-07, + "loss": 0.504, + "step": 7340 + }, + { + "epoch": 3.567537993920973, + "grad_norm": 0.07084414331060598, + "learning_rate": 2.8311838721012117e-07, + "loss": 0.5049, + "step": 7341 + }, + { + "epoch": 3.5680243161094225, + "grad_norm": 0.07223718426749741, + "learning_rate": 2.8248358977385647e-07, + "loss": 0.4937, + "step": 7342 + }, + { + "epoch": 3.568510638297872, + "grad_norm": 0.07148240198208974, + "learning_rate": 2.8184948410780234e-07, + "loss": 0.5267, + "step": 7343 + }, + { + "epoch": 3.5689969604863223, + "grad_norm": 0.0686633788215377, + "learning_rate": 2.8121607030494325e-07, + "loss": 0.4858, + "step": 7344 + }, + { + "epoch": 3.569483282674772, + "grad_norm": 0.07098816534537977, + "learning_rate": 2.8058334845816214e-07, + "loss": 0.5174, + "step": 7345 + }, + { + "epoch": 3.569969604863222, + "grad_norm": 0.07012065138151906, + "learning_rate": 2.7995131866024093e-07, + "loss": 0.5128, + "step": 7346 + }, + { + "epoch": 3.570455927051672, + "grad_norm": 0.07181431304103206, + "learning_rate": 2.7931998100385826e-07, + "loss": 0.5493, + "step": 7347 + }, + { + "epoch": 3.5709422492401215, + "grad_norm": 0.07000204189012846, + "learning_rate": 2.7868933558159393e-07, + "loss": 0.5226, + "step": 7348 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.071025096261643, + "learning_rate": 2.7805938248592456e-07, + "loss": 0.542, + "step": 7349 + }, + { + "epoch": 3.5719148936170213, + "grad_norm": 0.07075337073244381, + "learning_rate": 2.7743012180922566e-07, + "loss": 0.5179, + "step": 7350 + }, + { + "epoch": 3.572401215805471, + "grad_norm": 0.07008650815964618, + "learning_rate": 2.7680155364377073e-07, + "loss": 0.499, + "step": 7351 + }, + { + "epoch": 3.572887537993921, + "grad_norm": 0.0718683174311907, + "learning_rate": 2.7617367808173256e-07, + "loss": 0.5355, + "step": 7352 + }, + { + "epoch": 3.573373860182371, + "grad_norm": 0.0706689430312276, + "learning_rate": 2.7554649521518204e-07, + "loss": 0.5189, + "step": 7353 + }, + { + "epoch": 3.5738601823708205, + "grad_norm": 0.0702389835659637, + "learning_rate": 2.749200051360884e-07, + "loss": 0.5022, + "step": 7354 + }, + { + "epoch": 3.5743465045592706, + "grad_norm": 0.07344524893557296, + "learning_rate": 2.7429420793631924e-07, + "loss": 0.5148, + "step": 7355 + }, + { + "epoch": 3.5748328267477203, + "grad_norm": 0.07093299060198022, + "learning_rate": 2.7366910370764e-07, + "loss": 0.5154, + "step": 7356 + }, + { + "epoch": 3.5753191489361704, + "grad_norm": 0.06930701045478772, + "learning_rate": 2.7304469254171626e-07, + "loss": 0.5013, + "step": 7357 + }, + { + "epoch": 3.57580547112462, + "grad_norm": 0.07499797266616692, + "learning_rate": 2.7242097453010984e-07, + "loss": 0.5077, + "step": 7358 + }, + { + "epoch": 3.57629179331307, + "grad_norm": 0.07034100300012767, + "learning_rate": 2.7179794976428197e-07, + "loss": 0.5023, + "step": 7359 + }, + { + "epoch": 3.57677811550152, + "grad_norm": 0.0704858758123458, + "learning_rate": 2.7117561833559293e-07, + "loss": 0.4778, + "step": 7360 + }, + { + "epoch": 3.5772644376899696, + "grad_norm": 0.0694951792810299, + "learning_rate": 2.705539803353008e-07, + "loss": 0.4869, + "step": 7361 + }, + { + "epoch": 3.5777507598784197, + "grad_norm": 0.0700097501186522, + "learning_rate": 2.699330358545599e-07, + "loss": 0.4688, + "step": 7362 + }, + { + "epoch": 3.5782370820668694, + "grad_norm": 0.07445755451914522, + "learning_rate": 2.6931278498442625e-07, + "loss": 0.5605, + "step": 7363 + }, + { + "epoch": 3.578723404255319, + "grad_norm": 0.07057763626888834, + "learning_rate": 2.686932278158516e-07, + "loss": 0.4986, + "step": 7364 + }, + { + "epoch": 3.579209726443769, + "grad_norm": 0.06905809186338163, + "learning_rate": 2.680743644396883e-07, + "loss": 0.4849, + "step": 7365 + }, + { + "epoch": 3.579696048632219, + "grad_norm": 0.07077645054507545, + "learning_rate": 2.6745619494668473e-07, + "loss": 0.5161, + "step": 7366 + }, + { + "epoch": 3.5801823708206686, + "grad_norm": 0.07061450291663751, + "learning_rate": 2.668387194274885e-07, + "loss": 0.4961, + "step": 7367 + }, + { + "epoch": 3.5806686930091187, + "grad_norm": 0.07331287074668448, + "learning_rate": 2.662219379726455e-07, + "loss": 0.5272, + "step": 7368 + }, + { + "epoch": 3.5811550151975684, + "grad_norm": 0.07255059252702477, + "learning_rate": 2.6560585067259947e-07, + "loss": 0.491, + "step": 7369 + }, + { + "epoch": 3.581641337386018, + "grad_norm": 0.06985575383706202, + "learning_rate": 2.649904576176932e-07, + "loss": 0.465, + "step": 7370 + }, + { + "epoch": 3.582127659574468, + "grad_norm": 0.0720242260152669, + "learning_rate": 2.64375758898166e-07, + "loss": 0.4901, + "step": 7371 + }, + { + "epoch": 3.582613981762918, + "grad_norm": 0.07171093475434703, + "learning_rate": 2.637617546041582e-07, + "loss": 0.5016, + "step": 7372 + }, + { + "epoch": 3.583100303951368, + "grad_norm": 0.0726937568670949, + "learning_rate": 2.631484448257049e-07, + "loss": 0.5284, + "step": 7373 + }, + { + "epoch": 3.5835866261398177, + "grad_norm": 0.07183658239706207, + "learning_rate": 2.6253582965274194e-07, + "loss": 0.5035, + "step": 7374 + }, + { + "epoch": 3.5840729483282674, + "grad_norm": 0.07423235951045551, + "learning_rate": 2.6192390917510193e-07, + "loss": 0.5371, + "step": 7375 + }, + { + "epoch": 3.584559270516717, + "grad_norm": 0.07046762590420802, + "learning_rate": 2.613126834825169e-07, + "loss": 0.5689, + "step": 7376 + }, + { + "epoch": 3.585045592705167, + "grad_norm": 0.07174241814782681, + "learning_rate": 2.6070215266461474e-07, + "loss": 0.5288, + "step": 7377 + }, + { + "epoch": 3.585531914893617, + "grad_norm": 0.07064555403281207, + "learning_rate": 2.6009231681092375e-07, + "loss": 0.5057, + "step": 7378 + }, + { + "epoch": 3.586018237082067, + "grad_norm": 0.07192003560825794, + "learning_rate": 2.5948317601086905e-07, + "loss": 0.5292, + "step": 7379 + }, + { + "epoch": 3.5865045592705167, + "grad_norm": 0.07499300581435787, + "learning_rate": 2.588747303537742e-07, + "loss": 0.532, + "step": 7380 + }, + { + "epoch": 3.5869908814589664, + "grad_norm": 0.07102887886070022, + "learning_rate": 2.582669799288612e-07, + "loss": 0.5318, + "step": 7381 + }, + { + "epoch": 3.5874772036474165, + "grad_norm": 0.07089444182256209, + "learning_rate": 2.5765992482524984e-07, + "loss": 0.4865, + "step": 7382 + }, + { + "epoch": 3.587963525835866, + "grad_norm": 0.07022833005819736, + "learning_rate": 2.570535651319578e-07, + "loss": 0.5586, + "step": 7383 + }, + { + "epoch": 3.5884498480243163, + "grad_norm": 0.07050662195952102, + "learning_rate": 2.5644790093790063e-07, + "loss": 0.525, + "step": 7384 + }, + { + "epoch": 3.588936170212766, + "grad_norm": 0.07280986380759177, + "learning_rate": 2.5584293233189227e-07, + "loss": 0.5203, + "step": 7385 + }, + { + "epoch": 3.5894224924012157, + "grad_norm": 0.07027483080377553, + "learning_rate": 2.5523865940264405e-07, + "loss": 0.4961, + "step": 7386 + }, + { + "epoch": 3.589908814589666, + "grad_norm": 0.06943702667867642, + "learning_rate": 2.5463508223876663e-07, + "loss": 0.5115, + "step": 7387 + }, + { + "epoch": 3.5903951367781155, + "grad_norm": 0.06891489153963594, + "learning_rate": 2.540322009287671e-07, + "loss": 0.4773, + "step": 7388 + }, + { + "epoch": 3.590881458966565, + "grad_norm": 0.07136935747486889, + "learning_rate": 2.5343001556105087e-07, + "loss": 0.533, + "step": 7389 + }, + { + "epoch": 3.5913677811550153, + "grad_norm": 0.0696238531507961, + "learning_rate": 2.528285262239233e-07, + "loss": 0.4942, + "step": 7390 + }, + { + "epoch": 3.591854103343465, + "grad_norm": 0.06957558191341166, + "learning_rate": 2.5222773300558333e-07, + "loss": 0.4924, + "step": 7391 + }, + { + "epoch": 3.5923404255319147, + "grad_norm": 0.07390134284820647, + "learning_rate": 2.516276359941322e-07, + "loss": 0.5317, + "step": 7392 + }, + { + "epoch": 3.592826747720365, + "grad_norm": 0.07053198466740905, + "learning_rate": 2.510282352775667e-07, + "loss": 0.4903, + "step": 7393 + }, + { + "epoch": 3.5933130699088145, + "grad_norm": 0.0721362443692563, + "learning_rate": 2.5042953094378263e-07, + "loss": 0.4803, + "step": 7394 + }, + { + "epoch": 3.5937993920972646, + "grad_norm": 0.06991093912445948, + "learning_rate": 2.4983152308057255e-07, + "loss": 0.4927, + "step": 7395 + }, + { + "epoch": 3.5942857142857143, + "grad_norm": 0.06969785853854706, + "learning_rate": 2.49234211775628e-07, + "loss": 0.5184, + "step": 7396 + }, + { + "epoch": 3.594772036474164, + "grad_norm": 0.07196577209444105, + "learning_rate": 2.486375971165378e-07, + "loss": 0.4988, + "step": 7397 + }, + { + "epoch": 3.595258358662614, + "grad_norm": 0.07109875877628512, + "learning_rate": 2.480416791907886e-07, + "loss": 0.5006, + "step": 7398 + }, + { + "epoch": 3.595744680851064, + "grad_norm": 0.07247700603611107, + "learning_rate": 2.474464580857644e-07, + "loss": 0.5486, + "step": 7399 + }, + { + "epoch": 3.596231003039514, + "grad_norm": 0.07269753622871465, + "learning_rate": 2.468519338887493e-07, + "loss": 0.5443, + "step": 7400 + }, + { + "epoch": 3.5967173252279636, + "grad_norm": 0.06837364821142979, + "learning_rate": 2.462581066869224e-07, + "loss": 0.5018, + "step": 7401 + }, + { + "epoch": 3.5972036474164133, + "grad_norm": 0.07631546900480853, + "learning_rate": 2.456649765673619e-07, + "loss": 0.5691, + "step": 7402 + }, + { + "epoch": 3.597689969604863, + "grad_norm": 0.07000531815975064, + "learning_rate": 2.4507254361704314e-07, + "loss": 0.5129, + "step": 7403 + }, + { + "epoch": 3.598176291793313, + "grad_norm": 0.06757217200901171, + "learning_rate": 2.444808079228406e-07, + "loss": 0.5149, + "step": 7404 + }, + { + "epoch": 3.598662613981763, + "grad_norm": 0.07296253687726682, + "learning_rate": 2.438897695715253e-07, + "loss": 0.5365, + "step": 7405 + }, + { + "epoch": 3.599148936170213, + "grad_norm": 0.07088652055812886, + "learning_rate": 2.432994286497653e-07, + "loss": 0.5009, + "step": 7406 + }, + { + "epoch": 3.5996352583586626, + "grad_norm": 0.07235536573855218, + "learning_rate": 2.427097852441285e-07, + "loss": 0.5372, + "step": 7407 + }, + { + "epoch": 3.6001215805471123, + "grad_norm": 0.07394500721827958, + "learning_rate": 2.42120839441079e-07, + "loss": 0.4965, + "step": 7408 + }, + { + "epoch": 3.6006079027355624, + "grad_norm": 0.07160582200171706, + "learning_rate": 2.415325913269795e-07, + "loss": 0.5119, + "step": 7409 + }, + { + "epoch": 3.601094224924012, + "grad_norm": 0.06897702814331691, + "learning_rate": 2.4094504098808866e-07, + "loss": 0.4893, + "step": 7410 + }, + { + "epoch": 3.6015805471124622, + "grad_norm": 0.07247169442824791, + "learning_rate": 2.403581885105655e-07, + "loss": 0.5441, + "step": 7411 + }, + { + "epoch": 3.602066869300912, + "grad_norm": 0.07183658560987477, + "learning_rate": 2.397720339804649e-07, + "loss": 0.5198, + "step": 7412 + }, + { + "epoch": 3.6025531914893616, + "grad_norm": 0.07276708641966347, + "learning_rate": 2.3918657748373875e-07, + "loss": 0.5024, + "step": 7413 + }, + { + "epoch": 3.6030395136778113, + "grad_norm": 0.07014591244813682, + "learning_rate": 2.386018191062389e-07, + "loss": 0.5102, + "step": 7414 + }, + { + "epoch": 3.6035258358662614, + "grad_norm": 0.07062201830488386, + "learning_rate": 2.3801775893371293e-07, + "loss": 0.4936, + "step": 7415 + }, + { + "epoch": 3.604012158054711, + "grad_norm": 0.071802947917923, + "learning_rate": 2.3743439705180725e-07, + "loss": 0.527, + "step": 7416 + }, + { + "epoch": 3.604498480243161, + "grad_norm": 0.06784027054515912, + "learning_rate": 2.368517335460635e-07, + "loss": 0.4961, + "step": 7417 + }, + { + "epoch": 3.604984802431611, + "grad_norm": 0.06887758033600011, + "learning_rate": 2.362697685019244e-07, + "loss": 0.4993, + "step": 7418 + }, + { + "epoch": 3.6054711246200606, + "grad_norm": 0.0748083992245796, + "learning_rate": 2.3568850200472838e-07, + "loss": 0.5514, + "step": 7419 + }, + { + "epoch": 3.6059574468085107, + "grad_norm": 0.07297176482559332, + "learning_rate": 2.3510793413971167e-07, + "loss": 0.4907, + "step": 7420 + }, + { + "epoch": 3.6064437689969604, + "grad_norm": 0.0688853916093152, + "learning_rate": 2.3452806499200675e-07, + "loss": 0.5083, + "step": 7421 + }, + { + "epoch": 3.6069300911854105, + "grad_norm": 0.07102867200205872, + "learning_rate": 2.339488946466456e-07, + "loss": 0.5431, + "step": 7422 + }, + { + "epoch": 3.60741641337386, + "grad_norm": 0.07223226914649179, + "learning_rate": 2.3337042318855695e-07, + "loss": 0.5469, + "step": 7423 + }, + { + "epoch": 3.60790273556231, + "grad_norm": 0.07277685605012142, + "learning_rate": 2.3279265070256741e-07, + "loss": 0.5139, + "step": 7424 + }, + { + "epoch": 3.60838905775076, + "grad_norm": 0.07142077991011306, + "learning_rate": 2.3221557727340026e-07, + "loss": 0.5004, + "step": 7425 + }, + { + "epoch": 3.6088753799392097, + "grad_norm": 0.0720272062001332, + "learning_rate": 2.3163920298567677e-07, + "loss": 0.5432, + "step": 7426 + }, + { + "epoch": 3.60936170212766, + "grad_norm": 0.07274183132731583, + "learning_rate": 2.3106352792391595e-07, + "loss": 0.5265, + "step": 7427 + }, + { + "epoch": 3.6098480243161095, + "grad_norm": 0.06988868757771198, + "learning_rate": 2.3048855217253363e-07, + "loss": 0.4977, + "step": 7428 + }, + { + "epoch": 3.610334346504559, + "grad_norm": 0.07442318782605123, + "learning_rate": 2.2991427581584402e-07, + "loss": 0.5421, + "step": 7429 + }, + { + "epoch": 3.610820668693009, + "grad_norm": 0.07098783879347895, + "learning_rate": 2.293406989380581e-07, + "loss": 0.5061, + "step": 7430 + }, + { + "epoch": 3.611306990881459, + "grad_norm": 0.06958367767745825, + "learning_rate": 2.2876782162328415e-07, + "loss": 0.5026, + "step": 7431 + }, + { + "epoch": 3.6117933130699087, + "grad_norm": 0.06951823437829387, + "learning_rate": 2.281956439555283e-07, + "loss": 0.526, + "step": 7432 + }, + { + "epoch": 3.612279635258359, + "grad_norm": 0.06748836588914339, + "learning_rate": 2.276241660186934e-07, + "loss": 0.4738, + "step": 7433 + }, + { + "epoch": 3.6127659574468085, + "grad_norm": 0.07240365452193252, + "learning_rate": 2.2705338789658082e-07, + "loss": 0.5397, + "step": 7434 + }, + { + "epoch": 3.613252279635258, + "grad_norm": 0.0695118761076568, + "learning_rate": 2.2648330967288857e-07, + "loss": 0.5136, + "step": 7435 + }, + { + "epoch": 3.6137386018237083, + "grad_norm": 0.07266416395887992, + "learning_rate": 2.25913931431212e-07, + "loss": 0.5085, + "step": 7436 + }, + { + "epoch": 3.614224924012158, + "grad_norm": 0.07251594605067847, + "learning_rate": 2.253452532550443e-07, + "loss": 0.5412, + "step": 7437 + }, + { + "epoch": 3.614711246200608, + "grad_norm": 0.07268244260928317, + "learning_rate": 2.247772752277755e-07, + "loss": 0.5122, + "step": 7438 + }, + { + "epoch": 3.615197568389058, + "grad_norm": 0.07528286822987387, + "learning_rate": 2.242099974326928e-07, + "loss": 0.5839, + "step": 7439 + }, + { + "epoch": 3.6156838905775075, + "grad_norm": 0.06895527780745686, + "learning_rate": 2.2364341995298133e-07, + "loss": 0.4772, + "step": 7440 + }, + { + "epoch": 3.616170212765957, + "grad_norm": 0.07364367874106896, + "learning_rate": 2.2307754287172302e-07, + "loss": 0.5136, + "step": 7441 + }, + { + "epoch": 3.6166565349544073, + "grad_norm": 0.07035932283324323, + "learning_rate": 2.2251236627189753e-07, + "loss": 0.5142, + "step": 7442 + }, + { + "epoch": 3.617142857142857, + "grad_norm": 0.07032631320183147, + "learning_rate": 2.2194789023638143e-07, + "loss": 0.5046, + "step": 7443 + }, + { + "epoch": 3.617629179331307, + "grad_norm": 0.0715205283156684, + "learning_rate": 2.2138411484794953e-07, + "loss": 0.4893, + "step": 7444 + }, + { + "epoch": 3.618115501519757, + "grad_norm": 0.06917054097695331, + "learning_rate": 2.2082104018927187e-07, + "loss": 0.495, + "step": 7445 + }, + { + "epoch": 3.6186018237082065, + "grad_norm": 0.07030850935382998, + "learning_rate": 2.2025866634291736e-07, + "loss": 0.5284, + "step": 7446 + }, + { + "epoch": 3.6190881458966566, + "grad_norm": 0.07325364317153965, + "learning_rate": 2.1969699339135232e-07, + "loss": 0.4958, + "step": 7447 + }, + { + "epoch": 3.6195744680851063, + "grad_norm": 0.07020402839254393, + "learning_rate": 2.1913602141693914e-07, + "loss": 0.4822, + "step": 7448 + }, + { + "epoch": 3.6200607902735564, + "grad_norm": 0.06963284597459264, + "learning_rate": 2.1857575050193757e-07, + "loss": 0.4835, + "step": 7449 + }, + { + "epoch": 3.620547112462006, + "grad_norm": 0.0691059181677648, + "learning_rate": 2.1801618072850639e-07, + "loss": 0.4757, + "step": 7450 + }, + { + "epoch": 3.621033434650456, + "grad_norm": 0.1402888849727775, + "learning_rate": 2.174573121786988e-07, + "loss": 0.5087, + "step": 7451 + }, + { + "epoch": 3.621519756838906, + "grad_norm": 0.07348022933877976, + "learning_rate": 2.1689914493446706e-07, + "loss": 0.518, + "step": 7452 + }, + { + "epoch": 3.6220060790273556, + "grad_norm": 0.06882365623797035, + "learning_rate": 2.1634167907766013e-07, + "loss": 0.4951, + "step": 7453 + }, + { + "epoch": 3.6224924012158057, + "grad_norm": 0.0705265521251137, + "learning_rate": 2.1578491469002372e-07, + "loss": 0.4737, + "step": 7454 + }, + { + "epoch": 3.6229787234042554, + "grad_norm": 0.07046128277836229, + "learning_rate": 2.1522885185320087e-07, + "loss": 0.5104, + "step": 7455 + }, + { + "epoch": 3.623465045592705, + "grad_norm": 0.07336965831753864, + "learning_rate": 2.14673490648733e-07, + "loss": 0.5429, + "step": 7456 + }, + { + "epoch": 3.623951367781155, + "grad_norm": 0.0725408501616935, + "learning_rate": 2.141188311580561e-07, + "loss": 0.5264, + "step": 7457 + }, + { + "epoch": 3.624437689969605, + "grad_norm": 0.07432858153177303, + "learning_rate": 2.1356487346250565e-07, + "loss": 0.5761, + "step": 7458 + }, + { + "epoch": 3.6249240121580546, + "grad_norm": 0.07124357527045246, + "learning_rate": 2.130116176433128e-07, + "loss": 0.5113, + "step": 7459 + }, + { + "epoch": 3.6254103343465047, + "grad_norm": 0.07204968916253275, + "learning_rate": 2.1245906378160653e-07, + "loss": 0.5435, + "step": 7460 + }, + { + "epoch": 3.6258966565349544, + "grad_norm": 0.07539146703039709, + "learning_rate": 2.1190721195841258e-07, + "loss": 0.5377, + "step": 7461 + }, + { + "epoch": 3.626382978723404, + "grad_norm": 0.07215788789355801, + "learning_rate": 2.1135606225465343e-07, + "loss": 0.5342, + "step": 7462 + }, + { + "epoch": 3.626869300911854, + "grad_norm": 0.0712556716268439, + "learning_rate": 2.1080561475114891e-07, + "loss": 0.5063, + "step": 7463 + }, + { + "epoch": 3.627355623100304, + "grad_norm": 0.06997387064185723, + "learning_rate": 2.1025586952861608e-07, + "loss": 0.5159, + "step": 7464 + }, + { + "epoch": 3.627841945288754, + "grad_norm": 0.07428476347166349, + "learning_rate": 2.0970682666766884e-07, + "loss": 0.5314, + "step": 7465 + }, + { + "epoch": 3.6283282674772037, + "grad_norm": 0.0723191926741368, + "learning_rate": 2.091584862488183e-07, + "loss": 0.5502, + "step": 7466 + }, + { + "epoch": 3.6288145896656534, + "grad_norm": 0.07074671122403996, + "learning_rate": 2.0861084835247237e-07, + "loss": 0.5088, + "step": 7467 + }, + { + "epoch": 3.629300911854103, + "grad_norm": 0.07326909417220616, + "learning_rate": 2.0806391305893568e-07, + "loss": 0.5288, + "step": 7468 + }, + { + "epoch": 3.629787234042553, + "grad_norm": 0.06985561967981996, + "learning_rate": 2.0751768044841027e-07, + "loss": 0.4751, + "step": 7469 + }, + { + "epoch": 3.630273556231003, + "grad_norm": 0.0699171773158737, + "learning_rate": 2.0697215060099417e-07, + "loss": 0.4986, + "step": 7470 + }, + { + "epoch": 3.630759878419453, + "grad_norm": 0.06846680490778347, + "learning_rate": 2.0642732359668294e-07, + "loss": 0.4787, + "step": 7471 + }, + { + "epoch": 3.6312462006079027, + "grad_norm": 0.0705745900938879, + "learning_rate": 2.0588319951537095e-07, + "loss": 0.5104, + "step": 7472 + }, + { + "epoch": 3.6317325227963524, + "grad_norm": 0.06908864481648404, + "learning_rate": 2.0533977843684716e-07, + "loss": 0.4798, + "step": 7473 + }, + { + "epoch": 3.6322188449848025, + "grad_norm": 0.0703322752196678, + "learning_rate": 2.0479706044079784e-07, + "loss": 0.5262, + "step": 7474 + }, + { + "epoch": 3.632705167173252, + "grad_norm": 0.0686731730665516, + "learning_rate": 2.0425504560680654e-07, + "loss": 0.4865, + "step": 7475 + }, + { + "epoch": 3.6331914893617023, + "grad_norm": 0.07202493190236539, + "learning_rate": 2.03713734014353e-07, + "loss": 0.5308, + "step": 7476 + }, + { + "epoch": 3.633677811550152, + "grad_norm": 0.06979792559986302, + "learning_rate": 2.0317312574281544e-07, + "loss": 0.4934, + "step": 7477 + }, + { + "epoch": 3.6341641337386017, + "grad_norm": 0.07473781976748568, + "learning_rate": 2.0263322087146708e-07, + "loss": 0.5192, + "step": 7478 + }, + { + "epoch": 3.634650455927052, + "grad_norm": 0.0703706668541725, + "learning_rate": 2.020940194794796e-07, + "loss": 0.5048, + "step": 7479 + }, + { + "epoch": 3.6351367781155015, + "grad_norm": 0.06992478243465769, + "learning_rate": 2.015555216459203e-07, + "loss": 0.5145, + "step": 7480 + }, + { + "epoch": 3.6356231003039516, + "grad_norm": 0.07310334378802451, + "learning_rate": 2.0101772744975324e-07, + "loss": 0.4975, + "step": 7481 + }, + { + "epoch": 3.6361094224924013, + "grad_norm": 0.06962887525056555, + "learning_rate": 2.0048063696984088e-07, + "loss": 0.4973, + "step": 7482 + }, + { + "epoch": 3.636595744680851, + "grad_norm": 0.07252029821221445, + "learning_rate": 1.9994425028494137e-07, + "loss": 0.5351, + "step": 7483 + }, + { + "epoch": 3.6370820668693007, + "grad_norm": 0.07624364882914925, + "learning_rate": 1.9940856747370895e-07, + "loss": 0.53, + "step": 7484 + }, + { + "epoch": 3.637568389057751, + "grad_norm": 0.07283865369015942, + "learning_rate": 1.988735886146953e-07, + "loss": 0.5472, + "step": 7485 + }, + { + "epoch": 3.6380547112462005, + "grad_norm": 0.06889612348561956, + "learning_rate": 1.9833931378634985e-07, + "loss": 0.4637, + "step": 7486 + }, + { + "epoch": 3.6385410334346506, + "grad_norm": 0.06862095008992869, + "learning_rate": 1.9780574306701715e-07, + "loss": 0.5111, + "step": 7487 + }, + { + "epoch": 3.6390273556231003, + "grad_norm": 0.07006805649849787, + "learning_rate": 1.972728765349402e-07, + "loss": 0.4975, + "step": 7488 + }, + { + "epoch": 3.63951367781155, + "grad_norm": 0.11638624971168805, + "learning_rate": 1.9674071426825647e-07, + "loss": 0.5266, + "step": 7489 + }, + { + "epoch": 3.64, + "grad_norm": 0.07362129423013457, + "learning_rate": 1.96209256345003e-07, + "loss": 0.5358, + "step": 7490 + }, + { + "epoch": 3.64048632218845, + "grad_norm": 0.07234328561646522, + "learning_rate": 1.9567850284311185e-07, + "loss": 0.5348, + "step": 7491 + }, + { + "epoch": 3.6409726443769, + "grad_norm": 0.07036638198076449, + "learning_rate": 1.9514845384041081e-07, + "loss": 0.5021, + "step": 7492 + }, + { + "epoch": 3.6414589665653496, + "grad_norm": 0.07183510166582527, + "learning_rate": 1.9461910941462657e-07, + "loss": 0.5171, + "step": 7493 + }, + { + "epoch": 3.6419452887537993, + "grad_norm": 0.07121910445789473, + "learning_rate": 1.9409046964338152e-07, + "loss": 0.5131, + "step": 7494 + }, + { + "epoch": 3.642431610942249, + "grad_norm": 0.07123112984882467, + "learning_rate": 1.9356253460419416e-07, + "loss": 0.5231, + "step": 7495 + }, + { + "epoch": 3.642917933130699, + "grad_norm": 0.07230851178721137, + "learning_rate": 1.9303530437448036e-07, + "loss": 0.5285, + "step": 7496 + }, + { + "epoch": 3.643404255319149, + "grad_norm": 0.07180968000417191, + "learning_rate": 1.9250877903155329e-07, + "loss": 0.5605, + "step": 7497 + }, + { + "epoch": 3.643890577507599, + "grad_norm": 0.07193875215347835, + "learning_rate": 1.9198295865262063e-07, + "loss": 0.5146, + "step": 7498 + }, + { + "epoch": 3.6443768996960486, + "grad_norm": 0.07302790121597812, + "learning_rate": 1.914578433147879e-07, + "loss": 0.5255, + "step": 7499 + }, + { + "epoch": 3.6448632218844983, + "grad_norm": 0.07130233395149034, + "learning_rate": 1.9093343309505797e-07, + "loss": 0.5055, + "step": 7500 + }, + { + "epoch": 3.6453495440729484, + "grad_norm": 0.07203586544334373, + "learning_rate": 1.9040972807032988e-07, + "loss": 0.5238, + "step": 7501 + }, + { + "epoch": 3.645835866261398, + "grad_norm": 0.0728670850843985, + "learning_rate": 1.8988672831739828e-07, + "loss": 0.5151, + "step": 7502 + }, + { + "epoch": 3.6463221884498482, + "grad_norm": 0.07058333820381851, + "learning_rate": 1.8936443391295578e-07, + "loss": 0.484, + "step": 7503 + }, + { + "epoch": 3.646808510638298, + "grad_norm": 0.06888119281513207, + "learning_rate": 1.888428449335905e-07, + "loss": 0.4772, + "step": 7504 + }, + { + "epoch": 3.6472948328267476, + "grad_norm": 0.07005706663977893, + "learning_rate": 1.883219614557874e-07, + "loss": 0.4864, + "step": 7505 + }, + { + "epoch": 3.6477811550151977, + "grad_norm": 0.07017465080421058, + "learning_rate": 1.878017835559287e-07, + "loss": 0.4854, + "step": 7506 + }, + { + "epoch": 3.6482674772036474, + "grad_norm": 0.069891375391845, + "learning_rate": 1.872823113102923e-07, + "loss": 0.4784, + "step": 7507 + }, + { + "epoch": 3.6487537993920975, + "grad_norm": 0.07324580049276005, + "learning_rate": 1.867635447950522e-07, + "loss": 0.5448, + "step": 7508 + }, + { + "epoch": 3.6492401215805472, + "grad_norm": 0.07277742233402125, + "learning_rate": 1.8624548408628152e-07, + "loss": 0.5059, + "step": 7509 + }, + { + "epoch": 3.649726443768997, + "grad_norm": 0.07001265792556749, + "learning_rate": 1.857281292599461e-07, + "loss": 0.5018, + "step": 7510 + }, + { + "epoch": 3.6502127659574466, + "grad_norm": 0.06979377235499186, + "learning_rate": 1.852114803919114e-07, + "loss": 0.4886, + "step": 7511 + }, + { + "epoch": 3.6506990881458967, + "grad_norm": 0.06877636540563002, + "learning_rate": 1.846955375579379e-07, + "loss": 0.4833, + "step": 7512 + }, + { + "epoch": 3.6511854103343464, + "grad_norm": 0.07306061608472722, + "learning_rate": 1.8418030083368178e-07, + "loss": 0.5144, + "step": 7513 + }, + { + "epoch": 3.6516717325227965, + "grad_norm": 0.06852282899311782, + "learning_rate": 1.8366577029469701e-07, + "loss": 0.5075, + "step": 7514 + }, + { + "epoch": 3.652158054711246, + "grad_norm": 0.07112433008367616, + "learning_rate": 1.8315194601643439e-07, + "loss": 0.5191, + "step": 7515 + }, + { + "epoch": 3.652644376899696, + "grad_norm": 0.06898192783108989, + "learning_rate": 1.8263882807423972e-07, + "loss": 0.4863, + "step": 7516 + }, + { + "epoch": 3.653130699088146, + "grad_norm": 0.0690880864184972, + "learning_rate": 1.8212641654335618e-07, + "loss": 0.4798, + "step": 7517 + }, + { + "epoch": 3.6536170212765957, + "grad_norm": 0.07184164267834733, + "learning_rate": 1.8161471149892306e-07, + "loss": 0.5303, + "step": 7518 + }, + { + "epoch": 3.654103343465046, + "grad_norm": 0.0739491901871606, + "learning_rate": 1.8110371301597596e-07, + "loss": 0.5295, + "step": 7519 + }, + { + "epoch": 3.6545896656534955, + "grad_norm": 0.07362011800370305, + "learning_rate": 1.8059342116944711e-07, + "loss": 0.5315, + "step": 7520 + }, + { + "epoch": 3.655075987841945, + "grad_norm": 0.07119662997312755, + "learning_rate": 1.8008383603416558e-07, + "loss": 0.5241, + "step": 7521 + }, + { + "epoch": 3.655562310030395, + "grad_norm": 0.07032375495360829, + "learning_rate": 1.7957495768485543e-07, + "loss": 0.5152, + "step": 7522 + }, + { + "epoch": 3.656048632218845, + "grad_norm": 0.07334274070229832, + "learning_rate": 1.7906678619613814e-07, + "loss": 0.5285, + "step": 7523 + }, + { + "epoch": 3.6565349544072947, + "grad_norm": 0.07324739680703293, + "learning_rate": 1.7855932164253133e-07, + "loss": 0.5604, + "step": 7524 + }, + { + "epoch": 3.657021276595745, + "grad_norm": 0.07191865658351163, + "learning_rate": 1.7805256409844873e-07, + "loss": 0.5109, + "step": 7525 + }, + { + "epoch": 3.6575075987841945, + "grad_norm": 0.06996314492438994, + "learning_rate": 1.7754651363820042e-07, + "loss": 0.5038, + "step": 7526 + }, + { + "epoch": 3.657993920972644, + "grad_norm": 0.0702110192894247, + "learning_rate": 1.7704117033599477e-07, + "loss": 0.4843, + "step": 7527 + }, + { + "epoch": 3.6584802431610943, + "grad_norm": 0.07133565464454686, + "learning_rate": 1.7653653426593197e-07, + "loss": 0.5104, + "step": 7528 + }, + { + "epoch": 3.658966565349544, + "grad_norm": 0.07254986884328445, + "learning_rate": 1.7603260550201284e-07, + "loss": 0.5075, + "step": 7529 + }, + { + "epoch": 3.659452887537994, + "grad_norm": 0.07234594342644482, + "learning_rate": 1.7552938411813214e-07, + "loss": 0.494, + "step": 7530 + }, + { + "epoch": 3.659939209726444, + "grad_norm": 0.06997898847699532, + "learning_rate": 1.750268701880814e-07, + "loss": 0.5021, + "step": 7531 + }, + { + "epoch": 3.6604255319148935, + "grad_norm": 0.06841732536213396, + "learning_rate": 1.7452506378554945e-07, + "loss": 0.5122, + "step": 7532 + }, + { + "epoch": 3.660911854103343, + "grad_norm": 0.0715954332291871, + "learning_rate": 1.7402396498411967e-07, + "loss": 0.5223, + "step": 7533 + }, + { + "epoch": 3.6613981762917933, + "grad_norm": 0.07156559770059086, + "learning_rate": 1.7352357385727326e-07, + "loss": 0.5058, + "step": 7534 + }, + { + "epoch": 3.661884498480243, + "grad_norm": 0.07099284531232679, + "learning_rate": 1.7302389047838597e-07, + "loss": 0.4979, + "step": 7535 + }, + { + "epoch": 3.662370820668693, + "grad_norm": 0.07162207060421072, + "learning_rate": 1.7252491492073143e-07, + "loss": 0.5191, + "step": 7536 + }, + { + "epoch": 3.662857142857143, + "grad_norm": 0.07145515138662749, + "learning_rate": 1.7202664725747885e-07, + "loss": 0.523, + "step": 7537 + }, + { + "epoch": 3.6633434650455925, + "grad_norm": 0.07041061927338907, + "learning_rate": 1.715290875616926e-07, + "loss": 0.484, + "step": 7538 + }, + { + "epoch": 3.6638297872340426, + "grad_norm": 0.07289429657436966, + "learning_rate": 1.7103223590633489e-07, + "loss": 0.5221, + "step": 7539 + }, + { + "epoch": 3.6643161094224923, + "grad_norm": 0.0698684944550393, + "learning_rate": 1.705360923642635e-07, + "loss": 0.5334, + "step": 7540 + }, + { + "epoch": 3.6648024316109424, + "grad_norm": 0.06973663585020563, + "learning_rate": 1.7004065700823192e-07, + "loss": 0.4884, + "step": 7541 + }, + { + "epoch": 3.665288753799392, + "grad_norm": 0.06965277173611224, + "learning_rate": 1.6954592991088982e-07, + "loss": 0.4983, + "step": 7542 + }, + { + "epoch": 3.665775075987842, + "grad_norm": 0.0712242615242087, + "learning_rate": 1.6905191114478415e-07, + "loss": 0.5297, + "step": 7543 + }, + { + "epoch": 3.666261398176292, + "grad_norm": 0.06993880782261867, + "learning_rate": 1.6855860078235642e-07, + "loss": 0.483, + "step": 7544 + }, + { + "epoch": 3.6667477203647416, + "grad_norm": 0.07168190079168284, + "learning_rate": 1.6806599889594488e-07, + "loss": 0.5286, + "step": 7545 + }, + { + "epoch": 3.6672340425531917, + "grad_norm": 0.0714318564216297, + "learning_rate": 1.6757410555778454e-07, + "loss": 0.5334, + "step": 7546 + }, + { + "epoch": 3.6677203647416414, + "grad_norm": 0.06996352764558236, + "learning_rate": 1.67082920840006e-07, + "loss": 0.4922, + "step": 7547 + }, + { + "epoch": 3.668206686930091, + "grad_norm": 0.07091207684774888, + "learning_rate": 1.6659244481463553e-07, + "loss": 0.4977, + "step": 7548 + }, + { + "epoch": 3.668693009118541, + "grad_norm": 0.07038717321528706, + "learning_rate": 1.661026775535962e-07, + "loss": 0.4868, + "step": 7549 + }, + { + "epoch": 3.669179331306991, + "grad_norm": 0.06978806215257613, + "learning_rate": 1.6561361912870667e-07, + "loss": 0.4914, + "step": 7550 + }, + { + "epoch": 3.6696656534954406, + "grad_norm": 0.07232564978162159, + "learning_rate": 1.6512526961168173e-07, + "loss": 0.5275, + "step": 7551 + }, + { + "epoch": 3.6701519756838907, + "grad_norm": 0.0695353219389521, + "learning_rate": 1.646376290741325e-07, + "loss": 0.5272, + "step": 7552 + }, + { + "epoch": 3.6706382978723404, + "grad_norm": 0.07076372926681185, + "learning_rate": 1.6415069758756564e-07, + "loss": 0.5244, + "step": 7553 + }, + { + "epoch": 3.67112462006079, + "grad_norm": 0.06975678337646281, + "learning_rate": 1.636644752233846e-07, + "loss": 0.4836, + "step": 7554 + }, + { + "epoch": 3.6716109422492402, + "grad_norm": 0.07084517689200552, + "learning_rate": 1.631789620528873e-07, + "loss": 0.5405, + "step": 7555 + }, + { + "epoch": 3.67209726443769, + "grad_norm": 0.06926883388737212, + "learning_rate": 1.6269415814727018e-07, + "loss": 0.4942, + "step": 7556 + }, + { + "epoch": 3.67258358662614, + "grad_norm": 0.07095701886476784, + "learning_rate": 1.6221006357762304e-07, + "loss": 0.5147, + "step": 7557 + }, + { + "epoch": 3.6730699088145897, + "grad_norm": 0.06857787415435455, + "learning_rate": 1.6172667841493351e-07, + "loss": 0.5008, + "step": 7558 + }, + { + "epoch": 3.6735562310030394, + "grad_norm": 0.07206429092235508, + "learning_rate": 1.6124400273008434e-07, + "loss": 0.5125, + "step": 7559 + }, + { + "epoch": 3.674042553191489, + "grad_norm": 0.07299348530193564, + "learning_rate": 1.6076203659385503e-07, + "loss": 0.5021, + "step": 7560 + }, + { + "epoch": 3.674528875379939, + "grad_norm": 0.06916757741865318, + "learning_rate": 1.6028078007691962e-07, + "loss": 0.521, + "step": 7561 + }, + { + "epoch": 3.675015197568389, + "grad_norm": 0.07063395884421529, + "learning_rate": 1.598002332498483e-07, + "loss": 0.5087, + "step": 7562 + }, + { + "epoch": 3.675501519756839, + "grad_norm": 0.07063914656245275, + "learning_rate": 1.5932039618310913e-07, + "loss": 0.5236, + "step": 7563 + }, + { + "epoch": 3.6759878419452887, + "grad_norm": 0.07278125313816183, + "learning_rate": 1.588412689470642e-07, + "loss": 0.5291, + "step": 7564 + }, + { + "epoch": 3.6764741641337384, + "grad_norm": 0.06915412333874485, + "learning_rate": 1.583628516119723e-07, + "loss": 0.5126, + "step": 7565 + }, + { + "epoch": 3.6769604863221885, + "grad_norm": 0.07255686904347249, + "learning_rate": 1.5788514424798785e-07, + "loss": 0.5161, + "step": 7566 + }, + { + "epoch": 3.677446808510638, + "grad_norm": 0.07220331901678488, + "learning_rate": 1.574081469251615e-07, + "loss": 0.5014, + "step": 7567 + }, + { + "epoch": 3.6779331306990883, + "grad_norm": 0.07088416458188823, + "learning_rate": 1.5693185971343895e-07, + "loss": 0.4972, + "step": 7568 + }, + { + "epoch": 3.678419452887538, + "grad_norm": 0.07159320842660802, + "learning_rate": 1.564562826826621e-07, + "loss": 0.5075, + "step": 7569 + }, + { + "epoch": 3.6789057750759877, + "grad_norm": 0.07133548899316397, + "learning_rate": 1.5598141590256966e-07, + "loss": 0.5155, + "step": 7570 + }, + { + "epoch": 3.679392097264438, + "grad_norm": 0.07148643741355755, + "learning_rate": 1.5550725944279476e-07, + "loss": 0.4651, + "step": 7571 + }, + { + "epoch": 3.6798784194528875, + "grad_norm": 0.07042599246655751, + "learning_rate": 1.550338133728674e-07, + "loss": 0.4801, + "step": 7572 + }, + { + "epoch": 3.6803647416413376, + "grad_norm": 0.07010151777419237, + "learning_rate": 1.5456107776221363e-07, + "loss": 0.4825, + "step": 7573 + }, + { + "epoch": 3.6808510638297873, + "grad_norm": 0.0692687248738914, + "learning_rate": 1.5408905268015361e-07, + "loss": 0.4857, + "step": 7574 + }, + { + "epoch": 3.681337386018237, + "grad_norm": 0.07088841546605369, + "learning_rate": 1.5361773819590585e-07, + "loss": 0.4897, + "step": 7575 + }, + { + "epoch": 3.6818237082066867, + "grad_norm": 0.0708516927536673, + "learning_rate": 1.5314713437858174e-07, + "loss": 0.5234, + "step": 7576 + }, + { + "epoch": 3.682310030395137, + "grad_norm": 0.07051250862511367, + "learning_rate": 1.5267724129719108e-07, + "loss": 0.4981, + "step": 7577 + }, + { + "epoch": 3.6827963525835865, + "grad_norm": 0.07274738439743421, + "learning_rate": 1.5220805902063762e-07, + "loss": 0.4884, + "step": 7578 + }, + { + "epoch": 3.6832826747720366, + "grad_norm": 0.07226312187858681, + "learning_rate": 1.5173958761772246e-07, + "loss": 0.5242, + "step": 7579 + }, + { + "epoch": 3.6837689969604863, + "grad_norm": 0.06949428643138196, + "learning_rate": 1.5127182715714006e-07, + "loss": 0.5074, + "step": 7580 + }, + { + "epoch": 3.684255319148936, + "grad_norm": 0.07101417643479951, + "learning_rate": 1.5080477770748392e-07, + "loss": 0.5152, + "step": 7581 + }, + { + "epoch": 3.684741641337386, + "grad_norm": 0.07034009047803332, + "learning_rate": 1.503384393372409e-07, + "loss": 0.4962, + "step": 7582 + }, + { + "epoch": 3.685227963525836, + "grad_norm": 0.07060814694846111, + "learning_rate": 1.4987281211479466e-07, + "loss": 0.4946, + "step": 7583 + }, + { + "epoch": 3.685714285714286, + "grad_norm": 0.07068001761701923, + "learning_rate": 1.4940789610842332e-07, + "loss": 0.5521, + "step": 7584 + }, + { + "epoch": 3.6862006079027356, + "grad_norm": 0.07075723212084156, + "learning_rate": 1.4894369138630182e-07, + "loss": 0.4936, + "step": 7585 + }, + { + "epoch": 3.6866869300911853, + "grad_norm": 0.07259529763174113, + "learning_rate": 1.484801980165007e-07, + "loss": 0.4844, + "step": 7586 + }, + { + "epoch": 3.687173252279635, + "grad_norm": 0.0721453433478143, + "learning_rate": 1.480174160669856e-07, + "loss": 0.5082, + "step": 7587 + }, + { + "epoch": 3.687659574468085, + "grad_norm": 0.0704804282711398, + "learning_rate": 1.475553456056189e-07, + "loss": 0.5209, + "step": 7588 + }, + { + "epoch": 3.688145896656535, + "grad_norm": 0.06964632010285582, + "learning_rate": 1.4709398670015752e-07, + "loss": 0.4896, + "step": 7589 + }, + { + "epoch": 3.688632218844985, + "grad_norm": 0.07000606698519893, + "learning_rate": 1.4663333941825452e-07, + "loss": 0.5161, + "step": 7590 + }, + { + "epoch": 3.6891185410334346, + "grad_norm": 0.07251020833609347, + "learning_rate": 1.461734038274587e-07, + "loss": 0.4848, + "step": 7591 + }, + { + "epoch": 3.6896048632218843, + "grad_norm": 0.07640300754390118, + "learning_rate": 1.4571417999521442e-07, + "loss": 0.5622, + "step": 7592 + }, + { + "epoch": 3.6900911854103344, + "grad_norm": 0.07157233023252813, + "learning_rate": 1.4525566798886115e-07, + "loss": 0.5161, + "step": 7593 + }, + { + "epoch": 3.690577507598784, + "grad_norm": 0.07105282903720113, + "learning_rate": 1.4479786787563565e-07, + "loss": 0.5088, + "step": 7594 + }, + { + "epoch": 3.6910638297872342, + "grad_norm": 0.07691002610042437, + "learning_rate": 1.4434077972266757e-07, + "loss": 0.5577, + "step": 7595 + }, + { + "epoch": 3.691550151975684, + "grad_norm": 0.07154039785140516, + "learning_rate": 1.4388440359698496e-07, + "loss": 0.5417, + "step": 7596 + }, + { + "epoch": 3.6920364741641336, + "grad_norm": 0.07288494659992656, + "learning_rate": 1.4342873956550928e-07, + "loss": 0.5267, + "step": 7597 + }, + { + "epoch": 3.6925227963525837, + "grad_norm": 0.07134174873389273, + "learning_rate": 1.4297378769505876e-07, + "loss": 0.5447, + "step": 7598 + }, + { + "epoch": 3.6930091185410334, + "grad_norm": 0.07465082134896685, + "learning_rate": 1.425195480523478e-07, + "loss": 0.5342, + "step": 7599 + }, + { + "epoch": 3.6934954407294835, + "grad_norm": 0.07066529622833347, + "learning_rate": 1.4206602070398424e-07, + "loss": 0.5055, + "step": 7600 + }, + { + "epoch": 3.6939817629179332, + "grad_norm": 0.07340587027710287, + "learning_rate": 1.4161320571647374e-07, + "loss": 0.5202, + "step": 7601 + }, + { + "epoch": 3.694468085106383, + "grad_norm": 0.07252197267251007, + "learning_rate": 1.4116110315621546e-07, + "loss": 0.5129, + "step": 7602 + }, + { + "epoch": 3.6949544072948326, + "grad_norm": 0.07066841715628236, + "learning_rate": 1.4070971308950577e-07, + "loss": 0.4969, + "step": 7603 + }, + { + "epoch": 3.6954407294832827, + "grad_norm": 0.07080447583813533, + "learning_rate": 1.4025903558253673e-07, + "loss": 0.5166, + "step": 7604 + }, + { + "epoch": 3.6959270516717324, + "grad_norm": 0.06927407891020573, + "learning_rate": 1.3980907070139328e-07, + "loss": 0.4804, + "step": 7605 + }, + { + "epoch": 3.6964133738601825, + "grad_norm": 0.07047311460358835, + "learning_rate": 1.3935981851205815e-07, + "loss": 0.5028, + "step": 7606 + }, + { + "epoch": 3.6968996960486322, + "grad_norm": 0.0701806178313648, + "learning_rate": 1.389112790804098e-07, + "loss": 0.5085, + "step": 7607 + }, + { + "epoch": 3.697386018237082, + "grad_norm": 0.07068318754242246, + "learning_rate": 1.3846345247222115e-07, + "loss": 0.4903, + "step": 7608 + }, + { + "epoch": 3.697872340425532, + "grad_norm": 0.0732179315736856, + "learning_rate": 1.3801633875316078e-07, + "loss": 0.5198, + "step": 7609 + }, + { + "epoch": 3.6983586626139817, + "grad_norm": 0.06900913217367231, + "learning_rate": 1.3756993798879237e-07, + "loss": 0.4831, + "step": 7610 + }, + { + "epoch": 3.698844984802432, + "grad_norm": 0.06800958306976836, + "learning_rate": 1.3712425024457633e-07, + "loss": 0.4504, + "step": 7611 + }, + { + "epoch": 3.6993313069908815, + "grad_norm": 0.07270882847618375, + "learning_rate": 1.3667927558586756e-07, + "loss": 0.4912, + "step": 7612 + }, + { + "epoch": 3.699817629179331, + "grad_norm": 0.07359378423019014, + "learning_rate": 1.3623501407791618e-07, + "loss": 0.5063, + "step": 7613 + }, + { + "epoch": 3.700303951367781, + "grad_norm": 0.06995003341319025, + "learning_rate": 1.3579146578586832e-07, + "loss": 0.4909, + "step": 7614 + }, + { + "epoch": 3.700790273556231, + "grad_norm": 0.07064228266289245, + "learning_rate": 1.3534863077476535e-07, + "loss": 0.4971, + "step": 7615 + }, + { + "epoch": 3.7012765957446807, + "grad_norm": 0.07338589569601295, + "learning_rate": 1.3490650910954306e-07, + "loss": 0.5741, + "step": 7616 + }, + { + "epoch": 3.701762917933131, + "grad_norm": 0.06836310783173091, + "learning_rate": 1.3446510085503516e-07, + "loss": 0.4851, + "step": 7617 + }, + { + "epoch": 3.7022492401215805, + "grad_norm": 0.07155419113637652, + "learning_rate": 1.3402440607596821e-07, + "loss": 0.5356, + "step": 7618 + }, + { + "epoch": 3.70273556231003, + "grad_norm": 0.06912456092294525, + "learning_rate": 1.335844248369661e-07, + "loss": 0.5103, + "step": 7619 + }, + { + "epoch": 3.7032218844984803, + "grad_norm": 0.07165302452954286, + "learning_rate": 1.3314515720254552e-07, + "loss": 0.5298, + "step": 7620 + }, + { + "epoch": 3.70370820668693, + "grad_norm": 0.07241053022989817, + "learning_rate": 1.3270660323712104e-07, + "loss": 0.5231, + "step": 7621 + }, + { + "epoch": 3.70419452887538, + "grad_norm": 0.07020504665779764, + "learning_rate": 1.3226876300500125e-07, + "loss": 0.5065, + "step": 7622 + }, + { + "epoch": 3.70468085106383, + "grad_norm": 0.07436683994800043, + "learning_rate": 1.318316365703909e-07, + "loss": 0.554, + "step": 7623 + }, + { + "epoch": 3.7051671732522795, + "grad_norm": 0.07107360024773648, + "learning_rate": 1.3139522399738924e-07, + "loss": 0.5351, + "step": 7624 + }, + { + "epoch": 3.7056534954407296, + "grad_norm": 0.06897596353895076, + "learning_rate": 1.3095952534999123e-07, + "loss": 0.5001, + "step": 7625 + }, + { + "epoch": 3.7061398176291793, + "grad_norm": 0.07009673119420902, + "learning_rate": 1.3052454069208686e-07, + "loss": 0.5154, + "step": 7626 + }, + { + "epoch": 3.7066261398176295, + "grad_norm": 0.06937188232905657, + "learning_rate": 1.3009027008746234e-07, + "loss": 0.4803, + "step": 7627 + }, + { + "epoch": 3.707112462006079, + "grad_norm": 0.07099904865956519, + "learning_rate": 1.2965671359979838e-07, + "loss": 0.4989, + "step": 7628 + }, + { + "epoch": 3.707598784194529, + "grad_norm": 0.0746563104742717, + "learning_rate": 1.2922387129267077e-07, + "loss": 0.5355, + "step": 7629 + }, + { + "epoch": 3.7080851063829785, + "grad_norm": 0.06922289823851557, + "learning_rate": 1.28791743229551e-07, + "loss": 0.4938, + "step": 7630 + }, + { + "epoch": 3.7085714285714286, + "grad_norm": 0.07208477780138299, + "learning_rate": 1.2836032947380616e-07, + "loss": 0.5141, + "step": 7631 + }, + { + "epoch": 3.7090577507598783, + "grad_norm": 0.06922684255046883, + "learning_rate": 1.2792963008869786e-07, + "loss": 0.4909, + "step": 7632 + }, + { + "epoch": 3.7095440729483284, + "grad_norm": 0.07215478226688445, + "learning_rate": 1.2749964513738277e-07, + "loss": 0.5166, + "step": 7633 + }, + { + "epoch": 3.710030395136778, + "grad_norm": 0.07173461218672605, + "learning_rate": 1.2707037468291438e-07, + "loss": 0.5329, + "step": 7634 + }, + { + "epoch": 3.710516717325228, + "grad_norm": 0.07670294994184858, + "learning_rate": 1.2664181878823955e-07, + "loss": 0.5679, + "step": 7635 + }, + { + "epoch": 3.711003039513678, + "grad_norm": 0.07347386059810658, + "learning_rate": 1.2621397751620135e-07, + "loss": 0.4993, + "step": 7636 + }, + { + "epoch": 3.7114893617021276, + "grad_norm": 0.07283229973830455, + "learning_rate": 1.257868509295379e-07, + "loss": 0.5362, + "step": 7637 + }, + { + "epoch": 3.7119756838905777, + "grad_norm": 0.0699449288709463, + "learning_rate": 1.253604390908819e-07, + "loss": 0.4791, + "step": 7638 + }, + { + "epoch": 3.7124620060790274, + "grad_norm": 0.06875317816670751, + "learning_rate": 1.249347420627628e-07, + "loss": 0.4817, + "step": 7639 + }, + { + "epoch": 3.712948328267477, + "grad_norm": 0.0708711370024746, + "learning_rate": 1.2450975990760395e-07, + "loss": 0.4902, + "step": 7640 + }, + { + "epoch": 3.713434650455927, + "grad_norm": 0.07414615114078611, + "learning_rate": 1.240854926877233e-07, + "loss": 0.5418, + "step": 7641 + }, + { + "epoch": 3.713920972644377, + "grad_norm": 0.07233836782519713, + "learning_rate": 1.2366194046533608e-07, + "loss": 0.5357, + "step": 7642 + }, + { + "epoch": 3.7144072948328266, + "grad_norm": 0.06927638489188884, + "learning_rate": 1.232391033025504e-07, + "loss": 0.4693, + "step": 7643 + }, + { + "epoch": 3.7148936170212767, + "grad_norm": 0.07170036375808124, + "learning_rate": 1.228169812613711e-07, + "loss": 0.4974, + "step": 7644 + }, + { + "epoch": 3.7153799392097264, + "grad_norm": 0.0706144044845616, + "learning_rate": 1.2239557440369754e-07, + "loss": 0.5294, + "step": 7645 + }, + { + "epoch": 3.715866261398176, + "grad_norm": 0.07164507648402191, + "learning_rate": 1.219748827913242e-07, + "loss": 0.5319, + "step": 7646 + }, + { + "epoch": 3.7163525835866262, + "grad_norm": 0.07018422264423925, + "learning_rate": 1.215549064859406e-07, + "loss": 0.4821, + "step": 7647 + }, + { + "epoch": 3.716838905775076, + "grad_norm": 0.07231904033091041, + "learning_rate": 1.2113564554913137e-07, + "loss": 0.5123, + "step": 7648 + }, + { + "epoch": 3.717325227963526, + "grad_norm": 0.07318609230931034, + "learning_rate": 1.2071710004237624e-07, + "loss": 0.4861, + "step": 7649 + }, + { + "epoch": 3.7178115501519757, + "grad_norm": 0.07023324809750443, + "learning_rate": 1.2029927002705112e-07, + "loss": 0.4837, + "step": 7650 + }, + { + "epoch": 3.7182978723404254, + "grad_norm": 0.070244922239307, + "learning_rate": 1.1988215556442474e-07, + "loss": 0.501, + "step": 7651 + }, + { + "epoch": 3.7187841945288755, + "grad_norm": 0.07211113681391308, + "learning_rate": 1.1946575671566373e-07, + "loss": 0.5224, + "step": 7652 + }, + { + "epoch": 3.7192705167173252, + "grad_norm": 0.0711248489328205, + "learning_rate": 1.1905007354182651e-07, + "loss": 0.5709, + "step": 7653 + }, + { + "epoch": 3.7197568389057754, + "grad_norm": 0.07233170031170742, + "learning_rate": 1.186351061038693e-07, + "loss": 0.5283, + "step": 7654 + }, + { + "epoch": 3.720243161094225, + "grad_norm": 0.07077937210014604, + "learning_rate": 1.1822085446264231e-07, + "loss": 0.516, + "step": 7655 + }, + { + "epoch": 3.7207294832826747, + "grad_norm": 0.07010215098590673, + "learning_rate": 1.1780731867889084e-07, + "loss": 0.4993, + "step": 7656 + }, + { + "epoch": 3.7212158054711244, + "grad_norm": 0.07403837525571441, + "learning_rate": 1.1739449881325471e-07, + "loss": 0.5342, + "step": 7657 + }, + { + "epoch": 3.7217021276595745, + "grad_norm": 0.07193425347874219, + "learning_rate": 1.1698239492626995e-07, + "loss": 0.5135, + "step": 7658 + }, + { + "epoch": 3.722188449848024, + "grad_norm": 0.07186607821697964, + "learning_rate": 1.1657100707836711e-07, + "loss": 0.5167, + "step": 7659 + }, + { + "epoch": 3.7226747720364743, + "grad_norm": 0.07210328771239458, + "learning_rate": 1.1616033532987014e-07, + "loss": 0.5126, + "step": 7660 + }, + { + "epoch": 3.723161094224924, + "grad_norm": 0.07167477408992676, + "learning_rate": 1.157503797410009e-07, + "loss": 0.514, + "step": 7661 + }, + { + "epoch": 3.7236474164133737, + "grad_norm": 0.07012051918891697, + "learning_rate": 1.1534114037187404e-07, + "loss": 0.4989, + "step": 7662 + }, + { + "epoch": 3.724133738601824, + "grad_norm": 0.07242985684220683, + "learning_rate": 1.1493261728249994e-07, + "loss": 0.4999, + "step": 7663 + }, + { + "epoch": 3.7246200607902735, + "grad_norm": 0.07247641802680536, + "learning_rate": 1.1452481053278398e-07, + "loss": 0.5159, + "step": 7664 + }, + { + "epoch": 3.7251063829787237, + "grad_norm": 0.07137726377360303, + "learning_rate": 1.1411772018252665e-07, + "loss": 0.5309, + "step": 7665 + }, + { + "epoch": 3.7255927051671733, + "grad_norm": 0.06982851917962289, + "learning_rate": 1.1371134629142189e-07, + "loss": 0.5013, + "step": 7666 + }, + { + "epoch": 3.726079027355623, + "grad_norm": 0.07031351800497866, + "learning_rate": 1.1330568891906202e-07, + "loss": 0.4836, + "step": 7667 + }, + { + "epoch": 3.7265653495440727, + "grad_norm": 0.07103266506018624, + "learning_rate": 1.1290074812493001e-07, + "loss": 0.5097, + "step": 7668 + }, + { + "epoch": 3.727051671732523, + "grad_norm": 0.06974900788385342, + "learning_rate": 1.1249652396840672e-07, + "loss": 0.5025, + "step": 7669 + }, + { + "epoch": 3.7275379939209725, + "grad_norm": 0.06901027156582124, + "learning_rate": 1.1209301650876636e-07, + "loss": 0.4599, + "step": 7670 + }, + { + "epoch": 3.7280243161094226, + "grad_norm": 0.07093595557651035, + "learning_rate": 1.1169022580517941e-07, + "loss": 0.4878, + "step": 7671 + }, + { + "epoch": 3.7285106382978723, + "grad_norm": 0.07052910320572137, + "learning_rate": 1.1128815191671083e-07, + "loss": 0.5008, + "step": 7672 + }, + { + "epoch": 3.728996960486322, + "grad_norm": 0.0731877231163336, + "learning_rate": 1.1088679490231957e-07, + "loss": 0.5672, + "step": 7673 + }, + { + "epoch": 3.729483282674772, + "grad_norm": 0.07339205740980259, + "learning_rate": 1.1048615482086023e-07, + "loss": 0.5442, + "step": 7674 + }, + { + "epoch": 3.729969604863222, + "grad_norm": 0.06903168525592246, + "learning_rate": 1.1008623173108191e-07, + "loss": 0.4853, + "step": 7675 + }, + { + "epoch": 3.730455927051672, + "grad_norm": 0.07523719574251578, + "learning_rate": 1.0968702569162992e-07, + "loss": 0.5342, + "step": 7676 + }, + { + "epoch": 3.7309422492401216, + "grad_norm": 0.06692655233097024, + "learning_rate": 1.092885367610419e-07, + "loss": 0.4608, + "step": 7677 + }, + { + "epoch": 3.7314285714285713, + "grad_norm": 0.07212617980149026, + "learning_rate": 1.088907649977522e-07, + "loss": 0.5513, + "step": 7678 + }, + { + "epoch": 3.731914893617021, + "grad_norm": 0.07122131620923024, + "learning_rate": 1.0849371046008971e-07, + "loss": 0.5099, + "step": 7679 + }, + { + "epoch": 3.732401215805471, + "grad_norm": 0.07405052838004117, + "learning_rate": 1.0809737320627733e-07, + "loss": 0.566, + "step": 7680 + }, + { + "epoch": 3.732887537993921, + "grad_norm": 0.07119708603134343, + "learning_rate": 1.0770175329443521e-07, + "loss": 0.5187, + "step": 7681 + }, + { + "epoch": 3.733373860182371, + "grad_norm": 0.07195301881303603, + "learning_rate": 1.0730685078257418e-07, + "loss": 0.5012, + "step": 7682 + }, + { + "epoch": 3.7338601823708206, + "grad_norm": 0.07274869256330098, + "learning_rate": 1.0691266572860348e-07, + "loss": 0.5193, + "step": 7683 + }, + { + "epoch": 3.7343465045592703, + "grad_norm": 0.0698769526640335, + "learning_rate": 1.0651919819032574e-07, + "loss": 0.5025, + "step": 7684 + }, + { + "epoch": 3.7348328267477204, + "grad_norm": 0.07292602709683178, + "learning_rate": 1.0612644822543871e-07, + "loss": 0.5323, + "step": 7685 + }, + { + "epoch": 3.73531914893617, + "grad_norm": 0.07202062094302504, + "learning_rate": 1.0573441589153411e-07, + "loss": 0.5249, + "step": 7686 + }, + { + "epoch": 3.7358054711246202, + "grad_norm": 0.07070207502566973, + "learning_rate": 1.0534310124609926e-07, + "loss": 0.5278, + "step": 7687 + }, + { + "epoch": 3.73629179331307, + "grad_norm": 0.07007999656390297, + "learning_rate": 1.0495250434651604e-07, + "loss": 0.4818, + "step": 7688 + }, + { + "epoch": 3.7367781155015196, + "grad_norm": 0.06909397718244228, + "learning_rate": 1.0456262525006089e-07, + "loss": 0.5034, + "step": 7689 + }, + { + "epoch": 3.7372644376899697, + "grad_norm": 0.07355070954577128, + "learning_rate": 1.0417346401390582e-07, + "loss": 0.5148, + "step": 7690 + }, + { + "epoch": 3.7377507598784194, + "grad_norm": 0.06872264458556969, + "learning_rate": 1.0378502069511631e-07, + "loss": 0.4668, + "step": 7691 + }, + { + "epoch": 3.7382370820668696, + "grad_norm": 0.0731657868460375, + "learning_rate": 1.0339729535065346e-07, + "loss": 0.4949, + "step": 7692 + }, + { + "epoch": 3.7387234042553192, + "grad_norm": 0.07010725468949738, + "learning_rate": 1.0301028803737234e-07, + "loss": 0.4885, + "step": 7693 + }, + { + "epoch": 3.739209726443769, + "grad_norm": 0.07383235823572544, + "learning_rate": 1.0262399881202367e-07, + "loss": 0.5329, + "step": 7694 + }, + { + "epoch": 3.7396960486322186, + "grad_norm": 0.07215151996141322, + "learning_rate": 1.022384277312527e-07, + "loss": 0.5094, + "step": 7695 + }, + { + "epoch": 3.7401823708206687, + "grad_norm": 0.07011532505657807, + "learning_rate": 1.0185357485159808e-07, + "loss": 0.5111, + "step": 7696 + }, + { + "epoch": 3.7406686930091184, + "grad_norm": 0.07099063894681273, + "learning_rate": 1.0146944022949467e-07, + "loss": 0.5198, + "step": 7697 + }, + { + "epoch": 3.7411550151975685, + "grad_norm": 0.07085483380424308, + "learning_rate": 1.0108602392127131e-07, + "loss": 0.5121, + "step": 7698 + }, + { + "epoch": 3.7416413373860182, + "grad_norm": 0.07051419266290489, + "learning_rate": 1.0070332598315135e-07, + "loss": 0.4983, + "step": 7699 + }, + { + "epoch": 3.742127659574468, + "grad_norm": 0.07167384562559563, + "learning_rate": 1.003213464712538e-07, + "loss": 0.5156, + "step": 7700 + }, + { + "epoch": 3.742613981762918, + "grad_norm": 0.07129763955641986, + "learning_rate": 9.994008544159106e-08, + "loss": 0.5047, + "step": 7701 + }, + { + "epoch": 3.7431003039513677, + "grad_norm": 0.07328466995093061, + "learning_rate": 9.95595429500712e-08, + "loss": 0.508, + "step": 7702 + }, + { + "epoch": 3.743586626139818, + "grad_norm": 0.0726347452359639, + "learning_rate": 9.917971905249568e-08, + "loss": 0.5186, + "step": 7703 + }, + { + "epoch": 3.7440729483282675, + "grad_norm": 0.0682033717409881, + "learning_rate": 9.880061380456218e-08, + "loss": 0.4771, + "step": 7704 + }, + { + "epoch": 3.744559270516717, + "grad_norm": 0.07002158602692787, + "learning_rate": 9.842222726186179e-08, + "loss": 0.4932, + "step": 7705 + }, + { + "epoch": 3.745045592705167, + "grad_norm": 0.07149011531806744, + "learning_rate": 9.804455947988067e-08, + "loss": 0.5126, + "step": 7706 + }, + { + "epoch": 3.745531914893617, + "grad_norm": 0.07067131462924231, + "learning_rate": 9.766761051399954e-08, + "loss": 0.5029, + "step": 7707 + }, + { + "epoch": 3.7460182370820667, + "grad_norm": 0.06864055007370284, + "learning_rate": 9.729138041949359e-08, + "loss": 0.4808, + "step": 7708 + }, + { + "epoch": 3.746504559270517, + "grad_norm": 0.06946166489287424, + "learning_rate": 9.691586925153262e-08, + "loss": 0.4954, + "step": 7709 + }, + { + "epoch": 3.7469908814589665, + "grad_norm": 0.072672556691609, + "learning_rate": 9.654107706518145e-08, + "loss": 0.536, + "step": 7710 + }, + { + "epoch": 3.7469908814589665, + "eval_loss": 0.5693764686584473, + "eval_runtime": 105.164, + "eval_samples_per_second": 288.625, + "eval_steps_per_second": 36.087, + "step": 7710 + }, + { + "epoch": 3.747477203647416, + "grad_norm": 0.07401255269072891, + "learning_rate": 9.616700391539947e-08, + "loss": 0.5188, + "step": 7711 + }, + { + "epoch": 3.7479635258358663, + "grad_norm": 0.0689452786677942, + "learning_rate": 9.579364985703887e-08, + "loss": 0.4875, + "step": 7712 + }, + { + "epoch": 3.748449848024316, + "grad_norm": 0.0714686620646307, + "learning_rate": 9.542101494484867e-08, + "loss": 0.5053, + "step": 7713 + }, + { + "epoch": 3.748936170212766, + "grad_norm": 0.07076403033714619, + "learning_rate": 9.504909923347127e-08, + "loss": 0.4964, + "step": 7714 + }, + { + "epoch": 3.749422492401216, + "grad_norm": 0.07099573600275635, + "learning_rate": 9.46779027774447e-08, + "loss": 0.5118, + "step": 7715 + }, + { + "epoch": 3.7499088145896655, + "grad_norm": 0.07492769120656283, + "learning_rate": 9.430742563119932e-08, + "loss": 0.5665, + "step": 7716 + }, + { + "epoch": 3.7503951367781156, + "grad_norm": 0.07147937855029338, + "learning_rate": 9.393766784906277e-08, + "loss": 0.5052, + "step": 7717 + }, + { + "epoch": 3.7508814589665653, + "grad_norm": 0.07053002047155855, + "learning_rate": 9.356862948525447e-08, + "loss": 0.5445, + "step": 7718 + }, + { + "epoch": 3.7513677811550155, + "grad_norm": 0.0704034358557316, + "learning_rate": 9.320031059389112e-08, + "loss": 0.4982, + "step": 7719 + }, + { + "epoch": 3.751854103343465, + "grad_norm": 0.07190381013695785, + "learning_rate": 9.283271122898174e-08, + "loss": 0.5225, + "step": 7720 + }, + { + "epoch": 3.752340425531915, + "grad_norm": 0.07106024899789946, + "learning_rate": 9.246583144443044e-08, + "loss": 0.4826, + "step": 7721 + }, + { + "epoch": 3.7528267477203645, + "grad_norm": 0.0701169228898378, + "learning_rate": 9.209967129403585e-08, + "loss": 0.5454, + "step": 7722 + }, + { + "epoch": 3.7533130699088146, + "grad_norm": 0.07100833659957302, + "learning_rate": 9.173423083149224e-08, + "loss": 0.517, + "step": 7723 + }, + { + "epoch": 3.7537993920972643, + "grad_norm": 0.07037772196538646, + "learning_rate": 9.13695101103862e-08, + "loss": 0.5088, + "step": 7724 + }, + { + "epoch": 3.7542857142857144, + "grad_norm": 0.07140332964667107, + "learning_rate": 9.100550918420048e-08, + "loss": 0.5548, + "step": 7725 + }, + { + "epoch": 3.754772036474164, + "grad_norm": 0.07118150260149636, + "learning_rate": 9.064222810631185e-08, + "loss": 0.5156, + "step": 7726 + }, + { + "epoch": 3.755258358662614, + "grad_norm": 0.07054829159526961, + "learning_rate": 9.027966692999046e-08, + "loss": 0.5048, + "step": 7727 + }, + { + "epoch": 3.755744680851064, + "grad_norm": 0.06991846164704252, + "learning_rate": 8.991782570840269e-08, + "loss": 0.4692, + "step": 7728 + }, + { + "epoch": 3.7562310030395136, + "grad_norm": 0.07146371575995158, + "learning_rate": 8.955670449460773e-08, + "loss": 0.5028, + "step": 7729 + }, + { + "epoch": 3.7567173252279638, + "grad_norm": 0.07024640452959402, + "learning_rate": 8.919630334156049e-08, + "loss": 0.5094, + "step": 7730 + }, + { + "epoch": 3.7572036474164134, + "grad_norm": 0.07363743364009692, + "learning_rate": 8.883662230210977e-08, + "loss": 0.5137, + "step": 7731 + }, + { + "epoch": 3.757689969604863, + "grad_norm": 0.06997767666772173, + "learning_rate": 8.847766142899839e-08, + "loss": 0.492, + "step": 7732 + }, + { + "epoch": 3.758176291793313, + "grad_norm": 0.06853033038881279, + "learning_rate": 8.811942077486369e-08, + "loss": 0.4523, + "step": 7733 + }, + { + "epoch": 3.758662613981763, + "grad_norm": 0.06960930169371204, + "learning_rate": 8.776190039223753e-08, + "loss": 0.486, + "step": 7734 + }, + { + "epoch": 3.7591489361702126, + "grad_norm": 0.07245123958728712, + "learning_rate": 8.740510033354688e-08, + "loss": 0.5097, + "step": 7735 + }, + { + "epoch": 3.7596352583586627, + "grad_norm": 0.06983026851201811, + "learning_rate": 8.704902065111209e-08, + "loss": 0.4933, + "step": 7736 + }, + { + "epoch": 3.7601215805471124, + "grad_norm": 0.070011578391243, + "learning_rate": 8.669366139714808e-08, + "loss": 0.5029, + "step": 7737 + }, + { + "epoch": 3.760607902735562, + "grad_norm": 0.07021901100753344, + "learning_rate": 8.633902262376425e-08, + "loss": 0.5155, + "step": 7738 + }, + { + "epoch": 3.7610942249240122, + "grad_norm": 0.06963807861193787, + "learning_rate": 8.598510438296459e-08, + "loss": 0.5151, + "step": 7739 + }, + { + "epoch": 3.761580547112462, + "grad_norm": 0.07407682065062528, + "learning_rate": 8.563190672664701e-08, + "loss": 0.5501, + "step": 7740 + }, + { + "epoch": 3.762066869300912, + "grad_norm": 0.06807800433434286, + "learning_rate": 8.527942970660396e-08, + "loss": 0.4824, + "step": 7741 + }, + { + "epoch": 3.7625531914893617, + "grad_norm": 0.07016065469569435, + "learning_rate": 8.492767337452246e-08, + "loss": 0.5088, + "step": 7742 + }, + { + "epoch": 3.7630395136778114, + "grad_norm": 0.07151401098844945, + "learning_rate": 8.457663778198288e-08, + "loss": 0.5037, + "step": 7743 + }, + { + "epoch": 3.7635258358662615, + "grad_norm": 0.07077814626567643, + "learning_rate": 8.422632298046129e-08, + "loss": 0.4832, + "step": 7744 + }, + { + "epoch": 3.7640121580547112, + "grad_norm": 0.07310981759699837, + "learning_rate": 8.387672902132715e-08, + "loss": 0.5116, + "step": 7745 + }, + { + "epoch": 3.7644984802431614, + "grad_norm": 0.07047883999217851, + "learning_rate": 8.35278559558439e-08, + "loss": 0.5164, + "step": 7746 + }, + { + "epoch": 3.764984802431611, + "grad_norm": 0.07221396770439817, + "learning_rate": 8.317970383517115e-08, + "loss": 0.5245, + "step": 7747 + }, + { + "epoch": 3.7654711246200607, + "grad_norm": 0.06839329811368865, + "learning_rate": 8.283227271035976e-08, + "loss": 0.4694, + "step": 7748 + }, + { + "epoch": 3.7659574468085104, + "grad_norm": 0.07111116043817207, + "learning_rate": 8.24855626323584e-08, + "loss": 0.4883, + "step": 7749 + }, + { + "epoch": 3.7664437689969605, + "grad_norm": 0.0728438452541302, + "learning_rate": 8.213957365200642e-08, + "loss": 0.5255, + "step": 7750 + }, + { + "epoch": 3.7669300911854102, + "grad_norm": 0.07492742473737278, + "learning_rate": 8.179430582004045e-08, + "loss": 0.5658, + "step": 7751 + }, + { + "epoch": 3.7674164133738604, + "grad_norm": 0.07001314783322304, + "learning_rate": 8.144975918708941e-08, + "loss": 0.4829, + "step": 7752 + }, + { + "epoch": 3.76790273556231, + "grad_norm": 0.07112155282149944, + "learning_rate": 8.110593380367737e-08, + "loss": 0.5105, + "step": 7753 + }, + { + "epoch": 3.7683890577507597, + "grad_norm": 0.07199049394533069, + "learning_rate": 8.076282972022232e-08, + "loss": 0.5365, + "step": 7754 + }, + { + "epoch": 3.76887537993921, + "grad_norm": 0.07152711078433943, + "learning_rate": 8.042044698703676e-08, + "loss": 0.5094, + "step": 7755 + }, + { + "epoch": 3.7693617021276595, + "grad_norm": 0.07118183193702346, + "learning_rate": 8.007878565432669e-08, + "loss": 0.4938, + "step": 7756 + }, + { + "epoch": 3.7698480243161097, + "grad_norm": 0.07164547343704651, + "learning_rate": 7.973784577219368e-08, + "loss": 0.5395, + "step": 7757 + }, + { + "epoch": 3.7703343465045593, + "grad_norm": 0.06961476049470836, + "learning_rate": 7.939762739063217e-08, + "loss": 0.4953, + "step": 7758 + }, + { + "epoch": 3.770820668693009, + "grad_norm": 0.07411063978964855, + "learning_rate": 7.905813055953227e-08, + "loss": 0.5812, + "step": 7759 + }, + { + "epoch": 3.7713069908814587, + "grad_norm": 0.07064286742619653, + "learning_rate": 7.87193553286758e-08, + "loss": 0.4927, + "step": 7760 + }, + { + "epoch": 3.771793313069909, + "grad_norm": 0.07137804994590126, + "learning_rate": 7.838130174774083e-08, + "loss": 0.5152, + "step": 7761 + }, + { + "epoch": 3.7722796352583585, + "grad_norm": 0.07220577117782458, + "learning_rate": 7.804396986629936e-08, + "loss": 0.5293, + "step": 7762 + }, + { + "epoch": 3.7727659574468086, + "grad_norm": 0.06974146470799511, + "learning_rate": 7.770735973381737e-08, + "loss": 0.4895, + "step": 7763 + }, + { + "epoch": 3.7732522796352583, + "grad_norm": 0.07161473468232411, + "learning_rate": 7.737147139965484e-08, + "loss": 0.5167, + "step": 7764 + }, + { + "epoch": 3.773738601823708, + "grad_norm": 0.0711206025735908, + "learning_rate": 7.703630491306568e-08, + "loss": 0.5104, + "step": 7765 + }, + { + "epoch": 3.774224924012158, + "grad_norm": 0.0698696650192937, + "learning_rate": 7.670186032319837e-08, + "loss": 0.4773, + "step": 7766 + }, + { + "epoch": 3.774711246200608, + "grad_norm": 0.07112267845987022, + "learning_rate": 7.636813767909534e-08, + "loss": 0.5049, + "step": 7767 + }, + { + "epoch": 3.775197568389058, + "grad_norm": 0.0713664405146949, + "learning_rate": 7.603513702969412e-08, + "loss": 0.5028, + "step": 7768 + }, + { + "epoch": 3.7756838905775076, + "grad_norm": 0.07101275038554768, + "learning_rate": 7.570285842382396e-08, + "loss": 0.5044, + "step": 7769 + }, + { + "epoch": 3.7761702127659573, + "grad_norm": 0.070824573826293, + "learning_rate": 7.537130191021091e-08, + "loss": 0.5069, + "step": 7770 + }, + { + "epoch": 3.7766565349544075, + "grad_norm": 0.07073915403343078, + "learning_rate": 7.50404675374733e-08, + "loss": 0.5118, + "step": 7771 + }, + { + "epoch": 3.777142857142857, + "grad_norm": 0.07173185365424199, + "learning_rate": 7.471035535412508e-08, + "loss": 0.5198, + "step": 7772 + }, + { + "epoch": 3.7776291793313073, + "grad_norm": 0.07143915504987877, + "learning_rate": 7.438096540857254e-08, + "loss": 0.5048, + "step": 7773 + }, + { + "epoch": 3.778115501519757, + "grad_norm": 0.07245258291915908, + "learning_rate": 7.405229774911759e-08, + "loss": 0.5317, + "step": 7774 + }, + { + "epoch": 3.7786018237082066, + "grad_norm": 0.06975720862357525, + "learning_rate": 7.372435242395504e-08, + "loss": 0.5346, + "step": 7775 + }, + { + "epoch": 3.7790881458966563, + "grad_norm": 0.07093710869503764, + "learning_rate": 7.339712948117416e-08, + "loss": 0.5128, + "step": 7776 + }, + { + "epoch": 3.7795744680851064, + "grad_norm": 0.07019112109244323, + "learning_rate": 7.307062896875938e-08, + "loss": 0.5034, + "step": 7777 + }, + { + "epoch": 3.780060790273556, + "grad_norm": 0.07214557791330724, + "learning_rate": 7.274485093458794e-08, + "loss": 0.531, + "step": 7778 + }, + { + "epoch": 3.7805471124620063, + "grad_norm": 0.06988299892027762, + "learning_rate": 7.241979542643162e-08, + "loss": 0.4923, + "step": 7779 + }, + { + "epoch": 3.781033434650456, + "grad_norm": 0.0717535516835555, + "learning_rate": 7.209546249195509e-08, + "loss": 0.4936, + "step": 7780 + }, + { + "epoch": 3.7815197568389056, + "grad_norm": 0.07266876744376508, + "learning_rate": 7.177185217871974e-08, + "loss": 0.5355, + "step": 7781 + }, + { + "epoch": 3.7820060790273557, + "grad_norm": 0.07301470070331494, + "learning_rate": 7.144896453417816e-08, + "loss": 0.5324, + "step": 7782 + }, + { + "epoch": 3.7824924012158054, + "grad_norm": 0.06971067933149161, + "learning_rate": 7.112679960567858e-08, + "loss": 0.4911, + "step": 7783 + }, + { + "epoch": 3.7829787234042556, + "grad_norm": 0.07032926182160719, + "learning_rate": 7.080535744046268e-08, + "loss": 0.4981, + "step": 7784 + }, + { + "epoch": 3.7834650455927052, + "grad_norm": 0.07054086290073121, + "learning_rate": 7.048463808566663e-08, + "loss": 0.5135, + "step": 7785 + }, + { + "epoch": 3.783951367781155, + "grad_norm": 0.06983713804253608, + "learning_rate": 7.016464158832004e-08, + "loss": 0.5183, + "step": 7786 + }, + { + "epoch": 3.7844376899696046, + "grad_norm": 0.07259163726003327, + "learning_rate": 6.984536799534702e-08, + "loss": 0.5518, + "step": 7787 + }, + { + "epoch": 3.7849240121580547, + "grad_norm": 0.07117251160452116, + "learning_rate": 6.952681735356514e-08, + "loss": 0.4973, + "step": 7788 + }, + { + "epoch": 3.7854103343465044, + "grad_norm": 0.06994672228891526, + "learning_rate": 6.920898970968593e-08, + "loss": 0.5024, + "step": 7789 + }, + { + "epoch": 3.7858966565349546, + "grad_norm": 0.07312686210672546, + "learning_rate": 6.889188511031541e-08, + "loss": 0.5304, + "step": 7790 + }, + { + "epoch": 3.7863829787234042, + "grad_norm": 0.0693726335978675, + "learning_rate": 6.857550360195364e-08, + "loss": 0.4725, + "step": 7791 + }, + { + "epoch": 3.786869300911854, + "grad_norm": 0.07297801900897251, + "learning_rate": 6.82598452309946e-08, + "loss": 0.5639, + "step": 7792 + }, + { + "epoch": 3.787355623100304, + "grad_norm": 0.07171369918597217, + "learning_rate": 6.794491004372516e-08, + "loss": 0.5196, + "step": 7793 + }, + { + "epoch": 3.7878419452887537, + "grad_norm": 0.07196317081973158, + "learning_rate": 6.763069808632783e-08, + "loss": 0.5369, + "step": 7794 + }, + { + "epoch": 3.788328267477204, + "grad_norm": 0.07212672396374854, + "learning_rate": 6.73172094048774e-08, + "loss": 0.5524, + "step": 7795 + }, + { + "epoch": 3.7888145896656535, + "grad_norm": 0.07046776091701304, + "learning_rate": 6.700444404534434e-08, + "loss": 0.5202, + "step": 7796 + }, + { + "epoch": 3.7893009118541032, + "grad_norm": 0.07163163695006357, + "learning_rate": 6.669240205359139e-08, + "loss": 0.5084, + "step": 7797 + }, + { + "epoch": 3.7897872340425534, + "grad_norm": 0.07037420173174773, + "learning_rate": 6.638108347537587e-08, + "loss": 0.5361, + "step": 7798 + }, + { + "epoch": 3.790273556231003, + "grad_norm": 0.074410572555401, + "learning_rate": 6.60704883563501e-08, + "loss": 0.5407, + "step": 7799 + }, + { + "epoch": 3.7907598784194527, + "grad_norm": 0.07194043298296843, + "learning_rate": 6.576061674205825e-08, + "loss": 0.5248, + "step": 7800 + }, + { + "epoch": 3.791246200607903, + "grad_norm": 0.07103625618496813, + "learning_rate": 6.54514686779406e-08, + "loss": 0.54, + "step": 7801 + }, + { + "epoch": 3.7917325227963525, + "grad_norm": 0.06970610957861578, + "learning_rate": 6.514304420932927e-08, + "loss": 0.534, + "step": 7802 + }, + { + "epoch": 3.792218844984802, + "grad_norm": 0.0724173150267212, + "learning_rate": 6.483534338145192e-08, + "loss": 0.5317, + "step": 7803 + }, + { + "epoch": 3.7927051671732523, + "grad_norm": 0.06920362971271889, + "learning_rate": 6.452836623942859e-08, + "loss": 0.4739, + "step": 7804 + }, + { + "epoch": 3.793191489361702, + "grad_norm": 0.06861661899647022, + "learning_rate": 6.422211282827384e-08, + "loss": 0.4944, + "step": 7805 + }, + { + "epoch": 3.793677811550152, + "grad_norm": 0.07376530336636636, + "learning_rate": 6.391658319289729e-08, + "loss": 0.5397, + "step": 7806 + }, + { + "epoch": 3.794164133738602, + "grad_norm": 0.07159772618726348, + "learning_rate": 6.361177737810087e-08, + "loss": 0.469, + "step": 7807 + }, + { + "epoch": 3.7946504559270515, + "grad_norm": 0.07187959489780896, + "learning_rate": 6.330769542858106e-08, + "loss": 0.4883, + "step": 7808 + }, + { + "epoch": 3.7951367781155017, + "grad_norm": 0.0698058257982298, + "learning_rate": 6.30043373889272e-08, + "loss": 0.4727, + "step": 7809 + }, + { + "epoch": 3.7956231003039513, + "grad_norm": 0.07125906445090661, + "learning_rate": 6.270170330362479e-08, + "loss": 0.5222, + "step": 7810 + }, + { + "epoch": 3.7961094224924015, + "grad_norm": 0.072199820897027, + "learning_rate": 6.239979321705003e-08, + "loss": 0.5111, + "step": 7811 + }, + { + "epoch": 3.796595744680851, + "grad_norm": 0.07012334199895957, + "learning_rate": 6.209860717347638e-08, + "loss": 0.4944, + "step": 7812 + }, + { + "epoch": 3.797082066869301, + "grad_norm": 0.06898592212385408, + "learning_rate": 6.179814521706739e-08, + "loss": 0.4807, + "step": 7813 + }, + { + "epoch": 3.7975683890577505, + "grad_norm": 0.0726591986935889, + "learning_rate": 6.149840739188396e-08, + "loss": 0.532, + "step": 7814 + }, + { + "epoch": 3.7980547112462006, + "grad_norm": 0.07188334665145897, + "learning_rate": 6.119939374187866e-08, + "loss": 0.5231, + "step": 7815 + }, + { + "epoch": 3.7985410334346503, + "grad_norm": 0.0717317819459501, + "learning_rate": 6.090110431089813e-08, + "loss": 0.4989, + "step": 7816 + }, + { + "epoch": 3.7990273556231005, + "grad_norm": 0.07219236459886245, + "learning_rate": 6.060353914268402e-08, + "loss": 0.499, + "step": 7817 + }, + { + "epoch": 3.79951367781155, + "grad_norm": 0.06886751728506314, + "learning_rate": 6.030669828087033e-08, + "loss": 0.4927, + "step": 7818 + }, + { + "epoch": 3.8, + "grad_norm": 0.07159356727369592, + "learning_rate": 6.0010581768985e-08, + "loss": 0.5246, + "step": 7819 + }, + { + "epoch": 3.80048632218845, + "grad_norm": 0.07076491383466604, + "learning_rate": 5.971518965045054e-08, + "loss": 0.4927, + "step": 7820 + }, + { + "epoch": 3.8009726443768996, + "grad_norm": 0.07333703014917749, + "learning_rate": 5.942052196858339e-08, + "loss": 0.5141, + "step": 7821 + }, + { + "epoch": 3.8014589665653498, + "grad_norm": 0.07402200485244595, + "learning_rate": 5.9126578766592334e-08, + "loss": 0.5749, + "step": 7822 + }, + { + "epoch": 3.8019452887537994, + "grad_norm": 0.0717420115764388, + "learning_rate": 5.8833360087581225e-08, + "loss": 0.4986, + "step": 7823 + }, + { + "epoch": 3.802431610942249, + "grad_norm": 0.06934498795118592, + "learning_rate": 5.854086597454678e-08, + "loss": 0.5089, + "step": 7824 + }, + { + "epoch": 3.802917933130699, + "grad_norm": 0.0699841659199271, + "learning_rate": 5.8249096470380793e-08, + "loss": 0.5132, + "step": 7825 + }, + { + "epoch": 3.803404255319149, + "grad_norm": 0.07320797365932691, + "learning_rate": 5.7958051617867384e-08, + "loss": 0.5461, + "step": 7826 + }, + { + "epoch": 3.8038905775075986, + "grad_norm": 0.07109882012615379, + "learning_rate": 5.7667731459685185e-08, + "loss": 0.4809, + "step": 7827 + }, + { + "epoch": 3.8043768996960488, + "grad_norm": 0.06913960629259028, + "learning_rate": 5.737813603840625e-08, + "loss": 0.5108, + "step": 7828 + }, + { + "epoch": 3.8048632218844984, + "grad_norm": 0.07123382032328665, + "learning_rate": 5.7089265396496617e-08, + "loss": 0.5012, + "step": 7829 + }, + { + "epoch": 3.805349544072948, + "grad_norm": 0.07139689541080747, + "learning_rate": 5.680111957631518e-08, + "loss": 0.5106, + "step": 7830 + }, + { + "epoch": 3.8058358662613982, + "grad_norm": 0.07008085710962113, + "learning_rate": 5.651369862011646e-08, + "loss": 0.5001, + "step": 7831 + }, + { + "epoch": 3.806322188449848, + "grad_norm": 0.07009846601560987, + "learning_rate": 5.622700257004676e-08, + "loss": 0.4833, + "step": 7832 + }, + { + "epoch": 3.806808510638298, + "grad_norm": 0.0730541562075264, + "learning_rate": 5.594103146814633e-08, + "loss": 0.505, + "step": 7833 + }, + { + "epoch": 3.8072948328267477, + "grad_norm": 0.06997168458391015, + "learning_rate": 5.565578535635052e-08, + "loss": 0.4949, + "step": 7834 + }, + { + "epoch": 3.8077811550151974, + "grad_norm": 0.07041589109332315, + "learning_rate": 5.537126427648698e-08, + "loss": 0.4849, + "step": 7835 + }, + { + "epoch": 3.8082674772036476, + "grad_norm": 0.06878899087449222, + "learning_rate": 5.508746827027789e-08, + "loss": 0.4664, + "step": 7836 + }, + { + "epoch": 3.8087537993920972, + "grad_norm": 0.07209964303415166, + "learning_rate": 5.480439737933774e-08, + "loss": 0.5324, + "step": 7837 + }, + { + "epoch": 3.8092401215805474, + "grad_norm": 0.072873171654013, + "learning_rate": 5.4522051645176654e-08, + "loss": 0.5115, + "step": 7838 + }, + { + "epoch": 3.809726443768997, + "grad_norm": 0.06991811675142123, + "learning_rate": 5.4240431109197075e-08, + "loss": 0.4868, + "step": 7839 + }, + { + "epoch": 3.8102127659574467, + "grad_norm": 0.07195009979048064, + "learning_rate": 5.395953581269542e-08, + "loss": 0.5366, + "step": 7840 + }, + { + "epoch": 3.8106990881458964, + "grad_norm": 0.06984969421663202, + "learning_rate": 5.367936579686206e-08, + "loss": 0.4904, + "step": 7841 + }, + { + "epoch": 3.8111854103343465, + "grad_norm": 0.07108010520164718, + "learning_rate": 5.339992110278025e-08, + "loss": 0.5182, + "step": 7842 + }, + { + "epoch": 3.8116717325227962, + "grad_norm": 0.07001040453672337, + "learning_rate": 5.3121201771427214e-08, + "loss": 0.5125, + "step": 7843 + }, + { + "epoch": 3.8121580547112464, + "grad_norm": 0.07098170843084955, + "learning_rate": 5.284320784367525e-08, + "loss": 0.5077, + "step": 7844 + }, + { + "epoch": 3.812644376899696, + "grad_norm": 0.06947872438084872, + "learning_rate": 5.2565939360287866e-08, + "loss": 0.4798, + "step": 7845 + }, + { + "epoch": 3.8131306990881457, + "grad_norm": 0.07032561420588579, + "learning_rate": 5.2289396361923096e-08, + "loss": 0.4718, + "step": 7846 + }, + { + "epoch": 3.813617021276596, + "grad_norm": 0.07046750898414043, + "learning_rate": 5.2013578889134054e-08, + "loss": 0.5091, + "step": 7847 + }, + { + "epoch": 3.8141033434650455, + "grad_norm": 0.07076459196141849, + "learning_rate": 5.1738486982365055e-08, + "loss": 0.5221, + "step": 7848 + }, + { + "epoch": 3.8145896656534957, + "grad_norm": 0.07292765833751894, + "learning_rate": 5.1464120681956055e-08, + "loss": 0.5083, + "step": 7849 + }, + { + "epoch": 3.8150759878419453, + "grad_norm": 0.0703473460462279, + "learning_rate": 5.119048002813931e-08, + "loss": 0.5159, + "step": 7850 + }, + { + "epoch": 3.815562310030395, + "grad_norm": 0.07235388709190145, + "learning_rate": 5.091756506104162e-08, + "loss": 0.532, + "step": 7851 + }, + { + "epoch": 3.8160486322188447, + "grad_norm": 0.07090107767816602, + "learning_rate": 5.0645375820682075e-08, + "loss": 0.5472, + "step": 7852 + }, + { + "epoch": 3.816534954407295, + "grad_norm": 0.07253015436573712, + "learning_rate": 5.0373912346974305e-08, + "loss": 0.5107, + "step": 7853 + }, + { + "epoch": 3.8170212765957445, + "grad_norm": 0.07138190665650147, + "learning_rate": 5.010317467972592e-08, + "loss": 0.5198, + "step": 7854 + }, + { + "epoch": 3.8175075987841947, + "grad_norm": 0.07245824848976665, + "learning_rate": 4.983316285863682e-08, + "loss": 0.5122, + "step": 7855 + }, + { + "epoch": 3.8179939209726443, + "grad_norm": 0.07105082772833467, + "learning_rate": 4.9563876923302004e-08, + "loss": 0.4913, + "step": 7856 + }, + { + "epoch": 3.818480243161094, + "grad_norm": 0.07009525509163841, + "learning_rate": 4.929531691320821e-08, + "loss": 0.4898, + "step": 7857 + }, + { + "epoch": 3.818966565349544, + "grad_norm": 0.07293765875685293, + "learning_rate": 4.9027482867737286e-08, + "loss": 0.5189, + "step": 7858 + }, + { + "epoch": 3.819452887537994, + "grad_norm": 0.07079885810829864, + "learning_rate": 4.876037482616447e-08, + "loss": 0.5313, + "step": 7859 + }, + { + "epoch": 3.819939209726444, + "grad_norm": 0.06927717991427378, + "learning_rate": 4.849399282765732e-08, + "loss": 0.4853, + "step": 7860 + }, + { + "epoch": 3.8204255319148936, + "grad_norm": 0.06938708230031493, + "learning_rate": 4.822833691127793e-08, + "loss": 0.5033, + "step": 7861 + }, + { + "epoch": 3.8209118541033433, + "grad_norm": 0.07075138864596536, + "learning_rate": 4.79634071159818e-08, + "loss": 0.5036, + "step": 7862 + }, + { + "epoch": 3.8213981762917935, + "grad_norm": 0.06879710615572684, + "learning_rate": 4.769920348061785e-08, + "loss": 0.4961, + "step": 7863 + }, + { + "epoch": 3.821884498480243, + "grad_norm": 0.07129116758488921, + "learning_rate": 4.743572604392899e-08, + "loss": 0.5013, + "step": 7864 + }, + { + "epoch": 3.8223708206686933, + "grad_norm": 0.07063452560304825, + "learning_rate": 4.717297484455041e-08, + "loss": 0.5062, + "step": 7865 + }, + { + "epoch": 3.822857142857143, + "grad_norm": 0.06790452136271805, + "learning_rate": 4.691094992101242e-08, + "loss": 0.4893, + "step": 7866 + }, + { + "epoch": 3.8233434650455926, + "grad_norm": 0.06966108365059806, + "learning_rate": 4.66496513117376e-08, + "loss": 0.4884, + "step": 7867 + }, + { + "epoch": 3.8238297872340423, + "grad_norm": 0.0700776471147457, + "learning_rate": 4.6389079055041976e-08, + "loss": 0.5226, + "step": 7868 + }, + { + "epoch": 3.8243161094224924, + "grad_norm": 0.06897968533983888, + "learning_rate": 4.612923318913609e-08, + "loss": 0.4881, + "step": 7869 + }, + { + "epoch": 3.824802431610942, + "grad_norm": 0.06955439364137804, + "learning_rate": 4.5870113752123355e-08, + "loss": 0.49, + "step": 7870 + }, + { + "epoch": 3.8252887537993923, + "grad_norm": 0.07223171414025488, + "learning_rate": 4.561172078200005e-08, + "loss": 0.5569, + "step": 7871 + }, + { + "epoch": 3.825775075987842, + "grad_norm": 0.07027894520467323, + "learning_rate": 4.535405431665751e-08, + "loss": 0.52, + "step": 7872 + }, + { + "epoch": 3.8262613981762916, + "grad_norm": 0.07234696220495436, + "learning_rate": 4.5097114393879426e-08, + "loss": 0.5183, + "step": 7873 + }, + { + "epoch": 3.8267477203647418, + "grad_norm": 0.06741978813203077, + "learning_rate": 4.484090105134231e-08, + "loss": 0.4679, + "step": 7874 + }, + { + "epoch": 3.8272340425531914, + "grad_norm": 0.07253755459446587, + "learning_rate": 4.458541432661778e-08, + "loss": 0.5334, + "step": 7875 + }, + { + "epoch": 3.8277203647416416, + "grad_norm": 0.0692470292194821, + "learning_rate": 4.433065425716976e-08, + "loss": 0.4898, + "step": 7876 + }, + { + "epoch": 3.8282066869300913, + "grad_norm": 0.07073813658569714, + "learning_rate": 4.407662088035613e-08, + "loss": 0.5308, + "step": 7877 + }, + { + "epoch": 3.828693009118541, + "grad_norm": 0.06900075105207301, + "learning_rate": 4.382331423342767e-08, + "loss": 0.4872, + "step": 7878 + }, + { + "epoch": 3.8291793313069906, + "grad_norm": 0.06973538266640854, + "learning_rate": 4.3570734353528545e-08, + "loss": 0.4986, + "step": 7879 + }, + { + "epoch": 3.8296656534954407, + "grad_norm": 0.07416474799652553, + "learning_rate": 4.331888127769801e-08, + "loss": 0.5374, + "step": 7880 + }, + { + "epoch": 3.8301519756838904, + "grad_norm": 0.07118278628780612, + "learning_rate": 4.3067755042866534e-08, + "loss": 0.5398, + "step": 7881 + }, + { + "epoch": 3.8306382978723406, + "grad_norm": 0.07059650921553912, + "learning_rate": 4.28173556858591e-08, + "loss": 0.4968, + "step": 7882 + }, + { + "epoch": 3.8311246200607902, + "grad_norm": 0.071473825792252, + "learning_rate": 4.256768324339356e-08, + "loss": 0.4863, + "step": 7883 + }, + { + "epoch": 3.83161094224924, + "grad_norm": 0.07224562663968982, + "learning_rate": 4.231873775208228e-08, + "loss": 0.503, + "step": 7884 + }, + { + "epoch": 3.83209726443769, + "grad_norm": 0.0731941168944801, + "learning_rate": 4.207051924842942e-08, + "loss": 0.5283, + "step": 7885 + }, + { + "epoch": 3.8325835866261397, + "grad_norm": 0.0725168508435478, + "learning_rate": 4.182302776883418e-08, + "loss": 0.5349, + "step": 7886 + }, + { + "epoch": 3.83306990881459, + "grad_norm": 0.0721460699220282, + "learning_rate": 4.157626334958809e-08, + "loss": 0.5211, + "step": 7887 + }, + { + "epoch": 3.8335562310030395, + "grad_norm": 0.07329234530379539, + "learning_rate": 4.133022602687664e-08, + "loss": 0.5072, + "step": 7888 + }, + { + "epoch": 3.8340425531914892, + "grad_norm": 0.07433649136403024, + "learning_rate": 4.108491583677765e-08, + "loss": 0.5368, + "step": 7889 + }, + { + "epoch": 3.8345288753799394, + "grad_norm": 0.06986081044042201, + "learning_rate": 4.084033281526345e-08, + "loss": 0.5185, + "step": 7890 + }, + { + "epoch": 3.835015197568389, + "grad_norm": 0.07157927764885447, + "learning_rate": 4.0596476998199795e-08, + "loss": 0.523, + "step": 7891 + }, + { + "epoch": 3.835501519756839, + "grad_norm": 0.07272545510780835, + "learning_rate": 4.035334842134475e-08, + "loss": 0.5138, + "step": 7892 + }, + { + "epoch": 3.835987841945289, + "grad_norm": 0.07047869766337185, + "learning_rate": 4.011094712035091e-08, + "loss": 0.4675, + "step": 7893 + }, + { + "epoch": 3.8364741641337385, + "grad_norm": 0.06939849308348055, + "learning_rate": 3.986927313076372e-08, + "loss": 0.4822, + "step": 7894 + }, + { + "epoch": 3.8369604863221882, + "grad_norm": 0.07034797788341138, + "learning_rate": 3.962832648802151e-08, + "loss": 0.516, + "step": 7895 + }, + { + "epoch": 3.8374468085106384, + "grad_norm": 0.07225809705980982, + "learning_rate": 3.9388107227456007e-08, + "loss": 0.494, + "step": 7896 + }, + { + "epoch": 3.837933130699088, + "grad_norm": 0.07161172741771306, + "learning_rate": 3.914861538429349e-08, + "loss": 0.5001, + "step": 7897 + }, + { + "epoch": 3.838419452887538, + "grad_norm": 0.06865559628080964, + "learning_rate": 3.890985099365196e-08, + "loss": 0.4913, + "step": 7898 + }, + { + "epoch": 3.838905775075988, + "grad_norm": 0.07014735261654034, + "learning_rate": 3.867181409054399e-08, + "loss": 0.5233, + "step": 7899 + }, + { + "epoch": 3.8393920972644375, + "grad_norm": 0.07052245799038553, + "learning_rate": 3.8434504709874974e-08, + "loss": 0.5304, + "step": 7900 + }, + { + "epoch": 3.8398784194528877, + "grad_norm": 0.072591351918813, + "learning_rate": 3.81979228864432e-08, + "loss": 0.5101, + "step": 7901 + }, + { + "epoch": 3.8403647416413373, + "grad_norm": 0.07023520616863106, + "learning_rate": 3.7962068654941454e-08, + "loss": 0.4865, + "step": 7902 + }, + { + "epoch": 3.8408510638297875, + "grad_norm": 0.07089444509138243, + "learning_rate": 3.772694204995431e-08, + "loss": 0.5143, + "step": 7903 + }, + { + "epoch": 3.841337386018237, + "grad_norm": 0.0706212951663669, + "learning_rate": 3.74925431059614e-08, + "loss": 0.5216, + "step": 7904 + }, + { + "epoch": 3.841823708206687, + "grad_norm": 0.07240029730870431, + "learning_rate": 3.725887185733357e-08, + "loss": 0.5256, + "step": 7905 + }, + { + "epoch": 3.8423100303951365, + "grad_norm": 0.06826607263517383, + "learning_rate": 3.702592833833618e-08, + "loss": 0.4897, + "step": 7906 + }, + { + "epoch": 3.8427963525835866, + "grad_norm": 0.06889763208378237, + "learning_rate": 3.679371258312858e-08, + "loss": 0.5102, + "step": 7907 + }, + { + "epoch": 3.8432826747720363, + "grad_norm": 0.07105502092173523, + "learning_rate": 3.656222462576187e-08, + "loss": 0.5044, + "step": 7908 + }, + { + "epoch": 3.8437689969604865, + "grad_norm": 0.07254610173429767, + "learning_rate": 3.6331464500181656e-08, + "loss": 0.5155, + "step": 7909 + }, + { + "epoch": 3.844255319148936, + "grad_norm": 0.07228866457659237, + "learning_rate": 3.610143224022589e-08, + "loss": 0.4897, + "step": 7910 + }, + { + "epoch": 3.844741641337386, + "grad_norm": 0.07329058159601545, + "learning_rate": 3.5872127879625904e-08, + "loss": 0.5269, + "step": 7911 + }, + { + "epoch": 3.845227963525836, + "grad_norm": 0.0728705956119226, + "learning_rate": 3.5643551452007595e-08, + "loss": 0.5007, + "step": 7912 + }, + { + "epoch": 3.8457142857142856, + "grad_norm": 0.07100347098671603, + "learning_rate": 3.5415702990888035e-08, + "loss": 0.52, + "step": 7913 + }, + { + "epoch": 3.8462006079027358, + "grad_norm": 0.07433461743446373, + "learning_rate": 3.518858252967883e-08, + "loss": 0.4843, + "step": 7914 + }, + { + "epoch": 3.8466869300911855, + "grad_norm": 0.0697692931923992, + "learning_rate": 3.496219010168556e-08, + "loss": 0.4854, + "step": 7915 + }, + { + "epoch": 3.847173252279635, + "grad_norm": 0.07156558640182122, + "learning_rate": 3.473652574010444e-08, + "loss": 0.5126, + "step": 7916 + }, + { + "epoch": 3.8476595744680853, + "grad_norm": 0.06953625004607342, + "learning_rate": 3.451158947802846e-08, + "loss": 0.4859, + "step": 7917 + }, + { + "epoch": 3.848145896656535, + "grad_norm": 0.07036746057962606, + "learning_rate": 3.428738134844012e-08, + "loss": 0.5032, + "step": 7918 + }, + { + "epoch": 3.848632218844985, + "grad_norm": 0.07130335421097073, + "learning_rate": 3.406390138421867e-08, + "loss": 0.5113, + "step": 7919 + }, + { + "epoch": 3.8491185410334348, + "grad_norm": 0.07251656070863585, + "learning_rate": 3.384114961813345e-08, + "loss": 0.4894, + "step": 7920 + }, + { + "epoch": 3.8496048632218844, + "grad_norm": 0.07201203072271546, + "learning_rate": 3.361912608284945e-08, + "loss": 0.5111, + "step": 7921 + }, + { + "epoch": 3.850091185410334, + "grad_norm": 0.07087311142096846, + "learning_rate": 3.339783081092396e-08, + "loss": 0.5162, + "step": 7922 + }, + { + "epoch": 3.8505775075987843, + "grad_norm": 0.06882443704770187, + "learning_rate": 3.317726383480657e-08, + "loss": 0.4748, + "step": 7923 + }, + { + "epoch": 3.851063829787234, + "grad_norm": 0.06999204770726007, + "learning_rate": 3.295742518684198e-08, + "loss": 0.5043, + "step": 7924 + }, + { + "epoch": 3.851550151975684, + "grad_norm": 0.07062946953927692, + "learning_rate": 3.273831489926604e-08, + "loss": 0.5002, + "step": 7925 + }, + { + "epoch": 3.8520364741641338, + "grad_norm": 0.06907333204617308, + "learning_rate": 3.251993300420919e-08, + "loss": 0.4989, + "step": 7926 + }, + { + "epoch": 3.8525227963525834, + "grad_norm": 0.0701487432594494, + "learning_rate": 3.2302279533695244e-08, + "loss": 0.4863, + "step": 7927 + }, + { + "epoch": 3.8530091185410336, + "grad_norm": 0.07109656714211889, + "learning_rate": 3.208535451963979e-08, + "loss": 0.5198, + "step": 7928 + }, + { + "epoch": 3.8534954407294832, + "grad_norm": 0.06805048706923046, + "learning_rate": 3.186915799385237e-08, + "loss": 0.4735, + "step": 7929 + }, + { + "epoch": 3.8539817629179334, + "grad_norm": 0.07020111510368054, + "learning_rate": 3.165368998803597e-08, + "loss": 0.513, + "step": 7930 + }, + { + "epoch": 3.854468085106383, + "grad_norm": 0.06853839430597691, + "learning_rate": 3.143895053378698e-08, + "loss": 0.4913, + "step": 7931 + }, + { + "epoch": 3.8549544072948327, + "grad_norm": 0.06873485436844724, + "learning_rate": 3.12249396625941e-08, + "loss": 0.4745, + "step": 7932 + }, + { + "epoch": 3.8554407294832824, + "grad_norm": 0.07019950593056339, + "learning_rate": 3.101165740584e-08, + "loss": 0.4949, + "step": 7933 + }, + { + "epoch": 3.8559270516717326, + "grad_norm": 0.06906572506719856, + "learning_rate": 3.079910379479911e-08, + "loss": 0.4945, + "step": 7934 + }, + { + "epoch": 3.8564133738601822, + "grad_norm": 0.07110717184636539, + "learning_rate": 3.0587278860640946e-08, + "loss": 0.4886, + "step": 7935 + }, + { + "epoch": 3.8568996960486324, + "grad_norm": 0.07156272785598021, + "learning_rate": 3.037618263442676e-08, + "loss": 0.5273, + "step": 7936 + }, + { + "epoch": 3.857386018237082, + "grad_norm": 0.07229447303050426, + "learning_rate": 3.016581514711181e-08, + "loss": 0.5408, + "step": 7937 + }, + { + "epoch": 3.8578723404255317, + "grad_norm": 0.07061412520549051, + "learning_rate": 2.9956176429543626e-08, + "loss": 0.5363, + "step": 7938 + }, + { + "epoch": 3.858358662613982, + "grad_norm": 0.07075999479098197, + "learning_rate": 2.9747266512463735e-08, + "loss": 0.5112, + "step": 7939 + }, + { + "epoch": 3.8588449848024315, + "grad_norm": 0.0685432427703031, + "learning_rate": 2.9539085426505965e-08, + "loss": 0.4642, + "step": 7940 + }, + { + "epoch": 3.8593313069908817, + "grad_norm": 0.06990221476977632, + "learning_rate": 2.9331633202198116e-08, + "loss": 0.4987, + "step": 7941 + }, + { + "epoch": 3.8598176291793314, + "grad_norm": 0.0700837523202798, + "learning_rate": 2.912490986996086e-08, + "loss": 0.4944, + "step": 7942 + }, + { + "epoch": 3.860303951367781, + "grad_norm": 0.07189610924231331, + "learning_rate": 2.8918915460107723e-08, + "loss": 0.5121, + "step": 7943 + }, + { + "epoch": 3.860790273556231, + "grad_norm": 0.06933396439094201, + "learning_rate": 2.871365000284454e-08, + "loss": 0.5046, + "step": 7944 + }, + { + "epoch": 3.861276595744681, + "grad_norm": 0.0713305139064588, + "learning_rate": 2.8509113528272238e-08, + "loss": 0.5034, + "step": 7945 + }, + { + "epoch": 3.8617629179331305, + "grad_norm": 0.07142600151912995, + "learning_rate": 2.8305306066383487e-08, + "loss": 0.4954, + "step": 7946 + }, + { + "epoch": 3.8622492401215807, + "grad_norm": 0.06839717905794623, + "learning_rate": 2.8102227647064385e-08, + "loss": 0.4615, + "step": 7947 + }, + { + "epoch": 3.8627355623100303, + "grad_norm": 0.07174380305517765, + "learning_rate": 2.7899878300093886e-08, + "loss": 0.4838, + "step": 7948 + }, + { + "epoch": 3.86322188449848, + "grad_norm": 0.07251124745635402, + "learning_rate": 2.769825805514381e-08, + "loss": 0.5365, + "step": 7949 + }, + { + "epoch": 3.86370820668693, + "grad_norm": 0.07144880923228115, + "learning_rate": 2.7497366941780513e-08, + "loss": 0.4932, + "step": 7950 + }, + { + "epoch": 3.86419452887538, + "grad_norm": 0.07028412591632652, + "learning_rate": 2.7297204989461536e-08, + "loss": 0.5106, + "step": 7951 + }, + { + "epoch": 3.86468085106383, + "grad_norm": 0.07157316307252287, + "learning_rate": 2.7097772227538956e-08, + "loss": 0.5275, + "step": 7952 + }, + { + "epoch": 3.8651671732522797, + "grad_norm": 0.07322448346050967, + "learning_rate": 2.689906868525716e-08, + "loss": 0.5032, + "step": 7953 + }, + { + "epoch": 3.8656534954407293, + "grad_norm": 0.06944357777896402, + "learning_rate": 2.6701094391753392e-08, + "loss": 0.4881, + "step": 7954 + }, + { + "epoch": 3.8661398176291795, + "grad_norm": 0.0691630501171565, + "learning_rate": 2.650384937605832e-08, + "loss": 0.4879, + "step": 7955 + }, + { + "epoch": 3.866626139817629, + "grad_norm": 0.07000903279732242, + "learning_rate": 2.6307333667096036e-08, + "loss": 0.517, + "step": 7956 + }, + { + "epoch": 3.8671124620060793, + "grad_norm": 0.07332953420884539, + "learning_rate": 2.6111547293683482e-08, + "loss": 0.4955, + "step": 7957 + }, + { + "epoch": 3.867598784194529, + "grad_norm": 0.06766470855443787, + "learning_rate": 2.591649028453047e-08, + "loss": 0.4765, + "step": 7958 + }, + { + "epoch": 3.8680851063829786, + "grad_norm": 0.07102516840665797, + "learning_rate": 2.5722162668239124e-08, + "loss": 0.522, + "step": 7959 + }, + { + "epoch": 3.8685714285714283, + "grad_norm": 0.07205109579088757, + "learning_rate": 2.5528564473306648e-08, + "loss": 0.5011, + "step": 7960 + }, + { + "epoch": 3.8690577507598785, + "grad_norm": 0.07371491152142387, + "learning_rate": 2.5335695728120336e-08, + "loss": 0.5115, + "step": 7961 + }, + { + "epoch": 3.869544072948328, + "grad_norm": 0.07108892884186348, + "learning_rate": 2.514355646096367e-08, + "loss": 0.5184, + "step": 7962 + }, + { + "epoch": 3.8700303951367783, + "grad_norm": 0.07012381297719256, + "learning_rate": 2.495214670001134e-08, + "loss": 0.5127, + "step": 7963 + }, + { + "epoch": 3.870516717325228, + "grad_norm": 0.06854326583201292, + "learning_rate": 2.4761466473331443e-08, + "loss": 0.4836, + "step": 7964 + }, + { + "epoch": 3.8710030395136776, + "grad_norm": 0.0699264852923121, + "learning_rate": 2.457151580888495e-08, + "loss": 0.5022, + "step": 7965 + }, + { + "epoch": 3.8714893617021278, + "grad_norm": 0.07154132930607603, + "learning_rate": 2.438229473452569e-08, + "loss": 0.5231, + "step": 7966 + }, + { + "epoch": 3.8719756838905774, + "grad_norm": 0.0729996863070197, + "learning_rate": 2.4193803278000916e-08, + "loss": 0.534, + "step": 7967 + }, + { + "epoch": 3.8724620060790276, + "grad_norm": 0.0698960523369294, + "learning_rate": 2.4006041466950735e-08, + "loss": 0.5167, + "step": 7968 + }, + { + "epoch": 3.8729483282674773, + "grad_norm": 0.07313663359826762, + "learning_rate": 2.3819009328908683e-08, + "loss": 0.4901, + "step": 7969 + }, + { + "epoch": 3.873434650455927, + "grad_norm": 0.07341818534513772, + "learning_rate": 2.3632706891300593e-08, + "loss": 0.5032, + "step": 7970 + }, + { + "epoch": 3.8739209726443766, + "grad_norm": 0.06797034121251241, + "learning_rate": 2.344713418144573e-08, + "loss": 0.4978, + "step": 7971 + }, + { + "epoch": 3.8744072948328268, + "grad_norm": 0.07139995622111268, + "learning_rate": 2.326229122655621e-08, + "loss": 0.5086, + "step": 7972 + }, + { + "epoch": 3.8748936170212764, + "grad_norm": 0.06973763921383579, + "learning_rate": 2.307817805373702e-08, + "loss": 0.5161, + "step": 7973 + }, + { + "epoch": 3.8753799392097266, + "grad_norm": 0.0732180694004753, + "learning_rate": 2.2894794689986565e-08, + "loss": 0.5179, + "step": 7974 + }, + { + "epoch": 3.8758662613981762, + "grad_norm": 0.07347820833299802, + "learning_rate": 2.2712141162195e-08, + "loss": 0.5198, + "step": 7975 + }, + { + "epoch": 3.876352583586626, + "grad_norm": 0.07075042268700264, + "learning_rate": 2.2530217497147566e-08, + "loss": 0.5171, + "step": 7976 + }, + { + "epoch": 3.876838905775076, + "grad_norm": 0.07148612546164884, + "learning_rate": 2.23490237215207e-08, + "loss": 0.5312, + "step": 7977 + }, + { + "epoch": 3.8773252279635257, + "grad_norm": 0.07246561919183483, + "learning_rate": 2.216855986188482e-08, + "loss": 0.5209, + "step": 7978 + }, + { + "epoch": 3.877811550151976, + "grad_norm": 0.0709196590158568, + "learning_rate": 2.1988825944702086e-08, + "loss": 0.5072, + "step": 7979 + }, + { + "epoch": 3.8782978723404256, + "grad_norm": 0.07221483609910441, + "learning_rate": 2.1809821996329195e-08, + "loss": 0.5416, + "step": 7980 + }, + { + "epoch": 3.8787841945288752, + "grad_norm": 0.06911499768906816, + "learning_rate": 2.1631548043014593e-08, + "loss": 0.4929, + "step": 7981 + }, + { + "epoch": 3.8792705167173254, + "grad_norm": 0.07057777768451894, + "learning_rate": 2.1454004110900706e-08, + "loss": 0.5078, + "step": 7982 + }, + { + "epoch": 3.879756838905775, + "grad_norm": 0.07049499313249019, + "learning_rate": 2.1277190226021706e-08, + "loss": 0.5225, + "step": 7983 + }, + { + "epoch": 3.880243161094225, + "grad_norm": 0.07039969486107699, + "learning_rate": 2.1101106414306293e-08, + "loss": 0.52, + "step": 7984 + }, + { + "epoch": 3.880729483282675, + "grad_norm": 0.07118495342841619, + "learning_rate": 2.092575270157382e-08, + "loss": 0.5165, + "step": 7985 + }, + { + "epoch": 3.8812158054711245, + "grad_norm": 0.07177304519647379, + "learning_rate": 2.0751129113538715e-08, + "loss": 0.5211, + "step": 7986 + }, + { + "epoch": 3.8817021276595742, + "grad_norm": 0.07136027164333868, + "learning_rate": 2.0577235675807717e-08, + "loss": 0.5272, + "step": 7987 + }, + { + "epoch": 3.8821884498480244, + "grad_norm": 0.07170350874886534, + "learning_rate": 2.0404072413879318e-08, + "loss": 0.5141, + "step": 7988 + }, + { + "epoch": 3.882674772036474, + "grad_norm": 0.07200504812634104, + "learning_rate": 2.0231639353147093e-08, + "loss": 0.5256, + "step": 7989 + }, + { + "epoch": 3.883161094224924, + "grad_norm": 0.07150320420962743, + "learning_rate": 2.0059936518895816e-08, + "loss": 0.4851, + "step": 7990 + }, + { + "epoch": 3.883647416413374, + "grad_norm": 0.0723740199137926, + "learning_rate": 1.988896393630424e-08, + "loss": 0.4972, + "step": 7991 + }, + { + "epoch": 3.8841337386018235, + "grad_norm": 0.07019376784195688, + "learning_rate": 1.971872163044286e-08, + "loss": 0.503, + "step": 7992 + }, + { + "epoch": 3.8846200607902737, + "grad_norm": 0.06803617292223385, + "learning_rate": 1.9549209626276156e-08, + "loss": 0.5013, + "step": 7993 + }, + { + "epoch": 3.8851063829787233, + "grad_norm": 0.07025752497536777, + "learning_rate": 1.9380427948660906e-08, + "loss": 0.4938, + "step": 7994 + }, + { + "epoch": 3.8855927051671735, + "grad_norm": 0.07168717704944641, + "learning_rate": 1.9212376622347318e-08, + "loss": 0.5256, + "step": 7995 + }, + { + "epoch": 3.886079027355623, + "grad_norm": 0.06975030982400551, + "learning_rate": 1.9045055671978452e-08, + "loss": 0.5085, + "step": 7996 + }, + { + "epoch": 3.886565349544073, + "grad_norm": 0.07066849529866817, + "learning_rate": 1.8878465122089683e-08, + "loss": 0.4946, + "step": 7997 + }, + { + "epoch": 3.8870516717325225, + "grad_norm": 0.07087872286839735, + "learning_rate": 1.8712604997108696e-08, + "loss": 0.5033, + "step": 7998 + }, + { + "epoch": 3.8875379939209727, + "grad_norm": 0.07137933732484057, + "learning_rate": 1.854747532135881e-08, + "loss": 0.4957, + "step": 7999 + }, + { + "epoch": 3.8880243161094223, + "grad_norm": 0.07321259547447465, + "learning_rate": 1.8383076119053433e-08, + "loss": 0.4922, + "step": 8000 + }, + { + "epoch": 3.8885106382978725, + "grad_norm": 0.0709579965162484, + "learning_rate": 1.821940741429995e-08, + "loss": 0.4958, + "step": 8001 + }, + { + "epoch": 3.888996960486322, + "grad_norm": 0.07289988721569056, + "learning_rate": 1.805646923109805e-08, + "loss": 0.5131, + "step": 8002 + }, + { + "epoch": 3.889483282674772, + "grad_norm": 0.06970760540710476, + "learning_rate": 1.7894261593341956e-08, + "loss": 0.4883, + "step": 8003 + }, + { + "epoch": 3.889969604863222, + "grad_norm": 0.07111830826865695, + "learning_rate": 1.773278452481597e-08, + "loss": 0.5106, + "step": 8004 + }, + { + "epoch": 3.8904559270516716, + "grad_norm": 0.0694227570324697, + "learning_rate": 1.7572038049200603e-08, + "loss": 0.5001, + "step": 8005 + }, + { + "epoch": 3.8909422492401218, + "grad_norm": 0.07346816846689401, + "learning_rate": 1.741202219006588e-08, + "loss": 0.5364, + "step": 8006 + }, + { + "epoch": 3.8914285714285715, + "grad_norm": 0.07277358815957531, + "learning_rate": 1.7252736970877483e-08, + "loss": 0.5103, + "step": 8007 + }, + { + "epoch": 3.891914893617021, + "grad_norm": 0.07317398562816137, + "learning_rate": 1.7094182414992277e-08, + "loss": 0.5296, + "step": 8008 + }, + { + "epoch": 3.8924012158054713, + "grad_norm": 0.07220176697891172, + "learning_rate": 1.693635854566056e-08, + "loss": 0.526, + "step": 8009 + }, + { + "epoch": 3.892887537993921, + "grad_norm": 0.07039717442309316, + "learning_rate": 1.6779265386025478e-08, + "loss": 0.5375, + "step": 8010 + }, + { + "epoch": 3.893373860182371, + "grad_norm": 0.07012526973137066, + "learning_rate": 1.6622902959123055e-08, + "loss": 0.4978, + "step": 8011 + }, + { + "epoch": 3.8938601823708208, + "grad_norm": 0.07111492182383376, + "learning_rate": 1.6467271287881615e-08, + "loss": 0.5017, + "step": 8012 + }, + { + "epoch": 3.8943465045592704, + "grad_norm": 0.0712681043073196, + "learning_rate": 1.63123703951229e-08, + "loss": 0.4995, + "step": 8013 + }, + { + "epoch": 3.89483282674772, + "grad_norm": 0.07170752214295342, + "learning_rate": 1.615820030356208e-08, + "loss": 0.536, + "step": 8014 + }, + { + "epoch": 3.8953191489361703, + "grad_norm": 0.07188190801573636, + "learning_rate": 1.6004761035805505e-08, + "loss": 0.5392, + "step": 8015 + }, + { + "epoch": 3.89580547112462, + "grad_norm": 0.07183102857139019, + "learning_rate": 1.5852052614354074e-08, + "loss": 0.5327, + "step": 8016 + }, + { + "epoch": 3.89629179331307, + "grad_norm": 0.06906320866272496, + "learning_rate": 1.5700075061600427e-08, + "loss": 0.5085, + "step": 8017 + }, + { + "epoch": 3.8967781155015198, + "grad_norm": 0.06958322538993796, + "learning_rate": 1.554882839982952e-08, + "loss": 0.5061, + "step": 8018 + }, + { + "epoch": 3.8972644376899694, + "grad_norm": 0.07141640504236503, + "learning_rate": 1.539831265122138e-08, + "loss": 0.5157, + "step": 8019 + }, + { + "epoch": 3.8977507598784196, + "grad_norm": 0.06949156172828688, + "learning_rate": 1.5248527837846694e-08, + "loss": 0.514, + "step": 8020 + }, + { + "epoch": 3.8982370820668693, + "grad_norm": 0.06985149360430036, + "learning_rate": 1.509947398167011e-08, + "loss": 0.4768, + "step": 8021 + }, + { + "epoch": 3.8987234042553194, + "grad_norm": 0.07116323573215827, + "learning_rate": 1.4951151104548034e-08, + "loss": 0.5086, + "step": 8022 + }, + { + "epoch": 3.899209726443769, + "grad_norm": 0.07192095375157564, + "learning_rate": 1.4803559228230291e-08, + "loss": 0.5206, + "step": 8023 + }, + { + "epoch": 3.8996960486322187, + "grad_norm": 0.07393147855346544, + "learning_rate": 1.4656698374360678e-08, + "loss": 0.5586, + "step": 8024 + }, + { + "epoch": 3.9001823708206684, + "grad_norm": 0.07006074145980727, + "learning_rate": 1.4510568564473082e-08, + "loss": 0.5385, + "step": 8025 + }, + { + "epoch": 3.9006686930091186, + "grad_norm": 0.07144684980170236, + "learning_rate": 1.4365169819997582e-08, + "loss": 0.518, + "step": 8026 + }, + { + "epoch": 3.9011550151975682, + "grad_norm": 0.06967983310849515, + "learning_rate": 1.422050216225379e-08, + "loss": 0.5157, + "step": 8027 + }, + { + "epoch": 3.9016413373860184, + "grad_norm": 0.07032373783419817, + "learning_rate": 1.4076565612455851e-08, + "loss": 0.5328, + "step": 8028 + }, + { + "epoch": 3.902127659574468, + "grad_norm": 0.07510804876309583, + "learning_rate": 1.3933360191710766e-08, + "loss": 0.5361, + "step": 8029 + }, + { + "epoch": 3.9026139817629177, + "grad_norm": 0.0721959049120463, + "learning_rate": 1.379088592101785e-08, + "loss": 0.5086, + "step": 8030 + }, + { + "epoch": 3.903100303951368, + "grad_norm": 0.07070208058292794, + "learning_rate": 1.3649142821269834e-08, + "loss": 0.4771, + "step": 8031 + }, + { + "epoch": 3.9035866261398176, + "grad_norm": 0.0733363413301437, + "learning_rate": 1.350813091325065e-08, + "loss": 0.5219, + "step": 8032 + }, + { + "epoch": 3.9040729483282677, + "grad_norm": 0.07031034853924799, + "learning_rate": 1.3367850217639312e-08, + "loss": 0.4882, + "step": 8033 + }, + { + "epoch": 3.9045592705167174, + "grad_norm": 0.06892490282880302, + "learning_rate": 1.3228300755005474e-08, + "loss": 0.5073, + "step": 8034 + }, + { + "epoch": 3.905045592705167, + "grad_norm": 0.06996258561675435, + "learning_rate": 1.308948254581277e-08, + "loss": 0.5063, + "step": 8035 + }, + { + "epoch": 3.905531914893617, + "grad_norm": 0.07088594932322743, + "learning_rate": 1.2951395610417139e-08, + "loss": 0.5099, + "step": 8036 + }, + { + "epoch": 3.906018237082067, + "grad_norm": 0.07284858078289273, + "learning_rate": 1.2814039969067938e-08, + "loss": 0.5215, + "step": 8037 + }, + { + "epoch": 3.906504559270517, + "grad_norm": 0.07099830970134939, + "learning_rate": 1.2677415641906277e-08, + "loss": 0.5, + "step": 8038 + }, + { + "epoch": 3.9069908814589667, + "grad_norm": 0.06909582846937622, + "learning_rate": 1.2541522648966686e-08, + "loss": 0.5178, + "step": 8039 + }, + { + "epoch": 3.9074772036474164, + "grad_norm": 0.07344312000094438, + "learning_rate": 1.2406361010177115e-08, + "loss": 0.5519, + "step": 8040 + }, + { + "epoch": 3.907963525835866, + "grad_norm": 0.06989499221506916, + "learning_rate": 1.2271930745356153e-08, + "loss": 0.5127, + "step": 8041 + }, + { + "epoch": 3.908449848024316, + "grad_norm": 0.07147983131290152, + "learning_rate": 1.2138231874217477e-08, + "loss": 0.5047, + "step": 8042 + }, + { + "epoch": 3.908936170212766, + "grad_norm": 0.07270367242485364, + "learning_rate": 1.2005264416365958e-08, + "loss": 0.5125, + "step": 8043 + }, + { + "epoch": 3.909422492401216, + "grad_norm": 0.07285102824140043, + "learning_rate": 1.1873028391300445e-08, + "loss": 0.498, + "step": 8044 + }, + { + "epoch": 3.9099088145896657, + "grad_norm": 0.07164730193100352, + "learning_rate": 1.1741523818410983e-08, + "loss": 0.5126, + "step": 8045 + }, + { + "epoch": 3.9103951367781153, + "grad_norm": 0.07075149131951955, + "learning_rate": 1.161075071698159e-08, + "loss": 0.5196, + "step": 8046 + }, + { + "epoch": 3.9108814589665655, + "grad_norm": 0.07368281119650384, + "learning_rate": 1.1480709106189148e-08, + "loss": 0.5633, + "step": 8047 + }, + { + "epoch": 3.911367781155015, + "grad_norm": 0.07108252270608832, + "learning_rate": 1.1351399005101737e-08, + "loss": 0.4856, + "step": 8048 + }, + { + "epoch": 3.9118541033434653, + "grad_norm": 0.0693175439233018, + "learning_rate": 1.1222820432681969e-08, + "loss": 0.5013, + "step": 8049 + }, + { + "epoch": 3.912340425531915, + "grad_norm": 0.07234073173869401, + "learning_rate": 1.109497340778476e-08, + "loss": 0.5089, + "step": 8050 + }, + { + "epoch": 3.9128267477203647, + "grad_norm": 0.07302117425737847, + "learning_rate": 1.0967857949156224e-08, + "loss": 0.5466, + "step": 8051 + }, + { + "epoch": 3.9133130699088143, + "grad_norm": 0.07303509271950777, + "learning_rate": 1.0841474075437563e-08, + "loss": 0.4849, + "step": 8052 + }, + { + "epoch": 3.9137993920972645, + "grad_norm": 0.07076824660442604, + "learning_rate": 1.071582180516062e-08, + "loss": 0.5153, + "step": 8053 + }, + { + "epoch": 3.914285714285714, + "grad_norm": 0.07109986978236102, + "learning_rate": 1.0590901156751765e-08, + "loss": 0.5131, + "step": 8054 + }, + { + "epoch": 3.9147720364741643, + "grad_norm": 0.07131947830451511, + "learning_rate": 1.0466712148528569e-08, + "loss": 0.5143, + "step": 8055 + }, + { + "epoch": 3.915258358662614, + "grad_norm": 0.07175059732026251, + "learning_rate": 1.0343254798702018e-08, + "loss": 0.5503, + "step": 8056 + }, + { + "epoch": 3.9157446808510636, + "grad_norm": 0.07202828609471504, + "learning_rate": 1.0220529125375967e-08, + "loss": 0.512, + "step": 8057 + }, + { + "epoch": 3.9162310030395138, + "grad_norm": 0.0707318762087733, + "learning_rate": 1.0098535146547128e-08, + "loss": 0.5097, + "step": 8058 + }, + { + "epoch": 3.9167173252279635, + "grad_norm": 0.07093996689822704, + "learning_rate": 9.977272880103418e-09, + "loss": 0.5279, + "step": 8059 + }, + { + "epoch": 3.9172036474164136, + "grad_norm": 0.07153232852592072, + "learning_rate": 9.856742343827275e-09, + "loss": 0.5079, + "step": 8060 + }, + { + "epoch": 3.9176899696048633, + "grad_norm": 0.07001829844735431, + "learning_rate": 9.736943555392897e-09, + "loss": 0.5286, + "step": 8061 + }, + { + "epoch": 3.918176291793313, + "grad_norm": 0.07266345529267103, + "learning_rate": 9.617876532367897e-09, + "loss": 0.5486, + "step": 8062 + }, + { + "epoch": 3.918662613981763, + "grad_norm": 0.06941666160229705, + "learning_rate": 9.499541292211645e-09, + "loss": 0.4737, + "step": 8063 + }, + { + "epoch": 3.9191489361702128, + "grad_norm": 0.0698667328209181, + "learning_rate": 9.381937852276924e-09, + "loss": 0.4918, + "step": 8064 + }, + { + "epoch": 3.919635258358663, + "grad_norm": 0.0728594624611233, + "learning_rate": 9.265066229808272e-09, + "loss": 0.5466, + "step": 8065 + }, + { + "epoch": 3.9201215805471126, + "grad_norm": 0.07212304246567539, + "learning_rate": 9.148926441944762e-09, + "loss": 0.5341, + "step": 8066 + }, + { + "epoch": 3.9206079027355623, + "grad_norm": 0.06768923129651373, + "learning_rate": 9.0335185057161e-09, + "loss": 0.4758, + "step": 8067 + }, + { + "epoch": 3.921094224924012, + "grad_norm": 0.06987244158761928, + "learning_rate": 8.918842438045416e-09, + "loss": 0.5126, + "step": 8068 + }, + { + "epoch": 3.921580547112462, + "grad_norm": 0.07084128925372642, + "learning_rate": 8.804898255749261e-09, + "loss": 0.5078, + "step": 8069 + }, + { + "epoch": 3.9220668693009118, + "grad_norm": 0.0716854976258303, + "learning_rate": 8.691685975535935e-09, + "loss": 0.511, + "step": 8070 + }, + { + "epoch": 3.922553191489362, + "grad_norm": 0.07222527846272814, + "learning_rate": 8.579205614006603e-09, + "loss": 0.5096, + "step": 8071 + }, + { + "epoch": 3.9230395136778116, + "grad_norm": 0.06996761532935165, + "learning_rate": 8.467457187655847e-09, + "loss": 0.5227, + "step": 8072 + }, + { + "epoch": 3.9235258358662612, + "grad_norm": 0.07161724318413952, + "learning_rate": 8.356440712869452e-09, + "loss": 0.5348, + "step": 8073 + }, + { + "epoch": 3.9240121580547114, + "grad_norm": 0.07413587454398333, + "learning_rate": 8.246156205927725e-09, + "loss": 0.5382, + "step": 8074 + }, + { + "epoch": 3.924498480243161, + "grad_norm": 0.07044649136763231, + "learning_rate": 8.13660368300162e-09, + "loss": 0.5072, + "step": 8075 + }, + { + "epoch": 3.924984802431611, + "grad_norm": 0.06826681416322335, + "learning_rate": 8.027783160156622e-09, + "loss": 0.4963, + "step": 8076 + }, + { + "epoch": 3.925471124620061, + "grad_norm": 0.0721365217239518, + "learning_rate": 7.919694653349408e-09, + "loss": 0.5051, + "step": 8077 + }, + { + "epoch": 3.9259574468085106, + "grad_norm": 0.07278202553001152, + "learning_rate": 7.812338178430079e-09, + "loss": 0.5021, + "step": 8078 + }, + { + "epoch": 3.9264437689969602, + "grad_norm": 0.07225236612920691, + "learning_rate": 7.705713751141041e-09, + "loss": 0.5579, + "step": 8079 + }, + { + "epoch": 3.9269300911854104, + "grad_norm": 0.07197941205032261, + "learning_rate": 7.599821387118122e-09, + "loss": 0.5131, + "step": 8080 + }, + { + "epoch": 3.92741641337386, + "grad_norm": 0.07153841104866336, + "learning_rate": 7.494661101889456e-09, + "loss": 0.5068, + "step": 8081 + }, + { + "epoch": 3.92790273556231, + "grad_norm": 0.07887686925940758, + "learning_rate": 7.390232910874373e-09, + "loss": 0.5372, + "step": 8082 + }, + { + "epoch": 3.92838905775076, + "grad_norm": 0.06896797961653627, + "learning_rate": 7.286536829386737e-09, + "loss": 0.5092, + "step": 8083 + }, + { + "epoch": 3.9288753799392095, + "grad_norm": 0.07192585453559809, + "learning_rate": 7.183572872632716e-09, + "loss": 0.4893, + "step": 8084 + }, + { + "epoch": 3.9293617021276597, + "grad_norm": 0.07287838702336884, + "learning_rate": 7.081341055710789e-09, + "loss": 0.5513, + "step": 8085 + }, + { + "epoch": 3.9298480243161094, + "grad_norm": 0.076278886647371, + "learning_rate": 6.979841393611741e-09, + "loss": 0.533, + "step": 8086 + }, + { + "epoch": 3.9303343465045595, + "grad_norm": 0.07175553030354498, + "learning_rate": 6.879073901219224e-09, + "loss": 0.5127, + "step": 8087 + }, + { + "epoch": 3.930820668693009, + "grad_norm": 0.07206795743866867, + "learning_rate": 6.7790385933097505e-09, + "loss": 0.5086, + "step": 8088 + }, + { + "epoch": 3.931306990881459, + "grad_norm": 0.07291996651267474, + "learning_rate": 6.679735484552696e-09, + "loss": 0.5287, + "step": 8089 + }, + { + "epoch": 3.931793313069909, + "grad_norm": 0.07403055741183005, + "learning_rate": 6.581164589509192e-09, + "loss": 0.5072, + "step": 8090 + }, + { + "epoch": 3.9322796352583587, + "grad_norm": 0.06903281597139206, + "learning_rate": 6.483325922634342e-09, + "loss": 0.4806, + "step": 8091 + }, + { + "epoch": 3.9327659574468083, + "grad_norm": 0.06834538252196153, + "learning_rate": 6.386219498274449e-09, + "loss": 0.5005, + "step": 8092 + }, + { + "epoch": 3.9332522796352585, + "grad_norm": 0.07238842386730485, + "learning_rate": 6.289845330669231e-09, + "loss": 0.512, + "step": 8093 + }, + { + "epoch": 3.933738601823708, + "grad_norm": 0.0742271470241622, + "learning_rate": 6.194203433951274e-09, + "loss": 0.5067, + "step": 8094 + }, + { + "epoch": 3.934224924012158, + "grad_norm": 0.07062883187495671, + "learning_rate": 6.099293822144359e-09, + "loss": 0.5013, + "step": 8095 + }, + { + "epoch": 3.934711246200608, + "grad_norm": 0.06990834319789539, + "learning_rate": 6.005116509166797e-09, + "loss": 0.5117, + "step": 8096 + }, + { + "epoch": 3.9351975683890577, + "grad_norm": 0.07039004221223741, + "learning_rate": 5.911671508828098e-09, + "loss": 0.4815, + "step": 8097 + }, + { + "epoch": 3.935683890577508, + "grad_norm": 0.07292652009321607, + "learning_rate": 5.81895883483119e-09, + "loss": 0.5244, + "step": 8098 + }, + { + "epoch": 3.9361702127659575, + "grad_norm": 0.07155828797770042, + "learning_rate": 5.726978500771307e-09, + "loss": 0.5088, + "step": 8099 + }, + { + "epoch": 3.936656534954407, + "grad_norm": 0.06892749606309138, + "learning_rate": 5.635730520136551e-09, + "loss": 0.4707, + "step": 8100 + }, + { + "epoch": 3.9371428571428573, + "grad_norm": 0.07224752014891758, + "learning_rate": 5.5452149063067724e-09, + "loss": 0.4893, + "step": 8101 + }, + { + "epoch": 3.937629179331307, + "grad_norm": 0.07286475931072105, + "learning_rate": 5.4554316725558e-09, + "loss": 0.5401, + "step": 8102 + }, + { + "epoch": 3.938115501519757, + "grad_norm": 0.07169064040307217, + "learning_rate": 5.366380832048657e-09, + "loss": 0.5364, + "step": 8103 + }, + { + "epoch": 3.9386018237082068, + "grad_norm": 0.06964149781236798, + "learning_rate": 5.278062397844341e-09, + "loss": 0.4997, + "step": 8104 + }, + { + "epoch": 3.9390881458966565, + "grad_norm": 0.07368815291661669, + "learning_rate": 5.190476382893051e-09, + "loss": 0.5127, + "step": 8105 + }, + { + "epoch": 3.939574468085106, + "grad_norm": 0.07056507845515898, + "learning_rate": 5.103622800038399e-09, + "loss": 0.5012, + "step": 8106 + }, + { + "epoch": 3.9400607902735563, + "grad_norm": 0.06900093577242564, + "learning_rate": 5.017501662016866e-09, + "loss": 0.4918, + "step": 8107 + }, + { + "epoch": 3.940547112462006, + "grad_norm": 0.07093634738506353, + "learning_rate": 4.932112981456682e-09, + "loss": 0.504, + "step": 8108 + }, + { + "epoch": 3.941033434650456, + "grad_norm": 0.07323133370754058, + "learning_rate": 4.847456770880055e-09, + "loss": 0.5589, + "step": 8109 + }, + { + "epoch": 3.9415197568389058, + "grad_norm": 0.07079831119733815, + "learning_rate": 4.7635330426992755e-09, + "loss": 0.4945, + "step": 8110 + }, + { + "epoch": 3.9420060790273554, + "grad_norm": 0.07454331180430393, + "learning_rate": 4.680341809222277e-09, + "loss": 0.5673, + "step": 8111 + }, + { + "epoch": 3.9424924012158056, + "grad_norm": 0.0722031174688914, + "learning_rate": 4.597883082647636e-09, + "loss": 0.5304, + "step": 8112 + }, + { + "epoch": 3.9429787234042553, + "grad_norm": 0.07369599139790713, + "learning_rate": 4.51615687506679e-09, + "loss": 0.5489, + "step": 8113 + }, + { + "epoch": 3.9434650455927054, + "grad_norm": 0.07018931189411005, + "learning_rate": 4.435163198463488e-09, + "loss": 0.5016, + "step": 8114 + }, + { + "epoch": 3.943951367781155, + "grad_norm": 0.07264213074530183, + "learning_rate": 4.354902064716005e-09, + "loss": 0.5356, + "step": 8115 + }, + { + "epoch": 3.9444376899696048, + "grad_norm": 0.06940034409598339, + "learning_rate": 4.275373485592149e-09, + "loss": 0.5014, + "step": 8116 + }, + { + "epoch": 3.9449240121580544, + "grad_norm": 0.07212645682487613, + "learning_rate": 4.196577472754815e-09, + "loss": 0.5045, + "step": 8117 + }, + { + "epoch": 3.9454103343465046, + "grad_norm": 0.07022424093777124, + "learning_rate": 4.118514037758093e-09, + "loss": 0.5059, + "step": 8118 + }, + { + "epoch": 3.9458966565349542, + "grad_norm": 0.06994721478412266, + "learning_rate": 4.041183192049492e-09, + "loss": 0.5344, + "step": 8119 + }, + { + "epoch": 3.9463829787234044, + "grad_norm": 0.07063191809187022, + "learning_rate": 3.96458494696883e-09, + "loss": 0.4856, + "step": 8120 + }, + { + "epoch": 3.946869300911854, + "grad_norm": 0.07206067649205745, + "learning_rate": 3.88871931374768e-09, + "loss": 0.5336, + "step": 8121 + }, + { + "epoch": 3.9473556231003037, + "grad_norm": 0.07007946834779166, + "learning_rate": 3.81358630351103e-09, + "loss": 0.5124, + "step": 8122 + }, + { + "epoch": 3.947841945288754, + "grad_norm": 0.06979905687357034, + "learning_rate": 3.739185927276734e-09, + "loss": 0.4766, + "step": 8123 + }, + { + "epoch": 3.9483282674772036, + "grad_norm": 0.07092051187454781, + "learning_rate": 3.6655181959543984e-09, + "loss": 0.4803, + "step": 8124 + }, + { + "epoch": 3.9488145896656537, + "grad_norm": 0.06894058270608466, + "learning_rate": 3.5925831203470484e-09, + "loss": 0.4813, + "step": 8125 + }, + { + "epoch": 3.9493009118541034, + "grad_norm": 0.06977231889346967, + "learning_rate": 3.5203807111489074e-09, + "loss": 0.4875, + "step": 8126 + }, + { + "epoch": 3.949787234042553, + "grad_norm": 0.07458863074111872, + "learning_rate": 3.4489109789487275e-09, + "loss": 0.5366, + "step": 8127 + }, + { + "epoch": 3.950273556231003, + "grad_norm": 0.0726611346904699, + "learning_rate": 3.3781739342259033e-09, + "loss": 0.5401, + "step": 8128 + }, + { + "epoch": 3.950759878419453, + "grad_norm": 0.07202718490680439, + "learning_rate": 3.3081695873532493e-09, + "loss": 0.4994, + "step": 8129 + }, + { + "epoch": 3.951246200607903, + "grad_norm": 0.0703655299161122, + "learning_rate": 3.2388979485964422e-09, + "loss": 0.5312, + "step": 8130 + }, + { + "epoch": 3.9517325227963527, + "grad_norm": 0.0696733281534409, + "learning_rate": 3.1703590281134676e-09, + "loss": 0.4759, + "step": 8131 + }, + { + "epoch": 3.9522188449848024, + "grad_norm": 0.0737254573011368, + "learning_rate": 3.1025528359540644e-09, + "loss": 0.5452, + "step": 8132 + }, + { + "epoch": 3.952705167173252, + "grad_norm": 0.07274737439261085, + "learning_rate": 3.0354793820625005e-09, + "loss": 0.5384, + "step": 8133 + }, + { + "epoch": 3.953191489361702, + "grad_norm": 0.07124987279119638, + "learning_rate": 2.969138676273131e-09, + "loss": 0.5191, + "step": 8134 + }, + { + "epoch": 3.953677811550152, + "grad_norm": 0.07023365395484578, + "learning_rate": 2.9035307283142857e-09, + "loss": 0.5106, + "step": 8135 + }, + { + "epoch": 3.954164133738602, + "grad_norm": 0.07356515817803144, + "learning_rate": 2.8386555478071566e-09, + "loss": 0.5179, + "step": 8136 + }, + { + "epoch": 3.9546504559270517, + "grad_norm": 0.07240874014530504, + "learning_rate": 2.77451314426469e-09, + "loss": 0.5562, + "step": 8137 + }, + { + "epoch": 3.9551367781155014, + "grad_norm": 0.06962325511307683, + "learning_rate": 2.7111035270926956e-09, + "loss": 0.471, + "step": 8138 + }, + { + "epoch": 3.9556231003039515, + "grad_norm": 0.07136874549247091, + "learning_rate": 2.6484267055892907e-09, + "loss": 0.4866, + "step": 8139 + }, + { + "epoch": 3.956109422492401, + "grad_norm": 0.07181745592666004, + "learning_rate": 2.5864826889454574e-09, + "loss": 0.5488, + "step": 8140 + }, + { + "epoch": 3.9565957446808513, + "grad_norm": 0.07221214086171204, + "learning_rate": 2.5252714862444848e-09, + "loss": 0.5036, + "step": 8141 + }, + { + "epoch": 3.957082066869301, + "grad_norm": 0.07003244553923746, + "learning_rate": 2.4647931064625263e-09, + "loss": 0.4885, + "step": 8142 + }, + { + "epoch": 3.9575683890577507, + "grad_norm": 0.06790757620095221, + "learning_rate": 2.4050475584680433e-09, + "loss": 0.4919, + "step": 8143 + }, + { + "epoch": 3.9580547112462003, + "grad_norm": 0.07005590309632957, + "learning_rate": 2.3460348510212503e-09, + "loss": 0.4911, + "step": 8144 + }, + { + "epoch": 3.9585410334346505, + "grad_norm": 0.07187730644827803, + "learning_rate": 2.2877549927768914e-09, + "loss": 0.5415, + "step": 8145 + }, + { + "epoch": 3.9590273556231, + "grad_norm": 0.0706836695120867, + "learning_rate": 2.230207992280353e-09, + "loss": 0.5088, + "step": 8146 + }, + { + "epoch": 3.9595136778115503, + "grad_norm": 0.072387026294712, + "learning_rate": 2.1733938579698853e-09, + "loss": 0.5014, + "step": 8147 + }, + { + "epoch": 3.96, + "grad_norm": 0.07115738281883845, + "learning_rate": 2.117312598177712e-09, + "loss": 0.5066, + "step": 8148 + }, + { + "epoch": 3.9604863221884496, + "grad_norm": 0.07139689653704843, + "learning_rate": 2.0619642211266998e-09, + "loss": 0.5096, + "step": 8149 + }, + { + "epoch": 3.9609726443768998, + "grad_norm": 0.0702955967905899, + "learning_rate": 2.0073487349336894e-09, + "loss": 0.5149, + "step": 8150 + }, + { + "epoch": 3.9614589665653495, + "grad_norm": 0.07102713027669254, + "learning_rate": 1.9534661476067195e-09, + "loss": 0.5191, + "step": 8151 + }, + { + "epoch": 3.9619452887537996, + "grad_norm": 0.07154412624725065, + "learning_rate": 1.9003164670472474e-09, + "loss": 0.5256, + "step": 8152 + }, + { + "epoch": 3.9624316109422493, + "grad_norm": 0.0720294956571181, + "learning_rate": 1.8478997010490384e-09, + "loss": 0.5151, + "step": 8153 + }, + { + "epoch": 3.962917933130699, + "grad_norm": 0.06862619438173168, + "learning_rate": 1.796215857298722e-09, + "loss": 0.4634, + "step": 8154 + }, + { + "epoch": 3.963404255319149, + "grad_norm": 0.0764921404797082, + "learning_rate": 1.7452649433752355e-09, + "loss": 0.6147, + "step": 8155 + }, + { + "epoch": 3.9638905775075988, + "grad_norm": 0.06866013650053804, + "learning_rate": 1.6950469667492697e-09, + "loss": 0.4623, + "step": 8156 + }, + { + "epoch": 3.964376899696049, + "grad_norm": 0.07074925107637062, + "learning_rate": 1.645561934785489e-09, + "loss": 0.4919, + "step": 8157 + }, + { + "epoch": 3.9648632218844986, + "grad_norm": 0.07026614510420989, + "learning_rate": 1.596809854739756e-09, + "loss": 0.5281, + "step": 8158 + }, + { + "epoch": 3.9653495440729483, + "grad_norm": 0.06977210402333377, + "learning_rate": 1.5487907337613517e-09, + "loss": 0.5136, + "step": 8159 + }, + { + "epoch": 3.965835866261398, + "grad_norm": 0.07021467126198487, + "learning_rate": 1.5015045788918658e-09, + "loss": 0.4989, + "step": 8160 + }, + { + "epoch": 3.966322188449848, + "grad_norm": 0.07117600998553947, + "learning_rate": 1.454951397064641e-09, + "loss": 0.4763, + "step": 8161 + }, + { + "epoch": 3.9668085106382978, + "grad_norm": 0.06970448755775487, + "learning_rate": 1.409131195106439e-09, + "loss": 0.4962, + "step": 8162 + }, + { + "epoch": 3.967294832826748, + "grad_norm": 0.07306336551471798, + "learning_rate": 1.3640439797368843e-09, + "loss": 0.5095, + "step": 8163 + }, + { + "epoch": 3.9677811550151976, + "grad_norm": 0.07252556711660253, + "learning_rate": 1.3196897575668e-09, + "loss": 0.496, + "step": 8164 + }, + { + "epoch": 3.9682674772036473, + "grad_norm": 0.06935909114039966, + "learning_rate": 1.2760685351004277e-09, + "loss": 0.5007, + "step": 8165 + }, + { + "epoch": 3.9687537993920974, + "grad_norm": 0.07090171749214573, + "learning_rate": 1.2331803187343171e-09, + "loss": 0.524, + "step": 8166 + }, + { + "epoch": 3.969240121580547, + "grad_norm": 0.07406312878779753, + "learning_rate": 1.1910251147573272e-09, + "loss": 0.5413, + "step": 8167 + }, + { + "epoch": 3.969726443768997, + "grad_norm": 0.07114174440793454, + "learning_rate": 1.149602929351179e-09, + "loss": 0.4991, + "step": 8168 + }, + { + "epoch": 3.970212765957447, + "grad_norm": 0.07040133790442153, + "learning_rate": 1.1089137685904583e-09, + "loss": 0.5125, + "step": 8169 + }, + { + "epoch": 3.9706990881458966, + "grad_norm": 0.06936154269524242, + "learning_rate": 1.0689576384415035e-09, + "loss": 0.5126, + "step": 8170 + }, + { + "epoch": 3.9711854103343462, + "grad_norm": 0.0722400663600258, + "learning_rate": 1.0297345447629615e-09, + "loss": 0.5659, + "step": 8171 + }, + { + "epoch": 3.9716717325227964, + "grad_norm": 0.06972781353985649, + "learning_rate": 9.912444933068976e-10, + "loss": 0.4755, + "step": 8172 + }, + { + "epoch": 3.972158054711246, + "grad_norm": 0.07047353065042987, + "learning_rate": 9.534874897171309e-10, + "loss": 0.516, + "step": 8173 + }, + { + "epoch": 3.972644376899696, + "grad_norm": 0.07302752596832238, + "learning_rate": 9.164635395303434e-10, + "loss": 0.5601, + "step": 8174 + }, + { + "epoch": 3.973130699088146, + "grad_norm": 0.0689001322167617, + "learning_rate": 8.801726481766359e-10, + "loss": 0.5049, + "step": 8175 + }, + { + "epoch": 3.9736170212765956, + "grad_norm": 0.07428377465526251, + "learning_rate": 8.446148209761973e-10, + "loss": 0.5063, + "step": 8176 + }, + { + "epoch": 3.9741033434650457, + "grad_norm": 0.07128211857910462, + "learning_rate": 8.09790063143745e-10, + "loss": 0.5382, + "step": 8177 + }, + { + "epoch": 3.9745896656534954, + "grad_norm": 0.07057616688384419, + "learning_rate": 7.75698379786305e-10, + "loss": 0.5177, + "step": 8178 + }, + { + "epoch": 3.9750759878419455, + "grad_norm": 0.07196260975657118, + "learning_rate": 7.423397759026563e-10, + "loss": 0.5351, + "step": 8179 + }, + { + "epoch": 3.975562310030395, + "grad_norm": 0.07434715626005581, + "learning_rate": 7.097142563844417e-10, + "loss": 0.5404, + "step": 8180 + }, + { + "epoch": 3.976048632218845, + "grad_norm": 0.07069529803170822, + "learning_rate": 6.778218260161673e-10, + "loss": 0.4838, + "step": 8181 + }, + { + "epoch": 3.976534954407295, + "grad_norm": 0.06927011428174207, + "learning_rate": 6.466624894740925e-10, + "loss": 0.494, + "step": 8182 + }, + { + "epoch": 3.9770212765957447, + "grad_norm": 0.07496577366429516, + "learning_rate": 6.162362513273401e-10, + "loss": 0.5345, + "step": 8183 + }, + { + "epoch": 3.977507598784195, + "grad_norm": 0.06954442783966105, + "learning_rate": 5.865431160378964e-10, + "loss": 0.4961, + "step": 8184 + }, + { + "epoch": 3.9779939209726445, + "grad_norm": 0.0726533837987503, + "learning_rate": 5.575830879600564e-10, + "loss": 0.4996, + "step": 8185 + }, + { + "epoch": 3.978480243161094, + "grad_norm": 0.07340734325706312, + "learning_rate": 5.293561713398676e-10, + "loss": 0.5008, + "step": 8186 + }, + { + "epoch": 3.978966565349544, + "grad_norm": 0.07397886319056134, + "learning_rate": 5.01862370317352e-10, + "loss": 0.578, + "step": 8187 + }, + { + "epoch": 3.979452887537994, + "grad_norm": 0.07252826850789273, + "learning_rate": 4.751016889231741e-10, + "loss": 0.541, + "step": 8188 + }, + { + "epoch": 3.9799392097264437, + "grad_norm": 0.07267762781569143, + "learning_rate": 4.490741310819724e-10, + "loss": 0.5543, + "step": 8189 + }, + { + "epoch": 3.980425531914894, + "grad_norm": 0.07150693103716799, + "learning_rate": 4.237797006106936e-10, + "loss": 0.5079, + "step": 8190 + }, + { + "epoch": 3.9809118541033435, + "grad_norm": 0.07052437152147178, + "learning_rate": 3.9921840121803777e-10, + "loss": 0.5186, + "step": 8191 + }, + { + "epoch": 3.981398176291793, + "grad_norm": 0.06880520378124522, + "learning_rate": 3.753902365061235e-10, + "loss": 0.4956, + "step": 8192 + }, + { + "epoch": 3.9818844984802433, + "grad_norm": 0.07061848070283695, + "learning_rate": 3.522952099682675e-10, + "loss": 0.5027, + "step": 8193 + }, + { + "epoch": 3.982370820668693, + "grad_norm": 0.07418091822139052, + "learning_rate": 3.2993332499176024e-10, + "loss": 0.5466, + "step": 8194 + }, + { + "epoch": 3.982857142857143, + "grad_norm": 0.07098606588952133, + "learning_rate": 3.083045848550903e-10, + "loss": 0.5056, + "step": 8195 + }, + { + "epoch": 3.983343465045593, + "grad_norm": 0.0726395717090971, + "learning_rate": 2.8740899273071996e-10, + "loss": 0.4973, + "step": 8196 + }, + { + "epoch": 3.9838297872340425, + "grad_norm": 0.07550587967062494, + "learning_rate": 2.672465516823097e-10, + "loss": 0.5216, + "step": 8197 + }, + { + "epoch": 3.984316109422492, + "grad_norm": 0.07121613661275214, + "learning_rate": 2.478172646663835e-10, + "loss": 0.4895, + "step": 8198 + }, + { + "epoch": 3.9848024316109423, + "grad_norm": 0.07246843768241126, + "learning_rate": 2.2912113453232854e-10, + "loss": 0.5193, + "step": 8199 + }, + { + "epoch": 3.985288753799392, + "grad_norm": 0.07212811789144231, + "learning_rate": 2.1115816402128563e-10, + "loss": 0.5295, + "step": 8200 + }, + { + "epoch": 3.985775075987842, + "grad_norm": 0.0853453254585128, + "learning_rate": 1.9392835576725888e-10, + "loss": 0.533, + "step": 8201 + }, + { + "epoch": 3.9862613981762918, + "grad_norm": 0.06961462277246325, + "learning_rate": 1.7743171229711586e-10, + "loss": 0.4879, + "step": 8202 + }, + { + "epoch": 3.9867477203647415, + "grad_norm": 0.06941821680312064, + "learning_rate": 1.6166823603058768e-10, + "loss": 0.5001, + "step": 8203 + }, + { + "epoch": 3.9872340425531916, + "grad_norm": 0.07211203081906382, + "learning_rate": 1.466379292774933e-10, + "loss": 0.5075, + "step": 8204 + }, + { + "epoch": 3.9877203647416413, + "grad_norm": 0.07044030763022163, + "learning_rate": 1.3234079424384593e-10, + "loss": 0.5077, + "step": 8205 + }, + { + "epoch": 3.9882066869300914, + "grad_norm": 0.07147018420974952, + "learning_rate": 1.187768330246364e-10, + "loss": 0.5169, + "step": 8206 + }, + { + "epoch": 3.988693009118541, + "grad_norm": 0.06998522917390052, + "learning_rate": 1.0594604760938431e-10, + "loss": 0.5009, + "step": 8207 + }, + { + "epoch": 3.9891793313069908, + "grad_norm": 0.07150452256496002, + "learning_rate": 9.384843987936266e-11, + "loss": 0.5144, + "step": 8208 + }, + { + "epoch": 3.989665653495441, + "grad_norm": 0.07307341410450179, + "learning_rate": 8.24840116092629e-11, + "loss": 0.519, + "step": 8209 + }, + { + "epoch": 3.9901519756838906, + "grad_norm": 0.06852716940560657, + "learning_rate": 7.185276446441958e-11, + "loss": 0.4841, + "step": 8210 + }, + { + "epoch": 3.9906382978723407, + "grad_norm": 0.07036543224293386, + "learning_rate": 6.195470000525116e-11, + "loss": 0.5117, + "step": 8211 + }, + { + "epoch": 3.9911246200607904, + "grad_norm": 0.07149663926051224, + "learning_rate": 5.278981968170893e-11, + "loss": 0.5038, + "step": 8212 + }, + { + "epoch": 3.99161094224924, + "grad_norm": 0.07080408322503196, + "learning_rate": 4.4358124838828064e-11, + "loss": 0.5283, + "step": 8213 + }, + { + "epoch": 3.9920972644376898, + "grad_norm": 0.07180663847748395, + "learning_rate": 3.665961671228679e-11, + "loss": 0.527, + "step": 8214 + }, + { + "epoch": 3.99258358662614, + "grad_norm": 0.07133509728331337, + "learning_rate": 2.969429643118193e-11, + "loss": 0.5191, + "step": 8215 + }, + { + "epoch": 3.9930699088145896, + "grad_norm": 0.07008770849451361, + "learning_rate": 2.346216501691867e-11, + "loss": 0.5078, + "step": 8216 + }, + { + "epoch": 3.9935562310030397, + "grad_norm": 0.07014451915149213, + "learning_rate": 1.796322338376566e-11, + "loss": 0.5052, + "step": 8217 + }, + { + "epoch": 3.9940425531914894, + "grad_norm": 0.07066078685559711, + "learning_rate": 1.3197472337744822e-11, + "loss": 0.5024, + "step": 8218 + }, + { + "epoch": 3.994528875379939, + "grad_norm": 0.07517486681995703, + "learning_rate": 9.164912577741547e-12, + "loss": 0.5505, + "step": 8219 + }, + { + "epoch": 3.995015197568389, + "grad_norm": 0.07108906354695735, + "learning_rate": 5.865544694949599e-12, + "loss": 0.4933, + "step": 8220 + }, + { + "epoch": 3.995501519756839, + "grad_norm": 0.06956036331651641, + "learning_rate": 3.299369172871103e-12, + "loss": 0.5009, + "step": 8221 + }, + { + "epoch": 3.995987841945289, + "grad_norm": 0.0706466963612353, + "learning_rate": 1.4663863889818885e-12, + "loss": 0.5074, + "step": 8222 + }, + { + "epoch": 3.9964741641337387, + "grad_norm": 0.07142461159105022, + "learning_rate": 3.665966108457042e-13, + "loss": 0.4782, + "step": 8223 + }, + { + "epoch": 3.9969604863221884, + "grad_norm": 0.07134949425623466, + "learning_rate": 0.0, + "loss": 0.5257, + "step": 8224 + }, + { + "epoch": 3.9969604863221884, + "eval_loss": 0.5693754553794861, + "eval_runtime": 104.9899, + "eval_samples_per_second": 289.104, + "eval_steps_per_second": 36.146, + "step": 8224 + }, + { + "epoch": 4.00048632218845, + "grad_norm": 0.06987722920988068, + "learning_rate": 5.008605527125408e-06, + "loss": 0.5249, + "step": 8225 + }, + { + "epoch": 4.000972644376899, + "grad_norm": 0.07304031658279857, + "learning_rate": 5.007649358237405e-06, + "loss": 0.4971, + "step": 8226 + }, + { + "epoch": 4.00145896656535, + "grad_norm": 0.07087156742228275, + "learning_rate": 5.006693189069661e-06, + "loss": 0.4925, + "step": 8227 + }, + { + "epoch": 4.0019452887538, + "grad_norm": 0.07299306342503452, + "learning_rate": 5.005737019657147e-06, + "loss": 0.5113, + "step": 8228 + }, + { + "epoch": 4.002431610942249, + "grad_norm": 0.07088033929123744, + "learning_rate": 5.004780850034825e-06, + "loss": 0.5015, + "step": 8229 + }, + { + "epoch": 4.002917933130699, + "grad_norm": 0.07625491773835517, + "learning_rate": 5.003824680237666e-06, + "loss": 0.5461, + "step": 8230 + }, + { + "epoch": 4.003404255319149, + "grad_norm": 0.07148663232221743, + "learning_rate": 5.002868510300636e-06, + "loss": 0.4638, + "step": 8231 + }, + { + "epoch": 4.003890577507598, + "grad_norm": 0.07442791255292215, + "learning_rate": 5.001912340258703e-06, + "loss": 0.5325, + "step": 8232 + }, + { + "epoch": 4.004376899696049, + "grad_norm": 0.07460171298984732, + "learning_rate": 5.000956170146836e-06, + "loss": 0.5423, + "step": 8233 + }, + { + "epoch": 4.004863221884499, + "grad_norm": 0.07751291052820865, + "learning_rate": 5e-06, + "loss": 0.5103, + "step": 8234 + }, + { + "epoch": 4.005349544072948, + "grad_norm": 0.0711764615739941, + "learning_rate": 4.999043829853165e-06, + "loss": 0.5166, + "step": 8235 + }, + { + "epoch": 4.005835866261398, + "grad_norm": 0.07327707870288805, + "learning_rate": 4.9980876597412985e-06, + "loss": 0.5106, + "step": 8236 + }, + { + "epoch": 4.006322188449848, + "grad_norm": 0.07373016803088536, + "learning_rate": 4.997131489699365e-06, + "loss": 0.508, + "step": 8237 + }, + { + "epoch": 4.006808510638298, + "grad_norm": 0.07611251044609917, + "learning_rate": 4.996175319762336e-06, + "loss": 0.5181, + "step": 8238 + }, + { + "epoch": 4.007294832826748, + "grad_norm": 0.07780645532882732, + "learning_rate": 4.995219149965176e-06, + "loss": 0.5149, + "step": 8239 + }, + { + "epoch": 4.007781155015198, + "grad_norm": 0.0726645716616451, + "learning_rate": 4.994262980342856e-06, + "loss": 0.5072, + "step": 8240 + }, + { + "epoch": 4.008267477203647, + "grad_norm": 0.07179560202076597, + "learning_rate": 4.993306810930339e-06, + "loss": 0.4785, + "step": 8241 + }, + { + "epoch": 4.008753799392097, + "grad_norm": 0.07201453349903571, + "learning_rate": 4.9923506417625955e-06, + "loss": 0.4783, + "step": 8242 + }, + { + "epoch": 4.0092401215805475, + "grad_norm": 0.07540298746065632, + "learning_rate": 4.9913944728745925e-06, + "loss": 0.5193, + "step": 8243 + }, + { + "epoch": 4.009726443768997, + "grad_norm": 0.07614780971146975, + "learning_rate": 4.990438304301299e-06, + "loss": 0.5054, + "step": 8244 + }, + { + "epoch": 4.010212765957447, + "grad_norm": 0.07529273969300707, + "learning_rate": 4.989482136077679e-06, + "loss": 0.5327, + "step": 8245 + }, + { + "epoch": 4.010699088145897, + "grad_norm": 0.0730295857152015, + "learning_rate": 4.988525968238703e-06, + "loss": 0.524, + "step": 8246 + }, + { + "epoch": 4.011185410334346, + "grad_norm": 0.07171160164164063, + "learning_rate": 4.987569800819337e-06, + "loss": 0.5181, + "step": 8247 + }, + { + "epoch": 4.011671732522796, + "grad_norm": 0.07076031014339057, + "learning_rate": 4.986613633854551e-06, + "loss": 0.4903, + "step": 8248 + }, + { + "epoch": 4.0121580547112465, + "grad_norm": 0.07501363195329876, + "learning_rate": 4.985657467379308e-06, + "loss": 0.5273, + "step": 8249 + }, + { + "epoch": 4.012644376899696, + "grad_norm": 0.07463473227894854, + "learning_rate": 4.98470130142858e-06, + "loss": 0.5223, + "step": 8250 + }, + { + "epoch": 4.013130699088146, + "grad_norm": 0.07716483246374314, + "learning_rate": 4.983745136037331e-06, + "loss": 0.5228, + "step": 8251 + }, + { + "epoch": 4.013617021276596, + "grad_norm": 0.07234220880101895, + "learning_rate": 4.98278897124053e-06, + "loss": 0.4782, + "step": 8252 + }, + { + "epoch": 4.014103343465045, + "grad_norm": 0.07384480458318275, + "learning_rate": 4.981832807073143e-06, + "loss": 0.512, + "step": 8253 + }, + { + "epoch": 4.014589665653496, + "grad_norm": 0.07981209815387338, + "learning_rate": 4.980876643570142e-06, + "loss": 0.5528, + "step": 8254 + }, + { + "epoch": 4.0150759878419455, + "grad_norm": 0.07530918411483194, + "learning_rate": 4.979920480766488e-06, + "loss": 0.536, + "step": 8255 + }, + { + "epoch": 4.015562310030395, + "grad_norm": 0.0753745968480417, + "learning_rate": 4.978964318697152e-06, + "loss": 0.4961, + "step": 8256 + }, + { + "epoch": 4.016048632218845, + "grad_norm": 0.0731400284668495, + "learning_rate": 4.978008157397099e-06, + "loss": 0.4932, + "step": 8257 + }, + { + "epoch": 4.016534954407295, + "grad_norm": 0.07608280575089299, + "learning_rate": 4.977051996901301e-06, + "loss": 0.5201, + "step": 8258 + }, + { + "epoch": 4.017021276595744, + "grad_norm": 0.07348973407919926, + "learning_rate": 4.9760958372447185e-06, + "loss": 0.4889, + "step": 8259 + }, + { + "epoch": 4.017507598784195, + "grad_norm": 0.07636240363913128, + "learning_rate": 4.975139678462324e-06, + "loss": 0.4906, + "step": 8260 + }, + { + "epoch": 4.0179939209726445, + "grad_norm": 0.0722656852489509, + "learning_rate": 4.974183520589082e-06, + "loss": 0.4855, + "step": 8261 + }, + { + "epoch": 4.018480243161094, + "grad_norm": 0.07498030476452544, + "learning_rate": 4.973227363659959e-06, + "loss": 0.5158, + "step": 8262 + }, + { + "epoch": 4.018966565349544, + "grad_norm": 0.07167201556798897, + "learning_rate": 4.9722712077099255e-06, + "loss": 0.5102, + "step": 8263 + }, + { + "epoch": 4.019452887537994, + "grad_norm": 0.07358820873873567, + "learning_rate": 4.971315052773945e-06, + "loss": 0.5262, + "step": 8264 + }, + { + "epoch": 4.019939209726444, + "grad_norm": 0.07436902319990749, + "learning_rate": 4.970358898886989e-06, + "loss": 0.5025, + "step": 8265 + }, + { + "epoch": 4.020425531914894, + "grad_norm": 0.07588409367566396, + "learning_rate": 4.969402746084019e-06, + "loss": 0.491, + "step": 8266 + }, + { + "epoch": 4.0209118541033435, + "grad_norm": 0.07609855860591161, + "learning_rate": 4.9684465944000045e-06, + "loss": 0.5063, + "step": 8267 + }, + { + "epoch": 4.021398176291793, + "grad_norm": 0.07346674002556423, + "learning_rate": 4.967490443869913e-06, + "loss": 0.4947, + "step": 8268 + }, + { + "epoch": 4.021884498480243, + "grad_norm": 0.07487369400302217, + "learning_rate": 4.966534294528711e-06, + "loss": 0.5124, + "step": 8269 + }, + { + "epoch": 4.0223708206686934, + "grad_norm": 0.07800721007168725, + "learning_rate": 4.965578146411364e-06, + "loss": 0.5565, + "step": 8270 + }, + { + "epoch": 4.022857142857143, + "grad_norm": 0.07800763230764782, + "learning_rate": 4.964621999552841e-06, + "loss": 0.505, + "step": 8271 + }, + { + "epoch": 4.023343465045593, + "grad_norm": 0.0724595747284464, + "learning_rate": 4.963665853988106e-06, + "loss": 0.4855, + "step": 8272 + }, + { + "epoch": 4.0238297872340425, + "grad_norm": 0.07816626330895647, + "learning_rate": 4.96270970975213e-06, + "loss": 0.5275, + "step": 8273 + }, + { + "epoch": 4.024316109422492, + "grad_norm": 0.07249627500905842, + "learning_rate": 4.961753566879874e-06, + "loss": 0.4942, + "step": 8274 + }, + { + "epoch": 4.024802431610942, + "grad_norm": 0.07347928119639274, + "learning_rate": 4.96079742540631e-06, + "loss": 0.5019, + "step": 8275 + }, + { + "epoch": 4.025288753799392, + "grad_norm": 0.07248405057913224, + "learning_rate": 4.9598412853663994e-06, + "loss": 0.5363, + "step": 8276 + }, + { + "epoch": 4.025775075987842, + "grad_norm": 0.07196782351160874, + "learning_rate": 4.958885146795113e-06, + "loss": 0.4902, + "step": 8277 + }, + { + "epoch": 4.026261398176292, + "grad_norm": 0.07526385761607707, + "learning_rate": 4.957929009727414e-06, + "loss": 0.4978, + "step": 8278 + }, + { + "epoch": 4.0267477203647415, + "grad_norm": 0.07864048019359021, + "learning_rate": 4.956972874198272e-06, + "loss": 0.5152, + "step": 8279 + }, + { + "epoch": 4.027234042553191, + "grad_norm": 0.07759430388828786, + "learning_rate": 4.956016740242651e-06, + "loss": 0.5286, + "step": 8280 + }, + { + "epoch": 4.027720364741642, + "grad_norm": 0.07246746999491008, + "learning_rate": 4.955060607895517e-06, + "loss": 0.4609, + "step": 8281 + }, + { + "epoch": 4.028206686930091, + "grad_norm": 0.07600816888442034, + "learning_rate": 4.954104477191837e-06, + "loss": 0.5145, + "step": 8282 + }, + { + "epoch": 4.028693009118541, + "grad_norm": 0.07374485174005918, + "learning_rate": 4.953148348166579e-06, + "loss": 0.5126, + "step": 8283 + }, + { + "epoch": 4.029179331306991, + "grad_norm": 0.07296946755934197, + "learning_rate": 4.9521922208547045e-06, + "loss": 0.4964, + "step": 8284 + }, + { + "epoch": 4.0296656534954405, + "grad_norm": 0.07666621804184696, + "learning_rate": 4.951236095291184e-06, + "loss": 0.5196, + "step": 8285 + }, + { + "epoch": 4.03015197568389, + "grad_norm": 0.07309108649474487, + "learning_rate": 4.95027997151098e-06, + "loss": 0.5099, + "step": 8286 + }, + { + "epoch": 4.030638297872341, + "grad_norm": 0.07485528933237076, + "learning_rate": 4.94932384954906e-06, + "loss": 0.5074, + "step": 8287 + }, + { + "epoch": 4.03112462006079, + "grad_norm": 0.07492133312897797, + "learning_rate": 4.948367729440393e-06, + "loss": 0.5202, + "step": 8288 + }, + { + "epoch": 4.03161094224924, + "grad_norm": 0.0761830811916514, + "learning_rate": 4.947411611219938e-06, + "loss": 0.5384, + "step": 8289 + }, + { + "epoch": 4.03209726443769, + "grad_norm": 0.07395306993404442, + "learning_rate": 4.946455494922668e-06, + "loss": 0.5199, + "step": 8290 + }, + { + "epoch": 4.0325835866261395, + "grad_norm": 0.0757393358938198, + "learning_rate": 4.945499380583541e-06, + "loss": 0.5255, + "step": 8291 + }, + { + "epoch": 4.03306990881459, + "grad_norm": 0.07465702270700343, + "learning_rate": 4.944543268237529e-06, + "loss": 0.4846, + "step": 8292 + }, + { + "epoch": 4.03355623100304, + "grad_norm": 0.07416351978628313, + "learning_rate": 4.943587157919593e-06, + "loss": 0.5087, + "step": 8293 + }, + { + "epoch": 4.034042553191489, + "grad_norm": 0.07499156015079488, + "learning_rate": 4.9426310496647025e-06, + "loss": 0.5032, + "step": 8294 + }, + { + "epoch": 4.034528875379939, + "grad_norm": 0.07499909907130087, + "learning_rate": 4.941674943507818e-06, + "loss": 0.5284, + "step": 8295 + }, + { + "epoch": 4.035015197568389, + "grad_norm": 0.07565107936721804, + "learning_rate": 4.940718839483909e-06, + "loss": 0.5297, + "step": 8296 + }, + { + "epoch": 4.0355015197568385, + "grad_norm": 0.07551316318035235, + "learning_rate": 4.939762737627938e-06, + "loss": 0.4989, + "step": 8297 + }, + { + "epoch": 4.035987841945289, + "grad_norm": 0.07292462208638406, + "learning_rate": 4.9388066379748725e-06, + "loss": 0.4991, + "step": 8298 + }, + { + "epoch": 4.036474164133739, + "grad_norm": 0.07018192302429595, + "learning_rate": 4.937850540559675e-06, + "loss": 0.483, + "step": 8299 + }, + { + "epoch": 4.036960486322188, + "grad_norm": 0.07552310193010787, + "learning_rate": 4.936894445417312e-06, + "loss": 0.5109, + "step": 8300 + }, + { + "epoch": 4.037446808510638, + "grad_norm": 0.07770641866458226, + "learning_rate": 4.935938352582747e-06, + "loss": 0.5493, + "step": 8301 + }, + { + "epoch": 4.037933130699088, + "grad_norm": 0.07626169461086708, + "learning_rate": 4.934982262090947e-06, + "loss": 0.5197, + "step": 8302 + }, + { + "epoch": 4.038419452887538, + "grad_norm": 0.07029229678580952, + "learning_rate": 4.9340261739768734e-06, + "loss": 0.4666, + "step": 8303 + }, + { + "epoch": 4.038905775075988, + "grad_norm": 0.07058071882128329, + "learning_rate": 4.933070088275494e-06, + "loss": 0.4927, + "step": 8304 + }, + { + "epoch": 4.039392097264438, + "grad_norm": 0.07227584313506363, + "learning_rate": 4.932114005021772e-06, + "loss": 0.4763, + "step": 8305 + }, + { + "epoch": 4.039878419452887, + "grad_norm": 0.07529344441574466, + "learning_rate": 4.93115792425067e-06, + "loss": 0.5112, + "step": 8306 + }, + { + "epoch": 4.040364741641337, + "grad_norm": 0.07246324321490295, + "learning_rate": 4.930201845997155e-06, + "loss": 0.4803, + "step": 8307 + }, + { + "epoch": 4.040851063829788, + "grad_norm": 0.07152337273850556, + "learning_rate": 4.929245770296191e-06, + "loss": 0.4806, + "step": 8308 + }, + { + "epoch": 4.041337386018237, + "grad_norm": 0.07504907189175945, + "learning_rate": 4.92828969718274e-06, + "loss": 0.5339, + "step": 8309 + }, + { + "epoch": 4.041823708206687, + "grad_norm": 0.07109371557920573, + "learning_rate": 4.9273336266917685e-06, + "loss": 0.4968, + "step": 8310 + }, + { + "epoch": 4.042310030395137, + "grad_norm": 0.07384910355549405, + "learning_rate": 4.926377558858238e-06, + "loss": 0.5142, + "step": 8311 + }, + { + "epoch": 4.042796352583586, + "grad_norm": 0.07911339434259716, + "learning_rate": 4.9254214937171144e-06, + "loss": 0.546, + "step": 8312 + }, + { + "epoch": 4.043282674772036, + "grad_norm": 0.07433159137103462, + "learning_rate": 4.92446543130336e-06, + "loss": 0.5068, + "step": 8313 + }, + { + "epoch": 4.043768996960487, + "grad_norm": 0.07489035554560758, + "learning_rate": 4.923509371651939e-06, + "loss": 0.5293, + "step": 8314 + }, + { + "epoch": 4.044255319148936, + "grad_norm": 0.07449143827797121, + "learning_rate": 4.922553314797817e-06, + "loss": 0.4977, + "step": 8315 + }, + { + "epoch": 4.044741641337386, + "grad_norm": 0.07086697161949952, + "learning_rate": 4.921597260775954e-06, + "loss": 0.4806, + "step": 8316 + }, + { + "epoch": 4.045227963525836, + "grad_norm": 0.0741159875188485, + "learning_rate": 4.920641209621315e-06, + "loss": 0.5156, + "step": 8317 + }, + { + "epoch": 4.045714285714285, + "grad_norm": 0.07376534038221326, + "learning_rate": 4.919685161368862e-06, + "loss": 0.5418, + "step": 8318 + }, + { + "epoch": 4.046200607902736, + "grad_norm": 0.07487511790617565, + "learning_rate": 4.9187291160535615e-06, + "loss": 0.5286, + "step": 8319 + }, + { + "epoch": 4.046686930091186, + "grad_norm": 0.0726235926555809, + "learning_rate": 4.917773073710372e-06, + "loss": 0.513, + "step": 8320 + }, + { + "epoch": 4.047173252279635, + "grad_norm": 0.07426291642458527, + "learning_rate": 4.916817034374259e-06, + "loss": 0.5015, + "step": 8321 + }, + { + "epoch": 4.047659574468085, + "grad_norm": 0.07311915002192036, + "learning_rate": 4.915860998080184e-06, + "loss": 0.5106, + "step": 8322 + }, + { + "epoch": 4.048145896656535, + "grad_norm": 0.07358610562882019, + "learning_rate": 4.914904964863113e-06, + "loss": 0.4932, + "step": 8323 + }, + { + "epoch": 4.048632218844984, + "grad_norm": 0.07442255063043904, + "learning_rate": 4.913948934758004e-06, + "loss": 0.5242, + "step": 8324 + }, + { + "epoch": 4.049118541033435, + "grad_norm": 0.07359918688109728, + "learning_rate": 4.912992907799823e-06, + "loss": 0.5384, + "step": 8325 + }, + { + "epoch": 4.049604863221885, + "grad_norm": 0.07026683099809078, + "learning_rate": 4.912036884023529e-06, + "loss": 0.5004, + "step": 8326 + }, + { + "epoch": 4.050091185410334, + "grad_norm": 0.07135693577891172, + "learning_rate": 4.9110808634640885e-06, + "loss": 0.5033, + "step": 8327 + }, + { + "epoch": 4.050577507598784, + "grad_norm": 0.0737125754595248, + "learning_rate": 4.910124846156459e-06, + "loss": 0.5023, + "step": 8328 + }, + { + "epoch": 4.051063829787234, + "grad_norm": 0.07168587289438462, + "learning_rate": 4.909168832135607e-06, + "loss": 0.4884, + "step": 8329 + }, + { + "epoch": 4.051550151975684, + "grad_norm": 0.07666287110516042, + "learning_rate": 4.90821282143649e-06, + "loss": 0.5043, + "step": 8330 + }, + { + "epoch": 4.052036474164134, + "grad_norm": 0.07949253015617697, + "learning_rate": 4.907256814094073e-06, + "loss": 0.5916, + "step": 8331 + }, + { + "epoch": 4.052522796352584, + "grad_norm": 0.07264451891553099, + "learning_rate": 4.9063008101433156e-06, + "loss": 0.4927, + "step": 8332 + }, + { + "epoch": 4.053009118541033, + "grad_norm": 0.0753920070326215, + "learning_rate": 4.905344809619182e-06, + "loss": 0.5171, + "step": 8333 + }, + { + "epoch": 4.053495440729483, + "grad_norm": 0.07289025039595012, + "learning_rate": 4.904388812556629e-06, + "loss": 0.4758, + "step": 8334 + }, + { + "epoch": 4.0539817629179336, + "grad_norm": 0.07505849587045671, + "learning_rate": 4.9034328189906226e-06, + "loss": 0.525, + "step": 8335 + }, + { + "epoch": 4.054468085106383, + "grad_norm": 0.07331274692768641, + "learning_rate": 4.90247682895612e-06, + "loss": 0.5023, + "step": 8336 + }, + { + "epoch": 4.054954407294833, + "grad_norm": 0.07358955410934745, + "learning_rate": 4.901520842488087e-06, + "loss": 0.4829, + "step": 8337 + }, + { + "epoch": 4.055440729483283, + "grad_norm": 0.07354169698942832, + "learning_rate": 4.900564859621479e-06, + "loss": 0.5028, + "step": 8338 + }, + { + "epoch": 4.055927051671732, + "grad_norm": 0.0719302294944975, + "learning_rate": 4.899608880391259e-06, + "loss": 0.5086, + "step": 8339 + }, + { + "epoch": 4.056413373860182, + "grad_norm": 0.07258150943863832, + "learning_rate": 4.898652904832389e-06, + "loss": 0.4974, + "step": 8340 + }, + { + "epoch": 4.0568996960486325, + "grad_norm": 0.08029129607170804, + "learning_rate": 4.897696932979827e-06, + "loss": 0.4945, + "step": 8341 + }, + { + "epoch": 4.057386018237082, + "grad_norm": 0.07864295424139797, + "learning_rate": 4.896740964868537e-06, + "loss": 0.5663, + "step": 8342 + }, + { + "epoch": 4.057872340425532, + "grad_norm": 0.07183141716746466, + "learning_rate": 4.895785000533475e-06, + "loss": 0.5172, + "step": 8343 + }, + { + "epoch": 4.058358662613982, + "grad_norm": 0.07077327935443194, + "learning_rate": 4.894829040009606e-06, + "loss": 0.4706, + "step": 8344 + }, + { + "epoch": 4.058844984802431, + "grad_norm": 0.07213827545903248, + "learning_rate": 4.8938730833318825e-06, + "loss": 0.4723, + "step": 8345 + }, + { + "epoch": 4.059331306990882, + "grad_norm": 0.07322353846768895, + "learning_rate": 4.892917130535271e-06, + "loss": 0.5266, + "step": 8346 + }, + { + "epoch": 4.0598176291793315, + "grad_norm": 0.07160238873466572, + "learning_rate": 4.891961181654727e-06, + "loss": 0.5026, + "step": 8347 + }, + { + "epoch": 4.060303951367781, + "grad_norm": 0.07510253043640047, + "learning_rate": 4.8910052367252146e-06, + "loss": 0.513, + "step": 8348 + }, + { + "epoch": 4.060790273556231, + "grad_norm": 0.07480732881359131, + "learning_rate": 4.890049295781687e-06, + "loss": 0.5158, + "step": 8349 + }, + { + "epoch": 4.061276595744681, + "grad_norm": 0.07687670361687345, + "learning_rate": 4.889093358859108e-06, + "loss": 0.5014, + "step": 8350 + }, + { + "epoch": 4.06176291793313, + "grad_norm": 0.07379484040025416, + "learning_rate": 4.888137425992435e-06, + "loss": 0.51, + "step": 8351 + }, + { + "epoch": 4.062249240121581, + "grad_norm": 0.07554029126287756, + "learning_rate": 4.887181497216628e-06, + "loss": 0.5162, + "step": 8352 + }, + { + "epoch": 4.0627355623100305, + "grad_norm": 0.07341335294156111, + "learning_rate": 4.886225572566644e-06, + "loss": 0.5118, + "step": 8353 + }, + { + "epoch": 4.06322188449848, + "grad_norm": 0.07361731471875785, + "learning_rate": 4.885269652077444e-06, + "loss": 0.4636, + "step": 8354 + }, + { + "epoch": 4.06370820668693, + "grad_norm": 0.07365564250204958, + "learning_rate": 4.8843137357839836e-06, + "loss": 0.4767, + "step": 8355 + }, + { + "epoch": 4.06419452887538, + "grad_norm": 0.0729357769344785, + "learning_rate": 4.883357823721222e-06, + "loss": 0.5179, + "step": 8356 + }, + { + "epoch": 4.06468085106383, + "grad_norm": 0.07149240912487374, + "learning_rate": 4.8824019159241175e-06, + "loss": 0.5041, + "step": 8357 + }, + { + "epoch": 4.06516717325228, + "grad_norm": 0.0712629026057834, + "learning_rate": 4.8814460124276305e-06, + "loss": 0.5091, + "step": 8358 + }, + { + "epoch": 4.0656534954407295, + "grad_norm": 0.07826148458011131, + "learning_rate": 4.880490113266715e-06, + "loss": 0.5324, + "step": 8359 + }, + { + "epoch": 4.066139817629179, + "grad_norm": 0.07596092969879549, + "learning_rate": 4.879534218476331e-06, + "loss": 0.5138, + "step": 8360 + }, + { + "epoch": 4.066626139817629, + "grad_norm": 0.07451951136200978, + "learning_rate": 4.878578328091434e-06, + "loss": 0.5292, + "step": 8361 + }, + { + "epoch": 4.0671124620060795, + "grad_norm": 0.0724650741277835, + "learning_rate": 4.877622442146985e-06, + "loss": 0.496, + "step": 8362 + }, + { + "epoch": 4.067598784194529, + "grad_norm": 0.07628316436925109, + "learning_rate": 4.876666560677937e-06, + "loss": 0.5263, + "step": 8363 + }, + { + "epoch": 4.068085106382979, + "grad_norm": 0.07563351111660888, + "learning_rate": 4.87571068371925e-06, + "loss": 0.5333, + "step": 8364 + }, + { + "epoch": 4.0685714285714285, + "grad_norm": 0.07377569860097302, + "learning_rate": 4.874754811305879e-06, + "loss": 0.483, + "step": 8365 + }, + { + "epoch": 4.069057750759878, + "grad_norm": 0.07739807928793731, + "learning_rate": 4.873798943472781e-06, + "loss": 0.5356, + "step": 8366 + }, + { + "epoch": 4.069544072948328, + "grad_norm": 0.07166219387027353, + "learning_rate": 4.872843080254915e-06, + "loss": 0.4746, + "step": 8367 + }, + { + "epoch": 4.0700303951367784, + "grad_norm": 0.07359154610133878, + "learning_rate": 4.871887221687233e-06, + "loss": 0.5065, + "step": 8368 + }, + { + "epoch": 4.070516717325228, + "grad_norm": 0.07649912048346863, + "learning_rate": 4.870931367804696e-06, + "loss": 0.5399, + "step": 8369 + }, + { + "epoch": 4.071003039513678, + "grad_norm": 0.07548857514330723, + "learning_rate": 4.869975518642255e-06, + "loss": 0.4921, + "step": 8370 + }, + { + "epoch": 4.0714893617021275, + "grad_norm": 0.07468029251851437, + "learning_rate": 4.86901967423487e-06, + "loss": 0.5233, + "step": 8371 + }, + { + "epoch": 4.071975683890577, + "grad_norm": 0.07799294894236818, + "learning_rate": 4.868063834617494e-06, + "loss": 0.5275, + "step": 8372 + }, + { + "epoch": 4.072462006079028, + "grad_norm": 0.07864036649754841, + "learning_rate": 4.867107999825085e-06, + "loss": 0.4984, + "step": 8373 + }, + { + "epoch": 4.072948328267477, + "grad_norm": 0.07821686722444757, + "learning_rate": 4.866152169892595e-06, + "loss": 0.5696, + "step": 8374 + }, + { + "epoch": 4.073434650455927, + "grad_norm": 0.07417454247051666, + "learning_rate": 4.865196344854982e-06, + "loss": 0.5102, + "step": 8375 + }, + { + "epoch": 4.073920972644377, + "grad_norm": 0.07668708019371931, + "learning_rate": 4.864240524747199e-06, + "loss": 0.5109, + "step": 8376 + }, + { + "epoch": 4.0744072948328265, + "grad_norm": 0.08065066959386501, + "learning_rate": 4.863284709604204e-06, + "loss": 0.5123, + "step": 8377 + }, + { + "epoch": 4.074893617021276, + "grad_norm": 0.07448139730342614, + "learning_rate": 4.862328899460947e-06, + "loss": 0.4595, + "step": 8378 + }, + { + "epoch": 4.075379939209727, + "grad_norm": 0.07671842343736177, + "learning_rate": 4.861373094352386e-06, + "loss": 0.5419, + "step": 8379 + }, + { + "epoch": 4.075866261398176, + "grad_norm": 0.07777131323591312, + "learning_rate": 4.860417294313472e-06, + "loss": 0.5249, + "step": 8380 + }, + { + "epoch": 4.076352583586626, + "grad_norm": 0.07598653032454367, + "learning_rate": 4.859461499379164e-06, + "loss": 0.5469, + "step": 8381 + }, + { + "epoch": 4.076838905775076, + "grad_norm": 0.07176166833320136, + "learning_rate": 4.85850570958441e-06, + "loss": 0.4888, + "step": 8382 + }, + { + "epoch": 4.0773252279635255, + "grad_norm": 0.07276747438348917, + "learning_rate": 4.857549924964169e-06, + "loss": 0.4932, + "step": 8383 + }, + { + "epoch": 4.077811550151976, + "grad_norm": 0.07463859499197374, + "learning_rate": 4.856594145553389e-06, + "loss": 0.5122, + "step": 8384 + }, + { + "epoch": 4.078297872340426, + "grad_norm": 0.0806355385410711, + "learning_rate": 4.855638371387029e-06, + "loss": 0.5452, + "step": 8385 + }, + { + "epoch": 4.078784194528875, + "grad_norm": 0.07315902574618881, + "learning_rate": 4.854682602500037e-06, + "loss": 0.5142, + "step": 8386 + }, + { + "epoch": 4.079270516717325, + "grad_norm": 0.07143494080084337, + "learning_rate": 4.853726838927371e-06, + "loss": 0.494, + "step": 8387 + }, + { + "epoch": 4.079756838905775, + "grad_norm": 0.07131911191310064, + "learning_rate": 4.852771080703978e-06, + "loss": 0.4896, + "step": 8388 + }, + { + "epoch": 4.080243161094225, + "grad_norm": 0.07468581761156592, + "learning_rate": 4.851815327864815e-06, + "loss": 0.5266, + "step": 8389 + }, + { + "epoch": 4.080729483282675, + "grad_norm": 0.07662789409288262, + "learning_rate": 4.850859580444832e-06, + "loss": 0.5094, + "step": 8390 + }, + { + "epoch": 4.081215805471125, + "grad_norm": 0.07452174842149409, + "learning_rate": 4.8499038384789816e-06, + "loss": 0.5022, + "step": 8391 + }, + { + "epoch": 4.081702127659574, + "grad_norm": 0.0751309596755392, + "learning_rate": 4.848948102002218e-06, + "loss": 0.503, + "step": 8392 + }, + { + "epoch": 4.082188449848024, + "grad_norm": 0.0740922383507223, + "learning_rate": 4.847992371049489e-06, + "loss": 0.539, + "step": 8393 + }, + { + "epoch": 4.082674772036474, + "grad_norm": 0.07585553411796839, + "learning_rate": 4.847036645655749e-06, + "loss": 0.5517, + "step": 8394 + }, + { + "epoch": 4.083161094224924, + "grad_norm": 0.07299891860442781, + "learning_rate": 4.846080925855947e-06, + "loss": 0.5157, + "step": 8395 + }, + { + "epoch": 4.083647416413374, + "grad_norm": 0.07350908093952695, + "learning_rate": 4.845125211685039e-06, + "loss": 0.5038, + "step": 8396 + }, + { + "epoch": 4.084133738601824, + "grad_norm": 0.07247360758118054, + "learning_rate": 4.844169503177969e-06, + "loss": 0.4977, + "step": 8397 + }, + { + "epoch": 4.084620060790273, + "grad_norm": 0.07567373547740953, + "learning_rate": 4.843213800369694e-06, + "loss": 0.5195, + "step": 8398 + }, + { + "epoch": 4.085106382978723, + "grad_norm": 0.07444425012663346, + "learning_rate": 4.842258103295159e-06, + "loss": 0.5305, + "step": 8399 + }, + { + "epoch": 4.085592705167174, + "grad_norm": 0.07598434185453064, + "learning_rate": 4.841302411989318e-06, + "loss": 0.5206, + "step": 8400 + }, + { + "epoch": 4.086079027355623, + "grad_norm": 0.07677697022737197, + "learning_rate": 4.840346726487119e-06, + "loss": 0.5021, + "step": 8401 + }, + { + "epoch": 4.086565349544073, + "grad_norm": 0.07728882643219065, + "learning_rate": 4.839391046823514e-06, + "loss": 0.5314, + "step": 8402 + }, + { + "epoch": 4.087051671732523, + "grad_norm": 0.07358295326272597, + "learning_rate": 4.83843537303345e-06, + "loss": 0.4904, + "step": 8403 + }, + { + "epoch": 4.087537993920972, + "grad_norm": 0.07593188535565605, + "learning_rate": 4.837479705151878e-06, + "loss": 0.5259, + "step": 8404 + }, + { + "epoch": 4.088024316109422, + "grad_norm": 0.07411325100474116, + "learning_rate": 4.8365240432137465e-06, + "loss": 0.5289, + "step": 8405 + }, + { + "epoch": 4.088510638297873, + "grad_norm": 0.07211279051929438, + "learning_rate": 4.835568387254008e-06, + "loss": 0.4695, + "step": 8406 + }, + { + "epoch": 4.088996960486322, + "grad_norm": 0.07484746303545112, + "learning_rate": 4.8346127373076044e-06, + "loss": 0.5221, + "step": 8407 + }, + { + "epoch": 4.089483282674772, + "grad_norm": 0.0744416300688764, + "learning_rate": 4.833657093409491e-06, + "loss": 0.5126, + "step": 8408 + }, + { + "epoch": 4.089969604863222, + "grad_norm": 0.07269969955565858, + "learning_rate": 4.832701455594612e-06, + "loss": 0.5214, + "step": 8409 + }, + { + "epoch": 4.090455927051671, + "grad_norm": 0.07335578739278328, + "learning_rate": 4.831745823897917e-06, + "loss": 0.5138, + "step": 8410 + }, + { + "epoch": 4.090942249240122, + "grad_norm": 0.07635890634093613, + "learning_rate": 4.8307901983543535e-06, + "loss": 0.5171, + "step": 8411 + }, + { + "epoch": 4.091428571428572, + "grad_norm": 0.0766818077182476, + "learning_rate": 4.82983457899887e-06, + "loss": 0.5347, + "step": 8412 + }, + { + "epoch": 4.091914893617021, + "grad_norm": 0.07118819501345552, + "learning_rate": 4.8288789658664125e-06, + "loss": 0.4925, + "step": 8413 + }, + { + "epoch": 4.092401215805471, + "grad_norm": 0.0786071043349153, + "learning_rate": 4.827923358991929e-06, + "loss": 0.5483, + "step": 8414 + }, + { + "epoch": 4.092887537993921, + "grad_norm": 0.07650896870916511, + "learning_rate": 4.826967758410366e-06, + "loss": 0.5056, + "step": 8415 + }, + { + "epoch": 4.093373860182371, + "grad_norm": 0.07461718401611261, + "learning_rate": 4.826012164156673e-06, + "loss": 0.4904, + "step": 8416 + }, + { + "epoch": 4.093860182370821, + "grad_norm": 0.07544098451107405, + "learning_rate": 4.825056576265791e-06, + "loss": 0.4898, + "step": 8417 + }, + { + "epoch": 4.094346504559271, + "grad_norm": 0.07519129722240067, + "learning_rate": 4.824100994772671e-06, + "loss": 0.5369, + "step": 8418 + }, + { + "epoch": 4.09483282674772, + "grad_norm": 0.07516849338187878, + "learning_rate": 4.8231454197122575e-06, + "loss": 0.5237, + "step": 8419 + }, + { + "epoch": 4.09531914893617, + "grad_norm": 0.07856254107428146, + "learning_rate": 4.822189851119495e-06, + "loss": 0.5007, + "step": 8420 + }, + { + "epoch": 4.09580547112462, + "grad_norm": 0.07515755797835864, + "learning_rate": 4.8212342890293335e-06, + "loss": 0.5205, + "step": 8421 + }, + { + "epoch": 4.09629179331307, + "grad_norm": 0.07631363036739496, + "learning_rate": 4.820278733476713e-06, + "loss": 0.5279, + "step": 8422 + }, + { + "epoch": 4.09677811550152, + "grad_norm": 0.07539843259341894, + "learning_rate": 4.819323184496582e-06, + "loss": 0.5551, + "step": 8423 + }, + { + "epoch": 4.09726443768997, + "grad_norm": 0.07663154193959248, + "learning_rate": 4.818367642123883e-06, + "loss": 0.5129, + "step": 8424 + }, + { + "epoch": 4.097750759878419, + "grad_norm": 0.07131465105836605, + "learning_rate": 4.817412106393563e-06, + "loss": 0.5329, + "step": 8425 + }, + { + "epoch": 4.098237082066869, + "grad_norm": 0.07602080319672558, + "learning_rate": 4.816456577340564e-06, + "loss": 0.4994, + "step": 8426 + }, + { + "epoch": 4.09872340425532, + "grad_norm": 0.07559629619139957, + "learning_rate": 4.815501054999834e-06, + "loss": 0.512, + "step": 8427 + }, + { + "epoch": 4.099209726443769, + "grad_norm": 0.07919439788068093, + "learning_rate": 4.814545539406311e-06, + "loss": 0.5187, + "step": 8428 + }, + { + "epoch": 4.099696048632219, + "grad_norm": 0.07850853057203927, + "learning_rate": 4.813590030594944e-06, + "loss": 0.4908, + "step": 8429 + }, + { + "epoch": 4.100182370820669, + "grad_norm": 0.07743809474402, + "learning_rate": 4.812634528600673e-06, + "loss": 0.5484, + "step": 8430 + }, + { + "epoch": 4.100668693009118, + "grad_norm": 0.07361810116202264, + "learning_rate": 4.8116790334584435e-06, + "loss": 0.4627, + "step": 8431 + }, + { + "epoch": 4.101155015197568, + "grad_norm": 0.07378348470796924, + "learning_rate": 4.810723545203196e-06, + "loss": 0.4913, + "step": 8432 + }, + { + "epoch": 4.1016413373860185, + "grad_norm": 0.07241972587777123, + "learning_rate": 4.809768063869875e-06, + "loss": 0.5176, + "step": 8433 + }, + { + "epoch": 4.102127659574468, + "grad_norm": 0.07457140313801962, + "learning_rate": 4.8088125894934215e-06, + "loss": 0.5097, + "step": 8434 + }, + { + "epoch": 4.102613981762918, + "grad_norm": 0.07494365427789915, + "learning_rate": 4.807857122108781e-06, + "loss": 0.5018, + "step": 8435 + }, + { + "epoch": 4.103100303951368, + "grad_norm": 0.07571627120749001, + "learning_rate": 4.806901661750891e-06, + "loss": 0.5162, + "step": 8436 + }, + { + "epoch": 4.103586626139817, + "grad_norm": 0.07296338650363951, + "learning_rate": 4.8059462084546965e-06, + "loss": 0.5006, + "step": 8437 + }, + { + "epoch": 4.104072948328268, + "grad_norm": 0.0766451506480599, + "learning_rate": 4.804990762255135e-06, + "loss": 0.5208, + "step": 8438 + }, + { + "epoch": 4.1045592705167175, + "grad_norm": 0.07165932527790829, + "learning_rate": 4.8040353231871515e-06, + "loss": 0.4932, + "step": 8439 + }, + { + "epoch": 4.105045592705167, + "grad_norm": 0.0756174238550916, + "learning_rate": 4.803079891285684e-06, + "loss": 0.5354, + "step": 8440 + }, + { + "epoch": 4.105531914893617, + "grad_norm": 0.07355449036255178, + "learning_rate": 4.8021244665856764e-06, + "loss": 0.4884, + "step": 8441 + }, + { + "epoch": 4.106018237082067, + "grad_norm": 0.07263658461320514, + "learning_rate": 4.801169049122065e-06, + "loss": 0.4791, + "step": 8442 + }, + { + "epoch": 4.106504559270517, + "grad_norm": 0.07264491467987305, + "learning_rate": 4.800213638929792e-06, + "loss": 0.4948, + "step": 8443 + }, + { + "epoch": 4.106990881458967, + "grad_norm": 0.075062382012485, + "learning_rate": 4.799258236043797e-06, + "loss": 0.512, + "step": 8444 + }, + { + "epoch": 4.1074772036474165, + "grad_norm": 0.07701915905724878, + "learning_rate": 4.798302840499019e-06, + "loss": 0.5009, + "step": 8445 + }, + { + "epoch": 4.107963525835866, + "grad_norm": 0.07432349917959506, + "learning_rate": 4.7973474523304e-06, + "loss": 0.4869, + "step": 8446 + }, + { + "epoch": 4.108449848024316, + "grad_norm": 0.07604818737076773, + "learning_rate": 4.796392071572875e-06, + "loss": 0.5073, + "step": 8447 + }, + { + "epoch": 4.108936170212766, + "grad_norm": 0.07918190398610037, + "learning_rate": 4.795436698261386e-06, + "loss": 0.5458, + "step": 8448 + }, + { + "epoch": 4.109422492401216, + "grad_norm": 0.07241430394658066, + "learning_rate": 4.794481332430868e-06, + "loss": 0.5071, + "step": 8449 + }, + { + "epoch": 4.109908814589666, + "grad_norm": 0.07333025741684206, + "learning_rate": 4.793525974116262e-06, + "loss": 0.5073, + "step": 8450 + }, + { + "epoch": 4.1103951367781155, + "grad_norm": 0.07369458476874377, + "learning_rate": 4.792570623352504e-06, + "loss": 0.5086, + "step": 8451 + }, + { + "epoch": 4.110881458966565, + "grad_norm": 0.07554204454969354, + "learning_rate": 4.791615280174535e-06, + "loss": 0.4789, + "step": 8452 + }, + { + "epoch": 4.111367781155015, + "grad_norm": 0.07462955760059636, + "learning_rate": 4.790659944617287e-06, + "loss": 0.5258, + "step": 8453 + }, + { + "epoch": 4.1118541033434655, + "grad_norm": 0.07663972165689943, + "learning_rate": 4.789704616715701e-06, + "loss": 0.515, + "step": 8454 + }, + { + "epoch": 4.112340425531915, + "grad_norm": 0.07386731959863665, + "learning_rate": 4.788749296504712e-06, + "loss": 0.5451, + "step": 8455 + }, + { + "epoch": 4.112826747720365, + "grad_norm": 0.07300364202219237, + "learning_rate": 4.78779398401926e-06, + "loss": 0.4853, + "step": 8456 + }, + { + "epoch": 4.1133130699088145, + "grad_norm": 0.07480187749125185, + "learning_rate": 4.786838679294275e-06, + "loss": 0.5021, + "step": 8457 + }, + { + "epoch": 4.113799392097264, + "grad_norm": 0.07316756874914099, + "learning_rate": 4.785883382364698e-06, + "loss": 0.5024, + "step": 8458 + }, + { + "epoch": 4.114285714285714, + "grad_norm": 0.07575717063939928, + "learning_rate": 4.7849280932654625e-06, + "loss": 0.5187, + "step": 8459 + }, + { + "epoch": 4.1147720364741645, + "grad_norm": 0.07309875939571202, + "learning_rate": 4.783972812031506e-06, + "loss": 0.4893, + "step": 8460 + }, + { + "epoch": 4.115258358662614, + "grad_norm": 0.07232193939457907, + "learning_rate": 4.783017538697759e-06, + "loss": 0.5138, + "step": 8461 + }, + { + "epoch": 4.115744680851064, + "grad_norm": 0.07912145505561217, + "learning_rate": 4.782062273299163e-06, + "loss": 0.5809, + "step": 8462 + }, + { + "epoch": 4.1162310030395135, + "grad_norm": 0.07345369400214838, + "learning_rate": 4.781107015870645e-06, + "loss": 0.5109, + "step": 8463 + }, + { + "epoch": 4.116717325227963, + "grad_norm": 0.07494664278290203, + "learning_rate": 4.780151766447145e-06, + "loss": 0.5151, + "step": 8464 + }, + { + "epoch": 4.117203647416414, + "grad_norm": 0.07702789300248726, + "learning_rate": 4.779196525063593e-06, + "loss": 0.521, + "step": 8465 + }, + { + "epoch": 4.117689969604863, + "grad_norm": 0.07609796343293225, + "learning_rate": 4.778241291754927e-06, + "loss": 0.5615, + "step": 8466 + }, + { + "epoch": 4.118176291793313, + "grad_norm": 0.0733806476170799, + "learning_rate": 4.777286066556075e-06, + "loss": 0.4975, + "step": 8467 + }, + { + "epoch": 4.118662613981763, + "grad_norm": 0.0760401841307999, + "learning_rate": 4.776330849501974e-06, + "loss": 0.4808, + "step": 8468 + }, + { + "epoch": 4.1191489361702125, + "grad_norm": 0.07238702981342365, + "learning_rate": 4.775375640627555e-06, + "loss": 0.5182, + "step": 8469 + }, + { + "epoch": 4.119635258358662, + "grad_norm": 0.07388456835847891, + "learning_rate": 4.77442043996775e-06, + "loss": 0.4877, + "step": 8470 + }, + { + "epoch": 4.120121580547113, + "grad_norm": 0.07236446820570626, + "learning_rate": 4.773465247557494e-06, + "loss": 0.5089, + "step": 8471 + }, + { + "epoch": 4.120607902735562, + "grad_norm": 0.0742892714944287, + "learning_rate": 4.772510063431716e-06, + "loss": 0.5402, + "step": 8472 + }, + { + "epoch": 4.121094224924012, + "grad_norm": 0.07565318559592946, + "learning_rate": 4.771554887625348e-06, + "loss": 0.5211, + "step": 8473 + }, + { + "epoch": 4.121580547112462, + "grad_norm": 0.0751694433449456, + "learning_rate": 4.770599720173321e-06, + "loss": 0.5369, + "step": 8474 + }, + { + "epoch": 4.1220668693009115, + "grad_norm": 0.07160251049156026, + "learning_rate": 4.769644561110569e-06, + "loss": 0.4756, + "step": 8475 + }, + { + "epoch": 4.122553191489362, + "grad_norm": 0.07554071580412855, + "learning_rate": 4.768689410472018e-06, + "loss": 0.5269, + "step": 8476 + }, + { + "epoch": 4.123039513677812, + "grad_norm": 0.07538997699840858, + "learning_rate": 4.767734268292602e-06, + "loss": 0.4852, + "step": 8477 + }, + { + "epoch": 4.123525835866261, + "grad_norm": 0.07561541940040853, + "learning_rate": 4.766779134607247e-06, + "loss": 0.5174, + "step": 8478 + }, + { + "epoch": 4.124012158054711, + "grad_norm": 0.07414870272579187, + "learning_rate": 4.765824009450887e-06, + "loss": 0.4668, + "step": 8479 + }, + { + "epoch": 4.124498480243161, + "grad_norm": 0.07277545413423886, + "learning_rate": 4.764868892858447e-06, + "loss": 0.4995, + "step": 8480 + }, + { + "epoch": 4.124984802431611, + "grad_norm": 0.07313710546007884, + "learning_rate": 4.7639137848648616e-06, + "loss": 0.5396, + "step": 8481 + }, + { + "epoch": 4.125471124620061, + "grad_norm": 0.07784819306949427, + "learning_rate": 4.7629586855050535e-06, + "loss": 0.5573, + "step": 8482 + }, + { + "epoch": 4.125957446808511, + "grad_norm": 0.07469256148305614, + "learning_rate": 4.762003594813955e-06, + "loss": 0.508, + "step": 8483 + }, + { + "epoch": 4.12644376899696, + "grad_norm": 0.07676605050562277, + "learning_rate": 4.761048512826493e-06, + "loss": 0.5045, + "step": 8484 + }, + { + "epoch": 4.12693009118541, + "grad_norm": 0.07463859453319577, + "learning_rate": 4.760093439577597e-06, + "loss": 0.4884, + "step": 8485 + }, + { + "epoch": 4.12741641337386, + "grad_norm": 0.07271760362583618, + "learning_rate": 4.759138375102191e-06, + "loss": 0.4833, + "step": 8486 + }, + { + "epoch": 4.12790273556231, + "grad_norm": 0.07436945294964803, + "learning_rate": 4.7581833194352044e-06, + "loss": 0.5313, + "step": 8487 + }, + { + "epoch": 4.12838905775076, + "grad_norm": 0.07424254588825265, + "learning_rate": 4.757228272611563e-06, + "loss": 0.5007, + "step": 8488 + }, + { + "epoch": 4.12887537993921, + "grad_norm": 0.07725554441577545, + "learning_rate": 4.756273234666196e-06, + "loss": 0.523, + "step": 8489 + }, + { + "epoch": 4.129361702127659, + "grad_norm": 0.07181531891643472, + "learning_rate": 4.755318205634026e-06, + "loss": 0.4995, + "step": 8490 + }, + { + "epoch": 4.129848024316109, + "grad_norm": 0.07713948845313415, + "learning_rate": 4.754363185549982e-06, + "loss": 0.5479, + "step": 8491 + }, + { + "epoch": 4.13033434650456, + "grad_norm": 0.07510118932034981, + "learning_rate": 4.753408174448986e-06, + "loss": 0.483, + "step": 8492 + }, + { + "epoch": 4.130820668693009, + "grad_norm": 0.07445380695687027, + "learning_rate": 4.752453172365966e-06, + "loss": 0.5193, + "step": 8493 + }, + { + "epoch": 4.131306990881459, + "grad_norm": 0.07161513378779802, + "learning_rate": 4.751498179335845e-06, + "loss": 0.5021, + "step": 8494 + }, + { + "epoch": 4.131793313069909, + "grad_norm": 0.078423799652204, + "learning_rate": 4.750543195393551e-06, + "loss": 0.5504, + "step": 8495 + }, + { + "epoch": 4.132279635258358, + "grad_norm": 0.07435634578442701, + "learning_rate": 4.749588220574003e-06, + "loss": 0.5083, + "step": 8496 + }, + { + "epoch": 4.132765957446808, + "grad_norm": 0.07032029957204687, + "learning_rate": 4.748633254912128e-06, + "loss": 0.483, + "step": 8497 + }, + { + "epoch": 4.133252279635259, + "grad_norm": 0.07547310162210427, + "learning_rate": 4.747678298442849e-06, + "loss": 0.5313, + "step": 8498 + }, + { + "epoch": 4.133738601823708, + "grad_norm": 0.07476563714440676, + "learning_rate": 4.746723351201089e-06, + "loss": 0.5251, + "step": 8499 + }, + { + "epoch": 4.134224924012158, + "grad_norm": 0.07530646207841561, + "learning_rate": 4.745768413221774e-06, + "loss": 0.4939, + "step": 8500 + }, + { + "epoch": 4.134711246200608, + "grad_norm": 0.07281349082892166, + "learning_rate": 4.74481348453982e-06, + "loss": 0.5081, + "step": 8501 + }, + { + "epoch": 4.135197568389057, + "grad_norm": 0.07535371882766576, + "learning_rate": 4.7438585651901555e-06, + "loss": 0.502, + "step": 8502 + }, + { + "epoch": 4.135683890577508, + "grad_norm": 0.07629967520929797, + "learning_rate": 4.742903655207698e-06, + "loss": 0.5179, + "step": 8503 + }, + { + "epoch": 4.136170212765958, + "grad_norm": 0.07330226439801589, + "learning_rate": 4.741948754627372e-06, + "loss": 0.5393, + "step": 8504 + }, + { + "epoch": 4.136656534954407, + "grad_norm": 0.07745839454052919, + "learning_rate": 4.740993863484095e-06, + "loss": 0.5384, + "step": 8505 + }, + { + "epoch": 4.137142857142857, + "grad_norm": 0.07630889573110095, + "learning_rate": 4.740038981812793e-06, + "loss": 0.5351, + "step": 8506 + }, + { + "epoch": 4.137629179331307, + "grad_norm": 0.07542290662599023, + "learning_rate": 4.739084109648382e-06, + "loss": 0.5372, + "step": 8507 + }, + { + "epoch": 4.138115501519757, + "grad_norm": 0.0726327719816919, + "learning_rate": 4.738129247025783e-06, + "loss": 0.4963, + "step": 8508 + }, + { + "epoch": 4.138601823708207, + "grad_norm": 0.07515230661333981, + "learning_rate": 4.737174393979916e-06, + "loss": 0.5159, + "step": 8509 + }, + { + "epoch": 4.139088145896657, + "grad_norm": 0.07593225769332512, + "learning_rate": 4.736219550545704e-06, + "loss": 0.5067, + "step": 8510 + }, + { + "epoch": 4.139574468085106, + "grad_norm": 0.0747183198223034, + "learning_rate": 4.7352647167580595e-06, + "loss": 0.5113, + "step": 8511 + }, + { + "epoch": 4.140060790273556, + "grad_norm": 0.0727444087033036, + "learning_rate": 4.734309892651907e-06, + "loss": 0.4833, + "step": 8512 + }, + { + "epoch": 4.140547112462006, + "grad_norm": 0.07356653661906826, + "learning_rate": 4.733355078262159e-06, + "loss": 0.5049, + "step": 8513 + }, + { + "epoch": 4.141033434650456, + "grad_norm": 0.07472178029173547, + "learning_rate": 4.732400273623741e-06, + "loss": 0.5262, + "step": 8514 + }, + { + "epoch": 4.141519756838906, + "grad_norm": 0.07460977023180501, + "learning_rate": 4.731445478771564e-06, + "loss": 0.504, + "step": 8515 + }, + { + "epoch": 4.142006079027356, + "grad_norm": 0.075974412267872, + "learning_rate": 4.730490693740551e-06, + "loss": 0.55, + "step": 8516 + }, + { + "epoch": 4.142492401215805, + "grad_norm": 0.07420864525004155, + "learning_rate": 4.729535918565612e-06, + "loss": 0.4944, + "step": 8517 + }, + { + "epoch": 4.142978723404255, + "grad_norm": 0.07515982342346939, + "learning_rate": 4.728581153281669e-06, + "loss": 0.5232, + "step": 8518 + }, + { + "epoch": 4.143465045592706, + "grad_norm": 0.0755553574457653, + "learning_rate": 4.7276263979236354e-06, + "loss": 0.5073, + "step": 8519 + }, + { + "epoch": 4.143951367781155, + "grad_norm": 0.07368947400406485, + "learning_rate": 4.72667165252643e-06, + "loss": 0.517, + "step": 8520 + }, + { + "epoch": 4.144437689969605, + "grad_norm": 0.07707189428488803, + "learning_rate": 4.725716917124965e-06, + "loss": 0.543, + "step": 8521 + }, + { + "epoch": 4.144924012158055, + "grad_norm": 0.0727757337017132, + "learning_rate": 4.724762191754157e-06, + "loss": 0.52, + "step": 8522 + }, + { + "epoch": 4.145410334346504, + "grad_norm": 0.074262894151074, + "learning_rate": 4.7238074764489215e-06, + "loss": 0.5228, + "step": 8523 + }, + { + "epoch": 4.145896656534954, + "grad_norm": 0.0737422218344668, + "learning_rate": 4.722852771244171e-06, + "loss": 0.5122, + "step": 8524 + }, + { + "epoch": 4.1463829787234046, + "grad_norm": 0.07511797028927264, + "learning_rate": 4.721898076174822e-06, + "loss": 0.5274, + "step": 8525 + }, + { + "epoch": 4.146869300911854, + "grad_norm": 0.07481646911294902, + "learning_rate": 4.720943391275786e-06, + "loss": 0.5212, + "step": 8526 + }, + { + "epoch": 4.147355623100304, + "grad_norm": 0.07258703612897424, + "learning_rate": 4.719988716581977e-06, + "loss": 0.4877, + "step": 8527 + }, + { + "epoch": 4.147841945288754, + "grad_norm": 0.0737324916919453, + "learning_rate": 4.719034052128307e-06, + "loss": 0.5307, + "step": 8528 + }, + { + "epoch": 4.148328267477203, + "grad_norm": 0.07371562653864114, + "learning_rate": 4.718079397949691e-06, + "loss": 0.5297, + "step": 8529 + }, + { + "epoch": 4.148814589665654, + "grad_norm": 0.07302499564175673, + "learning_rate": 4.717124754081038e-06, + "loss": 0.4943, + "step": 8530 + }, + { + "epoch": 4.1493009118541035, + "grad_norm": 0.08540931841216334, + "learning_rate": 4.716170120557264e-06, + "loss": 0.5181, + "step": 8531 + }, + { + "epoch": 4.149787234042553, + "grad_norm": 0.0810716942002274, + "learning_rate": 4.715215497413275e-06, + "loss": 0.6227, + "step": 8532 + }, + { + "epoch": 4.150273556231003, + "grad_norm": 0.07473611648709737, + "learning_rate": 4.714260884683985e-06, + "loss": 0.5305, + "step": 8533 + }, + { + "epoch": 4.150759878419453, + "grad_norm": 0.07410754546538102, + "learning_rate": 4.713306282404303e-06, + "loss": 0.5072, + "step": 8534 + }, + { + "epoch": 4.151246200607902, + "grad_norm": 0.07339648917271148, + "learning_rate": 4.712351690609144e-06, + "loss": 0.496, + "step": 8535 + }, + { + "epoch": 4.151732522796353, + "grad_norm": 0.07440001898343732, + "learning_rate": 4.7113971093334115e-06, + "loss": 0.5297, + "step": 8536 + }, + { + "epoch": 4.1522188449848025, + "grad_norm": 0.07202434766563388, + "learning_rate": 4.710442538612019e-06, + "loss": 0.4916, + "step": 8537 + }, + { + "epoch": 4.152705167173252, + "grad_norm": 0.07403508750837864, + "learning_rate": 4.709487978479873e-06, + "loss": 0.5012, + "step": 8538 + }, + { + "epoch": 4.153191489361702, + "grad_norm": 0.07535790937446205, + "learning_rate": 4.708533428971886e-06, + "loss": 0.4904, + "step": 8539 + }, + { + "epoch": 4.153677811550152, + "grad_norm": 0.07373613606725883, + "learning_rate": 4.707578890122962e-06, + "loss": 0.5074, + "step": 8540 + }, + { + "epoch": 4.154164133738602, + "grad_norm": 0.07349977027162974, + "learning_rate": 4.706624361968013e-06, + "loss": 0.5182, + "step": 8541 + }, + { + "epoch": 4.154650455927052, + "grad_norm": 0.0729252228155845, + "learning_rate": 4.705669844541942e-06, + "loss": 0.5277, + "step": 8542 + }, + { + "epoch": 4.1551367781155015, + "grad_norm": 0.07633788585071823, + "learning_rate": 4.70471533787966e-06, + "loss": 0.5298, + "step": 8543 + }, + { + "epoch": 4.155623100303951, + "grad_norm": 0.07596510423713741, + "learning_rate": 4.7037608420160706e-06, + "loss": 0.4977, + "step": 8544 + }, + { + "epoch": 4.156109422492401, + "grad_norm": 0.07709053723604035, + "learning_rate": 4.7028063569860834e-06, + "loss": 0.5192, + "step": 8545 + }, + { + "epoch": 4.1565957446808515, + "grad_norm": 0.07294504698514606, + "learning_rate": 4.701851882824602e-06, + "loss": 0.5213, + "step": 8546 + }, + { + "epoch": 4.157082066869301, + "grad_norm": 0.0747635006898603, + "learning_rate": 4.700897419566533e-06, + "loss": 0.5091, + "step": 8547 + }, + { + "epoch": 4.157568389057751, + "grad_norm": 0.07572329051297934, + "learning_rate": 4.69994296724678e-06, + "loss": 0.5038, + "step": 8548 + }, + { + "epoch": 4.1580547112462005, + "grad_norm": 0.07659586356667382, + "learning_rate": 4.6989885259002495e-06, + "loss": 0.4816, + "step": 8549 + }, + { + "epoch": 4.15854103343465, + "grad_norm": 0.07192192965669973, + "learning_rate": 4.698034095561847e-06, + "loss": 0.4949, + "step": 8550 + }, + { + "epoch": 4.1590273556231, + "grad_norm": 0.07316210150314204, + "learning_rate": 4.697079676266473e-06, + "loss": 0.5311, + "step": 8551 + }, + { + "epoch": 4.1595136778115505, + "grad_norm": 0.07450112052090677, + "learning_rate": 4.696125268049034e-06, + "loss": 0.5271, + "step": 8552 + }, + { + "epoch": 4.16, + "grad_norm": 0.0758047351351251, + "learning_rate": 4.695170870944431e-06, + "loss": 0.5265, + "step": 8553 + }, + { + "epoch": 4.16048632218845, + "grad_norm": 0.07634671296173454, + "learning_rate": 4.69421648498757e-06, + "loss": 0.5402, + "step": 8554 + }, + { + "epoch": 4.1609726443768995, + "grad_norm": 0.07641618843784993, + "learning_rate": 4.6932621102133486e-06, + "loss": 0.5188, + "step": 8555 + }, + { + "epoch": 4.161458966565349, + "grad_norm": 0.07579075965807226, + "learning_rate": 4.692307746656673e-06, + "loss": 0.5305, + "step": 8556 + }, + { + "epoch": 4.1619452887538, + "grad_norm": 0.0756996815933246, + "learning_rate": 4.691353394352442e-06, + "loss": 0.5276, + "step": 8557 + }, + { + "epoch": 4.1624316109422494, + "grad_norm": 0.07567878484857823, + "learning_rate": 4.690399053335557e-06, + "loss": 0.4895, + "step": 8558 + }, + { + "epoch": 4.162917933130699, + "grad_norm": 0.0801301112291178, + "learning_rate": 4.689444723640919e-06, + "loss": 0.514, + "step": 8559 + }, + { + "epoch": 4.163404255319149, + "grad_norm": 0.07705269609616539, + "learning_rate": 4.688490405303431e-06, + "loss": 0.5234, + "step": 8560 + }, + { + "epoch": 4.1638905775075985, + "grad_norm": 0.07284394538072295, + "learning_rate": 4.687536098357988e-06, + "loss": 0.4976, + "step": 8561 + }, + { + "epoch": 4.164376899696048, + "grad_norm": 0.07661810538307577, + "learning_rate": 4.686581802839493e-06, + "loss": 0.5272, + "step": 8562 + }, + { + "epoch": 4.164863221884499, + "grad_norm": 0.07472252280571419, + "learning_rate": 4.685627518782843e-06, + "loss": 0.5017, + "step": 8563 + }, + { + "epoch": 4.165349544072948, + "grad_norm": 0.07381353930310107, + "learning_rate": 4.684673246222939e-06, + "loss": 0.4664, + "step": 8564 + }, + { + "epoch": 4.165835866261398, + "grad_norm": 0.07914523833047131, + "learning_rate": 4.683718985194676e-06, + "loss": 0.5235, + "step": 8565 + }, + { + "epoch": 4.166322188449848, + "grad_norm": 0.07672838283438023, + "learning_rate": 4.682764735732954e-06, + "loss": 0.5326, + "step": 8566 + }, + { + "epoch": 4.1668085106382975, + "grad_norm": 0.07437841536997172, + "learning_rate": 4.6818104978726685e-06, + "loss": 0.4994, + "step": 8567 + }, + { + "epoch": 4.167294832826748, + "grad_norm": 0.07621161484387089, + "learning_rate": 4.68085627164872e-06, + "loss": 0.4885, + "step": 8568 + }, + { + "epoch": 4.167781155015198, + "grad_norm": 0.07621089522380564, + "learning_rate": 4.679902057096001e-06, + "loss": 0.5465, + "step": 8569 + }, + { + "epoch": 4.168267477203647, + "grad_norm": 0.07666603167624715, + "learning_rate": 4.678947854249412e-06, + "loss": 0.4928, + "step": 8570 + }, + { + "epoch": 4.168753799392097, + "grad_norm": 0.0799760010047343, + "learning_rate": 4.677993663143842e-06, + "loss": 0.5241, + "step": 8571 + }, + { + "epoch": 4.169240121580547, + "grad_norm": 0.07601145026836639, + "learning_rate": 4.677039483814192e-06, + "loss": 0.5095, + "step": 8572 + }, + { + "epoch": 4.169726443768997, + "grad_norm": 0.07319976740421791, + "learning_rate": 4.676085316295353e-06, + "loss": 0.5138, + "step": 8573 + }, + { + "epoch": 4.170212765957447, + "grad_norm": 0.07698212498465612, + "learning_rate": 4.675131160622224e-06, + "loss": 0.5646, + "step": 8574 + }, + { + "epoch": 4.170699088145897, + "grad_norm": 0.07341667373466226, + "learning_rate": 4.674177016829694e-06, + "loss": 0.4891, + "step": 8575 + }, + { + "epoch": 4.171185410334346, + "grad_norm": 0.07533218591699581, + "learning_rate": 4.673222884952659e-06, + "loss": 0.5116, + "step": 8576 + }, + { + "epoch": 4.171671732522796, + "grad_norm": 0.08239102919120085, + "learning_rate": 4.672268765026011e-06, + "loss": 0.4947, + "step": 8577 + }, + { + "epoch": 4.172158054711246, + "grad_norm": 0.07472168464902597, + "learning_rate": 4.671314657084644e-06, + "loss": 0.5217, + "step": 8578 + }, + { + "epoch": 4.172644376899696, + "grad_norm": 0.07329272817616501, + "learning_rate": 4.67036056116345e-06, + "loss": 0.4965, + "step": 8579 + }, + { + "epoch": 4.173130699088146, + "grad_norm": 0.07489677119824412, + "learning_rate": 4.669406477297319e-06, + "loss": 0.5244, + "step": 8580 + }, + { + "epoch": 4.173617021276596, + "grad_norm": 0.07301657628825546, + "learning_rate": 4.668452405521143e-06, + "loss": 0.4933, + "step": 8581 + }, + { + "epoch": 4.174103343465045, + "grad_norm": 0.07529606263515964, + "learning_rate": 4.667498345869813e-06, + "loss": 0.5154, + "step": 8582 + }, + { + "epoch": 4.174589665653495, + "grad_norm": 0.07614329140654127, + "learning_rate": 4.666544298378222e-06, + "loss": 0.5293, + "step": 8583 + }, + { + "epoch": 4.175075987841946, + "grad_norm": 0.07618148460829599, + "learning_rate": 4.665590263081255e-06, + "loss": 0.5267, + "step": 8584 + }, + { + "epoch": 4.175562310030395, + "grad_norm": 0.07813183655327673, + "learning_rate": 4.664636240013805e-06, + "loss": 0.5503, + "step": 8585 + }, + { + "epoch": 4.176048632218845, + "grad_norm": 0.07654839960026084, + "learning_rate": 4.66368222921076e-06, + "loss": 0.5229, + "step": 8586 + }, + { + "epoch": 4.176534954407295, + "grad_norm": 0.07231874862337323, + "learning_rate": 4.662728230707008e-06, + "loss": 0.4937, + "step": 8587 + }, + { + "epoch": 4.177021276595744, + "grad_norm": 0.07439836542775184, + "learning_rate": 4.661774244537438e-06, + "loss": 0.5114, + "step": 8588 + }, + { + "epoch": 4.177507598784194, + "grad_norm": 0.07676921242630429, + "learning_rate": 4.660820270736939e-06, + "loss": 0.5544, + "step": 8589 + }, + { + "epoch": 4.177993920972645, + "grad_norm": 0.07605215328553559, + "learning_rate": 4.659866309340395e-06, + "loss": 0.5112, + "step": 8590 + }, + { + "epoch": 4.178480243161094, + "grad_norm": 0.07774803695600067, + "learning_rate": 4.658912360382695e-06, + "loss": 0.5232, + "step": 8591 + }, + { + "epoch": 4.178966565349544, + "grad_norm": 0.07554711015557045, + "learning_rate": 4.657958423898725e-06, + "loss": 0.5176, + "step": 8592 + }, + { + "epoch": 4.179452887537994, + "grad_norm": 0.07465462389494179, + "learning_rate": 4.657004499923372e-06, + "loss": 0.5186, + "step": 8593 + }, + { + "epoch": 4.179939209726443, + "grad_norm": 0.07392814739112226, + "learning_rate": 4.656050588491519e-06, + "loss": 0.4966, + "step": 8594 + }, + { + "epoch": 4.180425531914894, + "grad_norm": 0.07615430034639238, + "learning_rate": 4.655096689638054e-06, + "loss": 0.5308, + "step": 8595 + }, + { + "epoch": 4.180911854103344, + "grad_norm": 0.07523723262167048, + "learning_rate": 4.654142803397857e-06, + "loss": 0.5111, + "step": 8596 + }, + { + "epoch": 4.181398176291793, + "grad_norm": 0.07937715225630966, + "learning_rate": 4.653188929805816e-06, + "loss": 0.5421, + "step": 8597 + }, + { + "epoch": 4.181884498480243, + "grad_norm": 0.07172274786130603, + "learning_rate": 4.652235068896813e-06, + "loss": 0.4731, + "step": 8598 + }, + { + "epoch": 4.182370820668693, + "grad_norm": 0.07612235852400556, + "learning_rate": 4.651281220705733e-06, + "loss": 0.5438, + "step": 8599 + }, + { + "epoch": 4.182857142857143, + "grad_norm": 0.07617463385113146, + "learning_rate": 4.650327385267456e-06, + "loss": 0.4968, + "step": 8600 + }, + { + "epoch": 4.183343465045593, + "grad_norm": 0.07172675714949191, + "learning_rate": 4.649373562616865e-06, + "loss": 0.4865, + "step": 8601 + }, + { + "epoch": 4.183829787234043, + "grad_norm": 0.07423280489502632, + "learning_rate": 4.648419752788843e-06, + "loss": 0.5256, + "step": 8602 + }, + { + "epoch": 4.184316109422492, + "grad_norm": 0.07411568394547702, + "learning_rate": 4.647465955818269e-06, + "loss": 0.4976, + "step": 8603 + }, + { + "epoch": 4.184802431610942, + "grad_norm": 0.07847448634992887, + "learning_rate": 4.646512171740028e-06, + "loss": 0.5421, + "step": 8604 + }, + { + "epoch": 4.185288753799392, + "grad_norm": 0.07112772827262814, + "learning_rate": 4.6455584005889944e-06, + "loss": 0.4735, + "step": 8605 + }, + { + "epoch": 4.185775075987842, + "grad_norm": 0.07307619662880278, + "learning_rate": 4.644604642400053e-06, + "loss": 0.5223, + "step": 8606 + }, + { + "epoch": 4.186261398176292, + "grad_norm": 0.07167307445154907, + "learning_rate": 4.64365089720808e-06, + "loss": 0.5249, + "step": 8607 + }, + { + "epoch": 4.186747720364742, + "grad_norm": 0.07590288612844606, + "learning_rate": 4.6426971650479575e-06, + "loss": 0.5363, + "step": 8608 + }, + { + "epoch": 4.187234042553191, + "grad_norm": 0.08237896067364611, + "learning_rate": 4.64174344595456e-06, + "loss": 0.5004, + "step": 8609 + }, + { + "epoch": 4.187720364741641, + "grad_norm": 0.0733909118603107, + "learning_rate": 4.64078973996277e-06, + "loss": 0.5028, + "step": 8610 + }, + { + "epoch": 4.188206686930092, + "grad_norm": 0.07239892222658986, + "learning_rate": 4.63983604710746e-06, + "loss": 0.5034, + "step": 8611 + }, + { + "epoch": 4.188693009118541, + "grad_norm": 0.07719001751921614, + "learning_rate": 4.63888236742351e-06, + "loss": 0.5535, + "step": 8612 + }, + { + "epoch": 4.189179331306991, + "grad_norm": 0.07688731979910943, + "learning_rate": 4.637928700945795e-06, + "loss": 0.5995, + "step": 8613 + }, + { + "epoch": 4.189665653495441, + "grad_norm": 0.07393772045710352, + "learning_rate": 4.636975047709195e-06, + "loss": 0.4729, + "step": 8614 + }, + { + "epoch": 4.19015197568389, + "grad_norm": 0.07393831761603349, + "learning_rate": 4.6360214077485785e-06, + "loss": 0.5213, + "step": 8615 + }, + { + "epoch": 4.19063829787234, + "grad_norm": 0.07389979504158992, + "learning_rate": 4.635067781098827e-06, + "loss": 0.5048, + "step": 8616 + }, + { + "epoch": 4.191124620060791, + "grad_norm": 0.07357789263361195, + "learning_rate": 4.634114167794811e-06, + "loss": 0.4757, + "step": 8617 + }, + { + "epoch": 4.19161094224924, + "grad_norm": 0.07837721310275758, + "learning_rate": 4.633160567871408e-06, + "loss": 0.5543, + "step": 8618 + }, + { + "epoch": 4.19209726443769, + "grad_norm": 0.07691414190321177, + "learning_rate": 4.632206981363488e-06, + "loss": 0.5221, + "step": 8619 + }, + { + "epoch": 4.19258358662614, + "grad_norm": 0.07204873070183852, + "learning_rate": 4.631253408305927e-06, + "loss": 0.4894, + "step": 8620 + }, + { + "epoch": 4.193069908814589, + "grad_norm": 0.0744755189278179, + "learning_rate": 4.630299848733595e-06, + "loss": 0.5263, + "step": 8621 + }, + { + "epoch": 4.19355623100304, + "grad_norm": 0.0733820070362056, + "learning_rate": 4.629346302681367e-06, + "loss": 0.5058, + "step": 8622 + }, + { + "epoch": 4.1940425531914896, + "grad_norm": 0.07433033885422978, + "learning_rate": 4.628392770184112e-06, + "loss": 0.515, + "step": 8623 + }, + { + "epoch": 4.194528875379939, + "grad_norm": 0.0733964033081725, + "learning_rate": 4.627439251276704e-06, + "loss": 0.4928, + "step": 8624 + }, + { + "epoch": 4.195015197568389, + "grad_norm": 0.07711271776796749, + "learning_rate": 4.626485745994009e-06, + "loss": 0.5265, + "step": 8625 + }, + { + "epoch": 4.195501519756839, + "grad_norm": 0.07271916532576288, + "learning_rate": 4.6255322543709025e-06, + "loss": 0.5148, + "step": 8626 + }, + { + "epoch": 4.195987841945289, + "grad_norm": 0.07990990018849148, + "learning_rate": 4.624578776442249e-06, + "loss": 0.5292, + "step": 8627 + }, + { + "epoch": 4.196474164133739, + "grad_norm": 0.0732324168012335, + "learning_rate": 4.623625312242922e-06, + "loss": 0.4914, + "step": 8628 + }, + { + "epoch": 4.1969604863221885, + "grad_norm": 0.0744643730181638, + "learning_rate": 4.622671861807788e-06, + "loss": 0.5059, + "step": 8629 + }, + { + "epoch": 4.197446808510638, + "grad_norm": 0.07275270708369055, + "learning_rate": 4.621718425171716e-06, + "loss": 0.5145, + "step": 8630 + }, + { + "epoch": 4.197933130699088, + "grad_norm": 0.07414208509829934, + "learning_rate": 4.620765002369573e-06, + "loss": 0.5336, + "step": 8631 + }, + { + "epoch": 4.198419452887538, + "grad_norm": 0.07464983337027688, + "learning_rate": 4.619811593436224e-06, + "loss": 0.5239, + "step": 8632 + }, + { + "epoch": 4.198905775075988, + "grad_norm": 0.07314849326706202, + "learning_rate": 4.618858198406541e-06, + "loss": 0.4961, + "step": 8633 + }, + { + "epoch": 4.199392097264438, + "grad_norm": 0.08339314949028541, + "learning_rate": 4.6179048173153845e-06, + "loss": 0.6021, + "step": 8634 + }, + { + "epoch": 4.1998784194528875, + "grad_norm": 0.07671418208163941, + "learning_rate": 4.616951450197624e-06, + "loss": 0.5165, + "step": 8635 + }, + { + "epoch": 4.200364741641337, + "grad_norm": 0.0763546290088519, + "learning_rate": 4.6159980970881225e-06, + "loss": 0.5216, + "step": 8636 + }, + { + "epoch": 4.200851063829787, + "grad_norm": 0.07595293413864083, + "learning_rate": 4.615044758021745e-06, + "loss": 0.5361, + "step": 8637 + }, + { + "epoch": 4.2013373860182375, + "grad_norm": 0.07565301608467517, + "learning_rate": 4.614091433033354e-06, + "loss": 0.5397, + "step": 8638 + }, + { + "epoch": 4.201823708206687, + "grad_norm": 0.07574818133900997, + "learning_rate": 4.613138122157817e-06, + "loss": 0.5028, + "step": 8639 + }, + { + "epoch": 4.202310030395137, + "grad_norm": 0.07393028628759328, + "learning_rate": 4.612184825429994e-06, + "loss": 0.4898, + "step": 8640 + }, + { + "epoch": 4.2027963525835865, + "grad_norm": 0.07531791085120113, + "learning_rate": 4.611231542884747e-06, + "loss": 0.5237, + "step": 8641 + }, + { + "epoch": 4.203282674772036, + "grad_norm": 0.07780320279590346, + "learning_rate": 4.61027827455694e-06, + "loss": 0.5362, + "step": 8642 + }, + { + "epoch": 4.203768996960486, + "grad_norm": 0.07651307564738992, + "learning_rate": 4.609325020481435e-06, + "loss": 0.4989, + "step": 8643 + }, + { + "epoch": 4.2042553191489365, + "grad_norm": 0.07050200957379826, + "learning_rate": 4.6083717806930884e-06, + "loss": 0.4672, + "step": 8644 + }, + { + "epoch": 4.204741641337386, + "grad_norm": 0.07406671092151201, + "learning_rate": 4.607418555226766e-06, + "loss": 0.5103, + "step": 8645 + }, + { + "epoch": 4.205227963525836, + "grad_norm": 0.07501517962427871, + "learning_rate": 4.606465344117324e-06, + "loss": 0.5056, + "step": 8646 + }, + { + "epoch": 4.2057142857142855, + "grad_norm": 0.076291861748952, + "learning_rate": 4.6055121473996245e-06, + "loss": 0.521, + "step": 8647 + }, + { + "epoch": 4.206200607902735, + "grad_norm": 0.07742683853975692, + "learning_rate": 4.604558965108524e-06, + "loss": 0.5258, + "step": 8648 + }, + { + "epoch": 4.206686930091186, + "grad_norm": 0.07574776110495535, + "learning_rate": 4.603605797278883e-06, + "loss": 0.4965, + "step": 8649 + }, + { + "epoch": 4.2071732522796355, + "grad_norm": 0.074181891208974, + "learning_rate": 4.602652643945557e-06, + "loss": 0.5075, + "step": 8650 + }, + { + "epoch": 4.207659574468085, + "grad_norm": 0.07358544889370713, + "learning_rate": 4.601699505143404e-06, + "loss": 0.5043, + "step": 8651 + }, + { + "epoch": 4.208145896656535, + "grad_norm": 0.07604478950482464, + "learning_rate": 4.6007463809072815e-06, + "loss": 0.5101, + "step": 8652 + }, + { + "epoch": 4.2086322188449845, + "grad_norm": 0.07641742178833977, + "learning_rate": 4.5997932712720435e-06, + "loss": 0.4681, + "step": 8653 + }, + { + "epoch": 4.209118541033435, + "grad_norm": 0.07259836123755714, + "learning_rate": 4.598840176272551e-06, + "loss": 0.4938, + "step": 8654 + }, + { + "epoch": 4.209604863221885, + "grad_norm": 0.07715505768697828, + "learning_rate": 4.597887095943653e-06, + "loss": 0.5136, + "step": 8655 + }, + { + "epoch": 4.2100911854103344, + "grad_norm": 0.07646212567090423, + "learning_rate": 4.596934030320207e-06, + "loss": 0.5383, + "step": 8656 + }, + { + "epoch": 4.210577507598784, + "grad_norm": 0.07363603651140135, + "learning_rate": 4.595980979437067e-06, + "loss": 0.522, + "step": 8657 + }, + { + "epoch": 4.211063829787234, + "grad_norm": 0.07864830286149185, + "learning_rate": 4.595027943329087e-06, + "loss": 0.5393, + "step": 8658 + }, + { + "epoch": 4.2115501519756835, + "grad_norm": 0.0736310624462123, + "learning_rate": 4.594074922031117e-06, + "loss": 0.5039, + "step": 8659 + }, + { + "epoch": 4.212036474164134, + "grad_norm": 0.07275290155325348, + "learning_rate": 4.593121915578013e-06, + "loss": 0.4986, + "step": 8660 + }, + { + "epoch": 4.212522796352584, + "grad_norm": 0.07979793846762186, + "learning_rate": 4.592168924004624e-06, + "loss": 0.5679, + "step": 8661 + }, + { + "epoch": 4.213009118541033, + "grad_norm": 0.07410067545416402, + "learning_rate": 4.591215947345806e-06, + "loss": 0.5025, + "step": 8662 + }, + { + "epoch": 4.213495440729483, + "grad_norm": 0.07410159786805302, + "learning_rate": 4.590262985636403e-06, + "loss": 0.5363, + "step": 8663 + }, + { + "epoch": 4.213981762917933, + "grad_norm": 0.0736693982797478, + "learning_rate": 4.5893100389112715e-06, + "loss": 0.5159, + "step": 8664 + }, + { + "epoch": 4.214468085106383, + "grad_norm": 0.07565466476913871, + "learning_rate": 4.588357107205256e-06, + "loss": 0.5047, + "step": 8665 + }, + { + "epoch": 4.214954407294833, + "grad_norm": 0.07469850226358561, + "learning_rate": 4.5874041905532096e-06, + "loss": 0.4879, + "step": 8666 + }, + { + "epoch": 4.215440729483283, + "grad_norm": 0.07908504052529744, + "learning_rate": 4.586451288989978e-06, + "loss": 0.5188, + "step": 8667 + }, + { + "epoch": 4.215927051671732, + "grad_norm": 0.0731847070001752, + "learning_rate": 4.585498402550413e-06, + "loss": 0.4949, + "step": 8668 + }, + { + "epoch": 4.216413373860182, + "grad_norm": 0.07733456091230159, + "learning_rate": 4.584545531269357e-06, + "loss": 0.5365, + "step": 8669 + }, + { + "epoch": 4.216899696048632, + "grad_norm": 0.07546241812109102, + "learning_rate": 4.5835926751816626e-06, + "loss": 0.507, + "step": 8670 + }, + { + "epoch": 4.217386018237082, + "grad_norm": 0.07662160343646876, + "learning_rate": 4.58263983432217e-06, + "loss": 0.5178, + "step": 8671 + }, + { + "epoch": 4.217872340425532, + "grad_norm": 0.0763430413315756, + "learning_rate": 4.581687008725731e-06, + "loss": 0.5076, + "step": 8672 + }, + { + "epoch": 4.218358662613982, + "grad_norm": 0.07722427949859416, + "learning_rate": 4.580734198427187e-06, + "loss": 0.5294, + "step": 8673 + }, + { + "epoch": 4.218844984802431, + "grad_norm": 0.07713551194320627, + "learning_rate": 4.579781403461384e-06, + "loss": 0.5154, + "step": 8674 + }, + { + "epoch": 4.219331306990881, + "grad_norm": 0.07591098778000949, + "learning_rate": 4.578828623863165e-06, + "loss": 0.5126, + "step": 8675 + }, + { + "epoch": 4.219817629179332, + "grad_norm": 0.07310728554999228, + "learning_rate": 4.577875859667377e-06, + "loss": 0.4926, + "step": 8676 + }, + { + "epoch": 4.220303951367781, + "grad_norm": 0.07436779151687803, + "learning_rate": 4.576923110908858e-06, + "loss": 0.5004, + "step": 8677 + }, + { + "epoch": 4.220790273556231, + "grad_norm": 0.07281434253732112, + "learning_rate": 4.575970377622456e-06, + "loss": 0.4987, + "step": 8678 + }, + { + "epoch": 4.221276595744681, + "grad_norm": 0.07664552228304802, + "learning_rate": 4.575017659843007e-06, + "loss": 0.5183, + "step": 8679 + }, + { + "epoch": 4.22176291793313, + "grad_norm": 0.072920975667414, + "learning_rate": 4.574064957605356e-06, + "loss": 0.4782, + "step": 8680 + }, + { + "epoch": 4.222249240121581, + "grad_norm": 0.07819163668521517, + "learning_rate": 4.573112270944343e-06, + "loss": 0.538, + "step": 8681 + }, + { + "epoch": 4.222735562310031, + "grad_norm": 0.07683564925643531, + "learning_rate": 4.572159599894808e-06, + "loss": 0.5092, + "step": 8682 + }, + { + "epoch": 4.22322188449848, + "grad_norm": 0.07347147602483026, + "learning_rate": 4.571206944491593e-06, + "loss": 0.472, + "step": 8683 + }, + { + "epoch": 4.22370820668693, + "grad_norm": 0.07718951275473292, + "learning_rate": 4.570254304769532e-06, + "loss": 0.5098, + "step": 8684 + }, + { + "epoch": 4.22419452887538, + "grad_norm": 0.07679857193516876, + "learning_rate": 4.569301680763468e-06, + "loss": 0.5365, + "step": 8685 + }, + { + "epoch": 4.224680851063829, + "grad_norm": 0.07382949547252148, + "learning_rate": 4.568349072508236e-06, + "loss": 0.4642, + "step": 8686 + }, + { + "epoch": 4.22516717325228, + "grad_norm": 0.07938568871975125, + "learning_rate": 4.567396480038677e-06, + "loss": 0.4913, + "step": 8687 + }, + { + "epoch": 4.22565349544073, + "grad_norm": 0.07793437345472343, + "learning_rate": 4.566443903389622e-06, + "loss": 0.5519, + "step": 8688 + }, + { + "epoch": 4.226139817629179, + "grad_norm": 0.07377412301186179, + "learning_rate": 4.565491342595914e-06, + "loss": 0.4945, + "step": 8689 + }, + { + "epoch": 4.226626139817629, + "grad_norm": 0.0776944029922295, + "learning_rate": 4.564538797692382e-06, + "loss": 0.5232, + "step": 8690 + }, + { + "epoch": 4.227112462006079, + "grad_norm": 0.07285783945653469, + "learning_rate": 4.5635862687138645e-06, + "loss": 0.5288, + "step": 8691 + }, + { + "epoch": 4.227598784194529, + "grad_norm": 0.07699902132673209, + "learning_rate": 4.562633755695195e-06, + "loss": 0.5189, + "step": 8692 + }, + { + "epoch": 4.228085106382979, + "grad_norm": 0.07225996419098245, + "learning_rate": 4.56168125867121e-06, + "loss": 0.4827, + "step": 8693 + }, + { + "epoch": 4.228571428571429, + "grad_norm": 0.074962130347158, + "learning_rate": 4.5607287776767386e-06, + "loss": 0.5291, + "step": 8694 + }, + { + "epoch": 4.229057750759878, + "grad_norm": 0.07791887590340825, + "learning_rate": 4.559776312746617e-06, + "loss": 0.5312, + "step": 8695 + }, + { + "epoch": 4.229544072948328, + "grad_norm": 0.07473625643536515, + "learning_rate": 4.558823863915673e-06, + "loss": 0.499, + "step": 8696 + }, + { + "epoch": 4.230030395136778, + "grad_norm": 0.07681095331078629, + "learning_rate": 4.557871431218744e-06, + "loss": 0.5001, + "step": 8697 + }, + { + "epoch": 4.230516717325228, + "grad_norm": 0.07648674349380324, + "learning_rate": 4.556919014690655e-06, + "loss": 0.5426, + "step": 8698 + }, + { + "epoch": 4.231003039513678, + "grad_norm": 0.0744084001770638, + "learning_rate": 4.55596661436624e-06, + "loss": 0.4957, + "step": 8699 + }, + { + "epoch": 4.231489361702128, + "grad_norm": 0.07402201557692092, + "learning_rate": 4.555014230280327e-06, + "loss": 0.5125, + "step": 8700 + }, + { + "epoch": 4.231975683890577, + "grad_norm": 0.07381443245221855, + "learning_rate": 4.554061862467748e-06, + "loss": 0.4804, + "step": 8701 + }, + { + "epoch": 4.232462006079027, + "grad_norm": 0.07550477766177831, + "learning_rate": 4.553109510963327e-06, + "loss": 0.5306, + "step": 8702 + }, + { + "epoch": 4.232948328267478, + "grad_norm": 0.07152553495268849, + "learning_rate": 4.552157175801896e-06, + "loss": 0.4958, + "step": 8703 + }, + { + "epoch": 4.233434650455927, + "grad_norm": 0.07581792328001027, + "learning_rate": 4.551204857018278e-06, + "loss": 0.5334, + "step": 8704 + }, + { + "epoch": 4.233920972644377, + "grad_norm": 0.0746442794651795, + "learning_rate": 4.550252554647303e-06, + "loss": 0.4968, + "step": 8705 + }, + { + "epoch": 4.234407294832827, + "grad_norm": 0.07383079482696556, + "learning_rate": 4.549300268723798e-06, + "loss": 0.5211, + "step": 8706 + }, + { + "epoch": 4.234893617021276, + "grad_norm": 0.07446737507778915, + "learning_rate": 4.548347999282584e-06, + "loss": 0.5693, + "step": 8707 + }, + { + "epoch": 4.235379939209727, + "grad_norm": 0.07482578397225607, + "learning_rate": 4.547395746358493e-06, + "loss": 0.4895, + "step": 8708 + }, + { + "epoch": 4.235866261398177, + "grad_norm": 0.07475043530154361, + "learning_rate": 4.5464435099863415e-06, + "loss": 0.5035, + "step": 8709 + }, + { + "epoch": 4.236352583586626, + "grad_norm": 0.07532848373433775, + "learning_rate": 4.545491290200959e-06, + "loss": 0.5104, + "step": 8710 + }, + { + "epoch": 4.236838905775076, + "grad_norm": 0.07267218439532801, + "learning_rate": 4.5445390870371656e-06, + "loss": 0.5021, + "step": 8711 + }, + { + "epoch": 4.237325227963526, + "grad_norm": 0.074723096238761, + "learning_rate": 4.543586900529786e-06, + "loss": 0.5081, + "step": 8712 + }, + { + "epoch": 4.237811550151975, + "grad_norm": 0.07799118909340397, + "learning_rate": 4.542634730713639e-06, + "loss": 0.4959, + "step": 8713 + }, + { + "epoch": 4.238297872340426, + "grad_norm": 0.07653889202887335, + "learning_rate": 4.541682577623548e-06, + "loss": 0.5333, + "step": 8714 + }, + { + "epoch": 4.238784194528876, + "grad_norm": 0.07465745108764789, + "learning_rate": 4.540730441294334e-06, + "loss": 0.5114, + "step": 8715 + }, + { + "epoch": 4.239270516717325, + "grad_norm": 0.07339665835885074, + "learning_rate": 4.5397783217608174e-06, + "loss": 0.4885, + "step": 8716 + }, + { + "epoch": 4.239756838905775, + "grad_norm": 0.07407899126115443, + "learning_rate": 4.538826219057815e-06, + "loss": 0.5062, + "step": 8717 + }, + { + "epoch": 4.240243161094225, + "grad_norm": 0.07435973308592166, + "learning_rate": 4.537874133220149e-06, + "loss": 0.4998, + "step": 8718 + }, + { + "epoch": 4.240729483282675, + "grad_norm": 0.07397053380948075, + "learning_rate": 4.536922064282634e-06, + "loss": 0.5403, + "step": 8719 + }, + { + "epoch": 4.241215805471125, + "grad_norm": 0.07490142171157223, + "learning_rate": 4.53597001228009e-06, + "loss": 0.5425, + "step": 8720 + }, + { + "epoch": 4.2417021276595746, + "grad_norm": 0.07471311350176477, + "learning_rate": 4.535017977247334e-06, + "loss": 0.544, + "step": 8721 + }, + { + "epoch": 4.242188449848024, + "grad_norm": 0.0734170697021857, + "learning_rate": 4.534065959219182e-06, + "loss": 0.5189, + "step": 8722 + }, + { + "epoch": 4.242674772036474, + "grad_norm": 0.07742542844479222, + "learning_rate": 4.533113958230449e-06, + "loss": 0.5921, + "step": 8723 + }, + { + "epoch": 4.243161094224924, + "grad_norm": 0.07183809436425274, + "learning_rate": 4.532161974315951e-06, + "loss": 0.498, + "step": 8724 + }, + { + "epoch": 4.243647416413374, + "grad_norm": 0.0731772908572836, + "learning_rate": 4.531210007510501e-06, + "loss": 0.5048, + "step": 8725 + }, + { + "epoch": 4.244133738601824, + "grad_norm": 0.07487899733392256, + "learning_rate": 4.530258057848916e-06, + "loss": 0.5182, + "step": 8726 + }, + { + "epoch": 4.2446200607902735, + "grad_norm": 0.07252965677148214, + "learning_rate": 4.5293061253660056e-06, + "loss": 0.4875, + "step": 8727 + }, + { + "epoch": 4.245106382978723, + "grad_norm": 0.07447330119276449, + "learning_rate": 4.528354210096585e-06, + "loss": 0.4833, + "step": 8728 + }, + { + "epoch": 4.245592705167173, + "grad_norm": 0.07382969610481885, + "learning_rate": 4.527402312075464e-06, + "loss": 0.5043, + "step": 8729 + }, + { + "epoch": 4.2460790273556235, + "grad_norm": 0.07723969282011674, + "learning_rate": 4.526450431337457e-06, + "loss": 0.5433, + "step": 8730 + }, + { + "epoch": 4.246565349544073, + "grad_norm": 0.07191645882261344, + "learning_rate": 4.525498567917371e-06, + "loss": 0.4987, + "step": 8731 + }, + { + "epoch": 4.247051671732523, + "grad_norm": 0.07519499527333512, + "learning_rate": 4.524546721850018e-06, + "loss": 0.5276, + "step": 8732 + }, + { + "epoch": 4.2475379939209725, + "grad_norm": 0.07187429383530079, + "learning_rate": 4.52359489317021e-06, + "loss": 0.4977, + "step": 8733 + }, + { + "epoch": 4.248024316109422, + "grad_norm": 0.07538453184585003, + "learning_rate": 4.5226430819127504e-06, + "loss": 0.5616, + "step": 8734 + }, + { + "epoch": 4.248510638297873, + "grad_norm": 0.07299794767359329, + "learning_rate": 4.521691288112451e-06, + "loss": 0.5074, + "step": 8735 + }, + { + "epoch": 4.2489969604863225, + "grad_norm": 0.07396174481122963, + "learning_rate": 4.5207395118041185e-06, + "loss": 0.5311, + "step": 8736 + }, + { + "epoch": 4.249483282674772, + "grad_norm": 0.07418648976744513, + "learning_rate": 4.519787753022561e-06, + "loss": 0.5154, + "step": 8737 + }, + { + "epoch": 4.249969604863222, + "grad_norm": 0.07324800322254649, + "learning_rate": 4.518836011802582e-06, + "loss": 0.5294, + "step": 8738 + }, + { + "epoch": 4.249969604863222, + "eval_loss": 0.5709888339042664, + "eval_runtime": 104.8517, + "eval_samples_per_second": 289.485, + "eval_steps_per_second": 36.194, + "step": 8738 + }, + { + "epoch": 4.2504559270516715, + "grad_norm": 0.07259134379553166, + "learning_rate": 4.517884288178989e-06, + "loss": 0.5104, + "step": 8739 + }, + { + "epoch": 4.250942249240121, + "grad_norm": 0.07803557469579345, + "learning_rate": 4.516932582186586e-06, + "loss": 0.5334, + "step": 8740 + }, + { + "epoch": 4.251428571428572, + "grad_norm": 0.0725182000730815, + "learning_rate": 4.51598089386018e-06, + "loss": 0.4907, + "step": 8741 + }, + { + "epoch": 4.2519148936170215, + "grad_norm": 0.10968620580813508, + "learning_rate": 4.51502922323457e-06, + "loss": 0.5457, + "step": 8742 + }, + { + "epoch": 4.252401215805471, + "grad_norm": 0.0757296271890457, + "learning_rate": 4.514077570344565e-06, + "loss": 0.5319, + "step": 8743 + }, + { + "epoch": 4.252887537993921, + "grad_norm": 0.07409161370369666, + "learning_rate": 4.5131259352249616e-06, + "loss": 0.5104, + "step": 8744 + }, + { + "epoch": 4.2533738601823705, + "grad_norm": 0.07317896499004671, + "learning_rate": 4.5121743179105635e-06, + "loss": 0.4986, + "step": 8745 + }, + { + "epoch": 4.25386018237082, + "grad_norm": 0.07426250639754535, + "learning_rate": 4.5112227184361726e-06, + "loss": 0.4983, + "step": 8746 + }, + { + "epoch": 4.254346504559271, + "grad_norm": 0.07121806614551542, + "learning_rate": 4.510271136836591e-06, + "loss": 0.51, + "step": 8747 + }, + { + "epoch": 4.2548328267477205, + "grad_norm": 0.07583702226295949, + "learning_rate": 4.509319573146614e-06, + "loss": 0.4834, + "step": 8748 + }, + { + "epoch": 4.25531914893617, + "grad_norm": 0.07193527971193463, + "learning_rate": 4.508368027401044e-06, + "loss": 0.5152, + "step": 8749 + }, + { + "epoch": 4.25580547112462, + "grad_norm": 0.07436967309549494, + "learning_rate": 4.507416499634678e-06, + "loss": 0.5234, + "step": 8750 + }, + { + "epoch": 4.2562917933130695, + "grad_norm": 0.07303227563763393, + "learning_rate": 4.506464989882316e-06, + "loss": 0.5258, + "step": 8751 + }, + { + "epoch": 4.25677811550152, + "grad_norm": 0.07717013075558937, + "learning_rate": 4.505513498178752e-06, + "loss": 0.5335, + "step": 8752 + }, + { + "epoch": 4.25726443768997, + "grad_norm": 0.07454202332370188, + "learning_rate": 4.504562024558785e-06, + "loss": 0.5042, + "step": 8753 + }, + { + "epoch": 4.2577507598784194, + "grad_norm": 0.07467668953758691, + "learning_rate": 4.503610569057208e-06, + "loss": 0.5121, + "step": 8754 + }, + { + "epoch": 4.258237082066869, + "grad_norm": 0.07443851881122236, + "learning_rate": 4.502659131708821e-06, + "loss": 0.5473, + "step": 8755 + }, + { + "epoch": 4.258723404255319, + "grad_norm": 0.07131672641325543, + "learning_rate": 4.501707712548413e-06, + "loss": 0.4888, + "step": 8756 + }, + { + "epoch": 4.259209726443769, + "grad_norm": 0.07779720583889972, + "learning_rate": 4.5007563116107825e-06, + "loss": 0.4812, + "step": 8757 + }, + { + "epoch": 4.259696048632219, + "grad_norm": 0.0723020858048172, + "learning_rate": 4.499804928930719e-06, + "loss": 0.5063, + "step": 8758 + }, + { + "epoch": 4.260182370820669, + "grad_norm": 0.07597115797662181, + "learning_rate": 4.498853564543015e-06, + "loss": 0.5051, + "step": 8759 + }, + { + "epoch": 4.260668693009118, + "grad_norm": 0.0744597841786799, + "learning_rate": 4.497902218482466e-06, + "loss": 0.4974, + "step": 8760 + }, + { + "epoch": 4.261155015197568, + "grad_norm": 0.0748062920763063, + "learning_rate": 4.49695089078386e-06, + "loss": 0.5105, + "step": 8761 + }, + { + "epoch": 4.261641337386019, + "grad_norm": 0.07448387237692716, + "learning_rate": 4.4959995814819904e-06, + "loss": 0.4842, + "step": 8762 + }, + { + "epoch": 4.262127659574468, + "grad_norm": 0.07427814720623255, + "learning_rate": 4.495048290611643e-06, + "loss": 0.5229, + "step": 8763 + }, + { + "epoch": 4.262613981762918, + "grad_norm": 0.07251018282983426, + "learning_rate": 4.494097018207609e-06, + "loss": 0.5038, + "step": 8764 + }, + { + "epoch": 4.263100303951368, + "grad_norm": 0.07484050226843347, + "learning_rate": 4.4931457643046775e-06, + "loss": 0.5277, + "step": 8765 + }, + { + "epoch": 4.263586626139817, + "grad_norm": 0.07465325259856306, + "learning_rate": 4.492194528937637e-06, + "loss": 0.509, + "step": 8766 + }, + { + "epoch": 4.264072948328267, + "grad_norm": 0.07585190924487659, + "learning_rate": 4.491243312141271e-06, + "loss": 0.5262, + "step": 8767 + }, + { + "epoch": 4.264559270516718, + "grad_norm": 0.0731946682333731, + "learning_rate": 4.49029211395037e-06, + "loss": 0.5228, + "step": 8768 + }, + { + "epoch": 4.265045592705167, + "grad_norm": 0.07231915622254274, + "learning_rate": 4.4893409343997165e-06, + "loss": 0.4823, + "step": 8769 + }, + { + "epoch": 4.265531914893617, + "grad_norm": 0.07028292122038556, + "learning_rate": 4.488389773524099e-06, + "loss": 0.5139, + "step": 8770 + }, + { + "epoch": 4.266018237082067, + "grad_norm": 0.07470876903983961, + "learning_rate": 4.487438631358298e-06, + "loss": 0.5336, + "step": 8771 + }, + { + "epoch": 4.266504559270516, + "grad_norm": 0.07280975141157943, + "learning_rate": 4.4864875079371e-06, + "loss": 0.5033, + "step": 8772 + }, + { + "epoch": 4.266990881458966, + "grad_norm": 0.07333147533724117, + "learning_rate": 4.485536403295287e-06, + "loss": 0.521, + "step": 8773 + }, + { + "epoch": 4.267477203647417, + "grad_norm": 0.07511462236629839, + "learning_rate": 4.484585317467642e-06, + "loss": 0.5329, + "step": 8774 + }, + { + "epoch": 4.267963525835866, + "grad_norm": 0.07668881321204545, + "learning_rate": 4.483634250488945e-06, + "loss": 0.5172, + "step": 8775 + }, + { + "epoch": 4.268449848024316, + "grad_norm": 0.07423731055110483, + "learning_rate": 4.482683202393979e-06, + "loss": 0.535, + "step": 8776 + }, + { + "epoch": 4.268936170212766, + "grad_norm": 0.07317403272780562, + "learning_rate": 4.481732173217523e-06, + "loss": 0.5191, + "step": 8777 + }, + { + "epoch": 4.269422492401215, + "grad_norm": 0.07654644360192758, + "learning_rate": 4.480781162994356e-06, + "loss": 0.533, + "step": 8778 + }, + { + "epoch": 4.269908814589666, + "grad_norm": 0.07597300033638635, + "learning_rate": 4.479830171759258e-06, + "loss": 0.5141, + "step": 8779 + }, + { + "epoch": 4.270395136778116, + "grad_norm": 0.0731460516462222, + "learning_rate": 4.478879199547009e-06, + "loss": 0.5284, + "step": 8780 + }, + { + "epoch": 4.270881458966565, + "grad_norm": 0.07529553989918555, + "learning_rate": 4.477928246392382e-06, + "loss": 0.5136, + "step": 8781 + }, + { + "epoch": 4.271367781155015, + "grad_norm": 0.07454572143905275, + "learning_rate": 4.4769773123301586e-06, + "loss": 0.5104, + "step": 8782 + }, + { + "epoch": 4.271854103343465, + "grad_norm": 0.07470779552390609, + "learning_rate": 4.47602639739511e-06, + "loss": 0.5093, + "step": 8783 + }, + { + "epoch": 4.272340425531915, + "grad_norm": 0.07558066268226521, + "learning_rate": 4.475075501622014e-06, + "loss": 0.5344, + "step": 8784 + }, + { + "epoch": 4.272826747720365, + "grad_norm": 0.0729851124107236, + "learning_rate": 4.474124625045647e-06, + "loss": 0.5071, + "step": 8785 + }, + { + "epoch": 4.273313069908815, + "grad_norm": 0.07738719736032744, + "learning_rate": 4.47317376770078e-06, + "loss": 0.5393, + "step": 8786 + }, + { + "epoch": 4.273799392097264, + "grad_norm": 0.07464123587971279, + "learning_rate": 4.47222292962219e-06, + "loss": 0.5063, + "step": 8787 + }, + { + "epoch": 4.274285714285714, + "grad_norm": 0.07743054088657754, + "learning_rate": 4.471272110844646e-06, + "loss": 0.4839, + "step": 8788 + }, + { + "epoch": 4.274772036474165, + "grad_norm": 0.07481329572451914, + "learning_rate": 4.47032131140292e-06, + "loss": 0.5122, + "step": 8789 + }, + { + "epoch": 4.275258358662614, + "grad_norm": 0.07132892533148658, + "learning_rate": 4.469370531331784e-06, + "loss": 0.5002, + "step": 8790 + }, + { + "epoch": 4.275744680851064, + "grad_norm": 0.0723130685776087, + "learning_rate": 4.4684197706660125e-06, + "loss": 0.4732, + "step": 8791 + }, + { + "epoch": 4.276231003039514, + "grad_norm": 0.0768637366148514, + "learning_rate": 4.4674690294403676e-06, + "loss": 0.5418, + "step": 8792 + }, + { + "epoch": 4.276717325227963, + "grad_norm": 0.07682422531247371, + "learning_rate": 4.466518307689624e-06, + "loss": 0.4787, + "step": 8793 + }, + { + "epoch": 4.277203647416413, + "grad_norm": 0.07394132920279146, + "learning_rate": 4.465567605448547e-06, + "loss": 0.5028, + "step": 8794 + }, + { + "epoch": 4.277689969604864, + "grad_norm": 0.07479170102597361, + "learning_rate": 4.4646169227519075e-06, + "loss": 0.478, + "step": 8795 + }, + { + "epoch": 4.278176291793313, + "grad_norm": 0.07208901257486133, + "learning_rate": 4.463666259634469e-06, + "loss": 0.512, + "step": 8796 + }, + { + "epoch": 4.278662613981763, + "grad_norm": 0.07576886107125212, + "learning_rate": 4.462715616131e-06, + "loss": 0.5116, + "step": 8797 + }, + { + "epoch": 4.279148936170213, + "grad_norm": 0.07659478748145507, + "learning_rate": 4.461764992276264e-06, + "loss": 0.5373, + "step": 8798 + }, + { + "epoch": 4.279635258358662, + "grad_norm": 0.07590146217647921, + "learning_rate": 4.460814388105027e-06, + "loss": 0.5129, + "step": 8799 + }, + { + "epoch": 4.280121580547112, + "grad_norm": 0.07245037718056008, + "learning_rate": 4.459863803652052e-06, + "loss": 0.5005, + "step": 8800 + }, + { + "epoch": 4.280607902735563, + "grad_norm": 0.078890503219173, + "learning_rate": 4.458913238952105e-06, + "loss": 0.5514, + "step": 8801 + }, + { + "epoch": 4.281094224924012, + "grad_norm": 0.07492741407117794, + "learning_rate": 4.457962694039945e-06, + "loss": 0.5171, + "step": 8802 + }, + { + "epoch": 4.281580547112462, + "grad_norm": 0.07555930591964198, + "learning_rate": 4.457012168950336e-06, + "loss": 0.4818, + "step": 8803 + }, + { + "epoch": 4.282066869300912, + "grad_norm": 0.0754832058900754, + "learning_rate": 4.456061663718039e-06, + "loss": 0.4948, + "step": 8804 + }, + { + "epoch": 4.282553191489361, + "grad_norm": 0.07530560834202397, + "learning_rate": 4.455111178377815e-06, + "loss": 0.5411, + "step": 8805 + }, + { + "epoch": 4.283039513677812, + "grad_norm": 0.07451560872676627, + "learning_rate": 4.45416071296442e-06, + "loss": 0.5386, + "step": 8806 + }, + { + "epoch": 4.283525835866262, + "grad_norm": 0.07394584771918655, + "learning_rate": 4.4532102675126185e-06, + "loss": 0.5246, + "step": 8807 + }, + { + "epoch": 4.284012158054711, + "grad_norm": 0.07491523655089026, + "learning_rate": 4.452259842057164e-06, + "loss": 0.5112, + "step": 8808 + }, + { + "epoch": 4.284498480243161, + "grad_norm": 0.07220221987507201, + "learning_rate": 4.451309436632818e-06, + "loss": 0.5077, + "step": 8809 + }, + { + "epoch": 4.284984802431611, + "grad_norm": 0.07658118758865776, + "learning_rate": 4.450359051274332e-06, + "loss": 0.5326, + "step": 8810 + }, + { + "epoch": 4.285471124620061, + "grad_norm": 0.07797859669975359, + "learning_rate": 4.449408686016467e-06, + "loss": 0.594, + "step": 8811 + }, + { + "epoch": 4.285957446808511, + "grad_norm": 0.0711988378757658, + "learning_rate": 4.448458340893979e-06, + "loss": 0.4862, + "step": 8812 + }, + { + "epoch": 4.286443768996961, + "grad_norm": 0.07391444606359074, + "learning_rate": 4.447508015941616e-06, + "loss": 0.4448, + "step": 8813 + }, + { + "epoch": 4.28693009118541, + "grad_norm": 0.07354575247285881, + "learning_rate": 4.446557711194138e-06, + "loss": 0.5024, + "step": 8814 + }, + { + "epoch": 4.28741641337386, + "grad_norm": 0.07322822137937383, + "learning_rate": 4.445607426686295e-06, + "loss": 0.5132, + "step": 8815 + }, + { + "epoch": 4.2879027355623105, + "grad_norm": 0.07224456244252213, + "learning_rate": 4.444657162452842e-06, + "loss": 0.464, + "step": 8816 + }, + { + "epoch": 4.28838905775076, + "grad_norm": 0.07602056854868319, + "learning_rate": 4.443706918528527e-06, + "loss": 0.5102, + "step": 8817 + }, + { + "epoch": 4.28887537993921, + "grad_norm": 0.07513831770111727, + "learning_rate": 4.442756694948103e-06, + "loss": 0.5173, + "step": 8818 + }, + { + "epoch": 4.2893617021276595, + "grad_norm": 0.07056319942432344, + "learning_rate": 4.441806491746319e-06, + "loss": 0.4633, + "step": 8819 + }, + { + "epoch": 4.289848024316109, + "grad_norm": 0.07639973987500419, + "learning_rate": 4.440856308957928e-06, + "loss": 0.5431, + "step": 8820 + }, + { + "epoch": 4.290334346504559, + "grad_norm": 0.07381501319111135, + "learning_rate": 4.439906146617674e-06, + "loss": 0.5064, + "step": 8821 + }, + { + "epoch": 4.2908206686930095, + "grad_norm": 0.07394732410366524, + "learning_rate": 4.438956004760307e-06, + "loss": 0.5459, + "step": 8822 + }, + { + "epoch": 4.291306990881459, + "grad_norm": 0.0759200443771861, + "learning_rate": 4.438005883420572e-06, + "loss": 0.521, + "step": 8823 + }, + { + "epoch": 4.291793313069909, + "grad_norm": 0.07572521699622785, + "learning_rate": 4.437055782633221e-06, + "loss": 0.5317, + "step": 8824 + }, + { + "epoch": 4.2922796352583585, + "grad_norm": 0.07216002648877469, + "learning_rate": 4.4361057024329926e-06, + "loss": 0.4721, + "step": 8825 + }, + { + "epoch": 4.292765957446808, + "grad_norm": 0.0730238582838513, + "learning_rate": 4.435155642854637e-06, + "loss": 0.5282, + "step": 8826 + }, + { + "epoch": 4.293252279635258, + "grad_norm": 0.07549362622246844, + "learning_rate": 4.434205603932895e-06, + "loss": 0.4735, + "step": 8827 + }, + { + "epoch": 4.2937386018237085, + "grad_norm": 0.07256035173566985, + "learning_rate": 4.433255585702511e-06, + "loss": 0.5368, + "step": 8828 + }, + { + "epoch": 4.294224924012158, + "grad_norm": 0.07309291082885862, + "learning_rate": 4.432305588198227e-06, + "loss": 0.5233, + "step": 8829 + }, + { + "epoch": 4.294711246200608, + "grad_norm": 0.07553199473785852, + "learning_rate": 4.431355611454788e-06, + "loss": 0.5143, + "step": 8830 + }, + { + "epoch": 4.2951975683890575, + "grad_norm": 0.0749011396508686, + "learning_rate": 4.43040565550693e-06, + "loss": 0.4991, + "step": 8831 + }, + { + "epoch": 4.295683890577507, + "grad_norm": 0.0743130066759057, + "learning_rate": 4.429455720389397e-06, + "loss": 0.5209, + "step": 8832 + }, + { + "epoch": 4.296170212765958, + "grad_norm": 0.07351555334264333, + "learning_rate": 4.428505806136927e-06, + "loss": 0.4973, + "step": 8833 + }, + { + "epoch": 4.2966565349544075, + "grad_norm": 0.07421232291265613, + "learning_rate": 4.427555912784262e-06, + "loss": 0.5108, + "step": 8834 + }, + { + "epoch": 4.297142857142857, + "grad_norm": 0.07293126190840091, + "learning_rate": 4.426606040366133e-06, + "loss": 0.5062, + "step": 8835 + }, + { + "epoch": 4.297629179331307, + "grad_norm": 0.07586631993850015, + "learning_rate": 4.425656188917284e-06, + "loss": 0.5331, + "step": 8836 + }, + { + "epoch": 4.2981155015197565, + "grad_norm": 0.07402833946435487, + "learning_rate": 4.42470635847245e-06, + "loss": 0.5005, + "step": 8837 + }, + { + "epoch": 4.298601823708207, + "grad_norm": 0.0744793687247343, + "learning_rate": 4.423756549066364e-06, + "loss": 0.5069, + "step": 8838 + }, + { + "epoch": 4.299088145896657, + "grad_norm": 0.07410432112739444, + "learning_rate": 4.422806760733764e-06, + "loss": 0.5029, + "step": 8839 + }, + { + "epoch": 4.2995744680851065, + "grad_norm": 0.0808500669230187, + "learning_rate": 4.421856993509382e-06, + "loss": 0.521, + "step": 8840 + }, + { + "epoch": 4.300060790273556, + "grad_norm": 0.08017556864986378, + "learning_rate": 4.420907247427954e-06, + "loss": 0.5293, + "step": 8841 + }, + { + "epoch": 4.300547112462006, + "grad_norm": 0.07343790461639645, + "learning_rate": 4.419957522524209e-06, + "loss": 0.4731, + "step": 8842 + }, + { + "epoch": 4.3010334346504555, + "grad_norm": 0.07435980626915134, + "learning_rate": 4.419007818832883e-06, + "loss": 0.4867, + "step": 8843 + }, + { + "epoch": 4.301519756838906, + "grad_norm": 0.07420557641036699, + "learning_rate": 4.4180581363887024e-06, + "loss": 0.536, + "step": 8844 + }, + { + "epoch": 4.302006079027356, + "grad_norm": 0.07333483604257106, + "learning_rate": 4.417108475226403e-06, + "loss": 0.4959, + "step": 8845 + }, + { + "epoch": 4.3024924012158055, + "grad_norm": 0.07692840014364763, + "learning_rate": 4.41615883538071e-06, + "loss": 0.5148, + "step": 8846 + }, + { + "epoch": 4.302978723404255, + "grad_norm": 0.07503908909506307, + "learning_rate": 4.415209216886354e-06, + "loss": 0.5463, + "step": 8847 + }, + { + "epoch": 4.303465045592705, + "grad_norm": 0.07222173315302174, + "learning_rate": 4.414259619778062e-06, + "loss": 0.5019, + "step": 8848 + }, + { + "epoch": 4.303951367781155, + "grad_norm": 0.0748463928362814, + "learning_rate": 4.413310044090563e-06, + "loss": 0.5038, + "step": 8849 + }, + { + "epoch": 4.304437689969605, + "grad_norm": 0.07246934839549578, + "learning_rate": 4.412360489858581e-06, + "loss": 0.5171, + "step": 8850 + }, + { + "epoch": 4.304924012158055, + "grad_norm": 0.07532778899577577, + "learning_rate": 4.4114109571168444e-06, + "loss": 0.5284, + "step": 8851 + }, + { + "epoch": 4.305410334346504, + "grad_norm": 0.07418476849790004, + "learning_rate": 4.410461445900075e-06, + "loss": 0.5427, + "step": 8852 + }, + { + "epoch": 4.305896656534954, + "grad_norm": 0.0758987137726916, + "learning_rate": 4.409511956242999e-06, + "loss": 0.5599, + "step": 8853 + }, + { + "epoch": 4.306382978723404, + "grad_norm": 0.07546546038024077, + "learning_rate": 4.408562488180338e-06, + "loss": 0.5226, + "step": 8854 + }, + { + "epoch": 4.306869300911854, + "grad_norm": 0.07586808486344018, + "learning_rate": 4.407613041746818e-06, + "loss": 0.512, + "step": 8855 + }, + { + "epoch": 4.307355623100304, + "grad_norm": 0.07588463291409742, + "learning_rate": 4.406663616977156e-06, + "loss": 0.5142, + "step": 8856 + }, + { + "epoch": 4.307841945288754, + "grad_norm": 0.0746360354744008, + "learning_rate": 4.405714213906075e-06, + "loss": 0.5046, + "step": 8857 + }, + { + "epoch": 4.308328267477203, + "grad_norm": 0.07636943917876411, + "learning_rate": 4.404764832568296e-06, + "loss": 0.5289, + "step": 8858 + }, + { + "epoch": 4.308814589665653, + "grad_norm": 0.07787474247598106, + "learning_rate": 4.403815472998539e-06, + "loss": 0.5399, + "step": 8859 + }, + { + "epoch": 4.309300911854104, + "grad_norm": 0.07916379328334402, + "learning_rate": 4.402866135231518e-06, + "loss": 0.5215, + "step": 8860 + }, + { + "epoch": 4.309787234042553, + "grad_norm": 0.07177162684397294, + "learning_rate": 4.401916819301956e-06, + "loss": 0.4987, + "step": 8861 + }, + { + "epoch": 4.310273556231003, + "grad_norm": 0.08212220367829835, + "learning_rate": 4.400967525244565e-06, + "loss": 0.5347, + "step": 8862 + }, + { + "epoch": 4.310759878419453, + "grad_norm": 0.07157022332514935, + "learning_rate": 4.400018253094065e-06, + "loss": 0.4875, + "step": 8863 + }, + { + "epoch": 4.311246200607902, + "grad_norm": 0.07344837469413698, + "learning_rate": 4.399069002885171e-06, + "loss": 0.5037, + "step": 8864 + }, + { + "epoch": 4.311732522796353, + "grad_norm": 0.07500922821222847, + "learning_rate": 4.398119774652596e-06, + "loss": 0.517, + "step": 8865 + }, + { + "epoch": 4.312218844984803, + "grad_norm": 0.07626514907512279, + "learning_rate": 4.397170568431056e-06, + "loss": 0.5396, + "step": 8866 + }, + { + "epoch": 4.312705167173252, + "grad_norm": 0.07712850986800156, + "learning_rate": 4.39622138425526e-06, + "loss": 0.5436, + "step": 8867 + }, + { + "epoch": 4.313191489361702, + "grad_norm": 0.0744705521369078, + "learning_rate": 4.395272222159923e-06, + "loss": 0.5247, + "step": 8868 + }, + { + "epoch": 4.313677811550152, + "grad_norm": 0.07333172452682638, + "learning_rate": 4.394323082179755e-06, + "loss": 0.5175, + "step": 8869 + }, + { + "epoch": 4.314164133738601, + "grad_norm": 0.07608902381385374, + "learning_rate": 4.393373964349469e-06, + "loss": 0.5087, + "step": 8870 + }, + { + "epoch": 4.314650455927052, + "grad_norm": 0.07390671415557674, + "learning_rate": 4.3924248687037705e-06, + "loss": 0.5128, + "step": 8871 + }, + { + "epoch": 4.315136778115502, + "grad_norm": 0.07595870352857209, + "learning_rate": 4.391475795277371e-06, + "loss": 0.5013, + "step": 8872 + }, + { + "epoch": 4.315623100303951, + "grad_norm": 0.07485824968183762, + "learning_rate": 4.390526744104978e-06, + "loss": 0.5327, + "step": 8873 + }, + { + "epoch": 4.316109422492401, + "grad_norm": 0.07543939663567027, + "learning_rate": 4.389577715221301e-06, + "loss": 0.532, + "step": 8874 + }, + { + "epoch": 4.316595744680851, + "grad_norm": 0.07657137578278234, + "learning_rate": 4.388628708661042e-06, + "loss": 0.4964, + "step": 8875 + }, + { + "epoch": 4.317082066869301, + "grad_norm": 0.07160964541451799, + "learning_rate": 4.387679724458911e-06, + "loss": 0.5087, + "step": 8876 + }, + { + "epoch": 4.317568389057751, + "grad_norm": 0.0719238648155246, + "learning_rate": 4.3867307626496085e-06, + "loss": 0.5192, + "step": 8877 + }, + { + "epoch": 4.318054711246201, + "grad_norm": 0.0710696258549487, + "learning_rate": 4.385781823267841e-06, + "loss": 0.4829, + "step": 8878 + }, + { + "epoch": 4.31854103343465, + "grad_norm": 0.07172916800527629, + "learning_rate": 4.384832906348311e-06, + "loss": 0.4919, + "step": 8879 + }, + { + "epoch": 4.3190273556231, + "grad_norm": 0.0750671578329595, + "learning_rate": 4.383884011925723e-06, + "loss": 0.5121, + "step": 8880 + }, + { + "epoch": 4.31951367781155, + "grad_norm": 0.07255129956589293, + "learning_rate": 4.382935140034775e-06, + "loss": 0.4896, + "step": 8881 + }, + { + "epoch": 4.32, + "grad_norm": 0.07452289204902597, + "learning_rate": 4.38198629071017e-06, + "loss": 0.5205, + "step": 8882 + }, + { + "epoch": 4.32048632218845, + "grad_norm": 0.07469569549049279, + "learning_rate": 4.3810374639866055e-06, + "loss": 0.5344, + "step": 8883 + }, + { + "epoch": 4.3209726443769, + "grad_norm": 0.07242040528457883, + "learning_rate": 4.380088659898784e-06, + "loss": 0.4895, + "step": 8884 + }, + { + "epoch": 4.321458966565349, + "grad_norm": 0.07330868075861767, + "learning_rate": 4.379139878481401e-06, + "loss": 0.5048, + "step": 8885 + }, + { + "epoch": 4.321945288753799, + "grad_norm": 0.07232087946460136, + "learning_rate": 4.378191119769155e-06, + "loss": 0.4647, + "step": 8886 + }, + { + "epoch": 4.32243161094225, + "grad_norm": 0.07518700637971938, + "learning_rate": 4.3772423837967415e-06, + "loss": 0.5091, + "step": 8887 + }, + { + "epoch": 4.322917933130699, + "grad_norm": 0.07534152298482906, + "learning_rate": 4.3762936705988566e-06, + "loss": 0.4925, + "step": 8888 + }, + { + "epoch": 4.323404255319149, + "grad_norm": 0.07477823833745276, + "learning_rate": 4.375344980210198e-06, + "loss": 0.5331, + "step": 8889 + }, + { + "epoch": 4.323890577507599, + "grad_norm": 0.07416128838319172, + "learning_rate": 4.3743963126654555e-06, + "loss": 0.5204, + "step": 8890 + }, + { + "epoch": 4.324376899696048, + "grad_norm": 0.07471537661589851, + "learning_rate": 4.373447667999326e-06, + "loss": 0.5487, + "step": 8891 + }, + { + "epoch": 4.324863221884499, + "grad_norm": 0.0755163967098278, + "learning_rate": 4.372499046246497e-06, + "loss": 0.518, + "step": 8892 + }, + { + "epoch": 4.325349544072949, + "grad_norm": 0.07206053574538679, + "learning_rate": 4.371550447441665e-06, + "loss": 0.527, + "step": 8893 + }, + { + "epoch": 4.325835866261398, + "grad_norm": 0.07172106928710374, + "learning_rate": 4.370601871619517e-06, + "loss": 0.466, + "step": 8894 + }, + { + "epoch": 4.326322188449848, + "grad_norm": 0.07554363047926235, + "learning_rate": 4.369653318814747e-06, + "loss": 0.4929, + "step": 8895 + }, + { + "epoch": 4.326808510638298, + "grad_norm": 0.07168538598565016, + "learning_rate": 4.368704789062039e-06, + "loss": 0.5249, + "step": 8896 + }, + { + "epoch": 4.327294832826747, + "grad_norm": 0.07111451244903727, + "learning_rate": 4.367756282396085e-06, + "loss": 0.4887, + "step": 8897 + }, + { + "epoch": 4.327781155015198, + "grad_norm": 0.07601813067434408, + "learning_rate": 4.36680779885157e-06, + "loss": 0.5241, + "step": 8898 + }, + { + "epoch": 4.328267477203648, + "grad_norm": 0.07344152425421795, + "learning_rate": 4.365859338463183e-06, + "loss": 0.492, + "step": 8899 + }, + { + "epoch": 4.328753799392097, + "grad_norm": 0.07234623875486963, + "learning_rate": 4.364910901265607e-06, + "loss": 0.4697, + "step": 8900 + }, + { + "epoch": 4.329240121580547, + "grad_norm": 0.06980103112906447, + "learning_rate": 4.363962487293528e-06, + "loss": 0.4496, + "step": 8901 + }, + { + "epoch": 4.329726443768997, + "grad_norm": 0.07795854916768889, + "learning_rate": 4.3630140965816294e-06, + "loss": 0.5255, + "step": 8902 + }, + { + "epoch": 4.330212765957447, + "grad_norm": 0.0718304546288623, + "learning_rate": 4.362065729164596e-06, + "loss": 0.495, + "step": 8903 + }, + { + "epoch": 4.330699088145897, + "grad_norm": 0.07424082195604742, + "learning_rate": 4.3611173850771074e-06, + "loss": 0.509, + "step": 8904 + }, + { + "epoch": 4.331185410334347, + "grad_norm": 0.07410623741866641, + "learning_rate": 4.360169064353848e-06, + "loss": 0.5327, + "step": 8905 + }, + { + "epoch": 4.331671732522796, + "grad_norm": 0.07232232562747383, + "learning_rate": 4.359220767029495e-06, + "loss": 0.4998, + "step": 8906 + }, + { + "epoch": 4.332158054711246, + "grad_norm": 0.07230138257650034, + "learning_rate": 4.35827249313873e-06, + "loss": 0.4678, + "step": 8907 + }, + { + "epoch": 4.332644376899696, + "grad_norm": 0.07729282519907332, + "learning_rate": 4.357324242716231e-06, + "loss": 0.5317, + "step": 8908 + }, + { + "epoch": 4.333130699088146, + "grad_norm": 0.07657744428302635, + "learning_rate": 4.356376015796678e-06, + "loss": 0.5075, + "step": 8909 + }, + { + "epoch": 4.333617021276596, + "grad_norm": 0.07766636248162703, + "learning_rate": 4.355427812414745e-06, + "loss": 0.5074, + "step": 8910 + }, + { + "epoch": 4.3341033434650456, + "grad_norm": 0.0738651748799548, + "learning_rate": 4.35447963260511e-06, + "loss": 0.5013, + "step": 8911 + }, + { + "epoch": 4.334589665653495, + "grad_norm": 0.07498968654727205, + "learning_rate": 4.3535314764024475e-06, + "loss": 0.505, + "step": 8912 + }, + { + "epoch": 4.335075987841945, + "grad_norm": 0.07508829461148832, + "learning_rate": 4.352583343841435e-06, + "loss": 0.5594, + "step": 8913 + }, + { + "epoch": 4.3355623100303955, + "grad_norm": 0.07542999148669198, + "learning_rate": 4.351635234956741e-06, + "loss": 0.5123, + "step": 8914 + }, + { + "epoch": 4.336048632218845, + "grad_norm": 0.07253343119260115, + "learning_rate": 4.350687149783042e-06, + "loss": 0.5072, + "step": 8915 + }, + { + "epoch": 4.336534954407295, + "grad_norm": 0.07381610424724026, + "learning_rate": 4.34973908835501e-06, + "loss": 0.5238, + "step": 8916 + }, + { + "epoch": 4.3370212765957445, + "grad_norm": 0.07620926980982037, + "learning_rate": 4.3487910507073124e-06, + "loss": 0.5049, + "step": 8917 + }, + { + "epoch": 4.337507598784194, + "grad_norm": 0.07303803359960734, + "learning_rate": 4.347843036874625e-06, + "loss": 0.4915, + "step": 8918 + }, + { + "epoch": 4.337993920972645, + "grad_norm": 0.07296787195692302, + "learning_rate": 4.346895046891612e-06, + "loss": 0.5065, + "step": 8919 + }, + { + "epoch": 4.3384802431610945, + "grad_norm": 0.07210109231810234, + "learning_rate": 4.345947080792946e-06, + "loss": 0.4655, + "step": 8920 + }, + { + "epoch": 4.338966565349544, + "grad_norm": 0.07411960088034263, + "learning_rate": 4.34499913861329e-06, + "loss": 0.5232, + "step": 8921 + }, + { + "epoch": 4.339452887537994, + "grad_norm": 0.07528378630595386, + "learning_rate": 4.344051220387314e-06, + "loss": 0.5348, + "step": 8922 + }, + { + "epoch": 4.3399392097264435, + "grad_norm": 0.0785423273024407, + "learning_rate": 4.343103326149682e-06, + "loss": 0.5235, + "step": 8923 + }, + { + "epoch": 4.340425531914893, + "grad_norm": 0.0752622483213002, + "learning_rate": 4.342155455935063e-06, + "loss": 0.4896, + "step": 8924 + }, + { + "epoch": 4.340911854103344, + "grad_norm": 0.07493896221957322, + "learning_rate": 4.341207609778114e-06, + "loss": 0.5302, + "step": 8925 + }, + { + "epoch": 4.3413981762917935, + "grad_norm": 0.0752287306881958, + "learning_rate": 4.340259787713505e-06, + "loss": 0.4989, + "step": 8926 + }, + { + "epoch": 4.341884498480243, + "grad_norm": 0.07617569671450834, + "learning_rate": 4.339311989775893e-06, + "loss": 0.5205, + "step": 8927 + }, + { + "epoch": 4.342370820668693, + "grad_norm": 0.07386382882538522, + "learning_rate": 4.338364215999944e-06, + "loss": 0.491, + "step": 8928 + }, + { + "epoch": 4.3428571428571425, + "grad_norm": 0.07608096331178003, + "learning_rate": 4.337416466420313e-06, + "loss": 0.519, + "step": 8929 + }, + { + "epoch": 4.343343465045593, + "grad_norm": 0.0739610071552224, + "learning_rate": 4.3364687410716665e-06, + "loss": 0.4859, + "step": 8930 + }, + { + "epoch": 4.343829787234043, + "grad_norm": 0.07344625314616673, + "learning_rate": 4.335521039988657e-06, + "loss": 0.5178, + "step": 8931 + }, + { + "epoch": 4.3443161094224925, + "grad_norm": 0.0764110206964944, + "learning_rate": 4.334573363205946e-06, + "loss": 0.519, + "step": 8932 + }, + { + "epoch": 4.344802431610942, + "grad_norm": 0.07591205649450931, + "learning_rate": 4.333625710758188e-06, + "loss": 0.5117, + "step": 8933 + }, + { + "epoch": 4.345288753799392, + "grad_norm": 0.07616932213990428, + "learning_rate": 4.332678082680043e-06, + "loss": 0.5639, + "step": 8934 + }, + { + "epoch": 4.3457750759878415, + "grad_norm": 0.07522738759425052, + "learning_rate": 4.331730479006162e-06, + "loss": 0.5384, + "step": 8935 + }, + { + "epoch": 4.346261398176292, + "grad_norm": 0.07399524553376609, + "learning_rate": 4.330782899771201e-06, + "loss": 0.4959, + "step": 8936 + }, + { + "epoch": 4.346747720364742, + "grad_norm": 0.07395539774238039, + "learning_rate": 4.329835345009813e-06, + "loss": 0.5462, + "step": 8937 + }, + { + "epoch": 4.3472340425531915, + "grad_norm": 0.076310336152694, + "learning_rate": 4.328887814756653e-06, + "loss": 0.5394, + "step": 8938 + }, + { + "epoch": 4.347720364741641, + "grad_norm": 0.07363981733818392, + "learning_rate": 4.327940309046368e-06, + "loss": 0.5038, + "step": 8939 + }, + { + "epoch": 4.348206686930091, + "grad_norm": 0.0720931778531534, + "learning_rate": 4.326992827913613e-06, + "loss": 0.5087, + "step": 8940 + }, + { + "epoch": 4.348693009118541, + "grad_norm": 0.07350593493429489, + "learning_rate": 4.326045371393034e-06, + "loss": 0.5237, + "step": 8941 + }, + { + "epoch": 4.349179331306991, + "grad_norm": 0.07323029716157982, + "learning_rate": 4.3250979395192834e-06, + "loss": 0.5078, + "step": 8942 + }, + { + "epoch": 4.349665653495441, + "grad_norm": 0.07414676450484353, + "learning_rate": 4.324150532327009e-06, + "loss": 0.4922, + "step": 8943 + }, + { + "epoch": 4.3501519756838904, + "grad_norm": 0.07132569584618621, + "learning_rate": 4.323203149850855e-06, + "loss": 0.4983, + "step": 8944 + }, + { + "epoch": 4.35063829787234, + "grad_norm": 0.07358142836906066, + "learning_rate": 4.322255792125471e-06, + "loss": 0.5077, + "step": 8945 + }, + { + "epoch": 4.351124620060791, + "grad_norm": 0.0735995225088054, + "learning_rate": 4.3213084591854984e-06, + "loss": 0.5218, + "step": 8946 + }, + { + "epoch": 4.35161094224924, + "grad_norm": 0.07513396594071299, + "learning_rate": 4.3203611510655845e-06, + "loss": 0.5021, + "step": 8947 + }, + { + "epoch": 4.35209726443769, + "grad_norm": 0.07529187866288382, + "learning_rate": 4.319413867800372e-06, + "loss": 0.5214, + "step": 8948 + }, + { + "epoch": 4.35258358662614, + "grad_norm": 0.07381393236850418, + "learning_rate": 4.318466609424505e-06, + "loss": 0.4673, + "step": 8949 + }, + { + "epoch": 4.353069908814589, + "grad_norm": 0.07677864929108318, + "learning_rate": 4.317519375972622e-06, + "loss": 0.5617, + "step": 8950 + }, + { + "epoch": 4.353556231003039, + "grad_norm": 0.0740033176650963, + "learning_rate": 4.316572167479366e-06, + "loss": 0.4654, + "step": 8951 + }, + { + "epoch": 4.35404255319149, + "grad_norm": 0.07532289408922148, + "learning_rate": 4.315624983979375e-06, + "loss": 0.5114, + "step": 8952 + }, + { + "epoch": 4.354528875379939, + "grad_norm": 0.07369310003524242, + "learning_rate": 4.314677825507293e-06, + "loss": 0.502, + "step": 8953 + }, + { + "epoch": 4.355015197568389, + "grad_norm": 0.077197710604692, + "learning_rate": 4.313730692097751e-06, + "loss": 0.5725, + "step": 8954 + }, + { + "epoch": 4.355501519756839, + "grad_norm": 0.07713828472241255, + "learning_rate": 4.31278358378539e-06, + "loss": 0.5291, + "step": 8955 + }, + { + "epoch": 4.355987841945288, + "grad_norm": 0.07558809496618385, + "learning_rate": 4.311836500604846e-06, + "loss": 0.5169, + "step": 8956 + }, + { + "epoch": 4.356474164133739, + "grad_norm": 0.07539873407292505, + "learning_rate": 4.310889442590755e-06, + "loss": 0.5115, + "step": 8957 + }, + { + "epoch": 4.356960486322189, + "grad_norm": 0.072277900988303, + "learning_rate": 4.309942409777747e-06, + "loss": 0.4769, + "step": 8958 + }, + { + "epoch": 4.357446808510638, + "grad_norm": 0.07463996880449425, + "learning_rate": 4.308995402200462e-06, + "loss": 0.4942, + "step": 8959 + }, + { + "epoch": 4.357933130699088, + "grad_norm": 0.07230046070225032, + "learning_rate": 4.308048419893527e-06, + "loss": 0.537, + "step": 8960 + }, + { + "epoch": 4.358419452887538, + "grad_norm": 0.0745794181193096, + "learning_rate": 4.307101462891576e-06, + "loss": 0.5064, + "step": 8961 + }, + { + "epoch": 4.358905775075987, + "grad_norm": 0.07383499078362998, + "learning_rate": 4.306154531229239e-06, + "loss": 0.5221, + "step": 8962 + }, + { + "epoch": 4.359392097264438, + "grad_norm": 0.07717260718233843, + "learning_rate": 4.305207624941148e-06, + "loss": 0.5121, + "step": 8963 + }, + { + "epoch": 4.359878419452888, + "grad_norm": 0.07366586505225742, + "learning_rate": 4.304260744061928e-06, + "loss": 0.5302, + "step": 8964 + }, + { + "epoch": 4.360364741641337, + "grad_norm": 0.07665854726318598, + "learning_rate": 4.303313888626208e-06, + "loss": 0.4999, + "step": 8965 + }, + { + "epoch": 4.360851063829787, + "grad_norm": 0.07451813847677896, + "learning_rate": 4.302367058668617e-06, + "loss": 0.521, + "step": 8966 + }, + { + "epoch": 4.361337386018237, + "grad_norm": 0.0752899722033896, + "learning_rate": 4.3014202542237785e-06, + "loss": 0.5177, + "step": 8967 + }, + { + "epoch": 4.361823708206687, + "grad_norm": 0.07849829279744301, + "learning_rate": 4.3004734753263205e-06, + "loss": 0.5551, + "step": 8968 + }, + { + "epoch": 4.362310030395137, + "grad_norm": 0.07444937303631532, + "learning_rate": 4.2995267220108634e-06, + "loss": 0.5061, + "step": 8969 + }, + { + "epoch": 4.362796352583587, + "grad_norm": 0.074694372403024, + "learning_rate": 4.298579994312034e-06, + "loss": 0.556, + "step": 8970 + }, + { + "epoch": 4.363282674772036, + "grad_norm": 0.0717691317317463, + "learning_rate": 4.2976332922644515e-06, + "loss": 0.4737, + "step": 8971 + }, + { + "epoch": 4.363768996960486, + "grad_norm": 0.07660514643130598, + "learning_rate": 4.296686615902739e-06, + "loss": 0.5236, + "step": 8972 + }, + { + "epoch": 4.364255319148937, + "grad_norm": 0.07405669123126633, + "learning_rate": 4.295739965261516e-06, + "loss": 0.4998, + "step": 8973 + }, + { + "epoch": 4.364741641337386, + "grad_norm": 0.07370248275717206, + "learning_rate": 4.294793340375405e-06, + "loss": 0.4899, + "step": 8974 + }, + { + "epoch": 4.365227963525836, + "grad_norm": 0.07612564587026457, + "learning_rate": 4.293846741279019e-06, + "loss": 0.5209, + "step": 8975 + }, + { + "epoch": 4.365714285714286, + "grad_norm": 0.07453807468611076, + "learning_rate": 4.292900168006979e-06, + "loss": 0.545, + "step": 8976 + }, + { + "epoch": 4.366200607902735, + "grad_norm": 0.07406787688920971, + "learning_rate": 4.291953620593902e-06, + "loss": 0.5315, + "step": 8977 + }, + { + "epoch": 4.366686930091185, + "grad_norm": 0.07537165161300213, + "learning_rate": 4.291007099074403e-06, + "loss": 0.478, + "step": 8978 + }, + { + "epoch": 4.367173252279636, + "grad_norm": 0.0706841018736068, + "learning_rate": 4.290060603483095e-06, + "loss": 0.4843, + "step": 8979 + }, + { + "epoch": 4.367659574468085, + "grad_norm": 0.07394689173604536, + "learning_rate": 4.289114133854594e-06, + "loss": 0.4929, + "step": 8980 + }, + { + "epoch": 4.368145896656535, + "grad_norm": 0.07959312816235899, + "learning_rate": 4.288167690223512e-06, + "loss": 0.5236, + "step": 8981 + }, + { + "epoch": 4.368632218844985, + "grad_norm": 0.07215387193545882, + "learning_rate": 4.287221272624462e-06, + "loss": 0.508, + "step": 8982 + }, + { + "epoch": 4.369118541033434, + "grad_norm": 0.07349109961906883, + "learning_rate": 4.286274881092053e-06, + "loss": 0.5188, + "step": 8983 + }, + { + "epoch": 4.369604863221885, + "grad_norm": 0.07671258711287796, + "learning_rate": 4.285328515660897e-06, + "loss": 0.4767, + "step": 8984 + }, + { + "epoch": 4.370091185410335, + "grad_norm": 0.0745084888237647, + "learning_rate": 4.2843821763656e-06, + "loss": 0.4997, + "step": 8985 + }, + { + "epoch": 4.370577507598784, + "grad_norm": 0.07575426080794644, + "learning_rate": 4.283435863240773e-06, + "loss": 0.515, + "step": 8986 + }, + { + "epoch": 4.371063829787234, + "grad_norm": 0.07716065604557355, + "learning_rate": 4.282489576321021e-06, + "loss": 0.5146, + "step": 8987 + }, + { + "epoch": 4.371550151975684, + "grad_norm": 0.07492100670539603, + "learning_rate": 4.281543315640953e-06, + "loss": 0.5419, + "step": 8988 + }, + { + "epoch": 4.372036474164133, + "grad_norm": 0.07544483639828305, + "learning_rate": 4.280597081235171e-06, + "loss": 0.4932, + "step": 8989 + }, + { + "epoch": 4.372522796352584, + "grad_norm": 0.07659431445285168, + "learning_rate": 4.279650873138281e-06, + "loss": 0.5369, + "step": 8990 + }, + { + "epoch": 4.373009118541034, + "grad_norm": 0.07900993196104746, + "learning_rate": 4.278704691384885e-06, + "loss": 0.5497, + "step": 8991 + }, + { + "epoch": 4.373495440729483, + "grad_norm": 0.07372706145496581, + "learning_rate": 4.277758536009588e-06, + "loss": 0.4863, + "step": 8992 + }, + { + "epoch": 4.373981762917933, + "grad_norm": 0.07524489423047519, + "learning_rate": 4.2768124070469875e-06, + "loss": 0.5547, + "step": 8993 + }, + { + "epoch": 4.374468085106383, + "grad_norm": 0.07436807266822268, + "learning_rate": 4.2758663045316866e-06, + "loss": 0.5287, + "step": 8994 + }, + { + "epoch": 4.374954407294833, + "grad_norm": 0.07330239069204415, + "learning_rate": 4.274920228498284e-06, + "loss": 0.4791, + "step": 8995 + }, + { + "epoch": 4.375440729483283, + "grad_norm": 0.07651491508045516, + "learning_rate": 4.273974178981377e-06, + "loss": 0.5091, + "step": 8996 + }, + { + "epoch": 4.375927051671733, + "grad_norm": 0.07290540102598252, + "learning_rate": 4.273028156015566e-06, + "loss": 0.5051, + "step": 8997 + }, + { + "epoch": 4.376413373860182, + "grad_norm": 0.0728792059861237, + "learning_rate": 4.2720821596354444e-06, + "loss": 0.4997, + "step": 8998 + }, + { + "epoch": 4.376899696048632, + "grad_norm": 0.07491539302081295, + "learning_rate": 4.271136189875611e-06, + "loss": 0.5254, + "step": 8999 + }, + { + "epoch": 4.3773860182370825, + "grad_norm": 0.07318635320704184, + "learning_rate": 4.270190246770656e-06, + "loss": 0.4871, + "step": 9000 + }, + { + "epoch": 4.377872340425532, + "grad_norm": 0.07266173846733405, + "learning_rate": 4.2692443303551755e-06, + "loss": 0.5061, + "step": 9001 + }, + { + "epoch": 4.378358662613982, + "grad_norm": 0.07563284034120024, + "learning_rate": 4.268298440663762e-06, + "loss": 0.5251, + "step": 9002 + }, + { + "epoch": 4.378844984802432, + "grad_norm": 0.07510102872727847, + "learning_rate": 4.267352577731008e-06, + "loss": 0.5344, + "step": 9003 + }, + { + "epoch": 4.379331306990881, + "grad_norm": 0.07619567204533358, + "learning_rate": 4.266406741591502e-06, + "loss": 0.5376, + "step": 9004 + }, + { + "epoch": 4.379817629179331, + "grad_norm": 0.07965380077124826, + "learning_rate": 4.2654609322798345e-06, + "loss": 0.5328, + "step": 9005 + }, + { + "epoch": 4.3803039513677815, + "grad_norm": 0.07503217431703178, + "learning_rate": 4.264515149830595e-06, + "loss": 0.5251, + "step": 9006 + }, + { + "epoch": 4.380790273556231, + "grad_norm": 0.07270153742412923, + "learning_rate": 4.263569394278371e-06, + "loss": 0.5059, + "step": 9007 + }, + { + "epoch": 4.381276595744681, + "grad_norm": 0.07763096972220519, + "learning_rate": 4.262623665657748e-06, + "loss": 0.5616, + "step": 9008 + }, + { + "epoch": 4.3817629179331306, + "grad_norm": 0.07140433487202989, + "learning_rate": 4.261677964003313e-06, + "loss": 0.4779, + "step": 9009 + }, + { + "epoch": 4.38224924012158, + "grad_norm": 0.07042517833143659, + "learning_rate": 4.2607322893496495e-06, + "loss": 0.4458, + "step": 9010 + }, + { + "epoch": 4.382735562310031, + "grad_norm": 0.07833307615913485, + "learning_rate": 4.259786641731344e-06, + "loss": 0.5316, + "step": 9011 + }, + { + "epoch": 4.3832218844984805, + "grad_norm": 0.07704773043950978, + "learning_rate": 4.2588410211829755e-06, + "loss": 0.5075, + "step": 9012 + }, + { + "epoch": 4.38370820668693, + "grad_norm": 0.07293520348608629, + "learning_rate": 4.257895427739129e-06, + "loss": 0.5056, + "step": 9013 + }, + { + "epoch": 4.38419452887538, + "grad_norm": 0.07649131670078893, + "learning_rate": 4.256949861434382e-06, + "loss": 0.5154, + "step": 9014 + }, + { + "epoch": 4.3846808510638295, + "grad_norm": 0.07279869677145666, + "learning_rate": 4.256004322303318e-06, + "loss": 0.4945, + "step": 9015 + }, + { + "epoch": 4.385167173252279, + "grad_norm": 0.07657900549816428, + "learning_rate": 4.255058810380512e-06, + "loss": 0.5043, + "step": 9016 + }, + { + "epoch": 4.38565349544073, + "grad_norm": 0.07543383162759293, + "learning_rate": 4.254113325700547e-06, + "loss": 0.5435, + "step": 9017 + }, + { + "epoch": 4.3861398176291795, + "grad_norm": 0.07496872333988061, + "learning_rate": 4.253167868297993e-06, + "loss": 0.5104, + "step": 9018 + }, + { + "epoch": 4.386626139817629, + "grad_norm": 0.07456958186471606, + "learning_rate": 4.25222243820743e-06, + "loss": 0.526, + "step": 9019 + }, + { + "epoch": 4.387112462006079, + "grad_norm": 0.07576880618808729, + "learning_rate": 4.251277035463433e-06, + "loss": 0.5437, + "step": 9020 + }, + { + "epoch": 4.3875987841945285, + "grad_norm": 0.07671835050407548, + "learning_rate": 4.250331660100574e-06, + "loss": 0.5402, + "step": 9021 + }, + { + "epoch": 4.388085106382979, + "grad_norm": 0.07370104775995952, + "learning_rate": 4.24938631215343e-06, + "loss": 0.4894, + "step": 9022 + }, + { + "epoch": 4.388571428571429, + "grad_norm": 0.07142073350530018, + "learning_rate": 4.248440991656566e-06, + "loss": 0.4923, + "step": 9023 + }, + { + "epoch": 4.3890577507598785, + "grad_norm": 0.07128914650917673, + "learning_rate": 4.247495698644559e-06, + "loss": 0.471, + "step": 9024 + }, + { + "epoch": 4.389544072948328, + "grad_norm": 0.07455547258875524, + "learning_rate": 4.246550433151973e-06, + "loss": 0.4934, + "step": 9025 + }, + { + "epoch": 4.390030395136778, + "grad_norm": 0.07402123021651788, + "learning_rate": 4.245605195213383e-06, + "loss": 0.4964, + "step": 9026 + }, + { + "epoch": 4.390516717325228, + "grad_norm": 0.0723470715913622, + "learning_rate": 4.244659984863352e-06, + "loss": 0.5222, + "step": 9027 + }, + { + "epoch": 4.391003039513678, + "grad_norm": 0.07452849188939858, + "learning_rate": 4.24371480213645e-06, + "loss": 0.5106, + "step": 9028 + }, + { + "epoch": 4.391489361702128, + "grad_norm": 0.07590476558689858, + "learning_rate": 4.24276964706724e-06, + "loss": 0.5182, + "step": 9029 + }, + { + "epoch": 4.3919756838905775, + "grad_norm": 0.07355395019148747, + "learning_rate": 4.241824519690288e-06, + "loss": 0.5071, + "step": 9030 + }, + { + "epoch": 4.392462006079027, + "grad_norm": 0.07554176845373808, + "learning_rate": 4.240879420040158e-06, + "loss": 0.4862, + "step": 9031 + }, + { + "epoch": 4.392948328267477, + "grad_norm": 0.0741586743298622, + "learning_rate": 4.239934348151413e-06, + "loss": 0.5015, + "step": 9032 + }, + { + "epoch": 4.393434650455927, + "grad_norm": 0.07362941654653629, + "learning_rate": 4.2389893040586136e-06, + "loss": 0.4873, + "step": 9033 + }, + { + "epoch": 4.393920972644377, + "grad_norm": 0.07226512902417866, + "learning_rate": 4.238044287796322e-06, + "loss": 0.4856, + "step": 9034 + }, + { + "epoch": 4.394407294832827, + "grad_norm": 0.07377129558844482, + "learning_rate": 4.237099299399095e-06, + "loss": 0.522, + "step": 9035 + }, + { + "epoch": 4.3948936170212765, + "grad_norm": 0.07360844929935485, + "learning_rate": 4.236154338901496e-06, + "loss": 0.5206, + "step": 9036 + }, + { + "epoch": 4.395379939209726, + "grad_norm": 0.07534835206032656, + "learning_rate": 4.235209406338078e-06, + "loss": 0.5283, + "step": 9037 + }, + { + "epoch": 4.395866261398176, + "grad_norm": 0.07715964041408652, + "learning_rate": 4.234264501743401e-06, + "loss": 0.5056, + "step": 9038 + }, + { + "epoch": 4.396352583586626, + "grad_norm": 0.07436895408191456, + "learning_rate": 4.233319625152017e-06, + "loss": 0.5255, + "step": 9039 + }, + { + "epoch": 4.396838905775076, + "grad_norm": 0.07375793144052216, + "learning_rate": 4.232374776598483e-06, + "loss": 0.5257, + "step": 9040 + }, + { + "epoch": 4.397325227963526, + "grad_norm": 0.07441814047501576, + "learning_rate": 4.231429956117353e-06, + "loss": 0.5095, + "step": 9041 + }, + { + "epoch": 4.3978115501519754, + "grad_norm": 0.07523545678108617, + "learning_rate": 4.23048516374318e-06, + "loss": 0.4825, + "step": 9042 + }, + { + "epoch": 4.398297872340425, + "grad_norm": 0.07379752232221334, + "learning_rate": 4.2295403995105114e-06, + "loss": 0.5019, + "step": 9043 + }, + { + "epoch": 4.398784194528876, + "grad_norm": 0.07290779688624605, + "learning_rate": 4.228595663453902e-06, + "loss": 0.4999, + "step": 9044 + }, + { + "epoch": 4.399270516717325, + "grad_norm": 0.07228222378426806, + "learning_rate": 4.227650955607898e-06, + "loss": 0.46, + "step": 9045 + }, + { + "epoch": 4.399756838905775, + "grad_norm": 0.08030212702038708, + "learning_rate": 4.22670627600705e-06, + "loss": 0.5914, + "step": 9046 + }, + { + "epoch": 4.400243161094225, + "grad_norm": 0.07762499667310019, + "learning_rate": 4.225761624685907e-06, + "loss": 0.5488, + "step": 9047 + }, + { + "epoch": 4.400729483282674, + "grad_norm": 0.07455248583218539, + "learning_rate": 4.224817001679011e-06, + "loss": 0.5573, + "step": 9048 + }, + { + "epoch": 4.401215805471125, + "grad_norm": 0.07747893884142715, + "learning_rate": 4.2238724070209106e-06, + "loss": 0.5025, + "step": 9049 + }, + { + "epoch": 4.401702127659575, + "grad_norm": 0.0731013291850829, + "learning_rate": 4.222927840746147e-06, + "loss": 0.4728, + "step": 9050 + }, + { + "epoch": 4.402188449848024, + "grad_norm": 0.07551629917769796, + "learning_rate": 4.221983302889268e-06, + "loss": 0.4967, + "step": 9051 + }, + { + "epoch": 4.402674772036474, + "grad_norm": 0.07470691787108497, + "learning_rate": 4.2210387934848115e-06, + "loss": 0.5216, + "step": 9052 + }, + { + "epoch": 4.403161094224924, + "grad_norm": 0.0741162819899098, + "learning_rate": 4.220094312567322e-06, + "loss": 0.5122, + "step": 9053 + }, + { + "epoch": 4.403647416413374, + "grad_norm": 0.0745271948599348, + "learning_rate": 4.219149860171335e-06, + "loss": 0.5226, + "step": 9054 + }, + { + "epoch": 4.404133738601824, + "grad_norm": 0.07332281977617179, + "learning_rate": 4.218205436331394e-06, + "loss": 0.4995, + "step": 9055 + }, + { + "epoch": 4.404620060790274, + "grad_norm": 0.0767985456368329, + "learning_rate": 4.217261041082034e-06, + "loss": 0.5438, + "step": 9056 + }, + { + "epoch": 4.405106382978723, + "grad_norm": 0.07416760821963926, + "learning_rate": 4.216316674457796e-06, + "loss": 0.531, + "step": 9057 + }, + { + "epoch": 4.405592705167173, + "grad_norm": 0.07336749337127642, + "learning_rate": 4.215372336493211e-06, + "loss": 0.4996, + "step": 9058 + }, + { + "epoch": 4.406079027355623, + "grad_norm": 0.07550463856438298, + "learning_rate": 4.214428027222816e-06, + "loss": 0.4821, + "step": 9059 + }, + { + "epoch": 4.406565349544073, + "grad_norm": 0.07507610205327063, + "learning_rate": 4.2134837466811455e-06, + "loss": 0.5056, + "step": 9060 + }, + { + "epoch": 4.407051671732523, + "grad_norm": 0.07563393938130927, + "learning_rate": 4.212539494902734e-06, + "loss": 0.4904, + "step": 9061 + }, + { + "epoch": 4.407537993920973, + "grad_norm": 0.07219387953873002, + "learning_rate": 4.211595271922108e-06, + "loss": 0.4859, + "step": 9062 + }, + { + "epoch": 4.408024316109422, + "grad_norm": 0.07428949609653966, + "learning_rate": 4.210651077773803e-06, + "loss": 0.5054, + "step": 9063 + }, + { + "epoch": 4.408510638297872, + "grad_norm": 0.07332929695079349, + "learning_rate": 4.209706912492345e-06, + "loss": 0.5305, + "step": 9064 + }, + { + "epoch": 4.408996960486322, + "grad_norm": 0.07254212468718906, + "learning_rate": 4.208762776112265e-06, + "loss": 0.5064, + "step": 9065 + }, + { + "epoch": 4.409483282674772, + "grad_norm": 0.07222729982152454, + "learning_rate": 4.207818668668089e-06, + "loss": 0.4754, + "step": 9066 + }, + { + "epoch": 4.409969604863222, + "grad_norm": 0.07224255179255343, + "learning_rate": 4.2068745901943465e-06, + "loss": 0.4936, + "step": 9067 + }, + { + "epoch": 4.410455927051672, + "grad_norm": 0.07698495376583506, + "learning_rate": 4.205930540725558e-06, + "loss": 0.5543, + "step": 9068 + }, + { + "epoch": 4.410942249240121, + "grad_norm": 0.07485188419614182, + "learning_rate": 4.204986520296251e-06, + "loss": 0.4787, + "step": 9069 + }, + { + "epoch": 4.411428571428571, + "grad_norm": 0.07241756838664043, + "learning_rate": 4.204042528940948e-06, + "loss": 0.481, + "step": 9070 + }, + { + "epoch": 4.411914893617022, + "grad_norm": 0.07401808347233409, + "learning_rate": 4.203098566694174e-06, + "loss": 0.4906, + "step": 9071 + }, + { + "epoch": 4.412401215805471, + "grad_norm": 0.07472563348904236, + "learning_rate": 4.202154633590444e-06, + "loss": 0.5118, + "step": 9072 + }, + { + "epoch": 4.412887537993921, + "grad_norm": 0.07463649824438731, + "learning_rate": 4.201210729664282e-06, + "loss": 0.4953, + "step": 9073 + }, + { + "epoch": 4.413373860182371, + "grad_norm": 0.07461440640088356, + "learning_rate": 4.200266854950208e-06, + "loss": 0.532, + "step": 9074 + }, + { + "epoch": 4.41386018237082, + "grad_norm": 0.07872846831950941, + "learning_rate": 4.1993230094827365e-06, + "loss": 0.5259, + "step": 9075 + }, + { + "epoch": 4.414346504559271, + "grad_norm": 0.0748588363741536, + "learning_rate": 4.198379193296389e-06, + "loss": 0.5273, + "step": 9076 + }, + { + "epoch": 4.414832826747721, + "grad_norm": 0.07676513184315509, + "learning_rate": 4.197435406425676e-06, + "loss": 0.5221, + "step": 9077 + }, + { + "epoch": 4.41531914893617, + "grad_norm": 0.07418360592519829, + "learning_rate": 4.196491648905118e-06, + "loss": 0.5148, + "step": 9078 + }, + { + "epoch": 4.41580547112462, + "grad_norm": 0.07492311173844467, + "learning_rate": 4.195547920769222e-06, + "loss": 0.4938, + "step": 9079 + }, + { + "epoch": 4.41629179331307, + "grad_norm": 0.07540831221269857, + "learning_rate": 4.194604222052507e-06, + "loss": 0.5545, + "step": 9080 + }, + { + "epoch": 4.41677811550152, + "grad_norm": 0.07675563697465707, + "learning_rate": 4.193660552789479e-06, + "loss": 0.4857, + "step": 9081 + }, + { + "epoch": 4.41726443768997, + "grad_norm": 0.07457433600622512, + "learning_rate": 4.192716913014653e-06, + "loss": 0.4946, + "step": 9082 + }, + { + "epoch": 4.41775075987842, + "grad_norm": 0.07376700804617603, + "learning_rate": 4.191773302762534e-06, + "loss": 0.5152, + "step": 9083 + }, + { + "epoch": 4.418237082066869, + "grad_norm": 0.07308860634821253, + "learning_rate": 4.1908297220676345e-06, + "loss": 0.5163, + "step": 9084 + }, + { + "epoch": 4.418723404255319, + "grad_norm": 0.07214967881111042, + "learning_rate": 4.189886170964458e-06, + "loss": 0.4916, + "step": 9085 + }, + { + "epoch": 4.419209726443769, + "grad_norm": 0.07577197842988086, + "learning_rate": 4.188942649487514e-06, + "loss": 0.556, + "step": 9086 + }, + { + "epoch": 4.419696048632219, + "grad_norm": 0.07608259734300477, + "learning_rate": 4.187999157671304e-06, + "loss": 0.5152, + "step": 9087 + }, + { + "epoch": 4.420182370820669, + "grad_norm": 0.0731186727929142, + "learning_rate": 4.187055695550335e-06, + "loss": 0.5331, + "step": 9088 + }, + { + "epoch": 4.420668693009119, + "grad_norm": 0.07718537948198227, + "learning_rate": 4.186112263159108e-06, + "loss": 0.5044, + "step": 9089 + }, + { + "epoch": 4.421155015197568, + "grad_norm": 0.07662218528622265, + "learning_rate": 4.185168860532127e-06, + "loss": 0.51, + "step": 9090 + }, + { + "epoch": 4.421641337386018, + "grad_norm": 0.07394124837735269, + "learning_rate": 4.184225487703888e-06, + "loss": 0.5229, + "step": 9091 + }, + { + "epoch": 4.422127659574468, + "grad_norm": 0.0719442143277997, + "learning_rate": 4.183282144708897e-06, + "loss": 0.5093, + "step": 9092 + }, + { + "epoch": 4.422613981762918, + "grad_norm": 0.0761258445087453, + "learning_rate": 4.182338831581646e-06, + "loss": 0.4985, + "step": 9093 + }, + { + "epoch": 4.423100303951368, + "grad_norm": 0.07419935530040028, + "learning_rate": 4.181395548356636e-06, + "loss": 0.4965, + "step": 9094 + }, + { + "epoch": 4.423586626139818, + "grad_norm": 0.07374791817061643, + "learning_rate": 4.180452295068363e-06, + "loss": 0.5331, + "step": 9095 + }, + { + "epoch": 4.424072948328267, + "grad_norm": 0.07711191890342523, + "learning_rate": 4.179509071751323e-06, + "loss": 0.5105, + "step": 9096 + }, + { + "epoch": 4.424559270516717, + "grad_norm": 0.07513900914892778, + "learning_rate": 4.1785658784400076e-06, + "loss": 0.5275, + "step": 9097 + }, + { + "epoch": 4.4250455927051675, + "grad_norm": 0.07296096693191595, + "learning_rate": 4.177622715168911e-06, + "loss": 0.5221, + "step": 9098 + }, + { + "epoch": 4.425531914893617, + "grad_norm": 0.07262837212831806, + "learning_rate": 4.176679581972526e-06, + "loss": 0.487, + "step": 9099 + }, + { + "epoch": 4.426018237082067, + "grad_norm": 0.07586383156011682, + "learning_rate": 4.175736478885342e-06, + "loss": 0.5459, + "step": 9100 + }, + { + "epoch": 4.426504559270517, + "grad_norm": 0.07458208431354382, + "learning_rate": 4.1747934059418514e-06, + "loss": 0.5236, + "step": 9101 + }, + { + "epoch": 4.426990881458966, + "grad_norm": 0.07112016482307837, + "learning_rate": 4.173850363176539e-06, + "loss": 0.479, + "step": 9102 + }, + { + "epoch": 4.427477203647417, + "grad_norm": 0.07387071893359579, + "learning_rate": 4.172907350623896e-06, + "loss": 0.5217, + "step": 9103 + }, + { + "epoch": 4.4279635258358665, + "grad_norm": 0.07329809776277209, + "learning_rate": 4.171964368318404e-06, + "loss": 0.4707, + "step": 9104 + }, + { + "epoch": 4.428449848024316, + "grad_norm": 0.07696985784682245, + "learning_rate": 4.171021416294555e-06, + "loss": 0.5128, + "step": 9105 + }, + { + "epoch": 4.428936170212766, + "grad_norm": 0.07378305836485363, + "learning_rate": 4.170078494586826e-06, + "loss": 0.4887, + "step": 9106 + }, + { + "epoch": 4.4294224924012155, + "grad_norm": 0.0778186705889417, + "learning_rate": 4.169135603229707e-06, + "loss": 0.5055, + "step": 9107 + }, + { + "epoch": 4.429908814589666, + "grad_norm": 0.07586478389881635, + "learning_rate": 4.168192742257674e-06, + "loss": 0.5213, + "step": 9108 + }, + { + "epoch": 4.430395136778116, + "grad_norm": 0.07335612360719625, + "learning_rate": 4.16724991170521e-06, + "loss": 0.5303, + "step": 9109 + }, + { + "epoch": 4.4308814589665655, + "grad_norm": 0.07657735512603027, + "learning_rate": 4.166307111606795e-06, + "loss": 0.5249, + "step": 9110 + }, + { + "epoch": 4.431367781155015, + "grad_norm": 0.07205231498610169, + "learning_rate": 4.1653643419969104e-06, + "loss": 0.4908, + "step": 9111 + }, + { + "epoch": 4.431854103343465, + "grad_norm": 0.07438732692377771, + "learning_rate": 4.164421602910028e-06, + "loss": 0.5031, + "step": 9112 + }, + { + "epoch": 4.4323404255319145, + "grad_norm": 0.07292292153829716, + "learning_rate": 4.1634788943806286e-06, + "loss": 0.4933, + "step": 9113 + }, + { + "epoch": 4.432826747720365, + "grad_norm": 0.07272489464759248, + "learning_rate": 4.162536216443185e-06, + "loss": 0.505, + "step": 9114 + }, + { + "epoch": 4.433313069908815, + "grad_norm": 0.075104762108542, + "learning_rate": 4.161593569132175e-06, + "loss": 0.5465, + "step": 9115 + }, + { + "epoch": 4.4337993920972645, + "grad_norm": 0.0707235435697312, + "learning_rate": 4.1606509524820666e-06, + "loss": 0.4663, + "step": 9116 + }, + { + "epoch": 4.434285714285714, + "grad_norm": 0.0727043044237745, + "learning_rate": 4.159708366527337e-06, + "loss": 0.5259, + "step": 9117 + }, + { + "epoch": 4.434772036474164, + "grad_norm": 0.07320888965153415, + "learning_rate": 4.1587658113024505e-06, + "loss": 0.5052, + "step": 9118 + }, + { + "epoch": 4.4352583586626135, + "grad_norm": 0.07528755572832935, + "learning_rate": 4.157823286841882e-06, + "loss": 0.5339, + "step": 9119 + }, + { + "epoch": 4.435744680851064, + "grad_norm": 0.07464732244225625, + "learning_rate": 4.156880793180098e-06, + "loss": 0.5035, + "step": 9120 + }, + { + "epoch": 4.436231003039514, + "grad_norm": 0.0725134205794363, + "learning_rate": 4.155938330351569e-06, + "loss": 0.4872, + "step": 9121 + }, + { + "epoch": 4.4367173252279635, + "grad_norm": 0.07390102078467314, + "learning_rate": 4.154995898390756e-06, + "loss": 0.5077, + "step": 9122 + }, + { + "epoch": 4.437203647416413, + "grad_norm": 0.07638143174560055, + "learning_rate": 4.1540534973321275e-06, + "loss": 0.5516, + "step": 9123 + }, + { + "epoch": 4.437689969604863, + "grad_norm": 0.07546406710573061, + "learning_rate": 4.153111127210147e-06, + "loss": 0.5159, + "step": 9124 + }, + { + "epoch": 4.438176291793313, + "grad_norm": 0.07613944887188974, + "learning_rate": 4.152168788059276e-06, + "loss": 0.5202, + "step": 9125 + }, + { + "epoch": 4.438662613981763, + "grad_norm": 0.07222537282958809, + "learning_rate": 4.151226479913981e-06, + "loss": 0.4715, + "step": 9126 + }, + { + "epoch": 4.439148936170213, + "grad_norm": 0.07233969747422339, + "learning_rate": 4.150284202808716e-06, + "loss": 0.5035, + "step": 9127 + }, + { + "epoch": 4.4396352583586625, + "grad_norm": 0.07434789901175949, + "learning_rate": 4.149341956777945e-06, + "loss": 0.4909, + "step": 9128 + }, + { + "epoch": 4.440121580547112, + "grad_norm": 0.07462667558137118, + "learning_rate": 4.148399741856125e-06, + "loss": 0.5049, + "step": 9129 + }, + { + "epoch": 4.440607902735563, + "grad_norm": 0.0764114471862437, + "learning_rate": 4.1474575580777145e-06, + "loss": 0.528, + "step": 9130 + }, + { + "epoch": 4.441094224924012, + "grad_norm": 0.07302644598794808, + "learning_rate": 4.1465154054771674e-06, + "loss": 0.4914, + "step": 9131 + }, + { + "epoch": 4.441580547112462, + "grad_norm": 0.07260935998242737, + "learning_rate": 4.145573284088941e-06, + "loss": 0.4985, + "step": 9132 + }, + { + "epoch": 4.442066869300912, + "grad_norm": 0.07734936149770032, + "learning_rate": 4.144631193947485e-06, + "loss": 0.5558, + "step": 9133 + }, + { + "epoch": 4.4425531914893615, + "grad_norm": 0.07487387208687873, + "learning_rate": 4.143689135087257e-06, + "loss": 0.5228, + "step": 9134 + }, + { + "epoch": 4.443039513677811, + "grad_norm": 0.077470120227211, + "learning_rate": 4.142747107542705e-06, + "loss": 0.5251, + "step": 9135 + }, + { + "epoch": 4.443525835866262, + "grad_norm": 0.0733020601161558, + "learning_rate": 4.1418051113482825e-06, + "loss": 0.5144, + "step": 9136 + }, + { + "epoch": 4.444012158054711, + "grad_norm": 0.07715403605423381, + "learning_rate": 4.1408631465384355e-06, + "loss": 0.5324, + "step": 9137 + }, + { + "epoch": 4.444498480243161, + "grad_norm": 0.07398415580005642, + "learning_rate": 4.139921213147614e-06, + "loss": 0.5243, + "step": 9138 + }, + { + "epoch": 4.444984802431611, + "grad_norm": 0.0728185274545478, + "learning_rate": 4.138979311210264e-06, + "loss": 0.4566, + "step": 9139 + }, + { + "epoch": 4.44547112462006, + "grad_norm": 0.07506445678451476, + "learning_rate": 4.138037440760834e-06, + "loss": 0.4988, + "step": 9140 + }, + { + "epoch": 4.445957446808511, + "grad_norm": 0.08126597511347414, + "learning_rate": 4.1370956018337635e-06, + "loss": 0.5583, + "step": 9141 + }, + { + "epoch": 4.446443768996961, + "grad_norm": 0.07539548085363766, + "learning_rate": 4.136153794463501e-06, + "loss": 0.5486, + "step": 9142 + }, + { + "epoch": 4.44693009118541, + "grad_norm": 0.07510361004220807, + "learning_rate": 4.135212018684485e-06, + "loss": 0.5328, + "step": 9143 + }, + { + "epoch": 4.44741641337386, + "grad_norm": 0.07523356817819044, + "learning_rate": 4.13427027453116e-06, + "loss": 0.5251, + "step": 9144 + }, + { + "epoch": 4.44790273556231, + "grad_norm": 0.07719522285238851, + "learning_rate": 4.133328562037962e-06, + "loss": 0.5294, + "step": 9145 + }, + { + "epoch": 4.448389057750759, + "grad_norm": 0.07383354379769586, + "learning_rate": 4.132386881239336e-06, + "loss": 0.4959, + "step": 9146 + }, + { + "epoch": 4.44887537993921, + "grad_norm": 0.07520400574567981, + "learning_rate": 4.131445232169713e-06, + "loss": 0.528, + "step": 9147 + }, + { + "epoch": 4.44936170212766, + "grad_norm": 0.07279881023644758, + "learning_rate": 4.1305036148635334e-06, + "loss": 0.4789, + "step": 9148 + }, + { + "epoch": 4.449848024316109, + "grad_norm": 0.07578316685053149, + "learning_rate": 4.12956202935523e-06, + "loss": 0.5305, + "step": 9149 + }, + { + "epoch": 4.450334346504559, + "grad_norm": 0.0745941043817119, + "learning_rate": 4.1286204756792395e-06, + "loss": 0.49, + "step": 9150 + }, + { + "epoch": 4.450820668693009, + "grad_norm": 0.07800376845251361, + "learning_rate": 4.127678953869996e-06, + "loss": 0.5364, + "step": 9151 + }, + { + "epoch": 4.451306990881459, + "grad_norm": 0.07433122862396811, + "learning_rate": 4.126737463961927e-06, + "loss": 0.5417, + "step": 9152 + }, + { + "epoch": 4.451793313069909, + "grad_norm": 0.0738548815111225, + "learning_rate": 4.125796005989468e-06, + "loss": 0.5151, + "step": 9153 + }, + { + "epoch": 4.452279635258359, + "grad_norm": 0.07632446306627214, + "learning_rate": 4.124854579987043e-06, + "loss": 0.515, + "step": 9154 + }, + { + "epoch": 4.452765957446808, + "grad_norm": 0.07453589290986092, + "learning_rate": 4.1239131859890875e-06, + "loss": 0.4892, + "step": 9155 + }, + { + "epoch": 4.453252279635258, + "grad_norm": 0.07465550229398378, + "learning_rate": 4.122971824030022e-06, + "loss": 0.5532, + "step": 9156 + }, + { + "epoch": 4.453738601823709, + "grad_norm": 0.07225889514780721, + "learning_rate": 4.122030494144278e-06, + "loss": 0.4755, + "step": 9157 + }, + { + "epoch": 4.454224924012158, + "grad_norm": 0.07262962775281033, + "learning_rate": 4.121089196366274e-06, + "loss": 0.4972, + "step": 9158 + }, + { + "epoch": 4.454711246200608, + "grad_norm": 0.07292766334312963, + "learning_rate": 4.12014793073044e-06, + "loss": 0.5011, + "step": 9159 + }, + { + "epoch": 4.455197568389058, + "grad_norm": 0.07464166631207429, + "learning_rate": 4.119206697271195e-06, + "loss": 0.5036, + "step": 9160 + }, + { + "epoch": 4.455683890577507, + "grad_norm": 0.07520585648564442, + "learning_rate": 4.118265496022963e-06, + "loss": 0.5312, + "step": 9161 + }, + { + "epoch": 4.456170212765957, + "grad_norm": 0.0734861239147741, + "learning_rate": 4.1173243270201604e-06, + "loss": 0.5059, + "step": 9162 + }, + { + "epoch": 4.456656534954408, + "grad_norm": 0.07200911399975939, + "learning_rate": 4.116383190297209e-06, + "loss": 0.5331, + "step": 9163 + }, + { + "epoch": 4.457142857142857, + "grad_norm": 0.07386988059053695, + "learning_rate": 4.1154420858885245e-06, + "loss": 0.5242, + "step": 9164 + }, + { + "epoch": 4.457629179331307, + "grad_norm": 0.0721244639412503, + "learning_rate": 4.1145010138285265e-06, + "loss": 0.5026, + "step": 9165 + }, + { + "epoch": 4.458115501519757, + "grad_norm": 0.07808444002853071, + "learning_rate": 4.113559974151628e-06, + "loss": 0.5318, + "step": 9166 + }, + { + "epoch": 4.458601823708206, + "grad_norm": 0.07716373419170366, + "learning_rate": 4.112618966892245e-06, + "loss": 0.5349, + "step": 9167 + }, + { + "epoch": 4.459088145896657, + "grad_norm": 0.07379194939909248, + "learning_rate": 4.111677992084787e-06, + "loss": 0.5036, + "step": 9168 + }, + { + "epoch": 4.459574468085107, + "grad_norm": 0.07348873200685176, + "learning_rate": 4.110737049763671e-06, + "loss": 0.4799, + "step": 9169 + }, + { + "epoch": 4.460060790273556, + "grad_norm": 0.07443480805290324, + "learning_rate": 4.109796139963303e-06, + "loss": 0.5147, + "step": 9170 + }, + { + "epoch": 4.460547112462006, + "grad_norm": 0.07530280503401027, + "learning_rate": 4.108855262718098e-06, + "loss": 0.5107, + "step": 9171 + }, + { + "epoch": 4.461033434650456, + "grad_norm": 0.07434427440289688, + "learning_rate": 4.107914418062457e-06, + "loss": 0.4979, + "step": 9172 + }, + { + "epoch": 4.461519756838905, + "grad_norm": 0.07564531306472663, + "learning_rate": 4.106973606030793e-06, + "loss": 0.4851, + "step": 9173 + }, + { + "epoch": 4.462006079027356, + "grad_norm": 0.07146423142950971, + "learning_rate": 4.106032826657509e-06, + "loss": 0.4865, + "step": 9174 + }, + { + "epoch": 4.462492401215806, + "grad_norm": 0.07283468829760416, + "learning_rate": 4.105092079977012e-06, + "loss": 0.5039, + "step": 9175 + }, + { + "epoch": 4.462978723404255, + "grad_norm": 0.07231614855905573, + "learning_rate": 4.104151366023703e-06, + "loss": 0.5062, + "step": 9176 + }, + { + "epoch": 4.463465045592705, + "grad_norm": 0.07746994349434007, + "learning_rate": 4.1032106848319856e-06, + "loss": 0.5327, + "step": 9177 + }, + { + "epoch": 4.463951367781155, + "grad_norm": 0.07449258666988649, + "learning_rate": 4.102270036436261e-06, + "loss": 0.4585, + "step": 9178 + }, + { + "epoch": 4.464437689969605, + "grad_norm": 0.07664642111752314, + "learning_rate": 4.101329420870929e-06, + "loss": 0.5163, + "step": 9179 + }, + { + "epoch": 4.464924012158055, + "grad_norm": 0.07225118597422747, + "learning_rate": 4.100388838170389e-06, + "loss": 0.495, + "step": 9180 + }, + { + "epoch": 4.465410334346505, + "grad_norm": 0.07381462558186154, + "learning_rate": 4.099448288369037e-06, + "loss": 0.4948, + "step": 9181 + }, + { + "epoch": 4.465896656534954, + "grad_norm": 0.07564861456899609, + "learning_rate": 4.098507771501272e-06, + "loss": 0.5184, + "step": 9182 + }, + { + "epoch": 4.466382978723404, + "grad_norm": 0.07242232205109025, + "learning_rate": 4.097567287601485e-06, + "loss": 0.5441, + "step": 9183 + }, + { + "epoch": 4.4668693009118545, + "grad_norm": 0.07713014278620212, + "learning_rate": 4.096626836704074e-06, + "loss": 0.5196, + "step": 9184 + }, + { + "epoch": 4.467355623100304, + "grad_norm": 0.0759162521295131, + "learning_rate": 4.095686418843429e-06, + "loss": 0.527, + "step": 9185 + }, + { + "epoch": 4.467841945288754, + "grad_norm": 0.07263596208204152, + "learning_rate": 4.094746034053945e-06, + "loss": 0.5117, + "step": 9186 + }, + { + "epoch": 4.468328267477204, + "grad_norm": 0.07378193519869722, + "learning_rate": 4.093805682370007e-06, + "loss": 0.4877, + "step": 9187 + }, + { + "epoch": 4.468814589665653, + "grad_norm": 0.0740858492601317, + "learning_rate": 4.092865363826007e-06, + "loss": 0.5236, + "step": 9188 + }, + { + "epoch": 4.469300911854103, + "grad_norm": 0.07324258186699761, + "learning_rate": 4.091925078456333e-06, + "loss": 0.4921, + "step": 9189 + }, + { + "epoch": 4.4697872340425535, + "grad_norm": 0.076402138535962, + "learning_rate": 4.090984826295373e-06, + "loss": 0.5216, + "step": 9190 + }, + { + "epoch": 4.470273556231003, + "grad_norm": 0.07585259376944907, + "learning_rate": 4.090044607377509e-06, + "loss": 0.5514, + "step": 9191 + }, + { + "epoch": 4.470759878419453, + "grad_norm": 0.07439737611996425, + "learning_rate": 4.089104421737128e-06, + "loss": 0.5286, + "step": 9192 + }, + { + "epoch": 4.471246200607903, + "grad_norm": 0.07364013295426698, + "learning_rate": 4.088164269408612e-06, + "loss": 0.5298, + "step": 9193 + }, + { + "epoch": 4.471732522796352, + "grad_norm": 0.0754632171597059, + "learning_rate": 4.087224150426344e-06, + "loss": 0.4934, + "step": 9194 + }, + { + "epoch": 4.472218844984803, + "grad_norm": 0.07784454436577268, + "learning_rate": 4.086284064824702e-06, + "loss": 0.5239, + "step": 9195 + }, + { + "epoch": 4.4727051671732525, + "grad_norm": 0.0759475669798295, + "learning_rate": 4.085344012638067e-06, + "loss": 0.556, + "step": 9196 + }, + { + "epoch": 4.473191489361702, + "grad_norm": 0.07365999847006834, + "learning_rate": 4.084403993900818e-06, + "loss": 0.5159, + "step": 9197 + }, + { + "epoch": 4.473677811550152, + "grad_norm": 0.07443198087071798, + "learning_rate": 4.083464008647331e-06, + "loss": 0.497, + "step": 9198 + }, + { + "epoch": 4.474164133738602, + "grad_norm": 0.0735587635551236, + "learning_rate": 4.0825240569119795e-06, + "loss": 0.5115, + "step": 9199 + }, + { + "epoch": 4.474650455927051, + "grad_norm": 0.0713216298603689, + "learning_rate": 4.081584138729144e-06, + "loss": 0.4819, + "step": 9200 + }, + { + "epoch": 4.475136778115502, + "grad_norm": 0.07256181532494116, + "learning_rate": 4.080644254133189e-06, + "loss": 0.5194, + "step": 9201 + }, + { + "epoch": 4.4756231003039515, + "grad_norm": 0.07280272785430154, + "learning_rate": 4.0797044031584935e-06, + "loss": 0.4993, + "step": 9202 + }, + { + "epoch": 4.476109422492401, + "grad_norm": 0.07679057200614148, + "learning_rate": 4.078764585839426e-06, + "loss": 0.5227, + "step": 9203 + }, + { + "epoch": 4.476595744680851, + "grad_norm": 0.07774980929133844, + "learning_rate": 4.077824802210356e-06, + "loss": 0.5296, + "step": 9204 + }, + { + "epoch": 4.4770820668693005, + "grad_norm": 0.07598749068598204, + "learning_rate": 4.076885052305654e-06, + "loss": 0.5005, + "step": 9205 + }, + { + "epoch": 4.477568389057751, + "grad_norm": 0.07414400475947454, + "learning_rate": 4.075945336159682e-06, + "loss": 0.4663, + "step": 9206 + }, + { + "epoch": 4.478054711246201, + "grad_norm": 0.07712391417719938, + "learning_rate": 4.07500565380681e-06, + "loss": 0.5411, + "step": 9207 + }, + { + "epoch": 4.4785410334346505, + "grad_norm": 0.07747465468414004, + "learning_rate": 4.0740660052814e-06, + "loss": 0.4983, + "step": 9208 + }, + { + "epoch": 4.4790273556231, + "grad_norm": 0.07303760276416392, + "learning_rate": 4.073126390617821e-06, + "loss": 0.502, + "step": 9209 + }, + { + "epoch": 4.47951367781155, + "grad_norm": 0.07784725598303321, + "learning_rate": 4.0721868098504275e-06, + "loss": 0.5367, + "step": 9210 + }, + { + "epoch": 4.48, + "grad_norm": 0.07456733990797919, + "learning_rate": 4.0712472630135865e-06, + "loss": 0.5083, + "step": 9211 + }, + { + "epoch": 4.48048632218845, + "grad_norm": 0.074913364572263, + "learning_rate": 4.070307750141652e-06, + "loss": 0.5359, + "step": 9212 + }, + { + "epoch": 4.4809726443769, + "grad_norm": 0.07695317591547961, + "learning_rate": 4.069368271268987e-06, + "loss": 0.5217, + "step": 9213 + }, + { + "epoch": 4.4814589665653495, + "grad_norm": 0.07521774877799299, + "learning_rate": 4.068428826429946e-06, + "loss": 0.5091, + "step": 9214 + }, + { + "epoch": 4.481945288753799, + "grad_norm": 0.07308852502263749, + "learning_rate": 4.067489415658889e-06, + "loss": 0.5196, + "step": 9215 + }, + { + "epoch": 4.482431610942249, + "grad_norm": 0.07787148826527195, + "learning_rate": 4.066550038990165e-06, + "loss": 0.5112, + "step": 9216 + }, + { + "epoch": 4.482917933130699, + "grad_norm": 0.0778526721759418, + "learning_rate": 4.065610696458131e-06, + "loss": 0.5037, + "step": 9217 + }, + { + "epoch": 4.483404255319149, + "grad_norm": 0.07498795125566858, + "learning_rate": 4.064671388097138e-06, + "loss": 0.5204, + "step": 9218 + }, + { + "epoch": 4.483890577507599, + "grad_norm": 0.07753354247840392, + "learning_rate": 4.063732113941539e-06, + "loss": 0.5376, + "step": 9219 + }, + { + "epoch": 4.4843768996960485, + "grad_norm": 0.0720535942705819, + "learning_rate": 4.062792874025679e-06, + "loss": 0.5175, + "step": 9220 + }, + { + "epoch": 4.484863221884498, + "grad_norm": 0.07461435039785043, + "learning_rate": 4.061853668383912e-06, + "loss": 0.5132, + "step": 9221 + }, + { + "epoch": 4.485349544072949, + "grad_norm": 0.07358080704390886, + "learning_rate": 4.0609144970505805e-06, + "loss": 0.5085, + "step": 9222 + }, + { + "epoch": 4.485835866261398, + "grad_norm": 0.07570764144735304, + "learning_rate": 4.059975360060035e-06, + "loss": 0.5357, + "step": 9223 + }, + { + "epoch": 4.486322188449848, + "grad_norm": 0.07383572852496767, + "learning_rate": 4.059036257446614e-06, + "loss": 0.479, + "step": 9224 + }, + { + "epoch": 4.486808510638298, + "grad_norm": 0.07400357054456261, + "learning_rate": 4.058097189244669e-06, + "loss": 0.5237, + "step": 9225 + }, + { + "epoch": 4.4872948328267475, + "grad_norm": 0.07417429522100236, + "learning_rate": 4.0571581554885345e-06, + "loss": 0.4767, + "step": 9226 + }, + { + "epoch": 4.487781155015197, + "grad_norm": 0.07385172551776475, + "learning_rate": 4.056219156212556e-06, + "loss": 0.5152, + "step": 9227 + }, + { + "epoch": 4.488267477203648, + "grad_norm": 0.07661381870005167, + "learning_rate": 4.05528019145107e-06, + "loss": 0.5163, + "step": 9228 + }, + { + "epoch": 4.488753799392097, + "grad_norm": 0.07608187355579815, + "learning_rate": 4.054341261238418e-06, + "loss": 0.555, + "step": 9229 + }, + { + "epoch": 4.489240121580547, + "grad_norm": 0.07300010804799278, + "learning_rate": 4.053402365608936e-06, + "loss": 0.513, + "step": 9230 + }, + { + "epoch": 4.489726443768997, + "grad_norm": 0.0742954532691905, + "learning_rate": 4.052463504596959e-06, + "loss": 0.4973, + "step": 9231 + }, + { + "epoch": 4.4902127659574465, + "grad_norm": 0.07468256668069106, + "learning_rate": 4.051524678236822e-06, + "loss": 0.5236, + "step": 9232 + }, + { + "epoch": 4.490699088145897, + "grad_norm": 0.07388177110272309, + "learning_rate": 4.050585886562858e-06, + "loss": 0.5061, + "step": 9233 + }, + { + "epoch": 4.491185410334347, + "grad_norm": 0.07505008850314789, + "learning_rate": 4.0496471296094016e-06, + "loss": 0.5538, + "step": 9234 + }, + { + "epoch": 4.491671732522796, + "grad_norm": 0.07241490035379888, + "learning_rate": 4.048708407410779e-06, + "loss": 0.516, + "step": 9235 + }, + { + "epoch": 4.492158054711246, + "grad_norm": 0.07481466981765436, + "learning_rate": 4.047769720001323e-06, + "loss": 0.5291, + "step": 9236 + }, + { + "epoch": 4.492644376899696, + "grad_norm": 0.07696794365658906, + "learning_rate": 4.046831067415361e-06, + "loss": 0.5403, + "step": 9237 + }, + { + "epoch": 4.493130699088146, + "grad_norm": 0.07390932000186048, + "learning_rate": 4.045892449687221e-06, + "loss": 0.4975, + "step": 9238 + }, + { + "epoch": 4.493617021276596, + "grad_norm": 0.07272641882298463, + "learning_rate": 4.044953866851226e-06, + "loss": 0.5159, + "step": 9239 + }, + { + "epoch": 4.494103343465046, + "grad_norm": 0.07721302557672421, + "learning_rate": 4.044015318941705e-06, + "loss": 0.5379, + "step": 9240 + }, + { + "epoch": 4.494589665653495, + "grad_norm": 0.07207208181933604, + "learning_rate": 4.043076805992974e-06, + "loss": 0.495, + "step": 9241 + }, + { + "epoch": 4.495075987841945, + "grad_norm": 0.07309043489005235, + "learning_rate": 4.042138328039361e-06, + "loss": 0.4869, + "step": 9242 + }, + { + "epoch": 4.495562310030395, + "grad_norm": 0.07267955554950069, + "learning_rate": 4.041199885115183e-06, + "loss": 0.4976, + "step": 9243 + }, + { + "epoch": 4.496048632218845, + "grad_norm": 0.07215341886535027, + "learning_rate": 4.040261477254763e-06, + "loss": 0.5073, + "step": 9244 + }, + { + "epoch": 4.496534954407295, + "grad_norm": 0.07333871415941677, + "learning_rate": 4.039323104492415e-06, + "loss": 0.4911, + "step": 9245 + }, + { + "epoch": 4.497021276595745, + "grad_norm": 0.07181224526645402, + "learning_rate": 4.0383847668624584e-06, + "loss": 0.5093, + "step": 9246 + }, + { + "epoch": 4.497507598784194, + "grad_norm": 0.07234753626864866, + "learning_rate": 4.037446464399207e-06, + "loss": 0.4835, + "step": 9247 + }, + { + "epoch": 4.497993920972644, + "grad_norm": 0.07227323987217513, + "learning_rate": 4.036508197136978e-06, + "loss": 0.5056, + "step": 9248 + }, + { + "epoch": 4.498480243161095, + "grad_norm": 0.07422656757214123, + "learning_rate": 4.03556996511008e-06, + "loss": 0.5374, + "step": 9249 + }, + { + "epoch": 4.498966565349544, + "grad_norm": 0.0744136777963956, + "learning_rate": 4.034631768352828e-06, + "loss": 0.5326, + "step": 9250 + }, + { + "epoch": 4.499452887537994, + "grad_norm": 0.07492827930506817, + "learning_rate": 4.0336936068995286e-06, + "loss": 0.4969, + "step": 9251 + }, + { + "epoch": 4.499939209726444, + "grad_norm": 0.07223839735174606, + "learning_rate": 4.032755480784494e-06, + "loss": 0.502, + "step": 9252 + }, + { + "epoch": 4.499939209726444, + "eval_loss": 0.5691717267036438, + "eval_runtime": 105.0958, + "eval_samples_per_second": 288.813, + "eval_steps_per_second": 36.11, + "step": 9252 + }, + { + "epoch": 4.500425531914893, + "grad_norm": 0.07197295245632464, + "learning_rate": 4.031817390042031e-06, + "loss": 0.5173, + "step": 9253 + }, + { + "epoch": 4.500911854103343, + "grad_norm": 0.0718166851303264, + "learning_rate": 4.030879334706447e-06, + "loss": 0.4802, + "step": 9254 + }, + { + "epoch": 4.501398176291794, + "grad_norm": 0.07657006337369, + "learning_rate": 4.0299413148120444e-06, + "loss": 0.5236, + "step": 9255 + }, + { + "epoch": 4.501884498480243, + "grad_norm": 0.07434252278578819, + "learning_rate": 4.029003330393128e-06, + "loss": 0.5149, + "step": 9256 + }, + { + "epoch": 4.502370820668693, + "grad_norm": 0.07425606058979606, + "learning_rate": 4.028065381484002e-06, + "loss": 0.5242, + "step": 9257 + }, + { + "epoch": 4.502857142857143, + "grad_norm": 0.07405844853871049, + "learning_rate": 4.027127468118967e-06, + "loss": 0.5132, + "step": 9258 + }, + { + "epoch": 4.503343465045592, + "grad_norm": 0.07387535955120854, + "learning_rate": 4.026189590332323e-06, + "loss": 0.5079, + "step": 9259 + }, + { + "epoch": 4.503829787234043, + "grad_norm": 0.07316425652515, + "learning_rate": 4.0252517481583665e-06, + "loss": 0.5006, + "step": 9260 + }, + { + "epoch": 4.504316109422493, + "grad_norm": 0.07202150422096368, + "learning_rate": 4.024313941631397e-06, + "loss": 0.5001, + "step": 9261 + }, + { + "epoch": 4.504802431610942, + "grad_norm": 0.07641129960121701, + "learning_rate": 4.02337617078571e-06, + "loss": 0.5077, + "step": 9262 + }, + { + "epoch": 4.505288753799392, + "grad_norm": 0.07383331947767907, + "learning_rate": 4.022438435655601e-06, + "loss": 0.4902, + "step": 9263 + }, + { + "epoch": 4.505775075987842, + "grad_norm": 0.07296262164701718, + "learning_rate": 4.021500736275361e-06, + "loss": 0.5149, + "step": 9264 + }, + { + "epoch": 4.506261398176292, + "grad_norm": 0.07448444653916979, + "learning_rate": 4.020563072679286e-06, + "loss": 0.5326, + "step": 9265 + }, + { + "epoch": 4.506747720364742, + "grad_norm": 0.07518831906037178, + "learning_rate": 4.019625444901662e-06, + "loss": 0.5261, + "step": 9266 + }, + { + "epoch": 4.507234042553192, + "grad_norm": 0.07417316446330552, + "learning_rate": 4.018687852976783e-06, + "loss": 0.5186, + "step": 9267 + }, + { + "epoch": 4.507720364741641, + "grad_norm": 0.0762097291840639, + "learning_rate": 4.017750296938932e-06, + "loss": 0.5009, + "step": 9268 + }, + { + "epoch": 4.508206686930091, + "grad_norm": 0.07357603364539657, + "learning_rate": 4.016812776822402e-06, + "loss": 0.4909, + "step": 9269 + }, + { + "epoch": 4.508693009118541, + "grad_norm": 0.07095681142857135, + "learning_rate": 4.015875292661474e-06, + "loss": 0.4836, + "step": 9270 + }, + { + "epoch": 4.509179331306991, + "grad_norm": 0.07782831667284015, + "learning_rate": 4.014937844490434e-06, + "loss": 0.5203, + "step": 9271 + }, + { + "epoch": 4.509665653495441, + "grad_norm": 0.07693551833551153, + "learning_rate": 4.014000432343563e-06, + "loss": 0.5138, + "step": 9272 + }, + { + "epoch": 4.510151975683891, + "grad_norm": 0.07675372753590344, + "learning_rate": 4.013063056255147e-06, + "loss": 0.5203, + "step": 9273 + }, + { + "epoch": 4.51063829787234, + "grad_norm": 0.07304918207798498, + "learning_rate": 4.012125716259461e-06, + "loss": 0.4991, + "step": 9274 + }, + { + "epoch": 4.51112462006079, + "grad_norm": 0.07635546349615119, + "learning_rate": 4.011188412390788e-06, + "loss": 0.5158, + "step": 9275 + }, + { + "epoch": 4.51161094224924, + "grad_norm": 0.07276277522100671, + "learning_rate": 4.0102511446834025e-06, + "loss": 0.5027, + "step": 9276 + }, + { + "epoch": 4.51209726443769, + "grad_norm": 0.07252339477070273, + "learning_rate": 4.009313913171584e-06, + "loss": 0.4951, + "step": 9277 + }, + { + "epoch": 4.51258358662614, + "grad_norm": 0.0737220120378319, + "learning_rate": 4.0083767178896046e-06, + "loss": 0.5125, + "step": 9278 + }, + { + "epoch": 4.51306990881459, + "grad_norm": 0.07208568380818856, + "learning_rate": 4.0074395588717406e-06, + "loss": 0.5043, + "step": 9279 + }, + { + "epoch": 4.513556231003039, + "grad_norm": 0.07384039444594008, + "learning_rate": 4.0065024361522606e-06, + "loss": 0.5052, + "step": 9280 + }, + { + "epoch": 4.514042553191489, + "grad_norm": 0.07356824048246989, + "learning_rate": 4.005565349765438e-06, + "loss": 0.5011, + "step": 9281 + }, + { + "epoch": 4.5145288753799395, + "grad_norm": 0.07471788020450786, + "learning_rate": 4.004628299745544e-06, + "loss": 0.5001, + "step": 9282 + }, + { + "epoch": 4.515015197568389, + "grad_norm": 0.0710858693953974, + "learning_rate": 4.0036912861268434e-06, + "loss": 0.465, + "step": 9283 + }, + { + "epoch": 4.515501519756839, + "grad_norm": 0.0720420701395051, + "learning_rate": 4.002754308943608e-06, + "loss": 0.4945, + "step": 9284 + }, + { + "epoch": 4.515987841945289, + "grad_norm": 0.07574934255713653, + "learning_rate": 4.001817368230098e-06, + "loss": 0.522, + "step": 9285 + }, + { + "epoch": 4.516474164133738, + "grad_norm": 0.07582312734836276, + "learning_rate": 4.000880464020582e-06, + "loss": 0.5324, + "step": 9286 + }, + { + "epoch": 4.516960486322189, + "grad_norm": 0.07194231379733766, + "learning_rate": 3.9999435963493195e-06, + "loss": 0.473, + "step": 9287 + }, + { + "epoch": 4.5174468085106385, + "grad_norm": 0.07297646560307082, + "learning_rate": 3.999006765250576e-06, + "loss": 0.5206, + "step": 9288 + }, + { + "epoch": 4.517933130699088, + "grad_norm": 0.07835034276835257, + "learning_rate": 3.998069970758609e-06, + "loss": 0.5298, + "step": 9289 + }, + { + "epoch": 4.518419452887538, + "grad_norm": 0.07519668264825975, + "learning_rate": 3.997133212907679e-06, + "loss": 0.5356, + "step": 9290 + }, + { + "epoch": 4.518905775075988, + "grad_norm": 0.0746168240062158, + "learning_rate": 3.996196491732041e-06, + "loss": 0.5438, + "step": 9291 + }, + { + "epoch": 4.519392097264438, + "grad_norm": 0.0732138021003351, + "learning_rate": 3.995259807265956e-06, + "loss": 0.4903, + "step": 9292 + }, + { + "epoch": 4.519878419452888, + "grad_norm": 0.0773828109032807, + "learning_rate": 3.994323159543675e-06, + "loss": 0.5311, + "step": 9293 + }, + { + "epoch": 4.5203647416413375, + "grad_norm": 0.07487883981499747, + "learning_rate": 3.993386548599454e-06, + "loss": 0.5258, + "step": 9294 + }, + { + "epoch": 4.520851063829787, + "grad_norm": 0.07321492962793047, + "learning_rate": 3.992449974467542e-06, + "loss": 0.4825, + "step": 9295 + }, + { + "epoch": 4.521337386018237, + "grad_norm": 0.07315931764974398, + "learning_rate": 3.9915134371821936e-06, + "loss": 0.5136, + "step": 9296 + }, + { + "epoch": 4.5218237082066866, + "grad_norm": 0.07636731854830328, + "learning_rate": 3.9905769367776564e-06, + "loss": 0.5277, + "step": 9297 + }, + { + "epoch": 4.522310030395137, + "grad_norm": 0.07386656265960556, + "learning_rate": 3.989640473288181e-06, + "loss": 0.5099, + "step": 9298 + }, + { + "epoch": 4.522796352583587, + "grad_norm": 0.07308825735289265, + "learning_rate": 3.988704046748011e-06, + "loss": 0.5042, + "step": 9299 + }, + { + "epoch": 4.5232826747720365, + "grad_norm": 0.07446430230279066, + "learning_rate": 3.987767657191393e-06, + "loss": 0.5118, + "step": 9300 + }, + { + "epoch": 4.523768996960486, + "grad_norm": 0.07289579885509151, + "learning_rate": 3.986831304652572e-06, + "loss": 0.5113, + "step": 9301 + }, + { + "epoch": 4.524255319148936, + "grad_norm": 0.07435811531917867, + "learning_rate": 3.985894989165792e-06, + "loss": 0.523, + "step": 9302 + }, + { + "epoch": 4.5247416413373855, + "grad_norm": 0.07642410642259997, + "learning_rate": 3.984958710765291e-06, + "loss": 0.5088, + "step": 9303 + }, + { + "epoch": 4.525227963525836, + "grad_norm": 0.07134414286383806, + "learning_rate": 3.984022469485314e-06, + "loss": 0.4856, + "step": 9304 + }, + { + "epoch": 4.525714285714286, + "grad_norm": 0.0726318784517991, + "learning_rate": 3.983086265360093e-06, + "loss": 0.5243, + "step": 9305 + }, + { + "epoch": 4.5262006079027355, + "grad_norm": 0.07451550069135994, + "learning_rate": 3.982150098423871e-06, + "loss": 0.5336, + "step": 9306 + }, + { + "epoch": 4.526686930091185, + "grad_norm": 0.07111640653778921, + "learning_rate": 3.981213968710882e-06, + "loss": 0.4622, + "step": 9307 + }, + { + "epoch": 4.527173252279635, + "grad_norm": 0.07378757736643685, + "learning_rate": 3.9802778762553606e-06, + "loss": 0.5067, + "step": 9308 + }, + { + "epoch": 4.527659574468085, + "grad_norm": 0.07805891922187286, + "learning_rate": 3.979341821091543e-06, + "loss": 0.5494, + "step": 9309 + }, + { + "epoch": 4.528145896656535, + "grad_norm": 0.07335835696619315, + "learning_rate": 3.978405803253656e-06, + "loss": 0.5614, + "step": 9310 + }, + { + "epoch": 4.528632218844985, + "grad_norm": 0.07441263378442507, + "learning_rate": 3.977469822775934e-06, + "loss": 0.5322, + "step": 9311 + }, + { + "epoch": 4.5291185410334345, + "grad_norm": 0.0755974508599259, + "learning_rate": 3.976533879692604e-06, + "loss": 0.5199, + "step": 9312 + }, + { + "epoch": 4.529604863221884, + "grad_norm": 0.07268009756675874, + "learning_rate": 3.975597974037898e-06, + "loss": 0.5154, + "step": 9313 + }, + { + "epoch": 4.530091185410335, + "grad_norm": 0.0746747605080762, + "learning_rate": 3.974662105846036e-06, + "loss": 0.5233, + "step": 9314 + }, + { + "epoch": 4.530577507598784, + "grad_norm": 0.07389784926086358, + "learning_rate": 3.97372627515125e-06, + "loss": 0.5047, + "step": 9315 + }, + { + "epoch": 4.531063829787234, + "grad_norm": 0.07424802971303267, + "learning_rate": 3.972790481987757e-06, + "loss": 0.5011, + "step": 9316 + }, + { + "epoch": 4.531550151975684, + "grad_norm": 0.07441778022364974, + "learning_rate": 3.971854726389786e-06, + "loss": 0.5092, + "step": 9317 + }, + { + "epoch": 4.5320364741641335, + "grad_norm": 0.07310846216832277, + "learning_rate": 3.970919008391552e-06, + "loss": 0.5119, + "step": 9318 + }, + { + "epoch": 4.532522796352584, + "grad_norm": 0.07450421601150241, + "learning_rate": 3.96998332802728e-06, + "loss": 0.5388, + "step": 9319 + }, + { + "epoch": 4.533009118541034, + "grad_norm": 0.07243285938287639, + "learning_rate": 3.969047685331184e-06, + "loss": 0.4677, + "step": 9320 + }, + { + "epoch": 4.533495440729483, + "grad_norm": 0.07507835126735407, + "learning_rate": 3.9681120803374824e-06, + "loss": 0.4946, + "step": 9321 + }, + { + "epoch": 4.533981762917933, + "grad_norm": 0.07417852424322498, + "learning_rate": 3.967176513080391e-06, + "loss": 0.5269, + "step": 9322 + }, + { + "epoch": 4.534468085106383, + "grad_norm": 0.07639212451088119, + "learning_rate": 3.9662409835941245e-06, + "loss": 0.5236, + "step": 9323 + }, + { + "epoch": 4.5349544072948325, + "grad_norm": 0.07449302434528304, + "learning_rate": 3.965305491912894e-06, + "loss": 0.4932, + "step": 9324 + }, + { + "epoch": 4.535440729483283, + "grad_norm": 0.07451420604336527, + "learning_rate": 3.964370038070912e-06, + "loss": 0.5144, + "step": 9325 + }, + { + "epoch": 4.535927051671733, + "grad_norm": 0.07382092089704145, + "learning_rate": 3.963434622102387e-06, + "loss": 0.5119, + "step": 9326 + }, + { + "epoch": 4.536413373860182, + "grad_norm": 0.07692256117569182, + "learning_rate": 3.962499244041532e-06, + "loss": 0.4826, + "step": 9327 + }, + { + "epoch": 4.536899696048632, + "grad_norm": 0.0741511798853068, + "learning_rate": 3.961563903922549e-06, + "loss": 0.5144, + "step": 9328 + }, + { + "epoch": 4.537386018237082, + "grad_norm": 0.07228413136967098, + "learning_rate": 3.960628601779645e-06, + "loss": 0.4748, + "step": 9329 + }, + { + "epoch": 4.5378723404255314, + "grad_norm": 0.0728412750705643, + "learning_rate": 3.959693337647026e-06, + "loss": 0.4964, + "step": 9330 + }, + { + "epoch": 4.538358662613982, + "grad_norm": 0.07156280063182621, + "learning_rate": 3.9587581115588955e-06, + "loss": 0.4802, + "step": 9331 + }, + { + "epoch": 4.538844984802432, + "grad_norm": 0.07676877097281809, + "learning_rate": 3.957822923549452e-06, + "loss": 0.5224, + "step": 9332 + }, + { + "epoch": 4.539331306990881, + "grad_norm": 0.07512896871439582, + "learning_rate": 3.956887773652898e-06, + "loss": 0.5366, + "step": 9333 + }, + { + "epoch": 4.539817629179331, + "grad_norm": 0.0720943316031756, + "learning_rate": 3.9559526619034335e-06, + "loss": 0.4974, + "step": 9334 + }, + { + "epoch": 4.540303951367781, + "grad_norm": 0.07245508083669079, + "learning_rate": 3.955017588335252e-06, + "loss": 0.4963, + "step": 9335 + }, + { + "epoch": 4.540790273556231, + "grad_norm": 0.0721791386805876, + "learning_rate": 3.954082552982554e-06, + "loss": 0.4793, + "step": 9336 + }, + { + "epoch": 4.541276595744681, + "grad_norm": 0.07087622774420815, + "learning_rate": 3.953147555879531e-06, + "loss": 0.4854, + "step": 9337 + }, + { + "epoch": 4.541762917933131, + "grad_norm": 0.07502285430543014, + "learning_rate": 3.95221259706038e-06, + "loss": 0.4769, + "step": 9338 + }, + { + "epoch": 4.54224924012158, + "grad_norm": 0.07352153868419029, + "learning_rate": 3.951277676559288e-06, + "loss": 0.5038, + "step": 9339 + }, + { + "epoch": 4.54273556231003, + "grad_norm": 0.07330248743611278, + "learning_rate": 3.9503427944104486e-06, + "loss": 0.4877, + "step": 9340 + }, + { + "epoch": 4.543221884498481, + "grad_norm": 0.07850893172390568, + "learning_rate": 3.949407950648049e-06, + "loss": 0.5261, + "step": 9341 + }, + { + "epoch": 4.54370820668693, + "grad_norm": 0.07176517894202601, + "learning_rate": 3.94847314530628e-06, + "loss": 0.483, + "step": 9342 + }, + { + "epoch": 4.54419452887538, + "grad_norm": 0.07490126077222273, + "learning_rate": 3.9475383784193245e-06, + "loss": 0.5312, + "step": 9343 + }, + { + "epoch": 4.54468085106383, + "grad_norm": 0.07237236937699397, + "learning_rate": 3.94660365002137e-06, + "loss": 0.4961, + "step": 9344 + }, + { + "epoch": 4.545167173252279, + "grad_norm": 0.07569531788936606, + "learning_rate": 3.945668960146597e-06, + "loss": 0.5136, + "step": 9345 + }, + { + "epoch": 4.54565349544073, + "grad_norm": 0.0735559775455133, + "learning_rate": 3.944734308829189e-06, + "loss": 0.5257, + "step": 9346 + }, + { + "epoch": 4.54613981762918, + "grad_norm": 0.07403499294450266, + "learning_rate": 3.943799696103327e-06, + "loss": 0.4877, + "step": 9347 + }, + { + "epoch": 4.546626139817629, + "grad_norm": 0.07269643030045872, + "learning_rate": 3.942865122003192e-06, + "loss": 0.5263, + "step": 9348 + }, + { + "epoch": 4.547112462006079, + "grad_norm": 0.07516116892036966, + "learning_rate": 3.941930586562957e-06, + "loss": 0.5254, + "step": 9349 + }, + { + "epoch": 4.547598784194529, + "grad_norm": 0.07314509438835426, + "learning_rate": 3.940996089816803e-06, + "loss": 0.4918, + "step": 9350 + }, + { + "epoch": 4.548085106382978, + "grad_norm": 0.07614808936951598, + "learning_rate": 3.940061631798901e-06, + "loss": 0.5387, + "step": 9351 + }, + { + "epoch": 4.548571428571429, + "grad_norm": 0.07448856312515807, + "learning_rate": 3.939127212543429e-06, + "loss": 0.5041, + "step": 9352 + }, + { + "epoch": 4.549057750759879, + "grad_norm": 0.07081525501058379, + "learning_rate": 3.938192832084555e-06, + "loss": 0.4924, + "step": 9353 + }, + { + "epoch": 4.549544072948328, + "grad_norm": 0.07251093752698896, + "learning_rate": 3.937258490456453e-06, + "loss": 0.5129, + "step": 9354 + }, + { + "epoch": 4.550030395136778, + "grad_norm": 0.07530015707247262, + "learning_rate": 3.936324187693289e-06, + "loss": 0.508, + "step": 9355 + }, + { + "epoch": 4.550516717325228, + "grad_norm": 0.07527067208010085, + "learning_rate": 3.9353899238292355e-06, + "loss": 0.5068, + "step": 9356 + }, + { + "epoch": 4.551003039513677, + "grad_norm": 0.0764365355266283, + "learning_rate": 3.934455698898454e-06, + "loss": 0.542, + "step": 9357 + }, + { + "epoch": 4.551489361702128, + "grad_norm": 0.07462172737486743, + "learning_rate": 3.933521512935114e-06, + "loss": 0.5154, + "step": 9358 + }, + { + "epoch": 4.551975683890578, + "grad_norm": 0.07266794715742102, + "learning_rate": 3.932587365973374e-06, + "loss": 0.4933, + "step": 9359 + }, + { + "epoch": 4.552462006079027, + "grad_norm": 0.07297169637157461, + "learning_rate": 3.9316532580474e-06, + "loss": 0.5194, + "step": 9360 + }, + { + "epoch": 4.552948328267477, + "grad_norm": 0.07089586743175724, + "learning_rate": 3.930719189191352e-06, + "loss": 0.4788, + "step": 9361 + }, + { + "epoch": 4.553434650455927, + "grad_norm": 0.07582381698166774, + "learning_rate": 3.9297851594393874e-06, + "loss": 0.5107, + "step": 9362 + }, + { + "epoch": 4.553920972644377, + "grad_norm": 0.07816746455215838, + "learning_rate": 3.928851168825669e-06, + "loss": 0.5265, + "step": 9363 + }, + { + "epoch": 4.554407294832827, + "grad_norm": 0.07538072594197215, + "learning_rate": 3.927917217384347e-06, + "loss": 0.5224, + "step": 9364 + }, + { + "epoch": 4.554893617021277, + "grad_norm": 0.07630716360194935, + "learning_rate": 3.926983305149581e-06, + "loss": 0.5835, + "step": 9365 + }, + { + "epoch": 4.555379939209726, + "grad_norm": 0.07608038192539752, + "learning_rate": 3.926049432155522e-06, + "loss": 0.5357, + "step": 9366 + }, + { + "epoch": 4.555866261398176, + "grad_norm": 0.07340667982104773, + "learning_rate": 3.925115598436325e-06, + "loss": 0.4868, + "step": 9367 + }, + { + "epoch": 4.5563525835866265, + "grad_norm": 0.07668947030122514, + "learning_rate": 3.924181804026137e-06, + "loss": 0.5325, + "step": 9368 + }, + { + "epoch": 4.556838905775076, + "grad_norm": 0.07532392010784962, + "learning_rate": 3.9232480489591104e-06, + "loss": 0.5365, + "step": 9369 + }, + { + "epoch": 4.557325227963526, + "grad_norm": 0.07408912273729372, + "learning_rate": 3.92231433326939e-06, + "loss": 0.5312, + "step": 9370 + }, + { + "epoch": 4.557811550151976, + "grad_norm": 0.0735007120621975, + "learning_rate": 3.921380656991127e-06, + "loss": 0.509, + "step": 9371 + }, + { + "epoch": 4.558297872340425, + "grad_norm": 0.07666412749504235, + "learning_rate": 3.920447020158461e-06, + "loss": 0.5612, + "step": 9372 + }, + { + "epoch": 4.558784194528876, + "grad_norm": 0.07343465139650254, + "learning_rate": 3.9195134228055395e-06, + "loss": 0.4885, + "step": 9373 + }, + { + "epoch": 4.5592705167173255, + "grad_norm": 0.07424544077073247, + "learning_rate": 3.918579864966502e-06, + "loss": 0.4863, + "step": 9374 + }, + { + "epoch": 4.559756838905775, + "grad_norm": 0.07414750458564023, + "learning_rate": 3.917646346675491e-06, + "loss": 0.5235, + "step": 9375 + }, + { + "epoch": 4.560243161094225, + "grad_norm": 0.07430105556109139, + "learning_rate": 3.916712867966644e-06, + "loss": 0.5371, + "step": 9376 + }, + { + "epoch": 4.560729483282675, + "grad_norm": 0.07399558110506868, + "learning_rate": 3.9157794288741e-06, + "loss": 0.548, + "step": 9377 + }, + { + "epoch": 4.561215805471124, + "grad_norm": 0.07213414500122718, + "learning_rate": 3.914846029431995e-06, + "loss": 0.4969, + "step": 9378 + }, + { + "epoch": 4.561702127659575, + "grad_norm": 0.07664282878385731, + "learning_rate": 3.9139126696744636e-06, + "loss": 0.4737, + "step": 9379 + }, + { + "epoch": 4.5621884498480245, + "grad_norm": 0.07618814863371207, + "learning_rate": 3.912979349635638e-06, + "loss": 0.5316, + "step": 9380 + }, + { + "epoch": 4.562674772036474, + "grad_norm": 0.07192547278007334, + "learning_rate": 3.912046069349654e-06, + "loss": 0.4788, + "step": 9381 + }, + { + "epoch": 4.563161094224924, + "grad_norm": 0.07321205343944717, + "learning_rate": 3.911112828850637e-06, + "loss": 0.5207, + "step": 9382 + }, + { + "epoch": 4.563647416413374, + "grad_norm": 0.07286769804339066, + "learning_rate": 3.91017962817272e-06, + "loss": 0.4836, + "step": 9383 + }, + { + "epoch": 4.564133738601823, + "grad_norm": 0.0731352906986514, + "learning_rate": 3.909246467350028e-06, + "loss": 0.5141, + "step": 9384 + }, + { + "epoch": 4.564620060790274, + "grad_norm": 0.07651885023375475, + "learning_rate": 3.9083133464166905e-06, + "loss": 0.5258, + "step": 9385 + }, + { + "epoch": 4.5651063829787235, + "grad_norm": 0.07347187739775783, + "learning_rate": 3.907380265406827e-06, + "loss": 0.5014, + "step": 9386 + }, + { + "epoch": 4.565592705167173, + "grad_norm": 0.07484664211420576, + "learning_rate": 3.906447224354565e-06, + "loss": 0.512, + "step": 9387 + }, + { + "epoch": 4.566079027355623, + "grad_norm": 0.07173261805579136, + "learning_rate": 3.905514223294026e-06, + "loss": 0.503, + "step": 9388 + }, + { + "epoch": 4.566565349544073, + "grad_norm": 0.07359214952830676, + "learning_rate": 3.9045812622593275e-06, + "loss": 0.5521, + "step": 9389 + }, + { + "epoch": 4.567051671732523, + "grad_norm": 0.07453135287178002, + "learning_rate": 3.9036483412845905e-06, + "loss": 0.5016, + "step": 9390 + }, + { + "epoch": 4.567537993920973, + "grad_norm": 0.07684984731358946, + "learning_rate": 3.902715460403931e-06, + "loss": 0.5535, + "step": 9391 + }, + { + "epoch": 4.5680243161094225, + "grad_norm": 0.07659117273950458, + "learning_rate": 3.901782619651468e-06, + "loss": 0.5244, + "step": 9392 + }, + { + "epoch": 4.568510638297872, + "grad_norm": 0.07627677323635071, + "learning_rate": 3.90084981906131e-06, + "loss": 0.5242, + "step": 9393 + }, + { + "epoch": 4.568996960486322, + "grad_norm": 0.07471246540561714, + "learning_rate": 3.899917058667576e-06, + "loss": 0.5101, + "step": 9394 + }, + { + "epoch": 4.569483282674772, + "grad_norm": 0.0772917860316628, + "learning_rate": 3.898984338504373e-06, + "loss": 0.547, + "step": 9395 + }, + { + "epoch": 4.569969604863222, + "grad_norm": 0.07418603674263063, + "learning_rate": 3.8980516586058155e-06, + "loss": 0.538, + "step": 9396 + }, + { + "epoch": 4.570455927051672, + "grad_norm": 0.07591462600023226, + "learning_rate": 3.897119019006008e-06, + "loss": 0.5178, + "step": 9397 + }, + { + "epoch": 4.5709422492401215, + "grad_norm": 0.07344926732978485, + "learning_rate": 3.89618641973906e-06, + "loss": 0.4887, + "step": 9398 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.07761402148302443, + "learning_rate": 3.895253860839075e-06, + "loss": 0.4971, + "step": 9399 + }, + { + "epoch": 4.571914893617022, + "grad_norm": 0.0762308165676122, + "learning_rate": 3.894321342340159e-06, + "loss": 0.5318, + "step": 9400 + }, + { + "epoch": 4.572401215805471, + "grad_norm": 0.07323678965678548, + "learning_rate": 3.893388864276413e-06, + "loss": 0.4804, + "step": 9401 + }, + { + "epoch": 4.572887537993921, + "grad_norm": 0.07312112617763636, + "learning_rate": 3.89245642668194e-06, + "loss": 0.494, + "step": 9402 + }, + { + "epoch": 4.573373860182371, + "grad_norm": 0.07625460373135336, + "learning_rate": 3.891524029590837e-06, + "loss": 0.5111, + "step": 9403 + }, + { + "epoch": 4.5738601823708205, + "grad_norm": 0.07585508992838295, + "learning_rate": 3.890591673037205e-06, + "loss": 0.5231, + "step": 9404 + }, + { + "epoch": 4.57434650455927, + "grad_norm": 0.07219676731221171, + "learning_rate": 3.889659357055139e-06, + "loss": 0.4965, + "step": 9405 + }, + { + "epoch": 4.574832826747721, + "grad_norm": 0.07256806982886983, + "learning_rate": 3.888727081678737e-06, + "loss": 0.5171, + "step": 9406 + }, + { + "epoch": 4.57531914893617, + "grad_norm": 0.07988977124293499, + "learning_rate": 3.887794846942088e-06, + "loss": 0.5757, + "step": 9407 + }, + { + "epoch": 4.57580547112462, + "grad_norm": 0.07456679588377475, + "learning_rate": 3.886862652879288e-06, + "loss": 0.5075, + "step": 9408 + }, + { + "epoch": 4.57629179331307, + "grad_norm": 0.07225183621512404, + "learning_rate": 3.885930499524425e-06, + "loss": 0.4974, + "step": 9409 + }, + { + "epoch": 4.5767781155015195, + "grad_norm": 0.07213481894390628, + "learning_rate": 3.884998386911592e-06, + "loss": 0.4887, + "step": 9410 + }, + { + "epoch": 4.577264437689969, + "grad_norm": 0.07093542674314501, + "learning_rate": 3.884066315074872e-06, + "loss": 0.491, + "step": 9411 + }, + { + "epoch": 4.57775075987842, + "grad_norm": 0.07402843518613701, + "learning_rate": 3.883134284048355e-06, + "loss": 0.4961, + "step": 9412 + }, + { + "epoch": 4.578237082066869, + "grad_norm": 0.07318893932406952, + "learning_rate": 3.8822022938661255e-06, + "loss": 0.5027, + "step": 9413 + }, + { + "epoch": 4.578723404255319, + "grad_norm": 0.07828587395429795, + "learning_rate": 3.881270344562264e-06, + "loss": 0.5307, + "step": 9414 + }, + { + "epoch": 4.579209726443769, + "grad_norm": 0.07021040892394347, + "learning_rate": 3.880338436170857e-06, + "loss": 0.4953, + "step": 9415 + }, + { + "epoch": 4.5796960486322185, + "grad_norm": 0.07563537304206946, + "learning_rate": 3.87940656872598e-06, + "loss": 0.5271, + "step": 9416 + }, + { + "epoch": 4.580182370820669, + "grad_norm": 0.07148901880726123, + "learning_rate": 3.878474742261716e-06, + "loss": 0.4921, + "step": 9417 + }, + { + "epoch": 4.580668693009119, + "grad_norm": 0.0729408647462505, + "learning_rate": 3.877542956812137e-06, + "loss": 0.4971, + "step": 9418 + }, + { + "epoch": 4.581155015197568, + "grad_norm": 0.07185942828665691, + "learning_rate": 3.876611212411324e-06, + "loss": 0.5068, + "step": 9419 + }, + { + "epoch": 4.581641337386018, + "grad_norm": 0.07404036217980606, + "learning_rate": 3.875679509093348e-06, + "loss": 0.5066, + "step": 9420 + }, + { + "epoch": 4.582127659574468, + "grad_norm": 0.07372009271917068, + "learning_rate": 3.874747846892286e-06, + "loss": 0.5542, + "step": 9421 + }, + { + "epoch": 4.582613981762918, + "grad_norm": 0.07495766607464964, + "learning_rate": 3.873816225842204e-06, + "loss": 0.5278, + "step": 9422 + }, + { + "epoch": 4.583100303951368, + "grad_norm": 0.07078148696778389, + "learning_rate": 3.872884645977175e-06, + "loss": 0.4882, + "step": 9423 + }, + { + "epoch": 4.583586626139818, + "grad_norm": 0.07608058702230615, + "learning_rate": 3.871953107331266e-06, + "loss": 0.5244, + "step": 9424 + }, + { + "epoch": 4.584072948328267, + "grad_norm": 0.078461127525936, + "learning_rate": 3.871021609938547e-06, + "loss": 0.5425, + "step": 9425 + }, + { + "epoch": 4.584559270516717, + "grad_norm": 0.07404268375669316, + "learning_rate": 3.870090153833077e-06, + "loss": 0.514, + "step": 9426 + }, + { + "epoch": 4.585045592705168, + "grad_norm": 0.07474617671772428, + "learning_rate": 3.869158739048927e-06, + "loss": 0.4974, + "step": 9427 + }, + { + "epoch": 4.585531914893617, + "grad_norm": 0.07333944582933946, + "learning_rate": 3.868227365620152e-06, + "loss": 0.5435, + "step": 9428 + }, + { + "epoch": 4.586018237082067, + "grad_norm": 0.07145503174283267, + "learning_rate": 3.867296033580819e-06, + "loss": 0.4742, + "step": 9429 + }, + { + "epoch": 4.586504559270517, + "grad_norm": 0.07635005777735188, + "learning_rate": 3.8663647429649824e-06, + "loss": 0.5205, + "step": 9430 + }, + { + "epoch": 4.586990881458966, + "grad_norm": 0.07282747294151719, + "learning_rate": 3.865433493806705e-06, + "loss": 0.5048, + "step": 9431 + }, + { + "epoch": 4.587477203647416, + "grad_norm": 0.07608790871955881, + "learning_rate": 3.864502286140038e-06, + "loss": 0.4693, + "step": 9432 + }, + { + "epoch": 4.587963525835867, + "grad_norm": 0.07747000345311038, + "learning_rate": 3.8635711199990395e-06, + "loss": 0.5343, + "step": 9433 + }, + { + "epoch": 4.588449848024316, + "grad_norm": 0.07617283097140695, + "learning_rate": 3.86263999541776e-06, + "loss": 0.5176, + "step": 9434 + }, + { + "epoch": 4.588936170212766, + "grad_norm": 0.07242281563116791, + "learning_rate": 3.8617089124302546e-06, + "loss": 0.4839, + "step": 9435 + }, + { + "epoch": 4.589422492401216, + "grad_norm": 0.07376442049941472, + "learning_rate": 3.86077787107057e-06, + "loss": 0.4855, + "step": 9436 + }, + { + "epoch": 4.589908814589665, + "grad_norm": 0.07177725289119948, + "learning_rate": 3.8598468713727565e-06, + "loss": 0.4884, + "step": 9437 + }, + { + "epoch": 4.590395136778115, + "grad_norm": 0.07560285939052847, + "learning_rate": 3.8589159133708616e-06, + "loss": 0.5122, + "step": 9438 + }, + { + "epoch": 4.590881458966566, + "grad_norm": 0.07901844861992932, + "learning_rate": 3.857984997098928e-06, + "loss": 0.4981, + "step": 9439 + }, + { + "epoch": 4.591367781155015, + "grad_norm": 0.0760807200893336, + "learning_rate": 3.857054122591004e-06, + "loss": 0.5304, + "step": 9440 + }, + { + "epoch": 4.591854103343465, + "grad_norm": 0.07452078713634845, + "learning_rate": 3.856123289881129e-06, + "loss": 0.5238, + "step": 9441 + }, + { + "epoch": 4.592340425531915, + "grad_norm": 0.07525974081641466, + "learning_rate": 3.855192499003347e-06, + "loss": 0.5281, + "step": 9442 + }, + { + "epoch": 4.592826747720364, + "grad_norm": 0.07539140432671773, + "learning_rate": 3.854261749991693e-06, + "loss": 0.5048, + "step": 9443 + }, + { + "epoch": 4.593313069908815, + "grad_norm": 0.07750851584138688, + "learning_rate": 3.8533310428802084e-06, + "loss": 0.4981, + "step": 9444 + }, + { + "epoch": 4.593799392097265, + "grad_norm": 0.07439655316628532, + "learning_rate": 3.852400377702927e-06, + "loss": 0.4947, + "step": 9445 + }, + { + "epoch": 4.594285714285714, + "grad_norm": 0.07428810358142628, + "learning_rate": 3.8514697544938885e-06, + "loss": 0.4783, + "step": 9446 + }, + { + "epoch": 4.594772036474164, + "grad_norm": 0.07332533353272915, + "learning_rate": 3.85053917328712e-06, + "loss": 0.5078, + "step": 9447 + }, + { + "epoch": 4.595258358662614, + "grad_norm": 0.07650448476123038, + "learning_rate": 3.849608634116657e-06, + "loss": 0.5017, + "step": 9448 + }, + { + "epoch": 4.595744680851064, + "grad_norm": 0.07211763738390814, + "learning_rate": 3.848678137016528e-06, + "loss": 0.4664, + "step": 9449 + }, + { + "epoch": 4.596231003039514, + "grad_norm": 0.07484629803964395, + "learning_rate": 3.8477476820207646e-06, + "loss": 0.5244, + "step": 9450 + }, + { + "epoch": 4.596717325227964, + "grad_norm": 0.07308498024444289, + "learning_rate": 3.846817269163391e-06, + "loss": 0.5293, + "step": 9451 + }, + { + "epoch": 4.597203647416413, + "grad_norm": 0.07503528945239649, + "learning_rate": 3.845886898478435e-06, + "loss": 0.5008, + "step": 9452 + }, + { + "epoch": 4.597689969604863, + "grad_norm": 0.07258533972228852, + "learning_rate": 3.844956569999917e-06, + "loss": 0.5253, + "step": 9453 + }, + { + "epoch": 4.5981762917933136, + "grad_norm": 0.07456482892045778, + "learning_rate": 3.8440262837618635e-06, + "loss": 0.5363, + "step": 9454 + }, + { + "epoch": 4.598662613981763, + "grad_norm": 0.07368446187337664, + "learning_rate": 3.843096039798293e-06, + "loss": 0.5005, + "step": 9455 + }, + { + "epoch": 4.599148936170213, + "grad_norm": 0.07814073533624062, + "learning_rate": 3.8421658381432275e-06, + "loss": 0.5661, + "step": 9456 + }, + { + "epoch": 4.599635258358663, + "grad_norm": 0.07601538028404108, + "learning_rate": 3.841235678830682e-06, + "loss": 0.5143, + "step": 9457 + }, + { + "epoch": 4.600121580547112, + "grad_norm": 0.0721920364470967, + "learning_rate": 3.840305561894675e-06, + "loss": 0.5227, + "step": 9458 + }, + { + "epoch": 4.600607902735562, + "grad_norm": 0.07459943025647815, + "learning_rate": 3.839375487369219e-06, + "loss": 0.5114, + "step": 9459 + }, + { + "epoch": 4.6010942249240125, + "grad_norm": 0.0757330807119611, + "learning_rate": 3.838445455288331e-06, + "loss": 0.5064, + "step": 9460 + }, + { + "epoch": 4.601580547112462, + "grad_norm": 0.07544646910788051, + "learning_rate": 3.837515465686018e-06, + "loss": 0.5312, + "step": 9461 + }, + { + "epoch": 4.602066869300912, + "grad_norm": 0.07555236857900353, + "learning_rate": 3.836585518596294e-06, + "loss": 0.4915, + "step": 9462 + }, + { + "epoch": 4.602553191489362, + "grad_norm": 0.077142049341222, + "learning_rate": 3.835655614053165e-06, + "loss": 0.5044, + "step": 9463 + }, + { + "epoch": 4.603039513677811, + "grad_norm": 0.07314501582103763, + "learning_rate": 3.83472575209064e-06, + "loss": 0.507, + "step": 9464 + }, + { + "epoch": 4.603525835866261, + "grad_norm": 0.07615035555205024, + "learning_rate": 3.833795932742725e-06, + "loss": 0.5202, + "step": 9465 + }, + { + "epoch": 4.6040121580547115, + "grad_norm": 0.07462205267127738, + "learning_rate": 3.83286615604342e-06, + "loss": 0.518, + "step": 9466 + }, + { + "epoch": 4.604498480243161, + "grad_norm": 0.07541329463386828, + "learning_rate": 3.831936422026733e-06, + "loss": 0.4938, + "step": 9467 + }, + { + "epoch": 4.604984802431611, + "grad_norm": 0.07390541315198346, + "learning_rate": 3.831006730726659e-06, + "loss": 0.5035, + "step": 9468 + }, + { + "epoch": 4.605471124620061, + "grad_norm": 0.0745351254956586, + "learning_rate": 3.8300770821772015e-06, + "loss": 0.4845, + "step": 9469 + }, + { + "epoch": 4.60595744680851, + "grad_norm": 0.07554491562162251, + "learning_rate": 3.8291474764123544e-06, + "loss": 0.5214, + "step": 9470 + }, + { + "epoch": 4.606443768996961, + "grad_norm": 0.07299659021121051, + "learning_rate": 3.82821791346612e-06, + "loss": 0.4669, + "step": 9471 + }, + { + "epoch": 4.6069300911854105, + "grad_norm": 0.07422117763230204, + "learning_rate": 3.827288393372486e-06, + "loss": 0.4986, + "step": 9472 + }, + { + "epoch": 4.60741641337386, + "grad_norm": 0.07423495880259527, + "learning_rate": 3.826358916165448e-06, + "loss": 0.5023, + "step": 9473 + }, + { + "epoch": 4.60790273556231, + "grad_norm": 0.07123173110027327, + "learning_rate": 3.825429481878999e-06, + "loss": 0.485, + "step": 9474 + }, + { + "epoch": 4.60838905775076, + "grad_norm": 0.0752127379143631, + "learning_rate": 3.824500090547127e-06, + "loss": 0.5205, + "step": 9475 + }, + { + "epoch": 4.60887537993921, + "grad_norm": 0.07786879865888996, + "learning_rate": 3.823570742203821e-06, + "loss": 0.5764, + "step": 9476 + }, + { + "epoch": 4.60936170212766, + "grad_norm": 0.07219552369975611, + "learning_rate": 3.822641436883067e-06, + "loss": 0.4678, + "step": 9477 + }, + { + "epoch": 4.6098480243161095, + "grad_norm": 0.07564927326451448, + "learning_rate": 3.8217121746188496e-06, + "loss": 0.5271, + "step": 9478 + }, + { + "epoch": 4.610334346504559, + "grad_norm": 0.07444991600551204, + "learning_rate": 3.820782955445156e-06, + "loss": 0.5105, + "step": 9479 + }, + { + "epoch": 4.610820668693009, + "grad_norm": 0.07494161729830032, + "learning_rate": 3.819853779395963e-06, + "loss": 0.513, + "step": 9480 + }, + { + "epoch": 4.6113069908814595, + "grad_norm": 0.07317616315481869, + "learning_rate": 3.818924646505256e-06, + "loss": 0.5227, + "step": 9481 + }, + { + "epoch": 4.611793313069909, + "grad_norm": 0.07480035058675628, + "learning_rate": 3.817995556807008e-06, + "loss": 0.5204, + "step": 9482 + }, + { + "epoch": 4.612279635258359, + "grad_norm": 0.07686659295049114, + "learning_rate": 3.8170665103352015e-06, + "loss": 0.5214, + "step": 9483 + }, + { + "epoch": 4.6127659574468085, + "grad_norm": 0.07555703319714131, + "learning_rate": 3.816137507123809e-06, + "loss": 0.5031, + "step": 9484 + }, + { + "epoch": 4.613252279635258, + "grad_norm": 0.07521412396946751, + "learning_rate": 3.8152085472068074e-06, + "loss": 0.5218, + "step": 9485 + }, + { + "epoch": 4.613738601823708, + "grad_norm": 0.0743982844701979, + "learning_rate": 3.8142796306181656e-06, + "loss": 0.5322, + "step": 9486 + }, + { + "epoch": 4.614224924012158, + "grad_norm": 0.0744369163396937, + "learning_rate": 3.8133507573918575e-06, + "loss": 0.5051, + "step": 9487 + }, + { + "epoch": 4.614711246200608, + "grad_norm": 0.07193154401324867, + "learning_rate": 3.8124219275618507e-06, + "loss": 0.484, + "step": 9488 + }, + { + "epoch": 4.615197568389058, + "grad_norm": 0.07481107910941633, + "learning_rate": 3.811493141162115e-06, + "loss": 0.4872, + "step": 9489 + }, + { + "epoch": 4.6156838905775075, + "grad_norm": 0.07578475644720155, + "learning_rate": 3.8105643982266137e-06, + "loss": 0.5108, + "step": 9490 + }, + { + "epoch": 4.616170212765957, + "grad_norm": 0.07497276982605097, + "learning_rate": 3.8096356987893123e-06, + "loss": 0.5185, + "step": 9491 + }, + { + "epoch": 4.616656534954407, + "grad_norm": 0.07259339086909142, + "learning_rate": 3.808707042884176e-06, + "loss": 0.5006, + "step": 9492 + }, + { + "epoch": 4.617142857142857, + "grad_norm": 0.07490271475208989, + "learning_rate": 3.8077784305451628e-06, + "loss": 0.5241, + "step": 9493 + }, + { + "epoch": 4.617629179331307, + "grad_norm": 0.07878946988105262, + "learning_rate": 3.806849861806235e-06, + "loss": 0.5612, + "step": 9494 + }, + { + "epoch": 4.618115501519757, + "grad_norm": 0.07225396036900433, + "learning_rate": 3.8059213367013485e-06, + "loss": 0.4998, + "step": 9495 + }, + { + "epoch": 4.6186018237082065, + "grad_norm": 0.07332486475880097, + "learning_rate": 3.804992855264464e-06, + "loss": 0.486, + "step": 9496 + }, + { + "epoch": 4.619088145896656, + "grad_norm": 0.0765174148555299, + "learning_rate": 3.8040644175295304e-06, + "loss": 0.5267, + "step": 9497 + }, + { + "epoch": 4.619574468085107, + "grad_norm": 0.07567744715304449, + "learning_rate": 3.8031360235305064e-06, + "loss": 0.5532, + "step": 9498 + }, + { + "epoch": 4.620060790273556, + "grad_norm": 0.07709301199733738, + "learning_rate": 3.802207673301341e-06, + "loss": 0.5125, + "step": 9499 + }, + { + "epoch": 4.620547112462006, + "grad_norm": 0.07628850399117111, + "learning_rate": 3.801279366875986e-06, + "loss": 0.5311, + "step": 9500 + }, + { + "epoch": 4.621033434650456, + "grad_norm": 0.07714273442330423, + "learning_rate": 3.800351104288388e-06, + "loss": 0.5068, + "step": 9501 + }, + { + "epoch": 4.6215197568389055, + "grad_norm": 0.07295483953696705, + "learning_rate": 3.7994228855724963e-06, + "loss": 0.5164, + "step": 9502 + }, + { + "epoch": 4.622006079027356, + "grad_norm": 0.07379443000323054, + "learning_rate": 3.7984947107622536e-06, + "loss": 0.5018, + "step": 9503 + }, + { + "epoch": 4.622492401215806, + "grad_norm": 0.07213853633630675, + "learning_rate": 3.797566579891607e-06, + "loss": 0.5055, + "step": 9504 + }, + { + "epoch": 4.622978723404255, + "grad_norm": 0.07396693713182244, + "learning_rate": 3.7966384929944955e-06, + "loss": 0.5012, + "step": 9505 + }, + { + "epoch": 4.623465045592705, + "grad_norm": 0.07730228493107462, + "learning_rate": 3.795710450104863e-06, + "loss": 0.5316, + "step": 9506 + }, + { + "epoch": 4.623951367781155, + "grad_norm": 0.07210203341271802, + "learning_rate": 3.7947824512566443e-06, + "loss": 0.5053, + "step": 9507 + }, + { + "epoch": 4.624437689969605, + "grad_norm": 0.07471053322435454, + "learning_rate": 3.79385449648378e-06, + "loss": 0.5123, + "step": 9508 + }, + { + "epoch": 4.624924012158055, + "grad_norm": 0.07514867551497609, + "learning_rate": 3.7929265858202035e-06, + "loss": 0.5271, + "step": 9509 + }, + { + "epoch": 4.625410334346505, + "grad_norm": 0.07310418719063497, + "learning_rate": 3.7919987192998526e-06, + "loss": 0.5124, + "step": 9510 + }, + { + "epoch": 4.625896656534954, + "grad_norm": 0.07580512671457157, + "learning_rate": 3.791070896956655e-06, + "loss": 0.5293, + "step": 9511 + }, + { + "epoch": 4.626382978723404, + "grad_norm": 0.0766143247335217, + "learning_rate": 3.7901431188245453e-06, + "loss": 0.5075, + "step": 9512 + }, + { + "epoch": 4.626869300911854, + "grad_norm": 0.0743828943028626, + "learning_rate": 3.78921538493745e-06, + "loss": 0.5132, + "step": 9513 + }, + { + "epoch": 4.6273556231003035, + "grad_norm": 0.0759008034594148, + "learning_rate": 3.7882876953293003e-06, + "loss": 0.5191, + "step": 9514 + }, + { + "epoch": 4.627841945288754, + "grad_norm": 0.07776551062692796, + "learning_rate": 3.7873600500340178e-06, + "loss": 0.5097, + "step": 9515 + }, + { + "epoch": 4.628328267477204, + "grad_norm": 0.07306178262039249, + "learning_rate": 3.7864324490855297e-06, + "loss": 0.4821, + "step": 9516 + }, + { + "epoch": 4.628814589665653, + "grad_norm": 0.07313781810878421, + "learning_rate": 3.785504892517759e-06, + "loss": 0.5076, + "step": 9517 + }, + { + "epoch": 4.629300911854103, + "grad_norm": 0.07402287239839066, + "learning_rate": 3.7845773803646247e-06, + "loss": 0.5223, + "step": 9518 + }, + { + "epoch": 4.629787234042553, + "grad_norm": 0.07046761284790008, + "learning_rate": 3.7836499126600507e-06, + "loss": 0.5048, + "step": 9519 + }, + { + "epoch": 4.630273556231003, + "grad_norm": 0.07327775229916007, + "learning_rate": 3.7827224894379494e-06, + "loss": 0.4882, + "step": 9520 + }, + { + "epoch": 4.630759878419453, + "grad_norm": 0.07440353791655598, + "learning_rate": 3.781795110732242e-06, + "loss": 0.4958, + "step": 9521 + }, + { + "epoch": 4.631246200607903, + "grad_norm": 0.07320541664060844, + "learning_rate": 3.780867776576839e-06, + "loss": 0.5001, + "step": 9522 + }, + { + "epoch": 4.631732522796352, + "grad_norm": 0.07616011703036604, + "learning_rate": 3.7799404870056557e-06, + "loss": 0.5149, + "step": 9523 + }, + { + "epoch": 4.632218844984802, + "grad_norm": 0.07744333425585907, + "learning_rate": 3.7790132420526026e-06, + "loss": 0.5276, + "step": 9524 + }, + { + "epoch": 4.632705167173253, + "grad_norm": 0.07566761043196567, + "learning_rate": 3.7780860417515918e-06, + "loss": 0.4892, + "step": 9525 + }, + { + "epoch": 4.633191489361702, + "grad_norm": 0.07514088263625601, + "learning_rate": 3.777158886136528e-06, + "loss": 0.4967, + "step": 9526 + }, + { + "epoch": 4.633677811550152, + "grad_norm": 0.07526882707414885, + "learning_rate": 3.776231775241319e-06, + "loss": 0.5281, + "step": 9527 + }, + { + "epoch": 4.634164133738602, + "grad_norm": 0.07651193187251416, + "learning_rate": 3.77530470909987e-06, + "loss": 0.5375, + "step": 9528 + }, + { + "epoch": 4.634650455927051, + "grad_norm": 0.07423840591000447, + "learning_rate": 3.7743776877460864e-06, + "loss": 0.496, + "step": 9529 + }, + { + "epoch": 4.635136778115502, + "grad_norm": 0.073935256014923, + "learning_rate": 3.7734507112138652e-06, + "loss": 0.531, + "step": 9530 + }, + { + "epoch": 4.635623100303952, + "grad_norm": 0.07194190894330107, + "learning_rate": 3.7725237795371094e-06, + "loss": 0.4954, + "step": 9531 + }, + { + "epoch": 4.636109422492401, + "grad_norm": 0.07353560460273091, + "learning_rate": 3.7715968927497167e-06, + "loss": 0.5001, + "step": 9532 + }, + { + "epoch": 4.636595744680851, + "grad_norm": 0.0746582790320707, + "learning_rate": 3.770670050885585e-06, + "loss": 0.4854, + "step": 9533 + }, + { + "epoch": 4.637082066869301, + "grad_norm": 0.07372899887960643, + "learning_rate": 3.769743253978606e-06, + "loss": 0.5213, + "step": 9534 + }, + { + "epoch": 4.63756838905775, + "grad_norm": 0.07212950839336363, + "learning_rate": 3.7688165020626772e-06, + "loss": 0.4936, + "step": 9535 + }, + { + "epoch": 4.638054711246201, + "grad_norm": 0.07594817659848711, + "learning_rate": 3.7678897951716863e-06, + "loss": 0.5073, + "step": 9536 + }, + { + "epoch": 4.638541033434651, + "grad_norm": 0.07476370620708332, + "learning_rate": 3.766963133339526e-06, + "loss": 0.5196, + "step": 9537 + }, + { + "epoch": 4.6390273556231, + "grad_norm": 0.07584518566263565, + "learning_rate": 3.7660365166000834e-06, + "loss": 0.5392, + "step": 9538 + }, + { + "epoch": 4.63951367781155, + "grad_norm": 0.07569146466071262, + "learning_rate": 3.7651099449872485e-06, + "loss": 0.5675, + "step": 9539 + }, + { + "epoch": 4.64, + "grad_norm": 0.07259531974374066, + "learning_rate": 3.7641834185349014e-06, + "loss": 0.495, + "step": 9540 + }, + { + "epoch": 4.640486322188449, + "grad_norm": 0.07503499095375923, + "learning_rate": 3.7632569372769294e-06, + "loss": 0.5112, + "step": 9541 + }, + { + "epoch": 4.6409726443769, + "grad_norm": 0.07606440715202127, + "learning_rate": 3.762330501247212e-06, + "loss": 0.5251, + "step": 9542 + }, + { + "epoch": 4.64145896656535, + "grad_norm": 0.07130470785763345, + "learning_rate": 3.7614041104796307e-06, + "loss": 0.496, + "step": 9543 + }, + { + "epoch": 4.641945288753799, + "grad_norm": 0.07615671337157306, + "learning_rate": 3.7604777650080654e-06, + "loss": 0.5101, + "step": 9544 + }, + { + "epoch": 4.642431610942249, + "grad_norm": 0.07192905081790707, + "learning_rate": 3.7595514648663894e-06, + "loss": 0.5223, + "step": 9545 + }, + { + "epoch": 4.642917933130699, + "grad_norm": 0.07371448061533994, + "learning_rate": 3.758625210088482e-06, + "loss": 0.5033, + "step": 9546 + }, + { + "epoch": 4.643404255319149, + "grad_norm": 0.07649761607323644, + "learning_rate": 3.7576990007082125e-06, + "loss": 0.5146, + "step": 9547 + }, + { + "epoch": 4.643890577507599, + "grad_norm": 0.07057738478422075, + "learning_rate": 3.7567728367594564e-06, + "loss": 0.4798, + "step": 9548 + }, + { + "epoch": 4.644376899696049, + "grad_norm": 0.07438195516742215, + "learning_rate": 3.755846718276081e-06, + "loss": 0.483, + "step": 9549 + }, + { + "epoch": 4.644863221884498, + "grad_norm": 0.07330784264288097, + "learning_rate": 3.7549206452919584e-06, + "loss": 0.5313, + "step": 9550 + }, + { + "epoch": 4.645349544072948, + "grad_norm": 0.07289138818300984, + "learning_rate": 3.753994617840952e-06, + "loss": 0.4888, + "step": 9551 + }, + { + "epoch": 4.6458358662613986, + "grad_norm": 0.07465262810113721, + "learning_rate": 3.753068635956929e-06, + "loss": 0.5034, + "step": 9552 + }, + { + "epoch": 4.646322188449848, + "grad_norm": 0.07542769098137869, + "learning_rate": 3.7521426996737516e-06, + "loss": 0.5433, + "step": 9553 + }, + { + "epoch": 4.646808510638298, + "grad_norm": 0.0749693442702763, + "learning_rate": 3.751216809025285e-06, + "loss": 0.5346, + "step": 9554 + }, + { + "epoch": 4.647294832826748, + "grad_norm": 0.07466763207742935, + "learning_rate": 3.750290964045384e-06, + "loss": 0.4994, + "step": 9555 + }, + { + "epoch": 4.647781155015197, + "grad_norm": 0.07250374001258861, + "learning_rate": 3.749365164767912e-06, + "loss": 0.4926, + "step": 9556 + }, + { + "epoch": 4.648267477203648, + "grad_norm": 0.07570652146570055, + "learning_rate": 3.748439411226723e-06, + "loss": 0.5569, + "step": 9557 + }, + { + "epoch": 4.6487537993920975, + "grad_norm": 0.07421932023576838, + "learning_rate": 3.7475137034556753e-06, + "loss": 0.5148, + "step": 9558 + }, + { + "epoch": 4.649240121580547, + "grad_norm": 0.07457992476791968, + "learning_rate": 3.746588041488619e-06, + "loss": 0.473, + "step": 9559 + }, + { + "epoch": 4.649726443768997, + "grad_norm": 0.07497857139757647, + "learning_rate": 3.7456624253594087e-06, + "loss": 0.518, + "step": 9560 + }, + { + "epoch": 4.650212765957447, + "grad_norm": 0.07494900953534446, + "learning_rate": 3.7447368551018916e-06, + "loss": 0.5081, + "step": 9561 + }, + { + "epoch": 4.650699088145896, + "grad_norm": 0.07442428162605412, + "learning_rate": 3.743811330749919e-06, + "loss": 0.4999, + "step": 9562 + }, + { + "epoch": 4.651185410334347, + "grad_norm": 0.07535008447958119, + "learning_rate": 3.742885852337336e-06, + "loss": 0.5459, + "step": 9563 + }, + { + "epoch": 4.6516717325227965, + "grad_norm": 0.07228302431615818, + "learning_rate": 3.741960419897991e-06, + "loss": 0.5024, + "step": 9564 + }, + { + "epoch": 4.652158054711246, + "grad_norm": 0.07455600044653893, + "learning_rate": 3.7410350334657218e-06, + "loss": 0.5034, + "step": 9565 + }, + { + "epoch": 4.652644376899696, + "grad_norm": 0.07641868902654676, + "learning_rate": 3.7401096930743753e-06, + "loss": 0.5378, + "step": 9566 + }, + { + "epoch": 4.653130699088146, + "grad_norm": 0.0732124516137757, + "learning_rate": 3.739184398757788e-06, + "loss": 0.5223, + "step": 9567 + }, + { + "epoch": 4.653617021276595, + "grad_norm": 0.07282143747930755, + "learning_rate": 3.738259150549803e-06, + "loss": 0.5079, + "step": 9568 + }, + { + "epoch": 4.654103343465046, + "grad_norm": 0.07124438143144253, + "learning_rate": 3.737333948484251e-06, + "loss": 0.4786, + "step": 9569 + }, + { + "epoch": 4.6545896656534955, + "grad_norm": 0.07273540222266, + "learning_rate": 3.736408792594971e-06, + "loss": 0.4768, + "step": 9570 + }, + { + "epoch": 4.655075987841945, + "grad_norm": 0.07673260424922278, + "learning_rate": 3.735483682915796e-06, + "loss": 0.5525, + "step": 9571 + }, + { + "epoch": 4.655562310030395, + "grad_norm": 0.07600713076263813, + "learning_rate": 3.7345586194805562e-06, + "loss": 0.5128, + "step": 9572 + }, + { + "epoch": 4.656048632218845, + "grad_norm": 0.07559240948015863, + "learning_rate": 3.7336336023230853e-06, + "loss": 0.5265, + "step": 9573 + }, + { + "epoch": 4.656534954407295, + "grad_norm": 0.07284962303963942, + "learning_rate": 3.7327086314772064e-06, + "loss": 0.4991, + "step": 9574 + }, + { + "epoch": 4.657021276595745, + "grad_norm": 0.0784734662661495, + "learning_rate": 3.7317837069767505e-06, + "loss": 0.5702, + "step": 9575 + }, + { + "epoch": 4.6575075987841945, + "grad_norm": 0.07428019499799277, + "learning_rate": 3.730858828855539e-06, + "loss": 0.5273, + "step": 9576 + }, + { + "epoch": 4.657993920972644, + "grad_norm": 0.07618090946263137, + "learning_rate": 3.7299339971473973e-06, + "loss": 0.5254, + "step": 9577 + }, + { + "epoch": 4.658480243161094, + "grad_norm": 0.07280468127081419, + "learning_rate": 3.7290092118861454e-06, + "loss": 0.5074, + "step": 9578 + }, + { + "epoch": 4.6589665653495445, + "grad_norm": 0.07329129289431018, + "learning_rate": 3.7280844731056066e-06, + "loss": 0.4986, + "step": 9579 + }, + { + "epoch": 4.659452887537994, + "grad_norm": 0.07223412735139698, + "learning_rate": 3.727159780839594e-06, + "loss": 0.5047, + "step": 9580 + }, + { + "epoch": 4.659939209726444, + "grad_norm": 0.07498428933515157, + "learning_rate": 3.726235135121927e-06, + "loss": 0.5122, + "step": 9581 + }, + { + "epoch": 4.6604255319148935, + "grad_norm": 0.07291334164386905, + "learning_rate": 3.72531053598642e-06, + "loss": 0.4875, + "step": 9582 + }, + { + "epoch": 4.660911854103343, + "grad_norm": 0.07706097745118637, + "learning_rate": 3.724385983466887e-06, + "loss": 0.5232, + "step": 9583 + }, + { + "epoch": 4.661398176291794, + "grad_norm": 0.07339697845810081, + "learning_rate": 3.7234614775971366e-06, + "loss": 0.4765, + "step": 9584 + }, + { + "epoch": 4.661884498480243, + "grad_norm": 0.0744146720615144, + "learning_rate": 3.7225370184109814e-06, + "loss": 0.5079, + "step": 9585 + }, + { + "epoch": 4.662370820668693, + "grad_norm": 0.07077348494825894, + "learning_rate": 3.7216126059422263e-06, + "loss": 0.4756, + "step": 9586 + }, + { + "epoch": 4.662857142857143, + "grad_norm": 0.07274179623828411, + "learning_rate": 3.7206882402246796e-06, + "loss": 0.4904, + "step": 9587 + }, + { + "epoch": 4.6633434650455925, + "grad_norm": 0.07598361193116747, + "learning_rate": 3.7197639212921445e-06, + "loss": 0.5012, + "step": 9588 + }, + { + "epoch": 4.663829787234042, + "grad_norm": 0.07176317379494655, + "learning_rate": 3.7188396491784262e-06, + "loss": 0.5031, + "step": 9589 + }, + { + "epoch": 4.664316109422493, + "grad_norm": 0.07407781378437643, + "learning_rate": 3.717915423917322e-06, + "loss": 0.5285, + "step": 9590 + }, + { + "epoch": 4.664802431610942, + "grad_norm": 0.07485449954704698, + "learning_rate": 3.7169912455426348e-06, + "loss": 0.5202, + "step": 9591 + }, + { + "epoch": 4.665288753799392, + "grad_norm": 0.07247835616473004, + "learning_rate": 3.716067114088159e-06, + "loss": 0.5009, + "step": 9592 + }, + { + "epoch": 4.665775075987842, + "grad_norm": 0.07392328061597725, + "learning_rate": 3.7151430295876943e-06, + "loss": 0.5076, + "step": 9593 + }, + { + "epoch": 4.6662613981762915, + "grad_norm": 0.0758691485924479, + "learning_rate": 3.7142189920750304e-06, + "loss": 0.5576, + "step": 9594 + }, + { + "epoch": 4.666747720364741, + "grad_norm": 0.07660410457253233, + "learning_rate": 3.713295001583963e-06, + "loss": 0.5115, + "step": 9595 + }, + { + "epoch": 4.667234042553192, + "grad_norm": 0.07594373902212093, + "learning_rate": 3.712371058148282e-06, + "loss": 0.5166, + "step": 9596 + }, + { + "epoch": 4.667720364741641, + "grad_norm": 0.07230989221290264, + "learning_rate": 3.7114471618017756e-06, + "loss": 0.5082, + "step": 9597 + }, + { + "epoch": 4.668206686930091, + "grad_norm": 0.07928490318333174, + "learning_rate": 3.710523312578235e-06, + "loss": 0.5069, + "step": 9598 + }, + { + "epoch": 4.668693009118541, + "grad_norm": 0.07396771480513312, + "learning_rate": 3.709599510511439e-06, + "loss": 0.5415, + "step": 9599 + }, + { + "epoch": 4.6691793313069905, + "grad_norm": 0.07424907094946222, + "learning_rate": 3.708675755635178e-06, + "loss": 0.5169, + "step": 9600 + }, + { + "epoch": 4.669665653495441, + "grad_norm": 0.0756231051950839, + "learning_rate": 3.7077520479832296e-06, + "loss": 0.506, + "step": 9601 + }, + { + "epoch": 4.670151975683891, + "grad_norm": 0.07387786097719395, + "learning_rate": 3.706828387589377e-06, + "loss": 0.537, + "step": 9602 + }, + { + "epoch": 4.67063829787234, + "grad_norm": 0.0763183024763555, + "learning_rate": 3.705904774487396e-06, + "loss": 0.5257, + "step": 9603 + }, + { + "epoch": 4.67112462006079, + "grad_norm": 0.07472397518631525, + "learning_rate": 3.704981208711068e-06, + "loss": 0.5007, + "step": 9604 + }, + { + "epoch": 4.67161094224924, + "grad_norm": 0.07492416020627674, + "learning_rate": 3.7040576902941634e-06, + "loss": 0.5256, + "step": 9605 + }, + { + "epoch": 4.67209726443769, + "grad_norm": 0.07485880416766108, + "learning_rate": 3.7031342192704588e-06, + "loss": 0.5325, + "step": 9606 + }, + { + "epoch": 4.67258358662614, + "grad_norm": 0.07158234307150661, + "learning_rate": 3.7022107956737234e-06, + "loss": 0.4915, + "step": 9607 + }, + { + "epoch": 4.67306990881459, + "grad_norm": 0.07225528992868703, + "learning_rate": 3.7012874195377315e-06, + "loss": 0.529, + "step": 9608 + }, + { + "epoch": 4.673556231003039, + "grad_norm": 0.07225706820737202, + "learning_rate": 3.700364090896247e-06, + "loss": 0.4774, + "step": 9609 + }, + { + "epoch": 4.674042553191489, + "grad_norm": 0.07561806125101941, + "learning_rate": 3.699440809783038e-06, + "loss": 0.5288, + "step": 9610 + }, + { + "epoch": 4.67452887537994, + "grad_norm": 0.07410462174182866, + "learning_rate": 3.6985175762318694e-06, + "loss": 0.549, + "step": 9611 + }, + { + "epoch": 4.675015197568389, + "grad_norm": 0.07858269014990575, + "learning_rate": 3.6975943902765064e-06, + "loss": 0.5299, + "step": 9612 + }, + { + "epoch": 4.675501519756839, + "grad_norm": 0.07303523938707543, + "learning_rate": 3.6966712519507052e-06, + "loss": 0.4974, + "step": 9613 + }, + { + "epoch": 4.675987841945289, + "grad_norm": 0.0721016590337247, + "learning_rate": 3.695748161288232e-06, + "loss": 0.5052, + "step": 9614 + }, + { + "epoch": 4.676474164133738, + "grad_norm": 0.07364381736590409, + "learning_rate": 3.6948251183228377e-06, + "loss": 0.4833, + "step": 9615 + }, + { + "epoch": 4.676960486322188, + "grad_norm": 0.07653172132265926, + "learning_rate": 3.693902123088284e-06, + "loss": 0.5071, + "step": 9616 + }, + { + "epoch": 4.677446808510639, + "grad_norm": 0.07286093668446615, + "learning_rate": 3.692979175618321e-06, + "loss": 0.519, + "step": 9617 + }, + { + "epoch": 4.677933130699088, + "grad_norm": 0.07161945261191206, + "learning_rate": 3.692056275946706e-06, + "loss": 0.4954, + "step": 9618 + }, + { + "epoch": 4.678419452887538, + "grad_norm": 0.07332748044486957, + "learning_rate": 3.691133424107185e-06, + "loss": 0.553, + "step": 9619 + }, + { + "epoch": 4.678905775075988, + "grad_norm": 0.07468228670131369, + "learning_rate": 3.6902106201335104e-06, + "loss": 0.5327, + "step": 9620 + }, + { + "epoch": 4.679392097264437, + "grad_norm": 0.07594165262276767, + "learning_rate": 3.689287864059427e-06, + "loss": 0.5462, + "step": 9621 + }, + { + "epoch": 4.679878419452887, + "grad_norm": 0.07207785603150786, + "learning_rate": 3.6883651559186822e-06, + "loss": 0.5096, + "step": 9622 + }, + { + "epoch": 4.680364741641338, + "grad_norm": 0.0727797728628278, + "learning_rate": 3.6874424957450215e-06, + "loss": 0.494, + "step": 9623 + }, + { + "epoch": 4.680851063829787, + "grad_norm": 0.07667396172812461, + "learning_rate": 3.686519883572184e-06, + "loss": 0.5044, + "step": 9624 + }, + { + "epoch": 4.681337386018237, + "grad_norm": 0.07316306043841397, + "learning_rate": 3.6855973194339113e-06, + "loss": 0.5144, + "step": 9625 + }, + { + "epoch": 4.681823708206687, + "grad_norm": 0.07602396817093657, + "learning_rate": 3.6846748033639402e-06, + "loss": 0.5104, + "step": 9626 + }, + { + "epoch": 4.682310030395136, + "grad_norm": 0.07441150851758274, + "learning_rate": 3.683752335396012e-06, + "loss": 0.5013, + "step": 9627 + }, + { + "epoch": 4.682796352583587, + "grad_norm": 0.0743595482002386, + "learning_rate": 3.682829915563857e-06, + "loss": 0.5006, + "step": 9628 + }, + { + "epoch": 4.683282674772037, + "grad_norm": 0.07746167588075516, + "learning_rate": 3.681907543901212e-06, + "loss": 0.5332, + "step": 9629 + }, + { + "epoch": 4.683768996960486, + "grad_norm": 0.07446019977027796, + "learning_rate": 3.6809852204418045e-06, + "loss": 0.4915, + "step": 9630 + }, + { + "epoch": 4.684255319148936, + "grad_norm": 0.07212699422511144, + "learning_rate": 3.6800629452193683e-06, + "loss": 0.4902, + "step": 9631 + }, + { + "epoch": 4.684741641337386, + "grad_norm": 0.07138086466157413, + "learning_rate": 3.6791407182676287e-06, + "loss": 0.4808, + "step": 9632 + }, + { + "epoch": 4.685227963525836, + "grad_norm": 0.07541732465208206, + "learning_rate": 3.678218539620315e-06, + "loss": 0.4978, + "step": 9633 + }, + { + "epoch": 4.685714285714286, + "grad_norm": 0.07398786260987969, + "learning_rate": 3.6772964093111486e-06, + "loss": 0.4843, + "step": 9634 + }, + { + "epoch": 4.686200607902736, + "grad_norm": 0.07873925613633227, + "learning_rate": 3.676374327373854e-06, + "loss": 0.5216, + "step": 9635 + }, + { + "epoch": 4.686686930091185, + "grad_norm": 0.07280369958394933, + "learning_rate": 3.67545229384215e-06, + "loss": 0.5104, + "step": 9636 + }, + { + "epoch": 4.687173252279635, + "grad_norm": 0.07148377818397378, + "learning_rate": 3.67453030874976e-06, + "loss": 0.4434, + "step": 9637 + }, + { + "epoch": 4.687659574468086, + "grad_norm": 0.07630968900013077, + "learning_rate": 3.6736083721303966e-06, + "loss": 0.5507, + "step": 9638 + }, + { + "epoch": 4.688145896656535, + "grad_norm": 0.07264716097839222, + "learning_rate": 3.67268648401778e-06, + "loss": 0.5066, + "step": 9639 + }, + { + "epoch": 4.688632218844985, + "grad_norm": 0.0729846544579363, + "learning_rate": 3.6717646444456196e-06, + "loss": 0.5157, + "step": 9640 + }, + { + "epoch": 4.689118541033435, + "grad_norm": 0.07558655287620303, + "learning_rate": 3.6708428534476302e-06, + "loss": 0.5394, + "step": 9641 + }, + { + "epoch": 4.689604863221884, + "grad_norm": 0.07425848146701212, + "learning_rate": 3.6699211110575206e-06, + "loss": 0.5255, + "step": 9642 + }, + { + "epoch": 4.690091185410334, + "grad_norm": 0.0720756011348175, + "learning_rate": 3.6689994173090025e-06, + "loss": 0.4843, + "step": 9643 + }, + { + "epoch": 4.690577507598785, + "grad_norm": 0.07355597179477104, + "learning_rate": 3.6680777722357787e-06, + "loss": 0.5149, + "step": 9644 + }, + { + "epoch": 4.691063829787234, + "grad_norm": 0.07375458795722081, + "learning_rate": 3.6671561758715564e-06, + "loss": 0.5016, + "step": 9645 + }, + { + "epoch": 4.691550151975684, + "grad_norm": 0.0757619720418, + "learning_rate": 3.6662346282500373e-06, + "loss": 0.5217, + "step": 9646 + }, + { + "epoch": 4.692036474164134, + "grad_norm": 0.07626029129190803, + "learning_rate": 3.6653131294049236e-06, + "loss": 0.5357, + "step": 9647 + }, + { + "epoch": 4.692522796352583, + "grad_norm": 0.07318033886282783, + "learning_rate": 3.6643916793699175e-06, + "loss": 0.524, + "step": 9648 + }, + { + "epoch": 4.693009118541033, + "grad_norm": 0.07355118118920227, + "learning_rate": 3.6634702781787122e-06, + "loss": 0.5034, + "step": 9649 + }, + { + "epoch": 4.6934954407294835, + "grad_norm": 0.0712011619777786, + "learning_rate": 3.662548925865008e-06, + "loss": 0.4835, + "step": 9650 + }, + { + "epoch": 4.693981762917933, + "grad_norm": 0.07329112172033672, + "learning_rate": 3.6616276224624947e-06, + "loss": 0.5079, + "step": 9651 + }, + { + "epoch": 4.694468085106383, + "grad_norm": 0.07397830532987652, + "learning_rate": 3.6607063680048706e-06, + "loss": 0.5474, + "step": 9652 + }, + { + "epoch": 4.694954407294833, + "grad_norm": 0.07241938745143882, + "learning_rate": 3.6597851625258205e-06, + "loss": 0.502, + "step": 9653 + }, + { + "epoch": 4.695440729483282, + "grad_norm": 0.07393857469957417, + "learning_rate": 3.658864006059038e-06, + "loss": 0.5219, + "step": 9654 + }, + { + "epoch": 4.695927051671733, + "grad_norm": 0.07075624955625197, + "learning_rate": 3.657942898638206e-06, + "loss": 0.4801, + "step": 9655 + }, + { + "epoch": 4.6964133738601825, + "grad_norm": 0.0769859262197467, + "learning_rate": 3.6570218402970124e-06, + "loss": 0.543, + "step": 9656 + }, + { + "epoch": 4.696899696048632, + "grad_norm": 0.07896110797264357, + "learning_rate": 3.6561008310691405e-06, + "loss": 0.5629, + "step": 9657 + }, + { + "epoch": 4.697386018237082, + "grad_norm": 0.07898354688370803, + "learning_rate": 3.655179870988273e-06, + "loss": 0.5696, + "step": 9658 + }, + { + "epoch": 4.697872340425532, + "grad_norm": 0.07524286310016723, + "learning_rate": 3.654258960088087e-06, + "loss": 0.524, + "step": 9659 + }, + { + "epoch": 4.698358662613982, + "grad_norm": 0.077592247965106, + "learning_rate": 3.6533380984022625e-06, + "loss": 0.531, + "step": 9660 + }, + { + "epoch": 4.698844984802432, + "grad_norm": 0.07357493558132225, + "learning_rate": 3.6524172859644752e-06, + "loss": 0.5061, + "step": 9661 + }, + { + "epoch": 4.6993313069908815, + "grad_norm": 0.07438941113371288, + "learning_rate": 3.651496522808402e-06, + "loss": 0.5403, + "step": 9662 + }, + { + "epoch": 4.699817629179331, + "grad_norm": 0.0710591043464617, + "learning_rate": 3.650575808967711e-06, + "loss": 0.507, + "step": 9663 + }, + { + "epoch": 4.700303951367781, + "grad_norm": 0.07444801583242466, + "learning_rate": 3.6496551444760773e-06, + "loss": 0.4813, + "step": 9664 + }, + { + "epoch": 4.7007902735562315, + "grad_norm": 0.07364107287050706, + "learning_rate": 3.6487345293671673e-06, + "loss": 0.522, + "step": 9665 + }, + { + "epoch": 4.701276595744681, + "grad_norm": 0.07418207340608211, + "learning_rate": 3.647813963674651e-06, + "loss": 0.5324, + "step": 9666 + }, + { + "epoch": 4.701762917933131, + "grad_norm": 0.07086389652058153, + "learning_rate": 3.6468934474321916e-06, + "loss": 0.4841, + "step": 9667 + }, + { + "epoch": 4.7022492401215805, + "grad_norm": 0.07399552472498022, + "learning_rate": 3.6459729806734544e-06, + "loss": 0.5384, + "step": 9668 + }, + { + "epoch": 4.70273556231003, + "grad_norm": 0.07131064160335929, + "learning_rate": 3.6450525634320986e-06, + "loss": 0.4815, + "step": 9669 + }, + { + "epoch": 4.70322188449848, + "grad_norm": 0.0752254813076923, + "learning_rate": 3.6441321957417874e-06, + "loss": 0.4824, + "step": 9670 + }, + { + "epoch": 4.7037082066869305, + "grad_norm": 0.07431840307812927, + "learning_rate": 3.6432118776361767e-06, + "loss": 0.5216, + "step": 9671 + }, + { + "epoch": 4.70419452887538, + "grad_norm": 0.07594438156081466, + "learning_rate": 3.642291609148927e-06, + "loss": 0.506, + "step": 9672 + }, + { + "epoch": 4.70468085106383, + "grad_norm": 0.07244197669566692, + "learning_rate": 3.641371390313687e-06, + "loss": 0.4997, + "step": 9673 + }, + { + "epoch": 4.7051671732522795, + "grad_norm": 0.07507202720339307, + "learning_rate": 3.6404512211641123e-06, + "loss": 0.5336, + "step": 9674 + }, + { + "epoch": 4.705653495440729, + "grad_norm": 0.07385520390171911, + "learning_rate": 3.639531101733856e-06, + "loss": 0.5421, + "step": 9675 + }, + { + "epoch": 4.706139817629179, + "grad_norm": 0.07262138845245943, + "learning_rate": 3.6386110320565636e-06, + "loss": 0.5078, + "step": 9676 + }, + { + "epoch": 4.7066261398176295, + "grad_norm": 0.07440605342391532, + "learning_rate": 3.6376910121658867e-06, + "loss": 0.5393, + "step": 9677 + }, + { + "epoch": 4.707112462006079, + "grad_norm": 0.07486882996441581, + "learning_rate": 3.636771042095466e-06, + "loss": 0.5017, + "step": 9678 + }, + { + "epoch": 4.707598784194529, + "grad_norm": 0.07302965695974867, + "learning_rate": 3.6358511218789507e-06, + "loss": 0.4882, + "step": 9679 + }, + { + "epoch": 4.7080851063829785, + "grad_norm": 0.07416681013074236, + "learning_rate": 3.6349312515499765e-06, + "loss": 0.4822, + "step": 9680 + }, + { + "epoch": 4.708571428571428, + "grad_norm": 0.07514982724806697, + "learning_rate": 3.634011431142188e-06, + "loss": 0.5242, + "step": 9681 + }, + { + "epoch": 4.709057750759879, + "grad_norm": 0.07673511482193901, + "learning_rate": 3.6330916606892208e-06, + "loss": 0.5496, + "step": 9682 + }, + { + "epoch": 4.709544072948328, + "grad_norm": 0.07297530854212485, + "learning_rate": 3.6321719402247144e-06, + "loss": 0.4859, + "step": 9683 + }, + { + "epoch": 4.710030395136778, + "grad_norm": 0.0739553311762443, + "learning_rate": 3.6312522697823004e-06, + "loss": 0.5089, + "step": 9684 + }, + { + "epoch": 4.710516717325228, + "grad_norm": 0.07191695130023956, + "learning_rate": 3.630332649395614e-06, + "loss": 0.4973, + "step": 9685 + }, + { + "epoch": 4.7110030395136775, + "grad_norm": 0.07343181185032584, + "learning_rate": 3.629413079098282e-06, + "loss": 0.5336, + "step": 9686 + }, + { + "epoch": 4.711489361702128, + "grad_norm": 0.07387737643779135, + "learning_rate": 3.62849355892394e-06, + "loss": 0.5037, + "step": 9687 + }, + { + "epoch": 4.711975683890578, + "grad_norm": 0.07630022820497294, + "learning_rate": 3.6275740889062095e-06, + "loss": 0.5284, + "step": 9688 + }, + { + "epoch": 4.712462006079027, + "grad_norm": 0.07309824966081761, + "learning_rate": 3.6266546690787187e-06, + "loss": 0.4959, + "step": 9689 + }, + { + "epoch": 4.712948328267477, + "grad_norm": 0.0731185972075894, + "learning_rate": 3.6257352994750895e-06, + "loss": 0.4953, + "step": 9690 + }, + { + "epoch": 4.713434650455927, + "grad_norm": 0.07552762648953919, + "learning_rate": 3.624815980128947e-06, + "loss": 0.489, + "step": 9691 + }, + { + "epoch": 4.713920972644377, + "grad_norm": 0.0749982077456791, + "learning_rate": 3.623896711073907e-06, + "loss": 0.5144, + "step": 9692 + }, + { + "epoch": 4.714407294832827, + "grad_norm": 0.07537583112118984, + "learning_rate": 3.6229774923435913e-06, + "loss": 0.5192, + "step": 9693 + }, + { + "epoch": 4.714893617021277, + "grad_norm": 0.07466143128530704, + "learning_rate": 3.622058323971612e-06, + "loss": 0.5143, + "step": 9694 + }, + { + "epoch": 4.715379939209726, + "grad_norm": 0.07397705973358144, + "learning_rate": 3.6211392059915878e-06, + "loss": 0.5007, + "step": 9695 + }, + { + "epoch": 4.715866261398176, + "grad_norm": 0.0752079888287382, + "learning_rate": 3.6202201384371275e-06, + "loss": 0.5337, + "step": 9696 + }, + { + "epoch": 4.716352583586626, + "grad_norm": 0.07629116753465477, + "learning_rate": 3.619301121341846e-06, + "loss": 0.5086, + "step": 9697 + }, + { + "epoch": 4.716838905775076, + "grad_norm": 0.07270555575141048, + "learning_rate": 3.6183821547393473e-06, + "loss": 0.5093, + "step": 9698 + }, + { + "epoch": 4.717325227963526, + "grad_norm": 0.07329952883677855, + "learning_rate": 3.617463238663241e-06, + "loss": 0.5054, + "step": 9699 + }, + { + "epoch": 4.717811550151976, + "grad_norm": 0.07177485735913473, + "learning_rate": 3.616544373147134e-06, + "loss": 0.4863, + "step": 9700 + }, + { + "epoch": 4.718297872340425, + "grad_norm": 0.07249512808292269, + "learning_rate": 3.615625558224626e-06, + "loss": 0.5161, + "step": 9701 + }, + { + "epoch": 4.718784194528875, + "grad_norm": 0.0744781437103124, + "learning_rate": 3.6147067939293225e-06, + "loss": 0.497, + "step": 9702 + }, + { + "epoch": 4.719270516717325, + "grad_norm": 0.07123421964173951, + "learning_rate": 3.6137880802948187e-06, + "loss": 0.4848, + "step": 9703 + }, + { + "epoch": 4.719756838905775, + "grad_norm": 0.07591929736847168, + "learning_rate": 3.612869417354716e-06, + "loss": 0.5122, + "step": 9704 + }, + { + "epoch": 4.720243161094225, + "grad_norm": 0.07436117337444799, + "learning_rate": 3.6119508051426074e-06, + "loss": 0.5124, + "step": 9705 + }, + { + "epoch": 4.720729483282675, + "grad_norm": 0.07712767828042628, + "learning_rate": 3.6110322436920907e-06, + "loss": 0.5168, + "step": 9706 + }, + { + "epoch": 4.721215805471124, + "grad_norm": 0.07308873909564961, + "learning_rate": 3.610113733036754e-06, + "loss": 0.4883, + "step": 9707 + }, + { + "epoch": 4.721702127659574, + "grad_norm": 0.07182223928762813, + "learning_rate": 3.6091952732101914e-06, + "loss": 0.4941, + "step": 9708 + }, + { + "epoch": 4.722188449848025, + "grad_norm": 0.07365357247373885, + "learning_rate": 3.6082768642459874e-06, + "loss": 0.5118, + "step": 9709 + }, + { + "epoch": 4.722674772036474, + "grad_norm": 0.07410324829943231, + "learning_rate": 3.6073585061777317e-06, + "loss": 0.5324, + "step": 9710 + }, + { + "epoch": 4.723161094224924, + "grad_norm": 0.07193067339448064, + "learning_rate": 3.6064401990390073e-06, + "loss": 0.4812, + "step": 9711 + }, + { + "epoch": 4.723647416413374, + "grad_norm": 0.07651798257715141, + "learning_rate": 3.6055219428634004e-06, + "loss": 0.4877, + "step": 9712 + }, + { + "epoch": 4.724133738601823, + "grad_norm": 0.07176851181679521, + "learning_rate": 3.6046037376844874e-06, + "loss": 0.4793, + "step": 9713 + }, + { + "epoch": 4.724620060790274, + "grad_norm": 0.07150576438483817, + "learning_rate": 3.60368558353585e-06, + "loss": 0.4693, + "step": 9714 + }, + { + "epoch": 4.725106382978724, + "grad_norm": 0.07283998278940824, + "learning_rate": 3.6027674804510648e-06, + "loss": 0.5048, + "step": 9715 + }, + { + "epoch": 4.725592705167173, + "grad_norm": 0.07580647852892412, + "learning_rate": 3.6018494284637096e-06, + "loss": 0.4965, + "step": 9716 + }, + { + "epoch": 4.726079027355623, + "grad_norm": 0.0752706878266174, + "learning_rate": 3.6009314276073543e-06, + "loss": 0.5062, + "step": 9717 + }, + { + "epoch": 4.726565349544073, + "grad_norm": 0.07383080829912109, + "learning_rate": 3.6000134779155727e-06, + "loss": 0.4723, + "step": 9718 + }, + { + "epoch": 4.727051671732523, + "grad_norm": 0.07044824552727634, + "learning_rate": 3.5990955794219335e-06, + "loss": 0.4937, + "step": 9719 + }, + { + "epoch": 4.727537993920973, + "grad_norm": 0.07434810067840288, + "learning_rate": 3.5981777321600077e-06, + "loss": 0.5173, + "step": 9720 + }, + { + "epoch": 4.728024316109423, + "grad_norm": 0.07265568359586524, + "learning_rate": 3.5972599361633564e-06, + "loss": 0.4718, + "step": 9721 + }, + { + "epoch": 4.728510638297872, + "grad_norm": 0.07551258315662726, + "learning_rate": 3.5963421914655492e-06, + "loss": 0.5122, + "step": 9722 + }, + { + "epoch": 4.728996960486322, + "grad_norm": 0.07652969660092714, + "learning_rate": 3.595424498100144e-06, + "loss": 0.493, + "step": 9723 + }, + { + "epoch": 4.729483282674772, + "grad_norm": 0.07578458077270042, + "learning_rate": 3.5945068561007037e-06, + "loss": 0.5226, + "step": 9724 + }, + { + "epoch": 4.729969604863222, + "grad_norm": 0.07364530256160033, + "learning_rate": 3.593589265500784e-06, + "loss": 0.4909, + "step": 9725 + }, + { + "epoch": 4.730455927051672, + "grad_norm": 0.07222076382519117, + "learning_rate": 3.5926717263339458e-06, + "loss": 0.4647, + "step": 9726 + }, + { + "epoch": 4.730942249240122, + "grad_norm": 0.07441978681154107, + "learning_rate": 3.5917542386337427e-06, + "loss": 0.5091, + "step": 9727 + }, + { + "epoch": 4.731428571428571, + "grad_norm": 0.075908317255612, + "learning_rate": 3.590836802433725e-06, + "loss": 0.5296, + "step": 9728 + }, + { + "epoch": 4.731914893617021, + "grad_norm": 0.07281737757326617, + "learning_rate": 3.589919417767447e-06, + "loss": 0.4821, + "step": 9729 + }, + { + "epoch": 4.732401215805471, + "grad_norm": 0.07147181446720538, + "learning_rate": 3.5890020846684557e-06, + "loss": 0.4894, + "step": 9730 + }, + { + "epoch": 4.732887537993921, + "grad_norm": 0.07405245263519245, + "learning_rate": 3.5880848031703007e-06, + "loss": 0.5141, + "step": 9731 + }, + { + "epoch": 4.733373860182371, + "grad_norm": 0.07212471909025862, + "learning_rate": 3.587167573306525e-06, + "loss": 0.5183, + "step": 9732 + }, + { + "epoch": 4.733860182370821, + "grad_norm": 0.07246583488246103, + "learning_rate": 3.5862503951106738e-06, + "loss": 0.5127, + "step": 9733 + }, + { + "epoch": 4.73434650455927, + "grad_norm": 0.07313886099581665, + "learning_rate": 3.585333268616286e-06, + "loss": 0.4778, + "step": 9734 + }, + { + "epoch": 4.73483282674772, + "grad_norm": 0.07713045774273816, + "learning_rate": 3.5844161938569044e-06, + "loss": 0.4919, + "step": 9735 + }, + { + "epoch": 4.735319148936171, + "grad_norm": 0.07886834976454239, + "learning_rate": 3.5834991708660648e-06, + "loss": 0.5594, + "step": 9736 + }, + { + "epoch": 4.73580547112462, + "grad_norm": 0.07335635439637181, + "learning_rate": 3.5825821996773067e-06, + "loss": 0.5094, + "step": 9737 + }, + { + "epoch": 4.73629179331307, + "grad_norm": 0.07388011356129526, + "learning_rate": 3.5816652803241593e-06, + "loss": 0.5072, + "step": 9738 + }, + { + "epoch": 4.73677811550152, + "grad_norm": 0.07403671210681402, + "learning_rate": 3.5807484128401577e-06, + "loss": 0.524, + "step": 9739 + }, + { + "epoch": 4.737264437689969, + "grad_norm": 0.07226507872097664, + "learning_rate": 3.5798315972588306e-06, + "loss": 0.4812, + "step": 9740 + }, + { + "epoch": 4.73775075987842, + "grad_norm": 0.07565678335867206, + "learning_rate": 3.5789148336137085e-06, + "loss": 0.5327, + "step": 9741 + }, + { + "epoch": 4.7382370820668696, + "grad_norm": 0.07752654608280349, + "learning_rate": 3.5779981219383153e-06, + "loss": 0.5104, + "step": 9742 + }, + { + "epoch": 4.738723404255319, + "grad_norm": 0.07694209300471293, + "learning_rate": 3.5770814622661775e-06, + "loss": 0.5811, + "step": 9743 + }, + { + "epoch": 4.739209726443769, + "grad_norm": 0.07164510482494477, + "learning_rate": 3.5761648546308163e-06, + "loss": 0.4723, + "step": 9744 + }, + { + "epoch": 4.739696048632219, + "grad_norm": 0.07344454350082696, + "learning_rate": 3.5752482990657557e-06, + "loss": 0.4851, + "step": 9745 + }, + { + "epoch": 4.740182370820669, + "grad_norm": 0.0706854632627505, + "learning_rate": 3.5743317956045093e-06, + "loss": 0.472, + "step": 9746 + }, + { + "epoch": 4.740668693009119, + "grad_norm": 0.0753541275425176, + "learning_rate": 3.5734153442805993e-06, + "loss": 0.5155, + "step": 9747 + }, + { + "epoch": 4.7411550151975685, + "grad_norm": 0.07368347283064641, + "learning_rate": 3.572498945127536e-06, + "loss": 0.5103, + "step": 9748 + }, + { + "epoch": 4.741641337386018, + "grad_norm": 0.07400782047150341, + "learning_rate": 3.5715825981788353e-06, + "loss": 0.5178, + "step": 9749 + }, + { + "epoch": 4.742127659574468, + "grad_norm": 0.07580568293924207, + "learning_rate": 3.570666303468008e-06, + "loss": 0.5416, + "step": 9750 + }, + { + "epoch": 4.742613981762918, + "grad_norm": 0.07589635897475323, + "learning_rate": 3.569750061028565e-06, + "loss": 0.5311, + "step": 9751 + }, + { + "epoch": 4.743100303951368, + "grad_norm": 0.07146293716591269, + "learning_rate": 3.56883387089401e-06, + "loss": 0.4885, + "step": 9752 + }, + { + "epoch": 4.743586626139818, + "grad_norm": 0.0754189832744774, + "learning_rate": 3.567917733097851e-06, + "loss": 0.5175, + "step": 9753 + }, + { + "epoch": 4.7440729483282675, + "grad_norm": 0.07504084512620407, + "learning_rate": 3.5670016476735916e-06, + "loss": 0.5452, + "step": 9754 + }, + { + "epoch": 4.744559270516717, + "grad_norm": 0.07493569225025028, + "learning_rate": 3.5660856146547316e-06, + "loss": 0.5081, + "step": 9755 + }, + { + "epoch": 4.745045592705167, + "grad_norm": 0.07679996303387869, + "learning_rate": 3.5651696340747747e-06, + "loss": 0.4957, + "step": 9756 + }, + { + "epoch": 4.745531914893617, + "grad_norm": 0.07730414505923569, + "learning_rate": 3.5642537059672142e-06, + "loss": 0.572, + "step": 9757 + }, + { + "epoch": 4.746018237082067, + "grad_norm": 0.07264580644260071, + "learning_rate": 3.5633378303655486e-06, + "loss": 0.4766, + "step": 9758 + }, + { + "epoch": 4.746504559270517, + "grad_norm": 0.07300710064794234, + "learning_rate": 3.5624220073032707e-06, + "loss": 0.4882, + "step": 9759 + }, + { + "epoch": 4.7469908814589665, + "grad_norm": 0.0776537167036983, + "learning_rate": 3.561506236813875e-06, + "loss": 0.5465, + "step": 9760 + }, + { + "epoch": 4.747477203647416, + "grad_norm": 0.07252783462957638, + "learning_rate": 3.5605905189308477e-06, + "loss": 0.504, + "step": 9761 + }, + { + "epoch": 4.747963525835866, + "grad_norm": 0.07135799151487464, + "learning_rate": 3.559674853687681e-06, + "loss": 0.5117, + "step": 9762 + }, + { + "epoch": 4.7484498480243165, + "grad_norm": 0.07352622976456395, + "learning_rate": 3.5587592411178574e-06, + "loss": 0.4992, + "step": 9763 + }, + { + "epoch": 4.748936170212766, + "grad_norm": 0.0744112848498426, + "learning_rate": 3.5578436812548637e-06, + "loss": 0.5093, + "step": 9764 + }, + { + "epoch": 4.749422492401216, + "grad_norm": 0.07623190672905851, + "learning_rate": 3.5569281741321813e-06, + "loss": 0.5214, + "step": 9765 + }, + { + "epoch": 4.7499088145896655, + "grad_norm": 0.0789306543042129, + "learning_rate": 3.556012719783293e-06, + "loss": 0.5126, + "step": 9766 + }, + { + "epoch": 4.7499088145896655, + "eval_loss": 0.567747950553894, + "eval_runtime": 105.2197, + "eval_samples_per_second": 288.472, + "eval_steps_per_second": 36.067, + "step": 9766 + }, + { + "epoch": 4.750395136778115, + "grad_norm": 0.07255644356136397, + "learning_rate": 3.5550973182416736e-06, + "loss": 0.5055, + "step": 9767 + }, + { + "epoch": 4.750881458966566, + "grad_norm": 0.07793849841204106, + "learning_rate": 3.554181969540803e-06, + "loss": 0.525, + "step": 9768 + }, + { + "epoch": 4.7513677811550155, + "grad_norm": 0.07258094189185788, + "learning_rate": 3.553266673714153e-06, + "loss": 0.5059, + "step": 9769 + }, + { + "epoch": 4.751854103343465, + "grad_norm": 0.0746562524015995, + "learning_rate": 3.5523514307952e-06, + "loss": 0.5398, + "step": 9770 + }, + { + "epoch": 4.752340425531915, + "grad_norm": 0.07320671991583819, + "learning_rate": 3.551436240817412e-06, + "loss": 0.4994, + "step": 9771 + }, + { + "epoch": 4.7528267477203645, + "grad_norm": 0.0736358204626534, + "learning_rate": 3.5505211038142597e-06, + "loss": 0.4974, + "step": 9772 + }, + { + "epoch": 4.753313069908815, + "grad_norm": 0.07470222006856667, + "learning_rate": 3.5496060198192073e-06, + "loss": 0.5204, + "step": 9773 + }, + { + "epoch": 4.753799392097265, + "grad_norm": 0.07504460776172159, + "learning_rate": 3.5486909888657227e-06, + "loss": 0.5068, + "step": 9774 + }, + { + "epoch": 4.7542857142857144, + "grad_norm": 0.07018892982883108, + "learning_rate": 3.547776010987268e-06, + "loss": 0.4837, + "step": 9775 + }, + { + "epoch": 4.754772036474164, + "grad_norm": 0.07253018225378546, + "learning_rate": 3.5468610862173054e-06, + "loss": 0.543, + "step": 9776 + }, + { + "epoch": 4.755258358662614, + "grad_norm": 0.07301560198511932, + "learning_rate": 3.545946214589291e-06, + "loss": 0.5117, + "step": 9777 + }, + { + "epoch": 4.7557446808510635, + "grad_norm": 0.07409450356552133, + "learning_rate": 3.5450313961366843e-06, + "loss": 0.5161, + "step": 9778 + }, + { + "epoch": 4.756231003039513, + "grad_norm": 0.07640159500494696, + "learning_rate": 3.544116630892942e-06, + "loss": 0.5302, + "step": 9779 + }, + { + "epoch": 4.756717325227964, + "grad_norm": 0.07519060246872211, + "learning_rate": 3.5432019188915147e-06, + "loss": 0.4826, + "step": 9780 + }, + { + "epoch": 4.757203647416413, + "grad_norm": 0.07609169050357863, + "learning_rate": 3.5422872601658566e-06, + "loss": 0.523, + "step": 9781 + }, + { + "epoch": 4.757689969604863, + "grad_norm": 0.07171616715320253, + "learning_rate": 3.541372654749414e-06, + "loss": 0.4725, + "step": 9782 + }, + { + "epoch": 4.758176291793313, + "grad_norm": 0.07603686940243717, + "learning_rate": 3.5404581026756368e-06, + "loss": 0.5247, + "step": 9783 + }, + { + "epoch": 4.7586626139817625, + "grad_norm": 0.0715948939181546, + "learning_rate": 3.539543603977969e-06, + "loss": 0.4676, + "step": 9784 + }, + { + "epoch": 4.759148936170213, + "grad_norm": 0.07195841178651871, + "learning_rate": 3.5386291586898575e-06, + "loss": 0.4959, + "step": 9785 + }, + { + "epoch": 4.759635258358663, + "grad_norm": 0.07360118603985216, + "learning_rate": 3.537714766844739e-06, + "loss": 0.5236, + "step": 9786 + }, + { + "epoch": 4.760121580547112, + "grad_norm": 0.0717758448362556, + "learning_rate": 3.5368004284760584e-06, + "loss": 0.5091, + "step": 9787 + }, + { + "epoch": 4.760607902735562, + "grad_norm": 0.07450682832691077, + "learning_rate": 3.5358861436172487e-06, + "loss": 0.5256, + "step": 9788 + }, + { + "epoch": 4.761094224924012, + "grad_norm": 0.07080516295258174, + "learning_rate": 3.534971912301749e-06, + "loss": 0.4751, + "step": 9789 + }, + { + "epoch": 4.761580547112462, + "grad_norm": 0.07377008894900824, + "learning_rate": 3.534057734562991e-06, + "loss": 0.5109, + "step": 9790 + }, + { + "epoch": 4.762066869300912, + "grad_norm": 0.07401104027256379, + "learning_rate": 3.53314361043441e-06, + "loss": 0.5056, + "step": 9791 + }, + { + "epoch": 4.762553191489362, + "grad_norm": 0.07432988650059785, + "learning_rate": 3.5322295399494307e-06, + "loss": 0.4925, + "step": 9792 + }, + { + "epoch": 4.763039513677811, + "grad_norm": 0.07239458482336424, + "learning_rate": 3.5313155231414855e-06, + "loss": 0.489, + "step": 9793 + }, + { + "epoch": 4.763525835866261, + "grad_norm": 0.071325125694811, + "learning_rate": 3.5304015600439977e-06, + "loss": 0.5018, + "step": 9794 + }, + { + "epoch": 4.764012158054712, + "grad_norm": 0.07683321340533981, + "learning_rate": 3.5294876506903947e-06, + "loss": 0.5081, + "step": 9795 + }, + { + "epoch": 4.764498480243161, + "grad_norm": 0.07464038713559308, + "learning_rate": 3.528573795114094e-06, + "loss": 0.5063, + "step": 9796 + }, + { + "epoch": 4.764984802431611, + "grad_norm": 0.07628768713775802, + "learning_rate": 3.52765999334852e-06, + "loss": 0.533, + "step": 9797 + }, + { + "epoch": 4.765471124620061, + "grad_norm": 0.07375930236235786, + "learning_rate": 3.526746245427087e-06, + "loss": 0.5094, + "step": 9798 + }, + { + "epoch": 4.76595744680851, + "grad_norm": 0.07586561658147456, + "learning_rate": 3.5258325513832157e-06, + "loss": 0.5501, + "step": 9799 + }, + { + "epoch": 4.766443768996961, + "grad_norm": 0.07347675720691822, + "learning_rate": 3.5249189112503156e-06, + "loss": 0.5325, + "step": 9800 + }, + { + "epoch": 4.766930091185411, + "grad_norm": 0.07470445954407519, + "learning_rate": 3.5240053250618035e-06, + "loss": 0.4977, + "step": 9801 + }, + { + "epoch": 4.76741641337386, + "grad_norm": 0.07443680061146025, + "learning_rate": 3.5230917928510844e-06, + "loss": 0.4815, + "step": 9802 + }, + { + "epoch": 4.76790273556231, + "grad_norm": 0.07463936270291485, + "learning_rate": 3.522178314651571e-06, + "loss": 0.4965, + "step": 9803 + }, + { + "epoch": 4.76838905775076, + "grad_norm": 0.07388399009859531, + "learning_rate": 3.5212648904966675e-06, + "loss": 0.5146, + "step": 9804 + }, + { + "epoch": 4.768875379939209, + "grad_norm": 0.07351295468656954, + "learning_rate": 3.5203515204197774e-06, + "loss": 0.4994, + "step": 9805 + }, + { + "epoch": 4.769361702127659, + "grad_norm": 0.07552166616510825, + "learning_rate": 3.519438204454307e-06, + "loss": 0.4943, + "step": 9806 + }, + { + "epoch": 4.76984802431611, + "grad_norm": 0.07269306158659065, + "learning_rate": 3.5185249426336526e-06, + "loss": 0.4896, + "step": 9807 + }, + { + "epoch": 4.770334346504559, + "grad_norm": 0.07190776536994783, + "learning_rate": 3.5176117349912153e-06, + "loss": 0.4757, + "step": 9808 + }, + { + "epoch": 4.770820668693009, + "grad_norm": 0.07425955366321417, + "learning_rate": 3.516698581560388e-06, + "loss": 0.5768, + "step": 9809 + }, + { + "epoch": 4.771306990881459, + "grad_norm": 0.07757335723694997, + "learning_rate": 3.5157854823745706e-06, + "loss": 0.5572, + "step": 9810 + }, + { + "epoch": 4.771793313069908, + "grad_norm": 0.07365121798165827, + "learning_rate": 3.5148724374671504e-06, + "loss": 0.514, + "step": 9811 + }, + { + "epoch": 4.772279635258359, + "grad_norm": 0.07392207043245279, + "learning_rate": 3.513959446871521e-06, + "loss": 0.5189, + "step": 9812 + }, + { + "epoch": 4.772765957446809, + "grad_norm": 0.07275929456702032, + "learning_rate": 3.5130465106210683e-06, + "loss": 0.4925, + "step": 9813 + }, + { + "epoch": 4.773252279635258, + "grad_norm": 0.07493324808021674, + "learning_rate": 3.5121336287491827e-06, + "loss": 0.5239, + "step": 9814 + }, + { + "epoch": 4.773738601823708, + "grad_norm": 0.07188830018941737, + "learning_rate": 3.5112208012892434e-06, + "loss": 0.5226, + "step": 9815 + }, + { + "epoch": 4.774224924012158, + "grad_norm": 0.07188461585063607, + "learning_rate": 3.510308028274638e-06, + "loss": 0.5191, + "step": 9816 + }, + { + "epoch": 4.774711246200608, + "grad_norm": 0.07750324293504426, + "learning_rate": 3.5093953097387432e-06, + "loss": 0.5421, + "step": 9817 + }, + { + "epoch": 4.775197568389058, + "grad_norm": 0.07247154017826295, + "learning_rate": 3.5084826457149403e-06, + "loss": 0.4997, + "step": 9818 + }, + { + "epoch": 4.775683890577508, + "grad_norm": 0.07593107666964263, + "learning_rate": 3.5075700362366037e-06, + "loss": 0.5221, + "step": 9819 + }, + { + "epoch": 4.776170212765957, + "grad_norm": 0.0794232921091397, + "learning_rate": 3.5066574813371107e-06, + "loss": 0.5347, + "step": 9820 + }, + { + "epoch": 4.776656534954407, + "grad_norm": 0.07369308489835924, + "learning_rate": 3.5057449810498303e-06, + "loss": 0.5143, + "step": 9821 + }, + { + "epoch": 4.777142857142858, + "grad_norm": 0.07511724406723964, + "learning_rate": 3.5048325354081355e-06, + "loss": 0.5183, + "step": 9822 + }, + { + "epoch": 4.777629179331307, + "grad_norm": 0.082368562252736, + "learning_rate": 3.503920144445393e-06, + "loss": 0.5434, + "step": 9823 + }, + { + "epoch": 4.778115501519757, + "grad_norm": 0.0744682451149145, + "learning_rate": 3.5030078081949727e-06, + "loss": 0.5018, + "step": 9824 + }, + { + "epoch": 4.778601823708207, + "grad_norm": 0.07579777787978553, + "learning_rate": 3.5020955266902344e-06, + "loss": 0.5132, + "step": 9825 + }, + { + "epoch": 4.779088145896656, + "grad_norm": 0.07226973858210134, + "learning_rate": 3.5011832999645466e-06, + "loss": 0.5196, + "step": 9826 + }, + { + "epoch": 4.779574468085106, + "grad_norm": 0.07178286219415975, + "learning_rate": 3.5002711280512638e-06, + "loss": 0.5039, + "step": 9827 + }, + { + "epoch": 4.780060790273557, + "grad_norm": 0.07331009740163545, + "learning_rate": 3.499359010983748e-06, + "loss": 0.5182, + "step": 9828 + }, + { + "epoch": 4.780547112462006, + "grad_norm": 0.07811334721069643, + "learning_rate": 3.4984469487953537e-06, + "loss": 0.5336, + "step": 9829 + }, + { + "epoch": 4.781033434650456, + "grad_norm": 0.0746750737897446, + "learning_rate": 3.497534941519437e-06, + "loss": 0.5206, + "step": 9830 + }, + { + "epoch": 4.781519756838906, + "grad_norm": 0.07919322016925144, + "learning_rate": 3.496622989189352e-06, + "loss": 0.5021, + "step": 9831 + }, + { + "epoch": 4.782006079027355, + "grad_norm": 0.07441818594727875, + "learning_rate": 3.4957110918384457e-06, + "loss": 0.5299, + "step": 9832 + }, + { + "epoch": 4.782492401215805, + "grad_norm": 0.07490565547441784, + "learning_rate": 3.4947992495000693e-06, + "loss": 0.4936, + "step": 9833 + }, + { + "epoch": 4.782978723404256, + "grad_norm": 0.07493913993770857, + "learning_rate": 3.4938874622075664e-06, + "loss": 0.5459, + "step": 9834 + }, + { + "epoch": 4.783465045592705, + "grad_norm": 0.07405899335629826, + "learning_rate": 3.4929757299942856e-06, + "loss": 0.502, + "step": 9835 + }, + { + "epoch": 4.783951367781155, + "grad_norm": 0.0749874857172059, + "learning_rate": 3.492064052893565e-06, + "loss": 0.5163, + "step": 9836 + }, + { + "epoch": 4.784437689969605, + "grad_norm": 0.06983182178092108, + "learning_rate": 3.4911524309387486e-06, + "loss": 0.4824, + "step": 9837 + }, + { + "epoch": 4.784924012158054, + "grad_norm": 0.072479363122535, + "learning_rate": 3.4902408641631712e-06, + "loss": 0.541, + "step": 9838 + }, + { + "epoch": 4.785410334346505, + "grad_norm": 0.07415528128373292, + "learning_rate": 3.489329352600175e-06, + "loss": 0.515, + "step": 9839 + }, + { + "epoch": 4.7858966565349546, + "grad_norm": 0.07654566834136997, + "learning_rate": 3.4884178962830873e-06, + "loss": 0.5433, + "step": 9840 + }, + { + "epoch": 4.786382978723404, + "grad_norm": 0.0722118979317469, + "learning_rate": 3.4875064952452465e-06, + "loss": 0.4841, + "step": 9841 + }, + { + "epoch": 4.786869300911854, + "grad_norm": 0.07785931538963002, + "learning_rate": 3.4865951495199777e-06, + "loss": 0.5142, + "step": 9842 + }, + { + "epoch": 4.787355623100304, + "grad_norm": 0.07249086345814795, + "learning_rate": 3.4856838591406133e-06, + "loss": 0.5045, + "step": 9843 + }, + { + "epoch": 4.787841945288754, + "grad_norm": 0.07386509165721343, + "learning_rate": 3.4847726241404773e-06, + "loss": 0.5179, + "step": 9844 + }, + { + "epoch": 4.788328267477204, + "grad_norm": 0.07409307035709214, + "learning_rate": 3.4838614445528966e-06, + "loss": 0.5143, + "step": 9845 + }, + { + "epoch": 4.7888145896656535, + "grad_norm": 0.07588906375978266, + "learning_rate": 3.4829503204111897e-06, + "loss": 0.5586, + "step": 9846 + }, + { + "epoch": 4.789300911854103, + "grad_norm": 0.07410715122236765, + "learning_rate": 3.48203925174868e-06, + "loss": 0.5044, + "step": 9847 + }, + { + "epoch": 4.789787234042553, + "grad_norm": 0.07308231934404547, + "learning_rate": 3.4811282385986835e-06, + "loss": 0.489, + "step": 9848 + }, + { + "epoch": 4.7902735562310035, + "grad_norm": 0.07379629850994263, + "learning_rate": 3.480217280994519e-06, + "loss": 0.5013, + "step": 9849 + }, + { + "epoch": 4.790759878419453, + "grad_norm": 0.07192926801370358, + "learning_rate": 3.479306378969497e-06, + "loss": 0.5042, + "step": 9850 + }, + { + "epoch": 4.791246200607903, + "grad_norm": 0.07385691051176226, + "learning_rate": 3.478395532556933e-06, + "loss": 0.5081, + "step": 9851 + }, + { + "epoch": 4.7917325227963525, + "grad_norm": 0.071388404141872, + "learning_rate": 3.4774847417901345e-06, + "loss": 0.4972, + "step": 9852 + }, + { + "epoch": 4.792218844984802, + "grad_norm": 0.07501240873261915, + "learning_rate": 3.4765740067024133e-06, + "loss": 0.5154, + "step": 9853 + }, + { + "epoch": 4.792705167173252, + "grad_norm": 0.07316167562038231, + "learning_rate": 3.47566332732707e-06, + "loss": 0.5131, + "step": 9854 + }, + { + "epoch": 4.7931914893617025, + "grad_norm": 0.07549758956830742, + "learning_rate": 3.4747527036974137e-06, + "loss": 0.5321, + "step": 9855 + }, + { + "epoch": 4.793677811550152, + "grad_norm": 0.07558350175019174, + "learning_rate": 3.4738421358467417e-06, + "loss": 0.5369, + "step": 9856 + }, + { + "epoch": 4.794164133738602, + "grad_norm": 0.07238987935246406, + "learning_rate": 3.4729316238083564e-06, + "loss": 0.5042, + "step": 9857 + }, + { + "epoch": 4.7946504559270515, + "grad_norm": 0.07337581550885319, + "learning_rate": 3.4720211676155564e-06, + "loss": 0.4975, + "step": 9858 + }, + { + "epoch": 4.795136778115501, + "grad_norm": 0.0758419292319453, + "learning_rate": 3.4711107673016355e-06, + "loss": 0.509, + "step": 9859 + }, + { + "epoch": 4.795623100303951, + "grad_norm": 0.07232613874457919, + "learning_rate": 3.47020042289989e-06, + "loss": 0.494, + "step": 9860 + }, + { + "epoch": 4.7961094224924015, + "grad_norm": 0.07219658888394759, + "learning_rate": 3.4692901344436085e-06, + "loss": 0.4864, + "step": 9861 + }, + { + "epoch": 4.796595744680851, + "grad_norm": 0.07382802307391192, + "learning_rate": 3.4683799019660834e-06, + "loss": 0.5029, + "step": 9862 + }, + { + "epoch": 4.797082066869301, + "grad_norm": 0.07516563350058748, + "learning_rate": 3.4674697255005995e-06, + "loss": 0.53, + "step": 9863 + }, + { + "epoch": 4.7975683890577505, + "grad_norm": 0.0751437166769319, + "learning_rate": 3.466559605080447e-06, + "loss": 0.4872, + "step": 9864 + }, + { + "epoch": 4.7980547112462, + "grad_norm": 0.07608862040643435, + "learning_rate": 3.4656495407389033e-06, + "loss": 0.5129, + "step": 9865 + }, + { + "epoch": 4.798541033434651, + "grad_norm": 0.07379749351457024, + "learning_rate": 3.464739532509256e-06, + "loss": 0.5001, + "step": 9866 + }, + { + "epoch": 4.7990273556231005, + "grad_norm": 0.07078433037537617, + "learning_rate": 3.463829580424779e-06, + "loss": 0.5016, + "step": 9867 + }, + { + "epoch": 4.79951367781155, + "grad_norm": 0.0727552629446106, + "learning_rate": 3.462919684518753e-06, + "loss": 0.5163, + "step": 9868 + }, + { + "epoch": 4.8, + "grad_norm": 0.07409056608308609, + "learning_rate": 3.462009844824451e-06, + "loss": 0.4951, + "step": 9869 + }, + { + "epoch": 4.8004863221884495, + "grad_norm": 0.07443712380351969, + "learning_rate": 3.461100061375151e-06, + "loss": 0.5073, + "step": 9870 + }, + { + "epoch": 4.8009726443769, + "grad_norm": 0.07532965553008598, + "learning_rate": 3.460190334204118e-06, + "loss": 0.5075, + "step": 9871 + }, + { + "epoch": 4.80145896656535, + "grad_norm": 0.07509388942729486, + "learning_rate": 3.459280663344625e-06, + "loss": 0.5399, + "step": 9872 + }, + { + "epoch": 4.8019452887537994, + "grad_norm": 0.07380387466634364, + "learning_rate": 3.4583710488299375e-06, + "loss": 0.5014, + "step": 9873 + }, + { + "epoch": 4.802431610942249, + "grad_norm": 0.07467597097277383, + "learning_rate": 3.4574614906933234e-06, + "loss": 0.5446, + "step": 9874 + }, + { + "epoch": 4.802917933130699, + "grad_norm": 0.07315069463562376, + "learning_rate": 3.456551988968041e-06, + "loss": 0.4979, + "step": 9875 + }, + { + "epoch": 4.803404255319149, + "grad_norm": 0.07279755130071162, + "learning_rate": 3.455642543687355e-06, + "loss": 0.5275, + "step": 9876 + }, + { + "epoch": 4.803890577507599, + "grad_norm": 0.07554319797421721, + "learning_rate": 3.454733154884521e-06, + "loss": 0.5181, + "step": 9877 + }, + { + "epoch": 4.804376899696049, + "grad_norm": 0.07299003141026666, + "learning_rate": 3.4538238225928e-06, + "loss": 0.4791, + "step": 9878 + }, + { + "epoch": 4.804863221884498, + "grad_norm": 0.07632411918381653, + "learning_rate": 3.4529145468454427e-06, + "loss": 0.512, + "step": 9879 + }, + { + "epoch": 4.805349544072948, + "grad_norm": 0.07592163029083701, + "learning_rate": 3.452005327675705e-06, + "loss": 0.5312, + "step": 9880 + }, + { + "epoch": 4.805835866261398, + "grad_norm": 0.07297920857642748, + "learning_rate": 3.4510961651168328e-06, + "loss": 0.4882, + "step": 9881 + }, + { + "epoch": 4.806322188449848, + "grad_norm": 0.07433125933882768, + "learning_rate": 3.4501870592020802e-06, + "loss": 0.486, + "step": 9882 + }, + { + "epoch": 4.806808510638298, + "grad_norm": 0.07266363866821993, + "learning_rate": 3.4492780099646887e-06, + "loss": 0.4901, + "step": 9883 + }, + { + "epoch": 4.807294832826748, + "grad_norm": 0.07452999573080502, + "learning_rate": 3.4483690174379055e-06, + "loss": 0.5051, + "step": 9884 + }, + { + "epoch": 4.807781155015197, + "grad_norm": 0.07581845063732459, + "learning_rate": 3.447460081654974e-06, + "loss": 0.5129, + "step": 9885 + }, + { + "epoch": 4.808267477203647, + "grad_norm": 0.07281744585970004, + "learning_rate": 3.446551202649131e-06, + "loss": 0.506, + "step": 9886 + }, + { + "epoch": 4.808753799392097, + "grad_norm": 0.07616365660500832, + "learning_rate": 3.445642380453617e-06, + "loss": 0.5056, + "step": 9887 + }, + { + "epoch": 4.809240121580547, + "grad_norm": 0.07397139837369174, + "learning_rate": 3.4447336151016663e-06, + "loss": 0.4928, + "step": 9888 + }, + { + "epoch": 4.809726443768997, + "grad_norm": 0.07742794319823819, + "learning_rate": 3.4438249066265163e-06, + "loss": 0.5408, + "step": 9889 + }, + { + "epoch": 4.810212765957447, + "grad_norm": 0.07654845924557889, + "learning_rate": 3.4429162550613937e-06, + "loss": 0.4991, + "step": 9890 + }, + { + "epoch": 4.810699088145896, + "grad_norm": 0.07482347257278676, + "learning_rate": 3.4420076604395327e-06, + "loss": 0.5311, + "step": 9891 + }, + { + "epoch": 4.811185410334346, + "grad_norm": 0.07469645729465225, + "learning_rate": 3.441099122794158e-06, + "loss": 0.5222, + "step": 9892 + }, + { + "epoch": 4.811671732522797, + "grad_norm": 0.0784919170775449, + "learning_rate": 3.4401906421584996e-06, + "loss": 0.5307, + "step": 9893 + }, + { + "epoch": 4.812158054711246, + "grad_norm": 0.0754819640765405, + "learning_rate": 3.4392822185657747e-06, + "loss": 0.5187, + "step": 9894 + }, + { + "epoch": 4.812644376899696, + "grad_norm": 0.07923219645377234, + "learning_rate": 3.438373852049211e-06, + "loss": 0.5752, + "step": 9895 + }, + { + "epoch": 4.813130699088146, + "grad_norm": 0.0750010896940689, + "learning_rate": 3.437465542642023e-06, + "loss": 0.5538, + "step": 9896 + }, + { + "epoch": 4.813617021276595, + "grad_norm": 0.07536977279665356, + "learning_rate": 3.4365572903774304e-06, + "loss": 0.5484, + "step": 9897 + }, + { + "epoch": 4.814103343465046, + "grad_norm": 0.07369347055207746, + "learning_rate": 3.4356490952886477e-06, + "loss": 0.4911, + "step": 9898 + }, + { + "epoch": 4.814589665653496, + "grad_norm": 0.07269023541954552, + "learning_rate": 3.4347409574088896e-06, + "loss": 0.4764, + "step": 9899 + }, + { + "epoch": 4.815075987841945, + "grad_norm": 0.0740189798211945, + "learning_rate": 3.433832876771365e-06, + "loss": 0.5103, + "step": 9900 + }, + { + "epoch": 4.815562310030395, + "grad_norm": 0.07318115419514643, + "learning_rate": 3.432924853409283e-06, + "loss": 0.5227, + "step": 9901 + }, + { + "epoch": 4.816048632218845, + "grad_norm": 0.07064213584603708, + "learning_rate": 3.432016887355851e-06, + "loss": 0.4964, + "step": 9902 + }, + { + "epoch": 4.816534954407295, + "grad_norm": 0.0709788647563823, + "learning_rate": 3.431108978644276e-06, + "loss": 0.4695, + "step": 9903 + }, + { + "epoch": 4.817021276595745, + "grad_norm": 0.07366166983146442, + "learning_rate": 3.430201127307756e-06, + "loss": 0.503, + "step": 9904 + }, + { + "epoch": 4.817507598784195, + "grad_norm": 0.07349084825580836, + "learning_rate": 3.4292933333794955e-06, + "loss": 0.5165, + "step": 9905 + }, + { + "epoch": 4.817993920972644, + "grad_norm": 0.07310470058748092, + "learning_rate": 3.428385596892689e-06, + "loss": 0.5125, + "step": 9906 + }, + { + "epoch": 4.818480243161094, + "grad_norm": 0.07388463356470455, + "learning_rate": 3.427477917880539e-06, + "loss": 0.5001, + "step": 9907 + }, + { + "epoch": 4.818966565349544, + "grad_norm": 0.0727782842341365, + "learning_rate": 3.426570296376233e-06, + "loss": 0.5067, + "step": 9908 + }, + { + "epoch": 4.819452887537994, + "grad_norm": 0.07376985977931319, + "learning_rate": 3.4256627324129667e-06, + "loss": 0.5136, + "step": 9909 + }, + { + "epoch": 4.819939209726444, + "grad_norm": 0.07728120971890819, + "learning_rate": 3.424755226023931e-06, + "loss": 0.5267, + "step": 9910 + }, + { + "epoch": 4.820425531914894, + "grad_norm": 0.07264030809979652, + "learning_rate": 3.423847777242311e-06, + "loss": 0.522, + "step": 9911 + }, + { + "epoch": 4.820911854103343, + "grad_norm": 0.07501985726659798, + "learning_rate": 3.4229403861012938e-06, + "loss": 0.5038, + "step": 9912 + }, + { + "epoch": 4.821398176291793, + "grad_norm": 0.07420533693071835, + "learning_rate": 3.4220330526340627e-06, + "loss": 0.4887, + "step": 9913 + }, + { + "epoch": 4.821884498480243, + "grad_norm": 0.07390935663500048, + "learning_rate": 3.4211257768738014e-06, + "loss": 0.5466, + "step": 9914 + }, + { + "epoch": 4.822370820668693, + "grad_norm": 0.07501816611662929, + "learning_rate": 3.420218558853687e-06, + "loss": 0.5299, + "step": 9915 + }, + { + "epoch": 4.822857142857143, + "grad_norm": 0.07825381389850279, + "learning_rate": 3.4193113986068975e-06, + "loss": 0.5324, + "step": 9916 + }, + { + "epoch": 4.823343465045593, + "grad_norm": 0.07472965993156623, + "learning_rate": 3.4184042961666077e-06, + "loss": 0.5493, + "step": 9917 + }, + { + "epoch": 4.823829787234042, + "grad_norm": 0.07353954208778861, + "learning_rate": 3.417497251565993e-06, + "loss": 0.5147, + "step": 9918 + }, + { + "epoch": 4.824316109422492, + "grad_norm": 0.07225236763041619, + "learning_rate": 3.416590264838221e-06, + "loss": 0.5115, + "step": 9919 + }, + { + "epoch": 4.824802431610943, + "grad_norm": 0.07445816652460352, + "learning_rate": 3.415683336016465e-06, + "loss": 0.4974, + "step": 9920 + }, + { + "epoch": 4.825288753799392, + "grad_norm": 0.07442185604207374, + "learning_rate": 3.4147764651338867e-06, + "loss": 0.5102, + "step": 9921 + }, + { + "epoch": 4.825775075987842, + "grad_norm": 0.07508518707252185, + "learning_rate": 3.4138696522236536e-06, + "loss": 0.4982, + "step": 9922 + }, + { + "epoch": 4.826261398176292, + "grad_norm": 0.07477285477857919, + "learning_rate": 3.4129628973189276e-06, + "loss": 0.5183, + "step": 9923 + }, + { + "epoch": 4.826747720364741, + "grad_norm": 0.07287682275463758, + "learning_rate": 3.412056200452871e-06, + "loss": 0.5122, + "step": 9924 + }, + { + "epoch": 4.827234042553192, + "grad_norm": 0.07563406363863467, + "learning_rate": 3.41114956165864e-06, + "loss": 0.504, + "step": 9925 + }, + { + "epoch": 4.827720364741642, + "grad_norm": 0.07401860667094659, + "learning_rate": 3.410242980969391e-06, + "loss": 0.5006, + "step": 9926 + }, + { + "epoch": 4.828206686930091, + "grad_norm": 0.07365967112844747, + "learning_rate": 3.4093364584182776e-06, + "loss": 0.5106, + "step": 9927 + }, + { + "epoch": 4.828693009118541, + "grad_norm": 0.0737457879408845, + "learning_rate": 3.4084299940384545e-06, + "loss": 0.4901, + "step": 9928 + }, + { + "epoch": 4.829179331306991, + "grad_norm": 0.08344830942181425, + "learning_rate": 3.4075235878630687e-06, + "loss": 0.5213, + "step": 9929 + }, + { + "epoch": 4.829665653495441, + "grad_norm": 0.07286199829243596, + "learning_rate": 3.4066172399252684e-06, + "loss": 0.502, + "step": 9930 + }, + { + "epoch": 4.830151975683891, + "grad_norm": 0.07326707000163565, + "learning_rate": 3.4057109502581993e-06, + "loss": 0.499, + "step": 9931 + }, + { + "epoch": 4.830638297872341, + "grad_norm": 0.07373937864381767, + "learning_rate": 3.404804718895007e-06, + "loss": 0.5106, + "step": 9932 + }, + { + "epoch": 4.83112462006079, + "grad_norm": 0.07384152959034693, + "learning_rate": 3.403898545868829e-06, + "loss": 0.5255, + "step": 9933 + }, + { + "epoch": 4.83161094224924, + "grad_norm": 0.0764035663334682, + "learning_rate": 3.402992431212808e-06, + "loss": 0.557, + "step": 9934 + }, + { + "epoch": 4.83209726443769, + "grad_norm": 0.07265171373181742, + "learning_rate": 3.4020863749600775e-06, + "loss": 0.4987, + "step": 9935 + }, + { + "epoch": 4.83258358662614, + "grad_norm": 0.07304273683395976, + "learning_rate": 3.401180377143774e-06, + "loss": 0.5208, + "step": 9936 + }, + { + "epoch": 4.83306990881459, + "grad_norm": 0.07492914260466464, + "learning_rate": 3.4002744377970315e-06, + "loss": 0.5012, + "step": 9937 + }, + { + "epoch": 4.8335562310030395, + "grad_norm": 0.07084420139606204, + "learning_rate": 3.399368556952979e-06, + "loss": 0.4829, + "step": 9938 + }, + { + "epoch": 4.834042553191489, + "grad_norm": 0.07476823603231, + "learning_rate": 3.3984627346447474e-06, + "loss": 0.5349, + "step": 9939 + }, + { + "epoch": 4.834528875379939, + "grad_norm": 0.07410193222690933, + "learning_rate": 3.397556970905459e-06, + "loss": 0.491, + "step": 9940 + }, + { + "epoch": 4.835015197568389, + "grad_norm": 0.0718510477340495, + "learning_rate": 3.3966512657682417e-06, + "loss": 0.4798, + "step": 9941 + }, + { + "epoch": 4.835501519756839, + "grad_norm": 0.07290836216206803, + "learning_rate": 3.3957456192662143e-06, + "loss": 0.4878, + "step": 9942 + }, + { + "epoch": 4.835987841945289, + "grad_norm": 0.07024778130214428, + "learning_rate": 3.3948400314325007e-06, + "loss": 0.4837, + "step": 9943 + }, + { + "epoch": 4.8364741641337385, + "grad_norm": 0.0755056860751704, + "learning_rate": 3.3939345023002146e-06, + "loss": 0.5187, + "step": 9944 + }, + { + "epoch": 4.836960486322188, + "grad_norm": 0.07450840446820256, + "learning_rate": 3.3930290319024746e-06, + "loss": 0.5228, + "step": 9945 + }, + { + "epoch": 4.837446808510638, + "grad_norm": 0.0741644843374763, + "learning_rate": 3.3921236202723916e-06, + "loss": 0.5003, + "step": 9946 + }, + { + "epoch": 4.8379331306990885, + "grad_norm": 0.07334184543332505, + "learning_rate": 3.3912182674430805e-06, + "loss": 0.4974, + "step": 9947 + }, + { + "epoch": 4.838419452887538, + "grad_norm": 0.07505031434502361, + "learning_rate": 3.390312973447646e-06, + "loss": 0.5294, + "step": 9948 + }, + { + "epoch": 4.838905775075988, + "grad_norm": 0.07455947563647232, + "learning_rate": 3.3894077383192e-06, + "loss": 0.5421, + "step": 9949 + }, + { + "epoch": 4.8393920972644375, + "grad_norm": 0.07469136038362498, + "learning_rate": 3.388502562090842e-06, + "loss": 0.5113, + "step": 9950 + }, + { + "epoch": 4.839878419452887, + "grad_norm": 0.07552243990510667, + "learning_rate": 3.3875974447956795e-06, + "loss": 0.5047, + "step": 9951 + }, + { + "epoch": 4.840364741641338, + "grad_norm": 0.07188136474218193, + "learning_rate": 3.386692386466809e-06, + "loss": 0.5163, + "step": 9952 + }, + { + "epoch": 4.8408510638297875, + "grad_norm": 0.0732656384061756, + "learning_rate": 3.385787387137333e-06, + "loss": 0.5086, + "step": 9953 + }, + { + "epoch": 4.841337386018237, + "grad_norm": 0.0763368841400177, + "learning_rate": 3.384882446840344e-06, + "loss": 0.5165, + "step": 9954 + }, + { + "epoch": 4.841823708206687, + "grad_norm": 0.07368715187595497, + "learning_rate": 3.383977565608938e-06, + "loss": 0.4859, + "step": 9955 + }, + { + "epoch": 4.8423100303951365, + "grad_norm": 0.07317892548658124, + "learning_rate": 3.3830727434762068e-06, + "loss": 0.5391, + "step": 9956 + }, + { + "epoch": 4.842796352583587, + "grad_norm": 0.0758572337108233, + "learning_rate": 3.3821679804752413e-06, + "loss": 0.5184, + "step": 9957 + }, + { + "epoch": 4.843282674772037, + "grad_norm": 0.0761427260422004, + "learning_rate": 3.3812632766391252e-06, + "loss": 0.537, + "step": 9958 + }, + { + "epoch": 4.8437689969604865, + "grad_norm": 0.07661617984394792, + "learning_rate": 3.3803586320009497e-06, + "loss": 0.5087, + "step": 9959 + }, + { + "epoch": 4.844255319148936, + "grad_norm": 0.07160121442996414, + "learning_rate": 3.379454046593792e-06, + "loss": 0.4751, + "step": 9960 + }, + { + "epoch": 4.844741641337386, + "grad_norm": 0.07837639786820297, + "learning_rate": 3.3785495204507363e-06, + "loss": 0.5335, + "step": 9961 + }, + { + "epoch": 4.8452279635258355, + "grad_norm": 0.07353751947740553, + "learning_rate": 3.3776450536048623e-06, + "loss": 0.5259, + "step": 9962 + }, + { + "epoch": 4.845714285714286, + "grad_norm": 0.07222211739228658, + "learning_rate": 3.3767406460892447e-06, + "loss": 0.5067, + "step": 9963 + }, + { + "epoch": 4.846200607902736, + "grad_norm": 0.07401648117741538, + "learning_rate": 3.375836297936961e-06, + "loss": 0.5188, + "step": 9964 + }, + { + "epoch": 4.8466869300911855, + "grad_norm": 0.07521469517492321, + "learning_rate": 3.37493200918108e-06, + "loss": 0.5273, + "step": 9965 + }, + { + "epoch": 4.847173252279635, + "grad_norm": 0.07680043984100841, + "learning_rate": 3.374027779854675e-06, + "loss": 0.514, + "step": 9966 + }, + { + "epoch": 4.847659574468085, + "grad_norm": 0.0748688396911167, + "learning_rate": 3.3731236099908116e-06, + "loss": 0.5461, + "step": 9967 + }, + { + "epoch": 4.8481458966565345, + "grad_norm": 0.07385640871075642, + "learning_rate": 3.372219499622559e-06, + "loss": 0.4922, + "step": 9968 + }, + { + "epoch": 4.848632218844985, + "grad_norm": 0.0725224523564961, + "learning_rate": 3.3713154487829764e-06, + "loss": 0.5096, + "step": 9969 + }, + { + "epoch": 4.849118541033435, + "grad_norm": 0.07344292832818512, + "learning_rate": 3.370411457505129e-06, + "loss": 0.4741, + "step": 9970 + }, + { + "epoch": 4.849604863221884, + "grad_norm": 0.07667379348288852, + "learning_rate": 3.3695075258220745e-06, + "loss": 0.5276, + "step": 9971 + }, + { + "epoch": 4.850091185410334, + "grad_norm": 0.07469491580733274, + "learning_rate": 3.368603653766872e-06, + "loss": 0.498, + "step": 9972 + }, + { + "epoch": 4.850577507598784, + "grad_norm": 0.07277299863912308, + "learning_rate": 3.3676998413725726e-06, + "loss": 0.5039, + "step": 9973 + }, + { + "epoch": 4.851063829787234, + "grad_norm": 0.07420847236987306, + "learning_rate": 3.366796088672234e-06, + "loss": 0.5271, + "step": 9974 + }, + { + "epoch": 4.851550151975684, + "grad_norm": 0.07321948161815856, + "learning_rate": 3.3658923956989033e-06, + "loss": 0.522, + "step": 9975 + }, + { + "epoch": 4.852036474164134, + "grad_norm": 0.07750270866383223, + "learning_rate": 3.3649887624856303e-06, + "loss": 0.5136, + "step": 9976 + }, + { + "epoch": 4.852522796352583, + "grad_norm": 0.0749579091940929, + "learning_rate": 3.3640851890654596e-06, + "loss": 0.5047, + "step": 9977 + }, + { + "epoch": 4.853009118541033, + "grad_norm": 0.07777987185989435, + "learning_rate": 3.36318167547144e-06, + "loss": 0.5424, + "step": 9978 + }, + { + "epoch": 4.853495440729484, + "grad_norm": 0.07359019254798982, + "learning_rate": 3.3622782217366066e-06, + "loss": 0.5341, + "step": 9979 + }, + { + "epoch": 4.853981762917933, + "grad_norm": 0.07767641189602528, + "learning_rate": 3.361374827894005e-06, + "loss": 0.5248, + "step": 9980 + }, + { + "epoch": 4.854468085106383, + "grad_norm": 0.07388300598841771, + "learning_rate": 3.3604714939766693e-06, + "loss": 0.5125, + "step": 9981 + }, + { + "epoch": 4.854954407294833, + "grad_norm": 0.0780196347436397, + "learning_rate": 3.3595682200176372e-06, + "loss": 0.5228, + "step": 9982 + }, + { + "epoch": 4.855440729483282, + "grad_norm": 0.07189098717799296, + "learning_rate": 3.3586650060499394e-06, + "loss": 0.4865, + "step": 9983 + }, + { + "epoch": 4.855927051671733, + "grad_norm": 0.07504904649218191, + "learning_rate": 3.357761852106608e-06, + "loss": 0.5382, + "step": 9984 + }, + { + "epoch": 4.856413373860183, + "grad_norm": 0.0700289614060224, + "learning_rate": 3.3568587582206712e-06, + "loss": 0.472, + "step": 9985 + }, + { + "epoch": 4.856899696048632, + "grad_norm": 0.07678016258738653, + "learning_rate": 3.3559557244251585e-06, + "loss": 0.5427, + "step": 9986 + }, + { + "epoch": 4.857386018237082, + "grad_norm": 0.07948920413560626, + "learning_rate": 3.35505275075309e-06, + "loss": 0.5394, + "step": 9987 + }, + { + "epoch": 4.857872340425532, + "grad_norm": 0.07251650638918544, + "learning_rate": 3.354149837237489e-06, + "loss": 0.4996, + "step": 9988 + }, + { + "epoch": 4.858358662613981, + "grad_norm": 0.07288686904849229, + "learning_rate": 3.353246983911379e-06, + "loss": 0.492, + "step": 9989 + }, + { + "epoch": 4.858844984802432, + "grad_norm": 0.07658468624542433, + "learning_rate": 3.3523441908077726e-06, + "loss": 0.5027, + "step": 9990 + }, + { + "epoch": 4.859331306990882, + "grad_norm": 0.07666243253355116, + "learning_rate": 3.351441457959689e-06, + "loss": 0.5885, + "step": 9991 + }, + { + "epoch": 4.859817629179331, + "grad_norm": 0.07227232657200966, + "learning_rate": 3.3505387854001387e-06, + "loss": 0.4979, + "step": 9992 + }, + { + "epoch": 4.860303951367781, + "grad_norm": 0.07414202713011281, + "learning_rate": 3.3496361731621364e-06, + "loss": 0.5188, + "step": 9993 + }, + { + "epoch": 4.860790273556231, + "grad_norm": 0.07366606635805148, + "learning_rate": 3.3487336212786875e-06, + "loss": 0.5278, + "step": 9994 + }, + { + "epoch": 4.86127659574468, + "grad_norm": 0.07473170751222145, + "learning_rate": 3.3478311297828013e-06, + "loss": 0.5329, + "step": 9995 + }, + { + "epoch": 4.861762917933131, + "grad_norm": 0.07477726441339, + "learning_rate": 3.3469286987074803e-06, + "loss": 0.4896, + "step": 9996 + }, + { + "epoch": 4.862249240121581, + "grad_norm": 0.07304589698054452, + "learning_rate": 3.3460263280857295e-06, + "loss": 0.4978, + "step": 9997 + }, + { + "epoch": 4.86273556231003, + "grad_norm": 0.0744397921859017, + "learning_rate": 3.345124017950545e-06, + "loss": 0.5214, + "step": 9998 + }, + { + "epoch": 4.86322188449848, + "grad_norm": 0.07216225407263793, + "learning_rate": 3.3442217683349286e-06, + "loss": 0.4717, + "step": 9999 + }, + { + "epoch": 4.86370820668693, + "grad_norm": 0.07497180266407774, + "learning_rate": 3.3433195792718732e-06, + "loss": 0.4915, + "step": 10000 + }, + { + "epoch": 4.86419452887538, + "grad_norm": 0.07261845506524202, + "learning_rate": 3.342417450794375e-06, + "loss": 0.5388, + "step": 10001 + }, + { + "epoch": 4.86468085106383, + "grad_norm": 0.07211161284215672, + "learning_rate": 3.341515382935423e-06, + "loss": 0.4969, + "step": 10002 + }, + { + "epoch": 4.86516717325228, + "grad_norm": 0.07261994916627902, + "learning_rate": 3.340613375728008e-06, + "loss": 0.4737, + "step": 10003 + }, + { + "epoch": 4.865653495440729, + "grad_norm": 0.07693557724595707, + "learning_rate": 3.3397114292051135e-06, + "loss": 0.484, + "step": 10004 + }, + { + "epoch": 4.866139817629179, + "grad_norm": 0.07412737687312042, + "learning_rate": 3.338809543399728e-06, + "loss": 0.4844, + "step": 10005 + }, + { + "epoch": 4.86662613981763, + "grad_norm": 0.07381212469044743, + "learning_rate": 3.3379077183448306e-06, + "loss": 0.5247, + "step": 10006 + }, + { + "epoch": 4.867112462006079, + "grad_norm": 0.07408225321789191, + "learning_rate": 3.3370059540734058e-06, + "loss": 0.4792, + "step": 10007 + }, + { + "epoch": 4.867598784194529, + "grad_norm": 0.07350698283682915, + "learning_rate": 3.336104250618426e-06, + "loss": 0.5144, + "step": 10008 + }, + { + "epoch": 4.868085106382979, + "grad_norm": 0.0768106247785902, + "learning_rate": 3.3352026080128715e-06, + "loss": 0.5378, + "step": 10009 + }, + { + "epoch": 4.868571428571428, + "grad_norm": 0.07483959806061541, + "learning_rate": 3.3343010262897125e-06, + "loss": 0.5152, + "step": 10010 + }, + { + "epoch": 4.869057750759879, + "grad_norm": 0.0775465361829277, + "learning_rate": 3.3333995054819236e-06, + "loss": 0.5261, + "step": 10011 + }, + { + "epoch": 4.869544072948329, + "grad_norm": 0.07593471594999275, + "learning_rate": 3.33249804562247e-06, + "loss": 0.5387, + "step": 10012 + }, + { + "epoch": 4.870030395136778, + "grad_norm": 0.07732332122445831, + "learning_rate": 3.331596646744321e-06, + "loss": 0.5125, + "step": 10013 + }, + { + "epoch": 4.870516717325228, + "grad_norm": 0.07524045023730902, + "learning_rate": 3.3306953088804417e-06, + "loss": 0.4794, + "step": 10014 + }, + { + "epoch": 4.871003039513678, + "grad_norm": 0.07469719399219478, + "learning_rate": 3.3297940320637924e-06, + "loss": 0.516, + "step": 10015 + }, + { + "epoch": 4.871489361702127, + "grad_norm": 0.07351274318456377, + "learning_rate": 3.3288928163273344e-06, + "loss": 0.5227, + "step": 10016 + }, + { + "epoch": 4.871975683890578, + "grad_norm": 0.07493342586633125, + "learning_rate": 3.327991661704024e-06, + "loss": 0.503, + "step": 10017 + }, + { + "epoch": 4.872462006079028, + "grad_norm": 0.07695294657415475, + "learning_rate": 3.327090568226821e-06, + "loss": 0.5147, + "step": 10018 + }, + { + "epoch": 4.872948328267477, + "grad_norm": 0.07758095855370004, + "learning_rate": 3.326189535928674e-06, + "loss": 0.5057, + "step": 10019 + }, + { + "epoch": 4.873434650455927, + "grad_norm": 0.07446454933886645, + "learning_rate": 3.325288564842537e-06, + "loss": 0.5268, + "step": 10020 + }, + { + "epoch": 4.873920972644377, + "grad_norm": 0.07250081048624252, + "learning_rate": 3.3243876550013566e-06, + "loss": 0.4942, + "step": 10021 + }, + { + "epoch": 4.874407294832826, + "grad_norm": 0.07445212591946888, + "learning_rate": 3.323486806438083e-06, + "loss": 0.4922, + "step": 10022 + }, + { + "epoch": 4.874893617021277, + "grad_norm": 0.07092277499178194, + "learning_rate": 3.322586019185657e-06, + "loss": 0.4628, + "step": 10023 + }, + { + "epoch": 4.875379939209727, + "grad_norm": 0.07440357078776229, + "learning_rate": 3.3216852932770228e-06, + "loss": 0.5082, + "step": 10024 + }, + { + "epoch": 4.875866261398176, + "grad_norm": 0.07487056505890616, + "learning_rate": 3.3207846287451194e-06, + "loss": 0.4967, + "step": 10025 + }, + { + "epoch": 4.876352583586626, + "grad_norm": 0.07572685907517213, + "learning_rate": 3.319884025622887e-06, + "loss": 0.5155, + "step": 10026 + }, + { + "epoch": 4.876838905775076, + "grad_norm": 0.07700505800149454, + "learning_rate": 3.3189834839432565e-06, + "loss": 0.5287, + "step": 10027 + }, + { + "epoch": 4.877325227963526, + "grad_norm": 0.0725051245910781, + "learning_rate": 3.3180830037391666e-06, + "loss": 0.5011, + "step": 10028 + }, + { + "epoch": 4.877811550151976, + "grad_norm": 0.07264478767315147, + "learning_rate": 3.317182585043543e-06, + "loss": 0.5021, + "step": 10029 + }, + { + "epoch": 4.878297872340426, + "grad_norm": 0.07840865554245192, + "learning_rate": 3.316282227889318e-06, + "loss": 0.5373, + "step": 10030 + }, + { + "epoch": 4.878784194528875, + "grad_norm": 0.07650653821435979, + "learning_rate": 3.315381932309415e-06, + "loss": 0.5178, + "step": 10031 + }, + { + "epoch": 4.879270516717325, + "grad_norm": 0.07223226121659362, + "learning_rate": 3.3144816983367634e-06, + "loss": 0.5381, + "step": 10032 + }, + { + "epoch": 4.8797568389057755, + "grad_norm": 0.07303058812566782, + "learning_rate": 3.3135815260042792e-06, + "loss": 0.5071, + "step": 10033 + }, + { + "epoch": 4.880243161094225, + "grad_norm": 0.07416992883607613, + "learning_rate": 3.3126814153448856e-06, + "loss": 0.5213, + "step": 10034 + }, + { + "epoch": 4.880729483282675, + "grad_norm": 0.07381039134191224, + "learning_rate": 3.3117813663914984e-06, + "loss": 0.5369, + "step": 10035 + }, + { + "epoch": 4.8812158054711245, + "grad_norm": 0.07642139422645766, + "learning_rate": 3.3108813791770356e-06, + "loss": 0.5451, + "step": 10036 + }, + { + "epoch": 4.881702127659574, + "grad_norm": 0.07432602863555783, + "learning_rate": 3.309981453734406e-06, + "loss": 0.4941, + "step": 10037 + }, + { + "epoch": 4.882188449848025, + "grad_norm": 0.07503644792132684, + "learning_rate": 3.3090815900965234e-06, + "loss": 0.5437, + "step": 10038 + }, + { + "epoch": 4.8826747720364745, + "grad_norm": 0.07358503992661927, + "learning_rate": 3.3081817882962946e-06, + "loss": 0.507, + "step": 10039 + }, + { + "epoch": 4.883161094224924, + "grad_norm": 0.07534949257411974, + "learning_rate": 3.307282048366627e-06, + "loss": 0.5251, + "step": 10040 + }, + { + "epoch": 4.883647416413374, + "grad_norm": 0.07273369234153167, + "learning_rate": 3.306382370340425e-06, + "loss": 0.4796, + "step": 10041 + }, + { + "epoch": 4.8841337386018235, + "grad_norm": 0.07263800851275007, + "learning_rate": 3.3054827542505874e-06, + "loss": 0.5413, + "step": 10042 + }, + { + "epoch": 4.884620060790273, + "grad_norm": 0.07294296729047788, + "learning_rate": 3.304583200130017e-06, + "loss": 0.5227, + "step": 10043 + }, + { + "epoch": 4.885106382978723, + "grad_norm": 0.07403974302333012, + "learning_rate": 3.303683708011608e-06, + "loss": 0.5015, + "step": 10044 + }, + { + "epoch": 4.8855927051671735, + "grad_norm": 0.07642653557864701, + "learning_rate": 3.302784277928257e-06, + "loss": 0.5215, + "step": 10045 + }, + { + "epoch": 4.886079027355623, + "grad_norm": 0.07484165392494449, + "learning_rate": 3.301884909912855e-06, + "loss": 0.4965, + "step": 10046 + }, + { + "epoch": 4.886565349544073, + "grad_norm": 0.07195841038166434, + "learning_rate": 3.300985603998296e-06, + "loss": 0.5064, + "step": 10047 + }, + { + "epoch": 4.8870516717325225, + "grad_norm": 0.07043711799280249, + "learning_rate": 3.3000863602174626e-06, + "loss": 0.4924, + "step": 10048 + }, + { + "epoch": 4.887537993920972, + "grad_norm": 0.07562753975290293, + "learning_rate": 3.299187178603244e-06, + "loss": 0.5416, + "step": 10049 + }, + { + "epoch": 4.888024316109423, + "grad_norm": 0.07188691157298088, + "learning_rate": 3.2982880591885227e-06, + "loss": 0.5282, + "step": 10050 + }, + { + "epoch": 4.8885106382978725, + "grad_norm": 0.07309477649351301, + "learning_rate": 3.297389002006182e-06, + "loss": 0.4851, + "step": 10051 + }, + { + "epoch": 4.888996960486322, + "grad_norm": 0.07450735668268164, + "learning_rate": 3.2964900070890973e-06, + "loss": 0.5552, + "step": 10052 + }, + { + "epoch": 4.889483282674772, + "grad_norm": 0.07433251156736057, + "learning_rate": 3.2955910744701485e-06, + "loss": 0.5469, + "step": 10053 + }, + { + "epoch": 4.8899696048632215, + "grad_norm": 0.0742850319957969, + "learning_rate": 3.294692204182207e-06, + "loss": 0.513, + "step": 10054 + }, + { + "epoch": 4.890455927051672, + "grad_norm": 0.07351676782551884, + "learning_rate": 3.293793396258147e-06, + "loss": 0.5207, + "step": 10055 + }, + { + "epoch": 4.890942249240122, + "grad_norm": 0.07347130313129643, + "learning_rate": 3.2928946507308367e-06, + "loss": 0.4867, + "step": 10056 + }, + { + "epoch": 4.8914285714285715, + "grad_norm": 0.07241237959784527, + "learning_rate": 3.2919959676331464e-06, + "loss": 0.5066, + "step": 10057 + }, + { + "epoch": 4.891914893617021, + "grad_norm": 0.07529887467471853, + "learning_rate": 3.291097346997938e-06, + "loss": 0.5153, + "step": 10058 + }, + { + "epoch": 4.892401215805471, + "grad_norm": 0.0770271100472494, + "learning_rate": 3.2901987888580767e-06, + "loss": 0.5277, + "step": 10059 + }, + { + "epoch": 4.892887537993921, + "grad_norm": 0.0749990216451476, + "learning_rate": 3.2893002932464215e-06, + "loss": 0.5259, + "step": 10060 + }, + { + "epoch": 4.893373860182371, + "grad_norm": 0.07684437899549906, + "learning_rate": 3.288401860195834e-06, + "loss": 0.5196, + "step": 10061 + }, + { + "epoch": 4.893860182370821, + "grad_norm": 0.0754757202453132, + "learning_rate": 3.2875034897391656e-06, + "loss": 0.5005, + "step": 10062 + }, + { + "epoch": 4.8943465045592704, + "grad_norm": 0.07354082339836521, + "learning_rate": 3.2866051819092743e-06, + "loss": 0.4975, + "step": 10063 + }, + { + "epoch": 4.89483282674772, + "grad_norm": 0.07557988352935968, + "learning_rate": 3.285706936739008e-06, + "loss": 0.5254, + "step": 10064 + }, + { + "epoch": 4.895319148936171, + "grad_norm": 0.07728608958968981, + "learning_rate": 3.2848087542612204e-06, + "loss": 0.5415, + "step": 10065 + }, + { + "epoch": 4.89580547112462, + "grad_norm": 0.07377136328852381, + "learning_rate": 3.2839106345087545e-06, + "loss": 0.527, + "step": 10066 + }, + { + "epoch": 4.89629179331307, + "grad_norm": 0.0730167069055218, + "learning_rate": 3.283012577514456e-06, + "loss": 0.494, + "step": 10067 + }, + { + "epoch": 4.89677811550152, + "grad_norm": 0.07773378427797767, + "learning_rate": 3.282114583311169e-06, + "loss": 0.5595, + "step": 10068 + }, + { + "epoch": 4.897264437689969, + "grad_norm": 0.07513939358252131, + "learning_rate": 3.281216651931731e-06, + "loss": 0.4905, + "step": 10069 + }, + { + "epoch": 4.897750759878419, + "grad_norm": 0.0732380198628978, + "learning_rate": 3.280318783408981e-06, + "loss": 0.5069, + "step": 10070 + }, + { + "epoch": 4.898237082066869, + "grad_norm": 0.07626974749959135, + "learning_rate": 3.279420977775754e-06, + "loss": 0.5334, + "step": 10071 + }, + { + "epoch": 4.898723404255319, + "grad_norm": 0.07630598532652054, + "learning_rate": 3.2785232350648854e-06, + "loss": 0.534, + "step": 10072 + }, + { + "epoch": 4.899209726443769, + "grad_norm": 0.07428312400051716, + "learning_rate": 3.2776255553092024e-06, + "loss": 0.513, + "step": 10073 + }, + { + "epoch": 4.899696048632219, + "grad_norm": 0.07324659078790577, + "learning_rate": 3.2767279385415364e-06, + "loss": 0.5226, + "step": 10074 + }, + { + "epoch": 4.900182370820668, + "grad_norm": 0.0739185759618749, + "learning_rate": 3.2758303847947114e-06, + "loss": 0.4917, + "step": 10075 + }, + { + "epoch": 4.900668693009118, + "grad_norm": 0.07759010118045165, + "learning_rate": 3.2749328941015545e-06, + "loss": 0.5467, + "step": 10076 + }, + { + "epoch": 4.901155015197569, + "grad_norm": 0.07798652395815736, + "learning_rate": 3.2740354664948837e-06, + "loss": 0.5237, + "step": 10077 + }, + { + "epoch": 4.901641337386018, + "grad_norm": 0.07444816311336011, + "learning_rate": 3.2731381020075204e-06, + "loss": 0.5344, + "step": 10078 + }, + { + "epoch": 4.902127659574468, + "grad_norm": 0.07338779078486492, + "learning_rate": 3.2722408006722807e-06, + "loss": 0.5008, + "step": 10079 + }, + { + "epoch": 4.902613981762918, + "grad_norm": 0.07710639754871149, + "learning_rate": 3.2713435625219813e-06, + "loss": 0.5053, + "step": 10080 + }, + { + "epoch": 4.903100303951367, + "grad_norm": 0.07568679094565293, + "learning_rate": 3.2704463875894323e-06, + "loss": 0.508, + "step": 10081 + }, + { + "epoch": 4.903586626139818, + "grad_norm": 0.07462793941545953, + "learning_rate": 3.2695492759074458e-06, + "loss": 0.4915, + "step": 10082 + }, + { + "epoch": 4.904072948328268, + "grad_norm": 0.07506896105135114, + "learning_rate": 3.268652227508827e-06, + "loss": 0.5358, + "step": 10083 + }, + { + "epoch": 4.904559270516717, + "grad_norm": 0.07300692498025771, + "learning_rate": 3.2677552424263836e-06, + "loss": 0.5078, + "step": 10084 + }, + { + "epoch": 4.905045592705167, + "grad_norm": 0.07199599761022066, + "learning_rate": 3.2668583206929166e-06, + "loss": 0.5135, + "step": 10085 + }, + { + "epoch": 4.905531914893617, + "grad_norm": 0.07048441485612092, + "learning_rate": 3.2659614623412305e-06, + "loss": 0.4819, + "step": 10086 + }, + { + "epoch": 4.906018237082067, + "grad_norm": 0.07472383236313225, + "learning_rate": 3.2650646674041196e-06, + "loss": 0.4994, + "step": 10087 + }, + { + "epoch": 4.906504559270517, + "grad_norm": 0.07254862112445593, + "learning_rate": 3.264167935914383e-06, + "loss": 0.4966, + "step": 10088 + }, + { + "epoch": 4.906990881458967, + "grad_norm": 0.07543274313869557, + "learning_rate": 3.2632712679048127e-06, + "loss": 0.5213, + "step": 10089 + }, + { + "epoch": 4.907477203647416, + "grad_norm": 0.07401936546812483, + "learning_rate": 3.2623746634082034e-06, + "loss": 0.5354, + "step": 10090 + }, + { + "epoch": 4.907963525835866, + "grad_norm": 0.07464742325826529, + "learning_rate": 3.26147812245734e-06, + "loss": 0.49, + "step": 10091 + }, + { + "epoch": 4.908449848024317, + "grad_norm": 0.07238322343079291, + "learning_rate": 3.2605816450850116e-06, + "loss": 0.511, + "step": 10092 + }, + { + "epoch": 4.908936170212766, + "grad_norm": 0.07503912469681893, + "learning_rate": 3.259685231324003e-06, + "loss": 0.53, + "step": 10093 + }, + { + "epoch": 4.909422492401216, + "grad_norm": 0.0746899302485398, + "learning_rate": 3.2587888812070956e-06, + "loss": 0.5075, + "step": 10094 + }, + { + "epoch": 4.909908814589666, + "grad_norm": 0.07299117729776937, + "learning_rate": 3.2578925947670716e-06, + "loss": 0.5068, + "step": 10095 + }, + { + "epoch": 4.910395136778115, + "grad_norm": 0.07129649205902233, + "learning_rate": 3.256996372036705e-06, + "loss": 0.5028, + "step": 10096 + }, + { + "epoch": 4.910881458966565, + "grad_norm": 0.07523870673689492, + "learning_rate": 3.256100213048775e-06, + "loss": 0.4752, + "step": 10097 + }, + { + "epoch": 4.911367781155015, + "grad_norm": 0.07342438861814998, + "learning_rate": 3.255204117836051e-06, + "loss": 0.5293, + "step": 10098 + }, + { + "epoch": 4.911854103343465, + "grad_norm": 0.0728246775435652, + "learning_rate": 3.254308086431306e-06, + "loss": 0.5136, + "step": 10099 + }, + { + "epoch": 4.912340425531915, + "grad_norm": 0.07520794201256967, + "learning_rate": 3.2534121188673064e-06, + "loss": 0.5508, + "step": 10100 + }, + { + "epoch": 4.912826747720365, + "grad_norm": 0.07010348019175214, + "learning_rate": 3.25251621517682e-06, + "loss": 0.457, + "step": 10101 + }, + { + "epoch": 4.913313069908814, + "grad_norm": 0.07526788885435415, + "learning_rate": 3.251620375392609e-06, + "loss": 0.5425, + "step": 10102 + }, + { + "epoch": 4.913799392097264, + "grad_norm": 0.0732285865228861, + "learning_rate": 3.2507245995474353e-06, + "loss": 0.5126, + "step": 10103 + }, + { + "epoch": 4.914285714285715, + "grad_norm": 0.07363440283947675, + "learning_rate": 3.249828887674057e-06, + "loss": 0.5315, + "step": 10104 + }, + { + "epoch": 4.914772036474164, + "grad_norm": 0.0735967361530456, + "learning_rate": 3.248933239805233e-06, + "loss": 0.5196, + "step": 10105 + }, + { + "epoch": 4.915258358662614, + "grad_norm": 0.07490431315455255, + "learning_rate": 3.2480376559737147e-06, + "loss": 0.5221, + "step": 10106 + }, + { + "epoch": 4.915744680851064, + "grad_norm": 0.07463721848508649, + "learning_rate": 3.247142136212257e-06, + "loss": 0.5291, + "step": 10107 + }, + { + "epoch": 4.916231003039513, + "grad_norm": 0.07362034506764235, + "learning_rate": 3.2462466805536058e-06, + "loss": 0.5475, + "step": 10108 + }, + { + "epoch": 4.916717325227964, + "grad_norm": 0.07352108763488108, + "learning_rate": 3.245351289030511e-06, + "loss": 0.5018, + "step": 10109 + }, + { + "epoch": 4.917203647416414, + "grad_norm": 0.07171146209607958, + "learning_rate": 3.244455961675716e-06, + "loss": 0.4963, + "step": 10110 + }, + { + "epoch": 4.917689969604863, + "grad_norm": 0.07248580847278054, + "learning_rate": 3.243560698521966e-06, + "loss": 0.5262, + "step": 10111 + }, + { + "epoch": 4.918176291793313, + "grad_norm": 0.07073569024089672, + "learning_rate": 3.2426654996019967e-06, + "loss": 0.4937, + "step": 10112 + }, + { + "epoch": 4.918662613981763, + "grad_norm": 0.07259405070735103, + "learning_rate": 3.2417703649485504e-06, + "loss": 0.504, + "step": 10113 + }, + { + "epoch": 4.919148936170213, + "grad_norm": 0.0735226674894187, + "learning_rate": 3.240875294594359e-06, + "loss": 0.4856, + "step": 10114 + }, + { + "epoch": 4.919635258358663, + "grad_norm": 0.07465883809508439, + "learning_rate": 3.2399802885721597e-06, + "loss": 0.5302, + "step": 10115 + }, + { + "epoch": 4.920121580547113, + "grad_norm": 0.07843580298466458, + "learning_rate": 3.2390853469146787e-06, + "loss": 0.5714, + "step": 10116 + }, + { + "epoch": 4.920607902735562, + "grad_norm": 0.0724506693349241, + "learning_rate": 3.2381904696546474e-06, + "loss": 0.489, + "step": 10117 + }, + { + "epoch": 4.921094224924012, + "grad_norm": 0.07687938449118557, + "learning_rate": 3.2372956568247905e-06, + "loss": 0.5024, + "step": 10118 + }, + { + "epoch": 4.921580547112462, + "grad_norm": 0.07127194609043487, + "learning_rate": 3.236400908457832e-06, + "loss": 0.516, + "step": 10119 + }, + { + "epoch": 4.922066869300912, + "grad_norm": 0.07384623629316198, + "learning_rate": 3.2355062245864953e-06, + "loss": 0.4891, + "step": 10120 + }, + { + "epoch": 4.922553191489362, + "grad_norm": 0.0752684506219215, + "learning_rate": 3.234611605243496e-06, + "loss": 0.5226, + "step": 10121 + }, + { + "epoch": 4.923039513677812, + "grad_norm": 0.07285626149931103, + "learning_rate": 3.2337170504615545e-06, + "loss": 0.4961, + "step": 10122 + }, + { + "epoch": 4.923525835866261, + "grad_norm": 0.07530193598218167, + "learning_rate": 3.23282256027338e-06, + "loss": 0.5083, + "step": 10123 + }, + { + "epoch": 4.924012158054711, + "grad_norm": 0.07387518829505471, + "learning_rate": 3.2319281347116895e-06, + "loss": 0.5123, + "step": 10124 + }, + { + "epoch": 4.924498480243161, + "grad_norm": 0.07462073428969465, + "learning_rate": 3.231033773809189e-06, + "loss": 0.5064, + "step": 10125 + }, + { + "epoch": 4.924984802431611, + "grad_norm": 0.07607156429184778, + "learning_rate": 3.230139477598588e-06, + "loss": 0.5319, + "step": 10126 + }, + { + "epoch": 4.925471124620061, + "grad_norm": 0.07402970795567006, + "learning_rate": 3.22924524611259e-06, + "loss": 0.5242, + "step": 10127 + }, + { + "epoch": 4.9259574468085106, + "grad_norm": 0.07162298414438688, + "learning_rate": 3.2283510793838977e-06, + "loss": 0.5269, + "step": 10128 + }, + { + "epoch": 4.92644376899696, + "grad_norm": 0.07636262016561529, + "learning_rate": 3.2274569774452112e-06, + "loss": 0.5219, + "step": 10129 + }, + { + "epoch": 4.92693009118541, + "grad_norm": 0.07200005768216096, + "learning_rate": 3.22656294032923e-06, + "loss": 0.4801, + "step": 10130 + }, + { + "epoch": 4.9274164133738605, + "grad_norm": 0.07878081794081868, + "learning_rate": 3.225668968068645e-06, + "loss": 0.5217, + "step": 10131 + }, + { + "epoch": 4.92790273556231, + "grad_norm": 0.07688272859795572, + "learning_rate": 3.224775060696154e-06, + "loss": 0.4953, + "step": 10132 + }, + { + "epoch": 4.92838905775076, + "grad_norm": 0.0750831097317653, + "learning_rate": 3.2238812182444444e-06, + "loss": 0.5262, + "step": 10133 + }, + { + "epoch": 4.9288753799392095, + "grad_norm": 0.07767257636475636, + "learning_rate": 3.222987440746207e-06, + "loss": 0.5506, + "step": 10134 + }, + { + "epoch": 4.929361702127659, + "grad_norm": 0.07504626140201265, + "learning_rate": 3.2220937282341247e-06, + "loss": 0.519, + "step": 10135 + }, + { + "epoch": 4.92984802431611, + "grad_norm": 0.07438335940981164, + "learning_rate": 3.221200080740885e-06, + "loss": 0.5214, + "step": 10136 + }, + { + "epoch": 4.9303343465045595, + "grad_norm": 0.07355184298357463, + "learning_rate": 3.220306498299164e-06, + "loss": 0.5164, + "step": 10137 + }, + { + "epoch": 4.930820668693009, + "grad_norm": 0.07494156930823387, + "learning_rate": 3.2194129809416437e-06, + "loss": 0.5142, + "step": 10138 + }, + { + "epoch": 4.931306990881459, + "grad_norm": 0.07293953779961893, + "learning_rate": 3.2185195287009984e-06, + "loss": 0.4997, + "step": 10139 + }, + { + "epoch": 4.9317933130699085, + "grad_norm": 0.07183619574176704, + "learning_rate": 3.217626141609906e-06, + "loss": 0.4934, + "step": 10140 + }, + { + "epoch": 4.932279635258359, + "grad_norm": 0.0761620888974068, + "learning_rate": 3.2167328197010326e-06, + "loss": 0.5443, + "step": 10141 + }, + { + "epoch": 4.932765957446809, + "grad_norm": 0.07170636422325165, + "learning_rate": 3.2158395630070514e-06, + "loss": 0.5016, + "step": 10142 + }, + { + "epoch": 4.9332522796352585, + "grad_norm": 0.0787232549881691, + "learning_rate": 3.214946371560628e-06, + "loss": 0.5542, + "step": 10143 + }, + { + "epoch": 4.933738601823708, + "grad_norm": 0.07203952489052931, + "learning_rate": 3.214053245394425e-06, + "loss": 0.4853, + "step": 10144 + }, + { + "epoch": 4.934224924012158, + "grad_norm": 0.07204912019497156, + "learning_rate": 3.2131601845411096e-06, + "loss": 0.4835, + "step": 10145 + }, + { + "epoch": 4.9347112462006075, + "grad_norm": 0.0739552499606694, + "learning_rate": 3.212267189033336e-06, + "loss": 0.5126, + "step": 10146 + }, + { + "epoch": 4.935197568389058, + "grad_norm": 0.07369019119675618, + "learning_rate": 3.211374258903765e-06, + "loss": 0.514, + "step": 10147 + }, + { + "epoch": 4.935683890577508, + "grad_norm": 0.0716321018893341, + "learning_rate": 3.2104813941850475e-06, + "loss": 0.5125, + "step": 10148 + }, + { + "epoch": 4.9361702127659575, + "grad_norm": 0.07250159981829861, + "learning_rate": 3.2095885949098405e-06, + "loss": 0.5018, + "step": 10149 + }, + { + "epoch": 4.936656534954407, + "grad_norm": 0.07586676834909536, + "learning_rate": 3.2086958611107906e-06, + "loss": 0.5143, + "step": 10150 + }, + { + "epoch": 4.937142857142857, + "grad_norm": 0.07368063094769385, + "learning_rate": 3.207803192820549e-06, + "loss": 0.5057, + "step": 10151 + }, + { + "epoch": 4.9376291793313065, + "grad_norm": 0.07673472289939451, + "learning_rate": 3.2069105900717566e-06, + "loss": 0.5282, + "step": 10152 + }, + { + "epoch": 4.938115501519757, + "grad_norm": 0.07544132688355634, + "learning_rate": 3.2060180528970597e-06, + "loss": 0.4961, + "step": 10153 + }, + { + "epoch": 4.938601823708207, + "grad_norm": 0.07531378028171364, + "learning_rate": 3.205125581329096e-06, + "loss": 0.5171, + "step": 10154 + }, + { + "epoch": 4.9390881458966565, + "grad_norm": 0.0755485059959871, + "learning_rate": 3.2042331754005084e-06, + "loss": 0.5226, + "step": 10155 + }, + { + "epoch": 4.939574468085106, + "grad_norm": 0.07715451396575607, + "learning_rate": 3.2033408351439265e-06, + "loss": 0.5121, + "step": 10156 + }, + { + "epoch": 4.940060790273556, + "grad_norm": 0.07711850605945271, + "learning_rate": 3.202448560591988e-06, + "loss": 0.4921, + "step": 10157 + }, + { + "epoch": 4.940547112462006, + "grad_norm": 0.07394846259410856, + "learning_rate": 3.2015563517773214e-06, + "loss": 0.5395, + "step": 10158 + }, + { + "epoch": 4.941033434650456, + "grad_norm": 0.07226930212961101, + "learning_rate": 3.200664208732558e-06, + "loss": 0.5134, + "step": 10159 + }, + { + "epoch": 4.941519756838906, + "grad_norm": 0.07238613601785343, + "learning_rate": 3.1997721314903195e-06, + "loss": 0.5036, + "step": 10160 + }, + { + "epoch": 4.9420060790273554, + "grad_norm": 0.07173746792858594, + "learning_rate": 3.1988801200832344e-06, + "loss": 0.4825, + "step": 10161 + }, + { + "epoch": 4.942492401215805, + "grad_norm": 0.07365118290774321, + "learning_rate": 3.19798817454392e-06, + "loss": 0.4953, + "step": 10162 + }, + { + "epoch": 4.942978723404256, + "grad_norm": 0.07306819499163708, + "learning_rate": 3.1970962949049973e-06, + "loss": 0.499, + "step": 10163 + }, + { + "epoch": 4.943465045592705, + "grad_norm": 0.0735579642748732, + "learning_rate": 3.196204481199081e-06, + "loss": 0.524, + "step": 10164 + }, + { + "epoch": 4.943951367781155, + "grad_norm": 0.07421169026414505, + "learning_rate": 3.1953127334587887e-06, + "loss": 0.5064, + "step": 10165 + }, + { + "epoch": 4.944437689969605, + "grad_norm": 0.07408113074226359, + "learning_rate": 3.194421051716727e-06, + "loss": 0.5262, + "step": 10166 + }, + { + "epoch": 4.944924012158054, + "grad_norm": 0.07443714560090055, + "learning_rate": 3.1935294360055096e-06, + "loss": 0.4973, + "step": 10167 + }, + { + "epoch": 4.945410334346505, + "grad_norm": 0.07252831821766367, + "learning_rate": 3.1926378863577403e-06, + "loss": 0.4946, + "step": 10168 + }, + { + "epoch": 4.945896656534955, + "grad_norm": 0.07617294902513563, + "learning_rate": 3.1917464028060262e-06, + "loss": 0.5361, + "step": 10169 + }, + { + "epoch": 4.946382978723404, + "grad_norm": 0.07001175281472896, + "learning_rate": 3.1908549853829664e-06, + "loss": 0.474, + "step": 10170 + }, + { + "epoch": 4.946869300911854, + "grad_norm": 0.07552514847825104, + "learning_rate": 3.1899636341211604e-06, + "loss": 0.5236, + "step": 10171 + }, + { + "epoch": 4.947355623100304, + "grad_norm": 0.07545566733202758, + "learning_rate": 3.189072349053209e-06, + "loss": 0.5343, + "step": 10172 + }, + { + "epoch": 4.947841945288753, + "grad_norm": 0.07408979041791526, + "learning_rate": 3.1881811302117025e-06, + "loss": 0.5412, + "step": 10173 + }, + { + "epoch": 4.948328267477204, + "grad_norm": 0.0763076222792267, + "learning_rate": 3.1872899776292382e-06, + "loss": 0.5286, + "step": 10174 + }, + { + "epoch": 4.948814589665654, + "grad_norm": 0.07303290464238386, + "learning_rate": 3.1863988913384002e-06, + "loss": 0.5049, + "step": 10175 + }, + { + "epoch": 4.949300911854103, + "grad_norm": 0.07474930552380307, + "learning_rate": 3.1855078713717815e-06, + "loss": 0.5376, + "step": 10176 + }, + { + "epoch": 4.949787234042553, + "grad_norm": 0.07125295908755164, + "learning_rate": 3.1846169177619614e-06, + "loss": 0.5, + "step": 10177 + }, + { + "epoch": 4.950273556231003, + "grad_norm": 0.0734276806463007, + "learning_rate": 3.1837260305415267e-06, + "loss": 0.5499, + "step": 10178 + }, + { + "epoch": 4.950759878419452, + "grad_norm": 0.07494445845930545, + "learning_rate": 3.182835209743056e-06, + "loss": 0.5237, + "step": 10179 + }, + { + "epoch": 4.951246200607903, + "grad_norm": 0.07153233260783684, + "learning_rate": 3.1819444553991287e-06, + "loss": 0.4649, + "step": 10180 + }, + { + "epoch": 4.951732522796353, + "grad_norm": 0.07315808378702987, + "learning_rate": 3.181053767542316e-06, + "loss": 0.4887, + "step": 10181 + }, + { + "epoch": 4.952218844984802, + "grad_norm": 0.07158873673318761, + "learning_rate": 3.180163146205195e-06, + "loss": 0.5136, + "step": 10182 + }, + { + "epoch": 4.952705167173252, + "grad_norm": 0.07326766036441759, + "learning_rate": 3.1792725914203337e-06, + "loss": 0.5262, + "step": 10183 + }, + { + "epoch": 4.953191489361702, + "grad_norm": 0.07537848034634737, + "learning_rate": 3.178382103220302e-06, + "loss": 0.5084, + "step": 10184 + }, + { + "epoch": 4.953677811550152, + "grad_norm": 0.07422608631054724, + "learning_rate": 3.177491681637663e-06, + "loss": 0.5131, + "step": 10185 + }, + { + "epoch": 4.954164133738602, + "grad_norm": 0.07841547142015454, + "learning_rate": 3.1766013267049827e-06, + "loss": 0.5635, + "step": 10186 + }, + { + "epoch": 4.954650455927052, + "grad_norm": 0.07758471451620878, + "learning_rate": 3.175711038454819e-06, + "loss": 0.4747, + "step": 10187 + }, + { + "epoch": 4.955136778115501, + "grad_norm": 0.07439273861185047, + "learning_rate": 3.1748208169197336e-06, + "loss": 0.542, + "step": 10188 + }, + { + "epoch": 4.955623100303951, + "grad_norm": 0.07556239145644843, + "learning_rate": 3.1739306621322776e-06, + "loss": 0.5165, + "step": 10189 + }, + { + "epoch": 4.956109422492402, + "grad_norm": 0.07124557854570239, + "learning_rate": 3.1730405741250093e-06, + "loss": 0.4602, + "step": 10190 + }, + { + "epoch": 4.956595744680851, + "grad_norm": 0.07202594429471039, + "learning_rate": 3.172150552930475e-06, + "loss": 0.4994, + "step": 10191 + }, + { + "epoch": 4.957082066869301, + "grad_norm": 0.07486785958375644, + "learning_rate": 3.171260598581227e-06, + "loss": 0.5152, + "step": 10192 + }, + { + "epoch": 4.957568389057751, + "grad_norm": 0.0745513198162049, + "learning_rate": 3.170370711109808e-06, + "loss": 0.5126, + "step": 10193 + }, + { + "epoch": 4.9580547112462, + "grad_norm": 0.072016591137042, + "learning_rate": 3.1694808905487658e-06, + "loss": 0.4654, + "step": 10194 + }, + { + "epoch": 4.958541033434651, + "grad_norm": 0.07376451329851894, + "learning_rate": 3.1685911369306364e-06, + "loss": 0.4923, + "step": 10195 + }, + { + "epoch": 4.959027355623101, + "grad_norm": 0.07281783344501232, + "learning_rate": 3.167701450287962e-06, + "loss": 0.5043, + "step": 10196 + }, + { + "epoch": 4.95951367781155, + "grad_norm": 0.07409012176544998, + "learning_rate": 3.1668118306532786e-06, + "loss": 0.5078, + "step": 10197 + }, + { + "epoch": 4.96, + "grad_norm": 0.07624964578884641, + "learning_rate": 3.165922278059118e-06, + "loss": 0.5203, + "step": 10198 + }, + { + "epoch": 4.96048632218845, + "grad_norm": 0.07382328500562757, + "learning_rate": 3.1650327925380164e-06, + "loss": 0.5086, + "step": 10199 + }, + { + "epoch": 4.960972644376899, + "grad_norm": 0.07138799535388599, + "learning_rate": 3.1641433741224957e-06, + "loss": 0.4902, + "step": 10200 + }, + { + "epoch": 4.96145896656535, + "grad_norm": 0.07360831497654938, + "learning_rate": 3.1632540228450887e-06, + "loss": 0.5264, + "step": 10201 + }, + { + "epoch": 4.9619452887538, + "grad_norm": 0.07915917538342591, + "learning_rate": 3.1623647387383143e-06, + "loss": 0.5251, + "step": 10202 + }, + { + "epoch": 4.962431610942249, + "grad_norm": 0.07461986944318749, + "learning_rate": 3.161475521834697e-06, + "loss": 0.5148, + "step": 10203 + }, + { + "epoch": 4.962917933130699, + "grad_norm": 0.07463837093952712, + "learning_rate": 3.160586372166755e-06, + "loss": 0.5185, + "step": 10204 + }, + { + "epoch": 4.963404255319149, + "grad_norm": 0.07439308370997685, + "learning_rate": 3.1596972897670063e-06, + "loss": 0.5157, + "step": 10205 + }, + { + "epoch": 4.963890577507598, + "grad_norm": 0.07399661921538865, + "learning_rate": 3.158808274667962e-06, + "loss": 0.4994, + "step": 10206 + }, + { + "epoch": 4.964376899696049, + "grad_norm": 0.07609952851204221, + "learning_rate": 3.157919326902137e-06, + "loss": 0.4967, + "step": 10207 + }, + { + "epoch": 4.964863221884499, + "grad_norm": 0.07582631828125783, + "learning_rate": 3.1570304465020374e-06, + "loss": 0.5445, + "step": 10208 + }, + { + "epoch": 4.965349544072948, + "grad_norm": 0.07468550093963576, + "learning_rate": 3.1561416335001737e-06, + "loss": 0.5207, + "step": 10209 + }, + { + "epoch": 4.965835866261398, + "grad_norm": 0.07058795260886852, + "learning_rate": 3.155252887929047e-06, + "loss": 0.4739, + "step": 10210 + }, + { + "epoch": 4.966322188449848, + "grad_norm": 0.07460639061899049, + "learning_rate": 3.1543642098211606e-06, + "loss": 0.5112, + "step": 10211 + }, + { + "epoch": 4.966808510638298, + "grad_norm": 0.0717316701008223, + "learning_rate": 3.1534755992090126e-06, + "loss": 0.4785, + "step": 10212 + }, + { + "epoch": 4.967294832826748, + "grad_norm": 0.07343705942991076, + "learning_rate": 3.152587056125103e-06, + "loss": 0.5223, + "step": 10213 + }, + { + "epoch": 4.967781155015198, + "grad_norm": 0.07518200298946358, + "learning_rate": 3.1516985806019225e-06, + "loss": 0.5536, + "step": 10214 + }, + { + "epoch": 4.968267477203647, + "grad_norm": 0.07279037354171893, + "learning_rate": 3.150810172671966e-06, + "loss": 0.4943, + "step": 10215 + }, + { + "epoch": 4.968753799392097, + "grad_norm": 0.07516409804308415, + "learning_rate": 3.1499218323677196e-06, + "loss": 0.5121, + "step": 10216 + }, + { + "epoch": 4.9692401215805475, + "grad_norm": 0.07365436465231501, + "learning_rate": 3.149033559721674e-06, + "loss": 0.5311, + "step": 10217 + }, + { + "epoch": 4.969726443768997, + "grad_norm": 0.07452446780337373, + "learning_rate": 3.14814535476631e-06, + "loss": 0.507, + "step": 10218 + }, + { + "epoch": 4.970212765957447, + "grad_norm": 0.07253757268717734, + "learning_rate": 3.1472572175341145e-06, + "loss": 0.49, + "step": 10219 + }, + { + "epoch": 4.970699088145897, + "grad_norm": 0.07014285600943093, + "learning_rate": 3.146369148057562e-06, + "loss": 0.4777, + "step": 10220 + }, + { + "epoch": 4.971185410334346, + "grad_norm": 0.07166862751340183, + "learning_rate": 3.1454811463691334e-06, + "loss": 0.4994, + "step": 10221 + }, + { + "epoch": 4.971671732522797, + "grad_norm": 0.07294772702853751, + "learning_rate": 3.1445932125013002e-06, + "loss": 0.4909, + "step": 10222 + }, + { + "epoch": 4.9721580547112465, + "grad_norm": 0.074094172505211, + "learning_rate": 3.1437053464865363e-06, + "loss": 0.5181, + "step": 10223 + }, + { + "epoch": 4.972644376899696, + "grad_norm": 0.07493546783834686, + "learning_rate": 3.142817548357313e-06, + "loss": 0.4958, + "step": 10224 + }, + { + "epoch": 4.973130699088146, + "grad_norm": 0.07312614116589011, + "learning_rate": 3.1419298181460944e-06, + "loss": 0.5213, + "step": 10225 + }, + { + "epoch": 4.9736170212765956, + "grad_norm": 0.07346659318117896, + "learning_rate": 3.141042155885348e-06, + "loss": 0.5393, + "step": 10226 + }, + { + "epoch": 4.974103343465045, + "grad_norm": 0.07307699657872137, + "learning_rate": 3.1401545616075317e-06, + "loss": 0.5085, + "step": 10227 + }, + { + "epoch": 4.974589665653496, + "grad_norm": 0.07294189605911208, + "learning_rate": 3.1392670353451114e-06, + "loss": 0.5094, + "step": 10228 + }, + { + "epoch": 4.9750759878419455, + "grad_norm": 0.07706253190101434, + "learning_rate": 3.1383795771305386e-06, + "loss": 0.5324, + "step": 10229 + }, + { + "epoch": 4.975562310030395, + "grad_norm": 0.0769271314475107, + "learning_rate": 3.137492186996273e-06, + "loss": 0.535, + "step": 10230 + }, + { + "epoch": 4.976048632218845, + "grad_norm": 0.07614367764655156, + "learning_rate": 3.1366048649747617e-06, + "loss": 0.5481, + "step": 10231 + }, + { + "epoch": 4.9765349544072945, + "grad_norm": 0.0771138792937198, + "learning_rate": 3.1357176110984578e-06, + "loss": 0.536, + "step": 10232 + }, + { + "epoch": 4.977021276595744, + "grad_norm": 0.0792233669774828, + "learning_rate": 3.1348304253998074e-06, + "loss": 0.5354, + "step": 10233 + }, + { + "epoch": 4.977507598784195, + "grad_norm": 0.0708128986887633, + "learning_rate": 3.133943307911257e-06, + "loss": 0.4636, + "step": 10234 + }, + { + "epoch": 4.9779939209726445, + "grad_norm": 0.07400958622695976, + "learning_rate": 3.133056258665246e-06, + "loss": 0.5052, + "step": 10235 + }, + { + "epoch": 4.978480243161094, + "grad_norm": 0.070496052835644, + "learning_rate": 3.132169277694217e-06, + "loss": 0.5079, + "step": 10236 + }, + { + "epoch": 4.978966565349544, + "grad_norm": 0.07565349747368681, + "learning_rate": 3.1312823650306057e-06, + "loss": 0.5169, + "step": 10237 + }, + { + "epoch": 4.9794528875379935, + "grad_norm": 0.07214730059546492, + "learning_rate": 3.130395520706848e-06, + "loss": 0.5079, + "step": 10238 + }, + { + "epoch": 4.979939209726444, + "grad_norm": 0.07300507813060274, + "learning_rate": 3.1295087447553745e-06, + "loss": 0.5026, + "step": 10239 + }, + { + "epoch": 4.980425531914894, + "grad_norm": 0.07321225749951657, + "learning_rate": 3.128622037208617e-06, + "loss": 0.5162, + "step": 10240 + }, + { + "epoch": 4.9809118541033435, + "grad_norm": 0.07221755145758249, + "learning_rate": 3.1277353980990012e-06, + "loss": 0.4863, + "step": 10241 + }, + { + "epoch": 4.981398176291793, + "grad_norm": 0.07482020030019791, + "learning_rate": 3.1268488274589526e-06, + "loss": 0.5421, + "step": 10242 + }, + { + "epoch": 4.981884498480243, + "grad_norm": 0.0730556729888338, + "learning_rate": 3.1259623253208928e-06, + "loss": 0.4942, + "step": 10243 + }, + { + "epoch": 4.982370820668693, + "grad_norm": 0.07304073005168182, + "learning_rate": 3.125075891717244e-06, + "loss": 0.4978, + "step": 10244 + }, + { + "epoch": 4.982857142857143, + "grad_norm": 0.07807132055053483, + "learning_rate": 3.12418952668042e-06, + "loss": 0.5137, + "step": 10245 + }, + { + "epoch": 4.983343465045593, + "grad_norm": 0.07313450850260139, + "learning_rate": 3.123303230242838e-06, + "loss": 0.4758, + "step": 10246 + }, + { + "epoch": 4.9838297872340425, + "grad_norm": 0.07123861478395989, + "learning_rate": 3.122417002436908e-06, + "loss": 0.4934, + "step": 10247 + }, + { + "epoch": 4.984316109422492, + "grad_norm": 0.07408830345256412, + "learning_rate": 3.1215308432950435e-06, + "loss": 0.5336, + "step": 10248 + }, + { + "epoch": 4.984802431610943, + "grad_norm": 0.07307369786628065, + "learning_rate": 3.1206447528496477e-06, + "loss": 0.5186, + "step": 10249 + }, + { + "epoch": 4.985288753799392, + "grad_norm": 0.07068473962276077, + "learning_rate": 3.1197587311331266e-06, + "loss": 0.4555, + "step": 10250 + }, + { + "epoch": 4.985775075987842, + "grad_norm": 0.07254233727661387, + "learning_rate": 3.118872778177885e-06, + "loss": 0.4922, + "step": 10251 + }, + { + "epoch": 4.986261398176292, + "grad_norm": 0.07195916141458782, + "learning_rate": 3.1179868940163187e-06, + "loss": 0.4824, + "step": 10252 + }, + { + "epoch": 4.9867477203647415, + "grad_norm": 0.07592802675085018, + "learning_rate": 3.1171010786808286e-06, + "loss": 0.5337, + "step": 10253 + }, + { + "epoch": 4.987234042553191, + "grad_norm": 0.07340716911643105, + "learning_rate": 3.116215332203806e-06, + "loss": 0.5056, + "step": 10254 + }, + { + "epoch": 4.987720364741642, + "grad_norm": 0.07485085680500117, + "learning_rate": 3.115329654617647e-06, + "loss": 0.5341, + "step": 10255 + }, + { + "epoch": 4.988206686930091, + "grad_norm": 0.07423446292563808, + "learning_rate": 3.1144440459547355e-06, + "loss": 0.4985, + "step": 10256 + }, + { + "epoch": 4.988693009118541, + "grad_norm": 0.071868463004274, + "learning_rate": 3.113558506247464e-06, + "loss": 0.4785, + "step": 10257 + }, + { + "epoch": 4.989179331306991, + "grad_norm": 0.07450094166013828, + "learning_rate": 3.112673035528213e-06, + "loss": 0.5228, + "step": 10258 + }, + { + "epoch": 4.9896656534954404, + "grad_norm": 0.07174762226281818, + "learning_rate": 3.1117876338293697e-06, + "loss": 0.5082, + "step": 10259 + }, + { + "epoch": 4.99015197568389, + "grad_norm": 0.07311332853867696, + "learning_rate": 3.110902301183307e-06, + "loss": 0.493, + "step": 10260 + }, + { + "epoch": 4.990638297872341, + "grad_norm": 0.074140761913313, + "learning_rate": 3.110017037622408e-06, + "loss": 0.498, + "step": 10261 + }, + { + "epoch": 4.99112462006079, + "grad_norm": 0.07294891605603335, + "learning_rate": 3.109131843179043e-06, + "loss": 0.5275, + "step": 10262 + }, + { + "epoch": 4.99161094224924, + "grad_norm": 0.07237190432044079, + "learning_rate": 3.108246717885587e-06, + "loss": 0.529, + "step": 10263 + }, + { + "epoch": 4.99209726443769, + "grad_norm": 0.07622478406109423, + "learning_rate": 3.107361661774406e-06, + "loss": 0.5347, + "step": 10264 + }, + { + "epoch": 4.992583586626139, + "grad_norm": 0.07292900283168355, + "learning_rate": 3.1064766748778706e-06, + "loss": 0.5103, + "step": 10265 + }, + { + "epoch": 4.99306990881459, + "grad_norm": 0.07162934528368807, + "learning_rate": 3.1055917572283423e-06, + "loss": 0.5067, + "step": 10266 + }, + { + "epoch": 4.99355623100304, + "grad_norm": 0.0772221657392243, + "learning_rate": 3.104706908858186e-06, + "loss": 0.558, + "step": 10267 + }, + { + "epoch": 4.994042553191489, + "grad_norm": 0.07358215377524287, + "learning_rate": 3.1038221297997574e-06, + "loss": 0.5331, + "step": 10268 + }, + { + "epoch": 4.994528875379939, + "grad_norm": 0.07254897051563992, + "learning_rate": 3.1029374200854167e-06, + "loss": 0.542, + "step": 10269 + }, + { + "epoch": 4.995015197568389, + "grad_norm": 0.07197521975929773, + "learning_rate": 3.102052779747514e-06, + "loss": 0.4817, + "step": 10270 + }, + { + "epoch": 4.995501519756839, + "grad_norm": 0.0726734972824184, + "learning_rate": 3.101168208818405e-06, + "loss": 0.4983, + "step": 10271 + }, + { + "epoch": 4.995987841945289, + "grad_norm": 0.07549430565792285, + "learning_rate": 3.100283707330436e-06, + "loss": 0.5319, + "step": 10272 + }, + { + "epoch": 4.996474164133739, + "grad_norm": 0.07407634278685511, + "learning_rate": 3.099399275315957e-06, + "loss": 0.519, + "step": 10273 + }, + { + "epoch": 4.996960486322188, + "grad_norm": 0.07275511124006062, + "learning_rate": 3.0985149128073083e-06, + "loss": 0.4962, + "step": 10274 + }, + { + "epoch": 4.997446808510638, + "grad_norm": 0.07493752613740207, + "learning_rate": 3.097630619836833e-06, + "loss": 0.5442, + "step": 10275 + }, + { + "epoch": 4.997933130699089, + "grad_norm": 0.07215634510644309, + "learning_rate": 3.096746396436871e-06, + "loss": 0.5213, + "step": 10276 + }, + { + "epoch": 4.998419452887538, + "grad_norm": 0.07142708886125362, + "learning_rate": 3.0958622426397567e-06, + "loss": 0.5065, + "step": 10277 + }, + { + "epoch": 4.998905775075988, + "grad_norm": 0.07234281504213742, + "learning_rate": 3.0949781584778284e-06, + "loss": 0.4892, + "step": 10278 + }, + { + "epoch": 4.999392097264438, + "grad_norm": 0.0747017809557893, + "learning_rate": 3.094094143983411e-06, + "loss": 0.5128, + "step": 10279 + }, + { + "epoch": 4.999878419452887, + "grad_norm": 0.07223965839724003, + "learning_rate": 3.0932101991888385e-06, + "loss": 0.4924, + "step": 10280 + }, + { + "epoch": 4.999878419452887, + "eval_loss": 0.5666021108627319, + "eval_runtime": 105.1621, + "eval_samples_per_second": 288.631, + "eval_steps_per_second": 36.087, + "step": 10280 + } + ], + "logging_steps": 1, + "max_steps": 16448, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 2056, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4684063684165632.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}