diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30483 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22686392180443912, + "eval_steps": 500, + "global_step": 870000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.215262570216991e-05, + "grad_norm": 196.05113220214844, + "learning_rate": 2.6076639242734397e-08, + "loss": 19.4606, + "step": 200 + }, + { + "epoch": 0.00010430525140433982, + "grad_norm": 184.3112335205078, + "learning_rate": 5.2153278485468794e-08, + "loss": 19.0516, + "step": 400 + }, + { + "epoch": 0.00015645787710650975, + "grad_norm": 126.78282928466797, + "learning_rate": 7.822991772820321e-08, + "loss": 17.3077, + "step": 600 + }, + { + "epoch": 0.00020861050280867964, + "grad_norm": 59.83221435546875, + "learning_rate": 1.0430655697093759e-07, + "loss": 15.0746, + "step": 800 + }, + { + "epoch": 0.0002607631285108496, + "grad_norm": 28.47006607055664, + "learning_rate": 1.30383196213672e-07, + "loss": 13.063, + "step": 1000 + }, + { + "epoch": 0.0003129157542130195, + "grad_norm": 16.229631423950195, + "learning_rate": 1.5645983545640642e-07, + "loss": 11.636, + "step": 1200 + }, + { + "epoch": 0.0003650683799151894, + "grad_norm": 14.528124809265137, + "learning_rate": 1.825364746991408e-07, + "loss": 10.7786, + "step": 1400 + }, + { + "epoch": 0.0004172210056173593, + "grad_norm": 12.064294815063477, + "learning_rate": 2.0861311394187518e-07, + "loss": 10.2434, + "step": 1600 + }, + { + "epoch": 0.00046937363131952924, + "grad_norm": 11.837329864501953, + "learning_rate": 2.3468975318460956e-07, + "loss": 9.9074, + "step": 1800 + }, + { + "epoch": 0.0005215262570216992, + "grad_norm": 10.613241195678711, + "learning_rate": 2.60766392427344e-07, + "loss": 9.6339, + "step": 2000 + }, + { + "epoch": 0.000573678882723869, + "grad_norm": 10.466361045837402, + "learning_rate": 2.8684303167007837e-07, + "loss": 9.4193, + "step": 2200 + }, + { + "epoch": 0.000625831508426039, + "grad_norm": 10.19588565826416, + "learning_rate": 3.1291967091281283e-07, + "loss": 9.2786, + "step": 2400 + }, + { + "epoch": 0.0006779841341282089, + "grad_norm": 9.127360343933105, + "learning_rate": 3.389963101555472e-07, + "loss": 9.1531, + "step": 2600 + }, + { + "epoch": 0.0007301367598303788, + "grad_norm": 10.378647804260254, + "learning_rate": 3.650729493982816e-07, + "loss": 9.0132, + "step": 2800 + }, + { + "epoch": 0.0007822893855325487, + "grad_norm": 10.13025951385498, + "learning_rate": 3.91149588641016e-07, + "loss": 8.876, + "step": 3000 + }, + { + "epoch": 0.0008344420112347186, + "grad_norm": 8.717490196228027, + "learning_rate": 4.1722622788375035e-07, + "loss": 8.7792, + "step": 3200 + }, + { + "epoch": 0.0008865946369368885, + "grad_norm": 8.891645431518555, + "learning_rate": 4.4330286712648476e-07, + "loss": 8.675, + "step": 3400 + }, + { + "epoch": 0.0009387472626390585, + "grad_norm": 8.681458473205566, + "learning_rate": 4.693795063692191e-07, + "loss": 8.5239, + "step": 3600 + }, + { + "epoch": 0.0009908998883412284, + "grad_norm": 9.971908569335938, + "learning_rate": 4.954561456119536e-07, + "loss": 8.4858, + "step": 3800 + }, + { + "epoch": 0.0010430525140433984, + "grad_norm": 9.44089412689209, + "learning_rate": 5.21532784854688e-07, + "loss": 8.3296, + "step": 4000 + }, + { + "epoch": 0.0010952051397455681, + "grad_norm": 8.453426361083984, + "learning_rate": 
5.476094240974224e-07, + "loss": 8.2809, + "step": 4200 + }, + { + "epoch": 0.001147357765447738, + "grad_norm": 8.532354354858398, + "learning_rate": 5.736860633401567e-07, + "loss": 8.1715, + "step": 4400 + }, + { + "epoch": 0.001199510391149908, + "grad_norm": 8.517669677734375, + "learning_rate": 5.997627025828912e-07, + "loss": 8.0996, + "step": 4600 + }, + { + "epoch": 0.001251663016852078, + "grad_norm": 9.150565147399902, + "learning_rate": 6.258393418256257e-07, + "loss": 8.0604, + "step": 4800 + }, + { + "epoch": 0.001303815642554248, + "grad_norm": 8.797100067138672, + "learning_rate": 6.519159810683599e-07, + "loss": 7.9397, + "step": 5000 + }, + { + "epoch": 0.0013559682682564179, + "grad_norm": 8.194568634033203, + "learning_rate": 6.779926203110944e-07, + "loss": 7.8885, + "step": 5200 + }, + { + "epoch": 0.0014081208939585876, + "grad_norm": 8.242785453796387, + "learning_rate": 7.040692595538287e-07, + "loss": 7.8287, + "step": 5400 + }, + { + "epoch": 0.0014602735196607576, + "grad_norm": 9.448251724243164, + "learning_rate": 7.301458987965632e-07, + "loss": 7.7276, + "step": 5600 + }, + { + "epoch": 0.0015124261453629275, + "grad_norm": 7.932211875915527, + "learning_rate": 7.562225380392975e-07, + "loss": 7.6658, + "step": 5800 + }, + { + "epoch": 0.0015645787710650975, + "grad_norm": 8.316240310668945, + "learning_rate": 7.82299177282032e-07, + "loss": 7.5645, + "step": 6000 + }, + { + "epoch": 0.0016167313967672674, + "grad_norm": 8.247200965881348, + "learning_rate": 8.083758165247665e-07, + "loss": 7.508, + "step": 6200 + }, + { + "epoch": 0.0016688840224694372, + "grad_norm": 8.689373970031738, + "learning_rate": 8.344524557675007e-07, + "loss": 7.4703, + "step": 6400 + }, + { + "epoch": 0.001721036648171607, + "grad_norm": 7.99875545501709, + "learning_rate": 8.605290950102352e-07, + "loss": 7.3914, + "step": 6600 + }, + { + "epoch": 0.001773189273873777, + "grad_norm": 8.472365379333496, + "learning_rate": 8.866057342529695e-07, + "loss": 7.3268, + "step": 6800 + }, + { + "epoch": 0.001825341899575947, + "grad_norm": 8.410703659057617, + "learning_rate": 9.12682373495704e-07, + "loss": 7.2791, + "step": 7000 + }, + { + "epoch": 0.001877494525278117, + "grad_norm": 9.348575592041016, + "learning_rate": 9.387590127384382e-07, + "loss": 7.2181, + "step": 7200 + }, + { + "epoch": 0.001929647150980287, + "grad_norm": 8.044520378112793, + "learning_rate": 9.648356519811728e-07, + "loss": 7.1393, + "step": 7400 + }, + { + "epoch": 0.001981799776682457, + "grad_norm": 8.222118377685547, + "learning_rate": 9.909122912239071e-07, + "loss": 7.1147, + "step": 7600 + }, + { + "epoch": 0.002033952402384627, + "grad_norm": 8.315069198608398, + "learning_rate": 1.0169889304666415e-06, + "loss": 7.0527, + "step": 7800 + }, + { + "epoch": 0.0020861050280867968, + "grad_norm": 8.228837013244629, + "learning_rate": 1.043065569709376e-06, + "loss": 6.9868, + "step": 8000 + }, + { + "epoch": 0.0021382576537889663, + "grad_norm": 8.025296211242676, + "learning_rate": 1.0691422089521102e-06, + "loss": 6.9165, + "step": 8200 + }, + { + "epoch": 0.0021904102794911362, + "grad_norm": 8.421102523803711, + "learning_rate": 1.0952188481948448e-06, + "loss": 6.8728, + "step": 8400 + }, + { + "epoch": 0.002242562905193306, + "grad_norm": 8.444592475891113, + "learning_rate": 1.1212954874375791e-06, + "loss": 6.8894, + "step": 8600 + }, + { + "epoch": 0.002294715530895476, + "grad_norm": 9.23551082611084, + "learning_rate": 1.1473721266803135e-06, + "loss": 6.8165, + "step": 8800 + }, + { + 
"epoch": 0.002346868156597646, + "grad_norm": 7.454519748687744, + "learning_rate": 1.173448765923048e-06, + "loss": 6.7488, + "step": 9000 + }, + { + "epoch": 0.002399020782299816, + "grad_norm": 8.644956588745117, + "learning_rate": 1.1995254051657824e-06, + "loss": 6.7656, + "step": 9200 + }, + { + "epoch": 0.002451173408001986, + "grad_norm": 8.16921615600586, + "learning_rate": 1.2256020444085168e-06, + "loss": 6.6187, + "step": 9400 + }, + { + "epoch": 0.002503326033704156, + "grad_norm": 8.748774528503418, + "learning_rate": 1.2516786836512513e-06, + "loss": 6.5993, + "step": 9600 + }, + { + "epoch": 0.002555478659406326, + "grad_norm": 7.972047328948975, + "learning_rate": 1.2777553228939855e-06, + "loss": 6.6099, + "step": 9800 + }, + { + "epoch": 0.002607631285108496, + "grad_norm": 7.958851337432861, + "learning_rate": 1.3038319621367198e-06, + "loss": 6.5708, + "step": 10000 + }, + { + "epoch": 0.002659783910810666, + "grad_norm": 8.20793628692627, + "learning_rate": 1.3299086013794544e-06, + "loss": 6.5175, + "step": 10200 + }, + { + "epoch": 0.0027119365365128358, + "grad_norm": 8.190882682800293, + "learning_rate": 1.3559852406221887e-06, + "loss": 6.4806, + "step": 10400 + }, + { + "epoch": 0.0027640891622150053, + "grad_norm": 8.604300498962402, + "learning_rate": 1.3820618798649233e-06, + "loss": 6.4768, + "step": 10600 + }, + { + "epoch": 0.0028162417879171752, + "grad_norm": 8.548392295837402, + "learning_rate": 1.4081385191076574e-06, + "loss": 6.4409, + "step": 10800 + }, + { + "epoch": 0.002868394413619345, + "grad_norm": 7.730503559112549, + "learning_rate": 1.4342151583503918e-06, + "loss": 6.4013, + "step": 11000 + }, + { + "epoch": 0.002920547039321515, + "grad_norm": 8.61767292022705, + "learning_rate": 1.4602917975931264e-06, + "loss": 6.3355, + "step": 11200 + }, + { + "epoch": 0.002972699665023685, + "grad_norm": 7.736725330352783, + "learning_rate": 1.4863684368358607e-06, + "loss": 6.2992, + "step": 11400 + }, + { + "epoch": 0.003024852290725855, + "grad_norm": 8.229381561279297, + "learning_rate": 1.512445076078595e-06, + "loss": 6.293, + "step": 11600 + }, + { + "epoch": 0.003077004916428025, + "grad_norm": 8.532485008239746, + "learning_rate": 1.5385217153213294e-06, + "loss": 6.2469, + "step": 11800 + }, + { + "epoch": 0.003129157542130195, + "grad_norm": 8.137749671936035, + "learning_rate": 1.564598354564064e-06, + "loss": 6.1848, + "step": 12000 + }, + { + "epoch": 0.003181310167832365, + "grad_norm": 7.897096157073975, + "learning_rate": 1.5906749938067983e-06, + "loss": 6.1823, + "step": 12200 + }, + { + "epoch": 0.003233462793534535, + "grad_norm": 7.656859874725342, + "learning_rate": 1.616751633049533e-06, + "loss": 6.1289, + "step": 12400 + }, + { + "epoch": 0.003285615419236705, + "grad_norm": 8.03537654876709, + "learning_rate": 1.642828272292267e-06, + "loss": 6.1108, + "step": 12600 + }, + { + "epoch": 0.0033377680449388743, + "grad_norm": 7.999448299407959, + "learning_rate": 1.6689049115350014e-06, + "loss": 6.0881, + "step": 12800 + }, + { + "epoch": 0.0033899206706410443, + "grad_norm": 8.130730628967285, + "learning_rate": 1.694981550777736e-06, + "loss": 6.05, + "step": 13000 + }, + { + "epoch": 0.003442073296343214, + "grad_norm": 8.437195777893066, + "learning_rate": 1.7210581900204703e-06, + "loss": 6.0568, + "step": 13200 + }, + { + "epoch": 0.003494225922045384, + "grad_norm": 8.395051956176758, + "learning_rate": 1.747134829263205e-06, + "loss": 6.0069, + "step": 13400 + }, + { + "epoch": 0.003546378547747554, + "grad_norm": 
8.429478645324707, + "learning_rate": 1.773211468505939e-06, + "loss": 5.9598, + "step": 13600 + }, + { + "epoch": 0.003598531173449724, + "grad_norm": 8.610182762145996, + "learning_rate": 1.7992881077486734e-06, + "loss": 5.9299, + "step": 13800 + }, + { + "epoch": 0.003650683799151894, + "grad_norm": 7.439322471618652, + "learning_rate": 1.825364746991408e-06, + "loss": 5.913, + "step": 14000 + }, + { + "epoch": 0.003702836424854064, + "grad_norm": 8.080249786376953, + "learning_rate": 1.8514413862341423e-06, + "loss": 5.9126, + "step": 14200 + }, + { + "epoch": 0.003754989050556234, + "grad_norm": 7.7550201416015625, + "learning_rate": 1.8775180254768765e-06, + "loss": 5.8244, + "step": 14400 + }, + { + "epoch": 0.003807141676258404, + "grad_norm": 8.251408576965332, + "learning_rate": 1.903594664719611e-06, + "loss": 5.8174, + "step": 14600 + }, + { + "epoch": 0.003859294301960574, + "grad_norm": 7.648636817932129, + "learning_rate": 1.9296713039623456e-06, + "loss": 5.8122, + "step": 14800 + }, + { + "epoch": 0.003911446927662743, + "grad_norm": 7.906362533569336, + "learning_rate": 1.95574794320508e-06, + "loss": 5.7618, + "step": 15000 + }, + { + "epoch": 0.003963599553364914, + "grad_norm": 7.8534698486328125, + "learning_rate": 1.9818245824478143e-06, + "loss": 5.7498, + "step": 15200 + }, + { + "epoch": 0.004015752179067083, + "grad_norm": 8.2284574508667, + "learning_rate": 2.0079012216905486e-06, + "loss": 5.7466, + "step": 15400 + }, + { + "epoch": 0.004067904804769254, + "grad_norm": 8.237747192382812, + "learning_rate": 2.033977860933283e-06, + "loss": 5.7004, + "step": 15600 + }, + { + "epoch": 0.004120057430471423, + "grad_norm": 7.435640335083008, + "learning_rate": 2.0600545001760174e-06, + "loss": 5.6729, + "step": 15800 + }, + { + "epoch": 0.0041722100561735935, + "grad_norm": 7.641366958618164, + "learning_rate": 2.086131139418752e-06, + "loss": 5.6398, + "step": 16000 + }, + { + "epoch": 0.004224362681875763, + "grad_norm": 7.673348426818848, + "learning_rate": 2.1122077786614865e-06, + "loss": 5.6584, + "step": 16200 + }, + { + "epoch": 0.004276515307577933, + "grad_norm": 8.171424865722656, + "learning_rate": 2.1382844179042204e-06, + "loss": 5.5945, + "step": 16400 + }, + { + "epoch": 0.004328667933280103, + "grad_norm": 7.752162933349609, + "learning_rate": 2.164361057146955e-06, + "loss": 5.6081, + "step": 16600 + }, + { + "epoch": 0.0043808205589822725, + "grad_norm": 7.9565958976745605, + "learning_rate": 2.1904376963896896e-06, + "loss": 5.5563, + "step": 16800 + }, + { + "epoch": 0.004432973184684443, + "grad_norm": 7.686338424682617, + "learning_rate": 2.216514335632424e-06, + "loss": 5.5785, + "step": 17000 + }, + { + "epoch": 0.004485125810386612, + "grad_norm": 7.707006931304932, + "learning_rate": 2.2425909748751583e-06, + "loss": 5.57, + "step": 17200 + }, + { + "epoch": 0.004537278436088783, + "grad_norm": 8.033032417297363, + "learning_rate": 2.2686676141178926e-06, + "loss": 5.5128, + "step": 17400 + }, + { + "epoch": 0.004589431061790952, + "grad_norm": 8.142727851867676, + "learning_rate": 2.294744253360627e-06, + "loss": 5.4853, + "step": 17600 + }, + { + "epoch": 0.004641583687493123, + "grad_norm": 7.563163757324219, + "learning_rate": 2.3208208926033613e-06, + "loss": 5.4514, + "step": 17800 + }, + { + "epoch": 0.004693736313195292, + "grad_norm": 7.709859848022461, + "learning_rate": 2.346897531846096e-06, + "loss": 5.4332, + "step": 18000 + }, + { + "epoch": 0.004745888938897463, + "grad_norm": 8.00818920135498, + "learning_rate": 
2.37297417108883e-06, + "loss": 5.474, + "step": 18200 + }, + { + "epoch": 0.004798041564599632, + "grad_norm": 8.24853229522705, + "learning_rate": 2.399050810331565e-06, + "loss": 5.4155, + "step": 18400 + }, + { + "epoch": 0.004850194190301802, + "grad_norm": 8.416743278503418, + "learning_rate": 2.425127449574299e-06, + "loss": 5.3975, + "step": 18600 + }, + { + "epoch": 0.004902346816003972, + "grad_norm": 8.024255752563477, + "learning_rate": 2.4512040888170335e-06, + "loss": 5.3896, + "step": 18800 + }, + { + "epoch": 0.0049544994417061415, + "grad_norm": 7.6818132400512695, + "learning_rate": 2.477280728059768e-06, + "loss": 5.3643, + "step": 19000 + }, + { + "epoch": 0.005006652067408312, + "grad_norm": 7.544449329376221, + "learning_rate": 2.5033573673025026e-06, + "loss": 5.3326, + "step": 19200 + }, + { + "epoch": 0.005058804693110481, + "grad_norm": 7.725053787231445, + "learning_rate": 2.5294340065452366e-06, + "loss": 5.3108, + "step": 19400 + }, + { + "epoch": 0.005110957318812652, + "grad_norm": 7.843037128448486, + "learning_rate": 2.555510645787971e-06, + "loss": 5.2766, + "step": 19600 + }, + { + "epoch": 0.005163109944514821, + "grad_norm": 7.6276164054870605, + "learning_rate": 2.5815872850307057e-06, + "loss": 5.3071, + "step": 19800 + }, + { + "epoch": 0.005215262570216992, + "grad_norm": 7.907887935638428, + "learning_rate": 2.6076639242734396e-06, + "loss": 5.2333, + "step": 20000 + }, + { + "epoch": 0.005267415195919161, + "grad_norm": 7.383638858795166, + "learning_rate": 2.6337405635161744e-06, + "loss": 5.3013, + "step": 20200 + }, + { + "epoch": 0.005319567821621332, + "grad_norm": 7.0909247398376465, + "learning_rate": 2.6598172027589088e-06, + "loss": 5.222, + "step": 20400 + }, + { + "epoch": 0.005371720447323501, + "grad_norm": 8.317301750183105, + "learning_rate": 2.6858938420016427e-06, + "loss": 5.1935, + "step": 20600 + }, + { + "epoch": 0.0054238730730256715, + "grad_norm": 7.909549713134766, + "learning_rate": 2.7119704812443775e-06, + "loss": 5.1732, + "step": 20800 + }, + { + "epoch": 0.005476025698727841, + "grad_norm": 8.112817764282227, + "learning_rate": 2.738047120487112e-06, + "loss": 5.2078, + "step": 21000 + }, + { + "epoch": 0.0055281783244300105, + "grad_norm": 7.593140125274658, + "learning_rate": 2.7641237597298466e-06, + "loss": 5.1478, + "step": 21200 + }, + { + "epoch": 0.005580330950132181, + "grad_norm": 7.865172863006592, + "learning_rate": 2.7902003989725805e-06, + "loss": 5.1364, + "step": 21400 + }, + { + "epoch": 0.0056324835758343504, + "grad_norm": 7.748229503631592, + "learning_rate": 2.816277038215315e-06, + "loss": 5.1232, + "step": 21600 + }, + { + "epoch": 0.005684636201536521, + "grad_norm": 7.954893112182617, + "learning_rate": 2.8423536774580497e-06, + "loss": 5.138, + "step": 21800 + }, + { + "epoch": 0.00573678882723869, + "grad_norm": 7.68300199508667, + "learning_rate": 2.8684303167007836e-06, + "loss": 5.1083, + "step": 22000 + }, + { + "epoch": 0.005788941452940861, + "grad_norm": 9.848554611206055, + "learning_rate": 2.8945069559435184e-06, + "loss": 5.0713, + "step": 22200 + }, + { + "epoch": 0.00584109407864303, + "grad_norm": 7.352326393127441, + "learning_rate": 2.9205835951862527e-06, + "loss": 5.0239, + "step": 22400 + }, + { + "epoch": 0.005893246704345201, + "grad_norm": 8.010299682617188, + "learning_rate": 2.9466602344289867e-06, + "loss": 5.0562, + "step": 22600 + }, + { + "epoch": 0.00594539933004737, + "grad_norm": 7.50053071975708, + "learning_rate": 2.9727368736717214e-06, + "loss": 4.9805, + 
"step": 22800 + }, + { + "epoch": 0.0059975519557495405, + "grad_norm": 7.903580188751221, + "learning_rate": 2.998813512914456e-06, + "loss": 5.0251, + "step": 23000 + }, + { + "epoch": 0.00604970458145171, + "grad_norm": 7.933001518249512, + "learning_rate": 3.02489015215719e-06, + "loss": 5.0268, + "step": 23200 + }, + { + "epoch": 0.00610185720715388, + "grad_norm": 7.974423885345459, + "learning_rate": 3.0509667913999245e-06, + "loss": 5.0149, + "step": 23400 + }, + { + "epoch": 0.00615400983285605, + "grad_norm": 7.946493148803711, + "learning_rate": 3.077043430642659e-06, + "loss": 4.9947, + "step": 23600 + }, + { + "epoch": 0.0062061624585582195, + "grad_norm": 8.70067024230957, + "learning_rate": 3.1031200698853936e-06, + "loss": 4.9885, + "step": 23800 + }, + { + "epoch": 0.00625831508426039, + "grad_norm": 7.401217937469482, + "learning_rate": 3.129196709128128e-06, + "loss": 4.9058, + "step": 24000 + }, + { + "epoch": 0.006310467709962559, + "grad_norm": 8.371158599853516, + "learning_rate": 3.155273348370862e-06, + "loss": 4.9456, + "step": 24200 + }, + { + "epoch": 0.00636262033566473, + "grad_norm": 7.775518894195557, + "learning_rate": 3.1813499876135967e-06, + "loss": 4.9482, + "step": 24400 + }, + { + "epoch": 0.006414772961366899, + "grad_norm": 7.885232448577881, + "learning_rate": 3.207426626856331e-06, + "loss": 4.9251, + "step": 24600 + }, + { + "epoch": 0.00646692558706907, + "grad_norm": 7.657386302947998, + "learning_rate": 3.233503266099066e-06, + "loss": 4.8979, + "step": 24800 + }, + { + "epoch": 0.006519078212771239, + "grad_norm": 7.980145454406738, + "learning_rate": 3.2595799053417998e-06, + "loss": 4.8544, + "step": 25000 + }, + { + "epoch": 0.00657123083847341, + "grad_norm": 7.923119068145752, + "learning_rate": 3.285656544584534e-06, + "loss": 4.8472, + "step": 25200 + }, + { + "epoch": 0.006623383464175579, + "grad_norm": 7.500796318054199, + "learning_rate": 3.311733183827269e-06, + "loss": 4.8235, + "step": 25400 + }, + { + "epoch": 0.006675536089877749, + "grad_norm": 7.593719005584717, + "learning_rate": 3.337809823070003e-06, + "loss": 4.8552, + "step": 25600 + }, + { + "epoch": 0.006727688715579919, + "grad_norm": 7.597663879394531, + "learning_rate": 3.3638864623127376e-06, + "loss": 4.822, + "step": 25800 + }, + { + "epoch": 0.0067798413412820885, + "grad_norm": 7.924131393432617, + "learning_rate": 3.389963101555472e-06, + "loss": 4.8013, + "step": 26000 + }, + { + "epoch": 0.006831993966984259, + "grad_norm": 8.150907516479492, + "learning_rate": 3.416039740798206e-06, + "loss": 4.808, + "step": 26200 + }, + { + "epoch": 0.006884146592686428, + "grad_norm": 8.553031921386719, + "learning_rate": 3.4421163800409407e-06, + "loss": 4.7903, + "step": 26400 + }, + { + "epoch": 0.006936299218388599, + "grad_norm": 7.992140293121338, + "learning_rate": 3.468193019283675e-06, + "loss": 4.7669, + "step": 26600 + }, + { + "epoch": 0.006988451844090768, + "grad_norm": 9.123887062072754, + "learning_rate": 3.49426965852641e-06, + "loss": 4.7772, + "step": 26800 + }, + { + "epoch": 0.007040604469792939, + "grad_norm": 7.843687057495117, + "learning_rate": 3.5203462977691437e-06, + "loss": 4.7443, + "step": 27000 + }, + { + "epoch": 0.007092757095495108, + "grad_norm": 7.194400787353516, + "learning_rate": 3.546422937011878e-06, + "loss": 4.6394, + "step": 27200 + }, + { + "epoch": 0.007144909721197279, + "grad_norm": 8.256219863891602, + "learning_rate": 3.572499576254613e-06, + "loss": 4.7375, + "step": 27400 + }, + { + "epoch": 0.007197062346899448, + 
"grad_norm": 7.898242473602295, + "learning_rate": 3.5985762154973468e-06, + "loss": 4.6597, + "step": 27600 + }, + { + "epoch": 0.0072492149726016185, + "grad_norm": 8.30362606048584, + "learning_rate": 3.6246528547400816e-06, + "loss": 4.6724, + "step": 27800 + }, + { + "epoch": 0.007301367598303788, + "grad_norm": 7.795161247253418, + "learning_rate": 3.650729493982816e-06, + "loss": 4.6485, + "step": 28000 + }, + { + "epoch": 0.0073535202240059576, + "grad_norm": 7.712084770202637, + "learning_rate": 3.67680613322555e-06, + "loss": 4.6684, + "step": 28200 + }, + { + "epoch": 0.007405672849708128, + "grad_norm": 7.2499165534973145, + "learning_rate": 3.7028827724682846e-06, + "loss": 4.6383, + "step": 28400 + }, + { + "epoch": 0.0074578254754102975, + "grad_norm": 7.843800067901611, + "learning_rate": 3.728959411711019e-06, + "loss": 4.644, + "step": 28600 + }, + { + "epoch": 0.007509978101112468, + "grad_norm": 8.171004295349121, + "learning_rate": 3.755036050953753e-06, + "loss": 4.5964, + "step": 28800 + }, + { + "epoch": 0.007562130726814637, + "grad_norm": 8.430412292480469, + "learning_rate": 3.7811126901964877e-06, + "loss": 4.5651, + "step": 29000 + }, + { + "epoch": 0.007614283352516808, + "grad_norm": 8.003270149230957, + "learning_rate": 3.807189329439222e-06, + "loss": 4.6326, + "step": 29200 + }, + { + "epoch": 0.007666435978218977, + "grad_norm": 8.111075401306152, + "learning_rate": 3.833265968681957e-06, + "loss": 4.5397, + "step": 29400 + }, + { + "epoch": 0.007718588603921148, + "grad_norm": 8.523173332214355, + "learning_rate": 3.859342607924691e-06, + "loss": 4.5506, + "step": 29600 + }, + { + "epoch": 0.007770741229623317, + "grad_norm": 8.225560188293457, + "learning_rate": 3.8854192471674255e-06, + "loss": 4.5974, + "step": 29800 + }, + { + "epoch": 0.007822893855325487, + "grad_norm": 8.14545726776123, + "learning_rate": 3.91149588641016e-06, + "loss": 4.5563, + "step": 30000 + }, + { + "epoch": 0.007875046481027657, + "grad_norm": 8.375019073486328, + "learning_rate": 3.937572525652894e-06, + "loss": 4.513, + "step": 30200 + }, + { + "epoch": 0.007927199106729827, + "grad_norm": 7.618561744689941, + "learning_rate": 3.963649164895629e-06, + "loss": 4.5606, + "step": 30400 + }, + { + "epoch": 0.007979351732431996, + "grad_norm": 7.901875019073486, + "learning_rate": 3.989725804138363e-06, + "loss": 4.5278, + "step": 30600 + }, + { + "epoch": 0.008031504358134166, + "grad_norm": 8.019709587097168, + "learning_rate": 4.015802443381097e-06, + "loss": 4.5119, + "step": 30800 + }, + { + "epoch": 0.008083656983836337, + "grad_norm": 7.532272815704346, + "learning_rate": 4.041879082623832e-06, + "loss": 4.4828, + "step": 31000 + }, + { + "epoch": 0.008135809609538507, + "grad_norm": 8.28953742980957, + "learning_rate": 4.067955721866566e-06, + "loss": 4.516, + "step": 31200 + }, + { + "epoch": 0.008187962235240676, + "grad_norm": 8.234850883483887, + "learning_rate": 4.094032361109301e-06, + "loss": 4.5126, + "step": 31400 + }, + { + "epoch": 0.008240114860942846, + "grad_norm": 7.353629112243652, + "learning_rate": 4.120109000352035e-06, + "loss": 4.4683, + "step": 31600 + }, + { + "epoch": 0.008292267486645017, + "grad_norm": 7.64729118347168, + "learning_rate": 4.146185639594769e-06, + "loss": 4.4538, + "step": 31800 + }, + { + "epoch": 0.008344420112347187, + "grad_norm": 7.47349214553833, + "learning_rate": 4.172262278837504e-06, + "loss": 4.4283, + "step": 32000 + }, + { + "epoch": 0.008396572738049356, + "grad_norm": 8.011587142944336, + "learning_rate": 
4.198338918080238e-06, + "loss": 4.414, + "step": 32200 + }, + { + "epoch": 0.008448725363751526, + "grad_norm": 8.157088279724121, + "learning_rate": 4.224415557322973e-06, + "loss": 4.4103, + "step": 32400 + }, + { + "epoch": 0.008500877989453696, + "grad_norm": 8.141148567199707, + "learning_rate": 4.250492196565707e-06, + "loss": 4.3756, + "step": 32600 + }, + { + "epoch": 0.008553030615155865, + "grad_norm": 7.892102241516113, + "learning_rate": 4.276568835808441e-06, + "loss": 4.415, + "step": 32800 + }, + { + "epoch": 0.008605183240858036, + "grad_norm": 8.264842987060547, + "learning_rate": 4.302645475051176e-06, + "loss": 4.4202, + "step": 33000 + }, + { + "epoch": 0.008657335866560206, + "grad_norm": 7.995069980621338, + "learning_rate": 4.32872211429391e-06, + "loss": 4.407, + "step": 33200 + }, + { + "epoch": 0.008709488492262376, + "grad_norm": 8.319681167602539, + "learning_rate": 4.354798753536645e-06, + "loss": 4.3725, + "step": 33400 + }, + { + "epoch": 0.008761641117964545, + "grad_norm": 7.941286087036133, + "learning_rate": 4.380875392779379e-06, + "loss": 4.3491, + "step": 33600 + }, + { + "epoch": 0.008813793743666715, + "grad_norm": 7.6486029624938965, + "learning_rate": 4.4069520320221135e-06, + "loss": 4.3385, + "step": 33800 + }, + { + "epoch": 0.008865946369368886, + "grad_norm": 8.253968238830566, + "learning_rate": 4.433028671264848e-06, + "loss": 4.3506, + "step": 34000 + }, + { + "epoch": 0.008918098995071056, + "grad_norm": 7.83550500869751, + "learning_rate": 4.459105310507582e-06, + "loss": 4.3523, + "step": 34200 + }, + { + "epoch": 0.008970251620773225, + "grad_norm": 8.142230987548828, + "learning_rate": 4.4851819497503165e-06, + "loss": 4.3497, + "step": 34400 + }, + { + "epoch": 0.009022404246475395, + "grad_norm": 8.258264541625977, + "learning_rate": 4.511258588993051e-06, + "loss": 4.334, + "step": 34600 + }, + { + "epoch": 0.009074556872177566, + "grad_norm": 7.864555358886719, + "learning_rate": 4.537335228235785e-06, + "loss": 4.3043, + "step": 34800 + }, + { + "epoch": 0.009126709497879734, + "grad_norm": 7.830788612365723, + "learning_rate": 4.56341186747852e-06, + "loss": 4.2891, + "step": 35000 + }, + { + "epoch": 0.009178862123581905, + "grad_norm": 8.586557388305664, + "learning_rate": 4.589488506721254e-06, + "loss": 4.2071, + "step": 35200 + }, + { + "epoch": 0.009231014749284075, + "grad_norm": 8.016130447387695, + "learning_rate": 4.615565145963988e-06, + "loss": 4.2689, + "step": 35400 + }, + { + "epoch": 0.009283167374986245, + "grad_norm": 7.466185092926025, + "learning_rate": 4.641641785206723e-06, + "loss": 4.313, + "step": 35600 + }, + { + "epoch": 0.009335320000688414, + "grad_norm": 7.97619104385376, + "learning_rate": 4.667718424449457e-06, + "loss": 4.2527, + "step": 35800 + }, + { + "epoch": 0.009387472626390584, + "grad_norm": 8.14663028717041, + "learning_rate": 4.693795063692192e-06, + "loss": 4.2316, + "step": 36000 + }, + { + "epoch": 0.009439625252092755, + "grad_norm": 7.500378608703613, + "learning_rate": 4.7198717029349265e-06, + "loss": 4.2477, + "step": 36200 + }, + { + "epoch": 0.009491777877794925, + "grad_norm": 7.814052104949951, + "learning_rate": 4.74594834217766e-06, + "loss": 4.2685, + "step": 36400 + }, + { + "epoch": 0.009543930503497094, + "grad_norm": 7.774244785308838, + "learning_rate": 4.772024981420395e-06, + "loss": 4.2282, + "step": 36600 + }, + { + "epoch": 0.009596083129199264, + "grad_norm": 7.567872524261475, + "learning_rate": 4.79810162066313e-06, + "loss": 4.2201, + "step": 36800 + }, + { 
+ "epoch": 0.009648235754901435, + "grad_norm": 7.794075965881348, + "learning_rate": 4.824178259905864e-06, + "loss": 4.2564, + "step": 37000 + }, + { + "epoch": 0.009700388380603603, + "grad_norm": 7.229556560516357, + "learning_rate": 4.850254899148598e-06, + "loss": 4.197, + "step": 37200 + }, + { + "epoch": 0.009752541006305774, + "grad_norm": 7.552301406860352, + "learning_rate": 4.876331538391333e-06, + "loss": 4.1951, + "step": 37400 + }, + { + "epoch": 0.009804693632007944, + "grad_norm": 7.268792152404785, + "learning_rate": 4.902408177634067e-06, + "loss": 4.1937, + "step": 37600 + }, + { + "epoch": 0.009856846257710114, + "grad_norm": 8.427607536315918, + "learning_rate": 4.928484816876801e-06, + "loss": 4.1549, + "step": 37800 + }, + { + "epoch": 0.009908998883412283, + "grad_norm": 7.394150733947754, + "learning_rate": 4.954561456119536e-06, + "loss": 4.1654, + "step": 38000 + }, + { + "epoch": 0.009961151509114453, + "grad_norm": 8.243969917297363, + "learning_rate": 4.98063809536227e-06, + "loss": 4.1688, + "step": 38200 + }, + { + "epoch": 0.010013304134816624, + "grad_norm": 7.82209587097168, + "learning_rate": 5.006714734605005e-06, + "loss": 4.164, + "step": 38400 + }, + { + "epoch": 0.010065456760518794, + "grad_norm": 7.940906047821045, + "learning_rate": 5.032791373847739e-06, + "loss": 4.1694, + "step": 38600 + }, + { + "epoch": 0.010117609386220963, + "grad_norm": 7.952986717224121, + "learning_rate": 5.058868013090473e-06, + "loss": 4.18, + "step": 38800 + }, + { + "epoch": 0.010169762011923133, + "grad_norm": 8.000782012939453, + "learning_rate": 5.084944652333208e-06, + "loss": 4.1193, + "step": 39000 + }, + { + "epoch": 0.010221914637625304, + "grad_norm": 7.911661148071289, + "learning_rate": 5.111021291575942e-06, + "loss": 4.0802, + "step": 39200 + }, + { + "epoch": 0.010274067263327474, + "grad_norm": 7.344413757324219, + "learning_rate": 5.137097930818676e-06, + "loss": 4.0842, + "step": 39400 + }, + { + "epoch": 0.010326219889029643, + "grad_norm": 8.170016288757324, + "learning_rate": 5.163174570061411e-06, + "loss": 4.111, + "step": 39600 + }, + { + "epoch": 0.010378372514731813, + "grad_norm": 7.0969719886779785, + "learning_rate": 5.189251209304146e-06, + "loss": 4.1323, + "step": 39800 + }, + { + "epoch": 0.010430525140433983, + "grad_norm": 7.286205291748047, + "learning_rate": 5.215327848546879e-06, + "loss": 4.0863, + "step": 40000 + }, + { + "epoch": 0.010482677766136152, + "grad_norm": 7.330994129180908, + "learning_rate": 5.2414044877896145e-06, + "loss": 4.0936, + "step": 40200 + }, + { + "epoch": 0.010534830391838322, + "grad_norm": 7.573282718658447, + "learning_rate": 5.267481127032349e-06, + "loss": 4.1067, + "step": 40400 + }, + { + "epoch": 0.010586983017540493, + "grad_norm": 8.244158744812012, + "learning_rate": 5.293557766275082e-06, + "loss": 4.0502, + "step": 40600 + }, + { + "epoch": 0.010639135643242663, + "grad_norm": 7.125401496887207, + "learning_rate": 5.3196344055178175e-06, + "loss": 4.0565, + "step": 40800 + }, + { + "epoch": 0.010691288268944832, + "grad_norm": 8.068306922912598, + "learning_rate": 5.345711044760552e-06, + "loss": 4.0404, + "step": 41000 + }, + { + "epoch": 0.010743440894647002, + "grad_norm": 7.503974437713623, + "learning_rate": 5.371787684003285e-06, + "loss": 4.063, + "step": 41200 + }, + { + "epoch": 0.010795593520349173, + "grad_norm": 7.42249059677124, + "learning_rate": 5.397864323246021e-06, + "loss": 4.0053, + "step": 41400 + }, + { + "epoch": 0.010847746146051343, + "grad_norm": 
7.555575370788574, + "learning_rate": 5.423940962488755e-06, + "loss": 4.045, + "step": 41600 + }, + { + "epoch": 0.010899898771753512, + "grad_norm": 7.423194885253906, + "learning_rate": 5.4500176017314885e-06, + "loss": 4.0292, + "step": 41800 + }, + { + "epoch": 0.010952051397455682, + "grad_norm": 7.335684776306152, + "learning_rate": 5.476094240974224e-06, + "loss": 3.9726, + "step": 42000 + }, + { + "epoch": 0.011004204023157852, + "grad_norm": 7.528399467468262, + "learning_rate": 5.502170880216958e-06, + "loss": 3.9907, + "step": 42200 + }, + { + "epoch": 0.011056356648860021, + "grad_norm": 7.644163131713867, + "learning_rate": 5.528247519459693e-06, + "loss": 4.034, + "step": 42400 + }, + { + "epoch": 0.011108509274562191, + "grad_norm": 8.300440788269043, + "learning_rate": 5.554324158702427e-06, + "loss": 4.0416, + "step": 42600 + }, + { + "epoch": 0.011160661900264362, + "grad_norm": 7.345607280731201, + "learning_rate": 5.580400797945161e-06, + "loss": 3.9673, + "step": 42800 + }, + { + "epoch": 0.011212814525966532, + "grad_norm": 7.909209251403809, + "learning_rate": 5.606477437187896e-06, + "loss": 3.9715, + "step": 43000 + }, + { + "epoch": 0.011264967151668701, + "grad_norm": 7.389960289001465, + "learning_rate": 5.63255407643063e-06, + "loss": 3.9649, + "step": 43200 + }, + { + "epoch": 0.011317119777370871, + "grad_norm": 7.785503387451172, + "learning_rate": 5.658630715673364e-06, + "loss": 3.9642, + "step": 43400 + }, + { + "epoch": 0.011369272403073042, + "grad_norm": 7.3766984939575195, + "learning_rate": 5.684707354916099e-06, + "loss": 4.0052, + "step": 43600 + }, + { + "epoch": 0.011421425028775212, + "grad_norm": 7.540368556976318, + "learning_rate": 5.710783994158833e-06, + "loss": 4.0034, + "step": 43800 + }, + { + "epoch": 0.01147357765447738, + "grad_norm": 8.014230728149414, + "learning_rate": 5.736860633401567e-06, + "loss": 3.9643, + "step": 44000 + }, + { + "epoch": 0.011525730280179551, + "grad_norm": 7.941577434539795, + "learning_rate": 5.762937272644302e-06, + "loss": 3.9858, + "step": 44200 + }, + { + "epoch": 0.011577882905881721, + "grad_norm": 7.780430793762207, + "learning_rate": 5.789013911887037e-06, + "loss": 3.993, + "step": 44400 + }, + { + "epoch": 0.01163003553158389, + "grad_norm": 8.022960662841797, + "learning_rate": 5.81509055112977e-06, + "loss": 3.9126, + "step": 44600 + }, + { + "epoch": 0.01168218815728606, + "grad_norm": 8.14714527130127, + "learning_rate": 5.8411671903725055e-06, + "loss": 3.9218, + "step": 44800 + }, + { + "epoch": 0.011734340782988231, + "grad_norm": 7.369551658630371, + "learning_rate": 5.86724382961524e-06, + "loss": 3.959, + "step": 45000 + }, + { + "epoch": 0.011786493408690401, + "grad_norm": 8.105989456176758, + "learning_rate": 5.893320468857973e-06, + "loss": 3.8897, + "step": 45200 + }, + { + "epoch": 0.01183864603439257, + "grad_norm": 8.191625595092773, + "learning_rate": 5.9193971081007085e-06, + "loss": 3.9187, + "step": 45400 + }, + { + "epoch": 0.01189079866009474, + "grad_norm": 7.703822135925293, + "learning_rate": 5.945473747343443e-06, + "loss": 3.8947, + "step": 45600 + }, + { + "epoch": 0.01194295128579691, + "grad_norm": 7.697946548461914, + "learning_rate": 5.971550386586176e-06, + "loss": 3.8795, + "step": 45800 + }, + { + "epoch": 0.011995103911499081, + "grad_norm": 7.603724002838135, + "learning_rate": 5.997627025828912e-06, + "loss": 3.911, + "step": 46000 + }, + { + "epoch": 0.01204725653720125, + "grad_norm": 8.207696914672852, + "learning_rate": 6.023703665071646e-06, + "loss": 
3.896, + "step": 46200 + }, + { + "epoch": 0.01209940916290342, + "grad_norm": 8.124335289001465, + "learning_rate": 6.04978030431438e-06, + "loss": 3.8306, + "step": 46400 + }, + { + "epoch": 0.01215156178860559, + "grad_norm": 7.28653621673584, + "learning_rate": 6.075856943557115e-06, + "loss": 3.8624, + "step": 46600 + }, + { + "epoch": 0.01220371441430776, + "grad_norm": 7.189661026000977, + "learning_rate": 6.101933582799849e-06, + "loss": 3.8445, + "step": 46800 + }, + { + "epoch": 0.01225586704000993, + "grad_norm": 8.082289695739746, + "learning_rate": 6.128010222042584e-06, + "loss": 3.8956, + "step": 47000 + }, + { + "epoch": 0.0123080196657121, + "grad_norm": 7.882737636566162, + "learning_rate": 6.154086861285318e-06, + "loss": 3.8779, + "step": 47200 + }, + { + "epoch": 0.01236017229141427, + "grad_norm": 7.791324615478516, + "learning_rate": 6.180163500528052e-06, + "loss": 3.8339, + "step": 47400 + }, + { + "epoch": 0.012412324917116439, + "grad_norm": 7.604274272918701, + "learning_rate": 6.206240139770787e-06, + "loss": 3.8552, + "step": 47600 + }, + { + "epoch": 0.01246447754281861, + "grad_norm": 7.377195358276367, + "learning_rate": 6.232316779013521e-06, + "loss": 3.8075, + "step": 47800 + }, + { + "epoch": 0.01251663016852078, + "grad_norm": 7.8386640548706055, + "learning_rate": 6.258393418256256e-06, + "loss": 3.7798, + "step": 48000 + }, + { + "epoch": 0.01256878279422295, + "grad_norm": 7.123723983764648, + "learning_rate": 6.28447005749899e-06, + "loss": 3.8206, + "step": 48200 + }, + { + "epoch": 0.012620935419925119, + "grad_norm": 8.309353828430176, + "learning_rate": 6.310546696741724e-06, + "loss": 3.8562, + "step": 48400 + }, + { + "epoch": 0.01267308804562729, + "grad_norm": 7.630406379699707, + "learning_rate": 6.336623335984459e-06, + "loss": 3.829, + "step": 48600 + }, + { + "epoch": 0.01272524067132946, + "grad_norm": 8.362411499023438, + "learning_rate": 6.362699975227193e-06, + "loss": 3.8011, + "step": 48800 + }, + { + "epoch": 0.012777393297031628, + "grad_norm": 7.613163471221924, + "learning_rate": 6.388776614469928e-06, + "loss": 3.8119, + "step": 49000 + }, + { + "epoch": 0.012829545922733799, + "grad_norm": 8.339622497558594, + "learning_rate": 6.414853253712662e-06, + "loss": 3.8347, + "step": 49200 + }, + { + "epoch": 0.012881698548435969, + "grad_norm": 7.240667343139648, + "learning_rate": 6.4409298929553965e-06, + "loss": 3.7863, + "step": 49400 + }, + { + "epoch": 0.01293385117413814, + "grad_norm": 7.500837326049805, + "learning_rate": 6.467006532198132e-06, + "loss": 3.78, + "step": 49600 + }, + { + "epoch": 0.012986003799840308, + "grad_norm": 7.476478099822998, + "learning_rate": 6.493083171440865e-06, + "loss": 3.8424, + "step": 49800 + }, + { + "epoch": 0.013038156425542478, + "grad_norm": 7.900282382965088, + "learning_rate": 6.5191598106835995e-06, + "loss": 3.757, + "step": 50000 + }, + { + "epoch": 0.013090309051244649, + "grad_norm": 7.225290298461914, + "learning_rate": 6.545236449926335e-06, + "loss": 3.751, + "step": 50200 + }, + { + "epoch": 0.01314246167694682, + "grad_norm": 7.843364715576172, + "learning_rate": 6.571313089169068e-06, + "loss": 3.8209, + "step": 50400 + }, + { + "epoch": 0.013194614302648988, + "grad_norm": 7.373351573944092, + "learning_rate": 6.597389728411803e-06, + "loss": 3.8055, + "step": 50600 + }, + { + "epoch": 0.013246766928351158, + "grad_norm": 8.107486724853516, + "learning_rate": 6.623466367654538e-06, + "loss": 3.7116, + "step": 50800 + }, + { + "epoch": 0.013298919554053329, + 
"grad_norm": 7.618676662445068, + "learning_rate": 6.649543006897272e-06, + "loss": 3.7805, + "step": 51000 + }, + { + "epoch": 0.013351072179755497, + "grad_norm": 7.945600509643555, + "learning_rate": 6.675619646140006e-06, + "loss": 3.7396, + "step": 51200 + }, + { + "epoch": 0.013403224805457668, + "grad_norm": 7.414207935333252, + "learning_rate": 6.701696285382741e-06, + "loss": 3.7788, + "step": 51400 + }, + { + "epoch": 0.013455377431159838, + "grad_norm": 8.000473022460938, + "learning_rate": 6.727772924625475e-06, + "loss": 3.7445, + "step": 51600 + }, + { + "epoch": 0.013507530056862008, + "grad_norm": 7.633871555328369, + "learning_rate": 6.753849563868209e-06, + "loss": 3.7474, + "step": 51800 + }, + { + "epoch": 0.013559682682564177, + "grad_norm": 7.910715103149414, + "learning_rate": 6.779926203110944e-06, + "loss": 3.7563, + "step": 52000 + }, + { + "epoch": 0.013611835308266347, + "grad_norm": 8.315561294555664, + "learning_rate": 6.806002842353678e-06, + "loss": 3.7435, + "step": 52200 + }, + { + "epoch": 0.013663987933968518, + "grad_norm": 7.666395664215088, + "learning_rate": 6.832079481596412e-06, + "loss": 3.7198, + "step": 52400 + }, + { + "epoch": 0.013716140559670688, + "grad_norm": 7.776802062988281, + "learning_rate": 6.858156120839147e-06, + "loss": 3.7318, + "step": 52600 + }, + { + "epoch": 0.013768293185372857, + "grad_norm": 7.258547782897949, + "learning_rate": 6.884232760081881e-06, + "loss": 3.7325, + "step": 52800 + }, + { + "epoch": 0.013820445811075027, + "grad_norm": 7.867082595825195, + "learning_rate": 6.910309399324615e-06, + "loss": 3.7163, + "step": 53000 + }, + { + "epoch": 0.013872598436777198, + "grad_norm": 7.452786922454834, + "learning_rate": 6.93638603856735e-06, + "loss": 3.7157, + "step": 53200 + }, + { + "epoch": 0.013924751062479366, + "grad_norm": 7.4677042961120605, + "learning_rate": 6.962462677810084e-06, + "loss": 3.7251, + "step": 53400 + }, + { + "epoch": 0.013976903688181537, + "grad_norm": 7.686807155609131, + "learning_rate": 6.98853931705282e-06, + "loss": 3.7158, + "step": 53600 + }, + { + "epoch": 0.014029056313883707, + "grad_norm": 7.389094352722168, + "learning_rate": 7.014615956295553e-06, + "loss": 3.7139, + "step": 53800 + }, + { + "epoch": 0.014081208939585877, + "grad_norm": 7.3070526123046875, + "learning_rate": 7.0406925955382874e-06, + "loss": 3.6748, + "step": 54000 + }, + { + "epoch": 0.014133361565288046, + "grad_norm": 7.088414192199707, + "learning_rate": 7.066769234781023e-06, + "loss": 3.6817, + "step": 54200 + }, + { + "epoch": 0.014185514190990216, + "grad_norm": 7.4261088371276855, + "learning_rate": 7.092845874023756e-06, + "loss": 3.7422, + "step": 54400 + }, + { + "epoch": 0.014237666816692387, + "grad_norm": 7.321529388427734, + "learning_rate": 7.1189225132664905e-06, + "loss": 3.7282, + "step": 54600 + }, + { + "epoch": 0.014289819442394557, + "grad_norm": 7.516880035400391, + "learning_rate": 7.144999152509226e-06, + "loss": 3.6875, + "step": 54800 + }, + { + "epoch": 0.014341972068096726, + "grad_norm": 6.853364944458008, + "learning_rate": 7.171075791751959e-06, + "loss": 3.6896, + "step": 55000 + }, + { + "epoch": 0.014394124693798896, + "grad_norm": 7.774241924285889, + "learning_rate": 7.1971524309946936e-06, + "loss": 3.6278, + "step": 55200 + }, + { + "epoch": 0.014446277319501067, + "grad_norm": 8.340727806091309, + "learning_rate": 7.223229070237429e-06, + "loss": 3.6977, + "step": 55400 + }, + { + "epoch": 0.014498429945203237, + "grad_norm": 7.890402793884277, + "learning_rate": 
7.249305709480163e-06, + "loss": 3.6974, + "step": 55600 + }, + { + "epoch": 0.014550582570905406, + "grad_norm": 7.747776985168457, + "learning_rate": 7.275382348722897e-06, + "loss": 3.6873, + "step": 55800 + }, + { + "epoch": 0.014602735196607576, + "grad_norm": 6.7120184898376465, + "learning_rate": 7.301458987965632e-06, + "loss": 3.6961, + "step": 56000 + }, + { + "epoch": 0.014654887822309746, + "grad_norm": 7.8532891273498535, + "learning_rate": 7.327535627208366e-06, + "loss": 3.6477, + "step": 56200 + }, + { + "epoch": 0.014707040448011915, + "grad_norm": 8.437646865844727, + "learning_rate": 7.3536122664511e-06, + "loss": 3.6398, + "step": 56400 + }, + { + "epoch": 0.014759193073714085, + "grad_norm": 7.008194446563721, + "learning_rate": 7.379688905693835e-06, + "loss": 3.6403, + "step": 56600 + }, + { + "epoch": 0.014811345699416256, + "grad_norm": 7.821798801422119, + "learning_rate": 7.405765544936569e-06, + "loss": 3.6426, + "step": 56800 + }, + { + "epoch": 0.014863498325118426, + "grad_norm": 6.9409356117248535, + "learning_rate": 7.431842184179303e-06, + "loss": 3.6323, + "step": 57000 + }, + { + "epoch": 0.014915650950820595, + "grad_norm": 7.209104537963867, + "learning_rate": 7.457918823422038e-06, + "loss": 3.6272, + "step": 57200 + }, + { + "epoch": 0.014967803576522765, + "grad_norm": 7.187128067016602, + "learning_rate": 7.483995462664772e-06, + "loss": 3.6558, + "step": 57400 + }, + { + "epoch": 0.015019956202224936, + "grad_norm": 7.077954292297363, + "learning_rate": 7.510072101907506e-06, + "loss": 3.6594, + "step": 57600 + }, + { + "epoch": 0.015072108827927106, + "grad_norm": 7.292501926422119, + "learning_rate": 7.536148741150241e-06, + "loss": 3.6413, + "step": 57800 + }, + { + "epoch": 0.015124261453629275, + "grad_norm": 7.346433162689209, + "learning_rate": 7.562225380392975e-06, + "loss": 3.6647, + "step": 58000 + }, + { + "epoch": 0.015176414079331445, + "grad_norm": 7.485527992248535, + "learning_rate": 7.5883020196357106e-06, + "loss": 3.6067, + "step": 58200 + }, + { + "epoch": 0.015228566705033615, + "grad_norm": 7.4600629806518555, + "learning_rate": 7.614378658878444e-06, + "loss": 3.6154, + "step": 58400 + }, + { + "epoch": 0.015280719330735784, + "grad_norm": 7.584334373474121, + "learning_rate": 7.640455298121178e-06, + "loss": 3.627, + "step": 58600 + }, + { + "epoch": 0.015332871956437955, + "grad_norm": 6.661746978759766, + "learning_rate": 7.666531937363914e-06, + "loss": 3.6439, + "step": 58800 + }, + { + "epoch": 0.015385024582140125, + "grad_norm": 7.055318355560303, + "learning_rate": 7.692608576606647e-06, + "loss": 3.6274, + "step": 59000 + }, + { + "epoch": 0.015437177207842295, + "grad_norm": 7.231868267059326, + "learning_rate": 7.718685215849382e-06, + "loss": 3.6557, + "step": 59200 + }, + { + "epoch": 0.015489329833544464, + "grad_norm": 7.788670063018799, + "learning_rate": 7.744761855092116e-06, + "loss": 3.6534, + "step": 59400 + }, + { + "epoch": 0.015541482459246634, + "grad_norm": 7.5201263427734375, + "learning_rate": 7.770838494334851e-06, + "loss": 3.6055, + "step": 59600 + }, + { + "epoch": 0.015593635084948805, + "grad_norm": 6.89668083190918, + "learning_rate": 7.796915133577585e-06, + "loss": 3.5638, + "step": 59800 + }, + { + "epoch": 0.015645787710650973, + "grad_norm": 7.163186550140381, + "learning_rate": 7.82299177282032e-06, + "loss": 3.6157, + "step": 60000 + }, + { + "epoch": 0.015697940336353144, + "grad_norm": 6.937023639678955, + "learning_rate": 7.849068412063055e-06, + "loss": 3.58, + "step": 60200 + 
}, + { + "epoch": 0.015750092962055314, + "grad_norm": 7.33142614364624, + "learning_rate": 7.875145051305788e-06, + "loss": 3.5582, + "step": 60400 + }, + { + "epoch": 0.015802245587757485, + "grad_norm": 7.158875465393066, + "learning_rate": 7.901221690548524e-06, + "loss": 3.5748, + "step": 60600 + }, + { + "epoch": 0.015854398213459655, + "grad_norm": 7.254717826843262, + "learning_rate": 7.927298329791257e-06, + "loss": 3.5821, + "step": 60800 + }, + { + "epoch": 0.015906550839161825, + "grad_norm": 7.033998966217041, + "learning_rate": 7.95337496903399e-06, + "loss": 3.5704, + "step": 61000 + }, + { + "epoch": 0.015958703464863992, + "grad_norm": 8.059544563293457, + "learning_rate": 7.979451608276726e-06, + "loss": 3.5057, + "step": 61200 + }, + { + "epoch": 0.016010856090566163, + "grad_norm": 6.929595470428467, + "learning_rate": 8.005528247519461e-06, + "loss": 3.5442, + "step": 61400 + }, + { + "epoch": 0.016063008716268333, + "grad_norm": 7.488486289978027, + "learning_rate": 8.031604886762195e-06, + "loss": 3.5421, + "step": 61600 + }, + { + "epoch": 0.016115161341970503, + "grad_norm": 7.025427341461182, + "learning_rate": 8.05768152600493e-06, + "loss": 3.5727, + "step": 61800 + }, + { + "epoch": 0.016167313967672674, + "grad_norm": 7.745316982269287, + "learning_rate": 8.083758165247663e-06, + "loss": 3.5274, + "step": 62000 + }, + { + "epoch": 0.016219466593374844, + "grad_norm": 8.315921783447266, + "learning_rate": 8.109834804490397e-06, + "loss": 3.5662, + "step": 62200 + }, + { + "epoch": 0.016271619219077015, + "grad_norm": 7.587878704071045, + "learning_rate": 8.135911443733132e-06, + "loss": 3.5412, + "step": 62400 + }, + { + "epoch": 0.01632377184477918, + "grad_norm": 7.723850250244141, + "learning_rate": 8.161988082975867e-06, + "loss": 3.5607, + "step": 62600 + }, + { + "epoch": 0.016375924470481352, + "grad_norm": 7.842193603515625, + "learning_rate": 8.188064722218602e-06, + "loss": 3.5359, + "step": 62800 + }, + { + "epoch": 0.016428077096183522, + "grad_norm": 7.071596145629883, + "learning_rate": 8.214141361461336e-06, + "loss": 3.5514, + "step": 63000 + }, + { + "epoch": 0.016480229721885693, + "grad_norm": 7.444825649261475, + "learning_rate": 8.24021800070407e-06, + "loss": 3.5449, + "step": 63200 + }, + { + "epoch": 0.016532382347587863, + "grad_norm": 7.005112648010254, + "learning_rate": 8.266294639946805e-06, + "loss": 3.5011, + "step": 63400 + }, + { + "epoch": 0.016584534973290033, + "grad_norm": 7.4845290184021, + "learning_rate": 8.292371279189538e-06, + "loss": 3.5421, + "step": 63600 + }, + { + "epoch": 0.016636687598992204, + "grad_norm": 7.57586669921875, + "learning_rate": 8.318447918432273e-06, + "loss": 3.4857, + "step": 63800 + }, + { + "epoch": 0.016688840224694374, + "grad_norm": 6.911862850189209, + "learning_rate": 8.344524557675009e-06, + "loss": 3.5319, + "step": 64000 + }, + { + "epoch": 0.01674099285039654, + "grad_norm": 6.8444061279296875, + "learning_rate": 8.370601196917742e-06, + "loss": 3.5423, + "step": 64200 + }, + { + "epoch": 0.01679314547609871, + "grad_norm": 6.937068462371826, + "learning_rate": 8.396677836160476e-06, + "loss": 3.5273, + "step": 64400 + }, + { + "epoch": 0.016845298101800882, + "grad_norm": 7.857777118682861, + "learning_rate": 8.42275447540321e-06, + "loss": 3.539, + "step": 64600 + }, + { + "epoch": 0.016897450727503052, + "grad_norm": 6.760194778442383, + "learning_rate": 8.448831114645946e-06, + "loss": 3.545, + "step": 64800 + }, + { + "epoch": 0.016949603353205223, + "grad_norm": 
7.323217868804932, + "learning_rate": 8.47490775388868e-06, + "loss": 3.554, + "step": 65000 + }, + { + "epoch": 0.017001755978907393, + "grad_norm": 7.225944519042969, + "learning_rate": 8.500984393131415e-06, + "loss": 3.5109, + "step": 65200 + }, + { + "epoch": 0.017053908604609563, + "grad_norm": 7.282871246337891, + "learning_rate": 8.527061032374148e-06, + "loss": 3.5113, + "step": 65400 + }, + { + "epoch": 0.01710606123031173, + "grad_norm": 6.699639320373535, + "learning_rate": 8.553137671616882e-06, + "loss": 3.555, + "step": 65600 + }, + { + "epoch": 0.0171582138560139, + "grad_norm": 7.353262901306152, + "learning_rate": 8.579214310859617e-06, + "loss": 3.4825, + "step": 65800 + }, + { + "epoch": 0.01721036648171607, + "grad_norm": 7.2611284255981445, + "learning_rate": 8.605290950102352e-06, + "loss": 3.5099, + "step": 66000 + }, + { + "epoch": 0.01726251910741824, + "grad_norm": 7.1428680419921875, + "learning_rate": 8.631367589345086e-06, + "loss": 3.4868, + "step": 66200 + }, + { + "epoch": 0.017314671733120412, + "grad_norm": 7.161787986755371, + "learning_rate": 8.65744422858782e-06, + "loss": 3.4979, + "step": 66400 + }, + { + "epoch": 0.017366824358822582, + "grad_norm": 7.3627142906188965, + "learning_rate": 8.683520867830554e-06, + "loss": 3.4826, + "step": 66600 + }, + { + "epoch": 0.017418976984524753, + "grad_norm": 7.277604103088379, + "learning_rate": 8.70959750707329e-06, + "loss": 3.4848, + "step": 66800 + }, + { + "epoch": 0.017471129610226923, + "grad_norm": 6.910572052001953, + "learning_rate": 8.735674146316023e-06, + "loss": 3.4787, + "step": 67000 + }, + { + "epoch": 0.01752328223592909, + "grad_norm": 7.803658962249756, + "learning_rate": 8.761750785558758e-06, + "loss": 3.4327, + "step": 67200 + }, + { + "epoch": 0.01757543486163126, + "grad_norm": 7.014376640319824, + "learning_rate": 8.787827424801493e-06, + "loss": 3.4453, + "step": 67400 + }, + { + "epoch": 0.01762758748733343, + "grad_norm": 6.735202312469482, + "learning_rate": 8.813904064044227e-06, + "loss": 3.4588, + "step": 67600 + }, + { + "epoch": 0.0176797401130356, + "grad_norm": 7.092906951904297, + "learning_rate": 8.83998070328696e-06, + "loss": 3.4441, + "step": 67800 + }, + { + "epoch": 0.01773189273873777, + "grad_norm": 7.564960956573486, + "learning_rate": 8.866057342529696e-06, + "loss": 3.4894, + "step": 68000 + }, + { + "epoch": 0.017784045364439942, + "grad_norm": 7.072451591491699, + "learning_rate": 8.892133981772429e-06, + "loss": 3.4665, + "step": 68200 + }, + { + "epoch": 0.017836197990142112, + "grad_norm": 7.669380187988281, + "learning_rate": 8.918210621015164e-06, + "loss": 3.4343, + "step": 68400 + }, + { + "epoch": 0.01788835061584428, + "grad_norm": 6.885750770568848, + "learning_rate": 8.9442872602579e-06, + "loss": 3.4742, + "step": 68600 + }, + { + "epoch": 0.01794050324154645, + "grad_norm": 6.932456016540527, + "learning_rate": 8.970363899500633e-06, + "loss": 3.5029, + "step": 68800 + }, + { + "epoch": 0.01799265586724862, + "grad_norm": 7.914639472961426, + "learning_rate": 8.996440538743367e-06, + "loss": 3.4828, + "step": 69000 + }, + { + "epoch": 0.01804480849295079, + "grad_norm": 7.070594310760498, + "learning_rate": 9.022517177986102e-06, + "loss": 3.474, + "step": 69200 + }, + { + "epoch": 0.01809696111865296, + "grad_norm": 7.355897426605225, + "learning_rate": 9.048593817228837e-06, + "loss": 3.4265, + "step": 69400 + }, + { + "epoch": 0.01814911374435513, + "grad_norm": 7.521884441375732, + "learning_rate": 9.07467045647157e-06, + "loss": 3.4737, + 
"step": 69600 + }, + { + "epoch": 0.0182012663700573, + "grad_norm": 7.33009672164917, + "learning_rate": 9.100747095714306e-06, + "loss": 3.4509, + "step": 69800 + }, + { + "epoch": 0.01825341899575947, + "grad_norm": 7.302325248718262, + "learning_rate": 9.12682373495704e-06, + "loss": 3.4412, + "step": 70000 + }, + { + "epoch": 0.01830557162146164, + "grad_norm": 7.756179332733154, + "learning_rate": 9.152900374199773e-06, + "loss": 3.4473, + "step": 70200 + }, + { + "epoch": 0.01835772424716381, + "grad_norm": 7.5515336990356445, + "learning_rate": 9.178977013442508e-06, + "loss": 3.4144, + "step": 70400 + }, + { + "epoch": 0.01840987687286598, + "grad_norm": 7.913027286529541, + "learning_rate": 9.205053652685243e-06, + "loss": 3.4406, + "step": 70600 + }, + { + "epoch": 0.01846202949856815, + "grad_norm": 7.048377990722656, + "learning_rate": 9.231130291927977e-06, + "loss": 3.4356, + "step": 70800 + }, + { + "epoch": 0.01851418212427032, + "grad_norm": 7.313281536102295, + "learning_rate": 9.257206931170712e-06, + "loss": 3.4147, + "step": 71000 + }, + { + "epoch": 0.01856633474997249, + "grad_norm": 7.301833629608154, + "learning_rate": 9.283283570413445e-06, + "loss": 3.4198, + "step": 71200 + }, + { + "epoch": 0.01861848737567466, + "grad_norm": 6.697605133056641, + "learning_rate": 9.30936020965618e-06, + "loss": 3.4352, + "step": 71400 + }, + { + "epoch": 0.018670640001376828, + "grad_norm": 6.958870887756348, + "learning_rate": 9.335436848898914e-06, + "loss": 3.4299, + "step": 71600 + }, + { + "epoch": 0.018722792627079, + "grad_norm": 7.359804153442383, + "learning_rate": 9.36151348814165e-06, + "loss": 3.4789, + "step": 71800 + }, + { + "epoch": 0.01877494525278117, + "grad_norm": 6.766062259674072, + "learning_rate": 9.387590127384384e-06, + "loss": 3.4272, + "step": 72000 + }, + { + "epoch": 0.01882709787848334, + "grad_norm": 7.107087135314941, + "learning_rate": 9.413666766627118e-06, + "loss": 3.3802, + "step": 72200 + }, + { + "epoch": 0.01887925050418551, + "grad_norm": 7.090783596038818, + "learning_rate": 9.439743405869853e-06, + "loss": 3.4318, + "step": 72400 + }, + { + "epoch": 0.01893140312988768, + "grad_norm": 6.893887519836426, + "learning_rate": 9.465820045112587e-06, + "loss": 3.4297, + "step": 72600 + }, + { + "epoch": 0.01898355575558985, + "grad_norm": 6.953131198883057, + "learning_rate": 9.49189668435532e-06, + "loss": 3.4238, + "step": 72800 + }, + { + "epoch": 0.019035708381292017, + "grad_norm": 6.934586524963379, + "learning_rate": 9.517973323598055e-06, + "loss": 3.4275, + "step": 73000 + }, + { + "epoch": 0.019087861006994188, + "grad_norm": 7.763511657714844, + "learning_rate": 9.54404996284079e-06, + "loss": 3.3778, + "step": 73200 + }, + { + "epoch": 0.019140013632696358, + "grad_norm": 6.7908806800842285, + "learning_rate": 9.570126602083524e-06, + "loss": 3.4133, + "step": 73400 + }, + { + "epoch": 0.01919216625839853, + "grad_norm": 6.870711326599121, + "learning_rate": 9.59620324132626e-06, + "loss": 3.4281, + "step": 73600 + }, + { + "epoch": 0.0192443188841007, + "grad_norm": 7.813330173492432, + "learning_rate": 9.622279880568993e-06, + "loss": 3.424, + "step": 73800 + }, + { + "epoch": 0.01929647150980287, + "grad_norm": 7.28371524810791, + "learning_rate": 9.648356519811728e-06, + "loss": 3.4238, + "step": 74000 + }, + { + "epoch": 0.01934862413550504, + "grad_norm": 7.129782676696777, + "learning_rate": 9.674433159054461e-06, + "loss": 3.4407, + "step": 74200 + }, + { + "epoch": 0.019400776761207206, + "grad_norm": 
7.057884216308594, + "learning_rate": 9.700509798297197e-06, + "loss": 3.3992, + "step": 74400 + }, + { + "epoch": 0.019452929386909377, + "grad_norm": 7.445634841918945, + "learning_rate": 9.726586437539932e-06, + "loss": 3.4258, + "step": 74600 + }, + { + "epoch": 0.019505082012611547, + "grad_norm": 7.498905658721924, + "learning_rate": 9.752663076782665e-06, + "loss": 3.4097, + "step": 74800 + }, + { + "epoch": 0.019557234638313718, + "grad_norm": 8.201305389404297, + "learning_rate": 9.778739716025399e-06, + "loss": 3.3639, + "step": 75000 + }, + { + "epoch": 0.019609387264015888, + "grad_norm": 7.614349365234375, + "learning_rate": 9.804816355268134e-06, + "loss": 3.3741, + "step": 75200 + }, + { + "epoch": 0.01966153988971806, + "grad_norm": 6.917168140411377, + "learning_rate": 9.830892994510868e-06, + "loss": 3.4018, + "step": 75400 + }, + { + "epoch": 0.01971369251542023, + "grad_norm": 6.967098236083984, + "learning_rate": 9.856969633753603e-06, + "loss": 3.3677, + "step": 75600 + }, + { + "epoch": 0.0197658451411224, + "grad_norm": 7.0407795906066895, + "learning_rate": 9.883046272996338e-06, + "loss": 3.4018, + "step": 75800 + }, + { + "epoch": 0.019817997766824566, + "grad_norm": 6.73028564453125, + "learning_rate": 9.909122912239071e-06, + "loss": 3.3787, + "step": 76000 + }, + { + "epoch": 0.019870150392526736, + "grad_norm": 6.180406093597412, + "learning_rate": 9.935199551481805e-06, + "loss": 3.3538, + "step": 76200 + }, + { + "epoch": 0.019922303018228907, + "grad_norm": 7.133777618408203, + "learning_rate": 9.96127619072454e-06, + "loss": 3.3845, + "step": 76400 + }, + { + "epoch": 0.019974455643931077, + "grad_norm": 6.977362632751465, + "learning_rate": 9.987352829967275e-06, + "loss": 3.3386, + "step": 76600 + }, + { + "epoch": 0.020026608269633248, + "grad_norm": 7.869418621063232, + "learning_rate": 9.999999981466639e-06, + "loss": 3.355, + "step": 76800 + }, + { + "epoch": 0.020078760895335418, + "grad_norm": 6.740837574005127, + "learning_rate": 9.999999839614538e-06, + "loss": 3.4371, + "step": 77000 + }, + { + "epoch": 0.02013091352103759, + "grad_norm": 7.533843994140625, + "learning_rate": 9.999999558006677e-06, + "loss": 3.3576, + "step": 77200 + }, + { + "epoch": 0.020183066146739755, + "grad_norm": 6.733776092529297, + "learning_rate": 9.999999136643062e-06, + "loss": 3.3787, + "step": 77400 + }, + { + "epoch": 0.020235218772441926, + "grad_norm": 7.021015644073486, + "learning_rate": 9.999998575523706e-06, + "loss": 3.3798, + "step": 77600 + }, + { + "epoch": 0.020287371398144096, + "grad_norm": 7.0405755043029785, + "learning_rate": 9.999997874648624e-06, + "loss": 3.3451, + "step": 77800 + }, + { + "epoch": 0.020339524023846266, + "grad_norm": 7.242607116699219, + "learning_rate": 9.999997034017837e-06, + "loss": 3.3643, + "step": 78000 + }, + { + "epoch": 0.020391676649548437, + "grad_norm": 7.021972179412842, + "learning_rate": 9.999996053631368e-06, + "loss": 3.3861, + "step": 78200 + }, + { + "epoch": 0.020443829275250607, + "grad_norm": 7.320739269256592, + "learning_rate": 9.999994933489244e-06, + "loss": 3.3503, + "step": 78400 + }, + { + "epoch": 0.020495981900952778, + "grad_norm": 7.283355712890625, + "learning_rate": 9.999993673591494e-06, + "loss": 3.393, + "step": 78600 + }, + { + "epoch": 0.020548134526654948, + "grad_norm": 7.413722515106201, + "learning_rate": 9.999992273938159e-06, + "loss": 3.3393, + "step": 78800 + }, + { + "epoch": 0.020600287152357115, + "grad_norm": 7.285412788391113, + "learning_rate": 9.999990734529274e-06, + 
"loss": 3.3531, + "step": 79000 + }, + { + "epoch": 0.020652439778059285, + "grad_norm": 6.342610836029053, + "learning_rate": 9.999989055364881e-06, + "loss": 3.3199, + "step": 79200 + }, + { + "epoch": 0.020704592403761456, + "grad_norm": 7.5218892097473145, + "learning_rate": 9.999987236445031e-06, + "loss": 3.3587, + "step": 79400 + }, + { + "epoch": 0.020756745029463626, + "grad_norm": 6.564230918884277, + "learning_rate": 9.99998527776977e-06, + "loss": 3.3877, + "step": 79600 + }, + { + "epoch": 0.020808897655165796, + "grad_norm": 6.8725786209106445, + "learning_rate": 9.999983179339158e-06, + "loss": 3.3915, + "step": 79800 + }, + { + "epoch": 0.020861050280867967, + "grad_norm": 7.533461093902588, + "learning_rate": 9.999980941153249e-06, + "loss": 3.3572, + "step": 80000 + }, + { + "epoch": 0.020913202906570137, + "grad_norm": 7.089059829711914, + "learning_rate": 9.999978563212107e-06, + "loss": 3.3238, + "step": 80200 + }, + { + "epoch": 0.020965355532272304, + "grad_norm": 7.241924285888672, + "learning_rate": 9.999976045515802e-06, + "loss": 3.3311, + "step": 80400 + }, + { + "epoch": 0.021017508157974474, + "grad_norm": 6.942836761474609, + "learning_rate": 9.9999733880644e-06, + "loss": 3.3374, + "step": 80600 + }, + { + "epoch": 0.021069660783676645, + "grad_norm": 7.0181884765625, + "learning_rate": 9.999970590857975e-06, + "loss": 3.3009, + "step": 80800 + }, + { + "epoch": 0.021121813409378815, + "grad_norm": 7.321265697479248, + "learning_rate": 9.999967653896607e-06, + "loss": 3.3696, + "step": 81000 + }, + { + "epoch": 0.021173966035080986, + "grad_norm": 6.867557525634766, + "learning_rate": 9.999964577180379e-06, + "loss": 3.3183, + "step": 81200 + }, + { + "epoch": 0.021226118660783156, + "grad_norm": 7.854918956756592, + "learning_rate": 9.999961360709376e-06, + "loss": 3.3517, + "step": 81400 + }, + { + "epoch": 0.021278271286485326, + "grad_norm": 7.4617438316345215, + "learning_rate": 9.999958004483687e-06, + "loss": 3.3323, + "step": 81600 + }, + { + "epoch": 0.021330423912187493, + "grad_norm": 7.94450569152832, + "learning_rate": 9.999954508503407e-06, + "loss": 3.3163, + "step": 81800 + }, + { + "epoch": 0.021382576537889664, + "grad_norm": 6.6632184982299805, + "learning_rate": 9.999950872768633e-06, + "loss": 3.3429, + "step": 82000 + }, + { + "epoch": 0.021434729163591834, + "grad_norm": 6.834831237792969, + "learning_rate": 9.999947097279468e-06, + "loss": 3.3278, + "step": 82200 + }, + { + "epoch": 0.021486881789294004, + "grad_norm": 7.85338020324707, + "learning_rate": 9.999943182036017e-06, + "loss": 3.3649, + "step": 82400 + }, + { + "epoch": 0.021539034414996175, + "grad_norm": 7.1437811851501465, + "learning_rate": 9.999939127038387e-06, + "loss": 3.3501, + "step": 82600 + }, + { + "epoch": 0.021591187040698345, + "grad_norm": 7.180891990661621, + "learning_rate": 9.999934932286692e-06, + "loss": 3.3236, + "step": 82800 + }, + { + "epoch": 0.021643339666400516, + "grad_norm": 6.1819353103637695, + "learning_rate": 9.999930597781054e-06, + "loss": 3.2915, + "step": 83000 + }, + { + "epoch": 0.021695492292102686, + "grad_norm": 6.358526229858398, + "learning_rate": 9.999926123521588e-06, + "loss": 3.2759, + "step": 83200 + }, + { + "epoch": 0.021747644917804853, + "grad_norm": 7.643764495849609, + "learning_rate": 9.999921509508424e-06, + "loss": 3.3382, + "step": 83400 + }, + { + "epoch": 0.021799797543507023, + "grad_norm": 6.65496826171875, + "learning_rate": 9.999916755741687e-06, + "loss": 3.3042, + "step": 83600 + }, + { + "epoch": 
0.021851950169209194, + "grad_norm": 6.801998138427734, + "learning_rate": 9.999911862221512e-06, + "loss": 3.3193, + "step": 83800 + }, + { + "epoch": 0.021904102794911364, + "grad_norm": 7.052958011627197, + "learning_rate": 9.999906828948035e-06, + "loss": 3.2707, + "step": 84000 + }, + { + "epoch": 0.021956255420613534, + "grad_norm": 6.636064529418945, + "learning_rate": 9.999901655921398e-06, + "loss": 3.317, + "step": 84200 + }, + { + "epoch": 0.022008408046315705, + "grad_norm": 6.989995956420898, + "learning_rate": 9.999896343141742e-06, + "loss": 3.3297, + "step": 84400 + }, + { + "epoch": 0.022060560672017875, + "grad_norm": 6.694401741027832, + "learning_rate": 9.999890890609221e-06, + "loss": 3.2781, + "step": 84600 + }, + { + "epoch": 0.022112713297720042, + "grad_norm": 6.958297252655029, + "learning_rate": 9.999885298323984e-06, + "loss": 3.3058, + "step": 84800 + }, + { + "epoch": 0.022164865923422213, + "grad_norm": 7.147956371307373, + "learning_rate": 9.999879566286187e-06, + "loss": 3.3504, + "step": 85000 + }, + { + "epoch": 0.022217018549124383, + "grad_norm": 7.238255023956299, + "learning_rate": 9.999873694495991e-06, + "loss": 3.3022, + "step": 85200 + }, + { + "epoch": 0.022269171174826553, + "grad_norm": 7.0600056648254395, + "learning_rate": 9.999867682953562e-06, + "loss": 3.2688, + "step": 85400 + }, + { + "epoch": 0.022321323800528724, + "grad_norm": 6.889624118804932, + "learning_rate": 9.999861531659063e-06, + "loss": 3.2854, + "step": 85600 + }, + { + "epoch": 0.022373476426230894, + "grad_norm": 6.706833362579346, + "learning_rate": 9.99985524061267e-06, + "loss": 3.3073, + "step": 85800 + }, + { + "epoch": 0.022425629051933064, + "grad_norm": 6.433074951171875, + "learning_rate": 9.99984880981456e-06, + "loss": 3.3223, + "step": 86000 + }, + { + "epoch": 0.02247778167763523, + "grad_norm": 6.203549385070801, + "learning_rate": 9.99984223926491e-06, + "loss": 3.324, + "step": 86200 + }, + { + "epoch": 0.022529934303337402, + "grad_norm": 6.171205043792725, + "learning_rate": 9.999835528963905e-06, + "loss": 3.2961, + "step": 86400 + }, + { + "epoch": 0.022582086929039572, + "grad_norm": 7.106544017791748, + "learning_rate": 9.999828678911729e-06, + "loss": 3.2777, + "step": 86600 + }, + { + "epoch": 0.022634239554741743, + "grad_norm": 6.84821081161499, + "learning_rate": 9.99982168910858e-06, + "loss": 3.3162, + "step": 86800 + }, + { + "epoch": 0.022686392180443913, + "grad_norm": 7.026462078094482, + "learning_rate": 9.999814559554648e-06, + "loss": 3.2632, + "step": 87000 + }, + { + "epoch": 0.022738544806146083, + "grad_norm": 6.290987014770508, + "learning_rate": 9.999807290250133e-06, + "loss": 3.2957, + "step": 87200 + }, + { + "epoch": 0.022790697431848254, + "grad_norm": 6.156450271606445, + "learning_rate": 9.99979988119524e-06, + "loss": 3.2679, + "step": 87400 + }, + { + "epoch": 0.022842850057550424, + "grad_norm": 6.672393321990967, + "learning_rate": 9.999792332390177e-06, + "loss": 3.2777, + "step": 87600 + }, + { + "epoch": 0.02289500268325259, + "grad_norm": 6.625957489013672, + "learning_rate": 9.99978464383515e-06, + "loss": 3.2897, + "step": 87800 + }, + { + "epoch": 0.02294715530895476, + "grad_norm": 6.969167232513428, + "learning_rate": 9.99977681553038e-06, + "loss": 3.2615, + "step": 88000 + }, + { + "epoch": 0.022999307934656932, + "grad_norm": 6.787604331970215, + "learning_rate": 9.999768847476084e-06, + "loss": 3.2896, + "step": 88200 + }, + { + "epoch": 0.023051460560359102, + "grad_norm": 7.575397491455078, + 
"learning_rate": 9.999760739672481e-06, + "loss": 3.2926, + "step": 88400 + }, + { + "epoch": 0.023103613186061273, + "grad_norm": 6.810946941375732, + "learning_rate": 9.9997524921198e-06, + "loss": 3.2828, + "step": 88600 + }, + { + "epoch": 0.023155765811763443, + "grad_norm": 6.538626194000244, + "learning_rate": 9.999744104818275e-06, + "loss": 3.2672, + "step": 88800 + }, + { + "epoch": 0.023207918437465613, + "grad_norm": 6.649213790893555, + "learning_rate": 9.999735577768135e-06, + "loss": 3.3231, + "step": 89000 + }, + { + "epoch": 0.02326007106316778, + "grad_norm": 6.549954414367676, + "learning_rate": 9.999726910969621e-06, + "loss": 3.2322, + "step": 89200 + }, + { + "epoch": 0.02331222368886995, + "grad_norm": 6.339663982391357, + "learning_rate": 9.999718104422977e-06, + "loss": 3.2849, + "step": 89400 + }, + { + "epoch": 0.02336437631457212, + "grad_norm": 7.996878147125244, + "learning_rate": 9.999709158128444e-06, + "loss": 3.2478, + "step": 89600 + }, + { + "epoch": 0.02341652894027429, + "grad_norm": 6.590426921844482, + "learning_rate": 9.999700072086277e-06, + "loss": 3.257, + "step": 89800 + }, + { + "epoch": 0.023468681565976462, + "grad_norm": 6.166894435882568, + "learning_rate": 9.999690846296728e-06, + "loss": 3.266, + "step": 90000 + }, + { + "epoch": 0.023520834191678632, + "grad_norm": 6.546900749206543, + "learning_rate": 9.999681480760054e-06, + "loss": 3.3065, + "step": 90200 + }, + { + "epoch": 0.023572986817380803, + "grad_norm": 7.30242919921875, + "learning_rate": 9.99967197547652e-06, + "loss": 3.2692, + "step": 90400 + }, + { + "epoch": 0.02362513944308297, + "grad_norm": 6.420165061950684, + "learning_rate": 9.999662330446387e-06, + "loss": 3.2749, + "step": 90600 + }, + { + "epoch": 0.02367729206878514, + "grad_norm": 6.860097408294678, + "learning_rate": 9.999652545669926e-06, + "loss": 3.2887, + "step": 90800 + }, + { + "epoch": 0.02372944469448731, + "grad_norm": 6.4746599197387695, + "learning_rate": 9.999642621147414e-06, + "loss": 3.2687, + "step": 91000 + }, + { + "epoch": 0.02378159732018948, + "grad_norm": 6.279762268066406, + "learning_rate": 9.999632556879127e-06, + "loss": 3.2578, + "step": 91200 + }, + { + "epoch": 0.02383374994589165, + "grad_norm": 6.2368268966674805, + "learning_rate": 9.999622352865342e-06, + "loss": 3.2125, + "step": 91400 + }, + { + "epoch": 0.02388590257159382, + "grad_norm": 6.1834869384765625, + "learning_rate": 9.999612009106349e-06, + "loss": 3.2436, + "step": 91600 + }, + { + "epoch": 0.023938055197295992, + "grad_norm": 7.046905517578125, + "learning_rate": 9.999601525602436e-06, + "loss": 3.2105, + "step": 91800 + }, + { + "epoch": 0.023990207822998162, + "grad_norm": 6.951870918273926, + "learning_rate": 9.999590902353895e-06, + "loss": 3.2179, + "step": 92000 + }, + { + "epoch": 0.02404236044870033, + "grad_norm": 6.480534553527832, + "learning_rate": 9.999580139361023e-06, + "loss": 3.2562, + "step": 92200 + }, + { + "epoch": 0.0240945130744025, + "grad_norm": 6.898007869720459, + "learning_rate": 9.999569236624122e-06, + "loss": 3.2618, + "step": 92400 + }, + { + "epoch": 0.02414666570010467, + "grad_norm": 7.365018367767334, + "learning_rate": 9.999558194143497e-06, + "loss": 3.2709, + "step": 92600 + }, + { + "epoch": 0.02419881832580684, + "grad_norm": 6.268816947937012, + "learning_rate": 9.999547011919454e-06, + "loss": 3.2585, + "step": 92800 + }, + { + "epoch": 0.02425097095150901, + "grad_norm": 7.152459621429443, + "learning_rate": 9.999535689952309e-06, + "loss": 3.2336, + "step": 93000 + 
}, + { + "epoch": 0.02430312357721118, + "grad_norm": 7.479735851287842, + "learning_rate": 9.999524228242376e-06, + "loss": 3.2201, + "step": 93200 + }, + { + "epoch": 0.02435527620291335, + "grad_norm": 7.143455982208252, + "learning_rate": 9.999512626789977e-06, + "loss": 3.248, + "step": 93400 + }, + { + "epoch": 0.02440742882861552, + "grad_norm": 6.253408432006836, + "learning_rate": 9.999500885595435e-06, + "loss": 3.2142, + "step": 93600 + }, + { + "epoch": 0.02445958145431769, + "grad_norm": 7.234580039978027, + "learning_rate": 9.999489004659077e-06, + "loss": 3.2751, + "step": 93800 + }, + { + "epoch": 0.02451173408001986, + "grad_norm": 6.902135372161865, + "learning_rate": 9.999476983981238e-06, + "loss": 3.2603, + "step": 94000 + }, + { + "epoch": 0.02456388670572203, + "grad_norm": 6.845031261444092, + "learning_rate": 9.999464823562253e-06, + "loss": 3.2149, + "step": 94200 + }, + { + "epoch": 0.0246160393314242, + "grad_norm": 6.742398738861084, + "learning_rate": 9.999452523402461e-06, + "loss": 3.2196, + "step": 94400 + }, + { + "epoch": 0.02466819195712637, + "grad_norm": 7.369746208190918, + "learning_rate": 9.999440083502206e-06, + "loss": 3.2394, + "step": 94600 + }, + { + "epoch": 0.02472034458282854, + "grad_norm": 7.084136962890625, + "learning_rate": 9.999427503861836e-06, + "loss": 3.1992, + "step": 94800 + }, + { + "epoch": 0.02477249720853071, + "grad_norm": 6.0851054191589355, + "learning_rate": 9.999414784481705e-06, + "loss": 3.2615, + "step": 95000 + }, + { + "epoch": 0.024824649834232878, + "grad_norm": 6.620251178741455, + "learning_rate": 9.999401925362164e-06, + "loss": 3.2164, + "step": 95200 + }, + { + "epoch": 0.02487680245993505, + "grad_norm": 6.744817733764648, + "learning_rate": 9.999388926503576e-06, + "loss": 3.247, + "step": 95400 + }, + { + "epoch": 0.02492895508563722, + "grad_norm": 7.277287006378174, + "learning_rate": 9.999375787906301e-06, + "loss": 3.2377, + "step": 95600 + }, + { + "epoch": 0.02498110771133939, + "grad_norm": 6.504336357116699, + "learning_rate": 9.999362509570709e-06, + "loss": 3.2138, + "step": 95800 + }, + { + "epoch": 0.02503326033704156, + "grad_norm": 6.75746488571167, + "learning_rate": 9.999349091497173e-06, + "loss": 3.2087, + "step": 96000 + }, + { + "epoch": 0.02508541296274373, + "grad_norm": 6.542148590087891, + "learning_rate": 9.999335533686061e-06, + "loss": 3.1869, + "step": 96200 + }, + { + "epoch": 0.0251375655884459, + "grad_norm": 6.434966087341309, + "learning_rate": 9.999321836137759e-06, + "loss": 3.2454, + "step": 96400 + }, + { + "epoch": 0.025189718214148067, + "grad_norm": 6.355975151062012, + "learning_rate": 9.999307998852648e-06, + "loss": 3.2172, + "step": 96600 + }, + { + "epoch": 0.025241870839850238, + "grad_norm": 6.389385223388672, + "learning_rate": 9.999294021831112e-06, + "loss": 3.238, + "step": 96800 + }, + { + "epoch": 0.025294023465552408, + "grad_norm": 6.235101222991943, + "learning_rate": 9.999279905073544e-06, + "loss": 3.2511, + "step": 97000 + }, + { + "epoch": 0.02534617609125458, + "grad_norm": 7.004864692687988, + "learning_rate": 9.99926564858034e-06, + "loss": 3.1807, + "step": 97200 + }, + { + "epoch": 0.02539832871695675, + "grad_norm": 6.632132530212402, + "learning_rate": 9.999251252351896e-06, + "loss": 3.2, + "step": 97400 + }, + { + "epoch": 0.02545048134265892, + "grad_norm": 7.012777328491211, + "learning_rate": 9.999236716388614e-06, + "loss": 3.2303, + "step": 97600 + }, + { + "epoch": 0.02550263396836109, + "grad_norm": 7.067131519317627, + 
"learning_rate": 9.999222040690901e-06, + "loss": 3.1878, + "step": 97800 + }, + { + "epoch": 0.025554786594063256, + "grad_norm": 6.799013137817383, + "learning_rate": 9.99920722525917e-06, + "loss": 3.1748, + "step": 98000 + }, + { + "epoch": 0.025606939219765427, + "grad_norm": 6.825366497039795, + "learning_rate": 9.999192270093832e-06, + "loss": 3.2289, + "step": 98200 + }, + { + "epoch": 0.025659091845467597, + "grad_norm": 7.0657782554626465, + "learning_rate": 9.999177175195305e-06, + "loss": 3.2448, + "step": 98400 + }, + { + "epoch": 0.025711244471169768, + "grad_norm": 6.592565059661865, + "learning_rate": 9.99916194056401e-06, + "loss": 3.2417, + "step": 98600 + }, + { + "epoch": 0.025763397096871938, + "grad_norm": 6.476219177246094, + "learning_rate": 9.999146566200378e-06, + "loss": 3.1744, + "step": 98800 + }, + { + "epoch": 0.02581554972257411, + "grad_norm": 6.580258846282959, + "learning_rate": 9.999131052104834e-06, + "loss": 3.2692, + "step": 99000 + }, + { + "epoch": 0.02586770234827628, + "grad_norm": 6.556769371032715, + "learning_rate": 9.999115398277812e-06, + "loss": 3.189, + "step": 99200 + }, + { + "epoch": 0.02591985497397845, + "grad_norm": 7.036558151245117, + "learning_rate": 9.999099604719751e-06, + "loss": 3.2138, + "step": 99400 + }, + { + "epoch": 0.025972007599680616, + "grad_norm": 6.926954746246338, + "learning_rate": 9.999083671431092e-06, + "loss": 3.2066, + "step": 99600 + }, + { + "epoch": 0.026024160225382786, + "grad_norm": 6.171466827392578, + "learning_rate": 9.999067598412279e-06, + "loss": 3.1867, + "step": 99800 + }, + { + "epoch": 0.026076312851084957, + "grad_norm": 6.851698398590088, + "learning_rate": 9.999051385663765e-06, + "loss": 3.2022, + "step": 100000 + }, + { + "epoch": 0.026128465476787127, + "grad_norm": 6.199542045593262, + "learning_rate": 9.999035033185998e-06, + "loss": 3.205, + "step": 100200 + }, + { + "epoch": 0.026180618102489298, + "grad_norm": 6.503675937652588, + "learning_rate": 9.99901854097944e-06, + "loss": 3.2172, + "step": 100400 + }, + { + "epoch": 0.026232770728191468, + "grad_norm": 6.505238056182861, + "learning_rate": 9.999001909044548e-06, + "loss": 3.2047, + "step": 100600 + }, + { + "epoch": 0.02628492335389364, + "grad_norm": 6.692224979400635, + "learning_rate": 9.99898513738179e-06, + "loss": 3.1518, + "step": 100800 + }, + { + "epoch": 0.026337075979595805, + "grad_norm": 6.675695419311523, + "learning_rate": 9.998968225991632e-06, + "loss": 3.1623, + "step": 101000 + }, + { + "epoch": 0.026389228605297976, + "grad_norm": 6.617093086242676, + "learning_rate": 9.998951174874548e-06, + "loss": 3.2322, + "step": 101200 + }, + { + "epoch": 0.026441381231000146, + "grad_norm": 6.3754119873046875, + "learning_rate": 9.998933984031016e-06, + "loss": 3.2037, + "step": 101400 + }, + { + "epoch": 0.026493533856702316, + "grad_norm": 6.500361442565918, + "learning_rate": 9.998916653461515e-06, + "loss": 3.2131, + "step": 101600 + }, + { + "epoch": 0.026545686482404487, + "grad_norm": 6.8905415534973145, + "learning_rate": 9.998899183166529e-06, + "loss": 3.1613, + "step": 101800 + }, + { + "epoch": 0.026597839108106657, + "grad_norm": 6.983830451965332, + "learning_rate": 9.998881573146546e-06, + "loss": 3.1294, + "step": 102000 + }, + { + "epoch": 0.026649991733808828, + "grad_norm": 6.894038677215576, + "learning_rate": 9.99886382340206e-06, + "loss": 3.2047, + "step": 102200 + }, + { + "epoch": 0.026702144359510994, + "grad_norm": 6.187081336975098, + "learning_rate": 9.998845933933565e-06, + "loss": 
3.1967, + "step": 102400 + }, + { + "epoch": 0.026754296985213165, + "grad_norm": 7.0318708419799805, + "learning_rate": 9.998827904741563e-06, + "loss": 3.2053, + "step": 102600 + }, + { + "epoch": 0.026806449610915335, + "grad_norm": 6.508965969085693, + "learning_rate": 9.998809735826559e-06, + "loss": 3.1671, + "step": 102800 + }, + { + "epoch": 0.026858602236617506, + "grad_norm": 6.52016019821167, + "learning_rate": 9.998791427189058e-06, + "loss": 3.1954, + "step": 103000 + }, + { + "epoch": 0.026910754862319676, + "grad_norm": 6.182394981384277, + "learning_rate": 9.998772978829571e-06, + "loss": 3.1369, + "step": 103200 + }, + { + "epoch": 0.026962907488021846, + "grad_norm": 6.727584362030029, + "learning_rate": 9.998754390748617e-06, + "loss": 3.1729, + "step": 103400 + }, + { + "epoch": 0.027015060113724017, + "grad_norm": 6.066624641418457, + "learning_rate": 9.998735662946715e-06, + "loss": 3.1742, + "step": 103600 + }, + { + "epoch": 0.027067212739426187, + "grad_norm": 6.274039268493652, + "learning_rate": 9.998716795424385e-06, + "loss": 3.1548, + "step": 103800 + }, + { + "epoch": 0.027119365365128354, + "grad_norm": 6.5091352462768555, + "learning_rate": 9.998697788182158e-06, + "loss": 3.1464, + "step": 104000 + }, + { + "epoch": 0.027171517990830524, + "grad_norm": 7.099887847900391, + "learning_rate": 9.998678641220564e-06, + "loss": 3.1755, + "step": 104200 + }, + { + "epoch": 0.027223670616532695, + "grad_norm": 6.529077529907227, + "learning_rate": 9.99865935454014e-06, + "loss": 3.1749, + "step": 104400 + }, + { + "epoch": 0.027275823242234865, + "grad_norm": 6.756040573120117, + "learning_rate": 9.998639928141422e-06, + "loss": 3.192, + "step": 104600 + }, + { + "epoch": 0.027327975867937036, + "grad_norm": 6.822609901428223, + "learning_rate": 9.998620362024954e-06, + "loss": 3.2029, + "step": 104800 + }, + { + "epoch": 0.027380128493639206, + "grad_norm": 6.252408504486084, + "learning_rate": 9.998600656191284e-06, + "loss": 3.1596, + "step": 105000 + }, + { + "epoch": 0.027432281119341376, + "grad_norm": 6.114434719085693, + "learning_rate": 9.99858081064096e-06, + "loss": 3.2181, + "step": 105200 + }, + { + "epoch": 0.027484433745043543, + "grad_norm": 6.77893590927124, + "learning_rate": 9.998560825374542e-06, + "loss": 3.1836, + "step": 105400 + }, + { + "epoch": 0.027536586370745714, + "grad_norm": 5.703882217407227, + "learning_rate": 9.998540700392583e-06, + "loss": 3.1661, + "step": 105600 + }, + { + "epoch": 0.027588738996447884, + "grad_norm": 6.503817558288574, + "learning_rate": 9.99852043569565e-06, + "loss": 3.1212, + "step": 105800 + }, + { + "epoch": 0.027640891622150054, + "grad_norm": 6.397846221923828, + "learning_rate": 9.998500031284304e-06, + "loss": 3.1341, + "step": 106000 + }, + { + "epoch": 0.027693044247852225, + "grad_norm": 6.377574443817139, + "learning_rate": 9.998479487159121e-06, + "loss": 3.1652, + "step": 106200 + }, + { + "epoch": 0.027745196873554395, + "grad_norm": 6.59285831451416, + "learning_rate": 9.998458803320671e-06, + "loss": 3.1326, + "step": 106400 + }, + { + "epoch": 0.027797349499256566, + "grad_norm": 5.924184322357178, + "learning_rate": 9.998437979769536e-06, + "loss": 3.1707, + "step": 106600 + }, + { + "epoch": 0.027849502124958733, + "grad_norm": 6.608452320098877, + "learning_rate": 9.998417016506296e-06, + "loss": 3.176, + "step": 106800 + }, + { + "epoch": 0.027901654750660903, + "grad_norm": 7.678200721740723, + "learning_rate": 9.998395913531536e-06, + "loss": 3.1778, + "step": 107000 + }, + { + 
"epoch": 0.027953807376363073, + "grad_norm": 7.167779922485352, + "learning_rate": 9.998374670845847e-06, + "loss": 3.181, + "step": 107200 + }, + { + "epoch": 0.028005960002065244, + "grad_norm": 6.5178937911987305, + "learning_rate": 9.998353288449823e-06, + "loss": 3.1868, + "step": 107400 + }, + { + "epoch": 0.028058112627767414, + "grad_norm": 5.7756218910217285, + "learning_rate": 9.998331766344062e-06, + "loss": 3.1752, + "step": 107600 + }, + { + "epoch": 0.028110265253469584, + "grad_norm": 6.773250579833984, + "learning_rate": 9.998310104529163e-06, + "loss": 3.1601, + "step": 107800 + }, + { + "epoch": 0.028162417879171755, + "grad_norm": 7.053867816925049, + "learning_rate": 9.998288303005734e-06, + "loss": 3.181, + "step": 108000 + }, + { + "epoch": 0.028214570504873925, + "grad_norm": 6.301112174987793, + "learning_rate": 9.998266361774385e-06, + "loss": 3.1717, + "step": 108200 + }, + { + "epoch": 0.028266723130576092, + "grad_norm": 6.55330753326416, + "learning_rate": 9.998244280835728e-06, + "loss": 3.1113, + "step": 108400 + }, + { + "epoch": 0.028318875756278263, + "grad_norm": 6.428907871246338, + "learning_rate": 9.998222060190377e-06, + "loss": 3.1395, + "step": 108600 + }, + { + "epoch": 0.028371028381980433, + "grad_norm": 6.372659683227539, + "learning_rate": 9.998199699838959e-06, + "loss": 3.2008, + "step": 108800 + }, + { + "epoch": 0.028423181007682603, + "grad_norm": 6.7776689529418945, + "learning_rate": 9.998177199782095e-06, + "loss": 3.214, + "step": 109000 + }, + { + "epoch": 0.028475333633384774, + "grad_norm": 6.657548427581787, + "learning_rate": 9.998154560020417e-06, + "loss": 3.1759, + "step": 109200 + }, + { + "epoch": 0.028527486259086944, + "grad_norm": 7.199187278747559, + "learning_rate": 9.998131780554554e-06, + "loss": 3.1263, + "step": 109400 + }, + { + "epoch": 0.028579638884789114, + "grad_norm": 6.797530651092529, + "learning_rate": 9.998108861385145e-06, + "loss": 3.1394, + "step": 109600 + }, + { + "epoch": 0.02863179151049128, + "grad_norm": 7.414548873901367, + "learning_rate": 9.998085802512832e-06, + "loss": 3.116, + "step": 109800 + }, + { + "epoch": 0.028683944136193452, + "grad_norm": 6.82701301574707, + "learning_rate": 9.998062603938255e-06, + "loss": 3.1589, + "step": 110000 + }, + { + "epoch": 0.028736096761895622, + "grad_norm": 6.446861267089844, + "learning_rate": 9.998039265662067e-06, + "loss": 3.1494, + "step": 110200 + }, + { + "epoch": 0.028788249387597793, + "grad_norm": 6.727176189422607, + "learning_rate": 9.998015787684919e-06, + "loss": 3.1162, + "step": 110400 + }, + { + "epoch": 0.028840402013299963, + "grad_norm": 7.0100226402282715, + "learning_rate": 9.997992170007464e-06, + "loss": 3.1154, + "step": 110600 + }, + { + "epoch": 0.028892554639002133, + "grad_norm": 5.6385722160339355, + "learning_rate": 9.997968412630368e-06, + "loss": 3.1144, + "step": 110800 + }, + { + "epoch": 0.028944707264704304, + "grad_norm": 6.636441230773926, + "learning_rate": 9.997944515554291e-06, + "loss": 3.1642, + "step": 111000 + }, + { + "epoch": 0.028996859890406474, + "grad_norm": 6.764395236968994, + "learning_rate": 9.997920478779901e-06, + "loss": 3.1758, + "step": 111200 + }, + { + "epoch": 0.02904901251610864, + "grad_norm": 7.620389938354492, + "learning_rate": 9.997896302307872e-06, + "loss": 3.1518, + "step": 111400 + }, + { + "epoch": 0.02910116514181081, + "grad_norm": 6.4152021408081055, + "learning_rate": 9.997871986138878e-06, + "loss": 3.1043, + "step": 111600 + }, + { + "epoch": 0.029153317767512982, + 
"grad_norm": 5.856719493865967, + "learning_rate": 9.9978475302736e-06, + "loss": 3.1493, + "step": 111800 + }, + { + "epoch": 0.029205470393215152, + "grad_norm": 6.708147048950195, + "learning_rate": 9.99782293471272e-06, + "loss": 3.1646, + "step": 112000 + }, + { + "epoch": 0.029257623018917323, + "grad_norm": 6.783796310424805, + "learning_rate": 9.997798199456927e-06, + "loss": 3.1477, + "step": 112200 + }, + { + "epoch": 0.029309775644619493, + "grad_norm": 6.652106761932373, + "learning_rate": 9.997773324506912e-06, + "loss": 3.107, + "step": 112400 + }, + { + "epoch": 0.029361928270321663, + "grad_norm": 6.451571464538574, + "learning_rate": 9.997748309863367e-06, + "loss": 3.1245, + "step": 112600 + }, + { + "epoch": 0.02941408089602383, + "grad_norm": 6.6614203453063965, + "learning_rate": 9.997723155526998e-06, + "loss": 3.1216, + "step": 112800 + }, + { + "epoch": 0.029466233521726, + "grad_norm": 6.864782333374023, + "learning_rate": 9.9976978614985e-06, + "loss": 3.1465, + "step": 113000 + }, + { + "epoch": 0.02951838614742817, + "grad_norm": 7.249753952026367, + "learning_rate": 9.997672427778588e-06, + "loss": 3.1112, + "step": 113200 + }, + { + "epoch": 0.02957053877313034, + "grad_norm": 6.135322093963623, + "learning_rate": 9.997646854367968e-06, + "loss": 3.1491, + "step": 113400 + }, + { + "epoch": 0.029622691398832512, + "grad_norm": 6.486145496368408, + "learning_rate": 9.997621141267355e-06, + "loss": 3.1672, + "step": 113600 + }, + { + "epoch": 0.029674844024534682, + "grad_norm": 7.196221828460693, + "learning_rate": 9.99759528847747e-06, + "loss": 3.1266, + "step": 113800 + }, + { + "epoch": 0.029726996650236853, + "grad_norm": 6.458970546722412, + "learning_rate": 9.997569295999032e-06, + "loss": 3.1566, + "step": 114000 + }, + { + "epoch": 0.02977914927593902, + "grad_norm": 6.437240123748779, + "learning_rate": 9.997543163832772e-06, + "loss": 3.0992, + "step": 114200 + }, + { + "epoch": 0.02983130190164119, + "grad_norm": 6.01683235168457, + "learning_rate": 9.997516891979418e-06, + "loss": 3.1501, + "step": 114400 + }, + { + "epoch": 0.02988345452734336, + "grad_norm": 6.238487720489502, + "learning_rate": 9.997490480439705e-06, + "loss": 3.1628, + "step": 114600 + }, + { + "epoch": 0.02993560715304553, + "grad_norm": 6.446498870849609, + "learning_rate": 9.997463929214368e-06, + "loss": 3.1472, + "step": 114800 + }, + { + "epoch": 0.0299877597787477, + "grad_norm": 7.348095417022705, + "learning_rate": 9.997437238304154e-06, + "loss": 3.171, + "step": 115000 + }, + { + "epoch": 0.03003991240444987, + "grad_norm": 6.736360549926758, + "learning_rate": 9.997410407709806e-06, + "loss": 3.1841, + "step": 115200 + }, + { + "epoch": 0.030092065030152042, + "grad_norm": 6.533570289611816, + "learning_rate": 9.997383437432075e-06, + "loss": 3.1261, + "step": 115400 + }, + { + "epoch": 0.030144217655854212, + "grad_norm": 6.483672618865967, + "learning_rate": 9.997356327471716e-06, + "loss": 3.1736, + "step": 115600 + }, + { + "epoch": 0.03019637028155638, + "grad_norm": 5.92385721206665, + "learning_rate": 9.997329077829484e-06, + "loss": 3.0937, + "step": 115800 + }, + { + "epoch": 0.03024852290725855, + "grad_norm": 6.332182884216309, + "learning_rate": 9.997301688506143e-06, + "loss": 3.0874, + "step": 116000 + }, + { + "epoch": 0.03030067553296072, + "grad_norm": 6.6715521812438965, + "learning_rate": 9.997274159502457e-06, + "loss": 3.1354, + "step": 116200 + }, + { + "epoch": 0.03035282815866289, + "grad_norm": 6.579791069030762, + "learning_rate": 
9.997246490819197e-06, + "loss": 3.1421, + "step": 116400 + }, + { + "epoch": 0.03040498078436506, + "grad_norm": 7.069493770599365, + "learning_rate": 9.997218682457135e-06, + "loss": 3.1171, + "step": 116600 + }, + { + "epoch": 0.03045713341006723, + "grad_norm": 6.911351680755615, + "learning_rate": 9.997190734417048e-06, + "loss": 3.0969, + "step": 116800 + }, + { + "epoch": 0.0305092860357694, + "grad_norm": 6.206057548522949, + "learning_rate": 9.99716264669972e-06, + "loss": 3.1368, + "step": 117000 + }, + { + "epoch": 0.03056143866147157, + "grad_norm": 6.447131156921387, + "learning_rate": 9.997134419305933e-06, + "loss": 3.1165, + "step": 117200 + }, + { + "epoch": 0.03061359128717374, + "grad_norm": 6.3493804931640625, + "learning_rate": 9.997106052236475e-06, + "loss": 3.1207, + "step": 117400 + }, + { + "epoch": 0.03066574391287591, + "grad_norm": 6.453786849975586, + "learning_rate": 9.997077545492144e-06, + "loss": 3.1396, + "step": 117600 + }, + { + "epoch": 0.03071789653857808, + "grad_norm": 6.4193010330200195, + "learning_rate": 9.997048899073734e-06, + "loss": 3.129, + "step": 117800 + }, + { + "epoch": 0.03077004916428025, + "grad_norm": 6.173766136169434, + "learning_rate": 9.997020112982043e-06, + "loss": 3.157, + "step": 118000 + }, + { + "epoch": 0.03082220178998242, + "grad_norm": 6.576676368713379, + "learning_rate": 9.99699118721788e-06, + "loss": 3.1641, + "step": 118200 + }, + { + "epoch": 0.03087435441568459, + "grad_norm": 6.761925220489502, + "learning_rate": 9.99696212178205e-06, + "loss": 3.101, + "step": 118400 + }, + { + "epoch": 0.030926507041386758, + "grad_norm": 6.417123317718506, + "learning_rate": 9.996932916675368e-06, + "loss": 3.0993, + "step": 118600 + }, + { + "epoch": 0.030978659667088928, + "grad_norm": 6.432227611541748, + "learning_rate": 9.996903571898649e-06, + "loss": 3.1012, + "step": 118800 + }, + { + "epoch": 0.0310308122927911, + "grad_norm": 6.400918006896973, + "learning_rate": 9.996874087452714e-06, + "loss": 3.1017, + "step": 119000 + }, + { + "epoch": 0.03108296491849327, + "grad_norm": 6.3425984382629395, + "learning_rate": 9.996844463338387e-06, + "loss": 3.0497, + "step": 119200 + }, + { + "epoch": 0.03113511754419544, + "grad_norm": 6.788375377655029, + "learning_rate": 9.996814699556494e-06, + "loss": 3.1498, + "step": 119400 + }, + { + "epoch": 0.03118727016989761, + "grad_norm": 6.5473761558532715, + "learning_rate": 9.99678479610787e-06, + "loss": 3.0913, + "step": 119600 + }, + { + "epoch": 0.03123942279559978, + "grad_norm": 6.9028191566467285, + "learning_rate": 9.996754752993348e-06, + "loss": 3.1111, + "step": 119800 + }, + { + "epoch": 0.03129157542130195, + "grad_norm": 6.383526802062988, + "learning_rate": 9.99672457021377e-06, + "loss": 3.1439, + "step": 120000 + }, + { + "epoch": 0.03134372804700412, + "grad_norm": 6.976934909820557, + "learning_rate": 9.996694247769979e-06, + "loss": 3.1025, + "step": 120200 + }, + { + "epoch": 0.03139588067270629, + "grad_norm": 6.561999320983887, + "learning_rate": 9.996663785662823e-06, + "loss": 3.1209, + "step": 120400 + }, + { + "epoch": 0.03144803329840846, + "grad_norm": 6.6694817543029785, + "learning_rate": 9.996633183893152e-06, + "loss": 3.0911, + "step": 120600 + }, + { + "epoch": 0.03150018592411063, + "grad_norm": 6.184081077575684, + "learning_rate": 9.996602442461823e-06, + "loss": 3.1038, + "step": 120800 + }, + { + "epoch": 0.0315523385498128, + "grad_norm": 6.33581018447876, + "learning_rate": 9.996571561369692e-06, + "loss": 3.08, + "step": 121000 + }, + 
{ + "epoch": 0.03160449117551497, + "grad_norm": 6.361224174499512, + "learning_rate": 9.996540540617628e-06, + "loss": 3.1018, + "step": 121200 + }, + { + "epoch": 0.03165664380121714, + "grad_norm": 5.576674461364746, + "learning_rate": 9.996509380206491e-06, + "loss": 3.1165, + "step": 121400 + }, + { + "epoch": 0.03170879642691931, + "grad_norm": 6.4147844314575195, + "learning_rate": 9.996478080137158e-06, + "loss": 3.0525, + "step": 121600 + }, + { + "epoch": 0.03176094905262148, + "grad_norm": 7.201076030731201, + "learning_rate": 9.996446640410502e-06, + "loss": 3.0917, + "step": 121800 + }, + { + "epoch": 0.03181310167832365, + "grad_norm": 6.715845584869385, + "learning_rate": 9.9964150610274e-06, + "loss": 3.0954, + "step": 122000 + }, + { + "epoch": 0.03186525430402582, + "grad_norm": 6.023102283477783, + "learning_rate": 9.996383341988736e-06, + "loss": 3.0728, + "step": 122200 + }, + { + "epoch": 0.031917406929727984, + "grad_norm": 7.259410381317139, + "learning_rate": 9.996351483295396e-06, + "loss": 3.0708, + "step": 122400 + }, + { + "epoch": 0.031969559555430155, + "grad_norm": 6.183089256286621, + "learning_rate": 9.996319484948273e-06, + "loss": 3.1023, + "step": 122600 + }, + { + "epoch": 0.032021712181132325, + "grad_norm": 6.622340202331543, + "learning_rate": 9.996287346948258e-06, + "loss": 3.0743, + "step": 122800 + }, + { + "epoch": 0.032073864806834496, + "grad_norm": 5.829206943511963, + "learning_rate": 9.996255069296251e-06, + "loss": 3.1051, + "step": 123000 + }, + { + "epoch": 0.032126017432536666, + "grad_norm": 6.266514301300049, + "learning_rate": 9.996222651993153e-06, + "loss": 3.0887, + "step": 123200 + }, + { + "epoch": 0.032178170058238836, + "grad_norm": 5.771326065063477, + "learning_rate": 9.996190095039874e-06, + "loss": 3.0605, + "step": 123400 + }, + { + "epoch": 0.03223032268394101, + "grad_norm": 6.461459636688232, + "learning_rate": 9.996157398437319e-06, + "loss": 3.089, + "step": 123600 + }, + { + "epoch": 0.03228247530964318, + "grad_norm": 6.701711177825928, + "learning_rate": 9.996124562186402e-06, + "loss": 3.0984, + "step": 123800 + }, + { + "epoch": 0.03233462793534535, + "grad_norm": 6.251786708831787, + "learning_rate": 9.996091586288045e-06, + "loss": 3.0881, + "step": 124000 + }, + { + "epoch": 0.03238678056104752, + "grad_norm": 6.822304725646973, + "learning_rate": 9.996058470743167e-06, + "loss": 3.1225, + "step": 124200 + }, + { + "epoch": 0.03243893318674969, + "grad_norm": 6.3806257247924805, + "learning_rate": 9.996025215552694e-06, + "loss": 3.0939, + "step": 124400 + }, + { + "epoch": 0.03249108581245186, + "grad_norm": 7.634413242340088, + "learning_rate": 9.995991820717557e-06, + "loss": 3.0543, + "step": 124600 + }, + { + "epoch": 0.03254323843815403, + "grad_norm": 7.31152868270874, + "learning_rate": 9.995958286238685e-06, + "loss": 3.1145, + "step": 124800 + }, + { + "epoch": 0.0325953910638562, + "grad_norm": 6.632618427276611, + "learning_rate": 9.99592461211702e-06, + "loss": 3.0899, + "step": 125000 + }, + { + "epoch": 0.03264754368955836, + "grad_norm": 6.440008640289307, + "learning_rate": 9.995890798353503e-06, + "loss": 3.0536, + "step": 125200 + }, + { + "epoch": 0.03269969631526053, + "grad_norm": 6.341098785400391, + "learning_rate": 9.995856844949075e-06, + "loss": 3.1024, + "step": 125400 + }, + { + "epoch": 0.032751848940962704, + "grad_norm": 6.386812686920166, + "learning_rate": 9.99582275190469e-06, + "loss": 3.0769, + "step": 125600 + }, + { + "epoch": 0.032804001566664874, + "grad_norm": 
6.581667900085449, + "learning_rate": 9.995788519221297e-06, + "loss": 3.0737, + "step": 125800 + }, + { + "epoch": 0.032856154192367044, + "grad_norm": 7.009296894073486, + "learning_rate": 9.995754146899856e-06, + "loss": 3.0732, + "step": 126000 + }, + { + "epoch": 0.032908306818069215, + "grad_norm": 6.533790588378906, + "learning_rate": 9.995719634941325e-06, + "loss": 3.1034, + "step": 126200 + }, + { + "epoch": 0.032960459443771385, + "grad_norm": 6.344036102294922, + "learning_rate": 9.99568498334667e-06, + "loss": 3.0929, + "step": 126400 + }, + { + "epoch": 0.033012612069473556, + "grad_norm": 6.279763221740723, + "learning_rate": 9.995650192116862e-06, + "loss": 3.068, + "step": 126600 + }, + { + "epoch": 0.033064764695175726, + "grad_norm": 7.0318922996521, + "learning_rate": 9.995615261252868e-06, + "loss": 3.061, + "step": 126800 + }, + { + "epoch": 0.033116917320877896, + "grad_norm": 6.694097518920898, + "learning_rate": 9.995580190755667e-06, + "loss": 3.0943, + "step": 127000 + }, + { + "epoch": 0.03316906994658007, + "grad_norm": 5.494612693786621, + "learning_rate": 9.995544980626241e-06, + "loss": 3.1181, + "step": 127200 + }, + { + "epoch": 0.03322122257228224, + "grad_norm": 6.504891395568848, + "learning_rate": 9.995509630865573e-06, + "loss": 3.0556, + "step": 127400 + }, + { + "epoch": 0.03327337519798441, + "grad_norm": 6.207190036773682, + "learning_rate": 9.99547414147465e-06, + "loss": 3.106, + "step": 127600 + }, + { + "epoch": 0.03332552782368658, + "grad_norm": 6.687653064727783, + "learning_rate": 9.995438512454465e-06, + "loss": 3.1383, + "step": 127800 + }, + { + "epoch": 0.03337768044938875, + "grad_norm": 5.720125198364258, + "learning_rate": 9.995402743806012e-06, + "loss": 3.062, + "step": 128000 + }, + { + "epoch": 0.03342983307509091, + "grad_norm": 5.79355001449585, + "learning_rate": 9.995366835530295e-06, + "loss": 3.0938, + "step": 128200 + }, + { + "epoch": 0.03348198570079308, + "grad_norm": 6.339210510253906, + "learning_rate": 9.995330787628315e-06, + "loss": 3.0733, + "step": 128400 + }, + { + "epoch": 0.03353413832649525, + "grad_norm": 6.438317775726318, + "learning_rate": 9.995294600101077e-06, + "loss": 3.0625, + "step": 128600 + }, + { + "epoch": 0.03358629095219742, + "grad_norm": 6.823408126831055, + "learning_rate": 9.995258272949597e-06, + "loss": 3.1021, + "step": 128800 + }, + { + "epoch": 0.03363844357789959, + "grad_norm": 5.777518272399902, + "learning_rate": 9.995221806174888e-06, + "loss": 3.0813, + "step": 129000 + }, + { + "epoch": 0.033690596203601764, + "grad_norm": 6.753819942474365, + "learning_rate": 9.99518519977797e-06, + "loss": 3.0556, + "step": 129200 + }, + { + "epoch": 0.033742748829303934, + "grad_norm": 6.781210422515869, + "learning_rate": 9.995148453759866e-06, + "loss": 3.1216, + "step": 129400 + }, + { + "epoch": 0.033794901455006104, + "grad_norm": 6.243967533111572, + "learning_rate": 9.995111568121605e-06, + "loss": 3.0294, + "step": 129600 + }, + { + "epoch": 0.033847054080708275, + "grad_norm": 6.602694511413574, + "learning_rate": 9.995074542864215e-06, + "loss": 3.0701, + "step": 129800 + }, + { + "epoch": 0.033899206706410445, + "grad_norm": 6.756824016571045, + "learning_rate": 9.99503737798873e-06, + "loss": 3.0907, + "step": 130000 + }, + { + "epoch": 0.033951359332112616, + "grad_norm": 6.042007923126221, + "learning_rate": 9.995000073496192e-06, + "loss": 3.0582, + "step": 130200 + }, + { + "epoch": 0.034003511957814786, + "grad_norm": 7.078050136566162, + "learning_rate": 
9.994962629387643e-06, + "loss": 3.0474, + "step": 130400 + }, + { + "epoch": 0.034055664583516956, + "grad_norm": 5.917229652404785, + "learning_rate": 9.994925045664127e-06, + "loss": 3.0569, + "step": 130600 + }, + { + "epoch": 0.03410781720921913, + "grad_norm": 6.29770565032959, + "learning_rate": 9.994887322326698e-06, + "loss": 3.0739, + "step": 130800 + }, + { + "epoch": 0.0341599698349213, + "grad_norm": 5.848755836486816, + "learning_rate": 9.99484945937641e-06, + "loss": 3.0496, + "step": 131000 + }, + { + "epoch": 0.03421212246062346, + "grad_norm": 6.648880481719971, + "learning_rate": 9.99481145681432e-06, + "loss": 3.0691, + "step": 131200 + }, + { + "epoch": 0.03426427508632563, + "grad_norm": 6.360013961791992, + "learning_rate": 9.99477331464149e-06, + "loss": 3.084, + "step": 131400 + }, + { + "epoch": 0.0343164277120278, + "grad_norm": 6.162940979003906, + "learning_rate": 9.994735032858987e-06, + "loss": 3.1258, + "step": 131600 + }, + { + "epoch": 0.03436858033772997, + "grad_norm": 6.413846492767334, + "learning_rate": 9.99469661146788e-06, + "loss": 3.0983, + "step": 131800 + }, + { + "epoch": 0.03442073296343214, + "grad_norm": 6.364840030670166, + "learning_rate": 9.994658050469243e-06, + "loss": 3.0567, + "step": 132000 + }, + { + "epoch": 0.03447288558913431, + "grad_norm": 6.655250072479248, + "learning_rate": 9.994619349864156e-06, + "loss": 3.0799, + "step": 132200 + }, + { + "epoch": 0.03452503821483648, + "grad_norm": 5.781773090362549, + "learning_rate": 9.9945805096537e-06, + "loss": 3.0514, + "step": 132400 + }, + { + "epoch": 0.03457719084053865, + "grad_norm": 7.000985145568848, + "learning_rate": 9.994541529838957e-06, + "loss": 3.0704, + "step": 132600 + }, + { + "epoch": 0.034629343466240824, + "grad_norm": 6.9071550369262695, + "learning_rate": 9.99450241042102e-06, + "loss": 3.0856, + "step": 132800 + }, + { + "epoch": 0.034681496091942994, + "grad_norm": 5.725367546081543, + "learning_rate": 9.994463151400984e-06, + "loss": 3.0851, + "step": 133000 + }, + { + "epoch": 0.034733648717645164, + "grad_norm": 6.823086738586426, + "learning_rate": 9.994423752779942e-06, + "loss": 3.0728, + "step": 133200 + }, + { + "epoch": 0.034785801343347335, + "grad_norm": 6.22132682800293, + "learning_rate": 9.994384214558999e-06, + "loss": 3.0625, + "step": 133400 + }, + { + "epoch": 0.034837953969049505, + "grad_norm": 6.658827304840088, + "learning_rate": 9.994344536739256e-06, + "loss": 3.0398, + "step": 133600 + }, + { + "epoch": 0.034890106594751676, + "grad_norm": 6.231532096862793, + "learning_rate": 9.994304719321825e-06, + "loss": 3.0462, + "step": 133800 + }, + { + "epoch": 0.034942259220453846, + "grad_norm": 6.183959484100342, + "learning_rate": 9.994264762307821e-06, + "loss": 3.0558, + "step": 134000 + }, + { + "epoch": 0.03499441184615601, + "grad_norm": 6.413631439208984, + "learning_rate": 9.994224665698356e-06, + "loss": 3.0879, + "step": 134200 + }, + { + "epoch": 0.03504656447185818, + "grad_norm": 6.981194496154785, + "learning_rate": 9.994184429494554e-06, + "loss": 3.0688, + "step": 134400 + }, + { + "epoch": 0.03509871709756035, + "grad_norm": 6.0282135009765625, + "learning_rate": 9.994144053697539e-06, + "loss": 3.0768, + "step": 134600 + }, + { + "epoch": 0.03515086972326252, + "grad_norm": 5.871272563934326, + "learning_rate": 9.994103538308437e-06, + "loss": 3.0281, + "step": 134800 + }, + { + "epoch": 0.03520302234896469, + "grad_norm": 6.554361343383789, + "learning_rate": 9.994062883328384e-06, + "loss": 3.054, + "step": 135000 + 
}, + { + "epoch": 0.03525517497466686, + "grad_norm": 6.451972007751465, + "learning_rate": 9.994022088758515e-06, + "loss": 3.0884, + "step": 135200 + }, + { + "epoch": 0.03530732760036903, + "grad_norm": 6.5166168212890625, + "learning_rate": 9.993981154599972e-06, + "loss": 3.034, + "step": 135400 + }, + { + "epoch": 0.0353594802260712, + "grad_norm": 6.459281921386719, + "learning_rate": 9.993940080853895e-06, + "loss": 3.0411, + "step": 135600 + }, + { + "epoch": 0.03541163285177337, + "grad_norm": 6.400155544281006, + "learning_rate": 9.993898867521435e-06, + "loss": 3.0376, + "step": 135800 + }, + { + "epoch": 0.03546378547747554, + "grad_norm": 6.980037212371826, + "learning_rate": 9.993857514603744e-06, + "loss": 3.0522, + "step": 136000 + }, + { + "epoch": 0.03551593810317771, + "grad_norm": 7.635961532592773, + "learning_rate": 9.99381602210198e-06, + "loss": 3.0653, + "step": 136200 + }, + { + "epoch": 0.035568090728879884, + "grad_norm": 7.077882289886475, + "learning_rate": 9.993774390017294e-06, + "loss": 3.0062, + "step": 136400 + }, + { + "epoch": 0.035620243354582054, + "grad_norm": 6.433791637420654, + "learning_rate": 9.99373261835086e-06, + "loss": 3.0285, + "step": 136600 + }, + { + "epoch": 0.035672395980284224, + "grad_norm": 6.266335964202881, + "learning_rate": 9.99369070710384e-06, + "loss": 3.0393, + "step": 136800 + }, + { + "epoch": 0.03572454860598639, + "grad_norm": 7.584403038024902, + "learning_rate": 9.993648656277409e-06, + "loss": 3.0379, + "step": 137000 + }, + { + "epoch": 0.03577670123168856, + "grad_norm": 5.1457133293151855, + "learning_rate": 9.993606465872737e-06, + "loss": 3.025, + "step": 137200 + }, + { + "epoch": 0.03582885385739073, + "grad_norm": 6.755367279052734, + "learning_rate": 9.993564135891007e-06, + "loss": 3.039, + "step": 137400 + }, + { + "epoch": 0.0358810064830929, + "grad_norm": 6.786823272705078, + "learning_rate": 9.993521666333404e-06, + "loss": 3.0242, + "step": 137600 + }, + { + "epoch": 0.03593315910879507, + "grad_norm": 6.774286270141602, + "learning_rate": 9.99347905720111e-06, + "loss": 3.0289, + "step": 137800 + }, + { + "epoch": 0.03598531173449724, + "grad_norm": 6.3922200202941895, + "learning_rate": 9.99343630849532e-06, + "loss": 3.0061, + "step": 138000 + }, + { + "epoch": 0.03603746436019941, + "grad_norm": 6.802679061889648, + "learning_rate": 9.993393420217229e-06, + "loss": 3.0451, + "step": 138200 + }, + { + "epoch": 0.03608961698590158, + "grad_norm": 6.436954021453857, + "learning_rate": 9.993350392368031e-06, + "loss": 3.0221, + "step": 138400 + }, + { + "epoch": 0.03614176961160375, + "grad_norm": 6.1146721839904785, + "learning_rate": 9.993307224948934e-06, + "loss": 3.0119, + "step": 138600 + }, + { + "epoch": 0.03619392223730592, + "grad_norm": 6.813344478607178, + "learning_rate": 9.993263917961142e-06, + "loss": 3.0613, + "step": 138800 + }, + { + "epoch": 0.03624607486300809, + "grad_norm": 5.840065002441406, + "learning_rate": 9.993220471405866e-06, + "loss": 3.0639, + "step": 139000 + }, + { + "epoch": 0.03629822748871026, + "grad_norm": 7.216462135314941, + "learning_rate": 9.99317688528432e-06, + "loss": 3.0472, + "step": 139200 + }, + { + "epoch": 0.03635038011441243, + "grad_norm": 6.292632102966309, + "learning_rate": 9.993133159597724e-06, + "loss": 3.0252, + "step": 139400 + }, + { + "epoch": 0.0364025327401146, + "grad_norm": 6.57106351852417, + "learning_rate": 9.993089294347297e-06, + "loss": 3.0342, + "step": 139600 + }, + { + "epoch": 0.03645468536581677, + "grad_norm": 
6.309388160705566, + "learning_rate": 9.99304528953427e-06, + "loss": 3.0658, + "step": 139800 + }, + { + "epoch": 0.03650683799151894, + "grad_norm": 6.843193054199219, + "learning_rate": 9.993001145159867e-06, + "loss": 3.0457, + "step": 140000 + }, + { + "epoch": 0.03655899061722111, + "grad_norm": 6.428710460662842, + "learning_rate": 9.992956861225325e-06, + "loss": 3.108, + "step": 140200 + }, + { + "epoch": 0.03661114324292328, + "grad_norm": 5.9581499099731445, + "learning_rate": 9.992912437731884e-06, + "loss": 3.0241, + "step": 140400 + }, + { + "epoch": 0.03666329586862545, + "grad_norm": 6.154149055480957, + "learning_rate": 9.99286787468078e-06, + "loss": 3.0459, + "step": 140600 + }, + { + "epoch": 0.03671544849432762, + "grad_norm": 6.878948211669922, + "learning_rate": 9.992823172073264e-06, + "loss": 3.0133, + "step": 140800 + }, + { + "epoch": 0.03676760112002979, + "grad_norm": 7.1989264488220215, + "learning_rate": 9.992778329910585e-06, + "loss": 3.1056, + "step": 141000 + }, + { + "epoch": 0.03681975374573196, + "grad_norm": 6.319765567779541, + "learning_rate": 9.992733348193993e-06, + "loss": 2.9999, + "step": 141200 + }, + { + "epoch": 0.03687190637143413, + "grad_norm": 6.223499298095703, + "learning_rate": 9.992688226924747e-06, + "loss": 3.0706, + "step": 141400 + }, + { + "epoch": 0.0369240589971363, + "grad_norm": 5.92350435256958, + "learning_rate": 9.992642966104107e-06, + "loss": 3.0198, + "step": 141600 + }, + { + "epoch": 0.03697621162283847, + "grad_norm": 6.555150032043457, + "learning_rate": 9.992597565733341e-06, + "loss": 3.0331, + "step": 141800 + }, + { + "epoch": 0.03702836424854064, + "grad_norm": 6.806456089019775, + "learning_rate": 9.992552025813716e-06, + "loss": 3.042, + "step": 142000 + }, + { + "epoch": 0.03708051687424281, + "grad_norm": 6.848278045654297, + "learning_rate": 9.992506346346505e-06, + "loss": 3.0455, + "step": 142200 + }, + { + "epoch": 0.03713266949994498, + "grad_norm": 6.233341693878174, + "learning_rate": 9.992460527332986e-06, + "loss": 3.0627, + "step": 142400 + }, + { + "epoch": 0.03718482212564715, + "grad_norm": 6.317975044250488, + "learning_rate": 9.992414568774438e-06, + "loss": 3.0447, + "step": 142600 + }, + { + "epoch": 0.03723697475134932, + "grad_norm": 6.190425395965576, + "learning_rate": 9.992368470672146e-06, + "loss": 3.0495, + "step": 142800 + }, + { + "epoch": 0.037289127377051486, + "grad_norm": 6.360509395599365, + "learning_rate": 9.992322233027398e-06, + "loss": 2.9939, + "step": 143000 + }, + { + "epoch": 0.037341280002753656, + "grad_norm": 6.74845552444458, + "learning_rate": 9.99227585584149e-06, + "loss": 2.9787, + "step": 143200 + }, + { + "epoch": 0.037393432628455826, + "grad_norm": 6.610714912414551, + "learning_rate": 9.992229339115714e-06, + "loss": 3.0016, + "step": 143400 + }, + { + "epoch": 0.037445585254158, + "grad_norm": 6.472560405731201, + "learning_rate": 9.99218268285137e-06, + "loss": 2.9786, + "step": 143600 + }, + { + "epoch": 0.03749773787986017, + "grad_norm": 7.01295804977417, + "learning_rate": 9.992135887049766e-06, + "loss": 3.0622, + "step": 143800 + }, + { + "epoch": 0.03754989050556234, + "grad_norm": 5.686848163604736, + "learning_rate": 9.992088951712207e-06, + "loss": 3.0528, + "step": 144000 + }, + { + "epoch": 0.03760204313126451, + "grad_norm": 6.806542873382568, + "learning_rate": 9.992041876840007e-06, + "loss": 3.0256, + "step": 144200 + }, + { + "epoch": 0.03765419575696668, + "grad_norm": 6.5922088623046875, + "learning_rate": 9.991994662434478e-06, + 
"loss": 3.046, + "step": 144400 + }, + { + "epoch": 0.03770634838266885, + "grad_norm": 6.281486988067627, + "learning_rate": 9.991947308496945e-06, + "loss": 3.0209, + "step": 144600 + }, + { + "epoch": 0.03775850100837102, + "grad_norm": 6.36431360244751, + "learning_rate": 9.991899815028728e-06, + "loss": 2.9928, + "step": 144800 + }, + { + "epoch": 0.03781065363407319, + "grad_norm": 6.694343090057373, + "learning_rate": 9.991852182031153e-06, + "loss": 3.0287, + "step": 145000 + }, + { + "epoch": 0.03786280625977536, + "grad_norm": 6.216729164123535, + "learning_rate": 9.991804409505557e-06, + "loss": 3.0095, + "step": 145200 + }, + { + "epoch": 0.03791495888547753, + "grad_norm": 6.089059352874756, + "learning_rate": 9.99175649745327e-06, + "loss": 3.0268, + "step": 145400 + }, + { + "epoch": 0.0379671115111797, + "grad_norm": 5.32235050201416, + "learning_rate": 9.991708445875633e-06, + "loss": 3.066, + "step": 145600 + }, + { + "epoch": 0.03801926413688187, + "grad_norm": 7.004727363586426, + "learning_rate": 9.991660254773989e-06, + "loss": 3.0154, + "step": 145800 + }, + { + "epoch": 0.038071416762584034, + "grad_norm": 5.930303573608398, + "learning_rate": 9.991611924149687e-06, + "loss": 3.0313, + "step": 146000 + }, + { + "epoch": 0.038123569388286205, + "grad_norm": 6.0669145584106445, + "learning_rate": 9.991563454004076e-06, + "loss": 3.0479, + "step": 146200 + }, + { + "epoch": 0.038175722013988375, + "grad_norm": 6.367830276489258, + "learning_rate": 9.991514844338509e-06, + "loss": 3.0386, + "step": 146400 + }, + { + "epoch": 0.038227874639690546, + "grad_norm": 6.8309221267700195, + "learning_rate": 9.991466095154348e-06, + "loss": 3.0073, + "step": 146600 + }, + { + "epoch": 0.038280027265392716, + "grad_norm": 5.97960090637207, + "learning_rate": 9.991417206452953e-06, + "loss": 3.0035, + "step": 146800 + }, + { + "epoch": 0.038332179891094886, + "grad_norm": 6.310183048248291, + "learning_rate": 9.991368178235695e-06, + "loss": 3.0623, + "step": 147000 + }, + { + "epoch": 0.03838433251679706, + "grad_norm": 5.791366100311279, + "learning_rate": 9.991319010503938e-06, + "loss": 2.9937, + "step": 147200 + }, + { + "epoch": 0.03843648514249923, + "grad_norm": 6.281528949737549, + "learning_rate": 9.991269703259061e-06, + "loss": 3.0342, + "step": 147400 + }, + { + "epoch": 0.0384886377682014, + "grad_norm": 6.502980709075928, + "learning_rate": 9.99122025650244e-06, + "loss": 3.0311, + "step": 147600 + }, + { + "epoch": 0.03854079039390357, + "grad_norm": 6.228078842163086, + "learning_rate": 9.991170670235456e-06, + "loss": 3.0259, + "step": 147800 + }, + { + "epoch": 0.03859294301960574, + "grad_norm": 6.6315083503723145, + "learning_rate": 9.9911209444595e-06, + "loss": 3.0557, + "step": 148000 + }, + { + "epoch": 0.03864509564530791, + "grad_norm": 5.9421916007995605, + "learning_rate": 9.991071079175958e-06, + "loss": 3.0471, + "step": 148200 + }, + { + "epoch": 0.03869724827101008, + "grad_norm": 5.794018745422363, + "learning_rate": 9.991021074386222e-06, + "loss": 3.0512, + "step": 148400 + }, + { + "epoch": 0.03874940089671225, + "grad_norm": 6.507724761962891, + "learning_rate": 9.990970930091695e-06, + "loss": 3.0823, + "step": 148600 + }, + { + "epoch": 0.03880155352241441, + "grad_norm": 6.405786514282227, + "learning_rate": 9.990920646293773e-06, + "loss": 2.9923, + "step": 148800 + }, + { + "epoch": 0.03885370614811658, + "grad_norm": 6.925950527191162, + "learning_rate": 9.990870222993867e-06, + "loss": 3.003, + "step": 149000 + }, + { + "epoch": 
0.038905858773818754, + "grad_norm": 6.7786030769348145, + "learning_rate": 9.990819660193383e-06, + "loss": 2.9691, + "step": 149200 + }, + { + "epoch": 0.038958011399520924, + "grad_norm": 6.323172569274902, + "learning_rate": 9.990768957893732e-06, + "loss": 3.0457, + "step": 149400 + }, + { + "epoch": 0.039010164025223094, + "grad_norm": 5.784226894378662, + "learning_rate": 9.990718116096336e-06, + "loss": 3.0644, + "step": 149600 + }, + { + "epoch": 0.039062316650925265, + "grad_norm": 6.236383438110352, + "learning_rate": 9.990667134802616e-06, + "loss": 3.0147, + "step": 149800 + }, + { + "epoch": 0.039114469276627435, + "grad_norm": 6.505099773406982, + "learning_rate": 9.990616014013992e-06, + "loss": 3.0315, + "step": 150000 + }, + { + "epoch": 0.039166621902329606, + "grad_norm": 5.979858875274658, + "learning_rate": 9.990564753731898e-06, + "loss": 2.9863, + "step": 150200 + }, + { + "epoch": 0.039218774528031776, + "grad_norm": 6.28549861907959, + "learning_rate": 9.990513353957765e-06, + "loss": 3.0117, + "step": 150400 + }, + { + "epoch": 0.039270927153733946, + "grad_norm": 6.456376552581787, + "learning_rate": 9.990461814693028e-06, + "loss": 3.0182, + "step": 150600 + }, + { + "epoch": 0.03932307977943612, + "grad_norm": 6.814133644104004, + "learning_rate": 9.99041013593913e-06, + "loss": 2.9962, + "step": 150800 + }, + { + "epoch": 0.03937523240513829, + "grad_norm": 5.923751354217529, + "learning_rate": 9.990358317697513e-06, + "loss": 2.9773, + "step": 151000 + }, + { + "epoch": 0.03942738503084046, + "grad_norm": 6.386137962341309, + "learning_rate": 9.990306359969629e-06, + "loss": 3.0187, + "step": 151200 + }, + { + "epoch": 0.03947953765654263, + "grad_norm": 6.146199703216553, + "learning_rate": 9.990254262756926e-06, + "loss": 3.0153, + "step": 151400 + }, + { + "epoch": 0.0395316902822448, + "grad_norm": 6.410252571105957, + "learning_rate": 9.990202026060864e-06, + "loss": 2.9887, + "step": 151600 + }, + { + "epoch": 0.03958384290794696, + "grad_norm": 5.882859230041504, + "learning_rate": 9.990149649882902e-06, + "loss": 3.026, + "step": 151800 + }, + { + "epoch": 0.03963599553364913, + "grad_norm": 6.177847862243652, + "learning_rate": 9.990097134224503e-06, + "loss": 2.9983, + "step": 152000 + }, + { + "epoch": 0.0396881481593513, + "grad_norm": 6.150598049163818, + "learning_rate": 9.990044479087134e-06, + "loss": 2.9855, + "step": 152200 + }, + { + "epoch": 0.03974030078505347, + "grad_norm": 6.874467372894287, + "learning_rate": 9.989991684472269e-06, + "loss": 3.0343, + "step": 152400 + }, + { + "epoch": 0.03979245341075564, + "grad_norm": 6.260432720184326, + "learning_rate": 9.989938750381383e-06, + "loss": 3.0507, + "step": 152600 + }, + { + "epoch": 0.039844606036457814, + "grad_norm": 6.114156246185303, + "learning_rate": 9.989885676815955e-06, + "loss": 3.0064, + "step": 152800 + }, + { + "epoch": 0.039896758662159984, + "grad_norm": 6.27534294128418, + "learning_rate": 9.989832463777469e-06, + "loss": 2.9889, + "step": 153000 + }, + { + "epoch": 0.039948911287862154, + "grad_norm": 6.149242877960205, + "learning_rate": 9.989779111267411e-06, + "loss": 2.9928, + "step": 153200 + }, + { + "epoch": 0.040001063913564325, + "grad_norm": 6.631002902984619, + "learning_rate": 9.989725619287276e-06, + "loss": 2.9901, + "step": 153400 + }, + { + "epoch": 0.040053216539266495, + "grad_norm": 6.320735454559326, + "learning_rate": 9.989671987838554e-06, + "loss": 3.0054, + "step": 153600 + }, + { + "epoch": 0.040105369164968666, + "grad_norm": 
6.417031764984131, + "learning_rate": 9.989618216922747e-06, + "loss": 3.0356, + "step": 153800 + }, + { + "epoch": 0.040157521790670836, + "grad_norm": 5.854264736175537, + "learning_rate": 9.989564306541359e-06, + "loss": 2.9719, + "step": 154000 + }, + { + "epoch": 0.040209674416373006, + "grad_norm": 6.106746196746826, + "learning_rate": 9.989510256695893e-06, + "loss": 3.0148, + "step": 154200 + }, + { + "epoch": 0.04026182704207518, + "grad_norm": 6.044825553894043, + "learning_rate": 9.989456067387864e-06, + "loss": 2.9664, + "step": 154400 + }, + { + "epoch": 0.04031397966777735, + "grad_norm": 6.020196437835693, + "learning_rate": 9.989401738618785e-06, + "loss": 2.9737, + "step": 154600 + }, + { + "epoch": 0.04036613229347951, + "grad_norm": 5.970254898071289, + "learning_rate": 9.989347270390174e-06, + "loss": 3.0247, + "step": 154800 + }, + { + "epoch": 0.04041828491918168, + "grad_norm": 6.415491104125977, + "learning_rate": 9.989292662703554e-06, + "loss": 3.0039, + "step": 155000 + }, + { + "epoch": 0.04047043754488385, + "grad_norm": 6.5930094718933105, + "learning_rate": 9.98923791556045e-06, + "loss": 2.9937, + "step": 155200 + }, + { + "epoch": 0.04052259017058602, + "grad_norm": 6.674596786499023, + "learning_rate": 9.989183028962395e-06, + "loss": 3.0017, + "step": 155400 + }, + { + "epoch": 0.04057474279628819, + "grad_norm": 6.060914039611816, + "learning_rate": 9.989128002910922e-06, + "loss": 3.0152, + "step": 155600 + }, + { + "epoch": 0.04062689542199036, + "grad_norm": 5.70841121673584, + "learning_rate": 9.989072837407567e-06, + "loss": 3.0362, + "step": 155800 + }, + { + "epoch": 0.04067904804769253, + "grad_norm": 5.961905479431152, + "learning_rate": 9.989017532453876e-06, + "loss": 2.9848, + "step": 156000 + }, + { + "epoch": 0.0407312006733947, + "grad_norm": 5.478226184844971, + "learning_rate": 9.988962088051389e-06, + "loss": 2.9478, + "step": 156200 + }, + { + "epoch": 0.040783353299096874, + "grad_norm": 6.810139179229736, + "learning_rate": 9.98890650420166e-06, + "loss": 3.0032, + "step": 156400 + }, + { + "epoch": 0.040835505924799044, + "grad_norm": 6.793480396270752, + "learning_rate": 9.988850780906242e-06, + "loss": 2.9757, + "step": 156600 + }, + { + "epoch": 0.040887658550501214, + "grad_norm": 6.606081962585449, + "learning_rate": 9.988794918166695e-06, + "loss": 3.0407, + "step": 156800 + }, + { + "epoch": 0.040939811176203385, + "grad_norm": 6.329009532928467, + "learning_rate": 9.988738915984575e-06, + "loss": 2.9683, + "step": 157000 + }, + { + "epoch": 0.040991963801905555, + "grad_norm": 6.828449249267578, + "learning_rate": 9.988682774361451e-06, + "loss": 3.0212, + "step": 157200 + }, + { + "epoch": 0.041044116427607726, + "grad_norm": 7.731756210327148, + "learning_rate": 9.98862649329889e-06, + "loss": 2.9905, + "step": 157400 + }, + { + "epoch": 0.041096269053309896, + "grad_norm": 6.351221561431885, + "learning_rate": 9.98857007279847e-06, + "loss": 2.9857, + "step": 157600 + }, + { + "epoch": 0.04114842167901206, + "grad_norm": 6.527481555938721, + "learning_rate": 9.988513512861761e-06, + "loss": 2.9985, + "step": 157800 + }, + { + "epoch": 0.04120057430471423, + "grad_norm": 6.648956298828125, + "learning_rate": 9.988456813490348e-06, + "loss": 2.972, + "step": 158000 + }, + { + "epoch": 0.0412527269304164, + "grad_norm": 6.268485069274902, + "learning_rate": 9.988399974685815e-06, + "loss": 2.9893, + "step": 158200 + }, + { + "epoch": 0.04130487955611857, + "grad_norm": 6.1307806968688965, + "learning_rate": 
9.988342996449751e-06, + "loss": 2.9647, + "step": 158400 + }, + { + "epoch": 0.04135703218182074, + "grad_norm": 7.105797290802002, + "learning_rate": 9.988285878783748e-06, + "loss": 2.9828, + "step": 158600 + }, + { + "epoch": 0.04140918480752291, + "grad_norm": 6.388288974761963, + "learning_rate": 9.988228621689403e-06, + "loss": 3.0058, + "step": 158800 + }, + { + "epoch": 0.04146133743322508, + "grad_norm": 6.2273850440979, + "learning_rate": 9.988171225168318e-06, + "loss": 2.9956, + "step": 159000 + }, + { + "epoch": 0.04151349005892725, + "grad_norm": 6.558915138244629, + "learning_rate": 9.988113689222094e-06, + "loss": 2.9879, + "step": 159200 + }, + { + "epoch": 0.04156564268462942, + "grad_norm": 5.836716651916504, + "learning_rate": 9.988056013852343e-06, + "loss": 2.9391, + "step": 159400 + }, + { + "epoch": 0.04161779531033159, + "grad_norm": 6.490838050842285, + "learning_rate": 9.987998199060674e-06, + "loss": 3.0215, + "step": 159600 + }, + { + "epoch": 0.04166994793603376, + "grad_norm": 6.8759589195251465, + "learning_rate": 9.987940244848701e-06, + "loss": 3.0393, + "step": 159800 + }, + { + "epoch": 0.041722100561735934, + "grad_norm": 6.900877952575684, + "learning_rate": 9.987882151218052e-06, + "loss": 2.9726, + "step": 160000 + }, + { + "epoch": 0.041774253187438104, + "grad_norm": 6.450899124145508, + "learning_rate": 9.987823918170343e-06, + "loss": 3.0096, + "step": 160200 + }, + { + "epoch": 0.041826405813140274, + "grad_norm": 6.636816024780273, + "learning_rate": 9.987765545707202e-06, + "loss": 2.9626, + "step": 160400 + }, + { + "epoch": 0.04187855843884244, + "grad_norm": 6.257472515106201, + "learning_rate": 9.987707033830266e-06, + "loss": 2.9946, + "step": 160600 + }, + { + "epoch": 0.04193071106454461, + "grad_norm": 7.453077793121338, + "learning_rate": 9.987648382541167e-06, + "loss": 3.0467, + "step": 160800 + }, + { + "epoch": 0.04198286369024678, + "grad_norm": 6.978682041168213, + "learning_rate": 9.987589591841545e-06, + "loss": 2.9799, + "step": 161000 + }, + { + "epoch": 0.04203501631594895, + "grad_norm": 6.9115095138549805, + "learning_rate": 9.98753066173304e-06, + "loss": 3.0193, + "step": 161200 + }, + { + "epoch": 0.04208716894165112, + "grad_norm": 6.3997392654418945, + "learning_rate": 9.987471592217306e-06, + "loss": 3.0198, + "step": 161400 + }, + { + "epoch": 0.04213932156735329, + "grad_norm": 5.600056171417236, + "learning_rate": 9.987412383295988e-06, + "loss": 3.0084, + "step": 161600 + }, + { + "epoch": 0.04219147419305546, + "grad_norm": 6.3253984451293945, + "learning_rate": 9.987353034970743e-06, + "loss": 2.964, + "step": 161800 + }, + { + "epoch": 0.04224362681875763, + "grad_norm": 7.4514384269714355, + "learning_rate": 9.987293547243231e-06, + "loss": 3.0007, + "step": 162000 + }, + { + "epoch": 0.0422957794444598, + "grad_norm": 6.729308605194092, + "learning_rate": 9.987233920115114e-06, + "loss": 2.9636, + "step": 162200 + }, + { + "epoch": 0.04234793207016197, + "grad_norm": 6.6917195320129395, + "learning_rate": 9.987174153588058e-06, + "loss": 2.9993, + "step": 162400 + }, + { + "epoch": 0.04240008469586414, + "grad_norm": 6.379083156585693, + "learning_rate": 9.987114247663734e-06, + "loss": 2.9828, + "step": 162600 + }, + { + "epoch": 0.04245223732156631, + "grad_norm": 7.085773944854736, + "learning_rate": 9.987054202343817e-06, + "loss": 2.9738, + "step": 162800 + }, + { + "epoch": 0.04250438994726848, + "grad_norm": 6.782082557678223, + "learning_rate": 9.986994017629983e-06, + "loss": 2.9808, + "step": 
163000 + }, + { + "epoch": 0.04255654257297065, + "grad_norm": 5.9306182861328125, + "learning_rate": 9.986933693523919e-06, + "loss": 2.981, + "step": 163200 + }, + { + "epoch": 0.04260869519867282, + "grad_norm": 6.100393295288086, + "learning_rate": 9.986873230027305e-06, + "loss": 2.9452, + "step": 163400 + }, + { + "epoch": 0.04266084782437499, + "grad_norm": 6.214852809906006, + "learning_rate": 9.986812627141836e-06, + "loss": 2.9452, + "step": 163600 + }, + { + "epoch": 0.04271300045007716, + "grad_norm": 6.460691928863525, + "learning_rate": 9.986751884869204e-06, + "loss": 2.9781, + "step": 163800 + }, + { + "epoch": 0.04276515307577933, + "grad_norm": 5.98699951171875, + "learning_rate": 9.986691003211106e-06, + "loss": 2.9453, + "step": 164000 + }, + { + "epoch": 0.0428173057014815, + "grad_norm": 6.0534281730651855, + "learning_rate": 9.986629982169244e-06, + "loss": 2.978, + "step": 164200 + }, + { + "epoch": 0.04286945832718367, + "grad_norm": 6.20578145980835, + "learning_rate": 9.986568821745327e-06, + "loss": 2.9576, + "step": 164400 + }, + { + "epoch": 0.04292161095288584, + "grad_norm": 6.547789573669434, + "learning_rate": 9.986507521941058e-06, + "loss": 3.0062, + "step": 164600 + }, + { + "epoch": 0.04297376357858801, + "grad_norm": 6.358675956726074, + "learning_rate": 9.986446082758157e-06, + "loss": 2.9771, + "step": 164800 + }, + { + "epoch": 0.04302591620429018, + "grad_norm": 6.335031509399414, + "learning_rate": 9.986384504198336e-06, + "loss": 2.9796, + "step": 165000 + }, + { + "epoch": 0.04307806882999235, + "grad_norm": 6.8711090087890625, + "learning_rate": 9.98632278626332e-06, + "loss": 2.9605, + "step": 165200 + }, + { + "epoch": 0.04313022145569452, + "grad_norm": 5.9179911613464355, + "learning_rate": 9.986260928954833e-06, + "loss": 2.9654, + "step": 165400 + }, + { + "epoch": 0.04318237408139669, + "grad_norm": 6.011270046234131, + "learning_rate": 9.986198932274601e-06, + "loss": 2.9727, + "step": 165600 + }, + { + "epoch": 0.04323452670709886, + "grad_norm": 6.755000591278076, + "learning_rate": 9.986136796224363e-06, + "loss": 2.9796, + "step": 165800 + }, + { + "epoch": 0.04328667933280103, + "grad_norm": 6.37379264831543, + "learning_rate": 9.986074520805853e-06, + "loss": 3.0062, + "step": 166000 + }, + { + "epoch": 0.0433388319585032, + "grad_norm": 6.6591877937316895, + "learning_rate": 9.986012106020808e-06, + "loss": 2.9866, + "step": 166200 + }, + { + "epoch": 0.04339098458420537, + "grad_norm": 6.0572614669799805, + "learning_rate": 9.985949551870977e-06, + "loss": 3.014, + "step": 166400 + }, + { + "epoch": 0.043443137209907536, + "grad_norm": 5.694347381591797, + "learning_rate": 9.985886858358105e-06, + "loss": 2.9979, + "step": 166600 + }, + { + "epoch": 0.043495289835609706, + "grad_norm": 6.983020782470703, + "learning_rate": 9.98582402548395e-06, + "loss": 2.9455, + "step": 166800 + }, + { + "epoch": 0.043547442461311876, + "grad_norm": 5.710437297821045, + "learning_rate": 9.985761053250261e-06, + "loss": 2.9718, + "step": 167000 + }, + { + "epoch": 0.04359959508701405, + "grad_norm": 6.083151817321777, + "learning_rate": 9.985697941658803e-06, + "loss": 2.9651, + "step": 167200 + }, + { + "epoch": 0.04365174771271622, + "grad_norm": 6.084566116333008, + "learning_rate": 9.985634690711339e-06, + "loss": 2.9767, + "step": 167400 + }, + { + "epoch": 0.04370390033841839, + "grad_norm": 5.805023193359375, + "learning_rate": 9.985571300409637e-06, + "loss": 2.9637, + "step": 167600 + }, + { + "epoch": 0.04375605296412056, + 
"grad_norm": 6.559878826141357, + "learning_rate": 9.985507770755469e-06, + "loss": 2.9828, + "step": 167800 + }, + { + "epoch": 0.04380820558982273, + "grad_norm": 6.1664347648620605, + "learning_rate": 9.98544410175061e-06, + "loss": 2.967, + "step": 168000 + }, + { + "epoch": 0.0438603582155249, + "grad_norm": 6.0472564697265625, + "learning_rate": 9.985380293396839e-06, + "loss": 2.9384, + "step": 168200 + }, + { + "epoch": 0.04391251084122707, + "grad_norm": 6.2360029220581055, + "learning_rate": 9.985316345695941e-06, + "loss": 2.9687, + "step": 168400 + }, + { + "epoch": 0.04396466346692924, + "grad_norm": 6.714407920837402, + "learning_rate": 9.985252258649702e-06, + "loss": 2.9763, + "step": 168600 + }, + { + "epoch": 0.04401681609263141, + "grad_norm": 7.117247104644775, + "learning_rate": 9.985188032259917e-06, + "loss": 2.9749, + "step": 168800 + }, + { + "epoch": 0.04406896871833358, + "grad_norm": 5.8086981773376465, + "learning_rate": 9.985123666528376e-06, + "loss": 2.9501, + "step": 169000 + }, + { + "epoch": 0.04412112134403575, + "grad_norm": 6.109192848205566, + "learning_rate": 9.98505916145688e-06, + "loss": 3.0017, + "step": 169200 + }, + { + "epoch": 0.044173273969737914, + "grad_norm": 6.363615989685059, + "learning_rate": 9.984994517047234e-06, + "loss": 2.9637, + "step": 169400 + }, + { + "epoch": 0.044225426595440084, + "grad_norm": 5.879502773284912, + "learning_rate": 9.984929733301243e-06, + "loss": 2.9474, + "step": 169600 + }, + { + "epoch": 0.044277579221142255, + "grad_norm": 6.141928672790527, + "learning_rate": 9.984864810220717e-06, + "loss": 2.982, + "step": 169800 + }, + { + "epoch": 0.044329731846844425, + "grad_norm": 6.232560157775879, + "learning_rate": 9.984799747807473e-06, + "loss": 2.9348, + "step": 170000 + }, + { + "epoch": 0.044381884472546596, + "grad_norm": 6.826907634735107, + "learning_rate": 9.984734546063328e-06, + "loss": 3.0119, + "step": 170200 + }, + { + "epoch": 0.044434037098248766, + "grad_norm": 6.180233955383301, + "learning_rate": 9.984669204990104e-06, + "loss": 2.9685, + "step": 170400 + }, + { + "epoch": 0.044486189723950936, + "grad_norm": 6.817476272583008, + "learning_rate": 9.98460372458963e-06, + "loss": 2.9706, + "step": 170600 + }, + { + "epoch": 0.04453834234965311, + "grad_norm": 6.815362930297852, + "learning_rate": 9.984538104863732e-06, + "loss": 2.9585, + "step": 170800 + }, + { + "epoch": 0.04459049497535528, + "grad_norm": 6.279226779937744, + "learning_rate": 9.98447234581425e-06, + "loss": 2.9819, + "step": 171000 + }, + { + "epoch": 0.04464264760105745, + "grad_norm": 5.946432590484619, + "learning_rate": 9.984406447443015e-06, + "loss": 2.92, + "step": 171200 + }, + { + "epoch": 0.04469480022675962, + "grad_norm": 6.094913482666016, + "learning_rate": 9.984340409751875e-06, + "loss": 2.9627, + "step": 171400 + }, + { + "epoch": 0.04474695285246179, + "grad_norm": 6.792775630950928, + "learning_rate": 9.984274232742673e-06, + "loss": 2.9564, + "step": 171600 + }, + { + "epoch": 0.04479910547816396, + "grad_norm": 6.441338062286377, + "learning_rate": 9.984207916417257e-06, + "loss": 2.9794, + "step": 171800 + }, + { + "epoch": 0.04485125810386613, + "grad_norm": 6.825250625610352, + "learning_rate": 9.984141460777484e-06, + "loss": 2.9504, + "step": 172000 + }, + { + "epoch": 0.0449034107295683, + "grad_norm": 6.110622882843018, + "learning_rate": 9.98407486582521e-06, + "loss": 2.9836, + "step": 172200 + }, + { + "epoch": 0.04495556335527046, + "grad_norm": 7.110909461975098, + "learning_rate": 
9.984008131562299e-06, + "loss": 2.9565, + "step": 172400 + }, + { + "epoch": 0.04500771598097263, + "grad_norm": 6.6831488609313965, + "learning_rate": 9.983941257990611e-06, + "loss": 2.9093, + "step": 172600 + }, + { + "epoch": 0.045059868606674804, + "grad_norm": 6.000646114349365, + "learning_rate": 9.983874245112019e-06, + "loss": 2.9815, + "step": 172800 + }, + { + "epoch": 0.045112021232376974, + "grad_norm": 5.678177356719971, + "learning_rate": 9.983807092928396e-06, + "loss": 3.0084, + "step": 173000 + }, + { + "epoch": 0.045164173858079144, + "grad_norm": 5.743597030639648, + "learning_rate": 9.983739801441618e-06, + "loss": 2.937, + "step": 173200 + }, + { + "epoch": 0.045216326483781315, + "grad_norm": 6.317743301391602, + "learning_rate": 9.983672370653565e-06, + "loss": 2.9439, + "step": 173400 + }, + { + "epoch": 0.045268479109483485, + "grad_norm": 6.100225448608398, + "learning_rate": 9.983604800566124e-06, + "loss": 2.9455, + "step": 173600 + }, + { + "epoch": 0.045320631735185656, + "grad_norm": 5.574644088745117, + "learning_rate": 9.98353709118118e-06, + "loss": 2.9886, + "step": 173800 + }, + { + "epoch": 0.045372784360887826, + "grad_norm": 6.011674880981445, + "learning_rate": 9.983469242500632e-06, + "loss": 2.9362, + "step": 174000 + }, + { + "epoch": 0.045424936986589996, + "grad_norm": 6.598609447479248, + "learning_rate": 9.98340125452637e-06, + "loss": 2.9851, + "step": 174200 + }, + { + "epoch": 0.04547708961229217, + "grad_norm": 7.741209030151367, + "learning_rate": 9.983333127260299e-06, + "loss": 2.917, + "step": 174400 + }, + { + "epoch": 0.04552924223799434, + "grad_norm": 6.492852687835693, + "learning_rate": 9.98326486070432e-06, + "loss": 2.936, + "step": 174600 + }, + { + "epoch": 0.04558139486369651, + "grad_norm": 6.744225978851318, + "learning_rate": 9.98319645486034e-06, + "loss": 2.953, + "step": 174800 + }, + { + "epoch": 0.04563354748939868, + "grad_norm": 6.86396598815918, + "learning_rate": 9.983127909730275e-06, + "loss": 2.9551, + "step": 175000 + }, + { + "epoch": 0.04568570011510085, + "grad_norm": 6.352712631225586, + "learning_rate": 9.983059225316042e-06, + "loss": 2.9466, + "step": 175200 + }, + { + "epoch": 0.04573785274080301, + "grad_norm": 6.346860408782959, + "learning_rate": 9.982990401619554e-06, + "loss": 2.9497, + "step": 175400 + }, + { + "epoch": 0.04579000536650518, + "grad_norm": 7.074236869812012, + "learning_rate": 9.982921438642739e-06, + "loss": 2.9437, + "step": 175600 + }, + { + "epoch": 0.04584215799220735, + "grad_norm": 5.964405536651611, + "learning_rate": 9.982852336387525e-06, + "loss": 2.9576, + "step": 175800 + }, + { + "epoch": 0.04589431061790952, + "grad_norm": 6.961352348327637, + "learning_rate": 9.982783094855844e-06, + "loss": 2.9587, + "step": 176000 + }, + { + "epoch": 0.04594646324361169, + "grad_norm": 6.538314342498779, + "learning_rate": 9.982713714049627e-06, + "loss": 2.996, + "step": 176200 + }, + { + "epoch": 0.045998615869313864, + "grad_norm": 5.937180519104004, + "learning_rate": 9.98264419397082e-06, + "loss": 2.9536, + "step": 176400 + }, + { + "epoch": 0.046050768495016034, + "grad_norm": 5.624795913696289, + "learning_rate": 9.98257453462136e-06, + "loss": 2.9593, + "step": 176600 + }, + { + "epoch": 0.046102921120718204, + "grad_norm": 6.610598087310791, + "learning_rate": 9.982504736003198e-06, + "loss": 2.9338, + "step": 176800 + }, + { + "epoch": 0.046155073746420375, + "grad_norm": 6.419288158416748, + "learning_rate": 9.982434798118283e-06, + "loss": 2.9388, + "step": 177000 
+ }, + { + "epoch": 0.046207226372122545, + "grad_norm": 6.405141830444336, + "learning_rate": 9.982364720968572e-06, + "loss": 2.9473, + "step": 177200 + }, + { + "epoch": 0.046259378997824716, + "grad_norm": 6.353682994842529, + "learning_rate": 9.98229450455602e-06, + "loss": 2.9132, + "step": 177400 + }, + { + "epoch": 0.046311531623526886, + "grad_norm": 5.974079608917236, + "learning_rate": 9.982224148882593e-06, + "loss": 2.9748, + "step": 177600 + }, + { + "epoch": 0.046363684249229056, + "grad_norm": 5.714940071105957, + "learning_rate": 9.982153653950254e-06, + "loss": 2.9446, + "step": 177800 + }, + { + "epoch": 0.04641583687493123, + "grad_norm": 6.13021183013916, + "learning_rate": 9.982083019760978e-06, + "loss": 2.9442, + "step": 178000 + }, + { + "epoch": 0.0464679895006334, + "grad_norm": 6.623483180999756, + "learning_rate": 9.982012246316737e-06, + "loss": 2.9269, + "step": 178200 + }, + { + "epoch": 0.04652014212633556, + "grad_norm": 6.2891364097595215, + "learning_rate": 9.981941333619509e-06, + "loss": 2.9979, + "step": 178400 + }, + { + "epoch": 0.04657229475203773, + "grad_norm": 6.528567314147949, + "learning_rate": 9.981870281671277e-06, + "loss": 2.9436, + "step": 178600 + }, + { + "epoch": 0.0466244473777399, + "grad_norm": 6.393155574798584, + "learning_rate": 9.981799090474024e-06, + "loss": 2.9763, + "step": 178800 + }, + { + "epoch": 0.04667660000344207, + "grad_norm": 6.487037181854248, + "learning_rate": 9.981727760029745e-06, + "loss": 2.9561, + "step": 179000 + }, + { + "epoch": 0.04672875262914424, + "grad_norm": 5.926519393920898, + "learning_rate": 9.981656290340429e-06, + "loss": 2.9715, + "step": 179200 + }, + { + "epoch": 0.04678090525484641, + "grad_norm": 6.406266689300537, + "learning_rate": 9.981584681408076e-06, + "loss": 2.9343, + "step": 179400 + }, + { + "epoch": 0.04683305788054858, + "grad_norm": 6.141971588134766, + "learning_rate": 9.981512933234688e-06, + "loss": 2.9437, + "step": 179600 + }, + { + "epoch": 0.04688521050625075, + "grad_norm": 5.708194732666016, + "learning_rate": 9.98144104582227e-06, + "loss": 2.927, + "step": 179800 + }, + { + "epoch": 0.046937363131952924, + "grad_norm": 6.583145618438721, + "learning_rate": 9.981369019172829e-06, + "loss": 2.9649, + "step": 180000 + }, + { + "epoch": 0.046989515757655094, + "grad_norm": 6.8080363273620605, + "learning_rate": 9.981296853288382e-06, + "loss": 2.9718, + "step": 180200 + }, + { + "epoch": 0.047041668383357264, + "grad_norm": 5.960728645324707, + "learning_rate": 9.981224548170942e-06, + "loss": 2.9552, + "step": 180400 + }, + { + "epoch": 0.047093821009059435, + "grad_norm": 6.71892786026001, + "learning_rate": 9.981152103822535e-06, + "loss": 2.878, + "step": 180600 + }, + { + "epoch": 0.047145973634761605, + "grad_norm": 6.228333950042725, + "learning_rate": 9.98107952024518e-06, + "loss": 2.9596, + "step": 180800 + }, + { + "epoch": 0.047198126260463776, + "grad_norm": 6.656292915344238, + "learning_rate": 9.981006797440913e-06, + "loss": 2.9251, + "step": 181000 + }, + { + "epoch": 0.04725027888616594, + "grad_norm": 6.142068386077881, + "learning_rate": 9.98093393541176e-06, + "loss": 2.9004, + "step": 181200 + }, + { + "epoch": 0.04730243151186811, + "grad_norm": 6.923412799835205, + "learning_rate": 9.980860934159761e-06, + "loss": 2.9182, + "step": 181400 + }, + { + "epoch": 0.04735458413757028, + "grad_norm": 7.025545597076416, + "learning_rate": 9.980787793686955e-06, + "loss": 2.9484, + "step": 181600 + }, + { + "epoch": 0.04740673676327245, + "grad_norm": 
6.994638442993164, + "learning_rate": 9.980714513995389e-06, + "loss": 2.9093, + "step": 181800 + }, + { + "epoch": 0.04745888938897462, + "grad_norm": 6.052518367767334, + "learning_rate": 9.980641095087107e-06, + "loss": 2.9469, + "step": 182000 + }, + { + "epoch": 0.04751104201467679, + "grad_norm": 6.930753707885742, + "learning_rate": 9.980567536964166e-06, + "loss": 2.916, + "step": 182200 + }, + { + "epoch": 0.04756319464037896, + "grad_norm": 5.495729923248291, + "learning_rate": 9.980493839628618e-06, + "loss": 2.9307, + "step": 182400 + }, + { + "epoch": 0.04761534726608113, + "grad_norm": 5.72206974029541, + "learning_rate": 9.980420003082524e-06, + "loss": 2.9098, + "step": 182600 + }, + { + "epoch": 0.0476674998917833, + "grad_norm": 6.242524147033691, + "learning_rate": 9.980346027327948e-06, + "loss": 2.889, + "step": 182800 + }, + { + "epoch": 0.04771965251748547, + "grad_norm": 6.146420955657959, + "learning_rate": 9.980271912366959e-06, + "loss": 2.942, + "step": 183000 + }, + { + "epoch": 0.04777180514318764, + "grad_norm": 6.690853595733643, + "learning_rate": 9.980197658201627e-06, + "loss": 2.9214, + "step": 183200 + }, + { + "epoch": 0.04782395776888981, + "grad_norm": 5.759744644165039, + "learning_rate": 9.98012326483403e-06, + "loss": 2.9074, + "step": 183400 + }, + { + "epoch": 0.047876110394591984, + "grad_norm": 7.025085926055908, + "learning_rate": 9.980048732266243e-06, + "loss": 2.9566, + "step": 183600 + }, + { + "epoch": 0.047928263020294154, + "grad_norm": 6.400143623352051, + "learning_rate": 9.979974060500353e-06, + "loss": 2.9725, + "step": 183800 + }, + { + "epoch": 0.047980415645996324, + "grad_norm": 6.825561046600342, + "learning_rate": 9.979899249538445e-06, + "loss": 2.8797, + "step": 184000 + }, + { + "epoch": 0.04803256827169849, + "grad_norm": 6.444281578063965, + "learning_rate": 9.979824299382612e-06, + "loss": 2.9194, + "step": 184200 + }, + { + "epoch": 0.04808472089740066, + "grad_norm": 5.93040132522583, + "learning_rate": 9.979749210034948e-06, + "loss": 2.9068, + "step": 184400 + }, + { + "epoch": 0.04813687352310283, + "grad_norm": 5.90882682800293, + "learning_rate": 9.97967398149755e-06, + "loss": 2.894, + "step": 184600 + }, + { + "epoch": 0.048189026148805, + "grad_norm": 5.31546688079834, + "learning_rate": 9.979598613772523e-06, + "loss": 2.9244, + "step": 184800 + }, + { + "epoch": 0.04824117877450717, + "grad_norm": 6.048388957977295, + "learning_rate": 9.979523106861974e-06, + "loss": 2.8599, + "step": 185000 + }, + { + "epoch": 0.04829333140020934, + "grad_norm": 6.296876430511475, + "learning_rate": 9.979447460768012e-06, + "loss": 2.9213, + "step": 185200 + }, + { + "epoch": 0.04834548402591151, + "grad_norm": 6.221283435821533, + "learning_rate": 9.979371675492753e-06, + "loss": 2.9058, + "step": 185400 + }, + { + "epoch": 0.04839763665161368, + "grad_norm": 7.359024524688721, + "learning_rate": 9.979295751038313e-06, + "loss": 2.9192, + "step": 185600 + }, + { + "epoch": 0.04844978927731585, + "grad_norm": 6.607453346252441, + "learning_rate": 9.979219687406816e-06, + "loss": 3.0031, + "step": 185800 + }, + { + "epoch": 0.04850194190301802, + "grad_norm": 6.028778553009033, + "learning_rate": 9.979143484600387e-06, + "loss": 2.9068, + "step": 186000 + }, + { + "epoch": 0.04855409452872019, + "grad_norm": 5.630479335784912, + "learning_rate": 9.979067142621157e-06, + "loss": 2.9278, + "step": 186200 + }, + { + "epoch": 0.04860624715442236, + "grad_norm": 6.049453258514404, + "learning_rate": 9.978990661471257e-06, + 
"loss": 2.891, + "step": 186400 + }, + { + "epoch": 0.04865839978012453, + "grad_norm": 6.467658519744873, + "learning_rate": 9.97891404115283e-06, + "loss": 2.9207, + "step": 186600 + }, + { + "epoch": 0.0487105524058267, + "grad_norm": 6.942977428436279, + "learning_rate": 9.978837281668013e-06, + "loss": 2.9211, + "step": 186800 + }, + { + "epoch": 0.04876270503152887, + "grad_norm": 6.686459541320801, + "learning_rate": 9.978760383018953e-06, + "loss": 2.9207, + "step": 187000 + }, + { + "epoch": 0.04881485765723104, + "grad_norm": 6.126353740692139, + "learning_rate": 9.978683345207802e-06, + "loss": 2.9331, + "step": 187200 + }, + { + "epoch": 0.04886701028293321, + "grad_norm": 6.489861488342285, + "learning_rate": 9.97860616823671e-06, + "loss": 2.9333, + "step": 187400 + }, + { + "epoch": 0.04891916290863538, + "grad_norm": 6.95447301864624, + "learning_rate": 9.978528852107833e-06, + "loss": 2.9839, + "step": 187600 + }, + { + "epoch": 0.04897131553433755, + "grad_norm": 5.979517459869385, + "learning_rate": 9.978451396823334e-06, + "loss": 2.9585, + "step": 187800 + }, + { + "epoch": 0.04902346816003972, + "grad_norm": 6.67393684387207, + "learning_rate": 9.97837380238538e-06, + "loss": 2.9367, + "step": 188000 + }, + { + "epoch": 0.04907562078574189, + "grad_norm": 5.634337425231934, + "learning_rate": 9.978296068796138e-06, + "loss": 2.9027, + "step": 188200 + }, + { + "epoch": 0.04912777341144406, + "grad_norm": 6.696192264556885, + "learning_rate": 9.97821819605778e-06, + "loss": 2.8881, + "step": 188400 + }, + { + "epoch": 0.04917992603714623, + "grad_norm": 5.69584321975708, + "learning_rate": 9.978140184172482e-06, + "loss": 2.9149, + "step": 188600 + }, + { + "epoch": 0.0492320786628484, + "grad_norm": 6.718496322631836, + "learning_rate": 9.978062033142429e-06, + "loss": 2.8973, + "step": 188800 + }, + { + "epoch": 0.04928423128855057, + "grad_norm": 6.430253505706787, + "learning_rate": 9.977983742969798e-06, + "loss": 2.9127, + "step": 189000 + }, + { + "epoch": 0.04933638391425274, + "grad_norm": 6.769885063171387, + "learning_rate": 9.977905313656785e-06, + "loss": 2.9418, + "step": 189200 + }, + { + "epoch": 0.04938853653995491, + "grad_norm": 6.998106956481934, + "learning_rate": 9.977826745205578e-06, + "loss": 2.9484, + "step": 189400 + }, + { + "epoch": 0.04944068916565708, + "grad_norm": 6.269783020019531, + "learning_rate": 9.977748037618374e-06, + "loss": 2.9273, + "step": 189600 + }, + { + "epoch": 0.04949284179135925, + "grad_norm": 6.623769283294678, + "learning_rate": 9.977669190897372e-06, + "loss": 2.9459, + "step": 189800 + }, + { + "epoch": 0.04954499441706142, + "grad_norm": 6.445021152496338, + "learning_rate": 9.977590205044776e-06, + "loss": 2.932, + "step": 190000 + }, + { + "epoch": 0.049597147042763585, + "grad_norm": 6.26726770401001, + "learning_rate": 9.977511080062797e-06, + "loss": 2.9177, + "step": 190200 + }, + { + "epoch": 0.049649299668465756, + "grad_norm": 5.96261739730835, + "learning_rate": 9.977431815953642e-06, + "loss": 2.9476, + "step": 190400 + }, + { + "epoch": 0.049701452294167926, + "grad_norm": 6.362253189086914, + "learning_rate": 9.977352412719528e-06, + "loss": 2.9598, + "step": 190600 + }, + { + "epoch": 0.0497536049198701, + "grad_norm": 6.155995845794678, + "learning_rate": 9.977272870362676e-06, + "loss": 2.8895, + "step": 190800 + }, + { + "epoch": 0.04980575754557227, + "grad_norm": 6.854986667633057, + "learning_rate": 9.977193188885307e-06, + "loss": 2.9139, + "step": 191000 + }, + { + "epoch": 
0.04985791017127444, + "grad_norm": 5.611915588378906, + "learning_rate": 9.97711336828965e-06, + "loss": 2.9157, + "step": 191200 + }, + { + "epoch": 0.04991006279697661, + "grad_norm": 6.135472774505615, + "learning_rate": 9.977033408577936e-06, + "loss": 2.8925, + "step": 191400 + }, + { + "epoch": 0.04996221542267878, + "grad_norm": 6.853118419647217, + "learning_rate": 9.976953309752401e-06, + "loss": 2.9362, + "step": 191600 + }, + { + "epoch": 0.05001436804838095, + "grad_norm": 6.357486248016357, + "learning_rate": 9.97687307181528e-06, + "loss": 2.9344, + "step": 191800 + }, + { + "epoch": 0.05006652067408312, + "grad_norm": 6.773046016693115, + "learning_rate": 9.97679269476882e-06, + "loss": 2.9625, + "step": 192000 + }, + { + "epoch": 0.05011867329978529, + "grad_norm": 6.14764404296875, + "learning_rate": 9.976712178615264e-06, + "loss": 2.9519, + "step": 192200 + }, + { + "epoch": 0.05017082592548746, + "grad_norm": 6.529992580413818, + "learning_rate": 9.976631523356866e-06, + "loss": 2.9013, + "step": 192400 + }, + { + "epoch": 0.05022297855118963, + "grad_norm": 5.915500164031982, + "learning_rate": 9.976550728995877e-06, + "loss": 2.9096, + "step": 192600 + }, + { + "epoch": 0.0502751311768918, + "grad_norm": 6.39959716796875, + "learning_rate": 9.976469795534557e-06, + "loss": 2.9478, + "step": 192800 + }, + { + "epoch": 0.050327283802593964, + "grad_norm": 6.220710277557373, + "learning_rate": 9.976388722975169e-06, + "loss": 2.9282, + "step": 193000 + }, + { + "epoch": 0.050379436428296134, + "grad_norm": 5.63620662689209, + "learning_rate": 9.976307511319979e-06, + "loss": 2.9344, + "step": 193200 + }, + { + "epoch": 0.050431589053998305, + "grad_norm": 6.4898600578308105, + "learning_rate": 9.976226160571254e-06, + "loss": 2.9223, + "step": 193400 + }, + { + "epoch": 0.050483741679700475, + "grad_norm": 5.993746757507324, + "learning_rate": 9.97614467073127e-06, + "loss": 2.9314, + "step": 193600 + }, + { + "epoch": 0.050535894305402645, + "grad_norm": 6.51394510269165, + "learning_rate": 9.976063041802306e-06, + "loss": 2.9471, + "step": 193800 + }, + { + "epoch": 0.050588046931104816, + "grad_norm": 5.766570091247559, + "learning_rate": 9.975981273786643e-06, + "loss": 2.954, + "step": 194000 + }, + { + "epoch": 0.050640199556806986, + "grad_norm": 6.516523838043213, + "learning_rate": 9.975899366686564e-06, + "loss": 2.9265, + "step": 194200 + }, + { + "epoch": 0.05069235218250916, + "grad_norm": 6.924788475036621, + "learning_rate": 9.975817320504362e-06, + "loss": 2.8861, + "step": 194400 + }, + { + "epoch": 0.05074450480821133, + "grad_norm": 6.611748218536377, + "learning_rate": 9.975735135242328e-06, + "loss": 2.9391, + "step": 194600 + }, + { + "epoch": 0.0507966574339135, + "grad_norm": 6.635772228240967, + "learning_rate": 9.975652810902759e-06, + "loss": 2.9112, + "step": 194800 + }, + { + "epoch": 0.05084881005961567, + "grad_norm": 6.5964741706848145, + "learning_rate": 9.975570347487958e-06, + "loss": 2.8749, + "step": 195000 + }, + { + "epoch": 0.05090096268531784, + "grad_norm": 5.7668986320495605, + "learning_rate": 9.975487745000228e-06, + "loss": 2.9284, + "step": 195200 + }, + { + "epoch": 0.05095311531102001, + "grad_norm": 6.671104907989502, + "learning_rate": 9.975405003441877e-06, + "loss": 2.924, + "step": 195400 + }, + { + "epoch": 0.05100526793672218, + "grad_norm": 6.551841735839844, + "learning_rate": 9.975322122815221e-06, + "loss": 2.9267, + "step": 195600 + }, + { + "epoch": 0.05105742056242435, + "grad_norm": 6.587834358215332, + 
"learning_rate": 9.975239103122576e-06, + "loss": 2.9144, + "step": 195800 + }, + { + "epoch": 0.05110957318812651, + "grad_norm": 6.745659828186035, + "learning_rate": 9.97515594436626e-06, + "loss": 2.9037, + "step": 196000 + }, + { + "epoch": 0.05116172581382868, + "grad_norm": 6.943085670471191, + "learning_rate": 9.975072646548597e-06, + "loss": 2.9468, + "step": 196200 + }, + { + "epoch": 0.051213878439530854, + "grad_norm": 6.360815525054932, + "learning_rate": 9.97498920967192e-06, + "loss": 2.9095, + "step": 196400 + }, + { + "epoch": 0.051266031065233024, + "grad_norm": 6.90670108795166, + "learning_rate": 9.974905633738559e-06, + "loss": 2.8835, + "step": 196600 + }, + { + "epoch": 0.051318183690935194, + "grad_norm": 5.816658020019531, + "learning_rate": 9.974821918750846e-06, + "loss": 2.9313, + "step": 196800 + }, + { + "epoch": 0.051370336316637365, + "grad_norm": 6.233535289764404, + "learning_rate": 9.974738064711125e-06, + "loss": 2.9081, + "step": 197000 + }, + { + "epoch": 0.051422488942339535, + "grad_norm": 6.274383068084717, + "learning_rate": 9.97465407162174e-06, + "loss": 2.9143, + "step": 197200 + }, + { + "epoch": 0.051474641568041705, + "grad_norm": 6.0712738037109375, + "learning_rate": 9.974569939485038e-06, + "loss": 2.9224, + "step": 197400 + }, + { + "epoch": 0.051526794193743876, + "grad_norm": 6.007919788360596, + "learning_rate": 9.974485668303369e-06, + "loss": 2.9035, + "step": 197600 + }, + { + "epoch": 0.051578946819446046, + "grad_norm": 6.441709518432617, + "learning_rate": 9.974401258079092e-06, + "loss": 2.938, + "step": 197800 + }, + { + "epoch": 0.05163109944514822, + "grad_norm": 5.86509895324707, + "learning_rate": 9.974316708814562e-06, + "loss": 2.9291, + "step": 198000 + }, + { + "epoch": 0.05168325207085039, + "grad_norm": 6.994657039642334, + "learning_rate": 9.974232020512144e-06, + "loss": 2.8551, + "step": 198200 + }, + { + "epoch": 0.05173540469655256, + "grad_norm": 6.415815353393555, + "learning_rate": 9.974147193174205e-06, + "loss": 2.9468, + "step": 198400 + }, + { + "epoch": 0.05178755732225473, + "grad_norm": 5.555944919586182, + "learning_rate": 9.97406222680312e-06, + "loss": 2.8942, + "step": 198600 + }, + { + "epoch": 0.0518397099479569, + "grad_norm": 6.089178085327148, + "learning_rate": 9.973977121401258e-06, + "loss": 2.9051, + "step": 198800 + }, + { + "epoch": 0.05189186257365906, + "grad_norm": 6.0938496589660645, + "learning_rate": 9.973891876971e-06, + "loss": 2.9038, + "step": 199000 + }, + { + "epoch": 0.05194401519936123, + "grad_norm": 6.132476806640625, + "learning_rate": 9.97380649351473e-06, + "loss": 2.912, + "step": 199200 + }, + { + "epoch": 0.0519961678250634, + "grad_norm": 6.650424957275391, + "learning_rate": 9.973720971034834e-06, + "loss": 2.9345, + "step": 199400 + }, + { + "epoch": 0.05204832045076557, + "grad_norm": 5.777918338775635, + "learning_rate": 9.9736353095337e-06, + "loss": 2.9157, + "step": 199600 + }, + { + "epoch": 0.05210047307646774, + "grad_norm": 6.441890239715576, + "learning_rate": 9.973549509013727e-06, + "loss": 2.8985, + "step": 199800 + }, + { + "epoch": 0.052152625702169914, + "grad_norm": 6.862900733947754, + "learning_rate": 9.973463569477309e-06, + "loss": 2.9326, + "step": 200000 + }, + { + "epoch": 0.052204778327872084, + "grad_norm": 6.423320770263672, + "learning_rate": 9.973377490926848e-06, + "loss": 2.8986, + "step": 200200 + }, + { + "epoch": 0.052256930953574254, + "grad_norm": 6.303603649139404, + "learning_rate": 9.973291273364754e-06, + "loss": 2.885, + 
"step": 200400 + }, + { + "epoch": 0.052309083579276425, + "grad_norm": 6.71627140045166, + "learning_rate": 9.973204916793433e-06, + "loss": 2.8799, + "step": 200600 + }, + { + "epoch": 0.052361236204978595, + "grad_norm": 6.168727397918701, + "learning_rate": 9.9731184212153e-06, + "loss": 2.8999, + "step": 200800 + }, + { + "epoch": 0.052413388830680765, + "grad_norm": 6.115705966949463, + "learning_rate": 9.973031786632773e-06, + "loss": 2.9156, + "step": 201000 + }, + { + "epoch": 0.052465541456382936, + "grad_norm": 5.8287200927734375, + "learning_rate": 9.972945013048275e-06, + "loss": 2.8945, + "step": 201200 + }, + { + "epoch": 0.052517694082085106, + "grad_norm": 6.276069164276123, + "learning_rate": 9.972858100464229e-06, + "loss": 2.9383, + "step": 201400 + }, + { + "epoch": 0.05256984670778728, + "grad_norm": 5.857385635375977, + "learning_rate": 9.972771048883065e-06, + "loss": 2.895, + "step": 201600 + }, + { + "epoch": 0.05262199933348944, + "grad_norm": 7.052547931671143, + "learning_rate": 9.972683858307217e-06, + "loss": 2.8879, + "step": 201800 + }, + { + "epoch": 0.05267415195919161, + "grad_norm": 6.282922267913818, + "learning_rate": 9.97259652873912e-06, + "loss": 2.8945, + "step": 202000 + }, + { + "epoch": 0.05272630458489378, + "grad_norm": 6.320275783538818, + "learning_rate": 9.972509060181218e-06, + "loss": 2.9122, + "step": 202200 + }, + { + "epoch": 0.05277845721059595, + "grad_norm": 5.877612113952637, + "learning_rate": 9.972421452635954e-06, + "loss": 2.8862, + "step": 202400 + }, + { + "epoch": 0.05283060983629812, + "grad_norm": 6.659661769866943, + "learning_rate": 9.972333706105777e-06, + "loss": 2.9418, + "step": 202600 + }, + { + "epoch": 0.05288276246200029, + "grad_norm": 6.702484130859375, + "learning_rate": 9.972245820593138e-06, + "loss": 2.9173, + "step": 202800 + }, + { + "epoch": 0.05293491508770246, + "grad_norm": 6.761118412017822, + "learning_rate": 9.972157796100497e-06, + "loss": 2.9208, + "step": 203000 + }, + { + "epoch": 0.05298706771340463, + "grad_norm": 5.8187642097473145, + "learning_rate": 9.972069632630312e-06, + "loss": 2.8982, + "step": 203200 + }, + { + "epoch": 0.0530392203391068, + "grad_norm": 6.023776531219482, + "learning_rate": 9.971981330185047e-06, + "loss": 2.9081, + "step": 203400 + }, + { + "epoch": 0.053091372964808974, + "grad_norm": 6.4479451179504395, + "learning_rate": 9.971892888767172e-06, + "loss": 2.9146, + "step": 203600 + }, + { + "epoch": 0.053143525590511144, + "grad_norm": 6.822802543640137, + "learning_rate": 9.971804308379156e-06, + "loss": 2.8861, + "step": 203800 + }, + { + "epoch": 0.053195678216213314, + "grad_norm": 6.120795726776123, + "learning_rate": 9.971715589023478e-06, + "loss": 2.8868, + "step": 204000 + }, + { + "epoch": 0.053247830841915485, + "grad_norm": 5.959011077880859, + "learning_rate": 9.971626730702617e-06, + "loss": 2.886, + "step": 204200 + }, + { + "epoch": 0.053299983467617655, + "grad_norm": 5.855833530426025, + "learning_rate": 9.971537733419057e-06, + "loss": 2.9224, + "step": 204400 + }, + { + "epoch": 0.053352136093319825, + "grad_norm": 6.570242881774902, + "learning_rate": 9.971448597175284e-06, + "loss": 2.9392, + "step": 204600 + }, + { + "epoch": 0.05340428871902199, + "grad_norm": 5.748819828033447, + "learning_rate": 9.97135932197379e-06, + "loss": 2.8955, + "step": 204800 + }, + { + "epoch": 0.05345644134472416, + "grad_norm": 6.4593000411987305, + "learning_rate": 9.97126990781707e-06, + "loss": 2.878, + "step": 205000 + }, + { + "epoch": 
0.05350859397042633, + "grad_norm": 5.999743461608887, + "learning_rate": 9.971180354707627e-06, + "loss": 2.9366, + "step": 205200 + }, + { + "epoch": 0.0535607465961285, + "grad_norm": 6.419029235839844, + "learning_rate": 9.97109066264796e-06, + "loss": 2.9191, + "step": 205400 + }, + { + "epoch": 0.05361289922183067, + "grad_norm": 6.093469142913818, + "learning_rate": 9.971000831640576e-06, + "loss": 2.925, + "step": 205600 + }, + { + "epoch": 0.05366505184753284, + "grad_norm": 6.597023010253906, + "learning_rate": 9.970910861687988e-06, + "loss": 2.905, + "step": 205800 + }, + { + "epoch": 0.05371720447323501, + "grad_norm": 6.599836349487305, + "learning_rate": 9.97082075279271e-06, + "loss": 2.8901, + "step": 206000 + }, + { + "epoch": 0.05376935709893718, + "grad_norm": 7.024128437042236, + "learning_rate": 9.970730504957258e-06, + "loss": 2.8819, + "step": 206200 + }, + { + "epoch": 0.05382150972463935, + "grad_norm": 5.909964561462402, + "learning_rate": 9.970640118184158e-06, + "loss": 2.9062, + "step": 206400 + }, + { + "epoch": 0.05387366235034152, + "grad_norm": 6.118581771850586, + "learning_rate": 9.970549592475936e-06, + "loss": 2.9099, + "step": 206600 + }, + { + "epoch": 0.05392581497604369, + "grad_norm": 5.8928327560424805, + "learning_rate": 9.970458927835122e-06, + "loss": 2.8966, + "step": 206800 + }, + { + "epoch": 0.05397796760174586, + "grad_norm": 6.539867877960205, + "learning_rate": 9.970368124264249e-06, + "loss": 2.9375, + "step": 207000 + }, + { + "epoch": 0.054030120227448034, + "grad_norm": 6.851650714874268, + "learning_rate": 9.970277181765858e-06, + "loss": 2.863, + "step": 207200 + }, + { + "epoch": 0.054082272853150204, + "grad_norm": 6.393698215484619, + "learning_rate": 9.970186100342486e-06, + "loss": 2.9317, + "step": 207400 + }, + { + "epoch": 0.054134425478852374, + "grad_norm": 5.992748260498047, + "learning_rate": 9.970094879996683e-06, + "loss": 2.8793, + "step": 207600 + }, + { + "epoch": 0.05418657810455454, + "grad_norm": 5.8636298179626465, + "learning_rate": 9.970003520730997e-06, + "loss": 2.925, + "step": 207800 + }, + { + "epoch": 0.05423873073025671, + "grad_norm": 6.160617351531982, + "learning_rate": 9.96991202254798e-06, + "loss": 2.9411, + "step": 208000 + }, + { + "epoch": 0.05429088335595888, + "grad_norm": 7.532545566558838, + "learning_rate": 9.969820385450195e-06, + "loss": 2.8771, + "step": 208200 + }, + { + "epoch": 0.05434303598166105, + "grad_norm": 6.879082202911377, + "learning_rate": 9.969728609440197e-06, + "loss": 2.9334, + "step": 208400 + }, + { + "epoch": 0.05439518860736322, + "grad_norm": 6.399229049682617, + "learning_rate": 9.969636694520556e-06, + "loss": 2.8777, + "step": 208600 + }, + { + "epoch": 0.05444734123306539, + "grad_norm": 6.303351402282715, + "learning_rate": 9.969544640693838e-06, + "loss": 2.8949, + "step": 208800 + }, + { + "epoch": 0.05449949385876756, + "grad_norm": 6.815507888793945, + "learning_rate": 9.969452447962617e-06, + "loss": 2.876, + "step": 209000 + }, + { + "epoch": 0.05455164648446973, + "grad_norm": 6.317996978759766, + "learning_rate": 9.969360116329472e-06, + "loss": 2.8672, + "step": 209200 + }, + { + "epoch": 0.0546037991101719, + "grad_norm": 6.783117294311523, + "learning_rate": 9.96926764579698e-06, + "loss": 2.8434, + "step": 209400 + }, + { + "epoch": 0.05465595173587407, + "grad_norm": 6.445932865142822, + "learning_rate": 9.969175036367728e-06, + "loss": 2.9331, + "step": 209600 + }, + { + "epoch": 0.05470810436157624, + "grad_norm": 6.53240966796875, + 
"learning_rate": 9.969082288044304e-06, + "loss": 2.9213, + "step": 209800 + }, + { + "epoch": 0.05476025698727841, + "grad_norm": 6.389587879180908, + "learning_rate": 9.968989400829301e-06, + "loss": 2.9206, + "step": 210000 + }, + { + "epoch": 0.05481240961298058, + "grad_norm": 6.929988384246826, + "learning_rate": 9.968896374725314e-06, + "loss": 2.8684, + "step": 210200 + }, + { + "epoch": 0.05486456223868275, + "grad_norm": 6.338830471038818, + "learning_rate": 9.968803209734944e-06, + "loss": 2.8601, + "step": 210400 + }, + { + "epoch": 0.05491671486438492, + "grad_norm": 6.379602909088135, + "learning_rate": 9.968709905860796e-06, + "loss": 2.8824, + "step": 210600 + }, + { + "epoch": 0.05496886749008709, + "grad_norm": 6.1573710441589355, + "learning_rate": 9.968616463105476e-06, + "loss": 2.85, + "step": 210800 + }, + { + "epoch": 0.05502102011578926, + "grad_norm": 5.6570611000061035, + "learning_rate": 9.968522881471599e-06, + "loss": 2.8345, + "step": 211000 + }, + { + "epoch": 0.05507317274149143, + "grad_norm": 6.473025798797607, + "learning_rate": 9.968429160961776e-06, + "loss": 2.9062, + "step": 211200 + }, + { + "epoch": 0.0551253253671936, + "grad_norm": 5.675154209136963, + "learning_rate": 9.968335301578629e-06, + "loss": 2.9005, + "step": 211400 + }, + { + "epoch": 0.05517747799289577, + "grad_norm": 6.401944160461426, + "learning_rate": 9.968241303324783e-06, + "loss": 2.8383, + "step": 211600 + }, + { + "epoch": 0.05522963061859794, + "grad_norm": 5.85010290145874, + "learning_rate": 9.968147166202864e-06, + "loss": 2.8892, + "step": 211800 + }, + { + "epoch": 0.05528178324430011, + "grad_norm": 5.4949517250061035, + "learning_rate": 9.968052890215502e-06, + "loss": 2.8813, + "step": 212000 + }, + { + "epoch": 0.05533393587000228, + "grad_norm": 6.284722328186035, + "learning_rate": 9.967958475365334e-06, + "loss": 2.877, + "step": 212200 + }, + { + "epoch": 0.05538608849570445, + "grad_norm": 6.463416576385498, + "learning_rate": 9.967863921654998e-06, + "loss": 2.8947, + "step": 212400 + }, + { + "epoch": 0.05543824112140662, + "grad_norm": 6.696000576019287, + "learning_rate": 9.967769229087138e-06, + "loss": 2.8707, + "step": 212600 + }, + { + "epoch": 0.05549039374710879, + "grad_norm": 6.760770320892334, + "learning_rate": 9.9676743976644e-06, + "loss": 2.8864, + "step": 212800 + }, + { + "epoch": 0.05554254637281096, + "grad_norm": 5.885373115539551, + "learning_rate": 9.967579427389434e-06, + "loss": 2.8919, + "step": 213000 + }, + { + "epoch": 0.05559469899851313, + "grad_norm": 5.777275085449219, + "learning_rate": 9.967484318264895e-06, + "loss": 2.8508, + "step": 213200 + }, + { + "epoch": 0.0556468516242153, + "grad_norm": 6.474213123321533, + "learning_rate": 9.967389070293442e-06, + "loss": 2.875, + "step": 213400 + }, + { + "epoch": 0.055699004249917465, + "grad_norm": 6.556694030761719, + "learning_rate": 9.967293683477737e-06, + "loss": 2.9205, + "step": 213600 + }, + { + "epoch": 0.055751156875619635, + "grad_norm": 6.6564202308654785, + "learning_rate": 9.967198157820445e-06, + "loss": 2.8501, + "step": 213800 + }, + { + "epoch": 0.055803309501321806, + "grad_norm": 6.465703964233398, + "learning_rate": 9.967102493324239e-06, + "loss": 2.9072, + "step": 214000 + }, + { + "epoch": 0.055855462127023976, + "grad_norm": 6.2116780281066895, + "learning_rate": 9.96700668999179e-06, + "loss": 2.8542, + "step": 214200 + }, + { + "epoch": 0.05590761475272615, + "grad_norm": 6.195470333099365, + "learning_rate": 9.966910747825775e-06, + "loss": 2.8891, + 
"step": 214400 + }, + { + "epoch": 0.05595976737842832, + "grad_norm": 6.002094745635986, + "learning_rate": 9.96681466682888e-06, + "loss": 2.8885, + "step": 214600 + }, + { + "epoch": 0.05601192000413049, + "grad_norm": 6.3701653480529785, + "learning_rate": 9.966718447003787e-06, + "loss": 2.9023, + "step": 214800 + }, + { + "epoch": 0.05606407262983266, + "grad_norm": 6.410655975341797, + "learning_rate": 9.966622088353189e-06, + "loss": 2.9003, + "step": 215000 + }, + { + "epoch": 0.05611622525553483, + "grad_norm": 6.293005466461182, + "learning_rate": 9.966525590879774e-06, + "loss": 2.8997, + "step": 215200 + }, + { + "epoch": 0.056168377881237, + "grad_norm": 7.010239124298096, + "learning_rate": 9.966428954586243e-06, + "loss": 2.9075, + "step": 215400 + }, + { + "epoch": 0.05622053050693917, + "grad_norm": 7.058774948120117, + "learning_rate": 9.966332179475296e-06, + "loss": 2.8649, + "step": 215600 + }, + { + "epoch": 0.05627268313264134, + "grad_norm": 6.797397613525391, + "learning_rate": 9.966235265549637e-06, + "loss": 2.8828, + "step": 215800 + }, + { + "epoch": 0.05632483575834351, + "grad_norm": 6.849272727966309, + "learning_rate": 9.96613821281198e-06, + "loss": 2.8903, + "step": 216000 + }, + { + "epoch": 0.05637698838404568, + "grad_norm": 5.813775062561035, + "learning_rate": 9.96604102126503e-06, + "loss": 2.8662, + "step": 216200 + }, + { + "epoch": 0.05642914100974785, + "grad_norm": 6.535881996154785, + "learning_rate": 9.965943690911509e-06, + "loss": 2.8643, + "step": 216400 + }, + { + "epoch": 0.056481293635450014, + "grad_norm": 6.4796576499938965, + "learning_rate": 9.965846221754135e-06, + "loss": 2.8863, + "step": 216600 + }, + { + "epoch": 0.056533446261152184, + "grad_norm": 6.315826892852783, + "learning_rate": 9.965748613795633e-06, + "loss": 2.8721, + "step": 216800 + }, + { + "epoch": 0.056585598886854355, + "grad_norm": 6.6356635093688965, + "learning_rate": 9.965650867038732e-06, + "loss": 2.8823, + "step": 217000 + }, + { + "epoch": 0.056637751512556525, + "grad_norm": 6.826320648193359, + "learning_rate": 9.965552981486163e-06, + "loss": 2.8792, + "step": 217200 + }, + { + "epoch": 0.056689904138258695, + "grad_norm": 5.991021633148193, + "learning_rate": 9.965454957140665e-06, + "loss": 2.8717, + "step": 217400 + }, + { + "epoch": 0.056742056763960866, + "grad_norm": 5.6707048416137695, + "learning_rate": 9.965356794004974e-06, + "loss": 2.898, + "step": 217600 + }, + { + "epoch": 0.056794209389663036, + "grad_norm": 6.135042667388916, + "learning_rate": 9.965258492081835e-06, + "loss": 2.862, + "step": 217800 + }, + { + "epoch": 0.05684636201536521, + "grad_norm": 6.86878776550293, + "learning_rate": 9.965160051373996e-06, + "loss": 2.8626, + "step": 218000 + }, + { + "epoch": 0.05689851464106738, + "grad_norm": 6.301908016204834, + "learning_rate": 9.96506147188421e-06, + "loss": 2.865, + "step": 218200 + }, + { + "epoch": 0.05695066726676955, + "grad_norm": 6.453223705291748, + "learning_rate": 9.96496275361523e-06, + "loss": 2.8613, + "step": 218400 + }, + { + "epoch": 0.05700281989247172, + "grad_norm": 6.500892639160156, + "learning_rate": 9.964863896569816e-06, + "loss": 2.8533, + "step": 218600 + }, + { + "epoch": 0.05705497251817389, + "grad_norm": 5.618515491485596, + "learning_rate": 9.964764900750731e-06, + "loss": 2.872, + "step": 218800 + }, + { + "epoch": 0.05710712514387606, + "grad_norm": 6.596179485321045, + "learning_rate": 9.964665766160744e-06, + "loss": 2.8752, + "step": 219000 + }, + { + "epoch": 0.05715927776957823, + 
"grad_norm": 6.309721946716309, + "learning_rate": 9.964566492802623e-06, + "loss": 2.8882, + "step": 219200 + }, + { + "epoch": 0.0572114303952804, + "grad_norm": 6.222306251525879, + "learning_rate": 9.964467080679145e-06, + "loss": 2.8676, + "step": 219400 + }, + { + "epoch": 0.05726358302098256, + "grad_norm": 6.931632041931152, + "learning_rate": 9.964367529793086e-06, + "loss": 2.8618, + "step": 219600 + }, + { + "epoch": 0.05731573564668473, + "grad_norm": 6.057002544403076, + "learning_rate": 9.964267840147232e-06, + "loss": 2.8378, + "step": 219800 + }, + { + "epoch": 0.057367888272386904, + "grad_norm": 6.065264701843262, + "learning_rate": 9.964168011744367e-06, + "loss": 2.9028, + "step": 220000 + }, + { + "epoch": 0.057420040898089074, + "grad_norm": 6.861353874206543, + "learning_rate": 9.964068044587283e-06, + "loss": 2.8933, + "step": 220200 + }, + { + "epoch": 0.057472193523791244, + "grad_norm": 6.993216037750244, + "learning_rate": 9.963967938678774e-06, + "loss": 2.8741, + "step": 220400 + }, + { + "epoch": 0.057524346149493415, + "grad_norm": 6.941470146179199, + "learning_rate": 9.963867694021637e-06, + "loss": 2.8534, + "step": 220600 + }, + { + "epoch": 0.057576498775195585, + "grad_norm": 6.341226100921631, + "learning_rate": 9.963767310618673e-06, + "loss": 2.8837, + "step": 220800 + }, + { + "epoch": 0.057628651400897755, + "grad_norm": 6.165623188018799, + "learning_rate": 9.96366678847269e-06, + "loss": 2.9066, + "step": 221000 + }, + { + "epoch": 0.057680804026599926, + "grad_norm": 5.758754730224609, + "learning_rate": 9.963566127586497e-06, + "loss": 2.8871, + "step": 221200 + }, + { + "epoch": 0.057732956652302096, + "grad_norm": 6.3193206787109375, + "learning_rate": 9.963465327962907e-06, + "loss": 2.852, + "step": 221400 + }, + { + "epoch": 0.05778510927800427, + "grad_norm": 6.58033561706543, + "learning_rate": 9.963364389604739e-06, + "loss": 2.8514, + "step": 221600 + }, + { + "epoch": 0.05783726190370644, + "grad_norm": 5.843617916107178, + "learning_rate": 9.963263312514812e-06, + "loss": 2.8494, + "step": 221800 + }, + { + "epoch": 0.05788941452940861, + "grad_norm": 6.093498706817627, + "learning_rate": 9.963162096695954e-06, + "loss": 2.9139, + "step": 222000 + }, + { + "epoch": 0.05794156715511078, + "grad_norm": 6.31635856628418, + "learning_rate": 9.963060742150992e-06, + "loss": 2.8631, + "step": 222200 + }, + { + "epoch": 0.05799371978081295, + "grad_norm": 6.643308639526367, + "learning_rate": 9.96295924888276e-06, + "loss": 2.9046, + "step": 222400 + }, + { + "epoch": 0.05804587240651511, + "grad_norm": 6.348966121673584, + "learning_rate": 9.962857616894095e-06, + "loss": 2.87, + "step": 222600 + }, + { + "epoch": 0.05809802503221728, + "grad_norm": 6.411705017089844, + "learning_rate": 9.962755846187834e-06, + "loss": 2.8712, + "step": 222800 + }, + { + "epoch": 0.05815017765791945, + "grad_norm": 6.178708553314209, + "learning_rate": 9.962653936766828e-06, + "loss": 2.8583, + "step": 223000 + }, + { + "epoch": 0.05820233028362162, + "grad_norm": 6.825628757476807, + "learning_rate": 9.962551888633923e-06, + "loss": 2.8627, + "step": 223200 + }, + { + "epoch": 0.05825448290932379, + "grad_norm": 5.721690654754639, + "learning_rate": 9.96244970179197e-06, + "loss": 2.8824, + "step": 223400 + }, + { + "epoch": 0.058306635535025964, + "grad_norm": 6.0849127769470215, + "learning_rate": 9.962347376243824e-06, + "loss": 2.8654, + "step": 223600 + }, + { + "epoch": 0.058358788160728134, + "grad_norm": 6.45623254776001, + "learning_rate": 
9.962244911992347e-06, + "loss": 2.9017, + "step": 223800 + }, + { + "epoch": 0.058410940786430304, + "grad_norm": 6.159735679626465, + "learning_rate": 9.962142309040405e-06, + "loss": 2.9074, + "step": 224000 + }, + { + "epoch": 0.058463093412132475, + "grad_norm": 6.511104106903076, + "learning_rate": 9.962039567390863e-06, + "loss": 2.8768, + "step": 224200 + }, + { + "epoch": 0.058515246037834645, + "grad_norm": 6.441580772399902, + "learning_rate": 9.961936687046595e-06, + "loss": 2.8928, + "step": 224400 + }, + { + "epoch": 0.058567398663536815, + "grad_norm": 5.743595123291016, + "learning_rate": 9.961833668010473e-06, + "loss": 2.9014, + "step": 224600 + }, + { + "epoch": 0.058619551289238986, + "grad_norm": 6.290615558624268, + "learning_rate": 9.961730510285379e-06, + "loss": 2.8954, + "step": 224800 + }, + { + "epoch": 0.058671703914941156, + "grad_norm": 6.227867126464844, + "learning_rate": 9.961627213874198e-06, + "loss": 2.858, + "step": 225000 + }, + { + "epoch": 0.05872385654064333, + "grad_norm": 6.234389305114746, + "learning_rate": 9.961523778779814e-06, + "loss": 2.8905, + "step": 225200 + }, + { + "epoch": 0.05877600916634549, + "grad_norm": 6.24118709564209, + "learning_rate": 9.961420205005118e-06, + "loss": 2.8632, + "step": 225400 + }, + { + "epoch": 0.05882816179204766, + "grad_norm": 6.4152750968933105, + "learning_rate": 9.96131649255301e-06, + "loss": 2.857, + "step": 225600 + }, + { + "epoch": 0.05888031441774983, + "grad_norm": 7.344254493713379, + "learning_rate": 9.961212641426384e-06, + "loss": 2.8742, + "step": 225800 + }, + { + "epoch": 0.058932467043452, + "grad_norm": 6.300400257110596, + "learning_rate": 9.961108651628144e-06, + "loss": 2.893, + "step": 226000 + }, + { + "epoch": 0.05898461966915417, + "grad_norm": 5.992100715637207, + "learning_rate": 9.961004523161197e-06, + "loss": 2.8256, + "step": 226200 + }, + { + "epoch": 0.05903677229485634, + "grad_norm": 5.90500545501709, + "learning_rate": 9.960900256028453e-06, + "loss": 2.9001, + "step": 226400 + }, + { + "epoch": 0.05908892492055851, + "grad_norm": 6.329569339752197, + "learning_rate": 9.960795850232827e-06, + "loss": 2.8679, + "step": 226600 + }, + { + "epoch": 0.05914107754626068, + "grad_norm": 6.701547622680664, + "learning_rate": 9.960691305777235e-06, + "loss": 2.8826, + "step": 226800 + }, + { + "epoch": 0.05919323017196285, + "grad_norm": 7.074747085571289, + "learning_rate": 9.960586622664603e-06, + "loss": 2.8816, + "step": 227000 + }, + { + "epoch": 0.059245382797665024, + "grad_norm": 5.853117942810059, + "learning_rate": 9.960481800897855e-06, + "loss": 2.8941, + "step": 227200 + }, + { + "epoch": 0.059297535423367194, + "grad_norm": 6.21675968170166, + "learning_rate": 9.960376840479922e-06, + "loss": 2.9143, + "step": 227400 + }, + { + "epoch": 0.059349688049069364, + "grad_norm": 6.886922836303711, + "learning_rate": 9.960271741413737e-06, + "loss": 2.8215, + "step": 227600 + }, + { + "epoch": 0.059401840674771535, + "grad_norm": 6.113399028778076, + "learning_rate": 9.960166503702234e-06, + "loss": 2.8652, + "step": 227800 + }, + { + "epoch": 0.059453993300473705, + "grad_norm": 6.7419304847717285, + "learning_rate": 9.96006112734836e-06, + "loss": 2.8997, + "step": 228000 + }, + { + "epoch": 0.059506145926175875, + "grad_norm": 5.719422817230225, + "learning_rate": 9.959955612355059e-06, + "loss": 2.8397, + "step": 228200 + }, + { + "epoch": 0.05955829855187804, + "grad_norm": 6.3306732177734375, + "learning_rate": 9.959849958725278e-06, + "loss": 2.8576, + "step": 
228400 + }, + { + "epoch": 0.05961045117758021, + "grad_norm": 6.876055717468262, + "learning_rate": 9.959744166461973e-06, + "loss": 2.8646, + "step": 228600 + }, + { + "epoch": 0.05966260380328238, + "grad_norm": 6.750349998474121, + "learning_rate": 9.9596382355681e-06, + "loss": 2.8445, + "step": 228800 + }, + { + "epoch": 0.05971475642898455, + "grad_norm": 5.776258945465088, + "learning_rate": 9.959532166046619e-06, + "loss": 2.8702, + "step": 229000 + }, + { + "epoch": 0.05976690905468672, + "grad_norm": 6.348162651062012, + "learning_rate": 9.959425957900497e-06, + "loss": 2.8475, + "step": 229200 + }, + { + "epoch": 0.05981906168038889, + "grad_norm": 6.766139507293701, + "learning_rate": 9.959319611132698e-06, + "loss": 2.8688, + "step": 229400 + }, + { + "epoch": 0.05987121430609106, + "grad_norm": 6.318508625030518, + "learning_rate": 9.959213125746198e-06, + "loss": 2.8977, + "step": 229600 + }, + { + "epoch": 0.05992336693179323, + "grad_norm": 6.733820915222168, + "learning_rate": 9.959106501743975e-06, + "loss": 2.8547, + "step": 229800 + }, + { + "epoch": 0.0599755195574954, + "grad_norm": 7.468286037445068, + "learning_rate": 9.958999739129006e-06, + "loss": 2.9255, + "step": 230000 + }, + { + "epoch": 0.06002767218319757, + "grad_norm": 6.781874179840088, + "learning_rate": 9.958892837904277e-06, + "loss": 2.8473, + "step": 230200 + }, + { + "epoch": 0.06007982480889974, + "grad_norm": 6.273848056793213, + "learning_rate": 9.958785798072775e-06, + "loss": 2.8736, + "step": 230400 + }, + { + "epoch": 0.06013197743460191, + "grad_norm": 6.0436320304870605, + "learning_rate": 9.958678619637491e-06, + "loss": 2.8747, + "step": 230600 + }, + { + "epoch": 0.060184130060304084, + "grad_norm": 6.5935378074646, + "learning_rate": 9.958571302601425e-06, + "loss": 2.8568, + "step": 230800 + }, + { + "epoch": 0.060236282686006254, + "grad_norm": 6.040035724639893, + "learning_rate": 9.958463846967572e-06, + "loss": 2.8901, + "step": 231000 + }, + { + "epoch": 0.060288435311708424, + "grad_norm": 6.27731466293335, + "learning_rate": 9.958356252738937e-06, + "loss": 2.8297, + "step": 231200 + }, + { + "epoch": 0.06034058793741059, + "grad_norm": 6.9736199378967285, + "learning_rate": 9.958248519918527e-06, + "loss": 2.8432, + "step": 231400 + }, + { + "epoch": 0.06039274056311276, + "grad_norm": 6.537654876708984, + "learning_rate": 9.958140648509354e-06, + "loss": 2.907, + "step": 231600 + }, + { + "epoch": 0.06044489318881493, + "grad_norm": 6.521362781524658, + "learning_rate": 9.958032638514435e-06, + "loss": 2.832, + "step": 231800 + }, + { + "epoch": 0.0604970458145171, + "grad_norm": 6.518667697906494, + "learning_rate": 9.957924489936783e-06, + "loss": 2.8741, + "step": 232000 + }, + { + "epoch": 0.06054919844021927, + "grad_norm": 6.39730167388916, + "learning_rate": 9.957816202779429e-06, + "loss": 2.841, + "step": 232200 + }, + { + "epoch": 0.06060135106592144, + "grad_norm": 6.15274715423584, + "learning_rate": 9.957707777045392e-06, + "loss": 2.8499, + "step": 232400 + }, + { + "epoch": 0.06065350369162361, + "grad_norm": 6.834228515625, + "learning_rate": 9.957599212737707e-06, + "loss": 2.8709, + "step": 232600 + }, + { + "epoch": 0.06070565631732578, + "grad_norm": 6.3925251960754395, + "learning_rate": 9.957490509859409e-06, + "loss": 2.8808, + "step": 232800 + }, + { + "epoch": 0.06075780894302795, + "grad_norm": 6.036910533905029, + "learning_rate": 9.957381668413535e-06, + "loss": 2.8857, + "step": 233000 + }, + { + "epoch": 0.06080996156873012, + "grad_norm": 
6.22501802444458, + "learning_rate": 9.957272688403126e-06, + "loss": 2.8427, + "step": 233200 + }, + { + "epoch": 0.06086211419443229, + "grad_norm": 5.928175449371338, + "learning_rate": 9.95716356983123e-06, + "loss": 2.8687, + "step": 233400 + }, + { + "epoch": 0.06091426682013446, + "grad_norm": 6.415838241577148, + "learning_rate": 9.957054312700897e-06, + "loss": 2.8375, + "step": 233600 + }, + { + "epoch": 0.06096641944583663, + "grad_norm": 6.5070013999938965, + "learning_rate": 9.956944917015179e-06, + "loss": 2.8811, + "step": 233800 + }, + { + "epoch": 0.0610185720715388, + "grad_norm": 5.6311821937561035, + "learning_rate": 9.956835382777137e-06, + "loss": 2.8823, + "step": 234000 + }, + { + "epoch": 0.061070724697240966, + "grad_norm": 5.70000696182251, + "learning_rate": 9.956725709989829e-06, + "loss": 2.881, + "step": 234200 + }, + { + "epoch": 0.06112287732294314, + "grad_norm": 6.649327754974365, + "learning_rate": 9.956615898656322e-06, + "loss": 2.8617, + "step": 234400 + }, + { + "epoch": 0.06117502994864531, + "grad_norm": 6.379722595214844, + "learning_rate": 9.956505948779687e-06, + "loss": 2.8547, + "step": 234600 + }, + { + "epoch": 0.06122718257434748, + "grad_norm": 6.3186116218566895, + "learning_rate": 9.956395860362992e-06, + "loss": 2.8306, + "step": 234800 + }, + { + "epoch": 0.06127933520004965, + "grad_norm": 6.583649158477783, + "learning_rate": 9.956285633409322e-06, + "loss": 2.8816, + "step": 235000 + }, + { + "epoch": 0.06133148782575182, + "grad_norm": 6.297317981719971, + "learning_rate": 9.956175267921752e-06, + "loss": 2.8649, + "step": 235200 + }, + { + "epoch": 0.06138364045145399, + "grad_norm": 7.1909871101379395, + "learning_rate": 9.956064763903368e-06, + "loss": 2.8988, + "step": 235400 + }, + { + "epoch": 0.06143579307715616, + "grad_norm": 6.038774490356445, + "learning_rate": 9.955954121357262e-06, + "loss": 2.8585, + "step": 235600 + }, + { + "epoch": 0.06148794570285833, + "grad_norm": 6.198418140411377, + "learning_rate": 9.95584334028652e-06, + "loss": 2.8393, + "step": 235800 + }, + { + "epoch": 0.0615400983285605, + "grad_norm": 6.438840389251709, + "learning_rate": 9.955732420694244e-06, + "loss": 2.8219, + "step": 236000 + }, + { + "epoch": 0.06159225095426267, + "grad_norm": 6.378972053527832, + "learning_rate": 9.955621362583533e-06, + "loss": 2.8501, + "step": 236200 + }, + { + "epoch": 0.06164440357996484, + "grad_norm": 6.684047222137451, + "learning_rate": 9.95551016595749e-06, + "loss": 2.8735, + "step": 236400 + }, + { + "epoch": 0.06169655620566701, + "grad_norm": 6.312347412109375, + "learning_rate": 9.955398830819225e-06, + "loss": 2.852, + "step": 236600 + }, + { + "epoch": 0.06174870883136918, + "grad_norm": 6.520938873291016, + "learning_rate": 9.955287357171848e-06, + "loss": 2.8494, + "step": 236800 + }, + { + "epoch": 0.06180086145707135, + "grad_norm": 6.856761932373047, + "learning_rate": 9.955175745018477e-06, + "loss": 2.8285, + "step": 237000 + }, + { + "epoch": 0.061853014082773515, + "grad_norm": 6.1591620445251465, + "learning_rate": 9.955063994362229e-06, + "loss": 2.8291, + "step": 237200 + }, + { + "epoch": 0.061905166708475685, + "grad_norm": 5.8291850090026855, + "learning_rate": 9.95495210520623e-06, + "loss": 2.8288, + "step": 237400 + }, + { + "epoch": 0.061957319334177856, + "grad_norm": 6.742002964019775, + "learning_rate": 9.954840077553604e-06, + "loss": 2.8435, + "step": 237600 + }, + { + "epoch": 0.062009471959880026, + "grad_norm": 6.073098659515381, + "learning_rate": 
9.954727911407489e-06, + "loss": 2.8344, + "step": 237800 + }, + { + "epoch": 0.0620616245855822, + "grad_norm": 7.29561710357666, + "learning_rate": 9.954615606771012e-06, + "loss": 2.8352, + "step": 238000 + }, + { + "epoch": 0.06211377721128437, + "grad_norm": 6.251704216003418, + "learning_rate": 9.954503163647319e-06, + "loss": 2.8576, + "step": 238200 + }, + { + "epoch": 0.06216592983698654, + "grad_norm": 5.640725612640381, + "learning_rate": 9.954390582039545e-06, + "loss": 2.8439, + "step": 238400 + }, + { + "epoch": 0.06221808246268871, + "grad_norm": 6.10871696472168, + "learning_rate": 9.954277861950847e-06, + "loss": 2.8231, + "step": 238600 + }, + { + "epoch": 0.06227023508839088, + "grad_norm": 7.135128974914551, + "learning_rate": 9.954165003384367e-06, + "loss": 2.857, + "step": 238800 + }, + { + "epoch": 0.06232238771409305, + "grad_norm": 6.5773773193359375, + "learning_rate": 9.954052006343264e-06, + "loss": 2.8329, + "step": 239000 + }, + { + "epoch": 0.06237454033979522, + "grad_norm": 6.622820854187012, + "learning_rate": 9.953938870830696e-06, + "loss": 2.8687, + "step": 239200 + }, + { + "epoch": 0.06242669296549739, + "grad_norm": 6.241696834564209, + "learning_rate": 9.953825596849823e-06, + "loss": 2.8318, + "step": 239400 + }, + { + "epoch": 0.06247884559119956, + "grad_norm": 5.970136642456055, + "learning_rate": 9.953712184403811e-06, + "loss": 2.8651, + "step": 239600 + }, + { + "epoch": 0.06253099821690172, + "grad_norm": 5.942645072937012, + "learning_rate": 9.953598633495835e-06, + "loss": 2.8463, + "step": 239800 + }, + { + "epoch": 0.0625831508426039, + "grad_norm": 5.821661949157715, + "learning_rate": 9.953484944129064e-06, + "loss": 2.8955, + "step": 240000 + }, + { + "epoch": 0.06263530346830606, + "grad_norm": 6.606387138366699, + "learning_rate": 9.953371116306678e-06, + "loss": 2.8606, + "step": 240200 + }, + { + "epoch": 0.06268745609400823, + "grad_norm": 6.661681175231934, + "learning_rate": 9.953257150031857e-06, + "loss": 2.8399, + "step": 240400 + }, + { + "epoch": 0.0627396087197104, + "grad_norm": 6.761116027832031, + "learning_rate": 9.953143045307788e-06, + "loss": 2.8202, + "step": 240600 + }, + { + "epoch": 0.06279176134541258, + "grad_norm": 6.392307281494141, + "learning_rate": 9.953028802137658e-06, + "loss": 2.8542, + "step": 240800 + }, + { + "epoch": 0.06284391397111475, + "grad_norm": 6.313865661621094, + "learning_rate": 9.952914420524663e-06, + "loss": 2.8388, + "step": 241000 + }, + { + "epoch": 0.06289606659681692, + "grad_norm": 6.371045112609863, + "learning_rate": 9.952799900472e-06, + "loss": 2.8371, + "step": 241200 + }, + { + "epoch": 0.06294821922251909, + "grad_norm": 5.998501777648926, + "learning_rate": 9.952685241982867e-06, + "loss": 2.7792, + "step": 241400 + }, + { + "epoch": 0.06300037184822126, + "grad_norm": 6.239037036895752, + "learning_rate": 9.952570445060472e-06, + "loss": 2.8392, + "step": 241600 + }, + { + "epoch": 0.06305252447392343, + "grad_norm": 6.2942304611206055, + "learning_rate": 9.95245550970802e-06, + "loss": 2.8724, + "step": 241800 + }, + { + "epoch": 0.0631046770996256, + "grad_norm": 6.233669281005859, + "learning_rate": 9.95234043592873e-06, + "loss": 2.8326, + "step": 242000 + }, + { + "epoch": 0.06315682972532777, + "grad_norm": 6.2911577224731445, + "learning_rate": 9.952225223725814e-06, + "loss": 2.8716, + "step": 242200 + }, + { + "epoch": 0.06320898235102994, + "grad_norm": 6.646729946136475, + "learning_rate": 9.95210987310249e-06, + "loss": 2.8266, + "step": 242400 + }, + { + 
"epoch": 0.06326113497673211, + "grad_norm": 6.116643905639648, + "learning_rate": 9.951994384061988e-06, + "loss": 2.8853, + "step": 242600 + }, + { + "epoch": 0.06331328760243428, + "grad_norm": 6.70005464553833, + "learning_rate": 9.951878756607532e-06, + "loss": 2.8247, + "step": 242800 + }, + { + "epoch": 0.06336544022813645, + "grad_norm": 5.81242036819458, + "learning_rate": 9.951762990742356e-06, + "loss": 2.8165, + "step": 243000 + }, + { + "epoch": 0.06341759285383862, + "grad_norm": 6.2581329345703125, + "learning_rate": 9.951647086469694e-06, + "loss": 2.883, + "step": 243200 + }, + { + "epoch": 0.06346974547954079, + "grad_norm": 6.079429626464844, + "learning_rate": 9.951531043792785e-06, + "loss": 2.8643, + "step": 243400 + }, + { + "epoch": 0.06352189810524296, + "grad_norm": 6.490660667419434, + "learning_rate": 9.951414862714877e-06, + "loss": 2.8605, + "step": 243600 + }, + { + "epoch": 0.06357405073094513, + "grad_norm": 6.3001909255981445, + "learning_rate": 9.951298543239213e-06, + "loss": 2.8898, + "step": 243800 + }, + { + "epoch": 0.0636262033566473, + "grad_norm": 6.262456893920898, + "learning_rate": 9.951182085369044e-06, + "loss": 2.8698, + "step": 244000 + }, + { + "epoch": 0.06367835598234947, + "grad_norm": 7.45053243637085, + "learning_rate": 9.951065489107628e-06, + "loss": 2.8388, + "step": 244200 + }, + { + "epoch": 0.06373050860805164, + "grad_norm": 6.364867210388184, + "learning_rate": 9.950948754458222e-06, + "loss": 2.8592, + "step": 244400 + }, + { + "epoch": 0.0637826612337538, + "grad_norm": 6.993120193481445, + "learning_rate": 9.950831881424092e-06, + "loss": 2.824, + "step": 244600 + }, + { + "epoch": 0.06383481385945597, + "grad_norm": 6.528937339782715, + "learning_rate": 9.9507148700085e-06, + "loss": 2.8447, + "step": 244800 + }, + { + "epoch": 0.06388696648515814, + "grad_norm": 6.995550155639648, + "learning_rate": 9.950597720214721e-06, + "loss": 2.8018, + "step": 245000 + }, + { + "epoch": 0.06393911911086031, + "grad_norm": 6.352587699890137, + "learning_rate": 9.950480432046025e-06, + "loss": 2.8619, + "step": 245200 + }, + { + "epoch": 0.06399127173656248, + "grad_norm": 6.427020072937012, + "learning_rate": 9.950363005505695e-06, + "loss": 2.854, + "step": 245400 + }, + { + "epoch": 0.06404342436226465, + "grad_norm": 6.2600603103637695, + "learning_rate": 9.950245440597011e-06, + "loss": 2.8246, + "step": 245600 + }, + { + "epoch": 0.06409557698796682, + "grad_norm": 5.7311906814575195, + "learning_rate": 9.950127737323258e-06, + "loss": 2.8132, + "step": 245800 + }, + { + "epoch": 0.06414772961366899, + "grad_norm": 5.9210920333862305, + "learning_rate": 9.950009895687727e-06, + "loss": 2.8446, + "step": 246000 + }, + { + "epoch": 0.06419988223937116, + "grad_norm": 5.30364990234375, + "learning_rate": 9.949891915693712e-06, + "loss": 2.8175, + "step": 246200 + }, + { + "epoch": 0.06425203486507333, + "grad_norm": 6.701354503631592, + "learning_rate": 9.949773797344511e-06, + "loss": 2.8503, + "step": 246400 + }, + { + "epoch": 0.0643041874907755, + "grad_norm": 6.559759140014648, + "learning_rate": 9.949655540643425e-06, + "loss": 2.8999, + "step": 246600 + }, + { + "epoch": 0.06435634011647767, + "grad_norm": 6.051071643829346, + "learning_rate": 9.949537145593759e-06, + "loss": 2.8777, + "step": 246800 + }, + { + "epoch": 0.06440849274217984, + "grad_norm": 5.7465972900390625, + "learning_rate": 9.949418612198822e-06, + "loss": 2.8398, + "step": 247000 + }, + { + "epoch": 0.06446064536788201, + "grad_norm": 6.2163872718811035, 
+ "learning_rate": 9.94929994046193e-06, + "loss": 2.8245, + "step": 247200 + }, + { + "epoch": 0.06451279799358418, + "grad_norm": 6.6872639656066895, + "learning_rate": 9.949181130386396e-06, + "loss": 2.8519, + "step": 247400 + }, + { + "epoch": 0.06456495061928635, + "grad_norm": 5.855112552642822, + "learning_rate": 9.949062181975544e-06, + "loss": 2.8286, + "step": 247600 + }, + { + "epoch": 0.06461710324498852, + "grad_norm": 6.136778354644775, + "learning_rate": 9.948943095232697e-06, + "loss": 2.8164, + "step": 247800 + }, + { + "epoch": 0.0646692558706907, + "grad_norm": 5.666250705718994, + "learning_rate": 9.948823870161184e-06, + "loss": 2.8247, + "step": 248000 + }, + { + "epoch": 0.06472140849639287, + "grad_norm": 6.700359344482422, + "learning_rate": 9.948704506764336e-06, + "loss": 2.7989, + "step": 248200 + }, + { + "epoch": 0.06477356112209504, + "grad_norm": 6.757811546325684, + "learning_rate": 9.948585005045495e-06, + "loss": 2.868, + "step": 248400 + }, + { + "epoch": 0.0648257137477972, + "grad_norm": 6.108405113220215, + "learning_rate": 9.948465365007995e-06, + "loss": 2.85, + "step": 248600 + }, + { + "epoch": 0.06487786637349938, + "grad_norm": 6.390014171600342, + "learning_rate": 9.948345586655181e-06, + "loss": 2.8826, + "step": 248800 + }, + { + "epoch": 0.06493001899920155, + "grad_norm": 6.4399733543396, + "learning_rate": 9.948225669990404e-06, + "loss": 2.7958, + "step": 249000 + }, + { + "epoch": 0.06498217162490372, + "grad_norm": 6.0152058601379395, + "learning_rate": 9.948105615017014e-06, + "loss": 2.8615, + "step": 249200 + }, + { + "epoch": 0.06503432425060589, + "grad_norm": 6.281088352203369, + "learning_rate": 9.947985421738366e-06, + "loss": 2.8323, + "step": 249400 + }, + { + "epoch": 0.06508647687630806, + "grad_norm": 7.041154384613037, + "learning_rate": 9.94786509015782e-06, + "loss": 2.812, + "step": 249600 + }, + { + "epoch": 0.06513862950201023, + "grad_norm": 6.751956462860107, + "learning_rate": 9.94774462027874e-06, + "loss": 2.8045, + "step": 249800 + }, + { + "epoch": 0.0651907821277124, + "grad_norm": 6.53183126449585, + "learning_rate": 9.947624012104494e-06, + "loss": 2.8629, + "step": 250000 + }, + { + "epoch": 0.06524293475341457, + "grad_norm": 6.1272969245910645, + "learning_rate": 9.947503265638449e-06, + "loss": 2.8009, + "step": 250200 + }, + { + "epoch": 0.06529508737911673, + "grad_norm": 6.2636942863464355, + "learning_rate": 9.947382380883985e-06, + "loss": 2.8502, + "step": 250400 + }, + { + "epoch": 0.0653472400048189, + "grad_norm": 5.978120803833008, + "learning_rate": 9.94726135784448e-06, + "loss": 2.8581, + "step": 250600 + }, + { + "epoch": 0.06539939263052107, + "grad_norm": 6.1595778465271, + "learning_rate": 9.947140196523312e-06, + "loss": 2.8445, + "step": 250800 + }, + { + "epoch": 0.06545154525622324, + "grad_norm": 6.8264923095703125, + "learning_rate": 9.947018896923875e-06, + "loss": 2.8439, + "step": 251000 + }, + { + "epoch": 0.06550369788192541, + "grad_norm": 5.948680877685547, + "learning_rate": 9.946897459049553e-06, + "loss": 2.8293, + "step": 251200 + }, + { + "epoch": 0.06555585050762758, + "grad_norm": 6.657248497009277, + "learning_rate": 9.946775882903745e-06, + "loss": 2.8726, + "step": 251400 + }, + { + "epoch": 0.06560800313332975, + "grad_norm": 5.8180155754089355, + "learning_rate": 9.946654168489847e-06, + "loss": 2.8181, + "step": 251600 + }, + { + "epoch": 0.06566015575903192, + "grad_norm": 6.403216361999512, + "learning_rate": 9.946532315811262e-06, + "loss": 2.806, + "step": 
251800 + }, + { + "epoch": 0.06571230838473409, + "grad_norm": 6.092254638671875, + "learning_rate": 9.946410324871394e-06, + "loss": 2.8513, + "step": 252000 + }, + { + "epoch": 0.06576446101043626, + "grad_norm": 5.990058422088623, + "learning_rate": 9.946288195673654e-06, + "loss": 2.8147, + "step": 252200 + }, + { + "epoch": 0.06581661363613843, + "grad_norm": 6.334691047668457, + "learning_rate": 9.946165928221456e-06, + "loss": 2.8539, + "step": 252400 + }, + { + "epoch": 0.0658687662618406, + "grad_norm": 5.580613613128662, + "learning_rate": 9.946043522518217e-06, + "loss": 2.8393, + "step": 252600 + }, + { + "epoch": 0.06592091888754277, + "grad_norm": 6.962843418121338, + "learning_rate": 9.945920978567357e-06, + "loss": 2.8269, + "step": 252800 + }, + { + "epoch": 0.06597307151324494, + "grad_norm": 5.977869510650635, + "learning_rate": 9.945798296372303e-06, + "loss": 2.8493, + "step": 253000 + }, + { + "epoch": 0.06602522413894711, + "grad_norm": 5.998539447784424, + "learning_rate": 9.945675475936487e-06, + "loss": 2.8413, + "step": 253200 + }, + { + "epoch": 0.06607737676464928, + "grad_norm": 6.014648914337158, + "learning_rate": 9.945552517263336e-06, + "loss": 2.7819, + "step": 253400 + }, + { + "epoch": 0.06612952939035145, + "grad_norm": 6.409533977508545, + "learning_rate": 9.945429420356291e-06, + "loss": 2.8191, + "step": 253600 + }, + { + "epoch": 0.06618168201605362, + "grad_norm": 7.013908386230469, + "learning_rate": 9.94530618521879e-06, + "loss": 2.8302, + "step": 253800 + }, + { + "epoch": 0.06623383464175579, + "grad_norm": 6.836598873138428, + "learning_rate": 9.945182811854282e-06, + "loss": 2.8531, + "step": 254000 + }, + { + "epoch": 0.06628598726745796, + "grad_norm": 5.761704921722412, + "learning_rate": 9.945059300266209e-06, + "loss": 2.8364, + "step": 254200 + }, + { + "epoch": 0.06633813989316013, + "grad_norm": 5.95949125289917, + "learning_rate": 9.944935650458028e-06, + "loss": 2.8268, + "step": 254400 + }, + { + "epoch": 0.0663902925188623, + "grad_norm": 7.2780985832214355, + "learning_rate": 9.944811862433194e-06, + "loss": 2.8539, + "step": 254600 + }, + { + "epoch": 0.06644244514456447, + "grad_norm": 6.372018814086914, + "learning_rate": 9.944687936195168e-06, + "loss": 2.8406, + "step": 254800 + }, + { + "epoch": 0.06649459777026664, + "grad_norm": 7.005834579467773, + "learning_rate": 9.94456387174741e-06, + "loss": 2.8495, + "step": 255000 + }, + { + "epoch": 0.06654675039596882, + "grad_norm": 6.090085983276367, + "learning_rate": 9.944439669093393e-06, + "loss": 2.8185, + "step": 255200 + }, + { + "epoch": 0.06659890302167099, + "grad_norm": 6.5275750160217285, + "learning_rate": 9.944315328236585e-06, + "loss": 2.8444, + "step": 255400 + }, + { + "epoch": 0.06665105564737316, + "grad_norm": 6.6855316162109375, + "learning_rate": 9.944190849180464e-06, + "loss": 2.8106, + "step": 255600 + }, + { + "epoch": 0.06670320827307533, + "grad_norm": 5.903449058532715, + "learning_rate": 9.944066231928506e-06, + "loss": 2.8174, + "step": 255800 + }, + { + "epoch": 0.0667553608987775, + "grad_norm": 5.807356357574463, + "learning_rate": 9.943941476484195e-06, + "loss": 2.8591, + "step": 256000 + }, + { + "epoch": 0.06680751352447967, + "grad_norm": 6.048740386962891, + "learning_rate": 9.943816582851023e-06, + "loss": 2.8281, + "step": 256200 + }, + { + "epoch": 0.06685966615018182, + "grad_norm": 5.910772800445557, + "learning_rate": 9.943691551032472e-06, + "loss": 2.8024, + "step": 256400 + }, + { + "epoch": 0.066911818775884, + "grad_norm": 
6.064548492431641, + "learning_rate": 9.943566381032047e-06, + "loss": 2.8232, + "step": 256600 + }, + { + "epoch": 0.06696397140158616, + "grad_norm": 6.429322719573975, + "learning_rate": 9.943441072853239e-06, + "loss": 2.8604, + "step": 256800 + }, + { + "epoch": 0.06701612402728833, + "grad_norm": 6.18205451965332, + "learning_rate": 9.943315626499552e-06, + "loss": 2.8166, + "step": 257000 + }, + { + "epoch": 0.0670682766529905, + "grad_norm": 6.421455383300781, + "learning_rate": 9.943190041974495e-06, + "loss": 2.8616, + "step": 257200 + }, + { + "epoch": 0.06712042927869268, + "grad_norm": 6.15507173538208, + "learning_rate": 9.943064319281576e-06, + "loss": 2.878, + "step": 257400 + }, + { + "epoch": 0.06717258190439485, + "grad_norm": 6.6916728019714355, + "learning_rate": 9.94293845842431e-06, + "loss": 2.8272, + "step": 257600 + }, + { + "epoch": 0.06722473453009702, + "grad_norm": 6.15769100189209, + "learning_rate": 9.942812459406215e-06, + "loss": 2.8024, + "step": 257800 + }, + { + "epoch": 0.06727688715579919, + "grad_norm": 6.359096527099609, + "learning_rate": 9.942686322230812e-06, + "loss": 2.8265, + "step": 258000 + }, + { + "epoch": 0.06732903978150136, + "grad_norm": 5.924700736999512, + "learning_rate": 9.942560046901628e-06, + "loss": 2.8314, + "step": 258200 + }, + { + "epoch": 0.06738119240720353, + "grad_norm": 6.2472968101501465, + "learning_rate": 9.94243363342219e-06, + "loss": 2.8348, + "step": 258400 + }, + { + "epoch": 0.0674333450329057, + "grad_norm": 6.63815450668335, + "learning_rate": 9.942307081796034e-06, + "loss": 2.8363, + "step": 258600 + }, + { + "epoch": 0.06748549765860787, + "grad_norm": 7.617677211761475, + "learning_rate": 9.942180392026697e-06, + "loss": 2.8584, + "step": 258800 + }, + { + "epoch": 0.06753765028431004, + "grad_norm": 6.234862327575684, + "learning_rate": 9.942053564117718e-06, + "loss": 2.8885, + "step": 259000 + }, + { + "epoch": 0.06758980291001221, + "grad_norm": 6.803557395935059, + "learning_rate": 9.941926598072644e-06, + "loss": 2.8659, + "step": 259200 + }, + { + "epoch": 0.06764195553571438, + "grad_norm": 6.141203880310059, + "learning_rate": 9.941799493895024e-06, + "loss": 2.8241, + "step": 259400 + }, + { + "epoch": 0.06769410816141655, + "grad_norm": 6.708746910095215, + "learning_rate": 9.94167225158841e-06, + "loss": 2.8405, + "step": 259600 + }, + { + "epoch": 0.06774626078711872, + "grad_norm": 6.132614612579346, + "learning_rate": 9.94154487115636e-06, + "loss": 2.8347, + "step": 259800 + }, + { + "epoch": 0.06779841341282089, + "grad_norm": 6.152980327606201, + "learning_rate": 9.941417352602429e-06, + "loss": 2.795, + "step": 260000 + }, + { + "epoch": 0.06785056603852306, + "grad_norm": 6.715810775756836, + "learning_rate": 9.941289695930188e-06, + "loss": 2.8301, + "step": 260200 + }, + { + "epoch": 0.06790271866422523, + "grad_norm": 6.222926139831543, + "learning_rate": 9.941161901143203e-06, + "loss": 2.8402, + "step": 260400 + }, + { + "epoch": 0.0679548712899274, + "grad_norm": 5.573058128356934, + "learning_rate": 9.941033968245045e-06, + "loss": 2.8325, + "step": 260600 + }, + { + "epoch": 0.06800702391562957, + "grad_norm": 5.889303207397461, + "learning_rate": 9.940905897239289e-06, + "loss": 2.8836, + "step": 260800 + }, + { + "epoch": 0.06805917654133174, + "grad_norm": 6.204052448272705, + "learning_rate": 9.940777688129517e-06, + "loss": 2.8051, + "step": 261000 + }, + { + "epoch": 0.06811132916703391, + "grad_norm": 6.2043585777282715, + "learning_rate": 9.940649340919313e-06, + 
"loss": 2.8156, + "step": 261200 + }, + { + "epoch": 0.06816348179273608, + "grad_norm": 6.628237724304199, + "learning_rate": 9.940520855612262e-06, + "loss": 2.8152, + "step": 261400 + }, + { + "epoch": 0.06821563441843825, + "grad_norm": 5.52294397354126, + "learning_rate": 9.940392232211955e-06, + "loss": 2.8127, + "step": 261600 + }, + { + "epoch": 0.06826778704414042, + "grad_norm": 6.279348373413086, + "learning_rate": 9.94026347072199e-06, + "loss": 2.8359, + "step": 261800 + }, + { + "epoch": 0.0683199396698426, + "grad_norm": 5.843539237976074, + "learning_rate": 9.940134571145966e-06, + "loss": 2.8432, + "step": 262000 + }, + { + "epoch": 0.06837209229554475, + "grad_norm": 6.166245460510254, + "learning_rate": 9.940005533487483e-06, + "loss": 2.8402, + "step": 262200 + }, + { + "epoch": 0.06842424492124692, + "grad_norm": 5.798083305358887, + "learning_rate": 9.93987635775015e-06, + "loss": 2.8465, + "step": 262400 + }, + { + "epoch": 0.06847639754694909, + "grad_norm": 6.206599712371826, + "learning_rate": 9.939747043937575e-06, + "loss": 2.8503, + "step": 262600 + }, + { + "epoch": 0.06852855017265126, + "grad_norm": 6.811930179595947, + "learning_rate": 9.939617592053376e-06, + "loss": 2.7846, + "step": 262800 + }, + { + "epoch": 0.06858070279835343, + "grad_norm": 6.215068340301514, + "learning_rate": 9.93948800210117e-06, + "loss": 2.8258, + "step": 263000 + }, + { + "epoch": 0.0686328554240556, + "grad_norm": 6.74024772644043, + "learning_rate": 9.939358274084578e-06, + "loss": 2.8517, + "step": 263200 + }, + { + "epoch": 0.06868500804975777, + "grad_norm": 6.108590602874756, + "learning_rate": 9.939228408007227e-06, + "loss": 2.8154, + "step": 263400 + }, + { + "epoch": 0.06873716067545994, + "grad_norm": 7.009735584259033, + "learning_rate": 9.939098403872747e-06, + "loss": 2.8149, + "step": 263600 + }, + { + "epoch": 0.06878931330116211, + "grad_norm": 6.192783832550049, + "learning_rate": 9.938968261684771e-06, + "loss": 2.8519, + "step": 263800 + }, + { + "epoch": 0.06884146592686428, + "grad_norm": 6.660505771636963, + "learning_rate": 9.938837981446939e-06, + "loss": 2.8706, + "step": 264000 + }, + { + "epoch": 0.06889361855256645, + "grad_norm": 6.753753185272217, + "learning_rate": 9.938707563162888e-06, + "loss": 2.7983, + "step": 264200 + }, + { + "epoch": 0.06894577117826863, + "grad_norm": 5.999804973602295, + "learning_rate": 9.938577006836268e-06, + "loss": 2.8527, + "step": 264400 + }, + { + "epoch": 0.0689979238039708, + "grad_norm": 6.876380443572998, + "learning_rate": 9.938446312470726e-06, + "loss": 2.8108, + "step": 264600 + }, + { + "epoch": 0.06905007642967297, + "grad_norm": 6.110062122344971, + "learning_rate": 9.938315480069916e-06, + "loss": 2.8565, + "step": 264800 + }, + { + "epoch": 0.06910222905537514, + "grad_norm": 6.41756534576416, + "learning_rate": 9.938184509637494e-06, + "loss": 2.8097, + "step": 265000 + }, + { + "epoch": 0.0691543816810773, + "grad_norm": 6.181519985198975, + "learning_rate": 9.93805340117712e-06, + "loss": 2.7903, + "step": 265200 + }, + { + "epoch": 0.06920653430677948, + "grad_norm": 6.470280170440674, + "learning_rate": 9.93792215469246e-06, + "loss": 2.7889, + "step": 265400 + }, + { + "epoch": 0.06925868693248165, + "grad_norm": 6.139248371124268, + "learning_rate": 9.937790770187182e-06, + "loss": 2.8096, + "step": 265600 + }, + { + "epoch": 0.06931083955818382, + "grad_norm": 5.582498073577881, + "learning_rate": 9.937659247664959e-06, + "loss": 2.8197, + "step": 265800 + }, + { + "epoch": 
0.06936299218388599, + "grad_norm": 6.460460186004639, + "learning_rate": 9.937527587129468e-06, + "loss": 2.78, + "step": 266000 + }, + { + "epoch": 0.06941514480958816, + "grad_norm": 6.965539455413818, + "learning_rate": 9.937395788584384e-06, + "loss": 2.8352, + "step": 266200 + }, + { + "epoch": 0.06946729743529033, + "grad_norm": 6.817525386810303, + "learning_rate": 9.937263852033399e-06, + "loss": 2.8182, + "step": 266400 + }, + { + "epoch": 0.0695194500609925, + "grad_norm": 5.603514671325684, + "learning_rate": 9.937131777480194e-06, + "loss": 2.8105, + "step": 266600 + }, + { + "epoch": 0.06957160268669467, + "grad_norm": 6.385352611541748, + "learning_rate": 9.936999564928462e-06, + "loss": 2.8152, + "step": 266800 + }, + { + "epoch": 0.06962375531239684, + "grad_norm": 5.736618518829346, + "learning_rate": 9.936867214381902e-06, + "loss": 2.785, + "step": 267000 + }, + { + "epoch": 0.06967590793809901, + "grad_norm": 7.078679084777832, + "learning_rate": 9.93673472584421e-06, + "loss": 2.8278, + "step": 267200 + }, + { + "epoch": 0.06972806056380118, + "grad_norm": 6.587845802307129, + "learning_rate": 9.93660209931909e-06, + "loss": 2.7866, + "step": 267400 + }, + { + "epoch": 0.06978021318950335, + "grad_norm": 6.5862717628479, + "learning_rate": 9.93646933481025e-06, + "loss": 2.7899, + "step": 267600 + }, + { + "epoch": 0.06983236581520552, + "grad_norm": 6.024425506591797, + "learning_rate": 9.936336432321398e-06, + "loss": 2.8223, + "step": 267800 + }, + { + "epoch": 0.06988451844090769, + "grad_norm": 6.144232749938965, + "learning_rate": 9.936203391856254e-06, + "loss": 2.8156, + "step": 268000 + }, + { + "epoch": 0.06993667106660985, + "grad_norm": 6.591801166534424, + "learning_rate": 9.936070213418532e-06, + "loss": 2.7847, + "step": 268200 + }, + { + "epoch": 0.06998882369231202, + "grad_norm": 6.024001121520996, + "learning_rate": 9.935936897011955e-06, + "loss": 2.808, + "step": 268400 + }, + { + "epoch": 0.07004097631801419, + "grad_norm": 6.559907913208008, + "learning_rate": 9.93580344264025e-06, + "loss": 2.8023, + "step": 268600 + }, + { + "epoch": 0.07009312894371636, + "grad_norm": 6.3227033615112305, + "learning_rate": 9.93566985030715e-06, + "loss": 2.7682, + "step": 268800 + }, + { + "epoch": 0.07014528156941853, + "grad_norm": 5.811648368835449, + "learning_rate": 9.935536120016386e-06, + "loss": 2.8045, + "step": 269000 + }, + { + "epoch": 0.0701974341951207, + "grad_norm": 6.381039619445801, + "learning_rate": 9.935402251771696e-06, + "loss": 2.822, + "step": 269200 + }, + { + "epoch": 0.07024958682082287, + "grad_norm": 6.522758960723877, + "learning_rate": 9.935268245576822e-06, + "loss": 2.8439, + "step": 269400 + }, + { + "epoch": 0.07030173944652504, + "grad_norm": 6.2131428718566895, + "learning_rate": 9.935134101435511e-06, + "loss": 2.8601, + "step": 269600 + }, + { + "epoch": 0.07035389207222721, + "grad_norm": 6.588796615600586, + "learning_rate": 9.93499981935151e-06, + "loss": 2.8367, + "step": 269800 + }, + { + "epoch": 0.07040604469792938, + "grad_norm": 6.30739688873291, + "learning_rate": 9.934865399328575e-06, + "loss": 2.8663, + "step": 270000 + }, + { + "epoch": 0.07045819732363155, + "grad_norm": 6.022554397583008, + "learning_rate": 9.934730841370461e-06, + "loss": 2.8205, + "step": 270200 + }, + { + "epoch": 0.07051034994933372, + "grad_norm": 6.7225341796875, + "learning_rate": 9.93459614548093e-06, + "loss": 2.8732, + "step": 270400 + }, + { + "epoch": 0.0705625025750359, + "grad_norm": 5.783280849456787, + "learning_rate": 
9.934461311663748e-06, + "loss": 2.7827, + "step": 270600 + }, + { + "epoch": 0.07061465520073806, + "grad_norm": 6.880871772766113, + "learning_rate": 9.934326339922682e-06, + "loss": 2.7892, + "step": 270800 + }, + { + "epoch": 0.07066680782644023, + "grad_norm": 7.110273361206055, + "learning_rate": 9.934191230261506e-06, + "loss": 2.8391, + "step": 271000 + }, + { + "epoch": 0.0707189604521424, + "grad_norm": 6.837869167327881, + "learning_rate": 9.934055982683995e-06, + "loss": 2.824, + "step": 271200 + }, + { + "epoch": 0.07077111307784457, + "grad_norm": 5.97743558883667, + "learning_rate": 9.933920597193932e-06, + "loss": 2.805, + "step": 271400 + }, + { + "epoch": 0.07082326570354674, + "grad_norm": 6.262125015258789, + "learning_rate": 9.933785073795096e-06, + "loss": 2.7591, + "step": 271600 + }, + { + "epoch": 0.07087541832924892, + "grad_norm": 6.377274036407471, + "learning_rate": 9.93364941249128e-06, + "loss": 2.8213, + "step": 271800 + }, + { + "epoch": 0.07092757095495109, + "grad_norm": 6.591363906860352, + "learning_rate": 9.933513613286273e-06, + "loss": 2.8096, + "step": 272000 + }, + { + "epoch": 0.07097972358065326, + "grad_norm": 6.756320476531982, + "learning_rate": 9.933377676183875e-06, + "loss": 2.7908, + "step": 272200 + }, + { + "epoch": 0.07103187620635543, + "grad_norm": 5.893064975738525, + "learning_rate": 9.93324160118788e-06, + "loss": 2.8085, + "step": 272400 + }, + { + "epoch": 0.0710840288320576, + "grad_norm": 6.648096561431885, + "learning_rate": 9.933105388302094e-06, + "loss": 2.8344, + "step": 272600 + }, + { + "epoch": 0.07113618145775977, + "grad_norm": 5.9668049812316895, + "learning_rate": 9.932969037530325e-06, + "loss": 2.8419, + "step": 272800 + }, + { + "epoch": 0.07118833408346194, + "grad_norm": 7.359618186950684, + "learning_rate": 9.932832548876384e-06, + "loss": 2.7993, + "step": 273000 + }, + { + "epoch": 0.07124048670916411, + "grad_norm": 7.143940448760986, + "learning_rate": 9.932695922344085e-06, + "loss": 2.8044, + "step": 273200 + }, + { + "epoch": 0.07129263933486628, + "grad_norm": 7.557311534881592, + "learning_rate": 9.932559157937247e-06, + "loss": 2.7901, + "step": 273400 + }, + { + "epoch": 0.07134479196056845, + "grad_norm": 6.371750354766846, + "learning_rate": 9.932422255659692e-06, + "loss": 2.8085, + "step": 273600 + }, + { + "epoch": 0.07139694458627062, + "grad_norm": 6.6176838874816895, + "learning_rate": 9.93228521551525e-06, + "loss": 2.8022, + "step": 273800 + }, + { + "epoch": 0.07144909721197278, + "grad_norm": 6.3366851806640625, + "learning_rate": 9.93214803750775e-06, + "loss": 2.7898, + "step": 274000 + }, + { + "epoch": 0.07150124983767495, + "grad_norm": 5.762211799621582, + "learning_rate": 9.932010721641022e-06, + "loss": 2.8627, + "step": 274200 + }, + { + "epoch": 0.07155340246337712, + "grad_norm": 6.653818607330322, + "learning_rate": 9.931873267918908e-06, + "loss": 2.7655, + "step": 274400 + }, + { + "epoch": 0.07160555508907929, + "grad_norm": 6.697924613952637, + "learning_rate": 9.931735676345252e-06, + "loss": 2.7799, + "step": 274600 + }, + { + "epoch": 0.07165770771478146, + "grad_norm": 6.904690742492676, + "learning_rate": 9.931597946923895e-06, + "loss": 2.8291, + "step": 274800 + }, + { + "epoch": 0.07170986034048363, + "grad_norm": 6.095543384552002, + "learning_rate": 9.93146007965869e-06, + "loss": 2.8048, + "step": 275000 + }, + { + "epoch": 0.0717620129661858, + "grad_norm": 6.26823616027832, + "learning_rate": 9.931322074553488e-06, + "loss": 2.7887, + "step": 275200 + }, + { 
+ "epoch": 0.07181416559188797, + "grad_norm": 5.967849254608154, + "learning_rate": 9.931183931612151e-06, + "loss": 2.7847, + "step": 275400 + }, + { + "epoch": 0.07186631821759014, + "grad_norm": 6.218311786651611, + "learning_rate": 9.931045650838536e-06, + "loss": 2.7812, + "step": 275600 + }, + { + "epoch": 0.07191847084329231, + "grad_norm": 5.455589771270752, + "learning_rate": 9.930907232236508e-06, + "loss": 2.8283, + "step": 275800 + }, + { + "epoch": 0.07197062346899448, + "grad_norm": 5.432662010192871, + "learning_rate": 9.930768675809939e-06, + "loss": 2.8221, + "step": 276000 + }, + { + "epoch": 0.07202277609469665, + "grad_norm": 6.456852436065674, + "learning_rate": 9.930629981562698e-06, + "loss": 2.8253, + "step": 276200 + }, + { + "epoch": 0.07207492872039882, + "grad_norm": 6.429563999176025, + "learning_rate": 9.930491149498667e-06, + "loss": 2.831, + "step": 276400 + }, + { + "epoch": 0.07212708134610099, + "grad_norm": 6.186648368835449, + "learning_rate": 9.930352179621721e-06, + "loss": 2.7932, + "step": 276600 + }, + { + "epoch": 0.07217923397180316, + "grad_norm": 6.750385761260986, + "learning_rate": 9.930213071935746e-06, + "loss": 2.748, + "step": 276800 + }, + { + "epoch": 0.07223138659750533, + "grad_norm": 6.330704689025879, + "learning_rate": 9.930073826444634e-06, + "loss": 2.8346, + "step": 277000 + }, + { + "epoch": 0.0722835392232075, + "grad_norm": 6.233454704284668, + "learning_rate": 9.929934443152272e-06, + "loss": 2.7588, + "step": 277200 + }, + { + "epoch": 0.07233569184890967, + "grad_norm": 6.775033473968506, + "learning_rate": 9.929794922062556e-06, + "loss": 2.8368, + "step": 277400 + }, + { + "epoch": 0.07238784447461184, + "grad_norm": 6.234418869018555, + "learning_rate": 9.929655263179389e-06, + "loss": 2.7867, + "step": 277600 + }, + { + "epoch": 0.07243999710031401, + "grad_norm": 7.00774621963501, + "learning_rate": 9.929515466506675e-06, + "loss": 2.8419, + "step": 277800 + }, + { + "epoch": 0.07249214972601618, + "grad_norm": 6.623013019561768, + "learning_rate": 9.929375532048318e-06, + "loss": 2.7692, + "step": 278000 + }, + { + "epoch": 0.07254430235171835, + "grad_norm": 5.8589582443237305, + "learning_rate": 9.929235459808233e-06, + "loss": 2.8091, + "step": 278200 + }, + { + "epoch": 0.07259645497742052, + "grad_norm": 5.785161018371582, + "learning_rate": 9.92909524979033e-06, + "loss": 2.8113, + "step": 278400 + }, + { + "epoch": 0.0726486076031227, + "grad_norm": 6.4824018478393555, + "learning_rate": 9.928954901998535e-06, + "loss": 2.7695, + "step": 278600 + }, + { + "epoch": 0.07270076022882486, + "grad_norm": 5.669132232666016, + "learning_rate": 9.928814416436764e-06, + "loss": 2.8168, + "step": 278800 + }, + { + "epoch": 0.07275291285452704, + "grad_norm": 5.815093040466309, + "learning_rate": 9.92867379310895e-06, + "loss": 2.8186, + "step": 279000 + }, + { + "epoch": 0.0728050654802292, + "grad_norm": 6.025607585906982, + "learning_rate": 9.928533032019018e-06, + "loss": 2.7955, + "step": 279200 + }, + { + "epoch": 0.07285721810593138, + "grad_norm": 6.561540126800537, + "learning_rate": 9.928392133170906e-06, + "loss": 2.8035, + "step": 279400 + }, + { + "epoch": 0.07290937073163355, + "grad_norm": 6.505258560180664, + "learning_rate": 9.928251096568551e-06, + "loss": 2.8136, + "step": 279600 + }, + { + "epoch": 0.07296152335733572, + "grad_norm": 6.027585506439209, + "learning_rate": 9.928109922215895e-06, + "loss": 2.819, + "step": 279800 + }, + { + "epoch": 0.07301367598303787, + "grad_norm": 6.295257091522217, 
+ "learning_rate": 9.927968610116885e-06, + "loss": 2.7694, + "step": 280000 + }, + { + "epoch": 0.07306582860874004, + "grad_norm": 5.988527297973633, + "learning_rate": 9.92782716027547e-06, + "loss": 2.8237, + "step": 280200 + }, + { + "epoch": 0.07311798123444221, + "grad_norm": 6.000006198883057, + "learning_rate": 9.927685572695602e-06, + "loss": 2.7769, + "step": 280400 + }, + { + "epoch": 0.07317013386014438, + "grad_norm": 5.901209831237793, + "learning_rate": 9.927543847381242e-06, + "loss": 2.8113, + "step": 280600 + }, + { + "epoch": 0.07322228648584655, + "grad_norm": 6.257493019104004, + "learning_rate": 9.927401984336351e-06, + "loss": 2.803, + "step": 280800 + }, + { + "epoch": 0.07327443911154873, + "grad_norm": 6.857747554779053, + "learning_rate": 9.927259983564892e-06, + "loss": 2.7732, + "step": 281000 + }, + { + "epoch": 0.0733265917372509, + "grad_norm": 6.075799465179443, + "learning_rate": 9.927117845070834e-06, + "loss": 2.8093, + "step": 281200 + }, + { + "epoch": 0.07337874436295307, + "grad_norm": 6.6486663818359375, + "learning_rate": 9.926975568858152e-06, + "loss": 2.7942, + "step": 281400 + }, + { + "epoch": 0.07343089698865524, + "grad_norm": 6.643108367919922, + "learning_rate": 9.926833154930823e-06, + "loss": 2.7644, + "step": 281600 + }, + { + "epoch": 0.0734830496143574, + "grad_norm": 6.577000141143799, + "learning_rate": 9.926690603292825e-06, + "loss": 2.8179, + "step": 281800 + }, + { + "epoch": 0.07353520224005958, + "grad_norm": 6.4486083984375, + "learning_rate": 9.926547913948146e-06, + "loss": 2.8009, + "step": 282000 + }, + { + "epoch": 0.07358735486576175, + "grad_norm": 6.277480602264404, + "learning_rate": 9.92640508690077e-06, + "loss": 2.8281, + "step": 282200 + }, + { + "epoch": 0.07363950749146392, + "grad_norm": 5.925256729125977, + "learning_rate": 9.926262122154692e-06, + "loss": 2.7898, + "step": 282400 + }, + { + "epoch": 0.07369166011716609, + "grad_norm": 6.8221049308776855, + "learning_rate": 9.926119019713908e-06, + "loss": 2.7954, + "step": 282600 + }, + { + "epoch": 0.07374381274286826, + "grad_norm": 6.1247429847717285, + "learning_rate": 9.925975779582417e-06, + "loss": 2.7843, + "step": 282800 + }, + { + "epoch": 0.07379596536857043, + "grad_norm": 6.930478572845459, + "learning_rate": 9.925832401764222e-06, + "loss": 2.7843, + "step": 283000 + }, + { + "epoch": 0.0738481179942726, + "grad_norm": 6.8114190101623535, + "learning_rate": 9.925688886263333e-06, + "loss": 2.7745, + "step": 283200 + }, + { + "epoch": 0.07390027061997477, + "grad_norm": 6.889013767242432, + "learning_rate": 9.92554523308376e-06, + "loss": 2.8041, + "step": 283400 + }, + { + "epoch": 0.07395242324567694, + "grad_norm": 5.969966411590576, + "learning_rate": 9.925401442229518e-06, + "loss": 2.7588, + "step": 283600 + }, + { + "epoch": 0.07400457587137911, + "grad_norm": 6.67039155960083, + "learning_rate": 9.925257513704627e-06, + "loss": 2.7643, + "step": 283800 + }, + { + "epoch": 0.07405672849708128, + "grad_norm": 6.659237861633301, + "learning_rate": 9.925113447513108e-06, + "loss": 2.7798, + "step": 284000 + }, + { + "epoch": 0.07410888112278345, + "grad_norm": 5.949248790740967, + "learning_rate": 9.924969243658991e-06, + "loss": 2.7853, + "step": 284200 + }, + { + "epoch": 0.07416103374848562, + "grad_norm": 6.595699310302734, + "learning_rate": 9.924824902146304e-06, + "loss": 2.8293, + "step": 284400 + }, + { + "epoch": 0.07421318637418779, + "grad_norm": 6.128589630126953, + "learning_rate": 9.924680422979082e-06, + "loss": 2.8168, + 
"step": 284600 + }, + { + "epoch": 0.07426533899988996, + "grad_norm": 6.020960330963135, + "learning_rate": 9.924535806161367e-06, + "loss": 2.7953, + "step": 284800 + }, + { + "epoch": 0.07431749162559213, + "grad_norm": 6.3884358406066895, + "learning_rate": 9.924391051697194e-06, + "loss": 2.8002, + "step": 285000 + }, + { + "epoch": 0.0743696442512943, + "grad_norm": 6.152309417724609, + "learning_rate": 9.924246159590614e-06, + "loss": 2.7912, + "step": 285200 + }, + { + "epoch": 0.07442179687699647, + "grad_norm": 6.479878902435303, + "learning_rate": 9.924101129845678e-06, + "loss": 2.7759, + "step": 285400 + }, + { + "epoch": 0.07447394950269864, + "grad_norm": 6.872642517089844, + "learning_rate": 9.923955962466437e-06, + "loss": 2.801, + "step": 285600 + }, + { + "epoch": 0.0745261021284008, + "grad_norm": 6.354683876037598, + "learning_rate": 9.923810657456947e-06, + "loss": 2.7839, + "step": 285800 + }, + { + "epoch": 0.07457825475410297, + "grad_norm": 6.509720325469971, + "learning_rate": 9.923665214821274e-06, + "loss": 2.7437, + "step": 286000 + }, + { + "epoch": 0.07463040737980514, + "grad_norm": 7.049443244934082, + "learning_rate": 9.92351963456348e-06, + "loss": 2.8347, + "step": 286200 + }, + { + "epoch": 0.07468256000550731, + "grad_norm": 5.861264705657959, + "learning_rate": 9.923373916687634e-06, + "loss": 2.8464, + "step": 286400 + }, + { + "epoch": 0.07473471263120948, + "grad_norm": 6.284032821655273, + "learning_rate": 9.923228061197814e-06, + "loss": 2.8014, + "step": 286600 + }, + { + "epoch": 0.07478686525691165, + "grad_norm": 7.213873386383057, + "learning_rate": 9.92308206809809e-06, + "loss": 2.8036, + "step": 286800 + }, + { + "epoch": 0.07483901788261382, + "grad_norm": 5.673182010650635, + "learning_rate": 9.922935937392545e-06, + "loss": 2.7957, + "step": 287000 + }, + { + "epoch": 0.074891170508316, + "grad_norm": 6.418299198150635, + "learning_rate": 9.922789669085266e-06, + "loss": 2.7754, + "step": 287200 + }, + { + "epoch": 0.07494332313401816, + "grad_norm": 6.251279830932617, + "learning_rate": 9.92264326318034e-06, + "loss": 2.8254, + "step": 287400 + }, + { + "epoch": 0.07499547575972033, + "grad_norm": 11.405594825744629, + "learning_rate": 9.922496719681858e-06, + "loss": 2.8577, + "step": 287600 + }, + { + "epoch": 0.0750476283854225, + "grad_norm": 5.6194963455200195, + "learning_rate": 9.922350038593918e-06, + "loss": 2.7584, + "step": 287800 + }, + { + "epoch": 0.07509978101112467, + "grad_norm": 6.875600814819336, + "learning_rate": 9.922203219920617e-06, + "loss": 2.7921, + "step": 288000 + }, + { + "epoch": 0.07515193363682685, + "grad_norm": 5.844818592071533, + "learning_rate": 9.92205626366606e-06, + "loss": 2.8321, + "step": 288200 + }, + { + "epoch": 0.07520408626252902, + "grad_norm": 6.084321975708008, + "learning_rate": 9.921909169834357e-06, + "loss": 2.7855, + "step": 288400 + }, + { + "epoch": 0.07525623888823119, + "grad_norm": 6.810995101928711, + "learning_rate": 9.921761938429615e-06, + "loss": 2.7948, + "step": 288600 + }, + { + "epoch": 0.07530839151393336, + "grad_norm": 6.287647724151611, + "learning_rate": 9.921614569455956e-06, + "loss": 2.815, + "step": 288800 + }, + { + "epoch": 0.07536054413963553, + "grad_norm": 6.69083833694458, + "learning_rate": 9.921467062917492e-06, + "loss": 2.7916, + "step": 289000 + }, + { + "epoch": 0.0754126967653377, + "grad_norm": 6.045042514801025, + "learning_rate": 9.921319418818347e-06, + "loss": 2.7645, + "step": 289200 + }, + { + "epoch": 0.07546484939103987, + 
"grad_norm": 6.019128799438477, + "learning_rate": 9.921171637162654e-06, + "loss": 2.813, + "step": 289400 + }, + { + "epoch": 0.07551700201674204, + "grad_norm": 6.279255390167236, + "learning_rate": 9.921023717954537e-06, + "loss": 2.8184, + "step": 289600 + }, + { + "epoch": 0.07556915464244421, + "grad_norm": 5.646428108215332, + "learning_rate": 9.920875661198132e-06, + "loss": 2.7849, + "step": 289800 + }, + { + "epoch": 0.07562130726814638, + "grad_norm": 6.301240921020508, + "learning_rate": 9.92072746689758e-06, + "loss": 2.7739, + "step": 290000 + }, + { + "epoch": 0.07567345989384855, + "grad_norm": 6.263399600982666, + "learning_rate": 9.920579135057022e-06, + "loss": 2.8212, + "step": 290200 + }, + { + "epoch": 0.07572561251955072, + "grad_norm": 6.300200939178467, + "learning_rate": 9.920430665680602e-06, + "loss": 2.8088, + "step": 290400 + }, + { + "epoch": 0.07577776514525289, + "grad_norm": 6.656348705291748, + "learning_rate": 9.920282058772472e-06, + "loss": 2.8058, + "step": 290600 + }, + { + "epoch": 0.07582991777095506, + "grad_norm": 6.327916622161865, + "learning_rate": 9.920133314336785e-06, + "loss": 2.8079, + "step": 290800 + }, + { + "epoch": 0.07588207039665723, + "grad_norm": 6.084270477294922, + "learning_rate": 9.919984432377698e-06, + "loss": 2.789, + "step": 291000 + }, + { + "epoch": 0.0759342230223594, + "grad_norm": 6.9517388343811035, + "learning_rate": 9.919835412899375e-06, + "loss": 2.7942, + "step": 291200 + }, + { + "epoch": 0.07598637564806157, + "grad_norm": 6.387750148773193, + "learning_rate": 9.919686255905978e-06, + "loss": 2.7857, + "step": 291400 + }, + { + "epoch": 0.07603852827376374, + "grad_norm": 6.805675506591797, + "learning_rate": 9.91953696140168e-06, + "loss": 2.8233, + "step": 291600 + }, + { + "epoch": 0.0760906808994659, + "grad_norm": 6.1650214195251465, + "learning_rate": 9.919387529390648e-06, + "loss": 2.8102, + "step": 291800 + }, + { + "epoch": 0.07614283352516807, + "grad_norm": 6.393313407897949, + "learning_rate": 9.919237959877064e-06, + "loss": 2.8349, + "step": 292000 + }, + { + "epoch": 0.07619498615087024, + "grad_norm": 6.139355659484863, + "learning_rate": 9.919088252865107e-06, + "loss": 2.7698, + "step": 292200 + }, + { + "epoch": 0.07624713877657241, + "grad_norm": 6.505699157714844, + "learning_rate": 9.918938408358961e-06, + "loss": 2.8034, + "step": 292400 + }, + { + "epoch": 0.07629929140227458, + "grad_norm": 6.276296138763428, + "learning_rate": 9.918788426362816e-06, + "loss": 2.7709, + "step": 292600 + }, + { + "epoch": 0.07635144402797675, + "grad_norm": 5.510497570037842, + "learning_rate": 9.918638306880861e-06, + "loss": 2.7805, + "step": 292800 + }, + { + "epoch": 0.07640359665367892, + "grad_norm": 6.827136993408203, + "learning_rate": 9.918488049917294e-06, + "loss": 2.7736, + "step": 293000 + }, + { + "epoch": 0.07645574927938109, + "grad_norm": 6.214416980743408, + "learning_rate": 9.918337655476315e-06, + "loss": 2.8018, + "step": 293200 + }, + { + "epoch": 0.07650790190508326, + "grad_norm": 6.037956237792969, + "learning_rate": 9.91818712356213e-06, + "loss": 2.7831, + "step": 293400 + }, + { + "epoch": 0.07656005453078543, + "grad_norm": 7.491139888763428, + "learning_rate": 9.918036454178942e-06, + "loss": 2.8054, + "step": 293600 + }, + { + "epoch": 0.0766122071564876, + "grad_norm": 7.301893711090088, + "learning_rate": 9.917885647330963e-06, + "loss": 2.8469, + "step": 293800 + }, + { + "epoch": 0.07666435978218977, + "grad_norm": 6.116480350494385, + "learning_rate": 
9.917734703022411e-06, + "loss": 2.7881, + "step": 294000 + }, + { + "epoch": 0.07671651240789194, + "grad_norm": 6.379162788391113, + "learning_rate": 9.917583621257504e-06, + "loss": 2.7695, + "step": 294200 + }, + { + "epoch": 0.07676866503359411, + "grad_norm": 7.193239212036133, + "learning_rate": 9.917432402040463e-06, + "loss": 2.7596, + "step": 294400 + }, + { + "epoch": 0.07682081765929628, + "grad_norm": 6.380529880523682, + "learning_rate": 9.917281045375518e-06, + "loss": 2.8354, + "step": 294600 + }, + { + "epoch": 0.07687297028499845, + "grad_norm": 6.858937740325928, + "learning_rate": 9.917129551266897e-06, + "loss": 2.7872, + "step": 294800 + }, + { + "epoch": 0.07692512291070062, + "grad_norm": 6.391117572784424, + "learning_rate": 9.916977919718837e-06, + "loss": 2.8193, + "step": 295000 + }, + { + "epoch": 0.0769772755364028, + "grad_norm": 6.219594478607178, + "learning_rate": 9.916826150735573e-06, + "loss": 2.8042, + "step": 295200 + }, + { + "epoch": 0.07702942816210497, + "grad_norm": 6.027685642242432, + "learning_rate": 9.91667424432135e-06, + "loss": 2.8282, + "step": 295400 + }, + { + "epoch": 0.07708158078780714, + "grad_norm": 5.730934143066406, + "learning_rate": 9.916522200480412e-06, + "loss": 2.766, + "step": 295600 + }, + { + "epoch": 0.0771337334135093, + "grad_norm": 6.417144298553467, + "learning_rate": 9.91637001921701e-06, + "loss": 2.8094, + "step": 295800 + }, + { + "epoch": 0.07718588603921148, + "grad_norm": 6.070188522338867, + "learning_rate": 9.916217700535395e-06, + "loss": 2.7917, + "step": 296000 + }, + { + "epoch": 0.07723803866491365, + "grad_norm": 5.724254131317139, + "learning_rate": 9.91606524443983e-06, + "loss": 2.8104, + "step": 296200 + }, + { + "epoch": 0.07729019129061582, + "grad_norm": 5.841590404510498, + "learning_rate": 9.91591265093457e-06, + "loss": 2.7847, + "step": 296400 + }, + { + "epoch": 0.07734234391631799, + "grad_norm": 6.103632926940918, + "learning_rate": 9.915759920023886e-06, + "loss": 2.7978, + "step": 296600 + }, + { + "epoch": 0.07739449654202016, + "grad_norm": 6.138303756713867, + "learning_rate": 9.915607051712042e-06, + "loss": 2.7617, + "step": 296800 + }, + { + "epoch": 0.07744664916772233, + "grad_norm": 6.288359642028809, + "learning_rate": 9.915454046003312e-06, + "loss": 2.7719, + "step": 297000 + }, + { + "epoch": 0.0774988017934245, + "grad_norm": 7.5223002433776855, + "learning_rate": 9.915300902901976e-06, + "loss": 2.8224, + "step": 297200 + }, + { + "epoch": 0.07755095441912667, + "grad_norm": 6.539451599121094, + "learning_rate": 9.915147622412311e-06, + "loss": 2.7655, + "step": 297400 + }, + { + "epoch": 0.07760310704482883, + "grad_norm": 6.718863487243652, + "learning_rate": 9.914994204538603e-06, + "loss": 2.8393, + "step": 297600 + }, + { + "epoch": 0.077655259670531, + "grad_norm": 6.5553669929504395, + "learning_rate": 9.914840649285142e-06, + "loss": 2.7846, + "step": 297800 + }, + { + "epoch": 0.07770741229623317, + "grad_norm": 5.977219104766846, + "learning_rate": 9.914686956656214e-06, + "loss": 2.7882, + "step": 298000 + }, + { + "epoch": 0.07775956492193534, + "grad_norm": 6.6102752685546875, + "learning_rate": 9.91453312665612e-06, + "loss": 2.7676, + "step": 298200 + }, + { + "epoch": 0.07781171754763751, + "grad_norm": 6.522745609283447, + "learning_rate": 9.91437915928916e-06, + "loss": 2.7544, + "step": 298400 + }, + { + "epoch": 0.07786387017333968, + "grad_norm": 6.287838935852051, + "learning_rate": 9.914225054559636e-06, + "loss": 2.8284, + "step": 298600 + }, + { 
+ "epoch": 0.07791602279904185, + "grad_norm": 6.7383270263671875, + "learning_rate": 9.914070812471853e-06, + "loss": 2.7425, + "step": 298800 + }, + { + "epoch": 0.07796817542474402, + "grad_norm": 6.654935359954834, + "learning_rate": 9.913916433030126e-06, + "loss": 2.7467, + "step": 299000 + }, + { + "epoch": 0.07802032805044619, + "grad_norm": 5.797543525695801, + "learning_rate": 9.91376191623877e-06, + "loss": 2.789, + "step": 299200 + }, + { + "epoch": 0.07807248067614836, + "grad_norm": 8.258265495300293, + "learning_rate": 9.913607262102101e-06, + "loss": 2.7822, + "step": 299400 + }, + { + "epoch": 0.07812463330185053, + "grad_norm": 5.814871788024902, + "learning_rate": 9.913452470624443e-06, + "loss": 2.763, + "step": 299600 + }, + { + "epoch": 0.0781767859275527, + "grad_norm": 6.059165954589844, + "learning_rate": 9.913297541810123e-06, + "loss": 2.7726, + "step": 299800 + }, + { + "epoch": 0.07822893855325487, + "grad_norm": 6.355923175811768, + "learning_rate": 9.913142475663472e-06, + "loss": 2.8118, + "step": 300000 + }, + { + "epoch": 0.07828109117895704, + "grad_norm": 6.16987419128418, + "learning_rate": 9.912987272188826e-06, + "loss": 2.7481, + "step": 300200 + }, + { + "epoch": 0.07833324380465921, + "grad_norm": 6.023130416870117, + "learning_rate": 9.912831931390518e-06, + "loss": 2.7463, + "step": 300400 + }, + { + "epoch": 0.07838539643036138, + "grad_norm": 5.977993011474609, + "learning_rate": 9.912676453272894e-06, + "loss": 2.845, + "step": 300600 + }, + { + "epoch": 0.07843754905606355, + "grad_norm": 6.411383152008057, + "learning_rate": 9.912520837840297e-06, + "loss": 2.7646, + "step": 300800 + }, + { + "epoch": 0.07848970168176572, + "grad_norm": 6.466489791870117, + "learning_rate": 9.91236508509708e-06, + "loss": 2.7944, + "step": 301000 + }, + { + "epoch": 0.07854185430746789, + "grad_norm": 6.878299713134766, + "learning_rate": 9.912209195047594e-06, + "loss": 2.798, + "step": 301200 + }, + { + "epoch": 0.07859400693317006, + "grad_norm": 6.08614444732666, + "learning_rate": 9.912053167696195e-06, + "loss": 2.7649, + "step": 301400 + }, + { + "epoch": 0.07864615955887223, + "grad_norm": 6.1333909034729, + "learning_rate": 9.911897003047251e-06, + "loss": 2.7795, + "step": 301600 + }, + { + "epoch": 0.0786983121845744, + "grad_norm": 6.349955081939697, + "learning_rate": 9.91174070110512e-06, + "loss": 2.7147, + "step": 301800 + }, + { + "epoch": 0.07875046481027657, + "grad_norm": 6.769674301147461, + "learning_rate": 9.91158426187417e-06, + "loss": 2.8079, + "step": 302000 + }, + { + "epoch": 0.07880261743597874, + "grad_norm": 6.519297122955322, + "learning_rate": 9.91142768535878e-06, + "loss": 2.7955, + "step": 302200 + }, + { + "epoch": 0.07885477006168091, + "grad_norm": 7.310003757476807, + "learning_rate": 9.911270971563322e-06, + "loss": 2.7827, + "step": 302400 + }, + { + "epoch": 0.07890692268738309, + "grad_norm": 5.958581924438477, + "learning_rate": 9.911114120492177e-06, + "loss": 2.7466, + "step": 302600 + }, + { + "epoch": 0.07895907531308526, + "grad_norm": 5.792890548706055, + "learning_rate": 9.91095713214973e-06, + "loss": 2.7629, + "step": 302800 + }, + { + "epoch": 0.07901122793878743, + "grad_norm": 6.76243782043457, + "learning_rate": 9.910800006540368e-06, + "loss": 2.8411, + "step": 303000 + }, + { + "epoch": 0.0790633805644896, + "grad_norm": 6.586005687713623, + "learning_rate": 9.910642743668486e-06, + "loss": 2.7527, + "step": 303200 + }, + { + "epoch": 0.07911553319019177, + "grad_norm": 6.768370628356934, + 
"learning_rate": 9.910485343538474e-06, + "loss": 2.7507, + "step": 303400 + }, + { + "epoch": 0.07916768581589392, + "grad_norm": 6.578641891479492, + "learning_rate": 9.910327806154735e-06, + "loss": 2.8037, + "step": 303600 + }, + { + "epoch": 0.0792198384415961, + "grad_norm": 6.524317741394043, + "learning_rate": 9.910170131521675e-06, + "loss": 2.7402, + "step": 303800 + }, + { + "epoch": 0.07927199106729826, + "grad_norm": 6.295195579528809, + "learning_rate": 9.910012319643696e-06, + "loss": 2.8275, + "step": 304000 + }, + { + "epoch": 0.07932414369300043, + "grad_norm": 7.033209800720215, + "learning_rate": 9.909854370525212e-06, + "loss": 2.7591, + "step": 304200 + }, + { + "epoch": 0.0793762963187026, + "grad_norm": 6.22572660446167, + "learning_rate": 9.909696284170636e-06, + "loss": 2.7604, + "step": 304400 + }, + { + "epoch": 0.07942844894440478, + "grad_norm": 6.447390556335449, + "learning_rate": 9.909538060584387e-06, + "loss": 2.799, + "step": 304600 + }, + { + "epoch": 0.07948060157010695, + "grad_norm": 6.915390491485596, + "learning_rate": 9.909379699770889e-06, + "loss": 2.792, + "step": 304800 + }, + { + "epoch": 0.07953275419580912, + "grad_norm": 6.417814254760742, + "learning_rate": 9.909221201734568e-06, + "loss": 2.7755, + "step": 305000 + }, + { + "epoch": 0.07958490682151129, + "grad_norm": 6.955142974853516, + "learning_rate": 9.909062566479854e-06, + "loss": 2.7464, + "step": 305200 + }, + { + "epoch": 0.07963705944721346, + "grad_norm": 6.3183393478393555, + "learning_rate": 9.908903794011183e-06, + "loss": 2.7509, + "step": 305400 + }, + { + "epoch": 0.07968921207291563, + "grad_norm": 6.242965221405029, + "learning_rate": 9.908744884332988e-06, + "loss": 2.8254, + "step": 305600 + }, + { + "epoch": 0.0797413646986178, + "grad_norm": 6.4110941886901855, + "learning_rate": 9.908585837449714e-06, + "loss": 2.7494, + "step": 305800 + }, + { + "epoch": 0.07979351732431997, + "grad_norm": 6.298842430114746, + "learning_rate": 9.908426653365805e-06, + "loss": 2.7913, + "step": 306000 + }, + { + "epoch": 0.07984566995002214, + "grad_norm": 6.478790760040283, + "learning_rate": 9.908267332085712e-06, + "loss": 2.7974, + "step": 306200 + }, + { + "epoch": 0.07989782257572431, + "grad_norm": 6.1008453369140625, + "learning_rate": 9.908107873613888e-06, + "loss": 2.79, + "step": 306400 + }, + { + "epoch": 0.07994997520142648, + "grad_norm": 6.282758712768555, + "learning_rate": 9.90794827795479e-06, + "loss": 2.7768, + "step": 306600 + }, + { + "epoch": 0.08000212782712865, + "grad_norm": 6.637455463409424, + "learning_rate": 9.907788545112879e-06, + "loss": 2.7501, + "step": 306800 + }, + { + "epoch": 0.08005428045283082, + "grad_norm": 6.408087730407715, + "learning_rate": 9.907628675092618e-06, + "loss": 2.8239, + "step": 307000 + }, + { + "epoch": 0.08010643307853299, + "grad_norm": 5.750133991241455, + "learning_rate": 9.907468667898478e-06, + "loss": 2.7664, + "step": 307200 + }, + { + "epoch": 0.08015858570423516, + "grad_norm": 6.643690586090088, + "learning_rate": 9.90730852353493e-06, + "loss": 2.7875, + "step": 307400 + }, + { + "epoch": 0.08021073832993733, + "grad_norm": 6.663003921508789, + "learning_rate": 9.907148242006451e-06, + "loss": 2.758, + "step": 307600 + }, + { + "epoch": 0.0802628909556395, + "grad_norm": 7.2608113288879395, + "learning_rate": 9.906987823317517e-06, + "loss": 2.7628, + "step": 307800 + }, + { + "epoch": 0.08031504358134167, + "grad_norm": 5.9124274253845215, + "learning_rate": 9.906827267472619e-06, + "loss": 2.766, + 
"step": 308000 + }, + { + "epoch": 0.08036719620704384, + "grad_norm": 6.0799241065979, + "learning_rate": 9.90666657447624e-06, + "loss": 2.7949, + "step": 308200 + }, + { + "epoch": 0.08041934883274601, + "grad_norm": 6.578120231628418, + "learning_rate": 9.906505744332873e-06, + "loss": 2.7463, + "step": 308400 + }, + { + "epoch": 0.08047150145844818, + "grad_norm": 6.545755386352539, + "learning_rate": 9.906344777047012e-06, + "loss": 2.771, + "step": 308600 + }, + { + "epoch": 0.08052365408415035, + "grad_norm": 6.288052082061768, + "learning_rate": 9.906183672623158e-06, + "loss": 2.787, + "step": 308800 + }, + { + "epoch": 0.08057580670985252, + "grad_norm": 7.15657377243042, + "learning_rate": 9.906022431065814e-06, + "loss": 2.7635, + "step": 309000 + }, + { + "epoch": 0.0806279593355547, + "grad_norm": 6.247620582580566, + "learning_rate": 9.905861052379484e-06, + "loss": 2.8279, + "step": 309200 + }, + { + "epoch": 0.08068011196125685, + "grad_norm": 6.499161243438721, + "learning_rate": 9.905699536568682e-06, + "loss": 2.8063, + "step": 309400 + }, + { + "epoch": 0.08073226458695902, + "grad_norm": 5.619341850280762, + "learning_rate": 9.905537883637923e-06, + "loss": 2.776, + "step": 309600 + }, + { + "epoch": 0.08078441721266119, + "grad_norm": 6.434546947479248, + "learning_rate": 9.905376093591722e-06, + "loss": 2.8069, + "step": 309800 + }, + { + "epoch": 0.08083656983836336, + "grad_norm": 5.9819207191467285, + "learning_rate": 9.905214166434605e-06, + "loss": 2.7757, + "step": 310000 + }, + { + "epoch": 0.08088872246406553, + "grad_norm": 6.195422649383545, + "learning_rate": 9.905052102171093e-06, + "loss": 2.7516, + "step": 310200 + }, + { + "epoch": 0.0809408750897677, + "grad_norm": 6.815351486206055, + "learning_rate": 9.904889900805721e-06, + "loss": 2.7657, + "step": 310400 + }, + { + "epoch": 0.08099302771546987, + "grad_norm": 6.130044937133789, + "learning_rate": 9.904727562343021e-06, + "loss": 2.8082, + "step": 310600 + }, + { + "epoch": 0.08104518034117204, + "grad_norm": 6.268271446228027, + "learning_rate": 9.90456508678753e-06, + "loss": 2.7433, + "step": 310800 + }, + { + "epoch": 0.08109733296687421, + "grad_norm": 6.54405403137207, + "learning_rate": 9.904402474143789e-06, + "loss": 2.7833, + "step": 311000 + }, + { + "epoch": 0.08114948559257638, + "grad_norm": 6.206225872039795, + "learning_rate": 9.904239724416345e-06, + "loss": 2.7515, + "step": 311200 + }, + { + "epoch": 0.08120163821827855, + "grad_norm": 6.3456645011901855, + "learning_rate": 9.904076837609745e-06, + "loss": 2.8183, + "step": 311400 + }, + { + "epoch": 0.08125379084398072, + "grad_norm": 6.351884841918945, + "learning_rate": 9.903913813728543e-06, + "loss": 2.7917, + "step": 311600 + }, + { + "epoch": 0.0813059434696829, + "grad_norm": 6.585855960845947, + "learning_rate": 9.903750652777296e-06, + "loss": 2.7465, + "step": 311800 + }, + { + "epoch": 0.08135809609538507, + "grad_norm": 6.618515491485596, + "learning_rate": 9.903587354760562e-06, + "loss": 2.787, + "step": 312000 + }, + { + "epoch": 0.08141024872108724, + "grad_norm": 6.154926300048828, + "learning_rate": 9.90342391968291e-06, + "loss": 2.7838, + "step": 312200 + }, + { + "epoch": 0.0814624013467894, + "grad_norm": 6.921688079833984, + "learning_rate": 9.903260347548904e-06, + "loss": 2.7479, + "step": 312400 + }, + { + "epoch": 0.08151455397249158, + "grad_norm": 7.639387130737305, + "learning_rate": 9.903096638363119e-06, + "loss": 2.752, + "step": 312600 + }, + { + "epoch": 0.08156670659819375, + "grad_norm": 
6.138737201690674, + "learning_rate": 9.902932792130127e-06, + "loss": 2.8019, + "step": 312800 + }, + { + "epoch": 0.08161885922389592, + "grad_norm": 6.133929252624512, + "learning_rate": 9.902768808854513e-06, + "loss": 2.7514, + "step": 313000 + }, + { + "epoch": 0.08167101184959809, + "grad_norm": 6.8112359046936035, + "learning_rate": 9.902604688540855e-06, + "loss": 2.8003, + "step": 313200 + }, + { + "epoch": 0.08172316447530026, + "grad_norm": 6.365881443023682, + "learning_rate": 9.902440431193744e-06, + "loss": 2.7725, + "step": 313400 + }, + { + "epoch": 0.08177531710100243, + "grad_norm": 6.9227447509765625, + "learning_rate": 9.902276036817772e-06, + "loss": 2.7692, + "step": 313600 + }, + { + "epoch": 0.0818274697267046, + "grad_norm": 7.009827136993408, + "learning_rate": 9.90211150541753e-06, + "loss": 2.7507, + "step": 313800 + }, + { + "epoch": 0.08187962235240677, + "grad_norm": 6.5645976066589355, + "learning_rate": 9.90194683699762e-06, + "loss": 2.7632, + "step": 314000 + }, + { + "epoch": 0.08193177497810894, + "grad_norm": 6.148155689239502, + "learning_rate": 9.901782031562643e-06, + "loss": 2.7872, + "step": 314200 + }, + { + "epoch": 0.08198392760381111, + "grad_norm": 6.268352031707764, + "learning_rate": 9.901617089117209e-06, + "loss": 2.7974, + "step": 314400 + }, + { + "epoch": 0.08203608022951328, + "grad_norm": 5.926452159881592, + "learning_rate": 9.901452009665922e-06, + "loss": 2.7486, + "step": 314600 + }, + { + "epoch": 0.08208823285521545, + "grad_norm": 6.640249729156494, + "learning_rate": 9.901286793213402e-06, + "loss": 2.7851, + "step": 314800 + }, + { + "epoch": 0.08214038548091762, + "grad_norm": 7.127112865447998, + "learning_rate": 9.901121439764263e-06, + "loss": 2.7037, + "step": 315000 + }, + { + "epoch": 0.08219253810661979, + "grad_norm": 7.389196872711182, + "learning_rate": 9.90095594932313e-06, + "loss": 2.7886, + "step": 315200 + }, + { + "epoch": 0.08224469073232195, + "grad_norm": 6.33438777923584, + "learning_rate": 9.900790321894627e-06, + "loss": 2.7621, + "step": 315400 + }, + { + "epoch": 0.08229684335802412, + "grad_norm": 6.59637451171875, + "learning_rate": 9.900624557483383e-06, + "loss": 2.7757, + "step": 315600 + }, + { + "epoch": 0.08234899598372629, + "grad_norm": 6.34528923034668, + "learning_rate": 9.900458656094031e-06, + "loss": 2.7377, + "step": 315800 + }, + { + "epoch": 0.08240114860942846, + "grad_norm": 6.219143390655518, + "learning_rate": 9.90029261773121e-06, + "loss": 2.7665, + "step": 316000 + }, + { + "epoch": 0.08245330123513063, + "grad_norm": 6.358184337615967, + "learning_rate": 9.900126442399562e-06, + "loss": 2.7857, + "step": 316200 + }, + { + "epoch": 0.0825054538608328, + "grad_norm": 6.92657470703125, + "learning_rate": 9.899960130103728e-06, + "loss": 2.7823, + "step": 316400 + }, + { + "epoch": 0.08255760648653497, + "grad_norm": 6.66539192199707, + "learning_rate": 9.89979368084836e-06, + "loss": 2.7912, + "step": 316600 + }, + { + "epoch": 0.08260975911223714, + "grad_norm": 6.198975563049316, + "learning_rate": 9.899627094638107e-06, + "loss": 2.7537, + "step": 316800 + }, + { + "epoch": 0.08266191173793931, + "grad_norm": 5.979285717010498, + "learning_rate": 9.899460371477629e-06, + "loss": 2.7273, + "step": 317000 + }, + { + "epoch": 0.08271406436364148, + "grad_norm": 6.5081071853637695, + "learning_rate": 9.899293511371582e-06, + "loss": 2.7514, + "step": 317200 + }, + { + "epoch": 0.08276621698934365, + "grad_norm": 7.048851490020752, + "learning_rate": 9.899126514324636e-06, + 
"loss": 2.7624, + "step": 317400 + }, + { + "epoch": 0.08281836961504582, + "grad_norm": 6.467654228210449, + "learning_rate": 9.89895938034145e-06, + "loss": 2.7895, + "step": 317600 + }, + { + "epoch": 0.08287052224074799, + "grad_norm": 6.6041646003723145, + "learning_rate": 9.898792109426705e-06, + "loss": 2.7708, + "step": 317800 + }, + { + "epoch": 0.08292267486645016, + "grad_norm": 6.265130996704102, + "learning_rate": 9.898624701585069e-06, + "loss": 2.7361, + "step": 318000 + }, + { + "epoch": 0.08297482749215233, + "grad_norm": 6.6074724197387695, + "learning_rate": 9.898457156821226e-06, + "loss": 2.7643, + "step": 318200 + }, + { + "epoch": 0.0830269801178545, + "grad_norm": 6.601407527923584, + "learning_rate": 9.898289475139857e-06, + "loss": 2.7805, + "step": 318400 + }, + { + "epoch": 0.08307913274355667, + "grad_norm": 6.4573845863342285, + "learning_rate": 9.89812165654565e-06, + "loss": 2.8072, + "step": 318600 + }, + { + "epoch": 0.08313128536925884, + "grad_norm": 6.106098175048828, + "learning_rate": 9.897953701043292e-06, + "loss": 2.7642, + "step": 318800 + }, + { + "epoch": 0.08318343799496102, + "grad_norm": 6.303261756896973, + "learning_rate": 9.897785608637484e-06, + "loss": 2.7252, + "step": 319000 + }, + { + "epoch": 0.08323559062066319, + "grad_norm": 6.720864772796631, + "learning_rate": 9.897617379332919e-06, + "loss": 2.8183, + "step": 319200 + }, + { + "epoch": 0.08328774324636536, + "grad_norm": 6.235776424407959, + "learning_rate": 9.897449013134301e-06, + "loss": 2.7476, + "step": 319400 + }, + { + "epoch": 0.08333989587206753, + "grad_norm": 7.237074851989746, + "learning_rate": 9.897280510046337e-06, + "loss": 2.7326, + "step": 319600 + }, + { + "epoch": 0.0833920484977697, + "grad_norm": 6.430893421173096, + "learning_rate": 9.897111870073736e-06, + "loss": 2.7449, + "step": 319800 + }, + { + "epoch": 0.08344420112347187, + "grad_norm": 6.649140357971191, + "learning_rate": 9.896943093221212e-06, + "loss": 2.778, + "step": 320000 + }, + { + "epoch": 0.08349635374917404, + "grad_norm": 6.112555980682373, + "learning_rate": 9.89677417949348e-06, + "loss": 2.7041, + "step": 320200 + }, + { + "epoch": 0.08354850637487621, + "grad_norm": 6.915252685546875, + "learning_rate": 9.896605128895265e-06, + "loss": 2.7855, + "step": 320400 + }, + { + "epoch": 0.08360065900057838, + "grad_norm": 6.36874532699585, + "learning_rate": 9.896435941431291e-06, + "loss": 2.7738, + "step": 320600 + }, + { + "epoch": 0.08365281162628055, + "grad_norm": 7.028889179229736, + "learning_rate": 9.896266617106285e-06, + "loss": 2.8031, + "step": 320800 + }, + { + "epoch": 0.08370496425198272, + "grad_norm": 6.048758029937744, + "learning_rate": 9.896097155924983e-06, + "loss": 2.7842, + "step": 321000 + }, + { + "epoch": 0.08375711687768488, + "grad_norm": 6.261166095733643, + "learning_rate": 9.89592755789212e-06, + "loss": 2.7553, + "step": 321200 + }, + { + "epoch": 0.08380926950338705, + "grad_norm": 6.835334300994873, + "learning_rate": 9.895757823012435e-06, + "loss": 2.7636, + "step": 321400 + }, + { + "epoch": 0.08386142212908922, + "grad_norm": 6.209692478179932, + "learning_rate": 9.895587951290675e-06, + "loss": 2.763, + "step": 321600 + }, + { + "epoch": 0.08391357475479139, + "grad_norm": 6.277283191680908, + "learning_rate": 9.895417942731589e-06, + "loss": 2.8042, + "step": 321800 + }, + { + "epoch": 0.08396572738049356, + "grad_norm": 6.796102523803711, + "learning_rate": 9.895247797339925e-06, + "loss": 2.7538, + "step": 322000 + }, + { + "epoch": 
0.08401788000619573, + "grad_norm": 6.786470890045166, + "learning_rate": 9.895077515120439e-06, + "loss": 2.7547, + "step": 322200 + }, + { + "epoch": 0.0840700326318979, + "grad_norm": 6.2995429039001465, + "learning_rate": 9.894907096077893e-06, + "loss": 2.7658, + "step": 322400 + }, + { + "epoch": 0.08412218525760007, + "grad_norm": 7.4601521492004395, + "learning_rate": 9.894736540217052e-06, + "loss": 2.7518, + "step": 322600 + }, + { + "epoch": 0.08417433788330224, + "grad_norm": 7.032712459564209, + "learning_rate": 9.89456584754268e-06, + "loss": 2.7879, + "step": 322800 + }, + { + "epoch": 0.08422649050900441, + "grad_norm": 6.522708892822266, + "learning_rate": 9.894395018059546e-06, + "loss": 2.782, + "step": 323000 + }, + { + "epoch": 0.08427864313470658, + "grad_norm": 6.497341632843018, + "learning_rate": 9.894224051772429e-06, + "loss": 2.7811, + "step": 323200 + }, + { + "epoch": 0.08433079576040875, + "grad_norm": 5.980485916137695, + "learning_rate": 9.894052948686108e-06, + "loss": 2.7278, + "step": 323400 + }, + { + "epoch": 0.08438294838611092, + "grad_norm": 6.1288628578186035, + "learning_rate": 9.893881708805364e-06, + "loss": 2.7643, + "step": 323600 + }, + { + "epoch": 0.08443510101181309, + "grad_norm": 6.619753360748291, + "learning_rate": 9.893710332134982e-06, + "loss": 2.7793, + "step": 323800 + }, + { + "epoch": 0.08448725363751526, + "grad_norm": 6.239343166351318, + "learning_rate": 9.893538818679754e-06, + "loss": 2.7257, + "step": 324000 + }, + { + "epoch": 0.08453940626321743, + "grad_norm": 6.4836883544921875, + "learning_rate": 9.893367168444474e-06, + "loss": 2.7462, + "step": 324200 + }, + { + "epoch": 0.0845915588889196, + "grad_norm": 6.484433650970459, + "learning_rate": 9.89319538143394e-06, + "loss": 2.7384, + "step": 324400 + }, + { + "epoch": 0.08464371151462177, + "grad_norm": 6.930042743682861, + "learning_rate": 9.893023457652951e-06, + "loss": 2.7905, + "step": 324600 + }, + { + "epoch": 0.08469586414032394, + "grad_norm": 5.899946212768555, + "learning_rate": 9.892851397106316e-06, + "loss": 2.7162, + "step": 324800 + }, + { + "epoch": 0.08474801676602611, + "grad_norm": 5.355585098266602, + "learning_rate": 9.892679199798843e-06, + "loss": 2.7872, + "step": 325000 + }, + { + "epoch": 0.08480016939172828, + "grad_norm": 7.755842685699463, + "learning_rate": 9.892506865735344e-06, + "loss": 2.7645, + "step": 325200 + }, + { + "epoch": 0.08485232201743045, + "grad_norm": 6.277362823486328, + "learning_rate": 9.892334394920638e-06, + "loss": 2.7905, + "step": 325400 + }, + { + "epoch": 0.08490447464313262, + "grad_norm": 6.218322277069092, + "learning_rate": 9.892161787359544e-06, + "loss": 2.7879, + "step": 325600 + }, + { + "epoch": 0.0849566272688348, + "grad_norm": 6.325297832489014, + "learning_rate": 9.891989043056886e-06, + "loss": 2.7732, + "step": 325800 + }, + { + "epoch": 0.08500877989453696, + "grad_norm": 6.6276445388793945, + "learning_rate": 9.891816162017495e-06, + "loss": 2.7643, + "step": 326000 + }, + { + "epoch": 0.08506093252023914, + "grad_norm": 6.802475929260254, + "learning_rate": 9.891643144246202e-06, + "loss": 2.7389, + "step": 326200 + }, + { + "epoch": 0.0851130851459413, + "grad_norm": 6.825811862945557, + "learning_rate": 9.89146998974784e-06, + "loss": 2.7736, + "step": 326400 + }, + { + "epoch": 0.08516523777164348, + "grad_norm": 6.184617519378662, + "learning_rate": 9.891296698527255e-06, + "loss": 2.7717, + "step": 326600 + }, + { + "epoch": 0.08521739039734565, + "grad_norm": 7.5419158935546875, + 
"learning_rate": 9.891123270589285e-06, + "loss": 2.7393, + "step": 326800 + }, + { + "epoch": 0.0852695430230478, + "grad_norm": 7.02357292175293, + "learning_rate": 9.89094970593878e-06, + "loss": 2.7573, + "step": 327000 + }, + { + "epoch": 0.08532169564874997, + "grad_norm": 6.308924674987793, + "learning_rate": 9.890776004580595e-06, + "loss": 2.7554, + "step": 327200 + }, + { + "epoch": 0.08537384827445214, + "grad_norm": 6.975244522094727, + "learning_rate": 9.890602166519578e-06, + "loss": 2.7893, + "step": 327400 + }, + { + "epoch": 0.08542600090015431, + "grad_norm": 6.773929119110107, + "learning_rate": 9.890428191760593e-06, + "loss": 2.7138, + "step": 327600 + }, + { + "epoch": 0.08547815352585648, + "grad_norm": 5.785812854766846, + "learning_rate": 9.890254080308498e-06, + "loss": 2.7643, + "step": 327800 + }, + { + "epoch": 0.08553030615155865, + "grad_norm": 6.477972507476807, + "learning_rate": 9.890079832168167e-06, + "loss": 2.7599, + "step": 328000 + }, + { + "epoch": 0.08558245877726083, + "grad_norm": 7.792716026306152, + "learning_rate": 9.889905447344463e-06, + "loss": 2.7773, + "step": 328200 + }, + { + "epoch": 0.085634611402963, + "grad_norm": 5.729294300079346, + "learning_rate": 9.889730925842266e-06, + "loss": 2.7661, + "step": 328400 + }, + { + "epoch": 0.08568676402866517, + "grad_norm": 5.776681900024414, + "learning_rate": 9.889556267666449e-06, + "loss": 2.714, + "step": 328600 + }, + { + "epoch": 0.08573891665436734, + "grad_norm": 5.912201404571533, + "learning_rate": 9.8893814728219e-06, + "loss": 2.781, + "step": 328800 + }, + { + "epoch": 0.0857910692800695, + "grad_norm": 6.655936241149902, + "learning_rate": 9.889206541313498e-06, + "loss": 2.752, + "step": 329000 + }, + { + "epoch": 0.08584322190577168, + "grad_norm": 6.418868064880371, + "learning_rate": 9.889031473146136e-06, + "loss": 2.8002, + "step": 329200 + }, + { + "epoch": 0.08589537453147385, + "grad_norm": 6.111396312713623, + "learning_rate": 9.888856268324707e-06, + "loss": 2.7168, + "step": 329400 + }, + { + "epoch": 0.08594752715717602, + "grad_norm": 6.363344669342041, + "learning_rate": 9.88868092685411e-06, + "loss": 2.7737, + "step": 329600 + }, + { + "epoch": 0.08599967978287819, + "grad_norm": 6.287228107452393, + "learning_rate": 9.888505448739243e-06, + "loss": 2.7443, + "step": 329800 + }, + { + "epoch": 0.08605183240858036, + "grad_norm": 7.1249542236328125, + "learning_rate": 9.888329833985012e-06, + "loss": 2.7436, + "step": 330000 + }, + { + "epoch": 0.08610398503428253, + "grad_norm": 6.433495998382568, + "learning_rate": 9.888154082596326e-06, + "loss": 2.7575, + "step": 330200 + }, + { + "epoch": 0.0861561376599847, + "grad_norm": 5.9914937019348145, + "learning_rate": 9.887978194578097e-06, + "loss": 2.7368, + "step": 330400 + }, + { + "epoch": 0.08620829028568687, + "grad_norm": 6.580966472625732, + "learning_rate": 9.88780216993524e-06, + "loss": 2.7948, + "step": 330600 + }, + { + "epoch": 0.08626044291138904, + "grad_norm": 6.364790439605713, + "learning_rate": 9.88762600867268e-06, + "loss": 2.7685, + "step": 330800 + }, + { + "epoch": 0.08631259553709121, + "grad_norm": 6.688601016998291, + "learning_rate": 9.887449710795333e-06, + "loss": 2.7504, + "step": 331000 + }, + { + "epoch": 0.08636474816279338, + "grad_norm": 6.046875953674316, + "learning_rate": 9.887273276308135e-06, + "loss": 2.7278, + "step": 331200 + }, + { + "epoch": 0.08641690078849555, + "grad_norm": 6.681288719177246, + "learning_rate": 9.887096705216011e-06, + "loss": 2.7646, + "step": 
331400 + }, + { + "epoch": 0.08646905341419772, + "grad_norm": 6.149299621582031, + "learning_rate": 9.8869199975239e-06, + "loss": 2.727, + "step": 331600 + }, + { + "epoch": 0.08652120603989989, + "grad_norm": 6.459449291229248, + "learning_rate": 9.886743153236741e-06, + "loss": 2.7715, + "step": 331800 + }, + { + "epoch": 0.08657335866560206, + "grad_norm": 7.189176082611084, + "learning_rate": 9.886566172359475e-06, + "loss": 2.7691, + "step": 332000 + }, + { + "epoch": 0.08662551129130423, + "grad_norm": 6.436927318572998, + "learning_rate": 9.886389054897051e-06, + "loss": 2.7188, + "step": 332200 + }, + { + "epoch": 0.0866776639170064, + "grad_norm": 6.325084686279297, + "learning_rate": 9.886211800854419e-06, + "loss": 2.7552, + "step": 332400 + }, + { + "epoch": 0.08672981654270857, + "grad_norm": 5.9876251220703125, + "learning_rate": 9.886034410236533e-06, + "loss": 2.7334, + "step": 332600 + }, + { + "epoch": 0.08678196916841074, + "grad_norm": 6.643549919128418, + "learning_rate": 9.88585688304835e-06, + "loss": 2.7637, + "step": 332800 + }, + { + "epoch": 0.0868341217941129, + "grad_norm": 6.4676971435546875, + "learning_rate": 9.885679219294835e-06, + "loss": 2.7169, + "step": 333000 + }, + { + "epoch": 0.08688627441981507, + "grad_norm": 6.548349380493164, + "learning_rate": 9.885501418980953e-06, + "loss": 2.7825, + "step": 333200 + }, + { + "epoch": 0.08693842704551724, + "grad_norm": 5.312566757202148, + "learning_rate": 9.885323482111671e-06, + "loss": 2.7209, + "step": 333400 + }, + { + "epoch": 0.08699057967121941, + "grad_norm": 6.16527795791626, + "learning_rate": 9.885145408691965e-06, + "loss": 2.713, + "step": 333600 + }, + { + "epoch": 0.08704273229692158, + "grad_norm": 6.8437652587890625, + "learning_rate": 9.884967198726814e-06, + "loss": 2.7806, + "step": 333800 + }, + { + "epoch": 0.08709488492262375, + "grad_norm": 6.265858173370361, + "learning_rate": 9.884788852221195e-06, + "loss": 2.789, + "step": 334000 + }, + { + "epoch": 0.08714703754832592, + "grad_norm": 6.236710548400879, + "learning_rate": 9.884610369180097e-06, + "loss": 2.7499, + "step": 334200 + }, + { + "epoch": 0.0871991901740281, + "grad_norm": 6.732831001281738, + "learning_rate": 9.884431749608505e-06, + "loss": 2.7575, + "step": 334400 + }, + { + "epoch": 0.08725134279973026, + "grad_norm": 6.163575649261475, + "learning_rate": 9.884252993511415e-06, + "loss": 2.7529, + "step": 334600 + }, + { + "epoch": 0.08730349542543243, + "grad_norm": 6.205536365509033, + "learning_rate": 9.884074100893821e-06, + "loss": 2.7299, + "step": 334800 + }, + { + "epoch": 0.0873556480511346, + "grad_norm": 6.635568618774414, + "learning_rate": 9.883895071760726e-06, + "loss": 2.7082, + "step": 335000 + }, + { + "epoch": 0.08740780067683677, + "grad_norm": 7.192404270172119, + "learning_rate": 9.883715906117132e-06, + "loss": 2.7596, + "step": 335200 + }, + { + "epoch": 0.08745995330253895, + "grad_norm": 6.824044704437256, + "learning_rate": 9.883536603968047e-06, + "loss": 2.7477, + "step": 335400 + }, + { + "epoch": 0.08751210592824112, + "grad_norm": 7.502521991729736, + "learning_rate": 9.883357165318483e-06, + "loss": 2.7955, + "step": 335600 + }, + { + "epoch": 0.08756425855394329, + "grad_norm": 6.514331817626953, + "learning_rate": 9.883177590173454e-06, + "loss": 2.7597, + "step": 335800 + }, + { + "epoch": 0.08761641117964546, + "grad_norm": 5.833806037902832, + "learning_rate": 9.882997878537984e-06, + "loss": 2.7714, + "step": 336000 + }, + { + "epoch": 0.08766856380534763, + "grad_norm": 
6.365836143493652, + "learning_rate": 9.88281803041709e-06, + "loss": 2.7577, + "step": 336200 + }, + { + "epoch": 0.0877207164310498, + "grad_norm": 6.60760498046875, + "learning_rate": 9.882638045815804e-06, + "loss": 2.7664, + "step": 336400 + }, + { + "epoch": 0.08777286905675197, + "grad_norm": 6.648801326751709, + "learning_rate": 9.882457924739153e-06, + "loss": 2.7739, + "step": 336600 + }, + { + "epoch": 0.08782502168245414, + "grad_norm": 6.417782306671143, + "learning_rate": 9.882277667192175e-06, + "loss": 2.777, + "step": 336800 + }, + { + "epoch": 0.08787717430815631, + "grad_norm": 7.36497163772583, + "learning_rate": 9.882097273179904e-06, + "loss": 2.7481, + "step": 337000 + }, + { + "epoch": 0.08792932693385848, + "grad_norm": 6.485381603240967, + "learning_rate": 9.881916742707389e-06, + "loss": 2.7754, + "step": 337200 + }, + { + "epoch": 0.08798147955956065, + "grad_norm": 6.6873369216918945, + "learning_rate": 9.88173607577967e-06, + "loss": 2.7517, + "step": 337400 + }, + { + "epoch": 0.08803363218526282, + "grad_norm": 7.222953796386719, + "learning_rate": 9.881555272401797e-06, + "loss": 2.7745, + "step": 337600 + }, + { + "epoch": 0.08808578481096499, + "grad_norm": 6.487031936645508, + "learning_rate": 9.881374332578829e-06, + "loss": 2.762, + "step": 337800 + }, + { + "epoch": 0.08813793743666716, + "grad_norm": 6.065067768096924, + "learning_rate": 9.881193256315816e-06, + "loss": 2.7632, + "step": 338000 + }, + { + "epoch": 0.08819009006236933, + "grad_norm": 6.604552745819092, + "learning_rate": 9.881012043617826e-06, + "loss": 2.7412, + "step": 338200 + }, + { + "epoch": 0.0882422426880715, + "grad_norm": 6.458118915557861, + "learning_rate": 9.88083069448992e-06, + "loss": 2.7364, + "step": 338400 + }, + { + "epoch": 0.08829439531377367, + "grad_norm": 7.036708354949951, + "learning_rate": 9.88064920893717e-06, + "loss": 2.779, + "step": 338600 + }, + { + "epoch": 0.08834654793947583, + "grad_norm": 6.828887462615967, + "learning_rate": 9.880467586964646e-06, + "loss": 2.7671, + "step": 338800 + }, + { + "epoch": 0.088398700565178, + "grad_norm": 7.044036388397217, + "learning_rate": 9.880285828577426e-06, + "loss": 2.7701, + "step": 339000 + }, + { + "epoch": 0.08845085319088017, + "grad_norm": 6.224187850952148, + "learning_rate": 9.880103933780589e-06, + "loss": 2.7674, + "step": 339200 + }, + { + "epoch": 0.08850300581658234, + "grad_norm": 5.749971389770508, + "learning_rate": 9.879921902579219e-06, + "loss": 2.7177, + "step": 339400 + }, + { + "epoch": 0.08855515844228451, + "grad_norm": 6.637483596801758, + "learning_rate": 9.879739734978408e-06, + "loss": 2.7148, + "step": 339600 + }, + { + "epoch": 0.08860731106798668, + "grad_norm": 6.52347469329834, + "learning_rate": 9.879557430983242e-06, + "loss": 2.7768, + "step": 339800 + }, + { + "epoch": 0.08865946369368885, + "grad_norm": 7.416553497314453, + "learning_rate": 9.879374990598821e-06, + "loss": 2.7519, + "step": 340000 + }, + { + "epoch": 0.08871161631939102, + "grad_norm": 7.152632713317871, + "learning_rate": 9.879192413830244e-06, + "loss": 2.761, + "step": 340200 + }, + { + "epoch": 0.08876376894509319, + "grad_norm": 6.972984313964844, + "learning_rate": 9.879009700682611e-06, + "loss": 2.7335, + "step": 340400 + }, + { + "epoch": 0.08881592157079536, + "grad_norm": 6.66843318939209, + "learning_rate": 9.878826851161032e-06, + "loss": 2.7011, + "step": 340600 + }, + { + "epoch": 0.08886807419649753, + "grad_norm": 6.625106334686279, + "learning_rate": 9.878643865270617e-06, + "loss": 
2.7638, + "step": 340800 + }, + { + "epoch": 0.0889202268221997, + "grad_norm": 5.736109733581543, + "learning_rate": 9.87846074301648e-06, + "loss": 2.6837, + "step": 341000 + }, + { + "epoch": 0.08897237944790187, + "grad_norm": 6.541588306427002, + "learning_rate": 9.878277484403742e-06, + "loss": 2.785, + "step": 341200 + }, + { + "epoch": 0.08902453207360404, + "grad_norm": 6.787119388580322, + "learning_rate": 9.878094089437523e-06, + "loss": 2.768, + "step": 341400 + }, + { + "epoch": 0.08907668469930621, + "grad_norm": 6.477860927581787, + "learning_rate": 9.877910558122948e-06, + "loss": 2.7734, + "step": 341600 + }, + { + "epoch": 0.08912883732500838, + "grad_norm": 6.601665496826172, + "learning_rate": 9.87772689046515e-06, + "loss": 2.7677, + "step": 341800 + }, + { + "epoch": 0.08918098995071055, + "grad_norm": 6.276432514190674, + "learning_rate": 9.87754308646926e-06, + "loss": 2.7615, + "step": 342000 + }, + { + "epoch": 0.08923314257641272, + "grad_norm": 6.968433380126953, + "learning_rate": 9.877359146140416e-06, + "loss": 2.7536, + "step": 342200 + }, + { + "epoch": 0.0892852952021149, + "grad_norm": 6.095193862915039, + "learning_rate": 9.877175069483762e-06, + "loss": 2.727, + "step": 342400 + }, + { + "epoch": 0.08933744782781707, + "grad_norm": 6.783011436462402, + "learning_rate": 9.87699085650444e-06, + "loss": 2.7556, + "step": 342600 + }, + { + "epoch": 0.08938960045351924, + "grad_norm": 6.720550537109375, + "learning_rate": 9.876806507207601e-06, + "loss": 2.7567, + "step": 342800 + }, + { + "epoch": 0.0894417530792214, + "grad_norm": 7.076850891113281, + "learning_rate": 9.876622021598396e-06, + "loss": 2.735, + "step": 343000 + }, + { + "epoch": 0.08949390570492358, + "grad_norm": 6.559168815612793, + "learning_rate": 9.876437399681983e-06, + "loss": 2.7888, + "step": 343200 + }, + { + "epoch": 0.08954605833062575, + "grad_norm": 6.786516189575195, + "learning_rate": 9.876252641463522e-06, + "loss": 2.7834, + "step": 343400 + }, + { + "epoch": 0.08959821095632792, + "grad_norm": 7.153230667114258, + "learning_rate": 9.876067746948176e-06, + "loss": 2.724, + "step": 343600 + }, + { + "epoch": 0.08965036358203009, + "grad_norm": 6.643190860748291, + "learning_rate": 9.875882716141116e-06, + "loss": 2.7588, + "step": 343800 + }, + { + "epoch": 0.08970251620773226, + "grad_norm": 5.990670680999756, + "learning_rate": 9.875697549047511e-06, + "loss": 2.7544, + "step": 344000 + }, + { + "epoch": 0.08975466883343443, + "grad_norm": 6.413429260253906, + "learning_rate": 9.875512245672538e-06, + "loss": 2.755, + "step": 344200 + }, + { + "epoch": 0.0898068214591366, + "grad_norm": 6.6495361328125, + "learning_rate": 9.875326806021377e-06, + "loss": 2.7291, + "step": 344400 + }, + { + "epoch": 0.08985897408483877, + "grad_norm": 6.507997512817383, + "learning_rate": 9.875141230099209e-06, + "loss": 2.7421, + "step": 344600 + }, + { + "epoch": 0.08991112671054093, + "grad_norm": 6.5377726554870605, + "learning_rate": 9.874955517911223e-06, + "loss": 2.7329, + "step": 344800 + }, + { + "epoch": 0.0899632793362431, + "grad_norm": 7.305025577545166, + "learning_rate": 9.874769669462608e-06, + "loss": 2.7645, + "step": 345000 + }, + { + "epoch": 0.09001543196194527, + "grad_norm": 6.375731945037842, + "learning_rate": 9.87458368475856e-06, + "loss": 2.7613, + "step": 345200 + }, + { + "epoch": 0.09006758458764744, + "grad_norm": 6.055731773376465, + "learning_rate": 9.87439756380428e-06, + "loss": 2.7406, + "step": 345400 + }, + { + "epoch": 0.09011973721334961, + 
"grad_norm": 6.731203556060791, + "learning_rate": 9.874211306604966e-06, + "loss": 2.7252, + "step": 345600 + }, + { + "epoch": 0.09017188983905178, + "grad_norm": 7.642002582550049, + "learning_rate": 9.874024913165825e-06, + "loss": 2.7897, + "step": 345800 + }, + { + "epoch": 0.09022404246475395, + "grad_norm": 6.946300983428955, + "learning_rate": 9.873838383492069e-06, + "loss": 2.7631, + "step": 346000 + }, + { + "epoch": 0.09027619509045612, + "grad_norm": 6.039514064788818, + "learning_rate": 9.873651717588909e-06, + "loss": 2.7506, + "step": 346200 + }, + { + "epoch": 0.09032834771615829, + "grad_norm": 6.728977203369141, + "learning_rate": 9.873464915461566e-06, + "loss": 2.747, + "step": 346400 + }, + { + "epoch": 0.09038050034186046, + "grad_norm": 5.914807319641113, + "learning_rate": 9.873277977115256e-06, + "loss": 2.7535, + "step": 346600 + }, + { + "epoch": 0.09043265296756263, + "grad_norm": 7.127989768981934, + "learning_rate": 9.87309090255521e-06, + "loss": 2.7418, + "step": 346800 + }, + { + "epoch": 0.0904848055932648, + "grad_norm": 6.722864151000977, + "learning_rate": 9.872903691786655e-06, + "loss": 2.753, + "step": 347000 + }, + { + "epoch": 0.09053695821896697, + "grad_norm": 7.177712440490723, + "learning_rate": 9.87271634481482e-06, + "loss": 2.7418, + "step": 347200 + }, + { + "epoch": 0.09058911084466914, + "grad_norm": 6.4624552726745605, + "learning_rate": 9.872528861644947e-06, + "loss": 2.79, + "step": 347400 + }, + { + "epoch": 0.09064126347037131, + "grad_norm": 6.420626163482666, + "learning_rate": 9.872341242282274e-06, + "loss": 2.7962, + "step": 347600 + }, + { + "epoch": 0.09069341609607348, + "grad_norm": 5.890540599822998, + "learning_rate": 9.872153486732045e-06, + "loss": 2.7281, + "step": 347800 + }, + { + "epoch": 0.09074556872177565, + "grad_norm": 6.1183013916015625, + "learning_rate": 9.871965594999509e-06, + "loss": 2.7133, + "step": 348000 + }, + { + "epoch": 0.09079772134747782, + "grad_norm": 6.170734405517578, + "learning_rate": 9.871777567089916e-06, + "loss": 2.7438, + "step": 348200 + }, + { + "epoch": 0.09084987397317999, + "grad_norm": 6.432772636413574, + "learning_rate": 9.871589403008524e-06, + "loss": 2.7667, + "step": 348400 + }, + { + "epoch": 0.09090202659888216, + "grad_norm": 6.09767484664917, + "learning_rate": 9.87140110276059e-06, + "loss": 2.7265, + "step": 348600 + }, + { + "epoch": 0.09095417922458433, + "grad_norm": 6.650676727294922, + "learning_rate": 9.871212666351378e-06, + "loss": 2.7454, + "step": 348800 + }, + { + "epoch": 0.0910063318502865, + "grad_norm": 6.796266078948975, + "learning_rate": 9.871024093786154e-06, + "loss": 2.7593, + "step": 349000 + }, + { + "epoch": 0.09105848447598867, + "grad_norm": 6.901579856872559, + "learning_rate": 9.870835385070191e-06, + "loss": 2.7642, + "step": 349200 + }, + { + "epoch": 0.09111063710169084, + "grad_norm": 7.005277633666992, + "learning_rate": 9.870646540208763e-06, + "loss": 2.6938, + "step": 349400 + }, + { + "epoch": 0.09116278972739301, + "grad_norm": 7.363898277282715, + "learning_rate": 9.87045755920715e-06, + "loss": 2.7215, + "step": 349600 + }, + { + "epoch": 0.09121494235309519, + "grad_norm": 6.584136009216309, + "learning_rate": 9.870268442070629e-06, + "loss": 2.7236, + "step": 349800 + }, + { + "epoch": 0.09126709497879736, + "grad_norm": 6.563084602355957, + "learning_rate": 9.870079188804492e-06, + "loss": 2.7463, + "step": 350000 + }, + { + "epoch": 0.09131924760449953, + "grad_norm": 6.536829471588135, + "learning_rate": 
9.869889799414026e-06, + "loss": 2.814, + "step": 350200 + }, + { + "epoch": 0.0913714002302017, + "grad_norm": 6.699076175689697, + "learning_rate": 9.869700273904524e-06, + "loss": 2.7248, + "step": 350400 + }, + { + "epoch": 0.09142355285590385, + "grad_norm": 6.6044111251831055, + "learning_rate": 9.869510612281284e-06, + "loss": 2.7594, + "step": 350600 + }, + { + "epoch": 0.09147570548160602, + "grad_norm": 6.523010730743408, + "learning_rate": 9.869320814549608e-06, + "loss": 2.7255, + "step": 350800 + }, + { + "epoch": 0.0915278581073082, + "grad_norm": 6.2166218757629395, + "learning_rate": 9.869130880714801e-06, + "loss": 2.7329, + "step": 351000 + }, + { + "epoch": 0.09158001073301036, + "grad_norm": 7.409599304199219, + "learning_rate": 9.868940810782172e-06, + "loss": 2.7557, + "step": 351200 + }, + { + "epoch": 0.09163216335871253, + "grad_norm": 6.580782890319824, + "learning_rate": 9.868750604757034e-06, + "loss": 2.7538, + "step": 351400 + }, + { + "epoch": 0.0916843159844147, + "grad_norm": 6.399933815002441, + "learning_rate": 9.868560262644701e-06, + "loss": 2.7467, + "step": 351600 + }, + { + "epoch": 0.09173646861011688, + "grad_norm": 6.934614181518555, + "learning_rate": 9.868369784450496e-06, + "loss": 2.7431, + "step": 351800 + }, + { + "epoch": 0.09178862123581905, + "grad_norm": 6.793469429016113, + "learning_rate": 9.868179170179742e-06, + "loss": 2.7429, + "step": 352000 + }, + { + "epoch": 0.09184077386152122, + "grad_norm": 6.359063148498535, + "learning_rate": 9.867988419837765e-06, + "loss": 2.7431, + "step": 352200 + }, + { + "epoch": 0.09189292648722339, + "grad_norm": 6.626612663269043, + "learning_rate": 9.8677975334299e-06, + "loss": 2.7322, + "step": 352400 + }, + { + "epoch": 0.09194507911292556, + "grad_norm": 7.432502746582031, + "learning_rate": 9.867606510961482e-06, + "loss": 2.7438, + "step": 352600 + }, + { + "epoch": 0.09199723173862773, + "grad_norm": 6.605191707611084, + "learning_rate": 9.867415352437849e-06, + "loss": 2.7487, + "step": 352800 + }, + { + "epoch": 0.0920493843643299, + "grad_norm": 7.211688995361328, + "learning_rate": 9.867224057864344e-06, + "loss": 2.721, + "step": 353000 + }, + { + "epoch": 0.09210153699003207, + "grad_norm": 6.4896135330200195, + "learning_rate": 9.867032627246315e-06, + "loss": 2.7345, + "step": 353200 + }, + { + "epoch": 0.09215368961573424, + "grad_norm": 7.31119966506958, + "learning_rate": 9.866841060589113e-06, + "loss": 2.7576, + "step": 353400 + }, + { + "epoch": 0.09220584224143641, + "grad_norm": 7.012179851531982, + "learning_rate": 9.866649357898089e-06, + "loss": 2.72, + "step": 353600 + }, + { + "epoch": 0.09225799486713858, + "grad_norm": 6.721981048583984, + "learning_rate": 9.866457519178605e-06, + "loss": 2.7204, + "step": 353800 + }, + { + "epoch": 0.09231014749284075, + "grad_norm": 6.36168098449707, + "learning_rate": 9.866265544436024e-06, + "loss": 2.7532, + "step": 354000 + }, + { + "epoch": 0.09236230011854292, + "grad_norm": 6.767820835113525, + "learning_rate": 9.866073433675709e-06, + "loss": 2.7872, + "step": 354200 + }, + { + "epoch": 0.09241445274424509, + "grad_norm": 6.610678672790527, + "learning_rate": 9.86588118690303e-06, + "loss": 2.687, + "step": 354400 + }, + { + "epoch": 0.09246660536994726, + "grad_norm": 6.138505458831787, + "learning_rate": 9.865688804123361e-06, + "loss": 2.7339, + "step": 354600 + }, + { + "epoch": 0.09251875799564943, + "grad_norm": 6.75731897354126, + "learning_rate": 9.865496285342079e-06, + "loss": 2.7374, + "step": 354800 + }, + { + 
"epoch": 0.0925709106213516, + "grad_norm": 6.869755268096924, + "learning_rate": 9.865303630564569e-06, + "loss": 2.7248, + "step": 355000 + }, + { + "epoch": 0.09262306324705377, + "grad_norm": 6.32354211807251, + "learning_rate": 9.86511083979621e-06, + "loss": 2.7101, + "step": 355200 + }, + { + "epoch": 0.09267521587275594, + "grad_norm": 6.488105297088623, + "learning_rate": 9.864917913042393e-06, + "loss": 2.7202, + "step": 355400 + }, + { + "epoch": 0.09272736849845811, + "grad_norm": 6.858880043029785, + "learning_rate": 9.864724850308513e-06, + "loss": 2.7165, + "step": 355600 + }, + { + "epoch": 0.09277952112416028, + "grad_norm": 7.065299034118652, + "learning_rate": 9.864531651599963e-06, + "loss": 2.759, + "step": 355800 + }, + { + "epoch": 0.09283167374986245, + "grad_norm": 6.129859924316406, + "learning_rate": 9.864338316922145e-06, + "loss": 2.7342, + "step": 356000 + }, + { + "epoch": 0.09288382637556462, + "grad_norm": 6.4933648109436035, + "learning_rate": 9.864144846280461e-06, + "loss": 2.7194, + "step": 356200 + }, + { + "epoch": 0.0929359790012668, + "grad_norm": 6.755699634552002, + "learning_rate": 9.86395123968032e-06, + "loss": 2.7097, + "step": 356400 + }, + { + "epoch": 0.09298813162696895, + "grad_norm": 6.979990005493164, + "learning_rate": 9.863757497127134e-06, + "loss": 2.6846, + "step": 356600 + }, + { + "epoch": 0.09304028425267112, + "grad_norm": 7.256637096405029, + "learning_rate": 9.863563618626317e-06, + "loss": 2.7368, + "step": 356800 + }, + { + "epoch": 0.09309243687837329, + "grad_norm": 6.8452253341674805, + "learning_rate": 9.86336960418329e-06, + "loss": 2.75, + "step": 357000 + }, + { + "epoch": 0.09314458950407546, + "grad_norm": 5.944540977478027, + "learning_rate": 9.863175453803476e-06, + "loss": 2.7395, + "step": 357200 + }, + { + "epoch": 0.09319674212977763, + "grad_norm": 7.049804210662842, + "learning_rate": 9.862981167492298e-06, + "loss": 2.7871, + "step": 357400 + }, + { + "epoch": 0.0932488947554798, + "grad_norm": 6.192124843597412, + "learning_rate": 9.862786745255191e-06, + "loss": 2.7268, + "step": 357600 + }, + { + "epoch": 0.09330104738118197, + "grad_norm": 6.69340181350708, + "learning_rate": 9.862592187097587e-06, + "loss": 2.7712, + "step": 357800 + }, + { + "epoch": 0.09335320000688414, + "grad_norm": 6.953091144561768, + "learning_rate": 9.862397493024925e-06, + "loss": 2.7733, + "step": 358000 + }, + { + "epoch": 0.09340535263258631, + "grad_norm": 7.205249309539795, + "learning_rate": 9.862202663042647e-06, + "loss": 2.691, + "step": 358200 + }, + { + "epoch": 0.09345750525828848, + "grad_norm": 6.435111999511719, + "learning_rate": 9.862007697156195e-06, + "loss": 2.7516, + "step": 358400 + }, + { + "epoch": 0.09350965788399065, + "grad_norm": 6.6230669021606445, + "learning_rate": 9.861812595371026e-06, + "loss": 2.7293, + "step": 358600 + }, + { + "epoch": 0.09356181050969282, + "grad_norm": 6.619906425476074, + "learning_rate": 9.861617357692585e-06, + "loss": 2.7295, + "step": 358800 + }, + { + "epoch": 0.093613963135395, + "grad_norm": 6.5545268058776855, + "learning_rate": 9.861421984126335e-06, + "loss": 2.72, + "step": 359000 + }, + { + "epoch": 0.09366611576109717, + "grad_norm": 7.116846561431885, + "learning_rate": 9.861226474677737e-06, + "loss": 2.7458, + "step": 359200 + }, + { + "epoch": 0.09371826838679934, + "grad_norm": 7.412746429443359, + "learning_rate": 9.861030829352252e-06, + "loss": 2.7375, + "step": 359400 + }, + { + "epoch": 0.0937704210125015, + "grad_norm": 7.558204174041748, + 
"learning_rate": 9.86083504815535e-06, + "loss": 2.7256, + "step": 359600 + }, + { + "epoch": 0.09382257363820368, + "grad_norm": 6.926628589630127, + "learning_rate": 9.860639131092504e-06, + "loss": 2.7327, + "step": 359800 + }, + { + "epoch": 0.09387472626390585, + "grad_norm": 6.14035177230835, + "learning_rate": 9.86044307816919e-06, + "loss": 2.7614, + "step": 360000 + }, + { + "epoch": 0.09392687888960802, + "grad_norm": 7.323028087615967, + "learning_rate": 9.860246889390888e-06, + "loss": 2.7235, + "step": 360200 + }, + { + "epoch": 0.09397903151531019, + "grad_norm": 6.862823009490967, + "learning_rate": 9.86005056476308e-06, + "loss": 2.75, + "step": 360400 + }, + { + "epoch": 0.09403118414101236, + "grad_norm": 6.618449687957764, + "learning_rate": 9.859854104291256e-06, + "loss": 2.7222, + "step": 360600 + }, + { + "epoch": 0.09408333676671453, + "grad_norm": 5.8654608726501465, + "learning_rate": 9.859657507980907e-06, + "loss": 2.7302, + "step": 360800 + }, + { + "epoch": 0.0941354893924167, + "grad_norm": 6.324068546295166, + "learning_rate": 9.859460775837526e-06, + "loss": 2.7301, + "step": 361000 + }, + { + "epoch": 0.09418764201811887, + "grad_norm": 6.8306732177734375, + "learning_rate": 9.859263907866612e-06, + "loss": 2.7369, + "step": 361200 + }, + { + "epoch": 0.09423979464382104, + "grad_norm": 6.69620943069458, + "learning_rate": 9.859066904073671e-06, + "loss": 2.7301, + "step": 361400 + }, + { + "epoch": 0.09429194726952321, + "grad_norm": 6.917806148529053, + "learning_rate": 9.858869764464208e-06, + "loss": 2.7438, + "step": 361600 + }, + { + "epoch": 0.09434409989522538, + "grad_norm": 7.454828262329102, + "learning_rate": 9.85867248904373e-06, + "loss": 2.729, + "step": 361800 + }, + { + "epoch": 0.09439625252092755, + "grad_norm": 6.281285762786865, + "learning_rate": 9.858475077817756e-06, + "loss": 2.7598, + "step": 362000 + }, + { + "epoch": 0.09444840514662972, + "grad_norm": 6.297204971313477, + "learning_rate": 9.8582775307918e-06, + "loss": 2.741, + "step": 362200 + }, + { + "epoch": 0.09450055777233188, + "grad_norm": 6.6822590827941895, + "learning_rate": 9.858079847971384e-06, + "loss": 2.715, + "step": 362400 + }, + { + "epoch": 0.09455271039803405, + "grad_norm": 6.762353897094727, + "learning_rate": 9.857882029362036e-06, + "loss": 2.727, + "step": 362600 + }, + { + "epoch": 0.09460486302373622, + "grad_norm": 7.140170574188232, + "learning_rate": 9.857684074969285e-06, + "loss": 2.714, + "step": 362800 + }, + { + "epoch": 0.09465701564943839, + "grad_norm": 6.909539222717285, + "learning_rate": 9.85748598479866e-06, + "loss": 2.7019, + "step": 363000 + }, + { + "epoch": 0.09470916827514056, + "grad_norm": 7.183372497558594, + "learning_rate": 9.857287758855705e-06, + "loss": 2.7792, + "step": 363200 + }, + { + "epoch": 0.09476132090084273, + "grad_norm": 6.247964859008789, + "learning_rate": 9.857089397145954e-06, + "loss": 2.7433, + "step": 363400 + }, + { + "epoch": 0.0948134735265449, + "grad_norm": 7.346426010131836, + "learning_rate": 9.856890899674954e-06, + "loss": 2.6835, + "step": 363600 + }, + { + "epoch": 0.09486562615224707, + "grad_norm": 7.062637805938721, + "learning_rate": 9.856692266448254e-06, + "loss": 2.7761, + "step": 363800 + }, + { + "epoch": 0.09491777877794924, + "grad_norm": 7.02108907699585, + "learning_rate": 9.856493497471405e-06, + "loss": 2.7348, + "step": 364000 + }, + { + "epoch": 0.09496993140365141, + "grad_norm": 6.802475929260254, + "learning_rate": 9.856294592749963e-06, + "loss": 2.7111, + "step": 364200 
+ }, + { + "epoch": 0.09502208402935358, + "grad_norm": 6.6680402755737305, + "learning_rate": 9.856095552289487e-06, + "loss": 2.7501, + "step": 364400 + }, + { + "epoch": 0.09507423665505575, + "grad_norm": 6.280881404876709, + "learning_rate": 9.85589637609554e-06, + "loss": 2.7415, + "step": 364600 + }, + { + "epoch": 0.09512638928075792, + "grad_norm": 6.65505313873291, + "learning_rate": 9.85569706417369e-06, + "loss": 2.7116, + "step": 364800 + }, + { + "epoch": 0.09517854190646009, + "grad_norm": 6.576626777648926, + "learning_rate": 9.855497616529511e-06, + "loss": 2.7489, + "step": 365000 + }, + { + "epoch": 0.09523069453216226, + "grad_norm": 7.567196369171143, + "learning_rate": 9.855298033168575e-06, + "loss": 2.7427, + "step": 365200 + }, + { + "epoch": 0.09528284715786443, + "grad_norm": 6.262023448944092, + "learning_rate": 9.855098314096459e-06, + "loss": 2.7202, + "step": 365400 + }, + { + "epoch": 0.0953349997835666, + "grad_norm": 6.940713405609131, + "learning_rate": 9.854898459318748e-06, + "loss": 2.7366, + "step": 365600 + }, + { + "epoch": 0.09538715240926877, + "grad_norm": 7.290978908538818, + "learning_rate": 9.854698468841024e-06, + "loss": 2.732, + "step": 365800 + }, + { + "epoch": 0.09543930503497094, + "grad_norm": 6.973400592803955, + "learning_rate": 9.854498342668883e-06, + "loss": 2.7675, + "step": 366000 + }, + { + "epoch": 0.09549145766067312, + "grad_norm": 6.667946815490723, + "learning_rate": 9.854298080807917e-06, + "loss": 2.7709, + "step": 366200 + }, + { + "epoch": 0.09554361028637529, + "grad_norm": 6.699540615081787, + "learning_rate": 9.854097683263719e-06, + "loss": 2.7282, + "step": 366400 + }, + { + "epoch": 0.09559576291207746, + "grad_norm": 6.533851146697998, + "learning_rate": 9.853897150041896e-06, + "loss": 2.6941, + "step": 366600 + }, + { + "epoch": 0.09564791553777963, + "grad_norm": 6.288170337677002, + "learning_rate": 9.85369648114805e-06, + "loss": 2.7054, + "step": 366800 + }, + { + "epoch": 0.0957000681634818, + "grad_norm": 6.440084457397461, + "learning_rate": 9.853495676587791e-06, + "loss": 2.7668, + "step": 367000 + }, + { + "epoch": 0.09575222078918397, + "grad_norm": 6.943708896636963, + "learning_rate": 9.853294736366732e-06, + "loss": 2.758, + "step": 367200 + }, + { + "epoch": 0.09580437341488614, + "grad_norm": 7.354572296142578, + "learning_rate": 9.85309366049049e-06, + "loss": 2.7425, + "step": 367400 + }, + { + "epoch": 0.09585652604058831, + "grad_norm": 6.695876121520996, + "learning_rate": 9.852892448964682e-06, + "loss": 2.7463, + "step": 367600 + }, + { + "epoch": 0.09590867866629048, + "grad_norm": 6.828666687011719, + "learning_rate": 9.852691101794937e-06, + "loss": 2.74, + "step": 367800 + }, + { + "epoch": 0.09596083129199265, + "grad_norm": 6.269225120544434, + "learning_rate": 9.852489618986878e-06, + "loss": 2.718, + "step": 368000 + }, + { + "epoch": 0.09601298391769482, + "grad_norm": 6.452040672302246, + "learning_rate": 9.85228800054614e-06, + "loss": 2.7166, + "step": 368200 + }, + { + "epoch": 0.09606513654339698, + "grad_norm": 6.910484790802002, + "learning_rate": 9.852086246478358e-06, + "loss": 2.744, + "step": 368400 + }, + { + "epoch": 0.09611728916909915, + "grad_norm": 8.016570091247559, + "learning_rate": 9.85188435678917e-06, + "loss": 2.7105, + "step": 368600 + }, + { + "epoch": 0.09616944179480132, + "grad_norm": 6.72928524017334, + "learning_rate": 9.85168233148422e-06, + "loss": 2.7281, + "step": 368800 + }, + { + "epoch": 0.09622159442050349, + "grad_norm": 7.141217231750488, 
+ "learning_rate": 9.851480170569155e-06, + "loss": 2.7017, + "step": 369000 + }, + { + "epoch": 0.09627374704620566, + "grad_norm": 7.320204734802246, + "learning_rate": 9.851277874049624e-06, + "loss": 2.7614, + "step": 369200 + }, + { + "epoch": 0.09632589967190783, + "grad_norm": 6.801264762878418, + "learning_rate": 9.851075441931285e-06, + "loss": 2.7137, + "step": 369400 + }, + { + "epoch": 0.09637805229761, + "grad_norm": 6.665674686431885, + "learning_rate": 9.850872874219792e-06, + "loss": 2.7348, + "step": 369600 + }, + { + "epoch": 0.09643020492331217, + "grad_norm": 7.685469627380371, + "learning_rate": 9.85067017092081e-06, + "loss": 2.7101, + "step": 369800 + }, + { + "epoch": 0.09648235754901434, + "grad_norm": 7.03206729888916, + "learning_rate": 9.850467332040003e-06, + "loss": 2.7283, + "step": 370000 + }, + { + "epoch": 0.09653451017471651, + "grad_norm": 6.763091087341309, + "learning_rate": 9.850264357583042e-06, + "loss": 2.721, + "step": 370200 + }, + { + "epoch": 0.09658666280041868, + "grad_norm": 7.308859348297119, + "learning_rate": 9.850061247555598e-06, + "loss": 2.7411, + "step": 370400 + }, + { + "epoch": 0.09663881542612085, + "grad_norm": 6.72197961807251, + "learning_rate": 9.84985800196335e-06, + "loss": 2.7493, + "step": 370600 + }, + { + "epoch": 0.09669096805182302, + "grad_norm": 7.104571342468262, + "learning_rate": 9.849654620811981e-06, + "loss": 2.7556, + "step": 370800 + }, + { + "epoch": 0.09674312067752519, + "grad_norm": 6.912766456604004, + "learning_rate": 9.849451104107172e-06, + "loss": 2.7397, + "step": 371000 + }, + { + "epoch": 0.09679527330322736, + "grad_norm": 7.259765148162842, + "learning_rate": 9.849247451854614e-06, + "loss": 2.6929, + "step": 371200 + }, + { + "epoch": 0.09684742592892953, + "grad_norm": 7.524007320404053, + "learning_rate": 9.849043664059996e-06, + "loss": 2.7068, + "step": 371400 + }, + { + "epoch": 0.0968995785546317, + "grad_norm": 6.4871392250061035, + "learning_rate": 9.848839740729018e-06, + "loss": 2.752, + "step": 371600 + }, + { + "epoch": 0.09695173118033387, + "grad_norm": 6.942018985748291, + "learning_rate": 9.848635681867377e-06, + "loss": 2.7455, + "step": 371800 + }, + { + "epoch": 0.09700388380603604, + "grad_norm": 6.313770294189453, + "learning_rate": 9.84843148748078e-06, + "loss": 2.7225, + "step": 372000 + }, + { + "epoch": 0.09705603643173821, + "grad_norm": 7.180691242218018, + "learning_rate": 9.848227157574932e-06, + "loss": 2.7707, + "step": 372200 + }, + { + "epoch": 0.09710818905744038, + "grad_norm": 6.999627113342285, + "learning_rate": 9.848022692155544e-06, + "loss": 2.7113, + "step": 372400 + }, + { + "epoch": 0.09716034168314255, + "grad_norm": 6.8867950439453125, + "learning_rate": 9.847818091228332e-06, + "loss": 2.7619, + "step": 372600 + }, + { + "epoch": 0.09721249430884472, + "grad_norm": 6.820121765136719, + "learning_rate": 9.847613354799014e-06, + "loss": 2.7334, + "step": 372800 + }, + { + "epoch": 0.0972646469345469, + "grad_norm": 6.528743267059326, + "learning_rate": 9.847408482873316e-06, + "loss": 2.7372, + "step": 373000 + }, + { + "epoch": 0.09731679956024906, + "grad_norm": 7.362827301025391, + "learning_rate": 9.847203475456959e-06, + "loss": 2.7489, + "step": 373200 + }, + { + "epoch": 0.09736895218595124, + "grad_norm": 7.119494438171387, + "learning_rate": 9.846998332555676e-06, + "loss": 2.7094, + "step": 373400 + }, + { + "epoch": 0.0974211048116534, + "grad_norm": 7.4878387451171875, + "learning_rate": 9.8467930541752e-06, + "loss": 2.7318, + "step": 
373600 + }, + { + "epoch": 0.09747325743735558, + "grad_norm": 7.716200828552246, + "learning_rate": 9.846587640321273e-06, + "loss": 2.8075, + "step": 373800 + }, + { + "epoch": 0.09752541006305775, + "grad_norm": 6.292218208312988, + "learning_rate": 9.84638209099963e-06, + "loss": 2.7333, + "step": 374000 + }, + { + "epoch": 0.0975775626887599, + "grad_norm": 6.929902076721191, + "learning_rate": 9.846176406216019e-06, + "loss": 2.7248, + "step": 374200 + }, + { + "epoch": 0.09762971531446207, + "grad_norm": 7.24159049987793, + "learning_rate": 9.845970585976192e-06, + "loss": 2.749, + "step": 374400 + }, + { + "epoch": 0.09768186794016424, + "grad_norm": 7.5973663330078125, + "learning_rate": 9.8457646302859e-06, + "loss": 2.7228, + "step": 374600 + }, + { + "epoch": 0.09773402056586641, + "grad_norm": 7.047082424163818, + "learning_rate": 9.845558539150895e-06, + "loss": 2.7224, + "step": 374800 + }, + { + "epoch": 0.09778617319156858, + "grad_norm": 6.793696880340576, + "learning_rate": 9.845352312576946e-06, + "loss": 2.7347, + "step": 375000 + }, + { + "epoch": 0.09783832581727075, + "grad_norm": 7.282424449920654, + "learning_rate": 9.84514595056981e-06, + "loss": 2.7181, + "step": 375200 + }, + { + "epoch": 0.09789047844297293, + "grad_norm": 7.171825885772705, + "learning_rate": 9.84493945313526e-06, + "loss": 2.7174, + "step": 375400 + }, + { + "epoch": 0.0979426310686751, + "grad_norm": 7.00319766998291, + "learning_rate": 9.844732820279064e-06, + "loss": 2.7221, + "step": 375600 + }, + { + "epoch": 0.09799478369437727, + "grad_norm": 7.138320446014404, + "learning_rate": 9.844526052007001e-06, + "loss": 2.7306, + "step": 375800 + }, + { + "epoch": 0.09804693632007944, + "grad_norm": 6.744442939758301, + "learning_rate": 9.84431914832485e-06, + "loss": 2.6985, + "step": 376000 + }, + { + "epoch": 0.0980990889457816, + "grad_norm": 7.503383636474609, + "learning_rate": 9.84411210923839e-06, + "loss": 2.7451, + "step": 376200 + }, + { + "epoch": 0.09815124157148378, + "grad_norm": 6.678633689880371, + "learning_rate": 9.843904934753414e-06, + "loss": 2.7316, + "step": 376400 + }, + { + "epoch": 0.09820339419718595, + "grad_norm": 6.815631866455078, + "learning_rate": 9.843697624875708e-06, + "loss": 2.7003, + "step": 376600 + }, + { + "epoch": 0.09825554682288812, + "grad_norm": 6.820425987243652, + "learning_rate": 9.843490179611069e-06, + "loss": 2.7462, + "step": 376800 + }, + { + "epoch": 0.09830769944859029, + "grad_norm": 7.071511268615723, + "learning_rate": 9.843282598965293e-06, + "loss": 2.7266, + "step": 377000 + }, + { + "epoch": 0.09835985207429246, + "grad_norm": 6.579829692840576, + "learning_rate": 9.843074882944188e-06, + "loss": 2.7225, + "step": 377200 + }, + { + "epoch": 0.09841200469999463, + "grad_norm": 7.302144527435303, + "learning_rate": 9.842867031553551e-06, + "loss": 2.7498, + "step": 377400 + }, + { + "epoch": 0.0984641573256968, + "grad_norm": 7.175357341766357, + "learning_rate": 9.8426590447992e-06, + "loss": 2.7374, + "step": 377600 + }, + { + "epoch": 0.09851630995139897, + "grad_norm": 7.382828712463379, + "learning_rate": 9.842450922686944e-06, + "loss": 2.6983, + "step": 377800 + }, + { + "epoch": 0.09856846257710114, + "grad_norm": 6.3378190994262695, + "learning_rate": 9.8422426652226e-06, + "loss": 2.7448, + "step": 378000 + }, + { + "epoch": 0.09862061520280331, + "grad_norm": 5.941269397735596, + "learning_rate": 9.84203427241199e-06, + "loss": 2.7119, + "step": 378200 + }, + { + "epoch": 0.09867276782850548, + "grad_norm": 
7.9518842697143555, + "learning_rate": 9.84182574426094e-06, + "loss": 2.7104, + "step": 378400 + }, + { + "epoch": 0.09872492045420765, + "grad_norm": 7.776278018951416, + "learning_rate": 9.841617080775278e-06, + "loss": 2.7247, + "step": 378600 + }, + { + "epoch": 0.09877707307990982, + "grad_norm": 7.218269348144531, + "learning_rate": 9.841408281960836e-06, + "loss": 2.7109, + "step": 378800 + }, + { + "epoch": 0.09882922570561199, + "grad_norm": 6.931689739227295, + "learning_rate": 9.841199347823448e-06, + "loss": 2.7071, + "step": 379000 + }, + { + "epoch": 0.09888137833131416, + "grad_norm": 6.874160289764404, + "learning_rate": 9.840990278368957e-06, + "loss": 2.7048, + "step": 379200 + }, + { + "epoch": 0.09893353095701633, + "grad_norm": 7.4862823486328125, + "learning_rate": 9.840781073603208e-06, + "loss": 2.7754, + "step": 379400 + }, + { + "epoch": 0.0989856835827185, + "grad_norm": 6.68509578704834, + "learning_rate": 9.840571733532044e-06, + "loss": 2.7133, + "step": 379600 + }, + { + "epoch": 0.09903783620842067, + "grad_norm": 7.173377990722656, + "learning_rate": 9.840362258161322e-06, + "loss": 2.7296, + "step": 379800 + }, + { + "epoch": 0.09908998883412284, + "grad_norm": 6.899176120758057, + "learning_rate": 9.84015264749689e-06, + "loss": 2.7376, + "step": 380000 + }, + { + "epoch": 0.099142141459825, + "grad_norm": 7.660719394683838, + "learning_rate": 9.839942901544612e-06, + "loss": 2.6995, + "step": 380200 + }, + { + "epoch": 0.09919429408552717, + "grad_norm": 6.3884687423706055, + "learning_rate": 9.83973302031035e-06, + "loss": 2.7238, + "step": 380400 + }, + { + "epoch": 0.09924644671122934, + "grad_norm": 7.147850513458252, + "learning_rate": 9.839523003799969e-06, + "loss": 2.7093, + "step": 380600 + }, + { + "epoch": 0.09929859933693151, + "grad_norm": 7.06928014755249, + "learning_rate": 9.839312852019337e-06, + "loss": 2.6814, + "step": 380800 + }, + { + "epoch": 0.09935075196263368, + "grad_norm": 6.980913162231445, + "learning_rate": 9.839102564974336e-06, + "loss": 2.6875, + "step": 381000 + }, + { + "epoch": 0.09940290458833585, + "grad_norm": 7.352754592895508, + "learning_rate": 9.838892142670834e-06, + "loss": 2.6892, + "step": 381200 + }, + { + "epoch": 0.09945505721403802, + "grad_norm": 7.018017768859863, + "learning_rate": 9.838681585114721e-06, + "loss": 2.7277, + "step": 381400 + }, + { + "epoch": 0.0995072098397402, + "grad_norm": 7.365713119506836, + "learning_rate": 9.838470892311876e-06, + "loss": 2.7038, + "step": 381600 + }, + { + "epoch": 0.09955936246544236, + "grad_norm": 6.4748711585998535, + "learning_rate": 9.838260064268192e-06, + "loss": 2.727, + "step": 381800 + }, + { + "epoch": 0.09961151509114453, + "grad_norm": 7.302340507507324, + "learning_rate": 9.83804910098956e-06, + "loss": 2.7436, + "step": 382000 + }, + { + "epoch": 0.0996636677168467, + "grad_norm": 6.783388614654541, + "learning_rate": 9.837838002481876e-06, + "loss": 2.6953, + "step": 382200 + }, + { + "epoch": 0.09971582034254887, + "grad_norm": 7.748034477233887, + "learning_rate": 9.837626768751043e-06, + "loss": 2.737, + "step": 382400 + }, + { + "epoch": 0.09976797296825105, + "grad_norm": 7.11197566986084, + "learning_rate": 9.837415399802962e-06, + "loss": 2.7801, + "step": 382600 + }, + { + "epoch": 0.09982012559395322, + "grad_norm": 7.256601333618164, + "learning_rate": 9.837203895643546e-06, + "loss": 2.7174, + "step": 382800 + }, + { + "epoch": 0.09987227821965539, + "grad_norm": 7.089999675750732, + "learning_rate": 9.8369922562787e-06, + "loss": 
2.7441, + "step": 383000 + }, + { + "epoch": 0.09992443084535756, + "grad_norm": 6.694625377655029, + "learning_rate": 9.836780481714348e-06, + "loss": 2.6984, + "step": 383200 + }, + { + "epoch": 0.09997658347105973, + "grad_norm": 7.070290565490723, + "learning_rate": 9.8365685719564e-06, + "loss": 2.7062, + "step": 383400 + }, + { + "epoch": 0.1000287360967619, + "grad_norm": 7.1637959480285645, + "learning_rate": 9.836356527010785e-06, + "loss": 2.7344, + "step": 383600 + }, + { + "epoch": 0.10008088872246407, + "grad_norm": 6.960715293884277, + "learning_rate": 9.83614434688343e-06, + "loss": 2.7294, + "step": 383800 + }, + { + "epoch": 0.10013304134816624, + "grad_norm": 6.766301155090332, + "learning_rate": 9.835932031580262e-06, + "loss": 2.7184, + "step": 384000 + }, + { + "epoch": 0.10018519397386841, + "grad_norm": 6.3087358474731445, + "learning_rate": 9.83571958110722e-06, + "loss": 2.6987, + "step": 384200 + }, + { + "epoch": 0.10023734659957058, + "grad_norm": 7.121251583099365, + "learning_rate": 9.835506995470237e-06, + "loss": 2.7211, + "step": 384400 + }, + { + "epoch": 0.10028949922527275, + "grad_norm": 6.787637710571289, + "learning_rate": 9.83529427467526e-06, + "loss": 2.7046, + "step": 384600 + }, + { + "epoch": 0.10034165185097492, + "grad_norm": 6.736263751983643, + "learning_rate": 9.835081418728232e-06, + "loss": 2.7612, + "step": 384800 + }, + { + "epoch": 0.10039380447667709, + "grad_norm": 7.012972831726074, + "learning_rate": 9.834868427635102e-06, + "loss": 2.6529, + "step": 385000 + }, + { + "epoch": 0.10044595710237926, + "grad_norm": 7.4089531898498535, + "learning_rate": 9.834655301401827e-06, + "loss": 2.7274, + "step": 385200 + }, + { + "epoch": 0.10049810972808143, + "grad_norm": 6.628983497619629, + "learning_rate": 9.83444204003436e-06, + "loss": 2.7027, + "step": 385400 + }, + { + "epoch": 0.1005502623537836, + "grad_norm": 6.684551239013672, + "learning_rate": 9.834228643538665e-06, + "loss": 2.72, + "step": 385600 + }, + { + "epoch": 0.10060241497948577, + "grad_norm": 7.784319877624512, + "learning_rate": 9.834015111920705e-06, + "loss": 2.7083, + "step": 385800 + }, + { + "epoch": 0.10065456760518793, + "grad_norm": 6.074433326721191, + "learning_rate": 9.833801445186448e-06, + "loss": 2.7258, + "step": 386000 + }, + { + "epoch": 0.1007067202308901, + "grad_norm": 7.0298171043396, + "learning_rate": 9.833587643341869e-06, + "loss": 2.7326, + "step": 386200 + }, + { + "epoch": 0.10075887285659227, + "grad_norm": 6.243155002593994, + "learning_rate": 9.83337370639294e-06, + "loss": 2.6969, + "step": 386400 + }, + { + "epoch": 0.10081102548229444, + "grad_norm": 6.764981269836426, + "learning_rate": 9.833159634345643e-06, + "loss": 2.709, + "step": 386600 + }, + { + "epoch": 0.10086317810799661, + "grad_norm": 6.844011306762695, + "learning_rate": 9.832945427205963e-06, + "loss": 2.738, + "step": 386800 + }, + { + "epoch": 0.10091533073369878, + "grad_norm": 7.266912460327148, + "learning_rate": 9.832731084979883e-06, + "loss": 2.7677, + "step": 387000 + }, + { + "epoch": 0.10096748335940095, + "grad_norm": 7.799858093261719, + "learning_rate": 9.8325166076734e-06, + "loss": 2.7008, + "step": 387200 + }, + { + "epoch": 0.10101963598510312, + "grad_norm": 6.45143461227417, + "learning_rate": 9.832301995292502e-06, + "loss": 2.7465, + "step": 387400 + }, + { + "epoch": 0.10107178861080529, + "grad_norm": 6.146852970123291, + "learning_rate": 9.832087247843194e-06, + "loss": 2.6801, + "step": 387600 + }, + { + "epoch": 0.10112394123650746, + 
"grad_norm": 6.76859188079834, + "learning_rate": 9.831872365331475e-06, + "loss": 2.7365, + "step": 387800 + }, + { + "epoch": 0.10117609386220963, + "grad_norm": 6.941719055175781, + "learning_rate": 9.831657347763353e-06, + "loss": 2.7285, + "step": 388000 + }, + { + "epoch": 0.1012282464879118, + "grad_norm": 7.166417121887207, + "learning_rate": 9.831442195144836e-06, + "loss": 2.7176, + "step": 388200 + }, + { + "epoch": 0.10128039911361397, + "grad_norm": 8.05933952331543, + "learning_rate": 9.83122690748194e-06, + "loss": 2.6899, + "step": 388400 + }, + { + "epoch": 0.10133255173931614, + "grad_norm": 6.222696781158447, + "learning_rate": 9.831011484780679e-06, + "loss": 2.7254, + "step": 388600 + }, + { + "epoch": 0.10138470436501831, + "grad_norm": 7.399394512176514, + "learning_rate": 9.830795927047078e-06, + "loss": 2.7333, + "step": 388800 + }, + { + "epoch": 0.10143685699072048, + "grad_norm": 7.602478504180908, + "learning_rate": 9.83058023428716e-06, + "loss": 2.7245, + "step": 389000 + }, + { + "epoch": 0.10148900961642265, + "grad_norm": 6.894057273864746, + "learning_rate": 9.830364406506956e-06, + "loss": 2.7472, + "step": 389200 + }, + { + "epoch": 0.10154116224212482, + "grad_norm": 6.566694259643555, + "learning_rate": 9.830148443712498e-06, + "loss": 2.6904, + "step": 389400 + }, + { + "epoch": 0.101593314867827, + "grad_norm": 6.913676738739014, + "learning_rate": 9.829932345909819e-06, + "loss": 2.7406, + "step": 389600 + }, + { + "epoch": 0.10164546749352917, + "grad_norm": 6.623051166534424, + "learning_rate": 9.829716113104964e-06, + "loss": 2.71, + "step": 389800 + }, + { + "epoch": 0.10169762011923134, + "grad_norm": 6.746899127960205, + "learning_rate": 9.829499745303972e-06, + "loss": 2.7284, + "step": 390000 + }, + { + "epoch": 0.1017497727449335, + "grad_norm": 7.177034854888916, + "learning_rate": 9.829283242512896e-06, + "loss": 2.7186, + "step": 390200 + }, + { + "epoch": 0.10180192537063568, + "grad_norm": 7.524350166320801, + "learning_rate": 9.829066604737784e-06, + "loss": 2.692, + "step": 390400 + }, + { + "epoch": 0.10185407799633785, + "grad_norm": 6.514491558074951, + "learning_rate": 9.828849831984693e-06, + "loss": 2.7025, + "step": 390600 + }, + { + "epoch": 0.10190623062204002, + "grad_norm": 7.224747180938721, + "learning_rate": 9.82863292425968e-06, + "loss": 2.7793, + "step": 390800 + }, + { + "epoch": 0.10195838324774219, + "grad_norm": 7.076272010803223, + "learning_rate": 9.82841588156881e-06, + "loss": 2.7168, + "step": 391000 + }, + { + "epoch": 0.10201053587344436, + "grad_norm": 7.147103786468506, + "learning_rate": 9.828198703918148e-06, + "loss": 2.7419, + "step": 391200 + }, + { + "epoch": 0.10206268849914653, + "grad_norm": 6.700717926025391, + "learning_rate": 9.827981391313766e-06, + "loss": 2.668, + "step": 391400 + }, + { + "epoch": 0.1021148411248487, + "grad_norm": 6.936060428619385, + "learning_rate": 9.827763943761736e-06, + "loss": 2.709, + "step": 391600 + }, + { + "epoch": 0.10216699375055086, + "grad_norm": 6.975337982177734, + "learning_rate": 9.827546361268139e-06, + "loss": 2.7271, + "step": 391800 + }, + { + "epoch": 0.10221914637625303, + "grad_norm": 7.4715118408203125, + "learning_rate": 9.827328643839054e-06, + "loss": 2.7067, + "step": 392000 + }, + { + "epoch": 0.1022712990019552, + "grad_norm": 8.031476974487305, + "learning_rate": 9.827110791480568e-06, + "loss": 2.7094, + "step": 392200 + }, + { + "epoch": 0.10232345162765737, + "grad_norm": 6.826619625091553, + "learning_rate": 9.82689280419877e-06, + 
"loss": 2.6987, + "step": 392400 + }, + { + "epoch": 0.10237560425335954, + "grad_norm": 7.765749931335449, + "learning_rate": 9.826674681999749e-06, + "loss": 2.7286, + "step": 392600 + }, + { + "epoch": 0.10242775687906171, + "grad_norm": 7.1541666984558105, + "learning_rate": 9.826456424889608e-06, + "loss": 2.7061, + "step": 392800 + }, + { + "epoch": 0.10247990950476388, + "grad_norm": 6.534503936767578, + "learning_rate": 9.826238032874445e-06, + "loss": 2.7177, + "step": 393000 + }, + { + "epoch": 0.10253206213046605, + "grad_norm": 7.043641090393066, + "learning_rate": 9.826019505960363e-06, + "loss": 2.7428, + "step": 393200 + }, + { + "epoch": 0.10258421475616822, + "grad_norm": 6.870479583740234, + "learning_rate": 9.82580084415347e-06, + "loss": 2.73, + "step": 393400 + }, + { + "epoch": 0.10263636738187039, + "grad_norm": 7.458539962768555, + "learning_rate": 9.825582047459883e-06, + "loss": 2.6956, + "step": 393600 + }, + { + "epoch": 0.10268852000757256, + "grad_norm": 7.4572038650512695, + "learning_rate": 9.825363115885711e-06, + "loss": 2.7238, + "step": 393800 + }, + { + "epoch": 0.10274067263327473, + "grad_norm": 6.63360595703125, + "learning_rate": 9.825144049437077e-06, + "loss": 2.752, + "step": 394000 + }, + { + "epoch": 0.1027928252589769, + "grad_norm": 6.744916915893555, + "learning_rate": 9.824924848120101e-06, + "loss": 2.694, + "step": 394200 + }, + { + "epoch": 0.10284497788467907, + "grad_norm": 7.818175315856934, + "learning_rate": 9.824705511940914e-06, + "loss": 2.7334, + "step": 394400 + }, + { + "epoch": 0.10289713051038124, + "grad_norm": 6.705904483795166, + "learning_rate": 9.824486040905646e-06, + "loss": 2.7112, + "step": 394600 + }, + { + "epoch": 0.10294928313608341, + "grad_norm": 6.82873010635376, + "learning_rate": 9.824266435020428e-06, + "loss": 2.6945, + "step": 394800 + }, + { + "epoch": 0.10300143576178558, + "grad_norm": 7.144585132598877, + "learning_rate": 9.824046694291401e-06, + "loss": 2.7079, + "step": 395000 + }, + { + "epoch": 0.10305358838748775, + "grad_norm": 7.308895587921143, + "learning_rate": 9.823826818724707e-06, + "loss": 2.6953, + "step": 395200 + }, + { + "epoch": 0.10310574101318992, + "grad_norm": 6.582967281341553, + "learning_rate": 9.823606808326491e-06, + "loss": 2.7148, + "step": 395400 + }, + { + "epoch": 0.10315789363889209, + "grad_norm": 7.389861106872559, + "learning_rate": 9.823386663102902e-06, + "loss": 2.6945, + "step": 395600 + }, + { + "epoch": 0.10321004626459426, + "grad_norm": 7.0465006828308105, + "learning_rate": 9.823166383060096e-06, + "loss": 2.7263, + "step": 395800 + }, + { + "epoch": 0.10326219889029643, + "grad_norm": 7.7955322265625, + "learning_rate": 9.82294596820423e-06, + "loss": 2.7256, + "step": 396000 + }, + { + "epoch": 0.1033143515159986, + "grad_norm": 7.1410322189331055, + "learning_rate": 9.82272541854146e-06, + "loss": 2.6904, + "step": 396200 + }, + { + "epoch": 0.10336650414170077, + "grad_norm": 7.031514644622803, + "learning_rate": 9.822504734077953e-06, + "loss": 2.6924, + "step": 396400 + }, + { + "epoch": 0.10341865676740294, + "grad_norm": 7.500546932220459, + "learning_rate": 9.822283914819882e-06, + "loss": 2.6723, + "step": 396600 + }, + { + "epoch": 0.10347080939310511, + "grad_norm": 7.630676746368408, + "learning_rate": 9.82206296077341e-06, + "loss": 2.7074, + "step": 396800 + }, + { + "epoch": 0.10352296201880729, + "grad_norm": 6.640892028808594, + "learning_rate": 9.821841871944723e-06, + "loss": 2.7259, + "step": 397000 + }, + { + "epoch": 
0.10357511464450946, + "grad_norm": 7.342281818389893, + "learning_rate": 9.821620648339993e-06, + "loss": 2.6999, + "step": 397200 + }, + { + "epoch": 0.10362726727021163, + "grad_norm": 7.83634614944458, + "learning_rate": 9.821399289965408e-06, + "loss": 2.7423, + "step": 397400 + }, + { + "epoch": 0.1036794198959138, + "grad_norm": 7.7452826499938965, + "learning_rate": 9.821177796827152e-06, + "loss": 2.7492, + "step": 397600 + }, + { + "epoch": 0.10373157252161595, + "grad_norm": 7.283862113952637, + "learning_rate": 9.820956168931418e-06, + "loss": 2.6983, + "step": 397800 + }, + { + "epoch": 0.10378372514731812, + "grad_norm": 7.292904376983643, + "learning_rate": 9.820734406284402e-06, + "loss": 2.7275, + "step": 398000 + }, + { + "epoch": 0.1038358777730203, + "grad_norm": 7.42738151550293, + "learning_rate": 9.820512508892299e-06, + "loss": 2.673, + "step": 398200 + }, + { + "epoch": 0.10388803039872246, + "grad_norm": 7.42814302444458, + "learning_rate": 9.820290476761314e-06, + "loss": 2.7334, + "step": 398400 + }, + { + "epoch": 0.10394018302442463, + "grad_norm": 7.708849906921387, + "learning_rate": 9.820068309897653e-06, + "loss": 2.6999, + "step": 398600 + }, + { + "epoch": 0.1039923356501268, + "grad_norm": 7.1372551918029785, + "learning_rate": 9.819846008307523e-06, + "loss": 2.6921, + "step": 398800 + }, + { + "epoch": 0.10404448827582898, + "grad_norm": 6.49362850189209, + "learning_rate": 9.81962357199714e-06, + "loss": 2.7561, + "step": 399000 + }, + { + "epoch": 0.10409664090153115, + "grad_norm": 7.146295547485352, + "learning_rate": 9.819401000972721e-06, + "loss": 2.7148, + "step": 399200 + }, + { + "epoch": 0.10414879352723332, + "grad_norm": 7.127407550811768, + "learning_rate": 9.819178295240488e-06, + "loss": 2.7187, + "step": 399400 + }, + { + "epoch": 0.10420094615293549, + "grad_norm": 7.735341548919678, + "learning_rate": 9.818955454806664e-06, + "loss": 2.7221, + "step": 399600 + }, + { + "epoch": 0.10425309877863766, + "grad_norm": 7.073373317718506, + "learning_rate": 9.81873247967748e-06, + "loss": 2.7069, + "step": 399800 + }, + { + "epoch": 0.10430525140433983, + "grad_norm": 6.615175247192383, + "learning_rate": 9.818509369859165e-06, + "loss": 2.6807, + "step": 400000 + }, + { + "epoch": 0.104357404030042, + "grad_norm": 7.4061174392700195, + "learning_rate": 9.818286125357958e-06, + "loss": 2.671, + "step": 400200 + }, + { + "epoch": 0.10440955665574417, + "grad_norm": 7.3244524002075195, + "learning_rate": 9.818062746180098e-06, + "loss": 2.7053, + "step": 400400 + }, + { + "epoch": 0.10446170928144634, + "grad_norm": 7.333294868469238, + "learning_rate": 9.81783923233183e-06, + "loss": 2.7226, + "step": 400600 + }, + { + "epoch": 0.10451386190714851, + "grad_norm": 8.426340103149414, + "learning_rate": 9.817615583819397e-06, + "loss": 2.7258, + "step": 400800 + }, + { + "epoch": 0.10456601453285068, + "grad_norm": 7.6054558753967285, + "learning_rate": 9.817391800649056e-06, + "loss": 2.6957, + "step": 401000 + }, + { + "epoch": 0.10461816715855285, + "grad_norm": 7.174556732177734, + "learning_rate": 9.81716788282706e-06, + "loss": 2.6637, + "step": 401200 + }, + { + "epoch": 0.10467031978425502, + "grad_norm": 7.382963180541992, + "learning_rate": 9.816943830359666e-06, + "loss": 2.6776, + "step": 401400 + }, + { + "epoch": 0.10472247240995719, + "grad_norm": 7.8001203536987305, + "learning_rate": 9.816719643253138e-06, + "loss": 2.7124, + "step": 401600 + }, + { + "epoch": 0.10477462503565936, + "grad_norm": 7.5196733474731445, + 
"learning_rate": 9.816495321513744e-06, + "loss": 2.6645, + "step": 401800 + }, + { + "epoch": 0.10482677766136153, + "grad_norm": 6.8788628578186035, + "learning_rate": 9.81627086514775e-06, + "loss": 2.7302, + "step": 402000 + }, + { + "epoch": 0.1048789302870637, + "grad_norm": 7.129185199737549, + "learning_rate": 9.816046274161432e-06, + "loss": 2.6782, + "step": 402200 + }, + { + "epoch": 0.10493108291276587, + "grad_norm": 7.342111587524414, + "learning_rate": 9.815821548561069e-06, + "loss": 2.711, + "step": 402400 + }, + { + "epoch": 0.10498323553846804, + "grad_norm": 5.888463020324707, + "learning_rate": 9.81559668835294e-06, + "loss": 2.6887, + "step": 402600 + }, + { + "epoch": 0.10503538816417021, + "grad_norm": 7.24282169342041, + "learning_rate": 9.81537169354333e-06, + "loss": 2.746, + "step": 402800 + }, + { + "epoch": 0.10508754078987238, + "grad_norm": 7.5223822593688965, + "learning_rate": 9.81514656413853e-06, + "loss": 2.7138, + "step": 403000 + }, + { + "epoch": 0.10513969341557455, + "grad_norm": 6.787247180938721, + "learning_rate": 9.814921300144831e-06, + "loss": 2.7083, + "step": 403200 + }, + { + "epoch": 0.10519184604127672, + "grad_norm": 8.027321815490723, + "learning_rate": 9.814695901568529e-06, + "loss": 2.7397, + "step": 403400 + }, + { + "epoch": 0.10524399866697888, + "grad_norm": 7.712162017822266, + "learning_rate": 9.814470368415925e-06, + "loss": 2.6962, + "step": 403600 + }, + { + "epoch": 0.10529615129268105, + "grad_norm": 7.541025638580322, + "learning_rate": 9.814244700693325e-06, + "loss": 2.717, + "step": 403800 + }, + { + "epoch": 0.10534830391838322, + "grad_norm": 7.47944974899292, + "learning_rate": 9.814018898407034e-06, + "loss": 2.6941, + "step": 404000 + }, + { + "epoch": 0.10540045654408539, + "grad_norm": 7.1561737060546875, + "learning_rate": 9.813792961563363e-06, + "loss": 2.6746, + "step": 404200 + }, + { + "epoch": 0.10545260916978756, + "grad_norm": 7.5194220542907715, + "learning_rate": 9.81356689016863e-06, + "loss": 2.6951, + "step": 404400 + }, + { + "epoch": 0.10550476179548973, + "grad_norm": 7.529834747314453, + "learning_rate": 9.813340684229148e-06, + "loss": 2.6781, + "step": 404600 + }, + { + "epoch": 0.1055569144211919, + "grad_norm": 7.3788533210754395, + "learning_rate": 9.813114343751248e-06, + "loss": 2.6886, + "step": 404800 + }, + { + "epoch": 0.10560906704689407, + "grad_norm": 7.26027250289917, + "learning_rate": 9.81288786874125e-06, + "loss": 2.7073, + "step": 405000 + }, + { + "epoch": 0.10566121967259624, + "grad_norm": 7.290615081787109, + "learning_rate": 9.812661259205489e-06, + "loss": 2.72, + "step": 405200 + }, + { + "epoch": 0.10571337229829841, + "grad_norm": 7.432085037231445, + "learning_rate": 9.812434515150294e-06, + "loss": 2.6673, + "step": 405400 + }, + { + "epoch": 0.10576552492400058, + "grad_norm": 7.36521577835083, + "learning_rate": 9.812207636582007e-06, + "loss": 2.7188, + "step": 405600 + }, + { + "epoch": 0.10581767754970275, + "grad_norm": 6.904064655303955, + "learning_rate": 9.811980623506967e-06, + "loss": 2.6936, + "step": 405800 + }, + { + "epoch": 0.10586983017540492, + "grad_norm": 6.532309055328369, + "learning_rate": 9.81175347593152e-06, + "loss": 2.6753, + "step": 406000 + }, + { + "epoch": 0.1059219828011071, + "grad_norm": 7.805009365081787, + "learning_rate": 9.811526193862017e-06, + "loss": 2.7279, + "step": 406200 + }, + { + "epoch": 0.10597413542680927, + "grad_norm": 6.72572135925293, + "learning_rate": 9.811298777304807e-06, + "loss": 2.6603, + "step": 
406400 + }, + { + "epoch": 0.10602628805251144, + "grad_norm": 6.9229559898376465, + "learning_rate": 9.811071226266248e-06, + "loss": 2.6847, + "step": 406600 + }, + { + "epoch": 0.1060784406782136, + "grad_norm": 7.181591033935547, + "learning_rate": 9.810843540752703e-06, + "loss": 2.7129, + "step": 406800 + }, + { + "epoch": 0.10613059330391578, + "grad_norm": 7.120484828948975, + "learning_rate": 9.810615720770533e-06, + "loss": 2.7121, + "step": 407000 + }, + { + "epoch": 0.10618274592961795, + "grad_norm": 8.234344482421875, + "learning_rate": 9.810387766326108e-06, + "loss": 2.6959, + "step": 407200 + }, + { + "epoch": 0.10623489855532012, + "grad_norm": 7.5896100997924805, + "learning_rate": 9.810159677425797e-06, + "loss": 2.7096, + "step": 407400 + }, + { + "epoch": 0.10628705118102229, + "grad_norm": 6.704971790313721, + "learning_rate": 9.809931454075976e-06, + "loss": 2.7057, + "step": 407600 + }, + { + "epoch": 0.10633920380672446, + "grad_norm": 8.379301071166992, + "learning_rate": 9.809703096283025e-06, + "loss": 2.7059, + "step": 407800 + }, + { + "epoch": 0.10639135643242663, + "grad_norm": 8.338081359863281, + "learning_rate": 9.809474604053329e-06, + "loss": 2.7231, + "step": 408000 + }, + { + "epoch": 0.1064435090581288, + "grad_norm": 7.795104026794434, + "learning_rate": 9.809245977393268e-06, + "loss": 2.7019, + "step": 408200 + }, + { + "epoch": 0.10649566168383097, + "grad_norm": 6.772091865539551, + "learning_rate": 9.80901721630924e-06, + "loss": 2.7151, + "step": 408400 + }, + { + "epoch": 0.10654781430953314, + "grad_norm": 7.263775825500488, + "learning_rate": 9.808788320807636e-06, + "loss": 2.7085, + "step": 408600 + }, + { + "epoch": 0.10659996693523531, + "grad_norm": 7.627434730529785, + "learning_rate": 9.808559290894852e-06, + "loss": 2.6663, + "step": 408800 + }, + { + "epoch": 0.10665211956093748, + "grad_norm": 7.4458465576171875, + "learning_rate": 9.808330126577293e-06, + "loss": 2.6748, + "step": 409000 + }, + { + "epoch": 0.10670427218663965, + "grad_norm": 7.223238468170166, + "learning_rate": 9.808100827861361e-06, + "loss": 2.6919, + "step": 409200 + }, + { + "epoch": 0.10675642481234182, + "grad_norm": 7.894956111907959, + "learning_rate": 9.807871394753468e-06, + "loss": 2.6707, + "step": 409400 + }, + { + "epoch": 0.10680857743804398, + "grad_norm": 7.548394203186035, + "learning_rate": 9.807641827260027e-06, + "loss": 2.6829, + "step": 409600 + }, + { + "epoch": 0.10686073006374615, + "grad_norm": 6.605945587158203, + "learning_rate": 9.807412125387449e-06, + "loss": 2.7355, + "step": 409800 + }, + { + "epoch": 0.10691288268944832, + "grad_norm": 6.510282516479492, + "learning_rate": 9.807182289142163e-06, + "loss": 2.726, + "step": 410000 + }, + { + "epoch": 0.10696503531515049, + "grad_norm": 6.95159387588501, + "learning_rate": 9.806952318530589e-06, + "loss": 2.7377, + "step": 410200 + }, + { + "epoch": 0.10701718794085266, + "grad_norm": 8.0206298828125, + "learning_rate": 9.806722213559153e-06, + "loss": 2.7229, + "step": 410400 + }, + { + "epoch": 0.10706934056655483, + "grad_norm": 7.942624568939209, + "learning_rate": 9.806491974234291e-06, + "loss": 2.7275, + "step": 410600 + }, + { + "epoch": 0.107121493192257, + "grad_norm": 8.461700439453125, + "learning_rate": 9.806261600562434e-06, + "loss": 2.6925, + "step": 410800 + }, + { + "epoch": 0.10717364581795917, + "grad_norm": 7.115810871124268, + "learning_rate": 9.806031092550025e-06, + "loss": 2.6673, + "step": 411000 + }, + { + "epoch": 0.10722579844366134, + "grad_norm": 
7.023535251617432, + "learning_rate": 9.805800450203504e-06, + "loss": 2.768, + "step": 411200 + }, + { + "epoch": 0.10727795106936351, + "grad_norm": 7.725059986114502, + "learning_rate": 9.80556967352932e-06, + "loss": 2.6898, + "step": 411400 + }, + { + "epoch": 0.10733010369506568, + "grad_norm": 7.8010573387146, + "learning_rate": 9.805338762533924e-06, + "loss": 2.7011, + "step": 411600 + }, + { + "epoch": 0.10738225632076785, + "grad_norm": 7.579238414764404, + "learning_rate": 9.805107717223767e-06, + "loss": 2.7297, + "step": 411800 + }, + { + "epoch": 0.10743440894647002, + "grad_norm": 8.260748863220215, + "learning_rate": 9.80487653760531e-06, + "loss": 2.6825, + "step": 412000 + }, + { + "epoch": 0.10748656157217219, + "grad_norm": 7.9606547355651855, + "learning_rate": 9.804645223685012e-06, + "loss": 2.7009, + "step": 412200 + }, + { + "epoch": 0.10753871419787436, + "grad_norm": 8.684904098510742, + "learning_rate": 9.804413775469342e-06, + "loss": 2.7123, + "step": 412400 + }, + { + "epoch": 0.10759086682357653, + "grad_norm": 7.265321731567383, + "learning_rate": 9.804182192964765e-06, + "loss": 2.664, + "step": 412600 + }, + { + "epoch": 0.1076430194492787, + "grad_norm": 7.080298900604248, + "learning_rate": 9.803950476177757e-06, + "loss": 2.705, + "step": 412800 + }, + { + "epoch": 0.10769517207498087, + "grad_norm": 7.625601768493652, + "learning_rate": 9.803718625114796e-06, + "loss": 2.7355, + "step": 413000 + }, + { + "epoch": 0.10774732470068304, + "grad_norm": 7.863028526306152, + "learning_rate": 9.803486639782357e-06, + "loss": 2.7024, + "step": 413200 + }, + { + "epoch": 0.10779947732638522, + "grad_norm": 7.483668327331543, + "learning_rate": 9.803254520186932e-06, + "loss": 2.7058, + "step": 413400 + }, + { + "epoch": 0.10785162995208739, + "grad_norm": 7.60951042175293, + "learning_rate": 9.803022266335e-06, + "loss": 2.689, + "step": 413600 + }, + { + "epoch": 0.10790378257778956, + "grad_norm": 7.47837495803833, + "learning_rate": 9.80278987823306e-06, + "loss": 2.6998, + "step": 413800 + }, + { + "epoch": 0.10795593520349173, + "grad_norm": 7.093349933624268, + "learning_rate": 9.802557355887607e-06, + "loss": 2.7003, + "step": 414000 + }, + { + "epoch": 0.1080080878291939, + "grad_norm": 7.799093723297119, + "learning_rate": 9.802324699305136e-06, + "loss": 2.7044, + "step": 414200 + }, + { + "epoch": 0.10806024045489607, + "grad_norm": 7.693939208984375, + "learning_rate": 9.802091908492153e-06, + "loss": 2.6745, + "step": 414400 + }, + { + "epoch": 0.10811239308059824, + "grad_norm": 7.931087493896484, + "learning_rate": 9.801858983455164e-06, + "loss": 2.703, + "step": 414600 + }, + { + "epoch": 0.10816454570630041, + "grad_norm": 7.6536407470703125, + "learning_rate": 9.801625924200679e-06, + "loss": 2.6576, + "step": 414800 + }, + { + "epoch": 0.10821669833200258, + "grad_norm": 7.151515483856201, + "learning_rate": 9.801392730735214e-06, + "loss": 2.7166, + "step": 415000 + }, + { + "epoch": 0.10826885095770475, + "grad_norm": 8.142443656921387, + "learning_rate": 9.801159403065286e-06, + "loss": 2.6738, + "step": 415200 + }, + { + "epoch": 0.1083210035834069, + "grad_norm": 7.3941121101379395, + "learning_rate": 9.800925941197415e-06, + "loss": 2.7265, + "step": 415400 + }, + { + "epoch": 0.10837315620910908, + "grad_norm": 7.601657390594482, + "learning_rate": 9.80069234513813e-06, + "loss": 2.6959, + "step": 415600 + }, + { + "epoch": 0.10842530883481125, + "grad_norm": 7.414285182952881, + "learning_rate": 9.800458614893958e-06, + "loss": 
2.7261, + "step": 415800 + }, + { + "epoch": 0.10847746146051342, + "grad_norm": 7.267521858215332, + "learning_rate": 9.800224750471434e-06, + "loss": 2.6731, + "step": 416000 + }, + { + "epoch": 0.10852961408621559, + "grad_norm": 7.712509632110596, + "learning_rate": 9.79999075187709e-06, + "loss": 2.7288, + "step": 416200 + }, + { + "epoch": 0.10858176671191776, + "grad_norm": 7.250224590301514, + "learning_rate": 9.799756619117473e-06, + "loss": 2.7043, + "step": 416400 + }, + { + "epoch": 0.10863391933761993, + "grad_norm": 7.441985130310059, + "learning_rate": 9.799522352199122e-06, + "loss": 2.6993, + "step": 416600 + }, + { + "epoch": 0.1086860719633221, + "grad_norm": 7.325314044952393, + "learning_rate": 9.79928795112859e-06, + "loss": 2.6846, + "step": 416800 + }, + { + "epoch": 0.10873822458902427, + "grad_norm": 7.375991344451904, + "learning_rate": 9.799053415912422e-06, + "loss": 2.7052, + "step": 417000 + }, + { + "epoch": 0.10879037721472644, + "grad_norm": 7.841715335845947, + "learning_rate": 9.798818746557182e-06, + "loss": 2.7222, + "step": 417200 + }, + { + "epoch": 0.10884252984042861, + "grad_norm": 7.707851886749268, + "learning_rate": 9.798583943069422e-06, + "loss": 2.6917, + "step": 417400 + }, + { + "epoch": 0.10889468246613078, + "grad_norm": 8.474435806274414, + "learning_rate": 9.798349005455707e-06, + "loss": 2.6629, + "step": 417600 + }, + { + "epoch": 0.10894683509183295, + "grad_norm": 7.427988529205322, + "learning_rate": 9.798113933722606e-06, + "loss": 2.6719, + "step": 417800 + }, + { + "epoch": 0.10899898771753512, + "grad_norm": 7.597637176513672, + "learning_rate": 9.797878727876689e-06, + "loss": 2.6991, + "step": 418000 + }, + { + "epoch": 0.10905114034323729, + "grad_norm": 8.201911926269531, + "learning_rate": 9.797643387924529e-06, + "loss": 2.6592, + "step": 418200 + }, + { + "epoch": 0.10910329296893946, + "grad_norm": 8.380842208862305, + "learning_rate": 9.797407913872703e-06, + "loss": 2.7308, + "step": 418400 + }, + { + "epoch": 0.10915544559464163, + "grad_norm": 8.024592399597168, + "learning_rate": 9.797172305727796e-06, + "loss": 2.6984, + "step": 418600 + }, + { + "epoch": 0.1092075982203438, + "grad_norm": 7.051395893096924, + "learning_rate": 9.79693656349639e-06, + "loss": 2.7273, + "step": 418800 + }, + { + "epoch": 0.10925975084604597, + "grad_norm": 8.769367218017578, + "learning_rate": 9.796700687185076e-06, + "loss": 2.7199, + "step": 419000 + }, + { + "epoch": 0.10931190347174814, + "grad_norm": 6.581594467163086, + "learning_rate": 9.796464676800448e-06, + "loss": 2.7053, + "step": 419200 + }, + { + "epoch": 0.10936405609745031, + "grad_norm": 7.729722499847412, + "learning_rate": 9.796228532349102e-06, + "loss": 2.6976, + "step": 419400 + }, + { + "epoch": 0.10941620872315248, + "grad_norm": 6.908459186553955, + "learning_rate": 9.795992253837638e-06, + "loss": 2.704, + "step": 419600 + }, + { + "epoch": 0.10946836134885465, + "grad_norm": 7.443035125732422, + "learning_rate": 9.79575584127266e-06, + "loss": 2.6812, + "step": 419800 + }, + { + "epoch": 0.10952051397455682, + "grad_norm": 7.436124801635742, + "learning_rate": 9.795519294660775e-06, + "loss": 2.6817, + "step": 420000 + }, + { + "epoch": 0.109572666600259, + "grad_norm": 7.913905620574951, + "learning_rate": 9.795282614008598e-06, + "loss": 2.7261, + "step": 420200 + }, + { + "epoch": 0.10962481922596116, + "grad_norm": 7.672698497772217, + "learning_rate": 9.795045799322744e-06, + "loss": 2.7001, + "step": 420400 + }, + { + "epoch": 0.10967697185166334, + 
"grad_norm": 7.240036487579346, + "learning_rate": 9.794808850609828e-06, + "loss": 2.6813, + "step": 420600 + }, + { + "epoch": 0.1097291244773655, + "grad_norm": 7.6813812255859375, + "learning_rate": 9.79457176787648e-06, + "loss": 2.6612, + "step": 420800 + }, + { + "epoch": 0.10978127710306768, + "grad_norm": 7.365231513977051, + "learning_rate": 9.79433455112932e-06, + "loss": 2.7116, + "step": 421000 + }, + { + "epoch": 0.10983342972876985, + "grad_norm": 8.09189510345459, + "learning_rate": 9.794097200374981e-06, + "loss": 2.6995, + "step": 421200 + }, + { + "epoch": 0.109885582354472, + "grad_norm": 7.653886318206787, + "learning_rate": 9.7938597156201e-06, + "loss": 2.6798, + "step": 421400 + }, + { + "epoch": 0.10993773498017417, + "grad_norm": 6.208761692047119, + "learning_rate": 9.79362209687131e-06, + "loss": 2.6639, + "step": 421600 + }, + { + "epoch": 0.10998988760587634, + "grad_norm": 7.84056282043457, + "learning_rate": 9.793384344135254e-06, + "loss": 2.6726, + "step": 421800 + }, + { + "epoch": 0.11004204023157851, + "grad_norm": 6.51863431930542, + "learning_rate": 9.793146457418581e-06, + "loss": 2.6799, + "step": 422000 + }, + { + "epoch": 0.11009419285728068, + "grad_norm": 7.248006820678711, + "learning_rate": 9.792908436727937e-06, + "loss": 2.7188, + "step": 422200 + }, + { + "epoch": 0.11014634548298285, + "grad_norm": 7.784631252288818, + "learning_rate": 9.792670282069976e-06, + "loss": 2.6932, + "step": 422400 + }, + { + "epoch": 0.11019849810868503, + "grad_norm": 7.782918930053711, + "learning_rate": 9.792431993451355e-06, + "loss": 2.6632, + "step": 422600 + }, + { + "epoch": 0.1102506507343872, + "grad_norm": 6.995285511016846, + "learning_rate": 9.792193570878733e-06, + "loss": 2.7118, + "step": 422800 + }, + { + "epoch": 0.11030280336008937, + "grad_norm": 8.329195976257324, + "learning_rate": 9.791955014358774e-06, + "loss": 2.6919, + "step": 423000 + }, + { + "epoch": 0.11035495598579154, + "grad_norm": 7.235878944396973, + "learning_rate": 9.791716323898149e-06, + "loss": 2.6916, + "step": 423200 + }, + { + "epoch": 0.1104071086114937, + "grad_norm": 7.546281814575195, + "learning_rate": 9.791477499503528e-06, + "loss": 2.695, + "step": 423400 + }, + { + "epoch": 0.11045926123719588, + "grad_norm": 7.888071537017822, + "learning_rate": 9.791238541181584e-06, + "loss": 2.708, + "step": 423600 + }, + { + "epoch": 0.11051141386289805, + "grad_norm": 8.009730339050293, + "learning_rate": 9.790999448939001e-06, + "loss": 2.6739, + "step": 423800 + }, + { + "epoch": 0.11056356648860022, + "grad_norm": 7.215407371520996, + "learning_rate": 9.790760222782455e-06, + "loss": 2.6846, + "step": 424000 + }, + { + "epoch": 0.11061571911430239, + "grad_norm": 7.5460638999938965, + "learning_rate": 9.79052086271864e-06, + "loss": 2.6856, + "step": 424200 + }, + { + "epoch": 0.11066787174000456, + "grad_norm": 7.906571865081787, + "learning_rate": 9.790281368754245e-06, + "loss": 2.6999, + "step": 424400 + }, + { + "epoch": 0.11072002436570673, + "grad_norm": 7.218746662139893, + "learning_rate": 9.790041740895958e-06, + "loss": 2.7122, + "step": 424600 + }, + { + "epoch": 0.1107721769914089, + "grad_norm": 9.075844764709473, + "learning_rate": 9.789801979150483e-06, + "loss": 2.6593, + "step": 424800 + }, + { + "epoch": 0.11082432961711107, + "grad_norm": 8.792651176452637, + "learning_rate": 9.78956208352452e-06, + "loss": 2.7338, + "step": 425000 + }, + { + "epoch": 0.11087648224281324, + "grad_norm": 7.27746057510376, + "learning_rate": 9.789322054024774e-06, + 
"loss": 2.6658, + "step": 425200 + }, + { + "epoch": 0.11092863486851541, + "grad_norm": 7.638511657714844, + "learning_rate": 9.789081890657955e-06, + "loss": 2.7166, + "step": 425400 + }, + { + "epoch": 0.11098078749421758, + "grad_norm": 7.376978874206543, + "learning_rate": 9.788841593430775e-06, + "loss": 2.6812, + "step": 425600 + }, + { + "epoch": 0.11103294011991975, + "grad_norm": 7.535377025604248, + "learning_rate": 9.788601162349949e-06, + "loss": 2.6784, + "step": 425800 + }, + { + "epoch": 0.11108509274562192, + "grad_norm": 8.068282127380371, + "learning_rate": 9.7883605974222e-06, + "loss": 2.7089, + "step": 426000 + }, + { + "epoch": 0.11113724537132409, + "grad_norm": 8.093073844909668, + "learning_rate": 9.788119898654253e-06, + "loss": 2.6628, + "step": 426200 + }, + { + "epoch": 0.11118939799702626, + "grad_norm": 7.169579029083252, + "learning_rate": 9.787879066052833e-06, + "loss": 2.7379, + "step": 426400 + }, + { + "epoch": 0.11124155062272843, + "grad_norm": 7.598651885986328, + "learning_rate": 9.78763809962467e-06, + "loss": 2.6831, + "step": 426600 + }, + { + "epoch": 0.1112937032484306, + "grad_norm": 8.083666801452637, + "learning_rate": 9.787396999376503e-06, + "loss": 2.7267, + "step": 426800 + }, + { + "epoch": 0.11134585587413277, + "grad_norm": 8.373312950134277, + "learning_rate": 9.787155765315071e-06, + "loss": 2.6836, + "step": 427000 + }, + { + "epoch": 0.11139800849983493, + "grad_norm": 8.161797523498535, + "learning_rate": 9.786914397447116e-06, + "loss": 2.6782, + "step": 427200 + }, + { + "epoch": 0.1114501611255371, + "grad_norm": 8.155803680419922, + "learning_rate": 9.786672895779382e-06, + "loss": 2.661, + "step": 427400 + }, + { + "epoch": 0.11150231375123927, + "grad_norm": 8.83053207397461, + "learning_rate": 9.786431260318624e-06, + "loss": 2.6674, + "step": 427600 + }, + { + "epoch": 0.11155446637694144, + "grad_norm": 6.54095983505249, + "learning_rate": 9.78618949107159e-06, + "loss": 2.7123, + "step": 427800 + }, + { + "epoch": 0.11160661900264361, + "grad_norm": 7.278541088104248, + "learning_rate": 9.785947588045044e-06, + "loss": 2.6894, + "step": 428000 + }, + { + "epoch": 0.11165877162834578, + "grad_norm": 8.509937286376953, + "learning_rate": 9.785705551245741e-06, + "loss": 2.6652, + "step": 428200 + }, + { + "epoch": 0.11171092425404795, + "grad_norm": 8.071086883544922, + "learning_rate": 9.785463380680454e-06, + "loss": 2.6859, + "step": 428400 + }, + { + "epoch": 0.11176307687975012, + "grad_norm": 8.377106666564941, + "learning_rate": 9.785221076355944e-06, + "loss": 2.7175, + "step": 428600 + }, + { + "epoch": 0.1118152295054523, + "grad_norm": 7.088006496429443, + "learning_rate": 9.784978638278989e-06, + "loss": 2.6663, + "step": 428800 + }, + { + "epoch": 0.11186738213115446, + "grad_norm": 8.437390327453613, + "learning_rate": 9.784736066456363e-06, + "loss": 2.7336, + "step": 429000 + }, + { + "epoch": 0.11191953475685663, + "grad_norm": 8.951874732971191, + "learning_rate": 9.784493360894849e-06, + "loss": 2.732, + "step": 429200 + }, + { + "epoch": 0.1119716873825588, + "grad_norm": 7.750514030456543, + "learning_rate": 9.784250521601226e-06, + "loss": 2.6847, + "step": 429400 + }, + { + "epoch": 0.11202384000826097, + "grad_norm": 8.56009292602539, + "learning_rate": 9.784007548582287e-06, + "loss": 2.6984, + "step": 429600 + }, + { + "epoch": 0.11207599263396315, + "grad_norm": 7.063655376434326, + "learning_rate": 9.783764441844819e-06, + "loss": 2.6662, + "step": 429800 + }, + { + "epoch": 
0.11212814525966532, + "grad_norm": 7.597879886627197, + "learning_rate": 9.78352120139562e-06, + "loss": 2.6689, + "step": 430000 + }, + { + "epoch": 0.11218029788536749, + "grad_norm": 7.35047721862793, + "learning_rate": 9.783277827241486e-06, + "loss": 2.7019, + "step": 430200 + }, + { + "epoch": 0.11223245051106966, + "grad_norm": 7.905674457550049, + "learning_rate": 9.783034319389223e-06, + "loss": 2.6588, + "step": 430400 + }, + { + "epoch": 0.11228460313677183, + "grad_norm": 8.24709415435791, + "learning_rate": 9.782790677845638e-06, + "loss": 2.6809, + "step": 430600 + }, + { + "epoch": 0.112336755762474, + "grad_norm": 7.832298278808594, + "learning_rate": 9.782546902617535e-06, + "loss": 2.6982, + "step": 430800 + }, + { + "epoch": 0.11238890838817617, + "grad_norm": 7.27885103225708, + "learning_rate": 9.782302993711733e-06, + "loss": 2.7151, + "step": 431000 + }, + { + "epoch": 0.11244106101387834, + "grad_norm": 7.919535160064697, + "learning_rate": 9.782058951135047e-06, + "loss": 2.6966, + "step": 431200 + }, + { + "epoch": 0.11249321363958051, + "grad_norm": 7.770106315612793, + "learning_rate": 9.781814774894302e-06, + "loss": 2.7011, + "step": 431400 + }, + { + "epoch": 0.11254536626528268, + "grad_norm": 8.116305351257324, + "learning_rate": 9.781570464996318e-06, + "loss": 2.6831, + "step": 431600 + }, + { + "epoch": 0.11259751889098485, + "grad_norm": 8.72941780090332, + "learning_rate": 9.781326021447928e-06, + "loss": 2.6948, + "step": 431800 + }, + { + "epoch": 0.11264967151668702, + "grad_norm": 8.299556732177734, + "learning_rate": 9.781081444255962e-06, + "loss": 2.7151, + "step": 432000 + }, + { + "epoch": 0.11270182414238919, + "grad_norm": 7.460847854614258, + "learning_rate": 9.780836733427255e-06, + "loss": 2.6989, + "step": 432200 + }, + { + "epoch": 0.11275397676809136, + "grad_norm": 8.605817794799805, + "learning_rate": 9.780591888968652e-06, + "loss": 2.6516, + "step": 432400 + }, + { + "epoch": 0.11280612939379353, + "grad_norm": 7.617114067077637, + "learning_rate": 9.780346910886991e-06, + "loss": 2.6607, + "step": 432600 + }, + { + "epoch": 0.1128582820194957, + "grad_norm": 7.732776641845703, + "learning_rate": 9.78010179918912e-06, + "loss": 2.6997, + "step": 432800 + }, + { + "epoch": 0.11291043464519787, + "grad_norm": 7.245545864105225, + "learning_rate": 9.779856553881897e-06, + "loss": 2.6959, + "step": 433000 + }, + { + "epoch": 0.11296258727090003, + "grad_norm": 7.128195285797119, + "learning_rate": 9.779611174972169e-06, + "loss": 2.656, + "step": 433200 + }, + { + "epoch": 0.1130147398966022, + "grad_norm": 7.312046051025391, + "learning_rate": 9.779365662466798e-06, + "loss": 2.6691, + "step": 433400 + }, + { + "epoch": 0.11306689252230437, + "grad_norm": 8.948485374450684, + "learning_rate": 9.779120016372646e-06, + "loss": 2.7061, + "step": 433600 + }, + { + "epoch": 0.11311904514800654, + "grad_norm": 7.606996059417725, + "learning_rate": 9.77887423669658e-06, + "loss": 2.6527, + "step": 433800 + }, + { + "epoch": 0.11317119777370871, + "grad_norm": 8.001826286315918, + "learning_rate": 9.778628323445467e-06, + "loss": 2.663, + "step": 434000 + }, + { + "epoch": 0.11322335039941088, + "grad_norm": 8.344403266906738, + "learning_rate": 9.778382276626183e-06, + "loss": 2.6678, + "step": 434200 + }, + { + "epoch": 0.11327550302511305, + "grad_norm": 8.27188491821289, + "learning_rate": 9.778136096245605e-06, + "loss": 2.7192, + "step": 434400 + }, + { + "epoch": 0.11332765565081522, + "grad_norm": 8.35887336730957, + "learning_rate": 
9.777889782310614e-06, + "loss": 2.6627, + "step": 434600 + }, + { + "epoch": 0.11337980827651739, + "grad_norm": 7.3902201652526855, + "learning_rate": 9.777643334828094e-06, + "loss": 2.7158, + "step": 434800 + }, + { + "epoch": 0.11343196090221956, + "grad_norm": 8.090744972229004, + "learning_rate": 9.777396753804935e-06, + "loss": 2.7031, + "step": 435000 + }, + { + "epoch": 0.11348411352792173, + "grad_norm": 7.8479084968566895, + "learning_rate": 9.777150039248027e-06, + "loss": 2.6811, + "step": 435200 + }, + { + "epoch": 0.1135362661536239, + "grad_norm": 7.591457366943359, + "learning_rate": 9.776903191164267e-06, + "loss": 2.6928, + "step": 435400 + }, + { + "epoch": 0.11358841877932607, + "grad_norm": 8.277278900146484, + "learning_rate": 9.776656209560556e-06, + "loss": 2.6798, + "step": 435600 + }, + { + "epoch": 0.11364057140502824, + "grad_norm": 8.627765655517578, + "learning_rate": 9.776409094443796e-06, + "loss": 2.6743, + "step": 435800 + }, + { + "epoch": 0.11369272403073041, + "grad_norm": 7.7793288230896, + "learning_rate": 9.776161845820894e-06, + "loss": 2.6513, + "step": 436000 + }, + { + "epoch": 0.11374487665643258, + "grad_norm": 6.800634384155273, + "learning_rate": 9.775914463698758e-06, + "loss": 2.7216, + "step": 436200 + }, + { + "epoch": 0.11379702928213475, + "grad_norm": 7.648603439331055, + "learning_rate": 9.77566694808431e-06, + "loss": 2.6822, + "step": 436400 + }, + { + "epoch": 0.11384918190783692, + "grad_norm": 7.305148124694824, + "learning_rate": 9.775419298984463e-06, + "loss": 2.6863, + "step": 436600 + }, + { + "epoch": 0.1139013345335391, + "grad_norm": 7.8893513679504395, + "learning_rate": 9.775171516406142e-06, + "loss": 2.6731, + "step": 436800 + }, + { + "epoch": 0.11395348715924127, + "grad_norm": 7.8577775955200195, + "learning_rate": 9.77492360035627e-06, + "loss": 2.6812, + "step": 437000 + }, + { + "epoch": 0.11400563978494344, + "grad_norm": 8.009392738342285, + "learning_rate": 9.774675550841776e-06, + "loss": 2.7047, + "step": 437200 + }, + { + "epoch": 0.1140577924106456, + "grad_norm": 7.961512088775635, + "learning_rate": 9.774427367869597e-06, + "loss": 2.6934, + "step": 437400 + }, + { + "epoch": 0.11410994503634778, + "grad_norm": 7.57539701461792, + "learning_rate": 9.774179051446667e-06, + "loss": 2.667, + "step": 437600 + }, + { + "epoch": 0.11416209766204995, + "grad_norm": 7.794543266296387, + "learning_rate": 9.773930601579928e-06, + "loss": 2.6578, + "step": 437800 + }, + { + "epoch": 0.11421425028775212, + "grad_norm": 8.647083282470703, + "learning_rate": 9.773682018276325e-06, + "loss": 2.7059, + "step": 438000 + }, + { + "epoch": 0.11426640291345429, + "grad_norm": 6.988203048706055, + "learning_rate": 9.773433301542805e-06, + "loss": 2.6848, + "step": 438200 + }, + { + "epoch": 0.11431855553915646, + "grad_norm": 7.052835941314697, + "learning_rate": 9.77318445138632e-06, + "loss": 2.696, + "step": 438400 + }, + { + "epoch": 0.11437070816485863, + "grad_norm": 8.016982078552246, + "learning_rate": 9.772935467813825e-06, + "loss": 2.6304, + "step": 438600 + }, + { + "epoch": 0.1144228607905608, + "grad_norm": 8.256826400756836, + "learning_rate": 9.77268635083228e-06, + "loss": 2.6369, + "step": 438800 + }, + { + "epoch": 0.11447501341626296, + "grad_norm": 9.124797821044922, + "learning_rate": 9.772437100448649e-06, + "loss": 2.6915, + "step": 439000 + }, + { + "epoch": 0.11452716604196513, + "grad_norm": 6.831801414489746, + "learning_rate": 9.772187716669898e-06, + "loss": 2.6886, + "step": 439200 + }, + { 
+ "epoch": 0.1145793186676673, + "grad_norm": 8.428510665893555, + "learning_rate": 9.771938199503e-06, + "loss": 2.7035, + "step": 439400 + }, + { + "epoch": 0.11463147129336947, + "grad_norm": 7.8952717781066895, + "learning_rate": 9.771688548954923e-06, + "loss": 2.7098, + "step": 439600 + }, + { + "epoch": 0.11468362391907164, + "grad_norm": 6.776541233062744, + "learning_rate": 9.771438765032653e-06, + "loss": 2.6655, + "step": 439800 + }, + { + "epoch": 0.11473577654477381, + "grad_norm": 7.783594608306885, + "learning_rate": 9.771188847743167e-06, + "loss": 2.7002, + "step": 440000 + }, + { + "epoch": 0.11478792917047598, + "grad_norm": 7.315393924713135, + "learning_rate": 9.77093879709345e-06, + "loss": 2.6525, + "step": 440200 + }, + { + "epoch": 0.11484008179617815, + "grad_norm": 8.225870132446289, + "learning_rate": 9.770688613090492e-06, + "loss": 2.6535, + "step": 440400 + }, + { + "epoch": 0.11489223442188032, + "grad_norm": 7.509810447692871, + "learning_rate": 9.770438295741289e-06, + "loss": 2.7409, + "step": 440600 + }, + { + "epoch": 0.11494438704758249, + "grad_norm": 7.6700592041015625, + "learning_rate": 9.770187845052833e-06, + "loss": 2.6997, + "step": 440800 + }, + { + "epoch": 0.11499653967328466, + "grad_norm": 6.8598527908325195, + "learning_rate": 9.769937261032126e-06, + "loss": 2.6779, + "step": 441000 + }, + { + "epoch": 0.11504869229898683, + "grad_norm": 8.12748908996582, + "learning_rate": 9.769686543686174e-06, + "loss": 2.717, + "step": 441200 + }, + { + "epoch": 0.115100844924689, + "grad_norm": 8.549765586853027, + "learning_rate": 9.769435693021982e-06, + "loss": 2.663, + "step": 441400 + }, + { + "epoch": 0.11515299755039117, + "grad_norm": 6.6760783195495605, + "learning_rate": 9.769184709046565e-06, + "loss": 2.6786, + "step": 441600 + }, + { + "epoch": 0.11520515017609334, + "grad_norm": 7.719881057739258, + "learning_rate": 9.768933591766935e-06, + "loss": 2.6848, + "step": 441800 + }, + { + "epoch": 0.11525730280179551, + "grad_norm": 7.913510799407959, + "learning_rate": 9.768682341190114e-06, + "loss": 2.7145, + "step": 442000 + }, + { + "epoch": 0.11530945542749768, + "grad_norm": 7.543449401855469, + "learning_rate": 9.768430957323122e-06, + "loss": 2.6923, + "step": 442200 + }, + { + "epoch": 0.11536160805319985, + "grad_norm": 8.190464973449707, + "learning_rate": 9.768179440172985e-06, + "loss": 2.6409, + "step": 442400 + }, + { + "epoch": 0.11541376067890202, + "grad_norm": 8.82253360748291, + "learning_rate": 9.767927789746736e-06, + "loss": 2.6654, + "step": 442600 + }, + { + "epoch": 0.11546591330460419, + "grad_norm": 8.999987602233887, + "learning_rate": 9.767676006051409e-06, + "loss": 2.7038, + "step": 442800 + }, + { + "epoch": 0.11551806593030636, + "grad_norm": 8.303157806396484, + "learning_rate": 9.767424089094039e-06, + "loss": 2.6908, + "step": 443000 + }, + { + "epoch": 0.11557021855600853, + "grad_norm": 6.951584339141846, + "learning_rate": 9.767172038881669e-06, + "loss": 2.623, + "step": 443200 + }, + { + "epoch": 0.1156223711817107, + "grad_norm": 8.04922866821289, + "learning_rate": 9.766919855421343e-06, + "loss": 2.6307, + "step": 443400 + }, + { + "epoch": 0.11567452380741287, + "grad_norm": 7.755749225616455, + "learning_rate": 9.76666753872011e-06, + "loss": 2.662, + "step": 443600 + }, + { + "epoch": 0.11572667643311504, + "grad_norm": 9.410798072814941, + "learning_rate": 9.766415088785026e-06, + "loss": 2.7075, + "step": 443800 + }, + { + "epoch": 0.11577882905881721, + "grad_norm": 9.511571884155273, + 
"learning_rate": 9.76616250562314e-06, + "loss": 2.6914, + "step": 444000 + }, + { + "epoch": 0.11583098168451939, + "grad_norm": 8.075037956237793, + "learning_rate": 9.765909789241521e-06, + "loss": 2.6813, + "step": 444200 + }, + { + "epoch": 0.11588313431022156, + "grad_norm": 7.264787197113037, + "learning_rate": 9.765656939647225e-06, + "loss": 2.6665, + "step": 444400 + }, + { + "epoch": 0.11593528693592373, + "grad_norm": 8.67489242553711, + "learning_rate": 9.765403956847323e-06, + "loss": 2.7131, + "step": 444600 + }, + { + "epoch": 0.1159874395616259, + "grad_norm": 7.648482799530029, + "learning_rate": 9.765150840848888e-06, + "loss": 2.7141, + "step": 444800 + }, + { + "epoch": 0.11603959218732805, + "grad_norm": 7.76444149017334, + "learning_rate": 9.764897591658989e-06, + "loss": 2.6504, + "step": 445000 + }, + { + "epoch": 0.11609174481303022, + "grad_norm": 7.081927299499512, + "learning_rate": 9.764644209284711e-06, + "loss": 2.7151, + "step": 445200 + }, + { + "epoch": 0.1161438974387324, + "grad_norm": 8.81956958770752, + "learning_rate": 9.764390693733132e-06, + "loss": 2.6997, + "step": 445400 + }, + { + "epoch": 0.11619605006443456, + "grad_norm": 7.683972358703613, + "learning_rate": 9.764137045011339e-06, + "loss": 2.7042, + "step": 445600 + }, + { + "epoch": 0.11624820269013673, + "grad_norm": 8.693424224853516, + "learning_rate": 9.763883263126423e-06, + "loss": 2.7078, + "step": 445800 + }, + { + "epoch": 0.1163003553158389, + "grad_norm": 8.919655799865723, + "learning_rate": 9.763629348085478e-06, + "loss": 2.6838, + "step": 446000 + }, + { + "epoch": 0.11635250794154108, + "grad_norm": 9.183130264282227, + "learning_rate": 9.763375299895598e-06, + "loss": 2.7268, + "step": 446200 + }, + { + "epoch": 0.11640466056724325, + "grad_norm": 8.304872512817383, + "learning_rate": 9.763121118563888e-06, + "loss": 2.6786, + "step": 446400 + }, + { + "epoch": 0.11645681319294542, + "grad_norm": 8.370039939880371, + "learning_rate": 9.76286680409745e-06, + "loss": 2.6958, + "step": 446600 + }, + { + "epoch": 0.11650896581864759, + "grad_norm": 9.099044799804688, + "learning_rate": 9.762612356503394e-06, + "loss": 2.6626, + "step": 446800 + }, + { + "epoch": 0.11656111844434976, + "grad_norm": 7.61244535446167, + "learning_rate": 9.762357775788829e-06, + "loss": 2.6916, + "step": 447000 + }, + { + "epoch": 0.11661327107005193, + "grad_norm": 7.555819034576416, + "learning_rate": 9.762103061960874e-06, + "loss": 2.6282, + "step": 447200 + }, + { + "epoch": 0.1166654236957541, + "grad_norm": 8.290244102478027, + "learning_rate": 9.761848215026647e-06, + "loss": 2.6645, + "step": 447400 + }, + { + "epoch": 0.11671757632145627, + "grad_norm": 7.066608905792236, + "learning_rate": 9.761593234993273e-06, + "loss": 2.6977, + "step": 447600 + }, + { + "epoch": 0.11676972894715844, + "grad_norm": 8.447099685668945, + "learning_rate": 9.761338121867876e-06, + "loss": 2.7276, + "step": 447800 + }, + { + "epoch": 0.11682188157286061, + "grad_norm": 8.586709976196289, + "learning_rate": 9.76108287565759e-06, + "loss": 2.7122, + "step": 448000 + }, + { + "epoch": 0.11687403419856278, + "grad_norm": 8.613016128540039, + "learning_rate": 9.760827496369547e-06, + "loss": 2.6843, + "step": 448200 + }, + { + "epoch": 0.11692618682426495, + "grad_norm": 7.8855719566345215, + "learning_rate": 9.760571984010887e-06, + "loss": 2.6395, + "step": 448400 + }, + { + "epoch": 0.11697833944996712, + "grad_norm": 9.135221481323242, + "learning_rate": 9.76031633858875e-06, + "loss": 2.667, + "step": 
448600 + }, + { + "epoch": 0.11703049207566929, + "grad_norm": 8.805586814880371, + "learning_rate": 9.760060560110283e-06, + "loss": 2.6881, + "step": 448800 + }, + { + "epoch": 0.11708264470137146, + "grad_norm": 8.323440551757812, + "learning_rate": 9.759804648582633e-06, + "loss": 2.656, + "step": 449000 + }, + { + "epoch": 0.11713479732707363, + "grad_norm": 7.569894790649414, + "learning_rate": 9.759548604012959e-06, + "loss": 2.6998, + "step": 449200 + }, + { + "epoch": 0.1171869499527758, + "grad_norm": 8.15855598449707, + "learning_rate": 9.759292426408408e-06, + "loss": 2.6691, + "step": 449400 + }, + { + "epoch": 0.11723910257847797, + "grad_norm": 7.520715713500977, + "learning_rate": 9.75903611577615e-06, + "loss": 2.6674, + "step": 449600 + }, + { + "epoch": 0.11729125520418014, + "grad_norm": 8.77229118347168, + "learning_rate": 9.758779672123344e-06, + "loss": 2.6771, + "step": 449800 + }, + { + "epoch": 0.11734340782988231, + "grad_norm": 8.861888885498047, + "learning_rate": 9.75852309545716e-06, + "loss": 2.7158, + "step": 450000 + }, + { + "epoch": 0.11739556045558448, + "grad_norm": 7.774789810180664, + "learning_rate": 9.758266385784767e-06, + "loss": 2.6555, + "step": 450200 + }, + { + "epoch": 0.11744771308128665, + "grad_norm": 7.58319091796875, + "learning_rate": 9.758009543113343e-06, + "loss": 2.6737, + "step": 450400 + }, + { + "epoch": 0.11749986570698882, + "grad_norm": 8.081302642822266, + "learning_rate": 9.757752567450065e-06, + "loss": 2.6793, + "step": 450600 + }, + { + "epoch": 0.11755201833269098, + "grad_norm": 7.114253044128418, + "learning_rate": 9.757495458802119e-06, + "loss": 2.694, + "step": 450800 + }, + { + "epoch": 0.11760417095839315, + "grad_norm": 7.961178779602051, + "learning_rate": 9.757238217176687e-06, + "loss": 2.6656, + "step": 451000 + }, + { + "epoch": 0.11765632358409532, + "grad_norm": 7.241010665893555, + "learning_rate": 9.756980842580963e-06, + "loss": 2.6901, + "step": 451200 + }, + { + "epoch": 0.11770847620979749, + "grad_norm": 9.523578643798828, + "learning_rate": 9.75672333502214e-06, + "loss": 2.6705, + "step": 451400 + }, + { + "epoch": 0.11776062883549966, + "grad_norm": 7.9366230964660645, + "learning_rate": 9.756465694507413e-06, + "loss": 2.6603, + "step": 451600 + }, + { + "epoch": 0.11781278146120183, + "grad_norm": 8.917617797851562, + "learning_rate": 9.756207921043988e-06, + "loss": 2.7168, + "step": 451800 + }, + { + "epoch": 0.117864934086904, + "grad_norm": 8.094945907592773, + "learning_rate": 9.755950014639065e-06, + "loss": 2.6525, + "step": 452000 + }, + { + "epoch": 0.11791708671260617, + "grad_norm": 8.251530647277832, + "learning_rate": 9.755691975299857e-06, + "loss": 2.6882, + "step": 452200 + }, + { + "epoch": 0.11796923933830834, + "grad_norm": 7.913506031036377, + "learning_rate": 9.755433803033574e-06, + "loss": 2.6975, + "step": 452400 + }, + { + "epoch": 0.11802139196401051, + "grad_norm": 8.891002655029297, + "learning_rate": 9.755175497847434e-06, + "loss": 2.6518, + "step": 452600 + }, + { + "epoch": 0.11807354458971268, + "grad_norm": 9.051965713500977, + "learning_rate": 9.754917059748654e-06, + "loss": 2.715, + "step": 452800 + }, + { + "epoch": 0.11812569721541485, + "grad_norm": 8.167122840881348, + "learning_rate": 9.75465848874446e-06, + "loss": 2.7022, + "step": 453000 + }, + { + "epoch": 0.11817784984111702, + "grad_norm": 7.507691860198975, + "learning_rate": 9.754399784842079e-06, + "loss": 2.6796, + "step": 453200 + }, + { + "epoch": 0.1182300024668192, + "grad_norm": 
7.737783908843994, + "learning_rate": 9.754140948048743e-06, + "loss": 2.6774, + "step": 453400 + }, + { + "epoch": 0.11828215509252137, + "grad_norm": 8.166272163391113, + "learning_rate": 9.753881978371685e-06, + "loss": 2.7361, + "step": 453600 + }, + { + "epoch": 0.11833430771822354, + "grad_norm": 7.494532585144043, + "learning_rate": 9.753622875818145e-06, + "loss": 2.6566, + "step": 453800 + }, + { + "epoch": 0.1183864603439257, + "grad_norm": 8.62021255493164, + "learning_rate": 9.753363640395364e-06, + "loss": 2.6363, + "step": 454000 + }, + { + "epoch": 0.11843861296962788, + "grad_norm": 8.136500358581543, + "learning_rate": 9.753104272110587e-06, + "loss": 2.7016, + "step": 454200 + }, + { + "epoch": 0.11849076559533005, + "grad_norm": 6.9378662109375, + "learning_rate": 9.752844770971067e-06, + "loss": 2.6903, + "step": 454400 + }, + { + "epoch": 0.11854291822103222, + "grad_norm": 9.27441692352295, + "learning_rate": 9.752585136984055e-06, + "loss": 2.6897, + "step": 454600 + }, + { + "epoch": 0.11859507084673439, + "grad_norm": 8.395896911621094, + "learning_rate": 9.752325370156808e-06, + "loss": 2.6787, + "step": 454800 + }, + { + "epoch": 0.11864722347243656, + "grad_norm": 6.82293701171875, + "learning_rate": 9.752065470496587e-06, + "loss": 2.6816, + "step": 455000 + }, + { + "epoch": 0.11869937609813873, + "grad_norm": 9.240446090698242, + "learning_rate": 9.751805438010658e-06, + "loss": 2.7092, + "step": 455200 + }, + { + "epoch": 0.1187515287238409, + "grad_norm": 7.735306262969971, + "learning_rate": 9.751545272706287e-06, + "loss": 2.6778, + "step": 455400 + }, + { + "epoch": 0.11880368134954307, + "grad_norm": 7.703882694244385, + "learning_rate": 9.751284974590746e-06, + "loss": 2.6471, + "step": 455600 + }, + { + "epoch": 0.11885583397524524, + "grad_norm": 7.228545665740967, + "learning_rate": 9.751024543671315e-06, + "loss": 2.617, + "step": 455800 + }, + { + "epoch": 0.11890798660094741, + "grad_norm": 8.354397773742676, + "learning_rate": 9.750763979955267e-06, + "loss": 2.6919, + "step": 456000 + }, + { + "epoch": 0.11896013922664958, + "grad_norm": 7.73373556137085, + "learning_rate": 9.750503283449888e-06, + "loss": 2.7015, + "step": 456200 + }, + { + "epoch": 0.11901229185235175, + "grad_norm": 8.244440078735352, + "learning_rate": 9.750242454162465e-06, + "loss": 2.6825, + "step": 456400 + }, + { + "epoch": 0.11906444447805391, + "grad_norm": 8.330254554748535, + "learning_rate": 9.749981492100289e-06, + "loss": 2.6659, + "step": 456600 + }, + { + "epoch": 0.11911659710375608, + "grad_norm": 8.01742935180664, + "learning_rate": 9.749720397270652e-06, + "loss": 2.6779, + "step": 456800 + }, + { + "epoch": 0.11916874972945825, + "grad_norm": 8.478322982788086, + "learning_rate": 9.749459169680855e-06, + "loss": 2.6841, + "step": 457000 + }, + { + "epoch": 0.11922090235516042, + "grad_norm": 7.7415080070495605, + "learning_rate": 9.749197809338195e-06, + "loss": 2.6466, + "step": 457200 + }, + { + "epoch": 0.11927305498086259, + "grad_norm": 8.464523315429688, + "learning_rate": 9.748936316249982e-06, + "loss": 2.6419, + "step": 457400 + }, + { + "epoch": 0.11932520760656476, + "grad_norm": 8.651251792907715, + "learning_rate": 9.748674690423524e-06, + "loss": 2.684, + "step": 457600 + }, + { + "epoch": 0.11937736023226693, + "grad_norm": 8.235745429992676, + "learning_rate": 9.748412931866132e-06, + "loss": 2.6694, + "step": 457800 + }, + { + "epoch": 0.1194295128579691, + "grad_norm": 7.739285469055176, + "learning_rate": 9.748151040585123e-06, + 
"loss": 2.6602, + "step": 458000 + }, + { + "epoch": 0.11948166548367127, + "grad_norm": 9.078447341918945, + "learning_rate": 9.747889016587819e-06, + "loss": 2.6301, + "step": 458200 + }, + { + "epoch": 0.11953381810937344, + "grad_norm": 7.439968109130859, + "learning_rate": 9.747626859881542e-06, + "loss": 2.6789, + "step": 458400 + }, + { + "epoch": 0.11958597073507561, + "grad_norm": 8.582900047302246, + "learning_rate": 9.74736457047362e-06, + "loss": 2.6473, + "step": 458600 + }, + { + "epoch": 0.11963812336077778, + "grad_norm": 8.283160209655762, + "learning_rate": 9.747102148371384e-06, + "loss": 2.6383, + "step": 458800 + }, + { + "epoch": 0.11969027598647995, + "grad_norm": 9.031500816345215, + "learning_rate": 9.74683959358217e-06, + "loss": 2.6908, + "step": 459000 + }, + { + "epoch": 0.11974242861218212, + "grad_norm": 7.941998481750488, + "learning_rate": 9.746576906113314e-06, + "loss": 2.6518, + "step": 459200 + }, + { + "epoch": 0.11979458123788429, + "grad_norm": 9.694351196289062, + "learning_rate": 9.746314085972162e-06, + "loss": 2.6758, + "step": 459400 + }, + { + "epoch": 0.11984673386358646, + "grad_norm": 7.908174991607666, + "learning_rate": 9.746051133166059e-06, + "loss": 2.6652, + "step": 459600 + }, + { + "epoch": 0.11989888648928863, + "grad_norm": 7.092363357543945, + "learning_rate": 9.745788047702354e-06, + "loss": 2.6604, + "step": 459800 + }, + { + "epoch": 0.1199510391149908, + "grad_norm": 7.674038887023926, + "learning_rate": 9.7455248295884e-06, + "loss": 2.6792, + "step": 460000 + }, + { + "epoch": 0.12000319174069297, + "grad_norm": 8.855584144592285, + "learning_rate": 9.745261478831555e-06, + "loss": 2.6561, + "step": 460200 + }, + { + "epoch": 0.12005534436639514, + "grad_norm": 7.2290120124816895, + "learning_rate": 9.744997995439184e-06, + "loss": 2.6789, + "step": 460400 + }, + { + "epoch": 0.12010749699209732, + "grad_norm": 8.269558906555176, + "learning_rate": 9.744734379418644e-06, + "loss": 2.6772, + "step": 460600 + }, + { + "epoch": 0.12015964961779949, + "grad_norm": 9.019466400146484, + "learning_rate": 9.744470630777307e-06, + "loss": 2.685, + "step": 460800 + }, + { + "epoch": 0.12021180224350166, + "grad_norm": 8.867697715759277, + "learning_rate": 9.744206749522547e-06, + "loss": 2.6937, + "step": 461000 + }, + { + "epoch": 0.12026395486920383, + "grad_norm": 8.083369255065918, + "learning_rate": 9.743942735661738e-06, + "loss": 2.6659, + "step": 461200 + }, + { + "epoch": 0.120316107494906, + "grad_norm": 8.023956298828125, + "learning_rate": 9.743678589202258e-06, + "loss": 2.6797, + "step": 461400 + }, + { + "epoch": 0.12036826012060817, + "grad_norm": 8.839423179626465, + "learning_rate": 9.743414310151494e-06, + "loss": 2.7316, + "step": 461600 + }, + { + "epoch": 0.12042041274631034, + "grad_norm": 8.470783233642578, + "learning_rate": 9.74314989851683e-06, + "loss": 2.6571, + "step": 461800 + }, + { + "epoch": 0.12047256537201251, + "grad_norm": 8.4376802444458, + "learning_rate": 9.742885354305657e-06, + "loss": 2.649, + "step": 462000 + }, + { + "epoch": 0.12052471799771468, + "grad_norm": 7.8062424659729, + "learning_rate": 9.74262067752537e-06, + "loss": 2.6703, + "step": 462200 + }, + { + "epoch": 0.12057687062341685, + "grad_norm": 8.25600814819336, + "learning_rate": 9.742355868183365e-06, + "loss": 2.7073, + "step": 462400 + }, + { + "epoch": 0.120629023249119, + "grad_norm": 8.327945709228516, + "learning_rate": 9.742090926287047e-06, + "loss": 2.6969, + "step": 462600 + }, + { + "epoch": 0.12068117587482118, + 
"grad_norm": 8.068275451660156, + "learning_rate": 9.741825851843819e-06, + "loss": 2.7028, + "step": 462800 + }, + { + "epoch": 0.12073332850052335, + "grad_norm": 10.301764488220215, + "learning_rate": 9.741560644861091e-06, + "loss": 2.6903, + "step": 463000 + }, + { + "epoch": 0.12078548112622552, + "grad_norm": 7.792321681976318, + "learning_rate": 9.741295305346276e-06, + "loss": 2.6443, + "step": 463200 + }, + { + "epoch": 0.12083763375192769, + "grad_norm": 8.240435600280762, + "learning_rate": 9.74102983330679e-06, + "loss": 2.6793, + "step": 463400 + }, + { + "epoch": 0.12088978637762986, + "grad_norm": 8.414860725402832, + "learning_rate": 9.740764228750053e-06, + "loss": 2.6749, + "step": 463600 + }, + { + "epoch": 0.12094193900333203, + "grad_norm": 8.185742378234863, + "learning_rate": 9.74049849168349e-06, + "loss": 2.6609, + "step": 463800 + }, + { + "epoch": 0.1209940916290342, + "grad_norm": 8.727797508239746, + "learning_rate": 9.74023262211453e-06, + "loss": 2.6738, + "step": 464000 + }, + { + "epoch": 0.12104624425473637, + "grad_norm": 8.264991760253906, + "learning_rate": 9.739966620050601e-06, + "loss": 2.6553, + "step": 464200 + }, + { + "epoch": 0.12109839688043854, + "grad_norm": 8.572891235351562, + "learning_rate": 9.739700485499139e-06, + "loss": 2.689, + "step": 464400 + }, + { + "epoch": 0.12115054950614071, + "grad_norm": 8.757530212402344, + "learning_rate": 9.739434218467583e-06, + "loss": 2.6752, + "step": 464600 + }, + { + "epoch": 0.12120270213184288, + "grad_norm": 8.559809684753418, + "learning_rate": 9.739167818963378e-06, + "loss": 2.6809, + "step": 464800 + }, + { + "epoch": 0.12125485475754505, + "grad_norm": 8.356423377990723, + "learning_rate": 9.738901286993965e-06, + "loss": 2.6699, + "step": 465000 + }, + { + "epoch": 0.12130700738324722, + "grad_norm": 7.829847812652588, + "learning_rate": 9.738634622566799e-06, + "loss": 2.6739, + "step": 465200 + }, + { + "epoch": 0.12135916000894939, + "grad_norm": 8.362701416015625, + "learning_rate": 9.73836782568933e-06, + "loss": 2.6874, + "step": 465400 + }, + { + "epoch": 0.12141131263465156, + "grad_norm": 9.722343444824219, + "learning_rate": 9.738100896369019e-06, + "loss": 2.6827, + "step": 465600 + }, + { + "epoch": 0.12146346526035373, + "grad_norm": 8.5645112991333, + "learning_rate": 9.737833834613322e-06, + "loss": 2.6557, + "step": 465800 + }, + { + "epoch": 0.1215156178860559, + "grad_norm": 8.088398933410645, + "learning_rate": 9.737566640429707e-06, + "loss": 2.6685, + "step": 466000 + }, + { + "epoch": 0.12156777051175807, + "grad_norm": 8.453975677490234, + "learning_rate": 9.737299313825644e-06, + "loss": 2.6338, + "step": 466200 + }, + { + "epoch": 0.12161992313746024, + "grad_norm": 7.445858478546143, + "learning_rate": 9.737031854808601e-06, + "loss": 2.6752, + "step": 466400 + }, + { + "epoch": 0.12167207576316241, + "grad_norm": 9.255156517028809, + "learning_rate": 9.736764263386057e-06, + "loss": 2.7129, + "step": 466600 + }, + { + "epoch": 0.12172422838886458, + "grad_norm": 9.095346450805664, + "learning_rate": 9.736496539565488e-06, + "loss": 2.6596, + "step": 466800 + }, + { + "epoch": 0.12177638101456675, + "grad_norm": 8.963985443115234, + "learning_rate": 9.73622868335438e-06, + "loss": 2.6857, + "step": 467000 + }, + { + "epoch": 0.12182853364026892, + "grad_norm": 9.29616641998291, + "learning_rate": 9.73596069476022e-06, + "loss": 2.6297, + "step": 467200 + }, + { + "epoch": 0.1218806862659711, + "grad_norm": 7.7633795738220215, + "learning_rate": 
9.735692573790498e-06, + "loss": 2.6467, + "step": 467400 + }, + { + "epoch": 0.12193283889167326, + "grad_norm": 9.138328552246094, + "learning_rate": 9.735424320452707e-06, + "loss": 2.7386, + "step": 467600 + }, + { + "epoch": 0.12198499151737544, + "grad_norm": 7.824506759643555, + "learning_rate": 9.735155934754347e-06, + "loss": 2.6447, + "step": 467800 + }, + { + "epoch": 0.1220371441430776, + "grad_norm": 9.13517951965332, + "learning_rate": 9.734887416702919e-06, + "loss": 2.6749, + "step": 468000 + }, + { + "epoch": 0.12208929676877978, + "grad_norm": 8.389497756958008, + "learning_rate": 9.734618766305928e-06, + "loss": 2.6867, + "step": 468200 + }, + { + "epoch": 0.12214144939448193, + "grad_norm": 8.484880447387695, + "learning_rate": 9.734349983570882e-06, + "loss": 2.6819, + "step": 468400 + }, + { + "epoch": 0.1221936020201841, + "grad_norm": 8.580875396728516, + "learning_rate": 9.734081068505296e-06, + "loss": 2.6606, + "step": 468600 + }, + { + "epoch": 0.12224575464588627, + "grad_norm": 8.64186954498291, + "learning_rate": 9.733812021116687e-06, + "loss": 2.711, + "step": 468800 + }, + { + "epoch": 0.12229790727158844, + "grad_norm": 8.544197082519531, + "learning_rate": 9.733542841412571e-06, + "loss": 2.6684, + "step": 469000 + }, + { + "epoch": 0.12235005989729061, + "grad_norm": 7.277874946594238, + "learning_rate": 9.733273529400476e-06, + "loss": 2.6726, + "step": 469200 + }, + { + "epoch": 0.12240221252299278, + "grad_norm": 7.846140384674072, + "learning_rate": 9.733004085087926e-06, + "loss": 2.6603, + "step": 469400 + }, + { + "epoch": 0.12245436514869495, + "grad_norm": 7.558496475219727, + "learning_rate": 9.732734508482458e-06, + "loss": 2.6833, + "step": 469600 + }, + { + "epoch": 0.12250651777439713, + "grad_norm": 8.439860343933105, + "learning_rate": 9.732464799591603e-06, + "loss": 2.6507, + "step": 469800 + }, + { + "epoch": 0.1225586704000993, + "grad_norm": 9.658734321594238, + "learning_rate": 9.732194958422898e-06, + "loss": 2.7056, + "step": 470000 + }, + { + "epoch": 0.12261082302580147, + "grad_norm": 7.978820323944092, + "learning_rate": 9.731924984983889e-06, + "loss": 2.704, + "step": 470200 + }, + { + "epoch": 0.12266297565150364, + "grad_norm": 7.804432392120361, + "learning_rate": 9.73165487928212e-06, + "loss": 2.6903, + "step": 470400 + }, + { + "epoch": 0.1227151282772058, + "grad_norm": 8.81722640991211, + "learning_rate": 9.73138464132514e-06, + "loss": 2.7032, + "step": 470600 + }, + { + "epoch": 0.12276728090290798, + "grad_norm": 9.00986385345459, + "learning_rate": 9.731114271120505e-06, + "loss": 2.6696, + "step": 470800 + }, + { + "epoch": 0.12281943352861015, + "grad_norm": 8.249679565429688, + "learning_rate": 9.730843768675772e-06, + "loss": 2.6984, + "step": 471000 + }, + { + "epoch": 0.12287158615431232, + "grad_norm": 8.557232856750488, + "learning_rate": 9.7305731339985e-06, + "loss": 2.6371, + "step": 471200 + }, + { + "epoch": 0.12292373878001449, + "grad_norm": 8.401301383972168, + "learning_rate": 9.730302367096254e-06, + "loss": 2.6919, + "step": 471400 + }, + { + "epoch": 0.12297589140571666, + "grad_norm": 7.651167392730713, + "learning_rate": 9.730031467976601e-06, + "loss": 2.7055, + "step": 471600 + }, + { + "epoch": 0.12302804403141883, + "grad_norm": 8.762410163879395, + "learning_rate": 9.729760436647118e-06, + "loss": 2.6395, + "step": 471800 + }, + { + "epoch": 0.123080196657121, + "grad_norm": 8.571608543395996, + "learning_rate": 9.729489273115376e-06, + "loss": 2.651, + "step": 472000 + }, + { + 
"epoch": 0.12313234928282317, + "grad_norm": 8.2172269821167, + "learning_rate": 9.729217977388954e-06, + "loss": 2.6385, + "step": 472200 + }, + { + "epoch": 0.12318450190852534, + "grad_norm": 9.237062454223633, + "learning_rate": 9.728946549475438e-06, + "loss": 2.718, + "step": 472400 + }, + { + "epoch": 0.12323665453422751, + "grad_norm": 8.741727828979492, + "learning_rate": 9.728674989382412e-06, + "loss": 2.6775, + "step": 472600 + }, + { + "epoch": 0.12328880715992968, + "grad_norm": 7.985013008117676, + "learning_rate": 9.728403297117469e-06, + "loss": 2.6703, + "step": 472800 + }, + { + "epoch": 0.12334095978563185, + "grad_norm": 8.41588020324707, + "learning_rate": 9.7281314726882e-06, + "loss": 2.6602, + "step": 473000 + }, + { + "epoch": 0.12339311241133402, + "grad_norm": 10.485689163208008, + "learning_rate": 9.727859516102208e-06, + "loss": 2.6371, + "step": 473200 + }, + { + "epoch": 0.12344526503703619, + "grad_norm": 8.508200645446777, + "learning_rate": 9.727587427367088e-06, + "loss": 2.7213, + "step": 473400 + }, + { + "epoch": 0.12349741766273836, + "grad_norm": 7.656301498413086, + "learning_rate": 9.72731520649045e-06, + "loss": 2.6225, + "step": 473600 + }, + { + "epoch": 0.12354957028844053, + "grad_norm": 7.055719375610352, + "learning_rate": 9.7270428534799e-06, + "loss": 2.6489, + "step": 473800 + }, + { + "epoch": 0.1236017229141427, + "grad_norm": 7.498048782348633, + "learning_rate": 9.72677036834305e-06, + "loss": 2.679, + "step": 474000 + }, + { + "epoch": 0.12365387553984487, + "grad_norm": 8.528254508972168, + "learning_rate": 9.726497751087522e-06, + "loss": 2.6907, + "step": 474200 + }, + { + "epoch": 0.12370602816554703, + "grad_norm": 8.69704818725586, + "learning_rate": 9.72622500172093e-06, + "loss": 2.666, + "step": 474400 + }, + { + "epoch": 0.1237581807912492, + "grad_norm": 8.989028930664062, + "learning_rate": 9.7259521202509e-06, + "loss": 2.6891, + "step": 474600 + }, + { + "epoch": 0.12381033341695137, + "grad_norm": 8.570834159851074, + "learning_rate": 9.725679106685058e-06, + "loss": 2.6665, + "step": 474800 + }, + { + "epoch": 0.12386248604265354, + "grad_norm": 8.544633865356445, + "learning_rate": 9.725405961031038e-06, + "loss": 2.6161, + "step": 475000 + }, + { + "epoch": 0.12391463866835571, + "grad_norm": 8.439756393432617, + "learning_rate": 9.725132683296471e-06, + "loss": 2.6956, + "step": 475200 + }, + { + "epoch": 0.12396679129405788, + "grad_norm": 8.925586700439453, + "learning_rate": 9.724859273488999e-06, + "loss": 2.6371, + "step": 475400 + }, + { + "epoch": 0.12401894391976005, + "grad_norm": 8.36025333404541, + "learning_rate": 9.72458573161626e-06, + "loss": 2.6451, + "step": 475600 + }, + { + "epoch": 0.12407109654546222, + "grad_norm": 8.255962371826172, + "learning_rate": 9.724312057685902e-06, + "loss": 2.6251, + "step": 475800 + }, + { + "epoch": 0.1241232491711644, + "grad_norm": 9.104443550109863, + "learning_rate": 9.724038251705577e-06, + "loss": 2.6458, + "step": 476000 + }, + { + "epoch": 0.12417540179686656, + "grad_norm": 8.984846115112305, + "learning_rate": 9.723764313682936e-06, + "loss": 2.6515, + "step": 476200 + }, + { + "epoch": 0.12422755442256873, + "grad_norm": 8.620625495910645, + "learning_rate": 9.723490243625634e-06, + "loss": 2.6752, + "step": 476400 + }, + { + "epoch": 0.1242797070482709, + "grad_norm": 8.312873840332031, + "learning_rate": 9.723216041541334e-06, + "loss": 2.6998, + "step": 476600 + }, + { + "epoch": 0.12433185967397307, + "grad_norm": 8.567025184631348, + 
"learning_rate": 9.7229417074377e-06, + "loss": 2.6366, + "step": 476800 + }, + { + "epoch": 0.12438401229967525, + "grad_norm": 9.091941833496094, + "learning_rate": 9.7226672413224e-06, + "loss": 2.6761, + "step": 477000 + }, + { + "epoch": 0.12443616492537742, + "grad_norm": 8.620004653930664, + "learning_rate": 9.722392643203103e-06, + "loss": 2.6256, + "step": 477200 + }, + { + "epoch": 0.12448831755107959, + "grad_norm": 8.094307899475098, + "learning_rate": 9.722117913087488e-06, + "loss": 2.6586, + "step": 477400 + }, + { + "epoch": 0.12454047017678176, + "grad_norm": 9.369007110595703, + "learning_rate": 9.72184305098323e-06, + "loss": 2.6223, + "step": 477600 + }, + { + "epoch": 0.12459262280248393, + "grad_norm": 9.586237907409668, + "learning_rate": 9.721568056898018e-06, + "loss": 2.6983, + "step": 477800 + }, + { + "epoch": 0.1246447754281861, + "grad_norm": 7.8790106773376465, + "learning_rate": 9.721292930839532e-06, + "loss": 2.6527, + "step": 478000 + }, + { + "epoch": 0.12469692805388827, + "grad_norm": 9.248851776123047, + "learning_rate": 9.721017672815467e-06, + "loss": 2.6589, + "step": 478200 + }, + { + "epoch": 0.12474908067959044, + "grad_norm": 9.875216484069824, + "learning_rate": 9.720742282833513e-06, + "loss": 2.6659, + "step": 478400 + }, + { + "epoch": 0.12480123330529261, + "grad_norm": 9.6345796585083, + "learning_rate": 9.720466760901368e-06, + "loss": 2.676, + "step": 478600 + }, + { + "epoch": 0.12485338593099478, + "grad_norm": 8.288617134094238, + "learning_rate": 9.720191107026735e-06, + "loss": 2.661, + "step": 478800 + }, + { + "epoch": 0.12490553855669695, + "grad_norm": 9.437127113342285, + "learning_rate": 9.719915321217319e-06, + "loss": 2.6597, + "step": 479000 + }, + { + "epoch": 0.12495769118239912, + "grad_norm": 8.890555381774902, + "learning_rate": 9.719639403480827e-06, + "loss": 2.6872, + "step": 479200 + }, + { + "epoch": 0.1250098438081013, + "grad_norm": 8.575922966003418, + "learning_rate": 9.71936335382497e-06, + "loss": 2.6705, + "step": 479400 + }, + { + "epoch": 0.12506199643380345, + "grad_norm": 8.43736457824707, + "learning_rate": 9.719087172257468e-06, + "loss": 2.6723, + "step": 479600 + }, + { + "epoch": 0.12511414905950563, + "grad_norm": 7.534491539001465, + "learning_rate": 9.71881085878604e-06, + "loss": 2.6203, + "step": 479800 + }, + { + "epoch": 0.1251663016852078, + "grad_norm": 9.604147911071777, + "learning_rate": 9.718534413418404e-06, + "loss": 2.6615, + "step": 480000 + }, + { + "epoch": 0.12521845431090997, + "grad_norm": 8.846263885498047, + "learning_rate": 9.71825783616229e-06, + "loss": 2.6699, + "step": 480200 + }, + { + "epoch": 0.12527060693661213, + "grad_norm": 8.537163734436035, + "learning_rate": 9.71798112702543e-06, + "loss": 2.6123, + "step": 480400 + }, + { + "epoch": 0.1253227595623143, + "grad_norm": 9.015103340148926, + "learning_rate": 9.71770428601556e-06, + "loss": 2.6473, + "step": 480600 + }, + { + "epoch": 0.12537491218801647, + "grad_norm": 8.286890029907227, + "learning_rate": 9.717427313140412e-06, + "loss": 2.6601, + "step": 480800 + }, + { + "epoch": 0.12542706481371865, + "grad_norm": 8.386801719665527, + "learning_rate": 9.717150208407733e-06, + "loss": 2.6411, + "step": 481000 + }, + { + "epoch": 0.1254792174394208, + "grad_norm": 9.405022621154785, + "learning_rate": 9.716872971825265e-06, + "loss": 2.6861, + "step": 481200 + }, + { + "epoch": 0.125531370065123, + "grad_norm": 9.213386535644531, + "learning_rate": 9.71659560340076e-06, + "loss": 2.6431, + "step": 481400 + }, 
+ { + "epoch": 0.12558352269082515, + "grad_norm": 9.34145736694336, + "learning_rate": 9.71631810314197e-06, + "loss": 2.6649, + "step": 481600 + }, + { + "epoch": 0.12563567531652733, + "grad_norm": 9.355376243591309, + "learning_rate": 9.71604047105665e-06, + "loss": 2.6769, + "step": 481800 + }, + { + "epoch": 0.1256878279422295, + "grad_norm": 9.561544418334961, + "learning_rate": 9.715762707152561e-06, + "loss": 2.6069, + "step": 482000 + }, + { + "epoch": 0.12573998056793168, + "grad_norm": 9.205405235290527, + "learning_rate": 9.715484811437468e-06, + "loss": 2.6307, + "step": 482200 + }, + { + "epoch": 0.12579213319363383, + "grad_norm": 9.112932205200195, + "learning_rate": 9.715206783919136e-06, + "loss": 2.6701, + "step": 482400 + }, + { + "epoch": 0.12584428581933602, + "grad_norm": 8.555209159851074, + "learning_rate": 9.714928624605337e-06, + "loss": 2.6885, + "step": 482600 + }, + { + "epoch": 0.12589643844503817, + "grad_norm": 8.725874900817871, + "learning_rate": 9.714650333503848e-06, + "loss": 2.663, + "step": 482800 + }, + { + "epoch": 0.12594859107074036, + "grad_norm": 8.564589500427246, + "learning_rate": 9.714371910622445e-06, + "loss": 2.6496, + "step": 483000 + }, + { + "epoch": 0.1260007436964425, + "grad_norm": 9.859867095947266, + "learning_rate": 9.714093355968913e-06, + "loss": 2.6103, + "step": 483200 + }, + { + "epoch": 0.12605289632214467, + "grad_norm": 8.561869621276855, + "learning_rate": 9.713814669551034e-06, + "loss": 2.6476, + "step": 483400 + }, + { + "epoch": 0.12610504894784685, + "grad_norm": 9.05595874786377, + "learning_rate": 9.7135358513766e-06, + "loss": 2.6854, + "step": 483600 + }, + { + "epoch": 0.126157201573549, + "grad_norm": 8.561729431152344, + "learning_rate": 9.713256901453405e-06, + "loss": 2.6784, + "step": 483800 + }, + { + "epoch": 0.1262093541992512, + "grad_norm": 9.193467140197754, + "learning_rate": 9.712977819789244e-06, + "loss": 2.6623, + "step": 484000 + }, + { + "epoch": 0.12626150682495335, + "grad_norm": 8.660527229309082, + "learning_rate": 9.71269860639192e-06, + "loss": 2.682, + "step": 484200 + }, + { + "epoch": 0.12631365945065554, + "grad_norm": 8.973142623901367, + "learning_rate": 9.712419261269234e-06, + "loss": 2.6837, + "step": 484400 + }, + { + "epoch": 0.1263658120763577, + "grad_norm": 9.378042221069336, + "learning_rate": 9.712139784428998e-06, + "loss": 2.6662, + "step": 484600 + }, + { + "epoch": 0.12641796470205988, + "grad_norm": 7.6433424949646, + "learning_rate": 9.71186017587902e-06, + "loss": 2.6865, + "step": 484800 + }, + { + "epoch": 0.12647011732776203, + "grad_norm": 8.966972351074219, + "learning_rate": 9.711580435627118e-06, + "loss": 2.671, + "step": 485000 + }, + { + "epoch": 0.12652226995346422, + "grad_norm": 9.110841751098633, + "learning_rate": 9.71130056368111e-06, + "loss": 2.6932, + "step": 485200 + }, + { + "epoch": 0.12657442257916637, + "grad_norm": 8.476582527160645, + "learning_rate": 9.711020560048819e-06, + "loss": 2.6634, + "step": 485400 + }, + { + "epoch": 0.12662657520486856, + "grad_norm": 8.898265838623047, + "learning_rate": 9.710740424738072e-06, + "loss": 2.6732, + "step": 485600 + }, + { + "epoch": 0.12667872783057071, + "grad_norm": 9.571182250976562, + "learning_rate": 9.710460157756698e-06, + "loss": 2.6777, + "step": 485800 + }, + { + "epoch": 0.1267308804562729, + "grad_norm": 9.099579811096191, + "learning_rate": 9.710179759112531e-06, + "loss": 2.6837, + "step": 486000 + }, + { + "epoch": 0.12678303308197506, + "grad_norm": 8.9638671875, + 
"learning_rate": 9.709899228813408e-06, + "loss": 2.6928, + "step": 486200 + }, + { + "epoch": 0.12683518570767724, + "grad_norm": 9.293763160705566, + "learning_rate": 9.709618566867173e-06, + "loss": 2.6583, + "step": 486400 + }, + { + "epoch": 0.1268873383333794, + "grad_norm": 7.878268241882324, + "learning_rate": 9.709337773281666e-06, + "loss": 2.6466, + "step": 486600 + }, + { + "epoch": 0.12693949095908158, + "grad_norm": 8.64033317565918, + "learning_rate": 9.70905684806474e-06, + "loss": 2.6509, + "step": 486800 + }, + { + "epoch": 0.12699164358478374, + "grad_norm": 9.131351470947266, + "learning_rate": 9.708775791224246e-06, + "loss": 2.6382, + "step": 487000 + }, + { + "epoch": 0.12704379621048592, + "grad_norm": 8.987448692321777, + "learning_rate": 9.708494602768038e-06, + "loss": 2.6429, + "step": 487200 + }, + { + "epoch": 0.12709594883618808, + "grad_norm": 9.501585960388184, + "learning_rate": 9.708213282703975e-06, + "loss": 2.6805, + "step": 487400 + }, + { + "epoch": 0.12714810146189026, + "grad_norm": 9.123064994812012, + "learning_rate": 9.707931831039926e-06, + "loss": 2.6948, + "step": 487600 + }, + { + "epoch": 0.12720025408759242, + "grad_norm": 9.532994270324707, + "learning_rate": 9.70765024778375e-06, + "loss": 2.6968, + "step": 487800 + }, + { + "epoch": 0.1272524067132946, + "grad_norm": 9.438848495483398, + "learning_rate": 9.707368532943324e-06, + "loss": 2.669, + "step": 488000 + }, + { + "epoch": 0.12730455933899676, + "grad_norm": 9.121577262878418, + "learning_rate": 9.707086686526518e-06, + "loss": 2.6328, + "step": 488200 + }, + { + "epoch": 0.12735671196469894, + "grad_norm": 9.110394477844238, + "learning_rate": 9.706804708541211e-06, + "loss": 2.6712, + "step": 488400 + }, + { + "epoch": 0.1274088645904011, + "grad_norm": 7.500752925872803, + "learning_rate": 9.706522598995288e-06, + "loss": 2.6645, + "step": 488600 + }, + { + "epoch": 0.12746101721610328, + "grad_norm": 9.652029037475586, + "learning_rate": 9.706240357896631e-06, + "loss": 2.6274, + "step": 488800 + }, + { + "epoch": 0.12751316984180544, + "grad_norm": 9.1790189743042, + "learning_rate": 9.705957985253128e-06, + "loss": 2.6722, + "step": 489000 + }, + { + "epoch": 0.1275653224675076, + "grad_norm": 10.447507858276367, + "learning_rate": 9.705675481072673e-06, + "loss": 2.6685, + "step": 489200 + }, + { + "epoch": 0.12761747509320978, + "grad_norm": 9.254826545715332, + "learning_rate": 9.705392845363164e-06, + "loss": 2.6453, + "step": 489400 + }, + { + "epoch": 0.12766962771891194, + "grad_norm": 8.488991737365723, + "learning_rate": 9.705110078132498e-06, + "loss": 2.6604, + "step": 489600 + }, + { + "epoch": 0.12772178034461412, + "grad_norm": 8.976917266845703, + "learning_rate": 9.704827179388581e-06, + "loss": 2.6436, + "step": 489800 + }, + { + "epoch": 0.12777393297031628, + "grad_norm": 9.096659660339355, + "learning_rate": 9.704544149139319e-06, + "loss": 2.6794, + "step": 490000 + }, + { + "epoch": 0.12782608559601846, + "grad_norm": 9.129677772521973, + "learning_rate": 9.704260987392624e-06, + "loss": 2.6928, + "step": 490200 + }, + { + "epoch": 0.12787823822172062, + "grad_norm": 8.751590728759766, + "learning_rate": 9.70397769415641e-06, + "loss": 2.6381, + "step": 490400 + }, + { + "epoch": 0.1279303908474228, + "grad_norm": 9.134084701538086, + "learning_rate": 9.703694269438596e-06, + "loss": 2.6475, + "step": 490600 + }, + { + "epoch": 0.12798254347312496, + "grad_norm": 9.106485366821289, + "learning_rate": 9.703410713247102e-06, + "loss": 2.6657, + "step": 
490800 + }, + { + "epoch": 0.12803469609882714, + "grad_norm": 8.137367248535156, + "learning_rate": 9.703127025589856e-06, + "loss": 2.6392, + "step": 491000 + }, + { + "epoch": 0.1280868487245293, + "grad_norm": 8.145878791809082, + "learning_rate": 9.702843206474788e-06, + "loss": 2.6477, + "step": 491200 + }, + { + "epoch": 0.12813900135023149, + "grad_norm": 8.465982437133789, + "learning_rate": 9.70255925590983e-06, + "loss": 2.6539, + "step": 491400 + }, + { + "epoch": 0.12819115397593364, + "grad_norm": 8.704390525817871, + "learning_rate": 9.702275173902915e-06, + "loss": 2.656, + "step": 491600 + }, + { + "epoch": 0.12824330660163583, + "grad_norm": 9.234391212463379, + "learning_rate": 9.70199096046199e-06, + "loss": 2.6496, + "step": 491800 + }, + { + "epoch": 0.12829545922733798, + "grad_norm": 8.896957397460938, + "learning_rate": 9.701706615594996e-06, + "loss": 2.5983, + "step": 492000 + }, + { + "epoch": 0.12834761185304017, + "grad_norm": 8.498276710510254, + "learning_rate": 9.70142213930988e-06, + "loss": 2.6514, + "step": 492200 + }, + { + "epoch": 0.12839976447874232, + "grad_norm": 8.999076843261719, + "learning_rate": 9.701137531614595e-06, + "loss": 2.7108, + "step": 492400 + }, + { + "epoch": 0.1284519171044445, + "grad_norm": 8.710136413574219, + "learning_rate": 9.700852792517095e-06, + "loss": 2.6373, + "step": 492600 + }, + { + "epoch": 0.12850406973014666, + "grad_norm": 8.627429008483887, + "learning_rate": 9.70056792202534e-06, + "loss": 2.6688, + "step": 492800 + }, + { + "epoch": 0.12855622235584885, + "grad_norm": 8.266898155212402, + "learning_rate": 9.700282920147288e-06, + "loss": 2.6819, + "step": 493000 + }, + { + "epoch": 0.128608374981551, + "grad_norm": 8.928200721740723, + "learning_rate": 9.699997786890911e-06, + "loss": 2.6685, + "step": 493200 + }, + { + "epoch": 0.1286605276072532, + "grad_norm": 9.263498306274414, + "learning_rate": 9.699712522264179e-06, + "loss": 2.6548, + "step": 493400 + }, + { + "epoch": 0.12871268023295535, + "grad_norm": 8.80411434173584, + "learning_rate": 9.69942712627506e-06, + "loss": 2.6194, + "step": 493600 + }, + { + "epoch": 0.12876483285865753, + "grad_norm": 9.677428245544434, + "learning_rate": 9.699141598931536e-06, + "loss": 2.6491, + "step": 493800 + }, + { + "epoch": 0.1288169854843597, + "grad_norm": 8.916584968566895, + "learning_rate": 9.698855940241584e-06, + "loss": 2.6393, + "step": 494000 + }, + { + "epoch": 0.12886913811006187, + "grad_norm": 8.620122909545898, + "learning_rate": 9.698570150213192e-06, + "loss": 2.7015, + "step": 494200 + }, + { + "epoch": 0.12892129073576403, + "grad_norm": 9.354145050048828, + "learning_rate": 9.698284228854346e-06, + "loss": 2.6735, + "step": 494400 + }, + { + "epoch": 0.1289734433614662, + "grad_norm": 9.221833229064941, + "learning_rate": 9.69799817617304e-06, + "loss": 2.639, + "step": 494600 + }, + { + "epoch": 0.12902559598716837, + "grad_norm": 8.699933052062988, + "learning_rate": 9.697711992177266e-06, + "loss": 2.6933, + "step": 494800 + }, + { + "epoch": 0.12907774861287052, + "grad_norm": 8.266427993774414, + "learning_rate": 9.697425676875026e-06, + "loss": 2.6265, + "step": 495000 + }, + { + "epoch": 0.1291299012385727, + "grad_norm": 9.16121768951416, + "learning_rate": 9.697139230274322e-06, + "loss": 2.6737, + "step": 495200 + }, + { + "epoch": 0.12918205386427487, + "grad_norm": 9.327274322509766, + "learning_rate": 9.69685265238316e-06, + "loss": 2.6599, + "step": 495400 + }, + { + "epoch": 0.12923420648997705, + "grad_norm": 
8.239920616149902, + "learning_rate": 9.696565943209551e-06, + "loss": 2.6745, + "step": 495600 + }, + { + "epoch": 0.1292863591156792, + "grad_norm": 10.874256134033203, + "learning_rate": 9.696279102761508e-06, + "loss": 2.639, + "step": 495800 + }, + { + "epoch": 0.1293385117413814, + "grad_norm": 8.510387420654297, + "learning_rate": 9.69599213104705e-06, + "loss": 2.6401, + "step": 496000 + }, + { + "epoch": 0.12939066436708355, + "grad_norm": 9.092486381530762, + "learning_rate": 9.695705028074197e-06, + "loss": 2.6394, + "step": 496200 + }, + { + "epoch": 0.12944281699278573, + "grad_norm": 9.543620109558105, + "learning_rate": 9.695417793850974e-06, + "loss": 2.6777, + "step": 496400 + }, + { + "epoch": 0.1294949696184879, + "grad_norm": 8.863883018493652, + "learning_rate": 9.69513042838541e-06, + "loss": 2.6614, + "step": 496600 + }, + { + "epoch": 0.12954712224419007, + "grad_norm": 8.839035034179688, + "learning_rate": 9.694842931685536e-06, + "loss": 2.6575, + "step": 496800 + }, + { + "epoch": 0.12959927486989223, + "grad_norm": 9.193402290344238, + "learning_rate": 9.69455530375939e-06, + "loss": 2.676, + "step": 497000 + }, + { + "epoch": 0.1296514274955944, + "grad_norm": 8.699831008911133, + "learning_rate": 9.69426754461501e-06, + "loss": 2.654, + "step": 497200 + }, + { + "epoch": 0.12970358012129657, + "grad_norm": 9.69882869720459, + "learning_rate": 9.693979654260439e-06, + "loss": 2.6663, + "step": 497400 + }, + { + "epoch": 0.12975573274699875, + "grad_norm": 10.164560317993164, + "learning_rate": 9.693691632703724e-06, + "loss": 2.671, + "step": 497600 + }, + { + "epoch": 0.1298078853727009, + "grad_norm": 9.261726379394531, + "learning_rate": 9.693403479952914e-06, + "loss": 2.6322, + "step": 497800 + }, + { + "epoch": 0.1298600379984031, + "grad_norm": 8.697185516357422, + "learning_rate": 9.693115196016068e-06, + "loss": 2.6649, + "step": 498000 + }, + { + "epoch": 0.12991219062410525, + "grad_norm": 9.236394882202148, + "learning_rate": 9.69282678090124e-06, + "loss": 2.6507, + "step": 498200 + }, + { + "epoch": 0.12996434324980743, + "grad_norm": 9.209188461303711, + "learning_rate": 9.692538234616491e-06, + "loss": 2.6858, + "step": 498400 + }, + { + "epoch": 0.1300164958755096, + "grad_norm": 7.591978073120117, + "learning_rate": 9.69224955716989e-06, + "loss": 2.6934, + "step": 498600 + }, + { + "epoch": 0.13006864850121178, + "grad_norm": 9.55863094329834, + "learning_rate": 9.691960748569503e-06, + "loss": 2.6473, + "step": 498800 + }, + { + "epoch": 0.13012080112691393, + "grad_norm": 9.636316299438477, + "learning_rate": 9.691671808823403e-06, + "loss": 2.7021, + "step": 499000 + }, + { + "epoch": 0.13017295375261612, + "grad_norm": 9.680166244506836, + "learning_rate": 9.691382737939665e-06, + "loss": 2.6483, + "step": 499200 + }, + { + "epoch": 0.13022510637831827, + "grad_norm": 8.427440643310547, + "learning_rate": 9.69109353592637e-06, + "loss": 2.6132, + "step": 499400 + }, + { + "epoch": 0.13027725900402046, + "grad_norm": 9.360069274902344, + "learning_rate": 9.690804202791603e-06, + "loss": 2.6739, + "step": 499600 + }, + { + "epoch": 0.1303294116297226, + "grad_norm": 8.880375862121582, + "learning_rate": 9.69051473854345e-06, + "loss": 2.6774, + "step": 499800 + }, + { + "epoch": 0.1303815642554248, + "grad_norm": 8.440618515014648, + "learning_rate": 9.69022514319e-06, + "loss": 2.6508, + "step": 500000 + }, + { + "epoch": 0.13043371688112695, + "grad_norm": 9.217138290405273, + "learning_rate": 9.68993541673935e-06, + "loss": 2.6629, + 
"step": 500200 + }, + { + "epoch": 0.13048586950682914, + "grad_norm": 9.152853012084961, + "learning_rate": 9.6896455591996e-06, + "loss": 2.6395, + "step": 500400 + }, + { + "epoch": 0.1305380221325313, + "grad_norm": 9.89096736907959, + "learning_rate": 9.689355570578845e-06, + "loss": 2.6171, + "step": 500600 + }, + { + "epoch": 0.13059017475823345, + "grad_norm": 8.032015800476074, + "learning_rate": 9.689065450885198e-06, + "loss": 2.6471, + "step": 500800 + }, + { + "epoch": 0.13064232738393564, + "grad_norm": 8.966907501220703, + "learning_rate": 9.688775200126763e-06, + "loss": 2.6885, + "step": 501000 + }, + { + "epoch": 0.1306944800096378, + "grad_norm": 9.194870948791504, + "learning_rate": 9.688484818311654e-06, + "loss": 2.6357, + "step": 501200 + }, + { + "epoch": 0.13074663263533998, + "grad_norm": 10.05204963684082, + "learning_rate": 9.68819430544799e-06, + "loss": 2.6813, + "step": 501400 + }, + { + "epoch": 0.13079878526104213, + "grad_norm": 9.8402738571167, + "learning_rate": 9.68790366154389e-06, + "loss": 2.6826, + "step": 501600 + }, + { + "epoch": 0.13085093788674432, + "grad_norm": 8.931715965270996, + "learning_rate": 9.687612886607477e-06, + "loss": 2.6638, + "step": 501800 + }, + { + "epoch": 0.13090309051244647, + "grad_norm": 9.182944297790527, + "learning_rate": 9.687321980646877e-06, + "loss": 2.6094, + "step": 502000 + }, + { + "epoch": 0.13095524313814866, + "grad_norm": 8.474428176879883, + "learning_rate": 9.687030943670224e-06, + "loss": 2.6104, + "step": 502200 + }, + { + "epoch": 0.13100739576385081, + "grad_norm": 9.128947257995605, + "learning_rate": 9.686739775685653e-06, + "loss": 2.6591, + "step": 502400 + }, + { + "epoch": 0.131059548389553, + "grad_norm": 8.498746871948242, + "learning_rate": 9.6864484767013e-06, + "loss": 2.6751, + "step": 502600 + }, + { + "epoch": 0.13111170101525516, + "grad_norm": 9.038972854614258, + "learning_rate": 9.686157046725308e-06, + "loss": 2.6486, + "step": 502800 + }, + { + "epoch": 0.13116385364095734, + "grad_norm": 7.748460292816162, + "learning_rate": 9.685865485765824e-06, + "loss": 2.6387, + "step": 503000 + }, + { + "epoch": 0.1312160062666595, + "grad_norm": 9.406355857849121, + "learning_rate": 9.685573793830996e-06, + "loss": 2.66, + "step": 503200 + }, + { + "epoch": 0.13126815889236168, + "grad_norm": 10.351799011230469, + "learning_rate": 9.68528197092898e-06, + "loss": 2.6431, + "step": 503400 + }, + { + "epoch": 0.13132031151806384, + "grad_norm": 10.22693920135498, + "learning_rate": 9.684990017067928e-06, + "loss": 2.6585, + "step": 503600 + }, + { + "epoch": 0.13137246414376602, + "grad_norm": 9.691548347473145, + "learning_rate": 9.684697932256004e-06, + "loss": 2.6583, + "step": 503800 + }, + { + "epoch": 0.13142461676946818, + "grad_norm": 9.202975273132324, + "learning_rate": 9.68440571650137e-06, + "loss": 2.6798, + "step": 504000 + }, + { + "epoch": 0.13147676939517036, + "grad_norm": 10.292789459228516, + "learning_rate": 9.684113369812198e-06, + "loss": 2.6643, + "step": 504200 + }, + { + "epoch": 0.13152892202087252, + "grad_norm": 7.787657737731934, + "learning_rate": 9.683820892196654e-06, + "loss": 2.631, + "step": 504400 + }, + { + "epoch": 0.1315810746465747, + "grad_norm": 8.93878173828125, + "learning_rate": 9.683528283662916e-06, + "loss": 2.6179, + "step": 504600 + }, + { + "epoch": 0.13163322727227686, + "grad_norm": 7.960376739501953, + "learning_rate": 9.683235544219161e-06, + "loss": 2.6363, + "step": 504800 + }, + { + "epoch": 0.13168537989797904, + "grad_norm": 
8.953857421875, + "learning_rate": 9.682942673873572e-06, + "loss": 2.642, + "step": 505000 + }, + { + "epoch": 0.1317375325236812, + "grad_norm": 8.703145027160645, + "learning_rate": 9.682649672634337e-06, + "loss": 2.639, + "step": 505200 + }, + { + "epoch": 0.13178968514938338, + "grad_norm": 9.463846206665039, + "learning_rate": 9.682356540509645e-06, + "loss": 2.6506, + "step": 505400 + }, + { + "epoch": 0.13184183777508554, + "grad_norm": 9.317432403564453, + "learning_rate": 9.68206327750769e-06, + "loss": 2.6546, + "step": 505600 + }, + { + "epoch": 0.13189399040078773, + "grad_norm": 8.97231674194336, + "learning_rate": 9.681769883636664e-06, + "loss": 2.6196, + "step": 505800 + }, + { + "epoch": 0.13194614302648988, + "grad_norm": 9.235013961791992, + "learning_rate": 9.681476358904773e-06, + "loss": 2.6777, + "step": 506000 + }, + { + "epoch": 0.13199829565219207, + "grad_norm": 9.837597846984863, + "learning_rate": 9.68118270332022e-06, + "loss": 2.6295, + "step": 506200 + }, + { + "epoch": 0.13205044827789422, + "grad_norm": 9.842741966247559, + "learning_rate": 9.680888916891213e-06, + "loss": 2.6498, + "step": 506400 + }, + { + "epoch": 0.1321026009035964, + "grad_norm": 9.616024017333984, + "learning_rate": 9.680594999625964e-06, + "loss": 2.6751, + "step": 506600 + }, + { + "epoch": 0.13215475352929856, + "grad_norm": 8.86656665802002, + "learning_rate": 9.680300951532686e-06, + "loss": 2.6171, + "step": 506800 + }, + { + "epoch": 0.13220690615500072, + "grad_norm": 8.192331314086914, + "learning_rate": 9.6800067726196e-06, + "loss": 2.6294, + "step": 507000 + }, + { + "epoch": 0.1322590587807029, + "grad_norm": 8.469772338867188, + "learning_rate": 9.679712462894931e-06, + "loss": 2.6976, + "step": 507200 + }, + { + "epoch": 0.13231121140640506, + "grad_norm": 7.870084762573242, + "learning_rate": 9.679418022366903e-06, + "loss": 2.6391, + "step": 507400 + }, + { + "epoch": 0.13236336403210724, + "grad_norm": 9.129673957824707, + "learning_rate": 9.679123451043743e-06, + "loss": 2.6854, + "step": 507600 + }, + { + "epoch": 0.1324155166578094, + "grad_norm": 9.1187744140625, + "learning_rate": 9.678828748933689e-06, + "loss": 2.6234, + "step": 507800 + }, + { + "epoch": 0.13246766928351159, + "grad_norm": 9.26366901397705, + "learning_rate": 9.678533916044974e-06, + "loss": 2.64, + "step": 508000 + }, + { + "epoch": 0.13251982190921374, + "grad_norm": 8.143036842346191, + "learning_rate": 9.678238952385844e-06, + "loss": 2.6068, + "step": 508200 + }, + { + "epoch": 0.13257197453491593, + "grad_norm": 9.583711624145508, + "learning_rate": 9.67794385796454e-06, + "loss": 2.6514, + "step": 508400 + }, + { + "epoch": 0.13262412716061808, + "grad_norm": 8.947481155395508, + "learning_rate": 9.67764863278931e-06, + "loss": 2.6757, + "step": 508600 + }, + { + "epoch": 0.13267627978632027, + "grad_norm": 9.801794052124023, + "learning_rate": 9.67735327686841e-06, + "loss": 2.6304, + "step": 508800 + }, + { + "epoch": 0.13272843241202242, + "grad_norm": 8.388129234313965, + "learning_rate": 9.677057790210093e-06, + "loss": 2.6435, + "step": 509000 + }, + { + "epoch": 0.1327805850377246, + "grad_norm": 9.845732688903809, + "learning_rate": 9.676762172822615e-06, + "loss": 2.6488, + "step": 509200 + }, + { + "epoch": 0.13283273766342676, + "grad_norm": 8.10527515411377, + "learning_rate": 9.676466424714244e-06, + "loss": 2.6525, + "step": 509400 + }, + { + "epoch": 0.13288489028912895, + "grad_norm": 9.184629440307617, + "learning_rate": 9.676170545893245e-06, + "loss": 2.62, + 
"step": 509600 + }, + { + "epoch": 0.1329370429148311, + "grad_norm": 9.896688461303711, + "learning_rate": 9.675874536367883e-06, + "loss": 2.6222, + "step": 509800 + }, + { + "epoch": 0.1329891955405333, + "grad_norm": 9.492822647094727, + "learning_rate": 9.675578396146441e-06, + "loss": 2.6601, + "step": 510000 + }, + { + "epoch": 0.13304134816623545, + "grad_norm": 8.227307319641113, + "learning_rate": 9.67528212523719e-06, + "loss": 2.6555, + "step": 510200 + }, + { + "epoch": 0.13309350079193763, + "grad_norm": 8.036877632141113, + "learning_rate": 9.674985723648413e-06, + "loss": 2.648, + "step": 510400 + }, + { + "epoch": 0.1331456534176398, + "grad_norm": 10.020105361938477, + "learning_rate": 9.674689191388393e-06, + "loss": 2.6916, + "step": 510600 + }, + { + "epoch": 0.13319780604334197, + "grad_norm": 9.248919486999512, + "learning_rate": 9.674392528465421e-06, + "loss": 2.621, + "step": 510800 + }, + { + "epoch": 0.13324995866904413, + "grad_norm": 10.11646842956543, + "learning_rate": 9.67409573488779e-06, + "loss": 2.6674, + "step": 511000 + }, + { + "epoch": 0.1333021112947463, + "grad_norm": 8.71399974822998, + "learning_rate": 9.67379881066379e-06, + "loss": 2.6399, + "step": 511200 + }, + { + "epoch": 0.13335426392044847, + "grad_norm": 9.745195388793945, + "learning_rate": 9.673501755801726e-06, + "loss": 2.6376, + "step": 511400 + }, + { + "epoch": 0.13340641654615065, + "grad_norm": 9.938071250915527, + "learning_rate": 9.673204570309899e-06, + "loss": 2.6562, + "step": 511600 + }, + { + "epoch": 0.1334585691718528, + "grad_norm": 9.505668640136719, + "learning_rate": 9.672907254196618e-06, + "loss": 2.6752, + "step": 511800 + }, + { + "epoch": 0.133510721797555, + "grad_norm": 9.43238353729248, + "learning_rate": 9.672609807470187e-06, + "loss": 2.6658, + "step": 512000 + }, + { + "epoch": 0.13356287442325715, + "grad_norm": 9.957179069519043, + "learning_rate": 9.672312230138926e-06, + "loss": 2.6154, + "step": 512200 + }, + { + "epoch": 0.13361502704895933, + "grad_norm": 10.3068265914917, + "learning_rate": 9.672014522211153e-06, + "loss": 2.6855, + "step": 512400 + }, + { + "epoch": 0.1336671796746615, + "grad_norm": 10.027313232421875, + "learning_rate": 9.671716683695184e-06, + "loss": 2.6288, + "step": 512600 + }, + { + "epoch": 0.13371933230036365, + "grad_norm": 8.177754402160645, + "learning_rate": 9.67141871459935e-06, + "loss": 2.6548, + "step": 512800 + }, + { + "epoch": 0.13377148492606583, + "grad_norm": 7.8090500831604, + "learning_rate": 9.671120614931975e-06, + "loss": 2.6628, + "step": 513000 + }, + { + "epoch": 0.133823637551768, + "grad_norm": 9.168340682983398, + "learning_rate": 9.670822384701393e-06, + "loss": 2.6837, + "step": 513200 + }, + { + "epoch": 0.13387579017747017, + "grad_norm": 9.05853271484375, + "learning_rate": 9.670524023915939e-06, + "loss": 2.619, + "step": 513400 + }, + { + "epoch": 0.13392794280317233, + "grad_norm": 9.077415466308594, + "learning_rate": 9.670225532583954e-06, + "loss": 2.6538, + "step": 513600 + }, + { + "epoch": 0.1339800954288745, + "grad_norm": 10.212859153747559, + "learning_rate": 9.66992691071378e-06, + "loss": 2.6305, + "step": 513800 + }, + { + "epoch": 0.13403224805457667, + "grad_norm": 8.626009941101074, + "learning_rate": 9.669628158313764e-06, + "loss": 2.6326, + "step": 514000 + }, + { + "epoch": 0.13408440068027885, + "grad_norm": 9.030853271484375, + "learning_rate": 9.669329275392256e-06, + "loss": 2.6567, + "step": 514200 + }, + { + "epoch": 0.134136553305981, + "grad_norm": 
10.016151428222656, + "learning_rate": 9.669030261957613e-06, + "loss": 2.6399, + "step": 514400 + }, + { + "epoch": 0.1341887059316832, + "grad_norm": 9.281685829162598, + "learning_rate": 9.668731118018189e-06, + "loss": 2.6532, + "step": 514600 + }, + { + "epoch": 0.13424085855738535, + "grad_norm": 9.473543167114258, + "learning_rate": 9.668431843582347e-06, + "loss": 2.6171, + "step": 514800 + }, + { + "epoch": 0.13429301118308754, + "grad_norm": 11.197412490844727, + "learning_rate": 9.668132438658452e-06, + "loss": 2.6532, + "step": 515000 + }, + { + "epoch": 0.1343451638087897, + "grad_norm": 8.542428016662598, + "learning_rate": 9.667832903254873e-06, + "loss": 2.6227, + "step": 515200 + }, + { + "epoch": 0.13439731643449188, + "grad_norm": 8.935531616210938, + "learning_rate": 9.667533237379983e-06, + "loss": 2.663, + "step": 515400 + }, + { + "epoch": 0.13444946906019403, + "grad_norm": 8.45200252532959, + "learning_rate": 9.667233441042156e-06, + "loss": 2.6751, + "step": 515600 + }, + { + "epoch": 0.13450162168589622, + "grad_norm": 10.14538288116455, + "learning_rate": 9.666933514249773e-06, + "loss": 2.6057, + "step": 515800 + }, + { + "epoch": 0.13455377431159837, + "grad_norm": 9.2836332321167, + "learning_rate": 9.666633457011216e-06, + "loss": 2.6303, + "step": 516000 + }, + { + "epoch": 0.13460592693730056, + "grad_norm": 10.366707801818848, + "learning_rate": 9.666333269334876e-06, + "loss": 2.693, + "step": 516200 + }, + { + "epoch": 0.1346580795630027, + "grad_norm": 10.014763832092285, + "learning_rate": 9.666032951229138e-06, + "loss": 2.6208, + "step": 516400 + }, + { + "epoch": 0.1347102321887049, + "grad_norm": 8.867531776428223, + "learning_rate": 9.665732502702401e-06, + "loss": 2.6653, + "step": 516600 + }, + { + "epoch": 0.13476238481440705, + "grad_norm": 9.882331848144531, + "learning_rate": 9.665431923763059e-06, + "loss": 2.6914, + "step": 516800 + }, + { + "epoch": 0.13481453744010924, + "grad_norm": 9.857499122619629, + "learning_rate": 9.665131214419516e-06, + "loss": 2.6641, + "step": 517000 + }, + { + "epoch": 0.1348666900658114, + "grad_norm": 9.31207275390625, + "learning_rate": 9.664830374680176e-06, + "loss": 2.6546, + "step": 517200 + }, + { + "epoch": 0.13491884269151358, + "grad_norm": 10.368709564208984, + "learning_rate": 9.664529404553449e-06, + "loss": 2.6633, + "step": 517400 + }, + { + "epoch": 0.13497099531721574, + "grad_norm": 9.503002166748047, + "learning_rate": 9.664228304047746e-06, + "loss": 2.6746, + "step": 517600 + }, + { + "epoch": 0.13502314794291792, + "grad_norm": 8.640365600585938, + "learning_rate": 9.663927073171485e-06, + "loss": 2.6339, + "step": 517800 + }, + { + "epoch": 0.13507530056862008, + "grad_norm": 10.261897087097168, + "learning_rate": 9.663625711933083e-06, + "loss": 2.6787, + "step": 518000 + }, + { + "epoch": 0.13512745319432226, + "grad_norm": 10.067601203918457, + "learning_rate": 9.663324220340964e-06, + "loss": 2.6697, + "step": 518200 + }, + { + "epoch": 0.13517960582002442, + "grad_norm": 8.839470863342285, + "learning_rate": 9.66302259840356e-06, + "loss": 2.6629, + "step": 518400 + }, + { + "epoch": 0.13523175844572657, + "grad_norm": 9.251420974731445, + "learning_rate": 9.662720846129295e-06, + "loss": 2.6574, + "step": 518600 + }, + { + "epoch": 0.13528391107142876, + "grad_norm": 9.32863998413086, + "learning_rate": 9.662418963526605e-06, + "loss": 2.6484, + "step": 518800 + }, + { + "epoch": 0.13533606369713091, + "grad_norm": 9.35538387298584, + "learning_rate": 9.662116950603932e-06, + 
"loss": 2.6542, + "step": 519000 + }, + { + "epoch": 0.1353882163228331, + "grad_norm": 9.882723808288574, + "learning_rate": 9.661814807369713e-06, + "loss": 2.6853, + "step": 519200 + }, + { + "epoch": 0.13544036894853526, + "grad_norm": 9.342474937438965, + "learning_rate": 9.661512533832395e-06, + "loss": 2.6694, + "step": 519400 + }, + { + "epoch": 0.13549252157423744, + "grad_norm": 8.824580192565918, + "learning_rate": 9.661210130000425e-06, + "loss": 2.6505, + "step": 519600 + }, + { + "epoch": 0.1355446741999396, + "grad_norm": 9.28695011138916, + "learning_rate": 9.660907595882261e-06, + "loss": 2.6744, + "step": 519800 + }, + { + "epoch": 0.13559682682564178, + "grad_norm": 9.04250431060791, + "learning_rate": 9.660604931486353e-06, + "loss": 2.6515, + "step": 520000 + }, + { + "epoch": 0.13564897945134394, + "grad_norm": 10.339582443237305, + "learning_rate": 9.660302136821163e-06, + "loss": 2.6614, + "step": 520200 + }, + { + "epoch": 0.13570113207704612, + "grad_norm": 8.636717796325684, + "learning_rate": 9.659999211895156e-06, + "loss": 2.6249, + "step": 520400 + }, + { + "epoch": 0.13575328470274828, + "grad_norm": 8.852993965148926, + "learning_rate": 9.659696156716797e-06, + "loss": 2.6207, + "step": 520600 + }, + { + "epoch": 0.13580543732845046, + "grad_norm": 9.094849586486816, + "learning_rate": 9.659392971294558e-06, + "loss": 2.6388, + "step": 520800 + }, + { + "epoch": 0.13585758995415262, + "grad_norm": 8.45062255859375, + "learning_rate": 9.659089655636914e-06, + "loss": 2.6394, + "step": 521000 + }, + { + "epoch": 0.1359097425798548, + "grad_norm": 10.011489868164062, + "learning_rate": 9.65878620975234e-06, + "loss": 2.6376, + "step": 521200 + }, + { + "epoch": 0.13596189520555696, + "grad_norm": 9.696266174316406, + "learning_rate": 9.65848263364932e-06, + "loss": 2.6553, + "step": 521400 + }, + { + "epoch": 0.13601404783125914, + "grad_norm": 8.92795181274414, + "learning_rate": 9.65817892733634e-06, + "loss": 2.6496, + "step": 521600 + }, + { + "epoch": 0.1360662004569613, + "grad_norm": 9.415807723999023, + "learning_rate": 9.657875090821886e-06, + "loss": 2.5864, + "step": 521800 + }, + { + "epoch": 0.13611835308266348, + "grad_norm": 10.63681697845459, + "learning_rate": 9.657571124114455e-06, + "loss": 2.6961, + "step": 522000 + }, + { + "epoch": 0.13617050570836564, + "grad_norm": 9.53947639465332, + "learning_rate": 9.657267027222539e-06, + "loss": 2.6501, + "step": 522200 + }, + { + "epoch": 0.13622265833406783, + "grad_norm": 10.805290222167969, + "learning_rate": 9.656962800154641e-06, + "loss": 2.6548, + "step": 522400 + }, + { + "epoch": 0.13627481095976998, + "grad_norm": 7.7652764320373535, + "learning_rate": 9.656658442919261e-06, + "loss": 2.6244, + "step": 522600 + }, + { + "epoch": 0.13632696358547217, + "grad_norm": 9.094009399414062, + "learning_rate": 9.65635395552491e-06, + "loss": 2.6543, + "step": 522800 + }, + { + "epoch": 0.13637911621117432, + "grad_norm": 9.510210990905762, + "learning_rate": 9.656049337980096e-06, + "loss": 2.6013, + "step": 523000 + }, + { + "epoch": 0.1364312688368765, + "grad_norm": 9.347325325012207, + "learning_rate": 9.655744590293334e-06, + "loss": 2.617, + "step": 523200 + }, + { + "epoch": 0.13648342146257866, + "grad_norm": 10.486848831176758, + "learning_rate": 9.655439712473143e-06, + "loss": 2.6461, + "step": 523400 + }, + { + "epoch": 0.13653557408828085, + "grad_norm": 9.532522201538086, + "learning_rate": 9.655134704528044e-06, + "loss": 2.6238, + "step": 523600 + }, + { + "epoch": 
0.136587726713983, + "grad_norm": 9.079800605773926, + "learning_rate": 9.654829566466564e-06, + "loss": 2.6561, + "step": 523800 + }, + { + "epoch": 0.1366398793396852, + "grad_norm": 9.900603294372559, + "learning_rate": 9.654524298297228e-06, + "loss": 2.6736, + "step": 524000 + }, + { + "epoch": 0.13669203196538735, + "grad_norm": 10.008954048156738, + "learning_rate": 9.65421890002857e-06, + "loss": 2.5807, + "step": 524200 + }, + { + "epoch": 0.1367441845910895, + "grad_norm": 9.736871719360352, + "learning_rate": 9.653913371669131e-06, + "loss": 2.6297, + "step": 524400 + }, + { + "epoch": 0.13679633721679169, + "grad_norm": 9.787025451660156, + "learning_rate": 9.653607713227445e-06, + "loss": 2.6355, + "step": 524600 + }, + { + "epoch": 0.13684848984249384, + "grad_norm": 9.512006759643555, + "learning_rate": 9.65330192471206e-06, + "loss": 2.6131, + "step": 524800 + }, + { + "epoch": 0.13690064246819603, + "grad_norm": 9.525954246520996, + "learning_rate": 9.652996006131517e-06, + "loss": 2.6761, + "step": 525000 + }, + { + "epoch": 0.13695279509389818, + "grad_norm": 9.713210105895996, + "learning_rate": 9.652689957494371e-06, + "loss": 2.6427, + "step": 525200 + }, + { + "epoch": 0.13700494771960037, + "grad_norm": 7.939797878265381, + "learning_rate": 9.652383778809178e-06, + "loss": 2.6448, + "step": 525400 + }, + { + "epoch": 0.13705710034530252, + "grad_norm": 9.904065132141113, + "learning_rate": 9.652077470084492e-06, + "loss": 2.616, + "step": 525600 + }, + { + "epoch": 0.1371092529710047, + "grad_norm": 8.858099937438965, + "learning_rate": 9.651771031328878e-06, + "loss": 2.6253, + "step": 525800 + }, + { + "epoch": 0.13716140559670686, + "grad_norm": 9.863373756408691, + "learning_rate": 9.651464462550902e-06, + "loss": 2.6251, + "step": 526000 + }, + { + "epoch": 0.13721355822240905, + "grad_norm": 9.789137840270996, + "learning_rate": 9.651157763759126e-06, + "loss": 2.6234, + "step": 526200 + }, + { + "epoch": 0.1372657108481112, + "grad_norm": 10.034318923950195, + "learning_rate": 9.650850934962132e-06, + "loss": 2.6572, + "step": 526400 + }, + { + "epoch": 0.1373178634738134, + "grad_norm": 9.916723251342773, + "learning_rate": 9.65054397616849e-06, + "loss": 2.632, + "step": 526600 + }, + { + "epoch": 0.13737001609951555, + "grad_norm": 8.940628051757812, + "learning_rate": 9.65023688738678e-06, + "loss": 2.6402, + "step": 526800 + }, + { + "epoch": 0.13742216872521773, + "grad_norm": 9.220625877380371, + "learning_rate": 9.649929668625589e-06, + "loss": 2.6262, + "step": 527000 + }, + { + "epoch": 0.1374743213509199, + "grad_norm": 8.476348876953125, + "learning_rate": 9.649622319893502e-06, + "loss": 2.6734, + "step": 527200 + }, + { + "epoch": 0.13752647397662207, + "grad_norm": 8.9732084274292, + "learning_rate": 9.649314841199109e-06, + "loss": 2.6198, + "step": 527400 + }, + { + "epoch": 0.13757862660232423, + "grad_norm": 9.643847465515137, + "learning_rate": 9.649007232551006e-06, + "loss": 2.672, + "step": 527600 + }, + { + "epoch": 0.1376307792280264, + "grad_norm": 9.115861892700195, + "learning_rate": 9.648699493957792e-06, + "loss": 2.6423, + "step": 527800 + }, + { + "epoch": 0.13768293185372857, + "grad_norm": 10.297475814819336, + "learning_rate": 9.648391625428064e-06, + "loss": 2.6552, + "step": 528000 + }, + { + "epoch": 0.13773508447943075, + "grad_norm": 9.935859680175781, + "learning_rate": 9.648083626970432e-06, + "loss": 2.6381, + "step": 528200 + }, + { + "epoch": 0.1377872371051329, + "grad_norm": 10.028658866882324, + "learning_rate": 
9.647775498593502e-06, + "loss": 2.6879, + "step": 528400 + }, + { + "epoch": 0.1378393897308351, + "grad_norm": 10.135675430297852, + "learning_rate": 9.647467240305888e-06, + "loss": 2.6395, + "step": 528600 + }, + { + "epoch": 0.13789154235653725, + "grad_norm": 9.688112258911133, + "learning_rate": 9.647158852116207e-06, + "loss": 2.6373, + "step": 528800 + }, + { + "epoch": 0.13794369498223943, + "grad_norm": 9.650997161865234, + "learning_rate": 9.646850334033078e-06, + "loss": 2.623, + "step": 529000 + }, + { + "epoch": 0.1379958476079416, + "grad_norm": 8.232544898986816, + "learning_rate": 9.646541686065122e-06, + "loss": 2.6484, + "step": 529200 + }, + { + "epoch": 0.13804800023364378, + "grad_norm": 9.324143409729004, + "learning_rate": 9.64623290822097e-06, + "loss": 2.6221, + "step": 529400 + }, + { + "epoch": 0.13810015285934593, + "grad_norm": 9.4964599609375, + "learning_rate": 9.64592400050925e-06, + "loss": 2.6273, + "step": 529600 + }, + { + "epoch": 0.13815230548504812, + "grad_norm": 10.16766357421875, + "learning_rate": 9.6456149629386e-06, + "loss": 2.6316, + "step": 529800 + }, + { + "epoch": 0.13820445811075027, + "grad_norm": 9.12529468536377, + "learning_rate": 9.64530579551765e-06, + "loss": 2.6704, + "step": 530000 + }, + { + "epoch": 0.13825661073645246, + "grad_norm": 10.48526382446289, + "learning_rate": 9.64499649825505e-06, + "loss": 2.6643, + "step": 530200 + }, + { + "epoch": 0.1383087633621546, + "grad_norm": 8.55528736114502, + "learning_rate": 9.644687071159442e-06, + "loss": 2.6488, + "step": 530400 + }, + { + "epoch": 0.13836091598785677, + "grad_norm": 9.063948631286621, + "learning_rate": 9.644377514239473e-06, + "loss": 2.6316, + "step": 530600 + }, + { + "epoch": 0.13841306861355895, + "grad_norm": 8.189355850219727, + "learning_rate": 9.6440678275038e-06, + "loss": 2.6458, + "step": 530800 + }, + { + "epoch": 0.1384652212392611, + "grad_norm": 10.547919273376465, + "learning_rate": 9.643758010961075e-06, + "loss": 2.6643, + "step": 531000 + }, + { + "epoch": 0.1385173738649633, + "grad_norm": 8.875152587890625, + "learning_rate": 9.643448064619958e-06, + "loss": 2.6343, + "step": 531200 + }, + { + "epoch": 0.13856952649066545, + "grad_norm": 10.155411720275879, + "learning_rate": 9.643137988489115e-06, + "loss": 2.6402, + "step": 531400 + }, + { + "epoch": 0.13862167911636764, + "grad_norm": 9.877212524414062, + "learning_rate": 9.64282778257721e-06, + "loss": 2.6482, + "step": 531600 + }, + { + "epoch": 0.1386738317420698, + "grad_norm": 8.81261157989502, + "learning_rate": 9.642517446892919e-06, + "loss": 2.6458, + "step": 531800 + }, + { + "epoch": 0.13872598436777198, + "grad_norm": 9.765755653381348, + "learning_rate": 9.642206981444908e-06, + "loss": 2.6615, + "step": 532000 + }, + { + "epoch": 0.13877813699347413, + "grad_norm": 9.739371299743652, + "learning_rate": 9.641896386241861e-06, + "loss": 2.6525, + "step": 532200 + }, + { + "epoch": 0.13883028961917632, + "grad_norm": 10.315713882446289, + "learning_rate": 9.641585661292457e-06, + "loss": 2.659, + "step": 532400 + }, + { + "epoch": 0.13888244224487847, + "grad_norm": 9.760064125061035, + "learning_rate": 9.641274806605384e-06, + "loss": 2.6282, + "step": 532600 + }, + { + "epoch": 0.13893459487058066, + "grad_norm": 9.685067176818848, + "learning_rate": 9.640963822189327e-06, + "loss": 2.6185, + "step": 532800 + }, + { + "epoch": 0.13898674749628281, + "grad_norm": 10.004737854003906, + "learning_rate": 9.640652708052978e-06, + "loss": 2.6327, + "step": 533000 + }, + { + 
"epoch": 0.139038900121985, + "grad_norm": 8.516515731811523, + "learning_rate": 9.640341464205037e-06, + "loss": 2.6649, + "step": 533200 + }, + { + "epoch": 0.13909105274768715, + "grad_norm": 10.124285697937012, + "learning_rate": 9.640030090654202e-06, + "loss": 2.6205, + "step": 533400 + }, + { + "epoch": 0.13914320537338934, + "grad_norm": 10.336373329162598, + "learning_rate": 9.639718587409174e-06, + "loss": 2.6386, + "step": 533600 + }, + { + "epoch": 0.1391953579990915, + "grad_norm": 10.257096290588379, + "learning_rate": 9.639406954478663e-06, + "loss": 2.643, + "step": 533800 + }, + { + "epoch": 0.13924751062479368, + "grad_norm": 9.125519752502441, + "learning_rate": 9.639095191871379e-06, + "loss": 2.6576, + "step": 534000 + }, + { + "epoch": 0.13929966325049584, + "grad_norm": 8.960975646972656, + "learning_rate": 9.638783299596033e-06, + "loss": 2.6339, + "step": 534200 + }, + { + "epoch": 0.13935181587619802, + "grad_norm": 10.509865760803223, + "learning_rate": 9.638471277661347e-06, + "loss": 2.6326, + "step": 534400 + }, + { + "epoch": 0.13940396850190018, + "grad_norm": 9.875240325927734, + "learning_rate": 9.63815912607604e-06, + "loss": 2.6501, + "step": 534600 + }, + { + "epoch": 0.13945612112760236, + "grad_norm": 8.997678756713867, + "learning_rate": 9.637846844848838e-06, + "loss": 2.6376, + "step": 534800 + }, + { + "epoch": 0.13950827375330452, + "grad_norm": 9.591438293457031, + "learning_rate": 9.637534433988467e-06, + "loss": 2.6116, + "step": 535000 + }, + { + "epoch": 0.1395604263790067, + "grad_norm": 9.580354690551758, + "learning_rate": 9.637221893503662e-06, + "loss": 2.6207, + "step": 535200 + }, + { + "epoch": 0.13961257900470886, + "grad_norm": 8.961016654968262, + "learning_rate": 9.63690922340316e-06, + "loss": 2.6337, + "step": 535400 + }, + { + "epoch": 0.13966473163041104, + "grad_norm": 8.852940559387207, + "learning_rate": 9.636596423695697e-06, + "loss": 2.6134, + "step": 535600 + }, + { + "epoch": 0.1397168842561132, + "grad_norm": 9.002432823181152, + "learning_rate": 9.636283494390017e-06, + "loss": 2.6261, + "step": 535800 + }, + { + "epoch": 0.13976903688181538, + "grad_norm": 9.397136688232422, + "learning_rate": 9.635970435494868e-06, + "loss": 2.6458, + "step": 536000 + }, + { + "epoch": 0.13982118950751754, + "grad_norm": 9.2616548538208, + "learning_rate": 9.635657247019001e-06, + "loss": 2.6269, + "step": 536200 + }, + { + "epoch": 0.1398733421332197, + "grad_norm": 9.671730041503906, + "learning_rate": 9.635343928971167e-06, + "loss": 2.6507, + "step": 536400 + }, + { + "epoch": 0.13992549475892188, + "grad_norm": 9.459322929382324, + "learning_rate": 9.635030481360129e-06, + "loss": 2.6104, + "step": 536600 + }, + { + "epoch": 0.13997764738462404, + "grad_norm": 9.426004409790039, + "learning_rate": 9.63471690419464e-06, + "loss": 2.63, + "step": 536800 + }, + { + "epoch": 0.14002980001032622, + "grad_norm": 9.91978931427002, + "learning_rate": 9.634403197483472e-06, + "loss": 2.6297, + "step": 537000 + }, + { + "epoch": 0.14008195263602838, + "grad_norm": 10.36573600769043, + "learning_rate": 9.634089361235391e-06, + "loss": 2.6198, + "step": 537200 + }, + { + "epoch": 0.14013410526173056, + "grad_norm": 12.126402854919434, + "learning_rate": 9.633775395459169e-06, + "loss": 2.6285, + "step": 537400 + }, + { + "epoch": 0.14018625788743272, + "grad_norm": 8.916886329650879, + "learning_rate": 9.633461300163582e-06, + "loss": 2.6479, + "step": 537600 + }, + { + "epoch": 0.1402384105131349, + "grad_norm": 10.035049438476562, + 
"learning_rate": 9.633147075357409e-06, + "loss": 2.6763, + "step": 537800 + }, + { + "epoch": 0.14029056313883706, + "grad_norm": 9.93181324005127, + "learning_rate": 9.632832721049432e-06, + "loss": 2.6169, + "step": 538000 + }, + { + "epoch": 0.14034271576453924, + "grad_norm": 10.294628143310547, + "learning_rate": 9.632518237248439e-06, + "loss": 2.6648, + "step": 538200 + }, + { + "epoch": 0.1403948683902414, + "grad_norm": 9.200163841247559, + "learning_rate": 9.63220362396322e-06, + "loss": 2.6165, + "step": 538400 + }, + { + "epoch": 0.14044702101594359, + "grad_norm": 9.44741153717041, + "learning_rate": 9.631888881202569e-06, + "loss": 2.6108, + "step": 538600 + }, + { + "epoch": 0.14049917364164574, + "grad_norm": 9.307562828063965, + "learning_rate": 9.631574008975281e-06, + "loss": 2.6276, + "step": 538800 + }, + { + "epoch": 0.14055132626734793, + "grad_norm": 8.72498893737793, + "learning_rate": 9.631259007290162e-06, + "loss": 2.6432, + "step": 539000 + }, + { + "epoch": 0.14060347889305008, + "grad_norm": 10.48019027709961, + "learning_rate": 9.63094387615601e-06, + "loss": 2.6292, + "step": 539200 + }, + { + "epoch": 0.14065563151875227, + "grad_norm": 9.463690757751465, + "learning_rate": 9.63062861558164e-06, + "loss": 2.6205, + "step": 539400 + }, + { + "epoch": 0.14070778414445442, + "grad_norm": 9.219118118286133, + "learning_rate": 9.630313225575862e-06, + "loss": 2.6125, + "step": 539600 + }, + { + "epoch": 0.1407599367701566, + "grad_norm": 11.975398063659668, + "learning_rate": 9.629997706147488e-06, + "loss": 2.6088, + "step": 539800 + }, + { + "epoch": 0.14081208939585876, + "grad_norm": 8.592241287231445, + "learning_rate": 9.629682057305341e-06, + "loss": 2.626, + "step": 540000 + }, + { + "epoch": 0.14086424202156095, + "grad_norm": 9.817209243774414, + "learning_rate": 9.62936627905824e-06, + "loss": 2.647, + "step": 540200 + }, + { + "epoch": 0.1409163946472631, + "grad_norm": 10.493924140930176, + "learning_rate": 9.629050371415015e-06, + "loss": 2.622, + "step": 540400 + }, + { + "epoch": 0.1409685472729653, + "grad_norm": 9.412971496582031, + "learning_rate": 9.628734334384495e-06, + "loss": 2.6524, + "step": 540600 + }, + { + "epoch": 0.14102069989866745, + "grad_norm": 10.187376976013184, + "learning_rate": 9.628418167975512e-06, + "loss": 2.6041, + "step": 540800 + }, + { + "epoch": 0.14107285252436963, + "grad_norm": 9.966575622558594, + "learning_rate": 9.628101872196907e-06, + "loss": 2.664, + "step": 541000 + }, + { + "epoch": 0.1411250051500718, + "grad_norm": 9.64553451538086, + "learning_rate": 9.627785447057517e-06, + "loss": 2.686, + "step": 541200 + }, + { + "epoch": 0.14117715777577397, + "grad_norm": 11.089734077453613, + "learning_rate": 9.627468892566186e-06, + "loss": 2.6556, + "step": 541400 + }, + { + "epoch": 0.14122931040147613, + "grad_norm": 9.867591857910156, + "learning_rate": 9.627152208731765e-06, + "loss": 2.6004, + "step": 541600 + }, + { + "epoch": 0.1412814630271783, + "grad_norm": 9.480960845947266, + "learning_rate": 9.626835395563104e-06, + "loss": 2.6388, + "step": 541800 + }, + { + "epoch": 0.14133361565288047, + "grad_norm": 9.904638290405273, + "learning_rate": 9.62651845306906e-06, + "loss": 2.6437, + "step": 542000 + }, + { + "epoch": 0.14138576827858262, + "grad_norm": 10.75538158416748, + "learning_rate": 9.626201381258488e-06, + "loss": 2.6249, + "step": 542200 + }, + { + "epoch": 0.1414379209042848, + "grad_norm": 9.948746681213379, + "learning_rate": 9.625884180140255e-06, + "loss": 2.6459, + "step": 542400 
+ }, + { + "epoch": 0.14149007352998696, + "grad_norm": 8.870070457458496, + "learning_rate": 9.625566849723227e-06, + "loss": 2.6629, + "step": 542600 + }, + { + "epoch": 0.14154222615568915, + "grad_norm": 10.196480751037598, + "learning_rate": 9.62524939001627e-06, + "loss": 2.6402, + "step": 542800 + }, + { + "epoch": 0.1415943787813913, + "grad_norm": 9.863924026489258, + "learning_rate": 9.624931801028258e-06, + "loss": 2.6385, + "step": 543000 + }, + { + "epoch": 0.1416465314070935, + "grad_norm": 9.387092590332031, + "learning_rate": 9.624614082768073e-06, + "loss": 2.6503, + "step": 543200 + }, + { + "epoch": 0.14169868403279565, + "grad_norm": 11.089527130126953, + "learning_rate": 9.62429623524459e-06, + "loss": 2.6537, + "step": 543400 + }, + { + "epoch": 0.14175083665849783, + "grad_norm": 10.329263687133789, + "learning_rate": 9.623978258466696e-06, + "loss": 2.6778, + "step": 543600 + }, + { + "epoch": 0.1418029892842, + "grad_norm": 10.49349308013916, + "learning_rate": 9.623660152443277e-06, + "loss": 2.6065, + "step": 543800 + }, + { + "epoch": 0.14185514190990217, + "grad_norm": 9.694085121154785, + "learning_rate": 9.623341917183227e-06, + "loss": 2.627, + "step": 544000 + }, + { + "epoch": 0.14190729453560433, + "grad_norm": 10.875962257385254, + "learning_rate": 9.623023552695438e-06, + "loss": 2.653, + "step": 544200 + }, + { + "epoch": 0.1419594471613065, + "grad_norm": 8.842427253723145, + "learning_rate": 9.62270505898881e-06, + "loss": 2.6317, + "step": 544400 + }, + { + "epoch": 0.14201159978700867, + "grad_norm": 9.090118408203125, + "learning_rate": 9.622386436072246e-06, + "loss": 2.6197, + "step": 544600 + }, + { + "epoch": 0.14206375241271085, + "grad_norm": 10.754172325134277, + "learning_rate": 9.622067683954651e-06, + "loss": 2.5924, + "step": 544800 + }, + { + "epoch": 0.142115905038413, + "grad_norm": 9.412361145019531, + "learning_rate": 9.621748802644934e-06, + "loss": 2.6484, + "step": 545000 + }, + { + "epoch": 0.1421680576641152, + "grad_norm": 10.36867618560791, + "learning_rate": 9.62142979215201e-06, + "loss": 2.6376, + "step": 545200 + }, + { + "epoch": 0.14222021028981735, + "grad_norm": 9.003300666809082, + "learning_rate": 9.621110652484794e-06, + "loss": 2.6281, + "step": 545400 + }, + { + "epoch": 0.14227236291551953, + "grad_norm": 11.816666603088379, + "learning_rate": 9.62079138365221e-06, + "loss": 2.6324, + "step": 545600 + }, + { + "epoch": 0.1423245155412217, + "grad_norm": 9.261345863342285, + "learning_rate": 9.620471985663175e-06, + "loss": 2.6267, + "step": 545800 + }, + { + "epoch": 0.14237666816692388, + "grad_norm": 8.67002010345459, + "learning_rate": 9.620152458526622e-06, + "loss": 2.6205, + "step": 546000 + }, + { + "epoch": 0.14242882079262603, + "grad_norm": 10.905217170715332, + "learning_rate": 9.61983280225148e-06, + "loss": 2.6096, + "step": 546200 + }, + { + "epoch": 0.14248097341832822, + "grad_norm": 10.862191200256348, + "learning_rate": 9.619513016846685e-06, + "loss": 2.6368, + "step": 546400 + }, + { + "epoch": 0.14253312604403037, + "grad_norm": 9.508679389953613, + "learning_rate": 9.619193102321174e-06, + "loss": 2.6533, + "step": 546600 + }, + { + "epoch": 0.14258527866973256, + "grad_norm": 9.74131965637207, + "learning_rate": 9.61887305868389e-06, + "loss": 2.6502, + "step": 546800 + }, + { + "epoch": 0.1426374312954347, + "grad_norm": 9.636917114257812, + "learning_rate": 9.618552885943777e-06, + "loss": 2.6608, + "step": 547000 + }, + { + "epoch": 0.1426895839211369, + "grad_norm": 9.495331764221191, 
+ "learning_rate": 9.618232584109788e-06, + "loss": 2.6519, + "step": 547200 + }, + { + "epoch": 0.14274173654683905, + "grad_norm": 9.559065818786621, + "learning_rate": 9.617912153190871e-06, + "loss": 2.6379, + "step": 547400 + }, + { + "epoch": 0.14279388917254124, + "grad_norm": 9.602763175964355, + "learning_rate": 9.617591593195987e-06, + "loss": 2.6304, + "step": 547600 + }, + { + "epoch": 0.1428460417982434, + "grad_norm": 9.828039169311523, + "learning_rate": 9.617270904134091e-06, + "loss": 2.6251, + "step": 547800 + }, + { + "epoch": 0.14289819442394555, + "grad_norm": 9.119592666625977, + "learning_rate": 9.616950086014152e-06, + "loss": 2.6231, + "step": 548000 + }, + { + "epoch": 0.14295034704964774, + "grad_norm": 10.890464782714844, + "learning_rate": 9.616629138845132e-06, + "loss": 2.649, + "step": 548200 + }, + { + "epoch": 0.1430024996753499, + "grad_norm": 8.780187606811523, + "learning_rate": 9.616308062636006e-06, + "loss": 2.6794, + "step": 548400 + }, + { + "epoch": 0.14305465230105208, + "grad_norm": 9.78468132019043, + "learning_rate": 9.615986857395746e-06, + "loss": 2.6462, + "step": 548600 + }, + { + "epoch": 0.14310680492675423, + "grad_norm": 8.558454513549805, + "learning_rate": 9.615665523133331e-06, + "loss": 2.6206, + "step": 548800 + }, + { + "epoch": 0.14315895755245642, + "grad_norm": 8.706876754760742, + "learning_rate": 9.615344059857743e-06, + "loss": 2.6301, + "step": 549000 + }, + { + "epoch": 0.14321111017815857, + "grad_norm": 9.68352222442627, + "learning_rate": 9.615022467577966e-06, + "loss": 2.6818, + "step": 549200 + }, + { + "epoch": 0.14326326280386076, + "grad_norm": 11.24920654296875, + "learning_rate": 9.614700746302991e-06, + "loss": 2.6596, + "step": 549400 + }, + { + "epoch": 0.14331541542956291, + "grad_norm": 9.429580688476562, + "learning_rate": 9.614378896041808e-06, + "loss": 2.6525, + "step": 549600 + }, + { + "epoch": 0.1433675680552651, + "grad_norm": 10.127577781677246, + "learning_rate": 9.614056916803415e-06, + "loss": 2.6228, + "step": 549800 + }, + { + "epoch": 0.14341972068096726, + "grad_norm": 10.32852840423584, + "learning_rate": 9.61373480859681e-06, + "loss": 2.6263, + "step": 550000 + }, + { + "epoch": 0.14347187330666944, + "grad_norm": 10.721578598022461, + "learning_rate": 9.613412571430998e-06, + "loss": 2.6296, + "step": 550200 + }, + { + "epoch": 0.1435240259323716, + "grad_norm": 8.454923629760742, + "learning_rate": 9.613090205314985e-06, + "loss": 2.656, + "step": 550400 + }, + { + "epoch": 0.14357617855807378, + "grad_norm": 9.832836151123047, + "learning_rate": 9.612767710257782e-06, + "loss": 2.6442, + "step": 550600 + }, + { + "epoch": 0.14362833118377594, + "grad_norm": 9.232677459716797, + "learning_rate": 9.612445086268403e-06, + "loss": 2.6341, + "step": 550800 + }, + { + "epoch": 0.14368048380947812, + "grad_norm": 11.069082260131836, + "learning_rate": 9.612122333355865e-06, + "loss": 2.6251, + "step": 551000 + }, + { + "epoch": 0.14373263643518028, + "grad_norm": 10.895528793334961, + "learning_rate": 9.611799451529189e-06, + "loss": 2.6328, + "step": 551200 + }, + { + "epoch": 0.14378478906088246, + "grad_norm": 9.884492874145508, + "learning_rate": 9.611476440797403e-06, + "loss": 2.5978, + "step": 551400 + }, + { + "epoch": 0.14383694168658462, + "grad_norm": 9.460986137390137, + "learning_rate": 9.61115330116953e-06, + "loss": 2.6392, + "step": 551600 + }, + { + "epoch": 0.1438890943122868, + "grad_norm": 9.917821884155273, + "learning_rate": 9.610830032654607e-06, + "loss": 2.6414, + 
"step": 551800 + }, + { + "epoch": 0.14394124693798896, + "grad_norm": 8.92539119720459, + "learning_rate": 9.61050663526167e-06, + "loss": 2.606, + "step": 552000 + }, + { + "epoch": 0.14399339956369114, + "grad_norm": 10.357989311218262, + "learning_rate": 9.610183108999754e-06, + "loss": 2.5919, + "step": 552200 + }, + { + "epoch": 0.1440455521893933, + "grad_norm": 11.38748550415039, + "learning_rate": 9.609859453877906e-06, + "loss": 2.6614, + "step": 552400 + }, + { + "epoch": 0.14409770481509548, + "grad_norm": 8.538469314575195, + "learning_rate": 9.60953566990517e-06, + "loss": 2.6751, + "step": 552600 + }, + { + "epoch": 0.14414985744079764, + "grad_norm": 10.609665870666504, + "learning_rate": 9.609211757090596e-06, + "loss": 2.6102, + "step": 552800 + }, + { + "epoch": 0.14420201006649983, + "grad_norm": 9.466772079467773, + "learning_rate": 9.60888771544324e-06, + "loss": 2.6074, + "step": 553000 + }, + { + "epoch": 0.14425416269220198, + "grad_norm": 10.574810981750488, + "learning_rate": 9.608563544972159e-06, + "loss": 2.6195, + "step": 553200 + }, + { + "epoch": 0.14430631531790417, + "grad_norm": 8.984042167663574, + "learning_rate": 9.60823924568641e-06, + "loss": 2.6586, + "step": 553400 + }, + { + "epoch": 0.14435846794360632, + "grad_norm": 9.159278869628906, + "learning_rate": 9.607914817595062e-06, + "loss": 2.5936, + "step": 553600 + }, + { + "epoch": 0.1444106205693085, + "grad_norm": 10.08705997467041, + "learning_rate": 9.607590260707182e-06, + "loss": 2.6402, + "step": 553800 + }, + { + "epoch": 0.14446277319501066, + "grad_norm": 9.373217582702637, + "learning_rate": 9.607265575031843e-06, + "loss": 2.6356, + "step": 554000 + }, + { + "epoch": 0.14451492582071282, + "grad_norm": 9.303369522094727, + "learning_rate": 9.606940760578118e-06, + "loss": 2.6004, + "step": 554200 + }, + { + "epoch": 0.144567078446415, + "grad_norm": 10.792057991027832, + "learning_rate": 9.606615817355084e-06, + "loss": 2.619, + "step": 554400 + }, + { + "epoch": 0.14461923107211716, + "grad_norm": 9.816146850585938, + "learning_rate": 9.60629074537183e-06, + "loss": 2.5744, + "step": 554600 + }, + { + "epoch": 0.14467138369781934, + "grad_norm": 9.5195894241333, + "learning_rate": 9.605965544637437e-06, + "loss": 2.6646, + "step": 554800 + }, + { + "epoch": 0.1447235363235215, + "grad_norm": 9.027356147766113, + "learning_rate": 9.605640215160996e-06, + "loss": 2.5848, + "step": 555000 + }, + { + "epoch": 0.14477568894922369, + "grad_norm": 9.522998809814453, + "learning_rate": 9.6053147569516e-06, + "loss": 2.631, + "step": 555200 + }, + { + "epoch": 0.14482784157492584, + "grad_norm": 10.631654739379883, + "learning_rate": 9.604989170018347e-06, + "loss": 2.6315, + "step": 555400 + }, + { + "epoch": 0.14487999420062803, + "grad_norm": 9.056488037109375, + "learning_rate": 9.604663454370338e-06, + "loss": 2.5954, + "step": 555600 + }, + { + "epoch": 0.14493214682633018, + "grad_norm": 8.656085014343262, + "learning_rate": 9.604337610016674e-06, + "loss": 2.619, + "step": 555800 + }, + { + "epoch": 0.14498429945203237, + "grad_norm": 10.284260749816895, + "learning_rate": 9.604011636966466e-06, + "loss": 2.6313, + "step": 556000 + }, + { + "epoch": 0.14503645207773452, + "grad_norm": 8.677928924560547, + "learning_rate": 9.603685535228823e-06, + "loss": 2.6198, + "step": 556200 + }, + { + "epoch": 0.1450886047034367, + "grad_norm": 9.978887557983398, + "learning_rate": 9.603359304812863e-06, + "loss": 2.5997, + "step": 556400 + }, + { + "epoch": 0.14514075732913886, + "grad_norm": 
10.44409465789795, + "learning_rate": 9.6030329457277e-06, + "loss": 2.6256, + "step": 556600 + }, + { + "epoch": 0.14519290995484105, + "grad_norm": 10.068913459777832, + "learning_rate": 9.60270645798246e-06, + "loss": 2.5958, + "step": 556800 + }, + { + "epoch": 0.1452450625805432, + "grad_norm": 9.706381797790527, + "learning_rate": 9.602379841586269e-06, + "loss": 2.5839, + "step": 557000 + }, + { + "epoch": 0.1452972152062454, + "grad_norm": 8.250496864318848, + "learning_rate": 9.602053096548251e-06, + "loss": 2.6462, + "step": 557200 + }, + { + "epoch": 0.14534936783194755, + "grad_norm": 10.653825759887695, + "learning_rate": 9.601726222877546e-06, + "loss": 2.6536, + "step": 557400 + }, + { + "epoch": 0.14540152045764973, + "grad_norm": 9.490010261535645, + "learning_rate": 9.601399220583285e-06, + "loss": 2.6482, + "step": 557600 + }, + { + "epoch": 0.1454536730833519, + "grad_norm": 9.792756080627441, + "learning_rate": 9.601072089674613e-06, + "loss": 2.6215, + "step": 557800 + }, + { + "epoch": 0.14550582570905407, + "grad_norm": 11.322357177734375, + "learning_rate": 9.600744830160667e-06, + "loss": 2.6084, + "step": 558000 + }, + { + "epoch": 0.14555797833475623, + "grad_norm": 10.1467924118042, + "learning_rate": 9.6004174420506e-06, + "loss": 2.6443, + "step": 558200 + }, + { + "epoch": 0.1456101309604584, + "grad_norm": 10.210265159606934, + "learning_rate": 9.600089925353562e-06, + "loss": 2.6163, + "step": 558400 + }, + { + "epoch": 0.14566228358616057, + "grad_norm": 12.064213752746582, + "learning_rate": 9.599762280078705e-06, + "loss": 2.6455, + "step": 558600 + }, + { + "epoch": 0.14571443621186275, + "grad_norm": 8.971247673034668, + "learning_rate": 9.599434506235188e-06, + "loss": 2.6323, + "step": 558800 + }, + { + "epoch": 0.1457665888375649, + "grad_norm": 9.186264991760254, + "learning_rate": 9.599106603832176e-06, + "loss": 2.6271, + "step": 559000 + }, + { + "epoch": 0.1458187414632671, + "grad_norm": 10.96983814239502, + "learning_rate": 9.598778572878828e-06, + "loss": 2.6766, + "step": 559200 + }, + { + "epoch": 0.14587089408896925, + "grad_norm": 9.236897468566895, + "learning_rate": 9.59845041338432e-06, + "loss": 2.6698, + "step": 559400 + }, + { + "epoch": 0.14592304671467143, + "grad_norm": 9.946492195129395, + "learning_rate": 9.598122125357817e-06, + "loss": 2.6006, + "step": 559600 + }, + { + "epoch": 0.1459751993403736, + "grad_norm": 10.890714645385742, + "learning_rate": 9.597793708808501e-06, + "loss": 2.6319, + "step": 559800 + }, + { + "epoch": 0.14602735196607575, + "grad_norm": 10.893404006958008, + "learning_rate": 9.597465163745548e-06, + "loss": 2.6361, + "step": 560000 + }, + { + "epoch": 0.14607950459177793, + "grad_norm": 10.29994010925293, + "learning_rate": 9.597136490178145e-06, + "loss": 2.6443, + "step": 560200 + }, + { + "epoch": 0.1461316572174801, + "grad_norm": 10.262697219848633, + "learning_rate": 9.596807688115474e-06, + "loss": 2.6291, + "step": 560400 + }, + { + "epoch": 0.14618380984318227, + "grad_norm": 10.052336692810059, + "learning_rate": 9.596478757566729e-06, + "loss": 2.6116, + "step": 560600 + }, + { + "epoch": 0.14623596246888443, + "grad_norm": 9.586968421936035, + "learning_rate": 9.596149698541102e-06, + "loss": 2.6361, + "step": 560800 + }, + { + "epoch": 0.1462881150945866, + "grad_norm": 10.506963729858398, + "learning_rate": 9.595820511047791e-06, + "loss": 2.6558, + "step": 561000 + }, + { + "epoch": 0.14634026772028877, + "grad_norm": 11.302355766296387, + "learning_rate": 9.595491195095998e-06, + 
"loss": 2.6385, + "step": 561200 + }, + { + "epoch": 0.14639242034599095, + "grad_norm": 10.142350196838379, + "learning_rate": 9.595161750694927e-06, + "loss": 2.6049, + "step": 561400 + }, + { + "epoch": 0.1464445729716931, + "grad_norm": 8.8736572265625, + "learning_rate": 9.594832177853787e-06, + "loss": 2.606, + "step": 561600 + }, + { + "epoch": 0.1464967255973953, + "grad_norm": 10.957189559936523, + "learning_rate": 9.59450247658179e-06, + "loss": 2.6365, + "step": 561800 + }, + { + "epoch": 0.14654887822309745, + "grad_norm": 10.375482559204102, + "learning_rate": 9.594172646888151e-06, + "loss": 2.6283, + "step": 562000 + }, + { + "epoch": 0.14660103084879963, + "grad_norm": 10.932716369628906, + "learning_rate": 9.59384268878209e-06, + "loss": 2.658, + "step": 562200 + }, + { + "epoch": 0.1466531834745018, + "grad_norm": 9.20926284790039, + "learning_rate": 9.593512602272828e-06, + "loss": 2.6566, + "step": 562400 + }, + { + "epoch": 0.14670533610020398, + "grad_norm": 10.561077117919922, + "learning_rate": 9.593182387369592e-06, + "loss": 2.6381, + "step": 562600 + }, + { + "epoch": 0.14675748872590613, + "grad_norm": 9.518685340881348, + "learning_rate": 9.592852044081614e-06, + "loss": 2.5938, + "step": 562800 + }, + { + "epoch": 0.14680964135160832, + "grad_norm": 10.072381973266602, + "learning_rate": 9.592521572418123e-06, + "loss": 2.665, + "step": 563000 + }, + { + "epoch": 0.14686179397731047, + "grad_norm": 10.22624683380127, + "learning_rate": 9.59219097238836e-06, + "loss": 2.563, + "step": 563200 + }, + { + "epoch": 0.14691394660301266, + "grad_norm": 10.61490249633789, + "learning_rate": 9.591860244001563e-06, + "loss": 2.6146, + "step": 563400 + }, + { + "epoch": 0.1469660992287148, + "grad_norm": 9.314760208129883, + "learning_rate": 9.59152938726698e-06, + "loss": 2.6327, + "step": 563600 + }, + { + "epoch": 0.147018251854417, + "grad_norm": 10.206314086914062, + "learning_rate": 9.591198402193854e-06, + "loss": 2.6513, + "step": 563800 + }, + { + "epoch": 0.14707040448011915, + "grad_norm": 11.540358543395996, + "learning_rate": 9.59086728879144e-06, + "loss": 2.6168, + "step": 564000 + }, + { + "epoch": 0.14712255710582134, + "grad_norm": 10.943085670471191, + "learning_rate": 9.590536047068992e-06, + "loss": 2.6184, + "step": 564200 + }, + { + "epoch": 0.1471747097315235, + "grad_norm": 11.087271690368652, + "learning_rate": 9.590204677035769e-06, + "loss": 2.6272, + "step": 564400 + }, + { + "epoch": 0.14722686235722568, + "grad_norm": 10.729355812072754, + "learning_rate": 9.58987317870103e-06, + "loss": 2.6478, + "step": 564600 + }, + { + "epoch": 0.14727901498292784, + "grad_norm": 10.2618408203125, + "learning_rate": 9.589541552074044e-06, + "loss": 2.6489, + "step": 564800 + }, + { + "epoch": 0.14733116760863002, + "grad_norm": 8.719612121582031, + "learning_rate": 9.589209797164082e-06, + "loss": 2.6471, + "step": 565000 + }, + { + "epoch": 0.14738332023433218, + "grad_norm": 9.179375648498535, + "learning_rate": 9.588877913980411e-06, + "loss": 2.6204, + "step": 565200 + }, + { + "epoch": 0.14743547286003436, + "grad_norm": 8.887311935424805, + "learning_rate": 9.588545902532316e-06, + "loss": 2.627, + "step": 565400 + }, + { + "epoch": 0.14748762548573652, + "grad_norm": 10.409414291381836, + "learning_rate": 9.588213762829068e-06, + "loss": 2.6492, + "step": 565600 + }, + { + "epoch": 0.14753977811143867, + "grad_norm": 9.475335121154785, + "learning_rate": 9.587881494879956e-06, + "loss": 2.6261, + "step": 565800 + }, + { + "epoch": 
0.14759193073714086, + "grad_norm": 10.67691707611084, + "learning_rate": 9.587549098694268e-06, + "loss": 2.6279, + "step": 566000 + }, + { + "epoch": 0.14764408336284301, + "grad_norm": 9.022562980651855, + "learning_rate": 9.58721657428129e-06, + "loss": 2.6222, + "step": 566200 + }, + { + "epoch": 0.1476962359885452, + "grad_norm": 9.80457878112793, + "learning_rate": 9.586883921650322e-06, + "loss": 2.6257, + "step": 566400 + }, + { + "epoch": 0.14774838861424736, + "grad_norm": 10.465949058532715, + "learning_rate": 9.586551140810659e-06, + "loss": 2.6321, + "step": 566600 + }, + { + "epoch": 0.14780054123994954, + "grad_norm": 8.894414901733398, + "learning_rate": 9.586218231771602e-06, + "loss": 2.6289, + "step": 566800 + }, + { + "epoch": 0.1478526938656517, + "grad_norm": 9.935043334960938, + "learning_rate": 9.58588519454246e-06, + "loss": 2.6112, + "step": 567000 + }, + { + "epoch": 0.14790484649135388, + "grad_norm": 10.814425468444824, + "learning_rate": 9.585552029132537e-06, + "loss": 2.6257, + "step": 567200 + }, + { + "epoch": 0.14795699911705604, + "grad_norm": 10.175069808959961, + "learning_rate": 9.585218735551147e-06, + "loss": 2.6417, + "step": 567400 + }, + { + "epoch": 0.14800915174275822, + "grad_norm": 9.798264503479004, + "learning_rate": 9.584885313807607e-06, + "loss": 2.6016, + "step": 567600 + }, + { + "epoch": 0.14806130436846038, + "grad_norm": 9.588288307189941, + "learning_rate": 9.584551763911236e-06, + "loss": 2.6012, + "step": 567800 + }, + { + "epoch": 0.14811345699416256, + "grad_norm": 9.886007308959961, + "learning_rate": 9.584218085871358e-06, + "loss": 2.6357, + "step": 568000 + }, + { + "epoch": 0.14816560961986472, + "grad_norm": 9.629053115844727, + "learning_rate": 9.583884279697297e-06, + "loss": 2.6257, + "step": 568200 + }, + { + "epoch": 0.1482177622455669, + "grad_norm": 9.915539741516113, + "learning_rate": 9.583550345398385e-06, + "loss": 2.6651, + "step": 568400 + }, + { + "epoch": 0.14826991487126906, + "grad_norm": 10.119706153869629, + "learning_rate": 9.583216282983955e-06, + "loss": 2.6624, + "step": 568600 + }, + { + "epoch": 0.14832206749697124, + "grad_norm": 10.616450309753418, + "learning_rate": 9.582882092463348e-06, + "loss": 2.6166, + "step": 568800 + }, + { + "epoch": 0.1483742201226734, + "grad_norm": 10.009668350219727, + "learning_rate": 9.582547773845901e-06, + "loss": 2.6372, + "step": 569000 + }, + { + "epoch": 0.14842637274837558, + "grad_norm": 9.56704330444336, + "learning_rate": 9.582213327140958e-06, + "loss": 2.648, + "step": 569200 + }, + { + "epoch": 0.14847852537407774, + "grad_norm": 12.643216133117676, + "learning_rate": 9.581878752357871e-06, + "loss": 2.6362, + "step": 569400 + }, + { + "epoch": 0.14853067799977993, + "grad_norm": 9.96658992767334, + "learning_rate": 9.581544049505991e-06, + "loss": 2.6044, + "step": 569600 + }, + { + "epoch": 0.14858283062548208, + "grad_norm": 9.937406539916992, + "learning_rate": 9.58120921859467e-06, + "loss": 2.6604, + "step": 569800 + }, + { + "epoch": 0.14863498325118427, + "grad_norm": 10.475345611572266, + "learning_rate": 9.58087425963327e-06, + "loss": 2.6185, + "step": 570000 + }, + { + "epoch": 0.14868713587688642, + "grad_norm": 9.268296241760254, + "learning_rate": 9.580539172631152e-06, + "loss": 2.6081, + "step": 570200 + }, + { + "epoch": 0.1487392885025886, + "grad_norm": 9.733953475952148, + "learning_rate": 9.580203957597683e-06, + "loss": 2.6293, + "step": 570400 + }, + { + "epoch": 0.14879144112829076, + "grad_norm": 10.396161079406738, + 
"learning_rate": 9.579868614542231e-06, + "loss": 2.6485, + "step": 570600 + }, + { + "epoch": 0.14884359375399295, + "grad_norm": 10.291255950927734, + "learning_rate": 9.579533143474172e-06, + "loss": 2.6246, + "step": 570800 + }, + { + "epoch": 0.1488957463796951, + "grad_norm": 10.212169647216797, + "learning_rate": 9.579197544402881e-06, + "loss": 2.5966, + "step": 571000 + }, + { + "epoch": 0.1489478990053973, + "grad_norm": 9.417984962463379, + "learning_rate": 9.578861817337736e-06, + "loss": 2.628, + "step": 571200 + }, + { + "epoch": 0.14900005163109944, + "grad_norm": 11.286150932312012, + "learning_rate": 9.578525962288127e-06, + "loss": 2.6472, + "step": 571400 + }, + { + "epoch": 0.1490522042568016, + "grad_norm": 9.67978286743164, + "learning_rate": 9.578189979263438e-06, + "loss": 2.6192, + "step": 571600 + }, + { + "epoch": 0.14910435688250379, + "grad_norm": 10.592140197753906, + "learning_rate": 9.577853868273057e-06, + "loss": 2.6086, + "step": 571800 + }, + { + "epoch": 0.14915650950820594, + "grad_norm": 10.959556579589844, + "learning_rate": 9.577517629326385e-06, + "loss": 2.5899, + "step": 572000 + }, + { + "epoch": 0.14920866213390813, + "grad_norm": 11.056251525878906, + "learning_rate": 9.577181262432815e-06, + "loss": 2.655, + "step": 572200 + }, + { + "epoch": 0.14926081475961028, + "grad_norm": 10.780267715454102, + "learning_rate": 9.576844767601753e-06, + "loss": 2.6094, + "step": 572400 + }, + { + "epoch": 0.14931296738531247, + "grad_norm": 10.068269729614258, + "learning_rate": 9.576508144842603e-06, + "loss": 2.6275, + "step": 572600 + }, + { + "epoch": 0.14936512001101462, + "grad_norm": 10.287019729614258, + "learning_rate": 9.57617139416477e-06, + "loss": 2.6343, + "step": 572800 + }, + { + "epoch": 0.1494172726367168, + "grad_norm": 8.428248405456543, + "learning_rate": 9.575834515577673e-06, + "loss": 2.597, + "step": 573000 + }, + { + "epoch": 0.14946942526241896, + "grad_norm": 10.998594284057617, + "learning_rate": 9.575497509090723e-06, + "loss": 2.6543, + "step": 573200 + }, + { + "epoch": 0.14952157788812115, + "grad_norm": 9.181807518005371, + "learning_rate": 9.575160374713344e-06, + "loss": 2.6463, + "step": 573400 + }, + { + "epoch": 0.1495737305138233, + "grad_norm": 10.688831329345703, + "learning_rate": 9.574823112454957e-06, + "loss": 2.6277, + "step": 573600 + }, + { + "epoch": 0.1496258831395255, + "grad_norm": 9.806432723999023, + "learning_rate": 9.574485722324986e-06, + "loss": 2.6082, + "step": 573800 + }, + { + "epoch": 0.14967803576522765, + "grad_norm": 10.59438419342041, + "learning_rate": 9.574148204332868e-06, + "loss": 2.6137, + "step": 574000 + }, + { + "epoch": 0.14973018839092983, + "grad_norm": 11.886221885681152, + "learning_rate": 9.573810558488032e-06, + "loss": 2.6198, + "step": 574200 + }, + { + "epoch": 0.149782341016632, + "grad_norm": 9.97330379486084, + "learning_rate": 9.573472784799918e-06, + "loss": 2.6259, + "step": 574400 + }, + { + "epoch": 0.14983449364233417, + "grad_norm": 10.082623481750488, + "learning_rate": 9.573134883277966e-06, + "loss": 2.6735, + "step": 574600 + }, + { + "epoch": 0.14988664626803633, + "grad_norm": 10.873509407043457, + "learning_rate": 9.572796853931619e-06, + "loss": 2.6497, + "step": 574800 + }, + { + "epoch": 0.1499387988937385, + "grad_norm": 10.863423347473145, + "learning_rate": 9.57245869677033e-06, + "loss": 2.6382, + "step": 575000 + }, + { + "epoch": 0.14999095151944067, + "grad_norm": 9.81516170501709, + "learning_rate": 9.572120411803547e-06, + "loss": 2.5861, + 
"step": 575200 + }, + { + "epoch": 0.15004310414514285, + "grad_norm": 10.294231414794922, + "learning_rate": 9.571781999040726e-06, + "loss": 2.5945, + "step": 575400 + }, + { + "epoch": 0.150095256770845, + "grad_norm": 10.843868255615234, + "learning_rate": 9.571443458491327e-06, + "loss": 2.6269, + "step": 575600 + }, + { + "epoch": 0.1501474093965472, + "grad_norm": 8.99116039276123, + "learning_rate": 9.571104790164814e-06, + "loss": 2.6215, + "step": 575800 + }, + { + "epoch": 0.15019956202224935, + "grad_norm": 10.161559104919434, + "learning_rate": 9.57076599407065e-06, + "loss": 2.6052, + "step": 576000 + }, + { + "epoch": 0.15025171464795153, + "grad_norm": 9.422714233398438, + "learning_rate": 9.570427070218306e-06, + "loss": 2.6268, + "step": 576200 + }, + { + "epoch": 0.1503038672736537, + "grad_norm": 9.956077575683594, + "learning_rate": 9.570088018617255e-06, + "loss": 2.6241, + "step": 576400 + }, + { + "epoch": 0.15035601989935587, + "grad_norm": 10.455558776855469, + "learning_rate": 9.569748839276975e-06, + "loss": 2.6146, + "step": 576600 + }, + { + "epoch": 0.15040817252505803, + "grad_norm": 9.470061302185059, + "learning_rate": 9.569409532206945e-06, + "loss": 2.6122, + "step": 576800 + }, + { + "epoch": 0.15046032515076022, + "grad_norm": 9.468082427978516, + "learning_rate": 9.56907009741665e-06, + "loss": 2.6527, + "step": 577000 + }, + { + "epoch": 0.15051247777646237, + "grad_norm": 10.123863220214844, + "learning_rate": 9.56873053491558e-06, + "loss": 2.6252, + "step": 577200 + }, + { + "epoch": 0.15056463040216453, + "grad_norm": 9.473311424255371, + "learning_rate": 9.568390844713221e-06, + "loss": 2.622, + "step": 577400 + }, + { + "epoch": 0.1506167830278667, + "grad_norm": 10.261393547058105, + "learning_rate": 9.568051026819072e-06, + "loss": 2.6369, + "step": 577600 + }, + { + "epoch": 0.15066893565356887, + "grad_norm": 7.631354331970215, + "learning_rate": 9.567711081242628e-06, + "loss": 2.6039, + "step": 577800 + }, + { + "epoch": 0.15072108827927105, + "grad_norm": 11.26151180267334, + "learning_rate": 9.567371007993394e-06, + "loss": 2.6304, + "step": 578000 + }, + { + "epoch": 0.1507732409049732, + "grad_norm": 10.592456817626953, + "learning_rate": 9.567030807080872e-06, + "loss": 2.6539, + "step": 578200 + }, + { + "epoch": 0.1508253935306754, + "grad_norm": 9.679903030395508, + "learning_rate": 9.566690478514574e-06, + "loss": 2.621, + "step": 578400 + }, + { + "epoch": 0.15087754615637755, + "grad_norm": 9.416767120361328, + "learning_rate": 9.566350022304013e-06, + "loss": 2.6417, + "step": 578600 + }, + { + "epoch": 0.15092969878207974, + "grad_norm": 11.1919527053833, + "learning_rate": 9.566009438458701e-06, + "loss": 2.5929, + "step": 578800 + }, + { + "epoch": 0.1509818514077819, + "grad_norm": 10.748730659484863, + "learning_rate": 9.565668726988161e-06, + "loss": 2.6213, + "step": 579000 + }, + { + "epoch": 0.15103400403348408, + "grad_norm": 11.362808227539062, + "learning_rate": 9.565327887901918e-06, + "loss": 2.65, + "step": 579200 + }, + { + "epoch": 0.15108615665918623, + "grad_norm": 10.134300231933594, + "learning_rate": 9.564986921209493e-06, + "loss": 2.5866, + "step": 579400 + }, + { + "epoch": 0.15113830928488842, + "grad_norm": 10.209643363952637, + "learning_rate": 9.564645826920422e-06, + "loss": 2.5839, + "step": 579600 + }, + { + "epoch": 0.15119046191059057, + "grad_norm": 11.057905197143555, + "learning_rate": 9.564304605044239e-06, + "loss": 2.5859, + "step": 579800 + }, + { + "epoch": 0.15124261453629276, + 
"grad_norm": 11.382614135742188, + "learning_rate": 9.563963255590476e-06, + "loss": 2.5781, + "step": 580000 + }, + { + "epoch": 0.15129476716199491, + "grad_norm": 9.950427055358887, + "learning_rate": 9.563621778568679e-06, + "loss": 2.6076, + "step": 580200 + }, + { + "epoch": 0.1513469197876971, + "grad_norm": 9.879427909851074, + "learning_rate": 9.56328017398839e-06, + "loss": 2.5791, + "step": 580400 + }, + { + "epoch": 0.15139907241339925, + "grad_norm": 9.203298568725586, + "learning_rate": 9.56293844185916e-06, + "loss": 2.6405, + "step": 580600 + }, + { + "epoch": 0.15145122503910144, + "grad_norm": 10.701813697814941, + "learning_rate": 9.562596582190539e-06, + "loss": 2.6244, + "step": 580800 + }, + { + "epoch": 0.1515033776648036, + "grad_norm": 10.396395683288574, + "learning_rate": 9.562254594992082e-06, + "loss": 2.5892, + "step": 581000 + }, + { + "epoch": 0.15155553029050578, + "grad_norm": 8.901702880859375, + "learning_rate": 9.561912480273348e-06, + "loss": 2.631, + "step": 581200 + }, + { + "epoch": 0.15160768291620794, + "grad_norm": 10.96087646484375, + "learning_rate": 9.561570238043902e-06, + "loss": 2.6193, + "step": 581400 + }, + { + "epoch": 0.15165983554191012, + "grad_norm": 10.27599048614502, + "learning_rate": 9.561227868313306e-06, + "loss": 2.6072, + "step": 581600 + }, + { + "epoch": 0.15171198816761228, + "grad_norm": 9.896045684814453, + "learning_rate": 9.560885371091134e-06, + "loss": 2.6083, + "step": 581800 + }, + { + "epoch": 0.15176414079331446, + "grad_norm": 10.777628898620605, + "learning_rate": 9.560542746386955e-06, + "loss": 2.5971, + "step": 582000 + }, + { + "epoch": 0.15181629341901662, + "grad_norm": 9.133131980895996, + "learning_rate": 9.560199994210349e-06, + "loss": 2.6302, + "step": 582200 + }, + { + "epoch": 0.1518684460447188, + "grad_norm": 11.197638511657715, + "learning_rate": 9.559857114570893e-06, + "loss": 2.6054, + "step": 582400 + }, + { + "epoch": 0.15192059867042096, + "grad_norm": 10.832380294799805, + "learning_rate": 9.559514107478177e-06, + "loss": 2.5988, + "step": 582600 + }, + { + "epoch": 0.15197275129612314, + "grad_norm": 10.314582824707031, + "learning_rate": 9.559170972941782e-06, + "loss": 2.5965, + "step": 582800 + }, + { + "epoch": 0.1520249039218253, + "grad_norm": 11.280247688293457, + "learning_rate": 9.558827710971302e-06, + "loss": 2.631, + "step": 583000 + }, + { + "epoch": 0.15207705654752748, + "grad_norm": 10.295195579528809, + "learning_rate": 9.558484321576329e-06, + "loss": 2.6085, + "step": 583200 + }, + { + "epoch": 0.15212920917322964, + "grad_norm": 9.563957214355469, + "learning_rate": 9.558140804766464e-06, + "loss": 2.6147, + "step": 583400 + }, + { + "epoch": 0.1521813617989318, + "grad_norm": 10.042048454284668, + "learning_rate": 9.55779716055131e-06, + "loss": 2.6046, + "step": 583600 + }, + { + "epoch": 0.15223351442463398, + "grad_norm": 11.11385440826416, + "learning_rate": 9.557453388940468e-06, + "loss": 2.6468, + "step": 583800 + }, + { + "epoch": 0.15228566705033614, + "grad_norm": 10.20233154296875, + "learning_rate": 9.557109489943548e-06, + "loss": 2.6675, + "step": 584000 + }, + { + "epoch": 0.15233781967603832, + "grad_norm": 10.255865097045898, + "learning_rate": 9.556765463570164e-06, + "loss": 2.6439, + "step": 584200 + }, + { + "epoch": 0.15238997230174048, + "grad_norm": 9.48961067199707, + "learning_rate": 9.55642130982993e-06, + "loss": 2.6361, + "step": 584400 + }, + { + "epoch": 0.15244212492744266, + "grad_norm": 9.803009986877441, + "learning_rate": 
9.55607702873247e-06, + "loss": 2.6297, + "step": 584600 + }, + { + "epoch": 0.15249427755314482, + "grad_norm": 10.121756553649902, + "learning_rate": 9.5557326202874e-06, + "loss": 2.61, + "step": 584800 + }, + { + "epoch": 0.152546430178847, + "grad_norm": 9.789412498474121, + "learning_rate": 9.555388084504353e-06, + "loss": 2.6089, + "step": 585000 + }, + { + "epoch": 0.15259858280454916, + "grad_norm": 10.218076705932617, + "learning_rate": 9.555043421392955e-06, + "loss": 2.5853, + "step": 585200 + }, + { + "epoch": 0.15265073543025134, + "grad_norm": 10.865063667297363, + "learning_rate": 9.554698630962841e-06, + "loss": 2.6124, + "step": 585400 + }, + { + "epoch": 0.1527028880559535, + "grad_norm": 10.640392303466797, + "learning_rate": 9.55435371322365e-06, + "loss": 2.6186, + "step": 585600 + }, + { + "epoch": 0.15275504068165568, + "grad_norm": 11.789724349975586, + "learning_rate": 9.55400866818502e-06, + "loss": 2.6226, + "step": 585800 + }, + { + "epoch": 0.15280719330735784, + "grad_norm": 8.585991859436035, + "learning_rate": 9.553663495856598e-06, + "loss": 2.6153, + "step": 586000 + }, + { + "epoch": 0.15285934593306003, + "grad_norm": 12.74191665649414, + "learning_rate": 9.55331819624803e-06, + "loss": 2.6273, + "step": 586200 + }, + { + "epoch": 0.15291149855876218, + "grad_norm": 11.765472412109375, + "learning_rate": 9.552972769368969e-06, + "loss": 2.6034, + "step": 586400 + }, + { + "epoch": 0.15296365118446437, + "grad_norm": 8.79220962524414, + "learning_rate": 9.552627215229067e-06, + "loss": 2.624, + "step": 586600 + }, + { + "epoch": 0.15301580381016652, + "grad_norm": 9.72270679473877, + "learning_rate": 9.552281533837988e-06, + "loss": 2.6228, + "step": 586800 + }, + { + "epoch": 0.1530679564358687, + "grad_norm": 11.258645057678223, + "learning_rate": 9.55193572520539e-06, + "loss": 2.6312, + "step": 587000 + }, + { + "epoch": 0.15312010906157086, + "grad_norm": 10.41986083984375, + "learning_rate": 9.551589789340939e-06, + "loss": 2.5924, + "step": 587200 + }, + { + "epoch": 0.15317226168727305, + "grad_norm": 10.305185317993164, + "learning_rate": 9.551243726254304e-06, + "loss": 2.6179, + "step": 587400 + }, + { + "epoch": 0.1532244143129752, + "grad_norm": 10.521088600158691, + "learning_rate": 9.550897535955161e-06, + "loss": 2.6227, + "step": 587600 + }, + { + "epoch": 0.1532765669386774, + "grad_norm": 10.343436241149902, + "learning_rate": 9.550551218453186e-06, + "loss": 2.6194, + "step": 587800 + }, + { + "epoch": 0.15332871956437955, + "grad_norm": 9.396683692932129, + "learning_rate": 9.550204773758055e-06, + "loss": 2.6268, + "step": 588000 + }, + { + "epoch": 0.15338087219008173, + "grad_norm": 11.622981071472168, + "learning_rate": 9.549858201879456e-06, + "loss": 2.6188, + "step": 588200 + }, + { + "epoch": 0.15343302481578389, + "grad_norm": 9.679468154907227, + "learning_rate": 9.549511502827071e-06, + "loss": 2.5841, + "step": 588400 + }, + { + "epoch": 0.15348517744148607, + "grad_norm": 10.439282417297363, + "learning_rate": 9.549164676610596e-06, + "loss": 2.626, + "step": 588600 + }, + { + "epoch": 0.15353733006718823, + "grad_norm": 10.602144241333008, + "learning_rate": 9.548817723239723e-06, + "loss": 2.6081, + "step": 588800 + }, + { + "epoch": 0.1535894826928904, + "grad_norm": 9.781011581420898, + "learning_rate": 9.548470642724148e-06, + "loss": 2.5835, + "step": 589000 + }, + { + "epoch": 0.15364163531859257, + "grad_norm": 10.308581352233887, + "learning_rate": 9.548123435073575e-06, + "loss": 2.6292, + "step": 589200 + }, + 
{ + "epoch": 0.15369378794429472, + "grad_norm": 11.796847343444824, + "learning_rate": 9.547776100297708e-06, + "loss": 2.6315, + "step": 589400 + }, + { + "epoch": 0.1537459405699969, + "grad_norm": 10.120135307312012, + "learning_rate": 9.547428638406255e-06, + "loss": 2.6352, + "step": 589600 + }, + { + "epoch": 0.15379809319569906, + "grad_norm": 10.831207275390625, + "learning_rate": 9.547081049408928e-06, + "loss": 2.6174, + "step": 589800 + }, + { + "epoch": 0.15385024582140125, + "grad_norm": 9.786165237426758, + "learning_rate": 9.546733333315444e-06, + "loss": 2.6019, + "step": 590000 + }, + { + "epoch": 0.1539023984471034, + "grad_norm": 10.431953430175781, + "learning_rate": 9.54638549013552e-06, + "loss": 2.5921, + "step": 590200 + }, + { + "epoch": 0.1539545510728056, + "grad_norm": 10.351963996887207, + "learning_rate": 9.546037519878878e-06, + "loss": 2.6084, + "step": 590400 + }, + { + "epoch": 0.15400670369850775, + "grad_norm": 13.948197364807129, + "learning_rate": 9.545689422555246e-06, + "loss": 2.6002, + "step": 590600 + }, + { + "epoch": 0.15405885632420993, + "grad_norm": 9.773366928100586, + "learning_rate": 9.545341198174355e-06, + "loss": 2.6003, + "step": 590800 + }, + { + "epoch": 0.1541110089499121, + "grad_norm": 9.792441368103027, + "learning_rate": 9.544992846745936e-06, + "loss": 2.6275, + "step": 591000 + }, + { + "epoch": 0.15416316157561427, + "grad_norm": 10.63184642791748, + "learning_rate": 9.544644368279724e-06, + "loss": 2.6105, + "step": 591200 + }, + { + "epoch": 0.15421531420131643, + "grad_norm": 11.19427490234375, + "learning_rate": 9.544295762785464e-06, + "loss": 2.6435, + "step": 591400 + }, + { + "epoch": 0.1542674668270186, + "grad_norm": 9.695633888244629, + "learning_rate": 9.543947030272897e-06, + "loss": 2.6084, + "step": 591600 + }, + { + "epoch": 0.15431961945272077, + "grad_norm": 10.136941909790039, + "learning_rate": 9.543598170751772e-06, + "loss": 2.6144, + "step": 591800 + }, + { + "epoch": 0.15437177207842295, + "grad_norm": 10.13197135925293, + "learning_rate": 9.543249184231837e-06, + "loss": 2.6069, + "step": 592000 + }, + { + "epoch": 0.1544239247041251, + "grad_norm": 10.096322059631348, + "learning_rate": 9.54290007072285e-06, + "loss": 2.6032, + "step": 592200 + }, + { + "epoch": 0.1544760773298273, + "grad_norm": 8.74548053741455, + "learning_rate": 9.542550830234568e-06, + "loss": 2.6174, + "step": 592400 + }, + { + "epoch": 0.15452822995552945, + "grad_norm": 13.123319625854492, + "learning_rate": 9.542201462776752e-06, + "loss": 2.6112, + "step": 592600 + }, + { + "epoch": 0.15458038258123163, + "grad_norm": 11.316987037658691, + "learning_rate": 9.541851968359169e-06, + "loss": 2.6042, + "step": 592800 + }, + { + "epoch": 0.1546325352069338, + "grad_norm": 9.933026313781738, + "learning_rate": 9.541502346991586e-06, + "loss": 2.6165, + "step": 593000 + }, + { + "epoch": 0.15468468783263598, + "grad_norm": 9.549524307250977, + "learning_rate": 9.541152598683776e-06, + "loss": 2.5907, + "step": 593200 + }, + { + "epoch": 0.15473684045833813, + "grad_norm": 10.845429420471191, + "learning_rate": 9.540802723445513e-06, + "loss": 2.6287, + "step": 593400 + }, + { + "epoch": 0.15478899308404032, + "grad_norm": 9.898306846618652, + "learning_rate": 9.54045272128658e-06, + "loss": 2.6095, + "step": 593600 + }, + { + "epoch": 0.15484114570974247, + "grad_norm": 11.204236030578613, + "learning_rate": 9.540102592216757e-06, + "loss": 2.6506, + "step": 593800 + }, + { + "epoch": 0.15489329833544466, + "grad_norm": 
9.741256713867188, + "learning_rate": 9.539752336245834e-06, + "loss": 2.5979, + "step": 594000 + }, + { + "epoch": 0.1549454509611468, + "grad_norm": 9.863458633422852, + "learning_rate": 9.539401953383595e-06, + "loss": 2.559, + "step": 594200 + }, + { + "epoch": 0.154997603586849, + "grad_norm": 11.014803886413574, + "learning_rate": 9.53905144363984e-06, + "loss": 2.647, + "step": 594400 + }, + { + "epoch": 0.15504975621255115, + "grad_norm": 10.70952033996582, + "learning_rate": 9.538700807024363e-06, + "loss": 2.6143, + "step": 594600 + }, + { + "epoch": 0.15510190883825334, + "grad_norm": 10.817914962768555, + "learning_rate": 9.538350043546965e-06, + "loss": 2.5906, + "step": 594800 + }, + { + "epoch": 0.1551540614639555, + "grad_norm": 10.918020248413086, + "learning_rate": 9.53799915321745e-06, + "loss": 2.6202, + "step": 595000 + }, + { + "epoch": 0.15520621408965765, + "grad_norm": 9.967550277709961, + "learning_rate": 9.537648136045626e-06, + "loss": 2.6368, + "step": 595200 + }, + { + "epoch": 0.15525836671535984, + "grad_norm": 10.808151245117188, + "learning_rate": 9.537296992041303e-06, + "loss": 2.6267, + "step": 595400 + }, + { + "epoch": 0.155310519341062, + "grad_norm": 10.046856880187988, + "learning_rate": 9.536945721214299e-06, + "loss": 2.6376, + "step": 595600 + }, + { + "epoch": 0.15536267196676418, + "grad_norm": 10.47192668914795, + "learning_rate": 9.536594323574432e-06, + "loss": 2.5965, + "step": 595800 + }, + { + "epoch": 0.15541482459246633, + "grad_norm": 10.471516609191895, + "learning_rate": 9.536242799131522e-06, + "loss": 2.6248, + "step": 596000 + }, + { + "epoch": 0.15546697721816852, + "grad_norm": 9.41662311553955, + "learning_rate": 9.535891147895395e-06, + "loss": 2.568, + "step": 596200 + }, + { + "epoch": 0.15551912984387067, + "grad_norm": 10.728153228759766, + "learning_rate": 9.53553936987588e-06, + "loss": 2.615, + "step": 596400 + }, + { + "epoch": 0.15557128246957286, + "grad_norm": 11.101585388183594, + "learning_rate": 9.535187465082809e-06, + "loss": 2.6044, + "step": 596600 + }, + { + "epoch": 0.15562343509527501, + "grad_norm": 10.208956718444824, + "learning_rate": 9.534835433526021e-06, + "loss": 2.5869, + "step": 596800 + }, + { + "epoch": 0.1556755877209772, + "grad_norm": 9.980835914611816, + "learning_rate": 9.534483275215353e-06, + "loss": 2.6418, + "step": 597000 + }, + { + "epoch": 0.15572774034667936, + "grad_norm": 10.444533348083496, + "learning_rate": 9.534130990160649e-06, + "loss": 2.6149, + "step": 597200 + }, + { + "epoch": 0.15577989297238154, + "grad_norm": 9.33448600769043, + "learning_rate": 9.533778578371755e-06, + "loss": 2.6071, + "step": 597400 + }, + { + "epoch": 0.1558320455980837, + "grad_norm": 11.563689231872559, + "learning_rate": 9.533426039858523e-06, + "loss": 2.5906, + "step": 597600 + }, + { + "epoch": 0.15588419822378588, + "grad_norm": 10.888169288635254, + "learning_rate": 9.533073374630805e-06, + "loss": 2.6418, + "step": 597800 + }, + { + "epoch": 0.15593635084948804, + "grad_norm": 10.408832550048828, + "learning_rate": 9.53272058269846e-06, + "loss": 2.5685, + "step": 598000 + }, + { + "epoch": 0.15598850347519022, + "grad_norm": 11.165427207946777, + "learning_rate": 9.532367664071349e-06, + "loss": 2.6015, + "step": 598200 + }, + { + "epoch": 0.15604065610089238, + "grad_norm": 11.39417552947998, + "learning_rate": 9.532014618759336e-06, + "loss": 2.5874, + "step": 598400 + }, + { + "epoch": 0.15609280872659456, + "grad_norm": 10.001264572143555, + "learning_rate": 9.531661446772287e-06, 
+ "loss": 2.5927, + "step": 598600 + }, + { + "epoch": 0.15614496135229672, + "grad_norm": 10.457130432128906, + "learning_rate": 9.531308148120077e-06, + "loss": 2.5764, + "step": 598800 + }, + { + "epoch": 0.1561971139779989, + "grad_norm": 8.923357963562012, + "learning_rate": 9.530954722812577e-06, + "loss": 2.6262, + "step": 599000 + }, + { + "epoch": 0.15624926660370106, + "grad_norm": 11.424574851989746, + "learning_rate": 9.530601170859672e-06, + "loss": 2.6137, + "step": 599200 + }, + { + "epoch": 0.15630141922940324, + "grad_norm": 11.1590576171875, + "learning_rate": 9.530247492271237e-06, + "loss": 2.6372, + "step": 599400 + }, + { + "epoch": 0.1563535718551054, + "grad_norm": 10.053206443786621, + "learning_rate": 9.529893687057163e-06, + "loss": 2.6275, + "step": 599600 + }, + { + "epoch": 0.15640572448080758, + "grad_norm": 10.665011405944824, + "learning_rate": 9.529539755227336e-06, + "loss": 2.5987, + "step": 599800 + }, + { + "epoch": 0.15645787710650974, + "grad_norm": 10.702035903930664, + "learning_rate": 9.529185696791651e-06, + "loss": 2.5779, + "step": 600000 + }, + { + "epoch": 0.15651002973221192, + "grad_norm": 9.53780746459961, + "learning_rate": 9.528831511760003e-06, + "loss": 2.6361, + "step": 600200 + }, + { + "epoch": 0.15656218235791408, + "grad_norm": 11.620895385742188, + "learning_rate": 9.528477200142292e-06, + "loss": 2.6155, + "step": 600400 + }, + { + "epoch": 0.15661433498361627, + "grad_norm": 9.885120391845703, + "learning_rate": 9.528122761948421e-06, + "loss": 2.6552, + "step": 600600 + }, + { + "epoch": 0.15666648760931842, + "grad_norm": 11.59696102142334, + "learning_rate": 9.527768197188299e-06, + "loss": 2.6134, + "step": 600800 + }, + { + "epoch": 0.15671864023502058, + "grad_norm": 10.598244667053223, + "learning_rate": 9.527413505871836e-06, + "loss": 2.5894, + "step": 601000 + }, + { + "epoch": 0.15677079286072276, + "grad_norm": 11.047648429870605, + "learning_rate": 9.527058688008943e-06, + "loss": 2.6413, + "step": 601200 + }, + { + "epoch": 0.15682294548642492, + "grad_norm": 10.149199485778809, + "learning_rate": 9.526703743609543e-06, + "loss": 2.6164, + "step": 601400 + }, + { + "epoch": 0.1568750981121271, + "grad_norm": 10.866960525512695, + "learning_rate": 9.52634867268355e-06, + "loss": 2.5793, + "step": 601600 + }, + { + "epoch": 0.15692725073782926, + "grad_norm": 9.808792114257812, + "learning_rate": 9.525993475240894e-06, + "loss": 2.6329, + "step": 601800 + }, + { + "epoch": 0.15697940336353144, + "grad_norm": 9.316394805908203, + "learning_rate": 9.525638151291504e-06, + "loss": 2.6382, + "step": 602000 + }, + { + "epoch": 0.1570315559892336, + "grad_norm": 9.754195213317871, + "learning_rate": 9.525282700845305e-06, + "loss": 2.6143, + "step": 602200 + }, + { + "epoch": 0.15708370861493579, + "grad_norm": 11.504837036132812, + "learning_rate": 9.52492712391224e-06, + "loss": 2.6187, + "step": 602400 + }, + { + "epoch": 0.15713586124063794, + "grad_norm": 9.503823280334473, + "learning_rate": 9.524571420502243e-06, + "loss": 2.638, + "step": 602600 + }, + { + "epoch": 0.15718801386634013, + "grad_norm": 11.267765998840332, + "learning_rate": 9.524215590625258e-06, + "loss": 2.6037, + "step": 602800 + }, + { + "epoch": 0.15724016649204228, + "grad_norm": 12.685362815856934, + "learning_rate": 9.52385963429123e-06, + "loss": 2.5829, + "step": 603000 + }, + { + "epoch": 0.15729231911774447, + "grad_norm": 9.462133407592773, + "learning_rate": 9.523503551510112e-06, + "loss": 2.6734, + "step": 603200 + }, + { + "epoch": 
0.15734447174344662, + "grad_norm": 10.139527320861816, + "learning_rate": 9.52314734229185e-06, + "loss": 2.6098, + "step": 603400 + }, + { + "epoch": 0.1573966243691488, + "grad_norm": 10.855724334716797, + "learning_rate": 9.522791006646407e-06, + "loss": 2.5865, + "step": 603600 + }, + { + "epoch": 0.15744877699485096, + "grad_norm": 10.368106842041016, + "learning_rate": 9.522434544583738e-06, + "loss": 2.621, + "step": 603800 + }, + { + "epoch": 0.15750092962055315, + "grad_norm": 11.346565246582031, + "learning_rate": 9.522077956113812e-06, + "loss": 2.6346, + "step": 604000 + }, + { + "epoch": 0.1575530822462553, + "grad_norm": 11.265953063964844, + "learning_rate": 9.521721241246591e-06, + "loss": 2.6319, + "step": 604200 + }, + { + "epoch": 0.1576052348719575, + "grad_norm": 10.973451614379883, + "learning_rate": 9.521364399992048e-06, + "loss": 2.6154, + "step": 604400 + }, + { + "epoch": 0.15765738749765965, + "grad_norm": 10.001646995544434, + "learning_rate": 9.521007432360157e-06, + "loss": 2.5999, + "step": 604600 + }, + { + "epoch": 0.15770954012336183, + "grad_norm": 9.617718696594238, + "learning_rate": 9.520650338360896e-06, + "loss": 2.6654, + "step": 604800 + }, + { + "epoch": 0.157761692749064, + "grad_norm": 9.889549255371094, + "learning_rate": 9.520293118004243e-06, + "loss": 2.6504, + "step": 605000 + }, + { + "epoch": 0.15781384537476617, + "grad_norm": 12.019047737121582, + "learning_rate": 9.519935771300188e-06, + "loss": 2.619, + "step": 605200 + }, + { + "epoch": 0.15786599800046833, + "grad_norm": 9.76209831237793, + "learning_rate": 9.519578298258714e-06, + "loss": 2.5828, + "step": 605400 + }, + { + "epoch": 0.1579181506261705, + "grad_norm": 11.635560989379883, + "learning_rate": 9.519220698889817e-06, + "loss": 2.5842, + "step": 605600 + }, + { + "epoch": 0.15797030325187267, + "grad_norm": 11.385933876037598, + "learning_rate": 9.518862973203491e-06, + "loss": 2.57, + "step": 605800 + }, + { + "epoch": 0.15802245587757485, + "grad_norm": 10.346911430358887, + "learning_rate": 9.518505121209733e-06, + "loss": 2.6275, + "step": 606000 + }, + { + "epoch": 0.158074608503277, + "grad_norm": 11.468281745910645, + "learning_rate": 9.518147142918548e-06, + "loss": 2.5972, + "step": 606200 + }, + { + "epoch": 0.1581267611289792, + "grad_norm": 10.361491203308105, + "learning_rate": 9.517789038339939e-06, + "loss": 2.6139, + "step": 606400 + }, + { + "epoch": 0.15817891375468135, + "grad_norm": 10.77645206451416, + "learning_rate": 9.517430807483919e-06, + "loss": 2.6024, + "step": 606600 + }, + { + "epoch": 0.15823106638038353, + "grad_norm": 10.53165054321289, + "learning_rate": 9.517072450360499e-06, + "loss": 2.5576, + "step": 606800 + }, + { + "epoch": 0.1582832190060857, + "grad_norm": 10.261857986450195, + "learning_rate": 9.516713966979694e-06, + "loss": 2.6202, + "step": 607000 + }, + { + "epoch": 0.15833537163178785, + "grad_norm": 10.302119255065918, + "learning_rate": 9.516355357351528e-06, + "loss": 2.617, + "step": 607200 + }, + { + "epoch": 0.15838752425749003, + "grad_norm": 10.450231552124023, + "learning_rate": 9.515996621486021e-06, + "loss": 2.6169, + "step": 607400 + }, + { + "epoch": 0.1584396768831922, + "grad_norm": 11.078191757202148, + "learning_rate": 9.515637759393201e-06, + "loss": 2.6044, + "step": 607600 + }, + { + "epoch": 0.15849182950889437, + "grad_norm": 11.261504173278809, + "learning_rate": 9.515278771083098e-06, + "loss": 2.6508, + "step": 607800 + }, + { + "epoch": 0.15854398213459653, + "grad_norm": 11.22635555267334, + 
"learning_rate": 9.514919656565749e-06, + "loss": 2.595, + "step": 608000 + }, + { + "epoch": 0.1585961347602987, + "grad_norm": 9.812448501586914, + "learning_rate": 9.514560415851189e-06, + "loss": 2.6281, + "step": 608200 + }, + { + "epoch": 0.15864828738600087, + "grad_norm": 11.062207221984863, + "learning_rate": 9.51420104894946e-06, + "loss": 2.628, + "step": 608400 + }, + { + "epoch": 0.15870044001170305, + "grad_norm": 11.523585319519043, + "learning_rate": 9.513841555870607e-06, + "loss": 2.6367, + "step": 608600 + }, + { + "epoch": 0.1587525926374052, + "grad_norm": 9.39417839050293, + "learning_rate": 9.513481936624677e-06, + "loss": 2.63, + "step": 608800 + }, + { + "epoch": 0.1588047452631074, + "grad_norm": 10.997912406921387, + "learning_rate": 9.513122191221722e-06, + "loss": 2.5634, + "step": 609000 + }, + { + "epoch": 0.15885689788880955, + "grad_norm": 10.309552192687988, + "learning_rate": 9.5127623196718e-06, + "loss": 2.6526, + "step": 609200 + }, + { + "epoch": 0.15890905051451173, + "grad_norm": 10.835392951965332, + "learning_rate": 9.512402321984966e-06, + "loss": 2.6135, + "step": 609400 + }, + { + "epoch": 0.1589612031402139, + "grad_norm": 10.294236183166504, + "learning_rate": 9.512042198171286e-06, + "loss": 2.6536, + "step": 609600 + }, + { + "epoch": 0.15901335576591608, + "grad_norm": 11.246005058288574, + "learning_rate": 9.51168194824082e-06, + "loss": 2.6102, + "step": 609800 + }, + { + "epoch": 0.15906550839161823, + "grad_norm": 11.818621635437012, + "learning_rate": 9.511321572203645e-06, + "loss": 2.6259, + "step": 610000 + }, + { + "epoch": 0.15911766101732042, + "grad_norm": 9.672310829162598, + "learning_rate": 9.510961070069829e-06, + "loss": 2.5676, + "step": 610200 + }, + { + "epoch": 0.15916981364302257, + "grad_norm": 11.772770881652832, + "learning_rate": 9.510600441849451e-06, + "loss": 2.5989, + "step": 610400 + }, + { + "epoch": 0.15922196626872476, + "grad_norm": 10.861141204833984, + "learning_rate": 9.510239687552589e-06, + "loss": 2.5797, + "step": 610600 + }, + { + "epoch": 0.1592741188944269, + "grad_norm": 9.267687797546387, + "learning_rate": 9.509878807189327e-06, + "loss": 2.5731, + "step": 610800 + }, + { + "epoch": 0.1593262715201291, + "grad_norm": 9.538235664367676, + "learning_rate": 9.509517800769752e-06, + "loss": 2.6158, + "step": 611000 + }, + { + "epoch": 0.15937842414583125, + "grad_norm": 11.026472091674805, + "learning_rate": 9.509156668303956e-06, + "loss": 2.6308, + "step": 611200 + }, + { + "epoch": 0.15943057677153344, + "grad_norm": 10.019219398498535, + "learning_rate": 9.50879540980203e-06, + "loss": 2.6177, + "step": 611400 + }, + { + "epoch": 0.1594827293972356, + "grad_norm": 9.39755916595459, + "learning_rate": 9.508434025274074e-06, + "loss": 2.5825, + "step": 611600 + }, + { + "epoch": 0.15953488202293778, + "grad_norm": 9.156355857849121, + "learning_rate": 9.508072514730189e-06, + "loss": 2.6348, + "step": 611800 + }, + { + "epoch": 0.15958703464863994, + "grad_norm": 10.577462196350098, + "learning_rate": 9.50771087818048e-06, + "loss": 2.627, + "step": 612000 + }, + { + "epoch": 0.15963918727434212, + "grad_norm": 10.779411315917969, + "learning_rate": 9.507349115635053e-06, + "loss": 2.6185, + "step": 612200 + }, + { + "epoch": 0.15969133990004428, + "grad_norm": 12.37381362915039, + "learning_rate": 9.506987227104021e-06, + "loss": 2.6284, + "step": 612400 + }, + { + "epoch": 0.15974349252574646, + "grad_norm": 9.256758689880371, + "learning_rate": 9.5066252125975e-06, + "loss": 2.6302, + 
"step": 612600 + }, + { + "epoch": 0.15979564515144862, + "grad_norm": 10.987409591674805, + "learning_rate": 9.506263072125607e-06, + "loss": 2.5929, + "step": 612800 + }, + { + "epoch": 0.15984779777715077, + "grad_norm": 9.691329002380371, + "learning_rate": 9.505900805698468e-06, + "loss": 2.6097, + "step": 613000 + }, + { + "epoch": 0.15989995040285296, + "grad_norm": 10.040337562561035, + "learning_rate": 9.505538413326203e-06, + "loss": 2.6048, + "step": 613200 + }, + { + "epoch": 0.15995210302855511, + "grad_norm": 8.556117057800293, + "learning_rate": 9.505175895018946e-06, + "loss": 2.5706, + "step": 613400 + }, + { + "epoch": 0.1600042556542573, + "grad_norm": 11.223811149597168, + "learning_rate": 9.504813250786826e-06, + "loss": 2.5724, + "step": 613600 + }, + { + "epoch": 0.16005640827995946, + "grad_norm": 9.269017219543457, + "learning_rate": 9.504450480639984e-06, + "loss": 2.6118, + "step": 613800 + }, + { + "epoch": 0.16010856090566164, + "grad_norm": 10.954751968383789, + "learning_rate": 9.504087584588554e-06, + "loss": 2.622, + "step": 614000 + }, + { + "epoch": 0.1601607135313638, + "grad_norm": 10.984915733337402, + "learning_rate": 9.503724562642686e-06, + "loss": 2.5626, + "step": 614200 + }, + { + "epoch": 0.16021286615706598, + "grad_norm": 10.41535758972168, + "learning_rate": 9.503361414812522e-06, + "loss": 2.6036, + "step": 614400 + }, + { + "epoch": 0.16026501878276814, + "grad_norm": 9.983470916748047, + "learning_rate": 9.502998141108215e-06, + "loss": 2.6439, + "step": 614600 + }, + { + "epoch": 0.16031717140847032, + "grad_norm": 9.90550708770752, + "learning_rate": 9.502634741539916e-06, + "loss": 2.5825, + "step": 614800 + }, + { + "epoch": 0.16036932403417248, + "grad_norm": 10.278912544250488, + "learning_rate": 9.502271216117784e-06, + "loss": 2.5804, + "step": 615000 + }, + { + "epoch": 0.16042147665987466, + "grad_norm": 10.6985445022583, + "learning_rate": 9.501907564851982e-06, + "loss": 2.6131, + "step": 615200 + }, + { + "epoch": 0.16047362928557682, + "grad_norm": 10.952655792236328, + "learning_rate": 9.501543787752672e-06, + "loss": 2.6165, + "step": 615400 + }, + { + "epoch": 0.160525781911279, + "grad_norm": 10.143887519836426, + "learning_rate": 9.501179884830021e-06, + "loss": 2.6062, + "step": 615600 + }, + { + "epoch": 0.16057793453698116, + "grad_norm": 11.375021934509277, + "learning_rate": 9.500815856094204e-06, + "loss": 2.6002, + "step": 615800 + }, + { + "epoch": 0.16063008716268334, + "grad_norm": 10.76820182800293, + "learning_rate": 9.500451701555394e-06, + "loss": 2.5903, + "step": 616000 + }, + { + "epoch": 0.1606822397883855, + "grad_norm": 9.969457626342773, + "learning_rate": 9.500087421223769e-06, + "loss": 2.6216, + "step": 616200 + }, + { + "epoch": 0.16073439241408768, + "grad_norm": 11.117074012756348, + "learning_rate": 9.499723015109514e-06, + "loss": 2.6253, + "step": 616400 + }, + { + "epoch": 0.16078654503978984, + "grad_norm": 10.098209381103516, + "learning_rate": 9.499358483222809e-06, + "loss": 2.6172, + "step": 616600 + }, + { + "epoch": 0.16083869766549203, + "grad_norm": 10.3887939453125, + "learning_rate": 9.49899382557385e-06, + "loss": 2.619, + "step": 616800 + }, + { + "epoch": 0.16089085029119418, + "grad_norm": 11.24229621887207, + "learning_rate": 9.498629042172824e-06, + "loss": 2.6362, + "step": 617000 + }, + { + "epoch": 0.16094300291689637, + "grad_norm": 11.60924243927002, + "learning_rate": 9.498264133029928e-06, + "loss": 2.6756, + "step": 617200 + }, + { + "epoch": 0.16099515554259852, + 
"grad_norm": 11.145500183105469, + "learning_rate": 9.497899098155364e-06, + "loss": 2.6094, + "step": 617400 + }, + { + "epoch": 0.1610473081683007, + "grad_norm": 11.344988822937012, + "learning_rate": 9.497533937559335e-06, + "loss": 2.5854, + "step": 617600 + }, + { + "epoch": 0.16109946079400286, + "grad_norm": 11.063643455505371, + "learning_rate": 9.497168651252044e-06, + "loss": 2.6057, + "step": 617800 + }, + { + "epoch": 0.16115161341970505, + "grad_norm": 10.891105651855469, + "learning_rate": 9.496803239243706e-06, + "loss": 2.5996, + "step": 618000 + }, + { + "epoch": 0.1612037660454072, + "grad_norm": 10.276857376098633, + "learning_rate": 9.496437701544533e-06, + "loss": 2.6236, + "step": 618200 + }, + { + "epoch": 0.1612559186711094, + "grad_norm": 10.791783332824707, + "learning_rate": 9.49607203816474e-06, + "loss": 2.6193, + "step": 618400 + }, + { + "epoch": 0.16130807129681154, + "grad_norm": 9.532054901123047, + "learning_rate": 9.495706249114549e-06, + "loss": 2.5948, + "step": 618600 + }, + { + "epoch": 0.1613602239225137, + "grad_norm": 9.186354637145996, + "learning_rate": 9.495340334404186e-06, + "loss": 2.595, + "step": 618800 + }, + { + "epoch": 0.16141237654821589, + "grad_norm": 9.525355339050293, + "learning_rate": 9.494974294043878e-06, + "loss": 2.551, + "step": 619000 + }, + { + "epoch": 0.16146452917391804, + "grad_norm": 9.88534164428711, + "learning_rate": 9.494608128043852e-06, + "loss": 2.5548, + "step": 619200 + }, + { + "epoch": 0.16151668179962023, + "grad_norm": 11.86941146850586, + "learning_rate": 9.49424183641435e-06, + "loss": 2.6126, + "step": 619400 + }, + { + "epoch": 0.16156883442532238, + "grad_norm": 9.518510818481445, + "learning_rate": 9.493875419165605e-06, + "loss": 2.6217, + "step": 619600 + }, + { + "epoch": 0.16162098705102457, + "grad_norm": 11.503427505493164, + "learning_rate": 9.49350887630786e-06, + "loss": 2.6139, + "step": 619800 + }, + { + "epoch": 0.16167313967672672, + "grad_norm": 11.120110511779785, + "learning_rate": 9.493142207851362e-06, + "loss": 2.6086, + "step": 620000 + }, + { + "epoch": 0.1617252923024289, + "grad_norm": 9.80033016204834, + "learning_rate": 9.492775413806357e-06, + "loss": 2.5888, + "step": 620200 + }, + { + "epoch": 0.16177744492813106, + "grad_norm": 9.526128768920898, + "learning_rate": 9.4924084941831e-06, + "loss": 2.5898, + "step": 620400 + }, + { + "epoch": 0.16182959755383325, + "grad_norm": 9.477727890014648, + "learning_rate": 9.492041448991845e-06, + "loss": 2.5997, + "step": 620600 + }, + { + "epoch": 0.1618817501795354, + "grad_norm": 11.579855918884277, + "learning_rate": 9.491674278242852e-06, + "loss": 2.6183, + "step": 620800 + }, + { + "epoch": 0.1619339028052376, + "grad_norm": 10.910771369934082, + "learning_rate": 9.491306981946385e-06, + "loss": 2.5857, + "step": 621000 + }, + { + "epoch": 0.16198605543093975, + "grad_norm": 10.065752983093262, + "learning_rate": 9.490939560112709e-06, + "loss": 2.6177, + "step": 621200 + }, + { + "epoch": 0.16203820805664193, + "grad_norm": 11.171086311340332, + "learning_rate": 9.490572012752093e-06, + "loss": 2.5936, + "step": 621400 + }, + { + "epoch": 0.1620903606823441, + "grad_norm": 11.971680641174316, + "learning_rate": 9.490204339874811e-06, + "loss": 2.6105, + "step": 621600 + }, + { + "epoch": 0.16214251330804627, + "grad_norm": 11.462523460388184, + "learning_rate": 9.489836541491142e-06, + "loss": 2.571, + "step": 621800 + }, + { + "epoch": 0.16219466593374843, + "grad_norm": 10.30375862121582, + "learning_rate": 
9.489468617611363e-06, + "loss": 2.6248, + "step": 622000 + }, + { + "epoch": 0.1622468185594506, + "grad_norm": 12.754924774169922, + "learning_rate": 9.48910056824576e-06, + "loss": 2.626, + "step": 622200 + }, + { + "epoch": 0.16229897118515277, + "grad_norm": 10.41318416595459, + "learning_rate": 9.48873239340462e-06, + "loss": 2.6197, + "step": 622400 + }, + { + "epoch": 0.16235112381085495, + "grad_norm": 10.452898025512695, + "learning_rate": 9.488364093098235e-06, + "loss": 2.6034, + "step": 622600 + }, + { + "epoch": 0.1624032764365571, + "grad_norm": 10.983317375183105, + "learning_rate": 9.487995667336897e-06, + "loss": 2.6084, + "step": 622800 + }, + { + "epoch": 0.1624554290622593, + "grad_norm": 12.018777847290039, + "learning_rate": 9.487627116130907e-06, + "loss": 2.5987, + "step": 623000 + }, + { + "epoch": 0.16250758168796145, + "grad_norm": 10.180075645446777, + "learning_rate": 9.487258439490563e-06, + "loss": 2.578, + "step": 623200 + }, + { + "epoch": 0.16255973431366363, + "grad_norm": 10.785030364990234, + "learning_rate": 9.486889637426171e-06, + "loss": 2.5835, + "step": 623400 + }, + { + "epoch": 0.1626118869393658, + "grad_norm": 12.28320026397705, + "learning_rate": 9.486520709948042e-06, + "loss": 2.5512, + "step": 623600 + }, + { + "epoch": 0.16266403956506797, + "grad_norm": 10.83224868774414, + "learning_rate": 9.486151657066485e-06, + "loss": 2.621, + "step": 623800 + }, + { + "epoch": 0.16271619219077013, + "grad_norm": 11.643034934997559, + "learning_rate": 9.485782478791817e-06, + "loss": 2.6352, + "step": 624000 + }, + { + "epoch": 0.16276834481647232, + "grad_norm": 11.283828735351562, + "learning_rate": 9.485413175134355e-06, + "loss": 2.6173, + "step": 624200 + }, + { + "epoch": 0.16282049744217447, + "grad_norm": 11.758881568908691, + "learning_rate": 9.485043746104424e-06, + "loss": 2.6289, + "step": 624400 + }, + { + "epoch": 0.16287265006787663, + "grad_norm": 10.56377124786377, + "learning_rate": 9.484674191712348e-06, + "loss": 2.5545, + "step": 624600 + }, + { + "epoch": 0.1629248026935788, + "grad_norm": 10.32989501953125, + "learning_rate": 9.48430451196846e-06, + "loss": 2.5994, + "step": 624800 + }, + { + "epoch": 0.16297695531928097, + "grad_norm": 10.324919700622559, + "learning_rate": 9.483934706883086e-06, + "loss": 2.6167, + "step": 625000 + }, + { + "epoch": 0.16302910794498315, + "grad_norm": 10.184969902038574, + "learning_rate": 9.48356477646657e-06, + "loss": 2.5771, + "step": 625200 + }, + { + "epoch": 0.1630812605706853, + "grad_norm": 10.589273452758789, + "learning_rate": 9.483194720729247e-06, + "loss": 2.6033, + "step": 625400 + }, + { + "epoch": 0.1631334131963875, + "grad_norm": 10.217822074890137, + "learning_rate": 9.482824539681463e-06, + "loss": 2.6012, + "step": 625600 + }, + { + "epoch": 0.16318556582208965, + "grad_norm": 10.657920837402344, + "learning_rate": 9.482454233333564e-06, + "loss": 2.6132, + "step": 625800 + }, + { + "epoch": 0.16323771844779184, + "grad_norm": 12.07504653930664, + "learning_rate": 9.482083801695902e-06, + "loss": 2.6208, + "step": 626000 + }, + { + "epoch": 0.163289871073494, + "grad_norm": 10.118162155151367, + "learning_rate": 9.481713244778829e-06, + "loss": 2.6053, + "step": 626200 + }, + { + "epoch": 0.16334202369919618, + "grad_norm": 10.543754577636719, + "learning_rate": 9.481342562592702e-06, + "loss": 2.6122, + "step": 626400 + }, + { + "epoch": 0.16339417632489833, + "grad_norm": 11.980351448059082, + "learning_rate": 9.480971755147884e-06, + "loss": 2.5896, + "step": 626600 
+ }, + { + "epoch": 0.16344632895060052, + "grad_norm": 9.149388313293457, + "learning_rate": 9.48060082245474e-06, + "loss": 2.62, + "step": 626800 + }, + { + "epoch": 0.16349848157630267, + "grad_norm": 10.801803588867188, + "learning_rate": 9.480229764523634e-06, + "loss": 2.5989, + "step": 627000 + }, + { + "epoch": 0.16355063420200486, + "grad_norm": 9.936863899230957, + "learning_rate": 9.479858581364942e-06, + "loss": 2.5892, + "step": 627200 + }, + { + "epoch": 0.16360278682770701, + "grad_norm": 12.720370292663574, + "learning_rate": 9.479487272989036e-06, + "loss": 2.6159, + "step": 627400 + }, + { + "epoch": 0.1636549394534092, + "grad_norm": 8.66869831085205, + "learning_rate": 9.479115839406296e-06, + "loss": 2.6255, + "step": 627600 + }, + { + "epoch": 0.16370709207911135, + "grad_norm": 11.05100154876709, + "learning_rate": 9.478744280627106e-06, + "loss": 2.6127, + "step": 627800 + }, + { + "epoch": 0.16375924470481354, + "grad_norm": 11.327431678771973, + "learning_rate": 9.478372596661847e-06, + "loss": 2.6414, + "step": 628000 + }, + { + "epoch": 0.1638113973305157, + "grad_norm": 10.757742881774902, + "learning_rate": 9.47800078752091e-06, + "loss": 2.6034, + "step": 628200 + }, + { + "epoch": 0.16386354995621788, + "grad_norm": 9.70084285736084, + "learning_rate": 9.477628853214689e-06, + "loss": 2.582, + "step": 628400 + }, + { + "epoch": 0.16391570258192004, + "grad_norm": 11.229771614074707, + "learning_rate": 9.477256793753578e-06, + "loss": 2.6069, + "step": 628600 + }, + { + "epoch": 0.16396785520762222, + "grad_norm": 10.731244087219238, + "learning_rate": 9.476884609147976e-06, + "loss": 2.5613, + "step": 628800 + }, + { + "epoch": 0.16402000783332438, + "grad_norm": 9.405749320983887, + "learning_rate": 9.476512299408287e-06, + "loss": 2.5655, + "step": 629000 + }, + { + "epoch": 0.16407216045902656, + "grad_norm": 12.480544090270996, + "learning_rate": 9.47613986454492e-06, + "loss": 2.6047, + "step": 629200 + }, + { + "epoch": 0.16412431308472872, + "grad_norm": 10.722823143005371, + "learning_rate": 9.47576730456828e-06, + "loss": 2.5775, + "step": 629400 + }, + { + "epoch": 0.1641764657104309, + "grad_norm": 10.385933876037598, + "learning_rate": 9.475394619488785e-06, + "loss": 2.6002, + "step": 629600 + }, + { + "epoch": 0.16422861833613306, + "grad_norm": 8.670275688171387, + "learning_rate": 9.47502180931685e-06, + "loss": 2.552, + "step": 629800 + }, + { + "epoch": 0.16428077096183524, + "grad_norm": 11.322480201721191, + "learning_rate": 9.474648874062894e-06, + "loss": 2.6183, + "step": 630000 + }, + { + "epoch": 0.1643329235875374, + "grad_norm": 10.096075057983398, + "learning_rate": 9.474275813737344e-06, + "loss": 2.582, + "step": 630200 + }, + { + "epoch": 0.16438507621323958, + "grad_norm": 10.714835166931152, + "learning_rate": 9.473902628350624e-06, + "loss": 2.6248, + "step": 630400 + }, + { + "epoch": 0.16443722883894174, + "grad_norm": 9.770901679992676, + "learning_rate": 9.473529317913169e-06, + "loss": 2.6214, + "step": 630600 + }, + { + "epoch": 0.1644893814646439, + "grad_norm": 11.455342292785645, + "learning_rate": 9.47315588243541e-06, + "loss": 2.5787, + "step": 630800 + }, + { + "epoch": 0.16454153409034608, + "grad_norm": 11.288113594055176, + "learning_rate": 9.472782321927786e-06, + "loss": 2.6009, + "step": 631000 + }, + { + "epoch": 0.16459368671604824, + "grad_norm": 9.5037260055542, + "learning_rate": 9.472408636400742e-06, + "loss": 2.579, + "step": 631200 + }, + { + "epoch": 0.16464583934175042, + "grad_norm": 
11.143527030944824, + "learning_rate": 9.472034825864718e-06, + "loss": 2.585, + "step": 631400 + }, + { + "epoch": 0.16469799196745258, + "grad_norm": 10.50413703918457, + "learning_rate": 9.471660890330163e-06, + "loss": 2.5971, + "step": 631600 + }, + { + "epoch": 0.16475014459315476, + "grad_norm": 11.185114860534668, + "learning_rate": 9.471286829807531e-06, + "loss": 2.6326, + "step": 631800 + }, + { + "epoch": 0.16480229721885692, + "grad_norm": 10.260226249694824, + "learning_rate": 9.470912644307276e-06, + "loss": 2.6244, + "step": 632000 + }, + { + "epoch": 0.1648544498445591, + "grad_norm": 11.488462448120117, + "learning_rate": 9.470538333839858e-06, + "loss": 2.598, + "step": 632200 + }, + { + "epoch": 0.16490660247026126, + "grad_norm": 11.961732864379883, + "learning_rate": 9.470163898415738e-06, + "loss": 2.5907, + "step": 632400 + }, + { + "epoch": 0.16495875509596344, + "grad_norm": 12.416627883911133, + "learning_rate": 9.469789338045383e-06, + "loss": 2.5822, + "step": 632600 + }, + { + "epoch": 0.1650109077216656, + "grad_norm": 10.282339096069336, + "learning_rate": 9.469414652739262e-06, + "loss": 2.5707, + "step": 632800 + }, + { + "epoch": 0.16506306034736778, + "grad_norm": 10.362985610961914, + "learning_rate": 9.46903984250785e-06, + "loss": 2.6038, + "step": 633000 + }, + { + "epoch": 0.16511521297306994, + "grad_norm": 11.942998886108398, + "learning_rate": 9.468664907361618e-06, + "loss": 2.6165, + "step": 633200 + }, + { + "epoch": 0.16516736559877213, + "grad_norm": 11.504192352294922, + "learning_rate": 9.468289847311052e-06, + "loss": 2.6164, + "step": 633400 + }, + { + "epoch": 0.16521951822447428, + "grad_norm": 11.131896018981934, + "learning_rate": 9.467914662366632e-06, + "loss": 2.5774, + "step": 633600 + }, + { + "epoch": 0.16527167085017647, + "grad_norm": 10.091717720031738, + "learning_rate": 9.467539352538845e-06, + "loss": 2.6165, + "step": 633800 + }, + { + "epoch": 0.16532382347587862, + "grad_norm": 9.936517715454102, + "learning_rate": 9.467163917838183e-06, + "loss": 2.5712, + "step": 634000 + }, + { + "epoch": 0.1653759761015808, + "grad_norm": 11.358585357666016, + "learning_rate": 9.466788358275136e-06, + "loss": 2.5891, + "step": 634200 + }, + { + "epoch": 0.16542812872728296, + "grad_norm": 10.875038146972656, + "learning_rate": 9.466412673860206e-06, + "loss": 2.6178, + "step": 634400 + }, + { + "epoch": 0.16548028135298515, + "grad_norm": 10.750335693359375, + "learning_rate": 9.466036864603893e-06, + "loss": 2.6026, + "step": 634600 + }, + { + "epoch": 0.1655324339786873, + "grad_norm": 9.322654724121094, + "learning_rate": 9.465660930516698e-06, + "loss": 2.5806, + "step": 634800 + }, + { + "epoch": 0.1655845866043895, + "grad_norm": 11.006738662719727, + "learning_rate": 9.465284871609132e-06, + "loss": 2.5799, + "step": 635000 + }, + { + "epoch": 0.16563673923009165, + "grad_norm": 11.173583030700684, + "learning_rate": 9.464908687891704e-06, + "loss": 2.6159, + "step": 635200 + }, + { + "epoch": 0.16568889185579383, + "grad_norm": 9.691333770751953, + "learning_rate": 9.46453237937493e-06, + "loss": 2.5984, + "step": 635400 + }, + { + "epoch": 0.16574104448149599, + "grad_norm": 10.239128112792969, + "learning_rate": 9.464155946069329e-06, + "loss": 2.6037, + "step": 635600 + }, + { + "epoch": 0.16579319710719817, + "grad_norm": 11.106518745422363, + "learning_rate": 9.463779387985422e-06, + "loss": 2.5693, + "step": 635800 + }, + { + "epoch": 0.16584534973290033, + "grad_norm": 11.097664833068848, + "learning_rate": 
9.463402705133735e-06, + "loss": 2.567, + "step": 636000 + }, + { + "epoch": 0.1658975023586025, + "grad_norm": 10.165889739990234, + "learning_rate": 9.463025897524794e-06, + "loss": 2.6039, + "step": 636200 + }, + { + "epoch": 0.16594965498430467, + "grad_norm": 10.918819427490234, + "learning_rate": 9.462648965169133e-06, + "loss": 2.6199, + "step": 636400 + }, + { + "epoch": 0.16600180761000682, + "grad_norm": 10.222068786621094, + "learning_rate": 9.46227190807729e-06, + "loss": 2.5954, + "step": 636600 + }, + { + "epoch": 0.166053960235709, + "grad_norm": 10.973430633544922, + "learning_rate": 9.4618947262598e-06, + "loss": 2.5924, + "step": 636800 + }, + { + "epoch": 0.16610611286141116, + "grad_norm": 12.050614356994629, + "learning_rate": 9.461517419727208e-06, + "loss": 2.5885, + "step": 637000 + }, + { + "epoch": 0.16615826548711335, + "grad_norm": 10.913813591003418, + "learning_rate": 9.46113998849006e-06, + "loss": 2.5929, + "step": 637200 + }, + { + "epoch": 0.1662104181128155, + "grad_norm": 11.964225769042969, + "learning_rate": 9.460762432558905e-06, + "loss": 2.5462, + "step": 637400 + }, + { + "epoch": 0.1662625707385177, + "grad_norm": 12.226649284362793, + "learning_rate": 9.460384751944298e-06, + "loss": 2.5944, + "step": 637600 + }, + { + "epoch": 0.16631472336421985, + "grad_norm": 11.865588188171387, + "learning_rate": 9.460006946656794e-06, + "loss": 2.5778, + "step": 637800 + }, + { + "epoch": 0.16636687598992203, + "grad_norm": 10.3606538772583, + "learning_rate": 9.459629016706952e-06, + "loss": 2.5963, + "step": 638000 + }, + { + "epoch": 0.1664190286156242, + "grad_norm": 9.922835350036621, + "learning_rate": 9.459250962105337e-06, + "loss": 2.606, + "step": 638200 + }, + { + "epoch": 0.16647118124132637, + "grad_norm": 11.583536148071289, + "learning_rate": 9.458872782862516e-06, + "loss": 2.5747, + "step": 638400 + }, + { + "epoch": 0.16652333386702853, + "grad_norm": 9.941807746887207, + "learning_rate": 9.458494478989059e-06, + "loss": 2.5838, + "step": 638600 + }, + { + "epoch": 0.1665754864927307, + "grad_norm": 11.269411087036133, + "learning_rate": 9.458116050495543e-06, + "loss": 2.6033, + "step": 638800 + }, + { + "epoch": 0.16662763911843287, + "grad_norm": 11.465824127197266, + "learning_rate": 9.457737497392541e-06, + "loss": 2.5794, + "step": 639000 + }, + { + "epoch": 0.16667979174413505, + "grad_norm": 11.089844703674316, + "learning_rate": 9.457358819690636e-06, + "loss": 2.591, + "step": 639200 + }, + { + "epoch": 0.1667319443698372, + "grad_norm": 11.80525016784668, + "learning_rate": 9.456980017400413e-06, + "loss": 2.5967, + "step": 639400 + }, + { + "epoch": 0.1667840969955394, + "grad_norm": 11.84639835357666, + "learning_rate": 9.456601090532458e-06, + "loss": 2.5895, + "step": 639600 + }, + { + "epoch": 0.16683624962124155, + "grad_norm": 10.74103832244873, + "learning_rate": 9.456222039097365e-06, + "loss": 2.5974, + "step": 639800 + }, + { + "epoch": 0.16688840224694373, + "grad_norm": 11.63347053527832, + "learning_rate": 9.455842863105728e-06, + "loss": 2.6094, + "step": 640000 + }, + { + "epoch": 0.1669405548726459, + "grad_norm": 10.795598983764648, + "learning_rate": 9.455463562568143e-06, + "loss": 2.585, + "step": 640200 + }, + { + "epoch": 0.16699270749834808, + "grad_norm": 10.217510223388672, + "learning_rate": 9.455084137495216e-06, + "loss": 2.6029, + "step": 640400 + }, + { + "epoch": 0.16704486012405023, + "grad_norm": 11.189576148986816, + "learning_rate": 9.454704587897553e-06, + "loss": 2.6158, + "step": 640600 + 
}, + { + "epoch": 0.16709701274975242, + "grad_norm": 12.054612159729004, + "learning_rate": 9.454324913785756e-06, + "loss": 2.5981, + "step": 640800 + }, + { + "epoch": 0.16714916537545457, + "grad_norm": 10.436782836914062, + "learning_rate": 9.453945115170444e-06, + "loss": 2.5662, + "step": 641000 + }, + { + "epoch": 0.16720131800115676, + "grad_norm": 8.611639022827148, + "learning_rate": 9.453565192062228e-06, + "loss": 2.5866, + "step": 641200 + }, + { + "epoch": 0.1672534706268589, + "grad_norm": 12.085579872131348, + "learning_rate": 9.453185144471732e-06, + "loss": 2.5935, + "step": 641400 + }, + { + "epoch": 0.1673056232525611, + "grad_norm": 12.087554931640625, + "learning_rate": 9.452804972409576e-06, + "loss": 2.6145, + "step": 641600 + }, + { + "epoch": 0.16735777587826325, + "grad_norm": 11.656718254089355, + "learning_rate": 9.452424675886386e-06, + "loss": 2.6006, + "step": 641800 + }, + { + "epoch": 0.16740992850396544, + "grad_norm": 9.5364351272583, + "learning_rate": 9.452044254912794e-06, + "loss": 2.638, + "step": 642000 + }, + { + "epoch": 0.1674620811296676, + "grad_norm": 9.897908210754395, + "learning_rate": 9.45166370949943e-06, + "loss": 2.5881, + "step": 642200 + }, + { + "epoch": 0.16751423375536975, + "grad_norm": 10.919352531433105, + "learning_rate": 9.451283039656933e-06, + "loss": 2.5761, + "step": 642400 + }, + { + "epoch": 0.16756638638107194, + "grad_norm": 11.361388206481934, + "learning_rate": 9.450902245395943e-06, + "loss": 2.6329, + "step": 642600 + }, + { + "epoch": 0.1676185390067741, + "grad_norm": 11.634418487548828, + "learning_rate": 9.450521326727104e-06, + "loss": 2.5698, + "step": 642800 + }, + { + "epoch": 0.16767069163247628, + "grad_norm": 11.556270599365234, + "learning_rate": 9.45014028366106e-06, + "loss": 2.5883, + "step": 643000 + }, + { + "epoch": 0.16772284425817843, + "grad_norm": 11.184852600097656, + "learning_rate": 9.449759116208465e-06, + "loss": 2.6441, + "step": 643200 + }, + { + "epoch": 0.16777499688388062, + "grad_norm": 10.851284980773926, + "learning_rate": 9.449377824379973e-06, + "loss": 2.5469, + "step": 643400 + }, + { + "epoch": 0.16782714950958277, + "grad_norm": 11.684367179870605, + "learning_rate": 9.448996408186239e-06, + "loss": 2.6165, + "step": 643600 + }, + { + "epoch": 0.16787930213528496, + "grad_norm": 11.91071891784668, + "learning_rate": 9.448614867637926e-06, + "loss": 2.6167, + "step": 643800 + }, + { + "epoch": 0.16793145476098711, + "grad_norm": 10.489933013916016, + "learning_rate": 9.448233202745699e-06, + "loss": 2.5944, + "step": 644000 + }, + { + "epoch": 0.1679836073866893, + "grad_norm": 9.974836349487305, + "learning_rate": 9.447851413520222e-06, + "loss": 2.5532, + "step": 644200 + }, + { + "epoch": 0.16803576001239146, + "grad_norm": 12.725610733032227, + "learning_rate": 9.447469499972172e-06, + "loss": 2.5883, + "step": 644400 + }, + { + "epoch": 0.16808791263809364, + "grad_norm": 10.390010833740234, + "learning_rate": 9.447087462112222e-06, + "loss": 2.6363, + "step": 644600 + }, + { + "epoch": 0.1681400652637958, + "grad_norm": 11.040779113769531, + "learning_rate": 9.446705299951047e-06, + "loss": 2.5922, + "step": 644800 + }, + { + "epoch": 0.16819221788949798, + "grad_norm": 10.456809997558594, + "learning_rate": 9.446323013499333e-06, + "loss": 2.587, + "step": 645000 + }, + { + "epoch": 0.16824437051520014, + "grad_norm": 12.05203628540039, + "learning_rate": 9.445940602767764e-06, + "loss": 2.6597, + "step": 645200 + }, + { + "epoch": 0.16829652314090232, + "grad_norm": 
12.002357482910156, + "learning_rate": 9.445558067767028e-06, + "loss": 2.6082, + "step": 645400 + }, + { + "epoch": 0.16834867576660448, + "grad_norm": 11.387803077697754, + "learning_rate": 9.445175408507818e-06, + "loss": 2.6126, + "step": 645600 + }, + { + "epoch": 0.16840082839230666, + "grad_norm": 11.261042594909668, + "learning_rate": 9.444792625000832e-06, + "loss": 2.6011, + "step": 645800 + }, + { + "epoch": 0.16845298101800882, + "grad_norm": 10.792635917663574, + "learning_rate": 9.444409717256766e-06, + "loss": 2.6017, + "step": 646000 + }, + { + "epoch": 0.168505133643711, + "grad_norm": 10.310630798339844, + "learning_rate": 9.444026685286323e-06, + "loss": 2.6158, + "step": 646200 + }, + { + "epoch": 0.16855728626941316, + "grad_norm": 11.336012840270996, + "learning_rate": 9.443643529100211e-06, + "loss": 2.5637, + "step": 646400 + }, + { + "epoch": 0.16860943889511534, + "grad_norm": 10.27746295928955, + "learning_rate": 9.443260248709138e-06, + "loss": 2.5942, + "step": 646600 + }, + { + "epoch": 0.1686615915208175, + "grad_norm": 11.652708053588867, + "learning_rate": 9.442876844123818e-06, + "loss": 2.5802, + "step": 646800 + }, + { + "epoch": 0.16871374414651968, + "grad_norm": 10.635597229003906, + "learning_rate": 9.442493315354966e-06, + "loss": 2.6022, + "step": 647000 + }, + { + "epoch": 0.16876589677222184, + "grad_norm": 12.215426445007324, + "learning_rate": 9.442109662413306e-06, + "loss": 2.603, + "step": 647200 + }, + { + "epoch": 0.16881804939792402, + "grad_norm": 10.632146835327148, + "learning_rate": 9.441725885309558e-06, + "loss": 2.5967, + "step": 647400 + }, + { + "epoch": 0.16887020202362618, + "grad_norm": 12.226157188415527, + "learning_rate": 9.441341984054448e-06, + "loss": 2.6191, + "step": 647600 + }, + { + "epoch": 0.16892235464932837, + "grad_norm": 11.939155578613281, + "learning_rate": 9.440957958658712e-06, + "loss": 2.5977, + "step": 647800 + }, + { + "epoch": 0.16897450727503052, + "grad_norm": 11.38477611541748, + "learning_rate": 9.440573809133078e-06, + "loss": 2.6372, + "step": 648000 + }, + { + "epoch": 0.16902665990073268, + "grad_norm": 10.353914260864258, + "learning_rate": 9.440189535488286e-06, + "loss": 2.6133, + "step": 648200 + }, + { + "epoch": 0.16907881252643486, + "grad_norm": 10.853445053100586, + "learning_rate": 9.439805137735078e-06, + "loss": 2.5439, + "step": 648400 + }, + { + "epoch": 0.16913096515213702, + "grad_norm": 10.450331687927246, + "learning_rate": 9.439420615884196e-06, + "loss": 2.6195, + "step": 648600 + }, + { + "epoch": 0.1691831177778392, + "grad_norm": 11.067255973815918, + "learning_rate": 9.43903596994639e-06, + "loss": 2.5792, + "step": 648800 + }, + { + "epoch": 0.16923527040354136, + "grad_norm": 10.674095153808594, + "learning_rate": 9.43865119993241e-06, + "loss": 2.5886, + "step": 649000 + }, + { + "epoch": 0.16928742302924354, + "grad_norm": 11.1282377243042, + "learning_rate": 9.43826630585301e-06, + "loss": 2.5493, + "step": 649200 + }, + { + "epoch": 0.1693395756549457, + "grad_norm": 11.285887718200684, + "learning_rate": 9.43788128771895e-06, + "loss": 2.5827, + "step": 649400 + }, + { + "epoch": 0.16939172828064789, + "grad_norm": 12.13648509979248, + "learning_rate": 9.43749614554099e-06, + "loss": 2.589, + "step": 649600 + }, + { + "epoch": 0.16944388090635004, + "grad_norm": 11.782530784606934, + "learning_rate": 9.437110879329897e-06, + "loss": 2.5266, + "step": 649800 + }, + { + "epoch": 0.16949603353205223, + "grad_norm": 10.601338386535645, + "learning_rate": 
9.436725489096438e-06, + "loss": 2.6385, + "step": 650000 + }, + { + "epoch": 0.16954818615775438, + "grad_norm": 10.485753059387207, + "learning_rate": 9.436339974851388e-06, + "loss": 2.5673, + "step": 650200 + }, + { + "epoch": 0.16960033878345657, + "grad_norm": 10.410168647766113, + "learning_rate": 9.435954336605518e-06, + "loss": 2.6, + "step": 650400 + }, + { + "epoch": 0.16965249140915872, + "grad_norm": 11.205649375915527, + "learning_rate": 9.43556857436961e-06, + "loss": 2.5582, + "step": 650600 + }, + { + "epoch": 0.1697046440348609, + "grad_norm": 11.990690231323242, + "learning_rate": 9.435182688154447e-06, + "loss": 2.6067, + "step": 650800 + }, + { + "epoch": 0.16975679666056306, + "grad_norm": 12.044350624084473, + "learning_rate": 9.434796677970813e-06, + "loss": 2.5565, + "step": 651000 + }, + { + "epoch": 0.16980894928626525, + "grad_norm": 12.324272155761719, + "learning_rate": 9.434410543829497e-06, + "loss": 2.5784, + "step": 651200 + }, + { + "epoch": 0.1698611019119674, + "grad_norm": 10.339320182800293, + "learning_rate": 9.434024285741295e-06, + "loss": 2.5863, + "step": 651400 + }, + { + "epoch": 0.1699132545376696, + "grad_norm": 10.739986419677734, + "learning_rate": 9.433637903717e-06, + "loss": 2.6013, + "step": 651600 + }, + { + "epoch": 0.16996540716337175, + "grad_norm": 11.532021522521973, + "learning_rate": 9.433251397767415e-06, + "loss": 2.5864, + "step": 651800 + }, + { + "epoch": 0.17001755978907393, + "grad_norm": 11.322754859924316, + "learning_rate": 9.432864767903342e-06, + "loss": 2.5692, + "step": 652000 + }, + { + "epoch": 0.1700697124147761, + "grad_norm": 10.201240539550781, + "learning_rate": 9.432478014135587e-06, + "loss": 2.579, + "step": 652200 + }, + { + "epoch": 0.17012186504047827, + "grad_norm": 10.831350326538086, + "learning_rate": 9.432091136474961e-06, + "loss": 2.5776, + "step": 652400 + }, + { + "epoch": 0.17017401766618043, + "grad_norm": 10.44613265991211, + "learning_rate": 9.431704134932276e-06, + "loss": 2.5915, + "step": 652600 + }, + { + "epoch": 0.1702261702918826, + "grad_norm": 10.422142028808594, + "learning_rate": 9.431317009518352e-06, + "loss": 2.5365, + "step": 652800 + }, + { + "epoch": 0.17027832291758477, + "grad_norm": 12.878448486328125, + "learning_rate": 9.430929760244007e-06, + "loss": 2.6059, + "step": 653000 + }, + { + "epoch": 0.17033047554328695, + "grad_norm": 10.190254211425781, + "learning_rate": 9.430542387120068e-06, + "loss": 2.5466, + "step": 653200 + }, + { + "epoch": 0.1703826281689891, + "grad_norm": 11.413830757141113, + "learning_rate": 9.430154890157359e-06, + "loss": 2.5667, + "step": 653400 + }, + { + "epoch": 0.1704347807946913, + "grad_norm": 9.806634902954102, + "learning_rate": 9.429767269366712e-06, + "loss": 2.5951, + "step": 653600 + }, + { + "epoch": 0.17048693342039345, + "grad_norm": 11.987215042114258, + "learning_rate": 9.429379524758963e-06, + "loss": 2.5993, + "step": 653800 + }, + { + "epoch": 0.1705390860460956, + "grad_norm": 11.944783210754395, + "learning_rate": 9.42899165634495e-06, + "loss": 2.6648, + "step": 654000 + }, + { + "epoch": 0.1705912386717978, + "grad_norm": 10.974967956542969, + "learning_rate": 9.428603664135511e-06, + "loss": 2.5808, + "step": 654200 + }, + { + "epoch": 0.17064339129749995, + "grad_norm": 11.658656120300293, + "learning_rate": 9.428215548141495e-06, + "loss": 2.5701, + "step": 654400 + }, + { + "epoch": 0.17069554392320213, + "grad_norm": 10.598328590393066, + "learning_rate": 9.427827308373747e-06, + "loss": 2.6166, + "step": 
654600 + }, + { + "epoch": 0.1707476965489043, + "grad_norm": 11.660055160522461, + "learning_rate": 9.427438944843121e-06, + "loss": 2.6142, + "step": 654800 + }, + { + "epoch": 0.17079984917460647, + "grad_norm": 11.008819580078125, + "learning_rate": 9.427050457560472e-06, + "loss": 2.6348, + "step": 655000 + }, + { + "epoch": 0.17085200180030863, + "grad_norm": 10.61733341217041, + "learning_rate": 9.42666184653666e-06, + "loss": 2.6307, + "step": 655200 + }, + { + "epoch": 0.1709041544260108, + "grad_norm": 10.868151664733887, + "learning_rate": 9.426273111782543e-06, + "loss": 2.5962, + "step": 655400 + }, + { + "epoch": 0.17095630705171297, + "grad_norm": 11.051012992858887, + "learning_rate": 9.425884253308988e-06, + "loss": 2.625, + "step": 655600 + }, + { + "epoch": 0.17100845967741515, + "grad_norm": 11.177716255187988, + "learning_rate": 9.425495271126865e-06, + "loss": 2.5808, + "step": 655800 + }, + { + "epoch": 0.1710606123031173, + "grad_norm": 10.845020294189453, + "learning_rate": 9.425106165247048e-06, + "loss": 2.5558, + "step": 656000 + }, + { + "epoch": 0.1711127649288195, + "grad_norm": 10.903834342956543, + "learning_rate": 9.42471693568041e-06, + "loss": 2.6147, + "step": 656200 + }, + { + "epoch": 0.17116491755452165, + "grad_norm": 9.43630599975586, + "learning_rate": 9.424327582437833e-06, + "loss": 2.6235, + "step": 656400 + }, + { + "epoch": 0.17121707018022383, + "grad_norm": 10.715693473815918, + "learning_rate": 9.423938105530197e-06, + "loss": 2.6456, + "step": 656600 + }, + { + "epoch": 0.171269222805926, + "grad_norm": 10.508771896362305, + "learning_rate": 9.423548504968392e-06, + "loss": 2.578, + "step": 656800 + }, + { + "epoch": 0.17132137543162818, + "grad_norm": 10.303911209106445, + "learning_rate": 9.423158780763304e-06, + "loss": 2.5929, + "step": 657000 + }, + { + "epoch": 0.17137352805733033, + "grad_norm": 11.929397583007812, + "learning_rate": 9.422768932925828e-06, + "loss": 2.5516, + "step": 657200 + }, + { + "epoch": 0.17142568068303252, + "grad_norm": 11.60670280456543, + "learning_rate": 9.422378961466863e-06, + "loss": 2.6122, + "step": 657400 + }, + { + "epoch": 0.17147783330873467, + "grad_norm": 10.905845642089844, + "learning_rate": 9.421988866397304e-06, + "loss": 2.5991, + "step": 657600 + }, + { + "epoch": 0.17152998593443686, + "grad_norm": 9.888165473937988, + "learning_rate": 9.421598647728059e-06, + "loss": 2.5957, + "step": 657800 + }, + { + "epoch": 0.171582138560139, + "grad_norm": 10.218853950500488, + "learning_rate": 9.421208305470033e-06, + "loss": 2.5762, + "step": 658000 + }, + { + "epoch": 0.1716342911858412, + "grad_norm": 10.51092529296875, + "learning_rate": 9.420817839634138e-06, + "loss": 2.6286, + "step": 658200 + }, + { + "epoch": 0.17168644381154335, + "grad_norm": 12.242321968078613, + "learning_rate": 9.420427250231286e-06, + "loss": 2.5824, + "step": 658400 + }, + { + "epoch": 0.17173859643724554, + "grad_norm": 9.587675094604492, + "learning_rate": 9.420036537272395e-06, + "loss": 2.6159, + "step": 658600 + }, + { + "epoch": 0.1717907490629477, + "grad_norm": 12.193526268005371, + "learning_rate": 9.419645700768386e-06, + "loss": 2.5814, + "step": 658800 + }, + { + "epoch": 0.17184290168864988, + "grad_norm": 13.005501747131348, + "learning_rate": 9.419254740730184e-06, + "loss": 2.6249, + "step": 659000 + }, + { + "epoch": 0.17189505431435204, + "grad_norm": 12.042132377624512, + "learning_rate": 9.418863657168717e-06, + "loss": 2.592, + "step": 659200 + }, + { + "epoch": 0.17194720694005422, + 
"grad_norm": 11.057788848876953, + "learning_rate": 9.418472450094915e-06, + "loss": 2.6023, + "step": 659400 + }, + { + "epoch": 0.17199935956575638, + "grad_norm": 11.42861557006836, + "learning_rate": 9.418081119519715e-06, + "loss": 2.6074, + "step": 659600 + }, + { + "epoch": 0.17205151219145856, + "grad_norm": 10.534041404724121, + "learning_rate": 9.417689665454052e-06, + "loss": 2.6056, + "step": 659800 + }, + { + "epoch": 0.17210366481716072, + "grad_norm": 10.17191219329834, + "learning_rate": 9.417298087908869e-06, + "loss": 2.5732, + "step": 660000 + }, + { + "epoch": 0.17215581744286287, + "grad_norm": 10.42569351196289, + "learning_rate": 9.41690638689511e-06, + "loss": 2.5551, + "step": 660200 + }, + { + "epoch": 0.17220797006856506, + "grad_norm": 10.62281608581543, + "learning_rate": 9.416514562423726e-06, + "loss": 2.5706, + "step": 660400 + }, + { + "epoch": 0.17226012269426721, + "grad_norm": 10.042311668395996, + "learning_rate": 9.416122614505669e-06, + "loss": 2.5726, + "step": 660600 + }, + { + "epoch": 0.1723122753199694, + "grad_norm": 11.639966011047363, + "learning_rate": 9.41573054315189e-06, + "loss": 2.6004, + "step": 660800 + }, + { + "epoch": 0.17236442794567156, + "grad_norm": 12.120760917663574, + "learning_rate": 9.415338348373354e-06, + "loss": 2.5984, + "step": 661000 + }, + { + "epoch": 0.17241658057137374, + "grad_norm": 10.926827430725098, + "learning_rate": 9.414946030181018e-06, + "loss": 2.5645, + "step": 661200 + }, + { + "epoch": 0.1724687331970759, + "grad_norm": 12.61771011352539, + "learning_rate": 9.414553588585849e-06, + "loss": 2.6064, + "step": 661400 + }, + { + "epoch": 0.17252088582277808, + "grad_norm": 11.024273872375488, + "learning_rate": 9.41416102359882e-06, + "loss": 2.6001, + "step": 661600 + }, + { + "epoch": 0.17257303844848024, + "grad_norm": 12.414682388305664, + "learning_rate": 9.413768335230897e-06, + "loss": 2.5993, + "step": 661800 + }, + { + "epoch": 0.17262519107418242, + "grad_norm": 12.029796600341797, + "learning_rate": 9.413375523493062e-06, + "loss": 2.542, + "step": 662000 + }, + { + "epoch": 0.17267734369988458, + "grad_norm": 10.11413288116455, + "learning_rate": 9.412982588396292e-06, + "loss": 2.5833, + "step": 662200 + }, + { + "epoch": 0.17272949632558676, + "grad_norm": 11.750227928161621, + "learning_rate": 9.41258952995157e-06, + "loss": 2.5748, + "step": 662400 + }, + { + "epoch": 0.17278164895128892, + "grad_norm": 10.92453670501709, + "learning_rate": 9.412196348169883e-06, + "loss": 2.5841, + "step": 662600 + }, + { + "epoch": 0.1728338015769911, + "grad_norm": 11.723861694335938, + "learning_rate": 9.411803043062222e-06, + "loss": 2.5925, + "step": 662800 + }, + { + "epoch": 0.17288595420269326, + "grad_norm": 10.404121398925781, + "learning_rate": 9.411409614639576e-06, + "loss": 2.5609, + "step": 663000 + }, + { + "epoch": 0.17293810682839544, + "grad_norm": 10.455796241760254, + "learning_rate": 9.411016062912946e-06, + "loss": 2.6097, + "step": 663200 + }, + { + "epoch": 0.1729902594540976, + "grad_norm": 10.422048568725586, + "learning_rate": 9.410622387893332e-06, + "loss": 2.6332, + "step": 663400 + }, + { + "epoch": 0.17304241207979978, + "grad_norm": 10.72389030456543, + "learning_rate": 9.410228589591735e-06, + "loss": 2.5613, + "step": 663600 + }, + { + "epoch": 0.17309456470550194, + "grad_norm": 10.777589797973633, + "learning_rate": 9.409834668019165e-06, + "loss": 2.56, + "step": 663800 + }, + { + "epoch": 0.17314671733120413, + "grad_norm": 11.299818992614746, + "learning_rate": 
9.40944062318663e-06, + "loss": 2.6022, + "step": 664000 + }, + { + "epoch": 0.17319886995690628, + "grad_norm": 12.202249526977539, + "learning_rate": 9.409046455105146e-06, + "loss": 2.6085, + "step": 664200 + }, + { + "epoch": 0.17325102258260847, + "grad_norm": 10.913101196289062, + "learning_rate": 9.40865216378573e-06, + "loss": 2.5667, + "step": 664400 + }, + { + "epoch": 0.17330317520831062, + "grad_norm": 12.865901947021484, + "learning_rate": 9.408257749239402e-06, + "loss": 2.6148, + "step": 664600 + }, + { + "epoch": 0.1733553278340128, + "grad_norm": 11.728925704956055, + "learning_rate": 9.407863211477189e-06, + "loss": 2.5964, + "step": 664800 + }, + { + "epoch": 0.17340748045971496, + "grad_norm": 10.57736587524414, + "learning_rate": 9.407468550510114e-06, + "loss": 2.5577, + "step": 665000 + }, + { + "epoch": 0.17345963308541715, + "grad_norm": 11.458696365356445, + "learning_rate": 9.407073766349213e-06, + "loss": 2.5685, + "step": 665200 + }, + { + "epoch": 0.1735117857111193, + "grad_norm": 10.13565444946289, + "learning_rate": 9.406678859005518e-06, + "loss": 2.5543, + "step": 665400 + }, + { + "epoch": 0.1735639383368215, + "grad_norm": 11.961712837219238, + "learning_rate": 9.406283828490068e-06, + "loss": 2.5873, + "step": 665600 + }, + { + "epoch": 0.17361609096252364, + "grad_norm": 12.306097030639648, + "learning_rate": 9.405888674813903e-06, + "loss": 2.6193, + "step": 665800 + }, + { + "epoch": 0.1736682435882258, + "grad_norm": 11.791540145874023, + "learning_rate": 9.40549339798807e-06, + "loss": 2.575, + "step": 666000 + }, + { + "epoch": 0.17372039621392799, + "grad_norm": 12.87740421295166, + "learning_rate": 9.405097998023618e-06, + "loss": 2.5365, + "step": 666200 + }, + { + "epoch": 0.17377254883963014, + "grad_norm": 13.530356407165527, + "learning_rate": 9.404702474931598e-06, + "loss": 2.5866, + "step": 666400 + }, + { + "epoch": 0.17382470146533233, + "grad_norm": 10.974345207214355, + "learning_rate": 9.404306828723063e-06, + "loss": 2.6068, + "step": 666600 + }, + { + "epoch": 0.17387685409103448, + "grad_norm": 12.054535865783691, + "learning_rate": 9.403911059409075e-06, + "loss": 2.6092, + "step": 666800 + }, + { + "epoch": 0.17392900671673667, + "grad_norm": 10.9446382522583, + "learning_rate": 9.403515167000694e-06, + "loss": 2.5992, + "step": 667000 + }, + { + "epoch": 0.17398115934243882, + "grad_norm": 10.59375286102295, + "learning_rate": 9.403119151508988e-06, + "loss": 2.5542, + "step": 667200 + }, + { + "epoch": 0.174033311968141, + "grad_norm": 10.471702575683594, + "learning_rate": 9.402723012945023e-06, + "loss": 2.6148, + "step": 667400 + }, + { + "epoch": 0.17408546459384316, + "grad_norm": 11.591514587402344, + "learning_rate": 9.402326751319875e-06, + "loss": 2.5909, + "step": 667600 + }, + { + "epoch": 0.17413761721954535, + "grad_norm": 11.10065746307373, + "learning_rate": 9.401930366644617e-06, + "loss": 2.5856, + "step": 667800 + }, + { + "epoch": 0.1741897698452475, + "grad_norm": 11.09948444366455, + "learning_rate": 9.401533858930331e-06, + "loss": 2.6074, + "step": 668000 + }, + { + "epoch": 0.1742419224709497, + "grad_norm": 10.385692596435547, + "learning_rate": 9.401137228188098e-06, + "loss": 2.5725, + "step": 668200 + }, + { + "epoch": 0.17429407509665185, + "grad_norm": 8.567261695861816, + "learning_rate": 9.400740474429004e-06, + "loss": 2.5714, + "step": 668400 + }, + { + "epoch": 0.17434622772235403, + "grad_norm": 10.014311790466309, + "learning_rate": 9.40034359766414e-06, + "loss": 2.5659, + "step": 668600 
+ }, + { + "epoch": 0.1743983803480562, + "grad_norm": 11.485774040222168, + "learning_rate": 9.3999465979046e-06, + "loss": 2.5965, + "step": 668800 + }, + { + "epoch": 0.17445053297375837, + "grad_norm": 10.644266128540039, + "learning_rate": 9.399549475161476e-06, + "loss": 2.61, + "step": 669000 + }, + { + "epoch": 0.17450268559946053, + "grad_norm": 12.157289505004883, + "learning_rate": 9.399152229445874e-06, + "loss": 2.5701, + "step": 669200 + }, + { + "epoch": 0.1745548382251627, + "grad_norm": 9.978353500366211, + "learning_rate": 9.398754860768892e-06, + "loss": 2.5829, + "step": 669400 + }, + { + "epoch": 0.17460699085086487, + "grad_norm": 11.341612815856934, + "learning_rate": 9.398357369141641e-06, + "loss": 2.6211, + "step": 669600 + }, + { + "epoch": 0.17465914347656705, + "grad_norm": 10.040153503417969, + "learning_rate": 9.397959754575232e-06, + "loss": 2.6005, + "step": 669800 + }, + { + "epoch": 0.1747112961022692, + "grad_norm": 12.061646461486816, + "learning_rate": 9.397562017080774e-06, + "loss": 2.608, + "step": 670000 + }, + { + "epoch": 0.1747634487279714, + "grad_norm": 11.82841968536377, + "learning_rate": 9.39716415666939e-06, + "loss": 2.5988, + "step": 670200 + }, + { + "epoch": 0.17481560135367355, + "grad_norm": 9.31937313079834, + "learning_rate": 9.396766173352196e-06, + "loss": 2.585, + "step": 670400 + }, + { + "epoch": 0.17486775397937573, + "grad_norm": 11.591330528259277, + "learning_rate": 9.396368067140318e-06, + "loss": 2.5517, + "step": 670600 + }, + { + "epoch": 0.1749199066050779, + "grad_norm": 10.716523170471191, + "learning_rate": 9.395969838044882e-06, + "loss": 2.5751, + "step": 670800 + }, + { + "epoch": 0.17497205923078007, + "grad_norm": 9.377893447875977, + "learning_rate": 9.395571486077024e-06, + "loss": 2.5578, + "step": 671000 + }, + { + "epoch": 0.17502421185648223, + "grad_norm": 10.563931465148926, + "learning_rate": 9.395173011247871e-06, + "loss": 2.5933, + "step": 671200 + }, + { + "epoch": 0.17507636448218442, + "grad_norm": 12.169573783874512, + "learning_rate": 9.394774413568565e-06, + "loss": 2.5995, + "step": 671400 + }, + { + "epoch": 0.17512851710788657, + "grad_norm": 10.579012870788574, + "learning_rate": 9.394375693050248e-06, + "loss": 2.6174, + "step": 671600 + }, + { + "epoch": 0.17518066973358873, + "grad_norm": 11.727015495300293, + "learning_rate": 9.393976849704063e-06, + "loss": 2.5779, + "step": 671800 + }, + { + "epoch": 0.1752328223592909, + "grad_norm": 11.265336036682129, + "learning_rate": 9.393577883541158e-06, + "loss": 2.6134, + "step": 672000 + }, + { + "epoch": 0.17528497498499307, + "grad_norm": 10.374798774719238, + "learning_rate": 9.393178794572687e-06, + "loss": 2.6363, + "step": 672200 + }, + { + "epoch": 0.17533712761069525, + "grad_norm": 12.196139335632324, + "learning_rate": 9.392779582809802e-06, + "loss": 2.599, + "step": 672400 + }, + { + "epoch": 0.1753892802363974, + "grad_norm": 10.827107429504395, + "learning_rate": 9.392380248263661e-06, + "loss": 2.5711, + "step": 672600 + }, + { + "epoch": 0.1754414328620996, + "grad_norm": 11.606658935546875, + "learning_rate": 9.39198079094543e-06, + "loss": 2.5836, + "step": 672800 + }, + { + "epoch": 0.17549358548780175, + "grad_norm": 11.592090606689453, + "learning_rate": 9.39158121086627e-06, + "loss": 2.5763, + "step": 673000 + }, + { + "epoch": 0.17554573811350394, + "grad_norm": 9.465235710144043, + "learning_rate": 9.391181508037352e-06, + "loss": 2.5332, + "step": 673200 + }, + { + "epoch": 0.1755978907392061, + "grad_norm": 
11.990010261535645, + "learning_rate": 9.390781682469848e-06, + "loss": 2.5785, + "step": 673400 + }, + { + "epoch": 0.17565004336490828, + "grad_norm": 11.225507736206055, + "learning_rate": 9.390381734174933e-06, + "loss": 2.5475, + "step": 673600 + }, + { + "epoch": 0.17570219599061043, + "grad_norm": 12.462555885314941, + "learning_rate": 9.389981663163786e-06, + "loss": 2.628, + "step": 673800 + }, + { + "epoch": 0.17575434861631262, + "grad_norm": 12.385912895202637, + "learning_rate": 9.389581469447591e-06, + "loss": 2.5793, + "step": 674000 + }, + { + "epoch": 0.17580650124201477, + "grad_norm": 10.774480819702148, + "learning_rate": 9.38918115303753e-06, + "loss": 2.5479, + "step": 674200 + }, + { + "epoch": 0.17585865386771696, + "grad_norm": 11.743460655212402, + "learning_rate": 9.388780713944795e-06, + "loss": 2.5841, + "step": 674400 + }, + { + "epoch": 0.17591080649341911, + "grad_norm": 10.81772232055664, + "learning_rate": 9.388380152180582e-06, + "loss": 2.568, + "step": 674600 + }, + { + "epoch": 0.1759629591191213, + "grad_norm": 11.486669540405273, + "learning_rate": 9.387979467756081e-06, + "loss": 2.5902, + "step": 674800 + }, + { + "epoch": 0.17601511174482345, + "grad_norm": 10.133848190307617, + "learning_rate": 9.387578660682495e-06, + "loss": 2.5696, + "step": 675000 + }, + { + "epoch": 0.17606726437052564, + "grad_norm": 11.145345687866211, + "learning_rate": 9.387177730971027e-06, + "loss": 2.5567, + "step": 675200 + }, + { + "epoch": 0.1761194169962278, + "grad_norm": 11.252696990966797, + "learning_rate": 9.386776678632881e-06, + "loss": 2.5772, + "step": 675400 + }, + { + "epoch": 0.17617156962192998, + "grad_norm": 10.816537857055664, + "learning_rate": 9.38637550367927e-06, + "loss": 2.6356, + "step": 675600 + }, + { + "epoch": 0.17622372224763214, + "grad_norm": 11.040006637573242, + "learning_rate": 9.385974206121407e-06, + "loss": 2.558, + "step": 675800 + }, + { + "epoch": 0.17627587487333432, + "grad_norm": 10.698840141296387, + "learning_rate": 9.385572785970505e-06, + "loss": 2.5948, + "step": 676000 + }, + { + "epoch": 0.17632802749903648, + "grad_norm": 11.69105339050293, + "learning_rate": 9.38517124323779e-06, + "loss": 2.6081, + "step": 676200 + }, + { + "epoch": 0.17638018012473866, + "grad_norm": 11.279681205749512, + "learning_rate": 9.38476957793448e-06, + "loss": 2.6172, + "step": 676400 + }, + { + "epoch": 0.17643233275044082, + "grad_norm": 10.574265480041504, + "learning_rate": 9.384367790071805e-06, + "loss": 2.603, + "step": 676600 + }, + { + "epoch": 0.176484485376143, + "grad_norm": 13.000247955322266, + "learning_rate": 9.383965879660995e-06, + "loss": 2.5997, + "step": 676800 + }, + { + "epoch": 0.17653663800184516, + "grad_norm": 10.329712867736816, + "learning_rate": 9.383563846713284e-06, + "loss": 2.5801, + "step": 677000 + }, + { + "epoch": 0.17658879062754734, + "grad_norm": 10.054405212402344, + "learning_rate": 9.383161691239909e-06, + "loss": 2.5559, + "step": 677200 + }, + { + "epoch": 0.1766409432532495, + "grad_norm": 10.512011528015137, + "learning_rate": 9.382759413252112e-06, + "loss": 2.5955, + "step": 677400 + }, + { + "epoch": 0.17669309587895166, + "grad_norm": 11.089954376220703, + "learning_rate": 9.382357012761133e-06, + "loss": 2.565, + "step": 677600 + }, + { + "epoch": 0.17674524850465384, + "grad_norm": 9.8038330078125, + "learning_rate": 9.381954489778225e-06, + "loss": 2.5907, + "step": 677800 + }, + { + "epoch": 0.176797401130356, + "grad_norm": 15.173497200012207, + "learning_rate": 
9.381551844314637e-06, + "loss": 2.6021, + "step": 678000 + }, + { + "epoch": 0.17684955375605818, + "grad_norm": 12.605514526367188, + "learning_rate": 9.38114907638162e-06, + "loss": 2.6105, + "step": 678200 + }, + { + "epoch": 0.17690170638176034, + "grad_norm": 10.773606300354004, + "learning_rate": 9.380746185990435e-06, + "loss": 2.5503, + "step": 678400 + }, + { + "epoch": 0.17695385900746252, + "grad_norm": 12.675154685974121, + "learning_rate": 9.380343173152344e-06, + "loss": 2.5884, + "step": 678600 + }, + { + "epoch": 0.17700601163316468, + "grad_norm": 11.189080238342285, + "learning_rate": 9.379940037878611e-06, + "loss": 2.5901, + "step": 678800 + }, + { + "epoch": 0.17705816425886686, + "grad_norm": 11.658578872680664, + "learning_rate": 9.379536780180503e-06, + "loss": 2.6359, + "step": 679000 + }, + { + "epoch": 0.17711031688456902, + "grad_norm": 11.764540672302246, + "learning_rate": 9.379133400069292e-06, + "loss": 2.572, + "step": 679200 + }, + { + "epoch": 0.1771624695102712, + "grad_norm": 12.119824409484863, + "learning_rate": 9.378729897556253e-06, + "loss": 2.581, + "step": 679400 + }, + { + "epoch": 0.17721462213597336, + "grad_norm": 12.850314140319824, + "learning_rate": 9.378326272652666e-06, + "loss": 2.6052, + "step": 679600 + }, + { + "epoch": 0.17726677476167554, + "grad_norm": 11.843114852905273, + "learning_rate": 9.37792252536981e-06, + "loss": 2.6066, + "step": 679800 + }, + { + "epoch": 0.1773189273873777, + "grad_norm": 11.768165588378906, + "learning_rate": 9.377518655718971e-06, + "loss": 2.5708, + "step": 680000 + }, + { + "epoch": 0.17737108001307988, + "grad_norm": 11.470100402832031, + "learning_rate": 9.37711466371144e-06, + "loss": 2.5771, + "step": 680200 + }, + { + "epoch": 0.17742323263878204, + "grad_norm": 11.421998023986816, + "learning_rate": 9.376710549358505e-06, + "loss": 2.602, + "step": 680400 + }, + { + "epoch": 0.17747538526448423, + "grad_norm": 11.222713470458984, + "learning_rate": 9.376306312671465e-06, + "loss": 2.6073, + "step": 680600 + }, + { + "epoch": 0.17752753789018638, + "grad_norm": 11.578697204589844, + "learning_rate": 9.375901953661618e-06, + "loss": 2.6128, + "step": 680800 + }, + { + "epoch": 0.17757969051588857, + "grad_norm": 11.527877807617188, + "learning_rate": 9.375497472340263e-06, + "loss": 2.6079, + "step": 681000 + }, + { + "epoch": 0.17763184314159072, + "grad_norm": 11.338236808776855, + "learning_rate": 9.375092868718711e-06, + "loss": 2.5406, + "step": 681200 + }, + { + "epoch": 0.1776839957672929, + "grad_norm": 10.98487377166748, + "learning_rate": 9.374688142808268e-06, + "loss": 2.5617, + "step": 681400 + }, + { + "epoch": 0.17773614839299506, + "grad_norm": 10.685301780700684, + "learning_rate": 9.374283294620247e-06, + "loss": 2.5682, + "step": 681600 + }, + { + "epoch": 0.17778830101869725, + "grad_norm": 12.549797058105469, + "learning_rate": 9.373878324165965e-06, + "loss": 2.6306, + "step": 681800 + }, + { + "epoch": 0.1778404536443994, + "grad_norm": 11.423171997070312, + "learning_rate": 9.37347323145674e-06, + "loss": 2.5989, + "step": 682000 + }, + { + "epoch": 0.1778926062701016, + "grad_norm": 11.801868438720703, + "learning_rate": 9.373068016503894e-06, + "loss": 2.5902, + "step": 682200 + }, + { + "epoch": 0.17794475889580375, + "grad_norm": 10.22305965423584, + "learning_rate": 9.372662679318755e-06, + "loss": 2.5873, + "step": 682400 + }, + { + "epoch": 0.17799691152150593, + "grad_norm": 10.769112586975098, + "learning_rate": 9.372257219912654e-06, + "loss": 2.5549, + 
"step": 682600 + }, + { + "epoch": 0.17804906414720809, + "grad_norm": 11.621337890625, + "learning_rate": 9.371851638296922e-06, + "loss": 2.6047, + "step": 682800 + }, + { + "epoch": 0.17810121677291027, + "grad_norm": 10.884603500366211, + "learning_rate": 9.371445934482895e-06, + "loss": 2.6138, + "step": 683000 + }, + { + "epoch": 0.17815336939861243, + "grad_norm": 11.253877639770508, + "learning_rate": 9.371040108481915e-06, + "loss": 2.5755, + "step": 683200 + }, + { + "epoch": 0.1782055220243146, + "grad_norm": 13.072035789489746, + "learning_rate": 9.370634160305321e-06, + "loss": 2.5701, + "step": 683400 + }, + { + "epoch": 0.17825767465001677, + "grad_norm": 13.245817184448242, + "learning_rate": 9.370228089964466e-06, + "loss": 2.5832, + "step": 683600 + }, + { + "epoch": 0.17830982727571892, + "grad_norm": 11.859313011169434, + "learning_rate": 9.369821897470695e-06, + "loss": 2.5743, + "step": 683800 + }, + { + "epoch": 0.1783619799014211, + "grad_norm": 10.999593734741211, + "learning_rate": 9.369415582835365e-06, + "loss": 2.6368, + "step": 684000 + }, + { + "epoch": 0.17841413252712326, + "grad_norm": 12.858357429504395, + "learning_rate": 9.369009146069828e-06, + "loss": 2.5957, + "step": 684200 + }, + { + "epoch": 0.17846628515282545, + "grad_norm": 12.619375228881836, + "learning_rate": 9.36860258718545e-06, + "loss": 2.55, + "step": 684400 + }, + { + "epoch": 0.1785184377785276, + "grad_norm": 11.915067672729492, + "learning_rate": 9.368195906193593e-06, + "loss": 2.5971, + "step": 684600 + }, + { + "epoch": 0.1785705904042298, + "grad_norm": 15.2025146484375, + "learning_rate": 9.367789103105623e-06, + "loss": 2.5893, + "step": 684800 + }, + { + "epoch": 0.17862274302993195, + "grad_norm": 11.45402717590332, + "learning_rate": 9.367382177932911e-06, + "loss": 2.5893, + "step": 685000 + }, + { + "epoch": 0.17867489565563413, + "grad_norm": 12.089704513549805, + "learning_rate": 9.36697513068683e-06, + "loss": 2.5795, + "step": 685200 + }, + { + "epoch": 0.1787270482813363, + "grad_norm": 12.791685104370117, + "learning_rate": 9.366567961378762e-06, + "loss": 2.5793, + "step": 685400 + }, + { + "epoch": 0.17877920090703847, + "grad_norm": 11.997780799865723, + "learning_rate": 9.366160670020084e-06, + "loss": 2.5665, + "step": 685600 + }, + { + "epoch": 0.17883135353274063, + "grad_norm": 10.478496551513672, + "learning_rate": 9.365753256622178e-06, + "loss": 2.5858, + "step": 685800 + }, + { + "epoch": 0.1788835061584428, + "grad_norm": 9.763985633850098, + "learning_rate": 9.365345721196436e-06, + "loss": 2.5637, + "step": 686000 + }, + { + "epoch": 0.17893565878414497, + "grad_norm": 12.446106910705566, + "learning_rate": 9.36493806375425e-06, + "loss": 2.5734, + "step": 686200 + }, + { + "epoch": 0.17898781140984715, + "grad_norm": 12.356990814208984, + "learning_rate": 9.364530284307009e-06, + "loss": 2.5836, + "step": 686400 + }, + { + "epoch": 0.1790399640355493, + "grad_norm": 12.088711738586426, + "learning_rate": 9.364122382866117e-06, + "loss": 2.5795, + "step": 686600 + }, + { + "epoch": 0.1790921166612515, + "grad_norm": 10.435306549072266, + "learning_rate": 9.363714359442971e-06, + "loss": 2.6026, + "step": 686800 + }, + { + "epoch": 0.17914426928695365, + "grad_norm": 11.215888023376465, + "learning_rate": 9.363306214048975e-06, + "loss": 2.577, + "step": 687000 + }, + { + "epoch": 0.17919642191265583, + "grad_norm": 13.190821647644043, + "learning_rate": 9.362897946695542e-06, + "loss": 2.6001, + "step": 687200 + }, + { + "epoch": 0.179248574538358, + 
"grad_norm": 11.123869895935059, + "learning_rate": 9.362489557394079e-06, + "loss": 2.6028, + "step": 687400 + }, + { + "epoch": 0.17930072716406018, + "grad_norm": 11.40549373626709, + "learning_rate": 9.362081046156004e-06, + "loss": 2.6268, + "step": 687600 + }, + { + "epoch": 0.17935287978976233, + "grad_norm": 11.619917869567871, + "learning_rate": 9.361672412992734e-06, + "loss": 2.6188, + "step": 687800 + }, + { + "epoch": 0.17940503241546452, + "grad_norm": 11.15652084350586, + "learning_rate": 9.36126365791569e-06, + "loss": 2.5824, + "step": 688000 + }, + { + "epoch": 0.17945718504116667, + "grad_norm": 11.156540870666504, + "learning_rate": 9.360854780936298e-06, + "loss": 2.581, + "step": 688200 + }, + { + "epoch": 0.17950933766686886, + "grad_norm": 11.437117576599121, + "learning_rate": 9.360445782065989e-06, + "loss": 2.5932, + "step": 688400 + }, + { + "epoch": 0.179561490292571, + "grad_norm": 11.62105655670166, + "learning_rate": 9.36003666131619e-06, + "loss": 2.5851, + "step": 688600 + }, + { + "epoch": 0.1796136429182732, + "grad_norm": 14.763154029846191, + "learning_rate": 9.359627418698338e-06, + "loss": 2.6017, + "step": 688800 + }, + { + "epoch": 0.17966579554397535, + "grad_norm": 12.010329246520996, + "learning_rate": 9.359218054223876e-06, + "loss": 2.5934, + "step": 689000 + }, + { + "epoch": 0.17971794816967754, + "grad_norm": 12.975419998168945, + "learning_rate": 9.35880856790424e-06, + "loss": 2.5678, + "step": 689200 + }, + { + "epoch": 0.1797701007953797, + "grad_norm": 11.763967514038086, + "learning_rate": 9.35839895975088e-06, + "loss": 2.5383, + "step": 689400 + }, + { + "epoch": 0.17982225342108185, + "grad_norm": 11.407500267028809, + "learning_rate": 9.357989229775245e-06, + "loss": 2.5591, + "step": 689600 + }, + { + "epoch": 0.17987440604678404, + "grad_norm": 10.644564628601074, + "learning_rate": 9.357579377988786e-06, + "loss": 2.5843, + "step": 689800 + }, + { + "epoch": 0.1799265586724862, + "grad_norm": 11.088133811950684, + "learning_rate": 9.357169404402958e-06, + "loss": 2.592, + "step": 690000 + }, + { + "epoch": 0.17997871129818838, + "grad_norm": 11.872809410095215, + "learning_rate": 9.35675930902922e-06, + "loss": 2.5783, + "step": 690200 + }, + { + "epoch": 0.18003086392389053, + "grad_norm": 12.999768257141113, + "learning_rate": 9.356349091879037e-06, + "loss": 2.6072, + "step": 690400 + }, + { + "epoch": 0.18008301654959272, + "grad_norm": 13.067625045776367, + "learning_rate": 9.355938752963874e-06, + "loss": 2.5654, + "step": 690600 + }, + { + "epoch": 0.18013516917529487, + "grad_norm": 11.797418594360352, + "learning_rate": 9.355528292295199e-06, + "loss": 2.5624, + "step": 690800 + }, + { + "epoch": 0.18018732180099706, + "grad_norm": 10.904340744018555, + "learning_rate": 9.355117709884487e-06, + "loss": 2.5769, + "step": 691000 + }, + { + "epoch": 0.18023947442669921, + "grad_norm": 10.31658935546875, + "learning_rate": 9.354707005743213e-06, + "loss": 2.5788, + "step": 691200 + }, + { + "epoch": 0.1802916270524014, + "grad_norm": 12.880640029907227, + "learning_rate": 9.354296179882858e-06, + "loss": 2.588, + "step": 691400 + }, + { + "epoch": 0.18034377967810356, + "grad_norm": 12.985132217407227, + "learning_rate": 9.353885232314902e-06, + "loss": 2.599, + "step": 691600 + }, + { + "epoch": 0.18039593230380574, + "grad_norm": 9.16226863861084, + "learning_rate": 9.353474163050835e-06, + "loss": 2.5426, + "step": 691800 + }, + { + "epoch": 0.1804480849295079, + "grad_norm": 12.11599349975586, + "learning_rate": 
9.353062972102146e-06, + "loss": 2.5754, + "step": 692000 + }, + { + "epoch": 0.18050023755521008, + "grad_norm": 12.336219787597656, + "learning_rate": 9.352651659480327e-06, + "loss": 2.5988, + "step": 692200 + }, + { + "epoch": 0.18055239018091224, + "grad_norm": 10.14101505279541, + "learning_rate": 9.352240225196876e-06, + "loss": 2.5971, + "step": 692400 + }, + { + "epoch": 0.18060454280661442, + "grad_norm": 12.834318161010742, + "learning_rate": 9.35182866926329e-06, + "loss": 2.5767, + "step": 692600 + }, + { + "epoch": 0.18065669543231658, + "grad_norm": 10.38381576538086, + "learning_rate": 9.351416991691078e-06, + "loss": 2.5789, + "step": 692800 + }, + { + "epoch": 0.18070884805801876, + "grad_norm": 11.648655891418457, + "learning_rate": 9.351005192491742e-06, + "loss": 2.5979, + "step": 693000 + }, + { + "epoch": 0.18076100068372092, + "grad_norm": 11.521854400634766, + "learning_rate": 9.350593271676793e-06, + "loss": 2.5649, + "step": 693200 + }, + { + "epoch": 0.1808131533094231, + "grad_norm": 12.451175689697266, + "learning_rate": 9.350181229257748e-06, + "loss": 2.5619, + "step": 693400 + }, + { + "epoch": 0.18086530593512526, + "grad_norm": 11.495298385620117, + "learning_rate": 9.34976906524612e-06, + "loss": 2.5701, + "step": 693600 + }, + { + "epoch": 0.18091745856082744, + "grad_norm": 12.428627967834473, + "learning_rate": 9.349356779653432e-06, + "loss": 2.5957, + "step": 693800 + }, + { + "epoch": 0.1809696111865296, + "grad_norm": 12.761686325073242, + "learning_rate": 9.348944372491207e-06, + "loss": 2.577, + "step": 694000 + }, + { + "epoch": 0.18102176381223178, + "grad_norm": 13.41728401184082, + "learning_rate": 9.348531843770971e-06, + "loss": 2.5966, + "step": 694200 + }, + { + "epoch": 0.18107391643793394, + "grad_norm": 12.403990745544434, + "learning_rate": 9.348119193504256e-06, + "loss": 2.5791, + "step": 694400 + }, + { + "epoch": 0.18112606906363612, + "grad_norm": 11.769331932067871, + "learning_rate": 9.347706421702598e-06, + "loss": 2.5963, + "step": 694600 + }, + { + "epoch": 0.18117822168933828, + "grad_norm": 12.589640617370605, + "learning_rate": 9.347293528377532e-06, + "loss": 2.6107, + "step": 694800 + }, + { + "epoch": 0.18123037431504047, + "grad_norm": 13.722719192504883, + "learning_rate": 9.346880513540598e-06, + "loss": 2.5917, + "step": 695000 + }, + { + "epoch": 0.18128252694074262, + "grad_norm": 9.757765769958496, + "learning_rate": 9.34646737720334e-06, + "loss": 2.585, + "step": 695200 + }, + { + "epoch": 0.18133467956644478, + "grad_norm": 11.891173362731934, + "learning_rate": 9.34605411937731e-06, + "loss": 2.5579, + "step": 695400 + }, + { + "epoch": 0.18138683219214696, + "grad_norm": 10.937880516052246, + "learning_rate": 9.345640740074055e-06, + "loss": 2.5802, + "step": 695600 + }, + { + "epoch": 0.18143898481784912, + "grad_norm": 11.980300903320312, + "learning_rate": 9.345227239305133e-06, + "loss": 2.5721, + "step": 695800 + }, + { + "epoch": 0.1814911374435513, + "grad_norm": 10.967789649963379, + "learning_rate": 9.344813617082096e-06, + "loss": 2.6024, + "step": 696000 + }, + { + "epoch": 0.18154329006925346, + "grad_norm": 12.861534118652344, + "learning_rate": 9.34439987341651e-06, + "loss": 2.619, + "step": 696200 + }, + { + "epoch": 0.18159544269495564, + "grad_norm": 12.319485664367676, + "learning_rate": 9.343986008319938e-06, + "loss": 2.5858, + "step": 696400 + }, + { + "epoch": 0.1816475953206578, + "grad_norm": 11.069070816040039, + "learning_rate": 9.343572021803948e-06, + "loss": 2.6071, + "step": 
696600 + }, + { + "epoch": 0.18169974794635999, + "grad_norm": 12.012398719787598, + "learning_rate": 9.343157913880113e-06, + "loss": 2.6151, + "step": 696800 + }, + { + "epoch": 0.18175190057206214, + "grad_norm": 9.027801513671875, + "learning_rate": 9.342743684560006e-06, + "loss": 2.6, + "step": 697000 + }, + { + "epoch": 0.18180405319776433, + "grad_norm": 13.315918922424316, + "learning_rate": 9.342329333855205e-06, + "loss": 2.5678, + "step": 697200 + }, + { + "epoch": 0.18185620582346648, + "grad_norm": 11.573213577270508, + "learning_rate": 9.341914861777293e-06, + "loss": 2.5673, + "step": 697400 + }, + { + "epoch": 0.18190835844916867, + "grad_norm": 10.803704261779785, + "learning_rate": 9.341500268337853e-06, + "loss": 2.5895, + "step": 697600 + }, + { + "epoch": 0.18196051107487082, + "grad_norm": 11.29845142364502, + "learning_rate": 9.341085553548474e-06, + "loss": 2.5807, + "step": 697800 + }, + { + "epoch": 0.182012663700573, + "grad_norm": 12.04796314239502, + "learning_rate": 9.34067071742075e-06, + "loss": 2.5675, + "step": 698000 + }, + { + "epoch": 0.18206481632627516, + "grad_norm": 12.755510330200195, + "learning_rate": 9.340255759966274e-06, + "loss": 2.5761, + "step": 698200 + }, + { + "epoch": 0.18211696895197735, + "grad_norm": 11.835643768310547, + "learning_rate": 9.339840681196645e-06, + "loss": 2.5841, + "step": 698400 + }, + { + "epoch": 0.1821691215776795, + "grad_norm": 11.568483352661133, + "learning_rate": 9.339425481123465e-06, + "loss": 2.5586, + "step": 698600 + }, + { + "epoch": 0.1822212742033817, + "grad_norm": 13.110424041748047, + "learning_rate": 9.33901015975834e-06, + "loss": 2.6143, + "step": 698800 + }, + { + "epoch": 0.18227342682908385, + "grad_norm": 11.408985137939453, + "learning_rate": 9.338594717112877e-06, + "loss": 2.5753, + "step": 699000 + }, + { + "epoch": 0.18232557945478603, + "grad_norm": 10.737070083618164, + "learning_rate": 9.33817915319869e-06, + "loss": 2.5876, + "step": 699200 + }, + { + "epoch": 0.1823777320804882, + "grad_norm": 12.560068130493164, + "learning_rate": 9.337763468027392e-06, + "loss": 2.5513, + "step": 699400 + }, + { + "epoch": 0.18242988470619037, + "grad_norm": 11.872684478759766, + "learning_rate": 9.337347661610604e-06, + "loss": 2.5624, + "step": 699600 + }, + { + "epoch": 0.18248203733189253, + "grad_norm": 11.913339614868164, + "learning_rate": 9.336931733959948e-06, + "loss": 2.538, + "step": 699800 + }, + { + "epoch": 0.1825341899575947, + "grad_norm": 10.39101791381836, + "learning_rate": 9.336515685087048e-06, + "loss": 2.5836, + "step": 700000 + }, + { + "epoch": 0.18258634258329687, + "grad_norm": 8.983271598815918, + "learning_rate": 9.336099515003538e-06, + "loss": 2.5803, + "step": 700200 + }, + { + "epoch": 0.18263849520899905, + "grad_norm": 12.488344192504883, + "learning_rate": 9.335683223721044e-06, + "loss": 2.5908, + "step": 700400 + }, + { + "epoch": 0.1826906478347012, + "grad_norm": 10.796935081481934, + "learning_rate": 9.335266811251205e-06, + "loss": 2.6309, + "step": 700600 + }, + { + "epoch": 0.1827428004604034, + "grad_norm": 10.452614784240723, + "learning_rate": 9.33485027760566e-06, + "loss": 2.5893, + "step": 700800 + }, + { + "epoch": 0.18279495308610555, + "grad_norm": 10.865897178649902, + "learning_rate": 9.33443362279605e-06, + "loss": 2.5761, + "step": 701000 + }, + { + "epoch": 0.1828471057118077, + "grad_norm": 12.461289405822754, + "learning_rate": 9.334016846834024e-06, + "loss": 2.5851, + "step": 701200 + }, + { + "epoch": 0.1828992583375099, + 
"grad_norm": 10.119328498840332, + "learning_rate": 9.333599949731232e-06, + "loss": 2.5364, + "step": 701400 + }, + { + "epoch": 0.18295141096321205, + "grad_norm": 12.12794017791748, + "learning_rate": 9.33318293149932e-06, + "loss": 2.6064, + "step": 701600 + }, + { + "epoch": 0.18300356358891423, + "grad_norm": 12.383764266967773, + "learning_rate": 9.332765792149951e-06, + "loss": 2.6052, + "step": 701800 + }, + { + "epoch": 0.1830557162146164, + "grad_norm": 10.087251663208008, + "learning_rate": 9.332348531694783e-06, + "loss": 2.5666, + "step": 702000 + }, + { + "epoch": 0.18310786884031857, + "grad_norm": 12.394096374511719, + "learning_rate": 9.331931150145478e-06, + "loss": 2.6195, + "step": 702200 + }, + { + "epoch": 0.18316002146602073, + "grad_norm": 12.181896209716797, + "learning_rate": 9.331513647513702e-06, + "loss": 2.5802, + "step": 702400 + }, + { + "epoch": 0.1832121740917229, + "grad_norm": 12.717833518981934, + "learning_rate": 9.331096023811125e-06, + "loss": 2.5753, + "step": 702600 + }, + { + "epoch": 0.18326432671742507, + "grad_norm": 11.721474647521973, + "learning_rate": 9.33067827904942e-06, + "loss": 2.6129, + "step": 702800 + }, + { + "epoch": 0.18331647934312725, + "grad_norm": 10.442642211914062, + "learning_rate": 9.330260413240265e-06, + "loss": 2.614, + "step": 703000 + }, + { + "epoch": 0.1833686319688294, + "grad_norm": 10.827333450317383, + "learning_rate": 9.329842426395335e-06, + "loss": 2.5673, + "step": 703200 + }, + { + "epoch": 0.1834207845945316, + "grad_norm": 11.326786994934082, + "learning_rate": 9.32942431852632e-06, + "loss": 2.5926, + "step": 703400 + }, + { + "epoch": 0.18347293722023375, + "grad_norm": 11.326935768127441, + "learning_rate": 9.329006089644902e-06, + "loss": 2.6239, + "step": 703600 + }, + { + "epoch": 0.18352508984593593, + "grad_norm": 13.342899322509766, + "learning_rate": 9.328587739762774e-06, + "loss": 2.5926, + "step": 703800 + }, + { + "epoch": 0.1835772424716381, + "grad_norm": 10.269484519958496, + "learning_rate": 9.328169268891624e-06, + "loss": 2.5741, + "step": 704000 + }, + { + "epoch": 0.18362939509734028, + "grad_norm": 12.175603866577148, + "learning_rate": 9.327750677043156e-06, + "loss": 2.5677, + "step": 704200 + }, + { + "epoch": 0.18368154772304243, + "grad_norm": 11.766140937805176, + "learning_rate": 9.327331964229066e-06, + "loss": 2.5791, + "step": 704400 + }, + { + "epoch": 0.18373370034874462, + "grad_norm": 10.756917953491211, + "learning_rate": 9.326913130461056e-06, + "loss": 2.5927, + "step": 704600 + }, + { + "epoch": 0.18378585297444677, + "grad_norm": 11.94490909576416, + "learning_rate": 9.326494175750836e-06, + "loss": 2.6214, + "step": 704800 + }, + { + "epoch": 0.18383800560014896, + "grad_norm": 12.475340843200684, + "learning_rate": 9.326075100110115e-06, + "loss": 2.5548, + "step": 705000 + }, + { + "epoch": 0.1838901582258511, + "grad_norm": 10.366930961608887, + "learning_rate": 9.325655903550605e-06, + "loss": 2.5704, + "step": 705200 + }, + { + "epoch": 0.1839423108515533, + "grad_norm": 13.210090637207031, + "learning_rate": 9.325236586084028e-06, + "loss": 2.5656, + "step": 705400 + }, + { + "epoch": 0.18399446347725545, + "grad_norm": 12.236307144165039, + "learning_rate": 9.3248171477221e-06, + "loss": 2.549, + "step": 705600 + }, + { + "epoch": 0.18404661610295764, + "grad_norm": 12.707765579223633, + "learning_rate": 9.324397588476545e-06, + "loss": 2.5777, + "step": 705800 + }, + { + "epoch": 0.1840987687286598, + "grad_norm": 12.61831283569336, + "learning_rate": 
9.323977908359093e-06, + "loss": 2.6, + "step": 706000 + }, + { + "epoch": 0.18415092135436198, + "grad_norm": 11.035533905029297, + "learning_rate": 9.323558107381472e-06, + "loss": 2.5415, + "step": 706200 + }, + { + "epoch": 0.18420307398006414, + "grad_norm": 10.974006652832031, + "learning_rate": 9.323138185555416e-06, + "loss": 2.6035, + "step": 706400 + }, + { + "epoch": 0.18425522660576632, + "grad_norm": 11.0816650390625, + "learning_rate": 9.322718142892663e-06, + "loss": 2.5626, + "step": 706600 + }, + { + "epoch": 0.18430737923146848, + "grad_norm": 11.282931327819824, + "learning_rate": 9.322297979404955e-06, + "loss": 2.5733, + "step": 706800 + }, + { + "epoch": 0.18435953185717063, + "grad_norm": 11.803282737731934, + "learning_rate": 9.321877695104034e-06, + "loss": 2.5855, + "step": 707000 + }, + { + "epoch": 0.18441168448287282, + "grad_norm": 11.176085472106934, + "learning_rate": 9.321457290001646e-06, + "loss": 2.5848, + "step": 707200 + }, + { + "epoch": 0.18446383710857497, + "grad_norm": 10.337486267089844, + "learning_rate": 9.321036764109546e-06, + "loss": 2.5328, + "step": 707400 + }, + { + "epoch": 0.18451598973427716, + "grad_norm": 11.199088096618652, + "learning_rate": 9.320616117439486e-06, + "loss": 2.5684, + "step": 707600 + }, + { + "epoch": 0.18456814235997931, + "grad_norm": 11.944901466369629, + "learning_rate": 9.320195350003223e-06, + "loss": 2.5778, + "step": 707800 + }, + { + "epoch": 0.1846202949856815, + "grad_norm": 12.067788124084473, + "learning_rate": 9.31977446181252e-06, + "loss": 2.6062, + "step": 708000 + }, + { + "epoch": 0.18467244761138366, + "grad_norm": 11.054697036743164, + "learning_rate": 9.319353452879139e-06, + "loss": 2.5903, + "step": 708200 + }, + { + "epoch": 0.18472460023708584, + "grad_norm": 12.969680786132812, + "learning_rate": 9.31893232321485e-06, + "loss": 2.5689, + "step": 708400 + }, + { + "epoch": 0.184776752862788, + "grad_norm": 12.888757705688477, + "learning_rate": 9.31851107283142e-06, + "loss": 2.593, + "step": 708600 + }, + { + "epoch": 0.18482890548849018, + "grad_norm": 11.72866153717041, + "learning_rate": 9.318089701740627e-06, + "loss": 2.5896, + "step": 708800 + }, + { + "epoch": 0.18488105811419234, + "grad_norm": 12.54987621307373, + "learning_rate": 9.317668209954248e-06, + "loss": 2.5961, + "step": 709000 + }, + { + "epoch": 0.18493321073989452, + "grad_norm": 12.830785751342773, + "learning_rate": 9.317246597484065e-06, + "loss": 2.5997, + "step": 709200 + }, + { + "epoch": 0.18498536336559668, + "grad_norm": 10.779577255249023, + "learning_rate": 9.31682486434186e-06, + "loss": 2.608, + "step": 709400 + }, + { + "epoch": 0.18503751599129886, + "grad_norm": 10.413871765136719, + "learning_rate": 9.316403010539424e-06, + "loss": 2.5584, + "step": 709600 + }, + { + "epoch": 0.18508966861700102, + "grad_norm": 12.727044105529785, + "learning_rate": 9.315981036088547e-06, + "loss": 2.5805, + "step": 709800 + }, + { + "epoch": 0.1851418212427032, + "grad_norm": 10.990044593811035, + "learning_rate": 9.315558941001022e-06, + "loss": 2.5659, + "step": 710000 + }, + { + "epoch": 0.18519397386840536, + "grad_norm": 13.053669929504395, + "learning_rate": 9.315136725288648e-06, + "loss": 2.5645, + "step": 710200 + }, + { + "epoch": 0.18524612649410754, + "grad_norm": 12.4849853515625, + "learning_rate": 9.314714388963229e-06, + "loss": 2.5712, + "step": 710400 + }, + { + "epoch": 0.1852982791198097, + "grad_norm": 13.245108604431152, + "learning_rate": 9.314291932036568e-06, + "loss": 2.589, + "step": 
710600 + }, + { + "epoch": 0.18535043174551188, + "grad_norm": 11.096653938293457, + "learning_rate": 9.31386935452047e-06, + "loss": 2.5642, + "step": 710800 + }, + { + "epoch": 0.18540258437121404, + "grad_norm": 12.3607177734375, + "learning_rate": 9.313446656426752e-06, + "loss": 2.5596, + "step": 711000 + }, + { + "epoch": 0.18545473699691623, + "grad_norm": 11.200916290283203, + "learning_rate": 9.313023837767225e-06, + "loss": 2.5321, + "step": 711200 + }, + { + "epoch": 0.18550688962261838, + "grad_norm": 12.221725463867188, + "learning_rate": 9.31260089855371e-06, + "loss": 2.5827, + "step": 711400 + }, + { + "epoch": 0.18555904224832057, + "grad_norm": 11.39847469329834, + "learning_rate": 9.312177838798028e-06, + "loss": 2.581, + "step": 711600 + }, + { + "epoch": 0.18561119487402272, + "grad_norm": 11.100894927978516, + "learning_rate": 9.311754658512003e-06, + "loss": 2.5598, + "step": 711800 + }, + { + "epoch": 0.1856633474997249, + "grad_norm": 11.927329063415527, + "learning_rate": 9.31133135770746e-06, + "loss": 2.5918, + "step": 712000 + }, + { + "epoch": 0.18571550012542706, + "grad_norm": 12.195674896240234, + "learning_rate": 9.310907936396238e-06, + "loss": 2.5352, + "step": 712200 + }, + { + "epoch": 0.18576765275112925, + "grad_norm": 12.455114364624023, + "learning_rate": 9.310484394590168e-06, + "loss": 2.5571, + "step": 712400 + }, + { + "epoch": 0.1858198053768314, + "grad_norm": 12.462471961975098, + "learning_rate": 9.310060732301087e-06, + "loss": 2.5686, + "step": 712600 + }, + { + "epoch": 0.1858719580025336, + "grad_norm": 12.136844635009766, + "learning_rate": 9.30963694954084e-06, + "loss": 2.5705, + "step": 712800 + }, + { + "epoch": 0.18592411062823574, + "grad_norm": 13.6740083694458, + "learning_rate": 9.309213046321272e-06, + "loss": 2.568, + "step": 713000 + }, + { + "epoch": 0.1859762632539379, + "grad_norm": 10.739278793334961, + "learning_rate": 9.308789022654233e-06, + "loss": 2.576, + "step": 713200 + }, + { + "epoch": 0.18602841587964009, + "grad_norm": 12.809683799743652, + "learning_rate": 9.308364878551567e-06, + "loss": 2.5984, + "step": 713400 + }, + { + "epoch": 0.18608056850534224, + "grad_norm": 11.436633110046387, + "learning_rate": 9.30794061402514e-06, + "loss": 2.5605, + "step": 713600 + }, + { + "epoch": 0.18613272113104443, + "grad_norm": 10.629294395446777, + "learning_rate": 9.307516229086802e-06, + "loss": 2.5838, + "step": 713800 + }, + { + "epoch": 0.18618487375674658, + "grad_norm": 11.265149116516113, + "learning_rate": 9.30709172374842e-06, + "loss": 2.5753, + "step": 714000 + }, + { + "epoch": 0.18623702638244877, + "grad_norm": 13.775673866271973, + "learning_rate": 9.306667098021858e-06, + "loss": 2.5764, + "step": 714200 + }, + { + "epoch": 0.18628917900815092, + "grad_norm": 10.820959091186523, + "learning_rate": 9.306242351918983e-06, + "loss": 2.5516, + "step": 714400 + }, + { + "epoch": 0.1863413316338531, + "grad_norm": 11.11262035369873, + "learning_rate": 9.305817485451672e-06, + "loss": 2.5498, + "step": 714600 + }, + { + "epoch": 0.18639348425955526, + "grad_norm": 12.226655006408691, + "learning_rate": 9.305392498631796e-06, + "loss": 2.5372, + "step": 714800 + }, + { + "epoch": 0.18644563688525745, + "grad_norm": 11.85600757598877, + "learning_rate": 9.304967391471236e-06, + "loss": 2.628, + "step": 715000 + }, + { + "epoch": 0.1864977895109596, + "grad_norm": 10.384273529052734, + "learning_rate": 9.304542163981874e-06, + "loss": 2.5591, + "step": 715200 + }, + { + "epoch": 0.1865499421366618, + 
"grad_norm": 10.744622230529785, + "learning_rate": 9.304116816175593e-06, + "loss": 2.5466, + "step": 715400 + }, + { + "epoch": 0.18660209476236395, + "grad_norm": 14.192846298217773, + "learning_rate": 9.303691348064285e-06, + "loss": 2.6048, + "step": 715600 + }, + { + "epoch": 0.18665424738806613, + "grad_norm": 10.374147415161133, + "learning_rate": 9.303265759659842e-06, + "loss": 2.5472, + "step": 715800 + }, + { + "epoch": 0.1867064000137683, + "grad_norm": 12.727148056030273, + "learning_rate": 9.30284005097416e-06, + "loss": 2.5792, + "step": 716000 + }, + { + "epoch": 0.18675855263947047, + "grad_norm": 14.218303680419922, + "learning_rate": 9.302414222019136e-06, + "loss": 2.5839, + "step": 716200 + }, + { + "epoch": 0.18681070526517263, + "grad_norm": 11.11287784576416, + "learning_rate": 9.301988272806675e-06, + "loss": 2.5779, + "step": 716400 + }, + { + "epoch": 0.1868628578908748, + "grad_norm": 9.908751487731934, + "learning_rate": 9.30156220334868e-06, + "loss": 2.562, + "step": 716600 + }, + { + "epoch": 0.18691501051657697, + "grad_norm": 10.67510986328125, + "learning_rate": 9.301136013657061e-06, + "loss": 2.6033, + "step": 716800 + }, + { + "epoch": 0.18696716314227915, + "grad_norm": 10.921405792236328, + "learning_rate": 9.300709703743733e-06, + "loss": 2.5775, + "step": 717000 + }, + { + "epoch": 0.1870193157679813, + "grad_norm": 11.9124116897583, + "learning_rate": 9.300283273620607e-06, + "loss": 2.5652, + "step": 717200 + }, + { + "epoch": 0.1870714683936835, + "grad_norm": 13.153666496276855, + "learning_rate": 9.299856723299608e-06, + "loss": 2.5931, + "step": 717400 + }, + { + "epoch": 0.18712362101938565, + "grad_norm": 11.42650318145752, + "learning_rate": 9.299430052792655e-06, + "loss": 2.5649, + "step": 717600 + }, + { + "epoch": 0.18717577364508783, + "grad_norm": 14.207989692687988, + "learning_rate": 9.299003262111672e-06, + "loss": 2.5943, + "step": 717800 + }, + { + "epoch": 0.18722792627079, + "grad_norm": 12.080742835998535, + "learning_rate": 9.298576351268594e-06, + "loss": 2.5583, + "step": 718000 + }, + { + "epoch": 0.18728007889649217, + "grad_norm": 10.130331039428711, + "learning_rate": 9.298149320275349e-06, + "loss": 2.5811, + "step": 718200 + }, + { + "epoch": 0.18733223152219433, + "grad_norm": 11.960268020629883, + "learning_rate": 9.297722169143875e-06, + "loss": 2.5706, + "step": 718400 + }, + { + "epoch": 0.18738438414789652, + "grad_norm": 12.298662185668945, + "learning_rate": 9.29729489788611e-06, + "loss": 2.5851, + "step": 718600 + }, + { + "epoch": 0.18743653677359867, + "grad_norm": 12.417919158935547, + "learning_rate": 9.296867506514e-06, + "loss": 2.557, + "step": 718800 + }, + { + "epoch": 0.18748868939930083, + "grad_norm": 11.724634170532227, + "learning_rate": 9.296439995039488e-06, + "loss": 2.571, + "step": 719000 + }, + { + "epoch": 0.187540842025003, + "grad_norm": 13.457600593566895, + "learning_rate": 9.296012363474523e-06, + "loss": 2.6037, + "step": 719200 + }, + { + "epoch": 0.18759299465070517, + "grad_norm": 15.612174987792969, + "learning_rate": 9.295584611831059e-06, + "loss": 2.5789, + "step": 719400 + }, + { + "epoch": 0.18764514727640735, + "grad_norm": 11.381507873535156, + "learning_rate": 9.295156740121052e-06, + "loss": 2.5929, + "step": 719600 + }, + { + "epoch": 0.1876972999021095, + "grad_norm": 11.643939971923828, + "learning_rate": 9.294728748356463e-06, + "loss": 2.5697, + "step": 719800 + }, + { + "epoch": 0.1877494525278117, + "grad_norm": 12.142743110656738, + "learning_rate": 
9.294300636549251e-06, + "loss": 2.5505, + "step": 720000 + }, + { + "epoch": 0.18780160515351385, + "grad_norm": 12.137763023376465, + "learning_rate": 9.293872404711388e-06, + "loss": 2.6389, + "step": 720200 + }, + { + "epoch": 0.18785375777921604, + "grad_norm": 12.277729988098145, + "learning_rate": 9.29344405285484e-06, + "loss": 2.5933, + "step": 720400 + }, + { + "epoch": 0.1879059104049182, + "grad_norm": 11.714141845703125, + "learning_rate": 9.293015580991577e-06, + "loss": 2.5719, + "step": 720600 + }, + { + "epoch": 0.18795806303062038, + "grad_norm": 11.93021297454834, + "learning_rate": 9.292586989133581e-06, + "loss": 2.5805, + "step": 720800 + }, + { + "epoch": 0.18801021565632253, + "grad_norm": 12.212942123413086, + "learning_rate": 9.292158277292828e-06, + "loss": 2.5759, + "step": 721000 + }, + { + "epoch": 0.18806236828202472, + "grad_norm": 10.348050117492676, + "learning_rate": 9.2917294454813e-06, + "loss": 2.5756, + "step": 721200 + }, + { + "epoch": 0.18811452090772687, + "grad_norm": 11.41096305847168, + "learning_rate": 9.29130049371099e-06, + "loss": 2.589, + "step": 721400 + }, + { + "epoch": 0.18816667353342906, + "grad_norm": 11.99845027923584, + "learning_rate": 9.290871421993881e-06, + "loss": 2.5932, + "step": 721600 + }, + { + "epoch": 0.18821882615913121, + "grad_norm": 13.32406234741211, + "learning_rate": 9.290442230341967e-06, + "loss": 2.5926, + "step": 721800 + }, + { + "epoch": 0.1882709787848334, + "grad_norm": 12.009814262390137, + "learning_rate": 9.290012918767248e-06, + "loss": 2.5701, + "step": 722000 + }, + { + "epoch": 0.18832313141053555, + "grad_norm": 13.789295196533203, + "learning_rate": 9.289583487281719e-06, + "loss": 2.576, + "step": 722200 + }, + { + "epoch": 0.18837528403623774, + "grad_norm": 10.698989868164062, + "learning_rate": 9.289153935897387e-06, + "loss": 2.5756, + "step": 722400 + }, + { + "epoch": 0.1884274366619399, + "grad_norm": 12.086974143981934, + "learning_rate": 9.288724264626256e-06, + "loss": 2.5829, + "step": 722600 + }, + { + "epoch": 0.18847958928764208, + "grad_norm": 11.715788841247559, + "learning_rate": 9.288294473480337e-06, + "loss": 2.5337, + "step": 722800 + }, + { + "epoch": 0.18853174191334424, + "grad_norm": 10.705584526062012, + "learning_rate": 9.287864562471644e-06, + "loss": 2.5735, + "step": 723000 + }, + { + "epoch": 0.18858389453904642, + "grad_norm": 12.674153327941895, + "learning_rate": 9.287434531612192e-06, + "loss": 2.5843, + "step": 723200 + }, + { + "epoch": 0.18863604716474858, + "grad_norm": 12.425131797790527, + "learning_rate": 9.287004380914e-06, + "loss": 2.5627, + "step": 723400 + }, + { + "epoch": 0.18868819979045076, + "grad_norm": 11.173230171203613, + "learning_rate": 9.286574110389094e-06, + "loss": 2.5609, + "step": 723600 + }, + { + "epoch": 0.18874035241615292, + "grad_norm": 12.521120071411133, + "learning_rate": 9.286143720049498e-06, + "loss": 2.5708, + "step": 723800 + }, + { + "epoch": 0.1887925050418551, + "grad_norm": 10.894947052001953, + "learning_rate": 9.285713209907243e-06, + "loss": 2.524, + "step": 724000 + }, + { + "epoch": 0.18884465766755726, + "grad_norm": 11.677881240844727, + "learning_rate": 9.285282579974362e-06, + "loss": 2.6056, + "step": 724200 + }, + { + "epoch": 0.18889681029325944, + "grad_norm": 12.158601760864258, + "learning_rate": 9.284851830262892e-06, + "loss": 2.6083, + "step": 724400 + }, + { + "epoch": 0.1889489629189616, + "grad_norm": 13.806035995483398, + "learning_rate": 9.284420960784873e-06, + "loss": 2.6437, + "step": 
724600 + }, + { + "epoch": 0.18900111554466376, + "grad_norm": 12.63658332824707, + "learning_rate": 9.283989971552348e-06, + "loss": 2.5694, + "step": 724800 + }, + { + "epoch": 0.18905326817036594, + "grad_norm": 13.176187515258789, + "learning_rate": 9.283558862577363e-06, + "loss": 2.5266, + "step": 725000 + }, + { + "epoch": 0.1891054207960681, + "grad_norm": 11.850820541381836, + "learning_rate": 9.28312763387197e-06, + "loss": 2.5799, + "step": 725200 + }, + { + "epoch": 0.18915757342177028, + "grad_norm": 12.633203506469727, + "learning_rate": 9.282696285448219e-06, + "loss": 2.5638, + "step": 725400 + }, + { + "epoch": 0.18920972604747244, + "grad_norm": 11.37259292602539, + "learning_rate": 9.28226481731817e-06, + "loss": 2.5573, + "step": 725600 + }, + { + "epoch": 0.18926187867317462, + "grad_norm": 12.123799324035645, + "learning_rate": 9.28183322949388e-06, + "loss": 2.5736, + "step": 725800 + }, + { + "epoch": 0.18931403129887678, + "grad_norm": 11.90881061553955, + "learning_rate": 9.281401521987416e-06, + "loss": 2.5502, + "step": 726000 + }, + { + "epoch": 0.18936618392457896, + "grad_norm": 12.91506576538086, + "learning_rate": 9.280969694810844e-06, + "loss": 2.5229, + "step": 726200 + }, + { + "epoch": 0.18941833655028112, + "grad_norm": 11.650004386901855, + "learning_rate": 9.28053774797623e-06, + "loss": 2.5783, + "step": 726400 + }, + { + "epoch": 0.1894704891759833, + "grad_norm": 12.156107902526855, + "learning_rate": 9.280105681495652e-06, + "loss": 2.604, + "step": 726600 + }, + { + "epoch": 0.18952264180168546, + "grad_norm": 10.867297172546387, + "learning_rate": 9.279673495381185e-06, + "loss": 2.5855, + "step": 726800 + }, + { + "epoch": 0.18957479442738764, + "grad_norm": 12.994755744934082, + "learning_rate": 9.279241189644908e-06, + "loss": 2.5467, + "step": 727000 + }, + { + "epoch": 0.1896269470530898, + "grad_norm": 11.489134788513184, + "learning_rate": 9.278808764298906e-06, + "loss": 2.5654, + "step": 727200 + }, + { + "epoch": 0.18967909967879198, + "grad_norm": 12.920225143432617, + "learning_rate": 9.278376219355267e-06, + "loss": 2.5749, + "step": 727400 + }, + { + "epoch": 0.18973125230449414, + "grad_norm": 9.510746002197266, + "learning_rate": 9.277943554826076e-06, + "loss": 2.5537, + "step": 727600 + }, + { + "epoch": 0.18978340493019633, + "grad_norm": 12.295012474060059, + "learning_rate": 9.277510770723433e-06, + "loss": 2.5912, + "step": 727800 + }, + { + "epoch": 0.18983555755589848, + "grad_norm": 10.54262638092041, + "learning_rate": 9.277077867059432e-06, + "loss": 2.591, + "step": 728000 + }, + { + "epoch": 0.18988771018160067, + "grad_norm": 11.955072402954102, + "learning_rate": 9.276644843846171e-06, + "loss": 2.5709, + "step": 728200 + }, + { + "epoch": 0.18993986280730282, + "grad_norm": 10.959343910217285, + "learning_rate": 9.276211701095756e-06, + "loss": 2.5735, + "step": 728400 + }, + { + "epoch": 0.189992015433005, + "grad_norm": 12.265323638916016, + "learning_rate": 9.275778438820294e-06, + "loss": 2.5402, + "step": 728600 + }, + { + "epoch": 0.19004416805870716, + "grad_norm": 13.312240600585938, + "learning_rate": 9.275345057031894e-06, + "loss": 2.5894, + "step": 728800 + }, + { + "epoch": 0.19009632068440935, + "grad_norm": 12.820699691772461, + "learning_rate": 9.27491155574267e-06, + "loss": 2.5495, + "step": 729000 + }, + { + "epoch": 0.1901484733101115, + "grad_norm": 12.239838600158691, + "learning_rate": 9.27447793496474e-06, + "loss": 2.5772, + "step": 729200 + }, + { + "epoch": 0.1902006259358137, + 
"grad_norm": 12.58304214477539, + "learning_rate": 9.274044194710222e-06, + "loss": 2.5978, + "step": 729400 + }, + { + "epoch": 0.19025277856151585, + "grad_norm": 12.55810260772705, + "learning_rate": 9.27361033499124e-06, + "loss": 2.6003, + "step": 729600 + }, + { + "epoch": 0.19030493118721803, + "grad_norm": 11.778428077697754, + "learning_rate": 9.273176355819922e-06, + "loss": 2.5742, + "step": 729800 + }, + { + "epoch": 0.19035708381292019, + "grad_norm": 13.294259071350098, + "learning_rate": 9.272742257208399e-06, + "loss": 2.5477, + "step": 730000 + }, + { + "epoch": 0.19040923643862237, + "grad_norm": 13.392877578735352, + "learning_rate": 9.272308039168802e-06, + "loss": 2.535, + "step": 730200 + }, + { + "epoch": 0.19046138906432453, + "grad_norm": 11.833348274230957, + "learning_rate": 9.271873701713269e-06, + "loss": 2.5452, + "step": 730400 + }, + { + "epoch": 0.19051354169002668, + "grad_norm": 12.308874130249023, + "learning_rate": 9.27143924485394e-06, + "loss": 2.5484, + "step": 730600 + }, + { + "epoch": 0.19056569431572887, + "grad_norm": 10.478835105895996, + "learning_rate": 9.27100466860296e-06, + "loss": 2.5708, + "step": 730800 + }, + { + "epoch": 0.19061784694143102, + "grad_norm": 11.751260757446289, + "learning_rate": 9.270569972972475e-06, + "loss": 2.5554, + "step": 731000 + }, + { + "epoch": 0.1906699995671332, + "grad_norm": 13.028668403625488, + "learning_rate": 9.270135157974633e-06, + "loss": 2.5585, + "step": 731200 + }, + { + "epoch": 0.19072215219283536, + "grad_norm": 11.351934432983398, + "learning_rate": 9.269700223621591e-06, + "loss": 2.5808, + "step": 731400 + }, + { + "epoch": 0.19077430481853755, + "grad_norm": 11.560927391052246, + "learning_rate": 9.269265169925506e-06, + "loss": 2.5884, + "step": 731600 + }, + { + "epoch": 0.1908264574442397, + "grad_norm": 11.362569808959961, + "learning_rate": 9.268829996898537e-06, + "loss": 2.6066, + "step": 731800 + }, + { + "epoch": 0.1908786100699419, + "grad_norm": 12.66362476348877, + "learning_rate": 9.268394704552845e-06, + "loss": 2.5914, + "step": 732000 + }, + { + "epoch": 0.19093076269564405, + "grad_norm": 10.702469825744629, + "learning_rate": 9.2679592929006e-06, + "loss": 2.5223, + "step": 732200 + }, + { + "epoch": 0.19098291532134623, + "grad_norm": 12.114631652832031, + "learning_rate": 9.267523761953973e-06, + "loss": 2.5882, + "step": 732400 + }, + { + "epoch": 0.1910350679470484, + "grad_norm": 10.952610969543457, + "learning_rate": 9.267088111725135e-06, + "loss": 2.5453, + "step": 732600 + }, + { + "epoch": 0.19108722057275057, + "grad_norm": 11.303411483764648, + "learning_rate": 9.266652342226264e-06, + "loss": 2.5963, + "step": 732800 + }, + { + "epoch": 0.19113937319845273, + "grad_norm": 11.67508602142334, + "learning_rate": 9.26621645346954e-06, + "loss": 2.5723, + "step": 733000 + }, + { + "epoch": 0.1911915258241549, + "grad_norm": 11.651264190673828, + "learning_rate": 9.265780445467148e-06, + "loss": 2.5943, + "step": 733200 + }, + { + "epoch": 0.19124367844985707, + "grad_norm": 12.28518009185791, + "learning_rate": 9.265344318231274e-06, + "loss": 2.5283, + "step": 733400 + }, + { + "epoch": 0.19129583107555925, + "grad_norm": 13.750175476074219, + "learning_rate": 9.264908071774108e-06, + "loss": 2.5599, + "step": 733600 + }, + { + "epoch": 0.1913479837012614, + "grad_norm": 12.860754013061523, + "learning_rate": 9.264471706107846e-06, + "loss": 2.5571, + "step": 733800 + }, + { + "epoch": 0.1914001363269636, + "grad_norm": 11.767181396484375, + "learning_rate": 
9.264035221244678e-06, + "loss": 2.5767, + "step": 734000 + }, + { + "epoch": 0.19145228895266575, + "grad_norm": 13.27584171295166, + "learning_rate": 9.263598617196812e-06, + "loss": 2.5659, + "step": 734200 + }, + { + "epoch": 0.19150444157836793, + "grad_norm": 11.297654151916504, + "learning_rate": 9.263161893976446e-06, + "loss": 2.5645, + "step": 734400 + }, + { + "epoch": 0.1915565942040701, + "grad_norm": 13.385966300964355, + "learning_rate": 9.262725051595793e-06, + "loss": 2.5824, + "step": 734600 + }, + { + "epoch": 0.19160874682977228, + "grad_norm": 12.745625495910645, + "learning_rate": 9.262288090067058e-06, + "loss": 2.5605, + "step": 734800 + }, + { + "epoch": 0.19166089945547443, + "grad_norm": 13.604351043701172, + "learning_rate": 9.261851009402457e-06, + "loss": 2.5457, + "step": 735000 + }, + { + "epoch": 0.19171305208117662, + "grad_norm": 14.262986183166504, + "learning_rate": 9.261413809614204e-06, + "loss": 2.5429, + "step": 735200 + }, + { + "epoch": 0.19176520470687877, + "grad_norm": 13.41905689239502, + "learning_rate": 9.260976490714524e-06, + "loss": 2.5912, + "step": 735400 + }, + { + "epoch": 0.19181735733258096, + "grad_norm": 11.548351287841797, + "learning_rate": 9.260539052715636e-06, + "loss": 2.55, + "step": 735600 + }, + { + "epoch": 0.1918695099582831, + "grad_norm": 12.51290512084961, + "learning_rate": 9.26010149562977e-06, + "loss": 2.557, + "step": 735800 + }, + { + "epoch": 0.1919216625839853, + "grad_norm": 11.825644493103027, + "learning_rate": 9.259663819469154e-06, + "loss": 2.5657, + "step": 736000 + }, + { + "epoch": 0.19197381520968745, + "grad_norm": 11.373788833618164, + "learning_rate": 9.259226024246024e-06, + "loss": 2.5175, + "step": 736200 + }, + { + "epoch": 0.19202596783538964, + "grad_norm": 13.40487003326416, + "learning_rate": 9.258788109972616e-06, + "loss": 2.5853, + "step": 736400 + }, + { + "epoch": 0.1920781204610918, + "grad_norm": 11.846600532531738, + "learning_rate": 9.258350076661168e-06, + "loss": 2.5468, + "step": 736600 + }, + { + "epoch": 0.19213027308679395, + "grad_norm": 11.58352279663086, + "learning_rate": 9.257911924323926e-06, + "loss": 2.5769, + "step": 736800 + }, + { + "epoch": 0.19218242571249614, + "grad_norm": 11.748197555541992, + "learning_rate": 9.257473652973137e-06, + "loss": 2.5518, + "step": 737000 + }, + { + "epoch": 0.1922345783381983, + "grad_norm": 10.648813247680664, + "learning_rate": 9.25703526262105e-06, + "loss": 2.5885, + "step": 737200 + }, + { + "epoch": 0.19228673096390048, + "grad_norm": 13.590847969055176, + "learning_rate": 9.256596753279918e-06, + "loss": 2.5561, + "step": 737400 + }, + { + "epoch": 0.19233888358960263, + "grad_norm": 11.725268363952637, + "learning_rate": 9.256158124962e-06, + "loss": 2.5282, + "step": 737600 + }, + { + "epoch": 0.19239103621530482, + "grad_norm": 12.270450592041016, + "learning_rate": 9.255719377679552e-06, + "loss": 2.5635, + "step": 737800 + }, + { + "epoch": 0.19244318884100697, + "grad_norm": 10.960904121398926, + "learning_rate": 9.255280511444842e-06, + "loss": 2.5165, + "step": 738000 + }, + { + "epoch": 0.19249534146670916, + "grad_norm": 11.803460121154785, + "learning_rate": 9.254841526270137e-06, + "loss": 2.5727, + "step": 738200 + }, + { + "epoch": 0.19254749409241131, + "grad_norm": 12.786358833312988, + "learning_rate": 9.254402422167704e-06, + "loss": 2.5412, + "step": 738400 + }, + { + "epoch": 0.1925996467181135, + "grad_norm": 12.822945594787598, + "learning_rate": 9.253963199149818e-06, + "loss": 2.5498, + "step": 
738600 + }, + { + "epoch": 0.19265179934381566, + "grad_norm": 11.774388313293457, + "learning_rate": 9.253523857228754e-06, + "loss": 2.5893, + "step": 738800 + }, + { + "epoch": 0.19270395196951784, + "grad_norm": 12.514403343200684, + "learning_rate": 9.253084396416795e-06, + "loss": 2.6186, + "step": 739000 + }, + { + "epoch": 0.19275610459522, + "grad_norm": 11.048097610473633, + "learning_rate": 9.252644816726224e-06, + "loss": 2.5628, + "step": 739200 + }, + { + "epoch": 0.19280825722092218, + "grad_norm": 11.978404998779297, + "learning_rate": 9.252205118169326e-06, + "loss": 2.592, + "step": 739400 + }, + { + "epoch": 0.19286040984662434, + "grad_norm": 11.403801918029785, + "learning_rate": 9.25176530075839e-06, + "loss": 2.5574, + "step": 739600 + }, + { + "epoch": 0.19291256247232652, + "grad_norm": 13.35159969329834, + "learning_rate": 9.251325364505715e-06, + "loss": 2.5446, + "step": 739800 + }, + { + "epoch": 0.19296471509802868, + "grad_norm": 11.937582969665527, + "learning_rate": 9.250885309423592e-06, + "loss": 2.5404, + "step": 740000 + }, + { + "epoch": 0.19301686772373086, + "grad_norm": 12.94690990447998, + "learning_rate": 9.250445135524325e-06, + "loss": 2.5505, + "step": 740200 + }, + { + "epoch": 0.19306902034943302, + "grad_norm": 13.668988227844238, + "learning_rate": 9.250004842820212e-06, + "loss": 2.5818, + "step": 740400 + }, + { + "epoch": 0.1931211729751352, + "grad_norm": 11.1817626953125, + "learning_rate": 9.249564431323566e-06, + "loss": 2.574, + "step": 740600 + }, + { + "epoch": 0.19317332560083736, + "grad_norm": 12.859142303466797, + "learning_rate": 9.249123901046694e-06, + "loss": 2.5295, + "step": 740800 + }, + { + "epoch": 0.19322547822653954, + "grad_norm": 10.10409164428711, + "learning_rate": 9.248683252001909e-06, + "loss": 2.557, + "step": 741000 + }, + { + "epoch": 0.1932776308522417, + "grad_norm": 13.226909637451172, + "learning_rate": 9.248242484201528e-06, + "loss": 2.5851, + "step": 741200 + }, + { + "epoch": 0.19332978347794388, + "grad_norm": 12.193153381347656, + "learning_rate": 9.247801597657871e-06, + "loss": 2.5712, + "step": 741400 + }, + { + "epoch": 0.19338193610364604, + "grad_norm": 12.625770568847656, + "learning_rate": 9.24736059238326e-06, + "loss": 2.5571, + "step": 741600 + }, + { + "epoch": 0.19343408872934822, + "grad_norm": 13.078922271728516, + "learning_rate": 9.246919468390026e-06, + "loss": 2.5776, + "step": 741800 + }, + { + "epoch": 0.19348624135505038, + "grad_norm": 12.740152359008789, + "learning_rate": 9.246478225690496e-06, + "loss": 2.5985, + "step": 742000 + }, + { + "epoch": 0.19353839398075257, + "grad_norm": 13.259086608886719, + "learning_rate": 9.246036864297001e-06, + "loss": 2.5446, + "step": 742200 + }, + { + "epoch": 0.19359054660645472, + "grad_norm": 12.893516540527344, + "learning_rate": 9.245595384221881e-06, + "loss": 2.5904, + "step": 742400 + }, + { + "epoch": 0.19364269923215688, + "grad_norm": 11.758481979370117, + "learning_rate": 9.245153785477475e-06, + "loss": 2.5576, + "step": 742600 + }, + { + "epoch": 0.19369485185785906, + "grad_norm": 11.747676849365234, + "learning_rate": 9.244712068076125e-06, + "loss": 2.5636, + "step": 742800 + }, + { + "epoch": 0.19374700448356122, + "grad_norm": 12.008854866027832, + "learning_rate": 9.24427023203018e-06, + "loss": 2.5726, + "step": 743000 + }, + { + "epoch": 0.1937991571092634, + "grad_norm": 9.737689971923828, + "learning_rate": 9.243828277351985e-06, + "loss": 2.5412, + "step": 743200 + }, + { + "epoch": 0.19385130973496556, + 
"grad_norm": 13.055411338806152, + "learning_rate": 9.2433862040539e-06, + "loss": 2.5482, + "step": 743400 + }, + { + "epoch": 0.19390346236066774, + "grad_norm": 11.527270317077637, + "learning_rate": 9.242944012148273e-06, + "loss": 2.5932, + "step": 743600 + }, + { + "epoch": 0.1939556149863699, + "grad_norm": 11.464157104492188, + "learning_rate": 9.242501701647473e-06, + "loss": 2.5869, + "step": 743800 + }, + { + "epoch": 0.19400776761207209, + "grad_norm": 13.078601837158203, + "learning_rate": 9.242059272563858e-06, + "loss": 2.54, + "step": 744000 + }, + { + "epoch": 0.19405992023777424, + "grad_norm": 10.983396530151367, + "learning_rate": 9.241616724909794e-06, + "loss": 2.5761, + "step": 744200 + }, + { + "epoch": 0.19411207286347643, + "grad_norm": 10.754619598388672, + "learning_rate": 9.241174058697651e-06, + "loss": 2.5597, + "step": 744400 + }, + { + "epoch": 0.19416422548917858, + "grad_norm": 13.653790473937988, + "learning_rate": 9.240731273939804e-06, + "loss": 2.5787, + "step": 744600 + }, + { + "epoch": 0.19421637811488077, + "grad_norm": 11.377376556396484, + "learning_rate": 9.240288370648628e-06, + "loss": 2.5587, + "step": 744800 + }, + { + "epoch": 0.19426853074058292, + "grad_norm": 10.749224662780762, + "learning_rate": 9.239845348836503e-06, + "loss": 2.5793, + "step": 745000 + }, + { + "epoch": 0.1943206833662851, + "grad_norm": 13.052656173706055, + "learning_rate": 9.23940220851581e-06, + "loss": 2.5555, + "step": 745200 + }, + { + "epoch": 0.19437283599198726, + "grad_norm": 12.965775489807129, + "learning_rate": 9.23895894969894e-06, + "loss": 2.5797, + "step": 745400 + }, + { + "epoch": 0.19442498861768945, + "grad_norm": 13.597799301147461, + "learning_rate": 9.238515572398278e-06, + "loss": 2.5492, + "step": 745600 + }, + { + "epoch": 0.1944771412433916, + "grad_norm": 11.634117126464844, + "learning_rate": 9.23807207662622e-06, + "loss": 2.5746, + "step": 745800 + }, + { + "epoch": 0.1945292938690938, + "grad_norm": 12.693094253540039, + "learning_rate": 9.237628462395158e-06, + "loss": 2.6023, + "step": 746000 + }, + { + "epoch": 0.19458144649479595, + "grad_norm": 15.489215850830078, + "learning_rate": 9.237184729717496e-06, + "loss": 2.5555, + "step": 746200 + }, + { + "epoch": 0.19463359912049813, + "grad_norm": 13.176939010620117, + "learning_rate": 9.236740878605635e-06, + "loss": 2.5887, + "step": 746400 + }, + { + "epoch": 0.1946857517462003, + "grad_norm": 12.691610336303711, + "learning_rate": 9.23629690907198e-06, + "loss": 2.5661, + "step": 746600 + }, + { + "epoch": 0.19473790437190247, + "grad_norm": 11.665881156921387, + "learning_rate": 9.235852821128943e-06, + "loss": 2.5562, + "step": 746800 + }, + { + "epoch": 0.19479005699760463, + "grad_norm": 13.195979118347168, + "learning_rate": 9.235408614788937e-06, + "loss": 2.5584, + "step": 747000 + }, + { + "epoch": 0.1948422096233068, + "grad_norm": 11.392936706542969, + "learning_rate": 9.234964290064375e-06, + "loss": 2.5803, + "step": 747200 + }, + { + "epoch": 0.19489436224900897, + "grad_norm": 11.263978004455566, + "learning_rate": 9.234519846967678e-06, + "loss": 2.6038, + "step": 747400 + }, + { + "epoch": 0.19494651487471115, + "grad_norm": 10.900824546813965, + "learning_rate": 9.234075285511268e-06, + "loss": 2.5391, + "step": 747600 + }, + { + "epoch": 0.1949986675004133, + "grad_norm": 11.745079040527344, + "learning_rate": 9.233630605707573e-06, + "loss": 2.6114, + "step": 747800 + }, + { + "epoch": 0.1950508201261155, + "grad_norm": 11.821778297424316, + "learning_rate": 
9.23318580756902e-06, + "loss": 2.6185, + "step": 748000 + }, + { + "epoch": 0.19510297275181765, + "grad_norm": 11.111929893493652, + "learning_rate": 9.232740891108042e-06, + "loss": 2.6214, + "step": 748200 + }, + { + "epoch": 0.1951551253775198, + "grad_norm": 11.941657066345215, + "learning_rate": 9.232295856337078e-06, + "loss": 2.5602, + "step": 748400 + }, + { + "epoch": 0.195207278003222, + "grad_norm": 11.378263473510742, + "learning_rate": 9.231850703268563e-06, + "loss": 2.5749, + "step": 748600 + }, + { + "epoch": 0.19525943062892415, + "grad_norm": 10.591818809509277, + "learning_rate": 9.231405431914942e-06, + "loss": 2.5177, + "step": 748800 + }, + { + "epoch": 0.19531158325462633, + "grad_norm": 12.579631805419922, + "learning_rate": 9.230960042288661e-06, + "loss": 2.5455, + "step": 749000 + }, + { + "epoch": 0.1953637358803285, + "grad_norm": 13.101773262023926, + "learning_rate": 9.230514534402166e-06, + "loss": 2.6177, + "step": 749200 + }, + { + "epoch": 0.19541588850603067, + "grad_norm": 12.09244441986084, + "learning_rate": 9.230068908267916e-06, + "loss": 2.5752, + "step": 749400 + }, + { + "epoch": 0.19546804113173283, + "grad_norm": 12.434764862060547, + "learning_rate": 9.229623163898358e-06, + "loss": 2.5857, + "step": 749600 + }, + { + "epoch": 0.195520193757435, + "grad_norm": 11.809731483459473, + "learning_rate": 9.229177301305957e-06, + "loss": 2.5606, + "step": 749800 + }, + { + "epoch": 0.19557234638313717, + "grad_norm": 12.460705757141113, + "learning_rate": 9.228731320503174e-06, + "loss": 2.574, + "step": 750000 + }, + { + "epoch": 0.19562449900883935, + "grad_norm": 12.289457321166992, + "learning_rate": 9.228285221502475e-06, + "loss": 2.5422, + "step": 750200 + }, + { + "epoch": 0.1956766516345415, + "grad_norm": 12.129727363586426, + "learning_rate": 9.227839004316329e-06, + "loss": 2.5749, + "step": 750400 + }, + { + "epoch": 0.1957288042602437, + "grad_norm": 12.88148307800293, + "learning_rate": 9.227392668957208e-06, + "loss": 2.6006, + "step": 750600 + }, + { + "epoch": 0.19578095688594585, + "grad_norm": 11.499628067016602, + "learning_rate": 9.226946215437586e-06, + "loss": 2.5805, + "step": 750800 + }, + { + "epoch": 0.19583310951164803, + "grad_norm": 11.71347427368164, + "learning_rate": 9.226499643769945e-06, + "loss": 2.587, + "step": 751000 + }, + { + "epoch": 0.1958852621373502, + "grad_norm": 11.851282119750977, + "learning_rate": 9.226052953966766e-06, + "loss": 2.5516, + "step": 751200 + }, + { + "epoch": 0.19593741476305238, + "grad_norm": 14.087993621826172, + "learning_rate": 9.225606146040533e-06, + "loss": 2.5593, + "step": 751400 + }, + { + "epoch": 0.19598956738875453, + "grad_norm": 13.594001770019531, + "learning_rate": 9.225159220003737e-06, + "loss": 2.5425, + "step": 751600 + }, + { + "epoch": 0.19604172001445672, + "grad_norm": 10.394376754760742, + "learning_rate": 9.224712175868866e-06, + "loss": 2.5569, + "step": 751800 + }, + { + "epoch": 0.19609387264015887, + "grad_norm": 13.085862159729004, + "learning_rate": 9.224265013648421e-06, + "loss": 2.5548, + "step": 752000 + }, + { + "epoch": 0.19614602526586106, + "grad_norm": 12.25311279296875, + "learning_rate": 9.223817733354898e-06, + "loss": 2.5291, + "step": 752200 + }, + { + "epoch": 0.1961981778915632, + "grad_norm": 10.312944412231445, + "learning_rate": 9.223370335000799e-06, + "loss": 2.5391, + "step": 752400 + }, + { + "epoch": 0.1962503305172654, + "grad_norm": 13.032076835632324, + "learning_rate": 9.22292281859863e-06, + "loss": 2.5997, + "step": 
752600 + }, + { + "epoch": 0.19630248314296755, + "grad_norm": 11.651784896850586, + "learning_rate": 9.222475184160897e-06, + "loss": 2.5493, + "step": 752800 + }, + { + "epoch": 0.19635463576866974, + "grad_norm": 12.205024719238281, + "learning_rate": 9.222027431700116e-06, + "loss": 2.5583, + "step": 753000 + }, + { + "epoch": 0.1964067883943719, + "grad_norm": 12.745562553405762, + "learning_rate": 9.221579561228799e-06, + "loss": 2.5554, + "step": 753200 + }, + { + "epoch": 0.19645894102007408, + "grad_norm": 13.312294006347656, + "learning_rate": 9.221131572759464e-06, + "loss": 2.5625, + "step": 753400 + }, + { + "epoch": 0.19651109364577624, + "grad_norm": 11.942469596862793, + "learning_rate": 9.220683466304638e-06, + "loss": 2.5518, + "step": 753600 + }, + { + "epoch": 0.19656324627147842, + "grad_norm": 12.910618782043457, + "learning_rate": 9.22023524187684e-06, + "loss": 2.554, + "step": 753800 + }, + { + "epoch": 0.19661539889718058, + "grad_norm": 9.96152400970459, + "learning_rate": 9.219786899488603e-06, + "loss": 2.5309, + "step": 754000 + }, + { + "epoch": 0.19666755152288273, + "grad_norm": 12.984268188476562, + "learning_rate": 9.219338439152455e-06, + "loss": 2.5723, + "step": 754200 + }, + { + "epoch": 0.19671970414858492, + "grad_norm": 12.589516639709473, + "learning_rate": 9.218889860880932e-06, + "loss": 2.5655, + "step": 754400 + }, + { + "epoch": 0.19677185677428707, + "grad_norm": 13.960628509521484, + "learning_rate": 9.218441164686573e-06, + "loss": 2.5815, + "step": 754600 + }, + { + "epoch": 0.19682400939998926, + "grad_norm": 11.358234405517578, + "learning_rate": 9.217992350581921e-06, + "loss": 2.574, + "step": 754800 + }, + { + "epoch": 0.19687616202569141, + "grad_norm": 12.214129447937012, + "learning_rate": 9.217543418579517e-06, + "loss": 2.5392, + "step": 755000 + }, + { + "epoch": 0.1969283146513936, + "grad_norm": 14.097967147827148, + "learning_rate": 9.217094368691913e-06, + "loss": 2.6056, + "step": 755200 + }, + { + "epoch": 0.19698046727709576, + "grad_norm": 12.386037826538086, + "learning_rate": 9.216645200931657e-06, + "loss": 2.5794, + "step": 755400 + }, + { + "epoch": 0.19703261990279794, + "grad_norm": 12.34218692779541, + "learning_rate": 9.216195915311307e-06, + "loss": 2.5821, + "step": 755600 + }, + { + "epoch": 0.1970847725285001, + "grad_norm": 13.724196434020996, + "learning_rate": 9.215746511843422e-06, + "loss": 2.5602, + "step": 755800 + }, + { + "epoch": 0.19713692515420228, + "grad_norm": 12.456463813781738, + "learning_rate": 9.21529699054056e-06, + "loss": 2.5832, + "step": 756000 + }, + { + "epoch": 0.19718907777990444, + "grad_norm": 12.305964469909668, + "learning_rate": 9.214847351415283e-06, + "loss": 2.5294, + "step": 756200 + }, + { + "epoch": 0.19724123040560662, + "grad_norm": 11.164262771606445, + "learning_rate": 9.214397594480165e-06, + "loss": 2.5495, + "step": 756400 + }, + { + "epoch": 0.19729338303130878, + "grad_norm": 13.309433937072754, + "learning_rate": 9.213947719747775e-06, + "loss": 2.5062, + "step": 756600 + }, + { + "epoch": 0.19734553565701096, + "grad_norm": 12.617058753967285, + "learning_rate": 9.213497727230688e-06, + "loss": 2.5857, + "step": 756800 + }, + { + "epoch": 0.19739768828271312, + "grad_norm": 11.997270584106445, + "learning_rate": 9.21304761694148e-06, + "loss": 2.5594, + "step": 757000 + }, + { + "epoch": 0.1974498409084153, + "grad_norm": 11.326188087463379, + "learning_rate": 9.212597388892734e-06, + "loss": 2.5126, + "step": 757200 + }, + { + "epoch": 0.19750199353411746, 
+ "grad_norm": 13.245183944702148, + "learning_rate": 9.212147043097034e-06, + "loss": 2.5587, + "step": 757400 + }, + { + "epoch": 0.19755414615981964, + "grad_norm": 12.528741836547852, + "learning_rate": 9.211696579566967e-06, + "loss": 2.5524, + "step": 757600 + }, + { + "epoch": 0.1976062987855218, + "grad_norm": 12.96116828918457, + "learning_rate": 9.211245998315124e-06, + "loss": 2.5411, + "step": 757800 + }, + { + "epoch": 0.19765845141122398, + "grad_norm": 14.029032707214355, + "learning_rate": 9.210795299354101e-06, + "loss": 2.5702, + "step": 758000 + }, + { + "epoch": 0.19771060403692614, + "grad_norm": 12.810486793518066, + "learning_rate": 9.210344482696491e-06, + "loss": 2.5369, + "step": 758200 + }, + { + "epoch": 0.19776275666262833, + "grad_norm": 11.713202476501465, + "learning_rate": 9.2098935483549e-06, + "loss": 2.5273, + "step": 758400 + }, + { + "epoch": 0.19781490928833048, + "grad_norm": 11.515366554260254, + "learning_rate": 9.209442496341932e-06, + "loss": 2.5773, + "step": 758600 + }, + { + "epoch": 0.19786706191403267, + "grad_norm": 12.124978065490723, + "learning_rate": 9.208991326670189e-06, + "loss": 2.5723, + "step": 758800 + }, + { + "epoch": 0.19791921453973482, + "grad_norm": 11.972982406616211, + "learning_rate": 9.208540039352288e-06, + "loss": 2.5797, + "step": 759000 + }, + { + "epoch": 0.197971367165437, + "grad_norm": 12.521751403808594, + "learning_rate": 9.208088634400838e-06, + "loss": 2.5721, + "step": 759200 + }, + { + "epoch": 0.19802351979113916, + "grad_norm": 12.26280403137207, + "learning_rate": 9.20763711182846e-06, + "loss": 2.5352, + "step": 759400 + }, + { + "epoch": 0.19807567241684135, + "grad_norm": 11.379363059997559, + "learning_rate": 9.207185471647774e-06, + "loss": 2.5372, + "step": 759600 + }, + { + "epoch": 0.1981278250425435, + "grad_norm": 11.682640075683594, + "learning_rate": 9.2067337138714e-06, + "loss": 2.584, + "step": 759800 + }, + { + "epoch": 0.1981799776682457, + "grad_norm": 10.798892974853516, + "learning_rate": 9.206281838511972e-06, + "loss": 2.5728, + "step": 760000 + }, + { + "epoch": 0.19823213029394784, + "grad_norm": 13.537489891052246, + "learning_rate": 9.205829845582114e-06, + "loss": 2.5534, + "step": 760200 + }, + { + "epoch": 0.19828428291965, + "grad_norm": 12.42953872680664, + "learning_rate": 9.205377735094462e-06, + "loss": 2.5662, + "step": 760400 + }, + { + "epoch": 0.19833643554535219, + "grad_norm": 10.437345504760742, + "learning_rate": 9.204925507061655e-06, + "loss": 2.5577, + "step": 760600 + }, + { + "epoch": 0.19838858817105434, + "grad_norm": 12.269465446472168, + "learning_rate": 9.204473161496331e-06, + "loss": 2.5762, + "step": 760800 + }, + { + "epoch": 0.19844074079675653, + "grad_norm": 11.716769218444824, + "learning_rate": 9.204020698411133e-06, + "loss": 2.5548, + "step": 761000 + }, + { + "epoch": 0.19849289342245868, + "grad_norm": 13.604849815368652, + "learning_rate": 9.20356811781871e-06, + "loss": 2.5544, + "step": 761200 + }, + { + "epoch": 0.19854504604816087, + "grad_norm": 12.914355278015137, + "learning_rate": 9.203115419731711e-06, + "loss": 2.5554, + "step": 761400 + }, + { + "epoch": 0.19859719867386302, + "grad_norm": 13.604276657104492, + "learning_rate": 9.20266260416279e-06, + "loss": 2.556, + "step": 761600 + }, + { + "epoch": 0.1986493512995652, + "grad_norm": 12.59882640838623, + "learning_rate": 9.202209671124605e-06, + "loss": 2.5409, + "step": 761800 + }, + { + "epoch": 0.19870150392526736, + "grad_norm": 12.737641334533691, + "learning_rate": 
9.20175662062981e-06, + "loss": 2.5656, + "step": 762000 + }, + { + "epoch": 0.19875365655096955, + "grad_norm": 11.206287384033203, + "learning_rate": 9.201303452691078e-06, + "loss": 2.5629, + "step": 762200 + }, + { + "epoch": 0.1988058091766717, + "grad_norm": 12.095778465270996, + "learning_rate": 9.200850167321067e-06, + "loss": 2.5611, + "step": 762400 + }, + { + "epoch": 0.1988579618023739, + "grad_norm": 11.354753494262695, + "learning_rate": 9.20039676453245e-06, + "loss": 2.5851, + "step": 762600 + }, + { + "epoch": 0.19891011442807605, + "grad_norm": 10.592757225036621, + "learning_rate": 9.199943244337901e-06, + "loss": 2.5733, + "step": 762800 + }, + { + "epoch": 0.19896226705377823, + "grad_norm": 12.773603439331055, + "learning_rate": 9.199489606750095e-06, + "loss": 2.578, + "step": 763000 + }, + { + "epoch": 0.1990144196794804, + "grad_norm": 11.842935562133789, + "learning_rate": 9.199035851781712e-06, + "loss": 2.5338, + "step": 763200 + }, + { + "epoch": 0.19906657230518257, + "grad_norm": 11.150446891784668, + "learning_rate": 9.198581979445437e-06, + "loss": 2.5977, + "step": 763400 + }, + { + "epoch": 0.19911872493088473, + "grad_norm": 13.279424667358398, + "learning_rate": 9.198127989753953e-06, + "loss": 2.5638, + "step": 763600 + }, + { + "epoch": 0.1991708775565869, + "grad_norm": 12.847058296203613, + "learning_rate": 9.197673882719953e-06, + "loss": 2.5607, + "step": 763800 + }, + { + "epoch": 0.19922303018228907, + "grad_norm": 12.287805557250977, + "learning_rate": 9.197219658356125e-06, + "loss": 2.6154, + "step": 764000 + }, + { + "epoch": 0.19927518280799125, + "grad_norm": 12.280625343322754, + "learning_rate": 9.196765316675169e-06, + "loss": 2.5747, + "step": 764200 + }, + { + "epoch": 0.1993273354336934, + "grad_norm": 12.880599975585938, + "learning_rate": 9.196310857689785e-06, + "loss": 2.5721, + "step": 764400 + }, + { + "epoch": 0.1993794880593956, + "grad_norm": 12.690503120422363, + "learning_rate": 9.195856281412672e-06, + "loss": 2.5614, + "step": 764600 + }, + { + "epoch": 0.19943164068509775, + "grad_norm": 11.280840873718262, + "learning_rate": 9.195401587856538e-06, + "loss": 2.5797, + "step": 764800 + }, + { + "epoch": 0.19948379331079993, + "grad_norm": 13.591432571411133, + "learning_rate": 9.194946777034094e-06, + "loss": 2.554, + "step": 765000 + }, + { + "epoch": 0.1995359459365021, + "grad_norm": 13.786008834838867, + "learning_rate": 9.19449184895805e-06, + "loss": 2.5347, + "step": 765200 + }, + { + "epoch": 0.19958809856220427, + "grad_norm": 11.823830604553223, + "learning_rate": 9.19403680364112e-06, + "loss": 2.551, + "step": 765400 + }, + { + "epoch": 0.19964025118790643, + "grad_norm": 11.602143287658691, + "learning_rate": 9.193581641096028e-06, + "loss": 2.577, + "step": 765600 + }, + { + "epoch": 0.19969240381360862, + "grad_norm": 14.088010787963867, + "learning_rate": 9.193126361335491e-06, + "loss": 2.5477, + "step": 765800 + }, + { + "epoch": 0.19974455643931077, + "grad_norm": 13.316328048706055, + "learning_rate": 9.19267096437224e-06, + "loss": 2.5517, + "step": 766000 + }, + { + "epoch": 0.19979670906501293, + "grad_norm": 11.534945487976074, + "learning_rate": 9.192215450219e-06, + "loss": 2.5514, + "step": 766200 + }, + { + "epoch": 0.1998488616907151, + "grad_norm": 11.87644100189209, + "learning_rate": 9.191759818888506e-06, + "loss": 2.5328, + "step": 766400 + }, + { + "epoch": 0.19990101431641727, + "grad_norm": 12.74470043182373, + "learning_rate": 9.19130407039349e-06, + "loss": 2.5838, + "step": 766600 
+ }, + { + "epoch": 0.19995316694211945, + "grad_norm": 11.58600902557373, + "learning_rate": 9.190848204746694e-06, + "loss": 2.5657, + "step": 766800 + }, + { + "epoch": 0.2000053195678216, + "grad_norm": 12.279742240905762, + "learning_rate": 9.190392221960858e-06, + "loss": 2.5543, + "step": 767000 + }, + { + "epoch": 0.2000574721935238, + "grad_norm": 12.125825881958008, + "learning_rate": 9.189936122048727e-06, + "loss": 2.5282, + "step": 767200 + }, + { + "epoch": 0.20010962481922595, + "grad_norm": 10.70932674407959, + "learning_rate": 9.18947990502305e-06, + "loss": 2.5886, + "step": 767400 + }, + { + "epoch": 0.20016177744492814, + "grad_norm": 12.008572578430176, + "learning_rate": 9.189023570896579e-06, + "loss": 2.5466, + "step": 767600 + }, + { + "epoch": 0.2002139300706303, + "grad_norm": 10.968779563903809, + "learning_rate": 9.18856711968207e-06, + "loss": 2.5674, + "step": 767800 + }, + { + "epoch": 0.20026608269633248, + "grad_norm": 11.873457908630371, + "learning_rate": 9.18811055139228e-06, + "loss": 2.5667, + "step": 768000 + }, + { + "epoch": 0.20031823532203463, + "grad_norm": 11.809953689575195, + "learning_rate": 9.18765386603997e-06, + "loss": 2.5602, + "step": 768200 + }, + { + "epoch": 0.20037038794773682, + "grad_norm": 12.026460647583008, + "learning_rate": 9.187197063637906e-06, + "loss": 2.5693, + "step": 768400 + }, + { + "epoch": 0.20042254057343897, + "grad_norm": 12.511942863464355, + "learning_rate": 9.186740144198858e-06, + "loss": 2.5677, + "step": 768600 + }, + { + "epoch": 0.20047469319914116, + "grad_norm": 11.590380668640137, + "learning_rate": 9.186283107735593e-06, + "loss": 2.5472, + "step": 768800 + }, + { + "epoch": 0.20052684582484331, + "grad_norm": 12.04472541809082, + "learning_rate": 9.18582595426089e-06, + "loss": 2.5422, + "step": 769000 + }, + { + "epoch": 0.2005789984505455, + "grad_norm": 13.985250473022461, + "learning_rate": 9.185368683787526e-06, + "loss": 2.5152, + "step": 769200 + }, + { + "epoch": 0.20063115107624765, + "grad_norm": 12.050628662109375, + "learning_rate": 9.184911296328279e-06, + "loss": 2.5715, + "step": 769400 + }, + { + "epoch": 0.20068330370194984, + "grad_norm": 13.415891647338867, + "learning_rate": 9.184453791895937e-06, + "loss": 2.5828, + "step": 769600 + }, + { + "epoch": 0.200735456327652, + "grad_norm": 13.432297706604004, + "learning_rate": 9.183996170503287e-06, + "loss": 2.5177, + "step": 769800 + }, + { + "epoch": 0.20078760895335418, + "grad_norm": 12.210216522216797, + "learning_rate": 9.18353843216312e-06, + "loss": 2.5423, + "step": 770000 + }, + { + "epoch": 0.20083976157905634, + "grad_norm": 11.473400115966797, + "learning_rate": 9.183080576888228e-06, + "loss": 2.5561, + "step": 770200 + }, + { + "epoch": 0.20089191420475852, + "grad_norm": 12.12957763671875, + "learning_rate": 9.182622604691411e-06, + "loss": 2.5614, + "step": 770400 + }, + { + "epoch": 0.20094406683046068, + "grad_norm": 11.806977272033691, + "learning_rate": 9.182164515585472e-06, + "loss": 2.5903, + "step": 770600 + }, + { + "epoch": 0.20099621945616286, + "grad_norm": 12.727505683898926, + "learning_rate": 9.181706309583209e-06, + "loss": 2.577, + "step": 770800 + }, + { + "epoch": 0.20104837208186502, + "grad_norm": 13.943395614624023, + "learning_rate": 9.181247986697436e-06, + "loss": 2.5928, + "step": 771000 + }, + { + "epoch": 0.2011005247075672, + "grad_norm": 12.006414413452148, + "learning_rate": 9.18078954694096e-06, + "loss": 2.5669, + "step": 771200 + }, + { + "epoch": 0.20115267733326936, + 
"grad_norm": 11.008611679077148, + "learning_rate": 9.180330990326593e-06, + "loss": 2.5689, + "step": 771400 + }, + { + "epoch": 0.20120482995897154, + "grad_norm": 12.664291381835938, + "learning_rate": 9.179872316867158e-06, + "loss": 2.6105, + "step": 771600 + }, + { + "epoch": 0.2012569825846737, + "grad_norm": 11.425148010253906, + "learning_rate": 9.179413526575472e-06, + "loss": 2.5416, + "step": 771800 + }, + { + "epoch": 0.20130913521037586, + "grad_norm": 10.56281566619873, + "learning_rate": 9.178954619464357e-06, + "loss": 2.5577, + "step": 772000 + }, + { + "epoch": 0.20136128783607804, + "grad_norm": 12.567456245422363, + "learning_rate": 9.178495595546641e-06, + "loss": 2.5976, + "step": 772200 + }, + { + "epoch": 0.2014134404617802, + "grad_norm": 11.59072208404541, + "learning_rate": 9.178036454835157e-06, + "loss": 2.5353, + "step": 772400 + }, + { + "epoch": 0.20146559308748238, + "grad_norm": 12.048215866088867, + "learning_rate": 9.177577197342738e-06, + "loss": 2.5556, + "step": 772600 + }, + { + "epoch": 0.20151774571318454, + "grad_norm": 13.227574348449707, + "learning_rate": 9.177117823082217e-06, + "loss": 2.5473, + "step": 772800 + }, + { + "epoch": 0.20156989833888672, + "grad_norm": 11.449342727661133, + "learning_rate": 9.176658332066438e-06, + "loss": 2.5772, + "step": 773000 + }, + { + "epoch": 0.20162205096458888, + "grad_norm": 13.140538215637207, + "learning_rate": 9.17619872430824e-06, + "loss": 2.5734, + "step": 773200 + }, + { + "epoch": 0.20167420359029106, + "grad_norm": 13.485116958618164, + "learning_rate": 9.175738999820476e-06, + "loss": 2.5461, + "step": 773400 + }, + { + "epoch": 0.20172635621599322, + "grad_norm": 14.482010841369629, + "learning_rate": 9.175279158615989e-06, + "loss": 2.5771, + "step": 773600 + }, + { + "epoch": 0.2017785088416954, + "grad_norm": 11.397805213928223, + "learning_rate": 9.174819200707636e-06, + "loss": 2.5517, + "step": 773800 + }, + { + "epoch": 0.20183066146739756, + "grad_norm": 13.39069938659668, + "learning_rate": 9.174359126108274e-06, + "loss": 2.5457, + "step": 774000 + }, + { + "epoch": 0.20188281409309974, + "grad_norm": 13.59749698638916, + "learning_rate": 9.17389893483076e-06, + "loss": 2.5409, + "step": 774200 + }, + { + "epoch": 0.2019349667188019, + "grad_norm": 12.491495132446289, + "learning_rate": 9.173438626887958e-06, + "loss": 2.5758, + "step": 774400 + }, + { + "epoch": 0.20198711934450408, + "grad_norm": 12.573298454284668, + "learning_rate": 9.172978202292735e-06, + "loss": 2.5665, + "step": 774600 + }, + { + "epoch": 0.20203927197020624, + "grad_norm": 12.443755149841309, + "learning_rate": 9.17251766105796e-06, + "loss": 2.603, + "step": 774800 + }, + { + "epoch": 0.20209142459590843, + "grad_norm": 11.910422325134277, + "learning_rate": 9.172057003196505e-06, + "loss": 2.5361, + "step": 775000 + }, + { + "epoch": 0.20214357722161058, + "grad_norm": 12.631829261779785, + "learning_rate": 9.171596228721245e-06, + "loss": 2.5566, + "step": 775200 + }, + { + "epoch": 0.20219572984731277, + "grad_norm": 13.858083724975586, + "learning_rate": 9.171135337645061e-06, + "loss": 2.5467, + "step": 775400 + }, + { + "epoch": 0.20224788247301492, + "grad_norm": 11.458602905273438, + "learning_rate": 9.170674329980835e-06, + "loss": 2.5821, + "step": 775600 + }, + { + "epoch": 0.2023000350987171, + "grad_norm": 12.20776081085205, + "learning_rate": 9.170213205741453e-06, + "loss": 2.5435, + "step": 775800 + }, + { + "epoch": 0.20235218772441926, + "grad_norm": 12.886215209960938, + 
"learning_rate": 9.169751964939802e-06, + "loss": 2.6111, + "step": 776000 + }, + { + "epoch": 0.20240434035012145, + "grad_norm": 13.515503883361816, + "learning_rate": 9.169290607588776e-06, + "loss": 2.5066, + "step": 776200 + }, + { + "epoch": 0.2024564929758236, + "grad_norm": 12.592185974121094, + "learning_rate": 9.168829133701273e-06, + "loss": 2.5507, + "step": 776400 + }, + { + "epoch": 0.2025086456015258, + "grad_norm": 13.320043563842773, + "learning_rate": 9.168367543290187e-06, + "loss": 2.5546, + "step": 776600 + }, + { + "epoch": 0.20256079822722795, + "grad_norm": 12.519879341125488, + "learning_rate": 9.16790583636842e-06, + "loss": 2.5799, + "step": 776800 + }, + { + "epoch": 0.20261295085293013, + "grad_norm": 12.188693046569824, + "learning_rate": 9.16744401294888e-06, + "loss": 2.5855, + "step": 777000 + }, + { + "epoch": 0.20266510347863229, + "grad_norm": 14.931890487670898, + "learning_rate": 9.166982073044475e-06, + "loss": 2.5712, + "step": 777200 + }, + { + "epoch": 0.20271725610433447, + "grad_norm": 13.907896995544434, + "learning_rate": 9.166520016668117e-06, + "loss": 2.5602, + "step": 777400 + }, + { + "epoch": 0.20276940873003663, + "grad_norm": 12.67017936706543, + "learning_rate": 9.16605784383272e-06, + "loss": 2.5684, + "step": 777600 + }, + { + "epoch": 0.20282156135573878, + "grad_norm": 12.602633476257324, + "learning_rate": 9.165595554551204e-06, + "loss": 2.5424, + "step": 777800 + }, + { + "epoch": 0.20287371398144097, + "grad_norm": 11.999449729919434, + "learning_rate": 9.165133148836487e-06, + "loss": 2.5635, + "step": 778000 + }, + { + "epoch": 0.20292586660714312, + "grad_norm": 15.479057312011719, + "learning_rate": 9.164670626701495e-06, + "loss": 2.5707, + "step": 778200 + }, + { + "epoch": 0.2029780192328453, + "grad_norm": 11.809393882751465, + "learning_rate": 9.16420798815916e-06, + "loss": 2.537, + "step": 778400 + }, + { + "epoch": 0.20303017185854746, + "grad_norm": 12.722270965576172, + "learning_rate": 9.163745233222407e-06, + "loss": 2.569, + "step": 778600 + }, + { + "epoch": 0.20308232448424965, + "grad_norm": 12.335058212280273, + "learning_rate": 9.163282361904176e-06, + "loss": 2.5594, + "step": 778800 + }, + { + "epoch": 0.2031344771099518, + "grad_norm": 11.937206268310547, + "learning_rate": 9.162819374217403e-06, + "loss": 2.5612, + "step": 779000 + }, + { + "epoch": 0.203186629735654, + "grad_norm": 10.777091026306152, + "learning_rate": 9.162356270175026e-06, + "loss": 2.5397, + "step": 779200 + }, + { + "epoch": 0.20323878236135615, + "grad_norm": 13.843822479248047, + "learning_rate": 9.161893049789995e-06, + "loss": 2.5632, + "step": 779400 + }, + { + "epoch": 0.20329093498705833, + "grad_norm": 11.743480682373047, + "learning_rate": 9.161429713075252e-06, + "loss": 2.587, + "step": 779600 + }, + { + "epoch": 0.2033430876127605, + "grad_norm": 10.867159843444824, + "learning_rate": 9.160966260043751e-06, + "loss": 2.5847, + "step": 779800 + }, + { + "epoch": 0.20339524023846267, + "grad_norm": 12.39773941040039, + "learning_rate": 9.160502690708447e-06, + "loss": 2.5626, + "step": 780000 + }, + { + "epoch": 0.20344739286416483, + "grad_norm": 13.51063060760498, + "learning_rate": 9.160039005082291e-06, + "loss": 2.5048, + "step": 780200 + }, + { + "epoch": 0.203499545489867, + "grad_norm": 12.987715721130371, + "learning_rate": 9.159575203178253e-06, + "loss": 2.5538, + "step": 780400 + }, + { + "epoch": 0.20355169811556917, + "grad_norm": 12.870197296142578, + "learning_rate": 9.159111285009289e-06, + "loss": 
2.5578, + "step": 780600 + }, + { + "epoch": 0.20360385074127135, + "grad_norm": 10.9700927734375, + "learning_rate": 9.15864725058837e-06, + "loss": 2.5964, + "step": 780800 + }, + { + "epoch": 0.2036560033669735, + "grad_norm": 11.876527786254883, + "learning_rate": 9.158183099928465e-06, + "loss": 2.5587, + "step": 781000 + }, + { + "epoch": 0.2037081559926757, + "grad_norm": 12.678916931152344, + "learning_rate": 9.157718833042548e-06, + "loss": 2.57, + "step": 781200 + }, + { + "epoch": 0.20376030861837785, + "grad_norm": 11.316527366638184, + "learning_rate": 9.157254449943594e-06, + "loss": 2.531, + "step": 781400 + }, + { + "epoch": 0.20381246124408003, + "grad_norm": 11.957809448242188, + "learning_rate": 9.156789950644587e-06, + "loss": 2.5381, + "step": 781600 + }, + { + "epoch": 0.2038646138697822, + "grad_norm": 12.463024139404297, + "learning_rate": 9.156325335158507e-06, + "loss": 2.5681, + "step": 781800 + }, + { + "epoch": 0.20391676649548438, + "grad_norm": 12.041062355041504, + "learning_rate": 9.155860603498341e-06, + "loss": 2.5684, + "step": 782000 + }, + { + "epoch": 0.20396891912118653, + "grad_norm": 11.98395824432373, + "learning_rate": 9.15539575567708e-06, + "loss": 2.5603, + "step": 782200 + }, + { + "epoch": 0.20402107174688872, + "grad_norm": 12.181907653808594, + "learning_rate": 9.154930791707714e-06, + "loss": 2.5406, + "step": 782400 + }, + { + "epoch": 0.20407322437259087, + "grad_norm": 12.69912338256836, + "learning_rate": 9.154465711603244e-06, + "loss": 2.5441, + "step": 782600 + }, + { + "epoch": 0.20412537699829306, + "grad_norm": 11.131607055664062, + "learning_rate": 9.154000515376667e-06, + "loss": 2.5484, + "step": 782800 + }, + { + "epoch": 0.2041775296239952, + "grad_norm": 11.983840942382812, + "learning_rate": 9.153535203040986e-06, + "loss": 2.5317, + "step": 783000 + }, + { + "epoch": 0.2042296822496974, + "grad_norm": 13.614217758178711, + "learning_rate": 9.153069774609205e-06, + "loss": 2.5382, + "step": 783200 + }, + { + "epoch": 0.20428183487539955, + "grad_norm": 13.162174224853516, + "learning_rate": 9.152604230094337e-06, + "loss": 2.5865, + "step": 783400 + }, + { + "epoch": 0.2043339875011017, + "grad_norm": 12.635634422302246, + "learning_rate": 9.152138569509393e-06, + "loss": 2.549, + "step": 783600 + }, + { + "epoch": 0.2043861401268039, + "grad_norm": 9.673964500427246, + "learning_rate": 9.151672792867387e-06, + "loss": 2.5795, + "step": 783800 + }, + { + "epoch": 0.20443829275250605, + "grad_norm": 12.545588493347168, + "learning_rate": 9.15120690018134e-06, + "loss": 2.546, + "step": 784000 + }, + { + "epoch": 0.20449044537820824, + "grad_norm": 11.919151306152344, + "learning_rate": 9.150740891464272e-06, + "loss": 2.5573, + "step": 784200 + }, + { + "epoch": 0.2045425980039104, + "grad_norm": 14.723599433898926, + "learning_rate": 9.150274766729213e-06, + "loss": 2.5681, + "step": 784400 + }, + { + "epoch": 0.20459475062961258, + "grad_norm": 11.212860107421875, + "learning_rate": 9.149808525989185e-06, + "loss": 2.5524, + "step": 784600 + }, + { + "epoch": 0.20464690325531473, + "grad_norm": 13.626235961914062, + "learning_rate": 9.149342169257228e-06, + "loss": 2.6008, + "step": 784800 + }, + { + "epoch": 0.20469905588101692, + "grad_norm": 13.193553924560547, + "learning_rate": 9.148875696546372e-06, + "loss": 2.54, + "step": 785000 + }, + { + "epoch": 0.20475120850671907, + "grad_norm": 12.401244163513184, + "learning_rate": 9.148409107869654e-06, + "loss": 2.5822, + "step": 785200 + }, + { + "epoch": 
0.20480336113242126, + "grad_norm": 13.341384887695312, + "learning_rate": 9.14794240324012e-06, + "loss": 2.581, + "step": 785400 + }, + { + "epoch": 0.20485551375812341, + "grad_norm": 10.86294937133789, + "learning_rate": 9.147475582670813e-06, + "loss": 2.558, + "step": 785600 + }, + { + "epoch": 0.2049076663838256, + "grad_norm": 11.97840690612793, + "learning_rate": 9.14700864617478e-06, + "loss": 2.569, + "step": 785800 + }, + { + "epoch": 0.20495981900952776, + "grad_norm": 12.532195091247559, + "learning_rate": 9.146541593765075e-06, + "loss": 2.5214, + "step": 786000 + }, + { + "epoch": 0.20501197163522994, + "grad_norm": 13.745171546936035, + "learning_rate": 9.14607442545475e-06, + "loss": 2.5511, + "step": 786200 + }, + { + "epoch": 0.2050641242609321, + "grad_norm": 12.68714714050293, + "learning_rate": 9.145607141256864e-06, + "loss": 2.5863, + "step": 786400 + }, + { + "epoch": 0.20511627688663428, + "grad_norm": 12.79076862335205, + "learning_rate": 9.145139741184481e-06, + "loss": 2.5629, + "step": 786600 + }, + { + "epoch": 0.20516842951233644, + "grad_norm": 13.012794494628906, + "learning_rate": 9.144672225250661e-06, + "loss": 2.582, + "step": 786800 + }, + { + "epoch": 0.20522058213803862, + "grad_norm": 12.885845184326172, + "learning_rate": 9.144204593468473e-06, + "loss": 2.5376, + "step": 787000 + }, + { + "epoch": 0.20527273476374078, + "grad_norm": 11.415619850158691, + "learning_rate": 9.14373684585099e-06, + "loss": 2.5376, + "step": 787200 + }, + { + "epoch": 0.20532488738944296, + "grad_norm": 10.882523536682129, + "learning_rate": 9.143268982411282e-06, + "loss": 2.5388, + "step": 787400 + }, + { + "epoch": 0.20537704001514512, + "grad_norm": 13.310264587402344, + "learning_rate": 9.142801003162429e-06, + "loss": 2.5681, + "step": 787600 + }, + { + "epoch": 0.2054291926408473, + "grad_norm": 10.162239074707031, + "learning_rate": 9.142332908117512e-06, + "loss": 2.5519, + "step": 787800 + }, + { + "epoch": 0.20548134526654946, + "grad_norm": 10.431777000427246, + "learning_rate": 9.141864697289612e-06, + "loss": 2.5372, + "step": 788000 + }, + { + "epoch": 0.20553349789225164, + "grad_norm": 11.729969024658203, + "learning_rate": 9.141396370691822e-06, + "loss": 2.5446, + "step": 788200 + }, + { + "epoch": 0.2055856505179538, + "grad_norm": 13.751687049865723, + "learning_rate": 9.140927928337224e-06, + "loss": 2.5337, + "step": 788400 + }, + { + "epoch": 0.20563780314365598, + "grad_norm": 11.931695938110352, + "learning_rate": 9.140459370238919e-06, + "loss": 2.5327, + "step": 788600 + }, + { + "epoch": 0.20568995576935814, + "grad_norm": 12.925618171691895, + "learning_rate": 9.139990696409998e-06, + "loss": 2.5283, + "step": 788800 + }, + { + "epoch": 0.20574210839506032, + "grad_norm": 11.938955307006836, + "learning_rate": 9.139521906863564e-06, + "loss": 2.545, + "step": 789000 + }, + { + "epoch": 0.20579426102076248, + "grad_norm": 12.688498497009277, + "learning_rate": 9.13905300161272e-06, + "loss": 2.5699, + "step": 789200 + }, + { + "epoch": 0.20584641364646467, + "grad_norm": 12.349008560180664, + "learning_rate": 9.138583980670573e-06, + "loss": 2.5775, + "step": 789400 + }, + { + "epoch": 0.20589856627216682, + "grad_norm": 13.858742713928223, + "learning_rate": 9.13811484405023e-06, + "loss": 2.5536, + "step": 789600 + }, + { + "epoch": 0.20595071889786898, + "grad_norm": 12.965885162353516, + "learning_rate": 9.137645591764807e-06, + "loss": 2.5364, + "step": 789800 + }, + { + "epoch": 0.20600287152357116, + "grad_norm": 12.98017692565918, 
+ "learning_rate": 9.137176223827417e-06, + "loss": 2.5487, + "step": 790000 + }, + { + "epoch": 0.20605502414927332, + "grad_norm": 12.841651916503906, + "learning_rate": 9.136706740251185e-06, + "loss": 2.5751, + "step": 790200 + }, + { + "epoch": 0.2061071767749755, + "grad_norm": 11.136397361755371, + "learning_rate": 9.136237141049226e-06, + "loss": 2.5773, + "step": 790400 + }, + { + "epoch": 0.20615932940067766, + "grad_norm": 13.198196411132812, + "learning_rate": 9.135767426234671e-06, + "loss": 2.529, + "step": 790600 + }, + { + "epoch": 0.20621148202637984, + "grad_norm": 13.684469223022461, + "learning_rate": 9.135297595820646e-06, + "loss": 2.5633, + "step": 790800 + }, + { + "epoch": 0.206263634652082, + "grad_norm": 12.033394813537598, + "learning_rate": 9.134827649820289e-06, + "loss": 2.602, + "step": 791000 + }, + { + "epoch": 0.20631578727778419, + "grad_norm": 10.937596321105957, + "learning_rate": 9.134357588246729e-06, + "loss": 2.5496, + "step": 791200 + }, + { + "epoch": 0.20636793990348634, + "grad_norm": 11.299455642700195, + "learning_rate": 9.133887411113108e-06, + "loss": 2.5665, + "step": 791400 + }, + { + "epoch": 0.20642009252918853, + "grad_norm": 13.532719612121582, + "learning_rate": 9.133417118432566e-06, + "loss": 2.5522, + "step": 791600 + }, + { + "epoch": 0.20647224515489068, + "grad_norm": 12.678144454956055, + "learning_rate": 9.13294671021825e-06, + "loss": 2.5615, + "step": 791800 + }, + { + "epoch": 0.20652439778059287, + "grad_norm": 12.046524047851562, + "learning_rate": 9.13247618648331e-06, + "loss": 2.562, + "step": 792000 + }, + { + "epoch": 0.20657655040629502, + "grad_norm": 12.832523345947266, + "learning_rate": 9.132005547240893e-06, + "loss": 2.5695, + "step": 792200 + }, + { + "epoch": 0.2066287030319972, + "grad_norm": 12.524669647216797, + "learning_rate": 9.131534792504158e-06, + "loss": 2.5554, + "step": 792400 + }, + { + "epoch": 0.20668085565769936, + "grad_norm": 11.63914966583252, + "learning_rate": 9.131063922286261e-06, + "loss": 2.5182, + "step": 792600 + }, + { + "epoch": 0.20673300828340155, + "grad_norm": 12.573324203491211, + "learning_rate": 9.130592936600364e-06, + "loss": 2.589, + "step": 792800 + }, + { + "epoch": 0.2067851609091037, + "grad_norm": 11.700994491577148, + "learning_rate": 9.130121835459633e-06, + "loss": 2.6005, + "step": 793000 + }, + { + "epoch": 0.2068373135348059, + "grad_norm": 10.683900833129883, + "learning_rate": 9.129650618877233e-06, + "loss": 2.5726, + "step": 793200 + }, + { + "epoch": 0.20688946616050805, + "grad_norm": 12.077662467956543, + "learning_rate": 9.129179286866337e-06, + "loss": 2.5581, + "step": 793400 + }, + { + "epoch": 0.20694161878621023, + "grad_norm": 12.618571281433105, + "learning_rate": 9.128707839440119e-06, + "loss": 2.5401, + "step": 793600 + }, + { + "epoch": 0.2069937714119124, + "grad_norm": 13.423776626586914, + "learning_rate": 9.128236276611757e-06, + "loss": 2.5562, + "step": 793800 + }, + { + "epoch": 0.20704592403761457, + "grad_norm": 11.433748245239258, + "learning_rate": 9.127764598394429e-06, + "loss": 2.5461, + "step": 794000 + }, + { + "epoch": 0.20709807666331673, + "grad_norm": 13.008195877075195, + "learning_rate": 9.127292804801322e-06, + "loss": 2.5663, + "step": 794200 + }, + { + "epoch": 0.2071502292890189, + "grad_norm": 12.282379150390625, + "learning_rate": 9.126820895845623e-06, + "loss": 2.5785, + "step": 794400 + }, + { + "epoch": 0.20720238191472107, + "grad_norm": 12.038056373596191, + "learning_rate": 9.12634887154052e-06, + "loss": 
2.5305, + "step": 794600 + }, + { + "epoch": 0.20725453454042325, + "grad_norm": 10.847259521484375, + "learning_rate": 9.125876731899209e-06, + "loss": 2.5142, + "step": 794800 + }, + { + "epoch": 0.2073066871661254, + "grad_norm": 12.055601119995117, + "learning_rate": 9.125404476934888e-06, + "loss": 2.556, + "step": 795000 + }, + { + "epoch": 0.2073588397918276, + "grad_norm": 14.068562507629395, + "learning_rate": 9.124932106660752e-06, + "loss": 2.5441, + "step": 795200 + }, + { + "epoch": 0.20741099241752975, + "grad_norm": 12.50983715057373, + "learning_rate": 9.124459621090009e-06, + "loss": 2.5705, + "step": 795400 + }, + { + "epoch": 0.2074631450432319, + "grad_norm": 12.685457229614258, + "learning_rate": 9.123987020235863e-06, + "loss": 2.5693, + "step": 795600 + }, + { + "epoch": 0.2075152976689341, + "grad_norm": 12.426636695861816, + "learning_rate": 9.123514304111525e-06, + "loss": 2.555, + "step": 795800 + }, + { + "epoch": 0.20756745029463625, + "grad_norm": 11.003486633300781, + "learning_rate": 9.123041472730207e-06, + "loss": 2.5363, + "step": 796000 + }, + { + "epoch": 0.20761960292033843, + "grad_norm": 12.062921524047852, + "learning_rate": 9.122568526105127e-06, + "loss": 2.5494, + "step": 796200 + }, + { + "epoch": 0.2076717555460406, + "grad_norm": 9.995904922485352, + "learning_rate": 9.122095464249504e-06, + "loss": 2.5621, + "step": 796400 + }, + { + "epoch": 0.20772390817174277, + "grad_norm": 12.198345184326172, + "learning_rate": 9.121622287176557e-06, + "loss": 2.5531, + "step": 796600 + }, + { + "epoch": 0.20777606079744493, + "grad_norm": 11.961278915405273, + "learning_rate": 9.121148994899517e-06, + "loss": 2.5245, + "step": 796800 + }, + { + "epoch": 0.2078282134231471, + "grad_norm": 11.721890449523926, + "learning_rate": 9.12067558743161e-06, + "loss": 2.5773, + "step": 797000 + }, + { + "epoch": 0.20788036604884927, + "grad_norm": 11.148172378540039, + "learning_rate": 9.120202064786067e-06, + "loss": 2.5425, + "step": 797200 + }, + { + "epoch": 0.20793251867455145, + "grad_norm": 12.131102561950684, + "learning_rate": 9.119728426976129e-06, + "loss": 2.5342, + "step": 797400 + }, + { + "epoch": 0.2079846713002536, + "grad_norm": 12.33278751373291, + "learning_rate": 9.11925467401503e-06, + "loss": 2.5778, + "step": 797600 + }, + { + "epoch": 0.2080368239259558, + "grad_norm": 13.5529203414917, + "learning_rate": 9.118780805916011e-06, + "loss": 2.5409, + "step": 797800 + }, + { + "epoch": 0.20808897655165795, + "grad_norm": 13.351973533630371, + "learning_rate": 9.118306822692322e-06, + "loss": 2.5533, + "step": 798000 + }, + { + "epoch": 0.20814112917736013, + "grad_norm": 11.997188568115234, + "learning_rate": 9.117832724357208e-06, + "loss": 2.5669, + "step": 798200 + }, + { + "epoch": 0.2081932818030623, + "grad_norm": 12.028074264526367, + "learning_rate": 9.117358510923921e-06, + "loss": 2.5191, + "step": 798400 + }, + { + "epoch": 0.20824543442876448, + "grad_norm": 10.646809577941895, + "learning_rate": 9.116884182405718e-06, + "loss": 2.5348, + "step": 798600 + }, + { + "epoch": 0.20829758705446663, + "grad_norm": 10.890563011169434, + "learning_rate": 9.116409738815853e-06, + "loss": 2.5531, + "step": 798800 + }, + { + "epoch": 0.20834973968016882, + "grad_norm": 11.447530746459961, + "learning_rate": 9.11593518016759e-06, + "loss": 2.5651, + "step": 799000 + }, + { + "epoch": 0.20840189230587097, + "grad_norm": 13.491917610168457, + "learning_rate": 9.115460506474193e-06, + "loss": 2.5481, + "step": 799200 + }, + { + "epoch": 
0.20845404493157316, + "grad_norm": 12.284280776977539, + "learning_rate": 9.114985717748928e-06, + "loss": 2.5543, + "step": 799400 + }, + { + "epoch": 0.2085061975572753, + "grad_norm": 11.280057907104492, + "learning_rate": 9.11451081400507e-06, + "loss": 2.5347, + "step": 799600 + }, + { + "epoch": 0.2085583501829775, + "grad_norm": 13.034576416015625, + "learning_rate": 9.114035795255888e-06, + "loss": 2.5667, + "step": 799800 + }, + { + "epoch": 0.20861050280867965, + "grad_norm": 10.916426658630371, + "learning_rate": 9.113560661514664e-06, + "loss": 2.5311, + "step": 800000 + }, + { + "epoch": 0.20866265543438184, + "grad_norm": 12.369806289672852, + "learning_rate": 9.113085412794676e-06, + "loss": 2.5332, + "step": 800200 + }, + { + "epoch": 0.208714808060084, + "grad_norm": 12.768467903137207, + "learning_rate": 9.112610049109207e-06, + "loss": 2.5693, + "step": 800400 + }, + { + "epoch": 0.20876696068578618, + "grad_norm": 13.05500316619873, + "learning_rate": 9.112134570471545e-06, + "loss": 2.5134, + "step": 800600 + }, + { + "epoch": 0.20881911331148834, + "grad_norm": 11.387818336486816, + "learning_rate": 9.111658976894982e-06, + "loss": 2.5834, + "step": 800800 + }, + { + "epoch": 0.20887126593719052, + "grad_norm": 10.839855194091797, + "learning_rate": 9.11118326839281e-06, + "loss": 2.5247, + "step": 801000 + }, + { + "epoch": 0.20892341856289268, + "grad_norm": 12.861638069152832, + "learning_rate": 9.110707444978322e-06, + "loss": 2.5669, + "step": 801200 + }, + { + "epoch": 0.20897557118859483, + "grad_norm": 11.404274940490723, + "learning_rate": 9.110231506664824e-06, + "loss": 2.5437, + "step": 801400 + }, + { + "epoch": 0.20902772381429702, + "grad_norm": 13.15756607055664, + "learning_rate": 9.109755453465615e-06, + "loss": 2.53, + "step": 801600 + }, + { + "epoch": 0.20907987643999917, + "grad_norm": 13.401494026184082, + "learning_rate": 9.109279285394003e-06, + "loss": 2.5359, + "step": 801800 + }, + { + "epoch": 0.20913202906570136, + "grad_norm": 13.968293190002441, + "learning_rate": 9.108803002463295e-06, + "loss": 2.5573, + "step": 802000 + }, + { + "epoch": 0.20918418169140351, + "grad_norm": 13.148781776428223, + "learning_rate": 9.108326604686807e-06, + "loss": 2.56, + "step": 802200 + }, + { + "epoch": 0.2092363343171057, + "grad_norm": 13.43651294708252, + "learning_rate": 9.107850092077853e-06, + "loss": 2.5353, + "step": 802400 + }, + { + "epoch": 0.20928848694280786, + "grad_norm": 12.484440803527832, + "learning_rate": 9.107373464649754e-06, + "loss": 2.5772, + "step": 802600 + }, + { + "epoch": 0.20934063956851004, + "grad_norm": 13.605812072753906, + "learning_rate": 9.106896722415829e-06, + "loss": 2.5525, + "step": 802800 + }, + { + "epoch": 0.2093927921942122, + "grad_norm": 13.259238243103027, + "learning_rate": 9.106419865389405e-06, + "loss": 2.596, + "step": 803000 + }, + { + "epoch": 0.20944494481991438, + "grad_norm": 11.530973434448242, + "learning_rate": 9.10594289358381e-06, + "loss": 2.5526, + "step": 803200 + }, + { + "epoch": 0.20949709744561654, + "grad_norm": 12.25853157043457, + "learning_rate": 9.105465807012379e-06, + "loss": 2.5761, + "step": 803400 + }, + { + "epoch": 0.20954925007131872, + "grad_norm": 10.755802154541016, + "learning_rate": 9.104988605688445e-06, + "loss": 2.5424, + "step": 803600 + }, + { + "epoch": 0.20960140269702088, + "grad_norm": 13.130935668945312, + "learning_rate": 9.104511289625345e-06, + "loss": 2.5453, + "step": 803800 + }, + { + "epoch": 0.20965355532272306, + "grad_norm": 
13.585618019104004, + "learning_rate": 9.10403385883642e-06, + "loss": 2.4936, + "step": 804000 + }, + { + "epoch": 0.20970570794842522, + "grad_norm": 13.895092964172363, + "learning_rate": 9.103556313335019e-06, + "loss": 2.5333, + "step": 804200 + }, + { + "epoch": 0.2097578605741274, + "grad_norm": 12.45972728729248, + "learning_rate": 9.103078653134487e-06, + "loss": 2.5459, + "step": 804400 + }, + { + "epoch": 0.20981001319982956, + "grad_norm": 13.445938110351562, + "learning_rate": 9.102600878248176e-06, + "loss": 2.5226, + "step": 804600 + }, + { + "epoch": 0.20986216582553174, + "grad_norm": 12.260578155517578, + "learning_rate": 9.102122988689439e-06, + "loss": 2.5466, + "step": 804800 + }, + { + "epoch": 0.2099143184512339, + "grad_norm": 13.503302574157715, + "learning_rate": 9.101644984471636e-06, + "loss": 2.5485, + "step": 805000 + }, + { + "epoch": 0.20996647107693608, + "grad_norm": 11.830412864685059, + "learning_rate": 9.101166865608125e-06, + "loss": 2.5331, + "step": 805200 + }, + { + "epoch": 0.21001862370263824, + "grad_norm": 11.57419204711914, + "learning_rate": 9.100688632112272e-06, + "loss": 2.5551, + "step": 805400 + }, + { + "epoch": 0.21007077632834043, + "grad_norm": 13.763896942138672, + "learning_rate": 9.100210283997442e-06, + "loss": 2.5441, + "step": 805600 + }, + { + "epoch": 0.21012292895404258, + "grad_norm": 11.554506301879883, + "learning_rate": 9.099731821277008e-06, + "loss": 2.571, + "step": 805800 + }, + { + "epoch": 0.21017508157974477, + "grad_norm": 12.712745666503906, + "learning_rate": 9.099253243964343e-06, + "loss": 2.5449, + "step": 806000 + }, + { + "epoch": 0.21022723420544692, + "grad_norm": 12.259404182434082, + "learning_rate": 9.098774552072823e-06, + "loss": 2.5364, + "step": 806200 + }, + { + "epoch": 0.2102793868311491, + "grad_norm": 14.388871192932129, + "learning_rate": 9.098295745615826e-06, + "loss": 2.5435, + "step": 806400 + }, + { + "epoch": 0.21033153945685126, + "grad_norm": 11.50523853302002, + "learning_rate": 9.09781682460674e-06, + "loss": 2.5728, + "step": 806600 + }, + { + "epoch": 0.21038369208255345, + "grad_norm": 11.484689712524414, + "learning_rate": 9.097337789058947e-06, + "loss": 2.5825, + "step": 806800 + }, + { + "epoch": 0.2104358447082556, + "grad_norm": 12.460712432861328, + "learning_rate": 9.096858638985839e-06, + "loss": 2.5243, + "step": 807000 + }, + { + "epoch": 0.21048799733395776, + "grad_norm": 13.950214385986328, + "learning_rate": 9.09637937440081e-06, + "loss": 2.5351, + "step": 807200 + }, + { + "epoch": 0.21054014995965994, + "grad_norm": 12.65377140045166, + "learning_rate": 9.095899995317252e-06, + "loss": 2.5472, + "step": 807400 + }, + { + "epoch": 0.2105923025853621, + "grad_norm": 11.964714050292969, + "learning_rate": 9.095420501748568e-06, + "loss": 2.5647, + "step": 807600 + }, + { + "epoch": 0.21064445521106429, + "grad_norm": 12.092817306518555, + "learning_rate": 9.094940893708157e-06, + "loss": 2.5691, + "step": 807800 + }, + { + "epoch": 0.21069660783676644, + "grad_norm": 13.357704162597656, + "learning_rate": 9.094461171209427e-06, + "loss": 2.5075, + "step": 808000 + }, + { + "epoch": 0.21074876046246863, + "grad_norm": 11.30611801147461, + "learning_rate": 9.093981334265787e-06, + "loss": 2.509, + "step": 808200 + }, + { + "epoch": 0.21080091308817078, + "grad_norm": 12.069721221923828, + "learning_rate": 9.093501382890647e-06, + "loss": 2.5223, + "step": 808400 + }, + { + "epoch": 0.21085306571387297, + "grad_norm": 11.740452766418457, + "learning_rate": 
9.093021317097424e-06, + "loss": 2.5337, + "step": 808600 + }, + { + "epoch": 0.21090521833957512, + "grad_norm": 12.168525695800781, + "learning_rate": 9.092541136899535e-06, + "loss": 2.5598, + "step": 808800 + }, + { + "epoch": 0.2109573709652773, + "grad_norm": 13.350499153137207, + "learning_rate": 9.092060842310404e-06, + "loss": 2.4994, + "step": 809000 + }, + { + "epoch": 0.21100952359097946, + "grad_norm": 12.407944679260254, + "learning_rate": 9.091580433343452e-06, + "loss": 2.4965, + "step": 809200 + }, + { + "epoch": 0.21106167621668165, + "grad_norm": 14.529518127441406, + "learning_rate": 9.09109991001211e-06, + "loss": 2.5579, + "step": 809400 + }, + { + "epoch": 0.2111138288423838, + "grad_norm": 12.541584968566895, + "learning_rate": 9.09061927232981e-06, + "loss": 2.5532, + "step": 809600 + }, + { + "epoch": 0.211165981468086, + "grad_norm": 14.252954483032227, + "learning_rate": 9.090138520309983e-06, + "loss": 2.5472, + "step": 809800 + }, + { + "epoch": 0.21121813409378815, + "grad_norm": 13.405433654785156, + "learning_rate": 9.089657653966069e-06, + "loss": 2.5333, + "step": 810000 + }, + { + "epoch": 0.21127028671949033, + "grad_norm": 10.493156433105469, + "learning_rate": 9.089176673311506e-06, + "loss": 2.5312, + "step": 810200 + }, + { + "epoch": 0.2113224393451925, + "grad_norm": 12.578083992004395, + "learning_rate": 9.088695578359742e-06, + "loss": 2.5336, + "step": 810400 + }, + { + "epoch": 0.21137459197089467, + "grad_norm": 12.994580268859863, + "learning_rate": 9.088214369124221e-06, + "loss": 2.5489, + "step": 810600 + }, + { + "epoch": 0.21142674459659683, + "grad_norm": 11.961739540100098, + "learning_rate": 9.087733045618396e-06, + "loss": 2.5717, + "step": 810800 + }, + { + "epoch": 0.211478897222299, + "grad_norm": 12.124736785888672, + "learning_rate": 9.087251607855718e-06, + "loss": 2.5273, + "step": 811000 + }, + { + "epoch": 0.21153104984800117, + "grad_norm": 14.014177322387695, + "learning_rate": 9.086770055849645e-06, + "loss": 2.5209, + "step": 811200 + }, + { + "epoch": 0.21158320247370335, + "grad_norm": 12.617166519165039, + "learning_rate": 9.086288389613636e-06, + "loss": 2.5425, + "step": 811400 + }, + { + "epoch": 0.2116353550994055, + "grad_norm": 12.897732734680176, + "learning_rate": 9.085806609161156e-06, + "loss": 2.5501, + "step": 811600 + }, + { + "epoch": 0.2116875077251077, + "grad_norm": 14.856685638427734, + "learning_rate": 9.085324714505669e-06, + "loss": 2.5208, + "step": 811800 + }, + { + "epoch": 0.21173966035080985, + "grad_norm": 11.253473281860352, + "learning_rate": 9.084842705660646e-06, + "loss": 2.5396, + "step": 812000 + }, + { + "epoch": 0.21179181297651203, + "grad_norm": 13.86609172821045, + "learning_rate": 9.08436058263956e-06, + "loss": 2.5441, + "step": 812200 + }, + { + "epoch": 0.2118439656022142, + "grad_norm": 11.815281867980957, + "learning_rate": 9.083878345455885e-06, + "loss": 2.543, + "step": 812400 + }, + { + "epoch": 0.21189611822791637, + "grad_norm": 12.452479362487793, + "learning_rate": 9.083395994123103e-06, + "loss": 2.5331, + "step": 812600 + }, + { + "epoch": 0.21194827085361853, + "grad_norm": 13.19066333770752, + "learning_rate": 9.082913528654694e-06, + "loss": 2.5344, + "step": 812800 + }, + { + "epoch": 0.21200042347932072, + "grad_norm": 11.936997413635254, + "learning_rate": 9.082430949064144e-06, + "loss": 2.5393, + "step": 813000 + }, + { + "epoch": 0.21205257610502287, + "grad_norm": 13.574031829833984, + "learning_rate": 9.081948255364941e-06, + "loss": 2.5316, + "step": 
813200 + }, + { + "epoch": 0.21210472873072503, + "grad_norm": 11.525445938110352, + "learning_rate": 9.081465447570578e-06, + "loss": 2.5151, + "step": 813400 + }, + { + "epoch": 0.2121568813564272, + "grad_norm": 11.842822074890137, + "learning_rate": 9.08098252569455e-06, + "loss": 2.6006, + "step": 813600 + }, + { + "epoch": 0.21220903398212937, + "grad_norm": 12.733033180236816, + "learning_rate": 9.080499489750357e-06, + "loss": 2.569, + "step": 813800 + }, + { + "epoch": 0.21226118660783155, + "grad_norm": 13.042905807495117, + "learning_rate": 9.080016339751495e-06, + "loss": 2.5706, + "step": 814000 + }, + { + "epoch": 0.2123133392335337, + "grad_norm": 12.87771224975586, + "learning_rate": 9.079533075711473e-06, + "loss": 2.5289, + "step": 814200 + }, + { + "epoch": 0.2123654918592359, + "grad_norm": 12.961420059204102, + "learning_rate": 9.079049697643798e-06, + "loss": 2.58, + "step": 814400 + }, + { + "epoch": 0.21241764448493805, + "grad_norm": 12.723822593688965, + "learning_rate": 9.078566205561981e-06, + "loss": 2.542, + "step": 814600 + }, + { + "epoch": 0.21246979711064023, + "grad_norm": 12.822134971618652, + "learning_rate": 9.078082599479535e-06, + "loss": 2.5328, + "step": 814800 + }, + { + "epoch": 0.2125219497363424, + "grad_norm": 10.794196128845215, + "learning_rate": 9.077598879409977e-06, + "loss": 2.5756, + "step": 815000 + }, + { + "epoch": 0.21257410236204458, + "grad_norm": 11.451395034790039, + "learning_rate": 9.07711504536683e-06, + "loss": 2.5266, + "step": 815200 + }, + { + "epoch": 0.21262625498774673, + "grad_norm": 13.746084213256836, + "learning_rate": 9.076631097363615e-06, + "loss": 2.53, + "step": 815400 + }, + { + "epoch": 0.21267840761344892, + "grad_norm": 13.913445472717285, + "learning_rate": 9.076147035413863e-06, + "loss": 2.5575, + "step": 815600 + }, + { + "epoch": 0.21273056023915107, + "grad_norm": 12.547298431396484, + "learning_rate": 9.0756628595311e-06, + "loss": 2.5415, + "step": 815800 + }, + { + "epoch": 0.21278271286485326, + "grad_norm": 13.744832038879395, + "learning_rate": 9.07517856972886e-06, + "loss": 2.5073, + "step": 816000 + }, + { + "epoch": 0.2128348654905554, + "grad_norm": 12.159852027893066, + "learning_rate": 9.074694166020682e-06, + "loss": 2.5024, + "step": 816200 + }, + { + "epoch": 0.2128870181162576, + "grad_norm": 12.20659065246582, + "learning_rate": 9.074209648420101e-06, + "loss": 2.518, + "step": 816400 + }, + { + "epoch": 0.21293917074195975, + "grad_norm": 12.317503929138184, + "learning_rate": 9.073725016940665e-06, + "loss": 2.5395, + "step": 816600 + }, + { + "epoch": 0.21299132336766194, + "grad_norm": 12.387846946716309, + "learning_rate": 9.073240271595916e-06, + "loss": 2.5347, + "step": 816800 + }, + { + "epoch": 0.2130434759933641, + "grad_norm": 13.619342803955078, + "learning_rate": 9.072755412399406e-06, + "loss": 2.5815, + "step": 817000 + }, + { + "epoch": 0.21309562861906628, + "grad_norm": 12.313323020935059, + "learning_rate": 9.072270439364684e-06, + "loss": 2.5288, + "step": 817200 + }, + { + "epoch": 0.21314778124476844, + "grad_norm": 15.066256523132324, + "learning_rate": 9.07178535250531e-06, + "loss": 2.5235, + "step": 817400 + }, + { + "epoch": 0.21319993387047062, + "grad_norm": 13.39647388458252, + "learning_rate": 9.07130015183484e-06, + "loss": 2.5484, + "step": 817600 + }, + { + "epoch": 0.21325208649617278, + "grad_norm": 11.415046691894531, + "learning_rate": 9.070814837366836e-06, + "loss": 2.5879, + "step": 817800 + }, + { + "epoch": 0.21330423912187496, + 
"grad_norm": 10.62917709350586, + "learning_rate": 9.070329409114862e-06, + "loss": 2.5464, + "step": 818000 + }, + { + "epoch": 0.21335639174757712, + "grad_norm": 13.691715240478516, + "learning_rate": 9.06984386709249e-06, + "loss": 2.5395, + "step": 818200 + }, + { + "epoch": 0.2134085443732793, + "grad_norm": 10.864567756652832, + "learning_rate": 9.069358211313289e-06, + "loss": 2.5282, + "step": 818400 + }, + { + "epoch": 0.21346069699898146, + "grad_norm": 11.341833114624023, + "learning_rate": 9.068872441790832e-06, + "loss": 2.5543, + "step": 818600 + }, + { + "epoch": 0.21351284962468364, + "grad_norm": 11.319380760192871, + "learning_rate": 9.0683865585387e-06, + "loss": 2.5383, + "step": 818800 + }, + { + "epoch": 0.2135650022503858, + "grad_norm": 11.097443580627441, + "learning_rate": 9.067900561570472e-06, + "loss": 2.5757, + "step": 819000 + }, + { + "epoch": 0.21361715487608796, + "grad_norm": 11.91701889038086, + "learning_rate": 9.067414450899735e-06, + "loss": 2.5255, + "step": 819200 + }, + { + "epoch": 0.21366930750179014, + "grad_norm": 11.580026626586914, + "learning_rate": 9.066928226540073e-06, + "loss": 2.5549, + "step": 819400 + }, + { + "epoch": 0.2137214601274923, + "grad_norm": 11.118182182312012, + "learning_rate": 9.066441888505077e-06, + "loss": 2.5484, + "step": 819600 + }, + { + "epoch": 0.21377361275319448, + "grad_norm": 11.571749687194824, + "learning_rate": 9.065955436808343e-06, + "loss": 2.5065, + "step": 819800 + }, + { + "epoch": 0.21382576537889664, + "grad_norm": 12.736671447753906, + "learning_rate": 9.065468871463464e-06, + "loss": 2.5625, + "step": 820000 + }, + { + "epoch": 0.21387791800459882, + "grad_norm": 13.102340698242188, + "learning_rate": 9.064982192484042e-06, + "loss": 2.5556, + "step": 820200 + }, + { + "epoch": 0.21393007063030098, + "grad_norm": 13.578348159790039, + "learning_rate": 9.064495399883684e-06, + "loss": 2.5439, + "step": 820400 + }, + { + "epoch": 0.21398222325600316, + "grad_norm": 13.68409538269043, + "learning_rate": 9.064008493675991e-06, + "loss": 2.5887, + "step": 820600 + }, + { + "epoch": 0.21403437588170532, + "grad_norm": 13.699695587158203, + "learning_rate": 9.063521473874574e-06, + "loss": 2.5437, + "step": 820800 + }, + { + "epoch": 0.2140865285074075, + "grad_norm": 12.38679313659668, + "learning_rate": 9.063034340493048e-06, + "loss": 2.5402, + "step": 821000 + }, + { + "epoch": 0.21413868113310966, + "grad_norm": 11.524242401123047, + "learning_rate": 9.062547093545025e-06, + "loss": 2.5281, + "step": 821200 + }, + { + "epoch": 0.21419083375881184, + "grad_norm": 11.350960731506348, + "learning_rate": 9.062059733044129e-06, + "loss": 2.5657, + "step": 821400 + }, + { + "epoch": 0.214242986384514, + "grad_norm": 14.124442100524902, + "learning_rate": 9.06157225900398e-06, + "loss": 2.543, + "step": 821600 + }, + { + "epoch": 0.21429513901021618, + "grad_norm": 13.106842041015625, + "learning_rate": 9.061084671438201e-06, + "loss": 2.5266, + "step": 821800 + }, + { + "epoch": 0.21434729163591834, + "grad_norm": 12.565150260925293, + "learning_rate": 9.060596970360423e-06, + "loss": 2.5241, + "step": 822000 + }, + { + "epoch": 0.21439944426162053, + "grad_norm": 12.959101676940918, + "learning_rate": 9.060109155784278e-06, + "loss": 2.5496, + "step": 822200 + }, + { + "epoch": 0.21445159688732268, + "grad_norm": 12.728083610534668, + "learning_rate": 9.059621227723401e-06, + "loss": 2.5343, + "step": 822400 + }, + { + "epoch": 0.21450374951302487, + "grad_norm": 13.199159622192383, + 
"learning_rate": 9.05913318619143e-06, + "loss": 2.5516, + "step": 822600 + }, + { + "epoch": 0.21455590213872702, + "grad_norm": 11.12812614440918, + "learning_rate": 9.058645031202005e-06, + "loss": 2.5033, + "step": 822800 + }, + { + "epoch": 0.2146080547644292, + "grad_norm": 13.735895156860352, + "learning_rate": 9.058156762768773e-06, + "loss": 2.5754, + "step": 823000 + }, + { + "epoch": 0.21466020739013136, + "grad_norm": 14.441383361816406, + "learning_rate": 9.057668380905379e-06, + "loss": 2.5403, + "step": 823200 + }, + { + "epoch": 0.21471236001583355, + "grad_norm": 14.57379150390625, + "learning_rate": 9.057179885625475e-06, + "loss": 2.5133, + "step": 823400 + }, + { + "epoch": 0.2147645126415357, + "grad_norm": 12.929058074951172, + "learning_rate": 9.056691276942717e-06, + "loss": 2.5448, + "step": 823600 + }, + { + "epoch": 0.2148166652672379, + "grad_norm": 12.269718170166016, + "learning_rate": 9.056202554870759e-06, + "loss": 2.5746, + "step": 823800 + }, + { + "epoch": 0.21486881789294004, + "grad_norm": 13.69266414642334, + "learning_rate": 9.055713719423262e-06, + "loss": 2.5415, + "step": 824000 + }, + { + "epoch": 0.21492097051864223, + "grad_norm": 10.19147777557373, + "learning_rate": 9.05522477061389e-06, + "loss": 2.6066, + "step": 824200 + }, + { + "epoch": 0.21497312314434439, + "grad_norm": 13.49588680267334, + "learning_rate": 9.054735708456311e-06, + "loss": 2.5503, + "step": 824400 + }, + { + "epoch": 0.21502527577004657, + "grad_norm": 13.206926345825195, + "learning_rate": 9.054246532964194e-06, + "loss": 2.582, + "step": 824600 + }, + { + "epoch": 0.21507742839574873, + "grad_norm": 14.236640930175781, + "learning_rate": 9.053757244151211e-06, + "loss": 2.563, + "step": 824800 + }, + { + "epoch": 0.21512958102145088, + "grad_norm": 12.087728500366211, + "learning_rate": 9.053267842031038e-06, + "loss": 2.5586, + "step": 825000 + }, + { + "epoch": 0.21518173364715307, + "grad_norm": 13.20797348022461, + "learning_rate": 9.052778326617357e-06, + "loss": 2.5417, + "step": 825200 + }, + { + "epoch": 0.21523388627285522, + "grad_norm": 13.239849090576172, + "learning_rate": 9.052288697923848e-06, + "loss": 2.5569, + "step": 825400 + }, + { + "epoch": 0.2152860388985574, + "grad_norm": 13.600337028503418, + "learning_rate": 9.051798955964196e-06, + "loss": 2.5431, + "step": 825600 + }, + { + "epoch": 0.21533819152425956, + "grad_norm": 14.598825454711914, + "learning_rate": 9.051309100752093e-06, + "loss": 2.5203, + "step": 825800 + }, + { + "epoch": 0.21539034414996175, + "grad_norm": 11.750934600830078, + "learning_rate": 9.050819132301227e-06, + "loss": 2.5027, + "step": 826000 + }, + { + "epoch": 0.2154424967756639, + "grad_norm": 13.683958053588867, + "learning_rate": 9.050329050625299e-06, + "loss": 2.5162, + "step": 826200 + }, + { + "epoch": 0.2154946494013661, + "grad_norm": 12.847051620483398, + "learning_rate": 9.049838855738001e-06, + "loss": 2.5338, + "step": 826400 + }, + { + "epoch": 0.21554680202706825, + "grad_norm": 12.681694984436035, + "learning_rate": 9.049348547653038e-06, + "loss": 2.532, + "step": 826600 + }, + { + "epoch": 0.21559895465277043, + "grad_norm": 13.66247272491455, + "learning_rate": 9.048858126384115e-06, + "loss": 2.5436, + "step": 826800 + }, + { + "epoch": 0.2156511072784726, + "grad_norm": 14.794609069824219, + "learning_rate": 9.048367591944938e-06, + "loss": 2.57, + "step": 827000 + }, + { + "epoch": 0.21570325990417477, + "grad_norm": 13.683991432189941, + "learning_rate": 9.04787694434922e-06, + "loss": 2.5269, 
+ "step": 827200 + }, + { + "epoch": 0.21575541252987693, + "grad_norm": 11.677818298339844, + "learning_rate": 9.047386183610672e-06, + "loss": 2.5464, + "step": 827400 + }, + { + "epoch": 0.2158075651555791, + "grad_norm": 12.510682106018066, + "learning_rate": 9.046895309743016e-06, + "loss": 2.5522, + "step": 827600 + }, + { + "epoch": 0.21585971778128127, + "grad_norm": 14.272283554077148, + "learning_rate": 9.046404322759968e-06, + "loss": 2.5624, + "step": 827800 + }, + { + "epoch": 0.21591187040698345, + "grad_norm": 13.069134712219238, + "learning_rate": 9.045913222675253e-06, + "loss": 2.576, + "step": 828000 + }, + { + "epoch": 0.2159640230326856, + "grad_norm": 12.253870964050293, + "learning_rate": 9.045422009502598e-06, + "loss": 2.5634, + "step": 828200 + }, + { + "epoch": 0.2160161756583878, + "grad_norm": 13.879683494567871, + "learning_rate": 9.044930683255735e-06, + "loss": 2.5178, + "step": 828400 + }, + { + "epoch": 0.21606832828408995, + "grad_norm": 11.61858081817627, + "learning_rate": 9.044439243948395e-06, + "loss": 2.5837, + "step": 828600 + }, + { + "epoch": 0.21612048090979213, + "grad_norm": 14.2604398727417, + "learning_rate": 9.043947691594314e-06, + "loss": 2.5271, + "step": 828800 + }, + { + "epoch": 0.2161726335354943, + "grad_norm": 13.9703950881958, + "learning_rate": 9.043456026207234e-06, + "loss": 2.5254, + "step": 829000 + }, + { + "epoch": 0.21622478616119647, + "grad_norm": 12.149286270141602, + "learning_rate": 9.042964247800894e-06, + "loss": 2.5174, + "step": 829200 + }, + { + "epoch": 0.21627693878689863, + "grad_norm": 11.892560958862305, + "learning_rate": 9.042472356389043e-06, + "loss": 2.5527, + "step": 829400 + }, + { + "epoch": 0.21632909141260082, + "grad_norm": 12.29517650604248, + "learning_rate": 9.041980351985427e-06, + "loss": 2.5139, + "step": 829600 + }, + { + "epoch": 0.21638124403830297, + "grad_norm": 12.279248237609863, + "learning_rate": 9.0414882346038e-06, + "loss": 2.5625, + "step": 829800 + }, + { + "epoch": 0.21643339666400516, + "grad_norm": 13.284655570983887, + "learning_rate": 9.040996004257915e-06, + "loss": 2.5293, + "step": 830000 + }, + { + "epoch": 0.2164855492897073, + "grad_norm": 14.251969337463379, + "learning_rate": 9.040503660961536e-06, + "loss": 2.5693, + "step": 830200 + }, + { + "epoch": 0.2165377019154095, + "grad_norm": 11.924370765686035, + "learning_rate": 9.040011204728419e-06, + "loss": 2.5473, + "step": 830400 + }, + { + "epoch": 0.21658985454111165, + "grad_norm": 10.449265480041504, + "learning_rate": 9.03951863557233e-06, + "loss": 2.5117, + "step": 830600 + }, + { + "epoch": 0.2166420071668138, + "grad_norm": 12.4495210647583, + "learning_rate": 9.039025953507038e-06, + "loss": 2.5552, + "step": 830800 + }, + { + "epoch": 0.216694159792516, + "grad_norm": 13.751075744628906, + "learning_rate": 9.038533158546313e-06, + "loss": 2.568, + "step": 831000 + }, + { + "epoch": 0.21674631241821815, + "grad_norm": 12.695106506347656, + "learning_rate": 9.03804025070393e-06, + "loss": 2.5351, + "step": 831200 + }, + { + "epoch": 0.21679846504392034, + "grad_norm": 11.535404205322266, + "learning_rate": 9.037547229993667e-06, + "loss": 2.5723, + "step": 831400 + }, + { + "epoch": 0.2168506176696225, + "grad_norm": 12.56066608428955, + "learning_rate": 9.037054096429301e-06, + "loss": 2.54, + "step": 831600 + }, + { + "epoch": 0.21690277029532468, + "grad_norm": 12.696258544921875, + "learning_rate": 9.03656085002462e-06, + "loss": 2.5412, + "step": 831800 + }, + { + "epoch": 0.21695492292102683, + 
"grad_norm": 12.870928764343262, + "learning_rate": 9.036067490793408e-06, + "loss": 2.5416, + "step": 832000 + }, + { + "epoch": 0.21700707554672902, + "grad_norm": 11.494585037231445, + "learning_rate": 9.035574018749456e-06, + "loss": 2.5161, + "step": 832200 + }, + { + "epoch": 0.21705922817243117, + "grad_norm": 13.567378997802734, + "learning_rate": 9.035080433906557e-06, + "loss": 2.5638, + "step": 832400 + }, + { + "epoch": 0.21711138079813336, + "grad_norm": 11.019129753112793, + "learning_rate": 9.034586736278507e-06, + "loss": 2.5199, + "step": 832600 + }, + { + "epoch": 0.21716353342383551, + "grad_norm": 11.188690185546875, + "learning_rate": 9.034092925879106e-06, + "loss": 2.5601, + "step": 832800 + }, + { + "epoch": 0.2172156860495377, + "grad_norm": 13.095340728759766, + "learning_rate": 9.033599002722157e-06, + "loss": 2.5373, + "step": 833000 + }, + { + "epoch": 0.21726783867523985, + "grad_norm": 13.44362735748291, + "learning_rate": 9.033104966821464e-06, + "loss": 2.5655, + "step": 833200 + }, + { + "epoch": 0.21731999130094204, + "grad_norm": 13.946337699890137, + "learning_rate": 9.032610818190836e-06, + "loss": 2.5533, + "step": 833400 + }, + { + "epoch": 0.2173721439266442, + "grad_norm": 13.125496864318848, + "learning_rate": 9.032116556844088e-06, + "loss": 2.5252, + "step": 833600 + }, + { + "epoch": 0.21742429655234638, + "grad_norm": 14.219965934753418, + "learning_rate": 9.031622182795033e-06, + "loss": 2.5037, + "step": 833800 + }, + { + "epoch": 0.21747644917804854, + "grad_norm": 12.486796379089355, + "learning_rate": 9.031127696057488e-06, + "loss": 2.5568, + "step": 834000 + }, + { + "epoch": 0.21752860180375072, + "grad_norm": 14.136662483215332, + "learning_rate": 9.030633096645277e-06, + "loss": 2.5376, + "step": 834200 + }, + { + "epoch": 0.21758075442945288, + "grad_norm": 13.452658653259277, + "learning_rate": 9.030138384572222e-06, + "loss": 2.5766, + "step": 834400 + }, + { + "epoch": 0.21763290705515506, + "grad_norm": 11.165623664855957, + "learning_rate": 9.029643559852152e-06, + "loss": 2.527, + "step": 834600 + }, + { + "epoch": 0.21768505968085722, + "grad_norm": 11.985844612121582, + "learning_rate": 9.029148622498898e-06, + "loss": 2.5726, + "step": 834800 + }, + { + "epoch": 0.2177372123065594, + "grad_norm": 12.202817916870117, + "learning_rate": 9.028653572526296e-06, + "loss": 2.5832, + "step": 835000 + }, + { + "epoch": 0.21778936493226156, + "grad_norm": 11.637069702148438, + "learning_rate": 9.028158409948181e-06, + "loss": 2.5166, + "step": 835200 + }, + { + "epoch": 0.21784151755796374, + "grad_norm": 12.84749698638916, + "learning_rate": 9.027663134778394e-06, + "loss": 2.5217, + "step": 835400 + }, + { + "epoch": 0.2178936701836659, + "grad_norm": 12.990008354187012, + "learning_rate": 9.027167747030776e-06, + "loss": 2.5318, + "step": 835600 + }, + { + "epoch": 0.21794582280936808, + "grad_norm": 12.376798629760742, + "learning_rate": 9.026672246719179e-06, + "loss": 2.5525, + "step": 835800 + }, + { + "epoch": 0.21799797543507024, + "grad_norm": 11.98593521118164, + "learning_rate": 9.02617663385745e-06, + "loss": 2.492, + "step": 836000 + }, + { + "epoch": 0.21805012806077242, + "grad_norm": 12.605941772460938, + "learning_rate": 9.025680908459437e-06, + "loss": 2.5542, + "step": 836200 + }, + { + "epoch": 0.21810228068647458, + "grad_norm": 12.421950340270996, + "learning_rate": 9.025185070539005e-06, + "loss": 2.5238, + "step": 836400 + }, + { + "epoch": 0.21815443331217677, + "grad_norm": 11.057263374328613, + 
"learning_rate": 9.024689120110009e-06, + "loss": 2.5182, + "step": 836600 + }, + { + "epoch": 0.21820658593787892, + "grad_norm": 13.192597389221191, + "learning_rate": 9.02419305718631e-06, + "loss": 2.5215, + "step": 836800 + }, + { + "epoch": 0.21825873856358108, + "grad_norm": 13.819595336914062, + "learning_rate": 9.023696881781776e-06, + "loss": 2.5339, + "step": 837000 + }, + { + "epoch": 0.21831089118928326, + "grad_norm": 12.131237983703613, + "learning_rate": 9.023200593910273e-06, + "loss": 2.5688, + "step": 837200 + }, + { + "epoch": 0.21836304381498542, + "grad_norm": 12.664222717285156, + "learning_rate": 9.022704193585677e-06, + "loss": 2.5514, + "step": 837400 + }, + { + "epoch": 0.2184151964406876, + "grad_norm": 13.142849922180176, + "learning_rate": 9.022207680821858e-06, + "loss": 2.5142, + "step": 837600 + }, + { + "epoch": 0.21846734906638976, + "grad_norm": 13.29996109008789, + "learning_rate": 9.021711055632698e-06, + "loss": 2.5353, + "step": 837800 + }, + { + "epoch": 0.21851950169209194, + "grad_norm": 12.690460205078125, + "learning_rate": 9.021214318032077e-06, + "loss": 2.5696, + "step": 838000 + }, + { + "epoch": 0.2185716543177941, + "grad_norm": 13.326699256896973, + "learning_rate": 9.020717468033877e-06, + "loss": 2.5998, + "step": 838200 + }, + { + "epoch": 0.21862380694349628, + "grad_norm": 11.772669792175293, + "learning_rate": 9.02022050565199e-06, + "loss": 2.5221, + "step": 838400 + }, + { + "epoch": 0.21867595956919844, + "grad_norm": 12.478630065917969, + "learning_rate": 9.019723430900304e-06, + "loss": 2.5427, + "step": 838600 + }, + { + "epoch": 0.21872811219490063, + "grad_norm": 11.8933687210083, + "learning_rate": 9.019226243792712e-06, + "loss": 2.5576, + "step": 838800 + }, + { + "epoch": 0.21878026482060278, + "grad_norm": 11.510830879211426, + "learning_rate": 9.018728944343113e-06, + "loss": 2.5357, + "step": 839000 + }, + { + "epoch": 0.21883241744630497, + "grad_norm": 12.072577476501465, + "learning_rate": 9.018231532565407e-06, + "loss": 2.5547, + "step": 839200 + }, + { + "epoch": 0.21888457007200712, + "grad_norm": 12.852787017822266, + "learning_rate": 9.017734008473495e-06, + "loss": 2.5539, + "step": 839400 + }, + { + "epoch": 0.2189367226977093, + "grad_norm": 15.137070655822754, + "learning_rate": 9.017236372081286e-06, + "loss": 2.5924, + "step": 839600 + }, + { + "epoch": 0.21898887532341146, + "grad_norm": 13.725959777832031, + "learning_rate": 9.016738623402688e-06, + "loss": 2.5264, + "step": 839800 + }, + { + "epoch": 0.21904102794911365, + "grad_norm": 12.524657249450684, + "learning_rate": 9.016240762451613e-06, + "loss": 2.5328, + "step": 840000 + }, + { + "epoch": 0.2190931805748158, + "grad_norm": 12.656744956970215, + "learning_rate": 9.015742789241979e-06, + "loss": 2.5303, + "step": 840200 + }, + { + "epoch": 0.219145333200518, + "grad_norm": 14.41982364654541, + "learning_rate": 9.015244703787704e-06, + "loss": 2.5192, + "step": 840400 + }, + { + "epoch": 0.21919748582622015, + "grad_norm": 13.495166778564453, + "learning_rate": 9.014746506102709e-06, + "loss": 2.5097, + "step": 840600 + }, + { + "epoch": 0.21924963845192233, + "grad_norm": 12.303117752075195, + "learning_rate": 9.01424819620092e-06, + "loss": 2.5406, + "step": 840800 + }, + { + "epoch": 0.21930179107762449, + "grad_norm": 11.897546768188477, + "learning_rate": 9.013749774096265e-06, + "loss": 2.5776, + "step": 841000 + }, + { + "epoch": 0.21935394370332667, + "grad_norm": 11.718711853027344, + "learning_rate": 9.013251239802676e-06, + 
"loss": 2.5363, + "step": 841200 + }, + { + "epoch": 0.21940609632902883, + "grad_norm": 14.32603931427002, + "learning_rate": 9.012752593334087e-06, + "loss": 2.528, + "step": 841400 + }, + { + "epoch": 0.219458248954731, + "grad_norm": 11.693556785583496, + "learning_rate": 9.012253834704438e-06, + "loss": 2.5022, + "step": 841600 + }, + { + "epoch": 0.21951040158043317, + "grad_norm": 10.804730415344238, + "learning_rate": 9.011754963927666e-06, + "loss": 2.5521, + "step": 841800 + }, + { + "epoch": 0.21956255420613535, + "grad_norm": 12.166260719299316, + "learning_rate": 9.011255981017718e-06, + "loss": 2.5571, + "step": 842000 + }, + { + "epoch": 0.2196147068318375, + "grad_norm": 9.505474090576172, + "learning_rate": 9.010756885988541e-06, + "loss": 2.5141, + "step": 842200 + }, + { + "epoch": 0.2196668594575397, + "grad_norm": 12.927766799926758, + "learning_rate": 9.010257678854081e-06, + "loss": 2.5813, + "step": 842400 + }, + { + "epoch": 0.21971901208324185, + "grad_norm": 12.975953102111816, + "learning_rate": 9.009758359628298e-06, + "loss": 2.5434, + "step": 842600 + }, + { + "epoch": 0.219771164708944, + "grad_norm": 12.598480224609375, + "learning_rate": 9.009258928325146e-06, + "loss": 2.52, + "step": 842800 + }, + { + "epoch": 0.2198233173346462, + "grad_norm": 11.917780876159668, + "learning_rate": 9.008759384958582e-06, + "loss": 2.5169, + "step": 843000 + }, + { + "epoch": 0.21987546996034835, + "grad_norm": 12.304062843322754, + "learning_rate": 9.008259729542572e-06, + "loss": 2.5698, + "step": 843200 + }, + { + "epoch": 0.21992762258605053, + "grad_norm": 15.084339141845703, + "learning_rate": 9.00775996209108e-06, + "loss": 2.5361, + "step": 843400 + }, + { + "epoch": 0.2199797752117527, + "grad_norm": 13.05875301361084, + "learning_rate": 9.007260082618077e-06, + "loss": 2.5248, + "step": 843600 + }, + { + "epoch": 0.22003192783745487, + "grad_norm": 13.704729080200195, + "learning_rate": 9.006760091137534e-06, + "loss": 2.5512, + "step": 843800 + }, + { + "epoch": 0.22008408046315703, + "grad_norm": 13.089705467224121, + "learning_rate": 9.006259987663425e-06, + "loss": 2.526, + "step": 844000 + }, + { + "epoch": 0.2201362330888592, + "grad_norm": 12.926041603088379, + "learning_rate": 9.005759772209732e-06, + "loss": 2.5207, + "step": 844200 + }, + { + "epoch": 0.22018838571456137, + "grad_norm": 11.693181991577148, + "learning_rate": 9.005259444790432e-06, + "loss": 2.492, + "step": 844400 + }, + { + "epoch": 0.22024053834026355, + "grad_norm": 12.62060260772705, + "learning_rate": 9.004759005419515e-06, + "loss": 2.5323, + "step": 844600 + }, + { + "epoch": 0.2202926909659657, + "grad_norm": 12.25759506225586, + "learning_rate": 9.004258454110966e-06, + "loss": 2.4841, + "step": 844800 + }, + { + "epoch": 0.2203448435916679, + "grad_norm": 13.82259750366211, + "learning_rate": 9.003757790878775e-06, + "loss": 2.5533, + "step": 845000 + }, + { + "epoch": 0.22039699621737005, + "grad_norm": 12.706326484680176, + "learning_rate": 9.003257015736937e-06, + "loss": 2.5247, + "step": 845200 + }, + { + "epoch": 0.22044914884307223, + "grad_norm": 12.647273063659668, + "learning_rate": 9.00275612869945e-06, + "loss": 2.5762, + "step": 845400 + }, + { + "epoch": 0.2205013014687744, + "grad_norm": 12.341408729553223, + "learning_rate": 9.002255129780313e-06, + "loss": 2.5289, + "step": 845600 + }, + { + "epoch": 0.22055345409447658, + "grad_norm": 13.406020164489746, + "learning_rate": 9.001754018993531e-06, + "loss": 2.4797, + "step": 845800 + }, + { + "epoch": 
0.22060560672017873, + "grad_norm": 13.798885345458984, + "learning_rate": 9.001252796353111e-06, + "loss": 2.5031, + "step": 846000 + }, + { + "epoch": 0.22065775934588092, + "grad_norm": 13.883713722229004, + "learning_rate": 9.000751461873061e-06, + "loss": 2.5013, + "step": 846200 + }, + { + "epoch": 0.22070991197158307, + "grad_norm": 12.606523513793945, + "learning_rate": 9.000250015567394e-06, + "loss": 2.5392, + "step": 846400 + }, + { + "epoch": 0.22076206459728526, + "grad_norm": 13.218819618225098, + "learning_rate": 8.999748457450125e-06, + "loss": 2.5223, + "step": 846600 + }, + { + "epoch": 0.2208142172229874, + "grad_norm": 14.202933311462402, + "learning_rate": 8.999246787535279e-06, + "loss": 2.5431, + "step": 846800 + }, + { + "epoch": 0.2208663698486896, + "grad_norm": 10.781413078308105, + "learning_rate": 8.99874500583687e-06, + "loss": 2.5516, + "step": 847000 + }, + { + "epoch": 0.22091852247439175, + "grad_norm": 12.983638763427734, + "learning_rate": 8.99824311236893e-06, + "loss": 2.5079, + "step": 847200 + }, + { + "epoch": 0.22097067510009394, + "grad_norm": 12.726056098937988, + "learning_rate": 8.997741107145484e-06, + "loss": 2.4876, + "step": 847400 + }, + { + "epoch": 0.2210228277257961, + "grad_norm": 13.305671691894531, + "learning_rate": 8.997238990180563e-06, + "loss": 2.5322, + "step": 847600 + }, + { + "epoch": 0.22107498035149828, + "grad_norm": 12.6900634765625, + "learning_rate": 8.996736761488205e-06, + "loss": 2.539, + "step": 847800 + }, + { + "epoch": 0.22112713297720044, + "grad_norm": 12.353796005249023, + "learning_rate": 8.996234421082447e-06, + "loss": 2.6022, + "step": 848000 + }, + { + "epoch": 0.22117928560290262, + "grad_norm": 13.8502836227417, + "learning_rate": 8.995731968977327e-06, + "loss": 2.5678, + "step": 848200 + }, + { + "epoch": 0.22123143822860478, + "grad_norm": 13.232843399047852, + "learning_rate": 8.995229405186892e-06, + "loss": 2.5363, + "step": 848400 + }, + { + "epoch": 0.22128359085430693, + "grad_norm": 13.014341354370117, + "learning_rate": 8.994726729725188e-06, + "loss": 2.5494, + "step": 848600 + }, + { + "epoch": 0.22133574348000912, + "grad_norm": 13.17010498046875, + "learning_rate": 8.994223942606266e-06, + "loss": 2.5208, + "step": 848800 + }, + { + "epoch": 0.22138789610571127, + "grad_norm": 13.344563484191895, + "learning_rate": 8.99372104384418e-06, + "loss": 2.5356, + "step": 849000 + }, + { + "epoch": 0.22144004873141346, + "grad_norm": 11.80360221862793, + "learning_rate": 8.993218033452985e-06, + "loss": 2.5652, + "step": 849200 + }, + { + "epoch": 0.22149220135711561, + "grad_norm": 12.011212348937988, + "learning_rate": 8.992714911446743e-06, + "loss": 2.5629, + "step": 849400 + }, + { + "epoch": 0.2215443539828178, + "grad_norm": 13.381622314453125, + "learning_rate": 8.992211677839514e-06, + "loss": 2.542, + "step": 849600 + }, + { + "epoch": 0.22159650660851996, + "grad_norm": 12.223089218139648, + "learning_rate": 8.991708332645365e-06, + "loss": 2.545, + "step": 849800 + }, + { + "epoch": 0.22164865923422214, + "grad_norm": 12.903460502624512, + "learning_rate": 8.991204875878369e-06, + "loss": 2.5258, + "step": 850000 + }, + { + "epoch": 0.2217008118599243, + "grad_norm": 13.3897123336792, + "learning_rate": 8.990701307552589e-06, + "loss": 2.4829, + "step": 850200 + }, + { + "epoch": 0.22175296448562648, + "grad_norm": 14.366217613220215, + "learning_rate": 8.99019762768211e-06, + "loss": 2.5757, + "step": 850400 + }, + { + "epoch": 0.22180511711132864, + "grad_norm": 12.164314270019531, 
+ "learning_rate": 8.989693836281006e-06, + "loss": 2.5282, + "step": 850600 + }, + { + "epoch": 0.22185726973703082, + "grad_norm": 12.974170684814453, + "learning_rate": 8.989189933363359e-06, + "loss": 2.5623, + "step": 850800 + }, + { + "epoch": 0.22190942236273298, + "grad_norm": 12.635279655456543, + "learning_rate": 8.988685918943252e-06, + "loss": 2.4967, + "step": 851000 + }, + { + "epoch": 0.22196157498843516, + "grad_norm": 11.550530433654785, + "learning_rate": 8.988181793034776e-06, + "loss": 2.5069, + "step": 851200 + }, + { + "epoch": 0.22201372761413732, + "grad_norm": 12.456903457641602, + "learning_rate": 8.987677555652018e-06, + "loss": 2.5032, + "step": 851400 + }, + { + "epoch": 0.2220658802398395, + "grad_norm": 14.79409122467041, + "learning_rate": 8.987173206809078e-06, + "loss": 2.5223, + "step": 851600 + }, + { + "epoch": 0.22211803286554166, + "grad_norm": 12.690301895141602, + "learning_rate": 8.986668746520048e-06, + "loss": 2.5552, + "step": 851800 + }, + { + "epoch": 0.22217018549124384, + "grad_norm": 14.763097763061523, + "learning_rate": 8.986164174799029e-06, + "loss": 2.5392, + "step": 852000 + }, + { + "epoch": 0.222222338116946, + "grad_norm": 13.585575103759766, + "learning_rate": 8.985659491660126e-06, + "loss": 2.5645, + "step": 852200 + }, + { + "epoch": 0.22227449074264818, + "grad_norm": 13.037672996520996, + "learning_rate": 8.985154697117444e-06, + "loss": 2.545, + "step": 852400 + }, + { + "epoch": 0.22232664336835034, + "grad_norm": 13.282525062561035, + "learning_rate": 8.984649791185093e-06, + "loss": 2.5653, + "step": 852600 + }, + { + "epoch": 0.22237879599405252, + "grad_norm": 13.44238567352295, + "learning_rate": 8.984144773877186e-06, + "loss": 2.5334, + "step": 852800 + }, + { + "epoch": 0.22243094861975468, + "grad_norm": 14.811910629272461, + "learning_rate": 8.983639645207839e-06, + "loss": 2.5435, + "step": 853000 + }, + { + "epoch": 0.22248310124545687, + "grad_norm": 11.7992582321167, + "learning_rate": 8.98313440519117e-06, + "loss": 2.5333, + "step": 853200 + }, + { + "epoch": 0.22253525387115902, + "grad_norm": 12.599976539611816, + "learning_rate": 8.982629053841302e-06, + "loss": 2.5414, + "step": 853400 + }, + { + "epoch": 0.2225874064968612, + "grad_norm": 12.891295433044434, + "learning_rate": 8.98212359117236e-06, + "loss": 2.5178, + "step": 853600 + }, + { + "epoch": 0.22263955912256336, + "grad_norm": 10.120680809020996, + "learning_rate": 8.981618017198472e-06, + "loss": 2.523, + "step": 853800 + }, + { + "epoch": 0.22269171174826555, + "grad_norm": 14.803202629089355, + "learning_rate": 8.98111233193377e-06, + "loss": 2.5104, + "step": 854000 + }, + { + "epoch": 0.2227438643739677, + "grad_norm": 11.992095947265625, + "learning_rate": 8.980606535392387e-06, + "loss": 2.5419, + "step": 854200 + }, + { + "epoch": 0.22279601699966986, + "grad_norm": 13.348030090332031, + "learning_rate": 8.980100627588461e-06, + "loss": 2.5249, + "step": 854400 + }, + { + "epoch": 0.22284816962537204, + "grad_norm": 12.748968124389648, + "learning_rate": 8.979594608536135e-06, + "loss": 2.5046, + "step": 854600 + }, + { + "epoch": 0.2229003222510742, + "grad_norm": 12.485724449157715, + "learning_rate": 8.97908847824955e-06, + "loss": 2.5059, + "step": 854800 + }, + { + "epoch": 0.22295247487677639, + "grad_norm": 13.133146286010742, + "learning_rate": 8.978582236742854e-06, + "loss": 2.5285, + "step": 855000 + }, + { + "epoch": 0.22300462750247854, + "grad_norm": 11.857640266418457, + "learning_rate": 8.978075884030197e-06, + "loss": 
2.5358, + "step": 855200 + }, + { + "epoch": 0.22305678012818073, + "grad_norm": 12.643730163574219, + "learning_rate": 8.977569420125732e-06, + "loss": 2.4963, + "step": 855400 + }, + { + "epoch": 0.22310893275388288, + "grad_norm": 11.763866424560547, + "learning_rate": 8.977062845043616e-06, + "loss": 2.5002, + "step": 855600 + }, + { + "epoch": 0.22316108537958507, + "grad_norm": 11.065905570983887, + "learning_rate": 8.976556158798006e-06, + "loss": 2.5613, + "step": 855800 + }, + { + "epoch": 0.22321323800528722, + "grad_norm": 12.88422679901123, + "learning_rate": 8.976049361403067e-06, + "loss": 2.5092, + "step": 856000 + }, + { + "epoch": 0.2232653906309894, + "grad_norm": 12.495038986206055, + "learning_rate": 8.975542452872966e-06, + "loss": 2.5522, + "step": 856200 + }, + { + "epoch": 0.22331754325669156, + "grad_norm": 10.122501373291016, + "learning_rate": 8.975035433221867e-06, + "loss": 2.5214, + "step": 856400 + }, + { + "epoch": 0.22336969588239375, + "grad_norm": 10.929478645324707, + "learning_rate": 8.974528302463946e-06, + "loss": 2.5244, + "step": 856600 + }, + { + "epoch": 0.2234218485080959, + "grad_norm": 13.326924324035645, + "learning_rate": 8.974021060613374e-06, + "loss": 2.499, + "step": 856800 + }, + { + "epoch": 0.2234740011337981, + "grad_norm": 11.079444885253906, + "learning_rate": 8.973513707684332e-06, + "loss": 2.5378, + "step": 857000 + }, + { + "epoch": 0.22352615375950025, + "grad_norm": 12.476800918579102, + "learning_rate": 8.973006243691001e-06, + "loss": 2.5788, + "step": 857200 + }, + { + "epoch": 0.22357830638520243, + "grad_norm": 14.203133583068848, + "learning_rate": 8.972498668647565e-06, + "loss": 2.5313, + "step": 857400 + }, + { + "epoch": 0.2236304590109046, + "grad_norm": 12.900059700012207, + "learning_rate": 8.971990982568208e-06, + "loss": 2.5252, + "step": 857600 + }, + { + "epoch": 0.22368261163660677, + "grad_norm": 10.50400447845459, + "learning_rate": 8.971483185467126e-06, + "loss": 2.506, + "step": 857800 + }, + { + "epoch": 0.22373476426230893, + "grad_norm": 12.504179954528809, + "learning_rate": 8.970975277358509e-06, + "loss": 2.5408, + "step": 858000 + }, + { + "epoch": 0.2237869168880111, + "grad_norm": 11.387737274169922, + "learning_rate": 8.970467258256552e-06, + "loss": 2.4938, + "step": 858200 + }, + { + "epoch": 0.22383906951371327, + "grad_norm": 12.079946517944336, + "learning_rate": 8.96995912817546e-06, + "loss": 2.5486, + "step": 858400 + }, + { + "epoch": 0.22389122213941545, + "grad_norm": 11.162164688110352, + "learning_rate": 8.969450887129431e-06, + "loss": 2.5334, + "step": 858600 + }, + { + "epoch": 0.2239433747651176, + "grad_norm": 15.253430366516113, + "learning_rate": 8.968942535132675e-06, + "loss": 2.5128, + "step": 858800 + }, + { + "epoch": 0.2239955273908198, + "grad_norm": 12.579545974731445, + "learning_rate": 8.968434072199396e-06, + "loss": 2.59, + "step": 859000 + }, + { + "epoch": 0.22404768001652195, + "grad_norm": 11.06598949432373, + "learning_rate": 8.96792549834381e-06, + "loss": 2.5401, + "step": 859200 + }, + { + "epoch": 0.22409983264222413, + "grad_norm": 12.26706600189209, + "learning_rate": 8.967416813580132e-06, + "loss": 2.5401, + "step": 859400 + }, + { + "epoch": 0.2241519852679263, + "grad_norm": 12.439864158630371, + "learning_rate": 8.966908017922578e-06, + "loss": 2.5566, + "step": 859600 + }, + { + "epoch": 0.22420413789362847, + "grad_norm": 12.08425521850586, + "learning_rate": 8.966399111385371e-06, + "loss": 2.5422, + "step": 859800 + }, + { + "epoch": 
0.22425629051933063, + "grad_norm": 11.494114875793457, + "learning_rate": 8.965890093982736e-06, + "loss": 2.5269, + "step": 860000 + }, + { + "epoch": 0.2243084431450328, + "grad_norm": 13.456847190856934, + "learning_rate": 8.9653809657289e-06, + "loss": 2.5331, + "step": 860200 + }, + { + "epoch": 0.22436059577073497, + "grad_norm": 12.735716819763184, + "learning_rate": 8.964871726638093e-06, + "loss": 2.5312, + "step": 860400 + }, + { + "epoch": 0.22441274839643713, + "grad_norm": 12.311686515808105, + "learning_rate": 8.964362376724552e-06, + "loss": 2.5054, + "step": 860600 + }, + { + "epoch": 0.2244649010221393, + "grad_norm": 14.03046989440918, + "learning_rate": 8.96385291600251e-06, + "loss": 2.5587, + "step": 860800 + }, + { + "epoch": 0.22451705364784147, + "grad_norm": 12.448440551757812, + "learning_rate": 8.963343344486208e-06, + "loss": 2.5317, + "step": 861000 + }, + { + "epoch": 0.22456920627354365, + "grad_norm": 12.499717712402344, + "learning_rate": 8.962833662189889e-06, + "loss": 2.5408, + "step": 861200 + }, + { + "epoch": 0.2246213588992458, + "grad_norm": 12.628153800964355, + "learning_rate": 8.962323869127802e-06, + "loss": 2.5539, + "step": 861400 + }, + { + "epoch": 0.224673511524948, + "grad_norm": 12.310242652893066, + "learning_rate": 8.961813965314192e-06, + "loss": 2.4965, + "step": 861600 + }, + { + "epoch": 0.22472566415065015, + "grad_norm": 12.289976119995117, + "learning_rate": 8.961303950763314e-06, + "loss": 2.5299, + "step": 861800 + }, + { + "epoch": 0.22477781677635233, + "grad_norm": 14.414912223815918, + "learning_rate": 8.960793825489425e-06, + "loss": 2.6051, + "step": 862000 + }, + { + "epoch": 0.2248299694020545, + "grad_norm": 12.091730117797852, + "learning_rate": 8.960283589506779e-06, + "loss": 2.5215, + "step": 862200 + }, + { + "epoch": 0.22488212202775668, + "grad_norm": 13.612672805786133, + "learning_rate": 8.959773242829641e-06, + "loss": 2.5501, + "step": 862400 + }, + { + "epoch": 0.22493427465345883, + "grad_norm": 13.881776809692383, + "learning_rate": 8.959262785472275e-06, + "loss": 2.5188, + "step": 862600 + }, + { + "epoch": 0.22498642727916102, + "grad_norm": 14.095483779907227, + "learning_rate": 8.95875221744895e-06, + "loss": 2.5083, + "step": 862800 + }, + { + "epoch": 0.22503857990486317, + "grad_norm": 12.817255020141602, + "learning_rate": 8.958241538773935e-06, + "loss": 2.5343, + "step": 863000 + }, + { + "epoch": 0.22509073253056536, + "grad_norm": 12.889725685119629, + "learning_rate": 8.957730749461505e-06, + "loss": 2.5256, + "step": 863200 + }, + { + "epoch": 0.2251428851562675, + "grad_norm": 13.904434204101562, + "learning_rate": 8.957219849525938e-06, + "loss": 2.5147, + "step": 863400 + }, + { + "epoch": 0.2251950377819697, + "grad_norm": 12.321836471557617, + "learning_rate": 8.956708838981512e-06, + "loss": 2.5378, + "step": 863600 + }, + { + "epoch": 0.22524719040767185, + "grad_norm": 12.931632041931152, + "learning_rate": 8.956197717842512e-06, + "loss": 2.4962, + "step": 863800 + }, + { + "epoch": 0.22529934303337404, + "grad_norm": 13.46435260772705, + "learning_rate": 8.955686486123225e-06, + "loss": 2.5106, + "step": 864000 + }, + { + "epoch": 0.2253514956590762, + "grad_norm": 13.250704765319824, + "learning_rate": 8.95517514383794e-06, + "loss": 2.5317, + "step": 864200 + }, + { + "epoch": 0.22540364828477838, + "grad_norm": 12.657617568969727, + "learning_rate": 8.954663691000947e-06, + "loss": 2.5211, + "step": 864400 + }, + { + "epoch": 0.22545580091048054, + "grad_norm": 
13.068683624267578, + "learning_rate": 8.954152127626545e-06, + "loss": 2.4836, + "step": 864600 + }, + { + "epoch": 0.22550795353618272, + "grad_norm": 12.12160873413086, + "learning_rate": 8.953640453729034e-06, + "loss": 2.548, + "step": 864800 + }, + { + "epoch": 0.22556010616188488, + "grad_norm": 11.243968963623047, + "learning_rate": 8.95312866932271e-06, + "loss": 2.5234, + "step": 865000 + }, + { + "epoch": 0.22561225878758706, + "grad_norm": 12.38250732421875, + "learning_rate": 8.952616774421883e-06, + "loss": 2.573, + "step": 865200 + }, + { + "epoch": 0.22566441141328922, + "grad_norm": 10.652915954589844, + "learning_rate": 8.95210476904086e-06, + "loss": 2.5371, + "step": 865400 + }, + { + "epoch": 0.2257165640389914, + "grad_norm": 13.076416015625, + "learning_rate": 8.95159265319395e-06, + "loss": 2.5046, + "step": 865600 + }, + { + "epoch": 0.22576871666469356, + "grad_norm": 12.925193786621094, + "learning_rate": 8.951080426895473e-06, + "loss": 2.5507, + "step": 865800 + }, + { + "epoch": 0.22582086929039574, + "grad_norm": 14.921239852905273, + "learning_rate": 8.95056809015974e-06, + "loss": 2.5211, + "step": 866000 + }, + { + "epoch": 0.2258730219160979, + "grad_norm": 12.77281665802002, + "learning_rate": 8.950055643001072e-06, + "loss": 2.548, + "step": 866200 + }, + { + "epoch": 0.22592517454180006, + "grad_norm": 12.110180854797363, + "learning_rate": 8.949543085433797e-06, + "loss": 2.5187, + "step": 866400 + }, + { + "epoch": 0.22597732716750224, + "grad_norm": 12.40743637084961, + "learning_rate": 8.949030417472235e-06, + "loss": 2.507, + "step": 866600 + }, + { + "epoch": 0.2260294797932044, + "grad_norm": 12.984556198120117, + "learning_rate": 8.948517639130722e-06, + "loss": 2.5171, + "step": 866800 + }, + { + "epoch": 0.22608163241890658, + "grad_norm": 12.792410850524902, + "learning_rate": 8.948004750423589e-06, + "loss": 2.5155, + "step": 867000 + }, + { + "epoch": 0.22613378504460874, + "grad_norm": 13.460736274719238, + "learning_rate": 8.94749175136517e-06, + "loss": 2.5426, + "step": 867200 + }, + { + "epoch": 0.22618593767031092, + "grad_norm": 10.04277229309082, + "learning_rate": 8.946978641969805e-06, + "loss": 2.5214, + "step": 867400 + }, + { + "epoch": 0.22623809029601308, + "grad_norm": 12.798548698425293, + "learning_rate": 8.946465422251836e-06, + "loss": 2.585, + "step": 867600 + }, + { + "epoch": 0.22629024292171526, + "grad_norm": 12.13387680053711, + "learning_rate": 8.945952092225607e-06, + "loss": 2.4898, + "step": 867800 + }, + { + "epoch": 0.22634239554741742, + "grad_norm": 13.749103546142578, + "learning_rate": 8.945438651905469e-06, + "loss": 2.4992, + "step": 868000 + }, + { + "epoch": 0.2263945481731196, + "grad_norm": 12.682474136352539, + "learning_rate": 8.94492510130577e-06, + "loss": 2.534, + "step": 868200 + }, + { + "epoch": 0.22644670079882176, + "grad_norm": 14.173575401306152, + "learning_rate": 8.944411440440865e-06, + "loss": 2.5106, + "step": 868400 + }, + { + "epoch": 0.22649885342452394, + "grad_norm": 14.226604461669922, + "learning_rate": 8.943897669325114e-06, + "loss": 2.538, + "step": 868600 + }, + { + "epoch": 0.2265510060502261, + "grad_norm": 11.75302791595459, + "learning_rate": 8.943383787972875e-06, + "loss": 2.5191, + "step": 868800 + }, + { + "epoch": 0.22660315867592828, + "grad_norm": 14.185355186462402, + "learning_rate": 8.942869796398513e-06, + "loss": 2.5775, + "step": 869000 + }, + { + "epoch": 0.22665531130163044, + "grad_norm": 13.081635475158691, + "learning_rate": 8.942355694616393e-06, + 
"loss": 2.5, + "step": 869200 + }, + { + "epoch": 0.22670746392733263, + "grad_norm": 13.637446403503418, + "learning_rate": 8.941841482640885e-06, + "loss": 2.5123, + "step": 869400 + }, + { + "epoch": 0.22675961655303478, + "grad_norm": 12.775493621826172, + "learning_rate": 8.941327160486364e-06, + "loss": 2.5291, + "step": 869600 + }, + { + "epoch": 0.22681176917873697, + "grad_norm": 12.630292892456055, + "learning_rate": 8.940812728167203e-06, + "loss": 2.5195, + "step": 869800 + }, + { + "epoch": 0.22686392180443912, + "grad_norm": 12.8863525390625, + "learning_rate": 8.940298185697783e-06, + "loss": 2.5228, + "step": 870000 + } + ], + "logging_steps": 200, + "max_steps": 3834898, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 15000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +}