diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,37637 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 53680, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007451564828614009, + "grad_norm": 1.7114099264144897, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 10 + }, + { + "epoch": 0.0014903129657228018, + "grad_norm": 2.16106915473938, + "learning_rate": 0.0002, + "loss": 1.9128, + "step": 20 + }, + { + "epoch": 0.0022354694485842027, + "grad_norm": 1.9972808361053467, + "learning_rate": 0.0002, + "loss": 1.8523, + "step": 30 + }, + { + "epoch": 0.0029806259314456036, + "grad_norm": 2.058236598968506, + "learning_rate": 0.0002, + "loss": 1.7295, + "step": 40 + }, + { + "epoch": 0.0037257824143070045, + "grad_norm": 2.164706230163574, + "learning_rate": 0.0002, + "loss": 1.6538, + "step": 50 + }, + { + "epoch": 0.004470938897168405, + "grad_norm": 1.7097787857055664, + "learning_rate": 0.0002, + "loss": 1.7809, + "step": 60 + }, + { + "epoch": 0.005216095380029807, + "grad_norm": 1.7462966442108154, + "learning_rate": 0.0002, + "loss": 1.7701, + "step": 70 + }, + { + "epoch": 0.005961251862891207, + "grad_norm": 2.259033679962158, + "learning_rate": 0.0002, + "loss": 1.7231, + "step": 80 + }, + { + "epoch": 0.0067064083457526085, + "grad_norm": 2.3188297748565674, + "learning_rate": 0.0002, + "loss": 1.9569, + "step": 90 + }, + { + "epoch": 0.007451564828614009, + "grad_norm": 1.7893937826156616, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 100 + }, + { + "epoch": 0.00819672131147541, + "grad_norm": 1.4487690925598145, + "learning_rate": 0.0002, + "loss": 2.5691, + "step": 110 + }, + { + "epoch": 0.00894187779433681, + "grad_norm": 1.6098469495773315, + "learning_rate": 0.0002, + "loss": 2.5209, + "step": 120 + }, + { + "epoch": 0.009687034277198211, + "grad_norm": 1.4259092807769775, + "learning_rate": 0.0002, + "loss": 2.5239, + "step": 130 + }, + { + "epoch": 0.010432190760059613, + "grad_norm": 1.4447660446166992, + "learning_rate": 0.0002, + "loss": 2.6533, + "step": 140 + }, + { + "epoch": 0.011177347242921014, + "grad_norm": 1.341070532798767, + "learning_rate": 0.0002, + "loss": 2.6231, + "step": 150 + }, + { + "epoch": 0.011922503725782414, + "grad_norm": 1.2440979480743408, + "learning_rate": 0.0002, + "loss": 2.4615, + "step": 160 + }, + { + "epoch": 0.012667660208643815, + "grad_norm": 1.4133532047271729, + "learning_rate": 0.0002, + "loss": 2.5564, + "step": 170 + }, + { + "epoch": 0.013412816691505217, + "grad_norm": 1.3202648162841797, + "learning_rate": 0.0002, + "loss": 2.4586, + "step": 180 + }, + { + "epoch": 0.014157973174366617, + "grad_norm": 1.2608141899108887, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 190 + }, + { + "epoch": 0.014903129657228018, + "grad_norm": 1.3800815343856812, + "learning_rate": 0.0002, + "loss": 2.4191, + "step": 200 + }, + { + "epoch": 0.01564828614008942, + "grad_norm": 1.4190888404846191, + "learning_rate": 0.0002, + "loss": 2.576, + "step": 210 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 1.36716628074646, + "learning_rate": 0.0002, + "loss": 2.4893, + "step": 220 + }, + { + "epoch": 0.01713859910581222, + "grad_norm": 1.4020277261734009, + "learning_rate": 0.0002, + "loss": 2.4958, + "step": 230 + }, + { + "epoch": 0.01788375558867362, + "grad_norm": 1.4159200191497803, + "learning_rate": 0.0002, + "loss": 2.2937, + "step": 240 + }, + { + "epoch": 0.018628912071535022, + "grad_norm": 1.6275138854980469, + "learning_rate": 0.0002, + "loss": 2.3386, + "step": 250 + }, + { + "epoch": 0.019374068554396422, + "grad_norm": 1.3647948503494263, + "learning_rate": 0.0002, + "loss": 2.5081, + "step": 260 + }, + { + "epoch": 0.020119225037257823, + "grad_norm": 1.237382173538208, + "learning_rate": 0.0002, + "loss": 2.4055, + "step": 270 + }, + { + "epoch": 0.020864381520119227, + "grad_norm": 1.3185752630233765, + "learning_rate": 0.0002, + "loss": 2.5449, + "step": 280 + }, + { + "epoch": 0.021609538002980627, + "grad_norm": 1.454095721244812, + "learning_rate": 0.0002, + "loss": 2.5499, + "step": 290 + }, + { + "epoch": 0.022354694485842028, + "grad_norm": 1.4591819047927856, + "learning_rate": 0.0002, + "loss": 2.4029, + "step": 300 + }, + { + "epoch": 0.023099850968703428, + "grad_norm": 1.3012772798538208, + "learning_rate": 0.0002, + "loss": 2.3855, + "step": 310 + }, + { + "epoch": 0.02384500745156483, + "grad_norm": 1.326201319694519, + "learning_rate": 0.0002, + "loss": 2.6307, + "step": 320 + }, + { + "epoch": 0.02459016393442623, + "grad_norm": 1.3953168392181396, + "learning_rate": 0.0002, + "loss": 2.6101, + "step": 330 + }, + { + "epoch": 0.02533532041728763, + "grad_norm": 1.5380605459213257, + "learning_rate": 0.0002, + "loss": 2.4536, + "step": 340 + }, + { + "epoch": 0.02608047690014903, + "grad_norm": 1.5608468055725098, + "learning_rate": 0.0002, + "loss": 2.1291, + "step": 350 + }, + { + "epoch": 0.026825633383010434, + "grad_norm": 1.316112756729126, + "learning_rate": 0.0002, + "loss": 2.5011, + "step": 360 + }, + { + "epoch": 0.027570789865871834, + "grad_norm": 1.4802881479263306, + "learning_rate": 0.0002, + "loss": 2.4637, + "step": 370 + }, + { + "epoch": 0.028315946348733235, + "grad_norm": 1.213468074798584, + "learning_rate": 0.0002, + "loss": 2.5339, + "step": 380 + }, + { + "epoch": 0.029061102831594635, + "grad_norm": 1.3106322288513184, + "learning_rate": 0.0002, + "loss": 2.5736, + "step": 390 + }, + { + "epoch": 0.029806259314456036, + "grad_norm": 1.7102941274642944, + "learning_rate": 0.0002, + "loss": 2.4255, + "step": 400 + }, + { + "epoch": 0.030551415797317436, + "grad_norm": 2.0210812091827393, + "learning_rate": 0.0002, + "loss": 2.4833, + "step": 410 + }, + { + "epoch": 0.03129657228017884, + "grad_norm": 1.38539457321167, + "learning_rate": 0.0002, + "loss": 2.3001, + "step": 420 + }, + { + "epoch": 0.03204172876304024, + "grad_norm": 1.439780592918396, + "learning_rate": 0.0002, + "loss": 2.5553, + "step": 430 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 1.488784670829773, + "learning_rate": 0.0002, + "loss": 2.3695, + "step": 440 + }, + { + "epoch": 0.03353204172876304, + "grad_norm": 1.4806923866271973, + "learning_rate": 0.0002, + "loss": 2.4231, + "step": 450 + }, + { + "epoch": 0.03427719821162444, + "grad_norm": 1.3404014110565186, + "learning_rate": 0.0002, + "loss": 2.5017, + "step": 460 + }, + { + "epoch": 0.03502235469448584, + "grad_norm": 1.6606372594833374, + "learning_rate": 0.0002, + "loss": 2.4502, + "step": 470 + }, + { + "epoch": 0.03576751117734724, + "grad_norm": 1.6447430849075317, + "learning_rate": 0.0002, + "loss": 2.2351, + "step": 480 + }, + { + "epoch": 0.03651266766020864, + "grad_norm": 1.2761911153793335, + "learning_rate": 0.0002, + "loss": 2.3787, + "step": 490 + }, + { + "epoch": 0.037257824143070044, + "grad_norm": 1.4461835622787476, + "learning_rate": 0.0002, + "loss": 2.5047, + "step": 500 + }, + { + "epoch": 0.038002980625931444, + "grad_norm": 1.9162489175796509, + "learning_rate": 0.0002, + "loss": 2.3294, + "step": 510 + }, + { + "epoch": 0.038748137108792845, + "grad_norm": 1.6676952838897705, + "learning_rate": 0.0002, + "loss": 2.4176, + "step": 520 + }, + { + "epoch": 0.039493293591654245, + "grad_norm": 1.7947252988815308, + "learning_rate": 0.0002, + "loss": 2.3651, + "step": 530 + }, + { + "epoch": 0.040238450074515646, + "grad_norm": 1.4677958488464355, + "learning_rate": 0.0002, + "loss": 2.6263, + "step": 540 + }, + { + "epoch": 0.040983606557377046, + "grad_norm": 1.4735350608825684, + "learning_rate": 0.0002, + "loss": 2.4261, + "step": 550 + }, + { + "epoch": 0.041728763040238454, + "grad_norm": 1.4515550136566162, + "learning_rate": 0.0002, + "loss": 2.4795, + "step": 560 + }, + { + "epoch": 0.042473919523099854, + "grad_norm": 1.6137213706970215, + "learning_rate": 0.0002, + "loss": 2.5102, + "step": 570 + }, + { + "epoch": 0.043219076005961254, + "grad_norm": 1.6257590055465698, + "learning_rate": 0.0002, + "loss": 2.4218, + "step": 580 + }, + { + "epoch": 0.043964232488822655, + "grad_norm": 1.515711784362793, + "learning_rate": 0.0002, + "loss": 2.3959, + "step": 590 + }, + { + "epoch": 0.044709388971684055, + "grad_norm": 1.4846817255020142, + "learning_rate": 0.0002, + "loss": 2.4579, + "step": 600 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 1.7104601860046387, + "learning_rate": 0.0002, + "loss": 2.4271, + "step": 610 + }, + { + "epoch": 0.046199701937406856, + "grad_norm": 1.4951586723327637, + "learning_rate": 0.0002, + "loss": 2.4518, + "step": 620 + }, + { + "epoch": 0.04694485842026826, + "grad_norm": 1.4925544261932373, + "learning_rate": 0.0002, + "loss": 2.4987, + "step": 630 + }, + { + "epoch": 0.04769001490312966, + "grad_norm": 1.4516690969467163, + "learning_rate": 0.0002, + "loss": 2.4968, + "step": 640 + }, + { + "epoch": 0.04843517138599106, + "grad_norm": 1.6420326232910156, + "learning_rate": 0.0002, + "loss": 2.4214, + "step": 650 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 1.6954593658447266, + "learning_rate": 0.0002, + "loss": 2.4363, + "step": 660 + }, + { + "epoch": 0.04992548435171386, + "grad_norm": 1.6805527210235596, + "learning_rate": 0.0002, + "loss": 2.2638, + "step": 670 + }, + { + "epoch": 0.05067064083457526, + "grad_norm": 1.5352181196212769, + "learning_rate": 0.0002, + "loss": 2.3839, + "step": 680 + }, + { + "epoch": 0.05141579731743666, + "grad_norm": 1.696765661239624, + "learning_rate": 0.0002, + "loss": 2.4535, + "step": 690 + }, + { + "epoch": 0.05216095380029806, + "grad_norm": 1.4545871019363403, + "learning_rate": 0.0002, + "loss": 2.3242, + "step": 700 + }, + { + "epoch": 0.05290611028315946, + "grad_norm": 1.4923096895217896, + "learning_rate": 0.0002, + "loss": 2.4713, + "step": 710 + }, + { + "epoch": 0.05365126676602087, + "grad_norm": 1.483655571937561, + "learning_rate": 0.0002, + "loss": 2.4, + "step": 720 + }, + { + "epoch": 0.05439642324888227, + "grad_norm": 1.5965553522109985, + "learning_rate": 0.0002, + "loss": 2.5278, + "step": 730 + }, + { + "epoch": 0.05514157973174367, + "grad_norm": 1.788888931274414, + "learning_rate": 0.0002, + "loss": 2.5566, + "step": 740 + }, + { + "epoch": 0.05588673621460507, + "grad_norm": 1.612257957458496, + "learning_rate": 0.0002, + "loss": 2.4796, + "step": 750 + }, + { + "epoch": 0.05663189269746647, + "grad_norm": 1.4303230047225952, + "learning_rate": 0.0002, + "loss": 2.4086, + "step": 760 + }, + { + "epoch": 0.05737704918032787, + "grad_norm": 1.534578800201416, + "learning_rate": 0.0002, + "loss": 2.6292, + "step": 770 + }, + { + "epoch": 0.05812220566318927, + "grad_norm": 1.4243452548980713, + "learning_rate": 0.0002, + "loss": 2.5726, + "step": 780 + }, + { + "epoch": 0.05886736214605067, + "grad_norm": 1.3264392614364624, + "learning_rate": 0.0002, + "loss": 2.552, + "step": 790 + }, + { + "epoch": 0.05961251862891207, + "grad_norm": 1.251664400100708, + "learning_rate": 0.0002, + "loss": 2.2833, + "step": 800 + }, + { + "epoch": 0.06035767511177347, + "grad_norm": 1.5511835813522339, + "learning_rate": 0.0002, + "loss": 2.5349, + "step": 810 + }, + { + "epoch": 0.06110283159463487, + "grad_norm": 1.3495992422103882, + "learning_rate": 0.0002, + "loss": 2.2091, + "step": 820 + }, + { + "epoch": 0.06184798807749627, + "grad_norm": 1.343342661857605, + "learning_rate": 0.0002, + "loss": 2.6122, + "step": 830 + }, + { + "epoch": 0.06259314456035768, + "grad_norm": 1.568200707435608, + "learning_rate": 0.0002, + "loss": 2.3481, + "step": 840 + }, + { + "epoch": 0.06333830104321908, + "grad_norm": 1.6847620010375977, + "learning_rate": 0.0002, + "loss": 2.4552, + "step": 850 + }, + { + "epoch": 0.06408345752608048, + "grad_norm": 2.0226731300354004, + "learning_rate": 0.0002, + "loss": 2.4509, + "step": 860 + }, + { + "epoch": 0.06482861400894188, + "grad_norm": 1.8059664964675903, + "learning_rate": 0.0002, + "loss": 2.4808, + "step": 870 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 1.5953114032745361, + "learning_rate": 0.0002, + "loss": 2.5304, + "step": 880 + }, + { + "epoch": 0.06631892697466468, + "grad_norm": 1.7731475830078125, + "learning_rate": 0.0002, + "loss": 2.258, + "step": 890 + }, + { + "epoch": 0.06706408345752608, + "grad_norm": 1.8624380826950073, + "learning_rate": 0.0002, + "loss": 2.3362, + "step": 900 + }, + { + "epoch": 0.06780923994038748, + "grad_norm": 1.6222556829452515, + "learning_rate": 0.0002, + "loss": 2.4066, + "step": 910 + }, + { + "epoch": 0.06855439642324888, + "grad_norm": 1.7085922956466675, + "learning_rate": 0.0002, + "loss": 2.5117, + "step": 920 + }, + { + "epoch": 0.06929955290611028, + "grad_norm": 1.6273415088653564, + "learning_rate": 0.0002, + "loss": 2.5034, + "step": 930 + }, + { + "epoch": 0.07004470938897168, + "grad_norm": 1.5758957862854004, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 940 + }, + { + "epoch": 0.07078986587183309, + "grad_norm": 1.7586950063705444, + "learning_rate": 0.0002, + "loss": 2.6029, + "step": 950 + }, + { + "epoch": 0.07153502235469449, + "grad_norm": 1.7354320287704468, + "learning_rate": 0.0002, + "loss": 2.4572, + "step": 960 + }, + { + "epoch": 0.07228017883755589, + "grad_norm": 1.440529704093933, + "learning_rate": 0.0002, + "loss": 2.4439, + "step": 970 + }, + { + "epoch": 0.07302533532041729, + "grad_norm": 1.4725509881973267, + "learning_rate": 0.0002, + "loss": 2.4279, + "step": 980 + }, + { + "epoch": 0.07377049180327869, + "grad_norm": 1.6604877710342407, + "learning_rate": 0.0002, + "loss": 2.4856, + "step": 990 + }, + { + "epoch": 0.07451564828614009, + "grad_norm": 1.5507324934005737, + "learning_rate": 0.0002, + "loss": 2.5235, + "step": 1000 + }, + { + "epoch": 0.07526080476900149, + "grad_norm": 1.6070083379745483, + "learning_rate": 0.0002, + "loss": 2.3157, + "step": 1010 + }, + { + "epoch": 0.07600596125186289, + "grad_norm": 1.5939691066741943, + "learning_rate": 0.0002, + "loss": 2.5106, + "step": 1020 + }, + { + "epoch": 0.07675111773472429, + "grad_norm": 1.68025803565979, + "learning_rate": 0.0002, + "loss": 2.4918, + "step": 1030 + }, + { + "epoch": 0.07749627421758569, + "grad_norm": 1.6039890050888062, + "learning_rate": 0.0002, + "loss": 2.5583, + "step": 1040 + }, + { + "epoch": 0.07824143070044709, + "grad_norm": 1.8732694387435913, + "learning_rate": 0.0002, + "loss": 2.2141, + "step": 1050 + }, + { + "epoch": 0.07898658718330849, + "grad_norm": 1.4604535102844238, + "learning_rate": 0.0002, + "loss": 2.5011, + "step": 1060 + }, + { + "epoch": 0.07973174366616989, + "grad_norm": 1.5717531442642212, + "learning_rate": 0.0002, + "loss": 2.6481, + "step": 1070 + }, + { + "epoch": 0.08047690014903129, + "grad_norm": 1.6097593307495117, + "learning_rate": 0.0002, + "loss": 2.6329, + "step": 1080 + }, + { + "epoch": 0.08122205663189269, + "grad_norm": 1.6716941595077515, + "learning_rate": 0.0002, + "loss": 2.2852, + "step": 1090 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 1.4320539236068726, + "learning_rate": 0.0002, + "loss": 2.5, + "step": 1100 + }, + { + "epoch": 0.08271236959761549, + "grad_norm": 1.76832914352417, + "learning_rate": 0.0002, + "loss": 2.4243, + "step": 1110 + }, + { + "epoch": 0.08345752608047691, + "grad_norm": 1.5858458280563354, + "learning_rate": 0.0002, + "loss": 2.4635, + "step": 1120 + }, + { + "epoch": 0.08420268256333831, + "grad_norm": 1.627699851989746, + "learning_rate": 0.0002, + "loss": 2.429, + "step": 1130 + }, + { + "epoch": 0.08494783904619971, + "grad_norm": 1.723833441734314, + "learning_rate": 0.0002, + "loss": 2.5082, + "step": 1140 + }, + { + "epoch": 0.08569299552906111, + "grad_norm": 1.6531169414520264, + "learning_rate": 0.0002, + "loss": 2.3722, + "step": 1150 + }, + { + "epoch": 0.08643815201192251, + "grad_norm": 1.6412060260772705, + "learning_rate": 0.0002, + "loss": 2.4493, + "step": 1160 + }, + { + "epoch": 0.08718330849478391, + "grad_norm": 1.6279152631759644, + "learning_rate": 0.0002, + "loss": 2.6463, + "step": 1170 + }, + { + "epoch": 0.08792846497764531, + "grad_norm": 1.9763994216918945, + "learning_rate": 0.0002, + "loss": 2.5753, + "step": 1180 + }, + { + "epoch": 0.08867362146050671, + "grad_norm": 1.5985665321350098, + "learning_rate": 0.0002, + "loss": 2.6349, + "step": 1190 + }, + { + "epoch": 0.08941877794336811, + "grad_norm": 1.683590054512024, + "learning_rate": 0.0002, + "loss": 2.4348, + "step": 1200 + }, + { + "epoch": 0.09016393442622951, + "grad_norm": 1.55161452293396, + "learning_rate": 0.0002, + "loss": 2.2263, + "step": 1210 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.7723956108093262, + "learning_rate": 0.0002, + "loss": 2.5187, + "step": 1220 + }, + { + "epoch": 0.09165424739195231, + "grad_norm": 2.007422924041748, + "learning_rate": 0.0002, + "loss": 2.2847, + "step": 1230 + }, + { + "epoch": 0.09239940387481371, + "grad_norm": 1.6839237213134766, + "learning_rate": 0.0002, + "loss": 2.395, + "step": 1240 + }, + { + "epoch": 0.09314456035767511, + "grad_norm": 1.8041167259216309, + "learning_rate": 0.0002, + "loss": 2.4219, + "step": 1250 + }, + { + "epoch": 0.09388971684053651, + "grad_norm": 1.3533605337142944, + "learning_rate": 0.0002, + "loss": 2.1161, + "step": 1260 + }, + { + "epoch": 0.09463487332339791, + "grad_norm": 1.5705360174179077, + "learning_rate": 0.0002, + "loss": 2.4224, + "step": 1270 + }, + { + "epoch": 0.09538002980625931, + "grad_norm": 1.3010971546173096, + "learning_rate": 0.0002, + "loss": 2.7023, + "step": 1280 + }, + { + "epoch": 0.09612518628912071, + "grad_norm": 1.61898934841156, + "learning_rate": 0.0002, + "loss": 2.4894, + "step": 1290 + }, + { + "epoch": 0.09687034277198212, + "grad_norm": 1.570351243019104, + "learning_rate": 0.0002, + "loss": 2.6117, + "step": 1300 + }, + { + "epoch": 0.09761549925484352, + "grad_norm": 1.5680445432662964, + "learning_rate": 0.0002, + "loss": 2.4442, + "step": 1310 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 1.6972705125808716, + "learning_rate": 0.0002, + "loss": 2.4776, + "step": 1320 + }, + { + "epoch": 0.09910581222056632, + "grad_norm": 1.4837958812713623, + "learning_rate": 0.0002, + "loss": 2.4469, + "step": 1330 + }, + { + "epoch": 0.09985096870342772, + "grad_norm": 1.241639494895935, + "learning_rate": 0.0002, + "loss": 2.2837, + "step": 1340 + }, + { + "epoch": 0.10059612518628912, + "grad_norm": 1.6141420602798462, + "learning_rate": 0.0002, + "loss": 2.4861, + "step": 1350 + }, + { + "epoch": 0.10134128166915052, + "grad_norm": 1.3432039022445679, + "learning_rate": 0.0002, + "loss": 2.2677, + "step": 1360 + }, + { + "epoch": 0.10208643815201192, + "grad_norm": 1.4515589475631714, + "learning_rate": 0.0002, + "loss": 2.4508, + "step": 1370 + }, + { + "epoch": 0.10283159463487332, + "grad_norm": 1.6723533868789673, + "learning_rate": 0.0002, + "loss": 2.4985, + "step": 1380 + }, + { + "epoch": 0.10357675111773472, + "grad_norm": 1.6158736944198608, + "learning_rate": 0.0002, + "loss": 2.5645, + "step": 1390 + }, + { + "epoch": 0.10432190760059612, + "grad_norm": 1.7271533012390137, + "learning_rate": 0.0002, + "loss": 2.367, + "step": 1400 + }, + { + "epoch": 0.10506706408345752, + "grad_norm": 1.1721924543380737, + "learning_rate": 0.0002, + "loss": 2.3531, + "step": 1410 + }, + { + "epoch": 0.10581222056631892, + "grad_norm": 1.509311318397522, + "learning_rate": 0.0002, + "loss": 2.3562, + "step": 1420 + }, + { + "epoch": 0.10655737704918032, + "grad_norm": 1.8995124101638794, + "learning_rate": 0.0002, + "loss": 2.325, + "step": 1430 + }, + { + "epoch": 0.10730253353204174, + "grad_norm": 1.5647929906845093, + "learning_rate": 0.0002, + "loss": 2.306, + "step": 1440 + }, + { + "epoch": 0.10804769001490314, + "grad_norm": 1.8889915943145752, + "learning_rate": 0.0002, + "loss": 2.3319, + "step": 1450 + }, + { + "epoch": 0.10879284649776454, + "grad_norm": 1.7834174633026123, + "learning_rate": 0.0002, + "loss": 2.4158, + "step": 1460 + }, + { + "epoch": 0.10953800298062594, + "grad_norm": 1.7446699142456055, + "learning_rate": 0.0002, + "loss": 2.5858, + "step": 1470 + }, + { + "epoch": 0.11028315946348734, + "grad_norm": 1.6821390390396118, + "learning_rate": 0.0002, + "loss": 2.6529, + "step": 1480 + }, + { + "epoch": 0.11102831594634874, + "grad_norm": 1.7135263681411743, + "learning_rate": 0.0002, + "loss": 2.4174, + "step": 1490 + }, + { + "epoch": 0.11177347242921014, + "grad_norm": 1.6189689636230469, + "learning_rate": 0.0002, + "loss": 2.3491, + "step": 1500 + }, + { + "epoch": 0.11251862891207154, + "grad_norm": 1.9968479871749878, + "learning_rate": 0.0002, + "loss": 2.3949, + "step": 1510 + }, + { + "epoch": 0.11326378539493294, + "grad_norm": 1.7658995389938354, + "learning_rate": 0.0002, + "loss": 2.3559, + "step": 1520 + }, + { + "epoch": 0.11400894187779434, + "grad_norm": 1.5398634672164917, + "learning_rate": 0.0002, + "loss": 2.3183, + "step": 1530 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 1.558796763420105, + "learning_rate": 0.0002, + "loss": 2.5318, + "step": 1540 + }, + { + "epoch": 0.11549925484351714, + "grad_norm": 1.7153369188308716, + "learning_rate": 0.0002, + "loss": 2.42, + "step": 1550 + }, + { + "epoch": 0.11624441132637854, + "grad_norm": 1.8950843811035156, + "learning_rate": 0.0002, + "loss": 2.4338, + "step": 1560 + }, + { + "epoch": 0.11698956780923994, + "grad_norm": 1.9553101062774658, + "learning_rate": 0.0002, + "loss": 2.3172, + "step": 1570 + }, + { + "epoch": 0.11773472429210134, + "grad_norm": 2.0377368927001953, + "learning_rate": 0.0002, + "loss": 2.5465, + "step": 1580 + }, + { + "epoch": 0.11847988077496274, + "grad_norm": 1.596413254737854, + "learning_rate": 0.0002, + "loss": 2.3895, + "step": 1590 + }, + { + "epoch": 0.11922503725782414, + "grad_norm": 1.9717952013015747, + "learning_rate": 0.0002, + "loss": 2.33, + "step": 1600 + }, + { + "epoch": 0.11997019374068554, + "grad_norm": 1.597959041595459, + "learning_rate": 0.0002, + "loss": 2.4298, + "step": 1610 + }, + { + "epoch": 0.12071535022354694, + "grad_norm": 1.7834532260894775, + "learning_rate": 0.0002, + "loss": 2.4906, + "step": 1620 + }, + { + "epoch": 0.12146050670640834, + "grad_norm": 1.7709592580795288, + "learning_rate": 0.0002, + "loss": 2.4961, + "step": 1630 + }, + { + "epoch": 0.12220566318926974, + "grad_norm": 1.448915719985962, + "learning_rate": 0.0002, + "loss": 2.2579, + "step": 1640 + }, + { + "epoch": 0.12295081967213115, + "grad_norm": 1.9186158180236816, + "learning_rate": 0.0002, + "loss": 2.6403, + "step": 1650 + }, + { + "epoch": 0.12369597615499255, + "grad_norm": 1.7312026023864746, + "learning_rate": 0.0002, + "loss": 2.3614, + "step": 1660 + }, + { + "epoch": 0.12444113263785395, + "grad_norm": 1.6252959966659546, + "learning_rate": 0.0002, + "loss": 2.1939, + "step": 1670 + }, + { + "epoch": 0.12518628912071536, + "grad_norm": 1.485950231552124, + "learning_rate": 0.0002, + "loss": 2.3911, + "step": 1680 + }, + { + "epoch": 0.12593144560357675, + "grad_norm": 2.1096255779266357, + "learning_rate": 0.0002, + "loss": 2.3905, + "step": 1690 + }, + { + "epoch": 0.12667660208643816, + "grad_norm": 1.9784533977508545, + "learning_rate": 0.0002, + "loss": 2.5378, + "step": 1700 + }, + { + "epoch": 0.12742175856929955, + "grad_norm": 1.6286430358886719, + "learning_rate": 0.0002, + "loss": 2.3893, + "step": 1710 + }, + { + "epoch": 0.12816691505216096, + "grad_norm": 1.6326884031295776, + "learning_rate": 0.0002, + "loss": 2.2981, + "step": 1720 + }, + { + "epoch": 0.12891207153502235, + "grad_norm": 1.8827307224273682, + "learning_rate": 0.0002, + "loss": 2.386, + "step": 1730 + }, + { + "epoch": 0.12965722801788376, + "grad_norm": 1.5798028707504272, + "learning_rate": 0.0002, + "loss": 2.4409, + "step": 1740 + }, + { + "epoch": 0.13040238450074515, + "grad_norm": 1.701941728591919, + "learning_rate": 0.0002, + "loss": 2.621, + "step": 1750 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 2.052530527114868, + "learning_rate": 0.0002, + "loss": 2.5072, + "step": 1760 + }, + { + "epoch": 0.13189269746646795, + "grad_norm": 1.7951122522354126, + "learning_rate": 0.0002, + "loss": 2.6495, + "step": 1770 + }, + { + "epoch": 0.13263785394932937, + "grad_norm": 1.713895320892334, + "learning_rate": 0.0002, + "loss": 2.4414, + "step": 1780 + }, + { + "epoch": 0.13338301043219075, + "grad_norm": 1.7314261198043823, + "learning_rate": 0.0002, + "loss": 2.5448, + "step": 1790 + }, + { + "epoch": 0.13412816691505217, + "grad_norm": 2.169917583465576, + "learning_rate": 0.0002, + "loss": 2.3403, + "step": 1800 + }, + { + "epoch": 0.13487332339791355, + "grad_norm": 2.260791063308716, + "learning_rate": 0.0002, + "loss": 2.3851, + "step": 1810 + }, + { + "epoch": 0.13561847988077497, + "grad_norm": 1.5603179931640625, + "learning_rate": 0.0002, + "loss": 2.385, + "step": 1820 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 1.854830026626587, + "learning_rate": 0.0002, + "loss": 2.5961, + "step": 1830 + }, + { + "epoch": 0.13710879284649777, + "grad_norm": 2.0177578926086426, + "learning_rate": 0.0002, + "loss": 2.5359, + "step": 1840 + }, + { + "epoch": 0.13785394932935915, + "grad_norm": 1.6412891149520874, + "learning_rate": 0.0002, + "loss": 2.5614, + "step": 1850 + }, + { + "epoch": 0.13859910581222057, + "grad_norm": 1.498461127281189, + "learning_rate": 0.0002, + "loss": 2.4013, + "step": 1860 + }, + { + "epoch": 0.13934426229508196, + "grad_norm": 1.7153528928756714, + "learning_rate": 0.0002, + "loss": 2.4077, + "step": 1870 + }, + { + "epoch": 0.14008941877794337, + "grad_norm": 1.8465684652328491, + "learning_rate": 0.0002, + "loss": 2.4389, + "step": 1880 + }, + { + "epoch": 0.14083457526080476, + "grad_norm": 1.687004566192627, + "learning_rate": 0.0002, + "loss": 2.629, + "step": 1890 + }, + { + "epoch": 0.14157973174366617, + "grad_norm": 2.4556238651275635, + "learning_rate": 0.0002, + "loss": 2.4681, + "step": 1900 + }, + { + "epoch": 0.14232488822652756, + "grad_norm": 1.6830357313156128, + "learning_rate": 0.0002, + "loss": 2.5962, + "step": 1910 + }, + { + "epoch": 0.14307004470938897, + "grad_norm": 1.5959599018096924, + "learning_rate": 0.0002, + "loss": 2.3358, + "step": 1920 + }, + { + "epoch": 0.14381520119225039, + "grad_norm": 1.9852540493011475, + "learning_rate": 0.0002, + "loss": 2.4023, + "step": 1930 + }, + { + "epoch": 0.14456035767511177, + "grad_norm": 1.334253191947937, + "learning_rate": 0.0002, + "loss": 2.3208, + "step": 1940 + }, + { + "epoch": 0.1453055141579732, + "grad_norm": 1.7445403337478638, + "learning_rate": 0.0002, + "loss": 2.7472, + "step": 1950 + }, + { + "epoch": 0.14605067064083457, + "grad_norm": 2.1274497509002686, + "learning_rate": 0.0002, + "loss": 2.4091, + "step": 1960 + }, + { + "epoch": 0.146795827123696, + "grad_norm": 1.9186792373657227, + "learning_rate": 0.0002, + "loss": 2.5525, + "step": 1970 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 1.7423287630081177, + "learning_rate": 0.0002, + "loss": 2.4312, + "step": 1980 + }, + { + "epoch": 0.1482861400894188, + "grad_norm": 1.6370166540145874, + "learning_rate": 0.0002, + "loss": 2.6371, + "step": 1990 + }, + { + "epoch": 0.14903129657228018, + "grad_norm": 1.812752366065979, + "learning_rate": 0.0002, + "loss": 2.4224, + "step": 2000 + }, + { + "epoch": 0.1497764530551416, + "grad_norm": 1.6510322093963623, + "learning_rate": 0.0002, + "loss": 2.4596, + "step": 2010 + }, + { + "epoch": 0.15052160953800298, + "grad_norm": 1.7658458948135376, + "learning_rate": 0.0002, + "loss": 2.4772, + "step": 2020 + }, + { + "epoch": 0.1512667660208644, + "grad_norm": 1.8121706247329712, + "learning_rate": 0.0002, + "loss": 2.3523, + "step": 2030 + }, + { + "epoch": 0.15201192250372578, + "grad_norm": 1.7872707843780518, + "learning_rate": 0.0002, + "loss": 2.4009, + "step": 2040 + }, + { + "epoch": 0.1527570789865872, + "grad_norm": 1.8161529302597046, + "learning_rate": 0.0002, + "loss": 2.2658, + "step": 2050 + }, + { + "epoch": 0.15350223546944858, + "grad_norm": 3.1155641078948975, + "learning_rate": 0.0002, + "loss": 2.3572, + "step": 2060 + }, + { + "epoch": 0.15424739195231, + "grad_norm": 3.371666193008423, + "learning_rate": 0.0002, + "loss": 2.5711, + "step": 2070 + }, + { + "epoch": 0.15499254843517138, + "grad_norm": 2.1471800804138184, + "learning_rate": 0.0002, + "loss": 2.4525, + "step": 2080 + }, + { + "epoch": 0.1557377049180328, + "grad_norm": 1.849825382232666, + "learning_rate": 0.0002, + "loss": 2.472, + "step": 2090 + }, + { + "epoch": 0.15648286140089418, + "grad_norm": 1.9633269309997559, + "learning_rate": 0.0002, + "loss": 2.5328, + "step": 2100 + }, + { + "epoch": 0.1572280178837556, + "grad_norm": 2.1593234539031982, + "learning_rate": 0.0002, + "loss": 2.5724, + "step": 2110 + }, + { + "epoch": 0.15797317436661698, + "grad_norm": 1.8166416883468628, + "learning_rate": 0.0002, + "loss": 2.5599, + "step": 2120 + }, + { + "epoch": 0.1587183308494784, + "grad_norm": 2.0000736713409424, + "learning_rate": 0.0002, + "loss": 2.5174, + "step": 2130 + }, + { + "epoch": 0.15946348733233978, + "grad_norm": 2.016064167022705, + "learning_rate": 0.0002, + "loss": 2.4799, + "step": 2140 + }, + { + "epoch": 0.1602086438152012, + "grad_norm": 2.289914846420288, + "learning_rate": 0.0002, + "loss": 2.4579, + "step": 2150 + }, + { + "epoch": 0.16095380029806258, + "grad_norm": 2.0967648029327393, + "learning_rate": 0.0002, + "loss": 2.3974, + "step": 2160 + }, + { + "epoch": 0.161698956780924, + "grad_norm": 1.9680815935134888, + "learning_rate": 0.0002, + "loss": 2.5335, + "step": 2170 + }, + { + "epoch": 0.16244411326378538, + "grad_norm": 1.659155011177063, + "learning_rate": 0.0002, + "loss": 2.2785, + "step": 2180 + }, + { + "epoch": 0.1631892697466468, + "grad_norm": 1.719580054283142, + "learning_rate": 0.0002, + "loss": 2.5193, + "step": 2190 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 1.4163504838943481, + "learning_rate": 0.0002, + "loss": 2.4477, + "step": 2200 + }, + { + "epoch": 0.1646795827123696, + "grad_norm": 1.4476326704025269, + "learning_rate": 0.0002, + "loss": 2.3973, + "step": 2210 + }, + { + "epoch": 0.16542473919523099, + "grad_norm": 1.7087070941925049, + "learning_rate": 0.0002, + "loss": 2.5413, + "step": 2220 + }, + { + "epoch": 0.1661698956780924, + "grad_norm": 1.813745379447937, + "learning_rate": 0.0002, + "loss": 2.3435, + "step": 2230 + }, + { + "epoch": 0.16691505216095381, + "grad_norm": 2.2703053951263428, + "learning_rate": 0.0002, + "loss": 2.5846, + "step": 2240 + }, + { + "epoch": 0.1676602086438152, + "grad_norm": 1.743831753730774, + "learning_rate": 0.0002, + "loss": 2.2399, + "step": 2250 + }, + { + "epoch": 0.16840536512667661, + "grad_norm": 1.7628560066223145, + "learning_rate": 0.0002, + "loss": 2.5024, + "step": 2260 + }, + { + "epoch": 0.169150521609538, + "grad_norm": 2.0905685424804688, + "learning_rate": 0.0002, + "loss": 2.3357, + "step": 2270 + }, + { + "epoch": 0.16989567809239942, + "grad_norm": 1.7532408237457275, + "learning_rate": 0.0002, + "loss": 2.4824, + "step": 2280 + }, + { + "epoch": 0.1706408345752608, + "grad_norm": 1.749121069908142, + "learning_rate": 0.0002, + "loss": 2.3893, + "step": 2290 + }, + { + "epoch": 0.17138599105812222, + "grad_norm": 1.529807448387146, + "learning_rate": 0.0002, + "loss": 2.3581, + "step": 2300 + }, + { + "epoch": 0.1721311475409836, + "grad_norm": 1.8684520721435547, + "learning_rate": 0.0002, + "loss": 2.3585, + "step": 2310 + }, + { + "epoch": 0.17287630402384502, + "grad_norm": 1.7635807991027832, + "learning_rate": 0.0002, + "loss": 2.3862, + "step": 2320 + }, + { + "epoch": 0.1736214605067064, + "grad_norm": 1.6727739572525024, + "learning_rate": 0.0002, + "loss": 2.5992, + "step": 2330 + }, + { + "epoch": 0.17436661698956782, + "grad_norm": 1.7773075103759766, + "learning_rate": 0.0002, + "loss": 2.4468, + "step": 2340 + }, + { + "epoch": 0.1751117734724292, + "grad_norm": 2.184798002243042, + "learning_rate": 0.0002, + "loss": 2.4697, + "step": 2350 + }, + { + "epoch": 0.17585692995529062, + "grad_norm": 1.7383967638015747, + "learning_rate": 0.0002, + "loss": 2.4937, + "step": 2360 + }, + { + "epoch": 0.176602086438152, + "grad_norm": 2.211831569671631, + "learning_rate": 0.0002, + "loss": 2.471, + "step": 2370 + }, + { + "epoch": 0.17734724292101342, + "grad_norm": 1.5768284797668457, + "learning_rate": 0.0002, + "loss": 2.4485, + "step": 2380 + }, + { + "epoch": 0.1780923994038748, + "grad_norm": 1.7147942781448364, + "learning_rate": 0.0002, + "loss": 2.3187, + "step": 2390 + }, + { + "epoch": 0.17883755588673622, + "grad_norm": 1.5341167449951172, + "learning_rate": 0.0002, + "loss": 2.3842, + "step": 2400 + }, + { + "epoch": 0.1795827123695976, + "grad_norm": 1.8212217092514038, + "learning_rate": 0.0002, + "loss": 2.2845, + "step": 2410 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 1.5502907037734985, + "learning_rate": 0.0002, + "loss": 2.4956, + "step": 2420 + }, + { + "epoch": 0.1810730253353204, + "grad_norm": 1.8575736284255981, + "learning_rate": 0.0002, + "loss": 2.4574, + "step": 2430 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.594504475593567, + "learning_rate": 0.0002, + "loss": 2.4808, + "step": 2440 + }, + { + "epoch": 0.1825633383010432, + "grad_norm": 1.864490032196045, + "learning_rate": 0.0002, + "loss": 2.6226, + "step": 2450 + }, + { + "epoch": 0.18330849478390462, + "grad_norm": 1.9114779233932495, + "learning_rate": 0.0002, + "loss": 2.5032, + "step": 2460 + }, + { + "epoch": 0.184053651266766, + "grad_norm": 2.119196653366089, + "learning_rate": 0.0002, + "loss": 2.4699, + "step": 2470 + }, + { + "epoch": 0.18479880774962743, + "grad_norm": 1.8797602653503418, + "learning_rate": 0.0002, + "loss": 2.4705, + "step": 2480 + }, + { + "epoch": 0.1855439642324888, + "grad_norm": 1.985633134841919, + "learning_rate": 0.0002, + "loss": 2.4216, + "step": 2490 + }, + { + "epoch": 0.18628912071535023, + "grad_norm": 1.831678032875061, + "learning_rate": 0.0002, + "loss": 2.538, + "step": 2500 + }, + { + "epoch": 0.1870342771982116, + "grad_norm": 1.7474476099014282, + "learning_rate": 0.0002, + "loss": 2.6348, + "step": 2510 + }, + { + "epoch": 0.18777943368107303, + "grad_norm": 1.7333717346191406, + "learning_rate": 0.0002, + "loss": 2.4576, + "step": 2520 + }, + { + "epoch": 0.1885245901639344, + "grad_norm": 2.0141854286193848, + "learning_rate": 0.0002, + "loss": 2.4277, + "step": 2530 + }, + { + "epoch": 0.18926974664679583, + "grad_norm": 1.7154362201690674, + "learning_rate": 0.0002, + "loss": 2.428, + "step": 2540 + }, + { + "epoch": 0.19001490312965721, + "grad_norm": 2.11297607421875, + "learning_rate": 0.0002, + "loss": 2.4868, + "step": 2550 + }, + { + "epoch": 0.19076005961251863, + "grad_norm": 1.8067042827606201, + "learning_rate": 0.0002, + "loss": 2.4782, + "step": 2560 + }, + { + "epoch": 0.19150521609538004, + "grad_norm": 2.3354551792144775, + "learning_rate": 0.0002, + "loss": 2.5139, + "step": 2570 + }, + { + "epoch": 0.19225037257824143, + "grad_norm": 1.7424496412277222, + "learning_rate": 0.0002, + "loss": 2.4437, + "step": 2580 + }, + { + "epoch": 0.19299552906110284, + "grad_norm": 2.1761672496795654, + "learning_rate": 0.0002, + "loss": 2.4239, + "step": 2590 + }, + { + "epoch": 0.19374068554396423, + "grad_norm": 2.0857791900634766, + "learning_rate": 0.0002, + "loss": 2.2968, + "step": 2600 + }, + { + "epoch": 0.19448584202682564, + "grad_norm": 2.1806142330169678, + "learning_rate": 0.0002, + "loss": 2.571, + "step": 2610 + }, + { + "epoch": 0.19523099850968703, + "grad_norm": 1.7046921253204346, + "learning_rate": 0.0002, + "loss": 2.5633, + "step": 2620 + }, + { + "epoch": 0.19597615499254845, + "grad_norm": 1.876073956489563, + "learning_rate": 0.0002, + "loss": 2.4484, + "step": 2630 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 1.948696255683899, + "learning_rate": 0.0002, + "loss": 2.4158, + "step": 2640 + }, + { + "epoch": 0.19746646795827125, + "grad_norm": 2.022000551223755, + "learning_rate": 0.0002, + "loss": 2.4718, + "step": 2650 + }, + { + "epoch": 0.19821162444113263, + "grad_norm": 1.776353120803833, + "learning_rate": 0.0002, + "loss": 2.5832, + "step": 2660 + }, + { + "epoch": 0.19895678092399405, + "grad_norm": 1.9837231636047363, + "learning_rate": 0.0002, + "loss": 2.5152, + "step": 2670 + }, + { + "epoch": 0.19970193740685543, + "grad_norm": 1.8511583805084229, + "learning_rate": 0.0002, + "loss": 2.3081, + "step": 2680 + }, + { + "epoch": 0.20044709388971685, + "grad_norm": 1.7415555715560913, + "learning_rate": 0.0002, + "loss": 2.3807, + "step": 2690 + }, + { + "epoch": 0.20119225037257824, + "grad_norm": 1.9808768033981323, + "learning_rate": 0.0002, + "loss": 2.4238, + "step": 2700 + }, + { + "epoch": 0.20193740685543965, + "grad_norm": 1.6676552295684814, + "learning_rate": 0.0002, + "loss": 2.4074, + "step": 2710 + }, + { + "epoch": 0.20268256333830104, + "grad_norm": 2.046172857284546, + "learning_rate": 0.0002, + "loss": 2.5146, + "step": 2720 + }, + { + "epoch": 0.20342771982116245, + "grad_norm": 2.0430774688720703, + "learning_rate": 0.0002, + "loss": 2.348, + "step": 2730 + }, + { + "epoch": 0.20417287630402384, + "grad_norm": 1.978427767753601, + "learning_rate": 0.0002, + "loss": 2.4703, + "step": 2740 + }, + { + "epoch": 0.20491803278688525, + "grad_norm": 2.2341721057891846, + "learning_rate": 0.0002, + "loss": 2.424, + "step": 2750 + }, + { + "epoch": 0.20566318926974664, + "grad_norm": 1.874298095703125, + "learning_rate": 0.0002, + "loss": 2.5449, + "step": 2760 + }, + { + "epoch": 0.20640834575260805, + "grad_norm": 2.0746755599975586, + "learning_rate": 0.0002, + "loss": 2.4735, + "step": 2770 + }, + { + "epoch": 0.20715350223546944, + "grad_norm": 1.671237587928772, + "learning_rate": 0.0002, + "loss": 2.4185, + "step": 2780 + }, + { + "epoch": 0.20789865871833085, + "grad_norm": 1.7436130046844482, + "learning_rate": 0.0002, + "loss": 2.5611, + "step": 2790 + }, + { + "epoch": 0.20864381520119224, + "grad_norm": 1.991050124168396, + "learning_rate": 0.0002, + "loss": 2.5108, + "step": 2800 + }, + { + "epoch": 0.20938897168405365, + "grad_norm": 1.893971562385559, + "learning_rate": 0.0002, + "loss": 2.4371, + "step": 2810 + }, + { + "epoch": 0.21013412816691504, + "grad_norm": 2.048959970474243, + "learning_rate": 0.0002, + "loss": 2.4887, + "step": 2820 + }, + { + "epoch": 0.21087928464977646, + "grad_norm": 1.4141104221343994, + "learning_rate": 0.0002, + "loss": 2.3666, + "step": 2830 + }, + { + "epoch": 0.21162444113263784, + "grad_norm": 1.9136624336242676, + "learning_rate": 0.0002, + "loss": 2.5108, + "step": 2840 + }, + { + "epoch": 0.21236959761549926, + "grad_norm": 1.8413383960723877, + "learning_rate": 0.0002, + "loss": 2.5956, + "step": 2850 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 1.8322744369506836, + "learning_rate": 0.0002, + "loss": 2.2766, + "step": 2860 + }, + { + "epoch": 0.21385991058122206, + "grad_norm": 2.0261011123657227, + "learning_rate": 0.0002, + "loss": 2.5494, + "step": 2870 + }, + { + "epoch": 0.21460506706408347, + "grad_norm": 1.7044886350631714, + "learning_rate": 0.0002, + "loss": 2.453, + "step": 2880 + }, + { + "epoch": 0.21535022354694486, + "grad_norm": 1.8733025789260864, + "learning_rate": 0.0002, + "loss": 2.4455, + "step": 2890 + }, + { + "epoch": 0.21609538002980627, + "grad_norm": 1.7016903162002563, + "learning_rate": 0.0002, + "loss": 2.542, + "step": 2900 + }, + { + "epoch": 0.21684053651266766, + "grad_norm": 1.8775333166122437, + "learning_rate": 0.0002, + "loss": 2.5536, + "step": 2910 + }, + { + "epoch": 0.21758569299552907, + "grad_norm": 1.9875683784484863, + "learning_rate": 0.0002, + "loss": 2.365, + "step": 2920 + }, + { + "epoch": 0.21833084947839046, + "grad_norm": 1.9816802740097046, + "learning_rate": 0.0002, + "loss": 2.5781, + "step": 2930 + }, + { + "epoch": 0.21907600596125187, + "grad_norm": 2.2699484825134277, + "learning_rate": 0.0002, + "loss": 2.369, + "step": 2940 + }, + { + "epoch": 0.21982116244411326, + "grad_norm": 1.795271635055542, + "learning_rate": 0.0002, + "loss": 2.5805, + "step": 2950 + }, + { + "epoch": 0.22056631892697467, + "grad_norm": 2.1191015243530273, + "learning_rate": 0.0002, + "loss": 2.522, + "step": 2960 + }, + { + "epoch": 0.22131147540983606, + "grad_norm": 1.8573635816574097, + "learning_rate": 0.0002, + "loss": 2.5885, + "step": 2970 + }, + { + "epoch": 0.22205663189269748, + "grad_norm": 1.8743098974227905, + "learning_rate": 0.0002, + "loss": 2.4619, + "step": 2980 + }, + { + "epoch": 0.22280178837555886, + "grad_norm": 2.030251979827881, + "learning_rate": 0.0002, + "loss": 2.4544, + "step": 2990 + }, + { + "epoch": 0.22354694485842028, + "grad_norm": 1.7911770343780518, + "learning_rate": 0.0002, + "loss": 2.3729, + "step": 3000 + }, + { + "epoch": 0.22429210134128166, + "grad_norm": 1.7852712869644165, + "learning_rate": 0.0002, + "loss": 2.411, + "step": 3010 + }, + { + "epoch": 0.22503725782414308, + "grad_norm": 2.3478708267211914, + "learning_rate": 0.0002, + "loss": 2.4913, + "step": 3020 + }, + { + "epoch": 0.22578241430700446, + "grad_norm": 1.9944161176681519, + "learning_rate": 0.0002, + "loss": 2.5399, + "step": 3030 + }, + { + "epoch": 0.22652757078986588, + "grad_norm": 2.0674550533294678, + "learning_rate": 0.0002, + "loss": 2.4142, + "step": 3040 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 1.7529555559158325, + "learning_rate": 0.0002, + "loss": 2.4124, + "step": 3050 + }, + { + "epoch": 0.22801788375558868, + "grad_norm": 1.950682520866394, + "learning_rate": 0.0002, + "loss": 2.5125, + "step": 3060 + }, + { + "epoch": 0.22876304023845007, + "grad_norm": 1.9079375267028809, + "learning_rate": 0.0002, + "loss": 2.2691, + "step": 3070 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 2.3478922843933105, + "learning_rate": 0.0002, + "loss": 2.6288, + "step": 3080 + }, + { + "epoch": 0.23025335320417287, + "grad_norm": 1.7229981422424316, + "learning_rate": 0.0002, + "loss": 2.4475, + "step": 3090 + }, + { + "epoch": 0.23099850968703428, + "grad_norm": 1.970567226409912, + "learning_rate": 0.0002, + "loss": 2.6176, + "step": 3100 + }, + { + "epoch": 0.23174366616989567, + "grad_norm": 2.1065728664398193, + "learning_rate": 0.0002, + "loss": 2.5292, + "step": 3110 + }, + { + "epoch": 0.23248882265275708, + "grad_norm": 2.137432336807251, + "learning_rate": 0.0002, + "loss": 2.5971, + "step": 3120 + }, + { + "epoch": 0.23323397913561847, + "grad_norm": 1.9328407049179077, + "learning_rate": 0.0002, + "loss": 2.4158, + "step": 3130 + }, + { + "epoch": 0.23397913561847988, + "grad_norm": 1.981392741203308, + "learning_rate": 0.0002, + "loss": 2.524, + "step": 3140 + }, + { + "epoch": 0.23472429210134127, + "grad_norm": 2.098461866378784, + "learning_rate": 0.0002, + "loss": 2.4319, + "step": 3150 + }, + { + "epoch": 0.23546944858420268, + "grad_norm": 2.2548506259918213, + "learning_rate": 0.0002, + "loss": 2.608, + "step": 3160 + }, + { + "epoch": 0.23621460506706407, + "grad_norm": 2.1737749576568604, + "learning_rate": 0.0002, + "loss": 2.4759, + "step": 3170 + }, + { + "epoch": 0.23695976154992549, + "grad_norm": 2.0309979915618896, + "learning_rate": 0.0002, + "loss": 2.3443, + "step": 3180 + }, + { + "epoch": 0.23770491803278687, + "grad_norm": 2.115638494491577, + "learning_rate": 0.0002, + "loss": 2.5721, + "step": 3190 + }, + { + "epoch": 0.23845007451564829, + "grad_norm": 2.2513480186462402, + "learning_rate": 0.0002, + "loss": 2.4294, + "step": 3200 + }, + { + "epoch": 0.2391952309985097, + "grad_norm": 1.984811544418335, + "learning_rate": 0.0002, + "loss": 2.5133, + "step": 3210 + }, + { + "epoch": 0.2399403874813711, + "grad_norm": 1.9788024425506592, + "learning_rate": 0.0002, + "loss": 2.3574, + "step": 3220 + }, + { + "epoch": 0.2406855439642325, + "grad_norm": 1.8824642896652222, + "learning_rate": 0.0002, + "loss": 2.3478, + "step": 3230 + }, + { + "epoch": 0.2414307004470939, + "grad_norm": 1.9927945137023926, + "learning_rate": 0.0002, + "loss": 2.6732, + "step": 3240 + }, + { + "epoch": 0.2421758569299553, + "grad_norm": 1.7224589586257935, + "learning_rate": 0.0002, + "loss": 2.5057, + "step": 3250 + }, + { + "epoch": 0.2429210134128167, + "grad_norm": 1.787179708480835, + "learning_rate": 0.0002, + "loss": 2.4139, + "step": 3260 + }, + { + "epoch": 0.2436661698956781, + "grad_norm": 1.9321664571762085, + "learning_rate": 0.0002, + "loss": 2.5715, + "step": 3270 + }, + { + "epoch": 0.2444113263785395, + "grad_norm": 1.9817686080932617, + "learning_rate": 0.0002, + "loss": 2.5466, + "step": 3280 + }, + { + "epoch": 0.2451564828614009, + "grad_norm": 1.8614225387573242, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 3290 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 1.6355311870574951, + "learning_rate": 0.0002, + "loss": 2.3833, + "step": 3300 + }, + { + "epoch": 0.2466467958271237, + "grad_norm": 1.9222790002822876, + "learning_rate": 0.0002, + "loss": 2.3884, + "step": 3310 + }, + { + "epoch": 0.2473919523099851, + "grad_norm": 1.8250503540039062, + "learning_rate": 0.0002, + "loss": 2.4804, + "step": 3320 + }, + { + "epoch": 0.2481371087928465, + "grad_norm": 1.9070334434509277, + "learning_rate": 0.0002, + "loss": 2.4982, + "step": 3330 + }, + { + "epoch": 0.2488822652757079, + "grad_norm": 2.0816781520843506, + "learning_rate": 0.0002, + "loss": 2.188, + "step": 3340 + }, + { + "epoch": 0.2496274217585693, + "grad_norm": 2.8250019550323486, + "learning_rate": 0.0002, + "loss": 2.1956, + "step": 3350 + }, + { + "epoch": 0.2503725782414307, + "grad_norm": 2.0118353366851807, + "learning_rate": 0.0002, + "loss": 2.3953, + "step": 3360 + }, + { + "epoch": 0.2511177347242921, + "grad_norm": 1.9895654916763306, + "learning_rate": 0.0002, + "loss": 2.4261, + "step": 3370 + }, + { + "epoch": 0.2518628912071535, + "grad_norm": 1.6906330585479736, + "learning_rate": 0.0002, + "loss": 2.3543, + "step": 3380 + }, + { + "epoch": 0.2526080476900149, + "grad_norm": 1.8363487720489502, + "learning_rate": 0.0002, + "loss": 2.6099, + "step": 3390 + }, + { + "epoch": 0.2533532041728763, + "grad_norm": 1.8634984493255615, + "learning_rate": 0.0002, + "loss": 2.4341, + "step": 3400 + }, + { + "epoch": 0.2540983606557377, + "grad_norm": 1.7584588527679443, + "learning_rate": 0.0002, + "loss": 2.3242, + "step": 3410 + }, + { + "epoch": 0.2548435171385991, + "grad_norm": 1.542602300643921, + "learning_rate": 0.0002, + "loss": 2.5926, + "step": 3420 + }, + { + "epoch": 0.2555886736214605, + "grad_norm": 1.7898099422454834, + "learning_rate": 0.0002, + "loss": 2.5387, + "step": 3430 + }, + { + "epoch": 0.2563338301043219, + "grad_norm": 1.9139480590820312, + "learning_rate": 0.0002, + "loss": 2.1946, + "step": 3440 + }, + { + "epoch": 0.2570789865871833, + "grad_norm": 1.7210464477539062, + "learning_rate": 0.0002, + "loss": 2.6303, + "step": 3450 + }, + { + "epoch": 0.2578241430700447, + "grad_norm": 2.010288953781128, + "learning_rate": 0.0002, + "loss": 2.6645, + "step": 3460 + }, + { + "epoch": 0.2585692995529061, + "grad_norm": 2.2922821044921875, + "learning_rate": 0.0002, + "loss": 2.353, + "step": 3470 + }, + { + "epoch": 0.2593144560357675, + "grad_norm": 1.7492296695709229, + "learning_rate": 0.0002, + "loss": 2.5581, + "step": 3480 + }, + { + "epoch": 0.2600596125186289, + "grad_norm": 1.845924735069275, + "learning_rate": 0.0002, + "loss": 2.4135, + "step": 3490 + }, + { + "epoch": 0.2608047690014903, + "grad_norm": 1.926946759223938, + "learning_rate": 0.0002, + "loss": 2.5024, + "step": 3500 + }, + { + "epoch": 0.2615499254843517, + "grad_norm": 2.1195075511932373, + "learning_rate": 0.0002, + "loss": 2.663, + "step": 3510 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 1.7661279439926147, + "learning_rate": 0.0002, + "loss": 2.5196, + "step": 3520 + }, + { + "epoch": 0.2630402384500745, + "grad_norm": 2.357307195663452, + "learning_rate": 0.0002, + "loss": 2.3874, + "step": 3530 + }, + { + "epoch": 0.2637853949329359, + "grad_norm": 1.9379281997680664, + "learning_rate": 0.0002, + "loss": 2.5646, + "step": 3540 + }, + { + "epoch": 0.26453055141579734, + "grad_norm": 1.89481782913208, + "learning_rate": 0.0002, + "loss": 2.5963, + "step": 3550 + }, + { + "epoch": 0.26527570789865873, + "grad_norm": 1.9536447525024414, + "learning_rate": 0.0002, + "loss": 2.2617, + "step": 3560 + }, + { + "epoch": 0.2660208643815201, + "grad_norm": 1.9169467687606812, + "learning_rate": 0.0002, + "loss": 2.4662, + "step": 3570 + }, + { + "epoch": 0.2667660208643815, + "grad_norm": 2.1301636695861816, + "learning_rate": 0.0002, + "loss": 2.4761, + "step": 3580 + }, + { + "epoch": 0.26751117734724295, + "grad_norm": 1.9134154319763184, + "learning_rate": 0.0002, + "loss": 2.5022, + "step": 3590 + }, + { + "epoch": 0.26825633383010433, + "grad_norm": 2.04421067237854, + "learning_rate": 0.0002, + "loss": 2.3228, + "step": 3600 + }, + { + "epoch": 0.2690014903129657, + "grad_norm": 1.7705532312393188, + "learning_rate": 0.0002, + "loss": 2.4066, + "step": 3610 + }, + { + "epoch": 0.2697466467958271, + "grad_norm": 2.080298900604248, + "learning_rate": 0.0002, + "loss": 2.6129, + "step": 3620 + }, + { + "epoch": 0.27049180327868855, + "grad_norm": 1.6139470338821411, + "learning_rate": 0.0002, + "loss": 2.5801, + "step": 3630 + }, + { + "epoch": 0.27123695976154993, + "grad_norm": 2.054302453994751, + "learning_rate": 0.0002, + "loss": 2.3711, + "step": 3640 + }, + { + "epoch": 0.2719821162444113, + "grad_norm": 2.338289260864258, + "learning_rate": 0.0002, + "loss": 2.5838, + "step": 3650 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.7468085289001465, + "learning_rate": 0.0002, + "loss": 2.1876, + "step": 3660 + }, + { + "epoch": 0.27347242921013415, + "grad_norm": 2.027275562286377, + "learning_rate": 0.0002, + "loss": 2.5105, + "step": 3670 + }, + { + "epoch": 0.27421758569299554, + "grad_norm": 2.229505777359009, + "learning_rate": 0.0002, + "loss": 2.6577, + "step": 3680 + }, + { + "epoch": 0.2749627421758569, + "grad_norm": 1.9320789575576782, + "learning_rate": 0.0002, + "loss": 2.2976, + "step": 3690 + }, + { + "epoch": 0.2757078986587183, + "grad_norm": 2.186555862426758, + "learning_rate": 0.0002, + "loss": 2.3886, + "step": 3700 + }, + { + "epoch": 0.27645305514157975, + "grad_norm": 1.918982982635498, + "learning_rate": 0.0002, + "loss": 2.5599, + "step": 3710 + }, + { + "epoch": 0.27719821162444114, + "grad_norm": 1.8183304071426392, + "learning_rate": 0.0002, + "loss": 2.5157, + "step": 3720 + }, + { + "epoch": 0.2779433681073025, + "grad_norm": 1.994059443473816, + "learning_rate": 0.0002, + "loss": 2.5395, + "step": 3730 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 1.7807224988937378, + "learning_rate": 0.0002, + "loss": 2.3498, + "step": 3740 + }, + { + "epoch": 0.27943368107302535, + "grad_norm": 2.1187775135040283, + "learning_rate": 0.0002, + "loss": 2.5581, + "step": 3750 + }, + { + "epoch": 0.28017883755588674, + "grad_norm": 1.8966116905212402, + "learning_rate": 0.0002, + "loss": 2.5547, + "step": 3760 + }, + { + "epoch": 0.2809239940387481, + "grad_norm": 2.044442653656006, + "learning_rate": 0.0002, + "loss": 2.5357, + "step": 3770 + }, + { + "epoch": 0.2816691505216095, + "grad_norm": 1.8322268724441528, + "learning_rate": 0.0002, + "loss": 2.5751, + "step": 3780 + }, + { + "epoch": 0.28241430700447095, + "grad_norm": 2.1471657752990723, + "learning_rate": 0.0002, + "loss": 2.5739, + "step": 3790 + }, + { + "epoch": 0.28315946348733234, + "grad_norm": 1.54653000831604, + "learning_rate": 0.0002, + "loss": 2.4101, + "step": 3800 + }, + { + "epoch": 0.28390461997019373, + "grad_norm": 2.0441126823425293, + "learning_rate": 0.0002, + "loss": 2.4344, + "step": 3810 + }, + { + "epoch": 0.2846497764530551, + "grad_norm": 2.10286021232605, + "learning_rate": 0.0002, + "loss": 2.4168, + "step": 3820 + }, + { + "epoch": 0.28539493293591656, + "grad_norm": 2.1255736351013184, + "learning_rate": 0.0002, + "loss": 2.4448, + "step": 3830 + }, + { + "epoch": 0.28614008941877794, + "grad_norm": 1.830354928970337, + "learning_rate": 0.0002, + "loss": 2.3284, + "step": 3840 + }, + { + "epoch": 0.28688524590163933, + "grad_norm": 1.9017317295074463, + "learning_rate": 0.0002, + "loss": 2.51, + "step": 3850 + }, + { + "epoch": 0.28763040238450077, + "grad_norm": 2.0456435680389404, + "learning_rate": 0.0002, + "loss": 2.5099, + "step": 3860 + }, + { + "epoch": 0.28837555886736216, + "grad_norm": 2.036862373352051, + "learning_rate": 0.0002, + "loss": 2.418, + "step": 3870 + }, + { + "epoch": 0.28912071535022354, + "grad_norm": 1.9456264972686768, + "learning_rate": 0.0002, + "loss": 2.371, + "step": 3880 + }, + { + "epoch": 0.28986587183308493, + "grad_norm": 2.3842172622680664, + "learning_rate": 0.0002, + "loss": 2.5213, + "step": 3890 + }, + { + "epoch": 0.2906110283159464, + "grad_norm": 1.9152741432189941, + "learning_rate": 0.0002, + "loss": 2.5191, + "step": 3900 + }, + { + "epoch": 0.29135618479880776, + "grad_norm": 1.9884922504425049, + "learning_rate": 0.0002, + "loss": 2.4272, + "step": 3910 + }, + { + "epoch": 0.29210134128166915, + "grad_norm": 1.7578502893447876, + "learning_rate": 0.0002, + "loss": 2.4975, + "step": 3920 + }, + { + "epoch": 0.29284649776453053, + "grad_norm": 2.5302071571350098, + "learning_rate": 0.0002, + "loss": 2.3564, + "step": 3930 + }, + { + "epoch": 0.293591654247392, + "grad_norm": 2.2181880474090576, + "learning_rate": 0.0002, + "loss": 2.6365, + "step": 3940 + }, + { + "epoch": 0.29433681073025336, + "grad_norm": 1.8359179496765137, + "learning_rate": 0.0002, + "loss": 2.4825, + "step": 3950 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 1.8330084085464478, + "learning_rate": 0.0002, + "loss": 2.3746, + "step": 3960 + }, + { + "epoch": 0.29582712369597614, + "grad_norm": 2.0700743198394775, + "learning_rate": 0.0002, + "loss": 2.4456, + "step": 3970 + }, + { + "epoch": 0.2965722801788376, + "grad_norm": 2.0610342025756836, + "learning_rate": 0.0002, + "loss": 2.5419, + "step": 3980 + }, + { + "epoch": 0.29731743666169896, + "grad_norm": 2.0633559226989746, + "learning_rate": 0.0002, + "loss": 2.4911, + "step": 3990 + }, + { + "epoch": 0.29806259314456035, + "grad_norm": 1.894938588142395, + "learning_rate": 0.0002, + "loss": 2.5311, + "step": 4000 + }, + { + "epoch": 0.29880774962742174, + "grad_norm": 2.153447389602661, + "learning_rate": 0.0002, + "loss": 2.5135, + "step": 4010 + }, + { + "epoch": 0.2995529061102832, + "grad_norm": 1.8810901641845703, + "learning_rate": 0.0002, + "loss": 2.5123, + "step": 4020 + }, + { + "epoch": 0.30029806259314457, + "grad_norm": 2.828382730484009, + "learning_rate": 0.0002, + "loss": 2.4624, + "step": 4030 + }, + { + "epoch": 0.30104321907600595, + "grad_norm": 1.5955597162246704, + "learning_rate": 0.0002, + "loss": 2.4029, + "step": 4040 + }, + { + "epoch": 0.30178837555886734, + "grad_norm": 2.158923864364624, + "learning_rate": 0.0002, + "loss": 2.3133, + "step": 4050 + }, + { + "epoch": 0.3025335320417288, + "grad_norm": 2.044170379638672, + "learning_rate": 0.0002, + "loss": 2.5106, + "step": 4060 + }, + { + "epoch": 0.30327868852459017, + "grad_norm": 2.1641745567321777, + "learning_rate": 0.0002, + "loss": 2.3511, + "step": 4070 + }, + { + "epoch": 0.30402384500745155, + "grad_norm": 1.7815321683883667, + "learning_rate": 0.0002, + "loss": 2.47, + "step": 4080 + }, + { + "epoch": 0.30476900149031294, + "grad_norm": 1.7963491678237915, + "learning_rate": 0.0002, + "loss": 2.431, + "step": 4090 + }, + { + "epoch": 0.3055141579731744, + "grad_norm": 1.8735893964767456, + "learning_rate": 0.0002, + "loss": 2.4197, + "step": 4100 + }, + { + "epoch": 0.30625931445603577, + "grad_norm": 1.8785910606384277, + "learning_rate": 0.0002, + "loss": 2.4681, + "step": 4110 + }, + { + "epoch": 0.30700447093889716, + "grad_norm": 2.711383104324341, + "learning_rate": 0.0002, + "loss": 2.5373, + "step": 4120 + }, + { + "epoch": 0.30774962742175854, + "grad_norm": 1.837888479232788, + "learning_rate": 0.0002, + "loss": 2.4393, + "step": 4130 + }, + { + "epoch": 0.30849478390462, + "grad_norm": 2.044309616088867, + "learning_rate": 0.0002, + "loss": 2.5764, + "step": 4140 + }, + { + "epoch": 0.30923994038748137, + "grad_norm": 2.052886486053467, + "learning_rate": 0.0002, + "loss": 2.5038, + "step": 4150 + }, + { + "epoch": 0.30998509687034276, + "grad_norm": 2.0241613388061523, + "learning_rate": 0.0002, + "loss": 2.6383, + "step": 4160 + }, + { + "epoch": 0.3107302533532042, + "grad_norm": 1.9976118803024292, + "learning_rate": 0.0002, + "loss": 2.5346, + "step": 4170 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 1.7011710405349731, + "learning_rate": 0.0002, + "loss": 2.5725, + "step": 4180 + }, + { + "epoch": 0.312220566318927, + "grad_norm": 2.266010284423828, + "learning_rate": 0.0002, + "loss": 2.473, + "step": 4190 + }, + { + "epoch": 0.31296572280178836, + "grad_norm": 1.793825626373291, + "learning_rate": 0.0002, + "loss": 2.3846, + "step": 4200 + }, + { + "epoch": 0.3137108792846498, + "grad_norm": 2.258878469467163, + "learning_rate": 0.0002, + "loss": 2.6184, + "step": 4210 + }, + { + "epoch": 0.3144560357675112, + "grad_norm": 2.1145365238189697, + "learning_rate": 0.0002, + "loss": 2.5497, + "step": 4220 + }, + { + "epoch": 0.3152011922503726, + "grad_norm": 2.1772029399871826, + "learning_rate": 0.0002, + "loss": 2.4528, + "step": 4230 + }, + { + "epoch": 0.31594634873323396, + "grad_norm": 2.5457372665405273, + "learning_rate": 0.0002, + "loss": 2.6091, + "step": 4240 + }, + { + "epoch": 0.3166915052160954, + "grad_norm": 1.8056083917617798, + "learning_rate": 0.0002, + "loss": 2.48, + "step": 4250 + }, + { + "epoch": 0.3174366616989568, + "grad_norm": 2.0121543407440186, + "learning_rate": 0.0002, + "loss": 2.5622, + "step": 4260 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 2.6106455326080322, + "learning_rate": 0.0002, + "loss": 2.5243, + "step": 4270 + }, + { + "epoch": 0.31892697466467956, + "grad_norm": 1.9244214296340942, + "learning_rate": 0.0002, + "loss": 2.4204, + "step": 4280 + }, + { + "epoch": 0.319672131147541, + "grad_norm": 2.1080305576324463, + "learning_rate": 0.0002, + "loss": 2.4296, + "step": 4290 + }, + { + "epoch": 0.3204172876304024, + "grad_norm": 2.309900999069214, + "learning_rate": 0.0002, + "loss": 2.4689, + "step": 4300 + }, + { + "epoch": 0.3211624441132638, + "grad_norm": 2.2716972827911377, + "learning_rate": 0.0002, + "loss": 2.3885, + "step": 4310 + }, + { + "epoch": 0.32190760059612517, + "grad_norm": 2.2576944828033447, + "learning_rate": 0.0002, + "loss": 2.3281, + "step": 4320 + }, + { + "epoch": 0.3226527570789866, + "grad_norm": 1.943267583847046, + "learning_rate": 0.0002, + "loss": 2.4674, + "step": 4330 + }, + { + "epoch": 0.323397913561848, + "grad_norm": 1.886509895324707, + "learning_rate": 0.0002, + "loss": 2.491, + "step": 4340 + }, + { + "epoch": 0.3241430700447094, + "grad_norm": 1.9226173162460327, + "learning_rate": 0.0002, + "loss": 2.4899, + "step": 4350 + }, + { + "epoch": 0.32488822652757077, + "grad_norm": 2.016324043273926, + "learning_rate": 0.0002, + "loss": 2.4347, + "step": 4360 + }, + { + "epoch": 0.3256333830104322, + "grad_norm": 1.8798201084136963, + "learning_rate": 0.0002, + "loss": 2.7045, + "step": 4370 + }, + { + "epoch": 0.3263785394932936, + "grad_norm": 2.0897727012634277, + "learning_rate": 0.0002, + "loss": 2.4779, + "step": 4380 + }, + { + "epoch": 0.327123695976155, + "grad_norm": 2.4956274032592773, + "learning_rate": 0.0002, + "loss": 2.5188, + "step": 4390 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 2.08056378364563, + "learning_rate": 0.0002, + "loss": 2.4447, + "step": 4400 + }, + { + "epoch": 0.3286140089418778, + "grad_norm": 2.2066187858581543, + "learning_rate": 0.0002, + "loss": 2.5474, + "step": 4410 + }, + { + "epoch": 0.3293591654247392, + "grad_norm": 1.8168262243270874, + "learning_rate": 0.0002, + "loss": 2.5207, + "step": 4420 + }, + { + "epoch": 0.3301043219076006, + "grad_norm": 2.649477481842041, + "learning_rate": 0.0002, + "loss": 2.4138, + "step": 4430 + }, + { + "epoch": 0.33084947839046197, + "grad_norm": 2.3449325561523438, + "learning_rate": 0.0002, + "loss": 2.5474, + "step": 4440 + }, + { + "epoch": 0.3315946348733234, + "grad_norm": 1.7262647151947021, + "learning_rate": 0.0002, + "loss": 2.5911, + "step": 4450 + }, + { + "epoch": 0.3323397913561848, + "grad_norm": 2.0296733379364014, + "learning_rate": 0.0002, + "loss": 2.5417, + "step": 4460 + }, + { + "epoch": 0.3330849478390462, + "grad_norm": 2.036099672317505, + "learning_rate": 0.0002, + "loss": 2.3912, + "step": 4470 + }, + { + "epoch": 0.33383010432190763, + "grad_norm": 1.9934238195419312, + "learning_rate": 0.0002, + "loss": 2.5022, + "step": 4480 + }, + { + "epoch": 0.334575260804769, + "grad_norm": 2.1589412689208984, + "learning_rate": 0.0002, + "loss": 2.5989, + "step": 4490 + }, + { + "epoch": 0.3353204172876304, + "grad_norm": 1.8449981212615967, + "learning_rate": 0.0002, + "loss": 2.233, + "step": 4500 + }, + { + "epoch": 0.3360655737704918, + "grad_norm": 1.7662941217422485, + "learning_rate": 0.0002, + "loss": 2.5521, + "step": 4510 + }, + { + "epoch": 0.33681073025335323, + "grad_norm": 2.3483681678771973, + "learning_rate": 0.0002, + "loss": 2.4731, + "step": 4520 + }, + { + "epoch": 0.3375558867362146, + "grad_norm": 1.8929402828216553, + "learning_rate": 0.0002, + "loss": 2.4925, + "step": 4530 + }, + { + "epoch": 0.338301043219076, + "grad_norm": 1.988408088684082, + "learning_rate": 0.0002, + "loss": 2.5581, + "step": 4540 + }, + { + "epoch": 0.3390461997019374, + "grad_norm": 2.3717474937438965, + "learning_rate": 0.0002, + "loss": 2.5353, + "step": 4550 + }, + { + "epoch": 0.33979135618479883, + "grad_norm": 2.058992862701416, + "learning_rate": 0.0002, + "loss": 2.4383, + "step": 4560 + }, + { + "epoch": 0.3405365126676602, + "grad_norm": 2.3037335872650146, + "learning_rate": 0.0002, + "loss": 2.6187, + "step": 4570 + }, + { + "epoch": 0.3412816691505216, + "grad_norm": 2.0833449363708496, + "learning_rate": 0.0002, + "loss": 2.6367, + "step": 4580 + }, + { + "epoch": 0.342026825633383, + "grad_norm": 1.6986418962478638, + "learning_rate": 0.0002, + "loss": 2.343, + "step": 4590 + }, + { + "epoch": 0.34277198211624443, + "grad_norm": 4.1914472579956055, + "learning_rate": 0.0002, + "loss": 2.6611, + "step": 4600 + }, + { + "epoch": 0.3435171385991058, + "grad_norm": 2.1022298336029053, + "learning_rate": 0.0002, + "loss": 2.5226, + "step": 4610 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 2.1484179496765137, + "learning_rate": 0.0002, + "loss": 2.3475, + "step": 4620 + }, + { + "epoch": 0.3450074515648286, + "grad_norm": 2.1597182750701904, + "learning_rate": 0.0002, + "loss": 2.5566, + "step": 4630 + }, + { + "epoch": 0.34575260804769004, + "grad_norm": 2.0153634548187256, + "learning_rate": 0.0002, + "loss": 2.399, + "step": 4640 + }, + { + "epoch": 0.3464977645305514, + "grad_norm": 4.205637454986572, + "learning_rate": 0.0002, + "loss": 2.4738, + "step": 4650 + }, + { + "epoch": 0.3472429210134128, + "grad_norm": 1.965390682220459, + "learning_rate": 0.0002, + "loss": 2.5928, + "step": 4660 + }, + { + "epoch": 0.3479880774962742, + "grad_norm": 2.318188428878784, + "learning_rate": 0.0002, + "loss": 2.3906, + "step": 4670 + }, + { + "epoch": 0.34873323397913564, + "grad_norm": 2.16817045211792, + "learning_rate": 0.0002, + "loss": 2.5063, + "step": 4680 + }, + { + "epoch": 0.349478390461997, + "grad_norm": 1.8313651084899902, + "learning_rate": 0.0002, + "loss": 2.3634, + "step": 4690 + }, + { + "epoch": 0.3502235469448584, + "grad_norm": 2.083974599838257, + "learning_rate": 0.0002, + "loss": 2.4984, + "step": 4700 + }, + { + "epoch": 0.3509687034277198, + "grad_norm": 2.033154249191284, + "learning_rate": 0.0002, + "loss": 2.5167, + "step": 4710 + }, + { + "epoch": 0.35171385991058124, + "grad_norm": 1.7030832767486572, + "learning_rate": 0.0002, + "loss": 2.3332, + "step": 4720 + }, + { + "epoch": 0.3524590163934426, + "grad_norm": 2.236445188522339, + "learning_rate": 0.0002, + "loss": 2.5903, + "step": 4730 + }, + { + "epoch": 0.353204172876304, + "grad_norm": 2.0333669185638428, + "learning_rate": 0.0002, + "loss": 2.6599, + "step": 4740 + }, + { + "epoch": 0.3539493293591654, + "grad_norm": 2.043572425842285, + "learning_rate": 0.0002, + "loss": 2.5685, + "step": 4750 + }, + { + "epoch": 0.35469448584202684, + "grad_norm": 1.7955186367034912, + "learning_rate": 0.0002, + "loss": 2.4531, + "step": 4760 + }, + { + "epoch": 0.3554396423248882, + "grad_norm": 2.064957857131958, + "learning_rate": 0.0002, + "loss": 2.5, + "step": 4770 + }, + { + "epoch": 0.3561847988077496, + "grad_norm": 2.2814271450042725, + "learning_rate": 0.0002, + "loss": 2.3527, + "step": 4780 + }, + { + "epoch": 0.356929955290611, + "grad_norm": 2.179020881652832, + "learning_rate": 0.0002, + "loss": 2.4888, + "step": 4790 + }, + { + "epoch": 0.35767511177347244, + "grad_norm": 1.7410861253738403, + "learning_rate": 0.0002, + "loss": 2.3833, + "step": 4800 + }, + { + "epoch": 0.35842026825633383, + "grad_norm": 2.1326522827148438, + "learning_rate": 0.0002, + "loss": 2.5156, + "step": 4810 + }, + { + "epoch": 0.3591654247391952, + "grad_norm": 2.075561761856079, + "learning_rate": 0.0002, + "loss": 2.4855, + "step": 4820 + }, + { + "epoch": 0.35991058122205666, + "grad_norm": 2.168584108352661, + "learning_rate": 0.0002, + "loss": 2.5904, + "step": 4830 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 2.0109267234802246, + "learning_rate": 0.0002, + "loss": 2.502, + "step": 4840 + }, + { + "epoch": 0.36140089418777943, + "grad_norm": 1.8693374395370483, + "learning_rate": 0.0002, + "loss": 2.531, + "step": 4850 + }, + { + "epoch": 0.3621460506706408, + "grad_norm": 2.561384439468384, + "learning_rate": 0.0002, + "loss": 2.2466, + "step": 4860 + }, + { + "epoch": 0.36289120715350226, + "grad_norm": 2.319011688232422, + "learning_rate": 0.0002, + "loss": 2.4845, + "step": 4870 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 2.0164289474487305, + "learning_rate": 0.0002, + "loss": 2.6448, + "step": 4880 + }, + { + "epoch": 0.36438152011922503, + "grad_norm": 1.8674942255020142, + "learning_rate": 0.0002, + "loss": 2.4344, + "step": 4890 + }, + { + "epoch": 0.3651266766020864, + "grad_norm": 1.8700525760650635, + "learning_rate": 0.0002, + "loss": 2.412, + "step": 4900 + }, + { + "epoch": 0.36587183308494786, + "grad_norm": 1.9874043464660645, + "learning_rate": 0.0002, + "loss": 2.5567, + "step": 4910 + }, + { + "epoch": 0.36661698956780925, + "grad_norm": 1.914652943611145, + "learning_rate": 0.0002, + "loss": 2.5984, + "step": 4920 + }, + { + "epoch": 0.36736214605067063, + "grad_norm": 2.1003236770629883, + "learning_rate": 0.0002, + "loss": 2.6851, + "step": 4930 + }, + { + "epoch": 0.368107302533532, + "grad_norm": 1.9648061990737915, + "learning_rate": 0.0002, + "loss": 2.4207, + "step": 4940 + }, + { + "epoch": 0.36885245901639346, + "grad_norm": 2.0232656002044678, + "learning_rate": 0.0002, + "loss": 2.4922, + "step": 4950 + }, + { + "epoch": 0.36959761549925485, + "grad_norm": 2.0865535736083984, + "learning_rate": 0.0002, + "loss": 2.6409, + "step": 4960 + }, + { + "epoch": 0.37034277198211624, + "grad_norm": 1.8709211349487305, + "learning_rate": 0.0002, + "loss": 2.3267, + "step": 4970 + }, + { + "epoch": 0.3710879284649776, + "grad_norm": 2.093190908432007, + "learning_rate": 0.0002, + "loss": 2.3293, + "step": 4980 + }, + { + "epoch": 0.37183308494783907, + "grad_norm": 2.2843077182769775, + "learning_rate": 0.0002, + "loss": 2.5344, + "step": 4990 + }, + { + "epoch": 0.37257824143070045, + "grad_norm": 1.9902077913284302, + "learning_rate": 0.0002, + "loss": 2.4221, + "step": 5000 + }, + { + "epoch": 0.37332339791356184, + "grad_norm": 1.827880859375, + "learning_rate": 0.0002, + "loss": 2.3791, + "step": 5010 + }, + { + "epoch": 0.3740685543964232, + "grad_norm": 2.0986554622650146, + "learning_rate": 0.0002, + "loss": 2.5067, + "step": 5020 + }, + { + "epoch": 0.37481371087928467, + "grad_norm": 2.260951519012451, + "learning_rate": 0.0002, + "loss": 2.5156, + "step": 5030 + }, + { + "epoch": 0.37555886736214605, + "grad_norm": 2.1205878257751465, + "learning_rate": 0.0002, + "loss": 2.524, + "step": 5040 + }, + { + "epoch": 0.37630402384500744, + "grad_norm": 1.7332857847213745, + "learning_rate": 0.0002, + "loss": 2.4875, + "step": 5050 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 2.1427862644195557, + "learning_rate": 0.0002, + "loss": 2.3529, + "step": 5060 + }, + { + "epoch": 0.37779433681073027, + "grad_norm": 2.6252365112304688, + "learning_rate": 0.0002, + "loss": 2.6104, + "step": 5070 + }, + { + "epoch": 0.37853949329359166, + "grad_norm": 2.0142056941986084, + "learning_rate": 0.0002, + "loss": 2.5434, + "step": 5080 + }, + { + "epoch": 0.37928464977645304, + "grad_norm": 2.4451770782470703, + "learning_rate": 0.0002, + "loss": 2.4673, + "step": 5090 + }, + { + "epoch": 0.38002980625931443, + "grad_norm": 2.5081820487976074, + "learning_rate": 0.0002, + "loss": 2.4091, + "step": 5100 + }, + { + "epoch": 0.38077496274217587, + "grad_norm": 2.149099588394165, + "learning_rate": 0.0002, + "loss": 2.4014, + "step": 5110 + }, + { + "epoch": 0.38152011922503726, + "grad_norm": 2.3913164138793945, + "learning_rate": 0.0002, + "loss": 2.437, + "step": 5120 + }, + { + "epoch": 0.38226527570789864, + "grad_norm": 2.1281864643096924, + "learning_rate": 0.0002, + "loss": 2.5205, + "step": 5130 + }, + { + "epoch": 0.3830104321907601, + "grad_norm": 1.710132122039795, + "learning_rate": 0.0002, + "loss": 2.3987, + "step": 5140 + }, + { + "epoch": 0.3837555886736215, + "grad_norm": 2.173602342605591, + "learning_rate": 0.0002, + "loss": 2.5606, + "step": 5150 + }, + { + "epoch": 0.38450074515648286, + "grad_norm": 2.032154083251953, + "learning_rate": 0.0002, + "loss": 2.5223, + "step": 5160 + }, + { + "epoch": 0.38524590163934425, + "grad_norm": 2.1516082286834717, + "learning_rate": 0.0002, + "loss": 2.4851, + "step": 5170 + }, + { + "epoch": 0.3859910581222057, + "grad_norm": 1.9636366367340088, + "learning_rate": 0.0002, + "loss": 2.3907, + "step": 5180 + }, + { + "epoch": 0.3867362146050671, + "grad_norm": 2.3789303302764893, + "learning_rate": 0.0002, + "loss": 2.3962, + "step": 5190 + }, + { + "epoch": 0.38748137108792846, + "grad_norm": 2.102897882461548, + "learning_rate": 0.0002, + "loss": 2.6373, + "step": 5200 + }, + { + "epoch": 0.38822652757078985, + "grad_norm": 2.0494508743286133, + "learning_rate": 0.0002, + "loss": 2.4939, + "step": 5210 + }, + { + "epoch": 0.3889716840536513, + "grad_norm": 1.95903480052948, + "learning_rate": 0.0002, + "loss": 2.4768, + "step": 5220 + }, + { + "epoch": 0.3897168405365127, + "grad_norm": 2.15665864944458, + "learning_rate": 0.0002, + "loss": 2.5624, + "step": 5230 + }, + { + "epoch": 0.39046199701937406, + "grad_norm": 2.1582460403442383, + "learning_rate": 0.0002, + "loss": 2.4983, + "step": 5240 + }, + { + "epoch": 0.39120715350223545, + "grad_norm": 1.939427375793457, + "learning_rate": 0.0002, + "loss": 2.4378, + "step": 5250 + }, + { + "epoch": 0.3919523099850969, + "grad_norm": 1.7251907587051392, + "learning_rate": 0.0002, + "loss": 2.3679, + "step": 5260 + }, + { + "epoch": 0.3926974664679583, + "grad_norm": 1.9326486587524414, + "learning_rate": 0.0002, + "loss": 2.5102, + "step": 5270 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 2.2457611560821533, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 5280 + }, + { + "epoch": 0.39418777943368105, + "grad_norm": 2.027223825454712, + "learning_rate": 0.0002, + "loss": 2.3882, + "step": 5290 + }, + { + "epoch": 0.3949329359165425, + "grad_norm": 2.135723829269409, + "learning_rate": 0.0002, + "loss": 2.3614, + "step": 5300 + }, + { + "epoch": 0.3956780923994039, + "grad_norm": 2.2400708198547363, + "learning_rate": 0.0002, + "loss": 2.4794, + "step": 5310 + }, + { + "epoch": 0.39642324888226527, + "grad_norm": 2.0185799598693848, + "learning_rate": 0.0002, + "loss": 2.5809, + "step": 5320 + }, + { + "epoch": 0.39716840536512665, + "grad_norm": 2.2141106128692627, + "learning_rate": 0.0002, + "loss": 2.5126, + "step": 5330 + }, + { + "epoch": 0.3979135618479881, + "grad_norm": 2.2849326133728027, + "learning_rate": 0.0002, + "loss": 2.3855, + "step": 5340 + }, + { + "epoch": 0.3986587183308495, + "grad_norm": 1.7335988283157349, + "learning_rate": 0.0002, + "loss": 2.3463, + "step": 5350 + }, + { + "epoch": 0.39940387481371087, + "grad_norm": 1.9355789422988892, + "learning_rate": 0.0002, + "loss": 2.5056, + "step": 5360 + }, + { + "epoch": 0.40014903129657226, + "grad_norm": 1.7740095853805542, + "learning_rate": 0.0002, + "loss": 2.5791, + "step": 5370 + }, + { + "epoch": 0.4008941877794337, + "grad_norm": 1.7497676610946655, + "learning_rate": 0.0002, + "loss": 2.4308, + "step": 5380 + }, + { + "epoch": 0.4016393442622951, + "grad_norm": 2.0560314655303955, + "learning_rate": 0.0002, + "loss": 2.4581, + "step": 5390 + }, + { + "epoch": 0.40238450074515647, + "grad_norm": 2.065265655517578, + "learning_rate": 0.0002, + "loss": 2.5172, + "step": 5400 + }, + { + "epoch": 0.40312965722801786, + "grad_norm": 2.282453775405884, + "learning_rate": 0.0002, + "loss": 2.3409, + "step": 5410 + }, + { + "epoch": 0.4038748137108793, + "grad_norm": 2.3466989994049072, + "learning_rate": 0.0002, + "loss": 2.5305, + "step": 5420 + }, + { + "epoch": 0.4046199701937407, + "grad_norm": 2.2113606929779053, + "learning_rate": 0.0002, + "loss": 2.6027, + "step": 5430 + }, + { + "epoch": 0.40536512667660207, + "grad_norm": 2.149338483810425, + "learning_rate": 0.0002, + "loss": 2.3704, + "step": 5440 + }, + { + "epoch": 0.4061102831594635, + "grad_norm": 2.129307270050049, + "learning_rate": 0.0002, + "loss": 2.3367, + "step": 5450 + }, + { + "epoch": 0.4068554396423249, + "grad_norm": 2.2620790004730225, + "learning_rate": 0.0002, + "loss": 2.5256, + "step": 5460 + }, + { + "epoch": 0.4076005961251863, + "grad_norm": 2.623889684677124, + "learning_rate": 0.0002, + "loss": 2.5516, + "step": 5470 + }, + { + "epoch": 0.4083457526080477, + "grad_norm": 1.7352521419525146, + "learning_rate": 0.0002, + "loss": 2.2739, + "step": 5480 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 2.0829408168792725, + "learning_rate": 0.0002, + "loss": 2.5512, + "step": 5490 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 2.000159978866577, + "learning_rate": 0.0002, + "loss": 2.3084, + "step": 5500 + }, + { + "epoch": 0.4105812220566319, + "grad_norm": 2.665837526321411, + "learning_rate": 0.0002, + "loss": 2.3492, + "step": 5510 + }, + { + "epoch": 0.4113263785394933, + "grad_norm": 2.220322608947754, + "learning_rate": 0.0002, + "loss": 2.5528, + "step": 5520 + }, + { + "epoch": 0.4120715350223547, + "grad_norm": 2.322227954864502, + "learning_rate": 0.0002, + "loss": 2.4756, + "step": 5530 + }, + { + "epoch": 0.4128166915052161, + "grad_norm": 2.0118496417999268, + "learning_rate": 0.0002, + "loss": 2.564, + "step": 5540 + }, + { + "epoch": 0.4135618479880775, + "grad_norm": 2.0772318840026855, + "learning_rate": 0.0002, + "loss": 2.583, + "step": 5550 + }, + { + "epoch": 0.4143070044709389, + "grad_norm": 2.002246141433716, + "learning_rate": 0.0002, + "loss": 2.6718, + "step": 5560 + }, + { + "epoch": 0.4150521609538003, + "grad_norm": 2.208174228668213, + "learning_rate": 0.0002, + "loss": 2.4075, + "step": 5570 + }, + { + "epoch": 0.4157973174366617, + "grad_norm": 2.2165260314941406, + "learning_rate": 0.0002, + "loss": 2.6347, + "step": 5580 + }, + { + "epoch": 0.4165424739195231, + "grad_norm": 2.365262508392334, + "learning_rate": 0.0002, + "loss": 2.5209, + "step": 5590 + }, + { + "epoch": 0.4172876304023845, + "grad_norm": 2.264592170715332, + "learning_rate": 0.0002, + "loss": 2.4414, + "step": 5600 + }, + { + "epoch": 0.4180327868852459, + "grad_norm": 1.9426195621490479, + "learning_rate": 0.0002, + "loss": 2.313, + "step": 5610 + }, + { + "epoch": 0.4187779433681073, + "grad_norm": 2.155670642852783, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 5620 + }, + { + "epoch": 0.4195230998509687, + "grad_norm": 1.9506397247314453, + "learning_rate": 0.0002, + "loss": 2.4709, + "step": 5630 + }, + { + "epoch": 0.4202682563338301, + "grad_norm": 2.3226125240325928, + "learning_rate": 0.0002, + "loss": 2.3527, + "step": 5640 + }, + { + "epoch": 0.4210134128166915, + "grad_norm": 1.6601407527923584, + "learning_rate": 0.0002, + "loss": 2.4622, + "step": 5650 + }, + { + "epoch": 0.4217585692995529, + "grad_norm": 2.126014471054077, + "learning_rate": 0.0002, + "loss": 2.4602, + "step": 5660 + }, + { + "epoch": 0.4225037257824143, + "grad_norm": 2.292633295059204, + "learning_rate": 0.0002, + "loss": 2.7211, + "step": 5670 + }, + { + "epoch": 0.4232488822652757, + "grad_norm": 2.1793317794799805, + "learning_rate": 0.0002, + "loss": 2.4626, + "step": 5680 + }, + { + "epoch": 0.4239940387481371, + "grad_norm": 2.196563720703125, + "learning_rate": 0.0002, + "loss": 2.5533, + "step": 5690 + }, + { + "epoch": 0.4247391952309985, + "grad_norm": 2.425448417663574, + "learning_rate": 0.0002, + "loss": 2.4851, + "step": 5700 + }, + { + "epoch": 0.4254843517138599, + "grad_norm": 2.2193682193756104, + "learning_rate": 0.0002, + "loss": 2.3698, + "step": 5710 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 2.054805040359497, + "learning_rate": 0.0002, + "loss": 2.5059, + "step": 5720 + }, + { + "epoch": 0.4269746646795827, + "grad_norm": 1.8185921907424927, + "learning_rate": 0.0002, + "loss": 2.5176, + "step": 5730 + }, + { + "epoch": 0.4277198211624441, + "grad_norm": 1.9967896938323975, + "learning_rate": 0.0002, + "loss": 2.4504, + "step": 5740 + }, + { + "epoch": 0.4284649776453055, + "grad_norm": 2.662285327911377, + "learning_rate": 0.0002, + "loss": 2.4244, + "step": 5750 + }, + { + "epoch": 0.42921013412816694, + "grad_norm": 2.16428804397583, + "learning_rate": 0.0002, + "loss": 2.2946, + "step": 5760 + }, + { + "epoch": 0.42995529061102833, + "grad_norm": 2.0115864276885986, + "learning_rate": 0.0002, + "loss": 2.4657, + "step": 5770 + }, + { + "epoch": 0.4307004470938897, + "grad_norm": 2.388542413711548, + "learning_rate": 0.0002, + "loss": 2.5151, + "step": 5780 + }, + { + "epoch": 0.4314456035767511, + "grad_norm": 2.0310490131378174, + "learning_rate": 0.0002, + "loss": 2.3604, + "step": 5790 + }, + { + "epoch": 0.43219076005961254, + "grad_norm": 1.8433557748794556, + "learning_rate": 0.0002, + "loss": 2.5056, + "step": 5800 + }, + { + "epoch": 0.43293591654247393, + "grad_norm": 2.093193769454956, + "learning_rate": 0.0002, + "loss": 2.5444, + "step": 5810 + }, + { + "epoch": 0.4336810730253353, + "grad_norm": 2.3394739627838135, + "learning_rate": 0.0002, + "loss": 2.4693, + "step": 5820 + }, + { + "epoch": 0.4344262295081967, + "grad_norm": 2.294088363647461, + "learning_rate": 0.0002, + "loss": 2.2499, + "step": 5830 + }, + { + "epoch": 0.43517138599105815, + "grad_norm": 2.3843374252319336, + "learning_rate": 0.0002, + "loss": 2.5797, + "step": 5840 + }, + { + "epoch": 0.43591654247391953, + "grad_norm": 2.0269100666046143, + "learning_rate": 0.0002, + "loss": 2.5623, + "step": 5850 + }, + { + "epoch": 0.4366616989567809, + "grad_norm": 2.5061495304107666, + "learning_rate": 0.0002, + "loss": 2.3113, + "step": 5860 + }, + { + "epoch": 0.4374068554396423, + "grad_norm": 2.5287938117980957, + "learning_rate": 0.0002, + "loss": 2.3915, + "step": 5870 + }, + { + "epoch": 0.43815201192250375, + "grad_norm": 2.0537567138671875, + "learning_rate": 0.0002, + "loss": 2.6012, + "step": 5880 + }, + { + "epoch": 0.43889716840536513, + "grad_norm": 2.5927767753601074, + "learning_rate": 0.0002, + "loss": 2.4097, + "step": 5890 + }, + { + "epoch": 0.4396423248882265, + "grad_norm": 2.193775177001953, + "learning_rate": 0.0002, + "loss": 2.4681, + "step": 5900 + }, + { + "epoch": 0.4403874813710879, + "grad_norm": 1.99056077003479, + "learning_rate": 0.0002, + "loss": 2.4156, + "step": 5910 + }, + { + "epoch": 0.44113263785394935, + "grad_norm": 2.4100918769836426, + "learning_rate": 0.0002, + "loss": 2.5194, + "step": 5920 + }, + { + "epoch": 0.44187779433681074, + "grad_norm": 2.178215980529785, + "learning_rate": 0.0002, + "loss": 2.4394, + "step": 5930 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 2.3645856380462646, + "learning_rate": 0.0002, + "loss": 2.4488, + "step": 5940 + }, + { + "epoch": 0.4433681073025335, + "grad_norm": 1.8661149740219116, + "learning_rate": 0.0002, + "loss": 2.4434, + "step": 5950 + }, + { + "epoch": 0.44411326378539495, + "grad_norm": 1.9467486143112183, + "learning_rate": 0.0002, + "loss": 2.462, + "step": 5960 + }, + { + "epoch": 0.44485842026825634, + "grad_norm": 2.4336559772491455, + "learning_rate": 0.0002, + "loss": 2.6196, + "step": 5970 + }, + { + "epoch": 0.4456035767511177, + "grad_norm": 2.2935919761657715, + "learning_rate": 0.0002, + "loss": 2.6004, + "step": 5980 + }, + { + "epoch": 0.4463487332339791, + "grad_norm": 2.2421000003814697, + "learning_rate": 0.0002, + "loss": 2.4005, + "step": 5990 + }, + { + "epoch": 0.44709388971684055, + "grad_norm": 1.7802613973617554, + "learning_rate": 0.0002, + "loss": 2.4354, + "step": 6000 + }, + { + "epoch": 0.44783904619970194, + "grad_norm": 2.0072882175445557, + "learning_rate": 0.0002, + "loss": 2.4345, + "step": 6010 + }, + { + "epoch": 0.4485842026825633, + "grad_norm": 1.9635995626449585, + "learning_rate": 0.0002, + "loss": 2.5976, + "step": 6020 + }, + { + "epoch": 0.4493293591654247, + "grad_norm": 2.6438255310058594, + "learning_rate": 0.0002, + "loss": 2.4887, + "step": 6030 + }, + { + "epoch": 0.45007451564828616, + "grad_norm": 2.0173120498657227, + "learning_rate": 0.0002, + "loss": 2.6285, + "step": 6040 + }, + { + "epoch": 0.45081967213114754, + "grad_norm": 2.0639147758483887, + "learning_rate": 0.0002, + "loss": 2.6229, + "step": 6050 + }, + { + "epoch": 0.45156482861400893, + "grad_norm": 2.2229113578796387, + "learning_rate": 0.0002, + "loss": 2.3337, + "step": 6060 + }, + { + "epoch": 0.4523099850968703, + "grad_norm": 2.0675735473632812, + "learning_rate": 0.0002, + "loss": 2.3311, + "step": 6070 + }, + { + "epoch": 0.45305514157973176, + "grad_norm": 2.0282604694366455, + "learning_rate": 0.0002, + "loss": 2.5878, + "step": 6080 + }, + { + "epoch": 0.45380029806259314, + "grad_norm": 2.4298789501190186, + "learning_rate": 0.0002, + "loss": 2.5402, + "step": 6090 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 2.1782422065734863, + "learning_rate": 0.0002, + "loss": 2.3497, + "step": 6100 + }, + { + "epoch": 0.455290611028316, + "grad_norm": 2.3076884746551514, + "learning_rate": 0.0002, + "loss": 2.4455, + "step": 6110 + }, + { + "epoch": 0.45603576751117736, + "grad_norm": 2.051884651184082, + "learning_rate": 0.0002, + "loss": 2.4875, + "step": 6120 + }, + { + "epoch": 0.45678092399403875, + "grad_norm": 2.0429365634918213, + "learning_rate": 0.0002, + "loss": 2.5699, + "step": 6130 + }, + { + "epoch": 0.45752608047690013, + "grad_norm": 2.0632383823394775, + "learning_rate": 0.0002, + "loss": 2.5193, + "step": 6140 + }, + { + "epoch": 0.4582712369597616, + "grad_norm": 2.0191586017608643, + "learning_rate": 0.0002, + "loss": 2.4983, + "step": 6150 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 2.1893763542175293, + "learning_rate": 0.0002, + "loss": 2.4648, + "step": 6160 + }, + { + "epoch": 0.45976154992548435, + "grad_norm": 2.150398015975952, + "learning_rate": 0.0002, + "loss": 2.6308, + "step": 6170 + }, + { + "epoch": 0.46050670640834573, + "grad_norm": 2.084906578063965, + "learning_rate": 0.0002, + "loss": 2.5127, + "step": 6180 + }, + { + "epoch": 0.4612518628912072, + "grad_norm": 2.3456621170043945, + "learning_rate": 0.0002, + "loss": 2.5092, + "step": 6190 + }, + { + "epoch": 0.46199701937406856, + "grad_norm": 2.259631633758545, + "learning_rate": 0.0002, + "loss": 2.4077, + "step": 6200 + }, + { + "epoch": 0.46274217585692995, + "grad_norm": 1.9756247997283936, + "learning_rate": 0.0002, + "loss": 2.3623, + "step": 6210 + }, + { + "epoch": 0.46348733233979134, + "grad_norm": 1.8990384340286255, + "learning_rate": 0.0002, + "loss": 2.5315, + "step": 6220 + }, + { + "epoch": 0.4642324888226528, + "grad_norm": 2.1848907470703125, + "learning_rate": 0.0002, + "loss": 2.5418, + "step": 6230 + }, + { + "epoch": 0.46497764530551416, + "grad_norm": 2.267343521118164, + "learning_rate": 0.0002, + "loss": 2.4859, + "step": 6240 + }, + { + "epoch": 0.46572280178837555, + "grad_norm": 2.3617684841156006, + "learning_rate": 0.0002, + "loss": 2.3138, + "step": 6250 + }, + { + "epoch": 0.46646795827123694, + "grad_norm": 1.972421646118164, + "learning_rate": 0.0002, + "loss": 2.5332, + "step": 6260 + }, + { + "epoch": 0.4672131147540984, + "grad_norm": 2.0907230377197266, + "learning_rate": 0.0002, + "loss": 2.7235, + "step": 6270 + }, + { + "epoch": 0.46795827123695977, + "grad_norm": 2.452946901321411, + "learning_rate": 0.0002, + "loss": 2.4452, + "step": 6280 + }, + { + "epoch": 0.46870342771982115, + "grad_norm": 2.432163953781128, + "learning_rate": 0.0002, + "loss": 2.4138, + "step": 6290 + }, + { + "epoch": 0.46944858420268254, + "grad_norm": 2.2494499683380127, + "learning_rate": 0.0002, + "loss": 2.4877, + "step": 6300 + }, + { + "epoch": 0.470193740685544, + "grad_norm": 1.6413583755493164, + "learning_rate": 0.0002, + "loss": 2.5623, + "step": 6310 + }, + { + "epoch": 0.47093889716840537, + "grad_norm": 2.4607837200164795, + "learning_rate": 0.0002, + "loss": 2.4455, + "step": 6320 + }, + { + "epoch": 0.47168405365126675, + "grad_norm": 2.0634145736694336, + "learning_rate": 0.0002, + "loss": 2.5643, + "step": 6330 + }, + { + "epoch": 0.47242921013412814, + "grad_norm": 2.0293703079223633, + "learning_rate": 0.0002, + "loss": 2.4346, + "step": 6340 + }, + { + "epoch": 0.4731743666169896, + "grad_norm": 2.567640781402588, + "learning_rate": 0.0002, + "loss": 2.2669, + "step": 6350 + }, + { + "epoch": 0.47391952309985097, + "grad_norm": 2.118058204650879, + "learning_rate": 0.0002, + "loss": 2.5486, + "step": 6360 + }, + { + "epoch": 0.47466467958271236, + "grad_norm": 2.1259288787841797, + "learning_rate": 0.0002, + "loss": 2.5674, + "step": 6370 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 2.1126372814178467, + "learning_rate": 0.0002, + "loss": 2.6126, + "step": 6380 + }, + { + "epoch": 0.4761549925484352, + "grad_norm": 1.626694917678833, + "learning_rate": 0.0002, + "loss": 2.5019, + "step": 6390 + }, + { + "epoch": 0.47690014903129657, + "grad_norm": 2.0831708908081055, + "learning_rate": 0.0002, + "loss": 2.4327, + "step": 6400 + }, + { + "epoch": 0.47764530551415796, + "grad_norm": 1.9350471496582031, + "learning_rate": 0.0002, + "loss": 2.4867, + "step": 6410 + }, + { + "epoch": 0.4783904619970194, + "grad_norm": 2.1217947006225586, + "learning_rate": 0.0002, + "loss": 2.5254, + "step": 6420 + }, + { + "epoch": 0.4791356184798808, + "grad_norm": 2.3201823234558105, + "learning_rate": 0.0002, + "loss": 2.4434, + "step": 6430 + }, + { + "epoch": 0.4798807749627422, + "grad_norm": 2.2600150108337402, + "learning_rate": 0.0002, + "loss": 2.4616, + "step": 6440 + }, + { + "epoch": 0.48062593144560356, + "grad_norm": 2.360180377960205, + "learning_rate": 0.0002, + "loss": 2.4934, + "step": 6450 + }, + { + "epoch": 0.481371087928465, + "grad_norm": 2.4700534343719482, + "learning_rate": 0.0002, + "loss": 2.3958, + "step": 6460 + }, + { + "epoch": 0.4821162444113264, + "grad_norm": 2.0691604614257812, + "learning_rate": 0.0002, + "loss": 2.4818, + "step": 6470 + }, + { + "epoch": 0.4828614008941878, + "grad_norm": 1.950579285621643, + "learning_rate": 0.0002, + "loss": 2.4003, + "step": 6480 + }, + { + "epoch": 0.48360655737704916, + "grad_norm": 2.0040478706359863, + "learning_rate": 0.0002, + "loss": 2.5706, + "step": 6490 + }, + { + "epoch": 0.4843517138599106, + "grad_norm": 2.4494431018829346, + "learning_rate": 0.0002, + "loss": 2.4676, + "step": 6500 + }, + { + "epoch": 0.485096870342772, + "grad_norm": 2.25048565864563, + "learning_rate": 0.0002, + "loss": 2.6437, + "step": 6510 + }, + { + "epoch": 0.4858420268256334, + "grad_norm": 2.4388575553894043, + "learning_rate": 0.0002, + "loss": 2.4013, + "step": 6520 + }, + { + "epoch": 0.48658718330849476, + "grad_norm": 2.7190568447113037, + "learning_rate": 0.0002, + "loss": 2.1267, + "step": 6530 + }, + { + "epoch": 0.4873323397913562, + "grad_norm": 2.00464129447937, + "learning_rate": 0.0002, + "loss": 2.246, + "step": 6540 + }, + { + "epoch": 0.4880774962742176, + "grad_norm": 2.230637550354004, + "learning_rate": 0.0002, + "loss": 2.3657, + "step": 6550 + }, + { + "epoch": 0.488822652757079, + "grad_norm": 2.030550479888916, + "learning_rate": 0.0002, + "loss": 2.5825, + "step": 6560 + }, + { + "epoch": 0.48956780923994037, + "grad_norm": 1.7973986864089966, + "learning_rate": 0.0002, + "loss": 2.2891, + "step": 6570 + }, + { + "epoch": 0.4903129657228018, + "grad_norm": 2.1454880237579346, + "learning_rate": 0.0002, + "loss": 2.3413, + "step": 6580 + }, + { + "epoch": 0.4910581222056632, + "grad_norm": 2.1549787521362305, + "learning_rate": 0.0002, + "loss": 2.3444, + "step": 6590 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 2.1284945011138916, + "learning_rate": 0.0002, + "loss": 2.5555, + "step": 6600 + }, + { + "epoch": 0.49254843517138597, + "grad_norm": 2.23075008392334, + "learning_rate": 0.0002, + "loss": 2.3384, + "step": 6610 + }, + { + "epoch": 0.4932935916542474, + "grad_norm": 2.489455223083496, + "learning_rate": 0.0002, + "loss": 2.6383, + "step": 6620 + }, + { + "epoch": 0.4940387481371088, + "grad_norm": 2.2579898834228516, + "learning_rate": 0.0002, + "loss": 2.6226, + "step": 6630 + }, + { + "epoch": 0.4947839046199702, + "grad_norm": 2.3494224548339844, + "learning_rate": 0.0002, + "loss": 2.502, + "step": 6640 + }, + { + "epoch": 0.49552906110283157, + "grad_norm": 2.082937240600586, + "learning_rate": 0.0002, + "loss": 2.5592, + "step": 6650 + }, + { + "epoch": 0.496274217585693, + "grad_norm": 2.0874314308166504, + "learning_rate": 0.0002, + "loss": 2.5922, + "step": 6660 + }, + { + "epoch": 0.4970193740685544, + "grad_norm": 2.1997947692871094, + "learning_rate": 0.0002, + "loss": 2.6147, + "step": 6670 + }, + { + "epoch": 0.4977645305514158, + "grad_norm": 2.215691089630127, + "learning_rate": 0.0002, + "loss": 2.4961, + "step": 6680 + }, + { + "epoch": 0.49850968703427717, + "grad_norm": 2.699936866760254, + "learning_rate": 0.0002, + "loss": 2.6774, + "step": 6690 + }, + { + "epoch": 0.4992548435171386, + "grad_norm": 2.2943921089172363, + "learning_rate": 0.0002, + "loss": 2.4801, + "step": 6700 + }, + { + "epoch": 0.5, + "grad_norm": 2.3712542057037354, + "learning_rate": 0.0002, + "loss": 2.4751, + "step": 6710 + }, + { + "epoch": 0.5007451564828614, + "grad_norm": 2.5637362003326416, + "learning_rate": 0.0002, + "loss": 2.2542, + "step": 6720 + }, + { + "epoch": 0.5014903129657228, + "grad_norm": 2.330156087875366, + "learning_rate": 0.0002, + "loss": 2.4645, + "step": 6730 + }, + { + "epoch": 0.5022354694485842, + "grad_norm": 2.053035020828247, + "learning_rate": 0.0002, + "loss": 2.5752, + "step": 6740 + }, + { + "epoch": 0.5029806259314457, + "grad_norm": 2.305776834487915, + "learning_rate": 0.0002, + "loss": 2.5567, + "step": 6750 + }, + { + "epoch": 0.503725782414307, + "grad_norm": 2.023801326751709, + "learning_rate": 0.0002, + "loss": 2.3164, + "step": 6760 + }, + { + "epoch": 0.5044709388971684, + "grad_norm": 2.477642059326172, + "learning_rate": 0.0002, + "loss": 2.5777, + "step": 6770 + }, + { + "epoch": 0.5052160953800298, + "grad_norm": 2.298116683959961, + "learning_rate": 0.0002, + "loss": 2.4506, + "step": 6780 + }, + { + "epoch": 0.5059612518628912, + "grad_norm": 2.2904672622680664, + "learning_rate": 0.0002, + "loss": 2.3897, + "step": 6790 + }, + { + "epoch": 0.5067064083457526, + "grad_norm": 2.0368447303771973, + "learning_rate": 0.0002, + "loss": 2.42, + "step": 6800 + }, + { + "epoch": 0.507451564828614, + "grad_norm": 2.016451835632324, + "learning_rate": 0.0002, + "loss": 2.3806, + "step": 6810 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 1.8767539262771606, + "learning_rate": 0.0002, + "loss": 2.3789, + "step": 6820 + }, + { + "epoch": 0.5089418777943369, + "grad_norm": 2.399251937866211, + "learning_rate": 0.0002, + "loss": 2.5804, + "step": 6830 + }, + { + "epoch": 0.5096870342771982, + "grad_norm": 2.187103509902954, + "learning_rate": 0.0002, + "loss": 2.5263, + "step": 6840 + }, + { + "epoch": 0.5104321907600596, + "grad_norm": 1.9529699087142944, + "learning_rate": 0.0002, + "loss": 2.423, + "step": 6850 + }, + { + "epoch": 0.511177347242921, + "grad_norm": 4.577221393585205, + "learning_rate": 0.0002, + "loss": 2.5348, + "step": 6860 + }, + { + "epoch": 0.5119225037257824, + "grad_norm": 2.331979513168335, + "learning_rate": 0.0002, + "loss": 2.6584, + "step": 6870 + }, + { + "epoch": 0.5126676602086438, + "grad_norm": 2.2853405475616455, + "learning_rate": 0.0002, + "loss": 2.4096, + "step": 6880 + }, + { + "epoch": 0.5134128166915052, + "grad_norm": 2.516995906829834, + "learning_rate": 0.0002, + "loss": 2.4762, + "step": 6890 + }, + { + "epoch": 0.5141579731743666, + "grad_norm": 2.183138370513916, + "learning_rate": 0.0002, + "loss": 2.4239, + "step": 6900 + }, + { + "epoch": 0.5149031296572281, + "grad_norm": 2.2524988651275635, + "learning_rate": 0.0002, + "loss": 2.4089, + "step": 6910 + }, + { + "epoch": 0.5156482861400894, + "grad_norm": 2.2979516983032227, + "learning_rate": 0.0002, + "loss": 2.443, + "step": 6920 + }, + { + "epoch": 0.5163934426229508, + "grad_norm": 2.099586009979248, + "learning_rate": 0.0002, + "loss": 2.4679, + "step": 6930 + }, + { + "epoch": 0.5171385991058122, + "grad_norm": 2.2441070079803467, + "learning_rate": 0.0002, + "loss": 2.5278, + "step": 6940 + }, + { + "epoch": 0.5178837555886736, + "grad_norm": 2.376936197280884, + "learning_rate": 0.0002, + "loss": 2.5222, + "step": 6950 + }, + { + "epoch": 0.518628912071535, + "grad_norm": 2.196542739868164, + "learning_rate": 0.0002, + "loss": 2.4336, + "step": 6960 + }, + { + "epoch": 0.5193740685543964, + "grad_norm": 2.265864610671997, + "learning_rate": 0.0002, + "loss": 2.4876, + "step": 6970 + }, + { + "epoch": 0.5201192250372578, + "grad_norm": 2.2958121299743652, + "learning_rate": 0.0002, + "loss": 2.6756, + "step": 6980 + }, + { + "epoch": 0.5208643815201193, + "grad_norm": 2.2851719856262207, + "learning_rate": 0.0002, + "loss": 2.4873, + "step": 6990 + }, + { + "epoch": 0.5216095380029806, + "grad_norm": 2.3405418395996094, + "learning_rate": 0.0002, + "loss": 2.3907, + "step": 7000 + }, + { + "epoch": 0.522354694485842, + "grad_norm": 2.4564900398254395, + "learning_rate": 0.0002, + "loss": 2.5203, + "step": 7010 + }, + { + "epoch": 0.5230998509687034, + "grad_norm": 2.123331308364868, + "learning_rate": 0.0002, + "loss": 2.3261, + "step": 7020 + }, + { + "epoch": 0.5238450074515648, + "grad_norm": 2.0890355110168457, + "learning_rate": 0.0002, + "loss": 2.5784, + "step": 7030 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 2.4165306091308594, + "learning_rate": 0.0002, + "loss": 2.4716, + "step": 7040 + }, + { + "epoch": 0.5253353204172876, + "grad_norm": 2.1103525161743164, + "learning_rate": 0.0002, + "loss": 2.5825, + "step": 7050 + }, + { + "epoch": 0.526080476900149, + "grad_norm": 2.33457612991333, + "learning_rate": 0.0002, + "loss": 2.3823, + "step": 7060 + }, + { + "epoch": 0.5268256333830105, + "grad_norm": 2.0597524642944336, + "learning_rate": 0.0002, + "loss": 2.4671, + "step": 7070 + }, + { + "epoch": 0.5275707898658718, + "grad_norm": 3.1226096153259277, + "learning_rate": 0.0002, + "loss": 2.4017, + "step": 7080 + }, + { + "epoch": 0.5283159463487332, + "grad_norm": 2.0051512718200684, + "learning_rate": 0.0002, + "loss": 2.359, + "step": 7090 + }, + { + "epoch": 0.5290611028315947, + "grad_norm": 2.500908136367798, + "learning_rate": 0.0002, + "loss": 2.6868, + "step": 7100 + }, + { + "epoch": 0.529806259314456, + "grad_norm": 2.04001784324646, + "learning_rate": 0.0002, + "loss": 2.717, + "step": 7110 + }, + { + "epoch": 0.5305514157973175, + "grad_norm": 2.715292453765869, + "learning_rate": 0.0002, + "loss": 2.5725, + "step": 7120 + }, + { + "epoch": 0.5312965722801788, + "grad_norm": 2.3398818969726562, + "learning_rate": 0.0002, + "loss": 2.4834, + "step": 7130 + }, + { + "epoch": 0.5320417287630402, + "grad_norm": 2.456146240234375, + "learning_rate": 0.0002, + "loss": 2.4792, + "step": 7140 + }, + { + "epoch": 0.5327868852459017, + "grad_norm": 2.2321231365203857, + "learning_rate": 0.0002, + "loss": 2.6432, + "step": 7150 + }, + { + "epoch": 0.533532041728763, + "grad_norm": 2.06449294090271, + "learning_rate": 0.0002, + "loss": 2.3722, + "step": 7160 + }, + { + "epoch": 0.5342771982116244, + "grad_norm": 1.98611581325531, + "learning_rate": 0.0002, + "loss": 2.2874, + "step": 7170 + }, + { + "epoch": 0.5350223546944859, + "grad_norm": 2.2005727291107178, + "learning_rate": 0.0002, + "loss": 2.3682, + "step": 7180 + }, + { + "epoch": 0.5357675111773472, + "grad_norm": 2.3024485111236572, + "learning_rate": 0.0002, + "loss": 2.5261, + "step": 7190 + }, + { + "epoch": 0.5365126676602087, + "grad_norm": 2.2706873416900635, + "learning_rate": 0.0002, + "loss": 2.5962, + "step": 7200 + }, + { + "epoch": 0.53725782414307, + "grad_norm": 1.8197662830352783, + "learning_rate": 0.0002, + "loss": 2.5705, + "step": 7210 + }, + { + "epoch": 0.5380029806259314, + "grad_norm": 1.9607528448104858, + "learning_rate": 0.0002, + "loss": 2.3861, + "step": 7220 + }, + { + "epoch": 0.5387481371087929, + "grad_norm": 2.5178678035736084, + "learning_rate": 0.0002, + "loss": 2.5357, + "step": 7230 + }, + { + "epoch": 0.5394932935916542, + "grad_norm": 2.2647557258605957, + "learning_rate": 0.0002, + "loss": 2.3796, + "step": 7240 + }, + { + "epoch": 0.5402384500745157, + "grad_norm": 2.090864419937134, + "learning_rate": 0.0002, + "loss": 2.5204, + "step": 7250 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 2.0715291500091553, + "learning_rate": 0.0002, + "loss": 2.5327, + "step": 7260 + }, + { + "epoch": 0.5417287630402384, + "grad_norm": 1.7275205850601196, + "learning_rate": 0.0002, + "loss": 2.2495, + "step": 7270 + }, + { + "epoch": 0.5424739195230999, + "grad_norm": 2.1511425971984863, + "learning_rate": 0.0002, + "loss": 2.4449, + "step": 7280 + }, + { + "epoch": 0.5432190760059612, + "grad_norm": 2.3975725173950195, + "learning_rate": 0.0002, + "loss": 2.6171, + "step": 7290 + }, + { + "epoch": 0.5439642324888226, + "grad_norm": 2.278902769088745, + "learning_rate": 0.0002, + "loss": 2.4238, + "step": 7300 + }, + { + "epoch": 0.5447093889716841, + "grad_norm": 2.1751863956451416, + "learning_rate": 0.0002, + "loss": 2.3223, + "step": 7310 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 2.1303343772888184, + "learning_rate": 0.0002, + "loss": 2.292, + "step": 7320 + }, + { + "epoch": 0.5461997019374069, + "grad_norm": 2.0914053916931152, + "learning_rate": 0.0002, + "loss": 2.6599, + "step": 7330 + }, + { + "epoch": 0.5469448584202683, + "grad_norm": 2.3275091648101807, + "learning_rate": 0.0002, + "loss": 2.4827, + "step": 7340 + }, + { + "epoch": 0.5476900149031296, + "grad_norm": 2.047351598739624, + "learning_rate": 0.0002, + "loss": 2.4824, + "step": 7350 + }, + { + "epoch": 0.5484351713859911, + "grad_norm": 2.2209582328796387, + "learning_rate": 0.0002, + "loss": 2.5068, + "step": 7360 + }, + { + "epoch": 0.5491803278688525, + "grad_norm": 2.029001235961914, + "learning_rate": 0.0002, + "loss": 2.3278, + "step": 7370 + }, + { + "epoch": 0.5499254843517138, + "grad_norm": 2.0651822090148926, + "learning_rate": 0.0002, + "loss": 2.4738, + "step": 7380 + }, + { + "epoch": 0.5506706408345753, + "grad_norm": 1.8926769495010376, + "learning_rate": 0.0002, + "loss": 2.312, + "step": 7390 + }, + { + "epoch": 0.5514157973174366, + "grad_norm": 2.1285948753356934, + "learning_rate": 0.0002, + "loss": 2.515, + "step": 7400 + }, + { + "epoch": 0.5521609538002981, + "grad_norm": 2.026381254196167, + "learning_rate": 0.0002, + "loss": 2.6837, + "step": 7410 + }, + { + "epoch": 0.5529061102831595, + "grad_norm": 2.052429437637329, + "learning_rate": 0.0002, + "loss": 2.3549, + "step": 7420 + }, + { + "epoch": 0.5536512667660208, + "grad_norm": 2.634350538253784, + "learning_rate": 0.0002, + "loss": 2.4303, + "step": 7430 + }, + { + "epoch": 0.5543964232488823, + "grad_norm": 2.1491518020629883, + "learning_rate": 0.0002, + "loss": 2.5716, + "step": 7440 + }, + { + "epoch": 0.5551415797317437, + "grad_norm": 1.9551408290863037, + "learning_rate": 0.0002, + "loss": 2.4071, + "step": 7450 + }, + { + "epoch": 0.555886736214605, + "grad_norm": 2.2387406826019287, + "learning_rate": 0.0002, + "loss": 2.383, + "step": 7460 + }, + { + "epoch": 0.5566318926974665, + "grad_norm": 2.1040196418762207, + "learning_rate": 0.0002, + "loss": 2.3814, + "step": 7470 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 2.3352060317993164, + "learning_rate": 0.0002, + "loss": 2.5804, + "step": 7480 + }, + { + "epoch": 0.5581222056631893, + "grad_norm": 2.1420586109161377, + "learning_rate": 0.0002, + "loss": 2.3543, + "step": 7490 + }, + { + "epoch": 0.5588673621460507, + "grad_norm": 2.3646399974823, + "learning_rate": 0.0002, + "loss": 2.3857, + "step": 7500 + }, + { + "epoch": 0.559612518628912, + "grad_norm": 1.9728518724441528, + "learning_rate": 0.0002, + "loss": 2.5749, + "step": 7510 + }, + { + "epoch": 0.5603576751117735, + "grad_norm": 2.2200262546539307, + "learning_rate": 0.0002, + "loss": 2.5357, + "step": 7520 + }, + { + "epoch": 0.5611028315946349, + "grad_norm": 2.305957078933716, + "learning_rate": 0.0002, + "loss": 2.4489, + "step": 7530 + }, + { + "epoch": 0.5618479880774963, + "grad_norm": 2.1752254962921143, + "learning_rate": 0.0002, + "loss": 2.2947, + "step": 7540 + }, + { + "epoch": 0.5625931445603577, + "grad_norm": 2.1745660305023193, + "learning_rate": 0.0002, + "loss": 2.5143, + "step": 7550 + }, + { + "epoch": 0.563338301043219, + "grad_norm": 1.8101173639297485, + "learning_rate": 0.0002, + "loss": 2.5449, + "step": 7560 + }, + { + "epoch": 0.5640834575260805, + "grad_norm": 2.5516979694366455, + "learning_rate": 0.0002, + "loss": 2.419, + "step": 7570 + }, + { + "epoch": 0.5648286140089419, + "grad_norm": 2.087670087814331, + "learning_rate": 0.0002, + "loss": 2.5587, + "step": 7580 + }, + { + "epoch": 0.5655737704918032, + "grad_norm": 2.1860992908477783, + "learning_rate": 0.0002, + "loss": 2.6475, + "step": 7590 + }, + { + "epoch": 0.5663189269746647, + "grad_norm": 2.522256851196289, + "learning_rate": 0.0002, + "loss": 2.7054, + "step": 7600 + }, + { + "epoch": 0.5670640834575261, + "grad_norm": 2.6697170734405518, + "learning_rate": 0.0002, + "loss": 2.5817, + "step": 7610 + }, + { + "epoch": 0.5678092399403875, + "grad_norm": 2.129748821258545, + "learning_rate": 0.0002, + "loss": 2.4949, + "step": 7620 + }, + { + "epoch": 0.5685543964232489, + "grad_norm": 1.946333646774292, + "learning_rate": 0.0002, + "loss": 2.4987, + "step": 7630 + }, + { + "epoch": 0.5692995529061102, + "grad_norm": 2.6684484481811523, + "learning_rate": 0.0002, + "loss": 2.612, + "step": 7640 + }, + { + "epoch": 0.5700447093889717, + "grad_norm": 2.1237940788269043, + "learning_rate": 0.0002, + "loss": 2.5082, + "step": 7650 + }, + { + "epoch": 0.5707898658718331, + "grad_norm": 2.68740177154541, + "learning_rate": 0.0002, + "loss": 2.5652, + "step": 7660 + }, + { + "epoch": 0.5715350223546944, + "grad_norm": 2.360792875289917, + "learning_rate": 0.0002, + "loss": 2.4631, + "step": 7670 + }, + { + "epoch": 0.5722801788375559, + "grad_norm": 2.3592369556427, + "learning_rate": 0.0002, + "loss": 2.4923, + "step": 7680 + }, + { + "epoch": 0.5730253353204173, + "grad_norm": 2.328521251678467, + "learning_rate": 0.0002, + "loss": 2.4656, + "step": 7690 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 2.1476731300354004, + "learning_rate": 0.0002, + "loss": 2.3461, + "step": 7700 + }, + { + "epoch": 0.5745156482861401, + "grad_norm": 2.1318559646606445, + "learning_rate": 0.0002, + "loss": 2.4251, + "step": 7710 + }, + { + "epoch": 0.5752608047690015, + "grad_norm": 1.997536301612854, + "learning_rate": 0.0002, + "loss": 2.5326, + "step": 7720 + }, + { + "epoch": 0.5760059612518629, + "grad_norm": 2.2597386837005615, + "learning_rate": 0.0002, + "loss": 2.6773, + "step": 7730 + }, + { + "epoch": 0.5767511177347243, + "grad_norm": 2.8514564037323, + "learning_rate": 0.0002, + "loss": 2.5119, + "step": 7740 + }, + { + "epoch": 0.5774962742175856, + "grad_norm": 2.486799716949463, + "learning_rate": 0.0002, + "loss": 2.6904, + "step": 7750 + }, + { + "epoch": 0.5782414307004471, + "grad_norm": 2.237799882888794, + "learning_rate": 0.0002, + "loss": 2.5005, + "step": 7760 + }, + { + "epoch": 0.5789865871833085, + "grad_norm": 2.4719021320343018, + "learning_rate": 0.0002, + "loss": 2.5874, + "step": 7770 + }, + { + "epoch": 0.5797317436661699, + "grad_norm": 2.2470688819885254, + "learning_rate": 0.0002, + "loss": 2.5151, + "step": 7780 + }, + { + "epoch": 0.5804769001490313, + "grad_norm": 2.4005558490753174, + "learning_rate": 0.0002, + "loss": 2.6334, + "step": 7790 + }, + { + "epoch": 0.5812220566318927, + "grad_norm": 2.0954015254974365, + "learning_rate": 0.0002, + "loss": 2.3567, + "step": 7800 + }, + { + "epoch": 0.5819672131147541, + "grad_norm": 2.228788375854492, + "learning_rate": 0.0002, + "loss": 2.6451, + "step": 7810 + }, + { + "epoch": 0.5827123695976155, + "grad_norm": 1.78871488571167, + "learning_rate": 0.0002, + "loss": 2.6242, + "step": 7820 + }, + { + "epoch": 0.5834575260804769, + "grad_norm": 1.7899997234344482, + "learning_rate": 0.0002, + "loss": 2.4975, + "step": 7830 + }, + { + "epoch": 0.5842026825633383, + "grad_norm": 2.1144442558288574, + "learning_rate": 0.0002, + "loss": 2.6146, + "step": 7840 + }, + { + "epoch": 0.5849478390461997, + "grad_norm": 1.8392325639724731, + "learning_rate": 0.0002, + "loss": 2.4227, + "step": 7850 + }, + { + "epoch": 0.5856929955290611, + "grad_norm": 1.8613855838775635, + "learning_rate": 0.0002, + "loss": 2.5583, + "step": 7860 + }, + { + "epoch": 0.5864381520119225, + "grad_norm": 2.1754300594329834, + "learning_rate": 0.0002, + "loss": 2.6082, + "step": 7870 + }, + { + "epoch": 0.587183308494784, + "grad_norm": 2.4607224464416504, + "learning_rate": 0.0002, + "loss": 2.3939, + "step": 7880 + }, + { + "epoch": 0.5879284649776453, + "grad_norm": 2.129397392272949, + "learning_rate": 0.0002, + "loss": 2.5056, + "step": 7890 + }, + { + "epoch": 0.5886736214605067, + "grad_norm": 2.302616596221924, + "learning_rate": 0.0002, + "loss": 2.4976, + "step": 7900 + }, + { + "epoch": 0.589418777943368, + "grad_norm": 3.0152175426483154, + "learning_rate": 0.0002, + "loss": 2.3935, + "step": 7910 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 2.051461696624756, + "learning_rate": 0.0002, + "loss": 2.4928, + "step": 7920 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 2.1009974479675293, + "learning_rate": 0.0002, + "loss": 2.6526, + "step": 7930 + }, + { + "epoch": 0.5916542473919523, + "grad_norm": 2.290898561477661, + "learning_rate": 0.0002, + "loss": 2.5205, + "step": 7940 + }, + { + "epoch": 0.5923994038748137, + "grad_norm": 2.202995777130127, + "learning_rate": 0.0002, + "loss": 2.5271, + "step": 7950 + }, + { + "epoch": 0.5931445603576752, + "grad_norm": 2.211921453475952, + "learning_rate": 0.0002, + "loss": 2.6762, + "step": 7960 + }, + { + "epoch": 0.5938897168405365, + "grad_norm": 2.294769763946533, + "learning_rate": 0.0002, + "loss": 2.5565, + "step": 7970 + }, + { + "epoch": 0.5946348733233979, + "grad_norm": 2.0982816219329834, + "learning_rate": 0.0002, + "loss": 2.4512, + "step": 7980 + }, + { + "epoch": 0.5953800298062594, + "grad_norm": 2.14776611328125, + "learning_rate": 0.0002, + "loss": 2.5092, + "step": 7990 + }, + { + "epoch": 0.5961251862891207, + "grad_norm": 2.2703404426574707, + "learning_rate": 0.0002, + "loss": 2.3723, + "step": 8000 + }, + { + "epoch": 0.5968703427719821, + "grad_norm": 2.265050172805786, + "learning_rate": 0.0002, + "loss": 2.4642, + "step": 8010 + }, + { + "epoch": 0.5976154992548435, + "grad_norm": 2.3132333755493164, + "learning_rate": 0.0002, + "loss": 2.5886, + "step": 8020 + }, + { + "epoch": 0.5983606557377049, + "grad_norm": 2.4410815238952637, + "learning_rate": 0.0002, + "loss": 2.3898, + "step": 8030 + }, + { + "epoch": 0.5991058122205664, + "grad_norm": 2.282869577407837, + "learning_rate": 0.0002, + "loss": 2.3925, + "step": 8040 + }, + { + "epoch": 0.5998509687034277, + "grad_norm": 2.3430824279785156, + "learning_rate": 0.0002, + "loss": 2.5852, + "step": 8050 + }, + { + "epoch": 0.6005961251862891, + "grad_norm": 2.7821292877197266, + "learning_rate": 0.0002, + "loss": 2.1127, + "step": 8060 + }, + { + "epoch": 0.6013412816691506, + "grad_norm": 2.4642081260681152, + "learning_rate": 0.0002, + "loss": 2.5766, + "step": 8070 + }, + { + "epoch": 0.6020864381520119, + "grad_norm": 2.013272285461426, + "learning_rate": 0.0002, + "loss": 2.4526, + "step": 8080 + }, + { + "epoch": 0.6028315946348733, + "grad_norm": 2.0950276851654053, + "learning_rate": 0.0002, + "loss": 2.672, + "step": 8090 + }, + { + "epoch": 0.6035767511177347, + "grad_norm": 2.2408697605133057, + "learning_rate": 0.0002, + "loss": 2.4886, + "step": 8100 + }, + { + "epoch": 0.6043219076005961, + "grad_norm": 2.4338343143463135, + "learning_rate": 0.0002, + "loss": 2.4129, + "step": 8110 + }, + { + "epoch": 0.6050670640834576, + "grad_norm": 2.3819990158081055, + "learning_rate": 0.0002, + "loss": 2.7111, + "step": 8120 + }, + { + "epoch": 0.6058122205663189, + "grad_norm": 2.3578953742980957, + "learning_rate": 0.0002, + "loss": 2.5222, + "step": 8130 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 2.0468990802764893, + "learning_rate": 0.0002, + "loss": 2.4965, + "step": 8140 + }, + { + "epoch": 0.6073025335320418, + "grad_norm": 2.439807415008545, + "learning_rate": 0.0002, + "loss": 2.6172, + "step": 8150 + }, + { + "epoch": 0.6080476900149031, + "grad_norm": 2.1083173751831055, + "learning_rate": 0.0002, + "loss": 2.5669, + "step": 8160 + }, + { + "epoch": 0.6087928464977646, + "grad_norm": 2.1767308712005615, + "learning_rate": 0.0002, + "loss": 2.5498, + "step": 8170 + }, + { + "epoch": 0.6095380029806259, + "grad_norm": 2.1427078247070312, + "learning_rate": 0.0002, + "loss": 2.4109, + "step": 8180 + }, + { + "epoch": 0.6102831594634873, + "grad_norm": 2.282959222793579, + "learning_rate": 0.0002, + "loss": 2.5091, + "step": 8190 + }, + { + "epoch": 0.6110283159463488, + "grad_norm": 2.2628536224365234, + "learning_rate": 0.0002, + "loss": 2.5386, + "step": 8200 + }, + { + "epoch": 0.6117734724292101, + "grad_norm": 2.706434488296509, + "learning_rate": 0.0002, + "loss": 2.5989, + "step": 8210 + }, + { + "epoch": 0.6125186289120715, + "grad_norm": 2.3741445541381836, + "learning_rate": 0.0002, + "loss": 2.2712, + "step": 8220 + }, + { + "epoch": 0.613263785394933, + "grad_norm": 2.2221875190734863, + "learning_rate": 0.0002, + "loss": 2.5309, + "step": 8230 + }, + { + "epoch": 0.6140089418777943, + "grad_norm": 1.9854212999343872, + "learning_rate": 0.0002, + "loss": 2.2749, + "step": 8240 + }, + { + "epoch": 0.6147540983606558, + "grad_norm": 2.229374885559082, + "learning_rate": 0.0002, + "loss": 2.5129, + "step": 8250 + }, + { + "epoch": 0.6154992548435171, + "grad_norm": 2.4126970767974854, + "learning_rate": 0.0002, + "loss": 2.325, + "step": 8260 + }, + { + "epoch": 0.6162444113263785, + "grad_norm": 2.2903852462768555, + "learning_rate": 0.0002, + "loss": 2.5048, + "step": 8270 + }, + { + "epoch": 0.61698956780924, + "grad_norm": 2.691183090209961, + "learning_rate": 0.0002, + "loss": 2.4766, + "step": 8280 + }, + { + "epoch": 0.6177347242921013, + "grad_norm": 2.386356830596924, + "learning_rate": 0.0002, + "loss": 2.5979, + "step": 8290 + }, + { + "epoch": 0.6184798807749627, + "grad_norm": 2.695887804031372, + "learning_rate": 0.0002, + "loss": 2.642, + "step": 8300 + }, + { + "epoch": 0.6192250372578242, + "grad_norm": 2.5322632789611816, + "learning_rate": 0.0002, + "loss": 2.5457, + "step": 8310 + }, + { + "epoch": 0.6199701937406855, + "grad_norm": 2.29015851020813, + "learning_rate": 0.0002, + "loss": 2.5388, + "step": 8320 + }, + { + "epoch": 0.620715350223547, + "grad_norm": 2.326113700866699, + "learning_rate": 0.0002, + "loss": 2.6757, + "step": 8330 + }, + { + "epoch": 0.6214605067064084, + "grad_norm": 2.18438458442688, + "learning_rate": 0.0002, + "loss": 2.3377, + "step": 8340 + }, + { + "epoch": 0.6222056631892697, + "grad_norm": 2.195730686187744, + "learning_rate": 0.0002, + "loss": 2.334, + "step": 8350 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 2.1896743774414062, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 8360 + }, + { + "epoch": 0.6236959761549925, + "grad_norm": 2.47771954536438, + "learning_rate": 0.0002, + "loss": 2.625, + "step": 8370 + }, + { + "epoch": 0.624441132637854, + "grad_norm": 2.5502564907073975, + "learning_rate": 0.0002, + "loss": 2.5164, + "step": 8380 + }, + { + "epoch": 0.6251862891207154, + "grad_norm": 1.9382567405700684, + "learning_rate": 0.0002, + "loss": 2.5153, + "step": 8390 + }, + { + "epoch": 0.6259314456035767, + "grad_norm": 2.078873872756958, + "learning_rate": 0.0002, + "loss": 2.4817, + "step": 8400 + }, + { + "epoch": 0.6266766020864382, + "grad_norm": 2.1636760234832764, + "learning_rate": 0.0002, + "loss": 2.7134, + "step": 8410 + }, + { + "epoch": 0.6274217585692996, + "grad_norm": 2.2006876468658447, + "learning_rate": 0.0002, + "loss": 2.6016, + "step": 8420 + }, + { + "epoch": 0.6281669150521609, + "grad_norm": 2.364816665649414, + "learning_rate": 0.0002, + "loss": 2.5201, + "step": 8430 + }, + { + "epoch": 0.6289120715350224, + "grad_norm": 2.450207471847534, + "learning_rate": 0.0002, + "loss": 2.6406, + "step": 8440 + }, + { + "epoch": 0.6296572280178837, + "grad_norm": 2.3795676231384277, + "learning_rate": 0.0002, + "loss": 2.2957, + "step": 8450 + }, + { + "epoch": 0.6304023845007451, + "grad_norm": 2.1926169395446777, + "learning_rate": 0.0002, + "loss": 2.2288, + "step": 8460 + }, + { + "epoch": 0.6311475409836066, + "grad_norm": 1.9396635293960571, + "learning_rate": 0.0002, + "loss": 2.3839, + "step": 8470 + }, + { + "epoch": 0.6318926974664679, + "grad_norm": 2.6635711193084717, + "learning_rate": 0.0002, + "loss": 2.4389, + "step": 8480 + }, + { + "epoch": 0.6326378539493294, + "grad_norm": 2.657240390777588, + "learning_rate": 0.0002, + "loss": 2.3886, + "step": 8490 + }, + { + "epoch": 0.6333830104321908, + "grad_norm": 2.050353765487671, + "learning_rate": 0.0002, + "loss": 2.3714, + "step": 8500 + }, + { + "epoch": 0.6341281669150521, + "grad_norm": 2.3058016300201416, + "learning_rate": 0.0002, + "loss": 2.4301, + "step": 8510 + }, + { + "epoch": 0.6348733233979136, + "grad_norm": 2.3272721767425537, + "learning_rate": 0.0002, + "loss": 2.442, + "step": 8520 + }, + { + "epoch": 0.6356184798807749, + "grad_norm": 2.105719566345215, + "learning_rate": 0.0002, + "loss": 2.3532, + "step": 8530 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 2.2481689453125, + "learning_rate": 0.0002, + "loss": 2.7017, + "step": 8540 + }, + { + "epoch": 0.6371087928464978, + "grad_norm": 2.0684092044830322, + "learning_rate": 0.0002, + "loss": 2.5492, + "step": 8550 + }, + { + "epoch": 0.6378539493293591, + "grad_norm": 2.2087674140930176, + "learning_rate": 0.0002, + "loss": 2.4946, + "step": 8560 + }, + { + "epoch": 0.6385991058122206, + "grad_norm": 2.0686557292938232, + "learning_rate": 0.0002, + "loss": 2.603, + "step": 8570 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 2.223733901977539, + "learning_rate": 0.0002, + "loss": 2.5515, + "step": 8580 + }, + { + "epoch": 0.6400894187779433, + "grad_norm": 2.0543527603149414, + "learning_rate": 0.0002, + "loss": 2.2859, + "step": 8590 + }, + { + "epoch": 0.6408345752608048, + "grad_norm": 2.119685411453247, + "learning_rate": 0.0002, + "loss": 2.4915, + "step": 8600 + }, + { + "epoch": 0.6415797317436661, + "grad_norm": 2.1664891242980957, + "learning_rate": 0.0002, + "loss": 2.4224, + "step": 8610 + }, + { + "epoch": 0.6423248882265276, + "grad_norm": 2.2479021549224854, + "learning_rate": 0.0002, + "loss": 2.5316, + "step": 8620 + }, + { + "epoch": 0.643070044709389, + "grad_norm": 2.2841110229492188, + "learning_rate": 0.0002, + "loss": 2.6606, + "step": 8630 + }, + { + "epoch": 0.6438152011922503, + "grad_norm": 2.4399871826171875, + "learning_rate": 0.0002, + "loss": 2.5727, + "step": 8640 + }, + { + "epoch": 0.6445603576751118, + "grad_norm": 1.9307136535644531, + "learning_rate": 0.0002, + "loss": 2.3302, + "step": 8650 + }, + { + "epoch": 0.6453055141579732, + "grad_norm": 2.2575156688690186, + "learning_rate": 0.0002, + "loss": 2.5079, + "step": 8660 + }, + { + "epoch": 0.6460506706408345, + "grad_norm": 2.704486131668091, + "learning_rate": 0.0002, + "loss": 2.64, + "step": 8670 + }, + { + "epoch": 0.646795827123696, + "grad_norm": 2.2060296535491943, + "learning_rate": 0.0002, + "loss": 2.5639, + "step": 8680 + }, + { + "epoch": 0.6475409836065574, + "grad_norm": 2.2497682571411133, + "learning_rate": 0.0002, + "loss": 2.5753, + "step": 8690 + }, + { + "epoch": 0.6482861400894188, + "grad_norm": 2.1391713619232178, + "learning_rate": 0.0002, + "loss": 2.5179, + "step": 8700 + }, + { + "epoch": 0.6490312965722802, + "grad_norm": 2.4493465423583984, + "learning_rate": 0.0002, + "loss": 2.5404, + "step": 8710 + }, + { + "epoch": 0.6497764530551415, + "grad_norm": 2.2963478565216064, + "learning_rate": 0.0002, + "loss": 2.395, + "step": 8720 + }, + { + "epoch": 0.650521609538003, + "grad_norm": 2.3371636867523193, + "learning_rate": 0.0002, + "loss": 2.649, + "step": 8730 + }, + { + "epoch": 0.6512667660208644, + "grad_norm": 2.1336076259613037, + "learning_rate": 0.0002, + "loss": 2.5782, + "step": 8740 + }, + { + "epoch": 0.6520119225037257, + "grad_norm": 1.9927014112472534, + "learning_rate": 0.0002, + "loss": 2.4791, + "step": 8750 + }, + { + "epoch": 0.6527570789865872, + "grad_norm": 2.5760622024536133, + "learning_rate": 0.0002, + "loss": 2.5103, + "step": 8760 + }, + { + "epoch": 0.6535022354694486, + "grad_norm": 2.3019092082977295, + "learning_rate": 0.0002, + "loss": 2.5403, + "step": 8770 + }, + { + "epoch": 0.65424739195231, + "grad_norm": 2.1122303009033203, + "learning_rate": 0.0002, + "loss": 2.4911, + "step": 8780 + }, + { + "epoch": 0.6549925484351714, + "grad_norm": 2.4388267993927, + "learning_rate": 0.0002, + "loss": 2.5928, + "step": 8790 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 2.3956820964813232, + "learning_rate": 0.0002, + "loss": 2.4162, + "step": 8800 + }, + { + "epoch": 0.6564828614008942, + "grad_norm": 2.251885175704956, + "learning_rate": 0.0002, + "loss": 2.3572, + "step": 8810 + }, + { + "epoch": 0.6572280178837556, + "grad_norm": 2.1196508407592773, + "learning_rate": 0.0002, + "loss": 2.271, + "step": 8820 + }, + { + "epoch": 0.657973174366617, + "grad_norm": 2.6327478885650635, + "learning_rate": 0.0002, + "loss": 2.5528, + "step": 8830 + }, + { + "epoch": 0.6587183308494784, + "grad_norm": 3.1525380611419678, + "learning_rate": 0.0002, + "loss": 2.671, + "step": 8840 + }, + { + "epoch": 0.6594634873323398, + "grad_norm": 2.371023178100586, + "learning_rate": 0.0002, + "loss": 2.3042, + "step": 8850 + }, + { + "epoch": 0.6602086438152012, + "grad_norm": 2.1151058673858643, + "learning_rate": 0.0002, + "loss": 2.3586, + "step": 8860 + }, + { + "epoch": 0.6609538002980626, + "grad_norm": 2.2981162071228027, + "learning_rate": 0.0002, + "loss": 2.3033, + "step": 8870 + }, + { + "epoch": 0.6616989567809239, + "grad_norm": 2.3385653495788574, + "learning_rate": 0.0002, + "loss": 2.4125, + "step": 8880 + }, + { + "epoch": 0.6624441132637854, + "grad_norm": 2.282998561859131, + "learning_rate": 0.0002, + "loss": 2.5846, + "step": 8890 + }, + { + "epoch": 0.6631892697466468, + "grad_norm": 2.276402473449707, + "learning_rate": 0.0002, + "loss": 2.5881, + "step": 8900 + }, + { + "epoch": 0.6639344262295082, + "grad_norm": 2.753835439682007, + "learning_rate": 0.0002, + "loss": 2.5402, + "step": 8910 + }, + { + "epoch": 0.6646795827123696, + "grad_norm": 2.155869960784912, + "learning_rate": 0.0002, + "loss": 2.3382, + "step": 8920 + }, + { + "epoch": 0.665424739195231, + "grad_norm": 2.25738263130188, + "learning_rate": 0.0002, + "loss": 2.5028, + "step": 8930 + }, + { + "epoch": 0.6661698956780924, + "grad_norm": 2.3716089725494385, + "learning_rate": 0.0002, + "loss": 2.4135, + "step": 8940 + }, + { + "epoch": 0.6669150521609538, + "grad_norm": 2.5012192726135254, + "learning_rate": 0.0002, + "loss": 2.4297, + "step": 8950 + }, + { + "epoch": 0.6676602086438153, + "grad_norm": 2.177103281021118, + "learning_rate": 0.0002, + "loss": 2.401, + "step": 8960 + }, + { + "epoch": 0.6684053651266766, + "grad_norm": 2.500803232192993, + "learning_rate": 0.0002, + "loss": 2.6323, + "step": 8970 + }, + { + "epoch": 0.669150521609538, + "grad_norm": 2.7237913608551025, + "learning_rate": 0.0002, + "loss": 2.4543, + "step": 8980 + }, + { + "epoch": 0.6698956780923994, + "grad_norm": 2.0506207942962646, + "learning_rate": 0.0002, + "loss": 2.3305, + "step": 8990 + }, + { + "epoch": 0.6706408345752608, + "grad_norm": 2.182495355606079, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 9000 + }, + { + "epoch": 0.6713859910581222, + "grad_norm": 2.3970160484313965, + "learning_rate": 0.0002, + "loss": 2.6011, + "step": 9010 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 2.2599663734436035, + "learning_rate": 0.0002, + "loss": 2.2633, + "step": 9020 + }, + { + "epoch": 0.672876304023845, + "grad_norm": 2.202136754989624, + "learning_rate": 0.0002, + "loss": 2.5181, + "step": 9030 + }, + { + "epoch": 0.6736214605067065, + "grad_norm": 2.4681708812713623, + "learning_rate": 0.0002, + "loss": 2.4997, + "step": 9040 + }, + { + "epoch": 0.6743666169895678, + "grad_norm": 2.455841302871704, + "learning_rate": 0.0002, + "loss": 2.67, + "step": 9050 + }, + { + "epoch": 0.6751117734724292, + "grad_norm": 2.679401397705078, + "learning_rate": 0.0002, + "loss": 2.4532, + "step": 9060 + }, + { + "epoch": 0.6758569299552906, + "grad_norm": 2.003723621368408, + "learning_rate": 0.0002, + "loss": 2.6612, + "step": 9070 + }, + { + "epoch": 0.676602086438152, + "grad_norm": 2.256204128265381, + "learning_rate": 0.0002, + "loss": 2.6165, + "step": 9080 + }, + { + "epoch": 0.6773472429210134, + "grad_norm": 2.3091988563537598, + "learning_rate": 0.0002, + "loss": 2.5914, + "step": 9090 + }, + { + "epoch": 0.6780923994038748, + "grad_norm": 2.3021037578582764, + "learning_rate": 0.0002, + "loss": 2.5028, + "step": 9100 + }, + { + "epoch": 0.6788375558867362, + "grad_norm": 2.1524159908294678, + "learning_rate": 0.0002, + "loss": 2.496, + "step": 9110 + }, + { + "epoch": 0.6795827123695977, + "grad_norm": 2.2679061889648438, + "learning_rate": 0.0002, + "loss": 2.5092, + "step": 9120 + }, + { + "epoch": 0.680327868852459, + "grad_norm": 2.5078539848327637, + "learning_rate": 0.0002, + "loss": 2.2919, + "step": 9130 + }, + { + "epoch": 0.6810730253353204, + "grad_norm": 2.5029258728027344, + "learning_rate": 0.0002, + "loss": 2.4382, + "step": 9140 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 2.5736641883850098, + "learning_rate": 0.0002, + "loss": 2.4778, + "step": 9150 + }, + { + "epoch": 0.6825633383010432, + "grad_norm": 2.327671527862549, + "learning_rate": 0.0002, + "loss": 2.5559, + "step": 9160 + }, + { + "epoch": 0.6833084947839047, + "grad_norm": 2.310634136199951, + "learning_rate": 0.0002, + "loss": 2.4736, + "step": 9170 + }, + { + "epoch": 0.684053651266766, + "grad_norm": 2.2063729763031006, + "learning_rate": 0.0002, + "loss": 2.5829, + "step": 9180 + }, + { + "epoch": 0.6847988077496274, + "grad_norm": 2.5868451595306396, + "learning_rate": 0.0002, + "loss": 2.4465, + "step": 9190 + }, + { + "epoch": 0.6855439642324889, + "grad_norm": 1.984372854232788, + "learning_rate": 0.0002, + "loss": 2.4654, + "step": 9200 + }, + { + "epoch": 0.6862891207153502, + "grad_norm": 2.2977192401885986, + "learning_rate": 0.0002, + "loss": 2.5643, + "step": 9210 + }, + { + "epoch": 0.6870342771982116, + "grad_norm": 2.3139028549194336, + "learning_rate": 0.0002, + "loss": 2.7359, + "step": 9220 + }, + { + "epoch": 0.687779433681073, + "grad_norm": 2.124882698059082, + "learning_rate": 0.0002, + "loss": 2.4335, + "step": 9230 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 2.434063673019409, + "learning_rate": 0.0002, + "loss": 2.3576, + "step": 9240 + }, + { + "epoch": 0.6892697466467959, + "grad_norm": 2.290684700012207, + "learning_rate": 0.0002, + "loss": 2.7062, + "step": 9250 + }, + { + "epoch": 0.6900149031296572, + "grad_norm": 2.5476014614105225, + "learning_rate": 0.0002, + "loss": 2.6915, + "step": 9260 + }, + { + "epoch": 0.6907600596125186, + "grad_norm": 2.1429226398468018, + "learning_rate": 0.0002, + "loss": 2.5736, + "step": 9270 + }, + { + "epoch": 0.6915052160953801, + "grad_norm": 2.399526834487915, + "learning_rate": 0.0002, + "loss": 2.3798, + "step": 9280 + }, + { + "epoch": 0.6922503725782414, + "grad_norm": 2.5570576190948486, + "learning_rate": 0.0002, + "loss": 2.5188, + "step": 9290 + }, + { + "epoch": 0.6929955290611028, + "grad_norm": 2.248030185699463, + "learning_rate": 0.0002, + "loss": 2.4761, + "step": 9300 + }, + { + "epoch": 0.6937406855439643, + "grad_norm": 2.649503231048584, + "learning_rate": 0.0002, + "loss": 2.6177, + "step": 9310 + }, + { + "epoch": 0.6944858420268256, + "grad_norm": 2.1536803245544434, + "learning_rate": 0.0002, + "loss": 2.5347, + "step": 9320 + }, + { + "epoch": 0.6952309985096871, + "grad_norm": 2.3103137016296387, + "learning_rate": 0.0002, + "loss": 2.4589, + "step": 9330 + }, + { + "epoch": 0.6959761549925484, + "grad_norm": 2.5560615062713623, + "learning_rate": 0.0002, + "loss": 2.6268, + "step": 9340 + }, + { + "epoch": 0.6967213114754098, + "grad_norm": 2.149562120437622, + "learning_rate": 0.0002, + "loss": 2.7589, + "step": 9350 + }, + { + "epoch": 0.6974664679582713, + "grad_norm": 2.180457592010498, + "learning_rate": 0.0002, + "loss": 2.3707, + "step": 9360 + }, + { + "epoch": 0.6982116244411326, + "grad_norm": 2.1361920833587646, + "learning_rate": 0.0002, + "loss": 2.5647, + "step": 9370 + }, + { + "epoch": 0.698956780923994, + "grad_norm": 2.5958340167999268, + "learning_rate": 0.0002, + "loss": 2.6107, + "step": 9380 + }, + { + "epoch": 0.6997019374068555, + "grad_norm": 2.298337936401367, + "learning_rate": 0.0002, + "loss": 2.5033, + "step": 9390 + }, + { + "epoch": 0.7004470938897168, + "grad_norm": 2.5497617721557617, + "learning_rate": 0.0002, + "loss": 2.6554, + "step": 9400 + }, + { + "epoch": 0.7011922503725783, + "grad_norm": 2.4965898990631104, + "learning_rate": 0.0002, + "loss": 2.2896, + "step": 9410 + }, + { + "epoch": 0.7019374068554396, + "grad_norm": 1.6895242929458618, + "learning_rate": 0.0002, + "loss": 2.5128, + "step": 9420 + }, + { + "epoch": 0.702682563338301, + "grad_norm": 2.5977084636688232, + "learning_rate": 0.0002, + "loss": 2.6261, + "step": 9430 + }, + { + "epoch": 0.7034277198211625, + "grad_norm": 2.2413127422332764, + "learning_rate": 0.0002, + "loss": 2.5724, + "step": 9440 + }, + { + "epoch": 0.7041728763040238, + "grad_norm": 1.9375770092010498, + "learning_rate": 0.0002, + "loss": 2.3573, + "step": 9450 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 2.1468889713287354, + "learning_rate": 0.0002, + "loss": 2.4854, + "step": 9460 + }, + { + "epoch": 0.7056631892697467, + "grad_norm": 2.3307502269744873, + "learning_rate": 0.0002, + "loss": 2.6102, + "step": 9470 + }, + { + "epoch": 0.706408345752608, + "grad_norm": 2.025935411453247, + "learning_rate": 0.0002, + "loss": 2.5878, + "step": 9480 + }, + { + "epoch": 0.7071535022354695, + "grad_norm": 2.0282442569732666, + "learning_rate": 0.0002, + "loss": 2.634, + "step": 9490 + }, + { + "epoch": 0.7078986587183308, + "grad_norm": 2.3716142177581787, + "learning_rate": 0.0002, + "loss": 2.523, + "step": 9500 + }, + { + "epoch": 0.7086438152011922, + "grad_norm": 2.1333203315734863, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 9510 + }, + { + "epoch": 0.7093889716840537, + "grad_norm": 2.215022563934326, + "learning_rate": 0.0002, + "loss": 2.4586, + "step": 9520 + }, + { + "epoch": 0.710134128166915, + "grad_norm": 2.392059087753296, + "learning_rate": 0.0002, + "loss": 2.7049, + "step": 9530 + }, + { + "epoch": 0.7108792846497765, + "grad_norm": 2.0697712898254395, + "learning_rate": 0.0002, + "loss": 2.4652, + "step": 9540 + }, + { + "epoch": 0.7116244411326379, + "grad_norm": 2.451186418533325, + "learning_rate": 0.0002, + "loss": 2.6232, + "step": 9550 + }, + { + "epoch": 0.7123695976154992, + "grad_norm": 2.7246387004852295, + "learning_rate": 0.0002, + "loss": 2.6331, + "step": 9560 + }, + { + "epoch": 0.7131147540983607, + "grad_norm": 2.2628626823425293, + "learning_rate": 0.0002, + "loss": 2.4974, + "step": 9570 + }, + { + "epoch": 0.713859910581222, + "grad_norm": 2.2943952083587646, + "learning_rate": 0.0002, + "loss": 2.5514, + "step": 9580 + }, + { + "epoch": 0.7146050670640834, + "grad_norm": 2.394134044647217, + "learning_rate": 0.0002, + "loss": 2.4207, + "step": 9590 + }, + { + "epoch": 0.7153502235469449, + "grad_norm": 2.2121686935424805, + "learning_rate": 0.0002, + "loss": 2.4458, + "step": 9600 + }, + { + "epoch": 0.7160953800298062, + "grad_norm": 2.43963885307312, + "learning_rate": 0.0002, + "loss": 2.5707, + "step": 9610 + }, + { + "epoch": 0.7168405365126677, + "grad_norm": 2.44991135597229, + "learning_rate": 0.0002, + "loss": 2.3977, + "step": 9620 + }, + { + "epoch": 0.7175856929955291, + "grad_norm": 2.600816488265991, + "learning_rate": 0.0002, + "loss": 2.6366, + "step": 9630 + }, + { + "epoch": 0.7183308494783904, + "grad_norm": 2.5606367588043213, + "learning_rate": 0.0002, + "loss": 2.3104, + "step": 9640 + }, + { + "epoch": 0.7190760059612519, + "grad_norm": 2.0649945735931396, + "learning_rate": 0.0002, + "loss": 2.4726, + "step": 9650 + }, + { + "epoch": 0.7198211624441133, + "grad_norm": 2.463927984237671, + "learning_rate": 0.0002, + "loss": 2.437, + "step": 9660 + }, + { + "epoch": 0.7205663189269746, + "grad_norm": 2.189600706100464, + "learning_rate": 0.0002, + "loss": 2.6419, + "step": 9670 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 1.8361003398895264, + "learning_rate": 0.0002, + "loss": 2.3392, + "step": 9680 + }, + { + "epoch": 0.7220566318926974, + "grad_norm": 2.4830501079559326, + "learning_rate": 0.0002, + "loss": 2.6257, + "step": 9690 + }, + { + "epoch": 0.7228017883755589, + "grad_norm": 2.311711072921753, + "learning_rate": 0.0002, + "loss": 2.4023, + "step": 9700 + }, + { + "epoch": 0.7235469448584203, + "grad_norm": 1.9280378818511963, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 9710 + }, + { + "epoch": 0.7242921013412816, + "grad_norm": 2.1411705017089844, + "learning_rate": 0.0002, + "loss": 2.539, + "step": 9720 + }, + { + "epoch": 0.7250372578241431, + "grad_norm": 2.262427568435669, + "learning_rate": 0.0002, + "loss": 2.3129, + "step": 9730 + }, + { + "epoch": 0.7257824143070045, + "grad_norm": 2.005398988723755, + "learning_rate": 0.0002, + "loss": 2.4774, + "step": 9740 + }, + { + "epoch": 0.7265275707898659, + "grad_norm": 2.4369115829467773, + "learning_rate": 0.0002, + "loss": 2.5077, + "step": 9750 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 2.5426080226898193, + "learning_rate": 0.0002, + "loss": 2.4542, + "step": 9760 + }, + { + "epoch": 0.7280178837555886, + "grad_norm": 2.222259044647217, + "learning_rate": 0.0002, + "loss": 2.5733, + "step": 9770 + }, + { + "epoch": 0.7287630402384501, + "grad_norm": 3.0009191036224365, + "learning_rate": 0.0002, + "loss": 2.5769, + "step": 9780 + }, + { + "epoch": 0.7295081967213115, + "grad_norm": 2.354903221130371, + "learning_rate": 0.0002, + "loss": 2.4887, + "step": 9790 + }, + { + "epoch": 0.7302533532041728, + "grad_norm": 2.4170987606048584, + "learning_rate": 0.0002, + "loss": 2.3048, + "step": 9800 + }, + { + "epoch": 0.7309985096870343, + "grad_norm": 2.6301980018615723, + "learning_rate": 0.0002, + "loss": 2.2994, + "step": 9810 + }, + { + "epoch": 0.7317436661698957, + "grad_norm": 1.6262503862380981, + "learning_rate": 0.0002, + "loss": 2.0802, + "step": 9820 + }, + { + "epoch": 0.732488822652757, + "grad_norm": 2.165588855743408, + "learning_rate": 0.0002, + "loss": 2.5009, + "step": 9830 + }, + { + "epoch": 0.7332339791356185, + "grad_norm": 2.3280584812164307, + "learning_rate": 0.0002, + "loss": 2.5506, + "step": 9840 + }, + { + "epoch": 0.7339791356184798, + "grad_norm": 2.505038261413574, + "learning_rate": 0.0002, + "loss": 2.4491, + "step": 9850 + }, + { + "epoch": 0.7347242921013413, + "grad_norm": 2.5628268718719482, + "learning_rate": 0.0002, + "loss": 2.5678, + "step": 9860 + }, + { + "epoch": 0.7354694485842027, + "grad_norm": 2.371814489364624, + "learning_rate": 0.0002, + "loss": 2.4223, + "step": 9870 + }, + { + "epoch": 0.736214605067064, + "grad_norm": 1.9160370826721191, + "learning_rate": 0.0002, + "loss": 2.5315, + "step": 9880 + }, + { + "epoch": 0.7369597615499255, + "grad_norm": 2.015497922897339, + "learning_rate": 0.0002, + "loss": 2.6611, + "step": 9890 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 2.402764081954956, + "learning_rate": 0.0002, + "loss": 2.5224, + "step": 9900 + }, + { + "epoch": 0.7384500745156483, + "grad_norm": 2.2813656330108643, + "learning_rate": 0.0002, + "loss": 2.6005, + "step": 9910 + }, + { + "epoch": 0.7391952309985097, + "grad_norm": 2.1747665405273438, + "learning_rate": 0.0002, + "loss": 2.6124, + "step": 9920 + }, + { + "epoch": 0.7399403874813711, + "grad_norm": 2.8765082359313965, + "learning_rate": 0.0002, + "loss": 2.4271, + "step": 9930 + }, + { + "epoch": 0.7406855439642325, + "grad_norm": 2.9332666397094727, + "learning_rate": 0.0002, + "loss": 2.5666, + "step": 9940 + }, + { + "epoch": 0.7414307004470939, + "grad_norm": 2.209160566329956, + "learning_rate": 0.0002, + "loss": 2.5214, + "step": 9950 + }, + { + "epoch": 0.7421758569299552, + "grad_norm": 2.141798734664917, + "learning_rate": 0.0002, + "loss": 2.5629, + "step": 9960 + }, + { + "epoch": 0.7429210134128167, + "grad_norm": 2.4280612468719482, + "learning_rate": 0.0002, + "loss": 2.6251, + "step": 9970 + }, + { + "epoch": 0.7436661698956781, + "grad_norm": 2.588738441467285, + "learning_rate": 0.0002, + "loss": 2.5636, + "step": 9980 + }, + { + "epoch": 0.7444113263785395, + "grad_norm": 2.423440456390381, + "learning_rate": 0.0002, + "loss": 2.4463, + "step": 9990 + }, + { + "epoch": 0.7451564828614009, + "grad_norm": 2.3735451698303223, + "learning_rate": 0.0002, + "loss": 2.2597, + "step": 10000 + }, + { + "epoch": 0.7459016393442623, + "grad_norm": 2.585657835006714, + "learning_rate": 0.0002, + "loss": 2.5384, + "step": 10010 + }, + { + "epoch": 0.7466467958271237, + "grad_norm": 2.6739962100982666, + "learning_rate": 0.0002, + "loss": 2.413, + "step": 10020 + }, + { + "epoch": 0.7473919523099851, + "grad_norm": 2.564932346343994, + "learning_rate": 0.0002, + "loss": 2.4084, + "step": 10030 + }, + { + "epoch": 0.7481371087928465, + "grad_norm": 2.0360469818115234, + "learning_rate": 0.0002, + "loss": 2.5756, + "step": 10040 + }, + { + "epoch": 0.7488822652757079, + "grad_norm": 2.26521897315979, + "learning_rate": 0.0002, + "loss": 2.3981, + "step": 10050 + }, + { + "epoch": 0.7496274217585693, + "grad_norm": 2.3003921508789062, + "learning_rate": 0.0002, + "loss": 2.599, + "step": 10060 + }, + { + "epoch": 0.7503725782414307, + "grad_norm": 2.3010787963867188, + "learning_rate": 0.0002, + "loss": 2.6884, + "step": 10070 + }, + { + "epoch": 0.7511177347242921, + "grad_norm": 2.4945406913757324, + "learning_rate": 0.0002, + "loss": 2.6397, + "step": 10080 + }, + { + "epoch": 0.7518628912071535, + "grad_norm": 2.391580104827881, + "learning_rate": 0.0002, + "loss": 2.4918, + "step": 10090 + }, + { + "epoch": 0.7526080476900149, + "grad_norm": 2.094149589538574, + "learning_rate": 0.0002, + "loss": 2.5465, + "step": 10100 + }, + { + "epoch": 0.7533532041728763, + "grad_norm": 2.440086603164673, + "learning_rate": 0.0002, + "loss": 2.665, + "step": 10110 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 2.482935905456543, + "learning_rate": 0.0002, + "loss": 2.5492, + "step": 10120 + }, + { + "epoch": 0.7548435171385991, + "grad_norm": 2.4048640727996826, + "learning_rate": 0.0002, + "loss": 2.2824, + "step": 10130 + }, + { + "epoch": 0.7555886736214605, + "grad_norm": 2.5780625343322754, + "learning_rate": 0.0002, + "loss": 2.5935, + "step": 10140 + }, + { + "epoch": 0.7563338301043219, + "grad_norm": 1.9736360311508179, + "learning_rate": 0.0002, + "loss": 2.6113, + "step": 10150 + }, + { + "epoch": 0.7570789865871833, + "grad_norm": 2.432325839996338, + "learning_rate": 0.0002, + "loss": 2.5351, + "step": 10160 + }, + { + "epoch": 0.7578241430700448, + "grad_norm": 1.9205713272094727, + "learning_rate": 0.0002, + "loss": 2.4855, + "step": 10170 + }, + { + "epoch": 0.7585692995529061, + "grad_norm": 2.043088436126709, + "learning_rate": 0.0002, + "loss": 2.5828, + "step": 10180 + }, + { + "epoch": 0.7593144560357675, + "grad_norm": 3.1941723823547363, + "learning_rate": 0.0002, + "loss": 2.6751, + "step": 10190 + }, + { + "epoch": 0.7600596125186289, + "grad_norm": 2.210202693939209, + "learning_rate": 0.0002, + "loss": 2.4979, + "step": 10200 + }, + { + "epoch": 0.7608047690014903, + "grad_norm": 2.0645289421081543, + "learning_rate": 0.0002, + "loss": 2.5, + "step": 10210 + }, + { + "epoch": 0.7615499254843517, + "grad_norm": 2.4983456134796143, + "learning_rate": 0.0002, + "loss": 2.501, + "step": 10220 + }, + { + "epoch": 0.7622950819672131, + "grad_norm": 2.4388515949249268, + "learning_rate": 0.0002, + "loss": 2.5407, + "step": 10230 + }, + { + "epoch": 0.7630402384500745, + "grad_norm": 1.960217833518982, + "learning_rate": 0.0002, + "loss": 2.3579, + "step": 10240 + }, + { + "epoch": 0.763785394932936, + "grad_norm": 2.206451177597046, + "learning_rate": 0.0002, + "loss": 2.5991, + "step": 10250 + }, + { + "epoch": 0.7645305514157973, + "grad_norm": 3.0600955486297607, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 10260 + }, + { + "epoch": 0.7652757078986587, + "grad_norm": 2.1760904788970947, + "learning_rate": 0.0002, + "loss": 2.5936, + "step": 10270 + }, + { + "epoch": 0.7660208643815202, + "grad_norm": 2.1374666690826416, + "learning_rate": 0.0002, + "loss": 2.3698, + "step": 10280 + }, + { + "epoch": 0.7667660208643815, + "grad_norm": 1.8994402885437012, + "learning_rate": 0.0002, + "loss": 2.5065, + "step": 10290 + }, + { + "epoch": 0.767511177347243, + "grad_norm": 1.7293874025344849, + "learning_rate": 0.0002, + "loss": 2.5013, + "step": 10300 + }, + { + "epoch": 0.7682563338301043, + "grad_norm": 2.463646650314331, + "learning_rate": 0.0002, + "loss": 2.606, + "step": 10310 + }, + { + "epoch": 0.7690014903129657, + "grad_norm": 2.521929979324341, + "learning_rate": 0.0002, + "loss": 2.3497, + "step": 10320 + }, + { + "epoch": 0.7697466467958272, + "grad_norm": 2.0710620880126953, + "learning_rate": 0.0002, + "loss": 2.3254, + "step": 10330 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 2.2939655780792236, + "learning_rate": 0.0002, + "loss": 2.5636, + "step": 10340 + }, + { + "epoch": 0.7712369597615499, + "grad_norm": 2.3778505325317383, + "learning_rate": 0.0002, + "loss": 2.426, + "step": 10350 + }, + { + "epoch": 0.7719821162444114, + "grad_norm": 2.2639496326446533, + "learning_rate": 0.0002, + "loss": 2.5813, + "step": 10360 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 1.964016318321228, + "learning_rate": 0.0002, + "loss": 2.554, + "step": 10370 + }, + { + "epoch": 0.7734724292101341, + "grad_norm": 2.1532862186431885, + "learning_rate": 0.0002, + "loss": 2.4438, + "step": 10380 + }, + { + "epoch": 0.7742175856929955, + "grad_norm": 2.354395627975464, + "learning_rate": 0.0002, + "loss": 2.6894, + "step": 10390 + }, + { + "epoch": 0.7749627421758569, + "grad_norm": 2.0978426933288574, + "learning_rate": 0.0002, + "loss": 2.5408, + "step": 10400 + }, + { + "epoch": 0.7757078986587184, + "grad_norm": 2.2250595092773438, + "learning_rate": 0.0002, + "loss": 2.4457, + "step": 10410 + }, + { + "epoch": 0.7764530551415797, + "grad_norm": 2.65061354637146, + "learning_rate": 0.0002, + "loss": 2.3338, + "step": 10420 + }, + { + "epoch": 0.7771982116244411, + "grad_norm": 2.519925117492676, + "learning_rate": 0.0002, + "loss": 2.6637, + "step": 10430 + }, + { + "epoch": 0.7779433681073026, + "grad_norm": 1.8276631832122803, + "learning_rate": 0.0002, + "loss": 2.4743, + "step": 10440 + }, + { + "epoch": 0.7786885245901639, + "grad_norm": 2.745326042175293, + "learning_rate": 0.0002, + "loss": 2.4987, + "step": 10450 + }, + { + "epoch": 0.7794336810730254, + "grad_norm": 2.3664751052856445, + "learning_rate": 0.0002, + "loss": 2.6172, + "step": 10460 + }, + { + "epoch": 0.7801788375558867, + "grad_norm": 2.136486768722534, + "learning_rate": 0.0002, + "loss": 2.581, + "step": 10470 + }, + { + "epoch": 0.7809239940387481, + "grad_norm": 2.13596773147583, + "learning_rate": 0.0002, + "loss": 2.4819, + "step": 10480 + }, + { + "epoch": 0.7816691505216096, + "grad_norm": 2.0259013175964355, + "learning_rate": 0.0002, + "loss": 2.4299, + "step": 10490 + }, + { + "epoch": 0.7824143070044709, + "grad_norm": 2.3508143424987793, + "learning_rate": 0.0002, + "loss": 2.6114, + "step": 10500 + }, + { + "epoch": 0.7831594634873323, + "grad_norm": 2.127795457839966, + "learning_rate": 0.0002, + "loss": 2.5393, + "step": 10510 + }, + { + "epoch": 0.7839046199701938, + "grad_norm": 2.1637115478515625, + "learning_rate": 0.0002, + "loss": 2.6723, + "step": 10520 + }, + { + "epoch": 0.7846497764530551, + "grad_norm": 2.2965123653411865, + "learning_rate": 0.0002, + "loss": 2.5549, + "step": 10530 + }, + { + "epoch": 0.7853949329359166, + "grad_norm": 2.3020741939544678, + "learning_rate": 0.0002, + "loss": 2.5971, + "step": 10540 + }, + { + "epoch": 0.786140089418778, + "grad_norm": 2.3106722831726074, + "learning_rate": 0.0002, + "loss": 2.5852, + "step": 10550 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 2.332108736038208, + "learning_rate": 0.0002, + "loss": 2.3608, + "step": 10560 + }, + { + "epoch": 0.7876304023845008, + "grad_norm": 2.337618589401245, + "learning_rate": 0.0002, + "loss": 2.3785, + "step": 10570 + }, + { + "epoch": 0.7883755588673621, + "grad_norm": 2.413151502609253, + "learning_rate": 0.0002, + "loss": 2.6424, + "step": 10580 + }, + { + "epoch": 0.7891207153502235, + "grad_norm": 1.9372957944869995, + "learning_rate": 0.0002, + "loss": 2.5991, + "step": 10590 + }, + { + "epoch": 0.789865871833085, + "grad_norm": 2.435525417327881, + "learning_rate": 0.0002, + "loss": 2.5649, + "step": 10600 + }, + { + "epoch": 0.7906110283159463, + "grad_norm": 2.385913372039795, + "learning_rate": 0.0002, + "loss": 2.386, + "step": 10610 + }, + { + "epoch": 0.7913561847988078, + "grad_norm": 2.0109829902648926, + "learning_rate": 0.0002, + "loss": 2.4575, + "step": 10620 + }, + { + "epoch": 0.7921013412816692, + "grad_norm": 2.28608775138855, + "learning_rate": 0.0002, + "loss": 2.5546, + "step": 10630 + }, + { + "epoch": 0.7928464977645305, + "grad_norm": 2.1904425621032715, + "learning_rate": 0.0002, + "loss": 2.4409, + "step": 10640 + }, + { + "epoch": 0.793591654247392, + "grad_norm": 2.351679801940918, + "learning_rate": 0.0002, + "loss": 2.6908, + "step": 10650 + }, + { + "epoch": 0.7943368107302533, + "grad_norm": 2.2788383960723877, + "learning_rate": 0.0002, + "loss": 2.4305, + "step": 10660 + }, + { + "epoch": 0.7950819672131147, + "grad_norm": 2.0748603343963623, + "learning_rate": 0.0002, + "loss": 2.2788, + "step": 10670 + }, + { + "epoch": 0.7958271236959762, + "grad_norm": 2.1148083209991455, + "learning_rate": 0.0002, + "loss": 2.5214, + "step": 10680 + }, + { + "epoch": 0.7965722801788375, + "grad_norm": 2.439086675643921, + "learning_rate": 0.0002, + "loss": 2.572, + "step": 10690 + }, + { + "epoch": 0.797317436661699, + "grad_norm": 2.302316188812256, + "learning_rate": 0.0002, + "loss": 2.6505, + "step": 10700 + }, + { + "epoch": 0.7980625931445604, + "grad_norm": 1.977591633796692, + "learning_rate": 0.0002, + "loss": 2.6918, + "step": 10710 + }, + { + "epoch": 0.7988077496274217, + "grad_norm": 2.8533225059509277, + "learning_rate": 0.0002, + "loss": 2.6504, + "step": 10720 + }, + { + "epoch": 0.7995529061102832, + "grad_norm": 2.3402719497680664, + "learning_rate": 0.0002, + "loss": 2.5394, + "step": 10730 + }, + { + "epoch": 0.8002980625931445, + "grad_norm": 1.8375385999679565, + "learning_rate": 0.0002, + "loss": 2.4388, + "step": 10740 + }, + { + "epoch": 0.801043219076006, + "grad_norm": 2.375514030456543, + "learning_rate": 0.0002, + "loss": 2.2368, + "step": 10750 + }, + { + "epoch": 0.8017883755588674, + "grad_norm": 2.218656063079834, + "learning_rate": 0.0002, + "loss": 2.6589, + "step": 10760 + }, + { + "epoch": 0.8025335320417287, + "grad_norm": 2.560631513595581, + "learning_rate": 0.0002, + "loss": 2.3786, + "step": 10770 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 2.4114229679107666, + "learning_rate": 0.0002, + "loss": 2.5688, + "step": 10780 + }, + { + "epoch": 0.8040238450074516, + "grad_norm": 2.948805570602417, + "learning_rate": 0.0002, + "loss": 2.5296, + "step": 10790 + }, + { + "epoch": 0.8047690014903129, + "grad_norm": 2.384042501449585, + "learning_rate": 0.0002, + "loss": 2.6218, + "step": 10800 + }, + { + "epoch": 0.8055141579731744, + "grad_norm": 2.3185818195343018, + "learning_rate": 0.0002, + "loss": 2.5289, + "step": 10810 + }, + { + "epoch": 0.8062593144560357, + "grad_norm": 2.2557380199432373, + "learning_rate": 0.0002, + "loss": 2.4439, + "step": 10820 + }, + { + "epoch": 0.8070044709388972, + "grad_norm": 2.0974535942077637, + "learning_rate": 0.0002, + "loss": 2.4567, + "step": 10830 + }, + { + "epoch": 0.8077496274217586, + "grad_norm": 2.480273962020874, + "learning_rate": 0.0002, + "loss": 2.4788, + "step": 10840 + }, + { + "epoch": 0.8084947839046199, + "grad_norm": 2.4017157554626465, + "learning_rate": 0.0002, + "loss": 2.4639, + "step": 10850 + }, + { + "epoch": 0.8092399403874814, + "grad_norm": 2.3682639598846436, + "learning_rate": 0.0002, + "loss": 2.5533, + "step": 10860 + }, + { + "epoch": 0.8099850968703428, + "grad_norm": 2.693796157836914, + "learning_rate": 0.0002, + "loss": 2.4756, + "step": 10870 + }, + { + "epoch": 0.8107302533532041, + "grad_norm": 2.371288776397705, + "learning_rate": 0.0002, + "loss": 2.4941, + "step": 10880 + }, + { + "epoch": 0.8114754098360656, + "grad_norm": 2.4420065879821777, + "learning_rate": 0.0002, + "loss": 2.5901, + "step": 10890 + }, + { + "epoch": 0.812220566318927, + "grad_norm": 2.154177665710449, + "learning_rate": 0.0002, + "loss": 2.4709, + "step": 10900 + }, + { + "epoch": 0.8129657228017884, + "grad_norm": 2.3035285472869873, + "learning_rate": 0.0002, + "loss": 2.2832, + "step": 10910 + }, + { + "epoch": 0.8137108792846498, + "grad_norm": 2.2672736644744873, + "learning_rate": 0.0002, + "loss": 2.8001, + "step": 10920 + }, + { + "epoch": 0.8144560357675111, + "grad_norm": 2.507875442504883, + "learning_rate": 0.0002, + "loss": 2.5625, + "step": 10930 + }, + { + "epoch": 0.8152011922503726, + "grad_norm": 2.542093276977539, + "learning_rate": 0.0002, + "loss": 2.5738, + "step": 10940 + }, + { + "epoch": 0.815946348733234, + "grad_norm": 2.764739990234375, + "learning_rate": 0.0002, + "loss": 2.6649, + "step": 10950 + }, + { + "epoch": 0.8166915052160953, + "grad_norm": 2.2417197227478027, + "learning_rate": 0.0002, + "loss": 2.5701, + "step": 10960 + }, + { + "epoch": 0.8174366616989568, + "grad_norm": 2.3203530311584473, + "learning_rate": 0.0002, + "loss": 2.57, + "step": 10970 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 2.8699557781219482, + "learning_rate": 0.0002, + "loss": 2.5023, + "step": 10980 + }, + { + "epoch": 0.8189269746646796, + "grad_norm": 2.4314422607421875, + "learning_rate": 0.0002, + "loss": 2.5016, + "step": 10990 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 2.330038547515869, + "learning_rate": 0.0002, + "loss": 2.3436, + "step": 11000 + }, + { + "epoch": 0.8204172876304023, + "grad_norm": 2.177577257156372, + "learning_rate": 0.0002, + "loss": 2.5652, + "step": 11010 + }, + { + "epoch": 0.8211624441132638, + "grad_norm": 1.8706187009811401, + "learning_rate": 0.0002, + "loss": 2.7162, + "step": 11020 + }, + { + "epoch": 0.8219076005961252, + "grad_norm": 2.2145588397979736, + "learning_rate": 0.0002, + "loss": 2.5284, + "step": 11030 + }, + { + "epoch": 0.8226527570789866, + "grad_norm": 2.344332456588745, + "learning_rate": 0.0002, + "loss": 2.6446, + "step": 11040 + }, + { + "epoch": 0.823397913561848, + "grad_norm": 2.391944646835327, + "learning_rate": 0.0002, + "loss": 2.5797, + "step": 11050 + }, + { + "epoch": 0.8241430700447094, + "grad_norm": 2.2976560592651367, + "learning_rate": 0.0002, + "loss": 2.3885, + "step": 11060 + }, + { + "epoch": 0.8248882265275708, + "grad_norm": 2.0775372982025146, + "learning_rate": 0.0002, + "loss": 2.5791, + "step": 11070 + }, + { + "epoch": 0.8256333830104322, + "grad_norm": 2.264828681945801, + "learning_rate": 0.0002, + "loss": 2.6107, + "step": 11080 + }, + { + "epoch": 0.8263785394932935, + "grad_norm": 2.1409361362457275, + "learning_rate": 0.0002, + "loss": 2.5045, + "step": 11090 + }, + { + "epoch": 0.827123695976155, + "grad_norm": 2.565410614013672, + "learning_rate": 0.0002, + "loss": 2.5521, + "step": 11100 + }, + { + "epoch": 0.8278688524590164, + "grad_norm": 2.375131607055664, + "learning_rate": 0.0002, + "loss": 2.3359, + "step": 11110 + }, + { + "epoch": 0.8286140089418778, + "grad_norm": 2.2293524742126465, + "learning_rate": 0.0002, + "loss": 2.3675, + "step": 11120 + }, + { + "epoch": 0.8293591654247392, + "grad_norm": 2.4063050746917725, + "learning_rate": 0.0002, + "loss": 2.4713, + "step": 11130 + }, + { + "epoch": 0.8301043219076006, + "grad_norm": 2.408867597579956, + "learning_rate": 0.0002, + "loss": 2.5096, + "step": 11140 + }, + { + "epoch": 0.830849478390462, + "grad_norm": 2.5441880226135254, + "learning_rate": 0.0002, + "loss": 2.5106, + "step": 11150 + }, + { + "epoch": 0.8315946348733234, + "grad_norm": 2.539547920227051, + "learning_rate": 0.0002, + "loss": 2.4346, + "step": 11160 + }, + { + "epoch": 0.8323397913561847, + "grad_norm": 2.414623260498047, + "learning_rate": 0.0002, + "loss": 2.5962, + "step": 11170 + }, + { + "epoch": 0.8330849478390462, + "grad_norm": 2.219356060028076, + "learning_rate": 0.0002, + "loss": 2.5249, + "step": 11180 + }, + { + "epoch": 0.8338301043219076, + "grad_norm": 2.43898606300354, + "learning_rate": 0.0002, + "loss": 2.6075, + "step": 11190 + }, + { + "epoch": 0.834575260804769, + "grad_norm": 2.644263744354248, + "learning_rate": 0.0002, + "loss": 2.5456, + "step": 11200 + }, + { + "epoch": 0.8353204172876304, + "grad_norm": 2.2671680450439453, + "learning_rate": 0.0002, + "loss": 2.4596, + "step": 11210 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 2.3897671699523926, + "learning_rate": 0.0002, + "loss": 2.5104, + "step": 11220 + }, + { + "epoch": 0.8368107302533532, + "grad_norm": 2.2913897037506104, + "learning_rate": 0.0002, + "loss": 2.6487, + "step": 11230 + }, + { + "epoch": 0.8375558867362146, + "grad_norm": 2.287472724914551, + "learning_rate": 0.0002, + "loss": 2.6213, + "step": 11240 + }, + { + "epoch": 0.8383010432190761, + "grad_norm": 2.184960126876831, + "learning_rate": 0.0002, + "loss": 2.559, + "step": 11250 + }, + { + "epoch": 0.8390461997019374, + "grad_norm": 2.3212270736694336, + "learning_rate": 0.0002, + "loss": 2.4804, + "step": 11260 + }, + { + "epoch": 0.8397913561847988, + "grad_norm": 2.361088991165161, + "learning_rate": 0.0002, + "loss": 2.7022, + "step": 11270 + }, + { + "epoch": 0.8405365126676602, + "grad_norm": 2.6766278743743896, + "learning_rate": 0.0002, + "loss": 2.5002, + "step": 11280 + }, + { + "epoch": 0.8412816691505216, + "grad_norm": 2.4925765991210938, + "learning_rate": 0.0002, + "loss": 2.5971, + "step": 11290 + }, + { + "epoch": 0.842026825633383, + "grad_norm": 2.4175381660461426, + "learning_rate": 0.0002, + "loss": 2.5388, + "step": 11300 + }, + { + "epoch": 0.8427719821162444, + "grad_norm": 2.3096563816070557, + "learning_rate": 0.0002, + "loss": 2.6924, + "step": 11310 + }, + { + "epoch": 0.8435171385991058, + "grad_norm": 2.5850837230682373, + "learning_rate": 0.0002, + "loss": 2.5236, + "step": 11320 + }, + { + "epoch": 0.8442622950819673, + "grad_norm": 2.491636276245117, + "learning_rate": 0.0002, + "loss": 2.3833, + "step": 11330 + }, + { + "epoch": 0.8450074515648286, + "grad_norm": 2.627934455871582, + "learning_rate": 0.0002, + "loss": 2.4309, + "step": 11340 + }, + { + "epoch": 0.84575260804769, + "grad_norm": 2.562291383743286, + "learning_rate": 0.0002, + "loss": 2.6193, + "step": 11350 + }, + { + "epoch": 0.8464977645305514, + "grad_norm": 2.1559548377990723, + "learning_rate": 0.0002, + "loss": 2.3262, + "step": 11360 + }, + { + "epoch": 0.8472429210134128, + "grad_norm": 2.06011962890625, + "learning_rate": 0.0002, + "loss": 2.4812, + "step": 11370 + }, + { + "epoch": 0.8479880774962743, + "grad_norm": 1.6976170539855957, + "learning_rate": 0.0002, + "loss": 2.482, + "step": 11380 + }, + { + "epoch": 0.8487332339791356, + "grad_norm": 2.2001445293426514, + "learning_rate": 0.0002, + "loss": 2.6044, + "step": 11390 + }, + { + "epoch": 0.849478390461997, + "grad_norm": 2.1249303817749023, + "learning_rate": 0.0002, + "loss": 2.395, + "step": 11400 + }, + { + "epoch": 0.8502235469448585, + "grad_norm": 2.477747678756714, + "learning_rate": 0.0002, + "loss": 2.3953, + "step": 11410 + }, + { + "epoch": 0.8509687034277198, + "grad_norm": 2.436199426651001, + "learning_rate": 0.0002, + "loss": 2.535, + "step": 11420 + }, + { + "epoch": 0.8517138599105812, + "grad_norm": 2.53760027885437, + "learning_rate": 0.0002, + "loss": 2.4221, + "step": 11430 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 2.1996359825134277, + "learning_rate": 0.0002, + "loss": 2.3221, + "step": 11440 + }, + { + "epoch": 0.853204172876304, + "grad_norm": 2.1827547550201416, + "learning_rate": 0.0002, + "loss": 2.4165, + "step": 11450 + }, + { + "epoch": 0.8539493293591655, + "grad_norm": 2.6674787998199463, + "learning_rate": 0.0002, + "loss": 2.524, + "step": 11460 + }, + { + "epoch": 0.8546944858420268, + "grad_norm": 2.6001546382904053, + "learning_rate": 0.0002, + "loss": 2.6605, + "step": 11470 + }, + { + "epoch": 0.8554396423248882, + "grad_norm": 2.287266254425049, + "learning_rate": 0.0002, + "loss": 2.6006, + "step": 11480 + }, + { + "epoch": 0.8561847988077497, + "grad_norm": 2.252857208251953, + "learning_rate": 0.0002, + "loss": 2.544, + "step": 11490 + }, + { + "epoch": 0.856929955290611, + "grad_norm": 2.273512601852417, + "learning_rate": 0.0002, + "loss": 2.5486, + "step": 11500 + }, + { + "epoch": 0.8576751117734724, + "grad_norm": 2.7023732662200928, + "learning_rate": 0.0002, + "loss": 2.592, + "step": 11510 + }, + { + "epoch": 0.8584202682563339, + "grad_norm": 2.303313970565796, + "learning_rate": 0.0002, + "loss": 2.5199, + "step": 11520 + }, + { + "epoch": 0.8591654247391952, + "grad_norm": 2.4638290405273438, + "learning_rate": 0.0002, + "loss": 2.5568, + "step": 11530 + }, + { + "epoch": 0.8599105812220567, + "grad_norm": 2.451723337173462, + "learning_rate": 0.0002, + "loss": 2.6141, + "step": 11540 + }, + { + "epoch": 0.860655737704918, + "grad_norm": 2.3285202980041504, + "learning_rate": 0.0002, + "loss": 2.5687, + "step": 11550 + }, + { + "epoch": 0.8614008941877794, + "grad_norm": 2.085536241531372, + "learning_rate": 0.0002, + "loss": 2.6193, + "step": 11560 + }, + { + "epoch": 0.8621460506706409, + "grad_norm": 2.5319440364837646, + "learning_rate": 0.0002, + "loss": 2.5323, + "step": 11570 + }, + { + "epoch": 0.8628912071535022, + "grad_norm": 2.246924638748169, + "learning_rate": 0.0002, + "loss": 2.4782, + "step": 11580 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 2.1741697788238525, + "learning_rate": 0.0002, + "loss": 2.4644, + "step": 11590 + }, + { + "epoch": 0.8643815201192251, + "grad_norm": 2.434746265411377, + "learning_rate": 0.0002, + "loss": 2.3757, + "step": 11600 + }, + { + "epoch": 0.8651266766020864, + "grad_norm": 2.406317710876465, + "learning_rate": 0.0002, + "loss": 2.5596, + "step": 11610 + }, + { + "epoch": 0.8658718330849479, + "grad_norm": 2.0323896408081055, + "learning_rate": 0.0002, + "loss": 2.491, + "step": 11620 + }, + { + "epoch": 0.8666169895678092, + "grad_norm": 1.8637727499008179, + "learning_rate": 0.0002, + "loss": 2.5684, + "step": 11630 + }, + { + "epoch": 0.8673621460506706, + "grad_norm": 2.324962854385376, + "learning_rate": 0.0002, + "loss": 2.5031, + "step": 11640 + }, + { + "epoch": 0.8681073025335321, + "grad_norm": 2.261016607284546, + "learning_rate": 0.0002, + "loss": 2.4632, + "step": 11650 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 2.4027981758117676, + "learning_rate": 0.0002, + "loss": 2.5733, + "step": 11660 + }, + { + "epoch": 0.8695976154992549, + "grad_norm": 2.2393276691436768, + "learning_rate": 0.0002, + "loss": 2.4513, + "step": 11670 + }, + { + "epoch": 0.8703427719821163, + "grad_norm": 2.2299957275390625, + "learning_rate": 0.0002, + "loss": 2.4286, + "step": 11680 + }, + { + "epoch": 0.8710879284649776, + "grad_norm": 3.5715925693511963, + "learning_rate": 0.0002, + "loss": 2.6196, + "step": 11690 + }, + { + "epoch": 0.8718330849478391, + "grad_norm": 2.3358867168426514, + "learning_rate": 0.0002, + "loss": 2.6385, + "step": 11700 + }, + { + "epoch": 0.8725782414307004, + "grad_norm": 2.237659454345703, + "learning_rate": 0.0002, + "loss": 2.5753, + "step": 11710 + }, + { + "epoch": 0.8733233979135618, + "grad_norm": 2.2841837406158447, + "learning_rate": 0.0002, + "loss": 2.6415, + "step": 11720 + }, + { + "epoch": 0.8740685543964233, + "grad_norm": 2.559757709503174, + "learning_rate": 0.0002, + "loss": 2.5872, + "step": 11730 + }, + { + "epoch": 0.8748137108792846, + "grad_norm": 2.5326175689697266, + "learning_rate": 0.0002, + "loss": 2.4485, + "step": 11740 + }, + { + "epoch": 0.875558867362146, + "grad_norm": 2.234788656234741, + "learning_rate": 0.0002, + "loss": 2.2943, + "step": 11750 + }, + { + "epoch": 0.8763040238450075, + "grad_norm": 2.4031131267547607, + "learning_rate": 0.0002, + "loss": 2.5176, + "step": 11760 + }, + { + "epoch": 0.8770491803278688, + "grad_norm": 2.832005023956299, + "learning_rate": 0.0002, + "loss": 2.5063, + "step": 11770 + }, + { + "epoch": 0.8777943368107303, + "grad_norm": 2.0008394718170166, + "learning_rate": 0.0002, + "loss": 2.5267, + "step": 11780 + }, + { + "epoch": 0.8785394932935916, + "grad_norm": 2.0909340381622314, + "learning_rate": 0.0002, + "loss": 2.5573, + "step": 11790 + }, + { + "epoch": 0.879284649776453, + "grad_norm": 2.437964677810669, + "learning_rate": 0.0002, + "loss": 2.6744, + "step": 11800 + }, + { + "epoch": 0.8800298062593145, + "grad_norm": 2.1106114387512207, + "learning_rate": 0.0002, + "loss": 2.5049, + "step": 11810 + }, + { + "epoch": 0.8807749627421758, + "grad_norm": 2.672287940979004, + "learning_rate": 0.0002, + "loss": 2.7222, + "step": 11820 + }, + { + "epoch": 0.8815201192250373, + "grad_norm": 2.584033727645874, + "learning_rate": 0.0002, + "loss": 2.5348, + "step": 11830 + }, + { + "epoch": 0.8822652757078987, + "grad_norm": 2.440488576889038, + "learning_rate": 0.0002, + "loss": 2.4024, + "step": 11840 + }, + { + "epoch": 0.88301043219076, + "grad_norm": 2.302765130996704, + "learning_rate": 0.0002, + "loss": 2.441, + "step": 11850 + }, + { + "epoch": 0.8837555886736215, + "grad_norm": 3.5020196437835693, + "learning_rate": 0.0002, + "loss": 2.5192, + "step": 11860 + }, + { + "epoch": 0.8845007451564829, + "grad_norm": 1.8765350580215454, + "learning_rate": 0.0002, + "loss": 2.5156, + "step": 11870 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 2.4863150119781494, + "learning_rate": 0.0002, + "loss": 2.5239, + "step": 11880 + }, + { + "epoch": 0.8859910581222057, + "grad_norm": 2.492176055908203, + "learning_rate": 0.0002, + "loss": 2.5697, + "step": 11890 + }, + { + "epoch": 0.886736214605067, + "grad_norm": 3.2195818424224854, + "learning_rate": 0.0002, + "loss": 2.6402, + "step": 11900 + }, + { + "epoch": 0.8874813710879285, + "grad_norm": 2.2776970863342285, + "learning_rate": 0.0002, + "loss": 2.4884, + "step": 11910 + }, + { + "epoch": 0.8882265275707899, + "grad_norm": 2.553642749786377, + "learning_rate": 0.0002, + "loss": 2.6425, + "step": 11920 + }, + { + "epoch": 0.8889716840536512, + "grad_norm": 2.269604206085205, + "learning_rate": 0.0002, + "loss": 2.5291, + "step": 11930 + }, + { + "epoch": 0.8897168405365127, + "grad_norm": 2.3157992362976074, + "learning_rate": 0.0002, + "loss": 2.7461, + "step": 11940 + }, + { + "epoch": 0.8904619970193741, + "grad_norm": 2.3829448223114014, + "learning_rate": 0.0002, + "loss": 2.6959, + "step": 11950 + }, + { + "epoch": 0.8912071535022354, + "grad_norm": 2.368562698364258, + "learning_rate": 0.0002, + "loss": 2.3614, + "step": 11960 + }, + { + "epoch": 0.8919523099850969, + "grad_norm": 2.5107600688934326, + "learning_rate": 0.0002, + "loss": 2.5187, + "step": 11970 + }, + { + "epoch": 0.8926974664679582, + "grad_norm": 2.376817226409912, + "learning_rate": 0.0002, + "loss": 2.5952, + "step": 11980 + }, + { + "epoch": 0.8934426229508197, + "grad_norm": 2.224245071411133, + "learning_rate": 0.0002, + "loss": 2.4105, + "step": 11990 + }, + { + "epoch": 0.8941877794336811, + "grad_norm": 2.118123769760132, + "learning_rate": 0.0002, + "loss": 2.442, + "step": 12000 + }, + { + "epoch": 0.8949329359165424, + "grad_norm": 2.6127233505249023, + "learning_rate": 0.0002, + "loss": 2.3396, + "step": 12010 + }, + { + "epoch": 0.8956780923994039, + "grad_norm": 2.319699287414551, + "learning_rate": 0.0002, + "loss": 2.5345, + "step": 12020 + }, + { + "epoch": 0.8964232488822653, + "grad_norm": 2.394341468811035, + "learning_rate": 0.0002, + "loss": 2.6386, + "step": 12030 + }, + { + "epoch": 0.8971684053651267, + "grad_norm": 1.9906131029129028, + "learning_rate": 0.0002, + "loss": 2.3783, + "step": 12040 + }, + { + "epoch": 0.8979135618479881, + "grad_norm": 2.038069009780884, + "learning_rate": 0.0002, + "loss": 2.4852, + "step": 12050 + }, + { + "epoch": 0.8986587183308494, + "grad_norm": 2.18753981590271, + "learning_rate": 0.0002, + "loss": 2.3601, + "step": 12060 + }, + { + "epoch": 0.8994038748137109, + "grad_norm": 2.302685499191284, + "learning_rate": 0.0002, + "loss": 2.4762, + "step": 12070 + }, + { + "epoch": 0.9001490312965723, + "grad_norm": 2.177448272705078, + "learning_rate": 0.0002, + "loss": 2.3813, + "step": 12080 + }, + { + "epoch": 0.9008941877794336, + "grad_norm": 2.6771061420440674, + "learning_rate": 0.0002, + "loss": 2.5623, + "step": 12090 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 2.364640712738037, + "learning_rate": 0.0002, + "loss": 2.4862, + "step": 12100 + }, + { + "epoch": 0.9023845007451565, + "grad_norm": 2.2396554946899414, + "learning_rate": 0.0002, + "loss": 2.3349, + "step": 12110 + }, + { + "epoch": 0.9031296572280179, + "grad_norm": 2.5424320697784424, + "learning_rate": 0.0002, + "loss": 2.5552, + "step": 12120 + }, + { + "epoch": 0.9038748137108793, + "grad_norm": 2.2599573135375977, + "learning_rate": 0.0002, + "loss": 2.4783, + "step": 12130 + }, + { + "epoch": 0.9046199701937406, + "grad_norm": 2.3719987869262695, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 12140 + }, + { + "epoch": 0.9053651266766021, + "grad_norm": 2.3063857555389404, + "learning_rate": 0.0002, + "loss": 2.375, + "step": 12150 + }, + { + "epoch": 0.9061102831594635, + "grad_norm": 2.3336477279663086, + "learning_rate": 0.0002, + "loss": 2.5508, + "step": 12160 + }, + { + "epoch": 0.9068554396423248, + "grad_norm": 2.325166940689087, + "learning_rate": 0.0002, + "loss": 2.5011, + "step": 12170 + }, + { + "epoch": 0.9076005961251863, + "grad_norm": 2.2718868255615234, + "learning_rate": 0.0002, + "loss": 2.6404, + "step": 12180 + }, + { + "epoch": 0.9083457526080477, + "grad_norm": 2.607816696166992, + "learning_rate": 0.0002, + "loss": 2.5262, + "step": 12190 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 2.1004855632781982, + "learning_rate": 0.0002, + "loss": 2.5023, + "step": 12200 + }, + { + "epoch": 0.9098360655737705, + "grad_norm": 2.336148977279663, + "learning_rate": 0.0002, + "loss": 2.6659, + "step": 12210 + }, + { + "epoch": 0.910581222056632, + "grad_norm": 2.531151294708252, + "learning_rate": 0.0002, + "loss": 2.7188, + "step": 12220 + }, + { + "epoch": 0.9113263785394933, + "grad_norm": 2.559283494949341, + "learning_rate": 0.0002, + "loss": 2.6394, + "step": 12230 + }, + { + "epoch": 0.9120715350223547, + "grad_norm": 2.272649049758911, + "learning_rate": 0.0002, + "loss": 2.6711, + "step": 12240 + }, + { + "epoch": 0.912816691505216, + "grad_norm": 2.356740713119507, + "learning_rate": 0.0002, + "loss": 2.6009, + "step": 12250 + }, + { + "epoch": 0.9135618479880775, + "grad_norm": 2.4409749507904053, + "learning_rate": 0.0002, + "loss": 2.6274, + "step": 12260 + }, + { + "epoch": 0.9143070044709389, + "grad_norm": 2.7427172660827637, + "learning_rate": 0.0002, + "loss": 2.7499, + "step": 12270 + }, + { + "epoch": 0.9150521609538003, + "grad_norm": 2.309023380279541, + "learning_rate": 0.0002, + "loss": 2.3997, + "step": 12280 + }, + { + "epoch": 0.9157973174366617, + "grad_norm": 2.1871066093444824, + "learning_rate": 0.0002, + "loss": 2.6141, + "step": 12290 + }, + { + "epoch": 0.9165424739195231, + "grad_norm": 2.556929588317871, + "learning_rate": 0.0002, + "loss": 2.4421, + "step": 12300 + }, + { + "epoch": 0.9172876304023845, + "grad_norm": 2.5296781063079834, + "learning_rate": 0.0002, + "loss": 2.5385, + "step": 12310 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 2.4364492893218994, + "learning_rate": 0.0002, + "loss": 2.5862, + "step": 12320 + }, + { + "epoch": 0.9187779433681073, + "grad_norm": 2.1894443035125732, + "learning_rate": 0.0002, + "loss": 2.5224, + "step": 12330 + }, + { + "epoch": 0.9195230998509687, + "grad_norm": 2.3544352054595947, + "learning_rate": 0.0002, + "loss": 2.4424, + "step": 12340 + }, + { + "epoch": 0.9202682563338301, + "grad_norm": 2.2780649662017822, + "learning_rate": 0.0002, + "loss": 2.4522, + "step": 12350 + }, + { + "epoch": 0.9210134128166915, + "grad_norm": 2.3933236598968506, + "learning_rate": 0.0002, + "loss": 2.4272, + "step": 12360 + }, + { + "epoch": 0.9217585692995529, + "grad_norm": 2.260446071624756, + "learning_rate": 0.0002, + "loss": 2.4688, + "step": 12370 + }, + { + "epoch": 0.9225037257824144, + "grad_norm": 2.4620795249938965, + "learning_rate": 0.0002, + "loss": 2.1951, + "step": 12380 + }, + { + "epoch": 0.9232488822652757, + "grad_norm": 2.4303131103515625, + "learning_rate": 0.0002, + "loss": 2.2946, + "step": 12390 + }, + { + "epoch": 0.9239940387481371, + "grad_norm": 2.8419077396392822, + "learning_rate": 0.0002, + "loss": 2.4692, + "step": 12400 + }, + { + "epoch": 0.9247391952309985, + "grad_norm": 2.9645135402679443, + "learning_rate": 0.0002, + "loss": 2.6461, + "step": 12410 + }, + { + "epoch": 0.9254843517138599, + "grad_norm": 2.391739845275879, + "learning_rate": 0.0002, + "loss": 2.5335, + "step": 12420 + }, + { + "epoch": 0.9262295081967213, + "grad_norm": 2.686976194381714, + "learning_rate": 0.0002, + "loss": 2.6333, + "step": 12430 + }, + { + "epoch": 0.9269746646795827, + "grad_norm": 2.723388195037842, + "learning_rate": 0.0002, + "loss": 2.354, + "step": 12440 + }, + { + "epoch": 0.9277198211624441, + "grad_norm": 2.4658613204956055, + "learning_rate": 0.0002, + "loss": 2.5475, + "step": 12450 + }, + { + "epoch": 0.9284649776453056, + "grad_norm": 2.6882481575012207, + "learning_rate": 0.0002, + "loss": 2.3591, + "step": 12460 + }, + { + "epoch": 0.9292101341281669, + "grad_norm": 2.646805763244629, + "learning_rate": 0.0002, + "loss": 2.4765, + "step": 12470 + }, + { + "epoch": 0.9299552906110283, + "grad_norm": 2.4550538063049316, + "learning_rate": 0.0002, + "loss": 2.6768, + "step": 12480 + }, + { + "epoch": 0.9307004470938898, + "grad_norm": 2.296886920928955, + "learning_rate": 0.0002, + "loss": 2.6753, + "step": 12490 + }, + { + "epoch": 0.9314456035767511, + "grad_norm": 2.1057779788970947, + "learning_rate": 0.0002, + "loss": 2.5129, + "step": 12500 + }, + { + "epoch": 0.9321907600596125, + "grad_norm": 2.1545591354370117, + "learning_rate": 0.0002, + "loss": 2.4823, + "step": 12510 + }, + { + "epoch": 0.9329359165424739, + "grad_norm": 2.354118824005127, + "learning_rate": 0.0002, + "loss": 2.6566, + "step": 12520 + }, + { + "epoch": 0.9336810730253353, + "grad_norm": 2.294067859649658, + "learning_rate": 0.0002, + "loss": 2.5172, + "step": 12530 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 2.2037270069122314, + "learning_rate": 0.0002, + "loss": 2.5489, + "step": 12540 + }, + { + "epoch": 0.9351713859910581, + "grad_norm": 2.276435136795044, + "learning_rate": 0.0002, + "loss": 2.3769, + "step": 12550 + }, + { + "epoch": 0.9359165424739195, + "grad_norm": 2.4285993576049805, + "learning_rate": 0.0002, + "loss": 2.7328, + "step": 12560 + }, + { + "epoch": 0.936661698956781, + "grad_norm": 2.661801338195801, + "learning_rate": 0.0002, + "loss": 2.5124, + "step": 12570 + }, + { + "epoch": 0.9374068554396423, + "grad_norm": 1.9602450132369995, + "learning_rate": 0.0002, + "loss": 2.4483, + "step": 12580 + }, + { + "epoch": 0.9381520119225037, + "grad_norm": 2.1745917797088623, + "learning_rate": 0.0002, + "loss": 2.4838, + "step": 12590 + }, + { + "epoch": 0.9388971684053651, + "grad_norm": 2.373079776763916, + "learning_rate": 0.0002, + "loss": 2.6714, + "step": 12600 + }, + { + "epoch": 0.9396423248882265, + "grad_norm": 2.567545175552368, + "learning_rate": 0.0002, + "loss": 2.6517, + "step": 12610 + }, + { + "epoch": 0.940387481371088, + "grad_norm": 2.426687240600586, + "learning_rate": 0.0002, + "loss": 2.698, + "step": 12620 + }, + { + "epoch": 0.9411326378539493, + "grad_norm": 2.2519803047180176, + "learning_rate": 0.0002, + "loss": 2.3313, + "step": 12630 + }, + { + "epoch": 0.9418777943368107, + "grad_norm": 2.221134662628174, + "learning_rate": 0.0002, + "loss": 2.4458, + "step": 12640 + }, + { + "epoch": 0.9426229508196722, + "grad_norm": 2.3901071548461914, + "learning_rate": 0.0002, + "loss": 2.6864, + "step": 12650 + }, + { + "epoch": 0.9433681073025335, + "grad_norm": 2.3478126525878906, + "learning_rate": 0.0002, + "loss": 2.5446, + "step": 12660 + }, + { + "epoch": 0.944113263785395, + "grad_norm": 2.271385908126831, + "learning_rate": 0.0002, + "loss": 2.6798, + "step": 12670 + }, + { + "epoch": 0.9448584202682563, + "grad_norm": 2.620520830154419, + "learning_rate": 0.0002, + "loss": 2.5766, + "step": 12680 + }, + { + "epoch": 0.9456035767511177, + "grad_norm": 2.460322380065918, + "learning_rate": 0.0002, + "loss": 2.3643, + "step": 12690 + }, + { + "epoch": 0.9463487332339792, + "grad_norm": 2.526590585708618, + "learning_rate": 0.0002, + "loss": 2.4779, + "step": 12700 + }, + { + "epoch": 0.9470938897168405, + "grad_norm": 2.0752880573272705, + "learning_rate": 0.0002, + "loss": 2.4393, + "step": 12710 + }, + { + "epoch": 0.9478390461997019, + "grad_norm": 2.8343307971954346, + "learning_rate": 0.0002, + "loss": 2.5893, + "step": 12720 + }, + { + "epoch": 0.9485842026825634, + "grad_norm": 2.369196653366089, + "learning_rate": 0.0002, + "loss": 2.5725, + "step": 12730 + }, + { + "epoch": 0.9493293591654247, + "grad_norm": 2.3680763244628906, + "learning_rate": 0.0002, + "loss": 2.3934, + "step": 12740 + }, + { + "epoch": 0.9500745156482862, + "grad_norm": 2.43011736869812, + "learning_rate": 0.0002, + "loss": 2.1303, + "step": 12750 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 2.4753246307373047, + "learning_rate": 0.0002, + "loss": 2.6349, + "step": 12760 + }, + { + "epoch": 0.9515648286140089, + "grad_norm": 2.558833122253418, + "learning_rate": 0.0002, + "loss": 2.523, + "step": 12770 + }, + { + "epoch": 0.9523099850968704, + "grad_norm": 2.847932815551758, + "learning_rate": 0.0002, + "loss": 2.4538, + "step": 12780 + }, + { + "epoch": 0.9530551415797317, + "grad_norm": 2.3892436027526855, + "learning_rate": 0.0002, + "loss": 2.3756, + "step": 12790 + }, + { + "epoch": 0.9538002980625931, + "grad_norm": 2.214956760406494, + "learning_rate": 0.0002, + "loss": 2.4268, + "step": 12800 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 2.3451969623565674, + "learning_rate": 0.0002, + "loss": 2.423, + "step": 12810 + }, + { + "epoch": 0.9552906110283159, + "grad_norm": 2.4705963134765625, + "learning_rate": 0.0002, + "loss": 2.6488, + "step": 12820 + }, + { + "epoch": 0.9560357675111774, + "grad_norm": 2.7425971031188965, + "learning_rate": 0.0002, + "loss": 2.5578, + "step": 12830 + }, + { + "epoch": 0.9567809239940388, + "grad_norm": 2.3511667251586914, + "learning_rate": 0.0002, + "loss": 2.3301, + "step": 12840 + }, + { + "epoch": 0.9575260804769001, + "grad_norm": 2.307358980178833, + "learning_rate": 0.0002, + "loss": 2.4876, + "step": 12850 + }, + { + "epoch": 0.9582712369597616, + "grad_norm": 2.233081102371216, + "learning_rate": 0.0002, + "loss": 2.2724, + "step": 12860 + }, + { + "epoch": 0.9590163934426229, + "grad_norm": 2.7037274837493896, + "learning_rate": 0.0002, + "loss": 2.3392, + "step": 12870 + }, + { + "epoch": 0.9597615499254843, + "grad_norm": 2.435098886489868, + "learning_rate": 0.0002, + "loss": 2.5374, + "step": 12880 + }, + { + "epoch": 0.9605067064083458, + "grad_norm": 2.4666409492492676, + "learning_rate": 0.0002, + "loss": 2.5573, + "step": 12890 + }, + { + "epoch": 0.9612518628912071, + "grad_norm": 2.260859251022339, + "learning_rate": 0.0002, + "loss": 2.4869, + "step": 12900 + }, + { + "epoch": 0.9619970193740686, + "grad_norm": 2.661461353302002, + "learning_rate": 0.0002, + "loss": 2.5166, + "step": 12910 + }, + { + "epoch": 0.96274217585693, + "grad_norm": 2.374238967895508, + "learning_rate": 0.0002, + "loss": 2.5812, + "step": 12920 + }, + { + "epoch": 0.9634873323397913, + "grad_norm": 2.1044247150421143, + "learning_rate": 0.0002, + "loss": 2.5499, + "step": 12930 + }, + { + "epoch": 0.9642324888226528, + "grad_norm": 2.3307137489318848, + "learning_rate": 0.0002, + "loss": 2.6675, + "step": 12940 + }, + { + "epoch": 0.9649776453055141, + "grad_norm": 2.323369264602661, + "learning_rate": 0.0002, + "loss": 2.6393, + "step": 12950 + }, + { + "epoch": 0.9657228017883756, + "grad_norm": 2.198737621307373, + "learning_rate": 0.0002, + "loss": 2.659, + "step": 12960 + }, + { + "epoch": 0.966467958271237, + "grad_norm": 2.636990547180176, + "learning_rate": 0.0002, + "loss": 2.3037, + "step": 12970 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 2.4657247066497803, + "learning_rate": 0.0002, + "loss": 2.4913, + "step": 12980 + }, + { + "epoch": 0.9679582712369598, + "grad_norm": 2.854710340499878, + "learning_rate": 0.0002, + "loss": 2.6958, + "step": 12990 + }, + { + "epoch": 0.9687034277198212, + "grad_norm": 1.9861502647399902, + "learning_rate": 0.0002, + "loss": 2.4177, + "step": 13000 + }, + { + "epoch": 0.9694485842026825, + "grad_norm": 2.2618319988250732, + "learning_rate": 0.0002, + "loss": 2.6304, + "step": 13010 + }, + { + "epoch": 0.970193740685544, + "grad_norm": 2.253206491470337, + "learning_rate": 0.0002, + "loss": 2.539, + "step": 13020 + }, + { + "epoch": 0.9709388971684053, + "grad_norm": 2.4439284801483154, + "learning_rate": 0.0002, + "loss": 2.6837, + "step": 13030 + }, + { + "epoch": 0.9716840536512668, + "grad_norm": 2.5577943325042725, + "learning_rate": 0.0002, + "loss": 2.6352, + "step": 13040 + }, + { + "epoch": 0.9724292101341282, + "grad_norm": 2.345137596130371, + "learning_rate": 0.0002, + "loss": 2.5917, + "step": 13050 + }, + { + "epoch": 0.9731743666169895, + "grad_norm": 2.3082261085510254, + "learning_rate": 0.0002, + "loss": 2.484, + "step": 13060 + }, + { + "epoch": 0.973919523099851, + "grad_norm": 2.542104721069336, + "learning_rate": 0.0002, + "loss": 2.5961, + "step": 13070 + }, + { + "epoch": 0.9746646795827124, + "grad_norm": 2.420619487762451, + "learning_rate": 0.0002, + "loss": 2.6367, + "step": 13080 + }, + { + "epoch": 0.9754098360655737, + "grad_norm": 2.230170488357544, + "learning_rate": 0.0002, + "loss": 2.4443, + "step": 13090 + }, + { + "epoch": 0.9761549925484352, + "grad_norm": 2.4202942848205566, + "learning_rate": 0.0002, + "loss": 2.4689, + "step": 13100 + }, + { + "epoch": 0.9769001490312966, + "grad_norm": 2.6881895065307617, + "learning_rate": 0.0002, + "loss": 2.4001, + "step": 13110 + }, + { + "epoch": 0.977645305514158, + "grad_norm": 1.8320391178131104, + "learning_rate": 0.0002, + "loss": 2.545, + "step": 13120 + }, + { + "epoch": 0.9783904619970194, + "grad_norm": 2.4107582569122314, + "learning_rate": 0.0002, + "loss": 2.5337, + "step": 13130 + }, + { + "epoch": 0.9791356184798807, + "grad_norm": 2.323265790939331, + "learning_rate": 0.0002, + "loss": 2.57, + "step": 13140 + }, + { + "epoch": 0.9798807749627422, + "grad_norm": 2.5477750301361084, + "learning_rate": 0.0002, + "loss": 2.5769, + "step": 13150 + }, + { + "epoch": 0.9806259314456036, + "grad_norm": 3.1621835231781006, + "learning_rate": 0.0002, + "loss": 2.4171, + "step": 13160 + }, + { + "epoch": 0.981371087928465, + "grad_norm": 2.4050159454345703, + "learning_rate": 0.0002, + "loss": 2.5969, + "step": 13170 + }, + { + "epoch": 0.9821162444113264, + "grad_norm": 2.261742353439331, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 13180 + }, + { + "epoch": 0.9828614008941878, + "grad_norm": 2.472062349319458, + "learning_rate": 0.0002, + "loss": 2.6003, + "step": 13190 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 2.328455924987793, + "learning_rate": 0.0002, + "loss": 2.6398, + "step": 13200 + }, + { + "epoch": 0.9843517138599106, + "grad_norm": 2.5007076263427734, + "learning_rate": 0.0002, + "loss": 2.6027, + "step": 13210 + }, + { + "epoch": 0.9850968703427719, + "grad_norm": 2.1729538440704346, + "learning_rate": 0.0002, + "loss": 2.5315, + "step": 13220 + }, + { + "epoch": 0.9858420268256334, + "grad_norm": 2.4826149940490723, + "learning_rate": 0.0002, + "loss": 2.5213, + "step": 13230 + }, + { + "epoch": 0.9865871833084948, + "grad_norm": 2.1284587383270264, + "learning_rate": 0.0002, + "loss": 2.3732, + "step": 13240 + }, + { + "epoch": 0.9873323397913562, + "grad_norm": 2.260220766067505, + "learning_rate": 0.0002, + "loss": 2.5361, + "step": 13250 + }, + { + "epoch": 0.9880774962742176, + "grad_norm": 2.427016019821167, + "learning_rate": 0.0002, + "loss": 2.7074, + "step": 13260 + }, + { + "epoch": 0.988822652757079, + "grad_norm": 2.6063551902770996, + "learning_rate": 0.0002, + "loss": 2.7052, + "step": 13270 + }, + { + "epoch": 0.9895678092399404, + "grad_norm": 2.5883805751800537, + "learning_rate": 0.0002, + "loss": 2.6739, + "step": 13280 + }, + { + "epoch": 0.9903129657228018, + "grad_norm": 2.7127444744110107, + "learning_rate": 0.0002, + "loss": 2.5796, + "step": 13290 + }, + { + "epoch": 0.9910581222056631, + "grad_norm": 2.2932395935058594, + "learning_rate": 0.0002, + "loss": 2.6266, + "step": 13300 + }, + { + "epoch": 0.9918032786885246, + "grad_norm": 2.1839466094970703, + "learning_rate": 0.0002, + "loss": 2.4277, + "step": 13310 + }, + { + "epoch": 0.992548435171386, + "grad_norm": 2.1350860595703125, + "learning_rate": 0.0002, + "loss": 2.4021, + "step": 13320 + }, + { + "epoch": 0.9932935916542474, + "grad_norm": 2.821213960647583, + "learning_rate": 0.0002, + "loss": 2.4104, + "step": 13330 + }, + { + "epoch": 0.9940387481371088, + "grad_norm": 2.3386242389678955, + "learning_rate": 0.0002, + "loss": 2.6611, + "step": 13340 + }, + { + "epoch": 0.9947839046199702, + "grad_norm": 2.477780342102051, + "learning_rate": 0.0002, + "loss": 2.5641, + "step": 13350 + }, + { + "epoch": 0.9955290611028316, + "grad_norm": 2.1101603507995605, + "learning_rate": 0.0002, + "loss": 2.3935, + "step": 13360 + }, + { + "epoch": 0.996274217585693, + "grad_norm": 2.371098041534424, + "learning_rate": 0.0002, + "loss": 2.4277, + "step": 13370 + }, + { + "epoch": 0.9970193740685543, + "grad_norm": 2.4047231674194336, + "learning_rate": 0.0002, + "loss": 2.5452, + "step": 13380 + }, + { + "epoch": 0.9977645305514158, + "grad_norm": 2.2400450706481934, + "learning_rate": 0.0002, + "loss": 2.5556, + "step": 13390 + }, + { + "epoch": 0.9985096870342772, + "grad_norm": 2.187760829925537, + "learning_rate": 0.0002, + "loss": 2.645, + "step": 13400 + }, + { + "epoch": 0.9992548435171386, + "grad_norm": 2.3010387420654297, + "learning_rate": 0.0002, + "loss": 2.4113, + "step": 13410 + }, + { + "epoch": 1.0, + "grad_norm": 2.8511722087860107, + "learning_rate": 0.0002, + "loss": 2.5681, + "step": 13420 + }, + { + "epoch": 1.0, + "eval_runtime": 2766.9434, + "eval_samples_per_second": 4.85, + "eval_steps_per_second": 0.606, + "step": 13420 + }, + { + "epoch": 1.0007451564828613, + "grad_norm": 2.541956901550293, + "learning_rate": 0.0002, + "loss": 2.4417, + "step": 13430 + }, + { + "epoch": 1.0014903129657229, + "grad_norm": 2.2113637924194336, + "learning_rate": 0.0002, + "loss": 2.3728, + "step": 13440 + }, + { + "epoch": 1.0022354694485842, + "grad_norm": 2.6709325313568115, + "learning_rate": 0.0002, + "loss": 2.3259, + "step": 13450 + }, + { + "epoch": 1.0029806259314455, + "grad_norm": 2.1708545684814453, + "learning_rate": 0.0002, + "loss": 2.164, + "step": 13460 + }, + { + "epoch": 1.003725782414307, + "grad_norm": 2.2694172859191895, + "learning_rate": 0.0002, + "loss": 2.37, + "step": 13470 + }, + { + "epoch": 1.0044709388971684, + "grad_norm": 2.839197874069214, + "learning_rate": 0.0002, + "loss": 2.383, + "step": 13480 + }, + { + "epoch": 1.0052160953800298, + "grad_norm": 2.7437849044799805, + "learning_rate": 0.0002, + "loss": 2.3811, + "step": 13490 + }, + { + "epoch": 1.0059612518628913, + "grad_norm": 2.8862855434417725, + "learning_rate": 0.0002, + "loss": 2.2016, + "step": 13500 + }, + { + "epoch": 1.0067064083457526, + "grad_norm": 2.6776318550109863, + "learning_rate": 0.0002, + "loss": 2.2084, + "step": 13510 + }, + { + "epoch": 1.007451564828614, + "grad_norm": 2.473053455352783, + "learning_rate": 0.0002, + "loss": 2.4357, + "step": 13520 + }, + { + "epoch": 1.0081967213114753, + "grad_norm": 2.5545501708984375, + "learning_rate": 0.0002, + "loss": 2.4439, + "step": 13530 + }, + { + "epoch": 1.0089418777943369, + "grad_norm": 2.6826674938201904, + "learning_rate": 0.0002, + "loss": 2.5071, + "step": 13540 + }, + { + "epoch": 1.0096870342771982, + "grad_norm": 2.596944570541382, + "learning_rate": 0.0002, + "loss": 2.5319, + "step": 13550 + }, + { + "epoch": 1.0104321907600595, + "grad_norm": 2.0925228595733643, + "learning_rate": 0.0002, + "loss": 2.1662, + "step": 13560 + }, + { + "epoch": 1.011177347242921, + "grad_norm": 2.8665518760681152, + "learning_rate": 0.0002, + "loss": 2.5069, + "step": 13570 + }, + { + "epoch": 1.0119225037257824, + "grad_norm": 2.4106383323669434, + "learning_rate": 0.0002, + "loss": 2.3902, + "step": 13580 + }, + { + "epoch": 1.0126676602086437, + "grad_norm": 2.5520753860473633, + "learning_rate": 0.0002, + "loss": 2.4638, + "step": 13590 + }, + { + "epoch": 1.0134128166915053, + "grad_norm": 2.505903482437134, + "learning_rate": 0.0002, + "loss": 2.5054, + "step": 13600 + }, + { + "epoch": 1.0141579731743666, + "grad_norm": 2.5196523666381836, + "learning_rate": 0.0002, + "loss": 2.1964, + "step": 13610 + }, + { + "epoch": 1.014903129657228, + "grad_norm": 2.414044141769409, + "learning_rate": 0.0002, + "loss": 2.2939, + "step": 13620 + }, + { + "epoch": 1.0156482861400895, + "grad_norm": 2.586638927459717, + "learning_rate": 0.0002, + "loss": 2.403, + "step": 13630 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 2.509230852127075, + "learning_rate": 0.0002, + "loss": 2.3131, + "step": 13640 + }, + { + "epoch": 1.0171385991058122, + "grad_norm": 2.8576319217681885, + "learning_rate": 0.0002, + "loss": 2.3398, + "step": 13650 + }, + { + "epoch": 1.0178837555886737, + "grad_norm": 3.0436184406280518, + "learning_rate": 0.0002, + "loss": 2.4026, + "step": 13660 + }, + { + "epoch": 1.018628912071535, + "grad_norm": 2.6709108352661133, + "learning_rate": 0.0002, + "loss": 2.3594, + "step": 13670 + }, + { + "epoch": 1.0193740685543964, + "grad_norm": 2.6364598274230957, + "learning_rate": 0.0002, + "loss": 2.3834, + "step": 13680 + }, + { + "epoch": 1.0201192250372577, + "grad_norm": 2.666531562805176, + "learning_rate": 0.0002, + "loss": 2.4294, + "step": 13690 + }, + { + "epoch": 1.0208643815201193, + "grad_norm": 2.945071220397949, + "learning_rate": 0.0002, + "loss": 2.4322, + "step": 13700 + }, + { + "epoch": 1.0216095380029806, + "grad_norm": 2.6446971893310547, + "learning_rate": 0.0002, + "loss": 2.3047, + "step": 13710 + }, + { + "epoch": 1.022354694485842, + "grad_norm": 2.769773483276367, + "learning_rate": 0.0002, + "loss": 2.328, + "step": 13720 + }, + { + "epoch": 1.0230998509687035, + "grad_norm": 2.7484028339385986, + "learning_rate": 0.0002, + "loss": 2.4968, + "step": 13730 + }, + { + "epoch": 1.0238450074515648, + "grad_norm": 2.563288927078247, + "learning_rate": 0.0002, + "loss": 2.3943, + "step": 13740 + }, + { + "epoch": 1.0245901639344261, + "grad_norm": 2.358903408050537, + "learning_rate": 0.0002, + "loss": 2.3394, + "step": 13750 + }, + { + "epoch": 1.0253353204172877, + "grad_norm": 3.086336851119995, + "learning_rate": 0.0002, + "loss": 2.4493, + "step": 13760 + }, + { + "epoch": 1.026080476900149, + "grad_norm": 2.4959349632263184, + "learning_rate": 0.0002, + "loss": 2.3651, + "step": 13770 + }, + { + "epoch": 1.0268256333830104, + "grad_norm": 2.5488016605377197, + "learning_rate": 0.0002, + "loss": 2.3654, + "step": 13780 + }, + { + "epoch": 1.027570789865872, + "grad_norm": 2.6956136226654053, + "learning_rate": 0.0002, + "loss": 2.5436, + "step": 13790 + }, + { + "epoch": 1.0283159463487332, + "grad_norm": 2.507394790649414, + "learning_rate": 0.0002, + "loss": 2.393, + "step": 13800 + }, + { + "epoch": 1.0290611028315946, + "grad_norm": 2.583897113800049, + "learning_rate": 0.0002, + "loss": 2.369, + "step": 13810 + }, + { + "epoch": 1.0298062593144561, + "grad_norm": 2.804624080657959, + "learning_rate": 0.0002, + "loss": 2.4578, + "step": 13820 + }, + { + "epoch": 1.0305514157973175, + "grad_norm": 2.8353066444396973, + "learning_rate": 0.0002, + "loss": 2.415, + "step": 13830 + }, + { + "epoch": 1.0312965722801788, + "grad_norm": 2.8079729080200195, + "learning_rate": 0.0002, + "loss": 2.3498, + "step": 13840 + }, + { + "epoch": 1.0320417287630403, + "grad_norm": 3.153747320175171, + "learning_rate": 0.0002, + "loss": 2.4533, + "step": 13850 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 2.170792579650879, + "learning_rate": 0.0002, + "loss": 2.2922, + "step": 13860 + }, + { + "epoch": 1.033532041728763, + "grad_norm": 2.5001676082611084, + "learning_rate": 0.0002, + "loss": 2.4288, + "step": 13870 + }, + { + "epoch": 1.0342771982116243, + "grad_norm": 2.776643753051758, + "learning_rate": 0.0002, + "loss": 2.3554, + "step": 13880 + }, + { + "epoch": 1.035022354694486, + "grad_norm": 2.7371437549591064, + "learning_rate": 0.0002, + "loss": 2.3946, + "step": 13890 + }, + { + "epoch": 1.0357675111773472, + "grad_norm": 2.1921558380126953, + "learning_rate": 0.0002, + "loss": 2.344, + "step": 13900 + }, + { + "epoch": 1.0365126676602086, + "grad_norm": 2.4711101055145264, + "learning_rate": 0.0002, + "loss": 2.4239, + "step": 13910 + }, + { + "epoch": 1.03725782414307, + "grad_norm": 1.9869732856750488, + "learning_rate": 0.0002, + "loss": 2.3064, + "step": 13920 + }, + { + "epoch": 1.0380029806259314, + "grad_norm": 2.50216007232666, + "learning_rate": 0.0002, + "loss": 2.5143, + "step": 13930 + }, + { + "epoch": 1.0387481371087928, + "grad_norm": 2.3530120849609375, + "learning_rate": 0.0002, + "loss": 2.46, + "step": 13940 + }, + { + "epoch": 1.0394932935916543, + "grad_norm": 2.8673856258392334, + "learning_rate": 0.0002, + "loss": 2.3562, + "step": 13950 + }, + { + "epoch": 1.0402384500745157, + "grad_norm": 2.380708932876587, + "learning_rate": 0.0002, + "loss": 2.4367, + "step": 13960 + }, + { + "epoch": 1.040983606557377, + "grad_norm": 2.6121153831481934, + "learning_rate": 0.0002, + "loss": 2.4539, + "step": 13970 + }, + { + "epoch": 1.0417287630402385, + "grad_norm": 2.1682288646698, + "learning_rate": 0.0002, + "loss": 2.3299, + "step": 13980 + }, + { + "epoch": 1.0424739195230999, + "grad_norm": 2.493143081665039, + "learning_rate": 0.0002, + "loss": 2.2522, + "step": 13990 + }, + { + "epoch": 1.0432190760059612, + "grad_norm": 2.3164801597595215, + "learning_rate": 0.0002, + "loss": 2.3501, + "step": 14000 + }, + { + "epoch": 1.0439642324888228, + "grad_norm": 2.372344493865967, + "learning_rate": 0.0002, + "loss": 2.4671, + "step": 14010 + }, + { + "epoch": 1.044709388971684, + "grad_norm": 2.2820746898651123, + "learning_rate": 0.0002, + "loss": 2.3155, + "step": 14020 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 2.676579475402832, + "learning_rate": 0.0002, + "loss": 2.3697, + "step": 14030 + }, + { + "epoch": 1.046199701937407, + "grad_norm": 2.5534327030181885, + "learning_rate": 0.0002, + "loss": 2.1582, + "step": 14040 + }, + { + "epoch": 1.0469448584202683, + "grad_norm": 2.6932196617126465, + "learning_rate": 0.0002, + "loss": 2.3688, + "step": 14050 + }, + { + "epoch": 1.0476900149031296, + "grad_norm": 2.8461594581604004, + "learning_rate": 0.0002, + "loss": 2.6371, + "step": 14060 + }, + { + "epoch": 1.048435171385991, + "grad_norm": 2.3841376304626465, + "learning_rate": 0.0002, + "loss": 2.2052, + "step": 14070 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 2.358163356781006, + "learning_rate": 0.0002, + "loss": 2.2105, + "step": 14080 + }, + { + "epoch": 1.0499254843517138, + "grad_norm": 2.9143970012664795, + "learning_rate": 0.0002, + "loss": 2.3372, + "step": 14090 + }, + { + "epoch": 1.0506706408345752, + "grad_norm": 2.2948029041290283, + "learning_rate": 0.0002, + "loss": 2.4952, + "step": 14100 + }, + { + "epoch": 1.0514157973174367, + "grad_norm": 2.725156307220459, + "learning_rate": 0.0002, + "loss": 2.4442, + "step": 14110 + }, + { + "epoch": 1.052160953800298, + "grad_norm": 2.590603828430176, + "learning_rate": 0.0002, + "loss": 2.4739, + "step": 14120 + }, + { + "epoch": 1.0529061102831594, + "grad_norm": 2.1326684951782227, + "learning_rate": 0.0002, + "loss": 2.3584, + "step": 14130 + }, + { + "epoch": 1.053651266766021, + "grad_norm": 2.2197265625, + "learning_rate": 0.0002, + "loss": 2.4694, + "step": 14140 + }, + { + "epoch": 1.0543964232488823, + "grad_norm": 2.6531333923339844, + "learning_rate": 0.0002, + "loss": 2.3527, + "step": 14150 + }, + { + "epoch": 1.0551415797317436, + "grad_norm": 2.60204815864563, + "learning_rate": 0.0002, + "loss": 2.4601, + "step": 14160 + }, + { + "epoch": 1.0558867362146052, + "grad_norm": 1.9536486864089966, + "learning_rate": 0.0002, + "loss": 2.3322, + "step": 14170 + }, + { + "epoch": 1.0566318926974665, + "grad_norm": 2.206132650375366, + "learning_rate": 0.0002, + "loss": 2.388, + "step": 14180 + }, + { + "epoch": 1.0573770491803278, + "grad_norm": 2.372546911239624, + "learning_rate": 0.0002, + "loss": 2.4207, + "step": 14190 + }, + { + "epoch": 1.0581222056631894, + "grad_norm": 2.5340518951416016, + "learning_rate": 0.0002, + "loss": 2.4014, + "step": 14200 + }, + { + "epoch": 1.0588673621460507, + "grad_norm": 2.513394594192505, + "learning_rate": 0.0002, + "loss": 2.5686, + "step": 14210 + }, + { + "epoch": 1.059612518628912, + "grad_norm": 2.649597644805908, + "learning_rate": 0.0002, + "loss": 2.4436, + "step": 14220 + }, + { + "epoch": 1.0603576751117734, + "grad_norm": 2.531172513961792, + "learning_rate": 0.0002, + "loss": 2.2963, + "step": 14230 + }, + { + "epoch": 1.061102831594635, + "grad_norm": 2.393568515777588, + "learning_rate": 0.0002, + "loss": 2.2868, + "step": 14240 + }, + { + "epoch": 1.0618479880774963, + "grad_norm": 2.474477767944336, + "learning_rate": 0.0002, + "loss": 2.2407, + "step": 14250 + }, + { + "epoch": 1.0625931445603576, + "grad_norm": 2.3346738815307617, + "learning_rate": 0.0002, + "loss": 2.3307, + "step": 14260 + }, + { + "epoch": 1.0633383010432191, + "grad_norm": 2.627002239227295, + "learning_rate": 0.0002, + "loss": 2.5363, + "step": 14270 + }, + { + "epoch": 1.0640834575260805, + "grad_norm": 2.602421522140503, + "learning_rate": 0.0002, + "loss": 2.5165, + "step": 14280 + }, + { + "epoch": 1.0648286140089418, + "grad_norm": 2.7509329319000244, + "learning_rate": 0.0002, + "loss": 2.3219, + "step": 14290 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 2.4855947494506836, + "learning_rate": 0.0002, + "loss": 2.5004, + "step": 14300 + }, + { + "epoch": 1.0663189269746647, + "grad_norm": 2.7259209156036377, + "learning_rate": 0.0002, + "loss": 2.3724, + "step": 14310 + }, + { + "epoch": 1.067064083457526, + "grad_norm": 2.578622579574585, + "learning_rate": 0.0002, + "loss": 2.3865, + "step": 14320 + }, + { + "epoch": 1.0678092399403876, + "grad_norm": 2.640737771987915, + "learning_rate": 0.0002, + "loss": 2.2266, + "step": 14330 + }, + { + "epoch": 1.068554396423249, + "grad_norm": 2.536515712738037, + "learning_rate": 0.0002, + "loss": 2.4457, + "step": 14340 + }, + { + "epoch": 1.0692995529061102, + "grad_norm": 2.3752591609954834, + "learning_rate": 0.0002, + "loss": 2.3925, + "step": 14350 + }, + { + "epoch": 1.0700447093889718, + "grad_norm": 2.668381929397583, + "learning_rate": 0.0002, + "loss": 2.3275, + "step": 14360 + }, + { + "epoch": 1.0707898658718331, + "grad_norm": 2.636784315109253, + "learning_rate": 0.0002, + "loss": 2.3801, + "step": 14370 + }, + { + "epoch": 1.0715350223546944, + "grad_norm": 2.4896252155303955, + "learning_rate": 0.0002, + "loss": 2.3056, + "step": 14380 + }, + { + "epoch": 1.072280178837556, + "grad_norm": 2.3532116413116455, + "learning_rate": 0.0002, + "loss": 2.2644, + "step": 14390 + }, + { + "epoch": 1.0730253353204173, + "grad_norm": 2.7929883003234863, + "learning_rate": 0.0002, + "loss": 2.326, + "step": 14400 + }, + { + "epoch": 1.0737704918032787, + "grad_norm": 2.66312837600708, + "learning_rate": 0.0002, + "loss": 2.4889, + "step": 14410 + }, + { + "epoch": 1.07451564828614, + "grad_norm": 2.480541706085205, + "learning_rate": 0.0002, + "loss": 2.4544, + "step": 14420 + }, + { + "epoch": 1.0752608047690015, + "grad_norm": 2.473315954208374, + "learning_rate": 0.0002, + "loss": 2.4154, + "step": 14430 + }, + { + "epoch": 1.0760059612518629, + "grad_norm": 2.4049172401428223, + "learning_rate": 0.0002, + "loss": 2.3524, + "step": 14440 + }, + { + "epoch": 1.0767511177347242, + "grad_norm": 2.4751241207122803, + "learning_rate": 0.0002, + "loss": 2.1488, + "step": 14450 + }, + { + "epoch": 1.0774962742175858, + "grad_norm": 2.5170867443084717, + "learning_rate": 0.0002, + "loss": 2.5725, + "step": 14460 + }, + { + "epoch": 1.078241430700447, + "grad_norm": 2.9997737407684326, + "learning_rate": 0.0002, + "loss": 2.4776, + "step": 14470 + }, + { + "epoch": 1.0789865871833084, + "grad_norm": 2.396097421646118, + "learning_rate": 0.0002, + "loss": 2.249, + "step": 14480 + }, + { + "epoch": 1.07973174366617, + "grad_norm": 3.120967149734497, + "learning_rate": 0.0002, + "loss": 2.4061, + "step": 14490 + }, + { + "epoch": 1.0804769001490313, + "grad_norm": 3.100985527038574, + "learning_rate": 0.0002, + "loss": 2.4715, + "step": 14500 + }, + { + "epoch": 1.0812220566318926, + "grad_norm": 2.4423913955688477, + "learning_rate": 0.0002, + "loss": 2.296, + "step": 14510 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 2.439854860305786, + "learning_rate": 0.0002, + "loss": 2.4691, + "step": 14520 + }, + { + "epoch": 1.0827123695976155, + "grad_norm": 2.5412724018096924, + "learning_rate": 0.0002, + "loss": 2.3789, + "step": 14530 + }, + { + "epoch": 1.0834575260804769, + "grad_norm": 1.9563956260681152, + "learning_rate": 0.0002, + "loss": 2.571, + "step": 14540 + }, + { + "epoch": 1.0842026825633384, + "grad_norm": 2.4468328952789307, + "learning_rate": 0.0002, + "loss": 2.3139, + "step": 14550 + }, + { + "epoch": 1.0849478390461997, + "grad_norm": 2.4930431842803955, + "learning_rate": 0.0002, + "loss": 2.313, + "step": 14560 + }, + { + "epoch": 1.085692995529061, + "grad_norm": 2.567399501800537, + "learning_rate": 0.0002, + "loss": 2.4385, + "step": 14570 + }, + { + "epoch": 1.0864381520119224, + "grad_norm": 2.871107339859009, + "learning_rate": 0.0002, + "loss": 2.5201, + "step": 14580 + }, + { + "epoch": 1.087183308494784, + "grad_norm": 2.9992733001708984, + "learning_rate": 0.0002, + "loss": 2.2764, + "step": 14590 + }, + { + "epoch": 1.0879284649776453, + "grad_norm": 2.3706576824188232, + "learning_rate": 0.0002, + "loss": 2.604, + "step": 14600 + }, + { + "epoch": 1.0886736214605066, + "grad_norm": 2.642545461654663, + "learning_rate": 0.0002, + "loss": 2.3878, + "step": 14610 + }, + { + "epoch": 1.0894187779433682, + "grad_norm": 2.364602565765381, + "learning_rate": 0.0002, + "loss": 2.338, + "step": 14620 + }, + { + "epoch": 1.0901639344262295, + "grad_norm": 2.9395387172698975, + "learning_rate": 0.0002, + "loss": 2.3848, + "step": 14630 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 3.6950175762176514, + "learning_rate": 0.0002, + "loss": 2.4532, + "step": 14640 + }, + { + "epoch": 1.0916542473919524, + "grad_norm": 2.566967487335205, + "learning_rate": 0.0002, + "loss": 2.5999, + "step": 14650 + }, + { + "epoch": 1.0923994038748137, + "grad_norm": 2.3279924392700195, + "learning_rate": 0.0002, + "loss": 2.5143, + "step": 14660 + }, + { + "epoch": 1.093144560357675, + "grad_norm": 3.0130693912506104, + "learning_rate": 0.0002, + "loss": 2.4298, + "step": 14670 + }, + { + "epoch": 1.0938897168405366, + "grad_norm": 2.0942749977111816, + "learning_rate": 0.0002, + "loss": 2.394, + "step": 14680 + }, + { + "epoch": 1.094634873323398, + "grad_norm": 2.4784209728240967, + "learning_rate": 0.0002, + "loss": 2.474, + "step": 14690 + }, + { + "epoch": 1.0953800298062593, + "grad_norm": 2.7817776203155518, + "learning_rate": 0.0002, + "loss": 2.3312, + "step": 14700 + }, + { + "epoch": 1.0961251862891208, + "grad_norm": 2.4290931224823, + "learning_rate": 0.0002, + "loss": 2.3258, + "step": 14710 + }, + { + "epoch": 1.0968703427719821, + "grad_norm": 2.543133020401001, + "learning_rate": 0.0002, + "loss": 2.4291, + "step": 14720 + }, + { + "epoch": 1.0976154992548435, + "grad_norm": 2.4686405658721924, + "learning_rate": 0.0002, + "loss": 2.2782, + "step": 14730 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 2.5764386653900146, + "learning_rate": 0.0002, + "loss": 2.3708, + "step": 14740 + }, + { + "epoch": 1.0991058122205664, + "grad_norm": 2.827773094177246, + "learning_rate": 0.0002, + "loss": 2.3506, + "step": 14750 + }, + { + "epoch": 1.0998509687034277, + "grad_norm": 2.2853167057037354, + "learning_rate": 0.0002, + "loss": 2.4782, + "step": 14760 + }, + { + "epoch": 1.100596125186289, + "grad_norm": 2.094871997833252, + "learning_rate": 0.0002, + "loss": 2.3033, + "step": 14770 + }, + { + "epoch": 1.1013412816691506, + "grad_norm": 2.252192735671997, + "learning_rate": 0.0002, + "loss": 2.5064, + "step": 14780 + }, + { + "epoch": 1.102086438152012, + "grad_norm": 2.530860424041748, + "learning_rate": 0.0002, + "loss": 2.3885, + "step": 14790 + }, + { + "epoch": 1.1028315946348732, + "grad_norm": 2.822106122970581, + "learning_rate": 0.0002, + "loss": 2.4288, + "step": 14800 + }, + { + "epoch": 1.1035767511177348, + "grad_norm": 2.6030118465423584, + "learning_rate": 0.0002, + "loss": 2.3705, + "step": 14810 + }, + { + "epoch": 1.1043219076005961, + "grad_norm": 3.0381968021392822, + "learning_rate": 0.0002, + "loss": 2.3168, + "step": 14820 + }, + { + "epoch": 1.1050670640834575, + "grad_norm": 2.1125636100769043, + "learning_rate": 0.0002, + "loss": 2.2316, + "step": 14830 + }, + { + "epoch": 1.105812220566319, + "grad_norm": 2.649933338165283, + "learning_rate": 0.0002, + "loss": 2.3767, + "step": 14840 + }, + { + "epoch": 1.1065573770491803, + "grad_norm": 2.4779107570648193, + "learning_rate": 0.0002, + "loss": 2.4179, + "step": 14850 + }, + { + "epoch": 1.1073025335320417, + "grad_norm": 2.370229721069336, + "learning_rate": 0.0002, + "loss": 2.3823, + "step": 14860 + }, + { + "epoch": 1.1080476900149032, + "grad_norm": 2.200868844985962, + "learning_rate": 0.0002, + "loss": 2.4024, + "step": 14870 + }, + { + "epoch": 1.1087928464977646, + "grad_norm": 2.385791063308716, + "learning_rate": 0.0002, + "loss": 2.5435, + "step": 14880 + }, + { + "epoch": 1.1095380029806259, + "grad_norm": 2.731098175048828, + "learning_rate": 0.0002, + "loss": 2.4073, + "step": 14890 + }, + { + "epoch": 1.1102831594634874, + "grad_norm": 2.1808290481567383, + "learning_rate": 0.0002, + "loss": 2.4306, + "step": 14900 + }, + { + "epoch": 1.1110283159463488, + "grad_norm": 2.3113632202148438, + "learning_rate": 0.0002, + "loss": 2.4484, + "step": 14910 + }, + { + "epoch": 1.11177347242921, + "grad_norm": 2.473583459854126, + "learning_rate": 0.0002, + "loss": 2.3573, + "step": 14920 + }, + { + "epoch": 1.1125186289120714, + "grad_norm": 2.4973528385162354, + "learning_rate": 0.0002, + "loss": 2.4679, + "step": 14930 + }, + { + "epoch": 1.113263785394933, + "grad_norm": 2.698992967605591, + "learning_rate": 0.0002, + "loss": 2.573, + "step": 14940 + }, + { + "epoch": 1.1140089418777943, + "grad_norm": 2.4769937992095947, + "learning_rate": 0.0002, + "loss": 2.5083, + "step": 14950 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 2.54304838180542, + "learning_rate": 0.0002, + "loss": 2.3611, + "step": 14960 + }, + { + "epoch": 1.1154992548435172, + "grad_norm": 2.456918478012085, + "learning_rate": 0.0002, + "loss": 2.512, + "step": 14970 + }, + { + "epoch": 1.1162444113263785, + "grad_norm": 2.8047049045562744, + "learning_rate": 0.0002, + "loss": 2.4403, + "step": 14980 + }, + { + "epoch": 1.1169895678092399, + "grad_norm": 2.716897964477539, + "learning_rate": 0.0002, + "loss": 2.4872, + "step": 14990 + }, + { + "epoch": 1.1177347242921014, + "grad_norm": 2.525214910507202, + "learning_rate": 0.0002, + "loss": 2.3246, + "step": 15000 + }, + { + "epoch": 1.1184798807749627, + "grad_norm": 2.955278158187866, + "learning_rate": 0.0002, + "loss": 2.4764, + "step": 15010 + }, + { + "epoch": 1.119225037257824, + "grad_norm": 2.454103708267212, + "learning_rate": 0.0002, + "loss": 2.4668, + "step": 15020 + }, + { + "epoch": 1.1199701937406856, + "grad_norm": 2.495340585708618, + "learning_rate": 0.0002, + "loss": 2.3346, + "step": 15030 + }, + { + "epoch": 1.120715350223547, + "grad_norm": 2.2860970497131348, + "learning_rate": 0.0002, + "loss": 2.5325, + "step": 15040 + }, + { + "epoch": 1.1214605067064083, + "grad_norm": 2.4267046451568604, + "learning_rate": 0.0002, + "loss": 2.3052, + "step": 15050 + }, + { + "epoch": 1.1222056631892698, + "grad_norm": 2.8328089714050293, + "learning_rate": 0.0002, + "loss": 2.412, + "step": 15060 + }, + { + "epoch": 1.1229508196721312, + "grad_norm": 2.378267288208008, + "learning_rate": 0.0002, + "loss": 2.3667, + "step": 15070 + }, + { + "epoch": 1.1236959761549925, + "grad_norm": 2.625948667526245, + "learning_rate": 0.0002, + "loss": 2.3818, + "step": 15080 + }, + { + "epoch": 1.124441132637854, + "grad_norm": 2.269430160522461, + "learning_rate": 0.0002, + "loss": 2.5105, + "step": 15090 + }, + { + "epoch": 1.1251862891207154, + "grad_norm": 2.5549607276916504, + "learning_rate": 0.0002, + "loss": 2.4043, + "step": 15100 + }, + { + "epoch": 1.1259314456035767, + "grad_norm": 2.5551154613494873, + "learning_rate": 0.0002, + "loss": 2.4814, + "step": 15110 + }, + { + "epoch": 1.1266766020864383, + "grad_norm": 2.48346209526062, + "learning_rate": 0.0002, + "loss": 2.2917, + "step": 15120 + }, + { + "epoch": 1.1274217585692996, + "grad_norm": 2.214686632156372, + "learning_rate": 0.0002, + "loss": 2.4799, + "step": 15130 + }, + { + "epoch": 1.128166915052161, + "grad_norm": 2.6155035495758057, + "learning_rate": 0.0002, + "loss": 2.4024, + "step": 15140 + }, + { + "epoch": 1.1289120715350223, + "grad_norm": 2.485732316970825, + "learning_rate": 0.0002, + "loss": 2.4218, + "step": 15150 + }, + { + "epoch": 1.1296572280178838, + "grad_norm": 2.489976167678833, + "learning_rate": 0.0002, + "loss": 2.4819, + "step": 15160 + }, + { + "epoch": 1.1304023845007451, + "grad_norm": 2.557816743850708, + "learning_rate": 0.0002, + "loss": 2.404, + "step": 15170 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 2.362257242202759, + "learning_rate": 0.0002, + "loss": 2.413, + "step": 15180 + }, + { + "epoch": 1.131892697466468, + "grad_norm": 2.555720806121826, + "learning_rate": 0.0002, + "loss": 2.4822, + "step": 15190 + }, + { + "epoch": 1.1326378539493294, + "grad_norm": 2.301452398300171, + "learning_rate": 0.0002, + "loss": 2.4775, + "step": 15200 + }, + { + "epoch": 1.1333830104321907, + "grad_norm": 2.718770980834961, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 15210 + }, + { + "epoch": 1.1341281669150522, + "grad_norm": 2.561338424682617, + "learning_rate": 0.0002, + "loss": 2.4091, + "step": 15220 + }, + { + "epoch": 1.1348733233979136, + "grad_norm": 2.619065523147583, + "learning_rate": 0.0002, + "loss": 2.5057, + "step": 15230 + }, + { + "epoch": 1.135618479880775, + "grad_norm": 3.1791832447052, + "learning_rate": 0.0002, + "loss": 2.4179, + "step": 15240 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 2.456456422805786, + "learning_rate": 0.0002, + "loss": 2.2546, + "step": 15250 + }, + { + "epoch": 1.1371087928464978, + "grad_norm": 2.696702718734741, + "learning_rate": 0.0002, + "loss": 2.3953, + "step": 15260 + }, + { + "epoch": 1.1378539493293591, + "grad_norm": 2.065814733505249, + "learning_rate": 0.0002, + "loss": 2.1887, + "step": 15270 + }, + { + "epoch": 1.1385991058122205, + "grad_norm": 2.3281009197235107, + "learning_rate": 0.0002, + "loss": 2.4815, + "step": 15280 + }, + { + "epoch": 1.139344262295082, + "grad_norm": 2.4700629711151123, + "learning_rate": 0.0002, + "loss": 2.3037, + "step": 15290 + }, + { + "epoch": 1.1400894187779433, + "grad_norm": 2.3414204120635986, + "learning_rate": 0.0002, + "loss": 2.469, + "step": 15300 + }, + { + "epoch": 1.1408345752608047, + "grad_norm": 2.8033456802368164, + "learning_rate": 0.0002, + "loss": 2.2909, + "step": 15310 + }, + { + "epoch": 1.1415797317436662, + "grad_norm": 2.513645648956299, + "learning_rate": 0.0002, + "loss": 2.3957, + "step": 15320 + }, + { + "epoch": 1.1423248882265276, + "grad_norm": 3.029627799987793, + "learning_rate": 0.0002, + "loss": 2.4792, + "step": 15330 + }, + { + "epoch": 1.1430700447093889, + "grad_norm": 2.7976560592651367, + "learning_rate": 0.0002, + "loss": 2.4781, + "step": 15340 + }, + { + "epoch": 1.1438152011922504, + "grad_norm": 2.6071226596832275, + "learning_rate": 0.0002, + "loss": 2.4836, + "step": 15350 + }, + { + "epoch": 1.1445603576751118, + "grad_norm": 2.4131622314453125, + "learning_rate": 0.0002, + "loss": 2.4202, + "step": 15360 + }, + { + "epoch": 1.145305514157973, + "grad_norm": 2.6419029235839844, + "learning_rate": 0.0002, + "loss": 2.5272, + "step": 15370 + }, + { + "epoch": 1.1460506706408347, + "grad_norm": 2.893653154373169, + "learning_rate": 0.0002, + "loss": 2.5116, + "step": 15380 + }, + { + "epoch": 1.146795827123696, + "grad_norm": 2.739550828933716, + "learning_rate": 0.0002, + "loss": 2.3836, + "step": 15390 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 2.9094793796539307, + "learning_rate": 0.0002, + "loss": 2.3208, + "step": 15400 + }, + { + "epoch": 1.1482861400894189, + "grad_norm": 2.420971155166626, + "learning_rate": 0.0002, + "loss": 2.3705, + "step": 15410 + }, + { + "epoch": 1.1490312965722802, + "grad_norm": 2.5930230617523193, + "learning_rate": 0.0002, + "loss": 2.5412, + "step": 15420 + }, + { + "epoch": 1.1497764530551415, + "grad_norm": 2.6284139156341553, + "learning_rate": 0.0002, + "loss": 2.2475, + "step": 15430 + }, + { + "epoch": 1.150521609538003, + "grad_norm": 2.640470266342163, + "learning_rate": 0.0002, + "loss": 2.4292, + "step": 15440 + }, + { + "epoch": 1.1512667660208644, + "grad_norm": 2.7943484783172607, + "learning_rate": 0.0002, + "loss": 2.4816, + "step": 15450 + }, + { + "epoch": 1.1520119225037257, + "grad_norm": 2.281245470046997, + "learning_rate": 0.0002, + "loss": 2.3931, + "step": 15460 + }, + { + "epoch": 1.1527570789865873, + "grad_norm": 2.250092029571533, + "learning_rate": 0.0002, + "loss": 2.4882, + "step": 15470 + }, + { + "epoch": 1.1535022354694486, + "grad_norm": 2.4525022506713867, + "learning_rate": 0.0002, + "loss": 2.4132, + "step": 15480 + }, + { + "epoch": 1.15424739195231, + "grad_norm": 2.4779772758483887, + "learning_rate": 0.0002, + "loss": 2.3182, + "step": 15490 + }, + { + "epoch": 1.1549925484351713, + "grad_norm": 2.4261624813079834, + "learning_rate": 0.0002, + "loss": 2.5122, + "step": 15500 + }, + { + "epoch": 1.1557377049180328, + "grad_norm": 2.989243745803833, + "learning_rate": 0.0002, + "loss": 2.5352, + "step": 15510 + }, + { + "epoch": 1.1564828614008942, + "grad_norm": 3.4298269748687744, + "learning_rate": 0.0002, + "loss": 2.4098, + "step": 15520 + }, + { + "epoch": 1.1572280178837555, + "grad_norm": 2.6402604579925537, + "learning_rate": 0.0002, + "loss": 2.4133, + "step": 15530 + }, + { + "epoch": 1.157973174366617, + "grad_norm": 2.4061222076416016, + "learning_rate": 0.0002, + "loss": 2.3814, + "step": 15540 + }, + { + "epoch": 1.1587183308494784, + "grad_norm": 2.915400266647339, + "learning_rate": 0.0002, + "loss": 2.3256, + "step": 15550 + }, + { + "epoch": 1.1594634873323397, + "grad_norm": 2.7170588970184326, + "learning_rate": 0.0002, + "loss": 2.4919, + "step": 15560 + }, + { + "epoch": 1.1602086438152013, + "grad_norm": 2.5556771755218506, + "learning_rate": 0.0002, + "loss": 2.5706, + "step": 15570 + }, + { + "epoch": 1.1609538002980626, + "grad_norm": 2.762441873550415, + "learning_rate": 0.0002, + "loss": 2.3807, + "step": 15580 + }, + { + "epoch": 1.161698956780924, + "grad_norm": 2.6300220489501953, + "learning_rate": 0.0002, + "loss": 2.405, + "step": 15590 + }, + { + "epoch": 1.1624441132637853, + "grad_norm": 2.586217164993286, + "learning_rate": 0.0002, + "loss": 2.5505, + "step": 15600 + }, + { + "epoch": 1.1631892697466468, + "grad_norm": 2.6956429481506348, + "learning_rate": 0.0002, + "loss": 2.259, + "step": 15610 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 2.6788172721862793, + "learning_rate": 0.0002, + "loss": 2.281, + "step": 15620 + }, + { + "epoch": 1.1646795827123695, + "grad_norm": 2.6697628498077393, + "learning_rate": 0.0002, + "loss": 2.4952, + "step": 15630 + }, + { + "epoch": 1.165424739195231, + "grad_norm": 2.434091567993164, + "learning_rate": 0.0002, + "loss": 2.5588, + "step": 15640 + }, + { + "epoch": 1.1661698956780924, + "grad_norm": 2.590214252471924, + "learning_rate": 0.0002, + "loss": 2.4701, + "step": 15650 + }, + { + "epoch": 1.1669150521609537, + "grad_norm": 2.3940234184265137, + "learning_rate": 0.0002, + "loss": 2.4546, + "step": 15660 + }, + { + "epoch": 1.1676602086438153, + "grad_norm": 2.5530643463134766, + "learning_rate": 0.0002, + "loss": 2.3913, + "step": 15670 + }, + { + "epoch": 1.1684053651266766, + "grad_norm": 2.599776029586792, + "learning_rate": 0.0002, + "loss": 2.3379, + "step": 15680 + }, + { + "epoch": 1.169150521609538, + "grad_norm": 2.3956868648529053, + "learning_rate": 0.0002, + "loss": 2.4249, + "step": 15690 + }, + { + "epoch": 1.1698956780923995, + "grad_norm": 2.866769313812256, + "learning_rate": 0.0002, + "loss": 2.5113, + "step": 15700 + }, + { + "epoch": 1.1706408345752608, + "grad_norm": 2.3817973136901855, + "learning_rate": 0.0002, + "loss": 2.4549, + "step": 15710 + }, + { + "epoch": 1.1713859910581221, + "grad_norm": 2.8611204624176025, + "learning_rate": 0.0002, + "loss": 2.4499, + "step": 15720 + }, + { + "epoch": 1.1721311475409837, + "grad_norm": 2.5031049251556396, + "learning_rate": 0.0002, + "loss": 2.474, + "step": 15730 + }, + { + "epoch": 1.172876304023845, + "grad_norm": 2.7926108837127686, + "learning_rate": 0.0002, + "loss": 2.6565, + "step": 15740 + }, + { + "epoch": 1.1736214605067063, + "grad_norm": 2.480978488922119, + "learning_rate": 0.0002, + "loss": 2.5754, + "step": 15750 + }, + { + "epoch": 1.174366616989568, + "grad_norm": 2.831561803817749, + "learning_rate": 0.0002, + "loss": 2.2785, + "step": 15760 + }, + { + "epoch": 1.1751117734724292, + "grad_norm": 2.7527670860290527, + "learning_rate": 0.0002, + "loss": 2.4922, + "step": 15770 + }, + { + "epoch": 1.1758569299552906, + "grad_norm": 2.640598773956299, + "learning_rate": 0.0002, + "loss": 2.4216, + "step": 15780 + }, + { + "epoch": 1.1766020864381521, + "grad_norm": 2.8140854835510254, + "learning_rate": 0.0002, + "loss": 2.509, + "step": 15790 + }, + { + "epoch": 1.1773472429210134, + "grad_norm": 2.448840618133545, + "learning_rate": 0.0002, + "loss": 2.2896, + "step": 15800 + }, + { + "epoch": 1.1780923994038748, + "grad_norm": 2.5497422218322754, + "learning_rate": 0.0002, + "loss": 2.329, + "step": 15810 + }, + { + "epoch": 1.1788375558867363, + "grad_norm": 2.042785882949829, + "learning_rate": 0.0002, + "loss": 2.4259, + "step": 15820 + }, + { + "epoch": 1.1795827123695977, + "grad_norm": 2.9720048904418945, + "learning_rate": 0.0002, + "loss": 2.2438, + "step": 15830 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 2.516237258911133, + "learning_rate": 0.0002, + "loss": 2.4921, + "step": 15840 + }, + { + "epoch": 1.1810730253353203, + "grad_norm": 2.7879035472869873, + "learning_rate": 0.0002, + "loss": 2.407, + "step": 15850 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 2.4089086055755615, + "learning_rate": 0.0002, + "loss": 2.4276, + "step": 15860 + }, + { + "epoch": 1.1825633383010432, + "grad_norm": 2.6390304565429688, + "learning_rate": 0.0002, + "loss": 2.428, + "step": 15870 + }, + { + "epoch": 1.1833084947839045, + "grad_norm": 3.222198963165283, + "learning_rate": 0.0002, + "loss": 2.3888, + "step": 15880 + }, + { + "epoch": 1.184053651266766, + "grad_norm": 2.4840657711029053, + "learning_rate": 0.0002, + "loss": 2.4878, + "step": 15890 + }, + { + "epoch": 1.1847988077496274, + "grad_norm": 2.1169028282165527, + "learning_rate": 0.0002, + "loss": 2.424, + "step": 15900 + }, + { + "epoch": 1.1855439642324888, + "grad_norm": 2.243370294570923, + "learning_rate": 0.0002, + "loss": 2.2954, + "step": 15910 + }, + { + "epoch": 1.1862891207153503, + "grad_norm": 2.8752856254577637, + "learning_rate": 0.0002, + "loss": 2.3783, + "step": 15920 + }, + { + "epoch": 1.1870342771982116, + "grad_norm": 2.4351089000701904, + "learning_rate": 0.0002, + "loss": 2.347, + "step": 15930 + }, + { + "epoch": 1.187779433681073, + "grad_norm": 2.618943214416504, + "learning_rate": 0.0002, + "loss": 2.4062, + "step": 15940 + }, + { + "epoch": 1.1885245901639343, + "grad_norm": 2.5448083877563477, + "learning_rate": 0.0002, + "loss": 2.4515, + "step": 15950 + }, + { + "epoch": 1.1892697466467959, + "grad_norm": 2.431405544281006, + "learning_rate": 0.0002, + "loss": 2.4382, + "step": 15960 + }, + { + "epoch": 1.1900149031296572, + "grad_norm": 2.914207696914673, + "learning_rate": 0.0002, + "loss": 2.5043, + "step": 15970 + }, + { + "epoch": 1.1907600596125185, + "grad_norm": 2.584994316101074, + "learning_rate": 0.0002, + "loss": 2.3106, + "step": 15980 + }, + { + "epoch": 1.19150521609538, + "grad_norm": 2.518873691558838, + "learning_rate": 0.0002, + "loss": 2.3248, + "step": 15990 + }, + { + "epoch": 1.1922503725782414, + "grad_norm": 2.7757396697998047, + "learning_rate": 0.0002, + "loss": 2.2855, + "step": 16000 + }, + { + "epoch": 1.1929955290611027, + "grad_norm": 2.546560764312744, + "learning_rate": 0.0002, + "loss": 2.2436, + "step": 16010 + }, + { + "epoch": 1.1937406855439643, + "grad_norm": 2.4890432357788086, + "learning_rate": 0.0002, + "loss": 2.5493, + "step": 16020 + }, + { + "epoch": 1.1944858420268256, + "grad_norm": 2.523747682571411, + "learning_rate": 0.0002, + "loss": 2.3246, + "step": 16030 + }, + { + "epoch": 1.195230998509687, + "grad_norm": 2.861879825592041, + "learning_rate": 0.0002, + "loss": 2.3921, + "step": 16040 + }, + { + "epoch": 1.1959761549925485, + "grad_norm": 2.518486261367798, + "learning_rate": 0.0002, + "loss": 2.4388, + "step": 16050 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 2.4700870513916016, + "learning_rate": 0.0002, + "loss": 2.5634, + "step": 16060 + }, + { + "epoch": 1.1974664679582712, + "grad_norm": 2.402773141860962, + "learning_rate": 0.0002, + "loss": 2.3364, + "step": 16070 + }, + { + "epoch": 1.1982116244411327, + "grad_norm": 2.673292636871338, + "learning_rate": 0.0002, + "loss": 2.5241, + "step": 16080 + }, + { + "epoch": 1.198956780923994, + "grad_norm": 2.3342599868774414, + "learning_rate": 0.0002, + "loss": 2.56, + "step": 16090 + }, + { + "epoch": 1.1997019374068554, + "grad_norm": 2.9310660362243652, + "learning_rate": 0.0002, + "loss": 2.492, + "step": 16100 + }, + { + "epoch": 1.200447093889717, + "grad_norm": 2.380824327468872, + "learning_rate": 0.0002, + "loss": 2.511, + "step": 16110 + }, + { + "epoch": 1.2011922503725783, + "grad_norm": 2.292423963546753, + "learning_rate": 0.0002, + "loss": 2.5952, + "step": 16120 + }, + { + "epoch": 1.2019374068554396, + "grad_norm": 2.2969772815704346, + "learning_rate": 0.0002, + "loss": 2.3879, + "step": 16130 + }, + { + "epoch": 1.2026825633383011, + "grad_norm": 2.4675862789154053, + "learning_rate": 0.0002, + "loss": 2.4499, + "step": 16140 + }, + { + "epoch": 1.2034277198211625, + "grad_norm": 2.35262393951416, + "learning_rate": 0.0002, + "loss": 2.4689, + "step": 16150 + }, + { + "epoch": 1.2041728763040238, + "grad_norm": 2.7040700912475586, + "learning_rate": 0.0002, + "loss": 2.547, + "step": 16160 + }, + { + "epoch": 1.2049180327868854, + "grad_norm": 2.248189926147461, + "learning_rate": 0.0002, + "loss": 2.5511, + "step": 16170 + }, + { + "epoch": 1.2056631892697467, + "grad_norm": 2.631309747695923, + "learning_rate": 0.0002, + "loss": 2.5509, + "step": 16180 + }, + { + "epoch": 1.206408345752608, + "grad_norm": 2.66676926612854, + "learning_rate": 0.0002, + "loss": 2.555, + "step": 16190 + }, + { + "epoch": 1.2071535022354694, + "grad_norm": 2.6497623920440674, + "learning_rate": 0.0002, + "loss": 2.445, + "step": 16200 + }, + { + "epoch": 1.207898658718331, + "grad_norm": 2.9009816646575928, + "learning_rate": 0.0002, + "loss": 2.1566, + "step": 16210 + }, + { + "epoch": 1.2086438152011922, + "grad_norm": 2.3509867191314697, + "learning_rate": 0.0002, + "loss": 2.5733, + "step": 16220 + }, + { + "epoch": 1.2093889716840536, + "grad_norm": 2.4485838413238525, + "learning_rate": 0.0002, + "loss": 2.4517, + "step": 16230 + }, + { + "epoch": 1.2101341281669151, + "grad_norm": 2.4066200256347656, + "learning_rate": 0.0002, + "loss": 2.4892, + "step": 16240 + }, + { + "epoch": 1.2108792846497765, + "grad_norm": 2.5041356086730957, + "learning_rate": 0.0002, + "loss": 2.4068, + "step": 16250 + }, + { + "epoch": 1.2116244411326378, + "grad_norm": 2.457516670227051, + "learning_rate": 0.0002, + "loss": 2.4337, + "step": 16260 + }, + { + "epoch": 1.2123695976154993, + "grad_norm": 2.524627923965454, + "learning_rate": 0.0002, + "loss": 2.6584, + "step": 16270 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 2.730018138885498, + "learning_rate": 0.0002, + "loss": 2.4592, + "step": 16280 + }, + { + "epoch": 1.213859910581222, + "grad_norm": 3.1194841861724854, + "learning_rate": 0.0002, + "loss": 2.5211, + "step": 16290 + }, + { + "epoch": 1.2146050670640836, + "grad_norm": 2.484910726547241, + "learning_rate": 0.0002, + "loss": 2.4436, + "step": 16300 + }, + { + "epoch": 1.2153502235469449, + "grad_norm": 2.5972893238067627, + "learning_rate": 0.0002, + "loss": 2.5412, + "step": 16310 + }, + { + "epoch": 1.2160953800298062, + "grad_norm": 2.599231719970703, + "learning_rate": 0.0002, + "loss": 2.5554, + "step": 16320 + }, + { + "epoch": 1.2168405365126675, + "grad_norm": 2.5555241107940674, + "learning_rate": 0.0002, + "loss": 2.5618, + "step": 16330 + }, + { + "epoch": 1.217585692995529, + "grad_norm": 2.6577017307281494, + "learning_rate": 0.0002, + "loss": 2.3419, + "step": 16340 + }, + { + "epoch": 1.2183308494783904, + "grad_norm": 2.239018440246582, + "learning_rate": 0.0002, + "loss": 2.4517, + "step": 16350 + }, + { + "epoch": 1.2190760059612518, + "grad_norm": 2.2550699710845947, + "learning_rate": 0.0002, + "loss": 2.5087, + "step": 16360 + }, + { + "epoch": 1.2198211624441133, + "grad_norm": 2.7445180416107178, + "learning_rate": 0.0002, + "loss": 2.5011, + "step": 16370 + }, + { + "epoch": 1.2205663189269746, + "grad_norm": 2.637608528137207, + "learning_rate": 0.0002, + "loss": 2.3431, + "step": 16380 + }, + { + "epoch": 1.221311475409836, + "grad_norm": 2.4551427364349365, + "learning_rate": 0.0002, + "loss": 2.4485, + "step": 16390 + }, + { + "epoch": 1.2220566318926975, + "grad_norm": 2.595010757446289, + "learning_rate": 0.0002, + "loss": 2.6419, + "step": 16400 + }, + { + "epoch": 1.2228017883755589, + "grad_norm": 3.2001779079437256, + "learning_rate": 0.0002, + "loss": 2.3009, + "step": 16410 + }, + { + "epoch": 1.2235469448584202, + "grad_norm": 2.3680529594421387, + "learning_rate": 0.0002, + "loss": 2.4545, + "step": 16420 + }, + { + "epoch": 1.2242921013412817, + "grad_norm": 2.284846782684326, + "learning_rate": 0.0002, + "loss": 2.4662, + "step": 16430 + }, + { + "epoch": 1.225037257824143, + "grad_norm": 2.7528367042541504, + "learning_rate": 0.0002, + "loss": 2.504, + "step": 16440 + }, + { + "epoch": 1.2257824143070044, + "grad_norm": 2.5540902614593506, + "learning_rate": 0.0002, + "loss": 2.5621, + "step": 16450 + }, + { + "epoch": 1.226527570789866, + "grad_norm": 2.6369526386260986, + "learning_rate": 0.0002, + "loss": 2.4563, + "step": 16460 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 2.560155153274536, + "learning_rate": 0.0002, + "loss": 2.3171, + "step": 16470 + }, + { + "epoch": 1.2280178837555886, + "grad_norm": 2.8841707706451416, + "learning_rate": 0.0002, + "loss": 2.5621, + "step": 16480 + }, + { + "epoch": 1.2287630402384502, + "grad_norm": 2.587996482849121, + "learning_rate": 0.0002, + "loss": 2.5119, + "step": 16490 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 2.455828905105591, + "learning_rate": 0.0002, + "loss": 2.3992, + "step": 16500 + }, + { + "epoch": 1.2302533532041728, + "grad_norm": 2.5524632930755615, + "learning_rate": 0.0002, + "loss": 2.5209, + "step": 16510 + }, + { + "epoch": 1.2309985096870344, + "grad_norm": 2.436572551727295, + "learning_rate": 0.0002, + "loss": 2.4511, + "step": 16520 + }, + { + "epoch": 1.2317436661698957, + "grad_norm": 2.5850322246551514, + "learning_rate": 0.0002, + "loss": 2.3393, + "step": 16530 + }, + { + "epoch": 1.232488822652757, + "grad_norm": 2.884225845336914, + "learning_rate": 0.0002, + "loss": 2.4517, + "step": 16540 + }, + { + "epoch": 1.2332339791356184, + "grad_norm": 2.3954968452453613, + "learning_rate": 0.0002, + "loss": 2.5374, + "step": 16550 + }, + { + "epoch": 1.23397913561848, + "grad_norm": 2.329937696456909, + "learning_rate": 0.0002, + "loss": 2.631, + "step": 16560 + }, + { + "epoch": 1.2347242921013413, + "grad_norm": 1.706188440322876, + "learning_rate": 0.0002, + "loss": 2.3386, + "step": 16570 + }, + { + "epoch": 1.2354694485842026, + "grad_norm": 2.635439395904541, + "learning_rate": 0.0002, + "loss": 2.408, + "step": 16580 + }, + { + "epoch": 1.2362146050670642, + "grad_norm": 3.037846326828003, + "learning_rate": 0.0002, + "loss": 2.3775, + "step": 16590 + }, + { + "epoch": 1.2369597615499255, + "grad_norm": 2.435914993286133, + "learning_rate": 0.0002, + "loss": 2.4828, + "step": 16600 + }, + { + "epoch": 1.2377049180327868, + "grad_norm": 2.3864994049072266, + "learning_rate": 0.0002, + "loss": 2.307, + "step": 16610 + }, + { + "epoch": 1.2384500745156484, + "grad_norm": 2.5248770713806152, + "learning_rate": 0.0002, + "loss": 2.4023, + "step": 16620 + }, + { + "epoch": 1.2391952309985097, + "grad_norm": 2.4763107299804688, + "learning_rate": 0.0002, + "loss": 2.5354, + "step": 16630 + }, + { + "epoch": 1.239940387481371, + "grad_norm": 2.6904945373535156, + "learning_rate": 0.0002, + "loss": 2.5673, + "step": 16640 + }, + { + "epoch": 1.2406855439642326, + "grad_norm": 2.5238559246063232, + "learning_rate": 0.0002, + "loss": 2.4272, + "step": 16650 + }, + { + "epoch": 1.241430700447094, + "grad_norm": 2.451791763305664, + "learning_rate": 0.0002, + "loss": 2.5297, + "step": 16660 + }, + { + "epoch": 1.2421758569299552, + "grad_norm": 2.5765328407287598, + "learning_rate": 0.0002, + "loss": 2.4879, + "step": 16670 + }, + { + "epoch": 1.2429210134128166, + "grad_norm": 2.518141269683838, + "learning_rate": 0.0002, + "loss": 2.3585, + "step": 16680 + }, + { + "epoch": 1.2436661698956781, + "grad_norm": 2.4746904373168945, + "learning_rate": 0.0002, + "loss": 2.5626, + "step": 16690 + }, + { + "epoch": 1.2444113263785395, + "grad_norm": 2.6540675163269043, + "learning_rate": 0.0002, + "loss": 2.4304, + "step": 16700 + }, + { + "epoch": 1.2451564828614008, + "grad_norm": 1.8600475788116455, + "learning_rate": 0.0002, + "loss": 2.394, + "step": 16710 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 2.5817408561706543, + "learning_rate": 0.0002, + "loss": 2.5504, + "step": 16720 + }, + { + "epoch": 1.2466467958271237, + "grad_norm": 2.5648725032806396, + "learning_rate": 0.0002, + "loss": 2.2296, + "step": 16730 + }, + { + "epoch": 1.247391952309985, + "grad_norm": 2.600857734680176, + "learning_rate": 0.0002, + "loss": 2.3259, + "step": 16740 + }, + { + "epoch": 1.2481371087928466, + "grad_norm": 2.6139464378356934, + "learning_rate": 0.0002, + "loss": 2.4264, + "step": 16750 + }, + { + "epoch": 1.248882265275708, + "grad_norm": 2.387685537338257, + "learning_rate": 0.0002, + "loss": 2.6434, + "step": 16760 + }, + { + "epoch": 1.2496274217585692, + "grad_norm": 2.712690830230713, + "learning_rate": 0.0002, + "loss": 2.461, + "step": 16770 + }, + { + "epoch": 1.2503725782414308, + "grad_norm": 2.4673256874084473, + "learning_rate": 0.0002, + "loss": 2.4162, + "step": 16780 + }, + { + "epoch": 1.251117734724292, + "grad_norm": 2.5331919193267822, + "learning_rate": 0.0002, + "loss": 2.591, + "step": 16790 + }, + { + "epoch": 1.2518628912071534, + "grad_norm": 2.48547625541687, + "learning_rate": 0.0002, + "loss": 2.2615, + "step": 16800 + }, + { + "epoch": 1.252608047690015, + "grad_norm": 2.41300368309021, + "learning_rate": 0.0002, + "loss": 2.3431, + "step": 16810 + }, + { + "epoch": 1.2533532041728763, + "grad_norm": 2.566871166229248, + "learning_rate": 0.0002, + "loss": 2.3982, + "step": 16820 + }, + { + "epoch": 1.2540983606557377, + "grad_norm": 2.4768545627593994, + "learning_rate": 0.0002, + "loss": 2.346, + "step": 16830 + }, + { + "epoch": 1.2548435171385992, + "grad_norm": 2.677468776702881, + "learning_rate": 0.0002, + "loss": 2.2478, + "step": 16840 + }, + { + "epoch": 1.2555886736214605, + "grad_norm": 2.2552690505981445, + "learning_rate": 0.0002, + "loss": 2.5072, + "step": 16850 + }, + { + "epoch": 1.2563338301043219, + "grad_norm": 2.868969202041626, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 16860 + }, + { + "epoch": 1.2570789865871834, + "grad_norm": 2.6906239986419678, + "learning_rate": 0.0002, + "loss": 2.5686, + "step": 16870 + }, + { + "epoch": 1.2578241430700448, + "grad_norm": 2.433225393295288, + "learning_rate": 0.0002, + "loss": 2.3974, + "step": 16880 + }, + { + "epoch": 1.258569299552906, + "grad_norm": 2.37595534324646, + "learning_rate": 0.0002, + "loss": 2.4342, + "step": 16890 + }, + { + "epoch": 1.2593144560357676, + "grad_norm": 2.5448384284973145, + "learning_rate": 0.0002, + "loss": 2.3194, + "step": 16900 + }, + { + "epoch": 1.260059612518629, + "grad_norm": 2.858198404312134, + "learning_rate": 0.0002, + "loss": 2.4199, + "step": 16910 + }, + { + "epoch": 1.2608047690014903, + "grad_norm": 2.546701431274414, + "learning_rate": 0.0002, + "loss": 2.4509, + "step": 16920 + }, + { + "epoch": 1.2615499254843516, + "grad_norm": 2.5379738807678223, + "learning_rate": 0.0002, + "loss": 2.4427, + "step": 16930 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 2.6814608573913574, + "learning_rate": 0.0002, + "loss": 2.4686, + "step": 16940 + }, + { + "epoch": 1.2630402384500745, + "grad_norm": 2.3676536083221436, + "learning_rate": 0.0002, + "loss": 2.4649, + "step": 16950 + }, + { + "epoch": 1.2637853949329358, + "grad_norm": 2.789276123046875, + "learning_rate": 0.0002, + "loss": 2.3954, + "step": 16960 + }, + { + "epoch": 1.2645305514157974, + "grad_norm": 2.770944595336914, + "learning_rate": 0.0002, + "loss": 2.5665, + "step": 16970 + }, + { + "epoch": 1.2652757078986587, + "grad_norm": 2.385180950164795, + "learning_rate": 0.0002, + "loss": 2.227, + "step": 16980 + }, + { + "epoch": 1.26602086438152, + "grad_norm": 2.8469879627227783, + "learning_rate": 0.0002, + "loss": 2.2351, + "step": 16990 + }, + { + "epoch": 1.2667660208643814, + "grad_norm": 2.438333034515381, + "learning_rate": 0.0002, + "loss": 2.3147, + "step": 17000 + }, + { + "epoch": 1.267511177347243, + "grad_norm": 2.537980794906616, + "learning_rate": 0.0002, + "loss": 2.4113, + "step": 17010 + }, + { + "epoch": 1.2682563338301043, + "grad_norm": 2.5566976070404053, + "learning_rate": 0.0002, + "loss": 2.3972, + "step": 17020 + }, + { + "epoch": 1.2690014903129656, + "grad_norm": 2.5756947994232178, + "learning_rate": 0.0002, + "loss": 2.3624, + "step": 17030 + }, + { + "epoch": 1.2697466467958272, + "grad_norm": 2.6613008975982666, + "learning_rate": 0.0002, + "loss": 2.4689, + "step": 17040 + }, + { + "epoch": 1.2704918032786885, + "grad_norm": 2.802849054336548, + "learning_rate": 0.0002, + "loss": 2.3772, + "step": 17050 + }, + { + "epoch": 1.2712369597615498, + "grad_norm": 2.221499443054199, + "learning_rate": 0.0002, + "loss": 2.5116, + "step": 17060 + }, + { + "epoch": 1.2719821162444114, + "grad_norm": 2.3637337684631348, + "learning_rate": 0.0002, + "loss": 2.4486, + "step": 17070 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 2.217311382293701, + "learning_rate": 0.0002, + "loss": 2.4252, + "step": 17080 + }, + { + "epoch": 1.273472429210134, + "grad_norm": 2.36021089553833, + "learning_rate": 0.0002, + "loss": 2.5555, + "step": 17090 + }, + { + "epoch": 1.2742175856929956, + "grad_norm": 2.5092270374298096, + "learning_rate": 0.0002, + "loss": 2.298, + "step": 17100 + }, + { + "epoch": 1.274962742175857, + "grad_norm": 2.396413803100586, + "learning_rate": 0.0002, + "loss": 2.113, + "step": 17110 + }, + { + "epoch": 1.2757078986587183, + "grad_norm": 2.5738630294799805, + "learning_rate": 0.0002, + "loss": 2.2539, + "step": 17120 + }, + { + "epoch": 1.2764530551415798, + "grad_norm": 2.5101852416992188, + "learning_rate": 0.0002, + "loss": 2.3469, + "step": 17130 + }, + { + "epoch": 1.2771982116244411, + "grad_norm": 2.4591407775878906, + "learning_rate": 0.0002, + "loss": 2.6166, + "step": 17140 + }, + { + "epoch": 1.2779433681073025, + "grad_norm": 2.526982069015503, + "learning_rate": 0.0002, + "loss": 2.4245, + "step": 17150 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 2.8430566787719727, + "learning_rate": 0.0002, + "loss": 2.3319, + "step": 17160 + }, + { + "epoch": 1.2794336810730254, + "grad_norm": 3.2182416915893555, + "learning_rate": 0.0002, + "loss": 2.4737, + "step": 17170 + }, + { + "epoch": 1.2801788375558867, + "grad_norm": 1.936545491218567, + "learning_rate": 0.0002, + "loss": 2.3686, + "step": 17180 + }, + { + "epoch": 1.2809239940387482, + "grad_norm": 2.754791736602783, + "learning_rate": 0.0002, + "loss": 2.5935, + "step": 17190 + }, + { + "epoch": 1.2816691505216096, + "grad_norm": 2.615044116973877, + "learning_rate": 0.0002, + "loss": 2.1516, + "step": 17200 + }, + { + "epoch": 1.282414307004471, + "grad_norm": 2.3363049030303955, + "learning_rate": 0.0002, + "loss": 2.4082, + "step": 17210 + }, + { + "epoch": 1.2831594634873325, + "grad_norm": 2.6004695892333984, + "learning_rate": 0.0002, + "loss": 2.2609, + "step": 17220 + }, + { + "epoch": 1.2839046199701938, + "grad_norm": 2.6206114292144775, + "learning_rate": 0.0002, + "loss": 2.6122, + "step": 17230 + }, + { + "epoch": 1.2846497764530551, + "grad_norm": 2.5685789585113525, + "learning_rate": 0.0002, + "loss": 2.4993, + "step": 17240 + }, + { + "epoch": 1.2853949329359167, + "grad_norm": 2.702047824859619, + "learning_rate": 0.0002, + "loss": 2.5556, + "step": 17250 + }, + { + "epoch": 1.286140089418778, + "grad_norm": 2.7429511547088623, + "learning_rate": 0.0002, + "loss": 2.3498, + "step": 17260 + }, + { + "epoch": 1.2868852459016393, + "grad_norm": 2.4050281047821045, + "learning_rate": 0.0002, + "loss": 2.4434, + "step": 17270 + }, + { + "epoch": 1.2876304023845009, + "grad_norm": 2.3521718978881836, + "learning_rate": 0.0002, + "loss": 2.2286, + "step": 17280 + }, + { + "epoch": 1.2883755588673622, + "grad_norm": 2.6510558128356934, + "learning_rate": 0.0002, + "loss": 2.4448, + "step": 17290 + }, + { + "epoch": 1.2891207153502235, + "grad_norm": 2.949458599090576, + "learning_rate": 0.0002, + "loss": 2.6088, + "step": 17300 + }, + { + "epoch": 1.2898658718330849, + "grad_norm": 2.5707809925079346, + "learning_rate": 0.0002, + "loss": 2.5102, + "step": 17310 + }, + { + "epoch": 1.2906110283159464, + "grad_norm": 2.6018505096435547, + "learning_rate": 0.0002, + "loss": 2.5562, + "step": 17320 + }, + { + "epoch": 1.2913561847988078, + "grad_norm": 2.5191781520843506, + "learning_rate": 0.0002, + "loss": 2.2755, + "step": 17330 + }, + { + "epoch": 1.292101341281669, + "grad_norm": 2.482866048812866, + "learning_rate": 0.0002, + "loss": 2.3706, + "step": 17340 + }, + { + "epoch": 1.2928464977645304, + "grad_norm": 2.840179681777954, + "learning_rate": 0.0002, + "loss": 2.6733, + "step": 17350 + }, + { + "epoch": 1.293591654247392, + "grad_norm": 2.3511884212493896, + "learning_rate": 0.0002, + "loss": 2.1673, + "step": 17360 + }, + { + "epoch": 1.2943368107302533, + "grad_norm": 2.9184696674346924, + "learning_rate": 0.0002, + "loss": 2.4761, + "step": 17370 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 2.500131130218506, + "learning_rate": 0.0002, + "loss": 2.5081, + "step": 17380 + }, + { + "epoch": 1.2958271236959762, + "grad_norm": 2.2473769187927246, + "learning_rate": 0.0002, + "loss": 2.421, + "step": 17390 + }, + { + "epoch": 1.2965722801788375, + "grad_norm": 2.166339159011841, + "learning_rate": 0.0002, + "loss": 2.4269, + "step": 17400 + }, + { + "epoch": 1.2973174366616989, + "grad_norm": 2.546684741973877, + "learning_rate": 0.0002, + "loss": 2.3341, + "step": 17410 + }, + { + "epoch": 1.2980625931445604, + "grad_norm": 2.3546717166900635, + "learning_rate": 0.0002, + "loss": 2.2886, + "step": 17420 + }, + { + "epoch": 1.2988077496274217, + "grad_norm": 3.0054266452789307, + "learning_rate": 0.0002, + "loss": 2.5172, + "step": 17430 + }, + { + "epoch": 1.299552906110283, + "grad_norm": 2.7535972595214844, + "learning_rate": 0.0002, + "loss": 2.5921, + "step": 17440 + }, + { + "epoch": 1.3002980625931446, + "grad_norm": 2.568647861480713, + "learning_rate": 0.0002, + "loss": 2.4328, + "step": 17450 + }, + { + "epoch": 1.301043219076006, + "grad_norm": 2.550978899002075, + "learning_rate": 0.0002, + "loss": 2.4866, + "step": 17460 + }, + { + "epoch": 1.3017883755588673, + "grad_norm": 2.5628457069396973, + "learning_rate": 0.0002, + "loss": 2.4284, + "step": 17470 + }, + { + "epoch": 1.3025335320417288, + "grad_norm": 2.3512864112854004, + "learning_rate": 0.0002, + "loss": 2.4614, + "step": 17480 + }, + { + "epoch": 1.3032786885245902, + "grad_norm": 2.6441545486450195, + "learning_rate": 0.0002, + "loss": 2.4846, + "step": 17490 + }, + { + "epoch": 1.3040238450074515, + "grad_norm": 2.4340755939483643, + "learning_rate": 0.0002, + "loss": 2.2236, + "step": 17500 + }, + { + "epoch": 1.304769001490313, + "grad_norm": 2.4928324222564697, + "learning_rate": 0.0002, + "loss": 2.4082, + "step": 17510 + }, + { + "epoch": 1.3055141579731744, + "grad_norm": 2.5223214626312256, + "learning_rate": 0.0002, + "loss": 2.5462, + "step": 17520 + }, + { + "epoch": 1.3062593144560357, + "grad_norm": 2.824338674545288, + "learning_rate": 0.0002, + "loss": 2.4898, + "step": 17530 + }, + { + "epoch": 1.3070044709388973, + "grad_norm": 2.299622058868408, + "learning_rate": 0.0002, + "loss": 2.4033, + "step": 17540 + }, + { + "epoch": 1.3077496274217586, + "grad_norm": 2.4792845249176025, + "learning_rate": 0.0002, + "loss": 2.5937, + "step": 17550 + }, + { + "epoch": 1.30849478390462, + "grad_norm": 2.402050495147705, + "learning_rate": 0.0002, + "loss": 2.5102, + "step": 17560 + }, + { + "epoch": 1.3092399403874815, + "grad_norm": 2.833794593811035, + "learning_rate": 0.0002, + "loss": 2.6331, + "step": 17570 + }, + { + "epoch": 1.3099850968703428, + "grad_norm": 2.4737417697906494, + "learning_rate": 0.0002, + "loss": 2.2122, + "step": 17580 + }, + { + "epoch": 1.3107302533532041, + "grad_norm": 2.520129680633545, + "learning_rate": 0.0002, + "loss": 2.4989, + "step": 17590 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 2.310004472732544, + "learning_rate": 0.0002, + "loss": 2.3101, + "step": 17600 + }, + { + "epoch": 1.312220566318927, + "grad_norm": 2.863279342651367, + "learning_rate": 0.0002, + "loss": 2.4879, + "step": 17610 + }, + { + "epoch": 1.3129657228017884, + "grad_norm": 2.590439558029175, + "learning_rate": 0.0002, + "loss": 2.6645, + "step": 17620 + }, + { + "epoch": 1.31371087928465, + "grad_norm": 2.840513229370117, + "learning_rate": 0.0002, + "loss": 2.4547, + "step": 17630 + }, + { + "epoch": 1.3144560357675112, + "grad_norm": 2.6092474460601807, + "learning_rate": 0.0002, + "loss": 2.5813, + "step": 17640 + }, + { + "epoch": 1.3152011922503726, + "grad_norm": 2.347975969314575, + "learning_rate": 0.0002, + "loss": 2.3886, + "step": 17650 + }, + { + "epoch": 1.315946348733234, + "grad_norm": 2.4244983196258545, + "learning_rate": 0.0002, + "loss": 2.4909, + "step": 17660 + }, + { + "epoch": 1.3166915052160955, + "grad_norm": 2.1615536212921143, + "learning_rate": 0.0002, + "loss": 2.4675, + "step": 17670 + }, + { + "epoch": 1.3174366616989568, + "grad_norm": 2.350623846054077, + "learning_rate": 0.0002, + "loss": 2.4184, + "step": 17680 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 2.4277079105377197, + "learning_rate": 0.0002, + "loss": 2.5482, + "step": 17690 + }, + { + "epoch": 1.3189269746646795, + "grad_norm": 2.431518793106079, + "learning_rate": 0.0002, + "loss": 2.5813, + "step": 17700 + }, + { + "epoch": 1.319672131147541, + "grad_norm": 2.4477312564849854, + "learning_rate": 0.0002, + "loss": 2.4473, + "step": 17710 + }, + { + "epoch": 1.3204172876304023, + "grad_norm": 2.8006045818328857, + "learning_rate": 0.0002, + "loss": 2.639, + "step": 17720 + }, + { + "epoch": 1.3211624441132637, + "grad_norm": 2.362448215484619, + "learning_rate": 0.0002, + "loss": 2.3965, + "step": 17730 + }, + { + "epoch": 1.3219076005961252, + "grad_norm": 2.489379405975342, + "learning_rate": 0.0002, + "loss": 2.4664, + "step": 17740 + }, + { + "epoch": 1.3226527570789866, + "grad_norm": 2.386716365814209, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 17750 + }, + { + "epoch": 1.3233979135618479, + "grad_norm": 2.5991947650909424, + "learning_rate": 0.0002, + "loss": 2.5633, + "step": 17760 + }, + { + "epoch": 1.3241430700447094, + "grad_norm": 2.499544858932495, + "learning_rate": 0.0002, + "loss": 2.4749, + "step": 17770 + }, + { + "epoch": 1.3248882265275708, + "grad_norm": 2.4425177574157715, + "learning_rate": 0.0002, + "loss": 2.5387, + "step": 17780 + }, + { + "epoch": 1.325633383010432, + "grad_norm": 2.6338937282562256, + "learning_rate": 0.0002, + "loss": 2.4913, + "step": 17790 + }, + { + "epoch": 1.3263785394932937, + "grad_norm": 2.280900716781616, + "learning_rate": 0.0002, + "loss": 2.548, + "step": 17800 + }, + { + "epoch": 1.327123695976155, + "grad_norm": 2.5411298274993896, + "learning_rate": 0.0002, + "loss": 2.5131, + "step": 17810 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 2.546914577484131, + "learning_rate": 0.0002, + "loss": 2.667, + "step": 17820 + }, + { + "epoch": 1.3286140089418779, + "grad_norm": 2.6048712730407715, + "learning_rate": 0.0002, + "loss": 2.3578, + "step": 17830 + }, + { + "epoch": 1.3293591654247392, + "grad_norm": 2.714755058288574, + "learning_rate": 0.0002, + "loss": 2.6083, + "step": 17840 + }, + { + "epoch": 1.3301043219076005, + "grad_norm": 2.559908151626587, + "learning_rate": 0.0002, + "loss": 2.4031, + "step": 17850 + }, + { + "epoch": 1.330849478390462, + "grad_norm": 2.308744192123413, + "learning_rate": 0.0002, + "loss": 2.3735, + "step": 17860 + }, + { + "epoch": 1.3315946348733234, + "grad_norm": 2.331125497817993, + "learning_rate": 0.0002, + "loss": 2.2689, + "step": 17870 + }, + { + "epoch": 1.3323397913561847, + "grad_norm": 2.7024810314178467, + "learning_rate": 0.0002, + "loss": 2.4517, + "step": 17880 + }, + { + "epoch": 1.3330849478390463, + "grad_norm": 2.6669280529022217, + "learning_rate": 0.0002, + "loss": 2.3356, + "step": 17890 + }, + { + "epoch": 1.3338301043219076, + "grad_norm": 2.739065408706665, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 17900 + }, + { + "epoch": 1.334575260804769, + "grad_norm": 2.8512372970581055, + "learning_rate": 0.0002, + "loss": 2.5018, + "step": 17910 + }, + { + "epoch": 1.3353204172876305, + "grad_norm": 2.336021900177002, + "learning_rate": 0.0002, + "loss": 2.3453, + "step": 17920 + }, + { + "epoch": 1.3360655737704918, + "grad_norm": 2.4425575733184814, + "learning_rate": 0.0002, + "loss": 2.5346, + "step": 17930 + }, + { + "epoch": 1.3368107302533532, + "grad_norm": 3.027134656906128, + "learning_rate": 0.0002, + "loss": 2.3945, + "step": 17940 + }, + { + "epoch": 1.3375558867362147, + "grad_norm": 2.451101541519165, + "learning_rate": 0.0002, + "loss": 2.4273, + "step": 17950 + }, + { + "epoch": 1.338301043219076, + "grad_norm": 2.5687952041625977, + "learning_rate": 0.0002, + "loss": 2.5681, + "step": 17960 + }, + { + "epoch": 1.3390461997019374, + "grad_norm": 2.5672757625579834, + "learning_rate": 0.0002, + "loss": 2.5521, + "step": 17970 + }, + { + "epoch": 1.339791356184799, + "grad_norm": 2.4332363605499268, + "learning_rate": 0.0002, + "loss": 2.4988, + "step": 17980 + }, + { + "epoch": 1.3405365126676603, + "grad_norm": 2.3175182342529297, + "learning_rate": 0.0002, + "loss": 2.1908, + "step": 17990 + }, + { + "epoch": 1.3412816691505216, + "grad_norm": 2.677385091781616, + "learning_rate": 0.0002, + "loss": 2.6282, + "step": 18000 + }, + { + "epoch": 1.342026825633383, + "grad_norm": 2.5982978343963623, + "learning_rate": 0.0002, + "loss": 2.3879, + "step": 18010 + }, + { + "epoch": 1.3427719821162445, + "grad_norm": 2.657008171081543, + "learning_rate": 0.0002, + "loss": 2.5917, + "step": 18020 + }, + { + "epoch": 1.3435171385991058, + "grad_norm": 2.4030990600585938, + "learning_rate": 0.0002, + "loss": 2.4942, + "step": 18030 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 2.760948657989502, + "learning_rate": 0.0002, + "loss": 2.5241, + "step": 18040 + }, + { + "epoch": 1.3450074515648285, + "grad_norm": 2.4696218967437744, + "learning_rate": 0.0002, + "loss": 2.5653, + "step": 18050 + }, + { + "epoch": 1.34575260804769, + "grad_norm": 2.8503217697143555, + "learning_rate": 0.0002, + "loss": 2.5897, + "step": 18060 + }, + { + "epoch": 1.3464977645305514, + "grad_norm": 2.43888258934021, + "learning_rate": 0.0002, + "loss": 2.5546, + "step": 18070 + }, + { + "epoch": 1.3472429210134127, + "grad_norm": 2.212247610092163, + "learning_rate": 0.0002, + "loss": 2.2402, + "step": 18080 + }, + { + "epoch": 1.3479880774962743, + "grad_norm": 2.612032413482666, + "learning_rate": 0.0002, + "loss": 2.4855, + "step": 18090 + }, + { + "epoch": 1.3487332339791356, + "grad_norm": 1.9392924308776855, + "learning_rate": 0.0002, + "loss": 2.485, + "step": 18100 + }, + { + "epoch": 1.349478390461997, + "grad_norm": 2.618718147277832, + "learning_rate": 0.0002, + "loss": 2.4073, + "step": 18110 + }, + { + "epoch": 1.3502235469448585, + "grad_norm": 2.3610072135925293, + "learning_rate": 0.0002, + "loss": 2.5229, + "step": 18120 + }, + { + "epoch": 1.3509687034277198, + "grad_norm": 2.595275640487671, + "learning_rate": 0.0002, + "loss": 2.4553, + "step": 18130 + }, + { + "epoch": 1.3517138599105811, + "grad_norm": 2.991643190383911, + "learning_rate": 0.0002, + "loss": 2.676, + "step": 18140 + }, + { + "epoch": 1.3524590163934427, + "grad_norm": 2.907435655593872, + "learning_rate": 0.0002, + "loss": 2.5444, + "step": 18150 + }, + { + "epoch": 1.353204172876304, + "grad_norm": 2.5445618629455566, + "learning_rate": 0.0002, + "loss": 2.2842, + "step": 18160 + }, + { + "epoch": 1.3539493293591653, + "grad_norm": 2.4656434059143066, + "learning_rate": 0.0002, + "loss": 2.4092, + "step": 18170 + }, + { + "epoch": 1.354694485842027, + "grad_norm": 2.802135467529297, + "learning_rate": 0.0002, + "loss": 2.546, + "step": 18180 + }, + { + "epoch": 1.3554396423248882, + "grad_norm": 2.503904342651367, + "learning_rate": 0.0002, + "loss": 2.1597, + "step": 18190 + }, + { + "epoch": 1.3561847988077496, + "grad_norm": 2.6705873012542725, + "learning_rate": 0.0002, + "loss": 2.5067, + "step": 18200 + }, + { + "epoch": 1.3569299552906111, + "grad_norm": 2.7576780319213867, + "learning_rate": 0.0002, + "loss": 2.5998, + "step": 18210 + }, + { + "epoch": 1.3576751117734724, + "grad_norm": 2.821742296218872, + "learning_rate": 0.0002, + "loss": 2.5604, + "step": 18220 + }, + { + "epoch": 1.3584202682563338, + "grad_norm": 2.8857474327087402, + "learning_rate": 0.0002, + "loss": 2.4744, + "step": 18230 + }, + { + "epoch": 1.3591654247391953, + "grad_norm": 2.5896153450012207, + "learning_rate": 0.0002, + "loss": 2.4459, + "step": 18240 + }, + { + "epoch": 1.3599105812220567, + "grad_norm": 2.5406441688537598, + "learning_rate": 0.0002, + "loss": 2.6658, + "step": 18250 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 2.489504337310791, + "learning_rate": 0.0002, + "loss": 2.4687, + "step": 18260 + }, + { + "epoch": 1.3614008941877795, + "grad_norm": 2.5663485527038574, + "learning_rate": 0.0002, + "loss": 2.4253, + "step": 18270 + }, + { + "epoch": 1.3621460506706409, + "grad_norm": 2.5449881553649902, + "learning_rate": 0.0002, + "loss": 2.4748, + "step": 18280 + }, + { + "epoch": 1.3628912071535022, + "grad_norm": 2.5304982662200928, + "learning_rate": 0.0002, + "loss": 2.5618, + "step": 18290 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 2.8851704597473145, + "learning_rate": 0.0002, + "loss": 2.3942, + "step": 18300 + }, + { + "epoch": 1.364381520119225, + "grad_norm": 2.3670241832733154, + "learning_rate": 0.0002, + "loss": 2.5914, + "step": 18310 + }, + { + "epoch": 1.3651266766020864, + "grad_norm": 2.421011447906494, + "learning_rate": 0.0002, + "loss": 2.4694, + "step": 18320 + }, + { + "epoch": 1.365871833084948, + "grad_norm": 2.5096373558044434, + "learning_rate": 0.0002, + "loss": 2.5616, + "step": 18330 + }, + { + "epoch": 1.3666169895678093, + "grad_norm": 2.6197783946990967, + "learning_rate": 0.0002, + "loss": 2.351, + "step": 18340 + }, + { + "epoch": 1.3673621460506706, + "grad_norm": 2.3040313720703125, + "learning_rate": 0.0002, + "loss": 2.5583, + "step": 18350 + }, + { + "epoch": 1.368107302533532, + "grad_norm": 2.4461286067962646, + "learning_rate": 0.0002, + "loss": 2.5253, + "step": 18360 + }, + { + "epoch": 1.3688524590163935, + "grad_norm": 2.6195592880249023, + "learning_rate": 0.0002, + "loss": 2.4353, + "step": 18370 + }, + { + "epoch": 1.3695976154992549, + "grad_norm": 2.711920976638794, + "learning_rate": 0.0002, + "loss": 2.4055, + "step": 18380 + }, + { + "epoch": 1.3703427719821162, + "grad_norm": 2.2745280265808105, + "learning_rate": 0.0002, + "loss": 2.2759, + "step": 18390 + }, + { + "epoch": 1.3710879284649775, + "grad_norm": 2.446627140045166, + "learning_rate": 0.0002, + "loss": 2.3023, + "step": 18400 + }, + { + "epoch": 1.371833084947839, + "grad_norm": 2.3650870323181152, + "learning_rate": 0.0002, + "loss": 2.1179, + "step": 18410 + }, + { + "epoch": 1.3725782414307004, + "grad_norm": 2.3850514888763428, + "learning_rate": 0.0002, + "loss": 2.5995, + "step": 18420 + }, + { + "epoch": 1.3733233979135617, + "grad_norm": 2.4336416721343994, + "learning_rate": 0.0002, + "loss": 2.5989, + "step": 18430 + }, + { + "epoch": 1.3740685543964233, + "grad_norm": 2.4156320095062256, + "learning_rate": 0.0002, + "loss": 2.4384, + "step": 18440 + }, + { + "epoch": 1.3748137108792846, + "grad_norm": 2.216463327407837, + "learning_rate": 0.0002, + "loss": 2.288, + "step": 18450 + }, + { + "epoch": 1.375558867362146, + "grad_norm": 2.4132537841796875, + "learning_rate": 0.0002, + "loss": 2.5789, + "step": 18460 + }, + { + "epoch": 1.3763040238450075, + "grad_norm": 2.4926469326019287, + "learning_rate": 0.0002, + "loss": 2.2998, + "step": 18470 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 2.515826940536499, + "learning_rate": 0.0002, + "loss": 2.4162, + "step": 18480 + }, + { + "epoch": 1.3777943368107302, + "grad_norm": 2.26667857170105, + "learning_rate": 0.0002, + "loss": 2.5093, + "step": 18490 + }, + { + "epoch": 1.3785394932935917, + "grad_norm": 2.739278793334961, + "learning_rate": 0.0002, + "loss": 2.6155, + "step": 18500 + }, + { + "epoch": 1.379284649776453, + "grad_norm": 2.4237091541290283, + "learning_rate": 0.0002, + "loss": 2.5357, + "step": 18510 + }, + { + "epoch": 1.3800298062593144, + "grad_norm": 2.349325180053711, + "learning_rate": 0.0002, + "loss": 2.6382, + "step": 18520 + }, + { + "epoch": 1.380774962742176, + "grad_norm": 2.0295040607452393, + "learning_rate": 0.0002, + "loss": 2.2775, + "step": 18530 + }, + { + "epoch": 1.3815201192250373, + "grad_norm": 2.54569411277771, + "learning_rate": 0.0002, + "loss": 2.6035, + "step": 18540 + }, + { + "epoch": 1.3822652757078986, + "grad_norm": 2.625166416168213, + "learning_rate": 0.0002, + "loss": 2.3252, + "step": 18550 + }, + { + "epoch": 1.3830104321907601, + "grad_norm": 2.5229341983795166, + "learning_rate": 0.0002, + "loss": 2.5045, + "step": 18560 + }, + { + "epoch": 1.3837555886736215, + "grad_norm": 2.5632095336914062, + "learning_rate": 0.0002, + "loss": 2.4786, + "step": 18570 + }, + { + "epoch": 1.3845007451564828, + "grad_norm": 2.6461541652679443, + "learning_rate": 0.0002, + "loss": 2.3556, + "step": 18580 + }, + { + "epoch": 1.3852459016393444, + "grad_norm": 2.256908893585205, + "learning_rate": 0.0002, + "loss": 2.4763, + "step": 18590 + }, + { + "epoch": 1.3859910581222057, + "grad_norm": 2.930579662322998, + "learning_rate": 0.0002, + "loss": 2.3258, + "step": 18600 + }, + { + "epoch": 1.386736214605067, + "grad_norm": 2.400162696838379, + "learning_rate": 0.0002, + "loss": 2.5043, + "step": 18610 + }, + { + "epoch": 1.3874813710879286, + "grad_norm": 2.6638431549072266, + "learning_rate": 0.0002, + "loss": 2.3977, + "step": 18620 + }, + { + "epoch": 1.38822652757079, + "grad_norm": 2.60463285446167, + "learning_rate": 0.0002, + "loss": 2.4366, + "step": 18630 + }, + { + "epoch": 1.3889716840536512, + "grad_norm": 2.412717342376709, + "learning_rate": 0.0002, + "loss": 2.4246, + "step": 18640 + }, + { + "epoch": 1.3897168405365128, + "grad_norm": 3.0523712635040283, + "learning_rate": 0.0002, + "loss": 2.4677, + "step": 18650 + }, + { + "epoch": 1.3904619970193741, + "grad_norm": 2.9874138832092285, + "learning_rate": 0.0002, + "loss": 2.612, + "step": 18660 + }, + { + "epoch": 1.3912071535022354, + "grad_norm": 2.532456398010254, + "learning_rate": 0.0002, + "loss": 2.4691, + "step": 18670 + }, + { + "epoch": 1.391952309985097, + "grad_norm": 2.7136852741241455, + "learning_rate": 0.0002, + "loss": 2.4155, + "step": 18680 + }, + { + "epoch": 1.3926974664679583, + "grad_norm": 2.5134053230285645, + "learning_rate": 0.0002, + "loss": 2.5658, + "step": 18690 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 2.934316396713257, + "learning_rate": 0.0002, + "loss": 2.5088, + "step": 18700 + }, + { + "epoch": 1.394187779433681, + "grad_norm": 2.23718523979187, + "learning_rate": 0.0002, + "loss": 2.3933, + "step": 18710 + }, + { + "epoch": 1.3949329359165425, + "grad_norm": 2.1990694999694824, + "learning_rate": 0.0002, + "loss": 2.3508, + "step": 18720 + }, + { + "epoch": 1.3956780923994039, + "grad_norm": 2.4167299270629883, + "learning_rate": 0.0002, + "loss": 2.5543, + "step": 18730 + }, + { + "epoch": 1.3964232488822652, + "grad_norm": 2.468496561050415, + "learning_rate": 0.0002, + "loss": 2.4384, + "step": 18740 + }, + { + "epoch": 1.3971684053651265, + "grad_norm": 2.5203256607055664, + "learning_rate": 0.0002, + "loss": 2.4853, + "step": 18750 + }, + { + "epoch": 1.397913561847988, + "grad_norm": 2.669184446334839, + "learning_rate": 0.0002, + "loss": 2.4931, + "step": 18760 + }, + { + "epoch": 1.3986587183308494, + "grad_norm": 2.7965617179870605, + "learning_rate": 0.0002, + "loss": 2.6282, + "step": 18770 + }, + { + "epoch": 1.3994038748137108, + "grad_norm": 2.606688976287842, + "learning_rate": 0.0002, + "loss": 2.5169, + "step": 18780 + }, + { + "epoch": 1.4001490312965723, + "grad_norm": 3.333808422088623, + "learning_rate": 0.0002, + "loss": 2.4799, + "step": 18790 + }, + { + "epoch": 1.4008941877794336, + "grad_norm": 2.470353841781616, + "learning_rate": 0.0002, + "loss": 2.6148, + "step": 18800 + }, + { + "epoch": 1.401639344262295, + "grad_norm": 3.7731544971466064, + "learning_rate": 0.0002, + "loss": 2.5979, + "step": 18810 + }, + { + "epoch": 1.4023845007451565, + "grad_norm": 2.7229580879211426, + "learning_rate": 0.0002, + "loss": 2.5466, + "step": 18820 + }, + { + "epoch": 1.4031296572280179, + "grad_norm": 2.463998556137085, + "learning_rate": 0.0002, + "loss": 2.5742, + "step": 18830 + }, + { + "epoch": 1.4038748137108792, + "grad_norm": 2.2875285148620605, + "learning_rate": 0.0002, + "loss": 2.3219, + "step": 18840 + }, + { + "epoch": 1.4046199701937407, + "grad_norm": 2.479088068008423, + "learning_rate": 0.0002, + "loss": 2.2547, + "step": 18850 + }, + { + "epoch": 1.405365126676602, + "grad_norm": 2.7648444175720215, + "learning_rate": 0.0002, + "loss": 2.4328, + "step": 18860 + }, + { + "epoch": 1.4061102831594634, + "grad_norm": 2.649199962615967, + "learning_rate": 0.0002, + "loss": 2.5119, + "step": 18870 + }, + { + "epoch": 1.406855439642325, + "grad_norm": 2.6668455600738525, + "learning_rate": 0.0002, + "loss": 2.3982, + "step": 18880 + }, + { + "epoch": 1.4076005961251863, + "grad_norm": 2.207362651824951, + "learning_rate": 0.0002, + "loss": 2.5986, + "step": 18890 + }, + { + "epoch": 1.4083457526080476, + "grad_norm": 2.383084535598755, + "learning_rate": 0.0002, + "loss": 2.3486, + "step": 18900 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 2.490837574005127, + "learning_rate": 0.0002, + "loss": 2.4754, + "step": 18910 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 2.479707717895508, + "learning_rate": 0.0002, + "loss": 2.4408, + "step": 18920 + }, + { + "epoch": 1.4105812220566318, + "grad_norm": 2.9965310096740723, + "learning_rate": 0.0002, + "loss": 2.5574, + "step": 18930 + }, + { + "epoch": 1.4113263785394934, + "grad_norm": 2.707078456878662, + "learning_rate": 0.0002, + "loss": 2.4176, + "step": 18940 + }, + { + "epoch": 1.4120715350223547, + "grad_norm": 2.5538723468780518, + "learning_rate": 0.0002, + "loss": 2.4505, + "step": 18950 + }, + { + "epoch": 1.412816691505216, + "grad_norm": 2.399869441986084, + "learning_rate": 0.0002, + "loss": 2.391, + "step": 18960 + }, + { + "epoch": 1.4135618479880776, + "grad_norm": 2.150087594985962, + "learning_rate": 0.0002, + "loss": 2.5095, + "step": 18970 + }, + { + "epoch": 1.414307004470939, + "grad_norm": 2.7738189697265625, + "learning_rate": 0.0002, + "loss": 2.4873, + "step": 18980 + }, + { + "epoch": 1.4150521609538003, + "grad_norm": 2.8907742500305176, + "learning_rate": 0.0002, + "loss": 2.7251, + "step": 18990 + }, + { + "epoch": 1.4157973174366618, + "grad_norm": 2.500542640686035, + "learning_rate": 0.0002, + "loss": 2.6472, + "step": 19000 + }, + { + "epoch": 1.4165424739195231, + "grad_norm": 2.930190324783325, + "learning_rate": 0.0002, + "loss": 2.4754, + "step": 19010 + }, + { + "epoch": 1.4172876304023845, + "grad_norm": 2.717930793762207, + "learning_rate": 0.0002, + "loss": 2.5804, + "step": 19020 + }, + { + "epoch": 1.418032786885246, + "grad_norm": 2.6405129432678223, + "learning_rate": 0.0002, + "loss": 2.4247, + "step": 19030 + }, + { + "epoch": 1.4187779433681074, + "grad_norm": 2.3362009525299072, + "learning_rate": 0.0002, + "loss": 2.4234, + "step": 19040 + }, + { + "epoch": 1.4195230998509687, + "grad_norm": 2.1805837154388428, + "learning_rate": 0.0002, + "loss": 2.4447, + "step": 19050 + }, + { + "epoch": 1.42026825633383, + "grad_norm": 2.530463695526123, + "learning_rate": 0.0002, + "loss": 2.5781, + "step": 19060 + }, + { + "epoch": 1.4210134128166916, + "grad_norm": 2.360027313232422, + "learning_rate": 0.0002, + "loss": 2.4854, + "step": 19070 + }, + { + "epoch": 1.421758569299553, + "grad_norm": 2.552933931350708, + "learning_rate": 0.0002, + "loss": 2.4995, + "step": 19080 + }, + { + "epoch": 1.4225037257824142, + "grad_norm": 2.643507719039917, + "learning_rate": 0.0002, + "loss": 2.4029, + "step": 19090 + }, + { + "epoch": 1.4232488822652756, + "grad_norm": 2.3608195781707764, + "learning_rate": 0.0002, + "loss": 2.3169, + "step": 19100 + }, + { + "epoch": 1.4239940387481371, + "grad_norm": 2.282942295074463, + "learning_rate": 0.0002, + "loss": 2.632, + "step": 19110 + }, + { + "epoch": 1.4247391952309985, + "grad_norm": 2.8628063201904297, + "learning_rate": 0.0002, + "loss": 2.3114, + "step": 19120 + }, + { + "epoch": 1.4254843517138598, + "grad_norm": 2.357126235961914, + "learning_rate": 0.0002, + "loss": 2.483, + "step": 19130 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 2.1612229347229004, + "learning_rate": 0.0002, + "loss": 2.4844, + "step": 19140 + }, + { + "epoch": 1.4269746646795827, + "grad_norm": 2.248215436935425, + "learning_rate": 0.0002, + "loss": 2.3824, + "step": 19150 + }, + { + "epoch": 1.427719821162444, + "grad_norm": 1.9331451654434204, + "learning_rate": 0.0002, + "loss": 2.3143, + "step": 19160 + }, + { + "epoch": 1.4284649776453056, + "grad_norm": 2.4009180068969727, + "learning_rate": 0.0002, + "loss": 2.382, + "step": 19170 + }, + { + "epoch": 1.4292101341281669, + "grad_norm": 2.6418373584747314, + "learning_rate": 0.0002, + "loss": 2.4591, + "step": 19180 + }, + { + "epoch": 1.4299552906110282, + "grad_norm": 2.737656593322754, + "learning_rate": 0.0002, + "loss": 2.3134, + "step": 19190 + }, + { + "epoch": 1.4307004470938898, + "grad_norm": 2.2396743297576904, + "learning_rate": 0.0002, + "loss": 2.4965, + "step": 19200 + }, + { + "epoch": 1.431445603576751, + "grad_norm": 2.542550802230835, + "learning_rate": 0.0002, + "loss": 2.4396, + "step": 19210 + }, + { + "epoch": 1.4321907600596124, + "grad_norm": 2.6923890113830566, + "learning_rate": 0.0002, + "loss": 2.3213, + "step": 19220 + }, + { + "epoch": 1.432935916542474, + "grad_norm": 2.479887008666992, + "learning_rate": 0.0002, + "loss": 2.528, + "step": 19230 + }, + { + "epoch": 1.4336810730253353, + "grad_norm": 2.2909326553344727, + "learning_rate": 0.0002, + "loss": 2.4784, + "step": 19240 + }, + { + "epoch": 1.4344262295081966, + "grad_norm": 2.543410301208496, + "learning_rate": 0.0002, + "loss": 2.603, + "step": 19250 + }, + { + "epoch": 1.4351713859910582, + "grad_norm": 2.4764413833618164, + "learning_rate": 0.0002, + "loss": 2.3813, + "step": 19260 + }, + { + "epoch": 1.4359165424739195, + "grad_norm": 2.4031872749328613, + "learning_rate": 0.0002, + "loss": 2.4507, + "step": 19270 + }, + { + "epoch": 1.4366616989567809, + "grad_norm": 2.174621105194092, + "learning_rate": 0.0002, + "loss": 2.505, + "step": 19280 + }, + { + "epoch": 1.4374068554396424, + "grad_norm": 2.6859323978424072, + "learning_rate": 0.0002, + "loss": 2.5794, + "step": 19290 + }, + { + "epoch": 1.4381520119225037, + "grad_norm": 2.5702786445617676, + "learning_rate": 0.0002, + "loss": 2.429, + "step": 19300 + }, + { + "epoch": 1.438897168405365, + "grad_norm": 2.4313433170318604, + "learning_rate": 0.0002, + "loss": 2.4518, + "step": 19310 + }, + { + "epoch": 1.4396423248882266, + "grad_norm": 2.800537347793579, + "learning_rate": 0.0002, + "loss": 2.4637, + "step": 19320 + }, + { + "epoch": 1.440387481371088, + "grad_norm": 2.221001625061035, + "learning_rate": 0.0002, + "loss": 2.5614, + "step": 19330 + }, + { + "epoch": 1.4411326378539493, + "grad_norm": 2.630506753921509, + "learning_rate": 0.0002, + "loss": 2.5497, + "step": 19340 + }, + { + "epoch": 1.4418777943368108, + "grad_norm": 3.166796922683716, + "learning_rate": 0.0002, + "loss": 2.4768, + "step": 19350 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 2.3211870193481445, + "learning_rate": 0.0002, + "loss": 2.4818, + "step": 19360 + }, + { + "epoch": 1.4433681073025335, + "grad_norm": 2.428395986557007, + "learning_rate": 0.0002, + "loss": 2.2887, + "step": 19370 + }, + { + "epoch": 1.444113263785395, + "grad_norm": 2.2605369091033936, + "learning_rate": 0.0002, + "loss": 2.5042, + "step": 19380 + }, + { + "epoch": 1.4448584202682564, + "grad_norm": 2.404771089553833, + "learning_rate": 0.0002, + "loss": 2.5354, + "step": 19390 + }, + { + "epoch": 1.4456035767511177, + "grad_norm": 2.3696742057800293, + "learning_rate": 0.0002, + "loss": 2.5734, + "step": 19400 + }, + { + "epoch": 1.446348733233979, + "grad_norm": 2.0790860652923584, + "learning_rate": 0.0002, + "loss": 2.379, + "step": 19410 + }, + { + "epoch": 1.4470938897168406, + "grad_norm": 2.8864376544952393, + "learning_rate": 0.0002, + "loss": 2.5431, + "step": 19420 + }, + { + "epoch": 1.447839046199702, + "grad_norm": 2.4749741554260254, + "learning_rate": 0.0002, + "loss": 2.7492, + "step": 19430 + }, + { + "epoch": 1.4485842026825633, + "grad_norm": 2.4821410179138184, + "learning_rate": 0.0002, + "loss": 2.6195, + "step": 19440 + }, + { + "epoch": 1.4493293591654246, + "grad_norm": 2.2254769802093506, + "learning_rate": 0.0002, + "loss": 2.5504, + "step": 19450 + }, + { + "epoch": 1.4500745156482862, + "grad_norm": 2.85517954826355, + "learning_rate": 0.0002, + "loss": 2.5548, + "step": 19460 + }, + { + "epoch": 1.4508196721311475, + "grad_norm": 2.567664384841919, + "learning_rate": 0.0002, + "loss": 2.5434, + "step": 19470 + }, + { + "epoch": 1.4515648286140088, + "grad_norm": 2.842428684234619, + "learning_rate": 0.0002, + "loss": 2.6169, + "step": 19480 + }, + { + "epoch": 1.4523099850968704, + "grad_norm": 2.630206346511841, + "learning_rate": 0.0002, + "loss": 2.46, + "step": 19490 + }, + { + "epoch": 1.4530551415797317, + "grad_norm": 2.2468862533569336, + "learning_rate": 0.0002, + "loss": 2.5329, + "step": 19500 + }, + { + "epoch": 1.453800298062593, + "grad_norm": 2.688547134399414, + "learning_rate": 0.0002, + "loss": 2.4376, + "step": 19510 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 2.439633846282959, + "learning_rate": 0.0002, + "loss": 2.572, + "step": 19520 + }, + { + "epoch": 1.455290611028316, + "grad_norm": 2.374743700027466, + "learning_rate": 0.0002, + "loss": 2.579, + "step": 19530 + }, + { + "epoch": 1.4560357675111772, + "grad_norm": 2.5189223289489746, + "learning_rate": 0.0002, + "loss": 2.4794, + "step": 19540 + }, + { + "epoch": 1.4567809239940388, + "grad_norm": 2.6235687732696533, + "learning_rate": 0.0002, + "loss": 2.4074, + "step": 19550 + }, + { + "epoch": 1.4575260804769001, + "grad_norm": 2.6033565998077393, + "learning_rate": 0.0002, + "loss": 2.4671, + "step": 19560 + }, + { + "epoch": 1.4582712369597615, + "grad_norm": 2.5955042839050293, + "learning_rate": 0.0002, + "loss": 2.4777, + "step": 19570 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 2.8207786083221436, + "learning_rate": 0.0002, + "loss": 2.5766, + "step": 19580 + }, + { + "epoch": 1.4597615499254843, + "grad_norm": 2.597846269607544, + "learning_rate": 0.0002, + "loss": 2.4987, + "step": 19590 + }, + { + "epoch": 1.4605067064083457, + "grad_norm": 2.6083250045776367, + "learning_rate": 0.0002, + "loss": 2.6282, + "step": 19600 + }, + { + "epoch": 1.4612518628912072, + "grad_norm": 2.8631696701049805, + "learning_rate": 0.0002, + "loss": 2.3029, + "step": 19610 + }, + { + "epoch": 1.4619970193740686, + "grad_norm": 2.701707124710083, + "learning_rate": 0.0002, + "loss": 2.4164, + "step": 19620 + }, + { + "epoch": 1.46274217585693, + "grad_norm": 2.4737026691436768, + "learning_rate": 0.0002, + "loss": 2.4605, + "step": 19630 + }, + { + "epoch": 1.4634873323397914, + "grad_norm": 2.5299856662750244, + "learning_rate": 0.0002, + "loss": 2.2568, + "step": 19640 + }, + { + "epoch": 1.4642324888226528, + "grad_norm": 2.5376803874969482, + "learning_rate": 0.0002, + "loss": 2.4407, + "step": 19650 + }, + { + "epoch": 1.464977645305514, + "grad_norm": 2.8622679710388184, + "learning_rate": 0.0002, + "loss": 2.4903, + "step": 19660 + }, + { + "epoch": 1.4657228017883757, + "grad_norm": 2.7106306552886963, + "learning_rate": 0.0002, + "loss": 2.4316, + "step": 19670 + }, + { + "epoch": 1.466467958271237, + "grad_norm": 2.3677496910095215, + "learning_rate": 0.0002, + "loss": 2.4907, + "step": 19680 + }, + { + "epoch": 1.4672131147540983, + "grad_norm": 2.7566051483154297, + "learning_rate": 0.0002, + "loss": 2.5288, + "step": 19690 + }, + { + "epoch": 1.4679582712369599, + "grad_norm": 2.2991981506347656, + "learning_rate": 0.0002, + "loss": 2.4065, + "step": 19700 + }, + { + "epoch": 1.4687034277198212, + "grad_norm": 2.579111337661743, + "learning_rate": 0.0002, + "loss": 2.5721, + "step": 19710 + }, + { + "epoch": 1.4694485842026825, + "grad_norm": 2.5662269592285156, + "learning_rate": 0.0002, + "loss": 2.5849, + "step": 19720 + }, + { + "epoch": 1.470193740685544, + "grad_norm": 2.307398796081543, + "learning_rate": 0.0002, + "loss": 2.3272, + "step": 19730 + }, + { + "epoch": 1.4709388971684054, + "grad_norm": 2.695326566696167, + "learning_rate": 0.0002, + "loss": 2.6097, + "step": 19740 + }, + { + "epoch": 1.4716840536512668, + "grad_norm": 2.547471523284912, + "learning_rate": 0.0002, + "loss": 2.6456, + "step": 19750 + }, + { + "epoch": 1.472429210134128, + "grad_norm": 2.5473928451538086, + "learning_rate": 0.0002, + "loss": 2.6685, + "step": 19760 + }, + { + "epoch": 1.4731743666169896, + "grad_norm": 2.6228318214416504, + "learning_rate": 0.0002, + "loss": 2.3921, + "step": 19770 + }, + { + "epoch": 1.473919523099851, + "grad_norm": 2.489250659942627, + "learning_rate": 0.0002, + "loss": 2.3033, + "step": 19780 + }, + { + "epoch": 1.4746646795827123, + "grad_norm": 2.4918441772460938, + "learning_rate": 0.0002, + "loss": 2.6749, + "step": 19790 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 2.6294076442718506, + "learning_rate": 0.0002, + "loss": 2.5153, + "step": 19800 + }, + { + "epoch": 1.4761549925484352, + "grad_norm": 2.7032785415649414, + "learning_rate": 0.0002, + "loss": 2.4346, + "step": 19810 + }, + { + "epoch": 1.4769001490312965, + "grad_norm": 3.086024761199951, + "learning_rate": 0.0002, + "loss": 2.3755, + "step": 19820 + }, + { + "epoch": 1.4776453055141578, + "grad_norm": 2.7240495681762695, + "learning_rate": 0.0002, + "loss": 2.4115, + "step": 19830 + }, + { + "epoch": 1.4783904619970194, + "grad_norm": 2.3864595890045166, + "learning_rate": 0.0002, + "loss": 2.4623, + "step": 19840 + }, + { + "epoch": 1.4791356184798807, + "grad_norm": 2.517160177230835, + "learning_rate": 0.0002, + "loss": 2.2625, + "step": 19850 + }, + { + "epoch": 1.479880774962742, + "grad_norm": 2.8101651668548584, + "learning_rate": 0.0002, + "loss": 2.3999, + "step": 19860 + }, + { + "epoch": 1.4806259314456036, + "grad_norm": 2.6499056816101074, + "learning_rate": 0.0002, + "loss": 2.4251, + "step": 19870 + }, + { + "epoch": 1.481371087928465, + "grad_norm": 2.563832998275757, + "learning_rate": 0.0002, + "loss": 2.3608, + "step": 19880 + }, + { + "epoch": 1.4821162444113263, + "grad_norm": 2.905278205871582, + "learning_rate": 0.0002, + "loss": 2.4904, + "step": 19890 + }, + { + "epoch": 1.4828614008941878, + "grad_norm": 2.7486226558685303, + "learning_rate": 0.0002, + "loss": 2.5536, + "step": 19900 + }, + { + "epoch": 1.4836065573770492, + "grad_norm": 3.1326828002929688, + "learning_rate": 0.0002, + "loss": 2.4822, + "step": 19910 + }, + { + "epoch": 1.4843517138599105, + "grad_norm": 2.983778476715088, + "learning_rate": 0.0002, + "loss": 2.5207, + "step": 19920 + }, + { + "epoch": 1.485096870342772, + "grad_norm": 2.391115427017212, + "learning_rate": 0.0002, + "loss": 2.4728, + "step": 19930 + }, + { + "epoch": 1.4858420268256334, + "grad_norm": 2.789459466934204, + "learning_rate": 0.0002, + "loss": 2.3897, + "step": 19940 + }, + { + "epoch": 1.4865871833084947, + "grad_norm": 2.1479434967041016, + "learning_rate": 0.0002, + "loss": 2.4227, + "step": 19950 + }, + { + "epoch": 1.4873323397913563, + "grad_norm": 2.5585100650787354, + "learning_rate": 0.0002, + "loss": 2.5672, + "step": 19960 + }, + { + "epoch": 1.4880774962742176, + "grad_norm": 2.8484928607940674, + "learning_rate": 0.0002, + "loss": 2.5627, + "step": 19970 + }, + { + "epoch": 1.488822652757079, + "grad_norm": 2.4347646236419678, + "learning_rate": 0.0002, + "loss": 2.4704, + "step": 19980 + }, + { + "epoch": 1.4895678092399405, + "grad_norm": 2.467780351638794, + "learning_rate": 0.0002, + "loss": 2.4175, + "step": 19990 + }, + { + "epoch": 1.4903129657228018, + "grad_norm": 2.4302515983581543, + "learning_rate": 0.0002, + "loss": 2.602, + "step": 20000 + }, + { + "epoch": 1.4910581222056631, + "grad_norm": 2.5407824516296387, + "learning_rate": 0.0002, + "loss": 2.3174, + "step": 20010 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 2.509092330932617, + "learning_rate": 0.0002, + "loss": 2.4737, + "step": 20020 + }, + { + "epoch": 1.492548435171386, + "grad_norm": 2.512086868286133, + "learning_rate": 0.0002, + "loss": 2.4616, + "step": 20030 + }, + { + "epoch": 1.4932935916542474, + "grad_norm": 2.5899012088775635, + "learning_rate": 0.0002, + "loss": 2.6209, + "step": 20040 + }, + { + "epoch": 1.494038748137109, + "grad_norm": 2.5134575366973877, + "learning_rate": 0.0002, + "loss": 2.4823, + "step": 20050 + }, + { + "epoch": 1.4947839046199702, + "grad_norm": 2.4115593433380127, + "learning_rate": 0.0002, + "loss": 2.4584, + "step": 20060 + }, + { + "epoch": 1.4955290611028316, + "grad_norm": 2.5678627490997314, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 20070 + }, + { + "epoch": 1.4962742175856931, + "grad_norm": 3.107654333114624, + "learning_rate": 0.0002, + "loss": 2.3733, + "step": 20080 + }, + { + "epoch": 1.4970193740685545, + "grad_norm": 2.282750368118286, + "learning_rate": 0.0002, + "loss": 2.4918, + "step": 20090 + }, + { + "epoch": 1.4977645305514158, + "grad_norm": 2.589705228805542, + "learning_rate": 0.0002, + "loss": 2.4723, + "step": 20100 + }, + { + "epoch": 1.4985096870342771, + "grad_norm": 2.139538288116455, + "learning_rate": 0.0002, + "loss": 2.463, + "step": 20110 + }, + { + "epoch": 1.4992548435171387, + "grad_norm": 2.6076650619506836, + "learning_rate": 0.0002, + "loss": 2.4384, + "step": 20120 + }, + { + "epoch": 1.5, + "grad_norm": 2.653719186782837, + "learning_rate": 0.0002, + "loss": 2.3014, + "step": 20130 + }, + { + "epoch": 1.5007451564828616, + "grad_norm": 3.960010051727295, + "learning_rate": 0.0002, + "loss": 2.407, + "step": 20140 + }, + { + "epoch": 1.5014903129657227, + "grad_norm": 2.3678810596466064, + "learning_rate": 0.0002, + "loss": 2.3797, + "step": 20150 + }, + { + "epoch": 1.5022354694485842, + "grad_norm": 2.61295485496521, + "learning_rate": 0.0002, + "loss": 2.4066, + "step": 20160 + }, + { + "epoch": 1.5029806259314458, + "grad_norm": 2.4393184185028076, + "learning_rate": 0.0002, + "loss": 2.3545, + "step": 20170 + }, + { + "epoch": 1.5037257824143069, + "grad_norm": 2.455565929412842, + "learning_rate": 0.0002, + "loss": 2.4903, + "step": 20180 + }, + { + "epoch": 1.5044709388971684, + "grad_norm": 2.7440402507781982, + "learning_rate": 0.0002, + "loss": 2.5455, + "step": 20190 + }, + { + "epoch": 1.5052160953800298, + "grad_norm": 2.6550514698028564, + "learning_rate": 0.0002, + "loss": 2.4654, + "step": 20200 + }, + { + "epoch": 1.505961251862891, + "grad_norm": 2.5063323974609375, + "learning_rate": 0.0002, + "loss": 2.4515, + "step": 20210 + }, + { + "epoch": 1.5067064083457526, + "grad_norm": 2.9979264736175537, + "learning_rate": 0.0002, + "loss": 2.3564, + "step": 20220 + }, + { + "epoch": 1.507451564828614, + "grad_norm": 2.5606510639190674, + "learning_rate": 0.0002, + "loss": 2.5883, + "step": 20230 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 2.191575050354004, + "learning_rate": 0.0002, + "loss": 2.4795, + "step": 20240 + }, + { + "epoch": 1.5089418777943369, + "grad_norm": 2.5303499698638916, + "learning_rate": 0.0002, + "loss": 2.41, + "step": 20250 + }, + { + "epoch": 1.5096870342771982, + "grad_norm": 2.602943181991577, + "learning_rate": 0.0002, + "loss": 2.4533, + "step": 20260 + }, + { + "epoch": 1.5104321907600595, + "grad_norm": 2.667649030685425, + "learning_rate": 0.0002, + "loss": 2.5397, + "step": 20270 + }, + { + "epoch": 1.511177347242921, + "grad_norm": 2.7517497539520264, + "learning_rate": 0.0002, + "loss": 2.5885, + "step": 20280 + }, + { + "epoch": 1.5119225037257824, + "grad_norm": 2.7016963958740234, + "learning_rate": 0.0002, + "loss": 2.2389, + "step": 20290 + }, + { + "epoch": 1.5126676602086437, + "grad_norm": 3.5088577270507812, + "learning_rate": 0.0002, + "loss": 2.5337, + "step": 20300 + }, + { + "epoch": 1.5134128166915053, + "grad_norm": 2.009373188018799, + "learning_rate": 0.0002, + "loss": 2.5013, + "step": 20310 + }, + { + "epoch": 1.5141579731743666, + "grad_norm": 3.012927770614624, + "learning_rate": 0.0002, + "loss": 2.487, + "step": 20320 + }, + { + "epoch": 1.514903129657228, + "grad_norm": 2.512377977371216, + "learning_rate": 0.0002, + "loss": 2.6162, + "step": 20330 + }, + { + "epoch": 1.5156482861400895, + "grad_norm": 2.7566356658935547, + "learning_rate": 0.0002, + "loss": 2.4332, + "step": 20340 + }, + { + "epoch": 1.5163934426229508, + "grad_norm": 2.620819330215454, + "learning_rate": 0.0002, + "loss": 2.451, + "step": 20350 + }, + { + "epoch": 1.5171385991058122, + "grad_norm": 2.378424882888794, + "learning_rate": 0.0002, + "loss": 2.3835, + "step": 20360 + }, + { + "epoch": 1.5178837555886737, + "grad_norm": 2.6105260848999023, + "learning_rate": 0.0002, + "loss": 2.5465, + "step": 20370 + }, + { + "epoch": 1.518628912071535, + "grad_norm": 2.4131839275360107, + "learning_rate": 0.0002, + "loss": 2.3061, + "step": 20380 + }, + { + "epoch": 1.5193740685543964, + "grad_norm": 2.658151149749756, + "learning_rate": 0.0002, + "loss": 2.4822, + "step": 20390 + }, + { + "epoch": 1.520119225037258, + "grad_norm": 2.169219732284546, + "learning_rate": 0.0002, + "loss": 2.4033, + "step": 20400 + }, + { + "epoch": 1.5208643815201193, + "grad_norm": 2.769595146179199, + "learning_rate": 0.0002, + "loss": 2.57, + "step": 20410 + }, + { + "epoch": 1.5216095380029806, + "grad_norm": 2.6904990673065186, + "learning_rate": 0.0002, + "loss": 2.4361, + "step": 20420 + }, + { + "epoch": 1.5223546944858422, + "grad_norm": 2.26855206489563, + "learning_rate": 0.0002, + "loss": 2.4516, + "step": 20430 + }, + { + "epoch": 1.5230998509687033, + "grad_norm": 2.5474822521209717, + "learning_rate": 0.0002, + "loss": 2.4087, + "step": 20440 + }, + { + "epoch": 1.5238450074515648, + "grad_norm": 2.5763962268829346, + "learning_rate": 0.0002, + "loss": 2.5081, + "step": 20450 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 2.8625242710113525, + "learning_rate": 0.0002, + "loss": 2.5304, + "step": 20460 + }, + { + "epoch": 1.5253353204172875, + "grad_norm": 2.5730020999908447, + "learning_rate": 0.0002, + "loss": 2.5999, + "step": 20470 + }, + { + "epoch": 1.526080476900149, + "grad_norm": 2.704777717590332, + "learning_rate": 0.0002, + "loss": 2.5252, + "step": 20480 + }, + { + "epoch": 1.5268256333830106, + "grad_norm": 2.8790910243988037, + "learning_rate": 0.0002, + "loss": 2.4782, + "step": 20490 + }, + { + "epoch": 1.5275707898658717, + "grad_norm": 2.793997287750244, + "learning_rate": 0.0002, + "loss": 2.5271, + "step": 20500 + }, + { + "epoch": 1.5283159463487332, + "grad_norm": 2.4312236309051514, + "learning_rate": 0.0002, + "loss": 2.7423, + "step": 20510 + }, + { + "epoch": 1.5290611028315948, + "grad_norm": 2.508166551589966, + "learning_rate": 0.0002, + "loss": 2.5464, + "step": 20520 + }, + { + "epoch": 1.529806259314456, + "grad_norm": 2.4821505546569824, + "learning_rate": 0.0002, + "loss": 2.5888, + "step": 20530 + }, + { + "epoch": 1.5305514157973175, + "grad_norm": 2.0851380825042725, + "learning_rate": 0.0002, + "loss": 2.1732, + "step": 20540 + }, + { + "epoch": 1.5312965722801788, + "grad_norm": 2.1268575191497803, + "learning_rate": 0.0002, + "loss": 2.5131, + "step": 20550 + }, + { + "epoch": 1.5320417287630401, + "grad_norm": 3.2750792503356934, + "learning_rate": 0.0002, + "loss": 2.3624, + "step": 20560 + }, + { + "epoch": 1.5327868852459017, + "grad_norm": 2.771404266357422, + "learning_rate": 0.0002, + "loss": 2.2204, + "step": 20570 + }, + { + "epoch": 1.533532041728763, + "grad_norm": 3.1132712364196777, + "learning_rate": 0.0002, + "loss": 2.5077, + "step": 20580 + }, + { + "epoch": 1.5342771982116243, + "grad_norm": 2.637928009033203, + "learning_rate": 0.0002, + "loss": 2.5237, + "step": 20590 + }, + { + "epoch": 1.535022354694486, + "grad_norm": 3.4630746841430664, + "learning_rate": 0.0002, + "loss": 2.4241, + "step": 20600 + }, + { + "epoch": 1.5357675111773472, + "grad_norm": 3.054542303085327, + "learning_rate": 0.0002, + "loss": 2.6337, + "step": 20610 + }, + { + "epoch": 1.5365126676602086, + "grad_norm": 2.7666263580322266, + "learning_rate": 0.0002, + "loss": 2.4202, + "step": 20620 + }, + { + "epoch": 1.53725782414307, + "grad_norm": 2.8477747440338135, + "learning_rate": 0.0002, + "loss": 2.4838, + "step": 20630 + }, + { + "epoch": 1.5380029806259314, + "grad_norm": 2.61881422996521, + "learning_rate": 0.0002, + "loss": 2.2668, + "step": 20640 + }, + { + "epoch": 1.5387481371087928, + "grad_norm": 2.792616367340088, + "learning_rate": 0.0002, + "loss": 2.6354, + "step": 20650 + }, + { + "epoch": 1.5394932935916543, + "grad_norm": 2.7468929290771484, + "learning_rate": 0.0002, + "loss": 2.5306, + "step": 20660 + }, + { + "epoch": 1.5402384500745157, + "grad_norm": 2.3328466415405273, + "learning_rate": 0.0002, + "loss": 2.5575, + "step": 20670 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 2.406003475189209, + "learning_rate": 0.0002, + "loss": 2.4008, + "step": 20680 + }, + { + "epoch": 1.5417287630402385, + "grad_norm": 2.558863878250122, + "learning_rate": 0.0002, + "loss": 2.482, + "step": 20690 + }, + { + "epoch": 1.5424739195230999, + "grad_norm": 2.814772605895996, + "learning_rate": 0.0002, + "loss": 2.4789, + "step": 20700 + }, + { + "epoch": 1.5432190760059612, + "grad_norm": 2.6838860511779785, + "learning_rate": 0.0002, + "loss": 2.5793, + "step": 20710 + }, + { + "epoch": 1.5439642324888228, + "grad_norm": 2.651115655899048, + "learning_rate": 0.0002, + "loss": 2.3765, + "step": 20720 + }, + { + "epoch": 1.544709388971684, + "grad_norm": 2.7897510528564453, + "learning_rate": 0.0002, + "loss": 2.5824, + "step": 20730 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 2.5708913803100586, + "learning_rate": 0.0002, + "loss": 2.3102, + "step": 20740 + }, + { + "epoch": 1.546199701937407, + "grad_norm": 2.648242235183716, + "learning_rate": 0.0002, + "loss": 2.4887, + "step": 20750 + }, + { + "epoch": 1.5469448584202683, + "grad_norm": 2.639145612716675, + "learning_rate": 0.0002, + "loss": 2.3971, + "step": 20760 + }, + { + "epoch": 1.5476900149031296, + "grad_norm": 2.5040032863616943, + "learning_rate": 0.0002, + "loss": 2.5788, + "step": 20770 + }, + { + "epoch": 1.5484351713859912, + "grad_norm": 2.368126392364502, + "learning_rate": 0.0002, + "loss": 2.4651, + "step": 20780 + }, + { + "epoch": 1.5491803278688525, + "grad_norm": 2.808847427368164, + "learning_rate": 0.0002, + "loss": 2.4613, + "step": 20790 + }, + { + "epoch": 1.5499254843517138, + "grad_norm": 2.632382392883301, + "learning_rate": 0.0002, + "loss": 2.5921, + "step": 20800 + }, + { + "epoch": 1.5506706408345754, + "grad_norm": 2.4991965293884277, + "learning_rate": 0.0002, + "loss": 2.6584, + "step": 20810 + }, + { + "epoch": 1.5514157973174365, + "grad_norm": 2.2685933113098145, + "learning_rate": 0.0002, + "loss": 2.6257, + "step": 20820 + }, + { + "epoch": 1.552160953800298, + "grad_norm": 2.494006872177124, + "learning_rate": 0.0002, + "loss": 2.4663, + "step": 20830 + }, + { + "epoch": 1.5529061102831596, + "grad_norm": 2.848593235015869, + "learning_rate": 0.0002, + "loss": 2.6125, + "step": 20840 + }, + { + "epoch": 1.5536512667660207, + "grad_norm": 2.5735042095184326, + "learning_rate": 0.0002, + "loss": 2.3904, + "step": 20850 + }, + { + "epoch": 1.5543964232488823, + "grad_norm": 2.3776659965515137, + "learning_rate": 0.0002, + "loss": 2.397, + "step": 20860 + }, + { + "epoch": 1.5551415797317438, + "grad_norm": 2.362820863723755, + "learning_rate": 0.0002, + "loss": 2.499, + "step": 20870 + }, + { + "epoch": 1.555886736214605, + "grad_norm": 2.4711267948150635, + "learning_rate": 0.0002, + "loss": 2.366, + "step": 20880 + }, + { + "epoch": 1.5566318926974665, + "grad_norm": 2.8080945014953613, + "learning_rate": 0.0002, + "loss": 2.4686, + "step": 20890 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 2.614722967147827, + "learning_rate": 0.0002, + "loss": 2.4868, + "step": 20900 + }, + { + "epoch": 1.5581222056631892, + "grad_norm": 2.4597301483154297, + "learning_rate": 0.0002, + "loss": 2.5285, + "step": 20910 + }, + { + "epoch": 1.5588673621460507, + "grad_norm": 2.947422981262207, + "learning_rate": 0.0002, + "loss": 2.6405, + "step": 20920 + }, + { + "epoch": 1.559612518628912, + "grad_norm": 2.4457414150238037, + "learning_rate": 0.0002, + "loss": 2.6311, + "step": 20930 + }, + { + "epoch": 1.5603576751117734, + "grad_norm": 2.5841987133026123, + "learning_rate": 0.0002, + "loss": 2.605, + "step": 20940 + }, + { + "epoch": 1.561102831594635, + "grad_norm": 2.63436222076416, + "learning_rate": 0.0002, + "loss": 2.5732, + "step": 20950 + }, + { + "epoch": 1.5618479880774963, + "grad_norm": 2.898693799972534, + "learning_rate": 0.0002, + "loss": 2.5725, + "step": 20960 + }, + { + "epoch": 1.5625931445603576, + "grad_norm": 2.2288978099823, + "learning_rate": 0.0002, + "loss": 2.2325, + "step": 20970 + }, + { + "epoch": 1.5633383010432191, + "grad_norm": 2.741015672683716, + "learning_rate": 0.0002, + "loss": 2.406, + "step": 20980 + }, + { + "epoch": 1.5640834575260805, + "grad_norm": 2.3615951538085938, + "learning_rate": 0.0002, + "loss": 2.4796, + "step": 20990 + }, + { + "epoch": 1.5648286140089418, + "grad_norm": 2.332751750946045, + "learning_rate": 0.0002, + "loss": 2.3425, + "step": 21000 + }, + { + "epoch": 1.5655737704918034, + "grad_norm": 2.585552453994751, + "learning_rate": 0.0002, + "loss": 2.5401, + "step": 21010 + }, + { + "epoch": 1.5663189269746647, + "grad_norm": 2.596543550491333, + "learning_rate": 0.0002, + "loss": 2.2485, + "step": 21020 + }, + { + "epoch": 1.567064083457526, + "grad_norm": 2.5111849308013916, + "learning_rate": 0.0002, + "loss": 2.3883, + "step": 21030 + }, + { + "epoch": 1.5678092399403876, + "grad_norm": 2.949110984802246, + "learning_rate": 0.0002, + "loss": 2.626, + "step": 21040 + }, + { + "epoch": 1.568554396423249, + "grad_norm": 2.5220885276794434, + "learning_rate": 0.0002, + "loss": 2.6748, + "step": 21050 + }, + { + "epoch": 1.5692995529061102, + "grad_norm": 2.4245173931121826, + "learning_rate": 0.0002, + "loss": 2.4853, + "step": 21060 + }, + { + "epoch": 1.5700447093889718, + "grad_norm": 2.2944424152374268, + "learning_rate": 0.0002, + "loss": 2.3863, + "step": 21070 + }, + { + "epoch": 1.5707898658718331, + "grad_norm": 2.557868003845215, + "learning_rate": 0.0002, + "loss": 2.5679, + "step": 21080 + }, + { + "epoch": 1.5715350223546944, + "grad_norm": 2.418034315109253, + "learning_rate": 0.0002, + "loss": 2.3904, + "step": 21090 + }, + { + "epoch": 1.572280178837556, + "grad_norm": 2.6931450366973877, + "learning_rate": 0.0002, + "loss": 2.5675, + "step": 21100 + }, + { + "epoch": 1.5730253353204173, + "grad_norm": 2.8950247764587402, + "learning_rate": 0.0002, + "loss": 2.4783, + "step": 21110 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 2.3878729343414307, + "learning_rate": 0.0002, + "loss": 2.3789, + "step": 21120 + }, + { + "epoch": 1.5745156482861402, + "grad_norm": 2.6210811138153076, + "learning_rate": 0.0002, + "loss": 2.5938, + "step": 21130 + }, + { + "epoch": 1.5752608047690015, + "grad_norm": 2.540480852127075, + "learning_rate": 0.0002, + "loss": 2.6351, + "step": 21140 + }, + { + "epoch": 1.5760059612518629, + "grad_norm": 2.7353644371032715, + "learning_rate": 0.0002, + "loss": 2.3959, + "step": 21150 + }, + { + "epoch": 1.5767511177347244, + "grad_norm": 2.555797576904297, + "learning_rate": 0.0002, + "loss": 2.7066, + "step": 21160 + }, + { + "epoch": 1.5774962742175855, + "grad_norm": 2.2357959747314453, + "learning_rate": 0.0002, + "loss": 2.4063, + "step": 21170 + }, + { + "epoch": 1.578241430700447, + "grad_norm": 2.2032105922698975, + "learning_rate": 0.0002, + "loss": 2.5256, + "step": 21180 + }, + { + "epoch": 1.5789865871833086, + "grad_norm": 2.421905994415283, + "learning_rate": 0.0002, + "loss": 2.5101, + "step": 21190 + }, + { + "epoch": 1.5797317436661698, + "grad_norm": 2.3932013511657715, + "learning_rate": 0.0002, + "loss": 2.5003, + "step": 21200 + }, + { + "epoch": 1.5804769001490313, + "grad_norm": 2.751396656036377, + "learning_rate": 0.0002, + "loss": 2.6125, + "step": 21210 + }, + { + "epoch": 1.5812220566318929, + "grad_norm": 2.415769100189209, + "learning_rate": 0.0002, + "loss": 2.3935, + "step": 21220 + }, + { + "epoch": 1.581967213114754, + "grad_norm": 2.7542152404785156, + "learning_rate": 0.0002, + "loss": 2.5242, + "step": 21230 + }, + { + "epoch": 1.5827123695976155, + "grad_norm": 2.4142708778381348, + "learning_rate": 0.0002, + "loss": 2.5011, + "step": 21240 + }, + { + "epoch": 1.5834575260804769, + "grad_norm": 2.716956615447998, + "learning_rate": 0.0002, + "loss": 2.5144, + "step": 21250 + }, + { + "epoch": 1.5842026825633382, + "grad_norm": 2.8631787300109863, + "learning_rate": 0.0002, + "loss": 2.5118, + "step": 21260 + }, + { + "epoch": 1.5849478390461997, + "grad_norm": 2.676856517791748, + "learning_rate": 0.0002, + "loss": 2.4845, + "step": 21270 + }, + { + "epoch": 1.585692995529061, + "grad_norm": 2.477172374725342, + "learning_rate": 0.0002, + "loss": 2.5516, + "step": 21280 + }, + { + "epoch": 1.5864381520119224, + "grad_norm": 2.3883090019226074, + "learning_rate": 0.0002, + "loss": 2.5904, + "step": 21290 + }, + { + "epoch": 1.587183308494784, + "grad_norm": 2.0615110397338867, + "learning_rate": 0.0002, + "loss": 2.4803, + "step": 21300 + }, + { + "epoch": 1.5879284649776453, + "grad_norm": 2.6857569217681885, + "learning_rate": 0.0002, + "loss": 2.6149, + "step": 21310 + }, + { + "epoch": 1.5886736214605066, + "grad_norm": 2.224165678024292, + "learning_rate": 0.0002, + "loss": 2.4841, + "step": 21320 + }, + { + "epoch": 1.5894187779433682, + "grad_norm": 2.5804173946380615, + "learning_rate": 0.0002, + "loss": 2.7102, + "step": 21330 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 2.559598445892334, + "learning_rate": 0.0002, + "loss": 2.3667, + "step": 21340 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 2.5414412021636963, + "learning_rate": 0.0002, + "loss": 2.44, + "step": 21350 + }, + { + "epoch": 1.5916542473919524, + "grad_norm": 2.575948476791382, + "learning_rate": 0.0002, + "loss": 2.6272, + "step": 21360 + }, + { + "epoch": 1.5923994038748137, + "grad_norm": 2.558068037033081, + "learning_rate": 0.0002, + "loss": 2.2373, + "step": 21370 + }, + { + "epoch": 1.593144560357675, + "grad_norm": 2.6954293251037598, + "learning_rate": 0.0002, + "loss": 2.241, + "step": 21380 + }, + { + "epoch": 1.5938897168405366, + "grad_norm": 2.49922251701355, + "learning_rate": 0.0002, + "loss": 2.5755, + "step": 21390 + }, + { + "epoch": 1.594634873323398, + "grad_norm": 3.0789153575897217, + "learning_rate": 0.0002, + "loss": 2.5445, + "step": 21400 + }, + { + "epoch": 1.5953800298062593, + "grad_norm": 2.4592039585113525, + "learning_rate": 0.0002, + "loss": 2.4833, + "step": 21410 + }, + { + "epoch": 1.5961251862891208, + "grad_norm": 2.977599620819092, + "learning_rate": 0.0002, + "loss": 2.6318, + "step": 21420 + }, + { + "epoch": 1.5968703427719821, + "grad_norm": 2.868572950363159, + "learning_rate": 0.0002, + "loss": 2.6332, + "step": 21430 + }, + { + "epoch": 1.5976154992548435, + "grad_norm": 2.8400204181671143, + "learning_rate": 0.0002, + "loss": 2.6106, + "step": 21440 + }, + { + "epoch": 1.598360655737705, + "grad_norm": 3.0851080417633057, + "learning_rate": 0.0002, + "loss": 2.4113, + "step": 21450 + }, + { + "epoch": 1.5991058122205664, + "grad_norm": 2.4475607872009277, + "learning_rate": 0.0002, + "loss": 2.5, + "step": 21460 + }, + { + "epoch": 1.5998509687034277, + "grad_norm": 2.248441696166992, + "learning_rate": 0.0002, + "loss": 2.2983, + "step": 21470 + }, + { + "epoch": 1.6005961251862892, + "grad_norm": 2.1876697540283203, + "learning_rate": 0.0002, + "loss": 2.5876, + "step": 21480 + }, + { + "epoch": 1.6013412816691506, + "grad_norm": 2.766340494155884, + "learning_rate": 0.0002, + "loss": 2.6814, + "step": 21490 + }, + { + "epoch": 1.602086438152012, + "grad_norm": 2.4883816242218018, + "learning_rate": 0.0002, + "loss": 2.4444, + "step": 21500 + }, + { + "epoch": 1.6028315946348735, + "grad_norm": 2.7515087127685547, + "learning_rate": 0.0002, + "loss": 2.5638, + "step": 21510 + }, + { + "epoch": 1.6035767511177346, + "grad_norm": 2.400191068649292, + "learning_rate": 0.0002, + "loss": 2.5826, + "step": 21520 + }, + { + "epoch": 1.6043219076005961, + "grad_norm": 2.65370774269104, + "learning_rate": 0.0002, + "loss": 2.3286, + "step": 21530 + }, + { + "epoch": 1.6050670640834577, + "grad_norm": 2.525686264038086, + "learning_rate": 0.0002, + "loss": 2.5799, + "step": 21540 + }, + { + "epoch": 1.6058122205663188, + "grad_norm": 2.6483981609344482, + "learning_rate": 0.0002, + "loss": 2.4719, + "step": 21550 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 2.4052860736846924, + "learning_rate": 0.0002, + "loss": 2.6085, + "step": 21560 + }, + { + "epoch": 1.6073025335320419, + "grad_norm": 2.373023271560669, + "learning_rate": 0.0002, + "loss": 2.5449, + "step": 21570 + }, + { + "epoch": 1.608047690014903, + "grad_norm": 2.792961835861206, + "learning_rate": 0.0002, + "loss": 2.5756, + "step": 21580 + }, + { + "epoch": 1.6087928464977646, + "grad_norm": 2.024534225463867, + "learning_rate": 0.0002, + "loss": 2.4098, + "step": 21590 + }, + { + "epoch": 1.6095380029806259, + "grad_norm": 2.550271511077881, + "learning_rate": 0.0002, + "loss": 2.7197, + "step": 21600 + }, + { + "epoch": 1.6102831594634872, + "grad_norm": 2.357914686203003, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 21610 + }, + { + "epoch": 1.6110283159463488, + "grad_norm": 2.6073484420776367, + "learning_rate": 0.0002, + "loss": 2.5941, + "step": 21620 + }, + { + "epoch": 1.61177347242921, + "grad_norm": 2.532940626144409, + "learning_rate": 0.0002, + "loss": 2.469, + "step": 21630 + }, + { + "epoch": 1.6125186289120714, + "grad_norm": 2.5476560592651367, + "learning_rate": 0.0002, + "loss": 2.4917, + "step": 21640 + }, + { + "epoch": 1.613263785394933, + "grad_norm": 2.505892038345337, + "learning_rate": 0.0002, + "loss": 2.4284, + "step": 21650 + }, + { + "epoch": 1.6140089418777943, + "grad_norm": 2.5390076637268066, + "learning_rate": 0.0002, + "loss": 2.6434, + "step": 21660 + }, + { + "epoch": 1.6147540983606556, + "grad_norm": 1.909812092781067, + "learning_rate": 0.0002, + "loss": 2.3956, + "step": 21670 + }, + { + "epoch": 1.6154992548435172, + "grad_norm": 2.746105670928955, + "learning_rate": 0.0002, + "loss": 2.6643, + "step": 21680 + }, + { + "epoch": 1.6162444113263785, + "grad_norm": 2.7096972465515137, + "learning_rate": 0.0002, + "loss": 2.3153, + "step": 21690 + }, + { + "epoch": 1.6169895678092399, + "grad_norm": 2.1821792125701904, + "learning_rate": 0.0002, + "loss": 2.4432, + "step": 21700 + }, + { + "epoch": 1.6177347242921014, + "grad_norm": 2.201249599456787, + "learning_rate": 0.0002, + "loss": 2.2265, + "step": 21710 + }, + { + "epoch": 1.6184798807749627, + "grad_norm": 2.475764274597168, + "learning_rate": 0.0002, + "loss": 2.4643, + "step": 21720 + }, + { + "epoch": 1.619225037257824, + "grad_norm": 2.8647007942199707, + "learning_rate": 0.0002, + "loss": 2.4923, + "step": 21730 + }, + { + "epoch": 1.6199701937406856, + "grad_norm": 2.39050555229187, + "learning_rate": 0.0002, + "loss": 2.5228, + "step": 21740 + }, + { + "epoch": 1.620715350223547, + "grad_norm": 2.2536306381225586, + "learning_rate": 0.0002, + "loss": 2.4505, + "step": 21750 + }, + { + "epoch": 1.6214605067064083, + "grad_norm": 2.4495081901550293, + "learning_rate": 0.0002, + "loss": 2.4158, + "step": 21760 + }, + { + "epoch": 1.6222056631892698, + "grad_norm": 2.486243486404419, + "learning_rate": 0.0002, + "loss": 2.5142, + "step": 21770 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 2.2863104343414307, + "learning_rate": 0.0002, + "loss": 2.5569, + "step": 21780 + }, + { + "epoch": 1.6236959761549925, + "grad_norm": 2.1945390701293945, + "learning_rate": 0.0002, + "loss": 2.362, + "step": 21790 + }, + { + "epoch": 1.624441132637854, + "grad_norm": 2.59427809715271, + "learning_rate": 0.0002, + "loss": 2.5408, + "step": 21800 + }, + { + "epoch": 1.6251862891207154, + "grad_norm": 2.5718109607696533, + "learning_rate": 0.0002, + "loss": 2.628, + "step": 21810 + }, + { + "epoch": 1.6259314456035767, + "grad_norm": 2.5133166313171387, + "learning_rate": 0.0002, + "loss": 2.5077, + "step": 21820 + }, + { + "epoch": 1.6266766020864383, + "grad_norm": 2.313368797302246, + "learning_rate": 0.0002, + "loss": 2.5912, + "step": 21830 + }, + { + "epoch": 1.6274217585692996, + "grad_norm": 2.2794532775878906, + "learning_rate": 0.0002, + "loss": 2.3284, + "step": 21840 + }, + { + "epoch": 1.628166915052161, + "grad_norm": 2.508225202560425, + "learning_rate": 0.0002, + "loss": 2.5363, + "step": 21850 + }, + { + "epoch": 1.6289120715350225, + "grad_norm": 2.6698861122131348, + "learning_rate": 0.0002, + "loss": 2.4758, + "step": 21860 + }, + { + "epoch": 1.6296572280178836, + "grad_norm": 2.500823497772217, + "learning_rate": 0.0002, + "loss": 2.3382, + "step": 21870 + }, + { + "epoch": 1.6304023845007451, + "grad_norm": 2.7005372047424316, + "learning_rate": 0.0002, + "loss": 2.281, + "step": 21880 + }, + { + "epoch": 1.6311475409836067, + "grad_norm": 2.6699464321136475, + "learning_rate": 0.0002, + "loss": 2.5618, + "step": 21890 + }, + { + "epoch": 1.6318926974664678, + "grad_norm": 2.852407932281494, + "learning_rate": 0.0002, + "loss": 2.5993, + "step": 21900 + }, + { + "epoch": 1.6326378539493294, + "grad_norm": 2.408608913421631, + "learning_rate": 0.0002, + "loss": 2.5603, + "step": 21910 + }, + { + "epoch": 1.633383010432191, + "grad_norm": 2.828063726425171, + "learning_rate": 0.0002, + "loss": 2.5029, + "step": 21920 + }, + { + "epoch": 1.634128166915052, + "grad_norm": 2.4551167488098145, + "learning_rate": 0.0002, + "loss": 2.3027, + "step": 21930 + }, + { + "epoch": 1.6348733233979136, + "grad_norm": 2.5409445762634277, + "learning_rate": 0.0002, + "loss": 2.4344, + "step": 21940 + }, + { + "epoch": 1.635618479880775, + "grad_norm": 2.7620127201080322, + "learning_rate": 0.0002, + "loss": 2.6531, + "step": 21950 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 2.699997663497925, + "learning_rate": 0.0002, + "loss": 2.4624, + "step": 21960 + }, + { + "epoch": 1.6371087928464978, + "grad_norm": 2.9198567867279053, + "learning_rate": 0.0002, + "loss": 2.5269, + "step": 21970 + }, + { + "epoch": 1.6378539493293591, + "grad_norm": 2.7812252044677734, + "learning_rate": 0.0002, + "loss": 2.5048, + "step": 21980 + }, + { + "epoch": 1.6385991058122205, + "grad_norm": 2.7388384342193604, + "learning_rate": 0.0002, + "loss": 2.2389, + "step": 21990 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 2.064328670501709, + "learning_rate": 0.0002, + "loss": 2.4192, + "step": 22000 + }, + { + "epoch": 1.6400894187779433, + "grad_norm": 2.4596285820007324, + "learning_rate": 0.0002, + "loss": 2.543, + "step": 22010 + }, + { + "epoch": 1.6408345752608047, + "grad_norm": 2.510390043258667, + "learning_rate": 0.0002, + "loss": 2.3457, + "step": 22020 + }, + { + "epoch": 1.6415797317436662, + "grad_norm": 2.2899527549743652, + "learning_rate": 0.0002, + "loss": 2.523, + "step": 22030 + }, + { + "epoch": 1.6423248882265276, + "grad_norm": 2.8313241004943848, + "learning_rate": 0.0002, + "loss": 2.4442, + "step": 22040 + }, + { + "epoch": 1.6430700447093889, + "grad_norm": 2.6460535526275635, + "learning_rate": 0.0002, + "loss": 2.4327, + "step": 22050 + }, + { + "epoch": 1.6438152011922504, + "grad_norm": 2.5774576663970947, + "learning_rate": 0.0002, + "loss": 2.6181, + "step": 22060 + }, + { + "epoch": 1.6445603576751118, + "grad_norm": 2.517756700515747, + "learning_rate": 0.0002, + "loss": 2.5624, + "step": 22070 + }, + { + "epoch": 1.645305514157973, + "grad_norm": 2.382204055786133, + "learning_rate": 0.0002, + "loss": 2.6089, + "step": 22080 + }, + { + "epoch": 1.6460506706408347, + "grad_norm": 2.5505146980285645, + "learning_rate": 0.0002, + "loss": 2.5738, + "step": 22090 + }, + { + "epoch": 1.646795827123696, + "grad_norm": 3.1729090213775635, + "learning_rate": 0.0002, + "loss": 2.5716, + "step": 22100 + }, + { + "epoch": 1.6475409836065573, + "grad_norm": 2.7010695934295654, + "learning_rate": 0.0002, + "loss": 2.346, + "step": 22110 + }, + { + "epoch": 1.6482861400894189, + "grad_norm": 2.2861719131469727, + "learning_rate": 0.0002, + "loss": 2.4943, + "step": 22120 + }, + { + "epoch": 1.6490312965722802, + "grad_norm": 2.2667644023895264, + "learning_rate": 0.0002, + "loss": 2.5656, + "step": 22130 + }, + { + "epoch": 1.6497764530551415, + "grad_norm": 2.9026103019714355, + "learning_rate": 0.0002, + "loss": 2.5225, + "step": 22140 + }, + { + "epoch": 1.650521609538003, + "grad_norm": 2.4847819805145264, + "learning_rate": 0.0002, + "loss": 2.5498, + "step": 22150 + }, + { + "epoch": 1.6512667660208644, + "grad_norm": 2.3765196800231934, + "learning_rate": 0.0002, + "loss": 2.4714, + "step": 22160 + }, + { + "epoch": 1.6520119225037257, + "grad_norm": 2.203185796737671, + "learning_rate": 0.0002, + "loss": 2.5005, + "step": 22170 + }, + { + "epoch": 1.6527570789865873, + "grad_norm": 2.480102300643921, + "learning_rate": 0.0002, + "loss": 2.5686, + "step": 22180 + }, + { + "epoch": 1.6535022354694486, + "grad_norm": 2.299968957901001, + "learning_rate": 0.0002, + "loss": 2.5814, + "step": 22190 + }, + { + "epoch": 1.65424739195231, + "grad_norm": 3.0211429595947266, + "learning_rate": 0.0002, + "loss": 2.6236, + "step": 22200 + }, + { + "epoch": 1.6549925484351715, + "grad_norm": 2.7016682624816895, + "learning_rate": 0.0002, + "loss": 2.5385, + "step": 22210 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 2.5749051570892334, + "learning_rate": 0.0002, + "loss": 2.4144, + "step": 22220 + }, + { + "epoch": 1.6564828614008942, + "grad_norm": 2.0706841945648193, + "learning_rate": 0.0002, + "loss": 2.4751, + "step": 22230 + }, + { + "epoch": 1.6572280178837557, + "grad_norm": 2.4944183826446533, + "learning_rate": 0.0002, + "loss": 2.5371, + "step": 22240 + }, + { + "epoch": 1.6579731743666168, + "grad_norm": 2.600158214569092, + "learning_rate": 0.0002, + "loss": 2.5388, + "step": 22250 + }, + { + "epoch": 1.6587183308494784, + "grad_norm": 2.428926706314087, + "learning_rate": 0.0002, + "loss": 2.4767, + "step": 22260 + }, + { + "epoch": 1.65946348733234, + "grad_norm": 2.805734395980835, + "learning_rate": 0.0002, + "loss": 2.4908, + "step": 22270 + }, + { + "epoch": 1.660208643815201, + "grad_norm": 2.7497949600219727, + "learning_rate": 0.0002, + "loss": 2.4993, + "step": 22280 + }, + { + "epoch": 1.6609538002980626, + "grad_norm": 2.559528350830078, + "learning_rate": 0.0002, + "loss": 2.2774, + "step": 22290 + }, + { + "epoch": 1.661698956780924, + "grad_norm": 2.5140788555145264, + "learning_rate": 0.0002, + "loss": 2.5504, + "step": 22300 + }, + { + "epoch": 1.6624441132637853, + "grad_norm": 2.3234243392944336, + "learning_rate": 0.0002, + "loss": 2.6684, + "step": 22310 + }, + { + "epoch": 1.6631892697466468, + "grad_norm": 2.3977694511413574, + "learning_rate": 0.0002, + "loss": 2.7416, + "step": 22320 + }, + { + "epoch": 1.6639344262295082, + "grad_norm": 2.2138288021087646, + "learning_rate": 0.0002, + "loss": 2.2312, + "step": 22330 + }, + { + "epoch": 1.6646795827123695, + "grad_norm": 2.4575119018554688, + "learning_rate": 0.0002, + "loss": 2.549, + "step": 22340 + }, + { + "epoch": 1.665424739195231, + "grad_norm": 2.426833152770996, + "learning_rate": 0.0002, + "loss": 2.449, + "step": 22350 + }, + { + "epoch": 1.6661698956780924, + "grad_norm": 3.055542230606079, + "learning_rate": 0.0002, + "loss": 2.4273, + "step": 22360 + }, + { + "epoch": 1.6669150521609537, + "grad_norm": 2.5469939708709717, + "learning_rate": 0.0002, + "loss": 2.6418, + "step": 22370 + }, + { + "epoch": 1.6676602086438153, + "grad_norm": 2.537905216217041, + "learning_rate": 0.0002, + "loss": 2.5369, + "step": 22380 + }, + { + "epoch": 1.6684053651266766, + "grad_norm": 2.696962833404541, + "learning_rate": 0.0002, + "loss": 2.6064, + "step": 22390 + }, + { + "epoch": 1.669150521609538, + "grad_norm": 2.4675662517547607, + "learning_rate": 0.0002, + "loss": 2.515, + "step": 22400 + }, + { + "epoch": 1.6698956780923995, + "grad_norm": 2.486266851425171, + "learning_rate": 0.0002, + "loss": 2.4349, + "step": 22410 + }, + { + "epoch": 1.6706408345752608, + "grad_norm": 2.529770612716675, + "learning_rate": 0.0002, + "loss": 2.3911, + "step": 22420 + }, + { + "epoch": 1.6713859910581221, + "grad_norm": 2.3345956802368164, + "learning_rate": 0.0002, + "loss": 2.5192, + "step": 22430 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 2.8987648487091064, + "learning_rate": 0.0002, + "loss": 2.355, + "step": 22440 + }, + { + "epoch": 1.672876304023845, + "grad_norm": 2.4978036880493164, + "learning_rate": 0.0002, + "loss": 2.5969, + "step": 22450 + }, + { + "epoch": 1.6736214605067063, + "grad_norm": 2.2257959842681885, + "learning_rate": 0.0002, + "loss": 2.5473, + "step": 22460 + }, + { + "epoch": 1.674366616989568, + "grad_norm": 2.3007636070251465, + "learning_rate": 0.0002, + "loss": 2.5016, + "step": 22470 + }, + { + "epoch": 1.6751117734724292, + "grad_norm": 2.6325652599334717, + "learning_rate": 0.0002, + "loss": 2.4148, + "step": 22480 + }, + { + "epoch": 1.6758569299552906, + "grad_norm": 2.6435465812683105, + "learning_rate": 0.0002, + "loss": 2.5434, + "step": 22490 + }, + { + "epoch": 1.6766020864381521, + "grad_norm": 2.9889986515045166, + "learning_rate": 0.0002, + "loss": 2.6347, + "step": 22500 + }, + { + "epoch": 1.6773472429210134, + "grad_norm": 3.051703691482544, + "learning_rate": 0.0002, + "loss": 2.5085, + "step": 22510 + }, + { + "epoch": 1.6780923994038748, + "grad_norm": 2.7691986560821533, + "learning_rate": 0.0002, + "loss": 2.4834, + "step": 22520 + }, + { + "epoch": 1.6788375558867363, + "grad_norm": 2.565810441970825, + "learning_rate": 0.0002, + "loss": 2.4304, + "step": 22530 + }, + { + "epoch": 1.6795827123695977, + "grad_norm": 2.5967001914978027, + "learning_rate": 0.0002, + "loss": 2.4379, + "step": 22540 + }, + { + "epoch": 1.680327868852459, + "grad_norm": 1.9632105827331543, + "learning_rate": 0.0002, + "loss": 2.2073, + "step": 22550 + }, + { + "epoch": 1.6810730253353205, + "grad_norm": 2.458916664123535, + "learning_rate": 0.0002, + "loss": 2.443, + "step": 22560 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 2.67900013923645, + "learning_rate": 0.0002, + "loss": 2.6422, + "step": 22570 + }, + { + "epoch": 1.6825633383010432, + "grad_norm": 2.3862104415893555, + "learning_rate": 0.0002, + "loss": 2.5824, + "step": 22580 + }, + { + "epoch": 1.6833084947839048, + "grad_norm": 2.43456768989563, + "learning_rate": 0.0002, + "loss": 2.4033, + "step": 22590 + }, + { + "epoch": 1.6840536512667659, + "grad_norm": 2.6378731727600098, + "learning_rate": 0.0002, + "loss": 2.511, + "step": 22600 + }, + { + "epoch": 1.6847988077496274, + "grad_norm": 2.612466335296631, + "learning_rate": 0.0002, + "loss": 2.5721, + "step": 22610 + }, + { + "epoch": 1.685543964232489, + "grad_norm": 2.5247201919555664, + "learning_rate": 0.0002, + "loss": 2.2889, + "step": 22620 + }, + { + "epoch": 1.68628912071535, + "grad_norm": 2.7932889461517334, + "learning_rate": 0.0002, + "loss": 2.5143, + "step": 22630 + }, + { + "epoch": 1.6870342771982116, + "grad_norm": 2.531528949737549, + "learning_rate": 0.0002, + "loss": 2.6344, + "step": 22640 + }, + { + "epoch": 1.687779433681073, + "grad_norm": 2.5633575916290283, + "learning_rate": 0.0002, + "loss": 2.6162, + "step": 22650 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 2.114488124847412, + "learning_rate": 0.0002, + "loss": 2.5851, + "step": 22660 + }, + { + "epoch": 1.6892697466467959, + "grad_norm": 2.478349208831787, + "learning_rate": 0.0002, + "loss": 2.4753, + "step": 22670 + }, + { + "epoch": 1.6900149031296572, + "grad_norm": 2.538219690322876, + "learning_rate": 0.0002, + "loss": 2.5749, + "step": 22680 + }, + { + "epoch": 1.6907600596125185, + "grad_norm": 2.557431221008301, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 22690 + }, + { + "epoch": 1.69150521609538, + "grad_norm": 2.831338882446289, + "learning_rate": 0.0002, + "loss": 2.7184, + "step": 22700 + }, + { + "epoch": 1.6922503725782414, + "grad_norm": 2.0015451908111572, + "learning_rate": 0.0002, + "loss": 2.4181, + "step": 22710 + }, + { + "epoch": 1.6929955290611027, + "grad_norm": 2.3533787727355957, + "learning_rate": 0.0002, + "loss": 2.3283, + "step": 22720 + }, + { + "epoch": 1.6937406855439643, + "grad_norm": 2.209768295288086, + "learning_rate": 0.0002, + "loss": 2.5522, + "step": 22730 + }, + { + "epoch": 1.6944858420268256, + "grad_norm": 2.5806851387023926, + "learning_rate": 0.0002, + "loss": 2.6221, + "step": 22740 + }, + { + "epoch": 1.695230998509687, + "grad_norm": 2.6998038291931152, + "learning_rate": 0.0002, + "loss": 2.5525, + "step": 22750 + }, + { + "epoch": 1.6959761549925485, + "grad_norm": 2.318673610687256, + "learning_rate": 0.0002, + "loss": 2.394, + "step": 22760 + }, + { + "epoch": 1.6967213114754098, + "grad_norm": 1.767236590385437, + "learning_rate": 0.0002, + "loss": 2.2679, + "step": 22770 + }, + { + "epoch": 1.6974664679582712, + "grad_norm": 2.5516786575317383, + "learning_rate": 0.0002, + "loss": 2.4601, + "step": 22780 + }, + { + "epoch": 1.6982116244411327, + "grad_norm": 2.5691072940826416, + "learning_rate": 0.0002, + "loss": 2.4903, + "step": 22790 + }, + { + "epoch": 1.698956780923994, + "grad_norm": 2.6032519340515137, + "learning_rate": 0.0002, + "loss": 2.5091, + "step": 22800 + }, + { + "epoch": 1.6997019374068554, + "grad_norm": 2.57243013381958, + "learning_rate": 0.0002, + "loss": 2.3827, + "step": 22810 + }, + { + "epoch": 1.700447093889717, + "grad_norm": 2.560321092605591, + "learning_rate": 0.0002, + "loss": 2.587, + "step": 22820 + }, + { + "epoch": 1.7011922503725783, + "grad_norm": 2.3471126556396484, + "learning_rate": 0.0002, + "loss": 2.5432, + "step": 22830 + }, + { + "epoch": 1.7019374068554396, + "grad_norm": 2.7333359718322754, + "learning_rate": 0.0002, + "loss": 2.4153, + "step": 22840 + }, + { + "epoch": 1.7026825633383011, + "grad_norm": 2.481062173843384, + "learning_rate": 0.0002, + "loss": 2.6483, + "step": 22850 + }, + { + "epoch": 1.7034277198211625, + "grad_norm": 2.814061164855957, + "learning_rate": 0.0002, + "loss": 2.5051, + "step": 22860 + }, + { + "epoch": 1.7041728763040238, + "grad_norm": 2.8423242568969727, + "learning_rate": 0.0002, + "loss": 2.6922, + "step": 22870 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 2.553450107574463, + "learning_rate": 0.0002, + "loss": 2.5812, + "step": 22880 + }, + { + "epoch": 1.7056631892697467, + "grad_norm": 2.6540987491607666, + "learning_rate": 0.0002, + "loss": 2.5114, + "step": 22890 + }, + { + "epoch": 1.706408345752608, + "grad_norm": 2.4300739765167236, + "learning_rate": 0.0002, + "loss": 2.6493, + "step": 22900 + }, + { + "epoch": 1.7071535022354696, + "grad_norm": 2.4339499473571777, + "learning_rate": 0.0002, + "loss": 2.5105, + "step": 22910 + }, + { + "epoch": 1.7078986587183307, + "grad_norm": 2.5802414417266846, + "learning_rate": 0.0002, + "loss": 2.4289, + "step": 22920 + }, + { + "epoch": 1.7086438152011922, + "grad_norm": 2.7700533866882324, + "learning_rate": 0.0002, + "loss": 2.6301, + "step": 22930 + }, + { + "epoch": 1.7093889716840538, + "grad_norm": 2.8011083602905273, + "learning_rate": 0.0002, + "loss": 2.3991, + "step": 22940 + }, + { + "epoch": 1.710134128166915, + "grad_norm": 2.746286630630493, + "learning_rate": 0.0002, + "loss": 2.396, + "step": 22950 + }, + { + "epoch": 1.7108792846497765, + "grad_norm": 2.3285419940948486, + "learning_rate": 0.0002, + "loss": 2.536, + "step": 22960 + }, + { + "epoch": 1.711624441132638, + "grad_norm": 2.7214415073394775, + "learning_rate": 0.0002, + "loss": 2.5631, + "step": 22970 + }, + { + "epoch": 1.7123695976154991, + "grad_norm": 2.2611494064331055, + "learning_rate": 0.0002, + "loss": 2.4555, + "step": 22980 + }, + { + "epoch": 1.7131147540983607, + "grad_norm": 2.569819688796997, + "learning_rate": 0.0002, + "loss": 2.4973, + "step": 22990 + }, + { + "epoch": 1.713859910581222, + "grad_norm": 2.3823273181915283, + "learning_rate": 0.0002, + "loss": 2.5048, + "step": 23000 + }, + { + "epoch": 1.7146050670640833, + "grad_norm": 2.369661569595337, + "learning_rate": 0.0002, + "loss": 2.351, + "step": 23010 + }, + { + "epoch": 1.7153502235469449, + "grad_norm": 2.551729679107666, + "learning_rate": 0.0002, + "loss": 2.5331, + "step": 23020 + }, + { + "epoch": 1.7160953800298062, + "grad_norm": 2.7406301498413086, + "learning_rate": 0.0002, + "loss": 2.3774, + "step": 23030 + }, + { + "epoch": 1.7168405365126675, + "grad_norm": 3.1042418479919434, + "learning_rate": 0.0002, + "loss": 2.5131, + "step": 23040 + }, + { + "epoch": 1.717585692995529, + "grad_norm": 2.6012461185455322, + "learning_rate": 0.0002, + "loss": 2.635, + "step": 23050 + }, + { + "epoch": 1.7183308494783904, + "grad_norm": 2.7491185665130615, + "learning_rate": 0.0002, + "loss": 2.5063, + "step": 23060 + }, + { + "epoch": 1.7190760059612518, + "grad_norm": 2.6317410469055176, + "learning_rate": 0.0002, + "loss": 2.6614, + "step": 23070 + }, + { + "epoch": 1.7198211624441133, + "grad_norm": 2.699126958847046, + "learning_rate": 0.0002, + "loss": 2.6547, + "step": 23080 + }, + { + "epoch": 1.7205663189269746, + "grad_norm": 2.882514715194702, + "learning_rate": 0.0002, + "loss": 2.5183, + "step": 23090 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 2.217384099960327, + "learning_rate": 0.0002, + "loss": 2.4662, + "step": 23100 + }, + { + "epoch": 1.7220566318926975, + "grad_norm": 2.645479679107666, + "learning_rate": 0.0002, + "loss": 2.5276, + "step": 23110 + }, + { + "epoch": 1.7228017883755589, + "grad_norm": 3.0477075576782227, + "learning_rate": 0.0002, + "loss": 2.5021, + "step": 23120 + }, + { + "epoch": 1.7235469448584202, + "grad_norm": 2.56258225440979, + "learning_rate": 0.0002, + "loss": 2.5864, + "step": 23130 + }, + { + "epoch": 1.7242921013412817, + "grad_norm": 2.4579217433929443, + "learning_rate": 0.0002, + "loss": 2.5179, + "step": 23140 + }, + { + "epoch": 1.725037257824143, + "grad_norm": 2.761368989944458, + "learning_rate": 0.0002, + "loss": 2.4578, + "step": 23150 + }, + { + "epoch": 1.7257824143070044, + "grad_norm": 2.7409048080444336, + "learning_rate": 0.0002, + "loss": 2.5305, + "step": 23160 + }, + { + "epoch": 1.726527570789866, + "grad_norm": 2.3490233421325684, + "learning_rate": 0.0002, + "loss": 2.4945, + "step": 23170 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 2.397792339324951, + "learning_rate": 0.0002, + "loss": 2.5054, + "step": 23180 + }, + { + "epoch": 1.7280178837555886, + "grad_norm": 2.8195900917053223, + "learning_rate": 0.0002, + "loss": 2.5461, + "step": 23190 + }, + { + "epoch": 1.7287630402384502, + "grad_norm": 2.2869858741760254, + "learning_rate": 0.0002, + "loss": 2.4762, + "step": 23200 + }, + { + "epoch": 1.7295081967213115, + "grad_norm": 2.136507272720337, + "learning_rate": 0.0002, + "loss": 2.5111, + "step": 23210 + }, + { + "epoch": 1.7302533532041728, + "grad_norm": 2.2523751258850098, + "learning_rate": 0.0002, + "loss": 2.3777, + "step": 23220 + }, + { + "epoch": 1.7309985096870344, + "grad_norm": 2.7627720832824707, + "learning_rate": 0.0002, + "loss": 2.3795, + "step": 23230 + }, + { + "epoch": 1.7317436661698957, + "grad_norm": 2.663520336151123, + "learning_rate": 0.0002, + "loss": 2.4765, + "step": 23240 + }, + { + "epoch": 1.732488822652757, + "grad_norm": 2.7753946781158447, + "learning_rate": 0.0002, + "loss": 2.6572, + "step": 23250 + }, + { + "epoch": 1.7332339791356186, + "grad_norm": 2.697758436203003, + "learning_rate": 0.0002, + "loss": 2.5019, + "step": 23260 + }, + { + "epoch": 1.7339791356184797, + "grad_norm": 2.4751906394958496, + "learning_rate": 0.0002, + "loss": 2.5409, + "step": 23270 + }, + { + "epoch": 1.7347242921013413, + "grad_norm": 2.4185609817504883, + "learning_rate": 0.0002, + "loss": 2.5397, + "step": 23280 + }, + { + "epoch": 1.7354694485842028, + "grad_norm": 2.462890148162842, + "learning_rate": 0.0002, + "loss": 2.3603, + "step": 23290 + }, + { + "epoch": 1.736214605067064, + "grad_norm": 2.575814723968506, + "learning_rate": 0.0002, + "loss": 2.6263, + "step": 23300 + }, + { + "epoch": 1.7369597615499255, + "grad_norm": 2.7433903217315674, + "learning_rate": 0.0002, + "loss": 2.3266, + "step": 23310 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 2.4978883266448975, + "learning_rate": 0.0002, + "loss": 2.4926, + "step": 23320 + }, + { + "epoch": 1.7384500745156481, + "grad_norm": 2.297142744064331, + "learning_rate": 0.0002, + "loss": 2.6361, + "step": 23330 + }, + { + "epoch": 1.7391952309985097, + "grad_norm": 2.436873435974121, + "learning_rate": 0.0002, + "loss": 2.4963, + "step": 23340 + }, + { + "epoch": 1.7399403874813713, + "grad_norm": 2.3684535026550293, + "learning_rate": 0.0002, + "loss": 2.451, + "step": 23350 + }, + { + "epoch": 1.7406855439642324, + "grad_norm": 2.752638101577759, + "learning_rate": 0.0002, + "loss": 2.3857, + "step": 23360 + }, + { + "epoch": 1.741430700447094, + "grad_norm": 3.9165122509002686, + "learning_rate": 0.0002, + "loss": 2.552, + "step": 23370 + }, + { + "epoch": 1.7421758569299552, + "grad_norm": 2.561709403991699, + "learning_rate": 0.0002, + "loss": 2.5888, + "step": 23380 + }, + { + "epoch": 1.7429210134128166, + "grad_norm": 2.5623152256011963, + "learning_rate": 0.0002, + "loss": 2.3984, + "step": 23390 + }, + { + "epoch": 1.7436661698956781, + "grad_norm": 2.3049354553222656, + "learning_rate": 0.0002, + "loss": 2.3511, + "step": 23400 + }, + { + "epoch": 1.7444113263785395, + "grad_norm": 2.6003150939941406, + "learning_rate": 0.0002, + "loss": 2.4049, + "step": 23410 + }, + { + "epoch": 1.7451564828614008, + "grad_norm": 2.4863944053649902, + "learning_rate": 0.0002, + "loss": 2.3698, + "step": 23420 + }, + { + "epoch": 1.7459016393442623, + "grad_norm": 2.313157558441162, + "learning_rate": 0.0002, + "loss": 2.489, + "step": 23430 + }, + { + "epoch": 1.7466467958271237, + "grad_norm": 2.4840941429138184, + "learning_rate": 0.0002, + "loss": 2.4372, + "step": 23440 + }, + { + "epoch": 1.747391952309985, + "grad_norm": 2.6886258125305176, + "learning_rate": 0.0002, + "loss": 2.4685, + "step": 23450 + }, + { + "epoch": 1.7481371087928466, + "grad_norm": 2.4791266918182373, + "learning_rate": 0.0002, + "loss": 2.4431, + "step": 23460 + }, + { + "epoch": 1.748882265275708, + "grad_norm": 2.7931692600250244, + "learning_rate": 0.0002, + "loss": 2.2753, + "step": 23470 + }, + { + "epoch": 1.7496274217585692, + "grad_norm": 2.818281650543213, + "learning_rate": 0.0002, + "loss": 2.4702, + "step": 23480 + }, + { + "epoch": 1.7503725782414308, + "grad_norm": 2.638704299926758, + "learning_rate": 0.0002, + "loss": 2.3848, + "step": 23490 + }, + { + "epoch": 1.751117734724292, + "grad_norm": 2.501591682434082, + "learning_rate": 0.0002, + "loss": 2.5548, + "step": 23500 + }, + { + "epoch": 1.7518628912071534, + "grad_norm": 2.969492197036743, + "learning_rate": 0.0002, + "loss": 2.5042, + "step": 23510 + }, + { + "epoch": 1.752608047690015, + "grad_norm": 2.4512195587158203, + "learning_rate": 0.0002, + "loss": 2.2923, + "step": 23520 + }, + { + "epoch": 1.7533532041728763, + "grad_norm": 2.2900497913360596, + "learning_rate": 0.0002, + "loss": 2.4102, + "step": 23530 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 2.6613259315490723, + "learning_rate": 0.0002, + "loss": 2.277, + "step": 23540 + }, + { + "epoch": 1.7548435171385992, + "grad_norm": 2.427893877029419, + "learning_rate": 0.0002, + "loss": 2.5847, + "step": 23550 + }, + { + "epoch": 1.7555886736214605, + "grad_norm": 2.631917715072632, + "learning_rate": 0.0002, + "loss": 2.6333, + "step": 23560 + }, + { + "epoch": 1.7563338301043219, + "grad_norm": 2.395526170730591, + "learning_rate": 0.0002, + "loss": 2.5131, + "step": 23570 + }, + { + "epoch": 1.7570789865871834, + "grad_norm": 2.392958641052246, + "learning_rate": 0.0002, + "loss": 2.5297, + "step": 23580 + }, + { + "epoch": 1.7578241430700448, + "grad_norm": 2.660301923751831, + "learning_rate": 0.0002, + "loss": 2.529, + "step": 23590 + }, + { + "epoch": 1.758569299552906, + "grad_norm": 2.4387199878692627, + "learning_rate": 0.0002, + "loss": 2.5486, + "step": 23600 + }, + { + "epoch": 1.7593144560357676, + "grad_norm": 2.463621139526367, + "learning_rate": 0.0002, + "loss": 2.6065, + "step": 23610 + }, + { + "epoch": 1.7600596125186287, + "grad_norm": 2.4696829319000244, + "learning_rate": 0.0002, + "loss": 2.5175, + "step": 23620 + }, + { + "epoch": 1.7608047690014903, + "grad_norm": 2.802701473236084, + "learning_rate": 0.0002, + "loss": 2.5997, + "step": 23630 + }, + { + "epoch": 1.7615499254843519, + "grad_norm": 2.8156678676605225, + "learning_rate": 0.0002, + "loss": 2.5085, + "step": 23640 + }, + { + "epoch": 1.762295081967213, + "grad_norm": 2.4574429988861084, + "learning_rate": 0.0002, + "loss": 2.5127, + "step": 23650 + }, + { + "epoch": 1.7630402384500745, + "grad_norm": 2.26594614982605, + "learning_rate": 0.0002, + "loss": 2.6283, + "step": 23660 + }, + { + "epoch": 1.763785394932936, + "grad_norm": 2.458120107650757, + "learning_rate": 0.0002, + "loss": 2.4844, + "step": 23670 + }, + { + "epoch": 1.7645305514157972, + "grad_norm": 2.4478704929351807, + "learning_rate": 0.0002, + "loss": 2.596, + "step": 23680 + }, + { + "epoch": 1.7652757078986587, + "grad_norm": 2.5973260402679443, + "learning_rate": 0.0002, + "loss": 2.7246, + "step": 23690 + }, + { + "epoch": 1.7660208643815203, + "grad_norm": 2.2895405292510986, + "learning_rate": 0.0002, + "loss": 2.5442, + "step": 23700 + }, + { + "epoch": 1.7667660208643814, + "grad_norm": 3.0286567211151123, + "learning_rate": 0.0002, + "loss": 2.4685, + "step": 23710 + }, + { + "epoch": 1.767511177347243, + "grad_norm": 2.95426869392395, + "learning_rate": 0.0002, + "loss": 2.6306, + "step": 23720 + }, + { + "epoch": 1.7682563338301043, + "grad_norm": 2.4281368255615234, + "learning_rate": 0.0002, + "loss": 2.4162, + "step": 23730 + }, + { + "epoch": 1.7690014903129656, + "grad_norm": 2.274803876876831, + "learning_rate": 0.0002, + "loss": 2.2333, + "step": 23740 + }, + { + "epoch": 1.7697466467958272, + "grad_norm": 2.6567139625549316, + "learning_rate": 0.0002, + "loss": 2.5369, + "step": 23750 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 2.1148862838745117, + "learning_rate": 0.0002, + "loss": 2.4882, + "step": 23760 + }, + { + "epoch": 1.7712369597615498, + "grad_norm": 2.397024631500244, + "learning_rate": 0.0002, + "loss": 2.5951, + "step": 23770 + }, + { + "epoch": 1.7719821162444114, + "grad_norm": 2.556600570678711, + "learning_rate": 0.0002, + "loss": 2.4878, + "step": 23780 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 2.361325979232788, + "learning_rate": 0.0002, + "loss": 2.6025, + "step": 23790 + }, + { + "epoch": 1.773472429210134, + "grad_norm": 2.221918821334839, + "learning_rate": 0.0002, + "loss": 2.4707, + "step": 23800 + }, + { + "epoch": 1.7742175856929956, + "grad_norm": 2.5794131755828857, + "learning_rate": 0.0002, + "loss": 2.3977, + "step": 23810 + }, + { + "epoch": 1.774962742175857, + "grad_norm": 2.4929275512695312, + "learning_rate": 0.0002, + "loss": 2.6056, + "step": 23820 + }, + { + "epoch": 1.7757078986587183, + "grad_norm": 2.478788375854492, + "learning_rate": 0.0002, + "loss": 2.5346, + "step": 23830 + }, + { + "epoch": 1.7764530551415798, + "grad_norm": 2.7491824626922607, + "learning_rate": 0.0002, + "loss": 2.2653, + "step": 23840 + }, + { + "epoch": 1.7771982116244411, + "grad_norm": 2.842250108718872, + "learning_rate": 0.0002, + "loss": 2.5534, + "step": 23850 + }, + { + "epoch": 1.7779433681073025, + "grad_norm": 2.78625226020813, + "learning_rate": 0.0002, + "loss": 2.529, + "step": 23860 + }, + { + "epoch": 1.778688524590164, + "grad_norm": 2.5092759132385254, + "learning_rate": 0.0002, + "loss": 2.3547, + "step": 23870 + }, + { + "epoch": 1.7794336810730254, + "grad_norm": 2.5346925258636475, + "learning_rate": 0.0002, + "loss": 2.5704, + "step": 23880 + }, + { + "epoch": 1.7801788375558867, + "grad_norm": 2.1645240783691406, + "learning_rate": 0.0002, + "loss": 2.4889, + "step": 23890 + }, + { + "epoch": 1.7809239940387482, + "grad_norm": 2.653005361557007, + "learning_rate": 0.0002, + "loss": 2.4614, + "step": 23900 + }, + { + "epoch": 1.7816691505216096, + "grad_norm": 3.1483113765716553, + "learning_rate": 0.0002, + "loss": 2.5343, + "step": 23910 + }, + { + "epoch": 1.782414307004471, + "grad_norm": 2.815312385559082, + "learning_rate": 0.0002, + "loss": 2.491, + "step": 23920 + }, + { + "epoch": 1.7831594634873325, + "grad_norm": 2.5819027423858643, + "learning_rate": 0.0002, + "loss": 2.4377, + "step": 23930 + }, + { + "epoch": 1.7839046199701938, + "grad_norm": 2.691847085952759, + "learning_rate": 0.0002, + "loss": 2.5363, + "step": 23940 + }, + { + "epoch": 1.7846497764530551, + "grad_norm": 3.1236226558685303, + "learning_rate": 0.0002, + "loss": 2.6327, + "step": 23950 + }, + { + "epoch": 1.7853949329359167, + "grad_norm": 2.2760941982269287, + "learning_rate": 0.0002, + "loss": 2.4321, + "step": 23960 + }, + { + "epoch": 1.786140089418778, + "grad_norm": 2.5152201652526855, + "learning_rate": 0.0002, + "loss": 2.519, + "step": 23970 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 2.2634105682373047, + "learning_rate": 0.0002, + "loss": 2.4622, + "step": 23980 + }, + { + "epoch": 1.7876304023845009, + "grad_norm": 2.6219863891601562, + "learning_rate": 0.0002, + "loss": 2.6106, + "step": 23990 + }, + { + "epoch": 1.788375558867362, + "grad_norm": 3.093374013900757, + "learning_rate": 0.0002, + "loss": 2.4982, + "step": 24000 + }, + { + "epoch": 1.7891207153502235, + "grad_norm": 2.5569963455200195, + "learning_rate": 0.0002, + "loss": 2.4287, + "step": 24010 + }, + { + "epoch": 1.789865871833085, + "grad_norm": 2.514453887939453, + "learning_rate": 0.0002, + "loss": 2.5517, + "step": 24020 + }, + { + "epoch": 1.7906110283159462, + "grad_norm": 2.578756809234619, + "learning_rate": 0.0002, + "loss": 2.5094, + "step": 24030 + }, + { + "epoch": 1.7913561847988078, + "grad_norm": 2.6853983402252197, + "learning_rate": 0.0002, + "loss": 2.6338, + "step": 24040 + }, + { + "epoch": 1.7921013412816693, + "grad_norm": 2.5106022357940674, + "learning_rate": 0.0002, + "loss": 2.5402, + "step": 24050 + }, + { + "epoch": 1.7928464977645304, + "grad_norm": 2.823246479034424, + "learning_rate": 0.0002, + "loss": 2.321, + "step": 24060 + }, + { + "epoch": 1.793591654247392, + "grad_norm": 2.782550096511841, + "learning_rate": 0.0002, + "loss": 2.6001, + "step": 24070 + }, + { + "epoch": 1.7943368107302533, + "grad_norm": 2.8128268718719482, + "learning_rate": 0.0002, + "loss": 2.3526, + "step": 24080 + }, + { + "epoch": 1.7950819672131146, + "grad_norm": 2.4673032760620117, + "learning_rate": 0.0002, + "loss": 2.2312, + "step": 24090 + }, + { + "epoch": 1.7958271236959762, + "grad_norm": 2.503586530685425, + "learning_rate": 0.0002, + "loss": 2.4475, + "step": 24100 + }, + { + "epoch": 1.7965722801788375, + "grad_norm": 2.3996407985687256, + "learning_rate": 0.0002, + "loss": 2.5589, + "step": 24110 + }, + { + "epoch": 1.7973174366616989, + "grad_norm": 2.5348939895629883, + "learning_rate": 0.0002, + "loss": 2.2477, + "step": 24120 + }, + { + "epoch": 1.7980625931445604, + "grad_norm": 2.695479393005371, + "learning_rate": 0.0002, + "loss": 2.5913, + "step": 24130 + }, + { + "epoch": 1.7988077496274217, + "grad_norm": 2.827122211456299, + "learning_rate": 0.0002, + "loss": 2.3807, + "step": 24140 + }, + { + "epoch": 1.799552906110283, + "grad_norm": 2.496100425720215, + "learning_rate": 0.0002, + "loss": 2.4569, + "step": 24150 + }, + { + "epoch": 1.8002980625931446, + "grad_norm": 2.7637364864349365, + "learning_rate": 0.0002, + "loss": 2.4109, + "step": 24160 + }, + { + "epoch": 1.801043219076006, + "grad_norm": 2.378218650817871, + "learning_rate": 0.0002, + "loss": 2.3436, + "step": 24170 + }, + { + "epoch": 1.8017883755588673, + "grad_norm": 2.8666129112243652, + "learning_rate": 0.0002, + "loss": 2.6058, + "step": 24180 + }, + { + "epoch": 1.8025335320417288, + "grad_norm": 2.1839277744293213, + "learning_rate": 0.0002, + "loss": 2.3226, + "step": 24190 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 2.1608381271362305, + "learning_rate": 0.0002, + "loss": 2.5368, + "step": 24200 + }, + { + "epoch": 1.8040238450074515, + "grad_norm": 2.1796987056732178, + "learning_rate": 0.0002, + "loss": 2.5402, + "step": 24210 + }, + { + "epoch": 1.804769001490313, + "grad_norm": 2.9191579818725586, + "learning_rate": 0.0002, + "loss": 2.5582, + "step": 24220 + }, + { + "epoch": 1.8055141579731744, + "grad_norm": 2.621483087539673, + "learning_rate": 0.0002, + "loss": 2.4998, + "step": 24230 + }, + { + "epoch": 1.8062593144560357, + "grad_norm": 2.5693342685699463, + "learning_rate": 0.0002, + "loss": 2.6363, + "step": 24240 + }, + { + "epoch": 1.8070044709388973, + "grad_norm": 2.3040270805358887, + "learning_rate": 0.0002, + "loss": 2.3897, + "step": 24250 + }, + { + "epoch": 1.8077496274217586, + "grad_norm": 2.5259714126586914, + "learning_rate": 0.0002, + "loss": 2.3625, + "step": 24260 + }, + { + "epoch": 1.80849478390462, + "grad_norm": 2.1863303184509277, + "learning_rate": 0.0002, + "loss": 2.5026, + "step": 24270 + }, + { + "epoch": 1.8092399403874815, + "grad_norm": 2.6622605323791504, + "learning_rate": 0.0002, + "loss": 2.5436, + "step": 24280 + }, + { + "epoch": 1.8099850968703428, + "grad_norm": 2.9189093112945557, + "learning_rate": 0.0002, + "loss": 2.6779, + "step": 24290 + }, + { + "epoch": 1.8107302533532041, + "grad_norm": 2.7089121341705322, + "learning_rate": 0.0002, + "loss": 2.6477, + "step": 24300 + }, + { + "epoch": 1.8114754098360657, + "grad_norm": 2.326711654663086, + "learning_rate": 0.0002, + "loss": 2.5291, + "step": 24310 + }, + { + "epoch": 1.812220566318927, + "grad_norm": 2.570425033569336, + "learning_rate": 0.0002, + "loss": 2.4593, + "step": 24320 + }, + { + "epoch": 1.8129657228017884, + "grad_norm": 3.20462703704834, + "learning_rate": 0.0002, + "loss": 2.5166, + "step": 24330 + }, + { + "epoch": 1.81371087928465, + "grad_norm": 2.2558157444000244, + "learning_rate": 0.0002, + "loss": 2.5881, + "step": 24340 + }, + { + "epoch": 1.814456035767511, + "grad_norm": 2.6994922161102295, + "learning_rate": 0.0002, + "loss": 2.5089, + "step": 24350 + }, + { + "epoch": 1.8152011922503726, + "grad_norm": 2.84627103805542, + "learning_rate": 0.0002, + "loss": 2.4608, + "step": 24360 + }, + { + "epoch": 1.8159463487332341, + "grad_norm": 2.4853367805480957, + "learning_rate": 0.0002, + "loss": 2.2028, + "step": 24370 + }, + { + "epoch": 1.8166915052160952, + "grad_norm": 2.2818758487701416, + "learning_rate": 0.0002, + "loss": 2.471, + "step": 24380 + }, + { + "epoch": 1.8174366616989568, + "grad_norm": 2.9547371864318848, + "learning_rate": 0.0002, + "loss": 2.5479, + "step": 24390 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 2.700378656387329, + "learning_rate": 0.0002, + "loss": 2.5568, + "step": 24400 + }, + { + "epoch": 1.8189269746646795, + "grad_norm": 2.5123701095581055, + "learning_rate": 0.0002, + "loss": 2.3738, + "step": 24410 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 2.8290090560913086, + "learning_rate": 0.0002, + "loss": 2.5638, + "step": 24420 + }, + { + "epoch": 1.8204172876304023, + "grad_norm": 2.747896671295166, + "learning_rate": 0.0002, + "loss": 2.754, + "step": 24430 + }, + { + "epoch": 1.8211624441132637, + "grad_norm": 2.2470264434814453, + "learning_rate": 0.0002, + "loss": 2.4369, + "step": 24440 + }, + { + "epoch": 1.8219076005961252, + "grad_norm": 2.4347453117370605, + "learning_rate": 0.0002, + "loss": 2.4282, + "step": 24450 + }, + { + "epoch": 1.8226527570789866, + "grad_norm": 2.6316144466400146, + "learning_rate": 0.0002, + "loss": 2.4912, + "step": 24460 + }, + { + "epoch": 1.8233979135618479, + "grad_norm": 2.4304425716400146, + "learning_rate": 0.0002, + "loss": 2.5277, + "step": 24470 + }, + { + "epoch": 1.8241430700447094, + "grad_norm": 2.7673425674438477, + "learning_rate": 0.0002, + "loss": 2.6021, + "step": 24480 + }, + { + "epoch": 1.8248882265275708, + "grad_norm": 2.2696149349212646, + "learning_rate": 0.0002, + "loss": 2.4493, + "step": 24490 + }, + { + "epoch": 1.825633383010432, + "grad_norm": 2.58644962310791, + "learning_rate": 0.0002, + "loss": 2.5297, + "step": 24500 + }, + { + "epoch": 1.8263785394932937, + "grad_norm": 3.001873254776001, + "learning_rate": 0.0002, + "loss": 2.3753, + "step": 24510 + }, + { + "epoch": 1.827123695976155, + "grad_norm": 2.724271297454834, + "learning_rate": 0.0002, + "loss": 2.4127, + "step": 24520 + }, + { + "epoch": 1.8278688524590163, + "grad_norm": 2.4911880493164062, + "learning_rate": 0.0002, + "loss": 2.7232, + "step": 24530 + }, + { + "epoch": 1.8286140089418779, + "grad_norm": 2.659914493560791, + "learning_rate": 0.0002, + "loss": 2.4794, + "step": 24540 + }, + { + "epoch": 1.8293591654247392, + "grad_norm": 2.4134860038757324, + "learning_rate": 0.0002, + "loss": 2.5825, + "step": 24550 + }, + { + "epoch": 1.8301043219076005, + "grad_norm": 2.6322641372680664, + "learning_rate": 0.0002, + "loss": 2.4633, + "step": 24560 + }, + { + "epoch": 1.830849478390462, + "grad_norm": 2.942608118057251, + "learning_rate": 0.0002, + "loss": 2.7876, + "step": 24570 + }, + { + "epoch": 1.8315946348733234, + "grad_norm": 2.5157153606414795, + "learning_rate": 0.0002, + "loss": 2.4832, + "step": 24580 + }, + { + "epoch": 1.8323397913561847, + "grad_norm": 2.5834529399871826, + "learning_rate": 0.0002, + "loss": 2.4591, + "step": 24590 + }, + { + "epoch": 1.8330849478390463, + "grad_norm": 2.607112169265747, + "learning_rate": 0.0002, + "loss": 2.5687, + "step": 24600 + }, + { + "epoch": 1.8338301043219076, + "grad_norm": 2.7782909870147705, + "learning_rate": 0.0002, + "loss": 2.5903, + "step": 24610 + }, + { + "epoch": 1.834575260804769, + "grad_norm": 2.6779706478118896, + "learning_rate": 0.0002, + "loss": 2.561, + "step": 24620 + }, + { + "epoch": 1.8353204172876305, + "grad_norm": 2.514186143875122, + "learning_rate": 0.0002, + "loss": 2.5755, + "step": 24630 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 2.3403496742248535, + "learning_rate": 0.0002, + "loss": 2.4768, + "step": 24640 + }, + { + "epoch": 1.8368107302533532, + "grad_norm": 2.3311824798583984, + "learning_rate": 0.0002, + "loss": 2.5229, + "step": 24650 + }, + { + "epoch": 1.8375558867362147, + "grad_norm": 1.7865532636642456, + "learning_rate": 0.0002, + "loss": 2.5142, + "step": 24660 + }, + { + "epoch": 1.838301043219076, + "grad_norm": 2.4601778984069824, + "learning_rate": 0.0002, + "loss": 2.5981, + "step": 24670 + }, + { + "epoch": 1.8390461997019374, + "grad_norm": 2.6825180053710938, + "learning_rate": 0.0002, + "loss": 2.5526, + "step": 24680 + }, + { + "epoch": 1.839791356184799, + "grad_norm": 2.428622245788574, + "learning_rate": 0.0002, + "loss": 2.2803, + "step": 24690 + }, + { + "epoch": 1.84053651266766, + "grad_norm": 2.353142261505127, + "learning_rate": 0.0002, + "loss": 2.4896, + "step": 24700 + }, + { + "epoch": 1.8412816691505216, + "grad_norm": 2.611560106277466, + "learning_rate": 0.0002, + "loss": 2.6822, + "step": 24710 + }, + { + "epoch": 1.8420268256333832, + "grad_norm": 2.8267176151275635, + "learning_rate": 0.0002, + "loss": 2.6869, + "step": 24720 + }, + { + "epoch": 1.8427719821162443, + "grad_norm": 2.8418166637420654, + "learning_rate": 0.0002, + "loss": 2.5455, + "step": 24730 + }, + { + "epoch": 1.8435171385991058, + "grad_norm": 2.458555221557617, + "learning_rate": 0.0002, + "loss": 2.6303, + "step": 24740 + }, + { + "epoch": 1.8442622950819674, + "grad_norm": 2.2593302726745605, + "learning_rate": 0.0002, + "loss": 2.5129, + "step": 24750 + }, + { + "epoch": 1.8450074515648285, + "grad_norm": 2.7010560035705566, + "learning_rate": 0.0002, + "loss": 2.3973, + "step": 24760 + }, + { + "epoch": 1.84575260804769, + "grad_norm": 2.4872236251831055, + "learning_rate": 0.0002, + "loss": 2.6178, + "step": 24770 + }, + { + "epoch": 1.8464977645305514, + "grad_norm": 2.882812738418579, + "learning_rate": 0.0002, + "loss": 2.533, + "step": 24780 + }, + { + "epoch": 1.8472429210134127, + "grad_norm": 2.354396104812622, + "learning_rate": 0.0002, + "loss": 2.5111, + "step": 24790 + }, + { + "epoch": 1.8479880774962743, + "grad_norm": 2.5242695808410645, + "learning_rate": 0.0002, + "loss": 2.5467, + "step": 24800 + }, + { + "epoch": 1.8487332339791356, + "grad_norm": 2.3869271278381348, + "learning_rate": 0.0002, + "loss": 2.578, + "step": 24810 + }, + { + "epoch": 1.849478390461997, + "grad_norm": 2.6933507919311523, + "learning_rate": 0.0002, + "loss": 2.6325, + "step": 24820 + }, + { + "epoch": 1.8502235469448585, + "grad_norm": 2.445830821990967, + "learning_rate": 0.0002, + "loss": 2.5435, + "step": 24830 + }, + { + "epoch": 1.8509687034277198, + "grad_norm": 2.1250483989715576, + "learning_rate": 0.0002, + "loss": 2.6424, + "step": 24840 + }, + { + "epoch": 1.8517138599105811, + "grad_norm": 2.1476426124572754, + "learning_rate": 0.0002, + "loss": 2.5599, + "step": 24850 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 2.7636313438415527, + "learning_rate": 0.0002, + "loss": 2.6807, + "step": 24860 + }, + { + "epoch": 1.853204172876304, + "grad_norm": 2.6150963306427, + "learning_rate": 0.0002, + "loss": 2.5823, + "step": 24870 + }, + { + "epoch": 1.8539493293591653, + "grad_norm": 2.9421913623809814, + "learning_rate": 0.0002, + "loss": 2.5598, + "step": 24880 + }, + { + "epoch": 1.854694485842027, + "grad_norm": 2.576680898666382, + "learning_rate": 0.0002, + "loss": 2.4352, + "step": 24890 + }, + { + "epoch": 1.8554396423248882, + "grad_norm": 2.372847557067871, + "learning_rate": 0.0002, + "loss": 2.4352, + "step": 24900 + }, + { + "epoch": 1.8561847988077496, + "grad_norm": 2.7840962409973145, + "learning_rate": 0.0002, + "loss": 2.4458, + "step": 24910 + }, + { + "epoch": 1.8569299552906111, + "grad_norm": 2.608435869216919, + "learning_rate": 0.0002, + "loss": 2.3279, + "step": 24920 + }, + { + "epoch": 1.8576751117734724, + "grad_norm": 2.2083051204681396, + "learning_rate": 0.0002, + "loss": 2.5685, + "step": 24930 + }, + { + "epoch": 1.8584202682563338, + "grad_norm": 2.6166601181030273, + "learning_rate": 0.0002, + "loss": 2.5726, + "step": 24940 + }, + { + "epoch": 1.8591654247391953, + "grad_norm": 2.4802517890930176, + "learning_rate": 0.0002, + "loss": 2.4114, + "step": 24950 + }, + { + "epoch": 1.8599105812220567, + "grad_norm": 2.6344408988952637, + "learning_rate": 0.0002, + "loss": 2.6164, + "step": 24960 + }, + { + "epoch": 1.860655737704918, + "grad_norm": 2.4686670303344727, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 24970 + }, + { + "epoch": 1.8614008941877795, + "grad_norm": 2.556004762649536, + "learning_rate": 0.0002, + "loss": 2.5358, + "step": 24980 + }, + { + "epoch": 1.8621460506706409, + "grad_norm": 2.642232656478882, + "learning_rate": 0.0002, + "loss": 2.4455, + "step": 24990 + }, + { + "epoch": 1.8628912071535022, + "grad_norm": 2.6357104778289795, + "learning_rate": 0.0002, + "loss": 2.4734, + "step": 25000 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 2.7195780277252197, + "learning_rate": 0.0002, + "loss": 2.5011, + "step": 25010 + }, + { + "epoch": 1.864381520119225, + "grad_norm": 2.306607961654663, + "learning_rate": 0.0002, + "loss": 2.4745, + "step": 25020 + }, + { + "epoch": 1.8651266766020864, + "grad_norm": 2.3458077907562256, + "learning_rate": 0.0002, + "loss": 2.5006, + "step": 25030 + }, + { + "epoch": 1.865871833084948, + "grad_norm": 2.6288564205169678, + "learning_rate": 0.0002, + "loss": 2.4656, + "step": 25040 + }, + { + "epoch": 1.866616989567809, + "grad_norm": 2.5845961570739746, + "learning_rate": 0.0002, + "loss": 2.4353, + "step": 25050 + }, + { + "epoch": 1.8673621460506706, + "grad_norm": 2.3820455074310303, + "learning_rate": 0.0002, + "loss": 2.5236, + "step": 25060 + }, + { + "epoch": 1.8681073025335322, + "grad_norm": 2.8274917602539062, + "learning_rate": 0.0002, + "loss": 2.5278, + "step": 25070 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 2.551588296890259, + "learning_rate": 0.0002, + "loss": 2.5355, + "step": 25080 + }, + { + "epoch": 1.8695976154992549, + "grad_norm": 2.707473039627075, + "learning_rate": 0.0002, + "loss": 2.4049, + "step": 25090 + }, + { + "epoch": 1.8703427719821164, + "grad_norm": 2.40470290184021, + "learning_rate": 0.0002, + "loss": 2.5121, + "step": 25100 + }, + { + "epoch": 1.8710879284649775, + "grad_norm": 2.507206916809082, + "learning_rate": 0.0002, + "loss": 2.2627, + "step": 25110 + }, + { + "epoch": 1.871833084947839, + "grad_norm": 2.4339914321899414, + "learning_rate": 0.0002, + "loss": 2.5134, + "step": 25120 + }, + { + "epoch": 1.8725782414307004, + "grad_norm": 2.4960076808929443, + "learning_rate": 0.0002, + "loss": 2.5428, + "step": 25130 + }, + { + "epoch": 1.8733233979135617, + "grad_norm": 3.661783218383789, + "learning_rate": 0.0002, + "loss": 2.6599, + "step": 25140 + }, + { + "epoch": 1.8740685543964233, + "grad_norm": 2.513418197631836, + "learning_rate": 0.0002, + "loss": 2.4901, + "step": 25150 + }, + { + "epoch": 1.8748137108792846, + "grad_norm": 2.3090083599090576, + "learning_rate": 0.0002, + "loss": 2.5672, + "step": 25160 + }, + { + "epoch": 1.875558867362146, + "grad_norm": 2.645890951156616, + "learning_rate": 0.0002, + "loss": 2.5898, + "step": 25170 + }, + { + "epoch": 1.8763040238450075, + "grad_norm": 3.336200714111328, + "learning_rate": 0.0002, + "loss": 2.6214, + "step": 25180 + }, + { + "epoch": 1.8770491803278688, + "grad_norm": 2.3214824199676514, + "learning_rate": 0.0002, + "loss": 2.6054, + "step": 25190 + }, + { + "epoch": 1.8777943368107302, + "grad_norm": 2.3875439167022705, + "learning_rate": 0.0002, + "loss": 2.5562, + "step": 25200 + }, + { + "epoch": 1.8785394932935917, + "grad_norm": 2.643754482269287, + "learning_rate": 0.0002, + "loss": 2.5102, + "step": 25210 + }, + { + "epoch": 1.879284649776453, + "grad_norm": 2.7053935527801514, + "learning_rate": 0.0002, + "loss": 2.3707, + "step": 25220 + }, + { + "epoch": 1.8800298062593144, + "grad_norm": 2.6861038208007812, + "learning_rate": 0.0002, + "loss": 2.7176, + "step": 25230 + }, + { + "epoch": 1.880774962742176, + "grad_norm": 2.6391537189483643, + "learning_rate": 0.0002, + "loss": 2.3108, + "step": 25240 + }, + { + "epoch": 1.8815201192250373, + "grad_norm": 2.6725473403930664, + "learning_rate": 0.0002, + "loss": 2.6606, + "step": 25250 + }, + { + "epoch": 1.8822652757078986, + "grad_norm": 2.843163013458252, + "learning_rate": 0.0002, + "loss": 2.4298, + "step": 25260 + }, + { + "epoch": 1.8830104321907601, + "grad_norm": 2.280057430267334, + "learning_rate": 0.0002, + "loss": 2.5212, + "step": 25270 + }, + { + "epoch": 1.8837555886736215, + "grad_norm": 2.221116304397583, + "learning_rate": 0.0002, + "loss": 2.4512, + "step": 25280 + }, + { + "epoch": 1.8845007451564828, + "grad_norm": 2.413114547729492, + "learning_rate": 0.0002, + "loss": 2.4593, + "step": 25290 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 2.225498914718628, + "learning_rate": 0.0002, + "loss": 2.3172, + "step": 25300 + }, + { + "epoch": 1.8859910581222057, + "grad_norm": 2.6563146114349365, + "learning_rate": 0.0002, + "loss": 2.5154, + "step": 25310 + }, + { + "epoch": 1.886736214605067, + "grad_norm": 2.232198715209961, + "learning_rate": 0.0002, + "loss": 2.3323, + "step": 25320 + }, + { + "epoch": 1.8874813710879286, + "grad_norm": 2.4613037109375, + "learning_rate": 0.0002, + "loss": 2.5738, + "step": 25330 + }, + { + "epoch": 1.88822652757079, + "grad_norm": 2.7953736782073975, + "learning_rate": 0.0002, + "loss": 2.4914, + "step": 25340 + }, + { + "epoch": 1.8889716840536512, + "grad_norm": 2.416280508041382, + "learning_rate": 0.0002, + "loss": 2.643, + "step": 25350 + }, + { + "epoch": 1.8897168405365128, + "grad_norm": 2.5031518936157227, + "learning_rate": 0.0002, + "loss": 2.3818, + "step": 25360 + }, + { + "epoch": 1.8904619970193741, + "grad_norm": 2.5476391315460205, + "learning_rate": 0.0002, + "loss": 2.6249, + "step": 25370 + }, + { + "epoch": 1.8912071535022354, + "grad_norm": 2.5421531200408936, + "learning_rate": 0.0002, + "loss": 2.6841, + "step": 25380 + }, + { + "epoch": 1.891952309985097, + "grad_norm": 2.7282488346099854, + "learning_rate": 0.0002, + "loss": 2.6655, + "step": 25390 + }, + { + "epoch": 1.8926974664679581, + "grad_norm": 2.47148060798645, + "learning_rate": 0.0002, + "loss": 2.3096, + "step": 25400 + }, + { + "epoch": 1.8934426229508197, + "grad_norm": 2.394070625305176, + "learning_rate": 0.0002, + "loss": 2.6875, + "step": 25410 + }, + { + "epoch": 1.8941877794336812, + "grad_norm": 2.7232706546783447, + "learning_rate": 0.0002, + "loss": 2.5139, + "step": 25420 + }, + { + "epoch": 1.8949329359165423, + "grad_norm": 2.7696988582611084, + "learning_rate": 0.0002, + "loss": 2.5473, + "step": 25430 + }, + { + "epoch": 1.8956780923994039, + "grad_norm": 2.484501838684082, + "learning_rate": 0.0002, + "loss": 2.6315, + "step": 25440 + }, + { + "epoch": 1.8964232488822654, + "grad_norm": 2.669006109237671, + "learning_rate": 0.0002, + "loss": 2.4127, + "step": 25450 + }, + { + "epoch": 1.8971684053651265, + "grad_norm": 2.8366858959198, + "learning_rate": 0.0002, + "loss": 2.6871, + "step": 25460 + }, + { + "epoch": 1.897913561847988, + "grad_norm": 2.7077786922454834, + "learning_rate": 0.0002, + "loss": 2.3913, + "step": 25470 + }, + { + "epoch": 1.8986587183308494, + "grad_norm": 2.649768829345703, + "learning_rate": 0.0002, + "loss": 2.6283, + "step": 25480 + }, + { + "epoch": 1.8994038748137108, + "grad_norm": 2.4160945415496826, + "learning_rate": 0.0002, + "loss": 2.5643, + "step": 25490 + }, + { + "epoch": 1.9001490312965723, + "grad_norm": 2.4574332237243652, + "learning_rate": 0.0002, + "loss": 2.1872, + "step": 25500 + }, + { + "epoch": 1.9008941877794336, + "grad_norm": 2.4507484436035156, + "learning_rate": 0.0002, + "loss": 2.532, + "step": 25510 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 1.882723093032837, + "learning_rate": 0.0002, + "loss": 2.4446, + "step": 25520 + }, + { + "epoch": 1.9023845007451565, + "grad_norm": 2.4363670349121094, + "learning_rate": 0.0002, + "loss": 2.3938, + "step": 25530 + }, + { + "epoch": 1.9031296572280179, + "grad_norm": 2.288465738296509, + "learning_rate": 0.0002, + "loss": 2.5107, + "step": 25540 + }, + { + "epoch": 1.9038748137108792, + "grad_norm": 2.611039400100708, + "learning_rate": 0.0002, + "loss": 2.5329, + "step": 25550 + }, + { + "epoch": 1.9046199701937407, + "grad_norm": 2.746476173400879, + "learning_rate": 0.0002, + "loss": 2.4759, + "step": 25560 + }, + { + "epoch": 1.905365126676602, + "grad_norm": 2.628481864929199, + "learning_rate": 0.0002, + "loss": 2.6533, + "step": 25570 + }, + { + "epoch": 1.9061102831594634, + "grad_norm": 2.5151212215423584, + "learning_rate": 0.0002, + "loss": 2.4787, + "step": 25580 + }, + { + "epoch": 1.906855439642325, + "grad_norm": 2.3716254234313965, + "learning_rate": 0.0002, + "loss": 2.5077, + "step": 25590 + }, + { + "epoch": 1.9076005961251863, + "grad_norm": 2.687274932861328, + "learning_rate": 0.0002, + "loss": 2.3605, + "step": 25600 + }, + { + "epoch": 1.9083457526080476, + "grad_norm": 2.3180296421051025, + "learning_rate": 0.0002, + "loss": 2.4078, + "step": 25610 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 2.033092737197876, + "learning_rate": 0.0002, + "loss": 2.2995, + "step": 25620 + }, + { + "epoch": 1.9098360655737705, + "grad_norm": 2.2614009380340576, + "learning_rate": 0.0002, + "loss": 2.5323, + "step": 25630 + }, + { + "epoch": 1.9105812220566318, + "grad_norm": 2.1245622634887695, + "learning_rate": 0.0002, + "loss": 2.4616, + "step": 25640 + }, + { + "epoch": 1.9113263785394934, + "grad_norm": 2.5503997802734375, + "learning_rate": 0.0002, + "loss": 2.6637, + "step": 25650 + }, + { + "epoch": 1.9120715350223547, + "grad_norm": 2.7903940677642822, + "learning_rate": 0.0002, + "loss": 2.6795, + "step": 25660 + }, + { + "epoch": 1.912816691505216, + "grad_norm": 2.427300453186035, + "learning_rate": 0.0002, + "loss": 2.4091, + "step": 25670 + }, + { + "epoch": 1.9135618479880776, + "grad_norm": 2.535790205001831, + "learning_rate": 0.0002, + "loss": 2.3897, + "step": 25680 + }, + { + "epoch": 1.914307004470939, + "grad_norm": 2.60248064994812, + "learning_rate": 0.0002, + "loss": 2.5952, + "step": 25690 + }, + { + "epoch": 1.9150521609538003, + "grad_norm": 2.7798526287078857, + "learning_rate": 0.0002, + "loss": 2.4811, + "step": 25700 + }, + { + "epoch": 1.9157973174366618, + "grad_norm": 2.0604312419891357, + "learning_rate": 0.0002, + "loss": 2.6361, + "step": 25710 + }, + { + "epoch": 1.9165424739195231, + "grad_norm": 2.663926124572754, + "learning_rate": 0.0002, + "loss": 2.4111, + "step": 25720 + }, + { + "epoch": 1.9172876304023845, + "grad_norm": 2.741672992706299, + "learning_rate": 0.0002, + "loss": 2.391, + "step": 25730 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 2.5494937896728516, + "learning_rate": 0.0002, + "loss": 2.497, + "step": 25740 + }, + { + "epoch": 1.9187779433681071, + "grad_norm": 2.7732324600219727, + "learning_rate": 0.0002, + "loss": 2.3377, + "step": 25750 + }, + { + "epoch": 1.9195230998509687, + "grad_norm": 2.3186347484588623, + "learning_rate": 0.0002, + "loss": 2.6408, + "step": 25760 + }, + { + "epoch": 1.9202682563338302, + "grad_norm": 2.6721131801605225, + "learning_rate": 0.0002, + "loss": 2.5869, + "step": 25770 + }, + { + "epoch": 1.9210134128166914, + "grad_norm": 2.4322500228881836, + "learning_rate": 0.0002, + "loss": 2.4635, + "step": 25780 + }, + { + "epoch": 1.921758569299553, + "grad_norm": 2.7271809577941895, + "learning_rate": 0.0002, + "loss": 2.6242, + "step": 25790 + }, + { + "epoch": 1.9225037257824145, + "grad_norm": 2.2470762729644775, + "learning_rate": 0.0002, + "loss": 2.5125, + "step": 25800 + }, + { + "epoch": 1.9232488822652756, + "grad_norm": 2.861534833908081, + "learning_rate": 0.0002, + "loss": 2.5176, + "step": 25810 + }, + { + "epoch": 1.9239940387481371, + "grad_norm": 2.7653250694274902, + "learning_rate": 0.0002, + "loss": 2.4254, + "step": 25820 + }, + { + "epoch": 1.9247391952309985, + "grad_norm": 2.630427122116089, + "learning_rate": 0.0002, + "loss": 2.4022, + "step": 25830 + }, + { + "epoch": 1.9254843517138598, + "grad_norm": 2.5337681770324707, + "learning_rate": 0.0002, + "loss": 2.497, + "step": 25840 + }, + { + "epoch": 1.9262295081967213, + "grad_norm": 2.4699361324310303, + "learning_rate": 0.0002, + "loss": 2.6214, + "step": 25850 + }, + { + "epoch": 1.9269746646795827, + "grad_norm": 1.9499725103378296, + "learning_rate": 0.0002, + "loss": 2.4598, + "step": 25860 + }, + { + "epoch": 1.927719821162444, + "grad_norm": 2.4422264099121094, + "learning_rate": 0.0002, + "loss": 2.4889, + "step": 25870 + }, + { + "epoch": 1.9284649776453056, + "grad_norm": 2.2586450576782227, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 25880 + }, + { + "epoch": 1.9292101341281669, + "grad_norm": 2.6377463340759277, + "learning_rate": 0.0002, + "loss": 2.3293, + "step": 25890 + }, + { + "epoch": 1.9299552906110282, + "grad_norm": 2.7808732986450195, + "learning_rate": 0.0002, + "loss": 2.5287, + "step": 25900 + }, + { + "epoch": 1.9307004470938898, + "grad_norm": 3.0270299911499023, + "learning_rate": 0.0002, + "loss": 2.5512, + "step": 25910 + }, + { + "epoch": 1.931445603576751, + "grad_norm": 2.4635636806488037, + "learning_rate": 0.0002, + "loss": 2.5143, + "step": 25920 + }, + { + "epoch": 1.9321907600596124, + "grad_norm": 3.6546430587768555, + "learning_rate": 0.0002, + "loss": 2.4352, + "step": 25930 + }, + { + "epoch": 1.932935916542474, + "grad_norm": 2.299994707107544, + "learning_rate": 0.0002, + "loss": 2.4868, + "step": 25940 + }, + { + "epoch": 1.9336810730253353, + "grad_norm": 2.5070691108703613, + "learning_rate": 0.0002, + "loss": 2.5456, + "step": 25950 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 2.400691509246826, + "learning_rate": 0.0002, + "loss": 2.6418, + "step": 25960 + }, + { + "epoch": 1.9351713859910582, + "grad_norm": 2.6068758964538574, + "learning_rate": 0.0002, + "loss": 2.5137, + "step": 25970 + }, + { + "epoch": 1.9359165424739195, + "grad_norm": 2.5092999935150146, + "learning_rate": 0.0002, + "loss": 2.5346, + "step": 25980 + }, + { + "epoch": 1.9366616989567809, + "grad_norm": 2.6502528190612793, + "learning_rate": 0.0002, + "loss": 2.6351, + "step": 25990 + }, + { + "epoch": 1.9374068554396424, + "grad_norm": 2.406677722930908, + "learning_rate": 0.0002, + "loss": 2.4647, + "step": 26000 + }, + { + "epoch": 1.9381520119225037, + "grad_norm": 2.532393217086792, + "learning_rate": 0.0002, + "loss": 2.4782, + "step": 26010 + }, + { + "epoch": 1.938897168405365, + "grad_norm": 2.3414087295532227, + "learning_rate": 0.0002, + "loss": 2.4257, + "step": 26020 + }, + { + "epoch": 1.9396423248882266, + "grad_norm": 2.684752941131592, + "learning_rate": 0.0002, + "loss": 2.3947, + "step": 26030 + }, + { + "epoch": 1.940387481371088, + "grad_norm": 2.432410955429077, + "learning_rate": 0.0002, + "loss": 2.623, + "step": 26040 + }, + { + "epoch": 1.9411326378539493, + "grad_norm": 3.1778249740600586, + "learning_rate": 0.0002, + "loss": 2.4038, + "step": 26050 + }, + { + "epoch": 1.9418777943368108, + "grad_norm": 2.836688280105591, + "learning_rate": 0.0002, + "loss": 2.5303, + "step": 26060 + }, + { + "epoch": 1.9426229508196722, + "grad_norm": 2.4091341495513916, + "learning_rate": 0.0002, + "loss": 2.4783, + "step": 26070 + }, + { + "epoch": 1.9433681073025335, + "grad_norm": 2.5459396839141846, + "learning_rate": 0.0002, + "loss": 2.5745, + "step": 26080 + }, + { + "epoch": 1.944113263785395, + "grad_norm": 2.6074795722961426, + "learning_rate": 0.0002, + "loss": 2.5106, + "step": 26090 + }, + { + "epoch": 1.9448584202682562, + "grad_norm": 2.8824479579925537, + "learning_rate": 0.0002, + "loss": 2.4438, + "step": 26100 + }, + { + "epoch": 1.9456035767511177, + "grad_norm": 2.3653016090393066, + "learning_rate": 0.0002, + "loss": 2.5536, + "step": 26110 + }, + { + "epoch": 1.9463487332339793, + "grad_norm": 2.3197124004364014, + "learning_rate": 0.0002, + "loss": 2.4542, + "step": 26120 + }, + { + "epoch": 1.9470938897168404, + "grad_norm": 2.626267194747925, + "learning_rate": 0.0002, + "loss": 2.4828, + "step": 26130 + }, + { + "epoch": 1.947839046199702, + "grad_norm": 2.914698362350464, + "learning_rate": 0.0002, + "loss": 2.5634, + "step": 26140 + }, + { + "epoch": 1.9485842026825635, + "grad_norm": 2.8641860485076904, + "learning_rate": 0.0002, + "loss": 2.6208, + "step": 26150 + }, + { + "epoch": 1.9493293591654246, + "grad_norm": 2.823122262954712, + "learning_rate": 0.0002, + "loss": 2.5639, + "step": 26160 + }, + { + "epoch": 1.9500745156482862, + "grad_norm": 2.152498722076416, + "learning_rate": 0.0002, + "loss": 2.5226, + "step": 26170 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 2.7207183837890625, + "learning_rate": 0.0002, + "loss": 2.4951, + "step": 26180 + }, + { + "epoch": 1.9515648286140088, + "grad_norm": 2.4285998344421387, + "learning_rate": 0.0002, + "loss": 2.5207, + "step": 26190 + }, + { + "epoch": 1.9523099850968704, + "grad_norm": 2.4538893699645996, + "learning_rate": 0.0002, + "loss": 2.4034, + "step": 26200 + }, + { + "epoch": 1.9530551415797317, + "grad_norm": 2.513335704803467, + "learning_rate": 0.0002, + "loss": 2.4913, + "step": 26210 + }, + { + "epoch": 1.953800298062593, + "grad_norm": 2.8927149772644043, + "learning_rate": 0.0002, + "loss": 2.5541, + "step": 26220 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 2.610621452331543, + "learning_rate": 0.0002, + "loss": 2.3861, + "step": 26230 + }, + { + "epoch": 1.955290611028316, + "grad_norm": 2.7638297080993652, + "learning_rate": 0.0002, + "loss": 2.452, + "step": 26240 + }, + { + "epoch": 1.9560357675111772, + "grad_norm": 2.722166061401367, + "learning_rate": 0.0002, + "loss": 2.6235, + "step": 26250 + }, + { + "epoch": 1.9567809239940388, + "grad_norm": 2.614750862121582, + "learning_rate": 0.0002, + "loss": 2.4784, + "step": 26260 + }, + { + "epoch": 1.9575260804769001, + "grad_norm": 2.5103914737701416, + "learning_rate": 0.0002, + "loss": 2.5742, + "step": 26270 + }, + { + "epoch": 1.9582712369597615, + "grad_norm": 2.72584867477417, + "learning_rate": 0.0002, + "loss": 2.379, + "step": 26280 + }, + { + "epoch": 1.959016393442623, + "grad_norm": 2.343048095703125, + "learning_rate": 0.0002, + "loss": 2.5236, + "step": 26290 + }, + { + "epoch": 1.9597615499254843, + "grad_norm": 2.357050895690918, + "learning_rate": 0.0002, + "loss": 2.4987, + "step": 26300 + }, + { + "epoch": 1.9605067064083457, + "grad_norm": 2.71625018119812, + "learning_rate": 0.0002, + "loss": 2.596, + "step": 26310 + }, + { + "epoch": 1.9612518628912072, + "grad_norm": 2.790226459503174, + "learning_rate": 0.0002, + "loss": 2.4313, + "step": 26320 + }, + { + "epoch": 1.9619970193740686, + "grad_norm": 2.4327445030212402, + "learning_rate": 0.0002, + "loss": 2.5478, + "step": 26330 + }, + { + "epoch": 1.96274217585693, + "grad_norm": 2.637118339538574, + "learning_rate": 0.0002, + "loss": 2.638, + "step": 26340 + }, + { + "epoch": 1.9634873323397914, + "grad_norm": 2.398437023162842, + "learning_rate": 0.0002, + "loss": 2.3303, + "step": 26350 + }, + { + "epoch": 1.9642324888226528, + "grad_norm": 2.131314277648926, + "learning_rate": 0.0002, + "loss": 2.2881, + "step": 26360 + }, + { + "epoch": 1.964977645305514, + "grad_norm": 2.55584716796875, + "learning_rate": 0.0002, + "loss": 2.5168, + "step": 26370 + }, + { + "epoch": 1.9657228017883757, + "grad_norm": 2.5380923748016357, + "learning_rate": 0.0002, + "loss": 2.362, + "step": 26380 + }, + { + "epoch": 1.966467958271237, + "grad_norm": 2.3342056274414062, + "learning_rate": 0.0002, + "loss": 2.6316, + "step": 26390 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 2.483621835708618, + "learning_rate": 0.0002, + "loss": 2.4311, + "step": 26400 + }, + { + "epoch": 1.9679582712369599, + "grad_norm": 3.239633560180664, + "learning_rate": 0.0002, + "loss": 2.4224, + "step": 26410 + }, + { + "epoch": 1.9687034277198212, + "grad_norm": 2.420544147491455, + "learning_rate": 0.0002, + "loss": 2.4733, + "step": 26420 + }, + { + "epoch": 1.9694485842026825, + "grad_norm": 2.3765010833740234, + "learning_rate": 0.0002, + "loss": 2.519, + "step": 26430 + }, + { + "epoch": 1.970193740685544, + "grad_norm": 2.2879652976989746, + "learning_rate": 0.0002, + "loss": 2.5747, + "step": 26440 + }, + { + "epoch": 1.9709388971684052, + "grad_norm": 2.4816901683807373, + "learning_rate": 0.0002, + "loss": 2.566, + "step": 26450 + }, + { + "epoch": 1.9716840536512668, + "grad_norm": 2.6422929763793945, + "learning_rate": 0.0002, + "loss": 2.4609, + "step": 26460 + }, + { + "epoch": 1.9724292101341283, + "grad_norm": 3.031636953353882, + "learning_rate": 0.0002, + "loss": 2.4401, + "step": 26470 + }, + { + "epoch": 1.9731743666169894, + "grad_norm": 1.9829246997833252, + "learning_rate": 0.0002, + "loss": 2.4296, + "step": 26480 + }, + { + "epoch": 1.973919523099851, + "grad_norm": 2.687513828277588, + "learning_rate": 0.0002, + "loss": 2.5862, + "step": 26490 + }, + { + "epoch": 1.9746646795827125, + "grad_norm": 2.684131145477295, + "learning_rate": 0.0002, + "loss": 2.6292, + "step": 26500 + }, + { + "epoch": 1.9754098360655736, + "grad_norm": 2.090724468231201, + "learning_rate": 0.0002, + "loss": 2.3809, + "step": 26510 + }, + { + "epoch": 1.9761549925484352, + "grad_norm": 2.503957748413086, + "learning_rate": 0.0002, + "loss": 2.662, + "step": 26520 + }, + { + "epoch": 1.9769001490312967, + "grad_norm": 2.3615572452545166, + "learning_rate": 0.0002, + "loss": 2.5989, + "step": 26530 + }, + { + "epoch": 1.9776453055141578, + "grad_norm": 2.630366086959839, + "learning_rate": 0.0002, + "loss": 2.5245, + "step": 26540 + }, + { + "epoch": 1.9783904619970194, + "grad_norm": 2.7511889934539795, + "learning_rate": 0.0002, + "loss": 2.4983, + "step": 26550 + }, + { + "epoch": 1.9791356184798807, + "grad_norm": 2.7013278007507324, + "learning_rate": 0.0002, + "loss": 2.6212, + "step": 26560 + }, + { + "epoch": 1.979880774962742, + "grad_norm": 2.5615251064300537, + "learning_rate": 0.0002, + "loss": 2.5408, + "step": 26570 + }, + { + "epoch": 1.9806259314456036, + "grad_norm": 2.737309217453003, + "learning_rate": 0.0002, + "loss": 2.5117, + "step": 26580 + }, + { + "epoch": 1.981371087928465, + "grad_norm": 2.0868828296661377, + "learning_rate": 0.0002, + "loss": 2.3501, + "step": 26590 + }, + { + "epoch": 1.9821162444113263, + "grad_norm": 2.7384564876556396, + "learning_rate": 0.0002, + "loss": 2.4741, + "step": 26600 + }, + { + "epoch": 1.9828614008941878, + "grad_norm": 2.5381107330322266, + "learning_rate": 0.0002, + "loss": 2.5183, + "step": 26610 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 2.441673994064331, + "learning_rate": 0.0002, + "loss": 2.6689, + "step": 26620 + }, + { + "epoch": 1.9843517138599105, + "grad_norm": 2.331639528274536, + "learning_rate": 0.0002, + "loss": 2.4582, + "step": 26630 + }, + { + "epoch": 1.985096870342772, + "grad_norm": 2.777003288269043, + "learning_rate": 0.0002, + "loss": 2.4258, + "step": 26640 + }, + { + "epoch": 1.9858420268256334, + "grad_norm": 2.382567882537842, + "learning_rate": 0.0002, + "loss": 2.4606, + "step": 26650 + }, + { + "epoch": 1.9865871833084947, + "grad_norm": 2.5854926109313965, + "learning_rate": 0.0002, + "loss": 2.558, + "step": 26660 + }, + { + "epoch": 1.9873323397913563, + "grad_norm": 2.8911728858947754, + "learning_rate": 0.0002, + "loss": 2.6694, + "step": 26670 + }, + { + "epoch": 1.9880774962742176, + "grad_norm": 2.597140073776245, + "learning_rate": 0.0002, + "loss": 2.5155, + "step": 26680 + }, + { + "epoch": 1.988822652757079, + "grad_norm": 2.733752489089966, + "learning_rate": 0.0002, + "loss": 2.7112, + "step": 26690 + }, + { + "epoch": 1.9895678092399405, + "grad_norm": 2.311286687850952, + "learning_rate": 0.0002, + "loss": 2.3415, + "step": 26700 + }, + { + "epoch": 1.9903129657228018, + "grad_norm": 2.7273147106170654, + "learning_rate": 0.0002, + "loss": 2.6524, + "step": 26710 + }, + { + "epoch": 1.9910581222056631, + "grad_norm": 2.367246389389038, + "learning_rate": 0.0002, + "loss": 2.5106, + "step": 26720 + }, + { + "epoch": 1.9918032786885247, + "grad_norm": 2.5700063705444336, + "learning_rate": 0.0002, + "loss": 2.5936, + "step": 26730 + }, + { + "epoch": 1.992548435171386, + "grad_norm": 2.5638208389282227, + "learning_rate": 0.0002, + "loss": 2.6146, + "step": 26740 + }, + { + "epoch": 1.9932935916542474, + "grad_norm": 1.9451677799224854, + "learning_rate": 0.0002, + "loss": 2.4673, + "step": 26750 + }, + { + "epoch": 1.994038748137109, + "grad_norm": 2.4559242725372314, + "learning_rate": 0.0002, + "loss": 2.5946, + "step": 26760 + }, + { + "epoch": 1.9947839046199702, + "grad_norm": 2.445701837539673, + "learning_rate": 0.0002, + "loss": 2.7494, + "step": 26770 + }, + { + "epoch": 1.9955290611028316, + "grad_norm": 2.668653964996338, + "learning_rate": 0.0002, + "loss": 2.3548, + "step": 26780 + }, + { + "epoch": 1.9962742175856931, + "grad_norm": 2.523761749267578, + "learning_rate": 0.0002, + "loss": 2.4606, + "step": 26790 + }, + { + "epoch": 1.9970193740685542, + "grad_norm": 2.4471657276153564, + "learning_rate": 0.0002, + "loss": 2.4458, + "step": 26800 + }, + { + "epoch": 1.9977645305514158, + "grad_norm": 2.2107632160186768, + "learning_rate": 0.0002, + "loss": 2.5186, + "step": 26810 + }, + { + "epoch": 1.9985096870342773, + "grad_norm": 2.4634511470794678, + "learning_rate": 0.0002, + "loss": 2.5912, + "step": 26820 + }, + { + "epoch": 1.9992548435171384, + "grad_norm": 2.3914170265197754, + "learning_rate": 0.0002, + "loss": 2.4112, + "step": 26830 + }, + { + "epoch": 2.0, + "grad_norm": 2.793278455734253, + "learning_rate": 0.0002, + "loss": 2.4186, + "step": 26840 + }, + { + "epoch": 2.0, + "eval_runtime": 2778.8528, + "eval_samples_per_second": 4.829, + "eval_steps_per_second": 0.604, + "step": 26840 + }, + { + "epoch": 2.0007451564828616, + "grad_norm": 2.427016258239746, + "learning_rate": 0.0002, + "loss": 2.2428, + "step": 26850 + }, + { + "epoch": 2.0014903129657227, + "grad_norm": 2.445763111114502, + "learning_rate": 0.0002, + "loss": 2.4872, + "step": 26860 + }, + { + "epoch": 2.002235469448584, + "grad_norm": 2.4963574409484863, + "learning_rate": 0.0002, + "loss": 2.4217, + "step": 26870 + }, + { + "epoch": 2.0029806259314458, + "grad_norm": 3.122708320617676, + "learning_rate": 0.0002, + "loss": 2.482, + "step": 26880 + }, + { + "epoch": 2.003725782414307, + "grad_norm": 2.769930124282837, + "learning_rate": 0.0002, + "loss": 2.3721, + "step": 26890 + }, + { + "epoch": 2.0044709388971684, + "grad_norm": 2.6897122859954834, + "learning_rate": 0.0002, + "loss": 2.2882, + "step": 26900 + }, + { + "epoch": 2.00521609538003, + "grad_norm": 2.7737324237823486, + "learning_rate": 0.0002, + "loss": 2.4593, + "step": 26910 + }, + { + "epoch": 2.005961251862891, + "grad_norm": 2.649350643157959, + "learning_rate": 0.0002, + "loss": 2.5489, + "step": 26920 + }, + { + "epoch": 2.0067064083457526, + "grad_norm": 2.721299886703491, + "learning_rate": 0.0002, + "loss": 2.4069, + "step": 26930 + }, + { + "epoch": 2.007451564828614, + "grad_norm": 2.5351850986480713, + "learning_rate": 0.0002, + "loss": 2.3394, + "step": 26940 + }, + { + "epoch": 2.0081967213114753, + "grad_norm": 2.717834949493408, + "learning_rate": 0.0002, + "loss": 2.505, + "step": 26950 + }, + { + "epoch": 2.008941877794337, + "grad_norm": 2.9618914127349854, + "learning_rate": 0.0002, + "loss": 2.4794, + "step": 26960 + }, + { + "epoch": 2.0096870342771984, + "grad_norm": 2.4195728302001953, + "learning_rate": 0.0002, + "loss": 2.3318, + "step": 26970 + }, + { + "epoch": 2.0104321907600595, + "grad_norm": 3.099935531616211, + "learning_rate": 0.0002, + "loss": 2.2687, + "step": 26980 + }, + { + "epoch": 2.011177347242921, + "grad_norm": 2.5944323539733887, + "learning_rate": 0.0002, + "loss": 2.5544, + "step": 26990 + }, + { + "epoch": 2.0119225037257826, + "grad_norm": 2.521864175796509, + "learning_rate": 0.0002, + "loss": 2.4157, + "step": 27000 + }, + { + "epoch": 2.0126676602086437, + "grad_norm": 2.8137166500091553, + "learning_rate": 0.0002, + "loss": 2.2001, + "step": 27010 + }, + { + "epoch": 2.0134128166915053, + "grad_norm": 2.695870876312256, + "learning_rate": 0.0002, + "loss": 2.3827, + "step": 27020 + }, + { + "epoch": 2.0141579731743664, + "grad_norm": 2.4329819679260254, + "learning_rate": 0.0002, + "loss": 2.2124, + "step": 27030 + }, + { + "epoch": 2.014903129657228, + "grad_norm": 2.539754629135132, + "learning_rate": 0.0002, + "loss": 2.3536, + "step": 27040 + }, + { + "epoch": 2.0156482861400895, + "grad_norm": 2.557807207107544, + "learning_rate": 0.0002, + "loss": 2.3424, + "step": 27050 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 3.0624873638153076, + "learning_rate": 0.0002, + "loss": 2.5087, + "step": 27060 + }, + { + "epoch": 2.017138599105812, + "grad_norm": 2.791489601135254, + "learning_rate": 0.0002, + "loss": 2.515, + "step": 27070 + }, + { + "epoch": 2.0178837555886737, + "grad_norm": 2.6769886016845703, + "learning_rate": 0.0002, + "loss": 2.5366, + "step": 27080 + }, + { + "epoch": 2.018628912071535, + "grad_norm": 2.861119270324707, + "learning_rate": 0.0002, + "loss": 2.5285, + "step": 27090 + }, + { + "epoch": 2.0193740685543964, + "grad_norm": 2.2920444011688232, + "learning_rate": 0.0002, + "loss": 2.3931, + "step": 27100 + }, + { + "epoch": 2.020119225037258, + "grad_norm": 2.878737211227417, + "learning_rate": 0.0002, + "loss": 2.2288, + "step": 27110 + }, + { + "epoch": 2.020864381520119, + "grad_norm": 2.782578706741333, + "learning_rate": 0.0002, + "loss": 2.2606, + "step": 27120 + }, + { + "epoch": 2.0216095380029806, + "grad_norm": 2.6214089393615723, + "learning_rate": 0.0002, + "loss": 2.4091, + "step": 27130 + }, + { + "epoch": 2.022354694485842, + "grad_norm": 2.628899335861206, + "learning_rate": 0.0002, + "loss": 2.3615, + "step": 27140 + }, + { + "epoch": 2.0230998509687033, + "grad_norm": 2.4732677936553955, + "learning_rate": 0.0002, + "loss": 2.2547, + "step": 27150 + }, + { + "epoch": 2.023845007451565, + "grad_norm": 2.5701940059661865, + "learning_rate": 0.0002, + "loss": 2.3523, + "step": 27160 + }, + { + "epoch": 2.0245901639344264, + "grad_norm": 2.6380484104156494, + "learning_rate": 0.0002, + "loss": 2.2513, + "step": 27170 + }, + { + "epoch": 2.0253353204172875, + "grad_norm": 2.5996925830841064, + "learning_rate": 0.0002, + "loss": 2.3843, + "step": 27180 + }, + { + "epoch": 2.026080476900149, + "grad_norm": 2.727452516555786, + "learning_rate": 0.0002, + "loss": 2.2922, + "step": 27190 + }, + { + "epoch": 2.0268256333830106, + "grad_norm": 2.9830238819122314, + "learning_rate": 0.0002, + "loss": 2.313, + "step": 27200 + }, + { + "epoch": 2.0275707898658717, + "grad_norm": 2.7571492195129395, + "learning_rate": 0.0002, + "loss": 2.3851, + "step": 27210 + }, + { + "epoch": 2.0283159463487332, + "grad_norm": 2.3931894302368164, + "learning_rate": 0.0002, + "loss": 2.0832, + "step": 27220 + }, + { + "epoch": 2.029061102831595, + "grad_norm": 2.732896566390991, + "learning_rate": 0.0002, + "loss": 2.4687, + "step": 27230 + }, + { + "epoch": 2.029806259314456, + "grad_norm": 2.637854814529419, + "learning_rate": 0.0002, + "loss": 2.487, + "step": 27240 + }, + { + "epoch": 2.0305514157973175, + "grad_norm": 2.2118914127349854, + "learning_rate": 0.0002, + "loss": 2.3466, + "step": 27250 + }, + { + "epoch": 2.031296572280179, + "grad_norm": 2.7009260654449463, + "learning_rate": 0.0002, + "loss": 2.3756, + "step": 27260 + }, + { + "epoch": 2.03204172876304, + "grad_norm": 2.3376128673553467, + "learning_rate": 0.0002, + "loss": 2.4311, + "step": 27270 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 2.7543601989746094, + "learning_rate": 0.0002, + "loss": 2.2645, + "step": 27280 + }, + { + "epoch": 2.0335320417287632, + "grad_norm": 2.2563414573669434, + "learning_rate": 0.0002, + "loss": 2.3356, + "step": 27290 + }, + { + "epoch": 2.0342771982116243, + "grad_norm": 2.4909472465515137, + "learning_rate": 0.0002, + "loss": 2.4706, + "step": 27300 + }, + { + "epoch": 2.035022354694486, + "grad_norm": 2.689833164215088, + "learning_rate": 0.0002, + "loss": 2.5033, + "step": 27310 + }, + { + "epoch": 2.0357675111773474, + "grad_norm": 2.7386205196380615, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 27320 + }, + { + "epoch": 2.0365126676602086, + "grad_norm": 2.537919759750366, + "learning_rate": 0.0002, + "loss": 2.4454, + "step": 27330 + }, + { + "epoch": 2.03725782414307, + "grad_norm": 2.522033214569092, + "learning_rate": 0.0002, + "loss": 2.2687, + "step": 27340 + }, + { + "epoch": 2.0380029806259317, + "grad_norm": 2.6754488945007324, + "learning_rate": 0.0002, + "loss": 2.2479, + "step": 27350 + }, + { + "epoch": 2.0387481371087928, + "grad_norm": 2.134866952896118, + "learning_rate": 0.0002, + "loss": 2.3477, + "step": 27360 + }, + { + "epoch": 2.0394932935916543, + "grad_norm": 2.3635213375091553, + "learning_rate": 0.0002, + "loss": 2.4031, + "step": 27370 + }, + { + "epoch": 2.0402384500745154, + "grad_norm": 2.4664597511291504, + "learning_rate": 0.0002, + "loss": 2.5241, + "step": 27380 + }, + { + "epoch": 2.040983606557377, + "grad_norm": 2.7248127460479736, + "learning_rate": 0.0002, + "loss": 2.4092, + "step": 27390 + }, + { + "epoch": 2.0417287630402385, + "grad_norm": 2.75724458694458, + "learning_rate": 0.0002, + "loss": 2.4693, + "step": 27400 + }, + { + "epoch": 2.0424739195230996, + "grad_norm": 2.580315113067627, + "learning_rate": 0.0002, + "loss": 2.4291, + "step": 27410 + }, + { + "epoch": 2.043219076005961, + "grad_norm": 3.07570743560791, + "learning_rate": 0.0002, + "loss": 2.4226, + "step": 27420 + }, + { + "epoch": 2.0439642324888228, + "grad_norm": 2.4915974140167236, + "learning_rate": 0.0002, + "loss": 2.4481, + "step": 27430 + }, + { + "epoch": 2.044709388971684, + "grad_norm": 2.7171037197113037, + "learning_rate": 0.0002, + "loss": 2.2091, + "step": 27440 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 3.0568337440490723, + "learning_rate": 0.0002, + "loss": 2.3815, + "step": 27450 + }, + { + "epoch": 2.046199701937407, + "grad_norm": 2.659975051879883, + "learning_rate": 0.0002, + "loss": 2.4473, + "step": 27460 + }, + { + "epoch": 2.046944858420268, + "grad_norm": 2.7344188690185547, + "learning_rate": 0.0002, + "loss": 2.429, + "step": 27470 + }, + { + "epoch": 2.0476900149031296, + "grad_norm": 2.360349416732788, + "learning_rate": 0.0002, + "loss": 2.2687, + "step": 27480 + }, + { + "epoch": 2.048435171385991, + "grad_norm": 2.6498806476593018, + "learning_rate": 0.0002, + "loss": 2.4305, + "step": 27490 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 2.6238315105438232, + "learning_rate": 0.0002, + "loss": 2.3188, + "step": 27500 + }, + { + "epoch": 2.049925484351714, + "grad_norm": 2.6448466777801514, + "learning_rate": 0.0002, + "loss": 2.286, + "step": 27510 + }, + { + "epoch": 2.0506706408345754, + "grad_norm": 2.3168444633483887, + "learning_rate": 0.0002, + "loss": 2.2408, + "step": 27520 + }, + { + "epoch": 2.0514157973174365, + "grad_norm": 2.7542989253997803, + "learning_rate": 0.0002, + "loss": 2.2652, + "step": 27530 + }, + { + "epoch": 2.052160953800298, + "grad_norm": 2.794304132461548, + "learning_rate": 0.0002, + "loss": 2.3382, + "step": 27540 + }, + { + "epoch": 2.0529061102831596, + "grad_norm": 2.4550607204437256, + "learning_rate": 0.0002, + "loss": 2.5653, + "step": 27550 + }, + { + "epoch": 2.0536512667660207, + "grad_norm": 2.7925267219543457, + "learning_rate": 0.0002, + "loss": 2.3883, + "step": 27560 + }, + { + "epoch": 2.0543964232488823, + "grad_norm": 2.732017993927002, + "learning_rate": 0.0002, + "loss": 2.5619, + "step": 27570 + }, + { + "epoch": 2.055141579731744, + "grad_norm": 2.7166635990142822, + "learning_rate": 0.0002, + "loss": 2.3683, + "step": 27580 + }, + { + "epoch": 2.055886736214605, + "grad_norm": 2.5912301540374756, + "learning_rate": 0.0002, + "loss": 2.3318, + "step": 27590 + }, + { + "epoch": 2.0566318926974665, + "grad_norm": 2.5270578861236572, + "learning_rate": 0.0002, + "loss": 2.2477, + "step": 27600 + }, + { + "epoch": 2.057377049180328, + "grad_norm": 2.8963217735290527, + "learning_rate": 0.0002, + "loss": 2.3905, + "step": 27610 + }, + { + "epoch": 2.058122205663189, + "grad_norm": 2.306488513946533, + "learning_rate": 0.0002, + "loss": 2.3994, + "step": 27620 + }, + { + "epoch": 2.0588673621460507, + "grad_norm": 2.74562406539917, + "learning_rate": 0.0002, + "loss": 2.2255, + "step": 27630 + }, + { + "epoch": 2.0596125186289123, + "grad_norm": 2.907052755355835, + "learning_rate": 0.0002, + "loss": 2.4501, + "step": 27640 + }, + { + "epoch": 2.0603576751117734, + "grad_norm": 2.6865978240966797, + "learning_rate": 0.0002, + "loss": 2.4872, + "step": 27650 + }, + { + "epoch": 2.061102831594635, + "grad_norm": 2.829334259033203, + "learning_rate": 0.0002, + "loss": 2.3885, + "step": 27660 + }, + { + "epoch": 2.0618479880774965, + "grad_norm": 2.276557445526123, + "learning_rate": 0.0002, + "loss": 2.2833, + "step": 27670 + }, + { + "epoch": 2.0625931445603576, + "grad_norm": 2.7757015228271484, + "learning_rate": 0.0002, + "loss": 2.4336, + "step": 27680 + }, + { + "epoch": 2.063338301043219, + "grad_norm": 2.630664587020874, + "learning_rate": 0.0002, + "loss": 2.466, + "step": 27690 + }, + { + "epoch": 2.0640834575260807, + "grad_norm": 2.500433921813965, + "learning_rate": 0.0002, + "loss": 2.4625, + "step": 27700 + }, + { + "epoch": 2.064828614008942, + "grad_norm": 2.5673367977142334, + "learning_rate": 0.0002, + "loss": 2.3708, + "step": 27710 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 2.798462390899658, + "learning_rate": 0.0002, + "loss": 2.5025, + "step": 27720 + }, + { + "epoch": 2.066318926974665, + "grad_norm": 2.660876989364624, + "learning_rate": 0.0002, + "loss": 2.3465, + "step": 27730 + }, + { + "epoch": 2.067064083457526, + "grad_norm": 2.681070566177368, + "learning_rate": 0.0002, + "loss": 2.4037, + "step": 27740 + }, + { + "epoch": 2.0678092399403876, + "grad_norm": 2.4489269256591797, + "learning_rate": 0.0002, + "loss": 2.3239, + "step": 27750 + }, + { + "epoch": 2.0685543964232487, + "grad_norm": 2.980785608291626, + "learning_rate": 0.0002, + "loss": 2.4702, + "step": 27760 + }, + { + "epoch": 2.0692995529061102, + "grad_norm": 2.7260890007019043, + "learning_rate": 0.0002, + "loss": 2.3616, + "step": 27770 + }, + { + "epoch": 2.070044709388972, + "grad_norm": 2.714730739593506, + "learning_rate": 0.0002, + "loss": 2.3833, + "step": 27780 + }, + { + "epoch": 2.070789865871833, + "grad_norm": 2.3693652153015137, + "learning_rate": 0.0002, + "loss": 2.2471, + "step": 27790 + }, + { + "epoch": 2.0715350223546944, + "grad_norm": 2.6847126483917236, + "learning_rate": 0.0002, + "loss": 2.5155, + "step": 27800 + }, + { + "epoch": 2.072280178837556, + "grad_norm": 3.285604476928711, + "learning_rate": 0.0002, + "loss": 2.5073, + "step": 27810 + }, + { + "epoch": 2.073025335320417, + "grad_norm": 2.9383273124694824, + "learning_rate": 0.0002, + "loss": 2.401, + "step": 27820 + }, + { + "epoch": 2.0737704918032787, + "grad_norm": 2.5305216312408447, + "learning_rate": 0.0002, + "loss": 2.49, + "step": 27830 + }, + { + "epoch": 2.07451564828614, + "grad_norm": 3.088163375854492, + "learning_rate": 0.0002, + "loss": 2.3074, + "step": 27840 + }, + { + "epoch": 2.0752608047690013, + "grad_norm": 2.9205684661865234, + "learning_rate": 0.0002, + "loss": 2.3076, + "step": 27850 + }, + { + "epoch": 2.076005961251863, + "grad_norm": 2.644702196121216, + "learning_rate": 0.0002, + "loss": 2.3757, + "step": 27860 + }, + { + "epoch": 2.0767511177347244, + "grad_norm": 2.7440295219421387, + "learning_rate": 0.0002, + "loss": 2.4924, + "step": 27870 + }, + { + "epoch": 2.0774962742175855, + "grad_norm": 2.949817657470703, + "learning_rate": 0.0002, + "loss": 2.343, + "step": 27880 + }, + { + "epoch": 2.078241430700447, + "grad_norm": 2.5574519634246826, + "learning_rate": 0.0002, + "loss": 2.4203, + "step": 27890 + }, + { + "epoch": 2.0789865871833086, + "grad_norm": 2.6132795810699463, + "learning_rate": 0.0002, + "loss": 2.5225, + "step": 27900 + }, + { + "epoch": 2.0797317436661698, + "grad_norm": 2.1226987838745117, + "learning_rate": 0.0002, + "loss": 2.1986, + "step": 27910 + }, + { + "epoch": 2.0804769001490313, + "grad_norm": 2.5743627548217773, + "learning_rate": 0.0002, + "loss": 2.3704, + "step": 27920 + }, + { + "epoch": 2.081222056631893, + "grad_norm": 2.4740493297576904, + "learning_rate": 0.0002, + "loss": 2.2522, + "step": 27930 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 2.5864386558532715, + "learning_rate": 0.0002, + "loss": 2.4904, + "step": 27940 + }, + { + "epoch": 2.0827123695976155, + "grad_norm": 2.6593196392059326, + "learning_rate": 0.0002, + "loss": 2.2939, + "step": 27950 + }, + { + "epoch": 2.083457526080477, + "grad_norm": 2.5160462856292725, + "learning_rate": 0.0002, + "loss": 2.5326, + "step": 27960 + }, + { + "epoch": 2.084202682563338, + "grad_norm": 2.8797519207000732, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 27970 + }, + { + "epoch": 2.0849478390461997, + "grad_norm": 2.435398578643799, + "learning_rate": 0.0002, + "loss": 2.4237, + "step": 27980 + }, + { + "epoch": 2.0856929955290613, + "grad_norm": 2.6593923568725586, + "learning_rate": 0.0002, + "loss": 2.5967, + "step": 27990 + }, + { + "epoch": 2.0864381520119224, + "grad_norm": 2.205998182296753, + "learning_rate": 0.0002, + "loss": 2.2983, + "step": 28000 + }, + { + "epoch": 2.087183308494784, + "grad_norm": 2.313530445098877, + "learning_rate": 0.0002, + "loss": 2.291, + "step": 28010 + }, + { + "epoch": 2.0879284649776455, + "grad_norm": 2.9315454959869385, + "learning_rate": 0.0002, + "loss": 2.311, + "step": 28020 + }, + { + "epoch": 2.0886736214605066, + "grad_norm": 2.7529959678649902, + "learning_rate": 0.0002, + "loss": 2.2777, + "step": 28030 + }, + { + "epoch": 2.089418777943368, + "grad_norm": 2.668069362640381, + "learning_rate": 0.0002, + "loss": 2.6066, + "step": 28040 + }, + { + "epoch": 2.0901639344262297, + "grad_norm": 2.849188804626465, + "learning_rate": 0.0002, + "loss": 2.535, + "step": 28050 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 2.613250970840454, + "learning_rate": 0.0002, + "loss": 2.2796, + "step": 28060 + }, + { + "epoch": 2.0916542473919524, + "grad_norm": 2.3394582271575928, + "learning_rate": 0.0002, + "loss": 2.4765, + "step": 28070 + }, + { + "epoch": 2.092399403874814, + "grad_norm": 2.5623340606689453, + "learning_rate": 0.0002, + "loss": 2.4527, + "step": 28080 + }, + { + "epoch": 2.093144560357675, + "grad_norm": 2.4075920581817627, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 28090 + }, + { + "epoch": 2.0938897168405366, + "grad_norm": 2.763248920440674, + "learning_rate": 0.0002, + "loss": 2.2659, + "step": 28100 + }, + { + "epoch": 2.0946348733233977, + "grad_norm": 2.7755019664764404, + "learning_rate": 0.0002, + "loss": 2.622, + "step": 28110 + }, + { + "epoch": 2.0953800298062593, + "grad_norm": 2.6472527980804443, + "learning_rate": 0.0002, + "loss": 2.4746, + "step": 28120 + }, + { + "epoch": 2.096125186289121, + "grad_norm": 3.476677417755127, + "learning_rate": 0.0002, + "loss": 2.3413, + "step": 28130 + }, + { + "epoch": 2.096870342771982, + "grad_norm": 2.5230743885040283, + "learning_rate": 0.0002, + "loss": 2.3565, + "step": 28140 + }, + { + "epoch": 2.0976154992548435, + "grad_norm": 2.7483949661254883, + "learning_rate": 0.0002, + "loss": 2.3682, + "step": 28150 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 2.6513829231262207, + "learning_rate": 0.0002, + "loss": 2.2721, + "step": 28160 + }, + { + "epoch": 2.099105812220566, + "grad_norm": 2.6120636463165283, + "learning_rate": 0.0002, + "loss": 2.6904, + "step": 28170 + }, + { + "epoch": 2.0998509687034277, + "grad_norm": 2.4628775119781494, + "learning_rate": 0.0002, + "loss": 2.4378, + "step": 28180 + }, + { + "epoch": 2.1005961251862892, + "grad_norm": 2.5905325412750244, + "learning_rate": 0.0002, + "loss": 2.2657, + "step": 28190 + }, + { + "epoch": 2.1013412816691504, + "grad_norm": 2.9600167274475098, + "learning_rate": 0.0002, + "loss": 2.2855, + "step": 28200 + }, + { + "epoch": 2.102086438152012, + "grad_norm": 3.109748363494873, + "learning_rate": 0.0002, + "loss": 2.4388, + "step": 28210 + }, + { + "epoch": 2.1028315946348735, + "grad_norm": 2.639881134033203, + "learning_rate": 0.0002, + "loss": 2.4958, + "step": 28220 + }, + { + "epoch": 2.1035767511177346, + "grad_norm": 2.901447057723999, + "learning_rate": 0.0002, + "loss": 2.6251, + "step": 28230 + }, + { + "epoch": 2.104321907600596, + "grad_norm": 2.4745752811431885, + "learning_rate": 0.0002, + "loss": 2.5932, + "step": 28240 + }, + { + "epoch": 2.1050670640834577, + "grad_norm": 2.363933801651001, + "learning_rate": 0.0002, + "loss": 2.3875, + "step": 28250 + }, + { + "epoch": 2.105812220566319, + "grad_norm": 2.6219677925109863, + "learning_rate": 0.0002, + "loss": 2.3415, + "step": 28260 + }, + { + "epoch": 2.1065573770491803, + "grad_norm": 2.6669254302978516, + "learning_rate": 0.0002, + "loss": 2.5103, + "step": 28270 + }, + { + "epoch": 2.107302533532042, + "grad_norm": 2.7276668548583984, + "learning_rate": 0.0002, + "loss": 2.2377, + "step": 28280 + }, + { + "epoch": 2.108047690014903, + "grad_norm": 2.473850727081299, + "learning_rate": 0.0002, + "loss": 2.4347, + "step": 28290 + }, + { + "epoch": 2.1087928464977646, + "grad_norm": 2.89056396484375, + "learning_rate": 0.0002, + "loss": 2.3286, + "step": 28300 + }, + { + "epoch": 2.109538002980626, + "grad_norm": 2.4817709922790527, + "learning_rate": 0.0002, + "loss": 2.5692, + "step": 28310 + }, + { + "epoch": 2.110283159463487, + "grad_norm": 1.887121319770813, + "learning_rate": 0.0002, + "loss": 2.2894, + "step": 28320 + }, + { + "epoch": 2.1110283159463488, + "grad_norm": 2.799809694290161, + "learning_rate": 0.0002, + "loss": 2.4199, + "step": 28330 + }, + { + "epoch": 2.1117734724292103, + "grad_norm": 2.481428861618042, + "learning_rate": 0.0002, + "loss": 2.3954, + "step": 28340 + }, + { + "epoch": 2.1125186289120714, + "grad_norm": 2.573892831802368, + "learning_rate": 0.0002, + "loss": 2.4221, + "step": 28350 + }, + { + "epoch": 2.113263785394933, + "grad_norm": 2.6230032444000244, + "learning_rate": 0.0002, + "loss": 2.3498, + "step": 28360 + }, + { + "epoch": 2.1140089418777945, + "grad_norm": 2.3305563926696777, + "learning_rate": 0.0002, + "loss": 2.4446, + "step": 28370 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 2.297919750213623, + "learning_rate": 0.0002, + "loss": 2.452, + "step": 28380 + }, + { + "epoch": 2.115499254843517, + "grad_norm": 3.0434772968292236, + "learning_rate": 0.0002, + "loss": 2.3133, + "step": 28390 + }, + { + "epoch": 2.1162444113263787, + "grad_norm": 3.005995750427246, + "learning_rate": 0.0002, + "loss": 2.3543, + "step": 28400 + }, + { + "epoch": 2.11698956780924, + "grad_norm": 2.65181565284729, + "learning_rate": 0.0002, + "loss": 2.2972, + "step": 28410 + }, + { + "epoch": 2.1177347242921014, + "grad_norm": 2.5356462001800537, + "learning_rate": 0.0002, + "loss": 2.3888, + "step": 28420 + }, + { + "epoch": 2.118479880774963, + "grad_norm": 2.4722752571105957, + "learning_rate": 0.0002, + "loss": 2.4412, + "step": 28430 + }, + { + "epoch": 2.119225037257824, + "grad_norm": 2.7374267578125, + "learning_rate": 0.0002, + "loss": 2.5805, + "step": 28440 + }, + { + "epoch": 2.1199701937406856, + "grad_norm": 2.93241548538208, + "learning_rate": 0.0002, + "loss": 2.5802, + "step": 28450 + }, + { + "epoch": 2.1207153502235467, + "grad_norm": 2.6112005710601807, + "learning_rate": 0.0002, + "loss": 2.485, + "step": 28460 + }, + { + "epoch": 2.1214605067064083, + "grad_norm": 2.2507147789001465, + "learning_rate": 0.0002, + "loss": 2.3001, + "step": 28470 + }, + { + "epoch": 2.12220566318927, + "grad_norm": 2.8266193866729736, + "learning_rate": 0.0002, + "loss": 2.3678, + "step": 28480 + }, + { + "epoch": 2.122950819672131, + "grad_norm": 2.5680441856384277, + "learning_rate": 0.0002, + "loss": 2.5827, + "step": 28490 + }, + { + "epoch": 2.1236959761549925, + "grad_norm": 2.623478889465332, + "learning_rate": 0.0002, + "loss": 2.3191, + "step": 28500 + }, + { + "epoch": 2.124441132637854, + "grad_norm": 2.591749906539917, + "learning_rate": 0.0002, + "loss": 2.4871, + "step": 28510 + }, + { + "epoch": 2.125186289120715, + "grad_norm": 2.7326436042785645, + "learning_rate": 0.0002, + "loss": 2.3359, + "step": 28520 + }, + { + "epoch": 2.1259314456035767, + "grad_norm": 3.0076076984405518, + "learning_rate": 0.0002, + "loss": 2.3586, + "step": 28530 + }, + { + "epoch": 2.1266766020864383, + "grad_norm": 2.5301334857940674, + "learning_rate": 0.0002, + "loss": 2.4504, + "step": 28540 + }, + { + "epoch": 2.1274217585692994, + "grad_norm": 2.3998358249664307, + "learning_rate": 0.0002, + "loss": 2.2544, + "step": 28550 + }, + { + "epoch": 2.128166915052161, + "grad_norm": 2.7175776958465576, + "learning_rate": 0.0002, + "loss": 2.582, + "step": 28560 + }, + { + "epoch": 2.1289120715350225, + "grad_norm": 2.4228036403656006, + "learning_rate": 0.0002, + "loss": 2.3565, + "step": 28570 + }, + { + "epoch": 2.1296572280178836, + "grad_norm": 2.6200060844421387, + "learning_rate": 0.0002, + "loss": 2.1041, + "step": 28580 + }, + { + "epoch": 2.130402384500745, + "grad_norm": 2.493910312652588, + "learning_rate": 0.0002, + "loss": 2.3126, + "step": 28590 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 3.021311044692993, + "learning_rate": 0.0002, + "loss": 2.5049, + "step": 28600 + }, + { + "epoch": 2.131892697466468, + "grad_norm": 2.4209420680999756, + "learning_rate": 0.0002, + "loss": 2.3074, + "step": 28610 + }, + { + "epoch": 2.1326378539493294, + "grad_norm": 2.2820701599121094, + "learning_rate": 0.0002, + "loss": 2.3154, + "step": 28620 + }, + { + "epoch": 2.133383010432191, + "grad_norm": 2.4041130542755127, + "learning_rate": 0.0002, + "loss": 2.4206, + "step": 28630 + }, + { + "epoch": 2.134128166915052, + "grad_norm": 2.7592175006866455, + "learning_rate": 0.0002, + "loss": 2.4367, + "step": 28640 + }, + { + "epoch": 2.1348733233979136, + "grad_norm": 2.982658863067627, + "learning_rate": 0.0002, + "loss": 2.4851, + "step": 28650 + }, + { + "epoch": 2.135618479880775, + "grad_norm": 2.370884656906128, + "learning_rate": 0.0002, + "loss": 2.3921, + "step": 28660 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 2.121638298034668, + "learning_rate": 0.0002, + "loss": 2.4153, + "step": 28670 + }, + { + "epoch": 2.137108792846498, + "grad_norm": 2.6304523944854736, + "learning_rate": 0.0002, + "loss": 2.5541, + "step": 28680 + }, + { + "epoch": 2.1378539493293593, + "grad_norm": 2.711233139038086, + "learning_rate": 0.0002, + "loss": 2.3325, + "step": 28690 + }, + { + "epoch": 2.1385991058122205, + "grad_norm": 2.832350969314575, + "learning_rate": 0.0002, + "loss": 2.4829, + "step": 28700 + }, + { + "epoch": 2.139344262295082, + "grad_norm": 2.737445831298828, + "learning_rate": 0.0002, + "loss": 2.5671, + "step": 28710 + }, + { + "epoch": 2.1400894187779436, + "grad_norm": 2.456326484680176, + "learning_rate": 0.0002, + "loss": 2.4368, + "step": 28720 + }, + { + "epoch": 2.1408345752608047, + "grad_norm": 2.3802084922790527, + "learning_rate": 0.0002, + "loss": 2.5299, + "step": 28730 + }, + { + "epoch": 2.1415797317436662, + "grad_norm": 2.521446943283081, + "learning_rate": 0.0002, + "loss": 2.51, + "step": 28740 + }, + { + "epoch": 2.1423248882265273, + "grad_norm": 2.6229171752929688, + "learning_rate": 0.0002, + "loss": 2.4292, + "step": 28750 + }, + { + "epoch": 2.143070044709389, + "grad_norm": 2.573530435562134, + "learning_rate": 0.0002, + "loss": 2.5116, + "step": 28760 + }, + { + "epoch": 2.1438152011922504, + "grad_norm": 2.3284008502960205, + "learning_rate": 0.0002, + "loss": 2.4296, + "step": 28770 + }, + { + "epoch": 2.144560357675112, + "grad_norm": 2.8102099895477295, + "learning_rate": 0.0002, + "loss": 2.4006, + "step": 28780 + }, + { + "epoch": 2.145305514157973, + "grad_norm": 2.7067055702209473, + "learning_rate": 0.0002, + "loss": 2.5078, + "step": 28790 + }, + { + "epoch": 2.1460506706408347, + "grad_norm": 2.4002151489257812, + "learning_rate": 0.0002, + "loss": 2.6035, + "step": 28800 + }, + { + "epoch": 2.1467958271236958, + "grad_norm": 2.9272592067718506, + "learning_rate": 0.0002, + "loss": 2.6574, + "step": 28810 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 2.3437740802764893, + "learning_rate": 0.0002, + "loss": 2.3623, + "step": 28820 + }, + { + "epoch": 2.148286140089419, + "grad_norm": 2.795177459716797, + "learning_rate": 0.0002, + "loss": 2.4582, + "step": 28830 + }, + { + "epoch": 2.14903129657228, + "grad_norm": 2.934332847595215, + "learning_rate": 0.0002, + "loss": 2.5275, + "step": 28840 + }, + { + "epoch": 2.1497764530551415, + "grad_norm": 2.446051597595215, + "learning_rate": 0.0002, + "loss": 2.3525, + "step": 28850 + }, + { + "epoch": 2.150521609538003, + "grad_norm": 2.6848883628845215, + "learning_rate": 0.0002, + "loss": 2.5435, + "step": 28860 + }, + { + "epoch": 2.151266766020864, + "grad_norm": 2.80637788772583, + "learning_rate": 0.0002, + "loss": 2.3997, + "step": 28870 + }, + { + "epoch": 2.1520119225037257, + "grad_norm": 2.7611582279205322, + "learning_rate": 0.0002, + "loss": 2.4226, + "step": 28880 + }, + { + "epoch": 2.1527570789865873, + "grad_norm": 3.1131093502044678, + "learning_rate": 0.0002, + "loss": 2.2822, + "step": 28890 + }, + { + "epoch": 2.1535022354694484, + "grad_norm": 2.372504949569702, + "learning_rate": 0.0002, + "loss": 2.3701, + "step": 28900 + }, + { + "epoch": 2.15424739195231, + "grad_norm": 2.3762528896331787, + "learning_rate": 0.0002, + "loss": 2.4664, + "step": 28910 + }, + { + "epoch": 2.1549925484351715, + "grad_norm": 2.665152072906494, + "learning_rate": 0.0002, + "loss": 2.5005, + "step": 28920 + }, + { + "epoch": 2.1557377049180326, + "grad_norm": 2.8510541915893555, + "learning_rate": 0.0002, + "loss": 2.5503, + "step": 28930 + }, + { + "epoch": 2.156482861400894, + "grad_norm": 2.647361993789673, + "learning_rate": 0.0002, + "loss": 2.2592, + "step": 28940 + }, + { + "epoch": 2.1572280178837557, + "grad_norm": 2.626680612564087, + "learning_rate": 0.0002, + "loss": 2.4075, + "step": 28950 + }, + { + "epoch": 2.157973174366617, + "grad_norm": 3.47946834564209, + "learning_rate": 0.0002, + "loss": 2.5345, + "step": 28960 + }, + { + "epoch": 2.1587183308494784, + "grad_norm": 2.671481132507324, + "learning_rate": 0.0002, + "loss": 2.2957, + "step": 28970 + }, + { + "epoch": 2.15946348733234, + "grad_norm": 2.696821689605713, + "learning_rate": 0.0002, + "loss": 2.3831, + "step": 28980 + }, + { + "epoch": 2.160208643815201, + "grad_norm": 2.814481019973755, + "learning_rate": 0.0002, + "loss": 2.4499, + "step": 28990 + }, + { + "epoch": 2.1609538002980626, + "grad_norm": 2.933293342590332, + "learning_rate": 0.0002, + "loss": 2.3267, + "step": 29000 + }, + { + "epoch": 2.161698956780924, + "grad_norm": 2.5142786502838135, + "learning_rate": 0.0002, + "loss": 2.349, + "step": 29010 + }, + { + "epoch": 2.1624441132637853, + "grad_norm": 2.8400211334228516, + "learning_rate": 0.0002, + "loss": 2.4176, + "step": 29020 + }, + { + "epoch": 2.163189269746647, + "grad_norm": 2.3746659755706787, + "learning_rate": 0.0002, + "loss": 2.4822, + "step": 29030 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 2.615368604660034, + "learning_rate": 0.0002, + "loss": 2.4141, + "step": 29040 + }, + { + "epoch": 2.1646795827123695, + "grad_norm": 2.766857385635376, + "learning_rate": 0.0002, + "loss": 2.5287, + "step": 29050 + }, + { + "epoch": 2.165424739195231, + "grad_norm": 2.6013858318328857, + "learning_rate": 0.0002, + "loss": 2.5461, + "step": 29060 + }, + { + "epoch": 2.1661698956780926, + "grad_norm": 2.6237449645996094, + "learning_rate": 0.0002, + "loss": 2.4418, + "step": 29070 + }, + { + "epoch": 2.1669150521609537, + "grad_norm": 2.9323627948760986, + "learning_rate": 0.0002, + "loss": 2.504, + "step": 29080 + }, + { + "epoch": 2.1676602086438153, + "grad_norm": 2.950990915298462, + "learning_rate": 0.0002, + "loss": 2.2819, + "step": 29090 + }, + { + "epoch": 2.168405365126677, + "grad_norm": 2.595555305480957, + "learning_rate": 0.0002, + "loss": 2.4392, + "step": 29100 + }, + { + "epoch": 2.169150521609538, + "grad_norm": 2.9946279525756836, + "learning_rate": 0.0002, + "loss": 2.538, + "step": 29110 + }, + { + "epoch": 2.1698956780923995, + "grad_norm": 2.4822959899902344, + "learning_rate": 0.0002, + "loss": 2.329, + "step": 29120 + }, + { + "epoch": 2.170640834575261, + "grad_norm": 2.95805287361145, + "learning_rate": 0.0002, + "loss": 2.5216, + "step": 29130 + }, + { + "epoch": 2.171385991058122, + "grad_norm": 2.540740966796875, + "learning_rate": 0.0002, + "loss": 2.3522, + "step": 29140 + }, + { + "epoch": 2.1721311475409837, + "grad_norm": 2.614377737045288, + "learning_rate": 0.0002, + "loss": 2.4523, + "step": 29150 + }, + { + "epoch": 2.172876304023845, + "grad_norm": 2.5126142501831055, + "learning_rate": 0.0002, + "loss": 2.4256, + "step": 29160 + }, + { + "epoch": 2.1736214605067063, + "grad_norm": 2.8608033657073975, + "learning_rate": 0.0002, + "loss": 2.3527, + "step": 29170 + }, + { + "epoch": 2.174366616989568, + "grad_norm": 2.863196849822998, + "learning_rate": 0.0002, + "loss": 2.3969, + "step": 29180 + }, + { + "epoch": 2.175111773472429, + "grad_norm": 2.6187872886657715, + "learning_rate": 0.0002, + "loss": 2.569, + "step": 29190 + }, + { + "epoch": 2.1758569299552906, + "grad_norm": 2.520378351211548, + "learning_rate": 0.0002, + "loss": 2.4414, + "step": 29200 + }, + { + "epoch": 2.176602086438152, + "grad_norm": 2.438509464263916, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 29210 + }, + { + "epoch": 2.1773472429210132, + "grad_norm": 2.899704694747925, + "learning_rate": 0.0002, + "loss": 2.4476, + "step": 29220 + }, + { + "epoch": 2.178092399403875, + "grad_norm": 2.8411431312561035, + "learning_rate": 0.0002, + "loss": 2.3632, + "step": 29230 + }, + { + "epoch": 2.1788375558867363, + "grad_norm": 3.046539783477783, + "learning_rate": 0.0002, + "loss": 2.4188, + "step": 29240 + }, + { + "epoch": 2.1795827123695974, + "grad_norm": 2.8408050537109375, + "learning_rate": 0.0002, + "loss": 2.5058, + "step": 29250 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 1.9899271726608276, + "learning_rate": 0.0002, + "loss": 2.3903, + "step": 29260 + }, + { + "epoch": 2.1810730253353205, + "grad_norm": 3.0790789127349854, + "learning_rate": 0.0002, + "loss": 2.4835, + "step": 29270 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 2.649176836013794, + "learning_rate": 0.0002, + "loss": 2.381, + "step": 29280 + }, + { + "epoch": 2.182563338301043, + "grad_norm": 2.7816390991210938, + "learning_rate": 0.0002, + "loss": 2.5655, + "step": 29290 + }, + { + "epoch": 2.1833084947839048, + "grad_norm": 2.8856558799743652, + "learning_rate": 0.0002, + "loss": 2.4248, + "step": 29300 + }, + { + "epoch": 2.184053651266766, + "grad_norm": 2.5775249004364014, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 29310 + }, + { + "epoch": 2.1847988077496274, + "grad_norm": 2.680647134780884, + "learning_rate": 0.0002, + "loss": 2.4791, + "step": 29320 + }, + { + "epoch": 2.185543964232489, + "grad_norm": 2.7082343101501465, + "learning_rate": 0.0002, + "loss": 2.2716, + "step": 29330 + }, + { + "epoch": 2.18628912071535, + "grad_norm": 2.76863956451416, + "learning_rate": 0.0002, + "loss": 2.3834, + "step": 29340 + }, + { + "epoch": 2.1870342771982116, + "grad_norm": 2.652808666229248, + "learning_rate": 0.0002, + "loss": 2.5556, + "step": 29350 + }, + { + "epoch": 2.187779433681073, + "grad_norm": 2.7211995124816895, + "learning_rate": 0.0002, + "loss": 2.428, + "step": 29360 + }, + { + "epoch": 2.1885245901639343, + "grad_norm": 2.6517481803894043, + "learning_rate": 0.0002, + "loss": 2.3506, + "step": 29370 + }, + { + "epoch": 2.189269746646796, + "grad_norm": 2.680163621902466, + "learning_rate": 0.0002, + "loss": 2.3581, + "step": 29380 + }, + { + "epoch": 2.1900149031296574, + "grad_norm": 2.6171483993530273, + "learning_rate": 0.0002, + "loss": 2.3894, + "step": 29390 + }, + { + "epoch": 2.1907600596125185, + "grad_norm": 2.645461320877075, + "learning_rate": 0.0002, + "loss": 2.4044, + "step": 29400 + }, + { + "epoch": 2.19150521609538, + "grad_norm": 2.607429265975952, + "learning_rate": 0.0002, + "loss": 2.3481, + "step": 29410 + }, + { + "epoch": 2.1922503725782416, + "grad_norm": 2.634819269180298, + "learning_rate": 0.0002, + "loss": 2.3877, + "step": 29420 + }, + { + "epoch": 2.1929955290611027, + "grad_norm": 2.682586193084717, + "learning_rate": 0.0002, + "loss": 2.3752, + "step": 29430 + }, + { + "epoch": 2.1937406855439643, + "grad_norm": 2.5644547939300537, + "learning_rate": 0.0002, + "loss": 2.4439, + "step": 29440 + }, + { + "epoch": 2.194485842026826, + "grad_norm": 2.639521837234497, + "learning_rate": 0.0002, + "loss": 2.4411, + "step": 29450 + }, + { + "epoch": 2.195230998509687, + "grad_norm": 2.455165147781372, + "learning_rate": 0.0002, + "loss": 2.6128, + "step": 29460 + }, + { + "epoch": 2.1959761549925485, + "grad_norm": 2.800022602081299, + "learning_rate": 0.0002, + "loss": 2.4411, + "step": 29470 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 2.4787418842315674, + "learning_rate": 0.0002, + "loss": 2.4363, + "step": 29480 + }, + { + "epoch": 2.197466467958271, + "grad_norm": 2.7749505043029785, + "learning_rate": 0.0002, + "loss": 2.4775, + "step": 29490 + }, + { + "epoch": 2.1982116244411327, + "grad_norm": 2.472622871398926, + "learning_rate": 0.0002, + "loss": 2.3768, + "step": 29500 + }, + { + "epoch": 2.198956780923994, + "grad_norm": 2.560708999633789, + "learning_rate": 0.0002, + "loss": 2.4336, + "step": 29510 + }, + { + "epoch": 2.1997019374068554, + "grad_norm": 2.5658538341522217, + "learning_rate": 0.0002, + "loss": 2.389, + "step": 29520 + }, + { + "epoch": 2.200447093889717, + "grad_norm": 2.6552112102508545, + "learning_rate": 0.0002, + "loss": 2.1017, + "step": 29530 + }, + { + "epoch": 2.201192250372578, + "grad_norm": 2.23706316947937, + "learning_rate": 0.0002, + "loss": 2.4461, + "step": 29540 + }, + { + "epoch": 2.2019374068554396, + "grad_norm": 2.763774871826172, + "learning_rate": 0.0002, + "loss": 2.4351, + "step": 29550 + }, + { + "epoch": 2.202682563338301, + "grad_norm": 2.5139219760894775, + "learning_rate": 0.0002, + "loss": 2.351, + "step": 29560 + }, + { + "epoch": 2.2034277198211623, + "grad_norm": 2.893038272857666, + "learning_rate": 0.0002, + "loss": 2.4692, + "step": 29570 + }, + { + "epoch": 2.204172876304024, + "grad_norm": 2.818425178527832, + "learning_rate": 0.0002, + "loss": 2.4447, + "step": 29580 + }, + { + "epoch": 2.2049180327868854, + "grad_norm": 2.331808567047119, + "learning_rate": 0.0002, + "loss": 2.4306, + "step": 29590 + }, + { + "epoch": 2.2056631892697465, + "grad_norm": 3.2033214569091797, + "learning_rate": 0.0002, + "loss": 2.5642, + "step": 29600 + }, + { + "epoch": 2.206408345752608, + "grad_norm": 2.41534686088562, + "learning_rate": 0.0002, + "loss": 2.3699, + "step": 29610 + }, + { + "epoch": 2.2071535022354696, + "grad_norm": 2.8032174110412598, + "learning_rate": 0.0002, + "loss": 2.6087, + "step": 29620 + }, + { + "epoch": 2.2078986587183307, + "grad_norm": 2.4665184020996094, + "learning_rate": 0.0002, + "loss": 2.4702, + "step": 29630 + }, + { + "epoch": 2.2086438152011922, + "grad_norm": 2.474245309829712, + "learning_rate": 0.0002, + "loss": 2.392, + "step": 29640 + }, + { + "epoch": 2.209388971684054, + "grad_norm": 2.9560275077819824, + "learning_rate": 0.0002, + "loss": 2.5555, + "step": 29650 + }, + { + "epoch": 2.210134128166915, + "grad_norm": 2.4646859169006348, + "learning_rate": 0.0002, + "loss": 2.3858, + "step": 29660 + }, + { + "epoch": 2.2108792846497765, + "grad_norm": 2.8833131790161133, + "learning_rate": 0.0002, + "loss": 2.3451, + "step": 29670 + }, + { + "epoch": 2.211624441132638, + "grad_norm": 2.540769577026367, + "learning_rate": 0.0002, + "loss": 2.256, + "step": 29680 + }, + { + "epoch": 2.212369597615499, + "grad_norm": 3.0184593200683594, + "learning_rate": 0.0002, + "loss": 2.4306, + "step": 29690 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 2.6296427249908447, + "learning_rate": 0.0002, + "loss": 2.5813, + "step": 29700 + }, + { + "epoch": 2.2138599105812222, + "grad_norm": 2.667278528213501, + "learning_rate": 0.0002, + "loss": 2.4874, + "step": 29710 + }, + { + "epoch": 2.2146050670640833, + "grad_norm": 2.660968065261841, + "learning_rate": 0.0002, + "loss": 2.3787, + "step": 29720 + }, + { + "epoch": 2.215350223546945, + "grad_norm": 2.5494511127471924, + "learning_rate": 0.0002, + "loss": 2.4766, + "step": 29730 + }, + { + "epoch": 2.2160953800298064, + "grad_norm": 2.503387928009033, + "learning_rate": 0.0002, + "loss": 2.4909, + "step": 29740 + }, + { + "epoch": 2.2168405365126675, + "grad_norm": 2.6121363639831543, + "learning_rate": 0.0002, + "loss": 2.5237, + "step": 29750 + }, + { + "epoch": 2.217585692995529, + "grad_norm": 2.4190587997436523, + "learning_rate": 0.0002, + "loss": 2.3553, + "step": 29760 + }, + { + "epoch": 2.2183308494783907, + "grad_norm": 2.8600144386291504, + "learning_rate": 0.0002, + "loss": 2.4106, + "step": 29770 + }, + { + "epoch": 2.2190760059612518, + "grad_norm": 3.615830898284912, + "learning_rate": 0.0002, + "loss": 2.5998, + "step": 29780 + }, + { + "epoch": 2.2198211624441133, + "grad_norm": 3.1991500854492188, + "learning_rate": 0.0002, + "loss": 2.6738, + "step": 29790 + }, + { + "epoch": 2.220566318926975, + "grad_norm": 2.7967700958251953, + "learning_rate": 0.0002, + "loss": 2.4236, + "step": 29800 + }, + { + "epoch": 2.221311475409836, + "grad_norm": 2.68129301071167, + "learning_rate": 0.0002, + "loss": 2.4228, + "step": 29810 + }, + { + "epoch": 2.2220566318926975, + "grad_norm": 2.7020351886749268, + "learning_rate": 0.0002, + "loss": 2.3199, + "step": 29820 + }, + { + "epoch": 2.222801788375559, + "grad_norm": 2.417415142059326, + "learning_rate": 0.0002, + "loss": 2.344, + "step": 29830 + }, + { + "epoch": 2.22354694485842, + "grad_norm": 2.5789639949798584, + "learning_rate": 0.0002, + "loss": 2.3901, + "step": 29840 + }, + { + "epoch": 2.2242921013412817, + "grad_norm": 2.623487710952759, + "learning_rate": 0.0002, + "loss": 2.3944, + "step": 29850 + }, + { + "epoch": 2.225037257824143, + "grad_norm": 2.554654598236084, + "learning_rate": 0.0002, + "loss": 2.3396, + "step": 29860 + }, + { + "epoch": 2.2257824143070044, + "grad_norm": 2.624058485031128, + "learning_rate": 0.0002, + "loss": 2.222, + "step": 29870 + }, + { + "epoch": 2.226527570789866, + "grad_norm": 2.9472296237945557, + "learning_rate": 0.0002, + "loss": 2.5653, + "step": 29880 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 2.7747509479522705, + "learning_rate": 0.0002, + "loss": 2.3922, + "step": 29890 + }, + { + "epoch": 2.2280178837555886, + "grad_norm": 2.717242956161499, + "learning_rate": 0.0002, + "loss": 2.3198, + "step": 29900 + }, + { + "epoch": 2.22876304023845, + "grad_norm": 2.829017400741577, + "learning_rate": 0.0002, + "loss": 2.2318, + "step": 29910 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 2.65274715423584, + "learning_rate": 0.0002, + "loss": 2.3987, + "step": 29920 + }, + { + "epoch": 2.230253353204173, + "grad_norm": 2.738203287124634, + "learning_rate": 0.0002, + "loss": 2.2976, + "step": 29930 + }, + { + "epoch": 2.2309985096870344, + "grad_norm": 2.4520084857940674, + "learning_rate": 0.0002, + "loss": 2.5559, + "step": 29940 + }, + { + "epoch": 2.2317436661698955, + "grad_norm": 2.829719066619873, + "learning_rate": 0.0002, + "loss": 2.3981, + "step": 29950 + }, + { + "epoch": 2.232488822652757, + "grad_norm": 2.8684730529785156, + "learning_rate": 0.0002, + "loss": 2.4195, + "step": 29960 + }, + { + "epoch": 2.2332339791356186, + "grad_norm": 2.597276449203491, + "learning_rate": 0.0002, + "loss": 2.4955, + "step": 29970 + }, + { + "epoch": 2.2339791356184797, + "grad_norm": 2.678597927093506, + "learning_rate": 0.0002, + "loss": 2.3952, + "step": 29980 + }, + { + "epoch": 2.2347242921013413, + "grad_norm": 2.6332530975341797, + "learning_rate": 0.0002, + "loss": 2.6308, + "step": 29990 + }, + { + "epoch": 2.235469448584203, + "grad_norm": 2.6793925762176514, + "learning_rate": 0.0002, + "loss": 2.3281, + "step": 30000 + }, + { + "epoch": 2.236214605067064, + "grad_norm": 2.714738607406616, + "learning_rate": 0.0002, + "loss": 2.4091, + "step": 30010 + }, + { + "epoch": 2.2369597615499255, + "grad_norm": 2.5853700637817383, + "learning_rate": 0.0002, + "loss": 2.4243, + "step": 30020 + }, + { + "epoch": 2.237704918032787, + "grad_norm": 2.972341775894165, + "learning_rate": 0.0002, + "loss": 2.4984, + "step": 30030 + }, + { + "epoch": 2.238450074515648, + "grad_norm": 2.5449442863464355, + "learning_rate": 0.0002, + "loss": 2.4433, + "step": 30040 + }, + { + "epoch": 2.2391952309985097, + "grad_norm": 2.6521236896514893, + "learning_rate": 0.0002, + "loss": 2.5521, + "step": 30050 + }, + { + "epoch": 2.2399403874813713, + "grad_norm": 2.510643482208252, + "learning_rate": 0.0002, + "loss": 2.5315, + "step": 30060 + }, + { + "epoch": 2.2406855439642324, + "grad_norm": 2.896329641342163, + "learning_rate": 0.0002, + "loss": 2.4374, + "step": 30070 + }, + { + "epoch": 2.241430700447094, + "grad_norm": 2.6130764484405518, + "learning_rate": 0.0002, + "loss": 2.4338, + "step": 30080 + }, + { + "epoch": 2.2421758569299555, + "grad_norm": 2.5632150173187256, + "learning_rate": 0.0002, + "loss": 2.4224, + "step": 30090 + }, + { + "epoch": 2.2429210134128166, + "grad_norm": 2.6753768920898438, + "learning_rate": 0.0002, + "loss": 2.4606, + "step": 30100 + }, + { + "epoch": 2.243666169895678, + "grad_norm": 2.60931396484375, + "learning_rate": 0.0002, + "loss": 2.4416, + "step": 30110 + }, + { + "epoch": 2.2444113263785397, + "grad_norm": 2.428415536880493, + "learning_rate": 0.0002, + "loss": 2.596, + "step": 30120 + }, + { + "epoch": 2.245156482861401, + "grad_norm": 2.97552752494812, + "learning_rate": 0.0002, + "loss": 2.6441, + "step": 30130 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 2.513252019882202, + "learning_rate": 0.0002, + "loss": 2.5045, + "step": 30140 + }, + { + "epoch": 2.246646795827124, + "grad_norm": 2.529350996017456, + "learning_rate": 0.0002, + "loss": 2.2703, + "step": 30150 + }, + { + "epoch": 2.247391952309985, + "grad_norm": 2.729717254638672, + "learning_rate": 0.0002, + "loss": 2.5772, + "step": 30160 + }, + { + "epoch": 2.2481371087928466, + "grad_norm": 2.8661959171295166, + "learning_rate": 0.0002, + "loss": 2.5031, + "step": 30170 + }, + { + "epoch": 2.248882265275708, + "grad_norm": 2.6999034881591797, + "learning_rate": 0.0002, + "loss": 2.4389, + "step": 30180 + }, + { + "epoch": 2.2496274217585692, + "grad_norm": 3.331155300140381, + "learning_rate": 0.0002, + "loss": 2.5102, + "step": 30190 + }, + { + "epoch": 2.2503725782414308, + "grad_norm": 2.8043971061706543, + "learning_rate": 0.0002, + "loss": 2.3276, + "step": 30200 + }, + { + "epoch": 2.251117734724292, + "grad_norm": 2.57314133644104, + "learning_rate": 0.0002, + "loss": 2.3967, + "step": 30210 + }, + { + "epoch": 2.2518628912071534, + "grad_norm": 2.738511085510254, + "learning_rate": 0.0002, + "loss": 2.5336, + "step": 30220 + }, + { + "epoch": 2.252608047690015, + "grad_norm": 2.7028000354766846, + "learning_rate": 0.0002, + "loss": 2.4197, + "step": 30230 + }, + { + "epoch": 2.2533532041728765, + "grad_norm": 1.9753464460372925, + "learning_rate": 0.0002, + "loss": 2.2315, + "step": 30240 + }, + { + "epoch": 2.2540983606557377, + "grad_norm": 2.5343260765075684, + "learning_rate": 0.0002, + "loss": 2.656, + "step": 30250 + }, + { + "epoch": 2.254843517138599, + "grad_norm": 3.3517582416534424, + "learning_rate": 0.0002, + "loss": 2.5387, + "step": 30260 + }, + { + "epoch": 2.2555886736214603, + "grad_norm": 2.795591115951538, + "learning_rate": 0.0002, + "loss": 2.428, + "step": 30270 + }, + { + "epoch": 2.256333830104322, + "grad_norm": 2.458268404006958, + "learning_rate": 0.0002, + "loss": 2.6109, + "step": 30280 + }, + { + "epoch": 2.2570789865871834, + "grad_norm": 2.965975761413574, + "learning_rate": 0.0002, + "loss": 2.265, + "step": 30290 + }, + { + "epoch": 2.2578241430700445, + "grad_norm": 2.6871607303619385, + "learning_rate": 0.0002, + "loss": 2.3368, + "step": 30300 + }, + { + "epoch": 2.258569299552906, + "grad_norm": 2.760650157928467, + "learning_rate": 0.0002, + "loss": 2.4479, + "step": 30310 + }, + { + "epoch": 2.2593144560357676, + "grad_norm": 2.5536069869995117, + "learning_rate": 0.0002, + "loss": 2.3555, + "step": 30320 + }, + { + "epoch": 2.2600596125186287, + "grad_norm": 2.6912643909454346, + "learning_rate": 0.0002, + "loss": 2.4761, + "step": 30330 + }, + { + "epoch": 2.2608047690014903, + "grad_norm": 2.6149418354034424, + "learning_rate": 0.0002, + "loss": 2.3776, + "step": 30340 + }, + { + "epoch": 2.261549925484352, + "grad_norm": 2.6882479190826416, + "learning_rate": 0.0002, + "loss": 2.4329, + "step": 30350 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 2.7375595569610596, + "learning_rate": 0.0002, + "loss": 2.4161, + "step": 30360 + }, + { + "epoch": 2.2630402384500745, + "grad_norm": 2.4539854526519775, + "learning_rate": 0.0002, + "loss": 2.4432, + "step": 30370 + }, + { + "epoch": 2.263785394932936, + "grad_norm": 2.7913661003112793, + "learning_rate": 0.0002, + "loss": 2.6048, + "step": 30380 + }, + { + "epoch": 2.264530551415797, + "grad_norm": 2.822239875793457, + "learning_rate": 0.0002, + "loss": 2.6286, + "step": 30390 + }, + { + "epoch": 2.2652757078986587, + "grad_norm": 2.82560658454895, + "learning_rate": 0.0002, + "loss": 2.2957, + "step": 30400 + }, + { + "epoch": 2.2660208643815203, + "grad_norm": 2.30305814743042, + "learning_rate": 0.0002, + "loss": 2.5297, + "step": 30410 + }, + { + "epoch": 2.2667660208643814, + "grad_norm": 2.366816520690918, + "learning_rate": 0.0002, + "loss": 2.3713, + "step": 30420 + }, + { + "epoch": 2.267511177347243, + "grad_norm": 2.849090814590454, + "learning_rate": 0.0002, + "loss": 2.2744, + "step": 30430 + }, + { + "epoch": 2.2682563338301045, + "grad_norm": 2.4802587032318115, + "learning_rate": 0.0002, + "loss": 2.2494, + "step": 30440 + }, + { + "epoch": 2.2690014903129656, + "grad_norm": 2.9670379161834717, + "learning_rate": 0.0002, + "loss": 2.5435, + "step": 30450 + }, + { + "epoch": 2.269746646795827, + "grad_norm": 2.596202850341797, + "learning_rate": 0.0002, + "loss": 2.428, + "step": 30460 + }, + { + "epoch": 2.2704918032786887, + "grad_norm": 2.6651835441589355, + "learning_rate": 0.0002, + "loss": 2.3748, + "step": 30470 + }, + { + "epoch": 2.27123695976155, + "grad_norm": 2.2997448444366455, + "learning_rate": 0.0002, + "loss": 2.5528, + "step": 30480 + }, + { + "epoch": 2.2719821162444114, + "grad_norm": 2.415158271789551, + "learning_rate": 0.0002, + "loss": 2.3274, + "step": 30490 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 2.361969470977783, + "learning_rate": 0.0002, + "loss": 2.5814, + "step": 30500 + }, + { + "epoch": 2.273472429210134, + "grad_norm": 2.340508222579956, + "learning_rate": 0.0002, + "loss": 2.2694, + "step": 30510 + }, + { + "epoch": 2.2742175856929956, + "grad_norm": 2.5094423294067383, + "learning_rate": 0.0002, + "loss": 2.4632, + "step": 30520 + }, + { + "epoch": 2.274962742175857, + "grad_norm": 2.8565590381622314, + "learning_rate": 0.0002, + "loss": 2.2868, + "step": 30530 + }, + { + "epoch": 2.2757078986587183, + "grad_norm": 2.701770067214966, + "learning_rate": 0.0002, + "loss": 2.4865, + "step": 30540 + }, + { + "epoch": 2.27645305514158, + "grad_norm": 2.518066644668579, + "learning_rate": 0.0002, + "loss": 2.5408, + "step": 30550 + }, + { + "epoch": 2.277198211624441, + "grad_norm": 2.743431806564331, + "learning_rate": 0.0002, + "loss": 2.4585, + "step": 30560 + }, + { + "epoch": 2.2779433681073025, + "grad_norm": 2.8521339893341064, + "learning_rate": 0.0002, + "loss": 2.3772, + "step": 30570 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 2.967759370803833, + "learning_rate": 0.0002, + "loss": 2.5984, + "step": 30580 + }, + { + "epoch": 2.2794336810730256, + "grad_norm": 2.4297335147857666, + "learning_rate": 0.0002, + "loss": 2.4854, + "step": 30590 + }, + { + "epoch": 2.2801788375558867, + "grad_norm": 2.547301769256592, + "learning_rate": 0.0002, + "loss": 2.478, + "step": 30600 + }, + { + "epoch": 2.2809239940387482, + "grad_norm": 2.791649341583252, + "learning_rate": 0.0002, + "loss": 2.5139, + "step": 30610 + }, + { + "epoch": 2.2816691505216093, + "grad_norm": 2.7930760383605957, + "learning_rate": 0.0002, + "loss": 2.5727, + "step": 30620 + }, + { + "epoch": 2.282414307004471, + "grad_norm": 2.5463852882385254, + "learning_rate": 0.0002, + "loss": 2.5221, + "step": 30630 + }, + { + "epoch": 2.2831594634873325, + "grad_norm": 2.653203248977661, + "learning_rate": 0.0002, + "loss": 2.3023, + "step": 30640 + }, + { + "epoch": 2.2839046199701936, + "grad_norm": 3.0527825355529785, + "learning_rate": 0.0002, + "loss": 2.2436, + "step": 30650 + }, + { + "epoch": 2.284649776453055, + "grad_norm": 2.5038249492645264, + "learning_rate": 0.0002, + "loss": 2.5085, + "step": 30660 + }, + { + "epoch": 2.2853949329359167, + "grad_norm": 2.571519374847412, + "learning_rate": 0.0002, + "loss": 2.2828, + "step": 30670 + }, + { + "epoch": 2.2861400894187778, + "grad_norm": 2.742832660675049, + "learning_rate": 0.0002, + "loss": 2.4795, + "step": 30680 + }, + { + "epoch": 2.2868852459016393, + "grad_norm": 2.4409167766571045, + "learning_rate": 0.0002, + "loss": 2.4449, + "step": 30690 + }, + { + "epoch": 2.287630402384501, + "grad_norm": 2.7374167442321777, + "learning_rate": 0.0002, + "loss": 2.2682, + "step": 30700 + }, + { + "epoch": 2.288375558867362, + "grad_norm": 2.6040146350860596, + "learning_rate": 0.0002, + "loss": 2.4203, + "step": 30710 + }, + { + "epoch": 2.2891207153502235, + "grad_norm": 2.70377779006958, + "learning_rate": 0.0002, + "loss": 2.3425, + "step": 30720 + }, + { + "epoch": 2.289865871833085, + "grad_norm": 2.382824420928955, + "learning_rate": 0.0002, + "loss": 2.3675, + "step": 30730 + }, + { + "epoch": 2.290611028315946, + "grad_norm": 2.6663031578063965, + "learning_rate": 0.0002, + "loss": 2.5198, + "step": 30740 + }, + { + "epoch": 2.2913561847988078, + "grad_norm": 2.4763283729553223, + "learning_rate": 0.0002, + "loss": 2.3904, + "step": 30750 + }, + { + "epoch": 2.2921013412816693, + "grad_norm": 2.601113796234131, + "learning_rate": 0.0002, + "loss": 2.5542, + "step": 30760 + }, + { + "epoch": 2.2928464977645304, + "grad_norm": 2.841099739074707, + "learning_rate": 0.0002, + "loss": 2.5474, + "step": 30770 + }, + { + "epoch": 2.293591654247392, + "grad_norm": 2.850001573562622, + "learning_rate": 0.0002, + "loss": 2.4976, + "step": 30780 + }, + { + "epoch": 2.2943368107302535, + "grad_norm": 2.646151304244995, + "learning_rate": 0.0002, + "loss": 2.4373, + "step": 30790 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 2.71795916557312, + "learning_rate": 0.0002, + "loss": 2.3082, + "step": 30800 + }, + { + "epoch": 2.295827123695976, + "grad_norm": 2.5677895545959473, + "learning_rate": 0.0002, + "loss": 2.3645, + "step": 30810 + }, + { + "epoch": 2.2965722801788377, + "grad_norm": 2.6931703090667725, + "learning_rate": 0.0002, + "loss": 2.5397, + "step": 30820 + }, + { + "epoch": 2.297317436661699, + "grad_norm": 2.990161418914795, + "learning_rate": 0.0002, + "loss": 2.5421, + "step": 30830 + }, + { + "epoch": 2.2980625931445604, + "grad_norm": 2.717191696166992, + "learning_rate": 0.0002, + "loss": 2.4656, + "step": 30840 + }, + { + "epoch": 2.2988077496274215, + "grad_norm": 3.0563786029815674, + "learning_rate": 0.0002, + "loss": 2.5854, + "step": 30850 + }, + { + "epoch": 2.299552906110283, + "grad_norm": 2.8930904865264893, + "learning_rate": 0.0002, + "loss": 2.3787, + "step": 30860 + }, + { + "epoch": 2.3002980625931446, + "grad_norm": 2.5913820266723633, + "learning_rate": 0.0002, + "loss": 2.3729, + "step": 30870 + }, + { + "epoch": 2.301043219076006, + "grad_norm": 2.668519973754883, + "learning_rate": 0.0002, + "loss": 2.2982, + "step": 30880 + }, + { + "epoch": 2.3017883755588673, + "grad_norm": 2.6482491493225098, + "learning_rate": 0.0002, + "loss": 2.4505, + "step": 30890 + }, + { + "epoch": 2.302533532041729, + "grad_norm": 2.7581119537353516, + "learning_rate": 0.0002, + "loss": 2.5304, + "step": 30900 + }, + { + "epoch": 2.30327868852459, + "grad_norm": 2.807406425476074, + "learning_rate": 0.0002, + "loss": 2.6199, + "step": 30910 + }, + { + "epoch": 2.3040238450074515, + "grad_norm": 2.669703245162964, + "learning_rate": 0.0002, + "loss": 2.3566, + "step": 30920 + }, + { + "epoch": 2.304769001490313, + "grad_norm": 2.4919867515563965, + "learning_rate": 0.0002, + "loss": 2.4593, + "step": 30930 + }, + { + "epoch": 2.3055141579731746, + "grad_norm": 2.781989574432373, + "learning_rate": 0.0002, + "loss": 2.3244, + "step": 30940 + }, + { + "epoch": 2.3062593144560357, + "grad_norm": 2.7712695598602295, + "learning_rate": 0.0002, + "loss": 2.3854, + "step": 30950 + }, + { + "epoch": 2.3070044709388973, + "grad_norm": 2.8249754905700684, + "learning_rate": 0.0002, + "loss": 2.6018, + "step": 30960 + }, + { + "epoch": 2.3077496274217584, + "grad_norm": 2.443105459213257, + "learning_rate": 0.0002, + "loss": 2.2767, + "step": 30970 + }, + { + "epoch": 2.30849478390462, + "grad_norm": 2.589513063430786, + "learning_rate": 0.0002, + "loss": 2.5455, + "step": 30980 + }, + { + "epoch": 2.3092399403874815, + "grad_norm": 2.7437689304351807, + "learning_rate": 0.0002, + "loss": 2.7474, + "step": 30990 + }, + { + "epoch": 2.3099850968703426, + "grad_norm": 2.4255599975585938, + "learning_rate": 0.0002, + "loss": 2.3217, + "step": 31000 + }, + { + "epoch": 2.310730253353204, + "grad_norm": 2.56787371635437, + "learning_rate": 0.0002, + "loss": 2.3108, + "step": 31010 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 2.7535786628723145, + "learning_rate": 0.0002, + "loss": 2.4737, + "step": 31020 + }, + { + "epoch": 2.312220566318927, + "grad_norm": 2.5729820728302, + "learning_rate": 0.0002, + "loss": 2.3027, + "step": 31030 + }, + { + "epoch": 2.3129657228017884, + "grad_norm": 2.6444244384765625, + "learning_rate": 0.0002, + "loss": 2.4639, + "step": 31040 + }, + { + "epoch": 2.31371087928465, + "grad_norm": 2.6988863945007324, + "learning_rate": 0.0002, + "loss": 2.4188, + "step": 31050 + }, + { + "epoch": 2.314456035767511, + "grad_norm": 3.3972041606903076, + "learning_rate": 0.0002, + "loss": 2.639, + "step": 31060 + }, + { + "epoch": 2.3152011922503726, + "grad_norm": 2.741131544113159, + "learning_rate": 0.0002, + "loss": 2.484, + "step": 31070 + }, + { + "epoch": 2.315946348733234, + "grad_norm": 2.761235475540161, + "learning_rate": 0.0002, + "loss": 2.4808, + "step": 31080 + }, + { + "epoch": 2.3166915052160952, + "grad_norm": 2.5519707202911377, + "learning_rate": 0.0002, + "loss": 2.5373, + "step": 31090 + }, + { + "epoch": 2.317436661698957, + "grad_norm": 2.4959230422973633, + "learning_rate": 0.0002, + "loss": 2.5488, + "step": 31100 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 2.7000091075897217, + "learning_rate": 0.0002, + "loss": 2.2065, + "step": 31110 + }, + { + "epoch": 2.3189269746646795, + "grad_norm": 2.97749662399292, + "learning_rate": 0.0002, + "loss": 2.3805, + "step": 31120 + }, + { + "epoch": 2.319672131147541, + "grad_norm": 2.6675100326538086, + "learning_rate": 0.0002, + "loss": 2.5932, + "step": 31130 + }, + { + "epoch": 2.3204172876304026, + "grad_norm": 2.4122798442840576, + "learning_rate": 0.0002, + "loss": 2.47, + "step": 31140 + }, + { + "epoch": 2.3211624441132637, + "grad_norm": 2.4250221252441406, + "learning_rate": 0.0002, + "loss": 2.3444, + "step": 31150 + }, + { + "epoch": 2.321907600596125, + "grad_norm": 2.625974178314209, + "learning_rate": 0.0002, + "loss": 2.6722, + "step": 31160 + }, + { + "epoch": 2.3226527570789868, + "grad_norm": 2.5637755393981934, + "learning_rate": 0.0002, + "loss": 2.4901, + "step": 31170 + }, + { + "epoch": 2.323397913561848, + "grad_norm": 2.491208076477051, + "learning_rate": 0.0002, + "loss": 2.3771, + "step": 31180 + }, + { + "epoch": 2.3241430700447094, + "grad_norm": 2.751556634902954, + "learning_rate": 0.0002, + "loss": 2.3967, + "step": 31190 + }, + { + "epoch": 2.3248882265275705, + "grad_norm": 2.622948169708252, + "learning_rate": 0.0002, + "loss": 2.4714, + "step": 31200 + }, + { + "epoch": 2.325633383010432, + "grad_norm": 2.500760078430176, + "learning_rate": 0.0002, + "loss": 2.4578, + "step": 31210 + }, + { + "epoch": 2.3263785394932937, + "grad_norm": 2.7233376502990723, + "learning_rate": 0.0002, + "loss": 2.5258, + "step": 31220 + }, + { + "epoch": 2.327123695976155, + "grad_norm": 2.32476806640625, + "learning_rate": 0.0002, + "loss": 2.431, + "step": 31230 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 2.283064603805542, + "learning_rate": 0.0002, + "loss": 2.2992, + "step": 31240 + }, + { + "epoch": 2.328614008941878, + "grad_norm": 2.767536163330078, + "learning_rate": 0.0002, + "loss": 2.4628, + "step": 31250 + }, + { + "epoch": 2.329359165424739, + "grad_norm": 3.01369047164917, + "learning_rate": 0.0002, + "loss": 2.3962, + "step": 31260 + }, + { + "epoch": 2.3301043219076005, + "grad_norm": 2.545297622680664, + "learning_rate": 0.0002, + "loss": 2.5169, + "step": 31270 + }, + { + "epoch": 2.330849478390462, + "grad_norm": 2.610537528991699, + "learning_rate": 0.0002, + "loss": 2.5468, + "step": 31280 + }, + { + "epoch": 2.3315946348733236, + "grad_norm": 2.7675936222076416, + "learning_rate": 0.0002, + "loss": 2.2562, + "step": 31290 + }, + { + "epoch": 2.3323397913561847, + "grad_norm": 2.5579588413238525, + "learning_rate": 0.0002, + "loss": 2.5384, + "step": 31300 + }, + { + "epoch": 2.3330849478390463, + "grad_norm": 2.905510902404785, + "learning_rate": 0.0002, + "loss": 2.3248, + "step": 31310 + }, + { + "epoch": 2.3338301043219074, + "grad_norm": 2.6101415157318115, + "learning_rate": 0.0002, + "loss": 2.5987, + "step": 31320 + }, + { + "epoch": 2.334575260804769, + "grad_norm": 2.5008299350738525, + "learning_rate": 0.0002, + "loss": 2.2796, + "step": 31330 + }, + { + "epoch": 2.3353204172876305, + "grad_norm": 2.7054800987243652, + "learning_rate": 0.0002, + "loss": 2.2972, + "step": 31340 + }, + { + "epoch": 2.3360655737704916, + "grad_norm": 2.9480528831481934, + "learning_rate": 0.0002, + "loss": 2.624, + "step": 31350 + }, + { + "epoch": 2.336810730253353, + "grad_norm": 2.9812073707580566, + "learning_rate": 0.0002, + "loss": 2.5909, + "step": 31360 + }, + { + "epoch": 2.3375558867362147, + "grad_norm": 2.557450294494629, + "learning_rate": 0.0002, + "loss": 2.3268, + "step": 31370 + }, + { + "epoch": 2.338301043219076, + "grad_norm": 2.770230770111084, + "learning_rate": 0.0002, + "loss": 2.4681, + "step": 31380 + }, + { + "epoch": 2.3390461997019374, + "grad_norm": 2.71527099609375, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 31390 + }, + { + "epoch": 2.339791356184799, + "grad_norm": 2.635856866836548, + "learning_rate": 0.0002, + "loss": 2.4766, + "step": 31400 + }, + { + "epoch": 2.34053651266766, + "grad_norm": 2.7687203884124756, + "learning_rate": 0.0002, + "loss": 2.4795, + "step": 31410 + }, + { + "epoch": 2.3412816691505216, + "grad_norm": 2.6635115146636963, + "learning_rate": 0.0002, + "loss": 2.3895, + "step": 31420 + }, + { + "epoch": 2.342026825633383, + "grad_norm": 2.6172454357147217, + "learning_rate": 0.0002, + "loss": 2.7309, + "step": 31430 + }, + { + "epoch": 2.3427719821162443, + "grad_norm": 3.0446231365203857, + "learning_rate": 0.0002, + "loss": 2.3944, + "step": 31440 + }, + { + "epoch": 2.343517138599106, + "grad_norm": 2.6833817958831787, + "learning_rate": 0.0002, + "loss": 2.412, + "step": 31450 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 2.333240509033203, + "learning_rate": 0.0002, + "loss": 2.5243, + "step": 31460 + }, + { + "epoch": 2.3450074515648285, + "grad_norm": 2.985318422317505, + "learning_rate": 0.0002, + "loss": 2.4594, + "step": 31470 + }, + { + "epoch": 2.34575260804769, + "grad_norm": 2.4295361042022705, + "learning_rate": 0.0002, + "loss": 2.3086, + "step": 31480 + }, + { + "epoch": 2.3464977645305516, + "grad_norm": 2.663954734802246, + "learning_rate": 0.0002, + "loss": 2.5138, + "step": 31490 + }, + { + "epoch": 2.3472429210134127, + "grad_norm": 2.842383861541748, + "learning_rate": 0.0002, + "loss": 2.649, + "step": 31500 + }, + { + "epoch": 2.3479880774962743, + "grad_norm": 2.4584267139434814, + "learning_rate": 0.0002, + "loss": 2.2672, + "step": 31510 + }, + { + "epoch": 2.348733233979136, + "grad_norm": 2.6671886444091797, + "learning_rate": 0.0002, + "loss": 2.3535, + "step": 31520 + }, + { + "epoch": 2.349478390461997, + "grad_norm": 2.6735708713531494, + "learning_rate": 0.0002, + "loss": 2.4926, + "step": 31530 + }, + { + "epoch": 2.3502235469448585, + "grad_norm": 2.579631805419922, + "learning_rate": 0.0002, + "loss": 2.1678, + "step": 31540 + }, + { + "epoch": 2.3509687034277196, + "grad_norm": 2.5396575927734375, + "learning_rate": 0.0002, + "loss": 2.4117, + "step": 31550 + }, + { + "epoch": 2.351713859910581, + "grad_norm": 2.7359862327575684, + "learning_rate": 0.0002, + "loss": 2.459, + "step": 31560 + }, + { + "epoch": 2.3524590163934427, + "grad_norm": 2.8085885047912598, + "learning_rate": 0.0002, + "loss": 2.3537, + "step": 31570 + }, + { + "epoch": 2.3532041728763042, + "grad_norm": 3.020338773727417, + "learning_rate": 0.0002, + "loss": 2.5794, + "step": 31580 + }, + { + "epoch": 2.3539493293591653, + "grad_norm": 2.755391836166382, + "learning_rate": 0.0002, + "loss": 2.3765, + "step": 31590 + }, + { + "epoch": 2.354694485842027, + "grad_norm": 2.8029239177703857, + "learning_rate": 0.0002, + "loss": 2.3851, + "step": 31600 + }, + { + "epoch": 2.355439642324888, + "grad_norm": 2.442831516265869, + "learning_rate": 0.0002, + "loss": 2.426, + "step": 31610 + }, + { + "epoch": 2.3561847988077496, + "grad_norm": 2.6783061027526855, + "learning_rate": 0.0002, + "loss": 2.5379, + "step": 31620 + }, + { + "epoch": 2.356929955290611, + "grad_norm": 2.8545401096343994, + "learning_rate": 0.0002, + "loss": 2.3918, + "step": 31630 + }, + { + "epoch": 2.3576751117734727, + "grad_norm": 2.6297447681427, + "learning_rate": 0.0002, + "loss": 2.5717, + "step": 31640 + }, + { + "epoch": 2.3584202682563338, + "grad_norm": 2.3793184757232666, + "learning_rate": 0.0002, + "loss": 2.4338, + "step": 31650 + }, + { + "epoch": 2.3591654247391953, + "grad_norm": 2.4211175441741943, + "learning_rate": 0.0002, + "loss": 2.5335, + "step": 31660 + }, + { + "epoch": 2.3599105812220564, + "grad_norm": 2.7338030338287354, + "learning_rate": 0.0002, + "loss": 2.4089, + "step": 31670 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 2.5373005867004395, + "learning_rate": 0.0002, + "loss": 2.5354, + "step": 31680 + }, + { + "epoch": 2.3614008941877795, + "grad_norm": 2.932400703430176, + "learning_rate": 0.0002, + "loss": 2.4069, + "step": 31690 + }, + { + "epoch": 2.3621460506706407, + "grad_norm": 2.6089446544647217, + "learning_rate": 0.0002, + "loss": 2.4368, + "step": 31700 + }, + { + "epoch": 2.362891207153502, + "grad_norm": 2.8511080741882324, + "learning_rate": 0.0002, + "loss": 2.3916, + "step": 31710 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 3.1267642974853516, + "learning_rate": 0.0002, + "loss": 2.5248, + "step": 31720 + }, + { + "epoch": 2.364381520119225, + "grad_norm": 2.741081953048706, + "learning_rate": 0.0002, + "loss": 2.3826, + "step": 31730 + }, + { + "epoch": 2.3651266766020864, + "grad_norm": 2.7541043758392334, + "learning_rate": 0.0002, + "loss": 2.4853, + "step": 31740 + }, + { + "epoch": 2.365871833084948, + "grad_norm": 2.6684322357177734, + "learning_rate": 0.0002, + "loss": 2.2149, + "step": 31750 + }, + { + "epoch": 2.366616989567809, + "grad_norm": 2.5884592533111572, + "learning_rate": 0.0002, + "loss": 2.5272, + "step": 31760 + }, + { + "epoch": 2.3673621460506706, + "grad_norm": 2.832197666168213, + "learning_rate": 0.0002, + "loss": 2.548, + "step": 31770 + }, + { + "epoch": 2.368107302533532, + "grad_norm": 2.574288845062256, + "learning_rate": 0.0002, + "loss": 2.333, + "step": 31780 + }, + { + "epoch": 2.3688524590163933, + "grad_norm": 2.654503107070923, + "learning_rate": 0.0002, + "loss": 2.4165, + "step": 31790 + }, + { + "epoch": 2.369597615499255, + "grad_norm": 2.2965807914733887, + "learning_rate": 0.0002, + "loss": 2.5123, + "step": 31800 + }, + { + "epoch": 2.3703427719821164, + "grad_norm": 2.757343292236328, + "learning_rate": 0.0002, + "loss": 2.5358, + "step": 31810 + }, + { + "epoch": 2.3710879284649775, + "grad_norm": 2.404266119003296, + "learning_rate": 0.0002, + "loss": 2.4523, + "step": 31820 + }, + { + "epoch": 2.371833084947839, + "grad_norm": 2.5798537731170654, + "learning_rate": 0.0002, + "loss": 2.5129, + "step": 31830 + }, + { + "epoch": 2.3725782414307006, + "grad_norm": 2.3345487117767334, + "learning_rate": 0.0002, + "loss": 2.4274, + "step": 31840 + }, + { + "epoch": 2.3733233979135617, + "grad_norm": 2.4548275470733643, + "learning_rate": 0.0002, + "loss": 2.6288, + "step": 31850 + }, + { + "epoch": 2.3740685543964233, + "grad_norm": 2.830592632293701, + "learning_rate": 0.0002, + "loss": 2.4632, + "step": 31860 + }, + { + "epoch": 2.374813710879285, + "grad_norm": 2.5509464740753174, + "learning_rate": 0.0002, + "loss": 2.4051, + "step": 31870 + }, + { + "epoch": 2.375558867362146, + "grad_norm": 2.498534679412842, + "learning_rate": 0.0002, + "loss": 2.4469, + "step": 31880 + }, + { + "epoch": 2.3763040238450075, + "grad_norm": 2.4056124687194824, + "learning_rate": 0.0002, + "loss": 2.4079, + "step": 31890 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 2.7303104400634766, + "learning_rate": 0.0002, + "loss": 2.3214, + "step": 31900 + }, + { + "epoch": 2.37779433681073, + "grad_norm": 2.8305602073669434, + "learning_rate": 0.0002, + "loss": 2.4894, + "step": 31910 + }, + { + "epoch": 2.3785394932935917, + "grad_norm": 3.2949180603027344, + "learning_rate": 0.0002, + "loss": 2.4071, + "step": 31920 + }, + { + "epoch": 2.3792846497764533, + "grad_norm": 2.639760971069336, + "learning_rate": 0.0002, + "loss": 2.5399, + "step": 31930 + }, + { + "epoch": 2.3800298062593144, + "grad_norm": 2.7378196716308594, + "learning_rate": 0.0002, + "loss": 2.4692, + "step": 31940 + }, + { + "epoch": 2.380774962742176, + "grad_norm": 2.9000864028930664, + "learning_rate": 0.0002, + "loss": 2.6328, + "step": 31950 + }, + { + "epoch": 2.381520119225037, + "grad_norm": 2.8372485637664795, + "learning_rate": 0.0002, + "loss": 2.5593, + "step": 31960 + }, + { + "epoch": 2.3822652757078986, + "grad_norm": 2.391453504562378, + "learning_rate": 0.0002, + "loss": 2.4111, + "step": 31970 + }, + { + "epoch": 2.38301043219076, + "grad_norm": 2.5468664169311523, + "learning_rate": 0.0002, + "loss": 2.5173, + "step": 31980 + }, + { + "epoch": 2.3837555886736217, + "grad_norm": 2.59735107421875, + "learning_rate": 0.0002, + "loss": 2.5609, + "step": 31990 + }, + { + "epoch": 2.384500745156483, + "grad_norm": 2.841322660446167, + "learning_rate": 0.0002, + "loss": 2.4528, + "step": 32000 + }, + { + "epoch": 2.3852459016393444, + "grad_norm": 2.7470717430114746, + "learning_rate": 0.0002, + "loss": 2.6136, + "step": 32010 + }, + { + "epoch": 2.3859910581222055, + "grad_norm": 3.1498608589172363, + "learning_rate": 0.0002, + "loss": 2.4515, + "step": 32020 + }, + { + "epoch": 2.386736214605067, + "grad_norm": 2.6419332027435303, + "learning_rate": 0.0002, + "loss": 2.6439, + "step": 32030 + }, + { + "epoch": 2.3874813710879286, + "grad_norm": 2.7486846446990967, + "learning_rate": 0.0002, + "loss": 2.3877, + "step": 32040 + }, + { + "epoch": 2.3882265275707897, + "grad_norm": 2.418884515762329, + "learning_rate": 0.0002, + "loss": 2.3455, + "step": 32050 + }, + { + "epoch": 2.3889716840536512, + "grad_norm": 2.8571934700012207, + "learning_rate": 0.0002, + "loss": 2.5127, + "step": 32060 + }, + { + "epoch": 2.389716840536513, + "grad_norm": 2.683866262435913, + "learning_rate": 0.0002, + "loss": 2.604, + "step": 32070 + }, + { + "epoch": 2.390461997019374, + "grad_norm": 2.7434074878692627, + "learning_rate": 0.0002, + "loss": 2.4338, + "step": 32080 + }, + { + "epoch": 2.3912071535022354, + "grad_norm": 2.5270578861236572, + "learning_rate": 0.0002, + "loss": 2.555, + "step": 32090 + }, + { + "epoch": 2.391952309985097, + "grad_norm": 2.526337146759033, + "learning_rate": 0.0002, + "loss": 2.5349, + "step": 32100 + }, + { + "epoch": 2.392697466467958, + "grad_norm": 2.61208176612854, + "learning_rate": 0.0002, + "loss": 2.634, + "step": 32110 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 2.6599786281585693, + "learning_rate": 0.0002, + "loss": 2.2547, + "step": 32120 + }, + { + "epoch": 2.394187779433681, + "grad_norm": 2.4096555709838867, + "learning_rate": 0.0002, + "loss": 2.4555, + "step": 32130 + }, + { + "epoch": 2.3949329359165423, + "grad_norm": 2.4262518882751465, + "learning_rate": 0.0002, + "loss": 2.5405, + "step": 32140 + }, + { + "epoch": 2.395678092399404, + "grad_norm": 2.738983392715454, + "learning_rate": 0.0002, + "loss": 2.5729, + "step": 32150 + }, + { + "epoch": 2.3964232488822654, + "grad_norm": 2.9784836769104004, + "learning_rate": 0.0002, + "loss": 2.5058, + "step": 32160 + }, + { + "epoch": 2.3971684053651265, + "grad_norm": 2.204904556274414, + "learning_rate": 0.0002, + "loss": 2.4696, + "step": 32170 + }, + { + "epoch": 2.397913561847988, + "grad_norm": 2.905687093734741, + "learning_rate": 0.0002, + "loss": 2.5, + "step": 32180 + }, + { + "epoch": 2.3986587183308496, + "grad_norm": 2.540802240371704, + "learning_rate": 0.0002, + "loss": 2.7352, + "step": 32190 + }, + { + "epoch": 2.3994038748137108, + "grad_norm": 2.4641029834747314, + "learning_rate": 0.0002, + "loss": 2.2883, + "step": 32200 + }, + { + "epoch": 2.4001490312965723, + "grad_norm": 2.787086248397827, + "learning_rate": 0.0002, + "loss": 2.3687, + "step": 32210 + }, + { + "epoch": 2.400894187779434, + "grad_norm": 2.6492245197296143, + "learning_rate": 0.0002, + "loss": 2.5022, + "step": 32220 + }, + { + "epoch": 2.401639344262295, + "grad_norm": 2.8292956352233887, + "learning_rate": 0.0002, + "loss": 2.5244, + "step": 32230 + }, + { + "epoch": 2.4023845007451565, + "grad_norm": 2.6720879077911377, + "learning_rate": 0.0002, + "loss": 2.3531, + "step": 32240 + }, + { + "epoch": 2.4031296572280176, + "grad_norm": 2.6397554874420166, + "learning_rate": 0.0002, + "loss": 2.6169, + "step": 32250 + }, + { + "epoch": 2.403874813710879, + "grad_norm": 2.494509696960449, + "learning_rate": 0.0002, + "loss": 2.6833, + "step": 32260 + }, + { + "epoch": 2.4046199701937407, + "grad_norm": 2.6733009815216064, + "learning_rate": 0.0002, + "loss": 2.5063, + "step": 32270 + }, + { + "epoch": 2.4053651266766023, + "grad_norm": 2.522325277328491, + "learning_rate": 0.0002, + "loss": 2.4202, + "step": 32280 + }, + { + "epoch": 2.4061102831594634, + "grad_norm": 2.934260845184326, + "learning_rate": 0.0002, + "loss": 2.6828, + "step": 32290 + }, + { + "epoch": 2.406855439642325, + "grad_norm": 2.7040140628814697, + "learning_rate": 0.0002, + "loss": 2.4384, + "step": 32300 + }, + { + "epoch": 2.407600596125186, + "grad_norm": 2.9310309886932373, + "learning_rate": 0.0002, + "loss": 2.5099, + "step": 32310 + }, + { + "epoch": 2.4083457526080476, + "grad_norm": 2.7560479640960693, + "learning_rate": 0.0002, + "loss": 2.4807, + "step": 32320 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 2.6352450847625732, + "learning_rate": 0.0002, + "loss": 2.5188, + "step": 32330 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 2.613154888153076, + "learning_rate": 0.0002, + "loss": 2.6614, + "step": 32340 + }, + { + "epoch": 2.410581222056632, + "grad_norm": 2.5213232040405273, + "learning_rate": 0.0002, + "loss": 2.6163, + "step": 32350 + }, + { + "epoch": 2.4113263785394934, + "grad_norm": 2.757983684539795, + "learning_rate": 0.0002, + "loss": 2.4484, + "step": 32360 + }, + { + "epoch": 2.4120715350223545, + "grad_norm": 2.594177007675171, + "learning_rate": 0.0002, + "loss": 2.4938, + "step": 32370 + }, + { + "epoch": 2.412816691505216, + "grad_norm": 2.2750751972198486, + "learning_rate": 0.0002, + "loss": 2.3242, + "step": 32380 + }, + { + "epoch": 2.4135618479880776, + "grad_norm": 3.5427920818328857, + "learning_rate": 0.0002, + "loss": 2.6142, + "step": 32390 + }, + { + "epoch": 2.4143070044709387, + "grad_norm": 2.4356930255889893, + "learning_rate": 0.0002, + "loss": 2.2886, + "step": 32400 + }, + { + "epoch": 2.4150521609538003, + "grad_norm": 3.0683324337005615, + "learning_rate": 0.0002, + "loss": 2.3534, + "step": 32410 + }, + { + "epoch": 2.415797317436662, + "grad_norm": 2.464430809020996, + "learning_rate": 0.0002, + "loss": 2.3722, + "step": 32420 + }, + { + "epoch": 2.416542473919523, + "grad_norm": 2.803317070007324, + "learning_rate": 0.0002, + "loss": 2.4745, + "step": 32430 + }, + { + "epoch": 2.4172876304023845, + "grad_norm": 2.4788084030151367, + "learning_rate": 0.0002, + "loss": 2.4501, + "step": 32440 + }, + { + "epoch": 2.418032786885246, + "grad_norm": 2.686638832092285, + "learning_rate": 0.0002, + "loss": 2.5698, + "step": 32450 + }, + { + "epoch": 2.418777943368107, + "grad_norm": 3.0381052494049072, + "learning_rate": 0.0002, + "loss": 2.5183, + "step": 32460 + }, + { + "epoch": 2.4195230998509687, + "grad_norm": 2.5471017360687256, + "learning_rate": 0.0002, + "loss": 2.5867, + "step": 32470 + }, + { + "epoch": 2.4202682563338302, + "grad_norm": 2.2520577907562256, + "learning_rate": 0.0002, + "loss": 2.3733, + "step": 32480 + }, + { + "epoch": 2.4210134128166914, + "grad_norm": 2.5006518363952637, + "learning_rate": 0.0002, + "loss": 2.5879, + "step": 32490 + }, + { + "epoch": 2.421758569299553, + "grad_norm": 2.8727335929870605, + "learning_rate": 0.0002, + "loss": 2.489, + "step": 32500 + }, + { + "epoch": 2.4225037257824145, + "grad_norm": 2.567643165588379, + "learning_rate": 0.0002, + "loss": 2.4855, + "step": 32510 + }, + { + "epoch": 2.4232488822652756, + "grad_norm": 2.561478614807129, + "learning_rate": 0.0002, + "loss": 2.4656, + "step": 32520 + }, + { + "epoch": 2.423994038748137, + "grad_norm": 2.8764541149139404, + "learning_rate": 0.0002, + "loss": 2.2183, + "step": 32530 + }, + { + "epoch": 2.4247391952309987, + "grad_norm": 2.419424057006836, + "learning_rate": 0.0002, + "loss": 2.3354, + "step": 32540 + }, + { + "epoch": 2.42548435171386, + "grad_norm": 2.3081166744232178, + "learning_rate": 0.0002, + "loss": 2.3535, + "step": 32550 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 2.464271068572998, + "learning_rate": 0.0002, + "loss": 2.437, + "step": 32560 + }, + { + "epoch": 2.426974664679583, + "grad_norm": 2.919722080230713, + "learning_rate": 0.0002, + "loss": 2.6343, + "step": 32570 + }, + { + "epoch": 2.427719821162444, + "grad_norm": 2.682070016860962, + "learning_rate": 0.0002, + "loss": 2.453, + "step": 32580 + }, + { + "epoch": 2.4284649776453056, + "grad_norm": 2.3702690601348877, + "learning_rate": 0.0002, + "loss": 2.3294, + "step": 32590 + }, + { + "epoch": 2.429210134128167, + "grad_norm": 2.7352712154388428, + "learning_rate": 0.0002, + "loss": 2.6568, + "step": 32600 + }, + { + "epoch": 2.429955290611028, + "grad_norm": 2.396092653274536, + "learning_rate": 0.0002, + "loss": 2.5299, + "step": 32610 + }, + { + "epoch": 2.4307004470938898, + "grad_norm": 2.298727512359619, + "learning_rate": 0.0002, + "loss": 2.4894, + "step": 32620 + }, + { + "epoch": 2.4314456035767513, + "grad_norm": 2.5034990310668945, + "learning_rate": 0.0002, + "loss": 2.5429, + "step": 32630 + }, + { + "epoch": 2.4321907600596124, + "grad_norm": 2.662663221359253, + "learning_rate": 0.0002, + "loss": 2.5597, + "step": 32640 + }, + { + "epoch": 2.432935916542474, + "grad_norm": 2.516818046569824, + "learning_rate": 0.0002, + "loss": 2.4248, + "step": 32650 + }, + { + "epoch": 2.433681073025335, + "grad_norm": 2.416969060897827, + "learning_rate": 0.0002, + "loss": 2.5677, + "step": 32660 + }, + { + "epoch": 2.4344262295081966, + "grad_norm": 2.575011968612671, + "learning_rate": 0.0002, + "loss": 2.5937, + "step": 32670 + }, + { + "epoch": 2.435171385991058, + "grad_norm": 2.692478895187378, + "learning_rate": 0.0002, + "loss": 2.5584, + "step": 32680 + }, + { + "epoch": 2.4359165424739198, + "grad_norm": 3.400285243988037, + "learning_rate": 0.0002, + "loss": 2.415, + "step": 32690 + }, + { + "epoch": 2.436661698956781, + "grad_norm": 3.0482239723205566, + "learning_rate": 0.0002, + "loss": 2.6148, + "step": 32700 + }, + { + "epoch": 2.4374068554396424, + "grad_norm": 2.66845965385437, + "learning_rate": 0.0002, + "loss": 2.4034, + "step": 32710 + }, + { + "epoch": 2.4381520119225035, + "grad_norm": 2.6935739517211914, + "learning_rate": 0.0002, + "loss": 2.1729, + "step": 32720 + }, + { + "epoch": 2.438897168405365, + "grad_norm": 2.8061110973358154, + "learning_rate": 0.0002, + "loss": 2.3881, + "step": 32730 + }, + { + "epoch": 2.4396423248882266, + "grad_norm": 2.6738741397857666, + "learning_rate": 0.0002, + "loss": 2.4427, + "step": 32740 + }, + { + "epoch": 2.4403874813710877, + "grad_norm": 2.7789554595947266, + "learning_rate": 0.0002, + "loss": 2.3962, + "step": 32750 + }, + { + "epoch": 2.4411326378539493, + "grad_norm": 2.6854817867279053, + "learning_rate": 0.0002, + "loss": 2.4369, + "step": 32760 + }, + { + "epoch": 2.441877794336811, + "grad_norm": 2.570359706878662, + "learning_rate": 0.0002, + "loss": 2.4015, + "step": 32770 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 2.7053561210632324, + "learning_rate": 0.0002, + "loss": 2.4896, + "step": 32780 + }, + { + "epoch": 2.4433681073025335, + "grad_norm": 2.5930299758911133, + "learning_rate": 0.0002, + "loss": 2.4455, + "step": 32790 + }, + { + "epoch": 2.444113263785395, + "grad_norm": 2.827674627304077, + "learning_rate": 0.0002, + "loss": 2.4475, + "step": 32800 + }, + { + "epoch": 2.444858420268256, + "grad_norm": 2.686424732208252, + "learning_rate": 0.0002, + "loss": 2.4927, + "step": 32810 + }, + { + "epoch": 2.4456035767511177, + "grad_norm": 2.965540647506714, + "learning_rate": 0.0002, + "loss": 2.673, + "step": 32820 + }, + { + "epoch": 2.4463487332339793, + "grad_norm": 2.8941571712493896, + "learning_rate": 0.0002, + "loss": 2.5195, + "step": 32830 + }, + { + "epoch": 2.4470938897168404, + "grad_norm": 3.103050947189331, + "learning_rate": 0.0002, + "loss": 2.5864, + "step": 32840 + }, + { + "epoch": 2.447839046199702, + "grad_norm": 2.7334511280059814, + "learning_rate": 0.0002, + "loss": 2.4724, + "step": 32850 + }, + { + "epoch": 2.4485842026825635, + "grad_norm": 2.449275493621826, + "learning_rate": 0.0002, + "loss": 2.5776, + "step": 32860 + }, + { + "epoch": 2.4493293591654246, + "grad_norm": 2.5807056427001953, + "learning_rate": 0.0002, + "loss": 2.536, + "step": 32870 + }, + { + "epoch": 2.450074515648286, + "grad_norm": 2.7957377433776855, + "learning_rate": 0.0002, + "loss": 2.6551, + "step": 32880 + }, + { + "epoch": 2.4508196721311477, + "grad_norm": 2.858997106552124, + "learning_rate": 0.0002, + "loss": 2.3838, + "step": 32890 + }, + { + "epoch": 2.451564828614009, + "grad_norm": 2.8242721557617188, + "learning_rate": 0.0002, + "loss": 2.3843, + "step": 32900 + }, + { + "epoch": 2.4523099850968704, + "grad_norm": 2.914250612258911, + "learning_rate": 0.0002, + "loss": 2.3497, + "step": 32910 + }, + { + "epoch": 2.453055141579732, + "grad_norm": 2.325657606124878, + "learning_rate": 0.0002, + "loss": 2.5284, + "step": 32920 + }, + { + "epoch": 2.453800298062593, + "grad_norm": 2.4877982139587402, + "learning_rate": 0.0002, + "loss": 2.5986, + "step": 32930 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 2.6177945137023926, + "learning_rate": 0.0002, + "loss": 2.5368, + "step": 32940 + }, + { + "epoch": 2.455290611028316, + "grad_norm": 2.1501779556274414, + "learning_rate": 0.0002, + "loss": 2.3093, + "step": 32950 + }, + { + "epoch": 2.4560357675111772, + "grad_norm": 2.546349048614502, + "learning_rate": 0.0002, + "loss": 2.3231, + "step": 32960 + }, + { + "epoch": 2.456780923994039, + "grad_norm": 2.3199493885040283, + "learning_rate": 0.0002, + "loss": 2.5672, + "step": 32970 + }, + { + "epoch": 2.4575260804769004, + "grad_norm": 2.532883405685425, + "learning_rate": 0.0002, + "loss": 2.2814, + "step": 32980 + }, + { + "epoch": 2.4582712369597615, + "grad_norm": 2.9254093170166016, + "learning_rate": 0.0002, + "loss": 2.5414, + "step": 32990 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 2.5555853843688965, + "learning_rate": 0.0002, + "loss": 2.5877, + "step": 33000 + }, + { + "epoch": 2.459761549925484, + "grad_norm": 2.520555019378662, + "learning_rate": 0.0002, + "loss": 2.4599, + "step": 33010 + }, + { + "epoch": 2.4605067064083457, + "grad_norm": 2.652587413787842, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 33020 + }, + { + "epoch": 2.4612518628912072, + "grad_norm": 2.7828288078308105, + "learning_rate": 0.0002, + "loss": 2.5077, + "step": 33030 + }, + { + "epoch": 2.461997019374069, + "grad_norm": 2.4250879287719727, + "learning_rate": 0.0002, + "loss": 2.6537, + "step": 33040 + }, + { + "epoch": 2.46274217585693, + "grad_norm": 2.376924753189087, + "learning_rate": 0.0002, + "loss": 2.5013, + "step": 33050 + }, + { + "epoch": 2.4634873323397914, + "grad_norm": 2.6665396690368652, + "learning_rate": 0.0002, + "loss": 2.452, + "step": 33060 + }, + { + "epoch": 2.4642324888226526, + "grad_norm": 2.920982599258423, + "learning_rate": 0.0002, + "loss": 2.4782, + "step": 33070 + }, + { + "epoch": 2.464977645305514, + "grad_norm": 1.9443694353103638, + "learning_rate": 0.0002, + "loss": 2.2695, + "step": 33080 + }, + { + "epoch": 2.4657228017883757, + "grad_norm": 2.8950319290161133, + "learning_rate": 0.0002, + "loss": 2.6325, + "step": 33090 + }, + { + "epoch": 2.4664679582712368, + "grad_norm": 3.0620553493499756, + "learning_rate": 0.0002, + "loss": 2.3013, + "step": 33100 + }, + { + "epoch": 2.4672131147540983, + "grad_norm": 2.653696060180664, + "learning_rate": 0.0002, + "loss": 2.5691, + "step": 33110 + }, + { + "epoch": 2.46795827123696, + "grad_norm": 2.347299337387085, + "learning_rate": 0.0002, + "loss": 2.2995, + "step": 33120 + }, + { + "epoch": 2.468703427719821, + "grad_norm": 2.8021976947784424, + "learning_rate": 0.0002, + "loss": 2.6018, + "step": 33130 + }, + { + "epoch": 2.4694485842026825, + "grad_norm": 1.9987612962722778, + "learning_rate": 0.0002, + "loss": 2.4832, + "step": 33140 + }, + { + "epoch": 2.470193740685544, + "grad_norm": 2.7334144115448, + "learning_rate": 0.0002, + "loss": 2.4264, + "step": 33150 + }, + { + "epoch": 2.470938897168405, + "grad_norm": 2.6771295070648193, + "learning_rate": 0.0002, + "loss": 2.5609, + "step": 33160 + }, + { + "epoch": 2.4716840536512668, + "grad_norm": 2.628952741622925, + "learning_rate": 0.0002, + "loss": 2.5076, + "step": 33170 + }, + { + "epoch": 2.4724292101341283, + "grad_norm": 2.5400376319885254, + "learning_rate": 0.0002, + "loss": 2.6434, + "step": 33180 + }, + { + "epoch": 2.4731743666169894, + "grad_norm": 2.332078695297241, + "learning_rate": 0.0002, + "loss": 2.4067, + "step": 33190 + }, + { + "epoch": 2.473919523099851, + "grad_norm": 2.7730212211608887, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 33200 + }, + { + "epoch": 2.4746646795827125, + "grad_norm": 2.5415468215942383, + "learning_rate": 0.0002, + "loss": 2.6113, + "step": 33210 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 2.6376805305480957, + "learning_rate": 0.0002, + "loss": 2.4445, + "step": 33220 + }, + { + "epoch": 2.476154992548435, + "grad_norm": 2.808635711669922, + "learning_rate": 0.0002, + "loss": 2.4696, + "step": 33230 + }, + { + "epoch": 2.4769001490312967, + "grad_norm": 2.3896541595458984, + "learning_rate": 0.0002, + "loss": 2.316, + "step": 33240 + }, + { + "epoch": 2.477645305514158, + "grad_norm": 2.628732919692993, + "learning_rate": 0.0002, + "loss": 2.661, + "step": 33250 + }, + { + "epoch": 2.4783904619970194, + "grad_norm": 2.627577543258667, + "learning_rate": 0.0002, + "loss": 2.5131, + "step": 33260 + }, + { + "epoch": 2.479135618479881, + "grad_norm": 2.3876442909240723, + "learning_rate": 0.0002, + "loss": 2.3684, + "step": 33270 + }, + { + "epoch": 2.479880774962742, + "grad_norm": 2.680107355117798, + "learning_rate": 0.0002, + "loss": 2.3554, + "step": 33280 + }, + { + "epoch": 2.4806259314456036, + "grad_norm": 2.5032846927642822, + "learning_rate": 0.0002, + "loss": 2.4945, + "step": 33290 + }, + { + "epoch": 2.481371087928465, + "grad_norm": 2.6249349117279053, + "learning_rate": 0.0002, + "loss": 2.574, + "step": 33300 + }, + { + "epoch": 2.4821162444113263, + "grad_norm": 2.3198981285095215, + "learning_rate": 0.0002, + "loss": 2.2761, + "step": 33310 + }, + { + "epoch": 2.482861400894188, + "grad_norm": 2.745832920074463, + "learning_rate": 0.0002, + "loss": 2.2473, + "step": 33320 + }, + { + "epoch": 2.4836065573770494, + "grad_norm": 2.7251551151275635, + "learning_rate": 0.0002, + "loss": 2.5399, + "step": 33330 + }, + { + "epoch": 2.4843517138599105, + "grad_norm": 2.507437229156494, + "learning_rate": 0.0002, + "loss": 2.4234, + "step": 33340 + }, + { + "epoch": 2.485096870342772, + "grad_norm": 2.7611029148101807, + "learning_rate": 0.0002, + "loss": 2.4828, + "step": 33350 + }, + { + "epoch": 2.485842026825633, + "grad_norm": 2.7167751789093018, + "learning_rate": 0.0002, + "loss": 2.3991, + "step": 33360 + }, + { + "epoch": 2.4865871833084947, + "grad_norm": 2.4960172176361084, + "learning_rate": 0.0002, + "loss": 2.3095, + "step": 33370 + }, + { + "epoch": 2.4873323397913563, + "grad_norm": 2.461411237716675, + "learning_rate": 0.0002, + "loss": 2.4042, + "step": 33380 + }, + { + "epoch": 2.488077496274218, + "grad_norm": 2.855609655380249, + "learning_rate": 0.0002, + "loss": 2.6609, + "step": 33390 + }, + { + "epoch": 2.488822652757079, + "grad_norm": 2.677143096923828, + "learning_rate": 0.0002, + "loss": 2.4067, + "step": 33400 + }, + { + "epoch": 2.4895678092399405, + "grad_norm": 2.7277297973632812, + "learning_rate": 0.0002, + "loss": 2.4223, + "step": 33410 + }, + { + "epoch": 2.4903129657228016, + "grad_norm": 2.5944976806640625, + "learning_rate": 0.0002, + "loss": 2.622, + "step": 33420 + }, + { + "epoch": 2.491058122205663, + "grad_norm": 2.656810998916626, + "learning_rate": 0.0002, + "loss": 2.5017, + "step": 33430 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 2.643364429473877, + "learning_rate": 0.0002, + "loss": 2.5823, + "step": 33440 + }, + { + "epoch": 2.492548435171386, + "grad_norm": 2.4666240215301514, + "learning_rate": 0.0002, + "loss": 2.4402, + "step": 33450 + }, + { + "epoch": 2.4932935916542474, + "grad_norm": 2.453465700149536, + "learning_rate": 0.0002, + "loss": 2.6123, + "step": 33460 + }, + { + "epoch": 2.494038748137109, + "grad_norm": 2.574659824371338, + "learning_rate": 0.0002, + "loss": 2.642, + "step": 33470 + }, + { + "epoch": 2.49478390461997, + "grad_norm": 2.6516501903533936, + "learning_rate": 0.0002, + "loss": 2.3554, + "step": 33480 + }, + { + "epoch": 2.4955290611028316, + "grad_norm": 2.5587661266326904, + "learning_rate": 0.0002, + "loss": 2.3586, + "step": 33490 + }, + { + "epoch": 2.496274217585693, + "grad_norm": 2.8648366928100586, + "learning_rate": 0.0002, + "loss": 2.392, + "step": 33500 + }, + { + "epoch": 2.4970193740685542, + "grad_norm": 2.6029820442199707, + "learning_rate": 0.0002, + "loss": 2.4047, + "step": 33510 + }, + { + "epoch": 2.497764530551416, + "grad_norm": 3.100987672805786, + "learning_rate": 0.0002, + "loss": 2.4915, + "step": 33520 + }, + { + "epoch": 2.4985096870342773, + "grad_norm": 2.6375701427459717, + "learning_rate": 0.0002, + "loss": 2.3645, + "step": 33530 + }, + { + "epoch": 2.4992548435171384, + "grad_norm": 2.5169084072113037, + "learning_rate": 0.0002, + "loss": 2.3556, + "step": 33540 + }, + { + "epoch": 2.5, + "grad_norm": 2.6907997131347656, + "learning_rate": 0.0002, + "loss": 2.4695, + "step": 33550 + }, + { + "epoch": 2.5007451564828616, + "grad_norm": 2.838801622390747, + "learning_rate": 0.0002, + "loss": 2.6255, + "step": 33560 + }, + { + "epoch": 2.5014903129657227, + "grad_norm": 2.6259567737579346, + "learning_rate": 0.0002, + "loss": 2.232, + "step": 33570 + }, + { + "epoch": 2.502235469448584, + "grad_norm": 2.487612247467041, + "learning_rate": 0.0002, + "loss": 2.4681, + "step": 33580 + }, + { + "epoch": 2.5029806259314458, + "grad_norm": 2.7123425006866455, + "learning_rate": 0.0002, + "loss": 2.4245, + "step": 33590 + }, + { + "epoch": 2.503725782414307, + "grad_norm": 2.5011355876922607, + "learning_rate": 0.0002, + "loss": 2.3437, + "step": 33600 + }, + { + "epoch": 2.5044709388971684, + "grad_norm": 2.726006507873535, + "learning_rate": 0.0002, + "loss": 2.5336, + "step": 33610 + }, + { + "epoch": 2.50521609538003, + "grad_norm": 2.6538584232330322, + "learning_rate": 0.0002, + "loss": 2.5752, + "step": 33620 + }, + { + "epoch": 2.505961251862891, + "grad_norm": 2.8791120052337646, + "learning_rate": 0.0002, + "loss": 2.5624, + "step": 33630 + }, + { + "epoch": 2.5067064083457526, + "grad_norm": 2.7906670570373535, + "learning_rate": 0.0002, + "loss": 2.566, + "step": 33640 + }, + { + "epoch": 2.5074515648286138, + "grad_norm": 2.980694532394409, + "learning_rate": 0.0002, + "loss": 2.6706, + "step": 33650 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 2.641268014907837, + "learning_rate": 0.0002, + "loss": 2.4101, + "step": 33660 + }, + { + "epoch": 2.508941877794337, + "grad_norm": 2.835050344467163, + "learning_rate": 0.0002, + "loss": 2.4747, + "step": 33670 + }, + { + "epoch": 2.5096870342771984, + "grad_norm": 2.5962984561920166, + "learning_rate": 0.0002, + "loss": 2.3435, + "step": 33680 + }, + { + "epoch": 2.5104321907600595, + "grad_norm": 2.6460578441619873, + "learning_rate": 0.0002, + "loss": 2.4282, + "step": 33690 + }, + { + "epoch": 2.511177347242921, + "grad_norm": 2.415022373199463, + "learning_rate": 0.0002, + "loss": 2.3002, + "step": 33700 + }, + { + "epoch": 2.511922503725782, + "grad_norm": 2.628546953201294, + "learning_rate": 0.0002, + "loss": 2.6371, + "step": 33710 + }, + { + "epoch": 2.5126676602086437, + "grad_norm": 2.697800636291504, + "learning_rate": 0.0002, + "loss": 2.302, + "step": 33720 + }, + { + "epoch": 2.5134128166915053, + "grad_norm": 2.8529772758483887, + "learning_rate": 0.0002, + "loss": 2.5254, + "step": 33730 + }, + { + "epoch": 2.514157973174367, + "grad_norm": 2.5316379070281982, + "learning_rate": 0.0002, + "loss": 2.4038, + "step": 33740 + }, + { + "epoch": 2.514903129657228, + "grad_norm": 2.350818634033203, + "learning_rate": 0.0002, + "loss": 2.3349, + "step": 33750 + }, + { + "epoch": 2.5156482861400895, + "grad_norm": 2.5626444816589355, + "learning_rate": 0.0002, + "loss": 2.5337, + "step": 33760 + }, + { + "epoch": 2.5163934426229506, + "grad_norm": 2.1654140949249268, + "learning_rate": 0.0002, + "loss": 2.3723, + "step": 33770 + }, + { + "epoch": 2.517138599105812, + "grad_norm": 3.0190258026123047, + "learning_rate": 0.0002, + "loss": 2.5135, + "step": 33780 + }, + { + "epoch": 2.5178837555886737, + "grad_norm": 2.6092681884765625, + "learning_rate": 0.0002, + "loss": 2.4508, + "step": 33790 + }, + { + "epoch": 2.5186289120715353, + "grad_norm": 3.0868630409240723, + "learning_rate": 0.0002, + "loss": 2.4954, + "step": 33800 + }, + { + "epoch": 2.5193740685543964, + "grad_norm": 2.437244176864624, + "learning_rate": 0.0002, + "loss": 2.5313, + "step": 33810 + }, + { + "epoch": 2.520119225037258, + "grad_norm": 2.7770519256591797, + "learning_rate": 0.0002, + "loss": 2.4047, + "step": 33820 + }, + { + "epoch": 2.520864381520119, + "grad_norm": 2.540032386779785, + "learning_rate": 0.0002, + "loss": 2.3, + "step": 33830 + }, + { + "epoch": 2.5216095380029806, + "grad_norm": 2.8454859256744385, + "learning_rate": 0.0002, + "loss": 2.4243, + "step": 33840 + }, + { + "epoch": 2.522354694485842, + "grad_norm": 2.4793384075164795, + "learning_rate": 0.0002, + "loss": 2.441, + "step": 33850 + }, + { + "epoch": 2.5230998509687033, + "grad_norm": 2.884309768676758, + "learning_rate": 0.0002, + "loss": 2.6058, + "step": 33860 + }, + { + "epoch": 2.523845007451565, + "grad_norm": 2.188300371170044, + "learning_rate": 0.0002, + "loss": 2.2676, + "step": 33870 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 2.1281039714813232, + "learning_rate": 0.0002, + "loss": 2.4825, + "step": 33880 + }, + { + "epoch": 2.5253353204172875, + "grad_norm": 2.872929096221924, + "learning_rate": 0.0002, + "loss": 2.6046, + "step": 33890 + }, + { + "epoch": 2.526080476900149, + "grad_norm": 2.7643649578094482, + "learning_rate": 0.0002, + "loss": 2.5626, + "step": 33900 + }, + { + "epoch": 2.5268256333830106, + "grad_norm": 2.42836856842041, + "learning_rate": 0.0002, + "loss": 2.5225, + "step": 33910 + }, + { + "epoch": 2.5275707898658717, + "grad_norm": 2.7934534549713135, + "learning_rate": 0.0002, + "loss": 2.6089, + "step": 33920 + }, + { + "epoch": 2.5283159463487332, + "grad_norm": 2.9219272136688232, + "learning_rate": 0.0002, + "loss": 2.4704, + "step": 33930 + }, + { + "epoch": 2.529061102831595, + "grad_norm": 2.470702648162842, + "learning_rate": 0.0002, + "loss": 2.5143, + "step": 33940 + }, + { + "epoch": 2.529806259314456, + "grad_norm": 2.40101957321167, + "learning_rate": 0.0002, + "loss": 2.4033, + "step": 33950 + }, + { + "epoch": 2.5305514157973175, + "grad_norm": 2.4224092960357666, + "learning_rate": 0.0002, + "loss": 2.3719, + "step": 33960 + }, + { + "epoch": 2.531296572280179, + "grad_norm": 2.8269007205963135, + "learning_rate": 0.0002, + "loss": 2.6289, + "step": 33970 + }, + { + "epoch": 2.53204172876304, + "grad_norm": 2.590324878692627, + "learning_rate": 0.0002, + "loss": 2.3846, + "step": 33980 + }, + { + "epoch": 2.5327868852459017, + "grad_norm": 2.609064817428589, + "learning_rate": 0.0002, + "loss": 2.5036, + "step": 33990 + }, + { + "epoch": 2.533532041728763, + "grad_norm": 2.3954341411590576, + "learning_rate": 0.0002, + "loss": 2.3507, + "step": 34000 + }, + { + "epoch": 2.5342771982116243, + "grad_norm": 2.4784460067749023, + "learning_rate": 0.0002, + "loss": 2.5389, + "step": 34010 + }, + { + "epoch": 2.535022354694486, + "grad_norm": 2.3329291343688965, + "learning_rate": 0.0002, + "loss": 2.4564, + "step": 34020 + }, + { + "epoch": 2.5357675111773474, + "grad_norm": 2.564966917037964, + "learning_rate": 0.0002, + "loss": 2.5259, + "step": 34030 + }, + { + "epoch": 2.5365126676602086, + "grad_norm": 2.7907464504241943, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 34040 + }, + { + "epoch": 2.53725782414307, + "grad_norm": 2.61496901512146, + "learning_rate": 0.0002, + "loss": 2.4294, + "step": 34050 + }, + { + "epoch": 2.538002980625931, + "grad_norm": 2.5736818313598633, + "learning_rate": 0.0002, + "loss": 2.2709, + "step": 34060 + }, + { + "epoch": 2.5387481371087928, + "grad_norm": 2.3524608612060547, + "learning_rate": 0.0002, + "loss": 2.2737, + "step": 34070 + }, + { + "epoch": 2.5394932935916543, + "grad_norm": 2.7142202854156494, + "learning_rate": 0.0002, + "loss": 2.4911, + "step": 34080 + }, + { + "epoch": 2.540238450074516, + "grad_norm": 2.64286470413208, + "learning_rate": 0.0002, + "loss": 2.5254, + "step": 34090 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 3.114713191986084, + "learning_rate": 0.0002, + "loss": 2.5674, + "step": 34100 + }, + { + "epoch": 2.5417287630402385, + "grad_norm": 3.056455373764038, + "learning_rate": 0.0002, + "loss": 2.4114, + "step": 34110 + }, + { + "epoch": 2.5424739195230996, + "grad_norm": 2.2923319339752197, + "learning_rate": 0.0002, + "loss": 2.372, + "step": 34120 + }, + { + "epoch": 2.543219076005961, + "grad_norm": 3.0862503051757812, + "learning_rate": 0.0002, + "loss": 2.5038, + "step": 34130 + }, + { + "epoch": 2.5439642324888228, + "grad_norm": 2.7312214374542236, + "learning_rate": 0.0002, + "loss": 2.7273, + "step": 34140 + }, + { + "epoch": 2.5447093889716843, + "grad_norm": 2.667236089706421, + "learning_rate": 0.0002, + "loss": 2.5896, + "step": 34150 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 2.5944461822509766, + "learning_rate": 0.0002, + "loss": 2.6472, + "step": 34160 + }, + { + "epoch": 2.546199701937407, + "grad_norm": 2.375399589538574, + "learning_rate": 0.0002, + "loss": 2.5534, + "step": 34170 + }, + { + "epoch": 2.546944858420268, + "grad_norm": 2.3310904502868652, + "learning_rate": 0.0002, + "loss": 2.3438, + "step": 34180 + }, + { + "epoch": 2.5476900149031296, + "grad_norm": 2.3396718502044678, + "learning_rate": 0.0002, + "loss": 2.629, + "step": 34190 + }, + { + "epoch": 2.548435171385991, + "grad_norm": 2.717461109161377, + "learning_rate": 0.0002, + "loss": 2.5391, + "step": 34200 + }, + { + "epoch": 2.5491803278688527, + "grad_norm": 2.5795769691467285, + "learning_rate": 0.0002, + "loss": 2.4663, + "step": 34210 + }, + { + "epoch": 2.549925484351714, + "grad_norm": 2.31074857711792, + "learning_rate": 0.0002, + "loss": 2.3499, + "step": 34220 + }, + { + "epoch": 2.5506706408345754, + "grad_norm": 2.2600789070129395, + "learning_rate": 0.0002, + "loss": 2.4089, + "step": 34230 + }, + { + "epoch": 2.5514157973174365, + "grad_norm": 2.5004208087921143, + "learning_rate": 0.0002, + "loss": 2.4895, + "step": 34240 + }, + { + "epoch": 2.552160953800298, + "grad_norm": 2.4714882373809814, + "learning_rate": 0.0002, + "loss": 2.3912, + "step": 34250 + }, + { + "epoch": 2.5529061102831596, + "grad_norm": 2.5733115673065186, + "learning_rate": 0.0002, + "loss": 2.4612, + "step": 34260 + }, + { + "epoch": 2.5536512667660207, + "grad_norm": 2.537883996963501, + "learning_rate": 0.0002, + "loss": 2.5025, + "step": 34270 + }, + { + "epoch": 2.5543964232488823, + "grad_norm": 2.867398262023926, + "learning_rate": 0.0002, + "loss": 2.4764, + "step": 34280 + }, + { + "epoch": 2.555141579731744, + "grad_norm": 2.3895552158355713, + "learning_rate": 0.0002, + "loss": 2.6755, + "step": 34290 + }, + { + "epoch": 2.555886736214605, + "grad_norm": 2.6244559288024902, + "learning_rate": 0.0002, + "loss": 2.5904, + "step": 34300 + }, + { + "epoch": 2.5566318926974665, + "grad_norm": 3.0970191955566406, + "learning_rate": 0.0002, + "loss": 2.4423, + "step": 34310 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 2.543386459350586, + "learning_rate": 0.0002, + "loss": 2.4975, + "step": 34320 + }, + { + "epoch": 2.558122205663189, + "grad_norm": 2.6352686882019043, + "learning_rate": 0.0002, + "loss": 2.5865, + "step": 34330 + }, + { + "epoch": 2.5588673621460507, + "grad_norm": 3.1985583305358887, + "learning_rate": 0.0002, + "loss": 2.6501, + "step": 34340 + }, + { + "epoch": 2.559612518628912, + "grad_norm": 2.206559896469116, + "learning_rate": 0.0002, + "loss": 2.5045, + "step": 34350 + }, + { + "epoch": 2.5603576751117734, + "grad_norm": 2.626335620880127, + "learning_rate": 0.0002, + "loss": 2.4685, + "step": 34360 + }, + { + "epoch": 2.561102831594635, + "grad_norm": 2.7654409408569336, + "learning_rate": 0.0002, + "loss": 2.2565, + "step": 34370 + }, + { + "epoch": 2.5618479880774965, + "grad_norm": 2.5158398151397705, + "learning_rate": 0.0002, + "loss": 2.5521, + "step": 34380 + }, + { + "epoch": 2.5625931445603576, + "grad_norm": 3.0391643047332764, + "learning_rate": 0.0002, + "loss": 2.5027, + "step": 34390 + }, + { + "epoch": 2.563338301043219, + "grad_norm": 2.9316203594207764, + "learning_rate": 0.0002, + "loss": 2.3832, + "step": 34400 + }, + { + "epoch": 2.5640834575260802, + "grad_norm": 2.54579496383667, + "learning_rate": 0.0002, + "loss": 2.4208, + "step": 34410 + }, + { + "epoch": 2.564828614008942, + "grad_norm": 2.8894309997558594, + "learning_rate": 0.0002, + "loss": 2.4221, + "step": 34420 + }, + { + "epoch": 2.5655737704918034, + "grad_norm": 2.742469072341919, + "learning_rate": 0.0002, + "loss": 2.5528, + "step": 34430 + }, + { + "epoch": 2.566318926974665, + "grad_norm": 2.7238597869873047, + "learning_rate": 0.0002, + "loss": 2.5734, + "step": 34440 + }, + { + "epoch": 2.567064083457526, + "grad_norm": 2.892765998840332, + "learning_rate": 0.0002, + "loss": 2.3945, + "step": 34450 + }, + { + "epoch": 2.5678092399403876, + "grad_norm": 2.4451138973236084, + "learning_rate": 0.0002, + "loss": 2.5518, + "step": 34460 + }, + { + "epoch": 2.5685543964232487, + "grad_norm": 2.573063850402832, + "learning_rate": 0.0002, + "loss": 2.4919, + "step": 34470 + }, + { + "epoch": 2.5692995529061102, + "grad_norm": 2.7531449794769287, + "learning_rate": 0.0002, + "loss": 2.5301, + "step": 34480 + }, + { + "epoch": 2.570044709388972, + "grad_norm": 2.6223232746124268, + "learning_rate": 0.0002, + "loss": 2.6511, + "step": 34490 + }, + { + "epoch": 2.5707898658718333, + "grad_norm": 2.8373148441314697, + "learning_rate": 0.0002, + "loss": 2.3971, + "step": 34500 + }, + { + "epoch": 2.5715350223546944, + "grad_norm": 2.770490884780884, + "learning_rate": 0.0002, + "loss": 2.5108, + "step": 34510 + }, + { + "epoch": 2.572280178837556, + "grad_norm": 2.5854387283325195, + "learning_rate": 0.0002, + "loss": 2.5076, + "step": 34520 + }, + { + "epoch": 2.573025335320417, + "grad_norm": 2.411576986312866, + "learning_rate": 0.0002, + "loss": 2.6648, + "step": 34530 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 2.5123982429504395, + "learning_rate": 0.0002, + "loss": 2.5907, + "step": 34540 + }, + { + "epoch": 2.57451564828614, + "grad_norm": 2.831042528152466, + "learning_rate": 0.0002, + "loss": 2.5356, + "step": 34550 + }, + { + "epoch": 2.5752608047690018, + "grad_norm": 2.643927574157715, + "learning_rate": 0.0002, + "loss": 2.4261, + "step": 34560 + }, + { + "epoch": 2.576005961251863, + "grad_norm": 2.698838949203491, + "learning_rate": 0.0002, + "loss": 2.4362, + "step": 34570 + }, + { + "epoch": 2.5767511177347244, + "grad_norm": 2.549325704574585, + "learning_rate": 0.0002, + "loss": 2.5073, + "step": 34580 + }, + { + "epoch": 2.5774962742175855, + "grad_norm": 2.3992767333984375, + "learning_rate": 0.0002, + "loss": 2.468, + "step": 34590 + }, + { + "epoch": 2.578241430700447, + "grad_norm": 3.1624796390533447, + "learning_rate": 0.0002, + "loss": 2.5036, + "step": 34600 + }, + { + "epoch": 2.5789865871833086, + "grad_norm": 2.8962011337280273, + "learning_rate": 0.0002, + "loss": 2.5846, + "step": 34610 + }, + { + "epoch": 2.5797317436661698, + "grad_norm": 2.2955880165100098, + "learning_rate": 0.0002, + "loss": 2.5806, + "step": 34620 + }, + { + "epoch": 2.5804769001490313, + "grad_norm": 2.375588893890381, + "learning_rate": 0.0002, + "loss": 2.4193, + "step": 34630 + }, + { + "epoch": 2.581222056631893, + "grad_norm": 2.7803938388824463, + "learning_rate": 0.0002, + "loss": 2.4429, + "step": 34640 + }, + { + "epoch": 2.581967213114754, + "grad_norm": 2.5117294788360596, + "learning_rate": 0.0002, + "loss": 2.459, + "step": 34650 + }, + { + "epoch": 2.5827123695976155, + "grad_norm": 2.65371036529541, + "learning_rate": 0.0002, + "loss": 2.5789, + "step": 34660 + }, + { + "epoch": 2.583457526080477, + "grad_norm": 2.6229374408721924, + "learning_rate": 0.0002, + "loss": 2.6575, + "step": 34670 + }, + { + "epoch": 2.584202682563338, + "grad_norm": 3.0383689403533936, + "learning_rate": 0.0002, + "loss": 2.5808, + "step": 34680 + }, + { + "epoch": 2.5849478390461997, + "grad_norm": 2.6986963748931885, + "learning_rate": 0.0002, + "loss": 2.3615, + "step": 34690 + }, + { + "epoch": 2.585692995529061, + "grad_norm": 2.7407946586608887, + "learning_rate": 0.0002, + "loss": 2.1574, + "step": 34700 + }, + { + "epoch": 2.5864381520119224, + "grad_norm": 2.9318466186523438, + "learning_rate": 0.0002, + "loss": 2.6112, + "step": 34710 + }, + { + "epoch": 2.587183308494784, + "grad_norm": 2.139223337173462, + "learning_rate": 0.0002, + "loss": 2.4195, + "step": 34720 + }, + { + "epoch": 2.5879284649776455, + "grad_norm": 2.886835813522339, + "learning_rate": 0.0002, + "loss": 2.4984, + "step": 34730 + }, + { + "epoch": 2.5886736214605066, + "grad_norm": 2.40415358543396, + "learning_rate": 0.0002, + "loss": 2.5024, + "step": 34740 + }, + { + "epoch": 2.589418777943368, + "grad_norm": 2.612410306930542, + "learning_rate": 0.0002, + "loss": 2.4832, + "step": 34750 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 2.2832210063934326, + "learning_rate": 0.0002, + "loss": 2.4649, + "step": 34760 + }, + { + "epoch": 2.590909090909091, + "grad_norm": 2.763620615005493, + "learning_rate": 0.0002, + "loss": 2.3836, + "step": 34770 + }, + { + "epoch": 2.5916542473919524, + "grad_norm": 2.6266095638275146, + "learning_rate": 0.0002, + "loss": 2.4563, + "step": 34780 + }, + { + "epoch": 2.592399403874814, + "grad_norm": 2.604623317718506, + "learning_rate": 0.0002, + "loss": 2.5474, + "step": 34790 + }, + { + "epoch": 2.593144560357675, + "grad_norm": 2.5370583534240723, + "learning_rate": 0.0002, + "loss": 2.4926, + "step": 34800 + }, + { + "epoch": 2.5938897168405366, + "grad_norm": 2.7688779830932617, + "learning_rate": 0.0002, + "loss": 2.5277, + "step": 34810 + }, + { + "epoch": 2.5946348733233977, + "grad_norm": 1.6677604913711548, + "learning_rate": 0.0002, + "loss": 2.4469, + "step": 34820 + }, + { + "epoch": 2.5953800298062593, + "grad_norm": 2.6081464290618896, + "learning_rate": 0.0002, + "loss": 2.6667, + "step": 34830 + }, + { + "epoch": 2.596125186289121, + "grad_norm": 2.504087209701538, + "learning_rate": 0.0002, + "loss": 2.4811, + "step": 34840 + }, + { + "epoch": 2.5968703427719824, + "grad_norm": 2.514352798461914, + "learning_rate": 0.0002, + "loss": 2.4831, + "step": 34850 + }, + { + "epoch": 2.5976154992548435, + "grad_norm": 2.6763663291931152, + "learning_rate": 0.0002, + "loss": 2.4431, + "step": 34860 + }, + { + "epoch": 2.598360655737705, + "grad_norm": 2.6285643577575684, + "learning_rate": 0.0002, + "loss": 2.6107, + "step": 34870 + }, + { + "epoch": 2.599105812220566, + "grad_norm": 2.520859956741333, + "learning_rate": 0.0002, + "loss": 2.5125, + "step": 34880 + }, + { + "epoch": 2.5998509687034277, + "grad_norm": 2.3472795486450195, + "learning_rate": 0.0002, + "loss": 2.308, + "step": 34890 + }, + { + "epoch": 2.6005961251862892, + "grad_norm": 2.735682249069214, + "learning_rate": 0.0002, + "loss": 2.5909, + "step": 34900 + }, + { + "epoch": 2.601341281669151, + "grad_norm": 3.270073652267456, + "learning_rate": 0.0002, + "loss": 2.6196, + "step": 34910 + }, + { + "epoch": 2.602086438152012, + "grad_norm": 2.6279194355010986, + "learning_rate": 0.0002, + "loss": 2.519, + "step": 34920 + }, + { + "epoch": 2.6028315946348735, + "grad_norm": 2.125945568084717, + "learning_rate": 0.0002, + "loss": 2.4861, + "step": 34930 + }, + { + "epoch": 2.6035767511177346, + "grad_norm": 2.817042589187622, + "learning_rate": 0.0002, + "loss": 2.4631, + "step": 34940 + }, + { + "epoch": 2.604321907600596, + "grad_norm": 2.75313138961792, + "learning_rate": 0.0002, + "loss": 2.304, + "step": 34950 + }, + { + "epoch": 2.6050670640834577, + "grad_norm": 2.5719428062438965, + "learning_rate": 0.0002, + "loss": 2.4361, + "step": 34960 + }, + { + "epoch": 2.605812220566319, + "grad_norm": 2.4278576374053955, + "learning_rate": 0.0002, + "loss": 2.5518, + "step": 34970 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 2.707906484603882, + "learning_rate": 0.0002, + "loss": 2.5591, + "step": 34980 + }, + { + "epoch": 2.607302533532042, + "grad_norm": 3.9694321155548096, + "learning_rate": 0.0002, + "loss": 2.5416, + "step": 34990 + }, + { + "epoch": 2.608047690014903, + "grad_norm": 2.7127926349639893, + "learning_rate": 0.0002, + "loss": 2.3842, + "step": 35000 + }, + { + "epoch": 2.6087928464977646, + "grad_norm": 2.6059675216674805, + "learning_rate": 0.0002, + "loss": 2.4429, + "step": 35010 + }, + { + "epoch": 2.609538002980626, + "grad_norm": 2.09088134765625, + "learning_rate": 0.0002, + "loss": 2.3938, + "step": 35020 + }, + { + "epoch": 2.610283159463487, + "grad_norm": 3.14701509475708, + "learning_rate": 0.0002, + "loss": 2.3627, + "step": 35030 + }, + { + "epoch": 2.6110283159463488, + "grad_norm": 2.85632061958313, + "learning_rate": 0.0002, + "loss": 2.6743, + "step": 35040 + }, + { + "epoch": 2.61177347242921, + "grad_norm": 2.6178882122039795, + "learning_rate": 0.0002, + "loss": 2.5535, + "step": 35050 + }, + { + "epoch": 2.6125186289120714, + "grad_norm": 2.384420394897461, + "learning_rate": 0.0002, + "loss": 2.5122, + "step": 35060 + }, + { + "epoch": 2.613263785394933, + "grad_norm": 2.4369149208068848, + "learning_rate": 0.0002, + "loss": 2.6472, + "step": 35070 + }, + { + "epoch": 2.6140089418777945, + "grad_norm": 2.6348259449005127, + "learning_rate": 0.0002, + "loss": 2.4447, + "step": 35080 + }, + { + "epoch": 2.6147540983606556, + "grad_norm": 2.780752420425415, + "learning_rate": 0.0002, + "loss": 2.5351, + "step": 35090 + }, + { + "epoch": 2.615499254843517, + "grad_norm": 2.450267791748047, + "learning_rate": 0.0002, + "loss": 2.3382, + "step": 35100 + }, + { + "epoch": 2.6162444113263783, + "grad_norm": 2.720033645629883, + "learning_rate": 0.0002, + "loss": 2.3684, + "step": 35110 + }, + { + "epoch": 2.61698956780924, + "grad_norm": 2.995755910873413, + "learning_rate": 0.0002, + "loss": 2.4707, + "step": 35120 + }, + { + "epoch": 2.6177347242921014, + "grad_norm": 2.7804386615753174, + "learning_rate": 0.0002, + "loss": 2.5992, + "step": 35130 + }, + { + "epoch": 2.618479880774963, + "grad_norm": 2.687592029571533, + "learning_rate": 0.0002, + "loss": 2.6438, + "step": 35140 + }, + { + "epoch": 2.619225037257824, + "grad_norm": 2.72942852973938, + "learning_rate": 0.0002, + "loss": 2.5031, + "step": 35150 + }, + { + "epoch": 2.6199701937406856, + "grad_norm": 2.743436813354492, + "learning_rate": 0.0002, + "loss": 2.5397, + "step": 35160 + }, + { + "epoch": 2.6207153502235467, + "grad_norm": 2.544156789779663, + "learning_rate": 0.0002, + "loss": 2.579, + "step": 35170 + }, + { + "epoch": 2.6214605067064083, + "grad_norm": 2.2848076820373535, + "learning_rate": 0.0002, + "loss": 2.42, + "step": 35180 + }, + { + "epoch": 2.62220566318927, + "grad_norm": 2.5742909908294678, + "learning_rate": 0.0002, + "loss": 2.4444, + "step": 35190 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 2.081449270248413, + "learning_rate": 0.0002, + "loss": 2.4044, + "step": 35200 + }, + { + "epoch": 2.6236959761549925, + "grad_norm": 2.488731622695923, + "learning_rate": 0.0002, + "loss": 2.5191, + "step": 35210 + }, + { + "epoch": 2.624441132637854, + "grad_norm": 2.7537927627563477, + "learning_rate": 0.0002, + "loss": 2.6245, + "step": 35220 + }, + { + "epoch": 2.625186289120715, + "grad_norm": 2.9869134426116943, + "learning_rate": 0.0002, + "loss": 2.4964, + "step": 35230 + }, + { + "epoch": 2.6259314456035767, + "grad_norm": 2.343207836151123, + "learning_rate": 0.0002, + "loss": 2.3746, + "step": 35240 + }, + { + "epoch": 2.6266766020864383, + "grad_norm": 2.5939278602600098, + "learning_rate": 0.0002, + "loss": 2.5991, + "step": 35250 + }, + { + "epoch": 2.6274217585693, + "grad_norm": 2.727090835571289, + "learning_rate": 0.0002, + "loss": 2.2542, + "step": 35260 + }, + { + "epoch": 2.628166915052161, + "grad_norm": 2.4267163276672363, + "learning_rate": 0.0002, + "loss": 2.3105, + "step": 35270 + }, + { + "epoch": 2.6289120715350225, + "grad_norm": 2.565988540649414, + "learning_rate": 0.0002, + "loss": 2.6, + "step": 35280 + }, + { + "epoch": 2.6296572280178836, + "grad_norm": 2.4375131130218506, + "learning_rate": 0.0002, + "loss": 2.5983, + "step": 35290 + }, + { + "epoch": 2.630402384500745, + "grad_norm": 2.661345958709717, + "learning_rate": 0.0002, + "loss": 2.3076, + "step": 35300 + }, + { + "epoch": 2.6311475409836067, + "grad_norm": 2.9548776149749756, + "learning_rate": 0.0002, + "loss": 2.5269, + "step": 35310 + }, + { + "epoch": 2.631892697466468, + "grad_norm": 2.780071496963501, + "learning_rate": 0.0002, + "loss": 2.5164, + "step": 35320 + }, + { + "epoch": 2.6326378539493294, + "grad_norm": 3.2674901485443115, + "learning_rate": 0.0002, + "loss": 2.3561, + "step": 35330 + }, + { + "epoch": 2.633383010432191, + "grad_norm": 2.833890438079834, + "learning_rate": 0.0002, + "loss": 2.2951, + "step": 35340 + }, + { + "epoch": 2.634128166915052, + "grad_norm": 2.463898181915283, + "learning_rate": 0.0002, + "loss": 2.5373, + "step": 35350 + }, + { + "epoch": 2.6348733233979136, + "grad_norm": 2.6374974250793457, + "learning_rate": 0.0002, + "loss": 2.5007, + "step": 35360 + }, + { + "epoch": 2.635618479880775, + "grad_norm": 2.711229085922241, + "learning_rate": 0.0002, + "loss": 2.4471, + "step": 35370 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 2.583221435546875, + "learning_rate": 0.0002, + "loss": 2.5363, + "step": 35380 + }, + { + "epoch": 2.637108792846498, + "grad_norm": 2.668238878250122, + "learning_rate": 0.0002, + "loss": 2.5197, + "step": 35390 + }, + { + "epoch": 2.637853949329359, + "grad_norm": 2.6465303897857666, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 35400 + }, + { + "epoch": 2.6385991058122205, + "grad_norm": 2.5099635124206543, + "learning_rate": 0.0002, + "loss": 2.547, + "step": 35410 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 2.978407621383667, + "learning_rate": 0.0002, + "loss": 2.563, + "step": 35420 + }, + { + "epoch": 2.6400894187779436, + "grad_norm": 2.6208372116088867, + "learning_rate": 0.0002, + "loss": 2.6188, + "step": 35430 + }, + { + "epoch": 2.6408345752608047, + "grad_norm": 3.47013258934021, + "learning_rate": 0.0002, + "loss": 2.5536, + "step": 35440 + }, + { + "epoch": 2.6415797317436662, + "grad_norm": 2.438246726989746, + "learning_rate": 0.0002, + "loss": 2.385, + "step": 35450 + }, + { + "epoch": 2.6423248882265273, + "grad_norm": 2.5819106101989746, + "learning_rate": 0.0002, + "loss": 2.5517, + "step": 35460 + }, + { + "epoch": 2.643070044709389, + "grad_norm": 2.8637759685516357, + "learning_rate": 0.0002, + "loss": 2.48, + "step": 35470 + }, + { + "epoch": 2.6438152011922504, + "grad_norm": 2.3931562900543213, + "learning_rate": 0.0002, + "loss": 2.3955, + "step": 35480 + }, + { + "epoch": 2.644560357675112, + "grad_norm": 3.022334575653076, + "learning_rate": 0.0002, + "loss": 2.3759, + "step": 35490 + }, + { + "epoch": 2.645305514157973, + "grad_norm": 2.471989154815674, + "learning_rate": 0.0002, + "loss": 2.4271, + "step": 35500 + }, + { + "epoch": 2.6460506706408347, + "grad_norm": 2.57979679107666, + "learning_rate": 0.0002, + "loss": 2.5865, + "step": 35510 + }, + { + "epoch": 2.6467958271236958, + "grad_norm": 2.5882067680358887, + "learning_rate": 0.0002, + "loss": 2.5651, + "step": 35520 + }, + { + "epoch": 2.6475409836065573, + "grad_norm": 2.924689531326294, + "learning_rate": 0.0002, + "loss": 2.5692, + "step": 35530 + }, + { + "epoch": 2.648286140089419, + "grad_norm": 2.678907632827759, + "learning_rate": 0.0002, + "loss": 2.3231, + "step": 35540 + }, + { + "epoch": 2.6490312965722804, + "grad_norm": 2.9073657989501953, + "learning_rate": 0.0002, + "loss": 2.4733, + "step": 35550 + }, + { + "epoch": 2.6497764530551415, + "grad_norm": 2.6203291416168213, + "learning_rate": 0.0002, + "loss": 2.4252, + "step": 35560 + }, + { + "epoch": 2.650521609538003, + "grad_norm": 2.8620171546936035, + "learning_rate": 0.0002, + "loss": 2.5345, + "step": 35570 + }, + { + "epoch": 2.651266766020864, + "grad_norm": 3.0765066146850586, + "learning_rate": 0.0002, + "loss": 2.5476, + "step": 35580 + }, + { + "epoch": 2.6520119225037257, + "grad_norm": 2.6432547569274902, + "learning_rate": 0.0002, + "loss": 2.6744, + "step": 35590 + }, + { + "epoch": 2.6527570789865873, + "grad_norm": 2.4183008670806885, + "learning_rate": 0.0002, + "loss": 2.4373, + "step": 35600 + }, + { + "epoch": 2.653502235469449, + "grad_norm": 2.6401169300079346, + "learning_rate": 0.0002, + "loss": 2.6115, + "step": 35610 + }, + { + "epoch": 2.65424739195231, + "grad_norm": 2.336107015609741, + "learning_rate": 0.0002, + "loss": 2.5014, + "step": 35620 + }, + { + "epoch": 2.6549925484351715, + "grad_norm": 2.7248358726501465, + "learning_rate": 0.0002, + "loss": 2.3869, + "step": 35630 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 2.5932023525238037, + "learning_rate": 0.0002, + "loss": 2.6997, + "step": 35640 + }, + { + "epoch": 2.656482861400894, + "grad_norm": 2.6041550636291504, + "learning_rate": 0.0002, + "loss": 2.4897, + "step": 35650 + }, + { + "epoch": 2.6572280178837557, + "grad_norm": 2.6969892978668213, + "learning_rate": 0.0002, + "loss": 2.564, + "step": 35660 + }, + { + "epoch": 2.657973174366617, + "grad_norm": 2.610666275024414, + "learning_rate": 0.0002, + "loss": 2.6471, + "step": 35670 + }, + { + "epoch": 2.6587183308494784, + "grad_norm": 2.631580114364624, + "learning_rate": 0.0002, + "loss": 2.5464, + "step": 35680 + }, + { + "epoch": 2.65946348733234, + "grad_norm": 2.7000410556793213, + "learning_rate": 0.0002, + "loss": 2.5847, + "step": 35690 + }, + { + "epoch": 2.660208643815201, + "grad_norm": 2.5906224250793457, + "learning_rate": 0.0002, + "loss": 2.5968, + "step": 35700 + }, + { + "epoch": 2.6609538002980626, + "grad_norm": 2.6354689598083496, + "learning_rate": 0.0002, + "loss": 2.6701, + "step": 35710 + }, + { + "epoch": 2.661698956780924, + "grad_norm": 2.598720073699951, + "learning_rate": 0.0002, + "loss": 2.5396, + "step": 35720 + }, + { + "epoch": 2.6624441132637853, + "grad_norm": 2.6841044425964355, + "learning_rate": 0.0002, + "loss": 2.5725, + "step": 35730 + }, + { + "epoch": 2.663189269746647, + "grad_norm": 2.7917165756225586, + "learning_rate": 0.0002, + "loss": 2.5492, + "step": 35740 + }, + { + "epoch": 2.663934426229508, + "grad_norm": 2.852966547012329, + "learning_rate": 0.0002, + "loss": 2.5787, + "step": 35750 + }, + { + "epoch": 2.6646795827123695, + "grad_norm": 2.4196488857269287, + "learning_rate": 0.0002, + "loss": 2.6144, + "step": 35760 + }, + { + "epoch": 2.665424739195231, + "grad_norm": 2.710623264312744, + "learning_rate": 0.0002, + "loss": 2.669, + "step": 35770 + }, + { + "epoch": 2.6661698956780926, + "grad_norm": 2.8702971935272217, + "learning_rate": 0.0002, + "loss": 2.3902, + "step": 35780 + }, + { + "epoch": 2.6669150521609537, + "grad_norm": 2.8416409492492676, + "learning_rate": 0.0002, + "loss": 2.6465, + "step": 35790 + }, + { + "epoch": 2.6676602086438153, + "grad_norm": 2.761198043823242, + "learning_rate": 0.0002, + "loss": 2.3041, + "step": 35800 + }, + { + "epoch": 2.6684053651266764, + "grad_norm": 2.7360453605651855, + "learning_rate": 0.0002, + "loss": 2.3746, + "step": 35810 + }, + { + "epoch": 2.669150521609538, + "grad_norm": 2.5194859504699707, + "learning_rate": 0.0002, + "loss": 2.5007, + "step": 35820 + }, + { + "epoch": 2.6698956780923995, + "grad_norm": 2.272688388824463, + "learning_rate": 0.0002, + "loss": 2.3409, + "step": 35830 + }, + { + "epoch": 2.670640834575261, + "grad_norm": 2.8286778926849365, + "learning_rate": 0.0002, + "loss": 2.4279, + "step": 35840 + }, + { + "epoch": 2.671385991058122, + "grad_norm": 2.7673115730285645, + "learning_rate": 0.0002, + "loss": 2.6094, + "step": 35850 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 2.627052068710327, + "learning_rate": 0.0002, + "loss": 2.4081, + "step": 35860 + }, + { + "epoch": 2.672876304023845, + "grad_norm": 2.9847543239593506, + "learning_rate": 0.0002, + "loss": 2.5277, + "step": 35870 + }, + { + "epoch": 2.6736214605067063, + "grad_norm": 2.7096521854400635, + "learning_rate": 0.0002, + "loss": 2.4191, + "step": 35880 + }, + { + "epoch": 2.674366616989568, + "grad_norm": 2.454817056655884, + "learning_rate": 0.0002, + "loss": 2.5331, + "step": 35890 + }, + { + "epoch": 2.6751117734724295, + "grad_norm": 2.617335557937622, + "learning_rate": 0.0002, + "loss": 2.2716, + "step": 35900 + }, + { + "epoch": 2.6758569299552906, + "grad_norm": 2.6219711303710938, + "learning_rate": 0.0002, + "loss": 2.4334, + "step": 35910 + }, + { + "epoch": 2.676602086438152, + "grad_norm": 2.2985072135925293, + "learning_rate": 0.0002, + "loss": 2.5012, + "step": 35920 + }, + { + "epoch": 2.6773472429210132, + "grad_norm": 2.7559444904327393, + "learning_rate": 0.0002, + "loss": 2.5972, + "step": 35930 + }, + { + "epoch": 2.678092399403875, + "grad_norm": 2.789966583251953, + "learning_rate": 0.0002, + "loss": 2.4791, + "step": 35940 + }, + { + "epoch": 2.6788375558867363, + "grad_norm": 2.9208645820617676, + "learning_rate": 0.0002, + "loss": 2.5503, + "step": 35950 + }, + { + "epoch": 2.679582712369598, + "grad_norm": 2.563593864440918, + "learning_rate": 0.0002, + "loss": 2.6192, + "step": 35960 + }, + { + "epoch": 2.680327868852459, + "grad_norm": 2.91192364692688, + "learning_rate": 0.0002, + "loss": 2.3414, + "step": 35970 + }, + { + "epoch": 2.6810730253353205, + "grad_norm": 2.6559500694274902, + "learning_rate": 0.0002, + "loss": 2.3, + "step": 35980 + }, + { + "epoch": 2.6818181818181817, + "grad_norm": 2.29866886138916, + "learning_rate": 0.0002, + "loss": 2.463, + "step": 35990 + }, + { + "epoch": 2.682563338301043, + "grad_norm": 2.475616216659546, + "learning_rate": 0.0002, + "loss": 2.456, + "step": 36000 + }, + { + "epoch": 2.6833084947839048, + "grad_norm": 2.6832971572875977, + "learning_rate": 0.0002, + "loss": 2.5678, + "step": 36010 + }, + { + "epoch": 2.684053651266766, + "grad_norm": 2.717376708984375, + "learning_rate": 0.0002, + "loss": 2.5248, + "step": 36020 + }, + { + "epoch": 2.6847988077496274, + "grad_norm": 3.192349910736084, + "learning_rate": 0.0002, + "loss": 2.3875, + "step": 36030 + }, + { + "epoch": 2.685543964232489, + "grad_norm": 2.5880353450775146, + "learning_rate": 0.0002, + "loss": 2.5578, + "step": 36040 + }, + { + "epoch": 2.68628912071535, + "grad_norm": 2.367215394973755, + "learning_rate": 0.0002, + "loss": 2.553, + "step": 36050 + }, + { + "epoch": 2.6870342771982116, + "grad_norm": 2.3446109294891357, + "learning_rate": 0.0002, + "loss": 2.5302, + "step": 36060 + }, + { + "epoch": 2.687779433681073, + "grad_norm": 2.9561381340026855, + "learning_rate": 0.0002, + "loss": 2.7431, + "step": 36070 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 2.4364163875579834, + "learning_rate": 0.0002, + "loss": 2.4944, + "step": 36080 + }, + { + "epoch": 2.689269746646796, + "grad_norm": 2.623122215270996, + "learning_rate": 0.0002, + "loss": 2.5464, + "step": 36090 + }, + { + "epoch": 2.690014903129657, + "grad_norm": 2.6617910861968994, + "learning_rate": 0.0002, + "loss": 2.5365, + "step": 36100 + }, + { + "epoch": 2.6907600596125185, + "grad_norm": 2.458702564239502, + "learning_rate": 0.0002, + "loss": 2.525, + "step": 36110 + }, + { + "epoch": 2.69150521609538, + "grad_norm": 2.783048629760742, + "learning_rate": 0.0002, + "loss": 2.4975, + "step": 36120 + }, + { + "epoch": 2.6922503725782416, + "grad_norm": 3.2591092586517334, + "learning_rate": 0.0002, + "loss": 2.5694, + "step": 36130 + }, + { + "epoch": 2.6929955290611027, + "grad_norm": 2.77760910987854, + "learning_rate": 0.0002, + "loss": 2.4527, + "step": 36140 + }, + { + "epoch": 2.6937406855439643, + "grad_norm": 2.6666505336761475, + "learning_rate": 0.0002, + "loss": 2.5507, + "step": 36150 + }, + { + "epoch": 2.6944858420268254, + "grad_norm": 2.61444354057312, + "learning_rate": 0.0002, + "loss": 2.4439, + "step": 36160 + }, + { + "epoch": 2.695230998509687, + "grad_norm": 2.2256479263305664, + "learning_rate": 0.0002, + "loss": 2.3892, + "step": 36170 + }, + { + "epoch": 2.6959761549925485, + "grad_norm": 2.4084644317626953, + "learning_rate": 0.0002, + "loss": 2.5677, + "step": 36180 + }, + { + "epoch": 2.69672131147541, + "grad_norm": 3.0166308879852295, + "learning_rate": 0.0002, + "loss": 2.4645, + "step": 36190 + }, + { + "epoch": 2.697466467958271, + "grad_norm": 2.671741008758545, + "learning_rate": 0.0002, + "loss": 2.6156, + "step": 36200 + }, + { + "epoch": 2.6982116244411327, + "grad_norm": 2.723830223083496, + "learning_rate": 0.0002, + "loss": 2.4664, + "step": 36210 + }, + { + "epoch": 2.698956780923994, + "grad_norm": 2.855217695236206, + "learning_rate": 0.0002, + "loss": 2.481, + "step": 36220 + }, + { + "epoch": 2.6997019374068554, + "grad_norm": 2.242652416229248, + "learning_rate": 0.0002, + "loss": 2.3911, + "step": 36230 + }, + { + "epoch": 2.700447093889717, + "grad_norm": 2.4699301719665527, + "learning_rate": 0.0002, + "loss": 2.5208, + "step": 36240 + }, + { + "epoch": 2.7011922503725785, + "grad_norm": 2.6784889698028564, + "learning_rate": 0.0002, + "loss": 2.2951, + "step": 36250 + }, + { + "epoch": 2.7019374068554396, + "grad_norm": 2.4282853603363037, + "learning_rate": 0.0002, + "loss": 2.6418, + "step": 36260 + }, + { + "epoch": 2.702682563338301, + "grad_norm": 2.9294357299804688, + "learning_rate": 0.0002, + "loss": 2.6716, + "step": 36270 + }, + { + "epoch": 2.7034277198211623, + "grad_norm": 2.729619264602661, + "learning_rate": 0.0002, + "loss": 2.4832, + "step": 36280 + }, + { + "epoch": 2.704172876304024, + "grad_norm": 2.574918031692505, + "learning_rate": 0.0002, + "loss": 2.4267, + "step": 36290 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 2.5243308544158936, + "learning_rate": 0.0002, + "loss": 2.4349, + "step": 36300 + }, + { + "epoch": 2.705663189269747, + "grad_norm": 2.761279821395874, + "learning_rate": 0.0002, + "loss": 2.5064, + "step": 36310 + }, + { + "epoch": 2.706408345752608, + "grad_norm": 2.487800121307373, + "learning_rate": 0.0002, + "loss": 2.481, + "step": 36320 + }, + { + "epoch": 2.7071535022354696, + "grad_norm": 2.707909107208252, + "learning_rate": 0.0002, + "loss": 2.5022, + "step": 36330 + }, + { + "epoch": 2.7078986587183307, + "grad_norm": 2.6682002544403076, + "learning_rate": 0.0002, + "loss": 2.5912, + "step": 36340 + }, + { + "epoch": 2.7086438152011922, + "grad_norm": 2.786775588989258, + "learning_rate": 0.0002, + "loss": 2.4986, + "step": 36350 + }, + { + "epoch": 2.709388971684054, + "grad_norm": 2.84505295753479, + "learning_rate": 0.0002, + "loss": 2.4681, + "step": 36360 + }, + { + "epoch": 2.710134128166915, + "grad_norm": 2.364448308944702, + "learning_rate": 0.0002, + "loss": 2.5899, + "step": 36370 + }, + { + "epoch": 2.7108792846497765, + "grad_norm": 2.7492170333862305, + "learning_rate": 0.0002, + "loss": 2.3887, + "step": 36380 + }, + { + "epoch": 2.711624441132638, + "grad_norm": 2.7889928817749023, + "learning_rate": 0.0002, + "loss": 2.5625, + "step": 36390 + }, + { + "epoch": 2.712369597615499, + "grad_norm": 2.5459094047546387, + "learning_rate": 0.0002, + "loss": 2.6412, + "step": 36400 + }, + { + "epoch": 2.7131147540983607, + "grad_norm": 2.648048162460327, + "learning_rate": 0.0002, + "loss": 2.5925, + "step": 36410 + }, + { + "epoch": 2.7138599105812222, + "grad_norm": 2.7833564281463623, + "learning_rate": 0.0002, + "loss": 2.3773, + "step": 36420 + }, + { + "epoch": 2.7146050670640833, + "grad_norm": 2.9601666927337646, + "learning_rate": 0.0002, + "loss": 2.7538, + "step": 36430 + }, + { + "epoch": 2.715350223546945, + "grad_norm": 3.0814125537872314, + "learning_rate": 0.0002, + "loss": 2.6718, + "step": 36440 + }, + { + "epoch": 2.716095380029806, + "grad_norm": 2.4058563709259033, + "learning_rate": 0.0002, + "loss": 2.6188, + "step": 36450 + }, + { + "epoch": 2.7168405365126675, + "grad_norm": 2.7348439693450928, + "learning_rate": 0.0002, + "loss": 2.6306, + "step": 36460 + }, + { + "epoch": 2.717585692995529, + "grad_norm": 2.448627471923828, + "learning_rate": 0.0002, + "loss": 2.5885, + "step": 36470 + }, + { + "epoch": 2.7183308494783907, + "grad_norm": 2.4670491218566895, + "learning_rate": 0.0002, + "loss": 2.44, + "step": 36480 + }, + { + "epoch": 2.7190760059612518, + "grad_norm": 2.3683083057403564, + "learning_rate": 0.0002, + "loss": 2.5304, + "step": 36490 + }, + { + "epoch": 2.7198211624441133, + "grad_norm": 2.6678662300109863, + "learning_rate": 0.0002, + "loss": 2.6104, + "step": 36500 + }, + { + "epoch": 2.7205663189269744, + "grad_norm": 2.6376142501831055, + "learning_rate": 0.0002, + "loss": 2.5446, + "step": 36510 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 2.51070237159729, + "learning_rate": 0.0002, + "loss": 2.5938, + "step": 36520 + }, + { + "epoch": 2.7220566318926975, + "grad_norm": 2.3976802825927734, + "learning_rate": 0.0002, + "loss": 2.3664, + "step": 36530 + }, + { + "epoch": 2.722801788375559, + "grad_norm": 2.557739496231079, + "learning_rate": 0.0002, + "loss": 2.6302, + "step": 36540 + }, + { + "epoch": 2.72354694485842, + "grad_norm": 2.489729404449463, + "learning_rate": 0.0002, + "loss": 2.325, + "step": 36550 + }, + { + "epoch": 2.7242921013412817, + "grad_norm": 2.4591825008392334, + "learning_rate": 0.0002, + "loss": 2.4624, + "step": 36560 + }, + { + "epoch": 2.725037257824143, + "grad_norm": 2.7376599311828613, + "learning_rate": 0.0002, + "loss": 2.3916, + "step": 36570 + }, + { + "epoch": 2.7257824143070044, + "grad_norm": 2.656623125076294, + "learning_rate": 0.0002, + "loss": 2.48, + "step": 36580 + }, + { + "epoch": 2.726527570789866, + "grad_norm": 2.71795392036438, + "learning_rate": 0.0002, + "loss": 2.5828, + "step": 36590 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 2.759921073913574, + "learning_rate": 0.0002, + "loss": 2.5211, + "step": 36600 + }, + { + "epoch": 2.7280178837555886, + "grad_norm": 2.558744430541992, + "learning_rate": 0.0002, + "loss": 2.5161, + "step": 36610 + }, + { + "epoch": 2.72876304023845, + "grad_norm": 2.899496078491211, + "learning_rate": 0.0002, + "loss": 2.3968, + "step": 36620 + }, + { + "epoch": 2.7295081967213113, + "grad_norm": 2.5663375854492188, + "learning_rate": 0.0002, + "loss": 2.4568, + "step": 36630 + }, + { + "epoch": 2.730253353204173, + "grad_norm": 2.4600677490234375, + "learning_rate": 0.0002, + "loss": 2.4366, + "step": 36640 + }, + { + "epoch": 2.7309985096870344, + "grad_norm": 2.5599944591522217, + "learning_rate": 0.0002, + "loss": 2.3192, + "step": 36650 + }, + { + "epoch": 2.731743666169896, + "grad_norm": 2.5939974784851074, + "learning_rate": 0.0002, + "loss": 2.6118, + "step": 36660 + }, + { + "epoch": 2.732488822652757, + "grad_norm": 2.5912539958953857, + "learning_rate": 0.0002, + "loss": 2.5891, + "step": 36670 + }, + { + "epoch": 2.7332339791356186, + "grad_norm": 3.277555465698242, + "learning_rate": 0.0002, + "loss": 2.3884, + "step": 36680 + }, + { + "epoch": 2.7339791356184797, + "grad_norm": 2.8053581714630127, + "learning_rate": 0.0002, + "loss": 2.4551, + "step": 36690 + }, + { + "epoch": 2.7347242921013413, + "grad_norm": 2.5936853885650635, + "learning_rate": 0.0002, + "loss": 2.513, + "step": 36700 + }, + { + "epoch": 2.735469448584203, + "grad_norm": 2.418405294418335, + "learning_rate": 0.0002, + "loss": 2.2918, + "step": 36710 + }, + { + "epoch": 2.736214605067064, + "grad_norm": 2.522139072418213, + "learning_rate": 0.0002, + "loss": 2.4492, + "step": 36720 + }, + { + "epoch": 2.7369597615499255, + "grad_norm": 2.8760268688201904, + "learning_rate": 0.0002, + "loss": 2.4819, + "step": 36730 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 2.54500150680542, + "learning_rate": 0.0002, + "loss": 2.4401, + "step": 36740 + }, + { + "epoch": 2.738450074515648, + "grad_norm": 3.1572742462158203, + "learning_rate": 0.0002, + "loss": 2.3991, + "step": 36750 + }, + { + "epoch": 2.7391952309985097, + "grad_norm": 2.786902666091919, + "learning_rate": 0.0002, + "loss": 2.5974, + "step": 36760 + }, + { + "epoch": 2.7399403874813713, + "grad_norm": 2.5400989055633545, + "learning_rate": 0.0002, + "loss": 2.5479, + "step": 36770 + }, + { + "epoch": 2.7406855439642324, + "grad_norm": 2.4169130325317383, + "learning_rate": 0.0002, + "loss": 2.4016, + "step": 36780 + }, + { + "epoch": 2.741430700447094, + "grad_norm": 2.450286865234375, + "learning_rate": 0.0002, + "loss": 2.5699, + "step": 36790 + }, + { + "epoch": 2.742175856929955, + "grad_norm": 2.757652759552002, + "learning_rate": 0.0002, + "loss": 2.5433, + "step": 36800 + }, + { + "epoch": 2.7429210134128166, + "grad_norm": 2.3215696811676025, + "learning_rate": 0.0002, + "loss": 2.4423, + "step": 36810 + }, + { + "epoch": 2.743666169895678, + "grad_norm": 2.422499895095825, + "learning_rate": 0.0002, + "loss": 2.2945, + "step": 36820 + }, + { + "epoch": 2.7444113263785397, + "grad_norm": 2.4259049892425537, + "learning_rate": 0.0002, + "loss": 2.5629, + "step": 36830 + }, + { + "epoch": 2.745156482861401, + "grad_norm": 2.6686112880706787, + "learning_rate": 0.0002, + "loss": 2.4427, + "step": 36840 + }, + { + "epoch": 2.7459016393442623, + "grad_norm": 2.5597991943359375, + "learning_rate": 0.0002, + "loss": 2.4121, + "step": 36850 + }, + { + "epoch": 2.7466467958271235, + "grad_norm": 2.713035821914673, + "learning_rate": 0.0002, + "loss": 2.5649, + "step": 36860 + }, + { + "epoch": 2.747391952309985, + "grad_norm": 2.730883836746216, + "learning_rate": 0.0002, + "loss": 2.3433, + "step": 36870 + }, + { + "epoch": 2.7481371087928466, + "grad_norm": 2.833503007888794, + "learning_rate": 0.0002, + "loss": 2.4411, + "step": 36880 + }, + { + "epoch": 2.748882265275708, + "grad_norm": 2.7715904712677, + "learning_rate": 0.0002, + "loss": 2.4082, + "step": 36890 + }, + { + "epoch": 2.7496274217585692, + "grad_norm": 2.716165065765381, + "learning_rate": 0.0002, + "loss": 2.3817, + "step": 36900 + }, + { + "epoch": 2.7503725782414308, + "grad_norm": 2.7706308364868164, + "learning_rate": 0.0002, + "loss": 2.5381, + "step": 36910 + }, + { + "epoch": 2.751117734724292, + "grad_norm": 2.569395065307617, + "learning_rate": 0.0002, + "loss": 2.4863, + "step": 36920 + }, + { + "epoch": 2.7518628912071534, + "grad_norm": 2.589057207107544, + "learning_rate": 0.0002, + "loss": 2.5399, + "step": 36930 + }, + { + "epoch": 2.752608047690015, + "grad_norm": 2.632874011993408, + "learning_rate": 0.0002, + "loss": 2.4628, + "step": 36940 + }, + { + "epoch": 2.7533532041728765, + "grad_norm": 3.023503541946411, + "learning_rate": 0.0002, + "loss": 2.5736, + "step": 36950 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 2.538649559020996, + "learning_rate": 0.0002, + "loss": 2.4441, + "step": 36960 + }, + { + "epoch": 2.754843517138599, + "grad_norm": 2.65378737449646, + "learning_rate": 0.0002, + "loss": 2.5479, + "step": 36970 + }, + { + "epoch": 2.7555886736214603, + "grad_norm": 2.514577865600586, + "learning_rate": 0.0002, + "loss": 2.3853, + "step": 36980 + }, + { + "epoch": 2.756333830104322, + "grad_norm": 2.3040006160736084, + "learning_rate": 0.0002, + "loss": 2.5896, + "step": 36990 + }, + { + "epoch": 2.7570789865871834, + "grad_norm": 2.7404825687408447, + "learning_rate": 0.0002, + "loss": 2.7121, + "step": 37000 + }, + { + "epoch": 2.757824143070045, + "grad_norm": 2.6444921493530273, + "learning_rate": 0.0002, + "loss": 2.4906, + "step": 37010 + }, + { + "epoch": 2.758569299552906, + "grad_norm": 2.368401288986206, + "learning_rate": 0.0002, + "loss": 2.413, + "step": 37020 + }, + { + "epoch": 2.7593144560357676, + "grad_norm": 2.6097865104675293, + "learning_rate": 0.0002, + "loss": 2.567, + "step": 37030 + }, + { + "epoch": 2.7600596125186287, + "grad_norm": 2.694190740585327, + "learning_rate": 0.0002, + "loss": 2.57, + "step": 37040 + }, + { + "epoch": 2.7608047690014903, + "grad_norm": 2.5604560375213623, + "learning_rate": 0.0002, + "loss": 2.4598, + "step": 37050 + }, + { + "epoch": 2.761549925484352, + "grad_norm": 2.912165641784668, + "learning_rate": 0.0002, + "loss": 2.6524, + "step": 37060 + }, + { + "epoch": 2.762295081967213, + "grad_norm": 3.2894179821014404, + "learning_rate": 0.0002, + "loss": 2.5217, + "step": 37070 + }, + { + "epoch": 2.7630402384500745, + "grad_norm": 2.828571319580078, + "learning_rate": 0.0002, + "loss": 2.5196, + "step": 37080 + }, + { + "epoch": 2.763785394932936, + "grad_norm": 2.4312257766723633, + "learning_rate": 0.0002, + "loss": 2.3419, + "step": 37090 + }, + { + "epoch": 2.764530551415797, + "grad_norm": 2.4520037174224854, + "learning_rate": 0.0002, + "loss": 2.4494, + "step": 37100 + }, + { + "epoch": 2.7652757078986587, + "grad_norm": 2.708139181137085, + "learning_rate": 0.0002, + "loss": 2.5631, + "step": 37110 + }, + { + "epoch": 2.7660208643815203, + "grad_norm": 2.8655529022216797, + "learning_rate": 0.0002, + "loss": 2.5352, + "step": 37120 + }, + { + "epoch": 2.7667660208643814, + "grad_norm": 2.484017848968506, + "learning_rate": 0.0002, + "loss": 2.4615, + "step": 37130 + }, + { + "epoch": 2.767511177347243, + "grad_norm": 2.4956047534942627, + "learning_rate": 0.0002, + "loss": 2.5465, + "step": 37140 + }, + { + "epoch": 2.768256333830104, + "grad_norm": 2.5548791885375977, + "learning_rate": 0.0002, + "loss": 2.4609, + "step": 37150 + }, + { + "epoch": 2.7690014903129656, + "grad_norm": 2.561833381652832, + "learning_rate": 0.0002, + "loss": 2.5994, + "step": 37160 + }, + { + "epoch": 2.769746646795827, + "grad_norm": 2.9793152809143066, + "learning_rate": 0.0002, + "loss": 2.432, + "step": 37170 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 2.6743390560150146, + "learning_rate": 0.0002, + "loss": 2.4788, + "step": 37180 + }, + { + "epoch": 2.77123695976155, + "grad_norm": 2.461435317993164, + "learning_rate": 0.0002, + "loss": 2.5217, + "step": 37190 + }, + { + "epoch": 2.7719821162444114, + "grad_norm": 2.5211734771728516, + "learning_rate": 0.0002, + "loss": 2.613, + "step": 37200 + }, + { + "epoch": 2.7727272727272725, + "grad_norm": 2.824679136276245, + "learning_rate": 0.0002, + "loss": 2.6382, + "step": 37210 + }, + { + "epoch": 2.773472429210134, + "grad_norm": 2.458425521850586, + "learning_rate": 0.0002, + "loss": 2.4033, + "step": 37220 + }, + { + "epoch": 2.7742175856929956, + "grad_norm": 2.7605643272399902, + "learning_rate": 0.0002, + "loss": 2.4153, + "step": 37230 + }, + { + "epoch": 2.774962742175857, + "grad_norm": 2.3235936164855957, + "learning_rate": 0.0002, + "loss": 2.5107, + "step": 37240 + }, + { + "epoch": 2.7757078986587183, + "grad_norm": 2.2972285747528076, + "learning_rate": 0.0002, + "loss": 2.5371, + "step": 37250 + }, + { + "epoch": 2.77645305514158, + "grad_norm": 2.7939791679382324, + "learning_rate": 0.0002, + "loss": 2.3748, + "step": 37260 + }, + { + "epoch": 2.777198211624441, + "grad_norm": 2.5067827701568604, + "learning_rate": 0.0002, + "loss": 2.3598, + "step": 37270 + }, + { + "epoch": 2.7779433681073025, + "grad_norm": 2.909909248352051, + "learning_rate": 0.0002, + "loss": 2.5853, + "step": 37280 + }, + { + "epoch": 2.778688524590164, + "grad_norm": 2.569460868835449, + "learning_rate": 0.0002, + "loss": 2.5193, + "step": 37290 + }, + { + "epoch": 2.7794336810730256, + "grad_norm": 3.214890956878662, + "learning_rate": 0.0002, + "loss": 2.4651, + "step": 37300 + }, + { + "epoch": 2.7801788375558867, + "grad_norm": 2.867858648300171, + "learning_rate": 0.0002, + "loss": 2.6508, + "step": 37310 + }, + { + "epoch": 2.7809239940387482, + "grad_norm": 2.7677152156829834, + "learning_rate": 0.0002, + "loss": 2.5517, + "step": 37320 + }, + { + "epoch": 2.7816691505216093, + "grad_norm": 2.633157253265381, + "learning_rate": 0.0002, + "loss": 2.4063, + "step": 37330 + }, + { + "epoch": 2.782414307004471, + "grad_norm": 2.4681646823883057, + "learning_rate": 0.0002, + "loss": 2.4698, + "step": 37340 + }, + { + "epoch": 2.7831594634873325, + "grad_norm": 2.595750570297241, + "learning_rate": 0.0002, + "loss": 2.398, + "step": 37350 + }, + { + "epoch": 2.783904619970194, + "grad_norm": 2.4892077445983887, + "learning_rate": 0.0002, + "loss": 2.5829, + "step": 37360 + }, + { + "epoch": 2.784649776453055, + "grad_norm": 2.4976730346679688, + "learning_rate": 0.0002, + "loss": 2.3726, + "step": 37370 + }, + { + "epoch": 2.7853949329359167, + "grad_norm": 2.330193519592285, + "learning_rate": 0.0002, + "loss": 2.4521, + "step": 37380 + }, + { + "epoch": 2.7861400894187778, + "grad_norm": 2.6689720153808594, + "learning_rate": 0.0002, + "loss": 2.414, + "step": 37390 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 2.6920714378356934, + "learning_rate": 0.0002, + "loss": 2.5038, + "step": 37400 + }, + { + "epoch": 2.787630402384501, + "grad_norm": 2.4694406986236572, + "learning_rate": 0.0002, + "loss": 2.3686, + "step": 37410 + }, + { + "epoch": 2.788375558867362, + "grad_norm": 2.5774457454681396, + "learning_rate": 0.0002, + "loss": 2.3996, + "step": 37420 + }, + { + "epoch": 2.7891207153502235, + "grad_norm": 2.640498638153076, + "learning_rate": 0.0002, + "loss": 2.4829, + "step": 37430 + }, + { + "epoch": 2.789865871833085, + "grad_norm": 2.770134925842285, + "learning_rate": 0.0002, + "loss": 2.5329, + "step": 37440 + }, + { + "epoch": 2.790611028315946, + "grad_norm": 2.9241816997528076, + "learning_rate": 0.0002, + "loss": 2.4869, + "step": 37450 + }, + { + "epoch": 2.7913561847988078, + "grad_norm": 2.629659652709961, + "learning_rate": 0.0002, + "loss": 2.4289, + "step": 37460 + }, + { + "epoch": 2.7921013412816693, + "grad_norm": 2.6241800785064697, + "learning_rate": 0.0002, + "loss": 2.5179, + "step": 37470 + }, + { + "epoch": 2.7928464977645304, + "grad_norm": 2.5617072582244873, + "learning_rate": 0.0002, + "loss": 2.5042, + "step": 37480 + }, + { + "epoch": 2.793591654247392, + "grad_norm": 2.659287452697754, + "learning_rate": 0.0002, + "loss": 2.4369, + "step": 37490 + }, + { + "epoch": 2.794336810730253, + "grad_norm": 2.6287639141082764, + "learning_rate": 0.0002, + "loss": 2.4064, + "step": 37500 + }, + { + "epoch": 2.7950819672131146, + "grad_norm": 2.7507472038269043, + "learning_rate": 0.0002, + "loss": 2.4495, + "step": 37510 + }, + { + "epoch": 2.795827123695976, + "grad_norm": 2.509035348892212, + "learning_rate": 0.0002, + "loss": 2.4707, + "step": 37520 + }, + { + "epoch": 2.7965722801788377, + "grad_norm": 2.2699403762817383, + "learning_rate": 0.0002, + "loss": 2.3909, + "step": 37530 + }, + { + "epoch": 2.797317436661699, + "grad_norm": 2.630939245223999, + "learning_rate": 0.0002, + "loss": 2.4802, + "step": 37540 + }, + { + "epoch": 2.7980625931445604, + "grad_norm": 3.045865058898926, + "learning_rate": 0.0002, + "loss": 2.4884, + "step": 37550 + }, + { + "epoch": 2.7988077496274215, + "grad_norm": 2.486050605773926, + "learning_rate": 0.0002, + "loss": 2.4905, + "step": 37560 + }, + { + "epoch": 2.799552906110283, + "grad_norm": 2.318683624267578, + "learning_rate": 0.0002, + "loss": 2.2404, + "step": 37570 + }, + { + "epoch": 2.8002980625931446, + "grad_norm": 2.7278268337249756, + "learning_rate": 0.0002, + "loss": 2.5899, + "step": 37580 + }, + { + "epoch": 2.801043219076006, + "grad_norm": 2.8368096351623535, + "learning_rate": 0.0002, + "loss": 2.5105, + "step": 37590 + }, + { + "epoch": 2.8017883755588673, + "grad_norm": 2.7143986225128174, + "learning_rate": 0.0002, + "loss": 2.4156, + "step": 37600 + }, + { + "epoch": 2.802533532041729, + "grad_norm": 2.6735024452209473, + "learning_rate": 0.0002, + "loss": 2.3569, + "step": 37610 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 2.700565814971924, + "learning_rate": 0.0002, + "loss": 2.485, + "step": 37620 + }, + { + "epoch": 2.8040238450074515, + "grad_norm": 2.5607948303222656, + "learning_rate": 0.0002, + "loss": 2.5495, + "step": 37630 + }, + { + "epoch": 2.804769001490313, + "grad_norm": 2.044367790222168, + "learning_rate": 0.0002, + "loss": 2.452, + "step": 37640 + }, + { + "epoch": 2.8055141579731746, + "grad_norm": 2.569173574447632, + "learning_rate": 0.0002, + "loss": 2.554, + "step": 37650 + }, + { + "epoch": 2.8062593144560357, + "grad_norm": 2.6920878887176514, + "learning_rate": 0.0002, + "loss": 2.5635, + "step": 37660 + }, + { + "epoch": 2.8070044709388973, + "grad_norm": 2.5670292377471924, + "learning_rate": 0.0002, + "loss": 2.4568, + "step": 37670 + }, + { + "epoch": 2.8077496274217584, + "grad_norm": 2.1942081451416016, + "learning_rate": 0.0002, + "loss": 2.4599, + "step": 37680 + }, + { + "epoch": 2.80849478390462, + "grad_norm": 2.5298988819122314, + "learning_rate": 0.0002, + "loss": 2.4536, + "step": 37690 + }, + { + "epoch": 2.8092399403874815, + "grad_norm": 2.82232666015625, + "learning_rate": 0.0002, + "loss": 2.4608, + "step": 37700 + }, + { + "epoch": 2.809985096870343, + "grad_norm": 2.4265050888061523, + "learning_rate": 0.0002, + "loss": 2.4723, + "step": 37710 + }, + { + "epoch": 2.810730253353204, + "grad_norm": 2.705826759338379, + "learning_rate": 0.0002, + "loss": 2.4147, + "step": 37720 + }, + { + "epoch": 2.8114754098360657, + "grad_norm": 2.584636926651001, + "learning_rate": 0.0002, + "loss": 2.5809, + "step": 37730 + }, + { + "epoch": 2.812220566318927, + "grad_norm": 2.8928425312042236, + "learning_rate": 0.0002, + "loss": 2.7184, + "step": 37740 + }, + { + "epoch": 2.8129657228017884, + "grad_norm": 2.567809581756592, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 37750 + }, + { + "epoch": 2.81371087928465, + "grad_norm": 2.8919975757598877, + "learning_rate": 0.0002, + "loss": 2.4801, + "step": 37760 + }, + { + "epoch": 2.814456035767511, + "grad_norm": 2.455564022064209, + "learning_rate": 0.0002, + "loss": 2.5779, + "step": 37770 + }, + { + "epoch": 2.8152011922503726, + "grad_norm": 2.659975528717041, + "learning_rate": 0.0002, + "loss": 2.6608, + "step": 37780 + }, + { + "epoch": 2.815946348733234, + "grad_norm": 2.6775760650634766, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 37790 + }, + { + "epoch": 2.8166915052160952, + "grad_norm": 2.86912202835083, + "learning_rate": 0.0002, + "loss": 2.3338, + "step": 37800 + }, + { + "epoch": 2.817436661698957, + "grad_norm": 2.8781628608703613, + "learning_rate": 0.0002, + "loss": 2.6059, + "step": 37810 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 2.688969135284424, + "learning_rate": 0.0002, + "loss": 2.792, + "step": 37820 + }, + { + "epoch": 2.8189269746646795, + "grad_norm": 2.80010986328125, + "learning_rate": 0.0002, + "loss": 2.3393, + "step": 37830 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 2.4589450359344482, + "learning_rate": 0.0002, + "loss": 2.5836, + "step": 37840 + }, + { + "epoch": 2.820417287630402, + "grad_norm": 2.3914148807525635, + "learning_rate": 0.0002, + "loss": 2.5583, + "step": 37850 + }, + { + "epoch": 2.8211624441132637, + "grad_norm": 2.734759569168091, + "learning_rate": 0.0002, + "loss": 2.624, + "step": 37860 + }, + { + "epoch": 2.821907600596125, + "grad_norm": 2.7147603034973145, + "learning_rate": 0.0002, + "loss": 2.6149, + "step": 37870 + }, + { + "epoch": 2.8226527570789868, + "grad_norm": 2.4781367778778076, + "learning_rate": 0.0002, + "loss": 2.5671, + "step": 37880 + }, + { + "epoch": 2.823397913561848, + "grad_norm": 2.6984505653381348, + "learning_rate": 0.0002, + "loss": 2.5764, + "step": 37890 + }, + { + "epoch": 2.8241430700447094, + "grad_norm": 2.6264562606811523, + "learning_rate": 0.0002, + "loss": 2.272, + "step": 37900 + }, + { + "epoch": 2.8248882265275705, + "grad_norm": 2.978095054626465, + "learning_rate": 0.0002, + "loss": 2.483, + "step": 37910 + }, + { + "epoch": 2.825633383010432, + "grad_norm": 2.6886980533599854, + "learning_rate": 0.0002, + "loss": 2.6149, + "step": 37920 + }, + { + "epoch": 2.8263785394932937, + "grad_norm": 2.677035331726074, + "learning_rate": 0.0002, + "loss": 2.6447, + "step": 37930 + }, + { + "epoch": 2.827123695976155, + "grad_norm": 3.3771278858184814, + "learning_rate": 0.0002, + "loss": 2.7219, + "step": 37940 + }, + { + "epoch": 2.8278688524590163, + "grad_norm": 1.9655869007110596, + "learning_rate": 0.0002, + "loss": 2.4848, + "step": 37950 + }, + { + "epoch": 2.828614008941878, + "grad_norm": 2.6271440982818604, + "learning_rate": 0.0002, + "loss": 2.4456, + "step": 37960 + }, + { + "epoch": 2.829359165424739, + "grad_norm": 2.5585412979125977, + "learning_rate": 0.0002, + "loss": 2.7289, + "step": 37970 + }, + { + "epoch": 2.8301043219076005, + "grad_norm": 2.735304117202759, + "learning_rate": 0.0002, + "loss": 2.4382, + "step": 37980 + }, + { + "epoch": 2.830849478390462, + "grad_norm": 2.609837055206299, + "learning_rate": 0.0002, + "loss": 2.4991, + "step": 37990 + }, + { + "epoch": 2.8315946348733236, + "grad_norm": 2.7447588443756104, + "learning_rate": 0.0002, + "loss": 2.5325, + "step": 38000 + }, + { + "epoch": 2.8323397913561847, + "grad_norm": 2.400519847869873, + "learning_rate": 0.0002, + "loss": 2.6211, + "step": 38010 + }, + { + "epoch": 2.8330849478390463, + "grad_norm": 2.8218953609466553, + "learning_rate": 0.0002, + "loss": 2.6096, + "step": 38020 + }, + { + "epoch": 2.8338301043219074, + "grad_norm": 2.7308266162872314, + "learning_rate": 0.0002, + "loss": 2.557, + "step": 38030 + }, + { + "epoch": 2.834575260804769, + "grad_norm": 2.5944695472717285, + "learning_rate": 0.0002, + "loss": 2.5554, + "step": 38040 + }, + { + "epoch": 2.8353204172876305, + "grad_norm": 2.427563428878784, + "learning_rate": 0.0002, + "loss": 2.3361, + "step": 38050 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 2.4476137161254883, + "learning_rate": 0.0002, + "loss": 2.4115, + "step": 38060 + }, + { + "epoch": 2.836810730253353, + "grad_norm": 3.2641639709472656, + "learning_rate": 0.0002, + "loss": 2.4905, + "step": 38070 + }, + { + "epoch": 2.8375558867362147, + "grad_norm": 2.3455142974853516, + "learning_rate": 0.0002, + "loss": 2.4734, + "step": 38080 + }, + { + "epoch": 2.838301043219076, + "grad_norm": 2.834339141845703, + "learning_rate": 0.0002, + "loss": 2.5312, + "step": 38090 + }, + { + "epoch": 2.8390461997019374, + "grad_norm": 2.8637094497680664, + "learning_rate": 0.0002, + "loss": 2.4174, + "step": 38100 + }, + { + "epoch": 2.839791356184799, + "grad_norm": 3.0524630546569824, + "learning_rate": 0.0002, + "loss": 2.5523, + "step": 38110 + }, + { + "epoch": 2.84053651266766, + "grad_norm": 2.649812936782837, + "learning_rate": 0.0002, + "loss": 2.2958, + "step": 38120 + }, + { + "epoch": 2.8412816691505216, + "grad_norm": 2.645263433456421, + "learning_rate": 0.0002, + "loss": 2.4867, + "step": 38130 + }, + { + "epoch": 2.842026825633383, + "grad_norm": 2.0363569259643555, + "learning_rate": 0.0002, + "loss": 2.3287, + "step": 38140 + }, + { + "epoch": 2.8427719821162443, + "grad_norm": 2.7432138919830322, + "learning_rate": 0.0002, + "loss": 2.6093, + "step": 38150 + }, + { + "epoch": 2.843517138599106, + "grad_norm": 2.721869945526123, + "learning_rate": 0.0002, + "loss": 2.6309, + "step": 38160 + }, + { + "epoch": 2.8442622950819674, + "grad_norm": 2.674832582473755, + "learning_rate": 0.0002, + "loss": 2.3478, + "step": 38170 + }, + { + "epoch": 2.8450074515648285, + "grad_norm": 2.252639055252075, + "learning_rate": 0.0002, + "loss": 2.3417, + "step": 38180 + }, + { + "epoch": 2.84575260804769, + "grad_norm": 2.549644708633423, + "learning_rate": 0.0002, + "loss": 2.3126, + "step": 38190 + }, + { + "epoch": 2.846497764530551, + "grad_norm": 2.77371883392334, + "learning_rate": 0.0002, + "loss": 2.6122, + "step": 38200 + }, + { + "epoch": 2.8472429210134127, + "grad_norm": 2.7712173461914062, + "learning_rate": 0.0002, + "loss": 2.4918, + "step": 38210 + }, + { + "epoch": 2.8479880774962743, + "grad_norm": 2.5276310443878174, + "learning_rate": 0.0002, + "loss": 2.443, + "step": 38220 + }, + { + "epoch": 2.848733233979136, + "grad_norm": 2.3760945796966553, + "learning_rate": 0.0002, + "loss": 2.4374, + "step": 38230 + }, + { + "epoch": 2.849478390461997, + "grad_norm": 2.5871782302856445, + "learning_rate": 0.0002, + "loss": 2.571, + "step": 38240 + }, + { + "epoch": 2.8502235469448585, + "grad_norm": 2.721224069595337, + "learning_rate": 0.0002, + "loss": 2.582, + "step": 38250 + }, + { + "epoch": 2.8509687034277196, + "grad_norm": 2.55556058883667, + "learning_rate": 0.0002, + "loss": 2.3885, + "step": 38260 + }, + { + "epoch": 2.851713859910581, + "grad_norm": 2.8083126544952393, + "learning_rate": 0.0002, + "loss": 2.4455, + "step": 38270 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 2.5174221992492676, + "learning_rate": 0.0002, + "loss": 2.5159, + "step": 38280 + }, + { + "epoch": 2.8532041728763042, + "grad_norm": 2.380772590637207, + "learning_rate": 0.0002, + "loss": 2.6566, + "step": 38290 + }, + { + "epoch": 2.8539493293591653, + "grad_norm": 2.3448495864868164, + "learning_rate": 0.0002, + "loss": 2.4935, + "step": 38300 + }, + { + "epoch": 2.854694485842027, + "grad_norm": 2.500901699066162, + "learning_rate": 0.0002, + "loss": 2.3849, + "step": 38310 + }, + { + "epoch": 2.855439642324888, + "grad_norm": 2.658292293548584, + "learning_rate": 0.0002, + "loss": 2.3852, + "step": 38320 + }, + { + "epoch": 2.8561847988077496, + "grad_norm": 2.5677103996276855, + "learning_rate": 0.0002, + "loss": 2.467, + "step": 38330 + }, + { + "epoch": 2.856929955290611, + "grad_norm": 2.411125898361206, + "learning_rate": 0.0002, + "loss": 2.4616, + "step": 38340 + }, + { + "epoch": 2.8576751117734727, + "grad_norm": 2.651226043701172, + "learning_rate": 0.0002, + "loss": 2.4536, + "step": 38350 + }, + { + "epoch": 2.8584202682563338, + "grad_norm": 2.6440541744232178, + "learning_rate": 0.0002, + "loss": 2.349, + "step": 38360 + }, + { + "epoch": 2.8591654247391953, + "grad_norm": 2.4641504287719727, + "learning_rate": 0.0002, + "loss": 2.5248, + "step": 38370 + }, + { + "epoch": 2.8599105812220564, + "grad_norm": 2.716660499572754, + "learning_rate": 0.0002, + "loss": 2.606, + "step": 38380 + }, + { + "epoch": 2.860655737704918, + "grad_norm": 2.255239963531494, + "learning_rate": 0.0002, + "loss": 2.4059, + "step": 38390 + }, + { + "epoch": 2.8614008941877795, + "grad_norm": 2.145322322845459, + "learning_rate": 0.0002, + "loss": 2.4944, + "step": 38400 + }, + { + "epoch": 2.862146050670641, + "grad_norm": 2.833284378051758, + "learning_rate": 0.0002, + "loss": 2.5779, + "step": 38410 + }, + { + "epoch": 2.862891207153502, + "grad_norm": 2.970714569091797, + "learning_rate": 0.0002, + "loss": 2.4606, + "step": 38420 + }, + { + "epoch": 2.8636363636363638, + "grad_norm": 2.7407591342926025, + "learning_rate": 0.0002, + "loss": 2.5038, + "step": 38430 + }, + { + "epoch": 2.864381520119225, + "grad_norm": 2.734581708908081, + "learning_rate": 0.0002, + "loss": 2.4065, + "step": 38440 + }, + { + "epoch": 2.8651266766020864, + "grad_norm": 2.5742697715759277, + "learning_rate": 0.0002, + "loss": 2.359, + "step": 38450 + }, + { + "epoch": 2.865871833084948, + "grad_norm": 2.6709296703338623, + "learning_rate": 0.0002, + "loss": 2.4409, + "step": 38460 + }, + { + "epoch": 2.866616989567809, + "grad_norm": 2.3500664234161377, + "learning_rate": 0.0002, + "loss": 2.3481, + "step": 38470 + }, + { + "epoch": 2.8673621460506706, + "grad_norm": 2.6254522800445557, + "learning_rate": 0.0002, + "loss": 2.6385, + "step": 38480 + }, + { + "epoch": 2.868107302533532, + "grad_norm": 2.2674224376678467, + "learning_rate": 0.0002, + "loss": 2.5171, + "step": 38490 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 2.6358728408813477, + "learning_rate": 0.0002, + "loss": 2.649, + "step": 38500 + }, + { + "epoch": 2.869597615499255, + "grad_norm": 2.7351884841918945, + "learning_rate": 0.0002, + "loss": 2.6956, + "step": 38510 + }, + { + "epoch": 2.8703427719821164, + "grad_norm": 2.099317789077759, + "learning_rate": 0.0002, + "loss": 2.4007, + "step": 38520 + }, + { + "epoch": 2.8710879284649775, + "grad_norm": 2.7157390117645264, + "learning_rate": 0.0002, + "loss": 2.6054, + "step": 38530 + }, + { + "epoch": 2.871833084947839, + "grad_norm": 2.5156712532043457, + "learning_rate": 0.0002, + "loss": 2.4476, + "step": 38540 + }, + { + "epoch": 2.8725782414307, + "grad_norm": 2.307621955871582, + "learning_rate": 0.0002, + "loss": 2.3631, + "step": 38550 + }, + { + "epoch": 2.8733233979135617, + "grad_norm": 2.7251169681549072, + "learning_rate": 0.0002, + "loss": 2.4591, + "step": 38560 + }, + { + "epoch": 2.8740685543964233, + "grad_norm": 2.367175340652466, + "learning_rate": 0.0002, + "loss": 2.5013, + "step": 38570 + }, + { + "epoch": 2.874813710879285, + "grad_norm": 3.0403735637664795, + "learning_rate": 0.0002, + "loss": 2.4797, + "step": 38580 + }, + { + "epoch": 2.875558867362146, + "grad_norm": 2.884767770767212, + "learning_rate": 0.0002, + "loss": 2.159, + "step": 38590 + }, + { + "epoch": 2.8763040238450075, + "grad_norm": 2.2452404499053955, + "learning_rate": 0.0002, + "loss": 2.4509, + "step": 38600 + }, + { + "epoch": 2.8770491803278686, + "grad_norm": 2.496917486190796, + "learning_rate": 0.0002, + "loss": 2.5203, + "step": 38610 + }, + { + "epoch": 2.87779433681073, + "grad_norm": 2.7510409355163574, + "learning_rate": 0.0002, + "loss": 2.5307, + "step": 38620 + }, + { + "epoch": 2.8785394932935917, + "grad_norm": 2.698000907897949, + "learning_rate": 0.0002, + "loss": 2.625, + "step": 38630 + }, + { + "epoch": 2.8792846497764533, + "grad_norm": 2.3727385997772217, + "learning_rate": 0.0002, + "loss": 2.612, + "step": 38640 + }, + { + "epoch": 2.8800298062593144, + "grad_norm": 2.5714304447174072, + "learning_rate": 0.0002, + "loss": 2.5383, + "step": 38650 + }, + { + "epoch": 2.880774962742176, + "grad_norm": 2.4994068145751953, + "learning_rate": 0.0002, + "loss": 2.3407, + "step": 38660 + }, + { + "epoch": 2.881520119225037, + "grad_norm": 2.728253126144409, + "learning_rate": 0.0002, + "loss": 2.4754, + "step": 38670 + }, + { + "epoch": 2.8822652757078986, + "grad_norm": 2.7636911869049072, + "learning_rate": 0.0002, + "loss": 2.5834, + "step": 38680 + }, + { + "epoch": 2.88301043219076, + "grad_norm": 2.6368253231048584, + "learning_rate": 0.0002, + "loss": 2.526, + "step": 38690 + }, + { + "epoch": 2.8837555886736217, + "grad_norm": 2.8573157787323, + "learning_rate": 0.0002, + "loss": 2.5318, + "step": 38700 + }, + { + "epoch": 2.884500745156483, + "grad_norm": 2.7631237506866455, + "learning_rate": 0.0002, + "loss": 2.4429, + "step": 38710 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 2.6455793380737305, + "learning_rate": 0.0002, + "loss": 2.5363, + "step": 38720 + }, + { + "epoch": 2.8859910581222055, + "grad_norm": 2.3246517181396484, + "learning_rate": 0.0002, + "loss": 2.379, + "step": 38730 + }, + { + "epoch": 2.886736214605067, + "grad_norm": 2.690464735031128, + "learning_rate": 0.0002, + "loss": 2.6405, + "step": 38740 + }, + { + "epoch": 2.8874813710879286, + "grad_norm": 2.527547597885132, + "learning_rate": 0.0002, + "loss": 2.6174, + "step": 38750 + }, + { + "epoch": 2.88822652757079, + "grad_norm": 2.4176671504974365, + "learning_rate": 0.0002, + "loss": 2.4597, + "step": 38760 + }, + { + "epoch": 2.8889716840536512, + "grad_norm": 2.7073466777801514, + "learning_rate": 0.0002, + "loss": 2.5065, + "step": 38770 + }, + { + "epoch": 2.889716840536513, + "grad_norm": 2.439682722091675, + "learning_rate": 0.0002, + "loss": 2.3043, + "step": 38780 + }, + { + "epoch": 2.890461997019374, + "grad_norm": 2.4189679622650146, + "learning_rate": 0.0002, + "loss": 2.4287, + "step": 38790 + }, + { + "epoch": 2.8912071535022354, + "grad_norm": 2.4912519454956055, + "learning_rate": 0.0002, + "loss": 2.4595, + "step": 38800 + }, + { + "epoch": 2.891952309985097, + "grad_norm": 2.4591786861419678, + "learning_rate": 0.0002, + "loss": 2.5589, + "step": 38810 + }, + { + "epoch": 2.892697466467958, + "grad_norm": 2.469432830810547, + "learning_rate": 0.0002, + "loss": 2.5173, + "step": 38820 + }, + { + "epoch": 2.8934426229508197, + "grad_norm": 2.421070098876953, + "learning_rate": 0.0002, + "loss": 2.4513, + "step": 38830 + }, + { + "epoch": 2.894187779433681, + "grad_norm": 2.4555258750915527, + "learning_rate": 0.0002, + "loss": 2.6341, + "step": 38840 + }, + { + "epoch": 2.8949329359165423, + "grad_norm": 2.3373961448669434, + "learning_rate": 0.0002, + "loss": 2.5724, + "step": 38850 + }, + { + "epoch": 2.895678092399404, + "grad_norm": 2.7534141540527344, + "learning_rate": 0.0002, + "loss": 2.4472, + "step": 38860 + }, + { + "epoch": 2.8964232488822654, + "grad_norm": 2.53497052192688, + "learning_rate": 0.0002, + "loss": 2.1598, + "step": 38870 + }, + { + "epoch": 2.8971684053651265, + "grad_norm": 2.5229415893554688, + "learning_rate": 0.0002, + "loss": 2.581, + "step": 38880 + }, + { + "epoch": 2.897913561847988, + "grad_norm": 2.3807928562164307, + "learning_rate": 0.0002, + "loss": 2.4987, + "step": 38890 + }, + { + "epoch": 2.898658718330849, + "grad_norm": 2.803424596786499, + "learning_rate": 0.0002, + "loss": 2.6143, + "step": 38900 + }, + { + "epoch": 2.8994038748137108, + "grad_norm": 2.6139283180236816, + "learning_rate": 0.0002, + "loss": 2.403, + "step": 38910 + }, + { + "epoch": 2.9001490312965723, + "grad_norm": 2.9887192249298096, + "learning_rate": 0.0002, + "loss": 2.6108, + "step": 38920 + }, + { + "epoch": 2.900894187779434, + "grad_norm": 2.6957292556762695, + "learning_rate": 0.0002, + "loss": 2.4734, + "step": 38930 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 2.6717658042907715, + "learning_rate": 0.0002, + "loss": 2.5778, + "step": 38940 + }, + { + "epoch": 2.9023845007451565, + "grad_norm": 2.75540828704834, + "learning_rate": 0.0002, + "loss": 2.5948, + "step": 38950 + }, + { + "epoch": 2.9031296572280176, + "grad_norm": 2.5205585956573486, + "learning_rate": 0.0002, + "loss": 2.5108, + "step": 38960 + }, + { + "epoch": 2.903874813710879, + "grad_norm": 2.2808632850646973, + "learning_rate": 0.0002, + "loss": 2.5035, + "step": 38970 + }, + { + "epoch": 2.9046199701937407, + "grad_norm": 2.8300139904022217, + "learning_rate": 0.0002, + "loss": 2.6883, + "step": 38980 + }, + { + "epoch": 2.9053651266766023, + "grad_norm": 2.3492043018341064, + "learning_rate": 0.0002, + "loss": 2.517, + "step": 38990 + }, + { + "epoch": 2.9061102831594634, + "grad_norm": 2.677483081817627, + "learning_rate": 0.0002, + "loss": 2.5173, + "step": 39000 + }, + { + "epoch": 2.906855439642325, + "grad_norm": 2.076521635055542, + "learning_rate": 0.0002, + "loss": 2.3087, + "step": 39010 + }, + { + "epoch": 2.907600596125186, + "grad_norm": 2.9172234535217285, + "learning_rate": 0.0002, + "loss": 2.4004, + "step": 39020 + }, + { + "epoch": 2.9083457526080476, + "grad_norm": 2.752596616744995, + "learning_rate": 0.0002, + "loss": 2.4206, + "step": 39030 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 2.586287260055542, + "learning_rate": 0.0002, + "loss": 2.5331, + "step": 39040 + }, + { + "epoch": 2.9098360655737707, + "grad_norm": 2.6482787132263184, + "learning_rate": 0.0002, + "loss": 2.5963, + "step": 39050 + }, + { + "epoch": 2.910581222056632, + "grad_norm": 2.513068914413452, + "learning_rate": 0.0002, + "loss": 2.6465, + "step": 39060 + }, + { + "epoch": 2.9113263785394934, + "grad_norm": 2.8123700618743896, + "learning_rate": 0.0002, + "loss": 2.5256, + "step": 39070 + }, + { + "epoch": 2.9120715350223545, + "grad_norm": 2.57496976852417, + "learning_rate": 0.0002, + "loss": 2.4337, + "step": 39080 + }, + { + "epoch": 2.912816691505216, + "grad_norm": 2.719500780105591, + "learning_rate": 0.0002, + "loss": 2.672, + "step": 39090 + }, + { + "epoch": 2.9135618479880776, + "grad_norm": 2.6564457416534424, + "learning_rate": 0.0002, + "loss": 2.5579, + "step": 39100 + }, + { + "epoch": 2.914307004470939, + "grad_norm": 2.656972646713257, + "learning_rate": 0.0002, + "loss": 2.532, + "step": 39110 + }, + { + "epoch": 2.9150521609538003, + "grad_norm": 2.5278375148773193, + "learning_rate": 0.0002, + "loss": 2.3322, + "step": 39120 + }, + { + "epoch": 2.915797317436662, + "grad_norm": 2.6466145515441895, + "learning_rate": 0.0002, + "loss": 2.5936, + "step": 39130 + }, + { + "epoch": 2.916542473919523, + "grad_norm": 2.8328826427459717, + "learning_rate": 0.0002, + "loss": 2.5601, + "step": 39140 + }, + { + "epoch": 2.9172876304023845, + "grad_norm": 2.4768784046173096, + "learning_rate": 0.0002, + "loss": 2.4364, + "step": 39150 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 3.1645164489746094, + "learning_rate": 0.0002, + "loss": 2.5898, + "step": 39160 + }, + { + "epoch": 2.918777943368107, + "grad_norm": 2.8019328117370605, + "learning_rate": 0.0002, + "loss": 2.5901, + "step": 39170 + }, + { + "epoch": 2.9195230998509687, + "grad_norm": 3.203110933303833, + "learning_rate": 0.0002, + "loss": 2.4048, + "step": 39180 + }, + { + "epoch": 2.9202682563338302, + "grad_norm": 3.061889410018921, + "learning_rate": 0.0002, + "loss": 2.6781, + "step": 39190 + }, + { + "epoch": 2.9210134128166914, + "grad_norm": 2.735222101211548, + "learning_rate": 0.0002, + "loss": 2.5379, + "step": 39200 + }, + { + "epoch": 2.921758569299553, + "grad_norm": 2.0281805992126465, + "learning_rate": 0.0002, + "loss": 2.4894, + "step": 39210 + }, + { + "epoch": 2.9225037257824145, + "grad_norm": 2.89827561378479, + "learning_rate": 0.0002, + "loss": 2.3229, + "step": 39220 + }, + { + "epoch": 2.9232488822652756, + "grad_norm": 2.8243911266326904, + "learning_rate": 0.0002, + "loss": 2.2712, + "step": 39230 + }, + { + "epoch": 2.923994038748137, + "grad_norm": 2.6554248332977295, + "learning_rate": 0.0002, + "loss": 2.4221, + "step": 39240 + }, + { + "epoch": 2.9247391952309982, + "grad_norm": 2.6550304889678955, + "learning_rate": 0.0002, + "loss": 2.5087, + "step": 39250 + }, + { + "epoch": 2.92548435171386, + "grad_norm": 2.154710054397583, + "learning_rate": 0.0002, + "loss": 2.4395, + "step": 39260 + }, + { + "epoch": 2.9262295081967213, + "grad_norm": 2.5351250171661377, + "learning_rate": 0.0002, + "loss": 2.7179, + "step": 39270 + }, + { + "epoch": 2.926974664679583, + "grad_norm": 2.533205986022949, + "learning_rate": 0.0002, + "loss": 2.5654, + "step": 39280 + }, + { + "epoch": 2.927719821162444, + "grad_norm": 2.13325572013855, + "learning_rate": 0.0002, + "loss": 2.6483, + "step": 39290 + }, + { + "epoch": 2.9284649776453056, + "grad_norm": 2.66166353225708, + "learning_rate": 0.0002, + "loss": 2.5622, + "step": 39300 + }, + { + "epoch": 2.9292101341281667, + "grad_norm": 2.33381724357605, + "learning_rate": 0.0002, + "loss": 2.555, + "step": 39310 + }, + { + "epoch": 2.929955290611028, + "grad_norm": 2.7781875133514404, + "learning_rate": 0.0002, + "loss": 2.6041, + "step": 39320 + }, + { + "epoch": 2.9307004470938898, + "grad_norm": 2.579591989517212, + "learning_rate": 0.0002, + "loss": 2.4904, + "step": 39330 + }, + { + "epoch": 2.9314456035767513, + "grad_norm": 2.5462992191314697, + "learning_rate": 0.0002, + "loss": 2.6305, + "step": 39340 + }, + { + "epoch": 2.9321907600596124, + "grad_norm": 2.6036219596862793, + "learning_rate": 0.0002, + "loss": 2.6255, + "step": 39350 + }, + { + "epoch": 2.932935916542474, + "grad_norm": 2.4048843383789062, + "learning_rate": 0.0002, + "loss": 2.4814, + "step": 39360 + }, + { + "epoch": 2.933681073025335, + "grad_norm": 2.5451865196228027, + "learning_rate": 0.0002, + "loss": 2.4788, + "step": 39370 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 2.7191903591156006, + "learning_rate": 0.0002, + "loss": 2.5476, + "step": 39380 + }, + { + "epoch": 2.935171385991058, + "grad_norm": 2.3997859954833984, + "learning_rate": 0.0002, + "loss": 2.6285, + "step": 39390 + }, + { + "epoch": 2.9359165424739198, + "grad_norm": 2.5516610145568848, + "learning_rate": 0.0002, + "loss": 2.5219, + "step": 39400 + }, + { + "epoch": 2.936661698956781, + "grad_norm": 2.5959670543670654, + "learning_rate": 0.0002, + "loss": 2.5965, + "step": 39410 + }, + { + "epoch": 2.9374068554396424, + "grad_norm": 2.9444220066070557, + "learning_rate": 0.0002, + "loss": 2.5746, + "step": 39420 + }, + { + "epoch": 2.9381520119225035, + "grad_norm": 2.497292995452881, + "learning_rate": 0.0002, + "loss": 2.5914, + "step": 39430 + }, + { + "epoch": 2.938897168405365, + "grad_norm": 2.524833917617798, + "learning_rate": 0.0002, + "loss": 2.581, + "step": 39440 + }, + { + "epoch": 2.9396423248882266, + "grad_norm": 2.8581857681274414, + "learning_rate": 0.0002, + "loss": 2.5388, + "step": 39450 + }, + { + "epoch": 2.940387481371088, + "grad_norm": 2.8836469650268555, + "learning_rate": 0.0002, + "loss": 2.5906, + "step": 39460 + }, + { + "epoch": 2.9411326378539493, + "grad_norm": 2.530555248260498, + "learning_rate": 0.0002, + "loss": 2.6937, + "step": 39470 + }, + { + "epoch": 2.941877794336811, + "grad_norm": 2.8848953247070312, + "learning_rate": 0.0002, + "loss": 2.4355, + "step": 39480 + }, + { + "epoch": 2.942622950819672, + "grad_norm": 2.4546191692352295, + "learning_rate": 0.0002, + "loss": 2.5684, + "step": 39490 + }, + { + "epoch": 2.9433681073025335, + "grad_norm": 3.093285083770752, + "learning_rate": 0.0002, + "loss": 2.4457, + "step": 39500 + }, + { + "epoch": 2.944113263785395, + "grad_norm": 2.597832679748535, + "learning_rate": 0.0002, + "loss": 2.402, + "step": 39510 + }, + { + "epoch": 2.944858420268256, + "grad_norm": 2.5404412746429443, + "learning_rate": 0.0002, + "loss": 2.5724, + "step": 39520 + }, + { + "epoch": 2.9456035767511177, + "grad_norm": 3.019563674926758, + "learning_rate": 0.0002, + "loss": 2.3523, + "step": 39530 + }, + { + "epoch": 2.9463487332339793, + "grad_norm": 3.1213128566741943, + "learning_rate": 0.0002, + "loss": 2.6512, + "step": 39540 + }, + { + "epoch": 2.9470938897168404, + "grad_norm": 2.413450002670288, + "learning_rate": 0.0002, + "loss": 2.4045, + "step": 39550 + }, + { + "epoch": 2.947839046199702, + "grad_norm": 2.7260587215423584, + "learning_rate": 0.0002, + "loss": 2.6666, + "step": 39560 + }, + { + "epoch": 2.9485842026825635, + "grad_norm": 3.4645745754241943, + "learning_rate": 0.0002, + "loss": 2.6023, + "step": 39570 + }, + { + "epoch": 2.9493293591654246, + "grad_norm": 2.8158528804779053, + "learning_rate": 0.0002, + "loss": 2.4214, + "step": 39580 + }, + { + "epoch": 2.950074515648286, + "grad_norm": 2.712031841278076, + "learning_rate": 0.0002, + "loss": 2.5187, + "step": 39590 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 2.5256295204162598, + "learning_rate": 0.0002, + "loss": 2.4577, + "step": 39600 + }, + { + "epoch": 2.951564828614009, + "grad_norm": 2.72895884513855, + "learning_rate": 0.0002, + "loss": 2.5944, + "step": 39610 + }, + { + "epoch": 2.9523099850968704, + "grad_norm": 2.040031909942627, + "learning_rate": 0.0002, + "loss": 2.4583, + "step": 39620 + }, + { + "epoch": 2.953055141579732, + "grad_norm": 2.666433811187744, + "learning_rate": 0.0002, + "loss": 2.5275, + "step": 39630 + }, + { + "epoch": 2.953800298062593, + "grad_norm": 2.6652700901031494, + "learning_rate": 0.0002, + "loss": 2.3713, + "step": 39640 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 2.743901014328003, + "learning_rate": 0.0002, + "loss": 2.5227, + "step": 39650 + }, + { + "epoch": 2.9552906110283157, + "grad_norm": 2.845050811767578, + "learning_rate": 0.0002, + "loss": 2.5615, + "step": 39660 + }, + { + "epoch": 2.9560357675111772, + "grad_norm": 2.8907158374786377, + "learning_rate": 0.0002, + "loss": 2.5205, + "step": 39670 + }, + { + "epoch": 2.956780923994039, + "grad_norm": 2.6992757320404053, + "learning_rate": 0.0002, + "loss": 2.4661, + "step": 39680 + }, + { + "epoch": 2.9575260804769004, + "grad_norm": 2.9309325218200684, + "learning_rate": 0.0002, + "loss": 2.6417, + "step": 39690 + }, + { + "epoch": 2.9582712369597615, + "grad_norm": 2.801201581954956, + "learning_rate": 0.0002, + "loss": 2.495, + "step": 39700 + }, + { + "epoch": 2.959016393442623, + "grad_norm": 2.763702869415283, + "learning_rate": 0.0002, + "loss": 2.4012, + "step": 39710 + }, + { + "epoch": 2.959761549925484, + "grad_norm": 2.6679039001464844, + "learning_rate": 0.0002, + "loss": 2.6976, + "step": 39720 + }, + { + "epoch": 2.9605067064083457, + "grad_norm": 2.567474603652954, + "learning_rate": 0.0002, + "loss": 2.6038, + "step": 39730 + }, + { + "epoch": 2.9612518628912072, + "grad_norm": 3.114344596862793, + "learning_rate": 0.0002, + "loss": 2.4868, + "step": 39740 + }, + { + "epoch": 2.961997019374069, + "grad_norm": 2.5168049335479736, + "learning_rate": 0.0002, + "loss": 2.5279, + "step": 39750 + }, + { + "epoch": 2.96274217585693, + "grad_norm": 3.0251882076263428, + "learning_rate": 0.0002, + "loss": 2.6403, + "step": 39760 + }, + { + "epoch": 2.9634873323397914, + "grad_norm": 2.606680393218994, + "learning_rate": 0.0002, + "loss": 2.6361, + "step": 39770 + }, + { + "epoch": 2.9642324888226526, + "grad_norm": 2.699082612991333, + "learning_rate": 0.0002, + "loss": 2.5908, + "step": 39780 + }, + { + "epoch": 2.964977645305514, + "grad_norm": 2.8698062896728516, + "learning_rate": 0.0002, + "loss": 2.4495, + "step": 39790 + }, + { + "epoch": 2.9657228017883757, + "grad_norm": 2.898927927017212, + "learning_rate": 0.0002, + "loss": 2.4352, + "step": 39800 + }, + { + "epoch": 2.966467958271237, + "grad_norm": 2.3014628887176514, + "learning_rate": 0.0002, + "loss": 2.5411, + "step": 39810 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 2.9208085536956787, + "learning_rate": 0.0002, + "loss": 2.4678, + "step": 39820 + }, + { + "epoch": 2.96795827123696, + "grad_norm": 2.51240873336792, + "learning_rate": 0.0002, + "loss": 2.101, + "step": 39830 + }, + { + "epoch": 2.968703427719821, + "grad_norm": 2.7985494136810303, + "learning_rate": 0.0002, + "loss": 2.6076, + "step": 39840 + }, + { + "epoch": 2.9694485842026825, + "grad_norm": 2.5904476642608643, + "learning_rate": 0.0002, + "loss": 2.4389, + "step": 39850 + }, + { + "epoch": 2.970193740685544, + "grad_norm": 2.6392509937286377, + "learning_rate": 0.0002, + "loss": 2.3973, + "step": 39860 + }, + { + "epoch": 2.970938897168405, + "grad_norm": 1.774827241897583, + "learning_rate": 0.0002, + "loss": 2.4029, + "step": 39870 + }, + { + "epoch": 2.9716840536512668, + "grad_norm": 2.54217529296875, + "learning_rate": 0.0002, + "loss": 2.6573, + "step": 39880 + }, + { + "epoch": 2.9724292101341283, + "grad_norm": 2.6354312896728516, + "learning_rate": 0.0002, + "loss": 2.3592, + "step": 39890 + }, + { + "epoch": 2.9731743666169894, + "grad_norm": 2.6568002700805664, + "learning_rate": 0.0002, + "loss": 2.5426, + "step": 39900 + }, + { + "epoch": 2.973919523099851, + "grad_norm": 2.660454273223877, + "learning_rate": 0.0002, + "loss": 2.4449, + "step": 39910 + }, + { + "epoch": 2.9746646795827125, + "grad_norm": 2.9466612339019775, + "learning_rate": 0.0002, + "loss": 2.48, + "step": 39920 + }, + { + "epoch": 2.9754098360655736, + "grad_norm": 2.5808732509613037, + "learning_rate": 0.0002, + "loss": 2.6554, + "step": 39930 + }, + { + "epoch": 2.976154992548435, + "grad_norm": 2.8035290241241455, + "learning_rate": 0.0002, + "loss": 2.8002, + "step": 39940 + }, + { + "epoch": 2.9769001490312967, + "grad_norm": 2.615812301635742, + "learning_rate": 0.0002, + "loss": 2.4047, + "step": 39950 + }, + { + "epoch": 2.977645305514158, + "grad_norm": 2.588972330093384, + "learning_rate": 0.0002, + "loss": 2.495, + "step": 39960 + }, + { + "epoch": 2.9783904619970194, + "grad_norm": 2.669950008392334, + "learning_rate": 0.0002, + "loss": 2.5453, + "step": 39970 + }, + { + "epoch": 2.979135618479881, + "grad_norm": 2.609656572341919, + "learning_rate": 0.0002, + "loss": 2.549, + "step": 39980 + }, + { + "epoch": 2.979880774962742, + "grad_norm": 2.9678401947021484, + "learning_rate": 0.0002, + "loss": 2.5067, + "step": 39990 + }, + { + "epoch": 2.9806259314456036, + "grad_norm": 1.8499410152435303, + "learning_rate": 0.0002, + "loss": 2.5054, + "step": 40000 + }, + { + "epoch": 2.9813710879284647, + "grad_norm": 2.4903769493103027, + "learning_rate": 0.0002, + "loss": 2.4195, + "step": 40010 + }, + { + "epoch": 2.9821162444113263, + "grad_norm": 2.87322998046875, + "learning_rate": 0.0002, + "loss": 2.4816, + "step": 40020 + }, + { + "epoch": 2.982861400894188, + "grad_norm": 2.6124825477600098, + "learning_rate": 0.0002, + "loss": 2.3755, + "step": 40030 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 2.674150228500366, + "learning_rate": 0.0002, + "loss": 2.4891, + "step": 40040 + }, + { + "epoch": 2.9843517138599105, + "grad_norm": 2.499284267425537, + "learning_rate": 0.0002, + "loss": 2.5349, + "step": 40050 + }, + { + "epoch": 2.985096870342772, + "grad_norm": 2.7705399990081787, + "learning_rate": 0.0002, + "loss": 2.4778, + "step": 40060 + }, + { + "epoch": 2.985842026825633, + "grad_norm": 2.2329294681549072, + "learning_rate": 0.0002, + "loss": 2.4104, + "step": 40070 + }, + { + "epoch": 2.9865871833084947, + "grad_norm": 2.8790485858917236, + "learning_rate": 0.0002, + "loss": 2.4753, + "step": 40080 + }, + { + "epoch": 2.9873323397913563, + "grad_norm": 2.469627618789673, + "learning_rate": 0.0002, + "loss": 2.6872, + "step": 40090 + }, + { + "epoch": 2.988077496274218, + "grad_norm": 2.38393235206604, + "learning_rate": 0.0002, + "loss": 2.6073, + "step": 40100 + }, + { + "epoch": 2.988822652757079, + "grad_norm": 2.2364330291748047, + "learning_rate": 0.0002, + "loss": 2.4443, + "step": 40110 + }, + { + "epoch": 2.9895678092399405, + "grad_norm": 2.5776116847991943, + "learning_rate": 0.0002, + "loss": 2.532, + "step": 40120 + }, + { + "epoch": 2.9903129657228016, + "grad_norm": 2.5935330390930176, + "learning_rate": 0.0002, + "loss": 2.4889, + "step": 40130 + }, + { + "epoch": 2.991058122205663, + "grad_norm": 2.6229746341705322, + "learning_rate": 0.0002, + "loss": 2.5562, + "step": 40140 + }, + { + "epoch": 2.9918032786885247, + "grad_norm": 2.5080349445343018, + "learning_rate": 0.0002, + "loss": 2.4328, + "step": 40150 + }, + { + "epoch": 2.9925484351713862, + "grad_norm": 2.5937001705169678, + "learning_rate": 0.0002, + "loss": 2.591, + "step": 40160 + }, + { + "epoch": 2.9932935916542474, + "grad_norm": 2.3092591762542725, + "learning_rate": 0.0002, + "loss": 2.4414, + "step": 40170 + }, + { + "epoch": 2.994038748137109, + "grad_norm": 2.848226308822632, + "learning_rate": 0.0002, + "loss": 2.4109, + "step": 40180 + }, + { + "epoch": 2.99478390461997, + "grad_norm": 2.3122453689575195, + "learning_rate": 0.0002, + "loss": 2.3277, + "step": 40190 + }, + { + "epoch": 2.9955290611028316, + "grad_norm": 2.513367176055908, + "learning_rate": 0.0002, + "loss": 2.5037, + "step": 40200 + }, + { + "epoch": 2.996274217585693, + "grad_norm": 2.57222843170166, + "learning_rate": 0.0002, + "loss": 2.5487, + "step": 40210 + }, + { + "epoch": 2.9970193740685542, + "grad_norm": 2.354405164718628, + "learning_rate": 0.0002, + "loss": 2.386, + "step": 40220 + }, + { + "epoch": 2.997764530551416, + "grad_norm": 2.828702688217163, + "learning_rate": 0.0002, + "loss": 2.3264, + "step": 40230 + }, + { + "epoch": 2.9985096870342773, + "grad_norm": 2.779327392578125, + "learning_rate": 0.0002, + "loss": 2.3778, + "step": 40240 + }, + { + "epoch": 2.9992548435171384, + "grad_norm": 2.4226698875427246, + "learning_rate": 0.0002, + "loss": 2.1855, + "step": 40250 + }, + { + "epoch": 3.0, + "grad_norm": 2.918992042541504, + "learning_rate": 0.0002, + "loss": 2.4347, + "step": 40260 + }, + { + "epoch": 3.0, + "eval_runtime": 2763.3152, + "eval_samples_per_second": 4.856, + "eval_steps_per_second": 0.607, + "step": 40260 + }, + { + "epoch": 3.0007451564828616, + "grad_norm": 2.5024285316467285, + "learning_rate": 0.0002, + "loss": 2.3297, + "step": 40270 + }, + { + "epoch": 3.0014903129657227, + "grad_norm": 2.8226559162139893, + "learning_rate": 0.0002, + "loss": 2.3661, + "step": 40280 + }, + { + "epoch": 3.002235469448584, + "grad_norm": 2.758744716644287, + "learning_rate": 0.0002, + "loss": 2.4495, + "step": 40290 + }, + { + "epoch": 3.0029806259314458, + "grad_norm": 2.426999568939209, + "learning_rate": 0.0002, + "loss": 2.2649, + "step": 40300 + }, + { + "epoch": 3.003725782414307, + "grad_norm": 2.6783957481384277, + "learning_rate": 0.0002, + "loss": 2.3838, + "step": 40310 + }, + { + "epoch": 3.0044709388971684, + "grad_norm": 3.014652967453003, + "learning_rate": 0.0002, + "loss": 2.1913, + "step": 40320 + }, + { + "epoch": 3.00521609538003, + "grad_norm": 2.7266387939453125, + "learning_rate": 0.0002, + "loss": 2.3369, + "step": 40330 + }, + { + "epoch": 3.005961251862891, + "grad_norm": 2.7019906044006348, + "learning_rate": 0.0002, + "loss": 2.1718, + "step": 40340 + }, + { + "epoch": 3.0067064083457526, + "grad_norm": 2.5368382930755615, + "learning_rate": 0.0002, + "loss": 2.4286, + "step": 40350 + }, + { + "epoch": 3.007451564828614, + "grad_norm": 2.9685909748077393, + "learning_rate": 0.0002, + "loss": 2.3881, + "step": 40360 + }, + { + "epoch": 3.0081967213114753, + "grad_norm": 2.656423330307007, + "learning_rate": 0.0002, + "loss": 2.3198, + "step": 40370 + }, + { + "epoch": 3.008941877794337, + "grad_norm": 2.7926347255706787, + "learning_rate": 0.0002, + "loss": 2.2615, + "step": 40380 + }, + { + "epoch": 3.0096870342771984, + "grad_norm": 3.065396308898926, + "learning_rate": 0.0002, + "loss": 2.4156, + "step": 40390 + }, + { + "epoch": 3.0104321907600595, + "grad_norm": 2.8727266788482666, + "learning_rate": 0.0002, + "loss": 2.429, + "step": 40400 + }, + { + "epoch": 3.011177347242921, + "grad_norm": 2.9066572189331055, + "learning_rate": 0.0002, + "loss": 2.3824, + "step": 40410 + }, + { + "epoch": 3.0119225037257826, + "grad_norm": 2.813734769821167, + "learning_rate": 0.0002, + "loss": 2.2325, + "step": 40420 + }, + { + "epoch": 3.0126676602086437, + "grad_norm": 2.808375120162964, + "learning_rate": 0.0002, + "loss": 2.3175, + "step": 40430 + }, + { + "epoch": 3.0134128166915053, + "grad_norm": 2.6436479091644287, + "learning_rate": 0.0002, + "loss": 2.4796, + "step": 40440 + }, + { + "epoch": 3.0141579731743664, + "grad_norm": 2.557705879211426, + "learning_rate": 0.0002, + "loss": 2.5264, + "step": 40450 + }, + { + "epoch": 3.014903129657228, + "grad_norm": 2.7825839519500732, + "learning_rate": 0.0002, + "loss": 2.4484, + "step": 40460 + }, + { + "epoch": 3.0156482861400895, + "grad_norm": 2.4266088008880615, + "learning_rate": 0.0002, + "loss": 2.4146, + "step": 40470 + }, + { + "epoch": 3.0163934426229506, + "grad_norm": 2.8366827964782715, + "learning_rate": 0.0002, + "loss": 2.3706, + "step": 40480 + }, + { + "epoch": 3.017138599105812, + "grad_norm": 2.837256669998169, + "learning_rate": 0.0002, + "loss": 2.4929, + "step": 40490 + }, + { + "epoch": 3.0178837555886737, + "grad_norm": 2.7179813385009766, + "learning_rate": 0.0002, + "loss": 2.365, + "step": 40500 + }, + { + "epoch": 3.018628912071535, + "grad_norm": 2.3554391860961914, + "learning_rate": 0.0002, + "loss": 2.2821, + "step": 40510 + }, + { + "epoch": 3.0193740685543964, + "grad_norm": 2.696993112564087, + "learning_rate": 0.0002, + "loss": 2.5171, + "step": 40520 + }, + { + "epoch": 3.020119225037258, + "grad_norm": 2.1214194297790527, + "learning_rate": 0.0002, + "loss": 2.2159, + "step": 40530 + }, + { + "epoch": 3.020864381520119, + "grad_norm": 2.8695015907287598, + "learning_rate": 0.0002, + "loss": 2.3214, + "step": 40540 + }, + { + "epoch": 3.0216095380029806, + "grad_norm": 2.551729679107666, + "learning_rate": 0.0002, + "loss": 2.2563, + "step": 40550 + }, + { + "epoch": 3.022354694485842, + "grad_norm": 2.3707385063171387, + "learning_rate": 0.0002, + "loss": 2.4613, + "step": 40560 + }, + { + "epoch": 3.0230998509687033, + "grad_norm": 2.9850101470947266, + "learning_rate": 0.0002, + "loss": 2.37, + "step": 40570 + }, + { + "epoch": 3.023845007451565, + "grad_norm": 2.5724594593048096, + "learning_rate": 0.0002, + "loss": 2.3877, + "step": 40580 + }, + { + "epoch": 3.0245901639344264, + "grad_norm": 2.7039642333984375, + "learning_rate": 0.0002, + "loss": 2.2501, + "step": 40590 + }, + { + "epoch": 3.0253353204172875, + "grad_norm": 2.7257907390594482, + "learning_rate": 0.0002, + "loss": 2.3512, + "step": 40600 + }, + { + "epoch": 3.026080476900149, + "grad_norm": 2.788710355758667, + "learning_rate": 0.0002, + "loss": 2.3724, + "step": 40610 + }, + { + "epoch": 3.0268256333830106, + "grad_norm": 2.3942348957061768, + "learning_rate": 0.0002, + "loss": 2.288, + "step": 40620 + }, + { + "epoch": 3.0275707898658717, + "grad_norm": 2.6751253604888916, + "learning_rate": 0.0002, + "loss": 2.4059, + "step": 40630 + }, + { + "epoch": 3.0283159463487332, + "grad_norm": 3.1774163246154785, + "learning_rate": 0.0002, + "loss": 2.4086, + "step": 40640 + }, + { + "epoch": 3.029061102831595, + "grad_norm": 2.69118595123291, + "learning_rate": 0.0002, + "loss": 2.4347, + "step": 40650 + }, + { + "epoch": 3.029806259314456, + "grad_norm": 2.844470262527466, + "learning_rate": 0.0002, + "loss": 2.3638, + "step": 40660 + }, + { + "epoch": 3.0305514157973175, + "grad_norm": 2.662158966064453, + "learning_rate": 0.0002, + "loss": 2.3734, + "step": 40670 + }, + { + "epoch": 3.031296572280179, + "grad_norm": 2.9637556076049805, + "learning_rate": 0.0002, + "loss": 2.3224, + "step": 40680 + }, + { + "epoch": 3.03204172876304, + "grad_norm": 2.62660813331604, + "learning_rate": 0.0002, + "loss": 2.4436, + "step": 40690 + }, + { + "epoch": 3.0327868852459017, + "grad_norm": 2.9725089073181152, + "learning_rate": 0.0002, + "loss": 2.5596, + "step": 40700 + }, + { + "epoch": 3.0335320417287632, + "grad_norm": 2.9883205890655518, + "learning_rate": 0.0002, + "loss": 2.2181, + "step": 40710 + }, + { + "epoch": 3.0342771982116243, + "grad_norm": 2.632417917251587, + "learning_rate": 0.0002, + "loss": 2.3767, + "step": 40720 + }, + { + "epoch": 3.035022354694486, + "grad_norm": 2.510411262512207, + "learning_rate": 0.0002, + "loss": 2.3296, + "step": 40730 + }, + { + "epoch": 3.0357675111773474, + "grad_norm": 2.5027692317962646, + "learning_rate": 0.0002, + "loss": 2.2664, + "step": 40740 + }, + { + "epoch": 3.0365126676602086, + "grad_norm": 2.8952243328094482, + "learning_rate": 0.0002, + "loss": 2.3159, + "step": 40750 + }, + { + "epoch": 3.03725782414307, + "grad_norm": 2.4545536041259766, + "learning_rate": 0.0002, + "loss": 2.5346, + "step": 40760 + }, + { + "epoch": 3.0380029806259317, + "grad_norm": 2.8499226570129395, + "learning_rate": 0.0002, + "loss": 2.3085, + "step": 40770 + }, + { + "epoch": 3.0387481371087928, + "grad_norm": 3.025698184967041, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 40780 + }, + { + "epoch": 3.0394932935916543, + "grad_norm": 2.6746528148651123, + "learning_rate": 0.0002, + "loss": 2.4489, + "step": 40790 + }, + { + "epoch": 3.0402384500745154, + "grad_norm": 3.0967001914978027, + "learning_rate": 0.0002, + "loss": 2.2821, + "step": 40800 + }, + { + "epoch": 3.040983606557377, + "grad_norm": 2.5189292430877686, + "learning_rate": 0.0002, + "loss": 2.1087, + "step": 40810 + }, + { + "epoch": 3.0417287630402385, + "grad_norm": 2.407550573348999, + "learning_rate": 0.0002, + "loss": 2.4971, + "step": 40820 + }, + { + "epoch": 3.0424739195230996, + "grad_norm": 2.7270169258117676, + "learning_rate": 0.0002, + "loss": 2.3941, + "step": 40830 + }, + { + "epoch": 3.043219076005961, + "grad_norm": 2.8451573848724365, + "learning_rate": 0.0002, + "loss": 2.4935, + "step": 40840 + }, + { + "epoch": 3.0439642324888228, + "grad_norm": 2.520591974258423, + "learning_rate": 0.0002, + "loss": 2.4202, + "step": 40850 + }, + { + "epoch": 3.044709388971684, + "grad_norm": 2.5729188919067383, + "learning_rate": 0.0002, + "loss": 2.3634, + "step": 40860 + }, + { + "epoch": 3.0454545454545454, + "grad_norm": 2.5250637531280518, + "learning_rate": 0.0002, + "loss": 2.3602, + "step": 40870 + }, + { + "epoch": 3.046199701937407, + "grad_norm": 2.7925519943237305, + "learning_rate": 0.0002, + "loss": 2.4729, + "step": 40880 + }, + { + "epoch": 3.046944858420268, + "grad_norm": 2.4078192710876465, + "learning_rate": 0.0002, + "loss": 2.4195, + "step": 40890 + }, + { + "epoch": 3.0476900149031296, + "grad_norm": 2.648886203765869, + "learning_rate": 0.0002, + "loss": 2.4189, + "step": 40900 + }, + { + "epoch": 3.048435171385991, + "grad_norm": 2.811570167541504, + "learning_rate": 0.0002, + "loss": 2.3038, + "step": 40910 + }, + { + "epoch": 3.0491803278688523, + "grad_norm": 2.7872612476348877, + "learning_rate": 0.0002, + "loss": 2.3317, + "step": 40920 + }, + { + "epoch": 3.049925484351714, + "grad_norm": 2.933014154434204, + "learning_rate": 0.0002, + "loss": 2.398, + "step": 40930 + }, + { + "epoch": 3.0506706408345754, + "grad_norm": 2.8727405071258545, + "learning_rate": 0.0002, + "loss": 2.2654, + "step": 40940 + }, + { + "epoch": 3.0514157973174365, + "grad_norm": 2.7194955348968506, + "learning_rate": 0.0002, + "loss": 2.5749, + "step": 40950 + }, + { + "epoch": 3.052160953800298, + "grad_norm": 2.616337776184082, + "learning_rate": 0.0002, + "loss": 2.4542, + "step": 40960 + }, + { + "epoch": 3.0529061102831596, + "grad_norm": 2.4622654914855957, + "learning_rate": 0.0002, + "loss": 2.3492, + "step": 40970 + }, + { + "epoch": 3.0536512667660207, + "grad_norm": 2.7078351974487305, + "learning_rate": 0.0002, + "loss": 2.3654, + "step": 40980 + }, + { + "epoch": 3.0543964232488823, + "grad_norm": 2.5035488605499268, + "learning_rate": 0.0002, + "loss": 2.4756, + "step": 40990 + }, + { + "epoch": 3.055141579731744, + "grad_norm": 2.7884819507598877, + "learning_rate": 0.0002, + "loss": 2.3663, + "step": 41000 + }, + { + "epoch": 3.055886736214605, + "grad_norm": 2.593498945236206, + "learning_rate": 0.0002, + "loss": 2.3312, + "step": 41010 + }, + { + "epoch": 3.0566318926974665, + "grad_norm": 2.6816306114196777, + "learning_rate": 0.0002, + "loss": 2.4835, + "step": 41020 + }, + { + "epoch": 3.057377049180328, + "grad_norm": 2.6609058380126953, + "learning_rate": 0.0002, + "loss": 2.4931, + "step": 41030 + }, + { + "epoch": 3.058122205663189, + "grad_norm": 2.7509803771972656, + "learning_rate": 0.0002, + "loss": 2.3721, + "step": 41040 + }, + { + "epoch": 3.0588673621460507, + "grad_norm": 2.3781402111053467, + "learning_rate": 0.0002, + "loss": 2.3468, + "step": 41050 + }, + { + "epoch": 3.0596125186289123, + "grad_norm": 2.922632932662964, + "learning_rate": 0.0002, + "loss": 2.525, + "step": 41060 + }, + { + "epoch": 3.0603576751117734, + "grad_norm": 2.792450189590454, + "learning_rate": 0.0002, + "loss": 2.2651, + "step": 41070 + }, + { + "epoch": 3.061102831594635, + "grad_norm": 2.9025795459747314, + "learning_rate": 0.0002, + "loss": 2.4843, + "step": 41080 + }, + { + "epoch": 3.0618479880774965, + "grad_norm": 2.6068289279937744, + "learning_rate": 0.0002, + "loss": 2.4153, + "step": 41090 + }, + { + "epoch": 3.0625931445603576, + "grad_norm": 2.5537047386169434, + "learning_rate": 0.0002, + "loss": 2.4612, + "step": 41100 + }, + { + "epoch": 3.063338301043219, + "grad_norm": 2.9191601276397705, + "learning_rate": 0.0002, + "loss": 2.5681, + "step": 41110 + }, + { + "epoch": 3.0640834575260807, + "grad_norm": 2.4749388694763184, + "learning_rate": 0.0002, + "loss": 2.2255, + "step": 41120 + }, + { + "epoch": 3.064828614008942, + "grad_norm": 3.2458298206329346, + "learning_rate": 0.0002, + "loss": 2.3751, + "step": 41130 + }, + { + "epoch": 3.0655737704918034, + "grad_norm": 2.4875826835632324, + "learning_rate": 0.0002, + "loss": 2.4063, + "step": 41140 + }, + { + "epoch": 3.066318926974665, + "grad_norm": 2.788435697555542, + "learning_rate": 0.0002, + "loss": 2.4848, + "step": 41150 + }, + { + "epoch": 3.067064083457526, + "grad_norm": 2.731872320175171, + "learning_rate": 0.0002, + "loss": 2.4219, + "step": 41160 + }, + { + "epoch": 3.0678092399403876, + "grad_norm": 2.5522799491882324, + "learning_rate": 0.0002, + "loss": 2.2948, + "step": 41170 + }, + { + "epoch": 3.0685543964232487, + "grad_norm": 2.494781255722046, + "learning_rate": 0.0002, + "loss": 2.4205, + "step": 41180 + }, + { + "epoch": 3.0692995529061102, + "grad_norm": 2.828057289123535, + "learning_rate": 0.0002, + "loss": 2.4637, + "step": 41190 + }, + { + "epoch": 3.070044709388972, + "grad_norm": 2.98608660697937, + "learning_rate": 0.0002, + "loss": 2.4205, + "step": 41200 + }, + { + "epoch": 3.070789865871833, + "grad_norm": 2.9125189781188965, + "learning_rate": 0.0002, + "loss": 2.3945, + "step": 41210 + }, + { + "epoch": 3.0715350223546944, + "grad_norm": 2.599982261657715, + "learning_rate": 0.0002, + "loss": 2.4643, + "step": 41220 + }, + { + "epoch": 3.072280178837556, + "grad_norm": 2.4792168140411377, + "learning_rate": 0.0002, + "loss": 2.368, + "step": 41230 + }, + { + "epoch": 3.073025335320417, + "grad_norm": 2.604639768600464, + "learning_rate": 0.0002, + "loss": 2.3853, + "step": 41240 + }, + { + "epoch": 3.0737704918032787, + "grad_norm": 2.617868185043335, + "learning_rate": 0.0002, + "loss": 2.342, + "step": 41250 + }, + { + "epoch": 3.07451564828614, + "grad_norm": 2.1132025718688965, + "learning_rate": 0.0002, + "loss": 2.2172, + "step": 41260 + }, + { + "epoch": 3.0752608047690013, + "grad_norm": 2.4275896549224854, + "learning_rate": 0.0002, + "loss": 2.0727, + "step": 41270 + }, + { + "epoch": 3.076005961251863, + "grad_norm": 2.285426616668701, + "learning_rate": 0.0002, + "loss": 2.2816, + "step": 41280 + }, + { + "epoch": 3.0767511177347244, + "grad_norm": 2.9328603744506836, + "learning_rate": 0.0002, + "loss": 2.3676, + "step": 41290 + }, + { + "epoch": 3.0774962742175855, + "grad_norm": 2.247255325317383, + "learning_rate": 0.0002, + "loss": 2.2789, + "step": 41300 + }, + { + "epoch": 3.078241430700447, + "grad_norm": 2.8094482421875, + "learning_rate": 0.0002, + "loss": 2.5418, + "step": 41310 + }, + { + "epoch": 3.0789865871833086, + "grad_norm": 2.7178444862365723, + "learning_rate": 0.0002, + "loss": 2.4174, + "step": 41320 + }, + { + "epoch": 3.0797317436661698, + "grad_norm": 3.3467769622802734, + "learning_rate": 0.0002, + "loss": 2.459, + "step": 41330 + }, + { + "epoch": 3.0804769001490313, + "grad_norm": 2.7289459705352783, + "learning_rate": 0.0002, + "loss": 2.3743, + "step": 41340 + }, + { + "epoch": 3.081222056631893, + "grad_norm": 3.2940895557403564, + "learning_rate": 0.0002, + "loss": 2.4498, + "step": 41350 + }, + { + "epoch": 3.081967213114754, + "grad_norm": 2.498302459716797, + "learning_rate": 0.0002, + "loss": 2.4309, + "step": 41360 + }, + { + "epoch": 3.0827123695976155, + "grad_norm": 2.9865078926086426, + "learning_rate": 0.0002, + "loss": 2.585, + "step": 41370 + }, + { + "epoch": 3.083457526080477, + "grad_norm": 3.016395092010498, + "learning_rate": 0.0002, + "loss": 2.6466, + "step": 41380 + }, + { + "epoch": 3.084202682563338, + "grad_norm": 2.7781736850738525, + "learning_rate": 0.0002, + "loss": 2.3803, + "step": 41390 + }, + { + "epoch": 3.0849478390461997, + "grad_norm": 2.8058199882507324, + "learning_rate": 0.0002, + "loss": 2.3414, + "step": 41400 + }, + { + "epoch": 3.0856929955290613, + "grad_norm": 2.8715202808380127, + "learning_rate": 0.0002, + "loss": 2.2693, + "step": 41410 + }, + { + "epoch": 3.0864381520119224, + "grad_norm": 2.7370269298553467, + "learning_rate": 0.0002, + "loss": 2.4274, + "step": 41420 + }, + { + "epoch": 3.087183308494784, + "grad_norm": 2.570589065551758, + "learning_rate": 0.0002, + "loss": 2.5473, + "step": 41430 + }, + { + "epoch": 3.0879284649776455, + "grad_norm": 2.858039617538452, + "learning_rate": 0.0002, + "loss": 2.4307, + "step": 41440 + }, + { + "epoch": 3.0886736214605066, + "grad_norm": 2.4094789028167725, + "learning_rate": 0.0002, + "loss": 2.3369, + "step": 41450 + }, + { + "epoch": 3.089418777943368, + "grad_norm": 2.665861129760742, + "learning_rate": 0.0002, + "loss": 2.4736, + "step": 41460 + }, + { + "epoch": 3.0901639344262297, + "grad_norm": 2.7448792457580566, + "learning_rate": 0.0002, + "loss": 2.2552, + "step": 41470 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 3.320862293243408, + "learning_rate": 0.0002, + "loss": 2.361, + "step": 41480 + }, + { + "epoch": 3.0916542473919524, + "grad_norm": 2.623382568359375, + "learning_rate": 0.0002, + "loss": 2.2576, + "step": 41490 + }, + { + "epoch": 3.092399403874814, + "grad_norm": 2.757305383682251, + "learning_rate": 0.0002, + "loss": 2.4842, + "step": 41500 + }, + { + "epoch": 3.093144560357675, + "grad_norm": 3.2521588802337646, + "learning_rate": 0.0002, + "loss": 2.4941, + "step": 41510 + }, + { + "epoch": 3.0938897168405366, + "grad_norm": 2.6293327808380127, + "learning_rate": 0.0002, + "loss": 2.3642, + "step": 41520 + }, + { + "epoch": 3.0946348733233977, + "grad_norm": 2.7597970962524414, + "learning_rate": 0.0002, + "loss": 2.4894, + "step": 41530 + }, + { + "epoch": 3.0953800298062593, + "grad_norm": 3.117297887802124, + "learning_rate": 0.0002, + "loss": 2.5951, + "step": 41540 + }, + { + "epoch": 3.096125186289121, + "grad_norm": 2.641249656677246, + "learning_rate": 0.0002, + "loss": 2.2928, + "step": 41550 + }, + { + "epoch": 3.096870342771982, + "grad_norm": 3.0518288612365723, + "learning_rate": 0.0002, + "loss": 2.3452, + "step": 41560 + }, + { + "epoch": 3.0976154992548435, + "grad_norm": 2.6108360290527344, + "learning_rate": 0.0002, + "loss": 2.2999, + "step": 41570 + }, + { + "epoch": 3.098360655737705, + "grad_norm": 3.087437629699707, + "learning_rate": 0.0002, + "loss": 2.4094, + "step": 41580 + }, + { + "epoch": 3.099105812220566, + "grad_norm": 2.4869635105133057, + "learning_rate": 0.0002, + "loss": 2.4677, + "step": 41590 + }, + { + "epoch": 3.0998509687034277, + "grad_norm": 3.1462340354919434, + "learning_rate": 0.0002, + "loss": 2.5562, + "step": 41600 + }, + { + "epoch": 3.1005961251862892, + "grad_norm": 2.7001521587371826, + "learning_rate": 0.0002, + "loss": 2.4251, + "step": 41610 + }, + { + "epoch": 3.1013412816691504, + "grad_norm": 3.1031558513641357, + "learning_rate": 0.0002, + "loss": 2.492, + "step": 41620 + }, + { + "epoch": 3.102086438152012, + "grad_norm": 2.6600401401519775, + "learning_rate": 0.0002, + "loss": 2.4346, + "step": 41630 + }, + { + "epoch": 3.1028315946348735, + "grad_norm": 2.673090934753418, + "learning_rate": 0.0002, + "loss": 2.3663, + "step": 41640 + }, + { + "epoch": 3.1035767511177346, + "grad_norm": 2.6423120498657227, + "learning_rate": 0.0002, + "loss": 2.486, + "step": 41650 + }, + { + "epoch": 3.104321907600596, + "grad_norm": 2.6316373348236084, + "learning_rate": 0.0002, + "loss": 2.5157, + "step": 41660 + }, + { + "epoch": 3.1050670640834577, + "grad_norm": 2.9219932556152344, + "learning_rate": 0.0002, + "loss": 2.2875, + "step": 41670 + }, + { + "epoch": 3.105812220566319, + "grad_norm": 3.2191224098205566, + "learning_rate": 0.0002, + "loss": 2.3327, + "step": 41680 + }, + { + "epoch": 3.1065573770491803, + "grad_norm": 2.6431362628936768, + "learning_rate": 0.0002, + "loss": 2.4269, + "step": 41690 + }, + { + "epoch": 3.107302533532042, + "grad_norm": 2.56723952293396, + "learning_rate": 0.0002, + "loss": 2.3461, + "step": 41700 + }, + { + "epoch": 3.108047690014903, + "grad_norm": 3.1009645462036133, + "learning_rate": 0.0002, + "loss": 2.62, + "step": 41710 + }, + { + "epoch": 3.1087928464977646, + "grad_norm": 2.805692672729492, + "learning_rate": 0.0002, + "loss": 2.2908, + "step": 41720 + }, + { + "epoch": 3.109538002980626, + "grad_norm": 2.93380069732666, + "learning_rate": 0.0002, + "loss": 2.4379, + "step": 41730 + }, + { + "epoch": 3.110283159463487, + "grad_norm": 2.5862841606140137, + "learning_rate": 0.0002, + "loss": 2.3883, + "step": 41740 + }, + { + "epoch": 3.1110283159463488, + "grad_norm": 2.6500649452209473, + "learning_rate": 0.0002, + "loss": 2.3901, + "step": 41750 + }, + { + "epoch": 3.1117734724292103, + "grad_norm": 2.738546371459961, + "learning_rate": 0.0002, + "loss": 2.4566, + "step": 41760 + }, + { + "epoch": 3.1125186289120714, + "grad_norm": 2.519313335418701, + "learning_rate": 0.0002, + "loss": 2.4855, + "step": 41770 + }, + { + "epoch": 3.113263785394933, + "grad_norm": 2.5396151542663574, + "learning_rate": 0.0002, + "loss": 2.4722, + "step": 41780 + }, + { + "epoch": 3.1140089418777945, + "grad_norm": 2.6931238174438477, + "learning_rate": 0.0002, + "loss": 2.5307, + "step": 41790 + }, + { + "epoch": 3.1147540983606556, + "grad_norm": 2.5568339824676514, + "learning_rate": 0.0002, + "loss": 2.3558, + "step": 41800 + }, + { + "epoch": 3.115499254843517, + "grad_norm": 2.8651018142700195, + "learning_rate": 0.0002, + "loss": 2.3666, + "step": 41810 + }, + { + "epoch": 3.1162444113263787, + "grad_norm": 2.6825473308563232, + "learning_rate": 0.0002, + "loss": 2.4556, + "step": 41820 + }, + { + "epoch": 3.11698956780924, + "grad_norm": 2.327749490737915, + "learning_rate": 0.0002, + "loss": 2.3705, + "step": 41830 + }, + { + "epoch": 3.1177347242921014, + "grad_norm": 2.5712668895721436, + "learning_rate": 0.0002, + "loss": 2.5078, + "step": 41840 + }, + { + "epoch": 3.118479880774963, + "grad_norm": 2.770862102508545, + "learning_rate": 0.0002, + "loss": 2.4223, + "step": 41850 + }, + { + "epoch": 3.119225037257824, + "grad_norm": 2.7376694679260254, + "learning_rate": 0.0002, + "loss": 2.0986, + "step": 41860 + }, + { + "epoch": 3.1199701937406856, + "grad_norm": 2.3979244232177734, + "learning_rate": 0.0002, + "loss": 2.3534, + "step": 41870 + }, + { + "epoch": 3.1207153502235467, + "grad_norm": 3.0536673069000244, + "learning_rate": 0.0002, + "loss": 2.4107, + "step": 41880 + }, + { + "epoch": 3.1214605067064083, + "grad_norm": 2.726121187210083, + "learning_rate": 0.0002, + "loss": 2.3308, + "step": 41890 + }, + { + "epoch": 3.12220566318927, + "grad_norm": 2.5417113304138184, + "learning_rate": 0.0002, + "loss": 2.3748, + "step": 41900 + }, + { + "epoch": 3.122950819672131, + "grad_norm": 2.813762664794922, + "learning_rate": 0.0002, + "loss": 2.4617, + "step": 41910 + }, + { + "epoch": 3.1236959761549925, + "grad_norm": 2.67824649810791, + "learning_rate": 0.0002, + "loss": 2.391, + "step": 41920 + }, + { + "epoch": 3.124441132637854, + "grad_norm": 2.5998127460479736, + "learning_rate": 0.0002, + "loss": 2.417, + "step": 41930 + }, + { + "epoch": 3.125186289120715, + "grad_norm": 2.6877424716949463, + "learning_rate": 0.0002, + "loss": 2.154, + "step": 41940 + }, + { + "epoch": 3.1259314456035767, + "grad_norm": 2.592630386352539, + "learning_rate": 0.0002, + "loss": 2.2968, + "step": 41950 + }, + { + "epoch": 3.1266766020864383, + "grad_norm": 2.4909517765045166, + "learning_rate": 0.0002, + "loss": 2.2333, + "step": 41960 + }, + { + "epoch": 3.1274217585692994, + "grad_norm": 2.7702407836914062, + "learning_rate": 0.0002, + "loss": 2.329, + "step": 41970 + }, + { + "epoch": 3.128166915052161, + "grad_norm": 3.2048144340515137, + "learning_rate": 0.0002, + "loss": 2.5344, + "step": 41980 + }, + { + "epoch": 3.1289120715350225, + "grad_norm": 2.452103614807129, + "learning_rate": 0.0002, + "loss": 2.3947, + "step": 41990 + }, + { + "epoch": 3.1296572280178836, + "grad_norm": 3.0650296211242676, + "learning_rate": 0.0002, + "loss": 2.6355, + "step": 42000 + }, + { + "epoch": 3.130402384500745, + "grad_norm": 3.1083576679229736, + "learning_rate": 0.0002, + "loss": 2.3897, + "step": 42010 + }, + { + "epoch": 3.1311475409836067, + "grad_norm": 2.4616737365722656, + "learning_rate": 0.0002, + "loss": 2.4941, + "step": 42020 + }, + { + "epoch": 3.131892697466468, + "grad_norm": 2.4813618659973145, + "learning_rate": 0.0002, + "loss": 2.2795, + "step": 42030 + }, + { + "epoch": 3.1326378539493294, + "grad_norm": 2.4839460849761963, + "learning_rate": 0.0002, + "loss": 2.4753, + "step": 42040 + }, + { + "epoch": 3.133383010432191, + "grad_norm": 2.5165300369262695, + "learning_rate": 0.0002, + "loss": 2.4168, + "step": 42050 + }, + { + "epoch": 3.134128166915052, + "grad_norm": 3.2887141704559326, + "learning_rate": 0.0002, + "loss": 2.2558, + "step": 42060 + }, + { + "epoch": 3.1348733233979136, + "grad_norm": 2.8972301483154297, + "learning_rate": 0.0002, + "loss": 2.5248, + "step": 42070 + }, + { + "epoch": 3.135618479880775, + "grad_norm": 2.6605656147003174, + "learning_rate": 0.0002, + "loss": 2.3511, + "step": 42080 + }, + { + "epoch": 3.1363636363636362, + "grad_norm": 2.9574508666992188, + "learning_rate": 0.0002, + "loss": 2.5589, + "step": 42090 + }, + { + "epoch": 3.137108792846498, + "grad_norm": 2.4659640789031982, + "learning_rate": 0.0002, + "loss": 2.4887, + "step": 42100 + }, + { + "epoch": 3.1378539493293593, + "grad_norm": 2.799456834793091, + "learning_rate": 0.0002, + "loss": 2.4112, + "step": 42110 + }, + { + "epoch": 3.1385991058122205, + "grad_norm": 2.674593925476074, + "learning_rate": 0.0002, + "loss": 2.472, + "step": 42120 + }, + { + "epoch": 3.139344262295082, + "grad_norm": 2.741276979446411, + "learning_rate": 0.0002, + "loss": 2.438, + "step": 42130 + }, + { + "epoch": 3.1400894187779436, + "grad_norm": 3.0024397373199463, + "learning_rate": 0.0002, + "loss": 2.4534, + "step": 42140 + }, + { + "epoch": 3.1408345752608047, + "grad_norm": 2.393557548522949, + "learning_rate": 0.0002, + "loss": 2.4285, + "step": 42150 + }, + { + "epoch": 3.1415797317436662, + "grad_norm": 2.408137798309326, + "learning_rate": 0.0002, + "loss": 2.2754, + "step": 42160 + }, + { + "epoch": 3.1423248882265273, + "grad_norm": 2.6676998138427734, + "learning_rate": 0.0002, + "loss": 2.4991, + "step": 42170 + }, + { + "epoch": 3.143070044709389, + "grad_norm": 2.7383525371551514, + "learning_rate": 0.0002, + "loss": 2.4517, + "step": 42180 + }, + { + "epoch": 3.1438152011922504, + "grad_norm": 3.209341526031494, + "learning_rate": 0.0002, + "loss": 2.4987, + "step": 42190 + }, + { + "epoch": 3.144560357675112, + "grad_norm": 2.7689366340637207, + "learning_rate": 0.0002, + "loss": 2.3316, + "step": 42200 + }, + { + "epoch": 3.145305514157973, + "grad_norm": 2.7870845794677734, + "learning_rate": 0.0002, + "loss": 2.4651, + "step": 42210 + }, + { + "epoch": 3.1460506706408347, + "grad_norm": 2.8037571907043457, + "learning_rate": 0.0002, + "loss": 2.4491, + "step": 42220 + }, + { + "epoch": 3.1467958271236958, + "grad_norm": 2.640016555786133, + "learning_rate": 0.0002, + "loss": 2.2967, + "step": 42230 + }, + { + "epoch": 3.1475409836065573, + "grad_norm": 2.958155870437622, + "learning_rate": 0.0002, + "loss": 2.1981, + "step": 42240 + }, + { + "epoch": 3.148286140089419, + "grad_norm": 3.091360569000244, + "learning_rate": 0.0002, + "loss": 2.5147, + "step": 42250 + }, + { + "epoch": 3.14903129657228, + "grad_norm": 3.4414288997650146, + "learning_rate": 0.0002, + "loss": 2.4308, + "step": 42260 + }, + { + "epoch": 3.1497764530551415, + "grad_norm": 3.074572801589966, + "learning_rate": 0.0002, + "loss": 2.5425, + "step": 42270 + }, + { + "epoch": 3.150521609538003, + "grad_norm": 2.8518035411834717, + "learning_rate": 0.0002, + "loss": 2.5321, + "step": 42280 + }, + { + "epoch": 3.151266766020864, + "grad_norm": 2.689629077911377, + "learning_rate": 0.0002, + "loss": 2.527, + "step": 42290 + }, + { + "epoch": 3.1520119225037257, + "grad_norm": 2.6013498306274414, + "learning_rate": 0.0002, + "loss": 2.3945, + "step": 42300 + }, + { + "epoch": 3.1527570789865873, + "grad_norm": 2.6785435676574707, + "learning_rate": 0.0002, + "loss": 2.5889, + "step": 42310 + }, + { + "epoch": 3.1535022354694484, + "grad_norm": 2.7143375873565674, + "learning_rate": 0.0002, + "loss": 2.4548, + "step": 42320 + }, + { + "epoch": 3.15424739195231, + "grad_norm": 2.794013261795044, + "learning_rate": 0.0002, + "loss": 2.4599, + "step": 42330 + }, + { + "epoch": 3.1549925484351715, + "grad_norm": 2.9435579776763916, + "learning_rate": 0.0002, + "loss": 2.4458, + "step": 42340 + }, + { + "epoch": 3.1557377049180326, + "grad_norm": 2.210059642791748, + "learning_rate": 0.0002, + "loss": 2.4752, + "step": 42350 + }, + { + "epoch": 3.156482861400894, + "grad_norm": 3.0881826877593994, + "learning_rate": 0.0002, + "loss": 2.3679, + "step": 42360 + }, + { + "epoch": 3.1572280178837557, + "grad_norm": 2.6826508045196533, + "learning_rate": 0.0002, + "loss": 2.3735, + "step": 42370 + }, + { + "epoch": 3.157973174366617, + "grad_norm": 2.657874822616577, + "learning_rate": 0.0002, + "loss": 2.3878, + "step": 42380 + }, + { + "epoch": 3.1587183308494784, + "grad_norm": 2.6119277477264404, + "learning_rate": 0.0002, + "loss": 2.4191, + "step": 42390 + }, + { + "epoch": 3.15946348733234, + "grad_norm": 2.929684638977051, + "learning_rate": 0.0002, + "loss": 2.3763, + "step": 42400 + }, + { + "epoch": 3.160208643815201, + "grad_norm": 2.660856246948242, + "learning_rate": 0.0002, + "loss": 2.4806, + "step": 42410 + }, + { + "epoch": 3.1609538002980626, + "grad_norm": 2.3593862056732178, + "learning_rate": 0.0002, + "loss": 2.2756, + "step": 42420 + }, + { + "epoch": 3.161698956780924, + "grad_norm": 3.1583991050720215, + "learning_rate": 0.0002, + "loss": 2.4429, + "step": 42430 + }, + { + "epoch": 3.1624441132637853, + "grad_norm": 2.3144760131835938, + "learning_rate": 0.0002, + "loss": 2.3234, + "step": 42440 + }, + { + "epoch": 3.163189269746647, + "grad_norm": 2.6270995140075684, + "learning_rate": 0.0002, + "loss": 2.4479, + "step": 42450 + }, + { + "epoch": 3.1639344262295084, + "grad_norm": 2.9927475452423096, + "learning_rate": 0.0002, + "loss": 2.4985, + "step": 42460 + }, + { + "epoch": 3.1646795827123695, + "grad_norm": 2.8037357330322266, + "learning_rate": 0.0002, + "loss": 2.3792, + "step": 42470 + }, + { + "epoch": 3.165424739195231, + "grad_norm": 2.522001028060913, + "learning_rate": 0.0002, + "loss": 2.5035, + "step": 42480 + }, + { + "epoch": 3.1661698956780926, + "grad_norm": 2.7954230308532715, + "learning_rate": 0.0002, + "loss": 2.5186, + "step": 42490 + }, + { + "epoch": 3.1669150521609537, + "grad_norm": 2.842353105545044, + "learning_rate": 0.0002, + "loss": 2.3828, + "step": 42500 + }, + { + "epoch": 3.1676602086438153, + "grad_norm": 3.6176884174346924, + "learning_rate": 0.0002, + "loss": 2.4387, + "step": 42510 + }, + { + "epoch": 3.168405365126677, + "grad_norm": 2.5962915420532227, + "learning_rate": 0.0002, + "loss": 2.3441, + "step": 42520 + }, + { + "epoch": 3.169150521609538, + "grad_norm": 2.856750726699829, + "learning_rate": 0.0002, + "loss": 2.4871, + "step": 42530 + }, + { + "epoch": 3.1698956780923995, + "grad_norm": 2.5241379737854004, + "learning_rate": 0.0002, + "loss": 2.4527, + "step": 42540 + }, + { + "epoch": 3.170640834575261, + "grad_norm": 3.149193525314331, + "learning_rate": 0.0002, + "loss": 2.4242, + "step": 42550 + }, + { + "epoch": 3.171385991058122, + "grad_norm": 2.600130081176758, + "learning_rate": 0.0002, + "loss": 2.4865, + "step": 42560 + }, + { + "epoch": 3.1721311475409837, + "grad_norm": 2.598031520843506, + "learning_rate": 0.0002, + "loss": 2.4508, + "step": 42570 + }, + { + "epoch": 3.172876304023845, + "grad_norm": 2.6382107734680176, + "learning_rate": 0.0002, + "loss": 2.4075, + "step": 42580 + }, + { + "epoch": 3.1736214605067063, + "grad_norm": 2.6536340713500977, + "learning_rate": 0.0002, + "loss": 2.2179, + "step": 42590 + }, + { + "epoch": 3.174366616989568, + "grad_norm": 2.2409815788269043, + "learning_rate": 0.0002, + "loss": 2.4891, + "step": 42600 + }, + { + "epoch": 3.175111773472429, + "grad_norm": 2.8284525871276855, + "learning_rate": 0.0002, + "loss": 2.4104, + "step": 42610 + }, + { + "epoch": 3.1758569299552906, + "grad_norm": 2.8395347595214844, + "learning_rate": 0.0002, + "loss": 2.4732, + "step": 42620 + }, + { + "epoch": 3.176602086438152, + "grad_norm": 2.7434206008911133, + "learning_rate": 0.0002, + "loss": 2.5753, + "step": 42630 + }, + { + "epoch": 3.1773472429210132, + "grad_norm": 2.5308117866516113, + "learning_rate": 0.0002, + "loss": 2.4494, + "step": 42640 + }, + { + "epoch": 3.178092399403875, + "grad_norm": 2.765371322631836, + "learning_rate": 0.0002, + "loss": 2.2449, + "step": 42650 + }, + { + "epoch": 3.1788375558867363, + "grad_norm": 2.844912052154541, + "learning_rate": 0.0002, + "loss": 2.5911, + "step": 42660 + }, + { + "epoch": 3.1795827123695974, + "grad_norm": 2.811023712158203, + "learning_rate": 0.0002, + "loss": 2.2395, + "step": 42670 + }, + { + "epoch": 3.180327868852459, + "grad_norm": 2.2983829975128174, + "learning_rate": 0.0002, + "loss": 2.2522, + "step": 42680 + }, + { + "epoch": 3.1810730253353205, + "grad_norm": 2.5697834491729736, + "learning_rate": 0.0002, + "loss": 2.4592, + "step": 42690 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 2.6632912158966064, + "learning_rate": 0.0002, + "loss": 2.3589, + "step": 42700 + }, + { + "epoch": 3.182563338301043, + "grad_norm": 2.8424110412597656, + "learning_rate": 0.0002, + "loss": 2.4862, + "step": 42710 + }, + { + "epoch": 3.1833084947839048, + "grad_norm": 2.80009126663208, + "learning_rate": 0.0002, + "loss": 2.3216, + "step": 42720 + }, + { + "epoch": 3.184053651266766, + "grad_norm": 2.937347173690796, + "learning_rate": 0.0002, + "loss": 2.5931, + "step": 42730 + }, + { + "epoch": 3.1847988077496274, + "grad_norm": 3.170665740966797, + "learning_rate": 0.0002, + "loss": 2.3354, + "step": 42740 + }, + { + "epoch": 3.185543964232489, + "grad_norm": 2.7834672927856445, + "learning_rate": 0.0002, + "loss": 2.3822, + "step": 42750 + }, + { + "epoch": 3.18628912071535, + "grad_norm": 2.834599733352661, + "learning_rate": 0.0002, + "loss": 2.5047, + "step": 42760 + }, + { + "epoch": 3.1870342771982116, + "grad_norm": 2.9384005069732666, + "learning_rate": 0.0002, + "loss": 2.4439, + "step": 42770 + }, + { + "epoch": 3.187779433681073, + "grad_norm": 2.7702577114105225, + "learning_rate": 0.0002, + "loss": 2.4609, + "step": 42780 + }, + { + "epoch": 3.1885245901639343, + "grad_norm": 2.811617612838745, + "learning_rate": 0.0002, + "loss": 2.5652, + "step": 42790 + }, + { + "epoch": 3.189269746646796, + "grad_norm": 2.532407283782959, + "learning_rate": 0.0002, + "loss": 2.3831, + "step": 42800 + }, + { + "epoch": 3.1900149031296574, + "grad_norm": 2.6427395343780518, + "learning_rate": 0.0002, + "loss": 2.5223, + "step": 42810 + }, + { + "epoch": 3.1907600596125185, + "grad_norm": 2.6416213512420654, + "learning_rate": 0.0002, + "loss": 2.2635, + "step": 42820 + }, + { + "epoch": 3.19150521609538, + "grad_norm": 2.5367422103881836, + "learning_rate": 0.0002, + "loss": 2.2955, + "step": 42830 + }, + { + "epoch": 3.1922503725782416, + "grad_norm": 2.593743085861206, + "learning_rate": 0.0002, + "loss": 2.5609, + "step": 42840 + }, + { + "epoch": 3.1929955290611027, + "grad_norm": 2.4231393337249756, + "learning_rate": 0.0002, + "loss": 2.2702, + "step": 42850 + }, + { + "epoch": 3.1937406855439643, + "grad_norm": 2.6415021419525146, + "learning_rate": 0.0002, + "loss": 2.4803, + "step": 42860 + }, + { + "epoch": 3.194485842026826, + "grad_norm": 3.0961923599243164, + "learning_rate": 0.0002, + "loss": 2.5815, + "step": 42870 + }, + { + "epoch": 3.195230998509687, + "grad_norm": 2.677943229675293, + "learning_rate": 0.0002, + "loss": 2.4246, + "step": 42880 + }, + { + "epoch": 3.1959761549925485, + "grad_norm": 2.9092931747436523, + "learning_rate": 0.0002, + "loss": 2.4639, + "step": 42890 + }, + { + "epoch": 3.19672131147541, + "grad_norm": 2.8990132808685303, + "learning_rate": 0.0002, + "loss": 2.5542, + "step": 42900 + }, + { + "epoch": 3.197466467958271, + "grad_norm": 2.2910470962524414, + "learning_rate": 0.0002, + "loss": 2.5081, + "step": 42910 + }, + { + "epoch": 3.1982116244411327, + "grad_norm": 2.7519373893737793, + "learning_rate": 0.0002, + "loss": 2.2503, + "step": 42920 + }, + { + "epoch": 3.198956780923994, + "grad_norm": 2.626312017440796, + "learning_rate": 0.0002, + "loss": 2.3392, + "step": 42930 + }, + { + "epoch": 3.1997019374068554, + "grad_norm": 2.6032698154449463, + "learning_rate": 0.0002, + "loss": 2.4991, + "step": 42940 + }, + { + "epoch": 3.200447093889717, + "grad_norm": 2.6117911338806152, + "learning_rate": 0.0002, + "loss": 2.6928, + "step": 42950 + }, + { + "epoch": 3.201192250372578, + "grad_norm": 2.7212131023406982, + "learning_rate": 0.0002, + "loss": 2.0789, + "step": 42960 + }, + { + "epoch": 3.2019374068554396, + "grad_norm": 2.6502857208251953, + "learning_rate": 0.0002, + "loss": 2.5775, + "step": 42970 + }, + { + "epoch": 3.202682563338301, + "grad_norm": 2.800010919570923, + "learning_rate": 0.0002, + "loss": 2.4754, + "step": 42980 + }, + { + "epoch": 3.2034277198211623, + "grad_norm": 2.5369884967803955, + "learning_rate": 0.0002, + "loss": 2.4423, + "step": 42990 + }, + { + "epoch": 3.204172876304024, + "grad_norm": 3.0359115600585938, + "learning_rate": 0.0002, + "loss": 2.4593, + "step": 43000 + }, + { + "epoch": 3.2049180327868854, + "grad_norm": 2.6096441745758057, + "learning_rate": 0.0002, + "loss": 2.3148, + "step": 43010 + }, + { + "epoch": 3.2056631892697465, + "grad_norm": 3.0144314765930176, + "learning_rate": 0.0002, + "loss": 2.3979, + "step": 43020 + }, + { + "epoch": 3.206408345752608, + "grad_norm": 2.607226610183716, + "learning_rate": 0.0002, + "loss": 2.5429, + "step": 43030 + }, + { + "epoch": 3.2071535022354696, + "grad_norm": 3.2121124267578125, + "learning_rate": 0.0002, + "loss": 2.5202, + "step": 43040 + }, + { + "epoch": 3.2078986587183307, + "grad_norm": 2.5059688091278076, + "learning_rate": 0.0002, + "loss": 2.4008, + "step": 43050 + }, + { + "epoch": 3.2086438152011922, + "grad_norm": 2.7579286098480225, + "learning_rate": 0.0002, + "loss": 2.4386, + "step": 43060 + }, + { + "epoch": 3.209388971684054, + "grad_norm": 2.520742654800415, + "learning_rate": 0.0002, + "loss": 2.3709, + "step": 43070 + }, + { + "epoch": 3.210134128166915, + "grad_norm": 2.72102952003479, + "learning_rate": 0.0002, + "loss": 2.1754, + "step": 43080 + }, + { + "epoch": 3.2108792846497765, + "grad_norm": 2.840904474258423, + "learning_rate": 0.0002, + "loss": 2.5463, + "step": 43090 + }, + { + "epoch": 3.211624441132638, + "grad_norm": 2.4814138412475586, + "learning_rate": 0.0002, + "loss": 2.3215, + "step": 43100 + }, + { + "epoch": 3.212369597615499, + "grad_norm": 2.4107296466827393, + "learning_rate": 0.0002, + "loss": 2.4029, + "step": 43110 + }, + { + "epoch": 3.2131147540983607, + "grad_norm": 2.8633036613464355, + "learning_rate": 0.0002, + "loss": 2.2321, + "step": 43120 + }, + { + "epoch": 3.2138599105812222, + "grad_norm": 2.631967067718506, + "learning_rate": 0.0002, + "loss": 2.2269, + "step": 43130 + }, + { + "epoch": 3.2146050670640833, + "grad_norm": 2.8928747177124023, + "learning_rate": 0.0002, + "loss": 2.4295, + "step": 43140 + }, + { + "epoch": 3.215350223546945, + "grad_norm": 2.65671443939209, + "learning_rate": 0.0002, + "loss": 2.4534, + "step": 43150 + }, + { + "epoch": 3.2160953800298064, + "grad_norm": 2.9897003173828125, + "learning_rate": 0.0002, + "loss": 2.5198, + "step": 43160 + }, + { + "epoch": 3.2168405365126675, + "grad_norm": 2.7278244495391846, + "learning_rate": 0.0002, + "loss": 2.4829, + "step": 43170 + }, + { + "epoch": 3.217585692995529, + "grad_norm": 2.805060625076294, + "learning_rate": 0.0002, + "loss": 2.3381, + "step": 43180 + }, + { + "epoch": 3.2183308494783907, + "grad_norm": 2.806382656097412, + "learning_rate": 0.0002, + "loss": 2.3296, + "step": 43190 + }, + { + "epoch": 3.2190760059612518, + "grad_norm": 3.0220911502838135, + "learning_rate": 0.0002, + "loss": 2.514, + "step": 43200 + }, + { + "epoch": 3.2198211624441133, + "grad_norm": 2.6727559566497803, + "learning_rate": 0.0002, + "loss": 2.4302, + "step": 43210 + }, + { + "epoch": 3.220566318926975, + "grad_norm": 2.733079433441162, + "learning_rate": 0.0002, + "loss": 2.5175, + "step": 43220 + }, + { + "epoch": 3.221311475409836, + "grad_norm": 2.369826555252075, + "learning_rate": 0.0002, + "loss": 2.1816, + "step": 43230 + }, + { + "epoch": 3.2220566318926975, + "grad_norm": 2.4770312309265137, + "learning_rate": 0.0002, + "loss": 2.6042, + "step": 43240 + }, + { + "epoch": 3.222801788375559, + "grad_norm": 2.658463954925537, + "learning_rate": 0.0002, + "loss": 2.3651, + "step": 43250 + }, + { + "epoch": 3.22354694485842, + "grad_norm": 2.8133113384246826, + "learning_rate": 0.0002, + "loss": 2.585, + "step": 43260 + }, + { + "epoch": 3.2242921013412817, + "grad_norm": 2.6553611755371094, + "learning_rate": 0.0002, + "loss": 2.3669, + "step": 43270 + }, + { + "epoch": 3.225037257824143, + "grad_norm": 2.66679048538208, + "learning_rate": 0.0002, + "loss": 2.6358, + "step": 43280 + }, + { + "epoch": 3.2257824143070044, + "grad_norm": 2.5681023597717285, + "learning_rate": 0.0002, + "loss": 2.3225, + "step": 43290 + }, + { + "epoch": 3.226527570789866, + "grad_norm": 3.2260591983795166, + "learning_rate": 0.0002, + "loss": 2.4329, + "step": 43300 + }, + { + "epoch": 3.227272727272727, + "grad_norm": 2.7905280590057373, + "learning_rate": 0.0002, + "loss": 2.4839, + "step": 43310 + }, + { + "epoch": 3.2280178837555886, + "grad_norm": 3.111307382583618, + "learning_rate": 0.0002, + "loss": 2.2841, + "step": 43320 + }, + { + "epoch": 3.22876304023845, + "grad_norm": 2.8080315589904785, + "learning_rate": 0.0002, + "loss": 2.4087, + "step": 43330 + }, + { + "epoch": 3.2295081967213113, + "grad_norm": 2.8237740993499756, + "learning_rate": 0.0002, + "loss": 2.4916, + "step": 43340 + }, + { + "epoch": 3.230253353204173, + "grad_norm": 2.3456661701202393, + "learning_rate": 0.0002, + "loss": 2.3679, + "step": 43350 + }, + { + "epoch": 3.2309985096870344, + "grad_norm": 2.5504918098449707, + "learning_rate": 0.0002, + "loss": 2.4728, + "step": 43360 + }, + { + "epoch": 3.2317436661698955, + "grad_norm": 2.742598295211792, + "learning_rate": 0.0002, + "loss": 2.5054, + "step": 43370 + }, + { + "epoch": 3.232488822652757, + "grad_norm": 2.7378361225128174, + "learning_rate": 0.0002, + "loss": 2.4379, + "step": 43380 + }, + { + "epoch": 3.2332339791356186, + "grad_norm": 2.609257936477661, + "learning_rate": 0.0002, + "loss": 2.5783, + "step": 43390 + }, + { + "epoch": 3.2339791356184797, + "grad_norm": 2.4723076820373535, + "learning_rate": 0.0002, + "loss": 2.4199, + "step": 43400 + }, + { + "epoch": 3.2347242921013413, + "grad_norm": 2.965663194656372, + "learning_rate": 0.0002, + "loss": 2.4963, + "step": 43410 + }, + { + "epoch": 3.235469448584203, + "grad_norm": 2.788440704345703, + "learning_rate": 0.0002, + "loss": 2.4947, + "step": 43420 + }, + { + "epoch": 3.236214605067064, + "grad_norm": 2.882891893386841, + "learning_rate": 0.0002, + "loss": 2.4839, + "step": 43430 + }, + { + "epoch": 3.2369597615499255, + "grad_norm": 2.6174352169036865, + "learning_rate": 0.0002, + "loss": 2.332, + "step": 43440 + }, + { + "epoch": 3.237704918032787, + "grad_norm": 2.465498685836792, + "learning_rate": 0.0002, + "loss": 2.2894, + "step": 43450 + }, + { + "epoch": 3.238450074515648, + "grad_norm": 2.473252773284912, + "learning_rate": 0.0002, + "loss": 2.4923, + "step": 43460 + }, + { + "epoch": 3.2391952309985097, + "grad_norm": 2.660106897354126, + "learning_rate": 0.0002, + "loss": 2.3363, + "step": 43470 + }, + { + "epoch": 3.2399403874813713, + "grad_norm": 2.8930771350860596, + "learning_rate": 0.0002, + "loss": 2.5892, + "step": 43480 + }, + { + "epoch": 3.2406855439642324, + "grad_norm": 2.360569953918457, + "learning_rate": 0.0002, + "loss": 2.4016, + "step": 43490 + }, + { + "epoch": 3.241430700447094, + "grad_norm": 2.7150566577911377, + "learning_rate": 0.0002, + "loss": 2.1884, + "step": 43500 + }, + { + "epoch": 3.2421758569299555, + "grad_norm": 3.106088876724243, + "learning_rate": 0.0002, + "loss": 2.5107, + "step": 43510 + }, + { + "epoch": 3.2429210134128166, + "grad_norm": 2.799938201904297, + "learning_rate": 0.0002, + "loss": 2.5407, + "step": 43520 + }, + { + "epoch": 3.243666169895678, + "grad_norm": 2.440535306930542, + "learning_rate": 0.0002, + "loss": 2.5892, + "step": 43530 + }, + { + "epoch": 3.2444113263785397, + "grad_norm": 2.66219162940979, + "learning_rate": 0.0002, + "loss": 2.3643, + "step": 43540 + }, + { + "epoch": 3.245156482861401, + "grad_norm": 2.7151572704315186, + "learning_rate": 0.0002, + "loss": 2.3527, + "step": 43550 + }, + { + "epoch": 3.2459016393442623, + "grad_norm": 3.217926025390625, + "learning_rate": 0.0002, + "loss": 2.3433, + "step": 43560 + }, + { + "epoch": 3.246646795827124, + "grad_norm": 2.634303569793701, + "learning_rate": 0.0002, + "loss": 2.3928, + "step": 43570 + }, + { + "epoch": 3.247391952309985, + "grad_norm": 2.9070944786071777, + "learning_rate": 0.0002, + "loss": 2.4142, + "step": 43580 + }, + { + "epoch": 3.2481371087928466, + "grad_norm": 3.3297340869903564, + "learning_rate": 0.0002, + "loss": 2.4862, + "step": 43590 + }, + { + "epoch": 3.248882265275708, + "grad_norm": 2.632563591003418, + "learning_rate": 0.0002, + "loss": 2.5569, + "step": 43600 + }, + { + "epoch": 3.2496274217585692, + "grad_norm": 3.3055930137634277, + "learning_rate": 0.0002, + "loss": 2.4756, + "step": 43610 + }, + { + "epoch": 3.2503725782414308, + "grad_norm": 2.854285955429077, + "learning_rate": 0.0002, + "loss": 2.5673, + "step": 43620 + }, + { + "epoch": 3.251117734724292, + "grad_norm": 2.5545718669891357, + "learning_rate": 0.0002, + "loss": 2.4522, + "step": 43630 + }, + { + "epoch": 3.2518628912071534, + "grad_norm": 2.046332836151123, + "learning_rate": 0.0002, + "loss": 2.2818, + "step": 43640 + }, + { + "epoch": 3.252608047690015, + "grad_norm": 2.8132576942443848, + "learning_rate": 0.0002, + "loss": 2.4598, + "step": 43650 + }, + { + "epoch": 3.2533532041728765, + "grad_norm": 3.0209426879882812, + "learning_rate": 0.0002, + "loss": 2.5342, + "step": 43660 + }, + { + "epoch": 3.2540983606557377, + "grad_norm": 2.7068018913269043, + "learning_rate": 0.0002, + "loss": 2.3042, + "step": 43670 + }, + { + "epoch": 3.254843517138599, + "grad_norm": 2.8782992362976074, + "learning_rate": 0.0002, + "loss": 2.4472, + "step": 43680 + }, + { + "epoch": 3.2555886736214603, + "grad_norm": 2.6785826683044434, + "learning_rate": 0.0002, + "loss": 2.4746, + "step": 43690 + }, + { + "epoch": 3.256333830104322, + "grad_norm": 2.71146821975708, + "learning_rate": 0.0002, + "loss": 2.5153, + "step": 43700 + }, + { + "epoch": 3.2570789865871834, + "grad_norm": 2.8306570053100586, + "learning_rate": 0.0002, + "loss": 2.3877, + "step": 43710 + }, + { + "epoch": 3.2578241430700445, + "grad_norm": 2.9262166023254395, + "learning_rate": 0.0002, + "loss": 2.5242, + "step": 43720 + }, + { + "epoch": 3.258569299552906, + "grad_norm": 2.766948938369751, + "learning_rate": 0.0002, + "loss": 2.3074, + "step": 43730 + }, + { + "epoch": 3.2593144560357676, + "grad_norm": 2.673241376876831, + "learning_rate": 0.0002, + "loss": 2.4664, + "step": 43740 + }, + { + "epoch": 3.2600596125186287, + "grad_norm": 2.917844295501709, + "learning_rate": 0.0002, + "loss": 2.3575, + "step": 43750 + }, + { + "epoch": 3.2608047690014903, + "grad_norm": 2.763775587081909, + "learning_rate": 0.0002, + "loss": 2.4072, + "step": 43760 + }, + { + "epoch": 3.261549925484352, + "grad_norm": 2.166604518890381, + "learning_rate": 0.0002, + "loss": 2.3257, + "step": 43770 + }, + { + "epoch": 3.262295081967213, + "grad_norm": 2.9719974994659424, + "learning_rate": 0.0002, + "loss": 2.3358, + "step": 43780 + }, + { + "epoch": 3.2630402384500745, + "grad_norm": 3.200921058654785, + "learning_rate": 0.0002, + "loss": 2.5461, + "step": 43790 + }, + { + "epoch": 3.263785394932936, + "grad_norm": 2.6361167430877686, + "learning_rate": 0.0002, + "loss": 2.6172, + "step": 43800 + }, + { + "epoch": 3.264530551415797, + "grad_norm": 2.4378700256347656, + "learning_rate": 0.0002, + "loss": 2.4782, + "step": 43810 + }, + { + "epoch": 3.2652757078986587, + "grad_norm": 2.8756327629089355, + "learning_rate": 0.0002, + "loss": 2.6013, + "step": 43820 + }, + { + "epoch": 3.2660208643815203, + "grad_norm": 2.74397611618042, + "learning_rate": 0.0002, + "loss": 2.6196, + "step": 43830 + }, + { + "epoch": 3.2667660208643814, + "grad_norm": 2.8466079235076904, + "learning_rate": 0.0002, + "loss": 2.4868, + "step": 43840 + }, + { + "epoch": 3.267511177347243, + "grad_norm": 2.6650137901306152, + "learning_rate": 0.0002, + "loss": 2.461, + "step": 43850 + }, + { + "epoch": 3.2682563338301045, + "grad_norm": 2.747535228729248, + "learning_rate": 0.0002, + "loss": 2.3765, + "step": 43860 + }, + { + "epoch": 3.2690014903129656, + "grad_norm": 2.7829294204711914, + "learning_rate": 0.0002, + "loss": 2.4128, + "step": 43870 + }, + { + "epoch": 3.269746646795827, + "grad_norm": 2.3735265731811523, + "learning_rate": 0.0002, + "loss": 2.5265, + "step": 43880 + }, + { + "epoch": 3.2704918032786887, + "grad_norm": 2.747103214263916, + "learning_rate": 0.0002, + "loss": 2.5282, + "step": 43890 + }, + { + "epoch": 3.27123695976155, + "grad_norm": 2.760833740234375, + "learning_rate": 0.0002, + "loss": 2.4452, + "step": 43900 + }, + { + "epoch": 3.2719821162444114, + "grad_norm": 2.7421011924743652, + "learning_rate": 0.0002, + "loss": 2.4462, + "step": 43910 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 2.950409412384033, + "learning_rate": 0.0002, + "loss": 2.5802, + "step": 43920 + }, + { + "epoch": 3.273472429210134, + "grad_norm": 2.5211143493652344, + "learning_rate": 0.0002, + "loss": 2.3641, + "step": 43930 + }, + { + "epoch": 3.2742175856929956, + "grad_norm": 3.1419506072998047, + "learning_rate": 0.0002, + "loss": 2.4876, + "step": 43940 + }, + { + "epoch": 3.274962742175857, + "grad_norm": 2.7190613746643066, + "learning_rate": 0.0002, + "loss": 2.4003, + "step": 43950 + }, + { + "epoch": 3.2757078986587183, + "grad_norm": 2.7831997871398926, + "learning_rate": 0.0002, + "loss": 2.4456, + "step": 43960 + }, + { + "epoch": 3.27645305514158, + "grad_norm": 2.8021488189697266, + "learning_rate": 0.0002, + "loss": 2.3307, + "step": 43970 + }, + { + "epoch": 3.277198211624441, + "grad_norm": 2.8400144577026367, + "learning_rate": 0.0002, + "loss": 2.4423, + "step": 43980 + }, + { + "epoch": 3.2779433681073025, + "grad_norm": 2.451012134552002, + "learning_rate": 0.0002, + "loss": 2.3158, + "step": 43990 + }, + { + "epoch": 3.278688524590164, + "grad_norm": 2.821969985961914, + "learning_rate": 0.0002, + "loss": 2.318, + "step": 44000 + }, + { + "epoch": 3.2794336810730256, + "grad_norm": 2.844849109649658, + "learning_rate": 0.0002, + "loss": 2.4889, + "step": 44010 + }, + { + "epoch": 3.2801788375558867, + "grad_norm": 2.737499237060547, + "learning_rate": 0.0002, + "loss": 2.4165, + "step": 44020 + }, + { + "epoch": 3.2809239940387482, + "grad_norm": 2.477479934692383, + "learning_rate": 0.0002, + "loss": 2.4084, + "step": 44030 + }, + { + "epoch": 3.2816691505216093, + "grad_norm": 2.523198366165161, + "learning_rate": 0.0002, + "loss": 2.4999, + "step": 44040 + }, + { + "epoch": 3.282414307004471, + "grad_norm": 2.695603609085083, + "learning_rate": 0.0002, + "loss": 2.3774, + "step": 44050 + }, + { + "epoch": 3.2831594634873325, + "grad_norm": 3.03360652923584, + "learning_rate": 0.0002, + "loss": 2.2998, + "step": 44060 + }, + { + "epoch": 3.2839046199701936, + "grad_norm": 2.572307825088501, + "learning_rate": 0.0002, + "loss": 2.3314, + "step": 44070 + }, + { + "epoch": 3.284649776453055, + "grad_norm": 2.493046283721924, + "learning_rate": 0.0002, + "loss": 2.5724, + "step": 44080 + }, + { + "epoch": 3.2853949329359167, + "grad_norm": 2.80645489692688, + "learning_rate": 0.0002, + "loss": 2.4075, + "step": 44090 + }, + { + "epoch": 3.2861400894187778, + "grad_norm": 2.564210891723633, + "learning_rate": 0.0002, + "loss": 2.4697, + "step": 44100 + }, + { + "epoch": 3.2868852459016393, + "grad_norm": 2.9380991458892822, + "learning_rate": 0.0002, + "loss": 2.4112, + "step": 44110 + }, + { + "epoch": 3.287630402384501, + "grad_norm": 2.7709860801696777, + "learning_rate": 0.0002, + "loss": 2.5449, + "step": 44120 + }, + { + "epoch": 3.288375558867362, + "grad_norm": 2.371518611907959, + "learning_rate": 0.0002, + "loss": 2.2587, + "step": 44130 + }, + { + "epoch": 3.2891207153502235, + "grad_norm": 2.40743350982666, + "learning_rate": 0.0002, + "loss": 2.4445, + "step": 44140 + }, + { + "epoch": 3.289865871833085, + "grad_norm": 3.0384521484375, + "learning_rate": 0.0002, + "loss": 2.3444, + "step": 44150 + }, + { + "epoch": 3.290611028315946, + "grad_norm": 2.498652935028076, + "learning_rate": 0.0002, + "loss": 2.4962, + "step": 44160 + }, + { + "epoch": 3.2913561847988078, + "grad_norm": 3.0309088230133057, + "learning_rate": 0.0002, + "loss": 2.4336, + "step": 44170 + }, + { + "epoch": 3.2921013412816693, + "grad_norm": 2.711760997772217, + "learning_rate": 0.0002, + "loss": 2.3514, + "step": 44180 + }, + { + "epoch": 3.2928464977645304, + "grad_norm": 2.884289264678955, + "learning_rate": 0.0002, + "loss": 2.5209, + "step": 44190 + }, + { + "epoch": 3.293591654247392, + "grad_norm": 2.5556466579437256, + "learning_rate": 0.0002, + "loss": 2.4795, + "step": 44200 + }, + { + "epoch": 3.2943368107302535, + "grad_norm": 2.5224225521087646, + "learning_rate": 0.0002, + "loss": 2.3033, + "step": 44210 + }, + { + "epoch": 3.2950819672131146, + "grad_norm": 2.700383424758911, + "learning_rate": 0.0002, + "loss": 2.4198, + "step": 44220 + }, + { + "epoch": 3.295827123695976, + "grad_norm": 2.7120893001556396, + "learning_rate": 0.0002, + "loss": 2.3386, + "step": 44230 + }, + { + "epoch": 3.2965722801788377, + "grad_norm": 2.6876585483551025, + "learning_rate": 0.0002, + "loss": 2.3047, + "step": 44240 + }, + { + "epoch": 3.297317436661699, + "grad_norm": 2.7172040939331055, + "learning_rate": 0.0002, + "loss": 2.5499, + "step": 44250 + }, + { + "epoch": 3.2980625931445604, + "grad_norm": 2.645282506942749, + "learning_rate": 0.0002, + "loss": 2.4294, + "step": 44260 + }, + { + "epoch": 3.2988077496274215, + "grad_norm": 2.8003482818603516, + "learning_rate": 0.0002, + "loss": 2.4373, + "step": 44270 + }, + { + "epoch": 3.299552906110283, + "grad_norm": 2.8162996768951416, + "learning_rate": 0.0002, + "loss": 2.4679, + "step": 44280 + }, + { + "epoch": 3.3002980625931446, + "grad_norm": 2.7134716510772705, + "learning_rate": 0.0002, + "loss": 2.3455, + "step": 44290 + }, + { + "epoch": 3.301043219076006, + "grad_norm": 2.854041337966919, + "learning_rate": 0.0002, + "loss": 2.5039, + "step": 44300 + }, + { + "epoch": 3.3017883755588673, + "grad_norm": 2.794156789779663, + "learning_rate": 0.0002, + "loss": 2.6137, + "step": 44310 + }, + { + "epoch": 3.302533532041729, + "grad_norm": 2.626420021057129, + "learning_rate": 0.0002, + "loss": 2.5824, + "step": 44320 + }, + { + "epoch": 3.30327868852459, + "grad_norm": 2.318218946456909, + "learning_rate": 0.0002, + "loss": 2.6067, + "step": 44330 + }, + { + "epoch": 3.3040238450074515, + "grad_norm": 2.7970023155212402, + "learning_rate": 0.0002, + "loss": 2.4504, + "step": 44340 + }, + { + "epoch": 3.304769001490313, + "grad_norm": 2.773334503173828, + "learning_rate": 0.0002, + "loss": 2.6143, + "step": 44350 + }, + { + "epoch": 3.3055141579731746, + "grad_norm": 2.5117685794830322, + "learning_rate": 0.0002, + "loss": 2.2903, + "step": 44360 + }, + { + "epoch": 3.3062593144560357, + "grad_norm": 2.768136501312256, + "learning_rate": 0.0002, + "loss": 2.4649, + "step": 44370 + }, + { + "epoch": 3.3070044709388973, + "grad_norm": 2.2928473949432373, + "learning_rate": 0.0002, + "loss": 2.4589, + "step": 44380 + }, + { + "epoch": 3.3077496274217584, + "grad_norm": 2.4605629444122314, + "learning_rate": 0.0002, + "loss": 2.324, + "step": 44390 + }, + { + "epoch": 3.30849478390462, + "grad_norm": 2.7441422939300537, + "learning_rate": 0.0002, + "loss": 2.333, + "step": 44400 + }, + { + "epoch": 3.3092399403874815, + "grad_norm": 2.4191830158233643, + "learning_rate": 0.0002, + "loss": 2.5422, + "step": 44410 + }, + { + "epoch": 3.3099850968703426, + "grad_norm": 2.452444314956665, + "learning_rate": 0.0002, + "loss": 2.377, + "step": 44420 + }, + { + "epoch": 3.310730253353204, + "grad_norm": 2.9849095344543457, + "learning_rate": 0.0002, + "loss": 2.5103, + "step": 44430 + }, + { + "epoch": 3.3114754098360657, + "grad_norm": 2.7864060401916504, + "learning_rate": 0.0002, + "loss": 2.4538, + "step": 44440 + }, + { + "epoch": 3.312220566318927, + "grad_norm": 2.8097054958343506, + "learning_rate": 0.0002, + "loss": 2.4729, + "step": 44450 + }, + { + "epoch": 3.3129657228017884, + "grad_norm": 2.5728414058685303, + "learning_rate": 0.0002, + "loss": 2.4751, + "step": 44460 + }, + { + "epoch": 3.31371087928465, + "grad_norm": 2.7810518741607666, + "learning_rate": 0.0002, + "loss": 2.5308, + "step": 44470 + }, + { + "epoch": 3.314456035767511, + "grad_norm": 2.845767021179199, + "learning_rate": 0.0002, + "loss": 2.4061, + "step": 44480 + }, + { + "epoch": 3.3152011922503726, + "grad_norm": 2.721355438232422, + "learning_rate": 0.0002, + "loss": 2.5016, + "step": 44490 + }, + { + "epoch": 3.315946348733234, + "grad_norm": 2.6908435821533203, + "learning_rate": 0.0002, + "loss": 2.576, + "step": 44500 + }, + { + "epoch": 3.3166915052160952, + "grad_norm": 2.6813907623291016, + "learning_rate": 0.0002, + "loss": 2.6311, + "step": 44510 + }, + { + "epoch": 3.317436661698957, + "grad_norm": 3.2077786922454834, + "learning_rate": 0.0002, + "loss": 2.5925, + "step": 44520 + }, + { + "epoch": 3.3181818181818183, + "grad_norm": 2.6613762378692627, + "learning_rate": 0.0002, + "loss": 2.506, + "step": 44530 + }, + { + "epoch": 3.3189269746646795, + "grad_norm": 2.0632989406585693, + "learning_rate": 0.0002, + "loss": 2.3812, + "step": 44540 + }, + { + "epoch": 3.319672131147541, + "grad_norm": 2.6608712673187256, + "learning_rate": 0.0002, + "loss": 2.319, + "step": 44550 + }, + { + "epoch": 3.3204172876304026, + "grad_norm": 3.02986216545105, + "learning_rate": 0.0002, + "loss": 2.2761, + "step": 44560 + }, + { + "epoch": 3.3211624441132637, + "grad_norm": 2.6622636318206787, + "learning_rate": 0.0002, + "loss": 2.4661, + "step": 44570 + }, + { + "epoch": 3.321907600596125, + "grad_norm": 2.753340721130371, + "learning_rate": 0.0002, + "loss": 2.4398, + "step": 44580 + }, + { + "epoch": 3.3226527570789868, + "grad_norm": 2.8317463397979736, + "learning_rate": 0.0002, + "loss": 2.4028, + "step": 44590 + }, + { + "epoch": 3.323397913561848, + "grad_norm": 2.6618127822875977, + "learning_rate": 0.0002, + "loss": 2.469, + "step": 44600 + }, + { + "epoch": 3.3241430700447094, + "grad_norm": 2.7465057373046875, + "learning_rate": 0.0002, + "loss": 2.6312, + "step": 44610 + }, + { + "epoch": 3.3248882265275705, + "grad_norm": 2.6905951499938965, + "learning_rate": 0.0002, + "loss": 2.5652, + "step": 44620 + }, + { + "epoch": 3.325633383010432, + "grad_norm": 2.609553337097168, + "learning_rate": 0.0002, + "loss": 2.4733, + "step": 44630 + }, + { + "epoch": 3.3263785394932937, + "grad_norm": 3.0250022411346436, + "learning_rate": 0.0002, + "loss": 2.2852, + "step": 44640 + }, + { + "epoch": 3.327123695976155, + "grad_norm": 2.7133560180664062, + "learning_rate": 0.0002, + "loss": 2.6097, + "step": 44650 + }, + { + "epoch": 3.3278688524590163, + "grad_norm": 2.479445695877075, + "learning_rate": 0.0002, + "loss": 2.4231, + "step": 44660 + }, + { + "epoch": 3.328614008941878, + "grad_norm": 2.7370223999023438, + "learning_rate": 0.0002, + "loss": 2.359, + "step": 44670 + }, + { + "epoch": 3.329359165424739, + "grad_norm": 2.5486905574798584, + "learning_rate": 0.0002, + "loss": 2.5112, + "step": 44680 + }, + { + "epoch": 3.3301043219076005, + "grad_norm": 2.0843801498413086, + "learning_rate": 0.0002, + "loss": 2.4368, + "step": 44690 + }, + { + "epoch": 3.330849478390462, + "grad_norm": 2.692003011703491, + "learning_rate": 0.0002, + "loss": 2.4591, + "step": 44700 + }, + { + "epoch": 3.3315946348733236, + "grad_norm": 2.4572720527648926, + "learning_rate": 0.0002, + "loss": 2.521, + "step": 44710 + }, + { + "epoch": 3.3323397913561847, + "grad_norm": 2.675017833709717, + "learning_rate": 0.0002, + "loss": 2.5644, + "step": 44720 + }, + { + "epoch": 3.3330849478390463, + "grad_norm": 2.6414129734039307, + "learning_rate": 0.0002, + "loss": 2.601, + "step": 44730 + }, + { + "epoch": 3.3338301043219074, + "grad_norm": 2.5128352642059326, + "learning_rate": 0.0002, + "loss": 2.4304, + "step": 44740 + }, + { + "epoch": 3.334575260804769, + "grad_norm": 2.5863492488861084, + "learning_rate": 0.0002, + "loss": 2.3684, + "step": 44750 + }, + { + "epoch": 3.3353204172876305, + "grad_norm": 2.2314746379852295, + "learning_rate": 0.0002, + "loss": 2.5097, + "step": 44760 + }, + { + "epoch": 3.3360655737704916, + "grad_norm": 2.5924713611602783, + "learning_rate": 0.0002, + "loss": 2.5148, + "step": 44770 + }, + { + "epoch": 3.336810730253353, + "grad_norm": 2.298295497894287, + "learning_rate": 0.0002, + "loss": 2.3235, + "step": 44780 + }, + { + "epoch": 3.3375558867362147, + "grad_norm": 2.701958417892456, + "learning_rate": 0.0002, + "loss": 2.4991, + "step": 44790 + }, + { + "epoch": 3.338301043219076, + "grad_norm": 2.766066074371338, + "learning_rate": 0.0002, + "loss": 2.4484, + "step": 44800 + }, + { + "epoch": 3.3390461997019374, + "grad_norm": 2.684408187866211, + "learning_rate": 0.0002, + "loss": 2.5959, + "step": 44810 + }, + { + "epoch": 3.339791356184799, + "grad_norm": 2.5606040954589844, + "learning_rate": 0.0002, + "loss": 2.3181, + "step": 44820 + }, + { + "epoch": 3.34053651266766, + "grad_norm": 2.261032819747925, + "learning_rate": 0.0002, + "loss": 2.3889, + "step": 44830 + }, + { + "epoch": 3.3412816691505216, + "grad_norm": 2.6086981296539307, + "learning_rate": 0.0002, + "loss": 2.3264, + "step": 44840 + }, + { + "epoch": 3.342026825633383, + "grad_norm": 2.7089896202087402, + "learning_rate": 0.0002, + "loss": 2.4724, + "step": 44850 + }, + { + "epoch": 3.3427719821162443, + "grad_norm": 2.4596564769744873, + "learning_rate": 0.0002, + "loss": 2.3453, + "step": 44860 + }, + { + "epoch": 3.343517138599106, + "grad_norm": 3.0206546783447266, + "learning_rate": 0.0002, + "loss": 2.274, + "step": 44870 + }, + { + "epoch": 3.3442622950819674, + "grad_norm": 2.696455717086792, + "learning_rate": 0.0002, + "loss": 2.5678, + "step": 44880 + }, + { + "epoch": 3.3450074515648285, + "grad_norm": 2.755286693572998, + "learning_rate": 0.0002, + "loss": 2.6242, + "step": 44890 + }, + { + "epoch": 3.34575260804769, + "grad_norm": 2.4545297622680664, + "learning_rate": 0.0002, + "loss": 2.4866, + "step": 44900 + }, + { + "epoch": 3.3464977645305516, + "grad_norm": 1.9408307075500488, + "learning_rate": 0.0002, + "loss": 2.2509, + "step": 44910 + }, + { + "epoch": 3.3472429210134127, + "grad_norm": 2.969522714614868, + "learning_rate": 0.0002, + "loss": 2.5758, + "step": 44920 + }, + { + "epoch": 3.3479880774962743, + "grad_norm": 2.4225828647613525, + "learning_rate": 0.0002, + "loss": 2.2521, + "step": 44930 + }, + { + "epoch": 3.348733233979136, + "grad_norm": 2.567896842956543, + "learning_rate": 0.0002, + "loss": 2.5203, + "step": 44940 + }, + { + "epoch": 3.349478390461997, + "grad_norm": 2.6760551929473877, + "learning_rate": 0.0002, + "loss": 2.4264, + "step": 44950 + }, + { + "epoch": 3.3502235469448585, + "grad_norm": 2.5255861282348633, + "learning_rate": 0.0002, + "loss": 2.5705, + "step": 44960 + }, + { + "epoch": 3.3509687034277196, + "grad_norm": 2.5304393768310547, + "learning_rate": 0.0002, + "loss": 2.3158, + "step": 44970 + }, + { + "epoch": 3.351713859910581, + "grad_norm": 2.5755674839019775, + "learning_rate": 0.0002, + "loss": 2.4185, + "step": 44980 + }, + { + "epoch": 3.3524590163934427, + "grad_norm": 2.782089948654175, + "learning_rate": 0.0002, + "loss": 2.5002, + "step": 44990 + }, + { + "epoch": 3.3532041728763042, + "grad_norm": 2.9056670665740967, + "learning_rate": 0.0002, + "loss": 2.5654, + "step": 45000 + }, + { + "epoch": 3.3539493293591653, + "grad_norm": 3.098062038421631, + "learning_rate": 0.0002, + "loss": 2.4097, + "step": 45010 + }, + { + "epoch": 3.354694485842027, + "grad_norm": 2.8015189170837402, + "learning_rate": 0.0002, + "loss": 2.4039, + "step": 45020 + }, + { + "epoch": 3.355439642324888, + "grad_norm": 2.5579729080200195, + "learning_rate": 0.0002, + "loss": 2.4874, + "step": 45030 + }, + { + "epoch": 3.3561847988077496, + "grad_norm": 2.528961181640625, + "learning_rate": 0.0002, + "loss": 2.4139, + "step": 45040 + }, + { + "epoch": 3.356929955290611, + "grad_norm": 2.753173828125, + "learning_rate": 0.0002, + "loss": 2.2572, + "step": 45050 + }, + { + "epoch": 3.3576751117734727, + "grad_norm": 2.733590841293335, + "learning_rate": 0.0002, + "loss": 2.571, + "step": 45060 + }, + { + "epoch": 3.3584202682563338, + "grad_norm": 2.6290042400360107, + "learning_rate": 0.0002, + "loss": 2.5703, + "step": 45070 + }, + { + "epoch": 3.3591654247391953, + "grad_norm": 2.5969390869140625, + "learning_rate": 0.0002, + "loss": 2.3944, + "step": 45080 + }, + { + "epoch": 3.3599105812220564, + "grad_norm": 2.6536154747009277, + "learning_rate": 0.0002, + "loss": 2.4431, + "step": 45090 + }, + { + "epoch": 3.360655737704918, + "grad_norm": 2.6232528686523438, + "learning_rate": 0.0002, + "loss": 2.502, + "step": 45100 + }, + { + "epoch": 3.3614008941877795, + "grad_norm": 2.800161361694336, + "learning_rate": 0.0002, + "loss": 2.3932, + "step": 45110 + }, + { + "epoch": 3.3621460506706407, + "grad_norm": 2.5421383380889893, + "learning_rate": 0.0002, + "loss": 2.527, + "step": 45120 + }, + { + "epoch": 3.362891207153502, + "grad_norm": 2.968223810195923, + "learning_rate": 0.0002, + "loss": 2.6396, + "step": 45130 + }, + { + "epoch": 3.3636363636363638, + "grad_norm": 2.4359583854675293, + "learning_rate": 0.0002, + "loss": 2.3573, + "step": 45140 + }, + { + "epoch": 3.364381520119225, + "grad_norm": 2.3761839866638184, + "learning_rate": 0.0002, + "loss": 2.6412, + "step": 45150 + }, + { + "epoch": 3.3651266766020864, + "grad_norm": 2.7938098907470703, + "learning_rate": 0.0002, + "loss": 2.5008, + "step": 45160 + }, + { + "epoch": 3.365871833084948, + "grad_norm": 2.7078990936279297, + "learning_rate": 0.0002, + "loss": 2.627, + "step": 45170 + }, + { + "epoch": 3.366616989567809, + "grad_norm": 2.781672477722168, + "learning_rate": 0.0002, + "loss": 2.5451, + "step": 45180 + }, + { + "epoch": 3.3673621460506706, + "grad_norm": 2.7269418239593506, + "learning_rate": 0.0002, + "loss": 2.3616, + "step": 45190 + }, + { + "epoch": 3.368107302533532, + "grad_norm": 2.7787158489227295, + "learning_rate": 0.0002, + "loss": 2.4846, + "step": 45200 + }, + { + "epoch": 3.3688524590163933, + "grad_norm": 2.631345748901367, + "learning_rate": 0.0002, + "loss": 2.556, + "step": 45210 + }, + { + "epoch": 3.369597615499255, + "grad_norm": 2.876192092895508, + "learning_rate": 0.0002, + "loss": 2.5152, + "step": 45220 + }, + { + "epoch": 3.3703427719821164, + "grad_norm": 2.6566519737243652, + "learning_rate": 0.0002, + "loss": 2.6127, + "step": 45230 + }, + { + "epoch": 3.3710879284649775, + "grad_norm": 3.0558583736419678, + "learning_rate": 0.0002, + "loss": 2.5733, + "step": 45240 + }, + { + "epoch": 3.371833084947839, + "grad_norm": 2.9002668857574463, + "learning_rate": 0.0002, + "loss": 2.5031, + "step": 45250 + }, + { + "epoch": 3.3725782414307006, + "grad_norm": 2.6893272399902344, + "learning_rate": 0.0002, + "loss": 2.5278, + "step": 45260 + }, + { + "epoch": 3.3733233979135617, + "grad_norm": 2.7107250690460205, + "learning_rate": 0.0002, + "loss": 2.6275, + "step": 45270 + }, + { + "epoch": 3.3740685543964233, + "grad_norm": 2.816999912261963, + "learning_rate": 0.0002, + "loss": 2.5468, + "step": 45280 + }, + { + "epoch": 3.374813710879285, + "grad_norm": 2.670982599258423, + "learning_rate": 0.0002, + "loss": 2.4967, + "step": 45290 + }, + { + "epoch": 3.375558867362146, + "grad_norm": 2.483621835708618, + "learning_rate": 0.0002, + "loss": 2.2782, + "step": 45300 + }, + { + "epoch": 3.3763040238450075, + "grad_norm": 2.733739137649536, + "learning_rate": 0.0002, + "loss": 2.3869, + "step": 45310 + }, + { + "epoch": 3.3770491803278686, + "grad_norm": 2.6917941570281982, + "learning_rate": 0.0002, + "loss": 2.4299, + "step": 45320 + }, + { + "epoch": 3.37779433681073, + "grad_norm": 2.675069570541382, + "learning_rate": 0.0002, + "loss": 2.5749, + "step": 45330 + }, + { + "epoch": 3.3785394932935917, + "grad_norm": 2.6892285346984863, + "learning_rate": 0.0002, + "loss": 2.387, + "step": 45340 + }, + { + "epoch": 3.3792846497764533, + "grad_norm": 3.1701433658599854, + "learning_rate": 0.0002, + "loss": 2.5472, + "step": 45350 + }, + { + "epoch": 3.3800298062593144, + "grad_norm": 2.8027126789093018, + "learning_rate": 0.0002, + "loss": 2.4505, + "step": 45360 + }, + { + "epoch": 3.380774962742176, + "grad_norm": 3.0367190837860107, + "learning_rate": 0.0002, + "loss": 2.6113, + "step": 45370 + }, + { + "epoch": 3.381520119225037, + "grad_norm": 2.723576545715332, + "learning_rate": 0.0002, + "loss": 2.3816, + "step": 45380 + }, + { + "epoch": 3.3822652757078986, + "grad_norm": 2.6705434322357178, + "learning_rate": 0.0002, + "loss": 2.4231, + "step": 45390 + }, + { + "epoch": 3.38301043219076, + "grad_norm": 2.584972381591797, + "learning_rate": 0.0002, + "loss": 2.3369, + "step": 45400 + }, + { + "epoch": 3.3837555886736217, + "grad_norm": 2.5815274715423584, + "learning_rate": 0.0002, + "loss": 2.3362, + "step": 45410 + }, + { + "epoch": 3.384500745156483, + "grad_norm": 2.7649471759796143, + "learning_rate": 0.0002, + "loss": 2.3817, + "step": 45420 + }, + { + "epoch": 3.3852459016393444, + "grad_norm": 2.435465097427368, + "learning_rate": 0.0002, + "loss": 2.5519, + "step": 45430 + }, + { + "epoch": 3.3859910581222055, + "grad_norm": 2.7562053203582764, + "learning_rate": 0.0002, + "loss": 2.4509, + "step": 45440 + }, + { + "epoch": 3.386736214605067, + "grad_norm": 2.829549789428711, + "learning_rate": 0.0002, + "loss": 2.4191, + "step": 45450 + }, + { + "epoch": 3.3874813710879286, + "grad_norm": 2.9970972537994385, + "learning_rate": 0.0002, + "loss": 2.5063, + "step": 45460 + }, + { + "epoch": 3.3882265275707897, + "grad_norm": 3.018434524536133, + "learning_rate": 0.0002, + "loss": 2.4994, + "step": 45470 + }, + { + "epoch": 3.3889716840536512, + "grad_norm": 2.6649057865142822, + "learning_rate": 0.0002, + "loss": 2.421, + "step": 45480 + }, + { + "epoch": 3.389716840536513, + "grad_norm": 2.2870116233825684, + "learning_rate": 0.0002, + "loss": 2.5703, + "step": 45490 + }, + { + "epoch": 3.390461997019374, + "grad_norm": 2.1558291912078857, + "learning_rate": 0.0002, + "loss": 2.618, + "step": 45500 + }, + { + "epoch": 3.3912071535022354, + "grad_norm": 2.7797162532806396, + "learning_rate": 0.0002, + "loss": 2.404, + "step": 45510 + }, + { + "epoch": 3.391952309985097, + "grad_norm": 3.0129196643829346, + "learning_rate": 0.0002, + "loss": 2.3708, + "step": 45520 + }, + { + "epoch": 3.392697466467958, + "grad_norm": 2.453110694885254, + "learning_rate": 0.0002, + "loss": 2.5923, + "step": 45530 + }, + { + "epoch": 3.3934426229508197, + "grad_norm": 2.845862865447998, + "learning_rate": 0.0002, + "loss": 2.5138, + "step": 45540 + }, + { + "epoch": 3.394187779433681, + "grad_norm": 3.013814687728882, + "learning_rate": 0.0002, + "loss": 2.6143, + "step": 45550 + }, + { + "epoch": 3.3949329359165423, + "grad_norm": 2.650480270385742, + "learning_rate": 0.0002, + "loss": 2.6253, + "step": 45560 + }, + { + "epoch": 3.395678092399404, + "grad_norm": 2.6634345054626465, + "learning_rate": 0.0002, + "loss": 2.4054, + "step": 45570 + }, + { + "epoch": 3.3964232488822654, + "grad_norm": 2.5562565326690674, + "learning_rate": 0.0002, + "loss": 2.3089, + "step": 45580 + }, + { + "epoch": 3.3971684053651265, + "grad_norm": 2.798459768295288, + "learning_rate": 0.0002, + "loss": 2.3853, + "step": 45590 + }, + { + "epoch": 3.397913561847988, + "grad_norm": 2.3773062229156494, + "learning_rate": 0.0002, + "loss": 2.5745, + "step": 45600 + }, + { + "epoch": 3.3986587183308496, + "grad_norm": 2.7547972202301025, + "learning_rate": 0.0002, + "loss": 2.4046, + "step": 45610 + }, + { + "epoch": 3.3994038748137108, + "grad_norm": 2.549546003341675, + "learning_rate": 0.0002, + "loss": 2.6018, + "step": 45620 + }, + { + "epoch": 3.4001490312965723, + "grad_norm": 2.604703903198242, + "learning_rate": 0.0002, + "loss": 2.5778, + "step": 45630 + }, + { + "epoch": 3.400894187779434, + "grad_norm": 2.515007495880127, + "learning_rate": 0.0002, + "loss": 2.4235, + "step": 45640 + }, + { + "epoch": 3.401639344262295, + "grad_norm": 3.1332290172576904, + "learning_rate": 0.0002, + "loss": 2.5225, + "step": 45650 + }, + { + "epoch": 3.4023845007451565, + "grad_norm": 2.7233545780181885, + "learning_rate": 0.0002, + "loss": 2.5562, + "step": 45660 + }, + { + "epoch": 3.4031296572280176, + "grad_norm": 2.413351058959961, + "learning_rate": 0.0002, + "loss": 2.4198, + "step": 45670 + }, + { + "epoch": 3.403874813710879, + "grad_norm": 2.706968307495117, + "learning_rate": 0.0002, + "loss": 2.44, + "step": 45680 + }, + { + "epoch": 3.4046199701937407, + "grad_norm": 2.8159170150756836, + "learning_rate": 0.0002, + "loss": 2.499, + "step": 45690 + }, + { + "epoch": 3.4053651266766023, + "grad_norm": 2.412567615509033, + "learning_rate": 0.0002, + "loss": 2.4767, + "step": 45700 + }, + { + "epoch": 3.4061102831594634, + "grad_norm": 2.582634210586548, + "learning_rate": 0.0002, + "loss": 2.3958, + "step": 45710 + }, + { + "epoch": 3.406855439642325, + "grad_norm": 2.5285251140594482, + "learning_rate": 0.0002, + "loss": 2.332, + "step": 45720 + }, + { + "epoch": 3.407600596125186, + "grad_norm": 2.7439749240875244, + "learning_rate": 0.0002, + "loss": 2.3391, + "step": 45730 + }, + { + "epoch": 3.4083457526080476, + "grad_norm": 2.710297107696533, + "learning_rate": 0.0002, + "loss": 2.5074, + "step": 45740 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 2.586346387863159, + "learning_rate": 0.0002, + "loss": 2.5411, + "step": 45750 + }, + { + "epoch": 3.4098360655737707, + "grad_norm": 2.2955081462860107, + "learning_rate": 0.0002, + "loss": 2.5316, + "step": 45760 + }, + { + "epoch": 3.410581222056632, + "grad_norm": 2.482043504714966, + "learning_rate": 0.0002, + "loss": 2.5112, + "step": 45770 + }, + { + "epoch": 3.4113263785394934, + "grad_norm": 2.5250322818756104, + "learning_rate": 0.0002, + "loss": 2.4582, + "step": 45780 + }, + { + "epoch": 3.4120715350223545, + "grad_norm": 3.7372679710388184, + "learning_rate": 0.0002, + "loss": 2.6059, + "step": 45790 + }, + { + "epoch": 3.412816691505216, + "grad_norm": 2.6757683753967285, + "learning_rate": 0.0002, + "loss": 2.421, + "step": 45800 + }, + { + "epoch": 3.4135618479880776, + "grad_norm": 2.1661860942840576, + "learning_rate": 0.0002, + "loss": 2.243, + "step": 45810 + }, + { + "epoch": 3.4143070044709387, + "grad_norm": 2.95503830909729, + "learning_rate": 0.0002, + "loss": 2.5708, + "step": 45820 + }, + { + "epoch": 3.4150521609538003, + "grad_norm": 2.38747239112854, + "learning_rate": 0.0002, + "loss": 2.4036, + "step": 45830 + }, + { + "epoch": 3.415797317436662, + "grad_norm": 2.2388579845428467, + "learning_rate": 0.0002, + "loss": 2.4164, + "step": 45840 + }, + { + "epoch": 3.416542473919523, + "grad_norm": 2.6116092205047607, + "learning_rate": 0.0002, + "loss": 2.6254, + "step": 45850 + }, + { + "epoch": 3.4172876304023845, + "grad_norm": 2.408705472946167, + "learning_rate": 0.0002, + "loss": 2.6464, + "step": 45860 + }, + { + "epoch": 3.418032786885246, + "grad_norm": 2.4243757724761963, + "learning_rate": 0.0002, + "loss": 2.6336, + "step": 45870 + }, + { + "epoch": 3.418777943368107, + "grad_norm": 2.9750640392303467, + "learning_rate": 0.0002, + "loss": 2.4791, + "step": 45880 + }, + { + "epoch": 3.4195230998509687, + "grad_norm": 2.836698055267334, + "learning_rate": 0.0002, + "loss": 2.6173, + "step": 45890 + }, + { + "epoch": 3.4202682563338302, + "grad_norm": 2.734327793121338, + "learning_rate": 0.0002, + "loss": 2.3956, + "step": 45900 + }, + { + "epoch": 3.4210134128166914, + "grad_norm": 2.6513071060180664, + "learning_rate": 0.0002, + "loss": 2.5761, + "step": 45910 + }, + { + "epoch": 3.421758569299553, + "grad_norm": 2.8348026275634766, + "learning_rate": 0.0002, + "loss": 2.4796, + "step": 45920 + }, + { + "epoch": 3.4225037257824145, + "grad_norm": 2.5720503330230713, + "learning_rate": 0.0002, + "loss": 2.6315, + "step": 45930 + }, + { + "epoch": 3.4232488822652756, + "grad_norm": 2.497056245803833, + "learning_rate": 0.0002, + "loss": 2.4317, + "step": 45940 + }, + { + "epoch": 3.423994038748137, + "grad_norm": 2.849952459335327, + "learning_rate": 0.0002, + "loss": 2.4361, + "step": 45950 + }, + { + "epoch": 3.4247391952309987, + "grad_norm": 2.823962688446045, + "learning_rate": 0.0002, + "loss": 2.6059, + "step": 45960 + }, + { + "epoch": 3.42548435171386, + "grad_norm": 2.484067440032959, + "learning_rate": 0.0002, + "loss": 2.3086, + "step": 45970 + }, + { + "epoch": 3.4262295081967213, + "grad_norm": 3.0087854862213135, + "learning_rate": 0.0002, + "loss": 2.3751, + "step": 45980 + }, + { + "epoch": 3.426974664679583, + "grad_norm": 2.5996415615081787, + "learning_rate": 0.0002, + "loss": 2.5007, + "step": 45990 + }, + { + "epoch": 3.427719821162444, + "grad_norm": 2.758151054382324, + "learning_rate": 0.0002, + "loss": 2.4652, + "step": 46000 + }, + { + "epoch": 3.4284649776453056, + "grad_norm": 2.789275646209717, + "learning_rate": 0.0002, + "loss": 2.645, + "step": 46010 + }, + { + "epoch": 3.429210134128167, + "grad_norm": 2.859830141067505, + "learning_rate": 0.0002, + "loss": 2.5338, + "step": 46020 + }, + { + "epoch": 3.429955290611028, + "grad_norm": 2.25303053855896, + "learning_rate": 0.0002, + "loss": 2.2158, + "step": 46030 + }, + { + "epoch": 3.4307004470938898, + "grad_norm": 3.3532886505126953, + "learning_rate": 0.0002, + "loss": 2.335, + "step": 46040 + }, + { + "epoch": 3.4314456035767513, + "grad_norm": 2.6834700107574463, + "learning_rate": 0.0002, + "loss": 2.662, + "step": 46050 + }, + { + "epoch": 3.4321907600596124, + "grad_norm": 2.6147358417510986, + "learning_rate": 0.0002, + "loss": 2.4504, + "step": 46060 + }, + { + "epoch": 3.432935916542474, + "grad_norm": 2.483168363571167, + "learning_rate": 0.0002, + "loss": 2.241, + "step": 46070 + }, + { + "epoch": 3.433681073025335, + "grad_norm": 2.5020689964294434, + "learning_rate": 0.0002, + "loss": 2.2708, + "step": 46080 + }, + { + "epoch": 3.4344262295081966, + "grad_norm": 2.6769652366638184, + "learning_rate": 0.0002, + "loss": 2.4544, + "step": 46090 + }, + { + "epoch": 3.435171385991058, + "grad_norm": 2.70047926902771, + "learning_rate": 0.0002, + "loss": 2.5237, + "step": 46100 + }, + { + "epoch": 3.4359165424739198, + "grad_norm": 2.588719367980957, + "learning_rate": 0.0002, + "loss": 2.4592, + "step": 46110 + }, + { + "epoch": 3.436661698956781, + "grad_norm": 2.694007158279419, + "learning_rate": 0.0002, + "loss": 2.5565, + "step": 46120 + }, + { + "epoch": 3.4374068554396424, + "grad_norm": 2.5668649673461914, + "learning_rate": 0.0002, + "loss": 2.414, + "step": 46130 + }, + { + "epoch": 3.4381520119225035, + "grad_norm": 2.8535616397857666, + "learning_rate": 0.0002, + "loss": 2.5938, + "step": 46140 + }, + { + "epoch": 3.438897168405365, + "grad_norm": 2.704505443572998, + "learning_rate": 0.0002, + "loss": 2.488, + "step": 46150 + }, + { + "epoch": 3.4396423248882266, + "grad_norm": 2.4701411724090576, + "learning_rate": 0.0002, + "loss": 2.3884, + "step": 46160 + }, + { + "epoch": 3.4403874813710877, + "grad_norm": 2.898048162460327, + "learning_rate": 0.0002, + "loss": 2.5974, + "step": 46170 + }, + { + "epoch": 3.4411326378539493, + "grad_norm": 2.504690170288086, + "learning_rate": 0.0002, + "loss": 2.5142, + "step": 46180 + }, + { + "epoch": 3.441877794336811, + "grad_norm": 2.4398272037506104, + "learning_rate": 0.0002, + "loss": 2.3432, + "step": 46190 + }, + { + "epoch": 3.442622950819672, + "grad_norm": 2.6902806758880615, + "learning_rate": 0.0002, + "loss": 2.5055, + "step": 46200 + }, + { + "epoch": 3.4433681073025335, + "grad_norm": 3.127070426940918, + "learning_rate": 0.0002, + "loss": 2.3998, + "step": 46210 + }, + { + "epoch": 3.444113263785395, + "grad_norm": 2.8394556045532227, + "learning_rate": 0.0002, + "loss": 2.5031, + "step": 46220 + }, + { + "epoch": 3.444858420268256, + "grad_norm": 2.2959282398223877, + "learning_rate": 0.0002, + "loss": 2.4069, + "step": 46230 + }, + { + "epoch": 3.4456035767511177, + "grad_norm": 2.881746530532837, + "learning_rate": 0.0002, + "loss": 2.4572, + "step": 46240 + }, + { + "epoch": 3.4463487332339793, + "grad_norm": 2.929504871368408, + "learning_rate": 0.0002, + "loss": 2.2962, + "step": 46250 + }, + { + "epoch": 3.4470938897168404, + "grad_norm": 2.570570945739746, + "learning_rate": 0.0002, + "loss": 2.3323, + "step": 46260 + }, + { + "epoch": 3.447839046199702, + "grad_norm": 2.8684487342834473, + "learning_rate": 0.0002, + "loss": 2.5943, + "step": 46270 + }, + { + "epoch": 3.4485842026825635, + "grad_norm": 2.15544056892395, + "learning_rate": 0.0002, + "loss": 2.4587, + "step": 46280 + }, + { + "epoch": 3.4493293591654246, + "grad_norm": 2.5828864574432373, + "learning_rate": 0.0002, + "loss": 2.5151, + "step": 46290 + }, + { + "epoch": 3.450074515648286, + "grad_norm": 2.892545461654663, + "learning_rate": 0.0002, + "loss": 2.4213, + "step": 46300 + }, + { + "epoch": 3.4508196721311477, + "grad_norm": 2.867642879486084, + "learning_rate": 0.0002, + "loss": 2.4857, + "step": 46310 + }, + { + "epoch": 3.451564828614009, + "grad_norm": 3.811511278152466, + "learning_rate": 0.0002, + "loss": 2.5187, + "step": 46320 + }, + { + "epoch": 3.4523099850968704, + "grad_norm": 1.9235666990280151, + "learning_rate": 0.0002, + "loss": 2.2494, + "step": 46330 + }, + { + "epoch": 3.453055141579732, + "grad_norm": 2.4899580478668213, + "learning_rate": 0.0002, + "loss": 2.3877, + "step": 46340 + }, + { + "epoch": 3.453800298062593, + "grad_norm": 2.7740957736968994, + "learning_rate": 0.0002, + "loss": 2.5916, + "step": 46350 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 3.30594539642334, + "learning_rate": 0.0002, + "loss": 2.5507, + "step": 46360 + }, + { + "epoch": 3.455290611028316, + "grad_norm": 2.3682942390441895, + "learning_rate": 0.0002, + "loss": 2.5072, + "step": 46370 + }, + { + "epoch": 3.4560357675111772, + "grad_norm": 2.640885829925537, + "learning_rate": 0.0002, + "loss": 2.5571, + "step": 46380 + }, + { + "epoch": 3.456780923994039, + "grad_norm": 2.810642719268799, + "learning_rate": 0.0002, + "loss": 2.4887, + "step": 46390 + }, + { + "epoch": 3.4575260804769004, + "grad_norm": 2.425133228302002, + "learning_rate": 0.0002, + "loss": 2.3365, + "step": 46400 + }, + { + "epoch": 3.4582712369597615, + "grad_norm": 2.6030843257904053, + "learning_rate": 0.0002, + "loss": 2.475, + "step": 46410 + }, + { + "epoch": 3.459016393442623, + "grad_norm": 2.4549689292907715, + "learning_rate": 0.0002, + "loss": 2.4761, + "step": 46420 + }, + { + "epoch": 3.459761549925484, + "grad_norm": 2.5607504844665527, + "learning_rate": 0.0002, + "loss": 2.4621, + "step": 46430 + }, + { + "epoch": 3.4605067064083457, + "grad_norm": 2.6467771530151367, + "learning_rate": 0.0002, + "loss": 2.5109, + "step": 46440 + }, + { + "epoch": 3.4612518628912072, + "grad_norm": 2.1029367446899414, + "learning_rate": 0.0002, + "loss": 2.4962, + "step": 46450 + }, + { + "epoch": 3.461997019374069, + "grad_norm": 2.5813140869140625, + "learning_rate": 0.0002, + "loss": 2.6477, + "step": 46460 + }, + { + "epoch": 3.46274217585693, + "grad_norm": 2.906320571899414, + "learning_rate": 0.0002, + "loss": 2.5958, + "step": 46470 + }, + { + "epoch": 3.4634873323397914, + "grad_norm": 2.431466579437256, + "learning_rate": 0.0002, + "loss": 2.5054, + "step": 46480 + }, + { + "epoch": 3.4642324888226526, + "grad_norm": 2.375662088394165, + "learning_rate": 0.0002, + "loss": 2.4448, + "step": 46490 + }, + { + "epoch": 3.464977645305514, + "grad_norm": 2.675147771835327, + "learning_rate": 0.0002, + "loss": 2.4303, + "step": 46500 + }, + { + "epoch": 3.4657228017883757, + "grad_norm": 2.9978744983673096, + "learning_rate": 0.0002, + "loss": 2.3751, + "step": 46510 + }, + { + "epoch": 3.4664679582712368, + "grad_norm": 2.483147621154785, + "learning_rate": 0.0002, + "loss": 2.4115, + "step": 46520 + }, + { + "epoch": 3.4672131147540983, + "grad_norm": 2.3755555152893066, + "learning_rate": 0.0002, + "loss": 2.2329, + "step": 46530 + }, + { + "epoch": 3.46795827123696, + "grad_norm": 2.9722342491149902, + "learning_rate": 0.0002, + "loss": 2.66, + "step": 46540 + }, + { + "epoch": 3.468703427719821, + "grad_norm": 2.3765087127685547, + "learning_rate": 0.0002, + "loss": 2.3346, + "step": 46550 + }, + { + "epoch": 3.4694485842026825, + "grad_norm": 3.10359263420105, + "learning_rate": 0.0002, + "loss": 2.4465, + "step": 46560 + }, + { + "epoch": 3.470193740685544, + "grad_norm": 2.499055862426758, + "learning_rate": 0.0002, + "loss": 2.4433, + "step": 46570 + }, + { + "epoch": 3.470938897168405, + "grad_norm": 2.9596831798553467, + "learning_rate": 0.0002, + "loss": 2.6495, + "step": 46580 + }, + { + "epoch": 3.4716840536512668, + "grad_norm": 2.490100622177124, + "learning_rate": 0.0002, + "loss": 2.4167, + "step": 46590 + }, + { + "epoch": 3.4724292101341283, + "grad_norm": 2.3789775371551514, + "learning_rate": 0.0002, + "loss": 2.3696, + "step": 46600 + }, + { + "epoch": 3.4731743666169894, + "grad_norm": 2.522531032562256, + "learning_rate": 0.0002, + "loss": 2.416, + "step": 46610 + }, + { + "epoch": 3.473919523099851, + "grad_norm": 1.9312344789505005, + "learning_rate": 0.0002, + "loss": 2.3704, + "step": 46620 + }, + { + "epoch": 3.4746646795827125, + "grad_norm": 2.6317503452301025, + "learning_rate": 0.0002, + "loss": 2.4834, + "step": 46630 + }, + { + "epoch": 3.4754098360655736, + "grad_norm": 2.854759693145752, + "learning_rate": 0.0002, + "loss": 2.3015, + "step": 46640 + }, + { + "epoch": 3.476154992548435, + "grad_norm": 2.843125581741333, + "learning_rate": 0.0002, + "loss": 2.6554, + "step": 46650 + }, + { + "epoch": 3.4769001490312967, + "grad_norm": 2.7469377517700195, + "learning_rate": 0.0002, + "loss": 2.5061, + "step": 46660 + }, + { + "epoch": 3.477645305514158, + "grad_norm": 2.6205174922943115, + "learning_rate": 0.0002, + "loss": 2.256, + "step": 46670 + }, + { + "epoch": 3.4783904619970194, + "grad_norm": 2.8084607124328613, + "learning_rate": 0.0002, + "loss": 2.5537, + "step": 46680 + }, + { + "epoch": 3.479135618479881, + "grad_norm": 2.556391716003418, + "learning_rate": 0.0002, + "loss": 2.3825, + "step": 46690 + }, + { + "epoch": 3.479880774962742, + "grad_norm": 2.547045946121216, + "learning_rate": 0.0002, + "loss": 2.5505, + "step": 46700 + }, + { + "epoch": 3.4806259314456036, + "grad_norm": 2.9189865589141846, + "learning_rate": 0.0002, + "loss": 2.3604, + "step": 46710 + }, + { + "epoch": 3.481371087928465, + "grad_norm": 2.6150434017181396, + "learning_rate": 0.0002, + "loss": 2.5506, + "step": 46720 + }, + { + "epoch": 3.4821162444113263, + "grad_norm": 2.9110896587371826, + "learning_rate": 0.0002, + "loss": 2.5469, + "step": 46730 + }, + { + "epoch": 3.482861400894188, + "grad_norm": 2.4996886253356934, + "learning_rate": 0.0002, + "loss": 2.4058, + "step": 46740 + }, + { + "epoch": 3.4836065573770494, + "grad_norm": 2.6568424701690674, + "learning_rate": 0.0002, + "loss": 2.4281, + "step": 46750 + }, + { + "epoch": 3.4843517138599105, + "grad_norm": 2.790637254714966, + "learning_rate": 0.0002, + "loss": 2.2597, + "step": 46760 + }, + { + "epoch": 3.485096870342772, + "grad_norm": 2.712463855743408, + "learning_rate": 0.0002, + "loss": 2.6115, + "step": 46770 + }, + { + "epoch": 3.485842026825633, + "grad_norm": 2.838874340057373, + "learning_rate": 0.0002, + "loss": 2.5807, + "step": 46780 + }, + { + "epoch": 3.4865871833084947, + "grad_norm": 2.435051441192627, + "learning_rate": 0.0002, + "loss": 2.2417, + "step": 46790 + }, + { + "epoch": 3.4873323397913563, + "grad_norm": 2.4676756858825684, + "learning_rate": 0.0002, + "loss": 2.4674, + "step": 46800 + }, + { + "epoch": 3.488077496274218, + "grad_norm": 2.871751070022583, + "learning_rate": 0.0002, + "loss": 2.5139, + "step": 46810 + }, + { + "epoch": 3.488822652757079, + "grad_norm": 2.740755319595337, + "learning_rate": 0.0002, + "loss": 2.459, + "step": 46820 + }, + { + "epoch": 3.4895678092399405, + "grad_norm": 2.719113349914551, + "learning_rate": 0.0002, + "loss": 2.2784, + "step": 46830 + }, + { + "epoch": 3.4903129657228016, + "grad_norm": 2.9216134548187256, + "learning_rate": 0.0002, + "loss": 2.572, + "step": 46840 + }, + { + "epoch": 3.491058122205663, + "grad_norm": 2.4576525688171387, + "learning_rate": 0.0002, + "loss": 2.2147, + "step": 46850 + }, + { + "epoch": 3.4918032786885247, + "grad_norm": 2.628617525100708, + "learning_rate": 0.0002, + "loss": 2.6005, + "step": 46860 + }, + { + "epoch": 3.492548435171386, + "grad_norm": 2.8615686893463135, + "learning_rate": 0.0002, + "loss": 2.4068, + "step": 46870 + }, + { + "epoch": 3.4932935916542474, + "grad_norm": 2.8863871097564697, + "learning_rate": 0.0002, + "loss": 2.5519, + "step": 46880 + }, + { + "epoch": 3.494038748137109, + "grad_norm": 2.529972553253174, + "learning_rate": 0.0002, + "loss": 2.5626, + "step": 46890 + }, + { + "epoch": 3.49478390461997, + "grad_norm": 2.534062385559082, + "learning_rate": 0.0002, + "loss": 2.4962, + "step": 46900 + }, + { + "epoch": 3.4955290611028316, + "grad_norm": 2.641014575958252, + "learning_rate": 0.0002, + "loss": 2.3222, + "step": 46910 + }, + { + "epoch": 3.496274217585693, + "grad_norm": 2.8244290351867676, + "learning_rate": 0.0002, + "loss": 2.4623, + "step": 46920 + }, + { + "epoch": 3.4970193740685542, + "grad_norm": 2.8250889778137207, + "learning_rate": 0.0002, + "loss": 2.5361, + "step": 46930 + }, + { + "epoch": 3.497764530551416, + "grad_norm": 3.2792413234710693, + "learning_rate": 0.0002, + "loss": 2.6656, + "step": 46940 + }, + { + "epoch": 3.4985096870342773, + "grad_norm": 2.25415301322937, + "learning_rate": 0.0002, + "loss": 2.4449, + "step": 46950 + }, + { + "epoch": 3.4992548435171384, + "grad_norm": 2.867708921432495, + "learning_rate": 0.0002, + "loss": 2.3822, + "step": 46960 + }, + { + "epoch": 3.5, + "grad_norm": 2.8867411613464355, + "learning_rate": 0.0002, + "loss": 2.4606, + "step": 46970 + }, + { + "epoch": 3.5007451564828616, + "grad_norm": 2.8825507164001465, + "learning_rate": 0.0002, + "loss": 2.5032, + "step": 46980 + }, + { + "epoch": 3.5014903129657227, + "grad_norm": 3.2540154457092285, + "learning_rate": 0.0002, + "loss": 2.4558, + "step": 46990 + }, + { + "epoch": 3.502235469448584, + "grad_norm": 2.831401824951172, + "learning_rate": 0.0002, + "loss": 2.54, + "step": 47000 + }, + { + "epoch": 3.5029806259314458, + "grad_norm": 2.632877826690674, + "learning_rate": 0.0002, + "loss": 2.3713, + "step": 47010 + }, + { + "epoch": 3.503725782414307, + "grad_norm": 2.836522340774536, + "learning_rate": 0.0002, + "loss": 2.5631, + "step": 47020 + }, + { + "epoch": 3.5044709388971684, + "grad_norm": 2.5134811401367188, + "learning_rate": 0.0002, + "loss": 2.4654, + "step": 47030 + }, + { + "epoch": 3.50521609538003, + "grad_norm": 2.8926382064819336, + "learning_rate": 0.0002, + "loss": 2.525, + "step": 47040 + }, + { + "epoch": 3.505961251862891, + "grad_norm": 2.3543901443481445, + "learning_rate": 0.0002, + "loss": 2.3846, + "step": 47050 + }, + { + "epoch": 3.5067064083457526, + "grad_norm": 2.8736624717712402, + "learning_rate": 0.0002, + "loss": 2.608, + "step": 47060 + }, + { + "epoch": 3.5074515648286138, + "grad_norm": 2.5337979793548584, + "learning_rate": 0.0002, + "loss": 2.5153, + "step": 47070 + }, + { + "epoch": 3.5081967213114753, + "grad_norm": 2.630679130554199, + "learning_rate": 0.0002, + "loss": 2.6418, + "step": 47080 + }, + { + "epoch": 3.508941877794337, + "grad_norm": 2.820647954940796, + "learning_rate": 0.0002, + "loss": 2.5799, + "step": 47090 + }, + { + "epoch": 3.5096870342771984, + "grad_norm": 2.707824230194092, + "learning_rate": 0.0002, + "loss": 2.3407, + "step": 47100 + }, + { + "epoch": 3.5104321907600595, + "grad_norm": 2.6139681339263916, + "learning_rate": 0.0002, + "loss": 2.5448, + "step": 47110 + }, + { + "epoch": 3.511177347242921, + "grad_norm": 2.887916326522827, + "learning_rate": 0.0002, + "loss": 2.4502, + "step": 47120 + }, + { + "epoch": 3.511922503725782, + "grad_norm": 2.730395555496216, + "learning_rate": 0.0002, + "loss": 2.4459, + "step": 47130 + }, + { + "epoch": 3.5126676602086437, + "grad_norm": 2.41025447845459, + "learning_rate": 0.0002, + "loss": 2.5996, + "step": 47140 + }, + { + "epoch": 3.5134128166915053, + "grad_norm": 2.7070274353027344, + "learning_rate": 0.0002, + "loss": 2.5929, + "step": 47150 + }, + { + "epoch": 3.514157973174367, + "grad_norm": 2.963433265686035, + "learning_rate": 0.0002, + "loss": 2.5335, + "step": 47160 + }, + { + "epoch": 3.514903129657228, + "grad_norm": 2.7727930545806885, + "learning_rate": 0.0002, + "loss": 2.5952, + "step": 47170 + }, + { + "epoch": 3.5156482861400895, + "grad_norm": 2.868478536605835, + "learning_rate": 0.0002, + "loss": 2.4693, + "step": 47180 + }, + { + "epoch": 3.5163934426229506, + "grad_norm": 2.550471305847168, + "learning_rate": 0.0002, + "loss": 2.6188, + "step": 47190 + }, + { + "epoch": 3.517138599105812, + "grad_norm": 2.536785125732422, + "learning_rate": 0.0002, + "loss": 2.513, + "step": 47200 + }, + { + "epoch": 3.5178837555886737, + "grad_norm": 2.470963716506958, + "learning_rate": 0.0002, + "loss": 2.4627, + "step": 47210 + }, + { + "epoch": 3.5186289120715353, + "grad_norm": 2.5856993198394775, + "learning_rate": 0.0002, + "loss": 2.4106, + "step": 47220 + }, + { + "epoch": 3.5193740685543964, + "grad_norm": 2.3822851181030273, + "learning_rate": 0.0002, + "loss": 2.4622, + "step": 47230 + }, + { + "epoch": 3.520119225037258, + "grad_norm": 2.6513640880584717, + "learning_rate": 0.0002, + "loss": 2.3524, + "step": 47240 + }, + { + "epoch": 3.520864381520119, + "grad_norm": 2.421938419342041, + "learning_rate": 0.0002, + "loss": 2.5163, + "step": 47250 + }, + { + "epoch": 3.5216095380029806, + "grad_norm": 2.7255730628967285, + "learning_rate": 0.0002, + "loss": 2.4664, + "step": 47260 + }, + { + "epoch": 3.522354694485842, + "grad_norm": 2.8659775257110596, + "learning_rate": 0.0002, + "loss": 2.5989, + "step": 47270 + }, + { + "epoch": 3.5230998509687033, + "grad_norm": 2.8360795974731445, + "learning_rate": 0.0002, + "loss": 2.5135, + "step": 47280 + }, + { + "epoch": 3.523845007451565, + "grad_norm": 2.400752544403076, + "learning_rate": 0.0002, + "loss": 2.2798, + "step": 47290 + }, + { + "epoch": 3.5245901639344264, + "grad_norm": 2.741732120513916, + "learning_rate": 0.0002, + "loss": 2.5847, + "step": 47300 + }, + { + "epoch": 3.5253353204172875, + "grad_norm": 2.8098509311676025, + "learning_rate": 0.0002, + "loss": 2.4662, + "step": 47310 + }, + { + "epoch": 3.526080476900149, + "grad_norm": 2.437133312225342, + "learning_rate": 0.0002, + "loss": 2.528, + "step": 47320 + }, + { + "epoch": 3.5268256333830106, + "grad_norm": 2.986236572265625, + "learning_rate": 0.0002, + "loss": 2.5172, + "step": 47330 + }, + { + "epoch": 3.5275707898658717, + "grad_norm": 2.7789604663848877, + "learning_rate": 0.0002, + "loss": 2.6075, + "step": 47340 + }, + { + "epoch": 3.5283159463487332, + "grad_norm": 2.8106555938720703, + "learning_rate": 0.0002, + "loss": 2.3869, + "step": 47350 + }, + { + "epoch": 3.529061102831595, + "grad_norm": 2.428912878036499, + "learning_rate": 0.0002, + "loss": 2.5395, + "step": 47360 + }, + { + "epoch": 3.529806259314456, + "grad_norm": 2.528440237045288, + "learning_rate": 0.0002, + "loss": 2.3945, + "step": 47370 + }, + { + "epoch": 3.5305514157973175, + "grad_norm": 2.547029972076416, + "learning_rate": 0.0002, + "loss": 2.4825, + "step": 47380 + }, + { + "epoch": 3.531296572280179, + "grad_norm": 2.567345142364502, + "learning_rate": 0.0002, + "loss": 2.4825, + "step": 47390 + }, + { + "epoch": 3.53204172876304, + "grad_norm": 2.5996034145355225, + "learning_rate": 0.0002, + "loss": 2.5573, + "step": 47400 + }, + { + "epoch": 3.5327868852459017, + "grad_norm": 2.373216390609741, + "learning_rate": 0.0002, + "loss": 2.432, + "step": 47410 + }, + { + "epoch": 3.533532041728763, + "grad_norm": 2.4746549129486084, + "learning_rate": 0.0002, + "loss": 2.6546, + "step": 47420 + }, + { + "epoch": 3.5342771982116243, + "grad_norm": 2.6175284385681152, + "learning_rate": 0.0002, + "loss": 2.5659, + "step": 47430 + }, + { + "epoch": 3.535022354694486, + "grad_norm": 2.7137603759765625, + "learning_rate": 0.0002, + "loss": 2.5188, + "step": 47440 + }, + { + "epoch": 3.5357675111773474, + "grad_norm": 2.842714548110962, + "learning_rate": 0.0002, + "loss": 2.637, + "step": 47450 + }, + { + "epoch": 3.5365126676602086, + "grad_norm": 2.494274139404297, + "learning_rate": 0.0002, + "loss": 2.4436, + "step": 47460 + }, + { + "epoch": 3.53725782414307, + "grad_norm": 2.0920238494873047, + "learning_rate": 0.0002, + "loss": 2.3784, + "step": 47470 + }, + { + "epoch": 3.538002980625931, + "grad_norm": 2.5640475749969482, + "learning_rate": 0.0002, + "loss": 2.5922, + "step": 47480 + }, + { + "epoch": 3.5387481371087928, + "grad_norm": 2.3668644428253174, + "learning_rate": 0.0002, + "loss": 2.4113, + "step": 47490 + }, + { + "epoch": 3.5394932935916543, + "grad_norm": 2.7140488624572754, + "learning_rate": 0.0002, + "loss": 2.6541, + "step": 47500 + }, + { + "epoch": 3.540238450074516, + "grad_norm": 2.655503511428833, + "learning_rate": 0.0002, + "loss": 2.6018, + "step": 47510 + }, + { + "epoch": 3.540983606557377, + "grad_norm": 2.689578056335449, + "learning_rate": 0.0002, + "loss": 2.4236, + "step": 47520 + }, + { + "epoch": 3.5417287630402385, + "grad_norm": 2.8998870849609375, + "learning_rate": 0.0002, + "loss": 2.4237, + "step": 47530 + }, + { + "epoch": 3.5424739195230996, + "grad_norm": 2.749411106109619, + "learning_rate": 0.0002, + "loss": 2.3785, + "step": 47540 + }, + { + "epoch": 3.543219076005961, + "grad_norm": 2.72086238861084, + "learning_rate": 0.0002, + "loss": 2.3861, + "step": 47550 + }, + { + "epoch": 3.5439642324888228, + "grad_norm": 2.7207980155944824, + "learning_rate": 0.0002, + "loss": 2.6309, + "step": 47560 + }, + { + "epoch": 3.5447093889716843, + "grad_norm": 2.5211992263793945, + "learning_rate": 0.0002, + "loss": 2.6079, + "step": 47570 + }, + { + "epoch": 3.5454545454545454, + "grad_norm": 2.592176914215088, + "learning_rate": 0.0002, + "loss": 2.6657, + "step": 47580 + }, + { + "epoch": 3.546199701937407, + "grad_norm": 2.651425838470459, + "learning_rate": 0.0002, + "loss": 2.432, + "step": 47590 + }, + { + "epoch": 3.546944858420268, + "grad_norm": 2.9301748275756836, + "learning_rate": 0.0002, + "loss": 2.6111, + "step": 47600 + }, + { + "epoch": 3.5476900149031296, + "grad_norm": 2.4946770668029785, + "learning_rate": 0.0002, + "loss": 2.5022, + "step": 47610 + }, + { + "epoch": 3.548435171385991, + "grad_norm": 2.465942144393921, + "learning_rate": 0.0002, + "loss": 2.4284, + "step": 47620 + }, + { + "epoch": 3.5491803278688527, + "grad_norm": 2.899252414703369, + "learning_rate": 0.0002, + "loss": 2.203, + "step": 47630 + }, + { + "epoch": 3.549925484351714, + "grad_norm": 2.9829139709472656, + "learning_rate": 0.0002, + "loss": 2.4265, + "step": 47640 + }, + { + "epoch": 3.5506706408345754, + "grad_norm": 3.0602529048919678, + "learning_rate": 0.0002, + "loss": 2.4456, + "step": 47650 + }, + { + "epoch": 3.5514157973174365, + "grad_norm": 2.6664137840270996, + "learning_rate": 0.0002, + "loss": 2.4857, + "step": 47660 + }, + { + "epoch": 3.552160953800298, + "grad_norm": 2.6339783668518066, + "learning_rate": 0.0002, + "loss": 2.4292, + "step": 47670 + }, + { + "epoch": 3.5529061102831596, + "grad_norm": 2.6717617511749268, + "learning_rate": 0.0002, + "loss": 2.7226, + "step": 47680 + }, + { + "epoch": 3.5536512667660207, + "grad_norm": 2.6578307151794434, + "learning_rate": 0.0002, + "loss": 2.5738, + "step": 47690 + }, + { + "epoch": 3.5543964232488823, + "grad_norm": 3.1757662296295166, + "learning_rate": 0.0002, + "loss": 2.3902, + "step": 47700 + }, + { + "epoch": 3.555141579731744, + "grad_norm": 2.820986032485962, + "learning_rate": 0.0002, + "loss": 2.5761, + "step": 47710 + }, + { + "epoch": 3.555886736214605, + "grad_norm": 2.5975351333618164, + "learning_rate": 0.0002, + "loss": 2.4182, + "step": 47720 + }, + { + "epoch": 3.5566318926974665, + "grad_norm": 2.1608307361602783, + "learning_rate": 0.0002, + "loss": 2.4476, + "step": 47730 + }, + { + "epoch": 3.557377049180328, + "grad_norm": 2.800523519515991, + "learning_rate": 0.0002, + "loss": 2.5175, + "step": 47740 + }, + { + "epoch": 3.558122205663189, + "grad_norm": 2.303067207336426, + "learning_rate": 0.0002, + "loss": 2.3914, + "step": 47750 + }, + { + "epoch": 3.5588673621460507, + "grad_norm": 2.3079776763916016, + "learning_rate": 0.0002, + "loss": 2.4874, + "step": 47760 + }, + { + "epoch": 3.559612518628912, + "grad_norm": 3.0130832195281982, + "learning_rate": 0.0002, + "loss": 2.6741, + "step": 47770 + }, + { + "epoch": 3.5603576751117734, + "grad_norm": 2.6367170810699463, + "learning_rate": 0.0002, + "loss": 2.3853, + "step": 47780 + }, + { + "epoch": 3.561102831594635, + "grad_norm": 2.8482704162597656, + "learning_rate": 0.0002, + "loss": 2.4991, + "step": 47790 + }, + { + "epoch": 3.5618479880774965, + "grad_norm": 2.4554948806762695, + "learning_rate": 0.0002, + "loss": 2.4502, + "step": 47800 + }, + { + "epoch": 3.5625931445603576, + "grad_norm": 2.6724441051483154, + "learning_rate": 0.0002, + "loss": 2.6487, + "step": 47810 + }, + { + "epoch": 3.563338301043219, + "grad_norm": 2.8899343013763428, + "learning_rate": 0.0002, + "loss": 2.6546, + "step": 47820 + }, + { + "epoch": 3.5640834575260802, + "grad_norm": 3.299750804901123, + "learning_rate": 0.0002, + "loss": 2.6725, + "step": 47830 + }, + { + "epoch": 3.564828614008942, + "grad_norm": 2.936067819595337, + "learning_rate": 0.0002, + "loss": 2.5936, + "step": 47840 + }, + { + "epoch": 3.5655737704918034, + "grad_norm": 2.8651251792907715, + "learning_rate": 0.0002, + "loss": 2.5748, + "step": 47850 + }, + { + "epoch": 3.566318926974665, + "grad_norm": 2.662198066711426, + "learning_rate": 0.0002, + "loss": 2.5171, + "step": 47860 + }, + { + "epoch": 3.567064083457526, + "grad_norm": 2.711688995361328, + "learning_rate": 0.0002, + "loss": 2.542, + "step": 47870 + }, + { + "epoch": 3.5678092399403876, + "grad_norm": 2.569240093231201, + "learning_rate": 0.0002, + "loss": 2.5765, + "step": 47880 + }, + { + "epoch": 3.5685543964232487, + "grad_norm": 3.035104990005493, + "learning_rate": 0.0002, + "loss": 2.6961, + "step": 47890 + }, + { + "epoch": 3.5692995529061102, + "grad_norm": 2.7538046836853027, + "learning_rate": 0.0002, + "loss": 2.4598, + "step": 47900 + }, + { + "epoch": 3.570044709388972, + "grad_norm": 1.9985271692276, + "learning_rate": 0.0002, + "loss": 2.3044, + "step": 47910 + }, + { + "epoch": 3.5707898658718333, + "grad_norm": 2.6245272159576416, + "learning_rate": 0.0002, + "loss": 2.6054, + "step": 47920 + }, + { + "epoch": 3.5715350223546944, + "grad_norm": 2.6232852935791016, + "learning_rate": 0.0002, + "loss": 2.5152, + "step": 47930 + }, + { + "epoch": 3.572280178837556, + "grad_norm": 2.8082447052001953, + "learning_rate": 0.0002, + "loss": 2.4981, + "step": 47940 + }, + { + "epoch": 3.573025335320417, + "grad_norm": 2.4905755519866943, + "learning_rate": 0.0002, + "loss": 2.6797, + "step": 47950 + }, + { + "epoch": 3.5737704918032787, + "grad_norm": 2.3986294269561768, + "learning_rate": 0.0002, + "loss": 2.3939, + "step": 47960 + }, + { + "epoch": 3.57451564828614, + "grad_norm": 2.645498037338257, + "learning_rate": 0.0002, + "loss": 2.5753, + "step": 47970 + }, + { + "epoch": 3.5752608047690018, + "grad_norm": 2.494290828704834, + "learning_rate": 0.0002, + "loss": 2.3679, + "step": 47980 + }, + { + "epoch": 3.576005961251863, + "grad_norm": 2.542083740234375, + "learning_rate": 0.0002, + "loss": 2.53, + "step": 47990 + }, + { + "epoch": 3.5767511177347244, + "grad_norm": 2.529242753982544, + "learning_rate": 0.0002, + "loss": 2.5242, + "step": 48000 + }, + { + "epoch": 3.5774962742175855, + "grad_norm": 2.7906339168548584, + "learning_rate": 0.0002, + "loss": 2.4736, + "step": 48010 + }, + { + "epoch": 3.578241430700447, + "grad_norm": 2.607820510864258, + "learning_rate": 0.0002, + "loss": 2.5297, + "step": 48020 + }, + { + "epoch": 3.5789865871833086, + "grad_norm": 2.3591980934143066, + "learning_rate": 0.0002, + "loss": 2.6442, + "step": 48030 + }, + { + "epoch": 3.5797317436661698, + "grad_norm": 2.4848647117614746, + "learning_rate": 0.0002, + "loss": 2.4396, + "step": 48040 + }, + { + "epoch": 3.5804769001490313, + "grad_norm": 2.30979323387146, + "learning_rate": 0.0002, + "loss": 2.4167, + "step": 48050 + }, + { + "epoch": 3.581222056631893, + "grad_norm": 2.738574981689453, + "learning_rate": 0.0002, + "loss": 2.5074, + "step": 48060 + }, + { + "epoch": 3.581967213114754, + "grad_norm": 2.5661182403564453, + "learning_rate": 0.0002, + "loss": 2.3834, + "step": 48070 + }, + { + "epoch": 3.5827123695976155, + "grad_norm": 2.47078013420105, + "learning_rate": 0.0002, + "loss": 2.5172, + "step": 48080 + }, + { + "epoch": 3.583457526080477, + "grad_norm": 2.635650157928467, + "learning_rate": 0.0002, + "loss": 2.5179, + "step": 48090 + }, + { + "epoch": 3.584202682563338, + "grad_norm": 2.555915117263794, + "learning_rate": 0.0002, + "loss": 2.3665, + "step": 48100 + }, + { + "epoch": 3.5849478390461997, + "grad_norm": 2.8681654930114746, + "learning_rate": 0.0002, + "loss": 2.3979, + "step": 48110 + }, + { + "epoch": 3.585692995529061, + "grad_norm": 2.8930814266204834, + "learning_rate": 0.0002, + "loss": 2.5871, + "step": 48120 + }, + { + "epoch": 3.5864381520119224, + "grad_norm": 2.3275046348571777, + "learning_rate": 0.0002, + "loss": 2.4286, + "step": 48130 + }, + { + "epoch": 3.587183308494784, + "grad_norm": 2.7267794609069824, + "learning_rate": 0.0002, + "loss": 2.6153, + "step": 48140 + }, + { + "epoch": 3.5879284649776455, + "grad_norm": 2.8347225189208984, + "learning_rate": 0.0002, + "loss": 2.4797, + "step": 48150 + }, + { + "epoch": 3.5886736214605066, + "grad_norm": 2.776869058609009, + "learning_rate": 0.0002, + "loss": 2.4546, + "step": 48160 + }, + { + "epoch": 3.589418777943368, + "grad_norm": 2.580460548400879, + "learning_rate": 0.0002, + "loss": 2.4808, + "step": 48170 + }, + { + "epoch": 3.5901639344262293, + "grad_norm": 2.462087869644165, + "learning_rate": 0.0002, + "loss": 2.4598, + "step": 48180 + }, + { + "epoch": 3.590909090909091, + "grad_norm": 2.611598491668701, + "learning_rate": 0.0002, + "loss": 2.3196, + "step": 48190 + }, + { + "epoch": 3.5916542473919524, + "grad_norm": 2.7346081733703613, + "learning_rate": 0.0002, + "loss": 2.6164, + "step": 48200 + }, + { + "epoch": 3.592399403874814, + "grad_norm": 2.7156240940093994, + "learning_rate": 0.0002, + "loss": 2.5327, + "step": 48210 + }, + { + "epoch": 3.593144560357675, + "grad_norm": 2.703343629837036, + "learning_rate": 0.0002, + "loss": 2.4702, + "step": 48220 + }, + { + "epoch": 3.5938897168405366, + "grad_norm": 3.1075315475463867, + "learning_rate": 0.0002, + "loss": 2.5256, + "step": 48230 + }, + { + "epoch": 3.5946348733233977, + "grad_norm": 3.027200937271118, + "learning_rate": 0.0002, + "loss": 2.4926, + "step": 48240 + }, + { + "epoch": 3.5953800298062593, + "grad_norm": 2.59000825881958, + "learning_rate": 0.0002, + "loss": 2.0317, + "step": 48250 + }, + { + "epoch": 3.596125186289121, + "grad_norm": 2.602517604827881, + "learning_rate": 0.0002, + "loss": 2.3541, + "step": 48260 + }, + { + "epoch": 3.5968703427719824, + "grad_norm": 2.692680835723877, + "learning_rate": 0.0002, + "loss": 2.4874, + "step": 48270 + }, + { + "epoch": 3.5976154992548435, + "grad_norm": 1.833561658859253, + "learning_rate": 0.0002, + "loss": 2.3711, + "step": 48280 + }, + { + "epoch": 3.598360655737705, + "grad_norm": 3.1557459831237793, + "learning_rate": 0.0002, + "loss": 2.5707, + "step": 48290 + }, + { + "epoch": 3.599105812220566, + "grad_norm": 2.482980728149414, + "learning_rate": 0.0002, + "loss": 2.4062, + "step": 48300 + }, + { + "epoch": 3.5998509687034277, + "grad_norm": 2.730379104614258, + "learning_rate": 0.0002, + "loss": 2.6087, + "step": 48310 + }, + { + "epoch": 3.6005961251862892, + "grad_norm": 2.50185227394104, + "learning_rate": 0.0002, + "loss": 2.4351, + "step": 48320 + }, + { + "epoch": 3.601341281669151, + "grad_norm": 2.4742274284362793, + "learning_rate": 0.0002, + "loss": 2.3752, + "step": 48330 + }, + { + "epoch": 3.602086438152012, + "grad_norm": 2.677802801132202, + "learning_rate": 0.0002, + "loss": 2.4924, + "step": 48340 + }, + { + "epoch": 3.6028315946348735, + "grad_norm": 2.7922158241271973, + "learning_rate": 0.0002, + "loss": 2.5855, + "step": 48350 + }, + { + "epoch": 3.6035767511177346, + "grad_norm": 3.383920431137085, + "learning_rate": 0.0002, + "loss": 2.5071, + "step": 48360 + }, + { + "epoch": 3.604321907600596, + "grad_norm": 2.9359383583068848, + "learning_rate": 0.0002, + "loss": 2.361, + "step": 48370 + }, + { + "epoch": 3.6050670640834577, + "grad_norm": 2.7278084754943848, + "learning_rate": 0.0002, + "loss": 2.641, + "step": 48380 + }, + { + "epoch": 3.605812220566319, + "grad_norm": 2.810471296310425, + "learning_rate": 0.0002, + "loss": 2.6345, + "step": 48390 + }, + { + "epoch": 3.6065573770491803, + "grad_norm": 2.4108266830444336, + "learning_rate": 0.0002, + "loss": 2.5544, + "step": 48400 + }, + { + "epoch": 3.607302533532042, + "grad_norm": 2.8318967819213867, + "learning_rate": 0.0002, + "loss": 2.52, + "step": 48410 + }, + { + "epoch": 3.608047690014903, + "grad_norm": 2.8543920516967773, + "learning_rate": 0.0002, + "loss": 2.5537, + "step": 48420 + }, + { + "epoch": 3.6087928464977646, + "grad_norm": 2.791773796081543, + "learning_rate": 0.0002, + "loss": 2.4955, + "step": 48430 + }, + { + "epoch": 3.609538002980626, + "grad_norm": 2.8440630435943604, + "learning_rate": 0.0002, + "loss": 2.4265, + "step": 48440 + }, + { + "epoch": 3.610283159463487, + "grad_norm": 2.4981586933135986, + "learning_rate": 0.0002, + "loss": 2.4647, + "step": 48450 + }, + { + "epoch": 3.6110283159463488, + "grad_norm": 2.399179458618164, + "learning_rate": 0.0002, + "loss": 2.5682, + "step": 48460 + }, + { + "epoch": 3.61177347242921, + "grad_norm": 2.578258991241455, + "learning_rate": 0.0002, + "loss": 2.5542, + "step": 48470 + }, + { + "epoch": 3.6125186289120714, + "grad_norm": 2.8405141830444336, + "learning_rate": 0.0002, + "loss": 2.5277, + "step": 48480 + }, + { + "epoch": 3.613263785394933, + "grad_norm": 2.530799150466919, + "learning_rate": 0.0002, + "loss": 2.644, + "step": 48490 + }, + { + "epoch": 3.6140089418777945, + "grad_norm": 2.7181057929992676, + "learning_rate": 0.0002, + "loss": 2.4457, + "step": 48500 + }, + { + "epoch": 3.6147540983606556, + "grad_norm": 2.577636480331421, + "learning_rate": 0.0002, + "loss": 2.4226, + "step": 48510 + }, + { + "epoch": 3.615499254843517, + "grad_norm": 3.4074344635009766, + "learning_rate": 0.0002, + "loss": 2.5066, + "step": 48520 + }, + { + "epoch": 3.6162444113263783, + "grad_norm": 2.69319748878479, + "learning_rate": 0.0002, + "loss": 2.5189, + "step": 48530 + }, + { + "epoch": 3.61698956780924, + "grad_norm": 2.8684957027435303, + "learning_rate": 0.0002, + "loss": 2.5323, + "step": 48540 + }, + { + "epoch": 3.6177347242921014, + "grad_norm": 2.828727960586548, + "learning_rate": 0.0002, + "loss": 2.5495, + "step": 48550 + }, + { + "epoch": 3.618479880774963, + "grad_norm": 2.4730985164642334, + "learning_rate": 0.0002, + "loss": 2.5354, + "step": 48560 + }, + { + "epoch": 3.619225037257824, + "grad_norm": 2.840999126434326, + "learning_rate": 0.0002, + "loss": 2.5194, + "step": 48570 + }, + { + "epoch": 3.6199701937406856, + "grad_norm": 2.5371737480163574, + "learning_rate": 0.0002, + "loss": 2.4109, + "step": 48580 + }, + { + "epoch": 3.6207153502235467, + "grad_norm": 3.4489569664001465, + "learning_rate": 0.0002, + "loss": 2.6168, + "step": 48590 + }, + { + "epoch": 3.6214605067064083, + "grad_norm": 2.614786386489868, + "learning_rate": 0.0002, + "loss": 2.4671, + "step": 48600 + }, + { + "epoch": 3.62220566318927, + "grad_norm": 2.5180749893188477, + "learning_rate": 0.0002, + "loss": 2.4171, + "step": 48610 + }, + { + "epoch": 3.6229508196721314, + "grad_norm": 2.856128692626953, + "learning_rate": 0.0002, + "loss": 2.3804, + "step": 48620 + }, + { + "epoch": 3.6236959761549925, + "grad_norm": 2.196049213409424, + "learning_rate": 0.0002, + "loss": 2.4736, + "step": 48630 + }, + { + "epoch": 3.624441132637854, + "grad_norm": 2.728996515274048, + "learning_rate": 0.0002, + "loss": 2.4175, + "step": 48640 + }, + { + "epoch": 3.625186289120715, + "grad_norm": 2.3340585231781006, + "learning_rate": 0.0002, + "loss": 2.4509, + "step": 48650 + }, + { + "epoch": 3.6259314456035767, + "grad_norm": 2.537264823913574, + "learning_rate": 0.0002, + "loss": 2.3519, + "step": 48660 + }, + { + "epoch": 3.6266766020864383, + "grad_norm": 2.4871232509613037, + "learning_rate": 0.0002, + "loss": 2.3988, + "step": 48670 + }, + { + "epoch": 3.6274217585693, + "grad_norm": 2.5547709465026855, + "learning_rate": 0.0002, + "loss": 2.2731, + "step": 48680 + }, + { + "epoch": 3.628166915052161, + "grad_norm": 2.8591160774230957, + "learning_rate": 0.0002, + "loss": 2.6361, + "step": 48690 + }, + { + "epoch": 3.6289120715350225, + "grad_norm": 2.216677665710449, + "learning_rate": 0.0002, + "loss": 2.3692, + "step": 48700 + }, + { + "epoch": 3.6296572280178836, + "grad_norm": 2.721316337585449, + "learning_rate": 0.0002, + "loss": 2.4691, + "step": 48710 + }, + { + "epoch": 3.630402384500745, + "grad_norm": 2.8119468688964844, + "learning_rate": 0.0002, + "loss": 2.397, + "step": 48720 + }, + { + "epoch": 3.6311475409836067, + "grad_norm": 3.7255992889404297, + "learning_rate": 0.0002, + "loss": 2.461, + "step": 48730 + }, + { + "epoch": 3.631892697466468, + "grad_norm": 2.680361032485962, + "learning_rate": 0.0002, + "loss": 2.2952, + "step": 48740 + }, + { + "epoch": 3.6326378539493294, + "grad_norm": 2.1289005279541016, + "learning_rate": 0.0002, + "loss": 2.4144, + "step": 48750 + }, + { + "epoch": 3.633383010432191, + "grad_norm": 2.259798765182495, + "learning_rate": 0.0002, + "loss": 2.6487, + "step": 48760 + }, + { + "epoch": 3.634128166915052, + "grad_norm": 2.293454170227051, + "learning_rate": 0.0002, + "loss": 2.5502, + "step": 48770 + }, + { + "epoch": 3.6348733233979136, + "grad_norm": 2.636841297149658, + "learning_rate": 0.0002, + "loss": 2.5529, + "step": 48780 + }, + { + "epoch": 3.635618479880775, + "grad_norm": 2.310394287109375, + "learning_rate": 0.0002, + "loss": 2.4676, + "step": 48790 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 2.545966625213623, + "learning_rate": 0.0002, + "loss": 2.4672, + "step": 48800 + }, + { + "epoch": 3.637108792846498, + "grad_norm": 2.9500443935394287, + "learning_rate": 0.0002, + "loss": 2.4827, + "step": 48810 + }, + { + "epoch": 3.637853949329359, + "grad_norm": 2.543187141418457, + "learning_rate": 0.0002, + "loss": 2.4132, + "step": 48820 + }, + { + "epoch": 3.6385991058122205, + "grad_norm": 2.2847654819488525, + "learning_rate": 0.0002, + "loss": 2.4768, + "step": 48830 + }, + { + "epoch": 3.639344262295082, + "grad_norm": 2.72090482711792, + "learning_rate": 0.0002, + "loss": 2.5947, + "step": 48840 + }, + { + "epoch": 3.6400894187779436, + "grad_norm": 2.562582015991211, + "learning_rate": 0.0002, + "loss": 2.5658, + "step": 48850 + }, + { + "epoch": 3.6408345752608047, + "grad_norm": 2.780815839767456, + "learning_rate": 0.0002, + "loss": 2.5017, + "step": 48860 + }, + { + "epoch": 3.6415797317436662, + "grad_norm": 2.9853780269622803, + "learning_rate": 0.0002, + "loss": 2.6693, + "step": 48870 + }, + { + "epoch": 3.6423248882265273, + "grad_norm": 2.79302716255188, + "learning_rate": 0.0002, + "loss": 2.4589, + "step": 48880 + }, + { + "epoch": 3.643070044709389, + "grad_norm": 2.7018563747406006, + "learning_rate": 0.0002, + "loss": 2.3826, + "step": 48890 + }, + { + "epoch": 3.6438152011922504, + "grad_norm": 2.699528694152832, + "learning_rate": 0.0002, + "loss": 2.3877, + "step": 48900 + }, + { + "epoch": 3.644560357675112, + "grad_norm": 2.4166371822357178, + "learning_rate": 0.0002, + "loss": 2.5869, + "step": 48910 + }, + { + "epoch": 3.645305514157973, + "grad_norm": 2.5733654499053955, + "learning_rate": 0.0002, + "loss": 2.5201, + "step": 48920 + }, + { + "epoch": 3.6460506706408347, + "grad_norm": 2.7172818183898926, + "learning_rate": 0.0002, + "loss": 2.4568, + "step": 48930 + }, + { + "epoch": 3.6467958271236958, + "grad_norm": 2.5636215209960938, + "learning_rate": 0.0002, + "loss": 2.5941, + "step": 48940 + }, + { + "epoch": 3.6475409836065573, + "grad_norm": 2.592777729034424, + "learning_rate": 0.0002, + "loss": 2.3906, + "step": 48950 + }, + { + "epoch": 3.648286140089419, + "grad_norm": 2.5706088542938232, + "learning_rate": 0.0002, + "loss": 2.4504, + "step": 48960 + }, + { + "epoch": 3.6490312965722804, + "grad_norm": 2.48825740814209, + "learning_rate": 0.0002, + "loss": 2.4888, + "step": 48970 + }, + { + "epoch": 3.6497764530551415, + "grad_norm": 2.727311372756958, + "learning_rate": 0.0002, + "loss": 2.4781, + "step": 48980 + }, + { + "epoch": 3.650521609538003, + "grad_norm": 2.3753762245178223, + "learning_rate": 0.0002, + "loss": 2.504, + "step": 48990 + }, + { + "epoch": 3.651266766020864, + "grad_norm": 2.639061450958252, + "learning_rate": 0.0002, + "loss": 2.5593, + "step": 49000 + }, + { + "epoch": 3.6520119225037257, + "grad_norm": 2.736478090286255, + "learning_rate": 0.0002, + "loss": 2.5617, + "step": 49010 + }, + { + "epoch": 3.6527570789865873, + "grad_norm": 2.6167259216308594, + "learning_rate": 0.0002, + "loss": 2.7529, + "step": 49020 + }, + { + "epoch": 3.653502235469449, + "grad_norm": 2.2042272090911865, + "learning_rate": 0.0002, + "loss": 2.4023, + "step": 49030 + }, + { + "epoch": 3.65424739195231, + "grad_norm": 2.191154956817627, + "learning_rate": 0.0002, + "loss": 2.5161, + "step": 49040 + }, + { + "epoch": 3.6549925484351715, + "grad_norm": 2.672147035598755, + "learning_rate": 0.0002, + "loss": 2.4459, + "step": 49050 + }, + { + "epoch": 3.6557377049180326, + "grad_norm": 2.6667487621307373, + "learning_rate": 0.0002, + "loss": 2.5033, + "step": 49060 + }, + { + "epoch": 3.656482861400894, + "grad_norm": 2.488171100616455, + "learning_rate": 0.0002, + "loss": 2.3307, + "step": 49070 + }, + { + "epoch": 3.6572280178837557, + "grad_norm": 2.9280152320861816, + "learning_rate": 0.0002, + "loss": 2.5912, + "step": 49080 + }, + { + "epoch": 3.657973174366617, + "grad_norm": 2.767899513244629, + "learning_rate": 0.0002, + "loss": 2.5289, + "step": 49090 + }, + { + "epoch": 3.6587183308494784, + "grad_norm": 2.595731019973755, + "learning_rate": 0.0002, + "loss": 2.6172, + "step": 49100 + }, + { + "epoch": 3.65946348733234, + "grad_norm": 2.531205177307129, + "learning_rate": 0.0002, + "loss": 2.5384, + "step": 49110 + }, + { + "epoch": 3.660208643815201, + "grad_norm": 2.5499446392059326, + "learning_rate": 0.0002, + "loss": 2.4912, + "step": 49120 + }, + { + "epoch": 3.6609538002980626, + "grad_norm": 2.692110300064087, + "learning_rate": 0.0002, + "loss": 2.6736, + "step": 49130 + }, + { + "epoch": 3.661698956780924, + "grad_norm": 2.5231432914733887, + "learning_rate": 0.0002, + "loss": 2.4245, + "step": 49140 + }, + { + "epoch": 3.6624441132637853, + "grad_norm": 2.7942330837249756, + "learning_rate": 0.0002, + "loss": 2.5642, + "step": 49150 + }, + { + "epoch": 3.663189269746647, + "grad_norm": 3.3361740112304688, + "learning_rate": 0.0002, + "loss": 2.5024, + "step": 49160 + }, + { + "epoch": 3.663934426229508, + "grad_norm": 3.006382465362549, + "learning_rate": 0.0002, + "loss": 2.5587, + "step": 49170 + }, + { + "epoch": 3.6646795827123695, + "grad_norm": 2.637521743774414, + "learning_rate": 0.0002, + "loss": 2.5168, + "step": 49180 + }, + { + "epoch": 3.665424739195231, + "grad_norm": 2.8171842098236084, + "learning_rate": 0.0002, + "loss": 2.6647, + "step": 49190 + }, + { + "epoch": 3.6661698956780926, + "grad_norm": 2.3812084197998047, + "learning_rate": 0.0002, + "loss": 2.2236, + "step": 49200 + }, + { + "epoch": 3.6669150521609537, + "grad_norm": 2.4592621326446533, + "learning_rate": 0.0002, + "loss": 2.3351, + "step": 49210 + }, + { + "epoch": 3.6676602086438153, + "grad_norm": 2.5426881313323975, + "learning_rate": 0.0002, + "loss": 2.5694, + "step": 49220 + }, + { + "epoch": 3.6684053651266764, + "grad_norm": 2.749530553817749, + "learning_rate": 0.0002, + "loss": 2.4955, + "step": 49230 + }, + { + "epoch": 3.669150521609538, + "grad_norm": 3.183913230895996, + "learning_rate": 0.0002, + "loss": 2.7195, + "step": 49240 + }, + { + "epoch": 3.6698956780923995, + "grad_norm": 2.6278300285339355, + "learning_rate": 0.0002, + "loss": 2.5673, + "step": 49250 + }, + { + "epoch": 3.670640834575261, + "grad_norm": 2.996420383453369, + "learning_rate": 0.0002, + "loss": 2.5843, + "step": 49260 + }, + { + "epoch": 3.671385991058122, + "grad_norm": 2.6500403881073, + "learning_rate": 0.0002, + "loss": 2.6122, + "step": 49270 + }, + { + "epoch": 3.6721311475409837, + "grad_norm": 3.047901153564453, + "learning_rate": 0.0002, + "loss": 2.4905, + "step": 49280 + }, + { + "epoch": 3.672876304023845, + "grad_norm": 2.8154449462890625, + "learning_rate": 0.0002, + "loss": 2.4668, + "step": 49290 + }, + { + "epoch": 3.6736214605067063, + "grad_norm": 2.3584389686584473, + "learning_rate": 0.0002, + "loss": 2.3198, + "step": 49300 + }, + { + "epoch": 3.674366616989568, + "grad_norm": 2.881648063659668, + "learning_rate": 0.0002, + "loss": 2.4651, + "step": 49310 + }, + { + "epoch": 3.6751117734724295, + "grad_norm": 2.5896761417388916, + "learning_rate": 0.0002, + "loss": 2.241, + "step": 49320 + }, + { + "epoch": 3.6758569299552906, + "grad_norm": 2.812044858932495, + "learning_rate": 0.0002, + "loss": 2.5172, + "step": 49330 + }, + { + "epoch": 3.676602086438152, + "grad_norm": 2.66312575340271, + "learning_rate": 0.0002, + "loss": 2.4133, + "step": 49340 + }, + { + "epoch": 3.6773472429210132, + "grad_norm": 2.790090799331665, + "learning_rate": 0.0002, + "loss": 2.709, + "step": 49350 + }, + { + "epoch": 3.678092399403875, + "grad_norm": 2.4893035888671875, + "learning_rate": 0.0002, + "loss": 2.4402, + "step": 49360 + }, + { + "epoch": 3.6788375558867363, + "grad_norm": 2.754606246948242, + "learning_rate": 0.0002, + "loss": 2.4593, + "step": 49370 + }, + { + "epoch": 3.679582712369598, + "grad_norm": 2.878617763519287, + "learning_rate": 0.0002, + "loss": 2.6367, + "step": 49380 + }, + { + "epoch": 3.680327868852459, + "grad_norm": 2.854499101638794, + "learning_rate": 0.0002, + "loss": 2.4631, + "step": 49390 + }, + { + "epoch": 3.6810730253353205, + "grad_norm": 2.849299669265747, + "learning_rate": 0.0002, + "loss": 2.465, + "step": 49400 + }, + { + "epoch": 3.6818181818181817, + "grad_norm": 2.570016860961914, + "learning_rate": 0.0002, + "loss": 2.4901, + "step": 49410 + }, + { + "epoch": 3.682563338301043, + "grad_norm": 2.6923599243164062, + "learning_rate": 0.0002, + "loss": 2.4682, + "step": 49420 + }, + { + "epoch": 3.6833084947839048, + "grad_norm": 2.5435593128204346, + "learning_rate": 0.0002, + "loss": 2.5621, + "step": 49430 + }, + { + "epoch": 3.684053651266766, + "grad_norm": 2.8608274459838867, + "learning_rate": 0.0002, + "loss": 2.586, + "step": 49440 + }, + { + "epoch": 3.6847988077496274, + "grad_norm": 2.5079166889190674, + "learning_rate": 0.0002, + "loss": 2.4451, + "step": 49450 + }, + { + "epoch": 3.685543964232489, + "grad_norm": 2.874114990234375, + "learning_rate": 0.0002, + "loss": 2.446, + "step": 49460 + }, + { + "epoch": 3.68628912071535, + "grad_norm": 2.727252960205078, + "learning_rate": 0.0002, + "loss": 2.4395, + "step": 49470 + }, + { + "epoch": 3.6870342771982116, + "grad_norm": 2.734837532043457, + "learning_rate": 0.0002, + "loss": 2.4344, + "step": 49480 + }, + { + "epoch": 3.687779433681073, + "grad_norm": 2.5745677947998047, + "learning_rate": 0.0002, + "loss": 2.3926, + "step": 49490 + }, + { + "epoch": 3.6885245901639343, + "grad_norm": 2.9296648502349854, + "learning_rate": 0.0002, + "loss": 2.458, + "step": 49500 + }, + { + "epoch": 3.689269746646796, + "grad_norm": 2.666250705718994, + "learning_rate": 0.0002, + "loss": 2.5119, + "step": 49510 + }, + { + "epoch": 3.690014903129657, + "grad_norm": 2.7720589637756348, + "learning_rate": 0.0002, + "loss": 2.4189, + "step": 49520 + }, + { + "epoch": 3.6907600596125185, + "grad_norm": 2.5077097415924072, + "learning_rate": 0.0002, + "loss": 2.486, + "step": 49530 + }, + { + "epoch": 3.69150521609538, + "grad_norm": 2.7415547370910645, + "learning_rate": 0.0002, + "loss": 2.4732, + "step": 49540 + }, + { + "epoch": 3.6922503725782416, + "grad_norm": 3.023948907852173, + "learning_rate": 0.0002, + "loss": 2.4565, + "step": 49550 + }, + { + "epoch": 3.6929955290611027, + "grad_norm": 2.926791191101074, + "learning_rate": 0.0002, + "loss": 2.3835, + "step": 49560 + }, + { + "epoch": 3.6937406855439643, + "grad_norm": 2.64837646484375, + "learning_rate": 0.0002, + "loss": 2.5395, + "step": 49570 + }, + { + "epoch": 3.6944858420268254, + "grad_norm": 3.1777780055999756, + "learning_rate": 0.0002, + "loss": 2.6367, + "step": 49580 + }, + { + "epoch": 3.695230998509687, + "grad_norm": 2.567836284637451, + "learning_rate": 0.0002, + "loss": 2.7127, + "step": 49590 + }, + { + "epoch": 3.6959761549925485, + "grad_norm": 2.6760616302490234, + "learning_rate": 0.0002, + "loss": 2.5427, + "step": 49600 + }, + { + "epoch": 3.69672131147541, + "grad_norm": 2.6477506160736084, + "learning_rate": 0.0002, + "loss": 2.446, + "step": 49610 + }, + { + "epoch": 3.697466467958271, + "grad_norm": 2.555225372314453, + "learning_rate": 0.0002, + "loss": 2.4988, + "step": 49620 + }, + { + "epoch": 3.6982116244411327, + "grad_norm": 2.3987672328948975, + "learning_rate": 0.0002, + "loss": 2.476, + "step": 49630 + }, + { + "epoch": 3.698956780923994, + "grad_norm": 2.7095890045166016, + "learning_rate": 0.0002, + "loss": 2.5311, + "step": 49640 + }, + { + "epoch": 3.6997019374068554, + "grad_norm": 2.7306811809539795, + "learning_rate": 0.0002, + "loss": 2.5193, + "step": 49650 + }, + { + "epoch": 3.700447093889717, + "grad_norm": 2.5883631706237793, + "learning_rate": 0.0002, + "loss": 2.5261, + "step": 49660 + }, + { + "epoch": 3.7011922503725785, + "grad_norm": 2.3863844871520996, + "learning_rate": 0.0002, + "loss": 2.4421, + "step": 49670 + }, + { + "epoch": 3.7019374068554396, + "grad_norm": 2.6333467960357666, + "learning_rate": 0.0002, + "loss": 2.5035, + "step": 49680 + }, + { + "epoch": 3.702682563338301, + "grad_norm": 2.6204724311828613, + "learning_rate": 0.0002, + "loss": 2.3942, + "step": 49690 + }, + { + "epoch": 3.7034277198211623, + "grad_norm": 2.580791711807251, + "learning_rate": 0.0002, + "loss": 2.205, + "step": 49700 + }, + { + "epoch": 3.704172876304024, + "grad_norm": 2.720482587814331, + "learning_rate": 0.0002, + "loss": 2.4407, + "step": 49710 + }, + { + "epoch": 3.7049180327868854, + "grad_norm": 2.664541006088257, + "learning_rate": 0.0002, + "loss": 2.4447, + "step": 49720 + }, + { + "epoch": 3.705663189269747, + "grad_norm": 2.4946305751800537, + "learning_rate": 0.0002, + "loss": 2.5258, + "step": 49730 + }, + { + "epoch": 3.706408345752608, + "grad_norm": 2.704585552215576, + "learning_rate": 0.0002, + "loss": 2.3665, + "step": 49740 + }, + { + "epoch": 3.7071535022354696, + "grad_norm": 2.7640819549560547, + "learning_rate": 0.0002, + "loss": 2.5606, + "step": 49750 + }, + { + "epoch": 3.7078986587183307, + "grad_norm": 2.631479263305664, + "learning_rate": 0.0002, + "loss": 2.5774, + "step": 49760 + }, + { + "epoch": 3.7086438152011922, + "grad_norm": 2.800203800201416, + "learning_rate": 0.0002, + "loss": 2.4952, + "step": 49770 + }, + { + "epoch": 3.709388971684054, + "grad_norm": 2.66166090965271, + "learning_rate": 0.0002, + "loss": 2.5133, + "step": 49780 + }, + { + "epoch": 3.710134128166915, + "grad_norm": 2.272296905517578, + "learning_rate": 0.0002, + "loss": 2.5408, + "step": 49790 + }, + { + "epoch": 3.7108792846497765, + "grad_norm": 2.591278076171875, + "learning_rate": 0.0002, + "loss": 2.5519, + "step": 49800 + }, + { + "epoch": 3.711624441132638, + "grad_norm": 2.4364070892333984, + "learning_rate": 0.0002, + "loss": 2.4554, + "step": 49810 + }, + { + "epoch": 3.712369597615499, + "grad_norm": 2.7269444465637207, + "learning_rate": 0.0002, + "loss": 2.4394, + "step": 49820 + }, + { + "epoch": 3.7131147540983607, + "grad_norm": 2.7368903160095215, + "learning_rate": 0.0002, + "loss": 2.5778, + "step": 49830 + }, + { + "epoch": 3.7138599105812222, + "grad_norm": 2.6224026679992676, + "learning_rate": 0.0002, + "loss": 2.4692, + "step": 49840 + }, + { + "epoch": 3.7146050670640833, + "grad_norm": 2.544132709503174, + "learning_rate": 0.0002, + "loss": 2.5929, + "step": 49850 + }, + { + "epoch": 3.715350223546945, + "grad_norm": 2.141615629196167, + "learning_rate": 0.0002, + "loss": 2.4091, + "step": 49860 + }, + { + "epoch": 3.716095380029806, + "grad_norm": 2.357640027999878, + "learning_rate": 0.0002, + "loss": 2.5352, + "step": 49870 + }, + { + "epoch": 3.7168405365126675, + "grad_norm": 2.8612751960754395, + "learning_rate": 0.0002, + "loss": 2.6656, + "step": 49880 + }, + { + "epoch": 3.717585692995529, + "grad_norm": 2.558424472808838, + "learning_rate": 0.0002, + "loss": 2.4203, + "step": 49890 + }, + { + "epoch": 3.7183308494783907, + "grad_norm": 2.3906445503234863, + "learning_rate": 0.0002, + "loss": 2.4509, + "step": 49900 + }, + { + "epoch": 3.7190760059612518, + "grad_norm": 2.647514581680298, + "learning_rate": 0.0002, + "loss": 2.5821, + "step": 49910 + }, + { + "epoch": 3.7198211624441133, + "grad_norm": 2.710005283355713, + "learning_rate": 0.0002, + "loss": 2.5298, + "step": 49920 + }, + { + "epoch": 3.7205663189269744, + "grad_norm": 2.5558953285217285, + "learning_rate": 0.0002, + "loss": 2.4721, + "step": 49930 + }, + { + "epoch": 3.721311475409836, + "grad_norm": 2.446017265319824, + "learning_rate": 0.0002, + "loss": 2.4778, + "step": 49940 + }, + { + "epoch": 3.7220566318926975, + "grad_norm": 3.0394492149353027, + "learning_rate": 0.0002, + "loss": 2.3313, + "step": 49950 + }, + { + "epoch": 3.722801788375559, + "grad_norm": 2.5545358657836914, + "learning_rate": 0.0002, + "loss": 2.6545, + "step": 49960 + }, + { + "epoch": 3.72354694485842, + "grad_norm": 2.7570934295654297, + "learning_rate": 0.0002, + "loss": 2.4509, + "step": 49970 + }, + { + "epoch": 3.7242921013412817, + "grad_norm": 2.592240571975708, + "learning_rate": 0.0002, + "loss": 2.4852, + "step": 49980 + }, + { + "epoch": 3.725037257824143, + "grad_norm": 2.751447916030884, + "learning_rate": 0.0002, + "loss": 2.3889, + "step": 49990 + }, + { + "epoch": 3.7257824143070044, + "grad_norm": 2.723025321960449, + "learning_rate": 0.0002, + "loss": 2.3713, + "step": 50000 + }, + { + "epoch": 3.726527570789866, + "grad_norm": 2.759801149368286, + "learning_rate": 0.0002, + "loss": 2.507, + "step": 50010 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 2.721895933151245, + "learning_rate": 0.0002, + "loss": 2.4909, + "step": 50020 + }, + { + "epoch": 3.7280178837555886, + "grad_norm": 2.8816866874694824, + "learning_rate": 0.0002, + "loss": 2.5981, + "step": 50030 + }, + { + "epoch": 3.72876304023845, + "grad_norm": 2.328068256378174, + "learning_rate": 0.0002, + "loss": 2.5311, + "step": 50040 + }, + { + "epoch": 3.7295081967213113, + "grad_norm": 2.826514482498169, + "learning_rate": 0.0002, + "loss": 2.4776, + "step": 50050 + }, + { + "epoch": 3.730253353204173, + "grad_norm": 2.855038642883301, + "learning_rate": 0.0002, + "loss": 2.5622, + "step": 50060 + }, + { + "epoch": 3.7309985096870344, + "grad_norm": 2.6433498859405518, + "learning_rate": 0.0002, + "loss": 2.4781, + "step": 50070 + }, + { + "epoch": 3.731743666169896, + "grad_norm": 2.565962076187134, + "learning_rate": 0.0002, + "loss": 2.5446, + "step": 50080 + }, + { + "epoch": 3.732488822652757, + "grad_norm": 2.8265738487243652, + "learning_rate": 0.0002, + "loss": 2.534, + "step": 50090 + }, + { + "epoch": 3.7332339791356186, + "grad_norm": 2.9599409103393555, + "learning_rate": 0.0002, + "loss": 2.4206, + "step": 50100 + }, + { + "epoch": 3.7339791356184797, + "grad_norm": 2.669797897338867, + "learning_rate": 0.0002, + "loss": 2.6088, + "step": 50110 + }, + { + "epoch": 3.7347242921013413, + "grad_norm": 2.8583147525787354, + "learning_rate": 0.0002, + "loss": 2.5827, + "step": 50120 + }, + { + "epoch": 3.735469448584203, + "grad_norm": 2.217470407485962, + "learning_rate": 0.0002, + "loss": 2.4562, + "step": 50130 + }, + { + "epoch": 3.736214605067064, + "grad_norm": 3.1019763946533203, + "learning_rate": 0.0002, + "loss": 2.4064, + "step": 50140 + }, + { + "epoch": 3.7369597615499255, + "grad_norm": 2.963712692260742, + "learning_rate": 0.0002, + "loss": 2.5403, + "step": 50150 + }, + { + "epoch": 3.737704918032787, + "grad_norm": 3.015799045562744, + "learning_rate": 0.0002, + "loss": 2.6094, + "step": 50160 + }, + { + "epoch": 3.738450074515648, + "grad_norm": 2.435353994369507, + "learning_rate": 0.0002, + "loss": 2.4104, + "step": 50170 + }, + { + "epoch": 3.7391952309985097, + "grad_norm": 2.5293540954589844, + "learning_rate": 0.0002, + "loss": 2.5578, + "step": 50180 + }, + { + "epoch": 3.7399403874813713, + "grad_norm": 2.1452317237854004, + "learning_rate": 0.0002, + "loss": 2.4667, + "step": 50190 + }, + { + "epoch": 3.7406855439642324, + "grad_norm": 2.675736665725708, + "learning_rate": 0.0002, + "loss": 2.6145, + "step": 50200 + }, + { + "epoch": 3.741430700447094, + "grad_norm": 2.4674720764160156, + "learning_rate": 0.0002, + "loss": 2.3061, + "step": 50210 + }, + { + "epoch": 3.742175856929955, + "grad_norm": 2.5854485034942627, + "learning_rate": 0.0002, + "loss": 2.6369, + "step": 50220 + }, + { + "epoch": 3.7429210134128166, + "grad_norm": 2.5845370292663574, + "learning_rate": 0.0002, + "loss": 2.4753, + "step": 50230 + }, + { + "epoch": 3.743666169895678, + "grad_norm": 2.4826760292053223, + "learning_rate": 0.0002, + "loss": 2.4353, + "step": 50240 + }, + { + "epoch": 3.7444113263785397, + "grad_norm": 2.491818428039551, + "learning_rate": 0.0002, + "loss": 2.3542, + "step": 50250 + }, + { + "epoch": 3.745156482861401, + "grad_norm": 2.7059671878814697, + "learning_rate": 0.0002, + "loss": 2.3786, + "step": 50260 + }, + { + "epoch": 3.7459016393442623, + "grad_norm": 2.488055944442749, + "learning_rate": 0.0002, + "loss": 2.3972, + "step": 50270 + }, + { + "epoch": 3.7466467958271235, + "grad_norm": 2.8024628162384033, + "learning_rate": 0.0002, + "loss": 2.4418, + "step": 50280 + }, + { + "epoch": 3.747391952309985, + "grad_norm": 2.639986991882324, + "learning_rate": 0.0002, + "loss": 2.5661, + "step": 50290 + }, + { + "epoch": 3.7481371087928466, + "grad_norm": 2.477285623550415, + "learning_rate": 0.0002, + "loss": 2.4308, + "step": 50300 + }, + { + "epoch": 3.748882265275708, + "grad_norm": 2.709949254989624, + "learning_rate": 0.0002, + "loss": 2.6701, + "step": 50310 + }, + { + "epoch": 3.7496274217585692, + "grad_norm": 2.567444324493408, + "learning_rate": 0.0002, + "loss": 2.6772, + "step": 50320 + }, + { + "epoch": 3.7503725782414308, + "grad_norm": 2.8398115634918213, + "learning_rate": 0.0002, + "loss": 2.5978, + "step": 50330 + }, + { + "epoch": 3.751117734724292, + "grad_norm": 2.623926877975464, + "learning_rate": 0.0002, + "loss": 2.4706, + "step": 50340 + }, + { + "epoch": 3.7518628912071534, + "grad_norm": 2.404278039932251, + "learning_rate": 0.0002, + "loss": 2.6895, + "step": 50350 + }, + { + "epoch": 3.752608047690015, + "grad_norm": 2.811429023742676, + "learning_rate": 0.0002, + "loss": 2.3915, + "step": 50360 + }, + { + "epoch": 3.7533532041728765, + "grad_norm": 2.551865339279175, + "learning_rate": 0.0002, + "loss": 2.3965, + "step": 50370 + }, + { + "epoch": 3.7540983606557377, + "grad_norm": 3.0696215629577637, + "learning_rate": 0.0002, + "loss": 2.4566, + "step": 50380 + }, + { + "epoch": 3.754843517138599, + "grad_norm": 2.6258938312530518, + "learning_rate": 0.0002, + "loss": 2.564, + "step": 50390 + }, + { + "epoch": 3.7555886736214603, + "grad_norm": 2.8297739028930664, + "learning_rate": 0.0002, + "loss": 2.7246, + "step": 50400 + }, + { + "epoch": 3.756333830104322, + "grad_norm": 2.9615511894226074, + "learning_rate": 0.0002, + "loss": 2.6202, + "step": 50410 + }, + { + "epoch": 3.7570789865871834, + "grad_norm": 2.3409204483032227, + "learning_rate": 0.0002, + "loss": 2.5359, + "step": 50420 + }, + { + "epoch": 3.757824143070045, + "grad_norm": 2.545732021331787, + "learning_rate": 0.0002, + "loss": 2.5465, + "step": 50430 + }, + { + "epoch": 3.758569299552906, + "grad_norm": 2.5160508155822754, + "learning_rate": 0.0002, + "loss": 2.5209, + "step": 50440 + }, + { + "epoch": 3.7593144560357676, + "grad_norm": 2.9805400371551514, + "learning_rate": 0.0002, + "loss": 2.4882, + "step": 50450 + }, + { + "epoch": 3.7600596125186287, + "grad_norm": 2.4430508613586426, + "learning_rate": 0.0002, + "loss": 2.5019, + "step": 50460 + }, + { + "epoch": 3.7608047690014903, + "grad_norm": 2.3823506832122803, + "learning_rate": 0.0002, + "loss": 2.3845, + "step": 50470 + }, + { + "epoch": 3.761549925484352, + "grad_norm": 2.54958176612854, + "learning_rate": 0.0002, + "loss": 2.4257, + "step": 50480 + }, + { + "epoch": 3.762295081967213, + "grad_norm": 2.992579460144043, + "learning_rate": 0.0002, + "loss": 2.5295, + "step": 50490 + }, + { + "epoch": 3.7630402384500745, + "grad_norm": 2.7364273071289062, + "learning_rate": 0.0002, + "loss": 2.7259, + "step": 50500 + }, + { + "epoch": 3.763785394932936, + "grad_norm": 2.527580499649048, + "learning_rate": 0.0002, + "loss": 2.5548, + "step": 50510 + }, + { + "epoch": 3.764530551415797, + "grad_norm": 2.6764330863952637, + "learning_rate": 0.0002, + "loss": 2.4295, + "step": 50520 + }, + { + "epoch": 3.7652757078986587, + "grad_norm": 2.7412283420562744, + "learning_rate": 0.0002, + "loss": 2.5146, + "step": 50530 + }, + { + "epoch": 3.7660208643815203, + "grad_norm": 2.5869789123535156, + "learning_rate": 0.0002, + "loss": 2.5206, + "step": 50540 + }, + { + "epoch": 3.7667660208643814, + "grad_norm": 2.71201491355896, + "learning_rate": 0.0002, + "loss": 2.4387, + "step": 50550 + }, + { + "epoch": 3.767511177347243, + "grad_norm": 2.881373167037964, + "learning_rate": 0.0002, + "loss": 2.5828, + "step": 50560 + }, + { + "epoch": 3.768256333830104, + "grad_norm": 2.6753411293029785, + "learning_rate": 0.0002, + "loss": 2.6769, + "step": 50570 + }, + { + "epoch": 3.7690014903129656, + "grad_norm": 2.626857042312622, + "learning_rate": 0.0002, + "loss": 2.5313, + "step": 50580 + }, + { + "epoch": 3.769746646795827, + "grad_norm": 2.0912129878997803, + "learning_rate": 0.0002, + "loss": 2.1949, + "step": 50590 + }, + { + "epoch": 3.7704918032786887, + "grad_norm": 2.5169942378997803, + "learning_rate": 0.0002, + "loss": 2.4955, + "step": 50600 + }, + { + "epoch": 3.77123695976155, + "grad_norm": 2.7082998752593994, + "learning_rate": 0.0002, + "loss": 2.5901, + "step": 50610 + }, + { + "epoch": 3.7719821162444114, + "grad_norm": 2.681004762649536, + "learning_rate": 0.0002, + "loss": 2.5101, + "step": 50620 + }, + { + "epoch": 3.7727272727272725, + "grad_norm": 2.7208945751190186, + "learning_rate": 0.0002, + "loss": 2.4098, + "step": 50630 + }, + { + "epoch": 3.773472429210134, + "grad_norm": 2.369579553604126, + "learning_rate": 0.0002, + "loss": 2.4761, + "step": 50640 + }, + { + "epoch": 3.7742175856929956, + "grad_norm": 2.708627223968506, + "learning_rate": 0.0002, + "loss": 2.5875, + "step": 50650 + }, + { + "epoch": 3.774962742175857, + "grad_norm": 2.6574110984802246, + "learning_rate": 0.0002, + "loss": 2.4478, + "step": 50660 + }, + { + "epoch": 3.7757078986587183, + "grad_norm": 2.582005739212036, + "learning_rate": 0.0002, + "loss": 2.4235, + "step": 50670 + }, + { + "epoch": 3.77645305514158, + "grad_norm": 2.9919252395629883, + "learning_rate": 0.0002, + "loss": 2.5915, + "step": 50680 + }, + { + "epoch": 3.777198211624441, + "grad_norm": 2.489131212234497, + "learning_rate": 0.0002, + "loss": 2.5323, + "step": 50690 + }, + { + "epoch": 3.7779433681073025, + "grad_norm": 3.1119613647460938, + "learning_rate": 0.0002, + "loss": 2.4534, + "step": 50700 + }, + { + "epoch": 3.778688524590164, + "grad_norm": 2.639336585998535, + "learning_rate": 0.0002, + "loss": 2.5858, + "step": 50710 + }, + { + "epoch": 3.7794336810730256, + "grad_norm": 2.6446197032928467, + "learning_rate": 0.0002, + "loss": 2.4894, + "step": 50720 + }, + { + "epoch": 3.7801788375558867, + "grad_norm": 2.721428871154785, + "learning_rate": 0.0002, + "loss": 2.6664, + "step": 50730 + }, + { + "epoch": 3.7809239940387482, + "grad_norm": 3.0219340324401855, + "learning_rate": 0.0002, + "loss": 2.5447, + "step": 50740 + }, + { + "epoch": 3.7816691505216093, + "grad_norm": 2.371525764465332, + "learning_rate": 0.0002, + "loss": 2.2999, + "step": 50750 + }, + { + "epoch": 3.782414307004471, + "grad_norm": 3.023408889770508, + "learning_rate": 0.0002, + "loss": 2.4849, + "step": 50760 + }, + { + "epoch": 3.7831594634873325, + "grad_norm": 2.1331787109375, + "learning_rate": 0.0002, + "loss": 2.5371, + "step": 50770 + }, + { + "epoch": 3.783904619970194, + "grad_norm": 2.6607348918914795, + "learning_rate": 0.0002, + "loss": 2.5061, + "step": 50780 + }, + { + "epoch": 3.784649776453055, + "grad_norm": 2.316195011138916, + "learning_rate": 0.0002, + "loss": 2.5484, + "step": 50790 + }, + { + "epoch": 3.7853949329359167, + "grad_norm": 2.7560927867889404, + "learning_rate": 0.0002, + "loss": 2.4545, + "step": 50800 + }, + { + "epoch": 3.7861400894187778, + "grad_norm": 2.3662703037261963, + "learning_rate": 0.0002, + "loss": 2.5492, + "step": 50810 + }, + { + "epoch": 3.7868852459016393, + "grad_norm": 2.9224653244018555, + "learning_rate": 0.0002, + "loss": 2.6868, + "step": 50820 + }, + { + "epoch": 3.787630402384501, + "grad_norm": 2.669923782348633, + "learning_rate": 0.0002, + "loss": 2.3945, + "step": 50830 + }, + { + "epoch": 3.788375558867362, + "grad_norm": 2.4683589935302734, + "learning_rate": 0.0002, + "loss": 2.6, + "step": 50840 + }, + { + "epoch": 3.7891207153502235, + "grad_norm": 2.4863712787628174, + "learning_rate": 0.0002, + "loss": 2.6107, + "step": 50850 + }, + { + "epoch": 3.789865871833085, + "grad_norm": 2.4746665954589844, + "learning_rate": 0.0002, + "loss": 2.421, + "step": 50860 + }, + { + "epoch": 3.790611028315946, + "grad_norm": 2.53080153465271, + "learning_rate": 0.0002, + "loss": 2.2403, + "step": 50870 + }, + { + "epoch": 3.7913561847988078, + "grad_norm": 2.8547794818878174, + "learning_rate": 0.0002, + "loss": 2.4001, + "step": 50880 + }, + { + "epoch": 3.7921013412816693, + "grad_norm": 2.5060040950775146, + "learning_rate": 0.0002, + "loss": 2.4906, + "step": 50890 + }, + { + "epoch": 3.7928464977645304, + "grad_norm": 2.542915105819702, + "learning_rate": 0.0002, + "loss": 2.3689, + "step": 50900 + }, + { + "epoch": 3.793591654247392, + "grad_norm": 2.546762704849243, + "learning_rate": 0.0002, + "loss": 2.5893, + "step": 50910 + }, + { + "epoch": 3.794336810730253, + "grad_norm": 2.9480457305908203, + "learning_rate": 0.0002, + "loss": 2.4305, + "step": 50920 + }, + { + "epoch": 3.7950819672131146, + "grad_norm": 2.469069719314575, + "learning_rate": 0.0002, + "loss": 2.3765, + "step": 50930 + }, + { + "epoch": 3.795827123695976, + "grad_norm": 2.714322328567505, + "learning_rate": 0.0002, + "loss": 2.5366, + "step": 50940 + }, + { + "epoch": 3.7965722801788377, + "grad_norm": 2.614262580871582, + "learning_rate": 0.0002, + "loss": 2.721, + "step": 50950 + }, + { + "epoch": 3.797317436661699, + "grad_norm": 2.443918466567993, + "learning_rate": 0.0002, + "loss": 2.3982, + "step": 50960 + }, + { + "epoch": 3.7980625931445604, + "grad_norm": 2.338960647583008, + "learning_rate": 0.0002, + "loss": 2.3152, + "step": 50970 + }, + { + "epoch": 3.7988077496274215, + "grad_norm": 2.9768331050872803, + "learning_rate": 0.0002, + "loss": 2.4752, + "step": 50980 + }, + { + "epoch": 3.799552906110283, + "grad_norm": 2.6073148250579834, + "learning_rate": 0.0002, + "loss": 2.4918, + "step": 50990 + }, + { + "epoch": 3.8002980625931446, + "grad_norm": 2.7576828002929688, + "learning_rate": 0.0002, + "loss": 2.5247, + "step": 51000 + }, + { + "epoch": 3.801043219076006, + "grad_norm": 2.710273504257202, + "learning_rate": 0.0002, + "loss": 2.6895, + "step": 51010 + }, + { + "epoch": 3.8017883755588673, + "grad_norm": 2.608431339263916, + "learning_rate": 0.0002, + "loss": 2.4041, + "step": 51020 + }, + { + "epoch": 3.802533532041729, + "grad_norm": 2.511503219604492, + "learning_rate": 0.0002, + "loss": 2.4616, + "step": 51030 + }, + { + "epoch": 3.80327868852459, + "grad_norm": 2.6001126766204834, + "learning_rate": 0.0002, + "loss": 2.467, + "step": 51040 + }, + { + "epoch": 3.8040238450074515, + "grad_norm": 2.7922747135162354, + "learning_rate": 0.0002, + "loss": 2.5395, + "step": 51050 + }, + { + "epoch": 3.804769001490313, + "grad_norm": 2.260394334793091, + "learning_rate": 0.0002, + "loss": 2.2577, + "step": 51060 + }, + { + "epoch": 3.8055141579731746, + "grad_norm": 2.6235592365264893, + "learning_rate": 0.0002, + "loss": 2.3754, + "step": 51070 + }, + { + "epoch": 3.8062593144560357, + "grad_norm": 2.6814324855804443, + "learning_rate": 0.0002, + "loss": 2.495, + "step": 51080 + }, + { + "epoch": 3.8070044709388973, + "grad_norm": 1.8012224435806274, + "learning_rate": 0.0002, + "loss": 2.4035, + "step": 51090 + }, + { + "epoch": 3.8077496274217584, + "grad_norm": 2.7128658294677734, + "learning_rate": 0.0002, + "loss": 2.6043, + "step": 51100 + }, + { + "epoch": 3.80849478390462, + "grad_norm": 2.600168466567993, + "learning_rate": 0.0002, + "loss": 2.6245, + "step": 51110 + }, + { + "epoch": 3.8092399403874815, + "grad_norm": 2.9342854022979736, + "learning_rate": 0.0002, + "loss": 2.5065, + "step": 51120 + }, + { + "epoch": 3.809985096870343, + "grad_norm": 2.6882317066192627, + "learning_rate": 0.0002, + "loss": 2.5987, + "step": 51130 + }, + { + "epoch": 3.810730253353204, + "grad_norm": 2.672231674194336, + "learning_rate": 0.0002, + "loss": 2.5405, + "step": 51140 + }, + { + "epoch": 3.8114754098360657, + "grad_norm": 2.7051048278808594, + "learning_rate": 0.0002, + "loss": 2.5367, + "step": 51150 + }, + { + "epoch": 3.812220566318927, + "grad_norm": 2.8326873779296875, + "learning_rate": 0.0002, + "loss": 2.6169, + "step": 51160 + }, + { + "epoch": 3.8129657228017884, + "grad_norm": 2.9274744987487793, + "learning_rate": 0.0002, + "loss": 2.6881, + "step": 51170 + }, + { + "epoch": 3.81371087928465, + "grad_norm": 2.338207721710205, + "learning_rate": 0.0002, + "loss": 2.5462, + "step": 51180 + }, + { + "epoch": 3.814456035767511, + "grad_norm": 2.5919198989868164, + "learning_rate": 0.0002, + "loss": 2.2888, + "step": 51190 + }, + { + "epoch": 3.8152011922503726, + "grad_norm": 2.5705173015594482, + "learning_rate": 0.0002, + "loss": 2.5768, + "step": 51200 + }, + { + "epoch": 3.815946348733234, + "grad_norm": 2.6229724884033203, + "learning_rate": 0.0002, + "loss": 2.4647, + "step": 51210 + }, + { + "epoch": 3.8166915052160952, + "grad_norm": 2.2743210792541504, + "learning_rate": 0.0002, + "loss": 2.3109, + "step": 51220 + }, + { + "epoch": 3.817436661698957, + "grad_norm": 2.2738277912139893, + "learning_rate": 0.0002, + "loss": 2.5418, + "step": 51230 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 2.817492961883545, + "learning_rate": 0.0002, + "loss": 2.5758, + "step": 51240 + }, + { + "epoch": 3.8189269746646795, + "grad_norm": 2.5765511989593506, + "learning_rate": 0.0002, + "loss": 2.428, + "step": 51250 + }, + { + "epoch": 3.819672131147541, + "grad_norm": 2.62100887298584, + "learning_rate": 0.0002, + "loss": 2.2739, + "step": 51260 + }, + { + "epoch": 3.820417287630402, + "grad_norm": 2.599499225616455, + "learning_rate": 0.0002, + "loss": 2.6626, + "step": 51270 + }, + { + "epoch": 3.8211624441132637, + "grad_norm": 3.347637176513672, + "learning_rate": 0.0002, + "loss": 2.441, + "step": 51280 + }, + { + "epoch": 3.821907600596125, + "grad_norm": 2.842421293258667, + "learning_rate": 0.0002, + "loss": 2.5185, + "step": 51290 + }, + { + "epoch": 3.8226527570789868, + "grad_norm": 2.7661526203155518, + "learning_rate": 0.0002, + "loss": 2.5964, + "step": 51300 + }, + { + "epoch": 3.823397913561848, + "grad_norm": 2.7636051177978516, + "learning_rate": 0.0002, + "loss": 2.6437, + "step": 51310 + }, + { + "epoch": 3.8241430700447094, + "grad_norm": 2.534313678741455, + "learning_rate": 0.0002, + "loss": 2.4475, + "step": 51320 + }, + { + "epoch": 3.8248882265275705, + "grad_norm": 1.8897809982299805, + "learning_rate": 0.0002, + "loss": 2.1495, + "step": 51330 + }, + { + "epoch": 3.825633383010432, + "grad_norm": 2.465552806854248, + "learning_rate": 0.0002, + "loss": 2.4313, + "step": 51340 + }, + { + "epoch": 3.8263785394932937, + "grad_norm": 2.8640079498291016, + "learning_rate": 0.0002, + "loss": 2.6838, + "step": 51350 + }, + { + "epoch": 3.827123695976155, + "grad_norm": 1.6258368492126465, + "learning_rate": 0.0002, + "loss": 2.3335, + "step": 51360 + }, + { + "epoch": 3.8278688524590163, + "grad_norm": 2.6712746620178223, + "learning_rate": 0.0002, + "loss": 2.5755, + "step": 51370 + }, + { + "epoch": 3.828614008941878, + "grad_norm": 2.809267282485962, + "learning_rate": 0.0002, + "loss": 2.4954, + "step": 51380 + }, + { + "epoch": 3.829359165424739, + "grad_norm": 2.894386053085327, + "learning_rate": 0.0002, + "loss": 2.4332, + "step": 51390 + }, + { + "epoch": 3.8301043219076005, + "grad_norm": 2.8683619499206543, + "learning_rate": 0.0002, + "loss": 2.4112, + "step": 51400 + }, + { + "epoch": 3.830849478390462, + "grad_norm": 2.383283853530884, + "learning_rate": 0.0002, + "loss": 2.5905, + "step": 51410 + }, + { + "epoch": 3.8315946348733236, + "grad_norm": 2.6616315841674805, + "learning_rate": 0.0002, + "loss": 2.416, + "step": 51420 + }, + { + "epoch": 3.8323397913561847, + "grad_norm": 3.09716796875, + "learning_rate": 0.0002, + "loss": 2.5738, + "step": 51430 + }, + { + "epoch": 3.8330849478390463, + "grad_norm": 2.646336793899536, + "learning_rate": 0.0002, + "loss": 2.5713, + "step": 51440 + }, + { + "epoch": 3.8338301043219074, + "grad_norm": 2.847592353820801, + "learning_rate": 0.0002, + "loss": 2.3993, + "step": 51450 + }, + { + "epoch": 3.834575260804769, + "grad_norm": 2.573282480239868, + "learning_rate": 0.0002, + "loss": 2.4144, + "step": 51460 + }, + { + "epoch": 3.8353204172876305, + "grad_norm": 2.621114492416382, + "learning_rate": 0.0002, + "loss": 2.418, + "step": 51470 + }, + { + "epoch": 3.836065573770492, + "grad_norm": 2.757758855819702, + "learning_rate": 0.0002, + "loss": 2.3968, + "step": 51480 + }, + { + "epoch": 3.836810730253353, + "grad_norm": 2.8561811447143555, + "learning_rate": 0.0002, + "loss": 2.4822, + "step": 51490 + }, + { + "epoch": 3.8375558867362147, + "grad_norm": 2.84928035736084, + "learning_rate": 0.0002, + "loss": 2.2998, + "step": 51500 + }, + { + "epoch": 3.838301043219076, + "grad_norm": 2.6037542819976807, + "learning_rate": 0.0002, + "loss": 2.4761, + "step": 51510 + }, + { + "epoch": 3.8390461997019374, + "grad_norm": 3.122798442840576, + "learning_rate": 0.0002, + "loss": 2.6203, + "step": 51520 + }, + { + "epoch": 3.839791356184799, + "grad_norm": 2.524787664413452, + "learning_rate": 0.0002, + "loss": 2.4676, + "step": 51530 + }, + { + "epoch": 3.84053651266766, + "grad_norm": 2.385199785232544, + "learning_rate": 0.0002, + "loss": 2.4442, + "step": 51540 + }, + { + "epoch": 3.8412816691505216, + "grad_norm": 2.4337494373321533, + "learning_rate": 0.0002, + "loss": 2.5004, + "step": 51550 + }, + { + "epoch": 3.842026825633383, + "grad_norm": 2.2421205043792725, + "learning_rate": 0.0002, + "loss": 2.362, + "step": 51560 + }, + { + "epoch": 3.8427719821162443, + "grad_norm": 2.355182647705078, + "learning_rate": 0.0002, + "loss": 2.4003, + "step": 51570 + }, + { + "epoch": 3.843517138599106, + "grad_norm": 2.7259955406188965, + "learning_rate": 0.0002, + "loss": 2.362, + "step": 51580 + }, + { + "epoch": 3.8442622950819674, + "grad_norm": 2.447803258895874, + "learning_rate": 0.0002, + "loss": 2.4592, + "step": 51590 + }, + { + "epoch": 3.8450074515648285, + "grad_norm": 3.1207966804504395, + "learning_rate": 0.0002, + "loss": 2.3862, + "step": 51600 + }, + { + "epoch": 3.84575260804769, + "grad_norm": 3.0702199935913086, + "learning_rate": 0.0002, + "loss": 2.58, + "step": 51610 + }, + { + "epoch": 3.846497764530551, + "grad_norm": 2.7334744930267334, + "learning_rate": 0.0002, + "loss": 2.5699, + "step": 51620 + }, + { + "epoch": 3.8472429210134127, + "grad_norm": 2.2549920082092285, + "learning_rate": 0.0002, + "loss": 2.3604, + "step": 51630 + }, + { + "epoch": 3.8479880774962743, + "grad_norm": 2.400271415710449, + "learning_rate": 0.0002, + "loss": 2.6297, + "step": 51640 + }, + { + "epoch": 3.848733233979136, + "grad_norm": 2.1789305210113525, + "learning_rate": 0.0002, + "loss": 2.5277, + "step": 51650 + }, + { + "epoch": 3.849478390461997, + "grad_norm": 2.701901435852051, + "learning_rate": 0.0002, + "loss": 2.4133, + "step": 51660 + }, + { + "epoch": 3.8502235469448585, + "grad_norm": 2.7472198009490967, + "learning_rate": 0.0002, + "loss": 2.3137, + "step": 51670 + }, + { + "epoch": 3.8509687034277196, + "grad_norm": 2.6537559032440186, + "learning_rate": 0.0002, + "loss": 2.5885, + "step": 51680 + }, + { + "epoch": 3.851713859910581, + "grad_norm": 2.521488666534424, + "learning_rate": 0.0002, + "loss": 2.5628, + "step": 51690 + }, + { + "epoch": 3.8524590163934427, + "grad_norm": 2.5507121086120605, + "learning_rate": 0.0002, + "loss": 2.4122, + "step": 51700 + }, + { + "epoch": 3.8532041728763042, + "grad_norm": 2.589373826980591, + "learning_rate": 0.0002, + "loss": 2.6058, + "step": 51710 + }, + { + "epoch": 3.8539493293591653, + "grad_norm": 2.9518160820007324, + "learning_rate": 0.0002, + "loss": 2.5514, + "step": 51720 + }, + { + "epoch": 3.854694485842027, + "grad_norm": 2.976022243499756, + "learning_rate": 0.0002, + "loss": 2.556, + "step": 51730 + }, + { + "epoch": 3.855439642324888, + "grad_norm": 2.435248374938965, + "learning_rate": 0.0002, + "loss": 2.4229, + "step": 51740 + }, + { + "epoch": 3.8561847988077496, + "grad_norm": 2.6801443099975586, + "learning_rate": 0.0002, + "loss": 2.4686, + "step": 51750 + }, + { + "epoch": 3.856929955290611, + "grad_norm": 1.9394075870513916, + "learning_rate": 0.0002, + "loss": 2.2974, + "step": 51760 + }, + { + "epoch": 3.8576751117734727, + "grad_norm": 2.496823310852051, + "learning_rate": 0.0002, + "loss": 2.3496, + "step": 51770 + }, + { + "epoch": 3.8584202682563338, + "grad_norm": 3.03865122795105, + "learning_rate": 0.0002, + "loss": 2.2881, + "step": 51780 + }, + { + "epoch": 3.8591654247391953, + "grad_norm": 2.5730419158935547, + "learning_rate": 0.0002, + "loss": 2.392, + "step": 51790 + }, + { + "epoch": 3.8599105812220564, + "grad_norm": 2.6450493335723877, + "learning_rate": 0.0002, + "loss": 2.5092, + "step": 51800 + }, + { + "epoch": 3.860655737704918, + "grad_norm": 2.5688185691833496, + "learning_rate": 0.0002, + "loss": 2.366, + "step": 51810 + }, + { + "epoch": 3.8614008941877795, + "grad_norm": 2.4910011291503906, + "learning_rate": 0.0002, + "loss": 2.4483, + "step": 51820 + }, + { + "epoch": 3.862146050670641, + "grad_norm": 3.078287363052368, + "learning_rate": 0.0002, + "loss": 2.6476, + "step": 51830 + }, + { + "epoch": 3.862891207153502, + "grad_norm": 2.2944555282592773, + "learning_rate": 0.0002, + "loss": 2.5578, + "step": 51840 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 2.777221202850342, + "learning_rate": 0.0002, + "loss": 2.4892, + "step": 51850 + }, + { + "epoch": 3.864381520119225, + "grad_norm": 2.459630250930786, + "learning_rate": 0.0002, + "loss": 2.5713, + "step": 51860 + }, + { + "epoch": 3.8651266766020864, + "grad_norm": 3.0153188705444336, + "learning_rate": 0.0002, + "loss": 2.6051, + "step": 51870 + }, + { + "epoch": 3.865871833084948, + "grad_norm": 2.6499626636505127, + "learning_rate": 0.0002, + "loss": 2.5795, + "step": 51880 + }, + { + "epoch": 3.866616989567809, + "grad_norm": 2.9734129905700684, + "learning_rate": 0.0002, + "loss": 2.5746, + "step": 51890 + }, + { + "epoch": 3.8673621460506706, + "grad_norm": 2.2308945655822754, + "learning_rate": 0.0002, + "loss": 2.4392, + "step": 51900 + }, + { + "epoch": 3.868107302533532, + "grad_norm": 2.5618507862091064, + "learning_rate": 0.0002, + "loss": 2.4746, + "step": 51910 + }, + { + "epoch": 3.8688524590163933, + "grad_norm": 2.3245627880096436, + "learning_rate": 0.0002, + "loss": 2.4258, + "step": 51920 + }, + { + "epoch": 3.869597615499255, + "grad_norm": 2.2224037647247314, + "learning_rate": 0.0002, + "loss": 2.5592, + "step": 51930 + }, + { + "epoch": 3.8703427719821164, + "grad_norm": 3.254966974258423, + "learning_rate": 0.0002, + "loss": 2.5822, + "step": 51940 + }, + { + "epoch": 3.8710879284649775, + "grad_norm": 2.469639778137207, + "learning_rate": 0.0002, + "loss": 2.4718, + "step": 51950 + }, + { + "epoch": 3.871833084947839, + "grad_norm": 2.6414148807525635, + "learning_rate": 0.0002, + "loss": 2.5728, + "step": 51960 + }, + { + "epoch": 3.8725782414307, + "grad_norm": 2.460594892501831, + "learning_rate": 0.0002, + "loss": 2.3055, + "step": 51970 + }, + { + "epoch": 3.8733233979135617, + "grad_norm": 2.6977572441101074, + "learning_rate": 0.0002, + "loss": 2.5869, + "step": 51980 + }, + { + "epoch": 3.8740685543964233, + "grad_norm": 2.5198538303375244, + "learning_rate": 0.0002, + "loss": 2.3507, + "step": 51990 + }, + { + "epoch": 3.874813710879285, + "grad_norm": 2.7170722484588623, + "learning_rate": 0.0002, + "loss": 2.6903, + "step": 52000 + }, + { + "epoch": 3.875558867362146, + "grad_norm": 2.7183640003204346, + "learning_rate": 0.0002, + "loss": 2.6414, + "step": 52010 + }, + { + "epoch": 3.8763040238450075, + "grad_norm": 2.5241034030914307, + "learning_rate": 0.0002, + "loss": 2.2518, + "step": 52020 + }, + { + "epoch": 3.8770491803278686, + "grad_norm": 2.6409521102905273, + "learning_rate": 0.0002, + "loss": 2.4558, + "step": 52030 + }, + { + "epoch": 3.87779433681073, + "grad_norm": 2.5587220191955566, + "learning_rate": 0.0002, + "loss": 2.3879, + "step": 52040 + }, + { + "epoch": 3.8785394932935917, + "grad_norm": 2.8432421684265137, + "learning_rate": 0.0002, + "loss": 2.6653, + "step": 52050 + }, + { + "epoch": 3.8792846497764533, + "grad_norm": 3.088017225265503, + "learning_rate": 0.0002, + "loss": 2.5824, + "step": 52060 + }, + { + "epoch": 3.8800298062593144, + "grad_norm": 2.6075172424316406, + "learning_rate": 0.0002, + "loss": 2.2853, + "step": 52070 + }, + { + "epoch": 3.880774962742176, + "grad_norm": 2.3779594898223877, + "learning_rate": 0.0002, + "loss": 2.5866, + "step": 52080 + }, + { + "epoch": 3.881520119225037, + "grad_norm": 2.727888584136963, + "learning_rate": 0.0002, + "loss": 2.6519, + "step": 52090 + }, + { + "epoch": 3.8822652757078986, + "grad_norm": 3.6070122718811035, + "learning_rate": 0.0002, + "loss": 2.4691, + "step": 52100 + }, + { + "epoch": 3.88301043219076, + "grad_norm": 2.8092737197875977, + "learning_rate": 0.0002, + "loss": 2.5209, + "step": 52110 + }, + { + "epoch": 3.8837555886736217, + "grad_norm": 2.367863416671753, + "learning_rate": 0.0002, + "loss": 2.3449, + "step": 52120 + }, + { + "epoch": 3.884500745156483, + "grad_norm": 2.1919851303100586, + "learning_rate": 0.0002, + "loss": 2.6012, + "step": 52130 + }, + { + "epoch": 3.8852459016393444, + "grad_norm": 2.689823865890503, + "learning_rate": 0.0002, + "loss": 2.5961, + "step": 52140 + }, + { + "epoch": 3.8859910581222055, + "grad_norm": 2.8224596977233887, + "learning_rate": 0.0002, + "loss": 2.4739, + "step": 52150 + }, + { + "epoch": 3.886736214605067, + "grad_norm": 2.892871856689453, + "learning_rate": 0.0002, + "loss": 2.6218, + "step": 52160 + }, + { + "epoch": 3.8874813710879286, + "grad_norm": 2.5560922622680664, + "learning_rate": 0.0002, + "loss": 2.4748, + "step": 52170 + }, + { + "epoch": 3.88822652757079, + "grad_norm": 2.412459135055542, + "learning_rate": 0.0002, + "loss": 2.5312, + "step": 52180 + }, + { + "epoch": 3.8889716840536512, + "grad_norm": 2.6065220832824707, + "learning_rate": 0.0002, + "loss": 2.3348, + "step": 52190 + }, + { + "epoch": 3.889716840536513, + "grad_norm": 2.771390199661255, + "learning_rate": 0.0002, + "loss": 2.4942, + "step": 52200 + }, + { + "epoch": 3.890461997019374, + "grad_norm": 2.511683702468872, + "learning_rate": 0.0002, + "loss": 2.5767, + "step": 52210 + }, + { + "epoch": 3.8912071535022354, + "grad_norm": 2.926457643508911, + "learning_rate": 0.0002, + "loss": 2.4977, + "step": 52220 + }, + { + "epoch": 3.891952309985097, + "grad_norm": 2.466374397277832, + "learning_rate": 0.0002, + "loss": 2.5909, + "step": 52230 + }, + { + "epoch": 3.892697466467958, + "grad_norm": 2.454491376876831, + "learning_rate": 0.0002, + "loss": 2.2894, + "step": 52240 + }, + { + "epoch": 3.8934426229508197, + "grad_norm": 2.548574209213257, + "learning_rate": 0.0002, + "loss": 2.4649, + "step": 52250 + }, + { + "epoch": 3.894187779433681, + "grad_norm": 2.4088494777679443, + "learning_rate": 0.0002, + "loss": 2.1674, + "step": 52260 + }, + { + "epoch": 3.8949329359165423, + "grad_norm": 2.7018039226531982, + "learning_rate": 0.0002, + "loss": 2.5642, + "step": 52270 + }, + { + "epoch": 3.895678092399404, + "grad_norm": 2.7482492923736572, + "learning_rate": 0.0002, + "loss": 2.4304, + "step": 52280 + }, + { + "epoch": 3.8964232488822654, + "grad_norm": 2.329071521759033, + "learning_rate": 0.0002, + "loss": 2.4925, + "step": 52290 + }, + { + "epoch": 3.8971684053651265, + "grad_norm": 2.745163917541504, + "learning_rate": 0.0002, + "loss": 2.465, + "step": 52300 + }, + { + "epoch": 3.897913561847988, + "grad_norm": 2.767301559448242, + "learning_rate": 0.0002, + "loss": 2.5761, + "step": 52310 + }, + { + "epoch": 3.898658718330849, + "grad_norm": 2.6464600563049316, + "learning_rate": 0.0002, + "loss": 2.3783, + "step": 52320 + }, + { + "epoch": 3.8994038748137108, + "grad_norm": 2.457310199737549, + "learning_rate": 0.0002, + "loss": 2.0868, + "step": 52330 + }, + { + "epoch": 3.9001490312965723, + "grad_norm": 2.3915839195251465, + "learning_rate": 0.0002, + "loss": 2.4427, + "step": 52340 + }, + { + "epoch": 3.900894187779434, + "grad_norm": 2.6058714389801025, + "learning_rate": 0.0002, + "loss": 2.4721, + "step": 52350 + }, + { + "epoch": 3.901639344262295, + "grad_norm": 3.087757110595703, + "learning_rate": 0.0002, + "loss": 2.567, + "step": 52360 + }, + { + "epoch": 3.9023845007451565, + "grad_norm": 2.904249429702759, + "learning_rate": 0.0002, + "loss": 2.4766, + "step": 52370 + }, + { + "epoch": 3.9031296572280176, + "grad_norm": 2.673194169998169, + "learning_rate": 0.0002, + "loss": 2.6861, + "step": 52380 + }, + { + "epoch": 3.903874813710879, + "grad_norm": 2.4433786869049072, + "learning_rate": 0.0002, + "loss": 2.4555, + "step": 52390 + }, + { + "epoch": 3.9046199701937407, + "grad_norm": 2.482530117034912, + "learning_rate": 0.0002, + "loss": 2.5579, + "step": 52400 + }, + { + "epoch": 3.9053651266766023, + "grad_norm": 2.7197022438049316, + "learning_rate": 0.0002, + "loss": 2.4326, + "step": 52410 + }, + { + "epoch": 3.9061102831594634, + "grad_norm": 2.534710645675659, + "learning_rate": 0.0002, + "loss": 2.7553, + "step": 52420 + }, + { + "epoch": 3.906855439642325, + "grad_norm": 2.1356122493743896, + "learning_rate": 0.0002, + "loss": 2.2721, + "step": 52430 + }, + { + "epoch": 3.907600596125186, + "grad_norm": 2.3630571365356445, + "learning_rate": 0.0002, + "loss": 2.5597, + "step": 52440 + }, + { + "epoch": 3.9083457526080476, + "grad_norm": 2.6591246128082275, + "learning_rate": 0.0002, + "loss": 2.4836, + "step": 52450 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 2.511927604675293, + "learning_rate": 0.0002, + "loss": 2.6249, + "step": 52460 + }, + { + "epoch": 3.9098360655737707, + "grad_norm": 2.8676466941833496, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 52470 + }, + { + "epoch": 3.910581222056632, + "grad_norm": 2.7902708053588867, + "learning_rate": 0.0002, + "loss": 2.5414, + "step": 52480 + }, + { + "epoch": 3.9113263785394934, + "grad_norm": 2.6967098712921143, + "learning_rate": 0.0002, + "loss": 2.5556, + "step": 52490 + }, + { + "epoch": 3.9120715350223545, + "grad_norm": 2.569732904434204, + "learning_rate": 0.0002, + "loss": 2.4874, + "step": 52500 + }, + { + "epoch": 3.912816691505216, + "grad_norm": 2.595181465148926, + "learning_rate": 0.0002, + "loss": 2.5599, + "step": 52510 + }, + { + "epoch": 3.9135618479880776, + "grad_norm": 2.3675436973571777, + "learning_rate": 0.0002, + "loss": 2.6449, + "step": 52520 + }, + { + "epoch": 3.914307004470939, + "grad_norm": 2.709636688232422, + "learning_rate": 0.0002, + "loss": 2.5443, + "step": 52530 + }, + { + "epoch": 3.9150521609538003, + "grad_norm": 2.7258338928222656, + "learning_rate": 0.0002, + "loss": 2.3612, + "step": 52540 + }, + { + "epoch": 3.915797317436662, + "grad_norm": 3.1132118701934814, + "learning_rate": 0.0002, + "loss": 2.3272, + "step": 52550 + }, + { + "epoch": 3.916542473919523, + "grad_norm": 2.628192186355591, + "learning_rate": 0.0002, + "loss": 2.4815, + "step": 52560 + }, + { + "epoch": 3.9172876304023845, + "grad_norm": 2.5047247409820557, + "learning_rate": 0.0002, + "loss": 2.4923, + "step": 52570 + }, + { + "epoch": 3.918032786885246, + "grad_norm": 2.726745843887329, + "learning_rate": 0.0002, + "loss": 2.4188, + "step": 52580 + }, + { + "epoch": 3.918777943368107, + "grad_norm": 2.729163646697998, + "learning_rate": 0.0002, + "loss": 2.5176, + "step": 52590 + }, + { + "epoch": 3.9195230998509687, + "grad_norm": 2.8599231243133545, + "learning_rate": 0.0002, + "loss": 2.5171, + "step": 52600 + }, + { + "epoch": 3.9202682563338302, + "grad_norm": 2.2114243507385254, + "learning_rate": 0.0002, + "loss": 2.5073, + "step": 52610 + }, + { + "epoch": 3.9210134128166914, + "grad_norm": 2.164640426635742, + "learning_rate": 0.0002, + "loss": 2.6172, + "step": 52620 + }, + { + "epoch": 3.921758569299553, + "grad_norm": 2.7725958824157715, + "learning_rate": 0.0002, + "loss": 2.5268, + "step": 52630 + }, + { + "epoch": 3.9225037257824145, + "grad_norm": 2.6083829402923584, + "learning_rate": 0.0002, + "loss": 2.4921, + "step": 52640 + }, + { + "epoch": 3.9232488822652756, + "grad_norm": 2.948575496673584, + "learning_rate": 0.0002, + "loss": 2.5642, + "step": 52650 + }, + { + "epoch": 3.923994038748137, + "grad_norm": 2.494868755340576, + "learning_rate": 0.0002, + "loss": 2.4473, + "step": 52660 + }, + { + "epoch": 3.9247391952309982, + "grad_norm": 3.170365333557129, + "learning_rate": 0.0002, + "loss": 2.542, + "step": 52670 + }, + { + "epoch": 3.92548435171386, + "grad_norm": 2.6471188068389893, + "learning_rate": 0.0002, + "loss": 2.436, + "step": 52680 + }, + { + "epoch": 3.9262295081967213, + "grad_norm": 2.868462562561035, + "learning_rate": 0.0002, + "loss": 2.4097, + "step": 52690 + }, + { + "epoch": 3.926974664679583, + "grad_norm": 2.8815088272094727, + "learning_rate": 0.0002, + "loss": 2.5023, + "step": 52700 + }, + { + "epoch": 3.927719821162444, + "grad_norm": 2.6703245639801025, + "learning_rate": 0.0002, + "loss": 2.5889, + "step": 52710 + }, + { + "epoch": 3.9284649776453056, + "grad_norm": 2.9111456871032715, + "learning_rate": 0.0002, + "loss": 2.3997, + "step": 52720 + }, + { + "epoch": 3.9292101341281667, + "grad_norm": 3.0684051513671875, + "learning_rate": 0.0002, + "loss": 2.379, + "step": 52730 + }, + { + "epoch": 3.929955290611028, + "grad_norm": 2.901998996734619, + "learning_rate": 0.0002, + "loss": 2.6043, + "step": 52740 + }, + { + "epoch": 3.9307004470938898, + "grad_norm": 3.4542715549468994, + "learning_rate": 0.0002, + "loss": 2.2915, + "step": 52750 + }, + { + "epoch": 3.9314456035767513, + "grad_norm": 2.3926374912261963, + "learning_rate": 0.0002, + "loss": 2.2738, + "step": 52760 + }, + { + "epoch": 3.9321907600596124, + "grad_norm": 2.790262222290039, + "learning_rate": 0.0002, + "loss": 2.4374, + "step": 52770 + }, + { + "epoch": 3.932935916542474, + "grad_norm": 3.287041664123535, + "learning_rate": 0.0002, + "loss": 2.3272, + "step": 52780 + }, + { + "epoch": 3.933681073025335, + "grad_norm": 2.6632344722747803, + "learning_rate": 0.0002, + "loss": 2.619, + "step": 52790 + }, + { + "epoch": 3.9344262295081966, + "grad_norm": 2.1901233196258545, + "learning_rate": 0.0002, + "loss": 2.3979, + "step": 52800 + }, + { + "epoch": 3.935171385991058, + "grad_norm": 2.72855544090271, + "learning_rate": 0.0002, + "loss": 2.5868, + "step": 52810 + }, + { + "epoch": 3.9359165424739198, + "grad_norm": 2.598860502243042, + "learning_rate": 0.0002, + "loss": 2.4699, + "step": 52820 + }, + { + "epoch": 3.936661698956781, + "grad_norm": 2.5119919776916504, + "learning_rate": 0.0002, + "loss": 2.3506, + "step": 52830 + }, + { + "epoch": 3.9374068554396424, + "grad_norm": 2.60629940032959, + "learning_rate": 0.0002, + "loss": 2.5943, + "step": 52840 + }, + { + "epoch": 3.9381520119225035, + "grad_norm": 2.4986929893493652, + "learning_rate": 0.0002, + "loss": 2.4542, + "step": 52850 + }, + { + "epoch": 3.938897168405365, + "grad_norm": 2.564370632171631, + "learning_rate": 0.0002, + "loss": 2.4587, + "step": 52860 + }, + { + "epoch": 3.9396423248882266, + "grad_norm": 3.0465457439422607, + "learning_rate": 0.0002, + "loss": 2.5511, + "step": 52870 + }, + { + "epoch": 3.940387481371088, + "grad_norm": 2.802629232406616, + "learning_rate": 0.0002, + "loss": 2.5935, + "step": 52880 + }, + { + "epoch": 3.9411326378539493, + "grad_norm": 2.5224101543426514, + "learning_rate": 0.0002, + "loss": 2.4176, + "step": 52890 + }, + { + "epoch": 3.941877794336811, + "grad_norm": 2.601170301437378, + "learning_rate": 0.0002, + "loss": 2.4259, + "step": 52900 + }, + { + "epoch": 3.942622950819672, + "grad_norm": 2.618849515914917, + "learning_rate": 0.0002, + "loss": 2.5917, + "step": 52910 + }, + { + "epoch": 3.9433681073025335, + "grad_norm": 2.3083279132843018, + "learning_rate": 0.0002, + "loss": 2.4501, + "step": 52920 + }, + { + "epoch": 3.944113263785395, + "grad_norm": 2.783717155456543, + "learning_rate": 0.0002, + "loss": 2.6085, + "step": 52930 + }, + { + "epoch": 3.944858420268256, + "grad_norm": 2.4879956245422363, + "learning_rate": 0.0002, + "loss": 2.3325, + "step": 52940 + }, + { + "epoch": 3.9456035767511177, + "grad_norm": 2.765442132949829, + "learning_rate": 0.0002, + "loss": 2.5773, + "step": 52950 + }, + { + "epoch": 3.9463487332339793, + "grad_norm": 2.7897586822509766, + "learning_rate": 0.0002, + "loss": 2.4599, + "step": 52960 + }, + { + "epoch": 3.9470938897168404, + "grad_norm": 2.905817747116089, + "learning_rate": 0.0002, + "loss": 2.475, + "step": 52970 + }, + { + "epoch": 3.947839046199702, + "grad_norm": 2.8461194038391113, + "learning_rate": 0.0002, + "loss": 2.6288, + "step": 52980 + }, + { + "epoch": 3.9485842026825635, + "grad_norm": 2.5353071689605713, + "learning_rate": 0.0002, + "loss": 2.4977, + "step": 52990 + }, + { + "epoch": 3.9493293591654246, + "grad_norm": 2.5417919158935547, + "learning_rate": 0.0002, + "loss": 2.4847, + "step": 53000 + }, + { + "epoch": 3.950074515648286, + "grad_norm": 2.5590386390686035, + "learning_rate": 0.0002, + "loss": 2.4574, + "step": 53010 + }, + { + "epoch": 3.9508196721311473, + "grad_norm": 2.5833194255828857, + "learning_rate": 0.0002, + "loss": 2.1748, + "step": 53020 + }, + { + "epoch": 3.951564828614009, + "grad_norm": 2.8139731884002686, + "learning_rate": 0.0002, + "loss": 2.5048, + "step": 53030 + }, + { + "epoch": 3.9523099850968704, + "grad_norm": 2.520247459411621, + "learning_rate": 0.0002, + "loss": 2.5423, + "step": 53040 + }, + { + "epoch": 3.953055141579732, + "grad_norm": 2.8328166007995605, + "learning_rate": 0.0002, + "loss": 2.7476, + "step": 53050 + }, + { + "epoch": 3.953800298062593, + "grad_norm": 2.491218328475952, + "learning_rate": 0.0002, + "loss": 2.5617, + "step": 53060 + }, + { + "epoch": 3.9545454545454546, + "grad_norm": 2.5774359703063965, + "learning_rate": 0.0002, + "loss": 2.4691, + "step": 53070 + }, + { + "epoch": 3.9552906110283157, + "grad_norm": 2.8533051013946533, + "learning_rate": 0.0002, + "loss": 2.7179, + "step": 53080 + }, + { + "epoch": 3.9560357675111772, + "grad_norm": 2.5596423149108887, + "learning_rate": 0.0002, + "loss": 2.5002, + "step": 53090 + }, + { + "epoch": 3.956780923994039, + "grad_norm": 2.4977810382843018, + "learning_rate": 0.0002, + "loss": 2.5137, + "step": 53100 + }, + { + "epoch": 3.9575260804769004, + "grad_norm": 2.417628526687622, + "learning_rate": 0.0002, + "loss": 2.4268, + "step": 53110 + }, + { + "epoch": 3.9582712369597615, + "grad_norm": 2.7879631519317627, + "learning_rate": 0.0002, + "loss": 2.6652, + "step": 53120 + }, + { + "epoch": 3.959016393442623, + "grad_norm": 2.732900857925415, + "learning_rate": 0.0002, + "loss": 2.4687, + "step": 53130 + }, + { + "epoch": 3.959761549925484, + "grad_norm": 2.5908401012420654, + "learning_rate": 0.0002, + "loss": 2.4273, + "step": 53140 + }, + { + "epoch": 3.9605067064083457, + "grad_norm": 3.021219253540039, + "learning_rate": 0.0002, + "loss": 2.4994, + "step": 53150 + }, + { + "epoch": 3.9612518628912072, + "grad_norm": 2.95470929145813, + "learning_rate": 0.0002, + "loss": 2.4703, + "step": 53160 + }, + { + "epoch": 3.961997019374069, + "grad_norm": 2.4512953758239746, + "learning_rate": 0.0002, + "loss": 2.5641, + "step": 53170 + }, + { + "epoch": 3.96274217585693, + "grad_norm": 2.5563712120056152, + "learning_rate": 0.0002, + "loss": 2.5442, + "step": 53180 + }, + { + "epoch": 3.9634873323397914, + "grad_norm": 2.6706528663635254, + "learning_rate": 0.0002, + "loss": 2.5561, + "step": 53190 + }, + { + "epoch": 3.9642324888226526, + "grad_norm": 2.7212600708007812, + "learning_rate": 0.0002, + "loss": 2.4722, + "step": 53200 + }, + { + "epoch": 3.964977645305514, + "grad_norm": 2.527886390686035, + "learning_rate": 0.0002, + "loss": 2.4376, + "step": 53210 + }, + { + "epoch": 3.9657228017883757, + "grad_norm": 2.4174673557281494, + "learning_rate": 0.0002, + "loss": 2.5337, + "step": 53220 + }, + { + "epoch": 3.966467958271237, + "grad_norm": 3.1026086807250977, + "learning_rate": 0.0002, + "loss": 2.4212, + "step": 53230 + }, + { + "epoch": 3.9672131147540983, + "grad_norm": 2.696936845779419, + "learning_rate": 0.0002, + "loss": 2.4115, + "step": 53240 + }, + { + "epoch": 3.96795827123696, + "grad_norm": 2.7065093517303467, + "learning_rate": 0.0002, + "loss": 2.575, + "step": 53250 + }, + { + "epoch": 3.968703427719821, + "grad_norm": 2.599903106689453, + "learning_rate": 0.0002, + "loss": 2.4136, + "step": 53260 + }, + { + "epoch": 3.9694485842026825, + "grad_norm": 2.7001261711120605, + "learning_rate": 0.0002, + "loss": 2.4217, + "step": 53270 + }, + { + "epoch": 3.970193740685544, + "grad_norm": 2.7080583572387695, + "learning_rate": 0.0002, + "loss": 2.6516, + "step": 53280 + }, + { + "epoch": 3.970938897168405, + "grad_norm": 2.549535036087036, + "learning_rate": 0.0002, + "loss": 2.4606, + "step": 53290 + }, + { + "epoch": 3.9716840536512668, + "grad_norm": 3.246875047683716, + "learning_rate": 0.0002, + "loss": 2.6515, + "step": 53300 + }, + { + "epoch": 3.9724292101341283, + "grad_norm": 2.5322539806365967, + "learning_rate": 0.0002, + "loss": 2.4486, + "step": 53310 + }, + { + "epoch": 3.9731743666169894, + "grad_norm": 2.2940611839294434, + "learning_rate": 0.0002, + "loss": 2.358, + "step": 53320 + }, + { + "epoch": 3.973919523099851, + "grad_norm": 2.6890695095062256, + "learning_rate": 0.0002, + "loss": 2.5776, + "step": 53330 + }, + { + "epoch": 3.9746646795827125, + "grad_norm": 2.7836878299713135, + "learning_rate": 0.0002, + "loss": 2.3885, + "step": 53340 + }, + { + "epoch": 3.9754098360655736, + "grad_norm": 2.572012186050415, + "learning_rate": 0.0002, + "loss": 2.3641, + "step": 53350 + }, + { + "epoch": 3.976154992548435, + "grad_norm": 2.936228036880493, + "learning_rate": 0.0002, + "loss": 2.4106, + "step": 53360 + }, + { + "epoch": 3.9769001490312967, + "grad_norm": 2.7177422046661377, + "learning_rate": 0.0002, + "loss": 2.6147, + "step": 53370 + }, + { + "epoch": 3.977645305514158, + "grad_norm": 2.8363125324249268, + "learning_rate": 0.0002, + "loss": 2.5204, + "step": 53380 + }, + { + "epoch": 3.9783904619970194, + "grad_norm": 2.6096811294555664, + "learning_rate": 0.0002, + "loss": 2.3503, + "step": 53390 + }, + { + "epoch": 3.979135618479881, + "grad_norm": 2.363093852996826, + "learning_rate": 0.0002, + "loss": 2.7152, + "step": 53400 + }, + { + "epoch": 3.979880774962742, + "grad_norm": 2.8034679889678955, + "learning_rate": 0.0002, + "loss": 2.5279, + "step": 53410 + }, + { + "epoch": 3.9806259314456036, + "grad_norm": 2.614542245864868, + "learning_rate": 0.0002, + "loss": 2.429, + "step": 53420 + }, + { + "epoch": 3.9813710879284647, + "grad_norm": 2.7522165775299072, + "learning_rate": 0.0002, + "loss": 2.547, + "step": 53430 + }, + { + "epoch": 3.9821162444113263, + "grad_norm": 2.595388412475586, + "learning_rate": 0.0002, + "loss": 2.566, + "step": 53440 + }, + { + "epoch": 3.982861400894188, + "grad_norm": 2.463608741760254, + "learning_rate": 0.0002, + "loss": 2.4284, + "step": 53450 + }, + { + "epoch": 3.9836065573770494, + "grad_norm": 2.377199172973633, + "learning_rate": 0.0002, + "loss": 2.5547, + "step": 53460 + }, + { + "epoch": 3.9843517138599105, + "grad_norm": 2.985874652862549, + "learning_rate": 0.0002, + "loss": 2.5976, + "step": 53470 + }, + { + "epoch": 3.985096870342772, + "grad_norm": 2.601820707321167, + "learning_rate": 0.0002, + "loss": 2.5576, + "step": 53480 + }, + { + "epoch": 3.985842026825633, + "grad_norm": 3.1383955478668213, + "learning_rate": 0.0002, + "loss": 2.4143, + "step": 53490 + }, + { + "epoch": 3.9865871833084947, + "grad_norm": 2.46256685256958, + "learning_rate": 0.0002, + "loss": 2.6677, + "step": 53500 + }, + { + "epoch": 3.9873323397913563, + "grad_norm": 2.43536376953125, + "learning_rate": 0.0002, + "loss": 2.526, + "step": 53510 + }, + { + "epoch": 3.988077496274218, + "grad_norm": 2.3847920894622803, + "learning_rate": 0.0002, + "loss": 2.6184, + "step": 53520 + }, + { + "epoch": 3.988822652757079, + "grad_norm": 2.5792295932769775, + "learning_rate": 0.0002, + "loss": 2.6166, + "step": 53530 + }, + { + "epoch": 3.9895678092399405, + "grad_norm": 2.792299509048462, + "learning_rate": 0.0002, + "loss": 2.6626, + "step": 53540 + }, + { + "epoch": 3.9903129657228016, + "grad_norm": 2.445161819458008, + "learning_rate": 0.0002, + "loss": 2.5005, + "step": 53550 + }, + { + "epoch": 3.991058122205663, + "grad_norm": 3.263998031616211, + "learning_rate": 0.0002, + "loss": 2.5312, + "step": 53560 + }, + { + "epoch": 3.9918032786885247, + "grad_norm": 2.557817220687866, + "learning_rate": 0.0002, + "loss": 2.503, + "step": 53570 + }, + { + "epoch": 3.9925484351713862, + "grad_norm": 2.727743625640869, + "learning_rate": 0.0002, + "loss": 2.6169, + "step": 53580 + }, + { + "epoch": 3.9932935916542474, + "grad_norm": 2.6156795024871826, + "learning_rate": 0.0002, + "loss": 2.5591, + "step": 53590 + }, + { + "epoch": 3.994038748137109, + "grad_norm": 2.661540985107422, + "learning_rate": 0.0002, + "loss": 2.5086, + "step": 53600 + }, + { + "epoch": 3.99478390461997, + "grad_norm": 2.503107786178589, + "learning_rate": 0.0002, + "loss": 2.5765, + "step": 53610 + }, + { + "epoch": 3.9955290611028316, + "grad_norm": 2.6578187942504883, + "learning_rate": 0.0002, + "loss": 2.4175, + "step": 53620 + }, + { + "epoch": 3.996274217585693, + "grad_norm": 2.996702194213867, + "learning_rate": 0.0002, + "loss": 2.694, + "step": 53630 + }, + { + "epoch": 3.9970193740685542, + "grad_norm": 2.9681408405303955, + "learning_rate": 0.0002, + "loss": 2.4948, + "step": 53640 + }, + { + "epoch": 3.997764530551416, + "grad_norm": 2.7573134899139404, + "learning_rate": 0.0002, + "loss": 2.6957, + "step": 53650 + }, + { + "epoch": 3.9985096870342773, + "grad_norm": 2.6993305683135986, + "learning_rate": 0.0002, + "loss": 2.588, + "step": 53660 + }, + { + "epoch": 3.9992548435171384, + "grad_norm": 2.6153762340545654, + "learning_rate": 0.0002, + "loss": 2.5999, + "step": 53670 + }, + { + "epoch": 4.0, + "grad_norm": 3.1838414669036865, + "learning_rate": 0.0002, + "loss": 2.4747, + "step": 53680 + }, + { + "epoch": 4.0, + "eval_runtime": 2866.1708, + "eval_samples_per_second": 4.682, + "eval_steps_per_second": 0.585, + "step": 53680 + } + ], + "logging_steps": 10, + "max_steps": 80520, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.734753409106688e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}