{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.99000999000999, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01998001998001998, "grad_norm": 1.9170171022415161, "learning_rate": 0.0001999980260856137, "loss": 1.4846, "step": 10 }, { "epoch": 0.03996003996003996, "grad_norm": 0.5531741976737976, "learning_rate": 0.00019999210442038162, "loss": 1.0709, "step": 20 }, { "epoch": 0.059940059940059943, "grad_norm": 0.39242061972618103, "learning_rate": 0.0001999822352380809, "loss": 0.9892, "step": 30 }, { "epoch": 0.07992007992007992, "grad_norm": 0.46874135732650757, "learning_rate": 0.00019996841892833, "loss": 0.9705, "step": 40 }, { "epoch": 0.0999000999000999, "grad_norm": 0.4039924740791321, "learning_rate": 0.00019995065603657316, "loss": 0.953, "step": 50 }, { "epoch": 0.11988011988011989, "grad_norm": 0.37654027342796326, "learning_rate": 0.00019992894726405893, "loss": 0.9138, "step": 60 }, { "epoch": 0.13986013986013987, "grad_norm": 0.41351592540740967, "learning_rate": 0.0001999032934678125, "loss": 0.9121, "step": 70 }, { "epoch": 0.15984015984015984, "grad_norm": 0.42355260252952576, "learning_rate": 0.00019987369566060176, "loss": 0.8971, "step": 80 }, { "epoch": 0.1798201798201798, "grad_norm": 0.40265560150146484, "learning_rate": 0.00019984015501089752, "loss": 0.892, "step": 90 }, { "epoch": 0.1998001998001998, "grad_norm": 0.36668843030929565, "learning_rate": 0.00019980267284282717, "loss": 0.8907, "step": 100 }, { "epoch": 0.21978021978021978, "grad_norm": 0.3516446352005005, "learning_rate": 0.00019976125063612252, "loss": 0.888, "step": 110 }, { "epoch": 0.23976023976023977, "grad_norm": 0.3761754631996155, "learning_rate": 0.0001997158900260614, "loss": 0.8883, "step": 120 }, { "epoch": 0.2597402597402597, "grad_norm": 0.3486793041229248, "learning_rate": 0.00019966659280340297, "loss": 0.8709, "step": 130 }, { "epoch": 0.27972027972027974, "grad_norm": 0.39413630962371826, "learning_rate": 0.00019961336091431727, "loss": 0.8544, "step": 140 }, { "epoch": 0.2997002997002997, "grad_norm": 0.3653990924358368, "learning_rate": 0.00019955619646030802, "loss": 0.8647, "step": 150 }, { "epoch": 0.3196803196803197, "grad_norm": 0.4523209035396576, "learning_rate": 0.00019949510169813003, "loss": 0.8698, "step": 160 }, { "epoch": 0.33966033966033965, "grad_norm": 0.3841874897480011, "learning_rate": 0.0001994300790396999, "loss": 0.8513, "step": 170 }, { "epoch": 0.3596403596403596, "grad_norm": 0.3849908709526062, "learning_rate": 0.00019936113105200085, "loss": 0.8553, "step": 180 }, { "epoch": 0.37962037962037964, "grad_norm": 0.3563358783721924, "learning_rate": 0.00019928826045698136, "loss": 0.8615, "step": 190 }, { "epoch": 0.3996003996003996, "grad_norm": 0.3968392610549927, "learning_rate": 0.0001992114701314478, "loss": 0.8502, "step": 200 }, { "epoch": 0.4195804195804196, "grad_norm": 0.366230845451355, "learning_rate": 0.00019913076310695068, "loss": 0.8368, "step": 210 }, { "epoch": 0.43956043956043955, "grad_norm": 0.38813525438308716, "learning_rate": 0.00019904614256966512, "loss": 0.862, "step": 220 }, { "epoch": 0.4595404595404595, "grad_norm": 0.35268592834472656, "learning_rate": 0.0001989576118602651, "loss": 0.8468, "step": 230 }, { "epoch": 0.47952047952047955, "grad_norm": 0.342580109834671, "learning_rate": 0.0001988651744737914, "loss": 0.8575, "step": 240 }, { "epoch": 0.4995004995004995, "grad_norm": 0.37153083086013794, "learning_rate": 0.00019876883405951377, "loss": 0.8374, "step": 250 }, { "epoch": 0.5194805194805194, "grad_norm": 0.3486216366291046, "learning_rate": 0.0001986685944207868, "loss": 0.8333, "step": 260 }, { "epoch": 0.5394605394605395, "grad_norm": 0.3562557101249695, "learning_rate": 0.00019856445951489982, "loss": 0.8238, "step": 270 }, { "epoch": 0.5594405594405595, "grad_norm": 0.3600502610206604, "learning_rate": 0.00019845643345292054, "loss": 0.8331, "step": 280 }, { "epoch": 0.5794205794205795, "grad_norm": 0.3475654423236847, "learning_rate": 0.00019834452049953297, "loss": 0.8093, "step": 290 }, { "epoch": 0.5994005994005994, "grad_norm": 0.358980655670166, "learning_rate": 0.0001982287250728689, "loss": 0.8302, "step": 300 }, { "epoch": 0.6193806193806194, "grad_norm": 0.3721815347671509, "learning_rate": 0.0001981090517443334, "loss": 0.8175, "step": 310 }, { "epoch": 0.6393606393606394, "grad_norm": 0.35128098726272583, "learning_rate": 0.0001979855052384247, "loss": 0.8193, "step": 320 }, { "epoch": 0.6593406593406593, "grad_norm": 0.3471618592739105, "learning_rate": 0.00019785809043254722, "loss": 0.8232, "step": 330 }, { "epoch": 0.6793206793206793, "grad_norm": 0.35060420632362366, "learning_rate": 0.00019772681235681936, "loss": 0.8194, "step": 340 }, { "epoch": 0.6993006993006993, "grad_norm": 0.3695327341556549, "learning_rate": 0.00019759167619387476, "loss": 0.806, "step": 350 }, { "epoch": 0.7192807192807192, "grad_norm": 0.35857513546943665, "learning_rate": 0.00019745268727865774, "loss": 0.8019, "step": 360 }, { "epoch": 0.7392607392607392, "grad_norm": 0.3612421154975891, "learning_rate": 0.00019730985109821266, "loss": 0.8061, "step": 370 }, { "epoch": 0.7592407592407593, "grad_norm": 0.34007078409194946, "learning_rate": 0.0001971631732914674, "loss": 0.7919, "step": 380 }, { "epoch": 0.7792207792207793, "grad_norm": 0.3594492971897125, "learning_rate": 0.0001970126596490106, "loss": 0.7821, "step": 390 }, { "epoch": 0.7992007992007992, "grad_norm": 0.37426885962486267, "learning_rate": 0.0001968583161128631, "loss": 0.8054, "step": 400 }, { "epoch": 0.8191808191808192, "grad_norm": 0.3551250398159027, "learning_rate": 0.00019670014877624353, "loss": 0.7954, "step": 410 }, { "epoch": 0.8391608391608392, "grad_norm": 0.35951119661331177, "learning_rate": 0.0001965381638833274, "loss": 0.7966, "step": 420 }, { "epoch": 0.8591408591408591, "grad_norm": 0.36964887380599976, "learning_rate": 0.000196372367829001, "loss": 0.7888, "step": 430 }, { "epoch": 0.8791208791208791, "grad_norm": 0.36829873919487, "learning_rate": 0.0001962027671586086, "loss": 0.7902, "step": 440 }, { "epoch": 0.8991008991008991, "grad_norm": 0.34358304738998413, "learning_rate": 0.0001960293685676943, "loss": 0.7733, "step": 450 }, { "epoch": 0.919080919080919, "grad_norm": 0.37369629740715027, "learning_rate": 0.0001958521789017376, "loss": 0.796, "step": 460 }, { "epoch": 0.939060939060939, "grad_norm": 0.40985429286956787, "learning_rate": 0.00019567120515588308, "loss": 0.7931, "step": 470 }, { "epoch": 0.9590409590409591, "grad_norm": 0.34838569164276123, "learning_rate": 0.00019548645447466431, "loss": 0.7682, "step": 480 }, { "epoch": 0.9790209790209791, "grad_norm": 0.36467525362968445, "learning_rate": 0.00019529793415172192, "loss": 0.7781, "step": 490 }, { "epoch": 0.999000999000999, "grad_norm": 0.37112316489219666, "learning_rate": 0.00019510565162951537, "loss": 0.7773, "step": 500 }, { "epoch": 1.018981018981019, "grad_norm": 0.3998737931251526, "learning_rate": 0.00019490961449902946, "loss": 0.7324, "step": 510 }, { "epoch": 1.0389610389610389, "grad_norm": 0.3966336250305176, "learning_rate": 0.00019470983049947444, "loss": 0.7395, "step": 520 }, { "epoch": 1.058941058941059, "grad_norm": 0.39721325039863586, "learning_rate": 0.00019450630751798048, "loss": 0.7302, "step": 530 }, { "epoch": 1.078921078921079, "grad_norm": 0.38532692193984985, "learning_rate": 0.00019429905358928646, "loss": 0.7177, "step": 540 }, { "epoch": 1.098901098901099, "grad_norm": 0.3948540985584259, "learning_rate": 0.00019408807689542257, "loss": 0.7382, "step": 550 }, { "epoch": 1.118881118881119, "grad_norm": 0.399676650762558, "learning_rate": 0.00019387338576538744, "loss": 0.7286, "step": 560 }, { "epoch": 1.138861138861139, "grad_norm": 0.4208274781703949, "learning_rate": 0.00019365498867481923, "loss": 0.7251, "step": 570 }, { "epoch": 1.158841158841159, "grad_norm": 0.4160782992839813, "learning_rate": 0.00019343289424566122, "loss": 0.7138, "step": 580 }, { "epoch": 1.1788211788211789, "grad_norm": 0.4297160804271698, "learning_rate": 0.0001932071112458211, "loss": 0.7296, "step": 590 }, { "epoch": 1.1988011988011988, "grad_norm": 0.4196039140224457, "learning_rate": 0.00019297764858882514, "loss": 0.7091, "step": 600 }, { "epoch": 1.2187812187812188, "grad_norm": 0.406012624502182, "learning_rate": 0.00019274451533346615, "loss": 0.7021, "step": 610 }, { "epoch": 1.2387612387612388, "grad_norm": 0.41200658679008484, "learning_rate": 0.0001925077206834458, "loss": 0.7238, "step": 620 }, { "epoch": 1.2587412587412588, "grad_norm": 0.4819345772266388, "learning_rate": 0.0001922672739870115, "loss": 0.7275, "step": 630 }, { "epoch": 1.2787212787212787, "grad_norm": 0.40825748443603516, "learning_rate": 0.00019202318473658705, "loss": 0.7183, "step": 640 }, { "epoch": 1.2987012987012987, "grad_norm": 0.41940203309059143, "learning_rate": 0.00019177546256839812, "loss": 0.7149, "step": 650 }, { "epoch": 1.3186813186813187, "grad_norm": 0.40075168013572693, "learning_rate": 0.00019152411726209176, "loss": 0.722, "step": 660 }, { "epoch": 1.3386613386613386, "grad_norm": 0.4254063665866852, "learning_rate": 0.0001912691587403503, "loss": 0.7254, "step": 670 }, { "epoch": 1.3586413586413586, "grad_norm": 0.39732539653778076, "learning_rate": 0.00019101059706849957, "loss": 0.7115, "step": 680 }, { "epoch": 1.3786213786213786, "grad_norm": 0.3889389932155609, "learning_rate": 0.0001907484424541117, "loss": 0.7031, "step": 690 }, { "epoch": 1.3986013986013985, "grad_norm": 0.3994196355342865, "learning_rate": 0.00019048270524660196, "loss": 0.7095, "step": 700 }, { "epoch": 1.4185814185814185, "grad_norm": 0.4238826036453247, "learning_rate": 0.00019021339593682028, "loss": 0.7156, "step": 710 }, { "epoch": 1.4385614385614387, "grad_norm": 0.4787987172603607, "learning_rate": 0.0001899405251566371, "loss": 0.7142, "step": 720 }, { "epoch": 1.4585414585414584, "grad_norm": 0.4219954013824463, "learning_rate": 0.00018966410367852362, "loss": 0.7267, "step": 730 }, { "epoch": 1.4785214785214786, "grad_norm": 0.4154765009880066, "learning_rate": 0.0001893841424151264, "loss": 0.721, "step": 740 }, { "epoch": 1.4985014985014984, "grad_norm": 0.44605547189712524, "learning_rate": 0.0001891006524188368, "loss": 0.7266, "step": 750 }, { "epoch": 1.5184815184815186, "grad_norm": 0.4613310992717743, "learning_rate": 0.00018881364488135448, "loss": 0.7253, "step": 760 }, { "epoch": 1.5384615384615383, "grad_norm": 0.41615426540374756, "learning_rate": 0.00018852313113324552, "loss": 0.69, "step": 770 }, { "epoch": 1.5584415584415585, "grad_norm": 0.4512516260147095, "learning_rate": 0.00018822912264349534, "loss": 0.7124, "step": 780 }, { "epoch": 1.5784215784215783, "grad_norm": 0.464336633682251, "learning_rate": 0.00018793163101905563, "loss": 0.7067, "step": 790 }, { "epoch": 1.5984015984015985, "grad_norm": 0.4427087604999542, "learning_rate": 0.00018763066800438636, "loss": 0.7097, "step": 800 }, { "epoch": 1.6183816183816184, "grad_norm": 0.43341028690338135, "learning_rate": 0.00018732624548099204, "loss": 0.7068, "step": 810 }, { "epoch": 1.6383616383616384, "grad_norm": 0.4100460112094879, "learning_rate": 0.0001870183754669526, "loss": 0.705, "step": 820 }, { "epoch": 1.6583416583416584, "grad_norm": 0.43942147493362427, "learning_rate": 0.000186707070116449, "loss": 0.7043, "step": 830 }, { "epoch": 1.6783216783216783, "grad_norm": 0.430095911026001, "learning_rate": 0.00018639234171928353, "loss": 0.6989, "step": 840 }, { "epoch": 1.6983016983016983, "grad_norm": 0.40418198704719543, "learning_rate": 0.0001860742027003944, "loss": 0.6933, "step": 850 }, { "epoch": 1.7182817182817183, "grad_norm": 0.40910184383392334, "learning_rate": 0.00018575266561936523, "loss": 0.6848, "step": 860 }, { "epoch": 1.7382617382617382, "grad_norm": 0.4620640277862549, "learning_rate": 0.0001854277431699295, "loss": 0.6943, "step": 870 }, { "epoch": 1.7582417582417582, "grad_norm": 0.4648028314113617, "learning_rate": 0.00018509944817946922, "loss": 0.6993, "step": 880 }, { "epoch": 1.7782217782217782, "grad_norm": 0.43752139806747437, "learning_rate": 0.00018476779360850832, "loss": 0.6827, "step": 890 }, { "epoch": 1.7982017982017982, "grad_norm": 0.4481639862060547, "learning_rate": 0.00018443279255020152, "loss": 0.6978, "step": 900 }, { "epoch": 1.8181818181818183, "grad_norm": 0.4678110182285309, "learning_rate": 0.00018409445822981693, "loss": 0.6848, "step": 910 }, { "epoch": 1.838161838161838, "grad_norm": 0.433933824300766, "learning_rate": 0.0001837528040042142, "loss": 0.658, "step": 920 }, { "epoch": 1.8581418581418583, "grad_norm": 0.4601323902606964, "learning_rate": 0.00018340784336131713, "loss": 0.6912, "step": 930 }, { "epoch": 1.878121878121878, "grad_norm": 0.4591493308544159, "learning_rate": 0.00018305958991958127, "loss": 0.697, "step": 940 }, { "epoch": 1.8981018981018982, "grad_norm": 0.445711225271225, "learning_rate": 0.00018270805742745617, "loss": 0.6922, "step": 950 }, { "epoch": 1.918081918081918, "grad_norm": 0.43125954270362854, "learning_rate": 0.00018235325976284275, "loss": 0.6742, "step": 960 }, { "epoch": 1.9380619380619382, "grad_norm": 0.4716484248638153, "learning_rate": 0.00018199521093254523, "loss": 0.6796, "step": 970 }, { "epoch": 1.958041958041958, "grad_norm": 0.4613405764102936, "learning_rate": 0.00018163392507171842, "loss": 0.6832, "step": 980 }, { "epoch": 1.978021978021978, "grad_norm": 0.48080363869667053, "learning_rate": 0.0001812694164433094, "loss": 0.6807, "step": 990 }, { "epoch": 1.9980019980019978, "grad_norm": 0.47017648816108704, "learning_rate": 0.00018090169943749476, "loss": 0.6785, "step": 1000 }, { "epoch": 2.017982017982018, "grad_norm": 0.516197919845581, "learning_rate": 0.0001805307885711122, "loss": 0.6019, "step": 1010 }, { "epoch": 2.037962037962038, "grad_norm": 0.5556052923202515, "learning_rate": 0.00018015669848708767, "loss": 0.5906, "step": 1020 }, { "epoch": 2.057942057942058, "grad_norm": 0.5169907808303833, "learning_rate": 0.0001797794439538571, "loss": 0.6076, "step": 1030 }, { "epoch": 2.0779220779220777, "grad_norm": 0.5560281276702881, "learning_rate": 0.00017939903986478355, "loss": 0.582, "step": 1040 }, { "epoch": 2.097902097902098, "grad_norm": 0.521091878414154, "learning_rate": 0.00017901550123756906, "loss": 0.5929, "step": 1050 }, { "epoch": 2.117882117882118, "grad_norm": 0.5990195870399475, "learning_rate": 0.00017862884321366188, "loss": 0.5863, "step": 1060 }, { "epoch": 2.137862137862138, "grad_norm": 0.5285313725471497, "learning_rate": 0.0001782390810576588, "loss": 0.5845, "step": 1070 }, { "epoch": 2.157842157842158, "grad_norm": 0.5402159690856934, "learning_rate": 0.00017784623015670238, "loss": 0.5926, "step": 1080 }, { "epoch": 2.177822177822178, "grad_norm": 0.5576025247573853, "learning_rate": 0.00017745030601987337, "loss": 0.5964, "step": 1090 }, { "epoch": 2.197802197802198, "grad_norm": 0.5605506896972656, "learning_rate": 0.00017705132427757895, "loss": 0.5877, "step": 1100 }, { "epoch": 2.2177822177822177, "grad_norm": 0.5754747986793518, "learning_rate": 0.00017664930068093498, "loss": 0.6002, "step": 1110 }, { "epoch": 2.237762237762238, "grad_norm": 0.5654470324516296, "learning_rate": 0.0001762442511011448, "loss": 0.5922, "step": 1120 }, { "epoch": 2.2577422577422577, "grad_norm": 0.5414491891860962, "learning_rate": 0.0001758361915288722, "loss": 0.5917, "step": 1130 }, { "epoch": 2.277722277722278, "grad_norm": 0.5563125014305115, "learning_rate": 0.00017542513807361037, "loss": 0.5867, "step": 1140 }, { "epoch": 2.2977022977022976, "grad_norm": 0.5236257314682007, "learning_rate": 0.00017501110696304596, "loss": 0.5888, "step": 1150 }, { "epoch": 2.317682317682318, "grad_norm": 0.614734411239624, "learning_rate": 0.00017459411454241822, "loss": 0.6001, "step": 1160 }, { "epoch": 2.3376623376623376, "grad_norm": 0.605421781539917, "learning_rate": 0.00017417417727387394, "loss": 0.5968, "step": 1170 }, { "epoch": 2.3576423576423577, "grad_norm": 0.5595569014549255, "learning_rate": 0.0001737513117358174, "loss": 0.5924, "step": 1180 }, { "epoch": 2.3776223776223775, "grad_norm": 0.5283003449440002, "learning_rate": 0.00017332553462225602, "loss": 0.5952, "step": 1190 }, { "epoch": 2.3976023976023977, "grad_norm": 0.5287072658538818, "learning_rate": 0.00017289686274214118, "loss": 0.5763, "step": 1200 }, { "epoch": 2.4175824175824174, "grad_norm": 0.5907203555107117, "learning_rate": 0.0001724653130187047, "loss": 0.5993, "step": 1210 }, { "epoch": 2.4375624375624376, "grad_norm": 0.5622738003730774, "learning_rate": 0.0001720309024887907, "loss": 0.6001, "step": 1220 }, { "epoch": 2.4575424575424574, "grad_norm": 0.5795326232910156, "learning_rate": 0.00017159364830218312, "loss": 0.5857, "step": 1230 }, { "epoch": 2.4775224775224776, "grad_norm": 0.5654671788215637, "learning_rate": 0.00017115356772092857, "loss": 0.5809, "step": 1240 }, { "epoch": 2.4975024975024973, "grad_norm": 0.5641043186187744, "learning_rate": 0.00017071067811865476, "loss": 0.5824, "step": 1250 }, { "epoch": 2.5174825174825175, "grad_norm": 0.5851653218269348, "learning_rate": 0.00017026499697988493, "loss": 0.59, "step": 1260 }, { "epoch": 2.5374625374625372, "grad_norm": 0.570210337638855, "learning_rate": 0.00016981654189934727, "loss": 0.5761, "step": 1270 }, { "epoch": 2.5574425574425574, "grad_norm": 0.5725647807121277, "learning_rate": 0.0001693653305812805, "loss": 0.589, "step": 1280 }, { "epoch": 2.5774225774225776, "grad_norm": 0.5896579623222351, "learning_rate": 0.00016891138083873487, "loss": 0.5852, "step": 1290 }, { "epoch": 2.5974025974025974, "grad_norm": 0.5988901853561401, "learning_rate": 0.00016845471059286887, "loss": 0.5723, "step": 1300 }, { "epoch": 2.617382617382617, "grad_norm": 0.5854650735855103, "learning_rate": 0.00016799533787224192, "loss": 0.5845, "step": 1310 }, { "epoch": 2.6373626373626373, "grad_norm": 0.5547802448272705, "learning_rate": 0.00016753328081210245, "loss": 0.5909, "step": 1320 }, { "epoch": 2.6573426573426575, "grad_norm": 0.5562127232551575, "learning_rate": 0.000167068557653672, "loss": 0.5799, "step": 1330 }, { "epoch": 2.6773226773226773, "grad_norm": 0.5999246835708618, "learning_rate": 0.00016660118674342517, "loss": 0.5757, "step": 1340 }, { "epoch": 2.6973026973026974, "grad_norm": 0.5909945368766785, "learning_rate": 0.00016613118653236518, "loss": 0.5674, "step": 1350 }, { "epoch": 2.717282717282717, "grad_norm": 0.6357455849647522, "learning_rate": 0.00016565857557529566, "loss": 0.5821, "step": 1360 }, { "epoch": 2.7372627372627374, "grad_norm": 0.6019343733787537, "learning_rate": 0.0001651833725300879, "loss": 0.5783, "step": 1370 }, { "epoch": 2.757242757242757, "grad_norm": 0.6180288791656494, "learning_rate": 0.00016470559615694446, "loss": 0.6056, "step": 1380 }, { "epoch": 2.7772227772227773, "grad_norm": 0.6171667575836182, "learning_rate": 0.00016422526531765846, "loss": 0.5799, "step": 1390 }, { "epoch": 2.797202797202797, "grad_norm": 0.5991246700286865, "learning_rate": 0.000163742398974869, "loss": 0.5668, "step": 1400 }, { "epoch": 2.8171828171828173, "grad_norm": 0.6568031907081604, "learning_rate": 0.00016325701619131246, "loss": 0.5662, "step": 1410 }, { "epoch": 2.837162837162837, "grad_norm": 0.6639891266822815, "learning_rate": 0.00016276913612907007, "loss": 0.5797, "step": 1420 }, { "epoch": 2.857142857142857, "grad_norm": 0.5978193879127502, "learning_rate": 0.00016227877804881127, "loss": 0.5613, "step": 1430 }, { "epoch": 2.8771228771228774, "grad_norm": 0.576871395111084, "learning_rate": 0.00016178596130903344, "loss": 0.5796, "step": 1440 }, { "epoch": 2.897102897102897, "grad_norm": 0.5936170220375061, "learning_rate": 0.00016129070536529766, "loss": 0.5791, "step": 1450 }, { "epoch": 2.917082917082917, "grad_norm": 0.6093722581863403, "learning_rate": 0.00016079302976946055, "loss": 0.5836, "step": 1460 }, { "epoch": 2.937062937062937, "grad_norm": 0.5815151929855347, "learning_rate": 0.00016029295416890248, "loss": 0.5644, "step": 1470 }, { "epoch": 2.9570429570429573, "grad_norm": 0.621591329574585, "learning_rate": 0.0001597904983057519, "loss": 0.5779, "step": 1480 }, { "epoch": 2.977022977022977, "grad_norm": 0.5824622511863708, "learning_rate": 0.00015928568201610595, "loss": 0.5659, "step": 1490 }, { "epoch": 2.9970029970029968, "grad_norm": 0.6264435052871704, "learning_rate": 0.00015877852522924732, "loss": 0.5823, "step": 1500 }, { "epoch": 3.016983016983017, "grad_norm": 0.7021110653877258, "learning_rate": 0.00015826904796685762, "loss": 0.4732, "step": 1510 }, { "epoch": 3.036963036963037, "grad_norm": 0.7195537686347961, "learning_rate": 0.00015775727034222675, "loss": 0.4484, "step": 1520 }, { "epoch": 3.056943056943057, "grad_norm": 0.7159614562988281, "learning_rate": 0.0001572432125594591, "loss": 0.4533, "step": 1530 }, { "epoch": 3.076923076923077, "grad_norm": 0.686655580997467, "learning_rate": 0.00015672689491267567, "loss": 0.4588, "step": 1540 }, { "epoch": 3.096903096903097, "grad_norm": 0.6840978264808655, "learning_rate": 0.00015620833778521307, "loss": 0.4632, "step": 1550 }, { "epoch": 3.116883116883117, "grad_norm": 0.6888960003852844, "learning_rate": 0.00015568756164881882, "loss": 0.463, "step": 1560 }, { "epoch": 3.136863136863137, "grad_norm": 0.6887105107307434, "learning_rate": 0.00015516458706284303, "loss": 0.4683, "step": 1570 }, { "epoch": 3.156843156843157, "grad_norm": 0.6880657076835632, "learning_rate": 0.00015463943467342693, "loss": 0.4703, "step": 1580 }, { "epoch": 3.1768231768231767, "grad_norm": 0.667488157749176, "learning_rate": 0.00015411212521268758, "loss": 0.4681, "step": 1590 }, { "epoch": 3.196803196803197, "grad_norm": 0.7201547026634216, "learning_rate": 0.00015358267949789966, "loss": 0.4708, "step": 1600 }, { "epoch": 3.2167832167832167, "grad_norm": 0.7887006998062134, "learning_rate": 0.0001530511184306734, "loss": 0.4692, "step": 1610 }, { "epoch": 3.236763236763237, "grad_norm": 0.6850538849830627, "learning_rate": 0.0001525174629961296, "loss": 0.4652, "step": 1620 }, { "epoch": 3.2567432567432566, "grad_norm": 0.7573882937431335, "learning_rate": 0.00015198173426207094, "loss": 0.4618, "step": 1630 }, { "epoch": 3.276723276723277, "grad_norm": 0.7027117609977722, "learning_rate": 0.00015144395337815064, "loss": 0.4665, "step": 1640 }, { "epoch": 3.2967032967032965, "grad_norm": 0.6847530007362366, "learning_rate": 0.00015090414157503714, "loss": 0.4669, "step": 1650 }, { "epoch": 3.3166833166833167, "grad_norm": 0.7099263072013855, "learning_rate": 0.0001503623201635761, "loss": 0.4666, "step": 1660 }, { "epoch": 3.3366633366633365, "grad_norm": 0.6803727149963379, "learning_rate": 0.0001498185105339491, "loss": 0.4674, "step": 1670 }, { "epoch": 3.3566433566433567, "grad_norm": 0.7080752849578857, "learning_rate": 0.00014927273415482915, "loss": 0.4694, "step": 1680 }, { "epoch": 3.3766233766233764, "grad_norm": 0.7016042470932007, "learning_rate": 0.00014872501257253323, "loss": 0.4716, "step": 1690 }, { "epoch": 3.3966033966033966, "grad_norm": 0.6896219849586487, "learning_rate": 0.00014817536741017152, "loss": 0.4706, "step": 1700 }, { "epoch": 3.416583416583417, "grad_norm": 0.7319151163101196, "learning_rate": 0.0001476238203667939, "loss": 0.4657, "step": 1710 }, { "epoch": 3.4365634365634365, "grad_norm": 0.7796220779418945, "learning_rate": 0.0001470703932165333, "loss": 0.4762, "step": 1720 }, { "epoch": 3.4565434565434563, "grad_norm": 0.6749796271324158, "learning_rate": 0.00014651510780774583, "loss": 0.4602, "step": 1730 }, { "epoch": 3.4765234765234765, "grad_norm": 0.6736605167388916, "learning_rate": 0.00014595798606214882, "loss": 0.4751, "step": 1740 }, { "epoch": 3.4965034965034967, "grad_norm": 0.7386316657066345, "learning_rate": 0.00014539904997395468, "loss": 0.4658, "step": 1750 }, { "epoch": 3.5164835164835164, "grad_norm": 0.7023107409477234, "learning_rate": 0.00014483832160900326, "loss": 0.4678, "step": 1760 }, { "epoch": 3.5364635364635366, "grad_norm": 0.6938359141349792, "learning_rate": 0.0001442758231038902, "loss": 0.4619, "step": 1770 }, { "epoch": 3.5564435564435564, "grad_norm": 0.7815272212028503, "learning_rate": 0.0001437115766650933, "loss": 0.4744, "step": 1780 }, { "epoch": 3.5764235764235766, "grad_norm": 0.7307267189025879, "learning_rate": 0.0001431456045680959, "loss": 0.4767, "step": 1790 }, { "epoch": 3.5964035964035963, "grad_norm": 0.6948580741882324, "learning_rate": 0.00014257792915650728, "loss": 0.4644, "step": 1800 }, { "epoch": 3.6163836163836165, "grad_norm": 0.691348671913147, "learning_rate": 0.00014200857284118066, "loss": 0.4609, "step": 1810 }, { "epoch": 3.6363636363636362, "grad_norm": 0.7828198671340942, "learning_rate": 0.00014143755809932845, "loss": 0.4506, "step": 1820 }, { "epoch": 3.6563436563436564, "grad_norm": 0.73238205909729, "learning_rate": 0.00014086490747363493, "loss": 0.4599, "step": 1830 }, { "epoch": 3.676323676323676, "grad_norm": 0.7216520309448242, "learning_rate": 0.00014029064357136628, "loss": 0.4582, "step": 1840 }, { "epoch": 3.6963036963036964, "grad_norm": 0.7676394581794739, "learning_rate": 0.00013971478906347806, "loss": 0.4494, "step": 1850 }, { "epoch": 3.716283716283716, "grad_norm": 0.7596750259399414, "learning_rate": 0.00013913736668372026, "loss": 0.4704, "step": 1860 }, { "epoch": 3.7362637362637363, "grad_norm": 0.7686085104942322, "learning_rate": 0.00013855839922773968, "loss": 0.4603, "step": 1870 }, { "epoch": 3.756243756243756, "grad_norm": 0.6850613951683044, "learning_rate": 0.00013797790955218014, "loss": 0.4503, "step": 1880 }, { "epoch": 3.7762237762237763, "grad_norm": 0.721778392791748, "learning_rate": 0.00013739592057378003, "loss": 0.4713, "step": 1890 }, { "epoch": 3.7962037962037964, "grad_norm": 0.7122541069984436, "learning_rate": 0.00013681245526846783, "loss": 0.4664, "step": 1900 }, { "epoch": 3.816183816183816, "grad_norm": 0.7361748218536377, "learning_rate": 0.00013622753667045457, "loss": 0.4571, "step": 1910 }, { "epoch": 3.836163836163836, "grad_norm": 0.8220844864845276, "learning_rate": 0.00013564118787132506, "loss": 0.4521, "step": 1920 }, { "epoch": 3.856143856143856, "grad_norm": 0.7139246463775635, "learning_rate": 0.0001350534320191259, "loss": 0.4491, "step": 1930 }, { "epoch": 3.8761238761238763, "grad_norm": 0.7244653701782227, "learning_rate": 0.0001344642923174517, "loss": 0.4552, "step": 1940 }, { "epoch": 3.896103896103896, "grad_norm": 0.7056713700294495, "learning_rate": 0.00013387379202452917, "loss": 0.4548, "step": 1950 }, { "epoch": 3.916083916083916, "grad_norm": 0.7653645277023315, "learning_rate": 0.00013328195445229868, "loss": 0.4492, "step": 1960 }, { "epoch": 3.936063936063936, "grad_norm": 0.6818165183067322, "learning_rate": 0.00013268880296549425, "loss": 0.4463, "step": 1970 }, { "epoch": 3.956043956043956, "grad_norm": 0.687439501285553, "learning_rate": 0.00013209436098072095, "loss": 0.457, "step": 1980 }, { "epoch": 3.976023976023976, "grad_norm": 0.7704656720161438, "learning_rate": 0.0001314986519655305, "loss": 0.4522, "step": 1990 }, { "epoch": 3.996003996003996, "grad_norm": 0.7227702736854553, "learning_rate": 0.00013090169943749476, "loss": 0.4454, "step": 2000 }, { "epoch": 4.015984015984016, "grad_norm": 0.8689281344413757, "learning_rate": 0.00013030352696327742, "loss": 0.3645, "step": 2010 }, { "epoch": 4.035964035964036, "grad_norm": 0.7620906829833984, "learning_rate": 0.0001297041581577035, "loss": 0.3478, "step": 2020 }, { "epoch": 4.055944055944056, "grad_norm": 0.768671989440918, "learning_rate": 0.00012910361668282719, "loss": 0.3595, "step": 2030 }, { "epoch": 4.075924075924076, "grad_norm": 0.7327402234077454, "learning_rate": 0.0001285019262469976, "loss": 0.3471, "step": 2040 }, { "epoch": 4.095904095904096, "grad_norm": 0.6913720965385437, "learning_rate": 0.00012789911060392294, "loss": 0.3501, "step": 2050 }, { "epoch": 4.115884115884116, "grad_norm": 0.7310584783554077, "learning_rate": 0.00012729519355173254, "loss": 0.3509, "step": 2060 }, { "epoch": 4.135864135864136, "grad_norm": 0.7578213214874268, "learning_rate": 0.00012669019893203759, "loss": 0.3506, "step": 2070 }, { "epoch": 4.1558441558441555, "grad_norm": 0.7301665544509888, "learning_rate": 0.00012608415062898972, "loss": 0.3536, "step": 2080 }, { "epoch": 4.175824175824176, "grad_norm": 0.8198577165603638, "learning_rate": 0.00012547707256833823, "loss": 0.3578, "step": 2090 }, { "epoch": 4.195804195804196, "grad_norm": 0.7331268787384033, "learning_rate": 0.0001248689887164855, "loss": 0.3508, "step": 2100 }, { "epoch": 4.215784215784216, "grad_norm": 0.7666186094284058, "learning_rate": 0.00012425992307954075, "loss": 0.3468, "step": 2110 }, { "epoch": 4.235764235764236, "grad_norm": 0.7020666599273682, "learning_rate": 0.00012364989970237248, "loss": 0.3586, "step": 2120 }, { "epoch": 4.255744255744256, "grad_norm": 0.7276338338851929, "learning_rate": 0.00012303894266765908, "loss": 0.3672, "step": 2130 }, { "epoch": 4.275724275724276, "grad_norm": 0.6978778839111328, "learning_rate": 0.00012242707609493814, "loss": 0.3576, "step": 2140 }, { "epoch": 4.2957042957042955, "grad_norm": 0.822030246257782, "learning_rate": 0.00012181432413965428, "loss": 0.3618, "step": 2150 }, { "epoch": 4.315684315684316, "grad_norm": 0.744611918926239, "learning_rate": 0.00012120071099220549, "loss": 0.3578, "step": 2160 }, { "epoch": 4.335664335664336, "grad_norm": 0.7712835669517517, "learning_rate": 0.00012058626087698814, "loss": 0.3632, "step": 2170 }, { "epoch": 4.355644355644356, "grad_norm": 0.7824398279190063, "learning_rate": 0.00011997099805144069, "loss": 0.36, "step": 2180 }, { "epoch": 4.375624375624375, "grad_norm": 0.8473492860794067, "learning_rate": 0.00011935494680508606, "loss": 0.3645, "step": 2190 }, { "epoch": 4.395604395604396, "grad_norm": 0.7394326329231262, "learning_rate": 0.00011873813145857249, "loss": 0.3604, "step": 2200 }, { "epoch": 4.415584415584416, "grad_norm": 0.763633131980896, "learning_rate": 0.00011812057636271374, "loss": 0.3634, "step": 2210 }, { "epoch": 4.4355644355644355, "grad_norm": 0.7612594962120056, "learning_rate": 0.00011750230589752762, "loss": 0.355, "step": 2220 }, { "epoch": 4.455544455544455, "grad_norm": 0.7789061665534973, "learning_rate": 0.00011688334447127338, "loss": 0.3629, "step": 2230 }, { "epoch": 4.475524475524476, "grad_norm": 0.7422770261764526, "learning_rate": 0.00011626371651948838, "loss": 0.361, "step": 2240 }, { "epoch": 4.495504495504496, "grad_norm": 0.7636354565620422, "learning_rate": 0.0001156434465040231, "loss": 0.3593, "step": 2250 }, { "epoch": 4.515484515484515, "grad_norm": 0.7884863615036011, "learning_rate": 0.00011502255891207572, "loss": 0.3587, "step": 2260 }, { "epoch": 4.535464535464535, "grad_norm": 0.7233232855796814, "learning_rate": 0.00011440107825522521, "loss": 0.3577, "step": 2270 }, { "epoch": 4.555444555444556, "grad_norm": 0.8420186638832092, "learning_rate": 0.0001137790290684638, "loss": 0.3686, "step": 2280 }, { "epoch": 4.5754245754245755, "grad_norm": 0.7679941654205322, "learning_rate": 0.00011315643590922827, "loss": 0.3539, "step": 2290 }, { "epoch": 4.595404595404595, "grad_norm": 0.826885461807251, "learning_rate": 0.00011253332335643043, "loss": 0.3627, "step": 2300 }, { "epoch": 4.615384615384615, "grad_norm": 0.7590234875679016, "learning_rate": 0.00011190971600948699, "loss": 0.3613, "step": 2310 }, { "epoch": 4.635364635364636, "grad_norm": 0.7376580238342285, "learning_rate": 0.00011128563848734816, "loss": 0.3694, "step": 2320 }, { "epoch": 4.655344655344655, "grad_norm": 0.7795658111572266, "learning_rate": 0.000110661115427526, "loss": 0.3598, "step": 2330 }, { "epoch": 4.675324675324675, "grad_norm": 0.7736489176750183, "learning_rate": 0.00011003617148512149, "loss": 0.3598, "step": 2340 }, { "epoch": 4.695304695304696, "grad_norm": 0.757072925567627, "learning_rate": 0.00010941083133185146, "loss": 0.366, "step": 2350 }, { "epoch": 4.7152847152847155, "grad_norm": 0.8167831301689148, "learning_rate": 0.00010878511965507434, "loss": 0.3633, "step": 2360 }, { "epoch": 4.735264735264735, "grad_norm": 0.8083499670028687, "learning_rate": 0.00010815906115681578, "loss": 0.3562, "step": 2370 }, { "epoch": 4.755244755244755, "grad_norm": 0.7758758068084717, "learning_rate": 0.00010753268055279329, "loss": 0.3614, "step": 2380 }, { "epoch": 4.775224775224775, "grad_norm": 0.8572462797164917, "learning_rate": 0.00010690600257144061, "loss": 0.3652, "step": 2390 }, { "epoch": 4.795204795204795, "grad_norm": 0.8319938778877258, "learning_rate": 0.00010627905195293135, "loss": 0.3622, "step": 2400 }, { "epoch": 4.815184815184815, "grad_norm": 0.8004459142684937, "learning_rate": 0.00010565185344820247, "loss": 0.3604, "step": 2410 }, { "epoch": 4.835164835164835, "grad_norm": 0.790908694267273, "learning_rate": 0.00010502443181797697, "loss": 0.3587, "step": 2420 }, { "epoch": 4.8551448551448555, "grad_norm": 0.7726609110832214, "learning_rate": 0.0001043968118317865, "loss": 0.364, "step": 2430 }, { "epoch": 4.875124875124875, "grad_norm": 0.7808167338371277, "learning_rate": 0.00010376901826699348, "loss": 0.3637, "step": 2440 }, { "epoch": 4.895104895104895, "grad_norm": 0.8596636652946472, "learning_rate": 0.00010314107590781284, "loss": 0.3536, "step": 2450 }, { "epoch": 4.915084915084915, "grad_norm": 0.8091081380844116, "learning_rate": 0.00010251300954433376, "loss": 0.3522, "step": 2460 }, { "epoch": 4.935064935064935, "grad_norm": 0.8672420978546143, "learning_rate": 0.00010188484397154084, "loss": 0.3643, "step": 2470 }, { "epoch": 4.955044955044955, "grad_norm": 0.7860444188117981, "learning_rate": 0.00010125660398833528, "loss": 0.3493, "step": 2480 }, { "epoch": 4.975024975024975, "grad_norm": 0.7510725259780884, "learning_rate": 0.00010062831439655591, "loss": 0.3497, "step": 2490 }, { "epoch": 4.995004995004995, "grad_norm": 0.7850112915039062, "learning_rate": 0.0001, "loss": 0.361, "step": 2500 }, { "epoch": 5.014985014985015, "grad_norm": 0.9001740217208862, "learning_rate": 9.937168560344412e-05, "loss": 0.2983, "step": 2510 }, { "epoch": 5.034965034965035, "grad_norm": 0.683803141117096, "learning_rate": 9.874339601166473e-05, "loss": 0.2805, "step": 2520 }, { "epoch": 5.054945054945055, "grad_norm": 0.7267177700996399, "learning_rate": 9.81151560284592e-05, "loss": 0.2751, "step": 2530 }, { "epoch": 5.0749250749250745, "grad_norm": 0.7268999814987183, "learning_rate": 9.748699045566626e-05, "loss": 0.2805, "step": 2540 }, { "epoch": 5.094905094905095, "grad_norm": 0.6958262324333191, "learning_rate": 9.685892409218717e-05, "loss": 0.2808, "step": 2550 }, { "epoch": 5.114885114885115, "grad_norm": 0.7481863498687744, "learning_rate": 9.623098173300654e-05, "loss": 0.2808, "step": 2560 }, { "epoch": 5.134865134865135, "grad_norm": 0.6923096179962158, "learning_rate": 9.560318816821353e-05, "loss": 0.2802, "step": 2570 }, { "epoch": 5.154845154845155, "grad_norm": 0.8236074447631836, "learning_rate": 9.497556818202306e-05, "loss": 0.2845, "step": 2580 }, { "epoch": 5.174825174825175, "grad_norm": 0.7225534915924072, "learning_rate": 9.434814655179755e-05, "loss": 0.2802, "step": 2590 }, { "epoch": 5.194805194805195, "grad_norm": 0.7639855146408081, "learning_rate": 9.372094804706867e-05, "loss": 0.2846, "step": 2600 }, { "epoch": 5.2147852147852145, "grad_norm": 0.7572929859161377, "learning_rate": 9.309399742855942e-05, "loss": 0.2826, "step": 2610 }, { "epoch": 5.234765234765235, "grad_norm": 0.8045923709869385, "learning_rate": 9.246731944720675e-05, "loss": 0.2862, "step": 2620 }, { "epoch": 5.254745254745255, "grad_norm": 0.7385067939758301, "learning_rate": 9.184093884318425e-05, "loss": 0.2886, "step": 2630 }, { "epoch": 5.274725274725275, "grad_norm": 0.7742624282836914, "learning_rate": 9.121488034492569e-05, "loss": 0.2857, "step": 2640 }, { "epoch": 5.294705294705294, "grad_norm": 0.73873370885849, "learning_rate": 9.058916866814858e-05, "loss": 0.2874, "step": 2650 }, { "epoch": 5.314685314685315, "grad_norm": 0.8087053298950195, "learning_rate": 8.99638285148785e-05, "loss": 0.2814, "step": 2660 }, { "epoch": 5.334665334665335, "grad_norm": 0.7873129844665527, "learning_rate": 8.933888457247402e-05, "loss": 0.2827, "step": 2670 }, { "epoch": 5.3546453546453545, "grad_norm": 0.776678204536438, "learning_rate": 8.871436151265184e-05, "loss": 0.2861, "step": 2680 }, { "epoch": 5.374625374625374, "grad_norm": 0.7478957772254944, "learning_rate": 8.809028399051302e-05, "loss": 0.2841, "step": 2690 }, { "epoch": 5.394605394605395, "grad_norm": 0.7491159439086914, "learning_rate": 8.746667664356956e-05, "loss": 0.2781, "step": 2700 }, { "epoch": 5.414585414585415, "grad_norm": 0.7022270560264587, "learning_rate": 8.684356409077176e-05, "loss": 0.2831, "step": 2710 }, { "epoch": 5.434565434565434, "grad_norm": 0.714643120765686, "learning_rate": 8.62209709315362e-05, "loss": 0.2816, "step": 2720 }, { "epoch": 5.454545454545454, "grad_norm": 0.7695267796516418, "learning_rate": 8.559892174477479e-05, "loss": 0.2845, "step": 2730 }, { "epoch": 5.474525474525475, "grad_norm": 0.7670512795448303, "learning_rate": 8.497744108792429e-05, "loss": 0.284, "step": 2740 }, { "epoch": 5.4945054945054945, "grad_norm": 0.7777095437049866, "learning_rate": 8.435655349597689e-05, "loss": 0.2849, "step": 2750 }, { "epoch": 5.514485514485514, "grad_norm": 0.7117462158203125, "learning_rate": 8.373628348051165e-05, "loss": 0.2892, "step": 2760 }, { "epoch": 5.534465534465534, "grad_norm": 0.7786485552787781, "learning_rate": 8.311665552872662e-05, "loss": 0.2867, "step": 2770 }, { "epoch": 5.554445554445555, "grad_norm": 0.7926625609397888, "learning_rate": 8.249769410247239e-05, "loss": 0.2862, "step": 2780 }, { "epoch": 5.574425574425574, "grad_norm": 0.7426894307136536, "learning_rate": 8.187942363728625e-05, "loss": 0.288, "step": 2790 }, { "epoch": 5.594405594405594, "grad_norm": 0.7075335383415222, "learning_rate": 8.126186854142752e-05, "loss": 0.2847, "step": 2800 }, { "epoch": 5.614385614385615, "grad_norm": 0.7743814587593079, "learning_rate": 8.064505319491398e-05, "loss": 0.2912, "step": 2810 }, { "epoch": 5.6343656343656345, "grad_norm": 0.7679479122161865, "learning_rate": 8.002900194855932e-05, "loss": 0.2944, "step": 2820 }, { "epoch": 5.654345654345654, "grad_norm": 0.8007961511611938, "learning_rate": 7.941373912301189e-05, "loss": 0.2934, "step": 2830 }, { "epoch": 5.674325674325674, "grad_norm": 0.8405194878578186, "learning_rate": 7.879928900779456e-05, "loss": 0.2848, "step": 2840 }, { "epoch": 5.694305694305695, "grad_norm": 0.7828160524368286, "learning_rate": 7.818567586034577e-05, "loss": 0.2932, "step": 2850 }, { "epoch": 5.714285714285714, "grad_norm": 0.7869848608970642, "learning_rate": 7.75729239050619e-05, "loss": 0.2851, "step": 2860 }, { "epoch": 5.734265734265734, "grad_norm": 0.7781445980072021, "learning_rate": 7.696105733234098e-05, "loss": 0.2849, "step": 2870 }, { "epoch": 5.754245754245754, "grad_norm": 0.8406656980514526, "learning_rate": 7.635010029762756e-05, "loss": 0.2854, "step": 2880 }, { "epoch": 5.7742257742257745, "grad_norm": 0.7491788864135742, "learning_rate": 7.574007692045928e-05, "loss": 0.288, "step": 2890 }, { "epoch": 5.794205794205794, "grad_norm": 0.7962749004364014, "learning_rate": 7.513101128351454e-05, "loss": 0.2888, "step": 2900 }, { "epoch": 5.814185814185814, "grad_norm": 0.7898345589637756, "learning_rate": 7.45229274316618e-05, "loss": 0.2875, "step": 2910 }, { "epoch": 5.834165834165834, "grad_norm": 0.7886426448822021, "learning_rate": 7.391584937101033e-05, "loss": 0.2947, "step": 2920 }, { "epoch": 5.854145854145854, "grad_norm": 0.7488512396812439, "learning_rate": 7.330980106796246e-05, "loss": 0.2846, "step": 2930 }, { "epoch": 5.874125874125874, "grad_norm": 0.7348522543907166, "learning_rate": 7.270480644826749e-05, "loss": 0.2883, "step": 2940 }, { "epoch": 5.894105894105894, "grad_norm": 0.7618998885154724, "learning_rate": 7.210088939607708e-05, "loss": 0.2899, "step": 2950 }, { "epoch": 5.9140859140859146, "grad_norm": 0.78291255235672, "learning_rate": 7.149807375300239e-05, "loss": 0.2865, "step": 2960 }, { "epoch": 5.934065934065934, "grad_norm": 0.7446394562721252, "learning_rate": 7.089638331717284e-05, "loss": 0.2846, "step": 2970 }, { "epoch": 5.954045954045954, "grad_norm": 0.767301619052887, "learning_rate": 7.029584184229653e-05, "loss": 0.2887, "step": 2980 }, { "epoch": 5.974025974025974, "grad_norm": 0.7523135542869568, "learning_rate": 6.969647303672262e-05, "loss": 0.2873, "step": 2990 }, { "epoch": 5.9940059940059935, "grad_norm": 0.7532919049263, "learning_rate": 6.909830056250527e-05, "loss": 0.2882, "step": 3000 }, { "epoch": 6.013986013986014, "grad_norm": 0.6552711129188538, "learning_rate": 6.850134803446954e-05, "loss": 0.2488, "step": 3010 }, { "epoch": 6.033966033966034, "grad_norm": 0.6565443873405457, "learning_rate": 6.790563901927907e-05, "loss": 0.2345, "step": 3020 }, { "epoch": 6.053946053946054, "grad_norm": 0.6884881854057312, "learning_rate": 6.731119703450577e-05, "loss": 0.233, "step": 3030 }, { "epoch": 6.073926073926074, "grad_norm": 0.6287186741828918, "learning_rate": 6.671804554770135e-05, "loss": 0.2356, "step": 3040 }, { "epoch": 6.093906093906094, "grad_norm": 0.754036545753479, "learning_rate": 6.612620797547087e-05, "loss": 0.2352, "step": 3050 }, { "epoch": 6.113886113886114, "grad_norm": 0.6492979526519775, "learning_rate": 6.55357076825483e-05, "loss": 0.2329, "step": 3060 }, { "epoch": 6.1338661338661336, "grad_norm": 0.6303039789199829, "learning_rate": 6.494656798087412e-05, "loss": 0.2339, "step": 3070 }, { "epoch": 6.153846153846154, "grad_norm": 0.6423007845878601, "learning_rate": 6.435881212867493e-05, "loss": 0.2377, "step": 3080 }, { "epoch": 6.173826173826174, "grad_norm": 0.6716975569725037, "learning_rate": 6.377246332954544e-05, "loss": 0.2365, "step": 3090 }, { "epoch": 6.193806193806194, "grad_norm": 0.6927747130393982, "learning_rate": 6.318754473153221e-05, "loss": 0.2346, "step": 3100 }, { "epoch": 6.213786213786213, "grad_norm": 0.6551555395126343, "learning_rate": 6.260407942621998e-05, "loss": 0.235, "step": 3110 }, { "epoch": 6.233766233766234, "grad_norm": 0.7131916284561157, "learning_rate": 6.20220904478199e-05, "loss": 0.2401, "step": 3120 }, { "epoch": 6.253746253746254, "grad_norm": 0.7002174258232117, "learning_rate": 6.144160077226036e-05, "loss": 0.2398, "step": 3130 }, { "epoch": 6.273726273726274, "grad_norm": 0.7129354476928711, "learning_rate": 6.086263331627976e-05, "loss": 0.2401, "step": 3140 }, { "epoch": 6.293706293706293, "grad_norm": 0.6942778825759888, "learning_rate": 6.0285210936521955e-05, "loss": 0.2391, "step": 3150 }, { "epoch": 6.313686313686314, "grad_norm": 0.7181575298309326, "learning_rate": 5.9709356428633746e-05, "loss": 0.2434, "step": 3160 }, { "epoch": 6.333666333666334, "grad_norm": 0.720330536365509, "learning_rate": 5.913509252636511e-05, "loss": 0.2352, "step": 3170 }, { "epoch": 6.353646353646353, "grad_norm": 0.6518005728721619, "learning_rate": 5.856244190067159e-05, "loss": 0.2377, "step": 3180 }, { "epoch": 6.373626373626374, "grad_norm": 0.6705808639526367, "learning_rate": 5.799142715881938e-05, "loss": 0.2416, "step": 3190 }, { "epoch": 6.393606393606394, "grad_norm": 0.7210578322410583, "learning_rate": 5.7422070843492734e-05, "loss": 0.2406, "step": 3200 }, { "epoch": 6.413586413586414, "grad_norm": 0.6428204774856567, "learning_rate": 5.6854395431904094e-05, "loss": 0.2397, "step": 3210 }, { "epoch": 6.433566433566433, "grad_norm": 0.697733461856842, "learning_rate": 5.6288423334906735e-05, "loss": 0.2425, "step": 3220 }, { "epoch": 6.453546453546454, "grad_norm": 0.7867773175239563, "learning_rate": 5.572417689610987e-05, "loss": 0.2401, "step": 3230 }, { "epoch": 6.473526473526474, "grad_norm": 0.6750375032424927, "learning_rate": 5.5161678390996796e-05, "loss": 0.2396, "step": 3240 }, { "epoch": 6.4935064935064934, "grad_norm": 0.677237868309021, "learning_rate": 5.4600950026045326e-05, "loss": 0.2434, "step": 3250 }, { "epoch": 6.513486513486513, "grad_norm": 0.6781632304191589, "learning_rate": 5.404201393785122e-05, "loss": 0.2454, "step": 3260 }, { "epoch": 6.533466533466534, "grad_norm": 0.7506418824195862, "learning_rate": 5.348489219225416e-05, "loss": 0.2397, "step": 3270 }, { "epoch": 6.553446553446554, "grad_norm": 0.7256707549095154, "learning_rate": 5.292960678346675e-05, "loss": 0.2403, "step": 3280 }, { "epoch": 6.573426573426573, "grad_norm": 0.664169430732727, "learning_rate": 5.237617963320608e-05, "loss": 0.2392, "step": 3290 }, { "epoch": 6.593406593406593, "grad_norm": 0.7900999188423157, "learning_rate": 5.182463258982846e-05, "loss": 0.2426, "step": 3300 }, { "epoch": 6.613386613386614, "grad_norm": 0.7012047171592712, "learning_rate": 5.127498742746675e-05, "loss": 0.2429, "step": 3310 }, { "epoch": 6.6333666333666335, "grad_norm": 0.752498984336853, "learning_rate": 5.072726584517086e-05, "loss": 0.2425, "step": 3320 }, { "epoch": 6.653346653346653, "grad_norm": 0.7256404161453247, "learning_rate": 5.018148946605092e-05, "loss": 0.2381, "step": 3330 }, { "epoch": 6.673326673326673, "grad_norm": 0.6938993334770203, "learning_rate": 4.9637679836423924e-05, "loss": 0.2428, "step": 3340 }, { "epoch": 6.693306693306694, "grad_norm": 0.7288166284561157, "learning_rate": 4.909585842496287e-05, "loss": 0.2409, "step": 3350 }, { "epoch": 6.713286713286713, "grad_norm": 0.7148503661155701, "learning_rate": 4.8556046621849346e-05, "loss": 0.2402, "step": 3360 }, { "epoch": 6.733266733266733, "grad_norm": 0.7477458715438843, "learning_rate": 4.8018265737929044e-05, "loss": 0.2394, "step": 3370 }, { "epoch": 6.753246753246753, "grad_norm": 0.7404049634933472, "learning_rate": 4.748253700387042e-05, "loss": 0.2422, "step": 3380 }, { "epoch": 6.7732267732267735, "grad_norm": 0.6715726852416992, "learning_rate": 4.694888156932658e-05, "loss": 0.2405, "step": 3390 }, { "epoch": 6.793206793206793, "grad_norm": 0.6998412609100342, "learning_rate": 4.6417320502100316e-05, "loss": 0.2405, "step": 3400 }, { "epoch": 6.813186813186813, "grad_norm": 0.7061425447463989, "learning_rate": 4.588787478731242e-05, "loss": 0.2368, "step": 3410 }, { "epoch": 6.833166833166834, "grad_norm": 0.7432896494865417, "learning_rate": 4.5360565326573104e-05, "loss": 0.2399, "step": 3420 }, { "epoch": 6.853146853146853, "grad_norm": 0.7876798510551453, "learning_rate": 4.483541293715698e-05, "loss": 0.2395, "step": 3430 }, { "epoch": 6.873126873126873, "grad_norm": 0.7446125149726868, "learning_rate": 4.431243835118124e-05, "loss": 0.241, "step": 3440 }, { "epoch": 6.893106893106893, "grad_norm": 0.6832261085510254, "learning_rate": 4.379166221478697e-05, "loss": 0.2396, "step": 3450 }, { "epoch": 6.913086913086913, "grad_norm": 0.7039461135864258, "learning_rate": 4.327310508732437e-05, "loss": 0.2408, "step": 3460 }, { "epoch": 6.933066933066933, "grad_norm": 0.7428474426269531, "learning_rate": 4.2756787440540936e-05, "loss": 0.2407, "step": 3470 }, { "epoch": 6.953046953046953, "grad_norm": 0.7313565015792847, "learning_rate": 4.224272965777326e-05, "loss": 0.2406, "step": 3480 }, { "epoch": 6.973026973026973, "grad_norm": 0.7175894975662231, "learning_rate": 4.173095203314241e-05, "loss": 0.2409, "step": 3490 }, { "epoch": 6.993006993006993, "grad_norm": 0.6897133588790894, "learning_rate": 4.12214747707527e-05, "loss": 0.2389, "step": 3500 }, { "epoch": 7.012987012987013, "grad_norm": 0.5959777235984802, "learning_rate": 4.071431798389408e-05, "loss": 0.2184, "step": 3510 }, { "epoch": 7.032967032967033, "grad_norm": 0.7147582173347473, "learning_rate": 4.020950169424815e-05, "loss": 0.2087, "step": 3520 }, { "epoch": 7.052947052947053, "grad_norm": 0.6122413873672485, "learning_rate": 3.9707045831097555e-05, "loss": 0.2106, "step": 3530 }, { "epoch": 7.072927072927073, "grad_norm": 0.633969783782959, "learning_rate": 3.920697023053949e-05, "loss": 0.2099, "step": 3540 }, { "epoch": 7.092907092907093, "grad_norm": 0.6842843890190125, "learning_rate": 3.8709294634702376e-05, "loss": 0.2104, "step": 3550 }, { "epoch": 7.112887112887113, "grad_norm": 0.5708280205726624, "learning_rate": 3.821403869096658e-05, "loss": 0.2125, "step": 3560 }, { "epoch": 7.1328671328671325, "grad_norm": 0.6579930782318115, "learning_rate": 3.7721221951188765e-05, "loss": 0.2107, "step": 3570 }, { "epoch": 7.152847152847153, "grad_norm": 0.5980693101882935, "learning_rate": 3.7230863870929964e-05, "loss": 0.2085, "step": 3580 }, { "epoch": 7.172827172827173, "grad_norm": 0.5968551635742188, "learning_rate": 3.674298380868756e-05, "loss": 0.209, "step": 3590 }, { "epoch": 7.192807192807193, "grad_norm": 0.6218951940536499, "learning_rate": 3.6257601025131026e-05, "loss": 0.2095, "step": 3600 }, { "epoch": 7.212787212787212, "grad_norm": 0.6248393058776855, "learning_rate": 3.577473468234156e-05, "loss": 0.2155, "step": 3610 }, { "epoch": 7.232767232767233, "grad_norm": 0.6496105194091797, "learning_rate": 3.52944038430556e-05, "loss": 0.2139, "step": 3620 }, { "epoch": 7.252747252747253, "grad_norm": 0.6064103841781616, "learning_rate": 3.481662746991214e-05, "loss": 0.2081, "step": 3630 }, { "epoch": 7.2727272727272725, "grad_norm": 0.6504641771316528, "learning_rate": 3.4341424424704375e-05, "loss": 0.2111, "step": 3640 }, { "epoch": 7.292707292707293, "grad_norm": 0.6580168604850769, "learning_rate": 3.386881346763483e-05, "loss": 0.2123, "step": 3650 }, { "epoch": 7.312687312687313, "grad_norm": 0.5861549973487854, "learning_rate": 3.339881325657484e-05, "loss": 0.2084, "step": 3660 }, { "epoch": 7.332667332667333, "grad_norm": 0.6313382387161255, "learning_rate": 3.2931442346328004e-05, "loss": 0.2078, "step": 3670 }, { "epoch": 7.352647352647352, "grad_norm": 0.646842896938324, "learning_rate": 3.246671918789755e-05, "loss": 0.2135, "step": 3680 }, { "epoch": 7.372627372627373, "grad_norm": 0.6964268088340759, "learning_rate": 3.200466212775808e-05, "loss": 0.2126, "step": 3690 }, { "epoch": 7.392607392607393, "grad_norm": 0.6139673590660095, "learning_rate": 3.154528940713113e-05, "loss": 0.215, "step": 3700 }, { "epoch": 7.4125874125874125, "grad_norm": 0.6455628871917725, "learning_rate": 3.108861916126518e-05, "loss": 0.2114, "step": 3710 }, { "epoch": 7.432567432567432, "grad_norm": 0.6227108240127563, "learning_rate": 3.063466941871952e-05, "loss": 0.2114, "step": 3720 }, { "epoch": 7.452547452547453, "grad_norm": 0.5858675837516785, "learning_rate": 3.018345810065275e-05, "loss": 0.2107, "step": 3730 }, { "epoch": 7.472527472527473, "grad_norm": 0.6218124628067017, "learning_rate": 2.9735003020115092e-05, "loss": 0.2115, "step": 3740 }, { "epoch": 7.492507492507492, "grad_norm": 0.6510396003723145, "learning_rate": 2.9289321881345254e-05, "loss": 0.2124, "step": 3750 }, { "epoch": 7.512487512487512, "grad_norm": 0.6465820074081421, "learning_rate": 2.8846432279071467e-05, "loss": 0.2132, "step": 3760 }, { "epoch": 7.532467532467533, "grad_norm": 0.7002317905426025, "learning_rate": 2.840635169781688e-05, "loss": 0.2129, "step": 3770 }, { "epoch": 7.5524475524475525, "grad_norm": 0.647723913192749, "learning_rate": 2.7969097511209308e-05, "loss": 0.2136, "step": 3780 }, { "epoch": 7.572427572427572, "grad_norm": 0.5907153487205505, "learning_rate": 2.753468698129533e-05, "loss": 0.2115, "step": 3790 }, { "epoch": 7.592407592407593, "grad_norm": 0.6074231863021851, "learning_rate": 2.7103137257858868e-05, "loss": 0.2128, "step": 3800 }, { "epoch": 7.612387612387613, "grad_norm": 0.6356890797615051, "learning_rate": 2.6674465377744017e-05, "loss": 0.2108, "step": 3810 }, { "epoch": 7.632367632367632, "grad_norm": 0.6739248633384705, "learning_rate": 2.624868826418262e-05, "loss": 0.2129, "step": 3820 }, { "epoch": 7.652347652347652, "grad_norm": 0.6241906881332397, "learning_rate": 2.582582272612609e-05, "loss": 0.211, "step": 3830 }, { "epoch": 7.672327672327672, "grad_norm": 0.6532058715820312, "learning_rate": 2.540588545758179e-05, "loss": 0.2137, "step": 3840 }, { "epoch": 7.6923076923076925, "grad_norm": 0.7098828554153442, "learning_rate": 2.4988893036954043e-05, "loss": 0.2105, "step": 3850 }, { "epoch": 7.712287712287712, "grad_norm": 0.6868453025817871, "learning_rate": 2.4574861926389615e-05, "loss": 0.214, "step": 3860 }, { "epoch": 7.732267732267732, "grad_norm": 0.6777834296226501, "learning_rate": 2.4163808471127812e-05, "loss": 0.2125, "step": 3870 }, { "epoch": 7.752247752247753, "grad_norm": 0.6967138648033142, "learning_rate": 2.37557488988552e-05, "loss": 0.2118, "step": 3880 }, { "epoch": 7.772227772227772, "grad_norm": 0.6641217470169067, "learning_rate": 2.3350699319065026e-05, "loss": 0.2134, "step": 3890 }, { "epoch": 7.792207792207792, "grad_norm": 0.6727011799812317, "learning_rate": 2.2948675722421086e-05, "loss": 0.217, "step": 3900 }, { "epoch": 7.812187812187812, "grad_norm": 0.6331846117973328, "learning_rate": 2.254969398012663e-05, "loss": 0.2127, "step": 3910 }, { "epoch": 7.8321678321678325, "grad_norm": 0.6486308574676514, "learning_rate": 2.2153769843297667e-05, "loss": 0.2096, "step": 3920 }, { "epoch": 7.852147852147852, "grad_norm": 0.6658995151519775, "learning_rate": 2.1760918942341192e-05, "loss": 0.211, "step": 3930 }, { "epoch": 7.872127872127872, "grad_norm": 0.687493085861206, "learning_rate": 2.137115678633811e-05, "loss": 0.2163, "step": 3940 }, { "epoch": 7.892107892107892, "grad_norm": 0.6267641186714172, "learning_rate": 2.098449876243096e-05, "loss": 0.2142, "step": 3950 }, { "epoch": 7.912087912087912, "grad_norm": 0.6141098141670227, "learning_rate": 2.0600960135216462e-05, "loss": 0.2134, "step": 3960 }, { "epoch": 7.932067932067932, "grad_norm": 0.6436827182769775, "learning_rate": 2.0220556046142893e-05, "loss": 0.214, "step": 3970 }, { "epoch": 7.952047952047952, "grad_norm": 0.6543010473251343, "learning_rate": 1.9843301512912327e-05, "loss": 0.2126, "step": 3980 }, { "epoch": 7.972027972027972, "grad_norm": 0.6083731651306152, "learning_rate": 1.946921142888781e-05, "loss": 0.2135, "step": 3990 }, { "epoch": 7.992007992007992, "grad_norm": 0.6408571600914001, "learning_rate": 1.9098300562505266e-05, "loss": 0.2123, "step": 4000 }, { "epoch": 8.011988011988011, "grad_norm": 0.556982159614563, "learning_rate": 1.8730583556690605e-05, "loss": 0.2042, "step": 4010 }, { "epoch": 8.031968031968033, "grad_norm": 0.5726343393325806, "learning_rate": 1.8366074928281607e-05, "loss": 0.1941, "step": 4020 }, { "epoch": 8.051948051948052, "grad_norm": 0.5825814604759216, "learning_rate": 1.8004789067454764e-05, "loss": 0.1976, "step": 4030 }, { "epoch": 8.071928071928072, "grad_norm": 0.569325864315033, "learning_rate": 1.7646740237157256e-05, "loss": 0.196, "step": 4040 }, { "epoch": 8.091908091908092, "grad_norm": 0.5917354226112366, "learning_rate": 1.7291942572543807e-05, "loss": 0.195, "step": 4050 }, { "epoch": 8.111888111888112, "grad_norm": 0.5817933678627014, "learning_rate": 1.6940410080418723e-05, "loss": 0.1971, "step": 4060 }, { "epoch": 8.131868131868131, "grad_norm": 0.6475218534469604, "learning_rate": 1.6592156638682886e-05, "loss": 0.197, "step": 4070 }, { "epoch": 8.151848151848151, "grad_norm": 0.6248770356178284, "learning_rate": 1.6247195995785837e-05, "loss": 0.1971, "step": 4080 }, { "epoch": 8.171828171828173, "grad_norm": 0.5749895572662354, "learning_rate": 1.5905541770183096e-05, "loss": 0.1964, "step": 4090 }, { "epoch": 8.191808191808192, "grad_norm": 0.6148300766944885, "learning_rate": 1.5567207449798515e-05, "loss": 0.1966, "step": 4100 }, { "epoch": 8.211788211788212, "grad_norm": 0.6778724789619446, "learning_rate": 1.5232206391491699e-05, "loss": 0.1955, "step": 4110 }, { "epoch": 8.231768231768232, "grad_norm": 0.5883269906044006, "learning_rate": 1.4900551820530828e-05, "loss": 0.1919, "step": 4120 }, { "epoch": 8.251748251748252, "grad_norm": 0.567950963973999, "learning_rate": 1.4572256830070497e-05, "loss": 0.1966, "step": 4130 }, { "epoch": 8.271728271728271, "grad_norm": 0.5733300447463989, "learning_rate": 1.4247334380634792e-05, "loss": 0.1964, "step": 4140 }, { "epoch": 8.291708291708291, "grad_norm": 0.638990044593811, "learning_rate": 1.3925797299605647e-05, "loss": 0.1944, "step": 4150 }, { "epoch": 8.311688311688311, "grad_norm": 0.6272343397140503, "learning_rate": 1.3607658280716473e-05, "loss": 0.1951, "step": 4160 }, { "epoch": 8.331668331668332, "grad_norm": 0.5631300210952759, "learning_rate": 1.3292929883550998e-05, "loss": 0.1983, "step": 4170 }, { "epoch": 8.351648351648352, "grad_norm": 0.6056917309761047, "learning_rate": 1.2981624533047432e-05, "loss": 0.1976, "step": 4180 }, { "epoch": 8.371628371628372, "grad_norm": 0.6021771430969238, "learning_rate": 1.2673754519008008e-05, "loss": 0.1968, "step": 4190 }, { "epoch": 8.391608391608392, "grad_norm": 0.5835386514663696, "learning_rate": 1.2369331995613665e-05, "loss": 0.1977, "step": 4200 }, { "epoch": 8.411588411588411, "grad_norm": 0.5700567960739136, "learning_rate": 1.206836898094439e-05, "loss": 0.1992, "step": 4210 }, { "epoch": 8.431568431568431, "grad_norm": 0.6391722559928894, "learning_rate": 1.1770877356504683e-05, "loss": 0.1977, "step": 4220 }, { "epoch": 8.451548451548451, "grad_norm": 0.5633198022842407, "learning_rate": 1.1476868866754486e-05, "loss": 0.1975, "step": 4230 }, { "epoch": 8.471528471528472, "grad_norm": 0.6308007836341858, "learning_rate": 1.1186355118645554e-05, "loss": 0.2002, "step": 4240 }, { "epoch": 8.491508491508492, "grad_norm": 0.6147842407226562, "learning_rate": 1.0899347581163221e-05, "loss": 0.199, "step": 4250 }, { "epoch": 8.511488511488512, "grad_norm": 0.6099655628204346, "learning_rate": 1.0615857584873623e-05, "loss": 0.1971, "step": 4260 }, { "epoch": 8.531468531468532, "grad_norm": 0.6306450366973877, "learning_rate": 1.0335896321476413e-05, "loss": 0.1971, "step": 4270 }, { "epoch": 8.551448551448551, "grad_norm": 0.5740554928779602, "learning_rate": 1.0059474843362892e-05, "loss": 0.1964, "step": 4280 }, { "epoch": 8.571428571428571, "grad_norm": 0.566005289554596, "learning_rate": 9.786604063179728e-06, "loss": 0.197, "step": 4290 }, { "epoch": 8.591408591408591, "grad_norm": 0.6008467674255371, "learning_rate": 9.517294753398064e-06, "loss": 0.1969, "step": 4300 }, { "epoch": 8.61138861138861, "grad_norm": 0.5880402326583862, "learning_rate": 9.251557545888312e-06, "loss": 0.1944, "step": 4310 }, { "epoch": 8.631368631368632, "grad_norm": 0.6250616908073425, "learning_rate": 8.989402931500434e-06, "loss": 0.1978, "step": 4320 }, { "epoch": 8.651348651348652, "grad_norm": 0.554460883140564, "learning_rate": 8.730841259649725e-06, "loss": 0.1998, "step": 4330 }, { "epoch": 8.671328671328672, "grad_norm": 0.5680242776870728, "learning_rate": 8.475882737908248e-06, "loss": 0.2, "step": 4340 }, { "epoch": 8.691308691308691, "grad_norm": 0.5889159440994263, "learning_rate": 8.224537431601886e-06, "loss": 0.1985, "step": 4350 }, { "epoch": 8.711288711288711, "grad_norm": 0.6051207780838013, "learning_rate": 7.976815263412963e-06, "loss": 0.1944, "step": 4360 }, { "epoch": 8.731268731268731, "grad_norm": 0.6148102283477783, "learning_rate": 7.73272601298851e-06, "loss": 0.1952, "step": 4370 }, { "epoch": 8.75124875124875, "grad_norm": 0.6123753786087036, "learning_rate": 7.492279316554207e-06, "loss": 0.1955, "step": 4380 }, { "epoch": 8.77122877122877, "grad_norm": 0.5911871790885925, "learning_rate": 7.255484666533874e-06, "loss": 0.1987, "step": 4390 }, { "epoch": 8.791208791208792, "grad_norm": 0.5861064195632935, "learning_rate": 7.022351411174866e-06, "loss": 0.1972, "step": 4400 }, { "epoch": 8.811188811188812, "grad_norm": 0.6565813422203064, "learning_rate": 6.7928887541789055e-06, "loss": 0.1966, "step": 4410 }, { "epoch": 8.831168831168831, "grad_norm": 0.6338573694229126, "learning_rate": 6.5671057543387985e-06, "loss": 0.1987, "step": 4420 }, { "epoch": 8.851148851148851, "grad_norm": 0.5672295093536377, "learning_rate": 6.345011325180772e-06, "loss": 0.198, "step": 4430 }, { "epoch": 8.871128871128871, "grad_norm": 0.6036155223846436, "learning_rate": 6.126614234612593e-06, "loss": 0.199, "step": 4440 }, { "epoch": 8.89110889110889, "grad_norm": 0.5816395878791809, "learning_rate": 5.911923104577455e-06, "loss": 0.1985, "step": 4450 }, { "epoch": 8.91108891108891, "grad_norm": 0.5562584400177002, "learning_rate": 5.700946410713548e-06, "loss": 0.1964, "step": 4460 }, { "epoch": 8.931068931068932, "grad_norm": 0.6179762482643127, "learning_rate": 5.49369248201953e-06, "loss": 0.1948, "step": 4470 }, { "epoch": 8.951048951048952, "grad_norm": 0.5566456317901611, "learning_rate": 5.290169500525577e-06, "loss": 0.1958, "step": 4480 }, { "epoch": 8.971028971028971, "grad_norm": 0.6196462512016296, "learning_rate": 5.0903855009705514e-06, "loss": 0.1978, "step": 4490 }, { "epoch": 8.991008991008991, "grad_norm": 0.5933112502098083, "learning_rate": 4.8943483704846475e-06, "loss": 0.1962, "step": 4500 }, { "epoch": 9.010989010989011, "grad_norm": 0.5680419206619263, "learning_rate": 4.702065848278126e-06, "loss": 0.1948, "step": 4510 }, { "epoch": 9.03096903096903, "grad_norm": 0.5447672605514526, "learning_rate": 4.513545525335705e-06, "loss": 0.1894, "step": 4520 }, { "epoch": 9.05094905094905, "grad_norm": 0.5605758428573608, "learning_rate": 4.328794844116946e-06, "loss": 0.1903, "step": 4530 }, { "epoch": 9.07092907092907, "grad_norm": 0.5727641582489014, "learning_rate": 4.147821098262405e-06, "loss": 0.1899, "step": 4540 }, { "epoch": 9.090909090909092, "grad_norm": 0.5076532363891602, "learning_rate": 3.970631432305694e-06, "loss": 0.1872, "step": 4550 }, { "epoch": 9.110889110889111, "grad_norm": 0.5827686190605164, "learning_rate": 3.797232841391407e-06, "loss": 0.1871, "step": 4560 }, { "epoch": 9.130869130869131, "grad_norm": 0.5457426905632019, "learning_rate": 3.627632170999029e-06, "loss": 0.1903, "step": 4570 }, { "epoch": 9.150849150849151, "grad_norm": 0.5931391716003418, "learning_rate": 3.461836116672612e-06, "loss": 0.1935, "step": 4580 }, { "epoch": 9.17082917082917, "grad_norm": 0.5335982441902161, "learning_rate": 3.2998512237565005e-06, "loss": 0.188, "step": 4590 }, { "epoch": 9.19080919080919, "grad_norm": 0.5809586048126221, "learning_rate": 3.1416838871368924e-06, "loss": 0.1882, "step": 4600 }, { "epoch": 9.21078921078921, "grad_norm": 0.5997488498687744, "learning_rate": 2.9873403509894203e-06, "loss": 0.189, "step": 4610 }, { "epoch": 9.23076923076923, "grad_norm": 0.5423487424850464, "learning_rate": 2.836826708532603e-06, "loss": 0.1916, "step": 4620 }, { "epoch": 9.250749250749251, "grad_norm": 0.5920736193656921, "learning_rate": 2.690148901787337e-06, "loss": 0.1914, "step": 4630 }, { "epoch": 9.270729270729271, "grad_norm": 0.5774621367454529, "learning_rate": 2.5473127213422763e-06, "loss": 0.1901, "step": 4640 }, { "epoch": 9.290709290709291, "grad_norm": 0.6183256506919861, "learning_rate": 2.4083238061252567e-06, "loss": 0.1918, "step": 4650 }, { "epoch": 9.31068931068931, "grad_norm": 0.5502414107322693, "learning_rate": 2.273187643180652e-06, "loss": 0.1888, "step": 4660 }, { "epoch": 9.33066933066933, "grad_norm": 0.5888564586639404, "learning_rate": 2.141909567452793e-06, "loss": 0.189, "step": 4670 }, { "epoch": 9.35064935064935, "grad_norm": 0.582281231880188, "learning_rate": 2.014494761575314e-06, "loss": 0.188, "step": 4680 }, { "epoch": 9.37062937062937, "grad_norm": 0.549766480922699, "learning_rate": 1.8909482556666024e-06, "loss": 0.1911, "step": 4690 }, { "epoch": 9.390609390609391, "grad_norm": 0.6442523002624512, "learning_rate": 1.771274927131139e-06, "loss": 0.1913, "step": 4700 }, { "epoch": 9.410589410589411, "grad_norm": 0.5612021684646606, "learning_rate": 1.6554795004670388e-06, "loss": 0.1926, "step": 4710 }, { "epoch": 9.430569430569431, "grad_norm": 0.6060473918914795, "learning_rate": 1.543566547079467e-06, "loss": 0.19, "step": 4720 }, { "epoch": 9.45054945054945, "grad_norm": 0.5958064794540405, "learning_rate": 1.4355404851001952e-06, "loss": 0.1885, "step": 4730 }, { "epoch": 9.47052947052947, "grad_norm": 0.536431610584259, "learning_rate": 1.3314055792131964e-06, "loss": 0.1891, "step": 4740 }, { "epoch": 9.49050949050949, "grad_norm": 0.5971366763114929, "learning_rate": 1.231165940486234e-06, "loss": 0.1889, "step": 4750 }, { "epoch": 9.51048951048951, "grad_norm": 0.5461220145225525, "learning_rate": 1.134825526208605e-06, "loss": 0.1874, "step": 4760 }, { "epoch": 9.53046953046953, "grad_norm": 0.570928156375885, "learning_rate": 1.0423881397349068e-06, "loss": 0.1884, "step": 4770 }, { "epoch": 9.550449550449551, "grad_norm": 0.5855159759521484, "learning_rate": 9.538574303348813e-07, "loss": 0.1895, "step": 4780 }, { "epoch": 9.570429570429571, "grad_norm": 0.5505802631378174, "learning_rate": 8.692368930493521e-07, "loss": 0.1904, "step": 4790 }, { "epoch": 9.59040959040959, "grad_norm": 0.5663396716117859, "learning_rate": 7.885298685522235e-07, "loss": 0.1909, "step": 4800 }, { "epoch": 9.61038961038961, "grad_norm": 0.6069871783256531, "learning_rate": 7.117395430186414e-07, "loss": 0.1895, "step": 4810 }, { "epoch": 9.63036963036963, "grad_norm": 0.5576395988464355, "learning_rate": 6.388689479991605e-07, "loss": 0.1906, "step": 4820 }, { "epoch": 9.65034965034965, "grad_norm": 0.5069971084594727, "learning_rate": 5.699209603001076e-07, "loss": 0.1889, "step": 4830 }, { "epoch": 9.67032967032967, "grad_norm": 0.5770872235298157, "learning_rate": 5.048983018699827e-07, "loss": 0.1907, "step": 4840 }, { "epoch": 9.69030969030969, "grad_norm": 0.6914857029914856, "learning_rate": 4.438035396920004e-07, "loss": 0.1939, "step": 4850 }, { "epoch": 9.710289710289711, "grad_norm": 0.5999007821083069, "learning_rate": 3.866390856827495e-07, "loss": 0.1924, "step": 4860 }, { "epoch": 9.73026973026973, "grad_norm": 0.569180965423584, "learning_rate": 3.3340719659701313e-07, "loss": 0.1887, "step": 4870 }, { "epoch": 9.75024975024975, "grad_norm": 0.5442143082618713, "learning_rate": 2.841099739386066e-07, "loss": 0.1897, "step": 4880 }, { "epoch": 9.77022977022977, "grad_norm": 0.5622804164886475, "learning_rate": 2.387493638774774e-07, "loss": 0.1898, "step": 4890 }, { "epoch": 9.79020979020979, "grad_norm": 0.6558981537818909, "learning_rate": 1.973271571728441e-07, "loss": 0.1917, "step": 4900 }, { "epoch": 9.81018981018981, "grad_norm": 0.5756235122680664, "learning_rate": 1.598449891024978e-07, "loss": 0.192, "step": 4910 }, { "epoch": 9.83016983016983, "grad_norm": 0.5818027257919312, "learning_rate": 1.2630433939825327e-07, "loss": 0.1899, "step": 4920 }, { "epoch": 9.850149850149851, "grad_norm": 0.5986452698707581, "learning_rate": 9.670653218752934e-08, "loss": 0.1918, "step": 4930 }, { "epoch": 9.87012987012987, "grad_norm": 0.5438185334205627, "learning_rate": 7.105273594107953e-08, "loss": 0.1905, "step": 4940 }, { "epoch": 9.89010989010989, "grad_norm": 0.5430960059165955, "learning_rate": 4.934396342684e-08, "loss": 0.1913, "step": 4950 }, { "epoch": 9.91008991008991, "grad_norm": 0.5492510199546814, "learning_rate": 3.1581071670006015e-08, "loss": 0.1904, "step": 4960 }, { "epoch": 9.93006993006993, "grad_norm": 0.5370259881019592, "learning_rate": 1.7764761919103477e-08, "loss": 0.1901, "step": 4970 }, { "epoch": 9.95004995004995, "grad_norm": 0.5463282465934753, "learning_rate": 7.895579618388827e-09, "loss": 0.191, "step": 4980 }, { "epoch": 9.97002997002997, "grad_norm": 0.5733128190040588, "learning_rate": 1.973914386288467e-09, "loss": 0.1885, "step": 4990 }, { "epoch": 9.99000999000999, "grad_norm": 0.5241893529891968, "learning_rate": 0.0, "loss": 0.1916, "step": 5000 }, { "epoch": 9.99000999000999, "step": 5000, "total_flos": 7.37720834306605e+17, "train_loss": 0.4096551623106003, "train_runtime": 80947.5605, "train_samples_per_second": 0.742, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 7.37720834306605e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }