diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7506430933812421, + "epoch": 0.9998775060226206, "eval_steps": 766, - "global_step": 2298, + "global_step": 3061, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -16125,6 +16125,5347 @@ "eval_samples_per_second": 5.119, "eval_steps_per_second": 2.559, "step": 2298 + }, + { + "epoch": 0.7509697439875873, + "grad_norm": 1.218799114227295, + "learning_rate": 1.4987523307921375e-05, + "loss": 1.515, + "step": 2299 + }, + { + "epoch": 0.7512963945939325, + "grad_norm": 1.8166571855545044, + "learning_rate": 1.4950299309775967e-05, + "loss": 2.6661, + "step": 2300 + }, + { + "epoch": 0.7516230452002777, + "grad_norm": 0.21096064150333405, + "learning_rate": 1.491311346760143e-05, + "loss": 0.7332, + "step": 2301 + }, + { + "epoch": 0.7519496958066229, + "grad_norm": 0.22826731204986572, + "learning_rate": 1.4875965821879212e-05, + "loss": 0.7841, + "step": 2302 + }, + { + "epoch": 0.752276346412968, + "grad_norm": 0.2562881410121918, + "learning_rate": 1.4838856413049107e-05, + "loss": 0.8088, + "step": 2303 + }, + { + "epoch": 0.7526029970193132, + "grad_norm": 0.2786596119403839, + "learning_rate": 1.4801785281509394e-05, + "loss": 0.8839, + "step": 2304 + }, + { + "epoch": 0.7529296476256584, + "grad_norm": 0.27765336632728577, + "learning_rate": 1.4764752467616588e-05, + "loss": 0.8375, + "step": 2305 + }, + { + "epoch": 0.7532562982320036, + "grad_norm": 0.2921684980392456, + "learning_rate": 1.4727758011685538e-05, + "loss": 0.7784, + "step": 2306 + }, + { + "epoch": 0.7535829488383488, + "grad_norm": 0.2950403094291687, + "learning_rate": 1.4690801953989313e-05, + "loss": 0.8052, + "step": 2307 + }, + { + "epoch": 0.7539095994446939, + "grad_norm": 0.3217078745365143, + "learning_rate": 1.4653884334759205e-05, + "loss": 0.7845, + "step": 2308 + }, + { + "epoch": 0.7542362500510391, + "grad_norm": 0.3093569874763489, + "learning_rate": 1.461700519418464e-05, + "loss": 0.811, + "step": 2309 + }, + { + "epoch": 0.7545629006573843, + "grad_norm": 0.3464542031288147, + "learning_rate": 1.4580164572413163e-05, + "loss": 0.8718, + "step": 2310 + }, + { + "epoch": 0.7548895512637296, + "grad_norm": 0.3515595495700836, + "learning_rate": 1.454336250955039e-05, + "loss": 0.9003, + "step": 2311 + }, + { + "epoch": 0.7552162018700748, + "grad_norm": 0.36814939975738525, + "learning_rate": 1.4506599045659951e-05, + "loss": 0.813, + "step": 2312 + }, + { + "epoch": 0.75554285247642, + "grad_norm": 0.42278334498405457, + "learning_rate": 1.4469874220763463e-05, + "loss": 1.0227, + "step": 2313 + }, + { + "epoch": 0.7558695030827651, + "grad_norm": 0.427903413772583, + "learning_rate": 1.4433188074840482e-05, + "loss": 0.9597, + "step": 2314 + }, + { + "epoch": 0.7561961536891103, + "grad_norm": 0.4434622824192047, + "learning_rate": 1.4396540647828467e-05, + "loss": 0.891, + "step": 2315 + }, + { + "epoch": 0.7565228042954555, + "grad_norm": 0.47153371572494507, + "learning_rate": 1.4359931979622665e-05, + "loss": 0.9432, + "step": 2316 + }, + { + "epoch": 0.7568494549018007, + "grad_norm": 0.4934622049331665, + "learning_rate": 1.4323362110076238e-05, + "loss": 1.0377, + "step": 2317 + }, + { + "epoch": 0.7571761055081458, + "grad_norm": 0.5599250197410583, + "learning_rate": 1.4286831078999997e-05, + "loss": 1.1322, + "step": 2318 + }, + { + "epoch": 0.757502756114491, + "grad_norm": 0.5557501316070557, + "learning_rate": 1.4250338926162583e-05, + "loss": 0.9105, + "step": 2319 + }, + { + "epoch": 0.7578294067208362, + "grad_norm": 0.6891188025474548, + "learning_rate": 1.4213885691290202e-05, + "loss": 1.2214, + "step": 2320 + }, + { + "epoch": 0.7581560573271814, + "grad_norm": 0.8066973090171814, + "learning_rate": 1.4177471414066773e-05, + "loss": 1.2513, + "step": 2321 + }, + { + "epoch": 0.7584827079335266, + "grad_norm": 0.9567353129386902, + "learning_rate": 1.4141096134133774e-05, + "loss": 1.245, + "step": 2322 + }, + { + "epoch": 0.7588093585398717, + "grad_norm": 1.3140424489974976, + "learning_rate": 1.4104759891090252e-05, + "loss": 1.274, + "step": 2323 + }, + { + "epoch": 0.7591360091462169, + "grad_norm": 1.6508132219314575, + "learning_rate": 1.4068462724492688e-05, + "loss": 1.4996, + "step": 2324 + }, + { + "epoch": 0.7594626597525622, + "grad_norm": 1.7418688535690308, + "learning_rate": 1.4032204673855142e-05, + "loss": 2.1463, + "step": 2325 + }, + { + "epoch": 0.7597893103589074, + "grad_norm": 0.2149413824081421, + "learning_rate": 1.3995985778648962e-05, + "loss": 0.7248, + "step": 2326 + }, + { + "epoch": 0.7601159609652526, + "grad_norm": 0.24548666179180145, + "learning_rate": 1.3959806078302995e-05, + "loss": 0.7657, + "step": 2327 + }, + { + "epoch": 0.7604426115715978, + "grad_norm": 0.26261597871780396, + "learning_rate": 1.3923665612203308e-05, + "loss": 0.7632, + "step": 2328 + }, + { + "epoch": 0.7607692621779429, + "grad_norm": 0.27283692359924316, + "learning_rate": 1.3887564419693328e-05, + "loss": 0.8575, + "step": 2329 + }, + { + "epoch": 0.7610959127842881, + "grad_norm": 0.28144407272338867, + "learning_rate": 1.3851502540073696e-05, + "loss": 0.8438, + "step": 2330 + }, + { + "epoch": 0.7614225633906333, + "grad_norm": 0.2961028218269348, + "learning_rate": 1.3815480012602272e-05, + "loss": 0.8569, + "step": 2331 + }, + { + "epoch": 0.7617492139969785, + "grad_norm": 0.30785036087036133, + "learning_rate": 1.377949687649407e-05, + "loss": 0.8506, + "step": 2332 + }, + { + "epoch": 0.7620758646033237, + "grad_norm": 0.29399818181991577, + "learning_rate": 1.3743553170921225e-05, + "loss": 0.7934, + "step": 2333 + }, + { + "epoch": 0.7624025152096688, + "grad_norm": 0.3192881643772125, + "learning_rate": 1.3707648935012935e-05, + "loss": 0.8179, + "step": 2334 + }, + { + "epoch": 0.762729165816014, + "grad_norm": 0.34171509742736816, + "learning_rate": 1.3671784207855442e-05, + "loss": 0.8689, + "step": 2335 + }, + { + "epoch": 0.7630558164223592, + "grad_norm": 0.3571321666240692, + "learning_rate": 1.3635959028492002e-05, + "loss": 0.8651, + "step": 2336 + }, + { + "epoch": 0.7633824670287044, + "grad_norm": 0.3970184922218323, + "learning_rate": 1.3600173435922725e-05, + "loss": 0.9319, + "step": 2337 + }, + { + "epoch": 0.7637091176350496, + "grad_norm": 0.39088132977485657, + "learning_rate": 1.3564427469104768e-05, + "loss": 0.9152, + "step": 2338 + }, + { + "epoch": 0.7640357682413949, + "grad_norm": 0.42941996455192566, + "learning_rate": 1.3528721166952007e-05, + "loss": 0.9292, + "step": 2339 + }, + { + "epoch": 0.76436241884774, + "grad_norm": 0.43511438369750977, + "learning_rate": 1.3493054568335273e-05, + "loss": 0.9781, + "step": 2340 + }, + { + "epoch": 0.7646890694540852, + "grad_norm": 0.5111512541770935, + "learning_rate": 1.3457427712082049e-05, + "loss": 0.9431, + "step": 2341 + }, + { + "epoch": 0.7650157200604304, + "grad_norm": 0.5455848574638367, + "learning_rate": 1.342184063697664e-05, + "loss": 1.2364, + "step": 2342 + }, + { + "epoch": 0.7653423706667756, + "grad_norm": 0.5164190530776978, + "learning_rate": 1.3386293381760013e-05, + "loss": 1.0224, + "step": 2343 + }, + { + "epoch": 0.7656690212731208, + "grad_norm": 0.6091725826263428, + "learning_rate": 1.3350785985129782e-05, + "loss": 1.1075, + "step": 2344 + }, + { + "epoch": 0.7659956718794659, + "grad_norm": 0.6855178475379944, + "learning_rate": 1.331531848574018e-05, + "loss": 1.1122, + "step": 2345 + }, + { + "epoch": 0.7663223224858111, + "grad_norm": 0.7483596205711365, + "learning_rate": 1.327989092220201e-05, + "loss": 1.2955, + "step": 2346 + }, + { + "epoch": 0.7666489730921563, + "grad_norm": 0.9962571263313293, + "learning_rate": 1.3244503333082586e-05, + "loss": 1.2699, + "step": 2347 + }, + { + "epoch": 0.7669756236985015, + "grad_norm": 1.1855285167694092, + "learning_rate": 1.3209155756905722e-05, + "loss": 1.257, + "step": 2348 + }, + { + "epoch": 0.7673022743048467, + "grad_norm": 1.2274976968765259, + "learning_rate": 1.3173848232151665e-05, + "loss": 1.2729, + "step": 2349 + }, + { + "epoch": 0.7676289249111918, + "grad_norm": 1.730171799659729, + "learning_rate": 1.3138580797257027e-05, + "loss": 1.3523, + "step": 2350 + }, + { + "epoch": 0.767955575517537, + "grad_norm": 0.20054863393306732, + "learning_rate": 1.3103353490614861e-05, + "loss": 0.7015, + "step": 2351 + }, + { + "epoch": 0.7682822261238822, + "grad_norm": 0.24544449150562286, + "learning_rate": 1.306816635057443e-05, + "loss": 0.8144, + "step": 2352 + }, + { + "epoch": 0.7686088767302274, + "grad_norm": 0.252315491437912, + "learning_rate": 1.3033019415441377e-05, + "loss": 0.7682, + "step": 2353 + }, + { + "epoch": 0.7689355273365727, + "grad_norm": 0.2725442051887512, + "learning_rate": 1.299791272347749e-05, + "loss": 0.7963, + "step": 2354 + }, + { + "epoch": 0.7692621779429178, + "grad_norm": 0.2909514307975769, + "learning_rate": 1.2962846312900789e-05, + "loss": 0.7745, + "step": 2355 + }, + { + "epoch": 0.769588828549263, + "grad_norm": 0.3024318814277649, + "learning_rate": 1.2927820221885446e-05, + "loss": 0.9283, + "step": 2356 + }, + { + "epoch": 0.7699154791556082, + "grad_norm": 0.32852616906166077, + "learning_rate": 1.2892834488561717e-05, + "loss": 0.9256, + "step": 2357 + }, + { + "epoch": 0.7702421297619534, + "grad_norm": 0.29283902049064636, + "learning_rate": 1.285788915101594e-05, + "loss": 0.8214, + "step": 2358 + }, + { + "epoch": 0.7705687803682986, + "grad_norm": 0.32372137904167175, + "learning_rate": 1.2822984247290493e-05, + "loss": 0.7905, + "step": 2359 + }, + { + "epoch": 0.7708954309746437, + "grad_norm": 0.3313743770122528, + "learning_rate": 1.278811981538367e-05, + "loss": 0.8942, + "step": 2360 + }, + { + "epoch": 0.7712220815809889, + "grad_norm": 0.370326429605484, + "learning_rate": 1.2753295893249811e-05, + "loss": 0.8639, + "step": 2361 + }, + { + "epoch": 0.7715487321873341, + "grad_norm": 0.371473491191864, + "learning_rate": 1.2718512518799059e-05, + "loss": 0.8924, + "step": 2362 + }, + { + "epoch": 0.7718753827936793, + "grad_norm": 0.3572691082954407, + "learning_rate": 1.2683769729897466e-05, + "loss": 0.9241, + "step": 2363 + }, + { + "epoch": 0.7722020334000245, + "grad_norm": 0.4208536446094513, + "learning_rate": 1.2649067564366896e-05, + "loss": 1.0033, + "step": 2364 + }, + { + "epoch": 0.7725286840063696, + "grad_norm": 0.4127427041530609, + "learning_rate": 1.2614406059984978e-05, + "loss": 0.881, + "step": 2365 + }, + { + "epoch": 0.7728553346127148, + "grad_norm": 0.4512838125228882, + "learning_rate": 1.2579785254485088e-05, + "loss": 1.03, + "step": 2366 + }, + { + "epoch": 0.77318198521906, + "grad_norm": 0.5090767741203308, + "learning_rate": 1.254520518555629e-05, + "loss": 1.1346, + "step": 2367 + }, + { + "epoch": 0.7735086358254053, + "grad_norm": 0.5246442556381226, + "learning_rate": 1.2510665890843298e-05, + "loss": 0.9972, + "step": 2368 + }, + { + "epoch": 0.7738352864317505, + "grad_norm": 0.5956628918647766, + "learning_rate": 1.2476167407946448e-05, + "loss": 1.188, + "step": 2369 + }, + { + "epoch": 0.7741619370380957, + "grad_norm": 0.6347416639328003, + "learning_rate": 1.2441709774421639e-05, + "loss": 1.2596, + "step": 2370 + }, + { + "epoch": 0.7744885876444408, + "grad_norm": 0.7028458714485168, + "learning_rate": 1.2407293027780304e-05, + "loss": 0.9761, + "step": 2371 + }, + { + "epoch": 0.774815238250786, + "grad_norm": 0.9047834277153015, + "learning_rate": 1.2372917205489376e-05, + "loss": 1.408, + "step": 2372 + }, + { + "epoch": 0.7751418888571312, + "grad_norm": 1.1155500411987305, + "learning_rate": 1.233858234497119e-05, + "loss": 1.2624, + "step": 2373 + }, + { + "epoch": 0.7754685394634764, + "grad_norm": 1.3037281036376953, + "learning_rate": 1.2304288483603565e-05, + "loss": 1.4873, + "step": 2374 + }, + { + "epoch": 0.7757951900698216, + "grad_norm": 1.3568501472473145, + "learning_rate": 1.227003565871962e-05, + "loss": 1.4208, + "step": 2375 + }, + { + "epoch": 0.7761218406761667, + "grad_norm": 0.21427606046199799, + "learning_rate": 1.2235823907607834e-05, + "loss": 0.7637, + "step": 2376 + }, + { + "epoch": 0.7764484912825119, + "grad_norm": 0.2435503751039505, + "learning_rate": 1.220165326751197e-05, + "loss": 0.7515, + "step": 2377 + }, + { + "epoch": 0.7767751418888571, + "grad_norm": 0.25543227791786194, + "learning_rate": 1.2167523775631029e-05, + "loss": 0.8145, + "step": 2378 + }, + { + "epoch": 0.7771017924952023, + "grad_norm": 0.2633327543735504, + "learning_rate": 1.2133435469119214e-05, + "loss": 0.8333, + "step": 2379 + }, + { + "epoch": 0.7774284431015475, + "grad_norm": 0.2795431613922119, + "learning_rate": 1.2099388385085908e-05, + "loss": 0.8558, + "step": 2380 + }, + { + "epoch": 0.7777550937078926, + "grad_norm": 0.28009021282196045, + "learning_rate": 1.20653825605956e-05, + "loss": 0.764, + "step": 2381 + }, + { + "epoch": 0.7780817443142379, + "grad_norm": 0.2952665686607361, + "learning_rate": 1.203141803266788e-05, + "loss": 0.7802, + "step": 2382 + }, + { + "epoch": 0.7784083949205831, + "grad_norm": 0.3152238130569458, + "learning_rate": 1.1997494838277368e-05, + "loss": 0.876, + "step": 2383 + }, + { + "epoch": 0.7787350455269283, + "grad_norm": 0.40310126543045044, + "learning_rate": 1.1963613014353692e-05, + "loss": 0.8793, + "step": 2384 + }, + { + "epoch": 0.7790616961332735, + "grad_norm": 0.35672134160995483, + "learning_rate": 1.1929772597781458e-05, + "loss": 0.9216, + "step": 2385 + }, + { + "epoch": 0.7793883467396187, + "grad_norm": 0.3469008207321167, + "learning_rate": 1.1895973625400136e-05, + "loss": 0.8372, + "step": 2386 + }, + { + "epoch": 0.7797149973459638, + "grad_norm": 0.4095902442932129, + "learning_rate": 1.1862216134004189e-05, + "loss": 0.9618, + "step": 2387 + }, + { + "epoch": 0.780041647952309, + "grad_norm": 0.4208645224571228, + "learning_rate": 1.1828500160342787e-05, + "loss": 0.886, + "step": 2388 + }, + { + "epoch": 0.7803682985586542, + "grad_norm": 0.4261435568332672, + "learning_rate": 1.1794825741120042e-05, + "loss": 1.0212, + "step": 2389 + }, + { + "epoch": 0.7806949491649994, + "grad_norm": 0.4427204430103302, + "learning_rate": 1.1761192912994706e-05, + "loss": 0.8466, + "step": 2390 + }, + { + "epoch": 0.7810215997713446, + "grad_norm": 0.5028002858161926, + "learning_rate": 1.1727601712580321e-05, + "loss": 1.0301, + "step": 2391 + }, + { + "epoch": 0.7813482503776897, + "grad_norm": 0.5116696953773499, + "learning_rate": 1.1694052176445114e-05, + "loss": 1.0023, + "step": 2392 + }, + { + "epoch": 0.7816749009840349, + "grad_norm": 0.5628235340118408, + "learning_rate": 1.166054434111194e-05, + "loss": 1.0638, + "step": 2393 + }, + { + "epoch": 0.7820015515903801, + "grad_norm": 0.628449022769928, + "learning_rate": 1.1627078243058214e-05, + "loss": 1.0702, + "step": 2394 + }, + { + "epoch": 0.7823282021967253, + "grad_norm": 0.7063364386558533, + "learning_rate": 1.159365391871602e-05, + "loss": 1.1558, + "step": 2395 + }, + { + "epoch": 0.7826548528030706, + "grad_norm": 0.847536027431488, + "learning_rate": 1.1560271404471845e-05, + "loss": 1.272, + "step": 2396 + }, + { + "epoch": 0.7829815034094157, + "grad_norm": 0.9608874320983887, + "learning_rate": 1.152693073666678e-05, + "loss": 1.2311, + "step": 2397 + }, + { + "epoch": 0.7833081540157609, + "grad_norm": 1.243880033493042, + "learning_rate": 1.149363195159625e-05, + "loss": 1.2427, + "step": 2398 + }, + { + "epoch": 0.7836348046221061, + "grad_norm": 1.3490464687347412, + "learning_rate": 1.1460375085510161e-05, + "loss": 1.6011, + "step": 2399 + }, + { + "epoch": 0.7839614552284513, + "grad_norm": 2.450794219970703, + "learning_rate": 1.142716017461275e-05, + "loss": 2.5833, + "step": 2400 + }, + { + "epoch": 0.7842881058347965, + "grad_norm": 0.20113646984100342, + "learning_rate": 1.1393987255062599e-05, + "loss": 0.7018, + "step": 2401 + }, + { + "epoch": 0.7846147564411416, + "grad_norm": 0.24163466691970825, + "learning_rate": 1.1360856362972555e-05, + "loss": 0.7365, + "step": 2402 + }, + { + "epoch": 0.7849414070474868, + "grad_norm": 0.2692144513130188, + "learning_rate": 1.1327767534409739e-05, + "loss": 0.8654, + "step": 2403 + }, + { + "epoch": 0.785268057653832, + "grad_norm": 0.271269291639328, + "learning_rate": 1.1294720805395464e-05, + "loss": 0.8093, + "step": 2404 + }, + { + "epoch": 0.7855947082601772, + "grad_norm": 0.2852034270763397, + "learning_rate": 1.126171621190522e-05, + "loss": 0.8286, + "step": 2405 + }, + { + "epoch": 0.7859213588665224, + "grad_norm": 0.2870404124259949, + "learning_rate": 1.122875378986863e-05, + "loss": 0.7744, + "step": 2406 + }, + { + "epoch": 0.7862480094728675, + "grad_norm": 0.3201829791069031, + "learning_rate": 1.1195833575169362e-05, + "loss": 0.8396, + "step": 2407 + }, + { + "epoch": 0.7865746600792127, + "grad_norm": 0.33272668719291687, + "learning_rate": 1.1162955603645236e-05, + "loss": 0.8156, + "step": 2408 + }, + { + "epoch": 0.7869013106855579, + "grad_norm": 0.34226053953170776, + "learning_rate": 1.1130119911087966e-05, + "loss": 0.8511, + "step": 2409 + }, + { + "epoch": 0.7872279612919032, + "grad_norm": 0.3556172847747803, + "learning_rate": 1.1097326533243352e-05, + "loss": 0.8328, + "step": 2410 + }, + { + "epoch": 0.7875546118982484, + "grad_norm": 0.3675041198730469, + "learning_rate": 1.1064575505811043e-05, + "loss": 0.9927, + "step": 2411 + }, + { + "epoch": 0.7878812625045936, + "grad_norm": 0.38928094506263733, + "learning_rate": 1.1031866864444618e-05, + "loss": 0.8938, + "step": 2412 + }, + { + "epoch": 0.7882079131109387, + "grad_norm": 0.3787207007408142, + "learning_rate": 1.0999200644751523e-05, + "loss": 0.8779, + "step": 2413 + }, + { + "epoch": 0.7885345637172839, + "grad_norm": 0.4117635190486908, + "learning_rate": 1.0966576882293012e-05, + "loss": 1.102, + "step": 2414 + }, + { + "epoch": 0.7888612143236291, + "grad_norm": 0.4420527517795563, + "learning_rate": 1.0933995612584114e-05, + "loss": 0.9014, + "step": 2415 + }, + { + "epoch": 0.7891878649299743, + "grad_norm": 0.4948444068431854, + "learning_rate": 1.0901456871093613e-05, + "loss": 1.0171, + "step": 2416 + }, + { + "epoch": 0.7895145155363195, + "grad_norm": 0.5133086442947388, + "learning_rate": 1.086896069324398e-05, + "loss": 0.9654, + "step": 2417 + }, + { + "epoch": 0.7898411661426646, + "grad_norm": 0.5456277132034302, + "learning_rate": 1.0836507114411359e-05, + "loss": 0.9479, + "step": 2418 + }, + { + "epoch": 0.7901678167490098, + "grad_norm": 0.6290357708930969, + "learning_rate": 1.0804096169925532e-05, + "loss": 1.1441, + "step": 2419 + }, + { + "epoch": 0.790494467355355, + "grad_norm": 0.7249221205711365, + "learning_rate": 1.0771727895069816e-05, + "loss": 1.2534, + "step": 2420 + }, + { + "epoch": 0.7908211179617002, + "grad_norm": 0.8690229654312134, + "learning_rate": 1.0739402325081172e-05, + "loss": 1.262, + "step": 2421 + }, + { + "epoch": 0.7911477685680454, + "grad_norm": 0.9221552014350891, + "learning_rate": 1.0707119495149958e-05, + "loss": 1.1823, + "step": 2422 + }, + { + "epoch": 0.7914744191743905, + "grad_norm": 1.201319694519043, + "learning_rate": 1.0674879440420117e-05, + "loss": 1.4645, + "step": 2423 + }, + { + "epoch": 0.7918010697807358, + "grad_norm": 1.345933198928833, + "learning_rate": 1.0642682195988917e-05, + "loss": 1.4429, + "step": 2424 + }, + { + "epoch": 0.792127720387081, + "grad_norm": 1.9995300769805908, + "learning_rate": 1.0610527796907104e-05, + "loss": 2.2084, + "step": 2425 + }, + { + "epoch": 0.7924543709934262, + "grad_norm": 0.2166060358285904, + "learning_rate": 1.057841627817875e-05, + "loss": 0.6935, + "step": 2426 + }, + { + "epoch": 0.7927810215997714, + "grad_norm": 0.24457815289497375, + "learning_rate": 1.0546347674761236e-05, + "loss": 0.7934, + "step": 2427 + }, + { + "epoch": 0.7931076722061166, + "grad_norm": 0.25961560010910034, + "learning_rate": 1.0514322021565248e-05, + "loss": 0.7806, + "step": 2428 + }, + { + "epoch": 0.7934343228124617, + "grad_norm": 0.2620426416397095, + "learning_rate": 1.0482339353454717e-05, + "loss": 0.8303, + "step": 2429 + }, + { + "epoch": 0.7937609734188069, + "grad_norm": 0.2988450229167938, + "learning_rate": 1.0450399705246721e-05, + "loss": 0.8409, + "step": 2430 + }, + { + "epoch": 0.7940876240251521, + "grad_norm": 0.27858319878578186, + "learning_rate": 1.0418503111711591e-05, + "loss": 0.7863, + "step": 2431 + }, + { + "epoch": 0.7944142746314973, + "grad_norm": 0.3067804276943207, + "learning_rate": 1.0386649607572758e-05, + "loss": 0.9039, + "step": 2432 + }, + { + "epoch": 0.7947409252378425, + "grad_norm": 0.31061863899230957, + "learning_rate": 1.035483922750668e-05, + "loss": 0.8666, + "step": 2433 + }, + { + "epoch": 0.7950675758441876, + "grad_norm": 0.324632853269577, + "learning_rate": 1.032307200614298e-05, + "loss": 0.9376, + "step": 2434 + }, + { + "epoch": 0.7953942264505328, + "grad_norm": 0.31783485412597656, + "learning_rate": 1.0291347978064197e-05, + "loss": 0.8443, + "step": 2435 + }, + { + "epoch": 0.795720877056878, + "grad_norm": 0.37628984451293945, + "learning_rate": 1.0259667177805937e-05, + "loss": 0.8687, + "step": 2436 + }, + { + "epoch": 0.7960475276632232, + "grad_norm": 0.35589486360549927, + "learning_rate": 1.0228029639856663e-05, + "loss": 0.8424, + "step": 2437 + }, + { + "epoch": 0.7963741782695685, + "grad_norm": 0.4087234437465668, + "learning_rate": 1.0196435398657795e-05, + "loss": 0.8694, + "step": 2438 + }, + { + "epoch": 0.7967008288759136, + "grad_norm": 0.4563935101032257, + "learning_rate": 1.016488448860361e-05, + "loss": 1.0653, + "step": 2439 + }, + { + "epoch": 0.7970274794822588, + "grad_norm": 0.4497915804386139, + "learning_rate": 1.0133376944041201e-05, + "loss": 0.9739, + "step": 2440 + }, + { + "epoch": 0.797354130088604, + "grad_norm": 0.5139485597610474, + "learning_rate": 1.0101912799270464e-05, + "loss": 1.0967, + "step": 2441 + }, + { + "epoch": 0.7976807806949492, + "grad_norm": 0.5172460079193115, + "learning_rate": 1.0070492088544058e-05, + "loss": 0.9596, + "step": 2442 + }, + { + "epoch": 0.7980074313012944, + "grad_norm": 0.5660231709480286, + "learning_rate": 1.003911484606731e-05, + "loss": 1.029, + "step": 2443 + }, + { + "epoch": 0.7983340819076395, + "grad_norm": 0.6900216341018677, + "learning_rate": 1.000778110599831e-05, + "loss": 1.3027, + "step": 2444 + }, + { + "epoch": 0.7986607325139847, + "grad_norm": 0.8324858546257019, + "learning_rate": 9.976490902447706e-06, + "loss": 1.3106, + "step": 2445 + }, + { + "epoch": 0.7989873831203299, + "grad_norm": 0.9467867016792297, + "learning_rate": 9.945244269478793e-06, + "loss": 1.2018, + "step": 2446 + }, + { + "epoch": 0.7993140337266751, + "grad_norm": 1.3282982110977173, + "learning_rate": 9.914041241107441e-06, + "loss": 1.225, + "step": 2447 + }, + { + "epoch": 0.7996406843330203, + "grad_norm": 1.7920281887054443, + "learning_rate": 9.882881851302023e-06, + "loss": 1.9809, + "step": 2448 + }, + { + "epoch": 0.7999673349393654, + "grad_norm": 1.387970209121704, + "learning_rate": 9.851766133983436e-06, + "loss": 1.6633, + "step": 2449 + }, + { + "epoch": 0.8002939855457106, + "grad_norm": 1.9271981716156006, + "learning_rate": 9.82069412302501e-06, + "loss": 1.7488, + "step": 2450 + }, + { + "epoch": 0.8006206361520558, + "grad_norm": 0.21858730912208557, + "learning_rate": 9.789665852252505e-06, + "loss": 0.6825, + "step": 2451 + }, + { + "epoch": 0.8009472867584011, + "grad_norm": 0.2585441470146179, + "learning_rate": 9.758681355444071e-06, + "loss": 0.8612, + "step": 2452 + }, + { + "epoch": 0.8012739373647463, + "grad_norm": 0.2628011405467987, + "learning_rate": 9.727740666330187e-06, + "loss": 0.8562, + "step": 2453 + }, + { + "epoch": 0.8016005879710915, + "grad_norm": 0.28666362166404724, + "learning_rate": 9.696843818593665e-06, + "loss": 0.8556, + "step": 2454 + }, + { + "epoch": 0.8019272385774366, + "grad_norm": 0.284396767616272, + "learning_rate": 9.665990845869578e-06, + "loss": 0.817, + "step": 2455 + }, + { + "epoch": 0.8022538891837818, + "grad_norm": 0.2911209464073181, + "learning_rate": 9.635181781745201e-06, + "loss": 0.8772, + "step": 2456 + }, + { + "epoch": 0.802580539790127, + "grad_norm": 0.31056690216064453, + "learning_rate": 9.604416659760091e-06, + "loss": 0.7981, + "step": 2457 + }, + { + "epoch": 0.8029071903964722, + "grad_norm": 0.3305649161338806, + "learning_rate": 9.573695513405894e-06, + "loss": 0.8759, + "step": 2458 + }, + { + "epoch": 0.8032338410028174, + "grad_norm": 0.344088077545166, + "learning_rate": 9.543018376126406e-06, + "loss": 0.8886, + "step": 2459 + }, + { + "epoch": 0.8035604916091625, + "grad_norm": 0.36468231678009033, + "learning_rate": 9.512385281317532e-06, + "loss": 0.9108, + "step": 2460 + }, + { + "epoch": 0.8038871422155077, + "grad_norm": 0.36691391468048096, + "learning_rate": 9.481796262327209e-06, + "loss": 0.8152, + "step": 2461 + }, + { + "epoch": 0.8042137928218529, + "grad_norm": 0.3990345299243927, + "learning_rate": 9.451251352455398e-06, + "loss": 0.9779, + "step": 2462 + }, + { + "epoch": 0.8045404434281981, + "grad_norm": 0.45102572441101074, + "learning_rate": 9.420750584954046e-06, + "loss": 0.9852, + "step": 2463 + }, + { + "epoch": 0.8048670940345433, + "grad_norm": 0.4353548288345337, + "learning_rate": 9.390293993027044e-06, + "loss": 0.9961, + "step": 2464 + }, + { + "epoch": 0.8051937446408884, + "grad_norm": 0.46060293912887573, + "learning_rate": 9.35988160983019e-06, + "loss": 0.9627, + "step": 2465 + }, + { + "epoch": 0.8055203952472336, + "grad_norm": 0.49444007873535156, + "learning_rate": 9.329513468471151e-06, + "loss": 1.0372, + "step": 2466 + }, + { + "epoch": 0.8058470458535789, + "grad_norm": 0.520687460899353, + "learning_rate": 9.299189602009445e-06, + "loss": 0.9868, + "step": 2467 + }, + { + "epoch": 0.8061736964599241, + "grad_norm": 0.5475303530693054, + "learning_rate": 9.268910043456391e-06, + "loss": 1.007, + "step": 2468 + }, + { + "epoch": 0.8065003470662693, + "grad_norm": 0.6077768802642822, + "learning_rate": 9.238674825775035e-06, + "loss": 1.2054, + "step": 2469 + }, + { + "epoch": 0.8068269976726145, + "grad_norm": 0.6752180457115173, + "learning_rate": 9.208483981880217e-06, + "loss": 1.1677, + "step": 2470 + }, + { + "epoch": 0.8071536482789596, + "grad_norm": 0.7697159647941589, + "learning_rate": 9.178337544638398e-06, + "loss": 1.2908, + "step": 2471 + }, + { + "epoch": 0.8074802988853048, + "grad_norm": 0.9111776351928711, + "learning_rate": 9.148235546867783e-06, + "loss": 1.142, + "step": 2472 + }, + { + "epoch": 0.80780694949165, + "grad_norm": 1.0891627073287964, + "learning_rate": 9.118178021338114e-06, + "loss": 1.2352, + "step": 2473 + }, + { + "epoch": 0.8081336000979952, + "grad_norm": 1.2946839332580566, + "learning_rate": 9.088165000770766e-06, + "loss": 1.7181, + "step": 2474 + }, + { + "epoch": 0.8084602507043404, + "grad_norm": 1.7693047523498535, + "learning_rate": 9.058196517838657e-06, + "loss": 1.4192, + "step": 2475 + }, + { + "epoch": 0.8087869013106855, + "grad_norm": 0.21156850457191467, + "learning_rate": 9.028272605166238e-06, + "loss": 0.7241, + "step": 2476 + }, + { + "epoch": 0.8091135519170307, + "grad_norm": 0.2328900843858719, + "learning_rate": 8.998393295329371e-06, + "loss": 0.8189, + "step": 2477 + }, + { + "epoch": 0.8094402025233759, + "grad_norm": 0.25472328066825867, + "learning_rate": 8.968558620855466e-06, + "loss": 0.7547, + "step": 2478 + }, + { + "epoch": 0.8097668531297211, + "grad_norm": 0.28889092803001404, + "learning_rate": 8.938768614223237e-06, + "loss": 0.8085, + "step": 2479 + }, + { + "epoch": 0.8100935037360663, + "grad_norm": 0.2841634750366211, + "learning_rate": 8.90902330786288e-06, + "loss": 0.7787, + "step": 2480 + }, + { + "epoch": 0.8104201543424115, + "grad_norm": 0.2820068597793579, + "learning_rate": 8.87932273415582e-06, + "loss": 0.8728, + "step": 2481 + }, + { + "epoch": 0.8107468049487567, + "grad_norm": 0.2842544913291931, + "learning_rate": 8.849666925434857e-06, + "loss": 0.7728, + "step": 2482 + }, + { + "epoch": 0.8110734555551019, + "grad_norm": 0.30535271763801575, + "learning_rate": 8.820055913984033e-06, + "loss": 0.8933, + "step": 2483 + }, + { + "epoch": 0.8114001061614471, + "grad_norm": 0.3458235561847687, + "learning_rate": 8.790489732038638e-06, + "loss": 0.8484, + "step": 2484 + }, + { + "epoch": 0.8117267567677923, + "grad_norm": 0.3394298553466797, + "learning_rate": 8.76096841178513e-06, + "loss": 0.9129, + "step": 2485 + }, + { + "epoch": 0.8120534073741374, + "grad_norm": 0.3635712265968323, + "learning_rate": 8.731491985361167e-06, + "loss": 0.9531, + "step": 2486 + }, + { + "epoch": 0.8123800579804826, + "grad_norm": 0.38414081931114197, + "learning_rate": 8.702060484855506e-06, + "loss": 0.8569, + "step": 2487 + }, + { + "epoch": 0.8127067085868278, + "grad_norm": 0.3860231041908264, + "learning_rate": 8.672673942308008e-06, + "loss": 0.8907, + "step": 2488 + }, + { + "epoch": 0.813033359193173, + "grad_norm": 0.42519089579582214, + "learning_rate": 8.643332389709601e-06, + "loss": 1.015, + "step": 2489 + }, + { + "epoch": 0.8133600097995182, + "grad_norm": 0.4451555013656616, + "learning_rate": 8.61403585900219e-06, + "loss": 0.9342, + "step": 2490 + }, + { + "epoch": 0.8136866604058633, + "grad_norm": 0.4713704586029053, + "learning_rate": 8.584784382078748e-06, + "loss": 0.9179, + "step": 2491 + }, + { + "epoch": 0.8140133110122085, + "grad_norm": 0.4954456388950348, + "learning_rate": 8.555577990783109e-06, + "loss": 1.0105, + "step": 2492 + }, + { + "epoch": 0.8143399616185537, + "grad_norm": 0.5571999549865723, + "learning_rate": 8.526416716910129e-06, + "loss": 1.0412, + "step": 2493 + }, + { + "epoch": 0.8146666122248989, + "grad_norm": 0.5991007685661316, + "learning_rate": 8.497300592205442e-06, + "loss": 1.0632, + "step": 2494 + }, + { + "epoch": 0.8149932628312442, + "grad_norm": 0.6661972999572754, + "learning_rate": 8.468229648365605e-06, + "loss": 1.0149, + "step": 2495 + }, + { + "epoch": 0.8153199134375894, + "grad_norm": 0.7918639183044434, + "learning_rate": 8.439203917037962e-06, + "loss": 1.289, + "step": 2496 + }, + { + "epoch": 0.8156465640439345, + "grad_norm": 1.0117945671081543, + "learning_rate": 8.41022342982064e-06, + "loss": 1.0847, + "step": 2497 + }, + { + "epoch": 0.8159732146502797, + "grad_norm": 1.2291086912155151, + "learning_rate": 8.381288218262517e-06, + "loss": 1.6722, + "step": 2498 + }, + { + "epoch": 0.8162998652566249, + "grad_norm": 1.2318812608718872, + "learning_rate": 8.352398313863174e-06, + "loss": 1.3529, + "step": 2499 + }, + { + "epoch": 0.8166265158629701, + "grad_norm": 1.7156823873519897, + "learning_rate": 8.323553748072883e-06, + "loss": 1.3938, + "step": 2500 + }, + { + "epoch": 0.8169531664693153, + "grad_norm": 0.2098822295665741, + "learning_rate": 8.29475455229255e-06, + "loss": 0.6383, + "step": 2501 + }, + { + "epoch": 0.8172798170756604, + "grad_norm": 0.24294321238994598, + "learning_rate": 8.266000757873699e-06, + "loss": 0.756, + "step": 2502 + }, + { + "epoch": 0.8176064676820056, + "grad_norm": 0.25103679299354553, + "learning_rate": 8.23729239611839e-06, + "loss": 0.7447, + "step": 2503 + }, + { + "epoch": 0.8179331182883508, + "grad_norm": 0.2764917016029358, + "learning_rate": 8.208629498279302e-06, + "loss": 0.8186, + "step": 2504 + }, + { + "epoch": 0.818259768894696, + "grad_norm": 0.281490296125412, + "learning_rate": 8.180012095559525e-06, + "loss": 0.8326, + "step": 2505 + }, + { + "epoch": 0.8185864195010412, + "grad_norm": 0.2983465790748596, + "learning_rate": 8.151440219112721e-06, + "loss": 0.8574, + "step": 2506 + }, + { + "epoch": 0.8189130701073863, + "grad_norm": 0.3088802397251129, + "learning_rate": 8.122913900042905e-06, + "loss": 0.8248, + "step": 2507 + }, + { + "epoch": 0.8192397207137315, + "grad_norm": 0.3155342638492584, + "learning_rate": 8.094433169404547e-06, + "loss": 0.8865, + "step": 2508 + }, + { + "epoch": 0.8195663713200768, + "grad_norm": 0.31594693660736084, + "learning_rate": 8.065998058202468e-06, + "loss": 0.8237, + "step": 2509 + }, + { + "epoch": 0.819893021926422, + "grad_norm": 0.3399094343185425, + "learning_rate": 8.037608597391844e-06, + "loss": 0.9411, + "step": 2510 + }, + { + "epoch": 0.8202196725327672, + "grad_norm": 0.3390636742115021, + "learning_rate": 8.009264817878132e-06, + "loss": 0.7855, + "step": 2511 + }, + { + "epoch": 0.8205463231391124, + "grad_norm": 0.36510950326919556, + "learning_rate": 7.980966750517094e-06, + "loss": 0.8056, + "step": 2512 + }, + { + "epoch": 0.8208729737454575, + "grad_norm": 0.3949703574180603, + "learning_rate": 7.952714426114665e-06, + "loss": 0.8022, + "step": 2513 + }, + { + "epoch": 0.8211996243518027, + "grad_norm": 0.44621437788009644, + "learning_rate": 7.924507875427068e-06, + "loss": 0.9611, + "step": 2514 + }, + { + "epoch": 0.8215262749581479, + "grad_norm": 0.4569244384765625, + "learning_rate": 7.896347129160625e-06, + "loss": 0.9466, + "step": 2515 + }, + { + "epoch": 0.8218529255644931, + "grad_norm": 0.5023897290229797, + "learning_rate": 7.868232217971821e-06, + "loss": 1.044, + "step": 2516 + }, + { + "epoch": 0.8221795761708383, + "grad_norm": 0.5417917966842651, + "learning_rate": 7.840163172467257e-06, + "loss": 1.0372, + "step": 2517 + }, + { + "epoch": 0.8225062267771834, + "grad_norm": 0.5474960207939148, + "learning_rate": 7.812140023203579e-06, + "loss": 1.061, + "step": 2518 + }, + { + "epoch": 0.8228328773835286, + "grad_norm": 0.6427183747291565, + "learning_rate": 7.784162800687484e-06, + "loss": 1.0506, + "step": 2519 + }, + { + "epoch": 0.8231595279898738, + "grad_norm": 0.6729391813278198, + "learning_rate": 7.756231535375674e-06, + "loss": 1.2564, + "step": 2520 + }, + { + "epoch": 0.823486178596219, + "grad_norm": 0.8698153495788574, + "learning_rate": 7.728346257674801e-06, + "loss": 1.2879, + "step": 2521 + }, + { + "epoch": 0.8238128292025642, + "grad_norm": 1.0003093481063843, + "learning_rate": 7.70050699794148e-06, + "loss": 1.3512, + "step": 2522 + }, + { + "epoch": 0.8241394798089094, + "grad_norm": 1.1123592853546143, + "learning_rate": 7.672713786482217e-06, + "loss": 1.3098, + "step": 2523 + }, + { + "epoch": 0.8244661304152546, + "grad_norm": 1.284902572631836, + "learning_rate": 7.644966653553388e-06, + "loss": 1.37, + "step": 2524 + }, + { + "epoch": 0.8247927810215998, + "grad_norm": 1.9688411951065063, + "learning_rate": 7.617265629361209e-06, + "loss": 1.9536, + "step": 2525 + }, + { + "epoch": 0.825119431627945, + "grad_norm": 0.21682727336883545, + "learning_rate": 7.589610744061681e-06, + "loss": 0.7734, + "step": 2526 + }, + { + "epoch": 0.8254460822342902, + "grad_norm": 0.24609944224357605, + "learning_rate": 7.562002027760634e-06, + "loss": 0.7812, + "step": 2527 + }, + { + "epoch": 0.8257727328406353, + "grad_norm": 0.2850510776042938, + "learning_rate": 7.5344395105135635e-06, + "loss": 0.861, + "step": 2528 + }, + { + "epoch": 0.8260993834469805, + "grad_norm": 0.2798521816730499, + "learning_rate": 7.506923222325724e-06, + "loss": 0.8429, + "step": 2529 + }, + { + "epoch": 0.8264260340533257, + "grad_norm": 0.28509312868118286, + "learning_rate": 7.479453193152025e-06, + "loss": 0.8388, + "step": 2530 + }, + { + "epoch": 0.8267526846596709, + "grad_norm": 0.29365110397338867, + "learning_rate": 7.452029452897019e-06, + "loss": 0.8429, + "step": 2531 + }, + { + "epoch": 0.8270793352660161, + "grad_norm": 0.2949029207229614, + "learning_rate": 7.424652031414864e-06, + "loss": 0.7964, + "step": 2532 + }, + { + "epoch": 0.8274059858723612, + "grad_norm": 0.31028300523757935, + "learning_rate": 7.397320958509308e-06, + "loss": 0.8371, + "step": 2533 + }, + { + "epoch": 0.8277326364787064, + "grad_norm": 0.3249179720878601, + "learning_rate": 7.370036263933621e-06, + "loss": 0.9011, + "step": 2534 + }, + { + "epoch": 0.8280592870850516, + "grad_norm": 0.32227975130081177, + "learning_rate": 7.342797977390603e-06, + "loss": 0.8263, + "step": 2535 + }, + { + "epoch": 0.8283859376913968, + "grad_norm": 0.34464603662490845, + "learning_rate": 7.315606128532526e-06, + "loss": 0.885, + "step": 2536 + }, + { + "epoch": 0.8287125882977421, + "grad_norm": 0.3631193935871124, + "learning_rate": 7.2884607469611066e-06, + "loss": 0.9489, + "step": 2537 + }, + { + "epoch": 0.8290392389040873, + "grad_norm": 0.3989101052284241, + "learning_rate": 7.261361862227495e-06, + "loss": 0.854, + "step": 2538 + }, + { + "epoch": 0.8293658895104324, + "grad_norm": 0.40562012791633606, + "learning_rate": 7.234309503832165e-06, + "loss": 0.9495, + "step": 2539 + }, + { + "epoch": 0.8296925401167776, + "grad_norm": 0.44497793912887573, + "learning_rate": 7.207303701225037e-06, + "loss": 0.9858, + "step": 2540 + }, + { + "epoch": 0.8300191907231228, + "grad_norm": 0.4510423243045807, + "learning_rate": 7.180344483805257e-06, + "loss": 1.0368, + "step": 2541 + }, + { + "epoch": 0.830345841329468, + "grad_norm": 0.5080751776695251, + "learning_rate": 7.153431880921302e-06, + "loss": 0.9558, + "step": 2542 + }, + { + "epoch": 0.8306724919358132, + "grad_norm": 0.5267191529273987, + "learning_rate": 7.126565921870909e-06, + "loss": 1.0393, + "step": 2543 + }, + { + "epoch": 0.8309991425421583, + "grad_norm": 0.6253758668899536, + "learning_rate": 7.099746635901017e-06, + "loss": 0.9967, + "step": 2544 + }, + { + "epoch": 0.8313257931485035, + "grad_norm": 0.6861200332641602, + "learning_rate": 7.072974052207765e-06, + "loss": 1.1487, + "step": 2545 + }, + { + "epoch": 0.8316524437548487, + "grad_norm": 0.842881977558136, + "learning_rate": 7.0462481999364614e-06, + "loss": 1.1706, + "step": 2546 + }, + { + "epoch": 0.8319790943611939, + "grad_norm": 0.9228523373603821, + "learning_rate": 7.019569108181495e-06, + "loss": 1.2162, + "step": 2547 + }, + { + "epoch": 0.8323057449675391, + "grad_norm": 1.1753313541412354, + "learning_rate": 6.992936805986433e-06, + "loss": 1.362, + "step": 2548 + }, + { + "epoch": 0.8326323955738842, + "grad_norm": 1.3156818151474, + "learning_rate": 6.966351322343806e-06, + "loss": 1.5403, + "step": 2549 + }, + { + "epoch": 0.8329590461802294, + "grad_norm": 1.4795371294021606, + "learning_rate": 6.939812686195285e-06, + "loss": 1.6863, + "step": 2550 + }, + { + "epoch": 0.8332856967865747, + "grad_norm": 0.21376070380210876, + "learning_rate": 6.913320926431443e-06, + "loss": 0.7305, + "step": 2551 + }, + { + "epoch": 0.8336123473929199, + "grad_norm": 0.2572622299194336, + "learning_rate": 6.886876071891879e-06, + "loss": 0.8207, + "step": 2552 + }, + { + "epoch": 0.8339389979992651, + "grad_norm": 0.26194846630096436, + "learning_rate": 6.8604781513651185e-06, + "loss": 0.7989, + "step": 2553 + }, + { + "epoch": 0.8342656486056103, + "grad_norm": 0.28000107407569885, + "learning_rate": 6.834127193588586e-06, + "loss": 0.8771, + "step": 2554 + }, + { + "epoch": 0.8345922992119554, + "grad_norm": 0.2950563430786133, + "learning_rate": 6.807823227248583e-06, + "loss": 0.8277, + "step": 2555 + }, + { + "epoch": 0.8349189498183006, + "grad_norm": 0.2996322214603424, + "learning_rate": 6.781566280980267e-06, + "loss": 0.8208, + "step": 2556 + }, + { + "epoch": 0.8352456004246458, + "grad_norm": 0.3321259915828705, + "learning_rate": 6.755356383367589e-06, + "loss": 0.8929, + "step": 2557 + }, + { + "epoch": 0.835572251030991, + "grad_norm": 0.3215591013431549, + "learning_rate": 6.7291935629432974e-06, + "loss": 0.846, + "step": 2558 + }, + { + "epoch": 0.8358989016373362, + "grad_norm": 0.34042298793792725, + "learning_rate": 6.703077848188899e-06, + "loss": 0.8229, + "step": 2559 + }, + { + "epoch": 0.8362255522436813, + "grad_norm": 0.367233544588089, + "learning_rate": 6.677009267534567e-06, + "loss": 0.9275, + "step": 2560 + }, + { + "epoch": 0.8365522028500265, + "grad_norm": 0.3613315522670746, + "learning_rate": 6.650987849359253e-06, + "loss": 0.8224, + "step": 2561 + }, + { + "epoch": 0.8368788534563717, + "grad_norm": 0.38285353779792786, + "learning_rate": 6.625013621990472e-06, + "loss": 0.8436, + "step": 2562 + }, + { + "epoch": 0.8372055040627169, + "grad_norm": 0.41062045097351074, + "learning_rate": 6.599086613704447e-06, + "loss": 0.9488, + "step": 2563 + }, + { + "epoch": 0.8375321546690621, + "grad_norm": 0.443733811378479, + "learning_rate": 6.5732068527259325e-06, + "loss": 0.94, + "step": 2564 + }, + { + "epoch": 0.8378588052754073, + "grad_norm": 0.4647623300552368, + "learning_rate": 6.547374367228287e-06, + "loss": 0.9088, + "step": 2565 + }, + { + "epoch": 0.8381854558817525, + "grad_norm": 0.5017650127410889, + "learning_rate": 6.5215891853333934e-06, + "loss": 1.1312, + "step": 2566 + }, + { + "epoch": 0.8385121064880977, + "grad_norm": 0.5342419743537903, + "learning_rate": 6.495851335111636e-06, + "loss": 1.0748, + "step": 2567 + }, + { + "epoch": 0.8388387570944429, + "grad_norm": 0.5602555871009827, + "learning_rate": 6.47016084458188e-06, + "loss": 1.0645, + "step": 2568 + }, + { + "epoch": 0.8391654077007881, + "grad_norm": 0.6325632929801941, + "learning_rate": 6.44451774171142e-06, + "loss": 1.0813, + "step": 2569 + }, + { + "epoch": 0.8394920583071332, + "grad_norm": 0.643717885017395, + "learning_rate": 6.418922054415982e-06, + "loss": 1.277, + "step": 2570 + }, + { + "epoch": 0.8398187089134784, + "grad_norm": 0.9045091271400452, + "learning_rate": 6.393373810559655e-06, + "loss": 1.2554, + "step": 2571 + }, + { + "epoch": 0.8401453595198236, + "grad_norm": 0.998929500579834, + "learning_rate": 6.367873037954908e-06, + "loss": 1.3071, + "step": 2572 + }, + { + "epoch": 0.8404720101261688, + "grad_norm": 1.309383511543274, + "learning_rate": 6.342419764362473e-06, + "loss": 1.4713, + "step": 2573 + }, + { + "epoch": 0.840798660732514, + "grad_norm": 1.271360993385315, + "learning_rate": 6.317014017491463e-06, + "loss": 1.7116, + "step": 2574 + }, + { + "epoch": 0.8411253113388591, + "grad_norm": 1.6010202169418335, + "learning_rate": 6.291655824999154e-06, + "loss": 1.7152, + "step": 2575 + }, + { + "epoch": 0.8414519619452043, + "grad_norm": 0.22518090903759003, + "learning_rate": 6.266345214491148e-06, + "loss": 0.761, + "step": 2576 + }, + { + "epoch": 0.8417786125515495, + "grad_norm": 0.23476476967334747, + "learning_rate": 6.241082213521166e-06, + "loss": 0.6972, + "step": 2577 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.2612394392490387, + "learning_rate": 6.2158668495911455e-06, + "loss": 0.7726, + "step": 2578 + }, + { + "epoch": 0.8424319137642399, + "grad_norm": 0.2693740129470825, + "learning_rate": 6.190699150151158e-06, + "loss": 0.7799, + "step": 2579 + }, + { + "epoch": 0.8427585643705852, + "grad_norm": 0.3026430606842041, + "learning_rate": 6.165579142599387e-06, + "loss": 0.8576, + "step": 2580 + }, + { + "epoch": 0.8430852149769303, + "grad_norm": 0.29252561926841736, + "learning_rate": 6.140506854282085e-06, + "loss": 0.8993, + "step": 2581 + }, + { + "epoch": 0.8434118655832755, + "grad_norm": 0.3165540397167206, + "learning_rate": 6.115482312493581e-06, + "loss": 0.8733, + "step": 2582 + }, + { + "epoch": 0.8437385161896207, + "grad_norm": 0.33594945073127747, + "learning_rate": 6.090505544476183e-06, + "loss": 0.8523, + "step": 2583 + }, + { + "epoch": 0.8440651667959659, + "grad_norm": 0.3217700719833374, + "learning_rate": 6.065576577420245e-06, + "loss": 0.8438, + "step": 2584 + }, + { + "epoch": 0.8443918174023111, + "grad_norm": 0.3341309130191803, + "learning_rate": 6.040695438464045e-06, + "loss": 0.8125, + "step": 2585 + }, + { + "epoch": 0.8447184680086562, + "grad_norm": 0.34477731585502625, + "learning_rate": 6.015862154693791e-06, + "loss": 0.9042, + "step": 2586 + }, + { + "epoch": 0.8450451186150014, + "grad_norm": 0.3631967306137085, + "learning_rate": 5.991076753143648e-06, + "loss": 0.8595, + "step": 2587 + }, + { + "epoch": 0.8453717692213466, + "grad_norm": 0.3938636779785156, + "learning_rate": 5.966339260795578e-06, + "loss": 0.9562, + "step": 2588 + }, + { + "epoch": 0.8456984198276918, + "grad_norm": 0.41510850191116333, + "learning_rate": 5.94164970457946e-06, + "loss": 0.8708, + "step": 2589 + }, + { + "epoch": 0.846025070434037, + "grad_norm": 0.45071718096733093, + "learning_rate": 5.917008111372929e-06, + "loss": 0.9527, + "step": 2590 + }, + { + "epoch": 0.8463517210403821, + "grad_norm": 0.48741504549980164, + "learning_rate": 5.892414508001443e-06, + "loss": 0.9604, + "step": 2591 + }, + { + "epoch": 0.8466783716467273, + "grad_norm": 0.4795851409435272, + "learning_rate": 5.867868921238212e-06, + "loss": 0.9465, + "step": 2592 + }, + { + "epoch": 0.8470050222530725, + "grad_norm": 0.5671098828315735, + "learning_rate": 5.8433713778041575e-06, + "loss": 1.069, + "step": 2593 + }, + { + "epoch": 0.8473316728594178, + "grad_norm": 0.5778964757919312, + "learning_rate": 5.818921904367919e-06, + "loss": 1.0361, + "step": 2594 + }, + { + "epoch": 0.847658323465763, + "grad_norm": 0.702641487121582, + "learning_rate": 5.794520527545805e-06, + "loss": 1.1656, + "step": 2595 + }, + { + "epoch": 0.8479849740721082, + "grad_norm": 0.7841630578041077, + "learning_rate": 5.770167273901728e-06, + "loss": 1.2315, + "step": 2596 + }, + { + "epoch": 0.8483116246784533, + "grad_norm": 1.0163825750350952, + "learning_rate": 5.745862169947286e-06, + "loss": 1.1117, + "step": 2597 + }, + { + "epoch": 0.8486382752847985, + "grad_norm": 1.122326135635376, + "learning_rate": 5.721605242141575e-06, + "loss": 1.3777, + "step": 2598 + }, + { + "epoch": 0.8489649258911437, + "grad_norm": 1.3500094413757324, + "learning_rate": 5.697396516891313e-06, + "loss": 1.7499, + "step": 2599 + }, + { + "epoch": 0.8492915764974889, + "grad_norm": 1.8633354902267456, + "learning_rate": 5.673236020550704e-06, + "loss": 1.8196, + "step": 2600 + }, + { + "epoch": 0.8496182271038341, + "grad_norm": 0.20166555047035217, + "learning_rate": 5.6491237794214665e-06, + "loss": 0.6405, + "step": 2601 + }, + { + "epoch": 0.8499448777101792, + "grad_norm": 0.24433058500289917, + "learning_rate": 5.625059819752782e-06, + "loss": 0.8018, + "step": 2602 + }, + { + "epoch": 0.8502715283165244, + "grad_norm": 0.2631356120109558, + "learning_rate": 5.601044167741271e-06, + "loss": 0.8205, + "step": 2603 + }, + { + "epoch": 0.8505981789228696, + "grad_norm": 0.2602408230304718, + "learning_rate": 5.577076849530971e-06, + "loss": 0.8579, + "step": 2604 + }, + { + "epoch": 0.8509248295292148, + "grad_norm": 0.27190256118774414, + "learning_rate": 5.55315789121329e-06, + "loss": 0.821, + "step": 2605 + }, + { + "epoch": 0.85125148013556, + "grad_norm": 0.30052924156188965, + "learning_rate": 5.529287318826998e-06, + "loss": 0.8228, + "step": 2606 + }, + { + "epoch": 0.8515781307419051, + "grad_norm": 0.3002735674381256, + "learning_rate": 5.505465158358198e-06, + "loss": 0.754, + "step": 2607 + }, + { + "epoch": 0.8519047813482504, + "grad_norm": 0.3288699686527252, + "learning_rate": 5.481691435740283e-06, + "loss": 0.931, + "step": 2608 + }, + { + "epoch": 0.8522314319545956, + "grad_norm": 0.3346864581108093, + "learning_rate": 5.457966176853896e-06, + "loss": 0.9369, + "step": 2609 + }, + { + "epoch": 0.8525580825609408, + "grad_norm": 0.34095606207847595, + "learning_rate": 5.434289407526971e-06, + "loss": 0.8721, + "step": 2610 + }, + { + "epoch": 0.852884733167286, + "grad_norm": 0.36173370480537415, + "learning_rate": 5.410661153534602e-06, + "loss": 0.8597, + "step": 2611 + }, + { + "epoch": 0.8532113837736311, + "grad_norm": 0.38429251313209534, + "learning_rate": 5.3870814405990955e-06, + "loss": 0.955, + "step": 2612 + }, + { + "epoch": 0.8535380343799763, + "grad_norm": 0.4128096401691437, + "learning_rate": 5.363550294389907e-06, + "loss": 0.8608, + "step": 2613 + }, + { + "epoch": 0.8538646849863215, + "grad_norm": 0.4228519797325134, + "learning_rate": 5.340067740523635e-06, + "loss": 0.8909, + "step": 2614 + }, + { + "epoch": 0.8541913355926667, + "grad_norm": 0.4561985433101654, + "learning_rate": 5.316633804563958e-06, + "loss": 0.9841, + "step": 2615 + }, + { + "epoch": 0.8545179861990119, + "grad_norm": 0.5019810795783997, + "learning_rate": 5.293248512021648e-06, + "loss": 0.9614, + "step": 2616 + }, + { + "epoch": 0.854844636805357, + "grad_norm": 0.5194594264030457, + "learning_rate": 5.2699118883545e-06, + "loss": 0.9494, + "step": 2617 + }, + { + "epoch": 0.8551712874117022, + "grad_norm": 0.5753628015518188, + "learning_rate": 5.24662395896735e-06, + "loss": 1.0706, + "step": 2618 + }, + { + "epoch": 0.8554979380180474, + "grad_norm": 0.6136693358421326, + "learning_rate": 5.223384749212013e-06, + "loss": 1.0661, + "step": 2619 + }, + { + "epoch": 0.8558245886243926, + "grad_norm": 0.6912145018577576, + "learning_rate": 5.200194284387261e-06, + "loss": 1.1086, + "step": 2620 + }, + { + "epoch": 0.8561512392307378, + "grad_norm": 0.8217900395393372, + "learning_rate": 5.1770525897388234e-06, + "loss": 1.1226, + "step": 2621 + }, + { + "epoch": 0.8564778898370831, + "grad_norm": 0.910292387008667, + "learning_rate": 5.153959690459287e-06, + "loss": 1.4569, + "step": 2622 + }, + { + "epoch": 0.8568045404434282, + "grad_norm": 1.232712745666504, + "learning_rate": 5.130915611688192e-06, + "loss": 1.2706, + "step": 2623 + }, + { + "epoch": 0.8571311910497734, + "grad_norm": 1.3801976442337036, + "learning_rate": 5.107920378511871e-06, + "loss": 1.5923, + "step": 2624 + }, + { + "epoch": 0.8574578416561186, + "grad_norm": 1.6169432401657104, + "learning_rate": 5.084974015963506e-06, + "loss": 1.8258, + "step": 2625 + }, + { + "epoch": 0.8577844922624638, + "grad_norm": 0.20021358132362366, + "learning_rate": 5.062076549023076e-06, + "loss": 0.7602, + "step": 2626 + }, + { + "epoch": 0.858111142868809, + "grad_norm": 0.24645954370498657, + "learning_rate": 5.03922800261733e-06, + "loss": 0.7846, + "step": 2627 + }, + { + "epoch": 0.8584377934751541, + "grad_norm": 0.25612515211105347, + "learning_rate": 5.01642840161976e-06, + "loss": 0.8028, + "step": 2628 + }, + { + "epoch": 0.8587644440814993, + "grad_norm": 0.27695709466934204, + "learning_rate": 4.993677770850591e-06, + "loss": 0.799, + "step": 2629 + }, + { + "epoch": 0.8590910946878445, + "grad_norm": 0.29534533619880676, + "learning_rate": 4.970976135076683e-06, + "loss": 0.8947, + "step": 2630 + }, + { + "epoch": 0.8594177452941897, + "grad_norm": 0.3191416263580322, + "learning_rate": 4.948323519011644e-06, + "loss": 0.8109, + "step": 2631 + }, + { + "epoch": 0.8597443959005349, + "grad_norm": 0.32522448897361755, + "learning_rate": 4.925719947315632e-06, + "loss": 0.8372, + "step": 2632 + }, + { + "epoch": 0.86007104650688, + "grad_norm": 0.3430340886116028, + "learning_rate": 4.9031654445954925e-06, + "loss": 0.8355, + "step": 2633 + }, + { + "epoch": 0.8603976971132252, + "grad_norm": 0.33849358558654785, + "learning_rate": 4.8806600354045824e-06, + "loss": 0.8631, + "step": 2634 + }, + { + "epoch": 0.8607243477195704, + "grad_norm": 0.349334180355072, + "learning_rate": 4.858203744242857e-06, + "loss": 0.8974, + "step": 2635 + }, + { + "epoch": 0.8610509983259157, + "grad_norm": 0.38946714997291565, + "learning_rate": 4.835796595556796e-06, + "loss": 0.8642, + "step": 2636 + }, + { + "epoch": 0.8613776489322609, + "grad_norm": 0.37505602836608887, + "learning_rate": 4.813438613739374e-06, + "loss": 0.8739, + "step": 2637 + }, + { + "epoch": 0.8617042995386061, + "grad_norm": 0.4225607216358185, + "learning_rate": 4.791129823130036e-06, + "loss": 0.9068, + "step": 2638 + }, + { + "epoch": 0.8620309501449512, + "grad_norm": 0.4245143234729767, + "learning_rate": 4.768870248014695e-06, + "loss": 0.9315, + "step": 2639 + }, + { + "epoch": 0.8623576007512964, + "grad_norm": 0.5012677907943726, + "learning_rate": 4.746659912625662e-06, + "loss": 0.9509, + "step": 2640 + }, + { + "epoch": 0.8626842513576416, + "grad_norm": 0.4809205234050751, + "learning_rate": 4.724498841141667e-06, + "loss": 0.9483, + "step": 2641 + }, + { + "epoch": 0.8630109019639868, + "grad_norm": 0.5976522564888, + "learning_rate": 4.702387057687802e-06, + "loss": 1.1778, + "step": 2642 + }, + { + "epoch": 0.863337552570332, + "grad_norm": 0.6282837390899658, + "learning_rate": 4.6803245863354775e-06, + "loss": 1.1471, + "step": 2643 + }, + { + "epoch": 0.8636642031766771, + "grad_norm": 0.6567032933235168, + "learning_rate": 4.658311451102487e-06, + "loss": 1.1462, + "step": 2644 + }, + { + "epoch": 0.8639908537830223, + "grad_norm": 0.7036363482475281, + "learning_rate": 4.636347675952829e-06, + "loss": 1.1844, + "step": 2645 + }, + { + "epoch": 0.8643175043893675, + "grad_norm": 0.8441754579544067, + "learning_rate": 4.614433284796848e-06, + "loss": 1.229, + "step": 2646 + }, + { + "epoch": 0.8646441549957127, + "grad_norm": 1.0409406423568726, + "learning_rate": 4.592568301491073e-06, + "loss": 1.2641, + "step": 2647 + }, + { + "epoch": 0.8649708056020579, + "grad_norm": 1.464299201965332, + "learning_rate": 4.5707527498382695e-06, + "loss": 1.3463, + "step": 2648 + }, + { + "epoch": 0.865297456208403, + "grad_norm": 1.223347783088684, + "learning_rate": 4.548986653587389e-06, + "loss": 1.6237, + "step": 2649 + }, + { + "epoch": 0.8656241068147483, + "grad_norm": 1.9410229921340942, + "learning_rate": 4.527270036433539e-06, + "loss": 1.4677, + "step": 2650 + }, + { + "epoch": 0.8659507574210935, + "grad_norm": 0.21767649054527283, + "learning_rate": 4.5056029220179706e-06, + "loss": 0.7965, + "step": 2651 + }, + { + "epoch": 0.8662774080274387, + "grad_norm": 0.2621896266937256, + "learning_rate": 4.483985333928031e-06, + "loss": 0.8804, + "step": 2652 + }, + { + "epoch": 0.8666040586337839, + "grad_norm": 0.27004051208496094, + "learning_rate": 4.462417295697175e-06, + "loss": 0.7977, + "step": 2653 + }, + { + "epoch": 0.866930709240129, + "grad_norm": 0.28397756814956665, + "learning_rate": 4.440898830804891e-06, + "loss": 0.8856, + "step": 2654 + }, + { + "epoch": 0.8672573598464742, + "grad_norm": 0.28461506962776184, + "learning_rate": 4.419429962676725e-06, + "loss": 0.8844, + "step": 2655 + }, + { + "epoch": 0.8675840104528194, + "grad_norm": 0.29285040497779846, + "learning_rate": 4.398010714684186e-06, + "loss": 0.8644, + "step": 2656 + }, + { + "epoch": 0.8679106610591646, + "grad_norm": 0.2863633334636688, + "learning_rate": 4.376641110144841e-06, + "loss": 0.7856, + "step": 2657 + }, + { + "epoch": 0.8682373116655098, + "grad_norm": 0.305578351020813, + "learning_rate": 4.35532117232213e-06, + "loss": 0.8655, + "step": 2658 + }, + { + "epoch": 0.868563962271855, + "grad_norm": 0.3179665505886078, + "learning_rate": 4.334050924425504e-06, + "loss": 0.8276, + "step": 2659 + }, + { + "epoch": 0.8688906128782001, + "grad_norm": 0.33530572056770325, + "learning_rate": 4.312830389610245e-06, + "loss": 0.8437, + "step": 2660 + }, + { + "epoch": 0.8692172634845453, + "grad_norm": 0.3526410460472107, + "learning_rate": 4.29165959097757e-06, + "loss": 0.9232, + "step": 2661 + }, + { + "epoch": 0.8695439140908905, + "grad_norm": 0.373531699180603, + "learning_rate": 4.270538551574532e-06, + "loss": 0.8388, + "step": 2662 + }, + { + "epoch": 0.8698705646972357, + "grad_norm": 0.3668655753135681, + "learning_rate": 4.249467294394011e-06, + "loss": 0.8524, + "step": 2663 + }, + { + "epoch": 0.870197215303581, + "grad_norm": 0.4148674011230469, + "learning_rate": 4.228445842374706e-06, + "loss": 0.9739, + "step": 2664 + }, + { + "epoch": 0.8705238659099261, + "grad_norm": 0.4478146433830261, + "learning_rate": 4.2074742184010915e-06, + "loss": 1.0286, + "step": 2665 + }, + { + "epoch": 0.8708505165162713, + "grad_norm": 0.4490779936313629, + "learning_rate": 4.186552445303377e-06, + "loss": 0.9902, + "step": 2666 + }, + { + "epoch": 0.8711771671226165, + "grad_norm": 0.5150294303894043, + "learning_rate": 4.165680545857553e-06, + "loss": 0.9668, + "step": 2667 + }, + { + "epoch": 0.8715038177289617, + "grad_norm": 0.5332680344581604, + "learning_rate": 4.144858542785257e-06, + "loss": 1.0923, + "step": 2668 + }, + { + "epoch": 0.8718304683353069, + "grad_norm": 0.6344809532165527, + "learning_rate": 4.124086458753851e-06, + "loss": 1.0483, + "step": 2669 + }, + { + "epoch": 0.872157118941652, + "grad_norm": 0.661204993724823, + "learning_rate": 4.103364316376334e-06, + "loss": 1.1077, + "step": 2670 + }, + { + "epoch": 0.8724837695479972, + "grad_norm": 0.816225528717041, + "learning_rate": 4.0826921382113415e-06, + "loss": 1.1737, + "step": 2671 + }, + { + "epoch": 0.8728104201543424, + "grad_norm": 0.9259994029998779, + "learning_rate": 4.0620699467631255e-06, + "loss": 1.5668, + "step": 2672 + }, + { + "epoch": 0.8731370707606876, + "grad_norm": 1.154280662536621, + "learning_rate": 4.041497764481511e-06, + "loss": 1.2698, + "step": 2673 + }, + { + "epoch": 0.8734637213670328, + "grad_norm": 1.2868138551712036, + "learning_rate": 4.020975613761874e-06, + "loss": 1.3254, + "step": 2674 + }, + { + "epoch": 0.8737903719733779, + "grad_norm": 2.1193177700042725, + "learning_rate": 4.000503516945148e-06, + "loss": 1.5762, + "step": 2675 + }, + { + "epoch": 0.8741170225797231, + "grad_norm": 0.2401547133922577, + "learning_rate": 3.980081496317761e-06, + "loss": 0.795, + "step": 2676 + }, + { + "epoch": 0.8744436731860683, + "grad_norm": 0.24791757762432098, + "learning_rate": 3.959709574111625e-06, + "loss": 0.7803, + "step": 2677 + }, + { + "epoch": 0.8747703237924135, + "grad_norm": 0.2674658000469208, + "learning_rate": 3.939387772504133e-06, + "loss": 0.8197, + "step": 2678 + }, + { + "epoch": 0.8750969743987588, + "grad_norm": 0.27707958221435547, + "learning_rate": 3.919116113618071e-06, + "loss": 0.8578, + "step": 2679 + }, + { + "epoch": 0.875423625005104, + "grad_norm": 0.28398212790489197, + "learning_rate": 3.898894619521704e-06, + "loss": 0.8165, + "step": 2680 + }, + { + "epoch": 0.8757502756114491, + "grad_norm": 0.30292728543281555, + "learning_rate": 3.8787233122286215e-06, + "loss": 0.8953, + "step": 2681 + }, + { + "epoch": 0.8760769262177943, + "grad_norm": 0.30174580216407776, + "learning_rate": 3.8586022136978206e-06, + "loss": 0.8116, + "step": 2682 + }, + { + "epoch": 0.8764035768241395, + "grad_norm": 0.3020266890525818, + "learning_rate": 3.838531345833624e-06, + "loss": 0.7895, + "step": 2683 + }, + { + "epoch": 0.8767302274304847, + "grad_norm": 0.3383975028991699, + "learning_rate": 3.818510730485675e-06, + "loss": 0.9151, + "step": 2684 + }, + { + "epoch": 0.8770568780368299, + "grad_norm": 0.356997013092041, + "learning_rate": 3.7985403894489067e-06, + "loss": 0.8646, + "step": 2685 + }, + { + "epoch": 0.877383528643175, + "grad_norm": 0.3583001494407654, + "learning_rate": 3.7786203444635348e-06, + "loss": 0.9057, + "step": 2686 + }, + { + "epoch": 0.8777101792495202, + "grad_norm": 0.37976783514022827, + "learning_rate": 3.758750617215007e-06, + "loss": 0.9364, + "step": 2687 + }, + { + "epoch": 0.8780368298558654, + "grad_norm": 0.3958790898323059, + "learning_rate": 3.738931229334003e-06, + "loss": 0.8731, + "step": 2688 + }, + { + "epoch": 0.8783634804622106, + "grad_norm": 0.4313504993915558, + "learning_rate": 3.719162202396398e-06, + "loss": 0.9949, + "step": 2689 + }, + { + "epoch": 0.8786901310685558, + "grad_norm": 0.4511992037296295, + "learning_rate": 3.699443557923238e-06, + "loss": 0.8964, + "step": 2690 + }, + { + "epoch": 0.8790167816749009, + "grad_norm": 0.46949154138565063, + "learning_rate": 3.679775317380746e-06, + "loss": 1.008, + "step": 2691 + }, + { + "epoch": 0.8793434322812461, + "grad_norm": 0.5014150142669678, + "learning_rate": 3.660157502180217e-06, + "loss": 1.0773, + "step": 2692 + }, + { + "epoch": 0.8796700828875914, + "grad_norm": 0.5346405506134033, + "learning_rate": 3.6405901336781346e-06, + "loss": 1.1204, + "step": 2693 + }, + { + "epoch": 0.8799967334939366, + "grad_norm": 0.6744000911712646, + "learning_rate": 3.6210732331759865e-06, + "loss": 1.2427, + "step": 2694 + }, + { + "epoch": 0.8803233841002818, + "grad_norm": 0.6668764352798462, + "learning_rate": 3.601606821920367e-06, + "loss": 1.2329, + "step": 2695 + }, + { + "epoch": 0.880650034706627, + "grad_norm": 0.7776802182197571, + "learning_rate": 3.5821909211028906e-06, + "loss": 1.2254, + "step": 2696 + }, + { + "epoch": 0.8809766853129721, + "grad_norm": 0.8504863977432251, + "learning_rate": 3.5628255518601828e-06, + "loss": 1.1201, + "step": 2697 + }, + { + "epoch": 0.8813033359193173, + "grad_norm": 1.0933723449707031, + "learning_rate": 3.543510735273875e-06, + "loss": 1.432, + "step": 2698 + }, + { + "epoch": 0.8816299865256625, + "grad_norm": 1.16017746925354, + "learning_rate": 3.5242464923705534e-06, + "loss": 0.8979, + "step": 2699 + }, + { + "epoch": 0.8819566371320077, + "grad_norm": 1.525850534439087, + "learning_rate": 3.505032844121725e-06, + "loss": 1.4717, + "step": 2700 + }, + { + "epoch": 0.8822832877383529, + "grad_norm": 0.21816085278987885, + "learning_rate": 3.485869811443876e-06, + "loss": 0.7093, + "step": 2701 + }, + { + "epoch": 0.882609938344698, + "grad_norm": 0.25412845611572266, + "learning_rate": 3.466757415198324e-06, + "loss": 0.8078, + "step": 2702 + }, + { + "epoch": 0.8829365889510432, + "grad_norm": 0.25287115573883057, + "learning_rate": 3.447695676191337e-06, + "loss": 0.7362, + "step": 2703 + }, + { + "epoch": 0.8832632395573884, + "grad_norm": 0.27101483941078186, + "learning_rate": 3.4286846151739603e-06, + "loss": 0.8108, + "step": 2704 + }, + { + "epoch": 0.8835898901637336, + "grad_norm": 0.27851250767707825, + "learning_rate": 3.409724252842117e-06, + "loss": 0.7874, + "step": 2705 + }, + { + "epoch": 0.8839165407700788, + "grad_norm": 0.29681694507598877, + "learning_rate": 3.3908146098365235e-06, + "loss": 0.89, + "step": 2706 + }, + { + "epoch": 0.884243191376424, + "grad_norm": 0.30123892426490784, + "learning_rate": 3.371955706742691e-06, + "loss": 0.828, + "step": 2707 + }, + { + "epoch": 0.8845698419827692, + "grad_norm": 0.30781540274620056, + "learning_rate": 3.353147564090886e-06, + "loss": 0.9189, + "step": 2708 + }, + { + "epoch": 0.8848964925891144, + "grad_norm": 0.3321038782596588, + "learning_rate": 3.3343902023561134e-06, + "loss": 0.9027, + "step": 2709 + }, + { + "epoch": 0.8852231431954596, + "grad_norm": 0.3396836817264557, + "learning_rate": 3.315683641958106e-06, + "loss": 0.8452, + "step": 2710 + }, + { + "epoch": 0.8855497938018048, + "grad_norm": 0.370717316865921, + "learning_rate": 3.297027903261285e-06, + "loss": 0.8945, + "step": 2711 + }, + { + "epoch": 0.8858764444081499, + "grad_norm": 0.3481248617172241, + "learning_rate": 3.27842300657476e-06, + "loss": 0.8693, + "step": 2712 + }, + { + "epoch": 0.8862030950144951, + "grad_norm": 0.37252771854400635, + "learning_rate": 3.2598689721522525e-06, + "loss": 0.8701, + "step": 2713 + }, + { + "epoch": 0.8865297456208403, + "grad_norm": 0.4177666902542114, + "learning_rate": 3.241365820192177e-06, + "loss": 0.8574, + "step": 2714 + }, + { + "epoch": 0.8868563962271855, + "grad_norm": 0.43080759048461914, + "learning_rate": 3.2229135708374936e-06, + "loss": 0.9288, + "step": 2715 + }, + { + "epoch": 0.8871830468335307, + "grad_norm": 0.46755388379096985, + "learning_rate": 3.204512244175806e-06, + "loss": 0.894, + "step": 2716 + }, + { + "epoch": 0.8875096974398758, + "grad_norm": 0.5186474323272705, + "learning_rate": 3.186161860239234e-06, + "loss": 1.0247, + "step": 2717 + }, + { + "epoch": 0.887836348046221, + "grad_norm": 0.5322409868240356, + "learning_rate": 3.1678624390044596e-06, + "loss": 0.9993, + "step": 2718 + }, + { + "epoch": 0.8881629986525662, + "grad_norm": 0.6192087531089783, + "learning_rate": 3.1496140003926924e-06, + "loss": 1.0338, + "step": 2719 + }, + { + "epoch": 0.8884896492589114, + "grad_norm": 0.7641281485557556, + "learning_rate": 3.131416564269629e-06, + "loss": 1.188, + "step": 2720 + }, + { + "epoch": 0.8888162998652567, + "grad_norm": 0.8901844024658203, + "learning_rate": 3.1132701504454464e-06, + "loss": 1.2975, + "step": 2721 + }, + { + "epoch": 0.8891429504716019, + "grad_norm": 0.9301742911338806, + "learning_rate": 3.0951747786747865e-06, + "loss": 1.3024, + "step": 2722 + }, + { + "epoch": 0.889469601077947, + "grad_norm": 1.1372238397598267, + "learning_rate": 3.077130468656719e-06, + "loss": 1.4755, + "step": 2723 + }, + { + "epoch": 0.8897962516842922, + "grad_norm": 1.380213737487793, + "learning_rate": 3.0591372400347153e-06, + "loss": 1.8431, + "step": 2724 + }, + { + "epoch": 0.8901229022906374, + "grad_norm": 1.8983066082000732, + "learning_rate": 3.0411951123966666e-06, + "loss": 1.8139, + "step": 2725 + }, + { + "epoch": 0.8904495528969826, + "grad_norm": 0.21302877366542816, + "learning_rate": 3.0233041052747935e-06, + "loss": 0.7034, + "step": 2726 + }, + { + "epoch": 0.8907762035033278, + "grad_norm": 0.2391611784696579, + "learning_rate": 3.0054642381457108e-06, + "loss": 0.7476, + "step": 2727 + }, + { + "epoch": 0.8911028541096729, + "grad_norm": 0.2587495744228363, + "learning_rate": 2.987675530430317e-06, + "loss": 0.8294, + "step": 2728 + }, + { + "epoch": 0.8914295047160181, + "grad_norm": 0.2743024230003357, + "learning_rate": 2.9699380014938717e-06, + "loss": 0.823, + "step": 2729 + }, + { + "epoch": 0.8917561553223633, + "grad_norm": 0.2853001654148102, + "learning_rate": 2.9522516706458557e-06, + "loss": 0.8175, + "step": 2730 + }, + { + "epoch": 0.8920828059287085, + "grad_norm": 0.289151132106781, + "learning_rate": 2.9346165571400574e-06, + "loss": 0.8308, + "step": 2731 + }, + { + "epoch": 0.8924094565350537, + "grad_norm": 0.3165534734725952, + "learning_rate": 2.9170326801744974e-06, + "loss": 0.8892, + "step": 2732 + }, + { + "epoch": 0.8927361071413988, + "grad_norm": 0.33104461431503296, + "learning_rate": 2.899500058891419e-06, + "loss": 0.9116, + "step": 2733 + }, + { + "epoch": 0.893062757747744, + "grad_norm": 0.3390517234802246, + "learning_rate": 2.8820187123772712e-06, + "loss": 0.857, + "step": 2734 + }, + { + "epoch": 0.8933894083540893, + "grad_norm": 0.33683860301971436, + "learning_rate": 2.8645886596626814e-06, + "loss": 0.878, + "step": 2735 + }, + { + "epoch": 0.8937160589604345, + "grad_norm": 0.33593204617500305, + "learning_rate": 2.8472099197224155e-06, + "loss": 0.8384, + "step": 2736 + }, + { + "epoch": 0.8940427095667797, + "grad_norm": 0.38355734944343567, + "learning_rate": 2.829882511475429e-06, + "loss": 0.9405, + "step": 2737 + }, + { + "epoch": 0.8943693601731248, + "grad_norm": 0.4030013382434845, + "learning_rate": 2.8126064537847497e-06, + "loss": 0.843, + "step": 2738 + }, + { + "epoch": 0.89469601077947, + "grad_norm": 0.4197559356689453, + "learning_rate": 2.7953817654575166e-06, + "loss": 0.8953, + "step": 2739 + }, + { + "epoch": 0.8950226613858152, + "grad_norm": 0.5002544522285461, + "learning_rate": 2.7782084652449747e-06, + "loss": 1.0398, + "step": 2740 + }, + { + "epoch": 0.8953493119921604, + "grad_norm": 0.4735429286956787, + "learning_rate": 2.7610865718423694e-06, + "loss": 0.9453, + "step": 2741 + }, + { + "epoch": 0.8956759625985056, + "grad_norm": 0.49182236194610596, + "learning_rate": 2.744016103889052e-06, + "loss": 1.0104, + "step": 2742 + }, + { + "epoch": 0.8960026132048508, + "grad_norm": 0.5787506103515625, + "learning_rate": 2.7269970799683353e-06, + "loss": 1.0149, + "step": 2743 + }, + { + "epoch": 0.8963292638111959, + "grad_norm": 0.5907702445983887, + "learning_rate": 2.7100295186075543e-06, + "loss": 0.9488, + "step": 2744 + }, + { + "epoch": 0.8966559144175411, + "grad_norm": 0.7707468271255493, + "learning_rate": 2.693113438278011e-06, + "loss": 1.2416, + "step": 2745 + }, + { + "epoch": 0.8969825650238863, + "grad_norm": 0.7937350869178772, + "learning_rate": 2.6762488573949808e-06, + "loss": 1.0523, + "step": 2746 + }, + { + "epoch": 0.8973092156302315, + "grad_norm": 1.2601395845413208, + "learning_rate": 2.6594357943176496e-06, + "loss": 1.5764, + "step": 2747 + }, + { + "epoch": 0.8976358662365767, + "grad_norm": 1.1137841939926147, + "learning_rate": 2.6426742673491487e-06, + "loss": 1.2133, + "step": 2748 + }, + { + "epoch": 0.8979625168429219, + "grad_norm": 1.296323299407959, + "learning_rate": 2.6259642947364716e-06, + "loss": 1.1121, + "step": 2749 + }, + { + "epoch": 0.8982891674492671, + "grad_norm": 2.165759563446045, + "learning_rate": 2.6093058946705275e-06, + "loss": 2.157, + "step": 2750 + }, + { + "epoch": 0.8986158180556123, + "grad_norm": 0.22275981307029724, + "learning_rate": 2.5926990852860487e-06, + "loss": 0.7276, + "step": 2751 + }, + { + "epoch": 0.8989424686619575, + "grad_norm": 0.24508191645145416, + "learning_rate": 2.5761438846616248e-06, + "loss": 0.765, + "step": 2752 + }, + { + "epoch": 0.8992691192683027, + "grad_norm": 0.2617625594139099, + "learning_rate": 2.5596403108196555e-06, + "loss": 0.8318, + "step": 2753 + }, + { + "epoch": 0.8995957698746478, + "grad_norm": 0.2869446575641632, + "learning_rate": 2.5431883817263415e-06, + "loss": 0.851, + "step": 2754 + }, + { + "epoch": 0.899922420480993, + "grad_norm": 0.2984892725944519, + "learning_rate": 2.5267881152916507e-06, + "loss": 0.9036, + "step": 2755 + }, + { + "epoch": 0.9002490710873382, + "grad_norm": 0.29813170433044434, + "learning_rate": 2.5104395293693295e-06, + "loss": 0.8355, + "step": 2756 + }, + { + "epoch": 0.9005757216936834, + "grad_norm": 0.32043224573135376, + "learning_rate": 2.4941426417568468e-06, + "loss": 0.8489, + "step": 2757 + }, + { + "epoch": 0.9009023723000286, + "grad_norm": 0.33558252453804016, + "learning_rate": 2.4778974701954007e-06, + "loss": 0.9253, + "step": 2758 + }, + { + "epoch": 0.9012290229063737, + "grad_norm": 0.344395250082016, + "learning_rate": 2.4617040323698827e-06, + "loss": 0.8421, + "step": 2759 + }, + { + "epoch": 0.9015556735127189, + "grad_norm": 0.3580136299133301, + "learning_rate": 2.445562345908875e-06, + "loss": 0.9436, + "step": 2760 + }, + { + "epoch": 0.9018823241190641, + "grad_norm": 0.34943464398384094, + "learning_rate": 2.4294724283846158e-06, + "loss": 0.9067, + "step": 2761 + }, + { + "epoch": 0.9022089747254093, + "grad_norm": 0.3668978214263916, + "learning_rate": 2.4134342973129777e-06, + "loss": 0.918, + "step": 2762 + }, + { + "epoch": 0.9025356253317546, + "grad_norm": 0.38507401943206787, + "learning_rate": 2.3974479701534826e-06, + "loss": 0.8608, + "step": 2763 + }, + { + "epoch": 0.9028622759380998, + "grad_norm": 0.42256003618240356, + "learning_rate": 2.3815134643092264e-06, + "loss": 0.924, + "step": 2764 + }, + { + "epoch": 0.9031889265444449, + "grad_norm": 0.4842672348022461, + "learning_rate": 2.3656307971269165e-06, + "loss": 1.0541, + "step": 2765 + }, + { + "epoch": 0.9035155771507901, + "grad_norm": 0.5005545616149902, + "learning_rate": 2.349799985896811e-06, + "loss": 1.1173, + "step": 2766 + }, + { + "epoch": 0.9038422277571353, + "grad_norm": 0.5491520166397095, + "learning_rate": 2.334021047852725e-06, + "loss": 0.9751, + "step": 2767 + }, + { + "epoch": 0.9041688783634805, + "grad_norm": 0.6437044143676758, + "learning_rate": 2.318294000172e-06, + "loss": 1.1144, + "step": 2768 + }, + { + "epoch": 0.9044955289698257, + "grad_norm": 0.6922833919525146, + "learning_rate": 2.3026188599754915e-06, + "loss": 1.0302, + "step": 2769 + }, + { + "epoch": 0.9048221795761708, + "grad_norm": 1.6215999126434326, + "learning_rate": 2.2869956443275444e-06, + "loss": 1.3059, + "step": 2770 + }, + { + "epoch": 0.905148830182516, + "grad_norm": 0.9523685574531555, + "learning_rate": 2.2714243702359695e-06, + "loss": 1.0932, + "step": 2771 + }, + { + "epoch": 0.9054754807888612, + "grad_norm": 1.1872296333312988, + "learning_rate": 2.255905054652052e-06, + "loss": 1.2894, + "step": 2772 + }, + { + "epoch": 0.9058021313952064, + "grad_norm": 1.3475775718688965, + "learning_rate": 2.2404377144705e-06, + "loss": 1.5721, + "step": 2773 + }, + { + "epoch": 0.9061287820015516, + "grad_norm": 1.3719803094863892, + "learning_rate": 2.2250223665294446e-06, + "loss": 1.6983, + "step": 2774 + }, + { + "epoch": 0.9064554326078967, + "grad_norm": 2.0526633262634277, + "learning_rate": 2.2096590276103947e-06, + "loss": 2.0046, + "step": 2775 + }, + { + "epoch": 0.9067820832142419, + "grad_norm": 0.2175752967596054, + "learning_rate": 2.194347714438294e-06, + "loss": 0.7995, + "step": 2776 + }, + { + "epoch": 0.9071087338205872, + "grad_norm": 0.25334927439689636, + "learning_rate": 2.179088443681393e-06, + "loss": 0.8306, + "step": 2777 + }, + { + "epoch": 0.9074353844269324, + "grad_norm": 0.26047754287719727, + "learning_rate": 2.1638812319513145e-06, + "loss": 0.8277, + "step": 2778 + }, + { + "epoch": 0.9077620350332776, + "grad_norm": 0.27399611473083496, + "learning_rate": 2.1487260958030107e-06, + "loss": 0.8303, + "step": 2779 + }, + { + "epoch": 0.9080886856396228, + "grad_norm": 0.28062084317207336, + "learning_rate": 2.1336230517347342e-06, + "loss": 0.8613, + "step": 2780 + }, + { + "epoch": 0.9084153362459679, + "grad_norm": 0.29603925347328186, + "learning_rate": 2.1185721161880333e-06, + "loss": 0.8287, + "step": 2781 + }, + { + "epoch": 0.9087419868523131, + "grad_norm": 0.3076125383377075, + "learning_rate": 2.103573305547735e-06, + "loss": 0.9537, + "step": 2782 + }, + { + "epoch": 0.9090686374586583, + "grad_norm": 0.3091273307800293, + "learning_rate": 2.088626636141894e-06, + "loss": 0.8705, + "step": 2783 + }, + { + "epoch": 0.9093952880650035, + "grad_norm": 0.3315759301185608, + "learning_rate": 2.073732124241856e-06, + "loss": 0.8747, + "step": 2784 + }, + { + "epoch": 0.9097219386713487, + "grad_norm": 0.34617912769317627, + "learning_rate": 2.058889786062124e-06, + "loss": 0.9194, + "step": 2785 + }, + { + "epoch": 0.9100485892776938, + "grad_norm": 0.389011949300766, + "learning_rate": 2.0440996377604606e-06, + "loss": 0.8509, + "step": 2786 + }, + { + "epoch": 0.910375239884039, + "grad_norm": 0.35955238342285156, + "learning_rate": 2.0293616954377704e-06, + "loss": 0.8293, + "step": 2787 + }, + { + "epoch": 0.9107018904903842, + "grad_norm": 0.4118489623069763, + "learning_rate": 2.0146759751381474e-06, + "loss": 0.9932, + "step": 2788 + }, + { + "epoch": 0.9110285410967294, + "grad_norm": 0.4313391149044037, + "learning_rate": 2.0000424928488247e-06, + "loss": 0.9718, + "step": 2789 + }, + { + "epoch": 0.9113551917030746, + "grad_norm": 0.4242534637451172, + "learning_rate": 1.9854612645001767e-06, + "loss": 0.9077, + "step": 2790 + }, + { + "epoch": 0.9116818423094197, + "grad_norm": 0.49969717860221863, + "learning_rate": 1.970932305965695e-06, + "loss": 0.9845, + "step": 2791 + }, + { + "epoch": 0.912008492915765, + "grad_norm": 0.49355196952819824, + "learning_rate": 1.9564556330619542e-06, + "loss": 0.8608, + "step": 2792 + }, + { + "epoch": 0.9123351435221102, + "grad_norm": 0.6009331941604614, + "learning_rate": 1.9420312615486215e-06, + "loss": 1.0791, + "step": 2793 + }, + { + "epoch": 0.9126617941284554, + "grad_norm": 0.655984103679657, + "learning_rate": 1.927659207128424e-06, + "loss": 1.0979, + "step": 2794 + }, + { + "epoch": 0.9129884447348006, + "grad_norm": 0.6680231094360352, + "learning_rate": 1.9133394854471465e-06, + "loss": 1.0447, + "step": 2795 + }, + { + "epoch": 0.9133150953411457, + "grad_norm": 0.8917760252952576, + "learning_rate": 1.8990721120935694e-06, + "loss": 1.2298, + "step": 2796 + }, + { + "epoch": 0.9136417459474909, + "grad_norm": 0.9837570190429688, + "learning_rate": 1.88485710259953e-06, + "loss": 1.358, + "step": 2797 + }, + { + "epoch": 0.9139683965538361, + "grad_norm": 1.2982633113861084, + "learning_rate": 1.8706944724398157e-06, + "loss": 1.2058, + "step": 2798 + }, + { + "epoch": 0.9142950471601813, + "grad_norm": 1.3844225406646729, + "learning_rate": 1.8565842370322495e-06, + "loss": 1.5844, + "step": 2799 + }, + { + "epoch": 0.9146216977665265, + "grad_norm": 1.6718311309814453, + "learning_rate": 1.8425264117375496e-06, + "loss": 2.01, + "step": 2800 + }, + { + "epoch": 0.9149483483728716, + "grad_norm": 0.20186986029148102, + "learning_rate": 1.8285210118594297e-06, + "loss": 0.7628, + "step": 2801 + }, + { + "epoch": 0.9152749989792168, + "grad_norm": 0.24913327395915985, + "learning_rate": 1.8145680526445109e-06, + "loss": 0.8872, + "step": 2802 + }, + { + "epoch": 0.915601649585562, + "grad_norm": 0.25536009669303894, + "learning_rate": 1.8006675492823255e-06, + "loss": 0.7614, + "step": 2803 + }, + { + "epoch": 0.9159283001919072, + "grad_norm": 0.26635807752609253, + "learning_rate": 1.7868195169053137e-06, + "loss": 0.833, + "step": 2804 + }, + { + "epoch": 0.9162549507982524, + "grad_norm": 0.2926965653896332, + "learning_rate": 1.7730239705887774e-06, + "loss": 0.7751, + "step": 2805 + }, + { + "epoch": 0.9165816014045977, + "grad_norm": 0.28736618161201477, + "learning_rate": 1.7592809253508925e-06, + "loss": 0.8213, + "step": 2806 + }, + { + "epoch": 0.9169082520109428, + "grad_norm": 0.30105364322662354, + "learning_rate": 1.7455903961526798e-06, + "loss": 0.8296, + "step": 2807 + }, + { + "epoch": 0.917234902617288, + "grad_norm": 0.3077887296676636, + "learning_rate": 1.7319523978979956e-06, + "loss": 0.7668, + "step": 2808 + }, + { + "epoch": 0.9175615532236332, + "grad_norm": 0.3252915143966675, + "learning_rate": 1.7183669454334805e-06, + "loss": 0.8537, + "step": 2809 + }, + { + "epoch": 0.9178882038299784, + "grad_norm": 0.3521929979324341, + "learning_rate": 1.7048340535486151e-06, + "loss": 0.8731, + "step": 2810 + }, + { + "epoch": 0.9182148544363236, + "grad_norm": 0.349094033241272, + "learning_rate": 1.6913537369756205e-06, + "loss": 0.7871, + "step": 2811 + }, + { + "epoch": 0.9185415050426687, + "grad_norm": 0.3693489730358124, + "learning_rate": 1.67792601038953e-06, + "loss": 0.9557, + "step": 2812 + }, + { + "epoch": 0.9188681556490139, + "grad_norm": 0.37535011768341064, + "learning_rate": 1.664550888408073e-06, + "loss": 0.8216, + "step": 2813 + }, + { + "epoch": 0.9191948062553591, + "grad_norm": 0.3807601034641266, + "learning_rate": 1.6512283855917464e-06, + "loss": 0.9469, + "step": 2814 + }, + { + "epoch": 0.9195214568617043, + "grad_norm": 0.46269911527633667, + "learning_rate": 1.6379585164437605e-06, + "loss": 1.0594, + "step": 2815 + }, + { + "epoch": 0.9198481074680495, + "grad_norm": 0.44555824995040894, + "learning_rate": 1.6247412954100206e-06, + "loss": 1.012, + "step": 2816 + }, + { + "epoch": 0.9201747580743946, + "grad_norm": 0.5157310962677002, + "learning_rate": 1.611576736879117e-06, + "loss": 1.0669, + "step": 2817 + }, + { + "epoch": 0.9205014086807398, + "grad_norm": 0.5341678261756897, + "learning_rate": 1.5984648551823244e-06, + "loss": 1.1106, + "step": 2818 + }, + { + "epoch": 0.920828059287085, + "grad_norm": 0.5688596367835999, + "learning_rate": 1.5854056645935421e-06, + "loss": 1.0309, + "step": 2819 + }, + { + "epoch": 0.9211547098934303, + "grad_norm": 0.603225588798523, + "learning_rate": 1.572399179329348e-06, + "loss": 1.0216, + "step": 2820 + }, + { + "epoch": 0.9214813604997755, + "grad_norm": 0.7252687215805054, + "learning_rate": 1.5594454135489101e-06, + "loss": 1.0678, + "step": 2821 + }, + { + "epoch": 0.9218080111061207, + "grad_norm": 0.9123412370681763, + "learning_rate": 1.546544381354026e-06, + "loss": 1.3185, + "step": 2822 + }, + { + "epoch": 0.9221346617124658, + "grad_norm": 1.1781419515609741, + "learning_rate": 1.5336960967890722e-06, + "loss": 1.352, + "step": 2823 + }, + { + "epoch": 0.922461312318811, + "grad_norm": 1.3412940502166748, + "learning_rate": 1.520900573841011e-06, + "loss": 1.6097, + "step": 2824 + }, + { + "epoch": 0.9227879629251562, + "grad_norm": 1.9426753520965576, + "learning_rate": 1.5081578264393714e-06, + "loss": 1.9803, + "step": 2825 + }, + { + "epoch": 0.9231146135315014, + "grad_norm": 0.21749097108840942, + "learning_rate": 1.4954678684562128e-06, + "loss": 0.7693, + "step": 2826 + }, + { + "epoch": 0.9234412641378466, + "grad_norm": 0.23959995806217194, + "learning_rate": 1.4828307137061404e-06, + "loss": 0.7706, + "step": 2827 + }, + { + "epoch": 0.9237679147441917, + "grad_norm": 0.2661067247390747, + "learning_rate": 1.4702463759462827e-06, + "loss": 0.8191, + "step": 2828 + }, + { + "epoch": 0.9240945653505369, + "grad_norm": 0.27391666173934937, + "learning_rate": 1.4577148688762477e-06, + "loss": 0.7996, + "step": 2829 + }, + { + "epoch": 0.9244212159568821, + "grad_norm": 0.2843951880931854, + "learning_rate": 1.4452362061381509e-06, + "loss": 0.8425, + "step": 2830 + }, + { + "epoch": 0.9247478665632273, + "grad_norm": 0.30345508456230164, + "learning_rate": 1.4328104013165811e-06, + "loss": 0.9434, + "step": 2831 + }, + { + "epoch": 0.9250745171695725, + "grad_norm": 0.32872867584228516, + "learning_rate": 1.4204374679385513e-06, + "loss": 0.916, + "step": 2832 + }, + { + "epoch": 0.9254011677759176, + "grad_norm": 0.3325795829296112, + "learning_rate": 1.4081174194735758e-06, + "loss": 0.9412, + "step": 2833 + }, + { + "epoch": 0.9257278183822629, + "grad_norm": 0.3444099724292755, + "learning_rate": 1.3958502693335374e-06, + "loss": 0.9226, + "step": 2834 + }, + { + "epoch": 0.9260544689886081, + "grad_norm": 0.3495911955833435, + "learning_rate": 1.3836360308727703e-06, + "loss": 0.9122, + "step": 2835 + }, + { + "epoch": 0.9263811195949533, + "grad_norm": 0.3913719952106476, + "learning_rate": 1.3714747173879938e-06, + "loss": 1.0252, + "step": 2836 + }, + { + "epoch": 0.9267077702012985, + "grad_norm": 0.3872081935405731, + "learning_rate": 1.3593663421183177e-06, + "loss": 0.848, + "step": 2837 + }, + { + "epoch": 0.9270344208076436, + "grad_norm": 0.3887852132320404, + "learning_rate": 1.3473109182452203e-06, + "loss": 0.8442, + "step": 2838 + }, + { + "epoch": 0.9273610714139888, + "grad_norm": 0.4236868619918823, + "learning_rate": 1.3353084588925257e-06, + "loss": 1.0758, + "step": 2839 + }, + { + "epoch": 0.927687722020334, + "grad_norm": 0.4438207745552063, + "learning_rate": 1.3233589771264155e-06, + "loss": 0.9914, + "step": 2840 + }, + { + "epoch": 0.9280143726266792, + "grad_norm": 0.4978663921356201, + "learning_rate": 1.3114624859553892e-06, + "loss": 1.0171, + "step": 2841 + }, + { + "epoch": 0.9283410232330244, + "grad_norm": 0.5301693677902222, + "learning_rate": 1.2996189983302542e-06, + "loss": 1.0497, + "step": 2842 + }, + { + "epoch": 0.9286676738393695, + "grad_norm": 0.5215023159980774, + "learning_rate": 1.287828527144136e-06, + "loss": 1.0231, + "step": 2843 + }, + { + "epoch": 0.9289943244457147, + "grad_norm": 0.6315735578536987, + "learning_rate": 1.2760910852324338e-06, + "loss": 1.0299, + "step": 2844 + }, + { + "epoch": 0.9293209750520599, + "grad_norm": 0.6547741293907166, + "learning_rate": 1.2644066853727988e-06, + "loss": 1.3133, + "step": 2845 + }, + { + "epoch": 0.9296476256584051, + "grad_norm": 0.8934280276298523, + "learning_rate": 1.2527753402851782e-06, + "loss": 1.3854, + "step": 2846 + }, + { + "epoch": 0.9299742762647503, + "grad_norm": 1.3449379205703735, + "learning_rate": 1.2411970626317271e-06, + "loss": 1.6522, + "step": 2847 + }, + { + "epoch": 0.9303009268710956, + "grad_norm": 1.1736503839492798, + "learning_rate": 1.229671865016857e-06, + "loss": 1.2842, + "step": 2848 + }, + { + "epoch": 0.9306275774774407, + "grad_norm": 1.3014510869979858, + "learning_rate": 1.2181997599871709e-06, + "loss": 1.6985, + "step": 2849 + }, + { + "epoch": 0.9309542280837859, + "grad_norm": 1.8423781394958496, + "learning_rate": 1.2067807600314951e-06, + "loss": 2.0165, + "step": 2850 + }, + { + "epoch": 0.9312808786901311, + "grad_norm": 0.21177968382835388, + "learning_rate": 1.1954148775808305e-06, + "loss": 0.7456, + "step": 2851 + }, + { + "epoch": 0.9316075292964763, + "grad_norm": 0.23976528644561768, + "learning_rate": 1.1841021250083683e-06, + "loss": 0.775, + "step": 2852 + }, + { + "epoch": 0.9319341799028215, + "grad_norm": 0.2674348056316376, + "learning_rate": 1.1728425146294298e-06, + "loss": 0.8148, + "step": 2853 + }, + { + "epoch": 0.9322608305091666, + "grad_norm": 0.276597797870636, + "learning_rate": 1.1616360587015262e-06, + "loss": 0.8089, + "step": 2854 + }, + { + "epoch": 0.9325874811155118, + "grad_norm": 0.28054794669151306, + "learning_rate": 1.150482769424266e-06, + "loss": 0.749, + "step": 2855 + }, + { + "epoch": 0.932914131721857, + "grad_norm": 0.31427672505378723, + "learning_rate": 1.139382658939414e-06, + "loss": 0.8214, + "step": 2856 + }, + { + "epoch": 0.9332407823282022, + "grad_norm": 0.3161599338054657, + "learning_rate": 1.1283357393308102e-06, + "loss": 0.7862, + "step": 2857 + }, + { + "epoch": 0.9335674329345474, + "grad_norm": 0.35273951292037964, + "learning_rate": 1.117342022624407e-06, + "loss": 1.0511, + "step": 2858 + }, + { + "epoch": 0.9338940835408925, + "grad_norm": 0.3410721719264984, + "learning_rate": 1.106401520788236e-06, + "loss": 0.837, + "step": 2859 + }, + { + "epoch": 0.9342207341472377, + "grad_norm": 0.372831255197525, + "learning_rate": 1.0955142457324042e-06, + "loss": 0.867, + "step": 2860 + }, + { + "epoch": 0.9345473847535829, + "grad_norm": 0.37729281187057495, + "learning_rate": 1.084680209309058e-06, + "loss": 0.8085, + "step": 2861 + }, + { + "epoch": 0.9348740353599282, + "grad_norm": 0.4099488854408264, + "learning_rate": 1.0738994233124134e-06, + "loss": 0.9297, + "step": 2862 + }, + { + "epoch": 0.9352006859662734, + "grad_norm": 0.4232223629951477, + "learning_rate": 1.063171899478682e-06, + "loss": 0.9846, + "step": 2863 + }, + { + "epoch": 0.9355273365726186, + "grad_norm": 0.44969096779823303, + "learning_rate": 1.0524976494861227e-06, + "loss": 0.9578, + "step": 2864 + }, + { + "epoch": 0.9358539871789637, + "grad_norm": 0.49604538083076477, + "learning_rate": 1.0418766849549954e-06, + "loss": 1.0545, + "step": 2865 + }, + { + "epoch": 0.9361806377853089, + "grad_norm": 0.5002210140228271, + "learning_rate": 1.0313090174475182e-06, + "loss": 1.0935, + "step": 2866 + }, + { + "epoch": 0.9365072883916541, + "grad_norm": 0.5190601944923401, + "learning_rate": 1.0207946584679496e-06, + "loss": 1.0616, + "step": 2867 + }, + { + "epoch": 0.9368339389979993, + "grad_norm": 0.5428922772407532, + "learning_rate": 1.0103336194624446e-06, + "loss": 1.0405, + "step": 2868 + }, + { + "epoch": 0.9371605896043445, + "grad_norm": 0.6353383660316467, + "learning_rate": 9.999259118191827e-07, + "loss": 1.186, + "step": 2869 + }, + { + "epoch": 0.9374872402106896, + "grad_norm": 0.7398752570152283, + "learning_rate": 9.895715468682343e-07, + "loss": 1.214, + "step": 2870 + }, + { + "epoch": 0.9378138908170348, + "grad_norm": 0.7890881299972534, + "learning_rate": 9.792705358816156e-07, + "loss": 1.133, + "step": 2871 + }, + { + "epoch": 0.93814054142338, + "grad_norm": 0.9227431416511536, + "learning_rate": 9.69022890073268e-07, + "loss": 1.1887, + "step": 2872 + }, + { + "epoch": 0.9384671920297252, + "grad_norm": 1.1549128293991089, + "learning_rate": 9.588286205990281e-07, + "loss": 1.2085, + "step": 2873 + }, + { + "epoch": 0.9387938426360704, + "grad_norm": 1.3670769929885864, + "learning_rate": 9.486877385566307e-07, + "loss": 1.3928, + "step": 2874 + }, + { + "epoch": 0.9391204932424155, + "grad_norm": 1.7799900770187378, + "learning_rate": 9.386002549856942e-07, + "loss": 1.8047, + "step": 2875 + }, + { + "epoch": 0.9394471438487608, + "grad_norm": 0.2077762335538864, + "learning_rate": 9.28566180867696e-07, + "loss": 0.7245, + "step": 2876 + }, + { + "epoch": 0.939773794455106, + "grad_norm": 0.23600128293037415, + "learning_rate": 9.185855271259869e-07, + "loss": 0.8196, + "step": 2877 + }, + { + "epoch": 0.9401004450614512, + "grad_norm": 0.26550108194351196, + "learning_rate": 9.086583046257479e-07, + "loss": 0.8307, + "step": 2878 + }, + { + "epoch": 0.9404270956677964, + "grad_norm": 0.27199456095695496, + "learning_rate": 8.9878452417399e-07, + "loss": 0.8552, + "step": 2879 + }, + { + "epoch": 0.9407537462741415, + "grad_norm": 0.2838455140590668, + "learning_rate": 8.889641965195761e-07, + "loss": 0.835, + "step": 2880 + }, + { + "epoch": 0.9410803968804867, + "grad_norm": 0.3070827126502991, + "learning_rate": 8.791973323531322e-07, + "loss": 0.9071, + "step": 2881 + }, + { + "epoch": 0.9414070474868319, + "grad_norm": 0.3105681538581848, + "learning_rate": 8.694839423071311e-07, + "loss": 0.8274, + "step": 2882 + }, + { + "epoch": 0.9417336980931771, + "grad_norm": 0.32564812898635864, + "learning_rate": 8.598240369557975e-07, + "loss": 0.8845, + "step": 2883 + }, + { + "epoch": 0.9420603486995223, + "grad_norm": 0.3271031677722931, + "learning_rate": 8.502176268151474e-07, + "loss": 0.8739, + "step": 2884 + }, + { + "epoch": 0.9423869993058674, + "grad_norm": 0.3692990243434906, + "learning_rate": 8.406647223429487e-07, + "loss": 0.9222, + "step": 2885 + }, + { + "epoch": 0.9427136499122126, + "grad_norm": 0.3484625816345215, + "learning_rate": 8.311653339387381e-07, + "loss": 0.8741, + "step": 2886 + }, + { + "epoch": 0.9430403005185578, + "grad_norm": 0.3814089894294739, + "learning_rate": 8.21719471943777e-07, + "loss": 0.8628, + "step": 2887 + }, + { + "epoch": 0.943366951124903, + "grad_norm": 0.39709970355033875, + "learning_rate": 8.123271466410786e-07, + "loss": 0.9057, + "step": 2888 + }, + { + "epoch": 0.9436936017312482, + "grad_norm": 0.4413350522518158, + "learning_rate": 8.029883682553418e-07, + "loss": 0.8906, + "step": 2889 + }, + { + "epoch": 0.9440202523375933, + "grad_norm": 0.4726428687572479, + "learning_rate": 7.937031469530065e-07, + "loss": 1.084, + "step": 2890 + }, + { + "epoch": 0.9443469029439386, + "grad_norm": 0.4564618170261383, + "learning_rate": 7.844714928421871e-07, + "loss": 0.9873, + "step": 2891 + }, + { + "epoch": 0.9446735535502838, + "grad_norm": 0.5263504385948181, + "learning_rate": 7.752934159726888e-07, + "loss": 1.1024, + "step": 2892 + }, + { + "epoch": 0.945000204156629, + "grad_norm": 0.5723422765731812, + "learning_rate": 7.661689263359972e-07, + "loss": 0.9272, + "step": 2893 + }, + { + "epoch": 0.9453268547629742, + "grad_norm": 0.7069876790046692, + "learning_rate": 7.570980338652501e-07, + "loss": 1.2409, + "step": 2894 + }, + { + "epoch": 0.9456535053693194, + "grad_norm": 0.7442349195480347, + "learning_rate": 7.480807484352647e-07, + "loss": 1.2205, + "step": 2895 + }, + { + "epoch": 0.9459801559756645, + "grad_norm": 0.8722764849662781, + "learning_rate": 7.391170798624558e-07, + "loss": 1.0274, + "step": 2896 + }, + { + "epoch": 0.9463068065820097, + "grad_norm": 1.0321097373962402, + "learning_rate": 7.30207037904912e-07, + "loss": 1.3521, + "step": 2897 + }, + { + "epoch": 0.9466334571883549, + "grad_norm": 1.245355248451233, + "learning_rate": 7.213506322623187e-07, + "loss": 1.2544, + "step": 2898 + }, + { + "epoch": 0.9469601077947001, + "grad_norm": 1.5786315202713013, + "learning_rate": 7.125478725759805e-07, + "loss": 2.2946, + "step": 2899 + }, + { + "epoch": 0.9472867584010453, + "grad_norm": 1.5382407903671265, + "learning_rate": 7.037987684287984e-07, + "loss": 1.5537, + "step": 2900 + }, + { + "epoch": 0.9476134090073904, + "grad_norm": 0.21489055454730988, + "learning_rate": 6.951033293452646e-07, + "loss": 0.79, + "step": 2901 + }, + { + "epoch": 0.9479400596137356, + "grad_norm": 0.25648918747901917, + "learning_rate": 6.864615647914407e-07, + "loss": 0.7649, + "step": 2902 + }, + { + "epoch": 0.9482667102200808, + "grad_norm": 0.2571738362312317, + "learning_rate": 6.77873484174979e-07, + "loss": 0.8131, + "step": 2903 + }, + { + "epoch": 0.948593360826426, + "grad_norm": 0.28513866662979126, + "learning_rate": 6.693390968450674e-07, + "loss": 0.859, + "step": 2904 + }, + { + "epoch": 0.9489200114327713, + "grad_norm": 0.2826521694660187, + "learning_rate": 6.608584120924522e-07, + "loss": 0.8662, + "step": 2905 + }, + { + "epoch": 0.9492466620391165, + "grad_norm": 0.3028872609138489, + "learning_rate": 6.524314391494202e-07, + "loss": 0.8613, + "step": 2906 + }, + { + "epoch": 0.9495733126454616, + "grad_norm": 0.3100944459438324, + "learning_rate": 6.440581871897777e-07, + "loss": 0.8643, + "step": 2907 + }, + { + "epoch": 0.9498999632518068, + "grad_norm": 0.34333786368370056, + "learning_rate": 6.357386653288555e-07, + "loss": 0.8483, + "step": 2908 + }, + { + "epoch": 0.950226613858152, + "grad_norm": 0.3206077814102173, + "learning_rate": 6.274728826234922e-07, + "loss": 0.8385, + "step": 2909 + }, + { + "epoch": 0.9505532644644972, + "grad_norm": 0.3540595769882202, + "learning_rate": 6.192608480720175e-07, + "loss": 0.8793, + "step": 2910 + }, + { + "epoch": 0.9508799150708424, + "grad_norm": 0.368453711271286, + "learning_rate": 6.111025706142692e-07, + "loss": 0.9732, + "step": 2911 + }, + { + "epoch": 0.9512065656771875, + "grad_norm": 0.37942591309547424, + "learning_rate": 6.029980591315321e-07, + "loss": 0.9491, + "step": 2912 + }, + { + "epoch": 0.9515332162835327, + "grad_norm": 0.40083321928977966, + "learning_rate": 5.949473224465929e-07, + "loss": 0.9578, + "step": 2913 + }, + { + "epoch": 0.9518598668898779, + "grad_norm": 0.4563828706741333, + "learning_rate": 5.869503693236744e-07, + "loss": 0.868, + "step": 2914 + }, + { + "epoch": 0.9521865174962231, + "grad_norm": 0.4763037860393524, + "learning_rate": 5.790072084684573e-07, + "loss": 1.052, + "step": 2915 + }, + { + "epoch": 0.9525131681025683, + "grad_norm": 0.47979336977005005, + "learning_rate": 5.711178485280688e-07, + "loss": 1.0112, + "step": 2916 + }, + { + "epoch": 0.9528398187089134, + "grad_norm": 0.4837022125720978, + "learning_rate": 5.632822980910557e-07, + "loss": 1.0511, + "step": 2917 + }, + { + "epoch": 0.9531664693152586, + "grad_norm": 0.5643126964569092, + "learning_rate": 5.555005656873891e-07, + "loss": 1.0878, + "step": 2918 + }, + { + "epoch": 0.9534931199216039, + "grad_norm": 0.6676090955734253, + "learning_rate": 5.477726597884647e-07, + "loss": 1.1398, + "step": 2919 + }, + { + "epoch": 0.9538197705279491, + "grad_norm": 0.7085458040237427, + "learning_rate": 5.400985888070642e-07, + "loss": 1.0449, + "step": 2920 + }, + { + "epoch": 0.9541464211342943, + "grad_norm": 0.9497272968292236, + "learning_rate": 5.324783610973772e-07, + "loss": 1.4324, + "step": 2921 + }, + { + "epoch": 0.9544730717406394, + "grad_norm": 1.0609630346298218, + "learning_rate": 5.249119849549677e-07, + "loss": 1.3912, + "step": 2922 + }, + { + "epoch": 0.9547997223469846, + "grad_norm": 1.399525761604309, + "learning_rate": 5.173994686167805e-07, + "loss": 1.6172, + "step": 2923 + }, + { + "epoch": 0.9551263729533298, + "grad_norm": 1.3260670900344849, + "learning_rate": 5.099408202611289e-07, + "loss": 1.6486, + "step": 2924 + }, + { + "epoch": 0.955453023559675, + "grad_norm": 1.807623267173767, + "learning_rate": 5.02536048007679e-07, + "loss": 1.5547, + "step": 2925 + }, + { + "epoch": 0.9557796741660202, + "grad_norm": 0.21524813771247864, + "learning_rate": 4.951851599174495e-07, + "loss": 0.7755, + "step": 2926 + }, + { + "epoch": 0.9561063247723653, + "grad_norm": 0.25288552045822144, + "learning_rate": 4.878881639928001e-07, + "loss": 0.7779, + "step": 2927 + }, + { + "epoch": 0.9564329753787105, + "grad_norm": 0.27735069394111633, + "learning_rate": 4.806450681774155e-07, + "loss": 0.829, + "step": 2928 + }, + { + "epoch": 0.9567596259850557, + "grad_norm": 0.2676355540752411, + "learning_rate": 4.734558803563216e-07, + "loss": 0.7717, + "step": 2929 + }, + { + "epoch": 0.9570862765914009, + "grad_norm": 0.28219136595726013, + "learning_rate": 4.663206083558358e-07, + "loss": 0.7855, + "step": 2930 + }, + { + "epoch": 0.9574129271977461, + "grad_norm": 0.30700165033340454, + "learning_rate": 4.5923925994359464e-07, + "loss": 0.8179, + "step": 2931 + }, + { + "epoch": 0.9577395778040912, + "grad_norm": 0.31931400299072266, + "learning_rate": 4.5221184282852603e-07, + "loss": 0.8727, + "step": 2932 + }, + { + "epoch": 0.9580662284104365, + "grad_norm": 0.3536597788333893, + "learning_rate": 4.4523836466085487e-07, + "loss": 0.9074, + "step": 2933 + }, + { + "epoch": 0.9583928790167817, + "grad_norm": 0.3243158757686615, + "learning_rate": 4.383188330320753e-07, + "loss": 0.7781, + "step": 2934 + }, + { + "epoch": 0.9587195296231269, + "grad_norm": 0.37914979457855225, + "learning_rate": 4.3145325547497284e-07, + "loss": 0.8726, + "step": 2935 + }, + { + "epoch": 0.9590461802294721, + "grad_norm": 0.39078694581985474, + "learning_rate": 4.2464163946356884e-07, + "loss": 0.9549, + "step": 2936 + }, + { + "epoch": 0.9593728308358173, + "grad_norm": 0.4027955234050751, + "learning_rate": 4.1788399241317056e-07, + "loss": 0.8879, + "step": 2937 + }, + { + "epoch": 0.9596994814421624, + "grad_norm": 0.4388086497783661, + "learning_rate": 4.111803216803101e-07, + "loss": 0.9279, + "step": 2938 + }, + { + "epoch": 0.9600261320485076, + "grad_norm": 0.4330412745475769, + "learning_rate": 4.045306345627775e-07, + "loss": 0.8984, + "step": 2939 + }, + { + "epoch": 0.9603527826548528, + "grad_norm": 0.4897141456604004, + "learning_rate": 3.979349382995823e-07, + "loss": 0.9328, + "step": 2940 + }, + { + "epoch": 0.960679433261198, + "grad_norm": 0.5252649784088135, + "learning_rate": 3.9139324007095856e-07, + "loss": 1.015, + "step": 2941 + }, + { + "epoch": 0.9610060838675432, + "grad_norm": 0.5516462922096252, + "learning_rate": 3.849055469983709e-07, + "loss": 1.0131, + "step": 2942 + }, + { + "epoch": 0.9613327344738883, + "grad_norm": 0.627424955368042, + "learning_rate": 3.7847186614446975e-07, + "loss": 1.0651, + "step": 2943 + }, + { + "epoch": 0.9616593850802335, + "grad_norm": 0.6904886364936829, + "learning_rate": 3.7209220451313033e-07, + "loss": 1.1691, + "step": 2944 + }, + { + "epoch": 0.9619860356865787, + "grad_norm": 0.757783055305481, + "learning_rate": 3.6576656904940274e-07, + "loss": 1.0155, + "step": 2945 + }, + { + "epoch": 0.9623126862929239, + "grad_norm": 0.9803491830825806, + "learning_rate": 3.594949666395342e-07, + "loss": 1.3613, + "step": 2946 + }, + { + "epoch": 0.9626393368992692, + "grad_norm": 1.1308027505874634, + "learning_rate": 3.5327740411094655e-07, + "loss": 1.3419, + "step": 2947 + }, + { + "epoch": 0.9629659875056144, + "grad_norm": 1.2531534433364868, + "learning_rate": 3.4711388823223114e-07, + "loss": 1.4717, + "step": 2948 + }, + { + "epoch": 0.9632926381119595, + "grad_norm": 1.2751611471176147, + "learning_rate": 3.4100442571313727e-07, + "loss": 1.3097, + "step": 2949 + }, + { + "epoch": 0.9636192887183047, + "grad_norm": 1.4386547803878784, + "learning_rate": 3.3494902320458934e-07, + "loss": 1.4562, + "step": 2950 + }, + { + "epoch": 0.9639459393246499, + "grad_norm": 0.20834031701087952, + "learning_rate": 3.2894768729863636e-07, + "loss": 0.7, + "step": 2951 + }, + { + "epoch": 0.9642725899309951, + "grad_norm": 0.2570595145225525, + "learning_rate": 3.230004245284912e-07, + "loss": 0.8549, + "step": 2952 + }, + { + "epoch": 0.9645992405373403, + "grad_norm": 0.25834420323371887, + "learning_rate": 3.1710724136848613e-07, + "loss": 0.7985, + "step": 2953 + }, + { + "epoch": 0.9649258911436854, + "grad_norm": 0.2837374806404114, + "learning_rate": 3.112681442340837e-07, + "loss": 0.7945, + "step": 2954 + }, + { + "epoch": 0.9652525417500306, + "grad_norm": 0.2851658761501312, + "learning_rate": 3.0548313948187134e-07, + "loss": 0.8037, + "step": 2955 + }, + { + "epoch": 0.9655791923563758, + "grad_norm": 0.3020305037498474, + "learning_rate": 2.997522334095504e-07, + "loss": 0.8115, + "step": 2956 + }, + { + "epoch": 0.965905842962721, + "grad_norm": 0.30819737911224365, + "learning_rate": 2.940754322559247e-07, + "loss": 0.8555, + "step": 2957 + }, + { + "epoch": 0.9662324935690662, + "grad_norm": 0.31193971633911133, + "learning_rate": 2.8845274220090093e-07, + "loss": 0.8179, + "step": 2958 + }, + { + "epoch": 0.9665591441754113, + "grad_norm": 0.31887295842170715, + "learning_rate": 2.828841693654771e-07, + "loss": 0.8089, + "step": 2959 + }, + { + "epoch": 0.9668857947817565, + "grad_norm": 0.34238582849502563, + "learning_rate": 2.773697198117431e-07, + "loss": 0.9042, + "step": 2960 + }, + { + "epoch": 0.9672124453881018, + "grad_norm": 0.36260437965393066, + "learning_rate": 2.719093995428634e-07, + "loss": 0.9795, + "step": 2961 + }, + { + "epoch": 0.967539095994447, + "grad_norm": 0.36125776171684265, + "learning_rate": 2.6650321450307216e-07, + "loss": 0.8816, + "step": 2962 + }, + { + "epoch": 0.9678657466007922, + "grad_norm": 0.37313613295555115, + "learning_rate": 2.611511705776892e-07, + "loss": 0.8222, + "step": 2963 + }, + { + "epoch": 0.9681923972071373, + "grad_norm": 0.40587756037712097, + "learning_rate": 2.558532735930652e-07, + "loss": 0.9125, + "step": 2964 + }, + { + "epoch": 0.9685190478134825, + "grad_norm": 0.45703062415122986, + "learning_rate": 2.506095293166366e-07, + "loss": 0.9605, + "step": 2965 + }, + { + "epoch": 0.9688456984198277, + "grad_norm": 0.46383294463157654, + "learning_rate": 2.4541994345686494e-07, + "loss": 0.9941, + "step": 2966 + }, + { + "epoch": 0.9691723490261729, + "grad_norm": 0.5314494371414185, + "learning_rate": 2.402845216632643e-07, + "loss": 1.0825, + "step": 2967 + }, + { + "epoch": 0.9694989996325181, + "grad_norm": 0.5420945882797241, + "learning_rate": 2.3520326952638504e-07, + "loss": 1.0171, + "step": 2968 + }, + { + "epoch": 0.9698256502388632, + "grad_norm": 0.5639206767082214, + "learning_rate": 2.301761925777912e-07, + "loss": 1.0999, + "step": 2969 + }, + { + "epoch": 0.9701523008452084, + "grad_norm": 0.6383841037750244, + "learning_rate": 2.2520329629009962e-07, + "loss": 1.1892, + "step": 2970 + }, + { + "epoch": 0.9704789514515536, + "grad_norm": 0.7161938548088074, + "learning_rate": 2.2028458607691872e-07, + "loss": 1.1297, + "step": 2971 + }, + { + "epoch": 0.9708056020578988, + "grad_norm": 0.9672861695289612, + "learning_rate": 2.1542006729287634e-07, + "loss": 1.3065, + "step": 2972 + }, + { + "epoch": 0.971132252664244, + "grad_norm": 1.2224022150039673, + "learning_rate": 2.106097452336142e-07, + "loss": 1.2359, + "step": 2973 + }, + { + "epoch": 0.9714589032705891, + "grad_norm": 1.3247113227844238, + "learning_rate": 2.058536251357601e-07, + "loss": 1.5186, + "step": 2974 + }, + { + "epoch": 0.9717855538769344, + "grad_norm": 1.7058521509170532, + "learning_rate": 2.0115171217695573e-07, + "loss": 1.5777, + "step": 2975 + }, + { + "epoch": 0.9721122044832796, + "grad_norm": 0.22474831342697144, + "learning_rate": 1.9650401147580655e-07, + "loss": 0.7176, + "step": 2976 + }, + { + "epoch": 0.9724388550896248, + "grad_norm": 0.2482801228761673, + "learning_rate": 1.9191052809192643e-07, + "loss": 0.745, + "step": 2977 + }, + { + "epoch": 0.97276550569597, + "grad_norm": 0.2572484016418457, + "learning_rate": 1.8737126702588202e-07, + "loss": 0.8129, + "step": 2978 + }, + { + "epoch": 0.9730921563023152, + "grad_norm": 0.269260436296463, + "learning_rate": 1.8288623321924268e-07, + "loss": 0.801, + "step": 2979 + }, + { + "epoch": 0.9734188069086603, + "grad_norm": 0.2986462712287903, + "learning_rate": 1.784554315545084e-07, + "loss": 0.8864, + "step": 2980 + }, + { + "epoch": 0.9737454575150055, + "grad_norm": 0.302898108959198, + "learning_rate": 1.740788668551707e-07, + "loss": 0.8545, + "step": 2981 + }, + { + "epoch": 0.9740721081213507, + "grad_norm": 0.30456167459487915, + "learning_rate": 1.6975654388566298e-07, + "loss": 0.8325, + "step": 2982 + }, + { + "epoch": 0.9743987587276959, + "grad_norm": 0.3240984380245209, + "learning_rate": 1.654884673513768e-07, + "loss": 0.866, + "step": 2983 + }, + { + "epoch": 0.9747254093340411, + "grad_norm": 0.31887489557266235, + "learning_rate": 1.6127464189863995e-07, + "loss": 0.8524, + "step": 2984 + }, + { + "epoch": 0.9750520599403862, + "grad_norm": 0.3447200655937195, + "learning_rate": 1.5711507211472742e-07, + "loss": 0.8951, + "step": 2985 + }, + { + "epoch": 0.9753787105467314, + "grad_norm": 0.32973796129226685, + "learning_rate": 1.5300976252785593e-07, + "loss": 0.8277, + "step": 2986 + }, + { + "epoch": 0.9757053611530766, + "grad_norm": 0.38608047366142273, + "learning_rate": 1.4895871760716162e-07, + "loss": 0.9188, + "step": 2987 + }, + { + "epoch": 0.9760320117594218, + "grad_norm": 0.3714994788169861, + "learning_rate": 1.4496194176271126e-07, + "loss": 0.9459, + "step": 2988 + }, + { + "epoch": 0.9763586623657671, + "grad_norm": 0.4215068221092224, + "learning_rate": 1.4101943934549667e-07, + "loss": 0.871, + "step": 2989 + }, + { + "epoch": 0.9766853129721123, + "grad_norm": 0.4134935438632965, + "learning_rate": 1.371312146474235e-07, + "loss": 0.8685, + "step": 2990 + }, + { + "epoch": 0.9770119635784574, + "grad_norm": 0.47548481822013855, + "learning_rate": 1.3329727190130593e-07, + "loss": 0.9058, + "step": 2991 + }, + { + "epoch": 0.9773386141848026, + "grad_norm": 0.49817779660224915, + "learning_rate": 1.2951761528087746e-07, + "loss": 0.9534, + "step": 2992 + }, + { + "epoch": 0.9776652647911478, + "grad_norm": 0.5390109419822693, + "learning_rate": 1.2579224890075235e-07, + "loss": 1.0624, + "step": 2993 + }, + { + "epoch": 0.977991915397493, + "grad_norm": 0.5876456499099731, + "learning_rate": 1.2212117681646984e-07, + "loss": 1.0434, + "step": 2994 + }, + { + "epoch": 0.9783185660038382, + "grad_norm": 0.8520432114601135, + "learning_rate": 1.1850440302444421e-07, + "loss": 1.1705, + "step": 2995 + }, + { + "epoch": 0.9786452166101833, + "grad_norm": 0.8995637893676758, + "learning_rate": 1.1494193146198151e-07, + "loss": 1.3287, + "step": 2996 + }, + { + "epoch": 0.9789718672165285, + "grad_norm": 0.9267019033432007, + "learning_rate": 1.1143376600727951e-07, + "loss": 1.322, + "step": 2997 + }, + { + "epoch": 0.9792985178228737, + "grad_norm": 1.0994635820388794, + "learning_rate": 1.0797991047941102e-07, + "loss": 1.5212, + "step": 2998 + }, + { + "epoch": 0.9796251684292189, + "grad_norm": 1.2156322002410889, + "learning_rate": 1.0458036863834064e-07, + "loss": 1.5812, + "step": 2999 + }, + { + "epoch": 0.979951819035564, + "grad_norm": 1.8230586051940918, + "learning_rate": 1.0123514418487468e-07, + "loss": 1.9067, + "step": 3000 + }, + { + "epoch": 0.9802784696419092, + "grad_norm": 0.22708569467067719, + "learning_rate": 9.794424076072228e-08, + "loss": 0.813, + "step": 3001 + }, + { + "epoch": 0.9806051202482544, + "grad_norm": 0.24345462024211884, + "learning_rate": 9.470766194843439e-08, + "loss": 0.8086, + "step": 3002 + }, + { + "epoch": 0.9809317708545996, + "grad_norm": 0.2802831530570984, + "learning_rate": 9.152541127143699e-08, + "loss": 0.8417, + "step": 3003 + }, + { + "epoch": 0.9812584214609449, + "grad_norm": 0.27346834540367126, + "learning_rate": 8.839749219399784e-08, + "loss": 0.9053, + "step": 3004 + }, + { + "epoch": 0.9815850720672901, + "grad_norm": 0.2842161953449249, + "learning_rate": 8.532390812125424e-08, + "loss": 0.8295, + "step": 3005 + }, + { + "epoch": 0.9819117226736352, + "grad_norm": 0.29227709770202637, + "learning_rate": 8.230466239918521e-08, + "loss": 0.8382, + "step": 3006 + }, + { + "epoch": 0.9822383732799804, + "grad_norm": 0.3029593527317047, + "learning_rate": 7.933975831461716e-08, + "loss": 0.8816, + "step": 3007 + }, + { + "epoch": 0.9825650238863256, + "grad_norm": 0.31089669466018677, + "learning_rate": 7.642919909521263e-08, + "loss": 0.8376, + "step": 3008 + }, + { + "epoch": 0.9828916744926708, + "grad_norm": 0.31294307112693787, + "learning_rate": 7.357298790948152e-08, + "loss": 0.8148, + "step": 3009 + }, + { + "epoch": 0.983218325099016, + "grad_norm": 0.33494314551353455, + "learning_rate": 7.077112786676444e-08, + "loss": 0.7742, + "step": 3010 + }, + { + "epoch": 0.9835449757053611, + "grad_norm": 0.3963795006275177, + "learning_rate": 6.80236220172381e-08, + "loss": 0.88, + "step": 3011 + }, + { + "epoch": 0.9838716263117063, + "grad_norm": 0.40085482597351074, + "learning_rate": 6.533047335190446e-08, + "loss": 0.8962, + "step": 3012 + }, + { + "epoch": 0.9841982769180515, + "grad_norm": 0.4059949815273285, + "learning_rate": 6.269168480259602e-08, + "loss": 0.9012, + "step": 3013 + }, + { + "epoch": 0.9845249275243967, + "grad_norm": 0.4352096915245056, + "learning_rate": 6.010725924195382e-08, + "loss": 1.023, + "step": 3014 + }, + { + "epoch": 0.9848515781307419, + "grad_norm": 0.49773815274238586, + "learning_rate": 5.757719948344953e-08, + "loss": 0.9623, + "step": 3015 + }, + { + "epoch": 0.985178228737087, + "grad_norm": 0.4756074845790863, + "learning_rate": 5.5101508281379946e-08, + "loss": 0.9922, + "step": 3016 + }, + { + "epoch": 0.9855048793434322, + "grad_norm": 0.5341835021972656, + "learning_rate": 5.2680188330828106e-08, + "loss": 0.9406, + "step": 3017 + }, + { + "epoch": 0.9858315299497775, + "grad_norm": 0.5826855301856995, + "learning_rate": 5.031324226771328e-08, + "loss": 1.1138, + "step": 3018 + }, + { + "epoch": 0.9861581805561227, + "grad_norm": 0.6295057535171509, + "learning_rate": 4.800067266874653e-08, + "loss": 1.079, + "step": 3019 + }, + { + "epoch": 0.9864848311624679, + "grad_norm": 0.6822542548179626, + "learning_rate": 4.5742482051452925e-08, + "loss": 1.3472, + "step": 3020 + }, + { + "epoch": 0.9868114817688131, + "grad_norm": 0.7802051305770874, + "learning_rate": 4.353867287414937e-08, + "loss": 0.8963, + "step": 3021 + }, + { + "epoch": 0.9871381323751582, + "grad_norm": 0.9883480072021484, + "learning_rate": 4.1389247535961186e-08, + "loss": 1.3332, + "step": 3022 + }, + { + "epoch": 0.9874647829815034, + "grad_norm": 1.0861750841140747, + "learning_rate": 3.929420837679998e-08, + "loss": 1.2043, + "step": 3023 + }, + { + "epoch": 0.9877914335878486, + "grad_norm": 1.2433277368545532, + "learning_rate": 3.7253557677385805e-08, + "loss": 1.5074, + "step": 3024 + }, + { + "epoch": 0.9881180841941938, + "grad_norm": 1.6619473695755005, + "learning_rate": 3.5267297659213886e-08, + "loss": 1.6274, + "step": 3025 + }, + { + "epoch": 0.988444734800539, + "grad_norm": 0.20633414387702942, + "learning_rate": 3.333543048458232e-08, + "loss": 0.6526, + "step": 3026 + }, + { + "epoch": 0.9887713854068841, + "grad_norm": 0.2510916590690613, + "learning_rate": 3.145795825656439e-08, + "loss": 0.7986, + "step": 3027 + }, + { + "epoch": 0.9890980360132293, + "grad_norm": 0.2619028091430664, + "learning_rate": 2.9634883019025173e-08, + "loss": 0.8371, + "step": 3028 + }, + { + "epoch": 0.9894246866195745, + "grad_norm": 0.2645626664161682, + "learning_rate": 2.7866206756610445e-08, + "loss": 0.7909, + "step": 3029 + }, + { + "epoch": 0.9897513372259197, + "grad_norm": 0.28736555576324463, + "learning_rate": 2.6151931394741146e-08, + "loss": 0.843, + "step": 3030 + }, + { + "epoch": 0.9900779878322649, + "grad_norm": 0.28967446088790894, + "learning_rate": 2.449205879962446e-08, + "loss": 0.8462, + "step": 3031 + }, + { + "epoch": 0.9904046384386102, + "grad_norm": 0.3020203709602356, + "learning_rate": 2.288659077823163e-08, + "loss": 0.7806, + "step": 3032 + }, + { + "epoch": 0.9907312890449553, + "grad_norm": 0.33225804567337036, + "learning_rate": 2.13355290783257e-08, + "loss": 0.8409, + "step": 3033 + }, + { + "epoch": 0.9910579396513005, + "grad_norm": 0.3261914849281311, + "learning_rate": 1.983887538842266e-08, + "loss": 0.8474, + "step": 3034 + }, + { + "epoch": 0.9913845902576457, + "grad_norm": 0.32698002457618713, + "learning_rate": 1.8396631337813664e-08, + "loss": 0.8514, + "step": 3035 + }, + { + "epoch": 0.9917112408639909, + "grad_norm": 0.32505884766578674, + "learning_rate": 1.7008798496570556e-08, + "loss": 0.8421, + "step": 3036 + }, + { + "epoch": 0.992037891470336, + "grad_norm": 0.3715566098690033, + "learning_rate": 1.5675378375518135e-08, + "loss": 0.8502, + "step": 3037 + }, + { + "epoch": 0.9923645420766812, + "grad_norm": 0.38677921891212463, + "learning_rate": 1.4396372426250803e-08, + "loss": 0.8488, + "step": 3038 + }, + { + "epoch": 0.9926911926830264, + "grad_norm": 0.4004034996032715, + "learning_rate": 1.3171782041127012e-08, + "loss": 0.9237, + "step": 3039 + }, + { + "epoch": 0.9930178432893716, + "grad_norm": 0.4869138300418854, + "learning_rate": 1.200160855326371e-08, + "loss": 1.0046, + "step": 3040 + }, + { + "epoch": 0.9933444938957168, + "grad_norm": 0.4786950349807739, + "learning_rate": 1.0885853236541898e-08, + "loss": 0.9855, + "step": 3041 + }, + { + "epoch": 0.993671144502062, + "grad_norm": 0.5063715577125549, + "learning_rate": 9.824517305601077e-09, + "loss": 1.0504, + "step": 3042 + }, + { + "epoch": 0.9939977951084071, + "grad_norm": 0.573488712310791, + "learning_rate": 8.817601915839247e-09, + "loss": 1.0409, + "step": 3043 + }, + { + "epoch": 0.9943244457147523, + "grad_norm": 0.5906215310096741, + "learning_rate": 7.865108163407354e-09, + "loss": 1.032, + "step": 3044 + }, + { + "epoch": 0.9946510963210975, + "grad_norm": 0.5928130745887756, + "learning_rate": 6.967037085209293e-09, + "loss": 1.0514, + "step": 3045 + }, + { + "epoch": 0.9949777469274428, + "grad_norm": 0.7858253121376038, + "learning_rate": 6.123389658913015e-09, + "loss": 1.001, + "step": 3046 + }, + { + "epoch": 0.995304397533788, + "grad_norm": 0.9497416615486145, + "learning_rate": 5.33416680293386e-09, + "loss": 1.3928, + "step": 3047 + }, + { + "epoch": 0.9956310481401331, + "grad_norm": 1.2982540130615234, + "learning_rate": 4.599369376440122e-09, + "loss": 1.5056, + "step": 3048 + }, + { + "epoch": 0.9959576987464783, + "grad_norm": 1.4778263568878174, + "learning_rate": 3.9189981793474885e-09, + "loss": 1.7437, + "step": 3049 + }, + { + "epoch": 0.9962843493528235, + "grad_norm": 1.6468915939331055, + "learning_rate": 3.2930539523245983e-09, + "loss": 1.8305, + "step": 3050 + }, + { + "epoch": 0.9966109999591687, + "grad_norm": 0.2135321944952011, + "learning_rate": 2.7215373767930373e-09, + "loss": 0.7451, + "step": 3051 + }, + { + "epoch": 0.9969376505655139, + "grad_norm": 0.2706100046634674, + "learning_rate": 2.204449074916237e-09, + "loss": 0.7944, + "step": 3052 + }, + { + "epoch": 0.997264301171859, + "grad_norm": 0.28801220655441284, + "learning_rate": 1.741789609610578e-09, + "loss": 0.8088, + "step": 3053 + }, + { + "epoch": 0.9975909517782042, + "grad_norm": 0.3120700716972351, + "learning_rate": 1.3335594845453881e-09, + "loss": 0.8261, + "step": 3054 + }, + { + "epoch": 0.9979176023845494, + "grad_norm": 0.3604918122291565, + "learning_rate": 9.797591441151888e-10, + "loss": 0.8993, + "step": 3055 + }, + { + "epoch": 0.9982442529908946, + "grad_norm": 0.435921847820282, + "learning_rate": 6.803889734896541e-10, + "loss": 0.9562, + "step": 3056 + }, + { + "epoch": 0.9985709035972398, + "grad_norm": 0.528087317943573, + "learning_rate": 4.3544929856365046e-10, + "loss": 1.0983, + "step": 3057 + }, + { + "epoch": 0.998897554203585, + "grad_norm": 0.7063041925430298, + "learning_rate": 2.44940385984993e-10, + "loss": 1.25, + "step": 3058 + }, + { + "epoch": 0.9992242048099301, + "grad_norm": 0.9369118213653564, + "learning_rate": 1.0886244314889382e-10, + "loss": 1.2773, + "step": 3059 + }, + { + "epoch": 0.9995508554162754, + "grad_norm": 1.196264624595642, + "learning_rate": 2.721561819241103e-11, + "loss": 1.6142, + "step": 3060 + }, + { + "epoch": 0.9998775060226206, + "grad_norm": 1.3984349966049194, + "learning_rate": 0.0, + "loss": 1.7651, + "step": 3061 } ], "logging_steps": 1, @@ -16139,12 +21480,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 1.5133624248230216e+18, + "total_flos": 2.0158272316658156e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null