{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1563, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003198976327575176, "grad_norm": 2.5142228603363037, "learning_rate": 0.00019936020473448497, "loss": 2.2007, "step": 5 }, { "epoch": 0.006397952655150352, "grad_norm": 0.7006776332855225, "learning_rate": 0.00019872040946896996, "loss": 0.461, "step": 10 }, { "epoch": 0.009596928982725527, "grad_norm": 0.6147411465644836, "learning_rate": 0.0001980806142034549, "loss": 0.3499, "step": 15 }, { "epoch": 0.012795905310300703, "grad_norm": 0.4334537386894226, "learning_rate": 0.00019744081893793988, "loss": 0.3154, "step": 20 }, { "epoch": 0.01599488163787588, "grad_norm": 0.5372670292854309, "learning_rate": 0.00019680102367242483, "loss": 0.2982, "step": 25 }, { "epoch": 0.019193857965451054, "grad_norm": 0.4113191068172455, "learning_rate": 0.0001961612284069098, "loss": 0.2751, "step": 30 }, { "epoch": 0.022392834293026232, "grad_norm": 0.3651261627674103, "learning_rate": 0.00019552143314139478, "loss": 0.279, "step": 35 }, { "epoch": 0.025591810620601407, "grad_norm": 0.4230581820011139, "learning_rate": 0.00019488163787587971, "loss": 0.248, "step": 40 }, { "epoch": 0.028790786948176585, "grad_norm": 0.39593610167503357, "learning_rate": 0.0001942418426103647, "loss": 0.2902, "step": 45 }, { "epoch": 0.03198976327575176, "grad_norm": 0.41984233260154724, "learning_rate": 0.00019360204734484966, "loss": 0.2932, "step": 50 }, { "epoch": 0.035188739603326934, "grad_norm": 0.26596859097480774, "learning_rate": 0.00019296225207933462, "loss": 0.2375, "step": 55 }, { "epoch": 0.03838771593090211, "grad_norm": 0.44939491152763367, "learning_rate": 0.0001923224568138196, "loss": 0.2794, "step": 60 }, { "epoch": 0.04158669225847729, "grad_norm": 0.3508070111274719, "learning_rate": 0.00019168266154830454, "loss": 0.2787, "step": 65 }, { "epoch": 0.044785668586052464, "grad_norm": 0.40634146332740784, "learning_rate": 0.0001910428662827895, "loss": 0.2767, "step": 70 }, { "epoch": 0.04798464491362764, "grad_norm": 0.6950023770332336, "learning_rate": 0.00019040307101727449, "loss": 0.2776, "step": 75 }, { "epoch": 0.05118362124120281, "grad_norm": 0.34798476099967957, "learning_rate": 0.00018976327575175944, "loss": 0.2492, "step": 80 }, { "epoch": 0.05438259756877799, "grad_norm": 0.30487826466560364, "learning_rate": 0.00018912348048624443, "loss": 0.2344, "step": 85 }, { "epoch": 0.05758157389635317, "grad_norm": 0.2966866195201874, "learning_rate": 0.00018848368522072936, "loss": 0.2717, "step": 90 }, { "epoch": 0.060780550223928344, "grad_norm": 0.41749754548072815, "learning_rate": 0.00018784388995521432, "loss": 0.2855, "step": 95 }, { "epoch": 0.06397952655150352, "grad_norm": 0.3517999053001404, "learning_rate": 0.0001872040946896993, "loss": 0.2537, "step": 100 }, { "epoch": 0.0671785028790787, "grad_norm": 0.3616477847099304, "learning_rate": 0.00018656429942418427, "loss": 0.2548, "step": 105 }, { "epoch": 0.07037747920665387, "grad_norm": 0.3890167772769928, "learning_rate": 0.00018592450415866926, "loss": 0.2678, "step": 110 }, { "epoch": 0.07357645553422905, "grad_norm": 0.35902726650238037, "learning_rate": 0.0001852847088931542, "loss": 0.2751, "step": 115 }, { "epoch": 0.07677543186180422, "grad_norm": 0.3401743471622467, "learning_rate": 0.00018464491362763915, "loss": 0.2696, "step": 120 }, { "epoch": 0.0799744081893794, "grad_norm": 0.2922796905040741, "learning_rate": 0.00018400511836212414, "loss": 0.2326, "step": 125 }, { "epoch": 0.08317338451695458, "grad_norm": 0.3094709515571594, "learning_rate": 0.0001833653230966091, "loss": 0.2393, "step": 130 }, { "epoch": 0.08637236084452975, "grad_norm": 0.2731708884239197, "learning_rate": 0.00018272552783109406, "loss": 0.2446, "step": 135 }, { "epoch": 0.08957133717210493, "grad_norm": 0.281548410654068, "learning_rate": 0.00018208573256557901, "loss": 0.2739, "step": 140 }, { "epoch": 0.0927703134996801, "grad_norm": 0.31158748269081116, "learning_rate": 0.00018144593730006397, "loss": 0.2562, "step": 145 }, { "epoch": 0.09596928982725528, "grad_norm": 0.31344011425971985, "learning_rate": 0.00018080614203454896, "loss": 0.2457, "step": 150 }, { "epoch": 0.09916826615483046, "grad_norm": 0.25769639015197754, "learning_rate": 0.00018016634676903392, "loss": 0.2413, "step": 155 }, { "epoch": 0.10236724248240563, "grad_norm": 0.3215169310569763, "learning_rate": 0.00017952655150351888, "loss": 0.2587, "step": 160 }, { "epoch": 0.10556621880998081, "grad_norm": 0.31406131386756897, "learning_rate": 0.00017888675623800384, "loss": 0.252, "step": 165 }, { "epoch": 0.10876519513755598, "grad_norm": 0.2920863926410675, "learning_rate": 0.0001782469609724888, "loss": 0.2469, "step": 170 }, { "epoch": 0.11196417146513116, "grad_norm": 0.32624512910842896, "learning_rate": 0.00017760716570697379, "loss": 0.2543, "step": 175 }, { "epoch": 0.11516314779270634, "grad_norm": 0.3013479709625244, "learning_rate": 0.00017696737044145875, "loss": 0.2662, "step": 180 }, { "epoch": 0.1183621241202815, "grad_norm": 0.2553490102291107, "learning_rate": 0.0001763275751759437, "loss": 0.2479, "step": 185 }, { "epoch": 0.12156110044785669, "grad_norm": 0.3384646773338318, "learning_rate": 0.00017568777991042867, "loss": 0.2979, "step": 190 }, { "epoch": 0.12476007677543186, "grad_norm": 0.4001105725765228, "learning_rate": 0.00017504798464491362, "loss": 0.2629, "step": 195 }, { "epoch": 0.12795905310300704, "grad_norm": 0.3194480836391449, "learning_rate": 0.0001744081893793986, "loss": 0.2489, "step": 200 }, { "epoch": 0.13115802943058222, "grad_norm": 0.38252341747283936, "learning_rate": 0.00017376839411388357, "loss": 0.2356, "step": 205 }, { "epoch": 0.1343570057581574, "grad_norm": 0.31561562418937683, "learning_rate": 0.00017312859884836853, "loss": 0.2581, "step": 210 }, { "epoch": 0.13755598208573255, "grad_norm": 0.2712710499763489, "learning_rate": 0.0001724888035828535, "loss": 0.2577, "step": 215 }, { "epoch": 0.14075495841330773, "grad_norm": 0.3084522485733032, "learning_rate": 0.00017184900831733845, "loss": 0.2304, "step": 220 }, { "epoch": 0.14395393474088292, "grad_norm": 0.2699475884437561, "learning_rate": 0.00017120921305182344, "loss": 0.2685, "step": 225 }, { "epoch": 0.1471529110684581, "grad_norm": 0.3236371874809265, "learning_rate": 0.0001705694177863084, "loss": 0.2588, "step": 230 }, { "epoch": 0.15035188739603328, "grad_norm": 0.28885379433631897, "learning_rate": 0.00016992962252079336, "loss": 0.2574, "step": 235 }, { "epoch": 0.15355086372360843, "grad_norm": 0.3263382315635681, "learning_rate": 0.00016928982725527832, "loss": 0.2649, "step": 240 }, { "epoch": 0.15674984005118361, "grad_norm": 0.2647656500339508, "learning_rate": 0.00016865003198976328, "loss": 0.2522, "step": 245 }, { "epoch": 0.1599488163787588, "grad_norm": 0.2796100676059723, "learning_rate": 0.00016801023672424826, "loss": 0.2255, "step": 250 }, { "epoch": 0.16314779270633398, "grad_norm": 0.32718002796173096, "learning_rate": 0.00016737044145873322, "loss": 0.2623, "step": 255 }, { "epoch": 0.16634676903390916, "grad_norm": 0.34518498182296753, "learning_rate": 0.00016673064619321818, "loss": 0.2452, "step": 260 }, { "epoch": 0.1695457453614843, "grad_norm": 0.3195638358592987, "learning_rate": 0.00016609085092770314, "loss": 0.2638, "step": 265 }, { "epoch": 0.1727447216890595, "grad_norm": 0.2617906928062439, "learning_rate": 0.0001654510556621881, "loss": 0.2294, "step": 270 }, { "epoch": 0.17594369801663468, "grad_norm": 0.28182393312454224, "learning_rate": 0.00016481126039667306, "loss": 0.2611, "step": 275 }, { "epoch": 0.17914267434420986, "grad_norm": 0.24791140854358673, "learning_rate": 0.00016417146513115805, "loss": 0.2178, "step": 280 }, { "epoch": 0.18234165067178504, "grad_norm": 0.2434699386358261, "learning_rate": 0.000163531669865643, "loss": 0.2396, "step": 285 }, { "epoch": 0.1855406269993602, "grad_norm": 0.3352366089820862, "learning_rate": 0.00016289187460012797, "loss": 0.2368, "step": 290 }, { "epoch": 0.18873960332693537, "grad_norm": 0.32391369342803955, "learning_rate": 0.00016225207933461293, "loss": 0.2575, "step": 295 }, { "epoch": 0.19193857965451055, "grad_norm": 0.33067309856414795, "learning_rate": 0.00016161228406909789, "loss": 0.2726, "step": 300 }, { "epoch": 0.19513755598208574, "grad_norm": 0.3429821729660034, "learning_rate": 0.00016097248880358287, "loss": 0.2507, "step": 305 }, { "epoch": 0.19833653230966092, "grad_norm": 0.3166626989841461, "learning_rate": 0.00016033269353806783, "loss": 0.2474, "step": 310 }, { "epoch": 0.20153550863723607, "grad_norm": 0.2972988784313202, "learning_rate": 0.0001596928982725528, "loss": 0.2361, "step": 315 }, { "epoch": 0.20473448496481125, "grad_norm": 0.2758512794971466, "learning_rate": 0.00015905310300703775, "loss": 0.2266, "step": 320 }, { "epoch": 0.20793346129238643, "grad_norm": 0.2929127812385559, "learning_rate": 0.0001584133077415227, "loss": 0.2514, "step": 325 }, { "epoch": 0.21113243761996162, "grad_norm": 0.3643186092376709, "learning_rate": 0.0001577735124760077, "loss": 0.2628, "step": 330 }, { "epoch": 0.2143314139475368, "grad_norm": 0.3119208514690399, "learning_rate": 0.00015713371721049266, "loss": 0.2446, "step": 335 }, { "epoch": 0.21753039027511195, "grad_norm": 0.2954186201095581, "learning_rate": 0.00015649392194497762, "loss": 0.248, "step": 340 }, { "epoch": 0.22072936660268713, "grad_norm": 0.28279921412467957, "learning_rate": 0.00015585412667946258, "loss": 0.2256, "step": 345 }, { "epoch": 0.22392834293026231, "grad_norm": 0.28081214427948, "learning_rate": 0.00015521433141394754, "loss": 0.2481, "step": 350 }, { "epoch": 0.2271273192578375, "grad_norm": 0.36415377259254456, "learning_rate": 0.00015457453614843252, "loss": 0.2577, "step": 355 }, { "epoch": 0.23032629558541268, "grad_norm": 0.3519168794155121, "learning_rate": 0.00015393474088291748, "loss": 0.2504, "step": 360 }, { "epoch": 0.23352527191298783, "grad_norm": 0.3644118309020996, "learning_rate": 0.00015329494561740244, "loss": 0.2453, "step": 365 }, { "epoch": 0.236724248240563, "grad_norm": 0.2938327491283417, "learning_rate": 0.0001526551503518874, "loss": 0.2378, "step": 370 }, { "epoch": 0.2399232245681382, "grad_norm": 0.25993379950523376, "learning_rate": 0.00015201535508637236, "loss": 0.2387, "step": 375 }, { "epoch": 0.24312220089571338, "grad_norm": 0.2894437313079834, "learning_rate": 0.00015137555982085735, "loss": 0.2524, "step": 380 }, { "epoch": 0.24632117722328856, "grad_norm": 0.29153236746788025, "learning_rate": 0.0001507357645553423, "loss": 0.2351, "step": 385 }, { "epoch": 0.2495201535508637, "grad_norm": 0.2899223268032074, "learning_rate": 0.00015009596928982727, "loss": 0.2493, "step": 390 }, { "epoch": 0.2527191298784389, "grad_norm": 0.27078118920326233, "learning_rate": 0.00014945617402431223, "loss": 0.2543, "step": 395 }, { "epoch": 0.2559181062060141, "grad_norm": 0.23355244100093842, "learning_rate": 0.0001488163787587972, "loss": 0.232, "step": 400 }, { "epoch": 0.2591170825335892, "grad_norm": 0.29392296075820923, "learning_rate": 0.00014817658349328217, "loss": 0.2356, "step": 405 }, { "epoch": 0.26231605886116444, "grad_norm": 0.3395833969116211, "learning_rate": 0.00014753678822776713, "loss": 0.2498, "step": 410 }, { "epoch": 0.2655150351887396, "grad_norm": 0.2617484927177429, "learning_rate": 0.00014689699296225207, "loss": 0.2239, "step": 415 }, { "epoch": 0.2687140115163148, "grad_norm": 0.29424986243247986, "learning_rate": 0.00014625719769673705, "loss": 0.2602, "step": 420 }, { "epoch": 0.27191298784388995, "grad_norm": 0.251095175743103, "learning_rate": 0.000145617402431222, "loss": 0.2113, "step": 425 }, { "epoch": 0.2751119641714651, "grad_norm": 0.31057092547416687, "learning_rate": 0.000144977607165707, "loss": 0.2388, "step": 430 }, { "epoch": 0.2783109404990403, "grad_norm": 0.23010189831256866, "learning_rate": 0.00014433781190019196, "loss": 0.2294, "step": 435 }, { "epoch": 0.28150991682661547, "grad_norm": 0.28975558280944824, "learning_rate": 0.0001436980166346769, "loss": 0.2433, "step": 440 }, { "epoch": 0.2847088931541907, "grad_norm": 0.306234747171402, "learning_rate": 0.00014305822136916188, "loss": 0.2715, "step": 445 }, { "epoch": 0.28790786948176583, "grad_norm": 0.3650927245616913, "learning_rate": 0.00014241842610364684, "loss": 0.2577, "step": 450 }, { "epoch": 0.291106845809341, "grad_norm": 0.3464455306529999, "learning_rate": 0.0001417786308381318, "loss": 0.2514, "step": 455 }, { "epoch": 0.2943058221369162, "grad_norm": 0.3354053199291229, "learning_rate": 0.00014113883557261678, "loss": 0.2479, "step": 460 }, { "epoch": 0.29750479846449135, "grad_norm": 0.27853867411613464, "learning_rate": 0.00014049904030710172, "loss": 0.2454, "step": 465 }, { "epoch": 0.30070377479206656, "grad_norm": 0.29874446988105774, "learning_rate": 0.0001398592450415867, "loss": 0.2271, "step": 470 }, { "epoch": 0.3039027511196417, "grad_norm": 0.30210456252098083, "learning_rate": 0.00013921944977607166, "loss": 0.2532, "step": 475 }, { "epoch": 0.30710172744721687, "grad_norm": 0.2790578305721283, "learning_rate": 0.00013857965451055662, "loss": 0.2254, "step": 480 }, { "epoch": 0.3103007037747921, "grad_norm": 0.28601694107055664, "learning_rate": 0.0001379398592450416, "loss": 0.2396, "step": 485 }, { "epoch": 0.31349968010236723, "grad_norm": 0.2820374667644501, "learning_rate": 0.00013730006397952654, "loss": 0.2405, "step": 490 }, { "epoch": 0.31669865642994244, "grad_norm": 0.48833322525024414, "learning_rate": 0.00013666026871401153, "loss": 0.2405, "step": 495 }, { "epoch": 0.3198976327575176, "grad_norm": 0.30562731623649597, "learning_rate": 0.0001360204734484965, "loss": 0.2486, "step": 500 }, { "epoch": 0.32309660908509275, "grad_norm": 0.43254855275154114, "learning_rate": 0.00013538067818298145, "loss": 0.2541, "step": 505 }, { "epoch": 0.32629558541266795, "grad_norm": 0.29583895206451416, "learning_rate": 0.00013474088291746643, "loss": 0.2442, "step": 510 }, { "epoch": 0.3294945617402431, "grad_norm": 0.3376995325088501, "learning_rate": 0.00013410108765195137, "loss": 0.2484, "step": 515 }, { "epoch": 0.3326935380678183, "grad_norm": 0.36025670170783997, "learning_rate": 0.00013346129238643635, "loss": 0.2506, "step": 520 }, { "epoch": 0.33589251439539347, "grad_norm": 0.33433622121810913, "learning_rate": 0.0001328214971209213, "loss": 0.2201, "step": 525 }, { "epoch": 0.3390914907229686, "grad_norm": 0.2948063910007477, "learning_rate": 0.00013218170185540627, "loss": 0.2192, "step": 530 }, { "epoch": 0.34229046705054383, "grad_norm": 0.3154048025608063, "learning_rate": 0.00013154190658989126, "loss": 0.2563, "step": 535 }, { "epoch": 0.345489443378119, "grad_norm": 0.3658270537853241, "learning_rate": 0.0001309021113243762, "loss": 0.262, "step": 540 }, { "epoch": 0.3486884197056942, "grad_norm": 0.287977010011673, "learning_rate": 0.00013026231605886118, "loss": 0.2516, "step": 545 }, { "epoch": 0.35188739603326935, "grad_norm": 0.32740944623947144, "learning_rate": 0.00012962252079334614, "loss": 0.2598, "step": 550 }, { "epoch": 0.3550863723608445, "grad_norm": 0.3204440772533417, "learning_rate": 0.0001289827255278311, "loss": 0.2476, "step": 555 }, { "epoch": 0.3582853486884197, "grad_norm": 0.2696886360645294, "learning_rate": 0.00012834293026231608, "loss": 0.2135, "step": 560 }, { "epoch": 0.36148432501599487, "grad_norm": 0.24359866976737976, "learning_rate": 0.00012770313499680102, "loss": 0.2182, "step": 565 }, { "epoch": 0.3646833013435701, "grad_norm": 0.2711159586906433, "learning_rate": 0.000127063339731286, "loss": 0.2381, "step": 570 }, { "epoch": 0.36788227767114523, "grad_norm": 0.3088955283164978, "learning_rate": 0.00012642354446577096, "loss": 0.1968, "step": 575 }, { "epoch": 0.3710812539987204, "grad_norm": 0.28893983364105225, "learning_rate": 0.00012578374920025592, "loss": 0.2539, "step": 580 }, { "epoch": 0.3742802303262956, "grad_norm": 0.30569225549697876, "learning_rate": 0.0001251439539347409, "loss": 0.2491, "step": 585 }, { "epoch": 0.37747920665387075, "grad_norm": 0.29621535539627075, "learning_rate": 0.00012450415866922584, "loss": 0.2148, "step": 590 }, { "epoch": 0.38067818298144596, "grad_norm": 0.32468584179878235, "learning_rate": 0.0001238643634037108, "loss": 0.2395, "step": 595 }, { "epoch": 0.3838771593090211, "grad_norm": 0.3553343415260315, "learning_rate": 0.0001232245681381958, "loss": 0.2083, "step": 600 }, { "epoch": 0.38707613563659626, "grad_norm": 0.3482455611228943, "learning_rate": 0.00012258477287268075, "loss": 0.2406, "step": 605 }, { "epoch": 0.3902751119641715, "grad_norm": 0.2828838527202606, "learning_rate": 0.00012194497760716572, "loss": 0.2455, "step": 610 }, { "epoch": 0.3934740882917466, "grad_norm": 0.18922460079193115, "learning_rate": 0.00012130518234165067, "loss": 0.2402, "step": 615 }, { "epoch": 0.39667306461932184, "grad_norm": 0.36727848649024963, "learning_rate": 0.00012066538707613564, "loss": 0.2348, "step": 620 }, { "epoch": 0.399872040946897, "grad_norm": 0.29675740003585815, "learning_rate": 0.00012002559181062061, "loss": 0.2314, "step": 625 }, { "epoch": 0.40307101727447214, "grad_norm": 0.3717605471611023, "learning_rate": 0.00011938579654510557, "loss": 0.2252, "step": 630 }, { "epoch": 0.40626999360204735, "grad_norm": 0.3321962356567383, "learning_rate": 0.00011874600127959055, "loss": 0.2004, "step": 635 }, { "epoch": 0.4094689699296225, "grad_norm": 0.2975044846534729, "learning_rate": 0.00011810620601407549, "loss": 0.2394, "step": 640 }, { "epoch": 0.4126679462571977, "grad_norm": 0.5578257441520691, "learning_rate": 0.00011746641074856047, "loss": 0.2731, "step": 645 }, { "epoch": 0.41586692258477287, "grad_norm": 0.3975297212600708, "learning_rate": 0.00011682661548304543, "loss": 0.2558, "step": 650 }, { "epoch": 0.419065898912348, "grad_norm": 0.25277817249298096, "learning_rate": 0.0001161868202175304, "loss": 0.2312, "step": 655 }, { "epoch": 0.42226487523992323, "grad_norm": 0.32298269867897034, "learning_rate": 0.00011554702495201537, "loss": 0.2448, "step": 660 }, { "epoch": 0.4254638515674984, "grad_norm": 0.4133547246456146, "learning_rate": 0.00011490722968650032, "loss": 0.2408, "step": 665 }, { "epoch": 0.4286628278950736, "grad_norm": 0.2923837900161743, "learning_rate": 0.00011426743442098529, "loss": 0.2268, "step": 670 }, { "epoch": 0.43186180422264875, "grad_norm": 0.2599218785762787, "learning_rate": 0.00011362763915547025, "loss": 0.2255, "step": 675 }, { "epoch": 0.4350607805502239, "grad_norm": 0.3489529490470886, "learning_rate": 0.00011298784388995522, "loss": 0.254, "step": 680 }, { "epoch": 0.4382597568777991, "grad_norm": 0.30105629563331604, "learning_rate": 0.0001123480486244402, "loss": 0.269, "step": 685 }, { "epoch": 0.44145873320537427, "grad_norm": 0.37994736433029175, "learning_rate": 0.00011170825335892514, "loss": 0.2191, "step": 690 }, { "epoch": 0.4446577095329495, "grad_norm": 0.35344600677490234, "learning_rate": 0.00011106845809341012, "loss": 0.2566, "step": 695 }, { "epoch": 0.44785668586052463, "grad_norm": 0.28387364745140076, "learning_rate": 0.00011042866282789508, "loss": 0.2277, "step": 700 }, { "epoch": 0.4510556621880998, "grad_norm": 0.36002182960510254, "learning_rate": 0.00010978886756238005, "loss": 0.2447, "step": 705 }, { "epoch": 0.454254638515675, "grad_norm": 0.30156582593917847, "learning_rate": 0.00010914907229686502, "loss": 0.234, "step": 710 }, { "epoch": 0.45745361484325014, "grad_norm": 0.3468836843967438, "learning_rate": 0.00010850927703134997, "loss": 0.2466, "step": 715 }, { "epoch": 0.46065259117082535, "grad_norm": 0.33030760288238525, "learning_rate": 0.00010786948176583493, "loss": 0.2745, "step": 720 }, { "epoch": 0.4638515674984005, "grad_norm": 0.3344646096229553, "learning_rate": 0.0001072296865003199, "loss": 0.2154, "step": 725 }, { "epoch": 0.46705054382597566, "grad_norm": 0.40286871790885925, "learning_rate": 0.00010658989123480487, "loss": 0.2227, "step": 730 }, { "epoch": 0.47024952015355087, "grad_norm": 0.2880500257015228, "learning_rate": 0.00010595009596928985, "loss": 0.2219, "step": 735 }, { "epoch": 0.473448496481126, "grad_norm": 0.28410083055496216, "learning_rate": 0.0001053103007037748, "loss": 0.2124, "step": 740 }, { "epoch": 0.47664747280870123, "grad_norm": 0.2937108278274536, "learning_rate": 0.00010467050543825975, "loss": 0.2272, "step": 745 }, { "epoch": 0.4798464491362764, "grad_norm": 0.33740442991256714, "learning_rate": 0.00010403071017274473, "loss": 0.2332, "step": 750 }, { "epoch": 0.48304542546385154, "grad_norm": 0.2969924807548523, "learning_rate": 0.0001033909149072297, "loss": 0.2382, "step": 755 }, { "epoch": 0.48624440179142675, "grad_norm": 0.25243079662323, "learning_rate": 0.00010275111964171466, "loss": 0.2266, "step": 760 }, { "epoch": 0.4894433781190019, "grad_norm": 0.30554574728012085, "learning_rate": 0.00010211132437619962, "loss": 0.2276, "step": 765 }, { "epoch": 0.4926423544465771, "grad_norm": 0.281543493270874, "learning_rate": 0.00010147152911068458, "loss": 0.2286, "step": 770 }, { "epoch": 0.49584133077415227, "grad_norm": 0.2906375229358673, "learning_rate": 0.00010083173384516955, "loss": 0.2446, "step": 775 }, { "epoch": 0.4990403071017274, "grad_norm": 0.27951693534851074, "learning_rate": 0.00010019193857965453, "loss": 0.2422, "step": 780 }, { "epoch": 0.5022392834293026, "grad_norm": 0.3846909999847412, "learning_rate": 9.955214331413948e-05, "loss": 0.238, "step": 785 }, { "epoch": 0.5054382597568778, "grad_norm": 0.27699944376945496, "learning_rate": 9.891234804862444e-05, "loss": 0.2299, "step": 790 }, { "epoch": 0.508637236084453, "grad_norm": 0.2875959575176239, "learning_rate": 9.82725527831094e-05, "loss": 0.2293, "step": 795 }, { "epoch": 0.5118362124120281, "grad_norm": 0.23799441754817963, "learning_rate": 9.763275751759438e-05, "loss": 0.2332, "step": 800 }, { "epoch": 0.5150351887396033, "grad_norm": 0.3258192241191864, "learning_rate": 9.699296225207935e-05, "loss": 0.2137, "step": 805 }, { "epoch": 0.5182341650671785, "grad_norm": 0.4027354121208191, "learning_rate": 9.63531669865643e-05, "loss": 0.2528, "step": 810 }, { "epoch": 0.5214331413947537, "grad_norm": 0.5166566967964172, "learning_rate": 9.571337172104927e-05, "loss": 0.2171, "step": 815 }, { "epoch": 0.5246321177223289, "grad_norm": 0.2721567451953888, "learning_rate": 9.507357645553423e-05, "loss": 0.2233, "step": 820 }, { "epoch": 0.527831094049904, "grad_norm": 0.295593798160553, "learning_rate": 9.44337811900192e-05, "loss": 0.2302, "step": 825 }, { "epoch": 0.5310300703774792, "grad_norm": 0.3672064244747162, "learning_rate": 9.379398592450416e-05, "loss": 0.225, "step": 830 }, { "epoch": 0.5342290467050543, "grad_norm": 0.27494198083877563, "learning_rate": 9.315419065898912e-05, "loss": 0.2221, "step": 835 }, { "epoch": 0.5374280230326296, "grad_norm": 0.4797211289405823, "learning_rate": 9.25143953934741e-05, "loss": 0.2439, "step": 840 }, { "epoch": 0.5406269993602048, "grad_norm": 0.2528781592845917, "learning_rate": 9.187460012795905e-05, "loss": 0.2267, "step": 845 }, { "epoch": 0.5438259756877799, "grad_norm": 0.435768723487854, "learning_rate": 9.123480486244403e-05, "loss": 0.2433, "step": 850 }, { "epoch": 0.5470249520153551, "grad_norm": 0.3093527853488922, "learning_rate": 9.059500959692899e-05, "loss": 0.2519, "step": 855 }, { "epoch": 0.5502239283429302, "grad_norm": 0.31521865725517273, "learning_rate": 8.995521433141395e-05, "loss": 0.2563, "step": 860 }, { "epoch": 0.5534229046705055, "grad_norm": 0.2934703230857849, "learning_rate": 8.931541906589892e-05, "loss": 0.2576, "step": 865 }, { "epoch": 0.5566218809980806, "grad_norm": 0.33386310935020447, "learning_rate": 8.867562380038388e-05, "loss": 0.2342, "step": 870 }, { "epoch": 0.5598208573256558, "grad_norm": 0.3183169960975647, "learning_rate": 8.803582853486885e-05, "loss": 0.2325, "step": 875 }, { "epoch": 0.5630198336532309, "grad_norm": 0.27800726890563965, "learning_rate": 8.739603326935381e-05, "loss": 0.2256, "step": 880 }, { "epoch": 0.5662188099808061, "grad_norm": 0.31215038895606995, "learning_rate": 8.675623800383877e-05, "loss": 0.206, "step": 885 }, { "epoch": 0.5694177863083814, "grad_norm": 0.31119444966316223, "learning_rate": 8.611644273832375e-05, "loss": 0.2447, "step": 890 }, { "epoch": 0.5726167626359565, "grad_norm": 0.27355003356933594, "learning_rate": 8.54766474728087e-05, "loss": 0.2001, "step": 895 }, { "epoch": 0.5758157389635317, "grad_norm": 0.3279555141925812, "learning_rate": 8.483685220729366e-05, "loss": 0.218, "step": 900 }, { "epoch": 0.5790147152911068, "grad_norm": 0.33151063323020935, "learning_rate": 8.419705694177864e-05, "loss": 0.2206, "step": 905 }, { "epoch": 0.582213691618682, "grad_norm": 0.34513482451438904, "learning_rate": 8.35572616762636e-05, "loss": 0.2615, "step": 910 }, { "epoch": 0.5854126679462572, "grad_norm": 0.38724973797798157, "learning_rate": 8.291746641074857e-05, "loss": 0.2445, "step": 915 }, { "epoch": 0.5886116442738324, "grad_norm": 0.30885693430900574, "learning_rate": 8.227767114523353e-05, "loss": 0.2131, "step": 920 }, { "epoch": 0.5918106206014075, "grad_norm": 0.3177817463874817, "learning_rate": 8.163787587971849e-05, "loss": 0.2382, "step": 925 }, { "epoch": 0.5950095969289827, "grad_norm": 0.2687653601169586, "learning_rate": 8.099808061420346e-05, "loss": 0.2352, "step": 930 }, { "epoch": 0.5982085732565579, "grad_norm": 0.23862284421920776, "learning_rate": 8.035828534868842e-05, "loss": 0.2073, "step": 935 }, { "epoch": 0.6014075495841331, "grad_norm": 0.39008885622024536, "learning_rate": 7.97184900831734e-05, "loss": 0.235, "step": 940 }, { "epoch": 0.6046065259117083, "grad_norm": 0.310867041349411, "learning_rate": 7.907869481765836e-05, "loss": 0.222, "step": 945 }, { "epoch": 0.6078055022392834, "grad_norm": 0.306083083152771, "learning_rate": 7.843889955214332e-05, "loss": 0.2255, "step": 950 }, { "epoch": 0.6110044785668586, "grad_norm": 0.3439415991306305, "learning_rate": 7.779910428662829e-05, "loss": 0.2124, "step": 955 }, { "epoch": 0.6142034548944337, "grad_norm": 0.2899746000766754, "learning_rate": 7.715930902111325e-05, "loss": 0.1934, "step": 960 }, { "epoch": 0.617402431222009, "grad_norm": 0.3107975721359253, "learning_rate": 7.651951375559822e-05, "loss": 0.2277, "step": 965 }, { "epoch": 0.6206014075495841, "grad_norm": 0.3492138385772705, "learning_rate": 7.587971849008317e-05, "loss": 0.2413, "step": 970 }, { "epoch": 0.6238003838771593, "grad_norm": 0.27954694628715515, "learning_rate": 7.523992322456814e-05, "loss": 0.2164, "step": 975 }, { "epoch": 0.6269993602047345, "grad_norm": 0.34748780727386475, "learning_rate": 7.46001279590531e-05, "loss": 0.2258, "step": 980 }, { "epoch": 0.6301983365323096, "grad_norm": 0.35154399275779724, "learning_rate": 7.396033269353807e-05, "loss": 0.2175, "step": 985 }, { "epoch": 0.6333973128598849, "grad_norm": 0.3060106039047241, "learning_rate": 7.332053742802303e-05, "loss": 0.2318, "step": 990 }, { "epoch": 0.63659628918746, "grad_norm": 0.31797581911087036, "learning_rate": 7.268074216250799e-05, "loss": 0.2027, "step": 995 }, { "epoch": 0.6397952655150352, "grad_norm": 0.33520376682281494, "learning_rate": 7.204094689699297e-05, "loss": 0.2312, "step": 1000 }, { "epoch": 0.6429942418426103, "grad_norm": 0.21483170986175537, "learning_rate": 7.140115163147793e-05, "loss": 0.2058, "step": 1005 }, { "epoch": 0.6461932181701855, "grad_norm": 0.33610713481903076, "learning_rate": 7.07613563659629e-05, "loss": 0.2415, "step": 1010 }, { "epoch": 0.6493921944977608, "grad_norm": 0.26923608779907227, "learning_rate": 7.012156110044786e-05, "loss": 0.2208, "step": 1015 }, { "epoch": 0.6525911708253359, "grad_norm": 0.3274904489517212, "learning_rate": 6.948176583493282e-05, "loss": 0.212, "step": 1020 }, { "epoch": 0.6557901471529111, "grad_norm": 0.3245833218097687, "learning_rate": 6.884197056941779e-05, "loss": 0.2191, "step": 1025 }, { "epoch": 0.6589891234804862, "grad_norm": 0.27330532670021057, "learning_rate": 6.820217530390275e-05, "loss": 0.1992, "step": 1030 }, { "epoch": 0.6621880998080614, "grad_norm": 0.25868290662765503, "learning_rate": 6.756238003838772e-05, "loss": 0.2399, "step": 1035 }, { "epoch": 0.6653870761356366, "grad_norm": 0.30688875913619995, "learning_rate": 6.692258477287268e-05, "loss": 0.2342, "step": 1040 }, { "epoch": 0.6685860524632118, "grad_norm": 0.2929762899875641, "learning_rate": 6.628278950735764e-05, "loss": 0.2369, "step": 1045 }, { "epoch": 0.6717850287907869, "grad_norm": 0.31221550703048706, "learning_rate": 6.564299424184262e-05, "loss": 0.2224, "step": 1050 }, { "epoch": 0.6749840051183621, "grad_norm": 0.32099705934524536, "learning_rate": 6.500319897632758e-05, "loss": 0.2144, "step": 1055 }, { "epoch": 0.6781829814459372, "grad_norm": 0.2826564908027649, "learning_rate": 6.436340371081254e-05, "loss": 0.2271, "step": 1060 }, { "epoch": 0.6813819577735125, "grad_norm": 0.2807115316390991, "learning_rate": 6.372360844529751e-05, "loss": 0.202, "step": 1065 }, { "epoch": 0.6845809341010877, "grad_norm": 0.3093265891075134, "learning_rate": 6.308381317978247e-05, "loss": 0.2328, "step": 1070 }, { "epoch": 0.6877799104286628, "grad_norm": 0.3201155662536621, "learning_rate": 6.244401791426744e-05, "loss": 0.209, "step": 1075 }, { "epoch": 0.690978886756238, "grad_norm": 0.27004414796829224, "learning_rate": 6.18042226487524e-05, "loss": 0.2278, "step": 1080 }, { "epoch": 0.6941778630838131, "grad_norm": 0.27909284830093384, "learning_rate": 6.116442738323736e-05, "loss": 0.23, "step": 1085 }, { "epoch": 0.6973768394113884, "grad_norm": 0.3248424828052521, "learning_rate": 6.0524632117722334e-05, "loss": 0.2309, "step": 1090 }, { "epoch": 0.7005758157389635, "grad_norm": 0.31023091077804565, "learning_rate": 5.9884836852207293e-05, "loss": 0.2084, "step": 1095 }, { "epoch": 0.7037747920665387, "grad_norm": 0.302664190530777, "learning_rate": 5.924504158669226e-05, "loss": 0.2367, "step": 1100 }, { "epoch": 0.7069737683941139, "grad_norm": 0.328142374753952, "learning_rate": 5.860524632117722e-05, "loss": 0.2215, "step": 1105 }, { "epoch": 0.710172744721689, "grad_norm": 0.34118109941482544, "learning_rate": 5.796545105566219e-05, "loss": 0.2077, "step": 1110 }, { "epoch": 0.7133717210492643, "grad_norm": 0.2934885025024414, "learning_rate": 5.732565579014716e-05, "loss": 0.2226, "step": 1115 }, { "epoch": 0.7165706973768394, "grad_norm": 0.306045800447464, "learning_rate": 5.668586052463212e-05, "loss": 0.1962, "step": 1120 }, { "epoch": 0.7197696737044146, "grad_norm": 0.3221231997013092, "learning_rate": 5.6046065259117085e-05, "loss": 0.2097, "step": 1125 }, { "epoch": 0.7229686500319897, "grad_norm": 0.29269224405288696, "learning_rate": 5.5406269993602045e-05, "loss": 0.2147, "step": 1130 }, { "epoch": 0.7261676263595649, "grad_norm": 0.3249344229698181, "learning_rate": 5.476647472808701e-05, "loss": 0.1956, "step": 1135 }, { "epoch": 0.7293666026871402, "grad_norm": 0.3102353811264038, "learning_rate": 5.4126679462571984e-05, "loss": 0.2092, "step": 1140 }, { "epoch": 0.7325655790147153, "grad_norm": 0.38312363624572754, "learning_rate": 5.3486884197056944e-05, "loss": 0.2294, "step": 1145 }, { "epoch": 0.7357645553422905, "grad_norm": 0.33595603704452515, "learning_rate": 5.284708893154191e-05, "loss": 0.242, "step": 1150 }, { "epoch": 0.7389635316698656, "grad_norm": 0.29311510920524597, "learning_rate": 5.220729366602687e-05, "loss": 0.2212, "step": 1155 }, { "epoch": 0.7421625079974408, "grad_norm": 0.2701033651828766, "learning_rate": 5.1567498400511836e-05, "loss": 0.2082, "step": 1160 }, { "epoch": 0.745361484325016, "grad_norm": 0.3194945156574249, "learning_rate": 5.092770313499681e-05, "loss": 0.235, "step": 1165 }, { "epoch": 0.7485604606525912, "grad_norm": 0.25952160358428955, "learning_rate": 5.028790786948176e-05, "loss": 0.2405, "step": 1170 }, { "epoch": 0.7517594369801663, "grad_norm": 0.3131108283996582, "learning_rate": 4.9648112603966736e-05, "loss": 0.2118, "step": 1175 }, { "epoch": 0.7549584133077415, "grad_norm": 0.40070056915283203, "learning_rate": 4.9008317338451695e-05, "loss": 0.2386, "step": 1180 }, { "epoch": 0.7581573896353166, "grad_norm": 0.38076481223106384, "learning_rate": 4.836852207293666e-05, "loss": 0.229, "step": 1185 }, { "epoch": 0.7613563659628919, "grad_norm": 0.26312530040740967, "learning_rate": 4.772872680742163e-05, "loss": 0.2029, "step": 1190 }, { "epoch": 0.7645553422904671, "grad_norm": 0.365788996219635, "learning_rate": 4.7088931541906594e-05, "loss": 0.2215, "step": 1195 }, { "epoch": 0.7677543186180422, "grad_norm": 0.246324360370636, "learning_rate": 4.644913627639156e-05, "loss": 0.2067, "step": 1200 }, { "epoch": 0.7709532949456174, "grad_norm": 0.2862643003463745, "learning_rate": 4.580934101087652e-05, "loss": 0.1955, "step": 1205 }, { "epoch": 0.7741522712731925, "grad_norm": 0.3309938609600067, "learning_rate": 4.516954574536149e-05, "loss": 0.227, "step": 1210 }, { "epoch": 0.7773512476007678, "grad_norm": 0.3223839998245239, "learning_rate": 4.4529750479846447e-05, "loss": 0.2292, "step": 1215 }, { "epoch": 0.780550223928343, "grad_norm": 0.3467869460582733, "learning_rate": 4.388995521433142e-05, "loss": 0.209, "step": 1220 }, { "epoch": 0.7837492002559181, "grad_norm": 0.34096330404281616, "learning_rate": 4.325015994881638e-05, "loss": 0.2041, "step": 1225 }, { "epoch": 0.7869481765834933, "grad_norm": 0.33046209812164307, "learning_rate": 4.2610364683301346e-05, "loss": 0.1974, "step": 1230 }, { "epoch": 0.7901471529110684, "grad_norm": 0.4353679418563843, "learning_rate": 4.197056941778631e-05, "loss": 0.1992, "step": 1235 }, { "epoch": 0.7933461292386437, "grad_norm": 0.35748305916786194, "learning_rate": 4.133077415227127e-05, "loss": 0.2237, "step": 1240 }, { "epoch": 0.7965451055662188, "grad_norm": 0.3174164891242981, "learning_rate": 4.0690978886756245e-05, "loss": 0.2136, "step": 1245 }, { "epoch": 0.799744081893794, "grad_norm": 0.3202930688858032, "learning_rate": 4.0051183621241205e-05, "loss": 0.2301, "step": 1250 }, { "epoch": 0.8029430582213691, "grad_norm": 0.2494242787361145, "learning_rate": 3.941138835572617e-05, "loss": 0.1783, "step": 1255 }, { "epoch": 0.8061420345489443, "grad_norm": 0.35551324486732483, "learning_rate": 3.877159309021113e-05, "loss": 0.2031, "step": 1260 }, { "epoch": 0.8093410108765196, "grad_norm": 0.2531558573246002, "learning_rate": 3.81317978246961e-05, "loss": 0.2365, "step": 1265 }, { "epoch": 0.8125399872040947, "grad_norm": 0.43033191561698914, "learning_rate": 3.7492002559181063e-05, "loss": 0.2217, "step": 1270 }, { "epoch": 0.8157389635316699, "grad_norm": 0.3271211087703705, "learning_rate": 3.685220729366603e-05, "loss": 0.2198, "step": 1275 }, { "epoch": 0.818937939859245, "grad_norm": 0.2692057490348816, "learning_rate": 3.6212412028150996e-05, "loss": 0.2099, "step": 1280 }, { "epoch": 0.8221369161868202, "grad_norm": 0.2919643223285675, "learning_rate": 3.5572616762635956e-05, "loss": 0.2148, "step": 1285 }, { "epoch": 0.8253358925143954, "grad_norm": 0.3477243185043335, "learning_rate": 3.493282149712092e-05, "loss": 0.2045, "step": 1290 }, { "epoch": 0.8285348688419706, "grad_norm": 0.3863447308540344, "learning_rate": 3.429302623160589e-05, "loss": 0.2481, "step": 1295 }, { "epoch": 0.8317338451695457, "grad_norm": 0.3201524019241333, "learning_rate": 3.3653230966090855e-05, "loss": 0.2121, "step": 1300 }, { "epoch": 0.8349328214971209, "grad_norm": 0.3398021459579468, "learning_rate": 3.3013435700575815e-05, "loss": 0.2251, "step": 1305 }, { "epoch": 0.838131797824696, "grad_norm": 0.29826006293296814, "learning_rate": 3.237364043506078e-05, "loss": 0.1768, "step": 1310 }, { "epoch": 0.8413307741522713, "grad_norm": 0.3297532796859741, "learning_rate": 3.173384516954575e-05, "loss": 0.2026, "step": 1315 }, { "epoch": 0.8445297504798465, "grad_norm": 0.3907334804534912, "learning_rate": 3.1094049904030714e-05, "loss": 0.2032, "step": 1320 }, { "epoch": 0.8477287268074216, "grad_norm": 0.27420273423194885, "learning_rate": 3.0454254638515677e-05, "loss": 0.1845, "step": 1325 }, { "epoch": 0.8509277031349968, "grad_norm": 0.36319416761398315, "learning_rate": 2.981445937300064e-05, "loss": 0.2143, "step": 1330 }, { "epoch": 0.8541266794625719, "grad_norm": 0.26537972688674927, "learning_rate": 2.9174664107485606e-05, "loss": 0.2228, "step": 1335 }, { "epoch": 0.8573256557901472, "grad_norm": 0.3364832103252411, "learning_rate": 2.853486884197057e-05, "loss": 0.1889, "step": 1340 }, { "epoch": 0.8605246321177223, "grad_norm": 0.32128915190696716, "learning_rate": 2.789507357645554e-05, "loss": 0.2023, "step": 1345 }, { "epoch": 0.8637236084452975, "grad_norm": 0.3016292452812195, "learning_rate": 2.7255278310940502e-05, "loss": 0.247, "step": 1350 }, { "epoch": 0.8669225847728727, "grad_norm": 0.3532962501049042, "learning_rate": 2.6615483045425465e-05, "loss": 0.22, "step": 1355 }, { "epoch": 0.8701215611004478, "grad_norm": 0.26920193433761597, "learning_rate": 2.5975687779910428e-05, "loss": 0.1888, "step": 1360 }, { "epoch": 0.8733205374280231, "grad_norm": 0.27570822834968567, "learning_rate": 2.533589251439539e-05, "loss": 0.2278, "step": 1365 }, { "epoch": 0.8765195137555982, "grad_norm": 0.2538580596446991, "learning_rate": 2.4696097248880358e-05, "loss": 0.1739, "step": 1370 }, { "epoch": 0.8797184900831734, "grad_norm": 0.2816247045993805, "learning_rate": 2.4056301983365324e-05, "loss": 0.2169, "step": 1375 }, { "epoch": 0.8829174664107485, "grad_norm": 0.3473341763019562, "learning_rate": 2.341650671785029e-05, "loss": 0.226, "step": 1380 }, { "epoch": 0.8861164427383237, "grad_norm": 0.2831457257270813, "learning_rate": 2.2776711452335254e-05, "loss": 0.2125, "step": 1385 }, { "epoch": 0.889315419065899, "grad_norm": 0.30768829584121704, "learning_rate": 2.213691618682022e-05, "loss": 0.1819, "step": 1390 }, { "epoch": 0.8925143953934741, "grad_norm": 0.31688612699508667, "learning_rate": 2.1497120921305183e-05, "loss": 0.2194, "step": 1395 }, { "epoch": 0.8957133717210493, "grad_norm": 0.3418295383453369, "learning_rate": 2.0857325655790146e-05, "loss": 0.2169, "step": 1400 }, { "epoch": 0.8989123480486244, "grad_norm": 0.37767553329467773, "learning_rate": 2.0217530390275112e-05, "loss": 0.2323, "step": 1405 }, { "epoch": 0.9021113243761996, "grad_norm": 0.26787883043289185, "learning_rate": 1.957773512476008e-05, "loss": 0.2024, "step": 1410 }, { "epoch": 0.9053103007037748, "grad_norm": 0.26510849595069885, "learning_rate": 1.8937939859245045e-05, "loss": 0.2327, "step": 1415 }, { "epoch": 0.90850927703135, "grad_norm": 0.2409944236278534, "learning_rate": 1.8298144593730008e-05, "loss": 0.1867, "step": 1420 }, { "epoch": 0.9117082533589251, "grad_norm": 0.29780739545822144, "learning_rate": 1.765834932821497e-05, "loss": 0.1885, "step": 1425 }, { "epoch": 0.9149072296865003, "grad_norm": 0.3131493031978607, "learning_rate": 1.7018554062699938e-05, "loss": 0.1984, "step": 1430 }, { "epoch": 0.9181062060140754, "grad_norm": 0.30493828654289246, "learning_rate": 1.63787587971849e-05, "loss": 0.2028, "step": 1435 }, { "epoch": 0.9213051823416507, "grad_norm": 0.4079281985759735, "learning_rate": 1.5738963531669867e-05, "loss": 0.2297, "step": 1440 }, { "epoch": 0.9245041586692259, "grad_norm": 0.3364027440547943, "learning_rate": 1.5099168266154832e-05, "loss": 0.2007, "step": 1445 }, { "epoch": 0.927703134996801, "grad_norm": 0.29711103439331055, "learning_rate": 1.4459373000639795e-05, "loss": 0.2201, "step": 1450 }, { "epoch": 0.9309021113243762, "grad_norm": 0.3361244797706604, "learning_rate": 1.3819577735124761e-05, "loss": 0.2118, "step": 1455 }, { "epoch": 0.9341010876519513, "grad_norm": 0.2692669630050659, "learning_rate": 1.3179782469609724e-05, "loss": 0.2036, "step": 1460 }, { "epoch": 0.9373000639795266, "grad_norm": 0.2790112793445587, "learning_rate": 1.253998720409469e-05, "loss": 0.1977, "step": 1465 }, { "epoch": 0.9404990403071017, "grad_norm": 0.3373515009880066, "learning_rate": 1.1900191938579655e-05, "loss": 0.1959, "step": 1470 }, { "epoch": 0.9436980166346769, "grad_norm": 0.27667558193206787, "learning_rate": 1.126039667306462e-05, "loss": 0.21, "step": 1475 }, { "epoch": 0.946896992962252, "grad_norm": 0.342916876077652, "learning_rate": 1.0620601407549585e-05, "loss": 0.225, "step": 1480 }, { "epoch": 0.9500959692898272, "grad_norm": 0.31250983476638794, "learning_rate": 9.98080614203455e-06, "loss": 0.2517, "step": 1485 }, { "epoch": 0.9532949456174025, "grad_norm": 0.29028651118278503, "learning_rate": 9.341010876519514e-06, "loss": 0.2276, "step": 1490 }, { "epoch": 0.9564939219449776, "grad_norm": 0.3296051621437073, "learning_rate": 8.701215611004479e-06, "loss": 0.1873, "step": 1495 }, { "epoch": 0.9596928982725528, "grad_norm": 0.36832043528556824, "learning_rate": 8.061420345489444e-06, "loss": 0.2064, "step": 1500 }, { "epoch": 0.9628918746001279, "grad_norm": 0.3034568130970001, "learning_rate": 7.421625079974409e-06, "loss": 0.2035, "step": 1505 }, { "epoch": 0.9660908509277031, "grad_norm": 0.26890963315963745, "learning_rate": 6.781829814459373e-06, "loss": 0.2019, "step": 1510 }, { "epoch": 0.9692898272552783, "grad_norm": 0.39937588572502136, "learning_rate": 6.142034548944338e-06, "loss": 0.2026, "step": 1515 }, { "epoch": 0.9724888035828535, "grad_norm": 0.2571321427822113, "learning_rate": 5.502239283429303e-06, "loss": 0.1928, "step": 1520 }, { "epoch": 0.9756877799104287, "grad_norm": 0.30180391669273376, "learning_rate": 4.862444017914268e-06, "loss": 0.2086, "step": 1525 }, { "epoch": 0.9788867562380038, "grad_norm": 0.4258427321910858, "learning_rate": 4.222648752399233e-06, "loss": 0.1849, "step": 1530 }, { "epoch": 0.982085732565579, "grad_norm": 0.29561159014701843, "learning_rate": 3.5828534868841974e-06, "loss": 0.2151, "step": 1535 }, { "epoch": 0.9852847088931542, "grad_norm": 0.2644422948360443, "learning_rate": 2.943058221369162e-06, "loss": 0.189, "step": 1540 }, { "epoch": 0.9884836852207294, "grad_norm": 0.30060943961143494, "learning_rate": 2.303262955854127e-06, "loss": 0.2164, "step": 1545 }, { "epoch": 0.9916826615483045, "grad_norm": 0.3195488154888153, "learning_rate": 1.6634676903390916e-06, "loss": 0.2519, "step": 1550 }, { "epoch": 0.9948816378758797, "grad_norm": 0.30431658029556274, "learning_rate": 1.0236724248240563e-06, "loss": 0.231, "step": 1555 }, { "epoch": 0.9980806142034548, "grad_norm": 0.34465327858924866, "learning_rate": 3.838771593090211e-07, "loss": 0.2068, "step": 1560 } ], "logging_steps": 5, "max_steps": 1563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.925037329289052e+17, "train_batch_size": 10, "trial_name": null, "trial_params": null }