Few-Shot Prompting SFT with the Gemma2-9B Model | 0.89 Weighted F1 and 0.75 Macro F1 on the Dev Set
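The file below is the `trainer_state.json` written by the Hugging Face `Trainer` at the end of the run. It records a single epoch of 1,563 optimizer steps at a train batch size of 10 (roughly 15,630 training examples), a peak learning rate of 2e-4 decayed linearly to zero, loss logged every 5 steps, and checkpoints saved every 500 steps. As a minimal sketch, the `TrainingArguments` below would reproduce this schedule; the output directory, base model ID, dataset, optimizer, precision, and whether LoRA or full fine-tuning was used are not recorded in the state file and are assumptions:

```python
from transformers import TrainingArguments

# A sketch of the arguments implied by the trainer_state.json below.
# Values marked with a key name are read directly from the file;
# everything else is NOT recorded there and is an assumption.
args = TrainingArguments(
    output_dir="gemma2-9b-sft",      # assumption: not in the state file
    num_train_epochs=1,              # "num_train_epochs": 1
    per_device_train_batch_size=10,  # "train_batch_size": 10
    learning_rate=2e-4,              # peak LR implied by the logged schedule
    lr_scheduler_type="linear",      # LR falls linearly to 0 over 1563 steps
    logging_steps=5,                 # "logging_steps": 5
    save_steps=500,                  # "save_steps": 500
    eval_steps=500,                  # "eval_steps": 500
)
```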
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1563,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003198976327575176,
      "grad_norm": 2.5142228603363037,
      "learning_rate": 0.00019936020473448497,
      "loss": 2.2007,
      "step": 5
    },
    {
      "epoch": 0.006397952655150352,
      "grad_norm": 0.7006776332855225,
      "learning_rate": 0.00019872040946896996,
      "loss": 0.461,
      "step": 10
    },
    {
      "epoch": 0.009596928982725527,
      "grad_norm": 0.6147411465644836,
      "learning_rate": 0.0001980806142034549,
      "loss": 0.3499,
      "step": 15
    },
    {
      "epoch": 0.012795905310300703,
      "grad_norm": 0.4334537386894226,
      "learning_rate": 0.00019744081893793988,
      "loss": 0.3154,
      "step": 20
    },
    {
      "epoch": 0.01599488163787588,
      "grad_norm": 0.5372670292854309,
      "learning_rate": 0.00019680102367242483,
      "loss": 0.2982,
      "step": 25
    },
    {
      "epoch": 0.019193857965451054,
      "grad_norm": 0.4113191068172455,
      "learning_rate": 0.0001961612284069098,
      "loss": 0.2751,
      "step": 30
    },
    {
      "epoch": 0.022392834293026232,
      "grad_norm": 0.3651261627674103,
      "learning_rate": 0.00019552143314139478,
      "loss": 0.279,
      "step": 35
    },
    {
      "epoch": 0.025591810620601407,
      "grad_norm": 0.4230581820011139,
      "learning_rate": 0.00019488163787587971,
      "loss": 0.248,
      "step": 40
    },
    {
      "epoch": 0.028790786948176585,
      "grad_norm": 0.39593610167503357,
      "learning_rate": 0.0001942418426103647,
      "loss": 0.2902,
      "step": 45
    },
    {
      "epoch": 0.03198976327575176,
      "grad_norm": 0.41984233260154724,
      "learning_rate": 0.00019360204734484966,
      "loss": 0.2932,
      "step": 50
    },
    {
      "epoch": 0.035188739603326934,
      "grad_norm": 0.26596859097480774,
      "learning_rate": 0.00019296225207933462,
      "loss": 0.2375,
      "step": 55
    },
    {
      "epoch": 0.03838771593090211,
      "grad_norm": 0.44939491152763367,
      "learning_rate": 0.0001923224568138196,
      "loss": 0.2794,
      "step": 60
    },
    {
      "epoch": 0.04158669225847729,
      "grad_norm": 0.3508070111274719,
      "learning_rate": 0.00019168266154830454,
      "loss": 0.2787,
      "step": 65
    },
    {
      "epoch": 0.044785668586052464,
      "grad_norm": 0.40634146332740784,
      "learning_rate": 0.0001910428662827895,
      "loss": 0.2767,
      "step": 70
    },
    {
      "epoch": 0.04798464491362764,
      "grad_norm": 0.6950023770332336,
      "learning_rate": 0.00019040307101727449,
      "loss": 0.2776,
      "step": 75
    },
    {
      "epoch": 0.05118362124120281,
      "grad_norm": 0.34798476099967957,
      "learning_rate": 0.00018976327575175944,
      "loss": 0.2492,
      "step": 80
    },
    {
      "epoch": 0.05438259756877799,
      "grad_norm": 0.30487826466560364,
      "learning_rate": 0.00018912348048624443,
      "loss": 0.2344,
      "step": 85
    },
    {
      "epoch": 0.05758157389635317,
      "grad_norm": 0.2966866195201874,
      "learning_rate": 0.00018848368522072936,
      "loss": 0.2717,
      "step": 90
    },
    {
      "epoch": 0.060780550223928344,
      "grad_norm": 0.41749754548072815,
      "learning_rate": 0.00018784388995521432,
      "loss": 0.2855,
      "step": 95
    },
    {
      "epoch": 0.06397952655150352,
      "grad_norm": 0.3517999053001404,
      "learning_rate": 0.0001872040946896993,
      "loss": 0.2537,
      "step": 100
    },
    {
      "epoch": 0.0671785028790787,
      "grad_norm": 0.3616477847099304,
      "learning_rate": 0.00018656429942418427,
      "loss": 0.2548,
      "step": 105
    },
    {
      "epoch": 0.07037747920665387,
      "grad_norm": 0.3890167772769928,
      "learning_rate": 0.00018592450415866926,
      "loss": 0.2678,
      "step": 110
    },
    {
      "epoch": 0.07357645553422905,
      "grad_norm": 0.35902726650238037,
      "learning_rate": 0.0001852847088931542,
      "loss": 0.2751,
      "step": 115
    },
    {
      "epoch": 0.07677543186180422,
      "grad_norm": 0.3401743471622467,
      "learning_rate": 0.00018464491362763915,
      "loss": 0.2696,
      "step": 120
    },
    {
      "epoch": 0.0799744081893794,
      "grad_norm": 0.2922796905040741,
      "learning_rate": 0.00018400511836212414,
      "loss": 0.2326,
      "step": 125
    },
    {
      "epoch": 0.08317338451695458,
      "grad_norm": 0.3094709515571594,
      "learning_rate": 0.0001833653230966091,
      "loss": 0.2393,
      "step": 130
    },
    {
      "epoch": 0.08637236084452975,
      "grad_norm": 0.2731708884239197,
      "learning_rate": 0.00018272552783109406,
      "loss": 0.2446,
      "step": 135
    },
    {
      "epoch": 0.08957133717210493,
      "grad_norm": 0.281548410654068,
      "learning_rate": 0.00018208573256557901,
      "loss": 0.2739,
      "step": 140
    },
    {
      "epoch": 0.0927703134996801,
      "grad_norm": 0.31158748269081116,
      "learning_rate": 0.00018144593730006397,
      "loss": 0.2562,
      "step": 145
    },
    {
      "epoch": 0.09596928982725528,
      "grad_norm": 0.31344011425971985,
      "learning_rate": 0.00018080614203454896,
      "loss": 0.2457,
      "step": 150
    },
    {
      "epoch": 0.09916826615483046,
      "grad_norm": 0.25769639015197754,
      "learning_rate": 0.00018016634676903392,
      "loss": 0.2413,
      "step": 155
    },
    {
      "epoch": 0.10236724248240563,
      "grad_norm": 0.3215169310569763,
      "learning_rate": 0.00017952655150351888,
      "loss": 0.2587,
      "step": 160
    },
    {
      "epoch": 0.10556621880998081,
      "grad_norm": 0.31406131386756897,
      "learning_rate": 0.00017888675623800384,
      "loss": 0.252,
      "step": 165
    },
    {
      "epoch": 0.10876519513755598,
      "grad_norm": 0.2920863926410675,
      "learning_rate": 0.0001782469609724888,
      "loss": 0.2469,
      "step": 170
    },
    {
      "epoch": 0.11196417146513116,
      "grad_norm": 0.32624512910842896,
      "learning_rate": 0.00017760716570697379,
      "loss": 0.2543,
      "step": 175
    },
    {
      "epoch": 0.11516314779270634,
      "grad_norm": 0.3013479709625244,
      "learning_rate": 0.00017696737044145875,
      "loss": 0.2662,
      "step": 180
    },
    {
      "epoch": 0.1183621241202815,
      "grad_norm": 0.2553490102291107,
      "learning_rate": 0.0001763275751759437,
      "loss": 0.2479,
      "step": 185
    },
    {
      "epoch": 0.12156110044785669,
      "grad_norm": 0.3384646773338318,
      "learning_rate": 0.00017568777991042867,
      "loss": 0.2979,
      "step": 190
    },
    {
      "epoch": 0.12476007677543186,
      "grad_norm": 0.4001105725765228,
      "learning_rate": 0.00017504798464491362,
      "loss": 0.2629,
      "step": 195
    },
    {
      "epoch": 0.12795905310300704,
      "grad_norm": 0.3194480836391449,
      "learning_rate": 0.0001744081893793986,
      "loss": 0.2489,
      "step": 200
    },
    {
      "epoch": 0.13115802943058222,
      "grad_norm": 0.38252341747283936,
      "learning_rate": 0.00017376839411388357,
      "loss": 0.2356,
      "step": 205
    },
    {
      "epoch": 0.1343570057581574,
      "grad_norm": 0.31561562418937683,
      "learning_rate": 0.00017312859884836853,
      "loss": 0.2581,
      "step": 210
    },
    {
      "epoch": 0.13755598208573255,
      "grad_norm": 0.2712710499763489,
      "learning_rate": 0.0001724888035828535,
      "loss": 0.2577,
      "step": 215
    },
    {
      "epoch": 0.14075495841330773,
      "grad_norm": 0.3084522485733032,
      "learning_rate": 0.00017184900831733845,
      "loss": 0.2304,
      "step": 220
    },
    {
      "epoch": 0.14395393474088292,
      "grad_norm": 0.2699475884437561,
      "learning_rate": 0.00017120921305182344,
      "loss": 0.2685,
      "step": 225
    },
    {
      "epoch": 0.1471529110684581,
      "grad_norm": 0.3236371874809265,
      "learning_rate": 0.0001705694177863084,
      "loss": 0.2588,
      "step": 230
    },
    {
      "epoch": 0.15035188739603328,
      "grad_norm": 0.28885379433631897,
      "learning_rate": 0.00016992962252079336,
      "loss": 0.2574,
      "step": 235
    },
    {
      "epoch": 0.15355086372360843,
      "grad_norm": 0.3263382315635681,
      "learning_rate": 0.00016928982725527832,
      "loss": 0.2649,
      "step": 240
    },
    {
      "epoch": 0.15674984005118361,
      "grad_norm": 0.2647656500339508,
      "learning_rate": 0.00016865003198976328,
      "loss": 0.2522,
      "step": 245
    },
    {
      "epoch": 0.1599488163787588,
      "grad_norm": 0.2796100676059723,
      "learning_rate": 0.00016801023672424826,
      "loss": 0.2255,
      "step": 250
    },
    {
      "epoch": 0.16314779270633398,
      "grad_norm": 0.32718002796173096,
      "learning_rate": 0.00016737044145873322,
      "loss": 0.2623,
      "step": 255
    },
    {
      "epoch": 0.16634676903390916,
      "grad_norm": 0.34518498182296753,
      "learning_rate": 0.00016673064619321818,
      "loss": 0.2452,
      "step": 260
    },
    {
      "epoch": 0.1695457453614843,
      "grad_norm": 0.3195638358592987,
      "learning_rate": 0.00016609085092770314,
      "loss": 0.2638,
      "step": 265
    },
    {
      "epoch": 0.1727447216890595,
      "grad_norm": 0.2617906928062439,
      "learning_rate": 0.0001654510556621881,
      "loss": 0.2294,
      "step": 270
    },
    {
      "epoch": 0.17594369801663468,
      "grad_norm": 0.28182393312454224,
      "learning_rate": 0.00016481126039667306,
      "loss": 0.2611,
      "step": 275
    },
    {
      "epoch": 0.17914267434420986,
      "grad_norm": 0.24791140854358673,
      "learning_rate": 0.00016417146513115805,
      "loss": 0.2178,
      "step": 280
    },
    {
      "epoch": 0.18234165067178504,
      "grad_norm": 0.2434699386358261,
      "learning_rate": 0.000163531669865643,
      "loss": 0.2396,
      "step": 285
    },
    {
      "epoch": 0.1855406269993602,
      "grad_norm": 0.3352366089820862,
      "learning_rate": 0.00016289187460012797,
      "loss": 0.2368,
      "step": 290
    },
    {
      "epoch": 0.18873960332693537,
      "grad_norm": 0.32391369342803955,
      "learning_rate": 0.00016225207933461293,
      "loss": 0.2575,
      "step": 295
    },
    {
      "epoch": 0.19193857965451055,
      "grad_norm": 0.33067309856414795,
      "learning_rate": 0.00016161228406909789,
      "loss": 0.2726,
      "step": 300
    },
    {
      "epoch": 0.19513755598208574,
      "grad_norm": 0.3429821729660034,
      "learning_rate": 0.00016097248880358287,
      "loss": 0.2507,
      "step": 305
    },
    {
      "epoch": 0.19833653230966092,
      "grad_norm": 0.3166626989841461,
      "learning_rate": 0.00016033269353806783,
      "loss": 0.2474,
      "step": 310
    },
    {
      "epoch": 0.20153550863723607,
      "grad_norm": 0.2972988784313202,
      "learning_rate": 0.0001596928982725528,
      "loss": 0.2361,
      "step": 315
    },
    {
      "epoch": 0.20473448496481125,
      "grad_norm": 0.2758512794971466,
      "learning_rate": 0.00015905310300703775,
      "loss": 0.2266,
      "step": 320
    },
    {
      "epoch": 0.20793346129238643,
      "grad_norm": 0.2929127812385559,
      "learning_rate": 0.0001584133077415227,
      "loss": 0.2514,
      "step": 325
    },
    {
      "epoch": 0.21113243761996162,
      "grad_norm": 0.3643186092376709,
      "learning_rate": 0.0001577735124760077,
      "loss": 0.2628,
      "step": 330
    },
    {
      "epoch": 0.2143314139475368,
      "grad_norm": 0.3119208514690399,
      "learning_rate": 0.00015713371721049266,
      "loss": 0.2446,
      "step": 335
    },
    {
      "epoch": 0.21753039027511195,
      "grad_norm": 0.2954186201095581,
      "learning_rate": 0.00015649392194497762,
      "loss": 0.248,
      "step": 340
    },
    {
      "epoch": 0.22072936660268713,
      "grad_norm": 0.28279921412467957,
      "learning_rate": 0.00015585412667946258,
      "loss": 0.2256,
      "step": 345
    },
    {
      "epoch": 0.22392834293026231,
      "grad_norm": 0.28081214427948,
      "learning_rate": 0.00015521433141394754,
      "loss": 0.2481,
      "step": 350
    },
    {
      "epoch": 0.2271273192578375,
      "grad_norm": 0.36415377259254456,
      "learning_rate": 0.00015457453614843252,
      "loss": 0.2577,
      "step": 355
    },
    {
      "epoch": 0.23032629558541268,
      "grad_norm": 0.3519168794155121,
      "learning_rate": 0.00015393474088291748,
      "loss": 0.2504,
      "step": 360
    },
    {
      "epoch": 0.23352527191298783,
      "grad_norm": 0.3644118309020996,
      "learning_rate": 0.00015329494561740244,
      "loss": 0.2453,
      "step": 365
    },
    {
      "epoch": 0.236724248240563,
      "grad_norm": 0.2938327491283417,
      "learning_rate": 0.0001526551503518874,
      "loss": 0.2378,
      "step": 370
    },
    {
      "epoch": 0.2399232245681382,
      "grad_norm": 0.25993379950523376,
      "learning_rate": 0.00015201535508637236,
      "loss": 0.2387,
      "step": 375
    },
    {
      "epoch": 0.24312220089571338,
      "grad_norm": 0.2894437313079834,
      "learning_rate": 0.00015137555982085735,
      "loss": 0.2524,
      "step": 380
    },
    {
      "epoch": 0.24632117722328856,
      "grad_norm": 0.29153236746788025,
      "learning_rate": 0.0001507357645553423,
      "loss": 0.2351,
      "step": 385
    },
    {
      "epoch": 0.2495201535508637,
      "grad_norm": 0.2899223268032074,
      "learning_rate": 0.00015009596928982727,
      "loss": 0.2493,
      "step": 390
    },
    {
      "epoch": 0.2527191298784389,
      "grad_norm": 0.27078118920326233,
      "learning_rate": 0.00014945617402431223,
      "loss": 0.2543,
      "step": 395
    },
    {
      "epoch": 0.2559181062060141,
      "grad_norm": 0.23355244100093842,
      "learning_rate": 0.0001488163787587972,
      "loss": 0.232,
      "step": 400
    },
    {
      "epoch": 0.2591170825335892,
      "grad_norm": 0.29392296075820923,
      "learning_rate": 0.00014817658349328217,
      "loss": 0.2356,
      "step": 405
    },
    {
      "epoch": 0.26231605886116444,
      "grad_norm": 0.3395833969116211,
      "learning_rate": 0.00014753678822776713,
      "loss": 0.2498,
      "step": 410
    },
    {
      "epoch": 0.2655150351887396,
      "grad_norm": 0.2617484927177429,
      "learning_rate": 0.00014689699296225207,
      "loss": 0.2239,
      "step": 415
    },
    {
      "epoch": 0.2687140115163148,
      "grad_norm": 0.29424986243247986,
      "learning_rate": 0.00014625719769673705,
      "loss": 0.2602,
      "step": 420
    },
    {
      "epoch": 0.27191298784388995,
      "grad_norm": 0.251095175743103,
      "learning_rate": 0.000145617402431222,
      "loss": 0.2113,
      "step": 425
    },
    {
      "epoch": 0.2751119641714651,
      "grad_norm": 0.31057092547416687,
      "learning_rate": 0.000144977607165707,
      "loss": 0.2388,
      "step": 430
    },
    {
      "epoch": 0.2783109404990403,
      "grad_norm": 0.23010189831256866,
      "learning_rate": 0.00014433781190019196,
      "loss": 0.2294,
      "step": 435
    },
    {
      "epoch": 0.28150991682661547,
      "grad_norm": 0.28975558280944824,
      "learning_rate": 0.0001436980166346769,
      "loss": 0.2433,
      "step": 440
    },
    {
      "epoch": 0.2847088931541907,
      "grad_norm": 0.306234747171402,
      "learning_rate": 0.00014305822136916188,
      "loss": 0.2715,
      "step": 445
    },
    {
      "epoch": 0.28790786948176583,
      "grad_norm": 0.3650927245616913,
      "learning_rate": 0.00014241842610364684,
      "loss": 0.2577,
      "step": 450
    },
    {
      "epoch": 0.291106845809341,
      "grad_norm": 0.3464455306529999,
      "learning_rate": 0.0001417786308381318,
      "loss": 0.2514,
      "step": 455
    },
    {
      "epoch": 0.2943058221369162,
      "grad_norm": 0.3354053199291229,
      "learning_rate": 0.00014113883557261678,
      "loss": 0.2479,
      "step": 460
    },
    {
      "epoch": 0.29750479846449135,
      "grad_norm": 0.27853867411613464,
      "learning_rate": 0.00014049904030710172,
      "loss": 0.2454,
      "step": 465
    },
    {
      "epoch": 0.30070377479206656,
      "grad_norm": 0.29874446988105774,
      "learning_rate": 0.0001398592450415867,
      "loss": 0.2271,
      "step": 470
    },
    {
      "epoch": 0.3039027511196417,
      "grad_norm": 0.30210456252098083,
      "learning_rate": 0.00013921944977607166,
      "loss": 0.2532,
      "step": 475
    },
    {
      "epoch": 0.30710172744721687,
      "grad_norm": 0.2790578305721283,
      "learning_rate": 0.00013857965451055662,
      "loss": 0.2254,
      "step": 480
    },
    {
      "epoch": 0.3103007037747921,
      "grad_norm": 0.28601694107055664,
      "learning_rate": 0.0001379398592450416,
      "loss": 0.2396,
      "step": 485
    },
    {
      "epoch": 0.31349968010236723,
      "grad_norm": 0.2820374667644501,
      "learning_rate": 0.00013730006397952654,
      "loss": 0.2405,
      "step": 490
    },
    {
      "epoch": 0.31669865642994244,
      "grad_norm": 0.48833322525024414,
      "learning_rate": 0.00013666026871401153,
      "loss": 0.2405,
      "step": 495
    },
    {
      "epoch": 0.3198976327575176,
      "grad_norm": 0.30562731623649597,
      "learning_rate": 0.0001360204734484965,
      "loss": 0.2486,
      "step": 500
    },
    {
      "epoch": 0.32309660908509275,
      "grad_norm": 0.43254855275154114,
      "learning_rate": 0.00013538067818298145,
      "loss": 0.2541,
      "step": 505
    },
    {
      "epoch": 0.32629558541266795,
      "grad_norm": 0.29583895206451416,
      "learning_rate": 0.00013474088291746643,
      "loss": 0.2442,
      "step": 510
    },
    {
      "epoch": 0.3294945617402431,
      "grad_norm": 0.3376995325088501,
      "learning_rate": 0.00013410108765195137,
      "loss": 0.2484,
      "step": 515
    },
    {
      "epoch": 0.3326935380678183,
      "grad_norm": 0.36025670170783997,
      "learning_rate": 0.00013346129238643635,
      "loss": 0.2506,
      "step": 520
    },
    {
      "epoch": 0.33589251439539347,
      "grad_norm": 0.33433622121810913,
      "learning_rate": 0.0001328214971209213,
      "loss": 0.2201,
      "step": 525
    },
    {
      "epoch": 0.3390914907229686,
      "grad_norm": 0.2948063910007477,
      "learning_rate": 0.00013218170185540627,
      "loss": 0.2192,
      "step": 530
    },
    {
      "epoch": 0.34229046705054383,
      "grad_norm": 0.3154048025608063,
      "learning_rate": 0.00013154190658989126,
      "loss": 0.2563,
      "step": 535
    },
    {
      "epoch": 0.345489443378119,
      "grad_norm": 0.3658270537853241,
      "learning_rate": 0.0001309021113243762,
      "loss": 0.262,
      "step": 540
    },
    {
      "epoch": 0.3486884197056942,
      "grad_norm": 0.287977010011673,
      "learning_rate": 0.00013026231605886118,
      "loss": 0.2516,
      "step": 545
    },
    {
      "epoch": 0.35188739603326935,
      "grad_norm": 0.32740944623947144,
      "learning_rate": 0.00012962252079334614,
      "loss": 0.2598,
      "step": 550
    },
    {
      "epoch": 0.3550863723608445,
      "grad_norm": 0.3204440772533417,
      "learning_rate": 0.0001289827255278311,
      "loss": 0.2476,
      "step": 555
    },
    {
      "epoch": 0.3582853486884197,
      "grad_norm": 0.2696886360645294,
      "learning_rate": 0.00012834293026231608,
      "loss": 0.2135,
      "step": 560
    },
    {
      "epoch": 0.36148432501599487,
      "grad_norm": 0.24359866976737976,
      "learning_rate": 0.00012770313499680102,
      "loss": 0.2182,
      "step": 565
    },
    {
      "epoch": 0.3646833013435701,
      "grad_norm": 0.2711159586906433,
      "learning_rate": 0.000127063339731286,
      "loss": 0.2381,
      "step": 570
    },
    {
      "epoch": 0.36788227767114523,
      "grad_norm": 0.3088955283164978,
      "learning_rate": 0.00012642354446577096,
      "loss": 0.1968,
      "step": 575
    },
    {
      "epoch": 0.3710812539987204,
      "grad_norm": 0.28893983364105225,
      "learning_rate": 0.00012578374920025592,
      "loss": 0.2539,
      "step": 580
    },
    {
      "epoch": 0.3742802303262956,
      "grad_norm": 0.30569225549697876,
      "learning_rate": 0.0001251439539347409,
      "loss": 0.2491,
      "step": 585
    },
    {
      "epoch": 0.37747920665387075,
      "grad_norm": 0.29621535539627075,
      "learning_rate": 0.00012450415866922584,
      "loss": 0.2148,
      "step": 590
    },
    {
      "epoch": 0.38067818298144596,
      "grad_norm": 0.32468584179878235,
      "learning_rate": 0.0001238643634037108,
      "loss": 0.2395,
      "step": 595
    },
    {
      "epoch": 0.3838771593090211,
      "grad_norm": 0.3553343415260315,
      "learning_rate": 0.0001232245681381958,
      "loss": 0.2083,
      "step": 600
    },
    {
      "epoch": 0.38707613563659626,
      "grad_norm": 0.3482455611228943,
      "learning_rate": 0.00012258477287268075,
      "loss": 0.2406,
      "step": 605
    },
    {
      "epoch": 0.3902751119641715,
      "grad_norm": 0.2828838527202606,
      "learning_rate": 0.00012194497760716572,
      "loss": 0.2455,
      "step": 610
    },
    {
      "epoch": 0.3934740882917466,
      "grad_norm": 0.18922460079193115,
      "learning_rate": 0.00012130518234165067,
      "loss": 0.2402,
      "step": 615
    },
    {
      "epoch": 0.39667306461932184,
      "grad_norm": 0.36727848649024963,
      "learning_rate": 0.00012066538707613564,
      "loss": 0.2348,
      "step": 620
    },
    {
      "epoch": 0.399872040946897,
      "grad_norm": 0.29675740003585815,
      "learning_rate": 0.00012002559181062061,
      "loss": 0.2314,
      "step": 625
    },
    {
      "epoch": 0.40307101727447214,
      "grad_norm": 0.3717605471611023,
      "learning_rate": 0.00011938579654510557,
      "loss": 0.2252,
      "step": 630
    },
    {
      "epoch": 0.40626999360204735,
      "grad_norm": 0.3321962356567383,
      "learning_rate": 0.00011874600127959055,
      "loss": 0.2004,
      "step": 635
    },
    {
      "epoch": 0.4094689699296225,
      "grad_norm": 0.2975044846534729,
      "learning_rate": 0.00011810620601407549,
      "loss": 0.2394,
      "step": 640
    },
    {
      "epoch": 0.4126679462571977,
      "grad_norm": 0.5578257441520691,
      "learning_rate": 0.00011746641074856047,
      "loss": 0.2731,
      "step": 645
    },
    {
      "epoch": 0.41586692258477287,
      "grad_norm": 0.3975297212600708,
      "learning_rate": 0.00011682661548304543,
      "loss": 0.2558,
      "step": 650
    },
    {
      "epoch": 0.419065898912348,
      "grad_norm": 0.25277817249298096,
      "learning_rate": 0.0001161868202175304,
      "loss": 0.2312,
      "step": 655
    },
    {
      "epoch": 0.42226487523992323,
      "grad_norm": 0.32298269867897034,
      "learning_rate": 0.00011554702495201537,
      "loss": 0.2448,
      "step": 660
    },
    {
      "epoch": 0.4254638515674984,
      "grad_norm": 0.4133547246456146,
      "learning_rate": 0.00011490722968650032,
      "loss": 0.2408,
      "step": 665
    },
    {
      "epoch": 0.4286628278950736,
      "grad_norm": 0.2923837900161743,
      "learning_rate": 0.00011426743442098529,
      "loss": 0.2268,
      "step": 670
    },
    {
      "epoch": 0.43186180422264875,
      "grad_norm": 0.2599218785762787,
      "learning_rate": 0.00011362763915547025,
      "loss": 0.2255,
      "step": 675
    },
    {
      "epoch": 0.4350607805502239,
      "grad_norm": 0.3489529490470886,
      "learning_rate": 0.00011298784388995522,
      "loss": 0.254,
      "step": 680
    },
    {
      "epoch": 0.4382597568777991,
      "grad_norm": 0.30105629563331604,
      "learning_rate": 0.0001123480486244402,
      "loss": 0.269,
      "step": 685
    },
    {
      "epoch": 0.44145873320537427,
      "grad_norm": 0.37994736433029175,
      "learning_rate": 0.00011170825335892514,
      "loss": 0.2191,
      "step": 690
    },
    {
      "epoch": 0.4446577095329495,
      "grad_norm": 0.35344600677490234,
      "learning_rate": 0.00011106845809341012,
      "loss": 0.2566,
      "step": 695
    },
    {
      "epoch": 0.44785668586052463,
      "grad_norm": 0.28387364745140076,
      "learning_rate": 0.00011042866282789508,
      "loss": 0.2277,
      "step": 700
    },
    {
      "epoch": 0.4510556621880998,
      "grad_norm": 0.36002182960510254,
      "learning_rate": 0.00010978886756238005,
      "loss": 0.2447,
      "step": 705
    },
    {
      "epoch": 0.454254638515675,
      "grad_norm": 0.30156582593917847,
      "learning_rate": 0.00010914907229686502,
      "loss": 0.234,
      "step": 710
    },
    {
      "epoch": 0.45745361484325014,
      "grad_norm": 0.3468836843967438,
      "learning_rate": 0.00010850927703134997,
      "loss": 0.2466,
      "step": 715
    },
    {
      "epoch": 0.46065259117082535,
      "grad_norm": 0.33030760288238525,
      "learning_rate": 0.00010786948176583493,
      "loss": 0.2745,
      "step": 720
    },
    {
      "epoch": 0.4638515674984005,
      "grad_norm": 0.3344646096229553,
      "learning_rate": 0.0001072296865003199,
      "loss": 0.2154,
      "step": 725
    },
    {
      "epoch": 0.46705054382597566,
      "grad_norm": 0.40286871790885925,
      "learning_rate": 0.00010658989123480487,
      "loss": 0.2227,
      "step": 730
    },
    {
      "epoch": 0.47024952015355087,
      "grad_norm": 0.2880500257015228,
      "learning_rate": 0.00010595009596928985,
      "loss": 0.2219,
      "step": 735
    },
    {
      "epoch": 0.473448496481126,
      "grad_norm": 0.28410083055496216,
      "learning_rate": 0.0001053103007037748,
      "loss": 0.2124,
      "step": 740
    },
    {
      "epoch": 0.47664747280870123,
      "grad_norm": 0.2937108278274536,
      "learning_rate": 0.00010467050543825975,
      "loss": 0.2272,
      "step": 745
    },
    {
      "epoch": 0.4798464491362764,
      "grad_norm": 0.33740442991256714,
      "learning_rate": 0.00010403071017274473,
      "loss": 0.2332,
      "step": 750
    },
    {
      "epoch": 0.48304542546385154,
      "grad_norm": 0.2969924807548523,
      "learning_rate": 0.0001033909149072297,
      "loss": 0.2382,
      "step": 755
    },
    {
      "epoch": 0.48624440179142675,
      "grad_norm": 0.25243079662323,
      "learning_rate": 0.00010275111964171466,
      "loss": 0.2266,
      "step": 760
    },
    {
      "epoch": 0.4894433781190019,
      "grad_norm": 0.30554574728012085,
      "learning_rate": 0.00010211132437619962,
      "loss": 0.2276,
      "step": 765
    },
    {
      "epoch": 0.4926423544465771,
      "grad_norm": 0.281543493270874,
      "learning_rate": 0.00010147152911068458,
      "loss": 0.2286,
      "step": 770
    },
    {
      "epoch": 0.49584133077415227,
      "grad_norm": 0.2906375229358673,
      "learning_rate": 0.00010083173384516955,
      "loss": 0.2446,
      "step": 775
    },
    {
      "epoch": 0.4990403071017274,
      "grad_norm": 0.27951693534851074,
      "learning_rate": 0.00010019193857965453,
      "loss": 0.2422,
      "step": 780
    },
    {
      "epoch": 0.5022392834293026,
      "grad_norm": 0.3846909999847412,
      "learning_rate": 9.955214331413948e-05,
      "loss": 0.238,
      "step": 785
    },
    {
      "epoch": 0.5054382597568778,
      "grad_norm": 0.27699944376945496,
      "learning_rate": 9.891234804862444e-05,
      "loss": 0.2299,
      "step": 790
    },
    {
      "epoch": 0.508637236084453,
      "grad_norm": 0.2875959575176239,
      "learning_rate": 9.82725527831094e-05,
      "loss": 0.2293,
      "step": 795
    },
    {
      "epoch": 0.5118362124120281,
      "grad_norm": 0.23799441754817963,
      "learning_rate": 9.763275751759438e-05,
      "loss": 0.2332,
      "step": 800
    },
    {
      "epoch": 0.5150351887396033,
      "grad_norm": 0.3258192241191864,
      "learning_rate": 9.699296225207935e-05,
      "loss": 0.2137,
      "step": 805
    },
    {
      "epoch": 0.5182341650671785,
      "grad_norm": 0.4027354121208191,
      "learning_rate": 9.63531669865643e-05,
      "loss": 0.2528,
      "step": 810
    },
    {
      "epoch": 0.5214331413947537,
      "grad_norm": 0.5166566967964172,
      "learning_rate": 9.571337172104927e-05,
      "loss": 0.2171,
      "step": 815
    },
    {
      "epoch": 0.5246321177223289,
      "grad_norm": 0.2721567451953888,
      "learning_rate": 9.507357645553423e-05,
      "loss": 0.2233,
      "step": 820
    },
    {
      "epoch": 0.527831094049904,
      "grad_norm": 0.295593798160553,
      "learning_rate": 9.44337811900192e-05,
      "loss": 0.2302,
      "step": 825
    },
    {
      "epoch": 0.5310300703774792,
      "grad_norm": 0.3672064244747162,
      "learning_rate": 9.379398592450416e-05,
      "loss": 0.225,
      "step": 830
    },
    {
      "epoch": 0.5342290467050543,
      "grad_norm": 0.27494198083877563,
      "learning_rate": 9.315419065898912e-05,
      "loss": 0.2221,
      "step": 835
    },
    {
      "epoch": 0.5374280230326296,
      "grad_norm": 0.4797211289405823,
      "learning_rate": 9.25143953934741e-05,
      "loss": 0.2439,
      "step": 840
    },
    {
      "epoch": 0.5406269993602048,
      "grad_norm": 0.2528781592845917,
      "learning_rate": 9.187460012795905e-05,
      "loss": 0.2267,
      "step": 845
    },
    {
      "epoch": 0.5438259756877799,
      "grad_norm": 0.435768723487854,
      "learning_rate": 9.123480486244403e-05,
      "loss": 0.2433,
      "step": 850
    },
    {
      "epoch": 0.5470249520153551,
      "grad_norm": 0.3093527853488922,
      "learning_rate": 9.059500959692899e-05,
      "loss": 0.2519,
      "step": 855
    },
    {
      "epoch": 0.5502239283429302,
      "grad_norm": 0.31521865725517273,
      "learning_rate": 8.995521433141395e-05,
      "loss": 0.2563,
      "step": 860
    },
    {
      "epoch": 0.5534229046705055,
      "grad_norm": 0.2934703230857849,
      "learning_rate": 8.931541906589892e-05,
      "loss": 0.2576,
      "step": 865
    },
    {
      "epoch": 0.5566218809980806,
      "grad_norm": 0.33386310935020447,
      "learning_rate": 8.867562380038388e-05,
      "loss": 0.2342,
      "step": 870
    },
    {
      "epoch": 0.5598208573256558,
      "grad_norm": 0.3183169960975647,
      "learning_rate": 8.803582853486885e-05,
      "loss": 0.2325,
      "step": 875
    },
    {
      "epoch": 0.5630198336532309,
      "grad_norm": 0.27800726890563965,
      "learning_rate": 8.739603326935381e-05,
      "loss": 0.2256,
      "step": 880
    },
    {
      "epoch": 0.5662188099808061,
      "grad_norm": 0.31215038895606995,
      "learning_rate": 8.675623800383877e-05,
      "loss": 0.206,
      "step": 885
    },
    {
      "epoch": 0.5694177863083814,
      "grad_norm": 0.31119444966316223,
      "learning_rate": 8.611644273832375e-05,
      "loss": 0.2447,
      "step": 890
    },
    {
      "epoch": 0.5726167626359565,
      "grad_norm": 0.27355003356933594,
      "learning_rate": 8.54766474728087e-05,
      "loss": 0.2001,
      "step": 895
    },
    {
      "epoch": 0.5758157389635317,
      "grad_norm": 0.3279555141925812,
      "learning_rate": 8.483685220729366e-05,
      "loss": 0.218,
      "step": 900
    },
    {
      "epoch": 0.5790147152911068,
      "grad_norm": 0.33151063323020935,
      "learning_rate": 8.419705694177864e-05,
      "loss": 0.2206,
      "step": 905
    },
    {
      "epoch": 0.582213691618682,
      "grad_norm": 0.34513482451438904,
      "learning_rate": 8.35572616762636e-05,
      "loss": 0.2615,
      "step": 910
    },
    {
      "epoch": 0.5854126679462572,
      "grad_norm": 0.38724973797798157,
      "learning_rate": 8.291746641074857e-05,
      "loss": 0.2445,
      "step": 915
    },
    {
      "epoch": 0.5886116442738324,
      "grad_norm": 0.30885693430900574,
      "learning_rate": 8.227767114523353e-05,
      "loss": 0.2131,
      "step": 920
    },
    {
      "epoch": 0.5918106206014075,
      "grad_norm": 0.3177817463874817,
      "learning_rate": 8.163787587971849e-05,
      "loss": 0.2382,
      "step": 925
    },
    {
      "epoch": 0.5950095969289827,
      "grad_norm": 0.2687653601169586,
      "learning_rate": 8.099808061420346e-05,
      "loss": 0.2352,
      "step": 930
    },
    {
      "epoch": 0.5982085732565579,
      "grad_norm": 0.23862284421920776,
      "learning_rate": 8.035828534868842e-05,
      "loss": 0.2073,
      "step": 935
    },
    {
      "epoch": 0.6014075495841331,
      "grad_norm": 0.39008885622024536,
      "learning_rate": 7.97184900831734e-05,
      "loss": 0.235,
      "step": 940
    },
    {
      "epoch": 0.6046065259117083,
      "grad_norm": 0.310867041349411,
      "learning_rate": 7.907869481765836e-05,
      "loss": 0.222,
      "step": 945
    },
    {
      "epoch": 0.6078055022392834,
      "grad_norm": 0.306083083152771,
      "learning_rate": 7.843889955214332e-05,
      "loss": 0.2255,
      "step": 950
    },
    {
      "epoch": 0.6110044785668586,
      "grad_norm": 0.3439415991306305,
      "learning_rate": 7.779910428662829e-05,
      "loss": 0.2124,
      "step": 955
    },
    {
      "epoch": 0.6142034548944337,
      "grad_norm": 0.2899746000766754,
      "learning_rate": 7.715930902111325e-05,
      "loss": 0.1934,
      "step": 960
    },
    {
      "epoch": 0.617402431222009,
      "grad_norm": 0.3107975721359253,
      "learning_rate": 7.651951375559822e-05,
      "loss": 0.2277,
      "step": 965
    },
    {
      "epoch": 0.6206014075495841,
      "grad_norm": 0.3492138385772705,
      "learning_rate": 7.587971849008317e-05,
      "loss": 0.2413,
      "step": 970
    },
    {
      "epoch": 0.6238003838771593,
      "grad_norm": 0.27954694628715515,
      "learning_rate": 7.523992322456814e-05,
      "loss": 0.2164,
      "step": 975
    },
    {
      "epoch": 0.6269993602047345,
      "grad_norm": 0.34748780727386475,
      "learning_rate": 7.46001279590531e-05,
      "loss": 0.2258,
      "step": 980
    },
    {
      "epoch": 0.6301983365323096,
      "grad_norm": 0.35154399275779724,
      "learning_rate": 7.396033269353807e-05,
      "loss": 0.2175,
      "step": 985
    },
    {
      "epoch": 0.6333973128598849,
      "grad_norm": 0.3060106039047241,
      "learning_rate": 7.332053742802303e-05,
      "loss": 0.2318,
      "step": 990
    },
    {
      "epoch": 0.63659628918746,
      "grad_norm": 0.31797581911087036,
      "learning_rate": 7.268074216250799e-05,
      "loss": 0.2027,
      "step": 995
    },
    {
      "epoch": 0.6397952655150352,
      "grad_norm": 0.33520376682281494,
      "learning_rate": 7.204094689699297e-05,
      "loss": 0.2312,
      "step": 1000
    },
    {
      "epoch": 0.6429942418426103,
      "grad_norm": 0.21483170986175537,
      "learning_rate": 7.140115163147793e-05,
      "loss": 0.2058,
      "step": 1005
    },
    {
      "epoch": 0.6461932181701855,
      "grad_norm": 0.33610713481903076,
      "learning_rate": 7.07613563659629e-05,
      "loss": 0.2415,
      "step": 1010
    },
    {
      "epoch": 0.6493921944977608,
      "grad_norm": 0.26923608779907227,
      "learning_rate": 7.012156110044786e-05,
      "loss": 0.2208,
      "step": 1015
    },
    {
      "epoch": 0.6525911708253359,
      "grad_norm": 0.3274904489517212,
      "learning_rate": 6.948176583493282e-05,
      "loss": 0.212,
      "step": 1020
    },
    {
      "epoch": 0.6557901471529111,
      "grad_norm": 0.3245833218097687,
      "learning_rate": 6.884197056941779e-05,
      "loss": 0.2191,
      "step": 1025
    },
    {
      "epoch": 0.6589891234804862,
      "grad_norm": 0.27330532670021057,
      "learning_rate": 6.820217530390275e-05,
      "loss": 0.1992,
      "step": 1030
    },
    {
      "epoch": 0.6621880998080614,
      "grad_norm": 0.25868290662765503,
      "learning_rate": 6.756238003838772e-05,
      "loss": 0.2399,
      "step": 1035
    },
    {
      "epoch": 0.6653870761356366,
      "grad_norm": 0.30688875913619995,
      "learning_rate": 6.692258477287268e-05,
      "loss": 0.2342,
      "step": 1040
    },
    {
      "epoch": 0.6685860524632118,
      "grad_norm": 0.2929762899875641,
      "learning_rate": 6.628278950735764e-05,
      "loss": 0.2369,
      "step": 1045
    },
    {
      "epoch": 0.6717850287907869,
      "grad_norm": 0.31221550703048706,
      "learning_rate": 6.564299424184262e-05,
      "loss": 0.2224,
      "step": 1050
    },
    {
      "epoch": 0.6749840051183621,
      "grad_norm": 0.32099705934524536,
      "learning_rate": 6.500319897632758e-05,
      "loss": 0.2144,
      "step": 1055
    },
    {
      "epoch": 0.6781829814459372,
      "grad_norm": 0.2826564908027649,
      "learning_rate": 6.436340371081254e-05,
      "loss": 0.2271,
      "step": 1060
    },
    {
      "epoch": 0.6813819577735125,
      "grad_norm": 0.2807115316390991,
      "learning_rate": 6.372360844529751e-05,
      "loss": 0.202,
      "step": 1065
    },
    {
      "epoch": 0.6845809341010877,
      "grad_norm": 0.3093265891075134,
      "learning_rate": 6.308381317978247e-05,
      "loss": 0.2328,
      "step": 1070
    },
    {
      "epoch": 0.6877799104286628,
      "grad_norm": 0.3201155662536621,
      "learning_rate": 6.244401791426744e-05,
      "loss": 0.209,
      "step": 1075
    },
    {
      "epoch": 0.690978886756238,
      "grad_norm": 0.27004414796829224,
      "learning_rate": 6.18042226487524e-05,
      "loss": 0.2278,
      "step": 1080
    },
    {
      "epoch": 0.6941778630838131,
      "grad_norm": 0.27909284830093384,
      "learning_rate": 6.116442738323736e-05,
      "loss": 0.23,
      "step": 1085
    },
    {
      "epoch": 0.6973768394113884,
      "grad_norm": 0.3248424828052521,
      "learning_rate": 6.0524632117722334e-05,
      "loss": 0.2309,
      "step": 1090
    },
    {
      "epoch": 0.7005758157389635,
      "grad_norm": 0.31023091077804565,
      "learning_rate": 5.9884836852207293e-05,
      "loss": 0.2084,
      "step": 1095
    },
    {
      "epoch": 0.7037747920665387,
      "grad_norm": 0.302664190530777,
      "learning_rate": 5.924504158669226e-05,
      "loss": 0.2367,
      "step": 1100
    },
    {
      "epoch": 0.7069737683941139,
      "grad_norm": 0.328142374753952,
      "learning_rate": 5.860524632117722e-05,
      "loss": 0.2215,
      "step": 1105
    },
    {
      "epoch": 0.710172744721689,
      "grad_norm": 0.34118109941482544,
      "learning_rate": 5.796545105566219e-05,
      "loss": 0.2077,
      "step": 1110
    },
    {
      "epoch": 0.7133717210492643,
      "grad_norm": 0.2934885025024414,
      "learning_rate": 5.732565579014716e-05,
      "loss": 0.2226,
      "step": 1115
    },
    {
      "epoch": 0.7165706973768394,
      "grad_norm": 0.306045800447464,
      "learning_rate": 5.668586052463212e-05,
      "loss": 0.1962,
      "step": 1120
    },
    {
      "epoch": 0.7197696737044146,
      "grad_norm": 0.3221231997013092,
      "learning_rate": 5.6046065259117085e-05,
      "loss": 0.2097,
      "step": 1125
    },
    {
      "epoch": 0.7229686500319897,
      "grad_norm": 0.29269224405288696,
      "learning_rate": 5.5406269993602045e-05,
      "loss": 0.2147,
      "step": 1130
    },
    {
      "epoch": 0.7261676263595649,
      "grad_norm": 0.3249344229698181,
      "learning_rate": 5.476647472808701e-05,
      "loss": 0.1956,
      "step": 1135
    },
    {
      "epoch": 0.7293666026871402,
      "grad_norm": 0.3102353811264038,
      "learning_rate": 5.4126679462571984e-05,
      "loss": 0.2092,
      "step": 1140
    },
    {
      "epoch": 0.7325655790147153,
      "grad_norm": 0.38312363624572754,
      "learning_rate": 5.3486884197056944e-05,
      "loss": 0.2294,
      "step": 1145
    },
    {
      "epoch": 0.7357645553422905,
      "grad_norm": 0.33595603704452515,
      "learning_rate": 5.284708893154191e-05,
      "loss": 0.242,
      "step": 1150
    },
    {
      "epoch": 0.7389635316698656,
      "grad_norm": 0.29311510920524597,
      "learning_rate": 5.220729366602687e-05,
      "loss": 0.2212,
      "step": 1155
    },
    {
      "epoch": 0.7421625079974408,
      "grad_norm": 0.2701033651828766,
      "learning_rate": 5.1567498400511836e-05,
      "loss": 0.2082,
      "step": 1160
    },
    {
      "epoch": 0.745361484325016,
      "grad_norm": 0.3194945156574249,
      "learning_rate": 5.092770313499681e-05,
      "loss": 0.235,
      "step": 1165
    },
    {
      "epoch": 0.7485604606525912,
      "grad_norm": 0.25952160358428955,
      "learning_rate": 5.028790786948176e-05,
      "loss": 0.2405,
      "step": 1170
    },
    {
      "epoch": 0.7517594369801663,
      "grad_norm": 0.3131108283996582,
      "learning_rate": 4.9648112603966736e-05,
      "loss": 0.2118,
      "step": 1175
    },
    {
      "epoch": 0.7549584133077415,
      "grad_norm": 0.40070056915283203,
      "learning_rate": 4.9008317338451695e-05,
      "loss": 0.2386,
      "step": 1180
    },
    {
      "epoch": 0.7581573896353166,
      "grad_norm": 0.38076481223106384,
      "learning_rate": 4.836852207293666e-05,
      "loss": 0.229,
      "step": 1185
    },
    {
      "epoch": 0.7613563659628919,
      "grad_norm": 0.26312530040740967,
      "learning_rate": 4.772872680742163e-05,
      "loss": 0.2029,
      "step": 1190
    },
    {
      "epoch": 0.7645553422904671,
      "grad_norm": 0.365788996219635,
      "learning_rate": 4.7088931541906594e-05,
      "loss": 0.2215,
      "step": 1195
    },
    {
      "epoch": 0.7677543186180422,
      "grad_norm": 0.246324360370636,
      "learning_rate": 4.644913627639156e-05,
      "loss": 0.2067,
      "step": 1200
    },
    {
      "epoch": 0.7709532949456174,
      "grad_norm": 0.2862643003463745,
      "learning_rate": 4.580934101087652e-05,
      "loss": 0.1955,
      "step": 1205
    },
    {
      "epoch": 0.7741522712731925,
      "grad_norm": 0.3309938609600067,
      "learning_rate": 4.516954574536149e-05,
      "loss": 0.227,
      "step": 1210
    },
    {
      "epoch": 0.7773512476007678,
      "grad_norm": 0.3223839998245239,
      "learning_rate": 4.4529750479846447e-05,
      "loss": 0.2292,
      "step": 1215
    },
    {
      "epoch": 0.780550223928343,
      "grad_norm": 0.3467869460582733,
      "learning_rate": 4.388995521433142e-05,
      "loss": 0.209,
      "step": 1220
    },
    {
      "epoch": 0.7837492002559181,
      "grad_norm": 0.34096330404281616,
      "learning_rate": 4.325015994881638e-05,
      "loss": 0.2041,
      "step": 1225
    },
    {
      "epoch": 0.7869481765834933,
      "grad_norm": 0.33046209812164307,
      "learning_rate": 4.2610364683301346e-05,
      "loss": 0.1974,
      "step": 1230
    },
    {
      "epoch": 0.7901471529110684,
      "grad_norm": 0.4353679418563843,
      "learning_rate": 4.197056941778631e-05,
      "loss": 0.1992,
      "step": 1235
    },
    {
      "epoch": 0.7933461292386437,
      "grad_norm": 0.35748305916786194,
      "learning_rate": 4.133077415227127e-05,
      "loss": 0.2237,
      "step": 1240
    },
    {
      "epoch": 0.7965451055662188,
      "grad_norm": 0.3174164891242981,
      "learning_rate": 4.0690978886756245e-05,
      "loss": 0.2136,
      "step": 1245
    },
    {
      "epoch": 0.799744081893794,
      "grad_norm": 0.3202930688858032,
      "learning_rate": 4.0051183621241205e-05,
      "loss": 0.2301,
      "step": 1250
    },
    {
      "epoch": 0.8029430582213691,
      "grad_norm": 0.2494242787361145,
      "learning_rate": 3.941138835572617e-05,
      "loss": 0.1783,
      "step": 1255
    },
    {
      "epoch": 0.8061420345489443,
      "grad_norm": 0.35551324486732483,
      "learning_rate": 3.877159309021113e-05,
      "loss": 0.2031,
      "step": 1260
    },
    {
      "epoch": 0.8093410108765196,
      "grad_norm": 0.2531558573246002,
      "learning_rate": 3.81317978246961e-05,
      "loss": 0.2365,
      "step": 1265
    },
    {
      "epoch": 0.8125399872040947,
      "grad_norm": 0.43033191561698914,
      "learning_rate": 3.7492002559181063e-05,
      "loss": 0.2217,
      "step": 1270
    },
    {
      "epoch": 0.8157389635316699,
      "grad_norm": 0.3271211087703705,
      "learning_rate": 3.685220729366603e-05,
      "loss": 0.2198,
      "step": 1275
    },
    {
      "epoch": 0.818937939859245,
      "grad_norm": 0.2692057490348816,
      "learning_rate": 3.6212412028150996e-05,
      "loss": 0.2099,
      "step": 1280
    },
    {
      "epoch": 0.8221369161868202,
      "grad_norm": 0.2919643223285675,
      "learning_rate": 3.5572616762635956e-05,
      "loss": 0.2148,
      "step": 1285
    },
    {
      "epoch": 0.8253358925143954,
      "grad_norm": 0.3477243185043335,
      "learning_rate": 3.493282149712092e-05,
      "loss": 0.2045,
      "step": 1290
    },
    {
      "epoch": 0.8285348688419706,
      "grad_norm": 0.3863447308540344,
      "learning_rate": 3.429302623160589e-05,
      "loss": 0.2481,
      "step": 1295
    },
    {
      "epoch": 0.8317338451695457,
      "grad_norm": 0.3201524019241333,
      "learning_rate": 3.3653230966090855e-05,
      "loss": 0.2121,
      "step": 1300
    },
    {
      "epoch": 0.8349328214971209,
      "grad_norm": 0.3398021459579468,
      "learning_rate": 3.3013435700575815e-05,
      "loss": 0.2251,
      "step": 1305
    },
    {
      "epoch": 0.838131797824696,
      "grad_norm": 0.29826006293296814,
      "learning_rate": 3.237364043506078e-05,
      "loss": 0.1768,
      "step": 1310
    },
    {
      "epoch": 0.8413307741522713,
      "grad_norm": 0.3297532796859741,
      "learning_rate": 3.173384516954575e-05,
      "loss": 0.2026,
      "step": 1315
    },
    {
      "epoch": 0.8445297504798465,
      "grad_norm": 0.3907334804534912,
      "learning_rate": 3.1094049904030714e-05,
      "loss": 0.2032,
      "step": 1320
    },
    {
      "epoch": 0.8477287268074216,
      "grad_norm": 0.27420273423194885,
      "learning_rate": 3.0454254638515677e-05,
      "loss": 0.1845,
      "step": 1325
    },
    {
      "epoch": 0.8509277031349968,
      "grad_norm": 0.36319416761398315,
      "learning_rate": 2.981445937300064e-05,
      "loss": 0.2143,
      "step": 1330
    },
    {
      "epoch": 0.8541266794625719,
      "grad_norm": 0.26537972688674927,
      "learning_rate": 2.9174664107485606e-05,
      "loss": 0.2228,
      "step": 1335
    },
    {
      "epoch": 0.8573256557901472,
      "grad_norm": 0.3364832103252411,
      "learning_rate": 2.853486884197057e-05,
      "loss": 0.1889,
      "step": 1340
    },
    {
      "epoch": 0.8605246321177223,
      "grad_norm": 0.32128915190696716,
      "learning_rate": 2.789507357645554e-05,
      "loss": 0.2023,
      "step": 1345
    },
    {
      "epoch": 0.8637236084452975,
      "grad_norm": 0.3016292452812195,
      "learning_rate": 2.7255278310940502e-05,
      "loss": 0.247,
      "step": 1350
    },
    {
      "epoch": 0.8669225847728727,
      "grad_norm": 0.3532962501049042,
      "learning_rate": 2.6615483045425465e-05,
      "loss": 0.22,
      "step": 1355
    },
    {
      "epoch": 0.8701215611004478,
      "grad_norm": 0.26920193433761597,
      "learning_rate": 2.5975687779910428e-05,
      "loss": 0.1888,
      "step": 1360
    },
    {
      "epoch": 0.8733205374280231,
      "grad_norm": 0.27570822834968567,
      "learning_rate": 2.533589251439539e-05,
      "loss": 0.2278,
      "step": 1365
    },
    {
      "epoch": 0.8765195137555982,
      "grad_norm": 0.2538580596446991,
      "learning_rate": 2.4696097248880358e-05,
      "loss": 0.1739,
      "step": 1370
    },
    {
      "epoch": 0.8797184900831734,
      "grad_norm": 0.2816247045993805,
      "learning_rate": 2.4056301983365324e-05,
      "loss": 0.2169,
      "step": 1375
    },
    {
      "epoch": 0.8829174664107485,
      "grad_norm": 0.3473341763019562,
      "learning_rate": 2.341650671785029e-05,
      "loss": 0.226,
      "step": 1380
    },
    {
      "epoch": 0.8861164427383237,
      "grad_norm": 0.2831457257270813,
      "learning_rate": 2.2776711452335254e-05,
      "loss": 0.2125,
      "step": 1385
    },
    {
      "epoch": 0.889315419065899,
      "grad_norm": 0.30768829584121704,
      "learning_rate": 2.213691618682022e-05,
      "loss": 0.1819,
      "step": 1390
    },
    {
      "epoch": 0.8925143953934741,
      "grad_norm": 0.31688612699508667,
      "learning_rate": 2.1497120921305183e-05,
      "loss": 0.2194,
      "step": 1395
    },
    {
      "epoch": 0.8957133717210493,
      "grad_norm": 0.3418295383453369,
      "learning_rate": 2.0857325655790146e-05,
      "loss": 0.2169,
      "step": 1400
    },
    {
      "epoch": 0.8989123480486244,
      "grad_norm": 0.37767553329467773,
      "learning_rate": 2.0217530390275112e-05,
      "loss": 0.2323,
      "step": 1405
    },
    {
      "epoch": 0.9021113243761996,
      "grad_norm": 0.26787883043289185,
      "learning_rate": 1.957773512476008e-05,
      "loss": 0.2024,
      "step": 1410
    },
    {
      "epoch": 0.9053103007037748,
      "grad_norm": 0.26510849595069885,
      "learning_rate": 1.8937939859245045e-05,
      "loss": 0.2327,
      "step": 1415
    },
    {
      "epoch": 0.90850927703135,
      "grad_norm": 0.2409944236278534,
      "learning_rate": 1.8298144593730008e-05,
      "loss": 0.1867,
      "step": 1420
    },
    {
      "epoch": 0.9117082533589251,
      "grad_norm": 0.29780739545822144,
      "learning_rate": 1.765834932821497e-05,
      "loss": 0.1885,
      "step": 1425
    },
    {
      "epoch": 0.9149072296865003,
      "grad_norm": 0.3131493031978607,
      "learning_rate": 1.7018554062699938e-05,
      "loss": 0.1984,
      "step": 1430
    },
    {
      "epoch": 0.9181062060140754,
      "grad_norm": 0.30493828654289246,
      "learning_rate": 1.63787587971849e-05,
      "loss": 0.2028,
      "step": 1435
    },
    {
      "epoch": 0.9213051823416507,
      "grad_norm": 0.4079281985759735,
      "learning_rate": 1.5738963531669867e-05,
      "loss": 0.2297,
      "step": 1440
    },
    {
      "epoch": 0.9245041586692259,
      "grad_norm": 0.3364027440547943,
      "learning_rate": 1.5099168266154832e-05,
      "loss": 0.2007,
      "step": 1445
    },
    {
      "epoch": 0.927703134996801,
      "grad_norm": 0.29711103439331055,
      "learning_rate": 1.4459373000639795e-05,
      "loss": 0.2201,
      "step": 1450
    },
    {
      "epoch": 0.9309021113243762,
      "grad_norm": 0.3361244797706604,
      "learning_rate": 1.3819577735124761e-05,
      "loss": 0.2118,
      "step": 1455
    },
    {
      "epoch": 0.9341010876519513,
      "grad_norm": 0.2692669630050659,
      "learning_rate": 1.3179782469609724e-05,
      "loss": 0.2036,
      "step": 1460
    },
    {
      "epoch": 0.9373000639795266,
      "grad_norm": 0.2790112793445587,
      "learning_rate": 1.253998720409469e-05,
      "loss": 0.1977,
      "step": 1465
    },
    {
      "epoch": 0.9404990403071017,
      "grad_norm": 0.3373515009880066,
      "learning_rate": 1.1900191938579655e-05,
      "loss": 0.1959,
      "step": 1470
    },
    {
      "epoch": 0.9436980166346769,
      "grad_norm": 0.27667558193206787,
      "learning_rate": 1.126039667306462e-05,
      "loss": 0.21,
      "step": 1475
    },
    {
      "epoch": 0.946896992962252,
      "grad_norm": 0.342916876077652,
      "learning_rate": 1.0620601407549585e-05,
      "loss": 0.225,
      "step": 1480
    },
    {
      "epoch": 0.9500959692898272,
      "grad_norm": 0.31250983476638794,
      "learning_rate": 9.98080614203455e-06,
      "loss": 0.2517,
      "step": 1485
    },
    {
      "epoch": 0.9532949456174025,
      "grad_norm": 0.29028651118278503,
      "learning_rate": 9.341010876519514e-06,
      "loss": 0.2276,
      "step": 1490
    },
    {
      "epoch": 0.9564939219449776,
      "grad_norm": 0.3296051621437073,
      "learning_rate": 8.701215611004479e-06,
      "loss": 0.1873,
      "step": 1495
    },
    {
      "epoch": 0.9596928982725528,
      "grad_norm": 0.36832043528556824,
      "learning_rate": 8.061420345489444e-06,
      "loss": 0.2064,
      "step": 1500
    },
    {
      "epoch": 0.9628918746001279,
      "grad_norm": 0.3034568130970001,
      "learning_rate": 7.421625079974409e-06,
      "loss": 0.2035,
      "step": 1505
    },
    {
      "epoch": 0.9660908509277031,
      "grad_norm": 0.26890963315963745,
      "learning_rate": 6.781829814459373e-06,
      "loss": 0.2019,
      "step": 1510
    },
    {
      "epoch": 0.9692898272552783,
      "grad_norm": 0.39937588572502136,
      "learning_rate": 6.142034548944338e-06,
      "loss": 0.2026,
      "step": 1515
    },
    {
      "epoch": 0.9724888035828535,
      "grad_norm": 0.2571321427822113,
      "learning_rate": 5.502239283429303e-06,
      "loss": 0.1928,
      "step": 1520
    },
    {
      "epoch": 0.9756877799104287,
      "grad_norm": 0.30180391669273376,
      "learning_rate": 4.862444017914268e-06,
      "loss": 0.2086,
      "step": 1525
    },
    {
      "epoch": 0.9788867562380038,
      "grad_norm": 0.4258427321910858,
      "learning_rate": 4.222648752399233e-06,
      "loss": 0.1849,
      "step": 1530
    },
    {
      "epoch": 0.982085732565579,
      "grad_norm": 0.29561159014701843,
      "learning_rate": 3.5828534868841974e-06,
      "loss": 0.2151,
      "step": 1535
    },
    {
      "epoch": 0.9852847088931542,
      "grad_norm": 0.2644422948360443,
      "learning_rate": 2.943058221369162e-06,
      "loss": 0.189,
      "step": 1540
    },
    {
      "epoch": 0.9884836852207294,
      "grad_norm": 0.30060943961143494,
      "learning_rate": 2.303262955854127e-06,
      "loss": 0.2164,
      "step": 1545
    },
    {
      "epoch": 0.9916826615483045,
      "grad_norm": 0.3195488154888153,
      "learning_rate": 1.6634676903390916e-06,
      "loss": 0.2519,
      "step": 1550
    },
    {
      "epoch": 0.9948816378758797,
      "grad_norm": 0.30431658029556274,
      "learning_rate": 1.0236724248240563e-06,
      "loss": 0.231,
      "step": 1555
    },
    {
      "epoch": 0.9980806142034548,
      "grad_norm": 0.34465327858924866,
      "learning_rate": 3.838771593090211e-07,
      "loss": 0.2068,
      "step": 1560
    }
  ],
  "logging_steps": 5,
  "max_steps": 1563,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.925037329289052e+17,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}
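Every `log_history` entry above has the same shape (`epoch`, `grad_norm`, `learning_rate`, `loss`, logged every 5 steps), so the file is easy to post-process. As a small sketch, the snippet below loads the state file and checks that the logged learning rates match a linear decay from 2e-4 over `max_steps` = 1563 with no warmup; the file path is an assumption:

```python
import json

# Load the state file saved by the Hugging Face Trainer.
with open("trainer_state.json") as f:  # assumption: path to the file above
    state = json.load(f)

peak_lr = 2e-4
max_steps = state["max_steps"]  # 1563

for entry in state["log_history"]:
    step = entry["step"]
    # Linear schedule, no warmup: lr(step) = peak * (1 - step / max_steps)
    expected = peak_lr * (max_steps - step) / max_steps
    assert abs(entry["learning_rate"] - expected) < 1e-9, (step, entry["learning_rate"])

print(f"all {len(state['log_history'])} logged LRs match a linear decay from {peak_lr}")
```

The same loop can feed a quick loss-curve plot; the log shows the training loss falling from 2.20 at step 5 to roughly 0.21 by the end of the epoch.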