{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.07284240787863483,
  "eval_steps": 75,
  "global_step": 1875,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003884928420193858,
      "grad_norm": 0.49890583753585815,
      "learning_rate": 7.759200756309999e-05,
      "loss": 1.8971,
      "step": 10
    },
    {
      "epoch": 0.0007769856840387716,
      "grad_norm": 1.6535608768463135,
      "learning_rate": 0.0001,
      "loss": 1.6661,
      "step": 20
    },
    {
      "epoch": 0.0011654785260581573,
      "grad_norm": 1.0714327096939087,
      "learning_rate": 0.0001,
      "loss": 1.4684,
      "step": 30
    },
    {
      "epoch": 0.0015539713680775432,
      "grad_norm": 0.6442272067070007,
      "learning_rate": 0.0001,
      "loss": 1.5553,
      "step": 40
    },
    {
      "epoch": 0.0019424642100969289,
      "grad_norm": 0.818639874458313,
      "learning_rate": 0.0001,
      "loss": 1.5007,
      "step": 50
    },
    {
      "epoch": 0.0023309570521163146,
      "grad_norm": 1.3463096618652344,
      "learning_rate": 0.0001,
      "loss": 1.2678,
      "step": 60
    },
    {
      "epoch": 0.0027194498941357005,
      "grad_norm": 0.8409688472747803,
      "learning_rate": 0.0001,
      "loss": 1.4836,
      "step": 70
    },
    {
      "epoch": 0.0029136963151453936,
      "eval_loss": 1.4955415725708008,
      "eval_runtime": 328.1939,
      "eval_samples_per_second": 1.587,
      "eval_steps_per_second": 1.587,
      "step": 75
    },
    {
      "epoch": 0.0031079427361550864,
      "grad_norm": 0.49311453104019165,
      "learning_rate": 0.0001,
      "loss": 1.5626,
      "step": 80
    },
    {
      "epoch": 0.0034964355781744723,
      "grad_norm": 0.5505372881889343,
      "learning_rate": 0.0001,
      "loss": 1.4427,
      "step": 90
    },
    {
      "epoch": 0.0038849284201938577,
      "grad_norm": 1.1296964883804321,
      "learning_rate": 0.0001,
      "loss": 1.4702,
      "step": 100
    },
    {
      "epoch": 0.004273421262213244,
      "grad_norm": 1.38261878490448,
      "learning_rate": 0.0001,
      "loss": 1.5806,
      "step": 110
    },
    {
      "epoch": 0.004661914104232629,
      "grad_norm": 0.5213516354560852,
      "learning_rate": 0.0001,
      "loss": 1.4675,
      "step": 120
    },
    {
      "epoch": 0.0050504069462520155,
      "grad_norm": 2.93784761428833,
      "learning_rate": 0.0001,
      "loss": 1.6413,
      "step": 130
    },
    {
      "epoch": 0.005438899788271401,
      "grad_norm": 0.6772735118865967,
      "learning_rate": 0.0001,
      "loss": 1.2942,
      "step": 140
    },
    {
      "epoch": 0.005827392630290787,
      "grad_norm": 1.1066700220108032,
      "learning_rate": 0.0001,
      "loss": 1.4082,
      "step": 150
    },
    {
      "epoch": 0.005827392630290787,
      "eval_loss": 1.4717903137207031,
      "eval_runtime": 419.3037,
      "eval_samples_per_second": 1.243,
      "eval_steps_per_second": 1.243,
      "step": 150
    },
    {
      "epoch": 0.006215885472310173,
      "grad_norm": 0.5077598690986633,
      "learning_rate": 0.0001,
      "loss": 1.6572,
      "step": 160
    },
    {
      "epoch": 0.006604378314329558,
      "grad_norm": 0.5666481256484985,
      "learning_rate": 0.0001,
      "loss": 1.7344,
      "step": 170
    },
    {
      "epoch": 0.0069928711563489445,
      "grad_norm": 0.7042965888977051,
      "learning_rate": 0.0001,
      "loss": 1.3634,
      "step": 180
    },
    {
      "epoch": 0.00738136399836833,
      "grad_norm": 0.6379776000976562,
      "learning_rate": 0.0001,
      "loss": 1.6191,
      "step": 190
    },
    {
      "epoch": 0.0077698568403877155,
      "grad_norm": 0.7309342622756958,
      "learning_rate": 0.0001,
      "loss": 1.2186,
      "step": 200
    },
    {
      "epoch": 0.008158349682407102,
      "grad_norm": 1.4138643741607666,
      "learning_rate": 0.0001,
      "loss": 1.5201,
      "step": 210
    },
    {
      "epoch": 0.008546842524426487,
      "grad_norm": 1.3856728076934814,
      "learning_rate": 0.0001,
      "loss": 1.4056,
      "step": 220
    },
    {
      "epoch": 0.00874108894543618,
      "eval_loss": 1.473658800125122,
      "eval_runtime": 413.7612,
      "eval_samples_per_second": 1.259,
      "eval_steps_per_second": 1.259,
      "step": 225
    },
    {
      "epoch": 0.008935335366445873,
      "grad_norm": 1.1668083667755127,
      "learning_rate": 0.0001,
      "loss": 1.5261,
      "step": 230
    },
    {
      "epoch": 0.009323828208465258,
      "grad_norm": 0.719367265701294,
      "learning_rate": 0.0001,
      "loss": 1.4151,
      "step": 240
    },
    {
      "epoch": 0.009712321050484645,
      "grad_norm": 0.7443403005599976,
      "learning_rate": 0.0001,
      "loss": 1.3723,
      "step": 250
    },
    {
      "epoch": 0.010100813892504031,
      "grad_norm": 0.8915978670120239,
      "learning_rate": 0.0001,
      "loss": 1.5165,
      "step": 260
    },
    {
      "epoch": 0.010489306734523416,
      "grad_norm": 0.7369945049285889,
      "learning_rate": 0.0001,
      "loss": 1.4225,
      "step": 270
    },
    {
      "epoch": 0.010877799576542802,
      "grad_norm": 1.2632057666778564,
      "learning_rate": 0.0001,
      "loss": 1.3462,
      "step": 280
    },
    {
      "epoch": 0.011266292418562187,
      "grad_norm": 1.6178512573242188,
      "learning_rate": 0.0001,
      "loss": 1.4283,
      "step": 290
    },
    {
      "epoch": 0.011654785260581575,
      "grad_norm": 2.717789649963379,
      "learning_rate": 0.0001,
      "loss": 1.4773,
      "step": 300
    },
    {
      "epoch": 0.011654785260581575,
      "eval_loss": 1.4709599018096924,
      "eval_runtime": 410.5517,
      "eval_samples_per_second": 1.269,
      "eval_steps_per_second": 1.269,
      "step": 300
    },
    {
      "epoch": 0.01204327810260096,
      "grad_norm": 3.8710834980010986,
      "learning_rate": 0.0001,
      "loss": 1.4183,
      "step": 310
    },
    {
      "epoch": 0.012431770944620345,
      "grad_norm": 1.1690031290054321,
      "learning_rate": 0.0001,
      "loss": 1.6159,
      "step": 320
    },
    {
      "epoch": 0.012820263786639731,
      "grad_norm": 1.422135829925537,
      "learning_rate": 0.0001,
      "loss": 1.5327,
      "step": 330
    },
    {
      "epoch": 0.013208756628659116,
      "grad_norm": 1.353925347328186,
      "learning_rate": 0.0001,
      "loss": 1.4782,
      "step": 340
    },
    {
      "epoch": 0.013597249470678504,
      "grad_norm": 0.8083727359771729,
      "learning_rate": 0.0001,
      "loss": 1.3199,
      "step": 350
    },
    {
      "epoch": 0.013985742312697889,
      "grad_norm": 0.6409865021705627,
      "learning_rate": 0.0001,
      "loss": 1.3829,
      "step": 360
    },
    {
      "epoch": 0.014374235154717275,
      "grad_norm": 1.9057331085205078,
      "learning_rate": 0.0001,
      "loss": 1.3386,
      "step": 370
    },
    {
      "epoch": 0.014568481575726967,
      "eval_loss": 1.4621069431304932,
      "eval_runtime": 410.8963,
      "eval_samples_per_second": 1.268,
      "eval_steps_per_second": 1.268,
      "step": 375
    },
    {
      "epoch": 0.01476272799673666,
      "grad_norm": 1.4260625839233398,
      "learning_rate": 0.0001,
      "loss": 1.4447,
      "step": 380
    },
    {
      "epoch": 0.015151220838756045,
      "grad_norm": 2.0252511501312256,
      "learning_rate": 0.0001,
      "loss": 1.4396,
      "step": 390
    },
    {
      "epoch": 0.015539713680775431,
      "grad_norm": 1.5493030548095703,
      "learning_rate": 0.0001,
      "loss": 1.5377,
      "step": 400
    },
    {
      "epoch": 0.015928206522794818,
      "grad_norm": 1.5620871782302856,
      "learning_rate": 0.0001,
      "loss": 1.5368,
      "step": 410
    },
    {
      "epoch": 0.016316699364814204,
      "grad_norm": 1.8342182636260986,
      "learning_rate": 0.0001,
      "loss": 1.4932,
      "step": 420
    },
    {
      "epoch": 0.01670519220683359,
      "grad_norm": 0.8918685913085938,
      "learning_rate": 0.0001,
      "loss": 1.4847,
      "step": 430
    },
    {
      "epoch": 0.017093685048852975,
      "grad_norm": 1.4548940658569336,
      "learning_rate": 0.0001,
      "loss": 1.5184,
      "step": 440
    },
    {
      "epoch": 0.01748217789087236,
      "grad_norm": 1.4839730262756348,
      "learning_rate": 0.0001,
      "loss": 1.4276,
      "step": 450
    },
    {
      "epoch": 0.01748217789087236,
      "eval_loss": 1.4603033065795898,
      "eval_runtime": 408.9003,
      "eval_samples_per_second": 1.274,
      "eval_steps_per_second": 1.274,
      "step": 450
    },
    {
      "epoch": 0.017870670732891746,
      "grad_norm": 0.6719891428947449,
      "learning_rate": 0.0001,
      "loss": 1.3042,
      "step": 460
    },
    {
      "epoch": 0.01825916357491113,
      "grad_norm": 0.8530905246734619,
      "learning_rate": 0.0001,
      "loss": 1.454,
      "step": 470
    },
    {
      "epoch": 0.018647656416930516,
      "grad_norm": 0.8087925910949707,
      "learning_rate": 0.0001,
      "loss": 1.4225,
      "step": 480
    },
    {
      "epoch": 0.019036149258949905,
      "grad_norm": 2.091627359390259,
      "learning_rate": 0.0001,
      "loss": 1.4617,
      "step": 490
    },
    {
      "epoch": 0.01942464210096929,
      "grad_norm": 2.1747212409973145,
      "learning_rate": 0.0001,
      "loss": 1.5124,
      "step": 500
    },
    {
      "epoch": 0.019813134942988676,
      "grad_norm": 1.7147002220153809,
      "learning_rate": 0.0001,
      "loss": 1.4442,
      "step": 510
    },
    {
      "epoch": 0.020201627785008062,
      "grad_norm": 0.7326516509056091,
      "learning_rate": 0.0001,
      "loss": 1.4376,
      "step": 520
    },
    {
      "epoch": 0.020395874206017753,
      "eval_loss": 1.4605984687805176,
      "eval_runtime": 418.9148,
      "eval_samples_per_second": 1.244,
      "eval_steps_per_second": 1.244,
      "step": 525
    },
    {
      "epoch": 0.020590120627027447,
      "grad_norm": 1.7703779935836792,
      "learning_rate": 0.0001,
      "loss": 1.4911,
      "step": 530
    },
    {
      "epoch": 0.020978613469046833,
      "grad_norm": 0.8552814722061157,
      "learning_rate": 0.0001,
      "loss": 1.4079,
      "step": 540
    },
    {
      "epoch": 0.021367106311066218,
      "grad_norm": 1.0003011226654053,
      "learning_rate": 0.0001,
      "loss": 1.5769,
      "step": 550
    },
    {
      "epoch": 0.021755599153085604,
      "grad_norm": 1.0176352262496948,
      "learning_rate": 0.0001,
      "loss": 1.2431,
      "step": 560
    },
    {
      "epoch": 0.02214409199510499,
      "grad_norm": 1.8341186046600342,
      "learning_rate": 0.0001,
      "loss": 1.5053,
      "step": 570
    },
    {
      "epoch": 0.022532584837124375,
      "grad_norm": 0.7317614555358887,
      "learning_rate": 0.0001,
      "loss": 1.5135,
      "step": 580
    },
    {
      "epoch": 0.02292107767914376,
      "grad_norm": 1.3072996139526367,
      "learning_rate": 0.0001,
      "loss": 1.2935,
      "step": 590
    },
    {
      "epoch": 0.02330957052116315,
      "grad_norm": 0.5384438633918762,
      "learning_rate": 0.0001,
      "loss": 1.4028,
      "step": 600
    },
    {
      "epoch": 0.02330957052116315,
      "eval_loss": 1.457323670387268,
      "eval_runtime": 419.1828,
      "eval_samples_per_second": 1.243,
      "eval_steps_per_second": 1.243,
      "step": 600
    },
    {
      "epoch": 0.023698063363182535,
      "grad_norm": 0.6213059425354004,
      "learning_rate": 0.0001,
      "loss": 1.3461,
      "step": 610
    },
    {
      "epoch": 0.02408655620520192,
      "grad_norm": 0.9022939801216125,
      "learning_rate": 0.0001,
      "loss": 1.458,
      "step": 620
    },
    {
      "epoch": 0.024475049047221305,
      "grad_norm": 1.511841893196106,
      "learning_rate": 0.0001,
      "loss": 1.3387,
      "step": 630
    },
    {
      "epoch": 0.02486354188924069,
      "grad_norm": 1.193332552909851,
      "learning_rate": 0.0001,
      "loss": 1.25,
      "step": 640
    },
    {
      "epoch": 0.025252034731260076,
      "grad_norm": 0.664730429649353,
      "learning_rate": 0.0001,
      "loss": 1.4608,
      "step": 650
    },
    {
      "epoch": 0.025640527573279462,
      "grad_norm": 0.9817675352096558,
      "learning_rate": 0.0001,
      "loss": 1.4694,
      "step": 660
    },
    {
      "epoch": 0.026029020415298847,
      "grad_norm": 0.8713122606277466,
      "learning_rate": 0.0001,
      "loss": 1.6154,
      "step": 670
    },
    {
      "epoch": 0.026223266836308542,
      "eval_loss": 1.4552098512649536,
      "eval_runtime": 418.233,
      "eval_samples_per_second": 1.246,
      "eval_steps_per_second": 1.246,
      "step": 675
    },
    {
      "epoch": 0.026417513257318233,
      "grad_norm": 0.8656709790229797,
      "learning_rate": 0.0001,
      "loss": 1.6028,
      "step": 680
    },
    {
      "epoch": 0.02680600609933762,
      "grad_norm": 0.7827064990997314,
      "learning_rate": 0.0001,
      "loss": 1.454,
      "step": 690
    },
    {
      "epoch": 0.027194498941357007,
      "grad_norm": 0.8780921101570129,
      "learning_rate": 0.0001,
      "loss": 1.377,
      "step": 700
    },
    {
      "epoch": 0.027582991783376393,
      "grad_norm": 0.664682149887085,
      "learning_rate": 0.0001,
      "loss": 1.3761,
      "step": 710
    },
    {
      "epoch": 0.027971484625395778,
      "grad_norm": 1.6883013248443604,
      "learning_rate": 0.0001,
      "loss": 1.293,
      "step": 720
    },
    {
      "epoch": 0.028359977467415164,
      "grad_norm": 0.6659910082817078,
      "learning_rate": 0.0001,
      "loss": 1.3595,
      "step": 730
    },
    {
      "epoch": 0.02874847030943455,
      "grad_norm": 1.0495606660842896,
      "learning_rate": 0.0001,
      "loss": 1.3881,
      "step": 740
    },
    {
      "epoch": 0.029136963151453935,
      "grad_norm": 2.0675432682037354,
      "learning_rate": 0.0001,
      "loss": 1.3353,
      "step": 750
    },
    {
      "epoch": 0.029136963151453935,
      "eval_loss": 1.4496526718139648,
      "eval_runtime": 412.105,
      "eval_samples_per_second": 1.264,
      "eval_steps_per_second": 1.264,
      "step": 750
    },
    {
      "epoch": 0.02952545599347332,
      "grad_norm": 2.147975444793701,
      "learning_rate": 0.0001,
      "loss": 1.4715,
      "step": 760
    },
    {
      "epoch": 0.029913948835492706,
      "grad_norm": 1.4400185346603394,
      "learning_rate": 0.0001,
      "loss": 1.704,
      "step": 770
    },
    {
      "epoch": 0.03030244167751209,
      "grad_norm": 0.5840633511543274,
      "learning_rate": 0.0001,
      "loss": 1.2531,
      "step": 780
    },
    {
      "epoch": 0.030690934519531476,
      "grad_norm": 1.9958975315093994,
      "learning_rate": 0.0001,
      "loss": 1.6409,
      "step": 790
    },
    {
      "epoch": 0.031079427361550862,
      "grad_norm": 0.4322706460952759,
      "learning_rate": 0.0001,
      "loss": 1.3866,
      "step": 800
    },
    {
      "epoch": 0.03146792020357025,
      "grad_norm": 0.9608808755874634,
      "learning_rate": 0.0001,
      "loss": 1.2862,
      "step": 810
    },
    {
      "epoch": 0.031856413045589636,
      "grad_norm": 1.0402257442474365,
      "learning_rate": 0.0001,
      "loss": 1.4305,
      "step": 820
    },
    {
      "epoch": 0.03205065946659933,
      "eval_loss": 1.4473251104354858,
      "eval_runtime": 410.7169,
      "eval_samples_per_second": 1.269,
      "eval_steps_per_second": 1.269,
      "step": 825
    },
    {
      "epoch": 0.03224490588760902,
      "grad_norm": 0.6171532273292542,
      "learning_rate": 0.0001,
      "loss": 1.2107,
      "step": 830
    },
    {
      "epoch": 0.03263339872962841,
      "grad_norm": 1.6381995677947998,
      "learning_rate": 0.0001,
      "loss": 1.5369,
      "step": 840
    },
    {
      "epoch": 0.03302189157164779,
      "grad_norm": 0.5398985743522644,
      "learning_rate": 0.0001,
      "loss": 1.4939,
      "step": 850
    },
    {
      "epoch": 0.03341038441366718,
      "grad_norm": 1.1927576065063477,
      "learning_rate": 0.0001,
      "loss": 1.459,
      "step": 860
    },
    {
      "epoch": 0.03379887725568657,
      "grad_norm": 0.5355756878852844,
      "learning_rate": 0.0001,
      "loss": 1.4591,
      "step": 870
    },
    {
      "epoch": 0.03418737009770595,
      "grad_norm": 1.0324468612670898,
      "learning_rate": 0.0001,
      "loss": 1.4079,
      "step": 880
    },
    {
      "epoch": 0.03457586293972534,
      "grad_norm": 0.9082580804824829,
      "learning_rate": 0.0001,
      "loss": 1.4703,
      "step": 890
    },
    {
      "epoch": 0.03496435578174472,
      "grad_norm": 1.0036635398864746,
      "learning_rate": 0.0001,
      "loss": 1.2948,
      "step": 900
    },
    {
      "epoch": 0.03496435578174472,
      "eval_loss": 1.4607137441635132,
      "eval_runtime": 418.5246,
      "eval_samples_per_second": 1.245,
      "eval_steps_per_second": 1.245,
      "step": 900
    },
    {
      "epoch": 0.03535284862376411,
      "grad_norm": 0.7732622027397156,
      "learning_rate": 0.0001,
      "loss": 1.5017,
      "step": 910
    },
    {
      "epoch": 0.03574134146578349,
      "grad_norm": 0.7425190806388855,
      "learning_rate": 0.0001,
      "loss": 1.4275,
      "step": 920
    },
    {
      "epoch": 0.03612983430780288,
      "grad_norm": 0.6782093644142151,
      "learning_rate": 0.0001,
      "loss": 1.4424,
      "step": 930
    },
    {
      "epoch": 0.03651832714982226,
      "grad_norm": 0.6914064288139343,
      "learning_rate": 0.0001,
      "loss": 1.49,
      "step": 940
    },
    {
      "epoch": 0.03690681999184165,
      "grad_norm": 1.2722946405410767,
      "learning_rate": 0.0001,
      "loss": 1.4737,
      "step": 950
    },
    {
      "epoch": 0.03729531283386103,
      "grad_norm": 0.9967614412307739,
      "learning_rate": 0.0001,
      "loss": 1.3731,
      "step": 960
    },
    {
      "epoch": 0.03768380567588042,
      "grad_norm": 0.5614752173423767,
      "learning_rate": 0.0001,
      "loss": 1.3554,
      "step": 970
    },
    {
      "epoch": 0.037878052096890116,
      "eval_loss": 1.4594156742095947,
      "eval_runtime": 414.1638,
      "eval_samples_per_second": 1.258,
      "eval_steps_per_second": 1.258,
      "step": 975
    },
    {
      "epoch": 0.03807229851789981,
      "grad_norm": 1.496825933456421,
      "learning_rate": 0.0001,
      "loss": 1.3043,
      "step": 980
    },
    {
      "epoch": 0.03846079135991919,
      "grad_norm": 0.5324123501777649,
      "learning_rate": 0.0001,
      "loss": 1.5467,
      "step": 990
    },
    {
      "epoch": 0.03884928420193858,
      "grad_norm": 2.828305959701538,
      "learning_rate": 0.0001,
      "loss": 1.4766,
      "step": 1000
    },
    {
      "epoch": 0.039237777043957964,
      "grad_norm": 1.0788389444351196,
      "learning_rate": 0.0001,
      "loss": 1.6391,
      "step": 1010
    },
    {
      "epoch": 0.03962626988597735,
      "grad_norm": 1.3913893699645996,
      "learning_rate": 0.0001,
      "loss": 1.36,
      "step": 1020
    },
    {
      "epoch": 0.040014762727996735,
      "grad_norm": 1.0683279037475586,
      "learning_rate": 0.0001,
      "loss": 1.3993,
      "step": 1030
    },
    {
      "epoch": 0.040403255570016124,
      "grad_norm": 0.14315283298492432,
      "learning_rate": 0.0001,
      "loss": 1.3594,
      "step": 1040
    },
    {
      "epoch": 0.040791748412035506,
      "grad_norm": 1.996098518371582,
      "learning_rate": 0.0001,
      "loss": 1.3501,
      "step": 1050
    },
    {
      "epoch": 0.040791748412035506,
      "eval_loss": 1.4564799070358276,
      "eval_runtime": 407.4512,
      "eval_samples_per_second": 1.279,
      "eval_steps_per_second": 1.279,
      "step": 1050
    },
    {
      "epoch": 0.041180241254054895,
      "grad_norm": 1.9238131046295166,
      "learning_rate": 1e-05,
      "loss": 1.3409,
      "step": 1060
    },
    {
      "epoch": 0.04156873409607428,
      "grad_norm": 0.561337947845459,
      "learning_rate": 1e-05,
      "loss": 1.3931,
      "step": 1070
    },
    {
      "epoch": 0.041957226938093665,
      "grad_norm": 0.6750600934028625,
      "learning_rate": 1e-05,
      "loss": 1.3055,
      "step": 1080
    },
    {
      "epoch": 0.042345719780113054,
      "grad_norm": 1.6704535484313965,
      "learning_rate": 1e-05,
      "loss": 1.4371,
      "step": 1090
    },
    {
      "epoch": 0.042734212622132436,
      "grad_norm": 0.6073994636535645,
      "learning_rate": 1e-05,
      "loss": 1.5461,
      "step": 1100
    },
    {
      "epoch": 0.043122705464151825,
      "grad_norm": 1.1396293640136719,
      "learning_rate": 1e-05,
      "loss": 1.4749,
      "step": 1110
    },
    {
      "epoch": 0.04351119830617121,
      "grad_norm": 0.6748817563056946,
      "learning_rate": 1e-05,
      "loss": 1.4803,
      "step": 1120
    },
    {
      "epoch": 0.0437054447271809,
      "eval_loss": 1.4066863059997559,
      "eval_runtime": 198.1965,
      "eval_samples_per_second": 2.629,
      "eval_steps_per_second": 2.629,
      "step": 1125
    },
    {
      "epoch": 0.043899691148190596,
      "grad_norm": 0.8130941987037659,
      "learning_rate": 1e-05,
      "loss": 1.4579,
      "step": 1130
    },
    {
      "epoch": 0.04428818399020998,
      "grad_norm": 0.5348241329193115,
      "learning_rate": 1e-05,
      "loss": 1.3988,
      "step": 1140
    },
    {
      "epoch": 0.04467667683222937,
      "grad_norm": 0.6961309313774109,
      "learning_rate": 1e-05,
      "loss": 1.4051,
      "step": 1150
    },
    {
      "epoch": 0.04506516967424875,
      "grad_norm": 0.8562794923782349,
      "learning_rate": 1e-05,
      "loss": 1.3631,
      "step": 1160
    },
    {
      "epoch": 0.04545366251626814,
      "grad_norm": 0.6999790668487549,
      "learning_rate": 1e-05,
      "loss": 1.4532,
      "step": 1170
    },
    {
      "epoch": 0.04584215535828752,
      "grad_norm": 0.5127655267715454,
      "learning_rate": 1e-05,
      "loss": 1.4715,
      "step": 1180
    },
    {
      "epoch": 0.04623064820030691,
      "grad_norm": 1.5171382427215576,
      "learning_rate": 1e-05,
      "loss": 1.2901,
      "step": 1190
    },
    {
      "epoch": 0.0466191410423263,
      "grad_norm": 0.7225420475006104,
      "learning_rate": 1e-05,
      "loss": 1.2778,
      "step": 1200
    },
    {
      "epoch": 0.0466191410423263,
      "eval_loss": 1.4001317024230957,
      "eval_runtime": 199.2632,
      "eval_samples_per_second": 2.615,
      "eval_steps_per_second": 2.615,
      "step": 1200
    },
    {
      "epoch": 0.04700763388434568,
      "grad_norm": 1.5108428001403809,
      "learning_rate": 1e-05,
      "loss": 1.6175,
      "step": 1210
    },
    {
      "epoch": 0.04739612672636507,
      "grad_norm": 1.1392805576324463,
      "learning_rate": 1e-05,
      "loss": 1.3466,
      "step": 1220
    },
    {
      "epoch": 0.04778461956838445,
      "grad_norm": 0.94669109582901,
      "learning_rate": 1e-05,
      "loss": 1.3415,
      "step": 1230
    },
    {
      "epoch": 0.04817311241040384,
      "grad_norm": 0.8593105673789978,
      "learning_rate": 1e-05,
      "loss": 1.3334,
      "step": 1240
    },
    {
      "epoch": 0.04856160525242322,
      "grad_norm": 0.8188263773918152,
      "learning_rate": 1e-05,
      "loss": 1.4602,
      "step": 1250
    },
    {
      "epoch": 0.04895009809444261,
      "grad_norm": 0.6875782608985901,
      "learning_rate": 1e-05,
      "loss": 1.3261,
      "step": 1260
    },
    {
      "epoch": 0.04933859093646199,
      "grad_norm": 1.8237006664276123,
      "learning_rate": 1e-05,
      "loss": 1.5862,
      "step": 1270
    },
    {
      "epoch": 0.04953283735747169,
      "eval_loss": 1.3970199823379517,
      "eval_runtime": 205.8088,
      "eval_samples_per_second": 2.531,
      "eval_steps_per_second": 2.531,
      "step": 1275
    },
    {
      "epoch": 0.04972708377848138,
      "grad_norm": 1.319785237312317,
      "learning_rate": 1e-05,
      "loss": 1.3576,
      "step": 1280
    },
    {
      "epoch": 0.050115576620500764,
      "grad_norm": 1.727789282798767,
      "learning_rate": 1e-05,
      "loss": 1.5409,
      "step": 1290
    },
    {
      "epoch": 0.05050406946252015,
      "grad_norm": 0.9914244413375854,
      "learning_rate": 1e-05,
      "loss": 1.3503,
      "step": 1300
    },
    {
      "epoch": 0.05089256230453954,
      "grad_norm": 1.8328955173492432,
      "learning_rate": 1e-05,
      "loss": 1.5384,
      "step": 1310
    },
    {
      "epoch": 0.051281055146558924,
      "grad_norm": 1.7998759746551514,
      "learning_rate": 1e-05,
      "loss": 1.4807,
      "step": 1320
    },
    {
      "epoch": 0.05166954798857831,
      "grad_norm": 1.53579843044281,
      "learning_rate": 1e-05,
      "loss": 1.4255,
      "step": 1330
    },
    {
      "epoch": 0.052058040830597695,
      "grad_norm": 0.9572857022285461,
      "learning_rate": 1e-05,
      "loss": 1.4547,
      "step": 1340
    },
    {
      "epoch": 0.052446533672617084,
      "grad_norm": 0.6299539804458618,
      "learning_rate": 1e-05,
      "loss": 1.2758,
      "step": 1350
    },
    {
      "epoch": 0.052446533672617084,
      "eval_loss": 1.394976258277893,
      "eval_runtime": 206.0718,
      "eval_samples_per_second": 2.528,
      "eval_steps_per_second": 2.528,
      "step": 1350
    },
    {
      "epoch": 0.052835026514636466,
      "grad_norm": 1.1869505643844604,
      "learning_rate": 1e-05,
      "loss": 1.2709,
      "step": 1360
    },
    {
      "epoch": 0.053223519356655855,
      "grad_norm": 0.5684358477592468,
      "learning_rate": 1e-05,
      "loss": 1.3306,
      "step": 1370
    },
    {
      "epoch": 0.05361201219867524,
      "grad_norm": 0.5880847573280334,
      "learning_rate": 1e-05,
      "loss": 1.3093,
      "step": 1380
    },
    {
      "epoch": 0.054000505040694625,
      "grad_norm": 0.6990231275558472,
      "learning_rate": 1e-05,
      "loss": 1.4534,
      "step": 1390
    },
    {
      "epoch": 0.054388997882714014,
      "grad_norm": 1.0700093507766724,
      "learning_rate": 1e-05,
      "loss": 1.3294,
      "step": 1400
    },
    {
      "epoch": 0.054777490724733396,
      "grad_norm": 1.044433832168579,
      "learning_rate": 1e-05,
      "loss": 1.4177,
      "step": 1410
    },
    {
      "epoch": 0.055165983566752785,
      "grad_norm": 2.6891329288482666,
      "learning_rate": 1e-05,
      "loss": 1.4451,
      "step": 1420
    },
    {
      "epoch": 0.05536022998776247,
      "eval_loss": 1.3934379816055298,
      "eval_runtime": 204.0926,
      "eval_samples_per_second": 2.553,
      "eval_steps_per_second": 2.553,
      "step": 1425
    },
    {
      "epoch": 0.05555447640877217,
      "grad_norm": 0.4769861698150635,
      "learning_rate": 1e-05,
      "loss": 1.3179,
      "step": 1430
    },
    {
      "epoch": 0.055942969250791556,
      "grad_norm": 1.0731093883514404,
      "learning_rate": 1e-05,
      "loss": 1.47,
      "step": 1440
    },
    {
      "epoch": 0.05633146209281094,
      "grad_norm": 1.016760230064392,
      "learning_rate": 1e-05,
      "loss": 1.5151,
      "step": 1450
    },
    {
      "epoch": 0.05671995493483033,
      "grad_norm": 1.5259450674057007,
      "learning_rate": 1e-05,
      "loss": 1.4038,
      "step": 1460
    },
    {
      "epoch": 0.05710844777684971,
      "grad_norm": 0.654501736164093,
      "learning_rate": 1e-05,
      "loss": 1.3135,
      "step": 1470
    },
    {
      "epoch": 0.0574969406188691,
      "grad_norm": 0.6827269196510315,
      "learning_rate": 1e-05,
      "loss": 1.2978,
      "step": 1480
    },
    {
      "epoch": 0.05788543346088848,
      "grad_norm": 0.5111151933670044,
      "learning_rate": 1e-05,
      "loss": 1.4352,
      "step": 1490
    },
    {
      "epoch": 0.05827392630290787,
      "grad_norm": 1.9571446180343628,
      "learning_rate": 1e-05,
      "loss": 1.4764,
      "step": 1500
    },
    {
      "epoch": 0.05827392630290787,
      "eval_loss": 1.3912627696990967,
      "eval_runtime": 204.5081,
      "eval_samples_per_second": 2.548,
      "eval_steps_per_second": 2.548,
      "step": 1500
    },
    {
      "epoch": 0.05866241914492726,
      "grad_norm": 0.8712412714958191,
      "learning_rate": 1e-05,
      "loss": 1.3778,
      "step": 1510
    },
    {
      "epoch": 0.05905091198694664,
      "grad_norm": 0.7130087018013,
      "learning_rate": 1e-05,
      "loss": 1.3266,
      "step": 1520
    },
    {
      "epoch": 0.05943940482896603,
      "grad_norm": 1.6288388967514038,
      "learning_rate": 1e-05,
      "loss": 1.4783,
      "step": 1530
    },
    {
      "epoch": 0.05982789767098541,
      "grad_norm": 2.629760503768921,
      "learning_rate": 1e-05,
      "loss": 1.6038,
      "step": 1540
    },
    {
      "epoch": 0.0602163905130048,
      "grad_norm": 1.0394636392593384,
      "learning_rate": 1e-05,
      "loss": 1.4683,
      "step": 1550
    },
    {
      "epoch": 0.06060488335502418,
      "grad_norm": 1.128451943397522,
      "learning_rate": 1e-05,
      "loss": 1.4578,
      "step": 1560
    },
    {
      "epoch": 0.06099337619704357,
      "grad_norm": 2.473900079727173,
      "learning_rate": 1e-05,
      "loss": 1.4326,
      "step": 1570
    },
    {
      "epoch": 0.061187622618053265,
      "eval_loss": 1.3889408111572266,
      "eval_runtime": 205.5592,
      "eval_samples_per_second": 2.535,
      "eval_steps_per_second": 2.535,
      "step": 1575
    },
    {
      "epoch": 0.06138186903906295,
      "grad_norm": 1.940373182296753,
      "learning_rate": 1e-05,
      "loss": 1.5117,
      "step": 1580
    },
    {
      "epoch": 0.06177036188108234,
      "grad_norm": 0.7575955986976624,
      "learning_rate": 1e-05,
      "loss": 1.3894,
      "step": 1590
    },
    {
      "epoch": 0.062158854723101724,
      "grad_norm": 1.4801169633865356,
      "learning_rate": 1e-05,
      "loss": 1.3132,
      "step": 1600
    },
    {
      "epoch": 0.0625473475651211,
      "grad_norm": 1.291632890701294,
      "learning_rate": 1e-05,
      "loss": 1.3286,
      "step": 1610
    },
    {
      "epoch": 0.0629358404071405,
      "grad_norm": 1.9607435464859009,
      "learning_rate": 1e-05,
      "loss": 1.3005,
      "step": 1620
    },
    {
      "epoch": 0.06332433324915988,
      "grad_norm": 0.8362483382225037,
      "learning_rate": 1e-05,
      "loss": 1.4172,
      "step": 1630
    },
    {
      "epoch": 0.06371282609117927,
      "grad_norm": 1.3649120330810547,
      "learning_rate": 1e-05,
      "loss": 1.6757,
      "step": 1640
    },
    {
      "epoch": 0.06410131893319866,
      "grad_norm": 1.0758274793624878,
      "learning_rate": 1e-05,
      "loss": 1.3867,
      "step": 1650
    },
    {
      "epoch": 0.06410131893319866,
      "eval_loss": 1.3888965845108032,
      "eval_runtime": 205.6882,
      "eval_samples_per_second": 2.533,
      "eval_steps_per_second": 2.533,
      "step": 1650
    },
    {
      "epoch": 0.06448981177521804,
      "grad_norm": 0.8754805326461792,
      "learning_rate": 1e-05,
      "loss": 1.3389,
      "step": 1660
    },
    {
      "epoch": 0.06487830461723743,
      "grad_norm": 0.7831467986106873,
      "learning_rate": 1e-05,
      "loss": 1.4257,
      "step": 1670
    },
    {
      "epoch": 0.06526679745925681,
      "grad_norm": 0.4581933915615082,
      "learning_rate": 1e-05,
      "loss": 1.3556,
      "step": 1680
    },
    {
      "epoch": 0.0656552903012762,
      "grad_norm": 0.9837825894355774,
      "learning_rate": 1e-05,
      "loss": 1.3184,
      "step": 1690
    },
    {
      "epoch": 0.06604378314329558,
      "grad_norm": 1.005288004875183,
      "learning_rate": 1e-05,
      "loss": 1.2944,
      "step": 1700
    },
    {
      "epoch": 0.06643227598531497,
      "grad_norm": 0.9397820234298706,
      "learning_rate": 1e-05,
      "loss": 1.4305,
      "step": 1710
    },
    {
      "epoch": 0.06682076882733436,
      "grad_norm": 2.7833900451660156,
      "learning_rate": 1e-05,
      "loss": 1.3273,
      "step": 1720
    },
    {
      "epoch": 0.06701501524834405,
      "eval_loss": 1.3884316682815552,
      "eval_runtime": 206.1573,
      "eval_samples_per_second": 2.527,
      "eval_steps_per_second": 2.527,
      "step": 1725
    },
    {
      "epoch": 0.06720926166935375,
      "grad_norm": 1.1208202838897705,
      "learning_rate": 1e-05,
      "loss": 1.2229,
      "step": 1730
    },
    {
      "epoch": 0.06759775451137313,
      "grad_norm": 0.5742992758750916,
      "learning_rate": 1e-05,
      "loss": 1.3349,
      "step": 1740
    },
    {
      "epoch": 0.06798624735339251,
      "grad_norm": 0.7946904897689819,
      "learning_rate": 1e-05,
      "loss": 1.3682,
      "step": 1750
    },
    {
      "epoch": 0.0683747401954119,
      "grad_norm": 0.7263549566268921,
      "learning_rate": 1e-05,
      "loss": 1.5025,
      "step": 1760
    },
    {
      "epoch": 0.06876323303743129,
      "grad_norm": 0.8954797387123108,
      "learning_rate": 1e-05,
      "loss": 1.4383,
      "step": 1770
    },
    {
      "epoch": 0.06915172587945068,
      "grad_norm": 0.6124446392059326,
      "learning_rate": 1e-05,
      "loss": 1.3322,
      "step": 1780
    },
    {
      "epoch": 0.06954021872147005,
      "grad_norm": 1.140678882598877,
      "learning_rate": 1e-05,
      "loss": 1.5233,
      "step": 1790
    },
    {
      "epoch": 0.06992871156348944,
      "grad_norm": 7.1586689949035645,
      "learning_rate": 1e-05,
      "loss": 1.3691,
      "step": 1800
    },
    {
      "epoch": 0.06992871156348944,
      "eval_loss": 1.387437105178833,
      "eval_runtime": 204.4312,
      "eval_samples_per_second": 2.549,
      "eval_steps_per_second": 2.549,
      "step": 1800
    },
    {
      "epoch": 0.07031720440550883,
      "grad_norm": 0.634140133857727,
      "learning_rate": 1e-05,
      "loss": 1.3735,
      "step": 1810
    },
    {
      "epoch": 0.07070569724752822,
      "grad_norm": 0.7632227540016174,
      "learning_rate": 1e-05,
      "loss": 1.3542,
      "step": 1820
    },
    {
      "epoch": 0.07109419008954761,
      "grad_norm": 0.7211370468139648,
      "learning_rate": 1e-05,
      "loss": 1.2832,
      "step": 1830
    },
    {
      "epoch": 0.07148268293156698,
      "grad_norm": 0.7608075737953186,
      "learning_rate": 1e-05,
      "loss": 1.5292,
      "step": 1840
    },
    {
      "epoch": 0.07187117577358637,
      "grad_norm": 0.8131744265556335,
      "learning_rate": 1e-05,
      "loss": 1.4005,
      "step": 1850
    },
    {
      "epoch": 0.07225966861560576,
      "grad_norm": 0.6415278911590576,
      "learning_rate": 1e-05,
      "loss": 1.4455,
      "step": 1860
    },
    {
      "epoch": 0.07264816145762515,
      "grad_norm": 2.333056688308716,
      "learning_rate": 1e-05,
      "loss": 1.4367,
      "step": 1870
    },
    {
      "epoch": 0.07284240787863483,
      "eval_loss": 1.3867840766906738,
      "eval_runtime": 205.2163,
      "eval_samples_per_second": 2.539,
      "eval_steps_per_second": 2.539,
      "step": 1875
    }
  ],
  "logging_steps": 10,
  "max_steps": 3600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 75,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.331378819072e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}