|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7151370679380215,
  "eval_steps": 25,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009535160905840286,
      "grad_norm": 22.616830825805664,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 10.7012,
      "step": 1
    },
    {
      "epoch": 0.009535160905840286,
      "eval_loss": 9.564970016479492,
      "eval_runtime": 54.4321,
      "eval_samples_per_second": 1.635,
      "eval_steps_per_second": 0.827,
      "step": 1
    },
    {
      "epoch": 0.01907032181168057,
      "grad_norm": 20.275375366210938,
      "learning_rate": 6.666666666666667e-05,
      "loss": 9.9498,
      "step": 2
    },
    {
      "epoch": 0.028605482717520857,
      "grad_norm": 23.843503952026367,
      "learning_rate": 0.0001,
      "loss": 9.8482,
      "step": 3
    },
    {
      "epoch": 0.03814064362336114,
      "grad_norm": 17.373727798461914,
      "learning_rate": 9.99524110790929e-05,
      "loss": 6.8627,
      "step": 4
    },
    {
      "epoch": 0.04767580452920143,
      "grad_norm": 16.935632705688477,
      "learning_rate": 9.980973490458728e-05,
      "loss": 5.3765,
      "step": 5
    },
    {
      "epoch": 0.057210965435041714,
      "grad_norm": 25.84366226196289,
      "learning_rate": 9.957224306869053e-05,
      "loss": 6.6007,
      "step": 6
    },
    {
      "epoch": 0.066746126340882,
      "grad_norm": 39.4841423034668,
      "learning_rate": 9.924038765061042e-05,
      "loss": 6.0231,
      "step": 7
    },
    {
      "epoch": 0.07628128724672228,
      "grad_norm": 24.768882751464844,
      "learning_rate": 9.881480035599667e-05,
      "loss": 2.7102,
      "step": 8
    },
    {
      "epoch": 0.08581644815256258,
      "grad_norm": 29.693031311035156,
      "learning_rate": 9.829629131445342e-05,
      "loss": 1.6618,
      "step": 9
    },
    {
      "epoch": 0.09535160905840286,
      "grad_norm": 14.000877380371094,
      "learning_rate": 9.768584753741134e-05,
      "loss": 0.9057,
      "step": 10
    },
    {
      "epoch": 0.10488676996424315,
      "grad_norm": 5.596749305725098,
      "learning_rate": 9.698463103929542e-05,
      "loss": 0.3843,
      "step": 11
    },
    {
      "epoch": 0.11442193087008343,
      "grad_norm": 4.840229511260986,
      "learning_rate": 9.619397662556435e-05,
      "loss": 0.4124,
      "step": 12
    },
    {
      "epoch": 0.12395709177592372,
      "grad_norm": 3.7698850631713867,
      "learning_rate": 9.53153893518325e-05,
      "loss": 0.3085,
      "step": 13
    },
    {
      "epoch": 0.133492252681764,
      "grad_norm": 1.7670843601226807,
      "learning_rate": 9.435054165891109e-05,
      "loss": 0.2888,
      "step": 14
    },
    {
      "epoch": 0.1430274135876043,
      "grad_norm": 0.5820183753967285,
      "learning_rate": 9.330127018922194e-05,
      "loss": 0.2256,
      "step": 15
    },
    {
      "epoch": 0.15256257449344457,
      "grad_norm": 3.2924885749816895,
      "learning_rate": 9.21695722906443e-05,
      "loss": 0.4244,
      "step": 16
    },
    {
      "epoch": 0.16209773539928488,
      "grad_norm": 1.984984278678894,
      "learning_rate": 9.09576022144496e-05,
      "loss": 0.3216,
      "step": 17
    },
    {
      "epoch": 0.17163289630512515,
      "grad_norm": 1.1172698736190796,
      "learning_rate": 8.966766701456177e-05,
      "loss": 0.3188,
      "step": 18
    },
    {
      "epoch": 0.18116805721096543,
      "grad_norm": 1.5274360179901123,
      "learning_rate": 8.83022221559489e-05,
      "loss": 0.3054,
      "step": 19
    },
    {
      "epoch": 0.1907032181168057,
      "grad_norm": 1.9571841955184937,
      "learning_rate": 8.68638668405062e-05,
      "loss": 0.3021,
      "step": 20
    },
    {
      "epoch": 0.20023837902264602,
      "grad_norm": 1.2252756357192993,
      "learning_rate": 8.535533905932738e-05,
      "loss": 0.2382,
      "step": 21
    },
    {
      "epoch": 0.2097735399284863,
      "grad_norm": 1.0066477060317993,
      "learning_rate": 8.377951038078302e-05,
      "loss": 0.3221,
      "step": 22
    },
    {
      "epoch": 0.21930870083432658,
      "grad_norm": 0.6950803995132446,
      "learning_rate": 8.213938048432697e-05,
      "loss": 0.2824,
      "step": 23
    },
    {
      "epoch": 0.22884386174016685,
      "grad_norm": 1.1107197999954224,
      "learning_rate": 8.043807145043604e-05,
      "loss": 0.361,
      "step": 24
    },
    {
      "epoch": 0.23837902264600716,
      "grad_norm": 0.676360547542572,
      "learning_rate": 7.86788218175523e-05,
      "loss": 0.2431,
      "step": 25
    },
    {
      "epoch": 0.23837902264600716,
      "eval_loss": 0.266483336687088,
      "eval_runtime": 54.8592,
      "eval_samples_per_second": 1.622,
      "eval_steps_per_second": 0.82,
      "step": 25
    },
    {
      "epoch": 0.24791418355184744,
      "grad_norm": 0.4809877872467041,
      "learning_rate": 7.68649804173412e-05,
      "loss": 0.2921,
      "step": 26
    },
    {
      "epoch": 0.25744934445768775,
      "grad_norm": 1.3561242818832397,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.1493,
      "step": 27
    },
    {
      "epoch": 0.266984505363528,
      "grad_norm": 1.378612756729126,
      "learning_rate": 7.308743066175172e-05,
      "loss": 0.1205,
      "step": 28
    },
    {
      "epoch": 0.2765196662693683,
      "grad_norm": 1.3110326528549194,
      "learning_rate": 7.113091308703498e-05,
      "loss": 0.09,
      "step": 29
    },
    {
      "epoch": 0.2860548271752086,
      "grad_norm": 0.8067518472671509,
      "learning_rate": 6.91341716182545e-05,
      "loss": 0.035,
      "step": 30
    },
    {
      "epoch": 0.29558998808104886,
      "grad_norm": 30.427331924438477,
      "learning_rate": 6.710100716628344e-05,
      "loss": 1.3727,
      "step": 31
    },
    {
      "epoch": 0.30512514898688914,
      "grad_norm": 30.505556106567383,
      "learning_rate": 6.503528997521366e-05,
      "loss": 2.8594,
      "step": 32
    },
    {
      "epoch": 0.3146603098927294,
      "grad_norm": 26.016315460205078,
      "learning_rate": 6.294095225512603e-05,
      "loss": 2.531,
      "step": 33
    },
    {
      "epoch": 0.32419547079856975,
      "grad_norm": 14.183924674987793,
      "learning_rate": 6.0821980696905146e-05,
      "loss": 1.1554,
      "step": 34
    },
    {
      "epoch": 0.33373063170441003,
      "grad_norm": 1.8867197036743164,
      "learning_rate": 5.868240888334653e-05,
      "loss": 0.2323,
      "step": 35
    },
    {
      "epoch": 0.3432657926102503,
      "grad_norm": 1.2031322717666626,
      "learning_rate": 5.6526309611002594e-05,
      "loss": 0.2414,
      "step": 36
    },
    {
      "epoch": 0.3528009535160906,
      "grad_norm": 1.523146390914917,
      "learning_rate": 5.435778713738292e-05,
      "loss": 0.2678,
      "step": 37
    },
    {
      "epoch": 0.36233611442193087,
      "grad_norm": 1.5239652395248413,
      "learning_rate": 5.218096936826681e-05,
      "loss": 0.2312,
      "step": 38
    },
    {
      "epoch": 0.37187127532777114,
      "grad_norm": 1.0078612565994263,
      "learning_rate": 5e-05,
      "loss": 0.1885,
      "step": 39
    },
    {
      "epoch": 0.3814064362336114,
      "grad_norm": 4.037468433380127,
      "learning_rate": 4.781903063173321e-05,
      "loss": 0.4962,
      "step": 40
    },
    {
      "epoch": 0.3909415971394517,
      "grad_norm": 2.556730031967163,
      "learning_rate": 4.564221286261709e-05,
      "loss": 0.4065,
      "step": 41
    },
    {
      "epoch": 0.40047675804529204,
      "grad_norm": 1.4753957986831665,
      "learning_rate": 4.347369038899744e-05,
      "loss": 0.3817,
      "step": 42
    },
    {
      "epoch": 0.4100119189511323,
      "grad_norm": 1.1868329048156738,
      "learning_rate": 4.131759111665349e-05,
      "loss": 0.2182,
      "step": 43
    },
    {
      "epoch": 0.4195470798569726,
      "grad_norm": 0.9121273159980774,
      "learning_rate": 3.917801930309486e-05,
      "loss": 0.2606,
      "step": 44
    },
    {
      "epoch": 0.42908224076281287,
      "grad_norm": 0.551138162612915,
      "learning_rate": 3.705904774487396e-05,
      "loss": 0.2798,
      "step": 45
    },
    {
      "epoch": 0.43861740166865315,
      "grad_norm": 0.5170517563819885,
      "learning_rate": 3.4964710024786354e-05,
      "loss": 0.2307,
      "step": 46
    },
    {
      "epoch": 0.44815256257449343,
      "grad_norm": 0.3968268036842346,
      "learning_rate": 3.289899283371657e-05,
      "loss": 0.2303,
      "step": 47
    },
    {
      "epoch": 0.4576877234803337,
      "grad_norm": 0.3745101988315582,
      "learning_rate": 3.086582838174551e-05,
      "loss": 0.2629,
      "step": 48
    },
    {
      "epoch": 0.46722288438617404,
      "grad_norm": 0.7609822750091553,
      "learning_rate": 2.886908691296504e-05,
      "loss": 0.2946,
      "step": 49
    },
    {
      "epoch": 0.4767580452920143,
      "grad_norm": 0.4893784523010254,
      "learning_rate": 2.6912569338248315e-05,
      "loss": 0.2266,
      "step": 50
    },
    {
      "epoch": 0.4767580452920143,
      "eval_loss": 0.2717249393463135,
      "eval_runtime": 54.8636,
      "eval_samples_per_second": 1.622,
      "eval_steps_per_second": 0.82,
      "step": 50
    },
    {
      "epoch": 0.4862932061978546,
      "grad_norm": 0.6945714354515076,
      "learning_rate": 2.500000000000001e-05,
      "loss": 0.2567,
      "step": 51
    },
    {
      "epoch": 0.4958283671036949,
      "grad_norm": 0.6655871272087097,
      "learning_rate": 2.3135019582658802e-05,
      "loss": 0.1773,
      "step": 52
    },
    {
      "epoch": 0.5053635280095352,
      "grad_norm": 2.4930078983306885,
      "learning_rate": 2.132117818244771e-05,
      "loss": 0.1157,
      "step": 53
    },
    {
      "epoch": 0.5148986889153755,
      "grad_norm": 1.8733148574829102,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 0.0834,
      "step": 54
    },
    {
      "epoch": 0.5244338498212158,
      "grad_norm": 1.6207678318023682,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 0.072,
      "step": 55
    },
    {
      "epoch": 0.533969010727056,
      "grad_norm": 2.021498441696167,
      "learning_rate": 1.622048961921699e-05,
      "loss": 0.2196,
      "step": 56
    },
    {
      "epoch": 0.5435041716328963,
      "grad_norm": 8.16150188446045,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 0.9258,
      "step": 57
    },
    {
      "epoch": 0.5530393325387366,
      "grad_norm": 15.361802101135254,
      "learning_rate": 1.3136133159493802e-05,
      "loss": 1.1991,
      "step": 58
    },
    {
      "epoch": 0.5625744934445769,
      "grad_norm": 15.315176010131836,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 1.2264,
      "step": 59
    },
    {
      "epoch": 0.5721096543504172,
      "grad_norm": 14.774981498718262,
      "learning_rate": 1.0332332985438248e-05,
      "loss": 1.2034,
      "step": 60
    },
    {
      "epoch": 0.5816448152562574,
      "grad_norm": 4.136609077453613,
      "learning_rate": 9.042397785550405e-06,
      "loss": 0.4474,
      "step": 61
    },
    {
      "epoch": 0.5911799761620977,
      "grad_norm": 0.9078148007392883,
      "learning_rate": 7.830427709355725e-06,
      "loss": 0.2365,
      "step": 62
    },
    {
      "epoch": 0.600715137067938,
      "grad_norm": 1.588295340538025,
      "learning_rate": 6.698729810778065e-06,
      "loss": 0.3393,
      "step": 63
    },
    {
      "epoch": 0.6102502979737783,
      "grad_norm": 0.6780855059623718,
      "learning_rate": 5.649458341088915e-06,
      "loss": 0.2498,
      "step": 64
    },
    {
      "epoch": 0.6197854588796186,
      "grad_norm": 0.553501307964325,
      "learning_rate": 4.684610648167503e-06,
      "loss": 0.2031,
      "step": 65
    },
    {
      "epoch": 0.6293206197854588,
      "grad_norm": 0.3157319724559784,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 0.2239,
      "step": 66
    },
    {
      "epoch": 0.6388557806912991,
      "grad_norm": 0.7763720750808716,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 0.3028,
      "step": 67
    },
    {
      "epoch": 0.6483909415971395,
      "grad_norm": 0.7925229072570801,
      "learning_rate": 2.314152462588659e-06,
      "loss": 0.2561,
      "step": 68
    },
    {
      "epoch": 0.6579261025029798,
      "grad_norm": 0.41560518741607666,
      "learning_rate": 1.70370868554659e-06,
      "loss": 0.1974,
      "step": 69
    },
    {
      "epoch": 0.6674612634088201,
      "grad_norm": 1.2434768676757812,
      "learning_rate": 1.1851996440033319e-06,
      "loss": 0.3121,
      "step": 70
    },
    {
      "epoch": 0.6769964243146603,
      "grad_norm": 0.5525261759757996,
      "learning_rate": 7.596123493895991e-07,
      "loss": 0.201,
      "step": 71
    },
    {
      "epoch": 0.6865315852205006,
      "grad_norm": 1.1765050888061523,
      "learning_rate": 4.277569313094809e-07,
      "loss": 0.3335,
      "step": 72
    },
    {
      "epoch": 0.6960667461263409,
      "grad_norm": 0.3499181568622589,
      "learning_rate": 1.9026509541272275e-07,
      "loss": 0.2423,
      "step": 73
    },
    {
      "epoch": 0.7056019070321812,
      "grad_norm": 0.6464172601699829,
      "learning_rate": 4.7588920907110094e-08,
      "loss": 0.1805,
      "step": 74
    },
    {
      "epoch": 0.7151370679380215,
      "grad_norm": 0.9107186198234558,
      "learning_rate": 0.0,
      "loss": 0.3255,
      "step": 75
    },
    {
      "epoch": 0.7151370679380215,
      "eval_loss": 0.2709729075431824,
      "eval_runtime": 54.2298,
      "eval_samples_per_second": 1.641,
      "eval_steps_per_second": 0.83,
      "step": 75
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.647483930279936e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|