{ "best_metric": 1.3524963855743408, "best_model_checkpoint": "checkpoints/sft_2_1/checkpoint-3285", "epoch": 10.0, "eval_steps": 500, "global_step": 3650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10136986301369863, "grad_norm": 18.275697708129883, "learning_rate": 5.068493150684932e-07, "loss": 2.6032, "step": 37 }, { "epoch": 0.20273972602739726, "grad_norm": 18.235862731933594, "learning_rate": 1.0136986301369864e-06, "loss": 2.4481, "step": 74 }, { "epoch": 0.3041095890410959, "grad_norm": 17.36343002319336, "learning_rate": 1.5205479452054797e-06, "loss": 2.1356, "step": 111 }, { "epoch": 0.4054794520547945, "grad_norm": 4.853509426116943, "learning_rate": 2.027397260273973e-06, "loss": 1.7074, "step": 148 }, { "epoch": 0.5068493150684932, "grad_norm": 3.8640081882476807, "learning_rate": 2.534246575342466e-06, "loss": 1.5478, "step": 185 }, { "epoch": 0.6082191780821918, "grad_norm": 4.1889729499816895, "learning_rate": 3.0410958904109593e-06, "loss": 1.4966, "step": 222 }, { "epoch": 0.7095890410958904, "grad_norm": 3.6940364837646484, "learning_rate": 3.5479452054794523e-06, "loss": 1.4769, "step": 259 }, { "epoch": 0.810958904109589, "grad_norm": 3.665911912918091, "learning_rate": 4.054794520547946e-06, "loss": 1.4546, "step": 296 }, { "epoch": 0.9123287671232877, "grad_norm": 3.749049663543701, "learning_rate": 4.561643835616439e-06, "loss": 1.4522, "step": 333 }, { "epoch": 1.0, "eval_loss": 1.4338933229446411, "eval_runtime": 41.7275, "eval_samples_per_second": 23.965, "eval_steps_per_second": 2.996, "step": 365 }, { "epoch": 1.0136986301369864, "grad_norm": 4.851729393005371, "learning_rate": 4.999971418949206e-06, "loss": 1.4457, "step": 370 }, { "epoch": 1.115068493150685, "grad_norm": 3.409865140914917, "learning_rate": 4.997983588332731e-06, "loss": 1.4208, "step": 407 }, { "epoch": 1.2164383561643834, "grad_norm": 3.7565808296203613, "learning_rate": 4.992868406108372e-06, "loss": 1.4305, "step": 444 }, { "epoch": 1.3178082191780822, "grad_norm": 3.7713687419891357, "learning_rate": 4.9846322762306745e-06, "loss": 1.4209, "step": 481 }, { "epoch": 1.4191780821917808, "grad_norm": 3.3665711879730225, "learning_rate": 4.973285509925916e-06, "loss": 1.4373, "step": 518 }, { "epoch": 1.5205479452054793, "grad_norm": 3.55543851852417, "learning_rate": 4.958842312782962e-06, "loss": 1.418, "step": 555 }, { "epoch": 1.621917808219178, "grad_norm": 3.4688973426818848, "learning_rate": 4.94132076696857e-06, "loss": 1.401, "step": 592 }, { "epoch": 1.7232876712328768, "grad_norm": 4.163483142852783, "learning_rate": 4.920742808589422e-06, "loss": 1.4019, "step": 629 }, { "epoch": 1.8246575342465754, "grad_norm": 3.631598711013794, "learning_rate": 4.897134200229196e-06, "loss": 1.4062, "step": 666 }, { "epoch": 1.926027397260274, "grad_norm": 3.6356711387634277, "learning_rate": 4.870524498695093e-06, "loss": 1.4037, "step": 703 }, { "epoch": 2.0, "eval_loss": 1.3962968587875366, "eval_runtime": 41.7463, "eval_samples_per_second": 23.954, "eval_steps_per_second": 2.994, "step": 730 }, { "epoch": 2.0273972602739727, "grad_norm": 3.9374818801879883, "learning_rate": 4.8409470180141825e-06, "loss": 1.3838, "step": 740 }, { "epoch": 2.128767123287671, "grad_norm": 3.6820218563079834, "learning_rate": 4.808438787725889e-06, "loss": 1.4044, "step": 777 }, { "epoch": 2.23013698630137, "grad_norm": 3.9502220153808594, "learning_rate": 4.773040506522845e-06, "loss": 1.3822, "step": 814 }, { "epoch": 2.3315068493150686, "grad_norm": 3.865752696990967, "learning_rate": 4.734796491298143e-06, "loss": 1.3789, "step": 851 }, { "epoch": 2.432876712328767, "grad_norm": 4.441615581512451, "learning_rate": 4.693754621662789e-06, "loss": 1.3695, "step": 888 }, { "epoch": 2.5342465753424657, "grad_norm": 4.0026021003723145, "learning_rate": 4.649966280002798e-06, "loss": 1.3656, "step": 925 }, { "epoch": 2.6356164383561644, "grad_norm": 4.268634796142578, "learning_rate": 4.6034862871509954e-06, "loss": 1.3726, "step": 962 }, { "epoch": 2.736986301369863, "grad_norm": 3.9229321479797363, "learning_rate": 4.5543728337540524e-06, "loss": 1.3749, "step": 999 }, { "epoch": 2.8383561643835615, "grad_norm": 4.253280162811279, "learning_rate": 4.502687407420681e-06, "loss": 1.374, "step": 1036 }, { "epoch": 2.9397260273972603, "grad_norm": 3.893472671508789, "learning_rate": 4.4484947157421985e-06, "loss": 1.3764, "step": 1073 }, { "epoch": 3.0, "eval_loss": 1.3806755542755127, "eval_runtime": 41.7525, "eval_samples_per_second": 23.951, "eval_steps_per_second": 2.994, "step": 1095 }, { "epoch": 3.041095890410959, "grad_norm": 4.168950080871582, "learning_rate": 4.391862605281827e-06, "loss": 1.3605, "step": 1110 }, { "epoch": 3.1424657534246574, "grad_norm": 4.114070415496826, "learning_rate": 4.332861976634164e-06, "loss": 1.343, "step": 1147 }, { "epoch": 3.243835616438356, "grad_norm": 4.68600606918335, "learning_rate": 4.27156669566115e-06, "loss": 1.3556, "step": 1184 }, { "epoch": 3.345205479452055, "grad_norm": 4.126201152801514, "learning_rate": 4.208053501015674e-06, "loss": 1.369, "step": 1221 }, { "epoch": 3.4465753424657533, "grad_norm": 4.665916919708252, "learning_rate": 4.142401908068583e-06, "loss": 1.3531, "step": 1258 }, { "epoch": 3.547945205479452, "grad_norm": 4.460052013397217, "learning_rate": 4.0746941093593815e-06, "loss": 1.3497, "step": 1295 }, { "epoch": 3.649315068493151, "grad_norm": 4.158130645751953, "learning_rate": 4.005014871695243e-06, "loss": 1.3574, "step": 1332 }, { "epoch": 3.750684931506849, "grad_norm": 4.600008964538574, "learning_rate": 3.933451430027176e-06, "loss": 1.3409, "step": 1369 }, { "epoch": 3.852054794520548, "grad_norm": 4.387424945831299, "learning_rate": 3.8600933782361875e-06, "loss": 1.3493, "step": 1406 }, { "epoch": 3.9534246575342467, "grad_norm": 4.715989112854004, "learning_rate": 3.78503255696618e-06, "loss": 1.3389, "step": 1443 }, { "epoch": 4.0, "eval_loss": 1.3703171014785767, "eval_runtime": 41.7403, "eval_samples_per_second": 23.958, "eval_steps_per_second": 2.995, "step": 1460 }, { "epoch": 4.054794520547945, "grad_norm": 4.664557456970215, "learning_rate": 3.7083629386440304e-06, "loss": 1.352, "step": 1480 }, { "epoch": 4.156164383561644, "grad_norm": 4.535429954528809, "learning_rate": 3.6301805098307614e-06, "loss": 1.3288, "step": 1517 }, { "epoch": 4.257534246575342, "grad_norm": 4.604859828948975, "learning_rate": 3.5505831510511272e-06, "loss": 1.3349, "step": 1554 }, { "epoch": 4.358904109589041, "grad_norm": 4.70920467376709, "learning_rate": 3.4696705142520537e-06, "loss": 1.3318, "step": 1591 }, { "epoch": 4.46027397260274, "grad_norm": 4.910584926605225, "learning_rate": 3.3875438980433367e-06, "loss": 1.3356, "step": 1628 }, { "epoch": 4.561643835616438, "grad_norm": 4.678400039672852, "learning_rate": 3.3043061208768075e-06, "loss": 1.3432, "step": 1665 }, { "epoch": 4.663013698630137, "grad_norm": 4.818428993225098, "learning_rate": 3.2200613923227255e-06, "loss": 1.324, "step": 1702 }, { "epoch": 4.764383561643836, "grad_norm": 5.278090000152588, "learning_rate": 3.134915182604566e-06, "loss": 1.3376, "step": 1739 }, { "epoch": 4.865753424657534, "grad_norm": 5.137758255004883, "learning_rate": 3.0489740905555297e-06, "loss": 1.3108, "step": 1776 }, { "epoch": 4.967123287671233, "grad_norm": 4.931362152099609, "learning_rate": 2.9623457101620844e-06, "loss": 1.3194, "step": 1813 }, { "epoch": 5.0, "eval_loss": 1.3646007776260376, "eval_runtime": 41.7107, "eval_samples_per_second": 23.975, "eval_steps_per_second": 2.997, "step": 1825 }, { "epoch": 5.068493150684931, "grad_norm": 5.321422100067139, "learning_rate": 2.8751384958616318e-06, "loss": 1.3269, "step": 1850 }, { "epoch": 5.16986301369863, "grad_norm": 5.509706497192383, "learning_rate": 2.787461626762929e-06, "loss": 1.2972, "step": 1887 }, { "epoch": 5.271232876712329, "grad_norm": 5.337893486022949, "learning_rate": 2.6994248699592545e-06, "loss": 1.3233, "step": 1924 }, { "epoch": 5.372602739726028, "grad_norm": 5.465749740600586, "learning_rate": 2.611138443105452e-06, "loss": 1.2997, "step": 1961 }, { "epoch": 5.473972602739726, "grad_norm": 5.099954128265381, "learning_rate": 2.5227128764308887e-06, "loss": 1.3164, "step": 1998 }, { "epoch": 5.575342465753424, "grad_norm": 5.322652339935303, "learning_rate": 2.4342588743610904e-06, "loss": 1.3221, "step": 2035 }, { "epoch": 5.676712328767123, "grad_norm": 5.532634735107422, "learning_rate": 2.345887176921286e-06, "loss": 1.3158, "step": 2072 }, { "epoch": 5.778082191780822, "grad_norm": 5.287755489349365, "learning_rate": 2.257708421095391e-06, "loss": 1.3107, "step": 2109 }, { "epoch": 5.879452054794521, "grad_norm": 5.4603729248046875, "learning_rate": 2.1698330023139837e-06, "loss": 1.3077, "step": 2146 }, { "epoch": 5.980821917808219, "grad_norm": 5.346288204193115, "learning_rate": 2.0823709362447025e-06, "loss": 1.2979, "step": 2183 }, { "epoch": 6.0, "eval_loss": 1.3607642650604248, "eval_runtime": 41.7364, "eval_samples_per_second": 23.96, "eval_steps_per_second": 2.995, "step": 2190 }, { "epoch": 6.082191780821918, "grad_norm": 5.522951126098633, "learning_rate": 1.995431721058082e-06, "loss": 1.2927, "step": 2220 }, { "epoch": 6.183561643835616, "grad_norm": 5.554995536804199, "learning_rate": 1.909124200341277e-06, "loss": 1.3058, "step": 2257 }, { "epoch": 6.284931506849315, "grad_norm": 6.056581020355225, "learning_rate": 1.82355642683128e-06, "loss": 1.2936, "step": 2294 }, { "epoch": 6.3863013698630136, "grad_norm": 5.767539978027344, "learning_rate": 1.7388355271382565e-06, "loss": 1.2925, "step": 2331 }, { "epoch": 6.487671232876712, "grad_norm": 5.96279239654541, "learning_rate": 1.6550675676283428e-06, "loss": 1.2941, "step": 2368 }, { "epoch": 6.589041095890411, "grad_norm": 6.0941667556762695, "learning_rate": 1.5723574216338066e-06, "loss": 1.3048, "step": 2405 }, { "epoch": 6.69041095890411, "grad_norm": 5.945704936981201, "learning_rate": 1.4908086381568398e-06, "loss": 1.3017, "step": 2442 }, { "epoch": 6.791780821917808, "grad_norm": 5.990777969360352, "learning_rate": 1.4105233122313416e-06, "loss": 1.31, "step": 2479 }, { "epoch": 6.8931506849315065, "grad_norm": 6.513980388641357, "learning_rate": 1.331601957104995e-06, "loss": 1.2874, "step": 2516 }, { "epoch": 6.994520547945205, "grad_norm": 5.826746940612793, "learning_rate": 1.2541433784016639e-06, "loss": 1.2894, "step": 2553 }, { "epoch": 7.0, "eval_loss": 1.3544963598251343, "eval_runtime": 41.7343, "eval_samples_per_second": 23.961, "eval_steps_per_second": 2.995, "step": 2555 }, { "epoch": 7.095890410958904, "grad_norm": 6.61908483505249, "learning_rate": 1.1782445504216552e-06, "loss": 1.286, "step": 2590 }, { "epoch": 7.197260273972603, "grad_norm": 6.173586845397949, "learning_rate": 1.1040004947346974e-06, "loss": 1.2684, "step": 2627 }, { "epoch": 7.298630136986302, "grad_norm": 6.668844223022461, "learning_rate": 1.0315041612176476e-06, "loss": 1.2822, "step": 2664 }, { "epoch": 7.4, "grad_norm": 5.676510334014893, "learning_rate": 9.608463116858544e-07, "loss": 1.3006, "step": 2701 }, { "epoch": 7.501369863013698, "grad_norm": 6.395183086395264, "learning_rate": 8.921154062638679e-07, "loss": 1.2818, "step": 2738 }, { "epoch": 7.602739726027397, "grad_norm": 6.158621311187744, "learning_rate": 8.253974926377434e-07, "loss": 1.2828, "step": 2775 }, { "epoch": 7.704109589041096, "grad_norm": 5.791866779327393, "learning_rate": 7.607760983276078e-07, "loss": 1.2851, "step": 2812 }, { "epoch": 7.8054794520547945, "grad_norm": 5.9625043869018555, "learning_rate": 6.983321261153478e-07, "loss": 1.2803, "step": 2849 }, { "epoch": 7.906849315068493, "grad_norm": 6.199430465698242, "learning_rate": 6.381437527583323e-07, "loss": 1.2985, "step": 2886 }, { "epoch": 8.0, "eval_loss": 1.3527910709381104, "eval_runtime": 41.7207, "eval_samples_per_second": 23.969, "eval_steps_per_second": 2.996, "step": 2920 }, { "epoch": 8.008219178082191, "grad_norm": 6.340386867523193, "learning_rate": 5.802863311159945e-07, "loss": 1.2803, "step": 2923 }, { "epoch": 8.10958904109589, "grad_norm": 5.827063083648682, "learning_rate": 5.248322958117815e-07, "loss": 1.2836, "step": 2960 }, { "epoch": 8.210958904109589, "grad_norm": 7.277185440063477, "learning_rate": 4.718510725486025e-07, "loss": 1.2826, "step": 2997 }, { "epoch": 8.312328767123288, "grad_norm": 6.630384922027588, "learning_rate": 4.21408991191285e-07, "loss": 1.2821, "step": 3034 }, { "epoch": 8.413698630136986, "grad_norm": 6.425660133361816, "learning_rate": 3.7356920272487914e-07, "loss": 1.2876, "step": 3071 }, { "epoch": 8.515068493150684, "grad_norm": 5.970567226409912, "learning_rate": 3.2839160019274867e-07, "loss": 1.2645, "step": 3108 }, { "epoch": 8.616438356164384, "grad_norm": 5.957708835601807, "learning_rate": 2.8593274371345513e-07, "loss": 1.2742, "step": 3145 }, { "epoch": 8.717808219178082, "grad_norm": 6.2841267585754395, "learning_rate": 2.4624578967028994e-07, "loss": 1.2732, "step": 3182 }, { "epoch": 8.819178082191781, "grad_norm": 6.1621174812316895, "learning_rate": 2.0938042416212167e-07, "loss": 1.2901, "step": 3219 }, { "epoch": 8.92054794520548, "grad_norm": 6.049917221069336, "learning_rate": 1.753828007988595e-07, "loss": 1.275, "step": 3256 }, { "epoch": 9.0, "eval_loss": 1.3524963855743408, "eval_runtime": 41.723, "eval_samples_per_second": 23.968, "eval_steps_per_second": 2.996, "step": 3285 }, { "epoch": 9.021917808219179, "grad_norm": 6.177803993225098, "learning_rate": 1.442954829194243e-07, "loss": 1.2851, "step": 3293 }, { "epoch": 9.123287671232877, "grad_norm": 5.982429504394531, "learning_rate": 1.1615739030455848e-07, "loss": 1.2827, "step": 3330 }, { "epoch": 9.224657534246575, "grad_norm": 6.101175785064697, "learning_rate": 9.100375045118931e-08, "loss": 1.2742, "step": 3367 }, { "epoch": 9.326027397260274, "grad_norm": 6.282660961151123, "learning_rate": 6.886605446934841e-08, "loss": 1.2763, "step": 3404 }, { "epoch": 9.427397260273972, "grad_norm": 6.399016857147217, "learning_rate": 4.9772017656865844e-08, "loss": 1.2672, "step": 3441 }, { "epoch": 9.528767123287672, "grad_norm": 6.448099613189697, "learning_rate": 3.374554480118919e-08, "loss": 1.267, "step": 3478 }, { "epoch": 9.63013698630137, "grad_norm": 6.512104034423828, "learning_rate": 2.0806700251775057e-08, "loss": 1.2979, "step": 3515 }, { "epoch": 9.731506849315068, "grad_norm": 6.009030342102051, "learning_rate": 1.097168280051908e-08, "loss": 1.2663, "step": 3552 }, { "epoch": 9.832876712328767, "grad_norm": 6.056164264678955, "learning_rate": 4.252805401671056e-09, "loss": 1.2783, "step": 3589 }, { "epoch": 9.934246575342465, "grad_norm": 6.128045558929443, "learning_rate": 6.584797566264179e-10, "loss": 1.2748, "step": 3626 }, { "epoch": 10.0, "eval_loss": 1.352522611618042, "eval_runtime": 41.7261, "eval_samples_per_second": 23.966, "eval_steps_per_second": 2.996, "step": 3650 } ], "logging_steps": 37, "max_steps": 3650, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.806373112295014e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }