{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9846153846153847, "eval_steps": 9, "global_step": 97, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03076923076923077, "grad_norm": 0.19627095758914948, "learning_rate": 1e-05, "loss": 10.3769, "step": 1 }, { "epoch": 0.03076923076923077, "eval_loss": 10.386632919311523, "eval_runtime": 0.0801, "eval_samples_per_second": 1361.483, "eval_steps_per_second": 49.963, "step": 1 }, { "epoch": 0.06153846153846154, "grad_norm": 0.20535489916801453, "learning_rate": 2e-05, "loss": 10.3764, "step": 2 }, { "epoch": 0.09230769230769231, "grad_norm": 0.1900486946105957, "learning_rate": 3e-05, "loss": 10.382, "step": 3 }, { "epoch": 0.12307692307692308, "grad_norm": 0.2189124971628189, "learning_rate": 4e-05, "loss": 10.3854, "step": 4 }, { "epoch": 0.15384615384615385, "grad_norm": 0.19613224267959595, "learning_rate": 5e-05, "loss": 10.3846, "step": 5 }, { "epoch": 0.18461538461538463, "grad_norm": 0.21051953732967377, "learning_rate": 6e-05, "loss": 10.3951, "step": 6 }, { "epoch": 0.2153846153846154, "grad_norm": 0.193317711353302, "learning_rate": 7e-05, "loss": 10.4099, "step": 7 }, { "epoch": 0.24615384615384617, "grad_norm": 0.22925445437431335, "learning_rate": 8e-05, "loss": 10.4001, "step": 8 }, { "epoch": 0.27692307692307694, "grad_norm": 0.2118426263332367, "learning_rate": 9e-05, "loss": 10.4226, "step": 9 }, { "epoch": 0.27692307692307694, "eval_loss": 10.384289741516113, "eval_runtime": 0.075, "eval_samples_per_second": 1452.898, "eval_steps_per_second": 53.317, "step": 9 }, { "epoch": 0.3076923076923077, "grad_norm": 0.22410708665847778, "learning_rate": 0.0001, "loss": 10.3974, "step": 10 }, { "epoch": 0.3384615384615385, "grad_norm": 0.28219085931777954, "learning_rate": 9.996740476948385e-05, "loss": 10.3593, "step": 11 }, { "epoch": 0.36923076923076925, "grad_norm": 0.2661738991737366, "learning_rate": 9.98696615758975e-05, "loss": 10.3826, "step": 12 }, { "epoch": 0.4, "grad_norm": 0.26590314507484436, "learning_rate": 9.970689785771798e-05, "loss": 10.3853, "step": 13 }, { "epoch": 0.4307692307692308, "grad_norm": 0.23882359266281128, "learning_rate": 9.947932582778188e-05, "loss": 10.3944, "step": 14 }, { "epoch": 0.46153846153846156, "grad_norm": 0.2391405999660492, "learning_rate": 9.918724219660013e-05, "loss": 10.357, "step": 15 }, { "epoch": 0.49230769230769234, "grad_norm": 0.2403474599123001, "learning_rate": 9.883102778550434e-05, "loss": 10.3803, "step": 16 }, { "epoch": 0.5230769230769231, "grad_norm": 0.22196514904499054, "learning_rate": 9.841114703012817e-05, "loss": 10.3643, "step": 17 }, { "epoch": 0.5538461538461539, "grad_norm": 0.245796337723732, "learning_rate": 9.792814737487207e-05, "loss": 10.4181, "step": 18 }, { "epoch": 0.5538461538461539, "eval_loss": 10.378185272216797, "eval_runtime": 0.0762, "eval_samples_per_second": 1429.641, "eval_steps_per_second": 52.464, "step": 18 }, { "epoch": 0.5846153846153846, "grad_norm": 0.2934595048427582, "learning_rate": 9.738265855914013e-05, "loss": 10.3524, "step": 19 }, { "epoch": 0.6153846153846154, "grad_norm": 0.29529669880867004, "learning_rate": 9.677539179628005e-05, "loss": 10.3852, "step": 20 }, { "epoch": 0.6461538461538462, "grad_norm": 0.2764834761619568, "learning_rate": 9.610713884629666e-05, "loss": 10.3627, "step": 21 }, { "epoch": 0.676923076923077, "grad_norm": 0.270579993724823, "learning_rate": 9.537877098354786e-05, "loss": 10.3802, "step": 22 }, { "epoch": 0.7076923076923077, "grad_norm": 0.28736695647239685, "learning_rate": 9.459123786076912e-05, "loss": 10.3475, "step": 23 }, { "epoch": 0.7384615384615385, "grad_norm": 0.27069252729415894, "learning_rate": 9.374556627090749e-05, "loss": 10.3726, "step": 24 }, { "epoch": 0.7692307692307693, "grad_norm": 0.2778293192386627, "learning_rate": 9.284285880837946e-05, "loss": 10.3683, "step": 25 }, { "epoch": 0.8, "grad_norm": 0.3016285002231598, "learning_rate": 9.188429243149824e-05, "loss": 10.3652, "step": 26 }, { "epoch": 0.8307692307692308, "grad_norm": 0.3102306127548218, "learning_rate": 9.087111692794459e-05, "loss": 10.3612, "step": 27 }, { "epoch": 0.8307692307692308, "eval_loss": 10.371162414550781, "eval_runtime": 0.0804, "eval_samples_per_second": 1355.392, "eval_steps_per_second": 49.739, "step": 27 }, { "epoch": 0.8615384615384616, "grad_norm": 0.2973518967628479, "learning_rate": 8.980465328528219e-05, "loss": 10.3562, "step": 28 }, { "epoch": 0.8923076923076924, "grad_norm": 0.27339646220207214, "learning_rate": 8.868629196864182e-05, "loss": 10.3745, "step": 29 }, { "epoch": 0.9230769230769231, "grad_norm": 0.30518829822540283, "learning_rate": 8.751749110782012e-05, "loss": 10.3788, "step": 30 }, { "epoch": 0.9538461538461539, "grad_norm": 0.30676740407943726, "learning_rate": 8.629977459615655e-05, "loss": 10.3631, "step": 31 }, { "epoch": 0.9846153846153847, "grad_norm": 0.3426137864589691, "learning_rate": 8.503473010366713e-05, "loss": 10.3782, "step": 32 }, { "epoch": 1.0153846153846153, "grad_norm": 0.43814149498939514, "learning_rate": 8.37240070070257e-05, "loss": 14.7399, "step": 33 }, { "epoch": 1.0461538461538462, "grad_norm": 0.39517074823379517, "learning_rate": 8.236931423909138e-05, "loss": 11.6581, "step": 34 }, { "epoch": 1.0769230769230769, "grad_norm": 0.286582887172699, "learning_rate": 8.097241806078615e-05, "loss": 9.8718, "step": 35 }, { "epoch": 1.1076923076923078, "grad_norm": 0.3339255154132843, "learning_rate": 7.953513975822755e-05, "loss": 9.8388, "step": 36 }, { "epoch": 1.1076923076923078, "eval_loss": 10.362679481506348, "eval_runtime": 0.0735, "eval_samples_per_second": 1482.077, "eval_steps_per_second": 54.388, "step": 36 }, { "epoch": 1.1384615384615384, "grad_norm": 0.38969117403030396, "learning_rate": 7.805935326811912e-05, "loss": 9.6292, "step": 37 }, { "epoch": 1.1692307692307693, "grad_norm": 0.4563569724559784, "learning_rate": 7.654698273449435e-05, "loss": 11.4989, "step": 38 }, { "epoch": 1.2, "grad_norm": 0.5030809044837952, "learning_rate": 7.500000000000001e-05, "loss": 11.8672, "step": 39 }, { "epoch": 1.2307692307692308, "grad_norm": 0.32794782519340515, "learning_rate": 7.342042203498951e-05, "loss": 9.4949, "step": 40 }, { "epoch": 1.2615384615384615, "grad_norm": 0.3771244287490845, "learning_rate": 7.181030830777837e-05, "loss": 8.6339, "step": 41 }, { "epoch": 1.2923076923076924, "grad_norm": 0.37634986639022827, "learning_rate": 7.017175809949044e-05, "loss": 9.5719, "step": 42 }, { "epoch": 1.323076923076923, "grad_norm": 0.5357686877250671, "learning_rate": 6.850690776699573e-05, "loss": 13.4886, "step": 43 }, { "epoch": 1.353846153846154, "grad_norm": 0.41375601291656494, "learning_rate": 6.681792795750875e-05, "loss": 10.2368, "step": 44 }, { "epoch": 1.3846153846153846, "grad_norm": 0.3804188370704651, "learning_rate": 6.510702077847863e-05, "loss": 8.42, "step": 45 }, { "epoch": 1.3846153846153846, "eval_loss": 10.35329532623291, "eval_runtime": 0.0829, "eval_samples_per_second": 1314.084, "eval_steps_per_second": 48.223, "step": 45 }, { "epoch": 1.4153846153846155, "grad_norm": 0.5498846173286438, "learning_rate": 6.337641692646106e-05, "loss": 10.7262, "step": 46 }, { "epoch": 1.4461538461538461, "grad_norm": 0.4699338972568512, "learning_rate": 6.162837277871553e-05, "loss": 10.9246, "step": 47 }, { "epoch": 1.476923076923077, "grad_norm": 0.47309648990631104, "learning_rate": 5.9865167451320005e-05, "loss": 10.4618, "step": 48 }, { "epoch": 1.5076923076923077, "grad_norm": 0.44090601801872253, "learning_rate": 5.808909982763825e-05, "loss": 9.5822, "step": 49 }, { "epoch": 1.5384615384615383, "grad_norm": 0.6003273129463196, "learning_rate": 5.6302485561014475e-05, "loss": 12.5208, "step": 50 }, { "epoch": 1.5692307692307692, "grad_norm": 0.43022680282592773, "learning_rate": 5.4507654055603275e-05, "loss": 9.2575, "step": 51 }, { "epoch": 1.6, "grad_norm": 0.38099291920661926, "learning_rate": 5.270694542927088e-05, "loss": 9.0721, "step": 52 }, { "epoch": 1.6307692307692307, "grad_norm": 0.5643202662467957, "learning_rate": 5.090270746252802e-05, "loss": 11.2072, "step": 53 }, { "epoch": 1.6615384615384614, "grad_norm": 0.4488671123981476, "learning_rate": 4.909729253747197e-05, "loss": 9.4193, "step": 54 }, { "epoch": 1.6615384615384614, "eval_loss": 10.343835830688477, "eval_runtime": 0.071, "eval_samples_per_second": 1535.673, "eval_steps_per_second": 56.355, "step": 54 }, { "epoch": 1.6923076923076923, "grad_norm": 0.5811948776245117, "learning_rate": 4.729305457072913e-05, "loss": 11.7481, "step": 55 }, { "epoch": 1.7230769230769232, "grad_norm": 0.41047021746635437, "learning_rate": 4.549234594439674e-05, "loss": 9.4589, "step": 56 }, { "epoch": 1.7538461538461538, "grad_norm": 0.522146999835968, "learning_rate": 4.3697514438985536e-05, "loss": 11.107, "step": 57 }, { "epoch": 1.7846153846153845, "grad_norm": 0.5175060629844666, "learning_rate": 4.1910900172361764e-05, "loss": 10.9767, "step": 58 }, { "epoch": 1.8153846153846154, "grad_norm": 0.4755529463291168, "learning_rate": 4.0134832548680006e-05, "loss": 9.3121, "step": 59 }, { "epoch": 1.8461538461538463, "grad_norm": 0.4807398021221161, "learning_rate": 3.8371627221284495e-05, "loss": 9.6356, "step": 60 }, { "epoch": 1.876923076923077, "grad_norm": 0.5616713166236877, "learning_rate": 3.6623583073538966e-05, "loss": 10.963, "step": 61 }, { "epoch": 1.9076923076923076, "grad_norm": 0.5250815153121948, "learning_rate": 3.489297922152136e-05, "loss": 10.1303, "step": 62 }, { "epoch": 1.9384615384615385, "grad_norm": 0.6839239001274109, "learning_rate": 3.3182072042491244e-05, "loss": 11.6986, "step": 63 }, { "epoch": 1.9384615384615385, "eval_loss": 10.335771560668945, "eval_runtime": 0.0748, "eval_samples_per_second": 1457.284, "eval_steps_per_second": 53.478, "step": 63 }, { "epoch": 1.9692307692307693, "grad_norm": 0.5417065620422363, "learning_rate": 3.149309223300428e-05, "loss": 9.5666, "step": 64 }, { "epoch": 2.0, "grad_norm": 0.8907036185264587, "learning_rate": 2.982824190050958e-05, "loss": 15.1453, "step": 65 }, { "epoch": 2.0307692307692307, "grad_norm": 0.5486352443695068, "learning_rate": 2.8189691692221627e-05, "loss": 10.3319, "step": 66 }, { "epoch": 2.0615384615384613, "grad_norm": 0.5944435000419617, "learning_rate": 2.65795779650105e-05, "loss": 10.3267, "step": 67 }, { "epoch": 2.0923076923076924, "grad_norm": 0.5956578254699707, "learning_rate": 2.500000000000001e-05, "loss": 10.313, "step": 68 }, { "epoch": 2.123076923076923, "grad_norm": 0.592414379119873, "learning_rate": 2.3453017265505673e-05, "loss": 10.3379, "step": 69 }, { "epoch": 2.1538461538461537, "grad_norm": 0.6250660419464111, "learning_rate": 2.194064673188089e-05, "loss": 10.3427, "step": 70 }, { "epoch": 2.184615384615385, "grad_norm": 0.6292226910591125, "learning_rate": 2.0464860241772455e-05, "loss": 10.3298, "step": 71 }, { "epoch": 2.2153846153846155, "grad_norm": 0.5584103465080261, "learning_rate": 1.902758193921385e-05, "loss": 10.3342, "step": 72 }, { "epoch": 2.2153846153846155, "eval_loss": 10.32968807220459, "eval_runtime": 0.0724, "eval_samples_per_second": 1505.761, "eval_steps_per_second": 55.257, "step": 72 }, { "epoch": 2.246153846153846, "grad_norm": 0.5744684934616089, "learning_rate": 1.7630685760908622e-05, "loss": 10.3275, "step": 73 }, { "epoch": 2.276923076923077, "grad_norm": 0.6510607600212097, "learning_rate": 1.6275992992974308e-05, "loss": 10.3399, "step": 74 }, { "epoch": 2.3076923076923075, "grad_norm": 0.6014554500579834, "learning_rate": 1.4965269896332885e-05, "loss": 10.3441, "step": 75 }, { "epoch": 2.3384615384615386, "grad_norm": 0.6675054430961609, "learning_rate": 1.3700225403843469e-05, "loss": 10.3299, "step": 76 }, { "epoch": 2.3692307692307693, "grad_norm": 0.5850571990013123, "learning_rate": 1.2482508892179884e-05, "loss": 10.3144, "step": 77 }, { "epoch": 2.4, "grad_norm": 0.6561471223831177, "learning_rate": 1.1313708031358183e-05, "loss": 10.3181, "step": 78 }, { "epoch": 2.430769230769231, "grad_norm": 0.6304017305374146, "learning_rate": 1.0195346714717813e-05, "loss": 10.3316, "step": 79 }, { "epoch": 2.4615384615384617, "grad_norm": 0.627305269241333, "learning_rate": 9.12888307205541e-06, "loss": 10.321, "step": 80 }, { "epoch": 2.4923076923076923, "grad_norm": 0.6237972974777222, "learning_rate": 8.115707568501768e-06, "loss": 10.3571, "step": 81 }, { "epoch": 2.4923076923076923, "eval_loss": 10.326565742492676, "eval_runtime": 0.0741, "eval_samples_per_second": 1471.008, "eval_steps_per_second": 53.982, "step": 81 }, { "epoch": 2.523076923076923, "grad_norm": 0.627188503742218, "learning_rate": 7.157141191620548e-06, "loss": 10.3486, "step": 82 }, { "epoch": 2.5538461538461537, "grad_norm": 0.63338303565979, "learning_rate": 6.2544337290925185e-06, "loss": 10.3175, "step": 83 }, { "epoch": 2.5846153846153848, "grad_norm": 0.5604279041290283, "learning_rate": 5.408762139230888e-06, "loss": 10.3171, "step": 84 }, { "epoch": 2.6153846153846154, "grad_norm": 0.6100577712059021, "learning_rate": 4.621229016452156e-06, "loss": 10.3251, "step": 85 }, { "epoch": 2.646153846153846, "grad_norm": 0.6354370713233948, "learning_rate": 3.892861153703342e-06, "loss": 10.3134, "step": 86 }, { "epoch": 2.676923076923077, "grad_norm": 0.6003406643867493, "learning_rate": 3.2246082037199532e-06, "loss": 10.3045, "step": 87 }, { "epoch": 2.707692307692308, "grad_norm": 0.5505728125572205, "learning_rate": 2.6173414408598827e-06, "loss": 10.3232, "step": 88 }, { "epoch": 2.7384615384615385, "grad_norm": 0.6088215708732605, "learning_rate": 2.0718526251279346e-06, "loss": 10.3132, "step": 89 }, { "epoch": 2.769230769230769, "grad_norm": 0.6168414950370789, "learning_rate": 1.5888529698718346e-06, "loss": 10.3199, "step": 90 }, { "epoch": 2.769230769230769, "eval_loss": 10.325533866882324, "eval_runtime": 0.0753, "eval_samples_per_second": 1447.667, "eval_steps_per_second": 53.125, "step": 90 }, { "epoch": 2.8, "grad_norm": 0.6565277576446533, "learning_rate": 1.1689722144956671e-06, "loss": 10.3263, "step": 91 }, { "epoch": 2.830769230769231, "grad_norm": 0.7084165811538696, "learning_rate": 8.127578033998662e-07, "loss": 10.3452, "step": 92 }, { "epoch": 2.8615384615384616, "grad_norm": 0.5756314396858215, "learning_rate": 5.206741722181386e-07, "loss": 10.319, "step": 93 }, { "epoch": 2.8923076923076922, "grad_norm": 0.5791900753974915, "learning_rate": 2.9310214228202013e-07, "loss": 10.308, "step": 94 }, { "epoch": 2.9230769230769234, "grad_norm": 0.6099293828010559, "learning_rate": 1.3033842410251075e-07, "loss": 10.315, "step": 95 }, { "epoch": 2.953846153846154, "grad_norm": 0.5944267511367798, "learning_rate": 3.259523051615254e-08, "loss": 10.3374, "step": 96 }, { "epoch": 2.9846153846153847, "grad_norm": 0.618131160736084, "learning_rate": 0.0, "loss": 10.3129, "step": 97 } ], "logging_steps": 1, "max_steps": 97, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 20293349277696.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }