|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9846153846153847, |
|
"eval_steps": 9, |
|
"global_step": 97, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"grad_norm": 0.19627095758914948, |
|
"learning_rate": 1e-05, |
|
"loss": 10.3769, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"eval_loss": 10.386632919311523, |
|
"eval_runtime": 0.0801, |
|
"eval_samples_per_second": 1361.483, |
|
"eval_steps_per_second": 49.963, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"grad_norm": 0.20535489916801453, |
|
"learning_rate": 2e-05, |
|
"loss": 10.3764, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"grad_norm": 0.1900486946105957, |
|
"learning_rate": 3e-05, |
|
"loss": 10.382, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 0.2189124971628189, |
|
"learning_rate": 4e-05, |
|
"loss": 10.3854, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 0.19613224267959595, |
|
"learning_rate": 5e-05, |
|
"loss": 10.3846, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"grad_norm": 0.21051953732967377, |
|
"learning_rate": 6e-05, |
|
"loss": 10.3951, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"grad_norm": 0.193317711353302, |
|
"learning_rate": 7e-05, |
|
"loss": 10.4099, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 0.22925445437431335, |
|
"learning_rate": 8e-05, |
|
"loss": 10.4001, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"grad_norm": 0.2118426263332367, |
|
"learning_rate": 9e-05, |
|
"loss": 10.4226, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"eval_loss": 10.384289741516113, |
|
"eval_runtime": 0.075, |
|
"eval_samples_per_second": 1452.898, |
|
"eval_steps_per_second": 53.317, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.22410708665847778, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3974, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"grad_norm": 0.28219085931777954, |
|
"learning_rate": 9.996740476948385e-05, |
|
"loss": 10.3593, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 0.2661738991737366, |
|
"learning_rate": 9.98696615758975e-05, |
|
"loss": 10.3826, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.26590314507484436, |
|
"learning_rate": 9.970689785771798e-05, |
|
"loss": 10.3853, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"grad_norm": 0.23882359266281128, |
|
"learning_rate": 9.947932582778188e-05, |
|
"loss": 10.3944, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 0.2391405999660492, |
|
"learning_rate": 9.918724219660013e-05, |
|
"loss": 10.357, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 0.2403474599123001, |
|
"learning_rate": 9.883102778550434e-05, |
|
"loss": 10.3803, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"grad_norm": 0.22196514904499054, |
|
"learning_rate": 9.841114703012817e-05, |
|
"loss": 10.3643, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"grad_norm": 0.245796337723732, |
|
"learning_rate": 9.792814737487207e-05, |
|
"loss": 10.4181, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"eval_loss": 10.378185272216797, |
|
"eval_runtime": 0.0762, |
|
"eval_samples_per_second": 1429.641, |
|
"eval_steps_per_second": 52.464, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"grad_norm": 0.2934595048427582, |
|
"learning_rate": 9.738265855914013e-05, |
|
"loss": 10.3524, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.29529669880867004, |
|
"learning_rate": 9.677539179628005e-05, |
|
"loss": 10.3852, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"grad_norm": 0.2764834761619568, |
|
"learning_rate": 9.610713884629666e-05, |
|
"loss": 10.3627, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"grad_norm": 0.270579993724823, |
|
"learning_rate": 9.537877098354786e-05, |
|
"loss": 10.3802, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"grad_norm": 0.28736695647239685, |
|
"learning_rate": 9.459123786076912e-05, |
|
"loss": 10.3475, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 0.27069252729415894, |
|
"learning_rate": 9.374556627090749e-05, |
|
"loss": 10.3726, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.2778293192386627, |
|
"learning_rate": 9.284285880837946e-05, |
|
"loss": 10.3683, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.3016285002231598, |
|
"learning_rate": 9.188429243149824e-05, |
|
"loss": 10.3652, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"grad_norm": 0.3102306127548218, |
|
"learning_rate": 9.087111692794459e-05, |
|
"loss": 10.3612, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"eval_loss": 10.371162414550781, |
|
"eval_runtime": 0.0804, |
|
"eval_samples_per_second": 1355.392, |
|
"eval_steps_per_second": 49.739, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 0.2973518967628479, |
|
"learning_rate": 8.980465328528219e-05, |
|
"loss": 10.3562, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"grad_norm": 0.27339646220207214, |
|
"learning_rate": 8.868629196864182e-05, |
|
"loss": 10.3745, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.30518829822540283, |
|
"learning_rate": 8.751749110782012e-05, |
|
"loss": 10.3788, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"grad_norm": 0.30676740407943726, |
|
"learning_rate": 8.629977459615655e-05, |
|
"loss": 10.3631, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 0.3426137864589691, |
|
"learning_rate": 8.503473010366713e-05, |
|
"loss": 10.3782, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"grad_norm": 0.43814149498939514, |
|
"learning_rate": 8.37240070070257e-05, |
|
"loss": 14.7399, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"grad_norm": 0.39517074823379517, |
|
"learning_rate": 8.236931423909138e-05, |
|
"loss": 11.6581, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 0.286582887172699, |
|
"learning_rate": 8.097241806078615e-05, |
|
"loss": 9.8718, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 0.3339255154132843, |
|
"learning_rate": 7.953513975822755e-05, |
|
"loss": 9.8388, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"eval_loss": 10.362679481506348, |
|
"eval_runtime": 0.0735, |
|
"eval_samples_per_second": 1482.077, |
|
"eval_steps_per_second": 54.388, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"grad_norm": 0.38969117403030396, |
|
"learning_rate": 7.805935326811912e-05, |
|
"loss": 9.6292, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"grad_norm": 0.4563569724559784, |
|
"learning_rate": 7.654698273449435e-05, |
|
"loss": 11.4989, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.5030809044837952, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 11.8672, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 0.32794782519340515, |
|
"learning_rate": 7.342042203498951e-05, |
|
"loss": 9.4949, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"grad_norm": 0.3771244287490845, |
|
"learning_rate": 7.181030830777837e-05, |
|
"loss": 8.6339, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"grad_norm": 0.37634986639022827, |
|
"learning_rate": 7.017175809949044e-05, |
|
"loss": 9.5719, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"grad_norm": 0.5357686877250671, |
|
"learning_rate": 6.850690776699573e-05, |
|
"loss": 13.4886, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 0.41375601291656494, |
|
"learning_rate": 6.681792795750875e-05, |
|
"loss": 10.2368, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 0.3804188370704651, |
|
"learning_rate": 6.510702077847863e-05, |
|
"loss": 8.42, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 10.35329532623291, |
|
"eval_runtime": 0.0829, |
|
"eval_samples_per_second": 1314.084, |
|
"eval_steps_per_second": 48.223, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"grad_norm": 0.5498846173286438, |
|
"learning_rate": 6.337641692646106e-05, |
|
"loss": 10.7262, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"grad_norm": 0.4699338972568512, |
|
"learning_rate": 6.162837277871553e-05, |
|
"loss": 10.9246, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 0.47309648990631104, |
|
"learning_rate": 5.9865167451320005e-05, |
|
"loss": 10.4618, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"grad_norm": 0.44090601801872253, |
|
"learning_rate": 5.808909982763825e-05, |
|
"loss": 9.5822, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.6003273129463196, |
|
"learning_rate": 5.6302485561014475e-05, |
|
"loss": 12.5208, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.5692307692307692, |
|
"grad_norm": 0.43022680282592773, |
|
"learning_rate": 5.4507654055603275e-05, |
|
"loss": 9.2575, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.38099291920661926, |
|
"learning_rate": 5.270694542927088e-05, |
|
"loss": 9.0721, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.6307692307692307, |
|
"grad_norm": 0.5643202662467957, |
|
"learning_rate": 5.090270746252802e-05, |
|
"loss": 11.2072, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"grad_norm": 0.4488671123981476, |
|
"learning_rate": 4.909729253747197e-05, |
|
"loss": 9.4193, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"eval_loss": 10.343835830688477, |
|
"eval_runtime": 0.071, |
|
"eval_samples_per_second": 1535.673, |
|
"eval_steps_per_second": 56.355, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 0.5811948776245117, |
|
"learning_rate": 4.729305457072913e-05, |
|
"loss": 11.7481, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"grad_norm": 0.41047021746635437, |
|
"learning_rate": 4.549234594439674e-05, |
|
"loss": 9.4589, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.7538461538461538, |
|
"grad_norm": 0.522146999835968, |
|
"learning_rate": 4.3697514438985536e-05, |
|
"loss": 11.107, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.7846153846153845, |
|
"grad_norm": 0.5175060629844666, |
|
"learning_rate": 4.1910900172361764e-05, |
|
"loss": 10.9767, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.8153846153846154, |
|
"grad_norm": 0.4755529463291168, |
|
"learning_rate": 4.0134832548680006e-05, |
|
"loss": 9.3121, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 0.4807398021221161, |
|
"learning_rate": 3.8371627221284495e-05, |
|
"loss": 9.6356, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.876923076923077, |
|
"grad_norm": 0.5616713166236877, |
|
"learning_rate": 3.6623583073538966e-05, |
|
"loss": 10.963, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.9076923076923076, |
|
"grad_norm": 0.5250815153121948, |
|
"learning_rate": 3.489297922152136e-05, |
|
"loss": 10.1303, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"grad_norm": 0.6839239001274109, |
|
"learning_rate": 3.3182072042491244e-05, |
|
"loss": 11.6986, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"eval_loss": 10.335771560668945, |
|
"eval_runtime": 0.0748, |
|
"eval_samples_per_second": 1457.284, |
|
"eval_steps_per_second": 53.478, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"grad_norm": 0.5417065620422363, |
|
"learning_rate": 3.149309223300428e-05, |
|
"loss": 9.5666, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.8907036185264587, |
|
"learning_rate": 2.982824190050958e-05, |
|
"loss": 15.1453, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0307692307692307, |
|
"grad_norm": 0.5486352443695068, |
|
"learning_rate": 2.8189691692221627e-05, |
|
"loss": 10.3319, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.0615384615384613, |
|
"grad_norm": 0.5944435000419617, |
|
"learning_rate": 2.65795779650105e-05, |
|
"loss": 10.3267, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.0923076923076924, |
|
"grad_norm": 0.5956578254699707, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 10.313, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.123076923076923, |
|
"grad_norm": 0.592414379119873, |
|
"learning_rate": 2.3453017265505673e-05, |
|
"loss": 10.3379, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.6250660419464111, |
|
"learning_rate": 2.194064673188089e-05, |
|
"loss": 10.3427, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.184615384615385, |
|
"grad_norm": 0.6292226910591125, |
|
"learning_rate": 2.0464860241772455e-05, |
|
"loss": 10.3298, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"grad_norm": 0.5584103465080261, |
|
"learning_rate": 1.902758193921385e-05, |
|
"loss": 10.3342, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"eval_loss": 10.32968807220459, |
|
"eval_runtime": 0.0724, |
|
"eval_samples_per_second": 1505.761, |
|
"eval_steps_per_second": 55.257, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.246153846153846, |
|
"grad_norm": 0.5744684934616089, |
|
"learning_rate": 1.7630685760908622e-05, |
|
"loss": 10.3275, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.276923076923077, |
|
"grad_norm": 0.6510607600212097, |
|
"learning_rate": 1.6275992992974308e-05, |
|
"loss": 10.3399, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 0.6014554500579834, |
|
"learning_rate": 1.4965269896332885e-05, |
|
"loss": 10.3441, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.3384615384615386, |
|
"grad_norm": 0.6675054430961609, |
|
"learning_rate": 1.3700225403843469e-05, |
|
"loss": 10.3299, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.3692307692307693, |
|
"grad_norm": 0.5850571990013123, |
|
"learning_rate": 1.2482508892179884e-05, |
|
"loss": 10.3144, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.6561471223831177, |
|
"learning_rate": 1.1313708031358183e-05, |
|
"loss": 10.3181, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.430769230769231, |
|
"grad_norm": 0.6304017305374146, |
|
"learning_rate": 1.0195346714717813e-05, |
|
"loss": 10.3316, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.627305269241333, |
|
"learning_rate": 9.12888307205541e-06, |
|
"loss": 10.321, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"grad_norm": 0.6237972974777222, |
|
"learning_rate": 8.115707568501768e-06, |
|
"loss": 10.3571, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"eval_loss": 10.326565742492676, |
|
"eval_runtime": 0.0741, |
|
"eval_samples_per_second": 1471.008, |
|
"eval_steps_per_second": 53.982, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.523076923076923, |
|
"grad_norm": 0.627188503742218, |
|
"learning_rate": 7.157141191620548e-06, |
|
"loss": 10.3486, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.5538461538461537, |
|
"grad_norm": 0.63338303565979, |
|
"learning_rate": 6.2544337290925185e-06, |
|
"loss": 10.3175, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.5846153846153848, |
|
"grad_norm": 0.5604279041290283, |
|
"learning_rate": 5.408762139230888e-06, |
|
"loss": 10.3171, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 0.6100577712059021, |
|
"learning_rate": 4.621229016452156e-06, |
|
"loss": 10.3251, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.646153846153846, |
|
"grad_norm": 0.6354370713233948, |
|
"learning_rate": 3.892861153703342e-06, |
|
"loss": 10.3134, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.676923076923077, |
|
"grad_norm": 0.6003406643867493, |
|
"learning_rate": 3.2246082037199532e-06, |
|
"loss": 10.3045, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.707692307692308, |
|
"grad_norm": 0.5505728125572205, |
|
"learning_rate": 2.6173414408598827e-06, |
|
"loss": 10.3232, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.7384615384615385, |
|
"grad_norm": 0.6088215708732605, |
|
"learning_rate": 2.0718526251279346e-06, |
|
"loss": 10.3132, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 0.6168414950370789, |
|
"learning_rate": 1.5888529698718346e-06, |
|
"loss": 10.3199, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"eval_loss": 10.325533866882324, |
|
"eval_runtime": 0.0753, |
|
"eval_samples_per_second": 1447.667, |
|
"eval_steps_per_second": 53.125, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.6565277576446533, |
|
"learning_rate": 1.1689722144956671e-06, |
|
"loss": 10.3263, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.830769230769231, |
|
"grad_norm": 0.7084165811538696, |
|
"learning_rate": 8.127578033998662e-07, |
|
"loss": 10.3452, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.8615384615384616, |
|
"grad_norm": 0.5756314396858215, |
|
"learning_rate": 5.206741722181386e-07, |
|
"loss": 10.319, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.8923076923076922, |
|
"grad_norm": 0.5791900753974915, |
|
"learning_rate": 2.9310214228202013e-07, |
|
"loss": 10.308, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 0.6099293828010559, |
|
"learning_rate": 1.3033842410251075e-07, |
|
"loss": 10.315, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"grad_norm": 0.5944267511367798, |
|
"learning_rate": 3.259523051615254e-08, |
|
"loss": 10.3374, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 2.9846153846153847, |
|
"grad_norm": 0.618131160736084, |
|
"learning_rate": 0.0, |
|
"loss": 10.3129, |
|
"step": 97 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 97, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 20293349277696.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|