{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0035087719298246,
  "eval_steps": 36,
  "global_step": 428,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007017543859649123,
      "eval_loss": 0.6692813038825989,
      "eval_runtime": 46.9815,
      "eval_samples_per_second": 5.108,
      "eval_steps_per_second": 0.639,
      "step": 1
    },
    {
      "epoch": 0.021052631578947368,
      "grad_norm": 9.726601600646973,
      "learning_rate": 1.5e-05,
      "loss": 2.4103,
      "step": 3
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 6.305670738220215,
      "learning_rate": 3e-05,
      "loss": 2.7525,
      "step": 6
    },
    {
      "epoch": 0.06315789473684211,
      "grad_norm": 4.551560401916504,
      "learning_rate": 4.5e-05,
      "loss": 2.5999,
      "step": 9
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 4.885453701019287,
      "learning_rate": 4.999717571181742e-05,
      "loss": 2.2257,
      "step": 12
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 4.619503498077393,
      "learning_rate": 4.998234994371135e-05,
      "loss": 2.0927,
      "step": 15
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 4.877598285675049,
      "learning_rate": 4.995482415049123e-05,
      "loss": 2.3476,
      "step": 18
    },
    {
      "epoch": 0.14736842105263157,
      "grad_norm": 6.852722644805908,
      "learning_rate": 4.991461232516675e-05,
      "loss": 2.028,
      "step": 21
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 6.002420902252197,
      "learning_rate": 4.986173490981773e-05,
      "loss": 1.6801,
      "step": 24
    },
    {
      "epoch": 0.18947368421052632,
      "grad_norm": 5.471989631652832,
      "learning_rate": 4.979621878520216e-05,
      "loss": 1.4341,
      "step": 27
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 5.2608489990234375,
      "learning_rate": 4.971809725709112e-05,
      "loss": 1.8805,
      "step": 30
    },
    {
      "epoch": 0.23157894736842105,
      "grad_norm": 3.3612749576568604,
      "learning_rate": 4.962741003933742e-05,
      "loss": 1.6929,
      "step": 33
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 4.646411895751953,
      "learning_rate": 4.952420323368673e-05,
      "loss": 1.5576,
      "step": 36
    },
    {
      "epoch": 0.25263157894736843,
      "eval_loss": 0.37584158778190613,
      "eval_runtime": 47.717,
      "eval_samples_per_second": 5.03,
      "eval_steps_per_second": 0.629,
      "step": 36
    },
    {
      "epoch": 0.2736842105263158,
      "grad_norm": 4.316969394683838,
      "learning_rate": 4.9408529306341255e-05,
      "loss": 1.8731,
      "step": 39
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 4.682113170623779,
      "learning_rate": 4.928044706128803e-05,
      "loss": 1.8301,
      "step": 42
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 3.629147529602051,
      "learning_rate": 4.9140021610405326e-05,
      "loss": 1.2944,
      "step": 45
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 7.077390193939209,
      "learning_rate": 4.898732434036244e-05,
      "loss": 2.0447,
      "step": 48
    },
    {
      "epoch": 0.35789473684210527,
      "grad_norm": 4.48635196685791,
      "learning_rate": 4.882243287632947e-05,
      "loss": 1.4274,
      "step": 51
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 4.4893388748168945,
      "learning_rate": 4.864543104251587e-05,
      "loss": 1.7248,
      "step": 54
    },
    {
      "epoch": 0.4,
      "grad_norm": 5.431076526641846,
      "learning_rate": 4.8456408819557564e-05,
      "loss": 1.6822,
      "step": 57
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 3.690011501312256,
      "learning_rate": 4.825546229877439e-05,
      "loss": 1.7077,
      "step": 60
    },
    {
      "epoch": 0.4421052631578947,
      "grad_norm": 5.592578411102295,
      "learning_rate": 4.804269363332112e-05,
      "loss": 1.836,
      "step": 63
    },
    {
      "epoch": 0.4631578947368421,
      "grad_norm": 5.439390182495117,
      "learning_rate": 4.78182109862569e-05,
      "loss": 1.2283,
      "step": 66
    },
    {
      "epoch": 0.4842105263157895,
      "grad_norm": 4.344250202178955,
      "learning_rate": 4.758212847555953e-05,
      "loss": 1.6078,
      "step": 69
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 5.0562825202941895,
      "learning_rate": 4.733456611611233e-05,
      "loss": 1.858,
      "step": 72
    },
    {
      "epoch": 0.5052631578947369,
      "eval_loss": 0.3427739441394806,
      "eval_runtime": 47.7477,
      "eval_samples_per_second": 5.026,
      "eval_steps_per_second": 0.628,
      "step": 72
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 4.3865838050842285,
      "learning_rate": 4.7075649758693565e-05,
      "loss": 1.2519,
      "step": 75
    },
    {
      "epoch": 0.5473684210526316,
      "grad_norm": 2.8480889797210693,
      "learning_rate": 4.68055110259988e-05,
      "loss": 1.6193,
      "step": 78
    },
    {
      "epoch": 0.5684210526315789,
      "grad_norm": 3.4718546867370605,
      "learning_rate": 4.6524287245729295e-05,
      "loss": 1.4091,
      "step": 81
    },
    {
      "epoch": 0.5894736842105263,
      "grad_norm": 4.449820041656494,
      "learning_rate": 4.6232121380780034e-05,
      "loss": 1.484,
      "step": 84
    },
    {
      "epoch": 0.6105263157894737,
      "grad_norm": 3.7628002166748047,
      "learning_rate": 4.592916195656322e-05,
      "loss": 1.3605,
      "step": 87
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 3.6203603744506836,
      "learning_rate": 4.561556298550379e-05,
      "loss": 1.4026,
      "step": 90
    },
    {
      "epoch": 0.6526315789473685,
      "grad_norm": 3.6984612941741943,
      "learning_rate": 4.529148388874577e-05,
      "loss": 1.1724,
      "step": 93
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 3.412766933441162,
      "learning_rate": 4.49570894151089e-05,
      "loss": 1.5515,
      "step": 96
    },
    {
      "epoch": 0.6947368421052632,
      "grad_norm": 2.684919595718384,
      "learning_rate": 4.4612549557336974e-05,
      "loss": 1.2596,
      "step": 99
    },
    {
      "epoch": 0.7157894736842105,
      "grad_norm": 4.008241176605225,
      "learning_rate": 4.4258039465680326e-05,
      "loss": 1.1391,
      "step": 102
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 4.187386989593506,
      "learning_rate": 4.389373935885646e-05,
      "loss": 1.1588,
      "step": 105
    },
    {
      "epoch": 0.7578947368421053,
      "grad_norm": 4.869933605194092,
      "learning_rate": 4.351983443243409e-05,
      "loss": 1.5655,
      "step": 108
    },
    {
      "epoch": 0.7578947368421053,
      "eval_loss": 0.322973370552063,
      "eval_runtime": 47.6997,
      "eval_samples_per_second": 5.031,
      "eval_steps_per_second": 0.629,
      "step": 108
    },
    {
      "epoch": 0.7789473684210526,
      "grad_norm": 3.7822816371917725,
      "learning_rate": 4.313651476468715e-05,
      "loss": 1.5809,
      "step": 111
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.936788320541382,
      "learning_rate": 4.274397521996658e-05,
      "loss": 1.0463,
      "step": 114
    },
    {
      "epoch": 0.8210526315789474,
      "grad_norm": 4.922979831695557,
      "learning_rate": 4.234241534963916e-05,
      "loss": 1.2287,
      "step": 117
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 5.986371040344238,
      "learning_rate": 4.193203929064353e-05,
      "loss": 1.3477,
      "step": 120
    },
    {
      "epoch": 0.8631578947368421,
      "grad_norm": 3.4700145721435547,
      "learning_rate": 4.1513055661715214e-05,
      "loss": 0.9548,
      "step": 123
    },
    {
      "epoch": 0.8842105263157894,
      "grad_norm": 4.394268035888672,
      "learning_rate": 4.108567745733318e-05,
      "loss": 1.2286,
      "step": 126
    },
    {
      "epoch": 0.9052631578947369,
      "grad_norm": 4.035145282745361,
      "learning_rate": 4.065012193944201e-05,
      "loss": 1.1731,
      "step": 129
    },
    {
      "epoch": 0.9263157894736842,
      "grad_norm": 3.933317184448242,
      "learning_rate": 4.020661052700461e-05,
      "loss": 1.6722,
      "step": 132
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 3.2603344917297363,
      "learning_rate": 3.9755368683441735e-05,
      "loss": 1.3816,
      "step": 135
    },
    {
      "epoch": 0.968421052631579,
      "grad_norm": 6.198463439941406,
      "learning_rate": 3.9296625802015356e-05,
      "loss": 1.2843,
      "step": 138
    },
    {
      "epoch": 0.9894736842105263,
      "grad_norm": 4.392797470092773,
      "learning_rate": 3.883061508921439e-05,
      "loss": 1.5944,
      "step": 141
    },
    {
      "epoch": 1.0105263157894737,
      "grad_norm": 3.6408369541168213,
      "learning_rate": 3.8357573446201825e-05,
      "loss": 1.1528,
      "step": 144
    },
    {
      "epoch": 1.0105263157894737,
      "eval_loss": 0.31307944655418396,
      "eval_runtime": 47.7481,
      "eval_samples_per_second": 5.026,
      "eval_steps_per_second": 0.628,
      "step": 144
    },
    {
      "epoch": 1.0315789473684212,
      "grad_norm": 3.727839946746826,
      "learning_rate": 3.78777413483837e-05,
      "loss": 1.3407,
      "step": 147
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 4.318253517150879,
      "learning_rate": 3.739136272316102e-05,
      "loss": 1.274,
      "step": 150
    },
    {
      "epoch": 1.0736842105263158,
      "grad_norm": 3.0407471656799316,
      "learning_rate": 3.689868482592684e-05,
      "loss": 1.0978,
      "step": 153
    },
    {
      "epoch": 1.0947368421052632,
      "grad_norm": 3.2110660076141357,
      "learning_rate": 3.6399958114371595e-05,
      "loss": 0.9378,
      "step": 156
    },
    {
      "epoch": 1.1157894736842104,
      "grad_norm": 4.471799373626709,
      "learning_rate": 3.5895436121160386e-05,
      "loss": 1.334,
      "step": 159
    },
    {
      "epoch": 1.1368421052631579,
      "grad_norm": 2.7536613941192627,
      "learning_rate": 3.5385375325047166e-05,
      "loss": 1.5206,
      "step": 162
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 3.1631388664245605,
      "learning_rate": 3.487003502049122e-05,
      "loss": 0.9874,
      "step": 165
    },
    {
      "epoch": 1.1789473684210527,
      "grad_norm": 3.0744566917419434,
      "learning_rate": 3.4349677185842245e-05,
      "loss": 1.2542,
      "step": 168
    },
    {
      "epoch": 1.2,
      "grad_norm": 3.199769973754883,
      "learning_rate": 3.38245663501611e-05,
      "loss": 1.0781,
      "step": 171
    },
    {
      "epoch": 1.2210526315789474,
      "grad_norm": 3.3641140460968018,
      "learning_rate": 3.32949694587438e-05,
      "loss": 1.0915,
      "step": 174
    },
    {
      "epoch": 1.2421052631578948,
      "grad_norm": 2.533961057662964,
      "learning_rate": 3.276115573741724e-05,
      "loss": 1.2862,
      "step": 177
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 4.081838130950928,
      "learning_rate": 3.222339655567556e-05,
      "loss": 1.2205,
      "step": 180
    },
    {
      "epoch": 1.263157894736842,
      "eval_loss": 0.3107610046863556,
      "eval_runtime": 47.7384,
      "eval_samples_per_second": 5.027,
      "eval_steps_per_second": 0.628,
      "step": 180
    },
    {
      "epoch": 1.2842105263157895,
      "grad_norm": 2.3932526111602783,
      "learning_rate": 3.168196528872682e-05,
      "loss": 1.0431,
      "step": 183
    },
    {
      "epoch": 1.305263157894737,
      "grad_norm": 2.7691686153411865,
      "learning_rate": 3.1137137178519985e-05,
      "loss": 1.314,
      "step": 186
    },
    {
      "epoch": 1.3263157894736843,
      "grad_norm": 3.8344638347625732,
      "learning_rate": 3.0589189193822895e-05,
      "loss": 0.8119,
      "step": 189
    },
    {
      "epoch": 1.3473684210526315,
      "grad_norm": 4.127139568328857,
      "learning_rate": 3.0038399889422553e-05,
      "loss": 1.1671,
      "step": 192
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 3.597393035888672,
      "learning_rate": 2.948504926451896e-05,
      "loss": 1.4459,
      "step": 195
    },
    {
      "epoch": 1.3894736842105262,
      "grad_norm": 3.0417675971984863,
      "learning_rate": 2.8929418620384753e-05,
      "loss": 1.0606,
      "step": 198
    },
    {
      "epoch": 1.4105263157894736,
      "grad_norm": 4.269920825958252,
      "learning_rate": 2.8371790417362987e-05,
      "loss": 0.8091,
      "step": 201
    },
    {
      "epoch": 1.431578947368421,
      "grad_norm": 4.4791789054870605,
      "learning_rate": 2.781244813127552e-05,
      "loss": 1.4956,
      "step": 204
    },
    {
      "epoch": 1.4526315789473685,
      "grad_norm": 4.570736885070801,
      "learning_rate": 2.7251676109315338e-05,
      "loss": 0.791,
      "step": 207
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 4.790010929107666,
      "learning_rate": 2.668975942549583e-05,
      "loss": 1.2485,
      "step": 210
    },
    {
      "epoch": 1.4947368421052631,
      "grad_norm": 3.679155111312866,
      "learning_rate": 2.612698373573056e-05,
      "loss": 0.9622,
      "step": 213
    },
    {
      "epoch": 1.5157894736842106,
      "grad_norm": 3.991124153137207,
      "learning_rate": 2.5563635132617302e-05,
      "loss": 0.7821,
      "step": 216
    },
    {
      "epoch": 1.5157894736842106,
      "eval_loss": 0.30537185072898865,
      "eval_runtime": 47.7614,
      "eval_samples_per_second": 5.025,
      "eval_steps_per_second": 0.628,
      "step": 216
    },
    {
      "epoch": 1.5368421052631578,
      "grad_norm": 5.918197154998779,
      "learning_rate": 2.5e-05,
      "loss": 0.7552,
      "step": 219
    },
    {
      "epoch": 1.5578947368421052,
      "grad_norm": 6.4377241134643555,
      "learning_rate": 2.44363648673827e-05,
      "loss": 1.13,
      "step": 222
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 3.93595814704895,
      "learning_rate": 2.387301626426944e-05,
      "loss": 0.9218,
      "step": 225
    },
    {
      "epoch": 1.6,
      "grad_norm": 5.706233978271484,
      "learning_rate": 2.3310240574504185e-05,
      "loss": 1.1022,
      "step": 228
    },
    {
      "epoch": 1.6210526315789475,
      "grad_norm": 2.740601062774658,
      "learning_rate": 2.2748323890684665e-05,
      "loss": 1.2584,
      "step": 231
    },
    {
      "epoch": 1.6421052631578947,
      "grad_norm": 4.44104528427124,
      "learning_rate": 2.2187551868724485e-05,
      "loss": 1.0941,
      "step": 234
    },
    {
      "epoch": 1.663157894736842,
      "grad_norm": 4.569465160369873,
      "learning_rate": 2.1628209582637022e-05,
      "loss": 1.1554,
      "step": 237
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 4.33217191696167,
      "learning_rate": 2.1070581379615253e-05,
      "loss": 0.5728,
      "step": 240
    },
    {
      "epoch": 1.7052631578947368,
      "grad_norm": 4.296968936920166,
      "learning_rate": 2.0514950735481052e-05,
      "loss": 1.0808,
      "step": 243
    },
    {
      "epoch": 1.7263157894736842,
      "grad_norm": 3.474714994430542,
      "learning_rate": 1.9961600110577456e-05,
      "loss": 1.2945,
      "step": 246
    },
    {
      "epoch": 1.7473684210526317,
      "grad_norm": 3.817056655883789,
      "learning_rate": 1.9410810806177104e-05,
      "loss": 1.4233,
      "step": 249
    },
    {
      "epoch": 1.768421052631579,
      "grad_norm": 3.0018868446350098,
      "learning_rate": 1.8862862821480025e-05,
      "loss": 1.0385,
      "step": 252
    },
    {
      "epoch": 1.768421052631579,
      "eval_loss": 0.3030892610549927,
      "eval_runtime": 47.7428,
      "eval_samples_per_second": 5.027,
      "eval_steps_per_second": 0.628,
      "step": 252
    },
    {
      "epoch": 1.7894736842105263,
      "grad_norm": 3.522315502166748,
      "learning_rate": 1.831803471127318e-05,
      "loss": 1.1658,
      "step": 255
    },
    {
      "epoch": 1.8105263157894735,
      "grad_norm": 3.5018210411071777,
      "learning_rate": 1.7776603444324445e-05,
      "loss": 1.0903,
      "step": 258
    },
    {
      "epoch": 1.831578947368421,
      "grad_norm": 4.468841552734375,
      "learning_rate": 1.723884426258277e-05,
      "loss": 1.1171,
      "step": 261
    },
    {
      "epoch": 1.8526315789473684,
      "grad_norm": 3.999666452407837,
      "learning_rate": 1.670503054125621e-05,
      "loss": 1.2162,
      "step": 264
    },
    {
      "epoch": 1.8736842105263158,
      "grad_norm": 3.463674783706665,
      "learning_rate": 1.61754336498389e-05,
      "loss": 0.8498,
      "step": 267
    },
    {
      "epoch": 1.8947368421052633,
      "grad_norm": 3.4514553546905518,
      "learning_rate": 1.5650322814157764e-05,
      "loss": 1.2623,
      "step": 270
    },
    {
      "epoch": 1.9157894736842105,
      "grad_norm": 3.6156108379364014,
      "learning_rate": 1.5129964979508792e-05,
      "loss": 0.8503,
      "step": 273
    },
    {
      "epoch": 1.936842105263158,
      "grad_norm": 3.3259615898132324,
      "learning_rate": 1.4614624674952842e-05,
      "loss": 0.9937,
      "step": 276
    },
    {
      "epoch": 1.9578947368421051,
      "grad_norm": 5.9854230880737305,
      "learning_rate": 1.4104563878839621e-05,
      "loss": 0.9689,
      "step": 279
    },
    {
      "epoch": 1.9789473684210526,
      "grad_norm": 2.22936749458313,
      "learning_rate": 1.3600041885628409e-05,
      "loss": 0.9137,
      "step": 282
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.004664897918701,
      "learning_rate": 1.3101315174073162e-05,
      "loss": 0.6448,
      "step": 285
    },
    {
      "epoch": 2.0210526315789474,
      "grad_norm": 6.208250522613525,
      "learning_rate": 1.2608637276838986e-05,
      "loss": 1.319,
      "step": 288
    },
    {
      "epoch": 2.0210526315789474,
      "eval_loss": 0.30173683166503906,
      "eval_runtime": 47.7589,
      "eval_samples_per_second": 5.025,
      "eval_steps_per_second": 0.628,
      "step": 288
    },
    {
      "epoch": 2.042105263157895,
      "grad_norm": 2.969910144805908,
      "learning_rate": 1.2122258651616306e-05,
      "loss": 0.8383,
      "step": 291
    },
    {
      "epoch": 2.0631578947368423,
      "grad_norm": 2.5157318115234375,
      "learning_rate": 1.1642426553798174e-05,
      "loss": 0.7519,
      "step": 294
    },
    {
      "epoch": 2.0842105263157893,
      "grad_norm": 3.564941167831421,
      "learning_rate": 1.1169384910785614e-05,
      "loss": 0.5701,
      "step": 297
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 3.544473886489868,
      "learning_rate": 1.0703374197984653e-05,
      "loss": 0.7366,
      "step": 300
    },
    {
      "epoch": 2.126315789473684,
      "grad_norm": 2.952383041381836,
      "learning_rate": 1.0244631316558267e-05,
      "loss": 0.6928,
      "step": 303
    },
    {
      "epoch": 2.1473684210526316,
      "grad_norm": 3.4289209842681885,
      "learning_rate": 9.793389472995393e-06,
      "loss": 0.7361,
      "step": 306
    },
    {
      "epoch": 2.168421052631579,
      "grad_norm": 3.7741119861602783,
      "learning_rate": 9.349878060557999e-06,
      "loss": 0.7777,
      "step": 309
    },
    {
      "epoch": 2.1894736842105265,
      "grad_norm": 4.074053764343262,
      "learning_rate": 8.914322542666822e-06,
      "loss": 0.9209,
      "step": 312
    },
    {
      "epoch": 2.2105263157894735,
      "grad_norm": 4.839679718017578,
      "learning_rate": 8.486944338284797e-06,
      "loss": 0.937,
      "step": 315
    },
    {
      "epoch": 2.231578947368421,
      "grad_norm": 3.5984749794006348,
      "learning_rate": 8.067960709356478e-06,
      "loss": 1.0567,
      "step": 318
    },
    {
      "epoch": 2.2526315789473683,
      "grad_norm": 4.226260185241699,
      "learning_rate": 7.657584650360847e-06,
      "loss": 0.8969,
      "step": 321
    },
    {
      "epoch": 2.2736842105263158,
      "grad_norm": 2.624924421310425,
      "learning_rate": 7.256024780033418e-06,
      "loss": 0.7665,
      "step": 324
    },
    {
      "epoch": 2.2736842105263158,
      "eval_loss": 0.3099728524684906,
      "eval_runtime": 47.753,
      "eval_samples_per_second": 5.026,
      "eval_steps_per_second": 0.628,
      "step": 324
    },
    {
      "epoch": 2.294736842105263,
      "grad_norm": 3.7707293033599854,
      "learning_rate": 6.863485235312853e-06,
      "loss": 0.7157,
      "step": 327
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 3.5063211917877197,
      "learning_rate": 6.480165567565913e-06,
      "loss": 0.7941,
      "step": 330
    },
    {
      "epoch": 2.336842105263158,
      "grad_norm": 5.289640426635742,
      "learning_rate": 6.106260641143546e-06,
      "loss": 1.022,
      "step": 333
    },
    {
      "epoch": 2.3578947368421055,
      "grad_norm": 3.4733479022979736,
      "learning_rate": 5.741960534319677e-06,
      "loss": 0.8732,
      "step": 336
    },
    {
      "epoch": 2.3789473684210525,
      "grad_norm": 2.74438214302063,
      "learning_rate": 5.387450442663025e-06,
      "loss": 0.488,
      "step": 339
    },
    {
      "epoch": 2.4,
      "grad_norm": 3.423187732696533,
      "learning_rate": 5.0429105848911e-06,
      "loss": 1.0244,
      "step": 342
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 3.875284194946289,
      "learning_rate": 4.708516111254238e-06,
      "loss": 0.9071,
      "step": 345
    },
    {
      "epoch": 2.442105263157895,
      "grad_norm": 4.707957744598389,
      "learning_rate": 4.384437014496215e-06,
      "loss": 0.8664,
      "step": 348
    },
    {
      "epoch": 2.463157894736842,
      "grad_norm": 4.914385795593262,
      "learning_rate": 4.070838043436786e-06,
      "loss": 0.6006,
      "step": 351
    },
    {
      "epoch": 2.4842105263157896,
      "grad_norm": 3.2543418407440186,
      "learning_rate": 3.7678786192199694e-06,
      "loss": 0.5789,
      "step": 354
    },
    {
      "epoch": 2.5052631578947366,
      "grad_norm": 2.9000864028930664,
      "learning_rate": 3.475712754270716e-06,
      "loss": 0.5431,
      "step": 357
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 4.2075886726379395,
      "learning_rate": 3.194488974001203e-06,
      "loss": 0.6753,
      "step": 360
    },
    {
      "epoch": 2.526315789473684,
      "eval_loss": 0.3118632733821869,
      "eval_runtime": 47.721,
      "eval_samples_per_second": 5.029,
      "eval_steps_per_second": 0.629,
      "step": 360
    },
    {
      "epoch": 2.5473684210526315,
      "grad_norm": 5.408112049102783,
      "learning_rate": 2.9243502413064368e-06,
      "loss": 0.6439,
      "step": 363
    },
    {
      "epoch": 2.568421052631579,
      "grad_norm": 3.7381534576416016,
      "learning_rate": 2.6654338838876665e-06,
      "loss": 0.9288,
      "step": 366
    },
    {
      "epoch": 2.5894736842105264,
      "grad_norm": 4.740654468536377,
      "learning_rate": 2.4178715244404794e-06,
      "loss": 0.9505,
      "step": 369
    },
    {
      "epoch": 2.610526315789474,
      "grad_norm": 4.9893364906311035,
      "learning_rate": 2.1817890137430934e-06,
      "loss": 1.046,
      "step": 372
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 4.344699382781982,
      "learning_rate": 1.9573063666788875e-06,
      "loss": 0.8301,
      "step": 375
    },
    {
      "epoch": 2.6526315789473687,
      "grad_norm": 2.871662139892578,
      "learning_rate": 1.7445377012256126e-06,
      "loss": 0.6642,
      "step": 378
    },
    {
      "epoch": 2.6736842105263157,
      "grad_norm": 3.569286346435547,
      "learning_rate": 1.5435911804424357e-06,
      "loss": 0.8558,
      "step": 381
    },
    {
      "epoch": 2.694736842105263,
      "grad_norm": 4.009424209594727,
      "learning_rate": 1.3545689574841342e-06,
      "loss": 0.8686,
      "step": 384
    },
    {
      "epoch": 2.7157894736842105,
      "grad_norm": 3.5932652950286865,
      "learning_rate": 1.1775671236705365e-06,
      "loss": 1.0848,
      "step": 387
    },
    {
      "epoch": 2.736842105263158,
      "grad_norm": 4.354364395141602,
      "learning_rate": 1.0126756596375686e-06,
      "loss": 1.1122,
      "step": 390
    },
    {
      "epoch": 2.7578947368421054,
      "grad_norm": 3.184096336364746,
      "learning_rate": 8.599783895946761e-07,
      "loss": 0.8129,
      "step": 393
    },
    {
      "epoch": 2.7789473684210524,
      "grad_norm": 4.265777587890625,
      "learning_rate": 7.195529387119815e-07,
      "loss": 0.7224,
      "step": 396
    },
    {
      "epoch": 2.7789473684210524,
      "eval_loss": 0.3112446963787079,
      "eval_runtime": 47.7893,
      "eval_samples_per_second": 5.022,
      "eval_steps_per_second": 0.628,
      "step": 396
    },
    {
      "epoch": 2.8,
      "grad_norm": 3.4699087142944336,
      "learning_rate": 5.914706936587494e-07,
      "loss": 0.614,
      "step": 399
    },
    {
      "epoch": 2.8210526315789473,
      "grad_norm": 2.6950035095214844,
      "learning_rate": 4.75796766313269e-07,
      "loss": 0.9641,
      "step": 402
    },
    {
      "epoch": 2.8421052631578947,
      "grad_norm": 4.25594425201416,
      "learning_rate": 3.7258996066258103e-07,
      "loss": 0.736,
      "step": 405
    },
    {
      "epoch": 2.863157894736842,
      "grad_norm": 3.8812239170074463,
      "learning_rate": 2.819027429088822e-07,
      "loss": 0.7287,
      "step": 408
    },
    {
      "epoch": 2.8842105263157896,
      "grad_norm": 4.651484966278076,
      "learning_rate": 2.0378121479783796e-07,
      "loss": 0.8938,
      "step": 411
    },
    {
      "epoch": 2.905263157894737,
      "grad_norm": 4.784148216247559,
      "learning_rate": 1.3826509018227128e-07,
      "loss": 0.9602,
      "step": 414
    },
    {
      "epoch": 2.9263157894736844,
      "grad_norm": 4.499444007873535,
      "learning_rate": 8.538767483325383e-08,
      "loss": 0.985,
      "step": 417
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 5.214015483856201,
      "learning_rate": 4.517584950877452e-08,
      "loss": 0.9054,
      "step": 420
    },
    {
      "epoch": 2.968421052631579,
      "grad_norm": 3.8694188594818115,
      "learning_rate": 1.7650056288651127e-08,
      "loss": 0.651,
      "step": 423
    },
    {
      "epoch": 2.9894736842105263,
      "grad_norm": 3.8104214668273926,
      "learning_rate": 2.8242881825846223e-09,
      "loss": 0.8252,
      "step": 426
    }
  ],
  "logging_steps": 3,
  "max_steps": 428,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 36,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7558214228836352e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}