|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.3793103448275863, |
|
"eval_steps": 1, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013793103448275862, |
|
"grad_norm": 6.989287853240967, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 1.8926, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013793103448275862, |
|
"eval_loss": 2.144650459289551, |
|
"eval_runtime": 17.2592, |
|
"eval_samples_per_second": 1.159, |
|
"eval_steps_per_second": 0.579, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.027586206896551724, |
|
"grad_norm": 6.281332492828369, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.986, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.027586206896551724, |
|
"eval_loss": 2.0905685424804688, |
|
"eval_runtime": 17.6184, |
|
"eval_samples_per_second": 1.135, |
|
"eval_steps_per_second": 0.568, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.041379310344827586, |
|
"grad_norm": 4.347537040710449, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 1.9355, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.041379310344827586, |
|
"eval_loss": 1.9983774423599243, |
|
"eval_runtime": 17.5928, |
|
"eval_samples_per_second": 1.137, |
|
"eval_steps_per_second": 0.568, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.05517241379310345, |
|
"grad_norm": 4.809764385223389, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.7509, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05517241379310345, |
|
"eval_loss": 1.8737837076187134, |
|
"eval_runtime": 17.6375, |
|
"eval_samples_per_second": 1.134, |
|
"eval_steps_per_second": 0.567, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.06896551724137931, |
|
"grad_norm": 3.4548702239990234, |
|
"learning_rate": 1.1363636363636365e-05, |
|
"loss": 1.8838, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06896551724137931, |
|
"eval_loss": 1.7746165990829468, |
|
"eval_runtime": 18.0257, |
|
"eval_samples_per_second": 1.11, |
|
"eval_steps_per_second": 0.555, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.08275862068965517, |
|
"grad_norm": 3.1943702697753906, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 1.7707, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.08275862068965517, |
|
"eval_loss": 1.6792665719985962, |
|
"eval_runtime": 17.7498, |
|
"eval_samples_per_second": 1.127, |
|
"eval_steps_per_second": 0.563, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.09655172413793103, |
|
"grad_norm": 3.318288564682007, |
|
"learning_rate": 1.590909090909091e-05, |
|
"loss": 1.7171, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.09655172413793103, |
|
"eval_loss": 1.5874873399734497, |
|
"eval_runtime": 17.6295, |
|
"eval_samples_per_second": 1.134, |
|
"eval_steps_per_second": 0.567, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1103448275862069, |
|
"grad_norm": 3.210330009460449, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 1.5734, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1103448275862069, |
|
"eval_loss": 1.535287618637085, |
|
"eval_runtime": 17.6232, |
|
"eval_samples_per_second": 1.135, |
|
"eval_steps_per_second": 0.567, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.12413793103448276, |
|
"grad_norm": 3.2319107055664062, |
|
"learning_rate": 2.0454545454545457e-05, |
|
"loss": 1.7986, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.12413793103448276, |
|
"eval_loss": 1.467301607131958, |
|
"eval_runtime": 17.5824, |
|
"eval_samples_per_second": 1.138, |
|
"eval_steps_per_second": 0.569, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 2.79286789894104, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 1.5025, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"eval_loss": 1.3961191177368164, |
|
"eval_runtime": 18.3446, |
|
"eval_samples_per_second": 1.09, |
|
"eval_steps_per_second": 0.545, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.15172413793103448, |
|
"grad_norm": 2.885422706604004, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.5477, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.15172413793103448, |
|
"eval_loss": 1.3420469760894775, |
|
"eval_runtime": 17.7683, |
|
"eval_samples_per_second": 1.126, |
|
"eval_steps_per_second": 0.563, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.16551724137931034, |
|
"grad_norm": 2.7671327590942383, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 1.6921, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.16551724137931034, |
|
"eval_loss": 1.3071445226669312, |
|
"eval_runtime": 17.652, |
|
"eval_samples_per_second": 1.133, |
|
"eval_steps_per_second": 0.567, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1793103448275862, |
|
"grad_norm": 2.9047963619232178, |
|
"learning_rate": 2.954545454545455e-05, |
|
"loss": 1.5365, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.1793103448275862, |
|
"eval_loss": 1.2601890563964844, |
|
"eval_runtime": 17.5232, |
|
"eval_samples_per_second": 1.141, |
|
"eval_steps_per_second": 0.571, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.19310344827586207, |
|
"grad_norm": 2.6054675579071045, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 1.6621, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.19310344827586207, |
|
"eval_loss": 1.2506535053253174, |
|
"eval_runtime": 17.6295, |
|
"eval_samples_per_second": 1.134, |
|
"eval_steps_per_second": 0.567, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.20689655172413793, |
|
"grad_norm": 2.538036823272705, |
|
"learning_rate": 3.409090909090909e-05, |
|
"loss": 1.6763, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.20689655172413793, |
|
"eval_loss": 1.2366451025009155, |
|
"eval_runtime": 18.6172, |
|
"eval_samples_per_second": 1.074, |
|
"eval_steps_per_second": 0.537, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2206896551724138, |
|
"grad_norm": 2.5125789642333984, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 1.668, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2206896551724138, |
|
"eval_loss": 1.2205184698104858, |
|
"eval_runtime": 17.7529, |
|
"eval_samples_per_second": 1.127, |
|
"eval_steps_per_second": 0.563, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.23448275862068965, |
|
"grad_norm": 5.055665969848633, |
|
"learning_rate": 3.8636363636363636e-05, |
|
"loss": 1.5703, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.23448275862068965, |
|
"eval_loss": 1.167407751083374, |
|
"eval_runtime": 17.5902, |
|
"eval_samples_per_second": 1.137, |
|
"eval_steps_per_second": 0.568, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.2482758620689655, |
|
"grad_norm": 2.567411422729492, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 1.4859, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2482758620689655, |
|
"eval_loss": 1.1367636919021606, |
|
"eval_runtime": 17.4832, |
|
"eval_samples_per_second": 1.144, |
|
"eval_steps_per_second": 0.572, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2620689655172414, |
|
"grad_norm": 2.3214948177337646, |
|
"learning_rate": 4.318181818181819e-05, |
|
"loss": 1.4511, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.2620689655172414, |
|
"eval_loss": 1.1296402215957642, |
|
"eval_runtime": 17.6655, |
|
"eval_samples_per_second": 1.132, |
|
"eval_steps_per_second": 0.566, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 2.390448570251465, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 1.7181, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"eval_loss": 1.126497507095337, |
|
"eval_runtime": 17.9736, |
|
"eval_samples_per_second": 1.113, |
|
"eval_steps_per_second": 0.556, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2896551724137931, |
|
"grad_norm": 2.3728342056274414, |
|
"learning_rate": 4.772727272727273e-05, |
|
"loss": 1.4155, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2896551724137931, |
|
"eval_loss": 1.09345281124115, |
|
"eval_runtime": 17.8283, |
|
"eval_samples_per_second": 1.122, |
|
"eval_steps_per_second": 0.561, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.30344827586206896, |
|
"grad_norm": 2.4872097969055176, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3752, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.30344827586206896, |
|
"eval_loss": 1.0705276727676392, |
|
"eval_runtime": 17.6481, |
|
"eval_samples_per_second": 1.133, |
|
"eval_steps_per_second": 0.567, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.31724137931034485, |
|
"grad_norm": 2.953234910964966, |
|
"learning_rate": 4.999672209164081e-05, |
|
"loss": 1.4449, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.31724137931034485, |
|
"eval_loss": 1.0468412637710571, |
|
"eval_runtime": 17.6053, |
|
"eval_samples_per_second": 1.136, |
|
"eval_steps_per_second": 0.568, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.3310344827586207, |
|
"grad_norm": 2.47603702545166, |
|
"learning_rate": 4.998688922613788e-05, |
|
"loss": 1.4286, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.3310344827586207, |
|
"eval_loss": 1.0292497873306274, |
|
"eval_runtime": 17.5777, |
|
"eval_samples_per_second": 1.138, |
|
"eval_steps_per_second": 0.569, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 2.2879106998443604, |
|
"learning_rate": 4.997050398198977e-05, |
|
"loss": 1.5164, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"eval_loss": 1.0196115970611572, |
|
"eval_runtime": 17.9439, |
|
"eval_samples_per_second": 1.115, |
|
"eval_steps_per_second": 0.557, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3586206896551724, |
|
"grad_norm": 2.319134473800659, |
|
"learning_rate": 4.9947570655942796e-05, |
|
"loss": 1.5282, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.3586206896551724, |
|
"eval_loss": 1.013381004333496, |
|
"eval_runtime": 17.7628, |
|
"eval_samples_per_second": 1.126, |
|
"eval_steps_per_second": 0.563, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.3724137931034483, |
|
"grad_norm": 2.259608745574951, |
|
"learning_rate": 4.991809526186424e-05, |
|
"loss": 1.4901, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.3724137931034483, |
|
"eval_loss": 1.0091207027435303, |
|
"eval_runtime": 17.619, |
|
"eval_samples_per_second": 1.135, |
|
"eval_steps_per_second": 0.568, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.38620689655172413, |
|
"grad_norm": 2.2252631187438965, |
|
"learning_rate": 4.988208552916535e-05, |
|
"loss": 1.5518, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.38620689655172413, |
|
"eval_loss": 1.0063353776931763, |
|
"eval_runtime": 17.4778, |
|
"eval_samples_per_second": 1.144, |
|
"eval_steps_per_second": 0.572, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.2154901027679443, |
|
"learning_rate": 4.983955090077444e-05, |
|
"loss": 1.4682, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.99261075258255, |
|
"eval_runtime": 17.4894, |
|
"eval_samples_per_second": 1.144, |
|
"eval_steps_per_second": 0.572, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 2.320786237716675, |
|
"learning_rate": 4.9790502530660635e-05, |
|
"loss": 1.4691, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"eval_loss": 0.9836109280586243, |
|
"eval_runtime": 16.9043, |
|
"eval_samples_per_second": 1.183, |
|
"eval_steps_per_second": 0.592, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.42758620689655175, |
|
"grad_norm": 2.1385531425476074, |
|
"learning_rate": 4.9734953280908904e-05, |
|
"loss": 1.4696, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.42758620689655175, |
|
"eval_loss": 0.976610541343689, |
|
"eval_runtime": 17.3486, |
|
"eval_samples_per_second": 1.153, |
|
"eval_steps_per_second": 0.576, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.4413793103448276, |
|
"grad_norm": 2.2254769802093506, |
|
"learning_rate": 4.967291771834727e-05, |
|
"loss": 1.531, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.4413793103448276, |
|
"eval_loss": 0.9718761444091797, |
|
"eval_runtime": 17.5285, |
|
"eval_samples_per_second": 1.141, |
|
"eval_steps_per_second": 0.57, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.45517241379310347, |
|
"grad_norm": 2.34843373298645, |
|
"learning_rate": 4.960441211072686e-05, |
|
"loss": 1.5484, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.45517241379310347, |
|
"eval_loss": 0.9682589769363403, |
|
"eval_runtime": 17.5952, |
|
"eval_samples_per_second": 1.137, |
|
"eval_steps_per_second": 0.568, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.4689655172413793, |
|
"grad_norm": 1.9610539674758911, |
|
"learning_rate": 4.9529454422455976e-05, |
|
"loss": 1.3204, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.4689655172413793, |
|
"eval_loss": 0.9610344767570496, |
|
"eval_runtime": 17.6076, |
|
"eval_samples_per_second": 1.136, |
|
"eval_steps_per_second": 0.568, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.4827586206896552, |
|
"grad_norm": 2.2027809619903564, |
|
"learning_rate": 4.944806430988927e-05, |
|
"loss": 1.3801, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.4827586206896552, |
|
"eval_loss": 0.9546059370040894, |
|
"eval_runtime": 17.5811, |
|
"eval_samples_per_second": 1.138, |
|
"eval_steps_per_second": 0.569, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.496551724137931, |
|
"grad_norm": 2.3457250595092773, |
|
"learning_rate": 4.936026311617316e-05, |
|
"loss": 1.4401, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.496551724137931, |
|
"eval_loss": 0.9482511281967163, |
|
"eval_runtime": 17.8351, |
|
"eval_samples_per_second": 1.121, |
|
"eval_steps_per_second": 0.561, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5103448275862069, |
|
"grad_norm": 2.161039352416992, |
|
"learning_rate": 4.926607386564898e-05, |
|
"loss": 1.4067, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5103448275862069, |
|
"eval_loss": 0.9448164701461792, |
|
"eval_runtime": 17.6014, |
|
"eval_samples_per_second": 1.136, |
|
"eval_steps_per_second": 0.568, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5241379310344828, |
|
"grad_norm": 2.1683900356292725, |
|
"learning_rate": 4.916552125781528e-05, |
|
"loss": 1.3806, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5241379310344828, |
|
"eval_loss": 0.9402996897697449, |
|
"eval_runtime": 17.6524, |
|
"eval_samples_per_second": 1.133, |
|
"eval_steps_per_second": 0.566, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5379310344827586, |
|
"grad_norm": 2.2735962867736816, |
|
"learning_rate": 4.9058631660850765e-05, |
|
"loss": 1.4937, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5379310344827586, |
|
"eval_loss": 0.9291872978210449, |
|
"eval_runtime": 17.5838, |
|
"eval_samples_per_second": 1.137, |
|
"eval_steps_per_second": 0.569, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5655172413793104, |
|
"grad_norm": 2.2170450687408447, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 1.316, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.5655172413793104, |
|
"eval_loss": 0.9163956642150879, |
|
"eval_runtime": 15.7145, |
|
"eval_samples_per_second": 1.273, |
|
"eval_steps_per_second": 0.636, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.5793103448275863, |
|
"grad_norm": 2.2266974449157715, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.3854, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.5793103448275863, |
|
"eval_loss": 0.9137259721755981, |
|
"eval_runtime": 15.7133, |
|
"eval_samples_per_second": 1.273, |
|
"eval_steps_per_second": 0.636, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.593103448275862, |
|
"grad_norm": 2.3451268672943115, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 1.4208, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.593103448275862, |
|
"eval_loss": 0.9096618890762329, |
|
"eval_runtime": 15.7895, |
|
"eval_samples_per_second": 1.267, |
|
"eval_steps_per_second": 0.633, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6068965517241379, |
|
"grad_norm": 2.0125885009765625, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.4302, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6068965517241379, |
|
"eval_loss": 0.9058458209037781, |
|
"eval_runtime": 15.6899, |
|
"eval_samples_per_second": 1.275, |
|
"eval_steps_per_second": 0.637, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6206896551724138, |
|
"grad_norm": 2.1096601486206055, |
|
"learning_rate": 1.1363636363636365e-05, |
|
"loss": 1.3981, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6206896551724138, |
|
"eval_loss": 0.8982122540473938, |
|
"eval_runtime": 15.707, |
|
"eval_samples_per_second": 1.273, |
|
"eval_steps_per_second": 0.637, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6344827586206897, |
|
"grad_norm": 1.971846342086792, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 1.263, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.6344827586206897, |
|
"eval_loss": 0.891434371471405, |
|
"eval_runtime": 15.7993, |
|
"eval_samples_per_second": 1.266, |
|
"eval_steps_per_second": 0.633, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.6482758620689655, |
|
"grad_norm": 1.9724080562591553, |
|
"learning_rate": 1.590909090909091e-05, |
|
"loss": 1.355, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.6482758620689655, |
|
"eval_loss": 0.8870094418525696, |
|
"eval_runtime": 15.6828, |
|
"eval_samples_per_second": 1.275, |
|
"eval_steps_per_second": 0.638, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.6620689655172414, |
|
"grad_norm": 2.0631349086761475, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 1.3375, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.6620689655172414, |
|
"eval_loss": 0.8790606260299683, |
|
"eval_runtime": 15.6783, |
|
"eval_samples_per_second": 1.276, |
|
"eval_steps_per_second": 0.638, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.6758620689655173, |
|
"grad_norm": 2.1942760944366455, |
|
"learning_rate": 2.0454545454545457e-05, |
|
"loss": 1.3937, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.6758620689655173, |
|
"eval_loss": 0.8732376098632812, |
|
"eval_runtime": 15.6854, |
|
"eval_samples_per_second": 1.275, |
|
"eval_steps_per_second": 0.638, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 2.119081497192383, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 1.5447, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"eval_loss": 0.8692445755004883, |
|
"eval_runtime": 15.6827, |
|
"eval_samples_per_second": 1.275, |
|
"eval_steps_per_second": 0.638, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7034482758620689, |
|
"grad_norm": 1.9801068305969238, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.2777, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7034482758620689, |
|
"eval_loss": 0.8668963313102722, |
|
"eval_runtime": 15.7049, |
|
"eval_samples_per_second": 1.273, |
|
"eval_steps_per_second": 0.637, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7172413793103448, |
|
"grad_norm": 2.0645248889923096, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 1.3444, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.7172413793103448, |
|
"eval_loss": 0.8615155220031738, |
|
"eval_runtime": 15.6899, |
|
"eval_samples_per_second": 1.275, |
|
"eval_steps_per_second": 0.637, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.7310344827586207, |
|
"grad_norm": 2.1377453804016113, |
|
"learning_rate": 2.954545454545455e-05, |
|
"loss": 1.4174, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.7310344827586207, |
|
"eval_loss": 0.8575263023376465, |
|
"eval_runtime": 15.6427, |
|
"eval_samples_per_second": 1.279, |
|
"eval_steps_per_second": 0.639, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.7448275862068966, |
|
"grad_norm": 2.1462454795837402, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 1.429, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.7448275862068966, |
|
"eval_loss": 0.8533774614334106, |
|
"eval_runtime": 15.7668, |
|
"eval_samples_per_second": 1.268, |
|
"eval_steps_per_second": 0.634, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.7724137931034483, |
|
"grad_norm": 2.1803667545318604, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 1.4271, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.7724137931034483, |
|
"eval_loss": 0.8433731198310852, |
|
"eval_runtime": 17.9885, |
|
"eval_samples_per_second": 1.112, |
|
"eval_steps_per_second": 0.556, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.7862068965517242, |
|
"grad_norm": 2.3162448406219482, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 1.4689, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.7862068965517242, |
|
"eval_loss": 0.8418852090835571, |
|
"eval_runtime": 18.2763, |
|
"eval_samples_per_second": 1.094, |
|
"eval_steps_per_second": 0.547, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.9732853174209595, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 1.2825, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.8386393785476685, |
|
"eval_runtime": 18.1184, |
|
"eval_samples_per_second": 1.104, |
|
"eval_steps_per_second": 0.552, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.8137931034482758, |
|
"grad_norm": 2.0547423362731934, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 1.2972, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.8137931034482758, |
|
"eval_loss": 0.8355510830879211, |
|
"eval_runtime": 18.2216, |
|
"eval_samples_per_second": 1.098, |
|
"eval_steps_per_second": 0.549, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 2.0684103965759277, |
|
"learning_rate": 1.1363636363636365e-05, |
|
"loss": 1.3615, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"eval_loss": 0.8328086137771606, |
|
"eval_runtime": 18.8073, |
|
"eval_samples_per_second": 1.063, |
|
"eval_steps_per_second": 0.532, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8413793103448276, |
|
"grad_norm": 2.0212347507476807, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 1.3648, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.8413793103448276, |
|
"eval_loss": 0.8308294415473938, |
|
"eval_runtime": 18.376, |
|
"eval_samples_per_second": 1.088, |
|
"eval_steps_per_second": 0.544, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.8551724137931035, |
|
"grad_norm": 1.9967029094696045, |
|
"learning_rate": 1.590909090909091e-05, |
|
"loss": 1.4334, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.8551724137931035, |
|
"eval_loss": 0.8297985792160034, |
|
"eval_runtime": 18.24, |
|
"eval_samples_per_second": 1.096, |
|
"eval_steps_per_second": 0.548, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.8689655172413793, |
|
"grad_norm": 1.956730842590332, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 1.246, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.8689655172413793, |
|
"eval_loss": 0.8276138305664062, |
|
"eval_runtime": 18.1099, |
|
"eval_samples_per_second": 1.104, |
|
"eval_steps_per_second": 0.552, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.8827586206896552, |
|
"grad_norm": 1.8840367794036865, |
|
"learning_rate": 2.0454545454545457e-05, |
|
"loss": 1.2346, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.8827586206896552, |
|
"eval_loss": 0.8268927335739136, |
|
"eval_runtime": 18.2242, |
|
"eval_samples_per_second": 1.097, |
|
"eval_steps_per_second": 0.549, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.896551724137931, |
|
"grad_norm": 1.9588886499404907, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 1.3699, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.896551724137931, |
|
"eval_loss": 0.8241379857063293, |
|
"eval_runtime": 18.6162, |
|
"eval_samples_per_second": 1.074, |
|
"eval_steps_per_second": 0.537, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.9103448275862069, |
|
"grad_norm": 2.001984119415283, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4399, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.9103448275862069, |
|
"eval_loss": 0.8220138549804688, |
|
"eval_runtime": 18.4936, |
|
"eval_samples_per_second": 1.081, |
|
"eval_steps_per_second": 0.541, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.9241379310344827, |
|
"grad_norm": 1.9502840042114258, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 1.1969, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.9241379310344827, |
|
"eval_loss": 0.8098680377006531, |
|
"eval_runtime": 18.2406, |
|
"eval_samples_per_second": 1.096, |
|
"eval_steps_per_second": 0.548, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.9379310344827586, |
|
"grad_norm": 1.8572745323181152, |
|
"learning_rate": 2.954545454545455e-05, |
|
"loss": 1.1968, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.9379310344827586, |
|
"eval_loss": 0.7934565544128418, |
|
"eval_runtime": 18.2457, |
|
"eval_samples_per_second": 1.096, |
|
"eval_steps_per_second": 0.548, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.9517241379310345, |
|
"grad_norm": 2.0354831218719482, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 1.2528, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.9517241379310345, |
|
"eval_loss": 0.7829666137695312, |
|
"eval_runtime": 18.2217, |
|
"eval_samples_per_second": 1.098, |
|
"eval_steps_per_second": 0.549, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 2.1164538860321045, |
|
"learning_rate": 3.409090909090909e-05, |
|
"loss": 1.3873, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"eval_loss": 0.7753366231918335, |
|
"eval_runtime": 18.5302, |
|
"eval_samples_per_second": 1.079, |
|
"eval_steps_per_second": 0.54, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9793103448275862, |
|
"grad_norm": 2.032721996307373, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 1.232, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.9793103448275862, |
|
"eval_loss": 0.7628229856491089, |
|
"eval_runtime": 18.4062, |
|
"eval_samples_per_second": 1.087, |
|
"eval_steps_per_second": 0.543, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.993103448275862, |
|
"grad_norm": 2.1039462089538574, |
|
"learning_rate": 3.8636363636363636e-05, |
|
"loss": 1.2715, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.993103448275862, |
|
"eval_loss": 0.751362681388855, |
|
"eval_runtime": 18.2628, |
|
"eval_samples_per_second": 1.095, |
|
"eval_steps_per_second": 0.548, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.006896551724138, |
|
"grad_norm": 2.1415343284606934, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 1.3012, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.006896551724138, |
|
"eval_loss": 0.7407116293907166, |
|
"eval_runtime": 18.1993, |
|
"eval_samples_per_second": 1.099, |
|
"eval_steps_per_second": 0.549, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.0206896551724138, |
|
"grad_norm": 1.9539107084274292, |
|
"learning_rate": 4.318181818181819e-05, |
|
"loss": 1.1411, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.0206896551724138, |
|
"eval_loss": 0.7367935180664062, |
|
"eval_runtime": 18.2237, |
|
"eval_samples_per_second": 1.097, |
|
"eval_steps_per_second": 0.549, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"grad_norm": 2.0641109943389893, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 1.0793, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.0344827586206897, |
|
"eval_loss": 0.7307212948799133, |
|
"eval_runtime": 18.5348, |
|
"eval_samples_per_second": 1.079, |
|
"eval_steps_per_second": 0.54, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.0482758620689656, |
|
"grad_norm": 1.918042778968811, |
|
"learning_rate": 4.772727272727273e-05, |
|
"loss": 1.0897, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.0482758620689656, |
|
"eval_loss": 0.7253277897834778, |
|
"eval_runtime": 18.4554, |
|
"eval_samples_per_second": 1.084, |
|
"eval_steps_per_second": 0.542, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.0620689655172413, |
|
"grad_norm": 2.216691493988037, |
|
"learning_rate": 5e-05, |
|
"loss": 1.2309, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.0620689655172413, |
|
"eval_loss": 0.7224608659744263, |
|
"eval_runtime": 18.0728, |
|
"eval_samples_per_second": 1.107, |
|
"eval_steps_per_second": 0.553, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.0758620689655172, |
|
"grad_norm": 2.304621934890747, |
|
"learning_rate": 4.999672209164081e-05, |
|
"loss": 1.1722, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.0758620689655172, |
|
"eval_loss": 0.7266848683357239, |
|
"eval_runtime": 18.2053, |
|
"eval_samples_per_second": 1.099, |
|
"eval_steps_per_second": 0.549, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.089655172413793, |
|
"grad_norm": 2.0087103843688965, |
|
"learning_rate": 4.998688922613788e-05, |
|
"loss": 1.105, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.089655172413793, |
|
"eval_loss": 0.7276325225830078, |
|
"eval_runtime": 18.0661, |
|
"eval_samples_per_second": 1.107, |
|
"eval_steps_per_second": 0.554, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.103448275862069, |
|
"grad_norm": 2.047912836074829, |
|
"learning_rate": 4.997050398198977e-05, |
|
"loss": 1.0507, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.103448275862069, |
|
"eval_loss": 0.7239590883255005, |
|
"eval_runtime": 18.4343, |
|
"eval_samples_per_second": 1.085, |
|
"eval_steps_per_second": 0.542, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.1172413793103448, |
|
"grad_norm": 2.004422664642334, |
|
"learning_rate": 4.9947570655942796e-05, |
|
"loss": 0.9516, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.1172413793103448, |
|
"eval_loss": 0.7144821882247925, |
|
"eval_runtime": 18.3983, |
|
"eval_samples_per_second": 1.087, |
|
"eval_steps_per_second": 0.544, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.1310344827586207, |
|
"grad_norm": 2.013328790664673, |
|
"learning_rate": 4.991809526186424e-05, |
|
"loss": 1.0593, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.1310344827586207, |
|
"eval_loss": 0.7059406638145447, |
|
"eval_runtime": 18.2362, |
|
"eval_samples_per_second": 1.097, |
|
"eval_steps_per_second": 0.548, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.1448275862068966, |
|
"grad_norm": 2.068134069442749, |
|
"learning_rate": 4.988208552916535e-05, |
|
"loss": 1.1188, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.1448275862068966, |
|
"eval_loss": 0.7021835446357727, |
|
"eval_runtime": 18.1868, |
|
"eval_samples_per_second": 1.1, |
|
"eval_steps_per_second": 0.55, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.1586206896551725, |
|
"grad_norm": 2.2628672122955322, |
|
"learning_rate": 4.983955090077444e-05, |
|
"loss": 1.1473, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.1586206896551725, |
|
"eval_loss": 0.6942790150642395, |
|
"eval_runtime": 18.1494, |
|
"eval_samples_per_second": 1.102, |
|
"eval_steps_per_second": 0.551, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.1724137931034484, |
|
"grad_norm": 2.1747775077819824, |
|
"learning_rate": 4.9790502530660635e-05, |
|
"loss": 1.1778, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.1724137931034484, |
|
"eval_loss": 0.6942981481552124, |
|
"eval_runtime": 18.8763, |
|
"eval_samples_per_second": 1.06, |
|
"eval_steps_per_second": 0.53, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.186206896551724, |
|
"grad_norm": 2.152348041534424, |
|
"learning_rate": 4.9734953280908904e-05, |
|
"loss": 1.331, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.186206896551724, |
|
"eval_loss": 0.6978840827941895, |
|
"eval_runtime": 18.349, |
|
"eval_samples_per_second": 1.09, |
|
"eval_steps_per_second": 0.545, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.069314956665039, |
|
"learning_rate": 4.967291771834727e-05, |
|
"loss": 1.1638, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.6983293294906616, |
|
"eval_runtime": 18.1961, |
|
"eval_samples_per_second": 1.099, |
|
"eval_steps_per_second": 0.55, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.2137931034482758, |
|
"grad_norm": 2.037853717803955, |
|
"learning_rate": 4.960441211072686e-05, |
|
"loss": 1.1118, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.2137931034482758, |
|
"eval_loss": 0.6962876915931702, |
|
"eval_runtime": 18.1105, |
|
"eval_samples_per_second": 1.104, |
|
"eval_steps_per_second": 0.552, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.2275862068965517, |
|
"grad_norm": 1.9961076974868774, |
|
"learning_rate": 4.9529454422455976e-05, |
|
"loss": 1.0972, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.2275862068965517, |
|
"eval_loss": 0.6896785497665405, |
|
"eval_runtime": 18.0553, |
|
"eval_samples_per_second": 1.108, |
|
"eval_steps_per_second": 0.554, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.2413793103448276, |
|
"grad_norm": 2.28176212310791, |
|
"learning_rate": 4.944806430988927e-05, |
|
"loss": 1.3304, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.2413793103448276, |
|
"eval_loss": 0.6826642751693726, |
|
"eval_runtime": 18.5055, |
|
"eval_samples_per_second": 1.081, |
|
"eval_steps_per_second": 0.54, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.2551724137931035, |
|
"grad_norm": 1.894646406173706, |
|
"learning_rate": 4.936026311617316e-05, |
|
"loss": 1.0935, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.2551724137931035, |
|
"eval_loss": 0.678307831287384, |
|
"eval_runtime": 18.3532, |
|
"eval_samples_per_second": 1.09, |
|
"eval_steps_per_second": 0.545, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.2689655172413792, |
|
"grad_norm": 2.0475075244903564, |
|
"learning_rate": 4.926607386564898e-05, |
|
"loss": 1.2393, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.2689655172413792, |
|
"eval_loss": 0.6765857934951782, |
|
"eval_runtime": 18.2689, |
|
"eval_samples_per_second": 1.095, |
|
"eval_steps_per_second": 0.547, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.282758620689655, |
|
"grad_norm": 2.140949249267578, |
|
"learning_rate": 4.916552125781528e-05, |
|
"loss": 1.0277, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.282758620689655, |
|
"eval_loss": 0.6735562682151794, |
|
"eval_runtime": 18.1407, |
|
"eval_samples_per_second": 1.102, |
|
"eval_steps_per_second": 0.551, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.296551724137931, |
|
"grad_norm": 2.235147476196289, |
|
"learning_rate": 4.9058631660850765e-05, |
|
"loss": 1.2081, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.296551724137931, |
|
"eval_loss": 0.6619122624397278, |
|
"eval_runtime": 18.2145, |
|
"eval_samples_per_second": 1.098, |
|
"eval_steps_per_second": 0.549, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.3103448275862069, |
|
"grad_norm": 2.077143669128418, |
|
"learning_rate": 4.894543310469968e-05, |
|
"loss": 1.2378, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.3103448275862069, |
|
"eval_loss": 0.6547893285751343, |
|
"eval_runtime": 18.7488, |
|
"eval_samples_per_second": 1.067, |
|
"eval_steps_per_second": 0.533, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.3241379310344827, |
|
"grad_norm": 1.9517972469329834, |
|
"learning_rate": 4.882595527372152e-05, |
|
"loss": 1.0997, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.3241379310344827, |
|
"eval_loss": 0.6498640775680542, |
|
"eval_runtime": 18.4304, |
|
"eval_samples_per_second": 1.085, |
|
"eval_steps_per_second": 0.543, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.3379310344827586, |
|
"grad_norm": 2.0447959899902344, |
|
"learning_rate": 4.870022949890676e-05, |
|
"loss": 0.9613, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.3379310344827586, |
|
"eval_loss": 0.6370054483413696, |
|
"eval_runtime": 18.252, |
|
"eval_samples_per_second": 1.096, |
|
"eval_steps_per_second": 0.548, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.3517241379310345, |
|
"grad_norm": 2.078657865524292, |
|
"learning_rate": 4.856828874966086e-05, |
|
"loss": 1.1216, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.3517241379310345, |
|
"eval_loss": 0.6291982531547546, |
|
"eval_runtime": 18.2386, |
|
"eval_samples_per_second": 1.097, |
|
"eval_steps_per_second": 0.548, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.3655172413793104, |
|
"grad_norm": 2.0556623935699463, |
|
"learning_rate": 4.8430167625158595e-05, |
|
"loss": 1.0718, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.3655172413793104, |
|
"eval_loss": 0.6218433380126953, |
|
"eval_runtime": 18.1671, |
|
"eval_samples_per_second": 1.101, |
|
"eval_steps_per_second": 0.55, |
|
"step": 99 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 216, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2709753377329971e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|