|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 5000, |
|
"global_step": 87900, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11376564277588168, |
|
"grad_norm": 1.9390705823898315, |
|
"learning_rate": 0.0007909078498293515, |
|
"loss": 1.5809, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.22753128555176336, |
|
"grad_norm": 1.703497052192688, |
|
"learning_rate": 0.000781806598407281, |
|
"loss": 1.54, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3412969283276451, |
|
"grad_norm": 1.7551511526107788, |
|
"learning_rate": 0.0007727144482366326, |
|
"loss": 1.5087, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4550625711035267, |
|
"grad_norm": 1.5709869861602783, |
|
"learning_rate": 0.000763613196814562, |
|
"loss": 1.4773, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5688282138794084, |
|
"grad_norm": 1.5395598411560059, |
|
"learning_rate": 0.0007545119453924914, |
|
"loss": 1.4546, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5688282138794084, |
|
"eval_accuracy": 0.647436, |
|
"eval_loss": 1.4382679462432861, |
|
"eval_runtime": 16.1443, |
|
"eval_samples_per_second": 15485.324, |
|
"eval_steps_per_second": 30.289, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6825938566552902, |
|
"grad_norm": 1.6133095026016235, |
|
"learning_rate": 0.0007454106939704209, |
|
"loss": 1.4513, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7963594994311718, |
|
"grad_norm": 1.3529345989227295, |
|
"learning_rate": 0.0007363185437997725, |
|
"loss": 1.459, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9101251422070534, |
|
"grad_norm": 1.4212840795516968, |
|
"learning_rate": 0.000727217292377702, |
|
"loss": 1.4393, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.023890784982935, |
|
"grad_norm": 1.3942997455596924, |
|
"learning_rate": 0.0007181342434584756, |
|
"loss": 1.4183, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.1376564277588168, |
|
"grad_norm": 1.584731936454773, |
|
"learning_rate": 0.0007090329920364051, |
|
"loss": 1.3759, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.1376564277588168, |
|
"eval_accuracy": 0.660984, |
|
"eval_loss": 1.38503897190094, |
|
"eval_runtime": 16.2019, |
|
"eval_samples_per_second": 15430.245, |
|
"eval_steps_per_second": 30.182, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.2514220705346986, |
|
"grad_norm": 1.4144625663757324, |
|
"learning_rate": 0.0006999317406143345, |
|
"loss": 1.375, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.36518771331058, |
|
"grad_norm": 1.3004510402679443, |
|
"learning_rate": 0.0006908395904436861, |
|
"loss": 1.3729, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.4789533560864618, |
|
"grad_norm": 1.3783901929855347, |
|
"learning_rate": 0.0006817474402730376, |
|
"loss": 1.3562, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.5927189988623436, |
|
"grad_norm": 1.309706449508667, |
|
"learning_rate": 0.000672646188850967, |
|
"loss": 1.355, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"grad_norm": 3.742795944213867, |
|
"learning_rate": 0.0006635540386803186, |
|
"loss": 1.3508, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"eval_accuracy": 0.673728, |
|
"eval_loss": 1.316284418106079, |
|
"eval_runtime": 16.2031, |
|
"eval_samples_per_second": 15429.139, |
|
"eval_steps_per_second": 30.179, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.820250284414107, |
|
"grad_norm": 1.2620598077774048, |
|
"learning_rate": 0.0006544527872582481, |
|
"loss": 1.3472, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.9340159271899886, |
|
"grad_norm": 1.3602592945098877, |
|
"learning_rate": 0.0006453515358361775, |
|
"loss": 1.3371, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.04778156996587, |
|
"grad_norm": 1.3070189952850342, |
|
"learning_rate": 0.000636259385665529, |
|
"loss": 1.3145, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.161547212741752, |
|
"grad_norm": 1.2134970426559448, |
|
"learning_rate": 0.0006271581342434585, |
|
"loss": 1.2917, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.2753128555176336, |
|
"grad_norm": 1.3796401023864746, |
|
"learning_rate": 0.00061806598407281, |
|
"loss": 1.294, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.2753128555176336, |
|
"eval_accuracy": 0.682924, |
|
"eval_loss": 1.283160924911499, |
|
"eval_runtime": 16.1194, |
|
"eval_samples_per_second": 15509.309, |
|
"eval_steps_per_second": 30.336, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.3890784982935154, |
|
"grad_norm": 1.357393741607666, |
|
"learning_rate": 0.0006089738339021616, |
|
"loss": 1.2936, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.502844141069397, |
|
"grad_norm": 1.2381339073181152, |
|
"learning_rate": 0.0005998725824800911, |
|
"loss": 1.2859, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.616609783845279, |
|
"grad_norm": 1.256423830986023, |
|
"learning_rate": 0.0005907713310580204, |
|
"loss": 1.2899, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.73037542662116, |
|
"grad_norm": 1.1443513631820679, |
|
"learning_rate": 0.000581679180887372, |
|
"loss": 1.2846, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.8441410693970424, |
|
"grad_norm": 1.2000058889389038, |
|
"learning_rate": 0.0005725870307167236, |
|
"loss": 1.2811, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.8441410693970424, |
|
"eval_accuracy": 0.688052, |
|
"eval_loss": 1.2580605745315552, |
|
"eval_runtime": 16.1237, |
|
"eval_samples_per_second": 15505.095, |
|
"eval_steps_per_second": 30.328, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.9579067121729237, |
|
"grad_norm": 1.2849873304367065, |
|
"learning_rate": 0.0005634857792946531, |
|
"loss": 1.2779, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.0716723549488054, |
|
"grad_norm": 1.2703396081924438, |
|
"learning_rate": 0.0005543936291240047, |
|
"loss": 1.2444, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.185437997724687, |
|
"grad_norm": 1.356720209121704, |
|
"learning_rate": 0.000545292377701934, |
|
"loss": 1.2303, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.299203640500569, |
|
"grad_norm": 1.128195881843567, |
|
"learning_rate": 0.0005361911262798635, |
|
"loss": 1.2321, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.4129692832764507, |
|
"grad_norm": 1.2033754587173462, |
|
"learning_rate": 0.0005270989761092151, |
|
"loss": 1.2331, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.4129692832764507, |
|
"eval_accuracy": 0.69262, |
|
"eval_loss": 1.2387434244155884, |
|
"eval_runtime": 16.2457, |
|
"eval_samples_per_second": 15388.688, |
|
"eval_steps_per_second": 30.1, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.526734926052332, |
|
"grad_norm": 1.2216309309005737, |
|
"learning_rate": 0.0005179977246871446, |
|
"loss": 1.2384, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.640500568828214, |
|
"grad_norm": 1.3189234733581543, |
|
"learning_rate": 0.000508896473265074, |
|
"loss": 1.239, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.7542662116040955, |
|
"grad_norm": 1.193328857421875, |
|
"learning_rate": 0.0004998043230944255, |
|
"loss": 1.2282, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.868031854379977, |
|
"grad_norm": 1.3810237646102905, |
|
"learning_rate": 0.000490703071672355, |
|
"loss": 1.2301, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.981797497155859, |
|
"grad_norm": 1.477654218673706, |
|
"learning_rate": 0.0004816018202502845, |
|
"loss": 1.2276, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.981797497155859, |
|
"eval_accuracy": 0.697844, |
|
"eval_loss": 1.2226529121398926, |
|
"eval_runtime": 16.1466, |
|
"eval_samples_per_second": 15483.136, |
|
"eval_steps_per_second": 30.285, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 4.09556313993174, |
|
"grad_norm": 2.5721781253814697, |
|
"learning_rate": 0.00047250056882821396, |
|
"loss": 1.2011, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 4.2093287827076225, |
|
"grad_norm": 1.233066439628601, |
|
"learning_rate": 0.00046340841865756544, |
|
"loss": 1.1882, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 4.323094425483504, |
|
"grad_norm": 15.391983032226562, |
|
"learning_rate": 0.0004543071672354949, |
|
"loss": 1.1856, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.436860068259386, |
|
"grad_norm": 1.2283698320388794, |
|
"learning_rate": 0.0004452059158134244, |
|
"loss": 1.1972, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.550625711035267, |
|
"grad_norm": 1.1042656898498535, |
|
"learning_rate": 0.0004361046643913539, |
|
"loss": 1.1964, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.550625711035267, |
|
"eval_accuracy": 0.698972, |
|
"eval_loss": 1.2195725440979004, |
|
"eval_runtime": 16.22, |
|
"eval_samples_per_second": 15413.078, |
|
"eval_steps_per_second": 30.148, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.664391353811149, |
|
"grad_norm": 1.2379703521728516, |
|
"learning_rate": 0.00042701251422070535, |
|
"loss": 1.194, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.778156996587031, |
|
"grad_norm": 1.3536499738693237, |
|
"learning_rate": 0.00041792036405005693, |
|
"loss": 1.1939, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.891922639362912, |
|
"grad_norm": 1.1571460962295532, |
|
"learning_rate": 0.00040881911262798635, |
|
"loss": 1.1952, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 5.005688282138794, |
|
"grad_norm": 1.1833922863006592, |
|
"learning_rate": 0.00039972696245733794, |
|
"loss": 1.1908, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 5.1194539249146755, |
|
"grad_norm": 1.4700716733932495, |
|
"learning_rate": 0.00039062571103526736, |
|
"loss": 1.1498, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.1194539249146755, |
|
"eval_accuracy": 0.703608, |
|
"eval_loss": 1.1993978023529053, |
|
"eval_runtime": 16.3707, |
|
"eval_samples_per_second": 15271.187, |
|
"eval_steps_per_second": 29.87, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 5.233219567690558, |
|
"grad_norm": 1.3525902032852173, |
|
"learning_rate": 0.00038152445961319684, |
|
"loss": 1.1507, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 5.346985210466439, |
|
"grad_norm": 1.3642832040786743, |
|
"learning_rate": 0.0003724232081911263, |
|
"loss": 1.1551, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 5.460750853242321, |
|
"grad_norm": 1.2102240324020386, |
|
"learning_rate": 0.0003633219567690558, |
|
"loss": 1.1574, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.5745164960182025, |
|
"grad_norm": 1.1597959995269775, |
|
"learning_rate": 0.0003542207053469852, |
|
"loss": 1.1545, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.688282138794084, |
|
"grad_norm": 1.2223830223083496, |
|
"learning_rate": 0.00034512855517633675, |
|
"loss": 1.1548, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.688282138794084, |
|
"eval_accuracy": 0.705224, |
|
"eval_loss": 1.1899733543395996, |
|
"eval_runtime": 16.029, |
|
"eval_samples_per_second": 15596.716, |
|
"eval_steps_per_second": 30.507, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.802047781569966, |
|
"grad_norm": 1.1772878170013428, |
|
"learning_rate": 0.0003360364050056883, |
|
"loss": 1.1543, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.915813424345847, |
|
"grad_norm": 1.286970615386963, |
|
"learning_rate": 0.00032693515358361776, |
|
"loss": 1.1566, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 6.0295790671217295, |
|
"grad_norm": 1.1497869491577148, |
|
"learning_rate": 0.00031783390216154724, |
|
"loss": 1.1471, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 6.143344709897611, |
|
"grad_norm": 1.2324450016021729, |
|
"learning_rate": 0.00030873265073947667, |
|
"loss": 1.1141, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 6.257110352673493, |
|
"grad_norm": 1.175905466079712, |
|
"learning_rate": 0.00029963139931740615, |
|
"loss": 1.1232, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 6.257110352673493, |
|
"eval_accuracy": 0.707532, |
|
"eval_loss": 1.183059573173523, |
|
"eval_runtime": 16.1679, |
|
"eval_samples_per_second": 15462.772, |
|
"eval_steps_per_second": 30.245, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 6.370875995449374, |
|
"grad_norm": 1.133489966392517, |
|
"learning_rate": 0.00029053924914675767, |
|
"loss": 1.1213, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 6.484641638225256, |
|
"grad_norm": 1.3633593320846558, |
|
"learning_rate": 0.00028143799772468715, |
|
"loss": 1.1206, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 6.598407281001138, |
|
"grad_norm": 1.2622781991958618, |
|
"learning_rate": 0.00027233674630261663, |
|
"loss": 1.1241, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.712172923777019, |
|
"grad_norm": 1.2032582759857178, |
|
"learning_rate": 0.00026324459613196816, |
|
"loss": 1.1276, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.825938566552901, |
|
"grad_norm": 1.166924238204956, |
|
"learning_rate": 0.00025414334470989764, |
|
"loss": 1.1264, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.825938566552901, |
|
"eval_accuracy": 0.710036, |
|
"eval_loss": 1.1695001125335693, |
|
"eval_runtime": 16.198, |
|
"eval_samples_per_second": 15434.001, |
|
"eval_steps_per_second": 30.189, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.939704209328783, |
|
"grad_norm": 1.236396074295044, |
|
"learning_rate": 0.00024505119453924917, |
|
"loss": 1.1196, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 7.053469852104665, |
|
"grad_norm": 1.2301005125045776, |
|
"learning_rate": 0.00023594994311717865, |
|
"loss": 1.1065, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 7.167235494880546, |
|
"grad_norm": 1.1987460851669312, |
|
"learning_rate": 0.00022685779294653017, |
|
"loss": 1.0845, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 7.281001137656427, |
|
"grad_norm": 1.367330551147461, |
|
"learning_rate": 0.0002177565415244596, |
|
"loss": 1.0915, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 7.39476678043231, |
|
"grad_norm": 1.2554900646209717, |
|
"learning_rate": 0.00020865529010238908, |
|
"loss": 1.0896, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 7.39476678043231, |
|
"eval_accuracy": 0.712788, |
|
"eval_loss": 1.1583917140960693, |
|
"eval_runtime": 15.94, |
|
"eval_samples_per_second": 15683.855, |
|
"eval_steps_per_second": 30.678, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 7.508532423208191, |
|
"grad_norm": 1.1475346088409424, |
|
"learning_rate": 0.00019955403868031853, |
|
"loss": 1.0937, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 7.622298065984073, |
|
"grad_norm": 1.2330896854400635, |
|
"learning_rate": 0.000190452787258248, |
|
"loss": 1.095, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 7.736063708759954, |
|
"grad_norm": 1.3467962741851807, |
|
"learning_rate": 0.0001813515358361775, |
|
"loss": 1.0945, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.849829351535837, |
|
"grad_norm": 1.144555926322937, |
|
"learning_rate": 0.00017225938566552902, |
|
"loss": 1.0943, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.963594994311718, |
|
"grad_norm": 1.39180326461792, |
|
"learning_rate": 0.0001631581342434585, |
|
"loss": 1.0917, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.963594994311718, |
|
"eval_accuracy": 0.715496, |
|
"eval_loss": 1.1535059213638306, |
|
"eval_runtime": 16.0681, |
|
"eval_samples_per_second": 15558.787, |
|
"eval_steps_per_second": 30.433, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 8.0773606370876, |
|
"grad_norm": 1.277241587638855, |
|
"learning_rate": 0.00015405688282138795, |
|
"loss": 1.0693, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 8.19112627986348, |
|
"grad_norm": 1.3388996124267578, |
|
"learning_rate": 0.00014496473265073948, |
|
"loss": 1.064, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 8.304891922639364, |
|
"grad_norm": 1.1635925769805908, |
|
"learning_rate": 0.00013588168373151308, |
|
"loss": 1.0617, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 8.418657565415245, |
|
"grad_norm": 1.1681923866271973, |
|
"learning_rate": 0.00012678043230944256, |
|
"loss": 1.0664, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 8.532423208191126, |
|
"grad_norm": 1.3212028741836548, |
|
"learning_rate": 0.00011767918088737203, |
|
"loss": 1.0654, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 8.532423208191126, |
|
"eval_accuracy": 0.714384, |
|
"eval_loss": 1.154496192932129, |
|
"eval_runtime": 16.158, |
|
"eval_samples_per_second": 15472.18, |
|
"eval_steps_per_second": 30.264, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 8.646188850967008, |
|
"grad_norm": 1.341015100479126, |
|
"learning_rate": 0.00010857792946530148, |
|
"loss": 1.0618, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 8.759954493742889, |
|
"grad_norm": 1.2505824565887451, |
|
"learning_rate": 9.947667804323096e-05, |
|
"loss": 1.0674, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.873720136518772, |
|
"grad_norm": 1.2615190744400024, |
|
"learning_rate": 9.037542662116041e-05, |
|
"loss": 1.0638, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.987485779294653, |
|
"grad_norm": 1.2935796976089478, |
|
"learning_rate": 8.128327645051195e-05, |
|
"loss": 1.0616, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 9.101251422070535, |
|
"grad_norm": 1.3248777389526367, |
|
"learning_rate": 7.218202502844142e-05, |
|
"loss": 1.0395, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 9.101251422070535, |
|
"eval_accuracy": 0.716892, |
|
"eval_loss": 1.1470571756362915, |
|
"eval_runtime": 16.0825, |
|
"eval_samples_per_second": 15544.827, |
|
"eval_steps_per_second": 30.406, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 9.215017064846416, |
|
"grad_norm": 1.379506230354309, |
|
"learning_rate": 6.308077360637088e-05, |
|
"loss": 1.0436, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 9.328782707622299, |
|
"grad_norm": 1.1906781196594238, |
|
"learning_rate": 5.398862343572242e-05, |
|
"loss": 1.0417, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 9.44254835039818, |
|
"grad_norm": 1.1397643089294434, |
|
"learning_rate": 4.489647326507395e-05, |
|
"loss": 1.0376, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 9.556313993174061, |
|
"grad_norm": 1.0807147026062012, |
|
"learning_rate": 3.5813424345847554e-05, |
|
"loss": 1.0381, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 9.670079635949943, |
|
"grad_norm": 1.3149391412734985, |
|
"learning_rate": 2.6712172923777017e-05, |
|
"loss": 1.0383, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 9.670079635949943, |
|
"eval_accuracy": 0.713636, |
|
"eval_loss": 1.1722280979156494, |
|
"eval_runtime": 16.186, |
|
"eval_samples_per_second": 15445.423, |
|
"eval_steps_per_second": 30.211, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 9.783845278725824, |
|
"grad_norm": 1.227634072303772, |
|
"learning_rate": 1.7610921501706483e-05, |
|
"loss": 1.0359, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 9.897610921501707, |
|
"grad_norm": 1.2846591472625732, |
|
"learning_rate": 8.509670079635951e-06, |
|
"loss": 1.0337, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 87900, |
|
"total_flos": 5.6417821488e+17, |
|
"train_loss": 1.2023330011465443, |
|
"train_runtime": 3087.8654, |
|
"train_samples_per_second": 14573.174, |
|
"train_steps_per_second": 28.466 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 87900, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5000, |
|
"total_flos": 5.6417821488e+17, |
|
"train_batch_size": 512, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|