{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 5000, "global_step": 87900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11376564277588168, "grad_norm": 1.9390705823898315, "learning_rate": 0.0007909078498293515, "loss": 1.5809, "step": 1000 }, { "epoch": 0.22753128555176336, "grad_norm": 1.703497052192688, "learning_rate": 0.000781806598407281, "loss": 1.54, "step": 2000 }, { "epoch": 0.3412969283276451, "grad_norm": 1.7551511526107788, "learning_rate": 0.0007727144482366326, "loss": 1.5087, "step": 3000 }, { "epoch": 0.4550625711035267, "grad_norm": 1.5709869861602783, "learning_rate": 0.000763613196814562, "loss": 1.4773, "step": 4000 }, { "epoch": 0.5688282138794084, "grad_norm": 1.5395598411560059, "learning_rate": 0.0007545119453924914, "loss": 1.4546, "step": 5000 }, { "epoch": 0.5688282138794084, "eval_accuracy": 0.647436, "eval_loss": 1.4382679462432861, "eval_runtime": 16.1443, "eval_samples_per_second": 15485.324, "eval_steps_per_second": 30.289, "step": 5000 }, { "epoch": 0.6825938566552902, "grad_norm": 1.6133095026016235, "learning_rate": 0.0007454106939704209, "loss": 1.4513, "step": 6000 }, { "epoch": 0.7963594994311718, "grad_norm": 1.3529345989227295, "learning_rate": 0.0007363185437997725, "loss": 1.459, "step": 7000 }, { "epoch": 0.9101251422070534, "grad_norm": 1.4212840795516968, "learning_rate": 0.000727217292377702, "loss": 1.4393, "step": 8000 }, { "epoch": 1.023890784982935, "grad_norm": 1.3942997455596924, "learning_rate": 0.0007181342434584756, "loss": 1.4183, "step": 9000 }, { "epoch": 1.1376564277588168, "grad_norm": 1.584731936454773, "learning_rate": 0.0007090329920364051, "loss": 1.3759, "step": 10000 }, { "epoch": 1.1376564277588168, "eval_accuracy": 0.660984, "eval_loss": 1.38503897190094, "eval_runtime": 16.2019, "eval_samples_per_second": 15430.245, "eval_steps_per_second": 30.182, "step": 10000 }, { "epoch": 1.2514220705346986, "grad_norm": 1.4144625663757324, "learning_rate": 0.0006999317406143345, "loss": 1.375, "step": 11000 }, { "epoch": 1.36518771331058, "grad_norm": 1.3004510402679443, "learning_rate": 0.0006908395904436861, "loss": 1.3729, "step": 12000 }, { "epoch": 1.4789533560864618, "grad_norm": 1.3783901929855347, "learning_rate": 0.0006817474402730376, "loss": 1.3562, "step": 13000 }, { "epoch": 1.5927189988623436, "grad_norm": 1.309706449508667, "learning_rate": 0.000672646188850967, "loss": 1.355, "step": 14000 }, { "epoch": 1.7064846416382253, "grad_norm": 3.742795944213867, "learning_rate": 0.0006635540386803186, "loss": 1.3508, "step": 15000 }, { "epoch": 1.7064846416382253, "eval_accuracy": 0.673728, "eval_loss": 1.316284418106079, "eval_runtime": 16.2031, "eval_samples_per_second": 15429.139, "eval_steps_per_second": 30.179, "step": 15000 }, { "epoch": 1.820250284414107, "grad_norm": 1.2620598077774048, "learning_rate": 0.0006544527872582481, "loss": 1.3472, "step": 16000 }, { "epoch": 1.9340159271899886, "grad_norm": 1.3602592945098877, "learning_rate": 0.0006453515358361775, "loss": 1.3371, "step": 17000 }, { "epoch": 2.04778156996587, "grad_norm": 1.3070189952850342, "learning_rate": 0.000636259385665529, "loss": 1.3145, "step": 18000 }, { "epoch": 2.161547212741752, "grad_norm": 1.2134970426559448, "learning_rate": 0.0006271581342434585, "loss": 1.2917, "step": 19000 }, { "epoch": 2.2753128555176336, "grad_norm": 1.3796401023864746, "learning_rate": 0.00061806598407281, "loss": 1.294, "step": 20000 }, { "epoch": 2.2753128555176336, "eval_accuracy": 0.682924, "eval_loss": 1.283160924911499, "eval_runtime": 16.1194, "eval_samples_per_second": 15509.309, "eval_steps_per_second": 30.336, "step": 20000 }, { "epoch": 2.3890784982935154, "grad_norm": 1.357393741607666, "learning_rate": 0.0006089738339021616, "loss": 1.2936, "step": 21000 }, { "epoch": 2.502844141069397, "grad_norm": 1.2381339073181152, "learning_rate": 0.0005998725824800911, "loss": 1.2859, "step": 22000 }, { "epoch": 2.616609783845279, "grad_norm": 1.256423830986023, "learning_rate": 0.0005907713310580204, "loss": 1.2899, "step": 23000 }, { "epoch": 2.73037542662116, "grad_norm": 1.1443513631820679, "learning_rate": 0.000581679180887372, "loss": 1.2846, "step": 24000 }, { "epoch": 2.8441410693970424, "grad_norm": 1.2000058889389038, "learning_rate": 0.0005725870307167236, "loss": 1.2811, "step": 25000 }, { "epoch": 2.8441410693970424, "eval_accuracy": 0.688052, "eval_loss": 1.2580605745315552, "eval_runtime": 16.1237, "eval_samples_per_second": 15505.095, "eval_steps_per_second": 30.328, "step": 25000 }, { "epoch": 2.9579067121729237, "grad_norm": 1.2849873304367065, "learning_rate": 0.0005634857792946531, "loss": 1.2779, "step": 26000 }, { "epoch": 3.0716723549488054, "grad_norm": 1.2703396081924438, "learning_rate": 0.0005543936291240047, "loss": 1.2444, "step": 27000 }, { "epoch": 3.185437997724687, "grad_norm": 1.356720209121704, "learning_rate": 0.000545292377701934, "loss": 1.2303, "step": 28000 }, { "epoch": 3.299203640500569, "grad_norm": 1.128195881843567, "learning_rate": 0.0005361911262798635, "loss": 1.2321, "step": 29000 }, { "epoch": 3.4129692832764507, "grad_norm": 1.2033754587173462, "learning_rate": 0.0005270989761092151, "loss": 1.2331, "step": 30000 }, { "epoch": 3.4129692832764507, "eval_accuracy": 0.69262, "eval_loss": 1.2387434244155884, "eval_runtime": 16.2457, "eval_samples_per_second": 15388.688, "eval_steps_per_second": 30.1, "step": 30000 }, { "epoch": 3.526734926052332, "grad_norm": 1.2216309309005737, "learning_rate": 0.0005179977246871446, "loss": 1.2384, "step": 31000 }, { "epoch": 3.640500568828214, "grad_norm": 1.3189234733581543, "learning_rate": 0.000508896473265074, "loss": 1.239, "step": 32000 }, { "epoch": 3.7542662116040955, "grad_norm": 1.193328857421875, "learning_rate": 0.0004998043230944255, "loss": 1.2282, "step": 33000 }, { "epoch": 3.868031854379977, "grad_norm": 1.3810237646102905, "learning_rate": 0.000490703071672355, "loss": 1.2301, "step": 34000 }, { "epoch": 3.981797497155859, "grad_norm": 1.477654218673706, "learning_rate": 0.0004816018202502845, "loss": 1.2276, "step": 35000 }, { "epoch": 3.981797497155859, "eval_accuracy": 0.697844, "eval_loss": 1.2226529121398926, "eval_runtime": 16.1466, "eval_samples_per_second": 15483.136, "eval_steps_per_second": 30.285, "step": 35000 }, { "epoch": 4.09556313993174, "grad_norm": 2.5721781253814697, "learning_rate": 0.00047250056882821396, "loss": 1.2011, "step": 36000 }, { "epoch": 4.2093287827076225, "grad_norm": 1.233066439628601, "learning_rate": 0.00046340841865756544, "loss": 1.1882, "step": 37000 }, { "epoch": 4.323094425483504, "grad_norm": 15.391983032226562, "learning_rate": 0.0004543071672354949, "loss": 1.1856, "step": 38000 }, { "epoch": 4.436860068259386, "grad_norm": 1.2283698320388794, "learning_rate": 0.0004452059158134244, "loss": 1.1972, "step": 39000 }, { "epoch": 4.550625711035267, "grad_norm": 1.1042656898498535, "learning_rate": 0.0004361046643913539, "loss": 1.1964, "step": 40000 }, { "epoch": 4.550625711035267, "eval_accuracy": 0.698972, "eval_loss": 1.2195725440979004, "eval_runtime": 16.22, "eval_samples_per_second": 15413.078, "eval_steps_per_second": 30.148, "step": 40000 }, { "epoch": 4.664391353811149, "grad_norm": 1.2379703521728516, "learning_rate": 0.00042701251422070535, "loss": 1.194, "step": 41000 }, { "epoch": 4.778156996587031, "grad_norm": 1.3536499738693237, "learning_rate": 0.00041792036405005693, "loss": 1.1939, "step": 42000 }, { "epoch": 4.891922639362912, "grad_norm": 1.1571460962295532, "learning_rate": 0.00040881911262798635, "loss": 1.1952, "step": 43000 }, { "epoch": 5.005688282138794, "grad_norm": 1.1833922863006592, "learning_rate": 0.00039972696245733794, "loss": 1.1908, "step": 44000 }, { "epoch": 5.1194539249146755, "grad_norm": 1.4700716733932495, "learning_rate": 0.00039062571103526736, "loss": 1.1498, "step": 45000 }, { "epoch": 5.1194539249146755, "eval_accuracy": 0.703608, "eval_loss": 1.1993978023529053, "eval_runtime": 16.3707, "eval_samples_per_second": 15271.187, "eval_steps_per_second": 29.87, "step": 45000 }, { "epoch": 5.233219567690558, "grad_norm": 1.3525902032852173, "learning_rate": 0.00038152445961319684, "loss": 1.1507, "step": 46000 }, { "epoch": 5.346985210466439, "grad_norm": 1.3642832040786743, "learning_rate": 0.0003724232081911263, "loss": 1.1551, "step": 47000 }, { "epoch": 5.460750853242321, "grad_norm": 1.2102240324020386, "learning_rate": 0.0003633219567690558, "loss": 1.1574, "step": 48000 }, { "epoch": 5.5745164960182025, "grad_norm": 1.1597959995269775, "learning_rate": 0.0003542207053469852, "loss": 1.1545, "step": 49000 }, { "epoch": 5.688282138794084, "grad_norm": 1.2223830223083496, "learning_rate": 0.00034512855517633675, "loss": 1.1548, "step": 50000 }, { "epoch": 5.688282138794084, "eval_accuracy": 0.705224, "eval_loss": 1.1899733543395996, "eval_runtime": 16.029, "eval_samples_per_second": 15596.716, "eval_steps_per_second": 30.507, "step": 50000 }, { "epoch": 5.802047781569966, "grad_norm": 1.1772878170013428, "learning_rate": 0.0003360364050056883, "loss": 1.1543, "step": 51000 }, { "epoch": 5.915813424345847, "grad_norm": 1.286970615386963, "learning_rate": 0.00032693515358361776, "loss": 1.1566, "step": 52000 }, { "epoch": 6.0295790671217295, "grad_norm": 1.1497869491577148, "learning_rate": 0.00031783390216154724, "loss": 1.1471, "step": 53000 }, { "epoch": 6.143344709897611, "grad_norm": 1.2324450016021729, "learning_rate": 0.00030873265073947667, "loss": 1.1141, "step": 54000 }, { "epoch": 6.257110352673493, "grad_norm": 1.175905466079712, "learning_rate": 0.00029963139931740615, "loss": 1.1232, "step": 55000 }, { "epoch": 6.257110352673493, "eval_accuracy": 0.707532, "eval_loss": 1.183059573173523, "eval_runtime": 16.1679, "eval_samples_per_second": 15462.772, "eval_steps_per_second": 30.245, "step": 55000 }, { "epoch": 6.370875995449374, "grad_norm": 1.133489966392517, "learning_rate": 0.00029053924914675767, "loss": 1.1213, "step": 56000 }, { "epoch": 6.484641638225256, "grad_norm": 1.3633593320846558, "learning_rate": 0.00028143799772468715, "loss": 1.1206, "step": 57000 }, { "epoch": 6.598407281001138, "grad_norm": 1.2622781991958618, "learning_rate": 0.00027233674630261663, "loss": 1.1241, "step": 58000 }, { "epoch": 6.712172923777019, "grad_norm": 1.2032582759857178, "learning_rate": 0.00026324459613196816, "loss": 1.1276, "step": 59000 }, { "epoch": 6.825938566552901, "grad_norm": 1.166924238204956, "learning_rate": 0.00025414334470989764, "loss": 1.1264, "step": 60000 }, { "epoch": 6.825938566552901, "eval_accuracy": 0.710036, "eval_loss": 1.1695001125335693, "eval_runtime": 16.198, "eval_samples_per_second": 15434.001, "eval_steps_per_second": 30.189, "step": 60000 }, { "epoch": 6.939704209328783, "grad_norm": 1.236396074295044, "learning_rate": 0.00024505119453924917, "loss": 1.1196, "step": 61000 }, { "epoch": 7.053469852104665, "grad_norm": 1.2301005125045776, "learning_rate": 0.00023594994311717865, "loss": 1.1065, "step": 62000 }, { "epoch": 7.167235494880546, "grad_norm": 1.1987460851669312, "learning_rate": 0.00022685779294653017, "loss": 1.0845, "step": 63000 }, { "epoch": 7.281001137656427, "grad_norm": 1.367330551147461, "learning_rate": 0.0002177565415244596, "loss": 1.0915, "step": 64000 }, { "epoch": 7.39476678043231, "grad_norm": 1.2554900646209717, "learning_rate": 0.00020865529010238908, "loss": 1.0896, "step": 65000 }, { "epoch": 7.39476678043231, "eval_accuracy": 0.712788, "eval_loss": 1.1583917140960693, "eval_runtime": 15.94, "eval_samples_per_second": 15683.855, "eval_steps_per_second": 30.678, "step": 65000 }, { "epoch": 7.508532423208191, "grad_norm": 1.1475346088409424, "learning_rate": 0.00019955403868031853, "loss": 1.0937, "step": 66000 }, { "epoch": 7.622298065984073, "grad_norm": 1.2330896854400635, "learning_rate": 0.000190452787258248, "loss": 1.095, "step": 67000 }, { "epoch": 7.736063708759954, "grad_norm": 1.3467962741851807, "learning_rate": 0.0001813515358361775, "loss": 1.0945, "step": 68000 }, { "epoch": 7.849829351535837, "grad_norm": 1.144555926322937, "learning_rate": 0.00017225938566552902, "loss": 1.0943, "step": 69000 }, { "epoch": 7.963594994311718, "grad_norm": 1.39180326461792, "learning_rate": 0.0001631581342434585, "loss": 1.0917, "step": 70000 }, { "epoch": 7.963594994311718, "eval_accuracy": 0.715496, "eval_loss": 1.1535059213638306, "eval_runtime": 16.0681, "eval_samples_per_second": 15558.787, "eval_steps_per_second": 30.433, "step": 70000 }, { "epoch": 8.0773606370876, "grad_norm": 1.277241587638855, "learning_rate": 0.00015405688282138795, "loss": 1.0693, "step": 71000 }, { "epoch": 8.19112627986348, "grad_norm": 1.3388996124267578, "learning_rate": 0.00014496473265073948, "loss": 1.064, "step": 72000 }, { "epoch": 8.304891922639364, "grad_norm": 1.1635925769805908, "learning_rate": 0.00013588168373151308, "loss": 1.0617, "step": 73000 }, { "epoch": 8.418657565415245, "grad_norm": 1.1681923866271973, "learning_rate": 0.00012678043230944256, "loss": 1.0664, "step": 74000 }, { "epoch": 8.532423208191126, "grad_norm": 1.3212028741836548, "learning_rate": 0.00011767918088737203, "loss": 1.0654, "step": 75000 }, { "epoch": 8.532423208191126, "eval_accuracy": 0.714384, "eval_loss": 1.154496192932129, "eval_runtime": 16.158, "eval_samples_per_second": 15472.18, "eval_steps_per_second": 30.264, "step": 75000 }, { "epoch": 8.646188850967008, "grad_norm": 1.341015100479126, "learning_rate": 0.00010857792946530148, "loss": 1.0618, "step": 76000 }, { "epoch": 8.759954493742889, "grad_norm": 1.2505824565887451, "learning_rate": 9.947667804323096e-05, "loss": 1.0674, "step": 77000 }, { "epoch": 8.873720136518772, "grad_norm": 1.2615190744400024, "learning_rate": 9.037542662116041e-05, "loss": 1.0638, "step": 78000 }, { "epoch": 8.987485779294653, "grad_norm": 1.2935796976089478, "learning_rate": 8.128327645051195e-05, "loss": 1.0616, "step": 79000 }, { "epoch": 9.101251422070535, "grad_norm": 1.3248777389526367, "learning_rate": 7.218202502844142e-05, "loss": 1.0395, "step": 80000 }, { "epoch": 9.101251422070535, "eval_accuracy": 0.716892, "eval_loss": 1.1470571756362915, "eval_runtime": 16.0825, "eval_samples_per_second": 15544.827, "eval_steps_per_second": 30.406, "step": 80000 }, { "epoch": 9.215017064846416, "grad_norm": 1.379506230354309, "learning_rate": 6.308077360637088e-05, "loss": 1.0436, "step": 81000 }, { "epoch": 9.328782707622299, "grad_norm": 1.1906781196594238, "learning_rate": 5.398862343572242e-05, "loss": 1.0417, "step": 82000 }, { "epoch": 9.44254835039818, "grad_norm": 1.1397643089294434, "learning_rate": 4.489647326507395e-05, "loss": 1.0376, "step": 83000 }, { "epoch": 9.556313993174061, "grad_norm": 1.0807147026062012, "learning_rate": 3.5813424345847554e-05, "loss": 1.0381, "step": 84000 }, { "epoch": 9.670079635949943, "grad_norm": 1.3149391412734985, "learning_rate": 2.6712172923777017e-05, "loss": 1.0383, "step": 85000 }, { "epoch": 9.670079635949943, "eval_accuracy": 0.713636, "eval_loss": 1.1722280979156494, "eval_runtime": 16.186, "eval_samples_per_second": 15445.423, "eval_steps_per_second": 30.211, "step": 85000 }, { "epoch": 9.783845278725824, "grad_norm": 1.227634072303772, "learning_rate": 1.7610921501706483e-05, "loss": 1.0359, "step": 86000 }, { "epoch": 9.897610921501707, "grad_norm": 1.2846591472625732, "learning_rate": 8.509670079635951e-06, "loss": 1.0337, "step": 87000 }, { "epoch": 10.0, "step": 87900, "total_flos": 5.6417821488e+17, "train_loss": 1.2023330011465443, "train_runtime": 3087.8654, "train_samples_per_second": 14573.174, "train_steps_per_second": 28.466 } ], "logging_steps": 1000, "max_steps": 87900, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "total_flos": 5.6417821488e+17, "train_batch_size": 512, "trial_name": null, "trial_params": null }