{ "best_metric": 1.2132208347320557, "best_model_checkpoint": "./output/checkpoints/2024-06-10_15-37-32/checkpoint-50", "epoch": 0.847457627118644, "eval_steps": 1, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01694915254237288, "grad_norm": 8.13034439086914, "learning_rate": 6.666666666666667e-05, "loss": 2.9658, "step": 1 }, { "epoch": 0.01694915254237288, "eval_loss": 2.9201831817626953, "eval_runtime": 20.1288, "eval_samples_per_second": 7.75, "eval_steps_per_second": 0.497, "step": 1 }, { "epoch": 0.03389830508474576, "grad_norm": 7.588105201721191, "learning_rate": 0.00013333333333333334, "loss": 2.8251, "step": 2 }, { "epoch": 0.03389830508474576, "eval_loss": 2.1139814853668213, "eval_runtime": 20.166, "eval_samples_per_second": 7.736, "eval_steps_per_second": 0.496, "step": 2 }, { "epoch": 0.05084745762711865, "grad_norm": 4.060599327087402, "learning_rate": 0.0002, "loss": 2.1092, "step": 3 }, { "epoch": 0.05084745762711865, "eval_loss": 1.677042841911316, "eval_runtime": 20.2239, "eval_samples_per_second": 7.714, "eval_steps_per_second": 0.494, "step": 3 }, { "epoch": 0.06779661016949153, "grad_norm": 1.6822344064712524, "learning_rate": 0.0002666666666666667, "loss": 1.6407, "step": 4 }, { "epoch": 0.06779661016949153, "eval_loss": 1.512647271156311, "eval_runtime": 20.2526, "eval_samples_per_second": 7.703, "eval_steps_per_second": 0.494, "step": 4 }, { "epoch": 0.0847457627118644, "grad_norm": 0.8077585697174072, "learning_rate": 0.0003333333333333334, "loss": 1.48, "step": 5 }, { "epoch": 0.0847457627118644, "eval_loss": 1.4189282655715942, "eval_runtime": 20.3082, "eval_samples_per_second": 7.682, "eval_steps_per_second": 0.492, "step": 5 }, { "epoch": 0.1016949152542373, "grad_norm": 0.7878511548042297, "learning_rate": 0.0004, "loss": 1.4139, "step": 6 }, { "epoch": 0.1016949152542373, "eval_loss": 1.3850855827331543, "eval_runtime": 20.3109, "eval_samples_per_second": 7.681, "eval_steps_per_second": 0.492, "step": 6 }, { "epoch": 0.11864406779661017, "grad_norm": 0.49660709500312805, "learning_rate": 0.00039245283018867925, "loss": 1.392, "step": 7 }, { "epoch": 0.11864406779661017, "eval_loss": 1.425636887550354, "eval_runtime": 20.1672, "eval_samples_per_second": 7.735, "eval_steps_per_second": 0.496, "step": 7 }, { "epoch": 0.13559322033898305, "grad_norm": 1.5075709819793701, "learning_rate": 0.00038490566037735854, "loss": 1.3954, "step": 8 }, { "epoch": 0.13559322033898305, "eval_loss": 1.347417950630188, "eval_runtime": 20.2898, "eval_samples_per_second": 7.689, "eval_steps_per_second": 0.493, "step": 8 }, { "epoch": 0.15254237288135594, "grad_norm": 0.517346203327179, "learning_rate": 0.00037735849056603777, "loss": 1.3231, "step": 9 }, { "epoch": 0.15254237288135594, "eval_loss": 1.314124345779419, "eval_runtime": 20.2626, "eval_samples_per_second": 7.699, "eval_steps_per_second": 0.494, "step": 9 }, { "epoch": 0.1694915254237288, "grad_norm": 0.2546059191226959, "learning_rate": 0.000369811320754717, "loss": 1.2504, "step": 10 }, { "epoch": 0.1694915254237288, "eval_loss": 1.2959833145141602, "eval_runtime": 20.2614, "eval_samples_per_second": 7.699, "eval_steps_per_second": 0.494, "step": 10 }, { "epoch": 0.1864406779661017, "grad_norm": 0.1385415643453598, "learning_rate": 0.00036226415094339624, "loss": 1.274, "step": 11 }, { "epoch": 0.1864406779661017, "eval_loss": 1.2856695652008057, "eval_runtime": 20.2637, "eval_samples_per_second": 7.698, "eval_steps_per_second": 0.493, "step": 11 }, { "epoch": 0.2033898305084746, "grad_norm": 0.13630884885787964, "learning_rate": 0.0003547169811320755, "loss": 1.2657, "step": 12 }, { "epoch": 0.2033898305084746, "eval_loss": 1.2764700651168823, "eval_runtime": 20.2991, "eval_samples_per_second": 7.685, "eval_steps_per_second": 0.493, "step": 12 }, { "epoch": 0.22033898305084745, "grad_norm": 0.12581680715084076, "learning_rate": 0.00034716981132075476, "loss": 1.2436, "step": 13 }, { "epoch": 0.22033898305084745, "eval_loss": 1.269006609916687, "eval_runtime": 20.2819, "eval_samples_per_second": 7.692, "eval_steps_per_second": 0.493, "step": 13 }, { "epoch": 0.23728813559322035, "grad_norm": 0.12691529095172882, "learning_rate": 0.000339622641509434, "loss": 1.2478, "step": 14 }, { "epoch": 0.23728813559322035, "eval_loss": 1.2635128498077393, "eval_runtime": 20.2714, "eval_samples_per_second": 7.696, "eval_steps_per_second": 0.493, "step": 14 }, { "epoch": 0.2542372881355932, "grad_norm": 0.12215649336576462, "learning_rate": 0.0003320754716981132, "loss": 1.2741, "step": 15 }, { "epoch": 0.2542372881355932, "eval_loss": 1.2581740617752075, "eval_runtime": 20.2259, "eval_samples_per_second": 7.713, "eval_steps_per_second": 0.494, "step": 15 }, { "epoch": 0.2711864406779661, "grad_norm": 0.11825581640005112, "learning_rate": 0.0003245283018867925, "loss": 1.2508, "step": 16 }, { "epoch": 0.2711864406779661, "eval_loss": 1.253954529762268, "eval_runtime": 20.3786, "eval_samples_per_second": 7.655, "eval_steps_per_second": 0.491, "step": 16 }, { "epoch": 0.288135593220339, "grad_norm": 0.11941225081682205, "learning_rate": 0.00031698113207547174, "loss": 1.2587, "step": 17 }, { "epoch": 0.288135593220339, "eval_loss": 1.2502553462982178, "eval_runtime": 20.2903, "eval_samples_per_second": 7.688, "eval_steps_per_second": 0.493, "step": 17 }, { "epoch": 0.3050847457627119, "grad_norm": 0.11191528290510178, "learning_rate": 0.000309433962264151, "loss": 1.2692, "step": 18 }, { "epoch": 0.3050847457627119, "eval_loss": 1.2474076747894287, "eval_runtime": 20.3196, "eval_samples_per_second": 7.677, "eval_steps_per_second": 0.492, "step": 18 }, { "epoch": 0.3220338983050847, "grad_norm": 0.11136776953935623, "learning_rate": 0.0003018867924528302, "loss": 1.2347, "step": 19 }, { "epoch": 0.3220338983050847, "eval_loss": 1.2446962594985962, "eval_runtime": 20.1872, "eval_samples_per_second": 7.728, "eval_steps_per_second": 0.495, "step": 19 }, { "epoch": 0.3389830508474576, "grad_norm": 0.1218048483133316, "learning_rate": 0.00029433962264150944, "loss": 1.2619, "step": 20 }, { "epoch": 0.3389830508474576, "eval_loss": 1.2424840927124023, "eval_runtime": 20.354, "eval_samples_per_second": 7.664, "eval_steps_per_second": 0.491, "step": 20 }, { "epoch": 0.3559322033898305, "grad_norm": 0.11501109600067139, "learning_rate": 0.00028679245283018867, "loss": 1.2759, "step": 21 }, { "epoch": 0.3559322033898305, "eval_loss": 1.2401403188705444, "eval_runtime": 20.2503, "eval_samples_per_second": 7.704, "eval_steps_per_second": 0.494, "step": 21 }, { "epoch": 0.3728813559322034, "grad_norm": 0.11031804978847504, "learning_rate": 0.0002792452830188679, "loss": 1.2141, "step": 22 }, { "epoch": 0.3728813559322034, "eval_loss": 1.2376986742019653, "eval_runtime": 20.2534, "eval_samples_per_second": 7.702, "eval_steps_per_second": 0.494, "step": 22 }, { "epoch": 0.3898305084745763, "grad_norm": 0.10441834479570389, "learning_rate": 0.0002716981132075472, "loss": 1.2333, "step": 23 }, { "epoch": 0.3898305084745763, "eval_loss": 1.2357385158538818, "eval_runtime": 20.2681, "eval_samples_per_second": 7.697, "eval_steps_per_second": 0.493, "step": 23 }, { "epoch": 0.4067796610169492, "grad_norm": 0.10192928463220596, "learning_rate": 0.0002641509433962264, "loss": 1.2022, "step": 24 }, { "epoch": 0.4067796610169492, "eval_loss": 1.23451828956604, "eval_runtime": 20.2473, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 24 }, { "epoch": 0.423728813559322, "grad_norm": 0.11488105356693268, "learning_rate": 0.00025660377358490566, "loss": 1.2704, "step": 25 }, { "epoch": 0.423728813559322, "eval_loss": 1.2329152822494507, "eval_runtime": 20.2073, "eval_samples_per_second": 7.72, "eval_steps_per_second": 0.495, "step": 25 }, { "epoch": 0.4406779661016949, "grad_norm": 0.10615106672048569, "learning_rate": 0.0002490566037735849, "loss": 1.2121, "step": 26 }, { "epoch": 0.4406779661016949, "eval_loss": 1.2312597036361694, "eval_runtime": 20.2584, "eval_samples_per_second": 7.701, "eval_steps_per_second": 0.494, "step": 26 }, { "epoch": 0.4576271186440678, "grad_norm": 0.10923189669847488, "learning_rate": 0.00024150943396226415, "loss": 1.217, "step": 27 }, { "epoch": 0.4576271186440678, "eval_loss": 1.2297359704971313, "eval_runtime": 20.3031, "eval_samples_per_second": 7.684, "eval_steps_per_second": 0.493, "step": 27 }, { "epoch": 0.4745762711864407, "grad_norm": 0.0974133163690567, "learning_rate": 0.0002339622641509434, "loss": 1.1978, "step": 28 }, { "epoch": 0.4745762711864407, "eval_loss": 1.2286475896835327, "eval_runtime": 20.3065, "eval_samples_per_second": 7.682, "eval_steps_per_second": 0.492, "step": 28 }, { "epoch": 0.4915254237288136, "grad_norm": 0.10017339885234833, "learning_rate": 0.00022641509433962264, "loss": 1.1895, "step": 29 }, { "epoch": 0.4915254237288136, "eval_loss": 1.2275595664978027, "eval_runtime": 20.2412, "eval_samples_per_second": 7.707, "eval_steps_per_second": 0.494, "step": 29 }, { "epoch": 0.5084745762711864, "grad_norm": 0.09858958423137665, "learning_rate": 0.0002188679245283019, "loss": 1.1669, "step": 30 }, { "epoch": 0.5084745762711864, "eval_loss": 1.2265597581863403, "eval_runtime": 20.3183, "eval_samples_per_second": 7.678, "eval_steps_per_second": 0.492, "step": 30 }, { "epoch": 0.5254237288135594, "grad_norm": 0.10503731667995453, "learning_rate": 0.00021132075471698113, "loss": 1.2623, "step": 31 }, { "epoch": 0.5254237288135594, "eval_loss": 1.2255297899246216, "eval_runtime": 20.2895, "eval_samples_per_second": 7.689, "eval_steps_per_second": 0.493, "step": 31 }, { "epoch": 0.5423728813559322, "grad_norm": 0.09913704544305801, "learning_rate": 0.0002037735849056604, "loss": 1.1958, "step": 32 }, { "epoch": 0.5423728813559322, "eval_loss": 1.2245802879333496, "eval_runtime": 20.2724, "eval_samples_per_second": 7.695, "eval_steps_per_second": 0.493, "step": 32 }, { "epoch": 0.559322033898305, "grad_norm": 0.10186842828989029, "learning_rate": 0.00019622641509433963, "loss": 1.214, "step": 33 }, { "epoch": 0.559322033898305, "eval_loss": 1.2234857082366943, "eval_runtime": 20.2474, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 33 }, { "epoch": 0.576271186440678, "grad_norm": 0.10213778913021088, "learning_rate": 0.00018867924528301889, "loss": 1.221, "step": 34 }, { "epoch": 0.576271186440678, "eval_loss": 1.222362995147705, "eval_runtime": 20.335, "eval_samples_per_second": 7.672, "eval_steps_per_second": 0.492, "step": 34 }, { "epoch": 0.5932203389830508, "grad_norm": 0.10020826011896133, "learning_rate": 0.00018113207547169812, "loss": 1.2528, "step": 35 }, { "epoch": 0.5932203389830508, "eval_loss": 1.2214902639389038, "eval_runtime": 20.286, "eval_samples_per_second": 7.69, "eval_steps_per_second": 0.493, "step": 35 }, { "epoch": 0.6101694915254238, "grad_norm": 0.10454258322715759, "learning_rate": 0.00017358490566037738, "loss": 1.2263, "step": 36 }, { "epoch": 0.6101694915254238, "eval_loss": 1.2206357717514038, "eval_runtime": 20.2459, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 36 }, { "epoch": 0.6271186440677966, "grad_norm": 0.09862061589956284, "learning_rate": 0.0001660377358490566, "loss": 1.1575, "step": 37 }, { "epoch": 0.6271186440677966, "eval_loss": 1.2198610305786133, "eval_runtime": 20.2576, "eval_samples_per_second": 7.701, "eval_steps_per_second": 0.494, "step": 37 }, { "epoch": 0.6440677966101694, "grad_norm": 0.09934031218290329, "learning_rate": 0.00015849056603773587, "loss": 1.226, "step": 38 }, { "epoch": 0.6440677966101694, "eval_loss": 1.2193130254745483, "eval_runtime": 20.2923, "eval_samples_per_second": 7.688, "eval_steps_per_second": 0.493, "step": 38 }, { "epoch": 0.6610169491525424, "grad_norm": 0.09604943543672562, "learning_rate": 0.0001509433962264151, "loss": 1.1985, "step": 39 }, { "epoch": 0.6610169491525424, "eval_loss": 1.2188695669174194, "eval_runtime": 20.2462, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 39 }, { "epoch": 0.6779661016949152, "grad_norm": 0.10007863491773605, "learning_rate": 0.00014339622641509434, "loss": 1.2235, "step": 40 }, { "epoch": 0.6779661016949152, "eval_loss": 1.2182458639144897, "eval_runtime": 20.3015, "eval_samples_per_second": 7.684, "eval_steps_per_second": 0.493, "step": 40 }, { "epoch": 0.6949152542372882, "grad_norm": 0.10070552676916122, "learning_rate": 0.0001358490566037736, "loss": 1.1838, "step": 41 }, { "epoch": 0.6949152542372882, "eval_loss": 1.217565655708313, "eval_runtime": 20.3449, "eval_samples_per_second": 7.668, "eval_steps_per_second": 0.492, "step": 41 }, { "epoch": 0.711864406779661, "grad_norm": 0.10276733338832855, "learning_rate": 0.00012830188679245283, "loss": 1.2086, "step": 42 }, { "epoch": 0.711864406779661, "eval_loss": 1.2170318365097046, "eval_runtime": 20.2482, "eval_samples_per_second": 7.704, "eval_steps_per_second": 0.494, "step": 42 }, { "epoch": 0.7288135593220338, "grad_norm": 0.09609915316104889, "learning_rate": 0.00012075471698113207, "loss": 1.1769, "step": 43 }, { "epoch": 0.7288135593220338, "eval_loss": 1.2166293859481812, "eval_runtime": 20.2547, "eval_samples_per_second": 7.702, "eval_steps_per_second": 0.494, "step": 43 }, { "epoch": 0.7457627118644068, "grad_norm": 0.09979274868965149, "learning_rate": 0.00011320754716981132, "loss": 1.205, "step": 44 }, { "epoch": 0.7457627118644068, "eval_loss": 1.2162456512451172, "eval_runtime": 20.2557, "eval_samples_per_second": 7.702, "eval_steps_per_second": 0.494, "step": 44 }, { "epoch": 0.7627118644067796, "grad_norm": 0.10145589709281921, "learning_rate": 0.00010566037735849057, "loss": 1.2559, "step": 45 }, { "epoch": 0.7627118644067796, "eval_loss": 1.2157071828842163, "eval_runtime": 20.2591, "eval_samples_per_second": 7.7, "eval_steps_per_second": 0.494, "step": 45 }, { "epoch": 0.7796610169491526, "grad_norm": 0.10324625670909882, "learning_rate": 9.811320754716981e-05, "loss": 1.1895, "step": 46 }, { "epoch": 0.7796610169491526, "eval_loss": 1.2151949405670166, "eval_runtime": 20.2914, "eval_samples_per_second": 7.688, "eval_steps_per_second": 0.493, "step": 46 }, { "epoch": 0.7966101694915254, "grad_norm": 0.09847307205200195, "learning_rate": 9.056603773584906e-05, "loss": 1.1881, "step": 47 }, { "epoch": 0.7966101694915254, "eval_loss": 1.2146611213684082, "eval_runtime": 20.2529, "eval_samples_per_second": 7.703, "eval_steps_per_second": 0.494, "step": 47 }, { "epoch": 0.8135593220338984, "grad_norm": 0.09867937117815018, "learning_rate": 8.30188679245283e-05, "loss": 1.209, "step": 48 }, { "epoch": 0.8135593220338984, "eval_loss": 1.214185118675232, "eval_runtime": 20.359, "eval_samples_per_second": 7.662, "eval_steps_per_second": 0.491, "step": 48 }, { "epoch": 0.8305084745762712, "grad_norm": 0.09706506878137589, "learning_rate": 7.547169811320755e-05, "loss": 1.2062, "step": 49 }, { "epoch": 0.8305084745762712, "eval_loss": 1.2136766910552979, "eval_runtime": 20.276, "eval_samples_per_second": 7.694, "eval_steps_per_second": 0.493, "step": 49 }, { "epoch": 0.847457627118644, "grad_norm": 0.09803847968578339, "learning_rate": 6.79245283018868e-05, "loss": 1.2089, "step": 50 }, { "epoch": 0.847457627118644, "eval_loss": 1.2132208347320557, "eval_runtime": 20.1963, "eval_samples_per_second": 7.724, "eval_steps_per_second": 0.495, "step": 50 } ], "logging_steps": 1, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.71819915771904e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }