{ "best_metric": 1.2132208347320557, "best_model_checkpoint": "./output/checkpoints/2024-06-10_15-37-32/checkpoint-50", "epoch": 1.0, "eval_steps": 1, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01694915254237288, "grad_norm": 8.13034439086914, "learning_rate": 6.666666666666667e-05, "loss": 2.9658, "step": 1 }, { "epoch": 0.01694915254237288, "eval_loss": 2.9201831817626953, "eval_runtime": 20.1288, "eval_samples_per_second": 7.75, "eval_steps_per_second": 0.497, "step": 1 }, { "epoch": 0.03389830508474576, "grad_norm": 7.588105201721191, "learning_rate": 0.00013333333333333334, "loss": 2.8251, "step": 2 }, { "epoch": 0.03389830508474576, "eval_loss": 2.1139814853668213, "eval_runtime": 20.166, "eval_samples_per_second": 7.736, "eval_steps_per_second": 0.496, "step": 2 }, { "epoch": 0.05084745762711865, "grad_norm": 4.060599327087402, "learning_rate": 0.0002, "loss": 2.1092, "step": 3 }, { "epoch": 0.05084745762711865, "eval_loss": 1.677042841911316, "eval_runtime": 20.2239, "eval_samples_per_second": 7.714, "eval_steps_per_second": 0.494, "step": 3 }, { "epoch": 0.06779661016949153, "grad_norm": 1.6822344064712524, "learning_rate": 0.0002666666666666667, "loss": 1.6407, "step": 4 }, { "epoch": 0.06779661016949153, "eval_loss": 1.512647271156311, "eval_runtime": 20.2526, "eval_samples_per_second": 7.703, "eval_steps_per_second": 0.494, "step": 4 }, { "epoch": 0.0847457627118644, "grad_norm": 0.8077585697174072, "learning_rate": 0.0003333333333333334, "loss": 1.48, "step": 5 }, { "epoch": 0.0847457627118644, "eval_loss": 1.4189282655715942, "eval_runtime": 20.3082, "eval_samples_per_second": 7.682, "eval_steps_per_second": 0.492, "step": 5 }, { "epoch": 0.1016949152542373, "grad_norm": 0.7878511548042297, "learning_rate": 0.0004, "loss": 1.4139, "step": 6 }, { "epoch": 0.1016949152542373, "eval_loss": 1.3850855827331543, "eval_runtime": 20.3109, "eval_samples_per_second": 7.681, "eval_steps_per_second": 0.492, "step": 6 }, { "epoch": 0.11864406779661017, "grad_norm": 0.49660709500312805, "learning_rate": 0.00039245283018867925, "loss": 1.392, "step": 7 }, { "epoch": 0.11864406779661017, "eval_loss": 1.425636887550354, "eval_runtime": 20.1672, "eval_samples_per_second": 7.735, "eval_steps_per_second": 0.496, "step": 7 }, { "epoch": 0.13559322033898305, "grad_norm": 1.5075709819793701, "learning_rate": 0.00038490566037735854, "loss": 1.3954, "step": 8 }, { "epoch": 0.13559322033898305, "eval_loss": 1.347417950630188, "eval_runtime": 20.2898, "eval_samples_per_second": 7.689, "eval_steps_per_second": 0.493, "step": 8 }, { "epoch": 0.15254237288135594, "grad_norm": 0.517346203327179, "learning_rate": 0.00037735849056603777, "loss": 1.3231, "step": 9 }, { "epoch": 0.15254237288135594, "eval_loss": 1.314124345779419, "eval_runtime": 20.2626, "eval_samples_per_second": 7.699, "eval_steps_per_second": 0.494, "step": 9 }, { "epoch": 0.1694915254237288, "grad_norm": 0.2546059191226959, "learning_rate": 0.000369811320754717, "loss": 1.2504, "step": 10 }, { "epoch": 0.1694915254237288, "eval_loss": 1.2959833145141602, "eval_runtime": 20.2614, "eval_samples_per_second": 7.699, "eval_steps_per_second": 0.494, "step": 10 }, { "epoch": 0.1864406779661017, "grad_norm": 0.1385415643453598, "learning_rate": 0.00036226415094339624, "loss": 1.274, "step": 11 }, { "epoch": 0.1864406779661017, "eval_loss": 1.2856695652008057, "eval_runtime": 20.2637, "eval_samples_per_second": 7.698, "eval_steps_per_second": 0.493, "step": 11 }, { "epoch": 0.2033898305084746, "grad_norm": 0.13630884885787964, "learning_rate": 0.0003547169811320755, "loss": 1.2657, "step": 12 }, { "epoch": 0.2033898305084746, "eval_loss": 1.2764700651168823, "eval_runtime": 20.2991, "eval_samples_per_second": 7.685, "eval_steps_per_second": 0.493, "step": 12 }, { "epoch": 0.22033898305084745, "grad_norm": 0.12581680715084076, "learning_rate": 0.00034716981132075476, "loss": 1.2436, "step": 13 }, { "epoch": 0.22033898305084745, "eval_loss": 1.269006609916687, "eval_runtime": 20.2819, "eval_samples_per_second": 7.692, "eval_steps_per_second": 0.493, "step": 13 }, { "epoch": 0.23728813559322035, "grad_norm": 0.12691529095172882, "learning_rate": 0.000339622641509434, "loss": 1.2478, "step": 14 }, { "epoch": 0.23728813559322035, "eval_loss": 1.2635128498077393, "eval_runtime": 20.2714, "eval_samples_per_second": 7.696, "eval_steps_per_second": 0.493, "step": 14 }, { "epoch": 0.2542372881355932, "grad_norm": 0.12215649336576462, "learning_rate": 0.0003320754716981132, "loss": 1.2741, "step": 15 }, { "epoch": 0.2542372881355932, "eval_loss": 1.2581740617752075, "eval_runtime": 20.2259, "eval_samples_per_second": 7.713, "eval_steps_per_second": 0.494, "step": 15 }, { "epoch": 0.2711864406779661, "grad_norm": 0.11825581640005112, "learning_rate": 0.0003245283018867925, "loss": 1.2508, "step": 16 }, { "epoch": 0.2711864406779661, "eval_loss": 1.253954529762268, "eval_runtime": 20.3786, "eval_samples_per_second": 7.655, "eval_steps_per_second": 0.491, "step": 16 }, { "epoch": 0.288135593220339, "grad_norm": 0.11941225081682205, "learning_rate": 0.00031698113207547174, "loss": 1.2587, "step": 17 }, { "epoch": 0.288135593220339, "eval_loss": 1.2502553462982178, "eval_runtime": 20.2903, "eval_samples_per_second": 7.688, "eval_steps_per_second": 0.493, "step": 17 }, { "epoch": 0.3050847457627119, "grad_norm": 0.11191528290510178, "learning_rate": 0.000309433962264151, "loss": 1.2692, "step": 18 }, { "epoch": 0.3050847457627119, "eval_loss": 1.2474076747894287, "eval_runtime": 20.3196, "eval_samples_per_second": 7.677, "eval_steps_per_second": 0.492, "step": 18 }, { "epoch": 0.3220338983050847, "grad_norm": 0.11136776953935623, "learning_rate": 0.0003018867924528302, "loss": 1.2347, "step": 19 }, { "epoch": 0.3220338983050847, "eval_loss": 1.2446962594985962, "eval_runtime": 20.1872, "eval_samples_per_second": 7.728, "eval_steps_per_second": 0.495, "step": 19 }, { "epoch": 0.3389830508474576, "grad_norm": 0.1218048483133316, "learning_rate": 0.00029433962264150944, "loss": 1.2619, "step": 20 }, { "epoch": 0.3389830508474576, "eval_loss": 1.2424840927124023, "eval_runtime": 20.354, "eval_samples_per_second": 7.664, "eval_steps_per_second": 0.491, "step": 20 }, { "epoch": 0.3559322033898305, "grad_norm": 0.11501109600067139, "learning_rate": 0.00028679245283018867, "loss": 1.2759, "step": 21 }, { "epoch": 0.3559322033898305, "eval_loss": 1.2401403188705444, "eval_runtime": 20.2503, "eval_samples_per_second": 7.704, "eval_steps_per_second": 0.494, "step": 21 }, { "epoch": 0.3728813559322034, "grad_norm": 0.11031804978847504, "learning_rate": 0.0002792452830188679, "loss": 1.2141, "step": 22 }, { "epoch": 0.3728813559322034, "eval_loss": 1.2376986742019653, "eval_runtime": 20.2534, "eval_samples_per_second": 7.702, "eval_steps_per_second": 0.494, "step": 22 }, { "epoch": 0.3898305084745763, "grad_norm": 0.10441834479570389, "learning_rate": 0.0002716981132075472, "loss": 1.2333, "step": 23 }, { "epoch": 0.3898305084745763, "eval_loss": 1.2357385158538818, "eval_runtime": 20.2681, "eval_samples_per_second": 7.697, "eval_steps_per_second": 0.493, "step": 23 }, { "epoch": 0.4067796610169492, "grad_norm": 0.10192928463220596, "learning_rate": 0.0002641509433962264, "loss": 1.2022, "step": 24 }, { "epoch": 0.4067796610169492, "eval_loss": 1.23451828956604, "eval_runtime": 20.2473, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 24 }, { "epoch": 0.423728813559322, "grad_norm": 0.11488105356693268, "learning_rate": 0.00025660377358490566, "loss": 1.2704, "step": 25 }, { "epoch": 0.423728813559322, "eval_loss": 1.2329152822494507, "eval_runtime": 20.2073, "eval_samples_per_second": 7.72, "eval_steps_per_second": 0.495, "step": 25 }, { "epoch": 0.4406779661016949, "grad_norm": 0.10615106672048569, "learning_rate": 0.0002490566037735849, "loss": 1.2121, "step": 26 }, { "epoch": 0.4406779661016949, "eval_loss": 1.2312597036361694, "eval_runtime": 20.2584, "eval_samples_per_second": 7.701, "eval_steps_per_second": 0.494, "step": 26 }, { "epoch": 0.4576271186440678, "grad_norm": 0.10923189669847488, "learning_rate": 0.00024150943396226415, "loss": 1.217, "step": 27 }, { "epoch": 0.4576271186440678, "eval_loss": 1.2297359704971313, "eval_runtime": 20.3031, "eval_samples_per_second": 7.684, "eval_steps_per_second": 0.493, "step": 27 }, { "epoch": 0.4745762711864407, "grad_norm": 0.0974133163690567, "learning_rate": 0.0002339622641509434, "loss": 1.1978, "step": 28 }, { "epoch": 0.4745762711864407, "eval_loss": 1.2286475896835327, "eval_runtime": 20.3065, "eval_samples_per_second": 7.682, "eval_steps_per_second": 0.492, "step": 28 }, { "epoch": 0.4915254237288136, "grad_norm": 0.10017339885234833, "learning_rate": 0.00022641509433962264, "loss": 1.1895, "step": 29 }, { "epoch": 0.4915254237288136, "eval_loss": 1.2275595664978027, "eval_runtime": 20.2412, "eval_samples_per_second": 7.707, "eval_steps_per_second": 0.494, "step": 29 }, { "epoch": 0.5084745762711864, "grad_norm": 0.09858958423137665, "learning_rate": 0.0002188679245283019, "loss": 1.1669, "step": 30 }, { "epoch": 0.5084745762711864, "eval_loss": 1.2265597581863403, "eval_runtime": 20.3183, "eval_samples_per_second": 7.678, "eval_steps_per_second": 0.492, "step": 30 }, { "epoch": 0.5254237288135594, "grad_norm": 0.10503731667995453, "learning_rate": 0.00021132075471698113, "loss": 1.2623, "step": 31 }, { "epoch": 0.5254237288135594, "eval_loss": 1.2255297899246216, "eval_runtime": 20.2895, "eval_samples_per_second": 7.689, "eval_steps_per_second": 0.493, "step": 31 }, { "epoch": 0.5423728813559322, "grad_norm": 0.09913704544305801, "learning_rate": 0.0002037735849056604, "loss": 1.1958, "step": 32 }, { "epoch": 0.5423728813559322, "eval_loss": 1.2245802879333496, "eval_runtime": 20.2724, "eval_samples_per_second": 7.695, "eval_steps_per_second": 0.493, "step": 32 }, { "epoch": 0.559322033898305, "grad_norm": 0.10186842828989029, "learning_rate": 0.00019622641509433963, "loss": 1.214, "step": 33 }, { "epoch": 0.559322033898305, "eval_loss": 1.2234857082366943, "eval_runtime": 20.2474, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 33 }, { "epoch": 0.576271186440678, "grad_norm": 0.10213778913021088, "learning_rate": 0.00018867924528301889, "loss": 1.221, "step": 34 }, { "epoch": 0.576271186440678, "eval_loss": 1.222362995147705, "eval_runtime": 20.335, "eval_samples_per_second": 7.672, "eval_steps_per_second": 0.492, "step": 34 }, { "epoch": 0.5932203389830508, "grad_norm": 0.10020826011896133, "learning_rate": 0.00018113207547169812, "loss": 1.2528, "step": 35 }, { "epoch": 0.5932203389830508, "eval_loss": 1.2214902639389038, "eval_runtime": 20.286, "eval_samples_per_second": 7.69, "eval_steps_per_second": 0.493, "step": 35 }, { "epoch": 0.6101694915254238, "grad_norm": 0.10454258322715759, "learning_rate": 0.00017358490566037738, "loss": 1.2263, "step": 36 }, { "epoch": 0.6101694915254238, "eval_loss": 1.2206357717514038, "eval_runtime": 20.2459, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 36 }, { "epoch": 0.6271186440677966, "grad_norm": 0.09862061589956284, "learning_rate": 0.0001660377358490566, "loss": 1.1575, "step": 37 }, { "epoch": 0.6271186440677966, "eval_loss": 1.2198610305786133, "eval_runtime": 20.2576, "eval_samples_per_second": 7.701, "eval_steps_per_second": 0.494, "step": 37 }, { "epoch": 0.6440677966101694, "grad_norm": 0.09934031218290329, "learning_rate": 0.00015849056603773587, "loss": 1.226, "step": 38 }, { "epoch": 0.6440677966101694, "eval_loss": 1.2193130254745483, "eval_runtime": 20.2923, "eval_samples_per_second": 7.688, "eval_steps_per_second": 0.493, "step": 38 }, { "epoch": 0.6610169491525424, "grad_norm": 0.09604943543672562, "learning_rate": 0.0001509433962264151, "loss": 1.1985, "step": 39 }, { "epoch": 0.6610169491525424, "eval_loss": 1.2188695669174194, "eval_runtime": 20.2462, "eval_samples_per_second": 7.705, "eval_steps_per_second": 0.494, "step": 39 }, { "epoch": 0.6779661016949152, "grad_norm": 0.10007863491773605, "learning_rate": 0.00014339622641509434, "loss": 1.2235, "step": 40 }, { "epoch": 0.6779661016949152, "eval_loss": 1.2182458639144897, "eval_runtime": 20.3015, "eval_samples_per_second": 7.684, "eval_steps_per_second": 0.493, "step": 40 }, { "epoch": 0.6949152542372882, "grad_norm": 0.10070552676916122, "learning_rate": 0.0001358490566037736, "loss": 1.1838, "step": 41 }, { "epoch": 0.6949152542372882, "eval_loss": 1.217565655708313, "eval_runtime": 20.3449, "eval_samples_per_second": 7.668, "eval_steps_per_second": 0.492, "step": 41 }, { "epoch": 0.711864406779661, "grad_norm": 0.10276733338832855, "learning_rate": 0.00012830188679245283, "loss": 1.2086, "step": 42 }, { "epoch": 0.711864406779661, "eval_loss": 1.2170318365097046, "eval_runtime": 20.2482, "eval_samples_per_second": 7.704, "eval_steps_per_second": 0.494, "step": 42 }, { "epoch": 0.7288135593220338, "grad_norm": 0.09609915316104889, "learning_rate": 0.00012075471698113207, "loss": 1.1769, "step": 43 }, { "epoch": 0.7288135593220338, "eval_loss": 1.2166293859481812, "eval_runtime": 20.2547, "eval_samples_per_second": 7.702, "eval_steps_per_second": 0.494, "step": 43 }, { "epoch": 0.7457627118644068, "grad_norm": 0.09979274868965149, "learning_rate": 0.00011320754716981132, "loss": 1.205, "step": 44 }, { "epoch": 0.7457627118644068, "eval_loss": 1.2162456512451172, "eval_runtime": 20.2557, "eval_samples_per_second": 7.702, "eval_steps_per_second": 0.494, "step": 44 }, { "epoch": 0.7627118644067796, "grad_norm": 0.10145589709281921, "learning_rate": 0.00010566037735849057, "loss": 1.2559, "step": 45 }, { "epoch": 0.7627118644067796, "eval_loss": 1.2157071828842163, "eval_runtime": 20.2591, "eval_samples_per_second": 7.7, "eval_steps_per_second": 0.494, "step": 45 }, { "epoch": 0.7796610169491526, "grad_norm": 0.10324625670909882, "learning_rate": 9.811320754716981e-05, "loss": 1.1895, "step": 46 }, { "epoch": 0.7796610169491526, "eval_loss": 1.2151949405670166, "eval_runtime": 20.2914, "eval_samples_per_second": 7.688, "eval_steps_per_second": 0.493, "step": 46 }, { "epoch": 0.7966101694915254, "grad_norm": 0.09847307205200195, "learning_rate": 9.056603773584906e-05, "loss": 1.1881, "step": 47 }, { "epoch": 0.7966101694915254, "eval_loss": 1.2146611213684082, "eval_runtime": 20.2529, "eval_samples_per_second": 7.703, "eval_steps_per_second": 0.494, "step": 47 }, { "epoch": 0.8135593220338984, "grad_norm": 0.09867937117815018, "learning_rate": 8.30188679245283e-05, "loss": 1.209, "step": 48 }, { "epoch": 0.8135593220338984, "eval_loss": 1.214185118675232, "eval_runtime": 20.359, "eval_samples_per_second": 7.662, "eval_steps_per_second": 0.491, "step": 48 }, { "epoch": 0.8305084745762712, "grad_norm": 0.09706506878137589, "learning_rate": 7.547169811320755e-05, "loss": 1.2062, "step": 49 }, { "epoch": 0.8305084745762712, "eval_loss": 1.2136766910552979, "eval_runtime": 20.276, "eval_samples_per_second": 7.694, "eval_steps_per_second": 0.493, "step": 49 }, { "epoch": 0.847457627118644, "grad_norm": 0.09803847968578339, "learning_rate": 6.79245283018868e-05, "loss": 1.2089, "step": 50 }, { "epoch": 0.847457627118644, "eval_loss": 1.2132208347320557, "eval_runtime": 20.1963, "eval_samples_per_second": 7.724, "eval_steps_per_second": 0.495, "step": 50 }, { "epoch": 0.864406779661017, "grad_norm": 0.09517968446016312, "learning_rate": 6.037735849056604e-05, "loss": 1.2095, "step": 51 }, { "epoch": 0.864406779661017, "eval_loss": 1.2128576040267944, "eval_runtime": 20.2645, "eval_samples_per_second": 7.698, "eval_steps_per_second": 0.493, "step": 51 }, { "epoch": 0.8813559322033898, "grad_norm": 0.09574303776025772, "learning_rate": 5.283018867924528e-05, "loss": 1.1798, "step": 52 }, { "epoch": 0.8813559322033898, "eval_loss": 1.212473750114441, "eval_runtime": 20.2258, "eval_samples_per_second": 7.713, "eval_steps_per_second": 0.494, "step": 52 }, { "epoch": 0.8983050847457628, "grad_norm": 0.09871205687522888, "learning_rate": 4.528301886792453e-05, "loss": 1.1746, "step": 53 }, { "epoch": 0.8983050847457628, "eval_loss": 1.2121562957763672, "eval_runtime": 20.2541, "eval_samples_per_second": 7.702, "eval_steps_per_second": 0.494, "step": 53 }, { "epoch": 0.9152542372881356, "grad_norm": 0.09647679328918457, "learning_rate": 3.7735849056603776e-05, "loss": 1.2166, "step": 54 }, { "epoch": 0.9152542372881356, "eval_loss": 1.2118746042251587, "eval_runtime": 20.2835, "eval_samples_per_second": 7.691, "eval_steps_per_second": 0.493, "step": 54 }, { "epoch": 0.9322033898305084, "grad_norm": 0.09742645174264908, "learning_rate": 3.018867924528302e-05, "loss": 1.169, "step": 55 }, { "epoch": 0.9322033898305084, "eval_loss": 1.2116155624389648, "eval_runtime": 20.3096, "eval_samples_per_second": 7.681, "eval_steps_per_second": 0.492, "step": 55 }, { "epoch": 0.9491525423728814, "grad_norm": 0.09928184747695923, "learning_rate": 2.2641509433962265e-05, "loss": 1.1945, "step": 56 }, { "epoch": 0.9491525423728814, "eval_loss": 1.2114187479019165, "eval_runtime": 20.2297, "eval_samples_per_second": 7.711, "eval_steps_per_second": 0.494, "step": 56 }, { "epoch": 0.9661016949152542, "grad_norm": 0.10244324803352356, "learning_rate": 1.509433962264151e-05, "loss": 1.1877, "step": 57 }, { "epoch": 0.9661016949152542, "eval_loss": 1.2112743854522705, "eval_runtime": 20.345, "eval_samples_per_second": 7.668, "eval_steps_per_second": 0.492, "step": 57 }, { "epoch": 0.9830508474576272, "grad_norm": 0.10198235511779785, "learning_rate": 7.547169811320755e-06, "loss": 1.2429, "step": 58 }, { "epoch": 0.9830508474576272, "eval_loss": 1.211168885231018, "eval_runtime": 20.2851, "eval_samples_per_second": 7.69, "eval_steps_per_second": 0.493, "step": 58 }, { "epoch": 1.0, "grad_norm": 0.6312069296836853, "learning_rate": 0.0, "loss": 1.0437, "step": 59 }, { "epoch": 1.0, "eval_loss": 1.2111324071884155, "eval_runtime": 20.2646, "eval_samples_per_second": 7.698, "eval_steps_per_second": 0.493, "step": 59 }, { "epoch": 1.0, "step": 59, "total_flos": 4.315434897427661e+16, "train_loss": 1.3115796759977179, "train_runtime": 2023.6846, "train_samples_per_second": 0.918, "train_steps_per_second": 0.029 } ], "logging_steps": 1, "max_steps": 59, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.315434897427661e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }