phi3-sciq / checkpoint-50 /trainer_state.json
mikasenghaas's picture
Upload folder using huggingface_hub
b993ccd verified
{
"best_metric": 1.2132208347320557,
"best_model_checkpoint": "./output/checkpoints/2024-06-10_15-37-32/checkpoint-50",
"epoch": 0.847457627118644,
"eval_steps": 1,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01694915254237288,
"grad_norm": 8.13034439086914,
"learning_rate": 6.666666666666667e-05,
"loss": 2.9658,
"step": 1
},
{
"epoch": 0.01694915254237288,
"eval_loss": 2.9201831817626953,
"eval_runtime": 20.1288,
"eval_samples_per_second": 7.75,
"eval_steps_per_second": 0.497,
"step": 1
},
{
"epoch": 0.03389830508474576,
"grad_norm": 7.588105201721191,
"learning_rate": 0.00013333333333333334,
"loss": 2.8251,
"step": 2
},
{
"epoch": 0.03389830508474576,
"eval_loss": 2.1139814853668213,
"eval_runtime": 20.166,
"eval_samples_per_second": 7.736,
"eval_steps_per_second": 0.496,
"step": 2
},
{
"epoch": 0.05084745762711865,
"grad_norm": 4.060599327087402,
"learning_rate": 0.0002,
"loss": 2.1092,
"step": 3
},
{
"epoch": 0.05084745762711865,
"eval_loss": 1.677042841911316,
"eval_runtime": 20.2239,
"eval_samples_per_second": 7.714,
"eval_steps_per_second": 0.494,
"step": 3
},
{
"epoch": 0.06779661016949153,
"grad_norm": 1.6822344064712524,
"learning_rate": 0.0002666666666666667,
"loss": 1.6407,
"step": 4
},
{
"epoch": 0.06779661016949153,
"eval_loss": 1.512647271156311,
"eval_runtime": 20.2526,
"eval_samples_per_second": 7.703,
"eval_steps_per_second": 0.494,
"step": 4
},
{
"epoch": 0.0847457627118644,
"grad_norm": 0.8077585697174072,
"learning_rate": 0.0003333333333333334,
"loss": 1.48,
"step": 5
},
{
"epoch": 0.0847457627118644,
"eval_loss": 1.4189282655715942,
"eval_runtime": 20.3082,
"eval_samples_per_second": 7.682,
"eval_steps_per_second": 0.492,
"step": 5
},
{
"epoch": 0.1016949152542373,
"grad_norm": 0.7878511548042297,
"learning_rate": 0.0004,
"loss": 1.4139,
"step": 6
},
{
"epoch": 0.1016949152542373,
"eval_loss": 1.3850855827331543,
"eval_runtime": 20.3109,
"eval_samples_per_second": 7.681,
"eval_steps_per_second": 0.492,
"step": 6
},
{
"epoch": 0.11864406779661017,
"grad_norm": 0.49660709500312805,
"learning_rate": 0.00039245283018867925,
"loss": 1.392,
"step": 7
},
{
"epoch": 0.11864406779661017,
"eval_loss": 1.425636887550354,
"eval_runtime": 20.1672,
"eval_samples_per_second": 7.735,
"eval_steps_per_second": 0.496,
"step": 7
},
{
"epoch": 0.13559322033898305,
"grad_norm": 1.5075709819793701,
"learning_rate": 0.00038490566037735854,
"loss": 1.3954,
"step": 8
},
{
"epoch": 0.13559322033898305,
"eval_loss": 1.347417950630188,
"eval_runtime": 20.2898,
"eval_samples_per_second": 7.689,
"eval_steps_per_second": 0.493,
"step": 8
},
{
"epoch": 0.15254237288135594,
"grad_norm": 0.517346203327179,
"learning_rate": 0.00037735849056603777,
"loss": 1.3231,
"step": 9
},
{
"epoch": 0.15254237288135594,
"eval_loss": 1.314124345779419,
"eval_runtime": 20.2626,
"eval_samples_per_second": 7.699,
"eval_steps_per_second": 0.494,
"step": 9
},
{
"epoch": 0.1694915254237288,
"grad_norm": 0.2546059191226959,
"learning_rate": 0.000369811320754717,
"loss": 1.2504,
"step": 10
},
{
"epoch": 0.1694915254237288,
"eval_loss": 1.2959833145141602,
"eval_runtime": 20.2614,
"eval_samples_per_second": 7.699,
"eval_steps_per_second": 0.494,
"step": 10
},
{
"epoch": 0.1864406779661017,
"grad_norm": 0.1385415643453598,
"learning_rate": 0.00036226415094339624,
"loss": 1.274,
"step": 11
},
{
"epoch": 0.1864406779661017,
"eval_loss": 1.2856695652008057,
"eval_runtime": 20.2637,
"eval_samples_per_second": 7.698,
"eval_steps_per_second": 0.493,
"step": 11
},
{
"epoch": 0.2033898305084746,
"grad_norm": 0.13630884885787964,
"learning_rate": 0.0003547169811320755,
"loss": 1.2657,
"step": 12
},
{
"epoch": 0.2033898305084746,
"eval_loss": 1.2764700651168823,
"eval_runtime": 20.2991,
"eval_samples_per_second": 7.685,
"eval_steps_per_second": 0.493,
"step": 12
},
{
"epoch": 0.22033898305084745,
"grad_norm": 0.12581680715084076,
"learning_rate": 0.00034716981132075476,
"loss": 1.2436,
"step": 13
},
{
"epoch": 0.22033898305084745,
"eval_loss": 1.269006609916687,
"eval_runtime": 20.2819,
"eval_samples_per_second": 7.692,
"eval_steps_per_second": 0.493,
"step": 13
},
{
"epoch": 0.23728813559322035,
"grad_norm": 0.12691529095172882,
"learning_rate": 0.000339622641509434,
"loss": 1.2478,
"step": 14
},
{
"epoch": 0.23728813559322035,
"eval_loss": 1.2635128498077393,
"eval_runtime": 20.2714,
"eval_samples_per_second": 7.696,
"eval_steps_per_second": 0.493,
"step": 14
},
{
"epoch": 0.2542372881355932,
"grad_norm": 0.12215649336576462,
"learning_rate": 0.0003320754716981132,
"loss": 1.2741,
"step": 15
},
{
"epoch": 0.2542372881355932,
"eval_loss": 1.2581740617752075,
"eval_runtime": 20.2259,
"eval_samples_per_second": 7.713,
"eval_steps_per_second": 0.494,
"step": 15
},
{
"epoch": 0.2711864406779661,
"grad_norm": 0.11825581640005112,
"learning_rate": 0.0003245283018867925,
"loss": 1.2508,
"step": 16
},
{
"epoch": 0.2711864406779661,
"eval_loss": 1.253954529762268,
"eval_runtime": 20.3786,
"eval_samples_per_second": 7.655,
"eval_steps_per_second": 0.491,
"step": 16
},
{
"epoch": 0.288135593220339,
"grad_norm": 0.11941225081682205,
"learning_rate": 0.00031698113207547174,
"loss": 1.2587,
"step": 17
},
{
"epoch": 0.288135593220339,
"eval_loss": 1.2502553462982178,
"eval_runtime": 20.2903,
"eval_samples_per_second": 7.688,
"eval_steps_per_second": 0.493,
"step": 17
},
{
"epoch": 0.3050847457627119,
"grad_norm": 0.11191528290510178,
"learning_rate": 0.000309433962264151,
"loss": 1.2692,
"step": 18
},
{
"epoch": 0.3050847457627119,
"eval_loss": 1.2474076747894287,
"eval_runtime": 20.3196,
"eval_samples_per_second": 7.677,
"eval_steps_per_second": 0.492,
"step": 18
},
{
"epoch": 0.3220338983050847,
"grad_norm": 0.11136776953935623,
"learning_rate": 0.0003018867924528302,
"loss": 1.2347,
"step": 19
},
{
"epoch": 0.3220338983050847,
"eval_loss": 1.2446962594985962,
"eval_runtime": 20.1872,
"eval_samples_per_second": 7.728,
"eval_steps_per_second": 0.495,
"step": 19
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.1218048483133316,
"learning_rate": 0.00029433962264150944,
"loss": 1.2619,
"step": 20
},
{
"epoch": 0.3389830508474576,
"eval_loss": 1.2424840927124023,
"eval_runtime": 20.354,
"eval_samples_per_second": 7.664,
"eval_steps_per_second": 0.491,
"step": 20
},
{
"epoch": 0.3559322033898305,
"grad_norm": 0.11501109600067139,
"learning_rate": 0.00028679245283018867,
"loss": 1.2759,
"step": 21
},
{
"epoch": 0.3559322033898305,
"eval_loss": 1.2401403188705444,
"eval_runtime": 20.2503,
"eval_samples_per_second": 7.704,
"eval_steps_per_second": 0.494,
"step": 21
},
{
"epoch": 0.3728813559322034,
"grad_norm": 0.11031804978847504,
"learning_rate": 0.0002792452830188679,
"loss": 1.2141,
"step": 22
},
{
"epoch": 0.3728813559322034,
"eval_loss": 1.2376986742019653,
"eval_runtime": 20.2534,
"eval_samples_per_second": 7.702,
"eval_steps_per_second": 0.494,
"step": 22
},
{
"epoch": 0.3898305084745763,
"grad_norm": 0.10441834479570389,
"learning_rate": 0.0002716981132075472,
"loss": 1.2333,
"step": 23
},
{
"epoch": 0.3898305084745763,
"eval_loss": 1.2357385158538818,
"eval_runtime": 20.2681,
"eval_samples_per_second": 7.697,
"eval_steps_per_second": 0.493,
"step": 23
},
{
"epoch": 0.4067796610169492,
"grad_norm": 0.10192928463220596,
"learning_rate": 0.0002641509433962264,
"loss": 1.2022,
"step": 24
},
{
"epoch": 0.4067796610169492,
"eval_loss": 1.23451828956604,
"eval_runtime": 20.2473,
"eval_samples_per_second": 7.705,
"eval_steps_per_second": 0.494,
"step": 24
},
{
"epoch": 0.423728813559322,
"grad_norm": 0.11488105356693268,
"learning_rate": 0.00025660377358490566,
"loss": 1.2704,
"step": 25
},
{
"epoch": 0.423728813559322,
"eval_loss": 1.2329152822494507,
"eval_runtime": 20.2073,
"eval_samples_per_second": 7.72,
"eval_steps_per_second": 0.495,
"step": 25
},
{
"epoch": 0.4406779661016949,
"grad_norm": 0.10615106672048569,
"learning_rate": 0.0002490566037735849,
"loss": 1.2121,
"step": 26
},
{
"epoch": 0.4406779661016949,
"eval_loss": 1.2312597036361694,
"eval_runtime": 20.2584,
"eval_samples_per_second": 7.701,
"eval_steps_per_second": 0.494,
"step": 26
},
{
"epoch": 0.4576271186440678,
"grad_norm": 0.10923189669847488,
"learning_rate": 0.00024150943396226415,
"loss": 1.217,
"step": 27
},
{
"epoch": 0.4576271186440678,
"eval_loss": 1.2297359704971313,
"eval_runtime": 20.3031,
"eval_samples_per_second": 7.684,
"eval_steps_per_second": 0.493,
"step": 27
},
{
"epoch": 0.4745762711864407,
"grad_norm": 0.0974133163690567,
"learning_rate": 0.0002339622641509434,
"loss": 1.1978,
"step": 28
},
{
"epoch": 0.4745762711864407,
"eval_loss": 1.2286475896835327,
"eval_runtime": 20.3065,
"eval_samples_per_second": 7.682,
"eval_steps_per_second": 0.492,
"step": 28
},
{
"epoch": 0.4915254237288136,
"grad_norm": 0.10017339885234833,
"learning_rate": 0.00022641509433962264,
"loss": 1.1895,
"step": 29
},
{
"epoch": 0.4915254237288136,
"eval_loss": 1.2275595664978027,
"eval_runtime": 20.2412,
"eval_samples_per_second": 7.707,
"eval_steps_per_second": 0.494,
"step": 29
},
{
"epoch": 0.5084745762711864,
"grad_norm": 0.09858958423137665,
"learning_rate": 0.0002188679245283019,
"loss": 1.1669,
"step": 30
},
{
"epoch": 0.5084745762711864,
"eval_loss": 1.2265597581863403,
"eval_runtime": 20.3183,
"eval_samples_per_second": 7.678,
"eval_steps_per_second": 0.492,
"step": 30
},
{
"epoch": 0.5254237288135594,
"grad_norm": 0.10503731667995453,
"learning_rate": 0.00021132075471698113,
"loss": 1.2623,
"step": 31
},
{
"epoch": 0.5254237288135594,
"eval_loss": 1.2255297899246216,
"eval_runtime": 20.2895,
"eval_samples_per_second": 7.689,
"eval_steps_per_second": 0.493,
"step": 31
},
{
"epoch": 0.5423728813559322,
"grad_norm": 0.09913704544305801,
"learning_rate": 0.0002037735849056604,
"loss": 1.1958,
"step": 32
},
{
"epoch": 0.5423728813559322,
"eval_loss": 1.2245802879333496,
"eval_runtime": 20.2724,
"eval_samples_per_second": 7.695,
"eval_steps_per_second": 0.493,
"step": 32
},
{
"epoch": 0.559322033898305,
"grad_norm": 0.10186842828989029,
"learning_rate": 0.00019622641509433963,
"loss": 1.214,
"step": 33
},
{
"epoch": 0.559322033898305,
"eval_loss": 1.2234857082366943,
"eval_runtime": 20.2474,
"eval_samples_per_second": 7.705,
"eval_steps_per_second": 0.494,
"step": 33
},
{
"epoch": 0.576271186440678,
"grad_norm": 0.10213778913021088,
"learning_rate": 0.00018867924528301889,
"loss": 1.221,
"step": 34
},
{
"epoch": 0.576271186440678,
"eval_loss": 1.222362995147705,
"eval_runtime": 20.335,
"eval_samples_per_second": 7.672,
"eval_steps_per_second": 0.492,
"step": 34
},
{
"epoch": 0.5932203389830508,
"grad_norm": 0.10020826011896133,
"learning_rate": 0.00018113207547169812,
"loss": 1.2528,
"step": 35
},
{
"epoch": 0.5932203389830508,
"eval_loss": 1.2214902639389038,
"eval_runtime": 20.286,
"eval_samples_per_second": 7.69,
"eval_steps_per_second": 0.493,
"step": 35
},
{
"epoch": 0.6101694915254238,
"grad_norm": 0.10454258322715759,
"learning_rate": 0.00017358490566037738,
"loss": 1.2263,
"step": 36
},
{
"epoch": 0.6101694915254238,
"eval_loss": 1.2206357717514038,
"eval_runtime": 20.2459,
"eval_samples_per_second": 7.705,
"eval_steps_per_second": 0.494,
"step": 36
},
{
"epoch": 0.6271186440677966,
"grad_norm": 0.09862061589956284,
"learning_rate": 0.0001660377358490566,
"loss": 1.1575,
"step": 37
},
{
"epoch": 0.6271186440677966,
"eval_loss": 1.2198610305786133,
"eval_runtime": 20.2576,
"eval_samples_per_second": 7.701,
"eval_steps_per_second": 0.494,
"step": 37
},
{
"epoch": 0.6440677966101694,
"grad_norm": 0.09934031218290329,
"learning_rate": 0.00015849056603773587,
"loss": 1.226,
"step": 38
},
{
"epoch": 0.6440677966101694,
"eval_loss": 1.2193130254745483,
"eval_runtime": 20.2923,
"eval_samples_per_second": 7.688,
"eval_steps_per_second": 0.493,
"step": 38
},
{
"epoch": 0.6610169491525424,
"grad_norm": 0.09604943543672562,
"learning_rate": 0.0001509433962264151,
"loss": 1.1985,
"step": 39
},
{
"epoch": 0.6610169491525424,
"eval_loss": 1.2188695669174194,
"eval_runtime": 20.2462,
"eval_samples_per_second": 7.705,
"eval_steps_per_second": 0.494,
"step": 39
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.10007863491773605,
"learning_rate": 0.00014339622641509434,
"loss": 1.2235,
"step": 40
},
{
"epoch": 0.6779661016949152,
"eval_loss": 1.2182458639144897,
"eval_runtime": 20.3015,
"eval_samples_per_second": 7.684,
"eval_steps_per_second": 0.493,
"step": 40
},
{
"epoch": 0.6949152542372882,
"grad_norm": 0.10070552676916122,
"learning_rate": 0.0001358490566037736,
"loss": 1.1838,
"step": 41
},
{
"epoch": 0.6949152542372882,
"eval_loss": 1.217565655708313,
"eval_runtime": 20.3449,
"eval_samples_per_second": 7.668,
"eval_steps_per_second": 0.492,
"step": 41
},
{
"epoch": 0.711864406779661,
"grad_norm": 0.10276733338832855,
"learning_rate": 0.00012830188679245283,
"loss": 1.2086,
"step": 42
},
{
"epoch": 0.711864406779661,
"eval_loss": 1.2170318365097046,
"eval_runtime": 20.2482,
"eval_samples_per_second": 7.704,
"eval_steps_per_second": 0.494,
"step": 42
},
{
"epoch": 0.7288135593220338,
"grad_norm": 0.09609915316104889,
"learning_rate": 0.00012075471698113207,
"loss": 1.1769,
"step": 43
},
{
"epoch": 0.7288135593220338,
"eval_loss": 1.2166293859481812,
"eval_runtime": 20.2547,
"eval_samples_per_second": 7.702,
"eval_steps_per_second": 0.494,
"step": 43
},
{
"epoch": 0.7457627118644068,
"grad_norm": 0.09979274868965149,
"learning_rate": 0.00011320754716981132,
"loss": 1.205,
"step": 44
},
{
"epoch": 0.7457627118644068,
"eval_loss": 1.2162456512451172,
"eval_runtime": 20.2557,
"eval_samples_per_second": 7.702,
"eval_steps_per_second": 0.494,
"step": 44
},
{
"epoch": 0.7627118644067796,
"grad_norm": 0.10145589709281921,
"learning_rate": 0.00010566037735849057,
"loss": 1.2559,
"step": 45
},
{
"epoch": 0.7627118644067796,
"eval_loss": 1.2157071828842163,
"eval_runtime": 20.2591,
"eval_samples_per_second": 7.7,
"eval_steps_per_second": 0.494,
"step": 45
},
{
"epoch": 0.7796610169491526,
"grad_norm": 0.10324625670909882,
"learning_rate": 9.811320754716981e-05,
"loss": 1.1895,
"step": 46
},
{
"epoch": 0.7796610169491526,
"eval_loss": 1.2151949405670166,
"eval_runtime": 20.2914,
"eval_samples_per_second": 7.688,
"eval_steps_per_second": 0.493,
"step": 46
},
{
"epoch": 0.7966101694915254,
"grad_norm": 0.09847307205200195,
"learning_rate": 9.056603773584906e-05,
"loss": 1.1881,
"step": 47
},
{
"epoch": 0.7966101694915254,
"eval_loss": 1.2146611213684082,
"eval_runtime": 20.2529,
"eval_samples_per_second": 7.703,
"eval_steps_per_second": 0.494,
"step": 47
},
{
"epoch": 0.8135593220338984,
"grad_norm": 0.09867937117815018,
"learning_rate": 8.30188679245283e-05,
"loss": 1.209,
"step": 48
},
{
"epoch": 0.8135593220338984,
"eval_loss": 1.214185118675232,
"eval_runtime": 20.359,
"eval_samples_per_second": 7.662,
"eval_steps_per_second": 0.491,
"step": 48
},
{
"epoch": 0.8305084745762712,
"grad_norm": 0.09706506878137589,
"learning_rate": 7.547169811320755e-05,
"loss": 1.2062,
"step": 49
},
{
"epoch": 0.8305084745762712,
"eval_loss": 1.2136766910552979,
"eval_runtime": 20.276,
"eval_samples_per_second": 7.694,
"eval_steps_per_second": 0.493,
"step": 49
},
{
"epoch": 0.847457627118644,
"grad_norm": 0.09803847968578339,
"learning_rate": 6.79245283018868e-05,
"loss": 1.2089,
"step": 50
},
{
"epoch": 0.847457627118644,
"eval_loss": 1.2132208347320557,
"eval_runtime": 20.1963,
"eval_samples_per_second": 7.724,
"eval_steps_per_second": 0.495,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 59,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.71819915771904e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}