{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 44.44444444444444,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.44,
      "learning_rate": 0,
      "loss": 1.7341,
      "step": 1
    },
    {
      "epoch": 0.89,
      "learning_rate": 0,
      "loss": 1.7223,
      "step": 2
    },
    {
      "epoch": 1.33,
      "learning_rate": 0,
      "loss": 1.7608,
      "step": 3
    },
    {
      "epoch": 1.78,
      "learning_rate": 0,
      "loss": 1.7115,
      "step": 4
    },
    {
      "epoch": 2.22,
      "learning_rate": 0,
      "loss": 1.7181,
      "step": 5
    },
    {
      "epoch": 2.67,
      "learning_rate": 0,
      "loss": 1.7022,
      "step": 6
    },
    {
      "epoch": 3.11,
      "learning_rate": 0,
      "loss": 1.7242,
      "step": 7
    },
    {
      "epoch": 3.56,
      "learning_rate": 0,
      "loss": 1.7352,
      "step": 8
    },
    {
      "epoch": 4.0,
      "learning_rate": 0,
      "loss": 1.7181,
      "step": 9
    },
    {
      "epoch": 4.44,
      "learning_rate": 0,
      "loss": 1.7213,
      "step": 10
    },
    {
      "epoch": 4.89,
      "learning_rate": 0,
      "loss": 1.6694,
      "step": 11
    },
    {
      "epoch": 5.33,
      "learning_rate": 0,
      "loss": 1.7046,
      "step": 12
    },
    {
      "epoch": 5.78,
      "learning_rate": 0,
      "loss": 1.7109,
      "step": 13
    },
    {
      "epoch": 6.22,
      "learning_rate": 0,
      "loss": 1.6948,
      "step": 14
    },
    {
      "epoch": 6.67,
      "learning_rate": 0,
      "loss": 1.6816,
      "step": 15
    },
    {
      "epoch": 7.11,
      "learning_rate": 0.0,
      "loss": 1.6851,
      "step": 16
    },
    {
      "epoch": 7.56,
      "learning_rate": 1.2618595071429148e-05,
      "loss": 1.6041,
      "step": 17
    },
    {
      "epoch": 8.0,
      "learning_rate": 2e-05,
      "loss": 1.5208,
      "step": 18
    },
    {
      "epoch": 8.44,
      "learning_rate": 2e-05,
      "loss": 1.4946,
      "step": 19
    },
    {
      "epoch": 8.89,
      "learning_rate": 2e-05,
      "loss": 1.492,
      "step": 20
    },
    {
      "epoch": 9.33,
      "learning_rate": 2e-05,
      "loss": 1.4501,
      "step": 21
    },
    {
      "epoch": 9.78,
      "learning_rate": 2e-05,
      "loss": 1.1894,
      "step": 22
    },
    {
      "epoch": 10.22,
      "learning_rate": 2e-05,
      "loss": 1.1437,
      "step": 23
    },
    {
      "epoch": 10.67,
      "learning_rate": 2e-05,
      "loss": 1.02,
      "step": 24
    },
    {
      "epoch": 11.11,
      "learning_rate": 2e-05,
      "loss": 0.926,
      "step": 25
    },
    {
      "epoch": 11.56,
      "learning_rate": 2e-05,
      "loss": 0.7794,
      "step": 26
    },
    {
      "epoch": 12.0,
      "learning_rate": 2e-05,
      "loss": 0.7719,
      "step": 27
    },
    {
      "epoch": 12.44,
      "learning_rate": 2e-05,
      "loss": 0.6107,
      "step": 28
    },
    {
      "epoch": 12.89,
      "learning_rate": 2e-05,
      "loss": 0.633,
      "step": 29
    },
    {
      "epoch": 13.33,
      "learning_rate": 2e-05,
      "loss": 0.4781,
      "step": 30
    },
    {
      "epoch": 13.78,
      "learning_rate": 2e-05,
      "loss": 0.4379,
      "step": 31
    },
    {
      "epoch": 14.22,
      "learning_rate": 2e-05,
      "loss": 0.3391,
      "step": 32
    },
    {
      "epoch": 14.67,
      "learning_rate": 2e-05,
      "loss": 0.2928,
      "step": 33
    },
    {
      "epoch": 15.11,
      "learning_rate": 2e-05,
      "loss": 0.2631,
      "step": 34
    },
    {
      "epoch": 15.56,
      "learning_rate": 2e-05,
      "loss": 0.2399,
      "step": 35
    },
    {
      "epoch": 16.0,
      "learning_rate": 2e-05,
      "loss": 0.2075,
      "step": 36
    },
    {
      "epoch": 16.44,
      "learning_rate": 2e-05,
      "loss": 0.186,
      "step": 37
    },
    {
      "epoch": 16.89,
      "learning_rate": 2e-05,
      "loss": 0.1782,
      "step": 38
    },
    {
      "epoch": 17.33,
      "learning_rate": 2e-05,
      "loss": 0.144,
      "step": 39
    },
    {
      "epoch": 17.78,
      "learning_rate": 2e-05,
      "loss": 0.1317,
      "step": 40
    },
    {
      "epoch": 18.22,
      "learning_rate": 2e-05,
      "loss": 0.1144,
      "step": 41
    },
    {
      "epoch": 18.67,
      "learning_rate": 2e-05,
      "loss": 0.1193,
      "step": 42
    },
    {
      "epoch": 19.11,
      "learning_rate": 2e-05,
      "loss": 0.1161,
      "step": 43
    },
    {
      "epoch": 19.56,
      "learning_rate": 2e-05,
      "loss": 0.0993,
      "step": 44
    },
    {
      "epoch": 20.0,
      "learning_rate": 2e-05,
      "loss": 0.1083,
      "step": 45
    },
    {
      "epoch": 20.44,
      "learning_rate": 2e-05,
      "loss": 0.101,
      "step": 46
    },
    {
      "epoch": 20.89,
      "learning_rate": 2e-05,
      "loss": 0.1013,
      "step": 47
    },
    {
      "epoch": 21.33,
      "learning_rate": 2e-05,
      "loss": 0.1066,
      "step": 48
    },
    {
      "epoch": 21.78,
      "learning_rate": 2e-05,
      "loss": 0.1005,
      "step": 49
    },
    {
      "epoch": 22.22,
      "learning_rate": 2e-05,
      "loss": 0.0882,
      "step": 50
    },
    {
      "epoch": 22.67,
      "learning_rate": 2e-05,
      "loss": 0.1067,
      "step": 51
    },
    {
      "epoch": 23.11,
      "learning_rate": 2e-05,
      "loss": 0.0797,
      "step": 52
    },
    {
      "epoch": 23.56,
      "learning_rate": 2e-05,
      "loss": 0.0943,
      "step": 53
    },
    {
      "epoch": 24.0,
      "learning_rate": 2e-05,
      "loss": 0.0769,
      "step": 54
    },
    {
      "epoch": 24.44,
      "learning_rate": 2e-05,
      "loss": 0.0855,
      "step": 55
    },
    {
      "epoch": 24.89,
      "learning_rate": 2e-05,
      "loss": 0.0735,
      "step": 56
    },
    {
      "epoch": 25.33,
      "learning_rate": 2e-05,
      "loss": 0.0833,
      "step": 57
    },
    {
      "epoch": 25.78,
      "learning_rate": 2e-05,
      "loss": 0.0811,
      "step": 58
    },
    {
      "epoch": 26.22,
      "learning_rate": 2e-05,
      "loss": 0.0772,
      "step": 59
    },
    {
      "epoch": 26.67,
      "learning_rate": 2e-05,
      "loss": 0.0721,
      "step": 60
    },
    {
      "epoch": 27.11,
      "learning_rate": 2e-05,
      "loss": 0.0825,
      "step": 61
    },
    {
      "epoch": 27.56,
      "learning_rate": 2e-05,
      "loss": 0.0758,
      "step": 62
    },
    {
      "epoch": 28.0,
      "learning_rate": 2e-05,
      "loss": 0.0725,
      "step": 63
    },
    {
      "epoch": 28.44,
      "learning_rate": 2e-05,
      "loss": 0.077,
      "step": 64
    },
    {
      "epoch": 28.89,
      "learning_rate": 2e-05,
      "loss": 0.0654,
      "step": 65
    },
    {
      "epoch": 29.33,
      "learning_rate": 2e-05,
      "loss": 0.0675,
      "step": 66
    },
    {
      "epoch": 29.78,
      "learning_rate": 2e-05,
      "loss": 0.0772,
      "step": 67
    },
    {
      "epoch": 30.22,
      "learning_rate": 2e-05,
      "loss": 0.0718,
      "step": 68
    },
    {
      "epoch": 30.67,
      "learning_rate": 2e-05,
      "loss": 0.0625,
      "step": 69
    },
    {
      "epoch": 31.11,
      "learning_rate": 2e-05,
      "loss": 0.0616,
      "step": 70
    },
    {
      "epoch": 31.56,
      "learning_rate": 2e-05,
      "loss": 0.071,
      "step": 71
    },
    {
      "epoch": 32.0,
      "learning_rate": 2e-05,
      "loss": 0.0655,
      "step": 72
    },
    {
      "epoch": 32.44,
      "learning_rate": 2e-05,
      "loss": 0.0591,
      "step": 73
    },
    {
      "epoch": 32.89,
      "learning_rate": 2e-05,
      "loss": 0.0669,
      "step": 74
    },
    {
      "epoch": 33.33,
      "learning_rate": 2e-05,
      "loss": 0.0653,
      "step": 75
    },
    {
      "epoch": 33.78,
      "learning_rate": 2e-05,
      "loss": 0.0662,
      "step": 76
    },
    {
      "epoch": 34.22,
      "learning_rate": 2e-05,
      "loss": 0.0688,
      "step": 77
    },
    {
      "epoch": 34.67,
      "learning_rate": 2e-05,
      "loss": 0.0498,
      "step": 78
    },
    {
      "epoch": 35.11,
      "learning_rate": 2e-05,
      "loss": 0.0576,
      "step": 79
    },
    {
      "epoch": 35.56,
      "learning_rate": 2e-05,
      "loss": 0.0737,
      "step": 80
    },
    {
      "epoch": 36.0,
      "learning_rate": 2e-05,
      "loss": 0.0609,
      "step": 81
    },
    {
      "epoch": 36.44,
      "learning_rate": 2e-05,
      "loss": 0.0594,
      "step": 82
    },
    {
      "epoch": 36.89,
      "learning_rate": 2e-05,
      "loss": 0.0725,
      "step": 83
    },
    {
      "epoch": 37.33,
      "learning_rate": 2e-05,
      "loss": 0.0598,
      "step": 84
    },
    {
      "epoch": 37.78,
      "learning_rate": 2e-05,
      "loss": 0.0652,
      "step": 85
    },
    {
      "epoch": 38.22,
      "learning_rate": 2e-05,
      "loss": 0.0588,
      "step": 86
    },
    {
      "epoch": 38.67,
      "learning_rate": 2e-05,
      "loss": 0.0671,
      "step": 87
    },
    {
      "epoch": 39.11,
      "learning_rate": 2e-05,
      "loss": 0.0596,
      "step": 88
    },
    {
      "epoch": 39.56,
      "learning_rate": 2e-05,
      "loss": 0.0518,
      "step": 89
    },
    {
      "epoch": 40.0,
      "learning_rate": 2e-05,
      "loss": 0.0612,
      "step": 90
    },
    {
      "epoch": 40.44,
      "learning_rate": 2e-05,
      "loss": 0.0593,
      "step": 91
    },
    {
      "epoch": 40.89,
      "learning_rate": 2e-05,
      "loss": 0.0521,
      "step": 92
    },
    {
      "epoch": 41.33,
      "learning_rate": 2e-05,
      "loss": 0.0536,
      "step": 93
    },
    {
      "epoch": 41.78,
      "learning_rate": 2e-05,
      "loss": 0.0548,
      "step": 94
    },
    {
      "epoch": 42.22,
      "learning_rate": 2e-05,
      "loss": 0.0507,
      "step": 95
    },
    {
      "epoch": 42.67,
      "learning_rate": 2e-05,
      "loss": 0.0588,
      "step": 96
    },
    {
      "epoch": 43.11,
      "learning_rate": 2e-05,
      "loss": 0.0506,
      "step": 97
    },
    {
      "epoch": 43.56,
      "learning_rate": 2e-05,
      "loss": 0.055,
      "step": 98
    },
    {
      "epoch": 44.0,
      "learning_rate": 2e-05,
      "loss": 0.0503,
      "step": 99
    },
    {
      "epoch": 44.44,
      "learning_rate": 2e-05,
      "loss": 0.054,
      "step": 100
    },
    {
      "epoch": 44.44,
      "step": 100,
      "total_flos": 7478779576320.0,
      "train_loss": 0.49326207719743254,
      "train_runtime": 9902.4306,
      "train_samples_per_second": 0.969,
      "train_steps_per_second": 0.01
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_train_epochs": 50,
  "save_steps": 200,
  "total_flos": 7478779576320.0,
  "trial_name": null,
  "trial_params": null
}