{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100, "global_step": 2345, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.21321961620469082, "grad_norm": 1.6159120798110962, "learning_rate": 0.0007658848614072496, "loss": 0.026, "step": 100 }, { "epoch": 0.21321961620469082, "eval_accuracy": 0.9867, "eval_loss": 0.04582102224230766, "eval_runtime": 5.154, "eval_samples_per_second": 1940.235, "eval_steps_per_second": 15.328, "step": 100 }, { "epoch": 0.42643923240938164, "grad_norm": 0.22664281725883484, "learning_rate": 0.000731769722814499, "loss": 0.0232, "step": 200 }, { "epoch": 0.42643923240938164, "eval_accuracy": 0.9867, "eval_loss": 0.04527696222066879, "eval_runtime": 5.4041, "eval_samples_per_second": 1850.446, "eval_steps_per_second": 14.619, "step": 200 }, { "epoch": 0.6396588486140725, "grad_norm": 1.3475489616394043, "learning_rate": 0.0006976545842217485, "loss": 0.0277, "step": 300 }, { "epoch": 0.6396588486140725, "eval_accuracy": 0.9863, "eval_loss": 0.04839787632226944, "eval_runtime": 5.3353, "eval_samples_per_second": 1874.323, "eval_steps_per_second": 14.807, "step": 300 }, { "epoch": 0.8528784648187633, "grad_norm": 0.9408140182495117, "learning_rate": 0.0006635394456289979, "loss": 0.0293, "step": 400 }, { "epoch": 0.8528784648187633, "eval_accuracy": 0.9865, "eval_loss": 0.046898942440748215, "eval_runtime": 5.1935, "eval_samples_per_second": 1925.498, "eval_steps_per_second": 15.211, "step": 400 }, { "epoch": 1.0660980810234542, "grad_norm": 0.6018700003623962, "learning_rate": 0.0006294243070362473, "loss": 0.0235, "step": 500 }, { "epoch": 1.0660980810234542, "eval_accuracy": 0.9899, "eval_loss": 0.028819510713219643, "eval_runtime": 5.1407, "eval_samples_per_second": 1945.277, "eval_steps_per_second": 15.368, "step": 500 }, { "epoch": 1.279317697228145, "grad_norm": 0.7029837369918823, "learning_rate": 0.0005953091684434968, "loss": 0.0203, "step": 600 }, { "epoch": 1.279317697228145, "eval_accuracy": 0.9924, "eval_loss": 0.02526690438389778, "eval_runtime": 4.869, "eval_samples_per_second": 2053.81, "eval_steps_per_second": 16.225, "step": 600 }, { "epoch": 1.4925373134328357, "grad_norm": 0.17393378913402557, "learning_rate": 0.0005611940298507463, "loss": 0.0182, "step": 700 }, { "epoch": 1.4925373134328357, "eval_accuracy": 0.9916, "eval_loss": 0.028603948652744293, "eval_runtime": 4.6847, "eval_samples_per_second": 2134.586, "eval_steps_per_second": 16.863, "step": 700 }, { "epoch": 1.7057569296375266, "grad_norm": 0.9470173120498657, "learning_rate": 0.0005270788912579957, "loss": 0.0205, "step": 800 }, { "epoch": 1.7057569296375266, "eval_accuracy": 0.9935, "eval_loss": 0.02031560428440571, "eval_runtime": 5.0835, "eval_samples_per_second": 1967.155, "eval_steps_per_second": 15.541, "step": 800 }, { "epoch": 1.9189765458422174, "grad_norm": 1.4821853637695312, "learning_rate": 0.0004929637526652453, "loss": 0.0162, "step": 900 }, { "epoch": 1.9189765458422174, "eval_accuracy": 0.9913, "eval_loss": 0.023791342973709106, "eval_runtime": 5.2196, "eval_samples_per_second": 1915.863, "eval_steps_per_second": 15.135, "step": 900 }, { "epoch": 2.1321961620469083, "grad_norm": 1.0076653957366943, "learning_rate": 0.0004588486140724947, "loss": 0.0118, "step": 1000 }, { "epoch": 2.1321961620469083, "eval_accuracy": 0.9916, "eval_loss": 0.024731909856200218, "eval_runtime": 5.8209, "eval_samples_per_second": 1717.944, "eval_steps_per_second": 13.572, "step": 1000 }, { "epoch": 2.345415778251599, "grad_norm": 0.39255017042160034, "learning_rate": 0.0004247334754797441, "loss": 0.0121, "step": 1100 }, { "epoch": 2.345415778251599, "eval_accuracy": 0.9932, "eval_loss": 0.019426949322223663, "eval_runtime": 5.5038, "eval_samples_per_second": 1816.931, "eval_steps_per_second": 14.354, "step": 1100 }, { "epoch": 2.55863539445629, "grad_norm": 0.4671665132045746, "learning_rate": 0.0003906183368869936, "loss": 0.0154, "step": 1200 }, { "epoch": 2.55863539445629, "eval_accuracy": 0.9933, "eval_loss": 0.01936325989663601, "eval_runtime": 5.4304, "eval_samples_per_second": 1841.5, "eval_steps_per_second": 14.548, "step": 1200 }, { "epoch": 2.771855010660981, "grad_norm": 0.17253048717975616, "learning_rate": 0.0003565031982942431, "loss": 0.015, "step": 1300 }, { "epoch": 2.771855010660981, "eval_accuracy": 0.9933, "eval_loss": 0.02162059210240841, "eval_runtime": 5.3729, "eval_samples_per_second": 1861.177, "eval_steps_per_second": 14.703, "step": 1300 }, { "epoch": 2.9850746268656714, "grad_norm": 0.8343185782432556, "learning_rate": 0.00032238805970149256, "loss": 0.0145, "step": 1400 }, { "epoch": 2.9850746268656714, "eval_accuracy": 0.9919, "eval_loss": 0.02381654642522335, "eval_runtime": 5.1099, "eval_samples_per_second": 1957.0, "eval_steps_per_second": 15.46, "step": 1400 }, { "epoch": 3.1982942430703627, "grad_norm": 0.3106481432914734, "learning_rate": 0.000288272921108742, "loss": 0.0098, "step": 1500 }, { "epoch": 3.1982942430703627, "eval_accuracy": 0.993, "eval_loss": 0.020756520330905914, "eval_runtime": 4.9767, "eval_samples_per_second": 2009.353, "eval_steps_per_second": 15.874, "step": 1500 }, { "epoch": 3.411513859275053, "grad_norm": 0.03616774454712868, "learning_rate": 0.0002541577825159915, "loss": 0.0093, "step": 1600 }, { "epoch": 3.411513859275053, "eval_accuracy": 0.9929, "eval_loss": 0.021822581067681313, "eval_runtime": 5.4965, "eval_samples_per_second": 1819.356, "eval_steps_per_second": 14.373, "step": 1600 }, { "epoch": 3.624733475479744, "grad_norm": 0.6045345067977905, "learning_rate": 0.00022004264392324095, "loss": 0.0073, "step": 1700 }, { "epoch": 3.624733475479744, "eval_accuracy": 0.9933, "eval_loss": 0.018862707540392876, "eval_runtime": 5.7766, "eval_samples_per_second": 1731.12, "eval_steps_per_second": 13.676, "step": 1700 }, { "epoch": 3.837953091684435, "grad_norm": 0.631215512752533, "learning_rate": 0.0001859275053304904, "loss": 0.008, "step": 1800 }, { "epoch": 3.837953091684435, "eval_accuracy": 0.9932, "eval_loss": 0.01944512128829956, "eval_runtime": 5.1522, "eval_samples_per_second": 1940.922, "eval_steps_per_second": 15.333, "step": 1800 }, { "epoch": 4.051172707889126, "grad_norm": 0.5996153950691223, "learning_rate": 0.0001518123667377399, "loss": 0.006, "step": 1900 }, { "epoch": 4.051172707889126, "eval_accuracy": 0.9938, "eval_loss": 0.018317226320505142, "eval_runtime": 4.9544, "eval_samples_per_second": 2018.428, "eval_steps_per_second": 15.946, "step": 1900 }, { "epoch": 4.264392324093817, "grad_norm": 0.0069321137852966785, "learning_rate": 0.00011769722814498933, "loss": 0.0063, "step": 2000 }, { "epoch": 4.264392324093817, "eval_accuracy": 0.9934, "eval_loss": 0.018384862691164017, "eval_runtime": 4.7636, "eval_samples_per_second": 2099.232, "eval_steps_per_second": 16.584, "step": 2000 }, { "epoch": 4.477611940298507, "grad_norm": 0.06509185582399368, "learning_rate": 8.392324093816631e-05, "loss": 0.0043, "step": 2100 }, { "epoch": 4.477611940298507, "eval_accuracy": 0.9932, "eval_loss": 0.018380142748355865, "eval_runtime": 4.6951, "eval_samples_per_second": 2129.883, "eval_steps_per_second": 16.826, "step": 2100 }, { "epoch": 4.690831556503198, "grad_norm": 0.17103737592697144, "learning_rate": 4.980810234541578e-05, "loss": 0.0035, "step": 2200 }, { "epoch": 4.690831556503198, "eval_accuracy": 0.9931, "eval_loss": 0.018344268202781677, "eval_runtime": 4.6229, "eval_samples_per_second": 2163.133, "eval_steps_per_second": 17.089, "step": 2200 }, { "epoch": 4.904051172707889, "grad_norm": 0.6465263962745667, "learning_rate": 1.5692963752665246e-05, "loss": 0.0061, "step": 2300 }, { "epoch": 4.904051172707889, "eval_accuracy": 0.9931, "eval_loss": 0.018412619829177856, "eval_runtime": 4.7593, "eval_samples_per_second": 2101.13, "eval_steps_per_second": 16.599, "step": 2300 }, { "epoch": 5.0, "step": 2345, "total_flos": 1346208595200000.0, "train_loss": 0.014373551363121472, "train_runtime": 335.5598, "train_samples_per_second": 894.028, "train_steps_per_second": 6.988 } ], "logging_steps": 100, "max_steps": 2345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1346208595200000.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }