{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004, "grad_norm": 0.14456839859485626, "learning_rate": 0.0002, "loss": 2.6994, "step": 10 }, { "epoch": 0.008, "grad_norm": 0.12785384058952332, "learning_rate": 0.0002, "loss": 2.5263, "step": 20 }, { "epoch": 0.012, "grad_norm": 0.13556812703609467, "learning_rate": 0.0002, "loss": 2.4, "step": 30 }, { "epoch": 0.016, "grad_norm": 0.16702307760715485, "learning_rate": 0.0002, "loss": 2.2516, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.3053046464920044, "learning_rate": 0.0002, "loss": 2.1036, "step": 50 }, { "epoch": 0.024, "grad_norm": 0.22266729176044464, "learning_rate": 0.0002, "loss": 2.3971, "step": 60 }, { "epoch": 0.028, "grad_norm": 0.14757753908634186, "learning_rate": 0.0002, "loss": 2.3272, "step": 70 }, { "epoch": 0.032, "grad_norm": 0.1445876806974411, "learning_rate": 0.0002, "loss": 2.3242, "step": 80 }, { "epoch": 0.036, "grad_norm": 0.1639222800731659, "learning_rate": 0.0002, "loss": 2.0874, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.30435118079185486, "learning_rate": 0.0002, "loss": 1.9193, "step": 100 }, { "epoch": 0.044, "grad_norm": 0.19899794459342957, "learning_rate": 0.0002, "loss": 2.3188, "step": 110 }, { "epoch": 0.048, "grad_norm": 0.171605184674263, "learning_rate": 0.0002, "loss": 2.285, "step": 120 }, { "epoch": 0.052, "grad_norm": 0.18728512525558472, "learning_rate": 0.0002, "loss": 2.2458, "step": 130 }, { "epoch": 0.056, "grad_norm": 0.21194487810134888, "learning_rate": 0.0002, "loss": 2.038, "step": 140 }, { "epoch": 0.06, "grad_norm": 0.3907397985458374, "learning_rate": 0.0002, "loss": 1.8128, "step": 150 }, { "epoch": 0.064, "grad_norm": 0.2305271327495575, "learning_rate": 0.0002, "loss": 2.2758, "step": 160 }, { "epoch": 0.068, "grad_norm": 0.1841125786304474, "learning_rate": 0.0002, "loss": 2.2169, "step": 170 }, { "epoch": 0.072, "grad_norm": 0.18488088250160217, "learning_rate": 0.0002, "loss": 2.1981, "step": 180 }, { "epoch": 0.076, "grad_norm": 0.22125467658042908, "learning_rate": 0.0002, "loss": 2.0181, "step": 190 }, { "epoch": 0.08, "grad_norm": 0.36263903975486755, "learning_rate": 0.0002, "loss": 1.7492, "step": 200 }, { "epoch": 0.084, "grad_norm": 0.23995819687843323, "learning_rate": 0.0002, "loss": 2.2287, "step": 210 }, { "epoch": 0.088, "grad_norm": 0.18997274339199066, "learning_rate": 0.0002, "loss": 2.2265, "step": 220 }, { "epoch": 0.092, "grad_norm": 0.19529950618743896, "learning_rate": 0.0002, "loss": 2.2107, "step": 230 }, { "epoch": 0.096, "grad_norm": 0.23907797038555145, "learning_rate": 0.0002, "loss": 1.9434, "step": 240 }, { "epoch": 0.1, "grad_norm": 0.3128001093864441, "learning_rate": 0.0002, "loss": 1.7463, "step": 250 }, { "epoch": 0.104, "grad_norm": 0.22716236114501953, "learning_rate": 0.0002, "loss": 2.2284, "step": 260 }, { "epoch": 0.108, "grad_norm": 0.19475902616977692, "learning_rate": 0.0002, "loss": 2.1706, "step": 270 }, { "epoch": 0.112, "grad_norm": 0.19292621314525604, "learning_rate": 0.0002, "loss": 2.1211, "step": 280 }, { "epoch": 0.116, "grad_norm": 0.27429261803627014, "learning_rate": 0.0002, "loss": 1.916, "step": 290 }, { "epoch": 0.12, "grad_norm": 0.35040974617004395, "learning_rate": 0.0002, "loss": 1.6903, "step": 300 }, { "epoch": 0.124, "grad_norm": 0.25687935948371887, "learning_rate": 0.0002, "loss": 2.1812, "step": 310 }, { "epoch": 0.128, "grad_norm": 0.20279887318611145, "learning_rate": 0.0002, "loss": 2.2005, "step": 320 }, { "epoch": 0.132, "grad_norm": 0.22153416275978088, "learning_rate": 0.0002, "loss": 2.1195, "step": 330 }, { "epoch": 0.136, "grad_norm": 0.2411063015460968, "learning_rate": 0.0002, "loss": 1.8694, "step": 340 }, { "epoch": 0.14, "grad_norm": 0.3698059320449829, "learning_rate": 0.0002, "loss": 1.6522, "step": 350 }, { "epoch": 0.144, "grad_norm": 0.263630211353302, "learning_rate": 0.0002, "loss": 2.1644, "step": 360 }, { "epoch": 0.148, "grad_norm": 0.22328948974609375, "learning_rate": 0.0002, "loss": 2.1753, "step": 370 }, { "epoch": 0.152, "grad_norm": 0.21132981777191162, "learning_rate": 0.0002, "loss": 2.1191, "step": 380 }, { "epoch": 0.156, "grad_norm": 0.2776873707771301, "learning_rate": 0.0002, "loss": 1.8539, "step": 390 }, { "epoch": 0.16, "grad_norm": 0.34315550327301025, "learning_rate": 0.0002, "loss": 1.6447, "step": 400 }, { "epoch": 0.164, "grad_norm": 0.2831268906593323, "learning_rate": 0.0002, "loss": 2.1451, "step": 410 }, { "epoch": 0.168, "grad_norm": 0.2178158015012741, "learning_rate": 0.0002, "loss": 2.1368, "step": 420 }, { "epoch": 0.172, "grad_norm": 0.22629423439502716, "learning_rate": 0.0002, "loss": 2.1039, "step": 430 }, { "epoch": 0.176, "grad_norm": 0.2540661096572876, "learning_rate": 0.0002, "loss": 1.856, "step": 440 }, { "epoch": 0.18, "grad_norm": 0.3525744676589966, "learning_rate": 0.0002, "loss": 1.6286, "step": 450 }, { "epoch": 0.184, "grad_norm": 0.24233509600162506, "learning_rate": 0.0002, "loss": 2.1296, "step": 460 }, { "epoch": 0.188, "grad_norm": 0.20195922255516052, "learning_rate": 0.0002, "loss": 2.1321, "step": 470 }, { "epoch": 0.192, "grad_norm": 0.25681406259536743, "learning_rate": 0.0002, "loss": 2.0255, "step": 480 }, { "epoch": 0.196, "grad_norm": 0.2791021764278412, "learning_rate": 0.0002, "loss": 1.8141, "step": 490 }, { "epoch": 0.2, "grad_norm": 0.3304845690727234, "learning_rate": 0.0002, "loss": 1.6042, "step": 500 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7342773566308352e+16, "train_batch_size": 12, "trial_name": null, "trial_params": null }