{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.5842293906810037, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14336917562724014, "grad_norm": 0.0, "learning_rate": 0.00019936113105200085, "loss": 25.3924, "step": 10 }, { "epoch": 0.2867383512544803, "grad_norm": 3.884701728820801, "learning_rate": 0.00019745268727865774, "loss": 8.0947, "step": 20 }, { "epoch": 0.43010752688172044, "grad_norm": 4.083174705505371, "learning_rate": 0.00019387338576538744, "loss": 8.0048, "step": 30 }, { "epoch": 0.5734767025089605, "grad_norm": 8.547293663024902, "learning_rate": 0.00018881364488135448, "loss": 8.2356, "step": 40 }, { "epoch": 0.7168458781362007, "grad_norm": 5.096670150756836, "learning_rate": 0.00018235325976284275, "loss": 7.6645, "step": 50 }, { "epoch": 0.8602150537634409, "grad_norm": 2.093186855316162, "learning_rate": 0.00017459411454241822, "loss": 6.8675, "step": 60 }, { "epoch": 1.003584229390681, "grad_norm": 6.611547946929932, "learning_rate": 0.00016565857557529566, "loss": 7.6999, "step": 70 }, { "epoch": 1.146953405017921, "grad_norm": 8.38244915008545, "learning_rate": 0.00015568756164881882, "loss": 5.8161, "step": 80 }, { "epoch": 1.2903225806451613, "grad_norm": 7.428552150726318, "learning_rate": 0.00014483832160900326, "loss": 5.076, "step": 90 }, { "epoch": 1.4336917562724014, "grad_norm": 9.673376083374023, "learning_rate": 0.00013328195445229868, "loss": 5.8924, "step": 100 }, { "epoch": 1.5770609318996416, "grad_norm": 9.069914817810059, "learning_rate": 0.00012120071099220549, "loss": 11.4614, "step": 110 }, { "epoch": 1.7204301075268817, "grad_norm": 0.6971492767333984, "learning_rate": 0.00010878511965507434, "loss": 6.7866, "step": 120 }, { "epoch": 1.863799283154122, "grad_norm": 5.74509334564209, "learning_rate": 9.623098173300654e-05, "loss": 5.4278, "step": 130 }, { "epoch": 2.007168458781362, "grad_norm": 6.082172870635986, "learning_rate": 8.497744108792429e-05, "loss": 6.2175, "step": 140 }, { "epoch": 2.150537634408602, "grad_norm": 9.069826126098633, "learning_rate": 7.270480644826749e-05, "loss": 3.6583, "step": 150 }, { "epoch": 2.293906810035842, "grad_norm": 5.0468926429748535, "learning_rate": 6.086263331627976e-05, "loss": 7.1881, "step": 160 }, { "epoch": 2.4372759856630823, "grad_norm": 8.837244987487793, "learning_rate": 4.9637679836423924e-05, "loss": 5.8196, "step": 170 }, { "epoch": 2.5806451612903225, "grad_norm": 6.962499618530273, "learning_rate": 3.920697023053949e-05, "loss": 6.0987, "step": 180 }, { "epoch": 2.7240143369175627, "grad_norm": 7.161827564239502, "learning_rate": 2.9735003020115092e-05, "loss": 6.0444, "step": 190 }, { "epoch": 2.867383512544803, "grad_norm": 4.73717737197876, "learning_rate": 2.137115678633811e-05, "loss": 5.409, "step": 200 }, { "epoch": 3.010752688172043, "grad_norm": 4.852635860443115, "learning_rate": 1.4247334380634792e-05, "loss": 3.4152, "step": 210 }, { "epoch": 3.154121863799283, "grad_norm": 0.0, "learning_rate": 8.475882737908248e-06, "loss": 2.9892, "step": 220 }, { "epoch": 3.2974910394265233, "grad_norm": 5.022617816925049, "learning_rate": 4.147821098262405e-06, "loss": 3.2551, "step": 230 }, { "epoch": 3.4408602150537635, "grad_norm": 8.784795761108398, "learning_rate": 1.3314055792131964e-06, "loss": 3.7666, "step": 240 }, { "epoch": 3.5842293906810037, "grad_norm": 13.812480926513672, "learning_rate": 7.105273594107953e-08, "loss": 7.4014, "step": 250 } ], "logging_steps": 10, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 553129082880000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }