{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9959579628132579, "eval_steps": 500, "global_step": 154, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019401778496362168, "grad_norm": 139.02731323242188, "learning_rate": 1.9610389610389612e-05, "loss": 1.8718, "step": 3 }, { "epoch": 0.038803556992724336, "grad_norm": 31.607357025146484, "learning_rate": 1.9220779220779222e-05, "loss": 1.6642, "step": 6 }, { "epoch": 0.0582053354890865, "grad_norm": 118.545166015625, "learning_rate": 1.8831168831168833e-05, "loss": 1.5334, "step": 9 }, { "epoch": 0.07760711398544867, "grad_norm": 180.7499542236328, "learning_rate": 1.8441558441558443e-05, "loss": 1.396, "step": 12 }, { "epoch": 0.09700889248181083, "grad_norm": 199.1714324951172, "learning_rate": 1.8051948051948053e-05, "loss": 1.3652, "step": 15 }, { "epoch": 0.116410670978173, "grad_norm": 136.60414123535156, "learning_rate": 1.7662337662337664e-05, "loss": 1.3432, "step": 18 }, { "epoch": 0.13581244947453516, "grad_norm": 82.5650634765625, "learning_rate": 1.7272727272727274e-05, "loss": 1.2614, "step": 21 }, { "epoch": 0.15521422797089734, "grad_norm": 18.96686553955078, "learning_rate": 1.6883116883116884e-05, "loss": 1.2303, "step": 24 }, { "epoch": 0.1746160064672595, "grad_norm": 7.933801174163818, "learning_rate": 1.6493506493506495e-05, "loss": 1.1984, "step": 27 }, { "epoch": 0.19401778496362165, "grad_norm": 2.686699390411377, "learning_rate": 1.6103896103896105e-05, "loss": 1.1016, "step": 30 }, { "epoch": 0.21341956345998384, "grad_norm": 1.455581545829773, "learning_rate": 1.5714285714285715e-05, "loss": 1.0671, "step": 33 }, { "epoch": 0.232821341956346, "grad_norm": 0.5924062132835388, "learning_rate": 1.5324675324675326e-05, "loss": 1.012, "step": 36 }, { "epoch": 0.25222312045270817, "grad_norm": 0.3087107837200165, "learning_rate": 1.4935064935064936e-05, "loss": 0.9758, "step": 39 }, { "epoch": 0.2716248989490703, "grad_norm": 0.2992459535598755, "learning_rate": 1.4545454545454546e-05, "loss": 0.9262, "step": 42 }, { "epoch": 0.2910266774454325, "grad_norm": 0.2895904779434204, "learning_rate": 1.4155844155844157e-05, "loss": 0.8271, "step": 45 }, { "epoch": 0.3104284559417947, "grad_norm": 0.2948096692562103, "learning_rate": 1.3766233766233767e-05, "loss": 0.7895, "step": 48 }, { "epoch": 0.32983023443815684, "grad_norm": 0.31464704871177673, "learning_rate": 1.3376623376623377e-05, "loss": 0.7299, "step": 51 }, { "epoch": 0.349232012934519, "grad_norm": 0.3038002550601959, "learning_rate": 1.2987012987012988e-05, "loss": 0.6857, "step": 54 }, { "epoch": 0.36863379143088115, "grad_norm": 0.33729803562164307, "learning_rate": 1.25974025974026e-05, "loss": 0.5946, "step": 57 }, { "epoch": 0.3880355699272433, "grad_norm": 0.39213827252388, "learning_rate": 1.2207792207792208e-05, "loss": 0.5636, "step": 60 }, { "epoch": 0.4074373484236055, "grad_norm": 0.3482286334037781, "learning_rate": 1.181818181818182e-05, "loss": 0.5094, "step": 63 }, { "epoch": 0.42683912691996767, "grad_norm": 0.3112964630126953, "learning_rate": 1.1428571428571429e-05, "loss": 0.4541, "step": 66 }, { "epoch": 0.4462409054163298, "grad_norm": 0.26819908618927, "learning_rate": 1.1038961038961041e-05, "loss": 0.4181, "step": 69 }, { "epoch": 0.465642683912692, "grad_norm": 0.28413137793540955, "learning_rate": 1.064935064935065e-05, "loss": 0.4095, "step": 72 }, { "epoch": 0.4850444624090542, "grad_norm": 0.3022381365299225, "learning_rate": 1.025974025974026e-05, "loss": 0.3623, "step": 75 }, { "epoch": 0.5044462409054163, "grad_norm": 0.29346349835395813, "learning_rate": 9.87012987012987e-06, "loss": 0.3334, "step": 78 }, { "epoch": 0.5238480194017785, "grad_norm": 0.2659854292869568, "learning_rate": 9.48051948051948e-06, "loss": 0.3115, "step": 81 }, { "epoch": 0.5432497978981407, "grad_norm": 0.23122940957546234, "learning_rate": 9.090909090909091e-06, "loss": 0.2817, "step": 84 }, { "epoch": 0.5626515763945028, "grad_norm": 0.2369256317615509, "learning_rate": 8.701298701298701e-06, "loss": 0.2809, "step": 87 }, { "epoch": 0.582053354890865, "grad_norm": 0.2082873433828354, "learning_rate": 8.311688311688313e-06, "loss": 0.2455, "step": 90 }, { "epoch": 0.6014551333872271, "grad_norm": 0.21645894646644592, "learning_rate": 7.922077922077924e-06, "loss": 0.2503, "step": 93 }, { "epoch": 0.6208569118835894, "grad_norm": 0.19337739050388336, "learning_rate": 7.532467532467533e-06, "loss": 0.2286, "step": 96 }, { "epoch": 0.6402586903799515, "grad_norm": 0.1808944046497345, "learning_rate": 7.1428571428571436e-06, "loss": 0.2401, "step": 99 }, { "epoch": 0.6596604688763137, "grad_norm": 0.1630856841802597, "learning_rate": 6.753246753246754e-06, "loss": 0.2251, "step": 102 }, { "epoch": 0.6790622473726758, "grad_norm": 0.16326990723609924, "learning_rate": 6.363636363636364e-06, "loss": 0.2291, "step": 105 }, { "epoch": 0.698464025869038, "grad_norm": 0.16061735153198242, "learning_rate": 5.9740259740259746e-06, "loss": 0.2331, "step": 108 }, { "epoch": 0.7178658043654002, "grad_norm": 0.17352429032325745, "learning_rate": 5.584415584415585e-06, "loss": 0.2149, "step": 111 }, { "epoch": 0.7372675828617623, "grad_norm": 0.17043530941009521, "learning_rate": 5.194805194805194e-06, "loss": 0.2187, "step": 114 }, { "epoch": 0.7566693613581245, "grad_norm": 0.16479559242725372, "learning_rate": 4.805194805194806e-06, "loss": 0.2218, "step": 117 }, { "epoch": 0.7760711398544866, "grad_norm": 0.17882439494132996, "learning_rate": 4.415584415584416e-06, "loss": 0.205, "step": 120 }, { "epoch": 0.7954729183508489, "grad_norm": 0.1911778748035431, "learning_rate": 4.025974025974026e-06, "loss": 0.2172, "step": 123 }, { "epoch": 0.814874696847211, "grad_norm": 0.17751498520374298, "learning_rate": 3.6363636363636366e-06, "loss": 0.2096, "step": 126 }, { "epoch": 0.8342764753435732, "grad_norm": 0.1702156662940979, "learning_rate": 3.246753246753247e-06, "loss": 0.1911, "step": 129 }, { "epoch": 0.8536782538399353, "grad_norm": 0.1764981597661972, "learning_rate": 2.8571428571428573e-06, "loss": 0.2103, "step": 132 }, { "epoch": 0.8730800323362975, "grad_norm": 0.1592799872159958, "learning_rate": 2.4675324675324676e-06, "loss": 0.2053, "step": 135 }, { "epoch": 0.8924818108326596, "grad_norm": 0.21512138843536377, "learning_rate": 2.0779220779220784e-06, "loss": 0.2197, "step": 138 }, { "epoch": 0.9118835893290218, "grad_norm": 0.17707495391368866, "learning_rate": 1.6883116883116885e-06, "loss": 0.2051, "step": 141 }, { "epoch": 0.931285367825384, "grad_norm": 0.1585138887166977, "learning_rate": 1.2987012987012986e-06, "loss": 0.1984, "step": 144 }, { "epoch": 0.9506871463217461, "grad_norm": 0.15231232345104218, "learning_rate": 9.090909090909091e-07, "loss": 0.1774, "step": 147 }, { "epoch": 0.9700889248181084, "grad_norm": 0.15338800847530365, "learning_rate": 5.194805194805196e-07, "loss": 0.2046, "step": 150 }, { "epoch": 0.9894907033144705, "grad_norm": 0.16579587757587433, "learning_rate": 1.298701298701299e-07, "loss": 0.1871, "step": 153 } ], "logging_steps": 3, "max_steps": 154, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5705942959542764e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }