{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0202020202020203, "eval_steps": 20, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.050505050505050504, "grad_norm": 0.1097809225320816, "learning_rate": 2.9999999999999997e-05, "loss": 1.2164, "step": 10 }, { "epoch": 0.10101010101010101, "grad_norm": 0.16844922304153442, "learning_rate": 5.9999999999999995e-05, "loss": 1.1774, "step": 20 }, { "epoch": 0.10101010101010101, "eval_loss": 1.1594151258468628, "eval_runtime": 295.5081, "eval_samples_per_second": 2.382, "eval_steps_per_second": 0.298, "step": 20 }, { "epoch": 0.15151515151515152, "grad_norm": 0.3893352150917053, "learning_rate": 8.999999999999999e-05, "loss": 0.9942, "step": 30 }, { "epoch": 0.20202020202020202, "grad_norm": 0.4363686740398407, "learning_rate": 0.00011999999999999999, "loss": 0.7259, "step": 40 }, { "epoch": 0.20202020202020202, "eval_loss": 0.6028452515602112, "eval_runtime": 294.8084, "eval_samples_per_second": 2.388, "eval_steps_per_second": 0.298, "step": 40 }, { "epoch": 0.25252525252525254, "grad_norm": 0.40602898597717285, "learning_rate": 0.00015, "loss": 0.4454, "step": 50 }, { "epoch": 0.30303030303030304, "grad_norm": 0.2177702635526657, "learning_rate": 0.00017999999999999998, "loss": 0.5139, "step": 60 }, { "epoch": 0.30303030303030304, "eval_loss": 0.3834875822067261, "eval_runtime": 295.1108, "eval_samples_per_second": 2.386, "eval_steps_per_second": 0.298, "step": 60 }, { "epoch": 0.35353535353535354, "grad_norm": 0.1777501404285431, "learning_rate": 0.00020999999999999998, "loss": 0.4096, "step": 70 }, { "epoch": 0.40404040404040403, "grad_norm": 0.1749625951051712, "learning_rate": 0.00023999999999999998, "loss": 0.2589, "step": 80 }, { "epoch": 0.40404040404040403, "eval_loss": 0.28596043586730957, "eval_runtime": 294.9045, "eval_samples_per_second": 2.387, "eval_steps_per_second": 0.298, "step": 80 }, { "epoch": 0.45454545454545453, "grad_norm": 0.2074197232723236, "learning_rate": 0.00027, "loss": 0.1932, "step": 90 }, { "epoch": 0.5050505050505051, "grad_norm": 0.36931025981903076, "learning_rate": 0.0003, "loss": 0.2177, "step": 100 }, { "epoch": 0.5050505050505051, "eval_loss": 0.25518086552619934, "eval_runtime": 295.271, "eval_samples_per_second": 2.384, "eval_steps_per_second": 0.298, "step": 100 }, { "epoch": 0.5555555555555556, "grad_norm": 0.20555801689624786, "learning_rate": 0.00029, "loss": 0.299, "step": 110 }, { "epoch": 0.6060606060606061, "grad_norm": 0.14223843812942505, "learning_rate": 0.00028, "loss": 0.2508, "step": 120 }, { "epoch": 0.6060606060606061, "eval_loss": 0.2106790542602539, "eval_runtime": 294.889, "eval_samples_per_second": 2.387, "eval_steps_per_second": 0.298, "step": 120 }, { "epoch": 0.6565656565656566, "grad_norm": 0.14103205502033234, "learning_rate": 0.00027, "loss": 0.1666, "step": 130 }, { "epoch": 0.7070707070707071, "grad_norm": 0.1655016392469406, "learning_rate": 0.00026, "loss": 0.1511, "step": 140 }, { "epoch": 0.7070707070707071, "eval_loss": 0.20343247056007385, "eval_runtime": 294.7497, "eval_samples_per_second": 2.388, "eval_steps_per_second": 0.299, "step": 140 }, { "epoch": 0.7575757575757576, "grad_norm": 0.33550018072128296, "learning_rate": 0.00025, "loss": 0.1798, "step": 150 }, { "epoch": 0.8080808080808081, "grad_norm": 0.18753309547901154, "learning_rate": 0.00023999999999999998, "loss": 0.245, "step": 160 }, { "epoch": 0.8080808080808081, "eval_loss": 0.19205082952976227, "eval_runtime": 294.8122, "eval_samples_per_second": 2.388, "eval_steps_per_second": 0.298, "step": 160 }, { "epoch": 0.8585858585858586, "grad_norm": 0.18931691348552704, "learning_rate": 0.00023, "loss": 0.2125, "step": 170 }, { "epoch": 0.9090909090909091, "grad_norm": 0.1642046868801117, "learning_rate": 0.00021999999999999995, "loss": 0.1479, "step": 180 }, { "epoch": 0.9090909090909091, "eval_loss": 0.1822979599237442, "eval_runtime": 295.0975, "eval_samples_per_second": 2.386, "eval_steps_per_second": 0.298, "step": 180 }, { "epoch": 0.9595959595959596, "grad_norm": 0.14441154897212982, "learning_rate": 0.00020999999999999998, "loss": 0.1399, "step": 190 }, { "epoch": 1.0101010101010102, "grad_norm": 0.20781415700912476, "learning_rate": 0.00019999999999999998, "loss": 0.17, "step": 200 }, { "epoch": 1.0101010101010102, "eval_loss": 0.18901540338993073, "eval_runtime": 294.9368, "eval_samples_per_second": 2.387, "eval_steps_per_second": 0.298, "step": 200 }, { "epoch": 1.0606060606060606, "grad_norm": 0.18098795413970947, "learning_rate": 0.00018999999999999998, "loss": 0.2102, "step": 210 }, { "epoch": 1.1111111111111112, "grad_norm": 0.12826304137706757, "learning_rate": 0.00017999999999999998, "loss": 0.2085, "step": 220 }, { "epoch": 1.1111111111111112, "eval_loss": 0.17017190158367157, "eval_runtime": 295.0936, "eval_samples_per_second": 2.386, "eval_steps_per_second": 0.298, "step": 220 }, { "epoch": 1.1616161616161615, "grad_norm": 0.13167431950569153, "learning_rate": 0.00016999999999999999, "loss": 0.1399, "step": 230 }, { "epoch": 1.2121212121212122, "grad_norm": 0.13884004950523376, "learning_rate": 0.00015999999999999999, "loss": 0.1311, "step": 240 }, { "epoch": 1.2121212121212122, "eval_loss": 0.16837279498577118, "eval_runtime": 294.8077, "eval_samples_per_second": 2.388, "eval_steps_per_second": 0.298, "step": 240 }, { "epoch": 1.2626262626262625, "grad_norm": 0.17478157579898834, "learning_rate": 0.00015, "loss": 0.1559, "step": 250 }, { "epoch": 1.3131313131313131, "grad_norm": 0.16669002175331116, "learning_rate": 0.00014, "loss": 0.205, "step": 260 }, { "epoch": 1.3131313131313131, "eval_loss": 0.16558928787708282, "eval_runtime": 294.9081, "eval_samples_per_second": 2.387, "eval_steps_per_second": 0.298, "step": 260 }, { "epoch": 1.3636363636363638, "grad_norm": 0.14852865040302277, "learning_rate": 0.00013, "loss": 0.1826, "step": 270 }, { "epoch": 1.4141414141414141, "grad_norm": 0.14237315952777863, "learning_rate": 0.00011999999999999999, "loss": 0.1329, "step": 280 }, { "epoch": 1.4141414141414141, "eval_loss": 0.15846213698387146, "eval_runtime": 295.0174, "eval_samples_per_second": 2.386, "eval_steps_per_second": 0.298, "step": 280 }, { "epoch": 1.4646464646464645, "grad_norm": 0.1401522010564804, "learning_rate": 0.00010999999999999998, "loss": 0.1205, "step": 290 }, { "epoch": 1.5151515151515151, "grad_norm": 0.18983517587184906, "learning_rate": 9.999999999999999e-05, "loss": 0.1607, "step": 300 }, { "epoch": 1.5151515151515151, "eval_loss": 0.16163967549800873, "eval_runtime": 294.807, "eval_samples_per_second": 2.388, "eval_steps_per_second": 0.299, "step": 300 }, { "epoch": 1.5656565656565657, "grad_norm": 0.1500634402036667, "learning_rate": 8.999999999999999e-05, "loss": 0.2121, "step": 310 }, { "epoch": 1.6161616161616161, "grad_norm": 0.15094490349292755, "learning_rate": 7.999999999999999e-05, "loss": 0.1713, "step": 320 }, { "epoch": 1.6161616161616161, "eval_loss": 0.1540195345878601, "eval_runtime": 295.157, "eval_samples_per_second": 2.385, "eval_steps_per_second": 0.298, "step": 320 }, { "epoch": 1.6666666666666665, "grad_norm": 0.15443100035190582, "learning_rate": 7e-05, "loss": 0.129, "step": 330 }, { "epoch": 1.7171717171717171, "grad_norm": 0.1568954586982727, "learning_rate": 5.9999999999999995e-05, "loss": 0.1196, "step": 340 }, { "epoch": 1.7171717171717171, "eval_loss": 0.15283828973770142, "eval_runtime": 295.3671, "eval_samples_per_second": 2.383, "eval_steps_per_second": 0.298, "step": 340 }, { "epoch": 1.7676767676767677, "grad_norm": 0.19448110461235046, "learning_rate": 4.9999999999999996e-05, "loss": 0.1603, "step": 350 }, { "epoch": 1.8181818181818183, "grad_norm": 0.12574005126953125, "learning_rate": 3.9999999999999996e-05, "loss": 0.1722, "step": 360 }, { "epoch": 1.8181818181818183, "eval_loss": 0.15110255777835846, "eval_runtime": 295.1101, "eval_samples_per_second": 2.386, "eval_steps_per_second": 0.298, "step": 360 }, { "epoch": 1.8686868686868687, "grad_norm": 0.11745467782020569, "learning_rate": 2.9999999999999997e-05, "loss": 0.1669, "step": 370 }, { "epoch": 1.9191919191919191, "grad_norm": 0.1326703280210495, "learning_rate": 1.9999999999999998e-05, "loss": 0.1243, "step": 380 }, { "epoch": 1.9191919191919191, "eval_loss": 0.1487378031015396, "eval_runtime": 295.0041, "eval_samples_per_second": 2.386, "eval_steps_per_second": 0.298, "step": 380 }, { "epoch": 1.9696969696969697, "grad_norm": 0.1454104483127594, "learning_rate": 9.999999999999999e-06, "loss": 0.1148, "step": 390 }, { "epoch": 2.0202020202020203, "grad_norm": 0.1772913634777069, "learning_rate": 0.0, "loss": 0.1633, "step": 400 }, { "epoch": 2.0202020202020203, "eval_loss": 0.14871937036514282, "eval_runtime": 295.1345, "eval_samples_per_second": 2.385, "eval_steps_per_second": 0.298, "step": 400 } ], "logging_steps": 10, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "total_flos": 2.2256168195948544e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }