{ "best_metric": 0.9358974358974359, "best_model_checkpoint": "output-models/checkpoint-470", "epoch": 40.0, "eval_steps": 500, "global_step": 1880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "step": 47, "train_accuracy": 0.7072649572649573, "train_loss": 0.6782774329185486, "train_runtime": 13.5956, "train_samples_per_second": 34.423, "train_steps_per_second": 8.606 }, { "epoch": 1.0, "eval_accuracy": 0.7243589743589743, "eval_loss": 0.6180987358093262, "eval_runtime": 35.9375, "eval_samples_per_second": 4.341, "eval_steps_per_second": 1.085, "step": 47 }, { "epoch": 2.0, "step": 94, "train_accuracy": 0.75, "train_loss": 0.5690865516662598, "train_runtime": 13.7448, "train_samples_per_second": 34.049, "train_steps_per_second": 8.512 }, { "epoch": 2.0, "eval_accuracy": 0.8012820512820513, "eval_loss": 0.5130882263183594, "eval_runtime": 4.4157, "eval_samples_per_second": 35.328, "eval_steps_per_second": 8.832, "step": 94 }, { "epoch": 3.0, "step": 141, "train_accuracy": 0.7350427350427351, "train_loss": 0.6460429430007935, "train_runtime": 14.0717, "train_samples_per_second": 33.258, "train_steps_per_second": 8.315 }, { "epoch": 3.0, "eval_accuracy": 0.8205128205128205, "eval_loss": 0.45179083943367004, "eval_runtime": 4.8055, "eval_samples_per_second": 32.463, "eval_steps_per_second": 8.116, "step": 141 }, { "epoch": 4.0, "step": 188, "train_accuracy": 0.8098290598290598, "train_loss": 0.39663246273994446, "train_runtime": 13.7878, "train_samples_per_second": 33.943, "train_steps_per_second": 8.486 }, { "epoch": 4.0, "eval_accuracy": 0.8846153846153846, "eval_loss": 0.2980358898639679, "eval_runtime": 4.4674, "eval_samples_per_second": 34.919, "eval_steps_per_second": 8.73, "step": 188 }, { "epoch": 5.0, "step": 235, "train_accuracy": 0.8632478632478633, "train_loss": 0.3636291027069092, "train_runtime": 13.58, "train_samples_per_second": 34.462, "train_steps_per_second": 8.616 }, { "epoch": 5.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.2997760772705078, "eval_runtime": 4.9986, "eval_samples_per_second": 31.208, "eval_steps_per_second": 7.802, "step": 235 }, { "epoch": 6.0, "step": 282, "train_accuracy": 0.8376068376068376, "train_loss": 0.4208720922470093, "train_runtime": 13.5735, "train_samples_per_second": 34.479, "train_steps_per_second": 8.62 }, { "epoch": 6.0, "eval_accuracy": 0.9102564102564102, "eval_loss": 0.33183348178863525, "eval_runtime": 4.4091, "eval_samples_per_second": 35.381, "eval_steps_per_second": 8.845, "step": 282 }, { "epoch": 7.0, "step": 329, "train_accuracy": 0.8568376068376068, "train_loss": 0.32207924127578735, "train_runtime": 13.5635, "train_samples_per_second": 34.504, "train_steps_per_second": 8.626 }, { "epoch": 7.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.24086996912956238, "eval_runtime": 4.7402, "eval_samples_per_second": 32.91, "eval_steps_per_second": 8.228, "step": 329 }, { "epoch": 8.0, "step": 376, "train_accuracy": 0.8846153846153846, "train_loss": 0.3257001042366028, "train_runtime": 13.9348, "train_samples_per_second": 33.585, "train_steps_per_second": 8.396 }, { "epoch": 8.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.3424080014228821, "eval_runtime": 4.9539, "eval_samples_per_second": 31.49, "eval_steps_per_second": 7.873, "step": 376 }, { "epoch": 9.0, "step": 423, "train_accuracy": 0.8717948717948718, "train_loss": 0.2687961757183075, "train_runtime": 13.6829, "train_samples_per_second": 34.203, "train_steps_per_second": 8.551 }, { "epoch": 9.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.25928938388824463, "eval_runtime": 4.4508, "eval_samples_per_second": 35.05, "eval_steps_per_second": 8.762, "step": 423 }, { "epoch": 10.0, "step": 470, "train_accuracy": 0.9102564102564102, "train_loss": 0.2113831341266632, "train_runtime": 13.44, "train_samples_per_second": 34.821, "train_steps_per_second": 8.705 }, { "epoch": 10.0, "eval_accuracy": 0.9358974358974359, "eval_loss": 0.25084006786346436, "eval_runtime": 4.8092, "eval_samples_per_second": 32.438, "eval_steps_per_second": 8.109, "step": 470 }, { "epoch": 10.64, "grad_norm": 7.874776840209961, "learning_rate": 1.4680851063829789e-05, "loss": 0.4595, "step": 500 }, { "epoch": 11.0, "step": 517, "train_accuracy": 0.8995726495726496, "train_loss": 0.25938984751701355, "train_runtime": 13.5841, "train_samples_per_second": 34.452, "train_steps_per_second": 8.613 }, { "epoch": 11.0, "eval_accuracy": 0.9294871794871795, "eval_loss": 0.32409772276878357, "eval_runtime": 4.4182, "eval_samples_per_second": 35.308, "eval_steps_per_second": 8.827, "step": 517 }, { "epoch": 12.0, "step": 564, "train_accuracy": 0.8952991452991453, "train_loss": 0.26763853430747986, "train_runtime": 13.8203, "train_samples_per_second": 33.863, "train_steps_per_second": 8.466 }, { "epoch": 12.0, "eval_accuracy": 0.8910256410256411, "eval_loss": 0.3308241069316864, "eval_runtime": 4.4447, "eval_samples_per_second": 35.098, "eval_steps_per_second": 8.774, "step": 564 }, { "epoch": 13.0, "step": 611, "train_accuracy": 0.9081196581196581, "train_loss": 0.23129615187644958, "train_runtime": 13.4973, "train_samples_per_second": 34.674, "train_steps_per_second": 8.668 }, { "epoch": 13.0, "eval_accuracy": 0.9294871794871795, "eval_loss": 0.255931556224823, "eval_runtime": 4.6436, "eval_samples_per_second": 33.595, "eval_steps_per_second": 8.399, "step": 611 }, { "epoch": 14.0, "step": 658, "train_accuracy": 0.8846153846153846, "train_loss": 0.31185245513916016, "train_runtime": 13.5189, "train_samples_per_second": 34.618, "train_steps_per_second": 8.655 }, { "epoch": 14.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.3017214834690094, "eval_runtime": 4.6078, "eval_samples_per_second": 33.856, "eval_steps_per_second": 8.464, "step": 658 }, { "epoch": 15.0, "step": 705, "train_accuracy": 0.9166666666666666, "train_loss": 0.3007480502128601, "train_runtime": 13.4373, "train_samples_per_second": 34.829, "train_steps_per_second": 8.707 }, { "epoch": 15.0, "eval_accuracy": 0.9038461538461539, "eval_loss": 0.4220944046974182, "eval_runtime": 4.9304, "eval_samples_per_second": 31.64, "eval_steps_per_second": 7.91, "step": 705 }, { "epoch": 16.0, "step": 752, "train_accuracy": 0.9188034188034188, "train_loss": 0.20939397811889648, "train_runtime": 13.6683, "train_samples_per_second": 34.24, "train_steps_per_second": 8.56 }, { "epoch": 16.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.36174264550209045, "eval_runtime": 4.4608, "eval_samples_per_second": 34.971, "eval_steps_per_second": 8.743, "step": 752 }, { "epoch": 17.0, "step": 799, "train_accuracy": 0.9209401709401709, "train_loss": 0.18879051506519318, "train_runtime": 13.7929, "train_samples_per_second": 33.931, "train_steps_per_second": 8.483 }, { "epoch": 17.0, "eval_accuracy": 0.9102564102564102, "eval_loss": 0.35188791155815125, "eval_runtime": 4.7732, "eval_samples_per_second": 32.683, "eval_steps_per_second": 8.171, "step": 799 }, { "epoch": 18.0, "step": 846, "train_accuracy": 0.8952991452991453, "train_loss": 0.25016605854034424, "train_runtime": 13.3521, "train_samples_per_second": 35.051, "train_steps_per_second": 8.763 }, { "epoch": 18.0, "eval_accuracy": 0.9102564102564102, "eval_loss": 0.3965354859828949, "eval_runtime": 4.3963, "eval_samples_per_second": 35.485, "eval_steps_per_second": 8.871, "step": 846 }, { "epoch": 19.0, "step": 893, "train_accuracy": 0.9209401709401709, "train_loss": 0.1891285479068756, "train_runtime": 13.4884, "train_samples_per_second": 34.696, "train_steps_per_second": 8.674 }, { "epoch": 19.0, "eval_accuracy": 0.9038461538461539, "eval_loss": 0.31604066491127014, "eval_runtime": 5.1415, "eval_samples_per_second": 30.341, "eval_steps_per_second": 7.585, "step": 893 }, { "epoch": 20.0, "step": 940, "train_accuracy": 0.9401709401709402, "train_loss": 0.1873449832201004, "train_runtime": 13.9057, "train_samples_per_second": 33.655, "train_steps_per_second": 8.414 }, { "epoch": 20.0, "eval_accuracy": 0.9294871794871795, "eval_loss": 0.3332672119140625, "eval_runtime": 4.9421, "eval_samples_per_second": 31.565, "eval_steps_per_second": 7.891, "step": 940 }, { "epoch": 21.0, "step": 987, "train_accuracy": 0.9230769230769231, "train_loss": 0.18881197273731232, "train_runtime": 13.5338, "train_samples_per_second": 34.58, "train_steps_per_second": 8.645 }, { "epoch": 21.0, "eval_accuracy": 0.8910256410256411, "eval_loss": 0.3720751404762268, "eval_runtime": 4.8223, "eval_samples_per_second": 32.35, "eval_steps_per_second": 8.088, "step": 987 }, { "epoch": 21.28, "grad_norm": 5.682499408721924, "learning_rate": 9.361702127659576e-06, "loss": 0.2485, "step": 1000 }, { "epoch": 22.0, "step": 1034, "train_accuracy": 0.9444444444444444, "train_loss": 0.1338244080543518, "train_runtime": 13.6664, "train_samples_per_second": 34.245, "train_steps_per_second": 8.561 }, { "epoch": 22.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.3777410686016083, "eval_runtime": 4.3337, "eval_samples_per_second": 35.997, "eval_steps_per_second": 8.999, "step": 1034 }, { "epoch": 23.0, "step": 1081, "train_accuracy": 0.9252136752136753, "train_loss": 0.18711484968662262, "train_runtime": 13.702, "train_samples_per_second": 34.155, "train_steps_per_second": 8.539 }, { "epoch": 23.0, "eval_accuracy": 0.9038461538461539, "eval_loss": 0.3984796702861786, "eval_runtime": 4.5267, "eval_samples_per_second": 34.462, "eval_steps_per_second": 8.616, "step": 1081 }, { "epoch": 24.0, "step": 1128, "train_accuracy": 0.9444444444444444, "train_loss": 0.161672905087471, "train_runtime": 13.9523, "train_samples_per_second": 33.543, "train_steps_per_second": 8.386 }, { "epoch": 24.0, "eval_accuracy": 0.9230769230769231, "eval_loss": 0.38641923666000366, "eval_runtime": 5.1128, "eval_samples_per_second": 30.511, "eval_steps_per_second": 7.628, "step": 1128 }, { "epoch": 25.0, "step": 1175, "train_accuracy": 0.9444444444444444, "train_loss": 0.17891307175159454, "train_runtime": 13.7483, "train_samples_per_second": 34.041, "train_steps_per_second": 8.51 }, { "epoch": 25.0, "eval_accuracy": 0.9230769230769231, "eval_loss": 0.42098188400268555, "eval_runtime": 5.0358, "eval_samples_per_second": 30.978, "eval_steps_per_second": 7.745, "step": 1175 }, { "epoch": 26.0, "step": 1222, "train_accuracy": 0.9572649572649573, "train_loss": 0.10899731516838074, "train_runtime": 14.0913, "train_samples_per_second": 33.212, "train_steps_per_second": 8.303 }, { "epoch": 26.0, "eval_accuracy": 0.9038461538461539, "eval_loss": 0.4160342216491699, "eval_runtime": 4.7918, "eval_samples_per_second": 32.555, "eval_steps_per_second": 8.139, "step": 1222 }, { "epoch": 27.0, "step": 1269, "train_accuracy": 0.938034188034188, "train_loss": 0.16018715500831604, "train_runtime": 13.888, "train_samples_per_second": 33.698, "train_steps_per_second": 8.425 }, { "epoch": 27.0, "eval_accuracy": 0.9102564102564102, "eval_loss": 0.39854034781455994, "eval_runtime": 4.8553, "eval_samples_per_second": 32.13, "eval_steps_per_second": 8.032, "step": 1269 }, { "epoch": 28.0, "step": 1316, "train_accuracy": 0.9444444444444444, "train_loss": 0.14988763630390167, "train_runtime": 13.7687, "train_samples_per_second": 33.99, "train_steps_per_second": 8.498 }, { "epoch": 28.0, "eval_accuracy": 0.9102564102564102, "eval_loss": 0.40767335891723633, "eval_runtime": 4.434, "eval_samples_per_second": 35.182, "eval_steps_per_second": 8.796, "step": 1316 }, { "epoch": 29.0, "step": 1363, "train_accuracy": 0.9316239316239316, "train_loss": 0.17876969277858734, "train_runtime": 13.3686, "train_samples_per_second": 35.008, "train_steps_per_second": 8.752 }, { "epoch": 29.0, "eval_accuracy": 0.8782051282051282, "eval_loss": 0.6035234928131104, "eval_runtime": 4.8916, "eval_samples_per_second": 31.891, "eval_steps_per_second": 7.973, "step": 1363 }, { "epoch": 30.0, "step": 1410, "train_accuracy": 0.9572649572649573, "train_loss": 0.12170404940843582, "train_runtime": 13.3084, "train_samples_per_second": 35.166, "train_steps_per_second": 8.791 }, { "epoch": 30.0, "eval_accuracy": 0.9230769230769231, "eval_loss": 0.3604837954044342, "eval_runtime": 4.5664, "eval_samples_per_second": 34.163, "eval_steps_per_second": 8.541, "step": 1410 }, { "epoch": 31.0, "step": 1457, "train_accuracy": 0.9444444444444444, "train_loss": 0.17193935811519623, "train_runtime": 13.3567, "train_samples_per_second": 35.039, "train_steps_per_second": 8.76 }, { "epoch": 31.0, "eval_accuracy": 0.9038461538461539, "eval_loss": 0.42400404810905457, "eval_runtime": 4.3974, "eval_samples_per_second": 35.475, "eval_steps_per_second": 8.869, "step": 1457 }, { "epoch": 31.91, "grad_norm": 0.40821418166160583, "learning_rate": 4.042553191489362e-06, "loss": 0.1715, "step": 1500 }, { "epoch": 32.0, "step": 1504, "train_accuracy": 0.9551282051282052, "train_loss": 0.15071353316307068, "train_runtime": 13.338, "train_samples_per_second": 35.088, "train_steps_per_second": 8.772 }, { "epoch": 32.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.3800322711467743, "eval_runtime": 4.4234, "eval_samples_per_second": 35.267, "eval_steps_per_second": 8.817, "step": 1504 }, { "epoch": 33.0, "step": 1551, "train_accuracy": 0.9423076923076923, "train_loss": 0.15298214554786682, "train_runtime": 13.6206, "train_samples_per_second": 34.36, "train_steps_per_second": 8.59 }, { "epoch": 33.0, "eval_accuracy": 0.9038461538461539, "eval_loss": 0.42538413405418396, "eval_runtime": 4.5076, "eval_samples_per_second": 34.608, "eval_steps_per_second": 8.652, "step": 1551 }, { "epoch": 34.0, "step": 1598, "train_accuracy": 0.9615384615384616, "train_loss": 0.09072276204824448, "train_runtime": 13.7151, "train_samples_per_second": 34.123, "train_steps_per_second": 8.531 }, { "epoch": 34.0, "eval_accuracy": 0.9230769230769231, "eval_loss": 0.41397902369499207, "eval_runtime": 4.448, "eval_samples_per_second": 35.072, "eval_steps_per_second": 8.768, "step": 1598 }, { "epoch": 35.0, "step": 1645, "train_accuracy": 0.9594017094017094, "train_loss": 0.15199129283428192, "train_runtime": 13.6214, "train_samples_per_second": 34.358, "train_steps_per_second": 8.589 }, { "epoch": 35.0, "eval_accuracy": 0.9230769230769231, "eval_loss": 0.39104607701301575, "eval_runtime": 4.4304, "eval_samples_per_second": 35.211, "eval_steps_per_second": 8.803, "step": 1645 }, { "epoch": 36.0, "step": 1692, "train_accuracy": 0.9594017094017094, "train_loss": 0.134719118475914, "train_runtime": 13.9053, "train_samples_per_second": 33.656, "train_steps_per_second": 8.414 }, { "epoch": 36.0, "eval_accuracy": 0.9102564102564102, "eval_loss": 0.4161369204521179, "eval_runtime": 4.6871, "eval_samples_per_second": 33.283, "eval_steps_per_second": 8.321, "step": 1692 }, { "epoch": 37.0, "step": 1739, "train_accuracy": 0.9529914529914529, "train_loss": 0.16535791754722595, "train_runtime": 14.0143, "train_samples_per_second": 33.394, "train_steps_per_second": 8.349 }, { "epoch": 37.0, "eval_accuracy": 0.9102564102564102, "eval_loss": 0.43847039341926575, "eval_runtime": 4.8404, "eval_samples_per_second": 32.229, "eval_steps_per_second": 8.057, "step": 1739 }, { "epoch": 38.0, "step": 1786, "train_accuracy": 0.9487179487179487, "train_loss": 0.11996147781610489, "train_runtime": 14.1086, "train_samples_per_second": 33.171, "train_steps_per_second": 8.293 }, { "epoch": 38.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.41302695870399475, "eval_runtime": 4.7702, "eval_samples_per_second": 32.703, "eval_steps_per_second": 8.176, "step": 1786 }, { "epoch": 39.0, "step": 1833, "train_accuracy": 0.9529914529914529, "train_loss": 0.17013560235500336, "train_runtime": 13.8197, "train_samples_per_second": 33.865, "train_steps_per_second": 8.466 }, { "epoch": 39.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.4181523323059082, "eval_runtime": 5.0402, "eval_samples_per_second": 30.951, "eval_steps_per_second": 7.738, "step": 1833 }, { "epoch": 40.0, "step": 1880, "train_accuracy": 0.9551282051282052, "train_loss": 0.11466003954410553, "train_runtime": 13.6881, "train_samples_per_second": 34.19, "train_steps_per_second": 8.548 }, { "epoch": 40.0, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.4133930802345276, "eval_runtime": 4.7653, "eval_samples_per_second": 32.736, "eval_steps_per_second": 8.184, "step": 1880 } ], "logging_steps": 500, "max_steps": 1880, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 500, "total_flos": 5.128065177052447e+18, "train_batch_size": 10, "trial_name": null, "trial_params": null }