{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0009372071227742, "eval_steps": 67, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037488284910965324, "eval_loss": 2.361954689025879, "eval_runtime": 3.5902, "eval_samples_per_second": 125.342, "eval_steps_per_second": 15.877, "step": 1 }, { "epoch": 0.01874414245548266, "grad_norm": 0.11034798622131348, "learning_rate": 1.6666666666666667e-05, "loss": 2.0049, "step": 5 }, { "epoch": 0.03748828491096532, "grad_norm": 0.24442698061466217, "learning_rate": 3.3333333333333335e-05, "loss": 2.3, "step": 10 }, { "epoch": 0.056232427366447985, "grad_norm": 0.1074686050415039, "learning_rate": 5e-05, "loss": 2.2395, "step": 15 }, { "epoch": 0.07497656982193064, "grad_norm": 0.22002893686294556, "learning_rate": 6.666666666666667e-05, "loss": 2.1767, "step": 20 }, { "epoch": 0.09372071227741331, "grad_norm": 0.819185733795166, "learning_rate": 8.333333333333334e-05, "loss": 2.6519, "step": 25 }, { "epoch": 0.11246485473289597, "grad_norm": 0.14553327858448029, "learning_rate": 0.0001, "loss": 2.0178, "step": 30 }, { "epoch": 0.13120899718837864, "grad_norm": 0.33844277262687683, "learning_rate": 9.989021978333995e-05, "loss": 2.1738, "step": 35 }, { "epoch": 0.14995313964386128, "grad_norm": 0.18579810857772827, "learning_rate": 9.956136120119858e-05, "loss": 2.1616, "step": 40 }, { "epoch": 0.16869728209934395, "grad_norm": 0.2519240081310272, "learning_rate": 9.901486834023182e-05, "loss": 2.0246, "step": 45 }, { "epoch": 0.18744142455482662, "grad_norm": 0.8189491033554077, "learning_rate": 9.825314096462685e-05, "loss": 2.3638, "step": 50 }, { "epoch": 0.20618556701030927, "grad_norm": 0.2149842232465744, "learning_rate": 9.72795239782369e-05, "loss": 1.9553, "step": 55 }, { "epoch": 0.22492970946579194, "grad_norm": 0.3894648253917694, "learning_rate": 9.609829273641034e-05, "loss": 2.152, "step": 60 }, { "epoch": 0.2436738519212746, "grad_norm": 0.20155246555805206, "learning_rate": 9.47146342720133e-05, "loss": 2.1133, "step": 65 }, { "epoch": 0.2511715089034677, "eval_loss": 2.1045072078704834, "eval_runtime": 3.6115, "eval_samples_per_second": 124.601, "eval_steps_per_second": 15.783, "step": 67 }, { "epoch": 0.2624179943767573, "grad_norm": 0.2783915400505066, "learning_rate": 9.3134624518086e-05, "loss": 1.998, "step": 70 }, { "epoch": 0.28116213683223995, "grad_norm": 1.4236199855804443, "learning_rate": 9.136520162715287e-05, "loss": 2.3941, "step": 75 }, { "epoch": 0.29990627928772257, "grad_norm": 0.2663502097129822, "learning_rate": 8.94141355043471e-05, "loss": 1.8989, "step": 80 }, { "epoch": 0.31865042174320524, "grad_norm": 0.4793087840080261, "learning_rate": 8.728999368813591e-05, "loss": 2.1174, "step": 85 }, { "epoch": 0.3373945641986879, "grad_norm": 0.20656132698059082, "learning_rate": 8.500210372847127e-05, "loss": 2.0367, "step": 90 }, { "epoch": 0.3561387066541706, "grad_norm": 0.30166926980018616, "learning_rate": 8.256051222757188e-05, "loss": 1.9734, "step": 95 }, { "epoch": 0.37488284910965325, "grad_norm": 1.0557372570037842, "learning_rate": 7.997594072319625e-05, "loss": 2.2227, "step": 100 }, { "epoch": 0.3936269915651359, "grad_norm": 0.2666870355606079, "learning_rate": 7.725973860813338e-05, "loss": 1.8634, "step": 105 }, { "epoch": 0.41237113402061853, "grad_norm": 0.47340846061706543, "learning_rate": 7.442383329265062e-05, "loss": 2.0263, "step": 110 }, { "epoch": 0.4311152764761012, "grad_norm": 0.23562197387218475, "learning_rate": 7.14806778287464e-05, "loss": 2.0062, "step": 115 }, { "epoch": 0.4498594189315839, "grad_norm": 0.3509786128997803, "learning_rate": 6.844319622620039e-05, "loss": 1.9304, "step": 120 }, { "epoch": 0.46860356138706655, "grad_norm": 1.2238250970840454, "learning_rate": 6.532472670054974e-05, "loss": 2.2562, "step": 125 }, { "epoch": 0.4873477038425492, "grad_norm": 0.2801048159599304, "learning_rate": 6.213896310220139e-05, "loss": 1.8556, "step": 130 }, { "epoch": 0.5023430178069354, "eval_loss": 2.027282953262329, "eval_runtime": 3.989, "eval_samples_per_second": 112.81, "eval_steps_per_second": 14.289, "step": 134 }, { "epoch": 0.5060918462980318, "grad_norm": 0.6263522505760193, "learning_rate": 5.889989478387753e-05, "loss": 2.0187, "step": 135 }, { "epoch": 0.5248359887535146, "grad_norm": 0.2488415390253067, "learning_rate": 5.5621745170448616e-05, "loss": 1.9467, "step": 140 }, { "epoch": 0.5435801312089972, "grad_norm": 0.3804314136505127, "learning_rate": 5.2318909300906926e-05, "loss": 2.0112, "step": 145 }, { "epoch": 0.5623242736644799, "grad_norm": 1.3460745811462402, "learning_rate": 4.900589061674649e-05, "loss": 2.1612, "step": 150 }, { "epoch": 0.5810684161199625, "grad_norm": 0.30925655364990234, "learning_rate": 4.569723727432517e-05, "loss": 1.7977, "step": 155 }, { "epoch": 0.5998125585754451, "grad_norm": 0.553766667842865, "learning_rate": 4.240747826087429e-05, "loss": 2.0045, "step": 160 }, { "epoch": 0.6185567010309279, "grad_norm": 0.26959770917892456, "learning_rate": 3.91510595946841e-05, "loss": 1.948, "step": 165 }, { "epoch": 0.6373008434864105, "grad_norm": 0.42600250244140625, "learning_rate": 3.5942280889623026e-05, "loss": 1.9507, "step": 170 }, { "epoch": 0.6560449859418932, "grad_norm": 1.3592358827590942, "learning_rate": 3.27952325625493e-05, "loss": 2.1947, "step": 175 }, { "epoch": 0.6747891283973758, "grad_norm": 0.3122025728225708, "learning_rate": 2.9723733959350307e-05, "loss": 1.772, "step": 180 }, { "epoch": 0.6935332708528584, "grad_norm": 0.6400068402290344, "learning_rate": 2.674127267131131e-05, "loss": 2.0385, "step": 185 }, { "epoch": 0.7122774133083412, "grad_norm": 0.2724437415599823, "learning_rate": 2.3860945308287552e-05, "loss": 1.9093, "step": 190 }, { "epoch": 0.7310215557638238, "grad_norm": 0.3942809998989105, "learning_rate": 2.1095399988757574e-05, "loss": 1.9067, "step": 195 }, { "epoch": 0.7497656982193065, "grad_norm": 1.2149404287338257, "learning_rate": 1.8456780799295886e-05, "loss": 2.1768, "step": 200 }, { "epoch": 0.753514526710403, "eval_loss": 1.981666922569275, "eval_runtime": 3.475, "eval_samples_per_second": 129.496, "eval_steps_per_second": 16.403, "step": 201 }, { "epoch": 0.7685098406747891, "grad_norm": 0.31344324350357056, "learning_rate": 1.5956674467354537e-05, "loss": 1.7979, "step": 205 }, { "epoch": 0.7872539831302718, "grad_norm": 0.6548614501953125, "learning_rate": 1.3606059481525296e-05, "loss": 1.9015, "step": 210 }, { "epoch": 0.8059981255857545, "grad_norm": 0.28455740213394165, "learning_rate": 1.1415257882705311e-05, "loss": 1.9196, "step": 215 }, { "epoch": 0.8247422680412371, "grad_norm": 0.42264482378959656, "learning_rate": 9.393889937861694e-06, "loss": 1.947, "step": 220 }, { "epoch": 0.8434864104967198, "grad_norm": 1.1509761810302734, "learning_rate": 7.550831895431798e-06, "loss": 2.0977, "step": 225 }, { "epoch": 0.8622305529522024, "grad_norm": 0.3231133818626404, "learning_rate": 5.894177007864271e-06, "loss": 1.8388, "step": 230 }, { "epoch": 0.8809746954076851, "grad_norm": 0.6163565516471863, "learning_rate": 4.4311999924586065e-06, "loss": 1.9681, "step": 235 }, { "epoch": 0.8997188378631678, "grad_norm": 0.27972695231437683, "learning_rate": 3.1683250865636114e-06, "loss": 1.9602, "step": 240 }, { "epoch": 0.9184629803186504, "grad_norm": 0.40869590640068054, "learning_rate": 2.1110978374106192e-06, "loss": 1.893, "step": 245 }, { "epoch": 0.9372071227741331, "grad_norm": 1.1225160360336304, "learning_rate": 1.2641607504584928e-06, "loss": 2.0743, "step": 250 }, { "epoch": 0.9559512652296157, "grad_norm": 0.32928183674812317, "learning_rate": 6.312329031833319e-07, "loss": 1.8119, "step": 255 }, { "epoch": 0.9746954076850984, "grad_norm": 0.6379702091217041, "learning_rate": 2.1509361383330596e-07, "loss": 1.9946, "step": 260 }, { "epoch": 0.993439550140581, "grad_norm": 0.35420289635658264, "learning_rate": 1.7570236862241017e-08, "loss": 2.0124, "step": 265 } ], "logging_steps": 5, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.53863352458281e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }