{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0009372071227742,
  "eval_steps": 67,
  "global_step": 267,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0037488284910965324,
      "eval_loss": 2.361954689025879,
      "eval_runtime": 3.5902,
      "eval_samples_per_second": 125.342,
      "eval_steps_per_second": 15.877,
      "step": 1
    },
    {
      "epoch": 0.01874414245548266,
      "grad_norm": 0.11034798622131348,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 2.0049,
      "step": 5
    },
    {
      "epoch": 0.03748828491096532,
      "grad_norm": 0.24442698061466217,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 2.3,
      "step": 10
    },
    {
      "epoch": 0.056232427366447985,
      "grad_norm": 0.1074686050415039,
      "learning_rate": 5e-05,
      "loss": 2.2395,
      "step": 15
    },
    {
      "epoch": 0.07497656982193064,
      "grad_norm": 0.22002893686294556,
      "learning_rate": 6.666666666666667e-05,
      "loss": 2.1767,
      "step": 20
    },
    {
      "epoch": 0.09372071227741331,
      "grad_norm": 0.819185733795166,
      "learning_rate": 8.333333333333334e-05,
      "loss": 2.6519,
      "step": 25
    },
    {
      "epoch": 0.11246485473289597,
      "grad_norm": 0.14553327858448029,
      "learning_rate": 0.0001,
      "loss": 2.0178,
      "step": 30
    },
    {
      "epoch": 0.13120899718837864,
      "grad_norm": 0.33844277262687683,
      "learning_rate": 9.989021978333995e-05,
      "loss": 2.1738,
      "step": 35
    },
    {
      "epoch": 0.14995313964386128,
      "grad_norm": 0.18579810857772827,
      "learning_rate": 9.956136120119858e-05,
      "loss": 2.1616,
      "step": 40
    },
    {
      "epoch": 0.16869728209934395,
      "grad_norm": 0.2519240081310272,
      "learning_rate": 9.901486834023182e-05,
      "loss": 2.0246,
      "step": 45
    },
    {
      "epoch": 0.18744142455482662,
      "grad_norm": 0.8189491033554077,
      "learning_rate": 9.825314096462685e-05,
      "loss": 2.3638,
      "step": 50
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 0.2149842232465744,
      "learning_rate": 9.72795239782369e-05,
      "loss": 1.9553,
      "step": 55
    },
    {
      "epoch": 0.22492970946579194,
      "grad_norm": 0.3894648253917694,
      "learning_rate": 9.609829273641034e-05,
      "loss": 2.152,
      "step": 60
    },
    {
      "epoch": 0.2436738519212746,
      "grad_norm": 0.20155246555805206,
      "learning_rate": 9.47146342720133e-05,
      "loss": 2.1133,
      "step": 65
    },
    {
      "epoch": 0.2511715089034677,
      "eval_loss": 2.1045072078704834,
      "eval_runtime": 3.6115,
      "eval_samples_per_second": 124.601,
      "eval_steps_per_second": 15.783,
      "step": 67
    },
    {
      "epoch": 0.2624179943767573,
      "grad_norm": 0.2783915400505066,
      "learning_rate": 9.3134624518086e-05,
      "loss": 1.998,
      "step": 70
    },
    {
      "epoch": 0.28116213683223995,
      "grad_norm": 1.4236199855804443,
      "learning_rate": 9.136520162715287e-05,
      "loss": 2.3941,
      "step": 75
    },
    {
      "epoch": 0.29990627928772257,
      "grad_norm": 0.2663502097129822,
      "learning_rate": 8.94141355043471e-05,
      "loss": 1.8989,
      "step": 80
    },
    {
      "epoch": 0.31865042174320524,
      "grad_norm": 0.4793087840080261,
      "learning_rate": 8.728999368813591e-05,
      "loss": 2.1174,
      "step": 85
    },
    {
      "epoch": 0.3373945641986879,
      "grad_norm": 0.20656132698059082,
      "learning_rate": 8.500210372847127e-05,
      "loss": 2.0367,
      "step": 90
    },
    {
      "epoch": 0.3561387066541706,
      "grad_norm": 0.30166926980018616,
      "learning_rate": 8.256051222757188e-05,
      "loss": 1.9734,
      "step": 95
    },
    {
      "epoch": 0.37488284910965325,
      "grad_norm": 1.0557372570037842,
      "learning_rate": 7.997594072319625e-05,
      "loss": 2.2227,
      "step": 100
    },
    {
      "epoch": 0.3936269915651359,
      "grad_norm": 0.2666870355606079,
      "learning_rate": 7.725973860813338e-05,
      "loss": 1.8634,
      "step": 105
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 0.47340846061706543,
      "learning_rate": 7.442383329265062e-05,
      "loss": 2.0263,
      "step": 110
    },
    {
      "epoch": 0.4311152764761012,
      "grad_norm": 0.23562197387218475,
      "learning_rate": 7.14806778287464e-05,
      "loss": 2.0062,
      "step": 115
    },
    {
      "epoch": 0.4498594189315839,
      "grad_norm": 0.3509786128997803,
      "learning_rate": 6.844319622620039e-05,
      "loss": 1.9304,
      "step": 120
    },
    {
      "epoch": 0.46860356138706655,
      "grad_norm": 1.2238250970840454,
      "learning_rate": 6.532472670054974e-05,
      "loss": 2.2562,
      "step": 125
    },
    {
      "epoch": 0.4873477038425492,
      "grad_norm": 0.2801048159599304,
      "learning_rate": 6.213896310220139e-05,
      "loss": 1.8556,
      "step": 130
    },
    {
      "epoch": 0.5023430178069354,
      "eval_loss": 2.027282953262329,
      "eval_runtime": 3.989,
      "eval_samples_per_second": 112.81,
      "eval_steps_per_second": 14.289,
      "step": 134
    },
    {
      "epoch": 0.5060918462980318,
      "grad_norm": 0.6263522505760193,
      "learning_rate": 5.889989478387753e-05,
      "loss": 2.0187,
      "step": 135
    },
    {
      "epoch": 0.5248359887535146,
      "grad_norm": 0.2488415390253067,
      "learning_rate": 5.5621745170448616e-05,
      "loss": 1.9467,
      "step": 140
    },
    {
      "epoch": 0.5435801312089972,
      "grad_norm": 0.3804314136505127,
      "learning_rate": 5.2318909300906926e-05,
      "loss": 2.0112,
      "step": 145
    },
    {
      "epoch": 0.5623242736644799,
      "grad_norm": 1.3460745811462402,
      "learning_rate": 4.900589061674649e-05,
      "loss": 2.1612,
      "step": 150
    },
    {
      "epoch": 0.5810684161199625,
      "grad_norm": 0.30925655364990234,
      "learning_rate": 4.569723727432517e-05,
      "loss": 1.7977,
      "step": 155
    },
    {
      "epoch": 0.5998125585754451,
      "grad_norm": 0.553766667842865,
      "learning_rate": 4.240747826087429e-05,
      "loss": 2.0045,
      "step": 160
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 0.26959770917892456,
      "learning_rate": 3.91510595946841e-05,
      "loss": 1.948,
      "step": 165
    },
    {
      "epoch": 0.6373008434864105,
      "grad_norm": 0.42600250244140625,
      "learning_rate": 3.5942280889623026e-05,
      "loss": 1.9507,
      "step": 170
    },
    {
      "epoch": 0.6560449859418932,
      "grad_norm": 1.3592358827590942,
      "learning_rate": 3.27952325625493e-05,
      "loss": 2.1947,
      "step": 175
    },
    {
      "epoch": 0.6747891283973758,
      "grad_norm": 0.3122025728225708,
      "learning_rate": 2.9723733959350307e-05,
      "loss": 1.772,
      "step": 180
    },
    {
      "epoch": 0.6935332708528584,
      "grad_norm": 0.6400068402290344,
      "learning_rate": 2.674127267131131e-05,
      "loss": 2.0385,
      "step": 185
    },
    {
      "epoch": 0.7122774133083412,
      "grad_norm": 0.2724437415599823,
      "learning_rate": 2.3860945308287552e-05,
      "loss": 1.9093,
      "step": 190
    },
    {
      "epoch": 0.7310215557638238,
      "grad_norm": 0.3942809998989105,
      "learning_rate": 2.1095399988757574e-05,
      "loss": 1.9067,
      "step": 195
    },
    {
      "epoch": 0.7497656982193065,
      "grad_norm": 1.2149404287338257,
      "learning_rate": 1.8456780799295886e-05,
      "loss": 2.1768,
      "step": 200
    },
    {
      "epoch": 0.753514526710403,
      "eval_loss": 1.981666922569275,
      "eval_runtime": 3.475,
      "eval_samples_per_second": 129.496,
      "eval_steps_per_second": 16.403,
      "step": 201
    },
    {
      "epoch": 0.7685098406747891,
      "grad_norm": 0.31344324350357056,
      "learning_rate": 1.5956674467354537e-05,
      "loss": 1.7979,
      "step": 205
    },
    {
      "epoch": 0.7872539831302718,
      "grad_norm": 0.6548614501953125,
      "learning_rate": 1.3606059481525296e-05,
      "loss": 1.9015,
      "step": 210
    },
    {
      "epoch": 0.8059981255857545,
      "grad_norm": 0.28455740213394165,
      "learning_rate": 1.1415257882705311e-05,
      "loss": 1.9196,
      "step": 215
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 0.42264482378959656,
      "learning_rate": 9.393889937861694e-06,
      "loss": 1.947,
      "step": 220
    },
    {
      "epoch": 0.8434864104967198,
      "grad_norm": 1.1509761810302734,
      "learning_rate": 7.550831895431798e-06,
      "loss": 2.0977,
      "step": 225
    },
    {
      "epoch": 0.8622305529522024,
      "grad_norm": 0.3231133818626404,
      "learning_rate": 5.894177007864271e-06,
      "loss": 1.8388,
      "step": 230
    },
    {
      "epoch": 0.8809746954076851,
      "grad_norm": 0.6163565516471863,
      "learning_rate": 4.4311999924586065e-06,
      "loss": 1.9681,
      "step": 235
    },
    {
      "epoch": 0.8997188378631678,
      "grad_norm": 0.27972695231437683,
      "learning_rate": 3.1683250865636114e-06,
      "loss": 1.9602,
      "step": 240
    },
    {
      "epoch": 0.9184629803186504,
      "grad_norm": 0.40869590640068054,
      "learning_rate": 2.1110978374106192e-06,
      "loss": 1.893,
      "step": 245
    },
    {
      "epoch": 0.9372071227741331,
      "grad_norm": 1.1225160360336304,
      "learning_rate": 1.2641607504584928e-06,
      "loss": 2.0743,
      "step": 250
    },
    {
      "epoch": 0.9559512652296157,
      "grad_norm": 0.32928183674812317,
      "learning_rate": 6.312329031833319e-07,
      "loss": 1.8119,
      "step": 255
    },
    {
      "epoch": 0.9746954076850984,
      "grad_norm": 0.6379702091217041,
      "learning_rate": 2.1509361383330596e-07,
      "loss": 1.9946,
      "step": 260
    },
    {
      "epoch": 0.993439550140581,
      "grad_norm": 0.35420289635658264,
      "learning_rate": 1.7570236862241017e-08,
      "loss": 2.0124,
      "step": 265
    }
  ],
  "logging_steps": 5,
  "max_steps": 267,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.53863352458281e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}