ReactionT5v2-forward / trainer_state.json
sagawa's picture
Upload 8 files
f614b6a verified
raw
history blame
40.6 kB
{
"best_metric": 0.04311952739953995,
"best_model_checkpoint": "t5/checkpoint-58320",
"epoch": 100.0,
"eval_steps": 500,
"global_step": 486000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 0.14633552730083466,
"learning_rate": 0.00099,
"loss": 0.1231,
"step": 4860
},
{
"epoch": 1.0,
"eval_accuracy": 0.6418053052843314,
"eval_loss": 0.0717623308300972,
"eval_runtime": 1333.7775,
"eval_samples_per_second": 89.456,
"eval_steps_per_second": 0.35,
"step": 4860
},
{
"epoch": 2.0,
"grad_norm": 0.10493500530719757,
"learning_rate": 0.00098,
"loss": 0.0712,
"step": 9720
},
{
"epoch": 2.0,
"eval_accuracy": 0.6884046431714369,
"eval_loss": 0.06004703789949417,
"eval_runtime": 1304.0684,
"eval_samples_per_second": 91.494,
"eval_steps_per_second": 0.358,
"step": 9720
},
{
"epoch": 3.0,
"grad_norm": 0.09630604088306427,
"learning_rate": 0.0009699999999999999,
"loss": 0.0593,
"step": 14580
},
{
"epoch": 3.0,
"eval_accuracy": 0.7176465658131835,
"eval_loss": 0.05390430614352226,
"eval_runtime": 1308.2854,
"eval_samples_per_second": 91.2,
"eval_steps_per_second": 0.357,
"step": 14580
},
{
"epoch": 4.0,
"grad_norm": 0.07841313630342484,
"learning_rate": 0.00096,
"loss": 0.0519,
"step": 19440
},
{
"epoch": 4.0,
"eval_accuracy": 0.730679294304991,
"eval_loss": 0.050438590347766876,
"eval_runtime": 1309.4742,
"eval_samples_per_second": 91.117,
"eval_steps_per_second": 0.357,
"step": 19440
},
{
"epoch": 5.0,
"grad_norm": 0.07362372428178787,
"learning_rate": 0.00095,
"loss": 0.0464,
"step": 24300
},
{
"epoch": 5.0,
"eval_accuracy": 0.7446423333193647,
"eval_loss": 0.04836108162999153,
"eval_runtime": 1308.1745,
"eval_samples_per_second": 91.207,
"eval_steps_per_second": 0.357,
"step": 24300
},
{
"epoch": 6.0,
"grad_norm": 0.05613507702946663,
"learning_rate": 0.00094,
"loss": 0.0422,
"step": 29160
},
{
"epoch": 6.0,
"eval_accuracy": 0.7498721870678456,
"eval_loss": 0.04650866985321045,
"eval_runtime": 1310.787,
"eval_samples_per_second": 91.025,
"eval_steps_per_second": 0.356,
"step": 29160
},
{
"epoch": 7.0,
"grad_norm": 0.06997396796941757,
"learning_rate": 0.00093,
"loss": 0.0385,
"step": 34020
},
{
"epoch": 7.0,
"eval_accuracy": 0.7611197250974312,
"eval_loss": 0.04569365829229355,
"eval_runtime": 1301.8483,
"eval_samples_per_second": 91.65,
"eval_steps_per_second": 0.359,
"step": 34020
},
{
"epoch": 8.0,
"grad_norm": 0.054521311074495316,
"learning_rate": 0.00092,
"loss": 0.0354,
"step": 38880
},
{
"epoch": 8.0,
"eval_accuracy": 0.7642039978208943,
"eval_loss": 0.04475805535912514,
"eval_runtime": 1307.7292,
"eval_samples_per_second": 91.238,
"eval_steps_per_second": 0.357,
"step": 38880
},
{
"epoch": 9.0,
"grad_norm": 0.06338842958211899,
"learning_rate": 0.00091,
"loss": 0.0328,
"step": 43740
},
{
"epoch": 9.0,
"eval_accuracy": 0.771554289066756,
"eval_loss": 0.0442810133099556,
"eval_runtime": 1304.569,
"eval_samples_per_second": 91.459,
"eval_steps_per_second": 0.358,
"step": 43740
},
{
"epoch": 10.0,
"grad_norm": 0.054538544267416,
"learning_rate": 0.0009000000000000001,
"loss": 0.0304,
"step": 48600
},
{
"epoch": 10.0,
"eval_accuracy": 0.7760130746343712,
"eval_loss": 0.0437050461769104,
"eval_runtime": 1308.322,
"eval_samples_per_second": 91.197,
"eval_steps_per_second": 0.357,
"step": 48600
},
{
"epoch": 11.0,
"grad_norm": 0.06947464495897293,
"learning_rate": 0.0008900000000000001,
"loss": 0.0283,
"step": 53460
},
{
"epoch": 11.0,
"eval_accuracy": 0.7780664627247202,
"eval_loss": 0.04394479840993881,
"eval_runtime": 1311.6831,
"eval_samples_per_second": 90.963,
"eval_steps_per_second": 0.356,
"step": 53460
},
{
"epoch": 12.0,
"grad_norm": 0.05090058967471123,
"learning_rate": 0.00088,
"loss": 0.0264,
"step": 58320
},
{
"epoch": 12.0,
"eval_accuracy": 0.7814943636592214,
"eval_loss": 0.04311952739953995,
"eval_runtime": 1303.6986,
"eval_samples_per_second": 91.52,
"eval_steps_per_second": 0.358,
"step": 58320
},
{
"epoch": 13.0,
"grad_norm": 0.05226626992225647,
"learning_rate": 0.00087,
"loss": 0.0248,
"step": 63180
},
{
"epoch": 13.0,
"eval_accuracy": 0.7855257092570087,
"eval_loss": 0.04451437294483185,
"eval_runtime": 1313.511,
"eval_samples_per_second": 90.837,
"eval_steps_per_second": 0.356,
"step": 63180
},
{
"epoch": 14.0,
"grad_norm": 0.053579073399305344,
"learning_rate": 0.00086,
"loss": 0.0232,
"step": 68040
},
{
"epoch": 14.0,
"eval_accuracy": 0.7867158362318233,
"eval_loss": 0.04421268403530121,
"eval_runtime": 1304.7705,
"eval_samples_per_second": 91.445,
"eval_steps_per_second": 0.358,
"step": 68040
},
{
"epoch": 15.0,
"grad_norm": 0.052565447986125946,
"learning_rate": 0.00085,
"loss": 0.0218,
"step": 72900
},
{
"epoch": 15.0,
"eval_accuracy": 0.7881657796588861,
"eval_loss": 0.04461174085736275,
"eval_runtime": 1306.4264,
"eval_samples_per_second": 91.329,
"eval_steps_per_second": 0.357,
"step": 72900
},
{
"epoch": 16.0,
"grad_norm": 0.05223050341010094,
"learning_rate": 0.00084,
"loss": 0.0206,
"step": 77760
},
{
"epoch": 16.0,
"eval_accuracy": 0.7900850689351716,
"eval_loss": 0.046072401106357574,
"eval_runtime": 1314.7849,
"eval_samples_per_second": 90.749,
"eval_steps_per_second": 0.355,
"step": 77760
},
{
"epoch": 17.0,
"grad_norm": 0.04498209059238434,
"learning_rate": 0.00083,
"loss": 0.0194,
"step": 82620
},
{
"epoch": 17.0,
"eval_accuracy": 0.7911410970959225,
"eval_loss": 0.04610202834010124,
"eval_runtime": 1308.5405,
"eval_samples_per_second": 91.182,
"eval_steps_per_second": 0.357,
"step": 82620
},
{
"epoch": 18.0,
"grad_norm": 0.047790784388780594,
"learning_rate": 0.00082,
"loss": 0.0183,
"step": 87480
},
{
"epoch": 18.0,
"eval_accuracy": 0.7914931064828395,
"eval_loss": 0.04640175402164459,
"eval_runtime": 1301.9032,
"eval_samples_per_second": 91.647,
"eval_steps_per_second": 0.359,
"step": 87480
},
{
"epoch": 19.0,
"grad_norm": 0.04554256424307823,
"learning_rate": 0.0008100000000000001,
"loss": 0.0173,
"step": 92340
},
{
"epoch": 19.0,
"eval_accuracy": 0.7922055064325525,
"eval_loss": 0.046802300959825516,
"eval_runtime": 1318.5946,
"eval_samples_per_second": 90.486,
"eval_steps_per_second": 0.354,
"step": 92340
},
{
"epoch": 20.0,
"grad_norm": 0.051229629665613174,
"learning_rate": 0.0008,
"loss": 0.0166,
"step": 97200
},
{
"epoch": 20.0,
"eval_accuracy": 0.7951556803419519,
"eval_loss": 0.04811061546206474,
"eval_runtime": 1304.3101,
"eval_samples_per_second": 91.477,
"eval_steps_per_second": 0.358,
"step": 97200
},
{
"epoch": 21.0,
"grad_norm": 0.04701264947652817,
"learning_rate": 0.00079,
"loss": 0.0158,
"step": 102060
},
{
"epoch": 21.0,
"eval_accuracy": 0.7942253698193856,
"eval_loss": 0.04858441650867462,
"eval_runtime": 1303.9515,
"eval_samples_per_second": 91.503,
"eval_steps_per_second": 0.358,
"step": 102060
},
{
"epoch": 22.0,
"grad_norm": 0.07426326721906662,
"learning_rate": 0.0007800000000000001,
"loss": 0.015,
"step": 106920
},
{
"epoch": 22.0,
"eval_accuracy": 0.7949126262414616,
"eval_loss": 0.048401448875665665,
"eval_runtime": 1304.9706,
"eval_samples_per_second": 91.431,
"eval_steps_per_second": 0.358,
"step": 106920
},
{
"epoch": 23.0,
"grad_norm": 0.05088690295815468,
"learning_rate": 0.0007700000000000001,
"loss": 0.0143,
"step": 111780
},
{
"epoch": 23.0,
"eval_accuracy": 0.7971001131458744,
"eval_loss": 0.04964574798941612,
"eval_runtime": 1311.3518,
"eval_samples_per_second": 90.986,
"eval_steps_per_second": 0.356,
"step": 111780
},
{
"epoch": 24.0,
"grad_norm": 0.04760482534766197,
"learning_rate": 0.00076,
"loss": 0.0137,
"step": 116640
},
{
"epoch": 24.0,
"eval_accuracy": 0.7959937979298496,
"eval_loss": 0.049800001084804535,
"eval_runtime": 1306.2706,
"eval_samples_per_second": 91.34,
"eval_steps_per_second": 0.358,
"step": 116640
},
{
"epoch": 25.0,
"grad_norm": 0.040201518684625626,
"learning_rate": 0.00075,
"loss": 0.0131,
"step": 121500
},
{
"epoch": 25.0,
"eval_accuracy": 0.7972677366634539,
"eval_loss": 0.0510859489440918,
"eval_runtime": 1307.6635,
"eval_samples_per_second": 91.243,
"eval_steps_per_second": 0.357,
"step": 121500
},
{
"epoch": 26.0,
"grad_norm": 0.04697073623538017,
"learning_rate": 0.00074,
"loss": 0.0125,
"step": 126360
},
{
"epoch": 26.0,
"eval_accuracy": 0.7971336378493903,
"eval_loss": 0.05105246230959892,
"eval_runtime": 1305.0277,
"eval_samples_per_second": 91.427,
"eval_steps_per_second": 0.358,
"step": 126360
},
{
"epoch": 27.0,
"grad_norm": 0.035631682723760605,
"learning_rate": 0.00073,
"loss": 0.012,
"step": 131220
},
{
"epoch": 27.0,
"eval_accuracy": 0.7990780706533127,
"eval_loss": 0.051402851939201355,
"eval_runtime": 1309.1883,
"eval_samples_per_second": 91.137,
"eval_steps_per_second": 0.357,
"step": 131220
},
{
"epoch": 28.0,
"grad_norm": 0.053142111748456955,
"learning_rate": 0.0007199999999999999,
"loss": 0.0116,
"step": 136080
},
{
"epoch": 28.0,
"eval_accuracy": 0.7987009177387587,
"eval_loss": 0.05239921808242798,
"eval_runtime": 1306.5799,
"eval_samples_per_second": 91.319,
"eval_steps_per_second": 0.357,
"step": 136080
},
{
"epoch": 29.0,
"grad_norm": 0.04080447182059288,
"learning_rate": 0.00071,
"loss": 0.0111,
"step": 140940
},
{
"epoch": 29.0,
"eval_accuracy": 0.7965804802413778,
"eval_loss": 0.05250364542007446,
"eval_runtime": 1309.3121,
"eval_samples_per_second": 91.128,
"eval_steps_per_second": 0.357,
"step": 140940
},
{
"epoch": 30.0,
"grad_norm": 0.045455146580934525,
"learning_rate": 0.0007,
"loss": 0.0107,
"step": 145800
},
{
"epoch": 30.0,
"eval_accuracy": 0.7984997695176633,
"eval_loss": 0.0543711818754673,
"eval_runtime": 1310.1708,
"eval_samples_per_second": 91.068,
"eval_steps_per_second": 0.356,
"step": 145800
},
{
"epoch": 31.0,
"grad_norm": 0.03871888667345047,
"learning_rate": 0.00069,
"loss": 0.0104,
"step": 150660
},
{
"epoch": 31.0,
"eval_accuracy": 0.7982399530654151,
"eval_loss": 0.053769443184137344,
"eval_runtime": 1306.4928,
"eval_samples_per_second": 91.325,
"eval_steps_per_second": 0.357,
"step": 150660
},
{
"epoch": 32.0,
"grad_norm": 0.048963289707899094,
"learning_rate": 0.00068,
"loss": 0.01,
"step": 155520
},
{
"epoch": 32.0,
"eval_accuracy": 0.8012404140300884,
"eval_loss": 0.05514230951666832,
"eval_runtime": 1308.1993,
"eval_samples_per_second": 91.206,
"eval_steps_per_second": 0.357,
"step": 155520
},
{
"epoch": 33.0,
"grad_norm": 0.03987804055213928,
"learning_rate": 0.00067,
"loss": 0.0097,
"step": 160380
},
{
"epoch": 33.0,
"eval_accuracy": 0.8019109081004064,
"eval_loss": 0.055017631500959396,
"eval_runtime": 1312.5155,
"eval_samples_per_second": 90.906,
"eval_steps_per_second": 0.356,
"step": 160380
},
{
"epoch": 34.0,
"grad_norm": 0.0373803973197937,
"learning_rate": 0.00066,
"loss": 0.0094,
"step": 165240
},
{
"epoch": 34.0,
"eval_accuracy": 0.7987009177387587,
"eval_loss": 0.055196575820446014,
"eval_runtime": 1306.0614,
"eval_samples_per_second": 91.355,
"eval_steps_per_second": 0.358,
"step": 165240
},
{
"epoch": 35.0,
"grad_norm": 0.039517637342214584,
"learning_rate": 0.0006500000000000001,
"loss": 0.0091,
"step": 170100
},
{
"epoch": 35.0,
"eval_accuracy": 0.8004777270251017,
"eval_loss": 0.05661753937602043,
"eval_runtime": 1306.965,
"eval_samples_per_second": 91.292,
"eval_steps_per_second": 0.357,
"step": 170100
},
{
"epoch": 36.0,
"grad_norm": 0.04135722666978836,
"learning_rate": 0.00064,
"loss": 0.0088,
"step": 174960
},
{
"epoch": 36.0,
"eval_accuracy": 0.8019025269245275,
"eval_loss": 0.05708213895559311,
"eval_runtime": 1305.0895,
"eval_samples_per_second": 91.423,
"eval_steps_per_second": 0.358,
"step": 174960
},
{
"epoch": 37.0,
"grad_norm": 0.04340599477291107,
"learning_rate": 0.00063,
"loss": 0.0085,
"step": 179820
},
{
"epoch": 37.0,
"eval_accuracy": 0.8015421363617315,
"eval_loss": 0.0565766803920269,
"eval_runtime": 1303.7543,
"eval_samples_per_second": 91.516,
"eval_steps_per_second": 0.358,
"step": 179820
},
{
"epoch": 38.0,
"grad_norm": 0.042780667543411255,
"learning_rate": 0.00062,
"loss": 0.0082,
"step": 184680
},
{
"epoch": 38.0,
"eval_accuracy": 0.7985919624523321,
"eval_loss": 0.05795786902308464,
"eval_runtime": 1306.8337,
"eval_samples_per_second": 91.301,
"eval_steps_per_second": 0.357,
"step": 184680
},
{
"epoch": 39.0,
"grad_norm": 0.03298887610435486,
"learning_rate": 0.00061,
"loss": 0.008,
"step": 189540
},
{
"epoch": 39.0,
"eval_accuracy": 0.8006537317185601,
"eval_loss": 0.05666106194257736,
"eval_runtime": 1303.7751,
"eval_samples_per_second": 91.515,
"eval_steps_per_second": 0.358,
"step": 189540
},
{
"epoch": 40.0,
"grad_norm": 0.03825366497039795,
"learning_rate": 0.0006,
"loss": 0.0077,
"step": 194400
},
{
"epoch": 40.0,
"eval_accuracy": 0.8003101035075221,
"eval_loss": 0.05909406766295433,
"eval_runtime": 1304.1065,
"eval_samples_per_second": 91.492,
"eval_steps_per_second": 0.358,
"step": 194400
},
{
"epoch": 41.0,
"grad_norm": 0.049214523285627365,
"learning_rate": 0.00059,
"loss": 0.0075,
"step": 199260
},
{
"epoch": 41.0,
"eval_accuracy": 0.8027406445124251,
"eval_loss": 0.0589471310377121,
"eval_runtime": 1305.1945,
"eval_samples_per_second": 91.415,
"eval_steps_per_second": 0.358,
"step": 199260
},
{
"epoch": 42.0,
"grad_norm": 0.03445366024971008,
"learning_rate": 0.00058,
"loss": 0.0073,
"step": 204120
},
{
"epoch": 42.0,
"eval_accuracy": 0.802765788040062,
"eval_loss": 0.05833474174141884,
"eval_runtime": 1304.7043,
"eval_samples_per_second": 91.45,
"eval_steps_per_second": 0.358,
"step": 204120
},
{
"epoch": 43.0,
"grad_norm": 0.031152933835983276,
"learning_rate": 0.00057,
"loss": 0.007,
"step": 208980
},
{
"epoch": 43.0,
"eval_accuracy": 0.8017851904622219,
"eval_loss": 0.05955711379647255,
"eval_runtime": 1302.9367,
"eval_samples_per_second": 91.574,
"eval_steps_per_second": 0.358,
"step": 208980
},
{
"epoch": 44.0,
"grad_norm": 0.028931325301527977,
"learning_rate": 0.0005600000000000001,
"loss": 0.0069,
"step": 213840
},
{
"epoch": 44.0,
"eval_accuracy": 0.803469806813896,
"eval_loss": 0.05940761789679527,
"eval_runtime": 1302.3209,
"eval_samples_per_second": 91.617,
"eval_steps_per_second": 0.359,
"step": 213840
},
{
"epoch": 45.0,
"grad_norm": 0.03164521977305412,
"learning_rate": 0.00055,
"loss": 0.0066,
"step": 218700
},
{
"epoch": 45.0,
"eval_accuracy": 0.803084272723463,
"eval_loss": 0.0604814775288105,
"eval_runtime": 1299.6461,
"eval_samples_per_second": 91.806,
"eval_steps_per_second": 0.359,
"step": 218700
},
{
"epoch": 46.0,
"grad_norm": 0.09477687627077103,
"learning_rate": 0.00054,
"loss": 0.0065,
"step": 223560
},
{
"epoch": 46.0,
"eval_accuracy": 0.803805053849055,
"eval_loss": 0.05929319187998772,
"eval_runtime": 1297.2788,
"eval_samples_per_second": 91.973,
"eval_steps_per_second": 0.36,
"step": 223560
},
{
"epoch": 47.0,
"grad_norm": 0.032785411924123764,
"learning_rate": 0.0005300000000000001,
"loss": 0.0063,
"step": 228420
},
{
"epoch": 47.0,
"eval_accuracy": 0.8049784184721116,
"eval_loss": 0.06024543195962906,
"eval_runtime": 1298.8006,
"eval_samples_per_second": 91.866,
"eval_steps_per_second": 0.36,
"step": 228420
},
{
"epoch": 48.0,
"grad_norm": 0.03677200525999069,
"learning_rate": 0.0005200000000000001,
"loss": 0.006,
"step": 233280
},
{
"epoch": 48.0,
"eval_accuracy": 0.8040145832460294,
"eval_loss": 0.061811413615942,
"eval_runtime": 1297.6127,
"eval_samples_per_second": 91.95,
"eval_steps_per_second": 0.36,
"step": 233280
},
{
"epoch": 49.0,
"grad_norm": 0.030352266505360603,
"learning_rate": 0.00051,
"loss": 0.0059,
"step": 238140
},
{
"epoch": 49.0,
"eval_accuracy": 0.8042241126430039,
"eval_loss": 0.061159055680036545,
"eval_runtime": 1299.1406,
"eval_samples_per_second": 91.841,
"eval_steps_per_second": 0.359,
"step": 238140
},
{
"epoch": 50.0,
"grad_norm": 0.02934379130601883,
"learning_rate": 0.0005,
"loss": 0.0057,
"step": 243000
},
{
"epoch": 50.0,
"eval_accuracy": 0.8055148137283661,
"eval_loss": 0.06327831000089645,
"eval_runtime": 1298.5116,
"eval_samples_per_second": 91.886,
"eval_steps_per_second": 0.36,
"step": 243000
},
{
"epoch": 51.0,
"grad_norm": 0.023088792338967323,
"learning_rate": 0.00049,
"loss": 0.0055,
"step": 247860
},
{
"epoch": 51.0,
"eval_accuracy": 0.8067971336378494,
"eval_loss": 0.06312137842178345,
"eval_runtime": 1302.1135,
"eval_samples_per_second": 91.632,
"eval_steps_per_second": 0.359,
"step": 247860
},
{
"epoch": 52.0,
"grad_norm": 0.03648848831653595,
"learning_rate": 0.00048,
"loss": 0.0053,
"step": 252720
},
{
"epoch": 52.0,
"eval_accuracy": 0.8061936889745631,
"eval_loss": 0.06350181996822357,
"eval_runtime": 1297.9213,
"eval_samples_per_second": 91.928,
"eval_steps_per_second": 0.36,
"step": 252720
},
{
"epoch": 53.0,
"grad_norm": 0.03157039359211922,
"learning_rate": 0.00047,
"loss": 0.0051,
"step": 257580
},
{
"epoch": 53.0,
"eval_accuracy": 0.8065121736579642,
"eval_loss": 0.06361949443817139,
"eval_runtime": 1305.9679,
"eval_samples_per_second": 91.361,
"eval_steps_per_second": 0.358,
"step": 257580
},
{
"epoch": 54.0,
"grad_norm": 0.026564130559563637,
"learning_rate": 0.00046,
"loss": 0.005,
"step": 262440
},
{
"epoch": 54.0,
"eval_accuracy": 0.8064535054268114,
"eval_loss": 0.06370926648378372,
"eval_runtime": 1302.6026,
"eval_samples_per_second": 91.597,
"eval_steps_per_second": 0.359,
"step": 262440
},
{
"epoch": 55.0,
"grad_norm": 0.039359357208013535,
"learning_rate": 0.00045000000000000004,
"loss": 0.0048,
"step": 267300
},
{
"epoch": 55.0,
"eval_accuracy": 0.8070569500900976,
"eval_loss": 0.0649728775024414,
"eval_runtime": 1301.934,
"eval_samples_per_second": 91.644,
"eval_steps_per_second": 0.359,
"step": 267300
},
{
"epoch": 56.0,
"grad_norm": 0.02652502991259098,
"learning_rate": 0.00044,
"loss": 0.0047,
"step": 272160
},
{
"epoch": 56.0,
"eval_accuracy": 0.80695637597955,
"eval_loss": 0.06507979333400726,
"eval_runtime": 1302.3742,
"eval_samples_per_second": 91.613,
"eval_steps_per_second": 0.359,
"step": 272160
},
{
"epoch": 57.0,
"grad_norm": 0.04170479625463486,
"learning_rate": 0.00043,
"loss": 0.0045,
"step": 277020
},
{
"epoch": 57.0,
"eval_accuracy": 0.8077442065121737,
"eval_loss": 0.06572364270687103,
"eval_runtime": 1303.9456,
"eval_samples_per_second": 91.503,
"eval_steps_per_second": 0.358,
"step": 277020
},
{
"epoch": 58.0,
"grad_norm": 0.02988004870712757,
"learning_rate": 0.00042,
"loss": 0.0044,
"step": 281880
},
{
"epoch": 58.0,
"eval_accuracy": 0.8076687759292629,
"eval_loss": 0.06484715640544891,
"eval_runtime": 1299.0165,
"eval_samples_per_second": 91.85,
"eval_steps_per_second": 0.36,
"step": 281880
},
{
"epoch": 59.0,
"grad_norm": 0.031678713858127594,
"learning_rate": 0.00041,
"loss": 0.0042,
"step": 286740
},
{
"epoch": 59.0,
"eval_accuracy": 0.8078950676779952,
"eval_loss": 0.06634358316659927,
"eval_runtime": 1304.0645,
"eval_samples_per_second": 91.495,
"eval_steps_per_second": 0.358,
"step": 286740
},
{
"epoch": 60.0,
"grad_norm": 0.020897777751088142,
"learning_rate": 0.0004,
"loss": 0.0041,
"step": 291600
},
{
"epoch": 60.0,
"eval_accuracy": 0.8078866865021163,
"eval_loss": 0.0666716918349266,
"eval_runtime": 1302.0029,
"eval_samples_per_second": 91.64,
"eval_steps_per_second": 0.359,
"step": 291600
},
{
"epoch": 61.0,
"grad_norm": 0.03830067440867424,
"learning_rate": 0.00039000000000000005,
"loss": 0.004,
"step": 296460
},
{
"epoch": 61.0,
"eval_accuracy": 0.8104597074969618,
"eval_loss": 0.0661536380648613,
"eval_runtime": 1303.993,
"eval_samples_per_second": 91.5,
"eval_steps_per_second": 0.358,
"step": 296460
},
{
"epoch": 62.0,
"grad_norm": 0.025591198354959488,
"learning_rate": 0.00038,
"loss": 0.0037,
"step": 301320
},
{
"epoch": 62.0,
"eval_accuracy": 0.809722164019612,
"eval_loss": 0.06793326884508133,
"eval_runtime": 1302.3775,
"eval_samples_per_second": 91.613,
"eval_steps_per_second": 0.359,
"step": 301320
},
{
"epoch": 63.0,
"grad_norm": 0.029204251244664192,
"learning_rate": 0.00037,
"loss": 0.0036,
"step": 306180
},
{
"epoch": 63.0,
"eval_accuracy": 0.8103172275070193,
"eval_loss": 0.06969352066516876,
"eval_runtime": 1309.132,
"eval_samples_per_second": 91.141,
"eval_steps_per_second": 0.357,
"step": 306180
},
{
"epoch": 64.0,
"grad_norm": 0.029880277812480927,
"learning_rate": 0.00035999999999999997,
"loss": 0.0035,
"step": 311040
},
{
"epoch": 64.0,
"eval_accuracy": 0.810057411054771,
"eval_loss": 0.06940728425979614,
"eval_runtime": 1308.9186,
"eval_samples_per_second": 91.155,
"eval_steps_per_second": 0.357,
"step": 311040
},
{
"epoch": 65.0,
"grad_norm": 0.030717821791768074,
"learning_rate": 0.00035,
"loss": 0.0034,
"step": 315900
},
{
"epoch": 65.0,
"eval_accuracy": 0.810082554582408,
"eval_loss": 0.06925758719444275,
"eval_runtime": 1308.5599,
"eval_samples_per_second": 91.18,
"eval_steps_per_second": 0.357,
"step": 315900
},
{
"epoch": 66.0,
"grad_norm": 0.02681083045899868,
"learning_rate": 0.00034,
"loss": 0.0032,
"step": 320760
},
{
"epoch": 66.0,
"eval_accuracy": 0.8104597074969618,
"eval_loss": 0.07081950455904007,
"eval_runtime": 1307.5379,
"eval_samples_per_second": 91.252,
"eval_steps_per_second": 0.357,
"step": 320760
},
{
"epoch": 67.0,
"grad_norm": 0.023366352543234825,
"learning_rate": 0.00033,
"loss": 0.0031,
"step": 325620
},
{
"epoch": 67.0,
"eval_accuracy": 0.8130411096676864,
"eval_loss": 0.07085347920656204,
"eval_runtime": 1306.1303,
"eval_samples_per_second": 91.35,
"eval_steps_per_second": 0.358,
"step": 325620
},
{
"epoch": 68.0,
"grad_norm": 0.018703831359744072,
"learning_rate": 0.00032,
"loss": 0.0029,
"step": 330480
},
{
"epoch": 68.0,
"eval_accuracy": 0.8118090768134769,
"eval_loss": 0.07043693959712982,
"eval_runtime": 1308.3789,
"eval_samples_per_second": 91.193,
"eval_steps_per_second": 0.357,
"step": 330480
},
{
"epoch": 69.0,
"grad_norm": 0.021384961903095245,
"learning_rate": 0.00031,
"loss": 0.0028,
"step": 335340
},
{
"epoch": 69.0,
"eval_accuracy": 0.8111721074466748,
"eval_loss": 0.0710659921169281,
"eval_runtime": 1306.3675,
"eval_samples_per_second": 91.333,
"eval_steps_per_second": 0.357,
"step": 335340
},
{
"epoch": 70.0,
"grad_norm": 0.027229884639382362,
"learning_rate": 0.0003,
"loss": 0.0027,
"step": 340200
},
{
"epoch": 70.0,
"eval_accuracy": 0.811775552109961,
"eval_loss": 0.07277531921863556,
"eval_runtime": 1306.0312,
"eval_samples_per_second": 91.357,
"eval_steps_per_second": 0.358,
"step": 340200
},
{
"epoch": 71.0,
"grad_norm": 0.024667974561452866,
"learning_rate": 0.00029,
"loss": 0.0025,
"step": 345060
},
{
"epoch": 71.0,
"eval_accuracy": 0.8127980555671961,
"eval_loss": 0.07439424097537994,
"eval_runtime": 1304.9752,
"eval_samples_per_second": 91.431,
"eval_steps_per_second": 0.358,
"step": 345060
},
{
"epoch": 72.0,
"grad_norm": 0.019673120230436325,
"learning_rate": 0.00028000000000000003,
"loss": 0.0024,
"step": 349920
},
{
"epoch": 72.0,
"eval_accuracy": 0.8129572979088966,
"eval_loss": 0.07484369724988937,
"eval_runtime": 1304.8727,
"eval_samples_per_second": 91.438,
"eval_steps_per_second": 0.358,
"step": 349920
},
{
"epoch": 73.0,
"grad_norm": 0.023998018354177475,
"learning_rate": 0.00027,
"loss": 0.0023,
"step": 354780
},
{
"epoch": 73.0,
"eval_accuracy": 0.8130662531953233,
"eval_loss": 0.07576391845941544,
"eval_runtime": 1307.7757,
"eval_samples_per_second": 91.235,
"eval_steps_per_second": 0.357,
"step": 354780
},
{
"epoch": 74.0,
"grad_norm": 0.04219399765133858,
"learning_rate": 0.00026000000000000003,
"loss": 0.0022,
"step": 359640
},
{
"epoch": 74.0,
"eval_accuracy": 0.8143485731048066,
"eval_loss": 0.07631613314151764,
"eval_runtime": 1306.0365,
"eval_samples_per_second": 91.357,
"eval_steps_per_second": 0.358,
"step": 359640
},
{
"epoch": 75.0,
"grad_norm": 0.021946126595139503,
"learning_rate": 0.00025,
"loss": 0.0021,
"step": 364500
},
{
"epoch": 75.0,
"eval_accuracy": 0.8144994342706282,
"eval_loss": 0.07692206650972366,
"eval_runtime": 1303.767,
"eval_samples_per_second": 91.516,
"eval_steps_per_second": 0.358,
"step": 364500
},
{
"epoch": 76.0,
"grad_norm": 0.023164469748735428,
"learning_rate": 0.00024,
"loss": 0.0019,
"step": 369360
},
{
"epoch": 76.0,
"eval_accuracy": 0.8133176884716926,
"eval_loss": 0.0780106782913208,
"eval_runtime": 1305.1212,
"eval_samples_per_second": 91.421,
"eval_steps_per_second": 0.358,
"step": 369360
},
{
"epoch": 77.0,
"grad_norm": 0.02851826325058937,
"learning_rate": 0.00023,
"loss": 0.0018,
"step": 374220
},
{
"epoch": 77.0,
"eval_accuracy": 0.8146335330846918,
"eval_loss": 0.0777156725525856,
"eval_runtime": 1304.8456,
"eval_samples_per_second": 91.44,
"eval_steps_per_second": 0.358,
"step": 374220
},
{
"epoch": 78.0,
"grad_norm": 0.0253597479313612,
"learning_rate": 0.00022,
"loss": 0.0017,
"step": 379080
},
{
"epoch": 78.0,
"eval_accuracy": 0.8162175753258182,
"eval_loss": 0.07899600267410278,
"eval_runtime": 1303.5402,
"eval_samples_per_second": 91.532,
"eval_steps_per_second": 0.358,
"step": 379080
},
{
"epoch": 79.0,
"grad_norm": 0.02291404828429222,
"learning_rate": 0.00021,
"loss": 0.0016,
"step": 383940
},
{
"epoch": 79.0,
"eval_accuracy": 0.8145245777982651,
"eval_loss": 0.08102333545684814,
"eval_runtime": 1304.8976,
"eval_samples_per_second": 91.436,
"eval_steps_per_second": 0.358,
"step": 383940
},
{
"epoch": 80.0,
"grad_norm": 0.029658950865268707,
"learning_rate": 0.0002,
"loss": 0.0015,
"step": 388800
},
{
"epoch": 80.0,
"eval_accuracy": 0.8161170012152705,
"eval_loss": 0.08235077559947968,
"eval_runtime": 1306.0263,
"eval_samples_per_second": 91.357,
"eval_steps_per_second": 0.358,
"step": 388800
},
{
"epoch": 81.0,
"grad_norm": 0.02459796331822872,
"learning_rate": 0.00019,
"loss": 0.0014,
"step": 393660
},
{
"epoch": 81.0,
"eval_accuracy": 0.815940996521812,
"eval_loss": 0.08271630853414536,
"eval_runtime": 1306.1311,
"eval_samples_per_second": 91.35,
"eval_steps_per_second": 0.358,
"step": 393660
},
{
"epoch": 82.0,
"grad_norm": 0.01461075246334076,
"learning_rate": 0.00017999999999999998,
"loss": 0.0013,
"step": 398520
},
{
"epoch": 82.0,
"eval_accuracy": 0.8159745212253279,
"eval_loss": 0.08540969341993332,
"eval_runtime": 1303.2922,
"eval_samples_per_second": 91.549,
"eval_steps_per_second": 0.358,
"step": 398520
},
{
"epoch": 83.0,
"grad_norm": 0.011922557838261127,
"learning_rate": 0.00017,
"loss": 0.0012,
"step": 403380
},
{
"epoch": 83.0,
"eval_accuracy": 0.8165025353057034,
"eval_loss": 0.08608754724264145,
"eval_runtime": 1300.8769,
"eval_samples_per_second": 91.719,
"eval_steps_per_second": 0.359,
"step": 403380
},
{
"epoch": 84.0,
"grad_norm": 0.006066465750336647,
"learning_rate": 0.00016,
"loss": 0.0011,
"step": 408240
},
{
"epoch": 84.0,
"eval_accuracy": 0.8179943846121611,
"eval_loss": 0.0866456851363182,
"eval_runtime": 1297.1839,
"eval_samples_per_second": 91.98,
"eval_steps_per_second": 0.36,
"step": 408240
},
{
"epoch": 85.0,
"grad_norm": 0.019887538626790047,
"learning_rate": 0.00015,
"loss": 0.001,
"step": 413100
},
{
"epoch": 85.0,
"eval_accuracy": 0.8174747517076646,
"eval_loss": 0.08994536101818085,
"eval_runtime": 1297.8948,
"eval_samples_per_second": 91.93,
"eval_steps_per_second": 0.36,
"step": 413100
},
{
"epoch": 86.0,
"grad_norm": 0.019588502123951912,
"learning_rate": 0.00014000000000000001,
"loss": 0.0009,
"step": 417960
},
{
"epoch": 86.0,
"eval_accuracy": 0.8186648786824792,
"eval_loss": 0.08895347267389297,
"eval_runtime": 1297.8927,
"eval_samples_per_second": 91.93,
"eval_steps_per_second": 0.36,
"step": 417960
},
{
"epoch": 87.0,
"grad_norm": 0.019738251343369484,
"learning_rate": 0.00013000000000000002,
"loss": 0.0008,
"step": 422820
},
{
"epoch": 87.0,
"eval_accuracy": 0.8184553492855048,
"eval_loss": 0.09202321618795395,
"eval_runtime": 1297.7904,
"eval_samples_per_second": 91.937,
"eval_steps_per_second": 0.36,
"step": 422820
},
{
"epoch": 88.0,
"grad_norm": 0.0069356439635157585,
"learning_rate": 0.00012,
"loss": 0.0008,
"step": 427680
},
{
"epoch": 88.0,
"eval_accuracy": 0.8189749821900013,
"eval_loss": 0.09301886707544327,
"eval_runtime": 1297.4851,
"eval_samples_per_second": 91.959,
"eval_steps_per_second": 0.36,
"step": 427680
},
{
"epoch": 89.0,
"grad_norm": 0.0168699249625206,
"learning_rate": 0.00011,
"loss": 0.0007,
"step": 432540
},
{
"epoch": 89.0,
"eval_accuracy": 0.8196370950844404,
"eval_loss": 0.09475909918546677,
"eval_runtime": 1297.2344,
"eval_samples_per_second": 91.976,
"eval_steps_per_second": 0.36,
"step": 432540
},
{
"epoch": 90.0,
"grad_norm": 0.00997143518179655,
"learning_rate": 0.0001,
"loss": 0.0006,
"step": 437400
},
{
"epoch": 90.0,
"eval_accuracy": 0.8189917445417592,
"eval_loss": 0.09576508402824402,
"eval_runtime": 1296.498,
"eval_samples_per_second": 92.029,
"eval_steps_per_second": 0.36,
"step": 437400
},
{
"epoch": 91.0,
"grad_norm": 0.017046066001057625,
"learning_rate": 8.999999999999999e-05,
"loss": 0.0005,
"step": 442260
},
{
"epoch": 91.0,
"eval_accuracy": 0.8195365209738926,
"eval_loss": 0.09832222014665604,
"eval_runtime": 1296.9323,
"eval_samples_per_second": 91.998,
"eval_steps_per_second": 0.36,
"step": 442260
},
{
"epoch": 92.0,
"grad_norm": 0.01709928549826145,
"learning_rate": 8e-05,
"loss": 0.0005,
"step": 447120
},
{
"epoch": 92.0,
"eval_accuracy": 0.8208523655868918,
"eval_loss": 0.10070452094078064,
"eval_runtime": 1296.8404,
"eval_samples_per_second": 92.004,
"eval_steps_per_second": 0.36,
"step": 447120
},
{
"epoch": 93.0,
"grad_norm": 0.014434403739869595,
"learning_rate": 7.000000000000001e-05,
"loss": 0.0004,
"step": 451980
},
{
"epoch": 93.0,
"eval_accuracy": 0.8202153962200897,
"eval_loss": 0.10308787226676941,
"eval_runtime": 1304.3584,
"eval_samples_per_second": 91.474,
"eval_steps_per_second": 0.358,
"step": 451980
},
{
"epoch": 94.0,
"grad_norm": 0.014810960739850998,
"learning_rate": 6e-05,
"loss": 0.0004,
"step": 456840
},
{
"epoch": 94.0,
"eval_accuracy": 0.82111218203914,
"eval_loss": 0.10589364916086197,
"eval_runtime": 1302.0953,
"eval_samples_per_second": 91.633,
"eval_steps_per_second": 0.359,
"step": 456840
},
{
"epoch": 95.0,
"grad_norm": 0.0034873096738010645,
"learning_rate": 5e-05,
"loss": 0.0003,
"step": 461700
},
{
"epoch": 95.0,
"eval_accuracy": 0.8215982902401207,
"eval_loss": 0.10970806330442429,
"eval_runtime": 1301.0636,
"eval_samples_per_second": 91.706,
"eval_steps_per_second": 0.359,
"step": 461700
},
{
"epoch": 96.0,
"grad_norm": 0.002740664640441537,
"learning_rate": 4e-05,
"loss": 0.0003,
"step": 466560
},
{
"epoch": 96.0,
"eval_accuracy": 0.8221179231446172,
"eval_loss": 0.11456754803657532,
"eval_runtime": 1302.7083,
"eval_samples_per_second": 91.59,
"eval_steps_per_second": 0.358,
"step": 466560
},
{
"epoch": 97.0,
"grad_norm": 0.0014008020516484976,
"learning_rate": 3e-05,
"loss": 0.0002,
"step": 471420
},
{
"epoch": 97.0,
"eval_accuracy": 0.8224364078280183,
"eval_loss": 0.11764019727706909,
"eval_runtime": 1302.3991,
"eval_samples_per_second": 91.612,
"eval_steps_per_second": 0.359,
"step": 471420
},
{
"epoch": 98.0,
"grad_norm": 0.0054730623960494995,
"learning_rate": 2e-05,
"loss": 0.0002,
"step": 476280
},
{
"epoch": 98.0,
"eval_accuracy": 0.8226543184008717,
"eval_loss": 0.12025844305753708,
"eval_runtime": 1302.1251,
"eval_samples_per_second": 91.631,
"eval_steps_per_second": 0.359,
"step": 476280
},
{
"epoch": 99.0,
"grad_norm": 0.0020766761153936386,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 481140
},
{
"epoch": 99.0,
"eval_accuracy": 0.8226878431043876,
"eval_loss": 0.1223362609744072,
"eval_runtime": 1301.9742,
"eval_samples_per_second": 91.642,
"eval_steps_per_second": 0.359,
"step": 481140
},
{
"epoch": 100.0,
"grad_norm": 0.018789879977703094,
"learning_rate": 0.0,
"loss": 0.0001,
"step": 486000
},
{
"epoch": 100.0,
"eval_accuracy": 0.8229895654360306,
"eval_loss": 0.12378211319446564,
"eval_runtime": 1301.6752,
"eval_samples_per_second": 91.663,
"eval_steps_per_second": 0.359,
"step": 486000
}
],
"logging_steps": 500,
"max_steps": 486000,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"total_flos": 2.2219508552555553e+19,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}