|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 1856, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005387931034482759, |
|
"grad_norm": 24.06527582915772, |
|
"learning_rate": 5.376344086021506e-08, |
|
"loss": 1.3568, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0026939655172413795, |
|
"grad_norm": 23.2847675267083, |
|
"learning_rate": 2.688172043010753e-07, |
|
"loss": 1.3668, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005387931034482759, |
|
"grad_norm": 16.195930738756566, |
|
"learning_rate": 5.376344086021506e-07, |
|
"loss": 1.3204, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008081896551724138, |
|
"grad_norm": 12.068298869370592, |
|
"learning_rate": 8.064516129032258e-07, |
|
"loss": 1.153, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.010775862068965518, |
|
"grad_norm": 8.564123494535863, |
|
"learning_rate": 1.0752688172043011e-06, |
|
"loss": 1.0452, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013469827586206896, |
|
"grad_norm": 3.533789309391932, |
|
"learning_rate": 1.3440860215053765e-06, |
|
"loss": 0.9515, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.016163793103448277, |
|
"grad_norm": 3.24461197562523, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.9001, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018857758620689655, |
|
"grad_norm": 2.990611660406535, |
|
"learning_rate": 1.881720430107527e-06, |
|
"loss": 0.8773, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.021551724137931036, |
|
"grad_norm": 3.0063853939062346, |
|
"learning_rate": 2.1505376344086023e-06, |
|
"loss": 0.851, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.024245689655172414, |
|
"grad_norm": 2.956366561006899, |
|
"learning_rate": 2.4193548387096776e-06, |
|
"loss": 0.8574, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02693965517241379, |
|
"grad_norm": 2.983398789032246, |
|
"learning_rate": 2.688172043010753e-06, |
|
"loss": 0.84, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.029633620689655173, |
|
"grad_norm": 2.964731632227324, |
|
"learning_rate": 2.9569892473118283e-06, |
|
"loss": 0.824, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.032327586206896554, |
|
"grad_norm": 2.9208803498660623, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 0.8138, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03502155172413793, |
|
"grad_norm": 3.2063303145455366, |
|
"learning_rate": 3.494623655913979e-06, |
|
"loss": 0.8009, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.03771551724137931, |
|
"grad_norm": 3.242653708652505, |
|
"learning_rate": 3.763440860215054e-06, |
|
"loss": 0.792, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04040948275862069, |
|
"grad_norm": 3.1462448663803846, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.7902, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04310344827586207, |
|
"grad_norm": 3.0229975986392716, |
|
"learning_rate": 4.3010752688172045e-06, |
|
"loss": 0.7699, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.045797413793103446, |
|
"grad_norm": 3.12423094671722, |
|
"learning_rate": 4.56989247311828e-06, |
|
"loss": 0.7644, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04849137931034483, |
|
"grad_norm": 3.2796596768473902, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 0.7712, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05118534482758621, |
|
"grad_norm": 3.0184242042359943, |
|
"learning_rate": 5.1075268817204305e-06, |
|
"loss": 0.7546, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05387931034482758, |
|
"grad_norm": 3.0881392753326447, |
|
"learning_rate": 5.376344086021506e-06, |
|
"loss": 0.7487, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.056573275862068964, |
|
"grad_norm": 3.4110841994799657, |
|
"learning_rate": 5.645161290322582e-06, |
|
"loss": 0.7496, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.059267241379310345, |
|
"grad_norm": 2.92733810047956, |
|
"learning_rate": 5.9139784946236566e-06, |
|
"loss": 0.7368, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06196120689655173, |
|
"grad_norm": 3.3139008810992046, |
|
"learning_rate": 6.182795698924732e-06, |
|
"loss": 0.7277, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.06465517241379311, |
|
"grad_norm": 3.1747479144288455, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.7283, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06734913793103449, |
|
"grad_norm": 2.894519107469561, |
|
"learning_rate": 6.720430107526882e-06, |
|
"loss": 0.7282, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07004310344827586, |
|
"grad_norm": 2.8405180587913987, |
|
"learning_rate": 6.989247311827958e-06, |
|
"loss": 0.7123, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07273706896551724, |
|
"grad_norm": 2.7948188759602717, |
|
"learning_rate": 7.258064516129033e-06, |
|
"loss": 0.7193, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.07543103448275862, |
|
"grad_norm": 3.154756842274138, |
|
"learning_rate": 7.526881720430108e-06, |
|
"loss": 0.7207, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.078125, |
|
"grad_norm": 2.9457108929499207, |
|
"learning_rate": 7.795698924731183e-06, |
|
"loss": 0.7212, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.08081896551724138, |
|
"grad_norm": 2.8503644648477517, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.72, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08351293103448276, |
|
"grad_norm": 2.949964251276019, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.723, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.08620689655172414, |
|
"grad_norm": 2.959116036250926, |
|
"learning_rate": 8.602150537634409e-06, |
|
"loss": 0.7158, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08890086206896551, |
|
"grad_norm": 2.7803395603035517, |
|
"learning_rate": 8.870967741935484e-06, |
|
"loss": 0.7067, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.09159482758620689, |
|
"grad_norm": 2.8799202670097115, |
|
"learning_rate": 9.13978494623656e-06, |
|
"loss": 0.71, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09428879310344827, |
|
"grad_norm": 2.9537594310040687, |
|
"learning_rate": 9.408602150537635e-06, |
|
"loss": 0.7152, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.09698275862068965, |
|
"grad_norm": 2.8628517050727873, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 0.7054, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09967672413793104, |
|
"grad_norm": 2.8896943288351586, |
|
"learning_rate": 9.946236559139786e-06, |
|
"loss": 0.7235, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.10237068965517242, |
|
"grad_norm": 2.938518709851193, |
|
"learning_rate": 9.999858445152838e-06, |
|
"loss": 0.7122, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1050646551724138, |
|
"grad_norm": 2.58690085015114, |
|
"learning_rate": 9.999283392323047e-06, |
|
"loss": 0.7061, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.10775862068965517, |
|
"grad_norm": 2.763129396160507, |
|
"learning_rate": 9.998266045169356e-06, |
|
"loss": 0.7063, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11045258620689655, |
|
"grad_norm": 2.816275952414151, |
|
"learning_rate": 9.996806493698038e-06, |
|
"loss": 0.7087, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.11314655172413793, |
|
"grad_norm": 2.73738463168911, |
|
"learning_rate": 9.994904867037867e-06, |
|
"loss": 0.6986, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.11584051724137931, |
|
"grad_norm": 2.810575578616004, |
|
"learning_rate": 9.99256133342869e-06, |
|
"loss": 0.6929, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.11853448275862069, |
|
"grad_norm": 2.6652685941669265, |
|
"learning_rate": 9.989776100206547e-06, |
|
"loss": 0.6898, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.12122844827586207, |
|
"grad_norm": 2.7660230194471107, |
|
"learning_rate": 9.986549413785323e-06, |
|
"loss": 0.695, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.12392241379310345, |
|
"grad_norm": 2.5553942202252466, |
|
"learning_rate": 9.982881559634946e-06, |
|
"loss": 0.7017, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.12661637931034483, |
|
"grad_norm": 2.5245345530966192, |
|
"learning_rate": 9.978772862256145e-06, |
|
"loss": 0.6916, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.12931034482758622, |
|
"grad_norm": 2.520167957976126, |
|
"learning_rate": 9.97422368515172e-06, |
|
"loss": 0.694, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1320043103448276, |
|
"grad_norm": 2.7125840301494706, |
|
"learning_rate": 9.969234430794395e-06, |
|
"loss": 0.6887, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.13469827586206898, |
|
"grad_norm": 2.631424447595556, |
|
"learning_rate": 9.96380554059121e-06, |
|
"loss": 0.685, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.13739224137931033, |
|
"grad_norm": 2.555021040773695, |
|
"learning_rate": 9.957937494844472e-06, |
|
"loss": 0.7004, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.1400862068965517, |
|
"grad_norm": 2.539978410855113, |
|
"learning_rate": 9.951630812709245e-06, |
|
"loss": 0.6897, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1427801724137931, |
|
"grad_norm": 2.7494174109330842, |
|
"learning_rate": 9.944886052147445e-06, |
|
"loss": 0.6928, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.14547413793103448, |
|
"grad_norm": 2.559956756758314, |
|
"learning_rate": 9.937703809878455e-06, |
|
"loss": 0.6813, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.14816810344827586, |
|
"grad_norm": 2.525562445581053, |
|
"learning_rate": 9.930084721326342e-06, |
|
"loss": 0.6944, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.15086206896551724, |
|
"grad_norm": 2.777619881263396, |
|
"learning_rate": 9.92202946056364e-06, |
|
"loss": 0.6745, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.15355603448275862, |
|
"grad_norm": 2.4859789362282076, |
|
"learning_rate": 9.913538740251711e-06, |
|
"loss": 0.6527, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 2.4614571056065624, |
|
"learning_rate": 9.904613311577696e-06, |
|
"loss": 0.6673, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.15894396551724138, |
|
"grad_norm": 2.503690727361147, |
|
"learning_rate": 9.895253964188056e-06, |
|
"loss": 0.6601, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.16163793103448276, |
|
"grad_norm": 2.61491684131174, |
|
"learning_rate": 9.885461526118713e-06, |
|
"loss": 0.6629, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.16433189655172414, |
|
"grad_norm": 2.563289578189323, |
|
"learning_rate": 9.875236863721788e-06, |
|
"loss": 0.6834, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.16702586206896552, |
|
"grad_norm": 2.542961491155676, |
|
"learning_rate": 9.864580881588958e-06, |
|
"loss": 0.6634, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1697198275862069, |
|
"grad_norm": 2.5998608415854774, |
|
"learning_rate": 9.853494522471423e-06, |
|
"loss": 0.6564, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 2.580998138867243, |
|
"learning_rate": 9.841978767196495e-06, |
|
"loss": 0.6522, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.17510775862068967, |
|
"grad_norm": 2.462022076166109, |
|
"learning_rate": 9.830034634580833e-06, |
|
"loss": 0.6575, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.17780172413793102, |
|
"grad_norm": 2.641866987114795, |
|
"learning_rate": 9.8176631813403e-06, |
|
"loss": 0.6654, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1804956896551724, |
|
"grad_norm": 2.483224928563204, |
|
"learning_rate": 9.804865501996472e-06, |
|
"loss": 0.6687, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.18318965517241378, |
|
"grad_norm": 2.6158710388060755, |
|
"learning_rate": 9.79164272877981e-06, |
|
"loss": 0.6606, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.18588362068965517, |
|
"grad_norm": 2.6690109052148396, |
|
"learning_rate": 9.777996031529486e-06, |
|
"loss": 0.6587, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.18857758620689655, |
|
"grad_norm": 2.5145797557443403, |
|
"learning_rate": 9.763926617589883e-06, |
|
"loss": 0.6455, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.19127155172413793, |
|
"grad_norm": 2.34228188842774, |
|
"learning_rate": 9.749435731703786e-06, |
|
"loss": 0.6467, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.1939655172413793, |
|
"grad_norm": 2.518236951767628, |
|
"learning_rate": 9.734524655902253e-06, |
|
"loss": 0.6651, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1966594827586207, |
|
"grad_norm": 2.3327366524820423, |
|
"learning_rate": 9.719194709391191e-06, |
|
"loss": 0.6527, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.19935344827586207, |
|
"grad_norm": 2.6721928236725425, |
|
"learning_rate": 9.70344724843465e-06, |
|
"loss": 0.6471, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.20204741379310345, |
|
"grad_norm": 2.512497207087126, |
|
"learning_rate": 9.687283666234823e-06, |
|
"loss": 0.6345, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.20474137931034483, |
|
"grad_norm": 2.5381248307269106, |
|
"learning_rate": 9.670705392808796e-06, |
|
"loss": 0.6549, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.20743534482758622, |
|
"grad_norm": 2.489609282435604, |
|
"learning_rate": 9.653713894862024e-06, |
|
"loss": 0.6287, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2101293103448276, |
|
"grad_norm": 2.4187969624820767, |
|
"learning_rate": 9.63631067565858e-06, |
|
"loss": 0.6372, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.21282327586206898, |
|
"grad_norm": 2.378128543534024, |
|
"learning_rate": 9.618497274888147e-06, |
|
"loss": 0.6344, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.21551724137931033, |
|
"grad_norm": 2.3554799136699383, |
|
"learning_rate": 9.600275268529809e-06, |
|
"loss": 0.632, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2182112068965517, |
|
"grad_norm": 2.9669359679831437, |
|
"learning_rate": 9.58164626871261e-06, |
|
"loss": 0.6409, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.2209051724137931, |
|
"grad_norm": 2.510424077340063, |
|
"learning_rate": 9.562611923572944e-06, |
|
"loss": 0.6316, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.22359913793103448, |
|
"grad_norm": 2.5067266793187843, |
|
"learning_rate": 9.543173917108725e-06, |
|
"loss": 0.6337, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.22629310344827586, |
|
"grad_norm": 2.4014165442615627, |
|
"learning_rate": 9.523333969030413e-06, |
|
"loss": 0.6285, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.22898706896551724, |
|
"grad_norm": 2.5503305669006266, |
|
"learning_rate": 9.503093834608856e-06, |
|
"loss": 0.6297, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.23168103448275862, |
|
"grad_norm": 2.683370610867663, |
|
"learning_rate": 9.482455304520013e-06, |
|
"loss": 0.6222, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.234375, |
|
"grad_norm": 2.3415254501274156, |
|
"learning_rate": 9.46142020468652e-06, |
|
"loss": 0.6181, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.23706896551724138, |
|
"grad_norm": 2.4296203317167513, |
|
"learning_rate": 9.439990396116149e-06, |
|
"loss": 0.6191, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.23976293103448276, |
|
"grad_norm": 2.4277540188724833, |
|
"learning_rate": 9.418167774737173e-06, |
|
"loss": 0.6218, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.24245689655172414, |
|
"grad_norm": 2.594904022170311, |
|
"learning_rate": 9.395954271230606e-06, |
|
"loss": 0.622, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.24515086206896552, |
|
"grad_norm": 2.347098862192039, |
|
"learning_rate": 9.373351850859417e-06, |
|
"loss": 0.6136, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.2478448275862069, |
|
"grad_norm": 2.3928008650888204, |
|
"learning_rate": 9.350362513294652e-06, |
|
"loss": 0.6272, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2505387931034483, |
|
"grad_norm": 2.335542398750826, |
|
"learning_rate": 9.326988292438514e-06, |
|
"loss": 0.6245, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.25323275862068967, |
|
"grad_norm": 2.3458410101982174, |
|
"learning_rate": 9.30323125624443e-06, |
|
"loss": 0.6176, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.25592672413793105, |
|
"grad_norm": 2.5491037378725188, |
|
"learning_rate": 9.279093506534085e-06, |
|
"loss": 0.6039, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.25862068965517243, |
|
"grad_norm": 2.35768113596503, |
|
"learning_rate": 9.254577178811482e-06, |
|
"loss": 0.6062, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2613146551724138, |
|
"grad_norm": 2.4427975704018072, |
|
"learning_rate": 9.229684442074005e-06, |
|
"loss": 0.6038, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.2640086206896552, |
|
"grad_norm": 2.3518303928123183, |
|
"learning_rate": 9.204417498620521e-06, |
|
"loss": 0.6071, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2667025862068966, |
|
"grad_norm": 2.3978894249163285, |
|
"learning_rate": 9.178778583856552e-06, |
|
"loss": 0.6024, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.26939655172413796, |
|
"grad_norm": 2.530047013657598, |
|
"learning_rate": 9.152769966096483e-06, |
|
"loss": 0.6028, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.27209051724137934, |
|
"grad_norm": 2.4123317555719708, |
|
"learning_rate": 9.126393946362906e-06, |
|
"loss": 0.6083, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.27478448275862066, |
|
"grad_norm": 2.4793056830777753, |
|
"learning_rate": 9.099652858183027e-06, |
|
"loss": 0.6051, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.27747844827586204, |
|
"grad_norm": 2.372688897527012, |
|
"learning_rate": 9.072549067382225e-06, |
|
"loss": 0.6157, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.2801724137931034, |
|
"grad_norm": 2.380240348074666, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.6073, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2828663793103448, |
|
"grad_norm": 2.545807161286919, |
|
"learning_rate": 9.017263001451518e-06, |
|
"loss": 0.5884, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.2855603448275862, |
|
"grad_norm": 2.5935659051260824, |
|
"learning_rate": 8.989085617565261e-06, |
|
"loss": 0.5983, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.28825431034482757, |
|
"grad_norm": 2.2548884783469836, |
|
"learning_rate": 8.960555313112646e-06, |
|
"loss": 0.5895, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.29094827586206895, |
|
"grad_norm": 2.3534621434136533, |
|
"learning_rate": 8.93167461221378e-06, |
|
"loss": 0.5914, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.29364224137931033, |
|
"grad_norm": 2.5336260688373495, |
|
"learning_rate": 8.902446069988878e-06, |
|
"loss": 0.5939, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.2963362068965517, |
|
"grad_norm": 2.624683890197873, |
|
"learning_rate": 8.87287227233222e-06, |
|
"loss": 0.5836, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2990301724137931, |
|
"grad_norm": 2.3588318708883604, |
|
"learning_rate": 8.842955835683368e-06, |
|
"loss": 0.5786, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3017241379310345, |
|
"grad_norm": 2.501675897313923, |
|
"learning_rate": 8.812699406795683e-06, |
|
"loss": 0.5799, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.30441810344827586, |
|
"grad_norm": 2.6078839400922424, |
|
"learning_rate": 8.78210566250216e-06, |
|
"loss": 0.5801, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.30711206896551724, |
|
"grad_norm": 2.3496389383543135, |
|
"learning_rate": 8.751177309478618e-06, |
|
"loss": 0.5756, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3098060344827586, |
|
"grad_norm": 2.3002443057548727, |
|
"learning_rate": 8.71991708400422e-06, |
|
"loss": 0.5823, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 2.368311996486066, |
|
"learning_rate": 8.688327751719403e-06, |
|
"loss": 0.57, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.3151939655172414, |
|
"grad_norm": 2.316476591326147, |
|
"learning_rate": 8.656412107381187e-06, |
|
"loss": 0.572, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.31788793103448276, |
|
"grad_norm": 2.648056237571166, |
|
"learning_rate": 8.624172974615926e-06, |
|
"loss": 0.5759, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.32058189655172414, |
|
"grad_norm": 2.5273275022283035, |
|
"learning_rate": 8.591613205669494e-06, |
|
"loss": 0.5751, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.3232758620689655, |
|
"grad_norm": 2.3674743965920433, |
|
"learning_rate": 8.558735681154944e-06, |
|
"loss": 0.5525, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3259698275862069, |
|
"grad_norm": 2.334754085556647, |
|
"learning_rate": 8.525543309797653e-06, |
|
"loss": 0.5501, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.3286637931034483, |
|
"grad_norm": 2.511690588702945, |
|
"learning_rate": 8.492039028177985e-06, |
|
"loss": 0.5703, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.33135775862068967, |
|
"grad_norm": 2.41344799771138, |
|
"learning_rate": 8.458225800471492e-06, |
|
"loss": 0.5674, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.33405172413793105, |
|
"grad_norm": 2.274991518802859, |
|
"learning_rate": 8.424106618186653e-06, |
|
"loss": 0.568, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.33674568965517243, |
|
"grad_norm": 2.2914893865907375, |
|
"learning_rate": 8.389684499900231e-06, |
|
"loss": 0.5578, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.3394396551724138, |
|
"grad_norm": 2.2271331744770175, |
|
"learning_rate": 8.354962490990202e-06, |
|
"loss": 0.554, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3421336206896552, |
|
"grad_norm": 2.346436964348071, |
|
"learning_rate": 8.319943663366325e-06, |
|
"loss": 0.5623, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 2.2365182629879707, |
|
"learning_rate": 8.284631115198371e-06, |
|
"loss": 0.5534, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.34752155172413796, |
|
"grad_norm": 2.461241222937466, |
|
"learning_rate": 8.24902797064203e-06, |
|
"loss": 0.5564, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.35021551724137934, |
|
"grad_norm": 2.442140982131872, |
|
"learning_rate": 8.213137379562486e-06, |
|
"loss": 0.5506, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.35290948275862066, |
|
"grad_norm": 2.388325267487531, |
|
"learning_rate": 8.176962517255776e-06, |
|
"loss": 0.5531, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.35560344827586204, |
|
"grad_norm": 2.398524248781268, |
|
"learning_rate": 8.140506584167845e-06, |
|
"loss": 0.5415, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3582974137931034, |
|
"grad_norm": 2.566763693618945, |
|
"learning_rate": 8.103772805611403e-06, |
|
"loss": 0.5616, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.3609913793103448, |
|
"grad_norm": 2.3106768834034805, |
|
"learning_rate": 8.066764431480584e-06, |
|
"loss": 0.5328, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.3636853448275862, |
|
"grad_norm": 2.2940366514378425, |
|
"learning_rate": 8.029484735963409e-06, |
|
"loss": 0.5452, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.36637931034482757, |
|
"grad_norm": 2.4096028111246652, |
|
"learning_rate": 7.991937017252127e-06, |
|
"loss": 0.5448, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.36907327586206895, |
|
"grad_norm": 2.450510234216877, |
|
"learning_rate": 7.95412459725141e-06, |
|
"loss": 0.5407, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.37176724137931033, |
|
"grad_norm": 2.498635611862816, |
|
"learning_rate": 7.916050821284462e-06, |
|
"loss": 0.536, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.3744612068965517, |
|
"grad_norm": 2.3384557737181306, |
|
"learning_rate": 7.877719057797055e-06, |
|
"loss": 0.5404, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.3771551724137931, |
|
"grad_norm": 2.395634299723523, |
|
"learning_rate": 7.839132698059515e-06, |
|
"loss": 0.5469, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3798491379310345, |
|
"grad_norm": 2.528299315994187, |
|
"learning_rate": 7.800295155866688e-06, |
|
"loss": 0.5272, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.38254310344827586, |
|
"grad_norm": 2.383516192036904, |
|
"learning_rate": 7.761209867235924e-06, |
|
"loss": 0.5495, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.38523706896551724, |
|
"grad_norm": 2.3221638101603954, |
|
"learning_rate": 7.721880290103082e-06, |
|
"loss": 0.5517, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.3879310344827586, |
|
"grad_norm": 2.451275702370551, |
|
"learning_rate": 7.6823099040166e-06, |
|
"loss": 0.5195, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.390625, |
|
"grad_norm": 2.469988525493039, |
|
"learning_rate": 7.64250220982966e-06, |
|
"loss": 0.5151, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.3933189655172414, |
|
"grad_norm": 2.4698654498618016, |
|
"learning_rate": 7.602460729390455e-06, |
|
"loss": 0.5296, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.39601293103448276, |
|
"grad_norm": 2.433689149450146, |
|
"learning_rate": 7.562189005230609e-06, |
|
"loss": 0.5122, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.39870689655172414, |
|
"grad_norm": 2.317764828643439, |
|
"learning_rate": 7.521690600251765e-06, |
|
"loss": 0.5389, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.4014008620689655, |
|
"grad_norm": 2.3785211168925997, |
|
"learning_rate": 7.480969097410369e-06, |
|
"loss": 0.5342, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.4040948275862069, |
|
"grad_norm": 2.352268614869421, |
|
"learning_rate": 7.4400280994006765e-06, |
|
"loss": 0.5222, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4067887931034483, |
|
"grad_norm": 2.3334817294609844, |
|
"learning_rate": 7.398871228336022e-06, |
|
"loss": 0.5148, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.40948275862068967, |
|
"grad_norm": 2.2180745679186513, |
|
"learning_rate": 7.357502125428359e-06, |
|
"loss": 0.5269, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.41217672413793105, |
|
"grad_norm": 2.4024098190438448, |
|
"learning_rate": 7.315924450666129e-06, |
|
"loss": 0.5252, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.41487068965517243, |
|
"grad_norm": 2.4847050155908326, |
|
"learning_rate": 7.274141882490435e-06, |
|
"loss": 0.5215, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4175646551724138, |
|
"grad_norm": 2.3489603723016423, |
|
"learning_rate": 7.23215811746963e-06, |
|
"loss": 0.5331, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.4202586206896552, |
|
"grad_norm": 2.3846378852084276, |
|
"learning_rate": 7.189976869972249e-06, |
|
"loss": 0.526, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4229525862068966, |
|
"grad_norm": 2.2721960920466087, |
|
"learning_rate": 7.147601871838419e-06, |
|
"loss": 0.5111, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.42564655172413796, |
|
"grad_norm": 2.242972711736404, |
|
"learning_rate": 7.105036872049676e-06, |
|
"loss": 0.5079, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.42834051724137934, |
|
"grad_norm": 2.5168627834860944, |
|
"learning_rate": 7.0622856363973e-06, |
|
"loss": 0.5037, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.43103448275862066, |
|
"grad_norm": 2.3034024680284797, |
|
"learning_rate": 7.019351947149149e-06, |
|
"loss": 0.5037, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.43372844827586204, |
|
"grad_norm": 2.3169182311354204, |
|
"learning_rate": 6.976239602715025e-06, |
|
"loss": 0.5244, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.4364224137931034, |
|
"grad_norm": 2.342523099764779, |
|
"learning_rate": 6.932952417310634e-06, |
|
"loss": 0.4955, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.4391163793103448, |
|
"grad_norm": 2.4079674615936213, |
|
"learning_rate": 6.889494220620135e-06, |
|
"loss": 0.5039, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.4418103448275862, |
|
"grad_norm": 2.2705187143965704, |
|
"learning_rate": 6.8458688574573164e-06, |
|
"loss": 0.4921, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.44450431034482757, |
|
"grad_norm": 2.3040634798061053, |
|
"learning_rate": 6.8020801874254425e-06, |
|
"loss": 0.4952, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.44719827586206895, |
|
"grad_norm": 2.283780585980132, |
|
"learning_rate": 6.758132084575791e-06, |
|
"loss": 0.5204, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.44989224137931033, |
|
"grad_norm": 2.2311658006536175, |
|
"learning_rate": 6.7140284370649015e-06, |
|
"loss": 0.5062, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.4525862068965517, |
|
"grad_norm": 2.381000659447914, |
|
"learning_rate": 6.6697731468105985e-06, |
|
"loss": 0.5054, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.4552801724137931, |
|
"grad_norm": 2.5645822620698295, |
|
"learning_rate": 6.625370129146771e-06, |
|
"loss": 0.4967, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.4579741379310345, |
|
"grad_norm": 2.518018472550615, |
|
"learning_rate": 6.580823312476976e-06, |
|
"loss": 0.5057, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.46066810344827586, |
|
"grad_norm": 2.3310109009449937, |
|
"learning_rate": 6.536136637926898e-06, |
|
"loss": 0.4923, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.46336206896551724, |
|
"grad_norm": 2.4572949530360235, |
|
"learning_rate": 6.491314058995653e-06, |
|
"loss": 0.4923, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4660560344827586, |
|
"grad_norm": 2.333469399501826, |
|
"learning_rate": 6.446359541206042e-06, |
|
"loss": 0.4984, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 2.3170414009513287, |
|
"learning_rate": 6.401277061753689e-06, |
|
"loss": 0.4805, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.4714439655172414, |
|
"grad_norm": 2.3105233267502068, |
|
"learning_rate": 6.356070609155188e-06, |
|
"loss": 0.4857, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.47413793103448276, |
|
"grad_norm": 2.406900488225167, |
|
"learning_rate": 6.310744182895231e-06, |
|
"loss": 0.474, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.47683189655172414, |
|
"grad_norm": 2.3233269304186246, |
|
"learning_rate": 6.265301793072762e-06, |
|
"loss": 0.4947, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.4795258620689655, |
|
"grad_norm": 2.336797328678939, |
|
"learning_rate": 6.219747460046203e-06, |
|
"loss": 0.4771, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.4822198275862069, |
|
"grad_norm": 2.3058756900360566, |
|
"learning_rate": 6.17408521407776e-06, |
|
"loss": 0.4791, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.4849137931034483, |
|
"grad_norm": 2.467884893673803, |
|
"learning_rate": 6.128319094976869e-06, |
|
"loss": 0.492, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.48760775862068967, |
|
"grad_norm": 2.3280199883273047, |
|
"learning_rate": 6.0824531517427765e-06, |
|
"loss": 0.4816, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.49030172413793105, |
|
"grad_norm": 2.2642826853033053, |
|
"learning_rate": 6.03649144220633e-06, |
|
"loss": 0.4805, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.49299568965517243, |
|
"grad_norm": 2.2845546468033007, |
|
"learning_rate": 5.990438032670968e-06, |
|
"loss": 0.4804, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.4956896551724138, |
|
"grad_norm": 2.320099011292584, |
|
"learning_rate": 5.944296997552968e-06, |
|
"loss": 0.4807, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4983836206896552, |
|
"grad_norm": 2.4032671750639607, |
|
"learning_rate": 5.898072419020978e-06, |
|
"loss": 0.479, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.5010775862068966, |
|
"grad_norm": 2.3454490179654948, |
|
"learning_rate": 5.851768386634863e-06, |
|
"loss": 0.4657, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5037715517241379, |
|
"grad_norm": 2.2272370976346707, |
|
"learning_rate": 5.805388996983891e-06, |
|
"loss": 0.4778, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.5064655172413793, |
|
"grad_norm": 2.399429478516486, |
|
"learning_rate": 5.758938353324308e-06, |
|
"loss": 0.4766, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5091594827586207, |
|
"grad_norm": 2.2479225788941726, |
|
"learning_rate": 5.712420565216305e-06, |
|
"loss": 0.4689, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.5118534482758621, |
|
"grad_norm": 2.333910684063406, |
|
"learning_rate": 5.66583974816045e-06, |
|
"loss": 0.4689, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5145474137931034, |
|
"grad_norm": 2.494414220923278, |
|
"learning_rate": 5.619200023233582e-06, |
|
"loss": 0.4654, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 2.4303474928270314, |
|
"learning_rate": 5.572505516724207e-06, |
|
"loss": 0.4841, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5199353448275862, |
|
"grad_norm": 2.3290300558522605, |
|
"learning_rate": 5.52576035976744e-06, |
|
"loss": 0.4631, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.5226293103448276, |
|
"grad_norm": 2.303763077645539, |
|
"learning_rate": 5.478968687979527e-06, |
|
"loss": 0.4535, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.525323275862069, |
|
"grad_norm": 2.3158015015015367, |
|
"learning_rate": 5.432134641091945e-06, |
|
"loss": 0.4653, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.5280172413793104, |
|
"grad_norm": 2.412268625727716, |
|
"learning_rate": 5.3852623625851655e-06, |
|
"loss": 0.4553, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5307112068965517, |
|
"grad_norm": 2.4152646593142477, |
|
"learning_rate": 5.338355999322069e-06, |
|
"loss": 0.459, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.5334051724137931, |
|
"grad_norm": 2.3009383932051186, |
|
"learning_rate": 5.291419701181069e-06, |
|
"loss": 0.4574, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5360991379310345, |
|
"grad_norm": 2.3404820672273683, |
|
"learning_rate": 5.244457620688962e-06, |
|
"loss": 0.4457, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.5387931034482759, |
|
"grad_norm": 2.2918401803413277, |
|
"learning_rate": 5.197473912653549e-06, |
|
"loss": 0.4625, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5414870689655172, |
|
"grad_norm": 2.330307145203118, |
|
"learning_rate": 5.150472733796053e-06, |
|
"loss": 0.4614, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.5441810344827587, |
|
"grad_norm": 2.317228108453964, |
|
"learning_rate": 5.103458242383371e-06, |
|
"loss": 0.4346, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.546875, |
|
"grad_norm": 2.246449210384358, |
|
"learning_rate": 5.056434597860176e-06, |
|
"loss": 0.4332, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.5495689655172413, |
|
"grad_norm": 2.2315633880832917, |
|
"learning_rate": 5.009405960480937e-06, |
|
"loss": 0.4374, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5522629310344828, |
|
"grad_norm": 2.236917389881302, |
|
"learning_rate": 4.962376490941846e-06, |
|
"loss": 0.4443, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.5549568965517241, |
|
"grad_norm": 2.2257101057521953, |
|
"learning_rate": 4.915350350012714e-06, |
|
"loss": 0.4485, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5576508620689655, |
|
"grad_norm": 2.2768475081245696, |
|
"learning_rate": 4.868331698168875e-06, |
|
"loss": 0.456, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.5603448275862069, |
|
"grad_norm": 2.2588873812858243, |
|
"learning_rate": 4.82132469522308e-06, |
|
"loss": 0.4531, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.5630387931034483, |
|
"grad_norm": 2.2517674521156414, |
|
"learning_rate": 4.774333499957488e-06, |
|
"loss": 0.4439, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.5657327586206896, |
|
"grad_norm": 2.3879681903493277, |
|
"learning_rate": 4.727362269755736e-06, |
|
"loss": 0.4507, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.568426724137931, |
|
"grad_norm": 2.2168932530530654, |
|
"learning_rate": 4.68041516023511e-06, |
|
"loss": 0.4436, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.5711206896551724, |
|
"grad_norm": 2.328909950607463, |
|
"learning_rate": 4.633496324878906e-06, |
|
"loss": 0.4408, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5738146551724138, |
|
"grad_norm": 2.2564887174276183, |
|
"learning_rate": 4.586609914668963e-06, |
|
"loss": 0.4516, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.5765086206896551, |
|
"grad_norm": 2.2979177074885424, |
|
"learning_rate": 4.539760077718416e-06, |
|
"loss": 0.4389, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5792025862068966, |
|
"grad_norm": 2.2933960847054515, |
|
"learning_rate": 4.492950958904707e-06, |
|
"loss": 0.4266, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.5818965517241379, |
|
"grad_norm": 2.2594325799250594, |
|
"learning_rate": 4.4461866995028776e-06, |
|
"loss": 0.427, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5845905172413793, |
|
"grad_norm": 2.349659814217747, |
|
"learning_rate": 4.399471436819199e-06, |
|
"loss": 0.4346, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.5872844827586207, |
|
"grad_norm": 2.297930957947952, |
|
"learning_rate": 4.352809303825115e-06, |
|
"loss": 0.4279, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5899784482758621, |
|
"grad_norm": 2.202712644399629, |
|
"learning_rate": 4.306204428791609e-06, |
|
"loss": 0.4291, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.5926724137931034, |
|
"grad_norm": 2.2128476870439813, |
|
"learning_rate": 4.259660934923965e-06, |
|
"loss": 0.44, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5953663793103449, |
|
"grad_norm": 2.367627389505961, |
|
"learning_rate": 4.213182939996978e-06, |
|
"loss": 0.4379, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.5980603448275862, |
|
"grad_norm": 2.274117011259563, |
|
"learning_rate": 4.166774555990654e-06, |
|
"loss": 0.4344, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6007543103448276, |
|
"grad_norm": 2.2261394360036983, |
|
"learning_rate": 4.120439888726407e-06, |
|
"loss": 0.4142, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.603448275862069, |
|
"grad_norm": 2.1852891937100436, |
|
"learning_rate": 4.074183037503827e-06, |
|
"loss": 0.4266, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6061422413793104, |
|
"grad_norm": 2.3083672939605053, |
|
"learning_rate": 4.028008094737989e-06, |
|
"loss": 0.4394, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.6088362068965517, |
|
"grad_norm": 2.2610041056896963, |
|
"learning_rate": 3.981919145597404e-06, |
|
"loss": 0.4128, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6115301724137931, |
|
"grad_norm": 2.19751146715402, |
|
"learning_rate": 3.935920267642592e-06, |
|
"loss": 0.4227, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.6142241379310345, |
|
"grad_norm": 2.3415136999781963, |
|
"learning_rate": 3.890015530465342e-06, |
|
"loss": 0.4133, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.6169181034482759, |
|
"grad_norm": 2.291673599344672, |
|
"learning_rate": 3.844208995328659e-06, |
|
"loss": 0.4192, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.6196120689655172, |
|
"grad_norm": 2.2459859353779508, |
|
"learning_rate": 3.7985047148074584e-06, |
|
"loss": 0.4257, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6223060344827587, |
|
"grad_norm": 2.3753214874892072, |
|
"learning_rate": 3.75290673243004e-06, |
|
"loss": 0.421, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 2.181100394703554, |
|
"learning_rate": 3.707419082320336e-06, |
|
"loss": 0.4287, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6276939655172413, |
|
"grad_norm": 2.242465849693457, |
|
"learning_rate": 3.6620457888410143e-06, |
|
"loss": 0.4143, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.6303879310344828, |
|
"grad_norm": 2.3646959150338813, |
|
"learning_rate": 3.616790866237433e-06, |
|
"loss": 0.4045, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6330818965517241, |
|
"grad_norm": 2.312802724452316, |
|
"learning_rate": 3.5716583182825023e-06, |
|
"loss": 0.4248, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.6357758620689655, |
|
"grad_norm": 2.208443511882899, |
|
"learning_rate": 3.5266521379224506e-06, |
|
"loss": 0.4135, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.6384698275862069, |
|
"grad_norm": 2.2774985396607046, |
|
"learning_rate": 3.4817763069235747e-06, |
|
"loss": 0.4028, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.6411637931034483, |
|
"grad_norm": 2.3080269121559898, |
|
"learning_rate": 3.4370347955199634e-06, |
|
"loss": 0.4086, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.6438577586206896, |
|
"grad_norm": 2.3130128907712355, |
|
"learning_rate": 3.392431562062238e-06, |
|
"loss": 0.408, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.646551724137931, |
|
"grad_norm": 2.2776700595089676, |
|
"learning_rate": 3.347970552667361e-06, |
|
"loss": 0.4159, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6492456896551724, |
|
"grad_norm": 2.1524296489308576, |
|
"learning_rate": 3.303655700869507e-06, |
|
"loss": 0.4035, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.6519396551724138, |
|
"grad_norm": 2.2146294105038185, |
|
"learning_rate": 3.259490927272071e-06, |
|
"loss": 0.4012, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.6546336206896551, |
|
"grad_norm": 2.2480654104489752, |
|
"learning_rate": 3.2154801392007883e-06, |
|
"loss": 0.4153, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.6573275862068966, |
|
"grad_norm": 2.169871400965887, |
|
"learning_rate": 3.171627230358063e-06, |
|
"loss": 0.404, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.6600215517241379, |
|
"grad_norm": 2.4015866937415056, |
|
"learning_rate": 3.1279360804784785e-06, |
|
"loss": 0.4063, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.6627155172413793, |
|
"grad_norm": 2.3038799378482557, |
|
"learning_rate": 3.084410554985553e-06, |
|
"loss": 0.3898, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.6654094827586207, |
|
"grad_norm": 2.198625588166285, |
|
"learning_rate": 3.0410545046497553e-06, |
|
"loss": 0.4035, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.6681034482758621, |
|
"grad_norm": 2.1950219963512176, |
|
"learning_rate": 2.9978717652478343e-06, |
|
"loss": 0.3902, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.6707974137931034, |
|
"grad_norm": 2.247458718435766, |
|
"learning_rate": 2.954866157223445e-06, |
|
"loss": 0.4082, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.6734913793103449, |
|
"grad_norm": 2.2241261994844588, |
|
"learning_rate": 2.9120414853491574e-06, |
|
"loss": 0.404, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6761853448275862, |
|
"grad_norm": 2.1606540598223103, |
|
"learning_rate": 2.86940153838984e-06, |
|
"loss": 0.3948, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.6788793103448276, |
|
"grad_norm": 2.0718054651873437, |
|
"learning_rate": 2.826950088767469e-06, |
|
"loss": 0.3927, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.681573275862069, |
|
"grad_norm": 2.227847088159035, |
|
"learning_rate": 2.784690892227363e-06, |
|
"loss": 0.3903, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.6842672413793104, |
|
"grad_norm": 2.207892303296737, |
|
"learning_rate": 2.7426276875059145e-06, |
|
"loss": 0.3955, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.6869612068965517, |
|
"grad_norm": 2.1465153515114093, |
|
"learning_rate": 2.700764195999819e-06, |
|
"loss": 0.3788, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 2.223157201107058, |
|
"learning_rate": 2.6591041214368383e-06, |
|
"loss": 0.4053, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6923491379310345, |
|
"grad_norm": 2.392548147708553, |
|
"learning_rate": 2.6176511495481172e-06, |
|
"loss": 0.3834, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.6950431034482759, |
|
"grad_norm": 2.059476074487736, |
|
"learning_rate": 2.5764089477421067e-06, |
|
"loss": 0.3857, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6977370689655172, |
|
"grad_norm": 2.157455657651667, |
|
"learning_rate": 2.5353811647801107e-06, |
|
"loss": 0.3884, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.7004310344827587, |
|
"grad_norm": 2.307643086382308, |
|
"learning_rate": 2.4945714304534584e-06, |
|
"loss": 0.3815, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.703125, |
|
"grad_norm": 2.26315069416342, |
|
"learning_rate": 2.453983355262382e-06, |
|
"loss": 0.3865, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.7058189655172413, |
|
"grad_norm": 2.332313222729813, |
|
"learning_rate": 2.413620530096592e-06, |
|
"loss": 0.391, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.7085129310344828, |
|
"grad_norm": 2.1418117590999413, |
|
"learning_rate": 2.373486525917575e-06, |
|
"loss": 0.3912, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.7112068965517241, |
|
"grad_norm": 2.178180423311831, |
|
"learning_rate": 2.333584893442675e-06, |
|
"loss": 0.3854, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7139008620689655, |
|
"grad_norm": 2.151591142836586, |
|
"learning_rate": 2.2939191628309482e-06, |
|
"loss": 0.3815, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.7165948275862069, |
|
"grad_norm": 2.1488408048158916, |
|
"learning_rate": 2.254492843370857e-06, |
|
"loss": 0.3741, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7192887931034483, |
|
"grad_norm": 2.3225770656541624, |
|
"learning_rate": 2.2153094231697807e-06, |
|
"loss": 0.3865, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.7219827586206896, |
|
"grad_norm": 2.225461569667121, |
|
"learning_rate": 2.1763723688454297e-06, |
|
"loss": 0.389, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.724676724137931, |
|
"grad_norm": 2.310688191216032, |
|
"learning_rate": 2.1376851252191465e-06, |
|
"loss": 0.3905, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.7273706896551724, |
|
"grad_norm": 2.206817710811153, |
|
"learning_rate": 2.09925111501113e-06, |
|
"loss": 0.3705, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7300646551724138, |
|
"grad_norm": 2.194541840528301, |
|
"learning_rate": 2.061073738537635e-06, |
|
"loss": 0.38, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.7327586206896551, |
|
"grad_norm": 2.1363777762782568, |
|
"learning_rate": 2.0231563734101245e-06, |
|
"loss": 0.3826, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7354525862068966, |
|
"grad_norm": 2.043722143372559, |
|
"learning_rate": 1.9855023742364647e-06, |
|
"loss": 0.3722, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.7381465517241379, |
|
"grad_norm": 2.296022903294665, |
|
"learning_rate": 1.9481150723241236e-06, |
|
"loss": 0.3836, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.7408405172413793, |
|
"grad_norm": 2.1320085273295333, |
|
"learning_rate": 1.9109977753854496e-06, |
|
"loss": 0.367, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.7435344827586207, |
|
"grad_norm": 2.126131429150438, |
|
"learning_rate": 1.8741537672450406e-06, |
|
"loss": 0.3756, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.7462284482758621, |
|
"grad_norm": 2.3054341669665708, |
|
"learning_rate": 1.8375863075492062e-06, |
|
"loss": 0.3737, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.7489224137931034, |
|
"grad_norm": 2.3340813640902867, |
|
"learning_rate": 1.8012986314775888e-06, |
|
"loss": 0.3694, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.7516163793103449, |
|
"grad_norm": 2.1335614766566544, |
|
"learning_rate": 1.7652939494569428e-06, |
|
"loss": 0.3706, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.7543103448275862, |
|
"grad_norm": 2.135867482259856, |
|
"learning_rate": 1.7295754468771026e-06, |
|
"loss": 0.3826, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7570043103448276, |
|
"grad_norm": 2.253239028561062, |
|
"learning_rate": 1.6941462838091643e-06, |
|
"loss": 0.3879, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.759698275862069, |
|
"grad_norm": 2.1899554008641613, |
|
"learning_rate": 1.6590095947259083e-06, |
|
"loss": 0.3657, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.7623922413793104, |
|
"grad_norm": 1.9335639886365577, |
|
"learning_rate": 1.6241684882244952e-06, |
|
"loss": 0.3647, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.7650862068965517, |
|
"grad_norm": 2.158271364922754, |
|
"learning_rate": 1.5896260467514335e-06, |
|
"loss": 0.3613, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.7677801724137931, |
|
"grad_norm": 2.283426548356461, |
|
"learning_rate": 1.5553853263298741e-06, |
|
"loss": 0.3804, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.7704741379310345, |
|
"grad_norm": 1.973245710047114, |
|
"learning_rate": 1.521449356289245e-06, |
|
"loss": 0.3616, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.7731681034482759, |
|
"grad_norm": 2.176003470736959, |
|
"learning_rate": 1.4878211389972369e-06, |
|
"loss": 0.3594, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.7758620689655172, |
|
"grad_norm": 2.350333157030792, |
|
"learning_rate": 1.454503649594176e-06, |
|
"loss": 0.3745, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7785560344827587, |
|
"grad_norm": 2.1046600168472254, |
|
"learning_rate": 1.421499835729812e-06, |
|
"loss": 0.3614, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 2.2403959550973376, |
|
"learning_rate": 1.3888126173025412e-06, |
|
"loss": 0.3667, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7839439655172413, |
|
"grad_norm": 2.2036204076799244, |
|
"learning_rate": 1.3564448862010653e-06, |
|
"loss": 0.3719, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.7866379310344828, |
|
"grad_norm": 2.1004023468667223, |
|
"learning_rate": 1.3243995060485537e-06, |
|
"loss": 0.3609, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7893318965517241, |
|
"grad_norm": 2.049485866619644, |
|
"learning_rate": 1.2926793119492848e-06, |
|
"loss": 0.3562, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.7920258620689655, |
|
"grad_norm": 2.2562907662057015, |
|
"learning_rate": 1.2612871102378305e-06, |
|
"loss": 0.3638, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7947198275862069, |
|
"grad_norm": 2.0015131375954045, |
|
"learning_rate": 1.230225678230766e-06, |
|
"loss": 0.3523, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.7974137931034483, |
|
"grad_norm": 1.9761111123797053, |
|
"learning_rate": 1.1994977639809575e-06, |
|
"loss": 0.3605, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.8001077586206896, |
|
"grad_norm": 2.1818297029398916, |
|
"learning_rate": 1.169106086034446e-06, |
|
"loss": 0.369, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.802801724137931, |
|
"grad_norm": 2.2176123875649782, |
|
"learning_rate": 1.1390533331899235e-06, |
|
"loss": 0.359, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.8054956896551724, |
|
"grad_norm": 2.1415950875401952, |
|
"learning_rate": 1.109342164260853e-06, |
|
"loss": 0.365, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.8081896551724138, |
|
"grad_norm": 1.9579230862394106, |
|
"learning_rate": 1.079975207840247e-06, |
|
"loss": 0.3475, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8108836206896551, |
|
"grad_norm": 1.9891326864430916, |
|
"learning_rate": 1.050955062068098e-06, |
|
"loss": 0.3636, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.8135775862068966, |
|
"grad_norm": 2.1589113372475826, |
|
"learning_rate": 1.0222842944015327e-06, |
|
"loss": 0.3637, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.8162715517241379, |
|
"grad_norm": 2.2093770653678817, |
|
"learning_rate": 9.939654413876493e-07, |
|
"loss": 0.3704, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.8189655172413793, |
|
"grad_norm": 2.117779906161616, |
|
"learning_rate": 9.660010084391197e-07, |
|
"loss": 0.3549, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.8216594827586207, |
|
"grad_norm": 2.2081164429406623, |
|
"learning_rate": 9.383934696125213e-07, |
|
"loss": 0.3637, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.8243534482758621, |
|
"grad_norm": 2.0797066327192915, |
|
"learning_rate": 9.111452673894589e-07, |
|
"loss": 0.355, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.8270474137931034, |
|
"grad_norm": 1.9884207565802496, |
|
"learning_rate": 8.842588124604695e-07, |
|
"loss": 0.3598, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.8297413793103449, |
|
"grad_norm": 1.9966503677289194, |
|
"learning_rate": 8.577364835117552e-07, |
|
"loss": 0.3503, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.8324353448275862, |
|
"grad_norm": 2.0974426601893006, |
|
"learning_rate": 8.315806270147237e-07, |
|
"loss": 0.3513, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.8351293103448276, |
|
"grad_norm": 2.0409953572157264, |
|
"learning_rate": 8.057935570184e-07, |
|
"loss": 0.353, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.837823275862069, |
|
"grad_norm": 2.05994767546201, |
|
"learning_rate": 7.803775549447017e-07, |
|
"loss": 0.3612, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.8405172413793104, |
|
"grad_norm": 1.9798689534701572, |
|
"learning_rate": 7.553348693865897e-07, |
|
"loss": 0.3433, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.8432112068965517, |
|
"grad_norm": 2.0314728151818557, |
|
"learning_rate": 7.306677159091385e-07, |
|
"loss": 0.3554, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.8459051724137931, |
|
"grad_norm": 2.1770521072409665, |
|
"learning_rate": 7.06378276853516e-07, |
|
"loss": 0.3434, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.8485991379310345, |
|
"grad_norm": 3.199094357987707, |
|
"learning_rate": 6.824687011439168e-07, |
|
"loss": 0.3555, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.8512931034482759, |
|
"grad_norm": 2.0350410942770267, |
|
"learning_rate": 6.589411040974369e-07, |
|
"loss": 0.3455, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.8539870689655172, |
|
"grad_norm": 2.0106939788979994, |
|
"learning_rate": 6.35797567236926e-07, |
|
"loss": 0.342, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.8566810344827587, |
|
"grad_norm": 2.0462922997663333, |
|
"learning_rate": 6.130401381068424e-07, |
|
"loss": 0.3484, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.859375, |
|
"grad_norm": 1.9989302742973973, |
|
"learning_rate": 5.906708300920916e-07, |
|
"loss": 0.358, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 2.1421705464248997, |
|
"learning_rate": 5.686916222399069e-07, |
|
"loss": 0.3479, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8647629310344828, |
|
"grad_norm": 1.8665911668349293, |
|
"learning_rate": 5.471044590847569e-07, |
|
"loss": 0.3485, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.8674568965517241, |
|
"grad_norm": 2.252328311927183, |
|
"learning_rate": 5.259112504763115e-07, |
|
"loss": 0.3537, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.8701508620689655, |
|
"grad_norm": 2.242291713625665, |
|
"learning_rate": 5.051138714104726e-07, |
|
"loss": 0.3493, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.8728448275862069, |
|
"grad_norm": 1.9256177965601142, |
|
"learning_rate": 4.847141618634899e-07, |
|
"loss": 0.346, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.8755387931034483, |
|
"grad_norm": 2.0978920858884806, |
|
"learning_rate": 4.647139266291789e-07, |
|
"loss": 0.3447, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.8782327586206896, |
|
"grad_norm": 2.1438665447656424, |
|
"learning_rate": 4.4511493515924373e-07, |
|
"loss": 0.3467, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.880926724137931, |
|
"grad_norm": 1.943275187391926, |
|
"learning_rate": 4.2591892140673383e-07, |
|
"loss": 0.359, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.8836206896551724, |
|
"grad_norm": 1.9691693184683765, |
|
"learning_rate": 4.0712758367263573e-07, |
|
"loss": 0.3453, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.8863146551724138, |
|
"grad_norm": 2.2550989234096432, |
|
"learning_rate": 3.8874258445562694e-07, |
|
"loss": 0.354, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.8890086206896551, |
|
"grad_norm": 1.9743645882114702, |
|
"learning_rate": 3.7076555030498505e-07, |
|
"loss": 0.3545, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8917025862068966, |
|
"grad_norm": 2.069394313953148, |
|
"learning_rate": 3.531980716766914e-07, |
|
"loss": 0.3465, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.8943965517241379, |
|
"grad_norm": 2.084992853821571, |
|
"learning_rate": 3.3604170279271375e-07, |
|
"loss": 0.347, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.8970905172413793, |
|
"grad_norm": 2.028486932834069, |
|
"learning_rate": 3.1929796150351076e-07, |
|
"loss": 0.3385, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.8997844827586207, |
|
"grad_norm": 1.9042104552013777, |
|
"learning_rate": 3.02968329153735e-07, |
|
"loss": 0.3456, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9024784482758621, |
|
"grad_norm": 2.138202184025318, |
|
"learning_rate": 2.870542504511864e-07, |
|
"loss": 0.3524, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.9051724137931034, |
|
"grad_norm": 2.0791032572613615, |
|
"learning_rate": 2.7155713333898826e-07, |
|
"loss": 0.3557, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.9078663793103449, |
|
"grad_norm": 2.032552582124559, |
|
"learning_rate": 2.564783488710293e-07, |
|
"loss": 0.3472, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.9105603448275862, |
|
"grad_norm": 2.0702198858374063, |
|
"learning_rate": 2.4181923109066254e-07, |
|
"loss": 0.3423, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.9132543103448276, |
|
"grad_norm": 2.223955152789369, |
|
"learning_rate": 2.2758107691268294e-07, |
|
"loss": 0.353, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.915948275862069, |
|
"grad_norm": 2.151000423198189, |
|
"learning_rate": 2.1376514600858212e-07, |
|
"loss": 0.3446, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.9186422413793104, |
|
"grad_norm": 1.9722858881802758, |
|
"learning_rate": 2.003726606951084e-07, |
|
"loss": 0.3423, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.9213362068965517, |
|
"grad_norm": 2.152676598806774, |
|
"learning_rate": 1.874048058261252e-07, |
|
"loss": 0.3566, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.9240301724137931, |
|
"grad_norm": 2.14241065854355, |
|
"learning_rate": 1.7486272868778299e-07, |
|
"loss": 0.3451, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.9267241379310345, |
|
"grad_norm": 1.9240645550272026, |
|
"learning_rate": 1.62747538897019e-07, |
|
"loss": 0.3526, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.9294181034482759, |
|
"grad_norm": 1.9864527165081682, |
|
"learning_rate": 1.5106030830338791e-07, |
|
"loss": 0.3414, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.9321120689655172, |
|
"grad_norm": 1.891840587890648, |
|
"learning_rate": 1.3980207089423326e-07, |
|
"loss": 0.3507, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.9348060344827587, |
|
"grad_norm": 2.197241548310695, |
|
"learning_rate": 1.2897382270320947e-07, |
|
"loss": 0.3415, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 2.1206142876832708, |
|
"learning_rate": 1.1857652172215905e-07, |
|
"loss": 0.3453, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.9401939655172413, |
|
"grad_norm": 2.0575425778092375, |
|
"learning_rate": 1.0861108781636099e-07, |
|
"loss": 0.3414, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.9428879310344828, |
|
"grad_norm": 2.067217232750268, |
|
"learning_rate": 9.907840264314572e-08, |
|
"loss": 0.3429, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.9455818965517241, |
|
"grad_norm": 2.08954775323305, |
|
"learning_rate": 8.997930957389433e-08, |
|
"loss": 0.3406, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.9482758620689655, |
|
"grad_norm": 2.0413104358527865, |
|
"learning_rate": 8.13146136194265e-08, |
|
"loss": 0.3544, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.9509698275862069, |
|
"grad_norm": 1.9504574949587095, |
|
"learning_rate": 7.308508135877745e-08, |
|
"loss": 0.3515, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.9536637931034483, |
|
"grad_norm": 2.0325177039467266, |
|
"learning_rate": 6.52914408713784e-08, |
|
"loss": 0.3422, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.9563577586206896, |
|
"grad_norm": 2.080402951454278, |
|
"learning_rate": 5.7934381672640206e-08, |
|
"loss": 0.3302, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.959051724137931, |
|
"grad_norm": 1.9103094146698458, |
|
"learning_rate": 5.101455465295557e-08, |
|
"loss": 0.3388, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.9617456896551724, |
|
"grad_norm": 2.0461617336274665, |
|
"learning_rate": 4.453257202011008e-08, |
|
"loss": 0.3437, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.9644396551724138, |
|
"grad_norm": 1.8955751541723638, |
|
"learning_rate": 3.848900724511828e-08, |
|
"loss": 0.3448, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.9671336206896551, |
|
"grad_norm": 1.8502858059698502, |
|
"learning_rate": 3.28843950114921e-08, |
|
"loss": 0.3318, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.9698275862068966, |
|
"grad_norm": 1.9634830726403167, |
|
"learning_rate": 2.771923116793307e-08, |
|
"loss": 0.3506, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.9725215517241379, |
|
"grad_norm": 2.12551984941854, |
|
"learning_rate": 2.299397268446413e-08, |
|
"loss": 0.3425, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.9752155172413793, |
|
"grad_norm": 2.4278727464472136, |
|
"learning_rate": 1.8709037612003044e-08, |
|
"loss": 0.3471, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.9779094827586207, |
|
"grad_norm": 2.191866602098634, |
|
"learning_rate": 1.4864805045373687e-08, |
|
"loss": 0.3384, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.9806034482758621, |
|
"grad_norm": 2.128906961450063, |
|
"learning_rate": 1.1461615089770062e-08, |
|
"loss": 0.349, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.9832974137931034, |
|
"grad_norm": 2.0846719469136916, |
|
"learning_rate": 8.499768830663723e-09, |
|
"loss": 0.3357, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.9859913793103449, |
|
"grad_norm": 2.319036063763146, |
|
"learning_rate": 5.979528307168414e-09, |
|
"loss": 0.3402, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.9886853448275862, |
|
"grad_norm": 2.0237346794749858, |
|
"learning_rate": 3.901116488855827e-09, |
|
"loss": 0.3554, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.9913793103448276, |
|
"grad_norm": 2.007135214089839, |
|
"learning_rate": 2.264717256030835e-09, |
|
"loss": 0.3462, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.994073275862069, |
|
"grad_norm": 1.994084875393067, |
|
"learning_rate": 1.0704753834600567e-09, |
|
"loss": 0.3455, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.9967672413793104, |
|
"grad_norm": 2.1211709513233856, |
|
"learning_rate": 3.184965275676577e-10, |
|
"loss": 0.3438, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9994612068965517, |
|
"grad_norm": 2.0397937800653443, |
|
"learning_rate": 8.847217084495541e-12, |
|
"loss": 0.3482, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_runtime": 3.3988, |
|
"eval_samples_per_second": 2.942, |
|
"eval_steps_per_second": 0.883, |
|
"step": 1856 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 1856, |
|
"total_flos": 194304320471040.0, |
|
"train_loss": 0.50882549257949, |
|
"train_runtime": 16510.7518, |
|
"train_samples_per_second": 1.799, |
|
"train_steps_per_second": 0.112 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1856, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 194304320471040.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|