{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2413,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0004144218814753419, "grad_norm": 24.514733364424554, "learning_rate": 4.132231404958678e-08, "loss": 1.4169, "step": 1},
    {"epoch": 0.0020721094073767096, "grad_norm": 23.537184095238235, "learning_rate": 2.066115702479339e-07, "loss": 1.3953, "step": 5},
    {"epoch": 0.004144218814753419, "grad_norm": 15.367263243187544, "learning_rate": 4.132231404958678e-07, "loss": 1.3778, "step": 10},
    {"epoch": 0.006216328222130129, "grad_norm": 8.898381617118186, "learning_rate": 6.198347107438018e-07, "loss": 1.2602, "step": 15},
    {"epoch": 0.008288437629506838, "grad_norm": 10.642672694622197, "learning_rate": 8.264462809917356e-07, "loss": 1.1548, "step": 20},
    {"epoch": 0.010360547036883548, "grad_norm": 4.617567060806481, "learning_rate": 1.0330578512396695e-06, "loss": 1.0324, "step": 25},
    {"epoch": 0.012432656444260257, "grad_norm": 3.417020439965166, "learning_rate": 1.2396694214876035e-06, "loss": 0.9692, "step": 30},
    {"epoch": 0.014504765851636967, "grad_norm": 3.395692919461883, "learning_rate": 1.4462809917355372e-06, "loss": 0.9523, "step": 35},
    {"epoch": 0.016576875259013676, "grad_norm": 3.151546235627503, "learning_rate": 1.6528925619834712e-06, "loss": 0.9274, "step": 40},
    {"epoch": 0.018648984666390384, "grad_norm": 2.9582253473643485, "learning_rate": 1.859504132231405e-06, "loss": 0.9323, "step": 45},
    {"epoch": 0.020721094073767096, "grad_norm": 3.0341795582664997, "learning_rate": 2.066115702479339e-06, "loss": 0.8943, "step": 50},
    {"epoch": 0.022793203481143803, "grad_norm": 3.0341130670598164, "learning_rate": 2.2727272727272728e-06, "loss": 0.8975, "step": 55},
    {"epoch": 0.024865312888520515, "grad_norm": 3.1718207048254627, "learning_rate": 2.479338842975207e-06, "loss": 0.8814, "step": 60},
    {"epoch": 0.026937422295897222, "grad_norm": 3.0690467751730495, "learning_rate": 2.6859504132231405e-06, "loss": 0.8886, "step": 65},
    {"epoch": 0.029009531703273934, "grad_norm": 3.1173422324045084, "learning_rate": 2.8925619834710743e-06, "loss": 0.8779, "step": 70},
    {"epoch": 0.03108164111065064, "grad_norm": 3.138742895479937, "learning_rate": 3.0991735537190086e-06, "loss": 0.8896, "step": 75},
    {"epoch": 0.03315375051802735, "grad_norm": 3.172114081672577, "learning_rate": 3.3057851239669424e-06, "loss": 0.852, "step": 80},
    {"epoch": 0.035225859925404064, "grad_norm": 3.0136111662154126, "learning_rate": 3.5123966942148763e-06, "loss": 0.8718, "step": 85},
    {"epoch": 0.03729796933278077, "grad_norm": 3.258761073776976, "learning_rate": 3.71900826446281e-06, "loss": 0.8515, "step": 90},
    {"epoch": 0.03937007874015748, "grad_norm": 3.1414312111507323, "learning_rate": 3.925619834710744e-06, "loss": 0.8653, "step": 95},
    {"epoch": 0.04144218814753419, "grad_norm": 3.207392945640886, "learning_rate": 4.132231404958678e-06, "loss": 0.8688, "step": 100},
    {"epoch": 0.0435142975549109, "grad_norm": 3.2052306999586695, "learning_rate": 4.338842975206612e-06, "loss": 0.8662, "step": 105},
    {"epoch": 0.04558640696228761, "grad_norm": 3.061390265998909, "learning_rate": 4.5454545454545455e-06, "loss": 0.8397, "step": 110},
    {"epoch": 0.04765851636966432, "grad_norm": 3.0154741328483023, "learning_rate": 4.75206611570248e-06, "loss": 0.8583, "step": 115},
    {"epoch": 0.04973062577704103, "grad_norm": 2.9687539148068796, "learning_rate": 4.958677685950414e-06, "loss": 0.8542, "step": 120},
    {"epoch": 0.05180273518441774, "grad_norm": 3.0956570078052925, "learning_rate": 5.165289256198347e-06, "loss": 0.8343, "step": 125},
    {"epoch": 0.053874844591794445, "grad_norm": 2.939810124211356, "learning_rate": 5.371900826446281e-06, "loss": 0.8457, "step": 130},
    {"epoch": 0.055946953999171156, "grad_norm": 3.2352916246928203, "learning_rate": 5.578512396694216e-06, "loss": 0.8169, "step": 135},
    {"epoch": 0.05801906340654787, "grad_norm": 3.1434079053605273, "learning_rate": 5.785123966942149e-06, "loss": 0.8257, "step": 140},
    {"epoch": 0.06009117281392457, "grad_norm": 3.1532690460920803, "learning_rate": 5.991735537190083e-06, "loss": 0.8392, "step": 145},
    {"epoch": 0.06216328222130128, "grad_norm": 3.0343466546862037, "learning_rate": 6.198347107438017e-06, "loss": 0.8442, "step": 150},
    {"epoch": 0.06423539162867799, "grad_norm": 2.9508007571282318, "learning_rate": 6.404958677685951e-06, "loss": 0.8218, "step": 155},
    {"epoch": 0.0663075010360547, "grad_norm": 3.0527743250572033, "learning_rate": 6.611570247933885e-06, "loss": 0.8294, "step": 160},
    {"epoch": 0.06837961044343141, "grad_norm": 3.075132609753167, "learning_rate": 6.818181818181818e-06, "loss": 0.8266, "step": 165},
    {"epoch": 0.07045171985080813, "grad_norm": 2.972687092057336, "learning_rate": 7.0247933884297525e-06, "loss": 0.8139, "step": 170},
    {"epoch": 0.07252382925818483, "grad_norm": 3.068027468513739, "learning_rate": 7.231404958677687e-06, "loss": 0.7992, "step": 175},
    {"epoch": 0.07459593866556154, "grad_norm": 3.158357724987422, "learning_rate": 7.43801652892562e-06, "loss": 0.8134, "step": 180},
    {"epoch": 0.07666804807293826, "grad_norm": 3.051134008290911, "learning_rate": 7.644628099173555e-06, "loss": 0.8344, "step": 185},
    {"epoch": 0.07874015748031496, "grad_norm": 3.3235580452981175, "learning_rate": 7.851239669421489e-06, "loss": 0.8041, "step": 190},
    {"epoch": 0.08081226688769166, "grad_norm": 3.0678026193628805, "learning_rate": 8.057851239669421e-06, "loss": 0.8045, "step": 195},
    {"epoch": 0.08288437629506838, "grad_norm": 3.415940865122294, "learning_rate": 8.264462809917356e-06, "loss": 0.7953, "step": 200},
    {"epoch": 0.08495648570244509, "grad_norm": 3.815817395153579, "learning_rate": 8.47107438016529e-06, "loss": 0.8047, "step": 205},
    {"epoch": 0.0870285951098218, "grad_norm": 3.2984513943211153, "learning_rate": 8.677685950413224e-06, "loss": 0.8056, "step": 210},
    {"epoch": 0.08910070451719851, "grad_norm": 2.8880894973116793, "learning_rate": 8.884297520661158e-06, "loss": 0.795, "step": 215},
    {"epoch": 0.09117281392457521, "grad_norm": 2.974477767781155, "learning_rate": 9.090909090909091e-06, "loss": 0.8014, "step": 220},
    {"epoch": 0.09324492333195193, "grad_norm": 3.0809082608070444, "learning_rate": 9.297520661157025e-06, "loss": 0.812, "step": 225},
    {"epoch": 0.09531703273932864, "grad_norm": 3.11471679186006, "learning_rate": 9.50413223140496e-06, "loss": 0.815, "step": 230},
    {"epoch": 0.09738914214670534, "grad_norm": 3.3405073257172333, "learning_rate": 9.710743801652894e-06, "loss": 0.7991, "step": 235},
    {"epoch": 0.09946125155408206, "grad_norm": 3.1375906477529485, "learning_rate": 9.917355371900828e-06, "loss": 0.7902, "step": 240},
    {"epoch": 0.10153336096145876, "grad_norm": 2.8343248219683823, "learning_rate": 9.999952884702848e-06, "loss": 0.8215, "step": 245},
    {"epoch": 0.10360547036883548, "grad_norm": 2.9925055543397563, "learning_rate": 9.999664961102495e-06, "loss": 0.8084, "step": 250},
    {"epoch": 0.10567757977621219, "grad_norm": 3.0441664134730275, "learning_rate": 9.999115304121459e-06, "loss": 0.8085, "step": 255},
    {"epoch": 0.10774968918358889, "grad_norm": 3.039997210231939, "learning_rate": 9.998303942534383e-06, "loss": 0.7938, "step": 260},
    {"epoch": 0.10982179859096561, "grad_norm": 3.0431241796795696, "learning_rate": 9.997230918816193e-06, "loss": 0.7985, "step": 265},
    {"epoch": 0.11189390799834231, "grad_norm": 2.85027037020918, "learning_rate": 9.99589628913988e-06, "loss": 0.7861, "step": 270},
    {"epoch": 0.11396601740571902, "grad_norm": 3.051497861941328, "learning_rate": 9.994300123373554e-06, "loss": 0.7716, "step": 275},
    {"epoch": 0.11603812681309573, "grad_norm": 2.895125032094719, "learning_rate": 9.992442505076788e-06, "loss": 0.7834, "step": 280},
    {"epoch": 0.11811023622047244, "grad_norm": 3.0061812704069886, "learning_rate": 9.990323531496235e-06, "loss": 0.7756, "step": 285},
    {"epoch": 0.12018234562784914, "grad_norm": 3.0347710812056525, "learning_rate": 9.98794331356056e-06, "loss": 0.7846, "step": 290},
    {"epoch": 0.12225445503522586, "grad_norm": 2.805184541271578, "learning_rate": 9.985301975874604e-06, "loss": 0.7731, "step": 295},
    {"epoch": 0.12432656444260257, "grad_norm": 3.0842132440120147, "learning_rate": 9.982399656712884e-06, "loss": 0.8042, "step": 300},
    {"epoch": 0.12639867384997927, "grad_norm": 2.9008174183880233, "learning_rate": 9.979236508012341e-06, "loss": 0.7681, "step": 305},
    {"epoch": 0.12847078325735597, "grad_norm": 2.8236286325458244, "learning_rate": 9.975812695364391e-06, "loss": 0.7891, "step": 310},
    {"epoch": 0.1305428926647327, "grad_norm": 2.8468044382747384, "learning_rate": 9.97212839800626e-06, "loss": 0.7681, "step": 315},
    {"epoch": 0.1326150020721094, "grad_norm": 2.9534507280362003, "learning_rate": 9.968183808811586e-06, "loss": 0.7564, "step": 320},
    {"epoch": 0.13468711147948612, "grad_norm": 2.965082743686242, "learning_rate": 9.963979134280344e-06, "loss": 0.7529, "step": 325},
    {"epoch": 0.13675922088686282, "grad_norm": 2.847254353400155, "learning_rate": 9.959514594528018e-06, "loss": 0.7438, "step": 330},
    {"epoch": 0.13883133029423952, "grad_norm": 2.7966052001739206, "learning_rate": 9.954790423274086e-06, "loss": 0.7591, "step": 335},
    {"epoch": 0.14090343970161626, "grad_norm": 2.7319025763391576, "learning_rate": 9.94980686782978e-06, "loss": 0.7406, "step": 340},
    {"epoch": 0.14297554910899296, "grad_norm": 2.870893839424329, "learning_rate": 9.94456418908515e-06, "loss": 0.7541, "step": 345},
    {"epoch": 0.14504765851636967, "grad_norm": 3.009522094390881, "learning_rate": 9.939062661495387e-06, "loss": 0.7511, "step": 350},
    {"epoch": 0.14711976792374637, "grad_norm": 2.7182810127315573, "learning_rate": 9.933302573066477e-06, "loss": 0.7688, "step": 355},
    {"epoch": 0.14919187733112307, "grad_norm": 3.012302144551778, "learning_rate": 9.927284225340105e-06, "loss": 0.7341, "step": 360},
    {"epoch": 0.1512639867384998, "grad_norm": 3.337745192726695, "learning_rate": 9.921007933377886e-06, "loss": 0.7539, "step": 365},
    {"epoch": 0.1533360961458765, "grad_norm": 3.17882772460756, "learning_rate": 9.914474025744855e-06, "loss": 0.7506, "step": 370},
    {"epoch": 0.15540820555325321, "grad_norm": 3.2860227088727383, "learning_rate": 9.907682844492283e-06, "loss": 0.7514, "step": 375},
    {"epoch": 0.15748031496062992, "grad_norm": 3.1971805376225517, "learning_rate": 9.900634745139759e-06, "loss": 0.7475, "step": 380},
    {"epoch": 0.15955242436800662, "grad_norm": 2.8028862036100843, "learning_rate": 9.893330096656576e-06, "loss": 0.7285, "step": 385},
    {"epoch": 0.16162453377538333, "grad_norm": 2.708278578400365, "learning_rate": 9.885769281442426e-06, "loss": 0.7224, "step": 390},
    {"epoch": 0.16369664318276006, "grad_norm": 2.771966186486157, "learning_rate": 9.877952695307382e-06, "loss": 0.7287, "step": 395},
    {"epoch": 0.16576875259013676, "grad_norm": 2.645488728201908, "learning_rate": 9.869880747451164e-06, "loss": 0.7389, "step": 400},
    {"epoch": 0.16784086199751347, "grad_norm": 2.9190913357286306, "learning_rate": 9.861553860441726e-06, "loss": 0.7414, "step": 405},
    {"epoch": 0.16991297140489017, "grad_norm": 2.6920496051405998, "learning_rate": 9.852972470193136e-06, "loss": 0.7259, "step": 410},
    {"epoch": 0.17198508081226688, "grad_norm": 2.9415197302471663, "learning_rate": 9.844137025942755e-06, "loss": 0.7266, "step": 415},
    {"epoch": 0.1740571902196436, "grad_norm": 2.7808373109476294, "learning_rate": 9.835047990227713e-06, "loss": 0.7119, "step": 420},
    {"epoch": 0.1761292996270203, "grad_norm": 2.810707277824952, "learning_rate": 9.825705838860699e-06, "loss": 0.7361, "step": 425},
    {"epoch": 0.17820140903439702, "grad_norm": 3.0525714644872117, "learning_rate": 9.816111060905063e-06, "loss": 0.7146, "step": 430},
    {"epoch": 0.18027351844177372, "grad_norm": 2.769816103147695, "learning_rate": 9.806264158649193e-06, "loss": 0.7104, "step": 435},
    {"epoch": 0.18234562784915043, "grad_norm": 2.8698277588389516, "learning_rate": 9.796165647580233e-06, "loss": 0.7015, "step": 440},
    {"epoch": 0.18441773725652713, "grad_norm": 2.632541484736724, "learning_rate": 9.785816056357096e-06, "loss": 0.7148, "step": 445},
    {"epoch": 0.18648984666390386, "grad_norm": 2.6937205983501276, "learning_rate": 9.775215926782788e-06, "loss": 0.7203, "step": 450},
    {"epoch": 0.18856195607128057, "grad_norm": 2.7810565830560354, "learning_rate": 9.764365813776042e-06, "loss": 0.7068, "step": 455},
    {"epoch": 0.19063406547865727, "grad_norm": 2.8012616966262134, "learning_rate": 9.753266285342271e-06, "loss": 0.7104, "step": 460},
    {"epoch": 0.19270617488603398, "grad_norm": 2.970139751785694, "learning_rate": 9.741917922543831e-06, "loss": 0.6881, "step": 465},
    {"epoch": 0.19477828429341068, "grad_norm": 2.6941473339327824, "learning_rate": 9.7303213194696e-06, "loss": 0.6996, "step": 470},
    {"epoch": 0.1968503937007874, "grad_norm": 3.0409394398260554, "learning_rate": 9.718477083203888e-06, "loss": 0.6933, "step": 475},
    {"epoch": 0.19892250310816412, "grad_norm": 2.8048791766929146, "learning_rate": 9.706385833794639e-06, "loss": 0.6748, "step": 480},
    {"epoch": 0.20099461251554082, "grad_norm": 2.7366882872916887, "learning_rate": 9.694048204220986e-06, "loss": 0.7044, "step": 485},
    {"epoch": 0.20306672192291753, "grad_norm": 2.7267358802595374, "learning_rate": 9.681464840360105e-06, "loss": 0.6912, "step": 490},
    {"epoch": 0.20513883133029423, "grad_norm": 2.754099178327694, "learning_rate": 9.668636400953411e-06, "loss": 0.6731, "step": 495},
    {"epoch": 0.20721094073767096, "grad_norm": 3.11521121490046, "learning_rate": 9.655563557572068e-06, "loss": 0.7018, "step": 500},
    {"epoch": 0.20928305014504767, "grad_norm": 2.729648091432038, "learning_rate": 9.642246994581833e-06, "loss": 0.6919, "step": 505},
    {"epoch": 0.21135515955242437, "grad_norm": 2.788565650588865, "learning_rate": 9.62868740910723e-06, "loss": 0.6718, "step": 510},
    {"epoch": 0.21342726895980108, "grad_norm": 2.6821583529570527, "learning_rate": 9.614885510995047e-06, "loss": 0.6696, "step": 515},
    {"epoch": 0.21549937836717778, "grad_norm": 2.669430236726845, "learning_rate": 9.600842022777198e-06, "loss": 0.686, "step": 520},
    {"epoch": 0.21757148777455448, "grad_norm": 2.6329122210908613, "learning_rate": 9.58655767963287e-06, "loss": 0.6649, "step": 525},
    {"epoch": 0.21964359718193122, "grad_norm": 2.712511744397499, "learning_rate": 9.57203322935006e-06, "loss": 0.6691, "step": 530},
    {"epoch": 0.22171570658930792, "grad_norm": 2.971121715072785, "learning_rate": 9.557269432286406e-06, "loss": 0.6568, "step": 535},
    {"epoch": 0.22378781599668462, "grad_norm": 3.5850165463286587, "learning_rate": 9.542267061329407e-06, "loss": 0.6535, "step": 540},
    {"epoch": 0.22585992540406133, "grad_norm": 2.6756133538389593, "learning_rate": 9.52702690185594e-06, "loss": 0.6578, "step": 545},
    {"epoch": 0.22793203481143803, "grad_norm": 2.7169962232785947, "learning_rate": 9.511549751691159e-06, "loss": 0.6696, "step": 550},
    {"epoch": 0.23000414421881477, "grad_norm": 2.66297533869507, "learning_rate": 9.495836421066722e-06, "loss": 0.6594, "step": 555},
    {"epoch": 0.23207625362619147, "grad_norm": 2.974286886191662, "learning_rate": 9.47988773257838e-06, "loss": 0.6784, "step": 560},
    {"epoch": 0.23414836303356817, "grad_norm": 2.7185575923361243, "learning_rate": 9.46370452114291e-06, "loss": 0.658, "step": 565},
    {"epoch": 0.23622047244094488, "grad_norm": 2.829971742294898, "learning_rate": 9.447287633954406e-06, "loss": 0.6593, "step": 570},
    {"epoch": 0.23829258184832158, "grad_norm": 2.620399538248119, "learning_rate": 9.430637930439933e-06, "loss": 0.6641, "step": 575},
    {"epoch": 0.2403646912556983, "grad_norm": 2.8089738469657286, "learning_rate": 9.413756282214538e-06, "loss": 0.6443, "step": 580},
    {"epoch": 0.24243680066307502, "grad_norm": 2.6610020148736657, "learning_rate": 9.396643573035609e-06, "loss": 0.6619, "step": 585},
    {"epoch": 0.24450891007045172, "grad_norm": 2.693009848715325, "learning_rate": 9.37930069875662e-06, "loss": 0.6469, "step": 590},
    {"epoch": 0.24658101947782843, "grad_norm": 2.8338530690883657, "learning_rate": 9.36172856728023e-06, "loss": 0.6571, "step": 595},
    {"epoch": 0.24865312888520513, "grad_norm": 2.676634556349702, "learning_rate": 9.343928098510759e-06, "loss": 0.6358, "step": 600},
    {"epoch": 0.25072523829258186, "grad_norm": 2.6815297814713674, "learning_rate": 9.325900224306019e-06, "loss": 0.6366, "step": 605},
    {"epoch": 0.25279734769995854, "grad_norm": 2.693794960810057, "learning_rate": 9.307645888428542e-06, "loss": 0.6441, "step": 610},
    {"epoch": 0.2548694571073353, "grad_norm": 2.6733329911215757, "learning_rate": 9.289166046496172e-06, "loss": 0.6284, "step": 615},
    {"epoch": 0.25694156651471195, "grad_norm": 2.735971515551987, "learning_rate": 9.270461665932035e-06, "loss": 0.6394, "step": 620},
    {"epoch": 0.2590136759220887, "grad_norm": 2.7764753108215623, "learning_rate": 9.251533725913893e-06, "loss": 0.6308, "step": 625},
    {"epoch": 0.2610857853294654, "grad_norm": 2.9846661778059596, "learning_rate": 9.23238321732289e-06, "loss": 0.6381, "step": 630},
    {"epoch": 0.2631578947368421, "grad_norm": 2.639475488829137, "learning_rate": 9.213011142691672e-06, "loss": 0.6298, "step": 635},
    {"epoch": 0.2652300041442188, "grad_norm": 2.8031268295287037, "learning_rate": 9.193418516151913e-06, "loss": 0.6314, "step": 640},
    {"epoch": 0.2673021135515955, "grad_norm": 2.751071861402199, "learning_rate": 9.173606363381218e-06, "loss": 0.6243, "step": 645},
    {"epoch": 0.26937422295897223, "grad_norm": 2.8637192589002507, "learning_rate": 9.15357572154943e-06, "loss": 0.6226, "step": 650},
    {"epoch": 0.27144633236634896, "grad_norm": 2.757948609951417, "learning_rate": 9.133327639264334e-06, "loss": 0.6195, "step": 655},
    {"epoch": 0.27351844177372564, "grad_norm": 2.7078393676429724, "learning_rate": 9.112863176516761e-06, "loss": 0.6063, "step": 660},
    {"epoch": 0.2755905511811024, "grad_norm": 2.7174885956763517, "learning_rate": 9.092183404625107e-06, "loss": 0.6201, "step": 665},
    {"epoch": 0.27766266058847905, "grad_norm": 2.692439395697707, "learning_rate": 9.071289406179233e-06, "loss": 0.6186, "step": 670},
    {"epoch": 0.2797347699958558, "grad_norm": 2.705577759760543, "learning_rate": 9.0501822749838e-06, "loss": 0.6208, "step": 675},
    {"epoch": 0.2818068794032325, "grad_norm": 2.6961720306129497, "learning_rate": 9.028863116001013e-06, "loss": 0.6217, "step": 680},
    {"epoch": 0.2838789888106092, "grad_norm": 2.823153530451939, "learning_rate": 9.007333045292764e-06, "loss": 0.6095, "step": 685},
    {"epoch": 0.2859510982179859, "grad_norm": 2.7424489434092885, "learning_rate": 8.98559318996222e-06, "loss": 0.6071, "step": 690},
    {"epoch": 0.2880232076253626, "grad_norm": 2.8108371792200724, "learning_rate": 8.963644688094806e-06, "loss": 0.6123, "step": 695},
    {"epoch": 0.29009531703273933, "grad_norm": 2.7296570733868046, "learning_rate": 8.941488688698635e-06, "loss": 0.6038, "step": 700},
    {"epoch": 0.29216742644011606, "grad_norm": 2.5282731359900983, "learning_rate": 8.919126351644351e-06, "loss": 0.6051, "step": 705},
    {"epoch": 0.29423953584749274, "grad_norm": 2.733801097585659, "learning_rate": 8.896558847604414e-06, "loss": 0.6169, "step": 710},
    {"epoch": 0.29631164525486947, "grad_norm": 2.5922131116520433, "learning_rate": 8.873787357991811e-06, "loss": 0.6062, "step": 715},
    {"epoch": 0.29838375466224615, "grad_norm": 2.6565071641175844, "learning_rate": 8.850813074898218e-06, "loss": 0.6069, "step": 720},
    {"epoch": 0.3004558640696229, "grad_norm": 3.0927970126548154, "learning_rate": 8.827637201031579e-06, "loss": 0.5879, "step": 725},
    {"epoch": 0.3025279734769996, "grad_norm": 2.8105831813533033, "learning_rate": 8.804260949653154e-06, "loss": 0.6124, "step": 730},
    {"epoch": 0.3046000828843763, "grad_norm": 2.584377915155431, "learning_rate": 8.780685544514006e-06, "loss": 0.6073, "step": 735},
    {"epoch": 0.306672192291753, "grad_norm": 2.773926231343915, "learning_rate": 8.756912219790933e-06, "loss": 0.5999, "step": 740},
    {"epoch": 0.3087443016991297, "grad_norm": 2.732022843788419, "learning_rate": 8.732942220021859e-06, "loss": 0.5762, "step": 745},
    {"epoch": 0.31081641110650643, "grad_norm": 2.595571184744404, "learning_rate": 8.708776800040679e-06, "loss": 0.5846, "step": 750},
    {"epoch": 0.3128885205138831, "grad_norm": 2.6942143484830985, "learning_rate": 8.684417224911579e-06, "loss": 0.6003, "step": 755},
    {"epoch": 0.31496062992125984, "grad_norm": 2.6835198662003155, "learning_rate": 8.659864769862797e-06, "loss": 0.5838, "step": 760},
    {"epoch": 0.31703273932863657, "grad_norm": 2.7778129772748383, "learning_rate": 8.635120720219877e-06, "loss": 0.5794, "step": 765},
    {"epoch": 0.31910484873601325, "grad_norm": 2.6022076775767493, "learning_rate": 8.610186371338364e-06, "loss": 0.586, "step": 770},
    {"epoch": 0.32117695814339, "grad_norm": 2.7328885442816557, "learning_rate": 8.585063028536015e-06, "loss": 0.5987, "step": 775},
    {"epoch": 0.32324906755076666, "grad_norm": 2.635761747619665, "learning_rate": 8.559752007024449e-06, "loss": 0.5859, "step": 780},
    {"epoch": 0.3253211769581434, "grad_norm": 2.79153427275607, "learning_rate": 8.534254631840297e-06, "loss": 0.5976, "step": 785},
    {"epoch": 0.3273932863655201, "grad_norm": 2.656496652025873, "learning_rate": 8.50857223777584e-06, "loss": 0.578, "step": 790},
    {"epoch": 0.3294653957728968, "grad_norm": 2.7338530782471655, "learning_rate": 8.482706169309139e-06, "loss": 0.5648, "step": 795},
    {"epoch": 0.33153750518027353, "grad_norm": 2.667291889393192, "learning_rate": 8.456657780533633e-06, "loss": 0.5641, "step": 800},
    {"epoch": 0.3336096145876502, "grad_norm": 2.9479902385492562, "learning_rate": 8.430428435087267e-06, "loss": 0.5665, "step": 805},
    {"epoch": 0.33568172399502694, "grad_norm": 2.9408488784356748, "learning_rate": 8.404019506081103e-06, "loss": 0.5834, "step": 810},
    {"epoch": 0.33775383340240367, "grad_norm": 2.707200722617604, "learning_rate": 8.377432376027437e-06, "loss": 0.5756, "step": 815},
    {"epoch": 0.33982594280978035, "grad_norm": 2.70828380166259, "learning_rate": 8.350668436767413e-06, "loss": 0.5686, "step": 820},
    {"epoch": 0.3418980522171571, "grad_norm": 2.7745931767789886, "learning_rate": 8.323729089398182e-06, "loss": 0.5521, "step": 825},
    {"epoch": 0.34397016162453375, "grad_norm": 2.685590822904809, "learning_rate": 8.296615744199533e-06, "loss": 0.5707, "step": 830},
    {"epoch": 0.3460422710319105, "grad_norm": 2.6439480580475023, "learning_rate": 8.269329820560074e-06, "loss": 0.549, "step": 835},
    {"epoch": 0.3481143804392872, "grad_norm": 2.6684012534208845, "learning_rate": 8.241872746902934e-06, "loss": 0.5614, "step": 840},
    {"epoch": 0.3501864898466639, "grad_norm": 2.6586973407105847, "learning_rate": 8.214245960610966e-06, "loss": 0.5596, "step": 845},
    {"epoch": 0.3522585992540406, "grad_norm": 2.630842021442866, "learning_rate": 8.18645090795152e-06, "loss": 0.5435, "step": 850},
    {"epoch": 0.3543307086614173, "grad_norm": 2.783114608569901, "learning_rate": 8.158489044000712e-06, "loss": 0.554, "step": 855},
    {"epoch": 0.35640281806879404, "grad_norm": 2.6861386548441266, "learning_rate": 8.13036183256727e-06, "loss": 0.5503, "step": 860},
    {"epoch": 0.35847492747617077, "grad_norm": 2.760451069908695, "learning_rate": 8.102070746115888e-06, "loss": 0.5504, "step": 865},
    {"epoch": 0.36054703688354744, "grad_norm": 3.1912807546036155, "learning_rate": 8.073617265690144e-06, "loss": 0.5585, "step": 870},
    {"epoch": 0.3626191462909242, "grad_norm": 2.7551688368780365, "learning_rate": 8.045002880834975e-06, "loss": 0.5499, "step": 875},
    {"epoch": 0.36469125569830085, "grad_norm": 2.656529615625788, "learning_rate": 8.016229089518695e-06, "loss": 0.5472, "step": 880},
    {"epoch": 0.3667633651056776, "grad_norm": 2.456139438562433, "learning_rate": 7.987297398054572e-06, "loss": 0.5444, "step": 885},
    {"epoch": 0.36883547451305426, "grad_norm": 2.7021257640939678, "learning_rate": 7.95820932102198e-06, "loss": 0.5467, "step": 890},
    {"epoch": 0.370907583920431, "grad_norm": 2.576896456231456, "learning_rate": 7.9289663811871e-06, "loss": 0.5453, "step": 895},
    {"epoch": 0.3729796933278077, "grad_norm": 2.509602711427261, "learning_rate": 7.899570109423219e-06, "loss": 0.5315, "step": 900},
    {"epoch": 0.3750518027351844, "grad_norm": 2.7824790919014486, "learning_rate": 7.870022044630569e-06, "loss": 0.5367, "step": 905},
    {"epoch": 0.37712391214256114, "grad_norm": 2.635566041127458, "learning_rate": 7.84032373365578e-06, "loss": 0.5458, "step": 910},
    {"epoch": 0.3791960215499378, "grad_norm": 2.5686297030493277, "learning_rate": 7.810476731210897e-06, "loss": 0.5538, "step": 915},
    {"epoch": 0.38126813095731454, "grad_norm": 2.4885943009976366, "learning_rate": 7.780482599791987e-06, "loss": 0.5501, "step": 920},
    {"epoch": 0.3833402403646913, "grad_norm": 2.6312255974545704, "learning_rate": 7.750342909597353e-06, "loss": 0.5412, "step": 925},
    {"epoch": 0.38541234977206795, "grad_norm": 2.522165622730654, "learning_rate": 7.72005923844532e-06, "loss": 0.5313, "step": 930},
    {"epoch": 0.3874844591794447, "grad_norm": 2.6702491559670167, "learning_rate": 7.689633171691646e-06, "loss": 0.5345, "step": 935},
    {"epoch": 0.38955656858682136, "grad_norm": 2.6631872386085185, "learning_rate": 7.659066302146523e-06, "loss": 0.5452, "step": 940},
    {"epoch": 0.3916286779941981, "grad_norm": 2.5785637853885195, "learning_rate": 7.628360229991198e-06, "loss": 0.5288, "step": 945},
    {"epoch": 0.3937007874015748, "grad_norm": 2.6837857193306935, "learning_rate": 7.597516562694198e-06, "loss": 0.5306, "step": 950},
    {"epoch": 0.3957728968089515, "grad_norm": 2.550858198260864, "learning_rate": 7.56653691492718e-06, "loss": 0.5233, "step": 955},
    {"epoch": 0.39784500621632823, "grad_norm": 2.6115170843406132, "learning_rate": 7.535422908480408e-06, "loss": 0.5424, "step": 960},
    {"epoch": 0.3999171156237049, "grad_norm": 2.5494802625123105, "learning_rate": 7.504176172177842e-06, "loss": 0.5171, "step": 965},
    {"epoch": 0.40198922503108164, "grad_norm": 2.5129023648194284, "learning_rate": 7.472798341791877e-06, "loss": 0.5148, "step": 970},
    {"epoch": 0.4040613344384584, "grad_norm": 2.6166709552589866, "learning_rate": 7.441291059957709e-06, "loss": 0.5292, "step": 975},
    {"epoch": 0.40613344384583505, "grad_norm": 2.5745519804152073, "learning_rate": 7.409655976087338e-06, "loss": 0.5228, "step": 980},
    {"epoch": 0.4082055532532118, "grad_norm": 2.7301378295783643, "learning_rate": 7.377894746283227e-06, "loss": 0.5343, "step": 985},
    {"epoch": 0.41027766266058846, "grad_norm": 2.431218699982397, "learning_rate": 7.3460090332516e-06, "loss": 0.508, "step": 990},
    {"epoch": 0.4123497720679652, "grad_norm": 2.556296703014823, "learning_rate": 7.314000506215402e-06, "loss": 0.5148, "step": 995},
    {"epoch": 0.4144218814753419, "grad_norm": 2.5152139819517023, "learning_rate": 7.281870840826912e-06, "loss": 0.4999, "step": 1000},
    {"epoch": 0.4164939908827186, "grad_norm": 2.786935816025543, "learning_rate": 7.249621719080026e-06, "loss": 0.5177, "step": 1005},
    {"epoch": 0.41856610029009533, "grad_norm": 2.5491738667702672, "learning_rate": 7.217254829222201e-06, "loss": 0.5114, "step": 1010},
    {"epoch": 0.420638209697472, "grad_norm": 2.50660251762613, "learning_rate": 7.1847718656660755e-06, "loss": 0.5156, "step": 1015},
    {"epoch": 0.42271031910484874, "grad_norm": 2.5986079473520878, "learning_rate": 7.152174528900773e-06, "loss": 0.4954, "step": 1020},
    {"epoch": 0.4247824285122254, "grad_norm": 2.635934213356069, "learning_rate": 7.119464525402867e-06, "loss": 0.504, "step": 1025},
    {"epoch": 0.42685453791960215, "grad_norm": 2.861696123619274, "learning_rate": 7.08664356754706e-06, "loss": 0.5063, "step": 1030},
    {"epoch": 0.4289266473269789, "grad_norm": 2.5689042563262223, "learning_rate": 7.053713373516538e-06, "loss": 0.5181, "step": 1035},
    {"epoch": 0.43099875673435556, "grad_norm": 2.5132849205491286, "learning_rate": 7.020675667213015e-06, "loss": 0.5043, "step": 1040},
    {"epoch": 0.4330708661417323, "grad_norm": 2.704397404776844, "learning_rate": 6.987532178166496e-06, "loss": 0.5022, "step": 1045},
    {"epoch": 0.43514297554910897, "grad_norm": 2.4785754646781104, "learning_rate": 6.9542846414447306e-06, "loss": 0.5027, "step": 1050},
    {"epoch": 0.4372150849564857, "grad_norm": 2.3912935006226594, "learning_rate": 6.920934797562385e-06, "loss": 0.5051, "step": 1055},
    {"epoch": 0.43928719436386243, "grad_norm": 2.3476041200753546, "learning_rate": 6.887484392389923e-06, "loss": 0.5043, "step": 1060},
    {"epoch": 0.4413593037712391, "grad_norm": 2.5914055427224465, "learning_rate": 6.853935177062208e-06, "loss": 0.4974, "step": 1065},
    {"epoch": 0.44343141317861584, "grad_norm": 2.594951575952019, "learning_rate": 6.8202889078868395e-06, "loss": 0.5061, "step": 1070},
    {"epoch": 0.4455035225859925, "grad_norm": 2.6023391478211293, "learning_rate": 6.786547346252198e-06, "loss": 0.4963, "step": 1075},
    {"epoch": 0.44757563199336925, "grad_norm": 2.3528096807317995, "learning_rate": 6.7527122585352435e-06, "loss": 0.4883, "step": 1080},
    {"epoch": 0.449647741400746, "grad_norm": 2.7398459124724814, "learning_rate": 6.718785416009044e-06, "loss": 0.4968, "step": 1085},
    {"epoch": 0.45171985080812266, "grad_norm": 2.838272166911696, "learning_rate": 6.6847685947500495e-06, "loss": 0.4915, "step": 1090},
    {"epoch": 0.4537919602154994, "grad_norm": 3.0284763209341623, "learning_rate": 6.650663575545111e-06, "loss": 0.4762, "step": 1095},
    {"epoch": 0.45586406962287607, "grad_norm": 2.6967943018540517, "learning_rate": 6.61647214379826e-06, "loss": 0.4737, "step": 1100},
    {"epoch": 0.4579361790302528, "grad_norm": 2.551888108931838, "learning_rate": 6.582196089437241e-06, "loss": 0.5076, "step": 1105},
    {"epoch": 0.46000828843762953, "grad_norm": 2.514843118690436, "learning_rate": 6.547837206819804e-06, "loss": 0.4876, "step": 1110},
    {"epoch": 0.4620803978450062, "grad_norm": 2.4499641238461005, "learning_rate": 6.513397294639778e-06, "loss": 0.4785, "step": 1115},
    {"epoch": 0.46415250725238294, "grad_norm": 2.5439805360709378, "learning_rate": 6.478878155832904e-06, "loss": 0.4609, "step": 1120},
    {"epoch": 0.4662246166597596, "grad_norm": 2.567465771254044, "learning_rate": 6.444281597482449e-06, "loss": 0.4826, "step": 1125},
    {"epoch": 0.46829672606713635, "grad_norm": 2.4194190626785863, "learning_rate": 6.409609430724607e-06, "loss": 0.4639, "step": 1130},
    {"epoch": 0.4703688354745131, "grad_norm": 2.4928927116754216, "learning_rate": 6.3748634706536905e-06, "loss": 0.4755, "step": 1135},
    {"epoch": 0.47244094488188976, "grad_norm": 2.5328967096855517, "learning_rate": 6.340045536227101e-06, "loss": 0.4676, "step": 1140},
    {"epoch": 0.4745130542892665, "grad_norm": 2.580851369974543, "learning_rate": 6.305157450170112e-06, "loss": 0.4679, "step": 1145},
    {"epoch": 0.47658516369664317, "grad_norm": 2.771781788352758, "learning_rate": 6.270201038880451e-06, "loss": 0.4748, "step": 1150},
    {"epoch": 0.4786572731040199, "grad_norm": 2.3357211287270294, "learning_rate": 6.235178132332678e-06, "loss": 0.4733, "step": 1155},
    {"epoch": 0.4807293825113966, "grad_norm": 2.4376737050512993, "learning_rate": 6.200090563982397e-06, "loss": 0.4623, "step": 1160},
    {"epoch": 0.4828014919187733, "grad_norm": 2.727521949059151, "learning_rate": 6.164940170670266e-06, "loss": 0.4763, "step": 1165},
    {"epoch": 0.48487360132615004, "grad_norm": 2.485190311900293, "learning_rate": 6.129728792525847e-06, "loss": 0.4653, "step": 1170},
    {"epoch": 0.4869457107335267, "grad_norm": 2.3795187623979746, "learning_rate": 6.094458272871259e-06, "loss": 0.4576, "step": 1175},
    {"epoch": 0.48901782014090345, "grad_norm": 2.43415537867363, "learning_rate": 6.0591304581247005e-06, "loss": 0.4606, "step": 1180},
    {"epoch": 0.4910899295482801, "grad_norm": 2.5114902879972374, "learning_rate": 6.023747197703771e-06, "loss": 0.4671, "step": 1185},
    {"epoch": 0.49316203895565686, "grad_norm": 2.5122364778542225, "learning_rate": 5.988310343928665e-06, "loss": 0.4678, "step": 1190},
    {"epoch": 0.4952341483630336, "grad_norm": 2.6311201696718283, "learning_rate": 5.9528217519252e-06, "loss": 0.4653, "step": 1195},
    {"epoch": 0.49730625777041026, "grad_norm": 2.634373162867216, "learning_rate": 5.9172832795276965e-06, "loss": 0.4858, "step": 1200},
    {"epoch": 0.499378367177787, "grad_norm": 2.602845367016974, "learning_rate": 5.881696787181724e-06, "loss": 0.4646, "step": 1205},
    {"epoch": 0.5014504765851637, "grad_norm": 2.4997645055614544, "learning_rate": 5.846064137846704e-06, "loss": 0.4723, "step": 1210},
    {"epoch": 0.5035225859925404, "grad_norm": 2.5259484476620853, "learning_rate": 5.810387196898387e-06, "loss": 0.4592, "step": 1215},
    {"epoch": 0.5055946953999171, "grad_norm": 2.460185161862939, "learning_rate": 5.7746678320311955e-06, "loss": 0.4563, "step": 1220},
    {"epoch": 0.5076668048072939, "grad_norm": 2.522924782621583, "learning_rate": 5.738907913160452e-06, "loss": 0.455, "step": 1225},
    {"epoch": 0.5097389142146705, "grad_norm": 2.450518901210104, "learning_rate": 5.703109312324493e-06, "loss": 0.4631, "step": 1230},
    {"epoch": 0.5118110236220472, "grad_norm": 2.512677663120476, "learning_rate": 5.667273903586656e-06, "loss": 0.4636, "step": 1235},
    {"epoch": 0.5138831330294239, "grad_norm": 2.508483790573203, "learning_rate": 5.6314035629371835e-06, "loss": 0.4494, "step": 1240},
    {"epoch": 0.5159552424368007, "grad_norm": 2.539619573950297, "learning_rate": 5.595500168195007e-06, "loss": 0.4657, "step": 1245},
    {"epoch": 0.5180273518441774, "grad_norm": 2.457760045030727, "learning_rate": 5.5595655989094525e-06, "loss": 0.4562, "step": 1250},
    {"epoch": 0.520099461251554, "grad_norm": 2.5810506702287546, "learning_rate": 5.52360173626183e-06, "loss": 0.4587, "step": 1255},
    {"epoch": 0.5221715706589308, "grad_norm": 2.533803600220524, "learning_rate": 5.487610462966969e-06, "loss": 0.4473, "step": 1260},
    {"epoch": 0.5242436800663075, "grad_norm": 2.408501431878237, "learning_rate": 5.451593663174647e-06, "loss": 0.4466, "step": 1265},
    {"epoch": 0.5263157894736842, "grad_norm": 2.5248933210116427, "learning_rate": 5.4155532223709625e-06, "loss": 0.4427, "step": 1270},
    {"epoch": 0.528387898881061, "grad_norm": 2.5355682898170704, "learning_rate": 5.379491027279622e-06, "loss": 0.4624, "step": 1275},
    {"epoch": 0.5304600082884376, "grad_norm": 2.7849780348488156, "learning_rate": 5.343408965763174e-06, "loss": 0.4487, "step": 1280},
    {"epoch": 0.5325321176958143, "grad_norm": 2.401434318103116, "learning_rate": 5.3073089267241805e-06, "loss": 0.4393, "step": 1285},
    {"epoch": 0.534604227103191, "grad_norm": 2.4324353094703475, "learning_rate": 5.271192800006325e-06, "loss": 0.4405, "step": 1290},
    {"epoch": 0.5366763365105678, "grad_norm": 2.5008934086234027, "learning_rate": 5.235062476295488e-06, "loss": 0.4206, "step": 1295},
    {"epoch": 0.5387484459179445, "grad_norm": 2.523177697285061, "learning_rate": 5.198919847020765e-06, "loss": 0.4378, "step": 1300},
    {"epoch": 0.5408205553253211, "grad_norm": 2.561504860219997, "learning_rate": 5.162766804255446e-06, "loss": 0.4369, "step": 1305},
    {"epoch": 0.5428926647326979, "grad_norm": 2.4342748058160457, "learning_rate": 5.1266052406179755e-06, "loss": 0.4429, "step": 1310},
    {"epoch": 0.5449647741400746, "grad_norm": 2.5003051526668054, "learning_rate": 5.090437049172861e-06, "loss": 0.4479, "step": 1315},
    {"epoch": 0.5470368835474513, "grad_norm": 2.41044724225261, "learning_rate": 5.054264123331583e-06, "loss": 0.4348, "step": 1320},
    {"epoch": 0.5491089929548281, "grad_norm": 2.534587174570914, "learning_rate": 5.018088356753463e-06, "loss": 0.4346, "step": 1325},
    {"epoch": 0.5511811023622047, "grad_norm": 2.3874656931988953, "learning_rate": 4.981911643246539e-06, "loss": 0.4463, "step": 1330},
    {"epoch": 0.5532532117695814, "grad_norm": 2.344775548708239, "learning_rate": 4.9457358766684175e-06, "loss": 0.4323, "step": 1335},
    {"epoch": 0.5553253211769581, "grad_norm": 2.394852642174165, "learning_rate": 4.9095629508271396e-06, "loss": 0.412, "step": 1340},
    {"epoch": 0.5573974305843349, "grad_norm": 2.4107260444211067, "learning_rate": 4.873394759382025e-06, "loss": 0.434, "step": 1345},
    {"epoch": 0.5594695399917116, "grad_norm": 2.39603695241891, "learning_rate": 4.837233195744556e-06, "loss": 0.4368, "step": 1350},
    {"epoch": 0.5615416493990882, "grad_norm": 2.484288425740336, "learning_rate": 4.8010801529792375e-06, "loss": 0.4247, "step": 1355},
    {"epoch": 0.563613758806465, "grad_norm": 2.491968721281532, "learning_rate": 4.7649375237045135e-06, "loss": 0.4239, "step": 1360},
    {"epoch": 0.5656858682138417, "grad_norm": 2.405464453263017, "learning_rate": 4.728807199993677e-06, "loss": 0.4156, "step": 1365},
    {"epoch": 0.5677579776212184, "grad_norm": 2.3649504461482405, "learning_rate": 4.692691073275822e-06, "loss": 0.4272, "step": 1370},
    {"epoch": 0.5698300870285951, "grad_norm": 2.578097130955108, "learning_rate": 4.656591034236827e-06, "loss": 0.4318, "step": 1375},
    {"epoch": 0.5719021964359718, "grad_norm": 2.3565047434019233, "learning_rate": 4.620508972720379e-06, "loss": 0.4157, "step": 1380},
    {"epoch": 0.5739743058433485, "grad_norm": 2.4914369771154776, "learning_rate": 4.584446777629038e-06, "loss": 0.4131, "step": 1385},
    {"epoch": 0.5760464152507252, "grad_norm": 2.5403254221008016, "learning_rate": 4.548406336825354e-06, "loss": 0.4209, "step": 1390},
    {"epoch": 0.578118524658102, "grad_norm": 2.4433869039162763, "learning_rate": 4.512389537033032e-06, "loss": 0.4156, "step": 1395},
    {"epoch": 0.5801906340654787, "grad_norm": 2.5532014778576606, "learning_rate": 4.476398263738171e-06, "loss": 0.4187, "step": 1400},
    {"epoch": 0.5822627434728553, "grad_norm": 2.422240802637448, "learning_rate": 4.440434401090549e-06, "loss": 0.4123, "step": 1405},
    {"epoch": 0.5843348528802321, "grad_norm": 2.327309317585392, "learning_rate": 4.404499831804993e-06, "loss": 0.4167, "step": 1410},
    {"epoch": 0.5864069622876088, "grad_norm": 2.337179254235494, "learning_rate": 4.368596437062819e-06, "loss": 0.4253, "step": 1415},
    {"epoch": 0.5884790716949855, "grad_norm": 2.6203872163413, "learning_rate": 4.332726096413346e-06, "loss": 0.4035, "step": 1420},
    {"epoch": 0.5905511811023622, "grad_norm": 2.45290146849787, "learning_rate": 4.29689068767551e-06, "loss": 0.4205, "step": 1425},
    {"epoch": 0.5926232905097389, "grad_norm": 2.442505011249494, "learning_rate": 4.261092086839549e-06, "loss": 0.4199, "step": 1430},
    {"epoch": 0.5946953999171156, "grad_norm": 2.469896647219146, "learning_rate": 4.225332167968808e-06, "loss": 0.4197, "step": 1435},
    {"epoch": 0.5967675093244923, "grad_norm": 2.525933254390451, "learning_rate": 4.189612803101614e-06, "loss": 0.4136, "step": 1440},
    {"epoch": 0.5988396187318691, "grad_norm": 2.3481926369028154, "learning_rate": 4.153935862153299e-06, "loss": 0.4098, "step": 1445},
    {"epoch": 0.6009117281392458, "grad_norm": 2.4652012656554336, "learning_rate": 4.118303212818277e-06, "loss": 0.4111, "step": 1450},
    {"epoch": 0.6029838375466224, "grad_norm": 2.597755453602896, "learning_rate": 4.082716720472304e-06, "loss": 0.4141, "step": 1455},
    {"epoch": 0.6050559469539992, "grad_norm": 2.3332725702199912, "learning_rate": 4.0471782480748005e-06, "loss": 0.3886, "step": 1460},
    {"epoch": 0.6071280563613759, "grad_norm": 2.387505345147473, "learning_rate": 4.011689656071334e-06, "loss": 0.4167, "step": 1465},
    {"epoch": 0.6092001657687526, "grad_norm": 2.4113537140405388, "learning_rate": 3.97625280229623e-06, "loss": 0.4002, "step": 1470},
    {"epoch": 0.6112722751761293, "grad_norm": 2.4554203658965488, "learning_rate": 3.940869541875301e-06, "loss": 0.3881, "step": 1475},
    {"epoch": 0.613344384583506, "grad_norm": 2.425914982917716, "learning_rate": 3.905541727128743e-06, "loss": 0.4069, "step": 1480},
    {"epoch": 0.6154164939908827, "grad_norm": 2.5231106398066476, "learning_rate": 3.870271207474154e-06, "loss": 0.4002, "step": 1485},
    {"epoch": 0.6174886033982594, "grad_norm": 2.4689338483413135, "learning_rate": 3.8350598293297345e-06, "loss": 0.4141, "step": 1490},
    {"epoch": 0.6195607128056362, "grad_norm": 2.496046599900669, "learning_rate": 3.7999094360176036e-06, "loss": 0.3965, "step": 1495},
    {"epoch": 0.6216328222130129, "grad_norm": 2.3726201269683553, "learning_rate": 3.7648218676673232e-06, "loss": 0.4017, "step": 1500},
    {"epoch": 0.6237049316203895, "grad_norm": 2.418446591018178, "learning_rate": 3.7297989611195504e-06, "loss": 0.3938, "step": 1505},
    {"epoch": 0.6257770410277662, "grad_norm": 2.4157452764623963, "learning_rate": 3.694842549829889e-06, "loss": 0.3871, "step": 1510},
    {"epoch": 0.627849150435143, "grad_norm": 2.4088293626826114, "learning_rate": 3.659954463772901e-06, "loss": 0.4002, "step": 1515},
    {"epoch": 0.6299212598425197, "grad_norm": 2.5779980646499543, "learning_rate": 3.625136529346312e-06, "loss": 0.4055, "step": 1520},
    {"epoch": 0.6319933692498964, "grad_norm": 2.4110720231898806, "learning_rate": 3.590390569275395e-06, "loss": 0.3913, "step": 1525},
    {"epoch": 0.6340654786572731, "grad_norm": 2.3982253161569234, "learning_rate": 3.555718402517554e-06, "loss": 0.3962, "step": 1530},
    {"epoch": 0.6361375880646498, "grad_norm": 2.392216618491618, "learning_rate": 3.521121844167098e-06, "loss": 0.399, "step": 1535},
    {"epoch": 0.6382096974720265, "grad_norm": 2.3550943907800193, "learning_rate": 3.486602705360224e-06, "loss": 0.3927, "step": 1540},
    {"epoch": 0.6402818068794033, "grad_norm": 2.4744483911955153, "learning_rate": 3.4521627931801976e-06, "loss": 0.3961, "step": 1545},
    {"epoch": 0.64235391628678, "grad_norm": 2.416762173736933, "learning_rate": 3.41780391056276e-06, "loss": 0.3959, "step": 1550},
    {"epoch": 0.6444260256941566, "grad_norm": 2.3621515478388906, "learning_rate": 3.3835278562017405e-06, "loss": 0.3889, "step": 1555},
    {"epoch": 0.6464981351015333, "grad_norm": 2.3799558311767917, "learning_rate": 3.349336424454889e-06, "loss": 0.395, "step": 1560},
    {"epoch": 0.6485702445089101, "grad_norm": 2.4290023388512143, "learning_rate": 3.3152314052499513e-06, "loss": 0.3921, "step": 1565},
    {"epoch": 0.6506423539162868, "grad_norm": 2.386844861650698, "learning_rate": 3.2812145839909566e-06, "loss": 0.382, "step": 1570},
    {"epoch": 0.6527144633236635, "grad_norm": 2.58783476073575, "learning_rate": 3.247287741464758e-06, "loss": 0.3961, "step": 1575},
    {"epoch": 0.6547865727310402, "grad_norm": 2.4711795095903906, "learning_rate": 3.2134526537478034e-06, "loss": 0.403, "step": 1580},
    {"epoch": 0.6568586821384169, "grad_norm": 2.445518124608817, "learning_rate": 3.1797110921131626e-06, "loss": 0.3949, "step": 1585},
    {"epoch": 0.6589307915457936, "grad_norm": 2.3302808051599957, "learning_rate": 3.1460648229377933e-06, "loss": 0.4003, "step": 1590},
    {"epoch": 0.6610029009531704, "grad_norm": 2.4700969540161086, "learning_rate": 3.1125156076100804e-06, "loss": 0.3845, "step": 1595},
    {"epoch": 0.6630750103605471, "grad_norm": 2.2736405570858684, "learning_rate": 3.0790652024376163e-06, "loss": 0.3755, "step": 1600},
    {"epoch": 0.6651471197679237, "grad_norm": 2.3793447678069732, "learning_rate": 3.0457153585552724e-06, "loss": 0.3764, "step": 1605},
    {"epoch": 0.6672192291753004, "grad_norm": 2.381350136652911, "learning_rate": 3.012467821833506e-06, "loss": 0.388, "step": 1610},
    {"epoch": 0.6692913385826772, "grad_norm": 2.440306240554211, "learning_rate": 2.979324332786987e-06, "loss": 0.3914, "step": 1615},
    {"epoch": 0.6713634479900539, "grad_norm": 2.473851856313499, "learning_rate": 2.946286626483463e-06, "loss": 0.3844, "step": 1620},
    {"epoch": 0.6734355573974306, "grad_norm": 2.266055102817957, "learning_rate": 2.913356432452942e-06, "loss": 0.3789, "step": 1625},
    {"epoch": 0.6755076668048073, "grad_norm": 2.4641402988030126, "learning_rate": 2.8805354745971336e-06, "loss": 0.37, "step": 1630},
    {"epoch": 0.677579776212184, "grad_norm": 2.3943294283961984, "learning_rate": 2.847825471099227e-06, "loss": 0.3777, "step": 1635},
    {"epoch": 0.6796518856195607, "grad_norm": 2.384842311722, "learning_rate": 2.815228134333925e-06, "loss": 0.382, "step": 1640},
    {"epoch": 0.6817239950269374, "grad_norm": 2.469151674287745, "learning_rate": 2.782745170777801e-06, "loss": 0.3704, "step": 1645},
    {"epoch": 0.6837961044343142, "grad_norm": 2.583143124954603, "learning_rate": 2.750378280919975e-06, "loss": 0.3736, "step": 1650},
    {"epoch": 0.6858682138416908, "grad_norm": 2.4321963843177947, "learning_rate": 2.7181291591730885e-06, "loss": 0.3782, "step": 1655},
    {"epoch": 0.6879403232490675, "grad_norm": 2.3531730582824975, "learning_rate": 2.6859994937846002e-06, "loss": 0.376, "step": 1660},
    {"epoch": 0.6900124326564443, "grad_norm": 2.4367688917720485, "learning_rate": 2.653990966748401e-06, "loss": 0.377, "step": 1665},
    {"epoch": 0.692084542063821, "grad_norm": 2.3875592466498605, "learning_rate": 2.622105253716774e-06, "loss": 0.3772, "step": 1670},
    {"epoch": 0.6941566514711976, "grad_norm": 2.424175019670591, "learning_rate": 2.5903440239126633e-06, "loss": 0.3762, "step": 1675},
    {"epoch": 0.6962287608785744, "grad_norm": 2.337415256255787, "learning_rate": 2.5587089400422936e-06, "loss": 0.3906, "step": 1680},
    {"epoch": 0.6983008702859511, "grad_norm": 2.384059902937286, "learning_rate": 2.5272016582081236e-06, "loss": 0.3661, "step": 1685},
    {"epoch": 0.7003729796933278, "grad_norm": 2.346884231086367, "learning_rate": 2.4958238278221603e-06, "loss": 0.3632, "step": 1690},
    {"epoch": 0.7024450891007045, "grad_norm": 2.3658711079290264, "learning_rate": 2.464577091519594e-06, "loss": 0.3695, "step": 1695},
    {"epoch": 0.7045171985080813, "grad_norm": 2.3082126926984277, "learning_rate": 2.43346308507282e-06, "loss": 0.3697, "step": 1700},
    {"epoch": 0.7065893079154579, "grad_norm": 2.2724590255115324, "learning_rate": 2.4024834373058024e-06, "loss": 0.3708, "step": 1705},
    {"epoch": 0.7086614173228346, "grad_norm": 2.4685609897195633, "learning_rate": 2.371639770008804e-06, "loss": 0.3589, "step": 1710},
    {"epoch": 0.7107335267302114, "grad_norm": 2.4411171519085704, "learning_rate": 2.3409336978534785e-06, "loss": 0.3657, "step": 1715},
    {"epoch": 0.7128056361375881, "grad_norm": 2.3755249742291227, "learning_rate": 2.3103668283083563e-06, "loss": 0.3688, "step": 1720},
    {"epoch": 0.7148777455449647, "grad_norm": 2.3679474209669307, "learning_rate": 2.2799407615546816e-06, "loss": 0.3738, "step": 1725},
    {"epoch": 0.7169498549523415, "grad_norm": 2.351814749989449, "learning_rate": 2.2496570904026484e-06, "loss": 0.3647, "step": 1730},
    {"epoch": 0.7190219643597182, "grad_norm": 2.4116114120516103, "learning_rate": 2.219517400208015e-06, "loss": 0.3736, "step": 1735},
    {"epoch": 0.7210940737670949, "grad_norm": 2.3864727744957412, "learning_rate": 2.1895232687891044e-06, "loss": 0.3484, "step": 1740},
    {"epoch": 0.7231661831744716, "grad_norm": 2.2512303655303874, "learning_rate": 2.159676266344222e-06, "loss": 0.3599, "step": 1745},
    {"epoch": 0.7252382925818484, "grad_norm": 2.2576815601903872, "learning_rate": 2.1299779553694323e-06, "loss": 0.358, "step": 1750},
    {"epoch": 0.727310401989225, "grad_norm": 2.4261849484523594, "learning_rate": 2.100429890576782e-06, "loss": 0.3638, "step": 1755},
    {"epoch": 0.7293825113966017, "grad_norm": 2.3998781669660008, "learning_rate": 2.0710336188129e-06, "loss": 0.3626, "step": 1760},
    {"epoch": 0.7314546208039785, "grad_norm": 2.2949968092513697, "learning_rate": 2.0417906789780236e-06, "loss": 0.3556, "step": 1765},
    {"epoch": 0.7335267302113552, "grad_norm": 2.3439568873845222, "learning_rate": 2.0127026019454305e-06, "loss": 0.3753, "step": 1770},
    {"epoch": 0.7355988396187318, "grad_norm": 2.2377793345094896, "learning_rate": 1.9837709104813075e-06, "loss": 0.358, "step": 1775},
    {"epoch": 0.7376709490261085, "grad_norm": 2.2885196165888653, "learning_rate": 1.9549971191650263e-06, "loss": 0.3515, "step": 1780},
    {"epoch": 0.7397430584334853, "grad_norm": 2.558424767409142, "learning_rate": 1.9263827343098596e-06, "loss": 0.3657, "step": 1785},
    {"epoch": 0.741815167840862, "grad_norm": 2.44623978739584, "learning_rate": 1.8979292538841133e-06, "loss": 0.3623, "step": 1790},
    {"epoch": 0.7438872772482387, "grad_norm": 2.2614597017692692, "learning_rate": 1.8696381674327308e-06, "loss": 0.3553, "step": 1795},
    {"epoch": 0.7459593866556155, "grad_norm": 2.2881907635344576, "learning_rate": 1.8415109559992883e-06, "loss": 0.3531, "step": 1800},
    {"epoch": 0.7480314960629921, "grad_norm": 2.463206144892447, "learning_rate": 1.8135490920484832e-06, "loss": 0.3559, "step": 1805},
    {"epoch": 0.7501036054703688, "grad_norm": 2.3074379267386766, "learning_rate": 1.7857540393890337e-06, "loss": 0.3544, "step": 1810},
    {"epoch": 0.7521757148777456, "grad_norm": 2.345951065809859, "learning_rate": 1.7581272530970666e-06, "loss": 0.3495, "step": 1815},
    {"epoch": 0.7542478242851223, "grad_norm": 2.436580121963066, "learning_rate": 1.7306701794399266e-06, "loss": 0.351, "step": 1820},
    {"epoch": 0.756319933692499, "grad_norm": 2.477449269541472, "learning_rate": 1.7033842558004692e-06, "loss": 0.357, "step": 1825},
    {"epoch": 0.7583920430998756, "grad_norm": 2.336059640702188, "learning_rate": 1.6762709106018193e-06, "loss": 0.3566, "step": 1830},
    {"epoch": 0.7604641525072524, "grad_norm": 2.327604564240487, "learning_rate": 1.6493315632325873e-06, "loss": 0.3693, "step": 1835},
    {"epoch": 0.7625362619146291, "grad_norm": 2.4521027562031805, "learning_rate": 1.6225676239725663e-06,
|
"loss": 0.3557, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.7646083713220058, |
|
"grad_norm": 2.3928441636264055, |
|
"learning_rate": 1.5959804939188962e-06, |
|
"loss": 0.35, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.7666804807293826, |
|
"grad_norm": 2.3515743827298263, |
|
"learning_rate": 1.5695715649127347e-06, |
|
"loss": 0.3597, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7687525901367592, |
|
"grad_norm": 2.4659862823505843, |
|
"learning_rate": 1.5433422194663694e-06, |
|
"loss": 0.3544, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.7708246995441359, |
|
"grad_norm": 2.3479456978647684, |
|
"learning_rate": 1.5172938306908624e-06, |
|
"loss": 0.3479, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7728968089515127, |
|
"grad_norm": 2.491588705252418, |
|
"learning_rate": 1.4914277622241596e-06, |
|
"loss": 0.348, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.7749689183588894, |
|
"grad_norm": 2.2957936532852026, |
|
"learning_rate": 1.4657453681597056e-06, |
|
"loss": 0.3535, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.777041027766266, |
|
"grad_norm": 2.2660578633915787, |
|
"learning_rate": 1.440247992975553e-06, |
|
"loss": 0.3503, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.7791131371736427, |
|
"grad_norm": 2.4214309256458755, |
|
"learning_rate": 1.4149369714639856e-06, |
|
"loss": 0.3525, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7811852465810195, |
|
"grad_norm": 2.378191297196036, |
|
"learning_rate": 1.3898136286616364e-06, |
|
"loss": 0.3449, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.7832573559883962, |
|
"grad_norm": 2.369189696994275, |
|
"learning_rate": 1.3648792797801264e-06, |
|
"loss": 0.3411, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.7853294653957729, |
|
"grad_norm": 2.357227748595689, |
|
"learning_rate": 1.3401352301372039e-06, |
|
"loss": 0.3428, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.7874015748031497, |
|
"grad_norm": 2.283274818402107, |
|
"learning_rate": 1.315582775088421e-06, |
|
"loss": 0.3476, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 2.443389423960905, |
|
"learning_rate": 1.2912231999593222e-06, |
|
"loss": 0.3456, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.791545793617903, |
|
"grad_norm": 2.3098901374263785, |
|
"learning_rate": 1.267057779978143e-06, |
|
"loss": 0.3387, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.7936179030252797, |
|
"grad_norm": 2.3606277283639403, |
|
"learning_rate": 1.2430877802090674e-06, |
|
"loss": 0.3505, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.7956900124326565, |
|
"grad_norm": 2.778705490024244, |
|
"learning_rate": 1.2193144554859938e-06, |
|
"loss": 0.334, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7977621218400331, |
|
"grad_norm": 2.3112033884634418, |
|
"learning_rate": 1.195739050346848e-06, |
|
"loss": 0.349, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.7998342312474098, |
|
"grad_norm": 2.3575714411861517, |
|
"learning_rate": 1.172362798968424e-06, |
|
"loss": 0.3449, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.8019063406547866, |
|
"grad_norm": 2.491996861817634, |
|
"learning_rate": 1.1491869251017833e-06, |
|
"loss": 0.3414, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.8039784500621633, |
|
"grad_norm": 2.3991288004159883, |
|
"learning_rate": 1.1262126420081887e-06, |
|
"loss": 0.3457, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.80605055946954, |
|
"grad_norm": 2.1821189723068612, |
|
"learning_rate": 1.103441152395588e-06, |
|
"loss": 0.3283, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.8081226688769167, |
|
"grad_norm": 2.417668328035057, |
|
"learning_rate": 1.0808736483556486e-06, |
|
"loss": 0.3386, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8101947782842934, |
|
"grad_norm": 2.424032614605958, |
|
"learning_rate": 1.0585113113013656e-06, |
|
"loss": 0.3451, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.8122668876916701, |
|
"grad_norm": 2.3278404790134823, |
|
"learning_rate": 1.036355311905194e-06, |
|
"loss": 0.3455, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.8143389970990468, |
|
"grad_norm": 2.3039554290948185, |
|
"learning_rate": 1.0144068100377818e-06, |
|
"loss": 0.3381, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.8164111065064236, |
|
"grad_norm": 2.556306207112663, |
|
"learning_rate": 9.926669547072365e-07, |
|
"loss": 0.3485, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.8184832159138002, |
|
"grad_norm": 2.4582213305283775, |
|
"learning_rate": 9.711368839989904e-07, |
|
"loss": 0.3335, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.8205553253211769, |
|
"grad_norm": 2.3579094295454657, |
|
"learning_rate": 9.498177250162022e-07, |
|
"loss": 0.3346, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.8226274347285537, |
|
"grad_norm": 2.3492342765735934, |
|
"learning_rate": 9.287105938207691e-07, |
|
"loss": 0.3365, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.8246995441359304, |
|
"grad_norm": 2.4946764945886017, |
|
"learning_rate": 9.078165953748936e-07, |
|
"loss": 0.336, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.8267716535433071, |
|
"grad_norm": 2.5266114489992955, |
|
"learning_rate": 8.871368234832378e-07, |
|
"loss": 0.3367, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.8288437629506838, |
|
"grad_norm": 2.473228872985738, |
|
"learning_rate": 8.66672360735668e-07, |
|
"loss": 0.3247, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8309158723580605, |
|
"grad_norm": 2.393007820983065, |
|
"learning_rate": 8.4642427845057e-07, |
|
"loss": 0.3289, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.8329879817654372, |
|
"grad_norm": 2.492764210538724, |
|
"learning_rate": 8.263936366187825e-07, |
|
"loss": 0.3325, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.8350600911728139, |
|
"grad_norm": 2.4227393176245418, |
|
"learning_rate": 8.065814838480879e-07, |
|
"loss": 0.3288, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.8371322005801907, |
|
"grad_norm": 2.3030572780595366, |
|
"learning_rate": 7.869888573083295e-07, |
|
"loss": 0.3401, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.8392043099875673, |
|
"grad_norm": 2.3219953415669528, |
|
"learning_rate": 7.676167826771125e-07, |
|
"loss": 0.331, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.841276419394944, |
|
"grad_norm": 2.273400790612548, |
|
"learning_rate": 7.484662740861093e-07, |
|
"loss": 0.3383, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.8433485288023208, |
|
"grad_norm": 2.374061200093819, |
|
"learning_rate": 7.295383340679668e-07, |
|
"loss": 0.3343, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.8454206382096975, |
|
"grad_norm": 2.377090176007411, |
|
"learning_rate": 7.108339535038278e-07, |
|
"loss": 0.3298, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.8474927476170742, |
|
"grad_norm": 2.3279278715664016, |
|
"learning_rate": 6.923541115714577e-07, |
|
"loss": 0.3319, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.8495648570244508, |
|
"grad_norm": 2.36303398687802, |
|
"learning_rate": 6.740997756939826e-07, |
|
"loss": 0.3418, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.8516369664318276, |
|
"grad_norm": 2.415624778572703, |
|
"learning_rate": 6.560719014892425e-07, |
|
"loss": 0.3328, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.8537090758392043, |
|
"grad_norm": 2.287618281117491, |
|
"learning_rate": 6.382714327197703e-07, |
|
"loss": 0.3321, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.855781185246581, |
|
"grad_norm": 2.2595148250907036, |
|
"learning_rate": 6.206993012433815e-07, |
|
"loss": 0.3336, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.8578532946539578, |
|
"grad_norm": 2.3926760910481186, |
|
"learning_rate": 6.033564269643927e-07, |
|
"loss": 0.3342, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.8599254040613344, |
|
"grad_norm": 2.316219130405351, |
|
"learning_rate": 5.862437177854629e-07, |
|
"loss": 0.3311, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.8619975134687111, |
|
"grad_norm": 2.3766431440321707, |
|
"learning_rate": 5.693620695600671e-07, |
|
"loss": 0.3153, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.8640696228760879, |
|
"grad_norm": 2.1660555465277405, |
|
"learning_rate": 5.527123660455968e-07, |
|
"loss": 0.3268, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.8661417322834646, |
|
"grad_norm": 2.3890878403617815, |
|
"learning_rate": 5.362954788570929e-07, |
|
"loss": 0.317, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.8682138416908413, |
|
"grad_norm": 2.3910404086765515, |
|
"learning_rate": 5.201122674216208e-07, |
|
"loss": 0.329, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.8702859510982179, |
|
"grad_norm": 2.398893233495637, |
|
"learning_rate": 5.041635789332783e-07, |
|
"loss": 0.3306, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8723580605055947, |
|
"grad_norm": 2.5664259292956495, |
|
"learning_rate": 4.884502483088421e-07, |
|
"loss": 0.3153, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.8744301699129714, |
|
"grad_norm": 2.4313343193724073, |
|
"learning_rate": 4.7297309814406113e-07, |
|
"loss": 0.3286, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.8765022793203481, |
|
"grad_norm": 2.3267523640919374, |
|
"learning_rate": 4.577329386705942e-07, |
|
"loss": 0.3321, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.8785743887277249, |
|
"grad_norm": 2.37298005463995, |
|
"learning_rate": 4.42730567713594e-07, |
|
"loss": 0.3303, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.8806464981351015, |
|
"grad_norm": 2.4400767630128084, |
|
"learning_rate": 4.2796677064994243e-07, |
|
"loss": 0.3187, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.8827186075424782, |
|
"grad_norm": 2.600660724910327, |
|
"learning_rate": 4.134423203671295e-07, |
|
"loss": 0.3347, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.884790716949855, |
|
"grad_norm": 2.416615232949827, |
|
"learning_rate": 3.9915797722280323e-07, |
|
"loss": 0.3317, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.8868628263572317, |
|
"grad_norm": 2.3982415986507344, |
|
"learning_rate": 3.851144890049535e-07, |
|
"loss": 0.326, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.8889349357646084, |
|
"grad_norm": 2.3596784596238445, |
|
"learning_rate": 3.713125908927728e-07, |
|
"loss": 0.3274, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.891007045171985, |
|
"grad_norm": 2.395731949083915, |
|
"learning_rate": 3.577530054181677e-07, |
|
"loss": 0.3337, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.8930791545793618, |
|
"grad_norm": 2.4132993194541363, |
|
"learning_rate": 3.4443644242793226e-07, |
|
"loss": 0.3205, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.8951512639867385, |
|
"grad_norm": 2.421557012145629, |
|
"learning_rate": 3.313635990465902e-07, |
|
"loss": 0.3286, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.8972233733941152, |
|
"grad_norm": 2.3514364773273053, |
|
"learning_rate": 3.1853515963989613e-07, |
|
"loss": 0.3244, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.899295482801492, |
|
"grad_norm": 2.3631578697068156, |
|
"learning_rate": 3.059517957790165e-07, |
|
"loss": 0.3224, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.9013675922088686, |
|
"grad_norm": 2.3165934226078693, |
|
"learning_rate": 2.936141662053621e-07, |
|
"loss": 0.3189, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.9034397016162453, |
|
"grad_norm": 2.354830595345679, |
|
"learning_rate": 2.8152291679611254e-07, |
|
"loss": 0.3164, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.905511811023622, |
|
"grad_norm": 2.4591886130861984, |
|
"learning_rate": 2.6967868053039916e-07, |
|
"loss": 0.3194, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.9075839204309988, |
|
"grad_norm": 2.4651307442780515, |
|
"learning_rate": 2.580820774561704e-07, |
|
"loss": 0.3329, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.9096560298383755, |
|
"grad_norm": 2.356380428139328, |
|
"learning_rate": 2.467337146577298e-07, |
|
"loss": 0.3125, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.9117281392457521, |
|
"grad_norm": 2.4718310107961923, |
|
"learning_rate": 2.3563418622395863e-07, |
|
"loss": 0.3267, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9138002486531289, |
|
"grad_norm": 2.2943822251497834, |
|
"learning_rate": 2.2478407321721295e-07, |
|
"loss": 0.3124, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.9158723580605056, |
|
"grad_norm": 2.37802636947732, |
|
"learning_rate": 2.141839436429055e-07, |
|
"loss": 0.3261, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.9179444674678823, |
|
"grad_norm": 2.4929237268669504, |
|
"learning_rate": 2.038343524197689e-07, |
|
"loss": 0.3266, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.9200165768752591, |
|
"grad_norm": 2.4497686225084645, |
|
"learning_rate": 1.9373584135080893e-07, |
|
"loss": 0.3155, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.9220886862826357, |
|
"grad_norm": 2.3085410601255063, |
|
"learning_rate": 1.8388893909493776e-07, |
|
"loss": 0.3143, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.9241607956900124, |
|
"grad_norm": 2.516522380423302, |
|
"learning_rate": 1.742941611393012e-07, |
|
"loss": 0.3282, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.9262329050973891, |
|
"grad_norm": 2.250941160020719, |
|
"learning_rate": 1.6495200977228897e-07, |
|
"loss": 0.3313, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.9283050145047659, |
|
"grad_norm": 2.243976361659756, |
|
"learning_rate": 1.558629740572465e-07, |
|
"loss": 0.3123, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.9303771239121426, |
|
"grad_norm": 2.411662333101451, |
|
"learning_rate": 1.4702752980686463e-07, |
|
"loss": 0.3227, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.9324492333195192, |
|
"grad_norm": 2.4064884421540254, |
|
"learning_rate": 1.3844613955827536e-07, |
|
"loss": 0.326, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.934521342726896, |
|
"grad_norm": 2.4948851626678326, |
|
"learning_rate": 1.301192525488376e-07, |
|
"loss": 0.3177, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.9365934521342727, |
|
"grad_norm": 2.256448839133944, |
|
"learning_rate": 1.2204730469261905e-07, |
|
"loss": 0.3224, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.9386655615416494, |
|
"grad_norm": 2.386604983970806, |
|
"learning_rate": 1.1423071855757473e-07, |
|
"loss": 0.3177, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.9407376709490262, |
|
"grad_norm": 2.4955559903632087, |
|
"learning_rate": 1.0666990334342708e-07, |
|
"loss": 0.3235, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.9428097803564028, |
|
"grad_norm": 2.4680594179132953, |
|
"learning_rate": 9.936525486024362e-08, |
|
"loss": 0.3234, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.9448818897637795, |
|
"grad_norm": 2.352943986977247, |
|
"learning_rate": 9.23171555077168e-08, |
|
"loss": 0.3189, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.9469539991711562, |
|
"grad_norm": 2.3685608851078106, |
|
"learning_rate": 8.552597425514508e-08, |
|
"loss": 0.3191, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.949026108578533, |
|
"grad_norm": 2.4481455211139944, |
|
"learning_rate": 7.8992066622115e-08, |
|
"loss": 0.3283, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.9510982179859097, |
|
"grad_norm": 2.298587702409287, |
|
"learning_rate": 7.271577465989554e-08, |
|
"loss": 0.3181, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.9531703273932863, |
|
"grad_norm": 2.3601024907505956, |
|
"learning_rate": 6.669742693352522e-08, |
|
"loss": 0.3183, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.9552424368006631, |
|
"grad_norm": 2.3846417205933537, |
|
"learning_rate": 6.093733850461359e-08, |
|
"loss": 0.311, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.9573145462080398, |
|
"grad_norm": 2.5721713736853427, |
|
"learning_rate": 5.5435810914851176e-08, |
|
"loss": 0.3225, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.9593866556154165, |
|
"grad_norm": 2.4391728621326667, |
|
"learning_rate": 5.0193132170219814e-08, |
|
"loss": 0.3219, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.9614587650227931, |
|
"grad_norm": 2.314964895648817, |
|
"learning_rate": 4.5209576725915305e-08, |
|
"loss": 0.3146, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.9635308744301699, |
|
"grad_norm": 2.3953921094993427, |
|
"learning_rate": 4.0485405471983317e-08, |
|
"loss": 0.3099, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.9656029838375466, |
|
"grad_norm": 2.4903270174654977, |
|
"learning_rate": 3.6020865719657015e-08, |
|
"loss": 0.3261, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.9676750932449233, |
|
"grad_norm": 2.5600866933116153, |
|
"learning_rate": 3.181619118841517e-08, |
|
"loss": 0.3207, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.9697472026523001, |
|
"grad_norm": 2.2542713880681466, |
|
"learning_rate": 2.7871601993741947e-08, |
|
"loss": 0.3154, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.9718193120596768, |
|
"grad_norm": 2.3906714544505, |
|
"learning_rate": 2.4187304635608922e-08, |
|
"loss": 0.3264, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.9738914214670534, |
|
"grad_norm": 2.3437139004611396, |
|
"learning_rate": 2.0763491987659812e-08, |
|
"loss": 0.3131, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.9759635308744302, |
|
"grad_norm": 2.3352282804892504, |
|
"learning_rate": 1.7600343287116904e-08, |
|
"loss": 0.3236, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.9780356402818069, |
|
"grad_norm": 2.2788999882928267, |
|
"learning_rate": 1.4698024125396893e-08, |
|
"loss": 0.3165, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.9801077496891836, |
|
"grad_norm": 2.484272820389643, |
|
"learning_rate": 1.205668643944169e-08, |
|
"loss": 0.3265, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.9821798590965602, |
|
"grad_norm": 2.3492923712340357, |
|
"learning_rate": 9.676468503765356e-09, |
|
"loss": 0.3185, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.984251968503937, |
|
"grad_norm": 2.2726204174567806, |
|
"learning_rate": 7.557494923214338e-09, |
|
"loss": 0.3187, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.9863240779113137, |
|
"grad_norm": 2.502123973833959, |
|
"learning_rate": 5.699876626446554e-09, |
|
"loss": 0.315, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.9883961873186904, |
|
"grad_norm": 2.5216195549598046, |
|
"learning_rate": 4.103710860120513e-09, |
|
"loss": 0.3232, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.9904682967260672, |
|
"grad_norm": 2.3006568402247987, |
|
"learning_rate": 2.769081183808253e-09, |
|
"loss": 0.3132, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.9925404061334439, |
|
"grad_norm": 2.3057165957876977, |
|
"learning_rate": 1.69605746561885e-09, |
|
"loss": 0.3099, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.9946125155408205, |
|
"grad_norm": 2.3117976401405853, |
|
"learning_rate": 8.846958785418969e-10, |
|
"loss": 0.3166, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9966846249481973, |
|
"grad_norm": 2.4474241574477613, |
|
"learning_rate": 3.3503889750485794e-10, |
|
"loss": 0.3198, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.998756734355574, |
|
"grad_norm": 2.475055392564647, |
|
"learning_rate": 4.711529715262231e-11, |
|
"loss": 0.3188, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.2978341579437256, |
|
"eval_runtime": 1.1902, |
|
"eval_samples_per_second": 2.521, |
|
"eval_steps_per_second": 0.84, |
|
"step": 2413 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2413, |
|
"total_flos": 252616554577920.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 0.0113, |
|
"train_samples_per_second": 3408953.689, |
|
"train_steps_per_second": 213070.643 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2413, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 252616554577920.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|