{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2413,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004144218814753419,
"grad_norm": 24.514733364424554,
"learning_rate": 4.132231404958678e-08,
"loss": 1.4169,
"step": 1
},
{
"epoch": 0.0020721094073767096,
"grad_norm": 23.537184095238235,
"learning_rate": 2.066115702479339e-07,
"loss": 1.3953,
"step": 5
},
{
"epoch": 0.004144218814753419,
"grad_norm": 15.367263243187544,
"learning_rate": 4.132231404958678e-07,
"loss": 1.3778,
"step": 10
},
{
"epoch": 0.006216328222130129,
"grad_norm": 8.898381617118186,
"learning_rate": 6.198347107438018e-07,
"loss": 1.2602,
"step": 15
},
{
"epoch": 0.008288437629506838,
"grad_norm": 10.642672694622197,
"learning_rate": 8.264462809917356e-07,
"loss": 1.1548,
"step": 20
},
{
"epoch": 0.010360547036883548,
"grad_norm": 4.617567060806481,
"learning_rate": 1.0330578512396695e-06,
"loss": 1.0324,
"step": 25
},
{
"epoch": 0.012432656444260257,
"grad_norm": 3.417020439965166,
"learning_rate": 1.2396694214876035e-06,
"loss": 0.9692,
"step": 30
},
{
"epoch": 0.014504765851636967,
"grad_norm": 3.395692919461883,
"learning_rate": 1.4462809917355372e-06,
"loss": 0.9523,
"step": 35
},
{
"epoch": 0.016576875259013676,
"grad_norm": 3.151546235627503,
"learning_rate": 1.6528925619834712e-06,
"loss": 0.9274,
"step": 40
},
{
"epoch": 0.018648984666390384,
"grad_norm": 2.9582253473643485,
"learning_rate": 1.859504132231405e-06,
"loss": 0.9323,
"step": 45
},
{
"epoch": 0.020721094073767096,
"grad_norm": 3.0341795582664997,
"learning_rate": 2.066115702479339e-06,
"loss": 0.8943,
"step": 50
},
{
"epoch": 0.022793203481143803,
"grad_norm": 3.0341130670598164,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.8975,
"step": 55
},
{
"epoch": 0.024865312888520515,
"grad_norm": 3.1718207048254627,
"learning_rate": 2.479338842975207e-06,
"loss": 0.8814,
"step": 60
},
{
"epoch": 0.026937422295897222,
"grad_norm": 3.0690467751730495,
"learning_rate": 2.6859504132231405e-06,
"loss": 0.8886,
"step": 65
},
{
"epoch": 0.029009531703273934,
"grad_norm": 3.1173422324045084,
"learning_rate": 2.8925619834710743e-06,
"loss": 0.8779,
"step": 70
},
{
"epoch": 0.03108164111065064,
"grad_norm": 3.138742895479937,
"learning_rate": 3.0991735537190086e-06,
"loss": 0.8896,
"step": 75
},
{
"epoch": 0.03315375051802735,
"grad_norm": 3.172114081672577,
"learning_rate": 3.3057851239669424e-06,
"loss": 0.852,
"step": 80
},
{
"epoch": 0.035225859925404064,
"grad_norm": 3.0136111662154126,
"learning_rate": 3.5123966942148763e-06,
"loss": 0.8718,
"step": 85
},
{
"epoch": 0.03729796933278077,
"grad_norm": 3.258761073776976,
"learning_rate": 3.71900826446281e-06,
"loss": 0.8515,
"step": 90
},
{
"epoch": 0.03937007874015748,
"grad_norm": 3.1414312111507323,
"learning_rate": 3.925619834710744e-06,
"loss": 0.8653,
"step": 95
},
{
"epoch": 0.04144218814753419,
"grad_norm": 3.207392945640886,
"learning_rate": 4.132231404958678e-06,
"loss": 0.8688,
"step": 100
},
{
"epoch": 0.0435142975549109,
"grad_norm": 3.2052306999586695,
"learning_rate": 4.338842975206612e-06,
"loss": 0.8662,
"step": 105
},
{
"epoch": 0.04558640696228761,
"grad_norm": 3.061390265998909,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.8397,
"step": 110
},
{
"epoch": 0.04765851636966432,
"grad_norm": 3.0154741328483023,
"learning_rate": 4.75206611570248e-06,
"loss": 0.8583,
"step": 115
},
{
"epoch": 0.04973062577704103,
"grad_norm": 2.9687539148068796,
"learning_rate": 4.958677685950414e-06,
"loss": 0.8542,
"step": 120
},
{
"epoch": 0.05180273518441774,
"grad_norm": 3.0956570078052925,
"learning_rate": 5.165289256198347e-06,
"loss": 0.8343,
"step": 125
},
{
"epoch": 0.053874844591794445,
"grad_norm": 2.939810124211356,
"learning_rate": 5.371900826446281e-06,
"loss": 0.8457,
"step": 130
},
{
"epoch": 0.055946953999171156,
"grad_norm": 3.2352916246928203,
"learning_rate": 5.578512396694216e-06,
"loss": 0.8169,
"step": 135
},
{
"epoch": 0.05801906340654787,
"grad_norm": 3.1434079053605273,
"learning_rate": 5.785123966942149e-06,
"loss": 0.8257,
"step": 140
},
{
"epoch": 0.06009117281392457,
"grad_norm": 3.1532690460920803,
"learning_rate": 5.991735537190083e-06,
"loss": 0.8392,
"step": 145
},
{
"epoch": 0.06216328222130128,
"grad_norm": 3.0343466546862037,
"learning_rate": 6.198347107438017e-06,
"loss": 0.8442,
"step": 150
},
{
"epoch": 0.06423539162867799,
"grad_norm": 2.9508007571282318,
"learning_rate": 6.404958677685951e-06,
"loss": 0.8218,
"step": 155
},
{
"epoch": 0.0663075010360547,
"grad_norm": 3.0527743250572033,
"learning_rate": 6.611570247933885e-06,
"loss": 0.8294,
"step": 160
},
{
"epoch": 0.06837961044343141,
"grad_norm": 3.075132609753167,
"learning_rate": 6.818181818181818e-06,
"loss": 0.8266,
"step": 165
},
{
"epoch": 0.07045171985080813,
"grad_norm": 2.972687092057336,
"learning_rate": 7.0247933884297525e-06,
"loss": 0.8139,
"step": 170
},
{
"epoch": 0.07252382925818483,
"grad_norm": 3.068027468513739,
"learning_rate": 7.231404958677687e-06,
"loss": 0.7992,
"step": 175
},
{
"epoch": 0.07459593866556154,
"grad_norm": 3.158357724987422,
"learning_rate": 7.43801652892562e-06,
"loss": 0.8134,
"step": 180
},
{
"epoch": 0.07666804807293826,
"grad_norm": 3.051134008290911,
"learning_rate": 7.644628099173555e-06,
"loss": 0.8344,
"step": 185
},
{
"epoch": 0.07874015748031496,
"grad_norm": 3.3235580452981175,
"learning_rate": 7.851239669421489e-06,
"loss": 0.8041,
"step": 190
},
{
"epoch": 0.08081226688769166,
"grad_norm": 3.0678026193628805,
"learning_rate": 8.057851239669421e-06,
"loss": 0.8045,
"step": 195
},
{
"epoch": 0.08288437629506838,
"grad_norm": 3.415940865122294,
"learning_rate": 8.264462809917356e-06,
"loss": 0.7953,
"step": 200
},
{
"epoch": 0.08495648570244509,
"grad_norm": 3.815817395153579,
"learning_rate": 8.47107438016529e-06,
"loss": 0.8047,
"step": 205
},
{
"epoch": 0.0870285951098218,
"grad_norm": 3.2984513943211153,
"learning_rate": 8.677685950413224e-06,
"loss": 0.8056,
"step": 210
},
{
"epoch": 0.08910070451719851,
"grad_norm": 2.8880894973116793,
"learning_rate": 8.884297520661158e-06,
"loss": 0.795,
"step": 215
},
{
"epoch": 0.09117281392457521,
"grad_norm": 2.974477767781155,
"learning_rate": 9.090909090909091e-06,
"loss": 0.8014,
"step": 220
},
{
"epoch": 0.09324492333195193,
"grad_norm": 3.0809082608070444,
"learning_rate": 9.297520661157025e-06,
"loss": 0.812,
"step": 225
},
{
"epoch": 0.09531703273932864,
"grad_norm": 3.11471679186006,
"learning_rate": 9.50413223140496e-06,
"loss": 0.815,
"step": 230
},
{
"epoch": 0.09738914214670534,
"grad_norm": 3.3405073257172333,
"learning_rate": 9.710743801652894e-06,
"loss": 0.7991,
"step": 235
},
{
"epoch": 0.09946125155408206,
"grad_norm": 3.1375906477529485,
"learning_rate": 9.917355371900828e-06,
"loss": 0.7902,
"step": 240
},
{
"epoch": 0.10153336096145876,
"grad_norm": 2.8343248219683823,
"learning_rate": 9.999952884702848e-06,
"loss": 0.8215,
"step": 245
},
{
"epoch": 0.10360547036883548,
"grad_norm": 2.9925055543397563,
"learning_rate": 9.999664961102495e-06,
"loss": 0.8084,
"step": 250
},
{
"epoch": 0.10567757977621219,
"grad_norm": 3.0441664134730275,
"learning_rate": 9.999115304121459e-06,
"loss": 0.8085,
"step": 255
},
{
"epoch": 0.10774968918358889,
"grad_norm": 3.039997210231939,
"learning_rate": 9.998303942534383e-06,
"loss": 0.7938,
"step": 260
},
{
"epoch": 0.10982179859096561,
"grad_norm": 3.0431241796795696,
"learning_rate": 9.997230918816193e-06,
"loss": 0.7985,
"step": 265
},
{
"epoch": 0.11189390799834231,
"grad_norm": 2.85027037020918,
"learning_rate": 9.99589628913988e-06,
"loss": 0.7861,
"step": 270
},
{
"epoch": 0.11396601740571902,
"grad_norm": 3.051497861941328,
"learning_rate": 9.994300123373554e-06,
"loss": 0.7716,
"step": 275
},
{
"epoch": 0.11603812681309573,
"grad_norm": 2.895125032094719,
"learning_rate": 9.992442505076788e-06,
"loss": 0.7834,
"step": 280
},
{
"epoch": 0.11811023622047244,
"grad_norm": 3.0061812704069886,
"learning_rate": 9.990323531496235e-06,
"loss": 0.7756,
"step": 285
},
{
"epoch": 0.12018234562784914,
"grad_norm": 3.0347710812056525,
"learning_rate": 9.98794331356056e-06,
"loss": 0.7846,
"step": 290
},
{
"epoch": 0.12225445503522586,
"grad_norm": 2.805184541271578,
"learning_rate": 9.985301975874604e-06,
"loss": 0.7731,
"step": 295
},
{
"epoch": 0.12432656444260257,
"grad_norm": 3.0842132440120147,
"learning_rate": 9.982399656712884e-06,
"loss": 0.8042,
"step": 300
},
{
"epoch": 0.12639867384997927,
"grad_norm": 2.9008174183880233,
"learning_rate": 9.979236508012341e-06,
"loss": 0.7681,
"step": 305
},
{
"epoch": 0.12847078325735597,
"grad_norm": 2.8236286325458244,
"learning_rate": 9.975812695364391e-06,
"loss": 0.7891,
"step": 310
},
{
"epoch": 0.1305428926647327,
"grad_norm": 2.8468044382747384,
"learning_rate": 9.97212839800626e-06,
"loss": 0.7681,
"step": 315
},
{
"epoch": 0.1326150020721094,
"grad_norm": 2.9534507280362003,
"learning_rate": 9.968183808811586e-06,
"loss": 0.7564,
"step": 320
},
{
"epoch": 0.13468711147948612,
"grad_norm": 2.965082743686242,
"learning_rate": 9.963979134280344e-06,
"loss": 0.7529,
"step": 325
},
{
"epoch": 0.13675922088686282,
"grad_norm": 2.847254353400155,
"learning_rate": 9.959514594528018e-06,
"loss": 0.7438,
"step": 330
},
{
"epoch": 0.13883133029423952,
"grad_norm": 2.7966052001739206,
"learning_rate": 9.954790423274086e-06,
"loss": 0.7591,
"step": 335
},
{
"epoch": 0.14090343970161626,
"grad_norm": 2.7319025763391576,
"learning_rate": 9.94980686782978e-06,
"loss": 0.7406,
"step": 340
},
{
"epoch": 0.14297554910899296,
"grad_norm": 2.870893839424329,
"learning_rate": 9.94456418908515e-06,
"loss": 0.7541,
"step": 345
},
{
"epoch": 0.14504765851636967,
"grad_norm": 3.009522094390881,
"learning_rate": 9.939062661495387e-06,
"loss": 0.7511,
"step": 350
},
{
"epoch": 0.14711976792374637,
"grad_norm": 2.7182810127315573,
"learning_rate": 9.933302573066477e-06,
"loss": 0.7688,
"step": 355
},
{
"epoch": 0.14919187733112307,
"grad_norm": 3.012302144551778,
"learning_rate": 9.927284225340105e-06,
"loss": 0.7341,
"step": 360
},
{
"epoch": 0.1512639867384998,
"grad_norm": 3.337745192726695,
"learning_rate": 9.921007933377886e-06,
"loss": 0.7539,
"step": 365
},
{
"epoch": 0.1533360961458765,
"grad_norm": 3.17882772460756,
"learning_rate": 9.914474025744855e-06,
"loss": 0.7506,
"step": 370
},
{
"epoch": 0.15540820555325321,
"grad_norm": 3.2860227088727383,
"learning_rate": 9.907682844492283e-06,
"loss": 0.7514,
"step": 375
},
{
"epoch": 0.15748031496062992,
"grad_norm": 3.1971805376225517,
"learning_rate": 9.900634745139759e-06,
"loss": 0.7475,
"step": 380
},
{
"epoch": 0.15955242436800662,
"grad_norm": 2.8028862036100843,
"learning_rate": 9.893330096656576e-06,
"loss": 0.7285,
"step": 385
},
{
"epoch": 0.16162453377538333,
"grad_norm": 2.708278578400365,
"learning_rate": 9.885769281442426e-06,
"loss": 0.7224,
"step": 390
},
{
"epoch": 0.16369664318276006,
"grad_norm": 2.771966186486157,
"learning_rate": 9.877952695307382e-06,
"loss": 0.7287,
"step": 395
},
{
"epoch": 0.16576875259013676,
"grad_norm": 2.645488728201908,
"learning_rate": 9.869880747451164e-06,
"loss": 0.7389,
"step": 400
},
{
"epoch": 0.16784086199751347,
"grad_norm": 2.9190913357286306,
"learning_rate": 9.861553860441726e-06,
"loss": 0.7414,
"step": 405
},
{
"epoch": 0.16991297140489017,
"grad_norm": 2.6920496051405998,
"learning_rate": 9.852972470193136e-06,
"loss": 0.7259,
"step": 410
},
{
"epoch": 0.17198508081226688,
"grad_norm": 2.9415197302471663,
"learning_rate": 9.844137025942755e-06,
"loss": 0.7266,
"step": 415
},
{
"epoch": 0.1740571902196436,
"grad_norm": 2.7808373109476294,
"learning_rate": 9.835047990227713e-06,
"loss": 0.7119,
"step": 420
},
{
"epoch": 0.1761292996270203,
"grad_norm": 2.810707277824952,
"learning_rate": 9.825705838860699e-06,
"loss": 0.7361,
"step": 425
},
{
"epoch": 0.17820140903439702,
"grad_norm": 3.0525714644872117,
"learning_rate": 9.816111060905063e-06,
"loss": 0.7146,
"step": 430
},
{
"epoch": 0.18027351844177372,
"grad_norm": 2.769816103147695,
"learning_rate": 9.806264158649193e-06,
"loss": 0.7104,
"step": 435
},
{
"epoch": 0.18234562784915043,
"grad_norm": 2.8698277588389516,
"learning_rate": 9.796165647580233e-06,
"loss": 0.7015,
"step": 440
},
{
"epoch": 0.18441773725652713,
"grad_norm": 2.632541484736724,
"learning_rate": 9.785816056357096e-06,
"loss": 0.7148,
"step": 445
},
{
"epoch": 0.18648984666390386,
"grad_norm": 2.6937205983501276,
"learning_rate": 9.775215926782788e-06,
"loss": 0.7203,
"step": 450
},
{
"epoch": 0.18856195607128057,
"grad_norm": 2.7810565830560354,
"learning_rate": 9.764365813776042e-06,
"loss": 0.7068,
"step": 455
},
{
"epoch": 0.19063406547865727,
"grad_norm": 2.8012616966262134,
"learning_rate": 9.753266285342271e-06,
"loss": 0.7104,
"step": 460
},
{
"epoch": 0.19270617488603398,
"grad_norm": 2.970139751785694,
"learning_rate": 9.741917922543831e-06,
"loss": 0.6881,
"step": 465
},
{
"epoch": 0.19477828429341068,
"grad_norm": 2.6941473339327824,
"learning_rate": 9.7303213194696e-06,
"loss": 0.6996,
"step": 470
},
{
"epoch": 0.1968503937007874,
"grad_norm": 3.0409394398260554,
"learning_rate": 9.718477083203888e-06,
"loss": 0.6933,
"step": 475
},
{
"epoch": 0.19892250310816412,
"grad_norm": 2.8048791766929146,
"learning_rate": 9.706385833794639e-06,
"loss": 0.6748,
"step": 480
},
{
"epoch": 0.20099461251554082,
"grad_norm": 2.7366882872916887,
"learning_rate": 9.694048204220986e-06,
"loss": 0.7044,
"step": 485
},
{
"epoch": 0.20306672192291753,
"grad_norm": 2.7267358802595374,
"learning_rate": 9.681464840360105e-06,
"loss": 0.6912,
"step": 490
},
{
"epoch": 0.20513883133029423,
"grad_norm": 2.754099178327694,
"learning_rate": 9.668636400953411e-06,
"loss": 0.6731,
"step": 495
},
{
"epoch": 0.20721094073767096,
"grad_norm": 3.11521121490046,
"learning_rate": 9.655563557572068e-06,
"loss": 0.7018,
"step": 500
},
{
"epoch": 0.20928305014504767,
"grad_norm": 2.729648091432038,
"learning_rate": 9.642246994581833e-06,
"loss": 0.6919,
"step": 505
},
{
"epoch": 0.21135515955242437,
"grad_norm": 2.788565650588865,
"learning_rate": 9.62868740910723e-06,
"loss": 0.6718,
"step": 510
},
{
"epoch": 0.21342726895980108,
"grad_norm": 2.6821583529570527,
"learning_rate": 9.614885510995047e-06,
"loss": 0.6696,
"step": 515
},
{
"epoch": 0.21549937836717778,
"grad_norm": 2.669430236726845,
"learning_rate": 9.600842022777198e-06,
"loss": 0.686,
"step": 520
},
{
"epoch": 0.21757148777455448,
"grad_norm": 2.6329122210908613,
"learning_rate": 9.58655767963287e-06,
"loss": 0.6649,
"step": 525
},
{
"epoch": 0.21964359718193122,
"grad_norm": 2.712511744397499,
"learning_rate": 9.57203322935006e-06,
"loss": 0.6691,
"step": 530
},
{
"epoch": 0.22171570658930792,
"grad_norm": 2.971121715072785,
"learning_rate": 9.557269432286406e-06,
"loss": 0.6568,
"step": 535
},
{
"epoch": 0.22378781599668462,
"grad_norm": 3.5850165463286587,
"learning_rate": 9.542267061329407e-06,
"loss": 0.6535,
"step": 540
},
{
"epoch": 0.22585992540406133,
"grad_norm": 2.6756133538389593,
"learning_rate": 9.52702690185594e-06,
"loss": 0.6578,
"step": 545
},
{
"epoch": 0.22793203481143803,
"grad_norm": 2.7169962232785947,
"learning_rate": 9.511549751691159e-06,
"loss": 0.6696,
"step": 550
},
{
"epoch": 0.23000414421881477,
"grad_norm": 2.66297533869507,
"learning_rate": 9.495836421066722e-06,
"loss": 0.6594,
"step": 555
},
{
"epoch": 0.23207625362619147,
"grad_norm": 2.974286886191662,
"learning_rate": 9.47988773257838e-06,
"loss": 0.6784,
"step": 560
},
{
"epoch": 0.23414836303356817,
"grad_norm": 2.7185575923361243,
"learning_rate": 9.46370452114291e-06,
"loss": 0.658,
"step": 565
},
{
"epoch": 0.23622047244094488,
"grad_norm": 2.829971742294898,
"learning_rate": 9.447287633954406e-06,
"loss": 0.6593,
"step": 570
},
{
"epoch": 0.23829258184832158,
"grad_norm": 2.620399538248119,
"learning_rate": 9.430637930439933e-06,
"loss": 0.6641,
"step": 575
},
{
"epoch": 0.2403646912556983,
"grad_norm": 2.8089738469657286,
"learning_rate": 9.413756282214538e-06,
"loss": 0.6443,
"step": 580
},
{
"epoch": 0.24243680066307502,
"grad_norm": 2.6610020148736657,
"learning_rate": 9.396643573035609e-06,
"loss": 0.6619,
"step": 585
},
{
"epoch": 0.24450891007045172,
"grad_norm": 2.693009848715325,
"learning_rate": 9.37930069875662e-06,
"loss": 0.6469,
"step": 590
},
{
"epoch": 0.24658101947782843,
"grad_norm": 2.8338530690883657,
"learning_rate": 9.36172856728023e-06,
"loss": 0.6571,
"step": 595
},
{
"epoch": 0.24865312888520513,
"grad_norm": 2.676634556349702,
"learning_rate": 9.343928098510759e-06,
"loss": 0.6358,
"step": 600
},
{
"epoch": 0.25072523829258186,
"grad_norm": 2.6815297814713674,
"learning_rate": 9.325900224306019e-06,
"loss": 0.6366,
"step": 605
},
{
"epoch": 0.25279734769995854,
"grad_norm": 2.693794960810057,
"learning_rate": 9.307645888428542e-06,
"loss": 0.6441,
"step": 610
},
{
"epoch": 0.2548694571073353,
"grad_norm": 2.6733329911215757,
"learning_rate": 9.289166046496172e-06,
"loss": 0.6284,
"step": 615
},
{
"epoch": 0.25694156651471195,
"grad_norm": 2.735971515551987,
"learning_rate": 9.270461665932035e-06,
"loss": 0.6394,
"step": 620
},
{
"epoch": 0.2590136759220887,
"grad_norm": 2.7764753108215623,
"learning_rate": 9.251533725913893e-06,
"loss": 0.6308,
"step": 625
},
{
"epoch": 0.2610857853294654,
"grad_norm": 2.9846661778059596,
"learning_rate": 9.23238321732289e-06,
"loss": 0.6381,
"step": 630
},
{
"epoch": 0.2631578947368421,
"grad_norm": 2.639475488829137,
"learning_rate": 9.213011142691672e-06,
"loss": 0.6298,
"step": 635
},
{
"epoch": 0.2652300041442188,
"grad_norm": 2.8031268295287037,
"learning_rate": 9.193418516151913e-06,
"loss": 0.6314,
"step": 640
},
{
"epoch": 0.2673021135515955,
"grad_norm": 2.751071861402199,
"learning_rate": 9.173606363381218e-06,
"loss": 0.6243,
"step": 645
},
{
"epoch": 0.26937422295897223,
"grad_norm": 2.8637192589002507,
"learning_rate": 9.15357572154943e-06,
"loss": 0.6226,
"step": 650
},
{
"epoch": 0.27144633236634896,
"grad_norm": 2.757948609951417,
"learning_rate": 9.133327639264334e-06,
"loss": 0.6195,
"step": 655
},
{
"epoch": 0.27351844177372564,
"grad_norm": 2.7078393676429724,
"learning_rate": 9.112863176516761e-06,
"loss": 0.6063,
"step": 660
},
{
"epoch": 0.2755905511811024,
"grad_norm": 2.7174885956763517,
"learning_rate": 9.092183404625107e-06,
"loss": 0.6201,
"step": 665
},
{
"epoch": 0.27766266058847905,
"grad_norm": 2.692439395697707,
"learning_rate": 9.071289406179233e-06,
"loss": 0.6186,
"step": 670
},
{
"epoch": 0.2797347699958558,
"grad_norm": 2.705577759760543,
"learning_rate": 9.0501822749838e-06,
"loss": 0.6208,
"step": 675
},
{
"epoch": 0.2818068794032325,
"grad_norm": 2.6961720306129497,
"learning_rate": 9.028863116001013e-06,
"loss": 0.6217,
"step": 680
},
{
"epoch": 0.2838789888106092,
"grad_norm": 2.823153530451939,
"learning_rate": 9.007333045292764e-06,
"loss": 0.6095,
"step": 685
},
{
"epoch": 0.2859510982179859,
"grad_norm": 2.7424489434092885,
"learning_rate": 8.98559318996222e-06,
"loss": 0.6071,
"step": 690
},
{
"epoch": 0.2880232076253626,
"grad_norm": 2.8108371792200724,
"learning_rate": 8.963644688094806e-06,
"loss": 0.6123,
"step": 695
},
{
"epoch": 0.29009531703273933,
"grad_norm": 2.7296570733868046,
"learning_rate": 8.941488688698635e-06,
"loss": 0.6038,
"step": 700
},
{
"epoch": 0.29216742644011606,
"grad_norm": 2.5282731359900983,
"learning_rate": 8.919126351644351e-06,
"loss": 0.6051,
"step": 705
},
{
"epoch": 0.29423953584749274,
"grad_norm": 2.733801097585659,
"learning_rate": 8.896558847604414e-06,
"loss": 0.6169,
"step": 710
},
{
"epoch": 0.29631164525486947,
"grad_norm": 2.5922131116520433,
"learning_rate": 8.873787357991811e-06,
"loss": 0.6062,
"step": 715
},
{
"epoch": 0.29838375466224615,
"grad_norm": 2.6565071641175844,
"learning_rate": 8.850813074898218e-06,
"loss": 0.6069,
"step": 720
},
{
"epoch": 0.3004558640696229,
"grad_norm": 3.0927970126548154,
"learning_rate": 8.827637201031579e-06,
"loss": 0.5879,
"step": 725
},
{
"epoch": 0.3025279734769996,
"grad_norm": 2.8105831813533033,
"learning_rate": 8.804260949653154e-06,
"loss": 0.6124,
"step": 730
},
{
"epoch": 0.3046000828843763,
"grad_norm": 2.584377915155431,
"learning_rate": 8.780685544514006e-06,
"loss": 0.6073,
"step": 735
},
{
"epoch": 0.306672192291753,
"grad_norm": 2.773926231343915,
"learning_rate": 8.756912219790933e-06,
"loss": 0.5999,
"step": 740
},
{
"epoch": 0.3087443016991297,
"grad_norm": 2.732022843788419,
"learning_rate": 8.732942220021859e-06,
"loss": 0.5762,
"step": 745
},
{
"epoch": 0.31081641110650643,
"grad_norm": 2.595571184744404,
"learning_rate": 8.708776800040679e-06,
"loss": 0.5846,
"step": 750
},
{
"epoch": 0.3128885205138831,
"grad_norm": 2.6942143484830985,
"learning_rate": 8.684417224911579e-06,
"loss": 0.6003,
"step": 755
},
{
"epoch": 0.31496062992125984,
"grad_norm": 2.6835198662003155,
"learning_rate": 8.659864769862797e-06,
"loss": 0.5838,
"step": 760
},
{
"epoch": 0.31703273932863657,
"grad_norm": 2.7778129772748383,
"learning_rate": 8.635120720219877e-06,
"loss": 0.5794,
"step": 765
},
{
"epoch": 0.31910484873601325,
"grad_norm": 2.6022076775767493,
"learning_rate": 8.610186371338364e-06,
"loss": 0.586,
"step": 770
},
{
"epoch": 0.32117695814339,
"grad_norm": 2.7328885442816557,
"learning_rate": 8.585063028536015e-06,
"loss": 0.5987,
"step": 775
},
{
"epoch": 0.32324906755076666,
"grad_norm": 2.635761747619665,
"learning_rate": 8.559752007024449e-06,
"loss": 0.5859,
"step": 780
},
{
"epoch": 0.3253211769581434,
"grad_norm": 2.79153427275607,
"learning_rate": 8.534254631840297e-06,
"loss": 0.5976,
"step": 785
},
{
"epoch": 0.3273932863655201,
"grad_norm": 2.656496652025873,
"learning_rate": 8.50857223777584e-06,
"loss": 0.578,
"step": 790
},
{
"epoch": 0.3294653957728968,
"grad_norm": 2.7338530782471655,
"learning_rate": 8.482706169309139e-06,
"loss": 0.5648,
"step": 795
},
{
"epoch": 0.33153750518027353,
"grad_norm": 2.667291889393192,
"learning_rate": 8.456657780533633e-06,
"loss": 0.5641,
"step": 800
},
{
"epoch": 0.3336096145876502,
"grad_norm": 2.9479902385492562,
"learning_rate": 8.430428435087267e-06,
"loss": 0.5665,
"step": 805
},
{
"epoch": 0.33568172399502694,
"grad_norm": 2.9408488784356748,
"learning_rate": 8.404019506081103e-06,
"loss": 0.5834,
"step": 810
},
{
"epoch": 0.33775383340240367,
"grad_norm": 2.707200722617604,
"learning_rate": 8.377432376027437e-06,
"loss": 0.5756,
"step": 815
},
{
"epoch": 0.33982594280978035,
"grad_norm": 2.70828380166259,
"learning_rate": 8.350668436767413e-06,
"loss": 0.5686,
"step": 820
},
{
"epoch": 0.3418980522171571,
"grad_norm": 2.7745931767789886,
"learning_rate": 8.323729089398182e-06,
"loss": 0.5521,
"step": 825
},
{
"epoch": 0.34397016162453375,
"grad_norm": 2.685590822904809,
"learning_rate": 8.296615744199533e-06,
"loss": 0.5707,
"step": 830
},
{
"epoch": 0.3460422710319105,
"grad_norm": 2.6439480580475023,
"learning_rate": 8.269329820560074e-06,
"loss": 0.549,
"step": 835
},
{
"epoch": 0.3481143804392872,
"grad_norm": 2.6684012534208845,
"learning_rate": 8.241872746902934e-06,
"loss": 0.5614,
"step": 840
},
{
"epoch": 0.3501864898466639,
"grad_norm": 2.6586973407105847,
"learning_rate": 8.214245960610966e-06,
"loss": 0.5596,
"step": 845
},
{
"epoch": 0.3522585992540406,
"grad_norm": 2.630842021442866,
"learning_rate": 8.18645090795152e-06,
"loss": 0.5435,
"step": 850
},
{
"epoch": 0.3543307086614173,
"grad_norm": 2.783114608569901,
"learning_rate": 8.158489044000712e-06,
"loss": 0.554,
"step": 855
},
{
"epoch": 0.35640281806879404,
"grad_norm": 2.6861386548441266,
"learning_rate": 8.13036183256727e-06,
"loss": 0.5503,
"step": 860
},
{
"epoch": 0.35847492747617077,
"grad_norm": 2.760451069908695,
"learning_rate": 8.102070746115888e-06,
"loss": 0.5504,
"step": 865
},
{
"epoch": 0.36054703688354744,
"grad_norm": 3.1912807546036155,
"learning_rate": 8.073617265690144e-06,
"loss": 0.5585,
"step": 870
},
{
"epoch": 0.3626191462909242,
"grad_norm": 2.7551688368780365,
"learning_rate": 8.045002880834975e-06,
"loss": 0.5499,
"step": 875
},
{
"epoch": 0.36469125569830085,
"grad_norm": 2.656529615625788,
"learning_rate": 8.016229089518695e-06,
"loss": 0.5472,
"step": 880
},
{
"epoch": 0.3667633651056776,
"grad_norm": 2.456139438562433,
"learning_rate": 7.987297398054572e-06,
"loss": 0.5444,
"step": 885
},
{
"epoch": 0.36883547451305426,
"grad_norm": 2.7021257640939678,
"learning_rate": 7.95820932102198e-06,
"loss": 0.5467,
"step": 890
},
{
"epoch": 0.370907583920431,
"grad_norm": 2.576896456231456,
"learning_rate": 7.9289663811871e-06,
"loss": 0.5453,
"step": 895
},
{
"epoch": 0.3729796933278077,
"grad_norm": 2.509602711427261,
"learning_rate": 7.899570109423219e-06,
"loss": 0.5315,
"step": 900
},
{
"epoch": 0.3750518027351844,
"grad_norm": 2.7824790919014486,
"learning_rate": 7.870022044630569e-06,
"loss": 0.5367,
"step": 905
},
{
"epoch": 0.37712391214256114,
"grad_norm": 2.635566041127458,
"learning_rate": 7.84032373365578e-06,
"loss": 0.5458,
"step": 910
},
{
"epoch": 0.3791960215499378,
"grad_norm": 2.5686297030493277,
"learning_rate": 7.810476731210897e-06,
"loss": 0.5538,
"step": 915
},
{
"epoch": 0.38126813095731454,
"grad_norm": 2.4885943009976366,
"learning_rate": 7.780482599791987e-06,
"loss": 0.5501,
"step": 920
},
{
"epoch": 0.3833402403646913,
"grad_norm": 2.6312255974545704,
"learning_rate": 7.750342909597353e-06,
"loss": 0.5412,
"step": 925
},
{
"epoch": 0.38541234977206795,
"grad_norm": 2.522165622730654,
"learning_rate": 7.72005923844532e-06,
"loss": 0.5313,
"step": 930
},
{
"epoch": 0.3874844591794447,
"grad_norm": 2.6702491559670167,
"learning_rate": 7.689633171691646e-06,
"loss": 0.5345,
"step": 935
},
{
"epoch": 0.38955656858682136,
"grad_norm": 2.6631872386085185,
"learning_rate": 7.659066302146523e-06,
"loss": 0.5452,
"step": 940
},
{
"epoch": 0.3916286779941981,
"grad_norm": 2.5785637853885195,
"learning_rate": 7.628360229991198e-06,
"loss": 0.5288,
"step": 945
},
{
"epoch": 0.3937007874015748,
"grad_norm": 2.6837857193306935,
"learning_rate": 7.597516562694198e-06,
"loss": 0.5306,
"step": 950
},
{
"epoch": 0.3957728968089515,
"grad_norm": 2.550858198260864,
"learning_rate": 7.56653691492718e-06,
"loss": 0.5233,
"step": 955
},
{
"epoch": 0.39784500621632823,
"grad_norm": 2.6115170843406132,
"learning_rate": 7.535422908480408e-06,
"loss": 0.5424,
"step": 960
},
{
"epoch": 0.3999171156237049,
"grad_norm": 2.5494802625123105,
"learning_rate": 7.504176172177842e-06,
"loss": 0.5171,
"step": 965
},
{
"epoch": 0.40198922503108164,
"grad_norm": 2.5129023648194284,
"learning_rate": 7.472798341791877e-06,
"loss": 0.5148,
"step": 970
},
{
"epoch": 0.4040613344384584,
"grad_norm": 2.6166709552589866,
"learning_rate": 7.441291059957709e-06,
"loss": 0.5292,
"step": 975
},
{
"epoch": 0.40613344384583505,
"grad_norm": 2.5745519804152073,
"learning_rate": 7.409655976087338e-06,
"loss": 0.5228,
"step": 980
},
{
"epoch": 0.4082055532532118,
"grad_norm": 2.7301378295783643,
"learning_rate": 7.377894746283227e-06,
"loss": 0.5343,
"step": 985
},
{
"epoch": 0.41027766266058846,
"grad_norm": 2.431218699982397,
"learning_rate": 7.3460090332516e-06,
"loss": 0.508,
"step": 990
},
{
"epoch": 0.4123497720679652,
"grad_norm": 2.556296703014823,
"learning_rate": 7.314000506215402e-06,
"loss": 0.5148,
"step": 995
},
{
"epoch": 0.4144218814753419,
"grad_norm": 2.5152139819517023,
"learning_rate": 7.281870840826912e-06,
"loss": 0.4999,
"step": 1000
},
{
"epoch": 0.4164939908827186,
"grad_norm": 2.786935816025543,
"learning_rate": 7.249621719080026e-06,
"loss": 0.5177,
"step": 1005
},
{
"epoch": 0.41856610029009533,
"grad_norm": 2.5491738667702672,
"learning_rate": 7.217254829222201e-06,
"loss": 0.5114,
"step": 1010
},
{
"epoch": 0.420638209697472,
"grad_norm": 2.50660251762613,
"learning_rate": 7.1847718656660755e-06,
"loss": 0.5156,
"step": 1015
},
{
"epoch": 0.42271031910484874,
"grad_norm": 2.5986079473520878,
"learning_rate": 7.152174528900773e-06,
"loss": 0.4954,
"step": 1020
},
{
"epoch": 0.4247824285122254,
"grad_norm": 2.635934213356069,
"learning_rate": 7.119464525402867e-06,
"loss": 0.504,
"step": 1025
},
{
"epoch": 0.42685453791960215,
"grad_norm": 2.861696123619274,
"learning_rate": 7.08664356754706e-06,
"loss": 0.5063,
"step": 1030
},
{
"epoch": 0.4289266473269789,
"grad_norm": 2.5689042563262223,
"learning_rate": 7.053713373516538e-06,
"loss": 0.5181,
"step": 1035
},
{
"epoch": 0.43099875673435556,
"grad_norm": 2.5132849205491286,
"learning_rate": 7.020675667213015e-06,
"loss": 0.5043,
"step": 1040
},
{
"epoch": 0.4330708661417323,
"grad_norm": 2.704397404776844,
"learning_rate": 6.987532178166496e-06,
"loss": 0.5022,
"step": 1045
},
{
"epoch": 0.43514297554910897,
"grad_norm": 2.4785754646781104,
"learning_rate": 6.9542846414447306e-06,
"loss": 0.5027,
"step": 1050
},
{
"epoch": 0.4372150849564857,
"grad_norm": 2.3912935006226594,
"learning_rate": 6.920934797562385e-06,
"loss": 0.5051,
"step": 1055
},
{
"epoch": 0.43928719436386243,
"grad_norm": 2.3476041200753546,
"learning_rate": 6.887484392389923e-06,
"loss": 0.5043,
"step": 1060
},
{
"epoch": 0.4413593037712391,
"grad_norm": 2.5914055427224465,
"learning_rate": 6.853935177062208e-06,
"loss": 0.4974,
"step": 1065
},
{
"epoch": 0.44343141317861584,
"grad_norm": 2.594951575952019,
"learning_rate": 6.8202889078868395e-06,
"loss": 0.5061,
"step": 1070
},
{
"epoch": 0.4455035225859925,
"grad_norm": 2.6023391478211293,
"learning_rate": 6.786547346252198e-06,
"loss": 0.4963,
"step": 1075
},
{
"epoch": 0.44757563199336925,
"grad_norm": 2.3528096807317995,
"learning_rate": 6.7527122585352435e-06,
"loss": 0.4883,
"step": 1080
},
{
"epoch": 0.449647741400746,
"grad_norm": 2.7398459124724814,
"learning_rate": 6.718785416009044e-06,
"loss": 0.4968,
"step": 1085
},
{
"epoch": 0.45171985080812266,
"grad_norm": 2.838272166911696,
"learning_rate": 6.6847685947500495e-06,
"loss": 0.4915,
"step": 1090
},
{
"epoch": 0.4537919602154994,
"grad_norm": 3.0284763209341623,
"learning_rate": 6.650663575545111e-06,
"loss": 0.4762,
"step": 1095
},
{
"epoch": 0.45586406962287607,
"grad_norm": 2.6967943018540517,
"learning_rate": 6.61647214379826e-06,
"loss": 0.4737,
"step": 1100
},
{
"epoch": 0.4579361790302528,
"grad_norm": 2.551888108931838,
"learning_rate": 6.582196089437241e-06,
"loss": 0.5076,
"step": 1105
},
{
"epoch": 0.46000828843762953,
"grad_norm": 2.514843118690436,
"learning_rate": 6.547837206819804e-06,
"loss": 0.4876,
"step": 1110
},
{
"epoch": 0.4620803978450062,
"grad_norm": 2.4499641238461005,
"learning_rate": 6.513397294639778e-06,
"loss": 0.4785,
"step": 1115
},
{
"epoch": 0.46415250725238294,
"grad_norm": 2.5439805360709378,
"learning_rate": 6.478878155832904e-06,
"loss": 0.4609,
"step": 1120
},
{
"epoch": 0.4662246166597596,
"grad_norm": 2.567465771254044,
"learning_rate": 6.444281597482449e-06,
"loss": 0.4826,
"step": 1125
},
{
"epoch": 0.46829672606713635,
"grad_norm": 2.4194190626785863,
"learning_rate": 6.409609430724607e-06,
"loss": 0.4639,
"step": 1130
},
{
"epoch": 0.4703688354745131,
"grad_norm": 2.4928927116754216,
"learning_rate": 6.3748634706536905e-06,
"loss": 0.4755,
"step": 1135
},
{
"epoch": 0.47244094488188976,
"grad_norm": 2.5328967096855517,
"learning_rate": 6.340045536227101e-06,
"loss": 0.4676,
"step": 1140
},
{
"epoch": 0.4745130542892665,
"grad_norm": 2.580851369974543,
"learning_rate": 6.305157450170112e-06,
"loss": 0.4679,
"step": 1145
},
{
"epoch": 0.47658516369664317,
"grad_norm": 2.771781788352758,
"learning_rate": 6.270201038880451e-06,
"loss": 0.4748,
"step": 1150
},
{
"epoch": 0.4786572731040199,
"grad_norm": 2.3357211287270294,
"learning_rate": 6.235178132332678e-06,
"loss": 0.4733,
"step": 1155
},
{
"epoch": 0.4807293825113966,
"grad_norm": 2.4376737050512993,
"learning_rate": 6.200090563982397e-06,
"loss": 0.4623,
"step": 1160
},
{
"epoch": 0.4828014919187733,
"grad_norm": 2.727521949059151,
"learning_rate": 6.164940170670266e-06,
"loss": 0.4763,
"step": 1165
},
{
"epoch": 0.48487360132615004,
"grad_norm": 2.485190311900293,
"learning_rate": 6.129728792525847e-06,
"loss": 0.4653,
"step": 1170
},
{
"epoch": 0.4869457107335267,
"grad_norm": 2.3795187623979746,
"learning_rate": 6.094458272871259e-06,
"loss": 0.4576,
"step": 1175
},
{
"epoch": 0.48901782014090345,
"grad_norm": 2.43415537867363,
"learning_rate": 6.0591304581247005e-06,
"loss": 0.4606,
"step": 1180
},
{
"epoch": 0.4910899295482801,
"grad_norm": 2.5114902879972374,
"learning_rate": 6.023747197703771e-06,
"loss": 0.4671,
"step": 1185
},
{
"epoch": 0.49316203895565686,
"grad_norm": 2.5122364778542225,
"learning_rate": 5.988310343928665e-06,
"loss": 0.4678,
"step": 1190
},
{
"epoch": 0.4952341483630336,
"grad_norm": 2.6311201696718283,
"learning_rate": 5.9528217519252e-06,
"loss": 0.4653,
"step": 1195
},
{
"epoch": 0.49730625777041026,
"grad_norm": 2.634373162867216,
"learning_rate": 5.9172832795276965e-06,
"loss": 0.4858,
"step": 1200
},
{
"epoch": 0.499378367177787,
"grad_norm": 2.602845367016974,
"learning_rate": 5.881696787181724e-06,
"loss": 0.4646,
"step": 1205
},
{
"epoch": 0.5014504765851637,
"grad_norm": 2.4997645055614544,
"learning_rate": 5.846064137846704e-06,
"loss": 0.4723,
"step": 1210
},
{
"epoch": 0.5035225859925404,
"grad_norm": 2.5259484476620853,
"learning_rate": 5.810387196898387e-06,
"loss": 0.4592,
"step": 1215
},
{
"epoch": 0.5055946953999171,
"grad_norm": 2.460185161862939,
"learning_rate": 5.7746678320311955e-06,
"loss": 0.4563,
"step": 1220
},
{
"epoch": 0.5076668048072939,
"grad_norm": 2.522924782621583,
"learning_rate": 5.738907913160452e-06,
"loss": 0.455,
"step": 1225
},
{
"epoch": 0.5097389142146705,
"grad_norm": 2.450518901210104,
"learning_rate": 5.703109312324493e-06,
"loss": 0.4631,
"step": 1230
},
{
"epoch": 0.5118110236220472,
"grad_norm": 2.512677663120476,
"learning_rate": 5.667273903586656e-06,
"loss": 0.4636,
"step": 1235
},
{
"epoch": 0.5138831330294239,
"grad_norm": 2.508483790573203,
"learning_rate": 5.6314035629371835e-06,
"loss": 0.4494,
"step": 1240
},
{
"epoch": 0.5159552424368007,
"grad_norm": 2.539619573950297,
"learning_rate": 5.595500168195007e-06,
"loss": 0.4657,
"step": 1245
},
{
"epoch": 0.5180273518441774,
"grad_norm": 2.457760045030727,
"learning_rate": 5.5595655989094525e-06,
"loss": 0.4562,
"step": 1250
},
{
"epoch": 0.520099461251554,
"grad_norm": 2.5810506702287546,
"learning_rate": 5.52360173626183e-06,
"loss": 0.4587,
"step": 1255
},
{
"epoch": 0.5221715706589308,
"grad_norm": 2.533803600220524,
"learning_rate": 5.487610462966969e-06,
"loss": 0.4473,
"step": 1260
},
{
"epoch": 0.5242436800663075,
"grad_norm": 2.408501431878237,
"learning_rate": 5.451593663174647e-06,
"loss": 0.4466,
"step": 1265
},
{
"epoch": 0.5263157894736842,
"grad_norm": 2.5248933210116427,
"learning_rate": 5.4155532223709625e-06,
"loss": 0.4427,
"step": 1270
},
{
"epoch": 0.528387898881061,
"grad_norm": 2.5355682898170704,
"learning_rate": 5.379491027279622e-06,
"loss": 0.4624,
"step": 1275
},
{
"epoch": 0.5304600082884376,
"grad_norm": 2.7849780348488156,
"learning_rate": 5.343408965763174e-06,
"loss": 0.4487,
"step": 1280
},
{
"epoch": 0.5325321176958143,
"grad_norm": 2.401434318103116,
"learning_rate": 5.3073089267241805e-06,
"loss": 0.4393,
"step": 1285
},
{
"epoch": 0.534604227103191,
"grad_norm": 2.4324353094703475,
"learning_rate": 5.271192800006325e-06,
"loss": 0.4405,
"step": 1290
},
{
"epoch": 0.5366763365105678,
"grad_norm": 2.5008934086234027,
"learning_rate": 5.235062476295488e-06,
"loss": 0.4206,
"step": 1295
},
{
"epoch": 0.5387484459179445,
"grad_norm": 2.523177697285061,
"learning_rate": 5.198919847020765e-06,
"loss": 0.4378,
"step": 1300
},
{
"epoch": 0.5408205553253211,
"grad_norm": 2.561504860219997,
"learning_rate": 5.162766804255446e-06,
"loss": 0.4369,
"step": 1305
},
{
"epoch": 0.5428926647326979,
"grad_norm": 2.4342748058160457,
"learning_rate": 5.1266052406179755e-06,
"loss": 0.4429,
"step": 1310
},
{
"epoch": 0.5449647741400746,
"grad_norm": 2.5003051526668054,
"learning_rate": 5.090437049172861e-06,
"loss": 0.4479,
"step": 1315
},
{
"epoch": 0.5470368835474513,
"grad_norm": 2.41044724225261,
"learning_rate": 5.054264123331583e-06,
"loss": 0.4348,
"step": 1320
},
{
"epoch": 0.5491089929548281,
"grad_norm": 2.534587174570914,
"learning_rate": 5.018088356753463e-06,
"loss": 0.4346,
"step": 1325
},
{
"epoch": 0.5511811023622047,
"grad_norm": 2.3874656931988953,
"learning_rate": 4.981911643246539e-06,
"loss": 0.4463,
"step": 1330
},
{
"epoch": 0.5532532117695814,
"grad_norm": 2.344775548708239,
"learning_rate": 4.9457358766684175e-06,
"loss": 0.4323,
"step": 1335
},
{
"epoch": 0.5553253211769581,
"grad_norm": 2.394852642174165,
"learning_rate": 4.9095629508271396e-06,
"loss": 0.412,
"step": 1340
},
{
"epoch": 0.5573974305843349,
"grad_norm": 2.4107260444211067,
"learning_rate": 4.873394759382025e-06,
"loss": 0.434,
"step": 1345
},
{
"epoch": 0.5594695399917116,
"grad_norm": 2.39603695241891,
"learning_rate": 4.837233195744556e-06,
"loss": 0.4368,
"step": 1350
},
{
"epoch": 0.5615416493990882,
"grad_norm": 2.484288425740336,
"learning_rate": 4.8010801529792375e-06,
"loss": 0.4247,
"step": 1355
},
{
"epoch": 0.563613758806465,
"grad_norm": 2.491968721281532,
"learning_rate": 4.7649375237045135e-06,
"loss": 0.4239,
"step": 1360
},
{
"epoch": 0.5656858682138417,
"grad_norm": 2.405464453263017,
"learning_rate": 4.728807199993677e-06,
"loss": 0.4156,
"step": 1365
},
{
"epoch": 0.5677579776212184,
"grad_norm": 2.3649504461482405,
"learning_rate": 4.692691073275822e-06,
"loss": 0.4272,
"step": 1370
},
{
"epoch": 0.5698300870285951,
"grad_norm": 2.578097130955108,
"learning_rate": 4.656591034236827e-06,
"loss": 0.4318,
"step": 1375
},
{
"epoch": 0.5719021964359718,
"grad_norm": 2.3565047434019233,
"learning_rate": 4.620508972720379e-06,
"loss": 0.4157,
"step": 1380
},
{
"epoch": 0.5739743058433485,
"grad_norm": 2.4914369771154776,
"learning_rate": 4.584446777629038e-06,
"loss": 0.4131,
"step": 1385
},
{
"epoch": 0.5760464152507252,
"grad_norm": 2.5403254221008016,
"learning_rate": 4.548406336825354e-06,
"loss": 0.4209,
"step": 1390
},
{
"epoch": 0.578118524658102,
"grad_norm": 2.4433869039162763,
"learning_rate": 4.512389537033032e-06,
"loss": 0.4156,
"step": 1395
},
{
"epoch": 0.5801906340654787,
"grad_norm": 2.5532014778576606,
"learning_rate": 4.476398263738171e-06,
"loss": 0.4187,
"step": 1400
},
{
"epoch": 0.5822627434728553,
"grad_norm": 2.422240802637448,
"learning_rate": 4.440434401090549e-06,
"loss": 0.4123,
"step": 1405
},
{
"epoch": 0.5843348528802321,
"grad_norm": 2.327309317585392,
"learning_rate": 4.404499831804993e-06,
"loss": 0.4167,
"step": 1410
},
{
"epoch": 0.5864069622876088,
"grad_norm": 2.337179254235494,
"learning_rate": 4.368596437062819e-06,
"loss": 0.4253,
"step": 1415
},
{
"epoch": 0.5884790716949855,
"grad_norm": 2.6203872163413,
"learning_rate": 4.332726096413346e-06,
"loss": 0.4035,
"step": 1420
},
{
"epoch": 0.5905511811023622,
"grad_norm": 2.45290146849787,
"learning_rate": 4.29689068767551e-06,
"loss": 0.4205,
"step": 1425
},
{
"epoch": 0.5926232905097389,
"grad_norm": 2.442505011249494,
"learning_rate": 4.261092086839549e-06,
"loss": 0.4199,
"step": 1430
},
{
"epoch": 0.5946953999171156,
"grad_norm": 2.469896647219146,
"learning_rate": 4.225332167968808e-06,
"loss": 0.4197,
"step": 1435
},
{
"epoch": 0.5967675093244923,
"grad_norm": 2.525933254390451,
"learning_rate": 4.189612803101614e-06,
"loss": 0.4136,
"step": 1440
},
{
"epoch": 0.5988396187318691,
"grad_norm": 2.3481926369028154,
"learning_rate": 4.153935862153299e-06,
"loss": 0.4098,
"step": 1445
},
{
"epoch": 0.6009117281392458,
"grad_norm": 2.4652012656554336,
"learning_rate": 4.118303212818277e-06,
"loss": 0.4111,
"step": 1450
},
{
"epoch": 0.6029838375466224,
"grad_norm": 2.597755453602896,
"learning_rate": 4.082716720472304e-06,
"loss": 0.4141,
"step": 1455
},
{
"epoch": 0.6050559469539992,
"grad_norm": 2.3332725702199912,
"learning_rate": 4.0471782480748005e-06,
"loss": 0.3886,
"step": 1460
},
{
"epoch": 0.6071280563613759,
"grad_norm": 2.387505345147473,
"learning_rate": 4.011689656071334e-06,
"loss": 0.4167,
"step": 1465
},
{
"epoch": 0.6092001657687526,
"grad_norm": 2.4113537140405388,
"learning_rate": 3.97625280229623e-06,
"loss": 0.4002,
"step": 1470
},
{
"epoch": 0.6112722751761293,
"grad_norm": 2.4554203658965488,
"learning_rate": 3.940869541875301e-06,
"loss": 0.3881,
"step": 1475
},
{
"epoch": 0.613344384583506,
"grad_norm": 2.425914982917716,
"learning_rate": 3.905541727128743e-06,
"loss": 0.4069,
"step": 1480
},
{
"epoch": 0.6154164939908827,
"grad_norm": 2.5231106398066476,
"learning_rate": 3.870271207474154e-06,
"loss": 0.4002,
"step": 1485
},
{
"epoch": 0.6174886033982594,
"grad_norm": 2.4689338483413135,
"learning_rate": 3.8350598293297345e-06,
"loss": 0.4141,
"step": 1490
},
{
"epoch": 0.6195607128056362,
"grad_norm": 2.496046599900669,
"learning_rate": 3.7999094360176036e-06,
"loss": 0.3965,
"step": 1495
},
{
"epoch": 0.6216328222130129,
"grad_norm": 2.3726201269683553,
"learning_rate": 3.7648218676673232e-06,
"loss": 0.4017,
"step": 1500
},
{
"epoch": 0.6237049316203895,
"grad_norm": 2.418446591018178,
"learning_rate": 3.7297989611195504e-06,
"loss": 0.3938,
"step": 1505
},
{
"epoch": 0.6257770410277662,
"grad_norm": 2.4157452764623963,
"learning_rate": 3.694842549829889e-06,
"loss": 0.3871,
"step": 1510
},
{
"epoch": 0.627849150435143,
"grad_norm": 2.4088293626826114,
"learning_rate": 3.659954463772901e-06,
"loss": 0.4002,
"step": 1515
},
{
"epoch": 0.6299212598425197,
"grad_norm": 2.5779980646499543,
"learning_rate": 3.625136529346312e-06,
"loss": 0.4055,
"step": 1520
},
{
"epoch": 0.6319933692498964,
"grad_norm": 2.4110720231898806,
"learning_rate": 3.590390569275395e-06,
"loss": 0.3913,
"step": 1525
},
{
"epoch": 0.6340654786572731,
"grad_norm": 2.3982253161569234,
"learning_rate": 3.555718402517554e-06,
"loss": 0.3962,
"step": 1530
},
{
"epoch": 0.6361375880646498,
"grad_norm": 2.392216618491618,
"learning_rate": 3.521121844167098e-06,
"loss": 0.399,
"step": 1535
},
{
"epoch": 0.6382096974720265,
"grad_norm": 2.3550943907800193,
"learning_rate": 3.486602705360224e-06,
"loss": 0.3927,
"step": 1540
},
{
"epoch": 0.6402818068794033,
"grad_norm": 2.4744483911955153,
"learning_rate": 3.4521627931801976e-06,
"loss": 0.3961,
"step": 1545
},
{
"epoch": 0.64235391628678,
"grad_norm": 2.416762173736933,
"learning_rate": 3.41780391056276e-06,
"loss": 0.3959,
"step": 1550
},
{
"epoch": 0.6444260256941566,
"grad_norm": 2.3621515478388906,
"learning_rate": 3.3835278562017405e-06,
"loss": 0.3889,
"step": 1555
},
{
"epoch": 0.6464981351015333,
"grad_norm": 2.3799558311767917,
"learning_rate": 3.349336424454889e-06,
"loss": 0.395,
"step": 1560
},
{
"epoch": 0.6485702445089101,
"grad_norm": 2.4290023388512143,
"learning_rate": 3.3152314052499513e-06,
"loss": 0.3921,
"step": 1565
},
{
"epoch": 0.6506423539162868,
"grad_norm": 2.386844861650698,
"learning_rate": 3.2812145839909566e-06,
"loss": 0.382,
"step": 1570
},
{
"epoch": 0.6527144633236635,
"grad_norm": 2.58783476073575,
"learning_rate": 3.247287741464758e-06,
"loss": 0.3961,
"step": 1575
},
{
"epoch": 0.6547865727310402,
"grad_norm": 2.4711795095903906,
"learning_rate": 3.2134526537478034e-06,
"loss": 0.403,
"step": 1580
},
{
"epoch": 0.6568586821384169,
"grad_norm": 2.445518124608817,
"learning_rate": 3.1797110921131626e-06,
"loss": 0.3949,
"step": 1585
},
{
"epoch": 0.6589307915457936,
"grad_norm": 2.3302808051599957,
"learning_rate": 3.1460648229377933e-06,
"loss": 0.4003,
"step": 1590
},
{
"epoch": 0.6610029009531704,
"grad_norm": 2.4700969540161086,
"learning_rate": 3.1125156076100804e-06,
"loss": 0.3845,
"step": 1595
},
{
"epoch": 0.6630750103605471,
"grad_norm": 2.2736405570858684,
"learning_rate": 3.0790652024376163e-06,
"loss": 0.3755,
"step": 1600
},
{
"epoch": 0.6651471197679237,
"grad_norm": 2.3793447678069732,
"learning_rate": 3.0457153585552724e-06,
"loss": 0.3764,
"step": 1605
},
{
"epoch": 0.6672192291753004,
"grad_norm": 2.381350136652911,
"learning_rate": 3.012467821833506e-06,
"loss": 0.388,
"step": 1610
},
{
"epoch": 0.6692913385826772,
"grad_norm": 2.440306240554211,
"learning_rate": 2.979324332786987e-06,
"loss": 0.3914,
"step": 1615
},
{
"epoch": 0.6713634479900539,
"grad_norm": 2.473851856313499,
"learning_rate": 2.946286626483463e-06,
"loss": 0.3844,
"step": 1620
},
{
"epoch": 0.6734355573974306,
"grad_norm": 2.266055102817957,
"learning_rate": 2.913356432452942e-06,
"loss": 0.3789,
"step": 1625
},
{
"epoch": 0.6755076668048073,
"grad_norm": 2.4641402988030126,
"learning_rate": 2.8805354745971336e-06,
"loss": 0.37,
"step": 1630
},
{
"epoch": 0.677579776212184,
"grad_norm": 2.3943294283961984,
"learning_rate": 2.847825471099227e-06,
"loss": 0.3777,
"step": 1635
},
{
"epoch": 0.6796518856195607,
"grad_norm": 2.384842311722,
"learning_rate": 2.815228134333925e-06,
"loss": 0.382,
"step": 1640
},
{
"epoch": 0.6817239950269374,
"grad_norm": 2.469151674287745,
"learning_rate": 2.782745170777801e-06,
"loss": 0.3704,
"step": 1645
},
{
"epoch": 0.6837961044343142,
"grad_norm": 2.583143124954603,
"learning_rate": 2.750378280919975e-06,
"loss": 0.3736,
"step": 1650
},
{
"epoch": 0.6858682138416908,
"grad_norm": 2.4321963843177947,
"learning_rate": 2.7181291591730885e-06,
"loss": 0.3782,
"step": 1655
},
{
"epoch": 0.6879403232490675,
"grad_norm": 2.3531730582824975,
"learning_rate": 2.6859994937846002e-06,
"loss": 0.376,
"step": 1660
},
{
"epoch": 0.6900124326564443,
"grad_norm": 2.4367688917720485,
"learning_rate": 2.653990966748401e-06,
"loss": 0.377,
"step": 1665
},
{
"epoch": 0.692084542063821,
"grad_norm": 2.3875592466498605,
"learning_rate": 2.622105253716774e-06,
"loss": 0.3772,
"step": 1670
},
{
"epoch": 0.6941566514711976,
"grad_norm": 2.424175019670591,
"learning_rate": 2.5903440239126633e-06,
"loss": 0.3762,
"step": 1675
},
{
"epoch": 0.6962287608785744,
"grad_norm": 2.337415256255787,
"learning_rate": 2.5587089400422936e-06,
"loss": 0.3906,
"step": 1680
},
{
"epoch": 0.6983008702859511,
"grad_norm": 2.384059902937286,
"learning_rate": 2.5272016582081236e-06,
"loss": 0.3661,
"step": 1685
},
{
"epoch": 0.7003729796933278,
"grad_norm": 2.346884231086367,
"learning_rate": 2.4958238278221603e-06,
"loss": 0.3632,
"step": 1690
},
{
"epoch": 0.7024450891007045,
"grad_norm": 2.3658711079290264,
"learning_rate": 2.464577091519594e-06,
"loss": 0.3695,
"step": 1695
},
{
"epoch": 0.7045171985080813,
"grad_norm": 2.3082126926984277,
"learning_rate": 2.43346308507282e-06,
"loss": 0.3697,
"step": 1700
},
{
"epoch": 0.7065893079154579,
"grad_norm": 2.2724590255115324,
"learning_rate": 2.4024834373058024e-06,
"loss": 0.3708,
"step": 1705
},
{
"epoch": 0.7086614173228346,
"grad_norm": 2.4685609897195633,
"learning_rate": 2.371639770008804e-06,
"loss": 0.3589,
"step": 1710
},
{
"epoch": 0.7107335267302114,
"grad_norm": 2.4411171519085704,
"learning_rate": 2.3409336978534785e-06,
"loss": 0.3657,
"step": 1715
},
{
"epoch": 0.7128056361375881,
"grad_norm": 2.3755249742291227,
"learning_rate": 2.3103668283083563e-06,
"loss": 0.3688,
"step": 1720
},
{
"epoch": 0.7148777455449647,
"grad_norm": 2.3679474209669307,
"learning_rate": 2.2799407615546816e-06,
"loss": 0.3738,
"step": 1725
},
{
"epoch": 0.7169498549523415,
"grad_norm": 2.351814749989449,
"learning_rate": 2.2496570904026484e-06,
"loss": 0.3647,
"step": 1730
},
{
"epoch": 0.7190219643597182,
"grad_norm": 2.4116114120516103,
"learning_rate": 2.219517400208015e-06,
"loss": 0.3736,
"step": 1735
},
{
"epoch": 0.7210940737670949,
"grad_norm": 2.3864727744957412,
"learning_rate": 2.1895232687891044e-06,
"loss": 0.3484,
"step": 1740
},
{
"epoch": 0.7231661831744716,
"grad_norm": 2.2512303655303874,
"learning_rate": 2.159676266344222e-06,
"loss": 0.3599,
"step": 1745
},
{
"epoch": 0.7252382925818484,
"grad_norm": 2.2576815601903872,
"learning_rate": 2.1299779553694323e-06,
"loss": 0.358,
"step": 1750
},
{
"epoch": 0.727310401989225,
"grad_norm": 2.4261849484523594,
"learning_rate": 2.100429890576782e-06,
"loss": 0.3638,
"step": 1755
},
{
"epoch": 0.7293825113966017,
"grad_norm": 2.3998781669660008,
"learning_rate": 2.0710336188129e-06,
"loss": 0.3626,
"step": 1760
},
{
"epoch": 0.7314546208039785,
"grad_norm": 2.2949968092513697,
"learning_rate": 2.0417906789780236e-06,
"loss": 0.3556,
"step": 1765
},
{
"epoch": 0.7335267302113552,
"grad_norm": 2.3439568873845222,
"learning_rate": 2.0127026019454305e-06,
"loss": 0.3753,
"step": 1770
},
{
"epoch": 0.7355988396187318,
"grad_norm": 2.2377793345094896,
"learning_rate": 1.9837709104813075e-06,
"loss": 0.358,
"step": 1775
},
{
"epoch": 0.7376709490261085,
"grad_norm": 2.2885196165888653,
"learning_rate": 1.9549971191650263e-06,
"loss": 0.3515,
"step": 1780
},
{
"epoch": 0.7397430584334853,
"grad_norm": 2.558424767409142,
"learning_rate": 1.9263827343098596e-06,
"loss": 0.3657,
"step": 1785
},
{
"epoch": 0.741815167840862,
"grad_norm": 2.44623978739584,
"learning_rate": 1.8979292538841133e-06,
"loss": 0.3623,
"step": 1790
},
{
"epoch": 0.7438872772482387,
"grad_norm": 2.2614597017692692,
"learning_rate": 1.8696381674327308e-06,
"loss": 0.3553,
"step": 1795
},
{
"epoch": 0.7459593866556155,
"grad_norm": 2.2881907635344576,
"learning_rate": 1.8415109559992883e-06,
"loss": 0.3531,
"step": 1800
},
{
"epoch": 0.7480314960629921,
"grad_norm": 2.463206144892447,
"learning_rate": 1.8135490920484832e-06,
"loss": 0.3559,
"step": 1805
},
{
"epoch": 0.7501036054703688,
"grad_norm": 2.3074379267386766,
"learning_rate": 1.7857540393890337e-06,
"loss": 0.3544,
"step": 1810
},
{
"epoch": 0.7521757148777456,
"grad_norm": 2.345951065809859,
"learning_rate": 1.7581272530970666e-06,
"loss": 0.3495,
"step": 1815
},
{
"epoch": 0.7542478242851223,
"grad_norm": 2.436580121963066,
"learning_rate": 1.7306701794399266e-06,
"loss": 0.351,
"step": 1820
},
{
"epoch": 0.756319933692499,
"grad_norm": 2.477449269541472,
"learning_rate": 1.7033842558004692e-06,
"loss": 0.357,
"step": 1825
},
{
"epoch": 0.7583920430998756,
"grad_norm": 2.336059640702188,
"learning_rate": 1.6762709106018193e-06,
"loss": 0.3566,
"step": 1830
},
{
"epoch": 0.7604641525072524,
"grad_norm": 2.327604564240487,
"learning_rate": 1.6493315632325873e-06,
"loss": 0.3693,
"step": 1835
},
{
"epoch": 0.7625362619146291,
"grad_norm": 2.4521027562031805,
"learning_rate": 1.6225676239725663e-06,
"loss": 0.3557,
"step": 1840
},
{
"epoch": 0.7646083713220058,
"grad_norm": 2.3928441636264055,
"learning_rate": 1.5959804939188962e-06,
"loss": 0.35,
"step": 1845
},
{
"epoch": 0.7666804807293826,
"grad_norm": 2.3515743827298263,
"learning_rate": 1.5695715649127347e-06,
"loss": 0.3597,
"step": 1850
},
{
"epoch": 0.7687525901367592,
"grad_norm": 2.4659862823505843,
"learning_rate": 1.5433422194663694e-06,
"loss": 0.3544,
"step": 1855
},
{
"epoch": 0.7708246995441359,
"grad_norm": 2.3479456978647684,
"learning_rate": 1.5172938306908624e-06,
"loss": 0.3479,
"step": 1860
},
{
"epoch": 0.7728968089515127,
"grad_norm": 2.491588705252418,
"learning_rate": 1.4914277622241596e-06,
"loss": 0.348,
"step": 1865
},
{
"epoch": 0.7749689183588894,
"grad_norm": 2.2957936532852026,
"learning_rate": 1.4657453681597056e-06,
"loss": 0.3535,
"step": 1870
},
{
"epoch": 0.777041027766266,
"grad_norm": 2.2660578633915787,
"learning_rate": 1.440247992975553e-06,
"loss": 0.3503,
"step": 1875
},
{
"epoch": 0.7791131371736427,
"grad_norm": 2.4214309256458755,
"learning_rate": 1.4149369714639856e-06,
"loss": 0.3525,
"step": 1880
},
{
"epoch": 0.7811852465810195,
"grad_norm": 2.378191297196036,
"learning_rate": 1.3898136286616364e-06,
"loss": 0.3449,
"step": 1885
},
{
"epoch": 0.7832573559883962,
"grad_norm": 2.369189696994275,
"learning_rate": 1.3648792797801264e-06,
"loss": 0.3411,
"step": 1890
},
{
"epoch": 0.7853294653957729,
"grad_norm": 2.357227748595689,
"learning_rate": 1.3401352301372039e-06,
"loss": 0.3428,
"step": 1895
},
{
"epoch": 0.7874015748031497,
"grad_norm": 2.283274818402107,
"learning_rate": 1.315582775088421e-06,
"loss": 0.3476,
"step": 1900
},
{
"epoch": 0.7894736842105263,
"grad_norm": 2.443389423960905,
"learning_rate": 1.2912231999593222e-06,
"loss": 0.3456,
"step": 1905
},
{
"epoch": 0.791545793617903,
"grad_norm": 2.3098901374263785,
"learning_rate": 1.267057779978143e-06,
"loss": 0.3387,
"step": 1910
},
{
"epoch": 0.7936179030252797,
"grad_norm": 2.3606277283639403,
"learning_rate": 1.2430877802090674e-06,
"loss": 0.3505,
"step": 1915
},
{
"epoch": 0.7956900124326565,
"grad_norm": 2.778705490024244,
"learning_rate": 1.2193144554859938e-06,
"loss": 0.334,
"step": 1920
},
{
"epoch": 0.7977621218400331,
"grad_norm": 2.3112033884634418,
"learning_rate": 1.195739050346848e-06,
"loss": 0.349,
"step": 1925
},
{
"epoch": 0.7998342312474098,
"grad_norm": 2.3575714411861517,
"learning_rate": 1.172362798968424e-06,
"loss": 0.3449,
"step": 1930
},
{
"epoch": 0.8019063406547866,
"grad_norm": 2.491996861817634,
"learning_rate": 1.1491869251017833e-06,
"loss": 0.3414,
"step": 1935
},
{
"epoch": 0.8039784500621633,
"grad_norm": 2.3991288004159883,
"learning_rate": 1.1262126420081887e-06,
"loss": 0.3457,
"step": 1940
},
{
"epoch": 0.80605055946954,
"grad_norm": 2.1821189723068612,
"learning_rate": 1.103441152395588e-06,
"loss": 0.3283,
"step": 1945
},
{
"epoch": 0.8081226688769167,
"grad_norm": 2.417668328035057,
"learning_rate": 1.0808736483556486e-06,
"loss": 0.3386,
"step": 1950
},
{
"epoch": 0.8101947782842934,
"grad_norm": 2.424032614605958,
"learning_rate": 1.0585113113013656e-06,
"loss": 0.3451,
"step": 1955
},
{
"epoch": 0.8122668876916701,
"grad_norm": 2.3278404790134823,
"learning_rate": 1.036355311905194e-06,
"loss": 0.3455,
"step": 1960
},
{
"epoch": 0.8143389970990468,
"grad_norm": 2.3039554290948185,
"learning_rate": 1.0144068100377818e-06,
"loss": 0.3381,
"step": 1965
},
{
"epoch": 0.8164111065064236,
"grad_norm": 2.556306207112663,
"learning_rate": 9.926669547072365e-07,
"loss": 0.3485,
"step": 1970
},
{
"epoch": 0.8184832159138002,
"grad_norm": 2.4582213305283775,
"learning_rate": 9.711368839989904e-07,
"loss": 0.3335,
"step": 1975
},
{
"epoch": 0.8205553253211769,
"grad_norm": 2.3579094295454657,
"learning_rate": 9.498177250162022e-07,
"loss": 0.3346,
"step": 1980
},
{
"epoch": 0.8226274347285537,
"grad_norm": 2.3492342765735934,
"learning_rate": 9.287105938207691e-07,
"loss": 0.3365,
"step": 1985
},
{
"epoch": 0.8246995441359304,
"grad_norm": 2.4946764945886017,
"learning_rate": 9.078165953748936e-07,
"loss": 0.336,
"step": 1990
},
{
"epoch": 0.8267716535433071,
"grad_norm": 2.5266114489992955,
"learning_rate": 8.871368234832378e-07,
"loss": 0.3367,
"step": 1995
},
{
"epoch": 0.8288437629506838,
"grad_norm": 2.473228872985738,
"learning_rate": 8.66672360735668e-07,
"loss": 0.3247,
"step": 2000
},
{
"epoch": 0.8309158723580605,
"grad_norm": 2.393007820983065,
"learning_rate": 8.4642427845057e-07,
"loss": 0.3289,
"step": 2005
},
{
"epoch": 0.8329879817654372,
"grad_norm": 2.492764210538724,
"learning_rate": 8.263936366187825e-07,
"loss": 0.3325,
"step": 2010
},
{
"epoch": 0.8350600911728139,
"grad_norm": 2.4227393176245418,
"learning_rate": 8.065814838480879e-07,
"loss": 0.3288,
"step": 2015
},
{
"epoch": 0.8371322005801907,
"grad_norm": 2.3030572780595366,
"learning_rate": 7.869888573083295e-07,
"loss": 0.3401,
"step": 2020
},
{
"epoch": 0.8392043099875673,
"grad_norm": 2.3219953415669528,
"learning_rate": 7.676167826771125e-07,
"loss": 0.331,
"step": 2025
},
{
"epoch": 0.841276419394944,
"grad_norm": 2.273400790612548,
"learning_rate": 7.484662740861093e-07,
"loss": 0.3383,
"step": 2030
},
{
"epoch": 0.8433485288023208,
"grad_norm": 2.374061200093819,
"learning_rate": 7.295383340679668e-07,
"loss": 0.3343,
"step": 2035
},
{
"epoch": 0.8454206382096975,
"grad_norm": 2.377090176007411,
"learning_rate": 7.108339535038278e-07,
"loss": 0.3298,
"step": 2040
},
{
"epoch": 0.8474927476170742,
"grad_norm": 2.3279278715664016,
"learning_rate": 6.923541115714577e-07,
"loss": 0.3319,
"step": 2045
},
{
"epoch": 0.8495648570244508,
"grad_norm": 2.36303398687802,
"learning_rate": 6.740997756939826e-07,
"loss": 0.3418,
"step": 2050
},
{
"epoch": 0.8516369664318276,
"grad_norm": 2.415624778572703,
"learning_rate": 6.560719014892425e-07,
"loss": 0.3328,
"step": 2055
},
{
"epoch": 0.8537090758392043,
"grad_norm": 2.287618281117491,
"learning_rate": 6.382714327197703e-07,
"loss": 0.3321,
"step": 2060
},
{
"epoch": 0.855781185246581,
"grad_norm": 2.2595148250907036,
"learning_rate": 6.206993012433815e-07,
"loss": 0.3336,
"step": 2065
},
{
"epoch": 0.8578532946539578,
"grad_norm": 2.3926760910481186,
"learning_rate": 6.033564269643927e-07,
"loss": 0.3342,
"step": 2070
},
{
"epoch": 0.8599254040613344,
"grad_norm": 2.316219130405351,
"learning_rate": 5.862437177854629e-07,
"loss": 0.3311,
"step": 2075
},
{
"epoch": 0.8619975134687111,
"grad_norm": 2.3766431440321707,
"learning_rate": 5.693620695600671e-07,
"loss": 0.3153,
"step": 2080
},
{
"epoch": 0.8640696228760879,
"grad_norm": 2.1660555465277405,
"learning_rate": 5.527123660455968e-07,
"loss": 0.3268,
"step": 2085
},
{
"epoch": 0.8661417322834646,
"grad_norm": 2.3890878403617815,
"learning_rate": 5.362954788570929e-07,
"loss": 0.317,
"step": 2090
},
{
"epoch": 0.8682138416908413,
"grad_norm": 2.3910404086765515,
"learning_rate": 5.201122674216208e-07,
"loss": 0.329,
"step": 2095
},
{
"epoch": 0.8702859510982179,
"grad_norm": 2.398893233495637,
"learning_rate": 5.041635789332783e-07,
"loss": 0.3306,
"step": 2100
},
{
"epoch": 0.8723580605055947,
"grad_norm": 2.5664259292956495,
"learning_rate": 4.884502483088421e-07,
"loss": 0.3153,
"step": 2105
},
{
"epoch": 0.8744301699129714,
"grad_norm": 2.4313343193724073,
"learning_rate": 4.7297309814406113e-07,
"loss": 0.3286,
"step": 2110
},
{
"epoch": 0.8765022793203481,
"grad_norm": 2.3267523640919374,
"learning_rate": 4.577329386705942e-07,
"loss": 0.3321,
"step": 2115
},
{
"epoch": 0.8785743887277249,
"grad_norm": 2.37298005463995,
"learning_rate": 4.42730567713594e-07,
"loss": 0.3303,
"step": 2120
},
{
"epoch": 0.8806464981351015,
"grad_norm": 2.4400767630128084,
"learning_rate": 4.2796677064994243e-07,
"loss": 0.3187,
"step": 2125
},
{
"epoch": 0.8827186075424782,
"grad_norm": 2.600660724910327,
"learning_rate": 4.134423203671295e-07,
"loss": 0.3347,
"step": 2130
},
{
"epoch": 0.884790716949855,
"grad_norm": 2.416615232949827,
"learning_rate": 3.9915797722280323e-07,
"loss": 0.3317,
"step": 2135
},
{
"epoch": 0.8868628263572317,
"grad_norm": 2.3982415986507344,
"learning_rate": 3.851144890049535e-07,
"loss": 0.326,
"step": 2140
},
{
"epoch": 0.8889349357646084,
"grad_norm": 2.3596784596238445,
"learning_rate": 3.713125908927728e-07,
"loss": 0.3274,
"step": 2145
},
{
"epoch": 0.891007045171985,
"grad_norm": 2.395731949083915,
"learning_rate": 3.577530054181677e-07,
"loss": 0.3337,
"step": 2150
},
{
"epoch": 0.8930791545793618,
"grad_norm": 2.4132993194541363,
"learning_rate": 3.4443644242793226e-07,
"loss": 0.3205,
"step": 2155
},
{
"epoch": 0.8951512639867385,
"grad_norm": 2.421557012145629,
"learning_rate": 3.313635990465902e-07,
"loss": 0.3286,
"step": 2160
},
{
"epoch": 0.8972233733941152,
"grad_norm": 2.3514364773273053,
"learning_rate": 3.1853515963989613e-07,
"loss": 0.3244,
"step": 2165
},
{
"epoch": 0.899295482801492,
"grad_norm": 2.3631578697068156,
"learning_rate": 3.059517957790165e-07,
"loss": 0.3224,
"step": 2170
},
{
"epoch": 0.9013675922088686,
"grad_norm": 2.3165934226078693,
"learning_rate": 2.936141662053621e-07,
"loss": 0.3189,
"step": 2175
},
{
"epoch": 0.9034397016162453,
"grad_norm": 2.354830595345679,
"learning_rate": 2.8152291679611254e-07,
"loss": 0.3164,
"step": 2180
},
{
"epoch": 0.905511811023622,
"grad_norm": 2.4591886130861984,
"learning_rate": 2.6967868053039916e-07,
"loss": 0.3194,
"step": 2185
},
{
"epoch": 0.9075839204309988,
"grad_norm": 2.4651307442780515,
"learning_rate": 2.580820774561704e-07,
"loss": 0.3329,
"step": 2190
},
{
"epoch": 0.9096560298383755,
"grad_norm": 2.356380428139328,
"learning_rate": 2.467337146577298e-07,
"loss": 0.3125,
"step": 2195
},
{
"epoch": 0.9117281392457521,
"grad_norm": 2.4718310107961923,
"learning_rate": 2.3563418622395863e-07,
"loss": 0.3267,
"step": 2200
},
{
"epoch": 0.9138002486531289,
"grad_norm": 2.2943822251497834,
"learning_rate": 2.2478407321721295e-07,
"loss": 0.3124,
"step": 2205
},
{
"epoch": 0.9158723580605056,
"grad_norm": 2.37802636947732,
"learning_rate": 2.141839436429055e-07,
"loss": 0.3261,
"step": 2210
},
{
"epoch": 0.9179444674678823,
"grad_norm": 2.4929237268669504,
"learning_rate": 2.038343524197689e-07,
"loss": 0.3266,
"step": 2215
},
{
"epoch": 0.9200165768752591,
"grad_norm": 2.4497686225084645,
"learning_rate": 1.9373584135080893e-07,
"loss": 0.3155,
"step": 2220
},
{
"epoch": 0.9220886862826357,
"grad_norm": 2.3085410601255063,
"learning_rate": 1.8388893909493776e-07,
"loss": 0.3143,
"step": 2225
},
{
"epoch": 0.9241607956900124,
"grad_norm": 2.516522380423302,
"learning_rate": 1.742941611393012e-07,
"loss": 0.3282,
"step": 2230
},
{
"epoch": 0.9262329050973891,
"grad_norm": 2.250941160020719,
"learning_rate": 1.6495200977228897e-07,
"loss": 0.3313,
"step": 2235
},
{
"epoch": 0.9283050145047659,
"grad_norm": 2.243976361659756,
"learning_rate": 1.558629740572465e-07,
"loss": 0.3123,
"step": 2240
},
{
"epoch": 0.9303771239121426,
"grad_norm": 2.411662333101451,
"learning_rate": 1.4702752980686463e-07,
"loss": 0.3227,
"step": 2245
},
{
"epoch": 0.9324492333195192,
"grad_norm": 2.4064884421540254,
"learning_rate": 1.3844613955827536e-07,
"loss": 0.326,
"step": 2250
},
{
"epoch": 0.934521342726896,
"grad_norm": 2.4948851626678326,
"learning_rate": 1.301192525488376e-07,
"loss": 0.3177,
"step": 2255
},
{
"epoch": 0.9365934521342727,
"grad_norm": 2.256448839133944,
"learning_rate": 1.2204730469261905e-07,
"loss": 0.3224,
"step": 2260
},
{
"epoch": 0.9386655615416494,
"grad_norm": 2.386604983970806,
"learning_rate": 1.1423071855757473e-07,
"loss": 0.3177,
"step": 2265
},
{
"epoch": 0.9407376709490262,
"grad_norm": 2.4955559903632087,
"learning_rate": 1.0666990334342708e-07,
"loss": 0.3235,
"step": 2270
},
{
"epoch": 0.9428097803564028,
"grad_norm": 2.4680594179132953,
"learning_rate": 9.936525486024362e-08,
"loss": 0.3234,
"step": 2275
},
{
"epoch": 0.9448818897637795,
"grad_norm": 2.352943986977247,
"learning_rate": 9.23171555077168e-08,
"loss": 0.3189,
"step": 2280
},
{
"epoch": 0.9469539991711562,
"grad_norm": 2.3685608851078106,
"learning_rate": 8.552597425514508e-08,
"loss": 0.3191,
"step": 2285
},
{
"epoch": 0.949026108578533,
"grad_norm": 2.4481455211139944,
"learning_rate": 7.8992066622115e-08,
"loss": 0.3283,
"step": 2290
},
{
"epoch": 0.9510982179859097,
"grad_norm": 2.298587702409287,
"learning_rate": 7.271577465989554e-08,
"loss": 0.3181,
"step": 2295
},
{
"epoch": 0.9531703273932863,
"grad_norm": 2.3601024907505956,
"learning_rate": 6.669742693352522e-08,
"loss": 0.3183,
"step": 2300
},
{
"epoch": 0.9552424368006631,
"grad_norm": 2.3846417205933537,
"learning_rate": 6.093733850461359e-08,
"loss": 0.311,
"step": 2305
},
{
"epoch": 0.9573145462080398,
"grad_norm": 2.5721713736853427,
"learning_rate": 5.5435810914851176e-08,
"loss": 0.3225,
"step": 2310
},
{
"epoch": 0.9593866556154165,
"grad_norm": 2.4391728621326667,
"learning_rate": 5.0193132170219814e-08,
"loss": 0.3219,
"step": 2315
},
{
"epoch": 0.9614587650227931,
"grad_norm": 2.314964895648817,
"learning_rate": 4.5209576725915305e-08,
"loss": 0.3146,
"step": 2320
},
{
"epoch": 0.9635308744301699,
"grad_norm": 2.3953921094993427,
"learning_rate": 4.0485405471983317e-08,
"loss": 0.3099,
"step": 2325
},
{
"epoch": 0.9656029838375466,
"grad_norm": 2.4903270174654977,
"learning_rate": 3.6020865719657015e-08,
"loss": 0.3261,
"step": 2330
},
{
"epoch": 0.9676750932449233,
"grad_norm": 2.5600866933116153,
"learning_rate": 3.181619118841517e-08,
"loss": 0.3207,
"step": 2335
},
{
"epoch": 0.9697472026523001,
"grad_norm": 2.2542713880681466,
"learning_rate": 2.7871601993741947e-08,
"loss": 0.3154,
"step": 2340
},
{
"epoch": 0.9718193120596768,
"grad_norm": 2.3906714544505,
"learning_rate": 2.4187304635608922e-08,
"loss": 0.3264,
"step": 2345
},
{
"epoch": 0.9738914214670534,
"grad_norm": 2.3437139004611396,
"learning_rate": 2.0763491987659812e-08,
"loss": 0.3131,
"step": 2350
},
{
"epoch": 0.9759635308744302,
"grad_norm": 2.3352282804892504,
"learning_rate": 1.7600343287116904e-08,
"loss": 0.3236,
"step": 2355
},
{
"epoch": 0.9780356402818069,
"grad_norm": 2.2788999882928267,
"learning_rate": 1.4698024125396893e-08,
"loss": 0.3165,
"step": 2360
},
{
"epoch": 0.9801077496891836,
"grad_norm": 2.484272820389643,
"learning_rate": 1.205668643944169e-08,
"loss": 0.3265,
"step": 2365
},
{
"epoch": 0.9821798590965602,
"grad_norm": 2.3492923712340357,
"learning_rate": 9.676468503765356e-09,
"loss": 0.3185,
"step": 2370
},
{
"epoch": 0.984251968503937,
"grad_norm": 2.2726204174567806,
"learning_rate": 7.557494923214338e-09,
"loss": 0.3187,
"step": 2375
},
{
"epoch": 0.9863240779113137,
"grad_norm": 2.502123973833959,
"learning_rate": 5.699876626446554e-09,
"loss": 0.315,
"step": 2380
},
{
"epoch": 0.9883961873186904,
"grad_norm": 2.5216195549598046,
"learning_rate": 4.103710860120513e-09,
"loss": 0.3232,
"step": 2385
},
{
"epoch": 0.9904682967260672,
"grad_norm": 2.3006568402247987,
"learning_rate": 2.769081183808253e-09,
"loss": 0.3132,
"step": 2390
},
{
"epoch": 0.9925404061334439,
"grad_norm": 2.3057165957876977,
"learning_rate": 1.69605746561885e-09,
"loss": 0.3099,
"step": 2395
},
{
"epoch": 0.9946125155408205,
"grad_norm": 2.3117976401405853,
"learning_rate": 8.846958785418969e-10,
"loss": 0.3166,
"step": 2400
},
{
"epoch": 0.9966846249481973,
"grad_norm": 2.4474241574477613,
"learning_rate": 3.3503889750485794e-10,
"loss": 0.3198,
"step": 2405
},
{
"epoch": 0.998756734355574,
"grad_norm": 2.475055392564647,
"learning_rate": 4.711529715262231e-11,
"loss": 0.3188,
"step": 2410
},
{
"epoch": 1.0,
"eval_loss": 0.2978341579437256,
"eval_runtime": 1.1902,
"eval_samples_per_second": 2.521,
"eval_steps_per_second": 0.84,
"step": 2413
},
{
"epoch": 1.0,
"step": 2413,
"total_flos": 252616554577920.0,
"train_loss": 0.0,
"train_runtime": 0.0113,
"train_samples_per_second": 3408953.689,
"train_steps_per_second": 213070.643
}
],
"logging_steps": 5,
"max_steps": 2413,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 252616554577920.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}