{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0035087719298246,
"eval_steps": 36,
"global_step": 428,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007017543859649123,
"eval_loss": 0.6692813038825989,
"eval_runtime": 46.9815,
"eval_samples_per_second": 5.108,
"eval_steps_per_second": 0.639,
"step": 1
},
{
"epoch": 0.021052631578947368,
"grad_norm": 9.726601600646973,
"learning_rate": 1.5e-05,
"loss": 2.4103,
"step": 3
},
{
"epoch": 0.042105263157894736,
"grad_norm": 6.305670738220215,
"learning_rate": 3e-05,
"loss": 2.7525,
"step": 6
},
{
"epoch": 0.06315789473684211,
"grad_norm": 4.551560401916504,
"learning_rate": 4.5e-05,
"loss": 2.5999,
"step": 9
},
{
"epoch": 0.08421052631578947,
"grad_norm": 4.885453701019287,
"learning_rate": 4.999717571181742e-05,
"loss": 2.2257,
"step": 12
},
{
"epoch": 0.10526315789473684,
"grad_norm": 4.619503498077393,
"learning_rate": 4.998234994371135e-05,
"loss": 2.0927,
"step": 15
},
{
"epoch": 0.12631578947368421,
"grad_norm": 4.877598285675049,
"learning_rate": 4.995482415049123e-05,
"loss": 2.3476,
"step": 18
},
{
"epoch": 0.14736842105263157,
"grad_norm": 6.852722644805908,
"learning_rate": 4.991461232516675e-05,
"loss": 2.028,
"step": 21
},
{
"epoch": 0.16842105263157894,
"grad_norm": 6.002420902252197,
"learning_rate": 4.986173490981773e-05,
"loss": 1.6801,
"step": 24
},
{
"epoch": 0.18947368421052632,
"grad_norm": 5.471989631652832,
"learning_rate": 4.979621878520216e-05,
"loss": 1.4341,
"step": 27
},
{
"epoch": 0.21052631578947367,
"grad_norm": 5.2608489990234375,
"learning_rate": 4.971809725709112e-05,
"loss": 1.8805,
"step": 30
},
{
"epoch": 0.23157894736842105,
"grad_norm": 3.3612749576568604,
"learning_rate": 4.962741003933742e-05,
"loss": 1.6929,
"step": 33
},
{
"epoch": 0.25263157894736843,
"grad_norm": 4.646411895751953,
"learning_rate": 4.952420323368673e-05,
"loss": 1.5576,
"step": 36
},
{
"epoch": 0.25263157894736843,
"eval_loss": 0.37584158778190613,
"eval_runtime": 47.717,
"eval_samples_per_second": 5.03,
"eval_steps_per_second": 0.629,
"step": 36
},
{
"epoch": 0.2736842105263158,
"grad_norm": 4.316969394683838,
"learning_rate": 4.9408529306341255e-05,
"loss": 1.8731,
"step": 39
},
{
"epoch": 0.29473684210526313,
"grad_norm": 4.682113170623779,
"learning_rate": 4.928044706128803e-05,
"loss": 1.8301,
"step": 42
},
{
"epoch": 0.3157894736842105,
"grad_norm": 3.629147529602051,
"learning_rate": 4.9140021610405326e-05,
"loss": 1.2944,
"step": 45
},
{
"epoch": 0.3368421052631579,
"grad_norm": 7.077390193939209,
"learning_rate": 4.898732434036244e-05,
"loss": 2.0447,
"step": 48
},
{
"epoch": 0.35789473684210527,
"grad_norm": 4.48635196685791,
"learning_rate": 4.882243287632947e-05,
"loss": 1.4274,
"step": 51
},
{
"epoch": 0.37894736842105264,
"grad_norm": 4.4893388748168945,
"learning_rate": 4.864543104251587e-05,
"loss": 1.7248,
"step": 54
},
{
"epoch": 0.4,
"grad_norm": 5.431076526641846,
"learning_rate": 4.8456408819557564e-05,
"loss": 1.6822,
"step": 57
},
{
"epoch": 0.42105263157894735,
"grad_norm": 3.690011501312256,
"learning_rate": 4.825546229877439e-05,
"loss": 1.7077,
"step": 60
},
{
"epoch": 0.4421052631578947,
"grad_norm": 5.592578411102295,
"learning_rate": 4.804269363332112e-05,
"loss": 1.836,
"step": 63
},
{
"epoch": 0.4631578947368421,
"grad_norm": 5.439390182495117,
"learning_rate": 4.78182109862569e-05,
"loss": 1.2283,
"step": 66
},
{
"epoch": 0.4842105263157895,
"grad_norm": 4.344250202178955,
"learning_rate": 4.758212847555953e-05,
"loss": 1.6078,
"step": 69
},
{
"epoch": 0.5052631578947369,
"grad_norm": 5.0562825202941895,
"learning_rate": 4.733456611611233e-05,
"loss": 1.858,
"step": 72
},
{
"epoch": 0.5052631578947369,
"eval_loss": 0.3427739441394806,
"eval_runtime": 47.7477,
"eval_samples_per_second": 5.026,
"eval_steps_per_second": 0.628,
"step": 72
},
{
"epoch": 0.5263157894736842,
"grad_norm": 4.3865838050842285,
"learning_rate": 4.7075649758693565e-05,
"loss": 1.2519,
"step": 75
},
{
"epoch": 0.5473684210526316,
"grad_norm": 2.8480889797210693,
"learning_rate": 4.68055110259988e-05,
"loss": 1.6193,
"step": 78
},
{
"epoch": 0.5684210526315789,
"grad_norm": 3.4718546867370605,
"learning_rate": 4.6524287245729295e-05,
"loss": 1.4091,
"step": 81
},
{
"epoch": 0.5894736842105263,
"grad_norm": 4.449820041656494,
"learning_rate": 4.6232121380780034e-05,
"loss": 1.484,
"step": 84
},
{
"epoch": 0.6105263157894737,
"grad_norm": 3.7628002166748047,
"learning_rate": 4.592916195656322e-05,
"loss": 1.3605,
"step": 87
},
{
"epoch": 0.631578947368421,
"grad_norm": 3.6203603744506836,
"learning_rate": 4.561556298550379e-05,
"loss": 1.4026,
"step": 90
},
{
"epoch": 0.6526315789473685,
"grad_norm": 3.6984612941741943,
"learning_rate": 4.529148388874577e-05,
"loss": 1.1724,
"step": 93
},
{
"epoch": 0.6736842105263158,
"grad_norm": 3.412766933441162,
"learning_rate": 4.49570894151089e-05,
"loss": 1.5515,
"step": 96
},
{
"epoch": 0.6947368421052632,
"grad_norm": 2.684919595718384,
"learning_rate": 4.4612549557336974e-05,
"loss": 1.2596,
"step": 99
},
{
"epoch": 0.7157894736842105,
"grad_norm": 4.008241176605225,
"learning_rate": 4.4258039465680326e-05,
"loss": 1.1391,
"step": 102
},
{
"epoch": 0.7368421052631579,
"grad_norm": 4.187386989593506,
"learning_rate": 4.389373935885646e-05,
"loss": 1.1588,
"step": 105
},
{
"epoch": 0.7578947368421053,
"grad_norm": 4.869933605194092,
"learning_rate": 4.351983443243409e-05,
"loss": 1.5655,
"step": 108
},
{
"epoch": 0.7578947368421053,
"eval_loss": 0.322973370552063,
"eval_runtime": 47.6997,
"eval_samples_per_second": 5.031,
"eval_steps_per_second": 0.629,
"step": 108
},
{
"epoch": 0.7789473684210526,
"grad_norm": 3.7822816371917725,
"learning_rate": 4.313651476468715e-05,
"loss": 1.5809,
"step": 111
},
{
"epoch": 0.8,
"grad_norm": 2.936788320541382,
"learning_rate": 4.274397521996658e-05,
"loss": 1.0463,
"step": 114
},
{
"epoch": 0.8210526315789474,
"grad_norm": 4.922979831695557,
"learning_rate": 4.234241534963916e-05,
"loss": 1.2287,
"step": 117
},
{
"epoch": 0.8421052631578947,
"grad_norm": 5.986371040344238,
"learning_rate": 4.193203929064353e-05,
"loss": 1.3477,
"step": 120
},
{
"epoch": 0.8631578947368421,
"grad_norm": 3.4700145721435547,
"learning_rate": 4.1513055661715214e-05,
"loss": 0.9548,
"step": 123
},
{
"epoch": 0.8842105263157894,
"grad_norm": 4.394268035888672,
"learning_rate": 4.108567745733318e-05,
"loss": 1.2286,
"step": 126
},
{
"epoch": 0.9052631578947369,
"grad_norm": 4.035145282745361,
"learning_rate": 4.065012193944201e-05,
"loss": 1.1731,
"step": 129
},
{
"epoch": 0.9263157894736842,
"grad_norm": 3.933317184448242,
"learning_rate": 4.020661052700461e-05,
"loss": 1.6722,
"step": 132
},
{
"epoch": 0.9473684210526315,
"grad_norm": 3.2603344917297363,
"learning_rate": 3.9755368683441735e-05,
"loss": 1.3816,
"step": 135
},
{
"epoch": 0.968421052631579,
"grad_norm": 6.198463439941406,
"learning_rate": 3.9296625802015356e-05,
"loss": 1.2843,
"step": 138
},
{
"epoch": 0.9894736842105263,
"grad_norm": 4.392797470092773,
"learning_rate": 3.883061508921439e-05,
"loss": 1.5944,
"step": 141
},
{
"epoch": 1.0105263157894737,
"grad_norm": 3.6408369541168213,
"learning_rate": 3.8357573446201825e-05,
"loss": 1.1528,
"step": 144
},
{
"epoch": 1.0105263157894737,
"eval_loss": 0.31307944655418396,
"eval_runtime": 47.7481,
"eval_samples_per_second": 5.026,
"eval_steps_per_second": 0.628,
"step": 144
},
{
"epoch": 1.0315789473684212,
"grad_norm": 3.727839946746826,
"learning_rate": 3.78777413483837e-05,
"loss": 1.3407,
"step": 147
},
{
"epoch": 1.0526315789473684,
"grad_norm": 4.318253517150879,
"learning_rate": 3.739136272316102e-05,
"loss": 1.274,
"step": 150
},
{
"epoch": 1.0736842105263158,
"grad_norm": 3.0407471656799316,
"learning_rate": 3.689868482592684e-05,
"loss": 1.0978,
"step": 153
},
{
"epoch": 1.0947368421052632,
"grad_norm": 3.2110660076141357,
"learning_rate": 3.6399958114371595e-05,
"loss": 0.9378,
"step": 156
},
{
"epoch": 1.1157894736842104,
"grad_norm": 4.471799373626709,
"learning_rate": 3.5895436121160386e-05,
"loss": 1.334,
"step": 159
},
{
"epoch": 1.1368421052631579,
"grad_norm": 2.7536613941192627,
"learning_rate": 3.5385375325047166e-05,
"loss": 1.5206,
"step": 162
},
{
"epoch": 1.1578947368421053,
"grad_norm": 3.1631388664245605,
"learning_rate": 3.487003502049122e-05,
"loss": 0.9874,
"step": 165
},
{
"epoch": 1.1789473684210527,
"grad_norm": 3.0744566917419434,
"learning_rate": 3.4349677185842245e-05,
"loss": 1.2542,
"step": 168
},
{
"epoch": 1.2,
"grad_norm": 3.199769973754883,
"learning_rate": 3.38245663501611e-05,
"loss": 1.0781,
"step": 171
},
{
"epoch": 1.2210526315789474,
"grad_norm": 3.3641140460968018,
"learning_rate": 3.32949694587438e-05,
"loss": 1.0915,
"step": 174
},
{
"epoch": 1.2421052631578948,
"grad_norm": 2.533961057662964,
"learning_rate": 3.276115573741724e-05,
"loss": 1.2862,
"step": 177
},
{
"epoch": 1.263157894736842,
"grad_norm": 4.081838130950928,
"learning_rate": 3.222339655567556e-05,
"loss": 1.2205,
"step": 180
},
{
"epoch": 1.263157894736842,
"eval_loss": 0.3107610046863556,
"eval_runtime": 47.7384,
"eval_samples_per_second": 5.027,
"eval_steps_per_second": 0.628,
"step": 180
},
{
"epoch": 1.2842105263157895,
"grad_norm": 2.3932526111602783,
"learning_rate": 3.168196528872682e-05,
"loss": 1.0431,
"step": 183
},
{
"epoch": 1.305263157894737,
"grad_norm": 2.7691686153411865,
"learning_rate": 3.1137137178519985e-05,
"loss": 1.314,
"step": 186
},
{
"epoch": 1.3263157894736843,
"grad_norm": 3.8344638347625732,
"learning_rate": 3.0589189193822895e-05,
"loss": 0.8119,
"step": 189
},
{
"epoch": 1.3473684210526315,
"grad_norm": 4.127139568328857,
"learning_rate": 3.0038399889422553e-05,
"loss": 1.1671,
"step": 192
},
{
"epoch": 1.368421052631579,
"grad_norm": 3.597393035888672,
"learning_rate": 2.948504926451896e-05,
"loss": 1.4459,
"step": 195
},
{
"epoch": 1.3894736842105262,
"grad_norm": 3.0417675971984863,
"learning_rate": 2.8929418620384753e-05,
"loss": 1.0606,
"step": 198
},
{
"epoch": 1.4105263157894736,
"grad_norm": 4.269920825958252,
"learning_rate": 2.8371790417362987e-05,
"loss": 0.8091,
"step": 201
},
{
"epoch": 1.431578947368421,
"grad_norm": 4.4791789054870605,
"learning_rate": 2.781244813127552e-05,
"loss": 1.4956,
"step": 204
},
{
"epoch": 1.4526315789473685,
"grad_norm": 4.570736885070801,
"learning_rate": 2.7251676109315338e-05,
"loss": 0.791,
"step": 207
},
{
"epoch": 1.4736842105263157,
"grad_norm": 4.790010929107666,
"learning_rate": 2.668975942549583e-05,
"loss": 1.2485,
"step": 210
},
{
"epoch": 1.4947368421052631,
"grad_norm": 3.679155111312866,
"learning_rate": 2.612698373573056e-05,
"loss": 0.9622,
"step": 213
},
{
"epoch": 1.5157894736842106,
"grad_norm": 3.991124153137207,
"learning_rate": 2.5563635132617302e-05,
"loss": 0.7821,
"step": 216
},
{
"epoch": 1.5157894736842106,
"eval_loss": 0.30537185072898865,
"eval_runtime": 47.7614,
"eval_samples_per_second": 5.025,
"eval_steps_per_second": 0.628,
"step": 216
},
{
"epoch": 1.5368421052631578,
"grad_norm": 5.918197154998779,
"learning_rate": 2.5e-05,
"loss": 0.7552,
"step": 219
},
{
"epoch": 1.5578947368421052,
"grad_norm": 6.4377241134643555,
"learning_rate": 2.44363648673827e-05,
"loss": 1.13,
"step": 222
},
{
"epoch": 1.5789473684210527,
"grad_norm": 3.93595814704895,
"learning_rate": 2.387301626426944e-05,
"loss": 0.9218,
"step": 225
},
{
"epoch": 1.6,
"grad_norm": 5.706233978271484,
"learning_rate": 2.3310240574504185e-05,
"loss": 1.1022,
"step": 228
},
{
"epoch": 1.6210526315789475,
"grad_norm": 2.740601062774658,
"learning_rate": 2.2748323890684665e-05,
"loss": 1.2584,
"step": 231
},
{
"epoch": 1.6421052631578947,
"grad_norm": 4.44104528427124,
"learning_rate": 2.2187551868724485e-05,
"loss": 1.0941,
"step": 234
},
{
"epoch": 1.663157894736842,
"grad_norm": 4.569465160369873,
"learning_rate": 2.1628209582637022e-05,
"loss": 1.1554,
"step": 237
},
{
"epoch": 1.6842105263157894,
"grad_norm": 4.33217191696167,
"learning_rate": 2.1070581379615253e-05,
"loss": 0.5728,
"step": 240
},
{
"epoch": 1.7052631578947368,
"grad_norm": 4.296968936920166,
"learning_rate": 2.0514950735481052e-05,
"loss": 1.0808,
"step": 243
},
{
"epoch": 1.7263157894736842,
"grad_norm": 3.474714994430542,
"learning_rate": 1.9961600110577456e-05,
"loss": 1.2945,
"step": 246
},
{
"epoch": 1.7473684210526317,
"grad_norm": 3.817056655883789,
"learning_rate": 1.9410810806177104e-05,
"loss": 1.4233,
"step": 249
},
{
"epoch": 1.768421052631579,
"grad_norm": 3.0018868446350098,
"learning_rate": 1.8862862821480025e-05,
"loss": 1.0385,
"step": 252
},
{
"epoch": 1.768421052631579,
"eval_loss": 0.3030892610549927,
"eval_runtime": 47.7428,
"eval_samples_per_second": 5.027,
"eval_steps_per_second": 0.628,
"step": 252
},
{
"epoch": 1.7894736842105263,
"grad_norm": 3.522315502166748,
"learning_rate": 1.831803471127318e-05,
"loss": 1.1658,
"step": 255
},
{
"epoch": 1.8105263157894735,
"grad_norm": 3.5018210411071777,
"learning_rate": 1.7776603444324445e-05,
"loss": 1.0903,
"step": 258
},
{
"epoch": 1.831578947368421,
"grad_norm": 4.468841552734375,
"learning_rate": 1.723884426258277e-05,
"loss": 1.1171,
"step": 261
},
{
"epoch": 1.8526315789473684,
"grad_norm": 3.999666452407837,
"learning_rate": 1.670503054125621e-05,
"loss": 1.2162,
"step": 264
},
{
"epoch": 1.8736842105263158,
"grad_norm": 3.463674783706665,
"learning_rate": 1.61754336498389e-05,
"loss": 0.8498,
"step": 267
},
{
"epoch": 1.8947368421052633,
"grad_norm": 3.4514553546905518,
"learning_rate": 1.5650322814157764e-05,
"loss": 1.2623,
"step": 270
},
{
"epoch": 1.9157894736842105,
"grad_norm": 3.6156108379364014,
"learning_rate": 1.5129964979508792e-05,
"loss": 0.8503,
"step": 273
},
{
"epoch": 1.936842105263158,
"grad_norm": 3.3259615898132324,
"learning_rate": 1.4614624674952842e-05,
"loss": 0.9937,
"step": 276
},
{
"epoch": 1.9578947368421051,
"grad_norm": 5.9854230880737305,
"learning_rate": 1.4104563878839621e-05,
"loss": 0.9689,
"step": 279
},
{
"epoch": 1.9789473684210526,
"grad_norm": 2.22936749458313,
"learning_rate": 1.3600041885628409e-05,
"loss": 0.9137,
"step": 282
},
{
"epoch": 2.0,
"grad_norm": 3.004664897918701,
"learning_rate": 1.3101315174073162e-05,
"loss": 0.6448,
"step": 285
},
{
"epoch": 2.0210526315789474,
"grad_norm": 6.208250522613525,
"learning_rate": 1.2608637276838986e-05,
"loss": 1.319,
"step": 288
},
{
"epoch": 2.0210526315789474,
"eval_loss": 0.30173683166503906,
"eval_runtime": 47.7589,
"eval_samples_per_second": 5.025,
"eval_steps_per_second": 0.628,
"step": 288
},
{
"epoch": 2.042105263157895,
"grad_norm": 2.969910144805908,
"learning_rate": 1.2122258651616306e-05,
"loss": 0.8383,
"step": 291
},
{
"epoch": 2.0631578947368423,
"grad_norm": 2.5157318115234375,
"learning_rate": 1.1642426553798174e-05,
"loss": 0.7519,
"step": 294
},
{
"epoch": 2.0842105263157893,
"grad_norm": 3.564941167831421,
"learning_rate": 1.1169384910785614e-05,
"loss": 0.5701,
"step": 297
},
{
"epoch": 2.1052631578947367,
"grad_norm": 3.544473886489868,
"learning_rate": 1.0703374197984653e-05,
"loss": 0.7366,
"step": 300
},
{
"epoch": 2.126315789473684,
"grad_norm": 2.952383041381836,
"learning_rate": 1.0244631316558267e-05,
"loss": 0.6928,
"step": 303
},
{
"epoch": 2.1473684210526316,
"grad_norm": 3.4289209842681885,
"learning_rate": 9.793389472995393e-06,
"loss": 0.7361,
"step": 306
},
{
"epoch": 2.168421052631579,
"grad_norm": 3.7741119861602783,
"learning_rate": 9.349878060557999e-06,
"loss": 0.7777,
"step": 309
},
{
"epoch": 2.1894736842105265,
"grad_norm": 4.074053764343262,
"learning_rate": 8.914322542666822e-06,
"loss": 0.9209,
"step": 312
},
{
"epoch": 2.2105263157894735,
"grad_norm": 4.839679718017578,
"learning_rate": 8.486944338284797e-06,
"loss": 0.937,
"step": 315
},
{
"epoch": 2.231578947368421,
"grad_norm": 3.5984749794006348,
"learning_rate": 8.067960709356478e-06,
"loss": 1.0567,
"step": 318
},
{
"epoch": 2.2526315789473683,
"grad_norm": 4.226260185241699,
"learning_rate": 7.657584650360847e-06,
"loss": 0.8969,
"step": 321
},
{
"epoch": 2.2736842105263158,
"grad_norm": 2.624924421310425,
"learning_rate": 7.256024780033418e-06,
"loss": 0.7665,
"step": 324
},
{
"epoch": 2.2736842105263158,
"eval_loss": 0.3099728524684906,
"eval_runtime": 47.753,
"eval_samples_per_second": 5.026,
"eval_steps_per_second": 0.628,
"step": 324
},
{
"epoch": 2.294736842105263,
"grad_norm": 3.7707293033599854,
"learning_rate": 6.863485235312853e-06,
"loss": 0.7157,
"step": 327
},
{
"epoch": 2.3157894736842106,
"grad_norm": 3.5063211917877197,
"learning_rate": 6.480165567565913e-06,
"loss": 0.7941,
"step": 330
},
{
"epoch": 2.336842105263158,
"grad_norm": 5.289640426635742,
"learning_rate": 6.106260641143546e-06,
"loss": 1.022,
"step": 333
},
{
"epoch": 2.3578947368421055,
"grad_norm": 3.4733479022979736,
"learning_rate": 5.741960534319677e-06,
"loss": 0.8732,
"step": 336
},
{
"epoch": 2.3789473684210525,
"grad_norm": 2.74438214302063,
"learning_rate": 5.387450442663025e-06,
"loss": 0.488,
"step": 339
},
{
"epoch": 2.4,
"grad_norm": 3.423187732696533,
"learning_rate": 5.0429105848911e-06,
"loss": 1.0244,
"step": 342
},
{
"epoch": 2.4210526315789473,
"grad_norm": 3.875284194946289,
"learning_rate": 4.708516111254238e-06,
"loss": 0.9071,
"step": 345
},
{
"epoch": 2.442105263157895,
"grad_norm": 4.707957744598389,
"learning_rate": 4.384437014496215e-06,
"loss": 0.8664,
"step": 348
},
{
"epoch": 2.463157894736842,
"grad_norm": 4.914385795593262,
"learning_rate": 4.070838043436786e-06,
"loss": 0.6006,
"step": 351
},
{
"epoch": 2.4842105263157896,
"grad_norm": 3.2543418407440186,
"learning_rate": 3.7678786192199694e-06,
"loss": 0.5789,
"step": 354
},
{
"epoch": 2.5052631578947366,
"grad_norm": 2.9000864028930664,
"learning_rate": 3.475712754270716e-06,
"loss": 0.5431,
"step": 357
},
{
"epoch": 2.526315789473684,
"grad_norm": 4.2075886726379395,
"learning_rate": 3.194488974001203e-06,
"loss": 0.6753,
"step": 360
},
{
"epoch": 2.526315789473684,
"eval_loss": 0.3118632733821869,
"eval_runtime": 47.721,
"eval_samples_per_second": 5.029,
"eval_steps_per_second": 0.629,
"step": 360
},
{
"epoch": 2.5473684210526315,
"grad_norm": 5.408112049102783,
"learning_rate": 2.9243502413064368e-06,
"loss": 0.6439,
"step": 363
},
{
"epoch": 2.568421052631579,
"grad_norm": 3.7381534576416016,
"learning_rate": 2.6654338838876665e-06,
"loss": 0.9288,
"step": 366
},
{
"epoch": 2.5894736842105264,
"grad_norm": 4.740654468536377,
"learning_rate": 2.4178715244404794e-06,
"loss": 0.9505,
"step": 369
},
{
"epoch": 2.610526315789474,
"grad_norm": 4.9893364906311035,
"learning_rate": 2.1817890137430934e-06,
"loss": 1.046,
"step": 372
},
{
"epoch": 2.6315789473684212,
"grad_norm": 4.344699382781982,
"learning_rate": 1.9573063666788875e-06,
"loss": 0.8301,
"step": 375
},
{
"epoch": 2.6526315789473687,
"grad_norm": 2.871662139892578,
"learning_rate": 1.7445377012256126e-06,
"loss": 0.6642,
"step": 378
},
{
"epoch": 2.6736842105263157,
"grad_norm": 3.569286346435547,
"learning_rate": 1.5435911804424357e-06,
"loss": 0.8558,
"step": 381
},
{
"epoch": 2.694736842105263,
"grad_norm": 4.009424209594727,
"learning_rate": 1.3545689574841342e-06,
"loss": 0.8686,
"step": 384
},
{
"epoch": 2.7157894736842105,
"grad_norm": 3.5932652950286865,
"learning_rate": 1.1775671236705365e-06,
"loss": 1.0848,
"step": 387
},
{
"epoch": 2.736842105263158,
"grad_norm": 4.354364395141602,
"learning_rate": 1.0126756596375686e-06,
"loss": 1.1122,
"step": 390
},
{
"epoch": 2.7578947368421054,
"grad_norm": 3.184096336364746,
"learning_rate": 8.599783895946761e-07,
"loss": 0.8129,
"step": 393
},
{
"epoch": 2.7789473684210524,
"grad_norm": 4.265777587890625,
"learning_rate": 7.195529387119815e-07,
"loss": 0.7224,
"step": 396
},
{
"epoch": 2.7789473684210524,
"eval_loss": 0.3112446963787079,
"eval_runtime": 47.7893,
"eval_samples_per_second": 5.022,
"eval_steps_per_second": 0.628,
"step": 396
},
{
"epoch": 2.8,
"grad_norm": 3.4699087142944336,
"learning_rate": 5.914706936587494e-07,
"loss": 0.614,
"step": 399
},
{
"epoch": 2.8210526315789473,
"grad_norm": 2.6950035095214844,
"learning_rate": 4.75796766313269e-07,
"loss": 0.9641,
"step": 402
},
{
"epoch": 2.8421052631578947,
"grad_norm": 4.25594425201416,
"learning_rate": 3.7258996066258103e-07,
"loss": 0.736,
"step": 405
},
{
"epoch": 2.863157894736842,
"grad_norm": 3.8812239170074463,
"learning_rate": 2.819027429088822e-07,
"loss": 0.7287,
"step": 408
},
{
"epoch": 2.8842105263157896,
"grad_norm": 4.651484966278076,
"learning_rate": 2.0378121479783796e-07,
"loss": 0.8938,
"step": 411
},
{
"epoch": 2.905263157894737,
"grad_norm": 4.784148216247559,
"learning_rate": 1.3826509018227128e-07,
"loss": 0.9602,
"step": 414
},
{
"epoch": 2.9263157894736844,
"grad_norm": 4.499444007873535,
"learning_rate": 8.538767483325383e-08,
"loss": 0.985,
"step": 417
},
{
"epoch": 2.9473684210526314,
"grad_norm": 5.214015483856201,
"learning_rate": 4.517584950877452e-08,
"loss": 0.9054,
"step": 420
},
{
"epoch": 2.968421052631579,
"grad_norm": 3.8694188594818115,
"learning_rate": 1.7650056288651127e-08,
"loss": 0.651,
"step": 423
},
{
"epoch": 2.9894736842105263,
"grad_norm": 3.8104214668273926,
"learning_rate": 2.8242881825846223e-09,
"loss": 0.8252,
"step": 426
}
],
"logging_steps": 3,
"max_steps": 428,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 36,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7558214228836352e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}