|
{ |
|
"best_metric": 0.9795737122557726, |
|
"best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-ibird/checkpoint-2825", |
|
"epoch": 4.995579133510168, |
|
"eval_steps": 500, |
|
"global_step": 2825, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.971182823181152, |
|
"learning_rate": 1.76678445229682e-06, |
|
"loss": 3.1809, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.421601295471191, |
|
"learning_rate": 3.53356890459364e-06, |
|
"loss": 3.2061, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 9.90295124053955, |
|
"learning_rate": 5.30035335689046e-06, |
|
"loss": 3.1752, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.587502479553223, |
|
"learning_rate": 7.06713780918728e-06, |
|
"loss": 3.1217, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.991631984710693, |
|
"learning_rate": 8.8339222614841e-06, |
|
"loss": 3.0881, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 18.684083938598633, |
|
"learning_rate": 1.060070671378092e-05, |
|
"loss": 3.0575, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 9.924698829650879, |
|
"learning_rate": 1.236749116607774e-05, |
|
"loss": 2.9739, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.268143653869629, |
|
"learning_rate": 1.413427561837456e-05, |
|
"loss": 2.8529, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.838825225830078, |
|
"learning_rate": 1.5901060070671377e-05, |
|
"loss": 2.6898, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 11.603575706481934, |
|
"learning_rate": 1.76678445229682e-05, |
|
"loss": 2.4395, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 13.830281257629395, |
|
"learning_rate": 1.9434628975265016e-05, |
|
"loss": 1.9708, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 13.014172554016113, |
|
"learning_rate": 2.120141342756184e-05, |
|
"loss": 1.5808, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 17.082054138183594, |
|
"learning_rate": 2.296819787985866e-05, |
|
"loss": 1.2151, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 17.318891525268555, |
|
"learning_rate": 2.473498233215548e-05, |
|
"loss": 0.8489, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 14.354799270629883, |
|
"learning_rate": 2.6501766784452298e-05, |
|
"loss": 0.6891, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 9.554667472839355, |
|
"learning_rate": 2.826855123674912e-05, |
|
"loss": 0.6101, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 8.692755699157715, |
|
"learning_rate": 3.003533568904594e-05, |
|
"loss": 0.4587, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 18.25060272216797, |
|
"learning_rate": 3.1802120141342755e-05, |
|
"loss": 0.436, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 10.215097427368164, |
|
"learning_rate": 3.356890459363958e-05, |
|
"loss": 0.2916, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 10.572564125061035, |
|
"learning_rate": 3.53356890459364e-05, |
|
"loss": 0.2545, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 18.01451301574707, |
|
"learning_rate": 3.710247349823322e-05, |
|
"loss": 0.3369, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 9.001435279846191, |
|
"learning_rate": 3.886925795053003e-05, |
|
"loss": 0.2184, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 11.199219703674316, |
|
"learning_rate": 4.063604240282686e-05, |
|
"loss": 0.2333, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 9.572484016418457, |
|
"learning_rate": 4.240282685512368e-05, |
|
"loss": 0.2238, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 11.255512237548828, |
|
"learning_rate": 4.416961130742049e-05, |
|
"loss": 0.2418, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 7.414256572723389, |
|
"learning_rate": 4.593639575971732e-05, |
|
"loss": 0.2201, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.147739410400391, |
|
"learning_rate": 4.7703180212014135e-05, |
|
"loss": 0.2054, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.774346828460693, |
|
"learning_rate": 4.946996466431096e-05, |
|
"loss": 0.1863, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 7.592128753662109, |
|
"learning_rate": 4.9862313139260423e-05, |
|
"loss": 0.1531, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 14.047577857971191, |
|
"learning_rate": 4.9665617623918175e-05, |
|
"loss": 0.1713, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 13.25577449798584, |
|
"learning_rate": 4.9468922108575926e-05, |
|
"loss": 0.2064, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 15.570125579833984, |
|
"learning_rate": 4.927222659323368e-05, |
|
"loss": 0.1782, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 7.168793201446533, |
|
"learning_rate": 4.907553107789143e-05, |
|
"loss": 0.1792, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 19.736141204833984, |
|
"learning_rate": 4.887883556254917e-05, |
|
"loss": 0.1774, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 11.251014709472656, |
|
"learning_rate": 4.8682140047206924e-05, |
|
"loss": 0.1933, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 14.66421890258789, |
|
"learning_rate": 4.8485444531864675e-05, |
|
"loss": 0.1554, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 7.186115741729736, |
|
"learning_rate": 4.8288749016522426e-05, |
|
"loss": 0.1577, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 11.209123611450195, |
|
"learning_rate": 4.809205350118017e-05, |
|
"loss": 0.1598, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 7.6711602210998535, |
|
"learning_rate": 4.789535798583792e-05, |
|
"loss": 0.1964, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 12.84682846069336, |
|
"learning_rate": 4.769866247049567e-05, |
|
"loss": 0.1597, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 6.741486072540283, |
|
"learning_rate": 4.7501966955153424e-05, |
|
"loss": 0.1085, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 12.747113227844238, |
|
"learning_rate": 4.7305271439811175e-05, |
|
"loss": 0.1448, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 15.717222213745117, |
|
"learning_rate": 4.7108575924468926e-05, |
|
"loss": 0.2451, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 6.885907173156738, |
|
"learning_rate": 4.691188040912668e-05, |
|
"loss": 0.1473, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 9.838655471801758, |
|
"learning_rate": 4.671518489378442e-05, |
|
"loss": 0.1295, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 5.5332932472229, |
|
"learning_rate": 4.651848937844217e-05, |
|
"loss": 0.1199, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 11.243609428405762, |
|
"learning_rate": 4.6321793863099924e-05, |
|
"loss": 0.2097, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.227767825126648, |
|
"learning_rate": 4.6125098347757675e-05, |
|
"loss": 0.142, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 8.73798942565918, |
|
"learning_rate": 4.5928402832415426e-05, |
|
"loss": 0.1749, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.825047969818115, |
|
"learning_rate": 4.573170731707318e-05, |
|
"loss": 0.1062, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 14.581511497497559, |
|
"learning_rate": 4.553501180173092e-05, |
|
"loss": 0.1786, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 8.596332550048828, |
|
"learning_rate": 4.533831628638867e-05, |
|
"loss": 0.1797, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 21.009435653686523, |
|
"learning_rate": 4.5141620771046424e-05, |
|
"loss": 0.141, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 11.336109161376953, |
|
"learning_rate": 4.4944925255704175e-05, |
|
"loss": 0.1304, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 8.8182954788208, |
|
"learning_rate": 4.4748229740361926e-05, |
|
"loss": 0.1208, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 14.062355041503906, |
|
"learning_rate": 4.455153422501967e-05, |
|
"loss": 0.166, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.9675843694493783, |
|
"eval_loss": 0.11483483016490936, |
|
"eval_runtime": 72.8258, |
|
"eval_samples_per_second": 30.923, |
|
"eval_steps_per_second": 3.872, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 3.4058330059051514, |
|
"learning_rate": 4.435483870967742e-05, |
|
"loss": 0.0858, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 4.461623191833496, |
|
"learning_rate": 4.415814319433517e-05, |
|
"loss": 0.1057, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 11.684330940246582, |
|
"learning_rate": 4.3961447678992924e-05, |
|
"loss": 0.0425, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 11.538973808288574, |
|
"learning_rate": 4.376475216365067e-05, |
|
"loss": 0.0799, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 20.21871566772461, |
|
"learning_rate": 4.356805664830842e-05, |
|
"loss": 0.0488, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 6.491779327392578, |
|
"learning_rate": 4.337136113296617e-05, |
|
"loss": 0.0461, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 20.6411075592041, |
|
"learning_rate": 4.317466561762392e-05, |
|
"loss": 0.1365, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 6.05819034576416, |
|
"learning_rate": 4.297797010228167e-05, |
|
"loss": 0.0707, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 5.697531223297119, |
|
"learning_rate": 4.278127458693942e-05, |
|
"loss": 0.0684, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 3.515834331512451, |
|
"learning_rate": 4.258457907159717e-05, |
|
"loss": 0.0809, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 7.478769779205322, |
|
"learning_rate": 4.238788355625492e-05, |
|
"loss": 0.0941, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.829246997833252, |
|
"learning_rate": 4.219118804091267e-05, |
|
"loss": 0.0903, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 1.0911270380020142, |
|
"learning_rate": 4.1994492525570416e-05, |
|
"loss": 0.0291, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.5587719678878784, |
|
"learning_rate": 4.179779701022817e-05, |
|
"loss": 0.0999, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.5585480332374573, |
|
"learning_rate": 4.160110149488592e-05, |
|
"loss": 0.0609, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 17.553932189941406, |
|
"learning_rate": 4.140440597954367e-05, |
|
"loss": 0.1024, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 5.286637306213379, |
|
"learning_rate": 4.1207710464201413e-05, |
|
"loss": 0.0903, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 3.5825843811035156, |
|
"learning_rate": 4.1011014948859165e-05, |
|
"loss": 0.086, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.051394496113061905, |
|
"learning_rate": 4.0814319433516916e-05, |
|
"loss": 0.0483, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 4.485952377319336, |
|
"learning_rate": 4.061762391817467e-05, |
|
"loss": 0.0641, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.9799467921257019, |
|
"learning_rate": 4.042092840283242e-05, |
|
"loss": 0.0225, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.5134735107421875, |
|
"learning_rate": 4.022423288749016e-05, |
|
"loss": 0.0875, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.2541639804840088, |
|
"learning_rate": 4.0027537372147914e-05, |
|
"loss": 0.031, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.1368063986301422, |
|
"learning_rate": 3.9830841856805665e-05, |
|
"loss": 0.1112, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 3.3988840579986572, |
|
"learning_rate": 3.9634146341463416e-05, |
|
"loss": 0.0806, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 23.861074447631836, |
|
"learning_rate": 3.943745082612117e-05, |
|
"loss": 0.1014, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 7.84724235534668, |
|
"learning_rate": 3.924075531077892e-05, |
|
"loss": 0.0532, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 3.9560492038726807, |
|
"learning_rate": 3.904405979543666e-05, |
|
"loss": 0.0824, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.0379438400268555, |
|
"learning_rate": 3.8847364280094414e-05, |
|
"loss": 0.1027, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.1322356462478638, |
|
"learning_rate": 3.8650668764752165e-05, |
|
"loss": 0.0466, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 14.02712631225586, |
|
"learning_rate": 3.8453973249409916e-05, |
|
"loss": 0.0871, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 21.739513397216797, |
|
"learning_rate": 3.825727773406767e-05, |
|
"loss": 0.0682, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 18.444072723388672, |
|
"learning_rate": 3.806058221872542e-05, |
|
"loss": 0.0942, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 9.762558937072754, |
|
"learning_rate": 3.786388670338317e-05, |
|
"loss": 0.071, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 10.741572380065918, |
|
"learning_rate": 3.7667191188040914e-05, |
|
"loss": 0.055, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 1.030358910560608, |
|
"learning_rate": 3.7470495672698665e-05, |
|
"loss": 0.0666, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.6305733919143677, |
|
"learning_rate": 3.7273800157356416e-05, |
|
"loss": 0.0523, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.391921043395996, |
|
"learning_rate": 3.707710464201417e-05, |
|
"loss": 0.0785, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 4.645728588104248, |
|
"learning_rate": 3.688040912667191e-05, |
|
"loss": 0.1018, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 13.808793067932129, |
|
"learning_rate": 3.668371361132966e-05, |
|
"loss": 0.0728, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.23640145361423492, |
|
"learning_rate": 3.6487018095987414e-05, |
|
"loss": 0.0652, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 3.5320382118225098, |
|
"learning_rate": 3.6290322580645165e-05, |
|
"loss": 0.0621, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.206480860710144, |
|
"learning_rate": 3.6093627065302916e-05, |
|
"loss": 0.0676, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 18.625965118408203, |
|
"learning_rate": 3.589693154996066e-05, |
|
"loss": 0.0715, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 3.9618349075317383, |
|
"learning_rate": 3.570023603461841e-05, |
|
"loss": 0.0814, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.8978772163391113, |
|
"learning_rate": 3.550354051927616e-05, |
|
"loss": 0.0383, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 11.628218650817871, |
|
"learning_rate": 3.5306845003933914e-05, |
|
"loss": 0.0768, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.9953187704086304, |
|
"learning_rate": 3.511014948859166e-05, |
|
"loss": 0.0649, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 15.428389549255371, |
|
"learning_rate": 3.491345397324941e-05, |
|
"loss": 0.0591, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.0453615188598633, |
|
"learning_rate": 3.471675845790716e-05, |
|
"loss": 0.0979, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 7.687002658843994, |
|
"learning_rate": 3.452006294256491e-05, |
|
"loss": 0.0681, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.8325603008270264, |
|
"learning_rate": 3.432336742722266e-05, |
|
"loss": 0.0322, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.8982272148132324, |
|
"learning_rate": 3.412667191188041e-05, |
|
"loss": 0.0468, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.832631826400757, |
|
"learning_rate": 3.392997639653816e-05, |
|
"loss": 0.034, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 6.040098190307617, |
|
"learning_rate": 3.373328088119591e-05, |
|
"loss": 0.0631, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.12215587496757507, |
|
"learning_rate": 3.353658536585366e-05, |
|
"loss": 0.0522, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 8.005010604858398, |
|
"learning_rate": 3.3339889850511406e-05, |
|
"loss": 0.0909, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9755772646536413, |
|
"eval_loss": 0.0889873057603836, |
|
"eval_runtime": 72.7301, |
|
"eval_samples_per_second": 30.964, |
|
"eval_steps_per_second": 3.877, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.5045525431632996, |
|
"learning_rate": 3.314319433516916e-05, |
|
"loss": 0.054, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 3.169762134552002, |
|
"learning_rate": 3.294649881982691e-05, |
|
"loss": 0.0138, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 12.394304275512695, |
|
"learning_rate": 3.274980330448466e-05, |
|
"loss": 0.025, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.7768550515174866, |
|
"learning_rate": 3.255310778914241e-05, |
|
"loss": 0.0136, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 23.059011459350586, |
|
"learning_rate": 3.2356412273800155e-05, |
|
"loss": 0.0487, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.4939569234848022, |
|
"learning_rate": 3.2159716758457906e-05, |
|
"loss": 0.0267, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.5693202018737793, |
|
"learning_rate": 3.196302124311566e-05, |
|
"loss": 0.0339, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.9168961644172668, |
|
"learning_rate": 3.176632572777341e-05, |
|
"loss": 0.0195, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 3.3607680797576904, |
|
"learning_rate": 3.156963021243116e-05, |
|
"loss": 0.0257, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 10.094879150390625, |
|
"learning_rate": 3.137293469708891e-05, |
|
"loss": 0.0143, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 9.125, |
|
"learning_rate": 3.1176239181746655e-05, |
|
"loss": 0.0295, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.2778412699699402, |
|
"learning_rate": 3.0979543666404406e-05, |
|
"loss": 0.0182, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.0662269592285156, |
|
"learning_rate": 3.078284815106216e-05, |
|
"loss": 0.0295, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.0518941730260849, |
|
"learning_rate": 3.058615263571991e-05, |
|
"loss": 0.048, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 3.4041025638580322, |
|
"learning_rate": 3.0389457120377656e-05, |
|
"loss": 0.0601, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.9103949666023254, |
|
"learning_rate": 3.0192761605035407e-05, |
|
"loss": 0.0191, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.8118720054626465, |
|
"learning_rate": 2.999606608969316e-05, |
|
"loss": 0.023, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.6234725117683411, |
|
"learning_rate": 2.9799370574350903e-05, |
|
"loss": 0.0248, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 18.03314781188965, |
|
"learning_rate": 2.9602675059008654e-05, |
|
"loss": 0.0694, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.3662589490413666, |
|
"learning_rate": 2.9405979543666405e-05, |
|
"loss": 0.0206, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.030597494915127754, |
|
"learning_rate": 2.9209284028324156e-05, |
|
"loss": 0.0291, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.9393526911735535, |
|
"learning_rate": 2.9012588512981904e-05, |
|
"loss": 0.0243, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 3.631523847579956, |
|
"learning_rate": 2.8815892997639655e-05, |
|
"loss": 0.0292, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.010743443854153156, |
|
"learning_rate": 2.8619197482297406e-05, |
|
"loss": 0.0453, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 16.26077651977539, |
|
"learning_rate": 2.8422501966955157e-05, |
|
"loss": 0.0221, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 3.7162559032440186, |
|
"learning_rate": 2.822580645161291e-05, |
|
"loss": 0.0278, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.568585991859436, |
|
"learning_rate": 2.8029110936270653e-05, |
|
"loss": 0.0234, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.43441635370254517, |
|
"learning_rate": 2.7832415420928404e-05, |
|
"loss": 0.0187, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 2.6314618587493896, |
|
"learning_rate": 2.7635719905586155e-05, |
|
"loss": 0.0235, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.020479127764701843, |
|
"learning_rate": 2.7439024390243906e-05, |
|
"loss": 0.0102, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.40051591396331787, |
|
"learning_rate": 2.724232887490165e-05, |
|
"loss": 0.0036, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 8.731142044067383, |
|
"learning_rate": 2.7045633359559402e-05, |
|
"loss": 0.0412, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 5.808520317077637, |
|
"learning_rate": 2.6848937844217153e-05, |
|
"loss": 0.0333, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.15826575458049774, |
|
"learning_rate": 2.6652242328874904e-05, |
|
"loss": 0.0381, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.659451484680176, |
|
"learning_rate": 2.645554681353265e-05, |
|
"loss": 0.0188, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.06260908395051956, |
|
"learning_rate": 2.62588512981904e-05, |
|
"loss": 0.0141, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.15780910849571228, |
|
"learning_rate": 2.606215578284815e-05, |
|
"loss": 0.0146, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.506399631500244, |
|
"learning_rate": 2.5865460267505902e-05, |
|
"loss": 0.0138, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.11960842460393906, |
|
"learning_rate": 2.5668764752163653e-05, |
|
"loss": 0.0028, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 22.104814529418945, |
|
"learning_rate": 2.54720692368214e-05, |
|
"loss": 0.0279, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.6916127800941467, |
|
"learning_rate": 2.5275373721479152e-05, |
|
"loss": 0.0152, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 4.406822204589844, |
|
"learning_rate": 2.5078678206136903e-05, |
|
"loss": 0.0064, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.5762219429016113, |
|
"learning_rate": 2.488198269079465e-05, |
|
"loss": 0.0257, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.5382003784179688, |
|
"learning_rate": 2.4685287175452402e-05, |
|
"loss": 0.0049, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.4372711181640625, |
|
"learning_rate": 2.448859166011015e-05, |
|
"loss": 0.0442, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 6.409371852874756, |
|
"learning_rate": 2.42918961447679e-05, |
|
"loss": 0.055, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.15078137814998627, |
|
"learning_rate": 2.4095200629425652e-05, |
|
"loss": 0.0228, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 4.177145481109619, |
|
"learning_rate": 2.38985051140834e-05, |
|
"loss": 0.0193, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.13254141807556152, |
|
"learning_rate": 2.370180959874115e-05, |
|
"loss": 0.0138, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.5547938942909241, |
|
"learning_rate": 2.35051140833989e-05, |
|
"loss": 0.003, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.550114393234253, |
|
"learning_rate": 2.330841856805665e-05, |
|
"loss": 0.0156, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.4502694606781006, |
|
"learning_rate": 2.3111723052714398e-05, |
|
"loss": 0.0284, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.34310609102249146, |
|
"learning_rate": 2.291502753737215e-05, |
|
"loss": 0.0318, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.06068241968750954, |
|
"learning_rate": 2.2718332022029897e-05, |
|
"loss": 0.0171, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 1.1032580137252808, |
|
"learning_rate": 2.2521636506687648e-05, |
|
"loss": 0.0062, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 16.99115562438965, |
|
"learning_rate": 2.2324940991345396e-05, |
|
"loss": 0.018, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9755772646536413, |
|
"eval_loss": 0.0940345823764801, |
|
"eval_runtime": 73.2079, |
|
"eval_samples_per_second": 30.762, |
|
"eval_steps_per_second": 3.852, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 18.221080780029297, |
|
"learning_rate": 2.2128245476003147e-05, |
|
"loss": 0.018, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 1.597777009010315, |
|
"learning_rate": 2.1931549960660898e-05, |
|
"loss": 0.0095, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.460238516330719, |
|
"learning_rate": 2.1734854445318646e-05, |
|
"loss": 0.0135, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.1700868457555771, |
|
"learning_rate": 2.1538158929976397e-05, |
|
"loss": 0.0149, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 4.091975212097168, |
|
"learning_rate": 2.134146341463415e-05, |
|
"loss": 0.0163, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.004883296322077513, |
|
"learning_rate": 2.11447678992919e-05, |
|
"loss": 0.0022, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.9401123523712158, |
|
"learning_rate": 2.0948072383949647e-05, |
|
"loss": 0.0012, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 4.135828018188477, |
|
"learning_rate": 2.07513768686074e-05, |
|
"loss": 0.0201, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.030084745958447456, |
|
"learning_rate": 2.0554681353265146e-05, |
|
"loss": 0.0013, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 15.494827270507812, |
|
"learning_rate": 2.0357985837922897e-05, |
|
"loss": 0.0067, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 1.0570249557495117, |
|
"learning_rate": 2.0161290322580645e-05, |
|
"loss": 0.015, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.5005541443824768, |
|
"learning_rate": 1.9964594807238396e-05, |
|
"loss": 0.0025, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 3.019387722015381, |
|
"learning_rate": 1.9767899291896147e-05, |
|
"loss": 0.0112, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.19258318841457367, |
|
"learning_rate": 1.9571203776553895e-05, |
|
"loss": 0.0021, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 2.9886810779571533, |
|
"learning_rate": 1.9374508261211646e-05, |
|
"loss": 0.002, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.012368579395115376, |
|
"learning_rate": 1.9177812745869394e-05, |
|
"loss": 0.0012, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.023176291957497597, |
|
"learning_rate": 1.8981117230527145e-05, |
|
"loss": 0.0456, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.06562870740890503, |
|
"learning_rate": 1.8784421715184893e-05, |
|
"loss": 0.0069, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 17.193941116333008, |
|
"learning_rate": 1.8587726199842644e-05, |
|
"loss": 0.0133, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.08079444617033005, |
|
"learning_rate": 1.8391030684500392e-05, |
|
"loss": 0.0162, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.4867687225341797, |
|
"learning_rate": 1.8194335169158143e-05, |
|
"loss": 0.0028, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.10314597189426422, |
|
"learning_rate": 1.799763965381589e-05, |
|
"loss": 0.0036, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.03035634197294712, |
|
"learning_rate": 1.7800944138473642e-05, |
|
"loss": 0.0018, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.03287611901760101, |
|
"learning_rate": 1.7604248623131393e-05, |
|
"loss": 0.011, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.009028772823512554, |
|
"learning_rate": 1.7407553107789144e-05, |
|
"loss": 0.0055, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.11215109378099442, |
|
"learning_rate": 1.7210857592446896e-05, |
|
"loss": 0.0033, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.18691633641719818, |
|
"learning_rate": 1.7014162077104643e-05, |
|
"loss": 0.0157, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 10.645195007324219, |
|
"learning_rate": 1.6817466561762395e-05, |
|
"loss": 0.0056, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.3934668004512787, |
|
"learning_rate": 1.6620771046420142e-05, |
|
"loss": 0.0023, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.01680024527013302, |
|
"learning_rate": 1.6424075531077893e-05, |
|
"loss": 0.0058, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.6062192916870117, |
|
"learning_rate": 1.622738001573564e-05, |
|
"loss": 0.004, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.7066389918327332, |
|
"learning_rate": 1.6030684500393392e-05, |
|
"loss": 0.0094, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.20641224086284637, |
|
"learning_rate": 1.583398898505114e-05, |
|
"loss": 0.005, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.005134557373821735, |
|
"learning_rate": 1.563729346970889e-05, |
|
"loss": 0.0196, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.09725712984800339, |
|
"learning_rate": 1.5440597954366642e-05, |
|
"loss": 0.0093, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.01703699305653572, |
|
"learning_rate": 1.524390243902439e-05, |
|
"loss": 0.0173, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 8.595245361328125, |
|
"learning_rate": 1.5047206923682141e-05, |
|
"loss": 0.0067, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 1.1955232620239258, |
|
"learning_rate": 1.485051140833989e-05, |
|
"loss": 0.0057, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.026586467400193214, |
|
"learning_rate": 1.465381589299764e-05, |
|
"loss": 0.0033, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.07073145359754562, |
|
"learning_rate": 1.445712037765539e-05, |
|
"loss": 0.0098, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 13.662938117980957, |
|
"learning_rate": 1.4260424862313141e-05, |
|
"loss": 0.0099, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 7.773074626922607, |
|
"learning_rate": 1.4063729346970889e-05, |
|
"loss": 0.0147, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.08416301012039185, |
|
"learning_rate": 1.386703383162864e-05, |
|
"loss": 0.0015, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.09213215857744217, |
|
"learning_rate": 1.3670338316286388e-05, |
|
"loss": 0.0011, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.08238010108470917, |
|
"learning_rate": 1.3473642800944139e-05, |
|
"loss": 0.0067, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.2930988073348999, |
|
"learning_rate": 1.327694728560189e-05, |
|
"loss": 0.0027, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.6446830630302429, |
|
"learning_rate": 1.3080251770259638e-05, |
|
"loss": 0.0021, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.04108636826276779, |
|
"learning_rate": 1.2883556254917389e-05, |
|
"loss": 0.0116, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.3955759108066559, |
|
"learning_rate": 1.2686860739575138e-05, |
|
"loss": 0.0132, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.10534235090017319, |
|
"learning_rate": 1.249016522423289e-05, |
|
"loss": 0.0014, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.9984220266342163, |
|
"learning_rate": 1.2293469708890639e-05, |
|
"loss": 0.016, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.05933081731200218, |
|
"learning_rate": 1.2096774193548388e-05, |
|
"loss": 0.0024, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.3307284414768219, |
|
"learning_rate": 1.1900078678206138e-05, |
|
"loss": 0.0118, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.01056114211678505, |
|
"learning_rate": 1.1703383162863887e-05, |
|
"loss": 0.0054, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.0951002761721611, |
|
"learning_rate": 1.1506687647521637e-05, |
|
"loss": 0.0059, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.008732125163078308, |
|
"learning_rate": 1.1309992132179386e-05, |
|
"loss": 0.002, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.568053722381592, |
|
"learning_rate": 1.1113296616837136e-05, |
|
"loss": 0.0246, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9791296625222025, |
|
"eval_loss": 0.08105943351984024, |
|
"eval_runtime": 71.9238, |
|
"eval_samples_per_second": 31.311, |
|
"eval_steps_per_second": 3.921, |
|
"step": 2262 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.3494608700275421, |
|
"learning_rate": 1.0916601101494885e-05, |
|
"loss": 0.0029, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.4306102991104126, |
|
"learning_rate": 1.0719905586152636e-05, |
|
"loss": 0.0037, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 2.145603656768799, |
|
"learning_rate": 1.0523210070810386e-05, |
|
"loss": 0.0063, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.0036911088973283768, |
|
"learning_rate": 1.0326514555468137e-05, |
|
"loss": 0.0022, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.032683487981557846, |
|
"learning_rate": 1.0129819040125886e-05, |
|
"loss": 0.0062, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.007357526570558548, |
|
"learning_rate": 9.933123524783636e-06, |
|
"loss": 0.0059, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.052719421684741974, |
|
"learning_rate": 9.736428009441385e-06, |
|
"loss": 0.0007, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.0456971600651741, |
|
"learning_rate": 9.539732494099135e-06, |
|
"loss": 0.0083, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.029169630259275436, |
|
"learning_rate": 9.343036978756884e-06, |
|
"loss": 0.0075, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.05413562431931496, |
|
"learning_rate": 9.146341463414634e-06, |
|
"loss": 0.0025, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 8.87646198272705, |
|
"learning_rate": 8.949645948072383e-06, |
|
"loss": 0.0104, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 1.6246185302734375, |
|
"learning_rate": 8.752950432730134e-06, |
|
"loss": 0.001, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.21518048644065857, |
|
"learning_rate": 8.556254917387884e-06, |
|
"loss": 0.0073, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.6136273741722107, |
|
"learning_rate": 8.359559402045635e-06, |
|
"loss": 0.008, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.013673730194568634, |
|
"learning_rate": 8.162863886703385e-06, |
|
"loss": 0.0003, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.013138936832547188, |
|
"learning_rate": 7.966168371361134e-06, |
|
"loss": 0.0016, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.5832827687263489, |
|
"learning_rate": 7.769472856018883e-06, |
|
"loss": 0.0016, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.013815794140100479, |
|
"learning_rate": 7.572777340676633e-06, |
|
"loss": 0.0093, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 2.6205365657806396, |
|
"learning_rate": 7.376081825334382e-06, |
|
"loss": 0.0247, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.008757353760302067, |
|
"learning_rate": 7.179386309992133e-06, |
|
"loss": 0.0004, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.5439887642860413, |
|
"learning_rate": 6.982690794649882e-06, |
|
"loss": 0.0011, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 3.330894708633423, |
|
"learning_rate": 6.785995279307632e-06, |
|
"loss": 0.0027, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.24116189777851105, |
|
"learning_rate": 6.589299763965381e-06, |
|
"loss": 0.001, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.19894400238990784, |
|
"learning_rate": 6.392604248623131e-06, |
|
"loss": 0.0096, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.08030141144990921, |
|
"learning_rate": 6.195908733280882e-06, |
|
"loss": 0.0003, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.06319218873977661, |
|
"learning_rate": 5.999213217938631e-06, |
|
"loss": 0.0075, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.0329049713909626, |
|
"learning_rate": 5.802517702596381e-06, |
|
"loss": 0.0123, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.07301346212625504, |
|
"learning_rate": 5.605822187254131e-06, |
|
"loss": 0.0037, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 2.7243146896362305, |
|
"learning_rate": 5.40912667191188e-06, |
|
"loss": 0.009, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.002613728167489171, |
|
"learning_rate": 5.212431156569631e-06, |
|
"loss": 0.0211, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.08672753721475601, |
|
"learning_rate": 5.01573564122738e-06, |
|
"loss": 0.0003, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.7189329862594604, |
|
"learning_rate": 4.81904012588513e-06, |
|
"loss": 0.0017, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.021875105798244476, |
|
"learning_rate": 4.62234461054288e-06, |
|
"loss": 0.0036, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.06674761325120926, |
|
"learning_rate": 4.425649095200629e-06, |
|
"loss": 0.0016, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 1.5341488122940063, |
|
"learning_rate": 4.22895357985838e-06, |
|
"loss": 0.0123, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.6225674152374268, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.0071, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.04396051913499832, |
|
"learning_rate": 3.835562549173879e-06, |
|
"loss": 0.0061, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.024606185033917427, |
|
"learning_rate": 3.638867033831629e-06, |
|
"loss": 0.0034, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.7400041818618774, |
|
"learning_rate": 3.442171518489379e-06, |
|
"loss": 0.0022, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.105620838701725, |
|
"learning_rate": 3.2454760031471283e-06, |
|
"loss": 0.0203, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.011251443065702915, |
|
"learning_rate": 3.0487804878048782e-06, |
|
"loss": 0.0047, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.03630177304148674, |
|
"learning_rate": 2.852084972462628e-06, |
|
"loss": 0.004, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.03339725360274315, |
|
"learning_rate": 2.655389457120378e-06, |
|
"loss": 0.0011, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 4.932060718536377, |
|
"learning_rate": 2.4586939417781275e-06, |
|
"loss": 0.0032, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.20462249219417572, |
|
"learning_rate": 2.2619984264358773e-06, |
|
"loss": 0.0008, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.01447032019495964, |
|
"learning_rate": 2.0653029110936272e-06, |
|
"loss": 0.003, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.012796329334378242, |
|
"learning_rate": 1.868607395751377e-06, |
|
"loss": 0.0015, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 2.1272308826446533, |
|
"learning_rate": 1.6719118804091268e-06, |
|
"loss": 0.002, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.049883559346199036, |
|
"learning_rate": 1.4752163650668765e-06, |
|
"loss": 0.0008, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.860205054283142, |
|
"learning_rate": 1.2785208497246264e-06, |
|
"loss": 0.0039, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.012104855850338936, |
|
"learning_rate": 1.0818253343823763e-06, |
|
"loss": 0.0083, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.03279277682304382, |
|
"learning_rate": 8.85129819040126e-07, |
|
"loss": 0.0006, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.6932834386825562, |
|
"learning_rate": 6.884343036978757e-07, |
|
"loss": 0.0035, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.037294335663318634, |
|
"learning_rate": 4.917387883556255e-07, |
|
"loss": 0.0097, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.10355295240879059, |
|
"learning_rate": 2.9504327301337533e-07, |
|
"loss": 0.0006, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 3.9190878868103027, |
|
"learning_rate": 9.834775767112511e-08, |
|
"loss": 0.0175, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9795737122557726, |
|
"eval_loss": 0.08262032270431519, |
|
"eval_runtime": 72.9002, |
|
"eval_samples_per_second": 30.892, |
|
"eval_steps_per_second": 3.868, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 2825, |
|
"total_flos": 2.2478285568521503e+18, |
|
"train_loss": 0.1798970705407581, |
|
"train_runtime": 3694.2951, |
|
"train_samples_per_second": 24.484, |
|
"train_steps_per_second": 0.765 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2825, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 2.2478285568521503e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|