{ "best_metric": 0.9795737122557726, "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-ibird/checkpoint-2825", "epoch": 4.995579133510168, "eval_steps": 500, "global_step": 2825, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 10.971182823181152, "learning_rate": 1.76678445229682e-06, "loss": 3.1809, "step": 10 }, { "epoch": 0.04, "grad_norm": 6.421601295471191, "learning_rate": 3.53356890459364e-06, "loss": 3.2061, "step": 20 }, { "epoch": 0.05, "grad_norm": 9.90295124053955, "learning_rate": 5.30035335689046e-06, "loss": 3.1752, "step": 30 }, { "epoch": 0.07, "grad_norm": 6.587502479553223, "learning_rate": 7.06713780918728e-06, "loss": 3.1217, "step": 40 }, { "epoch": 0.09, "grad_norm": 5.991631984710693, "learning_rate": 8.8339222614841e-06, "loss": 3.0881, "step": 50 }, { "epoch": 0.11, "grad_norm": 18.684083938598633, "learning_rate": 1.060070671378092e-05, "loss": 3.0575, "step": 60 }, { "epoch": 0.12, "grad_norm": 9.924698829650879, "learning_rate": 1.236749116607774e-05, "loss": 2.9739, "step": 70 }, { "epoch": 0.14, "grad_norm": 8.268143653869629, "learning_rate": 1.413427561837456e-05, "loss": 2.8529, "step": 80 }, { "epoch": 0.16, "grad_norm": 8.838825225830078, "learning_rate": 1.5901060070671377e-05, "loss": 2.6898, "step": 90 }, { "epoch": 0.18, "grad_norm": 11.603575706481934, "learning_rate": 1.76678445229682e-05, "loss": 2.4395, "step": 100 }, { "epoch": 0.19, "grad_norm": 13.830281257629395, "learning_rate": 1.9434628975265016e-05, "loss": 1.9708, "step": 110 }, { "epoch": 0.21, "grad_norm": 13.014172554016113, "learning_rate": 2.120141342756184e-05, "loss": 1.5808, "step": 120 }, { "epoch": 0.23, "grad_norm": 17.082054138183594, "learning_rate": 2.296819787985866e-05, "loss": 1.2151, "step": 130 }, { "epoch": 0.25, "grad_norm": 17.318891525268555, "learning_rate": 2.473498233215548e-05, "loss": 0.8489, "step": 140 }, { "epoch": 0.27, "grad_norm": 14.354799270629883, "learning_rate": 2.6501766784452298e-05, "loss": 0.6891, "step": 150 }, { "epoch": 0.28, "grad_norm": 9.554667472839355, "learning_rate": 2.826855123674912e-05, "loss": 0.6101, "step": 160 }, { "epoch": 0.3, "grad_norm": 8.692755699157715, "learning_rate": 3.003533568904594e-05, "loss": 0.4587, "step": 170 }, { "epoch": 0.32, "grad_norm": 18.25060272216797, "learning_rate": 3.1802120141342755e-05, "loss": 0.436, "step": 180 }, { "epoch": 0.34, "grad_norm": 10.215097427368164, "learning_rate": 3.356890459363958e-05, "loss": 0.2916, "step": 190 }, { "epoch": 0.35, "grad_norm": 10.572564125061035, "learning_rate": 3.53356890459364e-05, "loss": 0.2545, "step": 200 }, { "epoch": 0.37, "grad_norm": 18.01451301574707, "learning_rate": 3.710247349823322e-05, "loss": 0.3369, "step": 210 }, { "epoch": 0.39, "grad_norm": 9.001435279846191, "learning_rate": 3.886925795053003e-05, "loss": 0.2184, "step": 220 }, { "epoch": 0.41, "grad_norm": 11.199219703674316, "learning_rate": 4.063604240282686e-05, "loss": 0.2333, "step": 230 }, { "epoch": 0.42, "grad_norm": 9.572484016418457, "learning_rate": 4.240282685512368e-05, "loss": 0.2238, "step": 240 }, { "epoch": 0.44, "grad_norm": 11.255512237548828, "learning_rate": 4.416961130742049e-05, "loss": 0.2418, "step": 250 }, { "epoch": 0.46, "grad_norm": 7.414256572723389, "learning_rate": 4.593639575971732e-05, "loss": 0.2201, "step": 260 }, { "epoch": 0.48, "grad_norm": 4.147739410400391, "learning_rate": 4.7703180212014135e-05, "loss": 0.2054, "step": 270 }, { "epoch": 0.5, "grad_norm": 4.774346828460693, "learning_rate": 4.946996466431096e-05, "loss": 0.1863, "step": 280 }, { "epoch": 0.51, "grad_norm": 7.592128753662109, "learning_rate": 4.9862313139260423e-05, "loss": 0.1531, "step": 290 }, { "epoch": 0.53, "grad_norm": 14.047577857971191, "learning_rate": 4.9665617623918175e-05, "loss": 0.1713, "step": 300 }, { "epoch": 0.55, "grad_norm": 13.25577449798584, "learning_rate": 4.9468922108575926e-05, "loss": 0.2064, "step": 310 }, { "epoch": 0.57, "grad_norm": 15.570125579833984, "learning_rate": 4.927222659323368e-05, "loss": 0.1782, "step": 320 }, { "epoch": 0.58, "grad_norm": 7.168793201446533, "learning_rate": 4.907553107789143e-05, "loss": 0.1792, "step": 330 }, { "epoch": 0.6, "grad_norm": 19.736141204833984, "learning_rate": 4.887883556254917e-05, "loss": 0.1774, "step": 340 }, { "epoch": 0.62, "grad_norm": 11.251014709472656, "learning_rate": 4.8682140047206924e-05, "loss": 0.1933, "step": 350 }, { "epoch": 0.64, "grad_norm": 14.66421890258789, "learning_rate": 4.8485444531864675e-05, "loss": 0.1554, "step": 360 }, { "epoch": 0.65, "grad_norm": 7.186115741729736, "learning_rate": 4.8288749016522426e-05, "loss": 0.1577, "step": 370 }, { "epoch": 0.67, "grad_norm": 11.209123611450195, "learning_rate": 4.809205350118017e-05, "loss": 0.1598, "step": 380 }, { "epoch": 0.69, "grad_norm": 7.6711602210998535, "learning_rate": 4.789535798583792e-05, "loss": 0.1964, "step": 390 }, { "epoch": 0.71, "grad_norm": 12.84682846069336, "learning_rate": 4.769866247049567e-05, "loss": 0.1597, "step": 400 }, { "epoch": 0.73, "grad_norm": 6.741486072540283, "learning_rate": 4.7501966955153424e-05, "loss": 0.1085, "step": 410 }, { "epoch": 0.74, "grad_norm": 12.747113227844238, "learning_rate": 4.7305271439811175e-05, "loss": 0.1448, "step": 420 }, { "epoch": 0.76, "grad_norm": 15.717222213745117, "learning_rate": 4.7108575924468926e-05, "loss": 0.2451, "step": 430 }, { "epoch": 0.78, "grad_norm": 6.885907173156738, "learning_rate": 4.691188040912668e-05, "loss": 0.1473, "step": 440 }, { "epoch": 0.8, "grad_norm": 9.838655471801758, "learning_rate": 4.671518489378442e-05, "loss": 0.1295, "step": 450 }, { "epoch": 0.81, "grad_norm": 5.5332932472229, "learning_rate": 4.651848937844217e-05, "loss": 0.1199, "step": 460 }, { "epoch": 0.83, "grad_norm": 11.243609428405762, "learning_rate": 4.6321793863099924e-05, "loss": 0.2097, "step": 470 }, { "epoch": 0.85, "grad_norm": 1.227767825126648, "learning_rate": 4.6125098347757675e-05, "loss": 0.142, "step": 480 }, { "epoch": 0.87, "grad_norm": 8.73798942565918, "learning_rate": 4.5928402832415426e-05, "loss": 0.1749, "step": 490 }, { "epoch": 0.88, "grad_norm": 4.825047969818115, "learning_rate": 4.573170731707318e-05, "loss": 0.1062, "step": 500 }, { "epoch": 0.9, "grad_norm": 14.581511497497559, "learning_rate": 4.553501180173092e-05, "loss": 0.1786, "step": 510 }, { "epoch": 0.92, "grad_norm": 8.596332550048828, "learning_rate": 4.533831628638867e-05, "loss": 0.1797, "step": 520 }, { "epoch": 0.94, "grad_norm": 21.009435653686523, "learning_rate": 4.5141620771046424e-05, "loss": 0.141, "step": 530 }, { "epoch": 0.95, "grad_norm": 11.336109161376953, "learning_rate": 4.4944925255704175e-05, "loss": 0.1304, "step": 540 }, { "epoch": 0.97, "grad_norm": 8.8182954788208, "learning_rate": 4.4748229740361926e-05, "loss": 0.1208, "step": 550 }, { "epoch": 0.99, "grad_norm": 14.062355041503906, "learning_rate": 4.455153422501967e-05, "loss": 0.166, "step": 560 }, { "epoch": 1.0, "eval_accuracy": 0.9675843694493783, "eval_loss": 0.11483483016490936, "eval_runtime": 72.8258, "eval_samples_per_second": 30.923, "eval_steps_per_second": 3.872, "step": 565 }, { "epoch": 1.01, "grad_norm": 3.4058330059051514, "learning_rate": 4.435483870967742e-05, "loss": 0.0858, "step": 570 }, { "epoch": 1.03, "grad_norm": 4.461623191833496, "learning_rate": 4.415814319433517e-05, "loss": 0.1057, "step": 580 }, { "epoch": 1.04, "grad_norm": 11.684330940246582, "learning_rate": 4.3961447678992924e-05, "loss": 0.0425, "step": 590 }, { "epoch": 1.06, "grad_norm": 11.538973808288574, "learning_rate": 4.376475216365067e-05, "loss": 0.0799, "step": 600 }, { "epoch": 1.08, "grad_norm": 20.21871566772461, "learning_rate": 4.356805664830842e-05, "loss": 0.0488, "step": 610 }, { "epoch": 1.1, "grad_norm": 6.491779327392578, "learning_rate": 4.337136113296617e-05, "loss": 0.0461, "step": 620 }, { "epoch": 1.11, "grad_norm": 20.6411075592041, "learning_rate": 4.317466561762392e-05, "loss": 0.1365, "step": 630 }, { "epoch": 1.13, "grad_norm": 6.05819034576416, "learning_rate": 4.297797010228167e-05, "loss": 0.0707, "step": 640 }, { "epoch": 1.15, "grad_norm": 5.697531223297119, "learning_rate": 4.278127458693942e-05, "loss": 0.0684, "step": 650 }, { "epoch": 1.17, "grad_norm": 3.515834331512451, "learning_rate": 4.258457907159717e-05, "loss": 0.0809, "step": 660 }, { "epoch": 1.18, "grad_norm": 7.478769779205322, "learning_rate": 4.238788355625492e-05, "loss": 0.0941, "step": 670 }, { "epoch": 1.2, "grad_norm": 1.829246997833252, "learning_rate": 4.219118804091267e-05, "loss": 0.0903, "step": 680 }, { "epoch": 1.22, "grad_norm": 1.0911270380020142, "learning_rate": 4.1994492525570416e-05, "loss": 0.0291, "step": 690 }, { "epoch": 1.24, "grad_norm": 1.5587719678878784, "learning_rate": 4.179779701022817e-05, "loss": 0.0999, "step": 700 }, { "epoch": 1.26, "grad_norm": 0.5585480332374573, "learning_rate": 4.160110149488592e-05, "loss": 0.0609, "step": 710 }, { "epoch": 1.27, "grad_norm": 17.553932189941406, "learning_rate": 4.140440597954367e-05, "loss": 0.1024, "step": 720 }, { "epoch": 1.29, "grad_norm": 5.286637306213379, "learning_rate": 4.1207710464201413e-05, "loss": 0.0903, "step": 730 }, { "epoch": 1.31, "grad_norm": 3.5825843811035156, "learning_rate": 4.1011014948859165e-05, "loss": 0.086, "step": 740 }, { "epoch": 1.33, "grad_norm": 0.051394496113061905, "learning_rate": 4.0814319433516916e-05, "loss": 0.0483, "step": 750 }, { "epoch": 1.34, "grad_norm": 4.485952377319336, "learning_rate": 4.061762391817467e-05, "loss": 0.0641, "step": 760 }, { "epoch": 1.36, "grad_norm": 0.9799467921257019, "learning_rate": 4.042092840283242e-05, "loss": 0.0225, "step": 770 }, { "epoch": 1.38, "grad_norm": 0.5134735107421875, "learning_rate": 4.022423288749016e-05, "loss": 0.0875, "step": 780 }, { "epoch": 1.4, "grad_norm": 1.2541639804840088, "learning_rate": 4.0027537372147914e-05, "loss": 0.031, "step": 790 }, { "epoch": 1.41, "grad_norm": 0.1368063986301422, "learning_rate": 3.9830841856805665e-05, "loss": 0.1112, "step": 800 }, { "epoch": 1.43, "grad_norm": 3.3988840579986572, "learning_rate": 3.9634146341463416e-05, "loss": 0.0806, "step": 810 }, { "epoch": 1.45, "grad_norm": 23.861074447631836, "learning_rate": 3.943745082612117e-05, "loss": 0.1014, "step": 820 }, { "epoch": 1.47, "grad_norm": 7.84724235534668, "learning_rate": 3.924075531077892e-05, "loss": 0.0532, "step": 830 }, { "epoch": 1.49, "grad_norm": 3.9560492038726807, "learning_rate": 3.904405979543666e-05, "loss": 0.0824, "step": 840 }, { "epoch": 1.5, "grad_norm": 4.0379438400268555, "learning_rate": 3.8847364280094414e-05, "loss": 0.1027, "step": 850 }, { "epoch": 1.52, "grad_norm": 1.1322356462478638, "learning_rate": 3.8650668764752165e-05, "loss": 0.0466, "step": 860 }, { "epoch": 1.54, "grad_norm": 14.02712631225586, "learning_rate": 3.8453973249409916e-05, "loss": 0.0871, "step": 870 }, { "epoch": 1.56, "grad_norm": 21.739513397216797, "learning_rate": 3.825727773406767e-05, "loss": 0.0682, "step": 880 }, { "epoch": 1.57, "grad_norm": 18.444072723388672, "learning_rate": 3.806058221872542e-05, "loss": 0.0942, "step": 890 }, { "epoch": 1.59, "grad_norm": 9.762558937072754, "learning_rate": 3.786388670338317e-05, "loss": 0.071, "step": 900 }, { "epoch": 1.61, "grad_norm": 10.741572380065918, "learning_rate": 3.7667191188040914e-05, "loss": 0.055, "step": 910 }, { "epoch": 1.63, "grad_norm": 1.030358910560608, "learning_rate": 3.7470495672698665e-05, "loss": 0.0666, "step": 920 }, { "epoch": 1.64, "grad_norm": 0.6305733919143677, "learning_rate": 3.7273800157356416e-05, "loss": 0.0523, "step": 930 }, { "epoch": 1.66, "grad_norm": 2.391921043395996, "learning_rate": 3.707710464201417e-05, "loss": 0.0785, "step": 940 }, { "epoch": 1.68, "grad_norm": 4.645728588104248, "learning_rate": 3.688040912667191e-05, "loss": 0.1018, "step": 950 }, { "epoch": 1.7, "grad_norm": 13.808793067932129, "learning_rate": 3.668371361132966e-05, "loss": 0.0728, "step": 960 }, { "epoch": 1.72, "grad_norm": 0.23640145361423492, "learning_rate": 3.6487018095987414e-05, "loss": 0.0652, "step": 970 }, { "epoch": 1.73, "grad_norm": 3.5320382118225098, "learning_rate": 3.6290322580645165e-05, "loss": 0.0621, "step": 980 }, { "epoch": 1.75, "grad_norm": 1.206480860710144, "learning_rate": 3.6093627065302916e-05, "loss": 0.0676, "step": 990 }, { "epoch": 1.77, "grad_norm": 18.625965118408203, "learning_rate": 3.589693154996066e-05, "loss": 0.0715, "step": 1000 }, { "epoch": 1.79, "grad_norm": 3.9618349075317383, "learning_rate": 3.570023603461841e-05, "loss": 0.0814, "step": 1010 }, { "epoch": 1.8, "grad_norm": 2.8978772163391113, "learning_rate": 3.550354051927616e-05, "loss": 0.0383, "step": 1020 }, { "epoch": 1.82, "grad_norm": 11.628218650817871, "learning_rate": 3.5306845003933914e-05, "loss": 0.0768, "step": 1030 }, { "epoch": 1.84, "grad_norm": 1.9953187704086304, "learning_rate": 3.511014948859166e-05, "loss": 0.0649, "step": 1040 }, { "epoch": 1.86, "grad_norm": 15.428389549255371, "learning_rate": 3.491345397324941e-05, "loss": 0.0591, "step": 1050 }, { "epoch": 1.87, "grad_norm": 2.0453615188598633, "learning_rate": 3.471675845790716e-05, "loss": 0.0979, "step": 1060 }, { "epoch": 1.89, "grad_norm": 7.687002658843994, "learning_rate": 3.452006294256491e-05, "loss": 0.0681, "step": 1070 }, { "epoch": 1.91, "grad_norm": 2.8325603008270264, "learning_rate": 3.432336742722266e-05, "loss": 0.0322, "step": 1080 }, { "epoch": 1.93, "grad_norm": 2.8982272148132324, "learning_rate": 3.412667191188041e-05, "loss": 0.0468, "step": 1090 }, { "epoch": 1.95, "grad_norm": 2.832631826400757, "learning_rate": 3.392997639653816e-05, "loss": 0.034, "step": 1100 }, { "epoch": 1.96, "grad_norm": 6.040098190307617, "learning_rate": 3.373328088119591e-05, "loss": 0.0631, "step": 1110 }, { "epoch": 1.98, "grad_norm": 0.12215587496757507, "learning_rate": 3.353658536585366e-05, "loss": 0.0522, "step": 1120 }, { "epoch": 2.0, "grad_norm": 8.005010604858398, "learning_rate": 3.3339889850511406e-05, "loss": 0.0909, "step": 1130 }, { "epoch": 2.0, "eval_accuracy": 0.9755772646536413, "eval_loss": 0.0889873057603836, "eval_runtime": 72.7301, "eval_samples_per_second": 30.964, "eval_steps_per_second": 3.877, "step": 1131 }, { "epoch": 2.02, "grad_norm": 0.5045525431632996, "learning_rate": 3.314319433516916e-05, "loss": 0.054, "step": 1140 }, { "epoch": 2.03, "grad_norm": 3.169762134552002, "learning_rate": 3.294649881982691e-05, "loss": 0.0138, "step": 1150 }, { "epoch": 2.05, "grad_norm": 12.394304275512695, "learning_rate": 3.274980330448466e-05, "loss": 0.025, "step": 1160 }, { "epoch": 2.07, "grad_norm": 0.7768550515174866, "learning_rate": 3.255310778914241e-05, "loss": 0.0136, "step": 1170 }, { "epoch": 2.09, "grad_norm": 23.059011459350586, "learning_rate": 3.2356412273800155e-05, "loss": 0.0487, "step": 1180 }, { "epoch": 2.1, "grad_norm": 1.4939569234848022, "learning_rate": 3.2159716758457906e-05, "loss": 0.0267, "step": 1190 }, { "epoch": 2.12, "grad_norm": 1.5693202018737793, "learning_rate": 3.196302124311566e-05, "loss": 0.0339, "step": 1200 }, { "epoch": 2.14, "grad_norm": 0.9168961644172668, "learning_rate": 3.176632572777341e-05, "loss": 0.0195, "step": 1210 }, { "epoch": 2.16, "grad_norm": 3.3607680797576904, "learning_rate": 3.156963021243116e-05, "loss": 0.0257, "step": 1220 }, { "epoch": 2.18, "grad_norm": 10.094879150390625, "learning_rate": 3.137293469708891e-05, "loss": 0.0143, "step": 1230 }, { "epoch": 2.19, "grad_norm": 9.125, "learning_rate": 3.1176239181746655e-05, "loss": 0.0295, "step": 1240 }, { "epoch": 2.21, "grad_norm": 0.2778412699699402, "learning_rate": 3.0979543666404406e-05, "loss": 0.0182, "step": 1250 }, { "epoch": 2.23, "grad_norm": 1.0662269592285156, "learning_rate": 3.078284815106216e-05, "loss": 0.0295, "step": 1260 }, { "epoch": 2.25, "grad_norm": 0.0518941730260849, "learning_rate": 3.058615263571991e-05, "loss": 0.048, "step": 1270 }, { "epoch": 2.26, "grad_norm": 3.4041025638580322, "learning_rate": 3.0389457120377656e-05, "loss": 0.0601, "step": 1280 }, { "epoch": 2.28, "grad_norm": 0.9103949666023254, "learning_rate": 3.0192761605035407e-05, "loss": 0.0191, "step": 1290 }, { "epoch": 2.3, "grad_norm": 1.8118720054626465, "learning_rate": 2.999606608969316e-05, "loss": 0.023, "step": 1300 }, { "epoch": 2.32, "grad_norm": 0.6234725117683411, "learning_rate": 2.9799370574350903e-05, "loss": 0.0248, "step": 1310 }, { "epoch": 2.33, "grad_norm": 18.03314781188965, "learning_rate": 2.9602675059008654e-05, "loss": 0.0694, "step": 1320 }, { "epoch": 2.35, "grad_norm": 0.3662589490413666, "learning_rate": 2.9405979543666405e-05, "loss": 0.0206, "step": 1330 }, { "epoch": 2.37, "grad_norm": 0.030597494915127754, "learning_rate": 2.9209284028324156e-05, "loss": 0.0291, "step": 1340 }, { "epoch": 2.39, "grad_norm": 0.9393526911735535, "learning_rate": 2.9012588512981904e-05, "loss": 0.0243, "step": 1350 }, { "epoch": 2.4, "grad_norm": 3.631523847579956, "learning_rate": 2.8815892997639655e-05, "loss": 0.0292, "step": 1360 }, { "epoch": 2.42, "grad_norm": 0.010743443854153156, "learning_rate": 2.8619197482297406e-05, "loss": 0.0453, "step": 1370 }, { "epoch": 2.44, "grad_norm": 16.26077651977539, "learning_rate": 2.8422501966955157e-05, "loss": 0.0221, "step": 1380 }, { "epoch": 2.46, "grad_norm": 3.7162559032440186, "learning_rate": 2.822580645161291e-05, "loss": 0.0278, "step": 1390 }, { "epoch": 2.48, "grad_norm": 1.568585991859436, "learning_rate": 2.8029110936270653e-05, "loss": 0.0234, "step": 1400 }, { "epoch": 2.49, "grad_norm": 0.43441635370254517, "learning_rate": 2.7832415420928404e-05, "loss": 0.0187, "step": 1410 }, { "epoch": 2.51, "grad_norm": 2.6314618587493896, "learning_rate": 2.7635719905586155e-05, "loss": 0.0235, "step": 1420 }, { "epoch": 2.53, "grad_norm": 0.020479127764701843, "learning_rate": 2.7439024390243906e-05, "loss": 0.0102, "step": 1430 }, { "epoch": 2.55, "grad_norm": 0.40051591396331787, "learning_rate": 2.724232887490165e-05, "loss": 0.0036, "step": 1440 }, { "epoch": 2.56, "grad_norm": 8.731142044067383, "learning_rate": 2.7045633359559402e-05, "loss": 0.0412, "step": 1450 }, { "epoch": 2.58, "grad_norm": 5.808520317077637, "learning_rate": 2.6848937844217153e-05, "loss": 0.0333, "step": 1460 }, { "epoch": 2.6, "grad_norm": 0.15826575458049774, "learning_rate": 2.6652242328874904e-05, "loss": 0.0381, "step": 1470 }, { "epoch": 2.62, "grad_norm": 2.659451484680176, "learning_rate": 2.645554681353265e-05, "loss": 0.0188, "step": 1480 }, { "epoch": 2.63, "grad_norm": 0.06260908395051956, "learning_rate": 2.62588512981904e-05, "loss": 0.0141, "step": 1490 }, { "epoch": 2.65, "grad_norm": 0.15780910849571228, "learning_rate": 2.606215578284815e-05, "loss": 0.0146, "step": 1500 }, { "epoch": 2.67, "grad_norm": 2.506399631500244, "learning_rate": 2.5865460267505902e-05, "loss": 0.0138, "step": 1510 }, { "epoch": 2.69, "grad_norm": 0.11960842460393906, "learning_rate": 2.5668764752163653e-05, "loss": 0.0028, "step": 1520 }, { "epoch": 2.71, "grad_norm": 22.104814529418945, "learning_rate": 2.54720692368214e-05, "loss": 0.0279, "step": 1530 }, { "epoch": 2.72, "grad_norm": 0.6916127800941467, "learning_rate": 2.5275373721479152e-05, "loss": 0.0152, "step": 1540 }, { "epoch": 2.74, "grad_norm": 4.406822204589844, "learning_rate": 2.5078678206136903e-05, "loss": 0.0064, "step": 1550 }, { "epoch": 2.76, "grad_norm": 0.5762219429016113, "learning_rate": 2.488198269079465e-05, "loss": 0.0257, "step": 1560 }, { "epoch": 2.78, "grad_norm": 1.5382003784179688, "learning_rate": 2.4685287175452402e-05, "loss": 0.0049, "step": 1570 }, { "epoch": 2.79, "grad_norm": 0.4372711181640625, "learning_rate": 2.448859166011015e-05, "loss": 0.0442, "step": 1580 }, { "epoch": 2.81, "grad_norm": 6.409371852874756, "learning_rate": 2.42918961447679e-05, "loss": 0.055, "step": 1590 }, { "epoch": 2.83, "grad_norm": 0.15078137814998627, "learning_rate": 2.4095200629425652e-05, "loss": 0.0228, "step": 1600 }, { "epoch": 2.85, "grad_norm": 4.177145481109619, "learning_rate": 2.38985051140834e-05, "loss": 0.0193, "step": 1610 }, { "epoch": 2.86, "grad_norm": 0.13254141807556152, "learning_rate": 2.370180959874115e-05, "loss": 0.0138, "step": 1620 }, { "epoch": 2.88, "grad_norm": 0.5547938942909241, "learning_rate": 2.35051140833989e-05, "loss": 0.003, "step": 1630 }, { "epoch": 2.9, "grad_norm": 1.550114393234253, "learning_rate": 2.330841856805665e-05, "loss": 0.0156, "step": 1640 }, { "epoch": 2.92, "grad_norm": 2.4502694606781006, "learning_rate": 2.3111723052714398e-05, "loss": 0.0284, "step": 1650 }, { "epoch": 2.94, "grad_norm": 0.34310609102249146, "learning_rate": 2.291502753737215e-05, "loss": 0.0318, "step": 1660 }, { "epoch": 2.95, "grad_norm": 0.06068241968750954, "learning_rate": 2.2718332022029897e-05, "loss": 0.0171, "step": 1670 }, { "epoch": 2.97, "grad_norm": 1.1032580137252808, "learning_rate": 2.2521636506687648e-05, "loss": 0.0062, "step": 1680 }, { "epoch": 2.99, "grad_norm": 16.99115562438965, "learning_rate": 2.2324940991345396e-05, "loss": 0.018, "step": 1690 }, { "epoch": 3.0, "eval_accuracy": 0.9755772646536413, "eval_loss": 0.0940345823764801, "eval_runtime": 73.2079, "eval_samples_per_second": 30.762, "eval_steps_per_second": 3.852, "step": 1696 }, { "epoch": 3.01, "grad_norm": 18.221080780029297, "learning_rate": 2.2128245476003147e-05, "loss": 0.018, "step": 1700 }, { "epoch": 3.02, "grad_norm": 1.597777009010315, "learning_rate": 2.1931549960660898e-05, "loss": 0.0095, "step": 1710 }, { "epoch": 3.04, "grad_norm": 0.460238516330719, "learning_rate": 2.1734854445318646e-05, "loss": 0.0135, "step": 1720 }, { "epoch": 3.06, "grad_norm": 0.1700868457555771, "learning_rate": 2.1538158929976397e-05, "loss": 0.0149, "step": 1730 }, { "epoch": 3.08, "grad_norm": 4.091975212097168, "learning_rate": 2.134146341463415e-05, "loss": 0.0163, "step": 1740 }, { "epoch": 3.09, "grad_norm": 0.004883296322077513, "learning_rate": 2.11447678992919e-05, "loss": 0.0022, "step": 1750 }, { "epoch": 3.11, "grad_norm": 0.9401123523712158, "learning_rate": 2.0948072383949647e-05, "loss": 0.0012, "step": 1760 }, { "epoch": 3.13, "grad_norm": 4.135828018188477, "learning_rate": 2.07513768686074e-05, "loss": 0.0201, "step": 1770 }, { "epoch": 3.15, "grad_norm": 0.030084745958447456, "learning_rate": 2.0554681353265146e-05, "loss": 0.0013, "step": 1780 }, { "epoch": 3.17, "grad_norm": 15.494827270507812, "learning_rate": 2.0357985837922897e-05, "loss": 0.0067, "step": 1790 }, { "epoch": 3.18, "grad_norm": 1.0570249557495117, "learning_rate": 2.0161290322580645e-05, "loss": 0.015, "step": 1800 }, { "epoch": 3.2, "grad_norm": 0.5005541443824768, "learning_rate": 1.9964594807238396e-05, "loss": 0.0025, "step": 1810 }, { "epoch": 3.22, "grad_norm": 3.019387722015381, "learning_rate": 1.9767899291896147e-05, "loss": 0.0112, "step": 1820 }, { "epoch": 3.24, "grad_norm": 0.19258318841457367, "learning_rate": 1.9571203776553895e-05, "loss": 0.0021, "step": 1830 }, { "epoch": 3.25, "grad_norm": 2.9886810779571533, "learning_rate": 1.9374508261211646e-05, "loss": 0.002, "step": 1840 }, { "epoch": 3.27, "grad_norm": 0.012368579395115376, "learning_rate": 1.9177812745869394e-05, "loss": 0.0012, "step": 1850 }, { "epoch": 3.29, "grad_norm": 0.023176291957497597, "learning_rate": 1.8981117230527145e-05, "loss": 0.0456, "step": 1860 }, { "epoch": 3.31, "grad_norm": 0.06562870740890503, "learning_rate": 1.8784421715184893e-05, "loss": 0.0069, "step": 1870 }, { "epoch": 3.32, "grad_norm": 17.193941116333008, "learning_rate": 1.8587726199842644e-05, "loss": 0.0133, "step": 1880 }, { "epoch": 3.34, "grad_norm": 0.08079444617033005, "learning_rate": 1.8391030684500392e-05, "loss": 0.0162, "step": 1890 }, { "epoch": 3.36, "grad_norm": 1.4867687225341797, "learning_rate": 1.8194335169158143e-05, "loss": 0.0028, "step": 1900 }, { "epoch": 3.38, "grad_norm": 0.10314597189426422, "learning_rate": 1.799763965381589e-05, "loss": 0.0036, "step": 1910 }, { "epoch": 3.4, "grad_norm": 0.03035634197294712, "learning_rate": 1.7800944138473642e-05, "loss": 0.0018, "step": 1920 }, { "epoch": 3.41, "grad_norm": 0.03287611901760101, "learning_rate": 1.7604248623131393e-05, "loss": 0.011, "step": 1930 }, { "epoch": 3.43, "grad_norm": 0.009028772823512554, "learning_rate": 1.7407553107789144e-05, "loss": 0.0055, "step": 1940 }, { "epoch": 3.45, "grad_norm": 0.11215109378099442, "learning_rate": 1.7210857592446896e-05, "loss": 0.0033, "step": 1950 }, { "epoch": 3.47, "grad_norm": 0.18691633641719818, "learning_rate": 1.7014162077104643e-05, "loss": 0.0157, "step": 1960 }, { "epoch": 3.48, "grad_norm": 10.645195007324219, "learning_rate": 1.6817466561762395e-05, "loss": 0.0056, "step": 1970 }, { "epoch": 3.5, "grad_norm": 0.3934668004512787, "learning_rate": 1.6620771046420142e-05, "loss": 0.0023, "step": 1980 }, { "epoch": 3.52, "grad_norm": 0.01680024527013302, "learning_rate": 1.6424075531077893e-05, "loss": 0.0058, "step": 1990 }, { "epoch": 3.54, "grad_norm": 0.6062192916870117, "learning_rate": 1.622738001573564e-05, "loss": 0.004, "step": 2000 }, { "epoch": 3.55, "grad_norm": 0.7066389918327332, "learning_rate": 1.6030684500393392e-05, "loss": 0.0094, "step": 2010 }, { "epoch": 3.57, "grad_norm": 0.20641224086284637, "learning_rate": 1.583398898505114e-05, "loss": 0.005, "step": 2020 }, { "epoch": 3.59, "grad_norm": 0.005134557373821735, "learning_rate": 1.563729346970889e-05, "loss": 0.0196, "step": 2030 }, { "epoch": 3.61, "grad_norm": 0.09725712984800339, "learning_rate": 1.5440597954366642e-05, "loss": 0.0093, "step": 2040 }, { "epoch": 3.63, "grad_norm": 0.01703699305653572, "learning_rate": 1.524390243902439e-05, "loss": 0.0173, "step": 2050 }, { "epoch": 3.64, "grad_norm": 8.595245361328125, "learning_rate": 1.5047206923682141e-05, "loss": 0.0067, "step": 2060 }, { "epoch": 3.66, "grad_norm": 1.1955232620239258, "learning_rate": 1.485051140833989e-05, "loss": 0.0057, "step": 2070 }, { "epoch": 3.68, "grad_norm": 0.026586467400193214, "learning_rate": 1.465381589299764e-05, "loss": 0.0033, "step": 2080 }, { "epoch": 3.7, "grad_norm": 0.07073145359754562, "learning_rate": 1.445712037765539e-05, "loss": 0.0098, "step": 2090 }, { "epoch": 3.71, "grad_norm": 13.662938117980957, "learning_rate": 1.4260424862313141e-05, "loss": 0.0099, "step": 2100 }, { "epoch": 3.73, "grad_norm": 7.773074626922607, "learning_rate": 1.4063729346970889e-05, "loss": 0.0147, "step": 2110 }, { "epoch": 3.75, "grad_norm": 0.08416301012039185, "learning_rate": 1.386703383162864e-05, "loss": 0.0015, "step": 2120 }, { "epoch": 3.77, "grad_norm": 0.09213215857744217, "learning_rate": 1.3670338316286388e-05, "loss": 0.0011, "step": 2130 }, { "epoch": 3.78, "grad_norm": 0.08238010108470917, "learning_rate": 1.3473642800944139e-05, "loss": 0.0067, "step": 2140 }, { "epoch": 3.8, "grad_norm": 0.2930988073348999, "learning_rate": 1.327694728560189e-05, "loss": 0.0027, "step": 2150 }, { "epoch": 3.82, "grad_norm": 0.6446830630302429, "learning_rate": 1.3080251770259638e-05, "loss": 0.0021, "step": 2160 }, { "epoch": 3.84, "grad_norm": 0.04108636826276779, "learning_rate": 1.2883556254917389e-05, "loss": 0.0116, "step": 2170 }, { "epoch": 3.85, "grad_norm": 0.3955759108066559, "learning_rate": 1.2686860739575138e-05, "loss": 0.0132, "step": 2180 }, { "epoch": 3.87, "grad_norm": 0.10534235090017319, "learning_rate": 1.249016522423289e-05, "loss": 0.0014, "step": 2190 }, { "epoch": 3.89, "grad_norm": 0.9984220266342163, "learning_rate": 1.2293469708890639e-05, "loss": 0.016, "step": 2200 }, { "epoch": 3.91, "grad_norm": 0.05933081731200218, "learning_rate": 1.2096774193548388e-05, "loss": 0.0024, "step": 2210 }, { "epoch": 3.93, "grad_norm": 0.3307284414768219, "learning_rate": 1.1900078678206138e-05, "loss": 0.0118, "step": 2220 }, { "epoch": 3.94, "grad_norm": 0.01056114211678505, "learning_rate": 1.1703383162863887e-05, "loss": 0.0054, "step": 2230 }, { "epoch": 3.96, "grad_norm": 0.0951002761721611, "learning_rate": 1.1506687647521637e-05, "loss": 0.0059, "step": 2240 }, { "epoch": 3.98, "grad_norm": 0.008732125163078308, "learning_rate": 1.1309992132179386e-05, "loss": 0.002, "step": 2250 }, { "epoch": 4.0, "grad_norm": 2.568053722381592, "learning_rate": 1.1113296616837136e-05, "loss": 0.0246, "step": 2260 }, { "epoch": 4.0, "eval_accuracy": 0.9791296625222025, "eval_loss": 0.08105943351984024, "eval_runtime": 71.9238, "eval_samples_per_second": 31.311, "eval_steps_per_second": 3.921, "step": 2262 }, { "epoch": 4.01, "grad_norm": 0.3494608700275421, "learning_rate": 1.0916601101494885e-05, "loss": 0.0029, "step": 2270 }, { "epoch": 4.03, "grad_norm": 0.4306102991104126, "learning_rate": 1.0719905586152636e-05, "loss": 0.0037, "step": 2280 }, { "epoch": 4.05, "grad_norm": 2.145603656768799, "learning_rate": 1.0523210070810386e-05, "loss": 0.0063, "step": 2290 }, { "epoch": 4.07, "grad_norm": 0.0036911088973283768, "learning_rate": 1.0326514555468137e-05, "loss": 0.0022, "step": 2300 }, { "epoch": 4.08, "grad_norm": 0.032683487981557846, "learning_rate": 1.0129819040125886e-05, "loss": 0.0062, "step": 2310 }, { "epoch": 4.1, "grad_norm": 0.007357526570558548, "learning_rate": 9.933123524783636e-06, "loss": 0.0059, "step": 2320 }, { "epoch": 4.12, "grad_norm": 0.052719421684741974, "learning_rate": 9.736428009441385e-06, "loss": 0.0007, "step": 2330 }, { "epoch": 4.14, "grad_norm": 0.0456971600651741, "learning_rate": 9.539732494099135e-06, "loss": 0.0083, "step": 2340 }, { "epoch": 4.16, "grad_norm": 0.029169630259275436, "learning_rate": 9.343036978756884e-06, "loss": 0.0075, "step": 2350 }, { "epoch": 4.17, "grad_norm": 0.05413562431931496, "learning_rate": 9.146341463414634e-06, "loss": 0.0025, "step": 2360 }, { "epoch": 4.19, "grad_norm": 8.87646198272705, "learning_rate": 8.949645948072383e-06, "loss": 0.0104, "step": 2370 }, { "epoch": 4.21, "grad_norm": 1.6246185302734375, "learning_rate": 8.752950432730134e-06, "loss": 0.001, "step": 2380 }, { "epoch": 4.23, "grad_norm": 0.21518048644065857, "learning_rate": 8.556254917387884e-06, "loss": 0.0073, "step": 2390 }, { "epoch": 4.24, "grad_norm": 0.6136273741722107, "learning_rate": 8.359559402045635e-06, "loss": 0.008, "step": 2400 }, { "epoch": 4.26, "grad_norm": 0.013673730194568634, "learning_rate": 8.162863886703385e-06, "loss": 0.0003, "step": 2410 }, { "epoch": 4.28, "grad_norm": 0.013138936832547188, "learning_rate": 7.966168371361134e-06, "loss": 0.0016, "step": 2420 }, { "epoch": 4.3, "grad_norm": 0.5832827687263489, "learning_rate": 7.769472856018883e-06, "loss": 0.0016, "step": 2430 }, { "epoch": 4.31, "grad_norm": 0.013815794140100479, "learning_rate": 7.572777340676633e-06, "loss": 0.0093, "step": 2440 }, { "epoch": 4.33, "grad_norm": 2.6205365657806396, "learning_rate": 7.376081825334382e-06, "loss": 0.0247, "step": 2450 }, { "epoch": 4.35, "grad_norm": 0.008757353760302067, "learning_rate": 7.179386309992133e-06, "loss": 0.0004, "step": 2460 }, { "epoch": 4.37, "grad_norm": 0.5439887642860413, "learning_rate": 6.982690794649882e-06, "loss": 0.0011, "step": 2470 }, { "epoch": 4.39, "grad_norm": 3.330894708633423, "learning_rate": 6.785995279307632e-06, "loss": 0.0027, "step": 2480 }, { "epoch": 4.4, "grad_norm": 0.24116189777851105, "learning_rate": 6.589299763965381e-06, "loss": 0.001, "step": 2490 }, { "epoch": 4.42, "grad_norm": 0.19894400238990784, "learning_rate": 6.392604248623131e-06, "loss": 0.0096, "step": 2500 }, { "epoch": 4.44, "grad_norm": 0.08030141144990921, "learning_rate": 6.195908733280882e-06, "loss": 0.0003, "step": 2510 }, { "epoch": 4.46, "grad_norm": 0.06319218873977661, "learning_rate": 5.999213217938631e-06, "loss": 0.0075, "step": 2520 }, { "epoch": 4.47, "grad_norm": 0.0329049713909626, "learning_rate": 5.802517702596381e-06, "loss": 0.0123, "step": 2530 }, { "epoch": 4.49, "grad_norm": 0.07301346212625504, "learning_rate": 5.605822187254131e-06, "loss": 0.0037, "step": 2540 }, { "epoch": 4.51, "grad_norm": 2.7243146896362305, "learning_rate": 5.40912667191188e-06, "loss": 0.009, "step": 2550 }, { "epoch": 4.53, "grad_norm": 0.002613728167489171, "learning_rate": 5.212431156569631e-06, "loss": 0.0211, "step": 2560 }, { "epoch": 4.54, "grad_norm": 0.08672753721475601, "learning_rate": 5.01573564122738e-06, "loss": 0.0003, "step": 2570 }, { "epoch": 4.56, "grad_norm": 0.7189329862594604, "learning_rate": 4.81904012588513e-06, "loss": 0.0017, "step": 2580 }, { "epoch": 4.58, "grad_norm": 0.021875105798244476, "learning_rate": 4.62234461054288e-06, "loss": 0.0036, "step": 2590 }, { "epoch": 4.6, "grad_norm": 0.06674761325120926, "learning_rate": 4.425649095200629e-06, "loss": 0.0016, "step": 2600 }, { "epoch": 4.62, "grad_norm": 1.5341488122940063, "learning_rate": 4.22895357985838e-06, "loss": 0.0123, "step": 2610 }, { "epoch": 4.63, "grad_norm": 0.6225674152374268, "learning_rate": 4.032258064516129e-06, "loss": 0.0071, "step": 2620 }, { "epoch": 4.65, "grad_norm": 0.04396051913499832, "learning_rate": 3.835562549173879e-06, "loss": 0.0061, "step": 2630 }, { "epoch": 4.67, "grad_norm": 0.024606185033917427, "learning_rate": 3.638867033831629e-06, "loss": 0.0034, "step": 2640 }, { "epoch": 4.69, "grad_norm": 0.7400041818618774, "learning_rate": 3.442171518489379e-06, "loss": 0.0022, "step": 2650 }, { "epoch": 4.7, "grad_norm": 0.105620838701725, "learning_rate": 3.2454760031471283e-06, "loss": 0.0203, "step": 2660 }, { "epoch": 4.72, "grad_norm": 0.011251443065702915, "learning_rate": 3.0487804878048782e-06, "loss": 0.0047, "step": 2670 }, { "epoch": 4.74, "grad_norm": 0.03630177304148674, "learning_rate": 2.852084972462628e-06, "loss": 0.004, "step": 2680 }, { "epoch": 4.76, "grad_norm": 0.03339725360274315, "learning_rate": 2.655389457120378e-06, "loss": 0.0011, "step": 2690 }, { "epoch": 4.77, "grad_norm": 4.932060718536377, "learning_rate": 2.4586939417781275e-06, "loss": 0.0032, "step": 2700 }, { "epoch": 4.79, "grad_norm": 0.20462249219417572, "learning_rate": 2.2619984264358773e-06, "loss": 0.0008, "step": 2710 }, { "epoch": 4.81, "grad_norm": 0.01447032019495964, "learning_rate": 2.0653029110936272e-06, "loss": 0.003, "step": 2720 }, { "epoch": 4.83, "grad_norm": 0.012796329334378242, "learning_rate": 1.868607395751377e-06, "loss": 0.0015, "step": 2730 }, { "epoch": 4.85, "grad_norm": 2.1272308826446533, "learning_rate": 1.6719118804091268e-06, "loss": 0.002, "step": 2740 }, { "epoch": 4.86, "grad_norm": 0.049883559346199036, "learning_rate": 1.4752163650668765e-06, "loss": 0.0008, "step": 2750 }, { "epoch": 4.88, "grad_norm": 1.860205054283142, "learning_rate": 1.2785208497246264e-06, "loss": 0.0039, "step": 2760 }, { "epoch": 4.9, "grad_norm": 0.012104855850338936, "learning_rate": 1.0818253343823763e-06, "loss": 0.0083, "step": 2770 }, { "epoch": 4.92, "grad_norm": 0.03279277682304382, "learning_rate": 8.85129819040126e-07, "loss": 0.0006, "step": 2780 }, { "epoch": 4.93, "grad_norm": 0.6932834386825562, "learning_rate": 6.884343036978757e-07, "loss": 0.0035, "step": 2790 }, { "epoch": 4.95, "grad_norm": 0.037294335663318634, "learning_rate": 4.917387883556255e-07, "loss": 0.0097, "step": 2800 }, { "epoch": 4.97, "grad_norm": 0.10355295240879059, "learning_rate": 2.9504327301337533e-07, "loss": 0.0006, "step": 2810 }, { "epoch": 4.99, "grad_norm": 3.9190878868103027, "learning_rate": 9.834775767112511e-08, "loss": 0.0175, "step": 2820 }, { "epoch": 5.0, "eval_accuracy": 0.9795737122557726, "eval_loss": 0.08262032270431519, "eval_runtime": 72.9002, "eval_samples_per_second": 30.892, "eval_steps_per_second": 3.868, "step": 2825 }, { "epoch": 5.0, "step": 2825, "total_flos": 2.2478285568521503e+18, "train_loss": 0.1798970705407581, "train_runtime": 3694.2951, "train_samples_per_second": 24.484, "train_steps_per_second": 0.765 } ], "logging_steps": 10, "max_steps": 2825, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 2.2478285568521503e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }