{ "best_metric": 0.9421052631578948, "best_model_checkpoint": "google/vivit-b-16x2-kinetics400-finetuned-my-dataset-6-epochs\\checkpoint-2574", "epoch": 5.164724164724165, "eval_steps": 500, "global_step": 2574, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 19.156469345092773, "learning_rate": 1.937984496124031e-06, "loss": 1.6746, "step": 10 }, { "epoch": 0.01, "grad_norm": 20.032257080078125, "learning_rate": 3.875968992248062e-06, "loss": 1.8523, "step": 20 }, { "epoch": 0.01, "grad_norm": 19.687211990356445, "learning_rate": 5.8139534883720935e-06, "loss": 1.7049, "step": 30 }, { "epoch": 0.02, "grad_norm": 21.64842414855957, "learning_rate": 7.751937984496124e-06, "loss": 1.8233, "step": 40 }, { "epoch": 0.02, "grad_norm": 18.463632583618164, "learning_rate": 9.689922480620156e-06, "loss": 1.5469, "step": 50 }, { "epoch": 0.02, "grad_norm": 21.310239791870117, "learning_rate": 1.1627906976744187e-05, "loss": 1.8318, "step": 60 }, { "epoch": 0.03, "grad_norm": 16.913654327392578, "learning_rate": 1.3565891472868217e-05, "loss": 1.3699, "step": 70 }, { "epoch": 0.03, "grad_norm": 17.656085968017578, "learning_rate": 1.5503875968992248e-05, "loss": 1.3487, "step": 80 }, { "epoch": 0.03, "grad_norm": 19.021942138671875, "learning_rate": 1.744186046511628e-05, "loss": 1.5383, "step": 90 }, { "epoch": 0.04, "grad_norm": 16.88703727722168, "learning_rate": 1.937984496124031e-05, "loss": 1.3378, "step": 100 }, { "epoch": 0.04, "grad_norm": 28.135297775268555, "learning_rate": 2.131782945736434e-05, "loss": 1.3925, "step": 110 }, { "epoch": 0.05, "grad_norm": 27.902355194091797, "learning_rate": 2.3255813953488374e-05, "loss": 1.2723, "step": 120 }, { "epoch": 0.05, "grad_norm": 25.579036712646484, "learning_rate": 2.5193798449612404e-05, "loss": 1.453, "step": 130 }, { "epoch": 0.05, "grad_norm": 11.281214714050293, "learning_rate": 2.7131782945736434e-05, "loss": 1.1327, "step": 140 }, { "epoch": 0.06, "grad_norm": 31.06259536743164, "learning_rate": 2.9069767441860467e-05, "loss": 1.0795, "step": 150 }, { "epoch": 0.06, "grad_norm": 9.809606552124023, "learning_rate": 3.1007751937984497e-05, "loss": 1.0496, "step": 160 }, { "epoch": 0.07, "grad_norm": 6.943488121032715, "learning_rate": 3.294573643410852e-05, "loss": 0.7182, "step": 170 }, { "epoch": 0.07, "grad_norm": 21.161239624023438, "learning_rate": 3.488372093023256e-05, "loss": 0.9181, "step": 180 }, { "epoch": 0.07, "grad_norm": 22.49970817565918, "learning_rate": 3.682170542635659e-05, "loss": 0.6194, "step": 190 }, { "epoch": 0.08, "grad_norm": 29.908571243286133, "learning_rate": 3.875968992248062e-05, "loss": 1.3104, "step": 200 }, { "epoch": 0.08, "grad_norm": 5.580902099609375, "learning_rate": 4.0697674418604655e-05, "loss": 0.74, "step": 210 }, { "epoch": 0.09, "grad_norm": 7.390064239501953, "learning_rate": 4.263565891472868e-05, "loss": 0.785, "step": 220 }, { "epoch": 0.09, "grad_norm": 17.90110206604004, "learning_rate": 4.4573643410852715e-05, "loss": 1.0281, "step": 230 }, { "epoch": 0.09, "grad_norm": 8.22541618347168, "learning_rate": 4.651162790697675e-05, "loss": 0.6409, "step": 240 }, { "epoch": 0.1, "grad_norm": 15.333436012268066, "learning_rate": 4.8449612403100775e-05, "loss": 0.2936, "step": 250 }, { "epoch": 0.1, "grad_norm": 28.581796646118164, "learning_rate": 4.9956822107081176e-05, "loss": 0.7362, "step": 260 }, { "epoch": 0.1, "grad_norm": 20.49916648864746, "learning_rate": 4.974093264248705e-05, "loss": 1.1056, "step": 270 }, { "epoch": 0.11, "grad_norm": 18.066146850585938, "learning_rate": 4.952504317789292e-05, "loss": 1.1734, "step": 280 }, { "epoch": 0.11, "grad_norm": 7.937985420227051, "learning_rate": 4.9309153713298795e-05, "loss": 0.8946, "step": 290 }, { "epoch": 0.12, "grad_norm": 18.581632614135742, "learning_rate": 4.909326424870467e-05, "loss": 0.8767, "step": 300 }, { "epoch": 0.12, "grad_norm": 10.53328800201416, "learning_rate": 4.887737478411054e-05, "loss": 0.72, "step": 310 }, { "epoch": 0.12, "grad_norm": 30.599201202392578, "learning_rate": 4.866148531951641e-05, "loss": 0.1731, "step": 320 }, { "epoch": 0.13, "grad_norm": 0.21214953064918518, "learning_rate": 4.844559585492228e-05, "loss": 0.3763, "step": 330 }, { "epoch": 0.13, "grad_norm": 40.29164505004883, "learning_rate": 4.822970639032816e-05, "loss": 1.725, "step": 340 }, { "epoch": 0.14, "grad_norm": 0.04869541898369789, "learning_rate": 4.8013816925734026e-05, "loss": 0.1703, "step": 350 }, { "epoch": 0.14, "grad_norm": 2.7087643146514893, "learning_rate": 4.7797927461139894e-05, "loss": 0.4218, "step": 360 }, { "epoch": 0.14, "grad_norm": 15.27876091003418, "learning_rate": 4.758203799654577e-05, "loss": 0.3047, "step": 370 }, { "epoch": 0.15, "grad_norm": 4.537576198577881, "learning_rate": 4.7366148531951644e-05, "loss": 0.1536, "step": 380 }, { "epoch": 0.15, "grad_norm": 16.916975021362305, "learning_rate": 4.715025906735751e-05, "loss": 0.2277, "step": 390 }, { "epoch": 0.16, "grad_norm": 3.467759370803833, "learning_rate": 4.693436960276339e-05, "loss": 0.7564, "step": 400 }, { "epoch": 0.16, "grad_norm": 0.6760605573654175, "learning_rate": 4.6718480138169256e-05, "loss": 0.5586, "step": 410 }, { "epoch": 0.16, "grad_norm": 48.665504455566406, "learning_rate": 4.650259067357513e-05, "loss": 0.5656, "step": 420 }, { "epoch": 0.17, "grad_norm": 0.1229218989610672, "learning_rate": 4.628670120898101e-05, "loss": 1.1371, "step": 430 }, { "epoch": 0.17, "eval_accuracy": 0.7789473684210526, "eval_loss": 0.7348644137382507, "eval_runtime": 514.0489, "eval_samples_per_second": 0.37, "eval_steps_per_second": 0.185, "step": 430 }, { "epoch": 1.0, "grad_norm": 10.519988059997559, "learning_rate": 4.6070811744386875e-05, "loss": 0.0473, "step": 440 }, { "epoch": 1.01, "grad_norm": 0.7223495841026306, "learning_rate": 4.585492227979275e-05, "loss": 0.234, "step": 450 }, { "epoch": 1.01, "grad_norm": 2.6696724891662598, "learning_rate": 4.563903281519862e-05, "loss": 0.0174, "step": 460 }, { "epoch": 1.02, "grad_norm": 36.59164810180664, "learning_rate": 4.5423143350604494e-05, "loss": 0.6521, "step": 470 }, { "epoch": 1.02, "grad_norm": 0.5735955834388733, "learning_rate": 4.520725388601036e-05, "loss": 0.0282, "step": 480 }, { "epoch": 1.02, "grad_norm": 0.06086433678865433, "learning_rate": 4.499136442141624e-05, "loss": 0.2758, "step": 490 }, { "epoch": 1.03, "grad_norm": 52.46869659423828, "learning_rate": 4.477547495682211e-05, "loss": 0.7029, "step": 500 }, { "epoch": 1.03, "grad_norm": 32.279354095458984, "learning_rate": 4.455958549222798e-05, "loss": 0.3286, "step": 510 }, { "epoch": 1.03, "grad_norm": 0.4026491045951843, "learning_rate": 4.434369602763385e-05, "loss": 0.1499, "step": 520 }, { "epoch": 1.04, "grad_norm": 0.010530718602240086, "learning_rate": 4.4127806563039725e-05, "loss": 0.3031, "step": 530 }, { "epoch": 1.04, "grad_norm": 0.0508076511323452, "learning_rate": 4.39119170984456e-05, "loss": 0.638, "step": 540 }, { "epoch": 1.05, "grad_norm": 0.019481591880321503, "learning_rate": 4.3696027633851475e-05, "loss": 0.1141, "step": 550 }, { "epoch": 1.05, "grad_norm": 1.2322841882705688, "learning_rate": 4.3480138169257344e-05, "loss": 0.831, "step": 560 }, { "epoch": 1.05, "grad_norm": 0.04800877720117569, "learning_rate": 4.326424870466321e-05, "loss": 0.0428, "step": 570 }, { "epoch": 1.06, "grad_norm": 72.7511978149414, "learning_rate": 4.304835924006909e-05, "loss": 0.2349, "step": 580 }, { "epoch": 1.06, "grad_norm": 0.14265328645706177, "learning_rate": 4.283246977547496e-05, "loss": 0.263, "step": 590 }, { "epoch": 1.07, "grad_norm": 1.0183712244033813, "learning_rate": 4.261658031088083e-05, "loss": 0.4732, "step": 600 }, { "epoch": 1.07, "grad_norm": 1.8658190965652466, "learning_rate": 4.2400690846286706e-05, "loss": 0.3766, "step": 610 }, { "epoch": 1.07, "grad_norm": 0.16890831291675568, "learning_rate": 4.2184801381692574e-05, "loss": 1.5658, "step": 620 }, { "epoch": 1.08, "grad_norm": 0.25139570236206055, "learning_rate": 4.196891191709845e-05, "loss": 1.0975, "step": 630 }, { "epoch": 1.08, "grad_norm": 19.84381103515625, "learning_rate": 4.175302245250432e-05, "loss": 0.0726, "step": 640 }, { "epoch": 1.09, "grad_norm": 0.07212476432323456, "learning_rate": 4.153713298791019e-05, "loss": 0.1175, "step": 650 }, { "epoch": 1.09, "grad_norm": 0.2093745470046997, "learning_rate": 4.132124352331607e-05, "loss": 0.2994, "step": 660 }, { "epoch": 1.09, "grad_norm": 0.0611826628446579, "learning_rate": 4.110535405872194e-05, "loss": 0.1013, "step": 670 }, { "epoch": 1.1, "grad_norm": 1.0494344234466553, "learning_rate": 4.0889464594127805e-05, "loss": 0.2448, "step": 680 }, { "epoch": 1.1, "grad_norm": 0.3157791197299957, "learning_rate": 4.067357512953368e-05, "loss": 0.1015, "step": 690 }, { "epoch": 1.1, "grad_norm": 4.036553859710693, "learning_rate": 4.0457685664939555e-05, "loss": 0.0134, "step": 700 }, { "epoch": 1.11, "grad_norm": 0.4784717857837677, "learning_rate": 4.024179620034543e-05, "loss": 0.2529, "step": 710 }, { "epoch": 1.11, "grad_norm": 0.03620552271604538, "learning_rate": 4.002590673575129e-05, "loss": 0.2876, "step": 720 }, { "epoch": 1.12, "grad_norm": 0.013772057369351387, "learning_rate": 3.981001727115717e-05, "loss": 0.2814, "step": 730 }, { "epoch": 1.12, "grad_norm": 0.01539964322000742, "learning_rate": 3.959412780656304e-05, "loss": 1.0851, "step": 740 }, { "epoch": 1.12, "grad_norm": 0.030828053131699562, "learning_rate": 3.937823834196892e-05, "loss": 0.1881, "step": 750 }, { "epoch": 1.13, "grad_norm": 0.06151743233203888, "learning_rate": 3.9162348877374786e-05, "loss": 0.0725, "step": 760 }, { "epoch": 1.13, "grad_norm": 0.05447068437933922, "learning_rate": 3.8946459412780655e-05, "loss": 0.0167, "step": 770 }, { "epoch": 1.14, "grad_norm": 0.1398928165435791, "learning_rate": 3.873056994818653e-05, "loss": 0.4225, "step": 780 }, { "epoch": 1.14, "grad_norm": 0.024751029908657074, "learning_rate": 3.8514680483592405e-05, "loss": 0.0037, "step": 790 }, { "epoch": 1.14, "grad_norm": 0.7933443188667297, "learning_rate": 3.8298791018998273e-05, "loss": 0.0055, "step": 800 }, { "epoch": 1.15, "grad_norm": 7.173922538757324, "learning_rate": 3.808290155440415e-05, "loss": 0.7573, "step": 810 }, { "epoch": 1.15, "grad_norm": 85.79830169677734, "learning_rate": 3.786701208981002e-05, "loss": 0.3346, "step": 820 }, { "epoch": 1.16, "grad_norm": 0.06877858936786652, "learning_rate": 3.765112262521589e-05, "loss": 0.0852, "step": 830 }, { "epoch": 1.16, "grad_norm": 0.08467548340559006, "learning_rate": 3.743523316062176e-05, "loss": 0.1547, "step": 840 }, { "epoch": 1.16, "grad_norm": 0.3360983431339264, "learning_rate": 3.7219343696027636e-05, "loss": 0.1827, "step": 850 }, { "epoch": 1.17, "grad_norm": 0.09804761409759521, "learning_rate": 3.700345423143351e-05, "loss": 0.5346, "step": 860 }, { "epoch": 1.17, "eval_accuracy": 0.8894736842105263, "eval_loss": 0.2911826968193054, "eval_runtime": 518.0915, "eval_samples_per_second": 0.367, "eval_steps_per_second": 0.183, "step": 860 }, { "epoch": 2.0, "grad_norm": 0.2422347515821457, "learning_rate": 3.678756476683938e-05, "loss": 0.1598, "step": 870 }, { "epoch": 2.01, "grad_norm": 0.09047175943851471, "learning_rate": 3.657167530224525e-05, "loss": 0.6796, "step": 880 }, { "epoch": 2.01, "grad_norm": 0.006813277490437031, "learning_rate": 3.635578583765112e-05, "loss": 0.4698, "step": 890 }, { "epoch": 2.02, "grad_norm": 0.07180597633123398, "learning_rate": 3.6139896373057e-05, "loss": 0.0101, "step": 900 }, { "epoch": 2.02, "grad_norm": 0.0530867837369442, "learning_rate": 3.592400690846287e-05, "loss": 0.0263, "step": 910 }, { "epoch": 2.02, "grad_norm": 0.03178895264863968, "learning_rate": 3.5708117443868735e-05, "loss": 0.0015, "step": 920 }, { "epoch": 2.03, "grad_norm": 0.2046024203300476, "learning_rate": 3.549222797927461e-05, "loss": 0.2336, "step": 930 }, { "epoch": 2.03, "grad_norm": 0.01445363275706768, "learning_rate": 3.5276338514680485e-05, "loss": 0.5172, "step": 940 }, { "epoch": 2.03, "grad_norm": 0.005609246902167797, "learning_rate": 3.506044905008636e-05, "loss": 0.0465, "step": 950 }, { "epoch": 2.04, "grad_norm": 2.6773831844329834, "learning_rate": 3.484455958549223e-05, "loss": 0.093, "step": 960 }, { "epoch": 2.04, "grad_norm": 19.465383529663086, "learning_rate": 3.4628670120898104e-05, "loss": 0.1096, "step": 970 }, { "epoch": 2.05, "grad_norm": 0.005580793134868145, "learning_rate": 3.441278065630397e-05, "loss": 0.1617, "step": 980 }, { "epoch": 2.05, "grad_norm": 0.017572615295648575, "learning_rate": 3.419689119170985e-05, "loss": 0.1416, "step": 990 }, { "epoch": 2.05, "grad_norm": 0.004967101849615574, "learning_rate": 3.3981001727115716e-05, "loss": 0.0009, "step": 1000 }, { "epoch": 2.06, "grad_norm": 0.029761947691440582, "learning_rate": 3.376511226252159e-05, "loss": 0.1359, "step": 1010 }, { "epoch": 2.06, "grad_norm": 8.259181022644043, "learning_rate": 3.3549222797927467e-05, "loss": 0.0199, "step": 1020 }, { "epoch": 2.07, "grad_norm": 0.014490657486021519, "learning_rate": 3.3333333333333335e-05, "loss": 0.1351, "step": 1030 }, { "epoch": 2.07, "grad_norm": 0.006075919605791569, "learning_rate": 3.31174438687392e-05, "loss": 0.2135, "step": 1040 }, { "epoch": 2.07, "grad_norm": 0.005090653896331787, "learning_rate": 3.290155440414508e-05, "loss": 0.0575, "step": 1050 }, { "epoch": 2.08, "grad_norm": 19.904199600219727, "learning_rate": 3.2685664939550954e-05, "loss": 0.0234, "step": 1060 }, { "epoch": 2.08, "grad_norm": 0.022977175191044807, "learning_rate": 3.246977547495683e-05, "loss": 0.0154, "step": 1070 }, { "epoch": 2.09, "grad_norm": 0.015029653906822205, "learning_rate": 3.225388601036269e-05, "loss": 0.2719, "step": 1080 }, { "epoch": 2.09, "grad_norm": 0.010515735484659672, "learning_rate": 3.2037996545768566e-05, "loss": 0.0067, "step": 1090 }, { "epoch": 2.09, "grad_norm": 0.0053952718153595924, "learning_rate": 3.182210708117444e-05, "loss": 0.0114, "step": 1100 }, { "epoch": 2.1, "grad_norm": 53.84306335449219, "learning_rate": 3.1606217616580316e-05, "loss": 0.4551, "step": 1110 }, { "epoch": 2.1, "grad_norm": 1.077244758605957, "learning_rate": 3.1390328151986184e-05, "loss": 0.2994, "step": 1120 }, { "epoch": 2.1, "grad_norm": 22.892969131469727, "learning_rate": 3.117443868739205e-05, "loss": 0.1593, "step": 1130 }, { "epoch": 2.11, "grad_norm": 0.013651028275489807, "learning_rate": 3.095854922279793e-05, "loss": 0.0082, "step": 1140 }, { "epoch": 2.11, "grad_norm": 0.008860527537763119, "learning_rate": 3.07426597582038e-05, "loss": 0.2133, "step": 1150 }, { "epoch": 2.12, "grad_norm": 0.06896065920591354, "learning_rate": 3.052677029360967e-05, "loss": 0.0106, "step": 1160 }, { "epoch": 2.12, "grad_norm": 0.025299793109297752, "learning_rate": 3.0310880829015547e-05, "loss": 0.0008, "step": 1170 }, { "epoch": 2.12, "grad_norm": 0.005163596011698246, "learning_rate": 3.0094991364421415e-05, "loss": 0.0056, "step": 1180 }, { "epoch": 2.13, "grad_norm": 0.5263453125953674, "learning_rate": 2.9879101899827287e-05, "loss": 0.0034, "step": 1190 }, { "epoch": 2.13, "grad_norm": 0.005412234924733639, "learning_rate": 2.9663212435233162e-05, "loss": 0.0011, "step": 1200 }, { "epoch": 2.14, "grad_norm": 0.004280978813767433, "learning_rate": 2.9447322970639034e-05, "loss": 0.315, "step": 1210 }, { "epoch": 2.14, "grad_norm": 1.1846489906311035, "learning_rate": 2.923143350604491e-05, "loss": 0.0016, "step": 1220 }, { "epoch": 2.14, "grad_norm": 3.384139060974121, "learning_rate": 2.9015544041450778e-05, "loss": 0.004, "step": 1230 }, { "epoch": 2.15, "grad_norm": 0.003910099621862173, "learning_rate": 2.879965457685665e-05, "loss": 0.3247, "step": 1240 }, { "epoch": 2.15, "grad_norm": 55.08097839355469, "learning_rate": 2.858376511226252e-05, "loss": 0.1251, "step": 1250 }, { "epoch": 2.16, "grad_norm": 0.01313870307058096, "learning_rate": 2.8367875647668396e-05, "loss": 0.0334, "step": 1260 }, { "epoch": 2.16, "grad_norm": 4.602208614349365, "learning_rate": 2.8151986183074268e-05, "loss": 0.0044, "step": 1270 }, { "epoch": 2.16, "grad_norm": 0.5221287608146667, "learning_rate": 2.7936096718480137e-05, "loss": 0.0129, "step": 1280 }, { "epoch": 2.17, "grad_norm": 0.005316704977303743, "learning_rate": 2.7720207253886012e-05, "loss": 0.0004, "step": 1290 }, { "epoch": 2.17, "eval_accuracy": 0.9210526315789473, "eval_loss": 0.31835657358169556, "eval_runtime": 20895.2371, "eval_samples_per_second": 0.009, "eval_steps_per_second": 0.005, "step": 1290 }, { "epoch": 3.0, "grad_norm": 0.23466819524765015, "learning_rate": 2.7504317789291884e-05, "loss": 0.0007, "step": 1300 }, { "epoch": 3.01, "grad_norm": 0.08376140892505646, "learning_rate": 2.7288428324697755e-05, "loss": 0.0011, "step": 1310 }, { "epoch": 3.01, "grad_norm": 0.0067549836821854115, "learning_rate": 2.707253886010363e-05, "loss": 0.0004, "step": 1320 }, { "epoch": 3.02, "grad_norm": 0.00525926286354661, "learning_rate": 2.68566493955095e-05, "loss": 0.0407, "step": 1330 }, { "epoch": 3.02, "grad_norm": 0.003173103556036949, "learning_rate": 2.664075993091537e-05, "loss": 0.0091, "step": 1340 }, { "epoch": 3.02, "grad_norm": 0.1465524435043335, "learning_rate": 2.6424870466321246e-05, "loss": 0.0004, "step": 1350 }, { "epoch": 3.03, "grad_norm": 0.01814081333577633, "learning_rate": 2.6208981001727118e-05, "loss": 0.0003, "step": 1360 }, { "epoch": 3.03, "grad_norm": 0.16582980751991272, "learning_rate": 2.599309153713299e-05, "loss": 0.1912, "step": 1370 }, { "epoch": 3.03, "grad_norm": 114.0838394165039, "learning_rate": 2.5777202072538865e-05, "loss": 0.1365, "step": 1380 }, { "epoch": 3.04, "grad_norm": 0.00724332919344306, "learning_rate": 2.5561312607944733e-05, "loss": 0.1497, "step": 1390 }, { "epoch": 3.04, "grad_norm": 0.0034754930529743433, "learning_rate": 2.5345423143350605e-05, "loss": 0.0057, "step": 1400 }, { "epoch": 3.05, "grad_norm": 0.0051283338107168674, "learning_rate": 2.5129533678756477e-05, "loss": 0.0017, "step": 1410 }, { "epoch": 3.05, "grad_norm": 0.001818397780880332, "learning_rate": 2.491364421416235e-05, "loss": 0.0156, "step": 1420 }, { "epoch": 3.05, "grad_norm": 0.01026181410998106, "learning_rate": 2.4697754749568224e-05, "loss": 0.0003, "step": 1430 }, { "epoch": 3.06, "grad_norm": 0.013726489618420601, "learning_rate": 2.4481865284974096e-05, "loss": 0.0002, "step": 1440 }, { "epoch": 3.06, "grad_norm": 0.004174862988293171, "learning_rate": 2.4265975820379967e-05, "loss": 0.0482, "step": 1450 }, { "epoch": 3.07, "grad_norm": 0.013808720745146275, "learning_rate": 2.405008635578584e-05, "loss": 0.1185, "step": 1460 }, { "epoch": 3.07, "grad_norm": 0.07046998292207718, "learning_rate": 2.383419689119171e-05, "loss": 0.0119, "step": 1470 }, { "epoch": 3.07, "grad_norm": 0.0045805503614246845, "learning_rate": 2.3618307426597583e-05, "loss": 0.0296, "step": 1480 }, { "epoch": 3.08, "grad_norm": 0.06411267071962357, "learning_rate": 2.3402417962003455e-05, "loss": 0.2067, "step": 1490 }, { "epoch": 3.08, "grad_norm": 0.046669330447912216, "learning_rate": 2.3186528497409326e-05, "loss": 0.0761, "step": 1500 }, { "epoch": 3.09, "grad_norm": 41.745513916015625, "learning_rate": 2.29706390328152e-05, "loss": 0.45, "step": 1510 }, { "epoch": 3.09, "grad_norm": 0.0023008265998214483, "learning_rate": 2.2754749568221073e-05, "loss": 0.0003, "step": 1520 }, { "epoch": 3.09, "grad_norm": 0.008280325680971146, "learning_rate": 2.2538860103626945e-05, "loss": 0.0002, "step": 1530 }, { "epoch": 3.1, "grad_norm": 0.06091548874974251, "learning_rate": 2.2322970639032817e-05, "loss": 0.0003, "step": 1540 }, { "epoch": 3.1, "grad_norm": 0.009856736287474632, "learning_rate": 2.210708117443869e-05, "loss": 0.0005, "step": 1550 }, { "epoch": 3.1, "grad_norm": 0.0037303988356143236, "learning_rate": 2.189119170984456e-05, "loss": 0.1055, "step": 1560 }, { "epoch": 3.11, "grad_norm": 0.3009835183620453, "learning_rate": 2.1675302245250432e-05, "loss": 0.0005, "step": 1570 }, { "epoch": 3.11, "grad_norm": 0.005187860690057278, "learning_rate": 2.1459412780656304e-05, "loss": 0.0024, "step": 1580 }, { "epoch": 3.12, "grad_norm": 0.0036395310889929533, "learning_rate": 2.124352331606218e-05, "loss": 0.288, "step": 1590 }, { "epoch": 3.12, "grad_norm": 0.006734526250511408, "learning_rate": 2.1027633851468048e-05, "loss": 0.0004, "step": 1600 }, { "epoch": 3.12, "grad_norm": 0.003171958727762103, "learning_rate": 2.0811744386873923e-05, "loss": 0.1013, "step": 1610 }, { "epoch": 3.13, "grad_norm": 0.0022121332585811615, "learning_rate": 2.0595854922279795e-05, "loss": 0.0716, "step": 1620 }, { "epoch": 3.13, "grad_norm": 0.04961927980184555, "learning_rate": 2.0379965457685666e-05, "loss": 0.014, "step": 1630 }, { "epoch": 3.14, "grad_norm": 0.042126741260290146, "learning_rate": 2.0164075993091538e-05, "loss": 0.1356, "step": 1640 }, { "epoch": 3.14, "grad_norm": 0.6265960931777954, "learning_rate": 1.994818652849741e-05, "loss": 0.0009, "step": 1650 }, { "epoch": 3.14, "grad_norm": 2.0805187225341797, "learning_rate": 1.9732297063903282e-05, "loss": 0.0439, "step": 1660 }, { "epoch": 3.15, "grad_norm": 0.0070223091170191765, "learning_rate": 1.9516407599309157e-05, "loss": 0.015, "step": 1670 }, { "epoch": 3.15, "grad_norm": 0.008443433791399002, "learning_rate": 1.9300518134715025e-05, "loss": 0.0016, "step": 1680 }, { "epoch": 3.16, "grad_norm": 0.008806809782981873, "learning_rate": 1.90846286701209e-05, "loss": 0.047, "step": 1690 }, { "epoch": 3.16, "grad_norm": 0.008072856813669205, "learning_rate": 1.8868739205526772e-05, "loss": 0.0005, "step": 1700 }, { "epoch": 3.16, "grad_norm": 0.061166293919086456, "learning_rate": 1.8652849740932644e-05, "loss": 0.0056, "step": 1710 }, { "epoch": 3.17, "grad_norm": 0.026545794680714607, "learning_rate": 1.8436960276338516e-05, "loss": 0.0006, "step": 1720 }, { "epoch": 3.17, "eval_accuracy": 0.9368421052631579, "eval_loss": 0.357727587223053, "eval_runtime": 523.7452, "eval_samples_per_second": 0.363, "eval_steps_per_second": 0.181, "step": 1720 }, { "epoch": 4.0, "grad_norm": 0.20675712823867798, "learning_rate": 1.8221070811744388e-05, "loss": 0.0005, "step": 1730 }, { "epoch": 4.01, "grad_norm": 0.007863791659474373, "learning_rate": 1.800518134715026e-05, "loss": 0.0046, "step": 1740 }, { "epoch": 4.01, "grad_norm": 0.00224371743388474, "learning_rate": 1.7789291882556135e-05, "loss": 0.0002, "step": 1750 }, { "epoch": 4.02, "grad_norm": 0.00742675457149744, "learning_rate": 1.7573402417962003e-05, "loss": 0.0021, "step": 1760 }, { "epoch": 4.02, "grad_norm": 0.02301434613764286, "learning_rate": 1.735751295336788e-05, "loss": 0.0002, "step": 1770 }, { "epoch": 4.02, "grad_norm": 0.03734876960515976, "learning_rate": 1.7141623488773747e-05, "loss": 0.0001, "step": 1780 }, { "epoch": 4.03, "grad_norm": 0.0020719885360449553, "learning_rate": 1.6925734024179622e-05, "loss": 0.0052, "step": 1790 }, { "epoch": 4.03, "grad_norm": 0.021191716194152832, "learning_rate": 1.6709844559585494e-05, "loss": 0.2831, "step": 1800 }, { "epoch": 4.03, "grad_norm": 0.033936869353055954, "learning_rate": 1.6493955094991366e-05, "loss": 0.0004, "step": 1810 }, { "epoch": 4.04, "grad_norm": 0.3438062071800232, "learning_rate": 1.6278065630397237e-05, "loss": 0.0005, "step": 1820 }, { "epoch": 4.04, "grad_norm": 0.0040876902639865875, "learning_rate": 1.606217616580311e-05, "loss": 0.0001, "step": 1830 }, { "epoch": 4.05, "grad_norm": 0.0035211730282753706, "learning_rate": 1.584628670120898e-05, "loss": 0.0002, "step": 1840 }, { "epoch": 4.05, "grad_norm": 0.004084211308509111, "learning_rate": 1.5630397236614856e-05, "loss": 0.0004, "step": 1850 }, { "epoch": 4.05, "grad_norm": 0.0032758773304522038, "learning_rate": 1.5414507772020725e-05, "loss": 0.0365, "step": 1860 }, { "epoch": 4.06, "grad_norm": 14.158347129821777, "learning_rate": 1.5198618307426598e-05, "loss": 0.0074, "step": 1870 }, { "epoch": 4.06, "grad_norm": 0.002404096070677042, "learning_rate": 1.498272884283247e-05, "loss": 0.0001, "step": 1880 }, { "epoch": 4.07, "grad_norm": 0.02108081988990307, "learning_rate": 1.4766839378238342e-05, "loss": 0.0002, "step": 1890 }, { "epoch": 4.07, "grad_norm": 0.12371557205915451, "learning_rate": 1.4550949913644215e-05, "loss": 0.038, "step": 1900 }, { "epoch": 4.07, "grad_norm": 0.0032504727132618427, "learning_rate": 1.4335060449050087e-05, "loss": 0.0007, "step": 1910 }, { "epoch": 4.08, "grad_norm": 71.4847183227539, "learning_rate": 1.4119170984455959e-05, "loss": 0.0218, "step": 1920 }, { "epoch": 4.08, "grad_norm": 0.007250852882862091, "learning_rate": 1.3903281519861832e-05, "loss": 0.0002, "step": 1930 }, { "epoch": 4.09, "grad_norm": 0.0018006942700594664, "learning_rate": 1.3687392055267704e-05, "loss": 0.0007, "step": 1940 }, { "epoch": 4.09, "grad_norm": 0.0018308752914890647, "learning_rate": 1.3471502590673576e-05, "loss": 0.0001, "step": 1950 }, { "epoch": 4.09, "grad_norm": 0.004195887595415115, "learning_rate": 1.3255613126079448e-05, "loss": 0.0001, "step": 1960 }, { "epoch": 4.1, "grad_norm": 0.005353459622710943, "learning_rate": 1.303972366148532e-05, "loss": 0.0002, "step": 1970 }, { "epoch": 4.1, "grad_norm": 0.00350914616137743, "learning_rate": 1.2823834196891193e-05, "loss": 0.0002, "step": 1980 }, { "epoch": 4.1, "grad_norm": 0.003935214597731829, "learning_rate": 1.2607944732297065e-05, "loss": 0.0002, "step": 1990 }, { "epoch": 4.11, "grad_norm": 1.447480320930481, "learning_rate": 1.2392055267702936e-05, "loss": 0.1091, "step": 2000 }, { "epoch": 4.11, "grad_norm": 0.0019440186442807317, "learning_rate": 1.2176165803108808e-05, "loss": 0.0005, "step": 2010 }, { "epoch": 4.12, "grad_norm": 0.007630598731338978, "learning_rate": 1.1960276338514682e-05, "loss": 0.0001, "step": 2020 }, { "epoch": 4.12, "grad_norm": 0.0019236513180658221, "learning_rate": 1.1744386873920554e-05, "loss": 0.0003, "step": 2030 }, { "epoch": 4.12, "grad_norm": 0.005540070589631796, "learning_rate": 1.1528497409326425e-05, "loss": 0.0054, "step": 2040 }, { "epoch": 4.13, "grad_norm": 0.016037726774811745, "learning_rate": 1.1312607944732297e-05, "loss": 0.0002, "step": 2050 }, { "epoch": 4.13, "grad_norm": 0.006410875823348761, "learning_rate": 1.109671848013817e-05, "loss": 0.0006, "step": 2060 }, { "epoch": 4.14, "grad_norm": 0.03595797345042229, "learning_rate": 1.0880829015544042e-05, "loss": 0.0009, "step": 2070 }, { "epoch": 4.14, "grad_norm": 1.9901659488677979, "learning_rate": 1.0664939550949914e-05, "loss": 0.0027, "step": 2080 }, { "epoch": 4.14, "grad_norm": 0.00291555211879313, "learning_rate": 1.0449050086355788e-05, "loss": 0.0045, "step": 2090 }, { "epoch": 4.15, "grad_norm": 0.0020950380712747574, "learning_rate": 1.023316062176166e-05, "loss": 0.0167, "step": 2100 }, { "epoch": 4.15, "grad_norm": 0.0015124623896554112, "learning_rate": 1.0017271157167531e-05, "loss": 0.0002, "step": 2110 }, { "epoch": 4.16, "grad_norm": 0.001781588769517839, "learning_rate": 9.801381692573403e-06, "loss": 0.0004, "step": 2120 }, { "epoch": 4.16, "grad_norm": 0.006017952226102352, "learning_rate": 9.585492227979275e-06, "loss": 0.0003, "step": 2130 }, { "epoch": 4.16, "grad_norm": 0.017710313200950623, "learning_rate": 9.369602763385148e-06, "loss": 0.0007, "step": 2140 }, { "epoch": 4.17, "grad_norm": 0.002331692725419998, "learning_rate": 9.15371329879102e-06, "loss": 0.0382, "step": 2150 }, { "epoch": 4.17, "eval_accuracy": 0.9315789473684211, "eval_loss": 0.34909680485725403, "eval_runtime": 564.551, "eval_samples_per_second": 0.337, "eval_steps_per_second": 0.168, "step": 2150 }, { "epoch": 5.0, "grad_norm": 0.002333582378923893, "learning_rate": 8.937823834196892e-06, "loss": 0.004, "step": 2160 }, { "epoch": 5.01, "grad_norm": 0.0014494940405711532, "learning_rate": 8.721934369602764e-06, "loss": 0.0003, "step": 2170 }, { "epoch": 5.01, "grad_norm": 0.016125734895467758, "learning_rate": 8.506044905008637e-06, "loss": 0.0001, "step": 2180 }, { "epoch": 5.02, "grad_norm": 0.004009012598544359, "learning_rate": 8.290155440414509e-06, "loss": 0.0001, "step": 2190 }, { "epoch": 5.02, "grad_norm": 0.009335234761238098, "learning_rate": 8.074265975820381e-06, "loss": 0.0118, "step": 2200 }, { "epoch": 5.02, "grad_norm": 0.01937965489923954, "learning_rate": 7.858376511226253e-06, "loss": 0.0019, "step": 2210 }, { "epoch": 5.03, "grad_norm": 0.0025466824881732464, "learning_rate": 7.642487046632124e-06, "loss": 0.0001, "step": 2220 }, { "epoch": 5.03, "grad_norm": 0.0036932919174432755, "learning_rate": 7.426597582037997e-06, "loss": 0.0001, "step": 2230 }, { "epoch": 5.03, "grad_norm": 0.003243980696424842, "learning_rate": 7.210708117443869e-06, "loss": 0.0001, "step": 2240 }, { "epoch": 5.04, "grad_norm": 0.002856618259102106, "learning_rate": 6.994818652849741e-06, "loss": 0.0001, "step": 2250 }, { "epoch": 5.04, "grad_norm": 0.0019520550267770886, "learning_rate": 6.778929188255613e-06, "loss": 0.0001, "step": 2260 }, { "epoch": 5.05, "grad_norm": 0.003286935854703188, "learning_rate": 6.563039723661486e-06, "loss": 0.0001, "step": 2270 }, { "epoch": 5.05, "grad_norm": 0.001508223474957049, "learning_rate": 6.347150259067358e-06, "loss": 0.0056, "step": 2280 }, { "epoch": 5.05, "grad_norm": 0.05285673215985298, "learning_rate": 6.13126079447323e-06, "loss": 0.0054, "step": 2290 }, { "epoch": 5.06, "grad_norm": 0.004176164045929909, "learning_rate": 5.915371329879102e-06, "loss": 0.0589, "step": 2300 }, { "epoch": 5.06, "grad_norm": 0.0049232905730605125, "learning_rate": 5.699481865284975e-06, "loss": 0.0001, "step": 2310 }, { "epoch": 5.07, "grad_norm": 29.9996337890625, "learning_rate": 5.483592400690847e-06, "loss": 0.0541, "step": 2320 }, { "epoch": 5.07, "grad_norm": 0.006466562859714031, "learning_rate": 5.267702936096719e-06, "loss": 0.0002, "step": 2330 }, { "epoch": 5.07, "grad_norm": 0.002998851239681244, "learning_rate": 5.051813471502591e-06, "loss": 0.0001, "step": 2340 }, { "epoch": 5.08, "grad_norm": 0.001166225876659155, "learning_rate": 4.835924006908464e-06, "loss": 0.0002, "step": 2350 }, { "epoch": 5.08, "grad_norm": 0.0021310816518962383, "learning_rate": 4.620034542314336e-06, "loss": 0.0009, "step": 2360 }, { "epoch": 5.09, "grad_norm": 0.004912310279905796, "learning_rate": 4.404145077720207e-06, "loss": 0.0002, "step": 2370 }, { "epoch": 5.09, "grad_norm": 0.0024258983321487904, "learning_rate": 4.18825561312608e-06, "loss": 0.0014, "step": 2380 }, { "epoch": 5.09, "grad_norm": 0.0011308868415653706, "learning_rate": 3.972366148531952e-06, "loss": 0.0001, "step": 2390 }, { "epoch": 5.1, "grad_norm": 0.0015520398737862706, "learning_rate": 3.756476683937824e-06, "loss": 0.0001, "step": 2400 }, { "epoch": 5.1, "grad_norm": 0.0016552858287468553, "learning_rate": 3.5405872193436963e-06, "loss": 0.0002, "step": 2410 }, { "epoch": 5.1, "grad_norm": 0.003429860807955265, "learning_rate": 3.3246977547495685e-06, "loss": 0.0005, "step": 2420 }, { "epoch": 5.11, "grad_norm": 3.591295003890991, "learning_rate": 3.1088082901554407e-06, "loss": 0.0029, "step": 2430 }, { "epoch": 5.11, "grad_norm": 0.002391704125329852, "learning_rate": 2.892918825561313e-06, "loss": 0.0416, "step": 2440 }, { "epoch": 5.12, "grad_norm": 0.0021623370703309774, "learning_rate": 2.677029360967185e-06, "loss": 0.0001, "step": 2450 }, { "epoch": 5.12, "grad_norm": 0.006243993528187275, "learning_rate": 2.4611398963730574e-06, "loss": 0.0001, "step": 2460 }, { "epoch": 5.12, "grad_norm": 0.004033067263662815, "learning_rate": 2.2452504317789296e-06, "loss": 0.0047, "step": 2470 }, { "epoch": 5.13, "grad_norm": 0.002611899748444557, "learning_rate": 2.029360967184802e-06, "loss": 0.0001, "step": 2480 }, { "epoch": 5.13, "grad_norm": 0.0029090410098433495, "learning_rate": 1.8134715025906736e-06, "loss": 0.0002, "step": 2490 }, { "epoch": 5.14, "grad_norm": 0.007737706881016493, "learning_rate": 1.5975820379965458e-06, "loss": 0.0001, "step": 2500 }, { "epoch": 5.14, "grad_norm": 0.0028760808054357767, "learning_rate": 1.381692573402418e-06, "loss": 0.0005, "step": 2510 }, { "epoch": 5.14, "grad_norm": 0.2445225566625595, "learning_rate": 1.1658031088082903e-06, "loss": 0.0011, "step": 2520 }, { "epoch": 5.15, "grad_norm": 0.001734812743961811, "learning_rate": 9.499136442141624e-07, "loss": 0.0004, "step": 2530 }, { "epoch": 5.15, "grad_norm": 0.0022820383310317993, "learning_rate": 7.340241796200346e-07, "loss": 0.0001, "step": 2540 }, { "epoch": 5.16, "grad_norm": 0.002981896046549082, "learning_rate": 5.181347150259068e-07, "loss": 0.0002, "step": 2550 }, { "epoch": 5.16, "grad_norm": 31.39242935180664, "learning_rate": 3.02245250431779e-07, "loss": 0.037, "step": 2560 }, { "epoch": 5.16, "grad_norm": 0.0018530270317569375, "learning_rate": 8.635578583765113e-08, "loss": 0.0002, "step": 2570 }, { "epoch": 5.16, "eval_accuracy": 0.9421052631578948, "eval_loss": 0.3020946681499481, "eval_runtime": 526.5435, "eval_samples_per_second": 0.361, "eval_steps_per_second": 0.18, "step": 2574 }, { "epoch": 5.16, "step": 2574, "total_flos": 1.3176952627228508e+19, "train_loss": 0.2533098812526059, "train_runtime": 42827.5494, "train_samples_per_second": 0.12, "train_steps_per_second": 0.06 }, { "epoch": 5.16, "eval_accuracy": 0.9895833333333334, "eval_loss": 0.11018911004066467, "eval_runtime": 273.5625, "eval_samples_per_second": 0.351, "eval_steps_per_second": 0.175, "step": 2574 }, { "epoch": 5.16, "eval_accuracy": 0.9895833333333334, "eval_loss": 0.11018910259008408, "eval_runtime": 269.2698, "eval_samples_per_second": 0.357, "eval_steps_per_second": 0.178, "step": 2574 } ], "logging_steps": 10, "max_steps": 2574, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "total_flos": 1.3176952627228508e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }