{ "best_metric": 0.39477089047431946, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 2.002844950213371, "eval_steps": 25, "global_step": 132, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015173067804646752, "grad_norm": 17.388423919677734, "learning_rate": 2.9999999999999997e-05, "loss": 30.457, "step": 1 }, { "epoch": 0.015173067804646752, "eval_loss": 0.8401981592178345, "eval_runtime": 4.1216, "eval_samples_per_second": 12.131, "eval_steps_per_second": 12.131, "step": 1 }, { "epoch": 0.030346135609293504, "grad_norm": 10.178067207336426, "learning_rate": 5.9999999999999995e-05, "loss": 30.335, "step": 2 }, { "epoch": 0.04551920341394026, "grad_norm": 10.258736610412598, "learning_rate": 8.999999999999999e-05, "loss": 28.3123, "step": 3 }, { "epoch": 0.06069227121858701, "grad_norm": 10.296838760375977, "learning_rate": 0.00011999999999999999, "loss": 27.9287, "step": 4 }, { "epoch": 0.07586533902323377, "grad_norm": 12.867237091064453, "learning_rate": 0.00015, "loss": 25.3598, "step": 5 }, { "epoch": 0.09103840682788052, "grad_norm": 12.431685447692871, "learning_rate": 0.00017999999999999998, "loss": 22.2146, "step": 6 }, { "epoch": 0.10621147463252727, "grad_norm": 13.944931983947754, "learning_rate": 0.00020999999999999998, "loss": 19.7453, "step": 7 }, { "epoch": 0.12138454243717402, "grad_norm": 23.274085998535156, "learning_rate": 0.00023999999999999998, "loss": 19.6562, "step": 8 }, { "epoch": 0.13655761024182078, "grad_norm": 23.921234130859375, "learning_rate": 0.00027, "loss": 15.7789, "step": 9 }, { "epoch": 0.15173067804646753, "grad_norm": 31.0799560546875, "learning_rate": 0.0003, "loss": 18.6639, "step": 10 }, { "epoch": 0.16690374585111428, "grad_norm": 27.46465301513672, "learning_rate": 0.00029995027012714694, "loss": 15.0467, "step": 11 }, { "epoch": 0.18207681365576103, "grad_norm": 21.228845596313477, "learning_rate": 0.00029980111348272456, "loss": 16.0005, "step": 12 }, { "epoch": 0.19724988146040778, "grad_norm": 17.686410903930664, "learning_rate": 0.00029955262896727894, "loss": 15.2921, "step": 13 }, { "epoch": 0.21242294926505453, "grad_norm": 16.29988670349121, "learning_rate": 0.00029920498134218835, "loss": 14.4487, "step": 14 }, { "epoch": 0.22759601706970128, "grad_norm": 18.719757080078125, "learning_rate": 0.0002987584011204152, "loss": 12.782, "step": 15 }, { "epoch": 0.24276908487434803, "grad_norm": 19.07723617553711, "learning_rate": 0.0002982131844136615, "loss": 12.9955, "step": 16 }, { "epoch": 0.2579421526789948, "grad_norm": 45.31391143798828, "learning_rate": 0.0002975696927360274, "loss": 23.8883, "step": 17 }, { "epoch": 0.27311522048364156, "grad_norm": 27.076311111450195, "learning_rate": 0.0002968283527643036, "loss": 21.5994, "step": 18 }, { "epoch": 0.2882882882882883, "grad_norm": 15.616254806518555, "learning_rate": 0.00029598965605505737, "loss": 18.592, "step": 19 }, { "epoch": 0.30346135609293506, "grad_norm": 18.812135696411133, "learning_rate": 0.000295054158718698, "loss": 16.3996, "step": 20 }, { "epoch": 0.3186344238975818, "grad_norm": 14.7747163772583, "learning_rate": 0.0002940224810507402, "loss": 16.5481, "step": 21 }, { "epoch": 0.33380749170222856, "grad_norm": 13.129542350769043, "learning_rate": 0.00029289530712050735, "loss": 14.6397, "step": 22 }, { "epoch": 0.3489805595068753, "grad_norm": 13.189936637878418, "learning_rate": 0.0002916733843175492, "loss": 14.889, "step": 23 }, { "epoch": 0.36415362731152207, "grad_norm": 11.230732917785645, "learning_rate": 0.000290357522856074, "loss": 13.3495, "step": 24 }, { "epoch": 0.3793266951161688, "grad_norm": 10.45602798461914, "learning_rate": 0.0002889485952377242, "loss": 13.97, "step": 25 }, { "epoch": 0.3793266951161688, "eval_loss": 0.4615178406238556, "eval_runtime": 4.1945, "eval_samples_per_second": 11.92, "eval_steps_per_second": 11.92, "step": 25 }, { "epoch": 0.39449976292081557, "grad_norm": 9.400847434997559, "learning_rate": 0.0002874475356730507, "loss": 13.0953, "step": 26 }, { "epoch": 0.4096728307254623, "grad_norm": 10.215126037597656, "learning_rate": 0.0002858553394620707, "loss": 14.0127, "step": 27 }, { "epoch": 0.42484589853010907, "grad_norm": 8.064435958862305, "learning_rate": 0.0002841730623343193, "loss": 12.4928, "step": 28 }, { "epoch": 0.4400189663347558, "grad_norm": 10.052416801452637, "learning_rate": 0.00028240181974883207, "loss": 14.195, "step": 29 }, { "epoch": 0.45519203413940257, "grad_norm": 9.893036842346191, "learning_rate": 0.00028054278615452326, "loss": 12.9534, "step": 30 }, { "epoch": 0.4703651019440493, "grad_norm": 9.579009056091309, "learning_rate": 0.0002785971942114498, "loss": 11.9894, "step": 31 }, { "epoch": 0.48553816974869607, "grad_norm": 10.615213394165039, "learning_rate": 0.0002765663339734778, "loss": 11.5054, "step": 32 }, { "epoch": 0.5007112375533428, "grad_norm": 18.542469024658203, "learning_rate": 0.0002744515520328928, "loss": 17.3979, "step": 33 }, { "epoch": 0.5158843053579896, "grad_norm": 15.90185260772705, "learning_rate": 0.00027225425062752165, "loss": 16.0495, "step": 34 }, { "epoch": 0.5310573731626363, "grad_norm": 12.792567253112793, "learning_rate": 0.0002699758867109579, "loss": 15.2643, "step": 35 }, { "epoch": 0.5462304409672831, "grad_norm": 9.845219612121582, "learning_rate": 0.0002676179709865066, "loss": 13.5142, "step": 36 }, { "epoch": 0.5614035087719298, "grad_norm": 12.22781753540039, "learning_rate": 0.00026518206690549, "loss": 14.2178, "step": 37 }, { "epoch": 0.5765765765765766, "grad_norm": 10.745471954345703, "learning_rate": 0.0002626697896305779, "loss": 12.9431, "step": 38 }, { "epoch": 0.5917496443812233, "grad_norm": 10.67679500579834, "learning_rate": 0.00026008280496482984, "loss": 15.1994, "step": 39 }, { "epoch": 0.6069227121858701, "grad_norm": 8.882881164550781, "learning_rate": 0.000257422828247159, "loss": 14.0005, "step": 40 }, { "epoch": 0.6220957799905168, "grad_norm": 9.826411247253418, "learning_rate": 0.00025469162321495147, "loss": 12.5264, "step": 41 }, { "epoch": 0.6372688477951636, "grad_norm": 8.515768051147461, "learning_rate": 0.00025189100083459397, "loss": 11.0442, "step": 42 }, { "epoch": 0.6524419155998104, "grad_norm": 9.0559720993042, "learning_rate": 0.00024902281810068475, "loss": 11.7402, "step": 43 }, { "epoch": 0.6676149834044571, "grad_norm": 9.556861877441406, "learning_rate": 0.0002460889768047263, "loss": 12.4502, "step": 44 }, { "epoch": 0.6827880512091038, "grad_norm": 8.837124824523926, "learning_rate": 0.0002430914222741134, "loss": 12.4087, "step": 45 }, { "epoch": 0.6979611190137506, "grad_norm": 15.878302574157715, "learning_rate": 0.00024003214208225522, "loss": 11.0582, "step": 46 }, { "epoch": 0.7131341868183974, "grad_norm": 9.87769603729248, "learning_rate": 0.00023691316473068452, "loss": 11.3891, "step": 47 }, { "epoch": 0.7283072546230441, "grad_norm": 8.35490608215332, "learning_rate": 0.00023373655830402968, "loss": 9.9121, "step": 48 }, { "epoch": 0.7434803224276908, "grad_norm": 17.935434341430664, "learning_rate": 0.00023050442909874007, "loss": 16.9965, "step": 49 }, { "epoch": 0.7586533902323376, "grad_norm": 15.285179138183594, "learning_rate": 0.00022721892022647462, "loss": 16.4137, "step": 50 }, { "epoch": 0.7586533902323376, "eval_loss": 0.4162628650665283, "eval_runtime": 4.1979, "eval_samples_per_second": 11.911, "eval_steps_per_second": 11.911, "step": 50 }, { "epoch": 0.7738264580369844, "grad_norm": 11.381631851196289, "learning_rate": 0.00022388221019307967, "loss": 14.1246, "step": 51 }, { "epoch": 0.7889995258416311, "grad_norm": 10.287455558776855, "learning_rate": 0.000220496511454098, "loss": 13.1531, "step": 52 }, { "epoch": 0.8041725936462779, "grad_norm": 10.540827751159668, "learning_rate": 0.00021706406894776709, "loss": 12.6342, "step": 53 }, { "epoch": 0.8193456614509246, "grad_norm": 10.495640754699707, "learning_rate": 0.0002135871586064791, "loss": 13.6827, "step": 54 }, { "epoch": 0.8345187292555714, "grad_norm": 12.297253608703613, "learning_rate": 0.00021006808584768998, "loss": 13.6184, "step": 55 }, { "epoch": 0.8496917970602181, "grad_norm": 10.440007209777832, "learning_rate": 0.00020650918404527775, "loss": 12.9478, "step": 56 }, { "epoch": 0.8648648648648649, "grad_norm": 10.427011489868164, "learning_rate": 0.00020291281298236423, "loss": 12.4868, "step": 57 }, { "epoch": 0.8800379326695116, "grad_norm": 9.420439720153809, "learning_rate": 0.00019928135728662522, "loss": 13.8967, "step": 58 }, { "epoch": 0.8952110004741584, "grad_norm": 10.802536964416504, "learning_rate": 0.0001956172248491277, "loss": 14.1609, "step": 59 }, { "epoch": 0.9103840682788051, "grad_norm": 8.980121612548828, "learning_rate": 0.00019192284522774142, "loss": 13.0885, "step": 60 }, { "epoch": 0.9255571360834519, "grad_norm": 8.586287498474121, "learning_rate": 0.00018820066803618428, "loss": 10.7247, "step": 61 }, { "epoch": 0.9407302038880986, "grad_norm": 10.254575729370117, "learning_rate": 0.00018445316131976934, "loss": 12.6883, "step": 62 }, { "epoch": 0.9559032716927454, "grad_norm": 10.308297157287598, "learning_rate": 0.00018068280991893014, "loss": 10.7828, "step": 63 }, { "epoch": 0.9710763394973921, "grad_norm": 10.438807487487793, "learning_rate": 0.00017689211382161034, "loss": 9.4578, "step": 64 }, { "epoch": 0.9862494073020389, "grad_norm": 16.487628936767578, "learning_rate": 0.00017308358650560928, "loss": 13.0305, "step": 65 }, { "epoch": 1.0014224751066856, "grad_norm": 8.836454391479492, "learning_rate": 0.00016925975327198266, "loss": 12.8189, "step": 66 }, { "epoch": 1.0165955429113325, "grad_norm": 10.768722534179688, "learning_rate": 0.00016542314957060405, "loss": 14.4475, "step": 67 }, { "epoch": 1.0317686107159791, "grad_norm": 8.652929306030273, "learning_rate": 0.00016157631931899697, "loss": 15.0635, "step": 68 }, { "epoch": 1.0469416785206258, "grad_norm": 7.712925910949707, "learning_rate": 0.00015772181321555196, "loss": 12.3852, "step": 69 }, { "epoch": 1.0621147463252727, "grad_norm": 9.161529541015625, "learning_rate": 0.0001538621870482483, "loss": 12.3397, "step": 70 }, { "epoch": 1.0772878141299194, "grad_norm": 8.765726089477539, "learning_rate": 0.00015, "loss": 11.8658, "step": 71 }, { "epoch": 1.0924608819345663, "grad_norm": 8.678772926330566, "learning_rate": 0.00014613781295175172, "loss": 11.8749, "step": 72 }, { "epoch": 1.107633949739213, "grad_norm": 8.968255996704102, "learning_rate": 0.000142278186784448, "loss": 11.2309, "step": 73 }, { "epoch": 1.1228070175438596, "grad_norm": 8.212029457092285, "learning_rate": 0.00013842368068100303, "loss": 9.0183, "step": 74 }, { "epoch": 1.1379800853485065, "grad_norm": 7.790862560272217, "learning_rate": 0.00013457685042939592, "loss": 10.4468, "step": 75 }, { "epoch": 1.1379800853485065, "eval_loss": 0.39976993203163147, "eval_runtime": 4.1988, "eval_samples_per_second": 11.908, "eval_steps_per_second": 11.908, "step": 75 }, { "epoch": 1.1531531531531531, "grad_norm": 7.495136260986328, "learning_rate": 0.00013074024672801731, "loss": 9.8966, "step": 76 }, { "epoch": 1.1683262209577998, "grad_norm": 8.63007640838623, "learning_rate": 0.0001269164134943907, "loss": 9.9346, "step": 77 }, { "epoch": 1.1834992887624467, "grad_norm": 9.116193771362305, "learning_rate": 0.00012310788617838966, "loss": 11.3677, "step": 78 }, { "epoch": 1.1986723565670934, "grad_norm": 9.81653881072998, "learning_rate": 0.0001193171900810699, "loss": 10.2669, "step": 79 }, { "epoch": 1.2138454243717403, "grad_norm": 10.604126930236816, "learning_rate": 0.00011554683868023067, "loss": 10.7322, "step": 80 }, { "epoch": 1.229018492176387, "grad_norm": 10.463966369628906, "learning_rate": 0.0001117993319638157, "loss": 9.2057, "step": 81 }, { "epoch": 1.2441915599810336, "grad_norm": 9.727025985717773, "learning_rate": 0.00010807715477225858, "loss": 8.6479, "step": 82 }, { "epoch": 1.2593646277856805, "grad_norm": 13.668055534362793, "learning_rate": 0.00010438277515087233, "loss": 14.4636, "step": 83 }, { "epoch": 1.2745376955903271, "grad_norm": 11.730889320373535, "learning_rate": 0.00010071864271337478, "loss": 14.0765, "step": 84 }, { "epoch": 1.2897107633949738, "grad_norm": 11.76871109008789, "learning_rate": 9.708718701763577e-05, "loss": 12.6621, "step": 85 }, { "epoch": 1.3048838311996207, "grad_norm": 12.476866722106934, "learning_rate": 9.34908159547222e-05, "loss": 12.6543, "step": 86 }, { "epoch": 1.3200568990042674, "grad_norm": 9.014403343200684, "learning_rate": 8.993191415231e-05, "loss": 10.5078, "step": 87 }, { "epoch": 1.3352299668089143, "grad_norm": 8.82985782623291, "learning_rate": 8.641284139352091e-05, "loss": 10.1313, "step": 88 }, { "epoch": 1.350403034613561, "grad_norm": 9.604599952697754, "learning_rate": 8.293593105223287e-05, "loss": 10.9247, "step": 89 }, { "epoch": 1.3655761024182076, "grad_norm": 8.465795516967773, "learning_rate": 7.950348854590204e-05, "loss": 9.671, "step": 90 }, { "epoch": 1.3807491702228545, "grad_norm": 8.575054168701172, "learning_rate": 7.611778980692035e-05, "loss": 10.2636, "step": 91 }, { "epoch": 1.3959222380275011, "grad_norm": 7.928877830505371, "learning_rate": 7.278107977352543e-05, "loss": 8.982, "step": 92 }, { "epoch": 1.411095305832148, "grad_norm": 9.111271858215332, "learning_rate": 6.949557090125994e-05, "loss": 9.8091, "step": 93 }, { "epoch": 1.4262683736367947, "grad_norm": 8.69306468963623, "learning_rate": 6.626344169597031e-05, "loss": 9.7754, "step": 94 }, { "epoch": 1.4414414414414414, "grad_norm": 8.90804386138916, "learning_rate": 6.308683526931545e-05, "loss": 9.7669, "step": 95 }, { "epoch": 1.4566145092460883, "grad_norm": 9.999079704284668, "learning_rate": 5.996785791774478e-05, "loss": 9.0066, "step": 96 }, { "epoch": 1.471787577050735, "grad_norm": 9.848721504211426, "learning_rate": 5.690857772588657e-05, "loss": 8.0642, "step": 97 }, { "epoch": 1.4869606448553818, "grad_norm": 10.121479988098145, "learning_rate": 5.391102319527373e-05, "loss": 8.1109, "step": 98 }, { "epoch": 1.5021337126600285, "grad_norm": 11.15321159362793, "learning_rate": 5.0977181899315214e-05, "loss": 15.3952, "step": 99 }, { "epoch": 1.5173067804646752, "grad_norm": 10.735111236572266, "learning_rate": 4.8108999165406026e-05, "loss": 14.3337, "step": 100 }, { "epoch": 1.5173067804646752, "eval_loss": 0.39477089047431946, "eval_runtime": 4.2024, "eval_samples_per_second": 11.898, "eval_steps_per_second": 11.898, "step": 100 }, { "epoch": 1.5324798482693218, "grad_norm": 8.945897102355957, "learning_rate": 4.5308376785048434e-05, "loss": 11.1062, "step": 101 }, { "epoch": 1.5476529160739687, "grad_norm": 9.844161987304688, "learning_rate": 4.257717175284103e-05, "loss": 10.4609, "step": 102 }, { "epoch": 1.5628259838786156, "grad_norm": 9.294745445251465, "learning_rate": 3.991719503517014e-05, "loss": 10.6322, "step": 103 }, { "epoch": 1.5779990516832623, "grad_norm": 9.901968002319336, "learning_rate": 3.733021036942205e-05, "loss": 11.3513, "step": 104 }, { "epoch": 1.593172119487909, "grad_norm": 9.366765975952148, "learning_rate": 3.481793309451e-05, "loss": 10.2632, "step": 105 }, { "epoch": 1.6083451872925556, "grad_norm": 8.39588451385498, "learning_rate": 3.238202901349345e-05, "loss": 10.2529, "step": 106 }, { "epoch": 1.6235182550972025, "grad_norm": 8.285764694213867, "learning_rate": 3.0024113289042094e-05, "loss": 9.9579, "step": 107 }, { "epoch": 1.6386913229018494, "grad_norm": 8.389649391174316, "learning_rate": 2.774574937247831e-05, "loss": 10.3428, "step": 108 }, { "epoch": 1.653864390706496, "grad_norm": 8.863457679748535, "learning_rate": 2.554844796710716e-05, "loss": 9.0607, "step": 109 }, { "epoch": 1.6690374585111427, "grad_norm": 8.510146141052246, "learning_rate": 2.3433666026522153e-05, "loss": 9.3396, "step": 110 }, { "epoch": 1.6842105263157894, "grad_norm": 9.09020709991455, "learning_rate": 2.1402805788550138e-05, "loss": 9.223, "step": 111 }, { "epoch": 1.6993835941204363, "grad_norm": 8.539032936096191, "learning_rate": 1.945721384547671e-05, "loss": 7.2793, "step": 112 }, { "epoch": 1.714556661925083, "grad_norm": 9.459700584411621, "learning_rate": 1.759818025116787e-05, "loss": 7.6996, "step": 113 }, { "epoch": 1.7297297297297298, "grad_norm": 11.580260276794434, "learning_rate": 1.5826937665680693e-05, "loss": 7.68, "step": 114 }, { "epoch": 1.7449027975343765, "grad_norm": 8.748165130615234, "learning_rate": 1.4144660537929287e-05, "loss": 13.0615, "step": 115 }, { "epoch": 1.7600758653390232, "grad_norm": 9.052087783813477, "learning_rate": 1.2552464326949302e-05, "loss": 13.9056, "step": 116 }, { "epoch": 1.7752489331436698, "grad_norm": 9.413413047790527, "learning_rate": 1.105140476227575e-05, "loss": 13.1337, "step": 117 }, { "epoch": 1.7904220009483167, "grad_norm": 8.633859634399414, "learning_rate": 9.64247714392597e-06, "loss": 11.1511, "step": 118 }, { "epoch": 1.8055950687529636, "grad_norm": 7.859529972076416, "learning_rate": 8.32661568245081e-06, "loss": 10.06, "step": 119 }, { "epoch": 1.8207681365576103, "grad_norm": 8.01181697845459, "learning_rate": 7.104692879492624e-06, "loss": 10.4743, "step": 120 }, { "epoch": 1.835941204362257, "grad_norm": 9.660663604736328, "learning_rate": 5.977518949259735e-06, "loss": 11.0629, "step": 121 }, { "epoch": 1.8511142721669036, "grad_norm": 8.536215782165527, "learning_rate": 4.945841281301943e-06, "loss": 10.3549, "step": 122 }, { "epoch": 1.8662873399715505, "grad_norm": 8.264043807983398, "learning_rate": 4.010343944942618e-06, "loss": 9.8312, "step": 123 }, { "epoch": 1.8814604077761974, "grad_norm": 8.763490676879883, "learning_rate": 3.1716472356963286e-06, "loss": 8.8429, "step": 124 }, { "epoch": 1.896633475580844, "grad_norm": 9.253498077392578, "learning_rate": 2.430307263972547e-06, "loss": 10.0951, "step": 125 }, { "epoch": 1.896633475580844, "eval_loss": 0.3835863471031189, "eval_runtime": 4.1958, "eval_samples_per_second": 11.917, "eval_steps_per_second": 11.917, "step": 125 }, { "epoch": 1.9118065433854907, "grad_norm": 8.449151039123535, "learning_rate": 1.7868155863384415e-06, "loss": 9.1299, "step": 126 }, { "epoch": 1.9269796111901374, "grad_norm": 8.359270095825195, "learning_rate": 1.2415988795847765e-06, "loss": 9.0518, "step": 127 }, { "epoch": 1.9421526789947843, "grad_norm": 8.54788589477539, "learning_rate": 7.950186578116413e-07, "loss": 7.7242, "step": 128 }, { "epoch": 1.9573257467994312, "grad_norm": 8.631213188171387, "learning_rate": 4.473710327209945e-07, "loss": 6.4852, "step": 129 }, { "epoch": 1.9724988146040778, "grad_norm": 8.945975303649902, "learning_rate": 1.988865172754206e-07, "loss": 7.753, "step": 130 }, { "epoch": 1.9876718824087245, "grad_norm": 8.31573486328125, "learning_rate": 4.972987285304375e-08, "loss": 11.2559, "step": 131 }, { "epoch": 2.002844950213371, "grad_norm": 8.791767120361328, "learning_rate": 0.0, "loss": 9.4748, "step": 132 } ], "logging_steps": 1, "max_steps": 132, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8361400481231667e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }