{ "best_metric": 0.39477089047431946, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 1.5173067804646752, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015173067804646752, "grad_norm": 17.388423919677734, "learning_rate": 2.9999999999999997e-05, "loss": 30.457, "step": 1 }, { "epoch": 0.015173067804646752, "eval_loss": 0.8401981592178345, "eval_runtime": 4.1216, "eval_samples_per_second": 12.131, "eval_steps_per_second": 12.131, "step": 1 }, { "epoch": 0.030346135609293504, "grad_norm": 10.178067207336426, "learning_rate": 5.9999999999999995e-05, "loss": 30.335, "step": 2 }, { "epoch": 0.04551920341394026, "grad_norm": 10.258736610412598, "learning_rate": 8.999999999999999e-05, "loss": 28.3123, "step": 3 }, { "epoch": 0.06069227121858701, "grad_norm": 10.296838760375977, "learning_rate": 0.00011999999999999999, "loss": 27.9287, "step": 4 }, { "epoch": 0.07586533902323377, "grad_norm": 12.867237091064453, "learning_rate": 0.00015, "loss": 25.3598, "step": 5 }, { "epoch": 0.09103840682788052, "grad_norm": 12.431685447692871, "learning_rate": 0.00017999999999999998, "loss": 22.2146, "step": 6 }, { "epoch": 0.10621147463252727, "grad_norm": 13.944931983947754, "learning_rate": 0.00020999999999999998, "loss": 19.7453, "step": 7 }, { "epoch": 0.12138454243717402, "grad_norm": 23.274085998535156, "learning_rate": 0.00023999999999999998, "loss": 19.6562, "step": 8 }, { "epoch": 0.13655761024182078, "grad_norm": 23.921234130859375, "learning_rate": 0.00027, "loss": 15.7789, "step": 9 }, { "epoch": 0.15173067804646753, "grad_norm": 31.0799560546875, "learning_rate": 0.0003, "loss": 18.6639, "step": 10 }, { "epoch": 0.16690374585111428, "grad_norm": 27.46465301513672, "learning_rate": 0.00029995027012714694, "loss": 15.0467, "step": 11 }, { "epoch": 0.18207681365576103, "grad_norm": 21.228845596313477, "learning_rate": 0.00029980111348272456, "loss": 16.0005, "step": 12 }, { "epoch": 0.19724988146040778, "grad_norm": 17.686410903930664, "learning_rate": 0.00029955262896727894, "loss": 15.2921, "step": 13 }, { "epoch": 0.21242294926505453, "grad_norm": 16.29988670349121, "learning_rate": 0.00029920498134218835, "loss": 14.4487, "step": 14 }, { "epoch": 0.22759601706970128, "grad_norm": 18.719757080078125, "learning_rate": 0.0002987584011204152, "loss": 12.782, "step": 15 }, { "epoch": 0.24276908487434803, "grad_norm": 19.07723617553711, "learning_rate": 0.0002982131844136615, "loss": 12.9955, "step": 16 }, { "epoch": 0.2579421526789948, "grad_norm": 45.31391143798828, "learning_rate": 0.0002975696927360274, "loss": 23.8883, "step": 17 }, { "epoch": 0.27311522048364156, "grad_norm": 27.076311111450195, "learning_rate": 0.0002968283527643036, "loss": 21.5994, "step": 18 }, { "epoch": 0.2882882882882883, "grad_norm": 15.616254806518555, "learning_rate": 0.00029598965605505737, "loss": 18.592, "step": 19 }, { "epoch": 0.30346135609293506, "grad_norm": 18.812135696411133, "learning_rate": 0.000295054158718698, "loss": 16.3996, "step": 20 }, { "epoch": 0.3186344238975818, "grad_norm": 14.7747163772583, "learning_rate": 0.0002940224810507402, "loss": 16.5481, "step": 21 }, { "epoch": 0.33380749170222856, "grad_norm": 13.129542350769043, "learning_rate": 0.00029289530712050735, "loss": 14.6397, "step": 22 }, { "epoch": 0.3489805595068753, "grad_norm": 13.189936637878418, "learning_rate": 0.0002916733843175492, "loss": 14.889, "step": 23 }, { "epoch": 0.36415362731152207, "grad_norm": 11.230732917785645, "learning_rate": 0.000290357522856074, "loss": 13.3495, "step": 24 }, { "epoch": 0.3793266951161688, "grad_norm": 10.45602798461914, "learning_rate": 0.0002889485952377242, "loss": 13.97, "step": 25 }, { "epoch": 0.3793266951161688, "eval_loss": 0.4615178406238556, "eval_runtime": 4.1945, "eval_samples_per_second": 11.92, "eval_steps_per_second": 11.92, "step": 25 }, { "epoch": 0.39449976292081557, "grad_norm": 9.400847434997559, "learning_rate": 0.0002874475356730507, "loss": 13.0953, "step": 26 }, { "epoch": 0.4096728307254623, "grad_norm": 10.215126037597656, "learning_rate": 0.0002858553394620707, "loss": 14.0127, "step": 27 }, { "epoch": 0.42484589853010907, "grad_norm": 8.064435958862305, "learning_rate": 0.0002841730623343193, "loss": 12.4928, "step": 28 }, { "epoch": 0.4400189663347558, "grad_norm": 10.052416801452637, "learning_rate": 0.00028240181974883207, "loss": 14.195, "step": 29 }, { "epoch": 0.45519203413940257, "grad_norm": 9.893036842346191, "learning_rate": 0.00028054278615452326, "loss": 12.9534, "step": 30 }, { "epoch": 0.4703651019440493, "grad_norm": 9.579009056091309, "learning_rate": 0.0002785971942114498, "loss": 11.9894, "step": 31 }, { "epoch": 0.48553816974869607, "grad_norm": 10.615213394165039, "learning_rate": 0.0002765663339734778, "loss": 11.5054, "step": 32 }, { "epoch": 0.5007112375533428, "grad_norm": 18.542469024658203, "learning_rate": 0.0002744515520328928, "loss": 17.3979, "step": 33 }, { "epoch": 0.5158843053579896, "grad_norm": 15.90185260772705, "learning_rate": 0.00027225425062752165, "loss": 16.0495, "step": 34 }, { "epoch": 0.5310573731626363, "grad_norm": 12.792567253112793, "learning_rate": 0.0002699758867109579, "loss": 15.2643, "step": 35 }, { "epoch": 0.5462304409672831, "grad_norm": 9.845219612121582, "learning_rate": 0.0002676179709865066, "loss": 13.5142, "step": 36 }, { "epoch": 0.5614035087719298, "grad_norm": 12.22781753540039, "learning_rate": 0.00026518206690549, "loss": 14.2178, "step": 37 }, { "epoch": 0.5765765765765766, "grad_norm": 10.745471954345703, "learning_rate": 0.0002626697896305779, "loss": 12.9431, "step": 38 }, { "epoch": 0.5917496443812233, "grad_norm": 10.67679500579834, "learning_rate": 0.00026008280496482984, "loss": 15.1994, "step": 39 }, { "epoch": 0.6069227121858701, "grad_norm": 8.882881164550781, "learning_rate": 0.000257422828247159, "loss": 14.0005, "step": 40 }, { "epoch": 0.6220957799905168, "grad_norm": 9.826411247253418, "learning_rate": 0.00025469162321495147, "loss": 12.5264, "step": 41 }, { "epoch": 0.6372688477951636, "grad_norm": 8.515768051147461, "learning_rate": 0.00025189100083459397, "loss": 11.0442, "step": 42 }, { "epoch": 0.6524419155998104, "grad_norm": 9.0559720993042, "learning_rate": 0.00024902281810068475, "loss": 11.7402, "step": 43 }, { "epoch": 0.6676149834044571, "grad_norm": 9.556861877441406, "learning_rate": 0.0002460889768047263, "loss": 12.4502, "step": 44 }, { "epoch": 0.6827880512091038, "grad_norm": 8.837124824523926, "learning_rate": 0.0002430914222741134, "loss": 12.4087, "step": 45 }, { "epoch": 0.6979611190137506, "grad_norm": 15.878302574157715, "learning_rate": 0.00024003214208225522, "loss": 11.0582, "step": 46 }, { "epoch": 0.7131341868183974, "grad_norm": 9.87769603729248, "learning_rate": 0.00023691316473068452, "loss": 11.3891, "step": 47 }, { "epoch": 0.7283072546230441, "grad_norm": 8.35490608215332, "learning_rate": 0.00023373655830402968, "loss": 9.9121, "step": 48 }, { "epoch": 0.7434803224276908, "grad_norm": 17.935434341430664, "learning_rate": 0.00023050442909874007, "loss": 16.9965, "step": 49 }, { "epoch": 0.7586533902323376, "grad_norm": 15.285179138183594, "learning_rate": 0.00022721892022647462, "loss": 16.4137, "step": 50 }, { "epoch": 0.7586533902323376, "eval_loss": 0.4162628650665283, "eval_runtime": 4.1979, "eval_samples_per_second": 11.911, "eval_steps_per_second": 11.911, "step": 50 }, { "epoch": 0.7738264580369844, "grad_norm": 11.381631851196289, "learning_rate": 0.00022388221019307967, "loss": 14.1246, "step": 51 }, { "epoch": 0.7889995258416311, "grad_norm": 10.287455558776855, "learning_rate": 0.000220496511454098, "loss": 13.1531, "step": 52 }, { "epoch": 0.8041725936462779, "grad_norm": 10.540827751159668, "learning_rate": 0.00021706406894776709, "loss": 12.6342, "step": 53 }, { "epoch": 0.8193456614509246, "grad_norm": 10.495640754699707, "learning_rate": 0.0002135871586064791, "loss": 13.6827, "step": 54 }, { "epoch": 0.8345187292555714, "grad_norm": 12.297253608703613, "learning_rate": 0.00021006808584768998, "loss": 13.6184, "step": 55 }, { "epoch": 0.8496917970602181, "grad_norm": 10.440007209777832, "learning_rate": 0.00020650918404527775, "loss": 12.9478, "step": 56 }, { "epoch": 0.8648648648648649, "grad_norm": 10.427011489868164, "learning_rate": 0.00020291281298236423, "loss": 12.4868, "step": 57 }, { "epoch": 0.8800379326695116, "grad_norm": 9.420439720153809, "learning_rate": 0.00019928135728662522, "loss": 13.8967, "step": 58 }, { "epoch": 0.8952110004741584, "grad_norm": 10.802536964416504, "learning_rate": 0.0001956172248491277, "loss": 14.1609, "step": 59 }, { "epoch": 0.9103840682788051, "grad_norm": 8.980121612548828, "learning_rate": 0.00019192284522774142, "loss": 13.0885, "step": 60 }, { "epoch": 0.9255571360834519, "grad_norm": 8.586287498474121, "learning_rate": 0.00018820066803618428, "loss": 10.7247, "step": 61 }, { "epoch": 0.9407302038880986, "grad_norm": 10.254575729370117, "learning_rate": 0.00018445316131976934, "loss": 12.6883, "step": 62 }, { "epoch": 0.9559032716927454, "grad_norm": 10.308297157287598, "learning_rate": 0.00018068280991893014, "loss": 10.7828, "step": 63 }, { "epoch": 0.9710763394973921, "grad_norm": 10.438807487487793, "learning_rate": 0.00017689211382161034, "loss": 9.4578, "step": 64 }, { "epoch": 0.9862494073020389, "grad_norm": 16.487628936767578, "learning_rate": 0.00017308358650560928, "loss": 13.0305, "step": 65 }, { "epoch": 1.0014224751066856, "grad_norm": 8.836454391479492, "learning_rate": 0.00016925975327198266, "loss": 12.8189, "step": 66 }, { "epoch": 1.0165955429113325, "grad_norm": 10.768722534179688, "learning_rate": 0.00016542314957060405, "loss": 14.4475, "step": 67 }, { "epoch": 1.0317686107159791, "grad_norm": 8.652929306030273, "learning_rate": 0.00016157631931899697, "loss": 15.0635, "step": 68 }, { "epoch": 1.0469416785206258, "grad_norm": 7.712925910949707, "learning_rate": 0.00015772181321555196, "loss": 12.3852, "step": 69 }, { "epoch": 1.0621147463252727, "grad_norm": 9.161529541015625, "learning_rate": 0.0001538621870482483, "loss": 12.3397, "step": 70 }, { "epoch": 1.0772878141299194, "grad_norm": 8.765726089477539, "learning_rate": 0.00015, "loss": 11.8658, "step": 71 }, { "epoch": 1.0924608819345663, "grad_norm": 8.678772926330566, "learning_rate": 0.00014613781295175172, "loss": 11.8749, "step": 72 }, { "epoch": 1.107633949739213, "grad_norm": 8.968255996704102, "learning_rate": 0.000142278186784448, "loss": 11.2309, "step": 73 }, { "epoch": 1.1228070175438596, "grad_norm": 8.212029457092285, "learning_rate": 0.00013842368068100303, "loss": 9.0183, "step": 74 }, { "epoch": 1.1379800853485065, "grad_norm": 7.790862560272217, "learning_rate": 0.00013457685042939592, "loss": 10.4468, "step": 75 }, { "epoch": 1.1379800853485065, "eval_loss": 0.39976993203163147, "eval_runtime": 4.1988, "eval_samples_per_second": 11.908, "eval_steps_per_second": 11.908, "step": 75 }, { "epoch": 1.1531531531531531, "grad_norm": 7.495136260986328, "learning_rate": 0.00013074024672801731, "loss": 9.8966, "step": 76 }, { "epoch": 1.1683262209577998, "grad_norm": 8.63007640838623, "learning_rate": 0.0001269164134943907, "loss": 9.9346, "step": 77 }, { "epoch": 1.1834992887624467, "grad_norm": 9.116193771362305, "learning_rate": 0.00012310788617838966, "loss": 11.3677, "step": 78 }, { "epoch": 1.1986723565670934, "grad_norm": 9.81653881072998, "learning_rate": 0.0001193171900810699, "loss": 10.2669, "step": 79 }, { "epoch": 1.2138454243717403, "grad_norm": 10.604126930236816, "learning_rate": 0.00011554683868023067, "loss": 10.7322, "step": 80 }, { "epoch": 1.229018492176387, "grad_norm": 10.463966369628906, "learning_rate": 0.0001117993319638157, "loss": 9.2057, "step": 81 }, { "epoch": 1.2441915599810336, "grad_norm": 9.727025985717773, "learning_rate": 0.00010807715477225858, "loss": 8.6479, "step": 82 }, { "epoch": 1.2593646277856805, "grad_norm": 13.668055534362793, "learning_rate": 0.00010438277515087233, "loss": 14.4636, "step": 83 }, { "epoch": 1.2745376955903271, "grad_norm": 11.730889320373535, "learning_rate": 0.00010071864271337478, "loss": 14.0765, "step": 84 }, { "epoch": 1.2897107633949738, "grad_norm": 11.76871109008789, "learning_rate": 9.708718701763577e-05, "loss": 12.6621, "step": 85 }, { "epoch": 1.3048838311996207, "grad_norm": 12.476866722106934, "learning_rate": 9.34908159547222e-05, "loss": 12.6543, "step": 86 }, { "epoch": 1.3200568990042674, "grad_norm": 9.014403343200684, "learning_rate": 8.993191415231e-05, "loss": 10.5078, "step": 87 }, { "epoch": 1.3352299668089143, "grad_norm": 8.82985782623291, "learning_rate": 8.641284139352091e-05, "loss": 10.1313, "step": 88 }, { "epoch": 1.350403034613561, "grad_norm": 9.604599952697754, "learning_rate": 8.293593105223287e-05, "loss": 10.9247, "step": 89 }, { "epoch": 1.3655761024182076, "grad_norm": 8.465795516967773, "learning_rate": 7.950348854590204e-05, "loss": 9.671, "step": 90 }, { "epoch": 1.3807491702228545, "grad_norm": 8.575054168701172, "learning_rate": 7.611778980692035e-05, "loss": 10.2636, "step": 91 }, { "epoch": 1.3959222380275011, "grad_norm": 7.928877830505371, "learning_rate": 7.278107977352543e-05, "loss": 8.982, "step": 92 }, { "epoch": 1.411095305832148, "grad_norm": 9.111271858215332, "learning_rate": 6.949557090125994e-05, "loss": 9.8091, "step": 93 }, { "epoch": 1.4262683736367947, "grad_norm": 8.69306468963623, "learning_rate": 6.626344169597031e-05, "loss": 9.7754, "step": 94 }, { "epoch": 1.4414414414414414, "grad_norm": 8.90804386138916, "learning_rate": 6.308683526931545e-05, "loss": 9.7669, "step": 95 }, { "epoch": 1.4566145092460883, "grad_norm": 9.999079704284668, "learning_rate": 5.996785791774478e-05, "loss": 9.0066, "step": 96 }, { "epoch": 1.471787577050735, "grad_norm": 9.848721504211426, "learning_rate": 5.690857772588657e-05, "loss": 8.0642, "step": 97 }, { "epoch": 1.4869606448553818, "grad_norm": 10.121479988098145, "learning_rate": 5.391102319527373e-05, "loss": 8.1109, "step": 98 }, { "epoch": 1.5021337126600285, "grad_norm": 11.15321159362793, "learning_rate": 5.0977181899315214e-05, "loss": 15.3952, "step": 99 }, { "epoch": 1.5173067804646752, "grad_norm": 10.735111236572266, "learning_rate": 4.8108999165406026e-05, "loss": 14.3337, "step": 100 }, { "epoch": 1.5173067804646752, "eval_loss": 0.39477089047431946, "eval_runtime": 4.2024, "eval_samples_per_second": 11.898, "eval_steps_per_second": 11.898, "step": 100 } ], "logging_steps": 1, "max_steps": 132, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.391015187972096e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }