|
{ |
|
"best_metric": 0.39477089047431946, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-100", |
|
"epoch": 2.002844950213371, |
|
"eval_steps": 25, |
|
"global_step": 132, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015173067804646752, |
|
"grad_norm": 17.388423919677734, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 30.457, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.015173067804646752, |
|
"eval_loss": 0.8401981592178345, |
|
"eval_runtime": 4.1216, |
|
"eval_samples_per_second": 12.131, |
|
"eval_steps_per_second": 12.131, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.030346135609293504, |
|
"grad_norm": 10.178067207336426, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 30.335, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04551920341394026, |
|
"grad_norm": 10.258736610412598, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 28.3123, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.06069227121858701, |
|
"grad_norm": 10.296838760375977, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 27.9287, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07586533902323377, |
|
"grad_norm": 12.867237091064453, |
|
"learning_rate": 0.00015, |
|
"loss": 25.3598, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09103840682788052, |
|
"grad_norm": 12.431685447692871, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 22.2146, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.10621147463252727, |
|
"grad_norm": 13.944931983947754, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 19.7453, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.12138454243717402, |
|
"grad_norm": 23.274085998535156, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 19.6562, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.13655761024182078, |
|
"grad_norm": 23.921234130859375, |
|
"learning_rate": 0.00027, |
|
"loss": 15.7789, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.15173067804646753, |
|
"grad_norm": 31.0799560546875, |
|
"learning_rate": 0.0003, |
|
"loss": 18.6639, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16690374585111428, |
|
"grad_norm": 27.46465301513672, |
|
"learning_rate": 0.00029995027012714694, |
|
"loss": 15.0467, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.18207681365576103, |
|
"grad_norm": 21.228845596313477, |
|
"learning_rate": 0.00029980111348272456, |
|
"loss": 16.0005, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.19724988146040778, |
|
"grad_norm": 17.686410903930664, |
|
"learning_rate": 0.00029955262896727894, |
|
"loss": 15.2921, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.21242294926505453, |
|
"grad_norm": 16.29988670349121, |
|
"learning_rate": 0.00029920498134218835, |
|
"loss": 14.4487, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.22759601706970128, |
|
"grad_norm": 18.719757080078125, |
|
"learning_rate": 0.0002987584011204152, |
|
"loss": 12.782, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.24276908487434803, |
|
"grad_norm": 19.07723617553711, |
|
"learning_rate": 0.0002982131844136615, |
|
"loss": 12.9955, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2579421526789948, |
|
"grad_norm": 45.31391143798828, |
|
"learning_rate": 0.0002975696927360274, |
|
"loss": 23.8883, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.27311522048364156, |
|
"grad_norm": 27.076311111450195, |
|
"learning_rate": 0.0002968283527643036, |
|
"loss": 21.5994, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2882882882882883, |
|
"grad_norm": 15.616254806518555, |
|
"learning_rate": 0.00029598965605505737, |
|
"loss": 18.592, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.30346135609293506, |
|
"grad_norm": 18.812135696411133, |
|
"learning_rate": 0.000295054158718698, |
|
"loss": 16.3996, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3186344238975818, |
|
"grad_norm": 14.7747163772583, |
|
"learning_rate": 0.0002940224810507402, |
|
"loss": 16.5481, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.33380749170222856, |
|
"grad_norm": 13.129542350769043, |
|
"learning_rate": 0.00029289530712050735, |
|
"loss": 14.6397, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.3489805595068753, |
|
"grad_norm": 13.189936637878418, |
|
"learning_rate": 0.0002916733843175492, |
|
"loss": 14.889, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.36415362731152207, |
|
"grad_norm": 11.230732917785645, |
|
"learning_rate": 0.000290357522856074, |
|
"loss": 13.3495, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.3793266951161688, |
|
"grad_norm": 10.45602798461914, |
|
"learning_rate": 0.0002889485952377242, |
|
"loss": 13.97, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3793266951161688, |
|
"eval_loss": 0.4615178406238556, |
|
"eval_runtime": 4.1945, |
|
"eval_samples_per_second": 11.92, |
|
"eval_steps_per_second": 11.92, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.39449976292081557, |
|
"grad_norm": 9.400847434997559, |
|
"learning_rate": 0.0002874475356730507, |
|
"loss": 13.0953, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.4096728307254623, |
|
"grad_norm": 10.215126037597656, |
|
"learning_rate": 0.0002858553394620707, |
|
"loss": 14.0127, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.42484589853010907, |
|
"grad_norm": 8.064435958862305, |
|
"learning_rate": 0.0002841730623343193, |
|
"loss": 12.4928, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.4400189663347558, |
|
"grad_norm": 10.052416801452637, |
|
"learning_rate": 0.00028240181974883207, |
|
"loss": 14.195, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.45519203413940257, |
|
"grad_norm": 9.893036842346191, |
|
"learning_rate": 0.00028054278615452326, |
|
"loss": 12.9534, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4703651019440493, |
|
"grad_norm": 9.579009056091309, |
|
"learning_rate": 0.0002785971942114498, |
|
"loss": 11.9894, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.48553816974869607, |
|
"grad_norm": 10.615213394165039, |
|
"learning_rate": 0.0002765663339734778, |
|
"loss": 11.5054, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5007112375533428, |
|
"grad_norm": 18.542469024658203, |
|
"learning_rate": 0.0002744515520328928, |
|
"loss": 17.3979, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5158843053579896, |
|
"grad_norm": 15.90185260772705, |
|
"learning_rate": 0.00027225425062752165, |
|
"loss": 16.0495, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5310573731626363, |
|
"grad_norm": 12.792567253112793, |
|
"learning_rate": 0.0002699758867109579, |
|
"loss": 15.2643, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5462304409672831, |
|
"grad_norm": 9.845219612121582, |
|
"learning_rate": 0.0002676179709865066, |
|
"loss": 13.5142, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5614035087719298, |
|
"grad_norm": 12.22781753540039, |
|
"learning_rate": 0.00026518206690549, |
|
"loss": 14.2178, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5765765765765766, |
|
"grad_norm": 10.745471954345703, |
|
"learning_rate": 0.0002626697896305779, |
|
"loss": 12.9431, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5917496443812233, |
|
"grad_norm": 10.67679500579834, |
|
"learning_rate": 0.00026008280496482984, |
|
"loss": 15.1994, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6069227121858701, |
|
"grad_norm": 8.882881164550781, |
|
"learning_rate": 0.000257422828247159, |
|
"loss": 14.0005, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6220957799905168, |
|
"grad_norm": 9.826411247253418, |
|
"learning_rate": 0.00025469162321495147, |
|
"loss": 12.5264, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6372688477951636, |
|
"grad_norm": 8.515768051147461, |
|
"learning_rate": 0.00025189100083459397, |
|
"loss": 11.0442, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6524419155998104, |
|
"grad_norm": 9.0559720993042, |
|
"learning_rate": 0.00024902281810068475, |
|
"loss": 11.7402, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6676149834044571, |
|
"grad_norm": 9.556861877441406, |
|
"learning_rate": 0.0002460889768047263, |
|
"loss": 12.4502, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6827880512091038, |
|
"grad_norm": 8.837124824523926, |
|
"learning_rate": 0.0002430914222741134, |
|
"loss": 12.4087, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6979611190137506, |
|
"grad_norm": 15.878302574157715, |
|
"learning_rate": 0.00024003214208225522, |
|
"loss": 11.0582, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7131341868183974, |
|
"grad_norm": 9.87769603729248, |
|
"learning_rate": 0.00023691316473068452, |
|
"loss": 11.3891, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7283072546230441, |
|
"grad_norm": 8.35490608215332, |
|
"learning_rate": 0.00023373655830402968, |
|
"loss": 9.9121, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7434803224276908, |
|
"grad_norm": 17.935434341430664, |
|
"learning_rate": 0.00023050442909874007, |
|
"loss": 16.9965, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.7586533902323376, |
|
"grad_norm": 15.285179138183594, |
|
"learning_rate": 0.00022721892022647462, |
|
"loss": 16.4137, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7586533902323376, |
|
"eval_loss": 0.4162628650665283, |
|
"eval_runtime": 4.1979, |
|
"eval_samples_per_second": 11.911, |
|
"eval_steps_per_second": 11.911, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7738264580369844, |
|
"grad_norm": 11.381631851196289, |
|
"learning_rate": 0.00022388221019307967, |
|
"loss": 14.1246, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7889995258416311, |
|
"grad_norm": 10.287455558776855, |
|
"learning_rate": 0.000220496511454098, |
|
"loss": 13.1531, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.8041725936462779, |
|
"grad_norm": 10.540827751159668, |
|
"learning_rate": 0.00021706406894776709, |
|
"loss": 12.6342, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.8193456614509246, |
|
"grad_norm": 10.495640754699707, |
|
"learning_rate": 0.0002135871586064791, |
|
"loss": 13.6827, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8345187292555714, |
|
"grad_norm": 12.297253608703613, |
|
"learning_rate": 0.00021006808584768998, |
|
"loss": 13.6184, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8496917970602181, |
|
"grad_norm": 10.440007209777832, |
|
"learning_rate": 0.00020650918404527775, |
|
"loss": 12.9478, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"grad_norm": 10.427011489868164, |
|
"learning_rate": 0.00020291281298236423, |
|
"loss": 12.4868, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8800379326695116, |
|
"grad_norm": 9.420439720153809, |
|
"learning_rate": 0.00019928135728662522, |
|
"loss": 13.8967, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.8952110004741584, |
|
"grad_norm": 10.802536964416504, |
|
"learning_rate": 0.0001956172248491277, |
|
"loss": 14.1609, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.9103840682788051, |
|
"grad_norm": 8.980121612548828, |
|
"learning_rate": 0.00019192284522774142, |
|
"loss": 13.0885, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9255571360834519, |
|
"grad_norm": 8.586287498474121, |
|
"learning_rate": 0.00018820066803618428, |
|
"loss": 10.7247, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.9407302038880986, |
|
"grad_norm": 10.254575729370117, |
|
"learning_rate": 0.00018445316131976934, |
|
"loss": 12.6883, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.9559032716927454, |
|
"grad_norm": 10.308297157287598, |
|
"learning_rate": 0.00018068280991893014, |
|
"loss": 10.7828, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.9710763394973921, |
|
"grad_norm": 10.438807487487793, |
|
"learning_rate": 0.00017689211382161034, |
|
"loss": 9.4578, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.9862494073020389, |
|
"grad_norm": 16.487628936767578, |
|
"learning_rate": 0.00017308358650560928, |
|
"loss": 13.0305, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.0014224751066856, |
|
"grad_norm": 8.836454391479492, |
|
"learning_rate": 0.00016925975327198266, |
|
"loss": 12.8189, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.0165955429113325, |
|
"grad_norm": 10.768722534179688, |
|
"learning_rate": 0.00016542314957060405, |
|
"loss": 14.4475, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.0317686107159791, |
|
"grad_norm": 8.652929306030273, |
|
"learning_rate": 0.00016157631931899697, |
|
"loss": 15.0635, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.0469416785206258, |
|
"grad_norm": 7.712925910949707, |
|
"learning_rate": 0.00015772181321555196, |
|
"loss": 12.3852, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.0621147463252727, |
|
"grad_norm": 9.161529541015625, |
|
"learning_rate": 0.0001538621870482483, |
|
"loss": 12.3397, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0772878141299194, |
|
"grad_norm": 8.765726089477539, |
|
"learning_rate": 0.00015, |
|
"loss": 11.8658, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.0924608819345663, |
|
"grad_norm": 8.678772926330566, |
|
"learning_rate": 0.00014613781295175172, |
|
"loss": 11.8749, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.107633949739213, |
|
"grad_norm": 8.968255996704102, |
|
"learning_rate": 0.000142278186784448, |
|
"loss": 11.2309, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.1228070175438596, |
|
"grad_norm": 8.212029457092285, |
|
"learning_rate": 0.00013842368068100303, |
|
"loss": 9.0183, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.1379800853485065, |
|
"grad_norm": 7.790862560272217, |
|
"learning_rate": 0.00013457685042939592, |
|
"loss": 10.4468, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.1379800853485065, |
|
"eval_loss": 0.39976993203163147, |
|
"eval_runtime": 4.1988, |
|
"eval_samples_per_second": 11.908, |
|
"eval_steps_per_second": 11.908, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.1531531531531531, |
|
"grad_norm": 7.495136260986328, |
|
"learning_rate": 0.00013074024672801731, |
|
"loss": 9.8966, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.1683262209577998, |
|
"grad_norm": 8.63007640838623, |
|
"learning_rate": 0.0001269164134943907, |
|
"loss": 9.9346, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.1834992887624467, |
|
"grad_norm": 9.116193771362305, |
|
"learning_rate": 0.00012310788617838966, |
|
"loss": 11.3677, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.1986723565670934, |
|
"grad_norm": 9.81653881072998, |
|
"learning_rate": 0.0001193171900810699, |
|
"loss": 10.2669, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.2138454243717403, |
|
"grad_norm": 10.604126930236816, |
|
"learning_rate": 0.00011554683868023067, |
|
"loss": 10.7322, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.229018492176387, |
|
"grad_norm": 10.463966369628906, |
|
"learning_rate": 0.0001117993319638157, |
|
"loss": 9.2057, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.2441915599810336, |
|
"grad_norm": 9.727025985717773, |
|
"learning_rate": 0.00010807715477225858, |
|
"loss": 8.6479, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.2593646277856805, |
|
"grad_norm": 13.668055534362793, |
|
"learning_rate": 0.00010438277515087233, |
|
"loss": 14.4636, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.2745376955903271, |
|
"grad_norm": 11.730889320373535, |
|
"learning_rate": 0.00010071864271337478, |
|
"loss": 14.0765, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.2897107633949738, |
|
"grad_norm": 11.76871109008789, |
|
"learning_rate": 9.708718701763577e-05, |
|
"loss": 12.6621, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.3048838311996207, |
|
"grad_norm": 12.476866722106934, |
|
"learning_rate": 9.34908159547222e-05, |
|
"loss": 12.6543, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.3200568990042674, |
|
"grad_norm": 9.014403343200684, |
|
"learning_rate": 8.993191415231e-05, |
|
"loss": 10.5078, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.3352299668089143, |
|
"grad_norm": 8.82985782623291, |
|
"learning_rate": 8.641284139352091e-05, |
|
"loss": 10.1313, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.350403034613561, |
|
"grad_norm": 9.604599952697754, |
|
"learning_rate": 8.293593105223287e-05, |
|
"loss": 10.9247, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.3655761024182076, |
|
"grad_norm": 8.465795516967773, |
|
"learning_rate": 7.950348854590204e-05, |
|
"loss": 9.671, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.3807491702228545, |
|
"grad_norm": 8.575054168701172, |
|
"learning_rate": 7.611778980692035e-05, |
|
"loss": 10.2636, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.3959222380275011, |
|
"grad_norm": 7.928877830505371, |
|
"learning_rate": 7.278107977352543e-05, |
|
"loss": 8.982, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.411095305832148, |
|
"grad_norm": 9.111271858215332, |
|
"learning_rate": 6.949557090125994e-05, |
|
"loss": 9.8091, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.4262683736367947, |
|
"grad_norm": 8.69306468963623, |
|
"learning_rate": 6.626344169597031e-05, |
|
"loss": 9.7754, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.4414414414414414, |
|
"grad_norm": 8.90804386138916, |
|
"learning_rate": 6.308683526931545e-05, |
|
"loss": 9.7669, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.4566145092460883, |
|
"grad_norm": 9.999079704284668, |
|
"learning_rate": 5.996785791774478e-05, |
|
"loss": 9.0066, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.471787577050735, |
|
"grad_norm": 9.848721504211426, |
|
"learning_rate": 5.690857772588657e-05, |
|
"loss": 8.0642, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.4869606448553818, |
|
"grad_norm": 10.121479988098145, |
|
"learning_rate": 5.391102319527373e-05, |
|
"loss": 8.1109, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.5021337126600285, |
|
"grad_norm": 11.15321159362793, |
|
"learning_rate": 5.0977181899315214e-05, |
|
"loss": 15.3952, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.5173067804646752, |
|
"grad_norm": 10.735111236572266, |
|
"learning_rate": 4.8108999165406026e-05, |
|
"loss": 14.3337, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5173067804646752, |
|
"eval_loss": 0.39477089047431946, |
|
"eval_runtime": 4.2024, |
|
"eval_samples_per_second": 11.898, |
|
"eval_steps_per_second": 11.898, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.5324798482693218, |
|
"grad_norm": 8.945897102355957, |
|
"learning_rate": 4.5308376785048434e-05, |
|
"loss": 11.1062, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.5476529160739687, |
|
"grad_norm": 9.844161987304688, |
|
"learning_rate": 4.257717175284103e-05, |
|
"loss": 10.4609, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.5628259838786156, |
|
"grad_norm": 9.294745445251465, |
|
"learning_rate": 3.991719503517014e-05, |
|
"loss": 10.6322, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.5779990516832623, |
|
"grad_norm": 9.901968002319336, |
|
"learning_rate": 3.733021036942205e-05, |
|
"loss": 11.3513, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.593172119487909, |
|
"grad_norm": 9.366765975952148, |
|
"learning_rate": 3.481793309451e-05, |
|
"loss": 10.2632, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.6083451872925556, |
|
"grad_norm": 8.39588451385498, |
|
"learning_rate": 3.238202901349345e-05, |
|
"loss": 10.2529, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.6235182550972025, |
|
"grad_norm": 8.285764694213867, |
|
"learning_rate": 3.0024113289042094e-05, |
|
"loss": 9.9579, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.6386913229018494, |
|
"grad_norm": 8.389649391174316, |
|
"learning_rate": 2.774574937247831e-05, |
|
"loss": 10.3428, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.653864390706496, |
|
"grad_norm": 8.863457679748535, |
|
"learning_rate": 2.554844796710716e-05, |
|
"loss": 9.0607, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.6690374585111427, |
|
"grad_norm": 8.510146141052246, |
|
"learning_rate": 2.3433666026522153e-05, |
|
"loss": 9.3396, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.6842105263157894, |
|
"grad_norm": 9.09020709991455, |
|
"learning_rate": 2.1402805788550138e-05, |
|
"loss": 9.223, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.6993835941204363, |
|
"grad_norm": 8.539032936096191, |
|
"learning_rate": 1.945721384547671e-05, |
|
"loss": 7.2793, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.714556661925083, |
|
"grad_norm": 9.459700584411621, |
|
"learning_rate": 1.759818025116787e-05, |
|
"loss": 7.6996, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.7297297297297298, |
|
"grad_norm": 11.580260276794434, |
|
"learning_rate": 1.5826937665680693e-05, |
|
"loss": 7.68, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.7449027975343765, |
|
"grad_norm": 8.748165130615234, |
|
"learning_rate": 1.4144660537929287e-05, |
|
"loss": 13.0615, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.7600758653390232, |
|
"grad_norm": 9.052087783813477, |
|
"learning_rate": 1.2552464326949302e-05, |
|
"loss": 13.9056, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.7752489331436698, |
|
"grad_norm": 9.413413047790527, |
|
"learning_rate": 1.105140476227575e-05, |
|
"loss": 13.1337, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.7904220009483167, |
|
"grad_norm": 8.633859634399414, |
|
"learning_rate": 9.64247714392597e-06, |
|
"loss": 11.1511, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.8055950687529636, |
|
"grad_norm": 7.859529972076416, |
|
"learning_rate": 8.32661568245081e-06, |
|
"loss": 10.06, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.8207681365576103, |
|
"grad_norm": 8.01181697845459, |
|
"learning_rate": 7.104692879492624e-06, |
|
"loss": 10.4743, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.835941204362257, |
|
"grad_norm": 9.660663604736328, |
|
"learning_rate": 5.977518949259735e-06, |
|
"loss": 11.0629, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.8511142721669036, |
|
"grad_norm": 8.536215782165527, |
|
"learning_rate": 4.945841281301943e-06, |
|
"loss": 10.3549, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.8662873399715505, |
|
"grad_norm": 8.264043807983398, |
|
"learning_rate": 4.010343944942618e-06, |
|
"loss": 9.8312, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.8814604077761974, |
|
"grad_norm": 8.763490676879883, |
|
"learning_rate": 3.1716472356963286e-06, |
|
"loss": 8.8429, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.896633475580844, |
|
"grad_norm": 9.253498077392578, |
|
"learning_rate": 2.430307263972547e-06, |
|
"loss": 10.0951, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.896633475580844, |
|
"eval_loss": 0.3835863471031189, |
|
"eval_runtime": 4.1958, |
|
"eval_samples_per_second": 11.917, |
|
"eval_steps_per_second": 11.917, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.9118065433854907, |
|
"grad_norm": 8.449151039123535, |
|
"learning_rate": 1.7868155863384415e-06, |
|
"loss": 9.1299, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.9269796111901374, |
|
"grad_norm": 8.359270095825195, |
|
"learning_rate": 1.2415988795847765e-06, |
|
"loss": 9.0518, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.9421526789947843, |
|
"grad_norm": 8.54788589477539, |
|
"learning_rate": 7.950186578116413e-07, |
|
"loss": 7.7242, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.9573257467994312, |
|
"grad_norm": 8.631213188171387, |
|
"learning_rate": 4.473710327209945e-07, |
|
"loss": 6.4852, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.9724988146040778, |
|
"grad_norm": 8.945975303649902, |
|
"learning_rate": 1.988865172754206e-07, |
|
"loss": 7.753, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.9876718824087245, |
|
"grad_norm": 8.31573486328125, |
|
"learning_rate": 4.972987285304375e-08, |
|
"loss": 11.2559, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.002844950213371, |
|
"grad_norm": 8.791767120361328, |
|
"learning_rate": 0.0, |
|
"loss": 9.4748, |
|
"step": 132 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 132, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 1, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.8361400481231667e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|