nemo-12b-tools / trainer_state.json
taozi555's picture
Upload folder using huggingface_hub
4d7dba8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9905341446923597,
"eval_steps": 46,
"global_step": 368,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005409060175794456,
"grad_norm": 44.9358828224533,
"learning_rate": 2.0000000000000002e-07,
"loss": 3.0416,
"step": 1
},
{
"epoch": 0.005409060175794456,
"eval_loss": 3.2299506664276123,
"eval_runtime": 80.3263,
"eval_samples_per_second": 15.512,
"eval_steps_per_second": 1.942,
"step": 1
},
{
"epoch": 0.010818120351588911,
"grad_norm": 43.82162868015869,
"learning_rate": 4.0000000000000003e-07,
"loss": 3.0979,
"step": 2
},
{
"epoch": 0.016227180527383367,
"grad_norm": 49.66354655578431,
"learning_rate": 6.000000000000001e-07,
"loss": 3.229,
"step": 3
},
{
"epoch": 0.021636240703177823,
"grad_norm": 44.90350470119676,
"learning_rate": 8.000000000000001e-07,
"loss": 3.1823,
"step": 4
},
{
"epoch": 0.027045300878972278,
"grad_norm": 45.64225015268925,
"learning_rate": 1.0000000000000002e-06,
"loss": 3.1503,
"step": 5
},
{
"epoch": 0.032454361054766734,
"grad_norm": 43.46015056839171,
"learning_rate": 1.2000000000000002e-06,
"loss": 2.9786,
"step": 6
},
{
"epoch": 0.03786342123056119,
"grad_norm": 39.8236071706638,
"learning_rate": 1.4000000000000001e-06,
"loss": 2.7861,
"step": 7
},
{
"epoch": 0.043272481406355645,
"grad_norm": 18.849036433644883,
"learning_rate": 1.6000000000000001e-06,
"loss": 2.6546,
"step": 8
},
{
"epoch": 0.0486815415821501,
"grad_norm": 18.842114481944215,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.685,
"step": 9
},
{
"epoch": 0.054090601757944556,
"grad_norm": 19.369280399976688,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.6052,
"step": 10
},
{
"epoch": 0.05949966193373901,
"grad_norm": 16.167411037120388,
"learning_rate": 2.2e-06,
"loss": 2.4654,
"step": 11
},
{
"epoch": 0.06490872210953347,
"grad_norm": 14.946675935925768,
"learning_rate": 2.4000000000000003e-06,
"loss": 2.4427,
"step": 12
},
{
"epoch": 0.07031778228532792,
"grad_norm": 13.586548911059968,
"learning_rate": 2.6e-06,
"loss": 2.5375,
"step": 13
},
{
"epoch": 0.07572684246112238,
"grad_norm": 9.968943698698908,
"learning_rate": 2.8000000000000003e-06,
"loss": 2.316,
"step": 14
},
{
"epoch": 0.08113590263691683,
"grad_norm": 10.416045938582544,
"learning_rate": 3e-06,
"loss": 2.4094,
"step": 15
},
{
"epoch": 0.08654496281271129,
"grad_norm": 9.298587355474794,
"learning_rate": 3.2000000000000003e-06,
"loss": 2.3988,
"step": 16
},
{
"epoch": 0.09195402298850575,
"grad_norm": 8.609391957088402,
"learning_rate": 3.4000000000000005e-06,
"loss": 2.3259,
"step": 17
},
{
"epoch": 0.0973630831643002,
"grad_norm": 7.68261305571874,
"learning_rate": 3.6000000000000003e-06,
"loss": 2.2034,
"step": 18
},
{
"epoch": 0.10277214334009466,
"grad_norm": 8.199455018530278,
"learning_rate": 3.8000000000000005e-06,
"loss": 2.1983,
"step": 19
},
{
"epoch": 0.10818120351588911,
"grad_norm": 7.229515373269984,
"learning_rate": 4.000000000000001e-06,
"loss": 2.3398,
"step": 20
},
{
"epoch": 0.11359026369168357,
"grad_norm": 8.94152448817592,
"learning_rate": 4.2000000000000004e-06,
"loss": 2.2895,
"step": 21
},
{
"epoch": 0.11899932386747802,
"grad_norm": 7.678296028460811,
"learning_rate": 4.4e-06,
"loss": 2.2891,
"step": 22
},
{
"epoch": 0.12440838404327248,
"grad_norm": 8.007524747470107,
"learning_rate": 4.600000000000001e-06,
"loss": 2.2389,
"step": 23
},
{
"epoch": 0.12981744421906694,
"grad_norm": 7.352151792895566,
"learning_rate": 4.800000000000001e-06,
"loss": 2.2593,
"step": 24
},
{
"epoch": 0.1352265043948614,
"grad_norm": 7.103281407665676,
"learning_rate": 5e-06,
"loss": 2.1577,
"step": 25
},
{
"epoch": 0.14063556457065585,
"grad_norm": 6.919318449883582,
"learning_rate": 4.999955579111413e-06,
"loss": 2.2622,
"step": 26
},
{
"epoch": 0.1460446247464503,
"grad_norm": 7.072321363589398,
"learning_rate": 4.999822318024222e-06,
"loss": 2.1805,
"step": 27
},
{
"epoch": 0.15145368492224476,
"grad_norm": 8.556428262133437,
"learning_rate": 4.999600221474089e-06,
"loss": 2.2209,
"step": 28
},
{
"epoch": 0.1568627450980392,
"grad_norm": 6.318203548443499,
"learning_rate": 4.999289297353593e-06,
"loss": 2.1821,
"step": 29
},
{
"epoch": 0.16227180527383367,
"grad_norm": 7.959416954439196,
"learning_rate": 4.998889556711958e-06,
"loss": 2.1423,
"step": 30
},
{
"epoch": 0.16768086544962812,
"grad_norm": 7.493305387455565,
"learning_rate": 4.9984010137546475e-06,
"loss": 2.1499,
"step": 31
},
{
"epoch": 0.17308992562542258,
"grad_norm": 7.399157248009571,
"learning_rate": 4.997823685842875e-06,
"loss": 2.1654,
"step": 32
},
{
"epoch": 0.17849898580121704,
"grad_norm": 7.095225049901362,
"learning_rate": 4.997157593492974e-06,
"loss": 2.2062,
"step": 33
},
{
"epoch": 0.1839080459770115,
"grad_norm": 6.544205100152826,
"learning_rate": 4.996402760375676e-06,
"loss": 2.1302,
"step": 34
},
{
"epoch": 0.18931710615280595,
"grad_norm": 8.497617899265414,
"learning_rate": 4.995559213315267e-06,
"loss": 2.2452,
"step": 35
},
{
"epoch": 0.1947261663286004,
"grad_norm": 7.711468079457363,
"learning_rate": 4.9946269822886335e-06,
"loss": 2.1562,
"step": 36
},
{
"epoch": 0.20013522650439486,
"grad_norm": 6.828572393494306,
"learning_rate": 4.993606100424202e-06,
"loss": 2.0838,
"step": 37
},
{
"epoch": 0.20554428668018931,
"grad_norm": 7.829101006960679,
"learning_rate": 4.992496604000756e-06,
"loss": 2.1608,
"step": 38
},
{
"epoch": 0.21095334685598377,
"grad_norm": 7.0790174546853954,
"learning_rate": 4.991298532446149e-06,
"loss": 2.1385,
"step": 39
},
{
"epoch": 0.21636240703177823,
"grad_norm": 6.4583828883763825,
"learning_rate": 4.9900119283359025e-06,
"loss": 2.1094,
"step": 40
},
{
"epoch": 0.22177146720757268,
"grad_norm": 6.919640648587095,
"learning_rate": 4.988636837391696e-06,
"loss": 2.2184,
"step": 41
},
{
"epoch": 0.22718052738336714,
"grad_norm": 7.054119878110095,
"learning_rate": 4.987173308479738e-06,
"loss": 2.1584,
"step": 42
},
{
"epoch": 0.2325895875591616,
"grad_norm": 8.011842535646375,
"learning_rate": 4.985621393609032e-06,
"loss": 2.2022,
"step": 43
},
{
"epoch": 0.23799864773495605,
"grad_norm": 7.872258963246048,
"learning_rate": 4.98398114792953e-06,
"loss": 2.0779,
"step": 44
},
{
"epoch": 0.2434077079107505,
"grad_norm": 7.5559333876351085,
"learning_rate": 4.982252629730167e-06,
"loss": 2.0195,
"step": 45
},
{
"epoch": 0.24881676808654496,
"grad_norm": 7.7955956497680985,
"learning_rate": 4.980435900436793e-06,
"loss": 2.0475,
"step": 46
},
{
"epoch": 0.24881676808654496,
"eval_loss": 2.1143910884857178,
"eval_runtime": 80.1839,
"eval_samples_per_second": 15.539,
"eval_steps_per_second": 1.946,
"step": 46
},
{
"epoch": 0.2542258282623394,
"grad_norm": 6.802229063375309,
"learning_rate": 4.978531024609994e-06,
"loss": 2.0305,
"step": 47
},
{
"epoch": 0.25963488843813387,
"grad_norm": 7.592805462348185,
"learning_rate": 4.9765380699427905e-06,
"loss": 2.0484,
"step": 48
},
{
"epoch": 0.2650439486139283,
"grad_norm": 7.243742353954717,
"learning_rate": 4.9744571072582365e-06,
"loss": 2.1142,
"step": 49
},
{
"epoch": 0.2704530087897228,
"grad_norm": 7.924276414192629,
"learning_rate": 4.972288210506902e-06,
"loss": 2.0623,
"step": 50
},
{
"epoch": 0.27586206896551724,
"grad_norm": 7.407311330367973,
"learning_rate": 4.970031456764242e-06,
"loss": 2.032,
"step": 51
},
{
"epoch": 0.2812711291413117,
"grad_norm": 7.756849919749466,
"learning_rate": 4.967686926227862e-06,
"loss": 1.9791,
"step": 52
},
{
"epoch": 0.28668018931710615,
"grad_norm": 6.01269466571588,
"learning_rate": 4.965254702214668e-06,
"loss": 2.0046,
"step": 53
},
{
"epoch": 0.2920892494929006,
"grad_norm": 6.9870832241271685,
"learning_rate": 4.9627348711578996e-06,
"loss": 2.1095,
"step": 54
},
{
"epoch": 0.29749830966869506,
"grad_norm": 9.083794116875618,
"learning_rate": 4.960127522604065e-06,
"loss": 2.0736,
"step": 55
},
{
"epoch": 0.3029073698444895,
"grad_norm": 7.096143715762253,
"learning_rate": 4.957432749209755e-06,
"loss": 1.9896,
"step": 56
},
{
"epoch": 0.30831643002028397,
"grad_norm": 5.7983259418594715,
"learning_rate": 4.954650646738354e-06,
"loss": 1.944,
"step": 57
},
{
"epoch": 0.3137254901960784,
"grad_norm": 6.984280635078145,
"learning_rate": 4.951781314056633e-06,
"loss": 2.0276,
"step": 58
},
{
"epoch": 0.3191345503718729,
"grad_norm": 6.88721515440217,
"learning_rate": 4.948824853131237e-06,
"loss": 2.0442,
"step": 59
},
{
"epoch": 0.32454361054766734,
"grad_norm": 7.821091549898941,
"learning_rate": 4.9457813690250635e-06,
"loss": 2.0072,
"step": 60
},
{
"epoch": 0.3299526707234618,
"grad_norm": 5.64425372355818,
"learning_rate": 4.942650969893527e-06,
"loss": 1.994,
"step": 61
},
{
"epoch": 0.33536173089925625,
"grad_norm": 6.2844987961209675,
"learning_rate": 4.939433766980717e-06,
"loss": 2.0615,
"step": 62
},
{
"epoch": 0.3407707910750507,
"grad_norm": 7.116345411158451,
"learning_rate": 4.936129874615443e-06,
"loss": 2.048,
"step": 63
},
{
"epoch": 0.34617985125084516,
"grad_norm": 6.7059981437853375,
"learning_rate": 4.932739410207172e-06,
"loss": 1.899,
"step": 64
},
{
"epoch": 0.3515889114266396,
"grad_norm": 6.681993008670398,
"learning_rate": 4.929262494241859e-06,
"loss": 1.9566,
"step": 65
},
{
"epoch": 0.35699797160243407,
"grad_norm": 6.496917926567058,
"learning_rate": 4.9256992502776605e-06,
"loss": 1.956,
"step": 66
},
{
"epoch": 0.36240703177822853,
"grad_norm": 6.708931079572326,
"learning_rate": 4.922049804940546e-06,
"loss": 1.9967,
"step": 67
},
{
"epoch": 0.367816091954023,
"grad_norm": 5.620040634218876,
"learning_rate": 4.9183142879198e-06,
"loss": 1.916,
"step": 68
},
{
"epoch": 0.37322515212981744,
"grad_norm": 7.850717902889795,
"learning_rate": 4.914492831963411e-06,
"loss": 1.941,
"step": 69
},
{
"epoch": 0.3786342123056119,
"grad_norm": 7.497086645607299,
"learning_rate": 4.910585572873355e-06,
"loss": 1.9286,
"step": 70
},
{
"epoch": 0.38404327248140635,
"grad_norm": 10.987140598022854,
"learning_rate": 4.906592649500767e-06,
"loss": 1.8511,
"step": 71
},
{
"epoch": 0.3894523326572008,
"grad_norm": 6.780385991870079,
"learning_rate": 4.902514203741013e-06,
"loss": 1.9953,
"step": 72
},
{
"epoch": 0.39486139283299526,
"grad_norm": 6.69524986070073,
"learning_rate": 4.898350380528638e-06,
"loss": 1.9524,
"step": 73
},
{
"epoch": 0.4002704530087897,
"grad_norm": 6.1699263921182474,
"learning_rate": 4.894101327832225e-06,
"loss": 1.8258,
"step": 74
},
{
"epoch": 0.4056795131845842,
"grad_norm": 6.732182374500395,
"learning_rate": 4.8897671966491315e-06,
"loss": 1.9725,
"step": 75
},
{
"epoch": 0.41108857336037863,
"grad_norm": 6.598471225386019,
"learning_rate": 4.8853481410001225e-06,
"loss": 1.8993,
"step": 76
},
{
"epoch": 0.4164976335361731,
"grad_norm": 6.93174108718265,
"learning_rate": 4.8808443179239025e-06,
"loss": 1.9447,
"step": 77
},
{
"epoch": 0.42190669371196754,
"grad_norm": 6.726165747858891,
"learning_rate": 4.87625588747153e-06,
"loss": 1.9723,
"step": 78
},
{
"epoch": 0.427315753887762,
"grad_norm": 5.798399986576307,
"learning_rate": 4.87158301270073e-06,
"loss": 1.9677,
"step": 79
},
{
"epoch": 0.43272481406355645,
"grad_norm": 6.420619711799955,
"learning_rate": 4.8668258596701035e-06,
"loss": 1.9731,
"step": 80
},
{
"epoch": 0.4381338742393509,
"grad_norm": 8.374770038931102,
"learning_rate": 4.861984597433223e-06,
"loss": 1.9635,
"step": 81
},
{
"epoch": 0.44354293441514536,
"grad_norm": 6.2592251917152595,
"learning_rate": 4.857059398032622e-06,
"loss": 1.9228,
"step": 82
},
{
"epoch": 0.4489519945909398,
"grad_norm": 6.759714436705178,
"learning_rate": 4.85205043649369e-06,
"loss": 1.9456,
"step": 83
},
{
"epoch": 0.4543610547667343,
"grad_norm": 7.0963936709641615,
"learning_rate": 4.846957890818444e-06,
"loss": 1.9217,
"step": 84
},
{
"epoch": 0.45977011494252873,
"grad_norm": 6.941342449698633,
"learning_rate": 4.841781941979207e-06,
"loss": 1.7469,
"step": 85
},
{
"epoch": 0.4651791751183232,
"grad_norm": 7.445994149780175,
"learning_rate": 4.836522773912178e-06,
"loss": 1.9254,
"step": 86
},
{
"epoch": 0.47058823529411764,
"grad_norm": 6.588581579766995,
"learning_rate": 4.83118057351089e-06,
"loss": 1.8967,
"step": 87
},
{
"epoch": 0.4759972954699121,
"grad_norm": 7.117087421099954,
"learning_rate": 4.825755530619576e-06,
"loss": 1.8865,
"step": 88
},
{
"epoch": 0.48140635564570655,
"grad_norm": 6.724049881659424,
"learning_rate": 4.820247838026414e-06,
"loss": 1.9435,
"step": 89
},
{
"epoch": 0.486815415821501,
"grad_norm": 5.6973534950025595,
"learning_rate": 4.814657691456685e-06,
"loss": 1.7882,
"step": 90
},
{
"epoch": 0.49222447599729546,
"grad_norm": 6.388481680244839,
"learning_rate": 4.808985289565813e-06,
"loss": 1.8173,
"step": 91
},
{
"epoch": 0.4976335361730899,
"grad_norm": 6.3977034813469835,
"learning_rate": 4.803230833932302e-06,
"loss": 1.8489,
"step": 92
},
{
"epoch": 0.4976335361730899,
"eval_loss": 1.89235520362854,
"eval_runtime": 80.2557,
"eval_samples_per_second": 15.525,
"eval_steps_per_second": 1.944,
"step": 92
},
{
"epoch": 0.5030425963488844,
"grad_norm": 7.046151003888674,
"learning_rate": 4.797394529050577e-06,
"loss": 1.9349,
"step": 93
},
{
"epoch": 0.5084516565246788,
"grad_norm": 6.590771586445206,
"learning_rate": 4.791476582323719e-06,
"loss": 1.9962,
"step": 94
},
{
"epoch": 0.5138607167004733,
"grad_norm": 7.166866163450392,
"learning_rate": 4.785477204056089e-06,
"loss": 1.8174,
"step": 95
},
{
"epoch": 0.5192697768762677,
"grad_norm": 6.716770374519456,
"learning_rate": 4.779396607445858e-06,
"loss": 1.9214,
"step": 96
},
{
"epoch": 0.5246788370520622,
"grad_norm": 7.244507141710972,
"learning_rate": 4.77323500857743e-06,
"loss": 1.8681,
"step": 97
},
{
"epoch": 0.5300878972278567,
"grad_norm": 6.3217358598893165,
"learning_rate": 4.7669926264137625e-06,
"loss": 1.8825,
"step": 98
},
{
"epoch": 0.5354969574036511,
"grad_norm": 5.77085923138114,
"learning_rate": 4.760669682788584e-06,
"loss": 1.7838,
"step": 99
},
{
"epoch": 0.5409060175794456,
"grad_norm": 5.638989379158351,
"learning_rate": 4.754266402398517e-06,
"loss": 1.7818,
"step": 100
},
{
"epoch": 0.54631507775524,
"grad_norm": 6.283880096623512,
"learning_rate": 4.747783012795083e-06,
"loss": 1.7555,
"step": 101
},
{
"epoch": 0.5517241379310345,
"grad_norm": 7.08060002757244,
"learning_rate": 4.741219744376624e-06,
"loss": 1.8585,
"step": 102
},
{
"epoch": 0.5571331981068289,
"grad_norm": 7.077388397562155,
"learning_rate": 4.734576830380113e-06,
"loss": 1.7837,
"step": 103
},
{
"epoch": 0.5625422582826234,
"grad_norm": 6.093542681362889,
"learning_rate": 4.727854506872863e-06,
"loss": 1.7841,
"step": 104
},
{
"epoch": 0.5679513184584178,
"grad_norm": 6.250665388589361,
"learning_rate": 4.721053012744142e-06,
"loss": 1.8065,
"step": 105
},
{
"epoch": 0.5733603786342123,
"grad_norm": 7.137442110609306,
"learning_rate": 4.71417258969668e-06,
"loss": 1.8022,
"step": 106
},
{
"epoch": 0.5787694388100068,
"grad_norm": 6.025245815328945,
"learning_rate": 4.70721348223808e-06,
"loss": 1.8129,
"step": 107
},
{
"epoch": 0.5841784989858012,
"grad_norm": 6.920748676536336,
"learning_rate": 4.700175937672134e-06,
"loss": 1.8498,
"step": 108
},
{
"epoch": 0.5895875591615957,
"grad_norm": 7.6481497850706415,
"learning_rate": 4.693060206090028e-06,
"loss": 1.7777,
"step": 109
},
{
"epoch": 0.5949966193373901,
"grad_norm": 7.666186710495507,
"learning_rate": 4.685866540361456e-06,
"loss": 1.834,
"step": 110
},
{
"epoch": 0.6004056795131846,
"grad_norm": 7.021331030038572,
"learning_rate": 4.678595196125638e-06,
"loss": 1.8563,
"step": 111
},
{
"epoch": 0.605814739688979,
"grad_norm": 6.487310124057107,
"learning_rate": 4.671246431782234e-06,
"loss": 1.7152,
"step": 112
},
{
"epoch": 0.6112237998647735,
"grad_norm": 6.492879712741564,
"learning_rate": 4.6638205084821544e-06,
"loss": 1.7396,
"step": 113
},
{
"epoch": 0.6166328600405679,
"grad_norm": 6.932406364697437,
"learning_rate": 4.656317690118291e-06,
"loss": 1.6753,
"step": 114
},
{
"epoch": 0.6220419202163624,
"grad_norm": 6.635109301432599,
"learning_rate": 4.648738243316128e-06,
"loss": 1.7832,
"step": 115
},
{
"epoch": 0.6274509803921569,
"grad_norm": 6.6861559036147975,
"learning_rate": 4.641082437424277e-06,
"loss": 1.7703,
"step": 116
},
{
"epoch": 0.6328600405679513,
"grad_norm": 6.9646373816737075,
"learning_rate": 4.633350544504899e-06,
"loss": 1.7764,
"step": 117
},
{
"epoch": 0.6382691007437458,
"grad_norm": 6.144269305268074,
"learning_rate": 4.625542839324036e-06,
"loss": 1.8115,
"step": 118
},
{
"epoch": 0.6436781609195402,
"grad_norm": 6.489513509921581,
"learning_rate": 4.617659599341849e-06,
"loss": 1.7991,
"step": 119
},
{
"epoch": 0.6490872210953347,
"grad_norm": 6.883770839388401,
"learning_rate": 4.609701104702759e-06,
"loss": 1.6693,
"step": 120
},
{
"epoch": 0.6544962812711291,
"grad_norm": 7.5470889656860605,
"learning_rate": 4.6016676382254895e-06,
"loss": 1.7072,
"step": 121
},
{
"epoch": 0.6599053414469236,
"grad_norm": 7.546505121169261,
"learning_rate": 4.593559485393015e-06,
"loss": 1.7841,
"step": 122
},
{
"epoch": 0.665314401622718,
"grad_norm": 7.028634956829483,
"learning_rate": 4.585376934342418e-06,
"loss": 1.8216,
"step": 123
},
{
"epoch": 0.6707234617985125,
"grad_norm": 6.173100762882487,
"learning_rate": 4.577120275854649e-06,
"loss": 1.7126,
"step": 124
},
{
"epoch": 0.676132521974307,
"grad_norm": 7.3755207955713,
"learning_rate": 4.568789803344196e-06,
"loss": 1.6914,
"step": 125
},
{
"epoch": 0.6815415821501014,
"grad_norm": 7.110847605538237,
"learning_rate": 4.56038581284865e-06,
"loss": 1.7385,
"step": 126
},
{
"epoch": 0.6869506423258959,
"grad_norm": 6.361787130642339,
"learning_rate": 4.551908603018191e-06,
"loss": 1.6135,
"step": 127
},
{
"epoch": 0.6923597025016903,
"grad_norm": 6.445020674626523,
"learning_rate": 4.543358475104975e-06,
"loss": 1.709,
"step": 128
},
{
"epoch": 0.6977687626774848,
"grad_norm": 9.06412704312379,
"learning_rate": 4.5347357329524254e-06,
"loss": 1.846,
"step": 129
},
{
"epoch": 0.7031778228532792,
"grad_norm": 6.7066499384595675,
"learning_rate": 4.5260406829844364e-06,
"loss": 1.7074,
"step": 130
},
{
"epoch": 0.7085868830290737,
"grad_norm": 5.861118356970615,
"learning_rate": 4.5172736341944845e-06,
"loss": 1.7179,
"step": 131
},
{
"epoch": 0.7139959432048681,
"grad_norm": 7.102544513000641,
"learning_rate": 4.5084348981346495e-06,
"loss": 1.7577,
"step": 132
},
{
"epoch": 0.7194050033806626,
"grad_norm": 10.027416022472378,
"learning_rate": 4.499524788904537e-06,
"loss": 1.6513,
"step": 133
},
{
"epoch": 0.7248140635564571,
"grad_norm": 6.648158309890374,
"learning_rate": 4.490543623140123e-06,
"loss": 1.6589,
"step": 134
},
{
"epoch": 0.7302231237322515,
"grad_norm": 6.662339022641756,
"learning_rate": 4.481491720002499e-06,
"loss": 1.6705,
"step": 135
},
{
"epoch": 0.735632183908046,
"grad_norm": 6.0555935222186665,
"learning_rate": 4.472369401166531e-06,
"loss": 1.7424,
"step": 136
},
{
"epoch": 0.7410412440838404,
"grad_norm": 8.895889108381375,
"learning_rate": 4.463176990809423e-06,
"loss": 1.7386,
"step": 137
},
{
"epoch": 0.7464503042596349,
"grad_norm": 7.003045853961112,
"learning_rate": 4.453914815599206e-06,
"loss": 1.7036,
"step": 138
},
{
"epoch": 0.7464503042596349,
"eval_loss": 1.716886281967163,
"eval_runtime": 80.2672,
"eval_samples_per_second": 15.523,
"eval_steps_per_second": 1.944,
"step": 138
},
{
"epoch": 0.7518593644354293,
"grad_norm": 6.693085585003622,
"learning_rate": 4.444583204683123e-06,
"loss": 1.7129,
"step": 139
},
{
"epoch": 0.7572684246112238,
"grad_norm": 7.502871529343553,
"learning_rate": 4.435182489675931e-06,
"loss": 1.6455,
"step": 140
},
{
"epoch": 0.7626774847870182,
"grad_norm": 6.062617524551209,
"learning_rate": 4.425713004648123e-06,
"loss": 1.5962,
"step": 141
},
{
"epoch": 0.7680865449628127,
"grad_norm": 7.073651768871659,
"learning_rate": 4.416175086114049e-06,
"loss": 1.6784,
"step": 142
},
{
"epoch": 0.7734956051386072,
"grad_norm": 5.885299008267407,
"learning_rate": 4.406569073019965e-06,
"loss": 1.7525,
"step": 143
},
{
"epoch": 0.7789046653144016,
"grad_norm": 6.313009716756996,
"learning_rate": 4.396895306731978e-06,
"loss": 1.7347,
"step": 144
},
{
"epoch": 0.7843137254901961,
"grad_norm": 7.009990810809474,
"learning_rate": 4.387154131023924e-06,
"loss": 1.691,
"step": 145
},
{
"epoch": 0.7897227856659905,
"grad_norm": 6.900733353908871,
"learning_rate": 4.377345892065149e-06,
"loss": 1.6789,
"step": 146
},
{
"epoch": 0.795131845841785,
"grad_norm": 6.705414309211519,
"learning_rate": 4.367470938408204e-06,
"loss": 1.6024,
"step": 147
},
{
"epoch": 0.8005409060175794,
"grad_norm": 5.746533610111662,
"learning_rate": 4.357529620976463e-06,
"loss": 1.5715,
"step": 148
},
{
"epoch": 0.8059499661933739,
"grad_norm": 6.344927493402156,
"learning_rate": 4.3475222930516484e-06,
"loss": 1.6393,
"step": 149
},
{
"epoch": 0.8113590263691683,
"grad_norm": 7.177833196869734,
"learning_rate": 4.337449310261279e-06,
"loss": 1.7165,
"step": 150
},
{
"epoch": 0.8167680865449628,
"grad_norm": 7.330166717470119,
"learning_rate": 4.327311030566033e-06,
"loss": 1.5549,
"step": 151
},
{
"epoch": 0.8221771467207573,
"grad_norm": 6.615833346853653,
"learning_rate": 4.317107814247022e-06,
"loss": 1.5402,
"step": 152
},
{
"epoch": 0.8275862068965517,
"grad_norm": 6.435680171929204,
"learning_rate": 4.306840023892998e-06,
"loss": 1.6594,
"step": 153
},
{
"epoch": 0.8329952670723462,
"grad_norm": 8.276922490588758,
"learning_rate": 4.2965080243874555e-06,
"loss": 1.7175,
"step": 154
},
{
"epoch": 0.8384043272481406,
"grad_norm": 6.944104925297799,
"learning_rate": 4.2861121828956745e-06,
"loss": 1.6139,
"step": 155
},
{
"epoch": 0.8438133874239351,
"grad_norm": 6.95913091453491,
"learning_rate": 4.275652868851669e-06,
"loss": 1.6509,
"step": 156
},
{
"epoch": 0.8492224475997295,
"grad_norm": 6.520552325908588,
"learning_rate": 4.265130453945056e-06,
"loss": 1.6153,
"step": 157
},
{
"epoch": 0.854631507775524,
"grad_norm": 5.793083719201756,
"learning_rate": 4.254545312107854e-06,
"loss": 1.6652,
"step": 158
},
{
"epoch": 0.8600405679513184,
"grad_norm": 7.087661599168472,
"learning_rate": 4.243897819501187e-06,
"loss": 1.5381,
"step": 159
},
{
"epoch": 0.8654496281271129,
"grad_norm": 7.094455948395506,
"learning_rate": 4.233188354501921e-06,
"loss": 1.5494,
"step": 160
},
{
"epoch": 0.8708586883029074,
"grad_norm": 8.203152015742518,
"learning_rate": 4.222417297689217e-06,
"loss": 1.6491,
"step": 161
},
{
"epoch": 0.8762677484787018,
"grad_norm": 6.453682013406382,
"learning_rate": 4.211585031831007e-06,
"loss": 1.6017,
"step": 162
},
{
"epoch": 0.8816768086544963,
"grad_norm": 5.936146356941861,
"learning_rate": 4.200691941870392e-06,
"loss": 1.578,
"step": 163
},
{
"epoch": 0.8870858688302907,
"grad_norm": 6.939707441726559,
"learning_rate": 4.189738414911959e-06,
"loss": 1.6267,
"step": 164
},
{
"epoch": 0.8924949290060852,
"grad_norm": 5.410313740063528,
"learning_rate": 4.178724840208029e-06,
"loss": 1.4958,
"step": 165
},
{
"epoch": 0.8979039891818796,
"grad_norm": 6.696136302418629,
"learning_rate": 4.167651609144822e-06,
"loss": 1.5497,
"step": 166
},
{
"epoch": 0.9033130493576741,
"grad_norm": 7.906312937190654,
"learning_rate": 4.15651911522855e-06,
"loss": 1.5839,
"step": 167
},
{
"epoch": 0.9087221095334685,
"grad_norm": 6.535495007076388,
"learning_rate": 4.145327754071427e-06,
"loss": 1.5472,
"step": 168
},
{
"epoch": 0.914131169709263,
"grad_norm": 5.836290315234971,
"learning_rate": 4.134077923377622e-06,
"loss": 1.552,
"step": 169
},
{
"epoch": 0.9195402298850575,
"grad_norm": 6.8226833628498476,
"learning_rate": 4.122770022929114e-06,
"loss": 1.5789,
"step": 170
},
{
"epoch": 0.9249492900608519,
"grad_norm": 6.636708734597821,
"learning_rate": 4.1114044545714935e-06,
"loss": 1.5297,
"step": 171
},
{
"epoch": 0.9303583502366464,
"grad_norm": 6.480166160413371,
"learning_rate": 4.0999816221996755e-06,
"loss": 1.64,
"step": 172
},
{
"epoch": 0.9357674104124408,
"grad_norm": 6.305918232446464,
"learning_rate": 4.088501931743551e-06,
"loss": 1.6244,
"step": 173
},
{
"epoch": 0.9411764705882353,
"grad_norm": 6.244739864099848,
"learning_rate": 4.076965791153562e-06,
"loss": 1.4548,
"step": 174
},
{
"epoch": 0.9465855307640297,
"grad_norm": 6.652315653413243,
"learning_rate": 4.065373610386201e-06,
"loss": 1.6279,
"step": 175
},
{
"epoch": 0.9519945909398242,
"grad_norm": 7.0369918458505,
"learning_rate": 4.0537258013894434e-06,
"loss": 1.5423,
"step": 176
},
{
"epoch": 0.9574036511156186,
"grad_norm": 6.219650042475582,
"learning_rate": 4.042022778088111e-06,
"loss": 1.6608,
"step": 177
},
{
"epoch": 0.9628127112914131,
"grad_norm": 6.876925788244563,
"learning_rate": 4.030264956369158e-06,
"loss": 1.5072,
"step": 178
},
{
"epoch": 0.9682217714672076,
"grad_norm": 6.90949169776309,
"learning_rate": 4.018452754066895e-06,
"loss": 1.6312,
"step": 179
},
{
"epoch": 0.973630831643002,
"grad_norm": 6.8575340845851604,
"learning_rate": 4.006586590948141e-06,
"loss": 1.5603,
"step": 180
},
{
"epoch": 0.9790398918187965,
"grad_norm": 6.399141972511512,
"learning_rate": 3.994666888697304e-06,
"loss": 1.4676,
"step": 181
},
{
"epoch": 0.9844489519945909,
"grad_norm": 6.8804715530154255,
"learning_rate": 3.982694070901396e-06,
"loss": 1.5238,
"step": 182
},
{
"epoch": 0.9898580121703854,
"grad_norm": 6.143146424939786,
"learning_rate": 3.970668563034982e-06,
"loss": 1.5458,
"step": 183
},
{
"epoch": 0.9952670723461798,
"grad_norm": 7.1826166940238005,
"learning_rate": 3.958590792445057e-06,
"loss": 1.5179,
"step": 184
},
{
"epoch": 0.9952670723461798,
"eval_loss": 1.555787205696106,
"eval_runtime": 80.2289,
"eval_samples_per_second": 15.531,
"eval_steps_per_second": 1.944,
"step": 184
},
{
"epoch": 1.0006761325219744,
"grad_norm": 6.359754610852221,
"learning_rate": 3.946461188335863e-06,
"loss": 1.4166,
"step": 185
},
{
"epoch": 1.0060851926977687,
"grad_norm": 6.558163414784584,
"learning_rate": 3.934280181753634e-06,
"loss": 1.2751,
"step": 186
},
{
"epoch": 1.0114942528735633,
"grad_norm": 7.219629642258684,
"learning_rate": 3.922048205571279e-06,
"loss": 1.1886,
"step": 187
},
{
"epoch": 1.0169033130493577,
"grad_norm": 5.641940967289977,
"learning_rate": 3.909765694473e-06,
"loss": 1.162,
"step": 188
},
{
"epoch": 1.0223123732251522,
"grad_norm": 5.146358119235958,
"learning_rate": 3.897433084938841e-06,
"loss": 1.1985,
"step": 189
},
{
"epoch": 1.0277214334009466,
"grad_norm": 6.030205789253815,
"learning_rate": 3.885050815229182e-06,
"loss": 1.2081,
"step": 190
},
{
"epoch": 1.0331304935767411,
"grad_norm": 7.022365747053938,
"learning_rate": 3.872619325369162e-06,
"loss": 1.2893,
"step": 191
},
{
"epoch": 1.0385395537525355,
"grad_norm": 5.333726155190956,
"learning_rate": 3.860139057133042e-06,
"loss": 1.0908,
"step": 192
},
{
"epoch": 1.04394861392833,
"grad_norm": 5.45665767931003,
"learning_rate": 3.8476104540285054e-06,
"loss": 1.1623,
"step": 193
},
{
"epoch": 1.0493576741041244,
"grad_norm": 5.712839244673719,
"learning_rate": 3.835033961280898e-06,
"loss": 1.2006,
"step": 194
},
{
"epoch": 1.054766734279919,
"grad_norm": 5.034751006183729,
"learning_rate": 3.8224100258174066e-06,
"loss": 1.1717,
"step": 195
},
{
"epoch": 1.0601757944557133,
"grad_norm": 5.683598056495194,
"learning_rate": 3.809739096251176e-06,
"loss": 1.0888,
"step": 196
},
{
"epoch": 1.0655848546315079,
"grad_norm": 5.519253752312427,
"learning_rate": 3.7970216228653667e-06,
"loss": 1.1504,
"step": 197
},
{
"epoch": 1.0709939148073022,
"grad_norm": 5.447708800800649,
"learning_rate": 3.7842580575971533e-06,
"loss": 1.1493,
"step": 198
},
{
"epoch": 1.0764029749830968,
"grad_norm": 5.433128433416691,
"learning_rate": 3.7714488540216637e-06,
"loss": 1.2068,
"step": 199
},
{
"epoch": 1.0818120351588911,
"grad_norm": 5.161001564479917,
"learning_rate": 3.7585944673358632e-06,
"loss": 1.061,
"step": 200
},
{
"epoch": 1.0872210953346857,
"grad_norm": 6.0824539385903895,
"learning_rate": 3.745695354342374e-06,
"loss": 1.0569,
"step": 201
},
{
"epoch": 1.09263015551048,
"grad_norm": 5.834270627588388,
"learning_rate": 3.7327519734332453e-06,
"loss": 1.1536,
"step": 202
},
{
"epoch": 1.0980392156862746,
"grad_norm": 5.775803225686334,
"learning_rate": 3.7197647845736616e-06,
"loss": 1.0768,
"step": 203
},
{
"epoch": 1.103448275862069,
"grad_norm": 5.875985815161611,
"learning_rate": 3.7067342492855997e-06,
"loss": 1.0848,
"step": 204
},
{
"epoch": 1.1088573360378635,
"grad_norm": 5.252887507709333,
"learning_rate": 3.6936608306314227e-06,
"loss": 1.0704,
"step": 205
},
{
"epoch": 1.1142663962136579,
"grad_norm": 5.6931233725933375,
"learning_rate": 3.6805449931974313e-06,
"loss": 1.0765,
"step": 206
},
{
"epoch": 1.1196754563894524,
"grad_norm": 5.48726034244755,
"learning_rate": 3.6673872030773473e-06,
"loss": 1.1818,
"step": 207
},
{
"epoch": 1.1250845165652468,
"grad_norm": 6.429761402145819,
"learning_rate": 3.654187927855754e-06,
"loss": 1.0093,
"step": 208
},
{
"epoch": 1.1304935767410413,
"grad_norm": 6.370439411738345,
"learning_rate": 3.6409476365914786e-06,
"loss": 1.1248,
"step": 209
},
{
"epoch": 1.1359026369168357,
"grad_norm": 5.059354487980713,
"learning_rate": 3.6276667998009242e-06,
"loss": 1.1993,
"step": 210
},
{
"epoch": 1.1413116970926303,
"grad_norm": 5.240968702755266,
"learning_rate": 3.6143458894413463e-06,
"loss": 1.1065,
"step": 211
},
{
"epoch": 1.1467207572684246,
"grad_norm": 6.6242597409867425,
"learning_rate": 3.600985378894086e-06,
"loss": 1.1479,
"step": 212
},
{
"epoch": 1.1521298174442192,
"grad_norm": 5.568795571449795,
"learning_rate": 3.5875857429477447e-06,
"loss": 1.1048,
"step": 213
},
{
"epoch": 1.1575388776200135,
"grad_norm": 5.421352959165325,
"learning_rate": 3.5741474577813086e-06,
"loss": 1.1126,
"step": 214
},
{
"epoch": 1.162947937795808,
"grad_norm": 4.977758792843085,
"learning_rate": 3.5606710009472335e-06,
"loss": 1.0064,
"step": 215
},
{
"epoch": 1.1683569979716024,
"grad_norm": 5.911407929293024,
"learning_rate": 3.54715685135447e-06,
"loss": 1.1859,
"step": 216
},
{
"epoch": 1.173766058147397,
"grad_norm": 5.557031565026226,
"learning_rate": 3.5336054892514437e-06,
"loss": 1.1526,
"step": 217
},
{
"epoch": 1.1791751183231913,
"grad_norm": 5.900696316987968,
"learning_rate": 3.520017396208993e-06,
"loss": 1.0158,
"step": 218
},
{
"epoch": 1.184584178498986,
"grad_norm": 5.287116454496747,
"learning_rate": 3.5063930551032494e-06,
"loss": 1.0677,
"step": 219
},
{
"epoch": 1.1899932386747802,
"grad_norm": 5.106236012898038,
"learning_rate": 3.4927329500984857e-06,
"loss": 1.0827,
"step": 220
},
{
"epoch": 1.1954022988505748,
"grad_norm": 4.8246900511041595,
"learning_rate": 3.4790375666299026e-06,
"loss": 1.1788,
"step": 221
},
{
"epoch": 1.2008113590263692,
"grad_norm": 5.545181164784277,
"learning_rate": 3.465307391386383e-06,
"loss": 1.1896,
"step": 222
},
{
"epoch": 1.2062204192021637,
"grad_norm": 5.427546334067872,
"learning_rate": 3.4515429122931955e-06,
"loss": 1.0658,
"step": 223
},
{
"epoch": 1.211629479377958,
"grad_norm": 5.245977350973333,
"learning_rate": 3.437744618494653e-06,
"loss": 1.1658,
"step": 224
},
{
"epoch": 1.2170385395537526,
"grad_norm": 5.043336556534448,
"learning_rate": 3.423913000336732e-06,
"loss": 1.098,
"step": 225
},
{
"epoch": 1.222447599729547,
"grad_norm": 5.579610380422948,
"learning_rate": 3.41004854934965e-06,
"loss": 1.0795,
"step": 226
},
{
"epoch": 1.2278566599053415,
"grad_norm": 6.096777208300464,
"learning_rate": 3.3961517582303916e-06,
"loss": 1.0956,
"step": 227
},
{
"epoch": 1.2332657200811359,
"grad_norm": 5.2061771582860645,
"learning_rate": 3.3822231208252053e-06,
"loss": 1.1306,
"step": 228
},
{
"epoch": 1.2386747802569305,
"grad_norm": 5.182819955042982,
"learning_rate": 3.3682631321120507e-06,
"loss": 1.1009,
"step": 229
},
{
"epoch": 1.2440838404327248,
"grad_norm": 5.625028453640487,
"learning_rate": 3.354272288183012e-06,
"loss": 1.1819,
"step": 230
},
{
"epoch": 1.2440838404327248,
"eval_loss": 1.463145136833191,
"eval_runtime": 80.1754,
"eval_samples_per_second": 15.541,
"eval_steps_per_second": 1.946,
"step": 230
},
{
"epoch": 1.2494929006085194,
"grad_norm": 5.590526458310239,
"learning_rate": 3.340251086226663e-06,
"loss": 0.9821,
"step": 231
},
{
"epoch": 1.2549019607843137,
"grad_norm": 11.81361368808831,
"learning_rate": 3.326200024510405e-06,
"loss": 1.1092,
"step": 232
},
{
"epoch": 1.260311020960108,
"grad_norm": 5.093824480458287,
"learning_rate": 3.3121196023627543e-06,
"loss": 1.1585,
"step": 233
},
{
"epoch": 1.2657200811359026,
"grad_norm": 5.166324447334581,
"learning_rate": 3.2980103201556023e-06,
"loss": 1.157,
"step": 234
},
{
"epoch": 1.2711291413116972,
"grad_norm": 5.637337054230702,
"learning_rate": 3.2838726792864315e-06,
"loss": 1.166,
"step": 235
},
{
"epoch": 1.2765382014874915,
"grad_norm": 5.6363361298308545,
"learning_rate": 3.2697071821604986e-06,
"loss": 1.0634,
"step": 236
},
{
"epoch": 1.2819472616632859,
"grad_norm": 5.96879436061098,
"learning_rate": 3.255514332172979e-06,
"loss": 1.0877,
"step": 237
},
{
"epoch": 1.2873563218390804,
"grad_norm": 5.298279579130303,
"learning_rate": 3.2412946336910778e-06,
"loss": 1.1096,
"step": 238
},
{
"epoch": 1.292765382014875,
"grad_norm": 5.24661665042464,
"learning_rate": 3.2270485920361093e-06,
"loss": 1.0553,
"step": 239
},
{
"epoch": 1.2981744421906694,
"grad_norm": 5.2866147649117305,
"learning_rate": 3.2127767134655374e-06,
"loss": 1.0457,
"step": 240
},
{
"epoch": 1.3035835023664637,
"grad_norm": 4.959907157136451,
"learning_rate": 3.198479505154984e-06,
"loss": 1.1399,
"step": 241
},
{
"epoch": 1.3089925625422583,
"grad_norm": 5.238554311773635,
"learning_rate": 3.184157475180208e-06,
"loss": 1.1271,
"step": 242
},
{
"epoch": 1.3144016227180528,
"grad_norm": 6.098173197957714,
"learning_rate": 3.1698111324990454e-06,
"loss": 1.0782,
"step": 243
},
{
"epoch": 1.3198106828938472,
"grad_norm": 11.039229160599406,
"learning_rate": 3.15544098693333e-06,
"loss": 1.05,
"step": 244
},
{
"epoch": 1.3252197430696415,
"grad_norm": 5.499488969453295,
"learning_rate": 3.14104754915077e-06,
"loss": 1.0368,
"step": 245
},
{
"epoch": 1.330628803245436,
"grad_norm": 5.138414489753493,
"learning_rate": 3.1266313306468018e-06,
"loss": 1.0835,
"step": 246
},
{
"epoch": 1.3360378634212307,
"grad_norm": 5.106872795478693,
"learning_rate": 3.1121928437264138e-06,
"loss": 0.9708,
"step": 247
},
{
"epoch": 1.341446923597025,
"grad_norm": 5.002564538923287,
"learning_rate": 3.0977326014859415e-06,
"loss": 1.0687,
"step": 248
},
{
"epoch": 1.3468559837728193,
"grad_norm": 5.387403166077266,
"learning_rate": 3.0832511177948326e-06,
"loss": 1.1496,
"step": 249
},
{
"epoch": 1.352265043948614,
"grad_norm": 8.271273394288187,
"learning_rate": 3.0687489072773864e-06,
"loss": 1.0601,
"step": 250
},
{
"epoch": 1.3576741041244085,
"grad_norm": 5.909473723884275,
"learning_rate": 3.0542264852944635e-06,
"loss": 1.1829,
"step": 251
},
{
"epoch": 1.3630831643002028,
"grad_norm": 5.390843669805894,
"learning_rate": 3.0396843679251777e-06,
"loss": 1.1672,
"step": 252
},
{
"epoch": 1.3684922244759972,
"grad_norm": 4.9990577854234095,
"learning_rate": 3.0251230719485465e-06,
"loss": 1.0904,
"step": 253
},
{
"epoch": 1.3739012846517917,
"grad_norm": 5.2939324887679495,
"learning_rate": 3.0105431148251364e-06,
"loss": 1.0716,
"step": 254
},
{
"epoch": 1.3793103448275863,
"grad_norm": 5.423330025150549,
"learning_rate": 2.9959450146786674e-06,
"loss": 1.1325,
"step": 255
},
{
"epoch": 1.3847194050033806,
"grad_norm": 5.023381297619562,
"learning_rate": 2.981329290277605e-06,
"loss": 0.987,
"step": 256
},
{
"epoch": 1.390128465179175,
"grad_norm": 5.282148760283777,
"learning_rate": 2.966696461016721e-06,
"loss": 1.0894,
"step": 257
},
{
"epoch": 1.3955375253549696,
"grad_norm": 5.15061299365383,
"learning_rate": 2.952047046898637e-06,
"loss": 1.1275,
"step": 258
},
{
"epoch": 1.4009465855307641,
"grad_norm": 5.6780949048605285,
"learning_rate": 2.9373815685153485e-06,
"loss": 0.9549,
"step": 259
},
{
"epoch": 1.4063556457065585,
"grad_norm": 5.251474491048633,
"learning_rate": 2.9227005470297194e-06,
"loss": 1.0549,
"step": 260
},
{
"epoch": 1.4117647058823528,
"grad_norm": 5.418474582825882,
"learning_rate": 2.9080045041569647e-06,
"loss": 1.1761,
"step": 261
},
{
"epoch": 1.4171737660581474,
"grad_norm": 5.363207500979156,
"learning_rate": 2.893293962146114e-06,
"loss": 1.132,
"step": 262
},
{
"epoch": 1.422582826233942,
"grad_norm": 6.178387649415905,
"learning_rate": 2.878569443761442e-06,
"loss": 1.0214,
"step": 263
},
{
"epoch": 1.4279918864097363,
"grad_norm": 5.718107503992589,
"learning_rate": 2.863831472263904e-06,
"loss": 1.0028,
"step": 264
},
{
"epoch": 1.4334009465855306,
"grad_norm": 5.279016485794886,
"learning_rate": 2.8490805713925298e-06,
"loss": 1.0827,
"step": 265
},
{
"epoch": 1.4388100067613252,
"grad_norm": 5.087811977719158,
"learning_rate": 2.8343172653458194e-06,
"loss": 1.0937,
"step": 266
},
{
"epoch": 1.4442190669371198,
"grad_norm": 5.4070064172712184,
"learning_rate": 2.8195420787631113e-06,
"loss": 1.123,
"step": 267
},
{
"epoch": 1.4496281271129141,
"grad_norm": 5.587801015021142,
"learning_rate": 2.8047555367059404e-06,
"loss": 1.0621,
"step": 268
},
{
"epoch": 1.4550371872887085,
"grad_norm": 5.552901382895493,
"learning_rate": 2.7899581646393746e-06,
"loss": 0.9631,
"step": 269
},
{
"epoch": 1.460446247464503,
"grad_norm": 5.4717166148353185,
"learning_rate": 2.7751504884133484e-06,
"loss": 1.0253,
"step": 270
},
{
"epoch": 1.4658553076402976,
"grad_norm": 5.3779298143381356,
"learning_rate": 2.7603330342439686e-06,
"loss": 0.9938,
"step": 271
},
{
"epoch": 1.471264367816092,
"grad_norm": 6.033013211604236,
"learning_rate": 2.745506328694822e-06,
"loss": 1.0509,
"step": 272
},
{
"epoch": 1.4766734279918863,
"grad_norm": 5.832457552507952,
"learning_rate": 2.730670898658255e-06,
"loss": 1.1067,
"step": 273
},
{
"epoch": 1.4820824881676808,
"grad_norm": 5.7329943331650295,
"learning_rate": 2.7158272713366573e-06,
"loss": 1.0657,
"step": 274
},
{
"epoch": 1.4874915483434754,
"grad_norm": 5.506227442307808,
"learning_rate": 2.700975974223719e-06,
"loss": 1.1391,
"step": 275
},
{
"epoch": 1.4929006085192698,
"grad_norm": 4.978032312700087,
"learning_rate": 2.6861175350856937e-06,
"loss": 0.9931,
"step": 276
},
{
"epoch": 1.4929006085192698,
"eval_loss": 1.3746687173843384,
"eval_runtime": 80.4525,
"eval_samples_per_second": 15.487,
"eval_steps_per_second": 1.939,
"step": 276
},
{
"epoch": 1.498309668695064,
"grad_norm": 4.962349655742751,
"learning_rate": 2.6712524819426355e-06,
"loss": 1.0006,
"step": 277
},
{
"epoch": 1.5037187288708587,
"grad_norm": 4.925015437741922,
"learning_rate": 2.656381343049641e-06,
"loss": 1.1016,
"step": 278
},
{
"epoch": 1.5091277890466532,
"grad_norm": 4.738818602054919,
"learning_rate": 2.6415046468780726e-06,
"loss": 1.0465,
"step": 279
},
{
"epoch": 1.5145368492224476,
"grad_norm": 4.961032669335967,
"learning_rate": 2.626622922096782e-06,
"loss": 0.9809,
"step": 280
},
{
"epoch": 1.519945909398242,
"grad_norm": 5.043527496081483,
"learning_rate": 2.6117366975533187e-06,
"loss": 1.0272,
"step": 281
},
{
"epoch": 1.5253549695740365,
"grad_norm": 5.913998311820069,
"learning_rate": 2.596846502255142e-06,
"loss": 1.0146,
"step": 282
},
{
"epoch": 1.530764029749831,
"grad_norm": 5.295260207722928,
"learning_rate": 2.581952865350815e-06,
"loss": 1.0956,
"step": 283
},
{
"epoch": 1.5361730899256254,
"grad_norm": 4.466395085273886,
"learning_rate": 2.5670563161112073e-06,
"loss": 1.0354,
"step": 284
},
{
"epoch": 1.5415821501014197,
"grad_norm": 5.399245625271399,
"learning_rate": 2.5521573839106815e-06,
"loss": 0.9433,
"step": 285
},
{
"epoch": 1.5469912102772143,
"grad_norm": 5.270037129845648,
"learning_rate": 2.5372565982082843e-06,
"loss": 0.9744,
"step": 286
},
{
"epoch": 1.5524002704530089,
"grad_norm": 5.161110168680388,
"learning_rate": 2.5223544885289287e-06,
"loss": 1.0077,
"step": 287
},
{
"epoch": 1.5578093306288032,
"grad_norm": 5.6674277481523045,
"learning_rate": 2.5074515844445774e-06,
"loss": 1.0805,
"step": 288
},
{
"epoch": 1.5632183908045976,
"grad_norm": 5.089380302532208,
"learning_rate": 2.4925484155554235e-06,
"loss": 0.9904,
"step": 289
},
{
"epoch": 1.5686274509803921,
"grad_norm": 4.6826224514211,
"learning_rate": 2.477645511471073e-06,
"loss": 1.0843,
"step": 290
},
{
"epoch": 1.5740365111561867,
"grad_norm": 5.231440812973741,
"learning_rate": 2.462743401791716e-06,
"loss": 1.0343,
"step": 291
},
{
"epoch": 1.579445571331981,
"grad_norm": 6.063260236415264,
"learning_rate": 2.4478426160893197e-06,
"loss": 1.0377,
"step": 292
},
{
"epoch": 1.5848546315077754,
"grad_norm": 4.850759349852377,
"learning_rate": 2.4329436838887936e-06,
"loss": 0.984,
"step": 293
},
{
"epoch": 1.59026369168357,
"grad_norm": 5.356535713038874,
"learning_rate": 2.4180471346491864e-06,
"loss": 1.0006,
"step": 294
},
{
"epoch": 1.5956727518593645,
"grad_norm": 5.843575763516113,
"learning_rate": 2.403153497744859e-06,
"loss": 1.0172,
"step": 295
},
{
"epoch": 1.6010818120351589,
"grad_norm": 5.383170068467428,
"learning_rate": 2.3882633024466813e-06,
"loss": 0.9683,
"step": 296
},
{
"epoch": 1.6064908722109532,
"grad_norm": 5.754163901430788,
"learning_rate": 2.3733770779032185e-06,
"loss": 0.9569,
"step": 297
},
{
"epoch": 1.6118999323867478,
"grad_norm": 5.20527028666756,
"learning_rate": 2.3584953531219278e-06,
"loss": 1.0422,
"step": 298
},
{
"epoch": 1.6173089925625423,
"grad_norm": 7.36652597560724,
"learning_rate": 2.3436186569503598e-06,
"loss": 1.0485,
"step": 299
},
{
"epoch": 1.6227180527383367,
"grad_norm": 5.265722625187695,
"learning_rate": 2.3287475180573653e-06,
"loss": 1.0536,
"step": 300
},
{
"epoch": 1.628127112914131,
"grad_norm": 5.352738983760188,
"learning_rate": 2.3138824649143076e-06,
"loss": 1.0608,
"step": 301
},
{
"epoch": 1.6335361730899256,
"grad_norm": 5.069558233311071,
"learning_rate": 2.2990240257762817e-06,
"loss": 0.9955,
"step": 302
},
{
"epoch": 1.6389452332657202,
"grad_norm": 5.3724147320331435,
"learning_rate": 2.2841727286633444e-06,
"loss": 1.0888,
"step": 303
},
{
"epoch": 1.6443542934415145,
"grad_norm": 4.963379896141614,
"learning_rate": 2.269329101341745e-06,
"loss": 0.98,
"step": 304
},
{
"epoch": 1.6497633536173089,
"grad_norm": 5.132481781947378,
"learning_rate": 2.254493671305179e-06,
"loss": 0.9771,
"step": 305
},
{
"epoch": 1.6551724137931034,
"grad_norm": 5.347290600611583,
"learning_rate": 2.239666965756032e-06,
"loss": 0.9705,
"step": 306
},
{
"epoch": 1.660581473968898,
"grad_norm": 5.722698577141815,
"learning_rate": 2.224849511586652e-06,
"loss": 0.9266,
"step": 307
},
{
"epoch": 1.6659905341446923,
"grad_norm": 5.733293031193421,
"learning_rate": 2.2100418353606262e-06,
"loss": 1.0429,
"step": 308
},
{
"epoch": 1.6713995943204867,
"grad_norm": 6.274637328200243,
"learning_rate": 2.19524446329406e-06,
"loss": 1.0208,
"step": 309
},
{
"epoch": 1.6768086544962812,
"grad_norm": 5.5296462714582955,
"learning_rate": 2.180457921236889e-06,
"loss": 0.9945,
"step": 310
},
{
"epoch": 1.6822177146720758,
"grad_norm": 5.336860694103749,
"learning_rate": 2.165682734654181e-06,
"loss": 0.9847,
"step": 311
},
{
"epoch": 1.6876267748478702,
"grad_norm": 5.016859607786664,
"learning_rate": 2.150919428607472e-06,
"loss": 1.0305,
"step": 312
},
{
"epoch": 1.6930358350236645,
"grad_norm": 4.918636259586344,
"learning_rate": 2.1361685277360973e-06,
"loss": 1.0611,
"step": 313
},
{
"epoch": 1.698444895199459,
"grad_norm": 5.520829989468747,
"learning_rate": 2.1214305562385592e-06,
"loss": 0.8441,
"step": 314
},
{
"epoch": 1.7038539553752536,
"grad_norm": 5.327879216317322,
"learning_rate": 2.106706037853887e-06,
"loss": 1.0225,
"step": 315
},
{
"epoch": 1.709263015551048,
"grad_norm": 5.3843566459704135,
"learning_rate": 2.0919954958430357e-06,
"loss": 1.0082,
"step": 316
},
{
"epoch": 1.7146720757268423,
"grad_norm": 5.395175713065742,
"learning_rate": 2.077299452970282e-06,
"loss": 1.023,
"step": 317
},
{
"epoch": 1.720081135902637,
"grad_norm": 5.887472074041824,
"learning_rate": 2.062618431484652e-06,
"loss": 0.9278,
"step": 318
},
{
"epoch": 1.7254901960784315,
"grad_norm": 6.537709956987992,
"learning_rate": 2.047952953101363e-06,
"loss": 1.0788,
"step": 319
},
{
"epoch": 1.7308992562542258,
"grad_norm": 6.118159510578694,
"learning_rate": 2.0333035389832795e-06,
"loss": 1.1197,
"step": 320
},
{
"epoch": 1.7363083164300201,
"grad_norm": 5.2165328231189765,
"learning_rate": 2.0186707097223952e-06,
"loss": 1.0348,
"step": 321
},
{
"epoch": 1.7417173766058147,
"grad_norm": 5.79401876729717,
"learning_rate": 2.0040549853213326e-06,
"loss": 1.0581,
"step": 322
},
{
"epoch": 1.7417173766058147,
"eval_loss": 1.306036353111267,
"eval_runtime": 80.4204,
"eval_samples_per_second": 15.494,
"eval_steps_per_second": 1.94,
"step": 322
},
{
"epoch": 1.7471264367816093,
"grad_norm": 5.1382450667316455,
"learning_rate": 1.989456885174865e-06,
"loss": 0.9931,
"step": 323
},
{
"epoch": 1.7525354969574036,
"grad_norm": 5.923288803726085,
"learning_rate": 1.9748769280514544e-06,
"loss": 1.048,
"step": 324
},
{
"epoch": 1.757944557133198,
"grad_norm": 5.27150872289333,
"learning_rate": 1.960315632074824e-06,
"loss": 0.9658,
"step": 325
},
{
"epoch": 1.7633536173089925,
"grad_norm": 5.594406875794453,
"learning_rate": 1.945773514705537e-06,
"loss": 0.9441,
"step": 326
},
{
"epoch": 1.768762677484787,
"grad_norm": 5.205336223607601,
"learning_rate": 1.931251092722615e-06,
"loss": 1.0016,
"step": 327
},
{
"epoch": 1.7741717376605814,
"grad_norm": 5.104727647984587,
"learning_rate": 1.916748882205168e-06,
"loss": 0.917,
"step": 328
},
{
"epoch": 1.7795807978363758,
"grad_norm": 4.929606382507894,
"learning_rate": 1.9022673985140585e-06,
"loss": 1.0204,
"step": 329
},
{
"epoch": 1.7849898580121704,
"grad_norm": 4.872805835810651,
"learning_rate": 1.8878071562735873e-06,
"loss": 0.9421,
"step": 330
},
{
"epoch": 1.790398918187965,
"grad_norm": 5.328166060135325,
"learning_rate": 1.8733686693531986e-06,
"loss": 1.0683,
"step": 331
},
{
"epoch": 1.7958079783637593,
"grad_norm": 5.20759899782578,
"learning_rate": 1.8589524508492308e-06,
"loss": 0.9228,
"step": 332
},
{
"epoch": 1.8012170385395536,
"grad_norm": 5.362139404873552,
"learning_rate": 1.84455901306667e-06,
"loss": 1.0189,
"step": 333
},
{
"epoch": 1.8066260987153482,
"grad_norm": 5.165887526933946,
"learning_rate": 1.8301888675009554e-06,
"loss": 1.0733,
"step": 334
},
{
"epoch": 1.8120351588911427,
"grad_norm": 5.69463106350885,
"learning_rate": 1.8158425248197931e-06,
"loss": 1.0076,
"step": 335
},
{
"epoch": 1.817444219066937,
"grad_norm": 5.001839035249502,
"learning_rate": 1.8015204948450166e-06,
"loss": 1.009,
"step": 336
},
{
"epoch": 1.8228532792427314,
"grad_norm": 5.335741327820695,
"learning_rate": 1.787223286534463e-06,
"loss": 0.9548,
"step": 337
},
{
"epoch": 1.828262339418526,
"grad_norm": 6.290040565183058,
"learning_rate": 1.7729514079638915e-06,
"loss": 1.091,
"step": 338
},
{
"epoch": 1.8336713995943206,
"grad_norm": 4.84945280688225,
"learning_rate": 1.7587053663089233e-06,
"loss": 0.9476,
"step": 339
},
{
"epoch": 1.839080459770115,
"grad_norm": 4.954078085115975,
"learning_rate": 1.7444856678270218e-06,
"loss": 0.8989,
"step": 340
},
{
"epoch": 1.8444895199459093,
"grad_norm": 5.322267912027798,
"learning_rate": 1.7302928178395018e-06,
"loss": 1.0021,
"step": 341
},
{
"epoch": 1.8498985801217038,
"grad_norm": 4.608936147804684,
"learning_rate": 1.716127320713568e-06,
"loss": 0.9097,
"step": 342
},
{
"epoch": 1.8553076402974984,
"grad_norm": 5.1258935953966605,
"learning_rate": 1.7019896798443984e-06,
"loss": 0.9417,
"step": 343
},
{
"epoch": 1.8607167004732927,
"grad_norm": 5.5152859470375475,
"learning_rate": 1.6878803976372465e-06,
"loss": 1.0601,
"step": 344
},
{
"epoch": 1.866125760649087,
"grad_norm": 8.364778472582735,
"learning_rate": 1.6737999754895965e-06,
"loss": 0.9963,
"step": 345
},
{
"epoch": 1.8715348208248817,
"grad_norm": 4.9035977247931575,
"learning_rate": 1.6597489137733377e-06,
"loss": 0.8301,
"step": 346
},
{
"epoch": 1.8769438810006762,
"grad_norm": 5.3057895439765845,
"learning_rate": 1.6457277118169893e-06,
"loss": 0.9862,
"step": 347
},
{
"epoch": 1.8823529411764706,
"grad_norm": 5.656903858451251,
"learning_rate": 1.6317368678879497e-06,
"loss": 0.9841,
"step": 348
},
{
"epoch": 1.887762001352265,
"grad_norm": 5.054670138966832,
"learning_rate": 1.6177768791747957e-06,
"loss": 0.9873,
"step": 349
},
{
"epoch": 1.8931710615280595,
"grad_norm": 4.872926655885509,
"learning_rate": 1.6038482417696095e-06,
"loss": 1.0795,
"step": 350
},
{
"epoch": 1.898580121703854,
"grad_norm": 5.313971672771769,
"learning_rate": 1.5899514506503499e-06,
"loss": 0.9203,
"step": 351
},
{
"epoch": 1.9039891818796484,
"grad_norm": 4.964610105036094,
"learning_rate": 1.5760869996632685e-06,
"loss": 1.0121,
"step": 352
},
{
"epoch": 1.9093982420554427,
"grad_norm": 4.990950102894019,
"learning_rate": 1.5622553815053476e-06,
"loss": 0.9234,
"step": 353
},
{
"epoch": 1.9148073022312373,
"grad_norm": 5.392912217914297,
"learning_rate": 1.5484570877068055e-06,
"loss": 0.9205,
"step": 354
},
{
"epoch": 1.9202163624070319,
"grad_norm": 5.4989192390271935,
"learning_rate": 1.5346926086136171e-06,
"loss": 1.0099,
"step": 355
},
{
"epoch": 1.9256254225828262,
"grad_norm": 4.982181015464049,
"learning_rate": 1.5209624333700985e-06,
"loss": 0.937,
"step": 356
},
{
"epoch": 1.9310344827586206,
"grad_norm": 5.49893209655002,
"learning_rate": 1.5072670499015151e-06,
"loss": 0.9491,
"step": 357
},
{
"epoch": 1.9364435429344151,
"grad_norm": 5.018154393664902,
"learning_rate": 1.493606944896751e-06,
"loss": 1.0217,
"step": 358
},
{
"epoch": 1.9418526031102097,
"grad_norm": 5.78990835618344,
"learning_rate": 1.4799826037910082e-06,
"loss": 0.9641,
"step": 359
},
{
"epoch": 1.947261663286004,
"grad_norm": 4.991442418184385,
"learning_rate": 1.4663945107485567e-06,
"loss": 0.9263,
"step": 360
},
{
"epoch": 1.9526707234617984,
"grad_norm": 6.0772021735310675,
"learning_rate": 1.4528431486455311e-06,
"loss": 1.0178,
"step": 361
},
{
"epoch": 1.958079783637593,
"grad_norm": 5.265699493164527,
"learning_rate": 1.4393289990527665e-06,
"loss": 0.9058,
"step": 362
},
{
"epoch": 1.9634888438133875,
"grad_norm": 5.332909399121638,
"learning_rate": 1.425852542218692e-06,
"loss": 0.8859,
"step": 363
},
{
"epoch": 1.9688979039891819,
"grad_norm": 5.145306432714561,
"learning_rate": 1.412414257052256e-06,
"loss": 0.9829,
"step": 364
},
{
"epoch": 1.9743069641649762,
"grad_norm": 5.392247944243994,
"learning_rate": 1.3990146211059141e-06,
"loss": 0.9439,
"step": 365
},
{
"epoch": 1.9797160243407708,
"grad_norm": 5.346185673554031,
"learning_rate": 1.3856541105586545e-06,
"loss": 0.9911,
"step": 366
},
{
"epoch": 1.9851250845165653,
"grad_norm": 5.1690668100026285,
"learning_rate": 1.3723332001990774e-06,
"loss": 0.8723,
"step": 367
},
{
"epoch": 1.9905341446923597,
"grad_norm": 5.00656915809359,
"learning_rate": 1.3590523634085218e-06,
"loss": 0.915,
"step": 368
},
{
"epoch": 1.9905341446923597,
"eval_loss": 1.247275710105896,
"eval_runtime": 80.2409,
"eval_samples_per_second": 15.528,
"eval_steps_per_second": 1.944,
"step": 368
}
],
"logging_steps": 1,
"max_steps": 552,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 184,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 526384229253120.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}