fats-fme committed
Commit e8c2181 · verified · 1 Parent(s): eb64d07

Training in progress, step 282, checkpoint
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ca8ab5d9608b907960fe2e4cec1b282565bba34ba8caa259adf031cec50b5fd9
+ oid sha256:41c04db13401440bb120e3569a23dbda67cd78267d7c0b1c77f3d3b3cee4cdee
  size 101752088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9433992d4fe48d07ea678c15456a035b339f7ea76d22edcf4a5d4401482a6809
+ oid sha256:f1a032c7471714a5d4a253e904e854da99f9722e45c96bc0da82257681a15490
  size 203713238
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:00a86fe0c8b02bbcee8b3561ea3d9506dec8e361138bfc545d38955d42f5be26
+ oid sha256:45144a3e80d33a7835b701c1b7b63faebde586b75158a47eb826cd0228136ec0
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:acadf30df93695037d7a7f90add51398566ee38c1556f28a30733e5785f52c0d
+ oid sha256:feb6925b0db33b6f02f0ccbd50be336d8d47178a933641d2c637051d854a6c60
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6e243ddb0ff8a57f4dea8942d7b9c1152f7f318f4ab23c0adbfc8e8e068c72ba
+ oid sha256:c28833a5c9fe2e108390575900c0ade8d470ff95484328f12052b199c28b6360
  size 1064
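
Each checkpoint file above is stored as a Git LFS pointer: the repository tracks only the version, oid sha256, and size fields, while the actual blob lives in LFS storage, so an updated checkpoint shows up as a one-line oid swap. A minimal sketch of how a downloaded blob could be checked against such a pointer (the file paths are hypothetical, not part of this repository's tooling):

import hashlib
from pathlib import Path

def parse_pointer(text: str) -> dict:
    # A pointer is a few "key value" lines: version, oid, size.
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

def verify(pointer_path: str, blob_path: str) -> bool:
    fields = parse_pointer(Path(pointer_path).read_text())
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])
    digest, size = hashlib.sha256(), 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
            size += len(chunk)
    return size == expected_size and digest.hexdigest() == expected_oid

# e.g. scheduler.pt should be 1064 bytes and hash to c28833a5... after this commit
print(verify("scheduler.pt.pointer", "scheduler.pt"))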
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.2503884572697003,
+ "epoch": 0.5007769145394007,
  "eval_steps": 141,
- "global_step": 141,
+ "global_step": 282,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1010,6 +1010,1001 @@
  "eval_samples_per_second": 5.938,
  "eval_steps_per_second": 1.489,
  "step": 141
+ },
+ {
+ "epoch": 0.25216426193118757,
+ "grad_norm": 0.3713844120502472,
+ "learning_rate": 0.0001845441764722514,
+ "loss": 0.7688,
+ "step": 142
+ },
+ {
+ "epoch": 0.2539400665926748,
+ "grad_norm": 0.352450430393219,
+ "learning_rate": 0.00018421553219875658,
+ "loss": 0.7769,
+ "step": 143
+ },
+ {
+ "epoch": 0.25571587125416206,
+ "grad_norm": 0.3609173893928528,
+ "learning_rate": 0.00018388372960726228,
+ "loss": 0.7718,
+ "step": 144
+ },
+ {
+ "epoch": 0.25749167591564925,
+ "grad_norm": 0.36195874214172363,
+ "learning_rate": 0.00018354878114129367,
+ "loss": 0.7375,
+ "step": 145
+ },
+ {
+ "epoch": 0.2592674805771365,
+ "grad_norm": 0.3802485466003418,
+ "learning_rate": 0.00018321069936235503,
+ "loss": 0.7778,
+ "step": 146
+ },
+ {
+ "epoch": 0.26104328523862375,
+ "grad_norm": 0.38449469208717346,
+ "learning_rate": 0.00018286949694945866,
+ "loss": 0.7458,
+ "step": 147
+ },
+ {
+ "epoch": 0.262819089900111,
+ "grad_norm": 0.3975572884082794,
+ "learning_rate": 0.00018252518669864936,
+ "loss": 0.7367,
+ "step": 148
+ },
+ {
+ "epoch": 0.26459489456159824,
+ "grad_norm": 0.49581316113471985,
+ "learning_rate": 0.0001821777815225245,
+ "loss": 0.7948,
+ "step": 149
+ },
+ {
+ "epoch": 0.2663706992230855,
+ "grad_norm": 0.5556712746620178,
+ "learning_rate": 0.00018182729444974992,
+ "loss": 0.8143,
+ "step": 150
+ },
+ {
+ "epoch": 0.2681465038845727,
+ "grad_norm": 0.3207700848579407,
+ "learning_rate": 0.00018147373862457107,
+ "loss": 0.8578,
+ "step": 151
+ },
+ {
+ "epoch": 0.2699223085460599,
+ "grad_norm": 0.3484250605106354,
+ "learning_rate": 0.00018111712730632022,
+ "loss": 0.8757,
+ "step": 152
+ },
+ {
+ "epoch": 0.27169811320754716,
+ "grad_norm": 0.33792024850845337,
+ "learning_rate": 0.0001807574738689193,
+ "loss": 0.8464,
+ "step": 153
+ },
+ {
+ "epoch": 0.2734739178690344,
+ "grad_norm": 0.3430371582508087,
+ "learning_rate": 0.000180394791800378,
+ "loss": 0.8607,
+ "step": 154
+ },
+ {
+ "epoch": 0.27524972253052166,
+ "grad_norm": 0.3120534420013428,
+ "learning_rate": 0.00018002909470228842,
+ "loss": 0.8392,
+ "step": 155
+ },
+ {
+ "epoch": 0.2770255271920089,
+ "grad_norm": 0.3126620054244995,
+ "learning_rate": 0.00017966039628931446,
+ "loss": 0.8191,
+ "step": 156
+ },
+ {
+ "epoch": 0.2788013318534961,
+ "grad_norm": 0.32269468903541565,
+ "learning_rate": 0.00017928871038867784,
+ "loss": 0.8164,
+ "step": 157
+ },
+ {
+ "epoch": 0.28057713651498334,
+ "grad_norm": 0.3052617907524109,
+ "learning_rate": 0.00017891405093963938,
+ "loss": 0.8268,
+ "step": 158
+ },
+ {
+ "epoch": 0.2823529411764706,
+ "grad_norm": 0.29926028847694397,
+ "learning_rate": 0.00017853643199297633,
+ "loss": 0.7847,
+ "step": 159
+ },
+ {
+ "epoch": 0.28412874583795783,
+ "grad_norm": 0.2997240722179413,
+ "learning_rate": 0.00017815586771045535,
+ "loss": 0.8143,
+ "step": 160
+ },
+ {
+ "epoch": 0.2859045504994451,
+ "grad_norm": 0.29772111773490906,
+ "learning_rate": 0.0001777723723643014,
+ "loss": 0.7412,
+ "step": 161
+ },
+ {
+ "epoch": 0.2876803551609323,
+ "grad_norm": 0.3138352632522583,
+ "learning_rate": 0.0001773859603366626,
+ "loss": 0.7747,
+ "step": 162
+ },
+ {
+ "epoch": 0.2894561598224195,
+ "grad_norm": 0.32726818323135376,
+ "learning_rate": 0.00017699664611907072,
+ "loss": 0.8123,
+ "step": 163
+ },
+ {
+ "epoch": 0.29123196448390676,
+ "grad_norm": 0.3244825005531311,
+ "learning_rate": 0.0001766044443118978,
+ "loss": 0.7705,
+ "step": 164
+ },
+ {
+ "epoch": 0.293007769145394,
+ "grad_norm": 0.35875847935676575,
+ "learning_rate": 0.00017620936962380856,
+ "loss": 0.7881,
+ "step": 165
+ },
+ {
+ "epoch": 0.29478357380688125,
+ "grad_norm": 0.36488401889801025,
+ "learning_rate": 0.00017581143687120875,
+ "loss": 0.7956,
+ "step": 166
+ },
+ {
+ "epoch": 0.2965593784683685,
+ "grad_norm": 0.33817097544670105,
+ "learning_rate": 0.00017541066097768963,
+ "loss": 0.7719,
+ "step": 167
+ },
+ {
+ "epoch": 0.29833518312985574,
+ "grad_norm": 0.36390411853790283,
+ "learning_rate": 0.0001750070569734681,
+ "loss": 0.8172,
+ "step": 168
+ },
+ {
+ "epoch": 0.30011098779134293,
+ "grad_norm": 0.34076422452926636,
+ "learning_rate": 0.00017460063999482316,
+ "loss": 0.7419,
+ "step": 169
+ },
+ {
+ "epoch": 0.3018867924528302,
+ "grad_norm": 0.39437592029571533,
+ "learning_rate": 0.00017419142528352817,
+ "loss": 0.7519,
+ "step": 170
+ },
+ {
+ "epoch": 0.3036625971143174,
+ "grad_norm": 0.4019312560558319,
+ "learning_rate": 0.00017377942818627942,
+ "loss": 0.7944,
+ "step": 171
+ },
+ {
+ "epoch": 0.30543840177580467,
+ "grad_norm": 0.40751898288726807,
+ "learning_rate": 0.00017336466415412028,
+ "loss": 0.7827,
+ "step": 172
+ },
+ {
+ "epoch": 0.3072142064372919,
+ "grad_norm": 0.4780448079109192,
+ "learning_rate": 0.0001729471487418621,
+ "loss": 0.7872,
+ "step": 173
+ },
+ {
+ "epoch": 0.30899001109877916,
+ "grad_norm": 0.40511685609817505,
+ "learning_rate": 0.0001725268976075005,
+ "loss": 0.7642,
+ "step": 174
+ },
+ {
+ "epoch": 0.31076581576026635,
+ "grad_norm": 0.5618127584457397,
+ "learning_rate": 0.0001721039265116285,
+ "loss": 0.872,
+ "step": 175
+ },
+ {
+ "epoch": 0.3125416204217536,
+ "grad_norm": 0.294917494058609,
+ "learning_rate": 0.00017167825131684513,
+ "loss": 0.8545,
+ "step": 176
+ },
+ {
+ "epoch": 0.31431742508324084,
+ "grad_norm": 0.3281805217266083,
+ "learning_rate": 0.00017124988798716083,
+ "loss": 0.8404,
+ "step": 177
+ },
+ {
+ "epoch": 0.3160932297447281,
+ "grad_norm": 0.33336278796195984,
+ "learning_rate": 0.00017081885258739846,
+ "loss": 0.8495,
+ "step": 178
+ },
+ {
+ "epoch": 0.31786903440621533,
+ "grad_norm": 0.3366440236568451,
+ "learning_rate": 0.00017038516128259115,
+ "loss": 0.8659,
+ "step": 179
+ },
+ {
+ "epoch": 0.3196448390677026,
+ "grad_norm": 0.32397955656051636,
+ "learning_rate": 0.00016994883033737582,
+ "loss": 0.8292,
+ "step": 180
+ },
+ {
+ "epoch": 0.32142064372918977,
+ "grad_norm": 0.2874945402145386,
+ "learning_rate": 0.00016950987611538324,
+ "loss": 0.7949,
+ "step": 181
+ },
+ {
+ "epoch": 0.323196448390677,
+ "grad_norm": 0.3074096143245697,
+ "learning_rate": 0.00016906831507862443,
+ "loss": 0.8076,
+ "step": 182
+ },
+ {
+ "epoch": 0.32497225305216426,
+ "grad_norm": 0.30116966366767883,
+ "learning_rate": 0.0001686241637868734,
+ "loss": 0.8058,
+ "step": 183
+ },
+ {
+ "epoch": 0.3267480577136515,
+ "grad_norm": 0.3052218556404114,
+ "learning_rate": 0.00016817743889704565,
+ "loss": 0.8067,
+ "step": 184
+ },
+ {
+ "epoch": 0.32852386237513875,
+ "grad_norm": 0.3073555827140808,
+ "learning_rate": 0.00016772815716257412,
+ "loss": 0.8496,
+ "step": 185
+ },
+ {
+ "epoch": 0.33029966703662594,
+ "grad_norm": 0.289145290851593,
+ "learning_rate": 0.0001672763354327804,
+ "loss": 0.7362,
+ "step": 186
+ },
+ {
+ "epoch": 0.3320754716981132,
+ "grad_norm": 0.31561294198036194,
+ "learning_rate": 0.00016682199065224307,
+ "loss": 0.802,
+ "step": 187
+ },
+ {
+ "epoch": 0.33385127635960044,
+ "grad_norm": 0.2900339365005493,
+ "learning_rate": 0.00016636513986016213,
+ "loss": 0.7432,
+ "step": 188
+ },
+ {
+ "epoch": 0.3356270810210877,
+ "grad_norm": 0.3267146646976471,
+ "learning_rate": 0.0001659058001897201,
+ "loss": 0.7771,
+ "step": 189
+ },
+ {
+ "epoch": 0.3374028856825749,
+ "grad_norm": 0.3258307874202728,
+ "learning_rate": 0.00016544398886743933,
+ "loss": 0.7345,
+ "step": 190
+ },
+ {
+ "epoch": 0.3391786903440622,
+ "grad_norm": 0.32989659905433655,
+ "learning_rate": 0.000164979723212536,
+ "loss": 0.7383,
+ "step": 191
+ },
+ {
+ "epoch": 0.34095449500554936,
+ "grad_norm": 0.3265599310398102,
+ "learning_rate": 0.00016451302063627066,
+ "loss": 0.6977,
+ "step": 192
+ },
+ {
+ "epoch": 0.3427302996670366,
+ "grad_norm": 0.39376598596572876,
+ "learning_rate": 0.00016404389864129533,
+ "loss": 0.7851,
+ "step": 193
+ },
+ {
+ "epoch": 0.34450610432852385,
+ "grad_norm": 0.40358301997184753,
+ "learning_rate": 0.00016357237482099684,
+ "loss": 0.7928,
+ "step": 194
+ },
+ {
+ "epoch": 0.3462819089900111,
+ "grad_norm": 0.3747034966945648,
+ "learning_rate": 0.00016309846685883726,
+ "loss": 0.7751,
+ "step": 195
+ },
+ {
+ "epoch": 0.34805771365149835,
+ "grad_norm": 0.4160248041152954,
+ "learning_rate": 0.00016262219252769064,
+ "loss": 0.8035,
+ "step": 196
+ },
+ {
+ "epoch": 0.3498335183129856,
+ "grad_norm": 0.39067476987838745,
+ "learning_rate": 0.00016214356968917648,
+ "loss": 0.6726,
+ "step": 197
+ },
+ {
+ "epoch": 0.3516093229744728,
+ "grad_norm": 0.4980023205280304,
+ "learning_rate": 0.00016166261629298995,
+ "loss": 0.7917,
+ "step": 198
+ },
+ {
+ "epoch": 0.35338512763596003,
+ "grad_norm": 0.4774058163166046,
+ "learning_rate": 0.0001611793503762285,
+ "loss": 0.7599,
+ "step": 199
+ },
+ {
+ "epoch": 0.3551609322974473,
+ "grad_norm": 0.5196167230606079,
+ "learning_rate": 0.00016069379006271566,
+ "loss": 0.7608,
+ "step": 200
+ },
+ {
+ "epoch": 0.3569367369589345,
+ "grad_norm": 0.2735799551010132,
+ "learning_rate": 0.00016020595356232135,
+ "loss": 0.8588,
+ "step": 201
+ },
+ {
+ "epoch": 0.35871254162042177,
+ "grad_norm": 0.30770814418792725,
+ "learning_rate": 0.00015971585917027862,
+ "loss": 0.8222,
+ "step": 202
+ },
+ {
+ "epoch": 0.360488346281909,
+ "grad_norm": 0.317123144865036,
+ "learning_rate": 0.00015922352526649803,
+ "loss": 0.7941,
+ "step": 203
+ },
+ {
+ "epoch": 0.3622641509433962,
+ "grad_norm": 0.32672154903411865,
+ "learning_rate": 0.00015872897031487791,
+ "loss": 0.867,
+ "step": 204
+ },
+ {
+ "epoch": 0.36403995560488345,
+ "grad_norm": 0.3169744610786438,
+ "learning_rate": 0.00015823221286261215,
+ "loss": 0.8781,
+ "step": 205
+ },
+ {
+ "epoch": 0.3658157602663707,
+ "grad_norm": 0.30588722229003906,
+ "learning_rate": 0.00015773327153949465,
+ "loss": 0.7827,
+ "step": 206
+ },
+ {
+ "epoch": 0.36759156492785794,
+ "grad_norm": 0.3179618716239929,
+ "learning_rate": 0.0001572321650572205,
+ "loss": 0.8178,
+ "step": 207
+ },
+ {
+ "epoch": 0.3693673695893452,
+ "grad_norm": 0.3094286322593689,
+ "learning_rate": 0.00015672891220868432,
+ "loss": 0.7966,
+ "step": 208
+ },
+ {
+ "epoch": 0.37114317425083243,
+ "grad_norm": 0.31584280729293823,
+ "learning_rate": 0.00015622353186727544,
+ "loss": 0.7982,
+ "step": 209
+ },
+ {
+ "epoch": 0.3729189789123196,
+ "grad_norm": 0.29120850563049316,
+ "learning_rate": 0.0001557160429861702,
+ "loss": 0.7789,
+ "step": 210
+ },
+ {
+ "epoch": 0.37469478357380687,
+ "grad_norm": 0.29743698239326477,
+ "learning_rate": 0.000155206464597621,
+ "loss": 0.7799,
+ "step": 211
+ },
+ {
+ "epoch": 0.3764705882352941,
+ "grad_norm": 0.31440189480781555,
+ "learning_rate": 0.00015469481581224272,
+ "loss": 0.7661,
+ "step": 212
+ },
+ {
+ "epoch": 0.37824639289678136,
+ "grad_norm": 0.3395606279373169,
+ "learning_rate": 0.00015418111581829574,
+ "loss": 0.7657,
+ "step": 213
+ },
+ {
+ "epoch": 0.3800221975582686,
+ "grad_norm": 0.31749066710472107,
+ "learning_rate": 0.0001536653838809667,
+ "loss": 0.7913,
+ "step": 214
+ },
+ {
+ "epoch": 0.38179800221975585,
+ "grad_norm": 0.3586166501045227,
+ "learning_rate": 0.0001531476393416456,
+ "loss": 0.7774,
+ "step": 215
+ },
+ {
+ "epoch": 0.38357380688124304,
+ "grad_norm": 0.32895100116729736,
+ "learning_rate": 0.0001526279016172008,
+ "loss": 0.7882,
+ "step": 216
+ },
+ {
+ "epoch": 0.3853496115427303,
+ "grad_norm": 0.3541489839553833,
+ "learning_rate": 0.00015210619019925066,
+ "loss": 0.7708,
+ "step": 217
+ },
+ {
+ "epoch": 0.38712541620421753,
+ "grad_norm": 0.3232908546924591,
+ "learning_rate": 0.00015158252465343242,
+ "loss": 0.7238,
+ "step": 218
+ },
+ {
+ "epoch": 0.3889012208657048,
+ "grad_norm": 0.36565467715263367,
+ "learning_rate": 0.00015105692461866874,
+ "loss": 0.7685,
+ "step": 219
+ },
+ {
+ "epoch": 0.390677025527192,
+ "grad_norm": 0.3799486756324768,
+ "learning_rate": 0.000150529409806431,
+ "loss": 0.7296,
+ "step": 220
+ },
+ {
+ "epoch": 0.39245283018867927,
+ "grad_norm": 0.4193985164165497,
+ "learning_rate": 0.00015000000000000001,
+ "loss": 0.7731,
+ "step": 221
+ },
+ {
+ "epoch": 0.39422863485016646,
+ "grad_norm": 0.4226386845111847,
+ "learning_rate": 0.00014946871505372425,
+ "loss": 0.8048,
+ "step": 222
+ },
+ {
+ "epoch": 0.3960044395116537,
+ "grad_norm": 0.40805166959762573,
+ "learning_rate": 0.00014893557489227517,
+ "loss": 0.7389,
+ "step": 223
+ },
+ {
+ "epoch": 0.39778024417314095,
+ "grad_norm": 0.5135468244552612,
+ "learning_rate": 0.0001484005995098999,
+ "loss": 0.779,
+ "step": 224
+ },
+ {
+ "epoch": 0.3995560488346282,
+ "grad_norm": 0.6674650311470032,
+ "learning_rate": 0.0001478638089696716,
+ "loss": 0.82,
+ "step": 225
+ },
+ {
+ "epoch": 0.40133185349611544,
+ "grad_norm": 0.3206911087036133,
+ "learning_rate": 0.00014732522340273684,
+ "loss": 0.8985,
+ "step": 226
+ },
+ {
+ "epoch": 0.4031076581576027,
+ "grad_norm": 0.33583980798721313,
+ "learning_rate": 0.0001467848630075608,
+ "loss": 0.8171,
+ "step": 227
+ },
+ {
+ "epoch": 0.4048834628190899,
+ "grad_norm": 0.3324304223060608,
+ "learning_rate": 0.00014624274804916958,
+ "loss": 0.8531,
+ "step": 228
+ },
+ {
+ "epoch": 0.4066592674805771,
+ "grad_norm": 0.32210710644721985,
+ "learning_rate": 0.00014569889885839037,
+ "loss": 0.8349,
+ "step": 229
+ },
+ {
+ "epoch": 0.40843507214206437,
+ "grad_norm": 0.30829885601997375,
+ "learning_rate": 0.00014515333583108896,
+ "loss": 0.8176,
+ "step": 230
+ },
+ {
+ "epoch": 0.4102108768035516,
+ "grad_norm": 0.31730225682258606,
+ "learning_rate": 0.00014460607942740468,
+ "loss": 0.8109,
+ "step": 231
+ },
+ {
+ "epoch": 0.41198668146503886,
+ "grad_norm": 0.32128164172172546,
+ "learning_rate": 0.00014405715017098335,
+ "loss": 0.8049,
+ "step": 232
+ },
+ {
+ "epoch": 0.4137624861265261,
+ "grad_norm": 0.32257241010665894,
+ "learning_rate": 0.00014350656864820733,
+ "loss": 0.79,
+ "step": 233
+ },
+ {
+ "epoch": 0.4155382907880133,
+ "grad_norm": 0.29663363099098206,
+ "learning_rate": 0.0001429543555074237,
+ "loss": 0.7606,
+ "step": 234
+ },
+ {
+ "epoch": 0.41731409544950054,
+ "grad_norm": 0.3175968527793884,
+ "learning_rate": 0.00014240053145816967,
+ "loss": 0.8093,
+ "step": 235
+ },
+ {
+ "epoch": 0.4190899001109878,
+ "grad_norm": 0.30839797854423523,
+ "learning_rate": 0.00014184511727039612,
+ "loss": 0.8033,
+ "step": 236
+ },
+ {
+ "epoch": 0.42086570477247504,
+ "grad_norm": 0.32169485092163086,
+ "learning_rate": 0.0001412881337736885,
+ "loss": 0.7583,
+ "step": 237
+ },
+ {
+ "epoch": 0.4226415094339623,
+ "grad_norm": 0.3165202736854553,
+ "learning_rate": 0.00014072960185648577,
+ "loss": 0.7864,
+ "step": 238
+ },
+ {
+ "epoch": 0.4244173140954495,
+ "grad_norm": 0.3507262170314789,
+ "learning_rate": 0.00014016954246529696,
+ "loss": 0.8196,
+ "step": 239
+ },
+ {
+ "epoch": 0.4261931187569367,
+ "grad_norm": 0.3330634534358978,
+ "learning_rate": 0.0001396079766039157,
+ "loss": 0.7356,
+ "step": 240
+ },
+ {
+ "epoch": 0.42796892341842396,
+ "grad_norm": 0.3456502854824066,
+ "learning_rate": 0.00013904492533263244,
+ "loss": 0.7636,
+ "step": 241
+ },
+ {
+ "epoch": 0.4297447280799112,
+ "grad_norm": 0.3290559649467468,
+ "learning_rate": 0.00013848040976744457,
+ "loss": 0.6921,
+ "step": 242
+ },
+ {
+ "epoch": 0.43152053274139845,
+ "grad_norm": 0.34343284368515015,
+ "learning_rate": 0.00013791445107926478,
+ "loss": 0.7661,
+ "step": 243
+ },
+ {
+ "epoch": 0.4332963374028857,
+ "grad_norm": 0.34806933999061584,
+ "learning_rate": 0.00013734707049312673,
+ "loss": 0.7266,
+ "step": 244
+ },
+ {
+ "epoch": 0.43507214206437295,
+ "grad_norm": 0.3577682375907898,
+ "learning_rate": 0.00013677828928738934,
+ "loss": 0.7337,
+ "step": 245
+ },
+ {
+ "epoch": 0.43684794672586014,
+ "grad_norm": 0.37708649039268494,
+ "learning_rate": 0.00013620812879293863,
+ "loss": 0.6949,
+ "step": 246
+ },
+ {
+ "epoch": 0.4386237513873474,
+ "grad_norm": 0.3661216199398041,
+ "learning_rate": 0.00013563661039238785,
+ "loss": 0.7049,
+ "step": 247
+ },
+ {
+ "epoch": 0.44039955604883463,
+ "grad_norm": 0.4453539550304413,
+ "learning_rate": 0.00013506375551927547,
+ "loss": 0.7957,
+ "step": 248
+ },
+ {
+ "epoch": 0.4421753607103219,
+ "grad_norm": 0.46171826124191284,
+ "learning_rate": 0.00013448958565726144,
+ "loss": 0.7175,
+ "step": 249
+ },
+ {
+ "epoch": 0.4439511653718091,
+ "grad_norm": 0.6314205527305603,
+ "learning_rate": 0.00013391412233932149,
+ "loss": 0.8853,
+ "step": 250
+ },
+ {
+ "epoch": 0.4457269700332963,
+ "grad_norm": 0.29680782556533813,
+ "learning_rate": 0.00013333738714693956,
+ "loss": 0.8789,
+ "step": 251
+ },
+ {
+ "epoch": 0.44750277469478356,
+ "grad_norm": 0.30771735310554504,
+ "learning_rate": 0.00013275940170929843,
+ "loss": 0.8126,
+ "step": 252
+ },
+ {
+ "epoch": 0.4492785793562708,
+ "grad_norm": 0.3242880403995514,
+ "learning_rate": 0.00013218018770246858,
+ "loss": 0.7787,
+ "step": 253
+ },
+ {
+ "epoch": 0.45105438401775805,
+ "grad_norm": 0.33549076318740845,
+ "learning_rate": 0.00013159976684859527,
+ "loss": 0.8113,
+ "step": 254
+ },
+ {
+ "epoch": 0.4528301886792453,
+ "grad_norm": 0.34281155467033386,
+ "learning_rate": 0.00013101816091508388,
+ "loss": 0.8371,
+ "step": 255
+ },
+ {
+ "epoch": 0.45460599334073254,
+ "grad_norm": 0.3422442078590393,
+ "learning_rate": 0.0001304353917137836,
+ "loss": 0.8362,
+ "step": 256
+ },
+ {
+ "epoch": 0.45638179800221973,
+ "grad_norm": 0.3019155263900757,
+ "learning_rate": 0.00012985148110016947,
+ "loss": 0.7317,
+ "step": 257
+ },
+ {
+ "epoch": 0.458157602663707,
+ "grad_norm": 0.32793429493904114,
+ "learning_rate": 0.0001292664509725226,
+ "loss": 0.7861,
+ "step": 258
+ },
+ {
+ "epoch": 0.4599334073251942,
+ "grad_norm": 0.32433855533599854,
+ "learning_rate": 0.00012868032327110904,
+ "loss": 0.7708,
+ "step": 259
+ },
+ {
+ "epoch": 0.46170921198668147,
+ "grad_norm": 0.31858816742897034,
+ "learning_rate": 0.00012809311997735696,
+ "loss": 0.7754,
+ "step": 260
+ },
+ {
+ "epoch": 0.4634850166481687,
+ "grad_norm": 0.3172609210014343,
+ "learning_rate": 0.00012750486311303218,
+ "loss": 0.7839,
+ "step": 261
+ },
+ {
+ "epoch": 0.46526082130965596,
+ "grad_norm": 0.2951931953430176,
+ "learning_rate": 0.00012691557473941243,
+ "loss": 0.7261,
+ "step": 262
+ },
+ {
+ "epoch": 0.46703662597114315,
+ "grad_norm": 0.31385374069213867,
+ "learning_rate": 0.00012632527695645993,
+ "loss": 0.8221,
+ "step": 263
+ },
+ {
+ "epoch": 0.4688124306326304,
+ "grad_norm": 0.31157392263412476,
+ "learning_rate": 0.0001257339919019925,
+ "loss": 0.7711,
+ "step": 264
+ },
+ {
+ "epoch": 0.47058823529411764,
+ "grad_norm": 0.32580870389938354,
+ "learning_rate": 0.00012514174175085345,
+ "loss": 0.7592,
+ "step": 265
+ },
+ {
+ "epoch": 0.4723640399556049,
+ "grad_norm": 0.33285781741142273,
+ "learning_rate": 0.00012454854871407994,
+ "loss": 0.7349,
+ "step": 266
+ },
+ {
+ "epoch": 0.47413984461709213,
+ "grad_norm": 0.3179035186767578,
+ "learning_rate": 0.0001239544350380699,
+ "loss": 0.7338,
+ "step": 267
+ },
+ {
+ "epoch": 0.4759156492785794,
+ "grad_norm": 0.31393003463745117,
+ "learning_rate": 0.00012335942300374788,
+ "loss": 0.7088,
+ "step": 268
+ },
+ {
+ "epoch": 0.47769145394006657,
+ "grad_norm": 0.33285436034202576,
+ "learning_rate": 0.00012276353492572935,
+ "loss": 0.7069,
+ "step": 269
+ },
+ {
+ "epoch": 0.4794672586015538,
+ "grad_norm": 0.38329485058784485,
+ "learning_rate": 0.00012216679315148386,
+ "loss": 0.7093,
+ "step": 270
+ },
+ {
+ "epoch": 0.48124306326304106,
+ "grad_norm": 0.3584016263484955,
+ "learning_rate": 0.00012156922006049702,
+ "loss": 0.7513,
+ "step": 271
+ },
+ {
+ "epoch": 0.4830188679245283,
+ "grad_norm": 0.3995126187801361,
+ "learning_rate": 0.00012097083806343103,
+ "loss": 0.7384,
+ "step": 272
+ },
+ {
+ "epoch": 0.48479467258601555,
+ "grad_norm": 0.4097007215023041,
+ "learning_rate": 0.00012037166960128443,
+ "loss": 0.7794,
+ "step": 273
+ },
+ {
+ "epoch": 0.4865704772475028,
+ "grad_norm": 0.4780315160751343,
+ "learning_rate": 0.00011977173714455034,
+ "loss": 0.7437,
+ "step": 274
+ },
+ {
+ "epoch": 0.48834628190899,
+ "grad_norm": 0.5396427512168884,
+ "learning_rate": 0.00011917106319237386,
+ "loss": 0.7542,
+ "step": 275
+ },
+ {
+ "epoch": 0.49012208657047723,
+ "grad_norm": 0.29439178109169006,
+ "learning_rate": 0.00011856967027170818,
+ "loss": 0.8389,
+ "step": 276
+ },
+ {
+ "epoch": 0.4918978912319645,
+ "grad_norm": 0.3243663012981415,
+ "learning_rate": 0.00011796758093646989,
+ "loss": 0.8767,
+ "step": 277
+ },
+ {
+ "epoch": 0.4936736958934517,
+ "grad_norm": 0.342454195022583,
+ "learning_rate": 0.00011736481776669306,
+ "loss": 0.8538,
+ "step": 278
+ },
+ {
+ "epoch": 0.49544950055493897,
+ "grad_norm": 0.30882903933525085,
+ "learning_rate": 0.00011676140336768236,
+ "loss": 0.7766,
+ "step": 279
+ },
+ {
+ "epoch": 0.4972253052164262,
+ "grad_norm": 0.3247200548648834,
+ "learning_rate": 0.00011615736036916549,
+ "loss": 0.8268,
+ "step": 280
+ },
+ {
+ "epoch": 0.4990011098779134,
+ "grad_norm": 0.3077162504196167,
+ "learning_rate": 0.00011555271142444433,
+ "loss": 0.7786,
+ "step": 281
+ },
+ {
+ "epoch": 0.5007769145394007,
+ "grad_norm": 0.3300260603427887,
+ "learning_rate": 0.00011494747920954545,
+ "loss": 0.7853,
+ "step": 282
+ },
+ {
+ "epoch": 0.5007769145394007,
+ "eval_loss": 0.7658749222755432,
+ "eval_runtime": 158.4653,
+ "eval_samples_per_second": 5.989,
+ "eval_steps_per_second": 1.502,
+ "step": 282
  }
  ],
  "logging_steps": 1,
@@ -1029,7 +2024,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.8570783162472858e+17,
+ "total_flos": 3.7141566324945715e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null